def worldbank_plot(
    df: pd.DataFrame,
    title: str,
    dates_are_yearly: bool,
    figure_size=(12, 6),
    add_points=False,
    **plot_kwargs,
) -> p9.ggplot:
    """
    Build the path plot shared by all worldbank views.

    This is the one place where worldbank plots pick up themes, colour maps
    and other plot settings (via user_theme). The frame is plotted as
    "metric" against "date"; any extra keyword arguments are passed straight
    through to p9.aes() (e.g. group/colour mappings).

    Points are drawn on top of the path when the data is sparse, since a
    value surrounded by NaN cannot be joined into a line.

    :param df: data to plot; must provide "date" and "metric" columns
    :param title: used only in the "no data" message
    :param dates_are_yearly: when true, x-axis labels show only the year
    :param figure_size: forwarded to user_theme
    :param add_points: force points to be drawn regardless of sparsity
    :param plot_kwargs: additional aesthetic mappings for p9.aes()
    :return: the themed ggplot instance
    :raises Http404: if df is None or has no rows
    """
    # An empty frame is as unusable as a missing one: without this guard the
    # percentage below would divide by a row count of zero.
    if df is None or len(df) == 0:
        print(f"No usable data/plot for {title}")
        raise Http404(f"No data for {title}")

    # Percentage of rows whose metric is missing; always within [0, 100]
    # because the frame is known to be non-empty here.
    pct_na = (df["metric"].isnull().sum() / len(df)) * 100.0

    plot = (
        p9.ggplot(df, p9.aes("date", "metric", **plot_kwargs))
        + p9.geom_path(size=1.2)
        + p9.scale_y_continuous(labels=label_shorten)
    )
    if dates_are_yearly:
        # yearly data? if so only print the year on the x-axis
        plot += p9.scale_x_datetime(labels=date_format("%Y"))

    # If pct_na is too high, geom_path() may be unable to draw a line (each
    # value is surrounded by nan preventing a path), so geom_point() is used
    # to highlight the sparse nature of the data. The same applies when there
    # are three or fewer non-null values.
    if pct_na >= 30.0 or add_points or df["metric"].count() <= 3:
        plot += p9.geom_point(size=3.0)

    return user_theme(plot, y_axis_label="Value", figure_size=figure_size)
def plot_cumulative_returns(wanted_stocks: Iterable[str], ld: LazyDictionary) -> p9.ggplot:
    """
    Plot summed positive and negative returns per date as bars, overlaid
    with a point for every individual stock that moved by more than 5.

    :param wanted_stocks: row labels (stock codes) to keep from ld["cip_df"]
    :param ld: lazy dictionary providing the "cip_df" dataframe
    :return: the themed ggplot instance
    """
    df = ld["cip_df"]
    # Keep the wanted rows and only the columns whose name starts with a digit.
    # The pattern is a raw string: "\d" in a normal literal is an invalid escape.
    df = df.filter(wanted_stocks, axis=0).filter(regex=r"^\d", axis=1)
    dates = list(df.columns)

    # Build the per-stock frame with assign() so that the asx_code column is
    # NOT added to df itself: df is aggregated below and must only contain
    # the value columns.
    movers = df.assign(asx_code=df.index)
    movers = movers.melt(id_vars="asx_code", value_vars=dates)
    # Ignore small movers; copy so the column assignment below operates on an
    # independent frame rather than on a slice.
    movers = movers[(movers["value"] < -5.0) | (movers["value"] > 5.0)].copy()
    # NOTE(review): melt() only names the variable column "fetch_date" when
    # the columns index of cip_df carries that name - confirm upstream.
    movers["fetch_date"] = pd.to_datetime(movers["fetch_date"], format="%Y-%m-%d")

    # Need to have separate dataframes for positive and negative stocks -
    # otherwise the plotnine plot will be wrong.
    pos_df = df.agg([positive_sum]).melt(value_vars=dates)
    neg_df = df.agg([negative_sum]).melt(value_vars=dates)
    pos_df["fetch_date"] = pd.to_datetime(pos_df["fetch_date"], format="%Y-%m-%d")
    neg_df["fetch_date"] = pd.to_datetime(neg_df["fetch_date"], format="%Y-%m-%d")

    plot = (
        p9.ggplot()
        + p9.geom_bar(
            p9.aes(x="fetch_date", y="value"),
            data=pos_df,
            stat="identity",
            fill="green",
        )
        + p9.geom_bar(
            p9.aes(x="fetch_date", y="value"),
            data=neg_df,
            stat="identity",
            fill="red",
        )
        + p9.geom_point(
            p9.aes(
                x="fetch_date",
                y="value",
                fill="asx_code",
            ),
            data=movers,
            size=3,
            position=p9.position_dodge(width=0.4),
            colour="black",
        )
    )
    return user_theme(
        plot,
        y_axis_label="Cumulative Return (%)",
        legend_position="right",
        asxtrade_want_cmap_d=False,
        # points (stocks) are filled with the user-chosen theme, but
        # everything else is fixed
        asxtrade_want_fill_d=True,
    )
def plot_metrics(df: pd.DataFrame, use_short_labels=False, **kwargs):
    """
    Draw one coloured line (with points) per metric, "value" against "date".

    :param df: frame with "date", "value" and "metric" columns
    :param use_short_labels: when true, y-axis labels go through label_shorten
    :param kwargs: forwarded unchanged to user_theme
    :return: whatever user_theme returns for the assembled plot
    """
    # Figure height grows with the number of distinct metrics.
    height = int(df["metric"].nunique() * 1.5)

    mapping = p9.aes(x="date", y="value", colour="metric")
    chart = p9.ggplot(df, mapping) + p9.geom_line(size=1.3) + p9.geom_point(size=3)
    if use_short_labels:
        chart += p9.scale_y_continuous(labels=label_shorten)

    return user_theme(
        chart,
        subplots_adjust={"left": 0.2},
        figure_size=(12, height),
        **kwargs,
    )
def make_plot(ld: LazyDictionary):
    """
    Fetch every indicator for the enclosing scope's country and plot them as
    one facet per dataset.

    Datasets that fail to load, or load empty, are skipped. If no dataset is
    usable, worldbank_plot() is called with None as the dataframe.

    :param ld: not used by this function; kept for the caller's signature
    :return: the themed, faceted ggplot instance
    """
    frames = []
    has_yearly = False
    add_points = False
    for indicator in indicators:
        try:
            df = fetch_data(
                indicator,
                [country],
                fill_missing=lambda frame: frame.resample("AS").asfreq(),
            )
        except Exception:  # Data load fail? (KeyboardInterrupt etc. still propagate)
            print(
                f"WARNING: unable to load worldbank dataset {indicator} - ignored"
            )
            traceback.print_exc()
            continue
        if df is None or len(df) == 0:
            continue

        df["dataset"] = f"{indicator.name} ({indicator.wb_id})"
        if "-yearly-" in indicator.tag:
            has_yearly = True
        # if any indicator is sparse, we enable points for all indicators to
        # be able to see them all
        pct_na = (df["metric"].isnull().sum() / len(df)) * 100.0
        if pct_na > 30.0 or df["metric"].count() <= 3:
            add_points = True
        frames.append(df)

    # DataFrame.append() no longer exists in pandas 2.x; a single concat over
    # the collected frames produces the same stacked result.
    n_datasets = len(frames)
    plot_df = pd.concat(frames) if frames else None

    figure_size = (12, n_datasets * 1.5)
    kwargs = {"group": "dataset", "colour": "dataset"}
    plot = worldbank_plot(
        plot_df,
        "",
        has_yearly,
        figure_size=figure_size,
        add_points=add_points,
        **kwargs,
    )
    plot += p9.facet_wrap("~dataset", ncol=1, scales="free_y")
    return user_theme(plot, figure_size=figure_size)
def form_valid(self, form):
    """
    Filter the dataflow's dataframe by each submitted form value, then render
    the template with a point-and-line plot of whatever rows remain.

    Each cleaned form key has its first len("dimension_") characters sliced
    off to obtain the dataframe column to filter on. Filtering stops early
    (with a warning for the request) as soon as no rows are left; in that
    case no plot is built and the plot title stays empty.

    :raises Http404: if the dataframe cannot be loaded or has no rows
    """
    df = fetch_dataframe(self.data_flow.name)
    if df is None or len(df) == 0:
        raise Http404(f"Unable to load dataframe: {self.data_flow}")

    prefix_len = len("dimension_")
    filter_performance = []
    for field_name, wanted in form.cleaned_data.items():
        n_before = len(df)
        print(f"Filtering rows for {field_name}: total {n_before} rows at start")
        column = field_name[prefix_len:]
        # The distinct values are only collected for frames under 10000 rows.
        remaining_values = df[column].unique() if n_before < 10000 else set()
        df = df[df[column] == wanted]
        n_after = len(df)
        filter_performance.append(
            (column, wanted, n_before, n_after, remaining_values)
        )
        print(f"After filtering: now {n_after} rows")
        if n_after == 0:
            warning(
                self.request,
                f"No rows of data left after filtering: {column} {wanted}",
            )
            break

    plot = None
    plot_title = ""
    if len(df) > 0:
        plot_title, x_axis_column, y_axis_column, df = detect_dataframe(
            df, self.data_flow
        )
        mapping = p9.aes(x=x_axis_column, y=y_axis_column)
        plot = user_theme(p9.ggplot(df, mapping) + p9.geom_point() + p9.geom_line())

    # The cache key is built from the sorted form values only.
    cache_key = "-".join(sorted(form.cleaned_data.values())) + "-ecb-plot"
    context = self.get_context_data()
    context.update(
        {
            "dataflow": self.data_flow,
            "dataflow_name": self.data_flow.name,
            "filter_performance": filter_performance,
            "plot_title": plot_title,
            "plot_uri": cache_plot(cache_key, lambda: plot),
        }
    )
    return render(self.request, self.template_name, context)
def plot(df: pd.DataFrame) -> p9.ggplot:
    """
    Draw "value" against "fetch_date" as one coloured line per sector, each
    sector in its own facet (two columns, independent y scales).

    :param df: frame with "fetch_date", "value" and "sector_name" columns
    :return: the themed ggplot instance
    """
    mapping = p9.aes(x="fetch_date", y="value", color="sector_name")
    chart = p9.ggplot(df, mapping)
    chart += p9.geom_line(size=1.2)
    chart += p9.facet_wrap("~sector_name", ncol=2, scales="free_y")
    chart += p9.scale_y_continuous(labels=label_shorten)

    theme_options = {
        "y_axis_label": "Total sector earnings ($AUD, positive contributions only)",
        "figure_size": (12, 14),
        "subplots_adjust": {"wspace": 0.25},
    }
    return user_theme(chart, **theme_options)
def make_plot(self, df: pd.DataFrame, timeframe: Timeframe) -> p9.ggplot:
    """
    Plot each of self.value_vars as its own line facet against the date index.

    A "Range" column (High - Low) is derived when both source columns are
    present, and the index is parsed as "%Y-%m-%d" dates into a "Date" column.

    :param df: frame indexed by date strings/values; not modified
    :param timeframe: not used by this method
    :return: the themed ggplot instance
    """
    # Work on a copy so the derived columns are not written into the
    # caller's dataframe.
    df = df.copy()
    if "High" in df.columns and "Low" in df.columns:
        df["Range"] = df["High"] - df["Low"]
    df["Date"] = pd.to_datetime(df.index, format="%Y-%m-%d")

    melted_df = df.melt(
        value_vars=self.value_vars, id_vars="Date", value_name="value"
    )
    plot = (
        p9.ggplot(
            melted_df,
            p9.aes(x="Date", y="value", group="variable", color="variable"),
        )
        + p9.geom_line(size=1.3)
        # one row per variable, each with its own y scale
        + p9.facet_wrap(
            "~variable",
            ncol=1,
            nrow=melted_df["variable"].nunique(),
            scales="free_y",
        )
    )
    return user_theme(plot)
def cluster_plot(ld: LazyDictionary):
    """
    Scatter "return" against "volatility", coloured by cluster, with one
    facet per cluster_id (three columns, free scales).

    The timeframe, cluster count and stock codes come from the enclosing
    scope.

    :param ld: not used by this function; kept for the caller's signature
    :return: whatever user_theme returns for the assembled plot
    """
    # Only the last of the five returned values (the per-point frame) is
    # plotted; the centroids were previously turned into a dataframe that
    # nothing ever read, so that dead computation has been dropped.
    _, _, _, _, data_df = make_kmeans_cluster_dataframe(
        timeframe, chosen_k, asx_codes
    )
    plot = (
        p9.ggplot(
            data_df, p9.aes("return", "volatility", colour="factor(cluster_id)")
        )
        + p9.geom_point(size=3)
        + p9.facet_wrap("~cluster_id", ncol=3, scales="free")
    )
    return user_theme(
        plot,
        x_axis_label="Returns (%)",
        y_axis_label="Volatility (%)",
        figure_size=(15, 15),
        subplots_adjust={"hspace": 0.15, "wspace": 0.15},
    )
def plot_abs_dataframe(self, df: pd.DataFrame) -> p9.ggplot:
    """
    Plot "value" against "TIME_PERIOD" as points, faceting on the columns
    that vary across the dataframe.

    Columns with exactly one distinct value are recorded in
    self.fixed_datapoints as "name=value" instead of being plotted. Columns
    with several distinct values become facets; at most two are used, and
    when more exist the one with the fewest distinct values is mapped to
    group/shape instead.

    :param df: frame with at least "TIME_PERIOD" and "value" columns
    :return: the themed ggplot instance
    """
    facets = []
    n_per_facet = {}
    print(df)
    for col in df.columns:
        try:
            n_values = df[col].nunique()
            if n_values == 1 and col not in [
                "TIME_PERIOD",
                "value",
                "Measure",
                "OBS_COMMENT",
            ]:
                # df.at[0, col] looks up index label 0; if that label is
                # missing the error is caught below and the column skipped.
                self.fixed_datapoints.add(f"{col}={df.at[0, col]}")
            elif n_values > 1 and col not in [
                "value",
                "TIME_PERIOD",
                "OBS_COMMENT",
            ]:
                facets.append(col)
                n_per_facet[col] = n_values
        except Exception:
            # Best effort: a column that cannot be inspected is left out, but
            # KeyboardInterrupt/SystemExit are no longer swallowed.
            print(f"Ignoring unusable column: {col}")
            continue

    extra_args = {}
    need_shape = False
    if len(facets) > 2:
        # can only use two variables as plotting facets, third value will be
        # used as a group on each plot; any more facets is not supported at
        # this stage
        sorted_facets = sorted(n_per_facet.keys(), key=lambda k: n_per_facet[k])
        facets = sorted_facets[-2:]
        extra_args.update(
            {
                "group": sorted_facets[0],
                "color": facets[0],
                "shape": sorted_facets[0],
            }
        )
        need_shape = True
        print(f"Using {facets} as facets, {extra_args} as series")
    elif len(facets) > 0:
        extra_args.update({"color": facets[0]})

    # compute figure size to give enough room for each plot: number of panels
    # divided by the number of facet columns. With no facets at all there is a
    # single panel, so divide by 1 rather than by zero.
    mult = 1
    for facet in facets:
        mult *= n_per_facet[facet]
    nrow = int(mult / max(len(facets), 1) + 1)

    # facet column names must not have spaces in them as this is not
    # permitted by plotnine facet formulas
    if len(facets) > 0:
        renames = {name: name.replace(" ", "_") for name in facets if " " in name}
        if renames:
            df = df.rename(columns=renames)
        facets = [renames.get(name, name) for name in facets]
        if "color" in extra_args:
            extra_args.update({"color": facets[0]})
        print(f"Renamed facet columns due to whitespace: {facets}")

    plot = p9.ggplot(
        df, p9.aes(x="TIME_PERIOD", y="value", **extra_args)
    ) + p9.geom_point(size=3)

    if len(facets) > 0 and len(facets) <= 2:
        facet_str = "~" + " + ".join(facets[:2])
        print(f"Using facet formula: {facet_str}")
        plot += p9.facet_wrap(facet_str, ncol=len(facets), scales="free_y")

    plot_theme = {
        "figure_size": (12, int(nrow * 1.5)),
    }
    if len(facets) == 2:
        # two columns of plots? if so, make sure space for axis labels
        plot_theme.update({"subplots_adjust": {"wspace": 0.2}})
    if need_shape:
        plot += p9.scale_shape(guide="legend")
        # colour legend is not useful since it is included in the facet title
        plot += p9.guides(colour=False)
        plot_theme.update({"legend_position": "right"})
    return user_theme(plot, **plot_theme)