Beispiel #1
0
def worldbank_plot(
        df: pd.DataFrame,
        title: str,
        dates_are_yearly: bool,
        figure_size=(12, 6),
        add_points=False,
        **plot_kwargs,
) -> p9.ggplot:
    """
    Build the standard worldbank plot: a path of the "metric" column against the "date" column.

    This is the single place where worldbank plots pick up the theme and related plot settings
    (via user_theme()). When the data is sparse, points are drawn in addition to the path since
    isolated values surrounded by missing data cannot be joined into a line.

    Parameters:
        df: dataframe with at least "date" and "metric" columns
        title: only used in the log/error message when there is no data
        dates_are_yearly: if True, x-axis labels show just the year
        figure_size: passed through to user_theme()
        add_points: force points to be drawn regardless of how sparse the data is
        plot_kwargs: extra aesthetics (eg. group, colour) passed to p9.aes()

    Returns the result of user_theme() applied to the plot.
    Raises Http404 if the dataframe is None or has no rows.
    """
    # an empty dataframe must be rejected here too: it would otherwise divide by zero below
    if df is None or len(df) == 0:
        print(f"No usable data/plot for {title}")
        raise Http404(f"No data for {title}")

    # percentage of missing metric values: always within 0..100 since df is known to be non-empty
    pct_na = (df["metric"].isnull().sum() / len(df)) * 100.0

    plot = (p9.ggplot(df, p9.aes("date", "metric", **plot_kwargs)) +
            p9.geom_path(size=1.2) +
            p9.scale_y_continuous(labels=label_shorten))
    if dates_are_yearly:
        # yearly data? if so only print the year on the x-axis
        plot += p9.scale_x_datetime(labels=date_format("%Y"))
    # if pct_na is too high, geom_path() may be unable to draw a line (each value is surrounded by nan preventing a path)
    # so we use geom_point() to highlight the sparse nature of the data
    if pct_na >= 30.0 or add_points or df["metric"].count() <= 3:
        plot += p9.geom_point(size=3.0)
    return user_theme(plot, y_axis_label="Value", figure_size=figure_size)
Beispiel #2
0
def plot_cumulative_returns(wanted_stocks: Iterable[str],
                            ld: LazyDictionary) -> p9.ggplot:
    """
    Plot, per date, the summed positive and summed negative values of the wanted stocks as
    green/red bars and overlay the individual large movers (beyond +/-5.0) as points.

    Parameters:
        wanted_stocks: index labels to keep from ld["cip_df"]
        ld: lazy dictionary providing "cip_df"; only columns whose name starts with a digit
            are used (presumably YYYY-mm-dd dates since they are parsed with that format)

    Returns the result of user_theme() applied to the plot.
    """
    df = ld["cip_df"]
    # raw string: "\d" in a plain string literal is an invalid escape sequence
    df = df.filter(wanted_stocks, axis=0).filter(regex=r"^\d", axis=1)
    # keep column order (a set has no defined order) for a deterministic melt
    dates = list(df.columns)

    # assign() works on a copy, so the non-numeric asx_code column does not leak into df,
    # which is aggregated below and must contain only the date columns
    movers = df.assign(asx_code=df.index).melt(id_vars="asx_code",
                                               value_vars=dates,
                                               var_name="fetch_date")
    # ignore small movers; copy() so the column assignment below does not write into a slice
    is_big_mover = (movers["value"] < -5.0) | (movers["value"] > 5.0)
    movers = movers[is_big_mover].copy()
    movers["fetch_date"] = pd.to_datetime(movers["fetch_date"],
                                          format="%Y-%m-%d")

    # need to have separate dataframe's for positive and negative stocks - otherwise plotnine plot will be wrong
    pos_df = df.agg([positive_sum]).melt(value_vars=dates,
                                         var_name="fetch_date")
    neg_df = df.agg([negative_sum]).melt(value_vars=dates,
                                         var_name="fetch_date")
    pos_df["fetch_date"] = pd.to_datetime(pos_df["fetch_date"],
                                          format="%Y-%m-%d")
    neg_df["fetch_date"] = pd.to_datetime(neg_df["fetch_date"],
                                          format="%Y-%m-%d")

    plot = (p9.ggplot() + p9.geom_bar(
        p9.aes(x="fetch_date", y="value"),
        data=pos_df,
        stat="identity",
        fill="green",
    ) + p9.geom_bar(
        p9.aes(x="fetch_date", y="value"),
        data=neg_df,
        stat="identity",
        fill="red",
    ) + p9.geom_point(
        p9.aes(
            x="fetch_date",
            y="value",
            fill="asx_code",
        ),
        data=movers,
        size=3,
        position=p9.position_dodge(width=0.4),
        colour="black",
    ))
    return user_theme(
        plot,
        y_axis_label="Cumulative Return (%)",
        legend_position="right",
        asxtrade_want_cmap_d=False,
        # points (stocks) are filled with the user-chosen theme, but everything else is fixed
        asxtrade_want_fill_d=True,
    )
Beispiel #3
0
 def plot_metrics(df: pd.DataFrame, use_short_labels=False, **kwargs):
     """
     Draw one coloured line (with points) per distinct "metric" value, plotting "value" over "date".

     The figure height grows with the number of distinct metrics. If use_short_labels is True
     the y-axis labels are formatted with label_shorten. Any extra keyword arguments are
     forwarded to user_theme(), whose result is returned.
     """
     # figure height: 1.5 units per distinct metric, truncated to an int
     height = int(df["metric"].nunique() * 1.5)

     aesthetics = p9.aes(x="date", y="value", colour="metric")
     chart = p9.ggplot(df, aesthetics) + p9.geom_line(size=1.3) + p9.geom_point(size=3)
     if use_short_labels:
         chart += p9.scale_y_continuous(labels=label_shorten)

     return user_theme(
         chart,
         subplots_adjust={"left": 0.2},
         figure_size=(12, height),
         **kwargs,
     )
Beispiel #4
0
        def make_plot(ld: LazyDictionary):
            """
            Fetch every indicator for the chosen country and plot them as one facet per dataset.

            Indicators which fail to load, or which yield no rows, are skipped (a warning and
            traceback are printed for load failures). worldbank_plot() is handed None when no
            indicator produced any data.
            """
            # function-scope import so this closure does not depend on a module-level alias
            import pandas as pd

            frames = []
            has_yearly = False
            add_points = False
            for i in indicators:
                try:
                    df = fetch_data(
                        i, [country],
                        fill_missing=lambda df: df.resample("AS").asfreq())
                except Exception:  # Data load fail? best-effort: skip this indicator
                    print(
                        f"WARNING: unable to load worldbank dataset {i} - ignored"
                    )
                    traceback.print_exc()
                    continue
                if df is None or len(df) == 0:
                    continue

                df["dataset"] = f"{i.name} ({i.wb_id})"
                if "-yearly-" in i.tag:
                    has_yearly = True

                # if any indicator is sparse, we enable points for all indicators to be able to see them all
                pct_na = (df["metric"].isnull().sum() / len(df)) * 100.0
                if pct_na > 30.0 or df["metric"].count() <= 3:
                    add_points = True
                frames.append(df)

            # one concat instead of repeated DataFrame.append() (removed in pandas 2.0)
            plot_df = pd.concat(frames) if frames else None
            n_datasets = len(frames)

            figure_size = (12, n_datasets * 1.5)
            kwargs = {"group": "dataset", "colour": "dataset"}
            plot = worldbank_plot(
                plot_df,
                "",
                has_yearly,
                figure_size=figure_size,
                add_points=add_points,
                **kwargs,
            )
            plot += p9.facet_wrap("~dataset", ncol=1, scales="free_y")

            return user_theme(plot, figure_size=figure_size)
Beispiel #5
0
    def form_valid(self, form):
        """
        Filter the data flow's dataframe by each submitted form value, then render the template
        with a point+line plot of whatever rows remain.

        Each cleaned_data key has its first len("dimension_") characters stripped to obtain the
        dataframe column to filter on. Filtering stops early (with a warning message on the
        request) as soon as no rows are left. Statistics for each filter step are passed to the
        template as "filter_performance".

        Raises Http404 if the dataframe cannot be loaded or is empty.
        """
        df = fetch_dataframe(self.data_flow.name)
        if df is None or len(df) == 0:
            raise Http404(f"Unable to load dataframe: {self.data_flow}")

        filter_performance = []
        for k, v in form.cleaned_data.items():
            rows_at_start = len(df)
            print(
                f"Filtering rows for {k}: total {rows_at_start} rows at start")
            k = k[len("dimension_"):]
            # only report the distinct values available when the frame is small enough
            if rows_at_start < 10000:
                unique_values_left = df[k].unique()
            else:
                unique_values_left = set()
            df = df[df[k] == v]
            rows_at_end = len(df)

            filter_performance.append(
                (k, v, rows_at_start, rows_at_end, unique_values_left))
            print(f"After filtering: now {rows_at_end} rows")
            if len(df) == 0:
                warning(self.request,
                        f"No rows of data left after filtering: {k} {v}")
                break

        plot = None
        plot_title = ""
        if len(df) > 0:
            plot_title, x_axis_column, y_axis_column, df = detect_dataframe(
                df, self.data_flow)
            plot = (p9.ggplot(df, p9.aes(x=x_axis_column, y=y_axis_column)) +
                    p9.geom_point() + p9.geom_line())
            plot = user_theme(plot)

        context = self.get_context_data()
        # the key includes the data flow name so that two flows filtered by the same values
        # do not share a key; values are stringified so non-str cleaned values can be joined
        key_parts = [str(self.data_flow.name)]
        key_parts.extend(sorted(str(v) for v in form.cleaned_data.values()))
        cache_key = "-".join(key_parts) + "-ecb-plot"
        # NOTE(review): plot is None when filtering left no rows - confirm cache_plot() copes with that
        context.update({
            "dataflow": self.data_flow,
            "dataflow_name": self.data_flow.name,
            "filter_performance": filter_performance,
            "plot_title": plot_title,
            "plot_uri": cache_plot(cache_key, lambda: plot),
        })
        return render(self.request, self.template_name, context)
Beispiel #6
0
 def plot(df: pd.DataFrame) -> p9.ggplot:
     """
     Draw "value" over "fetch_date" as a line, one facet (and one colour) per "sector_name".

     Facets are laid out in two columns with independent y scales and y-axis labels are
     formatted with label_shorten. Returns the result of user_theme() applied to the plot.
     """
     aesthetics = p9.aes(
         x="fetch_date",
         y="value",
         color="sector_name",
     )
     chart = p9.ggplot(df, aesthetics)
     chart = chart + p9.geom_line(size=1.2)
     chart = chart + p9.facet_wrap("~sector_name", ncol=2, scales="free_y")
     chart = chart + p9.scale_y_continuous(labels=label_shorten)

     theme_options = {
         "y_axis_label": "Total sector earnings ($AUD, positive contributions only)",
         "figure_size": (12, 14),
         "subplots_adjust": {"wspace": 0.25},
     }
     return user_theme(chart, **theme_options)
Beispiel #7
0
 def make_plot(self, df: pd.DataFrame, timeframe: Timeframe) -> p9.ggplot:
     """
     Plot each of self.value_vars over time as a line, one facet per variable stacked in a single column.

     Note that df is modified in place: a "Date" column is added (parsed from the index using
     the YYYY-mm-dd format) and, when both "High" and "Low" columns exist, a "Range" column
     holding their difference. The timeframe parameter is not used here.
     Returns the result of user_theme() applied to the plot.
     """
     if {"High", "Low"}.issubset(df.columns):
         df["Range"] = df["High"] - df["Low"]
     df["Date"] = pd.to_datetime(df.index, format="%Y-%m-%d")

     # long format: one row per (Date, variable) pair
     long_df = df.melt(id_vars="Date",
                       value_vars=self.value_vars,
                       value_name="value")
     n_variables = long_df["variable"].nunique()

     aesthetics = p9.aes(x="Date", y="value", group="variable", color="variable")
     facets = p9.facet_wrap(
         "~variable",
         ncol=1,
         nrow=n_variables,
         scales="free_y",
     )
     chart = p9.ggplot(long_df, aesthetics) + p9.geom_line(size=1.3) + facets
     return user_theme(chart)
Beispiel #8
0
 def cluster_plot(ld: LazyDictionary):
     """
     Scatter plot of "volatility" against "return", one facet (and one colour) per "cluster_id".

     The data comes from make_kmeans_cluster_dataframe() for the enclosing scope's timeframe,
     chosen_k and asx_codes; only the last element of its result is plotted. The ld parameter
     is not used here. Returns the result of user_theme() applied to the plot.
     """
     # only the per-stock dataframe is plotted: the other returned values are not needed
     _, _, _, _, data_df = make_kmeans_cluster_dataframe(
         timeframe, chosen_k, asx_codes
     )
     plot = (
         p9.ggplot(
             data_df, p9.aes("return", "volatility", colour="factor(cluster_id)")
         )
         + p9.geom_point(size=3)
         + p9.facet_wrap("~cluster_id", ncol=3, scales="free")
     )
     return user_theme(
         plot,
         x_axis_label="Returns (%)",
         y_axis_label="Volatility (%)",
         figure_size=(15, 15),
         subplots_adjust={"hspace": 0.15, "wspace": 0.15},
     )
Beispiel #9
0
    def plot_abs_dataframe(self, df: pd.DataFrame) -> p9.ggplot:
        """
        Plot "value" against "TIME_PERIOD" as points, using the varying columns as facets.

        Columns holding exactly one distinct value are recorded in self.fixed_datapoints as
        "column=value". Columns with more than one distinct value become candidate facets. At
        most two facets are used (those with the most distinct values); when there are more
        candidates, the one with the fewest distinct values becomes the group/shape of each plot.

        Returns the result of user_theme() applied to the plot.
        """
        facets = []
        n_per_facet = {}
        print(df)
        for col in df.columns:
            try:
                n_values = df[col].nunique()
                if n_values == 1 and col not in [
                        "TIME_PERIOD",
                        "value",
                        "Measure",
                        "OBS_COMMENT",
                ]:
                    self.fixed_datapoints.add(f"{col}={df.at[0, col]}")
                elif n_values > 1 and col not in [
                        "value",
                        "TIME_PERIOD",
                        "OBS_COMMENT",
                ]:
                    facets.append(col)
                    n_per_facet[col] = n_values
            except Exception:  # best-effort: a column which cannot be inspected is skipped
                print(f"Ignoring unusable column: {col}")
                continue

        extra_args = {}
        need_shape = False
        if len(facets) > 2:
            # can only use two variables as plotting facets, third value will be used as a group on each plot
            # any more facets is not supported at this stage
            sorted_facets = sorted(n_per_facet.keys(),
                                   key=lambda k: n_per_facet[k])
            facets = sorted_facets[-2:]
            extra_args.update({
                "group": sorted_facets[0],
                "color": facets[0],
                "shape": sorted_facets[0],
            })
            need_shape = True
            print(f"Using {facets} as facets, {extra_args} as series")
        else:
            if len(facets) > 0:
                extra_args.update({"color": facets[0]})

        # compute figure size to give enough room for each plot
        mult = 1
        for facet in facets:
            mult *= n_per_facet[facet]
        if facets:  # no facets at all means a single plot: avoid dividing by zero
            mult /= len(facets)
        nrow = int(mult + 1)

        # facet column names must not have spaces in them as this is not permitted by plotnine facet formulas
        if len(facets) > 0:
            new_facets = []
            for f in facets:
                if " " in f:
                    new_name = f.replace(" ", "_")
                    df = df.rename(columns={f: new_name})
                    new_facets.append(new_name)
                else:
                    new_facets.append(f)
            facets = new_facets
            # the colour aesthetic refers to the first facet, so it must follow any rename
            if "color" in extra_args:
                extra_args.update({"color": facets[0]})
            print(f"Renamed facet columns due to whitespace: {facets}")

        plot = p9.ggplot(df, p9.aes(x="TIME_PERIOD", y="value", **
                                    extra_args)) + p9.geom_point(size=3)

        if len(facets) > 0 and len(facets) <= 2:
            facet_str = "~" + " + ".join(facets[:2])
            print(f"Using facet formula: {facet_str}")
            plot += p9.facet_wrap(facet_str, ncol=len(facets), scales="free_y")

        plot_theme = {
            "figure_size": (12, int(nrow * 1.5)),
        }
        # two columns of plots? if so, make sure there is space for axis labels
        if len(facets) == 2:
            plot_theme.update({"subplots_adjust": {"wspace": 0.2}})
        if need_shape:
            plot += p9.scale_shape(guide="legend")
            # colour legend is not useful since it is included in the facet title
            plot += p9.guides(colour=False)
            plot_theme.update({"legend_position": "right"})
        return user_theme(plot, **plot_theme)