Exemple #1
0
def test_wrong_bases():
    """Drawing must warn whenever the requested log ticks do not fit the axis.

    Each case pairs ``annotation_logticks`` with an axis it cannot describe:
    an untransformed axis, an axis transformed with a different base, and
    discrete axes with and without ``coord_flip``. Every plot is drawn inside
    ``pytest.warns(PlotnineWarning)``.
    """
    def expect_warning(plot):
        # The warning has to be raised while the plot is being drawn.
        with pytest.warns(PlotnineWarning):
            plot.draw_test()

    # x axis not transformed
    expect_warning(
        ggplot(df, aes('x', 'x'))
        + annotation_logticks(sides='b', size=.75, base=10)
        + geom_point())

    # x axis transformed, but ticks requested for a different base
    expect_warning(
        ggplot(df, aes('x', 'x'))
        + annotation_logticks(sides='b', size=.75, base=10)
        + scale_x_continuous(trans=log_trans(8))
        + geom_point())

    # The remaining cases put a categorical copy of x on one of the axes.
    discrete_df = df.assign(
        discrete=pd.Categorical([str(value) for value in df['x']]))
    discrete_cases = [
        # (x aesthetic, y aesthetic, tick side, add coord_flip?)
        ('discrete', 'x', 'b', False),   # x axis is discrete
        ('x', 'discrete', 'l', False),   # y axis is discrete
        ('discrete', 'x', 'b', True),    # x axis is discrete + coord_flip
        ('x', 'discrete', 'l', True),    # y axis is discrete + coord_flip
    ]
    for x_var, y_var, side, flipped in discrete_cases:
        plot = (ggplot(discrete_df, aes(x_var, y_var))
                + annotation_logticks(sides=side, size=.75, base=None)
                + geom_point())
        if flipped:
            plot = plot + coord_flip()
        expect_warning(plot)
    def plot_data_point(self, data_point_ix, use_base=True,
                        figure_size=(8, 6)):
        """ Draw a horizontal bar chart of one data point's Shapley values

        Parameters
        ----------
        data_point_ix :  int
            Index label of the row to plot; it is looked up with ``.loc``
            in the frame returned by ``get_shapley_values()``.
        use_base : boolean, optional default=True
            When False, the "BASE" column is dropped before plotting.
        figure_size : tuple, optional default=(8, 6)
            Forwarded to ``theme(figure_size=...)``.

        Returns
        -------
        g : ggplot object

        Raises
        ------
        Exception
            If ``self._shapley_values`` is None.
        """
        # Nothing to plot until Shapley values have been stored.
        if self._shapley_values is None:
            raise Exception("No Shapley values are available")

        shap_row = self.get_shapley_values().loc[[data_point_ix]]
        if not use_base:
            shap_row = shap_row.drop("BASE", axis=1)

        # Long format: one (index, variable, value) record per column.
        long_form = shap_row.reset_index(drop=False).melt(id_vars="index")
        title = "Shapley values (Index: " + str(data_point_ix) + ")"

        g = ggplot(long_form, aes(x="variable", y="value", fill="variable"))
        g = g + geom_bar(stat="identity")
        g = g + labs(title=title,
                     x="Feature",
                     y="Shapley value",
                     fill="Feature")
        g = g + coord_flip()
        g = g + theme(figure_size=figure_size)

        return g
Exemple #3
0
def test_annotation_logticks_coord_flip():
    """Bottom log ticks with log10 x/y scales under ``coord_flip``.

    The plot is compared with the name 'annotation_logticks_coord_flip'
    (presumably resolved by the test suite's plot comparison; confirm).
    """
    layers = [
        annotation_logticks(sides='b', size=.75),
        geom_point(),
        scale_x_log10(),
        scale_y_log10(),
        coord_flip(),
        # Coloured grid lines make major and minor breaks easy to tell apart.
        theme(panel_grid_minor=element_line(color='green'),
              panel_grid_major=element_line(color='red')),
    ]
    p = ggplot(df, aes('x', 'x'))
    for layer in layers:
        p = p + layer

    assert p == 'annotation_logticks_coord_flip'
Exemple #4
0
def plot_breakdown(cip_df: pd.DataFrame):
    """Stacked bar plot of rising versus falling stock counts per sector.

    Columns of cip_df whose name starts with 'bin_' are ignored. The remaining
    columns are summed per row, and a row counts as 'up' when that sum is >= 0.
    Returns None when nothing in cip_df matches an entry of stocks_by_sector();
    otherwise returns the result of plot_as_inline_html_data() for the plot.
    """
    bin_columns = [name for name in cip_df.columns if name.startswith('bin_')]
    row_totals = cip_df.drop(columns=bin_columns).sum(axis='columns')
    df = pd.DataFrame(row_totals, columns=['sum'])
    df = df.merge(stocks_by_sector(), left_index=True, right_on='asx_code')

    # Nothing merged: none of the stocks has a sector (e.g. ETFs).
    if not len(df):
        return None

    assert set(df.columns) == {'sum', 'asx_code', 'sector_name'}
    df['increasing'] = ['up' if total >= 0.0 else 'down' for total in df['sum']]

    # Category order follows value_counts(), i.e. the number of stocks per sector.
    sectors_by_count = df['sector_name'].value_counts().index.tolist()
    df = df.assign(
        sector_name_cat=pd.Categorical(df['sector_name'], categories=sectors_by_count)
    )

    appearance = p9.theme(
        axis_text_y=p9.element_text(size=7),
        subplots_adjust={"left": 0.2, 'right': 0.85},
        legend_title=p9.element_blank(),
    )
    plot = p9.ggplot(df, p9.aes(x='factor(sector_name_cat)', fill='factor(increasing)'))
    plot = plot + p9.geom_bar()
    plot = plot + p9.labs(x="Sector", y="Number of stocks")
    plot = plot + appearance
    plot = plot + p9.coord_flip()
    return plot_as_inline_html_data(plot)
Exemple #5
0
def plot_bargraph(count_plot_df, plot_df):
    """
    Build the horizontal bar graph of lemma counts, one facet per repository
    Arguments:
        count_plot_df - dataframe with "lemma", "count" and "repository" columns
        plot_df - dataframe with "lemma" and "odds_ratio" columns; sorted by
                  odds ratio it fixes the order of the lemmas on the axis
    Returns:
        the plotnine plot object
    """
    # Lemmas in ascending odds-ratio order become the discrete axis limits.
    lemma_order = plot_df.sort_values("odds_ratio",
                                      ascending=True).lemma.tolist()
    counts = count_plot_df.astype({"count": int})

    graph = p9.ggplot(counts, p9.aes(x="lemma", y="count"))
    graph = graph + p9.geom_col(position=p9.position_dodge(width=0.5),
                                fill="#253494")
    graph = graph + p9.coord_flip()
    graph = graph + p9.facet_wrap("repository", scales='free_x')
    graph = graph + p9.scale_x_discrete(limits=lemma_order)
    graph = graph + p9.scale_y_continuous(labels=custom_format('{:,.0g}'))
    graph = graph + p9.labs(x=None)
    graph = graph + p9.theme_seaborn(context='paper',
                                     style="ticks",
                                     font="Arial",
                                     font_scale=0.95)
    graph = graph + p9.theme(
        # 640 x 480
        figure_size=(6.66, 5),
        strip_background=p9.element_rect(fill="white"),
        strip_text=p9.element_text(size=12),
        axis_title=p9.element_text(size=12),
        axis_text_x=p9.element_text(size=10),
    )
    return graph
Exemple #6
0
def plot_boxplot_series(df, normalisation_method=None):
    """
    Render one horizontal box plot per column of df, treating each row as an
    independent observation (ie. different company), so that a shift in
    performance across the columns becomes visible.

    normalisation_method should be one of the values present in
    SectorSentimentSearchForm.normalisation_choices:
      None or "1" - plot df unchanged
      "2"         - min/max scale each column
      other       - divide each column by its maximum

    The plot maps x to "fetch_date" after melting (presumably the name of
    df.columns; confirm against the caller). Returns whatever user_theme()
    returns for the plot.
    """
    if normalisation_method == "2":
        col_min = df.min()
        normalized_df = (df - col_min) / (df.max() - col_min)
        y_label = "Percentage change (min/max. scaled)"
    elif normalisation_method in (None, "1"):
        normalized_df = df
        y_label = "Percentage change"
    else:
        # div by max if all else fails...
        normalized_df = df / df.max(axis=0)
        y_label = "Percentage change (normalised by dividing by max)"

    # Figure height grows with the number of box plots (one per column).
    figure_size = (12, len(df.columns) / 5)
    long_form = normalized_df.melt(ignore_index=False).dropna()

    plot = p9.ggplot(long_form, p9.aes(x="fetch_date", y="value"))
    plot = plot + p9.geom_boxplot(outlier_colour="blue")
    plot = plot + p9.coord_flip()
    return user_theme(plot, y_axis_label=y_label, figure_size=figure_size)
Exemple #7
0
def test_coord_flip():
    """A left-side rug drawn under ``coord_flip``, compared with 'coord_flip'."""
    rug = geom_rug(aes('x', 'y'), size=2, sides='l')
    p = ggplot(df) + rug + coord_flip()

    # The shared test theme is applied only for the comparison.
    themed = p + _theme
    assert themed == 'coord_flip'
Exemple #8
0
def plot_pointplot(plot_df, y_axis_label="", use_log10=False, limits=[0, 3.2]):
    """
    Build the point-range plot of odds ratios per lemma
    Arguments:
        plot_df - dataframe with "lemma", "odds_ratio", "lower_odds" and
                  "upper_odds" columns
        y_axis_label - the label for the y axis
        use_log10 - use log10 for the y axis?
        limits - y axis limits, only applied when use_log10 is False
    Returns:
        the plotnine plot object
    """
    # Lemmas in ascending odds-ratio order become the discrete axis limits.
    lemma_order = plot_df.sort_values("odds_ratio",
                                      ascending=True).lemma.tolist()

    if use_log10:
        y_scale = p9.scale_y_log10()
    else:
        y_scale = p9.scale_y_continuous(limits=limits)

    layers = [
        p9.geom_pointrange(p9.aes(ymin="lower_odds", ymax="upper_odds"),
                           position=p9.position_dodge(width=1),
                           size=0.3,
                           color="#253494"),
        p9.scale_x_discrete(limits=lemma_order),
        y_scale,
        # Reference line at an odds ratio of 1.
        p9.geom_hline(p9.aes(yintercept=1), linetype='--', color='grey'),
        p9.coord_flip(),
        p9.theme_seaborn(context='paper', style="ticks", font_scale=1,
                         font='Arial'),
        p9.theme(
            # 640 x 480
            figure_size=(6.66, 5),
            panel_grid_minor=p9.element_blank(),
            axis_title=p9.element_text(size=12),
            axis_text_x=p9.element_text(size=10)),
        p9.labs(x=None, y=y_axis_label),
    ]

    graph = p9.ggplot(plot_df, p9.aes(x="lemma", y="odds_ratio"))
    for layer in layers:
        graph = graph + layer
    return graph
Exemple #9
0
def cell_cycle_phase_barplot(adata, palette='Set2'):
    """Bar plot of the number of cells in each phase of the cell cycle

    See also: cell_cycle_phase_pieplot for the matplotlib pie chart


    Parameters
    -----------
    adata: AnnData
        The AnnData object being used for the analysis. Must be previously
        evaluated by `tl.annotate_cell_cycle`; its `obs` table is copied and
        the `cell_cycle_phase` column is read from the copy.
    palette: str
        Name of the qualitative palette handed to `scale_fill_brewer`.

    Returns
    -----------
    A plotnine barplot with the total counts of cell in each phase of the
    cell cycle.

    """
    phase_order = ['G1 post-mitotic', 'G1 pre-replication', 'S/G2/M']

    # Work on a copy so the phase column of adata.obs itself is not replaced.
    plt_data = adata.obs.copy()
    plt_data['cell_cycle_phase'] = pd.Categorical(
        plt_data['cell_cycle_phase'], categories=phase_order)

    layers = [
        geom_bar(),
        coord_flip(),
        guides(fill=False),
        labs(y='', x='Cell cycle phase'),
        theme_light(),
        theme(panel_grid_major_y=element_blank(),
              panel_grid_minor_y=element_blank(),
              panel_grid_major_x=element_line(size=1.5),
              panel_grid_minor_x=element_line(size=1.5)),
        scale_fill_brewer(type='qual', palette=palette),
    ]

    cycle_plot = ggplot(plt_data,
                        aes('cell_cycle_phase', fill='cell_cycle_phase'))
    for layer in layers:
        cycle_plot = cycle_plot + layer

    return cycle_plot
Exemple #10
0
def plot_point_scores(stock: str, sector_companies, all_stocks_cip: pd.DataFrame, rules):
    """
    Visualise the stock in terms of point scores as described on the stock view page.

    For every column (date) of all_stocks_cip each callable in rules is called
    once with a shared state dict and its return value is added both to the
    running total for the stock and to a per-rule tally. All rows in
    all_stocks_cip are used to calculate the market average on a given date,
    whilst only sector_companies are used to calculate the sector average.

    Parameters:
        stock - code of the stock to score (at least 3 characters); must be a
                row label of all_stocks_cip
        sector_companies - row labels of all_stocks_cip used for the sector average
        all_stocks_cip - dataframe with one row per stock and one column per date
        rules - non-empty collection of callables taking the state dict and
                returning a number of points

    Returns:
        a tuple (point_score_plot, net_rule_contributors) where the first item
        is the result of plot_series() over the cumulative points and the second
        is the result of plot_as_inline_html_data() for the per-rule bar chart

    Raises:
        AssertionError if stock is shorter than 3 characters, all_stocks_cip is
        None, or rules is None or empty
    """
    assert len(stock) >= 3
    assert all_stocks_cip is not None
    assert rules is not None and len(rules) > 0

    # Rule labels do not change from day to day, so derive them once: the
    # callable's name without a leading "rule_".
    rule_labels = []
    for rule in rules:
        label = rule.__name__
        if label.startswith("rule_"):
            label = label[5:]
        rule_labels.append(label)

    rows = []
    points = 0
    day_low_high_df = day_low_high(stock, all_dates=all_stocks_cip.columns)
    state = {
        "day_low_high_df": day_low_high_df,  # never changes each day, so we init it here
        "all_stocks_change_in_percent_df": all_stocks_cip,
        "stock": stock,
        "daily_range_threshold": 0.20,  # 20% at either end of the daily range gets a point
    }
    net_points_by_rule = defaultdict(int)
    for date in all_stocks_cip.columns:
        day_changes = all_stocks_cip[date]
        state.update(
            {
                "market_avg": day_changes.mean(),
                "sector_avg": day_changes.filter(items=sector_companies).mean(),
                "stock_move": all_stocks_cip.at[stock, date],
                "date": date,
            }
        )
        # Evaluate each rule exactly once per date and reuse the result for
        # both the running total and the per-rule breakdown (the rules were
        # previously called twice per date).
        rule_points = [rule(state) for rule in rules]
        points += sum(rule_points)
        for label, value in zip(rule_labels, rule_points):
            net_points_by_rule[label] += value
        rows.append({"points": points, "stock": stock, "date": date})

    df = pd.DataFrame.from_records(rows)
    df["date"] = pd.to_datetime(df["date"])
    point_score_plot = plot_series(df, x="date", y="points")

    contributions = [
        {"rule": str(label), "net_points": total}
        for label, total in net_points_by_rule.items()
    ]
    df = pd.DataFrame.from_records(contributions)
    net_rule_contributors_plot = (
        p9.ggplot(df, p9.aes(x="rule", y="net_points"))
        + p9.labs(x="Rule", y="Contribution to points by rule")
        + p9.geom_bar(stat="identity")
        + p9.theme(axis_text_y=p9.element_text(size=7), subplots_adjust={"left": 0.2})
        + p9.coord_flip()
    )
    return point_score_plot, plot_as_inline_html_data(net_rule_contributors_plot)
def test_annotation_stripes_coord_flip():
    """Default stripes behind points on a discrete x axis, under ``coord_flip``.

    Vertical lines at 0.5 ... 3.5 are drawn as well, and the plot is compared
    with the name 'annotation_stripes_coord_flip'.
    """
    points = geom_point(aes('factor(x)', 'y'))
    boundaries = geom_vline(xintercept=[0.5, 1.5, 2.5, 3.5])
    p = ggplot(df) + annotation_stripes() + points + boundaries + coord_flip()

    assert p == 'annotation_stripes_coord_flip'
Exemple #12
0
def test_annotation_logticks_coord_flip_discrete_bottom():
    """Bottom log ticks with a log10 x scale, a discrete y and ``coord_flip``."""
    # Discrete companion column: 'A' followed by the x value.
    labels = ['A' + str(value) for value in df['x']]
    df2 = df.assign(discrete=pd.Categorical(labels))

    grid_colours = theme(panel_grid_minor=element_line(color='green'),
                         panel_grid_major=element_line(color='red'))
    p = ggplot(df2, aes('x', 'discrete'))
    p = p + annotation_logticks(sides='b', size=.75)
    p = p + geom_point()
    p = p + scale_x_log10()
    p = p + coord_flip()
    p = p + grid_colours

    assert p == 'annotation_logticks_coord_flip_discrete_bottom'
Exemple #13
0
def plotfreq(freqdf):
    '''
    ----------

    Plot the frequencies of a table produced by freq() as a horizontal bar
    chart. If the input fails a check, a message is printed and None is
    returned; no exception is raised.

    Parameters
    ----------
    freqdf  dataframe generated by freq(): exactly four columns, the last
            three named 'freq', 'perc' and 'cump', and at most 100 rows

    Returns
    -------
    Bar chart with frequencies & percentages in descending order,
    or None when an input check fails

    Example
    -------
    import exploretransform as et
    df, X, y = et.loadboston()
    et.plotfreq(et.freq(X['town']))

    Warning
    -------
    This function will likely not plot more than 100 unique levels properly.

    ----------
    '''

    # input checks: report the problem and bail out with None
    if not isinstance(freqdf, pd.DataFrame):
        print("\nFunction only accepts dataframes\n")
        return None

    if len(freqdf.columns) != 4:
        print("\nInput must be a dataframe generated by freq()\n")
        return None

    if list(freqdf.columns[1:4]) != ['freq', 'perc', 'cump']:
        print("\nInput must be a dataframe generated by freq()\n")
        return None

    if len(freqdf) > 100:
        print("\nUnable to plot more than 100 items")
        return None

    # label for plot: "<freq>   [ <perc>% ]"
    lbl = freqdf['freq'].astype(str).str.cat(
        '[ ' + freqdf['perc'].astype(str) + '%' + ' ]', sep='   ')
    # aes expression that orders the first column's levels by frequency
    aesx = 'reorder(' + freqdf.columns[0] + ', freq)'

    # build plot
    plot = (pn.ggplot(freqdf) +
            pn.aes(x=aesx, y='freq', fill='freq', label=lbl) +
            pn.geom_bar(stat='identity') + pn.coord_flip() +
            pn.theme(axis_text_y=pn.element_text(size=6, weight='bold'),
                     legend_position='none') +
            pn.labs(x=freqdf.columns[0], y="Freq") +
            pn.scale_fill_gradient2(mid='bisque', high='blue') +
            pn.geom_text(size=6, nudge_y=.7))

    return plot
Exemple #14
0
def barplot(df, key, figsize=(8, 6), vertical=False):
    """Bar chart of how often each value of column ``key`` occurs in ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Source data; it is not modified (the reordering below happens on a
        copy).
    key : str
        Name of the column whose values are counted.
    figsize : tuple, optional
        (width, height) stored in ``p9.options.figure_size``. Note that this
        is a global plotnine option and stays set after the call.
    vertical : bool, optional
        When True the two entries of ``figsize`` are swapped and
        ``coord_flip()`` is added to the plot.

    Returns
    -------
    The plotnine plot: bars with the count written at mid-height of each bar.
    """
    if vertical:
        figsize = tuple(reversed(figsize))
    p9.options.figure_size = figsize

    # value_counts() lists the most frequent value first; reversing makes it
    # the last category.
    top_l = df[key].value_counts().index.tolist()
    # Reorder on a copy so the caller's DataFrame keeps its original column.
    plot_df = df.copy()
    plot_df[key] = pd.Categorical(plot_df[key], categories=top_l[::-1])

    # Keyword arguments work regardless of which of data/mapping the installed
    # plotnine version expects first.
    fig = p9.ggplot(data=plot_df,
                    mapping=p9.aes(x=key, y='..count..', label='..count..'))
    fig += p9.geom_bar(alpha=0.5)
    if vertical:
        fig += p9.coord_flip()
    fig += p9.stat_count(geom="text",
                         position=p9.position_stack(vjust=0.5),
                         size=10)
    fig += p9.theme_classic()
    return fig
Exemple #15
0
def plot_boxplot_series(df, normalisation_method=None):
    """
    Treating each column as a separate boxplot and each row as an independent observation
    (ie. different company)
    render a series of box plots to identify a shift in performance from the observations.
    normalisation_method should be one of the values present in
    SectorSentimentSearchForm.normalisation_choices:
      None or "1" - plot df unchanged
      "2"         - min/max scale each column
      other       - divide each column by its maximum

    Returns a tuple (plot, winners): plot is the result of plot_as_inline_html_data()
    for the box plots; winners is a list of (row label, number of columns in which
    the row was above that column's mean, row sum) for rows with a positive sum,
    sorted by row sum in descending order.
    """
    # compute star performers: those who are above the mean on a given day counted over all days
    column_means = df.mean(axis=0)  # loop-invariant, so computed once up front
    count = defaultdict(int)
    for col in df.columns:
        winners = df[df[col] > column_means[col]][col]
        for winner in winners.index:
            count[winner] += 1

    winner_results = []
    for asx_code, n_wins in count.items():
        total = df.loc[asx_code].sum()
        # avoid "dead cat bounce" stocks which fall spectacularly and then post major increases in percentage terms
        if total > 0.0:
            winner_results.append((asx_code, n_wins, total))

    # and plot the normalised data
    if normalisation_method is None or normalisation_method == "1":
        normalized_df = df
        y_label = "Percentage change"
    elif normalisation_method == "2":
        normalized_df = (df - df.min()) / (df.max() - df.min())
        y_label = "Percentage change (min/max. scaled)"
    else:
        normalized_df = df / df.max(axis=0)  # div by max if all else fails...
        y_label = "Percentage change (normalised by dividing by max)"

    # figure height grows with the number of box plots (one per column)
    n_inches = len(df.columns) / 5
    melted = normalized_df.melt(ignore_index=False).dropna()
    plot = (
        p9.ggplot(melted, p9.aes(x="fetch_date", y="value"))
        + p9.geom_boxplot(outlier_colour="blue")
        + p9.theme(
            axis_text_x=p9.element_text(size=7),
            axis_text_y=p9.element_text(size=7),
            figure_size=(12, n_inches),
        )
        + p9.labs(x="Date (YYYY-MM-DD)", y=y_label)
        + p9.coord_flip()
    )
    return (
        plot_as_inline_html_data(plot),
        # kept as reversed(sorted(...)) so that the order of ties is unchanged
        list(reversed(sorted(winner_results, key=lambda t: t[2]))),
    )
def test_annotation_stripes_coord_flip():
    """Three-colour translucent stripes behind jittered mtcars points, flipped.

    ``gear`` and ``am`` are made categorical, black vertical lines are drawn
    at 0.5 ... 3.5, and the plot is compared with the name
    "annotation_stripes_coord_flip".
    """
    pdf = mtcars.assign(gear=pd.Categorical(mtcars.gear),
                        am=pd.Categorical(mtcars.am))

    p = ggplot(pdf)
    p = p + annotation_stripes(fills=["#AAAAAA", "#FFFFFF", "#7F7FFF"],
                               alpha=0.3)
    p = p + geom_jitter(aes("gear", "wt", shape="gear", color="am"),
                        random_state=5)
    for position in (0.5, 1.5, 2.5, 3.5):
        p = p + geom_vline(xintercept=position, color="black")
    # work around #229
    p = p + scale_shape_discrete(guide=guide_legend(order=1))
    p = p + coord_flip()

    assert p == "annotation_stripes_coord_flip"
Exemple #17
0
def plot_breakdown(ld: LazyDictionary) -> p9.ggplot:
    """Stacked bar plot of rising versus falling stock counts per sector.

    Reads ld["cip_df"] and ld["stocks_by_sector"]. Columns of cip_df whose
    name starts with "bin_" are ignored; the remaining columns are summed per
    row and a row counts as "up" when that sum is >= 0. Returns None when the
    two frames share no index labels, otherwise the result of user_theme()
    for the plot.
    """
    cip_df = ld["cip_df"]
    bin_columns = [name for name in cip_df.columns if name.startswith("bin_")]
    row_totals = cip_df.drop(columns=bin_columns).sum(axis="columns")
    df = pd.DataFrame(row_totals, columns=["sum"])

    # stocks_by_sector should be indexed by asx_code and look like:
    #             asx_code             sector_name
    # asx_code
    # 14D           14D             Industrials
    # 1AD           1AD             Health Care
    # 1AG           1AG             Industrials
    # 1AL           1AL  Consumer Discretionary........
    df = df.merge(ld["stocks_by_sector"], left_index=True, right_index=True)

    # Nothing merged: none of the stocks has a sector (e.g. ETFs).
    if not len(df):
        return None

    assert set(df.columns) == {"sum", "asx_code", "sector_name"}
    df["increasing"] = ["up" if total >= 0.0 else "down" for total in df["sum"]]

    # Category order follows value_counts(), i.e. the number of stocks per sector.
    sectors_by_count = df["sector_name"].value_counts().index.tolist()
    df = df.assign(
        sector_name_cat=pd.Categorical(df["sector_name"], categories=sectors_by_count)
    )

    plot = p9.ggplot(df, p9.aes(x="factor(sector_name_cat)", fill="factor(increasing)"))
    plot = plot + p9.geom_bar()
    plot = plot + p9.coord_flip()
    return user_theme(
        plot,
        x_axis_label="Sector",
        y_axis_label="Number of stocks",
        subplots_adjust={"left": 0.2, "right": 0.85},
        legend_title=p9.element_blank(),
        asxtrade_want_fill_d=True,
    )
Exemple #18
0
def plot_points_by_rule(net_points_by_rule: defaultdict(int)) -> p9.ggplot:
    """Horizontal bar chart of the net points contributed by each rule.

    net_points_by_rule maps a rule name to its net points. Returns None when
    the mapping is None or empty, otherwise the result of user_theme() for
    the plot.
    """
    if net_points_by_rule is None or not len(net_points_by_rule):
        return None

    records = [
        {"rule": str(rule), "net_points": net}
        for rule, net in net_points_by_rule.items()
    ]
    df = pd.DataFrame.from_records(records)

    plot = p9.ggplot(df, p9.aes(x="rule", y="net_points", fill="net_points"))
    plot = plot + p9.geom_bar(stat="identity", alpha=0.7)
    plot = plot + p9.coord_flip()
    return user_theme(
        plot,
        x_axis_label="Rule",
        y_axis_label="Contributions to points by rule",
        subplots_adjust={"left": 0.2},
        asxtrade_want_fill_continuous=True,
    )
Exemple #19
0
Fichier : plot.py Projet : NPSDC/qb
def protobowl(fold=BUZZER_DEV_FOLD):
    """Plot protobowl outcome counts for the three buzzers and save them as a PDF.

    For each buzzer (RNN, MLP, Threshold) the pickled frame for ``fold`` is
    loaded and its rows are counted per (Possibility, Outcome) pair. The three
    count tables are stacked into one frame and drawn as horizontal stacked
    bars with one facet row per model.

    Parameters
    ----------
    fold :
        Value formatted into the input and output file names; defaults to
        BUZZER_DEV_FOLD.

    Returns
    -------
    None. The figure is written to output/buzzer/{fold}_protobowl.pdf.
    """
    def load_counts(buzzer_dir, model_name):
        # Count rows per (Possibility, Outcome) for one buzzer and tag them
        # with the model name.
        path = "output/buzzer/{}/{}_protobowl.pkl".format(buzzer_dir, fold)
        # NOTE(review): pickle.load can execute arbitrary code; these files are
        # presumably produced by this project's own pipeline - confirm.
        with open(path, "rb") as f:  # close the handle as soon as it is read
            frame = pickle.load(f)
        counts = frame.groupby(["Possibility", "Outcome"])
        counts = counts.size().reset_index().rename(columns={0: "Count"})
        counts["Model"] = pd.Series(
            [model_name for _ in range(len(counts))], index=counts.index
        )
        return counts

    # pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
    df = pd.concat(
        [
            load_counts("RNNBuzzer", "RNN"),
            load_counts("MLPBuzzer", "MLP"),
            load_counts("ThresholdBuzzer", "Threshold"),
        ],
        ignore_index=True,
    )

    # Explicit category lists fix the order used for the fill and the facets.
    outcome_type = CategoricalDtype(categories=[15, 10, 5, 0, -5, -10, -15])
    df["Outcome"] = df["Outcome"].astype(outcome_type)
    model_type = CategoricalDtype(categories=["Threshold", "MLP", "RNN"])
    df["Model"] = df["Model"].astype(model_type)

    p = (
        ggplot(df)
        + geom_col(aes(x="Possibility", y="Count", fill="Outcome"), width=0.7)
        + facet_grid("Model ~")
        + coord_flip()
        + theme_fs()
        + theme(aspect_ratio=0.17)
        + scale_fill_brewer(type="div", palette=7)
    )

    figure_dir = os.path.join("output/buzzer/{}_protobowl.pdf".format(fold))
    p.save(figure_dir)
Exemple #20
0
def plot_sector_top_eps_contributors(
        df: pd.DataFrame, stocks_by_sector_df: pd.DataFrame) -> p9.ggplot:
    """
    Returns a plot of the top EPS contributors per sector (dense rank of 10 or better within the sector), based on the
    last column of df, which is taken to be the most recent EPS value per stock. Only stocks with a non-negative EPS
    in that column are considered, so a sector without any such stock is not plotted.
    """
    latest = df.columns[-1]
    latest_eps = df[latest]
    non_negative = latest_eps[latest_eps >= 0.0].to_frame()
    # Attach asx_code / sector_name to every remaining stock.
    last_known_eps = non_negative.merge(stocks_by_sector_df,
                                        left_index=True,
                                        right_on="asx_code")

    # Dense rank within each sector, highest EPS first; keep ranks 1..10.
    sector_groups = last_known_eps.groupby("sector_name")
    last_known_eps["rank"] = sector_groups[latest].rank("dense",
                                                        ascending=False)
    last_known_eps = last_known_eps[last_known_eps["rank"] <= 10.0]
    n_sectors = last_known_eps["sector_name"].nunique()
    last_known_eps["eps"] = last_known_eps[latest]

    mapping = p9.aes(
        y="eps",
        x="reorder(asx_code,eps)",  # sort bars by eps within each sub-plot
        group="sector_name",
        fill="sector_name",
    )
    plot = p9.ggplot(last_known_eps, mapping)
    plot = plot + p9.geom_bar(stat="identity")
    # One facet per sector, stacked in a single column.
    plot = plot + p9.facet_wrap("~sector_name", ncol=1, nrow=n_sectors,
                                scales="free")
    plot = plot + p9.coord_flip()
    return user_theme(
        plot,
        y_axis_label="EPS ($AUD)",
        x_axis_label="Top 10 ASX stocks per sector as at {}".format(latest),
        subplots_adjust={"hspace": 0.4},
        figure_size=(12, int(n_sectors * 1.5)),
        asxtrade_want_cmap_d=False,
        asxtrade_want_fill_d=True,
    )
Exemple #21
0
def protobowl(fold=BUZZER_DEV_FOLD):
    """Plot protobowl outcome counts for the three buzzers and save them as a PDF.

    For each buzzer (RNN, MLP, Threshold) the pickled frame for ``fold`` is
    loaded and its rows are counted per (Possibility, Outcome) pair. The three
    count tables are stacked into one frame and drawn as horizontal stacked
    bars with one facet row per model.

    Parameters
    ----------
    fold :
        Value formatted into the input and output file names; defaults to
        BUZZER_DEV_FOLD.

    Returns
    -------
    None. The figure is written to output/buzzer/{fold}_protobowl.pdf.
    """
    def load_counts(buzzer_dir, model_name):
        # Count rows per (Possibility, Outcome) for one buzzer and tag them
        # with the model name.
        path = 'output/buzzer/{}/{}_protobowl.pkl'.format(buzzer_dir, fold)
        # NOTE(review): pickle.load can execute arbitrary code; these files are
        # presumably produced by this project's own pipeline - confirm.
        with open(path, 'rb') as f:  # close the handle as soon as it is read
            frame = pickle.load(f)
        counts = frame.groupby(['Possibility', 'Outcome'])
        counts = counts.size().reset_index().rename(columns={0: 'Count'})
        counts['Model'] = pd.Series([model_name for _ in range(len(counts))],
                                    index=counts.index)
        return counts

    # pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
    df = pd.concat(
        [
            load_counts('RNNBuzzer', 'RNN'),
            load_counts('MLPBuzzer', 'MLP'),
            load_counts('ThresholdBuzzer', 'Threshold'),
        ],
        ignore_index=True)

    # Explicit category lists fix the order used for the fill and the facets.
    outcome_type = CategoricalDtype(categories=[15, 10, 5, 0, -5, -10, -15])
    df['Outcome'] = df['Outcome'].astype(outcome_type)
    model_type = CategoricalDtype(
        categories=['Threshold', 'MLP', 'RNN'])
    df['Model'] = df['Model'].astype(model_type)

    p = (
        ggplot(df)
        + geom_col(aes(x='Possibility', y='Count', fill='Outcome'),
                   width=0.7)
        + facet_grid('Model ~')
        + coord_flip()
        + theme_fs()
        + theme(aspect_ratio=0.17)
        + scale_fill_brewer(type='div', palette=7)
    )

    figure_dir = os.path.join('output/buzzer/{}_protobowl.pdf'.format(fold))
    p.save(figure_dir)
def lollipop(data):
    """Lollipop chart of ``probability`` per ``label``, also rendered to PNG.

    Parameters
    ----------
    data : pandas.DataFrame
        Must provide the columns ``label`` and ``probability``. The rows are
        sorted by probability and the labels keep that order on the axis.

    Returns
    -------
    (p, figdata_png) : the plotnine plot object and the rendered figure as a
    base64-encoded PNG string.
    """
    data = data.sort_values(by=['probability']).reset_index(drop=True)
    # Categorical whose category order is the sorted row order, so the plot
    # does not fall back to alphabetical ordering of the labels.
    custom_order = pd.Categorical(data['label'], categories=data.label)
    data = data.assign(label_custom=custom_order)

    p = (ggplot(data, aes('label_custom', 'probability'))
         + geom_point(color="#88aa88", size=4)
         + geom_segment(aes(x='label_custom', y=0,
                            xend='label_custom', yend='probability'),
                        color="#88aa88")
         + coord_flip(expand=True)
         + theme_minimal()
         + labs(x="", y="probability", title="Most Likely Object")
         + guides(title_position="left")
         + theme(plot_title=element_text(size=20, face="bold", ha="right")))

    fig = p.draw()
    figfile = BytesIO()
    try:
        # Save the figure that was just drawn rather than whatever pyplot
        # currently considers the active figure.
        fig.savefig(figfile, format='png', bbox_inches='tight')
    finally:
        # pyplot keeps a reference to every open figure; release this one.
        plt.close(fig)
    figdata_png = base64.b64encode(figfile.getvalue()).decode()
    return p, figdata_png
Exemple #23
0
def protobowl(fold=BUZZER_DEV_FOLD):
    """Plot protobowl outcome counts for the three buzzers and save them as a PDF.

    For each buzzer (RNN, MLP, Threshold) the pickled frame for ``fold`` is
    loaded and its rows are counted per (Possibility, Outcome) pair. The three
    count tables are stacked into one frame and drawn as horizontal stacked
    bars with one facet row per model.

    Parameters
    ----------
    fold :
        Value formatted into the input and output file names; defaults to
        BUZZER_DEV_FOLD.

    Returns
    -------
    None. The figure is written to output/buzzer/{fold}_protobowl.pdf.
    """
    def load_counts(buzzer_dir, model_name):
        # Count rows per (Possibility, Outcome) for one buzzer and tag them
        # with the model name.
        path = 'output/buzzer/{}/{}_protobowl.pkl'.format(buzzer_dir, fold)
        # NOTE(review): pickle.load can execute arbitrary code; these files are
        # presumably produced by this project's own pipeline - confirm.
        with open(path, 'rb') as f:  # close the handle as soon as it is read
            frame = pickle.load(f)
        counts = frame.groupby(['Possibility', 'Outcome'])
        counts = counts.size().reset_index().rename(columns={0: 'Count'})
        counts['Model'] = pd.Series([model_name for _ in range(len(counts))],
                                    index=counts.index)
        return counts

    # pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
    df = pd.concat([
        load_counts('RNNBuzzer', 'RNN'),
        load_counts('MLPBuzzer', 'MLP'),
        load_counts('ThresholdBuzzer', 'Threshold'),
    ], ignore_index=True)

    # Explicit category lists fix the order used for the fill and the facets.
    outcome_type = CategoricalDtype(categories=[15, 10, 5, 0, -5, -10, -15])
    df['Outcome'] = df['Outcome'].astype(outcome_type)
    model_type = CategoricalDtype(categories=['Threshold', 'MLP', 'RNN'])
    df['Model'] = df['Model'].astype(model_type)

    p = (ggplot(df) +
         geom_col(aes(x='Possibility', y='Count', fill='Outcome'), width=0.7) +
         facet_grid('Model ~') + coord_flip() + theme_fs() +
         theme(aspect_ratio=0.17) + scale_fill_brewer(type='div', palette=7))

    figure_dir = os.path.join('output/buzzer/{}_protobowl.pdf'.format(fold))
    p.save(figure_dir)
Exemple #24
0
def plot_cor(df):
    """Build a banded plot of pairwise correlations.

    Reads the columns 'corr', 'col_1', 'col_2', 'lower', 'upper' and
    'p_value' of ``df``. Rows without a correlation are dropped; the
    remaining rows keep their input order and each gets one band spanning
    ``lower``..``upper`` with a segment at ``corr``.

    Returns the assembled plotnine plot; nothing is drawn or saved here.
    """
    # keep only rows that actually have a correlation
    out = df[df['corr'].notnull()]
    # label every row with its column pair
    out = out.assign(pair=out.col_1 + '&' + out.col_2)
    # sign of the correlation (zero counts as negative)
    out['sign'] = ['Positive' if value > 0 else 'Negative'
                   for value in out['corr']]
    # descending position index: the first row gets the largest value
    n_rows = out.shape[0]
    out['ind'] = list(range(n_rows, 0, -1))

    ind = out.ind.values
    corr = out['corr'].values
    # bands with p_value above 0.05 are grey, the others use colour 'b'
    band_fill = ['#abaeb3' if p_val > 0.05 else 'b' for p_val in out.p_value]

    zero_line = p9.geom_hline(
        yintercept=0,
        linetype="dashed",
        color="#c2c6cc",
    )
    bands = p9.geom_rect(
        alpha=0.4,
        xmin=ind - 0.4,
        xmax=ind + 0.4,
        ymin=out.lower.values,
        ymax=out.upper.values,
        fill=band_fill,
    )
    corr_marks = p9.geom_segment(
        x=ind - 0.4,
        y=corr,
        xend=ind + 0.4,
        yend=corr,
    )

    ggplt = (
        p9.ggplot(data=out, mapping=p9.aes(x='pair', y='corr'))
        + zero_line
        + bands
        + corr_marks
        + p9.coord_flip()
        + p9.ylim(np.min(out.lower.values), np.max(out.upper.values))
        + p9.labs(x="", y="Correlation")
    )
    return ggplt
Exemple #25
0
# Point-and-line view of "edges" against "precision", coloured by the
# "in_hetionet" column and faceted by relation (log10 y scale).
g = (p9.ggplot(binned_df, p9.aes(x="precision", y="edges",
                                 color="in_hetionet")) + p9.geom_point() +
     p9.geom_line() + p9.scale_color_manual(values={
         "Existing": color_map["Existing"],
         "Novel": color_map["Novel"]
     }) + p9.facet_wrap("relation") + p9.scale_y_log10() + p9.theme_bw())
print(g)

# In[8]:

# Same columns drawn as dodged, flipped bars.
g = (p9.ggplot(binned_df, p9.aes(x="precision", y="edges", fill="in_hetionet"))
     + p9.geom_bar(stat='identity', position='dodge') +
     p9.scale_fill_manual(values={
         "Existing": color_map["Existing"],
         "Novel": color_map["Novel"]
     }) + p9.coord_flip() + p9.facet_wrap("relation") + p9.scale_y_log10() +
     # theme_bw() is a complete theme and replaces any theme settings added
     # before it, so it has to precede the figure_size/aspect_ratio tweak.
     p9.theme_bw() + p9.theme(figure_size=(12, 8), aspect_ratio=9))
print(g)

# In[9]:

combined_sen_tree = {
    "DaG": {
        "file":
        "../../../disease_gene/disease_associates_gene/edge_prediction_experiment/output/combined_predicted_dag_sentences.tsv.xz",
        "group": ["doid_id", "entrez_gene_id"]
    },
    "CtD": {
        "file":
        "../../../compound_disease/compound_treats_disease/edge_prediction_experiment/output/combined_predicted_ctd_sentences.tsv.xz",
        "group": ["drugbank_id", "doid_id"]
Exemple #26
0
 def test_aesthetics_coordflip(self):
     """Check the fixture plot combined with coord_flip().

     The flipped plot must compare equal to 'aesthetics+coord_flip'
     (presumably a baseline name resolved by the test harness -- confirm).
     """
     flipped = self.p + coord_flip()
     assert flipped == 'aesthetics+coord_flip'
Exemple #27
0
#              height=1, width=6)

## can also use seaborn for strip plotting...
plt.close()
# plt.figure(figsize=(6, 1))
sns.stripplot(data=gse75386, y='class', x='Gad1', color='black')
# plt.savefig('gse75386_gad1_stripchart_bw.pdf',
#             format='pdf', bbox_inches='tight')

## -----------------------------------------------------------------
## GSE75386 overplotted bars
## -----------------------------------------------------------------
plt.close()
## one translucent bar per row, drawn on top of each other per class
ggbar = ggplot(gse75386, gg.aes(x='class', y='Gad1'))
ggbar += gg.geom_bar(alpha=0.1, position='identity', stat='identity')
ggbar += gg.coord_flip()
print(ggbar)
# ggbar.save('gse75386_gad1_barchart_id.pdf', format='pdf',
#            height=1, width=6)

## -----------------------------------------------------------------
## GSE75386 mean bars + SE lines
## -----------------------------------------------------------------
plt.close()
## use pandas functionality to compute stat transformations
## per-class mean of Gad1; the string name avoids the FutureWarning that
## newer pandas emits when a NumPy callable such as np.mean is passed to agg
gse75386means = gse75386[['class', 'Gad1']]\
                .groupby('class').agg('mean').iloc[:, 0]
## per-class standard error: std of the group over sqrt(group length)
gse75386ses = gse75386[['class', 'Gad1']]\
              .groupby('class').agg(lambda x: x.std() / np.sqrt(len(x)))\
              .iloc[:, 0]
gse75386stats = pd.DataFrame({
Exemple #28
0
metadata_df["author_type"].value_counts()

# # BioRxiv Research Article Categories

# Categories assigned to each research article. Neuroscience accounts for the largest share of articles, as expected.

# In[9]:

category_list = list(
    reversed(metadata_df.category.value_counts().index.tolist())
)

# plotnine has no "reverse" option on scale_x_discrete, so the category
# order is reversed by hand and passed in as the scale limits.
g = (
    p9.ggplot(metadata_df, p9.aes(x="category"))
    + p9.geom_bar(
        size=10,
        fill="#253494",
        position=p9.position_dodge(width=3),
    )
    + p9.scale_x_discrete(limits=category_list)
    + p9.coord_flip()
    + p9.theme_seaborn(
        context="paper",
        style="ticks",
        font="Arial",
        font_scale=1,
    )
)
g.save("output/figures/preprint_category.png", dpi=500)
print(g)

# In[10]:

metadata_df["category"].value_counts()

# # New, Confirmatory, Contradictory Results?

# In[11]:

heading_list = list(
    reversed(metadata_df.heading.value_counts().index.tolist())
)
          ]])
category_sim_df.head()

# In[10]:

# Persist the table as a tab-separated file without the index column.
category_sim_df.to_csv(
    "output/category_cossim_95_ci.tsv", sep="\t", index=False
)

# In[11]:

# Point-range plot of the pca1_cossim column with its lower/upper bounds,
# one row per category; limits are reversed to flip the category order.
g = (
    p9.ggplot(category_sim_df)
    + p9.aes(
        x="category",
        y="pca1_cossim",
        ymin="pca1_cossim_lower",
        ymax="pca1_cossim_upper",
    )
    + p9.geom_pointrange()
    + p9.coord_flip()
    + p9.theme_bw()
    + p9.scale_x_discrete(
        limits=list(reversed(category_sim_df.category.tolist()))
    )
    + p9.theme(
        figure_size=(11, 7),
        text=p9.element_text(size=12),
        panel_grid_major_y=p9.element_blank(),
    )
    + p9.labs(y="PC1 Cosine Similarity")
)
g.save("output/pca_plots/figures/category_pca1_95_ci.svg", dpi=500)
g.save("output/pca_plots/figures/category_pca1_95_ci.png", dpi=500)
print(g)

# In[12]:

g = (p9.ggplot(category_sim_df) + p9.aes(x="category",
                                         y="pca2_cossim",
                                         ymax="pca2_cossim_upper",
                                         ymin="pca2_cossim_lower") +
Exemple #30
0
    ["is_same_paper_1", "is_same_paper_2", "is_same_paper_3"]].mode(axis=1))))
final_annotated_df.head()

# In[6]:

# Mean of final_same_paper per distance bin, exposed as "frac_correct".
binned_stats_df = (final_annotated_df.groupby(
    "distance_bin").final_same_paper.mean().to_frame().rename(
        index=str, columns={
            "final_same_paper": "frac_correct"
        }).reset_index())
binned_stats_df

# In[7]:

g = (p9.ggplot(binned_stats_df, p9.aes(x="distance_bin", y="frac_correct")) +
     p9.geom_col(fill="#a6cee3") + p9.coord_flip() +
     # labs() names aesthetics, not screen axes: x is the distance bin and
     # y is the fraction correct, also after coord_flip().
     p9.labs(x="Euclidean Distance Bins", y="Fraction Correct") +
     p9.theme_seaborn(
         context="paper", style="ticks", font="Arial", font_scale=1.5))
g.save("output/figures/distance_bin_accuracy.svg")
g.save("output/figures/distance_bin_accuracy.png", dpi=250)
print(g)

# # Logistic Regression Performance

# In[8]:

# Load the embedding table and index it by the "document" column.
biorxiv_embed_df = (pd.read_csv(Path("../word_vector_experiment/output/") /
                                "word2vec_output/" /
                                "biorxiv_all_articles_300.tsv.xz",
                                sep="\t").set_index("document"))
Exemple #31
0
def test_coord_flip():
    """Check the module-level plot ``p`` combined with coord_flip().

    The flipped plot must compare equal to 'coord_flip' (presumably a
    baseline name resolved by the test harness -- confirm).
    """
    flipped = p + coord_flip()
    assert flipped == 'coord_flip'
Exemple #32
0
 def test_aesthetics_coordflip(self):
     """Check the fixture plot combined with coord_flip().

     The flipped plot must compare equal to 'aesthetics+coord_flip'
     (presumably a baseline name resolved by the test harness -- confirm).
     """
     flipped = self.p + coord_flip()
     assert flipped == 'aesthetics+coord_flip'
Exemple #33
0
print(best_result)

# Scores of the column selected by best_result[0], then their mean.
print("Best CV Fold")
print(model.scores_["polka"][:, best_result[0]])
model.scores_["polka"][:, best_result[0]].mean()

# One weight per principal component, numbered 1..50.
model_weights_df = pd.DataFrame.from_dict({
    "weight": model.coef_[0],
    "pc": list(range(1, 51)),
})
model_weights_df["pc"] = pd.Categorical(model_weights_df["pc"])
model_weights_df.head()

g = (p9.ggplot(model_weights_df, p9.aes(x="pc", y="weight")) +
     p9.geom_col(position=p9.position_dodge(width=5), fill="#253494") +
     p9.coord_flip() +
     # limits run 50 -> 1 to reverse the component order on the flipped axis
     p9.scale_x_discrete(limits=list(range(50, 0, -1))) +
     p9.theme_seaborn(
         context="paper", style="ticks", font_scale=1.1, font="Arial") +
     p9.theme(figure_size=(10, 8)) + p9.labs(title="Regression Model Weights",
                                             x="Principal Component",
                                             y="Model Weight"))
# g.save("output/figures/pca_log_regression_weights.svg")
# g.save("output/figures/pca_log_regression_weights.png", dpi=250)
print(g)

fold_features = model.coefs_paths_["polka"].transpose(1, 0, 2)
model_performance_df = pd.DataFrame.from_dict({
    "feat_num": ((fold_features.astype(bool).sum(axis=1)) > 0).sum(axis=1),
    "C":
    model.Cs_,
Exemple #34
0
def _best_guess_per_question(frames, position):
    """Collapse guess tables to one row per (guesser, qanta_id).

    Parameters
    ----------
    frames : dict
        Maps a guesser name to a DataFrame with at least the columns
        'score', 'guesser' and 'qanta_id'.
    position : str
        Value written to the new 'position' column.

    Returns
    -------
    pandas.DataFrame
        The frames are concatenated and sorted by descending 'score'
        before grouping, so ``first()`` yields, per column, the first
        non-null value among each group's best-scored rows.
    """
    best = (pd.concat(list(frames.values()))
            .sort_values('score', ascending=False)
            .groupby(['guesser', 'qanta_id'])
            .first()
            .reset_index())
    best['position'] = position
    return best


def error_comparison():
    """Plot how the DAN, RNN and ElasticSearch guessers agree per question.

    Loads every guesser report matching
    ``output/guesser/best/qanta.guesser*/guesser_report_guesstest.pickle``,
    keeps the best-scored guess per guesser and question from the reports'
    'first_df' and 'full_df' tables, and classifies each
    (question, position) pair:

    * all three correct      -> model 'All', result 'Correct'
    * none correct           -> model 'All', result 'Wrong'
    * exactly one correct    -> that guesser, result 'Correct'
    * exactly one wrong      -> that guesser, result 'Wrong'

    The resulting bar chart is saved to
    ``output/plots/guesser_error_comparison.pdf``.
    """
    first_frames = {}
    full_frames = {}
    report_glob = (
        'output/guesser/best/qanta.guesser*/guesser_report_guesstest.pickle'
    )
    for report_path in glob.glob(report_glob):
        # NOTE: pickle.load can execute arbitrary code; trusted files only.
        with open(report_path, 'rb') as f:
            report = pickle.load(f)
        name = report['guesser_name']
        first_frames[name] = report['first_df']
        full_frames[name] = report['full_df']

    # NOTE(review): the leading space in ' Start' presumably makes it sort
    # ahead of 'End' in the facets -- confirm before normalising it.
    first_df = _best_guess_per_question(first_frames, ' Start')
    full_df = _best_guess_per_question(full_frames, 'End')
    compare_df = pd.concat([first_df, full_df])
    compare_df = compare_df[compare_df.guesser != 'qanta.guesser.vw.VWGuesser']

    comparisons = [
        'qanta.guesser.dan.DanGuesser',
        'qanta.guesser.rnn.RnnGuesser',
        'qanta.guesser.elasticsearch.ElasticSearchGuesser',
    ]
    cr_rows = []
    for (qnum, position), group in compare_df.groupby(['qanta_id', 'position']):
        group = group.set_index('guesser')
        correct_guessers = []
        wrong_guessers = []
        for name in comparisons:
            if group.loc[name].correct == 1:
                correct_guessers.append(name)
            else:
                wrong_guessers.append(name)

        if len(correct_guessers) == len(comparisons):
            model, result = 'All', 'Correct'
        elif not correct_guessers:
            model, result = 'All', 'Wrong'
        elif len(correct_guessers) == 1:
            model, result = to_shortname(correct_guessers[0]), 'Correct'
        else:
            # two of the three were right: report the single one that missed
            model, result = to_shortname(wrong_guessers[0]), 'Wrong'
        cr_rows.append({
            'qnum': qnum, 'Position': position,
            'model': model, 'Result': result,
        })
    cr_df = pd.DataFrame(cr_rows)

    p = (
        ggplot(cr_df)
        + aes(x='model', fill='Result') + facet_grid(['Result', 'Position'])
        + geom_bar(aes(y='(..count..) / sum(..count..)'), position='dodge')
        + labs(x='Models', y='Fraction with Corresponding Result') + coord_flip()
        + theme_fs() + theme(aspect_ratio=.6)
    )
    p.save('output/plots/guesser_error_comparison.pdf')