Esempio n. 1
0
def test_errorbar_aesthetics():
    """Stack five geom_errorbar layers, each using a different aesthetic.

    The layers are offset along x (x, x+1, ..., x+4) and map, in turn,
    nothing extra, alpha, linetype, color and size. The themed plot is
    asserted equal to the identifier 'errorbar_aesthetics'.
    """
    layers = [
        geom_errorbar(aes('x'), size=2),
        geom_errorbar(aes('x+1', alpha='z'), width=0.2, size=2),
        geom_errorbar(aes('x+2', linetype='factor(z)'), size=2),
        geom_errorbar(aes('x+3', color='z'), size=2),
        geom_errorbar(aes('x+4', size='z')),
    ]

    # ymin/ymax are shared by every layer, so they live on the base plot.
    p = ggplot(df, aes(ymin='ymin', ymax='ymax'))
    for layer in layers:
        p = p + layer

    assert p + _theme == 'errorbar_aesthetics'
def test_errorbar_aesthetics():
    """Exercise geom_errorbar with alpha, linetype, color and size mappings.

    Each layer is shifted one unit further along x so the variants sit
    side by side in a single plot; the themed result is asserted equal to
    the identifier 'errorbar_aesthetics'.
    """
    base = ggplot(df, aes(ymin='ymin', ymax='ymax'))

    plain = geom_errorbar(aes('x'), size=2)
    by_alpha = geom_errorbar(aes('x+1', alpha='z'), width=0.2, size=2)
    by_linetype = geom_errorbar(aes('x+2', linetype='factor(z)'), size=2)
    by_color = geom_errorbar(aes('x+3', color='z'), size=2)
    by_size = geom_errorbar(aes('x+4', size='z'))

    p = base + plain + by_alpha + by_linetype + by_color + by_size

    assert p + _theme == 'errorbar_aesthetics'
# Plot
lst_num_partitions = list(all_svcca.index)

threshold = pd.DataFrame(pd.np.tile(permuted_score,
                                    (len(lst_num_partitions), 1)),
                         index=lst_num_partitions,
                         columns=['score'])

panel_A = ggplot(all_svcca)     + geom_line(all_svcca,
                aes(x=lst_num_partitions, y='score', color='Group'),
                size=1.5) \
    + geom_point(aes(x=lst_num_partitions, y='score'),
                 color ='darkgrey',
                size=0.5) \
    + geom_errorbar(all_svcca,
                  aes(x=lst_num_partitions, ymin='ymin', ymax='ymax'),
                   color='darkgrey') \
    + geom_line(threshold,
                aes(x=lst_num_partitions, y='score'),
                linetype='dashed',
                size=1,
                color="darkgrey",
                show_legend=False) \
    + labs(x = "Number of Partitions",
           y = "Similarity score (SVCCA)",
           title = "Similarity across varying numbers of partitions") \
    + theme(
            plot_background=element_rect(fill="white"),
            panel_background=element_rect(fill="white"),
            panel_grid_major_x=element_line(color="lightgrey"),
            panel_grid_major_y=element_line(color="lightgrey"),
Esempio n. 4
0
## use pandas functionality to compute stat transformations
# Per-class mean of Gad1.
gse75386means = (
    gse75386[['class', 'Gad1']]
    .groupby('class')
    .agg(np.mean)
    .iloc[:, 0]
)
# Per-class standard error of Gad1: standard deviation / sqrt(group size).
gse75386ses = (
    gse75386[['class', 'Gad1']]
    .groupby('class')
    .agg(lambda grp: grp.std() / np.sqrt(len(grp)))
    .iloc[:, 0]
)
# One row per class: mean, SE and the mean +/- SE bounds for the error bars.
gse75386stats = pd.DataFrame({
    'Gad1 (Mean)': gse75386means,
    'SE': gse75386ses,
    'ymin': gse75386means - gse75386ses,
    'ymax': gse75386means + gse75386ses,
    'class': gse75386means.index.values,
})
# Bar chart of the precomputed means (stat='identity') with error bars,
# flipped so the classes run along the vertical axis.
ggbarse = (
    ggplot(gse75386stats, gg.aes(x='class', y='Gad1 (Mean)'))
    + gg.geom_bar(alpha=0.6, stat='identity')
    + gg.geom_errorbar(mapping=gg.aes(ymin='ymin', ymax='ymax'),
                       width=0.0001)
    + gg.coord_flip()
)
print(ggbarse)
# ggbarse.save('gse75386_gad1_barchart_stat.pdf', format='pdf',
#              height=1, width=6)

## mean bars +/- standard error using seaborn
plt.close()
# plt.figure(figsize=(6, 1))
sns.barplot(
    data=gse75386,
    y='class',
    x='Gad1',
    color='slategray',
    ci=68,
)
# plt.savefig('gse75386_gad1_barchart_stat.pdf',
#             format='pdf', bbox_inches='tight')

## -----------------------------------------------------------------
## GSE75386 boxplot + stripchart
## -----------------------------------------------------------------
        lambda x: x.auroc_mean -
        (critical_val * x.auroc_std) / pd.np.sqrt(x.lf_num_len),
        'aupr_upper':
        lambda x: x.aupr_mean +
        (critical_val * x.aupr_std) / pd.np.sqrt(x.lf_num_len),
        'aupr_lower':
        lambda x: x.aupr_mean -
        (critical_val * x.aupr_std) / pd.np.sqrt(x.lf_num_len)
    }))
dev_set_stats_df

# In[9]:

# AUROC per label-function count: points joined by a line for each model,
# with error bars spanning auroc_lower..auroc_upper.
(
    p9.ggplot(
        dev_set_stats_df,
        p9.aes(x="factor(lf_num)", y="auroc_mean", color="model"),
    )
    + p9.geom_point()
    + p9.geom_line(p9.aes(group="model"))
    + p9.geom_errorbar(
        p9.aes(ymin="auroc_lower", ymax="auroc_upper", group="model")
    )
    + p9.theme_seaborn()
    + p9.labs(title="CtD Tune Set AUROC", color="Model")
    + p9.scale_color_manual({"disc_model": "blue", "gen_model": "orange"})
)

# In[10]:

(p9.ggplot(dev_set_stats_df,
           p9.aes(x="factor(lf_num)", y="aupr_mean", color="model")) +
 p9.geom_point() + p9.geom_line(p9.aes(group="model")) + p9.geom_errorbar(
     p9.aes(ymin="aupr_lower", ymax="aupr_upper", group="model")) +
 p9.theme_seaborn() + p9.labs(title="CtD Tune Set AUPR", color="Model") +
 p9.scale_color_manual({
     "disc_model": "blue",
Esempio n. 6
0
                                                         max_depth=5,
                                                         min_samples_split=2,
                                                         max_features=5,
                                                         n_jobs=n_threads)

        sklearn_forest.fit(X, y)
        current_timing = (time.time() - start_time)
        if n >= n_burn_in:
            timing_data['implementation'].append('scikit-learn 0.23.1')
            timing_data['threads'].append(n_threads)
            timing_data['timing'].append(current_timing)

# Collapse the raw timings to mean and standard deviation for every
# (implementation, thread count) pair.
df = (
    pd.DataFrame(data=timing_data)
    .groupby(['implementation', 'threads'])
    .agg(['mean', 'std'])
    .reset_index()
)
# Flatten the two-level column index produced by the multi-function agg.
df.columns = ['Implementation', 'threads', 'mean', 'std']
print(df)

# Error-bar bounds: one standard deviation either side of the mean.
df['error_min'] = df['mean'] - df['std']
df['error_max'] = df['mean'] + df['std']

p = (
    ggplot(
        df,
        aes(
            x='threads',
            y='mean',
            group='Implementation',
            color='Implementation',
        ),
    )
    + geom_line()
    + geom_point()
    + geom_errorbar(
        aes(ymin='error_min', ymax='error_max'),
        width=.2,
        position=position_dodge(0.05),
    )
    + labs(x="Number of threads", y="timing [s]")
)

p.save(filename='benchmark.png')
# Plot - uncorrected only
lst_num_experiments = list(all_svcca.index[0:int(len(all_svcca.index) / 2)])

threshold = pd.DataFrame(pd.np.tile(permuted_score,
                                    (len(lst_num_experiments), 1)),
                         index=lst_num_experiments,
                         columns=['score'])

g = ggplot(all_svcca[all_svcca['Group'] == 'uncorrected'])     + geom_line(all_svcca[all_svcca['Group'] == 'uncorrected'],
                aes(x=lst_num_experiments, y='score', color='Group'),
                size=1.5) \
    + geom_point(aes(x=lst_num_experiments, y='score'),
                 color ='darkgrey',
                size=0.5) \
    + geom_errorbar(all_svcca[all_svcca['Group'] == 'uncorrected'],
                  aes(x=lst_num_experiments, ymin='ymin', ymax='ymax'),
                   color='darkgrey') \
    + geom_line(threshold,
                aes(x=lst_num_experiments, y='score'),
                linetype='dashed',
                size=1,
                color="darkgrey",
                show_legend=False) \
    + labs(x = "Number of Partitions",
           y = "Similarity score (SVCCA)",
           title = "Similarity across varying numbers of partitions") \
    + theme(plot_title=element_text(weight='bold'),
            plot_background=element_rect(fill="white"),
            panel_background=element_rect(fill="white"),
            panel_grid_major_x=element_line(color="lightgrey"),
            panel_grid_major_y=element_line(color="lightgrey"),
Esempio n. 8
0
def gene_profile(genes: list, 
                 weights: pd.DataFrame, 
                 stddev: pd.DataFrame=None,
                 y_axis_label: str=None,
                 highlight_n: int=None, 
                 highlight_anno: list=None, 
                 figsize: tuple=None,
                 ylim: tuple=None) -> p9.ggplot:
    """
    Plot the weight of every annotation for the selected genes, one facet
    per gene, with the top (or explicitly named) annotations highlighted.

    Parameters
    ----------
    genes : str or list of str
        Gene name(s) to include in the plot as facets. Matching against the
        index of ``weights`` is case-insensitive.
    weights : DataFrame
        Wide table of ES weights: genes as index, annotations as columns.
    stddev : DataFrame, optional (default: None)
        Table with the same layout as ``weights``. When given, error bars
        spanning ``weight - stddev`` to ``weight + stddev`` are drawn.
    y_axis_label : str, optional (default: "Expression Specificity")
        Label of the y-axis.
    highlight_n : int, optional
        Number of highest-weight annotations to highlight per gene. Falls
        back to 5 when neither ``highlight_n`` nor ``highlight_anno`` is
        given. Ignored when ``highlight_anno`` is given.
    highlight_anno : list, optional (default: None)
        Specific annotations to highlight.
    figsize : (float, float), optional (default: None)
        Specify width and height of plot. Defaults to a width of 15 and a
        height of 5 per matched gene.
    ylim : (float, float), optional (default: (-1, 1))
        Limits of the y-axis.

    Returns
    -------
        p    : ggplot

    Raises
    ------
    AssertionError
        If none of ``genes`` is found in ``weights`` or, when ``stddev`` is
        given, in ``stddev``.

    Todo:
        * find a better way for sorting cell-types along x-axis
        * report if gene in genes is not found in df
        * report if duplicate genes
        * replace hacky x-axis labelling

    """

    ### Reduce dataframe to genes of interest
    # A bare string is a documented input; without wrapping it the
    # comprehension below would split it into single characters.
    if isinstance(genes, str):
        genes = [genes]
    genes = [g.upper() for g in genes]
    idx = np.char.upper(weights.index.values.astype(str))
    mask = np.isin(idx, genes)
    df_tidy = weights[mask]
    n_genes = len(df_tidy)

    assert (n_genes >= 1), "No matching genes found in dataframe."

    stddev_tidy = None
    if stddev is not None:
        idx = np.char.upper(stddev.index.values.astype(str))
        mask = np.isin(idx, genes)
        stddev_tidy = stddev[mask]
        # Check the stddev selection itself; n_genes keeps describing the
        # weights selection because it sizes the figure and the facets.
        assert (len(stddev_tidy) >= 1), "No matching genes found in stddev dataframe."

    # Constants, height and width of plot.
    if figsize is None:
        H = 5*n_genes
        W = 15
    else:
        W, H = figsize

    if ylim is None:
        ylim = (-1,1)

    if y_axis_label is None:
        y_axis_label = "Expression Specificity"

    ### Convert to tidy / long format if necessary
    # Org:
    #       ABC  ACBG  ACMB
    # POMC  0.0   0.5   0.9
    # AGRP  0.2   0.0   0.0
    # LEPR  0.1   0.1   0.4

    # Tidy:
    #   gene_name annotation    es_weight
    # 1 POMC      ABC           0.0
    # 2 AGRP      ABC           0.6
    # 3 LEPR      ABC           1.0

    df_tidy.index.name = None # ensure that index name is none, so "index" is used for id_vars
    df_tidy = pd.melt(df_tidy.reset_index(), id_vars="index", var_name="annotation", value_name="weight")

    if stddev_tidy is not None:
        stddev_tidy.index.name = None
        stddev_tidy = pd.melt(stddev_tidy.reset_index(), id_vars="index", var_name="annotation", value_name="stddev")
        # Default (inner) merge: only gene/annotation pairs present in both
        # tables are kept.
        df_tidy = df_tidy.merge(stddev_tidy, on=["index", "annotation"])


    ### Sort values by gene_name and es_weight and add order
    # Sorted:
    #   gene_name annotation   es_weight   x_order
    # 1 AGRP      MOL2         0.0         1
    # 2 AGRP      ACNT1        0.1         2
    # 3 AGRP      MOL1         0.2         3

    df_tidy = df_tidy.sort_values(by=["index", "weight"])
    df_tidy["order"] = np.arange(len(df_tidy)) + 1

    ### Generate highlight
    if highlight_anno is not None:
        # An explicit list of annotations takes precedence over highlight_n.
        df_tidy["highlight"] = df_tidy["annotation"].isin(highlight_anno)
    else:
        # Default: highlight top 5
        if highlight_n is None:
            highlight_n = 5
        # "order" increases with weight inside each gene, so ranking it in
        # descending order marks the highest-weight annotations.
        df_tidy["highlight"] = (
            df_tidy.groupby("index")["order"].rank(method="first", ascending=False)
            <= highlight_n
        )

    df_highlight = df_tidy[df_tidy["highlight"]]

    ### Plot
    # linear function to compute x_axis text-size.
    # Mainly depends on the number of annotations per facet, i.e. the number
    # of tidy rows divided by the number of genes actually matched (not the
    # number requested, which may include genes absent from the data).
    SIZE_TEXT_X_AXIS = 10.161 - 0.023 * (len(df_tidy) / n_genes)

    # Limits of the order for each index gene / facet, e.g. [0, 266, 531]
    # These limits are necessary to only plot the labels
    order_lims = [0, *(df_tidy.groupby("index")["order"].max().values)]

    def find_nearest(array,value):
        # Return the element of array closest to value.
        array = np.asarray(array)
        idx = (np.abs(array - value)).argmin()
        return array[idx]

    def getbreaks(lims):
        # Snap the axis limits to the nearest facet boundaries and place one
        # break per integer in between.
        l = find_nearest(order_lims, lims[0])
        r = find_nearest(order_lims, lims[1])
        breaks = np.arange(l, r)
        return breaks

    def getlbls(idx):
        # Map break positions back to annotation names.
        # NOTE(review): idx is used as a 0-based row position while "order"
        # starts at 1 - this is part of the hacky x-axis labelling listed in
        # the Todo; confirm labels line up with their points.
        lbls = df_tidy["annotation"].iloc[idx].values
        return lbls

    p = (
        ### data
        p9.ggplot(data=df_tidy, mapping=p9.aes(x="order", y="weight", label="annotation"))

        ### theming
        + p9.theme_classic()
        + p9.theme(
            figure_size = (W,H),
            axis_ticks_major_x = p9.element_blank(),
            axis_text_x = p9.element_text(rotation=75, hjust=0, size=SIZE_TEXT_X_AXIS), # 
            axis_text_y = p9.element_text(size=W),
            panel_spacing = 1,
            strip_background = p9.element_blank()
        )

        + p9.ylim(ylim[0],ylim[1])

        + p9.labs(
            x="", # e.g. "Cell-type"
            y=y_axis_label, # e.g. "ES weight"
        )

        ### viz
        # all
        + p9.geom_segment(mapping=p9.aes(x="order", xend="order", y=0, yend="weight"),
                       color="grey",
                       alpha=0.3,
                       show_legend=False
        )

        + p9.geom_point(mapping=p9.aes(size=2),
                     color="grey",
                    show_legend=False
        )

        # highlight
        + p9.geom_point(data=df_highlight, mapping=p9.aes(size=2), 
                     color="dodgerblue",
                    show_legend=False
        )

        + p9.geom_segment(data=df_highlight, mapping=p9.aes(x="order", xend="order", y=0, yend="weight"),
                       color="dodgerblue",
                       alpha=0.3,
                       show_legend=False
        )

        + p9.facet_wrap("index",
                     scales="free",
                     nrow=n_genes
                    )

        + p9.scale_x_continuous(
            # order_scale is continuous across all annotations
            # so the scale will look weird for each facet, e.g.
            # facet 1 may have order 1-7, and facet 2 has order 8-14.
            # therefore we must use a labeller function to get the 
            # correct labels for each interval of order.
            breaks = getbreaks,
            labels = getlbls
        )
    )

    if stddev_tidy is not None:
        p = p + p9.geom_errorbar(mapping=p9.aes(ymin="weight-stddev", ymax="weight+stddev"), 
                                    color="grey", width=0.1)\
                + p9.geom_errorbar(data=df_highlight, mapping=p9.aes(ymin="weight-stddev", ymax="weight+stddev"),
                                color="dodgerblue", width=0.1)

    # add labels last for them to be on top
    p = p + p9.geom_label(data=df_highlight,
                    color = "dodgerblue",
                    adjust_text = {'expand_points': (2,2)}
        )

    return p
    
Esempio n. 9
0
    .wls(formula, data=abortion_bf15, weights=abortion_bf15.totpop.values)
    .fit(
        cov_type='cluster', 
        cov_kwds={'groups': abortion_bf15.fip.values}, 
        method='pinv')
)

reg.summary()


# Coefficients (reg.params) and standard errors (reg.bse) of the
# repeal x year interaction terms, 1986 through 2000.
abortion_plot = pd.DataFrame({
    'sd': reg.bse[
        'C(repeal)[T.1.0]:C(year)[T.1986.0]':'C(repeal)[T.1.0]:C(year)[T.2000.0]'
    ],
    'mean': reg.params[
        'C(repeal)[T.1.0]:C(year)[T.1986.0]':'C(repeal)[T.1.0]:C(year)[T.2000.0]'
    ],
    'year': np.arange(1986, 2001),
})
# Interval bounds: estimate -/+ 1.96 standard errors.
abortion_plot['lb'] = abortion_plot['mean'] - abortion_plot['sd']*1.96
abortion_plot['ub'] = abortion_plot['mean'] + abortion_plot['sd']*1.96

(
    p.ggplot(abortion_plot, p.aes(x = 'year', y = 'mean'))
    # Shaded band over 1985-1992 spanning the full y range.
    + p.geom_rect(
        p.aes(xmin=1985, xmax=1992, ymin=-np.inf, ymax=np.inf),
        fill="cyan",
        alpha = 0.01,
    )
    + p.geom_point()
    + p.geom_text(p.aes(label = 'year'), ha='right')
    + p.geom_hline(yintercept = 0)
    + p.geom_errorbar(
        p.aes(ymin = 'lb', ymax = 'ub'),
        width = 0.2,
        position = p.position_dodge(0.05),
    )
    + p.labs(title= "Estimated effect of abortion legalization on gonorrhea")
)

Esempio n. 10
0
# Plot
lst_num_partitions = list(summary_df.index)

threshold = pd.DataFrame(permuted_uncorrected_scores,
                         index=num_simulated_experiments,
                         columns=['score'])

panel_A = ggplot(summary_df)     + geom_line(summary_df,
                aes(x=lst_num_partitions, y='SVCCA score', color='group'),
                size=1.5) \
    + geom_point(aes(x=lst_num_partitions, y='SVCCA score'),
                 color ='darkgrey',
                 size=0.5) \
    + geom_errorbar(summary_df,
                    aes(x=lst_num_partitions, ymin='ymin', ymax='ymax'),
                    color='darkgrey') \
    + geom_line(threshold,
                aes(x=num_simulated_experiments, y='score'),
                linetype='dashed',
                size=1,
                color="darkgrey",
                show_legend=False) \
    + labs(x = "Number of Experiments",
           y = "Similarity score (SVCCA)",
           title = "Similarity across varying numbers of experiments") \
    + theme(
            plot_background=element_rect(fill="white"),
            panel_background=element_rect(fill="white"),
            panel_grid_major_x=element_line(color="lightgrey"),
            panel_grid_major_y=element_line(color="lightgrey"),
Esempio n. 11
0
    def barchart_make(roi, df, list_rois, config, ylimit, save_function,
                      find_ylim_function):
        """Build the bar chart for one ROI and hand it to ``save_function``.

        Rows of ``df`` whose 'index' column equals ``list_rois[roi]`` are
        sorted by the configured x-axis column and drawn as dodged bars of
        'Mean' with error bars from Mean-Conf_Int_95 to Mean+Conf_Int_95.

        Returns 0, unless ``ylimit == 0`` and ``config.use_same_axis_limits``
        is 'Same limits' or 'Create both'; then the result of
        ``find_ylim_function`` is returned. With 'Same limits' and
        ``ylimit == 0`` the figure is not saved.
        """
        x_col = config.single_roi_fig_x_axis
        roi_name = list_rois[roi]

        # Select this ROI's rows and order them along the x-axis column.
        roi_df = df.loc[df['index'] == roi_name]
        roi_df = roi_df.sort_values([x_col])
        roi_df = roi_df.reset_index(drop=True)  # Reset index to remove grouping
        # Turn the x column into a categorical whose category order is the
        # sorted order of appearance.
        roi_df[x_col] = pd.Categorical(roi_df[x_col],
                                       categories=roi_df[x_col].unique())

        bar_aes = pltn.aes(
            x=x_col,
            y='Mean',
            ymin="Mean-Conf_Int_95",
            ymax="Mean+Conf_Int_95",
            fill='factor({colour})'.format(
                colour=config.single_roi_fig_colour))

        bars = pltn.geom_col(
            position=pltn.position_dodge(preserve='single', width=0.8),
            width=0.8,
            na_rm=True)
        error_bars = pltn.geom_errorbar(
            size=1,
            position=pltn.position_dodge(preserve='single', width=0.8))
        axis_labels = pltn.labs(x=config.single_roi_fig_label_x,
                                y=config.single_roi_fig_label_y,
                                fill=config.single_roi_fig_label_fill)
        text_theme = pltn.theme(
            panel_grid_major_x=pltn.element_line(alpha=0),
            axis_title_x=pltn.element_text(
                weight='bold', color='black', size=20),
            axis_title_y=pltn.element_text(
                weight='bold', color='black', size=20),
            axis_text_y=pltn.element_text(size=20, color='black'),
            legend_title=pltn.element_text(size=20, color='black'),
            legend_text=pltn.element_text(size=18, color='black'),
            subplots_adjust={'right': 0.85},
            legend_position=(0.9, 0.8),
            dpi=config.plot_dpi)
        # The x tick labels are blanked below; the x values are written as
        # text at y=-.7 instead.
        x_annotations = pltn.geom_text(pltn.aes(y=-.7, label=x_col),
                                       color='black',
                                       size=20,
                                       va='top')
        fill_scale = pltn.scale_fill_manual(
            values=config.colorblind_friendly_plot_colours)

        figure = (pltn.ggplot(roi_df, bar_aes)
                  + pltn.theme_538()
                  + bars
                  + error_bars
                  + axis_labels
                  + pltn.scale_x_discrete(labels=[])
                  + text_theme
                  + x_annotations
                  + fill_scale)

        if ylimit:
            # Set y limit of figure (used to make it the same for every barchart)
            figure += pltn.ylim(None, ylimit)
            roi_name += '_same_ylim'

        no_limit_given = ylimit == 0
        limits_mode = config.use_same_axis_limits

        returned_ylim = 0
        if limits_mode in ('Same limits', 'Create both') and no_limit_given:
            returned_ylim = find_ylim_function(roi_name, figure, 'yaxis')

        # In 'Same limits' mode the first pass only collects the limit.
        if limits_mode == 'Same limits' and no_limit_given:
            return returned_ylim

        folder = 'Different_yaxis' if no_limit_given else 'Same_yaxis'
        save_function(figure, roi_name, config, folder, 'barchart')

        return returned_ylim
    (temp_df["time_to_published"].dt.total_seconds() / 60 / 60 / 24).max())
category_half_life

# In[14]:

# Horizontal bar chart of each category's half-life with its confidence
# interval; the 'none' category is left out and the day counts are converted
# to timedeltas for the y scale.
g = (
    p9.ggplot(
        category_half_life.query("category!='none'").assign(
            half_life_time=lambda frame: pd.to_timedelta(
                frame.half_life_time, "D"
            ),
            half_life_ci_l=lambda frame: pd.to_timedelta(
                frame.half_life_ci_l, "D"
            ),
            half_life_ci_u=lambda frame: pd.to_timedelta(
                frame.half_life_ci_u, "D"
            ),
        ),
        p9.aes(
            x="category",
            y="half_life_time",
            ymin="half_life_ci_l",
            ymax="half_life_ci_u",
        ),
    )
    + p9.geom_col(fill="#1f78b4")
    + p9.geom_errorbar()
    # Categories listed by half-life, longest first.
    + p9.scale_x_discrete(
        limits=(
            category_half_life.query("category!='none'")
            .sort_values("half_life_time")
            .category.tolist()[::-1]
        ),
    )
    + p9.scale_y_timedelta(labels=timedelta_format("d"))
    + p9.coord_flip()
    + p9.labs(
        x="Preprint Categories",
        y="Time Until 50% of Preprints are Published",
        title="Preprint Category Half-Life",
    )
    + p9.theme_seaborn(context="paper", style="white", font_scale=1.2)
    + p9.theme(axis_ticks_minor_x=p9.element_blank())
)
g.save("output/preprint_category_halflife.svg", dpi=250)
g.save("output/preprint_category_halflife.png", dpi=250)
print(g)

# Take home Results:
#     1. The average amount of time for half of all preprints to be published is 348 days (~1 year)
Esempio n. 13
0
g = (
    p9.ggplot(
        category_half_life.query("category!='none'").assign(
            half_life_time=lambda x: pd.to_timedelta(x.half_life_time, "D"),
            half_life_ci_l=lambda x: pd.to_timedelta(x.half_life_ci_l, "D"),
            half_life_ci_u=lambda x: pd.to_timedelta(x.half_life_ci_u, "D"),
        ),
        p9.aes(
            x="category",
            y="half_life_time",
            ymin="half_life_ci_l",
            ymax="half_life_ci_u",
        ),
    )
    + p9.geom_col(fill="#1f78b4")
    + p9.geom_errorbar()
    + p9.scale_x_discrete(
        limits=(
            category_half_life.query("category!='none'")
            .sort_values("half_life_time")
            .category.tolist()[::-1]
        ),
    )
    + p9.scale_y_timedelta(labels=timedelta_format("d"))
    + p9.coord_flip()
    + p9.labs(
        x="Preprint Categories",
        y="Time Until 50% of Preprints are Published",
        title="Preprint Category Half-Life",
    )
    + p9.theme_seaborn(context="paper", style="white", font_scale=1, font="Arial")
Esempio n. 14
0
# Flag rows with wht == 0 and male == 1.
abortion.loc[(abortion.wht == 0) & (abortion.male == 1), 'bm'] = 1

# Flag rows with wht == 0 and male == 0.
abortion['bf'] = 0
abortion.loc[(abortion.wht == 0) & (abortion.male == 0), 'bf'] = 1


# Keep only the bf == 1 rows whose age is 15 or 25.
abortion_filt = abortion[(abortion.bf == 1) & (abortion.age.isin([15, 25]))]

# Weighted least squares (weights: totpop) with standard errors clustered
# on fip.
reg = smf.wls(
    """lnr ~ C(repeal)*C(year) + C(younger)*C(repeal) + C(younger)*C(year) + 
C(yr)*C(year) + C(fip)*t + acc + ir + pi + alcohol + crack + poverty + income + ur""",
    data=abortion_filt,
    weights=abortion_filt.totpop.values,
).fit(
    cov_type='cluster',
    cov_kwds={'groups': abortion_filt.fip.values},
    method='pinv',
)

# Coefficients and standard errors of the yr x year interaction terms,
# 1986 through 2000.
abortion_plot = pd.DataFrame({
    'sd': reg.bse[
        'C(yr)[T.1]:C(year)[T.1986.0]':'C(yr)[T.1]:C(year)[T.2000.0]'
    ],
    'mean': reg.params[
        'C(yr)[T.1]:C(year)[T.1986.0]':'C(yr)[T.1]:C(year)[T.2000.0]'
    ],
    'year': np.arange(1986, 2001),
})

# Interval bounds: estimate -/+ 1.96 standard errors.
abortion_plot['lb'] = abortion_plot['mean'] - abortion_plot['sd']*1.96
abortion_plot['ub'] = abortion_plot['mean'] + abortion_plot['sd']*1.96


(
    p.ggplot(abortion_plot, p.aes(x = 'year', y = 'mean'))
    + p.geom_rect(
        p.aes(xmin=1986, xmax=1991, ymin=-np.inf, ymax=np.inf),
        fill = "cyan",
        alpha = 0.01,
    )
    + p.geom_point()
    + p.geom_text(p.aes(label = 'year'), ha='right')
    + p.geom_hline(yintercept = 0)
    + p.geom_errorbar(
        p.aes(ymin = 'lb', ymax = 'ub'),
        width = 0.2,
        position = p.position_dodge(0.05),
    )
    + p.labs(title= "Estimated effect of abortion legalization on gonorrhea")
)
Esempio n. 15
0
def main():
    """Run CLI."""
    parser = argparse.ArgumentParser(description="""
            Fits logistic regression to predict labels.'
            """)

    parser.add_argument(
        '-v',
        '--version',
        action='version',
        version='%(prog)s {version}'.format(version=__version__))

    parser.add_argument(
        '-h5',
        '--h5_anndata',
        action='store',
        dest='h5',
        required=True,
        help='H5 AnnData file where clusters have been saved to cluster slot.')

    # parser.add_argument(
    #     '-ncpu', '--number_cpu',
    #     action='store',
    #     dest='number_cpu',
    #     default=50,
    #     type=int,
    #     help='Number of CPUs to use. Since we are testing the dask backend,\
    #         this corresponds to the number of CPUs available across all of\
    #         the worker jobs we spin out.\
    #         (default: %(default)s)'
    # )

    parser.add_argument('-s',
                        '--sparsity_l1',
                        action='store',
                        dest='sparsity_l1',
                        default=0.0001,
                        type=float,
                        help='Smaller values specify stronger regularization.\
            (default: %(default)s)')

    parser.add_argument('-nepoch',
                        '--number_epoch',
                        action='store',
                        dest='number_epoch',
                        default=25,
                        type=int,
                        help='Number of epochs.\
            (default: %(default)s)')

    parser.add_argument(
        '-bs',
        '--batch_size',
        action='store',
        dest='batch_size',
        default=32,
        type=int,
        help='Batch size. Divides the dataset into n batches and updates the\
            weights at the end of each one.\
            (default: %(default)s)')

    parser.add_argument(
        '-tsc',
        '--train_size_cells',
        action='store',
        dest='train_size_cells',
        default=0,
        type=int,
        help='Number of cells to use for training set. If > 0 all\
            remaining cells not randomly selected for training will be used\
            for the test set. Overrides <train_size_fraction>.\
            (default: %(default)s)')

    parser.add_argument('-tsf',
                        '--train_size_fraction',
                        action='store',
                        dest='train_size_fraction',
                        default=0.67,
                        type=float,
                        help='Fraction of the data to use for training set.\
            (default: %(default)s)')

    parser.add_argument(
        '--dict_add',
        action='store',
        dest='dict_add',
        default='',
        type=str,
        help='Additional information to add to output model_report.\
            Format: key::value:::key2::value2.\
            Example: method::leiden:::resolution::3.0\
            (default: %(default)s)')

    parser.add_argument('--grid_search',
                        action='store_true',
                        dest='grid_search',
                        default=False,
                        help='Run a grid search of hyperparameters.\
            (default: %(default)s)')

    parser.add_argument('--memory_limit',
                        action='store',
                        dest='memory_limit',
                        default=50,
                        type=int,
                        help='Memory limit in Gb.\
            (default: %(default)s)')

    parser.add_argument(
        '-of',
        '--output_file',
        action='store',
        dest='of',
        default='',
        help='Basename of output files, assuming output in current working \
            directory.\
            (default: keras_model-<params>)')
    options = parser.parse_args()

    verbose = True

    # Set GPU memory limits
    gpus = tf.config.list_physical_devices('GPU')
    print(gpus)
    if gpus:
        # For TF v1
        # config = tf.ConfigProto()
        # config.gpu_options.allow_growth = True
        # session = tf.Session(config=config)

        # For TF v2
        try:
            # Method 1:
            # Currently, memory growth needs to be the same across GPUs
            for gpu in gpus:
                tf.config.experimental.set_memory_growth(gpu, True)

            # Method 2:
            # Restrict TensorFlow to only allocate 1GB of memory on the first
            # GPU
            # tf.config.experimental.set_virtual_device_configuration(
            #     gpus[0],
            #     [tf.config.experimental.VirtualDeviceConfiguration(
            #         memory_limit=options.memory_limit*1024
            #     )])
            # logical_gpus = tf.config.list_logical_devices('GPU')
            # print(
            #     len(gpus),
            #     "Physical GPUs,",
            #     len(logical_gpus),
            #     "Logical GPUs"
            # )
        except RuntimeError as e:
            # Virtual devices must be set before GPUs have been initialized
            print(e)
    else:
        raise Exception('ERROR: no GPUs detected.')

    # Get additional data we are going to append to the output model info
    dict_add = {}
    if options.dict_add != '':
        for item in options.dict_add.split(':::'):
            _tmp = item.split('::')
            if len(_tmp) != 2:
                raise Exception('ERROR: check dict_add.')
            else:
                dict_add[_tmp[0]] = _tmp[1]
    print(dict_add)

    # Load the AnnData file.
    # This file should already have clusters identified and saved to the
    # clusters slot.
    adata = sc.read_h5ad(filename=options.h5)

    # Set X to cp10k
    # adata.X = np.expm1(adata.layers['log1p_cp10k'])
    # Set X to ln(cp10k+1)
    # NOTE: Testing with the 100k TI dataset, we were able to achieve higher
    # accuracy with log1p_cp10k - likely because of the better spread in its
    # distribution.
    adata.X = adata.layers['log1p_cp10k']
    # Set X to raw counts
    # adata.X = adata.layers['counts']

    # Add some info from adata to dict_add
    for key, value in adata.uns['neighbors']['params'].items():
        dict_add['neighbors__{}'.format(key)] = value
    for key, value in adata.uns['cluster']['params'].items():
        dict_add['cluster__{}'.format(key)] = value

    # If train_size_cells, override the fraction so that the total number of
    # cells in the training set will be equal to train_size_cells.
    train_size_fraction = options.train_size_fraction
    if options.train_size_cells > 0:
        if options.train_size_cells >= adata.n_obs:
            raise Exception('Invalid train_size_cells.')
        train_size_fraction = (
            1 - ((adata.n_obs - options.train_size_cells) / adata.n_obs))
        if verbose:
            print(
                'Set train_size_fraction to: {}.'.format(train_size_fraction))
    if verbose:
        print('Number cells training ({}) and testing ({}).'.format(
            int(train_size_fraction * adata.n_obs),
            int((1 - train_size_fraction) * adata.n_obs)))

    # Set X and y
    X = adata.X
    y = adata.obs['cluster'].values

    # Set other variables
    sparsity_l1 = options.sparsity_l1
    n_epochs = options.number_epoch
    batch_size = options.batch_size

    # Center and scale the data
    if sp.sparse.issparse(X):
        X = X.todense()
    X_std = X
    scaler = preprocessing.StandardScaler(with_mean=True, with_std=True)
    X_std = scaler.fit_transform(X)
    if verbose:
        print('center={} scale={}'.format(True, True))

    # One hot encode y (the cell type classes)
    # encode class values as integers
    encoder = preprocessing.LabelEncoder()
    encoder.fit(y)
    print('Found {} clusters'.format(len(encoder.classes_)))

    # Define the model
    # NOTE: Defaults determined via grid search of 160k TI single cells
    def classification_model(optimizer='sgd',
                             activation='softmax',
                             loss='categorical_crossentropy',
                             sparsity_l1__activity=0.0001,
                             sparsity_l2__activity=0.0,
                             sparsity_l1__kernel=0.0,
                             sparsity_l2__kernel=0.0,
                             sparsity_l1__bias=0.0,
                             sparsity_l2__bias=0.0):
        """Build and compile a single-Dense-layer Keras classifier.

        The model is a ``Sequential`` holding one ``Dense`` layer that maps
        the input features (``X.shape[1]`` of them, taken from the enclosing
        scope) straight onto one output unit per class in
        ``encoder.classes_`` (also from the enclosing scope).

        Args:
            optimizer: Optimizer passed to ``model.compile`` (e.g. 'sgd',
                'adam').
            activation: Activation of the output layer (e.g. 'softmax',
                'sigmoid').
            loss: Loss passed to ``model.compile``.
            sparsity_l1__activity: L1 factor of the activity regularizer.
            sparsity_l2__activity: L2 factor of the activity regularizer.
            sparsity_l1__kernel: L1 factor of the kernel regularizer.
            sparsity_l2__kernel: L2 factor of the kernel regularizer.
            sparsity_l1__bias: L1 factor of the bias regularizer.
            sparsity_l2__bias: L2 factor of the bias regularizer.

        Returns:
            The compiled ``Sequential`` model.
        """
        model = Sequential()

        # Why softmax by default: it turns the outputs into per-class
        # probabilities that sum to 1.0 and assumes every example belongs to
        # exactly one class, i.e. multi-class prediction with a single label.
        # https://developers.google.com/machine-learning/crash-course/multi-class-neural-networks/softmax
        # https://developers.google.com/machine-learning/crash-course/multi-class-neural-networks/video-lecture
        n_classes = len(encoder.classes_)  # one output unit per class
        activity_penalty = L1L2(l1=sparsity_l1__activity,
                                l2=sparsity_l2__activity)
        kernel_penalty = L1L2(l1=sparsity_l1__kernel,
                              l2=sparsity_l2__kernel)
        bias_penalty = L1L2(l1=sparsity_l1__bias,
                            l2=sparsity_l2__bias)
        n_features = X.shape[1]  # input dim = number of features in the data

        output_layer = Dense(
            n_classes,
            use_bias=True,  # fit an intercept per class
            activation=activation,
            activity_regularizer=activity_penalty,
            kernel_regularizer=kernel_penalty,
            bias_regularizer=bias_penalty,
            input_dim=n_features)
        model.add(output_layer)
        # Further layers could be stacked here, e.g.
        #   model.add(Dense(8, input_dim=4, activation='relu'))
        #   model.add(Dense(3, activation='softmax'))

        # Metrics tracked over the training epochs. Other candidates that
        # are currently switched off: TruePositives, FalsePositives,
        # TrueNegatives, FalseNegatives, Precision, Recall and AUC from
        # keras.metrics.
        tracked_metrics = [
            keras.metrics.CategoricalAccuracy(name='categorical_accuracy'),
            keras.metrics.BinaryAccuracy(name='accuracy'),
        ]

        # The default loss is the logarithmic loss that Keras calls
        # "categorical_crossentropy". Adam was tried as the optimizer, but
        # sgd worked better empirically, hence the default.
        model.compile(optimizer=optimizer,
                      loss=loss,
                      metrics=tracked_metrics)

        return model

    # Now, either call a grid search or specific model fit
    if options.grid_search:
        # Get the out file base.
        out_file_base = options.of
        if out_file_base == '':
            out_file_base = 'keras_model'
        out_file_base = '{}-grid_search'.format(out_file_base)

        # Call grid search of various parameters
        grid_result, df_grid_result = keras_grid(
            model_function=classification_model,
            encoder=encoder,
            X_std=X_std,
            y=y,
            n_epochs=n_epochs,
            batch_size=batch_size)

        # NOTE: This will fail because KerasClassifier can't be pickled. This
        # is fine though because results are saved in tsv.gz format below.
        # Save the results
        # out_f = '{}-grid_result.gz'.format(out_file_base)
        # joblib.dump(
        #     grid_result,
        #     out_f,
        #     compress=('gzip', 3)
        # )
        # Load the model
        # lr = joblib.load(
        #     'test-lr_model.joblib.gz'
        # )
        # print(lr)

        # Save the results of our search to tsv
        out_f = '{}-grid_result.tsv.gz'.format(out_file_base)
        df_grid_result.to_csv(out_f,
                              sep='\t',
                              index=False,
                              quoting=csv.QUOTE_NONNUMERIC,
                              na_rep='',
                              compression=compression_opts)

        # Add a single columns that summarizes params
        param_columns = [
            col for col in df_grid_result.columns if 'param__' in col
        ]
        df_grid_result['params'] = df_grid_result[param_columns].astype(
            str).apply(lambda x: '-'.join(x), axis=1)

        # Plot the distribution of accuracy across folds
        split_columns = [
            col for col in df_grid_result.columns if 'split' in col
        ]
        split_columns = [col for col in split_columns if '_test_score' in col]
        df_plt = pd.melt(df_grid_result,
                         id_vars=['params'],
                         value_vars=split_columns)
        gplt = plt9.ggplot(df_plt, plt9.aes(x='params', y='value'))
        gplt = gplt + plt9.theme_bw()
        gplt = gplt + plt9.geom_boxplot(alpha=0.8)
        gplt = gplt + plt9.geom_jitter(alpha=0.75)
        gplt = gplt + plt9.scale_y_continuous(
            # trans='log10',
            # labels=comma_labels,
            minor_breaks=0
            # limits=[0, 1]
        )
        gplt = gplt + plt9.labs(x='Parameters', y='Score', title='')
        gplt = gplt + plt9.theme(
            axis_text_x=plt9.element_text(angle=-45, hjust=0))
        gplt.save('{}-score.png'.format(out_file_base),
                  dpi=300,
                  width=10,
                  height=4,
                  limitsize=False)

        # Plot the mean time and std err for fitting results
        gplt = plt9.ggplot(df_grid_result,
                           plt9.aes(x='params', y='mean_fit_time'))
        gplt = gplt + plt9.theme_bw()
        gplt = gplt + plt9.geom_point()
        gplt = gplt + plt9.geom_errorbar(plt9.aes(
            ymin='mean_fit_time-std_fit_time',
            ymax='mean_fit_time+std_fit_time'),
                                         width=0.2,
                                         position=plt9.position_dodge(0.05))
        gplt = gplt + plt9.scale_y_continuous(
            # trans='log10',
            # labels=comma_labels,
            minor_breaks=0)
        gplt = gplt + plt9.labs(x='Parameters', y='Mean fit time', title='')
        gplt = gplt + plt9.theme(
            axis_text_x=plt9.element_text(angle=-45, hjust=0))
        gplt.save('{}-fit_time.png'.format(out_file_base),
                  dpi=300,
                  width=10,
                  height=4,
                  limitsize=False)

    else:
        # Get the out file base.
        out_file_base = options.of
        if out_file_base == '':
            out_file_base = 'keras_model'
            # out_file_base = '{}-center={}-scale={}'.format(
            #     out_file_base,
            #     center,
            #     scale
            # )
            out_file_base = '{}-batch_size={}-epochs={}'.format(
                out_file_base, batch_size, n_epochs)
            out_file_base = '{}-sparsity_l1={}-train_size_fraction={}'.format(
                out_file_base,
                str(sparsity_l1).replace('.', 'pt'),
                str(train_size_fraction).replace('.', 'pt'))

        # Fit the specific model and save the results
        model, model_report, y_prob_df, history = fit_model_keras(
            model_function=classification_model,
            encoder=encoder,
            X_std=X_std,
            y=y,
            sparsity_l1=sparsity_l1,
            sparsity_l2=0.0,
            n_epochs=n_epochs,
            batch_size=batch_size,
            train_size_fraction=train_size_fraction)

        # Save the model, weights (coefficients), and bias (intercept)
        model.save('{}.h5'.format(out_file_base),
                   overwrite=True,
                   include_optimizer=True)

        # Save the model and weights (coefficients) separately
        # open('{}.json'.format(out_file_base), 'w').write(model.to_json())
        open('{}.yml'.format(out_file_base), 'w').write(model.to_yaml())
        model.save_weights('{}-weights.h5'.format(out_file_base))
        # Example read functions
        # model = model_from_yaml(open('my_model_architecture.yaml').read())
        # model.load_weights('my_model_weights.h5')

        # Save the model report
        # Add column telling us if this is cluster or summary value
        is_cluster = []
        for i in model_report.index:
            if i in encoder.classes_:
                is_cluster.append(True)
            else:
                is_cluster.append(False)
        model_report['is_cluster'] = is_cluster
        # Add in extra data
        model_report['sparsity_l1'] = sparsity_l1
        if dict_add:
            for key, value in dict_add.items():
                model_report[key] = value
        print(model_report)
        out_f = '{}-model_report.tsv.gz'.format(out_file_base)
        model_report.to_csv(out_f,
                            sep='\t',
                            index=True,
                            index_label='cell_label',
                            quoting=csv.QUOTE_NONNUMERIC,
                            na_rep='',
                            compression=compression_opts)
        if verbose:
            print('Completed: save {}.'.format(out_f))

        # Save the test results - each row is a cell and the columns are the
        # prob of that cell belonging to a particular class.
        # Add in extra data
        y_prob_df['sparsity_l1'] = sparsity_l1
        if dict_add:
            for key, value in dict_add.items():
                y_prob_df[key] = value
        out_f = '{}-test_result.tsv.gz'.format(out_file_base)
        y_prob_df.to_csv(
            out_f,
            sep='\t',
            index=False,  # NOTE: Not adding the label to test_result index.
            # index_label='cell_label',
            quoting=csv.QUOTE_NONNUMERIC,
            na_rep='',
            compression=compression_opts)
        if verbose:
            print('Completed: save {}.'.format(out_f))

        # Make a matrix of weights per gene
        # Columns = genes tested and rows = cell type label
        weight, bias = model.layers[-1].get_weights()
        # weight, bias = model.get_layer("output").get_weights()
        df_weights = pd.DataFrame.from_records(
            weight,
            index=adata.var.index,  # index is gene
            columns=encoder.classes_)
        # Save the weights dataframe.
        out_f = '{}-weights.tsv.gz'.format(out_file_base)
        df_weights.to_csv(out_f,
                          sep='\t',
                          index=True,
                          index_label='ensembl_gene_id',
                          quoting=csv.QUOTE_NONNUMERIC,
                          na_rep='',
                          compression=compression_opts)
        if verbose:
            print('Completed: save {}.'.format(out_f))

        # Plot the number of features with non-zero coefficients in each
        # cluster.
        out_f = '{}-n_features.png'.format(out_file_base)
        df_plt = pd.DataFrame({
            'classes': df_weights.columns,
            'features': (df_weights != 0).sum(axis=0)
        })
        df_plt = df_plt.set_index('classes')
        # print(df_plt)
        # Add in categories with no predictive model (e.g., because they were
        # too few in training).
        for i in adata.obs['cluster'].cat.categories:
            if i not in df_plt.index:
                df_plt = df_plt.append(
                    pd.Series([0], index=df_plt.columns, name=i))
        fig = plt.figure(figsize=(max(0.5 * len(df_plt.index), 5), 4))
        # plt.bar(lr.classes_, n_features)
        plt.bar(df_plt.index, df_plt['features'])
        plt.xlabel('Cluster')
        plt.ylabel('Features with coefficient != 0')
        plt.xticks(rotation=90)
        for i in df_plt.index:
            plt.annotate(str(df_plt.loc[i, 'features']),
                         xy=(i, df_plt.loc[i, 'features']))
        fig.savefig(out_f, dpi=300, bbox_inches='tight')
        plt.close(fig)

        # Plot ROC of the test and truth.
        out_f = '{}-roc.png'.format(out_file_base)
        fig = plt.figure()
        cell_label_true = y_prob_df.pop('cell_label_true')
        # Drop columns that are not cell type labels
        for i in y_prob_df.columns:
            if 'class__' not in i:
                del y_prob_df[i]
        plot_roc(y_prob_df.values, cell_label_true.values, y_prob_df.columns)
        fig.savefig(out_f, dpi=300, bbox_inches='tight')
        plt.close(fig)
        if verbose:
            print('Completed: save {}.'.format(out_f))

        # Plot metrics vs cluster size to see if smaller clusters have poorer
        # metric measures.
        df_plt = model_report.fillna(0)
        for i in df_plt.index:
            if i not in encoder.classes_:
                df_plt = df_plt.drop(i)
        for i in ['AUC', 'f1-score', 'average_precision_score', 'MCC']:
            out_f = '{}-cluster_size_{}.png'.format(out_file_base, i)
            fig = plt.figure()
            plt.scatter(df_plt['n_cells_full_dataset'], df_plt[i], alpha=0.5)
            plt.xlabel('Number of cells in cluster (full dataset)')
            plt.ylabel(i)
            if i in ['AUC', 'f1-score', 'average_precision_score']:
                plt.ylim(0, 1)
            elif i == 'MCC':
                plt.ylim(-1, 1)
            # Add annotation of the cluster
            for index, row in df_plt.iterrows():
                if row['n_cells_full_dataset'] == 0:
                    print('ERROP: n_cells_full_dataset = 0 for {}.'.format(
                        index))
                plt.annotate(
                    index,  # this is the text
                    (row['n_cells_full_dataset'], row[i]),  # point to label
                    textcoords='offset points',  # how to position the text
                    xytext=(0, 10),  # distance from text to points (x,y)
                    ha='center'  # horiz alignment can be left, right, center
                )
            fig.savefig(out_f, dpi=300, bbox_inches='tight')
            plt.xscale('log', basex=10)
            fig.savefig('{}-cluster_size_{}_log10.png'.format(
                out_file_base, i),
                        dpi=300,
                        bbox_inches='tight')
            plt.close(fig)
            if verbose:
                print('Completed: save {}.'.format(out_f))

        # Plot history of metrics over epochs
        for dat_i in history.history.keys():
            fig = plt.figure()
            plt.plot(history.history[dat_i])
            plt.ylabel(dat_i)
            plt.xlabel('Epoch')
            fig.savefig('{}-model_iter_{}.png'.format(out_file_base, dat_i),
                        dpi=300,
                        bbox_inches='tight')
            plt.close(fig)
# Build a one-column ('score') frame that repeats `permuted_score` once per
# entry of `lst_num_experiments`, indexed by those entries, so it can be
# plotted as a reference line next to the per-experiment scores.
# NOTE: the `pd.np` alias was deprecated in pandas 1.0 and removed in
# pandas 2.0, so NumPy is imported and used directly.
import numpy as np

threshold = pd.DataFrame(
    np.tile(
        permuted_score,
        (len(lst_num_experiments), 1)),
    index=lst_num_experiments,
    columns=['score'])

panel_A = ggplot(all_svcca)     + geom_line(all_svcca,
                aes(x=lst_num_experiments, y='score', color='Group'),
                size=1.5) \
    + geom_point(aes(x=lst_num_experiments, y='score'), 
                 color ='darkgrey',
                size=0.5) \
    + geom_errorbar(all_svcca,
                  aes(x=lst_num_experiments, ymin='ymin', ymax='ymax'),
                   color='darkgrey') \
    + geom_line(threshold, 
                aes(x=lst_num_experiments, y='score'), 
                linetype='dashed',
                size=1.5,
                color="darkgrey",
                show_legend=False) \
    + labs(x = "Number of Partitions", 
           y = "Similarity score (SVCCA)", 
           title = "Similarity across varying numbers of partitions") \
    + theme(plot_title=element_text(weight='bold'),
            plot_background=element_rect(fill="white"),
            panel_background=element_rect(fill="white"),
            panel_grid_major_x=element_line(color="lightgrey"),
            panel_grid_major_y=element_line(color="lightgrey"),
Esempio n. 17
0
 # + geom_line(data = inc, mapping=aes(x="julian", y="uniqueID"), colour="black")
 #    + geom_smooth(data=inc, mapping=aes(x="julian", y="uniqueID"), colour="black", method="mavg", se=False, method_args={"window": 4, "center": True, "min_periods": 1})
 + scale_x_continuous(labels=label_x, limits=[xmin, xmax])).save("figs/ACI_BARW_incend.png", height=8, width=8, dpi=150)

# Count the non-null `uniqueID` entries of `barw_nest` for every julian day.
inc = (
    barw_nest
    .groupby("julian", as_index=False)
    .uniqueID
    .count()
    .reset_index()
)

# Settings handed to the moving-average ("mavg") smoother.
mavg_args = {"window": 4, "center": True, "min_periods": 1}

# Smoothed daily counts with the incubation and hatch periods shaded and
# labelled; the figure is written to disk rather than displayed.
nest_plot = (
    ggplot(data=inc, mapping=aes(x="julian", y="uniqueID"))
    + xlab("Day")
    + ylab("Number of nest initiation/hatch")
    + geom_smooth(method="mavg", se=False, method_args=mavg_args)
    + annotate(
        "rect",
        xmin=[inc_start, hatch_start],
        xmax=[inc_end, hatch_end],
        ymin=-math.inf,
        ymax=math.inf,
        alpha=0.1,
        fill=["red", "blue"],
    )
    + annotate(
        "text",
        x=[inc_lbl_pos, hatch_lbl_pos],
        y=4.5,
        label=["incubation", "hatch"],
    )
    + scale_x_continuous(labels=label_x, limits=[xmin, xmax])
)
nest_plot.save("figs/Nest_BARW_incend.png", height=8, width=8, dpi=150)


# Per-plot daily ACI summary for the "Barrow" site, followed by a faceted
# plot of the daily mean with +/- one standard deviation error bars.

# Keep only the rows of `aci` recorded at the Barrow site.
res3 = aci.loc[aci.site == "Barrow"]
# Run `check_dates` on each plot's rows, passing `site_data` as an extra
# argument. NOTE(review): `check_dates` is defined elsewhere; presumably it
# filters rows by date - confirm.
res3 = res3.groupby(["plot"], as_index=False).apply(check_dates, site_data)
# NOTE(review): the result of reset_index() is discarded, so `res3` is left
# unchanged by this line - confirm whether an assignment was intended.
res3.reset_index()
# Per (plot, julian day): mean and std of ACI, plus mean lat and lon.
res3 = res3.groupby(["plot", "julian"], as_index=False).agg({"ACI": ["mean", "std"], "lat": "mean", "lon": "mean"})
# Rebuild the column labels by passing each one through `join_tuple` with "_".
# NOTE(review): presumably this flattens (column, stat) tuples into names
# such as "ACI_mean" / "ACI_std", which the plot below relies on - confirm
# against `join_tuple`.
res3.columns = pd.Index(join_tuple(i, "_") for i in res3.columns)
# Bare expression: only has a visible effect in an interactive session.
res3
# One facet row per plot with free scales; points, error bars spanning
# ACI_mean -/+ ACI_std, and a moving-average smooth. The plot object is not
# saved here (the .save call is commented out at the end of the expression).
(ggplot(data=res3, mapping=aes(x='julian', y='ACI_mean', colour='plot'))
    + xlab("Day")
    + ylab("Mean daily ACI (standardized)")
    + facet_grid("plot~", scales="free")
    + geom_point()
    + geom_errorbar(aes(ymin="ACI_mean - ACI_std", ymax="ACI_mean + ACI_std"))
    + geom_smooth(method="mavg", se=False, method_args={"window": 4, "center": True, "min_periods": 1})
    + scale_x_continuous(labels=label_x))  # .save("figs/ACI_BARW_plots2.png", height=12, width=8, dpi=150)
        'auroc_upper': lambda x: x.auroc_mean + (critical_val * x.auroc_std)/pd.np.sqrt(x.lf_num_len),
        'auroc_lower': lambda x: x.auroc_mean - (critical_val * x.auroc_std)/pd.np.sqrt(x.lf_num_len),
        'aupr_upper': lambda x: x.aupr_mean + (critical_val * x.aupr_std)/pd.np.sqrt(x.lf_num_len),
        'aupr_lower': lambda x: x.aupr_mean - (critical_val * x.aupr_std)/pd.np.sqrt(x.lf_num_len)
    })
)
dev_disc_df.head(2)


# In[7]:


g = ( 
    p9.ggplot(dev_disc_df, p9.aes(x="factor(lf_num)", y="auroc_mean", linetype="model", color="relation"))
    + p9.geom_point()
    + p9.geom_errorbar(p9.aes(ymin="auroc_lower", ymax="auroc_upper"))
    + p9.geom_line(p9.aes(group="model"))
    + p9.scale_x_discrete(limits=[0, 1, 6, 11, 16, 'All'])
    + p9.scale_color_manual(values={
        "DaG": mcolors.to_hex(color_map["DaG"]),
        'CtD': mcolors.to_hex(color_map["CtD"]),
        "CbG": mcolors.to_hex(color_map["CbG"]),
        "GiG": mcolors.to_hex(color_map["GiG"]),
        }, guide=False)
    + p9.facet_wrap("relation")
    + p9.labs(
        title="Disc Model Performance (Tune Set)",
    )
    + p9.xlab("Number of Label Functions")
    + p9.ylab("AUROC")
    + p9.theme_bw()