def plot_vs_discrete(data_table,
                     discrete_metric_name,
                     metric_name,
                     segment_name,
                     title,
                     ylim=None,
                     aggregate="mean"):
    """Plot an aggregated metric per discrete level as dodged bars.

    Rows where ``metric_name`` or ``discrete_metric_name`` is null are
    dropped, the metric is cast to float, aggregated per
    (``discrete_metric_name``, ``segment_name``) pair and rounded to three
    decimals. Each bar is labelled with its rounded value.

    Arguments:
        data_table - dataframe holding the three columns named below
        discrete_metric_name - column mapped to the x axis
        metric_name - column that is aggregated and mapped to the y axis
        segment_name - column mapped to the bar fill
        title - plot title
        ylim - optional y-axis limits handed to ``plot.ylim``; ``None`` or a
            null scalar leaves the axis untouched
        aggregate - aggregation passed to ``DataFrame.agg``

    Returns:
        The assembled ggplot object.
    """
    rows_with_values = ((pd.notnull(data_table[metric_name])) &
                        (pd.notnull(data_table[discrete_metric_name])))
    # .copy() makes the cast below write to an independent frame instead of
    # a slice of the caller's table (avoids SettingWithCopy problems).
    data_filtered = data_table.loc[rows_with_values][
        [discrete_metric_name, metric_name, segment_name]].copy()
    data_filtered[[metric_name]] = data_filtered[[metric_name]].astype(float)

    result = data_filtered.groupby(
        [discrete_metric_name, segment_name]).agg(
            {metric_name: aggregate}).reset_index()
    result[metric_name] = result[metric_name].round(3)

    gg_result = plot.ggplot(result) + plot.aes(x=discrete_metric_name,
                                               y=metric_name,
                                               fill=segment_name,
                                               label=metric_name) + \
        plot.geom_bar(stat="identity", position="dodge") + \
        plot.geom_text(position=plot.position_dodge(width=.9), size=8) + \
        plot.labs(x=discrete_metric_name,
                  y=aggregate + "(" + metric_name + ")",
                  title=title)

    # pd.notnull() returns an array for list/array input, and an array has no
    # single truth value. Only apply the null test to scalars; any sequence of
    # limits is forwarded as given.
    if ylim is not None and (not pd.api.types.is_scalar(ylim)
                             or pd.notnull(ylim)):
        gg_result = gg_result + plot.ylim(ylim)

    return gg_result
def plot_bargraph(count_plot_df, plot_df):
    """
    Build the horizontal bar graph of lemma counts, one panel per repository.

    Arguments:
        count_plot_df - dataframe with "lemma", "count" and "repository"
                        columns; "count" is cast to int before plotting
        plot_df - dataframe with "lemma" and "odds_ratio" columns; it only
                  determines the order of the lemmas on the axis
    """
    counts = count_plot_df.astype({"count": int})
    canvas = p9.ggplot(counts, p9.aes(x="lemma", y="count"))
    bars = p9.geom_col(position=p9.position_dodge(width=0.5), fill="#253494")

    # Lemmas are laid out by ascending odds ratio taken from plot_df.
    ordered_lemmas = plot_df.sort_values(
        "odds_ratio", ascending=True).lemma.tolist()
    lemma_axis = p9.scale_x_discrete(limits=(ordered_lemmas))
    count_axis = p9.scale_y_continuous(labels=custom_format('{:,.0g}'))

    base_theme = p9.theme_seaborn(
        context='paper', style="ticks", font="Arial", font_scale=0.95)
    overrides = p9.theme(
        # 640 x 480
        figure_size=(6.66, 5),
        strip_background=p9.element_rect(fill="white"),
        strip_text=p9.element_text(size=12),
        axis_title=p9.element_text(size=12),
        axis_text_x=p9.element_text(size=10),
    )

    return (canvas
            + bars
            + p9.coord_flip()
            + p9.facet_wrap("repository", scales='free_x')
            + lemma_axis
            + count_axis
            + p9.labs(x=None)
            + base_theme
            + overrides)
def plot_pointplot(plot_df, y_axis_label="", use_log10=False,
                   limits=(0, 3.2)):
    """
    Plots the pointplot

    Arguments:
        plot_df - the dataframe that contains the odds ratio and lemmas;
                  the columns "lemma", "odds_ratio", "lower_odds" and
                  "upper_odds" are read
        y_axis_label - the label for the y axis
        use_log10 - use log10 for the y axis?
        limits - limits of the continuous y scale; ignored when use_log10
                 is set. The default is a tuple so it cannot be mutated
                 between calls.
    """
    # Lemmas are ordered by ascending odds ratio.
    lemma_order = plot_df.sort_values(
        "odds_ratio", ascending=True).lemma.tolist()

    if use_log10:
        y_scale = p9.scale_y_log10()
    else:
        y_scale = p9.scale_y_continuous(limits=limits)

    graph = (
        p9.ggplot(plot_df, p9.aes(x="lemma", y="odds_ratio"))
        + p9.geom_pointrange(p9.aes(ymin="lower_odds", ymax="upper_odds"),
                             position=p9.position_dodge(width=1),
                             size=0.3,
                             color="#253494")
        + p9.scale_x_discrete(limits=(lemma_order))
        + y_scale
        # Reference line at an odds ratio of 1.
        + p9.geom_hline(p9.aes(yintercept=1), linetype='--', color='grey')
        + p9.coord_flip()
        + p9.theme_seaborn(
            context='paper', style="ticks", font_scale=1, font='Arial')
        + p9.theme(
            # 640 x 480
            figure_size=(6.66, 5),
            panel_grid_minor=p9.element_blank(),
            axis_title=p9.element_text(size=12),
            axis_text_x=p9.element_text(size=10))
        + p9.labs(x=None, y=y_axis_label))
    return graph
def test_dodge_preserve_single_text():
    # Bars and their count labels share one dodge position object, so the
    # text should line up with the bars it annotates.
    frame = pd.DataFrame({
        'x': ['a', 'b', 'b', 'b'],
        'y': ['a', 'a', 'b', 'b'],
    })
    dodge = position_dodge(preserve='single', width=0.9)

    bars = geom_bar(position=dodge)
    count_labels = geom_text(
        aes(y=after_stat('count'), label=after_stat('count')),
        stat='count',
        position=dodge,
        va='bottom')
    p = ggplot(frame, aes('x', fill='y')) + bars + count_labels

    assert p + _theme == 'dodge_preserve_single_text'
def plot_cumulative_returns(wanted_stocks: Iterable[str], ld: LazyDictionary) -> p9.ggplot:
    """Plot summed positive/negative values per date with large movers on top.

    ``ld["cip_df"]`` is restricted to the rows named in ``wanted_stocks`` and
    to the columns whose label starts with a digit. Per column, the
    ``positive_sum`` and ``negative_sum`` aggregates are drawn as green and
    red bars; individual values above 5.0 or below -5.0 are drawn as points
    filled by stock code.

    NOTE(review): the melted frames are expected to carry a ``fetch_date``
    column, which presumably comes from the name of the column index of
    ``cip_df`` - confirm against the code that builds it.
    """
    df = ld["cip_df"]
    # Raw string: "\d" in a plain literal is an invalid escape sequence.
    df = df.filter(wanted_stocks, axis=0).filter(regex=r"^\d", axis=1)
    dates = set(df.columns)

    # assign() returns a new frame, so the helper column does not leak into
    # `df`, which is aggregated column-by-column further down.
    movers = df.assign(asx_code=df.index)
    movers = movers.melt(id_vars="asx_code", value_vars=dates)
    movers = movers[(movers["value"] < -5.0) | (movers["value"] > 5.0)]  # ignore small movers
    movers["fetch_date"] = pd.to_datetime(movers["fetch_date"], format="%Y-%m-%d")

    # need to have separate dataframe's for positive and negative stocks - otherwise plotnine plot will be wrong
    pos_df = df.agg([positive_sum])
    neg_df = df.agg([negative_sum])
    pos_df = pos_df.melt(value_vars=dates)
    neg_df = neg_df.melt(value_vars=dates)
    pos_df["fetch_date"] = pd.to_datetime(pos_df["fetch_date"], format="%Y-%m-%d")
    neg_df["fetch_date"] = pd.to_datetime(neg_df["fetch_date"], format="%Y-%m-%d")

    plot = (p9.ggplot()
            + p9.geom_bar(
                p9.aes(x="fetch_date", y="value"),
                data=pos_df,
                stat="identity",
                fill="green",
            )
            + p9.geom_bar(
                p9.aes(x="fetch_date", y="value"),
                data=neg_df,
                stat="identity",
                fill="red",
            )
            + p9.geom_point(
                p9.aes(
                    x="fetch_date",
                    y="value",
                    fill="asx_code",
                ),
                data=movers,
                size=3,
                position=p9.position_dodge(width=0.4),
                colour="black",
            ))
    return user_theme(
        plot,
        y_axis_label="Cumulative Return (%)",
        legend_position="right",
        asxtrade_want_cmap_d=False,
        asxtrade_want_fill_d=True,  # points (stocks) are filled with the user-chosen theme, but everything else is fixed
    )
def plot_result_stats(results, title):
    """Bar chart of the ``describe()`` statistics of every column in results.

    The "count", "min" and "max" rows are left out; each remaining bar is
    labelled with its value rounded to two decimals. The visible y range is
    fixed to [0, 1.0].
    """
    column_names = {"level_0": "metric", "level_1": "group", 0: "value"}
    stats = results.describe().unstack().reset_index()
    stats = stats.rename(columns=column_names)

    hidden_rows = stats["group"].isin(["count", "min", "max"])
    stats = stats[~hidden_rows]
    stats["value_presentation"] = round(stats["value"], 2)

    value_labels = p9.geom_text(p9.aes(label="value_presentation"),
                                position=p9.position_dodge(width=0.9),
                                va="bottom")
    plot = (p9.ggplot(stats)
            + p9.aes("metric", "value", fill="group")
            + p9.geom_col(position="dodge")
            + p9.theme_bw()
            + p9.coord_cartesian(ylim=[0, 1.0])
            + p9.ggtitle(title)
            + value_labels)
    return plot
def plot_vs_continuous(data_table, continuous_metric_name, breaks,
                       metric_name, segment_name, title, aggregate="mean"):
    """Plot an aggregated metric against a continuous metric as dodged bars.

    The aggregation is delegated to ``_aggregate_vs_continuous``; its
    "level_0" column is mapped to the x axis, ``metric_name`` to the y axis
    and the bar labels, and ``segment_name`` to the fill.
    """
    aggregated = _aggregate_vs_continuous(data_table, continuous_metric_name,
                                          breaks, metric_name, segment_name,
                                          aggregate)

    mapping = plot.aes(x="level_0", y=metric_name, fill=segment_name,
                       label=metric_name)
    bars = plot.geom_bar(stat="identity", position="dodge")
    bar_labels = plot.geom_text(position=plot.position_dodge(width=.9),
                                size=8)
    axis_labels = plot.labs(x=continuous_metric_name,
                            y=aggregate + "(" + metric_name + ")",
                            title=title)

    return plot.ggplot(aggregated) + mapping + bars + bar_labels + axis_labels
def plot_distributions_bar_plot_grid(dataframe, figure_size=(14, 4)):
    """
    Build a grid of dodged count bar plots, one column per 'iterations' value.
    """

    def legend_labels(keys):
        # Legend text for the two fill values.
        names = {0: 'Stable', 1: 'Unstable'}
        return [names[key] for key in keys]

    def iterations_label(value):
        # Text shown in the strip of each facet column.
        return f'iters = {value}'

    # Bars counting rows per threshold, split by 'value', with the count
    # written above each bar.
    canvas = p9.ggplot(dataframe, p9.aes(x='threshold', fill='value'))
    bars = p9.geom_bar(position='dodge')
    counts = p9.geom_text(p9.aes(label='stat(count)'),
                          stat='count',
                          position=p9.position_dodge(0.9),
                          size=7,
                          va='bottom')

    # Axis names; the y expansion is
    # (mul_bottom, add_bottom, mul_top, add_top), leaving room on top.
    x_axis = p9.scale_x_discrete(name='Threshold')
    y_axis = p9.scale_y_continuous(name='Count', expand=(0, 0, 0, 500))

    # Bar colours and the legend entries.
    fill_scale = p9.scale_fill_manual(values={0: '#009e73', 1: '#d55e00'},
                                      labels=legend_labels)

    # One facet column per 'iterations' value, with relabelled strips.
    grid = p9.facet_grid('. ~ iterations',
                         labeller=p9.labeller(cols=iterations_label))

    appearance = p9.theme(
        # No y axis title.
        axis_title_y=p9.element_blank(),
        # Font size of the tick labels on both axes.
        axis_text_x=p9.element_text(size=7),
        axis_text_y=p9.element_text(size=7),
        # Untitled legend on top with a reduced margin.
        legend_title=p9.element_blank(),
        legend_position='top',
        legend_box_margin=2,
        # Overall figure size.
        figure_size=figure_size,
    )

    return (canvas + bars + counts + x_axis + y_axis + fill_scale + grid
            + appearance)
def _strip_tsv_suffix(file_name):
    """Return file_name without a trailing '.gz' and/or '.tsv' extension."""
    # str.rstrip() removes a *set of characters*, not a suffix, so it would
    # also eat trailing letters of the stem (e.g. 'cells.tsv.gz' -> 'cell').
    for suffix in ('.gz', '.tsv'):
        if file_name.endswith(suffix):
            file_name = file_name[:-len(suffix)]
    return file_name


def main():
    """Run CLI.

    Computes LISI for every reduced-dimension file and every requested
    metadata column, writes all values to ``<out_file_base>.tsv.gz`` and
    saves one violin plot (png) and one ECDF plot (pdf) per metadata column.

    Raises:
        ValueError: if the number of reduced-dimension files and the number
            of labels differ.
    """
    parser = argparse.ArgumentParser(description="""
        Calcualte and compare LISI across a series of reduced dims and
        categorical variables.
        """)

    parser.add_argument(
        '-v',
        '--version',
        action='version',
        version='%(prog)s {version}'.format(version=__version__))

    parser.add_argument(
        '-rf',
        '--reduced_dims_tsv',
        action='store',
        dest='reduced_dims',
        required=True,
        help='List of tab-delimited files of reduced dimensions (e.g., PCs)\
            for each cell. First column is cell_barcode. List should be\
            split by "::" (e.g. file1.tsv.gz::file2.tsv.gz).')

    parser.add_argument(
        '-lbl',
        '--reduced_dims_tsv_labels',
        action='store',
        dest='reduced_dims_labels',
        required=True,
        help='String of labels for each reduced_dims_tsv file. List should be\
            split by "::".')

    parser.add_argument(
        '-mf',
        '--metadata_tsv',
        action='store',
        dest='metadata_tsv',
        required=True,
        help='Tab-delimited file of metadata for each cell. First column\
            is cell_barcode.')

    parser.add_argument(
        '-mv',
        '--metadata_columns',
        action='store',
        dest='metadata_columns',
        default='experiment_id',
        help='Comma separated string of categorical variables to calculate\
            LISI with.\
            (default: %(default)s)')

    parser.add_argument('-p',
                        '--perplexity',
                        action='store',
                        dest='perplexity',
                        default=30.0,
                        type=float,
                        help='Perplexity.\
            (default: %(default)s)')

    parser.add_argument(
        '-of',
        '--output_file',
        action='store',
        dest='of',
        default='',
        help='Basename of output files, assuming output in current working \
            directory.\
            (default: <metadata_tsv>-lisi)')

    options = parser.parse_args()

    # Get the out file base.
    out_file_base = options.of
    if out_file_base == '':
        out_file_base = '{}-lisi'.format(
            _strip_tsv_suffix(os.path.basename(options.metadata_tsv)))

    # Get the columns to use; each is read as a pandas categorical.
    lisi_columns = options.metadata_columns.split(',')
    lisi_columns_dtype = dict(
        zip(lisi_columns, ['category'] * len(lisi_columns)))

    # Load the metadata file
    file_meta = options.metadata_tsv
    df_meta = pd.read_csv(file_meta,
                          sep='\t',
                          index_col='cell_barcode',
                          dtype=lisi_columns_dtype)

    # Load the reduced dims.
    files = options.reduced_dims.split('::')
    labels = options.reduced_dims_labels.split('::')
    # Raise explicitly: an assert would be stripped when running under -O.
    if len(files) != len(labels):
        raise ValueError('ERROR: check files and labels input')

    # Make a dict of theoretical maximum LISI value for each label
    # (the number of categories of the column).
    lisi_limit = {
        col: len(df_meta[col].cat.categories)
        for col in lisi_columns
    }

    list_lisi = []
    for file_path, label in zip(files, labels):
        df_reduced_dims = pd.read_csv(file_path,
                                      sep='\t',
                                      index_col='cell_barcode')

        # Run lisi and save results to dataframe. The reduced dims are
        # re-ordered to the metadata index so that rows line up.
        _df_lisi = pd.DataFrame(hm.compute_lisi(
            df_reduced_dims.loc[df_meta.index, :], df_meta[lisi_columns],
            lisi_columns),
                                columns=lisi_columns)
        _df_lisi['file'] = file_path
        _df_lisi['label'] = label
        _df_lisi['cell_barcode'] = df_meta.index
        list_lisi.append(_df_lisi)

    # Make one long dataframe.
    df_lisi = pd.concat(list_lisi)

    # Make cell_barcode (added last above) the first column.
    cols = list(df_lisi.columns)
    cols = [cols[-1]] + cols[:-1]

    # Save the results
    df_lisi[cols].to_csv('{}.tsv.gz'.format(out_file_base),
                         sep='\t',
                         index=False,
                         quoting=csv.QUOTE_NONNUMERIC,
                         na_rep='',
                         compression='gzip')

    # Compare the lisi distributions
    n_labels = len(labels)
    for lisi_column in lisi_columns:
        # Make density plot.
        gplt = plt9.ggplot(df_lisi,
                           plt9.aes(
                               fill='label',
                               x='label',
                               y=lisi_column,
                           ))
        gplt = gplt + plt9.theme_bw(base_size=12)
        gplt = gplt + plt9.geom_violin(alpha=0.9)
        gplt = gplt + plt9.geom_boxplot(
            group='label',
            position=plt9.position_dodge(width=.9),
            width=.1,
            fill='white',
            outlier_alpha=0  # Do not know how to totally remove outliers.
        )
        # Add a line at the theoretical maximum
        gplt = gplt + plt9.geom_hline(
            plt9.aes(yintercept=lisi_limit[lisi_column]))
        gplt = gplt + plt9.labs(x='Reduced dimensions', y='LISI', title='')
        gplt = gplt + plt9.theme(
            axis_text_x=plt9.element_text(angle=-45, hjust=0))
        gplt = gplt + plt9.theme(legend_position='none')
        # The brewer palette is only applied for fewer than 9 labels.
        if n_labels != 0 and n_labels < 9:
            gplt = gplt + plt9.scale_fill_brewer(palette='Dark2', type='qual')
        gplt.save(
            '{}-{}-violin.png'.format(out_file_base, lisi_column),
            dpi=300,
            width=4 * (n_labels / 4),
            height=10,
            limitsize=False)

        # Make ecdf.
        gplt = plt9.ggplot(df_lisi, plt9.aes(
            x=lisi_column,
            color='label',
        ))
        gplt = gplt + plt9.theme_bw(base_size=12)
        gplt = gplt + plt9.stat_ecdf(alpha=0.8)
        gplt = gplt + plt9.labs(
            x='LISI',
            y='Cumulative density',
            title='')
        if n_labels != 0 and n_labels < 9:
            gplt = gplt + plt9.scale_color_brewer(palette='Dark2',
                                                  type='qual')
        gplt.save('{}-{}-ecdf.pdf'.format(out_file_base, lisi_column),
                  dpi=300,
                  width=10,
                  height=4,
                  limitsize=False)
metadata_df["author_type"].value_counts()


# # BioRxiv Research Article Categories

# Categories assigned to each research article. Neuroscience dominates the majority of the articles, as expected.

# In[9]:


# plotnine doesn't implement a reverse keyword for scale_x_discrete, so the
# category order (by count) is reversed by hand and passed in as the limits.
category_list = metadata_df["category"].value_counts().index.tolist()
category_list.reverse()

g = (
    p9.ggplot(metadata_df, p9.aes(x="category"))
    + p9.geom_bar(
        size=10,
        fill="#253494",
        position=p9.position_dodge(width=3),
    )
    + p9.scale_x_discrete(limits=category_list)
    + p9.coord_flip()
    + p9.theme_seaborn(
        context="paper",
        style="ticks",
        font="Arial",
        font_scale=1,
    )
)
g.save("output/figures/preprint_category.png", dpi=500)
print(g)


# In[10]:


metadata_df["category"].value_counts()


# # New, Confirmatory, Contradictory Results?

# In[11]:


heading_list = metadata_df["heading"].value_counts().index.tolist()
heading_list.reverse()
def test_dodge_preserve_single():
    # 'a' has a single fill group and 'b' has two; the bars are dodged with
    # preserve='single'.
    frame = pd.DataFrame({
        'x': ['a', 'b', 'b'],
        'y': ['a', 'a', 'b'],
    })
    dodge = position_dodge(preserve='single')
    p = ggplot(frame, aes('x', fill='y')) + geom_bar(position=dodge)

    assert p + _theme == 'dodge_preserve_single'
max_depth=5, min_samples_split=2, max_features=5, n_jobs=n_threads) sklearn_forest.fit(X, y) current_timing = (time.time() - start_time) if n >= n_burn_in: timing_data['implementation'].append('scikit-learn 0.23.1') timing_data['threads'].append(n_threads) timing_data['timing'].append(current_timing) df = pd.DataFrame(data=timing_data) df = df.groupby(['implementation', 'threads']).agg(['mean', 'std']).reset_index() df.columns = ['Implementation', 'threads', 'mean', 'std'] print(df) df['error_min'] = df['mean'] - df['std'] df['error_max'] = df['mean'] + df['std'] p = (ggplot( df, aes(x='threads', y='mean', group='Implementation', color='Implementation')) + geom_line() + geom_point() + geom_errorbar(aes(ymin='error_min', ymax='error_max'), width=.2, position=position_dodge(0.05)) + labs(x="Number of threads", y="timing [s]")) p.save(filename='benchmark.png')
gg_rep_act.save(os.path.join(dir_output, 'gg_rep_act.png'),
                width=8,
                height=4)

# Display names for the values of the `notes` column.
di_notes = {
    'chi2': 'χ2-correction',
    'insig': 'Erroneous',
    'specification': 'Specification',
    'non-replicable': 'Inconsistent'
}

# (ii) Breakdown of counts
# Attach the number of res_fisher rows per `tt` value as `n_lit`.
# rename_axis() + reset_index(name=...) names both columns explicitly, so the
# result does not depend on how value_counts() labels its output (that
# labelling changed in pandas 2.0, which broke the former column rename).
tmp = acc_tt.merge(
    res_fisher.tt.value_counts().rename_axis('tt').reset_index(name='n_lit'))
tmp = tmp.assign(tt=lambda x: x.tt.map(di_tt),
                 notes=lambda x: x.notes.map(di_notes),
                 share=lambda x: x.n / x.n_lit)

gg_acc_notes = (
    pn.ggplot(tmp, pn.aes(x='notes', y='share', fill='tt'))
    + pn.theme_bw()
    + pn.scale_y_continuous(labels=percent_format(), limits=[0, 0.1])
    + pn.scale_fill_discrete(name='Literature')
    + pn.geom_col(color='black', position=pn.position_dodge(0.5), width=0.5)
    + pn.labs(y='Percent', x='Investigation')
    + pn.theme(axis_text_x=pn.element_text(angle=45),
               axis_title_x=pn.element_blank()))
gg_acc_notes.save(os.path.join(dir_output, 'gg_acc_notes.png'),
                  width=7,
                  height=3)

print('~~~ End of 4_results_insig.py ~~~')
# ## Global View of PCA plot # In[5]: g = (p9.ggplot(biorxiv_pca_method_section_df) + p9.aes(x="pca1", y="pca2", color="category") + p9.geom_point() + p9.theme_bw() + p9.labs(title="TSNE Methods Section (300 dim)")) print(g) # ## Neuroscience Methods Section # In[6]: g = (p9.ggplot(biorxiv_pca_method_section_df.query("category=='neuroscience'")) + p9.aes(x="pca1", y="pca2", color="section") + p9.geom_point(position=p9.position_dodge(width=0.2)) + p9.facet_wrap("section") + p9.theme_bw() + p9.theme(subplots_adjust={'wspace': 0.10}) + p9.scale_color_manual({ "has_methods": "#d8b365", "no_methods": "#5ab4ac" }) + p9.labs(title="Neuroscience Methods Section")) g.save("output/pca/neuroscience_missing_methods.png", dpi=500) print(g) # In[7]: g = (p9.ggplot(biorxiv_pca_method_section_df.query("category=='neuroscience'")) + p9.aes(x="pca1", y="pca2", color="section") + p9.geom_point(position=p9.position_dodge(width=0.2)) + p9.theme_bw() + p9.scale_color_manual({
if group is None: g += p9.geom_crossbar(p9.aes(x="x", y='center', ymin='low', ymax='high'), colour=ez_colors(1)[0], na_rm=False) else: g += p9.geom_crossbar(p9.aes(x="x", y='center', ymin='low', ymax='high', group="factor(group_x)", colour="factor(group)", fill="factor(group)"), position=p9.position_dodge( 0.7, preserve='single'), na_rm=True, alpha=0.2) g += p9.scale_fill_manual(values=ez_colors(g.n_groups('group'))) g += p9.scale_colour_manual(values=ez_colors(g.n_groups('group'))) elif geom == 'ribbon': g = EZPlot(gdata.dropna()) # set groups if group is None: g += p9.geom_ribbon(p9.aes(x="x", y='center', ymin='low',
def box_plot(df,
             x,
             y,
             group = None,
             facet_x = None,
             facet_y = None,
             dodge_groups=True,
             base_size = 10,
             figure_size = (6,3),
             **kwargs):
    '''
    Aggregates data in df and plots it as a box plot.

    Parameters
    ----------
    df : pd.DataFrame
      input dataframe
    x : str
      quoted expression to be plotted on the x axis
    y : str
      quoted expression to be plotted on the y axis
    group : str
      quoted expression to be used as group (ie color)
    facet_x : str
      quoted expression to be used as facet
    facet_y : str
      quoted expression to be used as facet
    dodge_groups : bool
      when a group is given, whether the boxes are drawn with an explicit
      dodge position (width 0.9, preserve='single')
    base_size : int
      base size for theme_ez
    figure_size :tuple of int
      figure size
    **kwargs : kwargs
      additional kwargs for geom_boxplot

    Returns
    -------
    g : EZPlot
      EZplot object

    '''

    # work on a copy so the caller's dataframe is left alone
    dataframe = df.copy()

    # split every expression into its (eventual) display name and the
    # expression itself
    names = {}
    groups = {}
    variables = {}

    grouping_expressions = (('x', x),
                            ('group', group),
                            ('facet_x', facet_x),
                            ('facet_y', facet_y))
    for label, var in grouping_expressions:
        names[label], groups[label] = unname(var)
    names['y'], variables['y'] = unname(y)

    # special case: the x axis is the dataframe index
    if x == '.index':
        groups['x'] = '.index'
        index_name = dataframe.index.name
        names['x'] = index_name if index_name is not None else ''

    # aggregate, then keep the known columns in a fixed order
    gdata = agg_data(dataframe, variables, groups, None, fill_groups=True)
    ordered_columns = ['x', 'y', 'group', 'facet_x', 'facet_y']
    gdata = gdata[[c for c in ordered_columns if c in gdata.columns]]

    # one key per (group, x) combination, used as the plotting group
    if group is not None:
        gdata['group_x'] = gdata['group'].astype('str') + '_' + gdata['x'].astype(str)

    g = EZPlot(gdata)

    # boxes
    if group is None:
        g += p9.geom_boxplot(p9.aes(x="factor(x)",
                                    y="y",
                                    group="factor(x)"),
                             colour = ez_colors(1)[0],
                             na_rm = False,
                             **kwargs)
    else:
        if dodge_groups:
            g += p9.geom_boxplot(p9.aes(x="factor(x)",
                                        y="y",
                                        group="factor(group_x)",
                                        fill="factor(group)"),
                                 position=p9.position_dodge(0.9, preserve='single'),
                                 na_rm = True,
                                 **kwargs)
        else:
            g += p9.geom_boxplot(p9.aes(x="factor(x)",
                                        y="y",
                                        group="factor(group_x)",
                                        fill="factor(group)"),
                                 na_rm = True,
                                 **kwargs)
        g += p9.scale_fill_manual(values=ez_colors(g.n_groups('group')))

    # facets: wrap on facet_x alone, grid when both are given
    if facet_x is not None:
        if facet_y is None:
            g += p9.facet_wrap('~facet_x')
        else:
            g += p9.facet_grid('facet_y~facet_x')

    # scales
    g += p9.scale_x_discrete()
    g += p9.scale_y_continuous(labels=ez_labels)

    # axis labels
    g += \
        p9.xlab(names['x']) + \
        p9.ylab(names['y'])

    # theme
    g += theme_ez(figure_size = figure_size,
                  base_size = base_size,
                  legend_title=p9.element_text(text=names['group'], size=base_size))

    return g
def barchart_make(roi, df, list_rois, config, ylimit, save_function,
                  find_ylim_function):
    """Build the bar chart for a single ROI and, unless only measuring, save it.

    Arguments:
        roi - position of the ROI inside list_rois
        df - dataframe; rows whose 'index' column equals the ROI are plotted
        list_rois - sequence of ROI identifiers
        config - settings object; the single_roi_fig_* attributes,
            colorblind_friendly_plot_colours, plot_dpi and
            use_same_axis_limits are read
        ylimit - upper y limit to force on the figure; a falsy value (0)
            leaves the axis automatic
        save_function - called as
            save_function(figure, thisroi, config, folder, 'barchart')
        find_ylim_function - called as
            find_ylim_function(thisroi, figure, 'yaxis') to obtain the
            figure's y limit

    Returns:
        The value returned by find_ylim_function when it was called,
        otherwise 0.
    """
    thisroi = list_rois[roi]

    # Rows for this ROI, ordered along the configured x-axis column.
    current_df = df.loc[df['index'] == thisroi]
    current_df = current_df.sort_values([config.single_roi_fig_x_axis])
    current_df = current_df.reset_index(
        drop=True)  # Reset index to remove grouping

    # Categorical whose categories follow the sorted order of appearance.
    current_df[config.single_roi_fig_x_axis] = pd.Categorical(
        current_df[config.single_roi_fig_x_axis],
        categories=current_df[config.single_roi_fig_x_axis].unique())

    figure = (
        pltn.ggplot(
            current_df,
            # ymin/ymax are given as expression strings over the 'Mean' and
            # 'Conf_Int_95' columns.
            pltn.aes(x=config.single_roi_fig_x_axis,
                     y='Mean',
                     ymin="Mean-Conf_Int_95",
                     ymax="Mean+Conf_Int_95",
                     fill='factor({colour})'.format(
                         colour=config.single_roi_fig_colour))) +
        pltn.theme_538() +
        pltn.geom_col(position=pltn.position_dodge(
            preserve='single', width=0.8), width=0.8, na_rm=True) +
        pltn.geom_errorbar(size=1,
                           position=pltn.position_dodge(
                               preserve='single', width=0.8)) +
        pltn.labs(x=config.single_roi_fig_label_x,
                  y=config.single_roi_fig_label_y,
                  fill=config.single_roi_fig_label_fill) +
        # The tick labels are blanked here; the geom_text layer below writes
        # the x values at y = -0.7 instead.
        pltn.scale_x_discrete(labels=[]) +
        pltn.theme(panel_grid_major_x=pltn.element_line(alpha=0),
                   axis_title_x=pltn.element_text(
                       weight='bold', color='black', size=20),
                   axis_title_y=pltn.element_text(
                       weight='bold', color='black', size=20),
                   axis_text_y=pltn.element_text(size=20, color='black'),
                   legend_title=pltn.element_text(size=20, color='black'),
                   legend_text=pltn.element_text(size=18, color='black'),
                   subplots_adjust={'right': 0.85},
                   legend_position=(0.9, 0.8),
                   dpi=config.plot_dpi) +
        pltn.geom_text(pltn.aes(y=-.7, label=config.single_roi_fig_x_axis),
                       color='black',
                       size=20,
                       va='top') +
        pltn.scale_fill_manual(
            values=config.colorblind_friendly_plot_colours))

    if ylimit:
        # Set y limit of figure (used to make it the same for every barchart)
        figure += pltn.ylim(None, ylimit)
        thisroi += '_same_ylim'

    returned_ylim = 0
    # No limit supplied yet: measure this figure's own y limit when a shared
    # axis is wanted.
    if config.use_same_axis_limits in ('Same limits',
                                       'Create both') and ylimit == 0:
        returned_ylim = find_ylim_function(thisroi, figure, 'yaxis')

    if config.use_same_axis_limits == 'Same limits' and ylimit == 0:
        # Measuring only: nothing is saved on this call.
        return returned_ylim
    elif ylimit != 0:
        folder = 'Same_yaxis'
    else:
        folder = 'Different_yaxis'

    save_function(figure, thisroi, config, folder, 'barchart')
    return returned_ylim
rel }) edges_df = pd.DataFrame.from_records(datarows) edges_df # In[11]: import math g = (p9.ggplot(edges_df, p9.aes(x="relation", y="edges", fill="in_hetionet")) + p9.geom_col(position="dodge") + p9.scale_fill_manual(values={ "Existing": color_map["Existing"], "Novel": color_map["Novel"] }) + p9.geom_text(p9.aes(label=( edges_df.apply(lambda x: f"{x['edges']}\n({x['recall']*100:.0f}%)" if not math.isnan(x['recall']) else f"{x['edges']}", axis=1))), position=p9.position_dodge(width=0.9), size=9, va="bottom") + p9.scale_y_log10() + p9.labs(y="# of Edges", x="Relation Type", title="Reconstructing Edges in Hetionet") + p9.guides(fill=p9.guide_legend(title="In Hetionet?")) + p9.theme( axis_text_y=p9.element_blank(), axis_ticks_major=p9.element_blank(), rect=p9.element_blank(), )) print(g) g.save(filename="../edges_added.png", dpi=300)
arti_start = time.time() df, separated_peaks = er.proof_artificial( model, ad_partial, region_length=parameters['pad_to'], nb_datasets=parameters['artificial_nb_datasets'], nb_tfs=parameters['artificial_nb_tfs'], n_iter=500, squish_factor=parameters['squish_factor']) arti_end = time.time() print('Artificial data generalisation completed in ' + str(arti_end - arti_start) + ' s') # The plots a = ggplot(df, aes(x="type", y="rebuilt_value", fill="tf_group")) a1 = a + geom_violin(position=position_dodge(1), width=1) a2 = a + geom_boxplot(position=position_dodge(1), width=0.5) b = ggplot(df, aes( x="brothers", y="rebuilt_value", group="brothers")) + scale_fill_grey() + geom_boxplot(width=0.4) a2.save(filename=plot_output_path + 'artifical_data_systematisation_value_per_type.png', height=10, width=14, units='in', dpi=400, verbose=False) b.save(filename=plot_output_path + 'artifical_data_systematisation_value_per_brothers.png', height=10,
def do_PCA(df, allowed_nas, groups, paired, outbasename, name, extra=None):
    """
    Run a 3-component PCA on the 2000 rows (events) of the PSI table with
    the greatest variance and save scatter plots of PC1 vs PC2 and
    PC2 vs PC3, filled by group.

    :param pd.DataFrame df: Ready df with per-event PSIs to calculate
        principal components
    :param int allowed_nas: Allowed NAs. If > 0, missing values are imputed
        with the row mean (PCA doesn't accept missing values)
    :param dict groups: Dictionary with info about each sample and the
        group they represent
    :param bool paired: Whether samples of each group are paired (if so,
        order of samples per group must be preserved). Paired runs use the
        'Ind' column for the point shape, unpaired runs the 'Sample' column
    :param str outbasename: Output basename
    :param str name: str to add to output (e.g. software name)
    :param str extra: Extra str to add (e.g. rMATS event type)
    :return: None; the plots are written as pdf files
    """
    n_events = df.shape[0]
    print("Number of {} events that will be used in the PCA: {}".format(
        name, n_events))

    if allowed_nas > 0:
        print("Doing imputation of missing values")
        df = df.apply(lambda x: x.fillna(x.mean()), axis=1)

    # Keep the rows with the largest variance across samples.
    most_variable = df.var(axis=1).nlargest(2000).index
    df = df.loc[most_variable, ]

    # Samples become the observations of the PCA.
    pca = PCA(n_components=3)
    PCs = pca.fit_transform(df.T)
    print(
        "Amount of variance that the first PCs contain for {} data: {}".format(
            name, pca.explained_variance_ratio_))

    PC_df = pd.DataFrame(data=PCs,
                         columns=['PC1', 'PC2', 'PC3'],
                         index=df.T.index)

    # Attach the group information to every sample.
    if paired:
        cols_groups_df = ['Group', 'Ind']
    else:
        cols_groups_df = ['Group']
    groups_df = pd.DataFrame.from_dict(groups,
                                       orient='index',
                                       columns=cols_groups_df)
    PC_df = PC_df.merge(groups_df, left_index=True, right_index=True)
    PC_df = PC_df.rename_axis('Sample').reset_index()

    # The two layouts only differ in the column mapped to the point shape.
    shape_column = 'Ind' if paired else 'Sample'

    for x_pc, y_pc in [("PC1", "PC2"), ("PC2", "PC3")]:
        mapping = p9.aes(x=x_pc, y=y_pc, fill="Group", shape=shape_column)
        points = p9.geom_point(color="black",
                               size=6,
                               alpha=0.7,
                               position=p9.position_dodge(width=0.3,
                                                          preserve="total"))
        p1 = (p9.ggplot(PC_df, mapping) + points)

        if extra is None:
            output = "{}_{}_{}vs{}.pdf".format(outbasename, name, x_pc, y_pc)
        else:
            output = "{}_{}_{}_{}vs{}.pdf".format(outbasename, extra, name,
                                                  x_pc, y_pc)
        p1.save(output, verbose=False)
pdtypes.CategoricalDtype()) # Using only combis of each individual length all_lengths = sorted(set(df['combi_length'])) # No point in going beyond, roughly, 12 all_lengths = [l for l in all_lengths if l <= 12] for length in all_lengths: df_filtered = df.loc[df['combi_length'] == length, :] try: p = (ggplot(data=df_filtered, mapping=aes(x='entropy', y='fc')) + geom_violin(position=position_dodge(1), width=1) + geom_boxplot(position=position_dodge(1), width=0.25) + xlab("Entropy") + ylab("Fold change (log2)") + ggtitle("Order " + str(length))) p.save(filename=ROOT_PATH + "entropy_graph/entropy_length_" + str(length) + "_fc.png") p = (ggplot(data=df_filtered, mapping=aes(x='entropy', y='s')) + geom_violin(position=position_dodge(1), width=1) + geom_boxplot(position=position_dodge(1), width=0.25) + xlab("Entropy") + ylab("True total overlapping bp.") + ggtitle("Order " + str(length))) p.save(filename=ROOT_PATH + "entropy_graph/entropy_length_" + str(length) + "_s.png")
def main():
    """Run CLI.

    Parses the command line, requires at least one GPU, loads an AnnData
    file, standardizes the ``log1p_cp10k`` layer and then either

    * runs a hyperparameter grid search (``--grid_search``) and writes the
      grid results plus score / fit-time plots, or
    * fits a single softmax classifier on the ``cluster`` labels and writes
      the model, its weights, a model report, per-cell test probabilities
      and several diagnostic plots.

    Relies on module-level names not defined in this function:
    ``__version__``, ``compression_opts``, ``keras_grid``,
    ``fit_model_keras`` and ``plot_roc``.

    Raises:
        Exception: if no GPU is detected, if ``--dict_add`` is malformed, or
            if ``--train_size_cells`` is >= the number of cells.
    """
    # NOTE(review): the indentation of this function was reconstructed from
    # whitespace-collapsed source; the spots marked NOTE(review) below should
    # be confirmed against the original file.
    parser = argparse.ArgumentParser(description="""
        Fits logistic regression to predict labels.'
        """)

    parser.add_argument(
        '-v', '--version',
        action='version',
        version='%(prog)s {version}'.format(version=__version__))

    parser.add_argument(
        '-h5', '--h5_anndata',
        action='store',
        dest='h5',
        required=True,
        help='H5 AnnData file where clusters have been saved to cluster slot.')

    # parser.add_argument(
    #     '-ncpu', '--number_cpu',
    #     action='store',
    #     dest='number_cpu',
    #     default=50,
    #     type=int,
    #     help='Number of CPUs to use. Since we are testing the dask backend,\
    #         this corresponds to the number of CPUs available across all of\
    #         the worker jobs we spin out.\
    #         (default: %(default)s)'
    # )

    parser.add_argument(
        '-s', '--sparsity_l1',
        action='store',
        dest='sparsity_l1',
        default=0.0001,
        type=float,
        help='Smaller values specify stronger regularization.\
            (default: %(default)s)')

    parser.add_argument(
        '-nepoch', '--number_epoch',
        action='store',
        dest='number_epoch',
        default=25,
        type=int,
        help='Number of epochs.\
            (default: %(default)s)')

    parser.add_argument(
        '-bs', '--batch_size',
        action='store',
        dest='batch_size',
        default=32,
        type=int,
        help='Batch size. Divides the dataset into n batches and updates the\
            weights at the end of each one.\
            (default: %(default)s)')

    parser.add_argument(
        '-tsc', '--train_size_cells',
        action='store',
        dest='train_size_cells',
        default=0,
        type=int,
        help='Number of cells to use for training set. If > 0 all\
            remaining cells not randomly selected for training will be used\
            for the test set. Overrides <train_size_fraction>.\
            (default: %(default)s)')

    parser.add_argument(
        '-tsf', '--train_size_fraction',
        action='store',
        dest='train_size_fraction',
        default=0.67,
        type=float,
        help='Fraction of the data to use for training set.\
            (default: %(default)s)')

    parser.add_argument(
        '--dict_add',
        action='store',
        dest='dict_add',
        default='',
        type=str,
        help='Additional information to add to output model_report.\
            Format: key::value:::key2::value2.\
            Example: method::leiden:::resolution::3.0\
            (default: %(default)s)')

    parser.add_argument(
        '--grid_search',
        action='store_true',
        dest='grid_search',
        default=False,
        help='Run a grid search of hyperparameters.\
            (default: %(default)s)')

    parser.add_argument(
        '--memory_limit',
        action='store',
        dest='memory_limit',
        default=50,
        type=int,
        help='Memory limit in Gb.\
            (default: %(default)s)')

    parser.add_argument(
        '-of', '--output_file',
        action='store',
        dest='of',
        default='',
        help='Basename of output files, assuming output in current working \
            directory.\
            (default: keras_model-<params>)')

    options = parser.parse_args()

    verbose = True

    # Set GPU memory limits
    gpus = tf.config.list_physical_devices('GPU')
    print(gpus)
    if gpus:
        # For TF v1
        # config = tf.ConfigProto()
        # config.gpu_options.allow_growth = True
        # session = tf.Session(config=config)

        # For TF v2
        try:
            # Method 1:
            # Currently, memory growth needs to be the same across GPUs
            for gpu in gpus:
                tf.config.experimental.set_memory_growth(gpu, True)

            # Method 2:
            # Restrict TensorFlow to only allocate 1GB of memory on the first
            # GPU
            # tf.config.experimental.set_virtual_device_configuration(
            #     gpus[0],
            #     [tf.config.experimental.VirtualDeviceConfiguration(
            #         memory_limit=options.memory_limit*1024
            #     )])
            # logical_gpus = tf.config.list_logical_devices('GPU')
            # print(
            #     len(gpus),
            #     "Physical GPUs,",
            #     len(logical_gpus),
            #     "Logical GPUs"
            # )
        except RuntimeError as e:
            # Virtual devices must be set before GPUs have been initialized
            print(e)
    else:
        raise Exception('ERROR: no GPUs detected.')

    # Get additional data we are going to append to the output model info
    dict_add = {}
    if options.dict_add != '':
        for item in options.dict_add.split(':::'):
            _tmp = item.split('::')
            if len(_tmp) != 2:
                raise Exception('ERROR: check dict_add.')
            else:
                dict_add[_tmp[0]] = _tmp[1]
    # NOTE(review): assumed to run unconditionally (prints {} when no
    # --dict_add was given) - confirm against the original indentation.
    print(dict_add)

    # Load the AnnData file.
    # This file should already have clusters identified and saved to the
    # clusters slot.
    adata = sc.read_h5ad(filename=options.h5)

    # Set X to cp10k
    # adata.X = np.expm1(adata.layers['log1p_cp10k'])
    # Set X to ln(cp10k+1)
    # NOTE: Testing with 100k TI dataset, we were able to achieve higher
    # accuracy with log1p_cp10k - likely because better spread in distribution.
    adata.X = adata.layers['log1p_cp10k']
    # Set X to raw counts
    # adata.X = adata.layers['counts']

    # Add some info from adata to dict_add
    for key, value in adata.uns['neighbors']['params'].items():
        dict_add['neighbors__{}'.format(key)] = value
    for key, value in adata.uns['cluster']['params'].items():
        dict_add['cluster__{}'.format(key)] = value

    # If train_size_cells, override the fraction so that the total number of
    # cells in the training set will be equal to train_size_cells.
    train_size_fraction = options.train_size_fraction
    if options.train_size_cells > 0:
        if options.train_size_cells >= adata.n_obs:
            raise Exception('Invalid train_size_cells.')
        train_size_fraction = (
            1 - ((adata.n_obs - options.train_size_cells) / adata.n_obs))
        if verbose:
            print(
                'Set train_size_fraction to: {}.'.format(train_size_fraction))
    if verbose:
        print('Number cells training ({}) and testing ({}).'.format(
            int(train_size_fraction * adata.n_obs),
            int((1 - train_size_fraction) * adata.n_obs)))

    # Set X and y
    X = adata.X
    y = adata.obs['cluster'].values

    # Set other variables
    sparsity_l1 = options.sparsity_l1
    n_epochs = options.number_epoch
    batch_size = options.batch_size

    # Center and scale the data
    if sp.sparse.issparse(X):
        # toarray() gives a plain ndarray. todense() would give np.matrix,
        # which recent scikit-learn releases refuse as estimator input.
        X = X.toarray()
    scaler = preprocessing.StandardScaler(with_mean=True, with_std=True)
    X_std = scaler.fit_transform(X)
    if verbose:
        print('center={} scale={}'.format(True, True))

    # One hot encode y (the cell type classes)
    # encode class values as integers
    encoder = preprocessing.LabelEncoder()
    encoder.fit(y)

    print('Found {} clusters'.format(len(encoder.classes_)))

    # Define the model
    # NOTE: Defaults determined via grid search of 160k TI single cells
    def classification_model(optimizer='sgd',
                             activation='softmax',
                             loss='categorical_crossentropy',
                             sparsity_l1__activity=0.0001,
                             sparsity_l2__activity=0.0,
                             sparsity_l1__kernel=0.0,
                             sparsity_l2__kernel=0.0,
                             sparsity_l1__bias=0.0,
                             sparsity_l2__bias=0.0):
        """Build and compile a single-Dense-layer classifier.

        The layer has one output unit per class in ``encoder.classes_`` and
        takes ``X.shape[1]`` input features; both are captured from the
        enclosing scope. Each ``sparsity_*`` argument is passed to the
        matching L1L2 regularizer (activity, kernel, bias).
        """
        # create model
        model = Sequential()
        # Use a "softmax" activation function in the output layer. This is to
        # ensure the output values are in the range of 0 and 1 and may be used
        # as predicted probabilities.
        #
        # https://developers.google.com/machine-learning/crash-course/multi-class-neural-networks/softmax
        # Softmax assigns decimal probabilities to each class in a multi-class
        # problem. Those decimal probabilities must add up to 1.0. This
        # additional constraint helps training converge more quickly than it
        # otherwise would. Softmax is implemented through a neural network
        # layer just before the output layer. The Softmax layer must have the
        # same number of nodes as the output layer.
        # Softmax assumes that each example is a member of exactly one class.
        #
        # Softmax should be used for multi-class prediction with single label
        # https://developers.google.com/machine-learning/crash-course/multi-class-neural-networks/video-lecture
        # NOTE: input dimension = number of features your data has
        model.add(
            Dense(
                len(encoder.classes_),  # output dim is number of classes
                use_bias=True,  # intercept
                activation=activation,  # softmax, sigmoid
                activity_regularizer=L1L2(l1=sparsity_l1__activity,
                                          l2=sparsity_l2__activity),
                kernel_regularizer=L1L2(l1=sparsity_l1__kernel,
                                        l2=sparsity_l2__kernel),
                bias_regularizer=L1L2(l1=sparsity_l1__bias,
                                      l2=sparsity_l2__bias),
                input_dim=X.shape[1]))
        # Example of adding additional layers
        # model.add(Dense(8, input_dim=4, activation='relu'))
        # model.add(Dense(3, activation='softmax'))

        # Metrics to check out over training epochs
        mets = [
            # loss,
            keras.metrics.CategoricalAccuracy(name='categorical_accuracy'),
            # keras.metrics.TruePositives(name='tp'),
            # keras.metrics.FalsePositives(name='fp'),
            # keras.metrics.TrueNegatives(name='tn'),
            # keras.metrics.FalseNegatives(name='fn'),
            # keras.metrics.Precision(name='precision'),
            # keras.metrics.Recall(name='recall'),
            # keras.metrics.AUC(name='auc'),
            keras.metrics.BinaryAccuracy(name='accuracy')
        ]

        # Use Adam gradient descent optimization algorithm with a logarithmic
        # loss function, which is called "categorical_crossentropy" in Keras.
        # UPDATE: sgd works better empirically.
        model.compile(
            optimizer=optimizer,  # adam, sgd
            loss=loss,
            metrics=mets)

        return model

    # Now, either call a grid search or specific model fit
    if options.grid_search:
        # Get the out file base.
        out_file_base = options.of
        if out_file_base == '':
            out_file_base = 'keras_model'
        # NOTE(review): assumed the suffix is always appended, also when
        # --output_file is given - confirm against the original indentation.
        out_file_base = '{}-grid_search'.format(out_file_base)

        # Call grid search of various parameters
        grid_result, df_grid_result = keras_grid(
            model_function=classification_model,
            encoder=encoder,
            X_std=X_std,
            y=y,
            n_epochs=n_epochs,
            batch_size=batch_size)

        # NOTE: grid_result itself is not saved because KerasClassifier
        # can't be pickled. This is fine because the results are saved in
        # tsv.gz format below.

        # Save the results of our search to tsv
        out_f = '{}-grid_result.tsv.gz'.format(out_file_base)
        df_grid_result.to_csv(out_f,
                              sep='\t',
                              index=False,
                              quoting=csv.QUOTE_NONNUMERIC,
                              na_rep='',
                              compression=compression_opts)

        # Add a single columns that summarizes params
        param_columns = [
            col for col in df_grid_result.columns if 'param__' in col
        ]
        df_grid_result['params'] = df_grid_result[param_columns].astype(
            str).apply(lambda x: '-'.join(x), axis=1)

        # Plot the distribution of accuracy across folds
        split_columns = [
            col for col in df_grid_result.columns if 'split' in col
        ]
        split_columns = [col for col in split_columns if '_test_score' in col]
        df_plt = pd.melt(df_grid_result,
                         id_vars=['params'],
                         value_vars=split_columns)
        gplt = plt9.ggplot(df_plt, plt9.aes(x='params', y='value'))
        gplt = gplt + plt9.theme_bw()
        gplt = gplt + plt9.geom_boxplot(alpha=0.8)
        gplt = gplt + plt9.geom_jitter(alpha=0.75)
        gplt = gplt + plt9.scale_y_continuous(
            # trans='log10',
            # labels=comma_labels,
            minor_breaks=0
            # limits=[0, 1]
        )
        gplt = gplt + plt9.labs(x='Parameters', y='Score', title='')
        gplt = gplt + plt9.theme(
            axis_text_x=plt9.element_text(angle=-45, hjust=0))
        gplt.save('{}-score.png'.format(out_file_base),
                  dpi=300,
                  width=10,
                  height=4,
                  limitsize=False)

        # Plot the mean time and std err for fitting results
        gplt = plt9.ggplot(df_grid_result,
                           plt9.aes(x='params', y='mean_fit_time'))
        gplt = gplt + plt9.theme_bw()
        gplt = gplt + plt9.geom_point()
        gplt = gplt + plt9.geom_errorbar(plt9.aes(
            ymin='mean_fit_time-std_fit_time',
            ymax='mean_fit_time+std_fit_time'),
                                         width=0.2,
                                         position=plt9.position_dodge(0.05))
        gplt = gplt + plt9.scale_y_continuous(
            # trans='log10',
            # labels=comma_labels,
            minor_breaks=0)
        gplt = gplt + plt9.labs(x='Parameters', y='Mean fit time', title='')
        gplt = gplt + plt9.theme(
            axis_text_x=plt9.element_text(angle=-45, hjust=0))
        gplt.save('{}-fit_time.png'.format(out_file_base),
                  dpi=300,
                  width=10,
                  height=4,
                  limitsize=False)
    else:
        # Get the out file base.
        out_file_base = options.of
        if out_file_base == '':
            # NOTE(review): the parameter suffixes are assumed to apply only
            # to the default basename, matching the --output_file help text
            # "(default: keras_model-<params>)" - confirm against the
            # original indentation.
            out_file_base = 'keras_model'
            # out_file_base = '{}-center={}-scale={}'.format(
            #     out_file_base,
            #     center,
            #     scale
            # )
            out_file_base = '{}-batch_size={}-epochs={}'.format(
                out_file_base, batch_size, n_epochs)
            out_file_base = '{}-sparsity_l1={}-train_size_fraction={}'.format(
                out_file_base,
                str(sparsity_l1).replace('.', 'pt'),
                str(train_size_fraction).replace('.', 'pt'))

        # Fit the specific model and save the results
        model, model_report, y_prob_df, history = fit_model_keras(
            model_function=classification_model,
            encoder=encoder,
            X_std=X_std,
            y=y,
            sparsity_l1=sparsity_l1,
            sparsity_l2=0.0,
            n_epochs=n_epochs,
            batch_size=batch_size,
            train_size_fraction=train_size_fraction)

        # Save the model, weights (coefficients), and bias (intercept)
        model.save('{}.h5'.format(out_file_base),
                   overwrite=True,
                   include_optimizer=True)

        # Save the model and weights (coefficients) separately
        # open('{}.json'.format(out_file_base), 'w').write(model.to_json())
        # NOTE(review): Model.to_yaml() is not available in recent
        # TensorFlow releases; kept as-is so the .yml output is unchanged.
        # The context manager closes the handle deterministically.
        with open('{}.yml'.format(out_file_base), 'w') as yml_handle:
            yml_handle.write(model.to_yaml())
        model.save_weights('{}-weights.h5'.format(out_file_base))
        # Example read functions
        # model = model_from_yaml(open('my_model_architecture.yaml').read())
        # model.load_weights('my_model_weights.h5')

        # Save the model report
        # Add column telling us if this is cluster or summary value
        model_report['is_cluster'] = [
            i in encoder.classes_ for i in model_report.index
        ]
        # Add in extra data
        model_report['sparsity_l1'] = sparsity_l1
        if dict_add:
            for key, value in dict_add.items():
                model_report[key] = value
        print(model_report)
        out_f = '{}-model_report.tsv.gz'.format(out_file_base)
        model_report.to_csv(out_f,
                            sep='\t',
                            index=True,
                            index_label='cell_label',
                            quoting=csv.QUOTE_NONNUMERIC,
                            na_rep='',
                            compression=compression_opts)
        if verbose:
            print('Completed: save {}.'.format(out_f))

        # Save the test results - each row is a cell and the columns are the
        # prob of that cell belonging to a particular class.
        # Add in extra data
        y_prob_df['sparsity_l1'] = sparsity_l1
        if dict_add:
            for key, value in dict_add.items():
                y_prob_df[key] = value
        out_f = '{}-test_result.tsv.gz'.format(out_file_base)
        y_prob_df.to_csv(
            out_f,
            sep='\t',
            index=False,  # NOTE: Not adding the label to test_result index.
            # index_label='cell_label',
            quoting=csv.QUOTE_NONNUMERIC,
            na_rep='',
            compression=compression_opts)
        if verbose:
            print('Completed: save {}.'.format(out_f))

        # Make a matrix of weights per gene
        # Columns = genes tested and rows = cell type label
        # The bias vector is not used below.
        weight, _ = model.layers[-1].get_weights()
        # weight, bias = model.get_layer("output").get_weights()
        df_weights = pd.DataFrame.from_records(
            weight,
            index=adata.var.index,  # index is gene
            columns=encoder.classes_)

        # Save the weights dataframe.
        out_f = '{}-weights.tsv.gz'.format(out_file_base)
        df_weights.to_csv(out_f,
                          sep='\t',
                          index=True,
                          index_label='ensembl_gene_id',
                          quoting=csv.QUOTE_NONNUMERIC,
                          na_rep='',
                          compression=compression_opts)
        if verbose:
            print('Completed: save {}.'.format(out_f))

        # Plot the number of features with non-zero coefficients in each
        # cluster.
        out_f = '{}-n_features.png'.format(out_file_base)
        df_plt = pd.DataFrame({
            'classes': df_weights.columns,
            'features': (df_weights != 0).sum(axis=0)
        })
        df_plt = df_plt.set_index('classes')
        # Add in categories with no predictive model (e.g., because they were
        # too few in training). pd.concat is used because DataFrame.append
        # no longer exists in pandas >= 2.0.
        missing_clusters = [
            i for i in adata.obs['cluster'].cat.categories
            if i not in df_plt.index
        ]
        if missing_clusters:
            df_plt = pd.concat([
                df_plt,
                pd.DataFrame(0,
                             index=missing_clusters,
                             columns=df_plt.columns)
            ])
        fig = plt.figure(figsize=(max(0.5 * len(df_plt.index), 5), 4))
        # plt.bar(lr.classes_, n_features)
        plt.bar(df_plt.index, df_plt['features'])
        plt.xlabel('Cluster')
        plt.ylabel('Features with coefficient != 0')
        plt.xticks(rotation=90)
        for i in df_plt.index:
            plt.annotate(str(df_plt.loc[i, 'features']),
                         xy=(i, df_plt.loc[i, 'features']))
        fig.savefig(out_f, dpi=300, bbox_inches='tight')
        plt.close(fig)

        # Plot ROC of the test and truth.
        out_f = '{}-roc.png'.format(out_file_base)
        fig = plt.figure()
        cell_label_true = y_prob_df.pop('cell_label_true')
        # Drop columns that are not cell type labels
        for i in y_prob_df.columns:
            if 'class__' not in i:
                del y_prob_df[i]
        plot_roc(y_prob_df.values, cell_label_true.values, y_prob_df.columns)
        fig.savefig(out_f, dpi=300, bbox_inches='tight')
        plt.close(fig)
        if verbose:
            print('Completed: save {}.'.format(out_f))

        # Plot metrics vs cluster size to see if smaller clusters have poorer
        # metric measures.
        df_plt = model_report.fillna(0)
        for i in df_plt.index:
            if i not in encoder.classes_:
                df_plt = df_plt.drop(i)
        for i in ['AUC', 'f1-score', 'average_precision_score', 'MCC']:
            out_f = '{}-cluster_size_{}.png'.format(out_file_base, i)
            fig = plt.figure()
            plt.scatter(df_plt['n_cells_full_dataset'], df_plt[i], alpha=0.5)
            plt.xlabel('Number of cells in cluster (full dataset)')
            plt.ylabel(i)
            if i in ['AUC', 'f1-score', 'average_precision_score']:
                plt.ylim(0, 1)
            elif i == 'MCC':
                plt.ylim(-1, 1)
            # Add annotation of the cluster
            for index, row in df_plt.iterrows():
                if row['n_cells_full_dataset'] == 0:
                    print('ERROP: n_cells_full_dataset = 0 for {}.'.format(
                        index))
                plt.annotate(
                    index,  # this is the text
                    (row['n_cells_full_dataset'], row[i]),  # point to label
                    textcoords='offset points',  # how to position the text
                    xytext=(0, 10),  # distance from text to points (x,y)
                    ha='center'  # horiz alignment can be left, right, center
                )
            fig.savefig(out_f, dpi=300, bbox_inches='tight')
            # Re-save the same figure with a log10 x axis.
            # NOTE(review): `basex` was renamed to `base` in newer matplotlib
            # releases; left unchanged to stay compatible with the version
            # this script was written for.
            plt.xscale('log', basex=10)
            fig.savefig('{}-cluster_size_{}_log10.png'.format(
                out_file_base, i),
                        dpi=300,
                        bbox_inches='tight')
            plt.close(fig)
            if verbose:
                print('Completed: save {}.'.format(out_f))

        # Plot history of metrics over epochs
        for dat_i in history.history.keys():
            fig = plt.figure()
            plt.plot(history.history[dat_i])
            plt.ylabel(dat_i)
            plt.xlabel('Epoch')
            fig.savefig('{}-model_iter_{}.png'.format(out_file_base, dat_i),
                        dpi=300,
                        bbox_inches='tight')
            plt.close(fig)
int(grouped_candidates_pred_df.hetionet.value_counts()[1]), "relation": "DaG" }) datarows.append({ "edges": (grouped_candidates_pred_df.query( "pred_max > 0.5").hetionet.value_counts()[0]), "in_hetionet": "Novel", "relation": "DaG" }) edges_df = pd.DataFrame.from_records(datarows) edges_df # In[20]: g = (p9.ggplot(edges_df, p9.aes(x="relation", y="edges", fill="in_hetionet")) + p9.geom_col(position="dodge") + p9.geom_text(p9.aes(label=( edges_df.apply(lambda x: f"{x['edges']} ({x['recall']*100:.0f}%)" if not math.isnan(x['recall']) else f"{x['edges']}", axis=1))), position=p9.position_dodge( width=1), size=9, va="bottom") + p9.scale_y_log10() + p9.theme(axis_text_y=p9.element_blank(), axis_ticks_major=p9.element_blank(), rect=p9.element_blank())) print(g)
# Different thresholds for the two cell lines if CELL_LINE == 'jurkat': if val > 0: toadd = 'a__0-0.5' if val > 0.5: toadd = 'b__0.5-1' if val > 1: toadd = 'c__1-2' if val > 2: toadd = 'd__2-3' if val > 3: toadd = 'e__3+' if CELL_LINE == "mcf7": if val > 0: toadd = 'a__0-3' if val > 3: toadd = 'b__3-10' if val > 5: toadd = 'c__5-10' if val > 10: toadd = 'd__10+' update_ratio_binarized += [toadd] sub['update_ratio_bin'] = update_ratio_binarized sub['sqrt_peak_score'] = np.sqrt(sub['peak_score']) # Now do violin plot p4 = (ggplot(data=sub[0:10000], mapping=aes(x='update_ratio_bin', y='peak_score')) + geom_violin(position=position_dodge(1), width=1) + scale_y_log10() + geom_boxplot(position=position_dodge(1), width=0.25)) p4.save( FIGURE_DIRECTORY + "peak_confirmation_nb_update_ratio_well_characterized_crm_violin_plot.pdf", verbose=False)
best_result = list(filter(lambda x: x[1] == model.C_, enumerate(model.Cs_)))[0] print(best_result) print("Best CV Fold") print(model.scores_["polka"][:, best_result[0]]) model.scores_["polka"][:, best_result[0]].mean() model_weights_df = pd.DataFrame.from_dict({ "weight": model.coef_[0], "pc": list(range(1, 51)), }) model_weights_df["pc"] = pd.Categorical(model_weights_df["pc"]) model_weights_df.head() g = (p9.ggplot(model_weights_df, p9.aes(x="pc", y="weight")) + p9.geom_col(position=p9.position_dodge(width=5), fill="#253494") + p9.coord_flip() + p9.scale_x_discrete(limits=list(sorted(range(1, 51), reverse=True))) + p9.theme_seaborn( context="paper", style="ticks", font_scale=1.1, font="Arial") + p9.theme(figure_size=(10, 8)) + p9.labs(title="Regression Model Weights", x="Princpial Component", y="Model Weight")) # g.save("output/figures/pca_log_regression_weights.svg") # g.save("output/figures/pca_log_regression_weights.png", dpi=250) print(g) fold_features = model.coefs_paths_["polka"].transpose(1, 0, 2) model_performance_df = pd.DataFrame.from_dict({ "feat_num": ((fold_features.astype(bool).sum(axis=1)) > 0).sum(axis=1), "C":
def test_dodge_preserve_single():
    """Compare a dodged bar chart using preserve='single' to its baseline."""
    # 'a' holds one fill group and 'b' holds two.
    data = pd.DataFrame({
        'x': ['a', 'b', 'b'],
        'y': ['a', 'a', 'b'],
    })
    dodge = position_dodge(preserve='single')
    bars = geom_bar(position=dodge)
    p = ggplot(data, aes('x', fill='y')) + bars

    assert p + _theme == 'dodge_preserve_single'