def plot_compare_accuracy(self, expo=False):
    if expo:
        return (
            ggplot(self.acc_df)
            + facet_wrap('position')
            + aes(x='guesser', y='accuracy', fill='Dataset')
            + geom_bar(stat='identity', position='dodge')
            + xlab('Guessing Model')
            + ylab('Accuracy')
        )
    else:
        return (
            ggplot(self.acc_df)
            + facet_wrap('position')
            + aes(x='guesser', y='accuracy')
            + geom_bar(stat='identity')
        )
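# For reference, a minimal self-contained sketch of the dodged-bar variant
# above. The acc_df columns (guesser, accuracy, position, Dataset) are
# inferred from the aes() mapping; the values below are made up.
import pandas as pd
from plotnine import ggplot, aes, facet_wrap, geom_bar

acc_df = pd.DataFrame({
    'guesser':  ['rnn', 'rnn', 'ir', 'ir'],
    'position': ['start', 'end', 'start', 'end'],
    'Dataset':  ['dev', 'test', 'dev', 'test'],
    'accuracy': [0.42, 0.61, 0.38, 0.55],
})
p = (ggplot(acc_df)
     + facet_wrap('position')
     + aes(x='guesser', y='accuracy', fill='Dataset')
     + geom_bar(stat='identity', position='dodge'))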
def test_ribbon_facetting():
    p = (ggplot(df, aes('x', ymin='ymin', ymax='ymax', fill='factor(z)'))
         + geom_ribbon()
         + facet_wrap('~ z'))

    assert p + _theme == 'ribbon_facetting'
def plot_char_percent_vs_accuracy_histogram(self, category=False):
    if category:
        return (
            ggplot(self.char_plot_df)
            + facet_wrap('category_jmlr')
            + aes(x='char_percent', fill='Outcome')
            + geom_histogram(binwidth=.05)
        )
    else:
        return (
            ggplot(self.char_plot_df)
            + aes(x='char_percent', fill='Outcome')
            + geom_histogram(binwidth=.05)
        )
def create_confidence_plot(conf_df):
    plt = (
        ggplot(conf_df)
        + aes(x='x', color='Method', fill='Method')
        + geom_density(alpha=.45)
        + facet_wrap('Task', nrow=4)
        + xlab('Confidence')
        + scale_color_manual(values=COLORS)
        + scale_fill_manual(values=COLORS)
        + theme_fs()
        + theme(
            axis_text_y=element_blank(),
            axis_ticks_major_y=element_blank(),
            axis_title_y=element_blank(),
            legend_title=element_blank(),
            legend_position='top',
            legend_box='horizontal',
        )
    )
    return plt
def create_length_plot(len_df, legend_position='right', legend_box='vertical'):
    mean_len_df = len_df.groupby(['Task', 'Method']).mean().reset_index()
    mean_len_df[' '] = 'Mean Length'

    plt = (
        ggplot(len_df)
        + aes(x='x', fill='Method', y='..density..')
        + geom_histogram(binwidth=2, position='identity', alpha=.6)
        + geom_text(
            aes(x='x', y=.22, label='x', color='Method'),
            mean_len_df,
            inherit_aes=False,
            format_string='{:.1f}',
            show_legend=False,
        )
        + geom_segment(
            aes(x='x', xend='x', y=0, yend=.205, linetype=' '),
            mean_len_df,
            inherit_aes=False,
            color='black',
        )
        + scale_linetype_manual(['dashed'])
        + facet_wrap('Task')
        + xlim(0, 20)
        + ylim(0, .23)
        + xlab('Example Length')
        + ylab('Frequency')
        + scale_color_manual(values=COLORS)
        + scale_fill_manual(values=COLORS)
        + theme_fs()
        + theme(
            aspect_ratio=1,
            legend_title=element_blank(),
            legend_position=legend_position,
            legend_box=legend_box,
        )
    )
    return plt
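# A hedged sketch of input that satisfies create_length_plot() above.
# Column names (Task, Method, x) are inferred from the groupby and aes()
# calls; the data is invented. COLORS and theme_fs() come from the
# surrounding module and must already be defined before plotting.
import numpy as np
import pandas as pd

rng = np.random.default_rng(0)
len_df = pd.DataFrame({
    'Task':   np.repeat(['NER', 'POS'], 100),
    'Method': np.tile(np.repeat(['baseline', 'ours'], 50), 2),
    'x':      rng.integers(1, 20, size=200),   # example lengths
})
# p = create_length_plot(len_df)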
def test_facet_wrap_one_var():
    p = g + facet_wrap('~var1')
    p2 = g + facet_wrap('~class')  # python keyword in formula

    assert p == 'facet_wrap_one_var'
    assert p2 == 'facet_wrap_one_var'
def test_aslabeller_func_hashtag():
    func = as_labeller(lambda s: '#{}'.format(s))
    p = g + facet_wrap('~ gear + am', labeller=func)

    assert p == 'aslabeller_func_hashtagit'
def test_label_context_wrap2vars():
    p = g + facet_wrap('~ gear + am', labeller='label_context')

    assert p == 'label_context_wrap2vars'
def test_non_mapped_facetting():
    p = (g
         + geom_abline(intercept=0, slope=1, size=1)
         + facet_wrap('var1'))

    assert p == 'non_mapped_facetting'
def syntactic_diversity_plots():
    with open('data/external/syntactic_diversity_table.json') as f:
        rows = json.load(f)
    parse_df = pd.DataFrame(rows)
    parse_df['parse_ratio'] = parse_df['unique_parses'] / parse_df['parses']
    melt_df = pd.melt(
        parse_df,
        id_vars=['dataset', 'depth', 'overlap', 'parses'],
        value_vars=['parse_ratio', 'unique_parses'],
        var_name='metric',
        value_name='y',
    )

    def label_facet(name):
        if name == 'parse_ratio':
            return 'Average Unique Parses per Instance'
        elif name == 'unique_parses':
            return 'Count of Unique Parses'

    def label_y(ys):
        formatted_ys = []
        for y in ys:
            y = str(y)
            if y.endswith('000.0'):
                formatted_ys.append(y[:-5] + 'K')
            else:
                formatted_ys.append(y)
        return formatted_ys

    p = (
        ggplot(melt_df)
        + aes(x='depth', y='y', color='dataset')
        + facet_wrap('metric', scales='free_y', nrow=2, labeller=label_facet)
        + geom_line()
        + geom_point()
        + xlab('Parse Truncation Depth')
        + ylab('')
        + scale_color_discrete(name='Dataset')
        + scale_y_continuous(labels=label_y)
        + scale_x_continuous(
            breaks=list(range(1, 11)),
            minor_breaks=list(range(1, 11)),
            limits=[1, 10])
        + theme_fs()
    )
    p.save(path.join(output_path, 'syn_div_plot.pdf'))

    p = (
        ggplot(parse_df)
        + aes(x='depth', y='unique_parses', color='dataset')
        + geom_line()
        + geom_point()
        + xlab('Parse Truncation Depth')
        + ylab('Count of Unique Parses')
        + scale_color_discrete(name='Dataset')
        + scale_x_continuous(
            breaks=list(range(1, 11)),
            minor_breaks=list(range(1, 11)),
            limits=[1, 10])
        + theme_fs()
    )
    p.save(path.join(output_path, 'n_unique_parses.pdf'))

    p = (
        ggplot(parse_df)
        + aes(x='depth', y='parse_ratio', color='dataset')
        + geom_line()
        + geom_point()
        + xlab('Parse Truncation Depth')
        + ylab('Average Unique Parses per Instance')
        + scale_color_discrete(name='Dataset')
        + scale_x_continuous(
            breaks=list(range(1, 11)),
            minor_breaks=list(range(1, 11)),
            limits=[1, 10])
        + scale_y_continuous(limits=[0, 1])
        + theme_fs()
    )
    p.save(path.join(output_path, 'parse_ratio.pdf'))
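# The label_facet pattern above relies on facet_wrap() accepting a plain
# callable as `labeller`: it receives each facet value and returns the strip
# label. A minimal self-contained sketch using plotnine's built-in mtcars:
from plotnine import ggplot, aes, geom_point, facet_wrap
from plotnine.data import mtcars

def label_gears(value):
    # `value` arrives as a string such as '3'; return the label to display
    return '{} gears'.format(value)

p = (ggplot(mtcars, aes('wt', 'mpg'))
     + geom_point()
     + facet_wrap('~ gear', labeller=label_gears))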
def test_facet_wrap_direction_v():
    p = g + facet_wrap('~var1', dir='v')

    assert p == 'facet_wrap_direction_v'
def test_facet_wrap_expression():
    p = g + facet_wrap('pd.cut(var1, (0, 2, 4), include_lowest=True)')

    assert p == 'facet_wrap_expression'
def plot_tag_repartition(self, data, options):
    tag_df = data["tags"]
    if "background" not in tag_df.columns:
        tag_df["background"] = False

    test = tag_df[["tag", "matched", "background", "id"]].copy()
    test.loc[:, "prop_matched"] = -1
    test.loc[:, "prop_background"] = -1
    test.loc[:, "lbl_matched"] = ""
    test.loc[:, "lbl_background"] = ""
    test.loc[:, "n_tags"] = -1

    n_total = test.shape[0]
    n_matched = test.matched.value_counts()

    tags_summary = (
        test.groupby("tag")
        .apply(self.get_proportions, by=["matched", "background"])
        .reset_index(drop=True)
    )
    tags_summary = tags_summary.sort_values(["tag", "matched", "background"])

    plt = ggplot(
        data=tags_summary,
        mapping=aes(
            x="tag",  # "factor(species, ordered=False)",
            y="n_tags",
            fill="background",
            ymax=max(tags_summary.n_tags) + 35,
        ),
    )
    plot_width = 10 + len(tags_summary.tag.unique()) * 0.75
    plt = (
        plt
        + geom_bar(stat="identity", show_legend=True, position=position_dodge())
        + facet_wrap(
            "matched",
            nrow=1,
            ncol=2,
            scales="fixed",
            labeller=(lambda x: self.get_matched_label(x, n_total, n_matched)),
        )
        + xlab("Species")
        + ylab("Number of annotations")
        + geom_text(
            mapping=aes(label="lbl_background"),
            position=position_dodge(width=0.9),
        )
        + geom_text(
            mapping=aes(y=max(tags_summary.n_tags) + 30, label="lbl_matched"),
        )
        + theme_classic()
        + theme(
            axis_text_x=element_text(angle=90, vjust=1, hjust=1, margin={"r": -30}),
            plot_title=element_text(weight="bold", size=14,
                                    margin={"t": 10, "b": 10}),
            figure_size=(plot_width, 10),
            text=element_text(size=12, weight="bold"),
        )
        + ggtitle(
            ("Tag repartition for model {}, database {}, class {}\n"
             + "with detector options {}").format(
                options["scenario_info"]["model"],
                options["scenario_info"]["database"],
                options["scenario_info"]["class"],
                options,
            )
        )
    )
    return plt
    .mean()
    .reset_index()
)

for plate in platelist:
    os.makedirs(output_figuresdir, exist_ok=True)
    by_well_gg = (
        gg.ggplot(
            cell_count_totalcells_df.loc[
                cell_count_totalcells_df["site"].str.contains(plate)
            ],
            gg.aes(x="x_loc", y="y_loc"),
        )
        + gg.geom_point(gg.aes(fill="total_cell_count"), shape="s", size=6)
        + gg.geom_text(gg.aes(label="site_location"), color="lightgrey", size=6)
        + gg.facet_wrap("~well")
        + gg.coord_fixed()
        + gg.theme_bw()
        + gg.ggtitle(f"Total Cells/Well\n{plate}")
        + gg.theme(
            axis_text=gg.element_blank(),
            axis_title=gg.element_blank(),
            strip_background=gg.element_rect(colour="black", fill="#fdfff4"),
        )
        + gg.labs(fill="Cells")
        + gg.scale_fill_cmap(name="Number of Cells")
    )
    output_file = pathlib.Path(
        output_figuresdir, f"plate_layout_cells_count_per_well_{plate}.png"
    )
knnResultsSimplified = pd.DataFrame(
    [(x['p'], x['k'], x['cvAccuracy'], x['testAccuracy'])
     for x in repeatedKnnResults],
    columns=['p', 'k', 'cvAccuracy', 'testAccuracy'])

ggdata = pd.concat([
    pd.DataFrame({
        'p': knnResultsSimplified.p,
        'k': knnResultsSimplified.k.apply(int),
        'type': 'cv',
        'Accuracy': knnResultsSimplified.cvAccuracy
    }),
    pd.DataFrame({
        'p': knnResultsSimplified.p,
        'k': knnResultsSimplified.k.apply(int),
        'type': 'test',
        'Accuracy': knnResultsSimplified.testAccuracy
    })
], axis=0)

ggo = gg.ggplot(ggdata, gg.aes(x='p', y='Accuracy',
                               color='type', group='type', linetype='type'))
ggo += gg.scale_x_log10()
ggo += gg.geom_point(alpha=0.6)
ggo += gg.stat_smooth()
ggo += gg.facet_wrap('~ k')
ggo += gg.theme_bw()
print(ggo)
# In[46]:

all_data_df.columns = [
    'PC1', 'PC2', 'num_partitions', 'comparison',
    'No. of partitions', 'Comparison'
]

# In[52]:

# Plot all comparisons in one figure
panel_B = ggplot(all_data_df[all_data_df['Comparison'] != '1'],
                 aes(x='PC1', y='PC2')) \
    + geom_point(aes(color='No. of partitions'), alpha=0.2) \
    + facet_wrap('~Comparison') \
    + labs(x="PC 1", y="PC 2", title="PCA of partition 1 vs multiple partitions") \
    + theme_bw() \
    + theme(
        legend_title_align="center",
        plot_background=element_rect(fill='white'),
        legend_key=element_rect(fill='white', colour='white'),
        legend_text=element_text(family='sans-serif', size=12),
        plot_title=element_text(family='sans-serif', size=15),
        axis_text=element_text(family='sans-serif', size=12),
        axis_title=element_text(family='sans-serif', size=15)
    ) \
    + guides(colour=guide_legend(override_aes={'alpha': 1})) \
    + scale_color_manual(['#bdbdbd', '#b3e5fc']) \
    std_accuracy=pd.NamedAgg('accuracy', np.std),
)
summary.reset_index(inplace=True)
print('Summary for all participants')
print(summary)

summary_per_participant = trials.groupby(
    by=['participant_id', 'freq_category']).agg(np.mean)
summary_per_participant.reset_index(inplace=True)
print('Summary per participant')
print(summary_per_participant)

plot = (
    ggplot(gg.aes(x='freq_category', y='reaction_time'), data=trials)
    + gg.geom_boxplot(gg.aes(fill='freq_category'))  # or jitter
    + gg.facet_wrap('participant_id'))
plot.draw()
plt.show()

plot = (
    ggplot(gg.aes(x="freq_category", weight='accuracy'), summary_per_participant)
    + gg.geom_bar()
    + gg.facet_wrap(['participant_id']))
plot.draw()
plt.show()

plot = (
    ggplot(gg.aes(x='freq_category', y='reaction_time'), data=trials)
    + gg.geom_boxplot(gg.aes(fill='freq_category'))  # or jitter
)
plot.draw()
plt.show()
from plotnine import (ggplot, aes, geom_point, facet_wrap,
                      stat_smooth, theme_xkcd)
from plotnine.data import mtcars

kwargs = dict(width=6, height=4)

p1 = (ggplot(mtcars, aes('wt', 'mpg'))
      + geom_point())
p1.save('readme-image-1.png', **kwargs)

p2 = p1 + aes(color='factor(gear)')
p2.save('readme-image-2.png', **kwargs)

p3 = p2 + stat_smooth(method='lm')
p3.save('readme-image-3.png', **kwargs)

p4 = p3 + facet_wrap('~gear')
p4.save('readme-image-4.png', **kwargs)

p5 = p4 + theme_xkcd()
p5.save('readme-image-5.png', **kwargs)
# In[11]:

# Side by side original input vs simulated data

# Add label for input or simulated dataset
input_data_UMAPencoded_df['dataset'] = 'original'
simulated_data_UMAPencoded_df['dataset'] = 'simulated'

# Concatenate input and simulated dataframes together
combined_data_df = pd.concat(
    [input_data_UMAPencoded_df, simulated_data_UMAPencoded_df])

# Plot
(ggplot(combined_data_df, aes(x='1', y='2'))
 + geom_point(alpha=0.3)
 + facet_wrap('~dataset')
 + labs(x="UMAP 1", y="UMAP 2", title="UMAP of original and simulated data"))

# In[12]:

# Overlay original input vs simulated data

# Add label for input or simulated dataset
input_data_UMAPencoded_df['dataset'] = 'original'
simulated_data_UMAPencoded_df['dataset'] = 'simulated'

# Concatenate input and simulated dataframes together
combined_data_df = pd.concat(
    [input_data_UMAPencoded_df, simulated_data_UMAPencoded_df])

# Plot
# In[12]:

for level in levels:
    all_feature_results_subset_df = all_feature_results_df.query(
        "level == @level").reset_index(drop=True)

    output_dir = pathlib.Path(f"{output_fig_dir}/{level}")
    output_dir.mkdir(exist_ok=True)

    # Figure 1 - Per plate feature differences
    per_plate_feature_gg = (
        gg.ggplot(all_feature_results_subset_df,
                  gg.aes(x="plate", y="metric_value"))
        + gg.geom_point(size=0.1, alpha=0.5)
        + gg.facet_wrap("~metric", scales="free", nrow=len(metrics))
        + gg.xlab("Plate")
        + gg.ylab("Feature Difference\nBetween Tools")
        + gg.ggtitle(f"Plate Summary\n{level}")
        + theme_summary)

    output_file = pathlib.Path(f"{output_dir}/{level}_metrics_per_plate.png")
    per_plate_feature_gg.save(output_file, dpi=dpi, height=height, width=width)

    print(per_plate_feature_gg)
    del per_plate_feature_gg

# In[13]:

for level in levels:
    all_feature_results_subset_df = all_feature_results_df.query(
        "level == @level").reset_index(drop=True)
def test_facet_wrap_label_both():
    p = g + facet_wrap('~var1+var2', labeller='label_both')

    assert p == 'facet_wrap_label_both'
# print(aci)
res
# res.to_feather("data_glm.feather")


def label_x(dates):
    res = [(datetime.datetime(2018, 1, 1)
            + datetime.timedelta(x)).strftime("%d-%m") for x in dates]
    print(res)
    return res


(ggplot(data=res, mapping=aes(x='julian', y='ACI_mean', colour='site'))
 + xlab("Day")
 + ylab("Mean daily ACI (standardized)")
 # + facet_grid("site~", scales="free")
 + facet_wrap("site", nrow=2, ncol=3)
 + geom_point()
 + geom_errorbar(aes(ymin="ACI_mean - ACI_std", ymax="ACI_mean + ACI_std"))
 + geom_smooth(method="mavg", se=False,
               method_args={"window": 4, "center": True, "min_periods": 1})
 + scale_colour_manual(values=cbbPalette, guide=False)
 + scale_x_continuous(labels=label_x)
 ).save("figs/ACI_all_testfacet3.png", height=10, width=16, dpi=150)

#################
### Denoising ###
def plot_portfolio(portfolio_df, figure_size=(12, 4),
                   line_size=1.5, date_text_size=7):
    """
    Given a daily snapshot of virtual purchases plot both overall and
    per-stock performance. Return a tuple of figures representing the
    performance as inline data.
    """
    assert portfolio_df is not None
    # print(portfolio_df)
    portfolio_df['date'] = pd.to_datetime(portfolio_df['date'])
    avg_profit_over_period = portfolio_df.filter(
        items=['stock', 'stock_profit']).groupby('stock').mean()
    avg_profit_over_period['contribution'] = [
        'positive' if profit >= 0.0 else 'negative'
        for profit in avg_profit_over_period.stock_profit
    ]
    # don't want to override actual profit with average
    avg_profit_over_period = avg_profit_over_period.drop(
        'stock_profit', axis='columns')
    portfolio_df = portfolio_df.merge(avg_profit_over_period,
                                      left_on='stock',
                                      right_index=True,
                                      how='inner')
    # print(portfolio_df)

    # 1. overall performance
    df = portfolio_df.filter(items=[
        'portfolio_cost', 'portfolio_worth', 'portfolio_profit', 'date'
    ])
    df = df.melt(id_vars=['date'], var_name='field')
    plot = (
        p9.ggplot(df, p9.aes('date', 'value', group='field', color='field'))
        + p9.labs(x='', y='$ AUD')
        + p9.geom_line(size=1.5)
        + p9.facet_wrap('~ field', nrow=3, ncol=1, scales='free_y')
        + p9.theme(axis_text_x=p9.element_text(angle=30, size=date_text_size),
                   figure_size=figure_size,
                   legend_position='none'))
    overall_figure = plot_as_inline_html_data(plot)

    df = portfolio_df.filter(
        items=['stock', 'date', 'stock_profit', 'stock_worth', 'contribution'])
    melted_df = df.melt(id_vars=['date', 'stock', 'contribution'],
                        var_name='field')
    all_dates = sorted(melted_df['date'].unique())
    df = melted_df[melted_df['date'] == all_dates[-1]]
    df = df[df['field'] == 'stock_profit']  # only latest profit is plotted
    df['contribution'] = [
        'positive' if profit >= 0.0 else 'negative' for profit in df['value']
    ]

    # 2. plot contributors ie. winners and losers
    plot = (p9.ggplot(df, p9.aes('stock', 'value', fill='stock'))
            + p9.geom_bar(stat='identity')
            + p9.labs(x='', y='$ AUD')
            + p9.facet_grid('contribution ~ field')
            + p9.theme(legend_position='none', figure_size=figure_size))
    profit_contributors = plot_as_inline_html_data(plot)

    # 3. per purchased stock performance
    plot = (
        p9.ggplot(melted_df,
                  p9.aes('date', 'value', group='stock', colour='stock'))
        + p9.xlab('')
        + p9.geom_line(size=1.0)
        + p9.facet_grid('field ~ contribution', scales="free_y")
        + p9.theme(
            axis_text_x=p9.element_text(angle=30, size=date_text_size),
            figure_size=figure_size,
            # more space between plots to avoid tick mark overlap
            panel_spacing=0.5,
            subplots_adjust={'right': 0.8}))
    stock_figure = plot_as_inline_html_data(plot)

    return overall_figure, stock_figure, profit_contributors
def test_label_both():
    p = g + facet_wrap('~ gear', labeller='label_both')

    assert p == 'label_both'
def plot_cellranger_vs_cellbender(samplename, raw_cellranger_mtx,
                                  filtered_cellranger_mtx,
                                  cellbender_unfiltered_h5, fpr,
                                  n_expected_cells,
                                  n_total_droplets_included, out_dir):
    """Compare cellranger raw vs cellranger filtered vs cellbender outputs."""
    logging.info('samplename ' + str(samplename))
    logging.info('raw_cellranger_mtx ' + str(raw_cellranger_mtx))
    logging.info('filtered_cellranger_mtx ' + str(filtered_cellranger_mtx))
    logging.info('cellbender_unfiltered_h5 ' + str(cellbender_unfiltered_h5))
    logging.info('fpr ' + str(fpr))
    logging.info('n_expected_cells ' + str(n_expected_cells))
    logging.info('n_total_droplets_included ' + str(n_total_droplets_included))
    logging.info('out_dir ' + str(out_dir))

    # Make the output directory if it does not exist.
    if out_dir == '':
        out_dir = os.getcwd()
    else:
        os.makedirs(out_dir, exist_ok=True)
    out_dir = out_dir + '/fpr_' + fpr
    os.makedirs(out_dir, exist_ok=True)
    os.makedirs(out_dir + '/' + samplename, exist_ok=True)
    logging.info(out_dir)
    # logging.info(df.head())

    # Get compression opts for pandas
    compression_opts = 'gzip'
    if LooseVersion(pd.__version__) > '1.0.0':
        compression_opts = dict(method='gzip', compresslevel=9)

    # Read cellranger raw
    adata_cellranger_raw = sc.read_10x_mtx(
        raw_cellranger_mtx, var_names='gene_symbols', make_unique=True,
        cache=False, cache_compression=compression_opts)

    # First filter out any cells that have 0 total counts
    zero_count_cells_cellranger_raw = adata_cellranger_raw.obs_names[np.where(
        adata_cellranger_raw.X.sum(axis=1) == 0)[0]]
    # sc.pp.filter_cells(adata, min_counts=1, inplace=True)
    # min_counts: minimum number of counts required for a cell to pass filtering.
    logging.info(
        "_cellranger_raw: Filtering {}/{} cells with 0 counts.".format(
            len(zero_count_cells_cellranger_raw), adata_cellranger_raw.n_obs))
    adata_cellranger_raw = adata_cellranger_raw[
        adata_cellranger_raw.obs_names.difference(
            zero_count_cells_cellranger_raw, sort=False)]

    sc.pp.calculate_qc_metrics(adata_cellranger_raw, inplace=True)
    logging.info('cellranger raw n barcodes(.obs) x cells(.var) .X.shape:')
    logging.info(adata_cellranger_raw.X.shape)
    logging.info('cellranger raw .obs:')
    logging.info(adata_cellranger_raw.obs)
    logging.info('cellranger raw .var:')
    logging.info(adata_cellranger_raw.var)

    df_total_counts = pd.DataFrame(data=adata_cellranger_raw.obs.sort_values(
        by=['total_counts'], ascending=False).total_counts)
    df_total_counts['barcode_row_number'] = df_total_counts.reset_index().index + 1
    df_total_counts['barcodes'] = df_total_counts.index
    df_total_counts_cellranger_raw = df_total_counts
    df_total_counts_cellranger_raw['dataset'] = 'Cellranger Raw'
    logging.info(df_total_counts)

    # Read cellranger filtered
    adata_cellranger_filtered = sc.read_10x_mtx(
        filtered_cellranger_mtx, var_names='gene_symbols', make_unique=True,
        cache=False, cache_compression=compression_opts)

    # First filter out any cells that have 0 total counts
    zero_count_cells_cellranger_filtered = adata_cellranger_filtered.obs_names[
        np.where(adata_cellranger_filtered.X.sum(axis=1) == 0)[0]]
    # sc.pp.filter_cells(adata, min_counts=1, inplace=True)
    # min_counts: minimum number of counts required for a cell to pass filtering.
    logging.info(
        "_cellranger_filtered: Filtering {}/{} cells with 0 counts.".format(
            len(zero_count_cells_cellranger_filtered),
            adata_cellranger_filtered.n_obs))
    adata_cellranger_filtered = adata_cellranger_filtered[
        adata_cellranger_filtered.obs_names.difference(
            zero_count_cells_cellranger_filtered, sort=False)]

    sc.pp.calculate_qc_metrics(adata_cellranger_filtered, inplace=True)
    logging.info('cellranger filtered n barcodes(.obs) x cells(.var) .X.shape:')
    logging.info(adata_cellranger_filtered.X.shape)
    logging.info('cellranger filtered .obs:')
    logging.info(adata_cellranger_filtered.obs.columns)
    logging.info(adata_cellranger_filtered.obs)
    logging.info('cellranger filtered .var:')
    logging.info(adata_cellranger_filtered.var)

    df_total_counts = pd.DataFrame(
        data=adata_cellranger_filtered.obs.sort_values(
            by=['total_counts'], ascending=False).total_counts)
    df_total_counts['barcodes'] = df_total_counts.index
    df_total_counts['barcode_row_number'] = df_total_counts.reset_index().index + 1
    df_total_counts_cellranger_filtered = df_total_counts
    df_total_counts_cellranger_filtered['dataset'] = 'Cellranger Filtered'
    logging.info(df_total_counts)

    # Read cellbender output
    adata_cellbender = anndata_from_h5(cellbender_unfiltered_h5,
                                       analyzed_barcodes_only=True)

    # First filter out any cells that have 0 total counts
    zero_count_cells_cellbender_filtered = adata_cellbender.obs_names[np.where(
        adata_cellbender.X.sum(axis=1) == 0)[0]]
    # sc.pp.filter_cells(adata, min_counts=1, inplace=True)
    logging.info(
        "_cellbender_filtered: Filtering {}/{} cells with 0 counts.".format(
            len(zero_count_cells_cellbender_filtered), adata_cellbender.n_obs))
    adata_cellbender = adata_cellbender[adata_cellbender.obs_names.difference(
        zero_count_cells_cellbender_filtered, sort=False)]

    sc.pp.calculate_qc_metrics(adata_cellbender, inplace=True)
    logging.info('cellbender n barcodes(.obs) x cells(.var) .X.shape:')
    logging.info(adata_cellbender.X.shape)
    logging.info('cellbender .obs:')
    logging.info(adata_cellbender.obs)
    logging.info('cellbender .var:')
    logging.info(adata_cellbender.var)

    df_total_counts = pd.DataFrame(data=adata_cellbender.obs.sort_values(
        by=['total_counts'], ascending=False).total_counts)
    df_total_counts['barcodes'] = df_total_counts.index
    df_total_counts['barcode_row_number'] = df_total_counts.reset_index().index + 1
    df_total_counts_cellbender = df_total_counts
    df_total_counts_cellbender['dataset'] = 'Cellbender'
    logging.info(df_total_counts)

    # df_total_counts_cellranger_filtered.rename(
    #     columns={"total_counts": "cellranger_filtered_total_counts"})
    df_cellranger_cellbender = pd.merge(
        df_total_counts_cellranger_filtered,
        df_total_counts_cellbender,
        how='outer', left_index=True, right_index=True,
        suffixes=('_cellranger', '_cellbender')).sort_values(
            by=['total_counts_cellbender'], ascending=False)
    logging.info(df_cellranger_cellbender)

    df_cellranger_cellbender[['cellranger', 'cellbender']] = np.where(
        df_cellranger_cellbender[[
            'total_counts_cellranger', 'total_counts_cellbender'
        ]].isnull(), 0, 1)
    # df_cellranger_cellbender.to_csv('df_cellranger_cellbender.csv',
    #                                 index=True, index_label='barcode')

    grouped = df_cellranger_cellbender[['cellranger', 'cellbender']].groupby(
        ["cellranger", "cellbender"]).size().reset_index(name='counts')
    logging.info(grouped.columns)
    # grouped.to_csv('cellranger_cellbender.csv', index=False)

    df_cellranger_cellbender['barcode_row_number'] = \
        df_cellranger_cellbender.reset_index().index + 1

    # Plot UMI counts in descending order
    df_merged = pd.concat([
        df_total_counts_cellranger_raw,
        df_total_counts_cellranger_filtered,
        df_total_counts_cellbender
    ])
    # df_merged.to_csv('df_merged.csv', index=True, index_label='barcode')

    df_vline = pd.DataFrame(data={
        'x': [int(n_expected_cells), int(n_total_droplets_included)],
        'color': ['expected-cells', 'total-droplets-included']
    })

    gplt = (ggplot(df_merged, aes(x='barcode_row_number', y='total_counts'))
            + geom_point()
            + geom_vline(df_vline, aes(xintercept='x', color='color'))
            + theme_bw()
            + facet_wrap('dataset')
            + labs(x='Barcodes (ordered by descending cell total counts)',
                   color='Cellbender input',
                   y='Cell total counts',
                   title='Cells filtered out by Cellranger or Cellbender')
            + scale_y_continuous(trans='log10', minor_breaks=0)
            + scale_x_continuous(trans='log10', minor_breaks=0))
    gplt.save(out_dir + '/' + samplename + '/barcode_vs_total_counts.png',
              width=12, height=5, dpi=300)

    df_cellranger_cellbender_count = grouped  # pd.read_csv('cellranger_cellbender.csv')
    df = pd.merge(df_merged,
                  df_cellranger_cellbender[['cellranger', 'cellbender']],
                  how='left', left_index=True, right_index=True)
    df = pd.merge(df, df_cellranger_cellbender_count,
                  how='left',
                  left_on=['cellranger', 'cellbender'],
                  right_on=['cellranger', 'cellbender'])
    df["counts"].fillna(df['counts'].isnull().sum(), inplace=True)
    df["counts"] = df["counts"].astype(int)
    # df.replace({"counts": {""} }, inplace=True)

    df["filtered"] = (df["cellranger"].astype(str) + '-'
                      + df["cellbender"].astype(str))
    df.replace(
        {
            "filtered": {
                "nan-nan": 'Cellranger Raw only',
                "1.0-1.0": "Cellranger Filtered + Cellbender",
                "1.0-0.0": "Cellranger Filtered only",
                "0.0-1.0": "Cellbender only",
                "0.0-0.0": "0.0-0.0"
            }
        },
        inplace=True)
    df["filtered"] = df["filtered"] + ', n=' + df["counts"].astype(str)
    df['filtered'].value_counts()

    df.replace(
        {
            "dataset": {
                "cellbender": "Cellbender output",
                "cellranger_raw": "Cellranger Raw output",
                "cellranger_filtered": "Cellranger Filtered output"
            }
        },
        inplace=True)

    gplt = (ggplot(df, aes(x='filtered', y='total_counts', color='filtered'))
            + geom_boxplot()
            + theme_bw()
            + facet_wrap('dataset')
            + theme(axis_text_x=element_blank())
            + scale_y_continuous(trans='log10', minor_breaks=0)
            + labs(color='n cells in intersection of datasets',
                   x='', y='Cell total counts',
                   title='Total cell counts compared across datasets (facets)'))
    gplt.save(out_dir + '/' + samplename + '/boxplots_cellranger_vs_cellbender.png',
              width=12, height=5, dpi=300)

    # Plot difference cellbender filtered vs cellranger filtered
    # for common cells between the 2 datasets
    df_cellranger_cellbender = df_cellranger_cellbender[
        df_cellranger_cellbender['cellranger'] == 1]
    df_cellranger_cellbender = df_cellranger_cellbender[
        df_cellranger_cellbender['cellbender'] == 1]

    # Subset the datasets to the relevant barcodes.
    adata_cellbender_common = adata_cellbender[
        df_cellranger_cellbender.index.values]
    adata_cellranger_filtered_common = adata_cellranger_filtered[
        df_cellranger_cellbender.index.values]

    # Put count matrices into 'layers' in anndata for clarity.
    adata = adata_cellbender_common
    adata.layers['counts_cellbender'] = adata_cellbender_common.X.copy()
    adata.layers['counts_raw'] = adata_cellranger_filtered_common.X.copy()

    # Get the differences in counts per cell
    X_raw_minus_cb = (adata.layers['counts_raw']
                      - adata.layers['counts_cellbender'])
    X_raw_minus_cb = abs(X_raw_minus_cb)

    # Get the top most different genes
    df_diff_genes = pd.DataFrame(data=adata.var.gene_symbols.values)
    df_diff_genes['ensembl_id'] = adata.var.index
    df_diff_genes['gene_symbol'] = adata.var.gene_symbols.values
    df_diff_genes['dif_across_cells'] = np.asarray(
        X_raw_minus_cb.sum(axis=0)).reshape(-1)
    df_diff_genes = df_diff_genes.sort_values(
        'dif_across_cells', ascending=False).head(n=100)
    # df_diff_genes.to_csv('df_diff_genes.csv', index=True)
    top_genes = df_diff_genes['ensembl_id']
    top_genes_symbols = df_diff_genes['gene_symbol']
    logging.info('top_genes:')
    logging.info(top_genes)

    logging.info(adata_cellbender_common.var.index)
    adata_cellbender_common = adata_cellbender[
        df_cellranger_cellbender.index.values, top_genes].to_df()
    adata_cellbender_common['barcode'] = adata_cellbender_common.index
    adata_cellbender_common = pd.melt(adata_cellbender_common,
                                      ignore_index=True,
                                      id_vars=['barcode'],
                                      var_name='ensembl_id',
                                      value_name='count')
    adata_cellbender_common = pd.merge(
        adata_cellbender_common,
        df_diff_genes[['ensembl_id', 'gene_symbol']],
        how='left', left_on='ensembl_id', right_on='ensembl_id')
    adata_cellbender_common = adata_cellbender_common.sort_values(
        by=['barcode', 'ensembl_id'], ascending=False)
    adata_cellbender_common['dataset'] = 'Cellbender'
    # adata_cellbender_common.to_csv('adata_cellbender_common.csv', index=True)

    logging.info(adata_cellranger_filtered.var.index)
    adata_cellranger_filtered_common = adata_cellranger_filtered[
        df_cellranger_cellbender.index.values, top_genes_symbols].to_df()
    adata_cellranger_filtered_common['barcode'] = \
        adata_cellranger_filtered_common.index
    adata_cellranger_filtered_common = pd.melt(
        adata_cellranger_filtered_common,
        ignore_index=True,
        id_vars=['barcode'],
        var_name='gene_symbol',
        value_name='count')
    adata_cellranger_filtered_common = pd.merge(
        adata_cellranger_filtered_common,
        df_diff_genes[['ensembl_id', 'gene_symbol']],
        how='left', left_on='gene_symbol', right_on='gene_symbol')
    adata_cellranger_filtered_common['dataset'] = 'Cellranger Filtered'
    adata_cellranger_filtered_common = adata_cellranger_filtered_common.sort_values(
        by=['barcode', 'ensembl_id'], ascending=False)
    adata_cellranger_filtered_common = adata_cellranger_filtered_common[
        adata_cellbender_common.columns]
    # adata_cellranger_filtered_common.to_csv(
    #     'adata_cellranger_filtered_common.csv', index=True)

    logging.info(adata_cellranger_raw.var.index)
    adata_cellranger_raw_common = adata_cellranger_raw[
        df_cellranger_cellbender.index.values, top_genes_symbols].to_df()
    adata_cellranger_raw_common['barcode'] = adata_cellranger_raw_common.index
    adata_cellranger_raw_common = pd.melt(adata_cellranger_raw_common,
                                          ignore_index=True,
                                          id_vars=['barcode'],
                                          var_name='gene_symbol',
                                          value_name='count')
    adata_cellranger_raw_common = pd.merge(
        adata_cellranger_raw_common,
        df_diff_genes[['ensembl_id', 'gene_symbol']],
        how='left', left_on='gene_symbol', right_on='gene_symbol')
    adata_cellranger_raw_common['dataset'] = 'Cellranger Raw'
    adata_cellranger_raw_common = adata_cellranger_raw_common.sort_values(
        by=['barcode', 'ensembl_id'], ascending=False)
    adata_cellranger_raw_common = adata_cellranger_raw_common[
        adata_cellbender_common.columns]
    # adata_cellranger_raw_common.to_csv('adata_cellranger_raw_common.csv', index=True)

    logging.info(adata_cellranger_raw_common['gene_symbol']
                 == adata_cellbender_common['gene_symbol'])
    logging.info(adata_cellranger_raw_common['ensembl_id']
                 == adata_cellbender_common['ensembl_id'])

    adata_filtered_cellbender_diff = adata_cellbender_common.copy()
    adata_filtered_cellbender_diff['count'] = (
        adata_cellranger_filtered_common['count']
        - adata_cellbender_common['count'])
    adata_filtered_cellbender_diff['dataset'] = 'Cellranger Filtered - Cellbender'

    adata_raw_cellbender_diff = adata_cellbender_common.copy()
    adata_raw_cellbender_diff['count'] = (
        adata_cellranger_raw_common['count']
        - adata_cellbender_common['count'])
    adata_raw_cellbender_diff['dataset'] = 'Cellranger Raw - Cellbender'

    df_merged = pd.concat([
        adata_cellbender_common, adata_cellranger_filtered_common,
        adata_cellranger_raw_common, adata_filtered_cellbender_diff,
        adata_raw_cellbender_diff
    ], ignore_index=True)

    gplt = (ggplot(df_merged, aes(x='gene_symbol', y='count'))
            + geom_boxplot()
            + theme_bw()
            + theme(axis_text_x=element_text(angle=90, hjust=1, size=6))
            + facet_wrap('dataset', scales='free', ncol=1)
            + labs(x=('Genes (top 100 genes most different between Cellranger '
                      'Filtered counts and Cellbender filtered counts)'),
                   y='Cell total counts',
                   title=('Total cell counts compared across most different '
                          'genes (x-axis) and datasets (facets)')))
    gplt.save(out_dir + '/' + samplename
              + '/boxplot_topgenes_cellranger_vs_cellbender.png',
              width=10, height=20, dpi=300)

    logging.info('script done.')
def test_facet_wrap_two_vars():
    p = g + facet_wrap('~var1+var2')
    p2 = g + facet_wrap('~class+var2')  # python keyword in formula

    assert p == 'facet_wrap_two_vars'
    assert p2 == 'facet_wrap_two_vars'
# Add label for input or simulated dataset
input_data_UMAPencoded_df['dataset'] = 'original'
simulated_data_UMAPencoded_df['dataset'] = 'simulated'

# Concatenate input and simulated dataframes together
combined_data_df = pd.concat(
    [input_data_UMAPencoded_df, simulated_data_UMAPencoded_df])

# Plot sequentially
# backgrd_data = combined_data_df[combined_data_df['experiment_id'] == 'Not selected']
# select_data = combined_data_df[combined_data_df['experiment_id'] != 'Not selected']

# Plot
(ggplot(combined_data_df, aes(x='1', y='2'))
 + geom_point(aes(color='experiment_id'), alpha=0.3)
 + facet_wrap('~dataset')
 + xlab('UMAP 1')
 + ylab('UMAP 2')
 + ggtitle('UMAP of original and simulated data (gene space)'))
#+ xlim(3,12) \
#+ ylim(-7,10) \
#+ scale_colour_manual(values=["blue", "purple", "orange", "red", "magenta", "lightgrey"]) \

# In[12]:

# Overlay original and simulated data
(ggplot(combined_data_df, aes(x='1', y='2'))
 + geom_point(aes(color='dataset'), alpha=0.3)
 + scale_colour_manual(values=["grey", "blue"])
 + xlab('UMAP 1')
 + ylab('UMAP 2')
 + ggtitle('UMAP of original and simulated data (gene space)'))

# ## Visualize simulated data (gene space) projected into PCA space
def plot_abs_dataframe(self, df: pd.DataFrame) -> p9.ggplot:
    facets = []
    n_per_facet = {}
    print(df)
    for col in df.columns:
        try:
            n_values = df[col].nunique()
            if n_values == 1 and col not in [
                "TIME_PERIOD", "value", "Measure", "OBS_COMMENT",
            ]:
                self.fixed_datapoints.add(f"{col}={df.at[0, col]}")
            elif n_values > 1 and col not in [
                "value", "TIME_PERIOD", "OBS_COMMENT",
            ]:
                facets.append(col)
                n_per_facet[col] = n_values
        except Exception:
            print(f"Ignoring unusable column: {col}")
            continue

    extra_args = {}
    need_shape = False
    if len(facets) > 2:
        # Can only use two variables as plotting facets; the third value is
        # used as a group on each plot. Any more facets is not supported at
        # this stage.
        sorted_facets = sorted(n_per_facet.keys(), key=lambda k: n_per_facet[k])
        # print(n_per_facet)
        # print(sorted_facets)
        facets = sorted_facets[-2:]
        extra_args.update({
            "group": sorted_facets[0],
            "color": facets[0],
            "shape": sorted_facets[0],
        })
        need_shape = True
        print(f"Using {facets} as facets, {extra_args} as series")
    else:
        if len(facets) > 0:
            extra_args.update({"color": facets[0]})

    # Compute figure size to give enough room for each plot
    mult = 1
    for facet in facets:
        mult *= n_per_facet[facet]
    if facets:  # guard against division by zero when nothing is facetted
        mult /= len(facets)
    nrow = int(mult + 1)

    # Facet column names must not have spaces in them as this is not
    # permitted by plotnine facet formulas
    if len(facets) > 0:
        new_facets = []
        for f in facets:
            if " " in f:
                new_name = f.replace(" ", "_")
                df = df.rename(columns={f: new_name})
                new_facets.append(new_name)
            else:
                new_facets.append(f)
        facets = new_facets
        if "color" in extra_args:
            extra_args.update({"color": facets[0]})
        print(f"Renamed facet columns due to whitespace: {facets}")

    plot = (p9.ggplot(df, p9.aes(x="TIME_PERIOD", y="value", **extra_args))
            + p9.geom_point(size=3))

    if len(facets) > 0 and len(facets) <= 2:
        facet_str = "~" + " + ".join(facets[:2])
        print(f"Using facet formula: {facet_str}")
        plot += p9.facet_wrap(facet_str, ncol=len(facets), scales="free_y")

    plot_theme = {
        "figure_size": (12, int(nrow * 1.5)),
    }
    if len(facets) == 2:
        # two columns of plots? if so, make sure space for axis labels
        plot_theme.update({"subplots_adjust": {"wspace": 0.2}})
    if need_shape:
        plot += p9.scale_shape(guide="legend")
        # colour legend is not useful since it is included in the facet title
        plot += p9.guides(colour=False)
        plot_theme.update({"legend_position": "right"})
    return user_theme(plot, **plot_theme)
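# A minimal standalone sketch of the dynamic facet-formula pattern used in
# plot_abs_dataframe() above: facet columns are chosen at runtime and joined
# into a '~a + b' formula string (mtcars stands in for the real data).
import plotnine as p9
from plotnine.data import mtcars

facets = ['gear', 'am']                    # columns picked at runtime
facet_str = '~' + ' + '.join(facets[:2])   # '~gear + am'
plot = (p9.ggplot(mtcars, p9.aes(x='wt', y='mpg'))
        + p9.geom_point(size=3)
        + p9.facet_wrap(facet_str, ncol=len(facets), scales='free_y'))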
def test_label_value():
    p = g + facet_wrap('~ gear', labeller='label_value')

    assert p == 'label_value'
def test_facet_wrap_one_var():
    p = g + facet_wrap('~var1')

    assert p == 'facet_wrap_one_var'
def test_label_context():
    p = g + facet_wrap('~ gear', labeller='label_context')

    assert p == 'label_context'
def test_labeller_cols_both_wrap():
    p = g + facet_wrap('~ gear + am', labeller=labeller_cols_both)

    assert p == 'labeller_cols_both_wrap'
def test_facet_wrap_two_vars():
    p = g + facet_wrap('~var1+var2')

    assert p == 'facet_wrap_two_vars'
def test_aslabeller_dict_0tag():
    func = as_labeller({'0': '<tag>0</tag>'})
    p = g + facet_wrap('~ gear + am', labeller=func)

    assert p == 'aslabeller_dict_0tag'
def plot_char_percent_vs_accuracy_smooth(self, expo=False, no_models=False, columns=False):
    if self.y_max is not None:
        limits = [0, float(self.y_max)]
        eprint(f'Setting limits to: {limits}')
    else:
        limits = [0, 1]
    if expo:
        if os.path.exists('data/external/all_human_gameplay.json') and not self.no_humans:
            with open('data/external/all_human_gameplay.json') as f:
                all_gameplay = json.load(f)
                frames = []
                for event, name in [('parents', 'Intermediate'),
                                    ('maryland', 'Expert'),
                                    ('live', 'National')]:
                    if self.merge_humans:
                        name = 'Human'
                    gameplay = all_gameplay[event]
                    if event != 'live':
                        control_correct_positions = gameplay['control_correct_positions']
                        control_wrong_positions = gameplay['control_wrong_positions']
                        control_positions = control_correct_positions + control_wrong_positions
                        control_positions = np.array(control_positions)
                        control_result = np.array(
                            len(control_correct_positions) * [1]
                            + len(control_wrong_positions) * [0])
                        argsort_control = np.argsort(control_positions)
                        control_x = control_positions[argsort_control]
                        control_sorted_result = control_result[argsort_control]
                        control_y = control_sorted_result.cumsum() / control_sorted_result.shape[0]
                        control_df = pd.DataFrame({
                            'correct': control_y,
                            'char_percent': control_x
                        })
                        control_df['Dataset'] = 'Regular Test'
                        control_df['Guessing_Model'] = f' {name}'
                        frames.append(control_df)

                    adv_correct_positions = gameplay['adv_correct_positions']
                    adv_wrong_positions = gameplay['adv_wrong_positions']
                    adv_positions = adv_correct_positions + adv_wrong_positions
                    adv_positions = np.array(adv_positions)
                    adv_result = np.array(
                        len(adv_correct_positions) * [1]
                        + len(adv_wrong_positions) * [0])
                    argsort_adv = np.argsort(adv_positions)
                    adv_x = adv_positions[argsort_adv]
                    adv_sorted_result = adv_result[argsort_adv]
                    adv_y = adv_sorted_result.cumsum() / adv_sorted_result.shape[0]
                    adv_df = pd.DataFrame({
                        'correct': adv_y,
                        'char_percent': adv_x
                    })
                    adv_df['Dataset'] = 'IR Adversarial'
                    adv_df['Guessing_Model'] = f' {name}'
                    frames.append(adv_df)

                    if len(gameplay['advneural_correct_positions']) > 0:
                        adv_correct_positions = gameplay['advneural_correct_positions']
                        adv_wrong_positions = gameplay['advneural_wrong_positions']
                        adv_positions = adv_correct_positions + adv_wrong_positions
                        adv_positions = np.array(adv_positions)
                        adv_result = np.array(
                            len(adv_correct_positions) * [1]
                            + len(adv_wrong_positions) * [0])
                        argsort_adv = np.argsort(adv_positions)
                        adv_x = adv_positions[argsort_adv]
                        adv_sorted_result = adv_result[argsort_adv]
                        adv_y = adv_sorted_result.cumsum() / adv_sorted_result.shape[0]
                        adv_df = pd.DataFrame({
                            'correct': adv_y,
                            'char_percent': adv_x
                        })
                        adv_df['Dataset'] = 'RNN Adversarial'
                        adv_df['Guessing_Model'] = f' {name}'
                        frames.append(adv_df)

                human_df = pd.concat(frames)
                human_vals = sort_humans(list(human_df['Guessing_Model'].unique()))
                human_dtype = CategoricalDtype(human_vals, ordered=True)
                human_df['Guessing_Model'] = human_df['Guessing_Model'].astype(human_dtype)
                dataset_dtype = CategoricalDtype(
                    ['Regular Test', 'IR Adversarial', 'RNN Adversarial'],
                    ordered=True)
                human_df['Dataset'] = human_df['Dataset'].astype(dataset_dtype)

        if no_models:
            p = ggplot(human_df) + geom_point(shape='.')
        else:
            df = self.char_plot_df
            if 1 not in self.rounds:
                df = df[df['Dataset'] != 'Round 1 - IR Adversarial']
            if 2 not in self.rounds:
                df = df[df['Dataset'] != 'Round 2 - IR Adversarial']
                df = df[df['Dataset'] != 'Round 2 - RNN Adversarial']
            p = ggplot(df)
            if self.save_df is not None:
                eprint(f'Saving df to: {self.save_df}')
                df.to_json(self.save_df)

        if os.path.exists('data/external/all_human_gameplay.json') and not self.no_humans:
            eprint('Loading human data')
            p = p + geom_line(data=human_df)

        if columns:
            facet_conf = facet_wrap('Guessing_Model', ncol=1)
        else:
            facet_conf = facet_wrap('Guessing_Model', nrow=1)

        if not no_models:
            if self.mvg_avg_char:
                chart = stat_smooth(method='mavg', se=False,
                                    method_args={'window': 400})
            else:
                chart = stat_summary_bin(fun_data=mean_no_se, bins=20,
                                         shape='.', linetype='None', size=0.5)
        else:
            chart = None

        p = p + facet_conf + aes(x='char_percent', y='correct', color='Dataset')
        if chart is not None:
            p += chart
        p = (
            p
            + scale_y_continuous(breaks=np.linspace(0, 1, 6))
            + scale_x_continuous(breaks=[0, .5, 1])
            + coord_cartesian(ylim=limits)
            + xlab('Percent of Question Revealed')
            + ylab('Accuracy')
            + theme(
                # legend_position='top',
                legend_box_margin=0,
                legend_title=element_blank(),
                strip_text_x=element_text(margin={'t': 6, 'b': 6, 'l': 1, 'r': 5}))
            + scale_color_manual(values=['#FF3333', '#66CC00', '#3333FF', '#FFFF33'],
                                 name='Questions')
        )
        if self.title != '':
            p += ggtitle(self.title)
        return p
    else:
        if self.save_df is not None:
            eprint(f'Saving df to: {self.save_df}')
            self.char_plot_df.to_json(self.save_df)
        return (
            ggplot(self.char_plot_df)
            + aes(x='char_percent', y='correct', color='Guessing_Model')
            + stat_smooth(method='mavg', se=False, method_args={'window': 500})
            + scale_y_continuous(breaks=np.linspace(0, 1, 6))
            + coord_cartesian(ylim=limits)
        )
def test_facet_wrap_not_as_table():
    p = g + facet_wrap('~var1', as_table=False)

    assert p == 'facet_wrap_not_as_table'
def plot_n_train_vs_accuracy(self):
    return (
        ggplot(self.combined_df)
        + facet_wrap('seen')
        + aes(x='n_train', fill='Outcome')
        + geom_histogram(binwidth=1)
    )
def test_facet_wrap_not_as_table_direction_v():
    p = g + facet_wrap('~var1', as_table=False, dir='v')

    assert p == 'facet_wrap_not_as_table_direction_v'
dfwords['Date'] = X_train_all.Date.values
dfwords.Speaker = [to_speaker_dict[spkr] for spkr in dfwords.Speaker]

pres_Nelson = (dfwords.query(f"Speaker in {first_presidency}")
               .groupby([dfwords['Date'].map(lambda x: x.year), 'Speaker'])
               .mean()[interesting_words]
               .reset_index())
pres_Nelson['combined'] = [str(dt) for dt in pres_Nelson.Date] + pres_Nelson.Speaker
pres_Nelson = (pres_Nelson.drop(columns=['Date', 'Speaker'])
               .set_index('combined')
               .unstack()
               .reset_index())
pres_Nelson['Date'] = [int(comb[:4]) for comb in pres_Nelson.combined]
pres_Nelson['Speaker'] = [comb[4:] for comb in pres_Nelson.combined]
pres_Nelson = pres_Nelson.drop(columns='combined')
pres_Nelson.columns = ['Word', 'Mean TF-IDF Score', 'Date', 'Speaker']

(ggplot(pres_Nelson, aes(x='Date', y='Mean TF-IDF Score', color='Word'))
 + geom_line()
 + facet_wrap('Speaker', scales='free', nrow=3)
 + labs(title='Word Usage Change over Time in First Presidency'))

int_words = (dfwords.groupby(dfwords['Date'].map(lambda x: x.year))
             .mean()[interesting_words]
             .unstack()
             .reset_index())
int_words.columns = ['Word', 'Date', 'Mean TF-IDF Score']

(ggplot(int_words, aes(x='Date', y='Mean TF-IDF Score', color='Word'))
 + geom_line()
 + labs(title='Word Usage Change over Time in First Presidency and the 12'))

missionary_temple = (dfwords.groupby(dfwords['Date'].map(lambda x: x.year))
                     .mean()[['missionary work', 'family history']]
                     .unstack()
                     .reset_index())
missionary_temple.columns = ['Word', 'Date', 'Mean TF-IDF Score']

(ggplot(missionary_temple, aes(x='Date', y='Mean TF-IDF Score',
def plot_char_percent_vs_accuracy_smooth(self, expo=False):
    if expo:
        p = (
            ggplot(self.char_plot_df)
            + facet_wrap('Guessing_Model', nrow=1)
            + aes(x='char_percent', y='correct', color='Dataset')
            + stat_smooth(method='mavg', se=False, method_args={'window': 200})
            + scale_y_continuous(breaks=np.linspace(0, 1, 11))
            + scale_x_continuous(breaks=[0, .5, 1])
            + xlab('Percent of Question Revealed')
            + ylab('Accuracy')
            + theme(legend_position='top')
        )
        if os.path.exists('data/external/human_gameplay.json'):
            with open('data/external/human_gameplay.json') as f:
                gameplay = json.load(f)
                control_correct_positions = gameplay['control_correct_positions']
                control_wrong_positions = gameplay['control_wrong_positions']
                control_positions = control_correct_positions + control_wrong_positions
                control_positions = np.array(control_positions)
                control_result = np.array(
                    len(control_correct_positions) * [1]
                    + len(control_wrong_positions) * [0])
                argsort_control = np.argsort(control_positions)
                control_x = control_positions[argsort_control]
                control_sorted_result = control_result[argsort_control]
                control_y = control_sorted_result.cumsum() / control_sorted_result.shape[0]
                control_df = pd.DataFrame({
                    'correct': control_y,
                    'char_percent': control_x
                })
                control_df['Dataset'] = 'Test Questions'
                control_df['Guessing_Model'] = ' Human'

                adv_correct_positions = gameplay['adv_correct_positions']
                adv_wrong_positions = gameplay['adv_wrong_positions']
                adv_positions = adv_correct_positions + adv_wrong_positions
                adv_positions = np.array(adv_positions)
                adv_result = np.array(
                    len(adv_correct_positions) * [1]
                    + len(adv_wrong_positions) * [0])
                argsort_adv = np.argsort(adv_positions)
                adv_x = adv_positions[argsort_adv]
                adv_sorted_result = adv_result[argsort_adv]
                adv_y = adv_sorted_result.cumsum() / adv_sorted_result.shape[0]
                adv_df = pd.DataFrame({
                    'correct': adv_y,
                    'char_percent': adv_x
                })
                adv_df['Dataset'] = 'Challenge Questions'
                adv_df['Guessing_Model'] = ' Human'

                human_df = pd.concat([control_df, adv_df])
                p = p + geom_line(data=human_df)
        return p
    else:
        return (
            ggplot(self.char_plot_df)
            + aes(x='char_percent', y='correct', color='Guessing_Model')
            + stat_smooth(method='mavg', se=False, method_args={'window': 500})
            + scale_y_continuous(breaks=np.linspace(0, 1, 21))
        )
#-----------------------------
# libraries https://pythonplot.com/#bar-count
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# pip install plotnine  (similar to ggplot2)
# https://plotnine.readthedocs.io/en/stable/index.html
import plotnine  # ggplot-style plotting
from plotnine import ggplot, geom_point, aes, stat_smooth, facet_wrap

help(plotnine.facet_wrap)

from plotnine.data import mtcars, mpg

(ggplot(mtcars, aes('wt', 'mpg', color='factor(gear)'))
 + geom_point()
 + stat_smooth(method='lm')
 + facet_wrap('~gear'))

from plotnine import *

(ggplot(mtcars, aes('factor(cyl)', fill='factor(am)'))
 + geom_bar(position='fill'))

(ggplot(mtcars, aes('factor(cyl)', fill='factor(am)'))
 + geom_bar(position='fill')
 + geom_text(aes(label='stat(count)'), stat='count', position='fill'))

(ggplot(mpg)
 + aes(x='manufacturer')
 + geom_bar(size=20)
 + coord_flip()
 + labs(y='Count', x='Manufacturer', title='Number of Cars by Make'))

# https://plotnine.readthedocs.io/en/stable/tutorials/miscellaneous-order-plot-series.html
from pydataset import data
data()
def control_list(in_file=None,
                 out_dir=None,
                 reference_gene_file=None,
                 log2=False,
                 page_width=None,
                 page_height=None,
                 user_img_file=None,
                 page_format=None,
                 pseudo_count=1,
                 set_colors=None,
                 dpi=300,
                 rug=False,
                 jitter=False,
                 skip_first=False):
    # -------------------------------------------------------------------------
    # Check in_file content
    # -------------------------------------------------------------------------
    for p, line in enumerate(in_file):
        line = chomp(line)
        line = line.split("\t")

        if len(line) > 2:
            message("Need a two columns file.", type="ERROR")
        if skip_first:
            if p == 0:
                continue
        try:
            fl = float(line[1])
        except ValueError:
            msg = "It seems that column 2 of input file"
            msg += " contains non numeric values. "
            msg += "Check that no header is present and that "
            msg += "columns are ordered properly. "
            msg += "Or use '--skip-first'. "
            message(msg, type="ERROR")

        if log2:
            fl = fl + pseudo_count
            if fl <= 0:
                message("Can not log transform negative/zero values. "
                        "Add a pseudo-count.", type="ERROR")

    # -------------------------------------------------------------------------
    # Check colors
    # -------------------------------------------------------------------------
    set_colors = set_colors.split(",")

    if len(set_colors) != 2:
        message("Need two colors. Please fix.", type="ERROR")

    mcolors_name = mcolors.cnames

    for i in set_colors:
        if i not in mcolors_name:
            if not is_hex_color(i):
                message(i + " is not a valid color. Please fix.", type="ERROR")

    # -------------------------------------------------------------------------
    # Preparing output files
    # -------------------------------------------------------------------------
    # Preparing pdf file name
    file_out_list = make_outdir_and_file(
        out_dir,
        ["control_list.txt",
         "reference_list.txt",
         "diagnostic_diagrams." + page_format],
        force=True)

    control_file, reference_file_out, img_file = file_out_list

    if user_img_file is not None:
        os.unlink(img_file.name)
        img_file = user_img_file

    if not img_file.name.endswith(page_format):
        msg = "Image format should be: {f}. Please fix.".format(f=page_format)
        message(msg, type="ERROR")

    test_path = os.path.abspath(img_file.name)
    test_path = os.path.dirname(test_path)

    if not os.path.exists(test_path):
        os.makedirs(test_path)

    # -------------------------------------------------------------------------
    # Read the reference list
    # -------------------------------------------------------------------------
    try:
        reference_genes = pd.read_csv(reference_gene_file.name,
                                      sep="\t", header=None)
    except pd.errors.EmptyDataError:
        message("No genes in --reference-gene-file.", type="ERROR")

    reference_genes.rename(
        columns={reference_genes.columns.values[0]: 'gene'},
        inplace=True)

    # -------------------------------------------------------------------------
    # Delete duplicates
    # -------------------------------------------------------------------------
    before = len(reference_genes)
    reference_genes = reference_genes.drop_duplicates(['gene'])
    after = len(reference_genes)

    msg = "%d duplicate lines have been deleted in reference file."
    message(msg % (before - after))

    # -------------------------------------------------------------------------
    # Read expression data and add the pseudo_count
    # -------------------------------------------------------------------------
    if skip_first:
        exp_data = pd.read_csv(in_file.name, sep="\t", header=None,
                               index_col=None, skiprows=[0], names=['exprs'])
    else:
        exp_data = pd.read_csv(in_file.name, sep="\t",
                               names=['exprs'], index_col=0)

    exp_data.exprs = exp_data.exprs.values + pseudo_count

    # -------------------------------------------------------------------------
    # log transformation
    # -------------------------------------------------------------------------
    ylabel = 'Expression'

    if log2:
        if len(exp_data.exprs.values[exp_data.exprs.values == 0]):
            message("Can't use log transformation on zero or negative values. "
                    "Use -p.", type="ERROR")
        else:
            exp_data.exprs = np.log2(exp_data.exprs.values)
            ylabel = 'log2(Expression)'

    # -------------------------------------------------------------------------
    # Are reference genes found in the control list?
    # -------------------------------------------------------------------------
    # Sort in increasing order
    exp_data = exp_data.sort_values('exprs')

    # Vector with positions indicating which in the
    # expression data list are found in reference_gene
    reference_genes_found = [x for x in reference_genes['gene']
                             if x in exp_data.index]

    msg = ("Found %d genes of the reference in the provided signal file"
           % len(reference_genes_found))
    message(msg)

    not_found = [x for x in reference_genes['gene'] if x not in exp_data.index]

    if len(not_found):
        if len(not_found) == len(reference_genes):
            message("Genes from reference file were not found in signal "
                    "file (n=%d)." % len(not_found), type="ERROR")
        else:
            message("List of reference genes not found: %s" % not_found)
    else:
        message("All reference genes were found.")

    # -------------------------------------------------------------------------
    # Search for genes with matched signal
    # -------------------------------------------------------------------------
    exp_data_save = exp_data.copy()

    control_list = list()

    nb_candidate_left = exp_data.shape[0] - len(reference_genes_found)

    message("Searching for genes with matched signal.")

    if nb_candidate_left < len(reference_genes_found):
        message("Not enough elements to perform selection. Exiting.",
                type="ERROR")
    for i in reference_genes_found:
        not_candidates = reference_genes_found + control_list
        not_candidates = list(set(not_candidates))

        diff = abs(exp_data.loc[i] - exp_data)
        control_list.extend(
            diff.loc[np.setdiff1d(diff.index, not_candidates)]
                .idxmin(axis=0, skipna=True).tolist())

    # -------------------------------------------------------------------------
    # Prepare a dataframe for plotting
    # -------------------------------------------------------------------------
    message("Preparing a dataframe for plotting.")

    reference = exp_data_save.loc[reference_genes_found].sort_values('exprs')
    reference = reference.assign(genesets=['Reference'] * reference.shape[0])

    control = exp_data_save.loc[control_list].sort_values('exprs')
    control = control.assign(genesets=['Control'] * control.shape[0])

    data = pd.concat([reference, control])
    data['sets'] = pd.Series(['sets' for x in data.index.tolist()],
                             index=data.index)
    data['genesets'] = Categorical(data['genesets'])

    # -------------------------------------------------------------------------
    # Diagnostic plots
    # -------------------------------------------------------------------------
    p = ggplot(data, aes(x='sets', y='exprs', fill='genesets'))
    p += scale_fill_manual(values=dict(zip(['Reference', 'Control'], set_colors)))
    p += geom_violin(color=None)
    p += xlab('Gene sets') + ylab(ylabel)
    p += facet_wrap('~genesets')

    if rug:
        p += geom_rug()
    if jitter:
        p += geom_jitter()

    p += theme_bw()
    p += theme(axis_text_x=element_blank())

    # -------------------------------------------------------------------------
    # Turn warnings off. Both pandas and plotnine use warnings for deprecated
    # functions. I need to turn them off although I'm not really satisfied
    # with this solution...
    # -------------------------------------------------------------------------
    def fxn():
        warnings.warn("deprecated", DeprecationWarning)

    # -------------------------------------------------------------------------
    # Saving
    # -------------------------------------------------------------------------
    with warnings.catch_warnings():
        warnings.simplefilter("ignore")
        fxn()

        message("Saving diagram to file : " + img_file.name)
        message("Be patient. This may be long for large datasets.")

        try:
            p.save(filename=img_file.name, width=page_width,
                   height=page_height, dpi=dpi, limitsize=False)
        except PlotnineError as err:
            message("Plotnine message: " + err.message)
            message("Plotnine encountered an error.", type="ERROR")

    # -------------------------------------------------------------------------
    # Write results
    # -------------------------------------------------------------------------
    exp_data_save.loc[reference_genes_found].sort_values('exprs').to_csv(
        reference_file_out.name, sep="\t")
    exp_data_save.loc[control_list].sort_values('exprs').to_csv(
        control_file.name, sep="\t")
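# The matched-control search in control_list() above is a nearest-value
# selection. A minimal standalone sketch of the same idea on toy data (no
# gtftk helpers; gene names and expression values are invented):
import pandas as pd

exprs = pd.Series({'g1': 0.5, 'g2': 1.0, 'g3': 1.1, 'g4': 5.0, 'g5': 5.2})
reference = ['g2', 'g4']
control = []
for gene in reference:
    excluded = set(reference) | set(control)
    candidates = exprs.drop(labels=list(excluded))
    # keep the candidate whose expression is closest to the reference gene's
    control.append((candidates - exprs[gene]).abs().idxmin())
print(control)  # ['g3', 'g5']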
    ('probability', pipeline.predict_proba(X)[:, 1])
])
# DataFrame.append was removed in pandas 2.0; pd.concat is the equivalent
predict_df = pd.concat([predict_df, df])

predict_df['probability_str'] = predict_df['probability'].apply('{:.1%}'.format)

# In[27]:

# Top predictions amongst negatives (potential hidden responders
# to a targeted cancer therapy)
(predict_df
 .sort_values('decision_function', ascending=False)
 .query("status == 0 and feature_set == 'full'")
 .head(10)
)

# In[28]:

predict_df['status_'] = predict_df['status'].map(
    lambda x: 'negative' if x == 0 else 'positive')

(gg.ggplot(predict_df, gg.aes(x='probability', fill='status_'))
 + gg.geom_density(alpha=0.6)
 + gg.facet_wrap('~feature_set', ncol=1)
 + gg.labs(x='probability', y='density')
 + gg.guides(fill=gg.guide_legend(title=""))
 + theme_cognoma())
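# A hedged, self-contained sketch of the faceted density plot above, using
# synthetic predictions instead of the notebook's pipeline output.
# theme_cognoma() is project-specific, so theme_bw() stands in for it here.

import numpy as np
import pandas as pd
import plotnine as gg

rng = np.random.default_rng(0)
toy_df = pd.DataFrame({
    'probability': np.concatenate([rng.beta(2, 5, 200), rng.beta(5, 2, 200)]),
    'status_': ['negative'] * 200 + ['positive'] * 200,
    'feature_set': ['full'] * 400,
})

p = (gg.ggplot(toy_df, gg.aes(x='probability', fill='status_'))
     + gg.geom_density(alpha=0.6)
     + gg.facet_wrap('~feature_set', ncol=1)
     + gg.theme_bw())
print(p)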
from typing import Optional

import numpy as np


def plot(df: 'DataFrame',
         group_colname: str = None,
         time_colname: str = None,
         max_num_groups: int = 1,
         split_dt: Optional[np.datetime64] = None,
         **kwargs) -> 'ggplot':
    """
    :param df: The output of `.to_dataframe()`.
    :param group_colname: The name of the group-column.
    :param time_colname: The name of the time-column.
    :param max_num_groups: Max. number of groups to plot; if the number of groups in the dataframe
      is greater than this, a random subset will be taken.
    :param split_dt: If supplied, will draw a vertical line at this date (useful for showing
      pre/post validation).
    :param kwargs: Further keyword arguments to pass to `plotnine.theme`
      (e.g. `figure_size=(x,y)`).
    :return: A plot of the predicted and actual values.
    """
    from plotnine import (
        ggplot, aes, geom_line, geom_ribbon, facet_grid, facet_wrap,
        theme_bw, theme, ylab, geom_vline
    )

    is_components = ('process' in df.columns and 'state_element' in df.columns)

    if group_colname is None:
        group_colname = 'group'
        if group_colname not in df.columns:
            raise TypeError("Please specify group_colname")
    if time_colname is None:
        time_colname = 'time'
        if 'time' not in df.columns:
            raise TypeError("Please specify time_colname")

    df = df.copy()
    if df[group_colname].nunique() > max_num_groups:
        subset_groups = df[group_colname].drop_duplicates().sample(max_num_groups).tolist()
        if len(subset_groups) < df[group_colname].nunique():
            print("Subsetting to groups: {}".format(subset_groups))
        df = df.loc[df[group_colname].isin(subset_groups), :]
    num_groups = df[group_colname].nunique()

    aes_kwargs = {'x': time_colname}
    if is_components:
        aes_kwargs['group'] = 'state_element'

    plot = (
        ggplot(df, aes(**aes_kwargs)) +
        geom_line(aes(y='mean'), color='#4C6FE7', size=1.5, alpha=.75) +
        geom_ribbon(aes(ymin='lower', ymax='upper'), color=None, alpha=.25) +
        ylab("")
    )

    if is_components:
        num_processes = df['process'].nunique()
        if num_groups > 1 and num_processes > 1:
            raise ValueError("Cannot plot components for > 1 group and > 1 processes.")
        elif num_groups == 1:
            plot = plot + facet_wrap("~ measure + process", scales='free_y', labeller='label_both')
            if 'figure_size' not in kwargs:
                from plotnine.facets.facet_wrap import n2mfrow
                nrow, _ = n2mfrow(len(df[['process', 'measure']].drop_duplicates().index))
                kwargs['figure_size'] = (12, nrow * 2.5)
        else:
            plot = plot + facet_grid(f"{group_colname} ~ measure", scales='free_y', labeller='label_both')
            if 'figure_size' not in kwargs:
                kwargs['figure_size'] = (12, num_groups * 2.5)

        if (df.groupby('measure')['process'].nunique() <= 1).all():
            plot = plot + geom_line(aes(y='mean', color='state_element'), size=1.5)
    else:
        if 'actual' in df.columns:
            plot = plot + geom_line(aes(y='actual'))
        if num_groups > 1:
            plot = plot + facet_grid(f"{group_colname} ~ measure", scales='free_y', labeller='label_both')
        else:
            plot = plot + facet_wrap("~measure", scales='free_y', labeller='label_both')
        if 'figure_size' not in kwargs:
            kwargs['figure_size'] = (12, 5)

    if split_dt:
        plot = plot + geom_vline(xintercept=np.datetime64(split_dt), linetype='dashed')

    return plot + theme_bw() + theme(**kwargs)
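# A hypothetical usage sketch for plot() above. `pred` and the exact columns of
# its .to_dataframe() output are assumptions based on the docstring, not taken
# from the source.

import numpy as np

df = pred.to_dataframe()  # assumed columns: group, time, mean, lower, upper, actual
p = plot(
    df,
    group_colname='group',
    time_colname='time',
    max_num_groups=3,                      # plot a random subset of 3 groups
    split_dt=np.datetime64('2020-01-01'),  # dashed train/validation split line
    figure_size=(10, 6),                   # forwarded to plotnine.theme
)
p.save('forecast.png')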
knnResultsSimplified = DataFrame(
    [(x['p'], x['k'], x['resubAccuracy'], x['testAccuracy'])
     for x in repeatedKnnResults],
    columns = ['p', 'k', 'resubAccuracy', 'testAccuracy']
)

ggdata = pd.concat(
    [DataFrame({'p': knnResultsSimplified.p,
                'k': knnResultsSimplified.k.apply(int),
                'type': 'resub',
                'Accuracy': knnResultsSimplified.resubAccuracy}),
     DataFrame({'p': knnResultsSimplified.p,
                'k': knnResultsSimplified.k.apply(int),
                'type': 'test',
                'Accuracy': knnResultsSimplified.testAccuracy})],
    axis = 0
)

plt.close()
ggo = gg.ggplot(ggdata, gg.aes(x='p', y='Accuracy',
                               color='type', group='type', linetype='type'))
ggo += gg.facet_wrap('~ k')
ggo += gg.scale_x_log10()
ggo += gg.geom_point(alpha=0.6)
ggo += gg.stat_smooth()
ggo += gg.theme_bw()
print(ggo)
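# For context, a sketch of the structure repeatedKnnResults is expected to have,
# inferred from the comprehension above; the numbers are invented for illustration.

repeatedKnnResults = [
    {'p': 10,  'k': 3, 'resubAccuracy': 0.95, 'testAccuracy': 0.81},
    {'p': 10,  'k': 5, 'resubAccuracy': 0.92, 'testAccuracy': 0.84},
    {'p': 100, 'k': 3, 'resubAccuracy': 0.99, 'testAccuracy': 0.78},
    {'p': 100, 'k': 5, 'resubAccuracy': 0.97, 'testAccuracy': 0.80},
]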
p9.ggplot(data=df_p4k, mapping=p9.aes(x="score")) + p9.geom_density()

# In[6]:

df_p4k_sum = df_p4k.groupby("genre").mean()
df_p4k_sum

# In[25]:

df_p4k_best = df_p4k[df_p4k['best'] == 1]
p9.ggplot(data=df_p4k_best, mapping=p9.aes(x="score")) + p9.geom_density()

# In[26]:

p9.ggplot(data=df_p4k, mapping=p9.aes(x="score")) + p9.facet_wrap("~genre") + p9.geom_density()

# Word Clouds of the reviews

# In[23]:

# .as_matrix() was removed from pandas; .to_numpy() is the modern equivalent
wc_text = " ".join(df_p4k['review'].head(10).to_numpy().astype('str'))
wc_text = " ".join(stripNonAlphaNum(wc_text))
p4k_wordcloud = WordCloud().generate(wc_text)

wordcloud = WordCloud(max_font_size=40).generate(wc_text)
plt.figure()
plt.imshow(wordcloud, interpolation="bilinear")

# Split word clouds by genre

# In[8]: