def plot():
    """Write the per-user Protobowl summary figures to ``output/protobowl/``.

    Records are loaded with ``load_protobowl``, every column is averaged per
    ``uid``, and four PDFs are saved: a scatter of ``result`` against
    ``relative_position`` and three density histograms (log record count,
    ``result`` and ``relative_position``). Progress is printed to stdout.
    """
    outdir = 'output/protobowl/'
    pathlib.Path(outdir).mkdir(parents=True, exist_ok=True)

    df, questions = load_protobowl()
    # Anything that is not literally ``True`` counts as an incorrect answer.
    df.result = df.result.apply(lambda x: x is True)
    df['log_n_records'] = df.user_n_records.apply(np.log)

    user_stat = df.groupby('uid').agg(np.mean)
    print(f'{len(user_stat)} users')
    print(f'{len(df)} records')
    print(f'{len(set(df.qid))} questions')

    # Opacity scales with the log record count, normalised by its maximum.
    max_color = user_stat.log_n_records.max()
    user_stat['alpha'] = user_stat.log_n_records / max_color

    # 2D user plot
    scatter_layer = geom_point(
        aes(x='relative_position', y='result', size='user_n_records',
            color='log_n_records', alpha='alpha'),
        show_legend={'color': False, 'alpha': False, 'size': False},
    )
    p0 = (
        ggplot(user_stat)
        + scatter_layer
        + scale_color_gradient(high='#e31a1c', low='#ffffcc')
        + labs(x='Average buzzing position', y='Accuracy')
        + theme(aspect_ratio=1)
    )
    p0.save(os.path.join(outdir, 'protobowl_users.pdf'))
    print('p0 done')

    # (tag, column, outline colour, fill colour, x label, file name)
    histograms = [
        ('p1', 'log_n_records', '#e6550d', '#fee6ce',
         'Log number of records', 'protobowl_hist.pdf'),
        ('p2', 'result', '#31a354', '#e5f5e0',
         'Accuracy', 'protobowl_acc.pdf'),
        ('p3', 'relative_position', '#3182bd', '#deebf7',
         'Average buzzing position', 'protobowl_pos.pdf'),
    ]
    for tag, column, outline, fill, x_label, filename in histograms:
        figure = (
            ggplot(user_stat, aes(x=column, y='..density..'))
            + geom_histogram(color=outline, fill=fill)
            + geom_density()
            + labs(x=x_label, y='Density')
            + theme(aspect_ratio=0.3)
        )
        figure.save(os.path.join(outdir, filename))
        print('{} done'.format(tag))
def plot():
    """Write the per-user Protobowl summary figures to ``output/protobowl/``.

    Records are loaded with ``load_protobowl``, every column is averaged per
    ``uid``, and four PDFs are saved: a scatter of ``result`` against
    ``relative_position`` and three density histograms (log record count,
    ``result`` and ``relative_position``). Progress is printed to stdout.
    """
    outdir = 'output/protobowl/'
    pathlib.Path(outdir).mkdir(parents=True, exist_ok=True)

    df = load_protobowl()
    # Anything that is not literally ``True`` counts as an incorrect answer.
    df.result = df.result.apply(lambda x: x is True)
    df['log_n_records'] = df.user_n_records.apply(np.log)

    user_stat = df.groupby('uid').agg(np.mean)
    print(f'{len(user_stat)} users')
    print(f'{len(df)} records')

    # Opacity scales with the log record count, normalised by its maximum.
    max_color = user_stat.log_n_records.max()
    user_stat['alpha'] = user_stat.log_n_records / max_color

    # 2D user plot
    scatter_layer = geom_point(
        aes(x='relative_position', y='result', size='user_n_records',
            color='log_n_records', alpha='alpha'),
        show_legend={'color': False, 'alpha': False, 'size': False},
    )
    p0 = (
        ggplot(user_stat)
        + scatter_layer
        + scale_color_gradient(high='#e31a1c', low='#ffffcc')
        + labs(x='Average buzzing position', y='Accuracy')
        + theme(aspect_ratio=1)
    )
    p0.save(os.path.join(outdir, 'protobowl_users.pdf'))
    print('p0 done')

    # (tag, column, outline colour, fill colour, x label, file name)
    histograms = [
        ('p1', 'log_n_records', '#e6550d', '#fee6ce',
         'Log number of records', 'protobowl_hist.pdf'),
        ('p2', 'result', '#31a354', '#e5f5e0',
         'Accuracy', 'protobowl_acc.pdf'),
        ('p3', 'relative_position', '#3182bd', '#deebf7',
         'Average buzzing position', 'protobowl_pos.pdf'),
    ]
    for tag, column, outline, fill, x_label, filename in histograms:
        figure = (
            ggplot(user_stat, aes(x=column, y='..density..'))
            + geom_histogram(color=outline, fill=fill)
            + geom_density()
            + labs(x=x_label, y='Density')
            + theme(aspect_ratio=0.3)
        )
        figure.save(os.path.join(outdir, filename))
        print('{} done'.format(tag))
def test_few_datapoints():
    """Check the warnings raised for sparse groups, then the fixed-bw plot."""
    df = pd.DataFrame({
        'x': [1, 2, 2, 3, 3, 3],
        'z': list('abbccc'),
    })

    # Bandwidth not set
    unset_bw_plot = (
        ggplot(df, aes('x', color='z'))
        + geom_density()
        + lims(x=(-3, 9))
    )
    with pytest.warns(PlotnineWarning) as record:
        unset_bw_plot.draw_test()

    # Materialise the messages once so they can be scanned more than once.
    messages = [str(warning.message) for warning in record]
    assert any('e.g `bw=0.1`' in text for text in messages)
    assert any('Groups with fewer than 2' in text for text in messages)

    fixed_bw_plot = (
        ggplot(df, aes('x', color='z'))
        + geom_density(bw=.1)
        + lims(x=(0, 4))
    )
    assert fixed_bw_plot == 'few_datapoints'
def plot_replicate_density(
    df,
    batch,
    plate,
    output_file_base=None,
    output_file_extensions=None,
    dpi=300,
    height=1.5,
    width=2,
):
    """Build (and optionally save) a replicate-correlation density plot.

    Parameters
    ----------
    df : DataFrame with the columns ``pairwise_correlation`` (x axis) and
        ``replicate_info`` (fill grouping).
    batch, plate : values interpolated into the title as ``"batch: plate"``.
    output_file_base : when truthy, the figure is passed to ``save_figure``
        together with the remaining arguments.
    output_file_extensions : extensions forwarded to ``save_figure``;
        defaults to ``[".png", ".pdf", ".svg"]``.
    dpi, height, width : forwarded unchanged to ``save_figure``.

    Returns
    -------
    The plotnine plot object.
    """
    # A fresh list per call: a list literal in the signature would be shared
    # between calls and could be mutated by the callee.
    if output_file_extensions is None:
        output_file_extensions = [".png", ".pdf", ".svg"]

    density_gg = (
        gg.ggplot(df, gg.aes(x="pairwise_correlation", fill="replicate_info"))
        + gg.geom_density(alpha=0.3)
        + gg.scale_fill_manual(
            name="Replicate",
            labels={"True": "True", "False": "False"},
            values=["#B99638", "#2DB898"],
        )
        + gg.xlab("Pearson Correlation")
        + gg.ylab("Density")
        + gg.ggtitle("{}: {}".format(batch, plate))
        + gg.theme_bw()
        + gg.theme(
            title=gg.element_text(size=9),
            axis_text=gg.element_text(size=5),
            axis_title=gg.element_text(size=8),
            legend_text=gg.element_text(size=6),
            legend_title=gg.element_text(size=7),
            strip_text=gg.element_text(size=4, color="black"),
            strip_background=gg.element_rect(colour="black", fill="#fdfff4"),
        )
    )

    if output_file_base:
        save_figure(
            density_gg, output_file_base, output_file_extensions, dpi, height, width
        )

    return density_gg
def show_community_prediction(
    self,
    percent_kept: float = 0.95,
    side_cut_from: str = "both",
    num_samples: int = 1000,
):
    """
    Plot samples from the community prediction on this question

    :param percent_kept: percentage of sample distribution to keep
    :param side_cut_from: which side to cut tails from,
        either 'both','lower', or 'upper'
    :param num_samples: number of samples from the community
    :return: ggplot graphics object
    """
    community_samples = pd.DataFrame(
        data={
            "samples": [self.sample_community() for _ in range(0, num_samples)]
        }  # type: ignore
    )

    (_xmin, _xmax) = self.get_central_quantiles(
        community_samples,
        percent_kept=percent_kept,
        side_cut_from=side_cut_from,
    )

    # The conditional is parenthesised so the blank-line separator is added in
    # BOTH branches. Without the parentheses "+" binds tighter than "if/else"
    # and a named question rendered as "Q: <name>Community Predictions".
    question_label = (
        f"Q: {self.name}"
        if self.name
        else "\n".join(textwrap.wrap(self.data["title"], 60))  # type: ignore
    )
    title_name = question_label + "\n\n"

    return (
        ggplot(community_samples, aes("samples"))
        + geom_density(fill="#b3cde3", alpha=0.8)
        + xlim(_xmin, _xmax)
        + self._scale_x()
        + labs(
            x="Prediction",
            y="Density",
            title=title_name + "Community Predictions",
        )
        + ergo_theme
    )
def comparison_plot(self,
                    df: pd.DataFrame,
                    xmin=None,
                    xmax=None,
                    bw="normal_reference",
                    **kwargs):
    """Overlay one density per group of ``df``.

    The first column of ``df`` selects the fill group and the second column
    supplies the values on the x axis. ``bw`` is forwarded to
    ``geom_density``; ``xmin``/``xmax`` go to ``self._scale_x``. Extra keyword
    arguments are accepted but not used here.
    """
    group_column, value_column = df.columns[0], df.columns[1]
    canvas = ggplot(df, aes(value_column, fill=group_column))
    palette = scale_fill_brewer(type="qual", palette="Pastel1")
    densities = geom_density(bw=bw, alpha=0.8)
    return (canvas
            + palette
            + densities
            + ggtitle(self.plot_title)
            + self._scale_x(xmin, xmax)
            + ergo_theme)
def density_plot1(num_matches_per_round: int,
                  match_lengths_from_one_round: list):
    """
    Density plot for match lengths, new rules, no blowouts, 85 matches/round.

    The figure is saved to ``figures/match_length_density_plot.png``;
    ``num_matches_per_round`` is accepted but not used by this function.
    """
    match_lengths = pd.DataFrame(
        {'Match length': match_lengths_from_one_round})

    figure = plt.ggplot(match_lengths, plt.aes(x='Match length'))
    figure = figure + plt.geom_density()
    figure = figure + plt.geom_vline(xintercept=50, color='black', size=2)
    figure = figure + plt.theme_classic()
    figure = figure + plt.xlim([0, 55])

    figure.save(filename='figures/match_length_density_plot.png')
def density_plot(
    self,
    df: pd.DataFrame,
    xmin=None,
    xmax=None,
    fill: str = "#fbb4ae",
    bw="normal_reference",
    **kwargs,
):
    """Plot the density of the first column of ``df``.

    :param df: data frame whose first column holds the values to plot
    :param xmin: lower x bound, forwarded to ``self._scale_x``
    :param xmax: upper x bound, forwarded to ``self._scale_x``
    :param fill: fill colour of the density area
    :param bw: bandwidth (or bandwidth rule) forwarded to ``geom_density``
    :param kwargs: accepted for signature compatibility; not used here
    :return: ggplot graphics object
    """
    return (
        ggplot(df, aes(df.columns[0]))
        # ``bw`` used to be accepted and silently dropped; forward it so the
        # parameter takes effect, as it does in ``comparison_plot``.
        + geom_density(bw=bw, fill=fill, alpha=0.8)
        + ggtitle(self.plot_title)
        + self._scale_x(xmin, xmax)
        + ergo_theme
    )
def plot_continuous_distribution(data_table,
                                 continuous_metric_name,
                                 segment_name,
                                 title,
                                 xlim=None):
    """Density plot of a continuous metric, one coloured curve per segment.

    Parameters
    ----------
    data_table : DataFrame holding the metric and segment columns.
    continuous_metric_name : column plotted on the x axis; rows where it is
        null are dropped before plotting.
    segment_name : column mapped to the line colour.
    title : plot title.
    xlim : optional x-axis limits passed to ``plot.xlim``. ``None`` (or a
        scalar null such as NaN) leaves the axis unrestricted.

    Returns
    -------
    The plotnine plot object.
    """
    # NOTE(review): the original repeated this same null test twice; the
    # second one may have been meant for ``segment_name`` - confirm.
    filtered_data = data_table[pd.notnull(data_table[continuous_metric_name])]

    result = (
        plot.ggplot(data=filtered_data)
        + plot.aes(x=continuous_metric_name, color=segment_name)
        + plot.geom_density()
        + plot.labs(x=continuous_metric_name, title=title, fill=segment_name)
    )

    # ``if pd.notnull(xlim)`` raised "truth value of an array is ambiguous"
    # for list limits. Test for absence explicitly and keep skipping scalar
    # nulls, which the old check also ignored.
    xlim_missing = xlim is None or (
        pd.api.types.is_scalar(xlim) and pd.isnull(xlim)
    )
    if not xlim_missing:
        result = result + plot.xlim(xlim)

    return result
def create_confidence_plot(conf_df):
    """Return per-task confidence densities, coloured and filled by method.

    ``conf_df`` supplies the columns ``x``, ``Method`` and ``Task``. The plot
    is faceted by ``Task`` in four rows, hides all y-axis decoration and puts
    an untitled horizontal legend on top.
    """
    densities = (
        ggplot(conf_df)
        + aes(x='x', color='Method', fill='Method')
        + geom_density(alpha=.45)
        + facet_wrap('Task', nrow=4)
        + xlab('Confidence')
    )
    coloured = (
        densities
        + scale_color_manual(values=COLORS)
        + scale_fill_manual(values=COLORS)
    )
    return coloured + theme_fs() + theme(
        axis_text_y=element_blank(),
        axis_ticks_major_y=element_blank(),
        axis_title_y=element_blank(),
        legend_title=element_blank(),
        legend_position='top',
        legend_box='horizontal',
    )
def plot_replicate_density(
    df,
    batch,
    plate,
    cutoff,
    percent_strong,
    output_file_base=None,
    output_file_extensions=None,
    dpi=300,
    height=1.5,
    width=2,
    return_plot=False,
):
    """Build a replicate-similarity density plot with a cutoff marker.

    Parameters
    ----------
    df : DataFrame with the columns ``similarity_metric`` (x axis) and
        ``group_replicate`` (fill grouping).
    batch, plate : values shown in the title.
    cutoff : x position of the dashed red vertical line.
    percent_strong : fraction shown in the title as a percentage rounded to
        two decimals.
    output_file_base : when truthy, the figure is passed to ``save_figure``
        together with the remaining arguments.
    output_file_extensions : extensions forwarded to ``save_figure``;
        defaults to ``[".png", ".pdf", ".svg"]``.
    dpi, height, width : forwarded unchanged to ``save_figure``.
    return_plot : when truthy the plot object is returned.

    Returns
    -------
    The plotnine plot object if ``return_plot`` is truthy, otherwise ``None``.
    """
    # A fresh list per call: a list literal in the signature would be shared
    # between calls and could be mutated by the callee.
    if output_file_extensions is None:
        output_file_extensions = [".png", ".pdf", ".svg"]

    density_gg = (
        gg.ggplot(df, gg.aes(x="similarity_metric", fill="group_replicate"))
        + gg.geom_density(alpha=0.3)
        + gg.scale_fill_manual(
            name="Replicate",
            labels={"True": "True", "False": "False"},
            values=["#B99638", "#2DB898"],
        )
        + gg.xlab("Pearson Correlation")
        + gg.ylab("Density")
        + gg.geom_vline(xintercept=cutoff, color="red", linetype="dashed")
        + gg.ggtitle(
            f"{batch}; Plate: {plate}\n\nPercent Replicating: {np.round(percent_strong * 100, 2)}%"
        )
        + gg.theme_bw()
        + gg.theme(
            title=gg.element_text(size=3.5),
            axis_text=gg.element_text(size=4),
            axis_title=gg.element_text(size=4),
            legend_text=gg.element_text(size=4),
            legend_title=gg.element_text(size=4),
            strip_text=gg.element_text(size=4, color="black"),
            strip_background=gg.element_rect(colour="black", fill="#fdfff4"),
        )
    )

    if output_file_base:
        save_figure(
            density_gg, output_file_base, output_file_extensions, dpi, height, width
        )

    if return_plot:
        return density_gg
def plot_trace(data_in, figure_size=(15, 5)):
    """
    Print the trace plot and the density plot of MCMC samples in data_in.

    Parameters
    ----------
    data_in : pd.DataFrame
        Samples from the sampler. Its columns must be exactly
        'chain', 'sample_i', 'parameter' and 'value'.
    figure_size : tuple, default = (15,5)
        Figure size, written to the global ``pn.options.figure_size``.

    Returns
    -------
    None: both plots are printed, nothing is returned.

    Raises
    ------
    MyValidationError
        If the columns of data_in differ from the four required names.
    """
    # Column validation
    required_columns = {'chain', 'sample_i', 'parameter', 'value'}
    if set(data_in.columns) != required_columns:
        raise MyValidationError(
            "Incorrect column names in data_in please check")

    # Set figure size
    pn.options.figure_size = figure_size

    # Trace plot: value against sample index, one line per chain
    trace_aes = pn.aes(x='sample_i', y='value', color='chain')
    plot_out_trace = (
        pn.ggplot(trace_aes, data=data_in)
        + pn.geom_line()
        + pn.facet_grid('parameter ~ .')
        + pn.labs(x='Sample', y='Parameter Value')
    )

    # Distribution plot: density of the values, one curve per chain
    density_aes = pn.aes(x='value', color='chain')
    plot_out_distribution = (
        pn.ggplot(density_aes, data=data_in)
        + pn.geom_density()
        + pn.facet_grid('parameter ~ .')
        + pn.labs(x='Parameter Value', y='Density')
    )

    for rendered in (plot_out_trace, plot_out_distribution):
        print(rendered)

    return None
def create_confidence_plot(conf_df):
    """Return per-task confidence densities, coloured and filled by method.

    ``conf_df`` supplies the columns ``x``, ``Method`` and ``Task``. The plot
    is faceted by ``Task`` in four rows, hides all y-axis decoration and puts
    an untitled horizontal legend on top.
    """
    densities = (
        ggplot(conf_df)
        + aes(x='x', color='Method', fill='Method')
        + geom_density(alpha=.45)
        + facet_wrap('Task', nrow=4)
        + xlab('Confidence')
    )
    coloured = (
        densities
        + scale_color_manual(values=COLORS)
        + scale_fill_manual(values=COLORS)
    )
    return coloured + theme_fs() + theme(
        axis_text_y=element_blank(),
        axis_ticks_major_y=element_blank(),
        axis_title_y=element_blank(),
        legend_title=element_blank(),
        legend_position='top',
        legend_box='horizontal',
    )
def density_plot2(num_matches_per_round: int,
                  match_lengths_from_one_round: list,
                  match_lengths_from_one_round_with_blowouts: list):
    """
    Density plot for match lengths, new rules, blowouts vs. no blowouts,
    85 matches/round.

    The first ``num_matches_per_round`` rows are labelled 'No' and the next
    ``num_matches_per_round`` rows 'Yes'. The figure is saved to
    ``figures/match_length_with_blowout_density_plot.png``.
    """
    lengths = np.concatenate([
        match_lengths_from_one_round,
        match_lengths_from_one_round_with_blowouts,
    ])
    blowout_flags = np.repeat(['No', 'Yes'], num_matches_per_round)
    match_lengths_blowout = pd.DataFrame({
        'Match length': lengths,
        'Blowouts': blowout_flags,
    })

    figure = (
        plt.ggplot(match_lengths_blowout,
                   plt.aes(x='Match length', color='Blowouts'))
        + plt.geom_density()
        + plt.geom_vline(xintercept=50, color='black', size=2)
        + plt.xlim([0, 55])
        + plt.theme_classic()
    )
    figure.save(
        filename='figures/match_length_with_blowout_density_plot.png')
def show_prediction(
    self,
    samples,
    percent_kept: float = 0.95,
    side_cut_from: str = "both",
    show_community: bool = False,
    num_samples: int = 1000,
):
    """Plot prediction on the true question scale from samples or a submission object.
    Optionally compare prediction against a sample from the distribution of community predictions

    :param samples: samples from a distribution answering the prediction question
        (true scale) or a prediction object
    :param percent_kept: percentage of sample distribution to keep
    :param side_cut_from: which side to cut tails from,
        either 'both','lower', or 'upper'
    :param show_community: boolean indicating whether comparison
        to community predictions should be made
    :param num_samples: number of samples from the community; when ``samples``
        is not a submission object it is overwritten by the sample count
    :return: ggplot graphics object
    :raises ValueError: if ``samples`` is neither a submission object,
        a list, a numpy array nor a pandas series
    """
    if isinstance(samples, SubmissionMixtureParams):
        prediction = samples
        prediction_normed_samples = pd.Series(
            [logistic.sample_mixture(prediction) for _ in range(0, num_samples)]
        )
        prediction_true_scale_samples = self.denormalize_samples(
            prediction_normed_samples
        )
    else:
        if isinstance(samples, list):
            samples = pd.Series(samples)
        # isinstance (rather than an exact type() match) also accepts
        # subclasses of Series / ndarray.
        if not isinstance(samples, (pd.Series, np.ndarray)):
            raise ValueError(
                "Samples should be a list, numpy arrray or pandas series")
        num_samples = samples.shape[0]
        prediction_true_scale_samples = samples

    title_name = (
        f"Q: {self.name}"
        if self.name
        else "\n".join(textwrap.wrap(self.data["title"], 60))  # type: ignore
    )

    if show_community:
        df = pd.DataFrame(
            data={
                "community": [  # type: ignore
                    self.sample_community() for _ in range(0, num_samples)
                ],
                "prediction": prediction_true_scale_samples,
            }
        )
        # get domain for graph given the percentage of distribution kept
        (_xmin, _xmax) = self.get_central_quantiles(
            df, percent_kept=percent_kept, side_cut_from=side_cut_from
        )
        # long format: one row per sample, tagged with its source
        df = pd.melt(df, var_name="sources", value_name="samples")  # type: ignore
        return (
            ggplot(df, aes("samples", fill="sources"))
            + scale_fill_brewer(type="qual", palette="Pastel1")
            + geom_density(alpha=0.8)
            + xlim(_xmin, _xmax)
            + self._scale_x()
            + labs(x="Prediction", y="Density", title=title_name)
            + ergo_theme
            + theme(axis_text_x=element_text(rotation=45, hjust=1))
        )

    df = pd.DataFrame(data={"prediction": prediction_true_scale_samples})
    # get domain for graph given the percentage of distribution kept
    (_xmin, _xmax) = self.get_central_quantiles(
        df, percent_kept=percent_kept, side_cut_from=side_cut_from
    )
    # One filled density layer is enough here. The previous code also added a
    # brewer fill scale (no fill aesthetic is mapped in this branch) and a
    # second geom_density layer over the same data.
    return (
        ggplot(df, aes("prediction"))
        + geom_density(fill="#b3cde3", alpha=0.8)
        + xlim(_xmin, _xmax)
        + self._scale_x()
        + labs(x="Prediction", y="Density", title=title_name)
        + ergo_theme
        + theme(axis_text_x=element_text(rotation=45, hjust=1))
    )
def test_triangular():
    """Compare ``p`` plus a triangular-kernel density with 'triangular'."""
    density_layer = geom_density(kernel='triangular', alpha=.3)  # other
    themed_plot = p + density_layer + _theme
    assert themed_plot == 'triangular'
) # In[18]: gg.options.figure_size = (6.4, 4.8) # Make sure to drop duplicates of redundant gene, perturbation, and cell line columns # Not removing replicates will put more weight on genes with more measurements cor_density_gg = ( gg.ggplot( summary_corr_df.drop_duplicates( ["Metadata_cell_line", "Metadata_gene_name", "replicate_type"] ), gg.aes(x="correlation_guide")) + \ gg.geom_density(gg.aes(fill="Metadata_cell_line"), alpha=0.4) + \ gg.geom_rug(gg.aes(color="Metadata_cell_line"), show_legend={'color': False}) + \ gg.theme_bw() + \ gg.theme( subplots_adjust={"wspace": 0.2}, axis_text=gg.element_text(size=7), axis_title=gg.element_text(size=9), strip_text=gg.element_text(size=6, color="black"), strip_background=gg.element_rect(colour="black", fill="#fdfff4"), ) + \ gg.xlim([-0.5, 1]) + \ gg.xlab("Median Correlation of All Guides Across Genes") + \ gg.ylab("Density") + \ gg.facet_wrap("~replicate_type", nrow=2, scales="free") + \ gg.scale_fill_manual(name="Cell Line",
def density_plot(df,
                 x,
                 group=None,
                 facet_x=None,
                 facet_y=None,
                 position='overlay',
                 sort_groups=True,
                 base_size=10,
                 figure_size=(6, 3),
                 **stat_kwargs):
    '''
    Plot a 1-d density plot

    Parameters
    ----------
    df : pd.DataFrame
      input dataframe (copied before use)
    x : str
      quoted expression to be plotted on the x axis; the special value
      `.index` selects the dataframe index
    group : str
      quoted expression to be used as group (ie color)
    facet_x : str
      quoted expression to be used as facet
    facet_y : str
      quoted expression to be used as facet
    position : str
      if groups are present, choose between `stack` or `overlay`
    sort_groups : bool
      if True, the fill legend is drawn in reversed order
    base_size : int
      base size for theme_ez
    figure_size :tuple of int
      figure size
    stat_kwargs : kwargs
      kwargs for the density stat

    Returns
    -------
    g : EZPlot
      EZplot object

    Raises
    ------
    NotImplementedError
      if `position` is neither `overlay` nor `stack`
    '''

    if position not in ['overlay', 'stack']:
        log.error("position not recognized")
        raise NotImplementedError("position not recognized")

    # create a copy of the data
    dataframe = df.copy()

    # define groups and variables; remove and store (eventual) names
    names = {}
    groups = {}
    # stays empty: a density plot has no aggregated y variables to declare
    variables = {}

    for label, var in zip(['x', 'group', 'facet_x', 'facet_y'],
                          [x, group, facet_x, facet_y]):
        names[label], groups[label] = unname(var)

    # fix special cases
    if x == '.index':
        groups['x'] = '.index'
        names[
            'x'] = dataframe.index.name if dataframe.index.name is not None else ''

    # aggregate data and reorder columns
    gdata = agg_data(dataframe, variables, groups, None, fill_groups=False)
    # keep only the standardized columns that agg_data actually produced
    gdata = gdata[[
        c for c in ['x', 'group', 'facet_x', 'facet_y'] if c in gdata.columns
    ]]

    # start plotting
    g = EZPlot(gdata)

    # determine order and create a categorical type
    colors = ez_colors(g.n_groups('group'))

    # set groups
    if group is None:
        # single density drawn in the first palette colour
        g += p9.geom_density(p9.aes(x="x"),
                             stat=p9.stats.stat_density(**stat_kwargs),
                             colour=ez_colors(1)[0],
                             fill=ez_colors(1)[0],
                             **POSITION_KWARGS[position])
    else:
        # one density per group, outline and fill mapped to the group
        g += p9.geom_density(p9.aes(x="x",
                                    group="factor(group)",
                                    colour="factor(group)",
                                    fill="factor(group)"),
                             stat=p9.stats.stat_density(**stat_kwargs),
                             **POSITION_KWARGS[position])
        g += p9.scale_fill_manual(values=colors, reverse=False)
        g += p9.scale_color_manual(values=colors, reverse=False)

    # set facets
    if facet_x is not None and facet_y is None:
        g += p9.facet_wrap('~facet_x')
    if facet_x is not None and facet_y is not None:
        g += p9.facet_grid('facet_y~facet_x')

    # set x scale
    if g.column_is_categorical('x'):
        g += p9.scale_x_discrete()
    else:
        g += p9.scale_x_continuous(labels=ez_labels)

    # set y scale
    g += p9.scale_y_continuous(labels=ez_labels)

    # set axis labels
    g += \
        p9.xlab(names['x']) + \
        p9.ylab('Density')

    # set theme
    g += theme_ez(figure_size=figure_size,
                  base_size=base_size,
                  legend_title=p9.element_text(text=names['group'],
                                               size=base_size))

    if sort_groups:
        g += p9.guides(fill=p9.guide_legend(reverse=True))

    return g
# Scatter of accuracy against ratio; size, colour and opacity follow the
# per-user record counts, with every legend hidden.
user_points = geom_point(
    aes(x='ratio', y='accuracy', size='n_records',
        color='log_n_records', alpha='alpha'),
    show_legend={'color': False, 'alpha': False, 'size': False},
)
p0 = (
    ggplot(user_stat)
    + user_points
    + scale_color_gradient(high='#e31a1c', low='#ffffcc')
    + theme(aspect_ratio=1)
)
p0.save('protobowl_users.pdf')
print('p0 done')

# Density histogram of the log record count.
p1 = (
    ggplot(user_stat, aes(x='log_n_records', y='..density..'))
    + geom_histogram(color='#e6550d', fill='#fee6ce')
    + geom_density()
    + theme(aspect_ratio=0.3)
)
p1.save('protobowl_hist.pdf')
print('p1 done')

# Density histogram of accuracy; the density layer restates its x mapping.
p2 = (
    ggplot(user_stat, aes(x='accuracy', y='..density..'))
    + geom_histogram(color='#31a354', fill='#e5f5e0')
    + geom_density(aes(x='accuracy'))
    + theme(aspect_ratio=0.3)
)
p2.save('protobowl_acc.pdf')
print('p2 done')
# Collect one prediction table per fitted pipeline and stack them.
# NOTE(review): `DataFrame.from_items` and `DataFrame.append` are legacy
# pandas APIs (removed in pandas 1.0 and 2.0 respectively) - confirm the
# pinned pandas version before running this cell.
predict_df = pd.DataFrame()
for model, pipeline in final_pipelines.items():
    df = pd.DataFrame.from_items([
        ('feature_set', model),
        ('sample_id', X.index),
        # 1 when the sample's index label appears in X_test, else 0
        ('test_set', X.index.isin(X_test.index).astype(int)),
        ('status', y),
        ('decision_function', pipeline.decision_function(X)),
        # second column of predict_proba
        ('probability', pipeline.predict_proba(X)[:, 1])
    ])
    predict_df = predict_df.append(df)

# Probability formatted as a percentage with one decimal, for display
predict_df['probability_str'] = predict_df['probability'].apply(
    '{:.1%}'.format)

# In[27]:

# Top predictions amongst negatives (potential hidden responders to a targeted cancer therapy)
(predict_df.sort_values(
    'decision_function',
    ascending=False).query("status == 0 and feature_set == 'full'").head(10))

# In[28]:

# Readable label for the fill legend: 0 -> 'negative', anything else -> 'positive'
predict_df['status_'] = predict_df['status'].map(lambda x: 'negative'
                                                 if x == 0 else 'positive')

# Density of predicted probabilities by status, one facet row per feature set
(gg.ggplot(predict_df, gg.aes(x='probability', fill='status_')) +
 gg.geom_density(alpha=0.6) + gg.facet_wrap('~feature_set', ncol=1) +
 gg.labs(x='probability', y='density') +
 gg.guides(fill=gg.guide_legend(title="")) + theme_cognoma())
def test_gaussian_trimmed():
    """Compare ``p`` plus a trimmed gaussian density with 'gaussian-trimmed'."""
    density_layer = geom_density(kernel='gaussian', alpha=.3, trim=True)
    themed_plot = p + density_layer + _theme
    assert themed_plot == 'gaussian-trimmed'
def test_gaussian_weighted():
    """Compare ``p`` plus an x-weighted gaussian density with 'gaussian_weighted'."""
    weighted_layer = geom_density(
        aes(weight='x'),
        kernel='gaussian',
        alpha=.3,
    )
    themed_plot = p + weighted_layer + _theme
    assert themed_plot == 'gaussian_weighted'
def plot():
    """Write the per-user Protobowl summary figures to ``output/protobowl/``.

    Records are loaded with ``load_protobowl``, every column is averaged per
    ``uid``, and four PDFs are saved: a scatter of ``result`` against
    ``relative_position`` and three density histograms (log record count,
    ``result`` and ``relative_position``). Progress is printed to stdout.
    """
    outdir = "output/protobowl/"
    pathlib.Path(outdir).mkdir(parents=True, exist_ok=True)

    df = load_protobowl()
    # Anything that is not literally ``True`` counts as an incorrect answer.
    df.result = df.result.apply(lambda x: x is True)
    df["log_n_records"] = df.user_n_records.apply(np.log)

    user_stat = df.groupby("uid").agg(np.mean)
    print(f"{len(user_stat)} users")
    print(f"{len(df)} records")

    # Opacity scales with the log record count, normalised by its maximum.
    max_color = user_stat.log_n_records.max()
    user_stat["alpha"] = user_stat.log_n_records / max_color

    # 2D user plot
    scatter_layer = geom_point(
        aes(
            x="relative_position",
            y="result",
            size="user_n_records",
            color="log_n_records",
            alpha="alpha",
        ),
        show_legend={"color": False, "alpha": False, "size": False},
    )
    p0 = (
        ggplot(user_stat)
        + scatter_layer
        + scale_color_gradient(high="#e31a1c", low="#ffffcc")
        + labs(x="Average buzzing position", y="Accuracy")
        + theme(aspect_ratio=1)
    )
    p0.save(os.path.join(outdir, "protobowl_users.pdf"))
    print("p0 done")

    # (tag, column, outline colour, fill colour, x label, file name)
    histograms = [
        ("p1", "log_n_records", "#e6550d", "#fee6ce",
         "Log number of records", "protobowl_hist.pdf"),
        ("p2", "result", "#31a354", "#e5f5e0",
         "Accuracy", "protobowl_acc.pdf"),
        ("p3", "relative_position", "#3182bd", "#deebf7",
         "Average buzzing position", "protobowl_pos.pdf"),
    ]
    for tag, column, outline, fill, x_label, filename in histograms:
        figure = (
            ggplot(user_stat, aes(x=column, y="..density.."))
            + geom_histogram(color=outline, fill=fill)
            + geom_density()
            + labs(x=x_label, y="Density")
            + theme(aspect_ratio=0.3)
        )
        figure.save(os.path.join(outdir, filename))
        print("{} done".format(tag))
def density(df, key, figsize=(8, 6), vertical=False):
    """Return a count-scaled density plot of column ``key`` of ``df``.

    ``figsize`` is written to the global ``p9.options.figure_size``.
    ``vertical`` is accepted but not used by this function.
    """
    p9.options.figure_size = figsize
    mapping = p9.aes(x=key, y='..count..', label='..count..')
    return (
        p9.ggplot(mapping, data=df)
        + p9.geom_density(alpha=0.5)
        + p9.theme_classic()
    )
def audit_site(df, audit_cols, batch, plate, resolution="full"):
    """Plot pairwise-correlation densities split by replicate status.

    ``audit`` is run on ``df`` with the given columns and resolution. Each
    pair is flagged for same well / site / treatment / clone, and the
    densities are filled by replicate status and faceted by the well flag
    (rows) and the site flag (columns).

    Treatment and clone come from ``Metadata_clone_number`` (plus
    ``Metadata_treatment`` when audited) if the clone column is in
    ``audit_cols``; otherwise from ``Metadata_Dosage`` and
    ``Metadata_CellLine``.

    Returns the plotnine plot titled ``"<batch>: <plate>"``.
    """
    site_df = audit(df, audit_groups=audit_cols, audit_resolution=resolution)

    same_well = site_df.Metadata_Well_pair_a == site_df.Metadata_Well_pair_b
    same_site = site_df.Metadata_Site_pair_a == site_df.Metadata_Site_pair_b
    # Result is unused, but the comparison is kept so that missing plate
    # columns still fail at this point.
    same_plate = site_df.Metadata_Plate_pair_a == site_df.Metadata_Plate_pair_b

    if "Metadata_clone_number" not in audit_cols:
        same_treatment = (
            site_df.Metadata_Dosage_pair_a == site_df.Metadata_Dosage_pair_b
        )
        same_clone = (
            site_df.Metadata_CellLine_pair_a == site_df.Metadata_CellLine_pair_b
        )
    else:
        same_clone = (
            site_df.Metadata_clone_number_pair_a
            == site_df.Metadata_clone_number_pair_b
        )
        same_treatment = (
            site_df.Metadata_treatment_pair_a == site_df.Metadata_treatment_pair_b
            if "Metadata_treatment" in audit_cols
            else same_clone
        )

    replicate = same_treatment & same_clone
    not_replicate = ~replicate

    plot_ready_df = site_df.assign(
        replicate=replicate,
        same_site=same_site,
        same_well_diff_site=same_well & ~same_site,
        same_treatment_diff_well=replicate & ~same_well,
        diff_treatment_diff_well=not_replicate & ~same_well,
        diff_treatment_diff_site=not_replicate & ~same_site,
    )
    plot_ready_df["pairwise_correlation"] = (
        plot_ready_df["pairwise_correlation"].astype(float)
    )

    # Turn the facet flags into readable strip labels.
    well_labels = {True: "Same Well", False: "Different Well"}
    site_labels = {True: "Same Site", False: "Different Site"}
    plot_ready_df["same_well_diff_site"] = (
        plot_ready_df["same_well_diff_site"].replace(well_labels)
    )
    plot_ready_df["same_site"] = plot_ready_df["same_site"].replace(site_labels)

    site_audit_gg = (
        gg.ggplot(plot_ready_df, gg.aes(x="pairwise_correlation"))
        + gg.geom_density(gg.aes(fill="replicate"), alpha=0.5)
        + gg.theme_bw()
        + gg.facet_grid("same_well_diff_site~same_site")
        + gg.ggtitle(f"{batch}: {plate}")
        + gg.xlab("Pairwise Pearson Correlation")
        + gg.ylab("Density")
        + gg.theme(
            strip_background=gg.element_rect(colour="black", fill="#fdfff4")
        )
    )
    return site_audit_gg
print(ess_bps_per_sec)  # 0.004061
print(ess_hmc_per_sec)  # 0.005769

# Stack the post-burn-in samples of both samplers.
frames = [
    bpsSamples[burninBPS:bpsSamples.shape[0]],
    hmcSamples[burninHMC:hmcSamples.shape[0]],
]
allDF = pd.concat(frames)

## add a new column to DF
# One label per retained row: first the "lbps" rows, then the "hmc" rows.
list1 = (["lbps"] * (bpsSamples.shape[0] - burninBPS)
         + ["hmc"] * (hmcSamples.shape[0] - burninHMC))
allDF['method'] = list1
allDF.head(3)

(
    ggplot(allDF, aes('exchangeCoef1', fill='method'))
    + geom_density(alpha=0.3, position='identity')
    + scale_x_continuous(breaks=np.arange(3.5, 5.5, 0.3))
)

#import matplotlib.pyplot as plt
#kdeplot(bpsSamples['exchangeCoef1'][1000:10000], shade=True)
#kdeplot(hmcSamples['exchangeCoef1'][1000:10000], shade=True,color='r')
#ggplot(df, aes( x=values, fill=method)) +
#    geom_density(alpha=.3, position="identity")+facet_wrap(~variable, ncol=3, scales="free")+
#    geom_vline(aes(xintercept=vl), data=vline.dat, color="red", linetype="dashed")
#sc
def test_triangular():
    """Compare ``p`` plus a triangular-kernel density with 'triangular'."""
    density_layer = geom_density(
        kernel='triangular',
        bw='normal_reference',
        alpha=.3,
    )  # other
    themed_plot = p + density_layer + _theme
    assert themed_plot == 'triangular'
def test_gaussian():
    """Compare ``p`` plus a gaussian-kernel density with 'gaussian'."""
    density_layer = geom_density(kernel='gaussian', alpha=.3)
    themed_plot = p + density_layer + _theme
    assert themed_plot == 'gaussian'
# For each pair of consecutive commits, record how many lines git reports
# as inserted and deleted.
sizes = []
for sha1, sha2 in zip(commits, commits[1:]):
    res = subprocess.run(
        ['git', 'diff', '--shortstat', sha1, sha2],
        stdout=subprocess.PIPE,
    )
    words = res.stdout.decode().split()

    # The count is the word directly before "insertion(s)" / "deletion(s)";
    # a kind that is not reported stays at zero.
    counts = {'insertions': 0, 'deletions': 0}
    for i, word in enumerate(words):
        if 'insertion' in word:
            counts['insertions'] = int(words[i - 1])
        if 'deletion' in word:
            counts['deletions'] = int(words[i - 1])
    sizes.append(counts)

df = pandas.DataFrame(sizes)
df['newlines'] = df.insertions - df.deletions
df.describe()  # show some basic stat

# Share of commit pairs whose net line change is below / above each bound.
for sign, bounds in (('<', (-500, -100)), ('>', (0, 100, 500, 1000, 2000))):
    for n in bounds:
        if sign == '<':
            selected = df[df.newlines < n]
        else:
            selected = df[df.newlines > n]
        rat = selected.size / df.size
        print(sign, n, round(rat * 100, 2), '%')

# draw charts
(gg.ggplot(df, gg.aes(x='newlines'))
 + gg.geom_density()
 + gg.xlim(-2000, 0))
(gg.ggplot(df, gg.aes(x='newlines'))
 + gg.geom_density()
 + gg.xlim(0, 2000))
"comparison_category"] = "across_batch_replicate" similarity_melted_df.loc[different_treatment_within_batch, "comparison_category"] = "same_batch_nonreplicate" # In[8]: similarity_melted_df.comparison_category.value_counts() # In[9]: similarity_melted_df.Metadata_clone_number_pair_a.value_counts() # In[10]: (gg.ggplot(similarity_melted_df, gg.aes(x="similarity_metric")) + gg.geom_density(gg.aes(fill="comparison_category"), alpha=0.5) + gg.theme_bw()) # In[11]: (gg.ggplot(similarity_melted_df, gg.aes(x="similarity_metric")) + gg.geom_density(gg.aes(fill="comparison_category"), alpha=0.5) + gg.theme_bw() + gg.facet_wrap("~Metadata_batch_pair_a")) # In[12]: (gg.ggplot( similarity_melted_df.query( "Metadata_clone_number_pair_a in ['WT_parental', 'CloneA', 'CloneE']"), gg.aes(x="similarity_metric")) + gg.geom_density(gg.aes(fill="comparison_category"), alpha=0.5) +
('probability', pipeline.predict_proba(X)[:, 1]) ]) predict_df = predict_df.append(df) predict_df['probability_str'] = predict_df['probability'].apply('{:.1%}'.format) # In[27]: # Top predictions amongst negatives (potential hidden responders to a targeted cancer therapy) (predict_df .sort_values('decision_function', ascending=False) .query("status == 0 and feature_set == 'full'") .head(10) ) # In[28]: predict_df['status_'] = predict_df['status'].map( lambda x: 'negative' if x == 0 else 'positive') (gg.ggplot(predict_df, gg.aes(x='probability', fill='status_')) + gg.geom_density(alpha=0.6) + gg.facet_wrap('~feature_set', ncol=1) + gg.labs(x='probability', y='density') + gg.guides(fill=gg.guide_legend(title="")) + theme_cognoma())
# access data ames = pd.read_csv("bank.csv") # initial dimension ames.shape # first few observations ames.head() train, test = train_test_split(ames, test_size=0.3, random_state=123) f"raw data dimensions: {ames.shape}; training dimensions: {train.shape}; testing dimensions: {test.shape}" (ggplot(train, aes('Deposit')) + geom_density() + geom_density(data = test, color = "red") + ggtitle("Random sampling with SciKit-Learn")) y = attrition["age"] train_strat, test_strat = train_test_split(age, test_size=0.3, random_state=123, stratify=y) # response distribution for raw data attrition["age"].value_counts(normalize=True) # response distribution for training data train_strat["Attrition"].value_counts(normalize=True)
import numpy as np
import pandas as pd
import statsmodels.api as sm
import statsmodels.formula.api as smf
from itertools import combinations
import plotnine as p

# read data
import ssl

# NOTE: this replaces the default HTTPS context factory with the
# unverified one for the whole process.
ssl._create_default_https_context = ssl._create_unverified_context


def read_data(file):
    """Read the Stata file named ``file`` from the mixtape GitHub repository."""
    url = "https://raw.github.com/scunning1975/mixtape/master/" + file
    return pd.read_stata(url)


# Two groups of 20 observations: d == 0 for the first 20, d == 1 for the rest.
outcomes = (0.22, -0.87, -2.39, -1.79, 0.37, -1.54, 1.28, -0.31, -0.74, 1.72,
            0.38, -0.17, -0.62, -1.10, 0.30, 0.15, 2.30, 0.19, -0.50, -0.9,
            -5.13, -2.19, 2.43, -3.83, 0.5, -3.25, 4.32, 1.63, 5.18, -0.43,
            7.11, 4.87, -3.10, -5.81, 3.76, 6.31, 2.58, 0.07, 5.76, 3.50)
tb = pd.DataFrame({
    'd': np.repeat([0, 1], 20),
    'y': outcomes,
})

# Density of y for each group, coloured by d.
(
    p.ggplot()
    + p.geom_density(tb, p.aes(x='y', color='factor(d)'))
    + p.xlim(-7, 8)
    + p.labs(title="Kolmogorov-Smirnov Test")
    + p.scale_color_discrete(labels=("Control", "Treatment"))
)