def hist_pseudotime(adata, fill='#595959', alpha=1, bins=30):
    """Draw a histogram of the ``pseudotime`` column of ``adata.obs``.

    Parameters
    --------------
    adata: AnnData
        Object whose ``obs`` table holds a ``pseudotime`` column
        (per the original documentation, produced by `tl.pseudotime`).
    fill: str
        If this is the name of a column in ``adata.obs`` the bars are
        filled by mapping that column; otherwise it is passed to the
        histogram as a fixed color (color name or hex code).
    alpha: float
        A float between 0 and 1 controlling the opacity of the bars.
    bins: int
        Number of histogram bins.

    Returns
    -----------
    A plotnine histogram of pseudotime.
    """
    if fill in adata.obs.columns:
        # `fill` names an obs column: map it as an aesthetic.
        base = ggplot(adata.obs, aes('pseudotime', fill=fill))
        bars = geom_histogram(alpha=alpha, bins=bins)
    else:
        # `fill` is a literal color applied to every bar.
        base = ggplot(adata.obs, aes('pseudotime'))
        bars = geom_histogram(fill=fill, alpha=alpha, bins=bins)

    return base + bars + labs(x='Pseudotime', y='Count') + theme_std
def plot():
    """Write the protobowl per-user summary plots as PDFs.

    Loads the records through ``load_protobowl()``, averages every column
    per ``uid``, prints the number of users, records and distinct
    questions, and saves four figures into ``output/protobowl/``: a user
    scatter plot and density histograms of log record count, accuracy and
    average buzzing position.
    """
    outdir = 'output/protobowl/'
    pathlib.Path(outdir).mkdir(parents=True, exist_ok=True)

    df, _ = load_protobowl()
    # Anything that is not exactly True counts as an incorrect answer.
    df.result = df.result.apply(lambda x: x is True)
    df['log_n_records'] = df.user_n_records.apply(np.log)

    user_stat = df.groupby('uid').agg(np.mean)
    print(f'{len(user_stat)} users')
    print(f'{len(df)} records')
    print(f'{len(set(df.qid))} questions')

    # Point opacity is the log record count scaled by its maximum.
    max_color = user_stat.log_n_records.max()
    scaled = user_stat.log_n_records.apply(lambda x: x / max_color)
    user_stat['alpha'] = pd.Series(scaled, index=user_stat.index)

    # 2D user plot
    point_mapping = aes(x='relative_position', y='result',
                        size='user_n_records', color='log_n_records',
                        alpha='alpha')
    hidden_legends = {'color': False, 'alpha': False, 'size': False}
    p0 = (ggplot(user_stat)
          + geom_point(point_mapping, show_legend=hidden_legends)
          + scale_color_gradient(high='#e31a1c', low='#ffffcc')
          + labs(x='Average buzzing position', y='Accuracy')
          + theme(aspect_ratio=1))
    p0.save(os.path.join(outdir, 'protobowl_users.pdf'))
    print('p0 done')

    # (tag, column, outline color, fill color, x label, file name)
    histogram_specs = (
        ('p1', 'log_n_records', '#e6550d', '#fee6ce',
         'Log number of records', 'protobowl_hist.pdf'),
        ('p2', 'result', '#31a354', '#e5f5e0',
         'Accuracy', 'protobowl_acc.pdf'),
        ('p3', 'relative_position', '#3182bd', '#deebf7',
         'Average buzzing position', 'protobowl_pos.pdf'),
    )
    for tag, column, outline, fill, x_label, filename in histogram_specs:
        hist = (ggplot(user_stat, aes(x=column, y='..density..'))
                + geom_histogram(color=outline, fill=fill)
                + geom_density()
                + labs(x=x_label, y='Density')
                + theme(aspect_ratio=0.3))
        hist.save(os.path.join(outdir, filename))
        print(f'{tag} done')
def plot():
    """Write the protobowl per-user summary plots as PDFs.

    Loads the records through ``load_protobowl()``, averages every column
    per ``uid``, prints the number of users and records, and saves four
    figures into ``output/protobowl/``: a user scatter plot and density
    histograms of log record count, accuracy and average buzzing position.
    """
    outdir = 'output/protobowl/'
    pathlib.Path(outdir).mkdir(parents=True, exist_ok=True)

    df = load_protobowl()
    # Anything that is not exactly True counts as an incorrect answer.
    df.result = df.result.apply(lambda x: x is True)
    df['log_n_records'] = df.user_n_records.apply(np.log)

    user_stat = df.groupby('uid').agg(np.mean)
    print(f'{len(user_stat)} users')
    print(f'{len(df)} records')

    # Point opacity is the log record count scaled by its maximum.
    max_color = user_stat.log_n_records.max()
    scaled = user_stat.log_n_records.apply(lambda x: x / max_color)
    user_stat['alpha'] = pd.Series(scaled, index=user_stat.index)

    # 2D user plot
    point_mapping = aes(x='relative_position', y='result',
                        size='user_n_records', color='log_n_records',
                        alpha='alpha')
    hidden_legends = {'color': False, 'alpha': False, 'size': False}
    p0 = (ggplot(user_stat)
          + geom_point(point_mapping, show_legend=hidden_legends)
          + scale_color_gradient(high='#e31a1c', low='#ffffcc')
          + labs(x='Average buzzing position', y='Accuracy')
          + theme(aspect_ratio=1))
    p0.save(os.path.join(outdir, 'protobowl_users.pdf'))
    print('p0 done')

    # (tag, column, outline color, fill color, x label, file name)
    histogram_specs = (
        ('p1', 'log_n_records', '#e6550d', '#fee6ce',
         'Log number of records', 'protobowl_hist.pdf'),
        ('p2', 'result', '#31a354', '#e5f5e0',
         'Accuracy', 'protobowl_acc.pdf'),
        ('p3', 'relative_position', '#3182bd', '#deebf7',
         'Average buzzing position', 'protobowl_pos.pdf'),
    )
    for tag, column, outline, fill, x_label, filename in histogram_specs:
        hist = (ggplot(user_stat, aes(x=column, y='..density..'))
                + geom_histogram(color=outline, fill=fill)
                + geom_density()
                + labs(x=x_label, y='Density')
                + theme(aspect_ratio=0.3))
        hist.save(os.path.join(outdir, filename))
        print(f'{tag} done')
def plot_char_percent_vs_accuracy_histogram(self, category=False):
    """Histogram of ``char_percent`` from ``self.char_plot_df``, filled by ``Outcome``.

    Bars are 0.05 wide. When ``category`` is truthy the plot is faceted
    by the ``category_jmlr`` column.
    """
    plot = ggplot(self.char_plot_df)
    if category:
        plot = plot + facet_wrap('category_jmlr')
    return (
        plot
        + aes(x='char_percent', fill='Outcome')
        + geom_histogram(binwidth=.05)
    )
def plot_char_percent_vs_accuracy_histogram(self, category=False):
    """Histogram of ``char_percent`` from ``self.char_plot_df``, filled by ``Outcome``.

    Bars are 0.05 wide. When ``category`` is truthy the plot is faceted
    by the ``category_jmlr`` column.
    """
    histogram = ggplot(self.char_plot_df)
    if category:
        # One panel per category.
        histogram = histogram + facet_wrap('category_jmlr')
    histogram = histogram + aes(x='char_percent', fill='Outcome')
    return histogram + geom_histogram(binwidth=.05)
def plot_char_percent_vs_accuracy_histogram(self, category=False):
    """Histogram of ``char_percent`` from ``self.char_plot_df``, filled by ``Outcome``.

    Bars are 0.05 wide. When ``category`` is truthy the plot is faceted
    by the ``category_jmlr`` column.
    """
    chart = ggplot(self.char_plot_df)
    if category:
        chart = chart + facet_wrap("category_jmlr")
    mapping = aes(x="char_percent", fill="Outcome")
    return chart + mapping + geom_histogram(binwidth=0.05)
def create_length_plot(len_df, legend_position='right', legend_box='vertical'):
    """Density histogram of column ``x`` per ``Method``, faceted by ``Task``.

    The mean of ``x`` for every (Task, Method) pair is drawn as a dashed
    vertical segment with its value printed above it.

    ``legend_position`` and ``legend_box`` are forwarded to ``theme``.
    """
    mean_len_df = len_df.groupby(['Task', 'Method']).mean().reset_index()
    # A constant column named ' ' gives the mean segments a linetype legend
    # entry reading 'Mean Length' under a blank title.
    mean_len_df[' '] = 'Mean Length'

    histogram = geom_histogram(binwidth=2, position='identity', alpha=.6)
    mean_labels = geom_text(
        aes(x='x', y=.22, label='x', color='Method'),
        mean_len_df,
        inherit_aes=False,
        format_string='{:.1f}',
        show_legend=False,
    )
    mean_markers = geom_segment(
        aes(x='x', xend='x', y=0, yend=.205, linetype=' '),
        mean_len_df,
        inherit_aes=False,
        color='black',
    )
    legend_theme = theme(
        aspect_ratio=1,
        legend_title=element_blank(),
        legend_position=legend_position,
        legend_box=legend_box,
    )

    return (
        ggplot(len_df)
        + aes(x='x', fill='Method', y='..density..')
        + histogram
        + mean_labels
        + mean_markers
        + scale_linetype_manual(['dashed'])
        + facet_wrap('Task')
        + xlim(0, 20)
        + ylim(0, .23)
        + xlab('Example Length')
        + ylab('Frequency')
        + scale_color_manual(values=COLORS)
        + scale_fill_manual(values=COLORS)
        + theme_fs()
        + legend_theme
    )
def test_midpoint():
    """Histogram, frequency polygon and binned points share ``n`` bins; compared to the 'midpoint' baseline."""
    bars = geom_histogram(aes(fill='factor(z)'), bins=n, alpha=0.25)
    outline = geom_freqpoly(bins=n, size=4)
    markers = geom_point(stat='bin', bins=n, size=4, stroke=0, color='red')

    plot = ggplot(df, aes('x')) + bars + outline + markers

    assert plot + _theme == 'midpoint'
def test_scale_transformed_breaks():
    """Explicit histogram breaks must be transformed together with the x scale."""
    # x holds 1 once, 2 twice, 3 three times and 4 four times.
    values = np.repeat(range(1, 5), range(1, 5))
    df = pd.DataFrame({'x': values})
    breaks = [1, 2.5, 4]
    p = ggplot(df, aes('x')) + geom_histogram(breaks=breaks)

    untransformed = layer_data(p)
    transformed = layer_data(p + scale_x_sqrt())

    lower_edges = breaks[:-1]
    np.testing.assert_allclose(untransformed.xmin, lower_edges)
    np.testing.assert_allclose(transformed.xmin, np.sqrt(lower_edges))
def plot_n_train_vs_accuracy(self):
    """Histogram of ``n_train`` from ``self.combined_df`` (bin width 1), filled by ``Outcome`` and faceted by ``seen``."""
    base = ggplot(self.combined_df) + facet_wrap("seen")
    mapping = aes(x="n_train", fill="Outcome")
    return base + mapping + geom_histogram(binwidth=1)
def plot_frequency(n=200):
    """
    Save a histogram of tweets by date, filled by author.

    Parameters
    ----------
    n: int
        Passed to ``gather_tweets``; per the original documentation, how
        many tweets should be analysed.

    Returns
    -------
    None. The figure is written through ``ggsave`` with file name
    ``test.png`` and path ``static/``.
    """
    from plotnine import ggplot, aes, geom_histogram, scale_x_datetime, labs, theme_minimal, ggsave
    from Mod_1_API import gather_tweets
    from mizani.breaks import date_breaks
    from mizani.formatters import date_format
    import pandas

    # The result of gather_tweets must convert to a frame with 'Date' and
    # 'Author' columns, since both are mapped below.
    tweets = pandas.DataFrame(gather_tweets(n))

    weekly_axis = scale_x_datetime(breaks=date_breaks('1 week'))
    axis_labels = labs(x="Time in weeks", y="Number of tweets by source")
    histogram = (
        ggplot(tweets, aes(x='Date', fill='Author'))
        + geom_histogram()
        + weekly_axis
        + axis_labels
        + theme_minimal()
    )

    ggsave(plot=histogram, filename="test.png", path="static/")
def plot_ccs_stats(self, variable, *, trim_frac=0.005, bins=25,
                   histogram_stat='count', maxcol=None, panelsize=1.75):
    """Plot histograms of CCS stats for all runs.

    Parameters
    ----------
    variable : {'length', 'passes', 'accuracy'}
        Variable for which we plot stats. You will get an error if
        :meth:`Summaries.has_stat` is not true for `variable`.
    trim_frac : float
        Trim this amount of the bottom and top fraction from the data
        before plotting. Useful if outliers greatly extend scale. The
        quantiles are computed over the whole table returned by
        ``self.ccs_stats(variable)``, not separately per run.
    bins : int
        Number of histogram bins.
    histogram_stat : {'count', 'density'}
        Plot the count of CCSs or their density normalized for each run.
        The y-axis label follows this choice.
    maxcol : None or int
        Max number of columns in faceted plot.
    panelsize : float
        Size of each plot panel.

    Returns
    -------
    plotnine.ggplot.ggplot
        A panel of histograms, one facet per value of the `name` column.

    """
    df = (self.ccs_stats(variable)
          .assign(lower=lambda x: x[variable].quantile(trim_frac),
                  upper=lambda x: x[variable].quantile(1 - trim_frac),
                  trim=lambda x: ((x[variable] > x['upper']) |
                                  (x[variable] < x['lower']))
                  )
          .query('not trim')
          )

    npanels = len(df['name'].unique())
    if maxcol is None:
        ncol = npanels
    else:
        ncol = min(maxcol, npanels)
    nrow = math.ceil(npanels / ncol)

    # Label the y axis after the statistic actually plotted; the label used
    # to read 'number of CCSs' even when densities were shown.
    if histogram_stat == 'count':
        ylabel = 'number of CCSs'
    else:
        ylabel = f"{histogram_stat} of CCSs"

    p = (p9.ggplot(df, p9.aes(variable, y=f"..{histogram_stat}..")) +
         p9.geom_histogram(bins=bins) +
         p9.facet_wrap('~ name', ncol=ncol) +
         p9.theme(figure_size=(panelsize * ncol, panelsize * nrow),
                  axis_text_x=p9.element_text(angle=90,
                                              vjust=1,
                                              hjust=0.5)
                  ) +
         p9.ylab(ylabel)
         )

    return p
def plot_histogram(df_plot, variable_column, output_file='plot_distribution',
                   facet_column='none', x_log10=False):
    """Plot a histogram of one column and save it as a png.

    Parameters
    ----------
    df_plot : pandas.DataFrame
        DataFrame with <variable_column> as a column. It is not modified;
        plotting works on a copy.
    variable_column : string
        Name of the column to plot.
    output_file : string
        Basename of output file ('.png' is appended).
    facet_column : string
        Column to facet the plot by, or 'none' for a single panel.
    x_log10 : bool
        If True, plot log10 of the values. When any value is exactly zero,
        1e-10 is added to every value before taking the log.

    Returns
    -------
    int
        0 when the plot was saved; 1 when `x_log10` was requested but the
        column contains negative values (nothing is plotted).
    """
    # Copy with the plotted values under a fixed column name, so the caller's
    # frame never gains an 'x' column as a side effect.
    plot_df = df_plot.assign(x=df_plot[variable_column])
    x_label = variable_column

    if x_log10:
        x_values = plot_df['x'].values
        if np.any(x_values < 0):
            # log10 is undefined for negative values: report and bail out.
            return 1
        if np.any(x_values == 0):
            # Small offset keeps zeros finite on the log scale.
            plot_df['x'] = np.log10(x_values + 1e-10)
        else:
            plot_df['x'] = np.log10(x_values)
        x_label = variable_column + ' (log10)'

    gplt = plt9.ggplot(plot_df, plt9.aes(x='x'))
    gplt = gplt + plt9.theme_bw()
    gplt = gplt + plt9.geom_histogram(alpha=0.8)
    gplt = gplt + plt9.scale_x_continuous(minor_breaks=0)
    gplt = gplt + plt9.scale_y_continuous(minor_breaks=0)
    gplt = gplt + plt9.labs(title='', x=x_label)
    gplt = gplt + plt9.theme(axis_text_x=plt9.element_text(angle=-45, hjust=0))

    if facet_column != 'none':
        gplt = gplt + plt9.facet_wrap('~ {}'.format(facet_column), ncol=5)
        # Figure size grows with the number of facets.
        n_facets = plot_df[facet_column].nunique()
        gplt.save(
            '{}.png'.format(output_file),
            dpi=300,
            width=6 * (n_facets / 4),
            height=4 * (n_facets / 4),
            limitsize=False,
        )
    else:
        gplt.save('{}.png'.format(output_file), dpi=300, width=4, height=4)

    return 0
def test_deepcopy():
    """A deep-copied plot is a new object that shares data and environment."""
    original = ggplot(aes('x'), data=df) + geom_histogram()
    clone = deepcopy(original)

    assert original is not clone
    # The data is expected to be shared between copies (the original note
    # was unsure whether anything more is required here).
    assert original.data is clone.data

    # Layers are copied: same count, distinct geom objects.
    assert len(original.layers) == len(clone.layers)
    assert original.layers[0].geom is not clone.layers[0].geom

    # Mappings are copied: same size, distinct objects.
    assert len(original.mapping) == len(clone.mapping)
    assert original.mapping is not clone.mapping

    assert original.environment is clone.environment
def plot_pixel_values(im):
    """Histogram of the flattened values of ``im`` in the xkcd theme.

    ``im`` must provide ``flatten()`` (e.g. a numpy array).
    """
    pixels = im.flatten()
    frame = pd.DataFrame({'values': pixels}, index=range(len(pixels)))

    histogram = p9.geom_histogram(data=frame, mapping=p9.aes('values'))
    return (
        p9.ggplot()
        + histogram
        + p9.theme_xkcd()
        + p9.labels.xlab('Pixel values')
    )
def plot_hists(df, out=None, **kwargs):
    r"""Construct histograms

    Pivot the columns listed in `out` into long form and draw one 30-bin
    histogram per variable, each facet with free scales. Often used to
    visualize the results of random sampling for multiple outputs. Usually
    called as a dispatch from plot_auto().

    Args:
        df (DataFrame): Data containing the columns named in `out`
        out (list of strings): Variables to plot; required

    Returns:
        plotnine ggplot object: faceted histograms

    Raises:
        ValueError: if `out` is None

    Examples:

        >>> import grama as gr
        >>> import matplotlib.pyplot as plt
        >>> from grama.models import make_cantilever_beam
        >>> md = make_cantilever_beam()
        >>> ## Dispatch from autoplotter
        >>> (
        >>>     md
        >>>     >> gr.ev_sample(n=100, df_det="nom")
        >>>     >> gr.pt_auto()
        >>> )
        >>> ## Re-create without metadata
        >>> (
        >>>     md
        >>>     >> gr.ev_sample(n=100, df_det="nom")
        >>>     >> gr.pt_hists(out=md.out)
        >>> )

    """
    if out is None:
        raise ValueError("Must provide input columns list as keyword out")

    # One row per (variable, value) pair.
    df_long = df >> tf_pivot_longer(
        columns=out,
        names_to="var",
        values_to="value",
    )

    histograms = (
        ggplot(aes("value"))
        + geom_histogram(bins=30)
        + facet_wrap("var", scales="free")
        + theme_minimal()
        + labs(x="Output Value", y="Count")
    )
    return df_long >> histograms
def plot_estimate_distribution(dist):
    """25-bin histogram of ``dist['estimates']`` with two reference lines.

    A solid line marks the sum of the ``denomination`` column of the
    module-level ``pile``; a dotted line marks the fixed value 3363400
    (NOTE(review): meaning of this constant is not visible here).
    """
    line_style = {'color': "#FF5500", 'size': 2}

    pile_total_line = pn.geom_vline(
        xintercept=sum(pile['denomination']),
        **line_style,
    )
    reference_line = pn.geom_vline(
        xintercept=3363400,
        linetype='dotted',
        **line_style,
    )

    histogram = pn.ggplot(dist, pn.aes(x='estimates')) + pn.geom_histogram(bins=25)
    return histogram + pile_total_line + reference_line
def density_plot(  # type: ignore
    self,
    df: pd.DataFrame,
    xmin=None,
    xmax=None,
    fill: str = "#fbb4ae",
    bins: int = 50,
    **kwargs,
):
    """Histogram of the first column of ``df``.

    :param df: data frame whose first column is plotted
    :param xmin: forwarded to ``self._scale_x``
    :param xmax: forwarded to ``self._scale_x``
    :param fill: bar color
    :param bins: number of histogram bins
    :param kwargs: accepted but not used here
    :return: ggplot graphics object
    """
    column = df.columns[0]
    rotated_ticks = theme(axis_text_x=element_text(rotation=45, hjust=1))

    plot = ggplot(df, aes(column)) + geom_histogram(fill=fill, bins=bins)
    plot = plot + self._scale_x(xmin, xmax)
    return plot + ergo_theme + rotated_ticks
def comparison_plot(  # type: ignore
    self,
    df: pd.DataFrame,
    xmin=None,
    xmax=None,
    bins: int = 50,
    **kwargs,
):
    """Stacked histograms of the second column of ``df``, one panel per value of the first.

    :param df: first column is the grouping (fill and facet), second
        column holds the values
    :param xmin: forwarded to ``self._scale_x``
    :param xmax: forwarded to ``self._scale_x``
    :param bins: number of histogram bins
    :param kwargs: accepted but not used here
    :return: ggplot graphics object
    """
    group_column = df.columns[0]
    value_column = df.columns[1]
    rotated_ticks = theme(axis_text_x=element_text(rotation=45, hjust=1))

    return (
        ggplot(df, aes(value_column, fill=group_column))
        + scale_fill_brewer(type="qual", palette="Pastel1")
        + geom_histogram(position="identity", alpha=0.9, bins=bins)
        + self._scale_x(xmin, xmax)
        + facet_wrap(group_column, ncol=1)
        # The facets already separate the groups, so drop the fill legend.
        + guides(fill=False)
        + ergo_theme
        + rotated_ticks
    )
def create(self, file_path: str) -> None:
    """Save the "Class Sizes" figure to ``file_path``.

    100-bin histogram of the ``loc`` column of ``self._data``, one row of
    panels per ``category`` with independent y scales, on an asinh x axis.
    """
    x_axis = scale_x_continuous(trans=asinh_trans(), labels=asinh_labels)
    y_axis = scale_y_continuous(labels=comma_format())
    # Alternative y axis showing percentages instead of counts:
    # scale_y_continuous(labels=lambda l: ["%.2f%%" % (v * 100 / len(self._data)) for v in l])
    layout = theme(text=element_text(size=32),
                   subplots_adjust={"hspace": 0.1})

    figure = (
        ggplot(self._data, aes("loc"))
        + geom_histogram(bins=100, fill="#1e4f79")
        + facet_grid(facets="category ~ .", scales='free_y')
        + x_axis
        + y_axis
        + ggtitle("Class Sizes")
        + xlab("Lines of Code")
        + ylab("Number of Classes")
        + theme_classic(base_size=32, base_family="Helvetica")
        + layout
    )
    figure.save(file_path, width=8, height=18)
def create(self, file_path: str) -> None:
    """Save the "Distributions of QMOOD Quality Attributes" figure to ``file_path``.

    100-bin histogram of the ``value`` column of ``self._data``, one panel
    per ``variable`` (three columns, free scales), on an asinh x axis.
    """
    x_axis = scale_x_continuous(trans=asinh_trans(), labels=asinh_labels)
    y_axis = scale_y_continuous(labels=comma_format())
    layout = theme(text=element_text(size=32),
                   subplots_adjust={"wspace": 0.35, "hspace": 0.35})

    figure = (
        ggplot(self._data, aes("value"))
        + geom_histogram(bins=100, fill="#1e4f79")
        + facet_wrap(facets="variable", scales="free", ncol=3)
        + x_axis
        + y_axis
        + ggtitle("Distributions of QMOOD Quality Attributes")
        + xlab("Quality Attribute Value")
        + ylab("Number of Projects")
        + theme_classic(base_size=32, base_family="Helvetica")
        + layout
    )
    figure.save(file_path, width=24, height=12)
def create(self, file_path: str) -> None:
    """Save the "Intensity of Design Pattern Use" figure to ``file_path``.

    100-bin histogram of the ``value`` column of ``self._data``, one panel
    per ``variable`` (three columns, free scales), with x limited to [0, 1].
    """
    y_axis = scale_y_continuous(labels=comma_format())
    layout = theme(text=element_text(size=32),
                   axis_title_y=element_text(margin={"r": 40}),
                   subplots_adjust={"wspace": 0.3, "hspace": 0.5})

    figure = (
        ggplot(self._data, aes("value"))
        + geom_histogram(bins=100, fill="#1e4f79")
        + facet_wrap(facets="variable", scales="free", ncol=3)
        + xlim(0, 1)
        + y_axis
        + ggtitle("Intensity of Design Pattern Use")
        + xlab("Percentage of Classes Participating in Design Pattern")
        + ylab("Number of Projects")
        + theme_classic(base_size=32, base_family="Helvetica")
        + layout
    )
    figure.save(file_path, width=24, height=24)
def plot_histogram_100_bins(histogram_df):
    """Build a 100-bin histogram of the ``effect_size`` column.

    Inputs
    ------
    histogram_df: pandas.DataFrame
        Data to plot; its ``effect_size`` column is mapped to x

    Returns
    -------
    plot: plotnine.ggplot
        The histogram figure
    """
    mapping = aes(x='effect_size')
    return ggplot(histogram_df, mapping) + geom_histogram(bins=100)
def plot_dist_with_ci(dist):
    """25-bin histogram of ``dist.estimates`` with its central 95% interval.

    Dotted vertical lines mark the 2.5% and 97.5% quantiles of the
    ``estimates`` column, and the title shows the mean followed by the same
    two quantiles.

    The interval lines are computed from the ``estimates`` column only.
    They were previously taken from ``dist.quantile(...)`` over the whole
    frame, which disagreed with the title whenever ``dist`` carried any
    other column.
    """
    estimates = dist.estimates
    lower = estimates.quantile(0.025)
    upper = estimates.quantile(0.975)

    return (pn.ggplot(dist, pn.aes(x='estimates'))
            + pn.geom_histogram(bins=25)
            + pn.geom_vline(
                xintercept=lower,
                color="#FF5500",
                size=2,
                linetype='dotted',
            )
            + pn.geom_vline(
                xintercept=upper,
                color="#FF5500",
                size=2,
                linetype='dotted',
            )
            + pn.ggtitle("${0:,.0f} ({1:,.0f}, {2:,.0f})".format(
                np.mean(estimates),
                lower,
                upper,
            )))
def plotMutsHistogram(self, value, *, mutant_order=1, bins=30, wt_vline=True):
    """Plot distribution of phenotype for all mutants of a given order.

    Parameters
    ----------
    value : {'latentPhenotype', 'observedPhenotype', 'observedEnrichment'}
        Name of the method on this object used to compute the plotted value.
    mutant_order : int
        Plot mutations of this order. Currently only works for 1
        (single mutants).
    bins : int
        Number of bins in histogram.
    wt_vline : bool
        Draw a vertical line at the wildtype value.

    Returns
    -------
    plotnine.ggplot.ggplot
        Histogram of phenotype for all mutants.

    Raises
    ------
    ValueError
        If `mutant_order` is not 1 or `value` is not one of the options.

    """
    if mutant_order != 1:
        raise ValueError('only implemented for `mutant_order` of 1')

    allowed_values = {'latentPhenotype', 'observedPhenotype',
                      'observedEnrichment'}
    if value not in allowed_values:
        raise ValueError(f"invalid `value` of {value}")

    func = getattr(self, value)
    # One value per key of `muteffects`.
    phenotypes = pd.DataFrame(
        {value: [func(m) for m in self.muteffects.keys()]})

    p = p9.ggplot(phenotypes, p9.aes(value))
    p = p + p9.geom_histogram(bins=bins)
    p = p + p9.theme(figure_size=(3.5, 2.5))
    p = p + p9.ylab(f"number of {mutant_order}-mutants")

    if not wt_vline:
        return p

    # The empty string is evaluated as the wildtype.
    return p + p9.geom_vline(
        xintercept=func(''),
        color=CBPALETTE[1],
        linetype='dashed')
def plot_pred_hist(label_list, pred_list, names=None, n_bins=10):
    """
    Draw histograms of predicted probabilities.

    :param: label_list: sequence of ground-truth label lists, given as
        [(y1, y2, ...), (y1, y2, ...)], aligned with pred_list
    :param: pred_list: sequence of predicted-probability lists, with the
        same length as label_list
    :param: names=None: model names; None or a sequence of the same length.
        When omitted, two label sets are named ('train', 'test'), three are
        named ('train', 'valid', 'test'), and any other number gets
        sequential integers.
    :param: n_bins: number of histogram bins
    :return: plotnine object

    TODO: how the geom_vline lines are displayed
    """
    if names is None:
        if len(label_list) == 2:
            names = ('train', 'test')
        elif len(label_list) == 3:
            names = ('train', 'valid', 'test')
        else:
            names = list(range(len(label_list)))

    # name -> position, and str(position) -> name for the facet labeller.
    name_order = {k: v for v, k in enumerate(names)}
    name_order_rev = {str(k): v for v, k in name_order.items()}

    # Long table with one row per (model, variable, value), where variable
    # is 'y' or 'prediction'.
    d = pd.DataFrame(
        {
            'y': list(chain.from_iterable(label_list)),
            'prediction': list(chain.from_iterable(pred_list)),
        }
    ).assign(
        model=list(chain.from_iterable(
            [[name] * len(l) for name, l in zip(names, label_list)]))
    ).melt(
        id_vars='model'
    ).assign(
        order=lambda x: x.model.replace(name_order)
    ).sort_values(['order', 'variable'])

    # Data for drawing the mean values as guide lines
    d_mean = d.drop(columns='order').groupby(['variable', 'model']).mean(
    ).reset_index().rename(columns={'value': 'mean'})
    d = d.merge(d_mean, on=['variable', 'model'])

    return ggplot(
        d,
        aes(x='value', y='..density..', group='variable', fill='variable')
    ) + geom_histogram(
        # Honor the caller's bin count (this used to be fixed at 10).
        position='identity', alpha=.5, bins=n_bins
    ) + geom_vline(
        aes(xintercept='mean', group='variable', color='variable',
            linetype='variable')
    ) + labs(
        x='prediction', fill='frequency', linetype='mean', color='mean'
    ) + facet_wrap(
        '~order', scales='free_y', labeller=lambda x: name_order_rev[x]
    ) + theme_classic() + theme(figure_size=(6, 4))
def hist_residuals(self, figure_size=(8, 4), sample_frac=1.0):
    """Histogram of the ``residual`` column of ``self.df``

    A dashed red vertical line marks zero.

    Parameters
    ----------
    figure_size : tuple(int, int), optional
        default=(8, 4)
        Plot size (width, height)
    sample_frac : float, optional
        default=1.0
        Fraction of rows sampled from ``self.df`` before plotting

    Returns
    -------
    plot : ggplot object
    """
    sampled = self.df.sample(frac=sample_frac)

    bars = geom_histogram(fill="lightblue", colour="grey")
    zero_line = geom_vline(xintercept=0, color="red", linetype="dashed")
    titles = labs(title="Residuals", x="Residuals")

    return (
        ggplot(sampled, aes(x="residual"))
        + bars
        + zero_line
        + titles
        + theme(figure_size=figure_size)
    )
def show_community_prediction(
    self,
    percent_kept: float = 0.95,
    side_cut_from: str = "both",
    num_samples: int = 1000,
    bins: int = 50,
):
    """
    Plot samples from the community prediction on this question

    :param percent_kept: percentage of sample distribution to keep
    :param side_cut_from: which side to cut tails from,
        either 'both','lower', or 'upper'
    :param num_samples: number of samples from the community
    :param bins: The number of bins in the histogram,
        the more bins, the more 'fine grained' the graph.
        Fewer bins results in more aggregation
    :return: ggplot graphics object
    """
    draws = [self.sample_normalized_community() for _ in range(0, num_samples)]
    community_samples = pd.Series(draws)

    # Axis limits are found on the normalized scale, then converted back.
    (_xmin, _xmax) = self.get_central_quantiles(
        community_samples,
        percent_kept=percent_kept,
        side_cut_from=side_cut_from,
    )
    _xmin, _xmax = self.denormalize_samples([_xmin, _xmax])

    df = pd.DataFrame(
        data={"samples": self.denormalize_samples(community_samples)})

    # Prefer the short name; otherwise wrap the full title at 60 characters.
    if self.name:
        title_name = f"Q: {self.name}"
    else:
        title_name = "\n".join(
            textwrap.wrap(self.data["title"], 60))  # type: ignore

    histogram = ggplot(df, aes("samples")) + geom_histogram(fill="#b3cde3", bins=bins)
    return (
        histogram
        + scale_x_datetime(limits=(_xmin, _xmax))
        + labs(x="Prediction", y="Counts", title=title_name)
        + ergo_theme
        + theme(axis_text_x=element_text(rotation=45, hjust=1))
    )
def create_length_plot(len_df, legend_position='right', legend_box='vertical'):
    """Density histogram of column ``x`` per ``Method``, faceted by ``Task``.

    The mean of ``x`` for every (Task, Method) pair is drawn as a dashed
    vertical segment with its value printed above it.

    ``legend_position`` and ``legend_box`` are forwarded to ``theme``.
    """
    mean_len_df = len_df.groupby(['Task', 'Method']).mean().reset_index()
    # A constant column named ' ' gives the mean segments a linetype legend
    # entry reading 'Mean Length' under a blank title.
    mean_len_df[' '] = 'Mean Length'

    components = [
        aes(x='x', fill='Method', y='..density..'),
        geom_histogram(binwidth=2, position='identity', alpha=.6),
        # Mean value printed just above the top of its segment.
        geom_text(
            aes(x='x', y=.22, label='x', color='Method'),
            mean_len_df,
            inherit_aes=False,
            format_string='{:.1f}',
            show_legend=False,
        ),
        geom_segment(
            aes(x='x', xend='x', y=0, yend=.205, linetype=' '),
            mean_len_df,
            inherit_aes=False,
            color='black',
        ),
        scale_linetype_manual(['dashed']),
        facet_wrap('Task'),
        xlim(0, 20),
        ylim(0, .23),
        xlab('Example Length'),
        ylab('Frequency'),
        scale_color_manual(values=COLORS),
        scale_fill_manual(values=COLORS),
        theme_fs(),
        theme(
            aspect_ratio=1,
            legend_title=element_blank(),
            legend_position=legend_position,
            legend_box=legend_box,
        ),
    ]

    figure = ggplot(len_df)
    for component in components:
        figure = figure + component
    return figure
def plotMutsHistogram(self, value, *,
                      k=None,
                      mutant_order=1,
                      bins=30,
                      wt_vline=True,
                      ):
    """Plot distribution of phenotype for all mutants of a given order.

    Parameters
    ----------
    value : {'latentPhenotype', 'observedPhenotype', 'observedEnrichment'}
        Name of the method on this object used to compute the plotted value.
    k : int or None
        If value is `latentPhenotype`, which phenotype (1 <= `k` <=
        :attr:`MultiLatentSigmoidPhenotypeSimulator.n_latent_phenotypes`)
        to plot. Ignored for the other values.
    mutant_order : int
        Plot mutations of this order. Currently only works for 1
        (single mutants).
    bins : int
        Number of bins in histogram.
    wt_vline : bool
        Draw a vertical line at the wildtype value.

    Returns
    -------
    plotnine.ggplot.ggplot
        Histogram of phenotype for all mutants.

    Raises
    ------
    ValueError
        If `mutant_order` is not 1, `value` is not one of the options, or
        `k` is invalid for `latentPhenotype`.

    """
    if mutant_order != 1:
        raise ValueError('only implemented for `mutant_order` of 1')

    allowed_values = {'latentPhenotype', 'observedPhenotype',
                      'observedEnrichment'}
    if value not in allowed_values:
        raise ValueError(f"invalid `value` of {value}")

    # Only the latent phenotype is indexed by `k`.
    kwargs = {}
    xlabel = value
    if value == 'latentPhenotype':
        if not (isinstance(k, int) and 1 <= k <= self.n_latent_phenotypes):
            raise ValueError(f"invalid `k` of {k}")
        kwargs = {'k': k}
        xlabel = f"latentPhenotype {k}"

    func = getattr(self, value)
    phenotypes = pd.DataFrame(
        {value: [func(m, **kwargs) for m in self._all_subs]})

    p = p9.ggplot(phenotypes, p9.aes(value))
    p = p + p9.geom_histogram(bins=bins)
    p = p + p9.theme(figure_size=(3.5, 2.5))
    p = p + p9.ylab(f"number of {mutant_order}-mutants")
    p = p + p9.xlab(xlabel)

    if not wt_vline:
        return p

    # The empty string is evaluated as the wildtype.
    return p + p9.geom_vline(
        xintercept=func('', **kwargs),
        color=CBPALETTE[1],
        linetype='dashed')
def test_histogram_count():
    """Histogram with ``n`` bins filled by ``factor(z)``; compared to the 'histogram-count' baseline."""
    bars = geom_histogram(aes(fill='factor(z)'), bins=n)
    plot = ggplot(df, aes('x')) + bars

    assert plot + _theme == 'histogram-count'
def plot_n_train_vs_accuracy(self):
    """Histogram of ``n_train`` from ``self.combined_df`` (bin width 1), filled by ``Outcome`` and faceted by ``seen``."""
    chart = ggplot(self.combined_df)
    chart = chart + facet_wrap('seen')
    chart = chart + aes(x='n_train', fill='Outcome')
    return chart + geom_histogram(binwidth=1)
# Do the numeric and integer columns need normalizing?
# SeniorCitizen is already binary, so it is left alone.
from plotnine import ggplot, aes, geom_histogram, geom_boxplot

# Save a histogram of each raw charge column.
for charge_column in ('MonthlyCharges', 'TotalCharges'):
    raw_histogram = ggplot(dat, aes(x=charge_column)) + geom_histogram()
    raw_histogram.save(filename=f"{charge_column}_Hist.png", dpi=300)

# Neither column follows a normal distribution. A log transformation could
# help, but the shapes are odd.
dat["LogTotalCharges"] = np.log(dat["TotalCharges"] + 1)
dat["LogMonthlyCharges"] = np.log(dat["MonthlyCharges"] + 1)

# Built for inspection only; these plots are not saved.
(ggplot(dat, aes(x='LogMonthlyCharges')) + geom_histogram())
(ggplot(dat, aes(x='LogTotalCharges')) + geom_histogram())

# The transformation does not really help, so drop the log columns for now.
dat = dat.drop(columns=["LogTotalCharges", "LogMonthlyCharges"])
def show_prediction(
    self,
    samples,
    percent_kept: float = 0.95,
    side_cut_from: str = "both",
    show_community: bool = False,
    num_samples: int = 1000,
    bins: int = 50,
):
    """Plot prediction on the true question scale from samples or a submission object.
    Optionally compare prediction against a sample from the distribution of community predictions

    :param samples: samples from a distribution answering the prediction question
        (true scale) as a list, numpy array or pandas series, or a prediction
        object (``SubmissionMixtureParams``)
    :param percent_kept: percentage of sample distribution to keep
    :param side_cut_from: which side to cut tails from, either 'both','lower', or 'upper'
    :param show_community: boolean indicating whether comparison to community predictions
        should be made
    :param num_samples: number of samples drawn from a prediction object and
        from the community; when raw samples are given it is replaced by
        their count
    :param bins: The number of bins in the histogram, the more bins, the more
        'fine grained' the graph. Fewer bins results in more aggregation.
        Applies to both the single and the community-comparison plot.
    :return: ggplot graphics object
    :raises ValueError: if ``samples`` is none of the accepted types
    """
    if isinstance(samples, SubmissionMixtureParams):
        prediction = samples
        prediction_normed_samples = pd.Series([
            logistic.sample_mixture(prediction)
            for _ in range(0, num_samples)
        ])
    else:
        if isinstance(samples, list):
            samples = pd.Series(samples)
        if not isinstance(samples, (pd.Series, np.ndarray)):
            raise ValueError(
                "Samples should be a list, numpy arrray or pandas series")
        # Draw as many community samples as there are prediction samples.
        num_samples = samples.shape[0]
        prediction_normed_samples = self.normalize_samples(samples)

    # Prefer the short name; otherwise wrap the full title at 60 characters.
    title_name = (
        f"Q: {self.name}" if self.name else "\n".join(
            textwrap.wrap(self.data["title"], 60))  # type: ignore
    )

    if show_community:
        df = pd.DataFrame(
            data={
                "community": [  # type: ignore
                    self.sample_normalized_community()
                    for _ in range(0, num_samples)
                ],
                "prediction": prediction_normed_samples,  # type: ignore
            })
        # get domain for graph given the percentage of distribution kept
        (_xmin,
         _xmax) = self.get_central_quantiles(df,
                                             percent_kept=percent_kept,
                                             side_cut_from=side_cut_from)
        _xmin, _xmax = self.denormalize_samples([_xmin, _xmax])
        df["prediction"] = self.denormalize_samples(df["prediction"])
        df["community"] = self.denormalize_samples(df["community"])

        df = pd.melt(df, var_name="sources",
                     value_name="samples")  # type: ignore
        return (ggplot(df, aes("samples", fill="sources")) +
                scale_fill_brewer(type="qual", palette="Pastel1") +
                # `bins` was previously not forwarded in this branch, so the
                # documented parameter had no effect on comparison plots.
                geom_histogram(position="identity", alpha=0.9, bins=bins) +
                scale_x_datetime(limits=(_xmin, _xmax)) +
                facet_wrap("sources", ncol=1) + labs(
                    x="Prediction",
                    y="Counts",
                    title=title_name,
                ) + guides(fill=False) + ergo_theme +
                theme(axis_text_x=element_text(rotation=45, hjust=1)))
    else:
        (_xmin, _xmax) = self.get_central_quantiles(
            prediction_normed_samples,
            percent_kept=percent_kept,
            side_cut_from=side_cut_from,
        )
        _xmin, _xmax = self.denormalize_samples([_xmin, _xmax])
        df = pd.DataFrame(data={
            "prediction":
            self.denormalize_samples(prediction_normed_samples)
        })
        return (ggplot(df, aes("prediction")) +
                geom_histogram(fill="#b3cde3", bins=bins) +
                scale_x_datetime(limits=(_xmin, _xmax)) +
                labs(x="Prediction", y="Counts", title=title_name) +
                ergo_theme +
                theme(axis_text_x=element_text(rotation=45, hjust=1)))
user_stat.log_n_records.apply(lambda x: x / max_color), index=user_stat.index) p0 = ggplot(user_stat) \ + geom_point(aes(x='ratio', y='accuracy', size='n_records', color='log_n_records', alpha='alpha'), show_legend={'color': False, 'alpha': False, 'size': False}) \ + scale_color_gradient(high='#e31a1c', low='#ffffcc') \ + theme(aspect_ratio=1) p0.save('protobowl_users.pdf') # p0.draw() print('p0 done') p1 = ggplot(user_stat, aes(x='log_n_records', y='..density..')) \ + geom_histogram(color='#e6550d', fill='#fee6ce') \ + geom_density() \ + theme(aspect_ratio=0.3) p1.save('protobowl_hist.pdf') # p1.draw() print('p1 done') p2 = ggplot(user_stat, aes(x='accuracy', y='..density..')) \ + geom_histogram(color='#31a354', fill='#e5f5e0') \ + geom_density(aes(x='accuracy')) \ + theme(aspect_ratio=0.3) p2.save('protobowl_acc.pdf') # p2.draw() print('p2 done')