def calc_tiers(df_ranks, year, week, bw=0.09, order=4, show=False):
    """Calculate 3-5 tiers using Gaussian Kernel Density Estimation

    Tier boundaries are the relative minima of a KDE fitted to the ``power``
    column. A ``tier`` column is added to ``df_ranks`` in place and the KDE
    plot is written to ``output/<year>/week<week>/tiers.png``.

    :param df_ranks: data frame with power rankings for each team
        (must contain a ``power`` column)
    :param year: current year (used in the output directory)
    :param week: current week (used in the output directory and plot title)
    :param bw: bandwidth for KDE
    :param order: ``order`` argument passed to ``argrelmin`` when searching
        for the relative minima of the density
    :param show: flag to show plot
    :return: ``df_ranks`` with the added ``tier`` column
    """
    logger.info('Calculating tiers for power rankings')
    power = df_ranks['power']

    # Estimate the kernel using power rankings
    kde = gaussian_kde(power, bw_method=bw)

    # Create grid of points for plot
    x_grid = np.linspace(power.min() - 10., power.max() + 10, power.size * 10)

    # Evaluate the density once; it feeds both the plot and the minima search
    density = kde(x_grid)
    df_kde = pd.DataFrame(dict(x=x_grid, kde=density))

    # Calculate relative minimums to determine tiers
    rel_min = pd.DataFrame(
        dict(rel_min=x_grid[argrelmin(density, order=order)[0]]))

    # Only keep 5 tiers (i.e. the 4 highest cut points)
    tier_mins = sorted(rel_min.rel_min.values, reverse=True)[:4]

    # Find position of power rank when added to list of minimums to get tier
    df_ranks['tier'] = df_ranks.apply(
        lambda x: sorted(tier_mins + [x.power], reverse=True).index(x.power) + 1,
        axis=1)

    # Plot KDE and overlay tiers and actual power rankings as vertical lines
    tier_plot = (
        ggplot(aes(x='x', y='kde'), data=df_kde)
        + geom_line(size=1.5)
        + geom_vline(
            aes(xintercept='rel_min'), data=rel_min, color='red', alpha=0.7)
        + geom_vline(
            aes(xintercept='power'), data=df_ranks, color='blue',
            linetype='dashed', alpha=0.4)
        + theme_bw()
        + labs(x='Power Rankings',
               y=f'KDE (bw: {bw}, order: {order})',
               title=f'Tiers for week {week}'))

    # Show plot
    if show:
        tier_plot.draw()

    # Create directory if it doesn't exist to save plot
    out_dir = Path(f'output/{year}/week{week}')
    out_dir.mkdir(parents=True, exist_ok=True)
    out_name = out_dir / 'tiers.png'

    # Save plot (plotnine is throwing too many warnings...). The filters are
    # silenced only for the duration of the save so the caller's warning
    # configuration is restored afterwards.
    with warnings.catch_warnings():
        warnings.simplefilter('ignore')
        tier_plot.save(out_name, width=9, height=6, dpi=300)

    logger.info(f'Saved Tiers plot to local file: {out_name.resolve()}')
    return df_ranks
def test_aesthetics():
    """Stack one geom_vline layer per mapped aesthetic and check the result.

    The themed plot is asserted equal to ``'aesthetics'`` (presumably an
    image-comparison hook provided by the test suite -- confirm in conftest).
    """
    layers = [
        geom_point(aes('x', 'y')),
        geom_vline(aes(xintercept='xintercept'), size=2),
        geom_vline(aes(xintercept='xintercept+.1', alpha='z'), size=2),
        geom_vline(aes(xintercept='xintercept+.2', linetype='factor(z)'),
                   size=2),
        geom_vline(aes(xintercept='xintercept+.3', color='factor(z)'),
                   size=2),
        geom_vline(aes(xintercept='xintercept+.4', size='z')),
    ]

    p = ggplot(df)
    for layer in layers:
        p = p + layer

    assert p + _theme == 'aesthetics'
def plot_estimate_distribution(dist):
    """Histogram of ``dist['estimates']`` with two vertical reference lines.

    A solid line marks the sum of ``pile['denomination']`` (``pile`` is
    defined outside this function) and a dotted line marks the fixed value
    3363400.
    """
    line_color = "#FF5500"

    histogram = (pn.ggplot(dist, pn.aes(x='estimates'))
                 + pn.geom_histogram(bins=25))
    pile_total_line = pn.geom_vline(
        xintercept=sum(pile['denomination']),
        color=line_color,
        size=2,
    )
    fixed_line = pn.geom_vline(
        xintercept=3363400,
        color=line_color,
        size=2,
        linetype='dotted',
    )
    return histogram + pile_total_line + fixed_line
def mungle_plot(data_df, users=USERS, aggregation="7D", start="2018-12-31",
                end=None):
    """Aggregate tweet counts over time and plot them as a line chart.

    The input is de-duplicated on (User, Tweet), restricted to rows whose
    ``Type`` is "Tweet" and whose ``User`` is in ``users``, expanded to a
    daily grid per user, resampled with ``aggregation`` and finally summed
    over all users.

    :param data_df: data frame with at least the columns User, Date, Tweet,
        Favorites, Retweets, ID and Type
    :param users: users to keep
    :param aggregation: pandas resample rule for the final aggregation
    :param start: first date (inclusive) kept in the aggregated data
    :param end: last date (inclusive) kept; defaults to the latest
        aggregated date
    :return: tuple ``(plot, agg_data)``
    """
    data = (data_df.sort_values(["User", "Date"])
            .drop_duplicates(subset=["User", "Tweet"]))
    data = data.astype({
        # NOTE(review): "******" is not a valid dtype string and looks like a
        # redaction artifact -- confirm the intended dtype for "User".
        "User": "******",
        "Tweet": "str",
        "Date": "datetime64[ns]",
        "Favorites": "int",
        "Type": "category"
    })
    # Shift timestamps by three hours (presumably a timezone correction --
    # confirm against the data source).
    data["Date"] = data["Date"] - dt.timedelta(hours=3)
    filtered_data = data.loc[(data.Type == "Tweet")].loc[data.User.isin(users)]

    # Daily grid shared by every user. Only the Date column is reduced:
    # taking the max of the whole frame would also reduce the string and
    # categorical columns, which is wasteful and can fail on mixed dtypes.
    daily_index = pd.date_range(
        dt.datetime(2018, 12, 30), data["Date"].max(), freq="D")

    data_sums = (
        filtered_data.groupby("User")
        .apply(lambda x: x.set_index("Date").resample("1D").sum()
               .reindex(daily_index))
        .drop("ID", axis=1)
        .rename_axis(["User", "Date"])
        .reset_index())
    data_count = (
        filtered_data.groupby("User")
        .apply(lambda x: x.set_index("Date").resample("1D").count()
               .reindex(daily_index))
        .drop(["User", "Favorites", "Retweets", "ID", "Type"], axis=1)
        .rename_axis(["User", "Date"])
        .reset_index())

    full_data = data_count.merge(data_sums, how="left", on=["User", "Date"])
    resampled_data = (full_data.set_index("Date").groupby("User").resample(
        aggregation, label="right", closed="right").sum())
    agg_data = resampled_data.groupby("Date").agg("sum").reset_index()

    if end is None:
        end = agg_data.Date.max()
    agg_data = agg_data.loc[(agg_data.Date >= start) & (agg_data.Date <= end)]

    plot = (p9.ggplot(agg_data, p9.aes("Date", "Tweet"))
            + p9.geom_line(color="red", size=1)
            + p9.geom_vline(xintercept="2019-06-30", linetype="dotted")
            + p9.geom_vline(xintercept="2019-10-27", linetype="dotted")
            + p9.theme(axis_text_x=p9.element_text(angle=90, hjust=-1))
            + p9.labs(title="Tweets de cuentas oficiales del gobierno en 2019",
                      subtitle="Acumulados semanales", y="", x=""))
    return plot, agg_data
def test_annotation_stripes_coord_flip():
    """Striped annotation under jittered mtcars points with flipped coords.

    ``gear`` and ``am`` are converted to categoricals, a black vertical line
    is drawn at each of 0.5, 1.5, 2.5 and 3.5, and the plot is asserted equal
    to ``"annotation_stripes_coord_flip"``.
    """
    pdf = mtcars.assign(
        gear=pd.Categorical(mtcars.gear),
        am=pd.Categorical(mtcars.am),
    )

    p = ggplot(pdf) + annotation_stripes(
        fills=["#AAAAAA", "#FFFFFF", "#7F7FFF"], alpha=0.3)
    p = p + geom_jitter(
        aes("gear", "wt", shape="gear", color="am"), random_state=5)
    for boundary in (0.5, 1.5, 2.5, 3.5):
        p = p + geom_vline(xintercept=boundary, color="black")
    # work around #229
    p = p + scale_shape_discrete(guide=guide_legend(order=1))
    p = p + coord_flip()

    assert p == "annotation_stripes_coord_flip"
def scatter_plot(df, xcol, ycol, domain, xname=None, yname=None, log=False,
                 width=6, height=6, clamp=True, tickCount=5):
    """Scatter plot of ``df[ycol]`` against ``df[xcol]`` on a square domain.

    A dashed diagonal (y = x) and dashed rules at the upper domain bound are
    drawn on top of the points.

    :param df: data frame holding the points
    :param xcol: column used for the x axis
    :param ycol: column used for the y axis
    :param domain: pair ``(lower, upper)`` used as limits for both axes
    :param xname: x-axis label; defaults to ``xcol``
    :param yname: y-axis label; defaults to ``ycol``
    :param log: use log10 scales on both axes
    :param width: figure width
    :param height: figure height
    :param clamp: if true, values above ``domain[1]`` are set to
        ``domain[1]`` (on a copy of ``df``)
    :param tickCount: currently unused
    :return: the plotnine plot
    """
    assert len(domain) == 2

    POINT_SIZE = 0.5
    DASH_PATTERN = (0, (3, 1))

    if xname is None:
        xname = xcol
    if yname is None:
        yname = ycol

    # formatter for axes' labels
    ax_formatter = mizani.custom_format('{:n}')

    if clamp:  # clamp overflowing values if required
        df = df.copy(deep=True)
        df.loc[df[xcol] > domain[1], xcol] = domain[1]
        df.loc[df[ycol] > domain[1], ycol] = domain[1]

    # generate scatter plot
    scatter = p9.ggplot(df)
    scatter += p9.aes(x=xcol, y=ycol)
    scatter += p9.geom_point(size=POINT_SIZE, na_rm=True)
    scatter += p9.labs(x=xname, y=yname)

    if log:  # log scale
        scatter += p9.scale_x_log10(limits=domain, labels=ax_formatter)
        scatter += p9.scale_y_log10(limits=domain, labels=ax_formatter)
    else:
        scatter += p9.scale_x_continuous(limits=domain, labels=ax_formatter)
        scatter += p9.scale_y_continuous(limits=domain, labels=ax_formatter)

    scatter += p9.theme_bw()
    scatter += p9.theme(
        panel_grid_major=p9.element_line(color='#666666', alpha=0.5))
    scatter += p9.theme(figure_size=(width, height))

    # generate additional lines
    scatter += p9.geom_abline(intercept=0, slope=1,
                              linetype=DASH_PATTERN)  # diagonal
    scatter += p9.geom_vline(xintercept=domain[1],
                             linetype=DASH_PATTERN)  # vertical rule
    scatter += p9.geom_hline(yintercept=domain[1],
                             linetype=DASH_PATTERN)  # horizontal rule

    return scatter
def test_annotation_stripes_continuous_scale():
    """Stripes with ``fill_range=True`` behind points mapped on raw ``x``."""
    stripe_edges = [0.5, 1.5, 2.5, 3.5]

    p = ggplot(df)
    p = p + annotation_stripes(fill_range=True)
    p = p + geom_point(aes('x', 'y'))
    p = p + geom_vline(xintercept=stripe_edges)

    assert p == 'annotation_stripes_continuous_scale'
def test_annotation_stripes_fill_range():
    """Stripes with ``fill_range=True`` behind points mapped on ``factor(x)``."""
    stripe_edges = [0.5, 1.5, 2.5, 3.5]

    p = ggplot(df)
    p = p + annotation_stripes(fill_range=True)
    p = p + geom_point(aes('factor(x)', 'y'))
    p = p + geom_vline(xintercept=stripe_edges)

    assert p == 'annotation_stripes_fill_range'
def plot_dist_with_ci(dist):
    """Histogram of ``dist['estimates']`` with its 2.5%/97.5% quantiles.

    The two quantiles are drawn as dotted vertical lines and, together with
    the mean, reported in the plot title.

    :param dist: data frame with an ``estimates`` column
    :return: the plotnine plot
    """
    # Take the quantiles of the plotted column only. Calling quantile() on
    # the whole frame returns one value per numeric column, which would not
    # match the bounds shown in the title.
    estimates = dist.estimates
    lower = estimates.quantile(0.025)
    upper = estimates.quantile(0.975)

    return (pn.ggplot(dist, pn.aes(x='estimates'))
            + pn.geom_histogram(bins=25)
            + pn.geom_vline(
                xintercept=lower,
                color="#FF5500",
                size=2,
                linetype='dotted',
            )
            + pn.geom_vline(
                xintercept=upper,
                color="#FF5500",
                size=2,
                linetype='dotted',
            )
            + pn.ggtitle("${0:,.0f} ({1:,.0f}, {2:,.0f})".format(
                np.mean(estimates),
                lower,
                upper,
            )))
def test_annotation_stripes_coord_flip():
    """Default stripes behind ``factor(x)`` points, with flipped coordinates."""
    stripe_edges = [0.5, 1.5, 2.5, 3.5]

    p = ggplot(df)
    p = p + annotation_stripes()
    p = p + geom_point(aes('factor(x)', 'y'))
    p = p + geom_vline(xintercept=stripe_edges)
    p = p + coord_flip()

    assert p == 'annotation_stripes_coord_flip'
def scatter_plot2(df1, df2, xcol, ycol, domain, color1='black', color2='red',
                  xname=None, yname=None, log=False, width=6, height=6,
                  clamp=True, tickCount=5):
    """Overlay the scatter plots of two data frames on a square domain.

    Both frames are drawn with the same ``xcol``/``ycol`` mapping, ``df1`` in
    ``color1`` and ``df2`` in ``color2``, each with a rug on the top and
    right sides. A dashed diagonal and dashed rules at ``domain[1]`` are
    added. With ``clamp`` set, values above ``domain[1]`` are replaced by
    ``domain[1]`` on copies of the inputs. ``tickCount`` is not used.
    """
    assert len(domain) == 2

    POINT_SIZE = 1.5
    DASH_PATTERN = (0, (6, 2))

    xname = xcol if xname is None else xname
    yname = ycol if yname is None else yname

    # formatter for axes' labels
    ax_formatter = mizani.custom_format('{:n}')

    def _clamped(frame):
        # Work on a deep copy so the caller's frame is left untouched.
        frame = frame.copy(deep=True)
        for column in (xcol, ycol):
            frame.loc[frame[column] > domain[1], column] = domain[1]
        return frame

    if clamp:  # clamp overflowing values if required
        df1 = _clamped(df1)
        df2 = _clamped(df2)

    if log:  # log scale
        x_scale, y_scale = p9.scale_x_log10, p9.scale_y_log10
    else:
        x_scale, y_scale = p9.scale_x_continuous, p9.scale_y_continuous

    components = [
        p9.aes(x=xcol, y=ycol),
        # points of both frames
        p9.geom_point(size=POINT_SIZE, na_rm=True, color=color1, alpha=0.5),
        p9.geom_point(size=POINT_SIZE, na_rm=True, data=df2, color=color2,
                      alpha=0.5),
        p9.labs(x=xname, y=yname),
        # rug plots
        p9.geom_rug(na_rm=True, sides="tr", color=color1, alpha=0.05),
        p9.geom_rug(na_rm=True, sides="tr", data=df2, color=color2,
                    alpha=0.05),
        # axes
        x_scale(limits=domain, labels=ax_formatter),
        y_scale(limits=domain, labels=ax_formatter),
        # theming
        p9.theme_bw(),
        p9.theme(panel_grid_major=p9.element_line(color='#666666',
                                                  alpha=0.5)),
        p9.theme(panel_grid_minor=p9.element_blank()),
        p9.theme(figure_size=(width, height)),
        p9.theme(text=p9.element_text(size=24, color="black")),
        # additional lines: diagonal, vertical rule, horizontal rule
        p9.geom_abline(intercept=0, slope=1, linetype=DASH_PATTERN),
        p9.geom_vline(xintercept=domain[1], linetype=DASH_PATTERN),
        p9.geom_hline(yintercept=domain[1], linetype=DASH_PATTERN),
    ]

    # generate scatter plot
    scatter = p9.ggplot(df1)
    for component in components:
        scatter += component
    return scatter
def density_plot1(num_matches_per_round: int,
                  match_lengths_from_one_round: list):
    """
    Density plot for match lengths, new rules, no blowouts, 85 matches/round

    The figure gets a black vertical line at x = 50, x limits of [0, 55] and
    is saved to 'figures/match_length_density_plot.png'.
    ``num_matches_per_round`` is not used by this function.
    """
    match_lengths = pd.DataFrame(
        {'Match length': match_lengths_from_one_round})

    figure = plt.ggplot(match_lengths, plt.aes(x='Match length'))
    figure = figure + plt.geom_density()
    figure = figure + plt.geom_vline(xintercept=50, color='black', size=2)
    figure = figure + plt.theme_classic()
    figure = figure + plt.xlim([0, 55])

    figure.save(filename='figures/match_length_density_plot.png')
def plot_and_save(scale_data_df_cleaned, smooth_factor, temp_file_name):
    """Plot weight over time with a smoothed trend and save it to a file.

    A blue vertical line marks 2019-10-15 and an 'IF starts!' label is placed
    at 2019-11-30, at the height of the maximum weight.

    :param scale_data_df_cleaned: data frame with ``timestamp`` and
        ``weight`` columns
    :param smooth_factor: ``span`` passed to ``geom_smooth``
    :param temp_file_name: output file the plot is saved to
    """
    fasting_start = to_datetime('2019-10-15')
    label_x = to_datetime('2019-11-30')
    label_y = max(scale_data_df_cleaned.loc[:, 'weight'])

    plot_output = ggplot(scale_data_df_cleaned,
                         aes(x='timestamp', y='weight'))
    plot_output = plot_output + geom_point(size=0.5)
    plot_output = plot_output + geom_smooth(span=smooth_factor, color='red')
    plot_output = plot_output + geom_vline(
        aes(xintercept=fasting_start), color='blue', size=1.2)
    plot_output = plot_output + geom_label(
        aes(x=label_x, y=label_y), label='IF starts!', size=15)

    plot_output.save(temp_file_name, width=13, height=10, dpi=80)
def plot_elos():
    """Plot the win rate ``1 / (1 + 10**(-d / 400))`` for d in [-1000, 1000].

    Faint guide lines are drawn at a difference of 0 and a win rate of 0.5.
    """
    elo_diff = np.linspace(-1000, +1000)
    win_rate = 1 / (1 + 10**(-elo_diff / 400))
    curve = pd.DataFrame({'elo': elo_diff, 'winrate': win_rate})

    figure = pn.ggplot(curve)
    figure = figure + pn.geom_line(pn.aes(x='elo', y='winrate'))
    figure = figure + pn.geom_vline(xintercept=0, alpha=.1)
    figure = figure + pn.geom_hline(yintercept=.5, alpha=.1)
    figure = figure + pn.labs(x='Own Elo relative to opponent\'s Elo',
                              y='Win rate v. opponent')
    figure = figure + pn.scale_y_continuous(labels=percent_format())
    figure = figure + pn.coord_cartesian(expand=False)
    figure = figure + plot.IEEE()
    return figure
def estimate_cutoffs_plot(output_file, df_plt, df_cell_estimate_cutoff,
                          df_fit=None, scale_x_log10=False, save_plot=True):
    """Plot UMI counts by sorted cell barcodes.

    Parameters
    ----------
    output_file : str
        Output path without extension; ``.png`` is appended when saving.
    df_plt : pandas.DataFrame
        Data with ``barcode`` and ``umi_counts`` columns. Not modified.
    df_cell_estimate_cutoff : pandas.DataFrame
        Cutoffs with ``n_cells`` and ``method`` columns, drawn as vertical
        lines colored by method.
    df_fit : pandas.DataFrame, optional
        Fit of the droplet utils model with ``x`` and ``y`` columns, drawn
        as a yellow line when given.
    scale_x_log10 : bool
        Use a log10 x axis.
    save_plot : bool
        Save the figure to ``<output_file>.png``.

    Returns
    -------
    plotnine.ggplot
        The assembled plot.
    """
    # The y axis is log10-scaled, so counts must be strictly positive. Shift
    # them so the minimum becomes 1, working on a copy so the caller's data
    # frame is left untouched.
    min_count = df_plt['umi_counts'].min()
    if min_count <= 0:
        df_plt = df_plt.assign(
            umi_counts=df_plt['umi_counts'] + (1 - min_count))

    gplt = plt9.ggplot()
    gplt = gplt + plt9.theme_bw()

    # Individual points for up to 50000 rows; a line for larger inputs.
    if len(df_plt) <= 50000:
        gplt = gplt + plt9.geom_point(
            mapping=plt9.aes(x='barcode', y='umi_counts'),
            data=df_plt, alpha=0.05, size=0.1)
    else:
        gplt = gplt + plt9.geom_line(
            mapping=plt9.aes(x='barcode', y='umi_counts'),
            data=df_plt, alpha=0.25, size=0.75, color='black')

    gplt = gplt + plt9.geom_vline(
        mapping=plt9.aes(xintercept='n_cells', color='method'),
        data=df_cell_estimate_cutoff, alpha=0.75, linetype='dashdot')
    gplt = gplt + plt9.scale_color_brewer(palette='Dark2', type='qual')

    if scale_x_log10:
        gplt = gplt + plt9.scale_x_continuous(
            trans='log10', labels=comma_labels, minor_breaks=0)
    else:
        gplt = gplt + plt9.scale_x_continuous(
            labels=comma_labels, minor_breaks=0)
    gplt = gplt + plt9.scale_y_continuous(
        trans='log10', labels=comma_labels, minor_breaks=0)
    gplt = gplt + plt9.labs(title='', y='UMI counts',
                            x='Barcode index, sorted by UMI count',
                            color='Cutoff')

    # Add the fit of the droplet utils model. The truth value of a
    # DataFrame is ambiguous, so test for None explicitly.
    if df_fit is not None:
        gplt = gplt + plt9.geom_line(
            mapping=plt9.aes(x='x', y='y'),
            data=df_fit, alpha=1, color='yellow')

    if save_plot:
        gplt.save('{}.png'.format(output_file), dpi=300, width=5, height=4)
    return gplt
def test_annotation_stripes_faceting():
    """Stripes with ``fill_range='no'`` in a plot faceted on ``g``.

    The module-level ``df`` is stacked twice, with the first copy labelled
    'a' and the second 'b' in column ``g``.
    """
    n = len(df)
    stacked = pd.DataFrame({
        'x': np.concatenate([df['x'], df['x']]),
        'y': np.concatenate([df['y'], df['y']]),
        'g': ['a'] * n + ['b'] * n,
    })

    p = ggplot()
    p = p + annotation_stripes(fill_range='no')
    p = p + geom_point(stacked, aes('factor(x)', 'y'))
    p = p + geom_vline(xintercept=[0.5, 1.5, 2.5, 3.5])
    p = p + facet_wrap('g')

    assert p == 'annotation_stripes_faceting'
def plotLatentVsObserved(self, value, *, latent_min=-15, latent_max=10,
                         npoints=200, wt_vline=True):
    """Plot observed enrichment/phenotype as function of latent phenotype.

    Parameters
    ----------
    value : {'enrichment', 'phenotype'}
        Which observed quantity to plot; forwarded to `latentToObserved`.
    latent_min : float
        Lower end of the latent-phenotype range that is plotted.
    latent_max : float
        Upper end of the latent-phenotype range that is plotted.
    npoints : int
        Number of evenly spaced latent values the line is drawn through.
    wt_vline : bool
        Add a dashed vertical line at `self.wt_latent`.

    Returns
    -------
    plotnine.ggplot.ggplot
        Line plot of the observed value against the latent phenotype.

    """
    latent_grid = numpy.linspace(latent_min, latent_max, npoints)
    curve = pd.DataFrame({
        "latent": latent_grid,
        "observed": self.latentToObserved(latent_grid, value),
    })

    p = p9.ggplot(curve, p9.aes("latent", "observed"))
    p = p + p9.geom_line()
    p = p + p9.theme(figure_size=(3.5, 2.5))
    p = p + p9.xlab("latent phenotype")
    p = p + p9.ylab(f"observed {value}")

    if not wt_vline:
        return p
    return p + p9.geom_vline(xintercept=self.wt_latent,
                             color=CBPALETTE[1],
                             linetype="dashed")
def plot_replicate_density(
    df,
    batch,
    plate,
    cutoff,
    percent_strong,
    output_file_base=None,
    output_file_extensions=None,
    dpi=300,
    height=1.5,
    width=2,
    return_plot=False,
):
    """Density plot of ``similarity_metric`` split by ``group_replicate``.

    Parameters
    ----------
    df : pandas.DataFrame
        Data with ``similarity_metric`` and ``group_replicate`` columns.
    batch, plate
        Shown in the plot title.
    cutoff : float
        x position of the dashed red vertical line.
    percent_strong : float
        Fraction shown in the title as "Percent Replicating" (multiplied by
        100 and rounded to two decimals).
    output_file_base : str, optional
        When given, the figure is written through ``save_figure``.
    output_file_extensions : list of str, optional
        Extensions passed to ``save_figure``; defaults to
        ``[".png", ".pdf", ".svg"]``.
    dpi, height, width
        Passed to ``save_figure``.
    return_plot : bool
        Return the plot object instead of ``None``.

    Returns
    -------
    plotnine.ggplot or None
        The plot when ``return_plot`` is true, otherwise ``None``.
    """
    # Resolve the default here rather than in the signature so that no list
    # object is shared between calls.
    if output_file_extensions is None:
        output_file_extensions = [".png", ".pdf", ".svg"]

    density_gg = (
        gg.ggplot(df, gg.aes(x="similarity_metric", fill="group_replicate"))
        + gg.geom_density(alpha=0.3)
        + gg.scale_fill_manual(
            name="Replicate",
            labels={"True": "True", "False": "False"},
            values=["#B99638", "#2DB898"],
        )
        + gg.xlab("Pearson Correlation")
        + gg.ylab("Density")
        + gg.geom_vline(xintercept=cutoff, color="red", linetype="dashed")
        + gg.ggtitle(
            f"{batch}; Plate: {plate}\n\nPercent Replicating: {np.round(percent_strong * 100, 2)}%"
        )
        + gg.theme_bw()
        + gg.theme(
            title=gg.element_text(size=3.5),
            axis_text=gg.element_text(size=4),
            axis_title=gg.element_text(size=4),
            legend_text=gg.element_text(size=4),
            legend_title=gg.element_text(size=4),
            strip_text=gg.element_text(size=4, color="black"),
            strip_background=gg.element_rect(colour="black", fill="#fdfff4"),
        )
    )

    if output_file_base:
        save_figure(
            density_gg, output_file_base, output_file_extensions, dpi, height,
            width
        )

    return density_gg if return_plot else None
def plotMutsHistogram(self, value, *, mutant_order=1, bins=30, wt_vline=True):
    """Plot distribution of phenotype for all mutants of a given order.

    Parameters
    ----------
    value : {'latentPhenotype', 'observedPhenotype', 'observedEnrichment'}
        Name of the method of this object used to compute the plotted value.
    mutant_order : int
        Order of the mutants to plot. Only 1 (single mutants) is implemented.
    bins : int
        Number of histogram bins.
    wt_vline : bool
        Add a dashed vertical line at the value computed for ``''``
        (the wildtype).

    Returns
    -------
    plotnine.ggplot.ggplot
        Histogram of the chosen value over all keys of `self.muteffects`.

    Raises
    ------
    ValueError
        If `mutant_order` is not 1 or `value` is not one of the allowed names.

    """
    allowed_values = {'latentPhenotype', 'observedPhenotype',
                      'observedEnrichment'}
    if mutant_order != 1:
        raise ValueError('only implemented for `mutant_order` of 1')
    if value not in allowed_values:
        raise ValueError(f"invalid `value` of {value}")

    func = getattr(self, value)
    per_mutant = pd.DataFrame(
        {value: [func(mutation) for mutation in self.muteffects.keys()]})

    p = p9.ggplot(per_mutant, p9.aes(value))
    p = p + p9.geom_histogram(bins=bins)
    p = p + p9.theme(figure_size=(3.5, 2.5))
    p = p + p9.ylab(f"number of {mutant_order}-mutants")

    if not wt_vline:
        return p
    return p + p9.geom_vline(xintercept=func(''),
                             color=CBPALETTE[1],
                             linetype='dashed')
def plot_pred_hist(label_list, pred_list, names=None, n_bins=10):
    """
    Draw histograms of the true labels and the predicted probabilities.

    One facet is drawn per model; within a facet the labels and the
    predictions are overlaid, and their means are drawn as vertical lines.

    :param: label_list: sequence of ground-truth label sequences, given like
        [(y1, y2, ...), (y1, y2, ...)]; entries correspond to pred_list
    :param: pred_list: sequence of predicted-probability sequences; must have
        the same length as label_list
    :param: names=None: names of the models; None or a sequence of the same
        length. When omitted, ('train', 'test') is used for 2 label sets,
        ('train', 'valid', 'test') for 3, and running integers otherwise.
    :param: n_bins: number of histogram bins
    :return: plotnine object
    TODO: decide how the geom_vline lines should be displayed
    """
    if names is None:
        if len(label_list) == 2:
            names = ('train', 'test')
        elif len(label_list) == 3:
            names = ('train', 'valid', 'test')
        else:
            names = list(range(len(label_list)))

    # name -> position, and str(position) -> name for the facet labeller
    name_order = {name: position for position, name in enumerate(names)}
    name_order_rev = {
        str(position): name for name, position in name_order.items()}

    d = pd.DataFrame({
        'y': list(chain.from_iterable(label_list)),
        'prediction': list(chain.from_iterable(pred_list)),
    }).assign(
        model=list(chain.from_iterable(
            [[name] * len(labels) for name, labels in zip(names, label_list)]))
    ).melt(
        id_vars='model'
    ).assign(
        order=lambda x: x.model.replace(name_order)
    ).sort_values(['order', 'variable'])

    # data used to draw the means as reference lines
    d_mean = d.drop(columns='order').groupby(['variable', 'model']).mean(
    ).reset_index().rename(columns={'value': 'mean'})
    d = d.merge(d_mean, on=['variable', 'model'])

    return (
        ggplot(
            d,
            aes(x='value', y='..density..', group='variable', fill='variable')
        )
        # honour the requested number of bins (previously fixed at 10)
        + geom_histogram(position='identity', alpha=.5, bins=n_bins)
        + geom_vline(
            aes(xintercept='mean', group='variable', color='variable',
                linetype='variable')
        )
        + labs(x='prediction', fill='frequency', linetype='mean',
               color='mean')
        + facet_wrap(
            '~order', scales='free_y', labeller=lambda x: name_order_rev[x]
        )
        + theme_classic()
        + theme(figure_size=(6, 4))
    )
def main(argv: List[str]) -> None:
    """Simulate ability rolls, print summary statistics and save a bar plot.

    Command-line arguments (parsed from ``argv``):

    * ``roll_rule``: the rolling rule to simulate (a ``RollRule`` member)
    * ``--num_iterations``: number of simulated rolls (default 10000)
    * ``--seed``: optional seed for the ``random`` module
    * ``--plot_file``: output image path
      (default ``ability_roll_distribution.png``)
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("roll_rule", type=RollRule, choices=list(RollRule))
    parser.add_argument("--num_iterations", type=int, default=10000)
    parser.add_argument("--seed", type=int, default=None)
    parser.add_argument("--plot_file", default="ability_roll_distribution.png")
    args = parser.parse_args(argv)

    if args.seed is not None:
        random.seed(args.seed)

    # Run the simulation and process the data
    roll_counts = simulate(args.roll_rule, args.num_iterations)
    data = process_data(roll_counts)

    # Calculate statistics
    mean = sum(data["value"] * data["percent"] / 100.0)
    # idxmax() returns an index *label*, so look the row up with .loc;
    # positional .iloc is only equivalent for a default 0..n-1 index.
    mode = data.loc[data["count"].idxmax(), "value"]
    stddev = math.sqrt(
        sum(data["percent"] / 100.0 * (data["value"] - mean)**2.0))
    skewness = pearson_first_skewness(mean, mode, stddev)

    # Print out result information
    print(data)
    print()
    print("Mean:", mean)
    print("Mode:", mode)
    print("Standard deviation:", stddev)
    print("Skewness:", skewness)

    # Plot the data
    plot = (plt9.ggplot(data, plt9.aes("value", "percent"))
            + plt9.geom_bar(stat="identity")
            + plt9.geom_vline(xintercept=mean, color="black")
            + plt9.xlim(0, 21)
            + plt9.ylab("Chance (%)")
            + plt9.xlab("Ability Score")
            + plt9.ggtitle("Ability Score Distribution ({} iterations)".format(
                args.num_iterations)))
    plot.save(args.plot_file, dpi=300)
    print("Wrote plot image to:", args.plot_file)
def hist_residuals(self, figure_size=(8, 4), sample_frac=1.0):
    """Histogram of the ``residual`` column of ``self.df``

    Parameters
    ----------
    figure_size : tuple(int, int), optional
        default=(8, 4)
        Size of the figure as (width, height)
    sample_frac : float, optional
        default=1.0
        Fraction of the rows of ``self.df`` that is sampled for the plot

    Returns
    -------
    plot : ggplot object
        Histogram with a dashed red vertical line at zero
    """
    sampled = self.df.sample(frac=sample_frac)

    plot = ggplot(sampled, aes(x="residual"))
    plot = plot + geom_histogram(fill="lightblue", colour="grey")
    plot = plot + geom_vline(xintercept=0, color="red", linetype="dashed")
    plot = plot + labs(title="Residuals", x="Residuals")
    plot = plot + theme(figure_size=figure_size)
    return plot
def plot_arima(df):
    """Plot two series of ``df`` over time with a Lower/Upper ribbon and save it.

    The second column of ``df`` is drawn in blue and the third in red
    (presumably the observed values and the forecast -- confirm with the
    caller). A dashed green line marks the last timestamp at which the second
    column is not missing. The ``Timestamp`` column of ``df`` is converted to
    datetimes in place. The figure is saved as ``<second column>_predict.png``
    in the ``png`` directory next to this file.

    :param df: data frame with ``Timestamp``, two value columns, ``Lower``
        and ``Upper``
    :return: the plotnine plot
    """
    df['Timestamp'] = pd.to_datetime(df['Timestamp'])

    first_series = df.columns.values[1]
    second_series = df.columns.values[2]
    last_known_time = max(
        df[['Timestamp', first_series]].dropna(axis=0)['Timestamp'])
    second_aes = aes(x='Timestamp', y=second_series)

    p = ggplot(data=df, mapping=aes(x='Timestamp', y=first_series))
    p = p + geom_point(colour='blue', alpha=0.3, na_rm=True)
    p = p + geom_line(colour='blue', na_rm=True)
    p = p + geom_point(mapping=second_aes, colour='red', alpha=0.3,
                       na_rm=True)
    p = p + geom_line(mapping=aes(x='Timestamp', y=second_series),
                      colour='red', na_rm=True)
    p = p + geom_vline(xintercept=last_known_time, color='green',
                       linetype='dashed')
    p = p + geom_ribbon(data=df, mapping=aes(ymin='Lower', ymax='Upper'),
                        fill='red', alpha=0.1)
    p = p + scale_x_datetime(breaks='1 days', date_labels='%y-%m-%d %H:%M')
    p = p + xlab('Time')
    p = p + ylab(first_series)
    p = p + theme_bw()
    p = p + theme(
        axis_text_x=element_text(angle=45, hjust=1, face='bold',
                                 color='black'),
        axis_text_y=element_text(face='bold', colour='black'))

    output_dir = os.path.join(
        os.path.abspath(os.path.dirname(__file__)), 'png')
    ggplot.save(p,
                filename=first_series + '_predict.png',
                path=output_dir,
                width=8, height=6, units='in', dpi=326, verbose=False)
    return p
def density_plot2(num_matches_per_round: int,
                  match_lengths_from_one_round: list,
                  match_lengths_from_one_round_with_blowouts: list):
    """
    Density plot for match lengths, new rules, blowouts vs. no blowouts, 85 matches/round

    The two inputs are stacked into one frame; the first
    ``num_matches_per_round`` rows are tagged 'No' and the next
    ``num_matches_per_round`` rows 'Yes' in the ``Blowouts`` column. The
    figure is saved to 'figures/match_length_with_blowout_density_plot.png'.
    """
    lengths = np.concatenate([
        match_lengths_from_one_round,
        match_lengths_from_one_round_with_blowouts,
    ])
    blowout_flags = np.concatenate([
        np.repeat('No', num_matches_per_round),
        np.repeat('Yes', num_matches_per_round),
    ])
    match_lengths_blowout = pd.DataFrame({
        'Match length': lengths,
        'Blowouts': blowout_flags,
    })

    figure = plt.ggplot(match_lengths_blowout,
                        plt.aes(x='Match length', color='Blowouts'))
    figure = figure + plt.geom_density()
    figure = figure + plt.geom_vline(xintercept=50, color='black', size=2)
    figure = figure + plt.xlim([0, 55])
    figure = figure + plt.theme_classic()

    figure.save(
        filename='figures/match_length_with_blowout_density_plot.png')
np.sqrt(np.diag(event_study_formula.cov_params().loc[lags][lags])) ]), 'mean': np.concatenate([ event_study_formula.params[leads], np.array([0]), event_study_formula.params[lags] ]), 'label': np.arange(-9, 6) }) leadslags_plot['lb'] = leadslags_plot['mean'] - leadslags_plot['sd'] * 1.96 leadslags_plot['ub'] = leadslags_plot['mean'] + leadslags_plot['sd'] * 1.96 # This version has a point-range at each # estimated lead or lag # comes down to stylistic preference at the # end of the day! p.ggplot(leadslags_plot, p.aes(x = 'label', y = 'mean', ymin = 'lb', ymax = 'ub')) +\ p.geom_hline(yintercept = 0.035169444, color = "red") +\ p.geom_pointrange() +\ p.theme_minimal() +\ p.xlab("Years before and after castle doctrine expansion") +\ p.ylab("log(Homicide Rate)") +\ p.geom_hline(yintercept = 0, linetype = "dashed") +\ p.geom_vline(xintercept = 0, linetype = "dashed")
def rel_plot(sbs, variant, jitter=0.01):
    """Jittered log-log scatter of ``ratio`` against ``base`` for one variant.

    Rows of ``sbs`` whose ``variant`` column equals ``variant`` are plotted,
    colored by ``dataset``. The curve y = 1/x, dashed horizontal rules at 1.1
    and 0.9 and dashed vertical rules at 0.1 and 0.3 are overlaid.

    :param sbs: data frame with at least the columns variant, base, ratio,
        dataset, session_index and base_session_index
    :param variant: value of the ``variant`` column to keep
    :param jitter: jitter width and height
    :return: the plotnine plot
    """
    xcol = "base"
    ycol = "ratio"

    selected = sbs[sbs.variant == variant]
    plotdata = selected.assign(x=selected[xcol], y=selected[ycol])
    plotdata = plotdata.assign(sbs_index=plotdata.index.values)

    session_pairs = plotdata[["session_index", "base_session_index"]].apply(
        tuple, axis=1)
    plotdata = plotdata.assign(
        session_text=session_pairs.map(lambda tup: f"{tup[0]} vs. {tup[1]}"))

    # reference curve y = 1 / x
    x = np.geomspace(0.02, 1, num=5)
    diag_df = pd.DataFrame({"x": x, "y": 1 / x})

    def dashed_rule(geom, **mapping):
        # shared look of the horizontal and vertical guide lines
        return geom(aes(**mapping), linetype="dashed", alpha=0.7)

    plot_theme = theme(
        figure_size=(8, 5),
        legend_position="top",
        subplots_adjust={"hspace": 0.5},
        legend_title=element_blank(),
        legend_box_margin=-1,
        legend_margin=0.0,
        axis_text=element_text(size=12, margin={"t": 0.2, "l": -0.3}),
        legend_text=element_text(size=11),
        axis_title=element_text(
            size=12, margin={"r": -0.2, "b": 0.0, "l": 0, "t": 0.0}),
    )

    scatterplot = ggplot(plotdata)
    scatterplot = scatterplot + geom_jitter(
        aes(x="x", y="y", fill="dataset", color="dataset"),
        width=jitter,
        height=jitter,
        alpha=0.6,
        size=1.0,
    )
    scatterplot = scatterplot + geom_line(aes(x="x", y="y"), data=diag_df)
    scatterplot = scatterplot + ylab(ycol)
    scatterplot = scatterplot + dashed_rule(geom_hline, yintercept=1.1)
    scatterplot = scatterplot + dashed_rule(geom_hline, yintercept=0.9)
    scatterplot = scatterplot + dashed_rule(geom_vline, xintercept=0.1)
    scatterplot = scatterplot + dashed_rule(geom_vline, xintercept=0.3)
    scatterplot = scatterplot + xlab(xcol)
    scatterplot = scatterplot + plot_theme
    scatterplot = scatterplot + scale_x_log10(
        labels=make_labeler(brief_format), breaks=[0.01, 0.1, 0.3, 1.0])
    scatterplot = scatterplot + scale_y_log10(
        labels=make_labeler(brief_format),
        breaks=[0.5, 0.9, 1.1, 2.0, 3.0, 6, 12])
    return scatterplot
def scatter_cell_cycle(
    adata,
    scores="signatures",
    size=1.5,
    alpha=1,
    curvature_shrink=1,
    lab_ypos=2,
):
    """Plots cell cycle signatures vs pseudotime

    Parameters
    ----------------
    adata: AnnData
        The AnnData object being used for the analysis. Must be previously
        evaluated by `tl.cell_cycle_phase`.
    scores: str
        A string indicating what to plot as cell cycle scores against
        pseudotime. If 'signatures', standard S-phase, G2-M and Histones
        signatures are used; if 'components', the 4 cell cycle related
        components are used.
    size: float
        Controls the point size of the plot.
    alpha: float
        A value between 0 and 1. Controls point transparency.
    curvature_shrink: float
        Divisor applied to the z-scored curvature before it is overlaid.
    lab_ypos: float
        Controls the y-axis position of the cell cycle phase annotation.

    Returns
    --------------
    A plotnine scatter plot of pseudotime vs 3 cell cycle signatures.

    Raises
    --------------
    ValueError
        If `scores` is neither 'signatures' nor 'components'.
    """
    if scores == "signatures":
        y = ["S-phase", "G2-M", "Histones"]
        colors = ["#66c2a5", "#fc8d62", "#8da0cb", "black"]
    elif scores == "components":
        _add_compScores(adata)
        y = ["G1/S comp", "G2/M+ comp", "G2/M- comp", "Histones comp"]
        colors = ["#66c2a5", "#fc8d62", "#8da0cb", "#e5c494", "black"]
    else:
        raise ValueError(
            f"`scores` must be 'signatures' or 'components', got {scores!r}")

    time_scatter = scatter_pseudotime(
        adata, y=y, size=size, alpha=alpha) + labs(
            x="Pseudotime", y="Signature scores", color="Signature")

    # -- Without cell cycle annotations only the signature scatter is drawn
    if "cell_cycle_division" not in adata.uns["scycle"]:
        return time_scatter

    cc_divs = adata.uns["scycle"]["cell_cycle_division"]

    # -- Curvature data
    # Work on a copy: the rescaled curvature is for display only and must
    # not overwrite the values stored in `adata.uns`.
    curv_data = cc_divs["curvature"].copy()
    cvz = zscore(curv_data["curvature"].values) / curvature_shrink
    cvz = cvz - np.max(cvz)  # shift so the maximum sits at 0
    curv_data["curvature"] = cvz
    curv_data["signature"] = "Curvature"

    # -- Peak data (for segments)
    gr_min = np.min(curv_data["curvature"])
    # explicit copy so the new column is not assigned through a slice
    pk_data = curv_data[curv_data["ispeak"] == "peak"].copy()
    pk_data["ymin"] = gr_min

    # -- Cell cycle annotation
    cc_phase = pd.DataFrame(
        dict(
            starts=[
                None,
                cc_divs["s_start"],
                cc_divs["g2_start"],
                cc_divs["m_start"],
            ],
            labels=["G1", "S", "G2", "M"],
            labpos=[
                np.mean([0, cc_divs["s_start"]]),
                np.mean([cc_divs["s_start"], cc_divs["g2_start"]]),
                np.mean([cc_divs["g2_start"], cc_divs["m_start"]]),
                np.mean([cc_divs["m_start"], 1]),
            ],
            y=lab_ypos,
        ))

    cell_cycle_plt = (
        time_scatter
        + geom_point(aes("pseudotime", "curvature", color="signature"),
                     data=curv_data)
        + geom_line(aes("pseudotime", "curvature"), data=curv_data)
        + scale_color_manual(values=colors)
        + geom_segment(
            aes(x="pseudotime", xend="pseudotime", y="ymin",
                yend="curvature"),
            linetype="dotted",
            data=pk_data,
        )
        + geom_vline(
            aes(xintercept="starts"), linetype="dashed", data=cc_phase)
        + geom_text(aes(x="labpos", y="y", label="labels"), data=cc_phase))
    return cell_cycle_plt
def test_aes_inheritance():
    """A geom_vline without its own mapping must end in a PlotnineError.

    ``xintercept`` is mapped only at the plot level; building or drawing the
    plot is expected to raise.
    """
    with pytest.raises(PlotnineError):
        base = ggplot(df, aes('x', 'y', xintercept='xintercept'))
        p = base + geom_point() + geom_vline(size=2)
        p.draw_test()
def main():
    """Generate and save every figure.

    Each figure is built from a data frame produced by one of the model
    functions and written through ``save_both`` under the given base name.
    """
    mpl.rc('mathtext', fontset='cm')
    # Silence the routine plotnine messages about dropped rows and saving.
    warnings.filterwarnings('ignore', r'(geom|position)_\w+ ?: Removed \d+ rows')
    warnings.filterwarnings('ignore', r'Saving .+ x .+ in image')
    warnings.filterwarnings('ignore', r'Filename: .+\.png')

    df = concat_map(Pf_Ob_Ol, 'P_f', np.linspace(0.1, 1, 10))
    save_both(
        my_plot(df, 'O_b', 'O_l', 'P_f')
        + titles('P_f(O_b, O_l)')
        + limits((1, 10))
        + gg.geom_abline(slope=1, intercept=0, linetype='dashed', color='grey')
        + gg.geom_line(),
        'Pf_Ob_Ol')

    df = concat_map(Pf_Ob_σ, 'P_f', np.linspace(0.1, 1, 10))
    save_both(
        my_plot(df, 'O_b', 'σ', 'P_f')
        + titles('P_f(O_b, σ)')
        + limits((1, 10), (0, 5))
        + gg.geom_line(),
        'Pf_Ob_σ')

    df = concat_map(Pq_Ob_Ol, 'P_q', np.linspace(-0.9, 0, 10))
    save_both(
        my_plot(df, 'O_b', 'O_l', 'P_q')
        + titles('P_q(O_b, O_l)')
        + limits((1, 10))
        + gg.geom_abline(slope=1, intercept=0, linetype='dashed', color='grey')
        + gg.geom_line(),
        'Pq_Ob_Ol')

    df = concat_map(Pq_Ob_σ, 'P_q', np.linspace(-0.9, 0, 10))
    save_both(
        my_plot(df, 'O_b', 'σ', 'P_q')
        + titles('P_q(O_b, σ)')
        + limits((1, 10), (0, 5))
        + gg.geom_line(),
        'Pq_Ob_σ')

    df = concat_map(Opr_Ob_Ol, 'Opr', np.linspace(1, 5, 9))
    save_both(
        my_plot(df, 'O_b', 'O_l', 'Opr')
        + titles("O'(O_b, O_l)")
        + limits((1, 10), (1, 10))
        + gg.geom_line()
        + gg.geom_abline(slope=1, intercept=0, linetype='dashed', color='grey'),
        'Opr_Ob_Ol')

    df = concat_map(Opr_Ob_σ, 'Opr', np.linspace(1, 5, 9))
    save_both(
        my_plot(df, 'O_b', 'σ', 'Opr')
        + titles("O'(O_b, σ)")
        + limits((1, 10), (0, 5))
        + gg.geom_line(),
        'Opr_Ob_σ')

    df = (pd.DataFrame({'Opr': np.linspace(1, 21, 101)})
          .assign(Pf=lambda x: Opr_Pf(x.Opr)))
    save_both(
        my_plot(df, 'Opr', 'Pf')
        + titles("P_f(O')")
        + labs("O'", 'P_f')
        + limits((1, 20), (0, 1),
                 xbreaks=np.linspace(2, 20, 10),
                 ybreaks=np.linspace(0, 1, 11))
        + gg.geom_line()
        + gg.geom_hline(yintercept=C, linetype='dashed', color='grey'),
        'Pf_Opr')

    df = concat_map(σpr_Ob_σ, 'σpr', np.linspace(0, 5, 11))
    save_both(
        my_plot(df, 'O_b', 'σ', 'σpr')
        + titles("σ'(O_b, σ)")
        + limits((1, 10), (0, 5))
        + gg.geom_line(),
        'σpr_Ob_σ')

    df = (pd.DataFrame({'σpr': np.linspace(0, 21, 106)})
          .assign(Pq=lambda x: σpr_Pq(x.σpr)))
    save_both(
        my_plot(df, 'σpr', 'Pq')
        + titles("P_q(σ')")
        + labs("σ'", 'P_q')
        + limits((0, 20), (-1, 0),
                 xbreaks=np.linspace(0, 20, 11),
                 ybreaks=np.linspace(-1, 0, 11))
        + gg.geom_line(),
        'Pq_σpr')

    df = concat_map(liab_Ob_Ol_free, 'liab', np.linspace(0, 10, 11))
    save_both(
        my_plot(df, 'O_b', 'O_l', 'liab', clab='-R_{bl}')
        + titles("-R_{bl}(O_b, O_l)",
                 "S_b = 1, C_b = 0, C_l = 0.02",
                 mathrm('Free bet', dollars=False))
        + limits((1, 20), (1, 10))
        + gg.geom_line()
        + gg.geom_abline(slope=1, intercept=0, linetype='dashed', color='grey'),
        'liab_Ob_Ol_free')

    # NOTE(review): this figure plots 'σ' but its data comes from
    # liab_Ob_Ol_free, the same generator as the O_l figure above -- confirm
    # this is intended.
    df = concat_map(liab_Ob_Ol_free, 'liab', np.linspace(0, 10, 11))
    save_both(
        my_plot(df, 'O_b', 'σ', 'liab', clab='-R_{bl}')
        + titles("-R_{bl}(O_b, σ)",
                 "S_b = 1, C_b = 0, C_l = 0.02",
                 mathrm('Free bet', dollars=False))
        + limits((1, 20), (1, 10))
        + gg.geom_line(),
        'liab_Ob_σ_free')

    df = concat_map(liab_Ob_Ol_qual, 'liab', np.linspace(0, 10, 11))
    save_both(
        my_plot(df, 'O_b', 'O_l', 'liab', clab='-R_{bl}')
        + titles("-R_{bl}(O_b, O_l)",
                 "S_b = 1, C_b = 0, C_l = 0.02",
                 mathrm('Qualifying bet', dollars=False))
        + limits((1, 20), (1, 10))
        + gg.geom_line()
        + gg.geom_abline(slope=1, intercept=0, linetype='dashed', color='grey'),
        'liab_Ob_Ol_qual')

    # NOTE(review): as above, the σ figure reuses liab_Ob_Ol_qual -- confirm.
    df = concat_map(liab_Ob_Ol_qual, 'liab', np.linspace(0, 10, 11))
    save_both(
        my_plot(df, 'O_b', 'σ', 'liab', clab='-R_{bl}')
        + titles("-R_{bl}(O_b, σ)",
                 "S_b = 1, C_b = 0, C_l = 0.02",
                 mathrm('Qualifying bet', dollars=False))
        + limits((1, 20), (1, 10))
        + gg.geom_line(),
        'liab_Ob_σ_qual')

    df_Pf = Pf_Ob_σ(0.6).assign(profit=dollars('P_f'))
    df_Pq = Pq_Ob_σ(-0.3).assign(profit=dollars('P_q'))
    df = pd.concat((df_Pf, df_Pq), ignore_index=True)
    df.drop_duplicates('O_b', inplace=True)

    Opr = df_Pf.query('σ==0').O_b[0]
    σpr = df_Pq.query('O_b==1').σ[0]
    labels = pd.DataFrame({
        'x': [Opr+0.1, 1, 9.8],
        'y': [4.8, σpr, σpr + 0.3],
        'label': ["$O'$", "$σ'$", mathrm('More profit')]
    })
    lab_aes = gg.aes('x', 'y', label='label')

    save_both(
        gg.ggplot(df, gg.aes(x='O_b', y='σ'))
        + gg.geom_area(gg.aes(fill='profit'), alpha=0.3)
        + gg.geom_vline(xintercept=Opr, linetype='dashed')
        + gg.geom_hline(yintercept=σpr, linetype='dashed')
        # text alignment can't be specified in an aes, so each label gets its
        # own layer; label-based .loc slices replace the removed .ix indexer
        + gg.geom_text(lab_aes, data=labels.loc[:0], ha='left', va='top')
        + gg.geom_text(lab_aes, data=labels.loc[1:1], ha='left', va='bottom')
        + gg.geom_text(lab_aes, data=labels.loc[2:], ha='right', va='bottom')
        + gg.scale_fill_discrete(name=mathrm('Bet type'),
                                 labels=[mathrm('Free'), mathrm('Qualifying')])
        + limits((1, 10), (0, 5))
        + gg.ggtitle('%s "%s" %s' % (mathrm('Shape of the'),
                                     mathrm('more profitable'),
                                     mathrm('space')))
        + labs('O_b', 'σ'),
        'Px_shapes')
def _pseudotime_values(adata, name):
    """Return the per-cell values of `name`, or None when it is unknown.

    ``adata.obs`` is consulted first, then the expression matrix
    (``adata.var_names``), matching the lookup order used for plotting.
    """
    if name in adata.obs.columns:
        return adata.obs[name]
    if name in adata.var_names:
        values = adata[:, name].X
        # Densify first when X is sparse (anything exposing ``toarray``),
        # since only dense arrays can be flattened to one value per cell.
        if hasattr(values, 'toarray'):
            values = values.toarray()
        return np.asarray(values).flatten()
    return None


def pseudotime_scatter(adata, y, facet=True, size=1.5, alpha=1,
                       color='black', ncol=2, lab_ypos=2):
    """Plots a scatter plot of pseudotime vs one or multiple variables

    Parameters
    --------------
    adata: AnnData
        The AnnData object being used for the analysis. Must be previously
        evaluated by `tl.pseudotime` (``adata.obs['pseudotime']`` is read).
    y: str or list
        If `y` is a str, it must be a column of ``adata.obs`` or an entry of
        ``adata.var_names`` and is used as the y-axis.
        Otherwise `y` is treated as a collection of such names; they are
        plotted on a shared y-axis with one point color per variable.
        Names that cannot be found are dropped with a warning, and repeated
        names are plotted once.
    facet: bool
        Whether to return a facetted plot or all signatures in a single plot.
        Only used if `y` is a collection.
    size: float
        Controls the point size of the plot.
    alpha: float
        A value between 0 and 1. Controls point transparency.
    color: str
        Either the name of a column in ``adata.obs`` (points are then colored
        by that column) or a color name applied to every point. Only used if
        `y` is a str.
    ncol: int
        Number of columns in the facetting, if facet=True. Ignored otherwise.
    lab_ypos: float
        Controls the y-axis position of the cell cycle phase annotation,
        if present.

    Returns
    -------------
    A plotnine scatter plot of pseudotime.

    Raises
    -------------
    ValueError
        If `y` (or every entry of `y`) is found in neither ``adata.obs``
        nor ``adata.var_names``.
    """
    # isinstance (not ``type(y) == str``) so str subclasses are treated as a
    # single variable name instead of being iterated character by character.
    if isinstance(y, str):
        y_values = _pseudotime_values(adata, y)
        if y_values is None:
            raise ValueError('`y` variable not found')
        plot_df = pd.DataFrame({'x': adata.obs['pseudotime'], 'y': y_values})

        if color in adata.obs.columns:
            # The mapped column has to live in the plotted frame, otherwise
            # the aesthetic cannot be resolved. ``.values`` keeps the row
            # order without depending on index alignment.
            plot_df['color'] = adata.obs[color].values
            points = geom_point(aes(color='color'), size=size, alpha=alpha)
            axis_labels = labs(x='Pseudotime', y=y, color=color)
        else:
            points = geom_point(size=size, alpha=alpha, color=color)
            axis_labels = labs(x='Pseudotime', y=y)

        time_scatter = (ggplot(plot_df, aes(x='x', y='y'))
                        + points
                        + axis_labels
                        + theme_std)
    else:
        # De-duplicate while keeping the requested order; the category
        # reordering below requires each name exactly once.
        kept, missing = [], []
        for name in dict.fromkeys(y):
            known = name in adata.obs.columns or name in adata.var_names
            (kept if known else missing).append(name)

        if not kept:
            raise ValueError('No variables in `y` found.')
        if missing:
            warnings.warn('Variable not found! Dropping: '
                          + ', '.join(map(str, missing)))

        # Wide table (one column per variable), then melted to long form.
        sannot = pd.DataFrame({'pseudotime': adata.obs['pseudotime']})
        sannot['id'] = range(sannot.shape[0])
        for name in kept:
            sannot[name] = _pseudotime_values(adata, name)

        plot_df = pd.melt(sannot, id_vars=['id', 'pseudotime'],
                          var_name='signature', value_name='score')
        plot_df['signature'] = plot_df['signature'].astype('category')
        # Assign the result back: the ``inplace`` form is not available in
        # current pandas, and the order fixes legend/facet ordering.
        plot_df['signature'] = (
            plot_df['signature'].cat.reorder_categories(kept))

        time_scatter = ggplot(plot_df, aes('pseudotime', 'score'))
        if facet:
            time_scatter = time_scatter + facet_wrap(
                'signature', scales='free_y', ncol=ncol)
        time_scatter = (time_scatter
                        + geom_point(aes(color='signature'),
                                     alpha=alpha, size=size)
                        + theme_std)

    # Optional cell cycle phase annotation, drawn only when the divisions
    # are stored under adata.uns['scycle'].
    # NOTE(review): applied to both the single- and multi-variable plots;
    # confirm this matches the intended layout.
    scycle_info = adata.uns.get("scycle", {})
    if "cell_cycle_division" in scycle_info:
        cc_divs = scycle_info["cell_cycle_division"]
        pr_start = cc_divs["pr_start"]
        rep_start = cc_divs["rep_start"]
        cc_phase = pd.DataFrame(
            dict(
                # The first phase has no start value, so it contributes no
                # boundary position of its own.
                starts=[None, pr_start, rep_start],
                labels=["G1 PM", "G1 PR", "S/G2/M"],
                # Each label is centered within its phase; pseudotime is
                # taken to span 0..1 here.
                labpos=[
                    np.mean([0, pr_start]),
                    np.mean([pr_start, rep_start]),
                    np.mean([rep_start, 1]),
                ],
                y=lab_ypos,
            )
        )
        time_scatter = (time_scatter
                        + geom_vline(aes(xintercept="starts"),
                                     linetype="dashed", data=cc_phase)
                        + geom_text(aes(x="labpos", y="y", label="labels"),
                                    data=cc_phase))

    return time_scatter
def ggpca(x, y=None, center='col', scale='none',
          rlab=False, clab=False, cshow=None,
          rsize=4, csize=2, lsize=10, lnudge=0.03,
          ralpha=0.6, calpha=1.0, clightalpha=0,
          rname='sample', cname='variable', lname='',
          grid=True, printit=False, xsvd=None,
          invert1=False, invert2=False, colscale=None, **kwargs):
    """Build a PC1/PC2 biplot of the rows and columns of a data frame.

    Parameters
    ----------
    x : pandas.DataFrame
        Data to decompose. Columns containing any missing value are dropped
        before anything else is derived from `x`.
    y : pandas.Series or None
        Optional class per row, looked up with ``y.loc[x.index]`` and mapped
        to point color. When None every row gets the class `rname`.
    center, scale
        Forwarded unchanged to ``svdForPca`` when `xsvd` is not supplied.
    rlab, clab : bool or label(s)
        Text labels for row / column points. ``True`` uses ``x.index`` /
        ``x.columns``, ``False`` uses empty labels; any other value is used
        as given.
    cshow : int or None
        Number of column points to show; None shows all. When fewer than all
        are requested, the columns with the largest ``PC1**2 + PC2**2`` are
        kept. ``0`` shows no columns.
    rsize, csize : float
        Size values for row and column points (also the size-scale range).
    lsize, lnudge : float
        Text size and vertical nudge of the labels.
    ralpha, calpha : float
        Alpha values for row and column points.
    clightalpha : float
        If > 0, columns outside the top `cshow` stay visible with this alpha
        and without a label; otherwise they are removed.
    rname, cname : str
        Class names given to row points (when `y` is None) and column points.
    lname : str
        Title of the color legend.
    grid : bool
        If False, panel grid lines and panel background are removed.
    printit : bool
        If True, ``print`` the plot before returning it.
    xsvd : sequence or None
        Precomputed decomposition: ``xsvd[0]`` a frame of row scores (one
        column per component), ``xsvd[1]`` the singular values, ``xsvd[2]``
        a frame with one row per component. Presumably the output of
        ``svdForPca`` -- verify against that helper.
    invert1, invert2 : bool
        Flip the sign of PC1 / PC2.
    colscale : list of str or None
        Colors for the classes. A supplied palette is used as given; when
        None, a default palette is chosen for fewer than 8 classes and
        plotnine's default discrete colors are used otherwise.
    **kwargs
        Accepted and ignored.

    Returns
    -------
    plotnine.ggplot
        The assembled plot.
    """
    # Drop incomplete columns first so that every default derived from `x`
    # (column count, column labels) matches the data that is decomposed.
    x = x.loc[:, x.isnull().sum(axis=0) == 0]

    if cshow is None:
        cshow = x.shape[1]
    if isinstance(rlab, bool):
        rlab = x.index if rlab else ''
    if isinstance(clab, bool):
        clab = x.columns if clab else ''
    if xsvd is None:
        xsvd = svdForPca(x, center, scale)

    row_scores, singular_values, components = xsvd[0], xsvd[1], xsvd[2]

    # Rows and columns are each rescaled by the range of their PC1 values so
    # both point clouds share comparable extents.
    rsf = np.max(row_scores.iloc[:, 0]) - np.min(row_scores.iloc[:, 0])
    csf = np.max(components.iloc[0, :]) - np.min(components.iloc[0, :])

    sizeRange = sorted([csize, rsize])
    alphaRange = sorted([calpha, ralpha])

    ggd = pd.DataFrame({
        'PC1': row_scores.iloc[:, 0] / rsf,
        'PC2': row_scores.iloc[:, 1] / rsf,
        'label': rlab,
        'size': rsize,
        'alpha': ralpha
    })

    cclass = []
    if cshow > 0:
        cdata = pd.DataFrame({
            'PC1': components.iloc[0, :] / csf,
            'PC2': components.iloc[1, :] / csf,
            'label': clab,
            'size': csize,
            'alpha': calpha
        })
        if cshow < x.shape[1]:
            # Rank columns by squared distance from the origin.
            cscores = cdata['PC1']**2 + cdata['PC2']**2
            keep = cscores.sort_values(ascending=False).head(cshow).index
            is_kept = cdata.index.isin(keep)
            if clightalpha > 0:
                # Keep the remaining columns as faint, unlabelled points.
                cdata.loc[~is_kept, 'label'] = ''
                cdata.loc[~is_kept, 'alpha'] = clightalpha
                alphaRange = [
                    min(alphaRange[0], clightalpha),
                    max(alphaRange[1], clightalpha)
                ]
            else:
                cdata = cdata.loc[is_kept]
        # Column points go first, so class labels are built in that order.
        ggd = pd.concat([cdata, ggd])
        cclass = [cname] * cdata.shape[0]

    if invert1:
        ggd['PC1'] = -ggd['PC1']
    if invert2:
        ggd['PC2'] = -ggd['PC2']

    if y is not None:
        row_classes = list(y.loc[x.index])
    else:
        row_classes = [rname] * x.shape[0]
    ggd['class'] = cclass + row_classes

    ggo = gg.ggplot(
        ggd,
        gg.aes(x='PC1', y='PC2', color='class', size='size', alpha='alpha',
               label='label'))
    ggo += gg.geom_hline(yintercept=0, color='lightgray')
    ggo += gg.geom_vline(xintercept=0, color='lightgray')
    ggo += gg.geom_point()
    ggo += gg.theme_bw()
    ggo += gg.geom_text(nudge_y=lnudge, size=lsize, show_legend=False)

    n_classes = len(ggd['class'].unique())
    if colscale is None and n_classes < 8:
        palette = [
            'darkslategray', 'goldenrod', 'lightseagreen', 'orangered',
            'dodgerblue', 'darkorchid'
        ]
        colscale = palette[:n_classes - 1] + ['gray']
        # Hand-picked combinations for the common two- and three-class
        # plots; these only replace the default palette.
        if len(colscale) == 2 and cshow > 0:
            colscale = ['black', 'darkgray']
        elif len(colscale) == 2 and cshow == 0:
            colscale = ['black', 'red']
        elif len(colscale) == 3:
            colscale = ['black', 'red', 'darkgray']

    if colscale is not None:
        ggo += gg.scale_color_manual(values=colscale, name=lname)
    else:
        # Eight or more classes and no palette supplied: there is nothing to
        # hand to a manual scale, so keep the default colors and only set
        # the legend title.
        ggo += gg.labs(color=lname)

    ggo += gg.scale_size_continuous(guide=False, range=sizeRange)
    ggo += gg.scale_alpha_continuous(guide=False, range=alphaRange)

    # Share of total squared singular values carried by each component.
    total_sq = (singular_values**2).sum()
    pc1_pct = np.round(100 * singular_values[0]**2 / total_sq, 1)
    pc2_pct = np.round(100 * singular_values[1]**2 / total_sq, 1)
    ggo += gg.xlab('PC1 (' + str(pc1_pct) + '% explained var.)')
    ggo += gg.ylab('PC2 (' + str(pc2_pct) + '% explained var.)')

    if not grid:
        ggo += gg.theme(panel_grid_minor=gg.element_blank(),
                        panel_grid_major=gg.element_blank(),
                        panel_background=gg.element_blank())
    # Scores are rescaled by their range above, so absolute axis values are
    # not shown.
    # NOTE(review): assumed to apply regardless of `grid` -- confirm.
    ggo += gg.theme(axis_ticks=gg.element_blank(),
                    axis_text_x=gg.element_blank(),
                    axis_text_y=gg.element_blank())

    if printit:
        print(ggo)
    return ggo
def plotMutsHistogram(self, value, *, k=None, mutant_order=1, bins=30,
                      wt_vline=True):
    """Histogram of one phenotype measure across all single mutants.

    Parameters
    ----------
    value : {'latentPhenotype', 'observedPhenotype', 'observedEnrichment'}
        Name of the method on this object that supplies the plotted value.
    k : int or None
        Which latent phenotype to plot; required (as an ``int`` with
        ``1 <= k <= self.n_latent_phenotypes``) when `value` is
        ``'latentPhenotype'`` and not used for the other values.
    mutant_order : int
        Order of the mutants to plot. Only ``1`` is implemented.
    bins : int
        Number of histogram bins.
    wt_vline : bool
        If True, add a dashed vertical line at the value obtained for the
        empty substitution string ``''`` (the wildtype).

    Returns
    -------
    plotnine.ggplot.ggplot
        The histogram.

    Raises
    ------
    ValueError
        For a `mutant_order` other than 1, an invalid `k`, or an
        unrecognised `value`.
    """
    if mutant_order != 1:
        raise ValueError('only implemented for `mutant_order` of 1')

    # Start from the settings shared by the non-latent measures; only the
    # latent phenotype needs `k` and a more specific axis label.
    phenotype_args = {}
    axis_label = value
    if value == 'latentPhenotype':
        k_is_valid = isinstance(k, int) and 1 <= k <= self.n_latent_phenotypes
        if not k_is_valid:
            raise ValueError(f"invalid `k` of {k}")
        phenotype_args['k'] = k
        axis_label = f"latentPhenotype {k}"

    if value not in {'latentPhenotype', 'observedPhenotype',
                     'observedEnrichment'}:
        raise ValueError(f"invalid `value` of {value}")

    phenotype = getattr(self, value)

    # One measurement per entry of self._all_subs.
    measurements = []
    for sub in self._all_subs:
        measurements.append(phenotype(sub, **phenotype_args))

    histogram = p9.ggplot(pd.DataFrame({value: measurements}), p9.aes(value))
    histogram = histogram + p9.geom_histogram(bins=bins)
    histogram = histogram + p9.theme(figure_size=(3.5, 2.5))
    histogram = histogram + p9.ylab(f"number of {mutant_order}-mutants")
    histogram = histogram + p9.xlab(axis_label)

    if not wt_vline:
        return histogram

    wildtype_value = phenotype('', **phenotype_args)
    return histogram + p9.geom_vline(
        xintercept=wildtype_value,
        color=CBPALETTE[1],
        linetype='dashed')