def test_rlm(self):
    """Robust linear model (rlm) smoothing with a formula matches the baseline image."""
    plot = self.p + stat_smooth(
        method='rlm',
        formula='y ~ np.sin(x)',
        fill='red',
    )
    assert plot == 'rlm_formula'
def method_plot(df, baseline_rul, baseline_mse, method):
    """Save box-plus-jitter comparison plots for a single method.

    Writes two PDFs, ``<method>_log_rul.pdf`` and ``<method>_rmse.pdf``,
    showing the logarithmic RUL-score and the RMSE per task against the
    grade of degradation, with the baseline distribution drawn as a boxplot
    at x=60.

    Parameters
    ----------
    df : DataFrame with 'method', 'percent_broken', 'percent_fail_runs',
        'log_score', 'mse' and 'task' columns.
    baseline_rul : DataFrame with a 'log_value' column (baseline boxplot).
    baseline_mse : DataFrame with a 'value' column (baseline boxplot).
    method : str, method name used both for filtering and the file names.
    """
    plotnine.options.figure_size = (15, 8)
    # Copy the slice so the rounded columns neither mutate the caller's frame
    # nor trigger pandas' SettingWithCopyWarning.
    jan = df[df['method'] == method].copy()
    # np.int was deprecated and removed in NumPy 1.24; builtin int is equivalent.
    jan['percent_broken'] = jan['percent_broken'].round().astype(int)
    jan['percent_fail_runs'] = jan['percent_fail_runs'].round().astype(int)
    # NOTE(review): the original assigned `plotnine.ylim = (2, 10)` (and later
    # `(90, 10)`), which clobbers the plotnine.ylim class without affecting any
    # plot. If axis limits are actually wanted, add `+ plotnine.ylim(...)` to
    # the figure below instead.
    gg = (plotnine.ggplot(
        jan, plotnine.aes(x='percent_broken', y='log_score', color='method')) +
          plotnine.facet_wrap('task', 2, 4) +
          plotnine.stat_boxplot(plotnine.aes(y='log_value', x=60),
                                data=baseline_rul,
                                width=80,
                                color='#14639e',
                                show_legend=False) +
          plotnine.geom_jitter(width=2.5, show_legend=False) +
          plotnine.stat_smooth(method='gls', show_legend=False) +
          plotnine.xlab('Grade of Degradation in %') +
          plotnine.ylab('Logarithmic RUL-Score') +
          plotnine.theme_classic(base_size=20))
    gg.save('%s_log_rul.pdf' % method)

    gg = (plotnine.ggplot(
        jan, plotnine.aes(x='percent_broken', y='mse', color='method')) +
          plotnine.facet_wrap('task', 2, 4) +
          plotnine.stat_boxplot(plotnine.aes(y='value', x=60),
                                data=baseline_mse,
                                width=80,
                                color='#14639e',
                                show_legend=False) +
          plotnine.geom_jitter(width=2.5, show_legend=False) +
          plotnine.stat_smooth(method='gls', show_legend=False) +
          plotnine.xlab('Grade of Degradation in %') +
          plotnine.ylab('RMSE') +
          plotnine.theme_classic(base_size=20))
    gg.save('%s_rmse.pdf' % method)
def plot(self, plotDat, tag=None, log=True, by='cell_type', data_set=None, title=None, alpha=.4):
    """Scatter measured vs. predicted values, colored by per-group Pearson r.

    Adds a per-group linear fit; shape encodes sample (when there are fewer
    than 10 samples) or dataset name otherwise. Returns the plotnine plot.
    """
    data = plotDat.copy()
    # Global correlation used for the default title.
    global_r = pearsonr(data.measured, data.prediction)[0]
    # Per-group correlations, broadcast back onto each row.
    group_r = data.groupby(
        data[by]).apply(lambda g: pearsonr(g.measured, g.prediction)[0])
    data['corr'] = group_r[data[by]].values
    color_col = '{}_pearson'.format(by)
    data[color_col] = data.apply(
        lambda row: '{} {:.2f}'.format(row[by], group_r[row[by]]), axis=1)
    if data_set:
        data = data.loc[data['dataset_name'] == data_set]
    fig = (pn.ggplot(pn.aes('measured', 'prediction', color=color_col), data)
           + pn.geom_point(alpha=alpha)
           + pn.stat_smooth(mapping=pn.aes('measured', 'prediction',
                                           color=color_col),
                            method='lm', geom='line', alpha=0.5,
                            se=False, inherit_aes=False))
    # Shape by sample only when the legend stays readable.
    if len(data['sample'].unique()) < 10:
        fig += pn.aes(shape='sample')
    else:
        fig += pn.aes(shape='dataset_name')
    if log is True:
        fig = fig + pn.scale_x_log10() + pn.scale_y_log10()
    if title is not None:
        fig += pn.ggtitle(title)
    elif tag is not None:
        fig += pn.ggtitle('{} pearson={}'.format(tag, global_r))
    else:
        fig += pn.ggtitle('pearson={}'.format(global_r))
    return fig
from plotnine import (ggplot, aes, geom_point, facet_wrap, stat_smooth,
                      theme_xkcd)
from plotnine.data import mtcars

# Shared figure dimensions for every saved README image.
kwargs = dict(width=6, height=4)

# Base scatter plot, then save a snapshot after each added layer.
plot = ggplot(mtcars, aes('wt', 'mpg')) + geom_point()
plot.save('readme-image-1.png', **kwargs)
steps = [
    ('readme-image-2.png', aes(color='factor(gear)')),
    ('readme-image-3.png', stat_smooth(method='lm')),
    ('readme-image-4.png', facet_wrap('~gear')),
    ('readme-image-5.png', theme_xkcd()),
]
for filename, layer in steps:
    plot = plot + layer
    plot.save(filename, **kwargs)
def plot_char_percent_vs_accuracy_smooth(self, expo=False, no_models=False, columns=False):
    """Plot accuracy as a function of the percent of the question revealed.

    When ``expo`` is True, human gameplay curves (loaded from
    data/external/all_human_gameplay.json) are overlaid on the model
    curves; otherwise a moving-average smooth per guessing model is drawn.
    """
    # Optional y-axis cap configured on the instance.
    if self.y_max is not None:
        limits = [0, float(self.y_max)]
        eprint(f'Setting limits to: {limits}')
    else:
        limits = [0, 1]
    if expo:
        if os.path.exists('data/external/all_human_gameplay.json') and not self.no_humans:
            with open('data/external/all_human_gameplay.json') as f:
                all_gameplay = json.load(f)
                frames = []
                for event, name in [('parents', 'Intermediate'), ('maryland', 'Expert'), ('live', 'National')]:
                    if self.merge_humans:
                        name = 'Human'
                    gameplay = all_gameplay[event]
                    # The live event has no control (regular test) questions.
                    if event != 'live':
                        control_correct_positions = gameplay['control_correct_positions']
                        control_wrong_positions = gameplay['control_wrong_positions']
                        control_positions = control_correct_positions + control_wrong_positions
                        control_positions = np.array(control_positions)
                        control_result = np.array(len(control_correct_positions) * [1] + len(control_wrong_positions) * [0])
                        argsort_control = np.argsort(control_positions)
                        control_x = control_positions[argsort_control]
                        control_sorted_result = control_result[argsort_control]
                        # Cumulative accuracy over sorted buzz positions.
                        control_y = control_sorted_result.cumsum() / control_sorted_result.shape[0]
                        control_df = pd.DataFrame({'correct': control_y, 'char_percent': control_x})
                        control_df['Dataset'] = 'Regular Test'
                        control_df['Guessing_Model'] = f' {name}'
                        frames.append(control_df)

                    # IR adversarial questions (always present).
                    adv_correct_positions = gameplay['adv_correct_positions']
                    adv_wrong_positions = gameplay['adv_wrong_positions']
                    adv_positions = adv_correct_positions + adv_wrong_positions
                    adv_positions = np.array(adv_positions)
                    adv_result = np.array(len(adv_correct_positions) * [1] + len(adv_wrong_positions) * [0])
                    argsort_adv = np.argsort(adv_positions)
                    adv_x = adv_positions[argsort_adv]
                    adv_sorted_result = adv_result[argsort_adv]
                    adv_y = adv_sorted_result.cumsum() / adv_sorted_result.shape[0]
                    adv_df = pd.DataFrame({'correct': adv_y, 'char_percent': adv_x})
                    adv_df['Dataset'] = 'IR Adversarial'
                    adv_df['Guessing_Model'] = f' {name}'
                    frames.append(adv_df)

                    # RNN adversarial questions, only for events that have them.
                    if len(gameplay['advneural_correct_positions']) > 0:
                        adv_correct_positions = gameplay['advneural_correct_positions']
                        adv_wrong_positions = gameplay['advneural_wrong_positions']
                        adv_positions = adv_correct_positions + adv_wrong_positions
                        adv_positions = np.array(adv_positions)
                        adv_result = np.array(len(adv_correct_positions) * [1] + len(adv_wrong_positions) * [0])
                        argsort_adv = np.argsort(adv_positions)
                        adv_x = adv_positions[argsort_adv]
                        adv_sorted_result = adv_result[argsort_adv]
                        adv_y = adv_sorted_result.cumsum() / adv_sorted_result.shape[0]
                        adv_df = pd.DataFrame({'correct': adv_y, 'char_percent': adv_x})
                        adv_df['Dataset'] = 'RNN Adversarial'
                        adv_df['Guessing_Model'] = f' {name}'
                        frames.append(adv_df)

                human_df = pd.concat(frames)
                # Ordered categoricals fix legend and facet ordering.
                human_vals = sort_humans(list(human_df['Guessing_Model'].unique()))
                human_dtype = CategoricalDtype(human_vals, ordered=True)
                human_df['Guessing_Model'] = human_df['Guessing_Model'].astype(human_dtype)
                dataset_dtype = CategoricalDtype(['Regular Test', 'IR Adversarial', 'RNN Adversarial'], ordered=True)
                human_df['Dataset'] = human_df['Dataset'].astype(dataset_dtype)

        if no_models:
            p = ggplot(human_df) + geom_point(shape='.')
        else:
            df = self.char_plot_df
            # Drop adversarial rounds that were not requested.
            if 1 not in self.rounds:
                df = df[df['Dataset'] != 'Round 1 - IR Adversarial']
            if 2 not in self.rounds:
                df = df[df['Dataset'] != 'Round 2 - IR Adversarial']
                df = df[df['Dataset'] != 'Round 2 - RNN Adversarial']
            p = ggplot(df)
            if self.save_df is not None:
                eprint(f'Saving df to: {self.save_df}')
                df.to_json(self.save_df)

        if os.path.exists('data/external/all_human_gameplay.json') and not self.no_humans:
            eprint('Loading human data')
            p = p + geom_line(data=human_df)

        if columns:
            facet_conf = facet_wrap('Guessing_Model', ncol=1)
        else:
            facet_conf = facet_wrap('Guessing_Model', nrow=1)

        if not no_models:
            if self.mvg_avg_char:
                chart = stat_smooth(method='mavg', se=False, method_args={'window': 400})
            else:
                chart = stat_summary_bin(fun_data=mean_no_se, bins=20, shape='.', linetype='None', size=0.5)
        else:
            chart = None

        p = (
            p + facet_conf
            + aes(x='char_percent', y='correct', color='Dataset')
        )
        if chart is not None:
            p += chart
        p = (
            p
            + scale_y_continuous(breaks=np.linspace(0, 1, 6))
            + scale_x_continuous(breaks=[0, .5, 1])
            + coord_cartesian(ylim=limits)
            + xlab('Percent of Question Revealed')
            + ylab('Accuracy')
            + theme(
                #legend_position='top',
                legend_box_margin=0, legend_title=element_blank(),
                strip_text_x=element_text(margin={'t': 6, 'b': 6, 'l': 1, 'r': 5})
            )
            + scale_color_manual(values=['#FF3333', '#66CC00', '#3333FF', '#FFFF33'], name='Questions')
        )
        if self.title != '':
            p += ggtitle(self.title)
        return p
    else:
        if self.save_df is not None:
            eprint(f'Saving df to: {self.save_df}')
            # NOTE(review): `df` is never assigned on this path (only in the
            # expo branch) — this looks like it should be self.char_plot_df;
            # verify before relying on save_df outside expo mode.
            df.to_json(self.save_df)
        return (
            ggplot(self.char_plot_df)
            + aes(x='char_percent', y='correct', color='Guessing_Model')
            + stat_smooth(method='mavg', se=False, method_args={'window': 500})
            + scale_y_continuous(breaks=np.linspace(0, 1, 6))
            + coord_cartesian(ylim=limits)
        )
from plotnine.data import mpg
from plotnine import ggplot, aes, facet_grid, labs, geom_point, stat_smooth

# Faceted scatter of displacement vs. highway mpg with a linear fit,
# one panel per (year, vehicle class); printing renders the plot.
figure = ggplot(mpg)
figure = figure + facet_grid(facets="year~class")
figure = figure + aes(x="displ", y="hwy")
figure = figure + labs(
    x="Engine Size",
    y="Miles per Gallon",
    title="Miles per Gallon for Each Year and Vehicle Class")
figure = figure + geom_point()
figure = figure + stat_smooth(method='lm')
print(figure)
def plot_char_percent_vs_accuracy_smooth(self, expo=False, no_models=False, columns=False):
    """Plot accuracy as a function of the percent of the question revealed.

    When ``expo`` is True, human gameplay curves from
    data/external/all_human_gameplay.json are overlaid on the model curves;
    otherwise a moving-average smooth per guessing model is returned.
    """
    if expo:
        if os.path.exists('data/external/all_human_gameplay.json') and not self.no_humans:
            with open('data/external/all_human_gameplay.json') as f:
                all_gameplay = json.load(f)
                frames = []
                for event, name in [('parents', 'Dilettante'), ('maryland', 'Expert'), ('live', 'National')]:
                    if self.merge_humans:
                        name = 'Human'
                    gameplay = all_gameplay[event]
                    # The live event has no control (regular test) questions.
                    if event != 'live':
                        control_correct_positions = gameplay['control_correct_positions']
                        control_wrong_positions = gameplay['control_wrong_positions']
                        control_positions = control_correct_positions + control_wrong_positions
                        control_positions = np.array(control_positions)
                        control_result = np.array(
                            len(control_correct_positions) * [1] +
                            len(control_wrong_positions) * [0])
                        argsort_control = np.argsort(control_positions)
                        control_x = control_positions[argsort_control]
                        control_sorted_result = control_result[argsort_control]
                        # Cumulative accuracy over sorted buzz positions.
                        control_y = control_sorted_result.cumsum() / control_sorted_result.shape[0]
                        control_df = pd.DataFrame({
                            'correct': control_y,
                            'char_percent': control_x
                        })
                        control_df['Dataset'] = 'Regular Test'
                        control_df['Guessing_Model'] = f' {name}'
                        frames.append(control_df)

                    adv_correct_positions = gameplay['adv_correct_positions']
                    adv_wrong_positions = gameplay['adv_wrong_positions']
                    adv_positions = adv_correct_positions + adv_wrong_positions
                    # BUG FIX: previously this wrapped control_positions,
                    # plotting adversarial results against the control x-axis.
                    adv_positions = np.array(adv_positions)
                    adv_result = np.array(
                        len(adv_correct_positions) * [1] +
                        len(adv_wrong_positions) * [0])
                    argsort_adv = np.argsort(adv_positions)
                    adv_x = adv_positions[argsort_adv]
                    adv_sorted_result = adv_result[argsort_adv]
                    adv_y = adv_sorted_result.cumsum() / adv_sorted_result.shape[0]
                    adv_df = pd.DataFrame({
                        'correct': adv_y,
                        'char_percent': adv_x
                    })
                    adv_df['Dataset'] = 'Round 1 - IR Interface'
                    adv_df['Guessing_Model'] = f' {name}'
                    frames.append(adv_df)

                    if len(gameplay['advneural_correct_positions']) > 0:
                        adv_correct_positions = gameplay['advneural_correct_positions']
                        adv_wrong_positions = gameplay['advneural_wrong_positions']
                        adv_positions = adv_correct_positions + adv_wrong_positions
                        # BUG FIX: same wrong-variable wrap as above.
                        adv_positions = np.array(adv_positions)
                        adv_result = np.array(
                            len(adv_correct_positions) * [1] +
                            len(adv_wrong_positions) * [0])
                        argsort_adv = np.argsort(adv_positions)
                        adv_x = adv_positions[argsort_adv]
                        adv_sorted_result = adv_result[argsort_adv]
                        adv_y = adv_sorted_result.cumsum() / adv_sorted_result.shape[0]
                        adv_df = pd.DataFrame({
                            'correct': adv_y,
                            'char_percent': adv_x
                        })
                        adv_df['Dataset'] = 'Round 2 - NN Interface'
                        adv_df['Guessing_Model'] = f' {name}'
                        frames.append(adv_df)

                human_df = pd.concat(frames)

        if no_models:
            p = ggplot(human_df) + geom_line()
        else:
            df = self.char_plot_df
            # Drop interface rounds that were not requested.
            if 1 not in self.rounds:
                df = df[df['Dataset'] != 'Round 1 - IR Interface']
            if 2 not in self.rounds:
                df = df[df['Dataset'] != 'Round 2 - IR Interface']
                df = df[df['Dataset'] != 'Round 2 - NN Interface']
            p = ggplot(df)

        if os.path.exists('data/external/all_human_gameplay.json') and not self.no_humans:
            eprint('Loading human data')
            p = p + geom_line(data=human_df)

        if columns:
            facet_conf = facet_wrap('Guessing_Model', ncol=1)
        else:
            facet_conf = facet_wrap('Guessing_Model', nrow=1)

        if not no_models:
            if self.mvg_avg_char:
                chart = stat_smooth(method='mavg', se=False, method_args={'window': 400})
            else:
                chart = stat_summary_bin(fun_data=mean_no_se, bins=20, shape='.')
        else:
            chart = None

        p = (p + facet_conf + aes(x='char_percent', y='correct', color='Dataset'))
        if chart is not None:
            p += chart
        p = (
            p + scale_y_continuous(breaks=np.linspace(0, 1, 11)) +
            scale_x_continuous(breaks=[0, .5, 1]) +
            xlab('Percent of Question Revealed') + ylab('Accuracy') + theme(
                #legend_position='top',
                legend_box_margin=0,
                legend_title=element_blank(),
                strip_text_x=element_text(margin={
                    't': 6,
                    'b': 6,
                    'l': 1,
                    'r': 5
                })) +
            scale_color_manual(values=['#FF3333', '#66CC00', '#3333FF'],
                               name='Questions'))
        if self.title != '':
            p += ggtitle(self.title)
        return p
    else:
        return (
            ggplot(self.char_plot_df) +
            aes(x='char_percent', y='correct', color='Guessing_Model') +
            stat_smooth(method='mavg', se=False, method_args={'window': 500}) +
            scale_y_continuous(breaks=np.linspace(0, 1, 21)))
def plot_char_percent_vs_accuracy_smooth(
    self, expo=False, no_models=False, columns=False
):
    """Plot accuracy as a function of the percent of the question revealed.

    When ``expo`` is True, human gameplay curves (loaded from
    data/external/all_human_gameplay.json) are overlaid on the model
    curves; otherwise a moving-average smooth per guessing model is drawn.
    """
    # Optional y-axis cap configured on the instance.
    if self.y_max is not None:
        limits = [0, float(self.y_max)]
        eprint(f"Setting limits to: {limits}")
    else:
        limits = [0, 1]
    if expo:
        if (
            os.path.exists("data/external/all_human_gameplay.json")
            and not self.no_humans
        ):
            with open("data/external/all_human_gameplay.json") as f:
                all_gameplay = json.load(f)
                frames = []
                for event, name in [
                    ("parents", "Intermediate"),
                    ("maryland", "Expert"),
                    ("live", "National"),
                ]:
                    if self.merge_humans:
                        name = "Human"
                    gameplay = all_gameplay[event]
                    # The live event has no control (regular test) questions.
                    if event != "live":
                        control_correct_positions = gameplay[
                            "control_correct_positions"
                        ]
                        control_wrong_positions = gameplay[
                            "control_wrong_positions"
                        ]
                        control_positions = (
                            control_correct_positions + control_wrong_positions
                        )
                        control_positions = np.array(control_positions)
                        control_result = np.array(
                            len(control_correct_positions) * [1]
                            + len(control_wrong_positions) * [0]
                        )
                        argsort_control = np.argsort(control_positions)
                        control_x = control_positions[argsort_control]
                        control_sorted_result = control_result[argsort_control]
                        # Cumulative accuracy over sorted buzz positions.
                        control_y = (
                            control_sorted_result.cumsum()
                            / control_sorted_result.shape[0]
                        )
                        control_df = pd.DataFrame(
                            {"correct": control_y, "char_percent": control_x}
                        )
                        control_df["Dataset"] = "Regular Test"
                        control_df["Guessing_Model"] = f" {name}"
                        frames.append(control_df)

                    # IR adversarial questions (always present).
                    adv_correct_positions = gameplay["adv_correct_positions"]
                    adv_wrong_positions = gameplay["adv_wrong_positions"]
                    adv_positions = adv_correct_positions + adv_wrong_positions
                    adv_positions = np.array(adv_positions)
                    adv_result = np.array(
                        len(adv_correct_positions) * [1]
                        + len(adv_wrong_positions) * [0]
                    )
                    argsort_adv = np.argsort(adv_positions)
                    adv_x = adv_positions[argsort_adv]
                    adv_sorted_result = adv_result[argsort_adv]
                    adv_y = adv_sorted_result.cumsum() / adv_sorted_result.shape[0]
                    adv_df = pd.DataFrame({"correct": adv_y, "char_percent": adv_x})
                    adv_df["Dataset"] = "IR Adversarial"
                    adv_df["Guessing_Model"] = f" {name}"
                    frames.append(adv_df)

                    # RNN adversarial questions, only where present.
                    if len(gameplay["advneural_correct_positions"]) > 0:
                        adv_correct_positions = gameplay[
                            "advneural_correct_positions"
                        ]
                        adv_wrong_positions = gameplay["advneural_wrong_positions"]
                        adv_positions = adv_correct_positions + adv_wrong_positions
                        adv_positions = np.array(adv_positions)
                        adv_result = np.array(
                            len(adv_correct_positions) * [1]
                            + len(adv_wrong_positions) * [0]
                        )
                        argsort_adv = np.argsort(adv_positions)
                        adv_x = adv_positions[argsort_adv]
                        adv_sorted_result = adv_result[argsort_adv]
                        adv_y = (
                            adv_sorted_result.cumsum() / adv_sorted_result.shape[0]
                        )
                        adv_df = pd.DataFrame(
                            {"correct": adv_y, "char_percent": adv_x}
                        )
                        adv_df["Dataset"] = "RNN Adversarial"
                        adv_df["Guessing_Model"] = f" {name}"
                        frames.append(adv_df)

                human_df = pd.concat(frames)
                # Ordered categoricals fix legend and facet ordering.
                human_vals = sort_humans(list(human_df["Guessing_Model"].unique()))
                human_dtype = CategoricalDtype(human_vals, ordered=True)
                human_df["Guessing_Model"] = human_df["Guessing_Model"].astype(
                    human_dtype
                )
                dataset_dtype = CategoricalDtype(
                    ["Regular Test", "IR Adversarial", "RNN Adversarial"],
                    ordered=True,
                )
                human_df["Dataset"] = human_df["Dataset"].astype(dataset_dtype)

        if no_models:
            p = ggplot(human_df) + geom_point(shape=".")
        else:
            df = self.char_plot_df
            # Drop adversarial rounds that were not requested.
            if 1 not in self.rounds:
                df = df[df["Dataset"] != "Round 1 - IR Adversarial"]
            if 2 not in self.rounds:
                df = df[df["Dataset"] != "Round 2 - IR Adversarial"]
                df = df[df["Dataset"] != "Round 2 - RNN Adversarial"]
            p = ggplot(df)
            if self.save_df is not None:
                eprint(f"Saving df to: {self.save_df}")
                df.to_json(self.save_df)

        if (
            os.path.exists("data/external/all_human_gameplay.json")
            and not self.no_humans
        ):
            eprint("Loading human data")
            p = p + geom_line(data=human_df)

        if columns:
            facet_conf = facet_wrap("Guessing_Model", ncol=1)
        else:
            facet_conf = facet_wrap("Guessing_Model", nrow=1)

        if not no_models:
            if self.mvg_avg_char:
                chart = stat_smooth(
                    method="mavg", se=False, method_args={"window": 400}
                )
            else:
                chart = stat_summary_bin(
                    fun_data=mean_no_se,
                    bins=20,
                    shape=".",
                    linetype="None",
                    size=0.5,
                )
        else:
            chart = None

        p = p + facet_conf + aes(x="char_percent", y="correct", color="Dataset")
        if chart is not None:
            p += chart
        p = (
            p
            + scale_y_continuous(breaks=np.linspace(0, 1, 6))
            + scale_x_continuous(breaks=[0, 0.5, 1])
            + coord_cartesian(ylim=limits)
            + xlab("Percent of Question Revealed")
            + ylab("Accuracy")
            + theme(
                # legend_position='top',
                legend_box_margin=0,
                legend_title=element_blank(),
                strip_text_x=element_text(margin={"t": 6, "b": 6, "l": 1, "r": 5})
            )
            + scale_color_manual(
                values=["#FF3333", "#66CC00", "#3333FF", "#FFFF33"],
                name="Questions",
            )
        )
        if self.title != "":
            p += ggtitle(self.title)
        return p
    else:
        if self.save_df is not None:
            eprint(f"Saving df to: {self.save_df}")
            # NOTE(review): `df` is never assigned on this path (only in the
            # expo branch) — this looks like it should be self.char_plot_df;
            # verify before relying on save_df outside expo mode.
            df.to_json(self.save_df)
        return (
            ggplot(self.char_plot_df)
            + aes(x="char_percent", y="correct", color="Guessing_Model")
            + stat_smooth(method="mavg", se=False, method_args={"window": 500})
            + scale_y_continuous(breaks=np.linspace(0, 1, 6))
            + coord_cartesian(ylim=limits)
        )
def mixed_linear_plots(df, x_axis, x_label):
    """Fit mixed linear models and save regression/jitter plots.

    Fits ``log_score`` and ``mse`` against percent_broken + percent_fail_runs
    (grouped by row index), prints both model summaries, and saves four PDFs
    plotting `x_axis` against the scores, colored by method and by task, with
    the fitted regression line overlaid.

    Parameters
    ----------
    df : DataFrame with 'log_score', 'mse', 'percent_broken',
        'percent_fail_runs', 'method' and 'task' columns.
        NOTE(review): the percent columns are rounded in place, mutating the
        caller's frame — pass a copy if that matters.
    x_axis : str, column name used for the x axis ('percent_broken' or
        'percent_fail_runs').
    x_label : str, human-readable x-axis label.
    """
    plotnine.options.figure_size = (8, 10)
    md = smf.mixedlm('log_score ~ percent_broken + percent_fail_runs',
                     df,
                     groups=df.index.values)
    mdf_rul = md.fit()
    print('#' * 18 + 'Log RUL' + '#' * 18)
    print(mdf_rul.summary())
    md = smf.mixedlm('mse ~ percent_broken + percent_fail_runs',
                     df,
                     groups=df.index.values)
    mdf_mse = md.fit()
    print('#' * 18 + 'RMSE' + '#' * 18)
    print(mdf_mse.summary())
    # np.int was deprecated and removed in NumPy 1.24; builtin int is equivalent.
    df['percent_broken'] = df['percent_broken'].round().astype(int)
    df['percent_fail_runs'] = df['percent_fail_runs'].round().astype(int)

    gg = (plotnine.ggplot(
        df, plotnine.aes(x=x_axis, y='log_score', color='method')) +
          plotnine.geom_jitter(width=2.5, show_legend=False) +
          plotnine.geom_abline(
              plotnine.aes(intercept=mdf_rul.params['Intercept'],
                           slope=mdf_rul.params[x_axis])) +
          plotnine.stat_smooth(method='gls', show_legend=False) +
          plotnine.xlab(x_label) +
          plotnine.ylab('Logarithmic RUL-Score') +
          plotnine.scale_color_discrete(name='Method', labels=['DAAN', 'JAN']) +
          plotnine.theme_classic(base_size=20))
    gg.save('%s_log_rul_by_method.pdf' % x_axis)

    gg = (plotnine.ggplot(
        df, plotnine.aes(x=x_axis, y='log_score', color='task')) +
          plotnine.geom_jitter(width=2.5, show_legend=False) +
          plotnine.geom_abline(
              plotnine.aes(intercept=mdf_rul.params['Intercept'],
                           slope=mdf_rul.params[x_axis])) +
          plotnine.stat_smooth(method='gls', show_legend=False) +
          plotnine.xlab(x_label) +
          plotnine.ylab('Logarithmic RUL-Score') +
          plotnine.scale_color_discrete(
              name='Task',
              labels=['4→3', '4→2', '1→3', '1→2', '3→4', '3→1', '2→4', '2→1'
                      ]) +
          plotnine.theme_classic(base_size=20))
    gg.save('%s_log_rul_by_task.pdf' % x_axis)

    gg = (
        plotnine.ggplot(df, plotnine.aes(x=x_axis, y='mse', color='method')) +
        plotnine.geom_jitter(width=2.5) +
        plotnine.geom_abline(
            plotnine.aes(intercept=mdf_mse.params['Intercept'],
                         slope=mdf_mse.params[x_axis])) +
        plotnine.stat_smooth(method='gls') +
        plotnine.ylab('RMSE') +
        plotnine.xlab(x_label) +
        plotnine.scale_color_discrete(name='Method', labels=['DAAN', 'JAN']) +
        plotnine.theme_classic(base_size=20))
    gg.save('%s_mse_by_method.pdf' % x_axis)

    gg = (plotnine.ggplot(df, plotnine.aes(x=x_axis, y='mse', color='task')) +
          plotnine.geom_jitter(width=2.5) +
          plotnine.geom_abline(
              plotnine.aes(intercept=mdf_mse.params['Intercept'],
                           slope=mdf_mse.params[x_axis])) +
          plotnine.stat_smooth(method='gls') +
          plotnine.ylab('RMSE') +
          plotnine.scale_color_discrete(
              name='Task',
              labels=['4→3', '4→2', '1→3', '1→2', '3→4', '3→1', '2→4', '2→1'
                      ]) +
          plotnine.theme_classic(base_size=20))
    gg.save('%s_mse_by_task.pdf' % x_axis)
def test_gls(self):
    """Generalized least squares (gls) smoothing with a formula matches the baseline image."""
    plot = self.p + stat_smooth(
        method='gls',
        formula='y ~ np.sin(x)',
        fill='red',
        se=True,
    )
    assert plot == 'gls_formula'
import numpy as np
import pandas as pd
import statsmodels.api as sm
import statsmodels.formula.api as smf
from itertools import combinations
import plotnine as p

# read data
import ssl
# NOTE(review): disabling TLS certificate verification globally is insecure;
# tolerable only for this throwaway teaching script fetching public data.
ssl._create_default_https_context = ssl._create_unverified_context


def read_data(file):
    """Fetch a Stata dataset from the mixtape book's GitHub mirror."""
    return pd.read_stata("https://raw.github.com/scunning1975/mixtape/master/" + file)


np.random.seed(12282020)

# Simulate a sharp RDD: treatment D switches on when the running variable
# x crosses 50; y1 has no treatment effect, y2 has a jump of 40.
dat = pd.DataFrame({'x': np.random.normal(50, 25, 1000)})
dat.loc[dat.x < 0, 'x'] = 0
dat = dat[dat.x < 100]
dat['D'] = 0
dat.loc[dat.x > 50, 'D'] = 1
dat['y1'] = 25 + 0 * dat.D + 1.5 * dat.x + np.random.normal(0, 20, dat.shape[0])
dat['y2'] = 25 + 40 * dat.D + 1.5 * dat.x + np.random.normal(0, 20, dat.shape[0])

# Fixed stray quote that was printed in the heading.
print('Counterfactual Potential Outcomes')
# BUG FIX: se='F' was an R habit; the truthy string silently *enabled* the
# confidence band. Pass a real boolean to disable it.
p.ggplot(dat, p.aes(x='x', y='y1', color='factor(D)')) + p.geom_point(
    alpha=0.5) + p.geom_vline(xintercept=50, colour="grey") + p.stat_smooth(
        method="lm", se=False) + p.labs(x="Test score (X)",
                                        y="Potential Outcome (Y1)")
#
# (C) Copyright 2021 Pavel Tisnovsky
#
# All rights reserved. This program and the accompanying materials
# are made available under the terms of the Eclipse Public License v1.0
# which accompanies this distribution, and is available at
# http://www.eclipse.org/legal/epl-v10.html
#
# Contributors:
#     Pavel Tisnovsky
#

from plotnine.data import mpg
from plotnine import ggplot, aes, facet_grid, labs, geom_point, stat_smooth

# Faceted scatter of displacement vs. highway mpg with a linear fit,
# one panel per (year, vehicle class); printing renders the plot.
figure = ggplot(mpg)
figure = figure + facet_grid(facets="year~class")
figure = figure + aes(x="displ", y="hwy")
figure = figure + labs(
    x="Engine Size",
    y="Miles per Gallon",
    title="Miles per Gallon for Each Year and Vehicle Class",
)
figure = figure + geom_point()
figure = figure + stat_smooth(method="lm")
print(figure)
#Topic ----Plot Nine- Bar Plot import numpy as np import pandas as pd import matplotlib.pyplot as plt #pip install plotnine --user from plotnine import * #https://datacarpentry.org/python-ecology-lesson/07-visualization-ggplot-python/index.html from plotnine import ggplot, geom_point, aes, stat_smooth, facet_wrap from plotnine.data import mtcars mtcars (ggplot(mtcars, aes('wt', 'mpg', color='factor(gear)')) + geom_point() + stat_smooth(method='lm') + facet_wrap('~gear')) ggplot(mtcars, aes('wt', 'hp', color='factor(cyl)')) + geom_point(aes(size='mpg')) + labs(title='MT cars', subtitle ='wt vs hp', x='weight', y='horsepower') + geom_text(aes(label='name')) #%%% %matplotlib inline import plotnine as p9 from plotnine.data import mtcars from adjustText import adjust_text #https://github.com/Phlya/adjustText/wiki p9.ggplot(mtcars, aes('wt', 'hp', color='factor(cyl)')) + p9.geom_point(aes(size='mpg')) + p9.labs(title='MT cars', subtitle ='wt vs hp', x='weight', y='horsepower') + p9.geom_text(aes(label='name'), size=11, nudge_y=2) p9.geom_text? plt.ioff()# and plt.ion() plt.close() %matplotlib
def plot_char_percent_vs_accuracy_smooth(self, expo=False):
    """Plot accuracy as a function of the percent of the question revealed.

    In expo mode, faceted per guessing model, with human gameplay curves
    from data/external/human_gameplay.json overlaid when the file exists.
    """
    if expo:
        p = (ggplot(self.char_plot_df) +
             facet_wrap('Guessing_Model', nrow=1) +
             aes(x='char_percent', y='correct', color='Dataset') +
             stat_smooth(method='mavg', se=False, method_args={'window': 200}) +
             scale_y_continuous(breaks=np.linspace(0, 1, 11)) +
             scale_x_continuous(breaks=[0, .5, 1]) +
             xlab('Percent of Question Revealed') +
             ylab('Accuracy') +
             theme(legend_position='top'))
        if os.path.exists('data/external/human_gameplay.json'):
            with open('data/external/human_gameplay.json') as f:
                gameplay = json.load(f)
                control_correct_positions = gameplay['control_correct_positions']
                control_wrong_positions = gameplay['control_wrong_positions']
                control_positions = control_correct_positions + control_wrong_positions
                control_positions = np.array(control_positions)
                control_result = np.array(
                    len(control_correct_positions) * [1] +
                    len(control_wrong_positions) * [0])
                argsort_control = np.argsort(control_positions)
                control_x = control_positions[argsort_control]
                control_sorted_result = control_result[argsort_control]
                # Cumulative accuracy over sorted buzz positions.
                control_y = control_sorted_result.cumsum() / control_sorted_result.shape[0]
                control_df = pd.DataFrame({
                    'correct': control_y,
                    'char_percent': control_x
                })
                control_df['Dataset'] = 'Test Questions'
                control_df['Guessing_Model'] = ' Human'

                adv_correct_positions = gameplay['adv_correct_positions']
                adv_wrong_positions = gameplay['adv_wrong_positions']
                adv_positions = adv_correct_positions + adv_wrong_positions
                # BUG FIX: previously this wrapped control_positions, which
                # plotted the adversarial results on the control x-axis.
                adv_positions = np.array(adv_positions)
                adv_result = np.array(
                    len(adv_correct_positions) * [1] +
                    len(adv_wrong_positions) * [0])
                argsort_adv = np.argsort(adv_positions)
                adv_x = adv_positions[argsort_adv]
                adv_sorted_result = adv_result[argsort_adv]
                adv_y = adv_sorted_result.cumsum() / adv_sorted_result.shape[0]
                adv_df = pd.DataFrame({
                    'correct': adv_y,
                    'char_percent': adv_x
                })
                adv_df['Dataset'] = 'Challenge Questions'
                adv_df['Guessing_Model'] = ' Human'

                human_df = pd.concat([control_df, adv_df])
                p = p + (geom_line(data=human_df))
        return p
    else:
        return (
            ggplot(self.char_plot_df) +
            aes(x='char_percent', y='correct', color='Guessing_Model') +
            stat_smooth(method='mavg', se=False, method_args={'window': 500}) +
            scale_y_continuous(breaks=np.linspace(0, 1, 21)))
import plotnine as p

# read data
import ssl
# NOTE(review): disabling TLS certificate verification globally is insecure;
# tolerable only for this throwaway teaching script fetching public data.
ssl._create_default_https_context = ssl._create_unverified_context


def read_data(file):
    """Fetch a Stata dataset from the mixtape book's GitHub mirror."""
    return pd.read_stata(
        "https://raw.github.com/scunning1975/mixtape/master/" + file)


np.random.seed(12282020)

# Simulate a sharp RDD: treatment D switches on when the running variable
# x crosses 50; y1 has no treatment effect, y2 has a jump of 40.
dat = pd.DataFrame({'x': np.random.normal(50, 25, 1000)})
dat.loc[dat.x < 0, 'x'] = 0
dat = dat[dat.x < 100]
dat['D'] = 0
dat.loc[dat.x > 50, 'D'] = 1
dat['y1'] = 25 + 0 * dat.D + 1.5 * dat.x + np.random.normal(
    0, 20, dat.shape[0])
dat['y2'] = 25 + 40 * dat.D + 1.5 * dat.x + np.random.normal(
    0, 20, dat.shape[0])

# Fixed stray quotes that were printed in the headings.
print('Counterfactual Potential Outcomes')
print('Counterfactual Potential Outcomes after Treatment')
# BUG FIX: se='F' was an R habit; the truthy string silently *enabled* the
# confidence band. Pass a real boolean to disable it.
p.ggplot(dat, p.aes(x='x', y='y2', color='factor(D)')) + p.geom_point(
    alpha=0.5) + p.geom_vline(xintercept=50, colour="grey") + p.stat_smooth(
        method="lm", se=False) + p.labs(x="Test score (X)",
                                        y="Potential Outcome (Y)")
from plotnine import (ggplot, aes, geom_point, facet_wrap, stat_smooth,
                      theme_xkcd)
from plotnine.data import mtcars

# Common size for every saved README image.
kwargs = dict(width=6, height=4)

# Start from a bare scatter plot and save a snapshot after each added layer.
plot = ggplot(mtcars, aes('wt', 'mpg')) + geom_point()
plot.save('readme-image-1.png', **kwargs)
steps = [
    ('readme-image-2.png', aes(color='factor(gear)')),
    ('readme-image-3.png', stat_smooth(method='lm')),
    ('readme-image-4.png', facet_wrap('~gear')),
    ('readme-image-5.png', theme_xkcd()),
]
for filename, layer in steps:
    plot = plot + layer
    plot.save(filename, **kwargs)
# Extract predictor/response as column vectors for scikit-learn.
barrels = beer.loc[:, 'barrels'].values.reshape(-1, 1)
crude_rate = beer.loc[:, 'crude.rate'].values.reshape(-1, 1)

## Linear model of barrels vs crude
lm = LinearRegression()
model = lm.fit(barrels, crude_rate)
pred = lm.predict(barrels)
r2 = model.score(barrels, crude_rate)

## Plot barrels and crude rate, annotated with the fit's R^2.
g = (ggplot(beer, aes(x = 'barrels', y = 'crude.rate')) +
     geom_point(color = 'black') +
     geom_text(aes(label = 'year'),
               position = positions.position_nudge(0,1)) +
     stat_smooth(aes(x = 'barrels', y = 'crude.rate'), color = 'blue',
                 method = 'lm', se = False) +
     labs(title = "Crude Rate versus Beer Production",
          x = "Ten Thousands of Barrels",
          y = "Deaths per Million") +
     annotate("text", x = 18250, y = 65,
              label = "R2:" + str(round(r2, 3))))
g.save("figures/allBeer_crudeRate_lm_p9.png")

# BUG FIX: the axis labels were swapped relative to the aesthetics
# (x='year', y='barrels').
g2 = (ggplot(beer, aes(x = 'year', y = 'barrels')) +
      geom_point() +
      labs(title = "National Beer Production 2008-2015",
           x = "Year",
           y = "Ten Thousands of Barrels"))
g2.save("figures/allBeer_years_p9.png")
def gene_expression_dynamics(
    adata,
    selected_fate,
    gene_name_list,
    traj_threshold=0.1,
    source="transition_map",
    invert_PseudoTime=False,
    mask=None,
    compute_new=True,
    gene_exp_percentile=99,
    n_neighbors=8,
    plot_raw_data=False,
    stat_smooth_method="loess",
):
    """
    Plot gene trend along the inferred dynamic trajectory.

    The results should be pre-computed from :func:`cospar.tl.progenitor` or
    :func:`cospar.tl.iterative_differentiation`

    Using the states that belong to the trajectory, it computes the pseudo time
    for these states and shows expression dynamics of selected genes along
    this pseudo time. Specifically, we first construct KNN graph, compute
    spectral embedding, and take the first component as the pseudo time. To
    create dynamics for a selected gene, we re-weight the expression of this
    gene at each cell by its probability belonging to the trajectory, and
    rescale the expression at selected percentile value. Finally, we fit a
    curve to the data points.

    Parameters
    ----------
    adata: :class:`~anndata.AnnData` object
        Assume to contain transition maps at adata.uns.
    selected_fate: `str`, or `list`
        targeted cluster of the trajectory, as consistent with
        adata.obs['state_info'] When it is a list, the listed clusters are
        combined into a single fate cluster.
    gene_name_list: `list`
        List of genes to plot on the dynamic trajectory.
    traj_threshold: `float`, optional (default: 0.1), range: (0,1)
        Relative threshold, used to thresholding the inferred dynamic
        trajecotry to select states.
    invert_PseudoTime: `bool`, optional (default: False)
        If true, invert the pseudotime: 1-pseuotime. This is useful when the
        direction of pseudo time does not agree with intuition.
    mask: `np.array`, optional (default: None)
        A boolean array for further selecting cell states.
    compute_new: `bool`, optional (default: True)
        If true, compute everyting from stratch (as we save computed pseudotime)
    gene_exp_percentile: `int`, optional (default: 99)
        Plot gene expression below this percentile.
    n_neighbors: `int`, optional (default: 8)
        Number of nearest neighbors for constructing KNN graph.
    plot_raw_data: `bool`, optional (default: False)
        Plot the raw gene expression values of each cell along the pseudotime.
    stat_smooth_method: `str`, optional (default: 'loess')
        Smooth method used in the ggplot. Current available choices are:
        'auto' (Use loess if (n<1000), glm otherwise), 'lm' (Linear Model),
        'wls' (Linear Model), 'rlm' (Robust Linear Model), 'glm' (Generalized
        linear Model), 'gls' (Generalized Least Squares), 'lowess' (Locally
        Weighted Regression (simple)), 'loess' (Locally Weighted Regression),
        'mavg' (Moving Average), 'gpr' (Gaussian Process Regressor)}.
    """

    # BUG FIX: `mask == None` broadcasts elementwise for an ndarray mask and
    # makes the `if` raise "truth value is ambiguous"; identity test is correct.
    if mask is None:
        final_mask = np.ones(adata.shape[0]).astype(bool)
    else:
        if mask.shape[0] == adata.shape[0]:
            final_mask = mask
        else:
            logg.error(
                "mask must be a boolean array with the same size as adata.shape[0]."
            )
            return None

    hf.check_available_map(adata)
    fig_width = settings.fig_width
    fig_height = settings.fig_height
    point_size = settings.fig_point_size

    if len(adata.uns["available_map"]) == 0:
        logg.error(f"There is no transition map available yet")
    else:
        if type(selected_fate) == str:
            selected_fate = [selected_fate]
        (
            mega_cluster_list,
            valid_fate_list,
            fate_array_flat,
            sel_index_list,
        ) = hf.analyze_selected_fates(adata.obs["state_info"], selected_fate)
        if len(mega_cluster_list) == 0:
            logg.error("No cells selected. Computation aborted!")
            return adata
        else:
            fate_name = mega_cluster_list[0]
            target_idx = sel_index_list[0]

            x_emb = adata.obsm["X_emb"][:, 0]
            y_emb = adata.obsm["X_emb"][:, 1]
            data_des = adata.uns["data_des"][-1]
            data_path = settings.data_path
            figure_path = settings.figure_path
            file_name = os.path.join(
                data_path, f"{data_des}_fate_trajectory_pseudoTime_{fate_name}.npy"
            )

            traj_name = f"diff_trajectory_{source}_{fate_name}"
            if traj_name not in adata.obs.keys():
                logg.error(
                    f"The target fate trajectory for {fate_name} with {source} have not been inferred yet.\n"
                    "Please infer the trajectory with first with cs.tl.progenitor, \n"
                    "or cs.tl.iterative_differentiation."
                )
            else:
                prob_0 = np.array(adata.obs[traj_name])
                # Keep states whose trajectory probability clears the relative
                # threshold, intersected with the caller-supplied mask.
                sel_cell_idx = (prob_0 > traj_threshold * np.max(prob_0)) & final_mask
                if np.sum(sel_cell_idx) == 0:
                    raise ValueError("No cells selected!")
                sel_cell_id = np.nonzero(sel_cell_idx)[0]

                if os.path.exists(file_name) and (not compute_new):
                    logg.info("Load pre-computed pseudotime")
                    PseudoTime = np.load(file_name)
                else:
                    # NOTE(review): `manifold` is imported but SpectralEmbedding
                    # is referenced directly — presumably imported at module
                    # level; confirm before removing this import.
                    from sklearn import manifold

                    data_matrix = adata.obsm["X_pca"][sel_cell_idx]
                    method = SpectralEmbedding(
                        n_components=1, n_neighbors=n_neighbors
                    )
                    PseudoTime = method.fit_transform(data_matrix)
                    np.save(file_name, PseudoTime)

                # Normalize pseudotime to [0, 1].
                PseudoTime = PseudoTime - np.min(PseudoTime)
                PseudoTime = (PseudoTime / np.max(PseudoTime)).flatten()

                ## re-order the pseudoTime such that the target fate has the pseudo time 1.
                if invert_PseudoTime:
                    PseudoTime = 1 - PseudoTime

                if (
                    np.sum((PseudoTime > 0.25) & (PseudoTime < 0.75)) == 0
                ):  # the cell states do not form a contiuum. Plot raw data instead
                    logg.error(
                        "The selected cell states do not form a connected graph. Cannot form a continuum of pseudoTime. Only plot the raw data"
                    )
                    plot_raw_data = True

                ## plot the pseudotime ordering
                fig = plt.figure(figsize=(fig_width * 2, fig_height))
                ax = plt.subplot(1, 2, 1)
                pl_util.customized_embedding(
                    x_emb,
                    y_emb,
                    sel_cell_idx,
                    ax=ax,
                    title="Selected cells",
                    point_size=point_size,
                )
                ax1 = plt.subplot(1, 2, 2)
                pl_util.customized_embedding(
                    x_emb[sel_cell_idx],
                    y_emb[sel_cell_idx],
                    PseudoTime,
                    ax=ax1,
                    title="Pseudo Time",
                    point_size=point_size,
                )
                Clb = fig.colorbar(
                    plt.cm.ScalarMappable(cmap=plt.cm.Reds),
                    ax=ax1,
                    label="Pseudo time",
                )
                fig.savefig(
                    os.path.join(
                        figure_path,
                        f"{data_des}_fate_trajectory_pseudoTime_{fate_name}.{settings.file_format_figs}",
                    )
                )

                temp_dict = {"PseudoTime": PseudoTime}
                for gene_name in gene_name_list:
                    yy_max = np.percentile(
                        adata.obs_vector(gene_name), gene_exp_percentile
                    )  # global blackground
                    yy = np.array(adata.obs_vector(gene_name)[sel_cell_idx])
                    rescaled_yy = (
                        yy * prob_0[sel_cell_idx] / yy_max
                    )  # rescaled by global background
                    temp_dict[gene_name] = rescaled_yy

                from plotnine import (
                    aes,
                    geom_point,
                    ggplot,
                    labs,
                    stat_smooth,
                    theme_classic,
                )

                data2 = pd.DataFrame(temp_dict)
                data2_melt = pd.melt(
                    data2, id_vars=["PseudoTime"], value_vars=gene_name_list
                )
                gplot = (
                    ggplot(
                        data=data2_melt,
                        mapping=aes(x="PseudoTime", y="value", color="variable"),
                    )
                    + (
                        geom_point()
                        if plot_raw_data
                        else stat_smooth(method=stat_smooth_method)
                    )
                    + theme_classic()
                    + labs(
                        x="Pseudo time",
                        y="Normalized gene expression",
                        color="Gene name",
                    )
                )
                # NOTE(review): "pseutoTime" is a typo but changing it would
                # change the output file name — kept for compatibility.
                gplot.save(
                    os.path.join(
                        figure_path,
                        f"{data_des}_fate_trajectory_pseutoTime_gene_expression_{fate_name}.{settings.file_format_figs}",
                    ),
                    width=fig_width,
                    height=fig_height,
                    verbose=False,
                )
                gplot.draw()
def test_lm_weights(self):
    """A weighted lm smooth with a formula matches the stored baseline image."""
    weighted = self.p + aes(weight='x.abs()')
    smoothed = weighted + stat_smooth(
        method='lm',
        formula='y ~ np.sin(x)',
        fill='red',
        se=True,
    )
    assert smoothed == 'lm_formula_weights'
# Center the running variable at the electoral cutoff (50% vote share).
lmb_data['demvoteshare_c'] = lmb_data['demvoteshare'] - 0.5
# drop missing values
lmb_data = lmb_data[~pd.isnull(lmb_data.demvoteshare_c)]
lmb_data['demvoteshare_sq'] = lmb_data['demvoteshare_c']**2

# aggregating the data: restrict to close elections (within 5pp of the cutoff)
lmb_data = lmb_data[lmb_data.demvoteshare.between(.45, .55)]
# Bin the lagged vote share into 100 equal-width bins and average score per bin.
lmb_data['lagdemvoteshare_100'] = pd.cut(lmb_data.lagdemvoteshare, 100)
agg_lmb_data = lmb_data.groupby('lagdemvoteshare_100')['score'].mean().reset_index()
# Side-of-cutoff indicator so the smoother is fit separately on each side.
# (Vectorized; replaces a per-row list comprehension with identical values.)
lmb_data['gg_group'] = (lmb_data.lagdemvoteshare > .5).astype(int)
# Bin midpoints for plotting the aggregated means.
# NOTE(review): assumes the 100 cut bins all appear in the groupby result —
# verify, otherwise this length-100 assignment raises.
agg_lmb_data['lagdemvoteshare'] = np.arange(0.01, 1.01, .01)

# plotting: binned means plus a quadratic fit on each side of the cutoff
p.ggplot(lmb_data, p.aes('lagdemvoteshare', 'score')) + \
    p.geom_point(p.aes(x = 'lagdemvoteshare', y = 'score'), data = agg_lmb_data) + \
    p.stat_smooth(p.aes('lagdemvoteshare', 'score', group = 'gg_group'), data=lmb_data,
                  method = "lm", formula = 'y ~ x + I(x**2)') + \
    p.xlim(0,1) + p.ylim(0,100) + \
    p.geom_vline(xintercept = 0.5)

# Same plot with a local (lowess) smoother on each side
p.ggplot(lmb_data, p.aes('lagdemvoteshare', 'score')) + \
    p.geom_point(p.aes(x = 'lagdemvoteshare', y = 'score'), data = agg_lmb_data) + \
    p.stat_smooth(p.aes('lagdemvoteshare', 'score', group = 'gg_group'), data=lmb_data,
                  method = "lowess") + \
    p.xlim(0,1) + p.ylim(0,100) + \
    p.geom_vline(xintercept = 0.5)

# Same plot with a simple linear fit on each side
p.ggplot(lmb_data, p.aes('lagdemvoteshare', 'score')) + \
    p.geom_point(p.aes(x = 'lagdemvoteshare', y = 'score'), data = agg_lmb_data) + \
    p.stat_smooth(p.aes('lagdemvoteshare', 'score', group = 'gg_group'), data=lmb_data,
                  method = "lm") + \
    p.xlim(0,1) + p.ylim(0,100) + \
    p.geom_vline(xintercept = 0.5)
x['k'], x['resubAccuracy'], x['testAccuracy'])
                                  for x in repeatedKnnResults],
                                 columns = ['p', 'k', 'resubAccuracy', 'testAccuracy'])

# Long-format frame: one row per (p, k) with either the resubstitution or the
# held-out test accuracy, stacked so 'type' can drive color/linetype mappings.
ggdata = pd.concat(
        [DataFrame({'p' : knnResultsSimplified.p,
                    'k' : knnResultsSimplified.k.apply(int),
                    'type' : 'resub',
                    'Accuracy' : knnResultsSimplified.resubAccuracy}),
         DataFrame({'p' : knnResultsSimplified.p,
                    'k' : knnResultsSimplified.k.apply(int),
                    'type' : 'test',
                    'Accuracy' : knnResultsSimplified.testAccuracy})],
        axis = 0
)

plt.close()
# Accuracy vs p (log-scaled), one facet per k, with smoothed trend lines
# overlaid on the raw points for both accuracy types.
ggo = gg.ggplot(ggdata, gg.aes(x='p', y='Accuracy',
                               color='type', group='type', linetype='type'))
ggo += gg.facet_wrap('~ k')
ggo += gg.scale_x_log10()
ggo += gg.geom_point(alpha=0.6)
ggo += gg.stat_smooth()
ggo += gg.theme_bw()
print(ggo)
#
# (C) Copyright 2021 Pavel Tisnovsky
#
# All rights reserved. This program and the accompanying materials
# are made available under the terms of the Eclipse Public License v1.0
# which accompanies this distribution, and is available at
# http://www.eclipse.org/legal/epl-v10.html
#
# Contributors:
#     Pavel Tisnovsky
#

from plotnine import ggplot, geom_point, aes, stat_smooth, facet_wrap
from plotnine.data import mtcars

# Scatter of weight vs mpg colored by gear count, with a linear fit
# per group and one facet per gear value; printing renders the chart.
chart = (
    ggplot(mtcars, aes("wt", "mpg", color="factor(gear)"))
    + geom_point()
    + stat_smooth(method="lm")
    + facet_wrap("~gear")
)
print(chart)
'set' + pd.Series(['1'] * anscombe.shape[0] +
                  ['2'] * anscombe.shape[0] +
                  ['3'] * anscombe.shape[0] +
                  ['4'] * anscombe.shape[0]).values
})

anscombe.head()

plt.close()
# Scatter of all four Anscombe sets, one facet per set.
ggo = gg.ggplot(anscombe, gg.aes(x='x', y='y')) +\
      gg.facet_wrap('~ set') +\
      gg.geom_point() +\
      gg.theme_bw()
print(ggo)
# ggo.save('anscombe_points.pdf', format='pdf', height=5, width=5)

plt.close()
# Same panels with a least-squares fit overlaid on each set.
ggo += gg.stat_smooth(method='lm')
print(ggo)
## ggo.save('anscombe_lm.pdf', format='pdf', height=5, width=5)

## seaborn's lmplot function often useful in same situations
## one would want stat_smooth in R with ggplot2
plt.close()
sns.lmplot(data=anscombe, x='x', y='y', col='set')

plt.close()
# Robust fit (downweights outliers), without confidence bands.
sns.lmplot(data=anscombe, x='x', y='y', col='set', robust=True, ci=None)

plt.close()
# Locally-weighted (lowess) smoother variant.
sns.lmplot(data=anscombe, x='x', y='y', col='set', lowess=True)

## -----------------------------------------------------------------
# -*- coding: utf-8 -*-
"""
Created on Wed Oct 14 08:12:35 2020

@author: Ashish
Using plotnine library for plotting ggplot2 style graphics
"""
from plotnine import ggplot, geom_point, aes, stat_smooth, facet_wrap
from plotnine.data import mtcars

# Assemble the figure: one panel per gear value, raw points plus a
# per-group linear fit.  (Renamed from `plt`, which reads as matplotlib.)
figure = (
    ggplot(mtcars, aes('wt', 'mpg', color='factor(gear)'))
    + geom_point()
    + stat_smooth(method='lm')
    + facet_wrap('~gear')
)

# show the plot
print(figure)
#
# (C) Copyright 2021 Pavel Tisnovsky
#
# All rights reserved. This program and the accompanying materials
# are made available under the terms of the Eclipse Public License v1.0
# which accompanies this distribution, and is available at
# http://www.eclipse.org/legal/epl-v10.html
#
# Contributors:
#     Pavel Tisnovsky
#

from plotnine import ggplot, geom_point, aes, stat_smooth
from plotnine.data import mtcars

# Build the chart incrementally: scatter of weight vs mpg colored by
# gear count, with a linear fit per group.
figure = ggplot(mtcars, aes("wt", "mpg", color="factor(gear)"))
figure = figure + geom_point()
figure = figure + stat_smooth(method="lm")

# Write the rendered chart to disk.
figure.save("10.png")
def plot_char_percent_vs_accuracy_smooth(self, expo=False, no_models=False, columns=False):
    """Plot accuracy versus the fraction of the question revealed.

    Args:
        expo: when True, overlay human gameplay curves loaded from
            ``data/external/all_human_gameplay.json`` (if present) and
            facet the plot by guessing model.
        no_models: plot only the human curves.  NOTE(review): this branch
            requires the gameplay file to exist, as in the original.
        columns: stack the facets in one column instead of one row.

    Returns:
        A plotnine ggplot object (caller renders/saves it).
    """
    if expo:
        gameplay_path = 'data/external/all_human_gameplay.json'

        def cumulative_accuracy_df(correct_positions, wrong_positions, dataset, name):
            # Sort every buzz position and compute the running fraction of
            # correct answers up to each position.
            positions = np.array(correct_positions + wrong_positions)
            results = np.array(
                len(correct_positions) * [1] + len(wrong_positions) * [0])
            order = np.argsort(positions)
            sorted_results = results[order]
            df = pd.DataFrame({
                'correct': sorted_results.cumsum() / sorted_results.shape[0],
                'char_percent': positions[order],
            })
            df['Dataset'] = dataset
            # Leading space sorts the human labels ahead of model names.
            df['Guessing_Model'] = f' {name}'
            return df

        if os.path.exists(gameplay_path):
            with open(gameplay_path) as f:
                all_gameplay = json.load(f)
            frames = []
            for event, name in [('parents', 'Dilettante'),
                                ('maryland', 'Expert'),
                                ('live', 'National')]:
                gameplay = all_gameplay[event]
                if event != 'live':
                    frames.append(cumulative_accuracy_df(
                        gameplay['control_correct_positions'],
                        gameplay['control_wrong_positions'],
                        'Test Questions', name))
                # BUG FIX: the original rebuilt the adversarial positions from
                # the *control* positions (copy-paste), plotting the challenge
                # curve against the wrong x values; use the adversarial ones.
                frames.append(cumulative_accuracy_df(
                    gameplay['adv_correct_positions'],
                    gameplay['adv_wrong_positions'],
                    'Challenge Questions', name))
            human_df = pd.concat(frames)

        if no_models:
            p = ggplot(human_df) + geom_line()
        else:
            p = ggplot(self.char_plot_df)
            if os.path.exists(gameplay_path):
                p = p + geom_line(data=human_df)
        if columns:
            facet_conf = facet_wrap('Guessing_Model', ncol=1)
        else:
            facet_conf = facet_wrap('Guessing_Model', nrow=1)
        p = (p + facet_conf
             + aes(x='char_percent', y='correct', color='Dataset')
             + stat_smooth(method='mavg', se=False, method_args={'window': 400})
             + scale_y_continuous(breaks=np.linspace(0, 1, 11))
             + scale_x_continuous(breaks=[0, .5, 1])
             + xlab('Percent of Question Revealed')
             + ylab('Accuracy')
             + theme(legend_position='top',
                     legend_box_margin=0,
                     legend_title=element_blank(),
                     strip_text_x=element_text(margin={
                         't': 6, 'b': 6, 'l': 1, 'r': 5
                     })))
        return p
    else:
        # Model-only view: one smoothed curve per guessing model.
        return (
            ggplot(self.char_plot_df)
            + aes(x='char_percent', y='correct', color='Guessing_Model')
            + stat_smooth(method='mavg', se=False, method_args={'window': 500})
            + scale_y_continuous(breaks=np.linspace(0, 1, 21)))