def plot_pointplot(plot_df, y_axis_label="", use_log10=False, limits=(0, 3.2)):
    """
    Plots the pointplot

    Arguments:
        plot_df - the dataframe that contains the odds ratio and lemmas
        y_axis_label - the label for the y axis
        use_log10 - use log10 for the y axis?
        limits - the y-axis limits used when use_log10 is False
    """
    graph = (
        p9.ggplot(plot_df, p9.aes(x="lemma", y="odds_ratio"))
        + p9.geom_pointrange(
            p9.aes(ymin="lower_odds", ymax="upper_odds"),
            position=p9.position_dodge(width=1),
            size=0.3,
            color="#253494",
        )
        + p9.scale_x_discrete(
            limits=plot_df.sort_values("odds_ratio", ascending=True).lemma.tolist()
        )
        + (p9.scale_y_log10() if use_log10 else p9.scale_y_continuous(limits=limits))
        + p9.geom_hline(p9.aes(yintercept=1), linetype='--', color='grey')
        + p9.coord_flip()
        + p9.theme_seaborn(context='paper', style="ticks", font_scale=1, font='Arial')
        + p9.theme(
            # 640 x 480
            figure_size=(6.66, 5),
            panel_grid_minor=p9.element_blank(),
            axis_title=p9.element_text(size=12),
            axis_text_x=p9.element_text(size=10),
        )
        + p9.labs(x=None, y=y_axis_label)
    )
    return graph
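# A minimal usage sketch for plot_pointplot, assuming plotnine is imported as
# p9 and pandas as pd; the column values below are invented for illustration.
example_df = pd.DataFrame({
    "lemma": ["cell", "gene", "protein"],
    "odds_ratio": [1.2, 0.8, 2.5],
    "lower_odds": [1.0, 0.6, 2.1],
    "upper_odds": [1.4, 1.0, 2.9],
})
g = plot_pointplot(example_df, y_axis_label="Odds ratio", use_log10=True)
g.save("pointplot_example.png")  # plotnine ggplot objects support .save()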
def test_annotation_logticks_coord_flip():
    p = (ggplot(df, aes('x', 'x'))
         + annotation_logticks(sides='b', size=.75)
         + geom_point()
         + scale_x_log10()
         + scale_y_log10()
         + coord_flip()
         + theme(panel_grid_minor=element_line(color='green'),
                 panel_grid_major=element_line(color='red'))
         )

    assert p == 'annotation_logticks_coord_flip'
def plot_company_versus_sector(
    df: pd.DataFrame, stock: str, sector: str  # pylint: disable=unused-argument
) -> p9.ggplot:
    if df is None or len(df) < 1:
        print("No data for stock vs. sector plot... ignored")
        return None
    df["date"] = pd.to_datetime(df["date"])
    plot = (
        p9.ggplot(df, p9.aes("date", "value", group="group", color="group", fill="group"))
        + p9.geom_line(size=1.5)
    )
    # If there are more than two orders of magnitude between the best and worst
    # performance, use a log scale to improve comparability. A pseudo-log
    # transform is used because a plain log scale cannot handle the negative
    # values of stocks that are losing money.
    if max(df['value']) - min(df['value']) > 100.0:
        plot += p9.scale_y_log10(trans=transforms.pseudo_log_trans)
    return user_theme(
        plot,
        y_axis_label="Change since start (%)",
        subplots_adjust={"right": 0.8},
        legend_position="right",
    )
def scatter_plot(df, xcol, ycol, domain, xname=None, yname=None, log=False,
                 width=6, height=6, clamp=True, tickCount=5):
    assert len(domain) == 2

    POINT_SIZE = 0.5
    DASH_PATTERN = (0, (3, 1))

    if xname is None:
        xname = xcol
    if yname is None:
        yname = ycol

    # formatter for axes' labels
    ax_formatter = mizani.custom_format('{:n}')

    if clamp:  # clamp overflowing values if required
        df = df.copy(deep=True)
        df.loc[df[xcol] > domain[1], xcol] = domain[1]
        df.loc[df[ycol] > domain[1], ycol] = domain[1]

    # generate scatter plot
    scatter = p9.ggplot(df)
    scatter += p9.aes(x=xcol, y=ycol)
    scatter += p9.geom_point(size=POINT_SIZE, na_rm=True)
    scatter += p9.labs(x=xname, y=yname)

    if log:  # log scale
        scatter += p9.scale_x_log10(limits=domain, labels=ax_formatter)
        scatter += p9.scale_y_log10(limits=domain, labels=ax_formatter)
    else:
        scatter += p9.scale_x_continuous(limits=domain, labels=ax_formatter)
        scatter += p9.scale_y_continuous(limits=domain, labels=ax_formatter)

    # scatter += p9.theme_xkcd()
    scatter += p9.theme_bw()
    scatter += p9.theme(panel_grid_major=p9.element_line(color='#666666', alpha=0.5))
    scatter += p9.theme(figure_size=(width, height))

    # generate additional lines
    scatter += p9.geom_abline(intercept=0, slope=1, linetype=DASH_PATTERN)  # diagonal
    scatter += p9.geom_vline(xintercept=domain[1], linetype=DASH_PATTERN)  # vertical rule
    scatter += p9.geom_hline(yintercept=domain[1], linetype=DASH_PATTERN)  # horizontal rule

    return scatter
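# A hypothetical call to scatter_plot, assuming plotnine is imported as p9 and
# pandas as pd; the timing values below are made up for illustration.
demo = pd.DataFrame({
    "tool_a": [0.5, 2.0, 40.0, 900.0],
    "tool_b": [0.7, 1.5, 60.0, 1200.0],  # 1200 gets clamped to the domain max
})
fig = scatter_plot(demo, "tool_a", "tool_b", domain=[0.1, 1000], log=True)
fig.save("scatter_example.png")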
def test_annotation_logticks():
    # The grid should align with the logticks
    p = (ggplot(df, aes('x', 'x'))
         + annotation_logticks(sides='b', size=.75)
         + geom_point()
         + scale_x_log10()
         + scale_y_log10()
         + theme(panel_grid_minor=element_line(color='green'),
                 panel_grid_major=element_line(color='red'))
         )

    assert p == 'annotation_logticks'
def multiplot(files, smooth=100, alpha=0.6, loss_padd=None):
    if not isinstance(files, dict):
        # a single file was passed; key it by its own name so the
        # .items() call below works (the original wrapped it in a list,
        # which has no .items())
        files = {str(files): files}

    def load_hist(entry):
        name, file = entry
        try:
            hist = np.loadtxt(file)
        except OSError:
            warn = "{} could not be loaded with np.loadtxt({})."
            warnings.warn(warn.format(name, file), UserWarning)
            return name, None
        is_fine = np.isfinite(hist)
        if not is_fine.any():
            return name, None
        iters = np.where(is_fine)[0]
        hist = hist[is_fine]
        lb = min(hist)
        if loss_padd is not None and lb < 0:
            hist += loss_padd - lb
            lb = loss_padd
        ldf = pd.DataFrame({
            "loss": hist,
            "iteration": iters,
            "model": [name] * len(hist),
        })
        if smooth is not False:
            if lb > 0:
                ldf["sloss"] = np.exp(gaussian_filter1d(np.log(hist), sigma=smooth))
            else:
                ldf["sloss"] = gaussian_filter1d(hist, sigma=smooth)
        return name, ldf

    tasks = list(files.items())
    frames = []
    with mp.Pool() as pool:
        for name, ldf in tqdm(pool.imap(load_hist, tasks),
                              total=len(tasks), desc="models"):
            if ldf is not None:
                frames.append(ldf)
    # DataFrame.append was removed in pandas 2.0; concatenate once instead
    df = pd.concat(frames) if frames else pd.DataFrame()

    def breaks(limits):
        # helper for custom log-scale breaks (not currently applied below)
        ll = np.log10(limits)
        if (ll[1] - ll[0]) > 3:
            ll = np.round(ll)
            ex = np.round(np.linspace(ll[0], ll[1], 10))
        else:
            ex = np.linspace(ll[0], ll[1], 10)
        return 10.0 ** ex

    pl = (pn.ggplot(pn.aes("iteration", "loss", color="model"), df)
          + pn.geom_line(alpha=alpha)
          + pn.scale_y_log10()
          + pn.theme_minimal())
    if smooth is not False:
        pl += pn.geom_line(pn.aes(y="sloss"), size=1, alpha=alpha)
    return pl, df
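# A hypothetical call to multiplot, assuming each value is a plain-text loss
# history readable by np.loadtxt; the paths below are placeholders.
histories = {
    "baseline": "logs/baseline_loss.txt",
    "wide": "logs/wide_loss.txt",
}
plot, losses = multiplot(histories, smooth=50, alpha=0.8)
plot.save("losses.png")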
def test_annotation_logticks_coord_flip_discrete():
    df2 = df.assign(discrete=pd.Categorical(['A' + str(a) for a in df['x']]))

    p = (ggplot(df2, aes('discrete', 'x'))
         + annotation_logticks(sides='l', size=.75)
         + geom_point()
         + scale_y_log10()
         + coord_flip()
         + theme(panel_grid_minor=element_line(color='green'),
                 panel_grid_major=element_line(color='red'))
         )

    assert p == 'annotation_logticks_coord_flip_discrete'
def plot_violin_plots(
    par_id: str,
    dims: List[str],
    draws: Dict,
    log_scale_variables: List[str],
    units: Dict[str, str],
    confidence_intervals,
    measurements,
):
    """Plot and save violin plots of parsed distributions.

    :param par_id: Name of the parameter plotted
    :param dims: Dimensions of the parameter
    :param draws: pd.DataFrame of parameter distribution indexed by dimensions
        and containing the population samples
    :param log_scale_variables: Parameters that are log-distributed
    :param units: Dictionary of units for each parameter
    :param confidence_intervals: Dictionary of confidence-interval dataframes
        (with lower_ci/upper_ci columns), keyed by parameter
    :param measurements: Dictionary of measurement dataframes, keyed by parameter
    """
    par_units = units[par_id]
    x = fill = dims[0] if len(dims) <= 1 else "experiments"
    plot = (
        p9.ggplot(data=draws)
        + p9.geom_violin(
            p9.aes(y=f"{par_id}", x=x, fill=fill),
            position="identity",
            color="None",
            size=0.5,
            alpha=0.7,
            weight=0.7,
            linetype="None",
        )
        + p9.labels.ylab(f"{par_id} {par_units}")
    )
    if par_id in confidence_intervals:
        plot += p9.geoms.geom_errorbar(
            p9.aes(x=x, ymin="lower_ci", ymax="upper_ci"),
            data=confidence_intervals[par_id],
            width=0.1,
        )
    if par_id in measurements and len(measurements[par_id]) > 0:
        plot += p9.geoms.geom_point(
            p9.aes(y="measurement", x=x),
            data=measurements[par_id],
        )
    if len(dims) == 1:
        plot += p9.themes.theme(axis_text_x=p9.element_text(angle=70))
    if len(dims) > 1:
        plot += p9.facet_wrap(f"~{dims[1]}") + p9.themes.theme(
            panel_spacing_y=0.05,
            panel_spacing_x=0.35,
            axis_title=p9.element_text(size=10),
            axis_text=p9.element_text(size=11),
            axis_text_y=p9.element_text(size=8, angle=45),
            axis_title_x=p9.element_blank(),
            axis_text_x=p9.element_blank(),
        )
    if par_id in log_scale_variables:
        plot += p9.scale_y_log10()
    return plot
def scatter_plot2(df1, df2, xcol, ycol, domain, color1='black', color2='red',
                  xname=None, yname=None, log=False, width=6, height=6,
                  clamp=True, tickCount=5):
    assert len(domain) == 2

    POINT_SIZE = 1.5
    DASH_PATTERN = (0, (6, 2))

    if xname is None:
        xname = xcol
    if yname is None:
        yname = ycol

    # formatter for axes' labels
    ax_formatter = mizani.custom_format('{:n}')

    if clamp:  # clamp overflowing values if required
        df1 = df1.copy(deep=True)
        df1.loc[df1[xcol] > domain[1], xcol] = domain[1]
        df1.loc[df1[ycol] > domain[1], ycol] = domain[1]
        df2 = df2.copy(deep=True)
        df2.loc[df2[xcol] > domain[1], xcol] = domain[1]
        df2.loc[df2[ycol] > domain[1], ycol] = domain[1]

    # generate scatter plot
    scatter = p9.ggplot(df1)
    scatter += p9.aes(x=xcol, y=ycol)
    scatter += p9.geom_point(size=POINT_SIZE, na_rm=True, color=color1, alpha=0.5)
    scatter += p9.geom_point(size=POINT_SIZE, na_rm=True, data=df2, color=color2, alpha=0.5)
    scatter += p9.labs(x=xname, y=yname)

    # rug plots
    scatter += p9.geom_rug(na_rm=True, sides="tr", color=color1, alpha=0.05)
    scatter += p9.geom_rug(na_rm=True, sides="tr", data=df2, color=color2, alpha=0.05)

    if log:  # log scale
        scatter += p9.scale_x_log10(limits=domain, labels=ax_formatter)
        scatter += p9.scale_y_log10(limits=domain, labels=ax_formatter)
    else:
        scatter += p9.scale_x_continuous(limits=domain, labels=ax_formatter)
        scatter += p9.scale_y_continuous(limits=domain, labels=ax_formatter)

    # scatter += p9.theme_xkcd()
    scatter += p9.theme_bw()
    scatter += p9.theme(panel_grid_major=p9.element_line(color='#666666', alpha=0.5))
    scatter += p9.theme(panel_grid_minor=p9.element_blank())
    scatter += p9.theme(figure_size=(width, height))
    scatter += p9.theme(text=p9.element_text(size=24, color="black"))

    # generate additional lines
    scatter += p9.geom_abline(intercept=0, slope=1, linetype=DASH_PATTERN)  # diagonal
    scatter += p9.geom_vline(xintercept=domain[1], linetype=DASH_PATTERN)  # vertical rule
    scatter += p9.geom_hline(yintercept=domain[1], linetype=DASH_PATTERN)  # horizontal rule

    return scatter
def plot_scaling_log(plt_df: pd.DataFrame,
                     sweep_vars: Optional[Sequence[str]] = None,
                     with_baseline=True) -> gg.ggplot:
    """Plot scaling of learning time against exponential baseline."""
    p = _base_scaling(plt_df, sweep_vars, with_baseline)
    p += gg.scale_x_log10(breaks=[5, 10, 20, 50])
    p += gg.scale_y_log10(breaks=[100, 300, 1000, 3000, 10000, 30000])
    p += gg.xlab('deep sea problem size (log scale)')
    p += gg.ylab('#episodes until < 90% bad episodes (log scale)')
    return plotting.facet_sweep_plot(p, sweep_vars)
def plot_compare(stats, variant, variant_baseline, metric, mode="identity", jitter=0.01):
    assert mode in ["identity", "ratio", "difference"]
    plotdata = compare_stats(stats, variant, variant_baseline)
    bsw = bsw_table2(plotdata, metric=metric, reltol=1.0)
    display(bsw)

    baseline_name = f"{metric}_baseline"
    plotdata = plotdata[[metric, baseline_name, "dataset"]].assign(
        ratio=plotdata[metric] / plotdata[baseline_name],
        difference=plotdata[metric] - plotdata[baseline_name],
    )
    if mode == "identity":
        return (ggplot(data=plotdata)
                + geom_jitter(
                    aes(x=f"{metric}_baseline", y=metric, fill="dataset"),
                    width=jitter,
                    height=jitter,
                )
                + scale_x_log10()
                + scale_y_log10()
                + geom_abline(aes(slope=1, intercept=0)))
    elif mode == "ratio":
        return (ggplot(data=plotdata)
                + geom_jitter(
                    aes(x=f"{metric}_baseline", y="ratio", fill="dataset"),
                    width=jitter,
                    height=jitter,
                )
                + scale_x_log10()
                + scale_y_log10()
                # ablines are drawn wrt the already log-transformed axes,
                # hence 0 = log10(1) on these scales
                + geom_abline(aes(slope=0, intercept=0.0))
                + geom_abline(aes(slope=-1, intercept=0.0)))  # max
    elif mode == "difference":
        return (ggplot(data=plotdata)
                + geom_jitter(
                    aes(x=f"{metric}_baseline", y="difference", fill="dataset"),
                    width=jitter,
                    height=jitter,
                )
                + scale_x_log10()
                + scale_y_log10()
                + geom_abline(aes(slope=0, intercept=0)))
    else:
        assert False, "unknown mode"
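# The comment on ablines above is worth unpacking: with scale_x_log10 and
# scale_y_log10, geom_abline operates on the transformed coordinates, so the
# line log10(y) = a + b*log10(x) corresponds to the power law y = 10**a * x**b.
# A small sanity check of that equivalence, using invented values:
import numpy as np

a, b = 0.0, -1.0  # intercept and slope on the log-log axes
x = np.array([0.1, 1.0, 10.0])
y_from_line = 10.0 ** (a + b * np.log10(x))
assert np.allclose(y_from_line, 1.0 / x)  # slope -1 through 0 is y = 1/x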
def test_annotation_logticks_faceting():
    n = len(df)
    df2 = pd.DataFrame({
        'x': np.hstack([df['x'], df['x']]),
        'g': list('a' * n + 'b' * n)
    })

    p = (ggplot(df2)
         + annotation_logticks(sides='b', size=.75)
         + geom_point(aes('x', 'x'))
         + scale_x_log10()
         + scale_y_log10()
         + facet_wrap('g')
         + theme(panel_grid_minor=element_line(color='green'),
                 panel_grid_major=element_line(color='red'))
         )

    assert p == 'annotation_logticks_faceting'
def plot_replicates_log_axes(self):
    """
    Plots replicate traces from a single run on logarithmic axes to
    determine the baseline metabolic charge production or other
    stabilization.
    """
    from plotnine import (ggplot, ylab, xlab, geom_line, aes,
                          scale_y_log10, scale_x_log10)

    plot = (ggplot(self.data, aes('Time', 'Current', color='Channel'))
            + ylab(u'Current (μA)')
            + xlab('Time (seconds)')
            + geom_line()
            + scale_y_log10()
            + scale_x_log10())
    print(plot)
    return plot
def kernel_stats(inFile, log_scale=True):
    par = get_params(inFile)
    n_kernel = 0
    for var in sorted(par["means"]):
        n_kernel += "mus_f" in var
    tf = pm.distributions.transforms.StickBreaking()
    dfs = list()
    for tissue_type in ["t", "f"]:
        weights = tf.backward(
            par["means"][f"w_{tissue_type}_stickbreaking__"]).eval()
        n_dim = par["means"][f"x_{tissue_type}"].shape[1]
        volumes = list()
        for kernel in range(n_kernel):
            # get covariance ellipse parameters
            packed_cov = par["means"][
                f"packed_L_{tissue_type}_{kernel}_cholesky-cov-packed__"]
            lower = pm.expand_packed_triangular(n_dim, packed_cov, lower=True).eval()
            cov = np.dot(lower, lower.T)
            volume = np.linalg.det(cov)
            volumes.append(volume)
        type_df = pd.DataFrame(
            {
                "tissue": "tumor" if tissue_type == "t" else "non-tumor",
                "weight": weights,
                "volume": volumes,
            },
            index=[f"kernel {i}" for i in range(n_kernel)],
        )
        dfs.append(type_df)
    df = pd.concat(dfs)

    pl = (pn.ggplot(pn.aes("volume", "weight", color="tissue"), df)
          + pn.geom_point())
    if log_scale:
        pl += pn.scale_y_log10()
        pl += pn.scale_x_log10()
    pl += pn.theme_minimal()
    return pl, df
def plot(self, plotDat, tag=None, log=True, by='cell_type', data_set=None,
         title=None, alpha=.4):
    pDat = plotDat.copy()
    gcorr = pearsonr(pDat.measured, pDat.prediction)[0]
    corrs = pDat.groupby(pDat[by]).apply(
        lambda x: pearsonr(x.measured, x.prediction)[0])
    pDat['corr'] = corrs[pDat[by]].values
    by_str = '{}_pearson'.format(by)
    pDat[by_str] = pDat.apply(
        lambda x: '{} {:.2f}'.format(x[by], corrs[x[by]]), axis=1)
    if data_set:
        pDat = pDat.loc[pDat['dataset_name'] == data_set]
    pl = (pn.ggplot(pn.aes('measured', 'prediction', color=by_str), pDat)
          + pn.geom_point(alpha=alpha)
          + pn.stat_smooth(
              mapping=pn.aes('measured', 'prediction', color=by_str),
              method='lm', geom='line', alpha=0.5, se=False,
              inherit_aes=False))
    if len(pDat['sample'].unique()) < 10:
        pl = pl + pn.aes(shape='sample')
    else:
        pl = pl + pn.aes(shape='dataset_name')
    if log:
        pl = pl + pn.scale_x_log10() + pn.scale_y_log10()
    if title is not None:
        pl = pl + pn.ggtitle(title)
    elif tag is not None:
        pl = pl + pn.ggtitle('{} pearson={}'.format(tag, gcorr))
    else:
        pl = pl + pn.ggtitle('pearson={}'.format(gcorr))
    return pl
def plot_box_plots(var, draws, measurements, variable_id_map):
    """Return plotnine.geoms.geom_boxplot of given variable."""
    plot = p9.ggplot(data=draws[var]) + p9.geom_boxplot(
        p9.aes(x=variable_id_map[var], y=var, fill=variable_id_map[var]),
        outlier_shape="",
    )
    if not measurements[var].empty:
        plot += p9.geoms.geom_point(
            p9.aes(y="measurement", x=variable_id_map[var]),
            data=measurements[var],
        )
    if var != "flux":
        plot += p9.scale_y_log10()
    plot += p9.facet_wrap("~experiments") + p9.themes.theme(
        panel_spacing_y=0.05,
        panel_spacing_x=0.35,
        axis_title=p9.element_text(size=10),
        axis_text=p9.themes.element_text(size=11),
    )
    if var == "flux":
        plot += p9.scale_y_continuous(
            breaks=np.arange(-0.001, 0.002, 0.00025), limits=[-0.001, 0.002])
    plot += p9.theme(axis_text_x=p9.themes.element_text(rotation=90, size=6))
    return plot
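# A hypothetical call to plot_box_plots; the dictionaries below only mimic the
# expected shapes (dataframes keyed by variable name) with invented data.
draws = {"conc": pd.DataFrame({
    "conc": [0.1, 0.2, 0.15, 0.3],
    "metabolite": ["atp", "atp", "adp", "adp"],
    "experiments": ["e1", "e1", "e1", "e1"],
})}
measurements = {"conc": pd.DataFrame({
    "measurement": [0.12, 0.28],
    "metabolite": ["atp", "adp"],
    "experiments": ["e1", "e1"],
})}
variable_id_map = {"conc": "metabolite"}
box = plot_box_plots("conc", draws, measurements, variable_id_map)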
# In[6]:

# pd.np was removed in pandas 1.0; assumes numpy is imported as np
color_map = {
    "Existing": mcolors.to_hex(np.array([178, 223, 138, 255]) / 255),
    "Novel": mcolors.to_hex(np.array([31, 120, 180, 255]) / 255),
}

# In[7]:

g = (p9.ggplot(binned_df, p9.aes(x="precision", y="edges", color="in_hetionet"))
     + p9.geom_point()
     + p9.geom_line()
     + p9.scale_color_manual(values={
         "Existing": color_map["Existing"],
         "Novel": color_map["Novel"],
     })
     + p9.facet_wrap("relation")
     + p9.scale_y_log10()
     + p9.theme_bw())
print(g)

# In[8]:

g = (p9.ggplot(binned_df, p9.aes(x="precision", y="edges", fill="in_hetionet"))
     + p9.geom_bar(stat='identity', position='dodge')
     + p9.scale_fill_manual(values={
         "Existing": color_map["Existing"],
         "Novel": color_map["Novel"],
     })
     + p9.coord_flip()
     + p9.facet_wrap("relation")
     + p9.scale_y_log10()
     + p9.theme_bw()
     # theme tweaks must come after the complete theme, or theme_bw() resets them
     + p9.theme(figure_size=(12, 8), aspect_ratio=9))
print(g)

# In[9]:
        int(grouped_candidates_pred_df.hetionet.value_counts()[1]),
    "relation": "DaG"
})
datarows.append({
    "edges": (
        grouped_candidates_pred_df.query("pred_max > 0.5")
        .hetionet.value_counts()[0]
    ),
    "in_hetionet": "Novel",
    "relation": "DaG"
})
edges_df = pd.DataFrame.from_records(datarows)
edges_df

# In[20]:

import math

g = (p9.ggplot(edges_df, p9.aes(x="relation", y="edges", fill="in_hetionet"))
     + p9.geom_col(position="dodge")
     + p9.geom_text(
         p9.aes(label=edges_df.apply(
             lambda x: f"{x['edges']} ({x['recall']*100:.0f}%)"
             if not math.isnan(x['recall']) else f"{x['edges']}",
             axis=1)),
         position=p9.position_dodge(width=1),
         size=9,
         va="bottom")
     + p9.scale_y_log10()
     + p9.theme(axis_text_y=p9.element_blank(),
                axis_ticks_major=p9.element_blank(),
                rect=p9.element_blank()))
print(g)
df_bench = df_bench.append(
    {
        'set_nb': size,
        'algo': 'DL_noiseless_data',
        'time': stop_time - start_time
    },
    ignore_index=True)

df_bench['set_nb'] = df_bench['set_nb'].astype(int)

p = (ggplot(df_bench) + aes('set_nb', 'time', color='algo', group='algo')
     + geom_point() + geom_smooth(method='gpr', span=.3)
     + scale_x_continuous()
     + xlab("Number of sets") + ylab("Time (seconds)"))
p.save(filename=OUTPUT_ROOT + "scaling_fig4")

p = (ggplot(df_bench) + aes('set_nb', 'time', color='algo', group='algo')
     + geom_point() + geom_smooth(method='gpr', span=.3)
     + scale_x_continuous() + scale_y_log10()
     + xlab("Number of sets") + ylab("Time (seconds)"))
p.save(filename=OUTPUT_ROOT + "scaling_fig4_log10")

# Normalized time to minimum number of sets
min_nb_sets = min(SETS_NB)
minimum_time = df_bench[df_bench['set_nb'] == min_nb_sets][['algo', 'time']]
df_bench['time_relative'] = df_bench['time']  # Placeholder
for index, row in df_bench.iterrows():
    my_algo = row['algo']
    my_minimum_time = pd.to_numeric(
        minimum_time.loc[minimum_time['algo'] == my_algo]['time']).min()
    df_bench.at[index, 'time_relative'] = row['time'] / my_minimum_time

p = (ggplot(df_bench)
     + aes('set_nb', 'time_relative', color='algo', group='algo')
     + geom_point() + geom_smooth(method='gpr', span=.3)
     + scale_x_continuous() +
print([_.shape for _ in tss])
print(ts.shape)

# convert data into dataframe
df = arr2df(arrays, ts, titles)  # , n=50)

df2 = df.loc[:, ['tag', 'hr', 'v']].groupby(
    ['tag', 'hr']).quantile(q=[.9, .99, 1]).unstack()
df2.columns = ['q090', 'q099', 'q100']
df2 = df2.reset_index().melt(id_vars=['tag', 'hr'], var_name='q',
                             value_name='v')
df2['g'] = df2['tag'] + df2['q'].astype(str)

hrmax = df['hr'].max()
p = (ggplot(df2)
     + geom_line(aes('hr', 'v', color='tag', alpha='q', size='q', group='g'))
     + scale_x_continuous(
         breaks=np.arange(0, hrmax, 24),
         minor_breaks=np.arange(0, hrmax, 6),
     )
     + scale_color_brewer(type='qual', palette='Set1')
     + scale_alpha_manual(np.linspace(1, 0.2, num=3))
     + scale_size_manual([2, 1, .5])
     + labs(title=title, y='conc (ppb)'))
p.save(oname)

print(df2['v'].max())
pp = p + scale_y_log10(limits=[1.0, df2['v'].max()])
pp.save(oname[:-4] + '_log.png')
# Save
res.to_csv(FIGURE_DIRECTORY + "crm_res.tsv", sep='\t')
peakstats.to_csv(FIGURE_DIRECTORY + "peakstats.tsv", sep='\t')

"""
# To reload :
res = pd.read_csv(FIGURE_DIRECTORY+"crm_res.tsv", sep = '\t')
peakstats = pd.read_csv(FIGURE_DIRECTORY+"peakstats.tsv", sep = '\t')
"""

## --------- For the figures

p = (ggplot(data=res[0:10000],
            mapping=aes(x='nb_peaks_2020', y='nb_peaks_2018'))
     + geom_point(mapping=aes(color='average_atypeak_score'))
     + scale_x_log10() + scale_y_log10()
     + labs(x="Nb. peaks ReMap 2020", y="Nb. peaks ReMap 2018",
            color="Mean atyPeak score per CRM")
     + scale_color_gradient(low="red", high="blue"))
p.save(FIGURE_DIRECTORY + "crm_nb_peaks_update.pdf", verbose=False)

p = (ggplot(data=res[10000:13000],
            mapping=aes(x='nb_peaks_2018', y='update_ratio'))
     + geom_point(mapping=aes(color='average_atypeak_score'))
     + scale_x_log10() + scale_y_log10()
     + labs(x="Nb. peaks ReMap 2018", y="Nb. peaks ReMap 2020/2018",
            color="Mean atyPeak score per CRM")
     + scale_color_gradient(low="red", high="blue"))
p.save(FIGURE_DIRECTORY + "crm_update_ratio.pdf", verbose=False)
# %%
# Runs with small uploads/downloads look better with log scale.
use_y_log10 = max(data["MiB"]) <= 8.0

# %%
# A common facet for all plots
facet = p9.facet_grid("Op ~ Crc32cEnabled + MD5Enabled",
                      labeller="label_both",
                      scales="free_y")

# %%
plot = (p9.ggplot(data=data,
                  mapping=p9.aes(x="MiB", y="ElapsedSeconds", color="ApiName"))
        + p9.geom_point() + facet)
(plot + p9.scale_y_log10() if use_y_log10 else plot).save(
    args.output_prefix + ".elapsed-vs-size.png")

# %%
plot = (p9.ggplot(data=data,
                  mapping=p9.aes(x="MiB", y="CpuNanosPerByte", color="ApiName"))
        + p9.geom_point() + facet)
(plot + p9.scale_y_log10() if use_y_log10 else plot).save(
    args.output_prefix + ".cpu-vs-size.png")

# %%
(p9.ggplot(data=data, mapping=p9.aes(x="MiB", y="MiBs", color="ApiName"))
 + p9.geom_point() + facet).save(args.output_prefix + ".tp-vs-size.png")

# %%
(p9.ggplot(data=data, mapping=p9.aes(x="ApiName", y="MiBs", color="ApiName")) +
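# The conditional-scale pattern above generalizes; a small hypothetical helper
# (maybe_log10 is not part of plotnine) that appends scale_y_log10 only when
# requested, reusing the plot and args objects from the cells above:
def maybe_log10(plot, use_log10):
    """Return the plot with a log10 y-scale when requested, unchanged otherwise."""
    return plot + p9.scale_y_log10() if use_log10 else plot

maybe_log10(plot, use_y_log10).save(args.output_prefix + ".example.png")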
def rel_plot(sbs, variant, jitter=0.01):
    plotdata = sbs[sbs.variant == variant]
    xcol = "base"
    ycol = "ratio"
    plotdata = plotdata.assign(x=plotdata[xcol], y=plotdata[ycol])
    plotdata = plotdata.assign(sbs_index=plotdata.index.values)
    session_text = (plotdata[["session_index", "base_session_index"]]
                    .apply(tuple, axis=1)
                    .map(lambda tup: f"{tup[0]} vs. {tup[1]}"))
    plotdata = plotdata.assign(session_text=session_text)

    # reference curve y = 1/x for the log-log axes
    x = np.geomspace(0.02, 1, num=5)
    y = 1 / x
    diag_df = pd.DataFrame({"x": x, "y": y})

    scatterplot = (
        ggplot(plotdata)
        + geom_jitter(
            aes(x="x", y="y", fill="dataset", color="dataset"),
            width=jitter,
            height=jitter,
            alpha=0.6,
            size=1.0,
        )
        # shape=plotdata.dataset.map(lambda x: '.' if x in ['lvis', 'objectnet'] else 'o'),
        # size=plotdata.dataset.map(lambda x: 1. if x in ['lvis', 'objectnet'] else 2.))
        # + geom_text(aes(x='base', y='delta', label='category', color='dataset'), va='bottom',
        #             data=plotdata1[plotdata1.ratio < .6],
        #             position=position_jitter(.05, .05), show_legend=False)
        + geom_line(aes(x="x", y="y"), data=diag_df)
        # + geom_text(aes(x='x', y='y', label='session_text'), va='top',
        #             data=plotdata[(plotdata.y < .4) | (plotdata.y > 3)])
        + ylab(ycol)
        # + geom_area(aes(y2=1.1, y=.9), linetype='dashed', alpha=.7)
        + geom_hline(aes(yintercept=1.1), linetype="dashed", alpha=0.7)
        + geom_hline(aes(yintercept=0.9), linetype="dashed", alpha=0.7)
        + geom_vline(aes(xintercept=0.1), linetype="dashed", alpha=0.7)
        + geom_vline(aes(xintercept=0.3), linetype="dashed", alpha=0.7)
        # + geom_abline()
        # + geom_point(aes(x='recall', y='precision', color='variant'), size=1.)
        # + facet_wrap(facets=['cat'], ncol=6, scales='free_x')
        + xlab(xcol)
        # + scale_color_discrete()
        + theme(
            figure_size=(8, 5),
            legend_position="top",
            subplots_adjust={"hspace": 0.5},
            legend_title=element_blank(),
            legend_box_margin=-1,
            legend_margin=0.0,
            axis_text=element_text(size=12, margin={"t": 0.2, "l": -0.3}),
            legend_text=element_text(size=11),
            axis_title=element_text(size=12,
                                    margin={"r": -0.2, "b": 0.0, "l": 0, "t": 0.0}),
        )
        + scale_x_log10(labels=make_labeler(brief_format),
                        breaks=[0.01, 0.1, 0.3, 1.0])
        + scale_y_log10(labels=make_labeler(brief_format),
                        breaks=[0.5, 0.9, 1.1, 2.0, 3.0, 6, 12])
    )
    return scatterplot
df_bench = df_bench.append(
    {
        'lines': lines,
        'algo': 'DL',
        'time': stop_time - start_time
    },
    ignore_index=True)

df_bench['lines'] = df_bench['lines'].astype(int)

p = (ggplot(df_bench) + aes('lines', 'time', color='algo', group='algo')
     + geom_point() + geom_smooth(method='gpr', span=.3)
     + scale_x_continuous()
     + xlab("Number of lines") + ylab("Time (seconds)"))
p.save(filename=OUTPUT_ROOT + "scaling_fig5")

p = (ggplot(df_bench) + aes('lines', 'time', color='algo', group='algo')
     + geom_point() + geom_smooth(method='gpr', span=.3)
     + scale_x_continuous() + scale_y_log10()
     + xlab("Number of lines") + ylab("Time (seconds)"))
p.save(filename=OUTPUT_ROOT + "scaling_fig5_log10")

# Normalized time to minimum line number
min_nb_lines = min(LINES_NB)
minimum_time = df_bench[df_bench['lines'] == min_nb_lines][['algo', 'time']]
df_bench['time_relative'] = df_bench['time']  # Placeholder
for index, row in df_bench.iterrows():
    my_algo = row['algo']
    my_minimum_time = pd.to_numeric(
        minimum_time.loc[minimum_time['algo'] == my_algo]['time']).min()
    df_bench.at[index, 'time_relative'] = row['time'] / my_minimum_time

p = (ggplot(df_bench)
     + aes('lines', 'time_relative', color='algo', group='algo')
     + geom_point() + geom_smooth(method='gpr', span=.3)
     + scale_x_continuous() +
# Dict learning
start_time = time.time()
U_df, V_df, error = learn_dictionary_and_encode(X, n_atoms=k, alpha=ALPHA,
                                                n_jobs=1)
stop_time = time.time()

# NOTE: DataFrame.append was removed in pandas 2.0; on newer pandas use
# pd.concat([df_bench, pd.DataFrame([row])], ignore_index=True) instead.
df_bench = df_bench.append(
    {
        'scaling_factor': k,
        'algo': 'DL',
        'time': stop_time - start_time
    },
    ignore_index=True)

df_bench['scaling_factor'] = df_bench['scaling_factor'].astype(int)

p = (ggplot(df_bench)
     + aes('scaling_factor', 'time', color='algo', group='algo')
     + geom_point() + geom_smooth(method='gpr', span=.3)
     + scale_x_continuous()
     + xlab("Scaling factor (k)") + ylab("Time (seconds)"))
p.save(filename=OUTPUT_ROOT + "scaling_fig3")

p = (ggplot(df_bench)
     + aes('scaling_factor', 'time', color='algo', group='algo')
     + geom_point() + geom_smooth(method='gpr', span=.3)
     + scale_x_continuous() + scale_y_log10()
     + xlab("Scaling factor (k)") + ylab("Time (seconds)"))
p.save(filename=OUTPUT_ROOT + "scaling_fig3_log10")

# Normalized time to scaling factor of 1
minimum_time = df_bench[df_bench['scaling_factor'] == 1][['algo', 'time']]
df_bench['time_relative'] = df_bench['time']  # Placeholder
for index, row in df_bench.iterrows():
    my_algo = row['algo']
    my_minimum_time = pd.to_numeric(
        minimum_time.loc[minimum_time['algo'] == my_algo]['time']).min()
    df_bench.at[index, 'time_relative'] = row['time'] / my_minimum_time

p = (ggplot(df_bench)
     + aes('scaling_factor', 'time_relative', color='algo', group='algo')
     + geom_point() + geom_smooth(method='gpr', span=.3)
     + scale_x_continuous()
     + xlab("Scaling factor (k)") + ylab("Time (relative)"))