def plot():
    outdir = 'output/protobowl/'
    pathlib.Path(outdir).mkdir(parents=True, exist_ok=True)

    df = load_protobowl()
    df.result = df.result.apply(lambda x: x is True)
    df['log_n_records'] = df.user_n_records.apply(np.log)

    df_user_grouped = df.groupby('uid')
    user_stat = df_user_grouped.agg(np.mean)
    print('{} users'.format(len(user_stat)))
    print('{} records'.format(len(df)))
    max_color = user_stat.log_n_records.max()
    user_stat['alpha'] = pd.Series(
        user_stat.log_n_records.apply(lambda x: x / max_color),
        index=user_stat.index)

    # 2D user plot
    p0 = ggplot(user_stat) \
        + geom_point(aes(x='relative_position', y='result',
                         size='user_n_records', color='log_n_records',
                         alpha='alpha'),
                     show_legend={'color': False, 'alpha': False, 'size': False}) \
        + scale_color_gradient(high='#e31a1c', low='#ffffcc') \
        + labs(x='Average buzzing position', y='Accuracy') \
        + theme(aspect_ratio=1)
    p0.save(os.path.join(outdir, 'protobowl_users.pdf'))
    # p0.draw()
    print('p0 done')

    # histogram of number of records
    p1 = ggplot(user_stat, aes(x='log_n_records', y='..density..')) \
        + geom_histogram(color='#e6550d', fill='#fee6ce') \
        + geom_density() \
        + labs(x='Log number of records', y='Density') \
        + theme(aspect_ratio=0.3)
    p1.save(os.path.join(outdir, 'protobowl_hist.pdf'))
    # p1.draw()
    print('p1 done')

    # histogram of accuracy
    p2 = ggplot(user_stat, aes(x='result', y='..density..')) \
        + geom_histogram(color='#31a354', fill='#e5f5e0') \
        + geom_density() \
        + labs(x='Accuracy', y='Density') \
        + theme(aspect_ratio=0.3)
    p2.save(os.path.join(outdir, 'protobowl_acc.pdf'))
    # p2.draw()
    print('p2 done')

    # histogram of buzzing position
    p3 = ggplot(user_stat, aes(x='relative_position', y='..density..')) \
        + geom_histogram(color='#3182bd', fill='#deebf7') \
        + geom_density() \
        + labs(x='Average buzzing position', y='Density') \
        + theme(aspect_ratio=0.3)
    p3.save(os.path.join(outdir, 'protobowl_pos.pdf'))
    # p3.draw()
    print('p3 done')

def plot(i):
    c = colors[i]
    if i == 2:
        p = (qplot(x, y, color=c, xlab='x', ylab='y')
             + lims(color=(1, 7))
             + labs(color='color'))
    else:
        p = (qplot(x, y, stroke=c, xlab='x', ylab='y')
             + lims(stroke=(1, 7))
             + labs(stroke='stroke'))
    return p + theme_minimal()

def plot(i):
    return (qplot(x, y, color=colors[i], xlab='x', ylab='y')
            + lims(color=(1, 7))
            + labs(color='color')
            + theme_minimal()
            + _theme)

def test_theme_linedraw(self):
    p = self.g + labs(title='Theme Linedraw') + theme_linedraw()

    if six.PY2:
        # Small displacement in title
        assert p + _theme == ('theme_linedraw', {'tol': 8})
    else:
        assert p + _theme == 'theme_linedraw'

def plot(i):
    if i == 2:
        p = qplot(x, y, xlab='x', ylab='y')
    else:
        p = (qplot(x, y, color=colors[i], xlab='x', ylab='y')
             + lims(color=(1, 7))
             + labs(color='color'))
    return p + theme_minimal()

def test_theme_xkcd(self):
    p = self.g + labs(title='Theme Xkcd') + theme_xkcd()

    if os.environ.get('TRAVIS'):
        # Travis does not have the fonts, we still check
        # to catch any other errors
        assert p + _theme != 'theme_gray'
    else:
        assert p + _theme == 'theme_xkcd'

def plot(i):
    if i == 2:
        _lims = lims(color=(3, 7))
    else:
        _lims = lims(color=(1, 7))
    return (qplot(x, y, color=colors[i], xlab='x', ylab='y')
            + _lims
            + labs(color='color')
            + theme_minimal()
            + _theme)

def test_geometries(tmpdir):
    test_file = '{}/test_file.shp'.format(tmpdir)
    _create_test_input_files(test_file)
    df = GeoDataFrame.from_file(test_file)
    p = (ggplot(df)
         + aes(fill='geometry.bounds.miny')
         + geom_map()
         + geom_map(draw='Point', size=4)
         + geom_map(draw='LineString', size=2)
         + labs(fill='miny'))

    assert p + _theme == 'geometries'

def fit_curve(self):
    df, questions = load_protobowl()
    # convert prompt to false
    df.result = df.result.apply(lambda x: x is True)

    xy = list(zip(df.relative_position.tolist(), df.result.tolist()))
    xy = sorted(xy, key=lambda x: x[0])
    ratios = dict()
    cnt = 0
    for x, y in xy:
        x = int(x * 1000)
        ratios[x] = cnt
        cnt += y
    ratios = sorted(ratios.items(), key=lambda x: x[0])
    ratios = [(x / 1000, y) for x, y in ratios]
    # normalize by the total number of records
    # (a preceding assignment counting only correct answers was dead code
    # shadowed by this one)
    ttl_correct = len(xy)
    curve = [(x, 1 - y / ttl_correct) for x, y in ratios]
    X, y = list(map(list, zip(*curve)))
    X = np.asarray(X)
    y = np.asarray(y)

    degree = 3
    polynomial_features = PolynomialFeatures(degree=degree, include_bias=False)
    linear_regression = LinearRegression()
    pipeline = Pipeline([("polynomial_features", polynomial_features),
                         ("linear_regression", linear_regression)])
    pipeline.fit(X[:, np.newaxis], y)
    print(pipeline.steps[1][1].coef_)

    def get_weight(x):
        return pipeline.predict(np.asarray([[x]]))[0]

    ddf = pd.DataFrame({'x': X, 'y': y})
    p0 = ggplot(ddf, aes(x='x', y='y')) \
        + geom_point(size=0.3, color='blue', alpha=0.5, shape='+') \
        + stat_function(fun=get_weight, color='red', size=2, alpha=0.5) \
        + labs(x='Position', y='Weight')
    p0.save('output/reporting/curve_score.pdf')
    p0.draw()
    return pipeline

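# Hypothetical usage sketch (not from the source): the pipeline returned by
# fit_curve maps a relative buzzing position in [0, 1] to a curve weight;
# `reporter` is a placeholder for whatever object defines fit_curve.
#
#   pipeline = reporter.fit_curve()
#   weight_at_half = pipeline.predict(np.asarray([[0.5]]))[0]
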
def test_rect_aesthetics():
    p = (ggplot(df, aes(xmin='xmin', xmax='xmax',
                        ymin='ymin', ymax='ymax'))
         + geom_rect()
         + geom_rect(aes(ymin='ymin+2', ymax='ymax+2', alpha='z'),
                     show_legend=False)
         + geom_rect(aes(ymin='ymin+4', ymax='ymax+4', fill='factor(z)'))
         + geom_rect(aes(ymin='ymin+6', ymax='ymax+6', color='factor(z+1)'),
                     size=2)
         + geom_rect(aes(ymin='ymin+8', ymax='ymax+8', linetype='factor(z+2)'),
                     color='yellow', size=2)
         + _theme
         # for comparison with geom_tile which
         # has labels by default
         + labs(x='x', y='y'))

    assert p == 'rect-aesthetics'

    # pd.np is deprecated; assumes numpy is imported as np in this notebook
    'aupr_upper': lambda x: x.aupr_mean + (critical_val * x.aupr_std) / np.sqrt(x.lf_num_len),
    'aupr_lower': lambda x: x.aupr_mean - (critical_val * x.aupr_std) / np.sqrt(x.lf_num_len)
}))
dev_set_stats_df

# In[9]:

(p9.ggplot(dev_set_stats_df,
           p9.aes(x="factor(lf_num)", y="auroc_mean", color="model"))
 + p9.geom_point()
 + p9.geom_line(p9.aes(group="model"))
 + p9.geom_errorbar(p9.aes(ymin="auroc_lower", ymax="auroc_upper", group="model"))
 + p9.theme_seaborn()
 + p9.labs(title="DaG Tune Set AUROC", color="Model")
 + p9.scale_color_manual({
     "disc_model": "blue",
     "gen_model": "orange"
 })
 + p9.scale_y_continuous(limits=[0.4, 0.75]))

# In[10]:

(p9.ggplot(dev_set_stats_df,
           p9.aes(x="factor(lf_num)", y="aupr_mean", color="model"))
 + p9.geom_point()
 + p9.geom_line(p9.aes(group="model"))
 + p9.geom_errorbar(p9.aes(ymin="aupr_lower", ymax="aupr_upper", group="model"))
 + p9.theme_seaborn()
 + p9.labs(title="DaG Tune Set AUPR", color="Model")
 + p9.scale_color_manual({
     "disc_model": "blue",
     "gen_model": "orange"
        yintercept=0.4196,
        linetype="solid",
        color=color_mapper["2018"])
     + p9.geom_hline(yintercept=published / posted,
                     linetype="solid",
                     color=color_mapper["2020ML"])
     + p9.annotate("text", x=8.5, y=0.395, label="overall: 0.4196", size=14)
     + p9.annotate("text", x=8.5, y=0.48,
                   label=f"overall: {published/posted:.4f}", size=14)
     + p9.theme_seaborn(style="ticks", context="paper",
                        font="Arial", font_scale=2)
     + p9.theme(
         figure_size=(11, 6.5),
         axis_text_x=p9.element_blank(),
         axis_title_x=p9.element_text(margin={"t": 15}),
     )
     + p9.labs(y="Proportion Published", x="Month"))
g.save("output/figures/publication_rate.svg")
g.save("output/figures/publication_rate.png", dpi=250)
print(g)

# # Plot Publication Rate

# +
publish_rate_df["pub_month"] = pd.Categorical(
    publish_rate_df.pub_month.values.tolist(), ordered=True)

posted_recency_adj = (
    publish_rate_df.query("label=='2020 Snapshot+Missing Links'").query(
        "pub_month < '2019-01'").posted.sum())

published_recency_adj = (
similarity_score_df

# In[10]:

print("Similarity between input vs permuted data is {}".format(permuted_score))

# In[16]:

# Plot
# pd.np is deprecated; assumes numpy is imported as np in this notebook
threshold = pd.DataFrame(np.tile(permuted_score, (len(lst_num_experiments), 1)),
                         index=lst_num_experiments,
                         columns=['score'])

g = ggplot(similarity_score_df, aes(x=lst_num_experiments, y='score')) \
    + geom_line() \
    + geom_line(aes(x=lst_num_experiments, y='score'), threshold, linetype='dashed') \
    + labs(x="Number of Experiments",
           y="Similarity score (SVCCA)",
           title="Similarity after correcting for experiment variation") \
    + theme_bw() \
    + theme(plot_title=element_text(weight='bold'))

print(g)
ggsave(plot=g, filename=svcca_file, dpi=300)

# In[17]:

# Plot - black
threshold = pd.DataFrame(np.tile(permuted_score, (len(lst_num_experiments), 1)),
                         index=lst_num_experiments,
                         columns=['score'])

    cv_results_df = cv_results_df.append(df)

cv_results_summary = (cv_results_df
    .groupby(['classify__alpha', 'feature_set'])['mean_test_score']
    .max()
    .reset_index())

# In[17]:

(gg.ggplot(cv_results_summary,
           gg.aes(x='classify__alpha', y='mean_test_score', color='feature_set'))
 + gg.geom_jitter(size=4, alpha=0.8, height=0, width=0.05)
 + gg.scale_x_log10()
 + gg.labs(x='Regularization strength multiplier (log alpha)', y='CV AUROC')
 + gg.guides(fill=gg.guide_legend(title="Feature Set"))
 + gg.aes(ymin=min([0.5, cv_results_summary['mean_test_score'].min()]), ymax=1)
 + theme_cognoma())

# ## Use optimal hyperparameters to output ROC curve

# In[18]:

y_pred_dict = {
    model: {
        'train': pipeline.decision_function(X_train),
        'test': pipeline.decision_function(X_test)
    }
    for model, pipeline in cv_pipelines.items()
print(rpkm_data.shape)

# In[6]:

# 0-1 normalize per gene
rnaseq_scaled_df = preprocessing.MinMaxScaler().fit_transform(rpkm_data)
rnaseq_scaled_df = pd.DataFrame(rnaseq_scaled_df,
                                columns=rpkm_data.columns,
                                index=rpkm_data.index).T

rnaseq_scaled_df.head()

# In[7]:

# UMAP embedding of original input data
model = umap.UMAP(random_state=randomState).fit(rnaseq_scaled_df.T)
input_data_UMAPencoded = model.transform(rnaseq_scaled_df.T)
input_data_UMAPencoded_df = pd.DataFrame(data=input_data_UMAPencoded,
                                         index=rnaseq_scaled_df.T.index,
                                         columns=['1', '2'])

g_input = ggplot(input_data_UMAPencoded_df, aes(x='1', y='2')) \
    + geom_point(alpha=0.3) \
    + labs(x="UMAP 1", y="UMAP 2", title="Input data")
print(g_input)

# In[8]:

# Save scaled data
rnaseq_scaled_df.to_csv(out_data_file, sep='\t', compression='xz')

def show_prediction(
    self,
    samples,
    plot_samples: bool = True,
    plot_fitted: bool = False,
    percent_kept: float = 0.95,
    side_cut_from: str = "both",
    show_community: bool = False,
    num_samples: int = 1000,
    **kwargs,
):
    """
    Plot prediction on the true question scale from samples or a submission
    object. Optionally compare prediction against a sample from the
    distribution of community predictions.

    :param samples: samples from a distribution answering the prediction
        question (true scale). Can either be a 1-d array corresponding to one
        model's predictions, or a pandas DataFrame with each column
        corresponding to a distinct model's predictions
    :param plot_samples: boolean indicating whether to plot the raw samples
    :param plot_fitted: boolean indicating whether to compute Logistic
        Mixture Params from samples and plot the resulting fitted
        distribution. Note this is currently only supported for 1-d samples
    :param percent_kept: percentage of sample distribution to keep
    :param side_cut_from: which side to cut tails from, either 'both',
        'lower', or 'upper'
    :param show_community: boolean indicating whether comparison to community
        predictions should be made
    :param num_samples: number of samples from the community
    :param kwargs: additional plotting parameters
    """
    df = pd.DataFrame()
    if not plot_fitted and not plot_samples:
        raise ValueError(
            "Nothing to plot. Neither plot_fitted nor plot_samples was True"
        )

    if plot_samples:
        if isinstance(samples, list):
            samples = pd.Series(samples)
        if not type(samples) in ArrayLikes:
            raise ValueError(
                "Samples should be a list, numpy array or pandas series"
            )
        num_samples = samples.shape[0]
        if type(samples) == pd.DataFrame:
            if plot_fitted and samples.shape[1] > 1:
                raise ValueError(
                    "For multiple predictions comparisons, only samples can "
                    "be compared (plot_fitted must be False)"
                )
            for col in samples:
                # use numpy array to ensure df doesn't become read-only
                df[col] = onp.array(self.scale.normalize_points(samples[col]))
        else:
            # use numpy array to ensure df doesn't become read-only
            df["samples"] = onp.array(self.scale.normalize_points(samples))

    if plot_fitted:
        prediction = self.get_submission_from_samples(samples)
        df["fitted"] = pd.Series(
            [prediction.sample() for _ in range(0, num_samples)]
        )

    if show_community:
        df["community"] = [  # type: ignore
            self.sample_normalized_community() for _ in range(0, num_samples)
        ]

    # get domain for graph given the percentage of distribution kept
    xmin, xmax = self.scale.denormalize_points(
        self.get_central_quantiles(
            df,
            percent_kept=percent_kept,
            side_cut_from=side_cut_from,
        )
    )

    for col in df:
        df[col] = self.scale.denormalize_points(df[col])

    df = pd.melt(df, var_name="sources", value_name="samples")  # type: ignore

    plot = self.comparison_plot(df, xmin, xmax, **kwargs) + labs(
        x="Prediction",
        y="Density",
        title=self.plot_title + "\n\nPrediction vs Community"
        if show_community
        else self.plot_title,
    )
    try:
        plot.draw()  # type: ignore
    except RuntimeError as err:
        print(err)
        print(
            "The plot was unable to automatically determine a bandwidth. "
            "You can manually specify one with the keyword 'bw', e.g., "
            "show_prediction(..., bw=.1)"
        )

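# Hypothetical usage sketch for show_prediction (not from the source):
# assumes `question` is an instance of the class defining the method above.
#
#   import numpy as onp
#   samples = onp.random.lognormal(mean=3.0, sigma=0.5, size=5000)
#   question.show_prediction(samples, plot_fitted=True,
#                            show_community=True, bw=0.1)
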
def pseudotime_lineplot(adata,
                        y,
                        facet=True,
                        alpha=1,
                        smoothness=0.3,
                        size=1,
                        color='black',
                        ncol=2,
                        lab_ypos=2):
    """Plots a line plot of pseudotime vs one or multiple variables

    Parameters
    --------------
    adata: AnnData
        The AnnData object being used for the analysis. Must be previously
        evaluated by `tl.pseudotime`.
    y: str or list
        If type(y) == str, y must be a variable annotated in adata.obs and
        will be used as the y-axis. If type(y) == list, then multiple
        variables will be plotted using a shared y-axis but different point
        colors.
    facet: bool
        Whether to return a facetted plot or all signatures in a single plot.
        Only used if y is a list.
    alpha: float
        A value between 0 and 1. Controls point transparency.
    smoothness: float
        A value passed to geom_smooth as span. Controls how smooth the LOESS
        regression will be.
    size: float
        Controls the line width of the smooth line.
    color: str
        A supported color name. Controls the point color if type(y)==str.
        Ignored otherwise.
    ncol: int
        Number of columns in the facetting, if facet=True. Ignored otherwise.
    lab_ypos: float
        Controls the y-axis position of the cell cycle phase annotation,
        if present.

    Returns
    -------------
    A plotnine line plot of pseudotime.
    """
    if type(y) == str:
        #-- Get data
        if y in adata.obs.columns:
            plot_df = pd.DataFrame({
                'x': adata.obs['pseudotime'],
                'y': adata.obs[y]
            })
        elif y in adata.var_names:
            plot_df = pd.DataFrame({
                'x': adata.obs['pseudotime'],
                'y': adata[:, y].X.flatten()
            })
        else:
            raise Exception('`y` variable not found')

        #-- Make plot
        if color in adata.obs.columns:
            time_line = (ggplot(plot_df, aes(x='x', y='y'))
                         + geom_smooth(aes(color=color), method='loess',
                                       size=size, alpha=alpha, se=False,
                                       span=smoothness)
                         + labs(x='Pseudotime', y=y)
                         + theme_std)
        else:
            time_line = (ggplot(plot_df, aes(x='x', y='y'))
                         + geom_smooth(method='loess', size=size, alpha=alpha,
                                       color=color, se=False, span=smoothness)
                         + labs(x='Pseudotime', y=y)
                         + theme_std)
    else:
        #-- Make multiple color plot
        sannot = pd.DataFrame({'pseudotime': adata.obs['pseudotime']})
        sannot['id'] = range(sannot.shape[0])

        #-- Checks
        check1 = [var in adata.var_names for var in y]
        check2 = [var in adata.obs.columns.values for var in y]
        idx = np.array(check1) | np.array(check2)
        y_arr = np.array(y)
        if not np.any(idx):
            raise Exception('No variables in `y` found.')
        if not np.all(idx):
            warnings.warn('Variable not found! Dropping: ' +
                          ', '.join(y_arr[~idx]))
        y = y_arr[idx]

        #-- Get y from obs or matrix:
        for var in y:
            if var in adata.obs.columns:
                sannot[var] = adata.obs[var]
            elif var in adata.var_names:
                sannot[var] = adata[:, var].X.flatten()

        plot_df = pd.melt(sannot,
                          id_vars=['id', 'pseudotime'],
                          var_name='signature',
                          value_name='score')
        plot_df['signature'] = plot_df['signature'].astype('category')
        plot_df['signature'].cat.reorder_categories(y, inplace=True)

        if facet:
            time_line = (ggplot(plot_df, aes('pseudotime', 'score'))
                         + facet_wrap('signature', scales='free_y', ncol=ncol)
                         + geom_smooth(aes(color='signature'), method='loess',
                                       size=size, se=False, span=smoothness)
                         + theme_std)
        else:
            time_line = (ggplot(plot_df, aes('pseudotime', 'score'))
                         + geom_smooth(aes(color='signature'), method='loess',
                                       size=size, se=False, span=smoothness)
                         + theme_std)

    if "cell_cycle_division" in adata.uns["scycle"]:
        cc_divs = adata.uns["scycle"]["cell_cycle_division"]
        # -- Cell cycle annotation
        cc_phase = pd.DataFrame(
            dict(
                starts=[
                    None,
                    cc_divs["pr_start"],
                    cc_divs["rep_start"],
                    # cc_divs["m_start"],
                ],
                labels=["G1 PM", "G1 PR", "S/G2/M"],
                labpos=[
                    np.mean([0, cc_divs["pr_start"]]),
                    np.mean([cc_divs["pr_start"], cc_divs["rep_start"]]),
                    np.mean([cc_divs["rep_start"], 1]),
                    # np.mean([cc_divs["m_start"], 1]),
                ],
                y=lab_ypos,
            ))
        time_line = (
            time_line
            + geom_vline(aes(xintercept="starts"), linetype="dashed",
                         data=cc_phase)
            + geom_text(aes(x="labpos", y="y", label="labels"),
                        data=cc_phase))

    return time_line

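# Hypothetical usage sketch (not from the source): assumes `adata` is an
# AnnData object already evaluated by `tl.pseudotime`, and that 'MKI67' and
# 'TOP2A' are present in adata.var_names.
#
#   single = pseudotime_lineplot(adata, 'MKI67', color='black')
#   multi = pseudotime_lineplot(adata, ['MKI67', 'TOP2A'], facet=True, ncol=2)
#   multi.save('pseudotime_signatures.png', dpi=300)
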
def test_theme_seaborn(self):
    p = self.g + labs(title='Theme Seaborn') + theme_seaborn()

    assert p + _theme == 'theme_seaborn'

def test_theme_minimal(self):
    p = self.g + labs(title='Theme Minimal') + theme_minimal()

    assert p + _theme == 'theme_minimal'

def test_theme_matplotlib(self):
    p = self.g + labs(title='Theme Matplotlib') + theme_matplotlib()

    assert p + _theme == 'theme_matplotlib'

def test_theme_538(self):
    p = self.g + labs(title='Theme 538') + theme_538()

    assert p + _theme == 'theme_538'

def test_theme_light(self):
    p = self.g + labs(title='Theme Light') + theme_light()

    assert p + _theme == 'theme_light'

def test_theme_gray(self):
    p = self.g + labs(title='Theme Gray') + theme_gray()

    assert p + _theme == 'theme_gray'

def test_theme_dark(self):
    p = self.g + labs(title='Theme Dark') + theme_dark()

    assert p + _theme == 'theme_dark'

def test_theme_classic(self):
    p = self.g + labs(title='Theme Classic') + theme_classic()

    assert p + _theme == 'theme_classic'

def labs(x, y):
    return gg.labs(x=dollars(x), y=dollars(y))

def test_theme_void(self):
    p = self.g + labs(title='Theme Void') + theme_void()

    assert p + _theme == 'theme_void'

g = (
    p9.ggplot(dev_disc_df,
              p9.aes(x="factor(lf_num)", y="auroc_mean",
                     linetype="model", color="relation"))
    + p9.geom_point()
    + p9.geom_errorbar(p9.aes(ymin="auroc_lower", ymax="auroc_upper"))
    + p9.geom_line(p9.aes(group="model"))
    + p9.scale_x_discrete(limits=[0, 1, 6, 11, 16, 'All'])
    + p9.scale_color_manual(values={
        "DaG": mcolors.to_hex(color_map["DaG"]),
        'CtD': mcolors.to_hex(color_map["CtD"]),
        "CbG": mcolors.to_hex(color_map["CbG"]),
        "GiG": mcolors.to_hex(color_map["GiG"]),
    }, guide=False)
    + p9.facet_wrap("relation")
    + p9.labs(title="Disc Model Performance (Tune Set)")
    + p9.xlab("Number of Label Functions")
    + p9.ylab("AUROC")
    + p9.theme_bw()
)
print(g)
g.save(filename="../disc_model_dev_auroc.png", dpi=300)

# In[8]:

g = (
    p9.ggplot(dev_disc_df,
              p9.aes(x="factor(lf_num)", y="aupr_mean",
                     linetype="model", color="relation"))
    + p9.geom_point()
    aes(x=lst_num_partitions, y='score', color='Group'),
    size=1.5) \
    + geom_point(aes(x=lst_num_partitions, y='score'),
                 color='darkgrey', size=0.5) \
    + geom_errorbar(all_svcca,
                    aes(x=lst_num_partitions, ymin='ymin', ymax='ymax'),
                    color='darkgrey') \
    + geom_line(threshold, aes(x=lst_num_partitions, y='score'),
                linetype='dashed', size=1, color="darkgrey",
                show_legend=False) \
    + labs(x="Number of Partitions",
           y="Similarity score (SVCCA)",
           title="Similarity across varying numbers of partitions") \
    + theme(plot_title=element_text(weight='bold'),
            plot_background=element_rect(fill="white"),
            panel_background=element_rect(fill="white"),
            panel_grid_major_x=element_line(color="lightgrey"),
            panel_grid_major_y=element_line(color="lightgrey"),
            axis_line=element_line(color="grey"),
            legend_key=element_rect(fill='white', colour='white')) \
    + scale_color_manual(['#1976d2', '#b3e5fc'])

print(panel_A)

ggsave(plot=panel_A, filename=svcca_file, device="svg", dpi=300)
ggsave(plot=panel_A, filename=svcca_png_file, device="svg", dpi=300)

from plotnine.data import economics
from plotnine import ggplot, aes, geom_line, labs

g = ggplot(economics) + \
    aes(x="date", y="uempmed") + \
    geom_line() + \
    labs(x="date", y="median duration of unemployment")
g.save("07.png")

def gene_profile(genes: list,
                 weights: pd.DataFrame,
                 stddev: pd.DataFrame = None,
                 y_axis_label: str = None,
                 highlight_n: int = None,
                 highlight_anno: list = None,
                 figsize: tuple = None,
                 ylim: tuple = None) -> p9.ggplot:
    """
    Parameters
    ----------
    genes : a single str or list of genes to include in plot as facets
    weights : DataFrame of ES weights
    stddev : optional DataFrame of standard deviations matching weights
    highlight_n : number of highest ESw to highlight
    highlight_anno : specific annotations to highlight
    figsize : (float, float), optional (default: None)
        Specify width and height of plot.

    Returns
    -------
    g : ggplot

    Todo:
    * find a better way for sorting cell-types along x-axis
    * report if gene in genes is not found in df
    * report if duplicate genes
    * replace hacky x-axis labelling
    """
    ### Reduce dataframe to genes of interest
    genes = [str.upper(s) for s in genes]
    idx = np.char.upper(weights.index.values.astype(str))
    mask = np.isin(idx, genes)
    df_tidy = weights[mask]
    n_genes = len(df_tidy)
    assert (n_genes >= 1), "No matching genes found in dataframe."

    stddev_tidy = None
    if stddev is not None:
        idx = np.char.upper(stddev.index.values.astype(str))
        mask = np.isin(idx, genes)
        stddev_tidy = stddev[mask]
        n_genes = len(df_tidy)
        assert (n_genes >= 1), "No matching genes found in stddev dataframe."

    # Constants, height and width of plot.
    if figsize is None:
        H = 5 * n_genes
        W = 15
    else:
        W, H = figsize

    if ylim is None:
        ylim = (-1, 1)

    if y_axis_label is None:
        y_axis_label = "Expression Specificity"

    ### Convert to tidy / long format if necessary
    # Org:
    #       ABC  ACBG  ACMB
    # POMC  0.0   0.5   0.9
    # AGRP  0.2   0.0   0.0
    # LEPR  0.1   0.1   0.4
    # Tidy:
    #   gene_name annotation es_weight
    # 1      POMC        ABC       0.0
    # 2      AGRP        ABC       0.6
    # 3      LEPR        ABC       1.0
    df_tidy.index.name = None  # ensure that index name is none, so "index" is used for id_vars
    df_tidy = pd.melt(df_tidy.reset_index(), id_vars="index",
                      var_name="annotation", value_name="weight")

    if stddev_tidy is not None:
        stddev_tidy.index.name = None
        stddev_tidy = pd.melt(stddev_tidy.reset_index(), id_vars="index",
                              var_name="annotation", value_name="stddev")
        df_tidy = df_tidy.merge(stddev_tidy, on=["index", "annotation"])

    ### Sort values by gene_name and es_weight and add order
    # Sorted:
    #   gene_name annotation es_weight x_order
    # 1      AGRP       MOL2       0.0       1
    # 2      AGRP      ACNT1       0.1       2
    # 3      AGRP       MOL1       0.2       3
    df_tidy = df_tidy.sort_values(by=["index", "weight"])
    df_tidy["order"] = np.arange(len(df_tidy)) + 1

    ### Generate highlight
    # Default: highlight top 5
    if ((highlight_n is None) and (highlight_anno is None)):
        highlight_n = 5

    # highlight a given list of annotations
    if (highlight_anno is not None):
        df_tidy["highlight"] = df_tidy["annotation"].isin(highlight_anno)
    elif (highlight_n is not None):
        df_tidy["highlight"] = df_tidy.groupby("index")["order"].rank(
            "first", ascending=False) <= highlight_n
    else:
        df_tidy["highlight"] = np.array([False] * len(df_tidy))

    df_highlight = df_tidy[df_tidy["highlight"]]

    ### Plot
    # linear function to compute x_axis text-size.
    # Mainly depends on number of genes in df per facet,
    # i.e. len(df_tidy) / len(genes).
    SIZE_TEXT_X_AXIS = 10.161 - 0.023 * (len(df_tidy) / len(genes))

    # Limits of the order for each index gene / facet, e.g. [0, 266, 531]
    # These limits are necessary to only plot the labels
    order_lims = [0, *(df_tidy.groupby("index")["order"].max().values)]

    def find_nearest(array, value):
        array = np.asarray(array)
        idx = (np.abs(array - value)).argmin()
        return array[idx]

    def getbreaks(lims):
        # function defined for use in debugging
        l = find_nearest(order_lims, lims[0])
        r = find_nearest(order_lims, lims[1])
        breaks = np.arange(l, r)
        return breaks

    def getlbls(idx):
        # function defined for use in debugging
        lbls = df_tidy["annotation"].iloc[idx].values
        return lbls

    p = (
        ### data
        p9.ggplot(data=df_tidy,
                  mapping=p9.aes(x="order", y="weight", label="annotation"))

        ### theming
        + p9.theme_classic()
        + p9.theme(
            figure_size=(W, H),
            axis_ticks_major_x=p9.element_blank(),
            axis_text_x=p9.element_text(rotation=75, hjust=0,
                                        size=SIZE_TEXT_X_AXIS),
            # axis_text_y=p9.element_text(size=W),
            panel_spacing=1,
            strip_background=p9.element_blank()
        )
        + p9.ylim(ylim[0], ylim[1])
        + p9.labs(
            x="",  # e.g. "Cell-type"
            y=y_axis_label,  # e.g. "ES weight"
        )

        ### viz
        # all
        + p9.geom_segment(mapping=p9.aes(x="order", xend="order",
                                         y=0, yend="weight"),
                          color="grey", alpha=0.3, show_legend=False)
        + p9.geom_point(mapping=p9.aes(size=2), color="grey",
                        show_legend=False)
        # highlight
        + p9.geom_point(data=df_highlight, mapping=p9.aes(size=2),
                        color="dodgerblue", show_legend=False)
        + p9.geom_segment(data=df_highlight,
                          mapping=p9.aes(x="order", xend="order",
                                         y=0, yend="weight"),
                          color="dodgerblue", alpha=0.3, show_legend=False)
        + p9.facet_wrap("index", scales="free", nrow=n_genes)
        + p9.scale_x_continuous(
            # order_scale is continuous across all annotations
            # so the scale will look weird for each facet, e.g.
            # facet 1 may have order 1-7, and facet 2 has order 8-14.
            # therefore we must use a labeller function to get the
            # correct labels for each interval of order.
            breaks=lambda lims: getbreaks(lims),
            labels=lambda idx: getlbls(idx)
        )
    )

    if stddev_tidy is not None:
        p = p \
            + p9.geom_errorbar(mapping=p9.aes(ymin="weight-stddev",
                                              ymax="weight+stddev"),
                               color="grey", width=0.1) \
            + p9.geom_errorbar(data=df_highlight,
                               mapping=p9.aes(ymin="weight-stddev",
                                              ymax="weight+stddev"),
                               color="dodgerblue", width=0.1)

    # add labels last for them to be on top
    p = p + p9.geom_label(data=df_highlight,
                          color="dodgerblue",
                          adjust_text={'expand_points': (2, 2)})

    return p

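# Hypothetical usage sketch (not from the source): assumes `es_weights` and
# `es_stddev` are DataFrames indexed by gene symbol with one column per
# annotation (cell type).
#
#   p = gene_profile(genes=['POMC', 'AGRP', 'LEPR'],
#                    weights=es_weights,
#                    stddev=es_stddev,
#                    highlight_n=5,
#                    y_axis_label='ES weight')
#   p.save('gene_profile.pdf')
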
def run_cluster_sim(n_sims=1000, param=(.1, .5), n=1000,
                    n_cluster=50, rho=.5, cluster_robust=False):
    res = [cluster_sim(param=param, n=n, rho=rho,
                       n_cluster=n_cluster,
                       cluster_robust=cluster_robust)
           for x in range(n_sims)]
    df = pd.DataFrame(res)
    df.columns = ('b1', 'se_b1', 'ci95_lower', 'ci95_upper')
    df['param_caught'] = (df['ci95_lower'] <= param[1]) & (param[1] <= df['ci95_upper'])
    df['id'] = df.index
    return df


# Simulation clustered SE
sim_params = [.4, 0]  # beta1 = 0: no effect of x on y
sim_nocluster = run_cluster_sim(n_sims=1000, param=sim_params,
                                cluster_robust=False)

p.ggplot(sim_nocluster.sample(100).sort_values('b1'),
         p.aes(x='factor(id)', y='b1',
               ymin='ci95_lower', ymax='ci95_upper',
               color='param_caught')) +\
    p.geom_hline(yintercept=sim_params[1], linetype='dashed') +\
    p.geom_pointrange() +\
    p.labs(x='sim ID', y='b1', title='Randomly Chosen 100 95% CIs') +\
    p.scale_color_discrete(name='True param value', labels=('missed', 'hit')) +\
    p.coord_flip()

items = {}
data = {}
all_models = set()
all_tasks = set()
# open in text mode ("rt") so csv.DictReader gets str rows under Python 3
with gzip.open(options.input, "rt") as ifd:
    for row in csv.DictReader(ifd, delimiter="\t"):
        for k, v in row.items():
            data[k] = data.get(k, [])
            data[k].append(v)

for k in data.keys():
    floats = [maybe_float(x) for x in data[k]]
    if all([re.match(r"^\d+$", x) for x in data[k]]):
        data[k] = [int(x) for x in data[k]]
    elif all(floats):
        data[k] = floats

df = pandas.DataFrame(data)
#print df

x = (ggplot(df, aes("factor(%s)" % (options.x), options.y,
                    color="factor(%s)" % (options.color)))) + \
    ggtitle(options.title.strip("'")) + \
    ylab(options.ylabel.strip("'")) + \
    xlab(options.xlabel.strip("'")) + \
    labs(color=options.color_label.strip("'")) + \
    geom_col(show_legend=False) + \
    lims(y=(0.0, 1.0))

x.save(options.output)
#theme(legend_title=element_text("")) + \

["x_loc", "y_loc", "well", "site_location", "site"])["total_cell_count"].mean().reset_index()) plate = cell_count_df["plate"].unique()[0] os.makedirs(output_figuresdir, exist_ok=True) by_well_gg = ( gg.ggplot(cell_count_totalcells_df, gg.aes(x="x_loc", y="y_loc")) + gg.geom_point(gg.aes(fill="total_cell_count"), size=10) + gg.geom_text(gg.aes(label="site_location"), color="lightgrey") + gg.facet_wrap("~well") + gg.coord_fixed() + gg.theme_bw() + gg.ggtitle(f"Total Cells/Well\n{plate}") + gg.theme( axis_text=gg.element_blank(), axis_title=gg.element_blank(), strip_background=gg.element_rect(colour="black", fill="#fdfff4"), ) + gg.labs(fill="Cells") + gg.scale_fill_cmap(name="magma")) output_file = pathlib.Path(output_figuresdir, "plate_layout_cells_count_per_well.png") if check_if_write(output_file, force, throw_warning=True): by_well_gg.save(output_file, dpi=300, verbose=False) # Plot cell category ratios per well ratio_df = pd.pivot_table( cell_count_df, values="cell_count", index=["site", "plate", "well", "site_location", "x_loc", "y_loc"], columns=["Cell_Quality"], ) ratio_df = ratio_df.assign(Sum=ratio_df.sum(axis=1), Pass_Filter=ratio_df[cell_filter].sum(axis=1))
"pca2": "last", "category": "last", "section": "last", }).reset_index(drop=True)) biorxiv_pca_method_section_df.head() # ## Global View of PCA plot # In[5]: g = (p9.ggplot(biorxiv_pca_method_section_df) + p9.aes(x="pca1", y="pca2", color="category") + p9.geom_point() + p9.theme_bw() + p9.labs(title="TSNE Methods Section (300 dim)")) print(g) # ## Neuroscience Methods Section # In[6]: g = (p9.ggplot(biorxiv_pca_method_section_df.query("category=='neuroscience'")) + p9.aes(x="pca1", y="pca2", color="section") + p9.geom_point(position=p9.position_dodge(width=0.2)) + p9.facet_wrap("section") + p9.theme_bw() + p9.theme(subplots_adjust={'wspace': 0.10}) + p9.scale_color_manual({ "has_methods": "#d8b365", "no_methods": "#5ab4ac" }) + p9.labs(title="Neuroscience Methods Section"))
            yend=sol.Ra[k].imag + sol.Aa[k].imag * ACC_SCALE),
            colour='red', arrow=arrow()) +  # Point A
        geom_segment(aes(x=sol.Rpa[k].real, y=sol.Rpa[k].imag,
                         xend=sol.Rpa[k].real + sol.Apaa[k].real * ACC_SCALE,
                         yend=sol.Rpa[k].imag + sol.Apaa[k].imag * ACC_SCALE),
                     colour='red', arrow=arrow()) +  # Point C

        # ACCELERATION TEXTS (you may comment these out if you wish to remove
        # the acceleration information); the positions of the acceleration
        # texts may be altered in case the plot gets hard to read
        annotate("text", x=sol.Rba[k].real, y=sol.Rba[k].imag + 10,
                 label=f'${np.absolute(sol.Aba[k])/1000:.2f}~m/s^2$',
                 colour='red') +
        annotate("text", x=sol.Ra[k].real, y=sol.Ra[k].imag - 20,
                 label=f'${np.absolute(sol.Aa[k])/1000:.2f}~m/s^2$',
                 colour='red') +
        annotate("text", x=sol.Rpa[k].real + 10, y=sol.Rpa[k].imag - 20,
                 label=f'${np.absolute(sol.Apaa[k])/1000:.2f}~m/s^2$',
                 colour='red') +

        # MECHANISM KINEMATIC PROPERTIES
        annotate("label", x=-50, y=-100,
                 label=f'$\\theta_2={sol.theta2[k] * 180/(2*pi):.2f}^\\circ$') +
        # Brackets need to be doubled so Python doesn't interpret 3a or 4a as variables
        annotate("label", x=-10, y=-100,
                 label=f'$\\theta_{{3a}}={sol.theta3a[k] * 180/(2*pi):.2f}^\\circ$, '
                       f'$\\theta_{{3c}}={sol.theta3c[k] * 180/(2*pi):.2f}^\\circ$') +
        annotate("label", x=45, y=-100,
                 label=f'$\\theta_{{4a}}={sol.theta4a[k] * 180/(2*pi):.2f}^\\circ$, '
                       f'$\\theta_{{4c}}={sol.theta4c[k] * 180/(2*pi):.2f}^\\circ$') +
        annotate("label", x=-50, y=-150,
                 label=f'$\\omega_2={sol.omega2[k]:.2f}~rad/s$') +
        annotate("label", x=0, y=-150,
                 label=f'$\\omega_{{3a}}={sol.omega3a[k]:.2f}~rad/s$, '
                       f'$\\omega_{{3c}}={sol.omega3c[k]:.2f}~rad/s$') +
        annotate("label", x=70, y=-150,
                 label=f'$\\omega_{{4a}}={sol.omega4a[k]:.2f}~rad/s$, '
                       f'$\\omega_{{4c}}={sol.omega4c[k]:.2f}~rad/s$') +
        annotate("label", x=-50, y=-200,
                 label=f'$\\alpha_2={sol.omega2[k]:.2f}~rad/s^2$') +
        annotate("label", x=0, y=-200,
                 label=f'$\\alpha_{{3a}}={sol.alpha3a[k]:.2f}~rad/s^2$, '
                       f'$\\alpha_{{3c}}={sol.alpha3c[k]:.2f}~rad/s^2$') +
        annotate("label", x=70, y=-200,
                 label=f'$\\alpha_{{4a}}={sol.alpha4a[k]:.2f}~rad/s^2$, '
                       f'$\\alpha_{{4c}}={sol.alpha4c[k]:.2f}~rad/s^2$') +

        # labs(x='$x~[mm]$', y='$y~[mm]$') +
        # Scales plot limits, avoiding the plot being bigger than necessary.
        # You may comment this out if you wish to do so.
        coord_cartesian(xlim=SCALE_X, ylim=SCALE_Y) +
        # Plot is prettier with this theme compared to the default.
        theme_bw()
    )

    plot.save('SolutionPlot.pdf', dpi=330, width=50, height=30, units='cm')

# Topic ---- Plotnine - Bar Plot

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# pip install plotnine --user
from plotnine import *
# https://datacarpentry.org/python-ecology-lesson/07-visualization-ggplot-python/index.html
from plotnine import ggplot, geom_point, aes, stat_smooth, facet_wrap
from plotnine.data import mtcars

mtcars

(ggplot(mtcars, aes('wt', 'mpg', color='factor(gear)'))
 + geom_point()
 + stat_smooth(method='lm')
 + facet_wrap('~gear'))

(ggplot(mtcars, aes('wt', 'hp', color='factor(cyl)'))
 + geom_point(aes(size='mpg'))
 + labs(title='MT cars', subtitle='wt vs hp', x='weight', y='horsepower')
 + geom_text(aes(label='name')))

#%%%
%matplotlib inline
import plotnine as p9
from plotnine.data import mtcars
from adjustText import adjust_text
# https://github.com/Phlya/adjustText/wiki

(p9.ggplot(mtcars, aes('wt', 'hp', color='factor(cyl)'))
 + p9.geom_point(aes(size='mpg'))
 + p9.labs(title='MT cars', subtitle='wt vs hp', x='weight', y='horsepower')
 + p9.geom_text(aes(label='name'), size=11, nudge_y=2))

p9.geom_text?

plt.ioff()  # and plt.ion()
plt.close()
%matplotlib

simulated_data.head(10)

# In[9]:

# UMAP embedding of original input data

# Get and save model
model = umap.UMAP(random_state=randomState).fit(normalized_data)

input_data_UMAPencoded = model.transform(normalized_data)
input_data_UMAPencoded_df = pd.DataFrame(data=input_data_UMAPencoded,
                                         index=normalized_data.index,
                                         columns=['1', '2'])

g_input = ggplot(input_data_UMAPencoded_df, aes(x='1', y='2')) \
    + geom_point(alpha=0.5) \
    + labs(x="UMAP 1", y="UMAP 2", title="Input data")
print(g_input)

# In[10]:

# UMAP embedding of simulated data
simulated_data_UMAPencoded = model.transform(simulated_data)
simulated_data_UMAPencoded_df = pd.DataFrame(data=simulated_data_UMAPencoded,
                                             index=simulated_data.index,
                                             columns=['1', '2'])

g_sim = ggplot(simulated_data_UMAPencoded_df, aes(x='1', y='2')) \
    + geom_point(alpha=0.5) \
    + labs(x="UMAP 1", y="UMAP 2", title="Simulated data")
print(g_sim)

# In[11]:

def scatter_enrich_components(adata, plot_type='panel', palette='Set1'):
    """Plots a scatter plot of trajectory vs component scores for each
    component from the dimensionality reduction

    Parameters
    --------------
    adata: AnnData
        The AnnData object being used for the analysis. Must be previously
        evaluated by `tl.enrich_components` and `tl.principal_circle`.
    plot_type: str
        One of 'all' or 'panel'
    palette: str
        A palette supported by matplotlib.cm.get_cmap

    Returns
    -------------
    A plotnine scatter plot of IC scores vs trajectory. It can be used to
    diagnose whether the cell cycle ICs vary through the trajectory, and if
    others do not.
    """
    #-- Get projection data
    proj = adata.obsm['X_dimRed']
    n_ics = proj.shape[1]
    spart = adata.obs['partition'].values
    comps = adata.uns['scycle']['enrich_components']

    #-- Make IC dataframe
    ic_df = pd.DataFrame(proj)
    ic_names = ['IC' + str(i) for i in range(proj.shape[1])]
    ic_df.columns = ic_names
    ic_df['partition'] = spart

    #-- Melt for plotting
    ic_traj = ic_df.groupby('partition').sum()
    # ic_traj = pd.DataFrame(zscore(ic_traj))
    ic_traj['partition'] = [i for i in range(np.max(spart) + 1)]
    ic_trajm = pd.melt(ic_traj, id_vars='partition', var_name='IC')

    if plot_type == 'all':
        #-- Add variables for mapping plotting
        ic_trajm = _update_ictrajm(ic_trajm, comps, 'G1/S')
        ic_trajm = _update_ictrajm(ic_trajm, comps, 'G2/M+')
        ic_trajm = _update_ictrajm(ic_trajm, comps, 'G2/M-')
        ic_trajm = _update_ictrajm(ic_trajm, comps, 'Histone')
        idx = [i not in list(comps.keys()) for i in ic_trajm['IC']]
        ic_trajm['ccIC'] = 'cell cycle IC'
        ic_trajm.loc[idx, 'ccIC'] = 'other'

        #-- Get colors
        cmap = mpl.cm.get_cmap(palette, n_ics)
        colors = np.array([mpl.colors.rgb2hex(cmap(i)) for i in range(n_ics)])
        jmp = int(np.round(n_ics / 3))
        cidx = np.array([0, 0 + jmp, 0 + 2 * jmp, n_ics - 1])
        oidx = np.array([i not in cidx for i in range(n_ics)])
        cc_cols = np.append(colors[cidx], colors[oidx])

        #-- Plot
        splot = (ggplot(ic_trajm,
                        aes(x='partition', y='value', color='IC',
                            alpha='ccIC', size='ccIC'))
                 + geom_point(size=3)
                 + geom_line()
                 + scale_alpha_manual(values=[1, 0.2], name='IC type')
                 + scale_size_manual(values=[1.5, 1], name='IC type')
                 + scale_color_manual(values=cc_cols)
                 + labs(x='Trajectory', y='IC score')
                 + theme_std)
    elif plot_type == 'panel':
        #-- Add variables for mapping plotting
        ic_trajm1 = _multi_ictrajm(ic_trajm, comps, 'G1/S')
        ic_trajm2 = _multi_ictrajm(ic_trajm, comps, 'G2/M+')
        ic_trajm3 = _multi_ictrajm(ic_trajm, comps, 'G2/M-')
        ic_trajm4 = _multi_ictrajm(ic_trajm, comps, 'Histone')
        ic_trajm4plot = pd.concat([ic_trajm1, ic_trajm2, ic_trajm3, ic_trajm4])

        #-- Get mapping colors
        cmap = mpl.cm.get_cmap(palette, 5)
        cc_cols = np.append(
            np.array([mpl.colors.rgb2hex(cmap(i)) for i in range(4)]), 'grey')

        #-- Plot
        splot = (ggplot(ic_trajm4plot,
                        aes(x='partition', y='value', color='IC',
                            alpha='IC', size='IC'))
                 + facet_wrap(facets='facet')
                 + geom_point(size=3)
                 + geom_line()
                 + scale_size_manual(values=[1.5, 1.5, 1.5, 1.5, 1])
                 + scale_alpha_manual(values=[1, 1, 1, 1, 0.2])
                 + scale_color_manual(values=cc_cols)
                 + theme_std
                 + labs(x='Trajectory', y='IC score'))

    return splot

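# Hypothetical usage sketch (not from the source): assumes `adata` was
# processed by `tl.enrich_components` and `tl.principal_circle`, so that
# 'X_dimRed', the 'partition' column, and the component annotations exist.
#
#   panel = scatter_enrich_components(adata, plot_type='panel')
#   overlay = scatter_enrich_components(adata, plot_type='all', palette='Set1')
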
#
#  (C) Copyright 2021  Pavel Tisnovsky
#
#  All rights reserved. This program and the accompanying materials
#  are made available under the terms of the Eclipse Public License v1.0
#  which accompanies this distribution, and is available at
#  http://www.eclipse.org/legal/epl-v10.html
#
#  Contributors:
#      Pavel Tisnovsky
#

from plotnine.data import mpg
from plotnine import ggplot, aes, facet_grid, labs, geom_point

print(
    ggplot(mpg)
    + facet_grid(facets="year~class")
    + aes(x="displ", y="hwy")
    + labs(
        x="Engine Size",
        y="Miles per Gallon",
        title="Miles per Gallon for Each Year and Vehicle Class",
    )
    + geom_point())

def test_theme_bw(self):
    p = self.g + labs(title='Theme BW') + theme_bw()

    assert p + _theme == 'theme_bw'

final_df.head()

# # Distribution plot

g = (
    p9.ggplot(
        final_df.replace(
            {
                "pre_vs_published": "preprint-published",
                "pre_vs_random": "preprint-random",
            }
        )
    )
    + p9.aes(x="label", y="distance")
    + p9.geom_violin(fill="#a6cee3")
    + p9.labs(x="Document Pair Groups", y="Euclidean Distance")
    + p9.theme_seaborn(context="paper", style="ticks",
                       font="Arial", font_scale=2)
)
g.save("output/figures/biorxiv_article_distance.svg")
g.save("output/figures/biorxiv_article_distance.png")
print(g)

# # Logistic Regression bioRxiv preprints -> published PMC articles

model = LogisticRegressionCV(
    Cs=5,
    cv=10,
    random_state=100,
    penalty="elasticnet",
    solver="saga",
    l1_ratios=[0.1, 0.5, 0.8],
def error_comparison():
    char_frames = {}
    first_frames = {}
    full_frames = {}
    train_times = {}
    use_wiki = {}
    best_accuracies = {}
    for p in glob.glob(f'output/guesser/best/qanta.guesser*/guesser_report_guesstest.pickle',
                       recursive=True):
        with open(p, 'rb') as f:
            report = pickle.load(f)
            name = report['guesser_name']
            params = report['guesser_params']
            train_times[name] = params['training_time']
            use_wiki[name] = params['use_wiki'] if 'use_wiki' in params else False
            char_frames[name] = report['char_df']
            first_frames[name] = report['first_df']
            full_frames[name] = report['full_df']
            best_accuracies[name] = (report['first_accuracy'], report['full_accuracy'])
    first_df = (pd.concat([f for f in first_frames.values()])
                .sort_values('score', ascending=False)
                .groupby(['guesser', 'qanta_id']).first().reset_index())
    first_df['position'] = ' Start'
    full_df = (pd.concat([f for f in full_frames.values()])
               .sort_values('score', ascending=False)
               .groupby(['guesser', 'qanta_id']).first().reset_index())
    full_df['position'] = 'End'
    compare_df = pd.concat([first_df, full_df])
    compare_df = compare_df[compare_df.guesser != 'qanta.guesser.vw.VWGuesser']
    compare_results = {}
    comparisons = ['qanta.guesser.dan.DanGuesser',
                   'qanta.guesser.rnn.RnnGuesser',
                   'qanta.guesser.elasticsearch.ElasticSearchGuesser']
    cr_rows = []
    for (qnum, position), group in compare_df.groupby(['qanta_id', 'position']):
        group = group.set_index('guesser')
        correct_guessers = []
        wrong_guessers = []
        for name in comparisons:
            if group.loc[name].correct == 1:
                correct_guessers.append(name)
            else:
                wrong_guessers.append(name)

        if len(correct_guessers) > 3:
            raise ValueError('this should be unreachable')
        elif len(correct_guessers) == 3:
            cr_rows.append({'qnum': qnum, 'Position': position,
                            'model': 'All', 'Result': 'Correct'})
        elif len(correct_guessers) == 0:
            cr_rows.append({'qnum': qnum, 'Position': position,
                            'model': 'All', 'Result': 'Wrong'})
        elif len(correct_guessers) == 1:
            cr_rows.append({
                'qnum': qnum, 'Position': position,
                'model': to_shortname(correct_guessers[0]),
                'Result': 'Correct'
            })
        else:
            # exactly one guesser is wrong
            cr_rows.append({
                'qnum': qnum, 'Position': position,
                'model': to_shortname(wrong_guessers[0]),
                'Result': 'Wrong'
            })
    cr_df = pd.DataFrame(cr_rows)
    # samples = cr_df[(cr_df.Position == ' Start') & (cr_df.Result == 'Correct') & (cr_df.model == 'RNN')].qnum.values
    # for qid in samples:
    #     q = lookup[qid]
    #     print(q['first_sentence'])
    #     print(q['page'])
    #     print()
    p = (
        ggplot(cr_df)
        + aes(x='model', fill='Result')
        + facet_grid(['Result', 'Position'])
        # + facet_wrap('Position', labeller='label_both')
        + geom_bar(aes(y='(..count..) / sum(..count..)'), position='dodge')
        + labs(x='Models', y='Fraction with Corresponding Result')
        + coord_flip()
        + theme_fs()
        + theme(aspect_ratio=.6)
    )
    p.save('output/plots/guesser_error_comparison.pdf')

final_annotated_df.head()

# In[6]:

binned_stats_df = (final_annotated_df.groupby("distance_bin")
                   .final_same_paper.mean()
                   .to_frame()
                   .rename(index=str, columns={"final_same_paper": "frac_correct"})
                   .reset_index())
binned_stats_df

# In[7]:

g = (p9.ggplot(binned_stats_df, p9.aes(x="distance_bin", y="frac_correct"))
     + p9.geom_col(fill="#a6cee3")
     + p9.coord_flip()
     + p9.labs(x="Fraction Correct", y="Euclidean Distance Bins")
     + p9.theme_seaborn(context="paper", style="ticks",
                        font="Arial", font_scale=1.5))
g.save("output/figures/distance_bin_accuracy.svg")
g.save("output/figures/distance_bin_accuracy.png", dpi=250)
print(g)

# # Logistic Regression Performance

# In[8]:

biorxiv_embed_df = (pd.read_csv(
    Path("../word_vector_experiment/output/") /
    "word2vec_output/" /
    "biorxiv_all_articles_300.tsv.xz",
    sep="\t").set_index("document"))
biorxiv_embed_df.head()

def histogram_make(roi, combined_raw_df, list_rois, config, xlimit,
                   save_function, find_xlim_function):
    if combined_raw_df.empty:
        if config.verbose:
            print('INFO: Histograms cannot be made for the No ROI category.')
        return
    else:
        thisroi = list_rois[roi]

        figure = (
            pltn.ggplot(combined_raw_df, pltn.aes(x="voxel_value"))
            + pltn.theme_538()
            # Boundary centers the bars, na_rm cancels error from setting an xlimit
            + pltn.geom_histogram(
                binwidth=config.histogram_binwidth,
                fill=config.histogram_fig_colour,
                boundary=0,
                na_rm=True)
            + pltn.facet_grid(
                f"{config.histogram_fig_y_facet}~{config.histogram_fig_x_facet}",
                drop=True,
                labeller="label_both")
            + pltn.labs(x=config.histogram_fig_label_x,
                        y=config.histogram_fig_label_y)
            + pltn.theme(
                panel_grid_minor_x=pltn.themes.element_line(alpha=0),
                panel_grid_major_x=pltn.themes.element_line(alpha=1),
                panel_grid_major_y=pltn.element_line(alpha=0),
                plot_background=pltn.element_rect(fill="white"),
                panel_background=pltn.element_rect(fill="gray", alpha=0.1),
                axis_title_x=pltn.element_text(weight='bold', color='black', size=20),
                axis_title_y=pltn.element_text(weight='bold', color='black', size=20),
                strip_text_x=pltn.element_text(weight='bold', size=10, color='black'),
                strip_text_y=pltn.element_text(weight='bold', size=10, color='black'),
                axis_text_x=pltn.element_text(size=10, color='black'),
                axis_text_y=pltn.element_text(size=10, color='black'),
                dpi=config.plot_dpi))

        # Display mean or median as vertical lines on plot
        if config.histogram_show_mean or config.histogram_show_median:
            figure += pltn.geom_vline(
                pltn.aes(xintercept="stat_value", color="Statistic"),
                size=config.histogram_stat_line_size)
            figure += pltn.scale_color_manual(values=[
                config.colorblind_friendly_plot_colours[3],
                config.colorblind_friendly_plot_colours[1]
            ])

        # Display legend for mean and median
        if not config.histogram_show_legend:
            figure += pltn.theme(legend_position='none')

        if xlimit:
            # Set x limit of figure (used to make it the same for every histogram)
            figure += pltn.xlim(-1, xlimit)
            thisroi += '_same_xlim'
        else:
            figure += pltn.xlim(-1, None)

        returned_xlim = 0
        if config.use_same_axis_limits in ('Same limits', 'Create both') and xlimit == 0:
            returned_xlim = find_xlim_function(thisroi, figure, 'xaxis')

        if config.use_same_axis_limits == 'Same limits' and xlimit == 0:
            return returned_xlim
        elif xlimit != 0:
            folder = 'Same_xaxis'
        else:
            folder = 'Different_xaxis'

        # Suppress Pandas warning about alignment of non-concatenation axis
        warnings.simplefilter(action='ignore', category=FutureWarning)

        save_function(figure, thisroi, config, folder, 'histogram')

        warnings.simplefilter(action='default', category=FutureWarning)

        return returned_xlim

category_sim_df.to_csv("output/category_cossim_95_ci.tsv",
                       sep="\t", index=False)

# In[11]:

g = (p9.ggplot(category_sim_df)
     + p9.aes(x="category", y="pca1_cossim",
              ymin="pca1_cossim_lower", ymax="pca1_cossim_upper")
     + p9.geom_pointrange()
     + p9.coord_flip()
     + p9.theme_bw()
     + p9.scale_x_discrete(limits=category_sim_df.category.tolist()[::-1])
     + p9.theme(figure_size=(11, 7),
                text=p9.element_text(size=12),
                panel_grid_major_y=p9.element_blank())
     + p9.labs(y="PC1 Cosine Similarity"))
g.save("output/pca_plots/figures/category_pca1_95_ci.svg", dpi=500)
g.save("output/pca_plots/figures/category_pca1_95_ci.png", dpi=500)
print(g)

# In[12]:

g = (p9.ggplot(category_sim_df)
     + p9.aes(x="category", y="pca2_cossim",
              ymax="pca2_cossim_upper", ymin="pca2_cossim_lower")
     + p9.geom_pointrange()
     + p9.coord_flip()
     + p9.theme_bw()
     + p9.scale_x_discrete(limits=category_sim_df.category.tolist()[::-1])
     + p9.theme(figure_size=(11, 7),
                text=p9.element_text(size=12),
                panel_grid_major_y=p9.element_blank()) +
def merge_ologram_stats(inputfiles=None, pdf_width=None, pdf_height=None,
                        output=None, labels=None):
    # -------------------------------------------------------------------------
    # Check user provided labels
    # -------------------------------------------------------------------------

    if labels is not None:

        labels = labels.split(",")

        for elmt in labels:
            if not re.search("^[A-Za-z0-9_]+$", elmt):
                message(
                    "Only alphanumeric characters and '_' allowed for --more-bed-labels",
                    type="ERROR")

        if len(labels) != len(inputfiles):
            message("--labels: the number of labels should be"
                    " the same as the number of input files ", type="ERROR")

        if len(labels) != len(set(labels)):
            message("Redundant labels not allowed.", type="ERROR")

    # -------------------------------------------------------------------------
    # Loop over input files
    # -------------------------------------------------------------------------

    df_list = list()
    df_label = list()

    for pos, infile in enumerate(inputfiles):
        message("Reading file : " + infile.name)
        # Read the dataset into a temporary dataframe
        df_tmp = pd.read_csv(infile, sep='\t', header=0, index_col=None)
        # Change name of 'feature_type' column.
        df_tmp = df_tmp.rename(index=str, columns={"feature_type": "Feature"})

        # Assign the name of the dataset to a new column
        if labels is None:
            file_short_name = os.path.basename(
                os.path.normpath(os.path.dirname(infile.name)))
            df_label += [file_short_name]
        else:
            file_short_name = labels[pos]
            df_label += [labels[pos]]

        df_tmp = df_tmp.assign(
            **{"dataset": [file_short_name] * df_tmp.shape[0]})

        # Pval set to 0 or -1 are changed to 1e-320 and NaN respectively
        df_tmp.loc[df_tmp['summed_bp_overlaps_pvalue'] == 0,
                   'summed_bp_overlaps_pvalue'] = 1e-320
        df_tmp.loc[df_tmp['summed_bp_overlaps_pvalue'] == -1,
                   'summed_bp_overlaps_pvalue'] = np.nan

        # Compute -log10(pval)
        df_tmp = df_tmp.assign(
            **{"-log_10(pval)": -np.log10(df_tmp.summed_bp_overlaps_pvalue)})

        # Which p-values are significant?
        # TODO Add Benjamini-Hochberg multitesting correction
        df_tmp = df_tmp.assign(
            **{"pval_signif": df_tmp.summed_bp_overlaps_pvalue < 0.01})

        # Add the df to the list to be subsequently merged
        df_list += [df_tmp]

    if len(set(df_label)) < len(df_label):
        message('Enclosing directories are ambiguous and cannot be used as '
                'labels. You may use "--labels".', type="ERROR")

    # -------------------------------------------------------------------------
    # Concatenate dataframes (row bind)
    # -------------------------------------------------------------------------

    message("Merging dataframes.")
    df_merged = pd.concat(df_list, axis=0)

    # -------------------------------------------------------------------------
    # Plotting
    # -------------------------------------------------------------------------

    message("Plotting")
    my_plot = ggplot(data=df_merged, mapping=aes(y='Feature', x='dataset'))
    my_plot += geom_tile(aes(fill='summed_bp_overlaps_log2_fold_change'))
    my_plot += scale_fill_gradient2()
    my_plot += labs(fill="log2(fold change) for summed bp overlaps")

    # Points for p-val. Must be after geom_tile()
    my_plot += geom_point(data=df_merged.loc[df_merged['pval_signif']],
                          mapping=aes(x='dataset', y='Feature',
                                      color='-log_10(pval)'),
                          size=5, shape='D', inherit_aes=False)
    my_plot += scale_color_gradientn(colors=["#160E00", "#FFB025", "#FFE7BD"])
    my_plot += labs(color="-log10(p-value)")

    # Theming
    my_plot += theme_bw()
    my_plot += theme(panel_grid_major=element_blank(),
                     axis_text_x=element_text(rotation=90),
                     panel_border=element_blank(),
                     axis_ticks=element_blank())

    # -------------------------------------------------------------------------
    # Saving
    # -------------------------------------------------------------------------

    message("Saving")
    nb_ft = len(list(df_merged['Feature'].unique()))
    nb_datasets = len(list(df_merged['dataset'].unique()))

    if pdf_width is None:
        panel_width = 0.6
        pdf_width = panel_width * nb_datasets

        if pdf_width > 100:
            pdf_width = 100
            message("Setting --pdf-width to 100 (limit)")

    if pdf_height is None:
        panel_height = 0.6
        pdf_height = panel_height * nb_ft

        if pdf_height > 100:
            pdf_height = 100
            message("Setting --pdf-height to 100 (limit)")

    message("Page width set to " + str(pdf_width))
    message("Page height set to " + str(pdf_height))
    figsize = (pdf_width, pdf_height)

    # -------------------------------------------------------------------------
    # Turn warnings off. Both pandas and plotnine use warnings for deprecated
    # functions. I need to turn them off although I'm not really satisfied
    # with this solution...
    # -------------------------------------------------------------------------

    def fxn():
        warnings.warn("deprecated", DeprecationWarning)

    # -------------------------------------------------------------------------
    # Saving
    # -------------------------------------------------------------------------

    with warnings.catch_warnings():
        warnings.simplefilter("ignore")
        fxn()

        message("Saving diagram to file : " + output.name)
        message("Be patient. This may be long for large datasets.")

        # NOTE : We must manually specify figure size with save_as_pdf_pages
        save_as_pdf_pages(filename=output.name,
                          plots=[my_plot + theme(figure_size=figsize)],
                          width=pdf_width,
                          height=pdf_height)

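# Hypothetical usage sketch (not from the source): assumes two OLOGRAM stats
# files opened in text mode and a writable output handle, mirroring how the
# surrounding CLI would call this function; file paths are placeholders.
#
#   with open('run_a/00_ologram_stats.tsv') as a, \
#        open('run_b/00_ologram_stats.tsv') as b, \
#        open('merged_ologram.pdf', 'w') as out:
#       merge_ologram_stats(inputfiles=[a, b], output=out,
#                           labels='run_a,run_b')
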
    aes(x=lst_num_experiments, y='score', color='Group'),
    size=1.5) \
    + geom_point(aes(x=lst_num_experiments, y='score'),
                 color='darkgrey', size=0.5) \
    + geom_errorbar(all_svcca[all_svcca['Group'] == 'uncorrected'],
                    aes(x=lst_num_experiments, ymin='ymin', ymax='ymax'),
                    color='darkgrey') \
    + geom_line(threshold, aes(x=lst_num_experiments, y='score'),
                linetype='dashed', size=1, color="darkgrey",
                show_legend=False) \
    + labs(x="Number of Partitions",
           y="Similarity score (SVCCA)",
           title="Similarity across varying numbers of partitions") \
    + theme(plot_title=element_text(weight='bold'),
            plot_background=element_rect(fill="white"),
            panel_background=element_rect(fill="white"),
            panel_grid_major_x=element_line(color="lightgrey"),
            panel_grid_major_y=element_line(color="lightgrey"),
            axis_line=element_line(color="grey"),
            legend_key=element_rect(fill='white', colour='white')) \
    + scale_color_manual(['#b3e5fc'])

print(g)
ggsave(plot=g, filename=svcca_uncorrected_file, dpi=300)

# In[9]:

def clone_rarefaction(self: Union[AnnData, Dandelion],
                      color: str,
                      clone_key: Union[None, str] = None,
                      palette: Union[None, Sequence] = None,
                      figsize: Tuple[Union[int, float], Union[int, float]] = (6, 4),
                      save: Union[None, str] = None) -> ggplot:
    """
    Plots rarefaction curve for cell numbers vs clone size.

    Parameters
    ----------
    self : `AnnData`, `Dandelion`
        `AnnData` or `Dandelion` object.
    color : str
        Column name to split the calculation of clone numbers for a given
        number of cells for e.g. sample, patient etc.
    clone_key : str, optional
        Column name specifying the clone_id column in metadata/obs.
    palette : Sequence, optional
        Color mapping for unique elements in color. Will try to retrieve
        from AnnData `.uns` slot if present.
    figsize : Tuple[Union[int, float], Union[int, float]]
        Size of plot.
    save : str, optional
        Save path.

    Returns
    -------
    rarefaction curve plot.
    """
    if self.__class__ == AnnData:
        metadata = self.obs.copy()
    elif self.__class__ == Dandelion:
        metadata = self.metadata.copy()

    if clone_key is None:
        clonekey = 'clone_id'
    else:
        clonekey = clone_key

    groups = list(set(metadata[color]))
    metadata = metadata[metadata['contig_QC_pass'].isin([True, 'True'])]
    # check the dtype name rather than the Series type for categoricals
    if metadata[clonekey].dtype.name == 'category':
        metadata[clonekey] = metadata[clonekey].cat.remove_unused_categories()
    res = {}
    for g in groups:
        _metadata = metadata[metadata[color] == g]
        res[g] = _metadata[clonekey].value_counts()
    res_ = pd.DataFrame.from_dict(res, orient='index')

    # remove those with no counts
    print('removing due to zero counts:', ', '.join(
        [res_.index[i] for i, x in enumerate(res_.sum(axis=1) == 0) if x]))
    sleep(0.5)
    res_ = res_[~(res_.sum(axis=1) == 0)]

    # set up for calculating rarefaction
    tot = res_.apply(sum, axis=1)
    S = res_.apply(lambda x: x[x > 0].shape[0], axis=1)
    nr = res_.shape[0]

    # append the results to a dictionary
    rarecurve = {}
    for i in tqdm(range(0, nr), desc='Calculating rarefaction curve '):
        n = np.arange(1, tot[i], step=10)
        if (n[-1:] != tot[i]):
            n = np.append(n, tot[i])
        rarecurve[res_.index[i]] = [
            rarefun(np.array(res_.iloc[i, ]), z) for z in n
        ]
    y = pd.DataFrame([rarecurve[c] for c in rarecurve]).T
    pred = pd.DataFrame(
        [np.append(np.arange(1, s, 10), s) for s in res_.sum(axis=1)],
        index=res_.index).T

    y = y.melt()
    pred = pred.melt()
    pred['yhat'] = y['value']

    options.figure_size = figsize
    if palette is None:
        if self.__class__ == AnnData:
            try:
                pal = self.uns[str(color) + '_colors']
            except:
                if len(list(set((pred.variable)))) <= 20:
                    pal = palettes.default_20
                elif len(list(set((pred.variable)))) <= 28:
                    pal = palettes.default_28
                elif len(list(set((pred.variable)))) <= 102:
                    pal = palettes.default_102
                else:
                    pal = None

            if pal is not None:
                p = (ggplot(pred, aes(x="value", y="yhat", color="variable"))
                     + theme_classic()
                     + xlab('number of cells')
                     + ylab('number of clones')
                     + ggtitle('rarefaction curve')
                     + labs(color=color)
                     + scale_color_manual(values=(pal))
                     + geom_line())
            else:
                p = (ggplot(pred, aes(x="value", y="yhat", color="variable"))
                     + theme_classic()
                     + xlab('number of cells')
                     + ylab('number of clones')
                     + ggtitle('rarefaction curve')
                     + labs(color=color)
                     + geom_line())
        else:
            if len(list(set((pred.variable)))) <= 20:
                pal = palettes.default_20
            elif len(list(set((pred.variable)))) <= 28:
                pal = palettes.default_28
            elif len(list(set((pred.variable)))) <= 102:
                pal = palettes.default_102
            else:
                pal = None

            if pal is not None:
                p = (ggplot(pred, aes(x="value", y="yhat", color="variable"))
                     + theme_classic()
                     + xlab('number of cells')
                     + ylab('number of clones')
                     + ggtitle('rarefaction curve')
                     + labs(color=color)
                     + scale_color_manual(values=(pal))
                     + geom_line())
            else:
                p = (ggplot(pred, aes(x="value", y="yhat", color="variable"))
                     + theme_classic()
                     + xlab('number of cells')
                     + ylab('number of clones')
                     + ggtitle('rarefaction curve')
                     + labs(color=color)
                     + geom_line())
    else:
        p = (ggplot(pred, aes(x="value", y="yhat", color="variable"))
             + theme_classic()
             + xlab('number of cells')
             + ylab('number of clones')
             + ggtitle('rarefaction curve')
             + labs(color=color)
             # apply the user-provided palette (assumes it maps the unique
             # elements of `color`; the original branch dropped it silently)
             + scale_color_manual(values=(palette))
             + geom_line())

    if save:
        p.save(filename='figures/rarefaction' + str(save),
               height=plt.rcParams['figure.figsize'][0],
               width=plt.rcParams['figure.figsize'][1],
               units='in',
               dpi=plt.rcParams["savefig.dpi"])

    return (p)

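# Hypothetical usage sketch (not from the source): assumes `vdj` is a
# Dandelion object whose metadata has 'sample_id' and 'clone_id' columns.
#
#   p = clone_rarefaction(vdj, color='sample_id', figsize=(6, 4),
#                         save='_by_sample.png')
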
    out_i = pandas.DataFrame(sim_res_fwd[i], columns=out.columns[3:])
    out_i['time'] = t
    out_i['signal'] = C3_scan[i]
    out_i['dir'] = 'Low $[S^{**}]$'
    out = pandas.concat([out, out_i[out.columns]])

for i in range(len(sim_res_rev)):
    out_i = pandas.DataFrame(sim_res_rev[i], columns=out.columns[3:])
    out_i['time'] = t
    out_i['signal'] = numpy.flip(C3_scan)[i]
    out_i['dir'] = 'High $[S^{**}]$'
    out = pandas.concat([out, out_i[out.columns]])

out.to_csv("./num_cont_nuts_sub_1_model/sim.txt", sep="\t", index=False)

###################### plotting ##################################
g = (ggplot(out, aes('time', response, group='signal', color='signal'))
     + geom_line(size=0.5)
     + ylim(0, 250)
     + labs(x="time", y="$[S^{**}]$")
     + scale_color_distiller(palette='RdYlBu', type="diverging",
                             name="$B_{tot}$")
     + facet_wrap('~dir')
     + theme_bw())
g.save(filename="./num_cont_nuts_sub_1_model/sim_fwd_rev.png",
       format="png", width=8, height=4, units='in', verbose=False)

eq = out[out.time == max(out.time)]
print(eq['signal'])
print(eq['s11'])

def plot(): outdir = "output/protobowl/" pathlib.Path(outdir).mkdir(parents=True, exist_ok=True) df = load_protobowl() df.result = df.result.apply(lambda x: x is True) df["log_n_records"] = df.user_n_records.apply(np.log) df_user_grouped = df.groupby("uid") user_stat = df_user_grouped.agg(np.mean) print("{} users".format(len(user_stat))) print("{} records".format(len(df))) max_color = user_stat.log_n_records.max() user_stat["alpha"] = pd.Series( user_stat.log_n_records.apply(lambda x: x / max_color), index=user_stat.index) # 2D user plot p0 = (ggplot(user_stat) + geom_point( aes( x="relative_position", y="result", size="user_n_records", color="log_n_records", alpha="alpha", ), show_legend={ "color": False, "alpha": False, "size": False }, ) + scale_color_gradient(high="#e31a1c", low="#ffffcc") + labs(x="Average buzzing position", y="Accuracy") + theme(aspect_ratio=1)) p0.save(os.path.join(outdir, "protobowl_users.pdf")) # p0.draw() print("p0 done") # histogram of number of records p1 = (ggplot(user_stat, aes(x="log_n_records", y="..density..")) + geom_histogram(color="#e6550d", fill="#fee6ce") + geom_density() + labs(x="Log number of records", y="Density") + theme(aspect_ratio=0.3)) p1.save(os.path.join(outdir, "protobowl_hist.pdf")) # p1.draw() print("p1 done") # histogram of accuracy p2 = (ggplot(user_stat, aes(x="result", y="..density..")) + geom_histogram(color="#31a354", fill="#e5f5e0") + geom_density() + labs(x="Accuracy", y="Density") + theme(aspect_ratio=0.3)) p2.save(os.path.join(outdir, "protobowl_acc.pdf")) # p2.draw() print("p2 done") # histogram of buzzing position p3 = (ggplot(user_stat, aes(x="relative_position", y="..density..")) + geom_histogram(color="#3182bd", fill="#deebf7") + geom_density() + labs(x="Average buzzing position", y="Density") + theme(aspect_ratio=0.3)) p3.save(os.path.join(outdir, "protobowl_pos.pdf")) # p3.draw() print("p3 done")
        max_depth=5,
        min_samples_split=2,
        max_features=5,
        n_jobs=n_threads)
    sklearn_forest.fit(X, y)
    current_timing = (time.time() - start_time)
    if n >= n_burn_in:
        timing_data['implementation'].append('scikit-learn 0.23.1')
        timing_data['threads'].append(n_threads)
        timing_data['timing'].append(current_timing)

df = pd.DataFrame(data=timing_data)
df = df.groupby(['implementation', 'threads']).agg(['mean', 'std']).reset_index()
df.columns = ['Implementation', 'threads', 'mean', 'std']
print(df)

df['error_min'] = df['mean'] - df['std']
df['error_max'] = df['mean'] + df['std']

p = (ggplot(df, aes(x='threads', y='mean',
                    group='Implementation', color='Implementation'))
     + geom_line()
     + geom_point()
     + geom_errorbar(aes(ymin='error_min', ymax='error_max'),
                     width=.2, position=position_dodge(0.05))
     + labs(x="Number of threads", y="timing [s]"))
p.save(filename='benchmark.png')

normalized_all_data_UMAPencoded = model.transform(normalized_all_data_numeric)

normalized_all_data_UMAPencoded_df = pd.DataFrame(
    data=normalized_all_data_UMAPencoded,
    index=normalized_all_data.index,
    columns=["1", "2"],
)

# Add back label column
normalized_all_data_UMAPencoded_df["sample group"] = normalized_all_data[
    "sample group"]

# Plot
fig = pn.ggplot(normalized_all_data_UMAPencoded_df, pn.aes(x="1", y="2"))
fig += pn.geom_point(pn.aes(color="sample group"), alpha=0.4)
fig += pn.labs(x="UMAP 1", y="UMAP 2",
               title="Gene expression data in gene space")
fig += pn.theme_bw()
fig += pn.theme(
    legend_title_align="center",
    plot_background=pn.element_rect(fill="white"),
    legend_key=pn.element_rect(fill="white", colour="white"),
    legend_title=pn.element_text(family="sans-serif", size=15),
    legend_text=pn.element_text(family="sans-serif", size=12),
    plot_title=pn.element_text(family="sans-serif", size=15),
    axis_text=pn.element_text(family="sans-serif", size=12),
    axis_title=pn.element_text(family="sans-serif", size=15),
)
fig += pn.scale_color_manual(["#bdbdbd", "red", "blue"])
fig += pn.guides(colour=pn.guide_legend(override_aes={"alpha": 1}))