def main():
    """Embed the imported assay with t-SNE, plot it and export the coordinates.

    Reads the ``assay`` import and the ``random_seed`` argument, runs
    ``sc.tl.tsne`` and publishes both a scatter plot and the per-sample
    coordinates taken from ``adata.obsm['X_tsne']``.
    """
    gn = Granatum()

    assay = gn.get_import('assay')
    adata = gn.ann_data_from_assay(assay)
    seed = gn.get_arg('random_seed')

    sc.tl.tsne(adata, random_state=seed)
    embedding = adata.obsm['X_tsne']

    # Marker area shrinks as the number of rows in adata grows.
    marker_area = 5000 / adata.shape[0]

    plt.figure()
    plt.scatter(embedding[:, 0], embedding[:, 1], marker_area)
    plt.xlabel('t-SNE dim. 1')
    plt.ylabel('t-SNE dim. 2')
    plt.tight_layout()
    gn.add_current_figure_to_results('t-SNE plot: each dot represents a cell', dpi=75)

    # One coordinate pair per observation name, in row order.
    coords = {}
    for row, sample_id in enumerate(adata.obs_names):
        coords[sample_id] = embedding[row, :].tolist()

    tsne_export = {
        'dimNames': ['t-SNE dim. 1', 't-SNE dim. 2'],
        'coords': coords,
    }
    gn.export_statically(tsne_export, 't-SNE coordinates')

    gn.commit()
def main():
    """Filter cells by detected-gene count and 'MT' expression fraction.

    Reads the ``assay`` import plus the ``min_genes_per_cell``,
    ``max_genes_per_cell`` and ``mt_genes_percent`` arguments, reports how
    many columns pass each criterion, plots both metrics and exports the
    assay restricted to the passing columns.
    """
    started = time.perf_counter()
    gn = Granatum()

    df = gn.pandas_from_assay(gn.get_import('assay'))
    mingenes = gn.get_arg('min_genes_per_cell')
    maxgenes = gn.get_arg('max_genes_per_cell')
    mt_limit = gn.get_arg('mt_genes_percent') / 100.0

    # Per-column metrics: number of non-zero rows and total of all rows.
    genes_detected = df.astype(bool).sum(axis=0)
    total_counts = df.sum(axis=0)

    # NOTE(review): this matches every row label starting with 'MT', not
    # only an 'MT-' prefix -- confirm that is the intended gene set.
    mt_counts = df[df.index.str.startswith('MT')].sum(axis=0)
    mt_fraction = mt_counts.div(total_counts)

    below_min = genes_detected < mingenes
    above_max = genes_detected > maxgenes
    above_mt = mt_fraction > mt_limit
    passing = (
        (genes_detected >= mingenes)
        & (genes_detected <= maxgenes)
        & (mt_fraction <= mt_limit)
    )

    kept_cells = genes_detected[passing].index.values
    filtered = df.loc[:, kept_cells]

    summary = (
        "Number of cells is now {} out of {} original cells with {} below min genes, "
        "{} above max genes, and {} above mt percentage threshold."
    )
    gn.add_result(
        summary.format(
            len(kept_cells),
            genes_detected.index.size,
            genes_detected[below_min].index.size,
            genes_detected[above_max].index.size,
            genes_detected[above_mt].index.size,
        ),
        "markdown",
    )

    # Two stacked panels: (title, plotted series, x-axis label).
    panels = (
        ('Unique gene count distribution', genes_detected, 'Gene count'),
        ('MT Percent Distribution', mt_fraction * 100.0, 'MT Percent'),
    )
    plt.figure()
    for position, (title, series, xlabel) in enumerate(panels, start=1):
        plt.subplot(2, 1, position)
        plt.title(title)
        sns.distplot(series, bins=200, color='darkblue', kde_kws={'linewidth': 2})
        plt.ylabel('Frequency')
        plt.xlabel(xlabel)
    plt.tight_layout()

    caption = 'The distribution of expression levels for each cell with various metrics.'
    gn.add_current_figure_to_results(caption, zoom=1, dpi=75)

    gn.export(gn.assay_from_pandas(filtered), "Filtered Cells Assay", dynamic=False)

    elapsed = round(time.perf_counter() - started, 2)
    gn.add_result("* Finished cell filtering step in {} seconds*".format(elapsed), "markdown")

    gn.commit()
def main():
    """Project the samples of the imported assay with PCA and with t-SNE.

    For each method the two-dimensional coordinates are exported statically
    and a scatter plot is added to the results.
    """
    gn = Granatum()

    assay = gn.get_import('assay')
    matrix = np.array(assay.get('matrix'))
    sample_ids = assay.get('sampleIds')

    marker_area = 5000 / matrix.shape[1]
    # Both models are fitted on the transposed matrix; its rows are paired
    # with sample_ids below.
    samples = matrix.T

    def publish(embedding, dim_names, export_name, caption):
        """Export one embedding and add its scatter plot to the results."""
        coords = {}
        for row, sample_id in enumerate(sample_ids):
            coords[sample_id] = embedding[row, :].tolist()
        gn.export_statically({'dimNames': dim_names, 'coords': coords}, export_name)

        plt.figure()
        plt.scatter(embedding[:, 0], embedding[:, 1], s=marker_area)
        plt.tight_layout()
        gn.add_current_figure_to_results(caption, dpi=75)

    # ---- PCA ----
    pca_coords = PCA(n_components=2).fit_transform(samples)
    publish(
        pca_coords,
        ['PCA-1', 'PCA-2'],
        'pca',
        'Principal Component Analysis (PCA) scatter-plot',
    )

    # ---- t-SNE ----
    tsne_model = TSNE(n_jobs=multiprocessing.cpu_count())
    tsne_coords = tsne_model.fit_transform(samples)
    publish(
        tsne_coords,
        ['tSNE-1', 'tSNE-2'],
        'tsne',
        't-Distributed Stochastic Neighbor Embedding (t-SNE) scatter-plot',
    )

    gn.commit()
def main():
    """Draw a three-set Venn diagram of the genes whose score is in range.

    Reads the ``set1``/``set2``/``set3`` imports (mappings of key -> score),
    keeps entries whose score is a non-NaN number within
    [``minScore``, ``maxScore``], and draws either a word-cloud Venn diagram
    or a plain one depending on the ``wordcloud`` argument.
    """
    gn = Granatum()

    set1 = gn.get_import('set1')
    set2 = gn.get_import('set2')
    set3 = gn.get_import('set3')

    maxScore = gn.get_arg('maxScore')
    minScore = gn.get_arg('minScore')

    labelSet1 = gn.get_arg("labelSet1")
    labelSet2 = gn.get_arg("labelSet2")
    labelSet3 = gn.get_arg("labelSet3")

    wordcloud = gn.get_arg("wordcloud")

    def keep_in_range(scores):
        """Return the entries of *scores* with a valid score in range."""
        # `and` short-circuits, so isnan() is never called on a value that
        # failed the isinstance() check (the previous bitwise `&` evaluated
        # both sides and raised TypeError on non-numeric values).
        return {
            key: score
            for key, score in scores.items()
            if isinstance(score, numbers.Number)
            and not isnan(score)
            and minScore <= score <= maxScore
        }

    filtered_set1 = keep_in_range(set1)
    filtered_set2 = keep_in_range(set2)
    filtered_set3 = keep_in_range(set3)

    # Later sets win when the same key appears in more than one set.
    merged_frequencies = {**filtered_set1, **filtered_set2, **filtered_set3}

    packedsets = [set(filtered_set1), set(filtered_set2), set(filtered_set3)]
    set_labels = (labelSet1, labelSet2, labelSet3)

    fig, ax = plt.subplots(1, 1)
    fig.set_size_inches(5, 4)

    caption = (
        'The area weighted Venn diagram is shown for the gene sets matching the criteria'
    )

    if wordcloud:
        out = venn3_wordcloud(
            packedsets,
            set_labels=set_labels,
            wordcloud_kwargs=dict(max_font_size=36),
            word_to_frequency=merged_frequencies,
            ax=ax,
        )
    else:
        out = venn3(packedsets, set_labels=set_labels)
        venn3_circles(packedsets, linestyle='dashed', linewidth=1, color="black")

    # Label styling is the same for both diagram flavours; entries may be
    # falsy (the original code guards every label), so skip those.
    for text in out.set_labels:
        if text:
            text.set_fontsize(18)
    for text in out.subset_labels:
        if text:
            text.set_fontsize(16)
            text.set_path_effects([path_effects.SimpleLineShadow(), path_effects.Normal()])

    gn.add_current_figure_to_results(caption)

    gn.commit()
def main():
    """Apply the beta-to-m transformation to the imported assay matrix.

    Computes ``(matrix + epsilon) / (1 - matrix + epsilon)``, optionally
    takes the logarithm in base ``logBase`` (when ``take_log`` is set), plots
    the value distribution before and after, and exports the assay with the
    transformed matrix.
    """
    gn = Granatum()

    assay = gn.get_import('assay')
    matrix = np.array(assay.get('matrix'))

    take_log = gn.get_arg('take_log')
    log_base = gn.get_arg('logBase')
    epsilon = gn.get_arg('epsilon')

    transformed_matrix = (matrix + epsilon) / (1 - matrix + epsilon)
    if take_log:
        # Change of base: log_b(x) = ln(x) / ln(b).
        transformed_matrix = np.log(transformed_matrix) / np.log(log_base)

    def above_fifth_percentile(values):
        """Return the flattened entries strictly above the 5th percentile."""
        flat = values.flatten()
        return flat[flat > np.percentile(flat, 5)]

    values_before = above_fifth_percentile(matrix)
    values_after = above_fifth_percentile(transformed_matrix)

    plt.figure()

    plt.subplot(2, 1, 1)
    plt.title('Before beta-to-m transformation')
    plt.hist(values_before, bins=100)
    plt.ylabel('Frequency')
    plt.xlabel('Expression level')

    plt.subplot(2, 1, 2)
    plt.title('After beta-to-m transformation')
    plt.hist(values_after, bins=100)
    plt.ylabel('Frequency')
    plt.xlabel('Expression level')

    plt.tight_layout()

    # The caption used to claim an additional 95th-percentile upper cut-off
    # that the code never applied; it now describes the filter actually used.
    caption = (
        'The distribution of expression level before and after beta-to-m transformation. Only the values greater '
        'than the 5 percentile (usually zero in single-cell data) are considered.'
    )
    gn.add_current_figure_to_results(caption, zoom=2, dpi=50)

    assay['matrix'] = transformed_matrix.tolist()
    gn.export_statically(assay, 'Beta-to-m transformed assay')

    gn.commit()
def main():
    """Run PCA on the imported assay and publish variance and pairwise plots.

    Adds a bar chart of the explained-variance ratio per component, then for
    every pair among the first ``num_top_comps`` components adds a scatter
    plot and exports the matching sample coordinates.
    """
    gn = Granatum()

    adata = gn.ann_data_from_assay(gn.get_import("assay"))
    num_top_comps = gn.get_arg("num_top_comps")

    sc.pp.pca(adata, 20)

    variance_ratios = adata.uns["pca"]["variance_ratio"]
    pc_labels = ["PC{}".format(x + 1) for x in range(len(variance_ratios))]

    plt.figure()
    plt.bar(pc_labels, variance_ratios)
    plt.tight_layout()
    gn.add_current_figure_to_results(
        "Explained variance (ratio) by each Principal Component (PC)", height=350, dpi=75)

    X_pca = adata.obsm["X_pca"]

    # Only the columns present in X_pca can be plotted. Asking for more used
    # to fail with an IndexError inside the loop, so clamp and tell the user.
    num_available = X_pca.shape[1]
    if num_top_comps > num_available:
        gn.add_result(
            "Requested {} top components but only {} were computed; using {}.".format(
                num_top_comps, num_available, num_available),
            "markdown",
        )
        num_top_comps = num_available

    for i, j in combinations(range(num_top_comps), 2):
        xlabel = "PC{}".format(i + 1)
        ylabel = "PC{}".format(j + 1)
        title = "PC{} vs. PC{}".format(i + 1, j + 1)

        plt.figure()
        plt.scatter(X_pca[:, i], X_pca[:, j], s=5000 / adata.shape[0])
        plt.xlabel(xlabel)
        plt.ylabel(ylabel)
        plt.tight_layout()
        gn.add_current_figure_to_results(title, dpi=75)

        pca_export = {
            "dimNames": [xlabel, ylabel],
            "coords": {
                sample_id: X_pca[k, [i, j]].tolist()
                for k, sample_id in enumerate(adata.obs_names)
            },
        }
        gn.export(pca_export, title, kind="sampleCoords", meta={})

    gn.commit()
def main():
    """Scan JAMMIT over a range of theta values and report the FDR per alpha.

    Reads the ``assay`` import and the ``n_steps``, ``min_theta`` and
    ``max_theta`` arguments, plots FDR against alpha and adds the formatted
    scan result as a table.
    """
    gn = Granatum()

    df = gn.pandas_from_assay(gn.get_import('assay'))
    n_steps = gn.get_arg('n_steps')
    min_theta = gn.get_arg('min_theta')
    max_theta = gn.get_arg('max_theta')

    theta_grid = np.linspace(min_theta, max_theta, n_steps)

    jammit = JAMMIT.from_dfs([df])
    jammit.scan(
        thetas=theta_grid,
        calculate_fdr=True,
        n_perms=10,
        verbose=1,
        convergence_threshold=1e-9,
    )

    scan_table = jammit.format(columns=['theta', 'alpha', 'n_sigs', 'fdr'])
    for column in ('theta', 'alpha'):
        scan_table[column] = scan_table[column].round(3)

    plt.plot(scan_table['alpha'], scan_table['fdr'])
    plt.xlabel('alpha')
    plt.ylabel('FDR')
    gn.add_current_figure_to_results('FDR plotted against alpha', height=400)

    # Every column is rendered as a number rounded to three places.
    column_specs = []
    for header in scan_table.columns:
        column_specs.append({'name': header, 'type': 'number', 'round': 3})

    table_payload = {
        'pageSize': n_steps,
        'orient': 'split',
        'columns': column_specs,
        'data': scan_table.values.tolist(),
    }
    gn.add_result(table_payload, data_type='table')

    gn.commit()
def main():
    """Plot, per requested gene, the expression distribution of each group.

    For every gene in the parsed ``gene_ids`` argument, ``plot_fits`` is run
    on all non-missing values of that gene and ``plot_predict`` is then run
    once per group from the ``groups`` import, reusing the returned
    parameters. One figure per gene is added to the results.
    """
    gn = Granatum()

    # The viz_data import is requested as before, but nothing below reads it.
    gn.get_import("viz_data")
    df = gn.pandas_from_assay(gn.get_import("assay"))
    gene_ids = parse(gn.get_arg("gene_ids"))
    groups = gn.get_import("groups")

    # confint is a percentage: alpha already divides it by 100. The z-score
    # must use the same scale -- norm.ppf expects a probability in [0, 1]
    # and returns NaN for a raw value such as 95.
    confint = gn.get_arg("confint")
    alpha = 1.0 - confint / 100.0
    min_zscore = st.norm.ppf(confint / 100.0)
    min_dist = 0.1

    # Invert the groups mapping (key -> group) into group -> list of keys.
    inv_map = {}
    for k, v in groups.items():
        inv_map.setdefault(v, []).append(k)

    for gene in gene_ids:
        plt.figure()

        # First form a statistic for all values, also puts out plot
        params = plot_fits(
            df.loc[gene, :].dropna().to_list(),
            color="r",
            alpha=alpha,
            min_dist=min_dist,
            min_zscore=min_zscore,
            label="All",
        )

        for k, v in inv_map.items():
            plt.subplot(1, 1, 1)
            plt.title('Gene expression level distribution for each cluster')
            plot_predict(df.loc[gene, v].dropna().to_list(), params, label=k)

        plt.ylabel('Frequency')
        plt.xlabel('Gene expression')
        plt.legend()
        plt.tight_layout()

        caption = "The distribution of expression levels for gene {}.".format(gene)
        gn.add_current_figure_to_results(caption, zoom=1, dpi=75)

    gn.commit()
def main():
    """Embed the columns of the imported assay with UMAP and export the result.

    Reads the ``assay`` import and the ``n_neighbors``, ``min_dist``,
    ``metric`` and ``random_seed`` arguments, fits UMAP on the transposed
    data frame, adds a scatter plot and exports one coordinate pair per
    data-frame column.
    """
    tic = time.perf_counter()

    gn = Granatum()

    df = gn.pandas_from_assay(gn.get_import('assay'))
    n_neighbors = gn.get_arg('n_neighbors')
    min_dist = gn.get_arg('min_dist')
    metric = gn.get_arg('metric')
    random_seed = gn.get_arg('random_seed')

    embedding = umap.UMAP(
        n_neighbors=n_neighbors,
        min_dist=min_dist,
        metric=metric,
        random_state=random_seed,
    ).fit_transform(df.values.T)

    # Scale the marker area by the number of plotted points. UMAP is fitted
    # on df.values.T, so the points are the data-frame columns; the previous
    # df.shape[0] was the number of rows instead. The area is capped at 36.
    marker_area = min(5000 / embedding.shape[0], 36.0)

    plt.figure()
    plt.scatter(embedding[:, 0], embedding[:, 1], marker_area)
    plt.xlabel('UMAP dim. 1')
    plt.ylabel('UMAP dim. 2')
    plt.tight_layout()

    gn.add_current_figure_to_results('UMAP plot: each dot represents a cell', dpi=75)

    umap_export = {
        'dimNames': ['UMAP dim. 1', 'UMAP dim. 2'],
        'coords': {
            sample_id: embedding[i, :].tolist()
            for i, sample_id in enumerate(df.columns)
        },
    }
    gn.export_statically(umap_export, 'UMAP coordinates')

    toc = time.perf_counter()
    time_passed = round(toc - tic, 2)

    timing = "* Finished UMAP step in {} seconds*".format(time_passed)
    gn.add_result(timing, "markdown")

    gn.commit()
def main():
    """Filter genes by number of expressing cells and by mean/dispersion.

    Reads the ``assay`` import and the ``min_cells_expressed``, ``min_mean``,
    ``max_mean``, ``min_disp`` and ``max_disp`` arguments, applies
    ``sc.pp.filter_genes`` followed by ``sc.pp.filter_genes_dispersion``,
    reports the gene counts before and after, and exports the filtered assay.
    """
    gn = Granatum()

    adata = gn.ann_data_from_assay(gn.get_import("assay"))
    min_cells_expressed = gn.get_arg("min_cells_expressed")
    min_mean = gn.get_arg("min_mean")
    max_mean = gn.get_arg("max_mean")
    min_disp = gn.get_arg("min_disp")
    max_disp = gn.get_arg("max_disp")

    num_genes_before = adata.shape[1]

    sc.pp.filter_genes(adata, min_cells=min_cells_expressed)

    # The thresholds are passed on a log scale. log(0) is undefined and used
    # to abort the step with "math domain error"; a non-positive lower
    # threshold is treated as "no lower bound" instead.
    log_min_mean = math.log(min_mean) if min_mean > 0 else -math.inf

    filter_result = sc.pp.filter_genes_dispersion(
        adata.X,
        flavor='seurat',
        min_mean=log_min_mean,
        max_mean=math.log(max_mean),
        min_disp=min_disp,
        max_disp=max_disp,
    )
    adata = adata[:, filter_result.gene_subset]

    sc.pl.filter_genes_dispersion(filter_result)
    gn.add_current_figure_to_results(
        "Each dot represent a gene. The gray dots are the removed genes. The x-axis is log-transformed.",
        zoom=3,
        dpi=50,
        height=400,
    )

    # The result type is passed positionally, as the other add_result calls
    # in this file do, rather than through an extra `type=` keyword.
    gn.add_result(
        "\n".join(
            [
                "Number of genes before filtering: **{}**".format(num_genes_before),
                "",
                "Number of genes after filtering: **{}**".format(adata.shape[1]),
            ]
        ),
        "markdown",
    )

    gn.export(gn.assay_from_ann_data(adata), "Filtered Assay", dynamic=False)

    gn.commit()
def main():
    """Run scanpy's neighbors and dpt on the imported assay and export results.

    Builds an AnnData from the transposed assay matrix, runs
    ``sc.pp.neighbors`` and ``sc.tl.dpt``, pickles the AnnData, and for the
    ``dpt_order`` and ``dpt_groups`` observation columns adds a diffusion-map
    plot and statically exports the per-observation values.
    """
    gn = Granatum()

    n_neighbors = gn.get_arg('nNeighbors', 15)
    neighbor_method = gn.get_arg('neighborMethod', 'gauss')

    assay = gn.get_import('assay')

    expression = np.array(assay.get('matrix'))
    adata = sc.AnnData(expression.T)
    adata.var_names = assay.get('geneIds')
    adata.obs_names = assay.get('sampleIds')

    sc.pp.neighbors(adata, n_neighbors=n_neighbors, use_rep='X', method=neighbor_method)
    sc.tl.dpt(adata, n_branchings=1)

    gn._pickle(adata, 'adata')

    # (observation column, figure caption)
    outputs = (
        ('dpt_order', 'Cell order'),
        ('dpt_groups', 'Cell groups'),
    )
    for column, caption in outputs:
        plt.figure()
        sc.pl.diffmap(adata, color=column)
        gn.add_current_figure_to_results(caption)

        names = adata.obs_names.tolist()
        values = adata.obs[column].values.tolist()
        gn.export_statically(dict(zip(names, values)), column)

    gn.commit()
def main():
    """Cluster the imported assay with Louvain and plot it on given coordinates.

    Runs ``sc.pp.neighbors`` and ``sc.tl.louvain``, exports a
    ``'Cluster N'`` label per observation (N is the Louvain label plus one),
    and draws one scatter series per cluster using the ``sampleCoords``
    import.
    """
    gn = Granatum()

    adata = gn.ann_data_from_assay(gn.get_import('assay'))
    sample_coords = gn.get_import('sampleCoords')
    random_seed = gn.get_arg('random_seed')

    sc.pp.neighbors(adata, n_neighbors=20, use_rep='X', method='gauss')
    sc.tl.louvain(adata, random_state=random_seed)

    def cluster_label(raw):
        """Turn a Louvain label into its one-based display name."""
        return 'Cluster {}'.format(int(raw) + 1)

    louvain = adata.obs['louvain']

    cluster_assignment = {
        cell_id: cluster_label(raw)
        for cell_id, raw in zip(adata.obs_names, louvain)
    }
    gn.export_statically(cluster_assignment, 'Cluster assignment')

    dim_names = sample_coords.get('dimNames')
    coords_dict = sample_coords.get('coords')

    plt.figure()
    for raw in louvain.cat.categories:
        members = adata.obs_names[louvain == raw]
        xs, ys = [], []
        for cell_id in members:
            point = coords_dict.get(cell_id)
            xs.append(point[0])
            ys.append(point[1])
        plt.scatter(xs, ys, label=cluster_label(raw))

    plt.xlabel(dim_names[0])
    plt.ylabel(dim_names[1])
    plt.legend()
    plt.tight_layout()

    gn.add_current_figure_to_results(
        'Scatter-plot using imported cell coordinates. Each dot represents a cell. The colors indicate the indentified cell clusters.',
        dpi=75)

    gn.commit()
def main():
    """Apply a generalized-log (glog) transformation to the imported assay.

    Two benchmark genes are picked from the genes ranked by mean expression:
    among the ``n_top`` highest the one with the largest standard deviation
    ("housekeeping"), and among the ``n_bottom`` lowest the one with the
    smallest standard deviation ("negative control"). They determine the
    offset ``alphabk`` and the constant ``c`` of
    ``y = log_b((z + sqrt(z^2 + c)) / 2)`` with ``z = x - alphabk``.

    Raises:
        ValueError: if ``which_mid`` is neither ``'mean'`` nor ``'median'``.
    """
    gn = Granatum()

    assay = gn.get_import('assay')
    # Builtin float instead of the np.float alias, which NumPy removed.
    x = np.array(assay.get('matrix')).astype(float)

    log_base = gn.get_arg('log_base')
    n_top = gn.get_arg('n_top')
    n_bottom = gn.get_arg('n_bottom')
    which_mid = gn.get_arg('which_mid')

    gene_df = pd.DataFrame(
        {
            'row_num': range(x.shape[0]),
            'gene_id': assay.get('geneIds'),
            'exp_mean': np.mean(x, axis=1),
            'exp_std': np.std(x, axis=1),
        }
    )
    gene_df = gene_df.sort_values('exp_mean', ascending=False)

    top_gene_row = gene_df.head(n_top).sort_values('exp_std', ascending=False).iloc[0]
    bottom_gene_row = gene_df.tail(n_bottom).sort_values('exp_std').iloc[0]

    # A row taken from a mixed-dtype frame is an object Series; make the row
    # numbers plain ints before using them as array indices.
    hk_gene = np.clip(x[int(top_gene_row['row_num']), :], a_min=0.00001, a_max=None)
    neg_gene = x[int(bottom_gene_row['row_num']), :]

    if which_mid == 'mean':
        alphabk = np.mean(neg_gene[:])
    elif which_mid == 'median':
        alphabk = np.median(neg_gene[:])
    else:
        raise ValueError(
            f"which_mid must be 'mean' or 'median', got {which_mid!r}")

    loghkdatabk = np.log(hk_gene - alphabk) / np.log(log_base)
    # Drop NAN values
    loghkdatabk = loghkdatabk[~np.isnan(loghkdatabk)]

    c = (np.std(neg_gene[:], ddof=1) / np.std(loghkdatabk, ddof=1))**2

    xbk = x - alphabk
    transformed_matrix = np.log((xbk + np.sqrt(xbk**2 + c)) / 2) / np.log(log_base)

    gn.add_result(
        '\n'.join(
            [
                "Selected benchmarking genes:",
                f" * housekeeping gene: **{top_gene_row['gene_id']}** "
                f"(mean: {top_gene_row['exp_mean']}, std: {top_gene_row['exp_std']}) ",
                f" * negative control gene: **{bottom_gene_row['gene_id']}**"
                f"(mean: {bottom_gene_row['exp_mean']}, std: {bottom_gene_row['exp_std']})",
                "",
                f"Final formula is `y = log{log_base}((z + sqrt(z^2 + c))/2)`, where `z = x - {alphabk}` and `c = {c}`."
            ]
        ),
        'markdown'
    )

    def above_fifth_percentile(values):
        """Return the flattened entries strictly above the 5th percentile."""
        flat = values.flatten()
        return flat[flat > np.percentile(flat, 5)]

    values_before = above_fifth_percentile(x)
    values_after = above_fifth_percentile(transformed_matrix)

    plt.figure()

    plt.subplot(2, 1, 1)
    plt.title('Before glog transformation')
    plt.hist(values_before, bins=100)
    plt.ylabel('Frequency')
    plt.xlabel('Expression level')

    plt.subplot(2, 1, 2)
    plt.title('After glog transformation')
    plt.hist(values_after, bins=100)
    plt.ylabel('Frequency')
    plt.xlabel('Expression level')

    plt.tight_layout()

    # The caption used to claim an additional 95th-percentile upper cut-off
    # that the code never applied; it now describes the filter actually used.
    caption = (
        'The distribution of expression level before and after glog transformation. Only the values greater '
        'than the 5 percentile (usually zero in single-cell data) are considered.'
    )
    gn.add_current_figure_to_results(caption, zoom=2, dpi=50)

    assay['matrix'] = transformed_matrix.tolist()
    gn.export_statically(assay, 'GLog transformed assay')

    gn.commit()
def main():
    """Colour an imported 2-D sample embedding by the expression of given genes.

    ``gene_ids`` is a comma-separated string. With ``overlay_genes`` unset,
    one figure is added per gene; with it set, all genes are drawn on the
    same axes (one colour bar each) and a single figure is added after the
    loop. Genes missing from the assay index produce a table of the
    available gene IDs instead of a plot.

    ``cdict`` and ``produce_cdict`` are not defined in this block.
    """
    gn = Granatum()
    sample_coords = gn.get_import("viz_data")
    df = gn.pandas_from_assay(gn.get_import("assay"))
    gene_ids = gn.get_arg("gene_ids")
    overlay_genes = gn.get_arg("overlay_genes")
    max_colors = gn.get_arg("max_colors")
    min_level = gn.get_arg("min_level")
    max_level = gn.get_arg("max_level")
    convert_to_zscore = gn.get_arg("convert_to_zscore")
    min_marker_area = gn.get_arg("min_marker_area")
    max_marker_area = gn.get_arg("max_marker_area")
    min_alpha = gn.get_arg("min_alpha")
    max_alpha = gn.get_arg("max_alpha")
    grey_level = gn.get_arg("grey_level")

    coords = sample_coords.get("coords")
    dim_names = sample_coords.get("dimNames")

    # Build the list of colour maps. An empty max_colors string selects the
    # defaults: a cycle through 'bgrcmk' (one map per gene) when overlaying,
    # or the single map built from `cdict` otherwise. A non-empty string is
    # split on commas, giving one map per listed colour.
    cmaps = []
    if overlay_genes:
        if max_colors == "":
            numcolors = len(gene_ids.split(','))
            cycol = cycle('bgrcmk')
            for i in range(numcolors):
                cmaps = cmaps + [
                    LinearSegmentedColormap("fire", produce_cdict(next(cycol), grey=grey_level, min_alpha=min_alpha, max_alpha=max_alpha), N=256)
                ]
        else:
            for col in max_colors.split(','):
                col = col.strip()
                cmaps = cmaps + [
                    LinearSegmentedColormap("fire", produce_cdict(col, grey=grey_level, min_alpha=min_alpha, max_alpha=max_alpha), N=256)
                ]
    else:
        if max_colors == "":
            cmaps = cmaps + [LinearSegmentedColormap("fire", cdict, N=256)]
        else:
            for col in max_colors.split(','):
                col = col.strip()
                cmaps = cmaps + [
                    LinearSegmentedColormap("fire", produce_cdict(col, grey=grey_level, min_alpha=min_alpha, max_alpha=max_alpha), N=256)
                ]

    # Layout: one large scatter axis followed by num_cbars colour-bar axes.
    # The first height ratio is plot_height / (num_cbars * colorbar_height);
    # every colour-bar axis gets ratio 1.
    colorbar_height = 10
    plot_height = 650
    num_cbars = 1
    if overlay_genes:
        num_cbars = len(gene_ids.split(','))

    cbar_height_ratio = plot_height / (num_cbars * colorbar_height)
    fig, ax = plt.subplots(
        1 + num_cbars, 1, gridspec_kw={'height_ratios': [cbar_height_ratio] + [1] * num_cbars})

    # gene_index counts every requested gene, including ones that are not
    # found; it selects the colour map and the colour-bar axis (both modulo).
    gene_index = -1
    for gene_id in gene_ids.split(','):
        gene_id = gene_id.strip()
        gene_index = gene_index + 1
        if gene_id in df.index:
            if not overlay_genes:
                # Separate-figure mode: start from a fresh figure per gene.
                plt.clf()
                fig, ax = plt.subplots(
                    1 + num_cbars, 1, gridspec_kw={
                        'height_ratios': [cbar_height_ratio] + [1] * num_cbars
                    })

            # Mean and population standard deviation (ddof=0) of this gene.
            transposed_df = df.T

            mean = transposed_df[gene_id].mean()
            stdev = transposed_df[gene_id].std(ddof=0)

            # x/y come from the coords import in its iteration order; the
            # value column is a Series and is aligned on the coords keys.
            if convert_to_zscore:
                scatter_df = pd.DataFrame(
                    {
                        "x": [a[0] for a in coords.values()],
                        "y": [a[1] for a in coords.values()],
                        "value": (df.loc[gene_id, :] - mean) / stdev
                    },
                    index=coords.keys())
            else:
                scatter_df = pd.DataFrame(
                    {
                        "x": [a[0] for a in coords.values()],
                        "y": [a[1] for a in coords.values()],
                        "value": df.loc[gene_id, :]
                    },
                    index=coords.keys())

            # Clip to [min_level, max_level], then map the clipped range
            # linearly onto [min_marker_area, max_marker_area] and square it.
            # NOTE(review): when all clipped values are equal the division
            # below is by zero -- confirm how that case should be drawn.
            values_df = np.clip(scatter_df["value"], min_level, max_level, out=None)
            min_value = np.nanmin(values_df)
            max_value = np.nanmax(values_df)
            scaled_marker_size = (max_marker_area - min_marker_area) * (
                values_df - min_value) / (max_value - min_value) + min_marker_area
            scaled_marker_size = scaled_marker_size * scaled_marker_size
            # s = 5000 / scatter_df.shape[0]
            scatter = ax[0].scatter(
                x=scatter_df["x"],
                y=scatter_df["y"],
                s=scaled_marker_size,
                c=values_df,
                cmap=cmaps[gene_index % len(cmaps)])  #Amp_3.mpl_colormap)
            cbar = fig.colorbar(scatter, cax=ax[1 + (gene_index % num_cbars)], orientation='horizontal', aspect=40)
            cbar.set_label(gene_id, rotation=0)

            ax[0].set_xlabel(dim_names[0])
            ax[0].set_ylabel(dim_names[1])

            if not overlay_genes:
                gn.add_current_figure_to_results(
                    "Scatter-plot of {} expression".format(gene_id), dpi=75)

        else:
            # if the gene ID entered is not present in the assay
            # Communicate it to the user and output a table of available gene ID's
            description = 'The selected gene is not present in the assay. See the step that generated the assay'
            genes_in_assay = pd.DataFrame(
                df.index.tolist(), columns=['Gene unavailable in assay: choose from below'])
            gn.add_pandas_df(genes_in_assay, description)

    # Overlay mode: everything was drawn on one figure, so add it once here.
    if overlay_genes:
        gn.add_current_figure_to_results(
            "Scatter-plot of {} expression".format(gene_ids),
            height=650 + 100 * len(gene_ids.split(',')),
            dpi=75)

    gn.commit()
def main():
    """Run JAMMIT for a single alpha and publish the signal genes and samples.

    Exports the ``u`` vector keyed by the data-frame index and the ``v``
    vector keyed by the data-frame columns, reports the entries whose
    absolute value exceeds ``EPSILON`` as tables and CSV exports, exports the
    assay restricted to those rows and columns, and adds a cluster map plus
    one plot per vector.
    """
    gn = Granatum()

    df = gn.pandas_from_assay(gn.get_import('assay'))
    alpha = gn.get_arg('alpha')

    jammit = JAMMIT.from_dfs([df])
    res = jammit.run_for_one_alpha(
        alpha,
        verbose=1,
        convergence_threshold=1e-9,
    )
    u, v = res['u'], res['v']

    gn.export(dict(zip(df.index, u)), 'Genes loadings', kind='geneMeta')
    gn.export(dict(zip(df.columns, v)), 'Sample scores', kind='sampleMeta')

    def report_signal(ids, weights, weight_col, abs_col, title, csv_name):
        """Tabulate the entries above EPSILON, publish them and return the table."""
        table = pd.DataFrame({
            'id_': ids,
            abs_col: abs(weights),
            weight_col: weights,
        })
        table = table[['id_', abs_col, weight_col]]
        table = table.loc[table[weight_col].abs() > EPSILON]
        table = table.sort_values(abs_col, ascending=False)

        gn.add_result(
            {
                'title': f"{title} ({len(table)})",
                'orient': 'split',
                'columns': table.columns.values.tolist(),
                'data': table.values.tolist(),
            },
            data_type='table',
        )
        gn.export(table.to_csv(index=False), csv_name, kind='raw', meta=None, raw=True)
        return table

    gene_df = report_signal(
        df.index, u, 'loading', 'abs_loading', 'Signal genes', 'signal_genes.csv')
    sample_df = report_signal(
        df.columns, v, 'score', 'abs_score', 'Signal samples', 'signal_samples.csv')

    subset_df = df.loc[gene_df['id_'], sample_df['id_']]
    gn.export(gn.assay_from_pandas(subset_df), 'Assay with only signal genes and samples', kind='assay')

    sns.clustermap(subset_df, cmap='RdBu')
    gn.add_current_figure_to_results(
        description='Cluster map of the signal genes and signal samples',
        zoom=2,
        width=750,
        height=850,
        dpi=50,
    )
    plt.close()

    # Keyword arguments shared by the two vector plots.
    vector_figure = dict(zoom=2, width=750, height=450, dpi=50)

    u_positions = range(len(u))
    plt.figure()
    plt.scatter(u_positions, u, s=2, c='red')
    plt.xlabel('index')
    plt.ylabel('value in u')
    gn.add_current_figure_to_results(
        description='The *u* vector (loadings for genes) plotted as a scatter plot.',
        **vector_figure,
    )
    plt.close()

    v_positions = range(len(v))
    plt.figure()
    plt.plot(v_positions, v)
    plt.scatter(v_positions, v, s=6, c='red')
    plt.xlabel('index')
    plt.ylabel('value in v')
    gn.add_current_figure_to_results(
        description='The *v* vector (scores for samples) plotted as a line plot.',
        **vector_figure,
    )
    plt.close()

    gn.commit()
def main():
    """Colour an imported 2-D sample embedding by a metadata value.

    ``coloring_type`` selects the mode:

    * ``"categorical"`` -- every distinct value gets a colour from the
      ``gist_rainbow`` map; groups with more than three points also get a
      text label and, when ``bounding_stdev`` is non-negative, a translucent
      polygon built from the convex hull of the points that are not farther
      from the centroid than the mean distance plus ``bounding_stdev``
      standard deviations.
    * ``"continuous"`` -- points are coloured with the ``Reds`` map and a
      colour bar is added.

    Any exception raised while plotting is caught; a placeholder figure
    suggesting the other colouring type is added together with the error
    text, and the step is still committed.

    ``resetArray`` is not defined in this block.
    """
    tic = time.perf_counter()

    gn = Granatum()
    sample_coords = gn.get_import("viz_data")
    value = gn.get_import("value")
    coloring_type = gn.get_arg("coloring_type")
    bounding_stdev = gn.get_arg("bounding_stdev")
    label_location = gn.get_arg("label_location")
    label_transform = gn.get_arg("label_transform")
    labelXaxis = gn.get_arg("labelXaxis")
    labelYaxis = gn.get_arg("labelYaxis")
    sigfigs = gn.get_arg("sigfigs")
    numticks = gn.get_arg("numticks")
    font = gn.get_arg('font')

    coords = sample_coords.get("coords")
    dim_names = sample_coords.get("dimNames")

    # Seed both generators so colours and random label positions repeat.
    seed = gn.get_arg('random_seed')
    random.seed(seed)
    np.random.seed(seed)

    df = pd.DataFrame(
        {
            "x": [a[0] for a in coords.values()],
            "y": [a[1] for a in coords.values()],
            "value": pd.Series(value)
        },
        index=coords.keys())

    target_dpi = 300
    target_width = 7.5  # inches
    target_height = 6.5  # inches
    font_size_in_in = font / 72.0  # inches
    font_size_in_px = font_size_in_in * target_dpi

    try:
        if coloring_type == "categorical":
            uniq = df["value"].unique()
            uniq.sort(kind="stable")
            num = uniq.shape[0]
            COLORS2 = plt.get_cmap('gist_rainbow')
            carr = [0] * df.shape[0]
            listcats = list(df["value"])
            miny = min(list(df["y"]))
            maxy = max(list(df["y"]))
            # Data units per output pixel along y; used to offset labels.
            scaley = (maxy - miny) / (target_height * target_dpi)
            print("Scaley = {}".format(scaley))
            colorhash = {}
            # randrange() needs a real integer: a numpy float is rejected by
            # Python 3.12+, which previously surfaced only as the generic
            # "incompatible" message below.
            colorstep = int(np.ceil(256.0 / num))
            coffset = randrange(colorstep)
            # Random permutation so neighbouring categories do not get
            # neighbouring colours.
            grouptocolor = np.random.choice(np.arange(num), num, replace=False)

            for i, cat in enumerate(uniq):
                dff = df[df["value"] == cat]
                xs = list(dff["x"])
                ys = list(dff["y"])
                colorindex = (coffset + grouptocolor[i] * colorstep) % 256
                colorhash[cat] = colorindex
                craw = COLORS2((colorindex + 0.0) / 256.0)
                clr = [craw[0], craw[1], craw[2], 0.2]
                whitetransparent = [1.0, 1.0, 1.0, 0.5]
                coloropaque = [craw[0], craw[1], craw[2], 1.0]

                if len(xs) > 3:
                    pts = list(zip(xs, ys))
                    cent = np.mean(pts, axis=0)
                    # Distance of every point from the group centroid.
                    lengs = list(
                        map(
                            lambda p: math.sqrt(
                                (p[0] - cent[0]) * (p[0] - cent[0]) +
                                (p[1] - cent[1]) * (p[1] - cent[1])), pts))
                    avgleng = st.mean(lengs)
                    stdleng = st.stdev(lengs) * bounding_stdev
                    rpts = []
                    if stdleng > 0.0:
                        # Keep only points within the distance threshold.
                        for j, ln in enumerate(lengs):
                            if ln - avgleng < stdleng:
                                rpts.append(pts[j])
                        pts = rpts
                    cent = np.mean(pts, axis=0)
                    hull = ConvexHull(pts)

                    # Collect hull edge end points, order them by angle
                    # around the centroid, drop the duplicates and close
                    # the ring.
                    ptslist = []
                    for pt in hull.simplices:
                        ptslist.append(pts[pt[0]])
                        ptslist.append(pts[pt[1]])
                    ptslist.sort(key=lambda p: np.arctan2(
                        p[1] - cent[1], p[0] - cent[0]))
                    ptslist = ptslist[0::2]
                    ptslist.insert(len(ptslist), ptslist[0])

                    # Anchor for the label: lowest hull point, or a random
                    # one when label_location is not 'bottom'.
                    lowestpt = ptslist[0]
                    if label_location == 'bottom':
                        for pt in ptslist:
                            if pt[1] < lowestpt[1]:
                                lowestpt = pt
                    else:
                        lowestpt = ptslist[randrange(len(ptslist))]

                    if bounding_stdev >= 0.0:
                        # Hull scaled by 1.1 about the centroid.
                        poly = Polygon(1.1 * (np.array(ptslist) - cent) + cent,
                                       facecolor=clr)
                        poly.set_capstyle('round')
                        plt.gca().add_patch(poly)
                        poly.set_color(clr)

                    label_text = cat
                    if label_transform == "numbers":
                        label_text = re.sub("[^0-9]", "", cat)
                    plt.text(lowestpt[0],
                             lowestpt[1] - scaley * font_size_in_px * 1.2,
                             label_text,
                             fontsize=font,
                             fontname="Arial",
                             ha="center",
                             va="center",
                             color="black",
                             bbox=dict(boxstyle="round",
                                       fc=whitetransparent,
                                       ec=coloropaque))

                # Record this category's colour index for each of its points.
                for j, x in enumerate(listcats):
                    if x == cat:
                        carr[j] = colorhash[cat]

            plt.scatter(x=df["x"], y=df["y"], s=5000 / df.shape[0], c=carr, cmap=COLORS2)
            plt.legend(markerscale=6, loc='upper center', bbox_to_anchor=(0.5, -0.05), ncol=5)
        elif coloring_type == "continuous":
            plt.scatter(x=df["x"], y=df["y"], s=5000 / df.shape[0], c=df["value"], cmap="Reds")
            plt.colorbar()

        xmin, xmax = plt.gca().get_xlim()
        ymin, ymax = plt.gca().get_ylim()
        xtickArray = resetArray(xmin, xmax, numticks, sigfigs)
        ytickArray = resetArray(ymin, ymax, numticks, sigfigs)
        plt.xlim(xtickArray[0], xtickArray[-1])
        plt.ylim(ytickArray[0], ytickArray[-1])
        plt.xticks(xtickArray, fontsize=font, fontname="Arial")
        plt.yticks(ytickArray, fontsize=font, fontname="Arial")

        # An empty axis-label argument falls back to the imported dim name.
        if labelXaxis == "":
            plt.xlabel(dim_names[0], fontsize=font, fontname="Arial")
        else:
            plt.xlabel(labelXaxis, fontsize=font, fontname="Arial")

        if labelYaxis == "":
            plt.ylabel(dim_names[1], fontsize=font, fontname="Arial")
        else:
            plt.ylabel(labelYaxis, fontsize=font, fontname="Arial")

        gn.add_current_figure_to_results(
            "Scatter-plot",
            dpi=target_dpi,
            width=target_width * target_dpi,
            height=target_height * target_dpi,
            savefig_kwargs={'bbox_inches': 'tight'})

        toc = time.perf_counter()
        time_passed = round(toc - tic, 2)

        timing = "* Finished sample coloring step in {} seconds*".format(
            time_passed)
        gn.add_result(timing, "markdown")

        gn.commit()

    except Exception as e:
        # Best-effort fallback: keep the step from failing outright, but
        # also report the actual error so it is not hidden behind the
        # generic hint.
        plt.figure()
        plt.text(
            0.05, 0.7,
            'Values used as colors and type of sample metadata are incompatible with each other'
        )

        if coloring_type == 'categorical':
            new_coloring_type = 'continuous'
        else:
            new_coloring_type = 'categorical'

        plt.text(
            0.05, 0.5, 'Retry the step with ' + new_coloring_type +
            ' instead of ' + coloring_type)
        plt.axis('off')
        gn.add_current_figure_to_results('Scatter-plot')
        gn.add_result('Error = {}'.format(e), "markdown")

        gn.commit()
def main():
    """Rank marker genes for each group against the rest and export the scores.

    Builds an AnnData from the transposed assay matrix, attaches the
    ``groupVec`` import as a categorical observation column and runs
    ``sc.tl.rank_genes_groups``. For every group the name -> score mapping is
    exported as gene metadata and as raw CSV-like text. If anything in that
    stage raises, a placeholder figure and the error text are added instead.
    """
    started = time.perf_counter()

    gn = Granatum()
    assay = gn.get_import('assay')
    sample_ids = assay.get('sampleIds')
    group_dict = gn.get_import('groupVec')

    group_vec = pd.Categorical([group_dict.get(sample_id) for sample_id in sample_ids])
    num_groups = len(group_vec.categories)
    # 400 units of figure height for every started block of seven groups.
    figheight = 400 * ((num_groups - 1) // 7 + 1)

    expression = np.array(assay.get('matrix'))
    adata = sc.AnnData(expression.T)
    adata.var_names = assay.get('geneIds')
    adata.obs_names = assay.get('sampleIds')
    adata.obs['groupVec'] = group_vec

    sc.pp.neighbors(adata, n_neighbors=20, use_rep='X', method='gauss')

    try:
        sc.tl.rank_genes_groups(adata, 'groupVec', n_genes=100000)
        sc.pl.rank_genes_groups(adata, n_genes=20)
        gn.add_current_figure_to_results('One-vs-rest marker genes', dpi=75, height=figheight)

        gn._pickle(adata, 'adata')

        ranking = adata.uns['rank_genes_groups']
        for group in ranking['names'].dtype.names:
            marker_scores = {}
            for name_row, score_row in zip(ranking['names'], ranking['scores']):
                marker_scores[str(name_row[group])] = float(score_row[group])

            gn.export(marker_scores, 'Marker score ({} vs. rest)'.format(group), kind='geneMeta')

            # One '"name", score' line per gene.
            csv_lines = ['"{}", {}'.format(gene, score) for gene, score in marker_scores.items()]
            gn.export(
                "\n".join(csv_lines),
                'Marker score {} vs rest.csv'.format(group),
                kind='raw',
                meta=None,
                raw=True,
            )

        elapsed = round(time.perf_counter() - started, 2)
        gn.add_result(
            "* Finished marker gene identification step in {} seconds*".format(elapsed),
            "markdown",
        )

        gn.commit()

    except Exception as e:
        plt.figure()
        plt.text(0.01, 0.5, 'Incompatible group vector due to insufficent cells')
        plt.text(0.01, 0.3, 'Please retry the step with a different group vector')
        plt.axis('off')
        gn.add_current_figure_to_results('One-vs-rest marker genes')
        gn.add_result('Error = {}'.format(e), "markdown")

        gn.commit()