def main():
    """Filter cells by detected-gene count and 'MT'-gene fraction.

    Reads the 'assay' import as a DataFrame whose rows are genes and whose
    columns are cells. A cell is kept when its number of non-zero genes lies
    within [min_genes_per_cell, max_genes_per_cell] and the share of its
    counts coming from genes whose name starts with 'MT' does not exceed
    mt_genes_percent. A summary, two distribution plots and the filtered
    assay are added to the Granatum results.
    """
    started = time.perf_counter()

    gn = Granatum()
    expr = gn.pandas_from_assay(gn.get_import('assay'))
    min_genes = gn.get_arg('min_genes_per_cell')
    max_genes = gn.get_arg('max_genes_per_cell')
    mt_limit = gn.get_arg('mt_genes_percent') / 100.0

    # Per-cell (per-column) statistics.
    genes_detected = expr.astype(bool).sum(axis=0)
    total_counts = expr.sum(axis=0)
    # NOTE(review): presumably meant to select mitochondrial genes; a bare
    # 'MT' prefix also matches symbols such as 'MTOR' -- confirm intent.
    mt_counts = expr[expr.index.str.startswith('MT')].sum(axis=0)
    mt_fraction = mt_counts.div(total_counts)

    # The pass/fail masks are written out separately (not as negations of
    # each other) so that NaN fractions fail every comparison.
    below_min = genes_detected < min_genes
    above_max = genes_detected > max_genes
    above_mt = mt_fraction > mt_limit
    passing = (
        (genes_detected >= min_genes)
        & (genes_detected <= max_genes)
        & (mt_fraction <= mt_limit)
    )
    kept_cells = genes_detected[passing].index.values
    filtered = expr.loc[:, kept_cells]

    summary = (
        "Number of cells is now {} out of {} original cells with {} below min genes, "
        "{} above max genes, and {} above mt percentage threshold."
    ).format(
        len(kept_cells),
        genes_detected.index.size,
        genes_detected[below_min].index.size,
        genes_detected[above_max].index.size,
        genes_detected[above_mt].index.size,
    )
    gn.add_result(summary, "markdown")

    panels = (
        ('Unique gene count distribution', genes_detected, 'Gene count'),
        ('MT Percent Distribution', mt_fraction * 100.0, 'MT Percent'),
    )
    plt.figure()
    for position, (title, values, xlabel) in enumerate(panels, start=1):
        plt.subplot(2, 1, position)
        plt.title(title)
        sns.distplot(values, bins=200, color='darkblue', kde_kws={'linewidth': 2})
        plt.ylabel('Frequency')
        plt.xlabel(xlabel)
    plt.tight_layout()

    caption = 'The distribution of expression levels for each cell with various metrics.'
    gn.add_current_figure_to_results(caption, zoom=1, dpi=75)

    gn.export(gn.assay_from_pandas(filtered), "Filtered Cells Assay", dynamic=False)

    elapsed = round(time.perf_counter() - started, 2)
    gn.add_result("* Finished cell filtering step in {} seconds*".format(elapsed), "markdown")

    gn.commit()
def main():
    """Map each group onto its best-overlapping reference label.

    For every group in the 'groups' import (cell -> group), the reference
    label from 'reflabels' (cell -> label) that shares the most cells with
    the group is chosen as the group's new name. Cells of a group that do not
    carry the chosen reference label are counted as mislabelled; when the
    'remove_cells' argument is truthy they are dropped from both the assay
    and the group mapping. The relabelled mapping and the assay are exported.

    Raises:
        ValueError: from ``max`` if 'reflabels' is empty.
        KeyError: from ``DataFrame.drop`` if a mislabelled cell is not a
            column of the assay.
    """
    tic = time.perf_counter()

    gn = Granatum()
    assay = gn.pandas_from_assay(gn.get_import('assay'))
    groups = gn.get_import('groups')
    reflabels = gn.get_import('reflabels')
    remove_cells = gn.get_arg('remove_cells')

    # Invert both mappings into label -> set of cells. Sets make the overlap
    # and difference computations below linear instead of quadratic.
    cells_by_group = {}
    for cell, group in groups.items():
        cells_by_group.setdefault(group, set()).add(cell)

    cells_by_ref = {}
    for cell, ref in reflabels.items():
        cells_by_ref.setdefault(ref, set()).add(cell)

    group_relabel = {}
    mislabelled_cells = set()
    for group, members in cells_by_group.items():
        overlap = {
            ref: len(ref_members & members)
            for ref, ref_members in cells_by_ref.items()
        }
        # Ties resolve to the reference label encountered first.
        best_ref = max(overlap, key=overlap.get)
        group_relabel[group] = best_ref
        mislabelled_cells |= members - cells_by_ref[best_ref]

    if remove_cells:
        gn.add_result(
            "Dropping {} mislabelled cells".format(len(mislabelled_cells)),
            "markdown")
        assay = assay.drop(list(mislabelled_cells), axis=1)
        groups = {
            cell: group
            for cell, group in groups.items()
            if cell not in mislabelled_cells
        }

    # Relabelling is applied whether or not cells were removed.
    groups = {cell: group_relabel[group] for cell, group in groups.items()}

    toc = time.perf_counter()
    time_passed = round(toc - tic, 2)

    gn.export_statically(gn.assay_from_pandas(assay), "Corresponded assay")
    gn.export_statically(groups, "Corresponded labels")

    # NOTE(review): the message below says "sample coloring", which looks
    # copy-pasted from another step; left unchanged pending confirmation.
    timing = "* Finished sample coloring step in {} seconds*".format(
        time_passed)
    gn.add_result(timing, "markdown")

    gn.commit()
def main():
    """Run GranatumDeepClustering on the imported assay and publish the clusters.

    The assay matrix is transposed before fitting (the assay stores one
    column per sample, and the sample IDs are passed alongside it). The
    'clusters' mapping returned by ``fit`` is exported both as a static
    result and as a raw two-column CSV, and a sample/cluster table is shown.
    """
    gn = Granatum()

    assay = gn.get_import('assay')
    sample_ids = assay.get('sampleIds')

    granatum_clustering = GranatumDeepClustering(
        selected_embedding=gn.get_arg('selectedEmbedding'),
        selected_clustering=gn.get_arg('selectedClustering'),
        n_components=gn.get_arg('nComponents'),
        n_clusters=gn.get_arg('nClusters'),
        find_best_number_of_cluster=gn.get_arg('findBestNumberOfCluster'),
    )
    fit_results = granatum_clustering.fit(
        matrix=np.transpose(np.array(assay.get('matrix'))),
        sample_ids=sample_ids,
    )

    fit_exp = fit_results.get('clusters')
    gn.export_statically(fit_exp, 'Cluster assignment')

    # One '"<sample>", <cluster>' row per sample.
    csv_rows = ['"' + str(k) + '"' + ", " + str(v) for k, v in fit_exp.items()]
    gn.export("\n".join(csv_rows), 'Cluster assignment.csv', kind='raw', meta=None, raw=True)

    # NOTE(review): 'orient' is 'split' but the rows are dicts, whereas other
    # tables built with 'split' pass lists of values -- confirm the renderer
    # accepts this shape.
    gn.add_result(
        {
            'orient': 'split',
            'columns': ['Sample ID', 'Cluster Assignment'],
            'data': [
                {'Sample ID': x, 'Cluster Assignment': y}
                for x, y in zip(sample_ids, fit_results.get('clusters_array'))
            ],
        },
        'table',
    )

    gn.commit()
def main():
    """Scan JAMMIT over a range of theta values and report FDR against alpha.

    Builds ``n_steps`` evenly spaced thetas between ``min_theta`` and
    ``max_theta``, runs ``JAMMIT.scan`` with FDR calculation enabled, plots
    FDR versus alpha and shows the formatted scan table.
    """
    gn = Granatum()

    expr = gn.pandas_from_assay(gn.get_import('assay'))
    n_steps = gn.get_arg('n_steps')
    min_theta = gn.get_arg('min_theta')
    max_theta = gn.get_arg('max_theta')

    theta_grid = np.linspace(min_theta, max_theta, n_steps)

    jammit = JAMMIT.from_dfs([expr])
    jammit.scan(
        thetas=theta_grid,
        calculate_fdr=True,
        n_perms=10,
        verbose=1,
        convergence_threshold=1e-9,
    )

    scan_table = jammit.format(columns=['theta', 'alpha', 'n_sigs', 'fdr'])
    for column in ('theta', 'alpha'):
        scan_table[column] = scan_table[column].round(3)

    plt.plot(scan_table['alpha'], scan_table['fdr'])
    plt.xlabel('alpha')
    plt.ylabel('FDR')
    gn.add_current_figure_to_results('FDR plotted against alpha', height=400)

    column_specs = [
        {'name': header, 'type': 'number', 'round': 3}
        for header in scan_table.columns
    ]
    gn.add_result(
        {
            'pageSize': n_steps,
            'orient': 'split',
            'columns': column_specs,
            'data': scan_table.values.tolist(),
        },
        data_type='table',
    )

    gn.commit()
def main():
    """Randomly down-sample the cells of the imported assay.

    ``num_cells_to_sample`` strictly between 0 and 1 is read as a fraction of
    the available cells; any other value is rounded and used as an absolute
    count. The count is clamped to the range [1, number of cells]. Sampling
    is without replacement and seeded through ``np.random.seed``.
    """
    gn = Granatum()

    adata = gn.ann_data_from_assay(gn.get_import("assay"))
    requested = gn.get_arg("num_cells_to_sample")
    random_seed = gn.get_arg("random_seed")

    np.random.seed(random_seed)

    num_cells_before, num_genes_before = adata.shape[0], adata.shape[1]

    if 0 < requested < 1:
        target = round(num_cells_before * requested)
    else:
        target = round(requested)
    # Never ask for more cells than exist, and always keep at least one.
    target = max(1, min(target, num_cells_before))

    # Sorted so the retained cells stay in their original order.
    picked = np.random.choice(num_cells_before, target, replace=False)
    adata = adata[np.sort(picked), :]

    report_lines = [
        "The assay before down-sampling has **{}** cells and {} genes.".format(
            num_cells_before, num_genes_before
        ),
        "",
        "The assay after down-sampling has **{}** cells and {} genes.".format(
            adata.shape[0], adata.shape[1]
        ),
    ]
    gn.add_result("\n".join(report_lines), type="markdown")

    gn.export(gn.assay_from_ann_data(adata), "Down-sampled Assay", dynamic=False)

    gn.commit()
def main():
    """Embed the cells of the imported assay in two dimensions with UMAP.

    The assay DataFrame is transposed so each row passed to UMAP is one
    column (sample) of the assay. The embedding is plotted and exported as a
    mapping from sample ID to its two coordinates.
    """
    started = time.perf_counter()

    gn = Granatum()
    expr = gn.pandas_from_assay(gn.get_import('assay'))
    n_neighbors = gn.get_arg('n_neighbors')
    min_dist = gn.get_arg('min_dist')
    metric = gn.get_arg('metric')
    random_seed = gn.get_arg('random_seed')

    reducer = umap.UMAP(
        n_neighbors=n_neighbors,
        min_dist=min_dist,
        metric=metric,
        random_state=random_seed,
    )
    embedding = reducer.fit_transform(expr.values.T)

    # Marker area shrinks as the number of cells grows, capped at 36.
    marker_size = min(5000 / expr.shape[0], 36.0)

    plt.figure()
    plt.scatter(embedding[:, 0], embedding[:, 1], s=marker_size)
    plt.xlabel('UMAP dim. 1')
    plt.ylabel('UMAP dim. 2')
    plt.tight_layout()
    gn.add_current_figure_to_results('UMAP plot: each dot represents a cell', dpi=75)

    coords = {}
    for row, sample_id in enumerate(expr.columns):
        coords[sample_id] = embedding[row, :].tolist()
    gn.export_statically(
        {'dimNames': ['UMAP dim. 1', 'UMAP dim. 2'], 'coords': coords},
        'UMAP coordinates',
    )

    elapsed = round(time.perf_counter() - started, 2)
    gn.add_result("* Finished UMAP step in {} seconds*".format(elapsed), "markdown")

    gn.commit()
def main():
    """Score a predicted sample labelling against the true one.

    Both imports are mappings from sample ID to label. They are aligned on
    their shared sample IDs before scoring, so differing key order or
    samples present in only one mapping no longer pair unrelated labels.
    The adjusted Rand score and adjusted mutual information score are
    reported as markdown.
    """
    gn = Granatum()

    sample_meta_true = gn.get_import("sample_meta_true")
    sample_meta_predicted = gn.get_import("sample_meta_predicted")

    # Wrapping each dict in a Series does not align them on its own: the
    # scoring functions compare the two inputs position by position. An
    # explicit inner alignment keeps only sample IDs present in both metas
    # and puts them in the same order.
    true_labels, predicted_labels = pd.Series(sample_meta_true).align(
        pd.Series(sample_meta_predicted), join="inner")

    rand_score = adjusted_rand_score(true_labels, predicted_labels)
    ami_score = adjusted_mutual_info_score(true_labels, predicted_labels)

    results_markdown = "\n".join([
        "Adjusted Rand score: **{}**".format(rand_score),
        "",
        "Adjusted mutual information score: **{}**".format(ami_score),
    ])

    gn.add_result(results_markdown, "markdown")
    gn.commit()
def main():
    """Filter genes by minimum expressing cells and by mean/dispersion bounds.

    Genes expressed in fewer than ``min_cells_expressed`` cells are removed
    first; the remainder are filtered with scanpy's Seurat-flavoured
    dispersion filter. The mean bounds are passed through ``math.log`` before
    being handed to scanpy.
    """
    gn = Granatum()

    adata = gn.ann_data_from_assay(gn.get_import("assay"))
    min_cells_expressed = gn.get_arg("min_cells_expressed")
    min_mean = gn.get_arg("min_mean")
    max_mean = gn.get_arg("max_mean")
    min_disp = gn.get_arg("min_disp")
    max_disp = gn.get_arg("max_disp")

    num_genes_before = adata.shape[1]

    sc.pp.filter_genes(adata, min_cells=min_cells_expressed)

    # NOTE(review): math.log raises ValueError for a bound of 0 -- confirm
    # the UI prevents non-positive means.
    dispersion_bounds = {
        'min_mean': math.log(min_mean),
        'max_mean': math.log(max_mean),
        'min_disp': min_disp,
        'max_disp': max_disp,
    }
    filter_result = sc.pp.filter_genes_dispersion(
        adata.X, flavor='seurat', **dispersion_bounds
    )
    adata = adata[:, filter_result.gene_subset]

    sc.pl.filter_genes_dispersion(filter_result)
    gn.add_current_figure_to_results(
        "Each dot represent a gene. The gray dots are the removed genes. The x-axis is log-transformed.",
        zoom=3,
        dpi=50,
        height=400,
    )

    summary_lines = [
        "Number of genes before filtering: **{}**".format(num_genes_before),
        "",
        "Number of genes after filtering: **{}**".format(adata.shape[1]),
    ]
    gn.add_result("\n".join(summary_lines), type="markdown")

    gn.export(gn.assay_from_ann_data(adata), "Filtered Assay", dynamic=False)

    gn.commit()
def main():
    """Scale the imported assay so that its Frobenius norm becomes one.

    Every value is divided by the Frobenius norm of the whole matrix; the
    norms before and after are reported and the scaled assay is exported.
    """
    gn = Granatum()

    original = gn.pandas_from_assay(gn.get_import("assay"))

    norm_before = np.linalg.norm(original.values)
    normalized = original / norm_before
    norm_after = np.linalg.norm(normalized.values)

    # NOTE(review): line breaks inside this message could not be recovered
    # from the flattened source; the wording is reproduced on one line.
    summary = f"""\
The original assay had Frobenius norm of {norm_before}, after normalization its Frobenius norm is now {norm_after}"""
    gn.add_result(summary, 'markdown')

    gn.export(gn.assay_from_pandas(normalized), "Frobenius normalized assay", dynamic=False)

    gn.commit()
def main():
    """Remove the user-selected outlier cells from the imported assay.

    Outlier IDs that are not present in the assay are ignored (the drop is
    done with ``errors='ignore'``). The summary reports the number of cells
    that were actually removed, so it stays consistent with the before and
    after cell counts even when some requested IDs were absent.
    """
    gn = Granatum()

    adata = gn.ann_data_from_assay(gn.get_import('assay'))
    outliers = gn.get_arg('outliers')

    num_cells_before = adata.shape[0]

    kept_cell_ids = adata.obs_names.drop(outliers, errors='ignore').values
    adata = adata[kept_cell_ids, :]

    # Count what was really dropped rather than len(outliers): the requested
    # list may contain IDs that are not in this assay.
    num_removed = num_cells_before - adata.shape[0]

    gn.export_statically(gn.assay_from_ann_data(adata), 'Outlier removed assay')
    gn.add_result(
        'You removed {} outliers from {} cells, the result assay has {} cells (and {} genes).'.format(
            num_removed, num_cells_before, adata.shape[0], adata.shape[1]
        ),
        type='markdown'
    )

    gn.commit()
def main():
    """Drop genes that are expressed in too few cells.

    A gene (row) counts as expressed in a cell (column) when its value is
    greater than ``epsilon``. Genes expressed in fewer than
    ``min_cells_expressed`` cells are removed. The per-gene statistics are
    always attached as ``gene_selection.csv``; a table of removed genes is
    shown when at least one gene was removed, otherwise a short note is.
    """
    gn = Granatum()

    df = gn.pandas_from_assay(gn.get_import("assay"))
    epsilon = gn.get_arg('epsilon')
    min_cells_expressed = gn.get_arg('min_cells_expressed')

    values = df.values

    # Row-wise statistics computed with array operations instead of one
    # Python-level loop per gene.
    filter_df = pd.DataFrame({'gene': df.index})
    filter_df['sum_expr'] = values.sum(axis=1)
    filter_df['avg_expr'] = filter_df['sum_expr'] / df.shape[1]
    # Column name kept for compatibility with the exported CSV; the value is
    # the number of cells in which the gene exceeds epsilon.
    filter_df['num_expressed_genes'] = (values > epsilon).sum(axis=1)
    filter_df['removed'] = filter_df['num_expressed_genes'] < min_cells_expressed

    removed_mask = filter_df['removed'].values
    new_df = df.loc[np.logical_not(removed_mask), :]

    gn.add_result(
        "\n".join([
            "Number of genes before filtering: **{}**".format(df.shape[0]),
            "",
            "Number of genes after filtering: **{}**".format(new_df.shape[0]),
        ]),
        type="markdown",
    )

    filter_df_deleted = filter_df.loc[removed_mask, :].drop('removed', axis=1)

    # Branch on whether anything was removed. The previous check looked at
    # the total number of genes, so the "no genes removed" note could only
    # appear for an empty assay.
    if filter_df_deleted.shape[0] > 0:
        gn.add_result(
            {
                'title': f"Removed genes ({filter_df_deleted.shape[0]})",
                'orient': 'split',
                'columns': filter_df_deleted.columns.values.tolist(),
                'data': filter_df_deleted.values.tolist(),
            },
            data_type='table',
        )
    else:
        gn.add_result(
            f"No genes were removed. All {df.shape[0]} genes were kept. "
            f"See attachment **gene_selection.csv** for detail.",
            'markdown',
        )

    gn.export(filter_df.to_csv(index=False), 'gene_selection.csv', kind='raw', meta=None, raw=True)
    gn.export(gn.assay_from_pandas(new_df), "Filtered Assay", dynamic=False)

    gn.commit()
def main():
    """Apply a generalized-log (glog) transformation to the imported assay.

    Genes are ranked by mean expression. Among the ``n_top`` highest-mean
    genes the one with the largest standard deviation serves as the
    "housekeeping" gene; among the ``n_bottom`` lowest-mean genes the one
    with the smallest standard deviation serves as the negative control. The
    negative control's mean or median (``which_mid``) is the background
    offset, and the variance ratio of the two genes gives the constant ``c``
    in ``y = log((z + sqrt(z^2 + c)) / 2)`` with ``z = x - offset``.

    Raises:
        ValueError: if ``which_mid`` is neither 'mean' nor 'median'.
    """
    gn = Granatum()

    assay = gn.get_import('assay')
    # The builtin float replaces the np.float alias, which no longer exists
    # in current NumPy releases.
    x = np.array(assay.get('matrix')).astype(float)

    log_base = gn.get_arg('log_base')
    n_top = gn.get_arg('n_top')
    n_bottom = gn.get_arg('n_bottom')
    which_mid = gn.get_arg('which_mid')

    gene_df = pd.DataFrame(
        {
            'row_num': range(x.shape[0]),
            'gene_id': assay.get('geneIds'),
            'exp_mean': np.mean(x, axis=1),
            'exp_std': np.std(x, axis=1),
        }
    )
    gene_df = gene_df.sort_values('exp_mean', ascending=False)

    top_gene_row = gene_df.head(n_top).sort_values('exp_std', ascending=False).iloc[0]
    bottom_gene_row = gene_df.tail(n_bottom).sort_values('exp_std').iloc[0]

    # Clip so the housekeeping gene has no exact zeros before the log below.
    hk_gene = np.clip(x[int(top_gene_row['row_num']), :], a_min=0.00001, a_max=None)
    neg_gene = x[int(bottom_gene_row['row_num']), :]

    if which_mid == 'mean':
        alphabk = np.mean(neg_gene[:])
    elif which_mid == 'median':
        alphabk = np.median(neg_gene[:])
    else:
        raise ValueError("which_mid must be 'mean' or 'median', got {!r}".format(which_mid))

    loghkdatabk = np.log(hk_gene - alphabk) / np.log(log_base)

    # Drop NaN values (log of a negative background-corrected value).
    loghkdatabk = loghkdatabk[~np.isnan(loghkdatabk)]

    c = (np.std(neg_gene[:], ddof=1) / np.std(loghkdatabk, ddof=1))**2

    xbk = x - alphabk

    transformed_matrix = np.log((xbk + np.sqrt(xbk**2 + c)) / 2) / np.log(log_base)

    gn.add_result(
        '\n'.join(
            [
                f"Selected benchmarking genes:",
                f" * housekeeping gene: **{top_gene_row['gene_id']}** "
                f"(mean: {top_gene_row['exp_mean']}, std: {top_gene_row['exp_std']}) ",
                f" * negative control gene: **{bottom_gene_row['gene_id']}**"
                f"(mean: {bottom_gene_row['exp_mean']}, std: {bottom_gene_row['exp_std']})",
                f"",
                f"Final formula is `y = log{log_base}((z + sqrt(z^2 + c))/2)`, where `z = x - {alphabk}` and `c = {c}`.",
            ]
        ),
        'markdown'
    )

    # Only values above the 5th percentile are plotted.
    non_zero_values_before = x.flatten()
    non_zero_values_before = non_zero_values_before[
        non_zero_values_before > np.percentile(non_zero_values_before, 5)
    ]

    non_zero_values_after = transformed_matrix.flatten()
    non_zero_values_after = non_zero_values_after[
        non_zero_values_after > np.percentile(non_zero_values_after, 5)
    ]

    plt.figure()

    plt.subplot(2, 1, 1)
    plt.title('Before glog transformation')
    plt.hist(non_zero_values_before, bins=100)
    plt.ylabel('Frequency')
    plt.xlabel('Expression level')

    plt.subplot(2, 1, 2)
    plt.title('After glog transformation')
    plt.hist(non_zero_values_after, bins=100)
    plt.ylabel('Frequency')
    plt.xlabel('Expression level')

    plt.tight_layout()

    # NOTE(review): the caption mentions a 95th-percentile cut-off, but no
    # upper filter is applied above -- confirm which one is intended.
    caption = (
        'The distribution of expression level before and after glog transformation. Only the values greater '
        'than the 5 percentile (usually zero in single-cell data) and lower than 95 percentile are considered.'
    )
    gn.add_current_figure_to_results(caption, zoom=2, dpi=50)

    assay['matrix'] = transformed_matrix.tolist()
    gn.export_statically(assay, 'GLog transformed assay')

    gn.commit()
def main():
    """Run a Fisher-test gene-set enrichment on thresholded gene scores.

    Gene IDs from the 'gene_scores' import are converted to symbols when
    needed, mapped to human homologs for mouse data, and kept when their
    score (or absolute score, if ``use_abs``) reaches ``threshold``. The
    selected genes are tested against the chosen gene-set group and the
    result table is shown and attached as CSV.

    Raises:
        ValueError: if ``species`` is neither 'human' nor 'mouse', or if
            ``background`` is not one of the three supported options.
    """
    gn = Granatum()

    gene_scores_dict = gn.get_import("gene_scores")
    species = gn.get_arg("species")
    gset_group_id = gn.get_arg("gset_group_id")
    threshold = gn.get_arg("threshold")
    use_abs = gn.get_arg("use_abs")
    background = gn.get_arg("background")

    gene_ids = list(gene_scores_dict.keys())
    scores = np.array(list(gene_scores_dict.values()))

    gene_id_type = guess_gene_id_type(gene_ids[:5])
    if gene_id_type != 'symbol':
        gene_ids = convert_gene_ids(gene_ids, gene_id_type, 'symbol', species)

    if species == "mouse":
        # Per the original author's remark: gene_ids is NaN after this call.
        gene_ids = zgsea.to_human_homolog(gene_ids, "mouse")
    elif species != "human":
        raise ValueError()

    ranking = np.abs(scores) if use_abs else scores
    input_list = np.array(gene_ids)[ranking >= threshold]
    print(input_list)

    # NOTE(review): line breaks inside this message could not be recovered
    # from the flattened source; the wording is reproduced on one line.
    gn.add_result(
        f"""\
Number of genes after thresholding: {len(input_list)} (out of original {len(gene_ids)}). Please see the attachment `list_of_genes.csv` for the list of genes considered in this enrichment analysis.""",
        'markdown',
    )

    gn.export(pd.Series(input_list).to_csv(index=False), 'list_of_genes.csv', kind='raw', meta=None, raw=True)

    if background == 'all':
        background_list = get_all_genes('human')
    elif background == 'from_gene_sets':
        background_list = None
    elif background == 'from_input':
        background_list = gene_ids
    else:
        raise ValueError()

    result_df = zgsea.simple_fisher(input_list, gset_group_id, background_list=background_list)
    result_df = result_df.sort_values('fdr')

    # Internal column name -> heading shown to the user, in display order.
    column_titles = {
        'gene_set_name': 'Gene set',
        'size': 'Gene set size',
        'p_val': 'p-value',
        'fdr': 'FDR',
        'odds_ratio': 'Odds ratio',
        'n_overlaps': 'Number of overlapping genes',
        'overlapping_genes': 'Overlapping genes',
    }
    result_df = result_df[list(column_titles.keys())]
    result_df.columns = list(column_titles.values())

    gn.add_pandas_df(result_df)
    gn.export(result_df.to_csv(index=False), 'enrichment_results.csv', kind='raw', meta=None, raw=True)

    gn.commit()
def main():
    """Compute differential-expression statistics per gene across clusters.

    Genes whose expression vector has a Euclidean norm below 0.1 are dropped
    first. Every remaining gene is passed to ``compref`` in parallel,
    together with the comparison names ("A vs B" for each ordered pair of
    clusters, then "A vs rest" for each cluster), the cluster -> cells map
    and the cluster -> all-other-cells map. The per-gene results are
    concatenated and exported as an assay and as CSV.

    Raises:
        ValueError: if a cell listed in 'groups' is not a column of the assay.
    """
    tic = time.perf_counter()

    gn = Granatum()
    assay = gn.pandas_from_assay(gn.get_import('assay'))

    # groups is {"cell": "cluster"}
    groups = gn.get_import('groups')

    certainty = gn.get_arg('certainty')
    alpha = 1 - certainty / 100.0
    min_zscore = st.norm.ppf(certainty / 100.0)

    min_dist = 0.1

    # Filter genes before starting: a near-zero expression vector cannot
    # yield a useful statistic.
    norms_df = assay.apply(np.linalg.norm, axis=1)
    assay = assay.loc[norms_df >= min_dist, :]

    all_cells = list(assay.columns)
    unknown_cells = set(groups) - set(all_cells)
    if unknown_cells:
        raise ValueError(
            "{} cell(s) in 'groups' are not present in the assay".format(len(unknown_cells)))

    # inv_map is {"cluster": ["cell", ...]}
    inv_map = {}
    for cell, cluster in groups.items():
        inv_map.setdefault(cluster, []).append(cell)

    # inv_map_rest is {"cluster": [every assay cell not in that cluster]},
    # built with a set lookup per cluster instead of one list.remove per cell.
    inv_map_rest = {}
    for cluster, members in inv_map.items():
        member_set = set(members)
        inv_map_rest[cluster] = [cell for cell in all_cells if cell not in member_set]

    print("Completed setup", flush=True)

    cols = list(inv_map.keys())
    colnames = [
        "{} vs {}".format(coli, colj)
        for coli in cols
        for colj in cols
        if coli != colj
    ]
    colnames.extend("{} vs rest".format(coli) for coli in cols)

    # Instead of scoring into a dataframe, each gene is analysed statistically.
    # Dict (gene) of dict (cluster) of dict (statistics):
    #   { "gene_name" : { "cluster_name" : { statistics data } }}
    # The export would be the percentage more/less expressed in the "on" state,
    # e.g. gene "XIST" expresses at least 20% more in cluster 1 vs cluster 4
    # with 95% certainty.
    total_genes = len(assay.index)
    print("Executing parallel for {} genes".format(total_genes), flush=True)

    n_jobs = math.floor(multiprocessing.cpu_count() * 2 * 9 / 10)
    results = Parallel(n_jobs=n_jobs)(
        delayed(compref)(gene, assay.loc[gene, :], colnames, inv_map,
                         inv_map_rest, alpha, min_dist, min_zscore)
        for gene in tqdm(list(assay.index)))
    result = pd.concat(results, axis=0)

    gn.export_statically(gn.assay_from_pandas(result.T),
                         'Differential expression sets')
    gn.export(result.to_csv(), 'differential_gene_sets.csv', kind='raw', meta=None, raw=True)

    toc = time.perf_counter()
    time_passed = round(toc - tic, 2)

    timing = "* Finished differential expression sets step in {} seconds*".format(
        time_passed)
    gn.add_result(timing, "markdown")

    gn.commit()
def main():
    """Run JAMMIT for a single alpha and report the signal genes and samples.

    The ``u`` vector (one loading per gene) and ``v`` vector (one score per
    sample) returned by ``run_for_one_alpha`` are exported as metadata.
    Entries whose absolute value exceeds ``EPSILON`` are listed as signal
    genes / signal samples, attached as CSV, and used to subset the assay,
    which is exported and drawn as a cluster map. ``u`` and ``v`` are also
    plotted against their index.
    """
    gn = Granatum()

    expr = gn.pandas_from_assay(gn.get_import('assay'))
    alpha = gn.get_arg('alpha')

    jammit = JAMMIT.from_dfs([expr])
    res = jammit.run_for_one_alpha(
        alpha,
        verbose=1,
        convergence_threshold=0.000000001,
    )
    u = res['u']
    v = res['v']

    gn.export(dict(zip(expr.index, u)), 'Genes loadings', kind='geneMeta')
    gn.export(dict(zip(expr.columns, v)), 'Sample scores', kind='sampleMeta')

    def signal_frame(ids, values, value_name):
        """Return the rows with |value| > EPSILON, largest magnitude first."""
        abs_name = 'abs_' + value_name
        frame = pd.DataFrame({'id_': ids, abs_name: abs(values), value_name: values})
        frame = frame[['id_', abs_name, value_name]]
        frame = frame.loc[frame[value_name].abs() > EPSILON]
        return frame.sort_values(abs_name, ascending=False)

    def as_table(title, frame):
        """Build the table payload shown in the results."""
        return {
            'title': title,
            'orient': 'split',
            'columns': frame.columns.values.tolist(),
            'data': frame.values.tolist(),
        }

    gene_df = signal_frame(expr.index, u, 'loading')
    gn.add_result(as_table(f"Signal genes ({len(gene_df)})", gene_df), data_type='table')
    gn.export(gene_df.to_csv(index=False), 'signal_genes.csv', kind='raw', meta=None, raw=True)

    sample_df = signal_frame(expr.columns, v, 'score')
    gn.add_result(as_table(f"Signal samples ({len(sample_df)})", sample_df), data_type='table')
    gn.export(sample_df.to_csv(index=False), 'signal_samples.csv', kind='raw', meta=None, raw=True)

    subset_df = expr.loc[gene_df['id_'], sample_df['id_']]
    gn.export(gn.assay_from_pandas(subset_df), 'Assay with only signal genes and samples', kind='assay')

    sns.clustermap(subset_df, cmap='RdBu')
    gn.add_current_figure_to_results(
        description='Cluster map of the signal genes and signal samples',
        zoom=2,
        width=750,
        height=850,
        dpi=50,
    )
    plt.close()

    gene_positions = range(len(u))
    plt.figure()
    plt.scatter(gene_positions, u, s=2, c='red')
    plt.xlabel('index')
    plt.ylabel('value in u')
    gn.add_current_figure_to_results(
        description='The *u* vector (loadings for genes) plotted as a scatter plot.',
        zoom=2,
        width=750,
        height=450,
        dpi=50,
    )
    plt.close()

    sample_positions = range(len(v))
    plt.figure()
    plt.plot(sample_positions, v)
    plt.scatter(sample_positions, v, s=6, c='red')
    plt.xlabel('index')
    plt.ylabel('value in v')
    gn.add_current_figure_to_results(
        description='The *v* vector (scores for samples) plotted as a line plot.',
        zoom=2,
        width=750,
        height=450,
        dpi=50,
    )
    plt.close()

    gn.commit()
def main():
    """Draw a 2-D scatter plot of the samples coloured by a metadata value.

    'viz_data' supplies the coordinates and axis names, 'value' the per-sample
    value. With ``coloring_type == "categorical"`` every category gets a
    colour from the 'gist_rainbow' colormap and, for categories with more than
    three points, a scaled convex-hull outline and a text label. With
    ``"continuous"`` the points are coloured on the 'Reds' colormap with a
    colour bar. Any exception raised while plotting is turned into a
    placeholder figure plus the exception text.
    """
    tic = time.perf_counter()

    gn = Granatum()
    sample_coords = gn.get_import("viz_data")
    value = gn.get_import("value")
    coloring_type = gn.get_arg("coloring_type")
    bounding_stdev = gn.get_arg("bounding_stdev")
    label_location = gn.get_arg("label_location")
    label_transform = gn.get_arg("label_transform")
    labelXaxis = gn.get_arg("labelXaxis")
    labelYaxis = gn.get_arg("labelYaxis")
    sigfigs = gn.get_arg("sigfigs")
    numticks = gn.get_arg("numticks")
    font = gn.get_arg('font')

    coords = sample_coords.get("coords")
    dim_names = sample_coords.get("dimNames")

    seed = gn.get_arg('random_seed')
    random.seed(seed)
    np.random.seed(seed)

    df = pd.DataFrame(
        {
            "x": [a[0] for a in coords.values()],
            "y": [a[1] for a in coords.values()],
            "value": pd.Series(value)
        },
        index=coords.keys())

    target_dpi = 300
    target_width = 7.5  # inches
    target_height = 6.5  # inches
    font_size_in_in = font / 72.0  # inches
    font_size_in_px = font_size_in_in * target_dpi

    try:
        if coloring_type == "categorical":
            uniq = df["value"].unique()
            uniq.sort(kind="stable")
            num = uniq.shape[0]
            COLORS2 = plt.get_cmap('gist_rainbow')
            carr = [0] * df.shape[0]
            listcats = list(df["value"])
            miny = min(list(df["y"]))
            maxy = max(list(df["y"]))
            scaley = (maxy - miny) / (target_height * target_dpi)
            print("Scaley = {}".format(scaley))
            colorhash = {}
            # randrange needs a true integer: np.ceil returns a NumPy float,
            # which newer Python versions reject with TypeError. That error
            # used to be swallowed below and reported as a metadata mismatch.
            colorstep = int(np.ceil(256.0 / num))
            coffset = randrange(colorstep)
            # Random permutation so neighbouring categories do not get
            # neighbouring colours.
            grouptocolor = np.random.choice(np.arange(num), num, replace=False)

            for i, cat in enumerate(uniq):
                dff = df[df["value"] == cat]
                xs = list(dff["x"])
                ys = list(dff["y"])

                colorindex = (coffset + grouptocolor[i] * colorstep) % 256
                colorhash[cat] = colorindex
                craw = COLORS2((colorindex + 0.0) / 256.0)
                clr = [craw[0], craw[1], craw[2], 0.2]
                whitetransparent = [1.0, 1.0, 1.0, 0.5]
                coloropaque = [craw[0], craw[1], craw[2], 1.0]

                if len(xs) > 3:
                    pts = list(zip(xs, ys))
                    cent = np.mean(pts, axis=0)
                    # Distance of every point from the category centroid.
                    lengs = list(
                        map(
                            lambda p: math.sqrt(
                                (p[0] - cent[0]) * (p[0] - cent[0]) +
                                (p[1] - cent[1]) * (p[1] - cent[1])), pts))
                    avgleng = st.mean(lengs)
                    stdleng = st.stdev(lengs) * bounding_stdev

                    # Drop points lying further than bounding_stdev standard
                    # deviations beyond the mean distance before hulling.
                    rpts = []
                    if (stdleng > 0.0):
                        for j, ln in enumerate(lengs):
                            if (ln - avgleng < stdleng):
                                rpts.append(pts[j])
                        pts = rpts

                    cent = np.mean(pts, axis=0)
                    hull = ConvexHull(pts)

                    # Collect the hull edge endpoints, order them by angle
                    # around the centroid, keep every other one and close the
                    # ring.
                    ptslist = []
                    for pt in hull.simplices:
                        ptslist.append(pts[pt[0]])
                        ptslist.append(pts[pt[1]])
                    ptslist.sort(key=lambda p: np.arctan2(
                        p[1] - cent[1], p[0] - cent[0]))
                    ptslist = ptslist[0::2]
                    ptslist.insert(len(ptslist), ptslist[0])

                    # Anchor point for the label.
                    lowestpt = ptslist[0]
                    if label_location == 'bottom':
                        for pt in ptslist:
                            if (pt[1] < lowestpt[1]):
                                lowestpt = pt
                    else:
                        lowestpt = ptslist[randrange(len(ptslist))]

                    if (bounding_stdev >= 0.0):
                        poly = Polygon(1.1 * (np.array(ptslist) - cent) + cent,
                                       facecolor=clr)
                        poly.set_capstyle('round')
                        plt.gca().add_patch(poly)
                        poly.set_color(clr)

                    # NOTE(review): the source was flattened, so it is not
                    # certain whether the label was drawn only when
                    # bounding_stdev >= 0 -- confirm the intended nesting.
                    label_text = cat
                    if label_transform == "numbers":
                        label_text = re.sub("[^0-9]", "", cat)
                    txt = plt.text(lowestpt[0],
                                   lowestpt[1] - scaley * font_size_in_px * 1.2,
                                   label_text,
                                   fontsize=font,
                                   fontname="Arial",
                                   ha="center",
                                   va="center",
                                   color="black",
                                   bbox=dict(boxstyle="round",
                                             fc=whitetransparent,
                                             ec=coloropaque))

                for j, x in enumerate(listcats):
                    if x == cat:
                        carr[j] = colorhash[cat]

            plt.scatter(x=df["x"],
                        y=df["y"],
                        s=5000 / df.shape[0],
                        c=carr,
                        cmap=COLORS2)
            lgd = plt.legend(markerscale=6,
                             loc='upper center',
                             bbox_to_anchor=(0.5, -0.05),
                             ncol=5)
        elif coloring_type == "continuous":
            plt.scatter(x=df["x"],
                        y=df["y"],
                        s=5000 / df.shape[0],
                        c=df["value"],
                        cmap="Reds")
            plt.colorbar()

        xmin, xmax = plt.gca().get_xlim()
        ymin, ymax = plt.gca().get_ylim()

        xtickArray = resetArray(xmin, xmax, numticks, sigfigs)
        ytickArray = resetArray(ymin, ymax, numticks, sigfigs)

        plt.xlim(xtickArray[0], xtickArray[-1])
        plt.ylim(ytickArray[0], ytickArray[-1])
        plt.xticks(xtickArray, fontsize=font, fontname="Arial")
        plt.yticks(ytickArray, fontsize=font, fontname="Arial")

        # An empty custom label falls back to the dimension name.
        if labelXaxis == "":
            plt.xlabel(dim_names[0], fontsize=font, fontname="Arial")
        else:
            plt.xlabel(labelXaxis, fontsize=font, fontname="Arial")

        if labelYaxis == "":
            plt.ylabel(dim_names[1], fontsize=font, fontname="Arial")
        else:
            plt.ylabel(labelYaxis, fontsize=font, fontname="Arial")

        gn.add_current_figure_to_results(
            "Scatter-plot",
            dpi=target_dpi,
            width=target_width * target_dpi,
            height=target_height * target_dpi,
            savefig_kwargs={'bbox_inches': 'tight'})

        toc = time.perf_counter()
        time_passed = round(toc - tic, 2)

        timing = "* Finished sample coloring step in {} seconds*".format(
            time_passed)
        gn.add_result(timing, "markdown")

        gn.commit()

    except Exception as e:
        # Best-effort fallback: show a hint instead of failing the step, and
        # also report the real exception so unrelated failures are not
        # mistaken for a colouring-type mismatch.
        plt.figure()
        plt.text(
            0.05, 0.7,
            'Values used as colors and type of sample metadata are incompatible with each other'
        )

        if coloring_type == 'categorical':
            new_coloring_type = 'continuous'
        else:
            new_coloring_type = 'categorical'

        plt.text(
            0.05, 0.5, 'Retry the step with ' + new_coloring_type +
            ' instead of ' + coloring_type)
        plt.axis('off')
        gn.add_current_figure_to_results('Scatter-plot')
        gn.add_result('Error = {}'.format(e), "markdown")

        gn.commit()
def main():
    """Rank marker genes for every group against the rest with scanpy.

    Builds an AnnData object from the assay (samples as observations), labels
    each sample with its group from 'groupVec', and runs
    ``sc.tl.rank_genes_groups``. For every group the gene scores are exported
    as gene metadata and as a raw CSV attachment. If ranking or exporting
    raises, a placeholder figure and the exception text are reported instead.
    """
    started = time.perf_counter()

    gn = Granatum()
    assay = gn.get_import('assay')
    sample_ids = assay.get('sampleIds')
    group_dict = gn.get_import('groupVec')

    group_vec = pd.Categorical([group_dict.get(sample_id) for sample_id in sample_ids])
    num_groups = len(group_vec.categories)
    # One 400px band of figure height per started block of seven groups.
    figheight = 400 * ((num_groups - 1) // 7 + 1)

    adata = sc.AnnData(np.array(assay.get('matrix')).transpose())
    adata.var_names = assay.get('geneIds')
    adata.obs_names = assay.get('sampleIds')
    adata.obs['groupVec'] = group_vec

    sc.pp.neighbors(adata, n_neighbors=20, use_rep='X', method='gauss')

    try:
        sc.tl.rank_genes_groups(adata, 'groupVec', n_genes=100000)
        sc.pl.rank_genes_groups(adata, n_genes=20)
        gn.add_current_figure_to_results('One-vs-rest marker genes', dpi=75, height=figheight)

        gn._pickle(adata, 'adata')

        ranking = adata.uns['rank_genes_groups']
        for group in ranking['names'].dtype.names:
            gene_names = [str(record[group]) for record in ranking['names']]
            gene_scores = [float(record[group]) for record in ranking['scores']]
            score_by_gene = dict(zip(gene_names, gene_scores))

            gn.export(score_by_gene, 'Marker score ({} vs. rest)'.format(group), kind='geneMeta')

            csv_rows = []
            for gene, score in score_by_gene.items():
                csv_rows.append('"' + str(gene) + '"' + ", " + str(score))
            gn.export("\n".join(csv_rows),
                      'Marker score {} vs rest.csv'.format(group),
                      kind='raw',
                      meta=None,
                      raw=True)

        elapsed = round(time.perf_counter() - started, 2)
        gn.add_result(
            "* Finished marker gene identification step in {} seconds*".format(elapsed),
            "markdown")

        gn.commit()

    except Exception as e:
        plt.figure()
        plt.text(0.01, 0.5, 'Incompatible group vector due to insufficent cells')
        plt.text(0.01, 0.3, 'Please retry the step with a different group vector')
        plt.axis('off')
        gn.add_current_figure_to_results('One-vs-rest marker genes')
        gn.add_result('Error = {}'.format(e), "markdown")

        gn.commit()
def main():
    """Load an uploaded assay (and optional sample metadata) into Granatum.

    The assay table is read with genes as rows (first column is the index)
    and samples as columns; csv, tsv, Excel, zip and gz uploads are handled,
    with the format taken from the file suffix when it is given as "und".
    Gene IDs are optionally converted, the assay is exported with a short
    preview, and every column of the sample-metadata file is exported as one
    sample-meta entry keyed by the assay's sample IDs (paired by position).
    """
    # Local import so the archive tools can be run without a shell.
    import subprocess

    def extract_to_csv(command, archive_path):
        """Run ``command + [archive_path]`` and capture stdout in '<archive_path>.csv'."""
        csv_path = "{}.csv".format(archive_path)
        with open(csv_path, "wb") as csv_file:
            # Argument list instead of a shell string: the uploaded file name
            # is never interpreted by a shell. As before, the exit status is
            # not checked; a failed extraction surfaces when the CSV is read.
            subprocess.run(command + [archive_path], stdout=csv_file)
        return csv_path

    tic = time.perf_counter()

    gn = Granatum()

    assay_file = gn.get_uploaded_file_path("assayFile")
    sample_meta_file = gn.get_uploaded_file_path("sampleMetaFile")
    file_format = gn.get_arg("fileFormat")
    file_format_meta = gn.get_arg("fileFormatMeta")
    species = gn.get_arg("species")

    # Share the email address among other gboxes using a pickle dump
    email_address = gn.get_arg("email_address")
    shared = {"email_address": email_address}
    with open(gn.swd + "/shared.pkl", "wb") as fp:
        pickle.dump(shared, fp)

    if file_format == "und":
        file_format = Path(assay_file).suffix[1:]

    if file_format == "csv":
        tb = pd.read_csv(assay_file, sep=",", index_col=0, engine='c', memory_map=True)
    elif file_format == "tsv":
        tb = pd.read_csv(assay_file, sep="\t", index_col=0, engine='c', memory_map=True)
    elif file_format.startswith("xls"):
        tb = pd.read_excel(assay_file, index_col=0)
    elif file_format == "zip":
        # Strip macOS resource entries so only the data file is extracted.
        subprocess.run(["zip", "-d", assay_file, "__MACOSX/*"])
        tb = pd.read_csv(extract_to_csv(["unzip", "-p"], assay_file),
                         sep=",", index_col=0, engine='c', memory_map=True)
    elif file_format == "gz":
        tb = pd.read_csv(extract_to_csv(["gunzip", "-c"], assay_file),
                         sep=",", index_col=0, engine='c', memory_map=True)
    else:
        gn.error("Unknown file format: {}".format(file_format))

    sample_ids = tb.columns.values.tolist()
    gene_ids = tb.index.values.tolist()

    gene_id_type = guess_gene_id_type(gene_ids[:5])

    whether_convert_id = gn.get_arg("whether_convert_id")

    if whether_convert_id:
        to_id_type = gn.get_arg("to_id_type")
        add_info = gn.get_arg("add_info")

        # if there are duplicated ids, pick the first row
        # TODO: Need to have a more sophisticated handling of duplicated ids
        gene_ids, new_meta = convert_gene_ids(gene_ids, gene_id_type, to_id_type, species,
                                              return_new_meta=True)

        # TODO: remove NaN rows
        # TODO: combine duplicated rows

        if add_info:
            # DataFrame.items() replaces iteritems(), which no longer exists
            # in pandas 2.
            for col_name, col in new_meta.items():
                gn.export(col.to_dict(), col_name, "geneMeta")

    assay_export_name = "[A]{}".format(basename(assay_file))

    exported_assay = {
        "matrix": tb.values.tolist(),
        "sampleIds": sample_ids,
        "geneIds": gene_ids,
    }

    gn.export(exported_assay, assay_export_name, "assay")

    entry_preview = '\n'.join(
        [', '.join(x) for x in tb.values[:10, :10].astype(str).tolist()])

    # NOTE(review): line breaks inside this message were reconstructed from a
    # flattened source; the code fence is placed on its own lines.
    gn.add_result(
        f"""\
The assay has **{tb.shape[0]}** genes (with inferred ID type: {biomart_col_dict[gene_id_type]}) and **{tb.shape[1]}** samples.

The first few rows and columns:

```
{entry_preview}
```
""",
        "markdown",
    )

    meta_rows = []
    if sample_meta_file is not None:
        if file_format_meta == "und":
            file_format_meta = Path(sample_meta_file).suffix[1:]

        if file_format_meta == "csv":
            sample_meta_tb = pd.read_csv(sample_meta_file)
        elif file_format_meta == "tsv":
            sample_meta_tb = pd.read_csv(sample_meta_file, sep="\t")
        elif file_format_meta.startswith("xls"):
            sample_meta_tb = pd.read_excel(sample_meta_file)
        elif file_format_meta == "zip":
            sample_meta_tb = pd.read_csv(extract_to_csv(["unzip", "-p"], sample_meta_file))
        elif file_format_meta == "gz":
            sample_meta_tb = pd.read_csv(extract_to_csv(["gunzip", "-c"], sample_meta_file))
        else:
            # Report the metadata format, not the assay format.
            gn.error("Unknown file format: {}".format(file_format_meta))

        for meta_name in sample_meta_tb.columns:
            meta_output_name = "[M]{}".format(meta_name)

            # Values are paired with the assay's sample IDs by position.
            sample_meta_dict = dict(
                zip(sample_ids, sample_meta_tb[meta_name].values.tolist()))

            gn.export(sample_meta_dict, meta_output_name, "sampleMeta")

            num_sample_values = 5
            sample_values = ", ".join(sample_meta_tb[meta_name].astype(
                str).values[0:num_sample_values].tolist())
            num_omitted_values = len(
                sample_meta_tb[meta_name]) - num_sample_values

            if num_omitted_values > 0:
                etc = ", ... and {} more entries".format(num_omitted_values)
            else:
                etc = ""

            # Collected for a per-meta summary that is currently not shown.
            meta_rows.append({
                'meta_name': meta_name,
                'sample_values': str(sample_values) + etc,
            })

    # TODO: SAVE assay pickle

    toc = time.perf_counter()
    time_passed = round(toc - tic, 2)

    timing = "* Finished upload step in {} seconds*".format(time_passed)
    gn.add_result(timing, "markdown")

    gn.commit()
def main():
    """Score genes for differential expression between sample groups.

    Reads the ``assay`` and ``groups`` imports and the ``min_zscore``,
    ``max_zscore`` and ``min_expression_variation`` arguments from Granatum.

    Steps:
      1. Invert ``groups`` ({sample id: group}) into {group: [sample ids]}.
      2. Per group and per gene (assay row), compute the mean, the standard
         deviation and the two bounds returned by
         ``sms.DescrStatsW(row).tconfint_mean()``.
      3. Drop genes whose smallest per-group standard deviation is below
         ``min_expression_variation``.
      4. Build one score column per ordered group pair ("A vs B") and one
         per group against all remaining samples ("A vs rest"); every score
         is NaN-filled with 0 and clipped to ``[-max_zscore, max_zscore]``.
      5. Keep genes whose score vector has a Euclidean norm of at least
         ``min_zscore`` and export them as an assay and as raw CSV.

    Returns nothing; results are published through ``gn``.
    """
    tic = time.perf_counter()

    gn = Granatum()
    assay = gn.pandas_from_assay(gn.get_import('assay'))
    groups = gn.get_import('groups')

    min_zscore = gn.get_arg('min_zscore')
    max_zscore = gn.get_arg('max_zscore')
    min_expression_variation = gn.get_arg('min_expression_variation')

    # Invert {sample id: group} into {group: [sample ids]}; groups keep the
    # order in which they are first seen.
    inv_map = {}
    for sample_id, group_name in groups.items():
        inv_map.setdefault(group_name, []).append(sample_id)

    low_mean_dfs = []
    high_mean_dfs = []
    mean_dfs = []
    std_dfs = []
    colnames = []
    for k, v in inv_map.items():
        group_values = assay.loc[:, v]
        lowbound_clust = {}
        highbound_clust = {}
        for index, row in group_values.iterrows():
            # NOTE(review): presumably statsmodels' t-based confidence
            # interval for the mean at its default level -- confirm.
            meanbounds = sms.DescrStatsW(row).tconfint_mean()
            lowbound_clust[index] = meanbounds[0]
            highbound_clust[index] = meanbounds[1]
        low_mean_dfs.append(
            pd.DataFrame.from_dict(lowbound_clust, orient="index", columns=[k]))
        high_mean_dfs.append(
            pd.DataFrame.from_dict(highbound_clust, orient="index", columns=[k]))
        mean_dfs.append(group_values.mean(axis=1))
        std_dfs.append(group_values.std(axis=1))
        colnames.append(k)

    mean_df = pd.concat(mean_dfs, axis=1)
    mean_df.columns = colnames
    low_mean_df = pd.concat(low_mean_dfs, axis=1)
    low_mean_df.columns = colnames
    high_mean_df = pd.concat(high_mean_dfs, axis=1)
    high_mean_df.columns = colnames
    std_df = pd.concat(std_dfs, axis=1)
    std_df.columns = colnames
    print(std_df)

    # A gene is dropped when its smallest per-group standard deviation is
    # below the requested minimum.
    minvalues = std_df.min(axis=1).to_frame()
    minvalues.columns = ["min"]
    print("Minvalues>>")
    print(minvalues, flush=True)
    genes_below_min = list(
        (minvalues[minvalues["min"] < min_expression_variation]).index)
    print("{} out of {}".format(len(genes_below_min), len(minvalues.index)),
          flush=True)
    mean_df = mean_df.drop(genes_below_min, axis=0)
    low_mean_df = low_mean_df.drop(genes_below_min, axis=0)
    high_mean_df = high_mean_df.drop(genes_below_min, axis=0)
    std_df = std_df.drop(genes_below_min, axis=0)
    assay = assay.drop(genes_below_min, axis=0)
    print("Filtered assay to get {} columns by {} rows".format(
        len(assay.columns), len(assay.index)), flush=True)

    # Mean / standard deviation of every gene over the samples that are NOT
    # in the group. Columns are picked in assay order so the computation is
    # deterministic (the previous set-based selection had arbitrary order).
    mean_rest_dfs = []
    std_rest_dfs = []
    colnames = []
    for k, v in inv_map.items():
        in_group = set(v)
        rest_v = [c for c in assay.columns if c not in in_group]
        rest_values = assay.loc[:, rest_v]
        mean_rest_dfs.append(rest_values.mean(axis=1))
        std_rest_dfs.append(rest_values.std(axis=1))
        colnames.append(k)
    mean_rest_df = pd.concat(mean_rest_dfs, axis=1)
    mean_rest_df.columns = colnames
    std_rest_df = pd.concat(std_rest_dfs, axis=1)
    std_rest_df.columns = colnames

    zscore_dfs = []
    cols = colnames
    colnames = []
    for coli in cols:
        for colj in cols:
            if coli != colj:
                # Here we should check significance
                # Fetch most realistic mean comparison set, what is smallest
                # difference between two ranges
                mean_diff_overlap_low_high = (
                    low_mean_df[coli] - high_mean_df[colj])
                mean_diff_overlap_high_low = (
                    high_mean_df[coli] - low_mean_df[colj])
                # range_check is defined elsewhere in this file and is
                # applied element-wise to the two differences.
                diff_df = mean_diff_overlap_low_high.combine(
                    mean_diff_overlap_high_low, range_check)
                zscore_dfs.append(
                    (diff_df / (std_df[colj] + std_df[coli] / 4))
                    .fillna(0).clip(-max_zscore, max_zscore))
                colnames.append("{} vs {}".format(coli, colj))

    for coli in cols:
        # Compare each group with ITS OWN rest-of-samples statistics. The
        # previous code indexed the "rest" frames with `colj`, a variable
        # left over from the loop above, so every group was compared with
        # the rest of the last group. Only the index is corrected here; the
        # shape of the formula is unchanged.
        zscore_dfs.append(
            ((mean_df[coli] - mean_rest_df[coli]) /
             (std_rest_df[coli] + std_rest_df[coli] / 4))
            .fillna(0).clip(-max_zscore, max_zscore))
        colnames.append("{} vs rest".format(coli))

    zscore_df = pd.concat(zscore_dfs, axis=1)
    zscore_df.columns = colnames

    # Keep genes whose score vector (across all comparisons) is long enough.
    norms_df = zscore_df.apply(np.linalg.norm, axis=1)
    colsmatching = norms_df[norms_df >= min_zscore].index.values
    return_df = zscore_df.T[colsmatching]
    gn.export_statically(gn.assay_from_pandas(return_df),
                         'Differential expression sets')
    gn.export(return_df.T.to_csv(), 'differential_gene_sets.csv',
              kind='raw', meta=None, raw=True)

    toc = time.perf_counter()
    time_passed = round(toc - tic, 2)

    timing = "* Finished differential expression sets step in {} seconds*".format(
        time_passed)
    gn.add_result(timing, "markdown")

    gn.commit()
def main():
    """Render a cluster network whose edges and labels are numbered genes.

    Reads the ``clustersvsgenes`` import (rows are comparisons named
    "A vs B" or "A vs rest", columns are genes) and the ``max_dist`` and
    ``min_zscore`` arguments. For every gene/comparison score that is at
    least ``min_zscore``:

      * the gene gets a numeric key; a gene seen for the first time reuses
        the key of the first already-keyed gene whose root-mean-square
        distance over all comparisons is at most ``max_dist``, and in that
        case adds no edges or labels of its own;
      * an "A vs B" comparison adds a weighted edge B -> A labelled with the
        key; an "A vs rest" comparison appends the key to node A's label.

    The graph is written to ``plot.dot``, rendered to ``plot.png`` with the
    ``dot`` executable, and attached to the results together with a
    markdown footnote listing the genes behind each key.
    """
    tic = time.perf_counter()

    gn = Granatum()

    clustersvsgenes = gn.pandas_from_assay(gn.get_import('clustersvsgenes'))
    max_dist = gn.get_arg('max_dist')
    min_zscore = gn.get_arg('min_zscore')

    clustercomparisonstotest = list(clustersvsgenes.index)

    G = nx.MultiDiGraph()

    # Every "<name> vs rest" row contributes one node called <name>.
    individualclusters = []
    for comparison_name in list(clustersvsgenes.T.columns):
        if comparison_name.endswith("vs rest"):
            cut = comparison_name.index(" vs rest")
            individualclusters.append(comparison_name[:cut])
    print(individualclusters, flush=True)
    for node_name in individualclusters:
        G.add_node(node_name)

    relabels = {}  # node name -> "name: key, key, ..."
    keys = {}  # gene id -> numeric key
    currentkeyindex = 0
    maxexpression = np.max(np.max(clustersvsgenes))
    print("Max expression = {}".format(maxexpression))

    total_genes = len(clustersvsgenes.columns)
    print("Number to analyze = {}".format(
        total_genes * len(clustercomparisonstotest)), flush=True)

    for gene_count, gene_id in enumerate(clustersvsgenes.columns, start=1):
        print("Genecount = {}/{}".format(gene_count, total_genes), flush=True)
        add_all_edges_for_current_gene = True

        for cluster in clustercomparisonstotest:
            score = clustersvsgenes.loc[cluster, gene_id]
            # Written as a negated ">=" so that NaN scores are skipped.
            if not (score >= min_zscore):
                continue

            if gene_id in keys:
                add_edges = add_all_edges_for_current_gene
            else:
                # First check if within distance of another group
                nearest_key = None
                nearest_distance = 1.0e12
                gene_values = clustersvsgenes.loc[:, gene_id]
                for candidate in keys:
                    ref_values = clustersvsgenes.loc[:, candidate]
                    distance = np.sqrt(
                        np.nansum(np.square(gene_values - ref_values)) /
                        len(gene_values))
                    if distance <= max_dist and distance < nearest_distance:
                        nearest_distance = distance
                        nearest_key = candidate
                        break  # the first candidate within range wins

                if nearest_key is None:
                    keys[gene_id] = currentkeyindex + 1
                    add_edges = True
                else:
                    keys[gene_id] = keys[nearest_key]
                    add_edges = False
                    add_all_edges_for_current_gene = False
                    print("Found a near gene: {}".format(nearest_key),
                          flush=True)

            if add_edges:
                key_text = str(keys[gene_id])
                from_to = re.split(' vs ', cluster)
                if from_to[1] != 'rest':
                    weight = score / maxexpression * 1.0
                    G.add_weighted_edges_from(
                        [(from_to[1], from_to[0], weight)],
                        label=key_text,
                        penwidth=str(weight))
                else:
                    node_name = from_to[0]
                    label_so_far = relabels.get(node_name, "")
                    if label_so_far == "":
                        relabels[node_name] = node_name + ": " + key_text
                    else:
                        relabels[node_name] = label_so_far + ", " + key_text

            currentkeyindex = max(currentkeyindex, keys[gene_id])

    print("Relabels {}".format(relabels), flush=True)
    G = nx.relabel_nodes(G, relabels)
    pos = nx.spring_layout(G)
    edge_labels = nx.get_edge_attributes(G, 'label')
    write_dot(G, 'plot.dot')
    os.system('dot plot.dot -Kcirco -Tpng -Gsize="6,6" -Gdpi=600 > plot.png')
    with open('plot.png', "rb") as png_file:
        image_b64 = b64encode(png_file.read()).decode("utf-8")

    figure = {
        "type": "png",
        "width": 650,
        "height": 480,
        "description": 'Network of clusters based on expression',
        "data": image_b64,
    }
    gn.results.append(figure)

    # Group gene ids by their numeric key for the footnote.
    genes_by_key = {}
    for gene, key_number in keys.items():
        genes_by_key[key_number] = genes_by_key.get(key_number, []) + [gene]

    footnote_lines = []
    for key_number in sorted(genes_by_key):
        genes = genes_by_key[key_number]
        links = ", ".join(
            ["[{}]({})".format(gene, geturl(gene)) for gene in genes])
        # Scores of the key's first gene that exceed the threshold.
        first_gene = genes[0]
        passing = clustersvsgenes.loc[
            clustersvsgenes[first_gene] > min_zscore, first_gene].to_dict()
        footnote_lines.append("{}: {} {}".format(key_number, passing, links))
    footnote = " \n".join(footnote_lines)

    gn.add_result(footnote, "markdown")

    toc = time.perf_counter()
    time_passed = round(toc - tic, 2)

    timing = "* Finished differential expression sets step in {} seconds*".format(
        time_passed)
    gn.add_result(timing, "markdown")

    gn.commit()
def main():
    """Render a cluster network whose edges and labels are numbered gene sets.

    Reads the ``clustersvsgenes`` import (rows are comparisons named
    "A vs B" or "A vs rest", columns are genes) and the ``gset_group_id``
    and ``min_zscore`` arguments, and loads gene sets with
    ``load_gsets(gset_group_id)``. Each gene set is read through its
    ``"name"``, ``"url"`` and ``"gene_ids"`` entries.

    For every gene set and comparison, the score is the mean of the
    comparison's values over the set's genes, with NaN replaced by
    ``np.nan_to_num`` first. Scores of at least ``min_zscore`` give the set
    a numeric key and add either a weighted edge B -> A ("A vs B") or the
    key to node A's label ("A vs rest"). Any exception raised while
    handling one gene set / comparison pair is printed and that pair is
    skipped.

    The graph is written to ``plot.dot``, rendered to ``plot.png`` with the
    ``dot`` executable, and attached to the results together with a
    markdown footnote linking each key to its gene set URL.
    """
    tic = time.perf_counter()

    gn = Granatum()

    clustersvsgenes = gn.pandas_from_assay(gn.get_import('clustersvsgenes'))
    gset_group_id = gn.get_arg('gset_group_id')
    min_zscore = gn.get_arg('min_zscore')

    clustercomparisonstotest = list(clustersvsgenes.index)

    # Load all gene sets
    gsets = load_gsets(gset_group_id)

    G = nx.MultiDiGraph()

    # Every "<name> vs rest" row contributes one node called <name>.
    individualclusters = []
    for comparison_name in list(clustersvsgenes.T.columns):
        if comparison_name.endswith("vs rest"):
            cut = comparison_name.index(" vs rest")
            individualclusters.append(comparison_name[:cut])
    print(individualclusters, flush=True)
    for node_name in individualclusters:
        G.add_node(node_name)

    # {pathway : {"cluster1":score1, "cluster2":score2}, pathway2 : {}}
    resultsmap = {}
    relabels = {}  # node name -> "name: key, key, ..."
    keys = {}  # gene set name -> numeric key
    urlsforkeys = {}  # gene set name -> url
    currentkeyindex = 0

    for gset in gsets:
        set_name = gset["name"]
        urlsforkeys[set_name] = gset["url"]

        for cluster in clustercomparisonstotest:
            try:
                # The "gene_ids" lookup stays inside the try so that a
                # failure here is reported like any other.
                values = np.nan_to_num(
                    clustersvsgenes.loc[cluster, gset["gene_ids"]])
                score = np.nanmean(values)
                if not (score >= min_zscore):
                    continue

                if set_name not in keys:
                    keys[set_name] = currentkeyindex + 1
                print("Score = {}".format(score), flush=True)
                resultsmap.setdefault(set_name, {})[cluster] = score

                key_text = str(keys[set_name])
                from_to = re.split(' vs ', cluster)
                if from_to[1] != 'rest':
                    weight = score * 2.0
                    G.add_weighted_edges_from(
                        [(from_to[1], from_to[0], weight)],
                        label=key_text,
                        penwidth=str(weight))
                else:
                    node_name = from_to[0]
                    label_so_far = relabels.get(node_name, "")
                    if label_so_far == "":
                        relabels[node_name] = node_name + ": " + key_text
                    else:
                        relabels[node_name] = label_so_far + ", " + key_text

                currentkeyindex = max(currentkeyindex, keys[set_name])
            except Exception as inst:
                print("Key error with {}".format(gset["name"]), flush=True)
                print("Exception: {}".format(inst), flush=True)

    print("Relabels {}".format(relabels), flush=True)
    G = nx.relabel_nodes(G, relabels)
    pos = nx.spring_layout(G)
    edge_labels = nx.get_edge_attributes(G, 'label')
    write_dot(G, 'plot.dot')
    os.system("dot plot.dot -Tpng -Gdpi=600 > plot.png")
    with open('plot.png', "rb") as png_file:
        image_b64 = b64encode(png_file.read()).decode("utf-8")

    figure = {
        "type": "png",
        "width": 650,
        "height": 480,
        "description": 'Network of clusters based on expression',
        "data": image_b64,
    }
    gn.results.append(figure)

    # One footnote line per gene set, ordered by numeric key.
    footnote_lines = []
    for set_name, key_number in sorted(keys.items(),
                                       key=lambda item: item[1]):
        footnote_lines.append("{}: [{}]({})".format(
            key_number, set_name, urlsforkeys[set_name]))
    footnote = " \n".join(footnote_lines)

    gn.add_result(footnote, "markdown")

    toc = time.perf_counter()
    time_passed = round(toc - tic, 2)

    timing = "* Finished differential expression sets step in {} seconds*".format(
        time_passed)
    gn.add_result(timing, "markdown")

    gn.commit()
def main():
    """Drop and/or merge sample groups and export the adjusted assay and labels.

    Reads the ``assay`` and ``groups`` imports. ``drop_set`` and
    ``merge_set_1..3`` are passed through ``parse``; ``relabel_set_1..3``
    are used as given. A merge set with an empty relabel string is named by
    joining its members with " + ".

    Samples of every group listed in ``drop_set`` are removed from both the
    assay and the labels. Samples of every group listed in a merge set
    receive that set's new label. An error in either phase is reported as a
    markdown result instead of being raised, and whatever was applied
    before the error is kept.
    """
    tic = time.perf_counter()

    gn = Granatum()
    assay = gn.pandas_from_assay(gn.get_import('assay'))
    groups = gn.get_import('groups')

    # {group label: [sample ids]} built from the labels as imported; it is
    # not refreshed after dropping.
    inv_map = {}
    for sample_id, label in groups.items():
        inv_map.setdefault(label, []).append(sample_id)

    drop_set = parse(gn.get_arg('drop_set'))
    merge_sets = [
        parse(gn.get_arg(arg_name))
        for arg_name in ('merge_set_1', 'merge_set_2', 'merge_set_3')
    ]
    new_labels = [
        gn.get_arg(arg_name)
        for arg_name in ('relabel_set_1', 'relabel_set_2', 'relabel_set_3')
    ]

    # Default name of a merged group: its members joined by " + ".
    for position, members in enumerate(merge_sets):
        if len(members) > 0 and new_labels[position] == "":
            new_labels[position] = " + ".join(members)

    try:
        for dropped_label in drop_set:
            cells = inv_map[dropped_label]
            gn.add_result(
                "Dropping {} cells that match {}".format(
                    len(cells), dropped_label),
                "markdown")
            assay = assay.drop(cells, axis=1)
            groups = {
                sample_id: label
                for sample_id, label in groups.items()
                if label != dropped_label
            }
    except Exception as e:
        gn.add_result(
            "Error found in drop set, remember it should be comma separated: {}"
            .format(e), "markdown")

    try:
        # One try around all three merges: a failure in one set leaves the
        # earlier sets applied and skips the later ones.
        for members, new_label in zip(merge_sets, new_labels):
            if len(members) > 0:
                # Collect all cells first so a bad label aborts this set
                # before any of its cells are relabelled.
                merged_cells = [
                    cell for label in members for cell in inv_map[label]
                ]
                for cell in merged_cells:
                    groups[cell] = new_label
    except Exception as e:
        gn.add_result(
            "Error found in merge sets, remember it should be comma separated: {}"
            .format(e), "markdown")

    toc = time.perf_counter()
    time_passed = round(toc - tic, 2)

    gn.export_statically(gn.assay_from_pandas(assay), "Label adjusted assay")
    gn.export_statically(groups, "Adjusted labels")

    timing = "* Finished sample coloring step in {} seconds*".format(
        time_passed)
    gn.add_result(timing, "markdown")

    gn.commit()