def main():
    """Filter cells of the assay by unique-gene count and mitochondrial fraction.

    Cells are kept when their number of detected genes lies in
    [min_genes_per_cell, max_genes_per_cell] and their MT read fraction is at
    or below mt_genes_percent.  Reports counts, plots the two distributions,
    and exports the filtered assay.
    """
    start = time.perf_counter()
    gn = Granatum()
    expr = gn.pandas_from_assay(gn.get_import('assay'))

    min_genes = gn.get_arg('min_genes_per_cell')
    max_genes = gn.get_arg('max_genes_per_cell')
    mt_cutoff = gn.get_arg('mt_genes_percent') / 100.0

    # Per-cell summaries (rows = genes, columns = cells).
    genes_detected = expr.astype(bool).sum(axis=0)  # nonzero genes per cell
    total_counts = expr.sum(axis=0)
    mt_counts = expr[expr.index.str.startswith('MT')].sum(axis=0)
    mt_fraction = mt_counts.div(total_counts)

    kept_cells = genes_detected[(genes_detected >= min_genes)
                                & (genes_detected <= max_genes)
                                & (mt_fraction <= mt_cutoff)].index.values
    filtered = expr.loc[:, kept_cells]

    n_before = genes_detected.index.size
    n_after = len(kept_cells)
    n_low = genes_detected[genes_detected < min_genes].index.size
    n_high = genes_detected[genes_detected > max_genes].index.size
    n_mt = genes_detected[mt_fraction > mt_cutoff].index.size
    gn.add_result(
        "Number of cells is now {} out of {} original cells with {} below min genes, {} above max genes, and {} above mt percentage threshold.".format(
            n_after, n_before, n_low, n_high, n_mt),
        "markdown")

    # Two stacked histograms: unique gene counts and MT percentage.
    plt.figure()
    plt.subplot(2, 1, 1)
    plt.title('Unique gene count distribution')
    sns.distplot(genes_detected, bins=int(200), color='darkblue',
                 kde_kws={'linewidth': 2})
    plt.ylabel('Frequency')
    plt.xlabel('Gene count')
    plt.subplot(2, 1, 2)
    plt.title('MT Percent Distribution')
    sns.distplot(mt_fraction * 100.0, bins=int(200), color='darkblue',
                 kde_kws={'linewidth': 2})
    plt.ylabel('Frequency')
    plt.xlabel('MT Percent')
    plt.tight_layout()

    caption = (
        'The distribution of expression levels for each cell with various metrics.'
    )
    gn.add_current_figure_to_results(caption, zoom=1, dpi=75)

    gn.export(gn.assay_from_pandas(filtered), "Filtered Cells Assay",
              dynamic=False)

    elapsed = round(time.perf_counter() - start, 2)
    gn.add_result(
        "* Finished cell filtering step in {} seconds*".format(elapsed),
        "markdown")
    gn.commit()
def main():
    """Remove genes expressed (above `epsilon`) in fewer than
    `min_cells_expressed` cells.

    Reports before/after gene counts, shows a table of removed genes (or a
    note when nothing was removed), exports the per-gene selection table as
    CSV, and exports the filtered assay.
    """
    gn = Granatum()

    df = gn.pandas_from_assay(gn.get_import("assay"))
    epsilon = gn.get_arg('epsilon')
    min_cells_expressed = gn.get_arg('min_cells_expressed')

    # Per-gene statistics, vectorized over the matrix (rows = genes,
    # columns = cells) instead of the original per-row Python loops.
    filter_df = pd.DataFrame({'gene': df.index})
    filter_df['sum_expr'] = df.values.sum(axis=1)
    filter_df['avg_expr'] = filter_df['sum_expr'] / df.shape[1]
    # NOTE: despite the historical column name, this is the number of *cells*
    # in which the gene is expressed above epsilon.
    filter_df['num_expressed_genes'] = (df.values > epsilon).sum(axis=1)
    filter_df['removed'] = filter_df['num_expressed_genes'] < min_cells_expressed

    new_df = df.loc[np.logical_not(filter_df['removed'].values), :]

    gn.add_result(
        "\n".join([
            "Number of genes before filtering: **{}**".format(df.shape[0]),
            "",
            "Number of genes after filtering: **{}**".format(new_df.shape[0]),
        ]),
        type="markdown",
    )

    filter_df_deleted = filter_df.loc[filter_df['removed'].values, :].drop(
        'removed', axis=1)
    # BUG FIX: the original tested `filter_df.shape[0] > 0`, which is true
    # whenever the assay has any genes at all, so the "no genes removed"
    # message was unreachable.  Test the removed set instead.
    if filter_df_deleted.shape[0] > 0:
        gn.add_result(
            {
                'title': f"Removed genes ({filter_df_deleted.shape[0]})",
                'orient': 'split',
                'columns': filter_df_deleted.columns.values.tolist(),
                'data': filter_df_deleted.values.tolist(),
            },
            data_type='table',
        )
    else:
        gn.add_result(
            f"No genes were removed. All {df.shape[0]} genes were kept. "
            f"See attachment **gene_selection.csv** for detail.",
            'markdown',
        )

    gn.export(filter_df.to_csv(index=False), 'gene_selection.csv',
              kind='raw', meta=None, raw=True)
    gn.export(gn.assay_from_pandas(new_df), "Filtered Assay", dynamic=False)
    gn.commit()
def main():
    """Relabel clusters to their best-matching reference labels and, when
    requested, drop cells whose cluster disagrees with the reference."""
    start = time.perf_counter()
    gn = Granatum()
    assay = gn.pandas_from_assay(gn.get_import('assay'))
    groups = gn.get_import('groups')
    reflabels = gn.get_import('reflabels')
    remove_cells = gn.get_arg('remove_cells')

    # Invert the cell -> label maps into label -> cells maps.
    cluster_cells = {}
    for cell, label in groups.items():
        cluster_cells.setdefault(label, []).append(cell)
    ref_cells = {}
    for cell, label in reflabels.items():
        ref_cells.setdefault(label, []).append(cell)

    # For each cluster pick the reference label with the largest member
    # overlap; cells outside that overlap are considered mislabelled.
    relabel = {}
    mislabelled = []
    for label, cells in cluster_cells.items():
        cellset = set(cells)
        overlaps = {}
        for ref_label, ref_members in ref_cells.items():
            overlaps[ref_label] = len(set(ref_members).intersection(cellset))
        relabel[label] = max(overlaps, key=overlaps.get)
        mislabelled += list(cellset.difference(set(ref_cells[relabel[label]])))

    if remove_cells:
        gn.add_result(
            "Dropping {} mislabelled cells".format(len(mislabelled)),
            "markdown")
        assay = assay.drop(mislabelled, axis=1)
        groups = {
            cell: label
            for cell, label in groups.items() if not cell in mislabelled
        }

    # Rewrite every surviving cell's label to its matched reference label.
    for cell in groups:
        groups[cell] = relabel[groups[cell]]

    elapsed = round(time.perf_counter() - start, 2)
    gn.export_statically(gn.assay_from_pandas(assay), "Corresponded assay")
    gn.export_statically(groups, "Corresponded labels")
    gn.add_result(
        "* Finished sample coloring step in {} seconds*".format(elapsed),
        "markdown")
    gn.commit()
def main():
    """Scale the assay to unit Frobenius norm and report both norms."""
    gn = Granatum()
    matrix = gn.pandas_from_assay(gn.get_import("assay"))

    norm_before = np.linalg.norm(matrix.values)
    matrix = matrix / norm_before

    gn.add_result(
        f"""\
The original assay had Frobenius norm of {norm_before}, after normalization its Frobenius norm is now {np.linalg.norm(matrix.values)}""",
        'markdown',
    )

    gn.export(gn.assay_from_pandas(matrix), "Frobenius normalized assay",
              dynamic=False)
    gn.commit()
def main():
    """Compute differential-expression comparison sets for every ordered
    cluster pair (and each cluster vs. the rest), one gene at a time, in
    parallel via `compref`."""
    start = time.perf_counter()
    gn = Granatum()
    assay = gn.pandas_from_assay(gn.get_import('assay'))
    # groups maps cell -> cluster name.
    groups = gn.get_import('groups')
    certainty = gn.get_arg('certainty')
    alpha = 1 - certainty / 100.0
    min_zscore = st.norm.ppf(gn.get_arg("certainty") / 100.0)
    min_dist = 0.1

    # Drop genes whose expression vector is too small to support a statistic.
    gene_norms = assay.apply(np.linalg.norm, axis=1)
    assay = assay.loc[gene_norms.T >= min_dist, :]

    # cluster -> member cells, and cluster -> every other cell.
    members = {}
    complements = {}
    for cell, cluster in groups.items():
        members[cluster] = members.get(cluster, []) + [cell]
        others = complements.get(cluster, list(assay.columns))
        others.remove(cell)
        complements[cluster] = others

    print("Completed setup", flush=True)

    clusters = list(members.keys())
    # Pairwise comparisons first, then each cluster against the rest —
    # order matters because compref consumes these names positionally.
    comparisons = [
        "{} vs {}".format(a, b) for a in clusters for b in clusters if a != b
    ]
    comparisons += ["{} vs rest".format(a) for a in clusters]

    # Each gene is scored independently; results are per-gene DataFrames
    # stacked into one table.
    total_genes = len(assay.index)
    print("Executing parallel for {} genes".format(total_genes), flush=True)
    per_gene = Parallel(
        n_jobs=math.floor(multiprocessing.cpu_count() * 2 * 9 / 10))(
            delayed(compref)(gene, assay.loc[gene, :], comparisons, members,
                             complements, alpha, min_dist, min_zscore)
            for gene in tqdm(list(assay.index)))
    result = pd.concat(per_gene, axis=0)

    gn.export_statically(gn.assay_from_pandas(result.T),
                         'Differential expression sets')
    gn.export(result.to_csv(), 'differential_gene_sets.csv', kind='raw',
              meta=None, raw=True)

    elapsed = round(time.perf_counter() - start, 2)
    gn.add_result(
        "* Finished differential expression sets step in {} seconds*".format(
            elapsed), "markdown")
    gn.commit()
def main():
    """Run JAMMIT signal extraction at the requested alpha and report the
    signal genes and samples: meta exports, sortable tables, CSV downloads,
    a cluster map of the signal sub-assay, and scatter/line plots of the
    loading (u) and score (v) vectors."""
    gn = Granatum()
    df = gn.pandas_from_assay(gn.get_import('assay'))
    alpha = gn.get_arg('alpha')

    jammit = JAMMIT.from_dfs([df])
    res = jammit.run_for_one_alpha(
        alpha,
        verbose=1,
        convergence_threshold=0.000000001,
    )
    loadings = res['u']  # one value per gene
    scores = res['v']    # one value per sample

    gn.export(dict(zip(df.index, loadings)), 'Genes loadings', kind='geneMeta')
    gn.export(dict(zip(df.columns, scores)), 'Sample scores', kind='sampleMeta')

    # Genes whose loading magnitude exceeds EPSILON, largest first.
    gene_df = pd.DataFrame({
        'id_': df.index,
        'abs_loading': abs(loadings),
        'loading': loadings
    })
    gene_df = gene_df[['id_', 'abs_loading', 'loading']]
    gene_df = gene_df.loc[gene_df['loading'].abs() > EPSILON]
    gene_df = gene_df.sort_values('abs_loading', ascending=False)
    gn.add_result(
        {
            'title': f"Signal genes ({len(gene_df)})",
            'orient': 'split',
            'columns': gene_df.columns.values.tolist(),
            'data': gene_df.values.tolist(),
        },
        data_type='table',
    )
    gn.export(gene_df.to_csv(index=False), 'signal_genes.csv', kind='raw',
              meta=None, raw=True)

    # Samples whose score magnitude exceeds EPSILON, largest first.
    sample_df = pd.DataFrame({
        'id_': df.columns,
        'abs_score': abs(scores),
        'score': scores
    })
    sample_df = sample_df[['id_', 'abs_score', 'score']]
    sample_df = sample_df.loc[sample_df['score'].abs() > EPSILON]
    sample_df = sample_df.sort_values('abs_score', ascending=False)
    gn.add_result(
        {
            'title': f"Signal samples ({len(sample_df)})",
            'orient': 'split',
            'columns': sample_df.columns.values.tolist(),
            'data': sample_df.values.tolist(),
        },
        data_type='table',
    )
    gn.export(sample_df.to_csv(index=False), 'signal_samples.csv', kind='raw',
              meta=None, raw=True)

    # Sub-assay restricted to the signal genes and signal samples.
    subset_df = df.loc[gene_df['id_'], sample_df['id_']]
    gn.export(gn.assay_from_pandas(subset_df),
              'Assay with only signal genes and samples', kind='assay')

    sns.clustermap(subset_df, cmap='RdBu')
    gn.add_current_figure_to_results(
        description='Cluster map of the signal genes and signal samples',
        zoom=2,
        width=750,
        height=850,
        dpi=50,
    )
    plt.close()

    plt.figure()
    plt.scatter(range(len(loadings)), loadings, s=2, c='red')
    plt.xlabel('index')
    plt.ylabel('value in u')
    gn.add_current_figure_to_results(
        description='The *u* vector (loadings for genes) plotted as a scatter plot.',
        zoom=2,
        width=750,
        height=450,
        dpi=50,
    )
    plt.close()

    plt.figure()
    plt.plot(range(len(scores)), scores)
    plt.scatter(range(len(scores)), scores, s=6, c='red')
    plt.xlabel('index')
    plt.ylabel('value in v')
    gn.add_current_figure_to_results(
        description='The *v* vector (scores for samples) plotted as a line plot.',
        zoom=2,
        width=750,
        height=450,
        dpi=50,
    )
    plt.close()

    gn.commit()
def main():
    """Build per-gene z-score comparison sets between every ordered pair of
    clusters and between each cluster and the rest, keeping only genes whose
    overall comparison vector is strong enough.

    Bug fix vs. the original: the "<cluster> vs rest" z-scores indexed
    `mean_rest_df` / `std_rest_df` with `colj` — a stale variable left over
    from the pairwise loop above — instead of the cluster under comparison
    (`coli`), so every "vs rest" column was computed against an arbitrary
    cluster's rest-statistics.
    """
    tic = time.perf_counter()
    gn = Granatum()
    assay = gn.pandas_from_assay(gn.get_import('assay'))
    groups = gn.get_import('groups')  # cell -> cluster
    min_zscore = gn.get_arg('min_zscore')
    max_zscore = gn.get_arg('max_zscore')
    min_expression_variation = gn.get_arg('min_expression_variation')

    # cluster -> member cells
    inv_map = {}
    for k, v in groups.items():
        inv_map[v] = inv_map.get(v, []) + [k]

    # Per-cluster, per-gene statistics: confidence bounds on the mean
    # (via statsmodels DescrStatsW.tconfint_mean), the mean, and the std.
    low_mean_dfs = []
    high_mean_dfs = []
    mean_dfs = []
    std_dfs = []
    colnames = []
    for k, v in inv_map.items():
        group_values = assay.loc[:, v]
        lowbound_clust = {}
        highbound_clust = {}
        for index, row in group_values.iterrows():
            meanbounds = sms.DescrStatsW(row).tconfint_mean()
            lowbound_clust[index] = meanbounds[0]
            highbound_clust[index] = meanbounds[1]
        low_mean_dfs.append(
            pd.DataFrame.from_dict(lowbound_clust, orient="index", columns=[k]))
        high_mean_dfs.append(
            pd.DataFrame.from_dict(highbound_clust, orient="index", columns=[k]))
        mean_dfs.append(group_values.mean(axis=1))
        std_dfs.append(group_values.std(axis=1))
        colnames.append(k)

    mean_df = pd.concat(mean_dfs, axis=1)
    mean_df.columns = colnames
    low_mean_df = pd.concat(low_mean_dfs, axis=1)
    low_mean_df.columns = colnames
    high_mean_df = pd.concat(high_mean_dfs, axis=1)
    high_mean_df.columns = colnames
    std_df = pd.concat(std_dfs, axis=1)
    std_df.columns = colnames
    print(std_df)

    # Drop genes whose least-variable cluster is still below the cutoff —
    # they carry too little variation to support a comparison.
    minvalues = std_df.min(axis=1).to_frame()
    minvalues.columns = ["min"]
    print("Minvalues>>")
    print(minvalues, flush=True)
    genes_below_min = list(
        (minvalues[minvalues["min"] < min_expression_variation]).index)
    print("{} out of {}".format(len(genes_below_min), len(minvalues.index)),
          flush=True)
    mean_df = mean_df.drop(genes_below_min, axis=0)
    low_mean_df = low_mean_df.drop(genes_below_min, axis=0)
    high_mean_df = high_mean_df.drop(genes_below_min, axis=0)
    std_df = std_df.drop(genes_below_min, axis=0)
    assay = assay.drop(genes_below_min, axis=0)
    print("Filtered assay to get {} columns by {} rows".format(
        len(assay.columns), len(assay.index)), flush=True)

    # Complementary ("rest") statistics per cluster.
    mean_rest_dfs = []
    std_rest_dfs = []
    colnames = []
    for k, v in inv_map.items():
        rest_v = list(set(list(assay.columns)).difference(set(v)))
        mean_rest_dfs.append(assay.loc[:, rest_v].mean(axis=1))
        std_rest_dfs.append(assay.loc[:, rest_v].std(axis=1))
        colnames.append(k)
    mean_rest_df = pd.concat(mean_rest_dfs, axis=1)
    mean_rest_df.columns = colnames
    std_rest_df = pd.concat(std_rest_dfs, axis=1)
    std_rest_df.columns = colnames

    zscore_dfs = []
    cols = colnames
    colnames = []
    for coli in cols:
        for colj in cols:
            if coli != colj:
                # Use the most conservative mean difference: the smallest
                # gap between the two clusters' confidence intervals
                # (range_check combines the two CI-edge differences).
                mean_diff_overlap_low_high = (low_mean_df[coli] - high_mean_df[colj])
                mean_diff_overlap_high_low = (high_mean_df[coli] - low_mean_df[colj])
                diff_df = mean_diff_overlap_low_high.combine(
                    mean_diff_overlap_high_low, range_check)
                zscore_dfs.append(
                    (diff_df / (std_df[colj] + std_df[coli] / 4)).fillna(0).clip(
                        -max_zscore, max_zscore))
                colnames.append("{} vs {}".format(coli, colj))
    for coli in cols:
        # FIX: original used `colj` here (stale from the loop above).
        # Denominator mirrors the pairwise case: other-group std ("rest")
        # plus a quarter of the cluster's own std.
        zscore_dfs.append(
            ((mean_df[coli] - mean_rest_df[coli]) /
             (std_rest_df[coli] + std_df[coli] / 4)).fillna(0).clip(
                 -max_zscore, max_zscore))
        colnames.append("{} vs rest".format(coli))

    zscore_df = pd.concat(zscore_dfs, axis=1)
    zscore_df.columns = colnames

    # Keep genes whose comparison vector carries enough overall signal.
    norms_df = zscore_df.apply(np.linalg.norm, axis=1)
    colsmatching = norms_df.T[(norms_df.T >= min_zscore)].index.values
    return_df = zscore_df.T[colsmatching]

    gn.export_statically(gn.assay_from_pandas(return_df),
                         'Differential expression sets')
    gn.export(return_df.T.to_csv(), 'differential_gene_sets.csv', kind='raw',
              meta=None, raw=True)

    toc = time.perf_counter()
    time_passed = round(toc - tic, 2)
    timing = "* Finished differential expression sets step in {} seconds*".format(
        time_passed)
    gn.add_result(timing, "markdown")
    gn.commit()
def main():
    """Combine two assays along genes or samples, optionally rescaling each
    so their magnitudes match.

    Fixes vs. the original:
    - 'mean' normalization used `np.mean(tb1)` on a DataFrame, which yields
      per-column means (a Series) rather than the intended scalar mean.
    - the 'mean'-branch markdown contained a stray `",` typo.
    - when combining along samples the final report printed the transposed
      shapes (genes and cells swapped), because tb1/tb2 were still
      transposed at that point.
    """
    gn = Granatum()

    tb1 = gn.pandas_from_assay(gn.get_import('assay1'))
    tb2 = gn.pandas_from_assay(gn.get_import('assay2'))
    label1 = gn.get_arg('label1')
    label2 = gn.get_arg('label2')
    direction = gn.get_arg('direction')
    normalization = gn.get_arg('normalization')

    # Work with the concatenation axis as rows; transpose when combining
    # along samples.
    if direction == 'samples':
        tb1 = tb1.T
        tb2 = tb2.T

    # Disambiguate row labels appearing in both assays.
    overlapped_index = set(tb1.index) & set(tb2.index)
    tb1.index = [
        f"{label1}_{x}" if x in overlapped_index else x for x in tb1.index
    ]
    tb2.index = [
        f"{label2}_{x}" if x in overlapped_index else x for x in tb2.index
    ]

    if normalization == 'none':
        tb = pd.concat([tb1, tb2], axis=0)
    elif normalization == 'frobenius':
        ntb1 = np.linalg.norm(tb1)
        ntb2 = np.linalg.norm(tb2)
        ntb = np.mean([ntb1, ntb2])
        fct1 = ntb / ntb1
        fct2 = ntb / ntb2
        tb = pd.concat([tb1 * fct1, tb2 * fct2], axis=0)
        gn.add_markdown(f"""\
Normalization info:

  - Assay **{label1}** is multiplied by {fct1}
  - Assay **{label2}** is multiplied by {fct2}
""")
    elif normalization == 'mean':
        # Scalar mean over all entries (np.mean on a DataFrame would give
        # per-column means).
        ntb1 = tb1.values.mean()
        ntb2 = tb2.values.mean()
        ntb = np.mean([ntb1, ntb2])
        fct1 = ntb / ntb1
        fct2 = ntb / ntb2
        tb = pd.concat([tb1 * fct1, tb2 * fct2], axis=0)
        gn.add_markdown(f"""\
Normalization info:

  - Assay **{label1}** is multiplied by {fct1}
  - Assay **{label2}** is multiplied by {fct2}
""")
    else:
        raise ValueError(f"Unknown normalization: {normalization}")

    if direction == 'samples':
        tb = tb.T

    # Report shapes in gene x cell orientation; tb1/tb2 are still transposed
    # when combining along samples, so flip their shapes for the report.
    shape1 = tb1.shape if direction != 'samples' else tb1.shape[::-1]
    shape2 = tb2.shape if direction != 'samples' else tb2.shape[::-1]
    gn.add_markdown(f"""\
You combined the following assays:

  - Assay 1 (with {shape1[0]} genes and {shape1[1]} cells)
  - Assay 2 (with {shape2[0]} genes and {shape2[1]} cells)

into:

  - Combined Assay (with {tb.shape[0]} genes and {tb.shape[1]} cells)
""")

    gn.export_statically(gn.assay_from_pandas(tb), 'Combined assay')

    if direction == 'samples':
        meta_type = 'sampleMeta'
    elif direction == 'genes':
        meta_type = 'geneMeta'
    else:
        raise ValueError(f"Unknown direction: {direction}")

    # Tag each combined row (gene or sample, per direction) with its source
    # assay; tb1/tb2 row indices are exactly the combined-axis labels here.
    gn.export(
        {
            **{x: label1 for x in tb1.index},
            **{x: label2 for x in tb2.index}
        }, 'Assay label', meta_type)

    gn.commit()
def main():
    """Drop, merge, and relabel sample groups per the user's arguments.

    `drop_set` removes whole groups (and their cells from the assay); each
    `merge_set_N` collapses several groups into one, labelled by
    `relabel_set_N` or, when empty, by the joined source names.
    """
    start = time.perf_counter()
    gn = Granatum()
    assay = gn.pandas_from_assay(gn.get_import('assay'))
    groups = gn.get_import('groups')

    # label -> member cells
    label_cells = {}
    for cell, label in groups.items():
        label_cells.setdefault(label, []).append(cell)

    drop_set = parse(gn.get_arg('drop_set'))
    merge_sets = [
        parse(gn.get_arg('merge_set_1')),
        parse(gn.get_arg('merge_set_2')),
        parse(gn.get_arg('merge_set_3')),
    ]
    relabels = [
        gn.get_arg('relabel_set_1'),
        gn.get_arg('relabel_set_2'),
        gn.get_arg('relabel_set_3'),
    ]

    # Default each merged group's new label to "a + b + ..." when not given.
    for i, mset in enumerate(merge_sets):
        if len(mset) > 0 and relabels[i] == "":
            relabels[i] = " + ".join(mset)

    try:
        for ds in drop_set:
            cells = label_cells[ds]
            gn.add_result(
                "Dropping {} cells that match {}".format(len(cells), ds),
                "markdown")
            assay = assay.drop(cells, axis=1)
            groups = {key: val for key, val in groups.items() if val != ds}
    except Exception as e:
        gn.add_result(
            "Error found in drop set, remember it should be comma separated: {}"
            .format(e), "markdown")

    try:
        for mset, new_label in zip(merge_sets, relabels):
            if len(mset) > 0:
                merged_cells = []
                for label in mset:
                    merged_cells = merged_cells + label_cells[label]
                for cell in merged_cells:
                    groups[cell] = new_label
    except Exception as e:
        gn.add_result(
            "Error found in merge sets, remember it should be comma separated: {}"
            .format(e), "markdown")

    elapsed = round(time.perf_counter() - start, 2)
    gn.export_statically(gn.assay_from_pandas(assay), "Label adjusted assay")
    gn.export_statically(groups, "Adjusted labels")
    gn.add_result(
        "* Finished sample coloring step in {} seconds*".format(elapsed),
        "markdown")
    gn.commit()