def add_genes_and_save(dat, fn, type='excel'):
    """
    Annotate a table with gene symbols, then write it to disk.

    The requested format is checked before anything else happens, so an
    unsupported ``type`` raises without modifying ``dat`` or writing a file.

    :param dat: Table to export. It is handed to
        ``general.add_gene_symbols_to_ensembl_data`` and must provide the
        ``to_excel`` / ``to_csv`` writer methods (presumably a pandas
        DataFrame indexed by Ensembl ID -- confirm against callers).
    :param fn: Destination, passed unchanged to the writer method.
    :param type: Output format, either 'excel' (default) or 'csv'. The name
        shadows the builtin but is kept so keyword callers keep working.
    :raises NotImplementedError: if ``type`` is not a supported format.
    """
    # maps each supported format onto the name of the writer method on `dat`
    writer_names = {'excel': 'to_excel', 'csv': 'to_csv'}
    if type not in writer_names:
        raise NotImplementedError("Unsupported type %s." % type)

    # called for its effect on `dat`; the return value is not used
    general.add_gene_symbols_to_ensembl_data(dat)
    getattr(dat, writer_names[type])(fn)
# insert spacing columns po_dat.insert(spacing1, '', np.nan) po_dat.insert(spacing2, ' ', np.nan) fig = plt.figure(figsize=(7, 3 + 10 / 50. * len(po_ref_diff))) ax = fig.add_subplot(111) ax = sns.heatmap(po_dat, cmap=cmap, ax=ax) plt.setp(ax.xaxis.get_ticklabels(), rotation=90) plt.setp(ax.yaxis.get_ticklabels(), rotation=0) fig.tight_layout() fig.savefig(os.path.join(outdir, "consistently_in_pair_only_%s.png" % c), dpi=200) # export those same genes to a file, adding gene symbols for_export = rnaseq_obj.data.loc[po_ref_diff, the_cols] general.add_gene_symbols_to_ensembl_data(for_export) for_export.to_excel(os.path.join(outdir, 'consistently_in_pair_only_%s.xlsx' % c)) # export the pair_only genes (GBM vs iNSC paired -- GBM vs ref) to a file print "Combination type: %s" % c po_de_export = {} for pid in pids: # get PO DE genes the_idx = pair_only.loc[pid, c] # subtract the correction genes the_corr = po_ref_diff print "Subtracting %d correction genes from %d PO DE genes to leave %d PO DE genes." % ( len(the_corr), len(the_idx), len(set(the_idx).difference(the_corr)) )
patient = matched_data.columns.str.replace('_.*', '').str.replace('DURA', '') library_prep = [ 'SS' if 'smartseq' in t else 'polyA' for t in matched_data.columns ] fit, design = differential_expression.edger_fit_glm( matched_data, de_method, '~patient + library_prep', patient=patient, library_prep=library_prep, ) de_res_ss = differential_expression.edger_test(fit, design, "library_prepSS") general.add_gene_symbols_to_ensembl_data(de_res_ss) # 2) separate comparisons SS vs polyA # restrict this to matching passage / subclone # use grouped dispersion approach (SS vs PolyA) and GLM method to avoid error when estimating DOF de_method = 'GLM' sample_pairs = { '019': ('DURA019_NSC_N8C_P2', 'DURA019_NSC_N8C_smartseq'), '031': ('DURA031_NSC_N44B_P2', 'DURA031_NSC_N44B_smartseq'), '049': ('DURA049_NSC_N19_P4', 'DURA049_NSC_N19_P4_smartseq'), '052': ('DURA052_NSC_N4_P3', 'DURA052_NSC_N4_P3_smartseq') } all_snames = [] [all_snames.extend(sample_pairs[p]) for p in pids] this_matched = matched_data.loc[:, all_snames]
# add null set manually from full DE results de_genes_all = setops.reduce_union(*venn_set.values()) k_null = ''.join(['0'] * len(pids)) venn_set[k_null] = list(de_res_full_s1[pids[0]].index.difference(de_genes_all)) venn_ct[k_null] = len(venn_set[k_null]) de_data = setops.venn_set_to_wide_dataframe(de_res_s1, venn_set, pids, full_data=de_res_full_s1, cols_to_include=['logFC', 'FDR'], consistency_check_col='logFC', consistency_check_method='sign') # add gene symbols back in general.add_gene_symbols_to_ensembl_data(de_data) de_data.to_excel(os.path.join(outdir, 'full_de.xlsx')) # load methylation data me_ff_obj, anno = tsgd.load_methylation(pids, type='ffpe', norm_method=norm_method_s1) me_cc_obj, anno = tsgd.load_methylation(pids, type='cell_culture', norm_method=norm_method_s1) # filter CC to include only iNSC me_cc_obj.filter_samples( me_cc_obj.meta.index.isin(consts.S1_METHYL_SAMPLES_INSC)) # FIXME: this is a bug in the loader? me_cc_obj.batch_id = me_cc_obj.meta.batch
for cmp in comparisons: lbl = "%s_vs_%s" % cmp jobs[lbl] = pool.apply_async(run_one_de, args=(dat, groups, cmp), kwds=de_params) # res[lbl] = run_one_de(dat, groups, cmp, **de_params) # print "%d DE genes\n" % (res[lbl].FDR <= de_params['fdr']).sum() for lbl in jobs: res[lbl] = jobs[lbl].get(1e6) print lbl print "%d DE genes\n" % (res[lbl].FDR <= de_params['fdr']).sum() for k, v in res.items(): general.add_gene_symbols_to_ensembl_data(v, tax_id=10090) res_sign[k] = v.loc[v.FDR <= de_params['fdr']] excel.pandas_to_excel(res, os.path.join(outdir, "mouse_GBM_NSC_DE_all.xlsx")) excel.pandas_to_excel( res_sign, os.path.join(outdir, "mouse_GBM_NSC_DE_significant.xlsx")) # finally, re-run with a lfc of zero # disabled for now to speed things up if False: de_params['lfc'] = 0 jobs2 = {} print "No logFC requirement"
scatter_colours = dict(zip(consts.PIDS, cmap)) scatter_markers = {'GBM': 's', 'iNSC': 'o'} # scaling parameter applied during SVD scale_preserved = 0.05 sample_colours = obj.meta.patient_id.map(scatter_colours.get).to_dict() sample_markers = obj.meta.type.map(scatter_markers.get).to_dict() dat = filter.filter_by_cpm(obj.data, min_n_samples=2) # TODO: include VST or similar here dat = np.log(dat + eps) # copy of dat with gene symbols dat_with_gs = dat.copy() general.add_gene_symbols_to_ensembl_data(dat_with_gs) # fill back in with ENS where no gene symbol is available dat_with_gs.loc[dat_with_gs['Gene Symbol'].isnull(), 'Gene Symbol'] = dat_with_gs.index[ dat_with_gs['Gene Symbol'].isnull()] # recreate Sven's original (unweighted) plot # we're not going to use this, it's just for validation / reproducibility fig, ax, _ = plot_biplot(dat, obj.meta, (0, 1), scatter_colours, scatter_markers, annotate_features_radius=0.4, include_weighting=False, scale=10.) fig.savefig(os.path.join(
def ens_index_to_gene_symbol(df):
    """
    Return a copy of ``df`` indexed by gene symbol instead of its current index.

    Rows without a gene symbol are dropped. Rows are selected with a boolean
    mask rather than by label, so an index containing repeated labels does
    not produce duplicated rows.

    Note that ``df`` itself is passed to
    ``general.add_gene_symbols_to_ensembl_data`` first; this function relies
    on that call leaving a 'Gene Symbol' column on ``df``.

    :param df: Table to re-index (presumably a pandas DataFrame indexed by
        Ensembl gene ID -- confirm against callers).
    :return: New table containing only the rows that have a gene symbol,
        with 'Gene Symbol' as the index.
    """
    general.add_gene_symbols_to_ensembl_data(df)
    has_symbol = df['Gene Symbol'].notnull()
    return df.loc[has_symbol].set_index('Gene Symbol')
from utils import output
import os
import numpy as np


if __name__ == "__main__":
    # Export CPM values (with a pseudocount of one) for a fixed list of FFPE
    # samples to an Excel file, annotated with gene symbols.
    min_cpm = 1

    samples = [
        'NH15_1661DEF2C',
        'NH15_1877_SP1C',
        'NH15_2101_DEF1A',
        'NH16_270_DEF1Ereplacement',
        'NH16_616DEF1B',
        'NH16_677_SP1A',
        'NH16_1574DEF1A',
        'NH16_1976_DEF1Areplacement',
        'NH16_2063_DEF1Areplacement',
        'NH16_2214DEF1A',
        'NH16_2255DEF1B2',
        'NH16_2806DEF3A1',
    ]

    obj = loader.load_by_patient('all', type='ffpe')

    # keep only the listed samples (original note: "remove duplicates"),
    # then apply the CPM filter to that subset
    dat = filter.filter_by_cpm(
        obj.data.loc[:, samples],
        min_cpm=min_cpm,
        min_n_samples=1
    )

    # one is added to every value and to every total before scaling to
    # counts per million
    padded_counts = dat + 1
    padded_totals = dat.sum() + 1
    cpm = padded_counts.divide(padded_totals, axis=1) * 1e6

    general.add_gene_symbols_to_ensembl_data(cpm)

    # NOTE(review): the directory name says "logcpm" but no log transform is
    # applied in this block -- confirm which one is intended
    outdir = output.unique_output_dir("ffpe_logcpm_values")
    out_fn = os.path.join(outdir, 'cpm_ffpe_filtered.xlsx')
    cpm.to_excel(out_fn)
treatment_colour = {'WT': '0.8', 'Rheb KO': '0.2'} outdir = output.unique_output_dir() # load our data obj_star = loader.load_references('wtchg_p190202', alignment_subdir='mouse', tax_id=10090) obj_salmon = loader.load_references('wtchg_p190202', alignment_subdir='mouse', source='salmon', tax_id=10090) # dump to file for sharing dat = obj_salmon.data.copy() general.add_gene_symbols_to_ensembl_data(dat, tax_id=10090) dat.to_excel(os.path.join(outdir, 'salmon_tpm_all_data.xlsx')) dat = obj_star.data.copy() general.add_gene_symbols_to_ensembl_data(dat, tax_id=10090) dat.to_excel(os.path.join(outdir, 'star_counts_all_data.xlsx')) # load Bowman data # plots with only our samples dat = filter.filter_by_cpm(obj_salmon.data, min_cpm=min_cpm, min_n_samples=2) log_dat = np.log10(obj_salmon.data + eps) # ECDF ax = rnaseq.log_cpm_ecdf_plot(dat, units='tpm', min_cpm=min_cpm)
y[aa == a, 1], facecolor=batch_colours[a], edgecolor='k', s=30, label=b ) ax.set_xlabel("PC1 (%.1f %%)" % (p.explained_variance_ratio_[0] * 100)) ax.set_ylabel("PC2 (%.1f %%)" % (p.explained_variance_ratio_[1] * 100)) ax.legend(loc='lower right', frameon=True, facecolor='w', framealpha=0.7) fig.tight_layout() fig.savefig(os.path.join(outdir, "pca_log_tpm.png"), dpi=200) # this looks almost the same! # log_tpm_qn = transformations.quantile_normalisation(np.log2(tpm + eps)) # cg = clustering.dendrogram_with_colours( # log_tpm_qn, # cc, # ) # export data cpm = obj_star.data.divide(obj_star.data.sum(), axis=1) * 1e6 counts = obj_star.data.copy() tpm = obj_salmon.data.copy() general.add_gene_symbols_to_ensembl_data(counts, tax_id=9606) general.add_gene_symbols_to_ensembl_data(cpm, tax_id=9606) general.add_gene_symbols_to_ensembl_data(tpm, tax_id=9606) counts.to_excel(os.path.join(outdir, "ICb1299_CRL3021.counts.xlsx")) cpm.to_excel(os.path.join(outdir, "ICb1299_CRL3021.cpm.xlsx")) tpm.to_excel(os.path.join(outdir, "ICb1299_CRL3021.tpm.xlsx")) obj_star.meta.to_excel(os.path.join(outdir, "ICb1299_CRL3021.meta.xlsx"))