def _as_pattern_list(discard_filter):
    """
    Normalise a discard filter to a list of patterns.
    A single string (str / bytes / unicode) or any non-iterable value is wrapped in a one-element list;
    any other iterable is converted to a list.
    """
    # Explicit string check: on Python 3 ``str`` defines ``__iter__``, so testing for that attribute alone
    # would split a pattern such as 'IPSC' into single characters.
    string_types = (type(''), type(b''), type(u''))
    if isinstance(discard_filter, string_types) or not hasattr(discard_filter, '__iter__'):
        return [discard_filter]
    return list(discard_filter)


def load_rnaseq(pids, ref_names, ref_name_filter='NSC', discard_filter='IPSC', strandedness=None):
    """
    Load the patient RNA-Seq data together with a number of reference datasets and combine them.
    :param pids: Patient IDs, passed on to rnaseq_loader.load_by_patient
    :param ref_names: Iterable of reference names, each one loaded with rnaseq_loader.load_references
    :param ref_name_filter: If not None, only reference samples whose name matches this pattern (as tested by
    pandas `str.contains`) are kept.
    :param discard_filter: If not None, a pattern or an iterable of patterns. Samples in the combined object whose
    name matches any of them (as tested by pandas `str.contains`) are removed.
    :param strandedness: Iterable of same length as ref_names giving the strandedness of each ref. If None, 'u' is
    used for every reference.
    :return: The combined loader.MultipleBatchLoader object
    :raises ValueError: If strandedness is supplied and its length differs from that of ref_names
    """
    if strandedness is None:
        strandedness = ['u'] * len(ref_names)
    elif len(strandedness) != len(ref_names):
        raise ValueError("Supplied strandedness must be a list of the same length as the ref_names.")

    # Load RNA-Seq from STAR
    obj = rnaseq_loader.load_by_patient(pids)

    # load additional references
    ref_objs = []
    for rn, strnd in zip(ref_names, strandedness):
        ref_obj = rnaseq_loader.load_references(rn, strandedness=strnd)
        if ref_name_filter is not None:
            # only keep relevant references
            ref_obj.meta = ref_obj.meta.loc[ref_obj.meta.index.str.contains(ref_name_filter)]
            ref_obj.data = ref_obj.data.loc[:, ref_obj.meta.index]
        ref_objs.append(ref_obj)

    obj = loader.MultipleBatchLoader([obj] + ref_objs)

    if discard_filter is not None:
        for d in _as_pattern_list(discard_filter):
            obj.meta = obj.meta.loc[~obj.meta.index.str.contains(d)]
            # keep data and batch labels in step with the reduced metadata
            obj.data = obj.data.loc[:, obj.meta.index]
            obj.batch_id = obj.batch_id.loc[obj.meta.index]

    return obj
def load_refs(ref_dict, **load_kwds):
    """
    Load every reference dataset named in ref_dict and return them as a single object.
    :param ref_dict: Dictionary keyed by reference name. Each value is itself a dictionary, which must contain the
    entry 'batch' (assigned to the loaded object's batch_id) and may contain per-reference keyword values.
    :param load_kwds: Keyword arguments passed to loader.load_references. Any argument whose value is the SetMe
    placeholder is replaced, for each reference, by the entry of the same name in that reference's dictionary
    (None if there is no such entry).
    :return: The loaded object itself if ref_dict has exactly one entry, otherwise a MultipleBatchLoader wrapping
    all of the loaded objects.
    """
    loaded = []
    for ref_name, ref_spec in ref_dict.items():
        # build this reference's keyword arguments, resolving placeholders from its own entry
        resolved_kwds = {
            kw: (ref_spec.get(kw) if val is SetMe else val)
            for kw, val in load_kwds.items()
        }
        ref_obj = loader.load_references(ref_name, **resolved_kwds)
        ref_obj.batch_id = ref_spec['batch']
        loaded.append(ref_obj)

    if len(loaded) == 1:
        return loaded[0]
    return loader.loader.MultipleBatchLoader(loaded)
cpm = dat.divide(dat.sum(), axis=1) * 1e6 else: cpm = dat.divide(dat.sum()) * 1e6 return np.log(cpm) / np.log(base) if __name__ == '__main__': pids = ['019', '031', '049', '052'] min_cpm = 1 min_cpm_individual = 0.1 outdir = output.unique_output_dir("james_opc_smartseq2_vs_polya") ## 1) STAR CPM estimates ss2_obj = loader.load_references('wtchg_p180059', strandedness='u') assigned_sum = ss2_obj.data.sum() unassigned_sum = ss2_obj.data_unassigned.drop('N_unmapped').sum() ss2_pct_assigned = assigned_sum / (assigned_sum + unassigned_sum) * 100. print "SmartSeq2 samples % assigned" print ss2_pct_assigned polya_obj = loader.load_by_patient(pids) # restrict to relevant samples for first part of the analysis idx = (polya_obj.meta.type == 'iNSC') polya_nsc_meta = polya_obj.meta.loc[idx] polya_nsc_data = polya_obj.data.loc[:, polya_nsc_meta.index] polya_nsc_unassigned = polya_obj.data_unassigned.loc[:,
c, p = the_func(many.loc[row], many.loc[col]) cor.loc[row, col] = c cor.loc[col, row] = c pval.loc[row, col] = p pval.loc[col, row] = p return cor, pval if __name__ == "__main__": min_tpm = 1. eps = 0.01 outdir = output.unique_output_dir() # load just 1st lane obj = loader.load_references('wtchg_p180443/180911_K00150_0372_AHWV7TBBXX', tax_id=10090, source='salmon') ix = (obj.meta.species == 'mouse') | (obj.meta.index.str.contains(r'[iI]MGL')) dat = obj.data.loc[:, ix] dat_filt = filter.filter_by_cpm(dat, min_cpm=min_tpm, unless_cpm_gt=10.) log_dat_filt = np.log2(dat_filt + eps) # correlation clustermap row_colours = pd.DataFrame('g', index=log_dat_filt.columns, columns=['Sample type']) row_colours.loc[row_colours.index.str.contains('mDURA')] = 'k' row_colours.loc[row_colours.index.str.contains( 'mDURA5_NSCmus_N3BE50.2')] = 'y' row_colours.loc[row_colours.index.str.contains('mDURA6_NSCmus')] = 'y'
'NH16_616DEF1B', 'NH16_677_SP1A', 'NH16_1574DEF1A', 'NH16_1976_DEF1Areplacement', 'NH16_2063_DEF1Areplacement', 'NH16_2214DEF1A', 'NH16_2255DEF1B2', 'NH16_2806DEF3A1', ] our_ffpe_obj = loader.load_by_patient('all', type='ffpe') our_ffpe_obj.meta = our_ffpe_obj.meta.loc[ffpe_samples] our_ffpe_obj.data = our_ffpe_obj.data[our_ffpe_obj.meta.index] # mouse data our_mouse_obj = loader.load_references(['wtchg_p170506', 'wtchg_p170390'], tax_id=10090, strandedness='r') # eliminate unneeded samples our_mouse_obj.meta = our_mouse_obj.meta.loc[ ~(our_mouse_obj.meta.index.str.contains('_1') | our_mouse_obj.meta.index.str.contains('CRL3034')) ] our_mouse_obj.data = our_mouse_obj.data[our_mouse_obj.meta.index.tolist()] # SS2 data ss2_obj = loader.load_references('wtchg_p180059', strandedness='u') ss2_obj.meta.index = ["%s_SS2" % t for t in ss2_obj.meta.index] ss2_obj.data.columns = ss2_obj.meta.index # define counts seprately for convenience polya_human_counts = our_patient_obj.data polya_mouse_counts = our_mouse_obj.data
'RTK II partial': '#d67373', 'MES partial': '#cc88ea', 'mixed': '#4C72B0', 'specific': '#f4e842', } min_cpm = 1 outdir = output.unique_output_dir("compare_de_gene_counts_s2", reuse_empty=True) # load our data obj = loader.load_by_patient(pids, include_control=True) # load reference data h9_obj = loader.load_references('GSE61794', tax_id=9606, source='star', strandedness='u') # h1_obj = loader.load_references('GSE38993', tax_id=9606, source='star', strandedness='u', samples=['H1 NSC']) # combine obj = loader.MultipleBatchLoader([obj, h9_obj]) # remove IPSC and rejected 061 samples for good idx = ( (~obj.meta.index.str.contains('IPSC')) & (~obj.meta.index.isin(['DURA061_NSC_N1_P5', 'DURA061_NSC_N6_P4'])) ) obj.meta = obj.meta.loc[idx] obj.data = obj.data.loc[:, idx] obj.batch_id = obj.batch_id.loc[idx] refs = ['GIBCO', 'H9']
if __name__ == '__main__': outdir = unique_output_dir("mouse_NSC_DE", reuse_empty=True) de_params = {'lfc': 1, 'fdr': 0.01, 'return_full': True} # load our data # all samples # samples = ['mDura%shuman' % i for i in ('3N1', '5N24A', '6N6')] # drop mDura3N1 as it is significantly different samples = ['mDura%shuman' % i for i in ('5N24A', '6N6')] obj1 = loader.load_references('wtchg_p170390', source='star', tax_id=10090, samples=samples, strandedness='r') obj1_s = loader.load_references('wtchg_p170390', source='salmon', tax_id=10090, samples=samples) samples = ['eNSC%dmed' % i for i in (3, 5, 6)] obj2 = loader.load_references('wtchg_p170506', source='star', tax_id=10090, samples=samples, strandedness='r') obj2_s = loader.load_references('wtchg_p170506', source='salmon',
# set gibco aside dat_gibco = obj.data.loc[:, obj.data.columns.str.contains('GIBCO')] dat_gibco = ens_index_to_gene_symbol(dat_gibco) # drop any cell types other than GBM and iNSC ix = obj.meta['type'].isin(['GBM', 'iNSC']) # drop unneeded GBM061 samples ix = ix & (~obj.meta.index.isin(['DURA061_NSC_N1_P5', 'DURA061_NSC_N6_P4'])) obj.filter_samples(ix) # convert to gene symbols dat = ens_index_to_gene_symbol(obj.data) # load reference dataset(s) ref_obj = loader.load_references('GSE61794', strandedness='u', source=source_by_units[units]) ix = ref_obj.meta.index.str.contains('NSC') ref_obj.filter_samples(ix) # convert to gene symbols dat_h9 = ens_index_to_gene_symbol(ref_obj.data) # write single patient syngeneic comparison data for pid in pids: the_idx = dat.columns.str.contains(pid) the_dat = dat.loc[:, the_idx] the_classes = pd.Series('GBM', index=the_dat.columns) the_classes.loc[the_classes.index.str.contains('NSC')] = 'iNSC' out_fn = os.path.join(out_subdir, "%s.{ext}" % (pid)) gsea.data_to_gct(the_dat, out_fn.format(ext='gct')) gsea.phenotypes_to_cls(the_classes, out_fn.format(ext='cls'))
from load_data import rnaseq_data from plotting import corr, clustering from rnaseq import loader from stats import transformations from utils import reference_genomes from utils.output import unique_output_dir if __name__ == "__main__": outdir = unique_output_dir('mouse_validation') # new code ref_obj = loader.load_references( ['GSE64411', 'GSE52564', 'GSE43916', 'GSE86248', 'GSE36114'], source='star', tax_id=10090, batch_names=[ 'GSE64411', 'GSE52564', 'GSE43916', 'GSE86248', 'GSE36114' ]) # old code # load mouse data obj = rnaseq_data.mouse_nsc_validation_samples( annotate_by='Ensembl Gene ID') # reorder for plotting niceness samples = ['eNSC%dmed' % i for i in (3, 5, 6)] \ + ['eNSC%dmouse' % i for i in (3, 5, 6)] \
if __name__ == '__main__': # parameters alpha = 0.05 min_logfc = 0 eps = 0.01 n_by_mad = 3000 min_cpm = 0.01 # for filtering purposes treatment_colour = {'WT': '0.8', 'Rheb KO': '0.2'} outdir = output.unique_output_dir() # load our data obj_star = loader.load_references('wtchg_p190202', alignment_subdir='mouse', tax_id=10090) obj_salmon = loader.load_references('wtchg_p190202', alignment_subdir='mouse', source='salmon', tax_id=10090) # dump to file for sharing dat = obj_salmon.data.copy() general.add_gene_symbols_to_ensembl_data(dat, tax_id=10090) dat.to_excel(os.path.join(outdir, 'salmon_tpm_all_data.xlsx')) dat = obj_star.data.copy() general.add_gene_symbols_to_ensembl_data(dat, tax_id=10090) dat.to_excel(os.path.join(outdir, 'star_counts_all_data.xlsx')) # load Bowman data
('Liu et al.', 'GSE96950'), ('Wapinski et al.', 'GSE43916'), ('Friedmann-Morvinski et al.', 'GSE73127'), ('Friedmann-Morvinski et al.', 'GSE64411/trimgalore'), ('Zhang et al.', 'GSE52564'), ('Chen et al.', 'GSE52125'), ('Yanez et al.', 'GSE88982'), ('Lynch', 'GSE78795'), ('Moyon et al.', 'GSE66029'), ('Schmid et al.', 'GSE75592'), ('Srinivasan et al.', 'GSE75246'), ] ref_objs = loader.load_references( [t[1] for t in ref_names], tax_id=10090, source='salmon', batch_names=[t[0] for t in ref_names], ) ref_obj = loader.load_references( [t[1] for t in ref_names], tax_id=10090, source='star', batch_names=[t[0] for t in ref_names], ) # remove unneeded samples ref_obj.meta = ref_obj.meta.loc[~ref_obj.meta.index.str. contains('Normal brain')] ref_obj.meta = ref_obj.meta.loc[~ref_obj.meta.index.str.contains('GBM')]
de_params = {'lfc': 1, 'fdr': 0.01, 'method': 'GLM'} subgroups = { 'RTK I': ['019', '030', '031'], 'RTK II': ['017', '050', '054'], } intersecter = lambda x, y: set(x).intersection(y) unioner = lambda x, y: set(x).union(y) # Load RNA-Seq from STAR rnaseq_obj = rnaseq_data.load_by_patient(pids, annotate_by='Ensembl Gene ID') # load additional references if required h9_obj = loader.load_references('gse61794') h1_obj = loader.load_references('gse38993') # h9_obj = rnaseq_data.gse61794(annotate_by='Ensembl Gene ID') # h1_obj = rnaseq_data.gse38993(annotate_by='Ensembl Gene ID') rnaseq_obj = rnaseq_data.MultipleBatchLoader([rnaseq_obj, h1_obj, h9_obj]) # discard unmapped, etc rnaseq_obj.data = rnaseq_obj.data.loc[rnaseq_obj.data.index.str.contains( 'ENSG')] rnaseq_obj.meta = rnaseq_obj.meta.loc[~rnaseq_obj.meta.index.str. contains('PSC')] rnaseq_obj.meta = rnaseq_obj.meta.loc[~rnaseq_obj.meta.index.str. contains('fibroblast')] rnaseq_obj.data = rnaseq_obj.data.loc[:, rnaseq_obj.meta.index] # load RNA-Seq from Salmon (for normalised comparison)
(components[1] + 1, variance_explained[components[1]])) return p, ax if __name__ == '__main__': min_val = 1 n_above_min = 3 n_gene_by_mad = 5000 eps = 0.01 # offset to use when applying log transform source = 'salmon' quantile_norm = None outdir = output.unique_output_dir() obj1 = loader.load_references('wtchg_p170390', source=source, tax_id=10090) samples = ['eNSC%dmouse' % i for i in (3, 5, 6)] \ + ['mDura%smouse' % i for i in ('3N1', '5N24A', '6N6')] \ + ['mDura%shuman' % i for i in ('3N1', '5N24A', '6N6')] obj1.filter_samples(obj1.meta.index.isin(samples)) obj2 = loader.load_references('wtchg_p170506', source=source, tax_id=10090) samples = ['eNSC%dmed' % i for i in (3, 5, 6)] obj2.filter_samples(obj2.meta.index.isin(samples)) obj3 = loader.load_references('wtchg_p180443', source=source, tax_id=10090) obj = loader.loader.MultipleBatchLoader([obj1, obj2, obj3]) the_dat = np.log2(obj.data + eps) if quantile_norm is not None:
row_per_fig = 5 # load TPM data cols_syn_gic = consts.S1_RNASEQ_SAMPLES_GIC cols_syn_insc = consts.S1_RNASEQ_SAMPLES_INSC cols_ref_nsc = [ 'GIBCO_NSC_P4', 'H9_NSC_1', 'H9_NSC_2' ] outdir = output.unique_output_dir() obj1 = loader.load_by_patient(pids, source='salmon', include_control=True) obj2 = loader.load_references('GSE61794', source='salmon') obj = loader.MultipleBatchLoader([obj1, obj2]) obj.filter_by_sample_name(consts.ALL_RNASEQ_SAMPLES) dat = obj.data.copy() general.add_gene_symbols_to_ensembl_data(dat) # check that all GOIs are present vc = dat['Gene Symbol'].value_counts() for g in gois: if g not in vc: raise KeyError("Gene %s was not found." % g) if vc[g] != 1: raise AttributeError("Gene %s has %d hits." % (g, vc[g])) # create plots in multiple figures
fig_kws={'figsize': (5.5, 10)}, vertical=False) d['fig'].savefig(os.path.join(outdir, fname_log.format(ext='png')), dpi=200) plt.draw() plt.close('all') # bring in reference data # IDs (if req), lab (appears in label), loader ref_dats = [ ( None, 'Barres et al.', loader.load_references('GSE73721', source='salmon', units=units), ), ( None, 'Caren et al.', loader.load_references('E-MTAB-3867', source='salmon', units=units), ), ( None, 'Yang et al.', loader.load_references('GSE80732', source='salmon', units=units), ), ( None, 'Shahbazi et al.',