'method': 'GLM' } subgroups = { 'RTK I': ['019', '030', '031'], 'RTK II': ['017', '050', '054'], } intersecter = lambda x, y: set(x).intersection(y) unioner = lambda x, y: set(x).union(y) # Load RNA-Seq from STAR rnaseq_obj = rnaseq_data.load_by_patient(pids, annotate_by='Ensembl Gene ID') # load additional references if required h9_obj = rnaseq_data.gse61794(annotate_by='Ensembl Gene ID') h1_obj = rnaseq_data.gse38993(annotate_by='Ensembl Gene ID') rnaseq_obj = rnaseq_data.MultipleBatchLoader([rnaseq_obj, h1_obj, h9_obj]) # discard unmapped, etc rnaseq_obj.data = rnaseq_obj.data.loc[rnaseq_obj.data.index.str.contains('ENSG')] rnaseq_obj.meta = rnaseq_obj.meta.loc[~rnaseq_obj.meta.index.str.contains('PSC')] rnaseq_obj.meta = rnaseq_obj.meta.loc[~rnaseq_obj.meta.index.str.contains('fibroblast')] rnaseq_obj.data = rnaseq_obj.data.loc[:, rnaseq_obj.meta.index] # load RNA-Seq from Salmon (for normalised comparison) # disabled for now if False: salmon_dat = rnaseq_data.load_salmon_by_patient_id(pids) idx = salmon_dat.index.str.replace(r'.[0-9]+$', '') salmon_dat.index = idx
import os
from load_data import rnaseq_data
import pandas as pd
from rnaseq import filter
from matplotlib import pyplot as plt
from plotting import clustering
from stats.transformations import (
    median_absolute_deviation,
    variance_stabilizing_transform,
)
import numpy as np
from utils.output import unique_output_dir


if __name__ == "__main__":
    outdir = unique_output_dir("compare_gibco_h9")

    # both datasets are loaded with Ensembl gene ID annotation
    loader_hgic = rnaseq_data.all_hgic_loader(annotate_by='Ensembl Gene ID')
    loader_h9 = rnaseq_data.gse61794(annotate_by='Ensembl Gene ID')

    # rows shared by the two datasets, restricted to labels containing 'ENSG'
    genes = loader_hgic.data.index.intersection(loader_h9.data.index)
    is_ensg = genes.str.contains('ENSG')
    genes = genes[is_ensg]

    # collapse H9 replicates: sum across columns, then subset to shared rows
    h9_summed = loader_h9.data.sum(axis=1)
    h9_data = h9_summed.loc[genes]
    h9_data.name = 'H9_NSC'

    # a single metadata record describing the collapsed H9 sample
    h9_total_reads = sum(loader_h9.meta.read_count)
    h9_meta_fields = {
        'type': 'NSC',
        'read_count': h9_total_reads,
        'sample': 'H9_NSC',
        'disease_subgroup': 'control',
    }
    h9_meta = pd.Series(h9_meta_fields, name='H9_NSC')

    # append the collapsed H9 column to the shared-row HGIC data
    hgic_shared = loader_hgic.data.loc[genes]
    data = pd.concat((hgic_shared, h9_data), axis=1)
'SLC1A3': 4170, } OUTDIR = unique_output_dir("jb.marker_levels", reuse_empty=True) # GSE73721 (reference astrocytes, oligos, ...) obj73721 = rnaseq_data.gse73721(source='star', annotate_by='Ensembl Gene ID') # remove unneeded samples to_keep73721 = (obj73721.data.columns.str.contains('yo ctx astro') | obj73721.data.columns.str.contains('Hippocampus astro') | obj73721.data.columns.str.contains('oligo')) # GSE61794 (H9-derived NSC x 2) obj61794 = rnaseq_data.gse61794(source='star', annotate_by='Ensembl Gene ID') # combining replicates rc = obj61794.meta.read_count.sum() obj61794.meta = pd.DataFrame(data={ 'cell_type': 'NSC', 'srr': 'SRR1586371-2', 'read_count': rc, 'sample': 'H9 NSC', }, index=['SRR1586371-2']) obj61794.data = pd.DataFrame(obj61794.data.sum(axis=1), columns=['H9 NSC']) # WTCHG ALL samples objwtchg_all = rnaseq_data.all_hgic_loader(annotate_by='Ensembl Gene ID', include_derived=True) to_keep_wtchg = (
} intersecter = lambda x, y: set(x).intersection(y) unioner = lambda x, y: set(x).union(y) if njob != 1: pool = mp.Pool(njob) # Load RNA-Seq from STAR rnaseq_obj = rnaseq_data.load_by_patient(pids, annotate_by='Ensembl Gene ID') # load additional references if required refs = [('H1', rnaseq_data.gse38993(annotate_by='Ensembl Gene ID')), ('H9', rnaseq_data.gse61794(annotate_by='Ensembl Gene ID', collapse_replicates=False))] all_refs = [t[0] for t in refs] + ['GIBCO'] rnaseq_obj = rnaseq_data.MultipleBatchLoader([rnaseq_obj] + [t[1] for t in refs]) # only keep gene counts rnaseq_obj.data = rnaseq_obj.data.loc[rnaseq_obj.data.index.str.contains( 'ENSG')] # discard iPSC rnaseq_obj.meta = rnaseq_obj.meta.loc[~rnaseq_obj.meta.index.str. contains('PSC')] rnaseq_obj.meta = rnaseq_obj.meta.loc[~rnaseq_obj.meta.index.str. contains('NHF1-hTERT')] rnaseq_obj.meta = rnaseq_obj.meta.loc[~rnaseq_obj.meta.index.str.