Exemple #1
0
def load_rnaseq(pids, ref_names, ref_name_filter='NSC', discard_filter='IPSC', strandedness=None):
    """
    Load patient RNA-Seq data plus a set of reference datasets and combine them into one object.

    Patient data come from ``rnaseq_loader.load_by_patient`` and each reference from
    ``rnaseq_loader.load_references``. The results are wrapped in a ``loader.MultipleBatchLoader``.

    :param pids: Patient IDs, passed unchanged to ``rnaseq_loader.load_by_patient``.
    :param ref_names: Sequence of reference names, each passed to ``rnaseq_loader.load_references``.
    :param ref_name_filter: If not None, only reference samples whose name matches this pattern (via pandas
    ``str.contains``) are kept.
    :param discard_filter: If not None, a single pattern or an iterable of patterns. Samples in the combined
    object whose name matches any of them (via pandas ``str.contains``) are dropped.
    :param strandedness: Iterable of same length as ref_names giving the strandedness of each ref. If None,
    'u' is used for every reference.
    :return: The combined object, with ``meta``, ``data`` and ``batch_id`` restricted to the retained samples.
    :raises ValueError: If strandedness is supplied and its length differs from that of ref_names.
    """
    if strandedness is None:
        strandedness = ['u'] * len(ref_names)
    elif len(strandedness) != len(ref_names):
        raise ValueError("Supplied strandedness must be a list of the same length as the ref_names.")

    # Load RNA-Seq from STAR
    obj = rnaseq_loader.load_by_patient(pids)

    # load additional references
    ref_objs = []
    for rn, strnd in zip(ref_names, strandedness):
        ref_obj = rnaseq_loader.load_references(rn, strandedness=strnd)
        if ref_name_filter is not None:
            # only keep relevant references
            ref_obj.meta = ref_obj.meta.loc[ref_obj.meta.index.str.contains(ref_name_filter)]
            ref_obj.data = ref_obj.data.loc[:, ref_obj.meta.index]
        ref_objs.append(ref_obj)
    obj = loader.MultipleBatchLoader([obj] + ref_objs)

    if discard_filter is not None:
        # A bare string is ONE pattern. Checking for `__iter__` alone is not enough: Python 3 strings
        # have it, so the string would be looped over character by character and far too many samples
        # would be discarded. The hasattr test is kept for non-iterable values and Python 2 unicode.
        if isinstance(discard_filter, (str, bytes)) or not hasattr(discard_filter, '__iter__'):
            discard_filter = [discard_filter]
        for d in discard_filter:
            obj.meta = obj.meta.loc[~obj.meta.index.str.contains(d)]
            obj.data = obj.data.loc[:, obj.meta.index]

    # keep the batch labels in step with the samples that survived filtering
    obj.batch_id = obj.batch_id.loc[obj.meta.index]

    return obj
Exemple #2
0
def load_refs(ref_dict, **load_kwds):
    """
    Load every reference named in ref_dict and attach its batch label.

    :param ref_dict: Mapping of reference name -> per-reference settings. Each settings object must support
    ``['batch']`` and, when placeholders are used, ``.get(name)``.
    :param load_kwds: Keyword arguments forwarded to ``loader.load_references``. Any argument whose value is the
    ``SetMe`` placeholder is replaced, per reference, by ``settings.get(<argument name>)``.
    :return: The single loaded object when exactly one reference was loaded, otherwise a
    ``loader.loader.MultipleBatchLoader`` built from all of them.
    """
    loaded = []

    for ref_name, ref_settings in ref_dict.items():
        # fresh keyword dict per reference, with placeholders resolved from this reference's settings
        resolved_kwds = {
            name: (ref_settings.get(name) if value is SetMe else value)
            for name, value in load_kwds.items()
        }
        ref_obj = loader.load_references(ref_name, **resolved_kwds)
        ref_obj.batch_id = ref_settings['batch']
        loaded.append(ref_obj)

    if len(loaded) != 1:
        return loader.loader.MultipleBatchLoader(loaded)
    return loaded[0]
        cpm = dat.divide(dat.sum(), axis=1) * 1e6
    else:
        cpm = dat.divide(dat.sum()) * 1e6
    return np.log(cpm) / np.log(base)


if __name__ == '__main__':
    # Script entry point: print the percentage assigned for the SmartSeq2 reference data
    # (assigned / (assigned + unassigned) * 100), then load the patient data and select the iNSC samples.
    pids = ['019', '031', '049', '052']
    min_cpm = 1
    min_cpm_individual = 0.1

    outdir = output.unique_output_dir("james_opc_smartseq2_vs_polya")

    ## 1) STAR CPM estimates

    ss2_obj = loader.load_references('wtchg_p180059', strandedness='u')
    assigned_sum = ss2_obj.data.sum()
    # the 'N_unmapped' entry is left out of the unassigned total
    unassigned_sum = ss2_obj.data_unassigned.drop('N_unmapped').sum()

    ss2_pct_assigned = assigned_sum / (assigned_sum + unassigned_sum) * 100.

    # Parenthesised single-argument print: same output on Python 2, and valid syntax on Python 3
    # (the bare print statement is a SyntaxError there).
    print("SmartSeq2 samples % assigned")
    print(ss2_pct_assigned)

    polya_obj = loader.load_by_patient(pids)

    # restrict to relevant samples for first part of the analysis
    idx = (polya_obj.meta.type == 'iNSC')
    polya_nsc_meta = polya_obj.meta.loc[idx]
    polya_nsc_data = polya_obj.data.loc[:, polya_nsc_meta.index]
    polya_nsc_unassigned = polya_obj.data_unassigned.loc[:,
            c, p = the_func(many.loc[row], many.loc[col])
            cor.loc[row, col] = c
            cor.loc[col, row] = c
            pval.loc[row, col] = p
            pval.loc[col, row] = p

    return cor, pval


if __name__ == "__main__":
    # Script entry point: load one reference dataset, keep a subset of its samples, filter and
    # log2-transform the values, then build a per-sample colour table.
    min_tpm = 1.  # passed to filter.filter_by_cpm as min_cpm
    eps = 0.01  # added to the data before taking log2
    outdir = output.unique_output_dir()
    # load just 1st lane
    obj = loader.load_references('wtchg_p180443/180911_K00150_0372_AHWV7TBBXX',
                                 tax_id=10090,
                                 source='salmon')
    # keep samples whose species is 'mouse', plus any whose name contains 'iMGL' or 'IMGL'
    ix = (obj.meta.species
          == 'mouse') | (obj.meta.index.str.contains(r'[iI]MGL'))
    dat = obj.data.loc[:, ix]
    dat_filt = filter.filter_by_cpm(dat, min_cpm=min_tpm, unless_cpm_gt=10.)
    log_dat_filt = np.log2(dat_filt + eps)

    # correlation clustermap
    # Colour table: one row per sample, 'g' by default. The assignments below run in order, so a
    # sample matching several patterns ends up with the colour of the last match.
    row_colours = pd.DataFrame('g',
                               index=log_dat_filt.columns,
                               columns=['Sample type'])
    row_colours.loc[row_colours.index.str.contains('mDURA')] = 'k'
    # NOTE(review): str.contains treats its pattern as a regex by default, so the '.' below
    # matches any character - confirm that is acceptable.
    row_colours.loc[row_colours.index.str.contains(
        'mDURA5_NSCmus_N3BE50.2')] = 'y'
    row_colours.loc[row_colours.index.str.contains('mDURA6_NSCmus')] = 'y'
Exemple #5
0
        'NH16_616DEF1B',
        'NH16_677_SP1A',
        'NH16_1574DEF1A',
        'NH16_1976_DEF1Areplacement',
        'NH16_2063_DEF1Areplacement',
        'NH16_2214DEF1A',
        'NH16_2255DEF1B2',
        'NH16_2806DEF3A1',
    ]
    our_ffpe_obj = loader.load_by_patient('all', type='ffpe')
    our_ffpe_obj.meta = our_ffpe_obj.meta.loc[ffpe_samples]
    our_ffpe_obj.data = our_ffpe_obj.data[our_ffpe_obj.meta.index]

    # mouse data

    our_mouse_obj = loader.load_references(['wtchg_p170506', 'wtchg_p170390'], tax_id=10090, strandedness='r')
    # eliminate unneeded samples
    our_mouse_obj.meta = our_mouse_obj.meta.loc[
        ~(our_mouse_obj.meta.index.str.contains('_1') | our_mouse_obj.meta.index.str.contains('CRL3034'))
    ]
    our_mouse_obj.data = our_mouse_obj.data[our_mouse_obj.meta.index.tolist()]

    # SS2 data
    ss2_obj = loader.load_references('wtchg_p180059', strandedness='u')
    ss2_obj.meta.index = ["%s_SS2" % t for t in ss2_obj.meta.index]
    ss2_obj.data.columns = ss2_obj.meta.index


    # define counts separately for convenience
    polya_human_counts = our_patient_obj.data
    polya_mouse_counts = our_mouse_obj.data
Exemple #6
0
        'RTK II partial': '#d67373',
        'MES partial': '#cc88ea',
        'mixed': '#4C72B0',
        'specific': '#f4e842',
    }

    min_cpm = 1

    outdir = output.unique_output_dir("compare_de_gene_counts_s2", reuse_empty=True)

    # load our data

    obj = loader.load_by_patient(pids, include_control=True)

    # load reference data
    h9_obj = loader.load_references('GSE61794', tax_id=9606, source='star', strandedness='u')
    # h1_obj = loader.load_references('GSE38993', tax_id=9606, source='star', strandedness='u', samples=['H1 NSC'])

    # combine
    obj = loader.MultipleBatchLoader([obj, h9_obj])

    # remove IPSC and rejected 061 samples for good
    idx = (
        (~obj.meta.index.str.contains('IPSC'))
        & (~obj.meta.index.isin(['DURA061_NSC_N1_P5', 'DURA061_NSC_N6_P4']))
    )
    obj.meta = obj.meta.loc[idx]
    obj.data = obj.data.loc[:, idx]
    obj.batch_id = obj.batch_id.loc[idx]

    refs = ['GIBCO', 'H9']
if __name__ == '__main__':
    # Script entry point: load selected samples from two reference datasets ('wtchg_p170390' and
    # 'wtchg_p170506').
    outdir = unique_output_dir("mouse_NSC_DE", reuse_empty=True)

    # presumably settings for a later differential expression step that is not visible here - TODO confirm
    de_params = {'lfc': 1, 'fdr': 0.01, 'return_full': True}

    # load our data

    # all samples
    # samples = ['mDura%shuman' % i for i in ('3N1', '5N24A', '6N6')]

    # drop mDura3N1 as it is significantly different
    samples = ['mDura%shuman' % i for i in ('5N24A', '6N6')]

    # same dataset and sample list loaded from two sources: 'star' (with strandedness='r') and 'salmon'
    obj1 = loader.load_references('wtchg_p170390',
                                  source='star',
                                  tax_id=10090,
                                  samples=samples,
                                  strandedness='r')
    obj1_s = loader.load_references('wtchg_p170390',
                                    source='salmon',
                                    tax_id=10090,
                                    samples=samples)

    # `samples` is rebound here: the list above applies only to the first dataset
    samples = ['eNSC%dmed' % i for i in (3, 5, 6)]
    obj2 = loader.load_references('wtchg_p170506',
                                  source='star',
                                  tax_id=10090,
                                  samples=samples,
                                  strandedness='r')
    obj2_s = loader.load_references('wtchg_p170506',
                                    source='salmon',
    # set gibco aside
    dat_gibco = obj.data.loc[:, obj.data.columns.str.contains('GIBCO')]
    dat_gibco = ens_index_to_gene_symbol(dat_gibco)

    # drop any cell types other than GBM and iNSC
    ix = obj.meta['type'].isin(['GBM', 'iNSC'])
    # drop unneeded GBM061 samples
    ix = ix & (~obj.meta.index.isin(['DURA061_NSC_N1_P5', 'DURA061_NSC_N6_P4']))
    obj.filter_samples(ix)

    # convert to gene symbols
    dat = ens_index_to_gene_symbol(obj.data)

    # load reference dataset(s)
    ref_obj = loader.load_references('GSE61794', strandedness='u', source=source_by_units[units])
    ix = ref_obj.meta.index.str.contains('NSC')
    ref_obj.filter_samples(ix)

    # convert to gene symbols
    dat_h9 = ens_index_to_gene_symbol(ref_obj.data)

    # write single patient syngeneic comparison data
    for pid in pids:
        the_idx = dat.columns.str.contains(pid)
        the_dat = dat.loc[:, the_idx]
        the_classes = pd.Series('GBM', index=the_dat.columns)
        the_classes.loc[the_classes.index.str.contains('NSC')] = 'iNSC'
        out_fn = os.path.join(out_subdir, "%s.{ext}" % (pid))
        gsea.data_to_gct(the_dat, out_fn.format(ext='gct'))
        gsea.phenotypes_to_cls(the_classes, out_fn.format(ext='cls'))
Exemple #9
0
from load_data import rnaseq_data
from plotting import corr, clustering
from rnaseq import loader
from stats import transformations
from utils import reference_genomes
from utils.output import unique_output_dir

if __name__ == "__main__":
    # Script entry point: load reference data through two different code paths (`loader` and `rnaseq_data`).
    outdir = unique_output_dir('mouse_validation')

    # new code
    # five references loaded in one call; batch_names repeats the same identifiers in the same order
    ref_obj = loader.load_references(
        ['GSE64411', 'GSE52564', 'GSE43916', 'GSE86248', 'GSE36114'],
        source='star',
        tax_id=10090,
        batch_names=[
            'GSE64411', 'GSE52564', 'GSE43916', 'GSE86248', 'GSE36114'
        ])

    # old code

    # load mouse data

    obj = rnaseq_data.mouse_nsc_validation_samples(
        annotate_by='Ensembl Gene ID')

    # reorder for plotting niceness

    samples = ['eNSC%dmed' % i for i in (3, 5, 6)] \
              + ['eNSC%dmouse' % i for i in (3, 5, 6)] \
if __name__ == '__main__':
    # Script entry point: load the 'wtchg_p190202' dataset twice (default source and 'salmon') and
    # write each data table to an Excel file in the output directory.
    # parameters
    alpha = 0.05
    min_logfc = 0
    eps = 0.01
    n_by_mad = 3000
    min_cpm = 0.01  # for filtering purposes

    # string values; presumably matplotlib greyscale colour specs - TODO confirm where they are used
    treatment_colour = {'WT': '0.8', 'Rheb KO': '0.2'}

    outdir = output.unique_output_dir()

    # load our data
    # no `source` is passed for obj_star, so the loader's default applies
    obj_star = loader.load_references('wtchg_p190202',
                                      alignment_subdir='mouse',
                                      tax_id=10090)
    obj_salmon = loader.load_references('wtchg_p190202',
                                        alignment_subdir='mouse',
                                        source='salmon',
                                        tax_id=10090)

    # dump to file for sharing
    # Work on copies so the loader objects' own data are left untouched. The return value of
    # add_gene_symbols_to_ensembl_data is discarded, so it presumably modifies `dat` in place - TODO confirm.
    dat = obj_salmon.data.copy()
    general.add_gene_symbols_to_ensembl_data(dat, tax_id=10090)
    dat.to_excel(os.path.join(outdir, 'salmon_tpm_all_data.xlsx'))
    dat = obj_star.data.copy()
    general.add_gene_symbols_to_ensembl_data(dat, tax_id=10090)
    dat.to_excel(os.path.join(outdir, 'star_counts_all_data.xlsx'))

    # load Bowman data
Exemple #11
0
        ('Liu et al.', 'GSE96950'),
        ('Wapinski et al.', 'GSE43916'),
        ('Friedmann-Morvinski et al.', 'GSE73127'),
        ('Friedmann-Morvinski et al.', 'GSE64411/trimgalore'),
        ('Zhang et al.', 'GSE52564'),
        ('Chen et al.', 'GSE52125'),
        ('Yanez et al.', 'GSE88982'),
        ('Lynch', 'GSE78795'),
        ('Moyon et al.', 'GSE66029'),
        ('Schmid et al.', 'GSE75592'),
        ('Srinivasan et al.', 'GSE75246'),
    ]

    ref_objs = loader.load_references(
        [t[1] for t in ref_names],
        tax_id=10090,
        source='salmon',
        batch_names=[t[0] for t in ref_names],
    )

    ref_obj = loader.load_references(
        [t[1] for t in ref_names],
        tax_id=10090,
        source='star',
        batch_names=[t[0] for t in ref_names],
    )

    # remove unneeded samples

    ref_obj.meta = ref_obj.meta.loc[~ref_obj.meta.index.str.
                                    contains('Normal brain')]
    ref_obj.meta = ref_obj.meta.loc[~ref_obj.meta.index.str.contains('GBM')]
Exemple #12
0
    de_params = {'lfc': 1, 'fdr': 0.01, 'method': 'GLM'}

    subgroups = {
        'RTK I': ['019', '030', '031'],
        'RTK II': ['017', '050', '054'],
    }

    intersecter = lambda x, y: set(x).intersection(y)
    unioner = lambda x, y: set(x).union(y)

    # Load RNA-Seq from STAR
    rnaseq_obj = rnaseq_data.load_by_patient(pids,
                                             annotate_by='Ensembl Gene ID')

    # load additional references if required
    h9_obj = loader.load_references('gse61794')
    h1_obj = loader.load_references('gse38993')
    # h9_obj = rnaseq_data.gse61794(annotate_by='Ensembl Gene ID')
    # h1_obj = rnaseq_data.gse38993(annotate_by='Ensembl Gene ID')
    rnaseq_obj = rnaseq_data.MultipleBatchLoader([rnaseq_obj, h1_obj, h9_obj])

    # discard unmapped, etc
    rnaseq_obj.data = rnaseq_obj.data.loc[rnaseq_obj.data.index.str.contains(
        'ENSG')]
    rnaseq_obj.meta = rnaseq_obj.meta.loc[~rnaseq_obj.meta.index.str.
                                          contains('PSC')]
    rnaseq_obj.meta = rnaseq_obj.meta.loc[~rnaseq_obj.meta.index.str.
                                          contains('fibroblast')]
    rnaseq_obj.data = rnaseq_obj.data.loc[:, rnaseq_obj.meta.index]

    # load RNA-Seq from Salmon (for normalised comparison)
Exemple #13
0
                  (components[1] + 1, variance_explained[components[1]]))

    return p, ax


if __name__ == '__main__':
    # Script entry point: load three reference datasets from the same source, restrict the first two
    # to named samples, combine them and log2-transform the combined data.
    min_val = 1
    n_above_min = 3
    n_gene_by_mad = 5000
    eps = 0.01  # offset to use when applying log transform
    source = 'salmon'
    quantile_norm = None

    outdir = output.unique_output_dir()

    obj1 = loader.load_references('wtchg_p170390', source=source, tax_id=10090)
    # sample names to keep from the first dataset, built from three name templates
    samples = ['eNSC%dmouse' % i for i in (3, 5, 6)] \
    + ['mDura%smouse' % i for i in ('3N1', '5N24A', '6N6')] \
    + ['mDura%shuman' % i for i in ('3N1', '5N24A', '6N6')]
    obj1.filter_samples(obj1.meta.index.isin(samples))

    obj2 = loader.load_references('wtchg_p170506', source=source, tax_id=10090)
    # `samples` is rebound: this list applies to the second dataset only
    samples = ['eNSC%dmed' % i for i in (3, 5, 6)]
    obj2.filter_samples(obj2.meta.index.isin(samples))

    # the third dataset is used without any sample filtering
    obj3 = loader.load_references('wtchg_p180443', source=source, tax_id=10090)

    obj = loader.loader.MultipleBatchLoader([obj1, obj2, obj3])

    the_dat = np.log2(obj.data + eps)
    if quantile_norm is not None:
    row_per_fig = 5

    # load TPM data
    cols_syn_gic = consts.S1_RNASEQ_SAMPLES_GIC
    cols_syn_insc = consts.S1_RNASEQ_SAMPLES_INSC
    cols_ref_nsc = [
        'GIBCO_NSC_P4',
        'H9_NSC_1',
        'H9_NSC_2'
    ]

    outdir = output.unique_output_dir()

    obj1 = loader.load_by_patient(pids, source='salmon', include_control=True)
    obj2 = loader.load_references('GSE61794', source='salmon')
    obj = loader.MultipleBatchLoader([obj1, obj2])
    obj.filter_by_sample_name(consts.ALL_RNASEQ_SAMPLES)

    dat = obj.data.copy()
    general.add_gene_symbols_to_ensembl_data(dat)

    # check that all GOIs are present
    vc = dat['Gene Symbol'].value_counts()
    for g in gois:
        if g not in vc:
            raise KeyError("Gene %s was not found." % g)
        if vc[g] != 1:
            raise AttributeError("Gene %s has %d hits." % (g, vc[g]))

    # create plots in multiple figures
Exemple #15
0
            fig_kws={'figsize': (5.5, 10)},
            vertical=False)
        d['fig'].savefig(os.path.join(outdir, fname_log.format(ext='png')),
                         dpi=200)

    plt.draw()
    plt.close('all')

    # bring in reference data
    # IDs (if req), lab (appears in label), loader

    ref_dats = [
        (
            None,
            'Barres et al.',
            loader.load_references('GSE73721', source='salmon', units=units),
        ),
        (
            None,
            'Caren et al.',
            loader.load_references('E-MTAB-3867', source='salmon',
                                   units=units),
        ),
        (
            None,
            'Yang et al.',
            loader.load_references('GSE80732', source='salmon', units=units),
        ),
        (
            None,
            'Shahbazi et al.',