Beispiel #1
0
    # Keep only the rows of the counts table whose value exceeds the first
    # threshold in at least as many columns as the second threshold demands.
    n_above_threshold = (ds.counts > args.n_cpm_min_genes[0]).sum(axis=1)
    ind = n_above_threshold >= args.n_cpm_min_genes[1]
    ds.counts = ds.counts.loc[ind]

    print('Ignore genes with multiple IDs')
    from collections import Counter
    # Tally how often every gene name occurs in the feature sheet; any name
    # seen more than once is removed from the feature sheet altogether.
    genec = Counter(ds.featuresheet['GeneName'].values)
    genes_multiple = [name for name, n_ids in genec.items() if n_ids > 1]
    has_multiple_ids = ds.featuresheet['GeneName'].isin(genes_multiple)
    ds.featuresheet = ds.featuresheet.loc[~has_multiple_ids]

    print('Translate to gene names')
    ds.rename(axis='features', column='GeneName', inplace=True)

    print('Restrict to virus genes')
    # The feature query result is bound to dsv; the sample filter that follows
    # is then applied to dsv in place.
    dsv = ds.query_features_by_metadata('Organism == "mCMV"')
    dsv.query_samples_by_metadata("moi in ('low', 'high')", inplace=True)

    print('Log counts')
    dsv.counts.log(inplace=True)

    print('Plot double hierarchical clustering')
    g = dsv.plot.clustermap(
        cluster_samples=True,
        cluster_features=True,
        labels_samples=False,
        annotate_samples={
            'moi': 'Set1',
        },
        figsize=(12, 13),
        #colorbars=True,
Beispiel #2
0
    # Load the virus gene modules; the JSON object is indexed below by the
    # string keys '1', '2' and '3'.
    with open('../../data/mouse_mCMV_1/virus_gene_modules_1.json', 'rt') as f:
        modules = json.load(f)

    # Seed the 'module' column with the organism label, relabel every mCMV
    # feature as 'other_virus', then overwrite the features listed in each
    # module with that module's label.
    ds.featuresheet['module'] = ds.featuresheet['Organism']
    is_virus = ds.featuresheet['Organism'] == 'mCMV'
    ds.featuresheet.loc[is_virus, 'module'] = 'other_virus'
    for module_key, module_label in (('1', 'early'), ('2', 'mid'), ('3', 'late')):
        ds.featuresheet.loc[modules[module_key], 'module'] = module_label

    print('Get average expression')
    # Geometric mean along axis 1 with a pseudocount of 0.1: average the log10
    # values, exponentiate, then subtract the pseudocount again.
    mean_log10 = np.log10(0.1 + ds.counts).mean(axis=1)
    ds.featuresheet.loc[:, 'expression_geometric_mean'] = 10**mean_log10 - 0.1

    print('Restrict to genes within the modules')
    feature_query = (
        '(module in ("early", "mid", "late")) | (expression_geometric_mean > 100)'
    )
    dsg = ds.query_features_by_metadata(feature_query)
    for sample_query in (
            'moi in ("low", "high")',
            'virus_reads_per_million >= 100',
    ):
        dsg.query_samples_by_metadata(sample_query, inplace=True)

    print('Make metadata for module expression')
    # One sample-level column per module: the sum over axis 0 of the counts of
    # the features assigned to that module.
    # NOTE(review): the mask is built from ds.featuresheet while it indexes
    # dsg.counts, so it relies on label alignment of the boolean mask; confirm
    # that dsg.featuresheet was not the intended source.
    for column, module_label in (
            ('exp_module_early', 'early'),
            ('exp_module_mid', 'mid'),
            ('exp_module_late', 'late'),
    ):
        in_module = ds.featuresheet['module'] == module_label
        dsg.samplesheet[column] = dsg.counts.loc[in_module].sum(axis=0)

    corrs = dsg.correlation.correlate_features_phenotypes(phenotypes=[
        'exp_module_early', 'exp_module_mid', 'exp_module_late',
Beispiel #3
0
                                 local_dict=locals(),
                                 inplace=True)

    if not args.keep2:
        print('Filter out sample 2-uninfected (low quality)')
        ds.query_samples_by_metadata(
            'biosample != "2-uninfected"',
            inplace=True,
        )

    print('Add normalized virus counts')
    # Virus reads scaled to one million total reads, plus a log10 version with
    # a pseudocount of 0.1 (log10 of an exact zero would be -inf).
    n_virus = ds.samplesheet['n_reads_virus']
    n_total = ds.samplesheet['n_reads']
    ds.samplesheet['virus_reads_per_million'] = 1e6 * n_virus / n_total
    ds.samplesheet['log_virus_reads_per_million'] = np.log10(
        0.1 + ds.samplesheet['virus_reads_per_million'])

    print('Limit to decently expressed genes')
    # Flag (rather than drop) the rows that have a count of 10 or more in at
    # least 10 columns; the flag is consumed by the feature query below.
    n_cols_at_least_10 = (ds.counts >= 10).sum(axis=1)
    ind = n_cols_at_least_10 >= 10
    ds.featuresheet['detected_1010'] = ind.values

    print('Ignore genes with multiple IDs')
    from collections import Counter
    # Annotate every feature with the number of times its gene name occurs in
    # the feature sheet, so that the query can keep only unique names.
    gene_names = ds.featuresheet['GeneName'].values
    genec = Counter(gene_names)
    ind = [genec[name] for name in gene_names]
    ds.featuresheet['nGenes'] = ind

    # Keep features that are either mCMV or flagged as detected, and whose
    # gene name occurs exactly once.
    feature_query = '((Organism == "mCMV") | detected_1010) & (nGenes == 1)'
    ds.query_features_by_metadata(feature_query, inplace=True)

    print('Translate to gene names')
    # Copy the current feature index into the 'EnsemblID' column before the
    # features are renamed by their 'GeneName' column.
    ds.featuresheet['EnsemblID'] = ds.featuresheet.index
    ds.rename(axis='features', column='GeneName', inplace=True)
Beispiel #4
0
    # Load the virus gene modules; the JSON object is indexed below by the
    # string keys '1', '2' and '3'.
    with open('../../data/mouse_mCMV_1/virus_gene_modules_1.json', 'rt') as f:
        modules = json.load(f)

    # Seed the 'module' column with the organism label, relabel every mCMV
    # feature as 'other_virus', then overwrite the features listed in each
    # module with that module's label.
    ds.featuresheet['module'] = ds.featuresheet['Organism']
    is_virus = ds.featuresheet['Organism'] == 'mCMV'
    ds.featuresheet.loc[is_virus, 'module'] = 'other_virus'
    for module_key, module_label in (('1', 'early'), ('2', 'mid'), ('3', 'late')):
        ds.featuresheet.loc[modules[module_key], 'module'] = module_label

    print('Get average expression')
    # Geometric mean along axis 1 with a pseudocount of 0.1: average the log10
    # values, exponentiate, then subtract the pseudocount again.
    mean_log10 = np.log10(0.1 + ds.counts).mean(axis=1)
    ds.featuresheet.loc[:, 'expression_geometric_mean'] = 10**mean_log10 - 0.1

    print('Restrict to genes within the modules')
    # Keep module genes, any mCMV feature with a geometric mean above 0.05,
    # and any feature with a geometric mean above 100.
    feature_query = (
        '(module in ("early", "mid", "late")) | ((Organism == "mCMV") & (expression_geometric_mean > 0.05)) | (expression_geometric_mean > 100)'
    )
    dsg = ds.query_features_by_metadata(feature_query)

    print('Export to TSV')
    # Output folder; both files below are written into it.
    fdn = '../../data/mouse_mCMV_1/to_Paolo/'
    # Scale the counts by n_reads / 1e6, replace missing values with 0 and
    # cast to integers. Presumably this converts per-million counts back to
    # raw read counts (the output file is named "not normalized") - TODO
    # confirm.
    # NOTE(review): `samplesheet` is a bare name that is not bound anywhere in
    # the visible code. If it is not defined earlier in the function this line
    # raises NameError; dsg.samplesheet may be what was intended - confirm.
    co = (1e-6 * dsg.counts * samplesheet['n_reads']).fillna(0).astype(int)
    # Tab-separated output with the row labels written as the first column.
    co.to_csv(
        fdn + 'counts_not_normalized_filtered.tsv',
        sep='\t',
        index=True,
    )
    # The sample metadata of the filtered dataset, in the same format.
    dsg.samplesheet.to_csv(
        fdn + 'samplesheet_filtered.tsv',
        sep='\t',
        index=True,
    )