ind = (ds.counts > args.n_cpm_min_genes[0]).sum( axis=1) >= args.n_cpm_min_genes[1] ds.counts = ds.counts.loc[ind] print('Ignore genes with multiple IDs') from collections import Counter genec = Counter(ds.featuresheet['GeneName'].values) genes_multiple = [k for k, v in genec.items() if v > 1] ds.featuresheet = ds.featuresheet.loc[~ds.featuresheet['GeneName']. isin(genes_multiple)] print('Translate to gene names') ds.rename(axis='features', column='GeneName', inplace=True) print('Restrict to virus genes') dsv = ds.query_features_by_metadata('Organism == "mCMV"') dsv.query_samples_by_metadata("moi in ('low', 'high')", inplace=True) print('Log counts') dsv.counts.log(inplace=True) print('Plot double hierarchical clustering') g = dsv.plot.clustermap( cluster_samples=True, cluster_features=True, labels_samples=False, annotate_samples={ 'moi': 'Set1', }, figsize=(12, 13), #colorbars=True,
with open('../../data/mouse_mCMV_1/virus_gene_modules_1.json', 'rt') as f: modules = json.load(f) ds.featuresheet['module'] = ds.featuresheet['Organism'] ds.featuresheet.loc[ds.featuresheet['Organism'] == 'mCMV', 'module'] = 'other_virus' ds.featuresheet.loc[modules['1'], 'module'] = 'early' ds.featuresheet.loc[modules['2'], 'module'] = 'mid' ds.featuresheet.loc[modules['3'], 'module'] = 'late' print('Get average expression') ds.featuresheet.loc[:, 'expression_geometric_mean'] = 10**( (np.log10(0.1 + ds.counts)).mean(axis=1)) - 0.1 print('Restrict to genes within the modules') dsg = ds.query_features_by_metadata( '(module in ("early", "mid", "late")) | (expression_geometric_mean > 100)' ) dsg.query_samples_by_metadata('moi in ("low", "high")', inplace=True) dsg.query_samples_by_metadata('virus_reads_per_million >= 100', inplace=True) print('Make metadata for module expression') dsg.samplesheet['exp_module_early'] = dsg.counts.loc[ ds.featuresheet['module'] == 'early'].sum(axis=0) dsg.samplesheet['exp_module_mid'] = dsg.counts.loc[ ds.featuresheet['module'] == 'mid'].sum(axis=0) dsg.samplesheet['exp_module_late'] = dsg.counts.loc[ ds.featuresheet['module'] == 'late'].sum(axis=0) corrs = dsg.correlation.correlate_features_phenotypes(phenotypes=[ 'exp_module_early', 'exp_module_mid', 'exp_module_late',
local_dict=locals(), inplace=True) if not args.keep2: print('Filter out sample 2-uninfected (low quality)') ds.query_samples_by_metadata('biosample != "2-uninfected"', inplace=True) print('Add normalized virus counts') ds.samplesheet['virus_reads_per_million'] = 1e6 * ds.samplesheet[ 'n_reads_virus'] / ds.samplesheet['n_reads'] ds.samplesheet['log_virus_reads_per_million'] = np.log10( 0.1 + ds.samplesheet['virus_reads_per_million']) print('Limit to decently expressed genes') ind = (ds.counts >= 10).sum(axis=1) >= 10 ds.featuresheet['detected_1010'] = ind.values print('Ignore genes with multiple IDs') from collections import Counter genec = Counter(ds.featuresheet['GeneName'].values) ind = [genec[gn] for gn in ds.featuresheet['GeneName'].values] ds.featuresheet['nGenes'] = ind ds.query_features_by_metadata( '((Organism == "mCMV") | detected_1010) & (nGenes == 1)', inplace=True) print('Translate to gene names') ds.featuresheet['EnsemblID'] = ds.featuresheet.index ds.rename(axis='features', column='GeneName', inplace=True)
# Load the virus gene modules: a JSON object whose keys '1', '2' and '3'
# are used below to index the featuresheet rows of each module.
with open('../../data/mouse_mCMV_1/virus_gene_modules_1.json', 'rt') as f:
    modules = json.load(f)

# Annotate every feature with a module label. The label starts as the
# organism, mCMV features default to 'other_virus', and features listed in
# the JSON modules are then overwritten with their module name.
ds.featuresheet['module'] = ds.featuresheet['Organism']
ds.featuresheet.loc[
    ds.featuresheet['Organism'] == 'mCMV', 'module'] = 'other_virus'
for module_key, module_name in (('1', 'early'), ('2', 'mid'), ('3', 'late')):
    ds.featuresheet.loc[modules[module_key], 'module'] = module_name

print('Get average expression')
# Geometric mean across axis 1 of ds.counts, computed in log10 space with a
# 0.1 pseudocount that is subtracted again afterwards.
ds.featuresheet.loc[:, 'expression_geometric_mean'] = 10**(
    (np.log10(0.1 + ds.counts)).mean(axis=1)) - 0.1

print('Restrict to genes within the modules')
# Keep module genes, any mCMV gene above 0.05, and any gene above 100.
dsg = ds.query_features_by_metadata(
    '(module in ("early", "mid", "late")) | '
    '((Organism == "mCMV") & (expression_geometric_mean > 0.05)) | '
    '(expression_geometric_mean > 100)'
)

print('Export to TSV')
fdn = '../../data/mouse_mCMV_1/to_Paolo/'
# Scale the counts by each sample's n_reads / 1e6 (presumably undoing a
# counts-per-million normalization -- confirm against the loading code).
# n_reads is taken from dsg's own samplesheet so that it covers exactly the
# columns of dsg.counts: a Series with extra samples would add all-NaN
# columns that fillna(0) then exports as zeros.
n_reads = dsg.samplesheet['n_reads']
# Round before casting: astype(int) truncates, so a value such as 2.9999999
# produced by floating-point error would otherwise be exported as 2.
co = (1e-6 * dsg.counts * n_reads).fillna(0).round().astype(int)
co.to_csv(
    fdn + 'counts_not_normalized_filtered.tsv',
    sep='\t',
    index=True,
)
dsg.samplesheet.to_csv(
    fdn + 'samplesheet_filtered.tsv',
    sep='\t',
    index=True,
)