Beispiel #1
0
    # NOTE: the location of the singlet config file has to be provided via
    # an environment variable when this script is invoked.
    from singlet.dataset import Dataset
    ds = Dataset(
        samplesheet='example_sheet_tsv',
        counts_table='example_table_tsv',
    )

    # 1. Cluster the samples and check the complete leaf ordering.
    print('Hierarchical clustering of samples')
    d = ds.cluster.hierarchical('samples', optimal_ordering=True)
    expected_sample_leaves = (
        'second_sample',
        'test_pipeline',
        'first_sample',
        'third_sample',
    )
    assert tuple(d['leaves']) == expected_sample_leaves
    print('Done!')

    # 2. Cluster the features; only the first 200 rows of the counts table
    # are kept, and only the first three leaves are checked.
    print('Hierarchical clustering of features')
    ds.counts = ds.counts.iloc[:200]
    d = ds.cluster.hierarchical('features', optimal_ordering=True)
    first_three_leaves = tuple(d['leaves'])[:3]
    assert first_three_leaves == ('PNPLA4', 'ITGAL', 'HOXA11')
    print('Done!')

    # 3. Cluster the features together with one phenotype and check the
    # position at which that phenotype ends up among the leaves.
    print('Hierarchical clustering of features and phenotypes')
    ds.counts = ds.counts.iloc[:200]
    d = ds.cluster.hierarchical(
        axis='features',
        phenotypes=('quantitative_phenotype_1_[A.U.]',),
        optimal_ordering=True,
    )
    assert d['leaves'][23] == 'quantitative_phenotype_1_[A.U.]'
    print('Done!')
Beispiel #2
0
    # Add two derived columns to the samplesheet: virus reads per million
    # total reads, and its log10 (offset by 0.1 so that a value of zero maps
    # to -1 instead of -inf).
    print('Add normalized virus counts')
    ds.samplesheet['virus_reads_per_million'] = (
        1e6 * ds.samplesheet['n_reads_virus'] / ds.samplesheet['n_reads']
    )
    ds.samplesheet['log_virus_reads_per_million'] = np.log10(
        0.1 + ds.samplesheet['virus_reads_per_million']
    )

    # The query string refers to n_reads_min through '@', so the threshold
    # has to exist as a local name and locals() is handed to the query.
    print('Filter low-quality cells')
    n_reads_min = args.n_reads_min
    ds.query_samples_by_metadata(
        'n_reads > @n_reads_min',
        local_dict=locals(),
        inplace=True,
    )

    # args.n_cpm_min_genes is indexed as [count threshold, minimum number of
    # columns above it]; a row is kept when enough of its entries (summed
    # along axis=1) exceed the threshold.
    print('Limit to decently expressed genes')
    above_threshold = ds.counts > args.n_cpm_min_genes[0]
    ind = above_threshold.sum(axis=1) >= args.n_cpm_min_genes[1]
    ds.counts = ds.counts.loc[ind]

    # Drop every featuresheet row whose GeneName occurs more than once.
    print('Ignore genes with multiple IDs')
    from collections import Counter
    genec = Counter(ds.featuresheet['GeneName'].values)
    genes_multiple = [name for name, count in genec.items() if count > 1]
    has_multiple_ids = ds.featuresheet['GeneName'].isin(genes_multiple)
    ds.featuresheet = ds.featuresheet.loc[~has_multiple_ids]

    print('Translate to gene names')
    ds.rename(axis='features', column='GeneName', inplace=True)

    # Build a separate dataset with only the mCMV features, then narrow it
    # in place to the samples whose 'moi' is 'low' or 'high'.
    print('Restrict to virus genes')
    dsv = ds.query_features_by_metadata('Organism == "mCMV"')
    dsv.query_samples_by_metadata("moi in ('low', 'high')", inplace=True)
Beispiel #3
0
import pandas as pd
import xarray as xr
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns

# Tell singlet which configuration file to use.
# NOTE(review): this is set before the singlet import below, presumably
# because the config is read at import time - confirm before reordering.
os.environ['SINGLET_CONFIG_FILENAME'] = 'singlet.yml'
# Make a local singlet checkout importable.
# NOTE(review): hard-coded, machine-specific absolute path; it will need
# adjusting on any other machine.
sys.path.append('/home/fabio/university/postdoc/singlet')
from singlet.dataset import Dataset, CountsTable

# Script
if __name__ == '__main__':

    # Build the dataset from the 'dengue' samplesheet, then replace its
    # counts with the 'aaf' variable of the NetCDF file, converted to pandas
    # with missing values filled with 0.
    # NOTE(review): 'aaf' presumably holds alternative allele frequencies
    # (the file is allele_frequencies.nc) - confirm.
    ds = Dataset(samplesheet='dengue', )
    data = xr.open_dataset('../bigdata/allele_frequencies.nc')
    ds.counts = CountsTable(data['aaf'].to_pandas().fillna(0))

    # Sync with Felix metadata
    # NOTE(review): pickle.load can execute arbitrary code contained in the
    # file; only run this against a trusted metadata file.
    with open('../data/metadataD_SNV_with_tsne.pkl', 'rb') as ff:
        metadata_felix = pickle.load(ff)
    # Keep only the samples for which both 'Dn' and 'Ds' are not NaN.
    samples = metadata_felix.index[(
        ~np.isnan(metadata_felix[['Dn', 'Ds']])).all(axis=1)]
    # Restrict the samplesheet and the metadata to the same sample labels so
    # the column assignments below align.
    ds.samplesheet = ds.samplesheet.loc[samples]
    metadata_felix = metadata_felix.loc[samples]
    # Copy the selected metadata columns onto the samplesheet.
    for col in [
            'coverage', 'Ds', 'Dn', 'depth', 'numSNV', 'Dn_s', 'tsne1_MOI1_10',
            'tsne2_MOI1_10'
    ]:
        ds.samplesheet[col] = metadata_felix[col]
    # log10 with a 1e-6 offset, so a value of zero maps to -6 rather than
    # -inf.
    ds.samplesheet['log_Dn'] = np.log10(1e-6 + ds.samplesheet['Dn'])
    ds.samplesheet['log_Ds'] = np.log10(1e-6 + ds.samplesheet['Ds'])