Beispiel #1
0
def ds_ds2():
    """Return two Datasets that split the example samples between them.

    Both objects originate from the same example samplesheet and counts
    table. The first one keeps only the first two samplesheet rows, the
    second one keeps every row from the third onwards.

    Returns:
        tuple: ``(first, second)`` Dataset instances as described above.
    """
    from singlet.dataset import Dataset

    first = Dataset(
        samplesheet='example_sheet_tsv',
        counts_table='example_table_tsv',
    )
    # Copy before slicing so that the second dataset starts from all samples
    second = first.copy()

    first.samplesheet = first.samplesheet.iloc[:2]
    second.samplesheet = second.samplesheet.iloc[2:]
    return first, second
Beispiel #2
0
content:    Test Dataset class.
'''
# Script
if __name__ == '__main__':

    # NOTE: the location of the config file must be exported through an
    # environment variable before this script is run
    print('Instantiating Dataset')
    from singlet.dataset import Dataset
    ds = Dataset(
        samplesheet='example_sheet_tsv',
        counts_table='example_table_tsv',
    )
    print('Done!')

    print('Testing Dataset.__str__')
    expected_str = 'Dataset with 4 samples and 60721 features'
    assert str(ds) == expected_str
    print('Done!')

    print('Testing Dataset.__repr__')
    expected_repr = '<Dataset: 4 samples, 60721 features>'
    assert ds.__repr__() == expected_repr
    print('Done!')

    print('Testing Dataset.copy')
    # An untouched copy has to compare equal to its source
    duplicate = ds.copy()
    assert duplicate == ds
    print('Done!')

    print('Testing Dataset.copy with modifications')
    # Overwriting a single count in the copy has to break the equality
    dsp = ds.copy()
    dsp._counts.iloc[0, 0] = -5
    assert dsp != ds
    print('Done!')
'''
author:     Fabio Zanini
date:       07/08/17
content:    Test Dataset class.
'''
import numpy as np

# Script
if __name__ == '__main__':

    # NOTE: the location of the config file must be exported through an
    # environment variable before this script is run
    from singlet.dataset import Dataset
    ds = Dataset(
        samplesheet='example_sheet_tsv',
        counts_table='example_table_tsv',
    )
    # Split the samples: ds keeps the first two samplesheet rows, ds2 the rest
    ds2 = ds.copy()
    ds.samplesheet = ds.samplesheet.iloc[:2]
    ds2.samplesheet = ds2.samplesheet.iloc[2:]

    print('Bootstrap')
    dsboot = ds.bootstrap()
    # Bootstrapped sample names are expected to carry a '--sampling_' tag
    first_boot_name = dsboot.samplenames[0]
    assert '--sampling_' in first_boot_name
    print('Done!')

    print('Test feature comparison (Mann-Whitney U)')
    pvals = ds.compare(ds2, method='mann-whitney')
    smallest_pval = pvals.values.min()
    assert np.isclose(smallest_pval, 0.193931)
    print('Done!')

    print('Test feature comparison (Kolmogorov-Smirnov)')
    pvals = ds.compare(ds2, method='kolmogorov-smirnov')
Beispiel #4
0
        'TNFRSF10B',
        'EIF2A',
        'DDIT3',
        'PPP1R15A',
        'ATF4',
        'EDEM1',
        'XBP1',
        'ATF6',
        'ATF3',
        'ATG5',
        'CASP3',
        'CASP9',
        'CASP4',
        'CASP6',
    ]
    # Restrict a copy of the dataset to the genes listed in gnames
    # (gnames is presumably the list literal closed just above; confirm).
    dsd = ds.copy()
    # For every gene name, take the first featuresheet index entry whose
    # 'GeneName' equals it
    gids = [
        ds.featuresheet.index[ds.featuresheet['GeneName'] == gname][0]
        for gname in gnames
    ]
    dsd.counts = dsd.counts.loc[gids]

    # Filter the samples of a further copy by viral load and MOI; the
    # thresholds depend on the virus
    dsh = dsd.copy()
    if virus == 'dengue':
        # Dengue: more than 1e3 virus reads per million and MOI different
        # from "0"
        dsh.query_samples_by_metadata('virus_reads_per_million > 1e3',
                                      inplace=True)
        dsh.query_samples_by_metadata('MOI != "0"', inplace=True)
    else:
        # Any other virus: more than 1e1 virus reads per million and MOI
        # equal to "1"
        dsh.query_samples_by_metadata('virus_reads_per_million > 1e1',
                                      inplace=True)
        dsh.query_samples_by_metadata('MOI == "1"', inplace=True)
Beispiel #5
0
    # NOTE(review): cov is defined before this excerpt; presumably the
    # per-sample coverage, confirm against the preceding lines.
    n = ds.samplesheet['numberDengueReads'].astype(int)
    ds.samplesheet['virus_reads_per_million'] = 1e6 * n / (cov + n)
    # 0.1 is added before taking log10 so that zero values stay finite
    ds.samplesheet['log_virus_reads_per_million'] = np.log10(
        0.1 + ds.samplesheet['virus_reads_per_million'])

    #print('Log counts')
    #ds.counts.log(inplace=True)

    print('Get cell cycle genes (Core 67)')
    cc = load_cell_cycle_table()[[
        'GeneName', 'Periodic Rank', 'Phase', 'Core'
    ]]
    # Drop the rows whose 'Core' entry is "No"
    cc.query('Core != "No"', inplace=True)

    print('Hierarchical clustering of cells based on cell cycle and virus')
    # Work on a copy whose counts only contain the features indexed by cc
    dsn = ds.copy()
    dsn.counts = dsn.counts.loc[cc.index]
    # Carry the cell cycle annotations over into the featuresheet
    for col in ['Periodic Rank', 'Phase']:
        dsn.featuresheet.loc[:, col] = cc.loc[dsn.featuresheet.index, col]
    # Presumably relabels the features by their 'GeneName' entry; confirm
    # against the Dataset.rename documentation
    dsn.rename(axis='features', column='GeneName', inplace=True)

    print('Log counts')
    dsn.counts.log(inplace=True)
    # Store the mean of the log counts along axis 1 as a feature annotation
    dsn.featuresheet.loc[:, 'Mean'] = dsn.counts.mean(axis=1)

    # Only keep decently expressed genes
    dsn.query_features_by_metadata('Mean > 1', inplace=True)
    hier = dsn.cluster.hierarchical(
        axis='samples',
        #phenotypes=['virus_reads_per_million'],
Beispiel #6
0
            'tsne2_MOI1_10'
    ]:
        ds.samplesheet[col] = metadata_felix[col]
    # Log-transform Dn and Ds; 1e-6 is added so that zero values stay finite
    ds.samplesheet['log_Dn'] = np.log10(1e-6 + ds.samplesheet['Dn'])
    ds.samplesheet['log_Ds'] = np.log10(1e-6 + ds.samplesheet['Ds'])

    cov = ds.samplesheet['coverage']
    n = ds.samplesheet['numberDengueReads'].astype(int)
    ds.samplesheet['virus_reads_per_million'] = 1e6 * n / (cov + n)
    # NOTE(review): the expression inside log10 repeats the formula stored in
    # 'virus_reads_per_million' on the previous line
    ds.samplesheet['log_virus_reads_per_million'] = np.log10(0.1 + 1e6 * n /
                                                             (cov + n))

    # Two-column table with the precomputed tSNE coordinates from the metadata
    vs = ds.samplesheet[['tsne1_MOI1_10', 'tsne2_MOI1_10']]

    # Restrict to high variance SNVs
    dsv = ds.copy()
    # argsort is ascending, so the last 200 positions are the rows of the
    # counts table with the largest variance
    ind = ds.counts.values.var(axis=1).argsort()[-200:]
    dsv.counts = dsv.counts.iloc[ind]
    vsg = dsv.dimensionality.tsne(perplexity=30)

    # Cluster
    # NOTE: the number is manually chosen but does not matter much atm
    # NOTE(review): k-means is run on ds (unrestricted counts) while the
    # labels are stored on dsv; confirm that this is intended
    dsv.samplesheet['clusterN'] = ds.cluster.kmeans(axis='samples',
                                                    n_clusters=6)
    dss = dsv.split('clusterN')

    # Plot gene expression tSNE overlayed with viral genomics
    fig, axs = plt.subplots(3, 3, figsize=(7, 6), sharex=True, sharey=True)
    # Flatten the 3x3 grid of axes into a one-dimensional array
    axs = axs.ravel()
    plots = [
        'log_virus_reads_per_million', 'coverage', 'depth', 'numSNV', 'log_Dn',
# Script
if __name__ == '__main__':

    # NOTE: the location of the config file must be exported through an
    # environment variable before this script is run
    from singlet.dataset import Dataset
    ds = Dataset(
        samplesheet='example_sheet_tsv',
        counts_table='example_table_tsv',
    )

    # Keyword arguments shared by both expression-based selection tests
    expression_filter = {'n_samples': 1, 'exp_min': 1}

    print('Test feature selection by expression')
    res = ds.feature_selection.expressed(**expression_filter)
    first_selected = res[0]
    assert first_selected == 'TSPAN6'
    print('Done!')

    print('Test feature selection by expression, in place')
    dsp = ds.copy()
    dsp.feature_selection.expressed(inplace=True, **expression_filter)
    first_kept = dsp.featurenames[0]
    assert first_kept == 'TSPAN6'
    print('Done!')

    print('Test feature selection by overdispersed strata')
    res = ds.feature_selection.overdispersed_strata()
    last_selected = res[-1]
    assert last_selected == 'GLIPR2'
    print('Done!')

    print('Test feature selection by overdispersed strata, in place')
    dsp = ds.copy()
    dsp.feature_selection.overdispersed_strata(inplace=True)
    last_kept = dsp.featurenames[-1]
    assert last_kept == 'GLIPR2'
    print('Done!')
Beispiel #8
0
            featuresheet='humanGC38',
            )
    # Keep only the samples matching the counts query 'total >= 50000'
    ds.query_samples_by_counts('total >= 50000', inplace=True)

    ds.samplesheet.rename(columns={'time [h]': 'time'}, inplace=True)
    # Coverage is the sum of the counts along axis 0, taken before the counts
    # are normalized below
    cov = ds.samplesheet['coverage'] = ds.counts.sum(axis=0)
    ds.counts.normalize('counts_per_million', inplace=True)
    # NOTE(review): the column starts as the integer 0 and receives float
    # values below; recent pandas versions may warn about the dtype change,
    # consider initializing with 0.0
    ds.samplesheet['virus_reads_per_million'] = 0
    for virus in ('dengue', 'zika'):
        # Samples annotated with this virus; the read count column is
        # 'numberDengueReads' or 'numberZikaReads' respectively
        ind = ds.samplesheet['virus'] == virus
        n = ds.samplesheet.loc[ind, 'number'+virus.capitalize()+'Reads'].astype(int)
        ds.samplesheet.loc[ind, 'virus_reads_per_million'] = 1e6 * n / (cov.loc[ind] + n)
    ds.counts.log(inplace=True)

    # Select only some cells for comparison
    dsc = ds.copy()
    dsc.samplesheet = dsc.samplesheet.query('500 < virus_reads_per_million')

    print('Get correlations')
    # One sub-dataset per value of the 'virus' phenotype
    dsv = dsc.split(phenotypes='virus')
    vs = []
    cos = []
    for virus, dsvi in dsv.items():
        # Correlate the features with the viral load within this subset;
        # missing values in the result are replaced by 0
        co = dsvi.correlation.correlate_features_phenotypes(
                phenotypes='virus_reads_per_million',
                fillna=0).fillna(0)
        cos.append(co)
        vs.append(virus)
    # Place the per-virus results side by side, one column per virus
    cos = pd.concat(cos, axis=1)
    cos.columns = pd.Index(vs, name='virus')
Beispiel #9
0
    from singlet.dataset import Dataset
    ds = Dataset(counts_table='example_PBMC')

    # Normalize
    ds.counts.normalize(method='counts_per_million', inplace=True)
    ds.counts.log(inplace=True)

    # Select features
    ds.feature_selection.expressed(n_samples=3, exp_min=1, inplace=True)
    ds.feature_selection.overdispersed_strata(n_features_per_stratum=20,
                                              inplace=True)

    # Reduce dimensionality
    # NOTE(review): perplexity=0.8 is far below the values usually recommended
    # for t-SNE (roughly 5 to 50); confirm that this is intended
    vs = ds.dimensionality.tsne(n_dims=2, theta=0.5, perplexity=0.8)

    # Replace the counts of a copy with the transposed embedding, so that the
    # clustering calls below act on dsr
    dsr = ds.copy()
    dsr.counts = vs.T

    # Cluster
    dsr.samplesheet['dbscan'] = dsr.cluster.dbscan(eps=5, axis='samples')
    dsr.samplesheet['kmeans'] = dsr.cluster.kmeans(n_clusters=7,
                                                   axis='samples')

    # Plot t-SNE
    # One row with two panels that share both axes
    fig, axs = plt.subplots(nrows=1,
                            ncols=2,
                            sharex=True,
                            sharey=True,
                            figsize=(8, 4))
    dsr.plot.scatter_reduced_samples(vs,
                                     color_by='dbscan',