Beispiel #1
0
def ds_ds2():
    """Return two datasets built from the same example data.

    The example sample sheet and counts table are loaded once and the
    resulting dataset is copied. The first dataset then keeps the first
    two rows of the sample sheet, the second one keeps all rows from the
    third onwards.

    Returns:
        A ``(ds, ds2)`` tuple with the two datasets.
    """
    from singlet.dataset import Dataset

    head = Dataset(
        samplesheet='example_sheet_tsv',
        counts_table='example_table_tsv',
    )
    # Copy before slicing so that both halves start from the full data
    tail = head.copy()

    head.samplesheet = head.samplesheet.iloc[:2]
    tail.samplesheet = tail.samplesheet.iloc[2:]
    return head, tail
author:     Fabio Zanini
date:       07/08/17
content:    Test Dataset class.
'''
import numpy as np

# Script
if __name__ == '__main__':

    # NOTE: the config file is located through an environment variable,
    # which must be set when this script is called.
    from singlet.dataset import Dataset

    # Load the example data once, copy it, then give each dataset a
    # different slice of the sample sheet: first two rows vs. the rest.
    ds = Dataset(
        samplesheet='example_sheet_tsv',
        counts_table='example_table_tsv',
    )
    ds2 = ds.copy()
    ds.samplesheet = ds.samplesheet.iloc[:2]
    ds2.samplesheet = ds2.samplesheet.iloc[2:]

    print('Bootstrap')
    dsboot = ds.bootstrap()
    # The first sample name of the bootstrap must carry the '--sampling_' tag
    first_samplename = dsboot.samplenames[0]
    assert '--sampling_' in first_samplename
    print('Done!')

    print('Test feature comparison (Mann-Whitney U)')
    pvals = ds.compare(ds2, method='mann-whitney')
    # Only the smallest P value is checked against the reference number
    smallest_pval = pvals.values.min()
    assert np.isclose(smallest_pval, 0.193931)
    print('Done!')

    print('Test feature comparison (Kolmogorov-Smirnov)')
    pvals = ds.compare(ds2, method='kolmogorov-smirnov')
    smallest_pval = pvals.values.min()
    assert np.isclose(smallest_pval, 0.097027)
Beispiel #3
0
if __name__ == '__main__':

    # Dengue samples and counts, with the humanGC38 feature sheet
    ds = Dataset(
        samplesheet='dengue', counts_table='dengue', featuresheet='humanGC38')
    data_snv = xr.open_dataset('../bigdata/allele_frequencies.nc')
    #ds.counts = CountsTable(data['aaf'].to_pandas().fillna(0))

    # Sync with Felix metadata
    with open('../data/metadataD_SNV_with_tsne_and_tsneSNV.pkl', 'rb') as ff:
        metadata_felix = pickle.load(ff)

    # Keep only the samples for which both Dn and Ds are not NaN
    has_rates = ~np.isnan(metadata_felix[['Dn', 'Ds']])
    samples = metadata_felix.index[has_rates.all(axis=1)]
    ds.samplesheet = ds.samplesheet.loc[samples]
    metadata_felix = metadata_felix.loc[samples]

    # Copy the selected metadata columns into the sample sheet
    felix_columns = [
        'coverage', 'Ds', 'Dn', 'depth', 'numSNV', 'Dn_s',
        'tsne1_MOI1_10', 'tsne2_MOI1_10',
        'tsne1_SNV', 'tsne2_SNV', 'clusterN_SNV',
    ]
    for col in felix_columns:
        ds.samplesheet[col] = metadata_felix[col]

    # log10 of Dn and Ds; the 1e-6 offset keeps zeros finite
    ds.samplesheet['log_Dn'] = np.log10(1e-6 + ds.samplesheet['Dn'])
    ds.samplesheet['log_Ds'] = np.log10(1e-6 + ds.samplesheet['Ds'])

    # Dengue reads per million of (coverage + dengue reads), plus its
    # log10 with a 0.1 offset
    cov = ds.samplesheet['coverage']
    n = ds.samplesheet['numberDengueReads'].astype(int)
    reads_per_million = 1e6 * n / (cov + n)
    ds.samplesheet['virus_reads_per_million'] = reads_per_million
    ds.samplesheet['log_virus_reads_per_million'] = np.log10(
        0.1 + reads_per_million)
Beispiel #4
0
    # htseq-count output, below is the explanation how I solved this.
    # velocyto skips quasi-empty BAM files, final order is lexicographic as in:
    # echo '' > ~/subfolders.tsv; for fdn in 10017006*; do si=$(du $fdn/star/Aligned.out.possorted.bam | cut -f1); if [ $si -ge "30" ]; then echo $fdn >> ~/subfolders.tsv; fi; done
    # Cell names are the first column of a file read without a header row
    cellnames = pd.read_csv('../bigdata/rnavelocity_cellnames.tsv', header=None).values[:, 0]
    vlm = vcy.VelocytoLoom("../bigdata/rna_velocity.loom")
    # Overwrite the 'CellID' entry of vlm.ca with the names read above
    vlm.ca['CellID'] = cellnames

    print('Load normal counts and metadata and sync them with the velocity results')
    # Load and sync external metadata
    ds = Dataset(
            #counts_table='dengue',
            samplesheet='dengue',
            )
    # NOTE(review): pickle.load executes arbitrary code from the file; only
    # use it on trusted, locally produced data.
    with open('../data/metadataD_SNV_with_tsne_and_tsneSNV.pkl', 'rb') as ff:
        metadata_felix = pickle.load(ff)
    # Select the rows of both tables by cell name, in the order of cellnames
    ds.samplesheet = ds.samplesheet.loc[cellnames]
    metadata_felix = metadata_felix.loc[cellnames]

    # Missing cluster labels are replaced by 6
    # NOTE(review): presumably 6 is an extra "unassigned" cluster - confirm
    vlm.ca['ClusterName'] = metadata_felix['clusterN_SNV'].fillna(6).values
    vlm.set_clusters(vlm.ca["ClusterName"])

    # Mirror the cluster labels (same fill value) and coverage in the sample sheet
    ds.samplesheet['clusterN_SNV'] = metadata_felix['clusterN_SNV'].fillna(6)
    ds.samplesheet['coverage'] = metadata_felix['coverage']
    # Dengue reads per million of (dengue reads + coverage)
    ds.samplesheet['virus_reads_per_million'] = 1.0 * 1e6 * ds.samplesheet['numberDengueReads'] / (ds.samplesheet['numberDengueReads'] + ds.samplesheet['coverage'])
    # The 0.1 offset keeps log10 finite when there are no virus reads
    ds.samplesheet['log_virus_reads_per_million'] = np.log10(0.1 + ds.samplesheet['virus_reads_per_million'])

    print('Filter cells, genes, etc. using the velocity tutorial')
    vlm.normalize("S", size=True, log=True)
    # Keep cells whose initial_Ucell_size exceeds the 0.5th percentile
    # (np.percentile takes q in [0, 100], so 0.5 is NOT the median)
    vlm.filter_cells(bool_array=vlm.initial_Ucell_size > np.percentile(vlm.initial_Ucell_size, 0.5))
    # Re-select both tables by the 'CellID' values now held by vlm
    # NOTE(review): assumes filter_cells also trims vlm.ca - confirm
    ds.samplesheet = ds.samplesheet.loc[vlm.ca['CellID']]
    metadata_felix = metadata_felix.loc[vlm.ca['CellID']]