def ds_ds2():
    from singlet.dataset import Dataset
    ds = Dataset(samplesheet='example_sheet_tsv',
                 counts_table='example_table_tsv')
    ds2 = ds.copy()
    ds.samplesheet = ds.samplesheet.iloc[:2]
    ds2.samplesheet = ds2.samplesheet.iloc[2:]
    return (ds, ds2)

def ds():
    from singlet.dataset import Dataset
    dset = Dataset(
            samplesheet='example_sheet_tsv',
            counts_table='example_table_tsv')
    dset.counts.exclude_features(spikeins=True, other=True, inplace=True)
    return dset

def get_dataset(tissue, membrane_only=True):
    counts = parse_counts(tissue)
    if membrane_only:
        go = parse_go_plasma_membrane()
        genes_membrane = go[go.isin(counts.index)]
        counts = counts.loc[genes_membrane]
    ds = Dataset(
        samplesheet=SampleSheet(cell_types),
        counts_table=CountsTable(counts),
        )
    return ds

import os
import sys
import pickle

import numpy as np
import pandas as pd
import xarray as xr
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns

os.environ['SINGLET_CONFIG_FILENAME'] = 'singlet.yml'
sys.path.append('/home/fabio/university/postdoc/singlet')
from singlet.dataset import Dataset, CountsTable


# Script
if __name__ == '__main__':

    ds = Dataset(
            samplesheet='dengue',
            counts_table='dengue',
            featuresheet='humanGC38',
            )

    data_snv = xr.open_dataset('../bigdata/allele_frequencies.nc')
    #ds.counts = CountsTable(data['aaf'].to_pandas().fillna(0))

    # Sync with Felix metadata
    with open('../data/metadataD_SNV_with_tsne_and_tsneSNV.pkl', 'rb') as ff:
        metadata_felix = pickle.load(ff)
    samples = metadata_felix.index[
        (~np.isnan(metadata_felix[['Dn', 'Ds']])).all(axis=1)]
    ds.samplesheet = ds.samplesheet.loc[samples]
    metadata_felix = metadata_felix.loc[samples]
    for col in [
            'coverage', 'Ds', 'Dn', 'depth', 'numSNV', 'Dn_s',
            'tsne1_MOI1_10', 'tsne2_MOI1_10',
            'tsne1_SNV', 'tsne2_SNV', 'clusterN_SNV',
            ]:
        ds.samplesheet[col] = metadata_felix[col]

#!/usr/bin/env python
# vim: fdm=indent
'''
author:     Fabio Zanini
date:       07/08/17
content:    Test Dataset class.
'''
import numpy as np


# Script
if __name__ == '__main__':

    # NOTE: an env variable for the config file needs to be set when
    # calling this script
    from singlet.dataset import Dataset

    ds = Dataset(samplesheet='example_sheet_tsv',
                 counts_table='example_table_tsv')
    ds2 = ds.copy()
    ds.samplesheet = ds.samplesheet.iloc[:2]
    ds2.samplesheet = ds2.samplesheet.iloc[2:]

    print('Bootstrap')
    dsboot = ds.bootstrap()
    assert ('--sampling_' in dsboot.samplenames[0])
    print('Done!')

    print('Test feature comparison (Mann-Whitney U)')
    pvals = ds.compare(ds2, method='mann-whitney')
    assert (np.isclose(pvals.values.min(), 0.193931))
    print('Done!')

    print('Test feature comparison (Kolmogorov-Smirnov)')

def get_dataset(tissue, membrane_only=True, regenerate=False,
                go_contains=None, go_exclude=None):
    # Some tissues like brain were split for sorting, we merge them here
    dss = []
    for tissue_facs in tissues_prediction[tissue]:
        cell_types, plates = parse_annotations(tissue_facs)
        counts = parse_counts(tissue_facs, regenerate=regenerate)
        if membrane_only:
            go = parse_go_plasma_membrane().index
            genes_membrane = go[go.isin(counts.index)]
            counts = counts.loc[genes_membrane]
        if (go_contains is not None) and (go_exclude is not None):
            raise ValueError('Use either go_contains or go_exclude')
        if go_contains is not None:
            go = parse_go_plasma_membrane()
            genes = go.index[go['GONames'].str.contains(go_contains)]
            genes = np.intersect1d(genes, counts.index)
            counts = counts.loc[genes]
        elif go_exclude is not None:
            go = parse_go_plasma_membrane()
            genes = go.index[~go['GONames'].str.contains(go_exclude)]
            genes = np.intersect1d(genes, counts.index)
            counts = counts.loc[genes]
        dss.append({'samplesheet': cell_types, 'counts': counts})

    if len(dss) == 1:
        ds = Dataset(
            samplesheet=SampleSheet(cell_types),
            counts_table=counts,
            )
        return ds
    else:
        # Merging is kind of messy because some genes are absent from either
        # subtissue (grrr); I put zeroes for now, Michelle is working on the
        # better solution (we have those numbers somewhere)
        genes = set()
        for ds in dss:
            genes |= set(ds['counts'].index.values)
        genes = pd.Index(sorted(genes), name=ds['counts'].index.name)
        for ds in dss:
            genes_missing = genes[~genes.isin(ds['counts'].index)]
            for gene in genes_missing:
                # The stuff is normalized, pseudocounted, and logged
                ds['counts'].loc[gene] = -1.0
            ds['counts'] = ds['counts'].loc[genes]

        ngenes = len(genes)
        ncells = sum(ds['samplesheet'].shape[0] for ds in dss)
        samplesheet_all = pd.concat([ds['samplesheet'] for ds in dss], axis=0)
        counts_all = pd.DataFrame(
            np.zeros((ngenes, ncells), float),
            index=genes,
            columns=samplesheet_all.index)
        for ds in dss:
            counts_all.loc[:, ds['counts'].columns.values] = ds['counts'].values
        counts_all = CountsTable(counts_all)
        if ds['counts']._normalized:
            counts_all._normalized = ds['counts']._normalized
        ds = Dataset(
            samplesheet=SampleSheet(samplesheet_all),
            counts_table=counts_all,
            )
        return ds

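# Side note (not part of the original script): the manual padding loop above,
# which inserts missing genes one by one, can be written more compactly with
# pandas' reindex. A minimal sketch, assuming each counts table is a pandas
# DataFrame indexed by gene name; -1.0 mirrors the original fill choice for
# normalized, pseudocounted, logged data.
def pad_counts_to_gene_union(counts, genes, fill=-1.0):
    """Reindex a counts DataFrame to the union index `genes`, filling absent rows."""
    return counts.reindex(genes, fill_value=fill)
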
import os
import sys
import pickle

import numpy as np
import pandas as pd
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns

os.environ['SINGLET_CONFIG_FILENAME'] = 'singlet.yml'
sys.path.append('/home/fabio/university/postdoc/singlet')
from singlet.dataset import Dataset


# Script
if __name__ == '__main__':

    ds = Dataset(
            samplesheet='dengue',
            counts_table='dengue',
            featuresheet='humanGC38',
            )

    # Sync with Felix metadata
    with open('../data/metadataD_SNV_with_tsne.pkl', 'rb') as ff:
        metadata_felix = pickle.load(ff)
    samples = metadata_felix.index[
        (~np.isnan(metadata_felix[['Dn', 'Ds']])).all(axis=1)]
    ds.samplesheet = ds.samplesheet.loc[samples]
    metadata_felix = metadata_felix.loc[samples]
    for col in ['coverage', 'Ds', 'Dn', 'depth', 'numSNV', 'Dn_s',
                'tsne1_MOI1_10', 'tsne2_MOI1_10']:
        ds.samplesheet[col] = metadata_felix[col]
    ds.samplesheet['log_Dn'] = np.log10(1e-6 + ds.samplesheet['Dn'])
    ds.samplesheet['log_Ds'] = np.log10(1e-6 + ds.samplesheet['Ds'])

    tb.rename(columns={
        'Symbol': 'GeneName',
        'Phase ': 'Phase',
        'Core 67': 'Core',
        }, inplace=True)
    return tb


# Script
if __name__ == '__main__':

    print('Load dataset')
    ds = Dataset(
            counts_table='dengue',
            samplesheet='virus',
            featuresheet='humanGC38',
            )
    ds.query_samples_by_counts('total >= 50000', inplace=True)
    ds.samplesheet.rename(columns={'time [h]': 'time'}, inplace=True)
    ds.samplesheet.loc[:, 'time'] = pd.Categorical(
        ds.samplesheet.loc[:, 'time'].astype(int),
        )
    cov = ds.samplesheet['coverage'] = ds.counts.sum(axis=0)

    print('Normalize')
    ds.counts.normalize('counts_per_million', inplace=True)

    print('Add virus reads')
    n = ds.samplesheet['numberDengueReads'].astype(int)
    ds.samplesheet['virus_reads_per_million'] = 1e6 * n / (cov + n)
    ds.samplesheet['log_virus_reads_per_million'] = np.log10(
        0.1 + ds.samplesheet['virus_reads_per_million'])

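# Quick arithmetic check of the normalization above (made-up numbers, not from
# the data): a cell with 500,000 host reads (cov) and 5,000 dengue reads (n)
# gets 1e6 * 5000 / (500000 + 5000), i.e. about 9,901 virus reads per million
# sequenced reads.
cov_example = 500_000
n_example = 5_000
print(round(1e6 * n_example / (cov_example + n_example)))  # -> 9901
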
if __name__ == '__main__':

    print('Load RNA velocity results (loom file)')
    # NOTE: there is a mess trying to connect the velocyto output with the normal
    # htseq-count output, below is the explanation how I solved this.
    # velocyto skips quasi-empty BAM files, final order is lexicographic as in:
    # echo '' > ~/subfolders.tsv; for fdn in 10017006*; do si=$(du $fdn/star/Aligned.out.possorted.bam | cut -f1); if [ $si -ge "30" ]; then echo $fdn >> ~/subfolders.tsv; fi; done
    cellnames = pd.read_csv('../bigdata/rnavelocity_cellnames.tsv',
                            header=None).values[:, 0]
    vlm = vcy.VelocytoLoom("../bigdata/rna_velocity.loom")
    vlm.ca['CellID'] = cellnames

    print('Load normal counts and metadata and sync them with the velocity results')
    # Load and sync external metadata
    ds = Dataset(
            #counts_table='dengue',
            samplesheet='dengue',
            )
    with open('../data/metadataD_SNV_with_tsne_and_tsneSNV.pkl', 'rb') as ff:
        metadata_felix = pickle.load(ff)
    ds.samplesheet = ds.samplesheet.loc[cellnames]
    metadata_felix = metadata_felix.loc[cellnames]
    vlm.ca['ClusterName'] = metadata_felix['clusterN_SNV'].fillna(6).values
    vlm.set_clusters(vlm.ca["ClusterName"])
    ds.samplesheet['clusterN_SNV'] = metadata_felix['clusterN_SNV'].fillna(6)
    ds.samplesheet['coverage'] = metadata_felix['coverage']
    ds.samplesheet['virus_reads_per_million'] = (
        1.0 * 1e6 * ds.samplesheet['numberDengueReads']
        / (ds.samplesheet['numberDengueReads'] + ds.samplesheet['coverage']))
    ds.samplesheet['log_virus_reads_per_million'] = np.log10(
        0.1 + ds.samplesheet['virus_reads_per_million'])

    print('Filter cells, genes, etc. using the velocity tutorial')

    pa.add_argument('--save', action='store_true',
                    help='Store filtered cells dataframe of counts to file')
    pa.add_argument('--n-reads-min', type=int, default=15000,
                    help='Minimal number of reads for good cells')
    pa.add_argument('--keep2', action='store_true',
                    help='Keep sample 2-uninfected despite low quality')
    args = pa.parse_args()

    print('Load dataset')
    ds = Dataset(
            counts_table='combined',
            featuresheet='combined',
            samplesheet='combined',
            )

    print('Filter low-quality cells')
    n_reads_min = args.n_reads_min
    ds.query_samples_by_metadata('n_reads > @n_reads_min',
                                 local_dict=locals(),
                                 inplace=True)

    if not args.keep2:
        print('Filter out sample 2-uninfected (low quality)')
        ds.query_samples_by_metadata('biosample != "2-uninfected"',
                                     inplace=True)

    print('Add normalized virus counts')

'''
author:     Fabio Zanini
date:       07/08/17
content:    Test Dataset class.
'''
import numpy as np
import scipy as sp


# Script
if __name__ == '__main__':

    # NOTE: an env variable for the config file needs to be set when
    # calling this script
    from singlet.dataset import Dataset

    ds = Dataset(samplesheet='example_sheet_tsv',
                 counts_table='example_table_tsv')

    print('Hierarchical clustering of samples')
    d = ds.cluster.hierarchical(
            'samples',
            optimal_ordering=True)
    assert(tuple(d['leaves']) == ('second_sample', 'test_pipeline',
                                  'first_sample', 'third_sample'))
    print('Done!')

    print('Hierarchical clustering of features')
    ds.counts = ds.counts.iloc[:200]
    d = ds.cluster.hierarchical(
            'features',
            optimal_ordering=True)
    assert(tuple(d['leaves'])[:3] == ('PNPLA4', 'ITGAL', 'HOXA11'))

# Script
if __name__ == '__main__':

    parser = argparse.ArgumentParser(
            description='Analyze dengue or Zika virus-infected cells')
    parser.add_argument('--virus', choices=['dengue', 'Zika'],
                        default='dengue',
                        help='Virus to look at')
    args = parser.parse_args()
    virus = args.virus

    print('Load dataset')
    ds = Dataset(
            counts_table=virus.lower(),
            samplesheet='virus',
            featuresheet='humanGC38',
            )
    ds.query_samples_by_counts('total >= 50000', inplace=True)
    ds.samplesheet.rename(columns={'time [h]': 'time'}, inplace=True)
    cov = ds.samplesheet['coverage'] = ds.counts.sum(axis=0)

    print('Normalize')
    ds.counts.normalize('counts_per_million', inplace=True)

    print('Add virus reads')
    n = ds.samplesheet['number{:}Reads'.format(virus.capitalize())].astype(int)
    ds.samplesheet['virus_reads_per_million'] = 1e6 * n / (cov + n)

    print('Log counts')

        ))

    print('Read sample metadata')
    fn = '../../data/mouse_mCMV_1/all/samplesheet.tsv'
    samplesheet = SampleSheet(
        pd.read_csv(
            fn,
            sep='\t',
            index_col=0,
            dtype={0: str},
            ))

    print('Build dataset')
    ds = Dataset(
            counts_table=counts,
            featuresheet=featuresheet,
            samplesheet=samplesheet,
            )

    print('Add normalized virus counts')
    ds.samplesheet['virus_reads_per_million'] = 1e6 * ds.samplesheet[
            'n_reads_virus'] / ds.samplesheet['n_reads']
    ds.samplesheet['log_virus_reads_per_million'] = np.log10(
            0.1 + ds.samplesheet['virus_reads_per_million'])

    print('Filter low-quality cells')
    n_reads_min = args.n_reads_min
    ds.query_samples_by_metadata('n_reads > @n_reads_min',
                                 local_dict=locals(),
                                 inplace=True)

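# Note on the pseudocount above (illustration only, not from the original
# script): log10(0.1 + x) maps zero virus reads to -1, which is why downstream
# plots such as the violins in plot_qcs put a tick at -1 and label it "0".
import numpy as np
print(np.log10(0.1 + 0))  # -> -1.0
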
import os
import sys

import numpy as np
import pandas as pd
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns

os.environ['SINGLET_CONFIG_FILENAME'] = 'singlet.yml'
sys.path.append('/home/fabio/university/postdoc/singlet')
from singlet.dataset import Dataset


# Script
if __name__ == '__main__':

    ds = Dataset(
            counts_table='dengue',
            samplesheet='dengue',
            featuresheet='humanGC38',
            )
    ds.query_samples_by_counts('total >= 50000', inplace=True)
    ds.samplesheet.rename(columns={'time [h]': 'time'}, inplace=True)
    cov = ds.samplesheet['coverage'] = ds.counts.sum(axis=0)
    ds.counts.normalize('counts_per_million', inplace=True)

    ds.samplesheet['virus_reads_per_million'] = 0
    for virus in ('dengue', 'zika'):
        ind = ds.samplesheet['virus'] == virus
        n = ds.samplesheet.loc[ind, 'number'+virus.capitalize()+'Reads'].astype(int)
        ds.samplesheet.loc[ind, 'virus_reads_per_million'] = 1e6 * n / (cov.loc[ind] + n)
    ds.counts.log(inplace=True)

    # Select only some cells for comparison

def ds():
    from singlet.dataset import Dataset
    return Dataset(counts_table='example_PBMC')

def plot_qcs(ds, virus_threshold=60):
    fig, axs = plt.subplots(2, 3, figsize=(12, 7))
    axs = axs.ravel()

    # Number of reads
    ax = axs[0]
    col = 'n_reads'
    col_label = 'Number of reads'
    plot_cumulative(0.1 + ds.samplesheet[col], label='all cells', color='k', ax=ax)
    for sn, datum in ds.samplesheet[[col, 'biosample']].groupby('biosample'):
        x = 0.1 + datum[col]
        plot_cumulative(x, label=sn, ax=ax)
    ax.grid(True)
    ax.set_xlabel(col_label)
    ax.set_xlim(xmin=0.9 * ds.samplesheet[col].min())
    ax.set_xscale('log')

    # Number of genes (1+ reads)
    ax = axs[1]
    col = 'n_genes_1+'
    col_label = 'Number of Genes (1+)'
    plot_cumulative(0.1 + ds.samplesheet[col],
                    label='all cells: {:}'.format(ds.samplesheet.shape[0]),
                    color='k', ax=ax)
    for sn, datum in ds.samplesheet[[col, 'biosample']].groupby('biosample'):
        x = 0.1 + datum[col]
        plot_cumulative(x, label='{:}: {:}'.format(sn, len(x)), ax=ax)
    ax.grid(True)
    ax.set_xlabel(col_label)
    ax.set_xlim(xmin=0.9 * ds.samplesheet[col].min())
    ax.set_xscale('log')
    ax.legend(loc='lower left', fontsize=8)

    # Number of genes (3+ reads)
    ax = axs[2]
    col = 'n_genes_3+'
    col_label = 'Number of Genes (3+)'
    plot_cumulative(0.1 + ds.samplesheet[col], label='all cells', color='k', ax=ax)
    for sn, datum in ds.samplesheet[[col, 'biosample']].groupby('biosample'):
        x = 0.1 + datum[col]
        plot_cumulative(x, label=sn, ax=ax)
    ax.grid(True)
    ax.set_xlabel(col_label)
    ax.set_xlim(xmin=0.9 * ds.samplesheet[col].min())
    ax.set_xscale('log')

    # Number of virus reads
    ax = axs[3]
    col = 'n_reads_virus'
    col_label = 'Number of virus reads'
    plot_cumulative(0.1 + ds.samplesheet[col], label='all cells', color='k', ax=ax)
    for sn, datum in ds.samplesheet[[col, 'biosample']].groupby('biosample'):
        x = 0.1 + datum[col]
        plot_cumulative(x, label=sn, ax=ax)
    ax.plot([virus_threshold] * 2, [0, 1], lw=1.5, color='k', alpha=0.7, ls='--')
    ax.grid(True)
    ax.set_xlabel(col_label)
    ax.set_xlim(xmin=0.09)
    ax.set_xscale('log')

    # Housekeeping genes
    gnames = ['Actb', 'Gapdh']
    ind = ds.featuresheet.loc[ds.featuresheet['GeneName'].isin(gnames)].index
    dsind = Dataset(
            counts_table=ds.counts.loc[ind],
            samplesheet=ds.samplesheet,
            featuresheet=ds.featuresheet.loc[ind],
            )
    dsind.rename(axis='features', column='GeneName', inplace=True)
    for gname, ax in zip(gnames, axs[4:]):
        dd = dsind.counts.log().loc[[gname]].T
        dd['biosample'] = dsind.samplesheet['biosample']
        sns.violinplot(
                data=dd,
                x='biosample',
                y=gname,
                ax=ax,
                zorder=10,
                )
        ax.grid(True)
        ax.set_ylim(-1, 5)
        ax.set_yticks(np.arange(-1, 6))
        ax.set_yticklabels([
            '$0$', '$1$', '$10$', '$10^2$', '$10^3$', '$10^4$', '$10^5$',
            ])
        ax.set_ylabel('{:} per million reads'.format(gname))
        ax.set_xlabel('')
        for tk in ax.get_xticklabels():
            tk.set_rotation(300)

    return fig

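# `plot_cumulative` is used above but defined elsewhere in the original script.
# A minimal sketch of what such a helper might look like (an assumption, not
# the original implementation): it draws the empirical cumulative distribution
# of `x` on a matplotlib Axes, matching the call signature used above.
import numpy as np
import matplotlib.pyplot as plt

def plot_cumulative(x, label=None, color=None, ax=None):
    if ax is None:
        ax = plt.gca()
    x = np.sort(np.asarray(x))
    y = np.linspace(0, 1, len(x))
    ax.plot(x, y, label=label, color=color)
    return ax
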
featuresheet = FeatureSheet(pd.read_csv(fn, sep='\t', index_col=0))

print('Read sample metadata')
fn = '../../data/mouse_mCMV_1/all/samplesheet.tsv'
samplesheet = SampleSheet(
    pd.read_csv(
        fn,
        sep='\t',
        index_col=0,
        dtype={0: str},
        ))

print('Build dataset')
ds = Dataset(
        counts_table=counts,
        featuresheet=featuresheet,
        samplesheet=samplesheet,
        )


def plot_qcs(ds, virus_threshold=60):
    fig, axs = plt.subplots(2, 3, figsize=(12, 7))
    axs = axs.ravel()

    # Number of reads
    ax = axs[0]
    col = 'n_reads'
    col_label = 'Number of reads'
    plot_cumulative(0.1 + ds.samplesheet[col], label='all cells', color='k', ax=ax)

#!/usr/bin/env python
# vim: fdm=indent
'''
author:     Fabio Zanini
date:       07/08/17
content:    Test Dataset class.
'''
# Script
if __name__ == '__main__':

    # NOTE: an env variable for the config file needs to be set when
    # calling this script
    from singlet.dataset import Dataset

    ds = Dataset(
            samplesheet='example_sheet_tsv',
            counts_table='example_table_tsv')

    print('Query samples by metadata')
    ds_tmp = ds.query_samples_by_metadata(
            'experiment == "test_pipeline"',
            inplace=False)
    assert(tuple(ds_tmp.samplenames) == ('test_pipeline',))
    print('Done!')

    print('Query sample by counts in one gene')
    ds_tmp = ds.query_samples_by_counts('KRIT1 > 100', inplace=False)
    assert(tuple(ds_tmp.samplenames) == ('third_sample',))
    print('Done!')

    print('Query sample by total counts')
    ds_tmp = ds.query_samples_by_counts('total < 3000000', inplace=False)

def ds():
    from singlet.dataset import Dataset
    return Dataset(samplesheet='example_sheet_tsv',
                   counts_table='example_table_tsv')

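# The tiny helpers like `ds()` above look like pytest fixtures with the
# decorator stripped; a sketch of how they would typically be declared and
# consumed, assuming pytest is the test runner (an assumption, not stated in
# the source):
import pytest

@pytest.fixture
def ds():
    from singlet.dataset import Dataset
    return Dataset(samplesheet='example_sheet_tsv',
                   counts_table='example_table_tsv')

def test_copy_equals_original(ds):
    # mirrors the `ds.copy() == ds` assertion used in the test scripts below
    assert ds.copy() == ds
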
#!/usr/bin/env python
# vim: fdm=indent
'''
author:     Fabio Zanini
date:       07/08/17
content:    Test Dataset class.
'''
import numpy as np


# Script
if __name__ == '__main__':

    # NOTE: an env variable for the config file needs to be set when
    # calling this script
    from singlet.dataset import Dataset

    ds = Dataset(samplesheet='example_sheet_tsv',
                 counts_table='example_table_tsv')

    print('Test feature selection by expression')
    res = ds.feature_selection.expressed(n_samples=1, exp_min=1)
    assert (res[0] == 'TSPAN6')
    print('Done!')

    print('Test feature selection by expression, in place')
    dsp = ds.copy()
    dsp.feature_selection.expressed(n_samples=1, exp_min=1, inplace=True)
    assert (dsp.featurenames[0] == 'TSPAN6')
    print('Done!')

    print('Test feature selection by overdispersed strata')
    res = ds.feature_selection.overdispersed_strata()
    assert (res[-1] == 'GLIPR2')

'''
author:     Fabio Zanini
date:       07/08/17
content:    Test examples on PBMCs.
'''
import sys
import matplotlib.pyplot as plt
import numpy as np


# Script
if __name__ == '__main__':

    # NOTE: an env variable for the config file needs to be set when
    # calling this script
    from singlet.dataset import Dataset

    ds = Dataset(counts_table='example_PBMC')

    # Normalize
    ds.counts.normalize(method='counts_per_million', inplace=True)
    ds.counts.log(inplace=True)

    # Select features
    ds.feature_selection.expressed(n_samples=3, exp_min=1, inplace=True)
    ds.feature_selection.overdispersed_strata(n_features_per_stratum=20,
                                              inplace=True)

    # Reduce dimensionality
    vs = ds.dimensionality.tsne(n_dims=2, theta=0.5, perplexity=0.8)
    dsr = ds.copy()
    dsr.counts = vs.T

        ))

    print('Read sample metadata')
    fn = '../../data/mouse_mCMV_1/all/samplesheet.tsv'
    samplesheet = SampleSheet(
        pd.read_csv(
            fn,
            sep='\t',
            index_col=0,
            dtype={0: str},
            ))

    print('Build dataset')
    ds = Dataset(
            counts_table=counts,
            featuresheet=featuresheet,
            samplesheet=samplesheet,
            )

    print('Filter low-quality cells')
    n_reads_min = args.n_reads_min
    ds.query_samples_by_metadata('n_reads > @n_reads_min',
                                 local_dict=locals(),
                                 inplace=True)

    if not args.keep2:
        print('Filter out sample 2-uninfected (low quality)')
        ds.query_samples_by_metadata('biosample != "2-uninfected"',
                                     inplace=True)

    print('Add normalized virus counts')

import os
import sys

import pysam
import numpy as np
import pandas as pd
import xarray as xr
from collections import Counter, defaultdict
from Bio import SeqIO

os.environ['SINGLET_CONFIG_FILENAME'] = 'singlet.yml'
sys.path.append('/home/fabio/university/postdoc/singlet')
from singlet.dataset import Dataset, CountsTable


if __name__ == '__main__':

    ds = Dataset(samplesheet='dengue')

    ## Histogram of SNVs
    #n_lines = {}
    #n_lines_hist = Counter()
    #fdn = '../bigdata/DENV_singleCellVCF'
    #for fn in os.listdir(fdn):
    #    with pysam.VariantFile('{:}/{:}'.format(fdn, fn), 'r') as f:
    #        nl = sum(1 for line in f)
    #    n_lines[fn] = nl
    #    n_lines_hist[nl] += 1

    ## Example file
    #fdn = '../bigdata/DENV_singleCellVCF'
    #fn_ex = 'vars1001700612_I2.vcf'
    #f = pysam.VariantFile('{:}/{:}'.format(fdn, fn_ex), 'r')

import os
import sys
import pickle
import argparse

import numpy as np
import pandas as pd
import xarray as xr
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns

os.environ['SINGLET_CONFIG_FILENAME'] = 'singlet.yml'
sys.path.append('/home/fabio/university/postdoc/singlet')
from singlet.dataset import Dataset, CountsTable


# Script
if __name__ == '__main__':

    ds = Dataset(samplesheet='dengue')
    data = xr.open_dataset('../bigdata/allele_frequencies.nc')
    ds.counts = CountsTable(data['aaf'].to_pandas().fillna(0))

    # Sync with Felix metadata
    with open('../data/metadataD_SNV_with_tsne.pkl', 'rb') as ff:
        metadata_felix = pickle.load(ff)
    samples = metadata_felix.index[
        (~np.isnan(metadata_felix[['Dn', 'Ds']])).all(axis=1)]
    ds.samplesheet = ds.samplesheet.loc[samples]
    metadata_felix = metadata_felix.loc[samples]
    for col in [
            'coverage', 'Ds', 'Dn', 'depth', 'numSNV', 'Dn_s',
            'tsne1_MOI1_10', 'tsne2_MOI1_10',
            ]:
        ds.samplesheet[col] = metadata_felix[col]

import os
import sys
import argparse

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

os.environ['SINGLET_CONFIG_FILENAME'] = 'singlet.yml'
sys.path.append('/home/fabio/university/postdoc/singlet')
from singlet.dataset import Dataset


# Script
if __name__ == '__main__':

    ds = Dataset(
            counts_table='dengue',
            samplesheet='virus',
            featuresheet='humanGC38',
            )
    ds.query_samples_by_counts('total >= 50000', inplace=True)
    ds.samplesheet.rename(columns={'time [h]': 'time'}, inplace=True)
    cov = ds.samplesheet['coverage'] = ds.counts.sum(axis=0)
    ds.counts.normalize('counts_per_million', inplace=True)
    n = ds.samplesheet['numberDengueReads'].astype(int)
    ds.samplesheet['virus_reads_per_million'] = 1e6 * n / (cov + n)
    ds.counts.log(inplace=True)

    # Only select cells without virus
    ds.query_samples_by_metadata('virus_reads_per_million < 0.1', inplace=True)

#!/usr/bin/env python
# vim: fdm=indent
'''
author:     Fabio Zanini
date:       07/08/17
content:    Test Dataset class.
'''
import numpy as np


# Script
if __name__ == '__main__':

    # NOTE: an env variable for the config file needs to be set when
    # calling this script
    from singlet.dataset import Dataset

    ds = Dataset(samplesheet='example_sheet_tsv',
                 counts_table='example_table_tsv')

    print('KNN graph via all pair comparisons')
    res = ds.graph.lshknn(
            axis='samples',
            n_neighbors=1,
            threshold=0.2,
            n_planes=128,
            slice_length=None,
            )
    assert (np.allclose(
        res.data,
        [0.9996988186962041, 1.0, 1.0, 1.0,
         0.9996988186962041, 1.0, 1.0, 1.0],
        rtol=1e-02, atol=1e-02))
    print('Done!')

#!/usr/bin/env python
# vim: fdm=indent
'''
author:     Fabio Zanini
date:       07/08/17
content:    Test Dataset class.
'''
# Script
if __name__ == '__main__':

    # NOTE: an env variable for the config file needs to be set when
    # calling this script
    print('Instantiating Dataset')
    from singlet.dataset import Dataset
    ds = Dataset(samplesheet='example_sheet_tsv',
                 counts_table='example_table_tsv')
    print('Done!')

    print('Testing Dataset.__str__')
    assert (str(ds) == 'Dataset with 4 samples and 60721 features')
    print('Done!')

    print('Testing Dataset.__repr__')
    assert (ds.__repr__() == '<Dataset: 4 samples, 60721 features>')
    print('Done!')

    print('Testing Dataset.copy')
    assert (ds.copy() == ds)
    print('Done!')

    print('Testing Dataset.copy with modifications')

            if iax1 == iax2:
                ax.set_facecolor(list(colors[indu[iax1]]) + [0.2])
            ax.grid(True)
            ax.set_xlim(0, 3.9)
            ax.set_xticks([0, 1, 2, 3, 4])
            ax.set_xticks([0.5, 1.5, 2.5, 3.5], minor=True)
            ax.set_yticklabels([])
    fig.text(0.52, 0.02, 'cluster #', ha='center')
    fig.text(0.02, 0.52, 'cluster #', va='center', rotation=90)
    fig.suptitle('Hamming distance distributions across SNV clusters')
    plt.tight_layout(h_pad=0, w_pad=0, rect=(0.03, 0.03, 1, 0.97))

    # Calculate transcriptome distances
    ds = Dataset(
            samplesheet='dengue',
            counts_table='dengue',
            featuresheet='humanGC38',
            )
    ds.samplesheet['cluster_SNV'] = clusters
    ds.counts.normalize(inplace=True)
    ds.rename(axis='features', column='GeneName', inplace=True)
    ds.feature_selection.unique(inplace=True)

    # Restrict to differentially expressed genes
    with open('../data/genes_diff_expressed_clustersSNV.tsv', 'rt') as f:
        genes = f.read().split('\t')
    dsd = ds.query_features_by_name(genes)
    dsd.counts.log(inplace=True)
    dsp = dsd.split('cluster_SNV')

    dclut = {}