def ds_ds2():
    """Return two Dataset objects whose samplesheets hold disjoint rows.

    A single Dataset is created from the 'example_sheet_tsv' samplesheet
    and the 'example_table_tsv' counts table, then copied. The original
    keeps the first two samplesheet rows, the copy keeps every row from
    the third onwards.

    Returns:
        tuple: ``(first, second)`` -- the trimmed original and the
        trimmed copy, in that order.
    """
    # Imported lazily, inside the function, as in the original code.
    from singlet.dataset import Dataset

    first = Dataset(
        samplesheet='example_sheet_tsv',
        counts_table='example_table_tsv',
    )
    # Copy before trimming so both objects start from the full data.
    second = first.copy()

    first.samplesheet = first.samplesheet.iloc[:2]
    second.samplesheet = second.samplesheet.iloc[2:]
    return first, second
author: Fabio Zanini
date: 07/08/17
content: Test Dataset class.
'''
import numpy as np


# Script
if __name__ == '__main__':

    # NOTE: an env variable for the config file needs to be set when
    # calling this script
    from singlet.dataset import Dataset

    # Create one dataset from the example samplesheet/counts table and a
    # copy of it, then keep different samplesheet rows in each: the first
    # two rows in `ds`, all remaining rows in `ds2`. The two objects are
    # compared against each other in the tests below.
    ds = Dataset(samplesheet='example_sheet_tsv', counts_table='example_table_tsv')
    ds2 = ds.copy()
    ds.samplesheet = ds.samplesheet.iloc[:2]
    ds2.samplesheet = ds2.samplesheet.iloc[2:]

    print('Bootstrap')
    dsboot = ds.bootstrap()
    # The first sample name of the bootstrapped dataset must contain the
    # '--sampling_' marker.
    assert ('--sampling_' in dsboot.samplenames[0])
    print('Done!')

    print('Test feature comparison (Mann-Whitney U)')
    pvals = ds.compare(ds2, method='mann-whitney')
    # The smallest value returned is checked against a hard-coded reference
    # using np.isclose with its default tolerances.
    # NOTE(review): 0.193931 is presumably a regression value obtained on
    # the example data -- confirm if the example tables change.
    assert (np.isclose(pvals.values.min(), 0.193931))
    print('Done!')

    print('Test feature comparison (Kolmogorov-Smirnov)')
    pvals = ds.compare(ds2, method='kolmogorov-smirnov')
    # Same check as above for the Kolmogorov-Smirnov method.
    # NOTE(review): 0.097027 is presumably a regression value as well.
    assert (np.isclose(pvals.values.min(), 0.097027))
if __name__ == '__main__':
    # Dataset built from the 'dengue' samplesheet and counts table and the
    # 'humanGC38' featuresheet.
    ds = Dataset(samplesheet='dengue', counts_table='dengue', featuresheet='humanGC38')

    # Allele frequency data, opened with xr.open_dataset.
    data_snv = xr.open_dataset('../bigdata/allele_frequencies.nc')
    #ds.counts = CountsTable(data['aaf'].to_pandas().fillna(0))

    # Sync with Felix metadata
    with open('../data/metadataD_SNV_with_tsne_and_tsneSNV.pkl', 'rb') as ff:
        metadata_felix = pickle.load(ff)

    # Keep only the samples for which neither 'Dn' nor 'Ds' is NaN, and
    # restrict both the samplesheet and the external metadata to them.
    has_dn_and_ds = (~np.isnan(metadata_felix[['Dn', 'Ds']])).all(axis=1)
    samples = metadata_felix.index[has_dn_and_ds]
    ds.samplesheet = ds.samplesheet.loc[samples]
    metadata_felix = metadata_felix.loc[samples]

    # Columns copied one by one from the external metadata into the
    # samplesheet.
    felix_columns = (
        'coverage',
        'Ds',
        'Dn',
        'depth',
        'numSNV',
        'Dn_s',
        'tsne1_MOI1_10',
        'tsne2_MOI1_10',
        'tsne1_SNV',
        'tsne2_SNV',
        'clusterN_SNV',
    )
    for col in felix_columns:
        ds.samplesheet[col] = metadata_felix[col]

    # log10 of Dn and Ds; the 1e-6 offset keeps zero values finite.
    ds.samplesheet['log_Dn'] = np.log10(1e-6 + ds.samplesheet['Dn'])
    ds.samplesheet['log_Ds'] = np.log10(1e-6 + ds.samplesheet['Ds'])

    # Virus reads per million: 1e6 * n / (cov + n), with n the integer
    # 'numberDengueReads' column and cov the 'coverage' column. The log10
    # version adds 0.1 before taking the logarithm.
    cov = ds.samplesheet['coverage']
    n = ds.samplesheet['numberDengueReads'].astype(int)
    virus_rpm = 1e6 * n / (cov + n)
    ds.samplesheet['virus_reads_per_million'] = virus_rpm
    ds.samplesheet['log_virus_reads_per_million'] = np.log10(0.1 + virus_rpm)
# htseq-count output, below is the explanation how I solved this. # velocyto skips quasi-empty BAM files, final order is lexicographic as in: # echo '' > ~/subfolders.tsv; for fdn in 10017006*; do si=$(du $fdn/star/Aligned.out.possorted.bam | cut -f1); if [ $si -ge "30" ]; then echo $fdn >> ~/subfolders.tsv; fi; done cellnames = pd.read_csv('../bigdata/rnavelocity_cellnames.tsv', header=None).values[:, 0] vlm = vcy.VelocytoLoom("../bigdata/rna_velocity.loom") vlm.ca['CellID'] = cellnames print('Load normal counts and metadata and sync them with the velocity results') # Load and sync external metadata ds = Dataset( #counts_table='dengue', samplesheet='dengue', ) with open('../data/metadataD_SNV_with_tsne_and_tsneSNV.pkl', 'rb') as ff: metadata_felix = pickle.load(ff) ds.samplesheet = ds.samplesheet.loc[cellnames] metadata_felix = metadata_felix.loc[cellnames] vlm.ca['ClusterName'] = metadata_felix['clusterN_SNV'].fillna(6).values vlm.set_clusters(vlm.ca["ClusterName"]) ds.samplesheet['clusterN_SNV'] = metadata_felix['clusterN_SNV'].fillna(6) ds.samplesheet['coverage'] = metadata_felix['coverage'] ds.samplesheet['virus_reads_per_million'] = 1.0 * 1e6 * ds.samplesheet['numberDengueReads'] / (ds.samplesheet['numberDengueReads'] + ds.samplesheet['coverage']) ds.samplesheet['log_virus_reads_per_million'] = np.log10(0.1 + ds.samplesheet['virus_reads_per_million']) print('Filter cells, genes, etc. using the velocity tutorial') vlm.normalize("S", size=True, log=True) vlm.filter_cells(bool_array=vlm.initial_Ucell_size > np.percentile(vlm.initial_Ucell_size, 0.5)) ds.samplesheet = ds.samplesheet.loc[vlm.ca['CellID']] metadata_felix = metadata_felix.loc[vlm.ca['CellID']]