def pass1(self, reader, saver):
    """
    The first pass across the read data.

    It does the following:

    1. If do_normalize is set, discard all read pairs with coverage
       above DIGINORM_COVERAGE.

    2. For each remaining read pair, check if the read pair is above
       the coverage necessary for trimming (TRIM_AT_COVERAGE).  If so,
       k-mer trim the reads at CUTOFF, and yield them.

    3. If the read pair is not at the coverage necessary for trimming,
       consume the read pair with the graph and save the read pair for
       the second pass.
    """
    graph = self.graph
    TRIM_AT_COVERAGE = self.trim_at_coverage
    CUTOFF = self.cutoff
    DIGINORM_COVERAGE = self.diginorm_coverage
    # NOTE: removed unused local `K = graph.ksize()` — it was never read.

    for n, is_pair, read1, read2 in reader:
        bundle = ReadBundle(read1, read2)

        # clean up the sequences for examination.
        self.n_reads += bundle.num_reads
        self.n_bp += bundle.total_length

        # the bundle's coverage is governed by its least-covered read.
        min_coverage = min(bundle.coverages(graph))

        if self.do_normalize and min_coverage >= DIGINORM_COVERAGE:
            # skip reads if normalizing
            continue

        # trim?
        if min_coverage >= TRIM_AT_COVERAGE:
            for read in bundle.reads:
                record, did_trim = trim_record(graph, read, CUTOFF)
                if did_trim:
                    self.trimmed_reads += 1
                if record:
                    yield record
        # no, too low coverage to trim; consume & set aside for 2nd pass.
        else:
            for read in bundle.reads:
                graph.consume(read.cleaned_seq)
                write_record(read, saver)
                self.n_saved += 1
def snarf(is_paired, read0, read1, countgraph1, countgraph2):
    """
    Yield reads whose k-mer counts differ significantly between two graphs.

    For each read in the (possibly paired) bundle, compare the k-mer
    count statistics from countgraph1 and countgraph2 with a two-sample
    t-test from summary statistics, and yield the read when the p-value
    is below 0.05.
    """
    batch = ReadBundle(read0, read1)

    # if any in batch have differential coverage, yield them.
    for record in batch.reads:
        seq = record.cleaned_seq
        # medians are unused here; only mean/sd feed the t-test.
        _median1, mean1, sd1 = countgraph1.get_median_count(seq)
        _median2, mean2, sd2 = countgraph2.get_median_count(seq)

        # Number of k-mer observations per read.  Assumes K == 32, so
        # len(seq) - 31 == len(seq) - K + 1 k-mers -- TODO confirm
        # against countgraph1.ksize() rather than hard-coding.
        nobs1 = nobs2 = len(seq) - 31

        res = ttest_ind_from_stats(mean1, sd1, nobs1, mean2, sd2, nobs2)
        if res.pvalue < 0.05:
            yield record
def __call__(self, is_paired, read0, read1):
    """
    Actually does digital normalization - the core algorithm.

    * get one (unpaired) or two (paired) reads;
    * sanitize the sequences (convert Ns to As);
    * get the median k-mer count of one/both reads;
    * if any read's median k-mer count is below desired coverage, keep all;
    * consume and yield kept reads.
    """
    bundle = ReadBundle(read0, read1)
    countgraph = self.countgraph

    # Every read in the bundle already meets the desired coverage:
    # drop the whole bundle (yield nothing).
    if bundle.coverages_at_least(countgraph, self.desired_coverage):
        return

    # Otherwise keep the whole bundle: consume each read into the
    # graph and pass it along.
    for record in bundle.reads:
        countgraph.consume(record.cleaned_seq)
        yield record
def pass2(self, reader):
    """
    The second pass across the data does the following.

    1. For each read, evaluate the coverage. If the coverage is
       sufficient to trim, OR we are trimming low-abundance reads
       (-V not set), do trimming.

    2. Otherwise, return the untrimmed read pair.
    """
    graph = self.graph
    TRIM_AT_COVERAGE = self.trim_at_coverage
    CUTOFF = self.cutoff
    # NOTE: removed unused local `K = graph.ksize()` — it was never read.

    for n, is_pair, read1, read2 in reader:
        bundle = ReadBundle(read1, read2)

        # clean up the sequences for examination.
        self.n_reads += bundle.num_reads
        self.n_bp += bundle.total_length

        if self.do_trim_low_abund or \
           bundle.coverages_at_least(graph, TRIM_AT_COVERAGE):
            for read in bundle.reads:
                trimmed_record, did_trim = trim_record(graph, read, CUTOFF)
                if did_trim:
                    self.trimmed_reads += 1
                if trimmed_record:
                    yield trimmed_record
        else:
            # coverage too low and -V set: pass reads through untrimmed.
            for read in bundle.reads:
                self.n_skipped += 1
                # BUGFIX: bp_skipped counts base pairs (cf. n_bp above),
                # not reads — was `+= 1`.  Assumes screed-style records
                # with a .sequence attribute — TODO confirm.
                self.bp_skipped += len(read.sequence)
                yield read