def __call__(self, is_paired, read0, read1):
    """Apply digital normalization to one read or one read pair.

    Wraps the incoming read(s) in a ReadBundle and checks whether the
    bundle already has median k-mer coverage of at least
    ``self.desired_coverage``.  Bundles that are still under-covered
    are kept: each of their reads is consumed into the countgraph
    (so future coverage estimates grow) and yielded.  Bundles already
    at coverage are dropped entirely (nothing yielded).

    ``is_paired`` is part of the caller's interface but is not needed
    here: the ReadBundle itself knows how many reads it holds.
    """
    bundle = ReadBundle(read0, read1)
    target = self.desired_coverage

    # Already at the desired coverage on every read: discard the
    # whole bundle by yielding nothing.
    if bundle.coverages_at_least(self.countgraph, target):
        return

    # Under-covered: keep all reads in the bundle.
    for read in bundle.reads:
        self.countgraph.consume(read.cleaned_seq)
        yield read
def pass2(self, reader):
    """Second pass over the data: trim reads at low-abundance k-mers.

    For each (possibly paired) record bundle produced by ``reader``:

    1. If low-abundance reads are being trimmed unconditionally
       (``self.do_trim_low_abund``), OR the bundle's coverage reaches
       ``self.trim_at_coverage``, run ``trim_record`` on each read and
       yield the (possibly trimmed) records.
    2. Otherwise skip trimming: yield the reads untouched and count
       them in ``n_skipped``/``bp_skipped``.

    Side effects: updates the running totals ``n_reads`` and ``n_bp``
    for every bundle examined, and ``trimmed_reads`` for each read
    that was actually trimmed.
    """
    graph = self.graph
    # Plain locals, not constants — hoisted out of the loop for speed.
    trim_at_coverage = self.trim_at_coverage
    cutoff = self.cutoff

    # Pair index and pairing flag from the reader are not needed here.
    for _n, _is_pair, read1, read2 in reader:
        bundle = ReadBundle(read1, read2)

        # Tally every read we examine, trimmed or not.
        self.n_reads += bundle.num_reads
        self.n_bp += bundle.total_length

        if self.do_trim_low_abund or \
           bundle.coverages_at_least(graph, trim_at_coverage):
            for read in bundle.reads:
                trimmed_record, did_trim = trim_record(graph, read, cutoff)
                if did_trim:
                    self.trimmed_reads += 1
                if trimmed_record:
                    yield trimmed_record
        else:
            # Coverage too low to trim safely: pass reads through as-is.
            for read in bundle.reads:
                self.n_skipped += 1
                # NOTE(review): bp_skipped is incremented by 1 per read,
                # not by the read's length in base pairs — this looks
                # inconsistent with n_bp above; confirm whether it should
                # be += the read's sequence length.
                self.bp_skipped += 1
                yield read