def _get_contaminants(self):
    """Count reads matching each known contaminant and report the frequent ones.

    Every sequence in ``self._read_sequences`` (together with its reverse
    complement) is offered to each matcher built from
    ``self.known_contaminants``. A read counts as a hit when the first
    element of the matcher's result exceeds ``self.min_kmer_match_frac``.

    Returns:
        A list of :class:`Match` objects, one per contaminant whose hit
        count is at least the computed minimum count. ``match_frac`` is the
        highest match fraction seen for that contaminant and ``abundance``
        is the hit count divided by ``self.n_reads``.
    """
    matchers = create_contaminant_matchers(
        self.known_contaminants, self.kmer_size)
    hit_counts = defaultdict(int)
    best_fracs = defaultdict(int)

    for read_seq in self._read_sequences:
        read_rc = reverse_complement(read_seq)
        for matcher in matchers:
            frac = matcher.match(read_seq, read_rc)[0]
            if frac > self.min_kmer_match_frac:
                hit_counts[matcher] += 1
                best_fracs[matcher] = max(best_fracs[matcher], frac)

    # Minimum number of hits: number of k-mer windows over all reads, scaled
    # by the over-representation cutoff and divided by the number of
    # possible k-mers of length `_min_k`.
    n_windows = self._read_length - self._min_k + 1
    min_count = math.ceil(
        self.n_reads * n_windows * self.overrep_cutoff /
        float(4**self._min_k))

    return [
        Match(
            matcher,
            match_frac=best_fracs[matcher],
            abundance=float(n_hits) / self.n_reads)
        for matcher, n_hits in hit_counts.items()
        if n_hits >= min_count
    ]
def __call__(self, read1, read2):
    """Merge a pair of overlapping reads into ``read1`` when possible.

    ``read1`` is aligned against the reverse complement of ``read2``. If
    the alignment has at least ``min_overlap`` matches, ``read1`` is
    extended or replaced in place so that it covers both reads, its
    ``merged`` attribute is set to True, and ``read2`` is dropped.

    Args:
        read1, read2: The reads of a pair. Each must provide ``sequence``,
            ``qualities``, ``insert_overlap`` and ``name``.

    Returns:
        ``(read1, None)`` when the reads were merged, otherwise the
        unmodified ``(read1, read2)``.

    Raises:
        AtroposError: If the alignment matches none of the expected
            overlap layouts.
    """
    len1 = len(read1.sequence)
    len2 = len(read2.sequence)

    # A min_overlap <= 1 is a fraction of the shorter read's length, with
    # a floor of 2 bases.
    min_overlap = self.min_overlap
    if min_overlap <= 1:
        min_overlap = max(2, round(self.min_overlap * min(len1, len2)))

    if len1 < min_overlap or len2 < min_overlap:
        return (read1, read2)

    insert_matched = read1.insert_overlap and read2.insert_overlap

    if insert_matched:
        # If we've already determined that there is an insert overlap
        # with a 3' overhang, we can constrain our alignment
        aflags = START_WITHIN_SEQ1 | STOP_WITHIN_SEQ2
    else:
        aflags = SEMIGLOBAL

    # align read1 to read2 reverse-complement to be compatible with
    # InsertAligner
    read2_rc = reverse_complement(read2.sequence)
    aligner = Aligner(read2_rc, self.error_rate, aflags)
    alignment = aligner.locate(read1.sequence)

    if not alignment:
        return (read1, read2)

    r2_start, r2_stop, r1_start, r1_stop, matches, errors = alignment
    if matches < min_overlap:
        return (read1, read2)

    # Only correct errors if we haven't already done correction in
    # the InsertAligner
    if self.mismatch_action and errors > 0 and not insert_matched:
        self.correct_errors(read1, read2, alignment)

    if r2_start == 0 and r2_stop == len2:
        # r2 is fully contained in r1
        pass
    elif r1_start == 0 and r1_stop == len1:
        # r1 is fully contained in r2: the merged read is exactly the
        # reverse complement of read2, so its qualities are read2's
        # qualities reversed. Reads without qualities must not be passed
        # to reversed(), which raises TypeError on None.
        read1.sequence = read2_rc
        read1.qualities = (
            "".join(reversed(read2.qualities))
            if read2.qualities else read2.qualities)
    elif r1_start > 0:
        # read2 extends past the 3' end of read1
        read1.sequence += read2_rc[r2_stop:]
        if read1.qualities and read2.qualities:
            read1.qualities += "".join(
                reversed(read2.qualities))[r2_stop:]
    elif r2_start > 0:
        # read1 extends past the end of the reverse-complemented read2
        read1.sequence = read2_rc + read1.sequence[r1_stop:]
        if read1.qualities and read2.qualities:
            read1.qualities = (
                "".join(reversed(read2.qualities)) +
                read1.qualities[r1_stop:])
    else:
        raise AtroposError(
            "Invalid alignment while trying to merge read "
            "{}: {}".format(
                read1.name, ",".join(str(i) for i in alignment)))

    read1.merged = True
    return (read1, None)
def add(self, name, seq):
    """Register a sequence in the cache.

    When ``self.auto_reverse_complement`` is set, the reverse complement
    of the sequence is registered as well, under ``<name>_rc``.

    Args:
        name: Adapter name.
        seq: Adapter sequence.
    """
    self._add(name, seq)
    if not self.auto_reverse_complement:
        return
    rc_name = "{}_rc".format(name)
    self._add(rc_name, reverse_complement(seq))
def find_best_match(seq, best_matches, best_match_frac):
    """Update the best contaminant matches with those found for `seq`.

    Relies on names from the enclosing scope: ``contaminants``, ``align``,
    ``self`` and ``match``.

    Args:
        seq: Sequence to compare against every contaminant.
        best_matches: Dict of the best matches found so far.
        best_match_frac: 2-tuple with the match fractions of the current
            best matches.

    Returns:
        Tuple ``(best_matches, best_match_frac)``. ``best_matches`` is a
        new dict if a strictly better match was found.
    """
    rc_seq = reverse_complement(seq)
    for candidate in contaminants:
        frac1, frac2, cmp_seq = candidate.match(seq, rc_seq)
        if frac1 < best_match_frac[0]:
            continue

        # The contaminant must either occur verbatim in the compared
        # sequence or align to it.
        confirmed = (
            candidate.seq in cmp_seq or
            align(cmp_seq, candidate.seq, self.min_contaminant_match_frac))
        if not confirmed:
            continue

        fracs = (frac1, frac2)
        strictly_better = frac1 > best_match_frac[0] or (
            frac1 == best_match_frac[0] and frac2 > best_match_frac[1])
        if strictly_better:
            # Earlier matches are discarded once a better one shows up.
            best_matches = {}
            best_match_frac = fracs
        # NOTE(review): `match` is not defined in this function; it is
        # taken from the enclosing scope - confirm this is intended.
        best_matches[candidate] = (match, fracs)
    return (best_matches, best_match_frac)
def find_best_match(seq, best_matches, best_match_frac):
    """Compare `seq` to all contaminants and keep the best-scoring ones.

    Uses ``contaminants``, ``align``, ``self`` and ``match`` from the
    enclosing scope.

    Args:
        seq: Sequence to test.
        best_matches: Dict holding the best matches so far.
        best_match_frac: 2-tuple of match fractions belonging to the
            current best matches.

    Returns:
        The (possibly replaced) ``best_matches`` dict and the (possibly
        updated) ``best_match_frac`` tuple, as a 2-tuple.
    """
    rc_seq = reverse_complement(seq)
    for contaminant in contaminants:
        primary, secondary, compared = contaminant.match(seq, rc_seq)
        # Skip anything that cannot beat or tie the current best.
        if primary < best_match_frac[0]:
            continue
        if not (contaminant.seq in compared or
                align(compared, contaminant.seq,
                      self.min_contaminant_match_frac)):
            continue

        score = (primary, secondary)
        if primary > best_match_frac[0]:
            improved = True
        else:
            improved = (
                primary == best_match_frac[0] and
                secondary > best_match_frac[1])
        if improved:
            # Start over: a better score invalidates earlier matches.
            best_matches = {}
            best_match_frac = score
        # NOTE(review): `match` comes from the enclosing scope, not from
        # this function - verify against the caller.
        best_matches[contaminant] = (match, score)
    return (best_matches, best_match_frac)
def _get_contaminants(self, read_seqs):
    """Count reads matching each known contaminant and report frequent ones.

    Args:
        read_seqs: Iterable of read sequences.

    Returns:
        A list of :class:`Match` objects for the contaminants whose hit
        count reaches the computed minimum count; ``match_frac`` is the
        hit count divided by ``self.n_reads``.
    """
    # NOTE(review): `known_contaminants` and `k` are not defined in this
    # function; presumably they are module-level names or were meant to be
    # attributes of `self` - confirm.
    matchers = create_contaminant_matchers(known_contaminants, k)

    hit_counts = defaultdict(int)
    for read_seq in read_seqs:
        read_rc = reverse_complement(read_seq)
        for matcher in matchers:
            # A read is a hit when the first element of the matcher's
            # result exceeds the minimum match fraction.
            if matcher.match(read_seq, read_rc)[0] > self.min_match_frac:
                hit_counts[matcher] += 1

    n_windows = self._read_length - self._min_k + 1
    min_count = math.ceil(
        self.n_reads * n_windows * self.overrep_cutoff /
        float(4 ** self._min_k))

    return [
        Match(matcher, match_frac=float(n_hits) / self.n_reads)
        for matcher, n_hits in hit_counts.items()
        if n_hits >= min_count
    ]
def _get_contaminants(self, read_seqs):
    """Tally known-contaminant hits over the reads and keep the frequent ones.

    Args:
        read_seqs: Iterable of read sequences.

    Returns:
        A list with one :class:`Match` per contaminant that was hit at
        least ``min_count`` times. ``match_frac`` is that contaminant's
        hit count divided by ``self.n_reads``.
    """
    # NOTE(review): neither `known_contaminants` nor `k` is bound inside
    # this function; verify they exist at module level (or should be
    # read from `self`).
    contaminant_matchers = create_contaminant_matchers(
        known_contaminants, k)

    tallies = defaultdict(int)
    for sequence in read_seqs:
        sequence_rc = reverse_complement(sequence)
        for contaminant in contaminant_matchers:
            result = contaminant.match(sequence, sequence_rc)
            if result[0] > self.min_match_frac:
                tallies[contaminant] += 1

    # Threshold: total k-mer windows across reads times the
    # over-representation cutoff, over the number of possible k-mers.
    total_windows = self.n_reads * (self._read_length - self._min_k + 1)
    min_count = math.ceil(
        total_windows * self.overrep_cutoff / float(4**self._min_k))

    frequent = []
    for contaminant, tally in tallies.items():
        if tally >= min_count:
            frequent.append(
                Match(contaminant, match_frac=float(tally) / self.n_reads))
    return frequent
def find_best_match(_seq, _best_matches, _best_match_frac):
    """Find best contaminant matches to `_seq`.

    Uses ``contaminants``, ``align``, ``self`` and ``match`` from the
    enclosing scope.

    Args:
        _seq: Sequence to compare against each contaminant.
        _best_matches: Dict of the best matches found so far.
        _best_match_frac: 2-tuple of the match fractions of the current
            best matches.

    Returns:
        The best-matches dict (a new dict if a strictly better match was
        found) and the best match-fraction tuple.
    """
    rc_seq = reverse_complement(_seq)
    for candidate in contaminants:
        frac1, frac2, cmp_seq = candidate.match(_seq, rc_seq)
        if frac1 < _best_match_frac[0]:
            continue

        found = (
            candidate.seq in cmp_seq or
            align(cmp_seq, candidate.seq, self.min_contaminant_match_frac))
        if not found:
            continue

        fracs = (frac1, frac2)
        beats_best = frac1 > _best_match_frac[0] or (
            frac1 == _best_match_frac[0] and frac2 > _best_match_frac[1])
        if beats_best:
            # A strictly better score replaces everything collected so far.
            _best_matches = {}
            _best_match_frac = fracs
        # NOTE(review): `match` is resolved in the enclosing scope -
        # confirm that is the intended object to store.
        _best_matches[candidate] = (match, fracs)
    return _best_matches, _best_match_frac
def match_insert(self, seq1, seq2):
    """Use cutadapt aligner for insert and adapter matching.

    Both reads are truncated to the length of the shorter one, read 2 is
    reverse-complemented, and ``self.aligner`` is asked for candidate
    insert alignments. Candidates whose random-match probability exceeds
    ``self.insert_max_rmp`` are dropped; the rest are tried in order of
    increasing probability until one also passes the adapter checks.

    Args:
        seq1, seq2: Sequences of read 1 and read 2.

    Returns:
        None if no acceptable match is found, otherwise a tuple
        ``(insert_match, adapter_match1, adapter_match2)``. The two
        adapter matches are None when the overhang is shorter than
        ``self.min_adapter_overlap``, meaning there is an insert overlap
        but no adapter trimming is required.
    """
    l1 = len(seq1)
    l2 = len(seq2)
    seq_len = min(l1, l2)
    if l1 > l2:
        seq1 = seq1[:l2]
    elif l2 > l1:
        # Truncate read 2 itself; taking the slice from seq1 here would
        # replace read 2 with a copy of read 1.
        seq2 = seq2[:l1]

    seq2_rc = reverse_complement(seq2)

    def _match(insert_match, offset, insert_match_size, prob):
        # `prob` is unused; it is accepted so that the tuples built below
        # can be unpacked directly into this function.
        if offset < self.min_adapter_overlap:
            # The reads are mostly overlapping, to the point where
            # there's not enough overhang to do a confident adapter
            # match. We return just the insert match to signal that
            # error correction can be done even though no adapter
            # trimming is required.
            return (insert_match, None, None)

        # TODO: this is very sensitive to the exact correct choice of
        # adapter. For example, if you specify GATCGGAA... and the correct
        # adapter is AGATCGGAA..., the prefixes will not match exactly and
        # the alignment will fail. We need to use a comparison that is a
        # bit more forgiving.
        a1_match = compare_prefixes(seq1[insert_match_size:], self.adapter1)
        a2_match = compare_prefixes(seq2[insert_match_size:], self.adapter2)
        adapter_len = min(offset, self.adapter1_len, self.adapter2_len)
        max_adapter_mismatches = round(
            adapter_len * self.max_adapter_mismatch_frac)
        # Reject only when BOTH adapters have too many mismatches.
        if (a1_match[5] > max_adapter_mismatches and
                a2_match[5] > max_adapter_mismatches):
            return None

        a1_prob = self.match_probability(a1_match[4], adapter_len)
        a2_prob = self.match_probability(a2_match[4], adapter_len)
        if ((adapter_len > self.adapter_check_cutoff) and
                ((a1_prob * a2_prob) > self.adapter_max_rmp)):
            return None

        adapter_len1 = min(self.adapter1_len, l1 - insert_match_size)
        adapter_len2 = min(self.adapter2_len, l2 - insert_match_size)
        # Report the match/mismatch counts of the adapter with the lower
        # random-match probability for both reads.
        best_adapter_matches, best_adapter_mismatches = (
            a1_match if a1_prob < a2_prob else a2_match)[4:6]

        return (
            insert_match,
            Match(0, adapter_len1, insert_match_size, l1,
                  best_adapter_matches, best_adapter_mismatches),
            Match(0, adapter_len2, insert_match_size, l2,
                  best_adapter_matches, best_adapter_mismatches)
        )

    # Use an aligner that returns all matches that satisfy the
    # overlap and error rate thresholds, then check each in turn until we
    # find one with an adapter match (if any).
    insert_matches = self.aligner.locate(seq2_rc, seq1)
    if not insert_matches:
        return None

    # Filter by random-match probability
    filtered_matches = []
    for insert_match in insert_matches:
        offset = min(insert_match[0], seq_len - insert_match[3])
        insert_match_size = seq_len - offset
        prob = self.match_probability(
            insert_match[4], insert_match_size, **self.base_probs)
        if prob <= self.insert_max_rmp:
            filtered_matches.append(
                (insert_match, offset, insert_match_size, prob))

    # Test matches in order of random-match probability.
    # TODO: compare against sorting by length (which is how
    # SeqPurge essentially does it).
    filtered_matches.sort(key=lambda x: x[3])
    for candidate in filtered_matches:
        match = _match(*candidate)
        if match:
            return match

    return None
def _get_contaminants(self):
    """Find over-represented k-mers in the reads using a khmer Countgraph.

    All sequences in ``self._read_sequences`` are consumed (and tagged)
    by a ``Countgraph``. Tagged k-mers whose count is at least the
    expected count times ``self.overrep_cutoff`` become candidates. If
    ``self.known_contaminants`` is set, each known sequence is scored by
    how many of its k-mers (or their reverse complements) are candidates;
    candidates not explained by any known sequence are reported as-is.

    Returns:
        A list of :class:`Match` objects.

    Raises:
        ValueError: If the minimum count is not below 2^16, the maximum
            count referred to by the error message.
    """
    # Import Countgraph here as well so the function does not depend on a
    # module-level `import khmer`.
    from khmer import Countgraph, khmer_args

    kmer_size = self.k
    # assuming all sequences are same length
    n_win = self._read_length - kmer_size + 1
    tablesize = self.n_reads * n_win
    countgraph = Countgraph(
        kmer_size, tablesize, khmer_args.DEFAULT_N_TABLES)
    countgraph.set_use_bigcount(True)

    for seq in self._read_sequences:
        countgraph.consume_and_tag(seq)

    n_expected = math.ceil(tablesize / float(4 ** kmer_size))
    min_count = n_expected * self.overrep_cutoff
    if min_count >= 2 ** 16:
        raise ValueError(
            "The minimum count for an over-represented k-kmer {} "
            "is greater than the max khmer count (2^16)".format(min_count)
        )

    candidates = {}
    for tag in countgraph.get_tagset():
        count = countgraph.get(tag)
        if count >= min_count:
            candidates[tag] = count

    if not self.known_contaminants:
        return [
            Match(tag, count / float(tablesize))
            for tag, count in candidates.items()
        ]

    matches = []
    seen = set()

    def match(kmer):
        """Return the count of `kmer` in `candidates` (0 if absent),
        recording found k-mers in `seen`.
        """
        n = candidates.get(kmer, 0)
        if n > 0:
            seen.add(kmer)
        return n

    for seq, names in self.known_contaminants.iter_sequences():
        seqlen = len(seq)
        if seqlen < kmer_size:
            print("Cannot check {}; sequence is shorter than {}".format(
                list(names)[0], kmer_size))
            continue

        n_kmers = seqlen - kmer_size + 1
        num_matches = 0
        match_counts = []
        for i in range(n_kmers):
            kmer = seq[i:(i + kmer_size)]
            kmer_count = max(
                match(kmer), match(reverse_complement(kmer)))
            if kmer_count > 0:
                num_matches += 1
                match_counts.append(kmer_count)

        if num_matches > 0:
            # not sure what the correct metric is to use here
            overall_count = sum(match_counts) / float(n_kmers)
            matches.append(Match(
                seq, overall_count / float(tablesize), names,
                float(num_matches) / n_kmers))

    # Add remaining tags
    for tag in set(candidates.keys()) - seen:
        matches.append(Match(tag, candidates[tag] / float(tablesize)))

    return matches
def match_insert(self, seq1, seq2):
    """Use cutadapt aligner for insert and adapter matching.

    The reads are truncated to a common length, read 2 is
    reverse-complemented, and ``self.aligner`` supplies the candidate
    insert alignments. Candidates with a random-match probability above
    ``self.insert_max_rmp`` are discarded and the remainder are tested,
    most significant first, for a matching adapter.

    Args:
        seq1, seq2: Sequences to match.

    Returns:
        None if there is no match, otherwise a tuple
        ``(insert_match, adapter_match1, adapter_match2)`` where the
        adapter matches are :class:`Match` objects, or both None when the
        overhang is shorter than ``self.min_adapter_overlap``.
    """
    len1 = len(seq1)
    len2 = len(seq2)
    seq_len = min(len1, len2)
    if len1 > len2:
        seq1 = seq1[:len2]
    elif len2 > len1:
        # Truncate read 2 itself. Slicing seq1 here would silently
        # replace read 2 with read 1.
        seq2 = seq2[:len1]

    seq2_rc = reverse_complement(seq2)

    def _match(insert_match, offset, insert_match_size, prob):
        # pylint: disable=unused-argument
        if offset < self.min_adapter_overlap:
            # The reads are mostly overlapping, to the point where
            # there's not enough overhang to do a confident adapter
            # match. We return just the insert match to signal that
            # error correction can be done even though no adapter
            # trimming is required.
            return (insert_match, None, None)

        # TODO: this is very sensitive to the exact correct choice of
        # adapter. For example, if you specify GATCGGAA... and the correct
        # adapter is AGATCGGAA..., the prefixes will not match exactly and
        # the alignment will fail. We need to use a comparison that is a
        # bit more forgiving.
        a1_match = compare_prefixes(seq1[insert_match_size:], self.adapter1)
        a2_match = compare_prefixes(seq2[insert_match_size:], self.adapter2)
        adapter_len = min(offset, self.adapter1_len, self.adapter2_len)
        max_adapter_mismatches = round(
            adapter_len * self.max_adapter_mismatch_frac)
        # Give up only if both adapters exceed the mismatch limit.
        if (a1_match[5] > max_adapter_mismatches and
                a2_match[5] > max_adapter_mismatches):
            return None

        a1_prob = self.match_probability(a1_match[4], adapter_len)
        a2_prob = self.match_probability(a2_match[4], adapter_len)
        if ((adapter_len > self.adapter_check_cutoff) and
                ((a1_prob * a2_prob) > self.adapter_max_rmp)):
            return None

        adapter_len1 = min(self.adapter1_len, len1 - insert_match_size)
        adapter_len2 = min(self.adapter2_len, len2 - insert_match_size)
        # Both returned matches carry the counts of whichever adapter has
        # the lower random-match probability.
        best_adapter_matches, best_adapter_mismatches = (
            a1_match if a1_prob < a2_prob else a2_match)[4:6]

        return (
            insert_match,
            Match(0, adapter_len1, insert_match_size, len1,
                  best_adapter_matches, best_adapter_mismatches),
            Match(0, adapter_len2, insert_match_size, len2,
                  best_adapter_matches, best_adapter_mismatches))

    # Use an aligner that returns all matches that satisfy the
    # overlap and error rate thresholds, then check each in turn until we
    # find one with an adapter match (if any).
    insert_matches = self.aligner.locate(seq2_rc, seq1)
    if not insert_matches:
        return None

    # Filter by random-match probability
    filtered_matches = []
    for insert_match in insert_matches:
        offset = min(insert_match[0], seq_len - insert_match[3])
        insert_match_size = seq_len - offset
        prob = self.match_probability(
            insert_match[4], insert_match_size, **self.base_probs)
        if prob <= self.insert_max_rmp:
            filtered_matches.append(
                (insert_match, offset, insert_match_size, prob))

    # Test matches in order of random-match probability.
    # TODO: compare against sorting by length (which is how
    # SeqPurge essentially does it).
    filtered_matches.sort(key=lambda x: x[3])
    for match_args in filtered_matches:
        match = _match(*match_args)
        if match:
            return match

    return None
def _get_contaminants(self):
    """Detect over-represented k-mers with khmer and match known contaminants.

    The reads in ``self._read_sequences`` are loaded into a
    ``Countgraph``; every tagged k-mer counted at least
    ``n_expected * self.overrep_cutoff`` times is a candidate. With
    ``self.known_contaminants`` set, each known sequence is scored by the
    candidate k-mers (forward or reverse-complement) it contains, and any
    candidate not found in a known sequence is reported on its own.
    Without known contaminants, every candidate is reported.

    Returns:
        A list of :class:`Match` objects.

    Raises:
        ValueError: If the minimum count is 2^16 or more, which the error
            message describes as exceeding the maximum khmer count.
    """
    # Countgraph is imported locally so that no module-level
    # `import khmer` is required.
    from khmer import Countgraph, khmer_args

    kmer_size = self.k
    # assuming all sequences are same length
    n_win = self._read_length - kmer_size + 1
    tablesize = self.n_reads * n_win
    countgraph = Countgraph(
        kmer_size, tablesize, khmer_args.DEFAULT_N_TABLES)
    countgraph.set_use_bigcount(True)

    for seq in self._read_sequences:
        countgraph.consume_and_tag(seq)

    n_expected = math.ceil(tablesize / float(4**kmer_size))
    min_count = n_expected * self.overrep_cutoff
    if min_count >= 2**16:
        raise ValueError(
            "The minimum count for an over-represented k-kmer {} "
            "is greater than the max khmer count (2^16)".format(min_count))

    candidates = {}
    for tag in countgraph.get_tagset():
        count = countgraph.get(tag)
        if count >= min_count:
            candidates[tag] = count

    if self.known_contaminants:
        matches = []
        seen = set()

        def match(kmer):
            """Look up `kmer` in `candidates`; remember it in `seen` if
            present. Returns its count, or 0.
            """
            n = candidates.get(kmer, 0)
            if n > 0:
                seen.add(kmer)
            return n

        for seq, names in self.known_contaminants.iter_sequences():
            seqlen = len(seq)
            if seqlen < kmer_size:
                print(
                    "Cannot check {}; sequence is shorter than {}".format(
                        list(names)[0], kmer_size))
                continue

            n_kmers = seqlen - kmer_size + 1
            num_matches = 0
            match_counts = []
            for i in range(n_kmers):
                kmer = seq[i:(i + kmer_size)]
                kmer_count = max(
                    match(kmer), match(reverse_complement(kmer)))
                if kmer_count > 0:
                    num_matches += 1
                    match_counts.append(kmer_count)

            if num_matches > 0:
                # not sure what the correct metric is to use here
                overall_count = sum(match_counts) / float(n_kmers)
                matches.append(
                    Match(seq, overall_count / float(tablesize), names,
                          float(num_matches) / n_kmers))

        # Add remaining tags
        for tag in set(candidates.keys()) - seen:
            matches.append(Match(tag, candidates[tag] / float(tablesize)))
    else:
        matches = [
            Match(tag, count / float(tablesize))
            for tag, count in candidates.items()
        ]

    return matches
def _get_contaminants(self):
    """Report over-represented k-mers, matched to known contaminants if any.

    Reads from ``self._read_sequences`` are consumed by a khmer
    ``Countgraph``. Tagged k-mers counted at least
    ``n_expected * self.overrep_cutoff`` times are candidates. When
    ``self.known_contaminants`` is set, each known sequence is scored by
    the candidate k-mers it contains (forward or reverse complement) and
    unexplained candidates are appended afterwards; otherwise all
    candidates are returned directly.

    Returns:
        A list of :class:`Match` objects.

    Raises:
        ValueError: If the minimum count is 2^16 or greater.
    """
    from khmer import Countgraph, khmer_args

    ksize = self.kmer_size
    # assuming all sequences are same length
    windows_per_read = self._read_length - ksize + 1
    tablesize = self.n_reads * windows_per_read

    graph = Countgraph(ksize, tablesize, khmer_args.DEFAULT_N_TABLES)
    graph.set_use_bigcount(True)
    for read_seq in self._read_sequences:
        graph.consume_and_tag(read_seq)

    expected = math.ceil(tablesize / float(4**ksize))
    min_count = expected * self.overrep_cutoff
    if min_count >= 2**16:
        raise ValueError(
            "The minimum count for an over-represented k-kmer {} is "
            "greater than the max khmer count (2^16)".format(min_count))

    # Keep only the tagged k-mers that reach the minimum count.
    tag_counts = ((tag, graph.get(tag)) for tag in graph.get_tagset())
    candidates = {
        tag: tag_count
        for tag, tag_count in tag_counts
        if tag_count >= min_count
    }

    if not self.known_contaminants:
        return [
            Match(tag, count=tag_count / float(tablesize))
            for tag, tag_count in candidates.items()
        ]

    matches = []
    seen = set()

    def candidate_freq(kmer):
        """Returns the frequency of `kmer` in `candidates`, marking it
        as seen when it is present.
        """
        freq = candidates.get(kmer, 0)
        if freq > 0:
            seen.add(kmer)
        return freq

    for seq, names in self.known_contaminants.iter_sequences():
        seqlen = len(seq)
        if seqlen < ksize:
            print("Cannot check {}; sequence is shorter than {}".format(
                list(names)[0], ksize))
            continue

        n_kmers = seqlen - ksize + 1
        # Counts of the k-mers of this sequence that are candidates.
        kmer_hits = []
        for start in range(n_kmers):
            kmer = seq[start:start + ksize]
            hit = max(
                candidate_freq(kmer),
                candidate_freq(reverse_complement(kmer)))
            if hit > 0:
                kmer_hits.append(hit)

        if kmer_hits:
            # not sure what the correct metric is to use here
            overall_count = sum(kmer_hits) / float(n_kmers)
            matches.append(Match(
                seq,
                count=overall_count / float(tablesize),
                names=names,
                match_frac=float(len(kmer_hits)) / n_kmers))

    # Add remaining tags
    matches.extend(
        Match(tag, count=candidates[tag] / float(tablesize))
        for tag in set(candidates) - seen)

    return matches
def __call__(self, read1, read2):
    """Merge a pair of overlapping reads into ``read1`` when possible.

    ``read1`` is aligned against the reverse complement of ``read2``. If
    the alignment has at least ``min_overlap`` matches, ``read1`` is
    extended or replaced in place so that it covers both reads, its
    ``merged`` attribute is set to True, and ``read2`` is dropped.

    Args:
        read1, read2: The reads of a pair. Each must provide ``sequence``,
            ``qualities``, ``insert_overlap``, ``corrected`` and ``name``.

    Returns:
        ``(read1, None)`` when the reads were merged, otherwise the
        unmodified ``(read1, read2)``.

    Raises:
        AtroposError: If the alignment matches none of the expected
            overlap layouts.
    """
    len1 = len(read1.sequence)
    len2 = len(read2.sequence)

    # A min_overlap <= 1 is a fraction of the shorter read's length, with
    # a floor of 2 bases.
    min_overlap = self.min_overlap
    if min_overlap <= 1:
        min_overlap = max(2, round(self.min_overlap * min(len1, len2)))

    if len1 < min_overlap or len2 < min_overlap:
        return (read1, read2)

    insert_matched = read1.insert_overlap and read2.insert_overlap

    if insert_matched:
        # If we've already determined that there is an insert overlap
        # with a 3' overhang, we can constrain our alignment
        aflags = START_WITHIN_SEQ1 | STOP_WITHIN_SEQ2
    else:
        aflags = SEMIGLOBAL

    # align read1 to read2 reverse-complement to be compatible with
    # InsertAligner
    read2_rc = reverse_complement(read2.sequence)
    aligner = Aligner(read2_rc, self.error_rate, aflags)
    alignment = aligner.locate(read1.sequence)

    if not alignment:
        return (read1, read2)

    r2_start, r2_stop, r1_start, r1_stop, matches, errors = alignment
    if matches < min_overlap:
        return (read1, read2)

    # Only correct errors if we haven't already done correction in
    # the InsertAligner
    if (
            self.mismatch_action and errors > 0 and
            not insert_matched and
            read1.corrected == 0 and read2.corrected == 0):
        self.correct_errors(read1, read2, alignment)

    if r2_start == 0 and r2_stop == len2:
        # r2 is fully contained in r1
        pass
    elif r1_start == 0 and r1_stop == len1:
        # r1 is fully contained in r2: the merged read is exactly the
        # reverse complement of read2, so its qualities are read2's
        # qualities reversed. Reads without qualities must not be passed
        # to reversed(), which raises TypeError on None.
        read1.sequence = read2_rc
        read1.qualities = (
            "".join(reversed(read2.qualities))
            if read2.qualities else read2.qualities)
    elif r1_start > 0:
        # read2 extends past the 3' end of read1
        read1.sequence += read2_rc[r2_stop:]
        if read1.qualities and read2.qualities:
            read1.qualities += "".join(
                reversed(read2.qualities))[r2_stop:]
    elif r2_start > 0:
        # read1 extends past the end of the reverse-complemented read2
        read1.sequence = read2_rc + read1.sequence[r1_stop:]
        if read1.qualities and read2.qualities:
            read1.qualities = (
                "".join(reversed(read2.qualities)) +
                read1.qualities[r1_stop:])
    else:
        raise AtroposError(
            "Invalid alignment while trying to merge read "
            "{}: {}".format(
                read1.name, ",".join(str(i) for i in alignment)))

    read1.merged = True
    return (read1, None)