def test_compare_prefixes():
    """Exercise ``compare_prefixes`` with and without wildcard matching.

    Each expected value is the 6-tuple returned by ``compare_prefixes``:
    ``(astart, astop, rstart, rstop, matches, errors)``.
    """
    # Without wildcard flags, every differing character counts as an error.
    assert compare_prefixes('AAXAA', 'AAAAATTTTTTTTT') == (0, 5, 0, 5, 4, 1)
    # With wildcard_ref=True the N in the first sequence is expected to match C.
    assert compare_prefixes('AANAA', 'AACAATTTTTTTTT', wildcard_ref=True) == (0, 5, 0, 5, 5, 0)
    assert compare_prefixes('XAAAAA', 'AAAAATTTTTTTTT') == (0, 6, 0, 6, 4, 2)

    full_match = (0, 10, 0, 10, 10, 0)

    # The first wildcard sequence must match every other one, whichever side
    # is allowed to contain wildcards.
    plain = WILDCARD_SEQUENCES[0]
    for seq in WILDCARD_SEQUENCES:
        extended = seq + 'GCCAGGGTTGATTCGGCTGATCTGGCCG'
        result = compare_prefixes(plain, extended, wildcard_query=True)
        assert result == full_match, result

        result = compare_prefixes(extended, plain, wildcard_ref=True)
        assert result == full_match

    # Every sequence must match an extended copy of itself, both literally
    # and with wildcards enabled on both sides.
    for seq in WILDCARD_SEQUENCES:
        extended = seq + 'GCCAGGG'
        result = compare_prefixes(seq, extended)
        assert result == full_match

        result = compare_prefixes(extended, seq, wildcard_ref=True, wildcard_query=True)
        assert result == full_match

    # 'X' is expected to count as an error under every wildcard setting.
    extended = WILDCARD_SEQUENCES[0] + 'GCCAGG'
    for wildc_ref in (False, True):
        for wildc_query in (False, True):
            result = compare_prefixes(
                'CCCXTTXATC', extended, wildcard_ref=wildc_ref, wildcard_query=wildc_query
            )
            assert result == (0, 10, 0, 10, 8, 2)
def test_compare_prefixes():
    """Check the tuples returned by ``compare_prefixes``.

    The expected tuples have the layout
    ``(astart, astop, rstart, rstop, matches, errors)``.
    """
    # Plain comparison: a differing character is an error.
    assert compare_prefixes('AAXAA', 'AAAAATTTTTTTTT') == (0, 5, 0, 5, 4, 1)
    # wildcard_ref=True: the N in the first argument is expected to match C.
    assert compare_prefixes('AANAA', 'AACAATTTTTTTTT', wildcard_ref=True) == (0, 5, 0, 5, 5, 0)
    assert compare_prefixes('XAAAAA', 'AAAAATTTTTTTTT') == (0, 6, 0, 6, 4, 2)

    perfect = (0, 10, 0, 10, 10, 0)

    # Compare the first wildcard sequence against all of them, with wildcards
    # enabled on whichever side holds the varying sequence.
    a = WILDCARD_SEQUENCES[0]
    for s in WILDCARD_SEQUENCES:
        r = s + 'GCCAGGGTTGATTCGGCTGATCTGGCCG'
        result = compare_prefixes(a, r, wildcard_query=True)
        assert result == perfect, result

        result = compare_prefixes(r, a, wildcard_ref=True)
        assert result == perfect

    # Each sequence compared against its own extension must match fully.
    for s in WILDCARD_SEQUENCES:
        r = s + 'GCCAGGG'
        result = compare_prefixes(s, r)
        assert result == perfect

        result = compare_prefixes(r, s, wildcard_ref=True, wildcard_query=True)
        assert result == perfect

    # The two 'X' characters are expected to be errors for every flag combination.
    r = WILDCARD_SEQUENCES[0] + 'GCCAGG'
    for wildc_ref in (False, True):
        for wildc_query in (False, True):
            result = compare_prefixes(
                'CCCXTTXATC', r, wildcard_ref=wildc_ref, wildcard_query=wildc_query)
            assert result == (0, 10, 0, 10, 8, 2)
def match_to(self, read):
    """Try to locate this adapter in the given read.

    A plain string comparison is attempted first (skipped when adapter
    wildcards are enabled). If that finds nothing, an alignment is computed
    and checked against the matching criteria.

    Args:
        read: A :class:`Sequence` instance; its ``sequence`` attribute is
            upper-cased before any comparison.

    Returns:
        A :class:`Match` instance if a match was found; None if no alignment
        satisfies the minimum overlap length, the maximum error rate and,
        when ``max_rmp`` is set, the match-probability limit.
    """
    target = read.sequence.upper()

    # Exact comparison only makes sense when adapter characters are literal.
    if not self.adapter_wildcards:
        if self.where == PREFIX:
            exact_start = 0 if target.startswith(self.sequence) else -1
        elif self.where == SUFFIX:
            if target.endswith(self.sequence):
                exact_start = len(target) - len(self.sequence)
            else:
                exact_start = -1
        else:
            exact_start = target.find(self.sequence)

        if exact_start >= 0:
            adapter_len = len(self.sequence)
            return Match(
                0, adapter_len, exact_start, exact_start + adapter_len,
                adapter_len, 0, self._front_flag, self, read)

    # No exact hit: fall back to approximate matching.
    if not self.indels and self.where in (PREFIX, SUFFIX):
        # Anchored adapters without indels are compared position by position.
        if self.where == PREFIX:
            compare = align.compare_prefixes
        else:
            compare = align.compare_suffixes
        alignment = compare(
            self.sequence, target,
            wildcard_ref=self.adapter_wildcards,
            wildcard_query=self.read_wildcards)
    else:
        alignment = self.aligner.locate(target)
        if self.debug:
            print(self.aligner.dpmatrix)  # pragma: no cover

    if not alignment:
        return None

    astart, astop, rstart, rstop, matches, errors = alignment
    overlap = astop - astart
    accepted = overlap >= self.min_overlap and errors / overlap <= self.max_error_rate
    # The probability filter is consulted only when a limit is configured.
    if accepted and self.max_rmp is not None:
        accepted = self.match_probability(matches, overlap) <= self.max_rmp
    if not accepted:
        return None

    return Match(
        astart, astop, rstart, rstop, matches, errors,
        self._front_flag, self, read)
def match_to(self, read):
    """Attempt to match this adapter to the given read.

    An exact string match is tried first unless adapter wildcards are
    enabled; otherwise, or if that fails, an approximate alignment is
    computed and checked against the matching criteria.

    Args:
        read: Object whose ``sequence`` attribute holds the read; it is
            upper-cased before comparison.

    Returns:
        A Match instance if a match was found; None if no match was found
        given the matching criteria (minimum overlap length, maximum error
        rate and, when ``max_rmp`` is set, maximum match probability).
    """
    read_seq = read.sequence.upper()

    # Try to find an exact match first unless wildcards are allowed.
    pos = -1
    if not self.adapter_wildcards:
        if self.where == PREFIX:
            pos = 0 if read_seq.startswith(self.sequence) else -1
        elif self.where == SUFFIX:
            if read_seq.endswith(self.sequence):
                pos = len(read_seq) - len(self.sequence)
        else:
            pos = read_seq.find(self.sequence)
    if pos >= 0:
        seqlen = len(self.sequence)
        return Match(0, seqlen, pos, pos + seqlen, seqlen, 0, self._front_flag, self, read)

    # Try approximate matching.
    if not self.indels and self.where in (PREFIX, SUFFIX):
        if self.where == PREFIX:
            alignment = align.compare_prefixes(
                self.sequence, read_seq,
                wildcard_ref=self.adapter_wildcards,
                wildcard_query=self.read_wildcards)
        else:
            alignment = align.compare_suffixes(
                self.sequence, read_seq,
                wildcard_ref=self.adapter_wildcards,
                wildcard_query=self.read_wildcards)
    else:
        alignment = self.aligner.locate(read_seq)
        if self.debug:
            print(self.aligner.dpmatrix)  # pragma: no cover

    if alignment:
        astart, astop, rstart, rstop, matches, errors = alignment
        size = astop - astart
        # The probability filter is optional: it is applied only when a
        # limit (max_rmp) is configured. Testing the limit rather than the
        # method avoids comparing a number with None.
        if (size >= self.min_overlap
                and errors / size <= self.max_error_rate
                and (self.max_rmp is None
                     or self.match_probability(matches, size) <= self.max_rmp)):
            return Match(
                astart, astop, rstart, rstop, matches, errors,
                self._front_flag, self, read)

    return None