def test_compare_prefixes():
    """Check ``compare_prefixes`` on short byte-string inputs.

    The expected tuples have six fields; the last two are the number of
    matching and of mismatching positions in the compared prefix.
    """
    # 'X' is not a wildcard character: one mismatch is expected.
    assert compare_prefixes(b'AAXAA', b'AAAAATTTTTTTTT') == (0, 5, 0, 5, 4, 1)
    # With wildcards enabled for the first sequence, 'N' is expected to
    # match the 'C' at the same position.
    assert compare_prefixes(b'AANAA', b'AACAATTTTTTTTT', ALLOW_WILDCARD_SEQ1) == (0, 5, 0, 5, 5, 0)
    # Two mismatches expected: 'X' vs 'A' at the start, 'A' vs 'T' at the end.
    assert compare_prefixes(b'XAAAAA', b'AAAAATTTTTTTTT') == (0, 6, 0, 6, 4, 2)
def test_compare_prefixes():
    """Check ``compare_prefixes`` with and without wildcard matching.

    The expected tuples have six fields; the last two are the number of
    matching and of mismatching positions in the compared prefix.
    """
    # 'X' is not a wildcard character: one mismatch is expected.
    assert compare_prefixes('AAXAA', 'AAAAATTTTTTTTT') == (0, 5, 0, 5, 4, 1)
    # With wildcards enabled in the first sequence, 'N' is expected to match 'C'.
    assert compare_prefixes('AANAA', 'AACAATTTTTTTTT', wildcard_ref=True) == (0, 5, 0, 5, 5, 0)
    # Two mismatches expected: 'X' vs 'A' at the start, 'A' vs 'T' at the end.
    assert compare_prefixes('XAAAAA', 'AAAAATTTTTTTTT') == (0, 6, 0, 6, 4, 2)

    full_match = (0, 10, 0, 10, 10, 0)

    # Every wildcard sequence is expected to match the first entry
    # completely, whichever side the wildcards are on.
    first = WILDCARD_SEQUENCES[0]
    for s in WILDCARD_SEQUENCES:
        r = s + 'GCCAGGGTTGATTCGGCTGATCTGGCCG'
        result = compare_prefixes(first, r, wildcard_query=True)
        assert result == full_match, result

        result = compare_prefixes(r, first, wildcard_ref=True)
        assert result == full_match

    # Each sequence against itself plus a tail. A single loop suffices:
    # nothing in the body depends on a second sequence.
    # NOTE(review): a pairwise comparison of two different wildcard
    # sequences may have been the intent here - confirm.
    for s in WILDCARD_SEQUENCES:
        r = s + 'GCCAGGG'
        result = compare_prefixes(s, r)
        assert result == full_match

        result = compare_prefixes(r, s, wildcard_ref=True, wildcard_query=True)
        assert result == full_match

    # 'X' is expected to mismatch under every combination of wildcard options.
    r = WILDCARD_SEQUENCES[0] + 'GCCAGG'
    for wildc_ref in (False, True):
        for wildc_query in (False, True):
            result = compare_prefixes(
                'CCCXTTXATC', r, wildcard_ref=wildc_ref, wildcard_query=wildc_query)
            assert result == (0, 10, 0, 10, 8, 2)
def test_compare_prefixes():
    """Check ``compare_prefixes`` with the different ``degenerate`` flags.

    The expected tuples have six fields; the last two are the number of
    matching and of mismatching positions in the compared prefix.
    """
    # 'X' is not a wildcard character: one mismatch is expected.
    assert compare_prefixes('AAXAA', 'AAAAATTTTTTTTT') == (0, 5, 0, 5, 4, 1)
    # With wildcards enabled for the first sequence, 'N' is expected to match 'C'.
    assert compare_prefixes('AANAA', 'AACAATTTTTTTTT', ALLOW_WILDCARD_SEQ1) == (0, 5, 0, 5, 5, 0)
    # Two mismatches expected: 'X' vs 'A' at the start, 'A' vs 'T' at the end.
    assert compare_prefixes('XAAAAA', 'AAAAATTTTTTTTT') == (0, 6, 0, 6, 4, 2)

    full_match = (0, 10, 0, 10, 10, 0)
    both = ALLOW_WILDCARD_SEQ1 | ALLOW_WILDCARD_SEQ2

    # Every wildcard sequence is expected to match the first entry
    # completely, whichever side the wildcards are on.
    first = WILDCARD_SEQUENCES[0]
    for s in WILDCARD_SEQUENCES:
        r = s + 'GCCAGGGTTGATTCGGCTGATCTGGCCG'
        result = compare_prefixes(first, r, degenerate=ALLOW_WILDCARD_SEQ2)
        assert result == full_match, result

        result = compare_prefixes(r, first, degenerate=ALLOW_WILDCARD_SEQ1)
        assert result == full_match

    # Each sequence against itself plus a tail. A single loop suffices:
    # nothing in the body depends on a second sequence.
    # NOTE(review): a pairwise comparison of two different wildcard
    # sequences may have been the intent here - confirm.
    for s in WILDCARD_SEQUENCES:
        r = s + 'GCCAGGG'
        result = compare_prefixes(s, r, degenerate=both)
        assert result == full_match

        result = compare_prefixes(r, s, degenerate=both)
        assert result == full_match

    # 'X' is expected to mismatch under every flag combination.
    r = WILDCARD_SEQUENCES[0] + 'GCCAGG'
    for deg in 0, ALLOW_WILDCARD_SEQ1, ALLOW_WILDCARD_SEQ2, both:
        result = compare_prefixes('CCCXTTXATC', r, degenerate=deg)
        assert result == (0, 10, 0, 10, 8, 2)
def test_compare_prefixes():
    """Check ``compare_prefixes`` with the different ``degenerate`` flags.

    The expected tuples have six fields; the last two are the number of
    matching and of mismatching positions in the compared prefix.
    """
    # "X" is not a wildcard character: one mismatch is expected.
    assert compare_prefixes("AAXAA", "AAAAATTTTTTTTT") == (0, 5, 0, 5, 4, 1)
    # With wildcards enabled for the first sequence, "N" is expected to match "C".
    assert compare_prefixes("AANAA", "AACAATTTTTTTTT", ALLOW_WILDCARD_SEQ1) == (0, 5, 0, 5, 5, 0)
    # Two mismatches expected: "X" vs "A" at the start, "A" vs "T" at the end.
    assert compare_prefixes("XAAAAA", "AAAAATTTTTTTTT") == (0, 6, 0, 6, 4, 2)

    full_match = (0, 10, 0, 10, 10, 0)
    both = ALLOW_WILDCARD_SEQ1 | ALLOW_WILDCARD_SEQ2

    # Every wildcard sequence is expected to match the first entry
    # completely, whichever side the wildcards are on.
    first = WILDCARD_SEQUENCES[0]
    for s in WILDCARD_SEQUENCES:
        r = s + "GCCAGGGTTGATTCGGCTGATCTGGCCG"
        result = compare_prefixes(first, r, degenerate=ALLOW_WILDCARD_SEQ2)
        assert result == full_match, result

        result = compare_prefixes(r, first, degenerate=ALLOW_WILDCARD_SEQ1)
        assert result == full_match

    # Each sequence against itself plus a tail. A single loop suffices:
    # nothing in the body depends on a second sequence.
    # NOTE(review): a pairwise comparison of two different wildcard
    # sequences may have been the intent here - confirm.
    for s in WILDCARD_SEQUENCES:
        r = s + "GCCAGGG"
        result = compare_prefixes(s, r, degenerate=both)
        assert result == full_match

        result = compare_prefixes(r, s, degenerate=both)
        assert result == full_match

    # "X" is expected to mismatch under every flag combination.
    r = WILDCARD_SEQUENCES[0] + "GCCAGG"
    for deg in 0, ALLOW_WILDCARD_SEQ1, ALLOW_WILDCARD_SEQ2, both:
        result = compare_prefixes("CCCXTTXATC", r, degenerate=deg)
        assert result == (0, 10, 0, 10, 8, 2)
def match_to(self, read, match_class=Match):
    """
    Try to locate this adapter in the given read.

    An exact search is attempted first, unless wildcards are allowed in the
    adapter; if it finds nothing, an approximate comparison or alignment
    is used instead.

    Returns the object created by calling ``match_class`` when a match was
    found, and None when no match satisfies the matching criteria (minimum
    overlap length, maximum error rate).
    """
    adapter_seq = self.sequence
    adapter_len = len(adapter_seq)
    read_seq = read.sequence.upper()  # temporary copy

    # Exact search; stays at -1 when skipped or unsuccessful
    exact_pos = -1
    if not self.adapter_wildcards:
        if self.where == PREFIX:
            if read_seq.startswith(adapter_seq):
                exact_pos = 0
        elif self.where == SUFFIX:
            if read_seq.endswith(adapter_seq):
                exact_pos = len(read_seq) - adapter_len
        elif self.where in (BACK, FRONT):
            exact_pos = read_seq.find(adapter_seq)
        # TODO BACK_NOT_INTERNAL, FRONT_NOT_INTERNAL

    if exact_pos >= 0:
        # Whole adapter found verbatim: full-length match without errors
        match_args = (
            0, adapter_len, exact_pos, exact_pos + adapter_len, adapter_len, 0)
    elif not self.indels and self.where in (PREFIX, SUFFIX):
        # Approximate matching without indels: plain positional comparison
        if self.where == PREFIX:
            compare = align.compare_prefixes
        else:
            compare = align.compare_suffixes
        alignment = compare(
            adapter_seq, read_seq,
            wildcard_ref=self.adapter_wildcards,
            wildcard_query=self.read_wildcards)
        astart, astop, rstart, rstop, matches, errors = alignment
        overlap = astop - astart
        accepted = overlap >= self.min_overlap and errors / overlap <= self.max_error_rate
        match_args = alignment if accepted else None
    else:
        # Approximate matching through the aligner
        alignment = self.aligner.locate(read_seq)
        if self._debug:
            print(self.aligner.dpmatrix)  # pragma: no cover
        if alignment is None:
            match_args = None
        else:
            astart, astop, rstart, rstop, matches, errors = alignment
            match_args = (astart, astop, rstart, rstop, matches, errors)

    if match_args is None:
        return None

    if self.remove != 'auto':
        remove_before = self.remove == 'prefix'
    else:
        # guess: an alignment that starts at read position 0 (index 2 is
        # rstart) is treated as a 5' adapter
        remove_before = match_args[2] == 0

    result = match_class(*match_args, remove_before=remove_before, adapter=self, read=read)

    assert result.length > 0 and result.errors / result.length <= self.max_error_rate, result
    assert result.length >= self.min_overlap
    return result
def match(self, read):
    """
    Look for this adapter in the given read.

    Returns an AdapterMatch instance, or None if the minimum overlap
    length is not met or the error rate is too high.
    """
    adapter_seq = self.sequence
    read_seq = read.sequence.upper()

    # Exact search first; skipped when wildcards are allowed in the adapter
    exact_pos = -1
    if not self.match_adapter_wildcards:
        if self.where != PREFIX:
            exact_pos = read_seq.find(adapter_seq)
        elif read_seq.startswith(adapter_seq):
            exact_pos = 0

    if exact_pos < 0:
        # No exact hit: fall back to approximate matching
        if self.indels:
            alignment = align.globalalign_locate(
                adapter_seq, read_seq, self.max_error_rate, self.where,
                self.wildcard_flags)
        else:
            alignment = align.compare_prefixes(adapter_seq, read_seq, self.wildcard_flags)
        # TODO line-based profiling tells me that the following line
        # is slow (takes 30% of match()'s running time)
        candidate = AdapterMatch(*(alignment + (self._front_flag, self, read)))
    else:
        # Whole adapter found verbatim: full-length match without errors
        adapter_len = len(adapter_seq)
        candidate = AdapterMatch(
            0, adapter_len, exact_pos, exact_pos + adapter_len,
            adapter_len, 0, self._front_flag, self, read)

    # TODO globalalign_locate should be modified to allow the following
    # assertion.
    # assert length == 0 or match.errors / length <= self.max_error_rate
    too_short = candidate.length < self.min_overlap
    if too_short or candidate.errors / candidate.length > self.max_error_rate:
        return None
    return candidate
def match_to(self, read):
    """
    Look for this adapter in the given read.

    Returns an AdapterMatch instance, or None if the minimum overlap
    length is not met or the error rate is too high.
    """
    adapter_seq = self.sequence
    adapter_len = len(adapter_seq)
    read_seq = read.sequence.upper()

    # Exact search first; skipped when wildcards are allowed in the adapter
    exact_pos = -1
    if not self.adapter_wildcards:
        if self.where == PREFIX:
            if read_seq.startswith(adapter_seq):
                exact_pos = 0
        elif self.where == SUFFIX:
            if read_seq.endswith(adapter_seq):
                exact_pos = len(read_seq) - adapter_len
        else:
            exact_pos = read_seq.find(adapter_seq)

    if exact_pos >= 0:
        # Whole adapter found verbatim: full-length match without errors
        found = AdapterMatch(
            0, adapter_len, exact_pos, exact_pos + adapter_len,
            adapter_len, 0, self._front_flag, self, read)
    elif not self.indels and self.where in (PREFIX, SUFFIX):
        # Approximate matching without indels: plain positional comparison
        if self.where == PREFIX:
            compare = align.compare_prefixes
        else:
            compare = align.compare_suffixes
        alignment = compare(adapter_seq, read_seq, self.wildcard_flags)
        astart, astop, rstart, rstop, matches, errors = alignment
        overlap = astop - astart
        accepted = overlap >= self.min_overlap and errors / overlap <= self.max_error_rate
        if not accepted:
            return None
        found = AdapterMatch(*(alignment + (self._front_flag, self, read)))
    else:
        # Approximate matching through the aligner
        alignment = self.aligner.locate(read_seq)
        if self.debug:
            print(self.aligner.dpmatrix)
        if alignment is None:
            return None
        astart, astop, rstart, rstop, matches, errors = alignment
        found = AdapterMatch(
            astart, astop, rstart, rstop, matches, errors,
            self._front_flag, self, read)

    # Sanity checks on whatever is handed back to the caller
    assert found.length > 0 and found.errors / found.length <= self.max_error_rate, found
    assert found.length >= self.min_overlap
    return found
def match(self, read):
    """
    Try to match this adapter to the given read and return an AdapterMatch
    instance.

    An exact search is attempted first, unless wildcards are allowed in the
    adapter. Otherwise the adapter is compared against the read prefix or
    suffix (no indels), or located with the aligner (indels allowed).

    Return None if no alignment is found, if the minimum overlap length is
    not met, or if the error rate is too high.
    """
    read_seq = read.sequence.upper()
    pos = -1
    # try to find an exact match first unless wildcards are allowed
    if not self.adapter_wildcards:
        if self.where == PREFIX:
            pos = 0 if read_seq.startswith(self.sequence) else -1
        elif self.where == SUFFIX:
            pos = (len(read_seq) - len(self.sequence)) if read_seq.endswith(self.sequence) else -1
        else:
            pos = read_seq.find(self.sequence)
    if pos >= 0:
        # whole adapter found verbatim: full-length match without errors
        match = AdapterMatch(
            0,
            len(self.sequence),
            pos,
            pos + len(self.sequence),
            len(self.sequence),
            0,
            self._front_flag,
            self,
            read,
        )
    elif not self.indels:
        # approximate matching without indels: positional comparison only
        assert self.where in (PREFIX, SUFFIX)
        if self.where == PREFIX:
            alignment = align.compare_prefixes(self.sequence, read_seq, self.wildcard_flags)
        else:
            alignment = align.compare_suffixes(self.sequence, read_seq, self.wildcard_flags)
        match = AdapterMatch(*(alignment + (self._front_flag, self, read)))
    else:
        # approximate matching through the aligner
        alignment = self.aligner.locate(read_seq)
        if alignment is None:
            # no alignment at all: report "no match" instead of failing
            # on the tuple unpacking below
            return None
        astart, astop, rstart, rstop, matches, errors = alignment
        length = astop - astart
        # check before constructing the AdapterMatch object
        if length < self.min_overlap or errors / length > self.max_error_rate:
            return None
        return AdapterMatch(astart, astop, rstart, rstop, matches, errors, self._front_flag, self, read)

    # TODO Aligner.locate should be modified to allow the following
    # assertion.
    # assert length == 0 or match.errors / length <= self.max_error_rate
    if match.length < self.min_overlap or match.errors / match.length > self.max_error_rate:
        return None
    return match
def match_to(self, read):
    """
    Search the given read for this adapter.

    Returns a Match instance when a match satisfying the matching criteria
    (minimum overlap length, maximum error rate) was found; returns None
    otherwise.
    """
    adapter_seq = self.sequence
    adapter_len = len(adapter_seq)
    read_seq = read.sequence.upper()

    # Exact search first; skipped when wildcards are allowed in the adapter
    exact_pos = -1
    if not self.adapter_wildcards:
        if self.where == PREFIX:
            if read_seq.startswith(adapter_seq):
                exact_pos = 0
        elif self.where == SUFFIX:
            if read_seq.endswith(adapter_seq):
                exact_pos = len(read_seq) - adapter_len
        else:
            exact_pos = read_seq.find(adapter_seq)

    if exact_pos >= 0:
        # Whole adapter found verbatim: full-length match without errors
        found = Match(
            0, adapter_len, exact_pos, exact_pos + adapter_len,
            adapter_len, 0, self._front_flag, self, read)
    elif not self.indels and self.where in (PREFIX, SUFFIX):
        # Approximate matching without indels: plain positional comparison
        if self.where == PREFIX:
            compare = align.compare_prefixes
        else:
            compare = align.compare_suffixes
        alignment = compare(
            adapter_seq, read_seq,
            wildcard_ref=self.adapter_wildcards,
            wildcard_query=self.read_wildcards)
        astart, astop, rstart, rstop, matches, errors = alignment
        overlap = astop - astart
        accepted = overlap >= self.min_overlap and errors / overlap <= self.max_error_rate
        if not accepted:
            return None
        found = Match(*(alignment + (self._front_flag, self, read)))
    else:
        # Approximate matching through the aligner
        alignment = self.aligner.locate(read_seq)
        if self.debug:
            print(self.aligner.dpmatrix)  # pragma: no cover
        if alignment is None:
            return None
        astart, astop, rstart, rstop, matches, errors = alignment
        found = Match(
            astart, astop, rstart, rstop, matches, errors,
            self._front_flag, self, read)

    # Sanity checks on whatever is handed back to the caller
    assert found.length > 0 and found.errors / found.length <= self.max_error_rate, found
    assert found.length >= self.min_overlap
    return found
def test_compare_prefixes():
    """Check ``compare_prefixes`` with and without wildcard matching.

    The expected tuples have six fields; the last two are the number of
    matching and of mismatching positions in the compared prefix.
    """
    # 'X' is not a wildcard character: one mismatch is expected.
    assert compare_prefixes('AAXAA', 'AAAAATTTTTTTTT') == (0, 5, 0, 5, 4, 1)
    # With wildcards enabled in the first sequence, 'N' is expected to match 'C'.
    assert compare_prefixes('AANAA', 'AACAATTTTTTTTT', wildcard_ref=True) == (0, 5, 0, 5, 5, 0)
    # Two mismatches expected: 'X' vs 'A' at the start, 'A' vs 'T' at the end.
    assert compare_prefixes('XAAAAA', 'AAAAATTTTTTTTT') == (0, 6, 0, 6, 4, 2)

    full_match = (0, 10, 0, 10, 10, 0)

    # Every wildcard sequence is expected to match the first entry
    # completely, whichever side the wildcards are on.
    first = WILDCARD_SEQUENCES[0]
    for s in WILDCARD_SEQUENCES:
        r = s + 'GCCAGGGTTGATTCGGCTGATCTGGCCG'
        result = compare_prefixes(first, r, wildcard_query=True)
        assert result == full_match, result

        result = compare_prefixes(r, first, wildcard_ref=True)
        assert result == full_match

    # Each sequence against itself plus a tail. A single loop suffices:
    # nothing in the body depends on a second sequence.
    # NOTE(review): a pairwise comparison of two different wildcard
    # sequences may have been the intent here - confirm.
    for s in WILDCARD_SEQUENCES:
        r = s + 'GCCAGGG'
        result = compare_prefixes(s, r)
        assert result == full_match

        result = compare_prefixes(r, s, wildcard_ref=True, wildcard_query=True)
        assert result == full_match

    # 'X' is expected to mismatch under every combination of wildcard options.
    r = WILDCARD_SEQUENCES[0] + 'GCCAGG'
    for wildc_ref in (False, True):
        for wildc_query in (False, True):
            result = compare_prefixes(
                'CCCXTTXATC', r, wildcard_ref=wildc_ref, wildcard_query=wildc_query)
            assert result == (0, 10, 0, 10, 8, 2)
def test_compare_prefixes():
    """Check ``compare_prefixes`` with the different ``degenerate`` flags.

    The expected tuples have six fields; the last two are the number of
    matching and of mismatching positions in the compared prefix.
    """
    # 'X' is not a wildcard character: one mismatch is expected.
    assert compare_prefixes('AAXAA', 'AAAAATTTTTTTTT') == (0, 5, 0, 5, 4, 1)
    # With wildcards enabled for the first sequence, 'N' is expected to match 'C'.
    assert compare_prefixes('AANAA', 'AACAATTTTTTTTT', ALLOW_WILDCARD_SEQ1) == (0, 5, 0, 5, 5, 0)
    # Two mismatches expected: 'X' vs 'A' at the start, 'A' vs 'T' at the end.
    assert compare_prefixes('XAAAAA', 'AAAAATTTTTTTTT') == (0, 6, 0, 6, 4, 2)

    full_match = (0, 10, 0, 10, 10, 0)
    both = ALLOW_WILDCARD_SEQ1 | ALLOW_WILDCARD_SEQ2

    # Every wildcard sequence is expected to match the first entry
    # completely, whichever side the wildcards are on.
    first = WILDCARD_SEQUENCES[0]
    for s in WILDCARD_SEQUENCES:
        r = s + 'GCCAGGGTTGATTCGGCTGATCTGGCCG'
        result = compare_prefixes(first, r, degenerate=ALLOW_WILDCARD_SEQ2)
        assert result == full_match, result

        result = compare_prefixes(r, first, degenerate=ALLOW_WILDCARD_SEQ1)
        assert result == full_match

    # Each sequence against itself plus a tail. A single loop suffices:
    # nothing in the body depends on a second sequence.
    # NOTE(review): a pairwise comparison of two different wildcard
    # sequences may have been the intent here - confirm.
    for s in WILDCARD_SEQUENCES:
        r = s + 'GCCAGGG'
        result = compare_prefixes(s, r, degenerate=both)
        assert result == full_match

        result = compare_prefixes(r, s, degenerate=both)
        assert result == full_match

    # 'X' is expected to mismatch under every flag combination.
    r = WILDCARD_SEQUENCES[0] + 'GCCAGG'
    for deg in 0, ALLOW_WILDCARD_SEQ1, ALLOW_WILDCARD_SEQ2, both:
        result = compare_prefixes('CCCXTTXATC', r, degenerate=deg)
        assert result == (0, 10, 0, 10, 8, 2)
def test_compare_prefixes():
    """Check ``compare_prefixes`` on short string inputs.

    The expected tuples have six fields; the last two are the number of
    matching and of mismatching positions in the compared prefix.
    """
    # 'X' is not a wildcard character: one mismatch is expected.
    assert compare_prefixes('AAXAA', 'AAAAATTTTTTTTT') == (0, 5, 0, 5, 4, 1)
    # With wildcards enabled for the first sequence, 'N' is expected to
    # match the 'C' at the same position.
    assert compare_prefixes('AANAA', 'AACAATTTTTTTTT', ALLOW_WILDCARD_SEQ1) == (0, 5, 0, 5, 5, 0)
    # Two mismatches expected: 'X' vs 'A' at the start, 'A' vs 'T' at the end.
    assert compare_prefixes('XAAAAA', 'AAAAATTTTTTTTT') == (0, 6, 0, 6, 4, 2)
def match_to(self, read):
    """
    Attempt to match this adapter to the given read.

    Return a Match instance if a match was found;
    return None if no match was found given the matching criteria (minimum
    overlap length, maximum error rate).

    When ``self.partial_trim`` is positive, the stop position of the match
    within the read is moved left by that many bases on every code path, so
    that only part of the adapter is trimmed. The adapter-side coordinates
    and the match/error counts are left unchanged.
    """
    read_seq = read.sequence.upper()
    # HACK: number of bases by which the read-side stop position is
    # shortened; non-positive settings disable the adjustment
    partial_trim = self.partial_trim if self.partial_trim > 0 else 0

    pos = -1
    # try to find an exact match first unless wildcards are allowed
    if not self.adapter_wildcards:
        if self.where == PREFIX:
            pos = 0 if read_seq.startswith(self.sequence) else -1
        elif self.where == SUFFIX:
            pos = (len(read_seq) - len(self.sequence)) if read_seq.endswith(self.sequence) else -1
        else:
            pos = read_seq.find(self.sequence)

    # NOTE(review): nothing below prevents the adjusted stop position from
    # dropping below the start position when partial_trim exceeds the
    # matched span - confirm that callers tolerate this.
    if pos >= 0:
        # The stop position is relative to the read, so it must include
        # pos; otherwise any hit that does not start at read position 0
        # (suffix or internal matches) gets a wrong stop coordinate.
        rstop = pos + len(self.sequence) - partial_trim
        match = Match(
            0, len(self.sequence), pos, rstop,
            len(self.sequence), 0, self._front_flag, self, read)
    else:
        # try approximate matching
        if not self.indels and self.where in (PREFIX, SUFFIX):
            if self.where == PREFIX:
                alignment = align.compare_prefixes(
                    self.sequence, read_seq,
                    wildcard_ref=self.adapter_wildcards,
                    wildcard_query=self.read_wildcards)
            else:
                alignment = align.compare_suffixes(
                    self.sequence, read_seq,
                    wildcard_ref=self.adapter_wildcards,
                    wildcard_query=self.read_wildcards)
            astart, astop, rstart, rstop, matches, errors = alignment
            # the acceptance test uses the adapter-side span, which the
            # partial trim does not touch
            if astop - astart >= self.min_overlap and errors / (astop - astart) <= self.max_error_rate:
                match = Match(
                    astart, astop, rstart, rstop - partial_trim, matches, errors,
                    self._front_flag, self, read)
            else:
                match = None
        else:
            alignment = self.aligner.locate(read_seq)
            if self.debug:
                print(self.aligner.dpmatrix)  # pragma: no cover
            if alignment is None:
                match = None
            else:
                astart, astop, rstart, rstop, matches, errors = alignment
                match = Match(
                    astart, astop, rstart, rstop - partial_trim, matches, errors,
                    self._front_flag, self, read)

    if match is None:
        return None
    assert match.length > 0 and match.errors / match.length <= self.max_error_rate, match
    assert match.length >= self.min_overlap
    return match