def match(self, read):
    """
    Try to match this adapter to the given read and return an AdapterMatch
    instance.

    An exact string match is attempted first (skipped when adapter
    wildcards are enabled); if that does not succeed, approximate matching
    is done with align.globalalign_locate.

    Return None if the match is empty, if the minimum overlap length is
    not met or if the error rate is too high.
    """
    read_seq = read.sequence.upper()
    pos = -1
    # try to find an exact match first unless wildcards are allowed
    if not self.match_adapter_wildcards:
        if self.where == PREFIX:
            pos = 0 if read_seq.startswith(self.sequence) else -1
        else:
            pos = read_seq.find(self.sequence)
    if pos >= 0:
        # exact hit: the whole adapter (0..n) lines up with
        # read_seq[pos:pos + n]; the trailing n and 0 are presumably the
        # match and error counts -- confirm against AdapterMatch
        n = len(self.sequence)
        match = AdapterMatch(
            0, n, pos, pos + n, n, 0, self._front_flag, self, read)
    else:
        # try approximate matching
        alignment = align.globalalign_locate(
            self.sequence, read_seq,
            self.max_error_rate, self.where, self.wildcard_flags)
        # TODO line-based profiling tells me that the following line
        # is slow (takes 30% of match()'s running time)
        match = AdapterMatch(*(alignment + (self._front_flag, self, read)))

    # TODO globalalign_locate should be modified to allow the following
    # assertion.
    # assert length == 0 or match.errors / length <= self.max_error_rate
    length = match.length
    # An empty match is never useful, and letting it through would divide
    # by zero below whenever min_overlap is 0 or less.
    if length == 0 or length < self.min_overlap:
        return None
    # float() forces true division even where "/" on two ints floors
    # (Python 2 without "from __future__ import division"); flooring would
    # silently disable this filter.
    if float(match.errors) / length > self.max_error_rate:
        return None
    return match
def match(self, read):
    """
    Try to match this colorspace adapter to the given read and return an
    AdapterMatch instance.

    When self.where is not PREFIX, matching is delegated to the parent
    class. Otherwise the adapter is extended at the front by the color
    that encodes the transition from the read's primer base into the first
    adapter nucleotide, and that extended sequence is matched against the
    start of the read: exactly first, then approximately with
    align.globalalign_locate.

    Return None if the match is empty, if the minimum overlap length is
    not met or if the error rate is too high.
    """
    if self.where != PREFIX:
        return super(ColorspaceAdapter, self).match(read)
    # create artificial adapter that includes a first color that encodes the
    # transition from primer base into adapter
    transition = colorspace.ENCODE[read.primer + self.nucleotide_sequence[0]]
    asequence = transition + self.sequence
    if read.sequence.startswith(asequence):
        # exact hit at the very start of the read, no errors
        n = len(asequence)
        match = AdapterMatch(
            0, n, 0, n, n, 0, self._front_flag, self, read)
    else:
        # try approximate matching
        alignment = align.globalalign_locate(
            asequence, read.sequence,
            self.max_error_rate, self.where, self.wildcard_flags)
        match = AdapterMatch(*(alignment + (self._front_flag, self, read)))

    # TODO globalalign_locate should be modified to allow the following
    # assertion.
    # assert length == 0 or match.errors / length <= self.max_error_rate
    length = match.length
    # An empty match is never useful, and letting it through would divide
    # by zero below whenever min_overlap is 0 or less.
    if length == 0 or length < self.min_overlap:
        return None
    # float() forces true division even where "/" on two ints floors
    # (Python 2 without "from __future__ import division"); flooring would
    # silently disable this filter.
    if float(match.errors) / length > self.max_error_rate:
        return None
    return match
def match(self, read):
    """
    Try to match this adapter to the given read and return an AdapterMatch
    instance.

    An exact string match is attempted first (skipped when adapter
    wildcards are enabled); if that does not succeed, approximate matching
    is done with align.globalalign_locate.

    Return None if the match is empty, if the minimum overlap length is
    not met or if the error rate is too high.
    """
    read_seq = read.sequence.upper()
    pos = -1
    # try to find an exact match first unless wildcards are allowed
    if not self.match_adapter_wildcards:
        if self.where == PREFIX:
            pos = 0 if read_seq.startswith(self.sequence) else -1
        else:
            pos = read_seq.find(self.sequence)
    if pos >= 0:
        # exact hit: the whole adapter (0..n) lines up with
        # read_seq[pos:pos + n]; the trailing n and 0 are presumably the
        # match and error counts -- confirm against AdapterMatch
        n = len(self.sequence)
        match = AdapterMatch(
            0, n, pos, pos + n, n, 0, self._front_flag, self, read)
    else:
        # try approximate matching
        alignment = align.globalalign_locate(
            self.sequence, read_seq,
            self.max_error_rate, self.where, self.wildcard_flags)
        # TODO line-based profiling tells me that the following line
        # is slow (takes 30% of match()'s running time)
        match = AdapterMatch(*(alignment + (self._front_flag, self, read)))

    # TODO globalalign_locate should be modified to allow the following
    # assertion.
    # assert length == 0 or match.errors / length <= self.max_error_rate
    length = match.length
    # An empty match is never useful, and letting it through would divide
    # by zero below whenever min_overlap is 0 or less.
    if length == 0 or length < self.min_overlap:
        return None
    # float() forces true division even where "/" on two ints floors
    # (Python 2 without "from __future__ import division"); flooring would
    # silently disable this filter.
    if float(match.errors) / length > self.max_error_rate:
        return None
    return match
def seqs_align(seq1, seq2, error_rate=0.1):
    """Tell whether seq1 and seq2 align with each other end to end.

    Sequences of different lengths never align. For equal-length
    sequences, globalalign_locate(seq1, seq2, error_rate) is run and the
    sequences are considered aligned only when the span it reports for
    seq1 (elements 0 and 1 of the result) has the same start and the same
    stop as the span it reports for seq2 (elements 2 and 3).

    error_rate is handed to globalalign_locate unchanged; presumably it is
    the maximum tolerated fraction of errors -- confirm against that
    function.
    """
    if len(seq1) != len(seq2):
        return False

    # use the C bindings for a fast alignment
    located = globalalign_locate(seq1, seq2, error_rate)

    # both starts must coincide ...
    if located[0] != located[2]:
        return False
    # ... and so must both stops
    return located[1] == located[3]
def match(self, read):
    """
    Try to match this colorspace adapter to the given read and return an
    AdapterMatch instance.

    When self.where is not PREFIX, matching is delegated to the parent
    class. Otherwise the adapter is extended at the front by the color
    that encodes the transition from the read's primer base into the first
    adapter nucleotide, and that extended sequence is matched against the
    start of the read: exactly first, then approximately with
    align.globalalign_locate.

    Return None if the match is empty, if the minimum overlap length is
    not met or if the error rate is too high.
    """
    if self.where != PREFIX:
        return super(ColorspaceAdapter, self).match(read)
    # create artificial adapter that includes a first color that encodes the
    # transition from primer base into adapter
    transition = colorspace.ENCODE[read.primer + self.nucleotide_sequence[0]]
    asequence = transition + self.sequence
    if read.sequence.startswith(asequence):
        # exact hit at the very start of the read, no errors
        n = len(asequence)
        match = AdapterMatch(
            0, n, 0, n, n, 0, self._front_flag, self, read)
    else:
        # try approximate matching
        alignment = align.globalalign_locate(
            asequence, read.sequence,
            self.max_error_rate, self.where, self.wildcard_flags)
        match = AdapterMatch(*(alignment + (self._front_flag, self, read)))

    # TODO globalalign_locate should be modified to allow the following
    # assertion.
    # assert length == 0 or match.errors / length <= self.max_error_rate
    length = match.length
    # An empty match is never useful, and letting it through would divide
    # by zero below whenever min_overlap is 0 or less.
    if length == 0 or length < self.min_overlap:
        return None
    # float() forces true division even where "/" on two ints floors
    # (Python 2 without "from __future__ import division"); flooring would
    # silently disable this filter.
    if float(match.errors) / length > self.max_error_rate:
        return None
    return match
def test_polya():
    """Locate a run of A's inside a read that has a long poly-A tail.

    Calls globalalign_locate with an error rate of 0.0 and the BACK flag
    and expects the whole query to be matched without errors, starting at
    index 4 of the read.
    """
    query = 'AAAAAAAAAAAAAAAAA'
    read = 'ACAGAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA'
    query_len = len(query)
    # result layout: (start_s, stop_s, start_t, stop_t, matches, cost)
    expected = (0, query_len, 4, 4 + query_len, query_len, 0)
    located = globalalign_locate(query, read, 0.0, BACK)
    assert located == expected