def get_all_neighbors(self):
    """Register neighbour relationships between overlapping contigs.

    For every contig in ``self.contigs`` and every entry of its ``overlaps``
    mapping, the first alignment record is inspected.  ``alignment[0][0]`` is
    unpacked as the coordinate pair on this contig and ``alignment[0][1]`` as
    the coordinate pair on the other contig.  Depending on the orientation of
    the second pair and on where the overlap starts, ``make_neighbors`` is
    called with the two contigs in the appropriate order.  When the other
    contig aligns in the reversed direction it is reverse-complemented in
    place (once) and flagged as ``reversed``.
    """

    def flip_once(contig):
        # TODO: this...probably needs to be more sensitive
        if not contig.reversed:
            contig.sequence = util.reverse_complement(contig.sequence)
            contig.reversed = True

    for current in self.contigs.values():
        for neighbor, alignment in current.overlaps.iteritems():
            first_hit = alignment[0]
            x, _ = first_hit[0]
            a, b = first_hit[1]
            other_length = self.contigs[neighbor.name].length

            if a < b:
                # same direction
                if x == 1:
                    # case 2
                    make_neighbors(neighbor, current)
                elif a == 1:
                    # case 1
                    make_neighbors(current, neighbor)
            elif a > b:
                # 'other' is in reversed direction
                if x == 1:
                    make_neighbors(current, neighbor)
                    flip_once(neighbor)
                elif a == other_length:
                    # case 4
                    make_neighbors(neighbor, current)
                    flip_once(neighbor)
def _extend_match(self, pair, ap):
    """Extend the match of both reads by alignment, trimming adapters in between.

    R2 is aligned first (with the expected adapter suffix appended unless this
    is a cotrans run), then adapters are trimmed via ``_trim_adapters``, and
    finally R1 is aligned if it is not already fully matched.

    :param pair: the read pair being processed; mutated in place.
    :param ap: alignment parameters forwarded to ``extend_alignment``.
    :return: ``True`` on success, ``False`` if adapter trimming failed (in which
        case ``pair.failure`` is set if it was not already).
    """
    run = self._run
    masklen = pair.mask.length()
    dumblen = len(run.dumbbell) if run.dumbbell else 0

    ## Align remaining R2 first to find adapter overhang.
    ## TAI: probably better way of doing adapter alignment (see indels_7 testcase)
    if run.cotrans:
        r2suffix = ""
    else:
        handle_rc = reverse_complement(pair.r1.original_seq[:masklen])
        r2suffix = handle_rc + reverse_complement(run.adapter_t)
    pair.r2.extend_alignment(pair.target, ap, r2suffix)
    if run.dumbbell:
        pair.dumbbell = pair.r2.match_index - dumblen

    ## Trim the adapters off both R1 and R2
    trimmed_ok = self._trim_adapters(pair)
    if not trimmed_ok:
        if not pair.failure:
            pair.failure = Failures.adapter_trim
            self.counters.adapter_trim_failure += pair.multiplicity
        return False

    ## Now align remaining R1 if necessary
    if pair.r1.fully_matched:
        return True
    pair.r1.extend_alignment(pair.target, ap)
    # we may have not trimmed enough
    r1_overhang = pair.r1.right_est - pair.target.n
    if r1_overhang > 0:
        pair.r1.ltrim += r1_overhang    # also updates match_len
    return True
def build_cotrans_lookups(self, run):
    """Pre-compute the R1 lookup table (and R2 lookup) for a cotrans run.

    Every key of the table is a possible error-free R1 (after the handle):
    the reverse complement of "tail of target prefix + linker", followed by
    the first ``i`` bases of adapter_b.  Each value is a list of
    ``(target, end, adapter bases to trim, mutations)`` tuples.  When
    ``run.count_mutations`` is set, every single-base substitution of the
    target portion is registered as well.

    :param run: run configuration; reads ``cotrans_linker``, ``adapter_b``,
        ``pair_length`` and ``count_mutations``.
    :raises Exception: if there is not exactly one target.
    """
    # for cotrans experiments, R1 includes a linker and is relatively restricted
    # store RC in table so that we can directly lookup R1[4:]

    # TODO: this could be set to run.minimum_target_match_length, but given that there's linker
    # involved, it makes sense to assume that a match including linker will hit that minimum.
    # so for this we can include very small bits of the actual target.
    minimum_target_length = 3

    linker = run.cotrans_linker
    linker_len = len(linker)
    r1_table = {}

    def register(key, entry):
        # append to the bucket for `key`, creating the bucket on first use
        r1_table.setdefault(key, []).append(entry)

    if len(self.targets) != 1:
        raise Exception("cotrans requires one target")
    target = self.targets[0]
    tseq = target.seq
    tlen = len(tseq)

    adapter_b = run.adapter_b
    pair_len = run.pair_length
    assert(0 < pair_len)
    masklen = 4  # TODO
    r1_match_len = pair_len - masklen
    target_room = r1_match_len - linker_len   # R1 bases left for target once linker is accounted for

    for end in xrange(minimum_target_length, tlen + 1):
        target_subseq = tseq[:end]
        for i in xrange(0, target_room - minimum_target_length + 1):
            tstart = i - target_room
            if tstart + end < 0:
                continue
            target_bit = target_subseq[tstart:]
            r1_match = reverse_complement(target_bit + linker) + adapter_b[:i]
            # target, end, amount of adapter to trim, mutations
            register(r1_match, (target, end, i, []))

            if not run.count_mutations:
                continue
            bit_len = len(target_bit)
            for toggle_idx in xrange(bit_len):
                for nt in ['A', 'C', 'G', 'T']:
                    if nt == target_bit[toggle_idx]:
                        continue
                    mutated_bit = target_bit[:toggle_idx] + nt + target_bit[toggle_idx + 1:]
                    mutated_match = reverse_complement(mutated_bit + linker) + adapter_b[:i]
                    register(mutated_match,
                             (target, end, i, [end - (bit_len - toggle_idx) + 1]))

    self.r1_lookup = r1_table
    self._build_R1_aliases(adapter_b, r1_match_len)

    # we only need to build R2 lookups for full sequences (excepting linker)
    # trim cases are just tested against R1
    self._build_R2_lookup(pair_len - linker_len - masklen, run.count_mutations)
def _check_targetrc(self, pair):
    """Count ``pair`` as a DNA residual if its reverse complement matches.

    A fresh ``Pair`` is built from the reverse complements of both original
    reads and run through ``_find_matches`` (and, for cotrans runs, through
    ``_cotrans_find_short_matches`` as a fallback).  If either finds a match,
    ``self.counters.dna_residual_pairs`` is incremented by one.  The original
    ``pair`` is not modified.
    """
    cotrans = self._run.cotrans

    rcpair = Pair()
    rcpair.set_from_data(pair.identifier,
                         reverse_complement(pair.r1.original_seq),
                         reverse_complement(pair.r2.original_seq))
    if cotrans:
        rcpair.r2.linker_start = pair.r2.linker_start

    self._find_matches(rcpair)
    found = rcpair.matched
    if not found and cotrans:
        found = self._cotrans_find_short_matches(rcpair)
    if found:
        self.counters.dna_residual_pairs += 1
def _cotrans_find_short_matches(self, pair):
    """Try to match a cotrans pair whose target portion is short.

    Uses the part of each read before R2's linker start (``[:r2li]``) and
    searches it in the single target with ``string_find_errors``.

    :return: ``True`` if exactly one location was found for both reads, in
        which case match fields, ``pair.target`` and ``pair.linker`` are
        filled in.  ``False`` otherwise, with ``pair.failure`` set to
        ``Failures.nomatch`` (and the unmatched counter bumped) or to
        ``Failures.multiple_R1``.
    """
    run = self._run

    def reject_unmatched():
        self.counters.unmatched += pair.multiplicity
        pair.failure = Failures.nomatch
        return False

    r2li = pair.r2.linker_start
    if r2li <= 0 or pair.r2.original_len - r2li < len(run.cotrans_linker):
        return reject_unmatched()

    target = self._targets.targets[0]
    r2_query = pair.r2.subsequence[:r2li]
    r2spots = string_find_errors(r2_query, target.seq, run.allowed_target_errors)
    r1_query = reverse_complement(pair.r1.subsequence[:r2li])
    r1spots = string_find_errors(r1_query, target.seq, run.allowed_target_errors)

    n_r1, n_r2 = len(r1spots), len(r2spots)
    if n_r1 == 0 or n_r2 == 0 or n_r1 != n_r2:
        return reject_unmatched()
    if n_r1 > 1 or n_r2 > 1:
        pair.failure = Failures.multiple_R1
        return False

    pair.target = target

    r1 = pair.r1
    r1.match_index = r1spots[0]
    r1.match_start = r1.seq_len - r2li
    r1.match_len = r2li

    r2 = pair.r2
    r2.match_index = r2spots[0]
    r2.match_start = 0
    r2.match_len = r2li

    pair.fully_matched = True    # also sets the trim
    pair.linker = r1.match_index + r1.match_len
    return True
def basic_indel(self, indel):
    """Splice the insertion described by ``indel`` into contig1's sequence.

    The new sequence is ``contig1[:b] + middle + contig1[c-1:]`` where
    ``middle`` is taken from contig2 (reverse-complemented when the indel
    direction is ``"reverse"``).  The entry for contig1 in ``self.contigs``
    is updated (sequence, length, origin), and the overlap bookkeeping
    between the two contigs is removed; contig2 is deleted from
    ``self.contigs`` if it has no overlaps left.

    :param indel: object exposing ``contig1``, ``contig2``, ``direction`` and
        the coordinates ``b``, ``c``, ``y``, ``z``.
    :raises ValueError: if ``indel.direction`` is neither ``"forward"`` nor
        ``"reverse"`` (previously this surfaced as an unbound-local error).
    """
    my_sequence = indel.contig1.sequence
    other_sequence = indel.contig2.sequence

    if indel.direction == "forward":
        indel_middle = other_sequence[indel.y:indel.z - 1]
    elif indel.direction == "reverse":
        indel_middle = util.reverse_complement(other_sequence[indel.z:indel.y - 1])
    else:
        raise ValueError(
            "cannot build indel with direction %r" % (indel.direction,))

    indel_sequence = my_sequence[:indel.b] + indel_middle + my_sequence[indel.c - 1:]

    merged = self.contigs[indel.contig1.name]
    merged.sequence = indel_sequence
    merged.length = len(indel_sequence)
    merged.origin = "indel"
    # TODO: adjust this to keep a running trail of added contigs

    try:
        del self.contigs[indel.contig2.name].overlaps[indel.contig1]
        if len(self.contigs[indel.contig2.name].overlaps) == 0:
            # Format *inside* the call: `print("...") % x` only works with the
            # Python 2 print statement and raises TypeError on Python 3.
            print("Deleted contig %s" % indel.contig2.name)
            del self.contigs[indel.contig2.name]
        # else: update overlaps
        # MAJOR TODO: also, need to handle updating overlap info for contig1
        del self.contigs[indel.contig1.name].overlaps[indel.contig2]
    except Exception:
        # Best-effort bookkeeping: report and carry on, but do not swallow
        # KeyboardInterrupt/SystemExit the way a bare `except:` would.
        printwithtime("Hey, you've probably found an error, please report this")
def _verify_full_match(self, pair):
    """Trim and validate adapters for a pair that is already fully matched.

    For non-cotrans runs, anything R2 covers beyond ``maxmatch`` (the smaller
    of the target length and the end of R1's match) is trimmed from R2.
    Any adapter sequence recorded as trimmed from R2 or R1 is then checked
    with ``check_adapter_trim``.

    :param pair: the read pair being processed; mutated in place.
    :return: ``False`` if an adapter check fails (``pair.failure`` is set to
        ``Failures.adapter_trim`` and the failure counter is bumped);
        otherwise the result of ``self._recheck_targets(pair)``.
    """
    run = self._run
    if not run.cotrans:
        # R2 may not extend past the target end or past the end of R1's match
        maxmatch = min(pair.target.n, pair.r1.match_index + pair.r1.match_len)
        if pair.r2.match_index + pair.r2.seq_len > maxmatch:
            masklen = pair.mask.length()
            trimmed = pair.r2.right_est - maxmatch
            if trimmed > masklen:
                # keep only the part of the overhang beyond the first `masklen`
                # bases as "adapter" (negative start index counts from the end)
                pair.r2.adapter_trimmed = pair.r2.subsequence[masklen - trimmed:]
            pair.r2.rtrim = trimmed
    if pair.r2.adapter_trimmed:
        if not pair.r2.check_adapter_trim(
                reverse_complement(run.adapter_t), run):
            pair.failure = Failures.adapter_trim
            self.counters.adapter_trim_failure += pair.multiplicity
            return False
        self.counters.adapter_trimmed += pair.multiplicity
    if pair.r1.match_index == 0 and pair.r1.match_start > 0:
        # R1 matches at target position 0 but has unmatched bases before its
        # match start: treat those bases as adapter and trim them
        pair.r1.adapter_trimmed = pair.r1.reverse_complement[-pair.r1.match_start:]
        pair.r1.rtrim += pair.r1.match_start
        if pair.linker:
            pair.linker -= pair.r1.match_start
        pair.r1.match_start = 0
    # Note: if we had a full dumbbell, it would have been entirely trimmed already
    # So we either have no dumbbell and an adapter or a partial dumbbell only.
    if not run.dumbbell and pair.r1.adapter_trimmed:
        if not pair.r1.check_adapter_trim(run.adapter_b, run):
            pair.failure = Failures.adapter_trim
            self.counters.adapter_trim_failure += pair.multiplicity
            return False
        #self.counters.adapter_trimmed += pair.multiplicity # Note: only counting R2 trimming
    return self._recheck_targets(pair)
def _build_R1_lookup(self, adapter_b, length = 31, end_only = True, mutations = False, dumbbell = None):
    """Pre-compute the table of all error-free R1 candidates for every target.

    Each key is ``length`` bases (at most) made of the first ``i`` bases of
    the target's reverse complement followed by dumbbell reverse complement
    and/or adapter_b.  Each value is a list of
    ``(target, end, adapter bases to trim, mutations)`` tuples.  With
    ``mutations`` set, every single-base substitution within the target part
    of each candidate is registered too.  The result is stored in
    ``self.r1_lookup`` and ``_build_R1_aliases`` is invoked afterwards.

    :param adapter_b: adapter sequence that may follow the target in R1.
    :param length: length of R1 candidates to generate.
    :param end_only: accepted for interface compatibility; not read here.
    :param mutations: also register single-base substitutions.
    :param dumbbell: optional dumbbell sequence.
    """
    # we can pre-build the set of all possible (error-free) R1, b/c:
    #  - R1 has to include the right-most nt
    #  - R1 can include some adapter-b (or dumbbell) off the end
    #  - this is done for each target
    #  - note that in cases where R1 includes some (enough) adapter, then position and content of R2 is determined
    # note that this does *not* include the handle.
    r1_table = {}

    def register(key, entry):
        # append to the bucket for `key`, creating the bucket on first use
        r1_table.setdefault(key, []).append(entry)

    for target in self.targets:
        tlen = target.n
        rc_tgt = reverse_complement(target.seq)
        rc_dumbbell = reverse_complement(dumbbell) if dumbbell else None
        tcandidates = 0
        for i in xrange(1, length + 1):
            room = length - i    # bases of R1 not covered by target
            if not rc_dumbbell:
                tail = adapter_b[:room]
            elif room <= len(rc_dumbbell):
                tail = rc_dumbbell[:room]
            else:
                tail = rc_dumbbell + adapter_b[:room - len(rc_dumbbell)]
            r1_candidate = rc_tgt[:i] + tail

            end = None if i == length else tlen - i
            # target, end, amount of adapter to trim, mutations
            register(r1_candidate, (target, end, room, []))
            tcandidates += 1

            if not mutations:
                continue
            for toggle_idx in xrange(i):
                for nt in ['A', 'C', 'G', 'T']:
                    if nt == r1_candidate[toggle_idx]:
                        continue
                    mutated_bit = r1_candidate[:toggle_idx] + nt + r1_candidate[toggle_idx + 1:]
                    register(mutated_bit, (target, end, room, [tlen - toggle_idx]))
        if tcandidates == 0:
            _warn("!! No R1 match candidates for {}".format(target.name))

    self.r1_lookup = r1_table
    self._build_R1_aliases(adapter_b, length)
def encode(dna, peptide):
    """Return the substrings of ``dna`` that encode ``peptide`` on either strand.

    A window of ``3 * len(peptide)`` bases is slid over ``dna``.  A window is
    reported when its transcript translates to ``peptide``, or when the
    corresponding window of the reverse-complement transcript does.  The
    returned substrings are always taken from ``dna`` itself, in order of
    position.
    """
    rna = transcribe(dna)
    rc_rna = transcribe(reverse_complement(dna))
    total = len(rna)
    window = len(peptide) * 3

    hits = []
    for start in range(0, total - window + 1):
        stop = start + window
        if translate(rna[start:stop]) == peptide:
            hits.append(dna[start:stop])
        elif translate(rc_rna[total - stop:total - start]) == peptide:
            # same window, read from the opposite strand
            hits.append(dna[start:stop])
    return hits
def valid(exclude, seq):
    """Return ``True`` if ``seq`` contains none of the excluded motifs.

    A motif counts as present if either it or its reverse complement occurs
    anywhere in ``seq``.  An empty or missing ``exclude`` accepts everything.
    """
    if not exclude:
        return True
    return not any(
        seq.find(motif) >= 0 or seq.find(util.reverse_complement(motif)) >= 0
        for motif in exclude
    )
def _trim_adapters(self, pair):
    """Trim adapter overhang from R2 and then R1, and re-check the target.

    :param pair: the read pair being processed; mutated in place.
    :return: ``False`` if an adapter check fails or the R1 adapter is shorter
        than ``run.minimum_adapter_len``; ``True`` if R1 was already fully
        right-trimmed; otherwise the result of ``self._recheck_targets(pair)``.
        This method does not itself set ``pair.failure``.
    """
    run = self._run
    masklen = pair.mask.length()
    if not run.cotrans:
        # trim everything beyond the end of the target (including mask if there)
        maxr2 = pair.r1.right_est if (
            run.allow_multiple_rt_starts
            and pair.r1.right_est < pair.target.n) else pair.target.n
        full_trim = pair.r2.right_est - maxr2
        if full_trim > 0:
            if full_trim > masklen:
                # only the overhang beyond the first `masklen` bases is recorded as adapter
                pair.r2.adapter_trimmed = pair.r2.subsequence[masklen - full_trim:]
            pair.r2.rtrim += full_trim    # also updates indels and match_len in r2
    if pair.r2.adapter_trimmed:
        if not pair.r2.check_adapter_trim(
                reverse_complement(run.adapter_t), run):
            return False
        self.counters.adapter_trimmed += pair.multiplicity
    if pair.r1.fully_rtrimmed:
        # R1 needs no further trimming; note this returns without _recheck_targets
        return True
    ## Note: we really shouldn't trim r1 prior to aligning b/c it could have big inserts, but
    ## trimming greatly saves alignment time, so compromise by assuming that indels (mostly) match
    ## between reads so use R2's indels_delta to buffer trim amount.
    if run.cotrans:
        r2id = pair.r2.indels_delta_before(pair.r1.match_index)
        r1_adapter_len = pair.r1.match_start - (
            pair.r1.match_index - pair.r2.match_index + r2id)
    else:
        longest = pair.r1.right_est if (
            run.allow_multiple_rt_starts
            and pair.r1.right_est < pair.target.n) else pair.target.n
        possible_matchlen = min(
            longest - max(0, pair.r2.match_index - pair.r2.match_start),
            pair.r1.seq_len + pair.r1.match_index - pair.r1.match_start)
        r1_adapter_len = pair.r1.seq_len - possible_matchlen - pair.r2.indels_delta
    dumblen = len(run.dumbbell) if run.dumbbell else 0
    if run.minimum_adapter_len and r1_adapter_len - dumblen < run.minimum_adapter_len:
        return False
    if r1_adapter_len > 0:
        if r1_adapter_len > dumblen:
            # the first `dumblen` bases of the overhang are dumbbell, the rest adapter
            pair.r1.adapter_trimmed = pair.r1.subsequence[dumblen - r1_adapter_len:]
            if not pair.r1.check_adapter_trim(run.adapter_b, run):
                return False
        pair.r1.rtrim += r1_adapter_len
        pair.r1.match_start -= r1_adapter_len
        if pair.linker:
            pair.linker -= r1_adapter_len
        if pair.r1.match_start < 0:
            # trimmed into the matched region: shift the match right and
            # shorten it by the same amount, then clamp match_start to 0
            pair.r1.match_index -= pair.r1.match_start
            pair.r1.match_len += pair.r1.match_start
            pair.r1.match_start = 0
        #self.counters.adapter_trimmed += pair.multiplicity # Note: only counting R2 trimming
    return self._recheck_targets(pair)
def _validate(self, seq):
    """Return ``True`` if ``seq``, placed in its flanking context, avoids all excluded motifs.

    For each motif ``e`` in ``self.exclude``, ``seq`` is extended with the
    last ``len(e) - 1`` bases of ``self.upstream`` and the first
    ``len(e) - 1`` bases of ``self.downstream`` so that motifs spanning a
    junction are detected, but motifs lying entirely in the flanks are not.
    A motif counts as present if it or its reverse complement occurs.

    :param seq: candidate sequence.
    :return: ``False`` as soon as one motif is found, ``True`` otherwise (or
        when ``self.exclude`` is empty).
    """
    if not self.exclude:
        return True
    for e in self.exclude:
        flank = max(len(e) - 1, 0)
        # A zero-width flank must be special-cased: `upstream[-0:]` is the
        # *entire* upstream, which made single-base motifs match anywhere in
        # the upstream context.  `[:0]` keeps the sequence type while
        # selecting nothing.
        left = self.upstream[-flank:] if flank else self.upstream[:0]
        right = self.downstream[:flank]
        test_seq = left + seq + right
        if (test_seq.find(e) >= 0 or
                test_seq.find(util.reverse_complement(e)) >= 0):
            return False
    return True
def __init__(self, contig1, contig2, name=None):
    """Describe an indel between two contigs from their first two overlap records.

    ``contig1.overlaps[contig2]`` is expected to hold at least two alignment
    records, each a pair of coordinate pairs.  Record 0 yields ``(a, b)`` on
    contig1 and ``(x, y)`` on contig2; record 1 yields ``(c, d)`` and
    ``(z, w)``.

    ``direction`` is ``"forward"`` if ``x < w``, ``"reverse"`` if ``x > w``
    and ``"confused"`` otherwise.  ``R`` and ``I`` (slices of contig1 and
    contig2 respectively) are only set for the forward and reverse cases.

    :param contig1: contig whose ``overlaps`` mapping is consulted.
    :param contig2: the overlapping contig (used as key into that mapping).
    :param name: optional label; defaults to ``"<contig1> indel <contig2>"``.
        Previously this argument was accepted but ignored.
    """
    self.contig1 = contig1
    self.contig2 = contig2

    hits = contig1.overlaps[contig2]
    self.a, self.b = hits[0][0]
    self.c, self.d = hits[1][0]
    self.x, self.y = hits[0][1]
    self.z, self.w = hits[1][1]

    if name is not None:
        self.name = name
    else:
        self.name = str(self.contig1) + " indel " + str(self.contig2)

    if self.x < self.w:
        self.direction = "forward"
    elif self.x > self.w:
        self.direction = "reverse"
    else:
        self.direction = "confused"

    if self.direction == "forward":
        self.R = contig1.sequence[self.c - 1:self.b - 1]
        self.I = contig2.sequence[self.y - 1:self.z - 1]
    elif self.direction == "reverse":
        self.R = util.reverse_complement(contig1.sequence[self.b - 1:self.c - 1])
        self.I = util.reverse_complement(contig2.sequence[self.z - 1:self.y - 1])
def _make_adapter_line(self, part, adapter, label):
    """Add diagram line(s) showing where the adapter (and dumbbell) sits for ``part``.

    For R1, the adapter is drawn reversed, preceded by an optional line of
    error markers (``!`` at error positions, ``|`` at blank ends), followed
    by a ``c(DUMBBELL)`` line when the run has a dumbbell.  For any other
    part (R2), a single line is drawn with the dumbbell and/or the start of
    the adapter followed by ``...``.

    :param part: the read being drawn (compared against ``self.pair.r1``).
    :param adapter: adapter sequence to draw; may be ``None`` for R2.
    :param label: prefix label for the adapter line.
    """
    if part == self.pair.r1:
        if part.right and part.rtrim > self.dumblen:
            alen = part.rtrim - self.dumblen
            line = self._make_prefix(label)
            line, _ = self._adj_front(part, line)
            if alen > len(adapter):
                line += sp(alen - len(adapter))
                alen = len(adapter)
            if part.adapter_errors:
                marks = sp(alen)
                for pos in part.adapter_errors:
                    if pos < alen:
                        marks = marks[:pos] + "!" + marks[pos + 1:]
                marks = marks[::-1]
                if marks[0] == " ":
                    marks = "|" + marks[1:]
                if marks[-1] == " ":
                    marks = marks[:-1] + "|"
                self._add_line(line + marks)
            self._add_line(line + adapter[:alen][::-1])
        if self.dumblen > 0:
            dumbbell_part = min(part.rtrim, self.dumblen)
            line = self._make_prefix('c(DUMBBELL)')
            line, _ = self._adj_front(part, line)
            line += sp(part.rtrim - dumbbell_part)
            line += reverse_complement(self.run.dumbbell[:dumbbell_part])[::-1]
            self._add_line(line)
        return

    if part.left is None:
        return
    if not (part.rtrim > self.masklen + self.linkerlen or self.dumblen > 0):
        return

    line = self._make_prefix(label)
    line, match_index = self._adj_front(part, line)
    pieces = [line]
    if self.dumblen > 0:
        pieces.append(self.run.dumbbell)
        pieces.append(sp(part.ltrim - len(self.run.dumbbell)))
    else:
        pieces.append(sp(match_index))
    pieces.append(sp(part.match_len + 1))
    pieces.append(sp(self.masklen + 1))
    if self.run.cotrans:
        pieces.append(sp(self.linkerlen))
    if adapter and part.rtrim > self.masklen + self.linkerlen:
        shown = adapter[:part.rtrim - self.masklen - self.linkerlen]
        pieces.append(shown + "...")
    self._add_line("".join(pieces))
def run():
    """Remove primers from the reads of a FASTA/FASTQ file.

    Reads the command line via ``parse_args()``: a primer (``args.p``) or a
    file with one primer per line (``args.l``), and a FASTA (``args.f``) or
    FASTQ (``args.q``) input.  For every record the best primer match is
    located with ``find_best_match`` and everything up to and including the
    primer is cut from fields 1 and 3 of the record.  With ``args.b`` set,
    a primer must additionally be found on the reverse complement of the
    trimmed read and is cut from the other end.  Only records that were
    trimmed are printed to stdout; totals go to stderr.

    Exits via ``SystemExit`` with an error message if no primer or no input
    file was specified.
    """
    # Get command line arguments
    args = parse_args()

    # Get primer sequences
    if args.p:
        primers = [args.p]
    elif args.l:
        # `with` closes the primer list even if reading fails
        with open(args.l) as primer_file:
            primers = [line.rstrip() for line in primer_file]
    else:
        sys.exit('Error: must specify primer or primer list')

    # Get FASTA/FASTQ iterators
    if args.f:
        fn = args.f
        iter_fst = util.iter_fst
    elif args.q:
        fn = args.q
        iter_fst = util.iter_fsq
    else:
        sys.exit('Error: must specify FASTA or FASTQ file')

    # Iterate through FASTA/FASTQ file
    n = 0  # records with primers removed
    t = 0  # records seen
    for record in iter_fst(fn):
        t += 1
        seq = record[1]
        i, _, p = find_best_match(seq, primers, args.w, args.d)
        if i == '':
            # find_best_match reports "no match" as an empty-string index
            continue
        cut = i + len(p)
        record[1] = record[1][cut:]
        record[3] = record[3][cut:]
        if not args.b:
            # NOTE(review): assumes args.b is a plain boolean flag
            print('\n'.join(record))
            n += 1
        else:
            # also require (and remove) a primer at the other end
            rc_seq = util.reverse_complement(record[1])
            rc_i, _, rc_p = find_best_match(rc_seq, primers, args.w, args.d)
            if rc_i != '':
                rc_cut = rc_i + len(rc_p)
                record[1] = record[1][:-rc_cut]
                record[3] = record[3][:-rc_cut]
                print('\n'.join(record))
                n += 1

    # guard against an empty input file (previously a ZeroDivisionError)
    percent = 100. * n / t if t else 0.
    sys.stderr.write('%d total sequences\n' % (t))
    sys.stderr.write('%d primers removed (%.2f%%)\n' % (n, percent))
def run():
    """Remove primers from the reads of a FASTA/FASTQ file.

    Reads the command line via ``parse_args()``: a primer (``args.p``) or a
    file with one primer per line (``args.l``), and a FASTA (``args.f``) or
    FASTQ (``args.q``) input.  For every record the best primer match is
    located with ``find_best_match`` and everything up to and including the
    primer is cut from fields 1 and 3 of the record.  With ``args.b`` set,
    a primer must additionally be found on the reverse complement of the
    trimmed read and is cut from the other end.  Only records that were
    trimmed are printed to stdout; totals go to stderr.

    Exits via ``SystemExit`` with an error message if no primer or no input
    file was specified.
    """
    # Get command line arguments
    args = parse_args()

    # Get primer sequences
    if args.p:
        primers = [args.p]
    elif args.l:
        # `with` closes the primer list even if reading fails
        with open(args.l) as primer_file:
            primers = [line.rstrip() for line in primer_file]
    else:
        sys.exit('Error: must specify primer or primer list')

    # Get FASTA/FASTQ iterators
    if args.f:
        fn = args.f
        iter_fst = util.iter_fst
    elif args.q:
        fn = args.q
        iter_fst = util.iter_fsq
    else:
        sys.exit('Error: must specify FASTA or FASTQ file')

    # Iterate through FASTA/FASTQ file
    n = 0  # records with primers removed
    t = 0  # records seen
    for record in iter_fst(fn):
        t += 1
        seq = record[1]
        i, _, p = find_best_match(seq, primers, args.w, args.d)
        if i == '':
            # find_best_match reports "no match" as an empty-string index
            continue
        cut = i + len(p)
        record[1] = record[1][cut:]
        record[3] = record[3][cut:]
        if not args.b:
            # NOTE(review): assumes args.b is a plain boolean flag
            print('\n'.join(record))
            n += 1
        else:
            # also require (and remove) a primer at the other end
            rc_seq = util.reverse_complement(record[1])
            rc_i, _, rc_p = find_best_match(rc_seq, primers, args.w, args.d)
            if rc_i != '':
                rc_cut = rc_i + len(rc_p)
                record[1] = record[1][:-rc_cut]
                record[3] = record[3][:-rc_cut]
                print('\n'.join(record))
                n += 1

    # guard against an empty input file (previously a ZeroDivisionError)
    percent = 100. * n / t if t else 0.
    sys.stderr.write('%d total sequences\n' % (t))
    sys.stderr.write('%d primers removed (%.2f%%)\n' % (n, percent))
def _make_adapter_line(self, part, adapter, label):
    """Add diagram line(s) showing where the adapter (and dumbbell) sits for ``part``.

    For R1, the adapter is drawn reversed, preceded by an optional line of
    error markers (``!`` at error positions, ``|`` at blank ends), followed
    by a ``c(DUMBBELL)`` line when the run has a dumbbell.  For any other
    part (R2), a single line is drawn with the dumbbell and/or the start of
    the adapter followed by ``...``.

    :param part: the read being drawn (compared against ``self.pair.r1``).
    :param adapter: adapter sequence to draw; may be ``None`` for R2.
    :param label: prefix label for the adapter line.
    """
    if part == self.pair.r1:
        if part.right and part.rtrim > self.dumblen:
            alen = part.rtrim - self.dumblen
            line = self._make_prefix(label)
            line, _ = self._adj_front(part, line)
            if alen > len(adapter):
                line += sp(alen - len(adapter))
                alen = len(adapter)
            if part.adapter_errors:
                marks = sp(alen)
                for pos in part.adapter_errors:
                    if pos < alen:
                        marks = marks[:pos] + "!" + marks[pos + 1:]
                marks = marks[::-1]
                if marks[0] == " ":
                    marks = "|" + marks[1:]
                if marks[-1] == " ":
                    marks = marks[:-1] + "|"
                self._add_line(line + marks)
            self._add_line(line + adapter[:alen][::-1])
        if self.dumblen > 0:
            dumbbell_part = min(part.rtrim, self.dumblen)
            line = self._make_prefix('c(DUMBBELL)')
            line, _ = self._adj_front(part, line)
            line += sp(part.rtrim - dumbbell_part)
            line += reverse_complement(self.run.dumbbell[:dumbbell_part])[::-1]
            self._add_line(line)
        return

    if part.left is None:
        return
    if not (part.rtrim > self.masklen + self.linkerlen or self.dumblen > 0):
        return

    line = self._make_prefix(label)
    line, match_index = self._adj_front(part, line)
    pieces = [line]
    if self.dumblen > 0:
        pieces.append(self.run.dumbbell)
        pieces.append(sp(part.ltrim - len(self.run.dumbbell)))
    else:
        pieces.append(sp(match_index))
    pieces.append(sp(part.match_len + 1))
    pieces.append(sp(self.masklen + 1))
    if self.run.cotrans:
        pieces.append(sp(self.linkerlen))
    if adapter and part.rtrim > self.masklen + self.linkerlen:
        shown = adapter[:part.rtrim - self.masklen - self.linkerlen]
        pieces.append(shown + "...")
    self._add_line("".join(pieces))
def make(self):
    """Render the text diagram for ``self.pair`` and return it as one string.

    Lines are accumulated in ``self.lines`` in this order: identifier, R1
    (with adapter line, reverse-complement view, errors, quality, linker),
    R2 (with errors, quality, adapter/dumbbell line), the target lines, the
    per-read result lines, feature markers (Site / End / mutations) and the
    summary.  ``self.bars`` is updated along the way with the column markers
    returned by the part-drawing helpers.

    :return: the diagram, lines joined with newlines.
    """
    self.lines = []

    # handling left-of-target matches
    self.prefix_len += max(
        self.pair.r1.rtrim - (self.pair.r1.match_index or 0) + 1,
        self.pair.r2.ltrim,
        max(-self.match_index(self.pair.r2), 0))

    self._add_line("@" + self.pair.identifier)

    # Bound up front so the clean-up below is safe even when the R1
    # reverse-complement section is skipped.
    r1_errors = None

    r1_bars = self._make_part(self.pair.r1)
    self.bars.append(r1_bars)
    if self.pair.r1.trimmed:
        self._make_adapter_line(self.pair.r1, self.run.adapter_b, "adapter_b")

    if self.pair.mask and self.pair.r1.matched:
        self._add_line("")
        if self.pair.r1.indels:
            self._make_part_ins(self.pair.r1)
        self._make_r1_rev()
        if self.pair.r1.match_errors or self.pair.r1.adapter_errors:
            r1_errors = self._make_part_errors(self.pair.r1)
        if self.show_quality:
            self._make_part_quality(self.pair.r1)
    if self.pair.linker is not None:
        self._make_linker()
    self._add_line("")

    if self.pair.r2.indels:
        self._make_part_ins(self.pair.r2)
    r2_bars = self._make_part(self.pair.r2)
    self.bars.append(r2_bars)
    if self.pair.r2.match_errors or self.pair.r2.adapter_errors:
        r2_errors = self._make_part_errors(self.pair.r2)
    else:
        r2_errors = None
    if self.show_quality:
        self._make_part_quality(self.pair.r2)
    if self.pair.r2.trimmed:
        label = "DB+RC(adp_t)" if self.pair.dumbbell is not None else "RC(adapter_t)"
        self._make_adapter_line(self.pair.r2,
                                reverse_complement(self.run.adapter_t), label)
    elif self.pair.dumbbell is not None:
        self._make_adapter_line(self.pair.r2, None, "DUMBBELL")

    self._add_line("")
    self._make_target_lines()

    # error bars are only wanted down to the target lines
    if r1_errors:
        self.bars.remove(r1_errors)
    if r2_errors:
        self.bars.remove(r2_errors)

    features = {}
    if self.pair.site is not None:
        features['Site'] = self.pair.site
    if self.pair.end is not None:
        # note: decrement to make off-by-one conventions align better
        # xref 'conventions' above
        features['End'] = self.pair.end - 1
    if self.pair.mutations:
        for idx, mut in enumerate(self.pair.mutations, 1):
            if self.pair.edge_mut:
                key = 'EdgeMut{} ({})'.format(idx, self.pair.edge_mut)
            else:
                key = 'Mut{}'.format(idx)
            # note: decrement to make off-by-one conventions align better
            # xref 'conventions' above
            features[key] = mut - 1
    for v in features.values():
        self.bars.append([v + self.prefix_len])

    if self.pair.r2.matched:
        self._make_result(self.pair.r2)
    self.bars.remove(r2_bars)

    if self.pair.r1.matched:
        self._make_result(self.pair.r1)
    else:
        self._add_line("")
    self.bars.remove(r1_bars)

    for key, value in features.items():
        self._add_marker(key, value)
        self.bars.remove([value + self.prefix_len])

    if self.pair.matched and self.pair.left > 100:
        self._snip_lines(self.prefix_len + 15, self.prefix_len + 85)

    self._add_line("")
    self._make_summary()

    return "\n".join(self.lines)
def build_cotrans_lookups(self, run):
    """Pre-compute the R1 lookup table (and R2 lookup) for a cotrans run.

    Every key of the table is a possible error-free R1 (after the handle):
    the reverse complement of "tail of target prefix + linker", followed by
    the first ``i`` bases of adapter_b.  Each value is a list of
    ``(target, end, adapter bases to trim, mutations)`` tuples.  When
    ``run.count_mutations`` is set, every single-base substitution of the
    target portion is registered as well.

    :param run: run configuration; reads ``cotrans_linker``, ``adapter_b``,
        ``pair_length`` and ``count_mutations``.
    :raises Exception: if there is not exactly one target.
    """
    # for cotrans experiments, R1 includes a linker and is relatively restricted
    # store RC in table so that we can directly lookup R1[4:]

    # TODO: this could be set to run.minimum_target_match_length, but given that there's linker
    # involved, it makes sense to assume that a match including linker will hit that minimum.
    # so for this we can include very small bits of the actual target.
    minimum_target_length = 3

    linker = run.cotrans_linker
    linker_len = len(linker)
    r1_table = {}

    def register(key, entry):
        # append to the bucket for `key`, creating the bucket on first use
        r1_table.setdefault(key, []).append(entry)

    if len(self.targets) != 1:
        raise Exception("cotrans requires one target")
    target = self.targets[0]
    tseq = target.seq
    tlen = len(tseq)

    adapter_b = run.adapter_b
    pair_len = run.pair_length
    assert (0 < pair_len)
    masklen = 4  # TODO
    r1_match_len = pair_len - masklen
    target_room = r1_match_len - linker_len   # R1 bases left for target once linker is accounted for

    for end in xrange(minimum_target_length, tlen + 1):
        target_subseq = tseq[:end]
        for i in xrange(0, target_room - minimum_target_length + 1):
            tstart = i - target_room
            if tstart + end < 0:
                continue
            target_bit = target_subseq[tstart:]
            r1_match = reverse_complement(target_bit + linker) + adapter_b[:i]
            # target, end, amount of adapter to trim, mutations
            register(r1_match, (target, end, i, []))

            if not run.count_mutations:
                continue
            bit_len = len(target_bit)
            for toggle_idx in xrange(bit_len):
                for nt in ['A', 'C', 'G', 'T']:
                    if nt == target_bit[toggle_idx]:
                        continue
                    mutated_bit = target_bit[:toggle_idx] + nt + target_bit[toggle_idx + 1:]
                    mutated_match = reverse_complement(mutated_bit + linker) + adapter_b[:i]
                    register(mutated_match,
                             (target, end, i, [end - (bit_len - toggle_idx) + 1]))

    self.r1_lookup = r1_table
    self._build_R1_aliases(adapter_b, r1_match_len)

    # we only need to build R2 lookups for full sequences (excepting linker)
    # trim cases are just tested against R1
    self._build_R2_lookup(pair_len - linker_len - masklen, run.count_mutations)
def _adapter_t_rc(self):
    """Return the reverse complement of the run's adapter_t, computed lazily.

    The value is stored on the instance the first time it is needed and
    reused while the stored value is truthy.
    """
    cached = self.__adapter_t_rc
    if cached:
        return cached
    self.__adapter_t_rc = reverse_complement(self._run.adapter_t)
    return self.__adapter_t_rc
# get intron intron = fna[contig][ibeg:(beg - 1)] ibeg = end if ipre and not args.exon and len(intron) > 0: intron_seq = re.sub( '^>\.', '>', '>%s.intron_%s %s\n%s %s' % (pre, ipre, gname, intron)) writeout(intron_seq, org=pre, gene=gname) ipre = gid if ifin == '': ifin = beg # get gene gene = fna[contig][(beg - 1):end] if strand == '-': gene = util.reverse_complement(gene) if args.regex and not re.search(args.regex, gname): continue if len(gene) > 0: exon_seq = re.sub('^>\.', '>', '>%s.%s %s\n%s' % (pre, gid, gname, gene)) writeout(exon_seq, org=pre, gene=gname) # final intron intron = fna[contig][ibeg:] + fna[contig][:(ifin - 1)] if not args.exon and len(intron) > 0: intron_seq = re.sub('^>\.', '>', '>%s.intron_%s %s\n%s' % (pre, ipre, gname, intron)) writeout(intron_seq, org=pre, gene=gname) for fh in fhs:
def make(self):
    """Render the text diagram for ``self.pair`` and return it as one string.

    Lines are accumulated in ``self.lines`` in this order: identifier, R1
    (with adapter line, reverse-complement view, errors, quality, linker),
    R2 (with errors, quality, adapter/dumbbell line), the target lines, the
    per-read result lines, feature markers (Site / End / mutations) and the
    summary.  ``self.bars`` is updated along the way with the column markers
    returned by the part-drawing helpers.

    :return: the diagram, lines joined with newlines.
    """
    self.lines = [ ]

    # handling left-of-target matches
    self.prefix_len += max(
        self.pair.r1.rtrim - (self.pair.r1.match_index or 0) + 1,
        self.pair.r2.ltrim,
        max(-self.match_index(self.pair.r2), 0))

    self._add_line("@" + self.pair.identifier)

    # Bound up front so the clean-up below is safe even when the R1
    # reverse-complement section is skipped.
    r1_errors = None

    r1_bars = self._make_part(self.pair.r1)
    self.bars.append(r1_bars)
    if self.pair.r1.trimmed:
        self._make_adapter_line(self.pair.r1, self.run.adapter_b, "adapter_b")

    if self.pair.mask and self.pair.r1.matched:
        self._add_line("")
        if self.pair.r1.indels:
            self._make_part_ins(self.pair.r1)
        self._make_r1_rev()
        if self.pair.r1.match_errors or self.pair.r1.adapter_errors:
            r1_errors = self._make_part_errors(self.pair.r1)
        if self.show_quality:
            self._make_part_quality(self.pair.r1)
    if self.pair.linker is not None:
        self._make_linker()
    self._add_line("")

    if self.pair.r2.indels:
        self._make_part_ins(self.pair.r2)
    r2_bars = self._make_part(self.pair.r2)
    self.bars.append(r2_bars)
    if self.pair.r2.match_errors or self.pair.r2.adapter_errors:
        r2_errors = self._make_part_errors(self.pair.r2)
    else:
        r2_errors = None
    if self.show_quality:
        self._make_part_quality(self.pair.r2)
    if self.pair.r2.trimmed:
        label = "DB+RC(adp_t)" if self.pair.dumbbell is not None else "RC(adapter_t)"
        self._make_adapter_line(self.pair.r2,
                                reverse_complement(self.run.adapter_t), label)
    elif self.pair.dumbbell is not None:
        self._make_adapter_line(self.pair.r2, None, "DUMBBELL")

    self._add_line("")
    self._make_target_lines()

    # error bars are only wanted down to the target lines
    if r1_errors:
        self.bars.remove(r1_errors)
    if r2_errors:
        self.bars.remove(r2_errors)

    features = {}
    if self.pair.site is not None:
        features['Site'] = self.pair.site
    if self.pair.end is not None:
        # note: decrement to make off-by-one conventions align better
        # xref 'conventions' above
        features['End'] = self.pair.end - 1
    if self.pair.mutations:
        for idx, mut in enumerate(self.pair.mutations, 1):
            if self.pair.edge_mut:
                key = 'EdgeMut{} ({})'.format(idx, self.pair.edge_mut)
            else:
                key = 'Mut{}'.format(idx)
            # note: decrement to make off-by-one conventions align better
            # xref 'conventions' above
            features[key] = mut - 1
    for v in features.values():
        self.bars.append([v + self.prefix_len])

    if self.pair.r2.matched:
        self._make_result(self.pair.r2)
    self.bars.remove(r2_bars)

    if self.pair.r1.matched:
        self._make_result(self.pair.r1)
    else:
        self._add_line("")
    self.bars.remove(r1_bars)

    for key, value in features.items():
        self._add_marker(key, value)
        self.bars.remove([value + self.prefix_len])

    if self.pair.matched and self.pair.left > 100:
        self._snip_lines(self.prefix_len + 15, self.prefix_len + 85)

    self._add_line("")
    self._make_summary()

    return "\n".join(self.lines)
def get_node_seq(self, nodeID):
    """Return the sequence of the node identified by a signed, 1-based ID.

    A non-negative ``nodeID`` selects ``self.nodes[nodeID - 1]`` and returns
    its sequence as stored; a negative ``nodeID`` selects
    ``self.nodes[-nodeID - 1]`` and returns the reverse complement of its
    sequence.
    """
    if nodeID >= 0:
        return self.nodes[nodeID - 1].seq
    forward = self.nodes[-nodeID - 1].seq
    return reverse_complement(forward)
def reverse_complement(self):
    """Return the reverse complement of ``self.subsequence``, computed lazily.

    The result is stored in ``self._seq_rc`` and reused for as long as the
    stored value is truthy.
    """
    cached = self._seq_rc
    if cached:
        return cached
    self._seq_rc = reverse_complement(self.subsequence)
    return self._seq_rc
def process_pair(self, pair):
    """Match one read pair against the target(s) and register the outcome.

    The stages, in order:

    1. reject pairs failing ``_check_indeterminate`` / ``_match_mask``;
    2. pre-trim dumbbell, full adapter_b and (cotrans) linker from the reads;
    3. find an exact match (``_find_matches``), falling back to
       ``_cotrans_find_short_matches`` for cotrans runs;
    4. extend/verify the match and trim adapters;
    5. handle reads starting left of the target (prefixes);
    6. apply the error, right-edge, overlap, mutation, full-read and
       RT-primer filters;
    7. register the site with ``self.counters``.

    On any rejection ``pair.failure`` is set (by this method or a helper)
    and the method returns early.  Nothing is returned; results are recorded
    on ``pair`` and ``self.counters``.

    :param pair: the read pair to process; mutated in place.
    """
    if not self._check_indeterminate(pair) or not self._match_mask(pair):
        return

    run = self._run
    masklen = pair.mask.length()
    pair.r2.auto_adjust_match = True
    #pair.r1.auto_adjust_match = True

    ## Pre-trim dumbbell from R2 immediately since it won't help with alignment
    ## and simplifies dealing with match_index, match_start, etc.
    ## (It's also a bit faster for SW.)
    dumblen = 0
    if run.dumbbell:
        if not pair.r2.original_seq.startswith(run.dumbbell):
            pair.failure = Failures.dumbbell
            self.counters.dumbbell_failures += pair.multiplicity
            return
        dumblen = len(run.dumbbell)
        pair.r2.ltrim = dumblen
        pair.dumbbell = -dumblen
        # Also may as well trim R1 dumbbell if it's there in full...
        # TAI: this may get the adapter too.  if so, do we need to count that?  (no. only counting R2s.)
        # TAI: but we're not checking the adapter for errors here.
        dbi = pair.r1.original_seq.rfind(
            reverse_complement(run.dumbbell),
            masklen + run.minimum_target_match_length)
        if -1 != dbi:
            pair.r1.rtrim = pair.r1.original_len - dbi
            pair.r1.fully_rtrimmed = True

    ## Also pre-trim any (full) adapter on R1 for the same reason
    if not pair.r1.fully_rtrimmed and run.adapter_b:
        adi = pair.r1.original_seq.rfind(
            run.adapter_b, masklen + run.minimum_target_match_length)
        if -1 != adi:
            pair.r1.rtrim = pair.r1.original_len - adi
            pair.r1.fully_rtrimmed = True
            self.counters.adapter_trimmed += pair.multiplicity

    ## Finally, pre-trim any linker from R1 and R2 since these'll screw up alignment
    if run.cotrans:
        if pair.r1.subsequence.startswith(
                reverse_complement(run.cotrans_linker)):
            pair.r1.ltrim += len(run.cotrans_linker)
            pair.linker = pair.r1.seq_len    # linker is currently in R1 coordinates
        else:
            pair.failure = Failures.linker
            return
        pair.r2.linker_start = string_find_with_overlap(
            run.cotrans_linker, pair.r2.subsequence)
        if -1 != pair.r2.linker_start:
            # Trimming partial R2 linker matches here is a little overzealous.
            # (if, say, only 1 bp matches the linker, it could also match the target.)
            # However, we need to do it to prevent partial linker matches from screwing
            # up our alignment with the target.
            # After alignment, we circle back to re-add what we trimmed if it was a mistake.
            full_trim_len = pair.r2.seq_len - pair.r2.linker_start
            pair.r2.linker_trim_len = full_trim_len
            r2_adapter_len = full_trim_len - len(run.cotrans_linker) - masklen
            if r2_adapter_len > 0:
                pair.r2.adapter_trimmed = pair.r2.subsequence[-r2_adapter_len:]
                pair.r2.linker_trim_len = max(0, full_trim_len - r2_adapter_len)
            pair.r2.rtrim += full_trim_len
            pair.r2.fully_rtrimmed = True

    ## Now find best exactly matching substring with target
    self._find_matches(pair)
    if not pair.matched:
        if not run.cotrans or pair.r1.ltrim == 0:
            self.counters.unmatched += pair.multiplicity
            pair.failure = Failures.nomatch
            self._check_targetrc(pair)
            return
        if not self._cotrans_find_short_matches(pair):
            self._check_targetrc(pair)
            return

    ap = None
    if run.handle_indels:
        if run.allow_indeterminate:
            simfn = lambda nt1, nt2: base_similarity_ind(
                nt1, nt2, run.indel_match_value, run.indel_mismatch_cost,
                .5 * run.indel_match_value)
        else:
            simfn = lambda nt1, nt2: AlignmentParams.char_sim(
                nt1, nt2, run.indel_match_value, run.indel_mismatch_cost)
        ap = AlignmentParams(simfn, run.indel_gap_open_cost,
                             run.indel_gap_extend_cost)

    ## And extend the match if necessary using string alignment on the rest
    aligned = False
    if pair.fully_matched:
        # Don't just trim adapter here b/c in cotrans case it has already been done
        if not self._verify_full_match(pair):
            return
    elif run.handle_indels:
        if not self._extend_match(pair, ap):
            return
        aligned = True
    elif not self._trim_adapters(pair):
        if not pair.failure:
            pair.failure = Failures.adapter_trim
            self.counters.adapter_trim_failure += pair.multiplicity
        return

    if pair.linker:
        ## shift pair.linker into target coordinates.
        ## pair.linker now equates to 1 beyond the end of the match in the *target* (where the linker starts)
        ## TAI:  keep in R1 coordinates?
        pair.linker += pair.r1.match_index - pair.r1.match_start - pair.r1.indels_delta

    if run.cotrans:
        if pair.r2.linker_start != -1 and pair.r2.linker_trim_len > 0 and pair.r2.right < pair.r1.right:
            # what we trimmed off of R2 above *wasn't* the linker! restore it.
            pair.r2.rtrim -= pair.r2.linker_trim_len
            pair.r2.fully_rtrimmed = False
            if run.handle_indels:
                pair.r2.extend_alignment(pair.target, ap)
            else:
                pair.r2.match_len += pair.r2.linker_trim_len
        if run.handle_indels and not pair.check_overlap(False):
            self.counters.mismatching_indel_pairs += pair.multiplicity
            #_warn("cotrans pair disagrees on overlap. alignment may be off.")
            #pair.failure = Failures.r1_r2_overlap
            #return

    # TAI: should we treat a mutation at the site as a prefix?  what if it's at site=0?
    counted_prefix = None
    if pair.r2.match_start > pair.r2.match_index:
        assert (not aligned or pair.r2.match_index == 0)  # align_strings() will ensure this if penalize_ends is True
        r2_start_in_target = pair.r2.match_index - pair.r2.match_start    # will be negative
        self.counters.left_of_target += pair.multiplicity
        prefix = pair.r2.original_seq[dumblen:dumblen - r2_start_in_target]
        if run.count_left_prefixes:
            if (run.mutations_require_quality_score is None) or pair.check_prefix_quality(
                    dumblen - r2_start_in_target,
                    run.mutations_require_quality_score, dumblen):
                self.counters.register_prefix(prefix, pair)
                counted_prefix = prefix
            else:
                self.counters.low_quality_prefixes += pair.multiplicity
        if run.collapse_left_prefixes and (
                not run.collapse_only_prefixes
                or prefix in run._p_collapse_only_prefix_list):
            pair.r2.ltrim -= r2_start_in_target
            pair.r2.match_start += r2_start_in_target
            pair.r2.shift_indels(r2_start_in_target)
            if pair.r1.match_start > 0 and pair.r1.match_index == pair.r2.match_index:
                r1prefix = min(-r2_start_in_target, pair.r1.match_start)
                pair.r1.rtrim += r1prefix
                pair.r1.match_start -= r1prefix
                pair.r1.shift_indels(-r1prefix)
        else:
            pair.failure = Failures.left_of_zero
            self.counters.left_of_zero += pair.multiplicity
            return
    elif run.count_left_prefixes:
        counted_prefix = "NONE"
        self.counters.register_prefix(counted_prefix, pair)

    if run.cotrans and pair.right < run.cotrans_minimum_length:
        pair.failure = Failures.cotrans_min
        return

    if not aligned:
        pair.r1.match_to_seq()
        pair.r2.match_to_seq()
        pair.r1.match_errors = string_match_errors(
            pair.r1.reverse_complement,
            pair.target.seq[pair.r1.match_index:])
        pair.r2.match_errors = string_match_errors(
            pair.r2.subsequence,
            pair.target.seq[pair.r2.match_index:])

    if run._p_rois and (pair.r1.error_in_region(run._p_rois)
                        or pair.r2.error_in_region(run._p_rois)):
        pair.interesting = True
        self.counters.interesting_pairs += pair.multiplicity

    # Worst read decides: the larger mismatch count plus the larger indel
    # count across R1 and R2.  (The indel term previously compared R1 with
    # itself, so R2's indels never counted against the budget.)
    worst_mismatches = max(len(pair.r1.match_errors), len(pair.r2.match_errors))
    worst_indels = max(len(pair.r1.indels), len(pair.r2.indels))
    if worst_mismatches + worst_indels > run.allowed_target_errors:
        if pair.r1.match_errors:
            _debug("R1 errors: {}".format(pair.r1.match_errors))
        if pair.r2.match_errors:
            _debug("R2 errors: {}".format(pair.r2.match_errors))
        pair.failure = Failures.match_errors
        self.counters.match_errors += pair.multiplicity
        return

    if ((not run.cotrans and pair.right != pair.target.n
         and not run.allow_multiple_rt_starts)
            or (run.cotrans and
                (pair.right > pair.target.n or pair.right < pair.left
                 or pair.r1.rtrim + pair.r1.match_start + pair.r1.match_len
                 + pair.r1.indels_delta + pair.r1.ltrim < pair.r1.original_len))):
        pair.failure = Failures.right_edge
        self.counters.r1_not_on_right_edge += pair.multiplicity
        return

    if run.ignore_stops_with_mismatched_overlap and not pair.check_overlap(True):
        pair.failure = Failures.r1_r2_overlap
        self.counters.r1_r2_overlap += pair.multiplicity
        return

    if run.count_mutations:
        pair.check_mutations()
        self.counters.register_mut_count(pair)
        if pair.mutations and len(pair.mutations) > run.allowed_target_errors:
            pair.failure = Failures.match_errors
            self.counters.match_errors += pair.multiplicity
            return
        self.counters.low_quality_muts += pair.check_mutation_quality(
            run.mutations_require_quality_score)

    if run.count_only_full_reads and pair.left != 0:
        pair.failure = Failures.not_full_read
        return

    if run.handle_indels and not pair.indels_match:
        self.counters.mismatching_indel_pairs += pair.multiplicity
        # TAI: might bail on these

    if run.allow_multiple_rt_starts and run.rt_primers:
        # Require the entire primer to be present (no stops/starts within)
        for primer in run._p_rt_primers_list:
            if len(primer) <= pair.r1.seq_len and pair.r1.subsequence[0:len(primer)] == primer:
                self.counters.increment_key('rt_primer_{}'.format(primer))
                break
        else:
            # for/else: reached only when no primer matched (no `break`)
            pair.failure = Failures.no_rt_primer
            self.counters.no_rt_primer += pair.multiplicity
            return

    pair.site = pair.left
    if run._p_rois and not pair.interesting:
        for roi in run._p_rois:
            if roi[0] <= pair.site and pair.site <= roi[1]:
                pair.interesting = True
                self.counters.interesting_pairs += pair.multiplicity
                break
    self.counters.register_count(pair)
    if counted_prefix:
        self.counters.register_mapped_prefix(counted_prefix, pair)
    if run.count_mutations:
        self.counters.register_mapped_mut_count(pair)
def test_reverse_complement():
    """Check reverse_complement on a palindromic and a non-palindromic sequence."""
    # 'ACGT' is its own reverse complement, so by itself it cannot tell a
    # correct implementation apart from one that returns its input unchanged.
    palindrome = 'ACGT'
    assert reverse_complement(palindrome) == palindrome

    # Non-palindromic case: 'AACC' reversed is 'CCAA', complemented is 'GGTT'.
    sequence = 'AACC'
    expected = 'GGTT'
    actual = reverse_complement(sequence)
    assert expected == actual
def _build_R1_lookup(self, adapter_b, length=31, end_only=True, mutations=False, dumbbell=None):
    """Pre-build the table of all possible (error-free) R1 reads.

    The full set can be enumerated up front because:
      - R1 has to include the right-most nt of the target
      - R1 can include some adapter-b (or dumbbell) off the end
      - this is done for each target in ``self.targets``
      - in cases where R1 includes some (enough) adapter, the position and
        content of R2 is determined
    Note that the keys do *not* include the handle.

    Each key maps to a list of ``(target, end, adapter_trim, mutations)``
    tuples.  For a candidate built from ``i`` leading characters of the
    reverse-complemented target, ``end`` is ``target.n - i`` (``None`` when
    ``i == length``) and ``adapter_trim`` is ``length - i``.  When
    ``mutations`` is true, every single-nt substitution within the first
    ``i`` characters of each candidate is added as well, with its mutation
    list set to ``[target.n - offset]``.

    The finished table is stored in ``self.r1_lookup`` and
    ``self._build_R1_aliases(adapter_b, length)`` is then called.
    ``end_only`` is accepted but not read by this method.
    """
    nucleotides = ('A', 'C', 'G', 'T')
    lookup = {}

    for target in self.targets:
        target_len = target.n
        rc_target = reverse_complement(target.seq)
        rc_dumbbell = reverse_complement(dumbbell) if dumbbell else None
        candidate_count = 0

        for target_nts in xrange(1, length + 1):
            trim = length - target_nts

            # whatever follows the target portion: dumbbell first (if any), then adapter
            if not rc_dumbbell:
                tail = adapter_b[:trim]
            elif trim <= len(rc_dumbbell):
                tail = rc_dumbbell[:trim]
            else:
                tail = rc_dumbbell + adapter_b[:trim - len(rc_dumbbell)]
            candidate = rc_target[:target_nts] + tail

            end = None if target_nts == length else target_len - target_nts
            # target, end, amount of adapter to trim, mutations
            entry = (target, end, trim, [])
            lookup.setdefault(candidate, []).append(entry)
            candidate_count += 1

            if not mutations:
                continue

            # add every single-nt substitution inside the target portion
            for offset in xrange(target_nts):
                current_nt = candidate[offset]
                for nt in nucleotides:
                    if nt == current_nt:
                        continue
                    mutated = candidate[:offset] + nt + candidate[offset + 1:]
                    mutated_entry = (target, end, trim, [target_len - offset])
                    lookup.setdefault(mutated, []).append(mutated_entry)

        if candidate_count == 0:
            _warn("!! No R1 match candidates for {}".format(target.name))

    self.r1_lookup = lookup
    self._build_R1_aliases(adapter_b, length)
def _try_lookup_hit(self, pair, r1_res):
    """Check one R1 lookup hit against the pair and register the count on success.

    ``r1_res`` is a single lookup entry, indexed here as:
      [0] the target, [1] the match site (or ``None``, in which case the
      site is taken from ``self._targets.lookup_r2``), [2] the value stored
      into ``pair.r1.rtrim``, [3] a list of mutations merged with those
      found in R2.

    Nothing is returned.  On any rejection ``pair.failure`` is set to the
    relevant ``Failures`` value (or left to ``_check_indeterminate``) and
    the method returns early.  On success ``pair.site``, ``pair.end``
    (``target.n``) and ``pair.target`` are set, ``pair.failure`` is cleared
    and ``self.counters.register_count(pair)`` is called.
    """
    targets = self._targets
    run = self._run
    masklen = pair.mask.length()
    target = r1_res[0]
    r2len = pair.r2.original_len
    r2_mutations = []

    # no site from the R1 entry: fall back to looking R2 up directly
    if r1_res[1] is None:
        r2_res = targets.lookup_r2(target.name, pair.r2.original_seq)
        if r2_res is None:
            pair.failure = Failures.nomatch
            return
        match_site = r2_res[0]
    else:
        match_site = r1_res[1]

    match_len = min(r2len, target.n - match_site)
    r2_match_start = 0
    if run.dumbbell:
        r2_match_start = len(run.dumbbell)
        # TODO: dumbbell errors
        if pair.r2.original_seq[:r2_match_start] != run.dumbbell:
            pair.failure = Failures.dumbbell
            return
        pair.dumbbell = match_site - r2_match_start
        match_len = min(r2len - r2_match_start, target.n - match_site)
        pair.r2.ltrim = r2_match_start

    # need to check R2 against expectation
    if match_len > 0:
        # The target portion of R2 starts after the dumbbell, so the slice has
        # to end at r2_match_start + match_len; ending it at match_len would
        # leave the last len(dumbbell) bases out of the comparison.
        pair.r2.match_errors = string_match_errors(
            pair.r2.original_seq[r2_match_start:r2_match_start + match_len],
            target.seq[match_site:match_site + match_len])
        # +1 for M_j indexing convention, xref https://trello.com/c/2qIGo9ZR/201-stop-map-mutation-indexing-convention
        r2_mutations = [err + match_site + 1 for err in pair.r2.match_errors]
    if match_len <= 0 or len(pair.r2.match_errors) > run.allowed_target_errors:
        pair.failure = Failures.match_errors
        return

    adapter_len = r2len - match_len - r2_match_start - masklen
    if adapter_len > 0:
        pair.r2.adapter_errors = string_match_errors(
            self._adapter_t_rc[:adapter_len],
            pair.r2.original_seq[-adapter_len:])
        if len(pair.r2.adapter_errors) > run.allowed_adapter_errors:
            pair.failure = Failures.adapter_trim
            return

    site = match_site

    # in rare cases, need to double-check what R1 should be based on R2
    if site != r1_res[1]:
        match_len = min(pair.r1.original_len - masklen, target.n - match_site)
        adapter_len = pair.r1.original_len - match_len - masklen
        pair.r1.match_errors = string_match_errors(
            reverse_complement(target.seq[-match_len:]),
            pair.r1.original_seq[masklen:masklen + match_len])
        if len(pair.r1.match_errors) > run.allowed_target_errors:
            pair.failure = Failures.match_errors
            return
        if adapter_len > 0 and pair.dumbbell:
            if adapter_len > r2_match_start:
                dumbbell_len = len(run.dumbbell)
                adapter_len -= r2_match_start
            else:
                dumbbell_len = adapter_len
                adapter_len = 0
            # TODO: dumbbell errors
            if pair.r1.reverse_complement[adapter_len:dumbbell_len] != run.dumbbell[-dumbbell_len:]:
                pair.failure = Failures.dumbbell
                return
        if adapter_len > 0:
            pair.r1.adapter_errors = string_match_errors(
                pair.r1.original_seq[-adapter_len:],
                self._run.adapter_b[:adapter_len])
            if len(pair.r1.adapter_errors) > run.allowed_adapter_errors:
                pair.failure = Failures.adapter_trim
                return

    if not self._check_indeterminate(pair):
        return

    pair.r1.match_len = min(pair.r1.original_len - masklen, target.n - match_site)
    # NOTE(review): `or` also falls through when r1_res[1] is 0, not only when
    # it is None -- confirm that is intended.
    pair.r1.match_index = r1_res[1] or (target.n - pair.r1.match_len)
    pair.r1.rtrim = r1_res[2]
    pair.r2.match_index = site
    # NOTE(review): when the R1 double-check above ran, match_len now holds the
    # R1-derived length rather than the R2 one -- confirm that is intended.
    pair.r2.match_len = match_len

    if run.ignore_stops_with_mismatched_overlap and not pair.check_overlap():
        pair.failure = Failures.r1_r2_overlap
        return

    pair.target = target
    if r2_mutations or r1_res[3]:
        # de-duplicate mutations reported by both R1 and R2
        pair.mutations = list(set(r2_mutations + r1_res[3]))
    if pair.mutations and len(pair.mutations) > run.allowed_target_errors:
        pair.failure = Failures.match_errors
        return
    self.counters.low_quality_muts += pair.check_mutation_quality(run.mutations_require_quality_score)

    if run.count_only_full_reads and site != 0:
        pair.failure = Failures.not_full_read
        return

    pair.site = site
    pair.end = target.n
    pair.failure = None
    self.counters.register_count(pair)