Ejemplo n.º 1
0
 def get_all_neighbors(self):
     """Register neighbor links between every pair of overlapping contigs.

     For each contig and each of its overlaps, the two coordinate pairs of
     the first alignment segment decide the argument order passed to
     ``make_neighbors``.  When the second pair runs backwards (``a > b``)
     the other contig is treated as reversed: its sequence is
     reverse-complemented once and it is flagged through ``reversed``.
     Overlaps matching none of the handled cases are left alone.
     """
     for contig in self.contigs.values():
         for partner, alignment in contig.overlaps.iteritems():
             first_segment = alignment[0]
             x, y = first_segment[0]
             a, b = first_segment[1]
             # Both lengths are looked up for every overlap; only the
             # partner's length takes part in the decisions below.
             self_length = contig.length
             other_length = self.contigs[partner.name].length

             if a < b:
                 # same direction
                 if x == 1:
                     make_neighbors(partner, contig)
                 elif a == 1:
                     make_neighbors(contig, partner)
                 continue

             if not a > b:
                 continue

             # 'partner' is in reversed direction
             if x == 1:
                 make_neighbors(contig, partner)
             elif a == other_length:
                 make_neighbors(partner, contig)
             else:
                 continue

             # TODO: this...probably needs to be more sensitive
             if not partner.reversed:
                 partner.sequence = util.reverse_complement(partner.sequence)
                 partner.reversed = True
Ejemplo n.º 2
0
    def _extend_match(self, pair, ap):
        """Extend the alignments of both reads in ``pair`` and trim adapters.

        R2 is extended first (with an expected suffix for non-cotrans runs),
        then adapters are trimmed, and finally R1 is extended if it is not
        already fully matched.

        Returns True on success.  Returns False when ``_trim_adapters``
        fails; in that case, unless the pair already carries a failure, it is
        marked with ``Failures.adapter_trim`` and the matching counter is
        increased by the pair's multiplicity.
        """
        run = self._run
        masklen = pair.mask.length()
        dumblen = len(run.dumbbell) if run.dumbbell else 0

        ## Align remaining R2 first to find adapter overhang.
        ## TAI: probably better way of doing adapter alignment  (see indels_7 testcase)
        if run.cotrans:
            r2suffix = ""
        else:
            handle_rc = reverse_complement(pair.r1.original_seq[:masklen])
            r2suffix = handle_rc + reverse_complement(run.adapter_t)
        pair.r2.extend_alignment(pair.target, ap, r2suffix)
        if run.dumbbell:
            pair.dumbbell = pair.r2.match_index - dumblen

        ## Trim the adapters off both R1 and R2
        adapters_ok = self._trim_adapters(pair)
        if not adapters_ok:
            if not pair.failure:
                pair.failure = Failures.adapter_trim
                self.counters.adapter_trim_failure += pair.multiplicity
            return False

        ## Now align remaining R1 if necessary
        if pair.r1.fully_matched:
            return True
        pair.r1.extend_alignment(pair.target, ap)
        # we may have not trimmed enough
        overhang = pair.r1.right_est - pair.target.n
        if overhang > 0:
            pair.r1.ltrim += overhang  # also updates match_len
        return True
Ejemplo n.º 3
0
    def build_cotrans_lookups(self, run):
        """Pre-compute the R1 lookup table (and R2 lookups) for a cotrans run.

        Every key of the table is a candidate R1 sequence: the reverse
        complement of (target fragment + linker) followed by the leading
        bases of ``run.adapter_b``.  Every value is a list of tuples
        ``(target, end, amount of adapter to trim, mutations)``.  When
        ``run.count_mutations`` is set, every single-nucleotide variant of
        the target fragment is added as well.

        Raises:
            Exception: if there is not exactly one target.
        """
        # for cotrans experiments, R1 includes a linker and is relatively restricted
        # store RC in table so that we can directly lookup R1[4:]

        # TODO: this could be set to run.minimum_target_match_length, but given that there's linker
        # involved, it makes sense to assume that a match including linker will hit that minimum.
        # so for this we can include very small bits of the actual target.
        minimum_target_length = 3

        linker = run.cotrans_linker
        linker_len = len(linker)
        r1_table = {}

        if 1 != len(self.targets):
            raise Exception("cotrans requires one target")
        target = self.targets[0]
        tseq = target.seq
        tlen = len(tseq)
        adapter_b = run.adapter_b
        pair_len = run.pair_length
        assert(0 < pair_len)
        masklen = 4    # TODO
        r1_match_len = pair_len - masklen

        # room left in R1 for target bases once the linker is accounted for
        target_room = r1_match_len - linker_len

        for end in xrange(minimum_target_length, tlen + 1):
            prefix = tseq[:end]
            for trim in xrange(0, target_room - minimum_target_length + 1):
                tstart = trim - target_room
                if tstart + end < 0:
                    continue
                target_bit = prefix[tstart:]
                adapter_bit = adapter_b[:trim]
                key = reverse_complement(target_bit + linker) + adapter_bit
                # target, end, amount of adapter to trim, mutations
                r1_table.setdefault(key, []).append((target, end, trim, []))

                if not run.count_mutations:
                    continue

                bit_len = len(target_bit)
                for toggle_idx in xrange(bit_len):
                    reference_nt = target_bit[toggle_idx]
                    head = target_bit[:toggle_idx]
                    tail = target_bit[toggle_idx + 1:]
                    site = end - (bit_len - toggle_idx) + 1
                    for nt in ('A', 'C', 'G', 'T'):
                        if nt == reference_nt:
                            continue
                        mutated_key = reverse_complement(head + nt + tail + linker) + adapter_bit
                        r1_table.setdefault(mutated_key, []).append((target, end, trim, [site]))

        self.r1_lookup = r1_table
        self._build_R1_aliases(adapter_b, r1_match_len)

        # we only need to build R2 lookups for full sequences (excepting linker)
        # trim cases are just tested against R1
        self._build_R2_lookup(pair_len - linker_len - masklen, run.count_mutations)
Ejemplo n.º 4
0
 def _check_targetrc(self, pair):
     """Count ``pair`` as a DNA residual if its reverse complement matches.

     A fresh ``Pair`` is built from the reverse complements of both original
     reads and run through ``_find_matches``; for cotrans runs the short
     match search is tried as a fallback.  On a match the
     ``dna_residual_pairs`` counter is incremented.
     """
     rcpair = Pair()
     identifier = pair.identifier
     r1_rc = reverse_complement(pair.r1.original_seq)
     r2_rc = reverse_complement(pair.r2.original_seq)
     rcpair.set_from_data(identifier, r1_rc, r2_rc)

     if self._run.cotrans:
         rcpair.r2.linker_start = pair.r2.linker_start

     self._find_matches(rcpair)

     found = rcpair.matched
     if not found and self._run.cotrans:
         found = self._cotrans_find_short_matches(rcpair)
     if found:
         self.counters.dna_residual_pairs += 1
Ejemplo n.º 5
0
 def _cotrans_find_short_matches(self, pair):
     """Try to place a cotrans pair whose target part precedes the R2 linker.

     The first ``linker_start`` bases of R2, and the reverse complement of
     the same-length prefix of R1, are searched in the single target with
     ``string_find_errors``.  Exactly one hit for each read is required.

     Returns True and fills in the match fields of ``pair`` on success.
     Returns False otherwise, marking the pair ``Failures.nomatch`` (and
     counting it as unmatched) or ``Failures.multiple_R1``.
     """
     run = self._run
     r2li = pair.r2.linker_start

     def reject_unmatched():
         self.counters.unmatched += pair.multiplicity
         pair.failure = Failures.nomatch
         return False

     linker_fits = (r2li > 0 and
                    pair.r2.original_len - r2li >= len(run.cotrans_linker))
     if not linker_fits:
         return reject_unmatched()

     target = self._targets.targets[0]
     allowed = run.allowed_target_errors
     r2spots = string_find_errors(pair.r2.subsequence[:r2li], target.seq,
                                  allowed)
     r1_query = reverse_complement(pair.r1.subsequence[:r2li])
     r1spots = string_find_errors(r1_query, target.seq,
                                  run.allowed_target_errors)

     r1_hits = len(r1spots)
     r2_hits = len(r2spots)
     if r1_hits == 0 or r2_hits == 0 or r1_hits != r2_hits:
         return reject_unmatched()
     if r1_hits > 1 or r2_hits > 1:
         pair.failure = Failures.multiple_R1
         return False

     # The assignment order below is kept deliberately: setting
     # fully_matched also sets the trim.
     pair.target = target
     pair.r1.match_index = r1spots[0]
     pair.r1.match_start = pair.r1.seq_len - r2li
     pair.r1.match_len = r2li
     pair.r2.match_index = r2spots[0]
     pair.r2.match_start = 0
     pair.r2.match_len = r2li
     pair.fully_matched = True  # also sets the trim
     pair.linker = pair.r1.match_index + pair.r1.match_len
     return True
Ejemplo n.º 6
0
 def basic_indel(self, indel):
     """Splice the insertion described by ``indel`` into its first contig.

     The new sequence is contig1 up to ``indel.b``, followed by a slice of
     contig2 (reverse-complemented when the indel direction is
     ``"reverse"``), followed by contig1 from ``indel.c - 1`` onwards.  The
     stored contig1 entry is updated in place (sequence, length, origin),
     and the overlap bookkeeping between the two contigs is removed;
     contig2 is dropped entirely once it has no overlaps left.

     Args:
         indel: object exposing ``contig1``, ``contig2``, ``direction`` and
             the coordinates ``b``, ``c``, ``y`` and ``z``.

     Raises:
         ValueError: if ``indel.direction`` is neither ``"forward"`` nor
             ``"reverse"`` (previously this surfaced as an unbound-local
             error before anything was modified).
     """
     my_sequence = indel.contig1.sequence
     other_sequence = indel.contig2.sequence

     if indel.direction == "forward":
         indel_middle = other_sequence[indel.y:indel.z-1]
     elif indel.direction == "reverse":
         indel_middle = util.reverse_complement(other_sequence[indel.z:indel.y-1])
     else:
         raise ValueError(
             "cannot apply indel with direction %r" % (indel.direction,))

     indel_sequence = my_sequence[:indel.b] + indel_middle + my_sequence[indel.c-1:]

     name1 = indel.contig1.name
     name2 = indel.contig2.name
     merged = self.contigs[name1]
     merged.sequence = indel_sequence
     merged.length = len(indel_sequence)
     merged.origin = "indel" # TODO: adjust this to keep a running trail of added contigs

     try:
         donor = self.contigs[name2]
         del donor.overlaps[indel.contig1]
         if not donor.overlaps:
             # Format before printing: ``print(...) % x`` only works as a
             # Python 2 print statement and fails on Python 3.
             print("Deleted contig %s" % name2)
             del self.contigs[name2]
         # MAJOR TODO: when contig2 survives, its overlaps need updating;
         # also, need to handle updating overlap info for contig1
         del merged.overlaps[indel.contig2]
     except Exception:
         # Best-effort bookkeeping: report, but do not abort the merge.
         printwithtime("Hey, you've probably found an error, please report this")
Ejemplo n.º 7
0
 def _verify_full_match(self, pair):
     """Apply and validate adapter trimming for a pair with both reads placed.

     For non-cotrans runs, R2 is first trimmed where it extends past the
     target end or past the end of R1's match.  Any adapter removed from R2
     is checked against the reverse complement of ``run.adapter_t``.  If R1
     matches at target index 0 but starts later in the read, the leading
     part is trimmed and (without a dumbbell) checked against
     ``run.adapter_b``.

     Returns False after recording ``Failures.adapter_trim`` when an adapter
     check fails; otherwise returns the result of
     ``self._recheck_targets(pair)``.
     """
     run = self._run
     if not run.cotrans:
         # R2 may not reach past the target or past the end of R1's match.
         maxmatch = min(pair.target.n,
                        pair.r1.match_index + pair.r1.match_len)
         if pair.r2.match_index + pair.r2.seq_len > maxmatch:
             masklen = pair.mask.length()
             trimmed = pair.r2.right_est - maxmatch
             # Only a trim longer than the mask leaves adapter to record.
             if trimmed > masklen:
                 pair.r2.adapter_trimmed = pair.r2.subsequence[masklen -
                                                               trimmed:]
                 pair.r2.rtrim = trimmed
     if pair.r2.adapter_trimmed:
         if not pair.r2.check_adapter_trim(
                 reverse_complement(run.adapter_t), run):
             pair.failure = Failures.adapter_trim
             self.counters.adapter_trim_failure += pair.multiplicity
             return False
         self.counters.adapter_trimmed += pair.multiplicity
     # R1 placed at target index 0 with unmatched leading bases in the read:
     # those bases are trimmed off and recorded as adapter.
     if pair.r1.match_index == 0 and pair.r1.match_start > 0:
         pair.r1.adapter_trimmed = pair.r1.reverse_complement[-pair.r1.
                                                              match_start:]
         pair.r1.rtrim += pair.r1.match_start
         # Keep the linker position consistent with the shifted match start.
         if pair.linker:
             pair.linker -= pair.r1.match_start
         pair.r1.match_start = 0
         # Note: if we had a full dumbbell, it would have been entirely trimmed already
         # So we either have no dumbbell and an adapter or a partial dumbbell only.
         if not run.dumbbell and pair.r1.adapter_trimmed:
             if not pair.r1.check_adapter_trim(run.adapter_b, run):
                 pair.failure = Failures.adapter_trim
                 self.counters.adapter_trim_failure += pair.multiplicity
                 return False
             #self.counters.adapter_trimmed += pair.multiplicity    # Note: only counting R2 trimming
     return self._recheck_targets(pair)
Ejemplo n.º 8
0
    def _build_R1_lookup(self, adapter_b, length = 31, end_only = True, mutations = False, dumbbell = None):
        """Pre-build the table of all error-free R1 candidates for every target.

        For each target and each ``i`` in ``1..length`` the candidate is the
        first ``i`` bases of the target's reverse complement, padded to
        ``length`` with the reverse-complemented dumbbell (if given) and then
        ``adapter_b``.  Each table value is a list of tuples
        ``(target, end, amount of adapter to trim, mutations)``.  With
        ``mutations`` set, every single-nucleotide variant within the target
        part of each candidate is added too.

        The result is stored in ``self.r1_lookup``; aliases are then built
        via ``_build_R1_aliases``.  Note that the handle is not included.
        """
        # we can pre-build the set of all possible (error-free) R1, b/c:
        #  - R1 has to include the right-most nt
        #  - R1 can include some adapter-b (or dumbbell) off the end
        #  - this is done for each target
        #  - note that in cases where R1 includes some (enough) adapter, then position and content of R2 is determined
        r1_table = {}
        for target in self.targets:
            tlen = target.n
            rc_tgt = reverse_complement(target.seq)
            rc_dumbbell = reverse_complement(dumbbell) if dumbbell else None
            tcandidates = 0
            for i in xrange(1, length + 1):
                overhang = length - i
                if not rc_dumbbell:
                    filler = adapter_b[:overhang]
                elif overhang <= len(rc_dumbbell):
                    filler = rc_dumbbell[:overhang]
                else:
                    filler = rc_dumbbell + adapter_b[:overhang - len(rc_dumbbell)]
                r1_candidate = rc_tgt[:i] + filler

                end_pos = None if i == length else tlen - i
                # target, end, amount of adapter to trim, mutations
                res = (target, end_pos, overhang, [])
                r1_table.setdefault(r1_candidate, []).append(res)
                tcandidates += 1

                if not mutations:
                    continue
                for toggle_idx in xrange(i):
                    reference_nt = r1_candidate[toggle_idx]
                    head = r1_candidate[:toggle_idx]
                    tail = r1_candidate[toggle_idx + 1:]
                    for nt in ('A', 'C', 'G', 'T'):
                        if nt == reference_nt:
                            continue
                        mres = (target, end_pos, overhang, [ tlen - toggle_idx ])
                        r1_table.setdefault(head + nt + tail, []).append(mres)

            if 0 == tcandidates:
                _warn("!! No R1 match candidates for {}".format(target.name))

        self.r1_lookup = r1_table
        self._build_R1_aliases(adapter_b, length)
Ejemplo n.º 9
0
def encode(dna, peptide):
    """Return the substrings of ``dna`` that encode ``peptide``.

    Every window of ``3 * len(peptide)`` bases is kept when translating
    either its transcript, or the corresponding window of the transcribed
    reverse complement of ``dna``, yields ``peptide``.  Windows are returned
    in order of position, taken from ``dna`` itself.
    """
    rna = transcribe(dna)
    reverse_complement_rna = transcribe(reverse_complement(dna))
    window = len(peptide) * 3
    total = len(rna)

    hits = []
    for start in range(0, total - window + 1):
        stop = start + window
        matches = translate(rna[start:stop]) == peptide
        if not matches:
            # Same window as seen on the opposite strand.
            mirrored = reverse_complement_rna[total - stop:total - start]
            matches = translate(mirrored) == peptide
        if matches:
            hits.append(dna[start:stop])
    return hits
Ejemplo n.º 10
0
def valid(exclude, seq):
    """Return True when ``seq`` contains none of the excluded motifs.

    Each motif in ``exclude`` is searched for both as given and as its
    reverse complement.  An empty or missing ``exclude`` accepts everything.
    """
    if not exclude:
        return True
    hit = any(
        seq.find(motif) >= 0
        or seq.find(util.reverse_complement(motif)) >= 0
        for motif in exclude
    )
    return not hit
Ejemplo n.º 11
0
    def _trim_adapters(self, pair):
        """Trim adapter sequence off both reads of ``pair``.

        R2 is handled first: for non-cotrans runs everything beyond the end
        of the target (or beyond R1's right estimate when multiple RT starts
        are allowed) is trimmed, and any recorded adapter is checked against
        the reverse complement of ``run.adapter_t``.  Then the R1 adapter
        length is estimated, validated against ``run.adapter_b`` and trimmed.

        Returns:
            False when an adapter check fails or the R1 adapter (excluding
            the dumbbell) is shorter than ``run.minimum_adapter_len``; True
            when R1 is already fully right-trimmed; otherwise the result of
            ``self._recheck_targets(pair)``.
        """
        run = self._run
        masklen = pair.mask.length()
        if not run.cotrans:
            # trim everything beyond the end of the target (including mask if there)
            maxr2 = pair.r1.right_est if (
                run.allow_multiple_rt_starts
                and pair.r1.right_est < pair.target.n) else pair.target.n
            full_trim = pair.r2.right_est - maxr2
            if full_trim > 0:
                # Only a trim longer than the mask leaves adapter to record.
                if full_trim > masklen:
                    pair.r2.adapter_trimmed = pair.r2.subsequence[masklen -
                                                                  full_trim:]
                pair.r2.rtrim += full_trim  # also updates indels and match_len in r2
        if pair.r2.adapter_trimmed:
            if not pair.r2.check_adapter_trim(
                    reverse_complement(run.adapter_t), run):
                return False
            self.counters.adapter_trimmed += pair.multiplicity
        # Nothing left to do for R1 if it has already been fully trimmed.
        if pair.r1.fully_rtrimmed:
            return True
        ## Note: we really shouldn't trim r1 prior to aligning b/c it could have big inserts, but
        ## trimming greatly saves alignment time, so compromise by assuming that indels (mostly) match
        ## between reads so use R2's indels_delta to buffer trim amount.
        if run.cotrans:
            r2id = pair.r2.indels_delta_before(pair.r1.match_index)
            r1_adapter_len = pair.r1.match_start - (pair.r1.match_index -
                                                    pair.r2.match_index + r2id)
        else:
            # Same right-hand limit as used for R2 above.
            longest = pair.r1.right_est if (
                run.allow_multiple_rt_starts
                and pair.r1.right_est < pair.target.n) else pair.target.n
            possible_matchlen = min(
                longest - max(0, pair.r2.match_index - pair.r2.match_start),
                pair.r1.seq_len + pair.r1.match_index - pair.r1.match_start)

            r1_adapter_len = pair.r1.seq_len - possible_matchlen - pair.r2.indels_delta
        dumblen = len(run.dumbbell) if run.dumbbell else 0
        # Enforce the configured minimum adapter length, not counting the dumbbell.
        if run.minimum_adapter_len and r1_adapter_len - dumblen < run.minimum_adapter_len:
            return False
        if r1_adapter_len > 0:
            # Only the part beyond the dumbbell is checked as adapter.
            if r1_adapter_len > dumblen:
                pair.r1.adapter_trimmed = pair.r1.subsequence[dumblen -
                                                              r1_adapter_len:]
                if not pair.r1.check_adapter_trim(run.adapter_b, run):
                    return False
            pair.r1.rtrim += r1_adapter_len
            pair.r1.match_start -= r1_adapter_len
            if pair.linker:
                pair.linker -= r1_adapter_len
            # A negative match_start means the trim cut into the match:
            # shift the match index, shorten the match and clamp the start to 0.
            if pair.r1.match_start < 0:
                pair.r1.match_index -= pair.r1.match_start
                pair.r1.match_len += pair.r1.match_start
                pair.r1.match_start = 0
            #self.counters.adapter_trimmed += pair.multiplicity   # Note:  only counting R2 trimming
        return self._recheck_targets(pair)
Ejemplo n.º 12
0
 def _validate(self, seq):
     if not self.exclude:
         return True
     for e in self.exclude:
         test_seq = (self.upstream[-len(e)+1:] + seq +
                     self.downstream[:len(e)-1])
         if (test_seq.find(e) >= 0 or 
             test_seq.find(util.reverse_complement(e)) >= 0):
             return False
     return True
Ejemplo n.º 13
0
    def __init__(self, contig1, contig2, name=None):
        self.contig1 = contig1
        self.contig2 = contig2
        self.a, self.b = contig1.overlaps[contig2][0][0]
        self.c, self.d = contig1.overlaps[contig2][1][0]
        self.x, self.y = contig1.overlaps[contig2][0][1]
        self.z, self.w = contig1.overlaps[contig2][1][1]
        self.name = str(self.contig1) + " indel " + str(self.contig2)
        if self.x < self.w:
            self.direction = "forward"
        elif self.x > self.w:
            self.direction = "reverse"
        else:
            self.direction = "confused"

        if self.direction == "forward":
            self.R = contig1.sequence[self.c - 1:self.b - 1]
            self.I = contig2.sequence[self.y - 1:self.z - 1]
        elif self.direction == "reverse":
            self.R = util.reverse_complement(contig1.sequence[self.b -
                                                              1:self.c - 1])
            self.I = util.reverse_complement(contig2.sequence[self.z -
                                                              1:self.y - 1])
Ejemplo n.º 14
0
 def _make_adapter_line(self, part, adapter, label):
     """Append the line(s) showing trimmed adapter/dumbbell for one read.

     For R1 (``part`` equal to ``self.pair.r1``) the adapter is drawn
     reversed, optionally preceded by a line of error markers, followed by a
     dumbbell line when a dumbbell is configured.  For the other read a
     single line is built with the dumbbell (if any), spacing for the match,
     mask and linker, and the leading part of ``adapter`` followed by "...".

     Args:
         part: the read being rendered.
         adapter: adapter sequence to display (may be None for the non-R1
             branch).
         label: prefix label for the adapter line.
     """
     is_R1 = (part == self.pair.r1)
     if is_R1:
         # Adapter is only drawn for the trimmed part beyond the dumbbell.
         if part.right and part.rtrim > self.dumblen:
             alen = part.rtrim - self.dumblen
             d = self._make_prefix(label)
             d, match_index = self._adj_front(part, d)
             # More was trimmed than the adapter is long: pad the difference.
             if alen > len(adapter):
                 d += sp(alen - len(adapter))
                 alen = len(adapter)
             if part.adapter_errors:
                 # Mark each error position with "!", then reverse the
                 # markers, matching the reversed adapter drawn below.
                 errors = sp(alen)
                 for e in part.adapter_errors:
                     if e < alen:
                         errors = errors[:e] + "!" + errors[e + 1:]
                 errors = errors[::-1]
                 # Bracket the marker line with "|" where the ends are blank.
                 if errors[0] == " ":
                     errors = "|" + errors[1:]
                 if errors[-1] == " ":
                     errors = errors[:-1] + "|"
                 self._add_line(d + errors)
             d += adapter[:alen][::-1]
             self._add_line(d)
         if self.dumblen > 0:
             # Show at most the dumbbell's length of the trimmed region.
             dumbbell_part = min(part.rtrim, self.dumblen)
             d = self._make_prefix('c(DUMBBELL)')
             d, _ = self._adj_front(part, d)
             d += sp(part.rtrim - dumbbell_part)
             d += reverse_complement(
                 self.run.dumbbell[:dumbbell_part])[::-1]
             self._add_line(d)
     elif part.left is not None and (
             part.rtrim > self.masklen + self.linkerlen
             or self.dumblen > 0):
         d = self._make_prefix(label)
         d, match_index = self._adj_front(part, d)
         if self.dumblen > 0:
             d += self.run.dumbbell
             d += sp(part.ltrim - len(self.run.dumbbell))
         else:
             d += sp(match_index)
         # Skip over the match, the mask and (for cotrans) the linker.
         d += sp(part.match_len + 1)
         d += sp(self.masklen + 1)
         if self.run.cotrans:
             d += sp(self.linkerlen)
         # Whatever was trimmed beyond mask and linker is shown as adapter.
         if adapter and part.rtrim > self.masklen + self.linkerlen:
             d += (adapter[:part.rtrim - self.masklen - self.linkerlen] +
                   "...")
         self._add_line(d)
Ejemplo n.º 15
0
def run():
    """Remove primers from a FASTA/FASTQ file and print the trimmed records.

    Primers come from ``args.p`` (a single primer) or ``args.l`` (a file
    with one primer per line).  Records are read from ``args.f`` or
    ``args.q``.  For every record with a primer match, the sequence and the
    fourth record field are cut just after the match.  When ``args.b`` is
    true, a second match is additionally required on the reverse complement
    of the remaining sequence and the corresponding tail is cut off too.
    Trimmed records go to stdout; a summary goes to stderr.

    Exits with an error message when no primer source or no input file is
    given.
    """
    # Get command line arguments
    args = parse_args()

    # Get primer sequences
    if args.p:
        primers = [args.p]
    elif args.l:
        # Close the primer list promptly instead of leaking the handle.
        with open(args.l) as primer_file:
            primers = [line.rstrip() for line in primer_file]
    else:
        # sys.exit behaves like quit() but does not depend on the site module.
        sys.exit('Error: must specify primer or primer list')

    # Get FASTA/FASTQ iterators
    if args.f:
        fn = args.f
        iter_fst = util.iter_fst
    elif args.q:
        fn = args.q
        iter_fst = util.iter_fsq
    else:
        sys.exit('Error: must specify FASTA or FASTQ file')

    # Iterate through FASTA/FASTQ file
    n = 0
    t = 0
    for record in iter_fst(fn):
        t += 1
        seq = record[1]
        [i, d, p] = find_best_match(seq, primers, args.w, args.d)
        if i == '':
            # no primer found in this record
            continue
        record[1] = record[1][i + len(p):]
        record[3] = record[3][i + len(p):]
        if args.b == False:
            print('\n'.join(record))
            n += 1
        if args.b == True:
            SEQ = util.reverse_complement(record[1])
            [I, D, P] = find_best_match(SEQ, primers, args.w, args.d)
            if I != '':
                cut = I + len(P)
                # ``s[:-0]`` would wipe the whole record, so only slice
                # when there is something to remove.
                if cut:
                    record[1] = record[1][:-cut]
                    record[3] = record[3][:-cut]
                print('\n'.join(record))
                n += 1

    # Avoid dividing by zero when the input holds no records.
    percent = 100. * n / t if t else 0.0
    sys.stderr.write('%d total sequences\n' % (t))
    sys.stderr.write('%d primers removed (%.2f%%)\n' % (n, percent))
Ejemplo n.º 16
0
def run():
    """Remove primers from a FASTA/FASTQ file and print the trimmed records.

    Primers come from ``args.p`` (a single primer) or ``args.l`` (a file
    with one primer per line).  Records are read from ``args.f`` or
    ``args.q``.  For every record with a primer match, the sequence and the
    fourth record field are cut just after the match.  When ``args.b`` is
    true, a second match is additionally required on the reverse complement
    of the remaining sequence and the corresponding tail is cut off too.
    Trimmed records go to stdout; a summary goes to stderr.

    Exits with an error message when no primer source or no input file is
    given.
    """
    # Get command line arguments
    args = parse_args()

    # Get primer sequences
    if args.p:
        primers = [args.p]
    elif args.l:
        # Close the primer list promptly instead of leaking the handle.
        with open(args.l) as primer_file:
            primers = [line.rstrip() for line in primer_file]
    else:
        # sys.exit behaves like quit() but does not depend on the site module.
        sys.exit('Error: must specify primer or primer list')

    # Get FASTA/FASTQ iterators
    if args.f:
        fn = args.f
        iter_fst = util.iter_fst
    elif args.q:
        fn = args.q
        iter_fst = util.iter_fsq
    else:
        sys.exit('Error: must specify FASTA or FASTQ file')

    # Iterate through FASTA/FASTQ file
    n = 0
    t = 0
    for record in iter_fst(fn):
        t += 1
        seq = record[1]
        [i, d, p] = find_best_match(seq, primers, args.w, args.d)
        if i == '':
            # no primer found in this record
            continue
        record[1] = record[1][i + len(p):]
        record[3] = record[3][i + len(p):]
        if args.b == False:
            print('\n'.join(record))
            n += 1
        if args.b == True:
            SEQ = util.reverse_complement(record[1])
            [I, D, P] = find_best_match(SEQ, primers, args.w, args.d)
            if I != '':
                cut = I + len(P)
                # ``s[:-0]`` would wipe the whole record, so only slice
                # when there is something to remove.
                if cut:
                    record[1] = record[1][:-cut]
                    record[3] = record[3][:-cut]
                print('\n'.join(record))
                n += 1

    # Avoid dividing by zero when the input holds no records.
    percent = 100. * n / t if t else 0.0
    sys.stderr.write('%d total sequences\n' % (t))
    sys.stderr.write('%d primers removed (%.2f%%)\n' % (n, percent))
Ejemplo n.º 17
0
 def _make_adapter_line(self, part, adapter, label):
     """Emit the display line(s) for the adapter/dumbbell trimmed off ``part``.

     R1 gets a reversed adapter line (preceded by an error-marker line when
     adapter errors were recorded) and, with a dumbbell configured, a
     dumbbell line.  The other read gets one line containing the dumbbell or
     leading spacing, gaps for match, mask and linker, and the start of
     ``adapter`` followed by "...".
     """
     if part == self.pair.r1:
         if part.right and part.rtrim > self.dumblen:
             shown = part.rtrim - self.dumblen
             line = self._make_prefix(label)
             line, match_index = self._adj_front(part, line)
             if shown > len(adapter):
                 # trimmed more than the adapter is long: pad the excess
                 line += sp(shown - len(adapter))
                 shown = len(adapter)
             if part.adapter_errors:
                 marks = sp(shown)
                 for pos in part.adapter_errors:
                     if pos < shown:
                         marks = marks[:pos] + "!" + marks[pos+1:]
                 marks = marks[::-1]
                 if marks[0] == " ":
                     marks = "|" + marks[1:]
                 if marks[-1] == " ":
                     marks = marks[:-1] + "|"
                 self._add_line(line + marks)
             line += adapter[:shown][::-1]
             self._add_line(line)
         if self.dumblen > 0:
             dumbbell_part = min(part.rtrim, self.dumblen)
             line = self._make_prefix('c(DUMBBELL)')
             line, _ = self._adj_front(part, line)
             line += sp(part.rtrim - dumbbell_part)
             dumbbell_rc = reverse_complement(self.run.dumbbell[:dumbbell_part])
             line += dumbbell_rc[::-1]
             self._add_line(line)
         return

     if part.left is None:
         return
     if not (part.rtrim > self.masklen + self.linkerlen or self.dumblen > 0):
         return

     line = self._make_prefix(label)
     line, match_index = self._adj_front(part, line)
     if self.dumblen > 0:
         line += self.run.dumbbell
         line += sp(part.ltrim - len(self.run.dumbbell))
     else:
         line += sp(match_index)
     line += sp(part.match_len + 1)
     line += sp(self.masklen + 1)
     if self.run.cotrans:
         line += sp(self.linkerlen)
     if adapter and part.rtrim > self.masklen + self.linkerlen:
         adapter_len = part.rtrim - self.masklen - self.linkerlen
         line += (adapter[:adapter_len] + "...")
     self._add_line(line)
Ejemplo n.º 18
0
    def make(self):
        """Render the pair as a multi-line text diagram and return it.

        Lines are accumulated in ``self.lines`` in this order: identifier,
        R1 (with adapter, reversed form, errors, quality and linker as
        applicable), R2 (same), the target lines, per-read result lines,
        feature markers (site, end, mutations) and a summary.  Vertical
        "bars" are registered in ``self.bars`` while they should be drawn
        and removed again afterwards.

        Returns:
            The accumulated lines joined with newlines.
        """
        self.lines = []

        # handling left-of-target matches
        self.prefix_len += max(
            self.pair.r1.rtrim - (self.pair.r1.match_index or 0) + 1,
            self.pair.r2.ltrim, max(-self.match_index(self.pair.r2), 0))

        self._add_line("@" + self.pair.identifier)

        # --- R1 section ---
        r1_bars = self._make_part(self.pair.r1)
        self.bars.append(r1_bars)
        if self.pair.r1.trimmed:
            self._make_adapter_line(self.pair.r1, self.run.adapter_b,
                                    "adapter_b")

        if self.pair.mask and self.pair.r1.matched:
            self._add_line("")
            if self.pair.r1.indels:
                self._make_part_ins(self.pair.r1)
            self._make_r1_rev()

        if self.pair.r1.match_errors or self.pair.r1.adapter_errors:
            r1_errors = self._make_part_errors(self.pair.r1)
        else:
            r1_errors = None
        if self.show_quality:
            self._make_part_quality(self.pair.r1)

        if self.pair.linker != None:
            self._make_linker()

        self._add_line("")

        # --- R2 section ---
        if self.pair.r2.indels:
            self._make_part_ins(self.pair.r2)
        r2_bars = self._make_part(self.pair.r2)
        self.bars.append(r2_bars)
        if self.pair.r2.match_errors or self.pair.r2.adapter_errors:
            r2_errors = self._make_part_errors(self.pair.r2)
        else:
            r2_errors = None
        if self.show_quality:
            self._make_part_quality(self.pair.r2)
        if self.pair.r2.trimmed:
            label = "DB+RC(adp_t)" if self.pair.dumbbell is not None else "RC(adapter_t)"
            self._make_adapter_line(self.pair.r2,
                                    reverse_complement(self.run.adapter_t),
                                    label)
        elif self.pair.dumbbell is not None:
            self._make_adapter_line(self.pair.r2, None, "DUMBBELL")

        self._add_line("")

        # --- target section ---
        self._make_target_lines()
        # The error results are taken back out of the bar list once the
        # target lines have been drawn.
        if r1_errors:
            self.bars.remove(r1_errors)
        if r2_errors:
            self.bars.remove(r2_errors)

        # Collect the labelled positions (site, end, mutations) to mark.
        features = {}
        if self.pair.site != None:
            features['Site'] = self.pair.site
        if self.pair.end != None:
            # note: decrement to make off-by-one conventions align better
            # xref 'conventions' above
            features['End'] = self.pair.end - 1
        if self.pair.mutations:
            idx = 1
            for mut in self.pair.mutations:
                if self.pair.edge_mut:
                    # note: decrement to make off-by-one conventions align better
                    # xref 'conventions' above
                    features['EdgeMut{} ({})'.format(
                        idx, self.pair.edge_mut)] = mut - 1
                else:
                    # note: decrement to make off-by-one conventions align better
                    # xref 'conventions' above
                    features['Mut{}'.format(idx)] = mut - 1
                idx += 1
        # Each feature gets its own bar, offset by the prefix length.
        for v in features.values():
            self.bars.append([v + self.prefix_len])

        # --- result lines; each read's bars are dropped after its result ---
        if self.pair.r2.matched:
            self._make_result(self.pair.r2)
        self.bars.remove(r2_bars)
        if self.pair.r1.matched:
            self._make_result(self.pair.r1)
        else:
            self._add_line("")
        self.bars.remove(r1_bars)

        # Draw each feature marker and retire its bar.
        for key, value in features.items():
            self._add_marker(key, value)
            self.bars.remove([value + self.prefix_len])

        # For matches starting far into the target, a span of columns is
        # passed to _snip_lines.
        if self.pair.matched and self.pair.left > 100:
            self._snip_lines(self.prefix_len + 15, self.prefix_len + 85)

        self._add_line("")
        self._make_summary()

        return "\n".join(self.lines)
Ejemplo n.º 19
0
    def build_cotrans_lookups(self, run):
        """Pre-compute the R1 lookup table (and R2 lookups) for a cotrans run.

        Each key of the table is a candidate R1 sequence: the reverse
        complement of (target fragment + linker) followed by the first ``i``
        bases of ``run.adapter_b``.  Each value is a list of tuples
        ``(target, end, amount of adapter to trim, mutations)``.  With
        ``run.count_mutations`` set, all single-nucleotide variants of the
        target fragment are added as well.

        The table is stored in ``self.r1_lookup``; R1 aliases and the R2
        lookup are then built through the corresponding helper methods.

        Raises:
            Exception: if there is not exactly one target.
        """
        # for cotrans experiments, R1 includes a linker and is relatively restricted
        # store RC in table so that we can directly lookup R1[4:]

        # TODO: this could be set to run.minimum_target_match_length, but given that there's linker
        # involved, it makes sense to assume that a match including linker will hit that minimum.
        # so for this we can include very small bits of the actual target.
        minimum_target_length = 3

        linker = run.cotrans_linker
        linker_len = len(linker)
        r1_table = {}

        if 1 != len(self.targets):
            raise Exception("cotrans requires one target")
        target = self.targets[0]
        tseq = target.seq
        tlen = len(tseq)
        adapter_b = run.adapter_b
        pair_len = run.pair_length
        assert (0 < pair_len)
        masklen = 4  # TODO
        r1_match_len = pair_len - masklen

        # ``end`` walks over every target prefix length of at least the minimum.
        for end in xrange(minimum_target_length, tlen + 1):
            target_subseq = tseq[:end]
            # ``i`` is the number of adapter bases appended to the candidate.
            for i in xrange(
                    0, r1_match_len - linker_len - minimum_target_length + 1):
                # Negative offset: how many trailing target bases fit
                # alongside the linker and ``i`` adapter bases.
                tstart = i - (r1_match_len - linker_len)
                # Skip when the prefix is shorter than the required fragment.
                if tstart + end < 0:
                    continue
                target_bit = target_subseq[tstart:]
                r1_rc_match = target_bit + linker
                r1_match = reverse_complement(r1_rc_match) + adapter_b[:i]
                entries = r1_table.get(r1_match, [])
                entries.append(
                    (target, end, i,
                     []))  # target, end, amount of adapter to trim, mutations
                r1_table[r1_match] = entries

                if run.count_mutations:
                    # Add every single-nucleotide substitution of the fragment.
                    bit_len = len(target_bit)
                    for toggle_idx in xrange(bit_len):
                        for nt in ['A', 'C', 'G', 'T']:
                            if target_bit[toggle_idx] == nt:
                                continue
                            mutated_bit = target_bit[:
                                                     toggle_idx] + nt + target_bit[
                                                         toggle_idx + 1:]
                            mutated_rc_match = mutated_bit + linker
                            mutated_match = reverse_complement(
                                mutated_rc_match) + adapter_b[:i]
                            entries = r1_table.get(mutated_match, [])
                            # The recorded mutation position is derived from
                            # ``end`` and the offset within the fragment.
                            entries.append(
                                (target, end, i,
                                 [end - (bit_len - toggle_idx) + 1]))
                            r1_table[mutated_match] = entries

        self.r1_lookup = r1_table
        self._build_R1_aliases(adapter_b, r1_match_len)

        # we only need to build R2 lookups for full sequences (excepting linker)
        # trim cases are just tested against R1
        self._build_R2_lookup(pair_len - linker_len - masklen,
                              run.count_mutations)
Ejemplo n.º 20
0
 def _adapter_t_rc(self):
     """Return the reverse complement of the run's ``adapter_t``.

     The value is computed on first use and stored on the instance; the
     stored value is reused for as long as it is truthy.
     """
     if self.__adapter_t_rc:
         return self.__adapter_t_rc
     self.__adapter_t_rc = reverse_complement(self._run.adapter_t)
     return self.__adapter_t_rc
Ejemplo n.º 21
0
            # get intron
            intron = fna[contig][ibeg:(beg - 1)]
            ibeg = end
            if ipre and not args.exon and len(intron) > 0:
                intron_seq = re.sub(
                    '^>\.', '>',
                    '>%s.intron_%s %s\n%s %s' % (pre, ipre, gname, intron))
                writeout(intron_seq, org=pre, gene=gname)
            ipre = gid
            if ifin == '':
                ifin = beg

            # get gene
            gene = fna[contig][(beg - 1):end]
            if strand == '-':
                gene = util.reverse_complement(gene)
            if args.regex and not re.search(args.regex, gname):
                continue
            if len(gene) > 0:
                exon_seq = re.sub('^>\.', '>',
                                  '>%s.%s %s\n%s' % (pre, gid, gname, gene))
                writeout(exon_seq, org=pre, gene=gname)

# final intron: joins the tail of the contig after ibeg to the head of the
# contig before position ifin
intron = fna[contig][ibeg:] + fna[contig][:(ifin - 1)]
if not args.exon and len(intron) > 0:
    # Raw string so the regex backslash is not parsed as a string escape;
    # a leading '>.' (empty `pre`) is collapsed to '>'.
    intron_seq = re.sub(r'^>\.', '>',
                        '>%s.intron_%s %s\n%s' % (pre, ipre, gname, intron))
    writeout(intron_seq, org=pre, gene=gname)

for fh in fhs:
Ejemplo n.º 22
0
    def make(self):
        """Build the text diagram for ``self.pair`` and return it as one string.

        Lines are accumulated in ``self.lines`` through the ``_add_line`` /
        ``_make_*`` helpers, in this order: identifier, R1 section, optional
        linker, R2 section, target lines, result lines, feature markers and
        the summary.  ``self.bars`` is pushed to and popped from along the way
        so that each helper sees the bar columns that are active at that point.
        """
        pair = self.pair
        r1 = pair.r1
        r2 = pair.r2
        self.lines = []

        # handling left-of-target matches
        self.prefix_len += max(r1.rtrim - (r1.match_index or 0) + 1,
                               r2.ltrim,
                               max(-self.match_index(r2), 0))

        self._add_line("@" + pair.identifier)

        # --- R1 section ---
        r1_bars = self._make_part(r1)
        self.bars.append(r1_bars)
        if r1.trimmed:
            self._make_adapter_line(r1, self.run.adapter_b, "adapter_b")

        if pair.mask and r1.matched:
            self._add_line("")
            if r1.indels:
                self._make_part_ins(r1)
            self._make_r1_rev()

        has_r1_errors = r1.match_errors or r1.adapter_errors
        r1_errors = self._make_part_errors(r1) if has_r1_errors else None
        if self.show_quality:
            self._make_part_quality(r1)

        if pair.linker is not None:
            self._make_linker()

        self._add_line("")

        # --- R2 section ---
        if r2.indels:
            self._make_part_ins(r2)
        r2_bars = self._make_part(r2)
        self.bars.append(r2_bars)
        has_r2_errors = r2.match_errors or r2.adapter_errors
        r2_errors = self._make_part_errors(r2) if has_r2_errors else None
        if self.show_quality:
            self._make_part_quality(r2)
        if r2.trimmed:
            if pair.dumbbell is not None:
                label = "DB+RC(adp_t)"
            else:
                label = "RC(adapter_t)"
            self._make_adapter_line(
                r2, reverse_complement(self.run.adapter_t), label)
        elif pair.dumbbell is not None:
            self._make_adapter_line(r2, None, "DUMBBELL")

        self._add_line("")

        # --- target ---
        self._make_target_lines()
        if r1_errors:
            self.bars.remove(r1_errors)
        if r2_errors:
            self.bars.remove(r2_errors)

        # --- feature markers (site / end / mutations) ---
        # note: positions other than the site are decremented to make
        # off-by-one conventions align better (xref 'conventions' above)
        features = {}
        if pair.site is not None:
            features['Site'] = pair.site
        if pair.end is not None:
            features['End'] = pair.end - 1
        if pair.mutations:
            for idx, mut in enumerate(pair.mutations, 1):
                if pair.edge_mut:
                    key = 'EdgeMut{} ({})'.format(idx, pair.edge_mut)
                else:
                    key = 'Mut{}'.format(idx)
                features[key] = mut - 1
        for position in features.values():
            self.bars.append([position + self.prefix_len])

        # --- results; each read's bars are dropped once its line is emitted ---
        if r2.matched:
            self._make_result(r2)
        self.bars.remove(r2_bars)
        if r1.matched:
            self._make_result(r1)
        else:
            self._add_line("")
        self.bars.remove(r1_bars)

        for key, position in features.items():
            self._add_marker(key, position)
            self.bars.remove([position + self.prefix_len])

        # shorten very wide diagrams for matches far into the target
        if pair.matched and pair.left > 100:
            self._snip_lines(self.prefix_len + 15, self.prefix_len + 85)

        self._add_line("")
        self._make_summary()

        return "\n".join(self.lines)
Ejemplo n.º 23
0
 def get_node_seq(self, nodeID):
     """Return the sequence of the node identified by ``nodeID``.

     IDs are offset by one from list positions: a non-negative ID selects
     ``self.nodes[nodeID - 1]`` as stored, while a negative ID selects
     ``self.nodes[-nodeID - 1]`` and returns its reverse complement.
     """
     if nodeID >= 0:
         return self.nodes[nodeID - 1].seq
     return reverse_complement(self.nodes[-nodeID - 1].seq)
Ejemplo n.º 24
0
 def reverse_complement(self):
     """Return the reverse complement of ``self.subsequence``.

     The result is stored in ``self._seq_rc`` and reused on later calls for
     as long as the stored value is truthy.
     """
     cached = self._seq_rc
     if cached:
         return cached
     self._seq_rc = reverse_complement(self.subsequence)
     return self._seq_rc
Ejemplo n.º 25
0
 def reverse_complement(self):
     """Reverse complement of ``self.subsequence``, memoised in ``_seq_rc``.

     A truthy ``self._seq_rc`` is returned as is; otherwise it is computed,
     stored on the instance and returned.
     """
     if self._seq_rc:
         return self._seq_rc
     rc = reverse_complement(self.subsequence)
     self._seq_rc = rc
     return self._seq_rc
Ejemplo n.º 26
0
    def process_pair(self, pair):
        """Match a read pair against its target and record a count or a failure.

        The pair is mutated in place.  The steps are, in order:

        1. pre-trim the dumbbell (R2, and R1 if present in full), any full
           ``adapter_b`` on R1, and -- for cotrans runs -- the linker;
        2. find exactly-matching substrings, then extend/verify the match
           (alignment when ``run.handle_indels`` is set, adapter trimming
           otherwise);
        3. handle R2 starting left of the target (prefix counting/collapsing);
        4. apply the error, right-edge, overlap, mutation, full-read and
           RT-primer filters;
        5. set ``pair.site`` and register the pair with ``self.counters``.

        Returns ``None`` in every case.  When a check rejects the pair the
        method returns early, in most cases after setting ``pair.failure``
        and bumping the corresponding counter.
        """
        if not self._check_indeterminate(pair) or not self._match_mask(pair):
            return

        run = self._run
        masklen = pair.mask.length()
        pair.r2.auto_adjust_match = True
        #pair.r1.auto_adjust_match = True

        ## Pre-trim dumbbell from R2 immediately since it won't help with alignment
        ## and simplifies dealing with match_index, match_start, etc.
        ## (It's also a bit faster for SW.)
        dumblen = 0
        if run.dumbbell:
            if not pair.r2.original_seq.startswith(run.dumbbell):
                pair.failure = Failures.dumbbell
                self.counters.dumbbell_failures += pair.multiplicity
                return
            dumblen = len(run.dumbbell)
            pair.r2.ltrim = dumblen
            pair.dumbbell = -dumblen
            # Also may as well trim R1 dumbbell if it's there in full...
            # TAI:  this may get the adapter too. if so, do we need to count that?  (no. only counting R2s.)
            # TAI:  but we're not checking the adapter for errors here.
            dbi = pair.r1.original_seq.rfind(
                reverse_complement(run.dumbbell),
                masklen + run.minimum_target_match_length)
            if -1 != dbi:
                pair.r1.rtrim = pair.r1.original_len - dbi
                pair.r1.fully_rtrimmed = True

        ## Also pre-trim any (full) adapter on R1 for the same reason
        if not pair.r1.fully_rtrimmed and run.adapter_b:
            adi = pair.r1.original_seq.rfind(
                run.adapter_b, masklen + run.minimum_target_match_length)
            if -1 != adi:
                pair.r1.rtrim = pair.r1.original_len - adi
                pair.r1.fully_rtrimmed = True
                self.counters.adapter_trimmed += pair.multiplicity

        ## Finally, pre-trim any linker from R1 and R2 since these'll screw up alignment
        if run.cotrans:
            if pair.r1.subsequence.startswith(
                    reverse_complement(run.cotrans_linker)):
                pair.r1.ltrim += len(run.cotrans_linker)
                pair.linker = pair.r1.seq_len  # linker is currently in R1 coordinates
            else:
                pair.failure = Failures.linker
                return
            pair.r2.linker_start = string_find_with_overlap(
                run.cotrans_linker, pair.r2.subsequence)
            if -1 != pair.r2.linker_start:
                # Trimming partial R2 linker matches here is a little overzealous.
                # (if, say, only 1 bp matches the linker, it could also match the target.)
                # However, we need to do it to prevent partial linker matches from screwing
                # up our alignment with the target.
                # After alignment, we circle back to re-add what we trimmed if it was a mistake.
                full_trim_len = pair.r2.seq_len - pair.r2.linker_start
                pair.r2.linker_trim_len = full_trim_len
                r2_adapter_len = full_trim_len - len(
                    run.cotrans_linker) - masklen
                if r2_adapter_len > 0:
                    pair.r2.adapter_trimmed = pair.r2.subsequence[
                        -r2_adapter_len:]
                    pair.r2.linker_trim_len = max(
                        0, full_trim_len - r2_adapter_len)
                pair.r2.rtrim += full_trim_len
                pair.r2.fully_rtrimmed = True

        ## Now find best exactly matching substring with target
        self._find_matches(pair)
        if not pair.matched:
            if not run.cotrans or pair.r1.ltrim == 0:
                self.counters.unmatched += pair.multiplicity
                pair.failure = Failures.nomatch
                self._check_targetrc(pair)
                return
            if not self._cotrans_find_short_matches(pair):
                self._check_targetrc(pair)
                return

        # Alignment parameters are only needed (and only built) for indel handling.
        ap = None
        if run.handle_indels:
            if run.allow_indeterminate:
                simfn = lambda nt1, nt2: base_similarity_ind(
                    nt1, nt2, run.indel_match_value, run.indel_mismatch_cost,
                    .5 * run.indel_match_value)
            else:
                simfn = lambda nt1, nt2: AlignmentParams.char_sim(
                    nt1, nt2, run.indel_match_value, run.indel_mismatch_cost)
            ap = AlignmentParams(simfn, run.indel_gap_open_cost,
                                 run.indel_gap_extend_cost)

        ## And extend the match if necessary using string alignment on the rest
        aligned = False
        if pair.fully_matched:
            # Don't just trim adapter here b/c in cotrans case it has already been done
            if not self._verify_full_match(pair):
                return
        elif run.handle_indels:
            if not self._extend_match(pair, ap):
                return
            aligned = True
        elif not self._trim_adapters(pair):
            if not pair.failure:
                pair.failure = Failures.adapter_trim
                self.counters.adapter_trim_failure += pair.multiplicity
            return

        if pair.linker:
            ## shift pair.linker into target coordinates.
            ## pair.linker now equates to 1 beyond the end of the match in the *target* (where the linker starts)
            ## TAI:  keep in R1 coordinates?
            pair.linker += pair.r1.match_index - pair.r1.match_start - pair.r1.indels_delta

        if run.cotrans:
            if pair.r2.linker_start != -1 and pair.r2.linker_trim_len > 0 and pair.r2.right < pair.r1.right:
                # what we trimmed off of R2 above *wasn't* the linker! restore it.
                pair.r2.rtrim -= pair.r2.linker_trim_len
                pair.r2.fully_rtrimmed = False
                if run.handle_indels:
                    pair.r2.extend_alignment(pair.target, ap)
                else:
                    pair.r2.match_len += pair.r2.linker_trim_len
            if run.handle_indels and not pair.check_overlap(False):
                self.counters.mismatching_indel_pairs += pair.multiplicity
                #_warn("cotrans pair disagrees on overlap. alignment may be off.")
                #pair.failure = Failures.r1_r2_overlap
                #return

        # TAI:  should we treat a mutation at the site as a prefix?  what if it's at site=0?
        counted_prefix = None
        if pair.r2.match_start > pair.r2.match_index:
            # R2 begins to the left of the target's first nt.
            assert (
                not aligned or pair.r2.match_index == 0
            )  # align_strings() will ensure this if penalize_ends is True
            r2_start_in_target = pair.r2.match_index - pair.r2.match_start  # will be negative
            self.counters.left_of_target += pair.multiplicity
            prefix = pair.r2.original_seq[dumblen:dumblen - r2_start_in_target]
            if run.count_left_prefixes:
                if (run.mutations_require_quality_score is None
                        or pair.check_prefix_quality(
                            dumblen - r2_start_in_target,
                            run.mutations_require_quality_score, dumblen)):
                    self.counters.register_prefix(prefix, pair)
                    counted_prefix = prefix
                else:
                    self.counters.low_quality_prefixes += pair.multiplicity
            if run.collapse_left_prefixes and (
                    not run.collapse_only_prefixes
                    or prefix in run._p_collapse_only_prefix_list):
                # Trim the overhanging prefix off R2 (and any matching part of
                # R1) so the pair is treated as starting at the target's left edge.
                pair.r2.ltrim -= r2_start_in_target
                pair.r2.match_start += r2_start_in_target
                pair.r2.shift_indels(r2_start_in_target)
                if pair.r1.match_start > 0 and pair.r1.match_index == pair.r2.match_index:
                    r1prefix = min(-r2_start_in_target, pair.r1.match_start)
                    pair.r1.rtrim += r1prefix
                    pair.r1.match_start -= r1prefix
                    pair.r1.shift_indels(-r1prefix)
            else:
                pair.failure = Failures.left_of_zero
                self.counters.left_of_zero += pair.multiplicity
                return
        elif run.count_left_prefixes:
            counted_prefix = "NONE"
            self.counters.register_prefix(counted_prefix, pair)

        if run.cotrans and pair.right < run.cotrans_minimum_length:
            pair.failure = Failures.cotrans_min
            return

        if not aligned:
            # Without alignment, mismatches have to be computed directly.
            pair.r1.match_to_seq()
            pair.r2.match_to_seq()
            pair.r1.match_errors = string_match_errors(
                pair.r1.reverse_complement,
                pair.target.seq[pair.r1.match_index:])
            pair.r2.match_errors = string_match_errors(
                pair.r2.subsequence, pair.target.seq[pair.r2.match_index:])

        if run._p_rois and (pair.r1.error_in_region(run._p_rois)
                            or pair.r2.error_in_region(run._p_rois)):
            pair.interesting = True
            self.counters.interesting_pairs += pair.multiplicity

        # Error budget: worst read's mismatches plus worst read's indels.
        # Both reads' indels are considered (R2's were previously ignored
        # because R1's were compared against themselves).
        worst_mismatches = max(len(pair.r1.match_errors),
                               len(pair.r2.match_errors))
        worst_indels = max(len(pair.r1.indels), len(pair.r2.indels))
        if worst_mismatches + worst_indels > run.allowed_target_errors:
            if pair.r1.match_errors:
                _debug("R1 errors: {}".format(pair.r1.match_errors))
            if pair.r2.match_errors:
                _debug("R2 errors: {}".format(pair.r2.match_errors))
            pair.failure = Failures.match_errors
            self.counters.match_errors += pair.multiplicity
            return

        if ((not run.cotrans and pair.right != pair.target.n
             and not run.allow_multiple_rt_starts) or
            (run.cotrans and
             (pair.right > pair.target.n or pair.right < pair.left
              or pair.r1.rtrim + pair.r1.match_start + pair.r1.match_len +
              pair.r1.indels_delta + pair.r1.ltrim < pair.r1.original_len))):
            pair.failure = Failures.right_edge
            self.counters.r1_not_on_right_edge += pair.multiplicity
            return

        if run.ignore_stops_with_mismatched_overlap and not pair.check_overlap(
                True):
            pair.failure = Failures.r1_r2_overlap
            self.counters.r1_r2_overlap += pair.multiplicity
            return

        if run.count_mutations:
            pair.check_mutations()
            self.counters.register_mut_count(pair)
            if pair.mutations and len(
                    pair.mutations) > run.allowed_target_errors:
                pair.failure = Failures.match_errors
                self.counters.match_errors += pair.multiplicity
                return
            self.counters.low_quality_muts += pair.check_mutation_quality(
                run.mutations_require_quality_score)

        if run.count_only_full_reads and pair.left != 0:
            pair.failure = Failures.not_full_read
            return

        if run.handle_indels and not pair.indels_match:
            self.counters.mismatching_indel_pairs += pair.multiplicity
            # TAI:  might bail on these

        if run.allow_multiple_rt_starts and run.rt_primers:
            # Require the entire primer to be present (no stops/starts within)
            for primer in run._p_rt_primers_list:
                if len(primer) <= pair.r1.seq_len and pair.r1.subsequence[
                        0:len(primer)] == primer:
                    self.counters.increment_key('rt_primer_{}'.format(primer))
                    break
            else:
                # for/else: no primer matched the start of R1
                pair.failure = Failures.no_rt_primer
                self.counters.no_rt_primer += pair.multiplicity
                return

        pair.site = pair.left
        if run._p_rois and not pair.interesting:
            for roi in run._p_rois:
                if roi[0] <= pair.site and pair.site <= roi[1]:
                    pair.interesting = True
                    self.counters.interesting_pairs += pair.multiplicity
                    break
        self.counters.register_count(pair)
        if counted_prefix:
            self.counters.register_mapped_prefix(counted_prefix, pair)
        if run.count_mutations:
            self.counters.register_mapped_mut_count(pair)
Ejemplo n.º 27
0
def test_reverse_complement():
    """Check ``reverse_complement`` on a palindromic and a non-palindromic input."""
    # 'ACGT' is its own reverse complement, so by itself it cannot tell a
    # correct implementation from one that returns its argument unchanged;
    # 'AACG' -> 'CGTT' exercises both the reversal and the complementing.
    cases = [
        ('ACGT', 'ACGT'),
        ('AACG', 'CGTT'),
    ]
    for seq, expected in cases:
        actual = reverse_complement(seq)

        assert (expected == actual)
Ejemplo n.º 28
0
    def _build_R1_lookup(self,
                         adapter_b,
                         length=31,
                         end_only=True,
                         mutations=False,
                         dumbbell=None):
        """Pre-build the table of all error-free R1 candidates for every target.

        This is possible because R1 has to include the right-most nt of the
        target and may only run off the end into the dumbbell and/or
        ``adapter_b``.  The handle is *not* part of the candidates.

        The table maps each candidate string to a list of tuples
        ``(target, end, amount of adapter to trim, mutations)``.  ``end`` is
        ``None`` when the candidate consists of ``length`` target nts and no
        overhang.  With ``mutations`` set, every single-nt substitution within
        the target part of a candidate is added too, tagged with the mutated
        position.  The table is stored in ``self.r1_lookup`` and then passed
        through ``self._build_R1_aliases``.

        ``end_only`` is accepted but not used here.
        """
        lookup = {}
        for target in self.targets:
            tlen = target.n
            rc_tgt = reverse_complement(target.seq)
            rc_dumbbell = reverse_complement(dumbbell) if dumbbell else None
            n_candidates = 0
            for i in xrange(1, length + 1):
                # i nts come from the target; the remaining `overhang` nts are
                # filled from the dumbbell (if any) and then adapter_b
                overhang = length - i
                target_part = rc_tgt[:i]
                if not rc_dumbbell:
                    r1_candidate = target_part + adapter_b[:overhang]
                elif overhang <= len(rc_dumbbell):
                    r1_candidate = target_part + rc_dumbbell[:overhang]
                else:
                    r1_candidate = (target_part + rc_dumbbell +
                                    adapter_b[:overhang - len(rc_dumbbell)])

                end = None if i == length else tlen - i
                # target, end, amount of adapter to trim, mutations
                res = (target, end, overhang, [])
                lookup.setdefault(r1_candidate, []).append(res)
                n_candidates += 1

                if not mutations:
                    continue
                for toggle_idx in xrange(i):
                    original_nt = r1_candidate[toggle_idx]
                    for nt in 'ACGT':
                        if nt == original_nt:
                            continue
                        mutated = (r1_candidate[:toggle_idx] + nt +
                                   r1_candidate[toggle_idx + 1:])
                        mres = (target, end, overhang, [tlen - toggle_idx])
                        lookup.setdefault(mutated, []).append(mres)

            if 0 == n_candidates:
                _warn("!! No R1 match candidates for {}".format(target.name))

        self.r1_lookup = lookup
        self._build_R1_aliases(adapter_b, length)
Ejemplo n.º 29
0
    def _try_lookup_hit(self, pair, r1_res):
        """Validate one R1 lookup hit against R2 and, on success, count the pair.

        ``r1_res`` is indexed as a 4-item sequence: the target, the match
        site (``None`` when it must be looked up from R2 instead), the amount
        of R1 to right-trim, and a list of R1 mutation positions.

        Checks are applied in order (R2 vs. target, R2 adapter, R1 vs. target
        when the site came from R2, dumbbell, R1 adapter, indeterminate nts,
        overlap, mutation count, full-read).  A failing check sets
        ``pair.failure`` where this method is responsible for it and returns
        early.  On success ``pair.site`` and ``pair.end`` are set,
        ``pair.failure`` is cleared and the pair is registered with
        ``self.counters``.  Always returns ``None``.
        """
        targets = self._targets
        run = self._run
        masklen = pair.mask.length()

        site = -1
        target = r1_res[0]
        r2len = pair.r2.original_len
        r2_mutations = []
        if r1_res[1] is None:
            # R1 did not pin down the site; ask the R2 lookup.
            r2_res = targets.lookup_r2(target.name, pair.r2.original_seq)
            if r2_res is not None:
                match_site = r2_res[0]
            else:
                pair.failure = Failures.nomatch
                return
        else:
            match_site = r1_res[1]

        match_len = min(r2len, target.n - match_site)
        r2_match_start = 0
        if run.dumbbell:
            r2_match_start = len(run.dumbbell)
            # TODO: dumbbell errors
            if pair.r2.original_seq[:r2_match_start] != run.dumbbell:
                pair.failure = Failures.dumbbell
                return
            pair.dumbbell = match_site - r2_match_start
            match_len = min(r2len - r2_match_start, target.n - match_site)
            pair.r2.ltrim = r2_match_start

        # need to check R2 against expectation
        if match_len > 0:
            # The matched part of R2 starts after the dumbbell and is
            # match_len long, so the slice end is start + length (using
            # match_len alone as the end would skip the last
            # len(dumbbell) nts whenever a dumbbell is present).
            r2_end = r2_match_start + match_len
            pair.r2.match_errors = string_match_errors(
                pair.r2.original_seq[r2_match_start:r2_end],
                target.seq[match_site:match_site + match_len])
            #+1 for M_j indexing convention, xref https://trello.com/c/2qIGo9ZR/201-stop-map-mutation-indexing-convention
            # Built as a list so it can be concatenated with r1_res[3] below.
            r2_mutations = [x + match_site + 1 for x in pair.r2.match_errors]
        if match_len <= 0 or len(pair.r2.match_errors) > run.allowed_target_errors:
            pair.failure = Failures.match_errors
            return

        # Whatever is left in R2 after dumbbell, match and mask is adapter.
        adapter_len = r2len - match_len - r2_match_start - masklen
        if adapter_len > 0:
            pair.r2.adapter_errors = string_match_errors(
                self._adapter_t_rc[:adapter_len],
                pair.r2.original_seq[-adapter_len:])
            if len(pair.r2.adapter_errors) > run.allowed_adapter_errors:
                pair.failure = Failures.adapter_trim
                return
        site = match_site

        # in rare cases, need to double-check what R1 should be based on R2
        if site != r1_res[1]:
            match_len = min(pair.r1.original_len - masklen, target.n - match_site)
            adapter_len = pair.r1.original_len - match_len - masklen
            pair.r1.match_errors = string_match_errors(
                reverse_complement(target.seq[-match_len:]),
                pair.r1.original_seq[masklen:masklen + match_len])
            if len(pair.r1.match_errors) > run.allowed_target_errors:
                pair.failure = Failures.match_errors
                return
            if adapter_len > 0 and pair.dumbbell:
                if adapter_len > r2_match_start:
                    dumbbell_len = len(run.dumbbell)
                    adapter_len -= r2_match_start
                else:
                    dumbbell_len = adapter_len
                    adapter_len = 0
                # TODO: dumbbell errors
                if pair.r1.reverse_complement[adapter_len:dumbbell_len] != run.dumbbell[-dumbbell_len:]:
                    pair.failure = Failures.dumbbell
                    return
            if adapter_len > 0:
                pair.r1.adapter_errors = string_match_errors(
                    pair.r1.original_seq[-adapter_len:],
                    self._run.adapter_b[:adapter_len])
                if len(pair.r1.adapter_errors) > run.allowed_adapter_errors:
                    pair.failure = Failures.adapter_trim
                    return

        if not self._check_indeterminate(pair):
            return

        pair.r1.match_len = min(pair.r1.original_len - masklen, target.n - match_site)
        pair.r1.match_index = r1_res[1] or (target.n - pair.r1.match_len)
        pair.r1.rtrim = r1_res[2]
        pair.r2.match_index = site
        pair.r2.match_len = match_len

        if run.ignore_stops_with_mismatched_overlap and not pair.check_overlap():
            pair.failure = Failures.r1_r2_overlap
            return

        pair.target = target
        if r2_mutations or r1_res[3]:
            # de-duplicate positions reported by both reads
            pair.mutations = list(set(r2_mutations + r1_res[3]))
            if pair.mutations and len(pair.mutations) > run.allowed_target_errors:
                pair.failure = Failures.match_errors
                return
            self.counters.low_quality_muts += pair.check_mutation_quality(run.mutations_require_quality_score)

        if run.count_only_full_reads and site != 0:
            pair.failure = Failures.not_full_read
            return

        pair.site = site
        pair.end = target.n
        pair.failure = None
        self.counters.register_count(pair)
Ejemplo n.º 30
0
 def _adapter_t_rc(self):
     """Lazily computed reverse complement of ``self._run.adapter_t``.

     A truthy cached value is returned directly; otherwise the reverse
     complement is computed, stored on the instance and returned.
     """
     if self.__adapter_t_rc:
         return self.__adapter_t_rc
     self.__adapter_t_rc = reverse_complement(self._run.adapter_t)
     return self.__adapter_t_rc