Example #1
 def _get_contaminants(self):
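     """Count k-mer matches between the reads and each known contaminant and
     return a Match for every contaminant seen in at least `min_count` reads.
     """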
     contaminant_matchers = create_contaminant_matchers(
         self.known_contaminants, self.kmer_size)
     counts = defaultdict(int)
     max_match_fracs = defaultdict(int)
     
     for seq in self._read_sequences:
         seqrc = reverse_complement(seq)
         for contam in contaminant_matchers:
             match = contam.match(seq, seqrc)
             if match[0] > self.min_kmer_match_frac:
                 counts[contam] += 1
                 if match[0] > max_match_fracs[contam]:
                     max_match_fracs[contam] = match[0]
     
     min_count = math.ceil(
         self.n_reads * (self._read_length - self._min_k + 1) *
         self.overrep_cutoff / float(4**self._min_k))
     
     return [
         Match(
             c[0], match_frac=max_match_fracs[c[0]], 
             abundance=float(c[1]) / self.n_reads)
         for c in filter(
             lambda x: x[1] >= min_count,
             counts.items()
         )
     ]
Example #2
 def __call__(self, read1, read2):
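     """Attempt to merge overlapping paired-end reads. `read1` is aligned
     against the reverse complement of `read2`; if the alignment has at
     least `min_overlap` matching bases, the reads are merged into `read1`
     and `read2` is returned as None.
     """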
     len1 = len(read1.sequence)
     len2 = len(read2.sequence)
     min_overlap = self.min_overlap
     if min_overlap <= 1:
         min_overlap = max(2, round(self.min_overlap * min(len1, len2)))
     
     if len1 < min_overlap or len2 < min_overlap:
         return (read1, read2)
     
     insert_matched = read1.insert_overlap and read2.insert_overlap
     
     if insert_matched:
         # If we've already determined that there is an insert overlap
         # with a 3' overhang, we can constrain our alignment
         aflags = START_WITHIN_SEQ1 | STOP_WITHIN_SEQ2
     else:
         aflags = SEMIGLOBAL
     # align read1 to read2 reverse-complement to be compatible with
     # InsertAligner
     read2_rc = reverse_complement(read2.sequence)
     aligner = Aligner(read2_rc, self.error_rate, aflags)
     alignment = aligner.locate(read1.sequence)
     
     if alignment:
         r2_start, r2_stop, r1_start, r1_stop, matches, errors = alignment
         if matches >= min_overlap:
             # Only correct errors if we haven't already done correction in
             # the InsertAligner
             if self.mismatch_action and errors > 0 and not insert_matched:
                 self.correct_errors(read1, read2, alignment)
             
             if r2_start == 0 and r2_stop == len2:
                 # r2 is fully contained in r1
                 pass
             elif r1_start == 0 and r1_stop == len1:
                 # r1 is fully contained in r2
                 read1.sequence = read2_rc
                 read1.qualities = "".join(reversed(read2.qualities))
             elif r1_start > 0:
                 read1.sequence += read2_rc[r2_stop:]
                 if read1.qualities and read2.qualities:
                     read1.qualities += "".join(
                         reversed(read2.qualities))[r2_stop:]
             elif r2_start > 0:
                 read1.sequence = read2_rc + read1.sequence[r1_stop:]
                 if read1.qualities and read2.qualities:
                     read1.qualities = (
                         "".join(reversed(read2.qualities)) +
                         read1.qualities[r1_stop:])
             else:
                 raise AtroposError(
                     "Invalid alignment while trying to merge read "
                     "{}: {}".format(
                         read1.name, ",".join(str(i) for i in alignment)))
             
             read1.merged = True
             read2 = None
             
     return (read1, read2)
Example #3
    def add(self, name, seq):
        """Add a sequence to the cache.

        Args:
            name: Adapter name.
            seq: Adapter sequence.
        """
        self._add(name, seq)
        if self.auto_reverse_complement:
            self._add("{}_rc".format(name), reverse_complement(seq))
Example #4
 def add(self, name, seq):
     """Add a sequence to the cache.
     
     Args:
         name: Adapter name.
         seq: Adapter sequence.
     """
     self._add(name, seq)
     if self.auto_reverse_complement:
         self._add("{}_rc".format(name), reverse_complement(seq))
Example #5
 def find_best_match(seq, best_matches, best_match_frac):
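     """Find best contaminant matches to `seq`.
     """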
     seqrc = reverse_complement(seq)
     for contam in contaminants:
         match_frac1, match_frac2, compare_seq = contam.match(seq, seqrc)
         if match_frac1 < best_match_frac[0]:
             continue
         if contam.seq in compare_seq or align(compare_seq, contam.seq, self.min_contaminant_match_frac):
             if match_frac1 > best_match_frac[0] or (
                 match_frac1 == best_match_frac[0] and match_frac2 > best_match_frac[1]
             ):
                 best_matches = {}
                 best_match_frac = (match_frac1, match_frac2)
             best_matches[contam] = (match, (match_frac1, match_frac2))
     return (best_matches, best_match_frac)
Example #6
 def find_best_match(seq, best_matches, best_match_frac):
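     """Find best contaminant matches to `seq`.
     """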
     seqrc = reverse_complement(seq)
     for contam in contaminants:
         match_frac1, match_frac2, compare_seq = contam.match(
             seq, seqrc)
         if match_frac1 < best_match_frac[0]:
             continue
         if (contam.seq in compare_seq
                 or align(compare_seq, contam.seq,
                          self.min_contaminant_match_frac)):
             if (match_frac1 > best_match_frac[0]
                     or (match_frac1 == best_match_frac[0]
                         and match_frac2 > best_match_frac[1])):
                 best_matches = {}
                 best_match_frac = (match_frac1, match_frac2)
             best_matches[contam] = (match, (match_frac1,
                                             match_frac2))
     return (best_matches, best_match_frac)
Example #7
    def _get_contaminants(self, read_seqs):
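        """Count contaminant k-mer matches in `read_seqs` and return a Match
        for every contaminant seen in at least `min_count` reads.
        """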
        contaminant_matchers = create_contaminant_matchers(self.known_contaminants, self.kmer_size)
        counts = defaultdict(lambda: 0)

        for seq in read_seqs:
            seqrc = reverse_complement(seq)
            for contam in contaminant_matchers:
                match = contam.match(seq, seqrc)
                if match[0] > self.min_match_frac:
                    counts[contam] += 1

        min_count = math.ceil(
            self.n_reads * (self._read_length - self._min_k + 1) * self.overrep_cutoff / float(4 ** self._min_k)
        )

        return [
            Match(c[0], match_frac=float(c[1]) / self.n_reads)
            for c in filter(lambda x: x[1] >= min_count, counts.items())
        ]
Example #8
    def _get_contaminants(self, read_seqs):
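        """Count contaminant k-mer matches in `read_seqs` and return a Match
        for every contaminant seen in at least `min_count` reads.
        """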
        contaminant_matchers = create_contaminant_matchers(
            self.known_contaminants, self.kmer_size)
        counts = defaultdict(lambda: 0)

        for seq in read_seqs:
            seqrc = reverse_complement(seq)
            for contam in contaminant_matchers:
                match = contam.match(seq, seqrc)
                if match[0] > self.min_match_frac:
                    counts[contam] += 1

        min_count = math.ceil(self.n_reads *
                              (self._read_length - self._min_k + 1) *
                              self.overrep_cutoff / float(4**self._min_k))

        return [
            Match(c[0], match_frac=float(c[1]) / self.n_reads)
            for c in filter(lambda x: x[1] >= min_count, counts.items())
        ]
Example #9
            def find_best_match(_seq, _best_matches, _best_match_frac):
                """Find best contaminant matches to `seq`.
                """
                seqrc = reverse_complement(_seq)

                for _contam in contaminants:
                    match_frac1, match_frac2, compare_seq = _contam.match(
                        _seq, seqrc)
                    if match_frac1 < _best_match_frac[0]:
                        continue
                    if (_contam.seq in compare_seq
                            or align(compare_seq, _contam.seq,
                                     self.min_contaminant_match_frac)):
                        if (match_frac1 > _best_match_frac[0]
                                or (match_frac1 == _best_match_frac[0]
                                    and match_frac2 > _best_match_frac[1])):
                            _best_matches = {}
                            _best_match_frac = (match_frac1, match_frac2)
                        _best_matches[_contam] = (match, (match_frac1,
                                                          match_frac2))

                return _best_matches, _best_match_frac
Example #10
    def match_insert(self, seq1, seq2):
        """Use cutadapt aligner for insert and adapter matching"""
        l1 = len(seq1)
        l2 = len(seq2)
        seq_len = min(l1, l2)
        if l1 > l2:
            seq1 = seq1[:l2]
        elif l2 > l1:
            seq2 = seq2[:l1]

        seq2_rc = reverse_complement(seq2)
        
        def _match(insert_match, offset, insert_match_size, prob):
            if offset < self.min_adapter_overlap:
                # The reads are mostly overlapping, to the point where
                # there's not enough overhang to do a confident adapter
                # match. We return just the insert match to signal that
                # error correction can be done even though no adapter
                # trimming is required.
                return (insert_match, None, None)
            
            # TODO: this is very sensitive to the exact correct choice of adapter.
            # For example, if you specify GATCGGAA... and the correct adapter is
            # AGATCGGAA..., the prefixes will not match exactly and the alignment
            # will fail. We need to use a comparison that is a bit more forgiving.
            
            a1_match = compare_prefixes(seq1[insert_match_size:], self.adapter1)
            a2_match = compare_prefixes(seq2[insert_match_size:], self.adapter2)
            adapter_len = min(offset, self.adapter1_len, self.adapter2_len)
            max_adapter_mismatches = round(adapter_len * self.max_adapter_mismatch_frac)
            if a1_match[5] > max_adapter_mismatches and a2_match[5] > max_adapter_mismatches:
                return None
            
            a1_prob = self.match_probability(a1_match[4], adapter_len)
            a2_prob = self.match_probability(a2_match[4], adapter_len)
            if (adapter_len > self.adapter_check_cutoff) and ((a1_prob * a2_prob) > self.adapter_max_rmp):
                return None

            adapter_len1 = min(self.adapter1_len, l1 - insert_match_size)
            adapter_len2 = min(self.adapter2_len, l2 - insert_match_size)
            best_adapter_matches, best_adapter_mismatches = (a1_match if a1_prob < a2_prob else a2_match)[4:6]
            
            return (
                insert_match,
                Match(0, adapter_len1, insert_match_size, l1, best_adapter_matches, best_adapter_mismatches),
                Match(0, adapter_len2, insert_match_size, l2, best_adapter_matches, best_adapter_mismatches)
            )
        
        # # This is the old way of doing things, where we use the built-in
        # # Aligner to do a single match.
        # aligner = Aligner(
        #     seq2_rc,
        #     self.max_insert_mismatch_frac,
        #     START_WITHIN_SEQ1 | STOP_WITHIN_SEQ2,
        #     False, False)
        # aligner.min_overlap = self.min_insert_overlap
        # aligner.indel_cost = 100000
        #
        # insert_match = aligner.locate(seq1)
        #
        # if not insert_match:
        #     return None
        #
        # offset = min(insert_match[0], seq_len - insert_match[3])
        # insert_match_size = seq_len - offset
        # prob = self.match_probability(insert_match[4], insert_match_size)
        #
        # if prob > self.insert_max_rmp:
        #     return None
        #
        # return _match(insert_match, offset, insert_match_size, prob)
        
        # Use an aligner that returns all matches that satisfy the
        # overlap and error rate thresholds. We sort by matches and
        # then mismatches, and then check each in turn until we find
        # one with an adapter match (if any).
        
        insert_matches = self.aligner.locate(seq2_rc, seq1)
        
        if insert_matches:
            # Filter by random-match probability
            filtered_matches = []
            for insert_match in insert_matches:
                offset = min(insert_match[0], seq_len - insert_match[3])
                insert_match_size = seq_len - offset
                prob = self.match_probability(insert_match[4], insert_match_size, **self.base_probs)
                if prob <= self.insert_max_rmp:
                    filtered_matches.append((insert_match, offset, insert_match_size, prob))
            
            if filtered_matches:
                if len(filtered_matches) == 1:
                    return _match(*filtered_matches[0])
                else:
                    # Test matches in order of random-match probability.
                    # TODO: compare against sorting by length (which is how
                    # SeqPurge essentially does it).
                    #filtered_matches.sort(key=lambda x: x[2], reverse=True)
                    filtered_matches.sort(key=lambda x: x[3])
                    for m in filtered_matches:
                        match = _match(*m)
                        if match:
                            return match
            
            return None
Example #11
    def _get_contaminants(self):
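        """Use a khmer Countgraph to find over-represented k-mers, then match
        them against the known contaminants when any are provided.
        """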
        import khmer
        from khmer import khmer_args

        n_win = self._read_length - self.k + 1  # assuming all sequences are same length
        tablesize = self.n_reads * n_win
        countgraph = khmer.Countgraph(self.k, tablesize, khmer_args.DEFAULT_N_TABLES)
        countgraph.set_use_bigcount(True)

        for seq in self._read_sequences:
            countgraph.consume_and_tag(seq)

        n_expected = math.ceil(tablesize / float(4 ** self.k))
        min_count = n_expected * self.overrep_cutoff
        if min_count >= 2 ** 16:
            raise Exception(
                "The minimum count for an over-represented k-kmer {} "
                "is greater than the max khmer count (2^16)".format(min_count)
            )

        candidates = {}

        for tag in countgraph.get_tagset():
            count = countgraph.get(tag)
            if count >= min_count:
                candidates[tag] = count

        if self.known_contaminants:
            matches = []
            seen = set()

            def match(kmer):
                n = candidates.get(kmer, 0)
                if n > 0:
                    seen.add(kmer)
                return n

            for seq, names in self.known_contaminants.iter_sequences():
                l = len(seq)
                if l < self.k:
                    print("Cannot check {}; sequence is shorter than {}".format(list(names)[0], self.k))
                    continue

                n_kmers = l - self.k + 1
                num_matches = 0
                match_counts = []
                for i in range(n_kmers):
                    kmer = seq[i : (i + self.k)]
                    kmer_count = max(match(kmer), match(reverse_complement(kmer)))
                    if kmer_count > 0:
                        num_matches += 1
                        match_counts.append(kmer_count)

                if num_matches > 0:
                    # not sure what the correct metric is to use here
                    overall_count = sum(match_counts) / float(n_kmers)
                    matches.append(Match(seq, overall_count / float(tablesize), names, float(num_matches) / n_kmers))

            # Add remaining tags
            for tag in set(candidates.keys()) - seen:
                matches.append(Match(tag, candidates[tag] / float(tablesize)))

        else:
            matches = [Match(tag, count / float(tablesize)) for tag, count in candidates.items()]

        return matches
Example #12
    def match_insert(self, seq1, seq2):
        """Use cutadapt aligner for insert and adapter matching.
        
        Args:
            seq1, seq2: Sequences to match.
        
        Returns:
            A :class:`Match` object, or None if there is no match.
        """
        len1 = len(seq1)
        len2 = len(seq2)
        seq_len = min(len1, len2)
        if len1 > len2:
            seq1 = seq1[:len2]
        elif len2 > len1:
            seq2 = seq2[:len1]

        seq2_rc = reverse_complement(seq2)

        def _match(insert_match, offset, insert_match_size,
                   prob):  # pylint disable=unused-argument
            if offset < self.min_adapter_overlap:
                # The reads are mostly overlapping, to the point where
                # there's not enough overhang to do a confident adapter
                # match. We return just the insert match to signal that
                # error correction can be done even though no adapter
                # trimming is required.
                return (insert_match, None, None)

            # TODO: this is very sensitive to the exact correct choice of
            # adapter. For example, if you specify GATCGGAA... and the correct
            # adapter is AGATCGGAA..., the prefixes will not match exactly and
            # the alignment will fail. We need to use a comparison that is a bit
            # more forgiving.

            a1_match = compare_prefixes(seq1[insert_match_size:],
                                        self.adapter1)
            a2_match = compare_prefixes(seq2[insert_match_size:],
                                        self.adapter2)
            adapter_len = min(offset, self.adapter1_len, self.adapter2_len)
            max_adapter_mismatches = round(adapter_len *
                                           self.max_adapter_mismatch_frac)
            if (a1_match[5] > max_adapter_mismatches
                    and a2_match[5] > max_adapter_mismatches):
                return None

            a1_prob = self.match_probability(a1_match[4], adapter_len)
            a2_prob = self.match_probability(a2_match[4], adapter_len)
            if ((adapter_len > self.adapter_check_cutoff)
                    and ((a1_prob * a2_prob) > self.adapter_max_rmp)):
                return None

            adapter_len1 = min(self.adapter1_len, len1 - insert_match_size)
            adapter_len2 = min(self.adapter2_len, len2 - insert_match_size)
            best_adapter_matches, best_adapter_mismatches = (
                a1_match if a1_prob < a2_prob else a2_match)[4:6]

            return (insert_match,
                    Match(0, adapter_len1, insert_match_size, len1,
                          best_adapter_matches, best_adapter_mismatches),
                    Match(0, adapter_len2, insert_match_size, len2,
                          best_adapter_matches, best_adapter_mismatches))

        # # This is the old way of doing things, where we use the built-in
        # # Aligner to do a single match.
        # aligner = Aligner(
        #     seq2_rc,
        #     self.max_insert_mismatch_frac,
        #     START_WITHIN_SEQ1 | STOP_WITHIN_SEQ2,
        #     False, False)
        # aligner.min_overlap = self.min_insert_overlap
        # aligner.indel_cost = 100000
        #
        # insert_match = aligner.locate(seq1)
        #
        # if not insert_match:
        #     return None
        #
        # offset = min(insert_match[0], seq_len - insert_match[3])
        # insert_match_size = seq_len - offset
        # prob = self.match_probability(insert_match[4], insert_match_size)
        #
        # if prob > self.insert_max_rmp:
        #     return None
        #
        # return _match(insert_match, offset, insert_match_size, prob)

        # Use an aligner that returns all matches that satisfy the
        # overlap and error rate thresholds. We sort by matches and
        # then mismatches, and then check each in turn until we find
        # one with an adapter match (if any).

        insert_matches = self.aligner.locate(seq2_rc, seq1)

        if insert_matches:
            # Filter by random-match probability
            filtered_matches = []
            for insert_match in insert_matches:
                offset = min(insert_match[0], seq_len - insert_match[3])
                insert_match_size = seq_len - offset
                prob = self.match_probability(insert_match[4],
                                              insert_match_size,
                                              **self.base_probs)
                if prob <= self.insert_max_rmp:
                    filtered_matches.append(
                        (insert_match, offset, insert_match_size, prob))

            if filtered_matches:
                if len(filtered_matches) == 1:
                    return _match(*filtered_matches[0])
                else:
                    # Test matches in order of random-match probability.
                    # TODO: compare against sorting by length (which is how
                    # SeqPurge essentially does it).
                    #filtered_matches.sort(key=lambda x: x[2], reverse=True)
                    filtered_matches.sort(key=lambda x: x[3])
                    for match_args in filtered_matches:
                        match = _match(*match_args)
                        if match:
                            return match

            return None
Example #13
    def _get_contaminants(self):
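        """Use a khmer Countgraph to find over-represented k-mers, then match
        them against the known contaminants when any are provided.
        """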
        import khmer
        from khmer import khmer_args
        n_win = self._read_length - self.k + 1  # assuming all sequences are same length
        tablesize = self.n_reads * n_win
        countgraph = khmer.Countgraph(self.k, tablesize,
                                      khmer_args.DEFAULT_N_TABLES)
        countgraph.set_use_bigcount(True)

        for seq in self._read_sequences:
            countgraph.consume_and_tag(seq)

        n_expected = math.ceil(tablesize / float(4**self.k))
        min_count = n_expected * self.overrep_cutoff
        if min_count >= 2**16:
            raise Exception(
                "The minimum count for an over-represented k-kmer {} "
                "is greater than the max khmer count (2^16)".format(min_count))

        candidates = {}

        for tag in countgraph.get_tagset():
            count = countgraph.get(tag)
            if count >= min_count:
                candidates[tag] = count

        if self.known_contaminants:
            matches = []
            seen = set()

            def match(kmer):
                n = candidates.get(kmer, 0)
                if n > 0:
                    seen.add(kmer)
                return n

            for seq, names in self.known_contaminants.iter_sequences():
                l = len(seq)
                if l < self.k:
                    print(
                        "Cannot check {}; sequence is shorter than {}".format(
                            list(names)[0], self.k))
                    continue

                n_kmers = l - self.k + 1
                num_matches = 0
                match_counts = []
                for i in range(n_kmers):
                    kmer = seq[i:(i + self.k)]
                    kmer_count = max(match(kmer),
                                     match(reverse_complement(kmer)))
                    if kmer_count > 0:
                        num_matches += 1
                        match_counts.append(kmer_count)

                if num_matches > 0:
                    # not sure what the correct metric is to use here
                    overall_count = sum(match_counts) / float(n_kmers)
                    matches.append(
                        Match(seq, overall_count / float(tablesize), names,
                              float(num_matches) / n_kmers))

            # Add remaining tags
            for tag in set(candidates.keys()) - seen:
                matches.append(Match(tag, candidates[tag] / float(tablesize)))

        else:
            matches = [
                Match(tag, count / float(tablesize))
                for tag, count in candidates.items()
            ]

        return matches
Example #14
 def _get_contaminants(self):
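     """Use a khmer Countgraph to find over-represented k-mers, then match
     them against the known contaminants when any are provided.
     """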
     from khmer import Countgraph, khmer_args
     # assuming all sequences are same length
     n_win = self._read_length - self.kmer_size + 1
     tablesize = self.n_reads * n_win
     countgraph = Countgraph(
         self.kmer_size, tablesize, khmer_args.DEFAULT_N_TABLES)
     countgraph.set_use_bigcount(True)
     
     for seq in self._read_sequences:
         countgraph.consume_and_tag(seq)
     
     n_expected = math.ceil(tablesize / float(4**self.kmer_size))
     min_count = n_expected * self.overrep_cutoff
     if min_count >= 2**16:
         raise ValueError(
             "The minimum count for an over-represented k-kmer {} is "
             "greater than the max khmer count (2^16)".format(min_count))
 
     candidates = {}
     
     for tag in countgraph.get_tagset():
         count = countgraph.get(tag)
         if count >= min_count:
             candidates[tag] = count
     
     if self.known_contaminants:
         matches = []
         seen = set()
         
         def match(kmer):
             """Returns the frequency of `kmer` in `candidates`.
             """
             freq = candidates.get(kmer, 0)
             if freq > 0:
                 seen.add(kmer)
             return freq
         
         for seq, names in self.known_contaminants.iter_sequences():
             seqlen = len(seq)
             if seqlen < self.kmer_size:
                 print("Cannot check {}; sequence is shorter than {}".format(
                     list(names)[0], self.kmer_size))
                 continue
             
             n_kmers = seqlen - self.kmer_size + 1
             num_matches = 0
             match_counts = []
             for idx in range(n_kmers):
                 kmer = seq[idx:(idx + self.kmer_size)]
                 kmer_count = max(
                     match(kmer),
                     match(reverse_complement(kmer))
                 )
                 if kmer_count > 0:
                     num_matches += 1
                     match_counts.append(kmer_count)
             
             if num_matches > 0:
                 # not sure what the correct metric is to use here
                 overall_count = sum(match_counts) / float(n_kmers)
                 matches.append(Match(
                     seq, count=overall_count / float(tablesize), 
                     names=names, match_frac=float(num_matches) / n_kmers))
         
         # Add remaining tags
         for tag in set(candidates.keys()) - seen:
             matches.append(Match(
                 tag, count=candidates[tag] / float(tablesize)))
     
     else:
         matches = [
             Match(tag, count=count / float(tablesize))
             for tag, count in candidates.items()]
     
     return matches
Example #15
 def __call__(self, read1, read2):
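     """Attempt to merge overlapping paired-end reads. `read1` is aligned
     against the reverse complement of `read2`; if the alignment has at
     least `min_overlap` matching bases, the reads are merged into `read1`
     and `read2` is returned as None.
     """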
     len1 = len(read1.sequence)
     len2 = len(read2.sequence)
     min_overlap = self.min_overlap
     if min_overlap <= 1:
         min_overlap = max(2, round(self.min_overlap * min(len1, len2)))
     
     if len1 < min_overlap or len2 < min_overlap:
         return (read1, read2)
     
     insert_matched = read1.insert_overlap and read2.insert_overlap
     
     if insert_matched:
         # If we've already determined that there is an insert overlap
         # with a 3' overhang, we can constrain our alignment
         aflags = START_WITHIN_SEQ1 | STOP_WITHIN_SEQ2
     else:
         aflags = SEMIGLOBAL
     # align read1 to read2 reverse-complement to be compatible with
     # InsertAligner
     read2_rc = reverse_complement(read2.sequence)
     aligner = Aligner(read2_rc, self.error_rate, aflags)
     alignment = aligner.locate(read1.sequence)
     
     if alignment:
         r2_start, r2_stop, r1_start, r1_stop, matches, errors = alignment
         if matches >= min_overlap:
             # Only correct errors if we haven't already done correction in
             # the InsertAligner
             if (
                     self.mismatch_action and errors > 0 and
                     not insert_matched and
                     read1.corrected == 0 and read2.corrected == 0):
                 self.correct_errors(read1, read2, alignment)
             
             if r2_start == 0 and r2_stop == len2:
                 # r2 is fully contained in r1
                 pass
             elif r1_start == 0 and r1_stop == len1:
                 # r1 is fully contained in r2
                 read1.sequence = read2_rc
                 read1.qualities = "".join(reversed(read2.qualities))
             elif r1_start > 0:
                 read1.sequence += read2_rc[r2_stop:]
                 if read1.qualities and read2.qualities:
                     read1.qualities += "".join(
                         reversed(read2.qualities))[r2_stop:]
             elif r2_start > 0:
                 read1.sequence = read2_rc + read1.sequence[r1_stop:]
                 if read1.qualities and read2.qualities:
                     read1.qualities = (
                         "".join(reversed(read2.qualities)) +
                         read1.qualities[r1_stop:])
             else:
                 raise AtroposError(
                     "Invalid alignment while trying to merge read "
                     "{}: {}".format(
                         read1.name, ",".join(str(i) for i in alignment)))
             
             read1.merged = True
             read2 = None
             
     return (read1, read2)