Esempio n. 1
0
 def __init__(self,
              adapter1,
              adapter2,
              match_probability=RandomMatchProbability(),
              insert_max_rmp=1E-6,
              adapter_max_rmp=0.001,
              min_insert_overlap=1,
              max_insert_mismatch_frac=0.2,
              min_adapter_overlap=1,
              max_adapter_mismatch_frac=0.2,
              adapter_check_cutoff=9,
              base_probs=dict(p1=0.25, p2=0.75)):
     """Store the matching thresholds and build the insert aligner.

     Args:
         adapter1: Adapter sequence for the first read; its length is
             cached as ``adapter1_len``.
         adapter2: Adapter sequence for the second read; its length is
             cached as ``adapter2_len``.
         match_probability: Callable stored for later use. NOTE: the
             default instance is created once, when the function is
             defined, and is shared by every object that does not pass
             its own.
         insert_max_rmp: Stored threshold for insert matches
             (presumably a maximum random-match probability -- confirm
             against the code that reads it).
         adapter_max_rmp: Stored threshold for adapter matches
             (presumably a maximum random-match probability).
         min_insert_overlap: Minimum overlap handed to the aligner.
         max_insert_mismatch_frac: Mismatch fraction handed to the
             aligner; coerced to ``float``.
         min_adapter_overlap: Stored minimum adapter overlap.
         max_adapter_mismatch_frac: Stored adapter mismatch fraction;
             coerced to ``float``.
         adapter_check_cutoff: Stored adapter length cutoff.
         base_probs: Mapping stored as ``base_probs``. It is copied so
             that the default dict is never shared between instances.
     """
     self.adapter1 = adapter1
     self.adapter1_len = len(adapter1)
     self.adapter2 = adapter2
     self.adapter2_len = len(adapter2)
     self.match_probability = match_probability
     self.insert_max_rmp = insert_max_rmp
     self.adapter_max_rmp = adapter_max_rmp
     self.min_insert_overlap = min_insert_overlap
     self.max_insert_mismatch_frac = float(max_insert_mismatch_frac)
     self.min_adapter_overlap = min_adapter_overlap
     self.max_adapter_mismatch_frac = float(max_adapter_mismatch_frac)
     self.adapter_check_cutoff = adapter_check_cutoff
     # The default dict is created once at definition time; copying keeps
     # a mutation through one instance from leaking into all the others.
     self.base_probs = dict(base_probs)
     # Pass the float-coerced value so the aligner and the attribute
     # always agree, whatever numeric-like type the caller supplied.
     self.aligner = MultiAligner(
         self.max_insert_mismatch_frac,
         START_WITHIN_SEQ1 | STOP_WITHIN_SEQ2,
         min_insert_overlap)
Esempio n. 2
0
def test_multi_aligner_no_mismatches():
    """Check the two error-free hits reported for 'GATCA' in the reference.

    With ``max_error_rate=0`` and ``min_overlap=3`` the aligner is expected
    to return exactly two hits, each with zero in field 5.
    """
    from atropos._align import MultiAligner
    aligner = MultiAligner(max_error_rate=0, min_overlap=3)
    hits = aligner.locate('AGAGATCAGATGACAGATC', 'GATCA')
    assert len(hits) == 2

    # Put the hit with the larger value in field 4 first
    # (field 4 is presumably the number of matching bases -- confirm).
    hits.sort(key=lambda hit: hit[4], reverse=True)

    # One row per hit, one column per field index 0..5.
    expected_hits = (
        (3, 8, 0, 5, 5, 0),
        (15, 19, 0, 4, 4, 0),
    )
    for hit, expected in zip(hits, expected_hits):
        for field, value in enumerate(expected):
            assert hit[field] == value
Esempio n. 3
0
def test_multi_aligner_with_mismatches():
    """Check the two hits reported when a 10% error rate is allowed.

    With ``max_error_rate=0.1`` and ``min_overlap=10`` the aligner is
    expected to return two hits: one with 0 in field 5 and one with 1.
    """
    from atropos._align import MultiAligner
    aligner = MultiAligner(max_error_rate=0.1, min_overlap=10)
    hits = aligner.locate('GATATCAGATGACAGATCAGAGATCAGAT', 'GAGATCAGATGA')

    assert len(hits) == 2

    # Put the hit with the smaller value in field 5 first
    # (field 5 is presumably the number of mismatches -- confirm).
    hits.sort(key=lambda hit: hit[5])

    # One row per hit, one column per field index 0..5.
    expected_hits = (
        (19, 29, 0, 10, 10, 0),
        (0, 12, 0, 12, 11, 1),
    )
    for hit, expected in zip(hits, expected_hits):
        for field, value in enumerate(expected):
            assert hit[field] == value
Esempio n. 4
0
 def __init__(self, adapter1, adapter2, match_probability=RandomMatchProbability(),
              insert_max_rmp=1E-6, adapter_max_rmp=0.001,
              min_insert_overlap=1, max_insert_mismatch_frac=0.2,
              min_adapter_overlap=1, max_adapter_mismatch_frac=0.2,
              adapter_check_cutoff=9, base_probs=dict(p1=0.25, p2=0.75)):
     """Store the matching thresholds and build the insert aligner.

     Args:
         adapter1: Adapter sequence for the first read; its length is
             cached as ``adapter1_len``.
         adapter2: Adapter sequence for the second read; its length is
             cached as ``adapter2_len``.
         match_probability: Callable stored for later use. NOTE: the
             default instance is created once, when the function is
             defined, and is shared by every object that does not pass
             its own.
         insert_max_rmp: Stored threshold for insert matches
             (presumably a maximum random-match probability -- confirm
             against the code that reads it).
         adapter_max_rmp: Stored threshold for adapter matches
             (presumably a maximum random-match probability).
         min_insert_overlap: Minimum overlap handed to the aligner.
         max_insert_mismatch_frac: Mismatch fraction handed to the
             aligner; coerced to ``float``.
         min_adapter_overlap: Stored minimum adapter overlap.
         max_adapter_mismatch_frac: Stored adapter mismatch fraction;
             coerced to ``float``.
         adapter_check_cutoff: Stored adapter length cutoff.
         base_probs: Mapping stored as ``base_probs``. It is copied so
             that the default dict is never shared between instances.
     """
     self.adapter1 = adapter1
     self.adapter1_len = len(adapter1)
     self.adapter2 = adapter2
     self.adapter2_len = len(adapter2)
     self.match_probability = match_probability
     self.insert_max_rmp = insert_max_rmp
     self.adapter_max_rmp = adapter_max_rmp
     self.min_insert_overlap = min_insert_overlap
     self.max_insert_mismatch_frac = float(max_insert_mismatch_frac)
     self.min_adapter_overlap = min_adapter_overlap
     self.max_adapter_mismatch_frac = float(max_adapter_mismatch_frac)
     self.adapter_check_cutoff = adapter_check_cutoff
     # The default dict is created once at definition time; copying keeps
     # a mutation through one instance from leaking into all the others.
     self.base_probs = dict(base_probs)
     # Pass the float-coerced value so the aligner and the attribute
     # always agree, whatever numeric-like type the caller supplied.
     self.aligner = MultiAligner(
         self.max_insert_mismatch_frac,
         START_WITHIN_SEQ1 | STOP_WITHIN_SEQ2,
         min_insert_overlap)
Esempio n. 5
0
class InsertAligner(object):
    """
    Implementation of an insert matching algorithm.
    This only works with paired-end reads with 3' adapters.

    The first read is aligned against the reverse complement of the
    second read to find the overlapping insert; whatever follows the
    insert in each read is then compared against the expected adapters.
    """
    def __init__(self, adapter1, adapter2, match_probability=RandomMatchProbability(),
                 insert_max_rmp=1E-6, adapter_max_rmp=0.001,
                 min_insert_overlap=1, max_insert_mismatch_frac=0.2,
                 min_adapter_overlap=1, max_adapter_mismatch_frac=0.2,
                 adapter_check_cutoff=9, base_probs=dict(p1=0.25, p2=0.75)):
        """Store the matching thresholds and build the insert aligner.

        Args:
            adapter1: Adapter sequence expected after the insert in read 1.
            adapter2: Adapter sequence expected after the insert in read 2.
            match_probability: Callable invoked as
                ``match_probability(matches, size, **base_probs)`` to get
                a random-match probability. NOTE: the default instance is
                created once, at definition time, and is shared by every
                aligner that does not pass its own.
            insert_max_rmp: Insert matches whose random-match probability
                exceeds this value are discarded.
            adapter_max_rmp: Upper bound on the product of the two
                adapter random-match probabilities (only enforced above
                ``adapter_check_cutoff``).
            min_insert_overlap: Minimum overlap handed to the aligner.
            max_insert_mismatch_frac: Mismatch fraction handed to the
                aligner; coerced to ``float``.
            min_adapter_overlap: If the overhang past the insert is
                shorter than this, no adapter match is attempted.
            max_adapter_mismatch_frac: Fraction of the adapter length
                that may mismatch; coerced to ``float``.
            adapter_check_cutoff: Adapter length above which the
                ``adapter_max_rmp`` check is applied.
            base_probs: Keyword arguments forwarded to
                ``match_probability`` for insert matches. Copied so the
                default dict is never shared between instances.
        """
        self.adapter1 = adapter1
        self.adapter1_len = len(adapter1)
        self.adapter2 = adapter2
        self.adapter2_len = len(adapter2)
        self.match_probability = match_probability
        self.insert_max_rmp = insert_max_rmp
        self.adapter_max_rmp = adapter_max_rmp
        self.min_insert_overlap = min_insert_overlap
        self.max_insert_mismatch_frac = float(max_insert_mismatch_frac)
        self.min_adapter_overlap = min_adapter_overlap
        self.max_adapter_mismatch_frac = float(max_adapter_mismatch_frac)
        self.adapter_check_cutoff = adapter_check_cutoff
        # The default dict is created once at definition time; copying
        # keeps a mutation through one instance from leaking into others.
        self.base_probs = dict(base_probs)
        # Pass the float-coerced value so the aligner and the attribute
        # always agree, whatever numeric-like type the caller supplied.
        self.aligner = MultiAligner(
            self.max_insert_mismatch_frac,
            START_WITHIN_SEQ1 | STOP_WITHIN_SEQ2,
            min_insert_overlap)

    def match_insert(self, seq1, seq2):
        """Use cutadapt aligner for insert and adapter matching.

        Args:
            seq1: Sequence of the first read.
            seq2: Sequence of the second read.

        Returns:
            ``None`` when no acceptable insert/adapter match is found.
            Otherwise a 3-tuple ``(insert_match, adapter_match1,
            adapter_match2)``. The two adapter entries are ``None`` when
            the overhang is shorter than ``min_adapter_overlap``, i.e.
            the reads overlap but there is nothing to trim.
        """
        l1 = len(seq1)
        l2 = len(seq2)
        seq_len = min(l1, l2)
        # Trim the longer read so both reads have the same length. Each
        # read must be truncated from *itself*; the previous code assigned
        # ``seq1[:l1]`` to ``seq2``, silently replacing read 2 by read 1.
        if l1 > l2:
            seq1 = seq1[:l2]
        elif l2 > l1:
            seq2 = seq2[:l1]

        seq2_rc = reverse_complement(seq2)

        def _match(insert_match, offset, insert_match_size, prob):
            """Validate the adapters implied by one candidate insert match.

            ``prob`` is unused here; it is accepted so that a candidate
            tuple can be unpacked directly into the call.
            """
            if offset < self.min_adapter_overlap:
                # The reads are mostly overlapping, to the point where
                # there's not enough overhang to do a confident adapter
                # match. We return just the insert match to signal that
                # error correction can be done even though no adapter
                # trimming is required.
                return (insert_match, None, None)

            # TODO: this is very sensitive to the exact correct choice of adapter.
            # For example, if you specify GATCGGAA... and the correct adapter is
            # AGATCGGAA..., the prefixes will not match exactly and the alignment
            # will fail. We need to use a comparison that is a bit more forgiving.

            a1_match = compare_prefixes(seq1[insert_match_size:], self.adapter1)
            a2_match = compare_prefixes(seq2[insert_match_size:], self.adapter2)
            adapter_len = min(offset, self.adapter1_len, self.adapter2_len)
            max_adapter_mismatches = round(adapter_len * self.max_adapter_mismatch_frac)
            # Reject only when *both* adapters exceed the mismatch budget
            # (index 5 of a match holds its mismatch count).
            if a1_match[5] > max_adapter_mismatches and a2_match[5] > max_adapter_mismatches:
                return None

            # NOTE(review): unlike the insert check, base_probs is not
            # forwarded here -- confirm this is intentional.
            a1_prob = self.match_probability(a1_match[4], adapter_len)
            a2_prob = self.match_probability(a2_match[4], adapter_len)
            if (adapter_len > self.adapter_check_cutoff) and ((a1_prob * a2_prob) > self.adapter_max_rmp):
                return None

            # Lengths are based on the original (untrimmed) read lengths.
            adapter_len1 = min(self.adapter1_len, l1 - insert_match_size)
            adapter_len2 = min(self.adapter2_len, l2 - insert_match_size)
            # Report the match/mismatch counts of the adapter with the
            # lower random-match probability (adapter 2 wins ties).
            best_adapter_matches, best_adapter_mismatches = (a1_match if a1_prob < a2_prob else a2_match)[4:6]

            return (
                insert_match,
                Match(0, adapter_len1, insert_match_size, l1, best_adapter_matches, best_adapter_mismatches),
                Match(0, adapter_len2, insert_match_size, l2, best_adapter_matches, best_adapter_mismatches)
            )

        # Use an aligner that returns all matches that satisfy the
        # overlap and error rate thresholds, then check each in turn
        # until we find one with an adapter match (if any).
        insert_matches = self.aligner.locate(seq2_rc, seq1)
        if not insert_matches:
            return None

        # Filter by random-match probability
        filtered_matches = []
        for insert_match in insert_matches:
            offset = min(insert_match[0], seq_len - insert_match[3])
            insert_match_size = seq_len - offset
            prob = self.match_probability(insert_match[4], insert_match_size, **self.base_probs)
            if prob <= self.insert_max_rmp:
                filtered_matches.append((insert_match, offset, insert_match_size, prob))

        # Test matches in order of random-match probability (lowest first).
        # TODO: compare against sorting by length (which is how
        # SeqPurge essentially does it), i.e.
        # filtered_matches.sort(key=lambda x: x[2], reverse=True)
        filtered_matches.sort(key=lambda x: x[3])
        for m in filtered_matches:
            match = _match(*m)
            if match:
                return match

        return None
Esempio n. 6
0
class InsertAligner(object):
    """
    Implementation of an insert matching algorithm.
    This only works with paired-end reads with 3' adapters.

    The first read is aligned against the reverse complement of the
    second read to find the overlapping insert; whatever follows the
    insert in each read is then compared against the expected adapters.
    """
    def __init__(self,
                 adapter1,
                 adapter2,
                 match_probability=RandomMatchProbability(),
                 insert_max_rmp=1E-6,
                 adapter_max_rmp=0.001,
                 min_insert_overlap=1,
                 max_insert_mismatch_frac=0.2,
                 min_adapter_overlap=1,
                 max_adapter_mismatch_frac=0.2,
                 adapter_check_cutoff=9,
                 base_probs=dict(p1=0.25, p2=0.75)):
        """Store the matching thresholds and build the insert aligner.

        Args:
            adapter1: Adapter sequence expected after the insert in read 1.
            adapter2: Adapter sequence expected after the insert in read 2.
            match_probability: Callable invoked as
                ``match_probability(matches, size, **base_probs)`` to get
                a random-match probability. NOTE: the default instance is
                created once, at definition time, and is shared by every
                aligner that does not pass its own.
            insert_max_rmp: Insert matches whose random-match probability
                exceeds this value are discarded.
            adapter_max_rmp: Upper bound on the product of the two
                adapter random-match probabilities (only enforced above
                ``adapter_check_cutoff``).
            min_insert_overlap: Minimum overlap handed to the aligner.
            max_insert_mismatch_frac: Mismatch fraction handed to the
                aligner; coerced to ``float``.
            min_adapter_overlap: If the overhang past the insert is
                shorter than this, no adapter match is attempted.
            max_adapter_mismatch_frac: Fraction of the adapter length
                that may mismatch; coerced to ``float``.
            adapter_check_cutoff: Adapter length above which the
                ``adapter_max_rmp`` check is applied.
            base_probs: Keyword arguments forwarded to
                ``match_probability`` for insert matches. Copied so the
                default dict is never shared between instances.
        """
        self.adapter1 = adapter1
        self.adapter1_len = len(adapter1)
        self.adapter2 = adapter2
        self.adapter2_len = len(adapter2)
        self.match_probability = match_probability
        self.insert_max_rmp = insert_max_rmp
        self.adapter_max_rmp = adapter_max_rmp
        self.min_insert_overlap = min_insert_overlap
        self.max_insert_mismatch_frac = float(max_insert_mismatch_frac)
        self.min_adapter_overlap = min_adapter_overlap
        self.max_adapter_mismatch_frac = float(max_adapter_mismatch_frac)
        self.adapter_check_cutoff = adapter_check_cutoff
        # The default dict is created once at definition time; copying
        # keeps a mutation through one instance from leaking into others.
        self.base_probs = dict(base_probs)
        # Pass the float-coerced value so the aligner and the attribute
        # always agree, whatever numeric-like type the caller supplied.
        self.aligner = MultiAligner(self.max_insert_mismatch_frac,
                                    START_WITHIN_SEQ1 | STOP_WITHIN_SEQ2,
                                    min_insert_overlap)

    def match_insert(self, seq1, seq2):
        """Use cutadapt aligner for insert and adapter matching.

        Args:
            seq1: Sequence of the first read.
            seq2: Sequence of the second read.

        Returns:
            ``None`` when no acceptable insert/adapter match is found.
            Otherwise a 3-tuple ``(insert_match, adapter_match1,
            adapter_match2)``. The two adapter entries are ``None`` when
            the overhang is shorter than ``min_adapter_overlap``, i.e.
            the reads overlap but there is nothing to trim.
        """
        l1 = len(seq1)
        l2 = len(seq2)
        seq_len = min(l1, l2)
        # Trim the longer read so both reads have the same length. Each
        # read must be truncated from *itself*; the previous code assigned
        # ``seq1[:l1]`` to ``seq2``, silently replacing read 2 by read 1.
        if l1 > l2:
            seq1 = seq1[:l2]
        elif l2 > l1:
            seq2 = seq2[:l1]

        seq2_rc = reverse_complement(seq2)

        def _match(insert_match, offset, insert_match_size, prob):
            """Validate the adapters implied by one candidate insert match.

            ``prob`` is unused here; it is accepted so that a candidate
            tuple can be unpacked directly into the call.
            """
            if offset < self.min_adapter_overlap:
                # The reads are mostly overlapping, to the point where
                # there's not enough overhang to do a confident adapter
                # match. We return just the insert match to signal that
                # error correction can be done even though no adapter
                # trimming is required.
                return (insert_match, None, None)

            # TODO: this is very sensitive to the exact correct choice of adapter.
            # For example, if you specify GATCGGAA... and the correct adapter is
            # AGATCGGAA..., the prefixes will not match exactly and the alignment
            # will fail. We need to use a comparison that is a bit more forgiving.

            a1_match = compare_prefixes(seq1[insert_match_size:],
                                        self.adapter1)
            a2_match = compare_prefixes(seq2[insert_match_size:],
                                        self.adapter2)
            adapter_len = min(offset, self.adapter1_len, self.adapter2_len)
            max_adapter_mismatches = round(adapter_len *
                                           self.max_adapter_mismatch_frac)
            # Reject only when *both* adapters exceed the mismatch budget
            # (index 5 of a match holds its mismatch count).
            if (a1_match[5] > max_adapter_mismatches
                    and a2_match[5] > max_adapter_mismatches):
                return None

            # NOTE(review): unlike the insert check, base_probs is not
            # forwarded here -- confirm this is intentional.
            a1_prob = self.match_probability(a1_match[4], adapter_len)
            a2_prob = self.match_probability(a2_match[4], adapter_len)
            if (adapter_len > self.adapter_check_cutoff) and (
                    (a1_prob * a2_prob) > self.adapter_max_rmp):
                return None

            # Lengths are based on the original (untrimmed) read lengths.
            adapter_len1 = min(self.adapter1_len, l1 - insert_match_size)
            adapter_len2 = min(self.adapter2_len, l2 - insert_match_size)
            # Report the match/mismatch counts of the adapter with the
            # lower random-match probability (adapter 2 wins ties).
            best_adapter_matches, best_adapter_mismatches = (
                a1_match if a1_prob < a2_prob else a2_match)[4:6]

            return (insert_match,
                    Match(0, adapter_len1, insert_match_size, l1,
                          best_adapter_matches, best_adapter_mismatches),
                    Match(0, adapter_len2, insert_match_size, l2,
                          best_adapter_matches, best_adapter_mismatches))

        # Use an aligner that returns all matches that satisfy the
        # overlap and error rate thresholds, then check each in turn
        # until we find one with an adapter match (if any).
        insert_matches = self.aligner.locate(seq2_rc, seq1)
        if not insert_matches:
            return None

        # Filter by random-match probability
        filtered_matches = []
        for insert_match in insert_matches:
            offset = min(insert_match[0], seq_len - insert_match[3])
            insert_match_size = seq_len - offset
            prob = self.match_probability(insert_match[4],
                                          insert_match_size,
                                          **self.base_probs)
            if prob <= self.insert_max_rmp:
                filtered_matches.append(
                    (insert_match, offset, insert_match_size, prob))

        # Test matches in order of random-match probability (lowest first).
        # TODO: compare against sorting by length (which is how
        # SeqPurge essentially does it), i.e.
        # filtered_matches.sort(key=lambda x: x[2], reverse=True)
        filtered_matches.sort(key=lambda x: x[3])
        for m in filtered_matches:
            match = _match(*m)
            if match:
                return match

        return None