Ejemplo n.º 1
0
def test_match_probability():
    """Check ``match_probability(3, 5)`` against a hand-expanded sum.

    The expected value adds the binomial terms for exactly 3, 4 and 5
    successes out of 5 trials with success probability 0.25.
    """
    aligner = InsertAligner('TTAGACATAT', 'CAGTGGAGTA')
    min_matches, size = 3, 5
    # 120 = 5!, 6 = 3!, 2 = 2!, 24 = 4!
    expected = (
        (120 / (6 * 2)) * (0.25 ** 3) * (0.75 ** 2)
        + (120 / 24) * (0.25 ** 4) * 0.75
        + 0.25 ** 5
    )
    observed = aligner.match_probability(min_matches, size)
    assert approx_equal(observed, expected, 0.0001)
Ejemplo n.º 2
0
def test_match_probability():
    """Verify ``match_probability`` for k=3, n=5 within a 0.0001 tolerance.

    The reference value is built term by term: the binomial probabilities
    of exactly three, four and five successes in five trials at p = 0.25.
    """
    exactly_three = (120 / (6 * 2)) * (0.25 ** 3) * (0.75 ** 2)  # 5!/(3!*2!)
    exactly_four = (120 / 24) * (0.25 ** 4) * 0.75  # 5!/4!
    all_five = 0.25 ** 5
    reference = exactly_three + exactly_four + all_five

    aligner = InsertAligner('TTAGACATAT', 'CAGTGGAGTA')
    assert approx_equal(aligner.match_probability(3, 5), reference, 0.0001)
Ejemplo n.º 3
0
def test_short_adapter_overlap():
    """Check adapter matches that cover only the last two read bases.

    Both reads are 30 bases long; each adapter match is expected to start
    at read offset 28 and to have length 2.
    """
    aligner = InsertAligner('TTAGACATAT', 'CAGTGGAGTA')
    read1 = 'GACAGGCCGTTTGAATGTTGACGGGATGTT'
    read2 = 'CATCCCGTCAACATTCAAACGGCCTGTCCA'
    _insert, adapter_match1, adapter_match2 = aligner.match_insert(read1, read2)
    for adapter_match in (adapter_match1, adapter_match2):
        assert adapter_match.rstart == 28
        assert adapter_match.length == 2
Ejemplo n.º 4
0
def test_insert_align():
    """Check adapter detection on reads built as insert + adapter prefix.

    Each read is a 20-base sequence followed by the first 10 bases of its
    adapter, so both adapter matches are expected at offset 20 with
    length 10.
    """
    adapter1, adapter2 = 'TTAGACATATGG', 'CAGTGGAGTATA'
    aligner = InsertAligner(adapter1, adapter2)
    read1 = 'AGTCGAGCCCATTGCAGACT' + adapter1[:10]
    read2 = 'AGTCTGCAATGGGCTCGACT' + adapter2[:10]
    _insert, adapter_match1, adapter_match2 = aligner.match_insert(read1, read2)
    for adapter_match in (adapter_match1, adapter_match2):
        assert adapter_match.rstart == 20
        assert adapter_match.length == 10
Ejemplo n.º 5
0
def test_short_adapter_overlap():
    """Expect two-base adapter matches at the very end of both reads."""
    adapters = ('TTAGACATAT', 'CAGTGGAGTA')
    reads = (
        'GACAGGCCGTTTGAATGTTGACGGGATGTT',
        'CATCCCGTCAACATTCAAACGGCCTGTCCA',
    )
    aligner = InsertAligner(adapters[0], adapters[1])
    result = aligner.match_insert(reads[0], reads[1])
    # The first element (the insert match itself) is not inspected here.
    _insert, first, second = result
    assert first.rstart == 28
    assert first.length == 2
    assert second.rstart == 28
    assert second.length == 2
Ejemplo n.º 6
0
def test_insert_align():
    """Expect 10-base adapter matches starting right after 20-base inserts."""
    prefix_len = 10
    adapter1 = 'TTAGACATATGG'
    adapter2 = 'CAGTGGAGTATA'
    # Only the first ten adapter bases are appended to each read.
    read1 = 'AGTCGAGCCCATTGCAGACT' + adapter1[0:prefix_len]
    read2 = 'AGTCTGCAATGGGCTCGACT' + adapter2[0:prefix_len]

    result = InsertAligner(adapter1, adapter2).match_insert(read1, read2)
    _insert, first, second = result
    assert first.rstart == 20
    assert first.length == 10
    assert second.rstart == 20
    assert second.length == 10
Ejemplo n.º 7
0
 def __init__(
         self, adapter1, adapter2, action='trim', mismatch_action=None,
         symmetric=True, min_insert_overlap=1, **aligner_args):
     """Store the configuration and build the insert aligner.

     Args:
         adapter1, adapter2: Adapter objects; their ``sequence`` attributes
             are handed to :class:`InsertAligner`.
         action: Stored as ``self.action``.
         mismatch_action: Forwarded to ``ErrorCorrectorMixin.__init__``.
         symmetric: Stored as ``self.symmetric``.
         min_insert_overlap: Passed to :class:`InsertAligner` and also kept
             as ``self.min_insert_len``.
         aligner_args: Extra keyword arguments for :class:`InsertAligner`.
     """
     ErrorCorrectorMixin.__init__(self, mismatch_action)
     self.adapter1, self.adapter2 = adapter1, adapter2
     self.action = action
     self.symmetric = symmetric
     self.min_insert_len = min_insert_overlap
     # Two-slot list, one entry per read of the pair, starting at zero.
     self.with_adapters = [0, 0]
     self.aligner = InsertAligner(
         adapter1.sequence,
         adapter2.sequence,
         min_insert_overlap=min_insert_overlap,
         **aligner_args)
Ejemplo n.º 8
0
 def __init__(self, adapter1, adapter2, action='trim', mismatch_action=None,
              symmetric=True, min_insert_overlap=1, **aligner_args):
     """Initialise the mixin, remember the settings and create the aligner.

     ``mismatch_action`` goes to ``ErrorCorrectorMixin.__init__``; the
     adapters' ``sequence`` attributes, ``min_insert_overlap`` and any
     extra ``aligner_args`` go to :class:`InsertAligner`.
     """
     ErrorCorrectorMixin.__init__(self, mismatch_action)

     # Plain configuration.
     self.adapter1 = adapter1
     self.adapter2 = adapter2
     self.action = action
     self.symmetric = symmetric
     self.min_insert_len = min_insert_overlap

     # Aligner built from the two adapter sequences.
     sequences = (adapter1.sequence, adapter2.sequence)
     self.aligner = InsertAligner(
         *sequences, min_insert_overlap=min_insert_overlap, **aligner_args)

     # One zero-initialised slot per read of the pair.
     self.with_adapters = [0, 0]
Ejemplo n.º 9
0
class InsertAdapterCutter(ReadPairModifier, ErrorCorrectorMixin):
    """Paired-end adapter cutter that looks for insert overlap first.

    :class:`InsertAligner` is tried on each read pair; only when it finds
    no insert match are the two adapters matched against their reads
    individually (semi-global adapter alignment).

    Args:
        adapter1, adapter2: Adapters.
        action: Action to take on adapter match: trim, mask (replace adapter
            with N's), lower (convert adapter bases to lower case),
            or None.
        mismatch_action: How to deal with mismatches. See
            :class:`ErrorCorrectorMixin`.
        symmetric: Whether to assume that the adapter should appear in the
            same place on overlapping reads.
        min_insert_overlap: Minimum overlap required between reads to be
            considered an insert match.
        aligner_args: Additional arguments to :class:`InsertAligner`.
    """
    def __init__(self,
                 adapter1,
                 adapter2,
                 action='trim',
                 mismatch_action=None,
                 symmetric=True,
                 min_insert_overlap=1,
                 **aligner_args):
        ErrorCorrectorMixin.__init__(self, mismatch_action)
        self.adapter1, self.adapter2 = adapter1, adapter2
        self.aligner = InsertAligner(
            adapter1.sequence, adapter2.sequence,
            min_insert_overlap=min_insert_overlap, **aligner_args)
        self.min_insert_len = min_insert_overlap
        self.action = action
        self.symmetric = symmetric
        # Per-read counters (index 0 = read1, index 1 = read2) bumped by trim().
        self.with_adapters = [0, 0]

    @staticmethod
    def _mirror_match(match, read_len):
        """Copy an adapter match so it can be applied to the mate read.

        Returns None when the match starts beyond ``read_len``.
        """
        if match.rstart > read_len:
            return None
        mirrored = match.copy()
        # When the copied match stops before the end of this read, stretch
        # it to the read end. The 'matches' and 'errors' attributes will be
        # wrong, but it shouldn't matter.
        # NOTE(review): the original comment describes the "this read is
        # shorter" case, yet the condition fires when rstop < read_len --
        # confirm which was intended.
        gap = read_len - mirrored.rstop
        if gap > 0:
            mirrored.astop -= gap
            mirrored.rstop = read_len
        return mirrored

    def __call__(self, read1, read2):
        """Detect and trim adapters on one read pair.

        Returns:
            ``(read1, read2)`` untouched when either read is shorter than
            the minimum insert overlap; otherwise the pair produced by
            :meth:`trim`.
        """
        read_lengths = [len(read1), len(read2)]
        if min(read_lengths) < self.min_insert_len:
            return (read1, read2)

        def assumed_insert_match(rstart):
            # Insert-match tuple inferred from an adapter start position.
            return (read_lengths[1] - rstart, read_lengths[1], 0, rstart)

        match = self.aligner.match_insert(read1.sequence, read2.sequence)
        has_insert = match is not None
        read1.insert_overlap = has_insert
        read2.insert_overlap = has_insert

        insert_match = None
        should_correct = False

        if match:
            insert_match, adapter_match1, adapter_match2 = match
            should_correct = self.mismatch_action and insert_match[5] > 0
        else:
            adapter_match1 = self.adapter1.match_to(read1)
            adapter_match2 = self.adapter2.match_to(read2)
            # Adapter matches starting at the same offset are treated as
            # complementary, which enables error correction.
            if self.mismatch_action and adapter_match1 and adapter_match2:
                if adapter_match1.rstart == adapter_match2.rstart:
                    insert_match = assumed_insert_match(adapter_match1.rstart)
                    should_correct = True

        # With symmetric=True and exactly one adapter found, reuse the good
        # alignment for the other read.
        if self.symmetric and (
                bool(adapter_match1) != bool(adapter_match2)):
            if adapter_match1:
                adapter_match2 = self._mirror_match(
                    adapter_match1, read_lengths[1])
            else:
                adapter_match1 = self._mirror_match(
                    adapter_match2, read_lengths[0])

            if (self.mismatch_action and not insert_match
                    and adapter_match1 and adapter_match2):
                # Assume that the symmetric read segments overlap and
                # perform error correction
                insert_match = assumed_insert_match(adapter_match1.rstart)
                should_correct = True

        if should_correct:
            self.correct_errors(read1, read2, insert_match)

        trimmed1 = self.trim(read1, self.adapter1, adapter_match1, 0)
        trimmed2 = self.trim(read2, self.adapter2, adapter_match2, 1)
        return (trimmed1, trimmed2)

    def trim(self, read, adapter, match, read_idx):
        """Apply the configured action for an adapter match to a read.

        Args:
            read: The read to trim from.
            adapter: The Adapter to trim.
            match: The match details; a falsy value means "no adapter".
            read_idx: 0/1, selects the ``with_adapters`` counter to bump.

        Returns:
            ``read`` itself when there is no match, when the action is None
            or when the match starts at or past the read end; otherwise the
            read returned by ``adapter.trimmed``.
        """
        if not match:
            read.match = None
            read.match_info = None
            return read

        match.adapter = adapter
        match.read = read
        match.front = False

        trimmed_read = read
        if self.action is not None and match.rstart < len(read):
            trimmed_read = adapter.trimmed(match)

            if self.action == 'mask':
                # Append one 'N' per removed base, then restore the
                # original qualities.
                padding = 'N' * (len(read) - len(trimmed_read))
                trimmed_read.sequence = trimmed_read.sequence + padding
                trimmed_read.qualities = read.qualities
            elif self.action == 'lower':
                # TODO: offer option to mask with lower-case of trimmed base
                # This will happen as part of the refactoring to modify
                # Sequences in-place.
                pass

        # The match is recorded and counted even if nothing was cut.
        trimmed_read.match = match
        trimmed_read.match_info = [match.get_info_record()]
        self.with_adapters[read_idx] += 1
        return trimmed_read

    def summarize(self):
        """Returns a summary dict.
        """
        adapters_summary = (
            {self.adapter1.name: self.adapter1.summarize()},
            {self.adapter2.name: self.adapter2.summarize()},
        )
        summary = {
            'records_with_adapters': self.with_adapters,
            'adapters': adapters_summary,
        }
        if self.mismatch_action:
            summary.update(ErrorCorrectorMixin.summarize(self))
        return summary
Ejemplo n.º 10
0
class InsertAdapterCutter(ReadPairModifier, ErrorCorrectorMixin):
    """
    AdapterCutter that uses InsertAligner to first try to identify
    insert overlap before falling back to semi-global adapter alignment.

    ``action`` controls what :meth:`trim` does on a match: None leaves the
    read as is, 'mask' pads the trimmed read with N's, and any other value
    (including 'lower', which is not implemented yet) returns the trimmed
    read.
    """
    def __init__(self,
                 adapter1,
                 adapter2,
                 action='trim',
                 mismatch_action=None,
                 symmetric=True,
                 min_insert_overlap=1,
                 **aligner_args):
        ErrorCorrectorMixin.__init__(self, mismatch_action)
        self.adapter1 = adapter1
        self.adapter2 = adapter2
        sequences = (adapter1.sequence, adapter2.sequence)
        self.aligner = InsertAligner(
            *sequences, min_insert_overlap=min_insert_overlap, **aligner_args)
        self.min_insert_len = min_insert_overlap
        self.action = action
        self.symmetric = symmetric
        # Counters of reads with an adapter: [read1, read2].
        self.with_adapters = [0, 0]

    def __call__(self, read1, read2):
        """Process one read pair and return the (possibly trimmed) pair."""
        len1, len2 = len(read1), len(read2)
        if len1 < self.min_insert_len or len2 < self.min_insert_len:
            # Too short to attempt an insert match: leave the pair alone.
            return (read1, read2)

        def assumed_insert_match(rstart):
            # Insert-match tuple derived from an adapter start position.
            return (len2 - rstart, len2, 0, rstart)

        match = self.aligner.match_insert(read1.sequence, read2.sequence)
        overlap_found = match is not None
        read1.insert_overlap = overlap_found
        read2.insert_overlap = overlap_found

        insert_match = None
        do_correction = False

        if match:
            insert_match, adapter_match1, adapter_match2 = match
            do_correction = self.mismatch_action and insert_match[5] > 0
        else:
            adapter_match1 = self.adapter1.match_to(read1)
            adapter_match2 = self.adapter2.match_to(read2)
            # If the adapter matches are complementary, perform error
            # correction
            if self.mismatch_action and adapter_match1 and adapter_match2:
                if adapter_match1.rstart == adapter_match2.rstart:
                    insert_match = assumed_insert_match(adapter_match1.rstart)
                    do_correction = True

        # If exactly one of the two alignments failed and symmetric is True,
        # duplicate the good alignment
        if self.symmetric and (
                bool(adapter_match1) != bool(adapter_match2)):
            if adapter_match1:
                adapter_match2 = adapter_match1.copy()
            else:
                adapter_match1 = adapter_match2.copy()
            if self.mismatch_action and not insert_match:
                # Assume that the symmetric read segments overlap and
                # perform error correction
                insert_match = assumed_insert_match(adapter_match1.rstart)
                do_correction = True

        if do_correction:
            self.correct_errors(read1, read2, insert_match)

        first = self.trim(read1, self.adapter1, adapter_match1, 0)
        second = self.trim(read2, self.adapter2, adapter_match2, 1)
        return (first, second)

    def trim(self, read, adapter, match, read_idx):
        """Apply ``self.action`` to ``read`` for the given adapter match.

        Without a match the read is returned with ``match`` and
        ``match_info`` set to None. With a match, the match is attached to
        the returned read and ``with_adapters[read_idx]`` is incremented,
        whether or not any bases were removed.
        """
        if not match:
            read.match = None
            read.match_info = None
            return read

        match.adapter = adapter
        match.read = read
        match.front = False

        leave_untouched = self.action is None or match.rstart >= len(read)
        if leave_untouched:
            result = read
        else:
            result = adapter.trimmed(match)

            if self.action == 'mask':
                # Pad with one N per removed base and keep the original
                # qualities.
                n_removed = len(read) - len(result)
                result.sequence = result.sequence + 'N' * n_removed
                result.qualities = read.qualities
            elif self.action == 'lower':
                # TODO: offer option to mask with lower-case of trimmed base
                # This will happen as part of the refactoring to modify
                # Sequences in-place.
                pass

        result.match = match
        result.match_info = [match.get_info_record()]

        self.with_adapters[read_idx] += 1
        return result
Ejemplo n.º 11
0
class InsertAdapterCutter(ReadPairModifier, ErrorCorrectorMixin):
    """
    AdapterCutter that uses InsertAligner to first try to identify
    insert overlap before falling back to semi-global adapter alignment.

    Args:
        adapter1, adapter2: Adapter objects for read 1 and read 2; their
            ``sequence`` attributes are passed to :class:`InsertAligner`.
        action: What :meth:`trim` does on a match. None leaves the read
            untouched, 'mask' replaces the removed bases with N's, and any
            other value returns the trimmed read ('lower' is not
            implemented yet and currently behaves like trimming).
        mismatch_action: Forwarded to :class:`ErrorCorrectorMixin`; when
            truthy, error correction is attempted.
        symmetric: If true and only one adapter is found, the match is
            copied to the other read.
        min_insert_overlap: Passed to :class:`InsertAligner`; pairs with a
            read shorter than this are returned unmodified.
        aligner_args: Additional keyword arguments for
            :class:`InsertAligner`.
    """
    def __init__(self, adapter1, adapter2, action='trim', mismatch_action=None,
                 symmetric=True, min_insert_overlap=1, **aligner_args):
        ErrorCorrectorMixin.__init__(self, mismatch_action)
        self.adapter1 = adapter1
        self.adapter2 = adapter2
        self.aligner = InsertAligner(
            adapter1.sequence, adapter2.sequence,
            min_insert_overlap=min_insert_overlap, **aligner_args)
        self.min_insert_len = min_insert_overlap
        self.action = action
        self.symmetric = symmetric
        # Number of reads in which an adapter was found: [read1, read2].
        self.with_adapters = [0, 0]

    def __call__(self, read1, read2):
        """Find and trim adapters on one read pair.

        Returns:
            A ``(read1, read2)`` tuple: the unmodified inputs when either
            read is shorter than ``min_insert_overlap``, otherwise the
            results of :meth:`trim` for each read.
        """
        read_lengths = [len(r) for r in (read1, read2)]
        if any(l < self.min_insert_len for l in read_lengths):
            return (read1, read2)

        match = self.aligner.match_insert(read1.sequence, read2.sequence)
        read1.insert_overlap = read2.insert_overlap = (match is not None)
        insert_match = None
        correct_errors = False

        if match:
            insert_match, adapter_match1, adapter_match2 = match
            correct_errors = self.mismatch_action and insert_match[5] > 0
        else:
            adapter_match1 = self.adapter1.match_to(read1)
            adapter_match2 = self.adapter2.match_to(read2)
            # If the adapter matches are complementary, perform error correction
            if (self.mismatch_action and adapter_match1 and adapter_match2 and
                    adapter_match1.rstart == adapter_match2.rstart):
                insert_match = (
                    read_lengths[1] - adapter_match1.rstart,
                    read_lengths[1], 0, adapter_match1.rstart)
                correct_errors = True

        # If exactly one of the two alignments failed and symmetric is True,
        # duplicate the good alignment
        if self.symmetric and sum(
                bool(m) for m in (adapter_match1, adapter_match2)) == 1:
            if adapter_match1:
                adapter_match2 = adapter_match1.copy()
            else:
                adapter_match1 = adapter_match2.copy()
            if self.mismatch_action and not insert_match:
                # Assume that the symmetric read segments overlap and
                # perform error correction
                insert_match = (
                    read_lengths[1] - adapter_match1.rstart,
                    read_lengths[1], 0, adapter_match1.rstart)
                correct_errors = True

        if correct_errors:
            self.correct_errors(read1, read2, insert_match)

        return (
            self.trim(read1, self.adapter1, adapter_match1, 0),
            self.trim(read2, self.adapter2, adapter_match2, 1)
        )

    def trim(self, read, adapter, match, read_idx):
        """Apply the configured action for an adapter match to a read.

        Args:
            read: The read to trim from.
            adapter: The Adapter to trim.
            match: The match details; a falsy value means no adapter was
                found.
            read_idx: 0 or 1, index of the ``with_adapters`` counter to
                increment.

        Returns:
            ``read`` itself (with ``match``/``match_info`` set to None) when
            there is no match; ``read`` with the match attached when the
            action is None or the match starts at or beyond the read end;
            otherwise the read returned by ``adapter.trimmed``.
        """
        if not match:
            read.match = None
            read.match_info = None
            return read

        match.adapter = adapter
        match.read = read
        match.front = False

        if self.action is None or match.rstart >= len(read):
            trimmed_read = read

        else:
            trimmed_read = adapter.trimmed(match)

            if self.action == 'mask':
                # Add one N per base removed by the trimming. The
                # parentheses are required: 'N' * len(read) - len(...)
                # would subtract an int from a str and raise TypeError.
                masked_sequence = trimmed_read.sequence
                masked_sequence += 'N' * (len(read) - len(trimmed_read))
                # set masked sequence as sequence with original quality
                trimmed_read.sequence = masked_sequence
                trimmed_read.qualities = read.qualities
            elif self.action == 'lower':
                # TODO: offer option to mask with lower-case of trimmed base
                # This will happen as part of the refactoring to modify
                # Sequences in-place.
                pass

        trimmed_read.match = match
        trimmed_read.match_info = [match.get_info_record()]

        self.with_adapters[read_idx] += 1
        return trimmed_read
Ejemplo n.º 12
0
class InsertAdapterCutter(ReadPairModifier, ErrorCorrectorMixin):
    """Adapter cutter for read pairs that tries an insert match first.

    The pair is handed to :class:`InsertAligner`; if no insert overlap is
    identified, each adapter is aligned to its own read instead.

    Args:
        adapter1, adapter2: Adapters.
        action: Action to take on adapter match: trim, mask (replace adapter
            with N's), lower (convert adapter bases to lower case),
            or None.
        mismatch_action: How to deal with mismatches. See
            :class:`ErrorCorrectorMixin`.
        symmetric: Whether to assume that the adapter should appear in the
            same place on overlapping reads.
        min_insert_overlap: Minimum overlap required between reads to be
            considered an insert match.
        aligner_args: Additional arguments to :class:`InsertAligner`.
    """
    def __init__(
            self, adapter1, adapter2, action='trim', mismatch_action=None,
            symmetric=True, min_insert_overlap=1, **aligner_args):
        ErrorCorrectorMixin.__init__(self, mismatch_action)
        self.adapter1, self.adapter2 = adapter1, adapter2
        self.aligner = InsertAligner(
            adapter1.sequence,
            adapter2.sequence,
            min_insert_overlap=min_insert_overlap,
            **aligner_args)
        self.min_insert_len = min_insert_overlap
        self.action = action
        self.symmetric = symmetric
        # [read1 count, read2 count]; incremented in trim().
        self.with_adapters = [0, 0]

    def __call__(self, read1, read2):
        """Locate the adapters in a read pair and trim both reads.

        Pairs in which either read is shorter than the minimum insert
        overlap are returned as they came in.
        """
        len1, len2 = len(read1), len(read2)
        if len1 < self.min_insert_len or len2 < self.min_insert_len:
            return (read1, read2)

        def insert_from_adapter_start(rstart):
            # Insert-match tuple implied by an adapter starting at rstart.
            return (len2 - rstart, len2, 0, rstart)

        def mirrored(source, target_len):
            # Copy of `source` for use on the mate read of length
            # `target_len`, or None if the match starts past that read.
            if source.rstart > target_len:
                return None
            duplicate = source.copy()
            # If the copied match ends before the read does, move its end
            # to the read length. The 'matches' and 'errors' attributes
            # will be wrong, but it shouldn't matter.
            if duplicate.rstop < target_len:
                duplicate.astop -= (target_len - duplicate.rstop)
                duplicate.rstop = target_len
            return duplicate

        match = self.aligner.match_insert(read1.sequence, read2.sequence)
        overlap_found = match is not None
        read1.insert_overlap = overlap_found
        read2.insert_overlap = overlap_found

        insert_match = None
        needs_correction = False

        if match:
            insert_match, adapter_match1, adapter_match2 = match
            needs_correction = self.mismatch_action and insert_match[5] > 0
        else:
            adapter_match1 = self.adapter1.match_to(read1)
            adapter_match2 = self.adapter2.match_to(read2)
            # If the adapter matches are complementary, perform error
            # correction
            if self.mismatch_action and adapter_match1 and adapter_match2:
                if adapter_match1.rstart == adapter_match2.rstart:
                    insert_match = insert_from_adapter_start(
                        adapter_match1.rstart)
                    needs_correction = True

        # If exactly one of the two alignments failed and symmetric is True,
        # duplicate the good alignment
        exactly_one = self.symmetric and (
            bool(adapter_match1) != bool(adapter_match2))
        if exactly_one:
            if adapter_match1:
                adapter_match2 = mirrored(adapter_match1, len2)
            else:
                adapter_match1 = mirrored(adapter_match2, len1)

            if (self.mismatch_action and not insert_match
                    and adapter_match1 and adapter_match2):
                # Assume that the symmetric read segments overlap and
                # perform error correction
                insert_match = insert_from_adapter_start(
                    adapter_match1.rstart)
                needs_correction = True

        if needs_correction:
            self.correct_errors(read1, read2, insert_match, truncate_seqs=True)

        result1 = self.trim(read1, self.adapter1, adapter_match1, 0)
        result2 = self.trim(read2, self.adapter2, adapter_match2, 1)
        return (result1, result2)

    def trim(self, read, adapter, match, read_idx):
        """Trim an adapter from a read.

        Args:
            read: The read to trim from.
            adapter: The Adapter to trim.
            match: The match details.
            read_idx: 0/1

        Returns:
            The input read when there is no match or nothing is cut,
            otherwise the read produced by ``adapter.trimmed``.
        """
        if not match:
            read.match = None
            read.match_info = None
            return read

        match.adapter = adapter
        match.read = read
        match.front = False

        keep_as_is = self.action is None or match.rstart >= len(read)
        if keep_as_is:
            outcome = read
        else:
            outcome = adapter.trimmed(match)

            if self.action == 'mask':
                # Fill the removed tail with N's and reuse the original
                # qualities.
                removed = len(read) - len(outcome)
                outcome.sequence = outcome.sequence + 'N' * removed
                outcome.qualities = read.qualities
            elif self.action == 'lower':
                # TODO: offer option to mask with lower-case of trimmed base
                # This will happen as part of the refactoring to modify
                # Sequences in-place.
                pass

        outcome.match = match
        outcome.match_info = [match.get_info_record()]

        self.with_adapters[read_idx] += 1
        return outcome

    def summarize(self):
        """Returns a summary dict.
        """
        per_adapter = tuple([
            {adapter.name: adapter.summarize()}
            for adapter in (self.adapter1, self.adapter2)])
        summary = {
            'records_with_adapters': self.with_adapters,
            'adapters': per_adapter,
        }
        if self.mismatch_action:
            summary.update(ErrorCorrectorMixin.summarize(self))
        return summary