Example #1
0
def test_end_trim_with_mismatch():
    """Back-adapter trimming at the read end when the overlap has one error.

    The not-so-obvious case: an adapter of length 13 overlaps the end of the
    first read by 9 bases and there is one deletion. The algorithm starts
    with 10 bases of the adapter to get the hit, so the match is considered
    good and the read is trimmed. The second read differs from the adapter
    by a substitution at the same spot; that is not a match, so the read
    must come back unchanged and no length may be recorded.
    """
    back_adapter = Adapter('TCGATCGATCGAT', BACK, 0.1)

    # Case 1: one deletion inside the overlap -> read is trimmed.
    deletion_read = Sequence('foo1', 'AAAAAAAAAAATCGTCGATC')
    trim = AdapterCutter([back_adapter], times=1)
    result = trim(deletion_read)

    assert result.sequence == 'AAAAAAAAAAA'
    assert trim.adapter_statistics[back_adapter].back.lengths == {9: 1}
    # One error is reported at length 9 even though the number of allowed
    # mismatches at length 9 is 0.
    assert trim.adapter_statistics[back_adapter].back.errors[9][1] == 1

    # Case 2: a substitution instead of a deletion -> nothing is removed.
    substitution_read = Sequence('foo2', 'AAAAAAAAAAATCGAACGA')
    trim = AdapterCutter([back_adapter], times=1)
    result = trim(substitution_read)

    assert result.sequence == substitution_read.sequence
    assert trim.adapter_statistics[back_adapter].back.lengths == {}
Example #2
0
 def test_twoheaders(self):
     """Write two records with ``second_header=True`` and check the file text.

     The expected text repeats each record name after the '+' separator.
     The writer's underlying file must be closed once the ``with`` block
     has been left.
     """
     with FastqWriter(self.path) as fq:
         for name, bases, quals in (("name", "CCATA", "!#!#!"),
                                    ("name2", "HELLO", "&&&!&")):
             fq.write(Sequence(name, bases, quals, second_header=True))
     assert fq._file.closed
     with open(self.path) as handle:
         written = handle.read()
         assert written == '@name\nCCATA\n+name\n!#!#!\n@name2\nHELLO\n+name2\n&&&!&\n'
Example #3
0
 def test_write_to_file_like_object(self):
     """Write FASTA records into a caller-supplied StringIO.

     The buffer content is checked while the writer is still open, and the
     buffer must remain open after the writer's ``with`` block has exited.
     """
     buffer = StringIO()
     with FastaWriter(buffer) as writer:
         for name, bases in (("name", "CCATA"), ("name2", "HELLO")):
             writer.write(Sequence(name, bases))
         assert buffer.getvalue() == '>name\nCCATA\n>name2\nHELLO\n'
     assert not writer._file.closed
Example #4
0
 def test_write_sequence_object(self):
     """Write two Sequence objects to ``self.path`` and verify the FASTA text.

     The writer is given a path here, and its underlying file must be
     closed after the ``with`` block.
     """
     with FastaWriter(self.path) as writer:
         for name, bases in (("name", "CCATA"), ("name2", "HELLO")):
             writer.write(Sequence(name, bases))
     assert writer._file.closed
     with open(self.path) as handle:
         written = handle.read()
         assert written == '>name\nCCATA\n>name2\nHELLO\n'
Example #5
0
def test_nend_trimmer():
    """Check NEndTrimmer against (input, expected) sequence pairs.

    In the expected values the runs of 'N' at both ends are gone while an
    internal 'NNN' run is retained; an all-'N' read becomes empty.
    """
    trimmer = NEndTrimmer()
    cases = (
        ('NNNNAAACCTTGGNNN', 'AAACCTTGG'),
        ('NNNNAAACNNNCTTGGNNN', 'AAACNNNCTTGG'),
        ('NNNNNN', ''),
    )
    for raw, expected in cases:
        untrimmed = Sequence('read1', raw, qualities='#' * len(raw))
        wanted = Sequence('read1', expected, qualities='#' * len(expected))
        assert trimmer(untrimmed) == wanted
Example #6
0
def test_nextseq_trim():
    """Check nextseq_trim_index on an empty read and on one fixed read.

    With ``cutoff=22`` the empty read must give index 0 and the sample
    read below must give index 33.
    """
    empty = Sequence('n', '', '')
    assert nextseq_trim_index(empty, cutoff=22) == 0

    bases = 'TCTCGTATGCCGTCTTATGCTTGAAAAAAAAAAGGGGGGGGGGGGGGGGGNNNNNNNNNNNGGNGG'
    quals = 'AA//EAEE//A6///E//A//EA/EEEEEEAEA//EEEEEEEEEEEEEEE###########EE#EA'
    sample = Sequence('n', bases, quals)
    assert nextseq_trim_index(sample, cutoff=22) == 33
Example #7
0
 def test(self):
     """Write two read pairs through InterleavedSequenceWriter into a StringIO.

     The expected text lists the records in the order read1, read2 for each
     pair, as FASTQ with a bare '+' separator line. The buffer is inspected
     after the writer's ``with`` block has exited.
     """
     pairs = [
         (Sequence('A/1 comment', 'TTA', '##H'),
          Sequence('A/2 comment', 'GCT', 'HH#')),
         (Sequence('B/1', 'CC', 'HH'),
          Sequence('B/2', 'TG', '#H')),
     ]
     buffer = StringIO()
     with InterleavedSequenceWriter(buffer) as writer:
         for first, second in pairs:
             writer.write(first, second)
     expected_text = (
         '@A/1 comment\nTTA\n+\n##H\n'
         '@A/2 comment\nGCT\n+\nHH#\n'
         '@B/1\nCC\n+\nHH\n'
         '@B/2\nTG\n+\n#H\n'
     )
     assert buffer.getvalue() == expected_text
Example #8
0
def test_quality_trimmer():
    """Run QualityTrimmer with three argument sets on one read.

    The arguments are presumably (front cutoff, back cutoff, quality base);
    confirm against QualityTrimmer. In the expected values, (10, 10, 33)
    removes the '#' bases from both ends, (0, 10, 33) only from the tail
    and (10, 0, 33) only from the head.
    """
    read = Sequence('read1', 'ACGTTTACGTA', '##456789###')
    cases = (
        ((10, 10, 33), 'GTTTAC', '456789'),
        ((0, 10, 33), 'ACGTTTAC', '##456789'),
        ((10, 0, 33), 'GTTTACGTA', '456789###'),
    )
    for trimmer_args, bases, quals in cases:
        trimmer = QualityTrimmer(*trimmer_args)
        assert trimmer(read) == Sequence('read1', bases, quals)
Example #9
0
def test_shortener():
    """Check Shortener for lengths 0, 1, 5 and one beyond the read length.

    The expected results are the leading bases and qualities of the read;
    a length larger than the read (100) must give back an equal read.
    """
    read = Sequence('read1', 'ACGTTTACGTA', '##456789###')

    prefixes = (
        (0, '', ''),
        (1, 'A', '#'),
        (5, 'ACGTT', '##456'),
    )
    for length, bases, quals in prefixes:
        assert Shortener(length)(read) == Sequence('read1', bases, quals)

    assert Shortener(100)(read) == read
Example #10
0
    def test(self):
        """Read the interleaved FASTQ fixture two ways and compare the pairs.

        The file is read first through InterleavedSequenceReader and then
        through ``openseq(..., interleaved=True)``; both must give the same
        list of (read1, read2) tuples.
        """
        fixture = "tests/cut/interleaved.fastq"
        wanted = [
            (Sequence('read1/1 some text', 'TTATTTGTCTCCAGC', '##HHHHHHHHHHHHH'),
             Sequence('read1/2 other text', 'GCTGGAGACAAATAA', 'HHHHHHHHHHHHHHH')),
            (Sequence('read3/1', 'CCAACTTGATATTAATAACA', 'HHHHHHHHHHHHHHHHHHHH'),
             Sequence('read3/2', 'TGTTATTAATATCAAGTTGG', '#HHHHHHHHHHHHHHHHHHH')),
        ]

        pairs = list(InterleavedSequenceReader(fixture))
        # Print actual next to expected so a failure is easy to diagnose.
        for (got1, got2), (want1, want2) in zip(pairs, wanted):
            print(got1, got2, want1, want2)
        assert pairs == wanted

        with openseq(fixture, interleaved=True) as reader:
            pairs = list(reader)
        assert pairs == wanted
Example #11
0
def test_ncontentfilter_paired():
    """Check NContentFilter on read pairs, with and without ``check_second``.

    With ``check_second=False`` the verdict for a pair must equal the
    verdict for read 1 alone. With ``check_second=True`` the verdict must
    match the expected value listed for each case.
    """
    # Each case: (read 1 bases, read 2 bases, N count, expected verdict).
    cases = [
        ('AAA', 'AAA', 0, KEEP),
        ('AAAN', 'AAA', 0, DISCARD),
        ('AAA', 'AANA', 0, DISCARD),
        ('ANAA', 'AANA', 1, KEEP),
    ]
    for bases1, bases2, n_count, verdict in cases:
        first_only = NContentFilter(count=n_count, check_second=False)
        both_reads = NContentFilter(count=n_count, check_second=True)
        read1 = Sequence('read1', bases1, qualities='#' * len(bases1))
        read2 = Sequence('read1', bases2, qualities='#' * len(bases2))
        assert first_only(read1, read2) == first_only(read1)
        # discard entire pair if one of the reads fulfills criteria
        assert both_reads(read1, read2) == verdict
Example #12
0
def test_ncontentfilter_paired(seq1, seq2, count, expected):
    """Check one NContentFilter case through PairedRedirector.

    The same filter is wrapped in two redirectors. In 'first' mode the
    pair verdict must equal the verdict for read 1 alone; in 'any' mode it
    must equal ``expected``. The four arguments are supplied by the caller
    (presumably a parametrize decorator outside this excerpt).
    """
    n_filter = NContentFilter(count=count)
    first_mode = PairedRedirector(
        None, n_filter, n_filter, pair_filter_mode='first')
    any_mode = PairedRedirector(
        None, n_filter, n_filter, pair_filter_mode='any')

    r1 = Sequence('read1', seq1, qualities='#' * len(seq1))
    r2 = Sequence('read1', seq2, qualities='#' * len(seq2))

    assert first_mode(r1, r2, [], []) == n_filter(r1, [])
    # discard entire pair if one of the reads fulfills criteria
    assert any_mode(r1, r2, [], []) == expected
Example #13
0
def test_issue_80():
    """Pin the alignment reported in issue 80.

    This issue turned out to not be an actual issue with the alignment
    algorithm. The following alignment is found because it has more matches
    than the 'obvious' one::

        TCGTATGCCGTCTTC
        =========X==XX=
        TCGTATGCCCTC--C

    This is correct, albeit a little surprising, since an alignment without
    indels would have only two errors.
    """
    adapter = Adapter(
        sequence="TCGTATGCCGTCTTC",
        where=BACK,
        remove='suffix',
        max_error_rate=0.2,
        min_overlap=3,
        read_wildcards=False,
        adapter_wildcards=False)
    query = Sequence(name="seq2", sequence="TCGTATGCCCTCC")
    match = adapter.match_to(query)
    # The match itself is used as the assertion message for easier debugging.
    for attribute, wanted in (('errors', 3), ('astart', 0), ('astop', 15)):
        assert getattr(match, attribute) == wanted, match
Example #14
0
def test_ncontentfilter_paired():
    """Check NContentFilter on read pairs through the two redirector classes.

    Via LegacyPairedRedirector the pair verdict must equal the verdict for
    read 1 alone. Via PairedRedirector it must equal the expected value
    listed for each case.
    """
    # Each case: (read 1 bases, read 2 bases, N count, expected verdict).
    cases = [
        ('AAA', 'AAA', 0, KEEP),
        ('AAAN', 'AAA', 0, DISCARD),
        ('AAA', 'AANA', 0, DISCARD),
        ('ANAA', 'AANA', 1, KEEP),
    ]
    for bases1, bases2, n_count, verdict in cases:
        n_filter = NContentFilter(count=n_count)
        legacy = LegacyPairedRedirector(None, n_filter)
        paired = PairedRedirector(None, n_filter)
        read1 = Sequence('read1', bases1, qualities='#' * len(bases1))
        read2 = Sequence('read1', bases2, qualities='#' * len(bases2))
        assert legacy(read1, read2) == n_filter(read1)
        # discard entire pair if one of the reads fulfills criteria
        assert paired(read1, read2) == verdict
Example #15
0
def test_info_record():
    """Check the tuple returned by Match.get_info_record() for a fixed match.

    The expected 11-field record lines up with the inputs as follows: read
    name, error count, rstart, rstop, the read before/within/after the
    [rstart, rstop) span, and the adapter name. The last three fields are
    expected to be empty strings.
    """
    adapter = Adapter(
        sequence='GAACTCCAGTCACNNNNN',
        where=BACK,
        max_error_rate=0.12,
        min_overlap=5,
        read_wildcards=False,
        adapter_wildcards=True,
        name="Foo")
    read = Sequence(name="abc", sequence='CCCCAGAACTACAGTCCCGGC')
    match = Match(
        astart=0, astop=17, rstart=5, rstop=21, matches=15, errors=2,
        remove_before=False, adapter=adapter, read=read)
    wanted_record = (
        "abc",
        2,
        5,
        21,
        'CCCCA',
        'GAACTACAGTCCCGGC',
        '',
        'Foo',
        '',
        '',
        '',
    )
    assert match.get_info_record() == wanted_record
Example #16
0
def test_linked_adapter():
    """Trim a read flanked by both halves of a linked adapter.

    With front 'AAAA' and back 'TTTT', the trimmed read must keep its name
    and consist only of the 'CCCCC' between the two adapter parts.
    """
    adapter = LinkedAdapter('AAAA', 'TTTT')
    read = Sequence(name='seq', sequence='AAAACCCCCTTTT')
    trimmed = adapter.trimmed(adapter.match_to(read))
    assert trimmed.name == 'seq'
    assert trimmed.sequence == 'CCCCC'
Example #17
0
def readFastq(inp=sys.stdin):
    """Yield one ``Sequence`` per four-line FASTQ record read from *inp*.

    Lines are consumed in groups of four: line 0 is the header, line 1 the
    bases, line 2 the separator and line 3 the qualities. Only the header
    and the bases are used; both are stripped of surrounding whitespace
    (the header's leading '@' is not removed). A trailing record with
    fewer than four lines is silently ignored.

    :param inp: iterable of text lines; defaults to the ``sys.stdin`` object
        bound when the function was defined.
    :return: generator of ``Sequence(read_id, read_seq)`` objects.
    """
    read_id = read_seq = None
    for index, line in enumerate(inp):
        # Position within the current record. The previous test used
        # ``i & 4 == 0``, which is a bit test rather than a modulo: it let
        # later lines of a record overwrite the id and skipped the header
        # of every other record.
        position = index % 4
        if position == 0:
            read_id = line.strip()
        elif position == 1:
            read_seq = line.strip()
        elif position == 3:
            yield Sequence(read_id, read_seq)
Example #18
0
def test_issue_265():
    """Crash when accessing the matches property of non-anchored linked adapters.

    Both restrictions are passed as None; reading ``matches`` on the
    resulting match must work and give 3.
    """
    read = Sequence('name', 'AAAATTTT')
    linked = LinkedAdapter(
        'GGG', 'TTT', front_restriction=None, back_restriction=None)
    match = linked.match_to(read)
    assert match.matches == 3
Example #19
0
def test_ncontenttrimmer():
    """Check NContentTrimmer verdicts for integer and fractional ``count``.

    Each case is (bases, value passed as ``count``, expected verdict). Per
    the original note, the third element is True if the read should be
    discarded.
    """
    cases = [
        ('AAA', 0, KEEP),
        ('AAA', 1, KEEP),
        ('AAACCTTGGN', 1, KEEP),
        ('AAACNNNCTTGGN', 0.5, KEEP),
        ('NNNNNN', 1, DISCARD),
        ('ANAAAA', 1 / 6, KEEP),
        ('ANAAAA', 0, DISCARD),
    ]
    for bases, threshold, verdict in cases:
        n_filter = NContentTrimmer(count=threshold)
        candidate = Sequence('read1', bases, qualities='#' * len(bases))
        assert n_filter(candidate) == verdict
Example #20
0
def test_linked_adapter():
    """Check ``min_overlap`` propagation and trimming for a linked adapter.

    The ``min_overlap`` given to LinkedAdapter must be visible on both the
    front and the back adapter. Trimming a read flanked by both parts must
    leave only the 'CCCCC' in between, under the original read name.
    """
    adapter = LinkedAdapter('AAAA', 'TTTT', min_overlap=4)
    assert adapter.front_adapter.min_overlap == 4
    assert adapter.back_adapter.min_overlap == 4

    read = Sequence(name='seq', sequence='AAAACCCCCTTTT')
    match = adapter.match_to(read)
    trimmed = match.trimmed()
    assert trimmed.name == 'seq'
    assert trimmed.sequence == 'CCCCC'
Example #21
0
def test_anywhere_parameter():
    """Parse a back-adapter spec carrying the ';anywhere' parameter.

    The parsed adapter must report ``remove == 'suffix'`` and
    ``where == ANYWHERE``. Cutting the sample read with it must leave an
    empty sequence.
    """
    parser = AdapterParser(
        colorspace=False,
        max_error_rate=0.2,
        min_overlap=4,
        read_wildcards=False,
        adapter_wildcards=False,
        indels=True)
    parsed = list(parser.parse('CTGAAGTGAAGTACACGGTT;anywhere', 'back'))
    adapter = parsed[0]
    assert adapter.remove == 'suffix'
    assert adapter.where == ANYWHERE

    read = Sequence('foo1', 'TGAAGTACACGGTTAAAAAAAAAA')
    from cutadapt.modifiers import AdapterCutter
    cut = AdapterCutter([adapter])
    result = cut(read, [])
    assert result.sequence == ''
Example #22
0
def test_statistics():
    """The trimmed base pairs recorded on the adapters must not exceed the read.

    The cutter is run with ``times=3``. The total is the sum of
    length * count over each adapter's ``lengths_front`` and
    ``lengths_back`` histograms.
    """
    read = Sequence('name', 'AAAACCCCAAAA')
    adapters = [Adapter('CCCC', BACK, 0.1)]
    cutter = AdapterCutter(adapters, times=3)
    trimmed_read = cutter(read)
    # TODO make this a lot simpler
    trimmed_bp = sum(
        length * occurrences
        for adapter in adapters
        for histogram in (adapter.lengths_front, adapter.lengths_back)
        for (length, occurrences) in histogram.items()
    )
    assert trimmed_bp <= len(read), trimmed_bp
Example #23
0
def test_statistics():
    """The trimmed base pairs recorded by the cutter must not exceed the read.

    The cutter is run with ``times=3``. The total is the sum of
    length * count over the front and back ``lengths`` histograms that the
    cutter keeps for each adapter in ``adapter_statistics``.
    """
    read = Sequence('name', 'AAAACCCCAAAA')
    adapters = [Adapter('CCCC', BACK, max_error_rate=0.1)]
    cutter = AdapterCutter(adapters, times=3)
    trimmed_read = cutter(read, [])
    # TODO make this a lot simpler
    trimmed_bp = sum(
        length * occurrences
        for adapter in adapters
        for histogram in (cutter.adapter_statistics[adapter].front.lengths,
                          cutter.adapter_statistics[adapter].back.lengths)
        for (length, occurrences) in histogram.items()
    )
    assert trimmed_bp <= len(read), trimmed_bp
Example #24
0
def test_issue_52():
    """Check Match.wildcards() for the adapter/read pair from issue 52.

    The adapter ends in a run of 'N' wildcards and is built with
    ``adapter_wildcards=True``. For the hand-constructed match below,
    ``wildcards()`` must return 'GGC'.
    """
    adapter = Adapter(
        sequence='GAACTCCAGTCACNNNNN',
        where=BACK,
        max_error_rate=0.12,
        min_overlap=5,
        read_wildcards=False,
        adapter_wildcards=True)
    read = Sequence(name="abc", sequence='CCCCAGAACTACAGTCCCGGC')
    am = Match(astart=0, astop=17, rstart=5, rstop=21, matches=15, errors=2,
               front=None, adapter=adapter, read=read)
    assert am.wildcards() == 'GGC'
Example #25
0
def test_anywhere_with_errors():
    """Trim an ANYWHERE adapter at either end of the read, with and without errors.

    Every input contains the adapter in lowercase, either exactly or with
    one mismatch or deletion, after or before the bases 'AACCGGTT'. In all
    cases only 'AACCGGTT' may remain.
    """
    adapter = Adapter('CCGCATTTAG', ANYWHERE, max_error_rate=0.1)
    cases = (
        ('AACCGGTTccgcatttagGATC', 'AACCGGTT'),
        ('AACCGGTTccgcgtttagGATC', 'AACCGGTT'),  # one mismatch
        ('AACCGGTTccgcatttag', 'AACCGGTT'),
        ('ccgcatttagAACCGGTT', 'AACCGGTT'),
        ('ccgtatttagAACCGGTT', 'AACCGGTT'),  # one mismatch
        ('ccgatttagAACCGGTT', 'AACCGGTT'),  # one deletion
    )
    for bases, remaining in cases:
        read = Sequence('foo', bases)
        # A fresh cutter for every read, as in the original test.
        cut = AdapterCutter([adapter], times=1)
        result = cut(read, [])
        assert result.sequence == remaining
Example #26
0
def test_sequence():
    """Match a VBIMSeq tag inside a read and check the trimmed sequence.

    The read is a run of 'A', then the start of the tag sequence, then a
    run of 'T'. The match's ``trimmed()`` result must equal the expected
    string below.
    """
    tag = VBIMSeq(
        sequence='CCACCATGGATTACAAGGATGACGACGATAAGAATTCTT',
        where=ANYWHERE,
        max_error_rate=0.1,
        read_wildcards=False,
        adapter_wildcards=False)
    read = Sequence(
        name="test1",
        sequence='AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAACCACCATGGATTACAAGGATGACGACGATAAGAATTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTT')
    match = tag.match_to(read)
    remaining = match.trimmed().sequence
    assert remaining == 'AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAATTTTTTTTTTTTTTTTTTTTTTTTTTTT'
Example #27
0
 def _trim(self):
     """Compute the trimmed read and the base adjacent to the match.

     Sets ``self._trimmed_read`` to a new Sequence holding only the part of
     the read after the match (``read[rstop:]``), with qualities sliced the
     same way when the read has any. Sets ``self.adjacent_base`` to the
     read base immediately before the match start. It is '' when the match
     starts at position 0 or when that base is not one of A, C, G, T.
     """
     read = self.read
     # Everything up to and including the match is discarded. The
     # alternative of cutting out only the matched tag and keeping the
     # bases before it is deliberately not used here.
     self._trimmed_read = Sequence(
         read.name,
         read.sequence[self.rstop:],
         qualities=read.qualities[self.rstop:] if read.qualities else None,
         second_header=read.second_header,
         match=self)
     # A match starting at position 0 has no preceding base. Indexing with
     # rstart - 1 == -1 would silently return the last base of the read,
     # or raise IndexError on an empty read.
     if self.rstart > 0:
         adjacent_base = read.sequence[self.rstart - 1]
     else:
         adjacent_base = ''
     if adjacent_base not in 'ACGT':
         adjacent_base = ''
     self.adjacent_base = adjacent_base
Example #28
0
 def test_too_many_qualities(self):
     """Build a Sequence with five quality characters for four bases.

     NOTE(review): presumably meant to raise FormatError, and the
     decorator that checks this is missing from this excerpt - confirm.
     """
     Sequence("name", "ACGT", qualities="#####")
Example #29
0
 def test_write_zero_length_sequence(self):
     """Write a record with an empty sequence through FastaWriter.

     The output must be the header line followed by an empty line. On
     failure the repr of the actual output is the assertion message.
     """
     buffer = StringIO()
     with FastaWriter(buffer) as writer:
         writer.write(Sequence("name", ""))
         written = buffer.getvalue()
         assert written == '>name\n\n', '{0!r}'.format(written)
Example #30
0
import sys
import os
import shutil
from textwrap import dedent
from nose.tools import raises
from tempfile import mkdtemp
from cutadapt.seqio import (Sequence, ColorspaceSequence, FormatError,
                            FastaReader, FastqReader, FastaQualReader,
                            InterleavedSequenceReader, FastaWriter,
                            FastqWriter, InterleavedSequenceWriter, open as
                            openseq)
from cutadapt.compat import StringIO

# Expected records for the fixture files tests/data/simple.fast{q,a}.
simple_fastq = [
    Sequence(name="first_sequence", sequence="SEQUENCE1", qualities=":6;;8<=:<"),
    Sequence(name="second_sequence", sequence="SEQUENCE2", qualities="83<??:(61"),
]

# The FASTA expectation is the same records with qualities set to None.
simple_fasta = [
    Sequence(record.name, record.sequence, None) for record in simple_fastq
]


class TestSequence:
    """Constructor checks: a quality string longer than the bases is rejected."""

    @raises(FormatError)
    def test_too_many_qualities(self):
        """Five quality characters for four bases must raise FormatError."""
        Sequence("name", "ACGT", qualities="#####")

    @raises(FormatError)
    def test_too_many_qualities_colorspace(self):
        """Five quality characters for the colorspace read 'T0123' must raise.

        NOTE(review): presumably the leading 'T' is the primer base, leaving
        four colors - confirm against ColorspaceSequence.
        """
        ColorspaceSequence(qualities="#####", sequence="T0123", name="name")