def test_end_trim_with_mismatch():
    """
    Exercise 3' trimming of a 13-base adapter that overlaps the read by 9.

    With a single deletion inside the overlap the adapter must still be
    found: the algorithm starts out with 10 adapter bases to obtain the
    hit, so the match counts as good. An insertion or a substitution at
    the same position must not produce a match.
    """
    adapter = Adapter('TCGATCGATCGAT', BACK, 0.1)

    # Read whose adapter occurrence carries one deletion: gets trimmed.
    deletion_read = Sequence('foo1', 'AAAAAAAAAAATCGTCGATC')
    deletion_cutter = AdapterCutter([adapter], times=1)
    deletion_result = deletion_cutter(deletion_read)

    assert deletion_result.sequence == 'AAAAAAAAAAA'
    assert deletion_cutter.adapter_statistics[adapter].back.lengths == {9: 1}
    # One error is recorded at length 9 although the number of mismatches
    # allowed at that length is 0.
    assert deletion_cutter.adapter_statistics[adapter].back.errors[9][1] == 1

    # Read that must not match: it has to come back unchanged.
    untouched_read = Sequence('foo2', 'AAAAAAAAAAATCGAACGA')
    untouched_cutter = AdapterCutter([adapter], times=1)
    untouched_result = untouched_cutter(untouched_read)

    assert untouched_result.sequence == untouched_read.sequence
    assert untouched_cutter.adapter_statistics[adapter].back.lengths == {}
def test_twoheaders(self):
    """Write two records with ``second_header=True`` and check the file text.

    The expected output repeats each record name after the ``+`` separator.
    The writer must also have closed its file once the ``with`` block ends.
    """
    records = (
        ("name", "CCATA", "!#!#!"),
        ("name2", "HELLO", "&&&!&"),
    )
    with FastqWriter(self.path) as writer:
        for name, bases, quals in records:
            writer.write(Sequence(name, bases, quals, second_header=True))
    assert writer._file.closed

    with open(self.path) as written:
        content = written.read()
        assert content == '@name\nCCATA\n+name\n!#!#!\n@name2\nHELLO\n+name2\n&&&!&\n'
def test_write_to_file_like_object(self):
    """Write FASTA records into a ``StringIO`` and inspect the buffer.

    The buffer content is checked while the writer is still open, and the
    file-like object must remain open after the ``with`` block has exited.
    """
    buffer = StringIO()
    records = (("name", "CCATA"), ("name2", "HELLO"))
    with FastaWriter(buffer) as writer:
        for name, bases in records:
            writer.write(Sequence(name, bases))
        assert buffer.getvalue() == '>name\nCCATA\n>name2\nHELLO\n'
    assert not writer._file.closed
def test_write_sequence_object(self):
    """Write two ``Sequence`` objects to ``self.path`` and check the FASTA text.

    The writer must have closed its file once the ``with`` block ends.
    """
    records = (("name", "CCATA"), ("name2", "HELLO"))
    with FastaWriter(self.path) as writer:
        for name, bases in records:
            writer.write(Sequence(name, bases))
    assert writer._file.closed

    with open(self.path) as written:
        content = written.read()
        assert content == '>name\nCCATA\n>name2\nHELLO\n'
def test_nend_trimmer():
    """Check that ``NEndTrimmer`` strips N runs from both read ends.

    Each case pairs an input sequence with the expected trimmed sequence;
    N bases in the interior of a read are expected to survive.
    """
    trimmer = NEndTrimmer()
    cases = (
        ('NNNNAAACCTTGGNNN', 'AAACCTTGG'),
        ('NNNNAAACNNNCTTGGNNN', 'AAACNNNCTTGG'),
        ('NNNNNN', ''),
    )
    for raw, stripped in cases:
        untrimmed = Sequence('read1', raw, qualities='#' * len(raw))
        wanted = Sequence('read1', stripped, qualities='#' * len(stripped))
        assert trimmer(untrimmed) == wanted
def test_nextseq_trim():
    """Check ``nextseq_trim_index`` on an empty read and on a real-looking one.

    With a cutoff of 22 the empty read yields index 0 and the second read
    yields index 33.
    """
    empty_read = Sequence('n', '', '')
    assert nextseq_trim_index(empty_read, cutoff=22) == 0

    bases = 'TCTCGTATGCCGTCTTATGCTTGAAAAAAAAAAGGGGGGGGGGGGGGGGGNNNNNNNNNNNGGNGG'
    quals = 'AA//EAEE//A6///E//A//EA/EEEEEEAEA//EEEEEEEEEEEEEEE###########EE#EA'
    long_read = Sequence('n', bases, quals)
    assert nextseq_trim_index(long_read, cutoff=22) == 33
def test(self):
    """Write two read pairs through ``InterleavedSequenceWriter``.

    The resulting text must contain the four records in R1, R2, R1, R2
    order, each with an empty ``+`` line.
    """
    pairs = [
        (Sequence('A/1 comment', 'TTA', '##H'), Sequence('A/2 comment', 'GCT', 'HH#')),
        (Sequence('B/1', 'CC', 'HH'), Sequence('B/2', 'TG', '#H')),
    ]
    buffer = StringIO()
    with InterleavedSequenceWriter(buffer) as writer:
        for first, second in pairs:
            writer.write(first, second)
    written = buffer.getvalue()
    assert written == '@A/1 comment\nTTA\n+\n##H\n@A/2 comment\nGCT\n+\nHH#\n@B/1\nCC\n+\nHH\n@B/2\nTG\n+\n#H\n'
def test_quality_trimmer():
    """Run ``QualityTrimmer`` with three argument sets on the same read.

    Each case lists the constructor arguments and the sequence/qualities
    expected after trimming.
    """
    read = Sequence('read1', 'ACGTTTACGTA', '##456789###')
    cases = (
        ((10, 10, 33), ('GTTTAC', '456789')),
        ((0, 10, 33), ('ACGTTTAC', '##456789')),
        ((10, 0, 33), ('GTTTACGTA', '456789###')),
    )
    for trimmer_args, (bases, quals) in cases:
        trimmer = QualityTrimmer(*trimmer_args)
        assert trimmer(read) == Sequence('read1', bases, quals)
def test_shortener():
    """Check that ``Shortener(n)`` keeps only the first ``n`` bases.

    A limit larger than the read length must return a read equal to the
    input.
    """
    read = Sequence('read1', 'ACGTTTACGTA', '##456789###')
    cases = (
        (0, '', ''),
        (1, 'A', '#'),
        (5, 'ACGTT', '##456'),
    )
    for limit, bases, quals in cases:
        truncate = Shortener(limit)
        assert truncate(read) == Sequence('read1', bases, quals)

    truncate = Shortener(100)
    assert truncate(read) == read
def test(self):
    """Read an interleaved FASTQ file two ways and compare with known pairs.

    The file is read once through ``InterleavedSequenceReader`` directly
    and once through ``openseq(..., interleaved=True)``; both must yield
    the same list of (R1, R2) tuples.
    """
    expected = [
        (
            Sequence('read1/1 some text', 'TTATTTGTCTCCAGC', '##HHHHHHHHHHHHH'),
            Sequence('read1/2 other text', 'GCTGGAGACAAATAA', 'HHHHHHHHHHHHHHH'),
        ),
        (
            Sequence('read3/1', 'CCAACTTGATATTAATAACA', 'HHHHHHHHHHHHHHHHHHHH'),
            Sequence('read3/2', 'TGTTATTAATATCAAGTTGG', '#HHHHHHHHHHHHHHHHHHH'),
        ),
    ]

    reads = list(InterleavedSequenceReader("tests/cut/interleaved.fastq"))
    # Print actual and expected pairs side by side to ease debugging.
    for actual_pair, expected_pair in zip(reads, expected):
        r1, r2 = actual_pair
        e1, e2 = expected_pair
        print(r1, r2, e1, e2)
    assert reads == expected

    with openseq("tests/cut/interleaved.fastq", interleaved=True) as pairs:
        reads = list(pairs)
    assert reads == expected
def test_ncontentfilter_paired():
    """Check ``NContentFilter`` on read pairs with and without ``check_second``.

    Each case is (R1 sequence, R2 sequence, N count limit, expected verdict
    when both reads are checked).
    """
    cases = [
        ('AAA', 'AAA', 0, KEEP),
        ('AAAN', 'AAA', 0, DISCARD),
        ('AAA', 'AANA', 0, DISCARD),
        ('ANAA', 'AANA', 1, KEEP),
    ]
    for seq1, seq2, count, expected in cases:
        first_only = NContentFilter(count=count, check_second=False)
        both_reads = NContentFilter(count=count, check_second=True)
        read1 = Sequence('read1', seq1, qualities='#'*len(seq1))
        read2 = Sequence('read1', seq2, qualities='#'*len(seq2))
        # Without check_second the pair verdict must equal the R1-only verdict.
        assert first_only(read1, read2) == first_only(read1)
        # discard entire pair if one of the reads fulfills criteria
        assert both_reads(read1, read2) == expected
def test_ncontentfilter_paired(seq1, seq2, count, expected):
    """Check ``NContentFilter`` wrapped in ``PairedRedirector`` for one case.

    :param seq1: sequence of the first read
    :param seq2: sequence of the second read
    :param count: value passed as ``count`` to ``NContentFilter``
    :param expected: verdict expected with ``pair_filter_mode='any'``
    """
    n_filter = NContentFilter(count=count)
    first_mode = PairedRedirector(None, n_filter, n_filter, pair_filter_mode='first')
    any_mode = PairedRedirector(None, n_filter, n_filter, pair_filter_mode='any')

    quals1 = '#' * len(seq1)
    quals2 = '#' * len(seq2)
    read1 = Sequence('read1', seq1, qualities=quals1)
    read2 = Sequence('read1', seq2, qualities=quals2)

    # In 'first' mode the pair verdict must equal the verdict for R1 alone.
    assert first_mode(read1, read2, [], []) == n_filter(read1, [])
    # discard entire pair if one of the reads fulfills criteria
    assert any_mode(read1, read2, [], []) == expected
def test_issue_80():
    """Pin the alignment reported for the read from issue 80."""
    # The issue turned out not to be a defect in the alignment algorithm.
    # The alignment below is reported because it contains more matches
    # than the 'obvious' one:
    #
    # TCGTATGCCGTCTTC
    # =========X==XX=
    # TCGTATGCCCTC--C
    #
    # That is correct, if a little surprising, because an alignment
    # without indels would have only two errors.
    adapter_options = dict(
        sequence="TCGTATGCCGTCTTC",
        where=BACK,
        remove='suffix',
        max_error_rate=0.2,
        min_overlap=3,
        read_wildcards=False,
        adapter_wildcards=False,
    )
    adapter = Adapter(**adapter_options)
    read = Sequence(name="seq2", sequence="TCGTATGCCCTCC")

    result = adapter.match_to(read)

    assert result.errors == 3, result
    assert result.astart == 0, result
    assert result.astop == 15, result
def test_ncontentfilter_paired():
    """Check ``NContentFilter`` behind the legacy and regular pair redirectors.

    Each case is (R1 sequence, R2 sequence, N count limit, expected verdict
    from ``PairedRedirector``).
    """
    cases = [
        ('AAA', 'AAA', 0, KEEP),
        ('AAAN', 'AAA', 0, DISCARD),
        ('AAA', 'AANA', 0, DISCARD),
        ('ANAA', 'AANA', 1, KEEP),
    ]
    for seq1, seq2, count, expected in cases:
        n_filter = NContentFilter(count=count)
        legacy_redirector = LegacyPairedRedirector(None, n_filter)
        pair_redirector = PairedRedirector(None, n_filter)
        read1 = Sequence('read1', seq1, qualities='#' * len(seq1))
        read2 = Sequence('read1', seq2, qualities='#' * len(seq2))
        # The legacy redirector must agree with the verdict for R1 alone.
        assert legacy_redirector(read1, read2) == n_filter(read1)
        # discard entire pair if one of the reads fulfills criteria
        assert pair_redirector(read1, read2) == expected
def test_info_record():
    """Check the tuple returned by ``Match.get_info_record()``.

    The match is built by hand from fixed coordinates; the record is
    expected to hold the read name, the error count, the read start/stop,
    the read split around the match, and the adapter name "Foo".
    """
    adapter = Adapter(
        sequence='GAACTCCAGTCACNNNNN',
        where=BACK,
        max_error_rate=0.12,
        min_overlap=5,
        read_wildcards=False,
        adapter_wildcards=True,
        name="Foo",
    )
    read = Sequence(name="abc", sequence='CCCCAGAACTACAGTCCCGGC')
    match = Match(
        astart=0,
        astop=17,
        rstart=5,
        rstop=21,
        matches=15,
        errors=2,
        remove_before=False,
        adapter=adapter,
        read=read,
    )
    info = match.get_info_record()
    assert info == (
        "abc", 2, 5, 21, 'CCCCA', 'GAACTACAGTCCCGGC', '', 'Foo', '', '', ''
    )
def test_linked_adapter():
    """Trim a read flanked by both parts of a ``LinkedAdapter``.

    Only the middle part of the read must remain, and its name must be
    kept.
    """
    linked = LinkedAdapter('AAAA', 'TTTT')
    read = Sequence(name='seq', sequence='AAAACCCCCTTTT')

    found = linked.match_to(read)
    result = linked.trimmed(found)

    assert result.name == 'seq'
    assert result.sequence == 'CCCCC'
def readFastq(inp=sys.stdin):
    """
    Lazily parse four-line FASTQ records from an iterable of lines.

    The first line of each group of four is taken as the read id and the
    second as the read sequence; both are stripped of surrounding
    whitespace. The third and fourth lines are not stored. One ``Sequence``
    is yielded when the fourth line of a group has been consumed, so a
    trailing incomplete record is silently dropped.

    :param inp: iterable yielding text lines (defaults to ``sys.stdin``)
    :return: generator of ``Sequence(read_id, read_seq)`` objects
    """
    for index, line in enumerate(inp):
        position = index % 4
        # The original test was ``i & 4 == 0``, i.e. ``(i & 4) == 0``. That is
        # true for lines 0-3 of every 8, so the id was overwritten by later
        # lines and every second record reused a stale id.
        if position == 0:
            read_id = line.strip()
        elif position == 1:
            read_seq = line.strip()
        elif position == 3:
            yield Sequence(read_id, read_seq)
def test_issue_265():
    """Reading ``matches`` on a non-anchored linked adapter match must not crash."""
    read = Sequence('name', 'AAAATTTT')
    linked = LinkedAdapter(
        'GGG',
        'TTT',
        front_restriction=None,
        back_restriction=None,
    )
    found = linked.match_to(read)
    assert found.matches == 3
def test_ncontenttrimmer():
    """Check ``NContentTrimmer`` verdicts for absolute and fractional limits."""
    # Each case: (sequence, count limit, expected verdict KEEP or DISCARD).
    cases = [
        ('AAA', 0, KEEP),
        ('AAA', 1, KEEP),
        ('AAACCTTGGN', 1, KEEP),
        ('AAACNNNCTTGGN', 0.5, KEEP),
        ('NNNNNN', 1, DISCARD),
        ('ANAAAA', 1 / 6, KEEP),
        ('ANAAAA', 0, DISCARD),
    ]
    for bases, limit, verdict in cases:
        n_trimmer = NContentTrimmer(count=limit)
        read = Sequence('read1', bases, qualities='#' * len(bases))
        assert n_trimmer(read) == verdict
def test_linked_adapter():
    """Check ``min_overlap`` propagation and trimming of a ``LinkedAdapter``.

    Both the front and the back adapter must receive the ``min_overlap``
    given to the linked adapter, and trimming must leave only the middle
    part of the read with its name unchanged.
    """
    linked = LinkedAdapter('AAAA', 'TTTT', min_overlap=4)
    assert linked.front_adapter.min_overlap == 4
    assert linked.back_adapter.min_overlap == 4

    read = Sequence(name='seq', sequence='AAAACCCCCTTTT')
    found = linked.match_to(read)
    result = found.trimmed()

    assert result.name == 'seq'
    assert result.sequence == 'CCCCC'
def test_anywhere_parameter():
    """Parse an adapter spec carrying the ``;anywhere`` parameter and use it.

    The parsed adapter must report ``remove == 'suffix'`` and
    ``where == ANYWHERE``; trimming the sample read with it must leave an
    empty sequence.
    """
    parser_options = dict(
        colorspace=False,
        max_error_rate=0.2,
        min_overlap=4,
        read_wildcards=False,
        adapter_wildcards=False,
        indels=True,
    )
    parser = AdapterParser(**parser_options)
    parsed = list(parser.parse('CTGAAGTGAAGTACACGGTT;anywhere', 'back'))
    adapter = parsed[0]
    assert adapter.remove == 'suffix'
    assert adapter.where == ANYWHERE

    read = Sequence('foo1', 'TGAAGTACACGGTTAAAAAAAAAA')
    from cutadapt.modifiers import AdapterCutter
    cutter = AdapterCutter([adapter])
    result = cutter(read, [])
    assert result.sequence == ''
def test_statistics():
    """Check that the recorded trimmed length never exceeds the read length.

    The total is summed over the front and back length histograms stored
    on each adapter after running the cutter with ``times=3``.
    """
    read = Sequence('name', 'AAAACCCCAAAA')
    adapters = [Adapter('CCCC', BACK, 0.1)]
    cutter = AdapterCutter(adapters, times=3)
    cutter(read)
    # TODO make this a lot simpler
    trimmed_bp = sum(
        seqlen * count
        for adapter in adapters
        for histogram in (adapter.lengths_front, adapter.lengths_back)
        for seqlen, count in histogram.items()
    )
    assert trimmed_bp <= len(read), trimmed_bp
def test_statistics():
    """Check that the recorded trimmed length never exceeds the read length.

    The total is summed over the front and back length histograms kept in
    ``cutter.adapter_statistics`` after running the cutter with ``times=3``.
    """
    read = Sequence('name', 'AAAACCCCAAAA')
    adapters = [Adapter('CCCC', BACK, max_error_rate=0.1)]
    cutter = AdapterCutter(adapters, times=3)
    cutter(read, [])
    # TODO make this a lot simpler
    trimmed_bp = sum(
        seqlen * count
        for adapter in adapters
        for histogram in (
            cutter.adapter_statistics[adapter].front.lengths,
            cutter.adapter_statistics[adapter].back.lengths,
        )
        for seqlen, count in histogram.items()
    )
    assert trimmed_bp <= len(read), trimmed_bp
def test_issue_52():
    """Check ``Match.wildcards()`` for a hand-built match (issue 52).

    The adapter ends in five ``N`` wildcards and is created with
    ``adapter_wildcards=True``. The match coordinates are given explicitly
    instead of being computed by an alignment, and ``wildcards()`` is
    expected to return ``'GGC'`` for them.
    """
    adapter = Adapter(
        sequence='GAACTCCAGTCACNNNNN',
        where=BACK,
        max_error_rate=0.12,
        min_overlap=5,
        read_wildcards=False,
        adapter_wildcards=True)
    read = Sequence(name="abc", sequence='CCCCAGAACTACAGTCCCGGC')
    am = Match(astart=0, astop=17, rstart=5, rstop=21, matches=15, errors=2,
        front=None, adapter=adapter, read=read)
    assert am.wildcards() == 'GGC'
    # NOTE(review): the triple quote below opens a string literal whose
    # remainder is not visible in this view - confirm it is closed further
    # down in the file.
    """
def test_anywhere_with_errors():
    """Trim an ``ANYWHERE`` adapter occurring at either read end.

    Covers exact occurrences as well as occurrences with one mismatch or
    one deletion; in every case only ``AACCGGTT`` must remain.
    """
    adapter = Adapter('CCGCATTTAG', ANYWHERE, max_error_rate=0.1)
    cases = (
        ('AACCGGTTccgcatttagGATC', 'AACCGGTT'),
        ('AACCGGTTccgcgtttagGATC', 'AACCGGTT'),  # one mismatch
        ('AACCGGTTccgcatttag', 'AACCGGTT'),
        ('ccgcatttagAACCGGTT', 'AACCGGTT'),
        ('ccgtatttagAACCGGTT', 'AACCGGTT'),  # one mismatch
        ('ccgatttagAACCGGTT', 'AACCGGTT'),  # one deletion
    )
    for bases, remaining in cases:
        read = Sequence('foo', bases)
        # A fresh cutter per case, as each one keeps its own statistics.
        cutter = AdapterCutter([adapter], times=1)
        result = cutter(read, [])
        assert result.sequence == remaining
def test_sequence():
    """Match a ``VBIMSeq`` inside a read and check the trimmed sequence.

    The read consists of a run of A, the tag sequence (with its last bases
    differing from the configured one), and a run of T.
    """
    tag = VBIMSeq(
        sequence='CCACCATGGATTACAAGGATGACGACGATAAGAATTCTT',
        where=ANYWHERE,
        max_error_rate=0.1,
        read_wildcards=False,
        adapter_wildcards=False,
    )
    read_bases = 'AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAACCACCATGGATTACAAGGATGACGACGATAAGAATTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTT'
    read = Sequence(name="test1", sequence=read_bases)

    found = tag.match_to(read)
    result = found.trimmed()

    assert result.sequence == 'AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAATTTTTTTTTTTTTTTTTTTTTTTTTTTT'
def _trim(self):
    """
    Compute the trimmed read and the base adjacent to the match.

    Sets ``self._trimmed_read`` to a new ``Sequence`` holding everything in
    the read from ``self.rstop`` onwards (qualities are sliced the same way
    when the read has any). Sets ``self.adjacent_base`` to the read base
    directly before ``self.rstart`` if that base is one of A, C, G, T, and
    to the empty string otherwise.
    """
    read = self.read

    # A variant that removes only the matched VBIM tag and keeps the read on
    # both sides of it would instead use
    #     read.sequence[:self.rstart] + read.sequence[self.rstop:]
    # and splice the qualities the same way.
    self._trimmed_read = Sequence(
        read.name,
        read.sequence[self.rstop:],
        qualities=read.qualities[self.rstop:] if read.qualities else None,
        second_header=read.second_header,
        match=self)

    # Slice rather than index: with rstart == 0 there is no preceding base,
    # and indexing with -1 would wrongly pick the last base of the read (or
    # raise IndexError on an empty read). The slice is '' in that case.
    adjacent_base = read.sequence[self.rstart - 1:self.rstart]
    if adjacent_base not in 'ACGT':
        adjacent_base = ''
    self.adjacent_base = adjacent_base
def test_too_many_qualities(self):
    """Construct a ``Sequence`` with five quality values for four bases.

    NOTE(review): presumably wrapped by a decorator that expects the
    constructor to raise - confirm against the enclosing class.
    """
    fields = dict(name="name", sequence="ACGT", qualities="#####")
    Sequence(**fields)
def test_write_zero_length_sequence(self):
    """Write a record with an empty sequence through ``FastaWriter``.

    The buffer must hold the header line followed by an empty line.
    """
    buffer = StringIO()
    with FastaWriter(buffer) as writer:
        writer.write(Sequence("name", ""))
        written = buffer.getvalue()
        assert written == '>name\n\n', '{0!r}'.format(
            buffer.getvalue())
import sys
import os
import shutil
from textwrap import dedent
from nose.tools import raises
from tempfile import mkdtemp
from cutadapt.seqio import (Sequence, ColorspaceSequence, FormatError,
    FastaReader, FastqReader, FastaQualReader, InterleavedSequenceReader,
    FastaWriter, FastqWriter, InterleavedSequenceWriter, open as openseq)
from cutadapt.compat import StringIO

# files tests/data/simple.fast{q,a}
# Records the tests expect to find in the simple FASTQ file.
simple_fastq = [
    Sequence("first_sequence", "SEQUENCE1", ":6;;8<=:<"),
    Sequence("second_sequence", "SEQUENCE2", "83<??:(61")
]

# Same records with the qualities replaced by None, for the FASTA file.
simple_fasta = [Sequence(x.name, x.sequence, None) for x in simple_fastq]


class TestSequence:
    """Constructor validation tests for the sequence record classes."""

    @raises(FormatError)
    def test_too_many_qualities(self):
        """Five quality values for four bases must raise ``FormatError``."""
        Sequence(name="name", sequence="ACGT", qualities="#####")

    @raises(FormatError)
    def test_too_many_qualities_colorspace(self):
        """Same check for ``ColorspaceSequence`` with sequence "T0123"."""
        ColorspaceSequence(name="name", sequence="T0123", qualities="#####")