def test_min_cutter_T_T():
    """Exercise ``MinCutter((5, -5), True, True)`` around unconditional clipping.

    Three situations are checked in turn: the read without adapter trimming,
    the read after an adapter match has been attached via ``front_match``,
    and the read after an additional unconditional clip.
    """
    clip_before = UnconditionalCutter((2, -2))
    clip_after = UnconditionalCutter((1, -1))
    min_trimmer = MinCutter((5, -5), True, True)

    read1 = Sequence('read1', "CAATCGATCGAACGTACCGAT")
    assert read1.clipped == [0, 0, 0, 0], str(read1.clipped)

    read1 = clip_before(read1)
    assert read1.sequence == "ATCGATCGAACGTACCG"
    assert read1.clipped == [2, 2, 0, 0], str(read1.clipped)

    # Without adapter trimming the sequence comes back unchanged.
    untrimmed = min_trimmer(read1)
    assert untrimmed.sequence == "ATCGATCGAACGTACCG"

    # With adapter trimming: shorten a copy and attach a front match to it.
    read2 = read1[:]
    read2.sequence = "ATCGAACGTACCG"
    adapter_match, adapter_match_info = front_match(read2)
    read2.match = adapter_match
    read2.match_info = adapter_match_info
    read3 = min_trimmer(read2)
    assert read3.sequence == "TCGAACGTACCG", read3.sequence
    assert read3.clipped == [2, 2, 1, 0]

    # With subsequent clipping: the minimum trimmer then leaves the read as is.
    read4 = clip_after(read2)
    assert read4.sequence == "TCGAACGTACC", read4.sequence
    assert read4.clipped == [2, 2, 1, 1], read4.clipped
    read5 = min_trimmer(read4)
    assert read5.sequence == "TCGAACGTACC", read5.sequence
    assert read5.clipped == [2, 2, 1, 1], read5.clipped
def test_ncontentfilter_paired():
    """Check ``NContentFilter`` through ``SingleWrapper`` and ``PairedWrapper``."""
    # (read 1 sequence, read 2 sequence, count for NContentFilter, expected
    # result of the paired wrapper)
    cases = [
        ('AAA', 'AAA', 0, KEEP),
        ('AAAN', 'AAA', 0, DISCARD),
        ('AAA', 'AANA', 0, DISCARD),
        ('ANAA', 'AANA', 1, KEEP),
    ]
    for bases1, bases2, count, expected in cases:
        n_filter = NContentFilter(count=count)
        legacy_filter = SingleWrapper(n_filter)
        paired_filter = PairedWrapper(n_filter)
        read1 = Sequence('read1', bases1, qualities='#' * len(bases1))
        read2 = Sequence('read1', bases2, qualities='#' * len(bases2))
        # The single wrapper must agree with the bare filter applied to read 1.
        assert legacy_filter(read1, read2) == n_filter(read1)
        # discard entire pair if one of the reads fulfills criteria
        assert paired_filter(read1, read2) == expected
def test(self):
    """Format two read pairs with ``InterleavedFormatter`` and check output and counters."""
    pairs = [
        (
            Sequence('A/1 comment', 'TTA', '##H'),
            Sequence('A/2 comment', 'GCT', 'HH#'),
        ),
        (
            Sequence('B/1', 'CC', 'HH'),
            Sequence('B/2', 'TG', '#H'),
        ),
    ]
    formatter = InterleavedFormatter(FastqFormat(), "foo")
    # The formatter is handed this mapping on every call; the formatted
    # records are read back from its "foo" entry below.
    output = defaultdict(list)
    for mate1, mate2 in pairs:
        formatter.format(output, mate1, mate2)

    assert formatter.written == 2
    assert formatter.read1_bp == 5
    assert formatter.read2_bp == 5
    assert "foo" in output
    expected = '@A/1 comment\nTTA\n+\n##H\n@A/2 comment\nGCT\n+\nHH#\n@B/1\nCC\n+\nHH\n@B/2\nTG\n+\n#H\n'
    assert "".join(output["foo"]) == expected
def test_unconditional_cutter():
    """Check ``UnconditionalCutter`` for positive, negative and oversized lengths."""
    # NOTE(review): this instance is never used below; the call is kept so
    # that the constructor still runs exactly as before.
    uc = UnconditionalCutter(lengths=[5])
    s = Sequence("read1", 'abcdefg')
    # (length passed to the cutter, sequence expected afterwards)
    expectations = (
        (2, 'cdefg'),
        (-2, 'abcde'),
        (100, ''),
        (-100, ''),
    )
    for length, remaining in expectations:
        assert UnconditionalCutter(lengths=[length])(s).sequence == remaining
def test_linked_adapter():
    """Match a ``LinkedAdapter`` against a read and check the trimmed result."""
    adapter = LinkedAdapter('AAAA', 'TTTT')
    read = Sequence(name='seq', sequence='AAAACCCCCTTTT')
    trimmed = adapter.trimmed(adapter.match_to(read))
    assert trimmed.name == 'seq'
    assert trimmed.sequence == 'CCCCC'
def test_error_correction_no_insert_match_one_adapter_match():
    """Check ``InsertAdapterCutter`` error correction with a single usable adapter.

    Each read carries one substitution relative to ``frag`` and a raised
    quality at the position facing the other read's substitution. The test
    expects both trimmed reads to equal ``correct_frag`` (or its reverse
    complement) and ``insert_overlap`` to be falsy on both.
    """
    a1 = 'AGATCGGAAGAGCGTCGTGTAGGGAAAGAGTGTAGATCTC'
    a2 = 'AGATCGGAAGAGCACACGTCTGAACTCCAGTCACGAGTTA'
    a2_mod = 'ACATCGGAAGAGCACACGTCTGAACTCCAGTCACGAGTTA'
    frag = 'CCAAGCAGACATTCACTCAGATTGCA'
    correct_frag = 'CCAAGTAGACATTCGCTCAGATTGCA'

    # Read 1: C>T at pos 6 (index 5); quality of read1 > quality of read2 there.
    r1 = (frag[:5] + 'T' + frag[6:] + a1)[0:40]
    q1 = '#' * 5 + 'A' + '#' * 34

    # Read 2: A>G at pos 15 (index 14) of the fragment. The raised quality
    # goes at index len(frag) - 15 = 11, which is where read 1's substitution
    # (fragment index 5) lands after reverse complementing.
    high_q2 = len(frag) - 15
    r2_forward = frag[:14] + 'G' + frag[15:]
    r2 = reverse_complement(reverse_complement(a2_mod) + r2_forward)[0:40]
    q2 = '#' * high_q2 + 'A' + '#' * (39 - high_q2)

    read1 = Sequence('foo', r1, q1)
    read2 = Sequence('foo', r2, q2)

    parser1 = AdapterParser()
    adapter1 = parser1.parse(a1)
    # Allow zero mismatches to prevent adapter alignment
    parser2 = AdapterParser(max_error_rate=0)
    adapter2 = parser2.parse(a2)

    # Allow zero mismatches to prevent insert alignment
    cutter = InsertAdapterCutter(
        adapter1,
        adapter2,
        mismatch_action='liberal',
        max_insert_mismatch_frac=0,
    )
    new_read1, new_read2 = cutter(read1, read2)

    assert len(new_read1) == 26
    assert not new_read1.insert_overlap
    assert new_read1.sequence == correct_frag
    assert len(new_read2) == 26
    assert not new_read2.insert_overlap
    assert new_read2.sequence == reverse_complement(correct_frag)
def test_ncontentfilter():
    """Check ``NContentFilter`` against several sequences and ``count`` values."""
    # (sequence, count passed to NContentFilter, expected filter result)
    cases = [
        ('AAA', 0, KEEP),
        ('AAA', 1, KEEP),
        ('AAACCTTGGN', 1, KEEP),
        ('AAACNNNCTTGGN', 0.5, KEEP),
        ('NNNNNN', 1, DISCARD),
        ('ANAAAA', 1 / 6, KEEP),
        ('ANAAAA', 0, DISCARD),
    ]
    for bases, count, expected in cases:
        n_filter = NContentFilter(count=count)
        read = Sequence('read1', bases, qualities='#' * len(bases))
        assert n_filter(read) == expected
def test_statistics():
    """Check that the adapters' recorded trimmed bases never exceed the read length."""
    read = Sequence('name', 'AAAACCCCAAAA')
    adapters = [Adapter('CCCC', BACK, 0.1)]
    cutter = AdapterCutter(adapters, times=3)
    # The trimmed read itself is not inspected; presumably this call is what
    # fills the adapters' length tables read below (verify against Adapter).
    cutter(read)

    # TODO make this a lot simpler
    trimmed_bp = 0
    for adapter in adapters:
        # Each table maps a length to the number of times it was recorded.
        trimmed_bp += sum(
            length * times for length, times in adapter.lengths_front.items()
        )
        trimmed_bp += sum(
            length * times for length, times in adapter.lengths_back.items()
        )
    assert trimmed_bp <= len(read), trimmed_bp
def test_min_cutter_T_F():
    """Exercise ``MinCutter((4, -4), True, False)`` on an unconditionally clipped read."""
    clip_before = UnconditionalCutter((2, -2))
    min_trimmer = MinCutter((4, -4), True, False)

    clipped_read = clip_before(Sequence('read1', "CAATCGATCGAACGTACCGAT"))
    assert clipped_read.sequence == "ATCGATCGAACGTACCG"
    assert clipped_read.clipped == [2, 2, 0, 0]

    # test without adapter trimming
    trimmed = min_trimmer(clipped_read)
    assert trimmed.sequence == "CGATCGAACGTAC"
def test_Modifiers_single():
    """Check ``Modifiers(paired=False)`` registration and single-end modification."""
    modifiers = Modifiers(paired=False)
    modifiers.add_modifier(UnconditionalCutter, lengths=[5])

    read1_mods = modifiers.get_modifiers(read=1)
    read2_mods = modifiers.get_modifiers(read=2)
    assert len(read1_mods) == 1
    assert isinstance(read1_mods[0], UnconditionalCutter)
    assert len(read2_mods) == 0

    # test single-end
    read = Sequence('read1', 'ACGTTTACGTA', '##456789###')
    modified_reads, _modified_bp = modifiers.modify(read)
    assert modified_reads[0].sequence == 'TACGTA'
def test_overwrite_read():
    """Check ``OverwriteRead(20, 40, 10)`` for three combinations of mean qualities.

    Read 1 is expected to be replaced by the reverse complement of read 2
    only in the middle case (mean lowq < 20 and mean highq > 40).
    """
    overwrite = OverwriteRead(20, 40, 10)
    lowseq = 'ACGT' * 5
    highseq = 'TCAG' * 5

    # mean lowq > 20, mean highq > 40
    lowq = (11, 31, 16, 24, 16, 20, 17, 19, 21, 28) * 2
    highq = (22, 62, 32, 48, 32, 40, 34, 38, 42, 56) * 2
    read1 = Sequence('foo', lowseq, ints2quals(lowq))
    read2 = Sequence('foo', highseq, ints2quals(highq))
    out1, out2 = overwrite(read1, read2)
    assert out1.sequence == lowseq
    assert out1.qualities == ints2quals(lowq)
    assert out2.sequence == highseq
    assert out2.qualities == ints2quals(highq)
    assert out1.corrected == out2.corrected == 0

    # mean lowq < 20, mean highq > 40
    lowq = tuple(qual - 1 for qual in lowq)
    read1 = Sequence('foo', lowseq, ints2quals(lowq))
    out1, out2 = overwrite(read1, read2)
    assert out1.sequence == rc(highseq)
    assert out1.qualities == ints2quals(reversed(highq))
    assert out2.sequence == highseq
    assert out2.qualities == ints2quals(highq)
    assert out1.corrected == out2.corrected == 1

    # mean lowq < 20, mean highq < 40
    highq = tuple(qual - 1 for qual in highq)
    read2 = Sequence('foo', highseq, ints2quals(highq))
    out1, out2 = overwrite(read1, read2)
    assert out1.sequence == lowseq
    assert out1.qualities == ints2quals(lowq)
    assert out2.sequence == highseq
    assert out2.qualities == ints2quals(highq)
    assert out1.corrected == out2.corrected == 0
def test_issue_52():
    """Check ``Match.wildcards()`` for a hand-built match against a wildcard adapter."""
    adapter = Adapter(
        sequence='GAACTCCAGTCACNNNNN',
        where=BACK,
        max_error_rate=0.12,
        min_overlap=5,
        read_wildcards=False,
        adapter_wildcards=True,
    )
    read = Sequence(name="abc", sequence='CCCCAGAACTACAGTCCCGGC')
    # The match is constructed directly rather than produced by the aligner.
    am = Match(
        astart=0,
        astop=17,
        rstart=5,
        rstop=21,
        matches=15,
        errors=2,
        front=None,
        adapter=adapter,
        read=read,
    )
    assert am.wildcards() == 'GGC'
def test_issue_80():
    """Check the alignment reported for the adapter and read from issue 80."""
    # This issue turned out to not be an actual issue with the alignment
    # algorithm. The following alignment is found because it has more matches
    # than the 'obvious' one:
    #
    #   TCGTATGCCGTCTTC
    #   =========X==XX=
    #   TCGTATGCCCTC--C
    #
    # This is correct, albeit a little surprising, since an alignment without
    # indels would have only two errors.
    adapter = Adapter(
        sequence="TCGTATGCCGTCTTC",
        where=BACK,
        max_error_rate=0.2,
        min_overlap=3,
        read_wildcards=False,
        adapter_wildcards=False,
    )
    read = Sequence(name="seq2", sequence="TCGTATGCCCTCC")
    result = adapter.match_to(read)

    # Every assertion reports the match object on failure.
    assert read.original_length == 13, result
    assert result.errors == 3, result
    assert result.astart == 0, result
    assert result.astop == 15, result
import sys
import os
from io import StringIO
import shutil
from textwrap import dedent
from tempfile import mkdtemp
from atropos.seqio import (
    Sequence,
    ColorspaceSequence,
    FormatError,
    FastaReader,
    FastqReader,
    FastaQualReader,
    InterleavedSequenceReader,
    FastaFormat,
    FastqFormat,
    InterleavedFormatter,
    get_format,
    open_reader as openseq,
    sequence_names_match,
    open_output,
)
# NOTE(review): `open_output` is imported from both atropos.seqio and
# atropos.xopen; the later import below is the binding that remains in effect.
from atropos.xopen import xopen, open_output
from .utils import temporary_path

# files tests/data/simple.fast{q,a}
simple_fastq = [
    Sequence("first_sequence", "SEQUENCE1", ":6;;8<=:<"),
    Sequence("second_sequence", "SEQUENCE2", "83<??:(61"),
]

# Same records as simple_fastq, with qualities set to None.
simple_fasta = [
    Sequence(record.name, record.sequence, None) for record in simple_fastq
]


class TestSequence:
    """Construction checks for ``Sequence`` and ``ColorspaceSequence``."""

    # NOTE(review): `raises` has no visible import in this part of the file;
    # presumably it is `pytest.raises` -- confirm it is in scope.

    def test_too_many_qualities(self):
        """Five quality characters for four bases must raise ``FormatError``."""
        with raises(FormatError):
            Sequence(name="name", sequence="ACGT", qualities="#####")

    def test_too_many_qualities_colorspace(self):
        """Same check for ``ColorspaceSequence`` with sequence ``T0123``."""
        with raises(FormatError):
            ColorspaceSequence(name="name", sequence="T0123", qualities="#####")
def test_str():
    """Call ``str()`` on an adapter, on a match and on a colorspace adapter.

    No return value is inspected; the test only requires that the
    conversions do not raise.
    """
    adapter = Adapter('ACGT', where=BACK, max_error_rate=0.1)
    str(adapter)

    match = adapter.match_to(Sequence(name='seq', sequence='TTACGT'))
    str(match)

    colorspace_adapter = ColorspaceAdapter('0123', where=BACK, max_error_rate=0.1)
    str(colorspace_adapter)
def test_overlapping():
    """Exercise ``MergeOverlapping`` on adapter-trimmed read pairs.

    Scenarios, in order: complete overlap, partial overlap after shortening
    read 2, partial overlap after shortening both reads, two mismatches
    (still merged) and three mismatches (not merged).
    """
    trimmer = MergeOverlapping(min_overlap=10, error_rate=0.1)
    a1 = 'AGATCGGAAGAGCGTCGTGTAGGGAAAGAGTGTAGATCTC'
    a2 = reverse_complement('AGATCGGAAGAGCACACGTCTGAACTCCAGTCACGAGTTA')
    frag = 'CCAAGCAGACATTCACTCAGATTGCA'
    r1 = (frag + a1)[0:40]
    q1 = '#' * 40
    r2 = reverse_complement(a2 + frag)[0:40]
    q2 = '!' * 40

    parser = AdapterParser()
    adapter1 = parser.parse(a1)
    adapter2 = parser.parse(a2)
    cutter = AdapterCutter([adapter1, adapter2])

    # Adapter trimming is expected to leave 26 bases on each read.
    read1 = cutter(Sequence('foo', r1, q1))
    assert len(read1) == 26
    read2 = cutter(Sequence('foo', r2, q2))
    assert len(read2) == 26

    # complete overlap
    merged1, merged2 = trimmer(read1, read2)
    assert merged1.merged
    assert merged2 is None
    assert read1 == merged1

    # partial overlap
    read1.merged = False
    read2 = read2.subseq(0, 24)[2]
    merged1, merged2 = trimmer(read1, read2)
    assert merged1.merged
    assert merged2 is None
    assert read1 == merged1

    # partial overlap r1, r2
    read1.merged = False
    read1 = read1.subseq(0, 24)[2]
    merged1, merged2 = trimmer(read1, read2)
    assert merged1.merged
    assert merged2 is None
    assert len(merged1) == 26
    assert merged1.sequence == 'CCAAGCAGACATTCACTCAGATTGCA'
    assert merged1.qualities == ('#' * 24) + ('!' * 2)

    # errors
    # round(0.1 * 24) = 2, so 2 errors should pass but 3 should not
    read1.merged = False
    r1_seq = list(read1.sequence)
    for position in (10, 20):
        r1_seq[position] = reverse_complement(r1_seq[position])
    read1.sequence = "".join(r1_seq)
    merged1, merged2 = trimmer(read1, read2)
    assert merged1.merged
    assert merged2 is None
    assert len(merged1) == 26
    assert merged1.sequence == 'CCAAGCAGACTTTCACTCAGTTTGCA'
    assert merged1.qualities == ('#' * 24) + ('!' * 2)

    # too few overlapping bases (a third base is complemented here)
    read1.merged = False
    r1_seq[15] = reverse_complement(r1_seq[15])
    read1.sequence = "".join(r1_seq)
    merged1, merged2 = trimmer(read1, read2)
    assert merged1.merged is False
    # NOTE(review): this checks the input `read2`, not the second value
    # returned by the trimmer -- possibly that value was intended; confirm.
    assert read2 is not None
def match(name1, name2):
    """Return ``sequence_names_match`` for two fixed reads named *name1* and *name2*."""
    first = Sequence(name1, 'ACGT')
    second = Sequence(name2, 'AACC')
    names_match = sequence_names_match(first, second)
    return names_match
def test_too_many_qualities(self):
    """Five quality characters for a four-base sequence must raise ``FormatError``."""
    sequence_kwargs = dict(name="name", sequence="ACGT", qualities="#####")
    with raises(FormatError):
        Sequence(**sequence_kwargs)
def test_TruSeq_trimmer():
    """Check the sequence returned by ``TruSeqBisulfiteTrimmer`` for one read."""
    trimmer = TruSeqBisulfiteTrimmer()
    read = Sequence('read1', "CTATCGATCCACGAGACTAAC")
    trimmed = trimmer(read)
    assert trimmed.sequence == "ATCCACGAGACTAAC"