def test_giuseppe_reads():
    'It splits some real reads'
    linker_fpath = os.path.join(TEST_DATA_DIR, 'linkers.fasta')
    # (reads file, names of the two mates that one of its reads must yield)
    cases = [
        ('454_reads.fastq', ('G109AZL01BJHT8\\1', 'G109AZL01BJHT8\\2')),
        ('454_reads2.fastq', ('G109AZL01D8U3X\\1', 'G109AZL01D8U3X\\2')),
    ]
    for reads_fname, mate_names in cases:
        seq_fpath = os.path.join(TEST_DATA_DIR, reads_fname)
        # Every reads file is checked twice: calling the splitter directly
        # on each packet, and running it through process_seq_packets.
        for use_process_seq_packets in (False, True):
            with open(linker_fpath) as linker_fhand:
                linkers = list(read_seqs([linker_fhand]))
            splitter = MatePairSplitter(linkers=linkers)

            # The packets are fully consumed inside the with block, so the
            # reads file can be closed as soon as new_seqs is built.
            with open(seq_fpath) as seq_fhand:
                seq_packets = read_seq_packets([seq_fhand], 2)
                if use_process_seq_packets:
                    seq_packets = process_seq_packets(seq_packets,
                                                      [splitter])[0]
                    new_seqs = [seq for packet in seq_packets
                                for seq in packet]
                else:
                    new_seqs = []
                    for packet in seq_packets:
                        new_seqs.extend(splitter(packet))

            seq_names = [get_name(seq) for seq in new_seqs]
            for mate_name in mate_names:
                assert mate_name in seq_names
            assert len(new_seqs) == 20
def test_blast_short_trimming(self):
    'Blast-short recommends a vector trim for the first two reads only'
    expected = [[(0, 29)], [(0, 29)], []]

    # Adaptors and reads wrapped as SEQRECORD
    adaptors = [
        SeqWrapper(SEQRECORD,
                   SeqRecord(Seq('AAGCAGTGGTATCAACGCAGAGTACATGGG')), None),
        SeqWrapper(SEQRECORD,
                   SeqRecord(Seq('AAGCAGTGGTATCAACGCAGAGTACTTTTT')), None),
    ]
    blast_trim = TrimWithBlastShort(oligos=adaptors)
    reads_fhand = StringIO(FASTQ4)
    seq_packets = read_seq_packets([reads_fhand],
                                   prefered_seq_classes=[SEQRECORD])
    first_packet = list(seq_to_trim_packets(seq_packets))[0]
    trimmed_packet = blast_trim(first_packet)
    # The first and the second reads should get a trimming recommendation.
    res = []
    for seqs in trimmed_packet[SEQS_PASSED]:
        for seq in seqs:
            recommendations = get_annotations(seq).get(
                TRIMMING_RECOMMENDATIONS, {})
            res.append(recommendations.get(VECTOR, []))
    assert res == expected

    # Same check with adaptors and reads wrapped as SEQITEM
    adaptors = [
        SeqWrapper(SEQITEM,
                   SeqItem('oligo1', ['>oligo1\n',
                                      'AAGCAGTGGTATCAACGCAGAGTACATGGG\n']),
                   'fasta'),
        SeqWrapper(SEQITEM,
                   SeqItem('oligo2', ['>oligo2\n',
                                      'AAGCAGTGGTATCAACGCAGAGTACTTTTT\n']),
                   'fasta'),
    ]
    blast_trim = TrimWithBlastShort(oligos=adaptors)
    reads_fhand = StringIO(FASTQ4)
    seq_packets = list(read_seq_packets([reads_fhand],
                                        prefered_seq_classes=[SEQITEM]))
    first_packet = list(seq_to_trim_packets(seq_packets))[0]
    trimmed_packet = blast_trim(first_packet)
    # The first and the second reads should get a trimming recommendation.
    res = []
    for seqs in trimmed_packet[SEQS_PASSED]:
        for seq in seqs:
            recommendations = get_annotations(seq).get(
                TRIMMING_RECOMMENDATIONS, {})
            res.append(recommendations.get(VECTOR, []))
    assert res == expected
def test_blast_short_trimming(self):
    'TrimWithBlastShort annotates the vector segment of the matching reads'

    def vector_segments(trim_packet):
        'Collect the VECTOR trimming recommendation of every passed seq'
        segments = []
        for seqs in trim_packet[SEQS_PASSED]:
            for seq in seqs:
                annots = get_annotations(seq)
                segments.append(
                    annots.get(TRIMMING_RECOMMENDATIONS, {}).get(VECTOR, []))
        return segments

    # With SeqRecords
    record1 = SeqRecord(Seq('AAGCAGTGGTATCAACGCAGAGTACATGGG'))
    record2 = SeqRecord(Seq('AAGCAGTGGTATCAACGCAGAGTACTTTTT'))
    adaptors = [SeqWrapper(SEQRECORD, record1, None),
                SeqWrapper(SEQRECORD, record2, None)]
    blast_trim = TrimWithBlastShort(oligos=adaptors)
    fastq_fhand = StringIO(FASTQ4)
    seq_packets = read_seq_packets([fastq_fhand],
                                   prefered_seq_classes=[SEQRECORD])
    trim_packets = list(seq_to_trim_packets(seq_packets))
    # Only the first and the second reads are expected to be trimmed.
    assert vector_segments(blast_trim(trim_packets[0])) == [
        [(0, 29)], [(0, 29)], []]

    # With SeqItems
    item1 = SeqItem('oligo1', ['>oligo1\n',
                               'AAGCAGTGGTATCAACGCAGAGTACATGGG\n'])
    item2 = SeqItem('oligo2', ['>oligo2\n',
                               'AAGCAGTGGTATCAACGCAGAGTACTTTTT\n'])
    adaptors = [SeqWrapper(SEQITEM, item1, 'fasta'),
                SeqWrapper(SEQITEM, item2, 'fasta')]
    blast_trim = TrimWithBlastShort(oligos=adaptors)
    fastq_fhand = StringIO(FASTQ4)
    seq_packets = list(read_seq_packets([fastq_fhand],
                                        prefered_seq_classes=[SEQITEM]))
    trim_packets = list(seq_to_trim_packets(seq_packets))
    # Only the first and the second reads are expected to be trimmed.
    assert vector_segments(blast_trim(trim_packets[0])) == [
        [(0, 29)], [(0, 29)], []]
def test_blast_short_trimming(self):
    'Blast-short annotates the vector segments to trim in each read'
    oligo1 = SeqRecord(Seq('AAGCAGTGGTATCAACGCAGAGTACATGGG'))
    oligo2 = SeqRecord(Seq('AAGCAGTGGTATCAACGCAGAGTACTTTTT'))
    blast_trim = TrimWithBlastShort(oligos=[oligo1, oligo2])

    reads_fhand = StringIO(FASTQ4)
    first_packet = list(read_seq_packets([reads_fhand]))[0]

    # The first and the second reads should be trimmed, the third one not.
    res = []
    for seq in blast_trim(first_packet):
        recommendations = seq.annotations.get(TRIMMING_RECOMMENDATIONS, {})
        res.append(recommendations.get(VECTOR, []))
    assert res == [[(0, 29)], [(0, 29)], []]
def test_split_mates(self):
    'It tests the detection of oligos in sequence files'
    mate_fhand = NamedTemporaryFile(suffix='.fasta')
    linker = TITANIUM_LINKER
    seq5 = 'CTAGTCTAGTCGTAGTCATGGCTGTAGTCTAGTCTACGATTCGTATCAGTTGTGTGAC'
    seq3 = 'ATCGATCATGTTGTATTGTGTACTATACACACACGTAGGTCGACTATCGTAGCTAGT'
    # a complete linker
    mate_fhand.write('>seq1\n' + seq5 + linker + seq3 + '\n')
    # no linker
    mate_fhand.write('>seq2\n' + seq5 + '\n')
    # a partial linker
    mate_fhand.write('>seq3\n' + seq5 + linker[2:25] + seq3 + '\n')
    # the linker is at the 5' end
    mate_fhand.write('>seq4\n' + linker[10:] + seq3 + '\n')
    # two linkers
    mate_fhand.write('>seq5\n' + linker + seq3 + FLX_LINKER + seq5 + '\n')
    mate_fhand.flush()

    splitter = MatePairSplitter()
    new_seqs = [splitter(packet)
                for packet in read_seq_packets([mate_fhand], 2)]
    out_fhand = StringIO()
    write_seq_packets(out_fhand, new_seqs, file_format='fasta')
    result = out_fhand.getvalue()

    # The mate names must end in a literal backslash followed by the mate
    # number.  In a plain literal '\1' and '\2' are octal escapes (chr(1)
    # and chr(2)), so those names are written as raw strings.
    xpect = r'>seq1\1' + '\n'
    xpect += 'CTAGTCTAGTCGTAGTCATGGCTGTAGTCTAGTCTACGATTCGTATCAGTTGTGTGAC\n'
    xpect += r'>seq1\2' + '\n'
    xpect += 'ATCGATCATGTTGTATTGTGTACTATACACACACGTAGGTCGACTATCGTAGCTAGT\n'
    xpect += '>seq2\n'
    xpect += 'CTAGTCTAGTCGTAGTCATGGCTGTAGTCTAGTCTACGATTCGTATCAGTTGTGTGAC\n'
    xpect += '>seq3_pl.part1\n'
    xpect += 'CTAGTCTAGTCGTAGTCATGGCTGTAGTCTAGTCTACGATTCGTATCAGTTGTGTG\n'
    xpect += '>seq3_pl.part2\n'
    xpect += 'GTGTACTATACACACACGTAGGTCGACTATCGTAGCTAGT\n'
    xpect += '>seq4\n'
    xpect += 'ATCGATCATGTTGTATTGTGTACTATACACACACGTAGGTCGACTATCGTAGCTAGT\n'
    xpect += '>seq5_mlc.part1\n'
    # NOTE(review): the seq5 records are expected with a line break after
    # 60 residues; presumably that is the fasta writer's line width.
    xpect += 'TCGTATAACTTCGTATAATGTATGCTATACGAAGTTATTACGATCGATCATGTTGTAT'
    xpect += 'TG\n'
    xpect += 'TGTACTATACACACACGTAGGTCGACTATCGTAGCTAGT\n'
    xpect += '>seq5_mlc.part2\n'
    xpect += 'ACCTAGTCTAGTCGTAGTCATGGCTGTAGTCTAGTCTACGATTCGTATCAGTTGTGTGAC'
    xpect += '\n'
    assert xpect == result
def test_filter_by_bowtie2():
    'FilterBowtie2Match passes no_arabi and filters out read1, 2 and 3'
    index_fpath = os.path.join(TEST_DATA_DIR, 'arabidopsis_genes')
    fastq_fpath = os.path.join(TEST_DATA_DIR, 'arabidopsis_reads.fastq')
    fasta_fpath = os.path.join(TEST_DATA_DIR, 'arabidopsis_reads.fasta')

    passed = ['no_arabi']
    filtered_out = ['read1', 'read2', 'read3']
    # The same result is expected for every seq class and reads file.
    for preferred_classes in [[SEQITEM], [SEQRECORD]]:
        for reads_fpath in [fastq_fpath, fasta_fpath]:
            # The with block closes the reads file once the first packet
            # has been read and filtered.
            with open(reads_fpath) as reads_fhand:
                seq_packets = read_seq_packets(
                    [reads_fhand], prefered_seq_classes=preferred_classes)
                filter_packets = seq_to_filterpackets(seq_packets)
                filter_ = FilterBowtie2Match(index_fpath)
                filter_packet = list(filter_packets)[0]
                filter_packets = filter_(filter_packet)
            assert _seqs_to_names(filter_packets[SEQS_PASSED]) == passed
            assert _seqs_to_names(
                filter_packets[SEQS_FILTERED_OUT]) == filtered_out
def test_trim_chimeric_region(self):
    'It recommends trimming the chimeric region of a mate pair read'
    index_fpath = os.path.join(TEST_DATA_DIR, 'ref_example.fasta')
    query1 = '@seq2 f\nGGGATCGCAGACCCATCTCGTCAGCATGTACCCTTGCTACATTGAACTT'
    query1 += 'AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA\n'
    query1 += '+\n$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$'
    query1 += '$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$\n'
    query2 = '@seq2 r\nCATCATTGCATAAGTAACACTCAACCAACAGTGCTACAGGGTTGTAACG\n'
    query2 += '+\n$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$\n'
    query = query1 + query2

    # Both the temporary file and the handle reopened on it are closed
    # deterministically when their with blocks end.
    with NamedTemporaryFile() as fhand:
        fhand.write(query)
        fhand.flush()
        trim_chimeras = TrimMatePairChimeras(index_fpath)
        with open(fhand.name) as reads_fhand:
            seq_packets = list(read_seq_packets([reads_fhand]))
        trim_packets = list(seq_to_trim_packets(seq_packets))
        trim_packets2 = trim_chimeras(trim_packets[0])

    # Only the first read should get a recommendation, (49, 105);
    # the second read should get none.
    res = [get_annotations(s).get(TRIMMING_RECOMMENDATIONS, {}).get(OTHER, [])
           for l in trim_packets2[SEQS_PASSED] for s in l]
    assert res == [[(49, 105)], []]
def test_trim_chimeric_region(self):
    'It recommends trimming the chimeric region of a mate pair read'
    index_fpath = os.path.join(TEST_DATA_DIR, 'ref_example.fasta')
    query1 = '@seq2 f\nGGGATCGCAGACCCATCTCGTCAGCATGTACCCTTGCTACATTGAACTT'
    query1 += 'AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA\n'
    query1 += '+\n$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$'
    query1 += '$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$\n'
    query2 = '@seq2 r\nCATCATTGCATAAGTAACACTCAACCAACAGTGCTACAGGGTTGTAACG\n'
    query2 += '+\n$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$\n'
    query = query1 + query2

    # The temporary reads file and the handle reopened on it are released
    # by their with blocks instead of being left to the garbage collector.
    with NamedTemporaryFile() as fhand:
        fhand.write(query)
        fhand.flush()
        trim_chimeras = TrimMatePairChimeras(index_fpath)
        with open(fhand.name) as reads_fhand:
            seq_packets = list(read_seq_packets([reads_fhand]))
        trim_packets = list(seq_to_trim_packets(seq_packets))
        trim_packets2 = trim_chimeras(trim_packets[0])

    # The first read is expected to get the (49, 105) recommendation and
    # the second read none.
    res = [
        get_annotations(s).get(TRIMMING_RECOMMENDATIONS, {}).get(OTHER, [])
        for l in trim_packets2[SEQS_PASSED] for s in l
    ]
    assert res == [[(49, 105)], []]
def test_split_mates(self):
    'The mate pair splitter cuts the reads where it finds a linker'
    seq5 = 'CTAGTCTAGTCGTAGTCATGGCTGTAGTCTAGTCTACGATTCGTATCAGTTGTGTGAC'
    seq3 = 'ATCGATCATGTTGTATTGTGTACTATACACACACGTAGGTCGACTATCGTAGCTAGT'
    linker = TITANIUM_LINKER

    fasta_fhand = NamedTemporaryFile(suffix='.fasta')
    # seq1: a complete linker
    fasta_fhand.write('>seq1\n' + seq5 + linker + seq3 + '\n')
    # seq2: no linker
    fasta_fhand.write('>seq2\n' + seq5 + '\n')
    # seq3: a partial linker
    fasta_fhand.write('>seq3\n' + seq5 + linker[2:25] + seq3 + '\n')
    # seq4: the linker is at the 5' end
    fasta_fhand.write('>seq4\n' + linker[10:] + seq3 + '\n')
    # seq5: two linkers
    fasta_fhand.write('>seq5\n' + linker + seq3 + FLX_LINKER + seq5 + '\n')
    # seq6: reverse linker
    rev_linker = get_setting('TITANIUM_LINKER_REV')
    fasta_fhand.write('>seq6\n' + seq5 + rev_linker + seq3 + '\n')
    fasta_fhand.flush()

    splitter = MatePairSplitter()
    split_packets = [splitter(packet)
                     for packet in read_seq_packets([fasta_fhand], 2)]
    out_fhand = StringIO()
    write_seq_packets(out_fhand, split_packets, file_format='fasta')
    result = out_fhand.getvalue()

    # Raw strings keep the backslash of the mate names literal.
    xpect = ''.join([
        r'>seq1\1', '\n',
        'CTAGTCTAGTCGTAGTCATGGCTGTAGTCTAGTCTACGATTCGTATCAGTTGTGTGAC\n',
        r'>seq1\2', '\n',
        'ATCGATCATGTTGTATTGTGTACTATACACACACGTAGGTCGACTATCGTAGCTAGT\n',
        '>seq2\n',
        'CTAGTCTAGTCGTAGTCATGGCTGTAGTCTAGTCTACGATTCGTATCAGTTGTGTGAC\n',
        '>seq3_pl.part1\n',
        'CTAGTCTAGTCGTAGTCATGGCTGTAGTCTAGTCTACGATTCGTATCAGTTGTGTG\n',
        '>seq3_pl.part2\n',
        'GTGTACTATACACACACGTAGGTCGACTATCGTAGCTAGT\n',
        '>seq4\n',
        'ATCGATCATGTTGTATTGTGTACTATACACACACGTAGGTCGACTATCGTAGCTAGT\n',
        '>seq5_mlc.part1\n',
        'TCGTATAACTTCGTATAATGTATGCTATACGAAGTTATTACGATCGATCATGTTGTAT',
        'TG',
        'TGTACTATACACACACGTAGGTCGACTATCGTAGCTAGT\n',
        '>seq5_mlc.part2\n',
        'ACCTAGTCTAGTCGTAGTCATGGCTGTAGTCTAGTCTACGATTCGTATCAGTTGTGTGAC',
        '\n',
        r'>seq6\1', '\n',
        'CTAGTCTAGTCGTAGTCATGGCTGTAGTCTAGTCTACGATTCGTATCAGTTGTGTGAC\n',
        r'>seq6\2', '\n',
        'ATCGATCATGTTGTATTGTGTACTATACACACACGTAGGTCGACTATCGTAGCTAGT\n',
    ])
    assert xpect == result

    # with a short linker at the 3' end
    fasta_fhand = NamedTemporaryFile(suffix='.fasta')
    short_read = ">seq1\nCATCAATGACATCACAAATGACATCAACAAACTCAAA"
    short_read += "CTCACATACACTGCTGTACCGTAC"
    fasta_fhand.write(short_read)
    fasta_fhand.flush()

    splitter = MatePairSplitter()
    split_packets = [splitter(packet)
                     for packet in read_seq_packets([fasta_fhand], 1)]
    out_fhand = StringIO()
    write_seq_packets(out_fhand, split_packets, file_format='fasta')
    expected_short = ">seq1\nCATCAATGACATCACAAATGACATCAACAAACTCAAACTCACATACA\n"
    assert expected_short == out_fhand.getvalue()