Ejemplo n.º 1
0
    def test_giuseppe_reads():
        """It splits some real reads.

        Two read files are checked. Each of them is split twice: once calling
        the MatePairSplitter directly on every packet and once running the
        splitter through process_seq_packets. In every case 20 sequences are
        expected, among them the two mates of a known read.
        """
        linker_fpath = os.path.join(TEST_DATA_DIR, 'linkers.fasta')
        # (reads file, name of a read that has to be split into two mates)
        cases = [('454_reads.fastq', 'G109AZL01BJHT8'),
                 ('454_reads2.fastq', 'G109AZL01D8U3X')]

        for reads_fname, read_name in cases:
            seq_fpath = os.path.join(TEST_DATA_DIR, reads_fname)
            mate1_name = read_name + '\\1'
            mate2_name = read_name + '\\2'

            # Calling the splitter directly on every packet.
            # The files are opened with "with" so they are closed once the
            # check is done; everything that may read from them stays
            # inside the blocks.
            with open(linker_fpath) as linker_fhand:
                with open(seq_fpath) as seq_fhand:
                    linkers = list(read_seqs([linker_fhand]))
                    splitter = MatePairSplitter(linkers=linkers)
                    new_seqs = []
                    for packet in read_seq_packets([seq_fhand], 2):
                        new_seqs.extend(splitter(packet))
                    seq_names = [get_name(seq) for seq in new_seqs]
            assert mate1_name in seq_names
            assert mate2_name in seq_names
            assert len(new_seqs) == 20

            # test with process_seq_packet
            with open(linker_fpath) as linker_fhand:
                with open(seq_fpath) as seq_fhand:
                    linkers = list(read_seqs([linker_fhand]))
                    splitter = MatePairSplitter(linkers=linkers)
                    seq_packets = read_seq_packets([seq_fhand], 2)
                    seq_packets = process_seq_packets(seq_packets,
                                                      [splitter])[0]
                    # the packets are consumed here, while the reads file
                    # is still open
                    new_seqs = [seq for l in list(seq_packets) for seq in l]
                    seq_names = [get_name(seq) for seq in new_seqs]
            assert mate1_name in seq_names
            assert mate2_name in seq_names
            assert len(new_seqs) == 20
Ejemplo n.º 2
0
    def test_giuseppe_reads():
        """It splits some real reads.

        For each of the two read files the mates are obtained in two ways
        (splitter called packet by packet, and splitter run by
        process_seq_packets). Both ways have to yield 20 sequences that
        include the two mates of a known read.
        """
        linker_fpath = os.path.join(TEST_DATA_DIR, 'linkers.fasta')

        def names_splitting_directly(linker_fhand, seq_fhand):
            # Names of the sequences produced calling the splitter on
            # every packet of two reads.
            linkers = list(read_seqs([linker_fhand]))
            splitter = MatePairSplitter(linkers=linkers)
            new_seqs = []
            for packet in read_seq_packets([seq_fhand], 2):
                new_seqs.extend(splitter(packet))
            return [get_name(seq) for seq in new_seqs]

        def names_with_process_seq_packets(linker_fhand, seq_fhand):
            # Names of the sequences produced running the splitter with
            # process_seq_packets.
            linkers = list(read_seqs([linker_fhand]))
            splitter = MatePairSplitter(linkers=linkers)
            seq_packets = read_seq_packets([seq_fhand], 2)
            seq_packets = process_seq_packets(seq_packets, [splitter])[0]
            new_seqs = [seq for l in list(seq_packets) for seq in l]
            return [get_name(seq) for seq in new_seqs]

        checks = [('454_reads.fastq', 'G109AZL01BJHT8'),
                  ('454_reads2.fastq', 'G109AZL01D8U3X')]
        ways_to_split = [names_splitting_directly,
                         names_with_process_seq_packets]

        for reads_fname, read_name in checks:
            seq_fpath = os.path.join(TEST_DATA_DIR, reads_fname)
            for get_split_names in ways_to_split:
                # Both files are closed when the check is finished; before
                # this change the handles were left open.
                with open(linker_fpath) as linker_fhand:
                    with open(seq_fpath) as seq_fhand:
                        seq_names = get_split_names(linker_fhand, seq_fhand)
                assert read_name + '\\1' in seq_names
                assert read_name + '\\2' in seq_names
                # one name per sequence, so this is the number of sequences
                assert len(seq_names) == 20
Ejemplo n.º 3
0
    def test_blast_short_trimming(self):
        """It trims oligos using blast-short.

        The check is done twice: with the oligos and the reads handled as
        SeqRecords and handled as SeqItems.
        """

        def vector_trims(trim_packet):
            # VECTOR trimming recommendations ([] when there are none) of
            # every sequence found in trim_packet[SEQS_PASSED].
            trims = []
            for seq_group in trim_packet[SEQS_PASSED]:
                for seq in seq_group:
                    recommendations = get_annotations(seq).get(
                        TRIMMING_RECOMMENDATIONS, {})
                    trims.append(recommendations.get(VECTOR, []))
            return trims

        # It should trim the first and the second reads.
        expected = [[(0, 29)], [(0, 29)], []]

        # With SeqRecords
        seqrecord_adaptors = [
            SeqWrapper(SEQRECORD, SeqRecord(Seq(oligo)), None)
            for oligo in ('AAGCAGTGGTATCAACGCAGAGTACATGGG',
                          'AAGCAGTGGTATCAACGCAGAGTACTTTTT')
        ]
        trimmer = TrimWithBlastShort(oligos=seqrecord_adaptors)
        reads = read_seq_packets([StringIO(FASTQ4)],
                                 prefered_seq_classes=[SEQRECORD])
        first_trim_packet = list(seq_to_trim_packets(reads))[0]
        assert vector_trims(trimmer(first_trim_packet)) == expected

        # With SeqItems
        seqitem_adaptors = [
            SeqWrapper(
                SEQITEM,
                SeqItem('oligo1',
                        ['>oligo1\n', 'AAGCAGTGGTATCAACGCAGAGTACATGGG\n']),
                'fasta'),
            SeqWrapper(
                SEQITEM,
                SeqItem('oligo2',
                        ['>oligo2\n', 'AAGCAGTGGTATCAACGCAGAGTACTTTTT\n']),
                'fasta'),
        ]
        trimmer = TrimWithBlastShort(oligos=seqitem_adaptors)
        reads = list(read_seq_packets([StringIO(FASTQ4)],
                                      prefered_seq_classes=[SEQITEM]))
        first_trim_packet = list(seq_to_trim_packets(reads))[0]
        assert vector_trims(trimmer(first_trim_packet)) == expected
Ejemplo n.º 4
0
    def test_blast_short_trimming(self):
        """Check the blast-short oligo trimming with SeqRecords and SeqItems.

        For the reads in FASTQ4 the VECTOR trimming recommendation has to be
        [(0, 29)] for the first two reads and empty for the third one.
        """
        # With SeqRecords
        record1 = SeqRecord(Seq('AAGCAGTGGTATCAACGCAGAGTACATGGG'))
        record2 = SeqRecord(Seq('AAGCAGTGGTATCAACGCAGAGTACTTTTT'))
        adaptors = [SeqWrapper(SEQRECORD, record1, None),
                    SeqWrapper(SEQRECORD, record2, None)]

        blast_trim = TrimWithBlastShort(oligos=adaptors)
        reads_fhand = StringIO(FASTQ4)
        packets = read_seq_packets([reads_fhand],
                                   prefered_seq_classes=[SEQRECORD])
        packets_to_trim = list(seq_to_trim_packets(packets))
        trimmed_packet = blast_trim(packets_to_trim[0])

        # It should trim the first and the second reads.
        vector_segments = []
        for seqs in trimmed_packet[SEQS_PASSED]:
            for seq in seqs:
                annots = get_annotations(seq)
                recommended = annots.get(TRIMMING_RECOMMENDATIONS, {})
                vector_segments.append(recommended.get(VECTOR, []))
        assert vector_segments == [[(0, 29)], [(0, 29)], []]

        # With SeqItems
        item1 = SeqItem('oligo1', ['>oligo1\n',
                                   'AAGCAGTGGTATCAACGCAGAGTACATGGG\n'])
        item2 = SeqItem('oligo2', ['>oligo2\n',
                                   'AAGCAGTGGTATCAACGCAGAGTACTTTTT\n'])
        adaptors = [SeqWrapper(SEQITEM, item1, 'fasta'),
                    SeqWrapper(SEQITEM, item2, 'fasta')]

        blast_trim = TrimWithBlastShort(oligos=adaptors)
        reads_fhand = StringIO(FASTQ4)
        packets = list(read_seq_packets([reads_fhand],
                                        prefered_seq_classes=[SEQITEM]))
        packets_to_trim = list(seq_to_trim_packets(packets))
        trimmed_packet = blast_trim(packets_to_trim[0])

        # It should trim the first and the second reads.
        vector_segments = []
        for seqs in trimmed_packet[SEQS_PASSED]:
            for seq in seqs:
                annots = get_annotations(seq)
                recommended = annots.get(TRIMMING_RECOMMENDATIONS, {})
                vector_segments.append(recommended.get(VECTOR, []))
        assert vector_segments == [[(0, 29)], [(0, 29)], []]
Ejemplo n.º 5
0
    def test_blast_short_trimming(self):
        """It trims oligos using blast-short.

        The first packet read from FASTQ4 is passed to the trimmer and the
        VECTOR trimming recommendations stored in the annotations of the
        returned sequences are compared with the expected ones.
        """
        oligos = [SeqRecord(Seq(oligo))
                  for oligo in ('AAGCAGTGGTATCAACGCAGAGTACATGGG',
                                'AAGCAGTGGTATCAACGCAGAGTACTTTTT')]

        trimmer = TrimWithBlastShort(oligos=oligos)
        packets = list(read_seq_packets([StringIO(FASTQ4)]))

        # It should trim the first and the second reads.
        vector_segments = []
        for seq in trimmer(packets[0]):
            recommended = seq.annotations.get(TRIMMING_RECOMMENDATIONS, {})
            vector_segments.append(recommended.get(VECTOR, []))
        assert vector_segments == [[(0, 29)], [(0, 29)], []]
Ejemplo n.º 6
0
    def test_split_mates(self):
        """It tests the detection of oligos in sequence files.

        Five fasta sequences with different linker layouts (complete linker,
        no linker, partial linker, linker at the 5' end and two linkers) are
        written to a temporary file, split with a default MatePairSplitter
        and the fasta output is compared with the expected text.
        """
        linker = TITANIUM_LINKER

        seq5 = 'CTAGTCTAGTCGTAGTCATGGCTGTAGTCTAGTCTACGATTCGTATCAGTTGTGTGAC'
        seq3 = 'ATCGATCATGTTGTATTGTGTACTATACACACACGTAGGTCGACTATCGTAGCTAGT'

        out_fhand = StringIO()
        # The temporary file is closed on leaving the block instead of
        # being left for the garbage collector.
        with NamedTemporaryFile(suffix='.fasta') as mate_fhand:
            # a complete linker
            mate_fhand.write('>seq1\n' + seq5 + linker + seq3 + '\n')
            # no linker
            mate_fhand.write('>seq2\n' + seq5 + '\n')
            # a partial linker
            mate_fhand.write('>seq3\n' + seq5 + linker[2:25] + seq3 + '\n')
            # the linker is 5 prime
            mate_fhand.write('>seq4\n' + linker[10:] + seq3 + '\n')
            # two linkers
            mate_fhand.write('>seq5\n' + linker + seq3 + FLX_LINKER + seq5 +
                             '\n')
            mate_fhand.flush()

            splitter = MatePairSplitter()
            new_seqs = [splitter(packet)
                        for packet in read_seq_packets([mate_fhand], 2)]
            write_seq_packets(out_fhand, new_seqs, file_format='fasta')

        result = out_fhand.getvalue()

        # The mate names end with a literal backslash followed by the mate
        # number, so the backslash has to be escaped: an unescaped '\1' in
        # a normal string is the octal escape for chr(1).
        xpect = '>seq1\\1\n'
        xpect += 'CTAGTCTAGTCGTAGTCATGGCTGTAGTCTAGTCTACGATTCGTATCAGTTGTGTGAC\n'
        xpect += '>seq1\\2\n'
        xpect += 'ATCGATCATGTTGTATTGTGTACTATACACACACGTAGGTCGACTATCGTAGCTAGT\n'
        xpect += '>seq2\n'
        xpect += 'CTAGTCTAGTCGTAGTCATGGCTGTAGTCTAGTCTACGATTCGTATCAGTTGTGTGAC\n'
        xpect += '>seq3_pl.part1\n'
        xpect += 'CTAGTCTAGTCGTAGTCATGGCTGTAGTCTAGTCTACGATTCGTATCAGTTGTGTG\n'
        xpect += '>seq3_pl.part2\n'
        xpect += 'GTGTACTATACACACACGTAGGTCGACTATCGTAGCTAGT\n'
        xpect += '>seq4\n'
        xpect += 'ATCGATCATGTTGTATTGTGTACTATACACACACGTAGGTCGACTATCGTAGCTAGT\n'
        xpect += '>seq5_mlc.part1\n'
        xpect += 'TCGTATAACTTCGTATAATGTATGCTATACGAAGTTATTACGATCGATCATGTTGTAT'
        xpect += 'TG\n'
        xpect += 'TGTACTATACACACACGTAGGTCGACTATCGTAGCTAGT\n'
        xpect += '>seq5_mlc.part2\n'
        xpect += 'ACCTAGTCTAGTCGTAGTCATGGCTGTAGTCTAGTCTACGATTCGTATCAGTTGTGTGAC'
        xpect += '\n'
        assert xpect == result
Ejemplo n.º 7
0
    def test_filter_by_bowtie2():
        """It checks FilterBowtie2Match with fastq and fasta reads.

        The filter is built with the arabidopsis_genes index and applied to
        the first filter packet of every reads file, reading the sequences
        both as SeqItems and as SeqRecords. Only 'no_arabi' is expected in
        SEQS_PASSED; read1, read2 and read3 are expected in
        SEQS_FILTERED_OUT.
        """
        index_fpath = os.path.join(TEST_DATA_DIR, 'arabidopsis_genes')
        fastq_fpath = os.path.join(TEST_DATA_DIR, 'arabidopsis_reads.fastq')
        fasta_fpath = os.path.join(TEST_DATA_DIR, 'arabidopsis_reads.fasta')

        passed = ['no_arabi']
        filtered_out = ['read1', 'read2', 'read3']
        for preferred_classes in [[SEQITEM], [SEQRECORD]]:
            for reads_fpath in [fastq_fpath, fasta_fpath]:
                # The reads file is closed after every iteration; before
                # this change four handles were left open.
                with open(reads_fpath) as reads_fhand:
                    seq_packets = read_seq_packets(
                        [reads_fhand],
                        prefered_seq_classes=preferred_classes)
                    filter_packets = seq_to_filterpackets(seq_packets)
                    filter_ = FilterBowtie2Match(index_fpath)
                    filter_packet = list(filter_packets)[0]
                    filter_packets = filter_(filter_packet)
                assert _seqs_to_names(filter_packets[SEQS_PASSED]) == passed
                filtered_names = _seqs_to_names(
                    filter_packets[SEQS_FILTERED_OUT])
                assert filtered_names == filtered_out
Ejemplo n.º 8
0
    def test_trim_chimeric_region(self):
        """It checks the trimming recommended by TrimMatePairChimeras.

        Two fastq reads are written to a temporary file, read back and
        passed to the trimmer, which is built with the ref_example.fasta
        index. The OTHER trimming recommendation (49, 105) is expected for
        the first read and none for the second one.
        """
        index_fpath = os.path.join(TEST_DATA_DIR, 'ref_example.fasta')
        query1 = '@seq2 f\nGGGATCGCAGACCCATCTCGTCAGCATGTACCCTTGCTACATTGAACTT'
        query1 += 'AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA\n'
        query1 += '+\n$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$'
        query1 += '$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$\n'
        query2 = '@seq2 r\nCATCATTGCATAAGTAACACTCAACCAACAGTGCTACAGGGTTGTAACG\n'
        query2 += '+\n$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$\n'
        query = query1 + query2

        # Both the temporary file and the handle used to read it back are
        # closed on leaving the blocks; previously neither was closed.
        with NamedTemporaryFile() as fhand:
            fhand.write(query)
            fhand.flush()

            trim_chimeras = TrimMatePairChimeras(index_fpath)
            with open(fhand.name) as reads_fhand:
                seq_packets = list(read_seq_packets([reads_fhand]))
                trim_packets = list(seq_to_trim_packets(seq_packets))
                trim_packets2 = trim_chimeras(trim_packets[0])
                # Only the first read should get a trimming recommendation.
                res = [get_annotations(s).get(TRIMMING_RECOMMENDATIONS,
                                              {}).get(OTHER, [])
                       for l in trim_packets2[SEQS_PASSED] for s in l]
        assert res == [[(49, 105)], []]
Ejemplo n.º 9
0
    def test_trim_chimeric_region(self):
        """It tests the chimeric region trimming of a mate pair.

        The forward and reverse reads of seq2 are stored in a temporary
        fastq file and processed with TrimMatePairChimeras. The forward read
        has to get the OTHER trimming recommendation (49, 105); the reverse
        read has to get none.
        """
        index_fpath = os.path.join(TEST_DATA_DIR, 'ref_example.fasta')
        forward_read = (
            '@seq2 f\nGGGATCGCAGACCCATCTCGTCAGCATGTACCCTTGCTACATTGAACTT'
            'AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA\n'
            '+\n$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$'
            '$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$\n'
        )
        reverse_read = (
            '@seq2 r\nCATCATTGCATAAGTAACACTCAACCAACAGTGCTACAGGGTTGTAACG\n'
            '+\n$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$\n'
        )

        trimming_recommendations = []
        # The context managers close the temporary file and the handle
        # opened to read it back, which used to be left open.
        with NamedTemporaryFile() as fhand:
            fhand.write(forward_read + reverse_read)
            fhand.flush()

            trim_chimeras = TrimMatePairChimeras(index_fpath)
            with open(fhand.name) as reads_fhand:
                seq_packets = list(read_seq_packets([reads_fhand]))
                trim_packets = list(seq_to_trim_packets(seq_packets))
                trimmed_packet = trim_chimeras(trim_packets[0])
                for seqs in trimmed_packet[SEQS_PASSED]:
                    for seq in seqs:
                        recommended = get_annotations(seq).get(
                            TRIMMING_RECOMMENDATIONS, {})
                        trimming_recommendations.append(
                            recommended.get(OTHER, []))

        # Only the first read should get a trimming recommendation.
        assert trimming_recommendations == [[(49, 105)], []]
Ejemplo n.º 10
0
    def test_split_mates(self):
        """It tests the detection of oligos in sequence files.

        First, six fasta sequences with different linker layouts (complete,
        none, partial, at the 5' end, two linkers and the reverse linker)
        are split with a default MatePairSplitter and the fasta output is
        compared with the expected text. Then a single sequence with a short
        linker at its 3' end is split, and the output is expected to lack
        the last bases of the input.
        """
        linker = TITANIUM_LINKER

        seq5 = 'CTAGTCTAGTCGTAGTCATGGCTGTAGTCTAGTCTACGATTCGTATCAGTTGTGTGAC'
        seq3 = 'ATCGATCATGTTGTATTGTGTACTATACACACACGTAGGTCGACTATCGTAGCTAGT'

        out_fhand = StringIO()
        # The temporary file is closed on leaving the block instead of
        # being left for the garbage collector.
        with NamedTemporaryFile(suffix='.fasta') as mate_fhand:
            # a complete linker
            mate_fhand.write('>seq1\n' + seq5 + linker + seq3 + '\n')
            # no linker
            mate_fhand.write('>seq2\n' + seq5 + '\n')
            # a partial linker
            mate_fhand.write('>seq3\n' + seq5 + linker[2:25] + seq3 + '\n')
            # the linker is 5 prime
            mate_fhand.write('>seq4\n' + linker[10:] + seq3 + '\n')
            # two linkers
            mate_fhand.write('>seq5\n' + linker + seq3 + FLX_LINKER + seq5 +
                             '\n')
            # reverse linker
            rev_linker = get_setting('TITANIUM_LINKER_REV')
            mate_fhand.write('>seq6\n' + seq5 + rev_linker + seq3 + '\n')
            mate_fhand.flush()

            splitter = MatePairSplitter()
            new_seqs = [splitter(packet)
                        for packet in read_seq_packets([mate_fhand], 2)]
            write_seq_packets(out_fhand, new_seqs, file_format='fasta')

        result = out_fhand.getvalue()
        # raw strings are used for the names that end with a backslash and
        # the mate number
        xpect = r'>seq1\1'
        xpect += '\n'
        xpect += 'CTAGTCTAGTCGTAGTCATGGCTGTAGTCTAGTCTACGATTCGTATCAGTTGTGTGAC\n'
        xpect += r'>seq1\2'
        xpect += '\n'
        xpect += 'ATCGATCATGTTGTATTGTGTACTATACACACACGTAGGTCGACTATCGTAGCTAGT\n'
        xpect += '>seq2\n'
        xpect += 'CTAGTCTAGTCGTAGTCATGGCTGTAGTCTAGTCTACGATTCGTATCAGTTGTGTGAC\n'
        xpect += '>seq3_pl.part1\n'
        xpect += 'CTAGTCTAGTCGTAGTCATGGCTGTAGTCTAGTCTACGATTCGTATCAGTTGTGTG\n'
        xpect += '>seq3_pl.part2\n'
        xpect += 'GTGTACTATACACACACGTAGGTCGACTATCGTAGCTAGT\n'
        xpect += '>seq4\n'
        xpect += 'ATCGATCATGTTGTATTGTGTACTATACACACACGTAGGTCGACTATCGTAGCTAGT\n'
        xpect += '>seq5_mlc.part1\n'
        xpect += 'TCGTATAACTTCGTATAATGTATGCTATACGAAGTTATTACGATCGATCATGTTGTAT'
        xpect += 'TG'
        xpect += 'TGTACTATACACACACGTAGGTCGACTATCGTAGCTAGT\n'
        xpect += '>seq5_mlc.part2\n'
        xpect += 'ACCTAGTCTAGTCGTAGTCATGGCTGTAGTCTAGTCTACGATTCGTATCAGTTGTGTGAC'
        xpect += '\n'
        xpect += r'>seq6\1'
        xpect += '\n'
        xpect += 'CTAGTCTAGTCGTAGTCATGGCTGTAGTCTAGTCTACGATTCGTATCAGTTGTGTGAC\n'
        xpect += r'>seq6\2'
        xpect += '\n'
        xpect += 'ATCGATCATGTTGTATTGTGTACTATACACACACGTAGGTCGACTATCGTAGCTAGT\n'
        assert xpect == result

        # with short linker in 3 prime
        seq = ">seq1\nCATCAATGACATCACAAATGACATCAACAAACTCAAA"
        seq += "CTCACATACACTGCTGTACCGTAC"
        out_fhand = StringIO()
        with NamedTemporaryFile(suffix='.fasta') as mate_fhand:
            mate_fhand.write(seq)
            mate_fhand.flush()
            splitter = MatePairSplitter()
            new_seqs = [splitter(packet)
                        for packet in read_seq_packets([mate_fhand], 1)]
            write_seq_packets(out_fhand, new_seqs, file_format='fasta')
        # named "expected" because it holds the expected text, not the
        # obtained one
        expected = ">seq1\nCATCAATGACATCACAAATGACATCAACAAACTCAAACTCACATACA\n"
        assert expected == out_fhand.getvalue()