Example #1
0
    def test_all_orphan():
        'All reads end up in orphan'
        seqs = [SeqRecord(Seq('ACT'), id='seq1'),
                SeqRecord(Seq('ACT'), id='seq2')]
        seqs = list(assing_kind_to_seqs(SEQRECORD, seqs, None))
        out_fhand = StringIO()
        orphan_out_fhand = StringIO()
        match_pairs(seqs, out_fhand, orphan_out_fhand, out_format='fasta')
        assert orphan_out_fhand.getvalue() == '>seq1\nACT\n>seq2\nACT\n'

        out_fhand = StringIO()
        orphan_out_fhand = StringIO()
        match_pairs(seqs, out_fhand, orphan_out_fhand, ordered=False,
                    out_format='fasta')
        assert '>seq1\nACT\n' in orphan_out_fhand.getvalue()
        assert '>seq2\nACT\n' in orphan_out_fhand.getvalue()
Example #2
0
    def test_all_orphan():
        'All reads end up in orphan'
        seqs = [
            SeqRecord(Seq('ACT'), id='seq1'),
            SeqRecord(Seq('ACT'), id='seq2')
        ]
        seqs = list(assing_kind_to_seqs(SEQRECORD, seqs, None))
        out_fhand = StringIO()
        orphan_out_fhand = StringIO()
        match_pairs(seqs, out_fhand, orphan_out_fhand, out_format='fasta')
        assert orphan_out_fhand.getvalue() == '>seq1\nACT\n>seq2\nACT\n'

        out_fhand = StringIO()
        orphan_out_fhand = StringIO()
        match_pairs(seqs,
                    out_fhand,
                    orphan_out_fhand,
                    ordered=False,
                    out_format='fasta')
        assert '>seq1\nACT\n' in orphan_out_fhand.getvalue()
        assert '>seq2\nACT\n' in orphan_out_fhand.getvalue()
Example #3
0
    def test_pair_matcher(self):
        'It test the pair matcher function'
        # with equal seqs but the last ones
        file1 = os.path.join(TEST_DATA_DIR, 'pairend1.sfastq')
        file2 = os.path.join(TEST_DATA_DIR, 'pairend2.sfastq')
        fwd_seqs = read_seqs([open(file1)])
        rev_seqs = read_seqs([open(file2)])

        out_fhand = StringIO()
        orphan_out_fhand = StringIO()
        out_format = 'fastq'
        seqs = flat_zip_longest(fwd_seqs, rev_seqs)
        match_pairs(seqs, out_fhand, orphan_out_fhand, out_format)

        output = out_fhand.getvalue()
        assert '@seq1:136:FC706VJ:2:2104:15343:197393 1:Y:18:ATCACG' in output
        assert '@seq2:136:FC706VJ:2:2104:15343:197393 2:Y:18:ATCACG' in output
        orp = orphan_out_fhand.getvalue()
        assert '@seq8:136:FC706VJ:2:2104:15343:197393 2:Y:18:ATCACG' in orp

        # with the firsts seqs different
        file1 = os.path.join(TEST_DATA_DIR, 'pairend1.sfastq')
        file2 = os.path.join(TEST_DATA_DIR, 'pairend3.sfastq')
        fwd_seqs = read_seqs([open(file1)])
        rev_seqs = read_seqs([open(file2)])
        out_fhand = StringIO()
        orphan_out_fhand = StringIO()
        out_format = 'fastq'
        seqs = flat_zip_longest(fwd_seqs, rev_seqs)
        match_pairs(seqs, out_fhand, orphan_out_fhand, out_format)

        output = out_fhand.getvalue()
        assert '@seq4:136:FC706VJ:2:2104:15343:197393 1:Y:18:ATCACG' in output
        assert '@seq5:136:FC706VJ:2:2104:15343:197393 2:Y:18:ATCACG' in output
        orp = orphan_out_fhand.getvalue()
        assert '@seq1:136:FC706VJ:2:2104:15343:197393 1:Y:18:ATCACG' in orp
        assert '@seq3:136:FC706VJ:2:2104:15343:197393 2:Y:18:ATCACG' in orp
        assert '@seq6:136:FC706VJ:2:2104:15343:197393 2:Y:18:ATCACG' in orp

        file1 = os.path.join(TEST_DATA_DIR, 'pairend4.sfastq')
        file2 = os.path.join(TEST_DATA_DIR, 'pairend2.sfastq')
        fwd_seqs = read_seqs([open(file1)])
        rev_seqs = read_seqs([open(file2)])
        out_fhand = StringIO()
        orphan_out_fhand = StringIO()
        out_format = 'fastq'

        seqs = flat_zip_longest(fwd_seqs, rev_seqs)
        match_pairs(seqs, out_fhand, orphan_out_fhand, out_format)

        output = out_fhand.getvalue()
        assert '@seq8:136:FC706VJ:2:2104:15343:197393 1:Y:18:ATCACG' in output
        assert '@seq8:136:FC706VJ:2:2104:15343:197393 2:Y:18:ATCACG' in output
        orp = orphan_out_fhand.getvalue()
        assert '@seq1:136:FC706VJ:2:2104:15343:197393 2:Y:18:ATCACG' in orp
        assert '@seq2:136:FC706VJ:2:2104:15343:197393 2:Y:18:ATCACG' in orp

        # with reads with no direcction
        file1 = os.path.join(TEST_DATA_DIR, 'pairend7.sfastq')
        file2 = os.path.join(TEST_DATA_DIR, 'pairend2.sfastq')
        fwd_seqs = read_seqs([open(file1)])
        rev_seqs = read_seqs([open(file2)])
        out_fhand = StringIO()
        orphan_out_fhand = StringIO()
        out_format = 'fastq'

        seqs = flat_zip_longest(fwd_seqs, rev_seqs)
        match_pairs(seqs, out_fhand, orphan_out_fhand, out_format)
        output = out_fhand.getvalue()
        assert '@seq8:136:FC706VJ:2:2104:15343:197393 1:Y:18:ATCACG' in output
        assert '@seq8:136:FC706VJ:2:2104:15343:197393 2:Y:18:ATCACG' in output
        assert '@seq1:136:FC706VJ:2:2104:15343:197393 2:Y:18:ATCACG' in output
        assert '@seq1:136:FC706VJ:2:2104:15343:197393 2:Y:18:ATCACG' in output

        orp = orphan_out_fhand.getvalue()
        assert '@seq6:136:FC706VJ:2:2104:15343:197393.mpl_1' in orp
        assert '@seq7:136:FC706VJ:2:2104:15343:197393.hhhh' in orp
        assert '@seq2:136:FC706VJ:2:2104:15343:197393 2:Y:18:ATCAC' in orp

        # File is not sorted
        file1 = '''@s1.f
AACCAGTCAAC
+
CCCFFFFFGHH
@s2.f
AACCAGTCAAC
+
CCCFFFFFGHH
@s1.r
AACCAGTCAAC
+
CCCFFFFFGHH
'''
        file1 = StringIO(file1)
        set_format(file1, 'fastq')
        seqs = read_seqs([file1])
        out_fhand = StringIO()
        orphan_out_fhand = StringIO()
        out_format = 'fastq'

        try:
            match_pairs(seqs, out_fhand, orphan_out_fhand, out_format,
                        check_order_buffer_size=10)
            output = out_fhand.getvalue()
            self.fail('ItemsNotSortedError error expected')
        except ItemsNotSortedError:
            pass
Example #4
0
    def test_mate_pair_unorderer_checker():
        'It test the mate pair function'
        # with equal seqs but the last ones
        file1 = os.path.join(TEST_DATA_DIR, 'pairend1.sfastq')
        file2 = os.path.join(TEST_DATA_DIR, 'pairend2.sfastq')
        fhand = NamedTemporaryFile()
        fhand.write(open(file1).read())
        fhand.write(open(file2).read())
        fhand.flush()
        seqs = read_seqs([fhand])

        out_fhand = StringIO()
        orphan_out_fhand = StringIO()
        out_format = 'fastq'
        match_pairs(seqs, out_fhand, orphan_out_fhand, out_format,
                    ordered=False)

        output = out_fhand.getvalue()
        assert '@seq1:136:FC706VJ:2:2104:15343:197393 1:Y:18:ATCACG' in output
        assert '@seq2:136:FC706VJ:2:2104:15343:197393 2:Y:18:ATCACG' in output
        orp = orphan_out_fhand.getvalue()
        assert '@seq8:136:FC706VJ:2:2104:15343:197393 2:Y:18:ATCACG' in orp

        # with the firsts seqs different
        file1 = os.path.join(TEST_DATA_DIR, 'pairend1.sfastq')
        file2 = os.path.join(TEST_DATA_DIR, 'pairend3.sfastq')
        fhand = NamedTemporaryFile()
        fhand.write(open(file1).read())
        fhand.write(open(file2).read())
        fhand.flush()
        seqs = read_seqs([fhand])

        out_fhand = StringIO()
        orphan_out_fhand = StringIO()
        out_format = 'fastq'
        match_pairs(seqs, out_fhand, orphan_out_fhand, out_format,
                    ordered=False)

        output = out_fhand.getvalue()
        assert '@seq4:136:FC706VJ:2:2104:15343:197393 1:Y:18:ATCACG' in output
        assert '@seq5:136:FC706VJ:2:2104:15343:197393 2:Y:18:ATCACG' in output
        orp = orphan_out_fhand.getvalue()
        assert '@seq1:136:FC706VJ:2:2104:15343:197393 1:Y:18:ATCACG' in orp
        assert '@seq3:136:FC706VJ:2:2104:15343:197393 2:Y:18:ATCACG' in orp
        assert '@seq6:136:FC706VJ:2:2104:15343:197393 2:Y:18:ATCACG' in orp

        file1 = os.path.join(TEST_DATA_DIR, 'pairend4.sfastq')
        file2 = os.path.join(TEST_DATA_DIR, 'pairend2.sfastq')
        fhand = NamedTemporaryFile()
        fhand.write(open(file1).read())
        fhand.write(open(file2).read())
        fhand.flush()
        seqs = read_seqs([fhand])

        out_fhand = StringIO()
        orphan_out_fhand = StringIO()
        out_format = 'fastq'

        match_pairs(seqs, out_fhand, orphan_out_fhand, out_format,
                    ordered=False)

        output = out_fhand.getvalue()
        assert '@seq8:136:FC706VJ:2:2104:15343:197393 1:Y:18:ATCACG' in output
        assert '@seq8:136:FC706VJ:2:2104:15343:197393 2:Y:18:ATCACG' in output
        orp = orphan_out_fhand.getvalue()
        assert '@seq1:136:FC706VJ:2:2104:15343:197393 2:Y:18:ATCACG' in orp
        assert '@seq2:136:FC706VJ:2:2104:15343:197393 2:Y:18:ATCACG' in orp

        # unordered file
        file1 = os.path.join(TEST_DATA_DIR, 'pairend1.sfastq')
        file2 = os.path.join(TEST_DATA_DIR, 'pairend2_unordered.sfastq')
        fhand = NamedTemporaryFile()
        fhand.write(open(file1).read())
        fhand.write(open(file2).read())
        fhand.flush()
        seqs = read_seqs([fhand])

        out_fhand = StringIO()
        orphan_out_fhand = StringIO()
        out_format = 'fastq'

        match_pairs(seqs, out_fhand, orphan_out_fhand, out_format,
                    ordered=False)
        output = out_fhand.getvalue()
        assert '@seq1:136:FC706VJ:2:2104:15343:197393 1:Y:18:ATCACG' in output
        assert '@seq2:136:FC706VJ:2:2104:15343:197393 2:Y:18:ATCACG' in output
        orp = orphan_out_fhand.getvalue()
        assert '@seq8:136:FC706VJ:2:2104:15343:197393 2:Y:18:ATCACG' in orp

        # with reads with no direcction
        file1 = os.path.join(TEST_DATA_DIR, 'pairend7.sfastq')
        file2 = os.path.join(TEST_DATA_DIR, 'pairend2.sfastq')
        fhand = NamedTemporaryFile()
        fhand.write(open(file1).read())
        fhand.write(open(file2).read())
        fhand.flush()
        seqs = read_seqs([fhand])

        out_fhand = StringIO()
        orphan_out_fhand = StringIO()
        out_format = 'fastq'

        match_pairs(seqs, out_fhand, orphan_out_fhand, out_format,
                    ordered=False)
        output = out_fhand.getvalue()
        assert '@seq8:136:FC706VJ:2:2104:15343:197393 1:Y:18:ATCACG' in output
        assert '@seq8:136:FC706VJ:2:2104:15343:197393 2:Y:18:ATCACG' in output
        assert '@seq1:136:FC706VJ:2:2104:15343:197393 2:Y:18:ATCACG' in output
        assert '@seq1:136:FC706VJ:2:2104:15343:197393 2:Y:18:ATCACG' in output

        orp = orphan_out_fhand.getvalue()
        assert '@seq6:136:FC706VJ:2:2104:15343:197393.mpl_1' in orp
        assert '@seq7:136:FC706VJ:2:2104:15343:197393.hhhh' in orp
        assert '@seq2:136:FC706VJ:2:2104:15343:197393 2:Y:18:ATCAC' in orp
Example #5
0
    def test_pair_matcher(self):
        'It test the pair matcher function'
        # with equal seqs but the last ones
        file1 = os.path.join(TEST_DATA_DIR, 'pairend1.sfastq')
        file2 = os.path.join(TEST_DATA_DIR, 'pairend2.sfastq')
        fwd_seqs = read_seqs([open(file1)])
        rev_seqs = read_seqs([open(file2)])

        out_fhand = StringIO()
        orphan_out_fhand = StringIO()
        out_format = 'fastq'
        seqs = flat_zip_longest(fwd_seqs, rev_seqs)
        match_pairs(seqs, out_fhand, orphan_out_fhand, out_format)

        output = out_fhand.getvalue()
        assert '@seq1:136:FC706VJ:2:2104:15343:197393 1:Y:18:ATCACG' in output
        assert '@seq2:136:FC706VJ:2:2104:15343:197393 2:Y:18:ATCACG' in output
        orp = orphan_out_fhand.getvalue()
        assert '@seq8:136:FC706VJ:2:2104:15343:197393 2:Y:18:ATCACG' in orp

        # with the firsts seqs different
        file1 = os.path.join(TEST_DATA_DIR, 'pairend1.sfastq')
        file2 = os.path.join(TEST_DATA_DIR, 'pairend3.sfastq')
        fwd_seqs = read_seqs([open(file1)])
        rev_seqs = read_seqs([open(file2)])
        out_fhand = StringIO()
        orphan_out_fhand = StringIO()
        out_format = 'fastq'
        seqs = flat_zip_longest(fwd_seqs, rev_seqs)
        match_pairs(seqs, out_fhand, orphan_out_fhand, out_format)

        output = out_fhand.getvalue()
        assert '@seq4:136:FC706VJ:2:2104:15343:197393 1:Y:18:ATCACG' in output
        assert '@seq5:136:FC706VJ:2:2104:15343:197393 2:Y:18:ATCACG' in output
        orp = orphan_out_fhand.getvalue()
        assert '@seq1:136:FC706VJ:2:2104:15343:197393 1:Y:18:ATCACG' in orp
        assert '@seq3:136:FC706VJ:2:2104:15343:197393 2:Y:18:ATCACG' in orp
        assert '@seq6:136:FC706VJ:2:2104:15343:197393 2:Y:18:ATCACG' in orp

        file1 = os.path.join(TEST_DATA_DIR, 'pairend4.sfastq')
        file2 = os.path.join(TEST_DATA_DIR, 'pairend2.sfastq')
        fwd_seqs = read_seqs([open(file1)])
        rev_seqs = read_seqs([open(file2)])
        out_fhand = StringIO()
        orphan_out_fhand = StringIO()
        out_format = 'fastq'

        seqs = flat_zip_longest(fwd_seqs, rev_seqs)
        match_pairs(seqs, out_fhand, orphan_out_fhand, out_format)

        output = out_fhand.getvalue()
        assert '@seq8:136:FC706VJ:2:2104:15343:197393 1:Y:18:ATCACG' in output
        assert '@seq8:136:FC706VJ:2:2104:15343:197393 2:Y:18:ATCACG' in output
        orp = orphan_out_fhand.getvalue()
        assert '@seq1:136:FC706VJ:2:2104:15343:197393 2:Y:18:ATCACG' in orp
        assert '@seq2:136:FC706VJ:2:2104:15343:197393 2:Y:18:ATCACG' in orp

        # with reads with no direcction
        file1 = os.path.join(TEST_DATA_DIR, 'pairend7.sfastq')
        file2 = os.path.join(TEST_DATA_DIR, 'pairend2.sfastq')
        fwd_seqs = read_seqs([open(file1)])
        rev_seqs = read_seqs([open(file2)])
        out_fhand = StringIO()
        orphan_out_fhand = StringIO()
        out_format = 'fastq'

        seqs = flat_zip_longest(fwd_seqs, rev_seqs)
        match_pairs(seqs, out_fhand, orphan_out_fhand, out_format)
        output = out_fhand.getvalue()
        assert '@seq8:136:FC706VJ:2:2104:15343:197393 1:Y:18:ATCACG' in output
        assert '@seq8:136:FC706VJ:2:2104:15343:197393 2:Y:18:ATCACG' in output
        assert '@seq1:136:FC706VJ:2:2104:15343:197393 2:Y:18:ATCACG' in output
        assert '@seq1:136:FC706VJ:2:2104:15343:197393 2:Y:18:ATCACG' in output

        orp = orphan_out_fhand.getvalue()
        assert '@seq6:136:FC706VJ:2:2104:15343:197393.mpl_1' in orp
        assert '@seq7:136:FC706VJ:2:2104:15343:197393.hhhh' in orp
        assert '@seq2:136:FC706VJ:2:2104:15343:197393 2:Y:18:ATCAC' in orp

        # File is not sorted
        file1 = '''@s1.f
AACCAGTCAAC
+
CCCFFFFFGHH
@s2.f
AACCAGTCAAC
+
CCCFFFFFGHH
@s1.r
AACCAGTCAAC
+
CCCFFFFFGHH
'''
        file1 = StringIO(file1)
        set_format(file1, 'fastq')
        seqs = read_seqs([file1])
        out_fhand = StringIO()
        orphan_out_fhand = StringIO()
        out_format = 'fastq'

        try:
            match_pairs(seqs,
                        out_fhand,
                        orphan_out_fhand,
                        out_format,
                        check_order_buffer_size=10)
            output = out_fhand.getvalue()
            self.fail('ItemsNotSortedError error expected')
        except ItemsNotSortedError:
            pass
Example #6
0
    def test_mate_pair_unorderer_checker():
        'It test the mate pair function'
        # with equal seqs but the last ones
        file1 = os.path.join(TEST_DATA_DIR, 'pairend1.sfastq')
        file2 = os.path.join(TEST_DATA_DIR, 'pairend2.sfastq')
        fhand = NamedTemporaryFile()
        fhand.write(open(file1).read())
        fhand.write(open(file2).read())
        fhand.flush()
        seqs = read_seqs([fhand])

        out_fhand = StringIO()
        orphan_out_fhand = StringIO()
        out_format = 'fastq'
        match_pairs(seqs,
                    out_fhand,
                    orphan_out_fhand,
                    out_format,
                    ordered=False)

        output = out_fhand.getvalue()
        assert '@seq1:136:FC706VJ:2:2104:15343:197393 1:Y:18:ATCACG' in output
        assert '@seq2:136:FC706VJ:2:2104:15343:197393 2:Y:18:ATCACG' in output
        orp = orphan_out_fhand.getvalue()
        assert '@seq8:136:FC706VJ:2:2104:15343:197393 2:Y:18:ATCACG' in orp

        # with the firsts seqs different
        file1 = os.path.join(TEST_DATA_DIR, 'pairend1.sfastq')
        file2 = os.path.join(TEST_DATA_DIR, 'pairend3.sfastq')
        fhand = NamedTemporaryFile()
        fhand.write(open(file1).read())
        fhand.write(open(file2).read())
        fhand.flush()
        seqs = read_seqs([fhand])

        out_fhand = StringIO()
        orphan_out_fhand = StringIO()
        out_format = 'fastq'
        match_pairs(seqs,
                    out_fhand,
                    orphan_out_fhand,
                    out_format,
                    ordered=False)

        output = out_fhand.getvalue()
        assert '@seq4:136:FC706VJ:2:2104:15343:197393 1:Y:18:ATCACG' in output
        assert '@seq5:136:FC706VJ:2:2104:15343:197393 2:Y:18:ATCACG' in output
        orp = orphan_out_fhand.getvalue()
        assert '@seq1:136:FC706VJ:2:2104:15343:197393 1:Y:18:ATCACG' in orp
        assert '@seq3:136:FC706VJ:2:2104:15343:197393 2:Y:18:ATCACG' in orp
        assert '@seq6:136:FC706VJ:2:2104:15343:197393 2:Y:18:ATCACG' in orp

        file1 = os.path.join(TEST_DATA_DIR, 'pairend4.sfastq')
        file2 = os.path.join(TEST_DATA_DIR, 'pairend2.sfastq')
        fhand = NamedTemporaryFile()
        fhand.write(open(file1).read())
        fhand.write(open(file2).read())
        fhand.flush()
        seqs = read_seqs([fhand])

        out_fhand = StringIO()
        orphan_out_fhand = StringIO()
        out_format = 'fastq'

        match_pairs(seqs,
                    out_fhand,
                    orphan_out_fhand,
                    out_format,
                    ordered=False)

        output = out_fhand.getvalue()
        assert '@seq8:136:FC706VJ:2:2104:15343:197393 1:Y:18:ATCACG' in output
        assert '@seq8:136:FC706VJ:2:2104:15343:197393 2:Y:18:ATCACG' in output
        orp = orphan_out_fhand.getvalue()
        assert '@seq1:136:FC706VJ:2:2104:15343:197393 2:Y:18:ATCACG' in orp
        assert '@seq2:136:FC706VJ:2:2104:15343:197393 2:Y:18:ATCACG' in orp

        # unordered file
        file1 = os.path.join(TEST_DATA_DIR, 'pairend1.sfastq')
        file2 = os.path.join(TEST_DATA_DIR, 'pairend2_unordered.sfastq')
        fhand = NamedTemporaryFile()
        fhand.write(open(file1).read())
        fhand.write(open(file2).read())
        fhand.flush()
        seqs = read_seqs([fhand])

        out_fhand = StringIO()
        orphan_out_fhand = StringIO()
        out_format = 'fastq'

        match_pairs(seqs,
                    out_fhand,
                    orphan_out_fhand,
                    out_format,
                    ordered=False)
        output = out_fhand.getvalue()
        assert '@seq1:136:FC706VJ:2:2104:15343:197393 1:Y:18:ATCACG' in output
        assert '@seq2:136:FC706VJ:2:2104:15343:197393 2:Y:18:ATCACG' in output
        orp = orphan_out_fhand.getvalue()
        assert '@seq8:136:FC706VJ:2:2104:15343:197393 2:Y:18:ATCACG' in orp

        # with reads with no direcction
        file1 = os.path.join(TEST_DATA_DIR, 'pairend7.sfastq')
        file2 = os.path.join(TEST_DATA_DIR, 'pairend2.sfastq')
        fhand = NamedTemporaryFile()
        fhand.write(open(file1).read())
        fhand.write(open(file2).read())
        fhand.flush()
        seqs = read_seqs([fhand])

        out_fhand = StringIO()
        orphan_out_fhand = StringIO()
        out_format = 'fastq'

        match_pairs(seqs,
                    out_fhand,
                    orphan_out_fhand,
                    out_format,
                    ordered=False)
        output = out_fhand.getvalue()
        assert '@seq8:136:FC706VJ:2:2104:15343:197393 1:Y:18:ATCACG' in output
        assert '@seq8:136:FC706VJ:2:2104:15343:197393 2:Y:18:ATCACG' in output
        assert '@seq1:136:FC706VJ:2:2104:15343:197393 2:Y:18:ATCACG' in output
        assert '@seq1:136:FC706VJ:2:2104:15343:197393 2:Y:18:ATCACG' in output

        orp = orphan_out_fhand.getvalue()
        assert '@seq6:136:FC706VJ:2:2104:15343:197393.mpl_1' in orp
        assert '@seq7:136:FC706VJ:2:2104:15343:197393.hhhh' in orp
        assert '@seq2:136:FC706VJ:2:2104:15343:197393 2:Y:18:ATCAC' in orp