Ejemplo n.º 1
0
    def test_calculate_stats():
        'It tests the reports created by calculate_sequence_stats'
        fpaths = [join(TEST_DATA_DIR, 'pairend{0}.sfastq'.format(val))
                  for val in range(1, 6)]
        in_fhands = [open(fpath) for fpath in fpaths]
        try:
            seqs = read_seqs(in_fhands, prefered_seq_classes=[SEQRECORD])
            results = calculate_sequence_stats(seqs, nxs=[50])
        finally:
            # the input files are closed even if the calculation fails
            for fhand in in_fhands:
                fhand.close()
        assert 'maximum: 4' in results['length']
        assert 'N50' in results['length']
        assert '1:30.0,30.0,30.0,30.0,30.0 <[|]>' in results['qual_boxplot']
        assert '[30 , 31[ (96): **********' in results['quality']
        assert 'Q30: 100.0' in results['quality']
        assert '0 (A: 1.00, C: 0.00, G: 0.00, T: 0.00' in results['nucl_freq']
        # no kmer size was given, the kmer report is expected to be empty
        assert results['kmer'] == ''

        # these sequences are kept in a list because the stats are
        # calculated several times with them
        with open(join(TEST_DATA_DIR, 'arabidopsis_genes')) as in_fhand:
            seqs = list(read_seqs([in_fhand],
                                  prefered_seq_classes=[SEQRECORD]))
        kmers = calculate_sequence_stats(seqs)['kmer']
        assert 'Kmer distribution' not in kmers

        kmers = calculate_sequence_stats(seqs, kmer_size=3)['kmer']
        assert 'Kmer distribution' in kmers
        assert 'TCT: 167' in kmers

        # dust, the report is expected to be empty unless it is requested
        dust = calculate_sequence_stats(seqs)['dustscore']
        assert not dust
        dust = calculate_sequence_stats(seqs, do_dust_stats=True)['dustscore']
        assert 'average: 1.83\nvariance: 0.14\nnum. seqs.: 6\n' in dust
        assert '% above 7 (low complexity): 0.00' in dust
Ejemplo n.º 2
0
    def test_sample_seq(self):
        'It tests the sample_seqs binary'
        sample_seq = os.path.join(SEQ_BIN_DIR, 'sample_seqs')
        assert 'usage' in check_output([sample_seq, '-h'])

        # a fasta file with three sequences to sample from
        fasta_fhand = NamedTemporaryFile()
        fasta_fhand.write('>seq\nACTA\n>seq2\nACTA\n>seq3\nACTA\n')
        fasta_fhand.flush()

        # random samples smaller than the input
        for num_seqs in (1, 2):
            cmd = [sample_seq, '-n', str(num_seqs), fasta_fhand.name]
            result = check_output(cmd)
            counts = count_seqs(read_seqs([StringIO(result)]))
            assert counts['num_seqs'] == num_seqs

        # random sample larger than the input, the stderr of the binary is
        # only inspected when it exits with an error
        # NOTE(review): nothing is checked if the binary exits with status 0,
        # confirm whether that case should make the test fail
        try:
            stderr = NamedTemporaryFile()
            check_output([sample_seq, '-n', '10', fasta_fhand.name],
                         stderr=stderr)
        except CalledProcessError:
            with open(stderr.name) as stderr_fhand:
                assert 'larger' in stderr_fhand.read()

        # random sample with stdin
        with open(fasta_fhand.name) as stdin_fhand:
            result = check_output([sample_seq, '-n', '2'], stdin=stdin_fhand)
        assert count_seqs(read_seqs([StringIO(result)]))['num_seqs'] == 2
Ejemplo n.º 3
0
    def test_sample_seq(self):
        'It tests the sample_seqs binary'
        sample_seq = os.path.join(BIN_DIR, 'sample_seqs')
        assert 'usage' in check_output([sample_seq, '-h'])

        # the input is a fasta file with three sequences
        fasta_fhand = NamedTemporaryFile()
        fasta_fhand.write('>seq\nACTA\n>seq2\nACTA\n>seq3\nACTA\n')
        fasta_fhand.flush()

        # random sample of one sequence
        result = check_output([sample_seq, '-n', '1', fasta_fhand.name])
        assert count_seqs(read_seqs([StringIO(result)]))['num_seqs'] == 1

        # random sample of two sequences
        result = check_output([sample_seq, '-n', '2', fasta_fhand.name])
        assert count_seqs(read_seqs([StringIO(result)]))['num_seqs'] == 2

        # more sequences requested than available, the stderr is only
        # checked when the binary exits with an error
        # NOTE(review): a binary that exits with status 0 goes unnoticed
        # here, confirm whether that case should make the test fail
        try:
            stderr = NamedTemporaryFile()
            check_output([sample_seq, '-n', '10', fasta_fhand.name],
                         stderr=stderr)
        except CalledProcessError:
            with open(stderr.name) as error_fhand:
                error_msg = error_fhand.read()
            assert 'larger' in error_msg

        # random sample with stdin
        with open(fasta_fhand.name) as in_fhand:
            result = check_output([sample_seq, '-n', '2'], stdin=in_fhand)
        assert count_seqs(read_seqs([StringIO(result)]))['num_seqs'] == 2
Ejemplo n.º 4
0
    def test_seqitems_io(self):
        'It checks the reading and writing of the different seq classes'
        fasta = '>s1\nACTG\n>s2 desc\nACTG\n'

        # SeqItem
        seqs = list(read_seqs([StringIO(fasta)],
                              prefered_seq_classes=[SEQITEM]))
        assert seqs[0].kind == SEQITEM
        out_fhand = StringIO()
        write_seqs(seqs, out_fhand)
        assert out_fhand.getvalue() == fasta
        assert seqs[0].object.name == 's1'

        # SeqRecord
        seqs = list(read_seqs([StringIO(fasta)],
                              prefered_seq_classes=[SEQRECORD]))
        assert seqs[0].kind == SEQRECORD
        out_fhand = StringIO()
        write_seqs(seqs, out_fhand, 'fasta')
        assert out_fhand.getvalue() == fasta

        # seqitem not possible with different input and output formats
        in_fhand = StringIO(fasta)
        try:
            seqs = list(read_seqs([in_fhand], out_format='fastq',
                                  prefered_seq_classes=[SEQITEM]))
        except ValueError:
            pass
        else:
            self.fail('ValueError expected')

        # with the same input and output format the seqitems are fine
        seqs = list(read_seqs([StringIO(fasta)], out_format='fasta',
                              prefered_seq_classes=[SEQITEM]))
        out_fhand = StringIO()
        write_seqs(seqs, out_fhand)
        assert out_fhand.getvalue() == fasta
Ejemplo n.º 5
0
    def test_giuseppe_reads():
        'It splits some real reads'
        linker_fpath = os.path.join(TEST_DATA_DIR, 'linkers.fasta')
        # for every reads file: two of the names expected after the split
        # and the number of sequences that should be obtained
        cases = [('454_reads.fastq',
                  ('G109AZL01BJHT8\\1', 'G109AZL01BJHT8\\2'), 19),
                 ('454_reads2.fastq',
                  ('G109AZL01D8U3X\\1', 'G109AZL01D8U3X\\2'), 20)]
        for fname, expected_names, expected_num in cases:
            seq_fpath = os.path.join(TEST_DATA_DIR, fname)

            # the splitter is called directly with every packet
            with open(linker_fpath) as linker_fhand:
                linkers = list(read_seqs([linker_fhand]))
            splitter = MatePairSplitter(linkers=linkers)
            new_seqs = []
            with open(seq_fpath) as seq_fhand:
                for packet in read_seq_packets([seq_fhand], 2):
                    new_seqs.extend(splitter(packet))
            seq_names = [get_name(seq) for seq in new_seqs]
            for name in expected_names:
                assert name in seq_names
            assert len(new_seqs) == expected_num

            # test with process_seq_packet
            with open(linker_fpath) as linker_fhand:
                linkers = list(read_seqs([linker_fhand]))
            splitter = MatePairSplitter(linkers=linkers)
            with open(seq_fpath) as seq_fhand:
                seq_packets = read_seq_packets([seq_fhand], 2)
                seq_packets = process_seq_packets(seq_packets, [splitter])[0]
                # the packets have to be consumed before the file is closed
                new_seqs = [seq for packet in list(seq_packets)
                            for seq in packet]
            seq_names = [get_name(seq) for seq in new_seqs]
            for name in expected_names:
                assert name in seq_names
            assert len(new_seqs) == expected_num
Ejemplo n.º 6
0
    def test_giuseppe_reads():
        'It splits some real reads'
        linker_fpath = os.path.join(TEST_DATA_DIR, 'linkers.fasta')

        def create_splitter():
            'It returns a splitter built with freshly read linkers'
            with open(linker_fpath) as linker_fhand:
                linkers = list(read_seqs([linker_fhand]))
            return MatePairSplitter(linkers=linkers)

        def check_split(split_seqs, names, num_seqs):
            'It checks the names and the number of the split sequences'
            seq_names = [get_name(seq) for seq in split_seqs]
            for name in names:
                assert name in seq_names
            assert len(split_seqs) == num_seqs

        # reads file, names expected after the split, number of sequences
        cases = (
            ('454_reads.fastq',
             ['G109AZL01BJHT8\\1', 'G109AZL01BJHT8\\2'], 19),
            # reads 2
            ('454_reads2.fastq',
             ['G109AZL01D8U3X\\1', 'G109AZL01D8U3X\\2'], 20),
        )
        for reads_fname, names, num_seqs in cases:
            seq_fpath = os.path.join(TEST_DATA_DIR, reads_fname)

            # calling the splitter with every packet
            splitter = create_splitter()
            new_seqs = []
            with open(seq_fpath) as seq_fhand:
                for packet in read_seq_packets([seq_fhand], 2):
                    new_seqs.extend(splitter(packet))
            check_split(new_seqs, names, num_seqs)

            # test with process_seq_packet
            splitter = create_splitter()
            with open(seq_fpath) as seq_fhand:
                packets = read_seq_packets([seq_fhand], 2)
                packets = process_seq_packets(packets, [splitter])[0]
                # everything is read while the file is still open
                new_seqs = [seq for packet in list(packets) for seq in packet]
            check_split(new_seqs, names, num_seqs)
Ejemplo n.º 7
0
def sort_fastx_files(in_fhands, key, index_fpath=None, directory=None,
                     max_items_in_memory=None, tempdir=None):
    """Sort the sequences found in the given files by the requested key.

    key can be 'seq', 'name' or 'coordinate'. For 'seq' and 'name' the
    sequences are read with read_seqs and handed to sorted_items, using
    get_str_seq or get_name as the sorting key; max_items_in_memory is only
    used in these two cases. For 'coordinate' the work is delegated to
    sort_by_position_in_ref, the only case that uses index_fpath and
    directory. tempdir is passed along in every case.

    The result of the function doing the sort is returned unchanged. Any
    other key raises a ValueError.
    """
    if key == 'coordinate':
        return sort_by_position_in_ref(in_fhands, index_fpath=index_fpath,
                                       directory=directory,
                                       tempdir=tempdir)
    if key not in ('seq', 'name'):
        raise ValueError('Non-supported sorting key')

    key_getter = get_str_seq if key == 'seq' else get_name
    return sorted_items(read_seqs(in_fhands), key=key_getter,
                        tempdir=tempdir,
                        max_items_in_memory=max_items_in_memory)
Ejemplo n.º 8
0
def _read_pairs(in_fhands, paired_reads):
    """Read the sequences as SEQITEMs and return them grouped.

    When paired_reads is true the grouping is done by group_pairs_by_name,
    otherwise group_pairs is called with n_seqs_in_pair=1.
    """
    seqs = read_seqs(in_fhands, prefered_seq_classes=[SEQITEM])
    if not paired_reads:
        return group_pairs(seqs, n_seqs_in_pair=1)
    return group_pairs_by_name(seqs)
Ejemplo n.º 9
0
def _read_pairs(in_fhands, paired_reads):
    """Return the sequences of the given files grouped.

    The sequences are read as SEQITEMs. Paired reads are grouped with
    group_pairs_by_name; otherwise group_pairs is used with
    n_seqs_in_pair=1.
    """
    seq_items = read_seqs(in_fhands, prefered_seq_classes=[SEQITEM])
    return (group_pairs_by_name(seq_items) if paired_reads
            else group_pairs(seq_items, n_seqs_in_pair=1))
Ejemplo n.º 10
0
def sort_fastx_files(in_fhands, key, index_fpath=None, directory=None,
                     max_items_in_memory=None, tempdir=None):
    """Return the sequences of the input files sorted by key.

    Supported keys:
      - 'seq': sorted_items is called using get_str_seq as the key.
      - 'name': sorted_items is called using get_name as the key.
      - 'coordinate': sort_by_position_in_ref does the sorting, it receives
        index_fpath and directory.

    max_items_in_memory is only given to sorted_items and tempdir is given
    to whichever function does the sorting. A ValueError is raised for any
    other key.
    """
    if key not in ('seq', 'coordinate', 'name'):
        raise ValueError('Non-supported sorting key')

    if key == 'seq':
        get_sort_key = get_str_seq
    elif key == 'coordinate':
        return sort_by_position_in_ref(in_fhands, index_fpath=index_fpath,
                                       directory=directory,
                                       tempdir=tempdir)
    else:
        get_sort_key = get_name

    reads = read_seqs(in_fhands)
    return sorted_items(reads, key=get_sort_key, tempdir=tempdir,
                        max_items_in_memory=max_items_in_memory)
Ejemplo n.º 11
0
 def test_count_seqs():
     'It counts the sequences and the total length of the pairend files'
     fpaths = [join(TEST_DATA_DIR, 'pairend{0}.sfastq'.format(val))
               for val in range(1, 6)]
     in_fhands = [open(fpath) for fpath in fpaths]
     try:
         seqs = read_seqs(in_fhands, prefered_seq_classes=[SEQRECORD])
         counts = count_seqs(seqs)
     finally:
         # the input files are closed even if the counting fails
         for fhand in in_fhands:
             fhand.close()
     assert counts == {'total_length': 96, 'num_seqs': 24}
Ejemplo n.º 12
0
    def test_deinterleave(self):
        'It de-interleaves an iterator of alternating fwd and rev reads'

        fwd_fpath = os.path.join(TEST_DATA_DIR, 'pairend1.sfastq')
        rev_fpath = os.path.join(TEST_DATA_DIR, 'pairend1b.sfastq')
        out_fhand1 = StringIO()
        out_fhand2 = StringIO()
        out_format = 'fastq'

        # the reads are interleaved and split again while the input files
        # are still open
        with open(fwd_fpath) as fwd_fhand, open(rev_fpath) as rev_fhand:
            fwd_seqs = read_seqs([fwd_fhand], 'fastq')
            rev_seqs = read_seqs([rev_fhand], 'fastq')
            seqs = interleave_pairs(fwd_seqs, rev_seqs)
            deinterleave_pairs(seqs, out_fhand1, out_fhand2, out_format)

        # each output should match the file its reads came from
        result1 = out_fhand1.getvalue()
        result2 = out_fhand2.getvalue()
        with open(fwd_fpath) as fwd_fhand:
            assert result1.strip() == fwd_fhand.read().strip()
        with open(rev_fpath) as rev_fhand:
            assert result2.strip() == rev_fhand.read().strip()
Ejemplo n.º 13
0
    def test_deinterleave(self):
        'It de-interleaves an iterator of alternating fwd and rev reads'

        fpath1 = os.path.join(TEST_DATA_DIR, 'pairend1.sfastq')
        fpath2 = os.path.join(TEST_DATA_DIR, 'pairend1b.sfastq')

        out_fhand1 = StringIO()
        out_fhand2 = StringIO()
        out_format = 'fastq'
        # the input files are closed once the reads have been written to
        # the output file handles
        with open(fpath1) as in_fhand1, open(fpath2) as in_fhand2:
            fwd_seqs = read_seqs([in_fhand1], 'fastq')
            rev_seqs = read_seqs([in_fhand2], 'fastq')
            seqs = interleave_pairs(fwd_seqs, rev_seqs)
            deinterleave_pairs(seqs, out_fhand1, out_fhand2, out_format)

        # the outputs are compared with the content of the original files
        with open(fpath1) as in_fhand1:
            expected1 = in_fhand1.read()
        with open(fpath2) as in_fhand2:
            expected2 = in_fhand2.read()
        assert out_fhand1.getvalue().strip() == expected1.strip()
        assert out_fhand2.getvalue().strip() == expected2.strip()
Ejemplo n.º 14
0
    def test_trim_chimeras_bin(self):
        'It tests the trim_mp_chimeras binary'
        trim_chimeras_bin = os.path.join(SEQ_BIN_DIR, 'trim_mp_chimeras')
        assert 'usage' in check_output([trim_chimeras_bin, '-h'])
        index_fpath = os.path.join(TEST_DATA_DIR, 'ref_example.fasta')

        # a pair of reads, the forward one is the first expected sequence
        # followed by a stretch of As
        query1 = '@seq2 f\nGGGATCGCAGACCCATCTCGTCAGCATGTACCCTTGCTACATTGAACTT'
        query1 += 'AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA\n'
        query1 += '+\n$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$'
        query1 += '$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$\n'
        query2 = '@seq2 r\nCATCATTGCATAAGTAACACTCAACCAACAGTGCTACAGGGTTGTAACG\n'
        query2 += '+\n$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$\n'
        query = query1 + query2
        in_fhand = NamedTemporaryFile()
        in_fhand.write(query)
        in_fhand.flush()

        out_fhand = NamedTemporaryFile()
        expected_seqs = [
            'GGGATCGCAGACCCATCTCGTCAGCATGTACCCTTGCTACATTGAACTT',
            'CATCATTGCATAAGTAACACTCAACCAACAGTGCTACAGGGTTGTAACG'
        ]
        base_cmd = [
            trim_chimeras_bin, in_fhand.name, '-r', index_fpath, '-o',
            out_fhand.name
        ]
        # the binary is run with the default options and, after that, with
        # several threads; both runs have to create the same sequences
        for extra_args in ([], ['-p', '2']):
            check_output(base_cmd + extra_args, stdin=in_fhand)
            counts = 0
            with open(out_fhand.name) as result_fhand:
                for seq in read_seqs([result_fhand]):
                    assert get_str_seq(seq) in expected_seqs
                    counts += 1
            # an empty output would pass the previous loop unnoticed
            assert counts != 0
Ejemplo n.º 15
0
    def test_trim_chimeras_bin(self):
        'It tests the trim_mp_chimeras binary'
        trim_chimeras_bin = os.path.join(SEQ_BIN_DIR, 'trim_mp_chimeras')
        assert 'usage' in check_output([trim_chimeras_bin, '-h'])
        index_fpath = os.path.join(TEST_DATA_DIR, 'ref_example.fasta')

        # the forward read is the first expected sequence plus a stretch of
        # As, the reverse read is the second expected sequence
        query1 = '@seq2 f\nGGGATCGCAGACCCATCTCGTCAGCATGTACCCTTGCTACATTGAACTT'
        query1 += 'AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA\n'
        query1 += '+\n$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$'
        query1 += '$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$\n'
        query2 = '@seq2 r\nCATCATTGCATAAGTAACACTCAACCAACAGTGCTACAGGGTTGTAACG\n'
        query2 += '+\n$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$\n'
        query = query1 + query2
        in_fhand = NamedTemporaryFile()
        in_fhand.write(query)
        in_fhand.flush()

        out_fhand = NamedTemporaryFile()
        expected_seqs = ['GGGATCGCAGACCCATCTCGTCAGCATGTACCCTTGCTACATTGAACTT',
                         'CATCATTGCATAAGTAACACTCAACCAACAGTGCTACAGGGTTGTAACG']

        def check_result():
            'It checks that the output has only expected seqs, at least one'
            counts = 0
            with open(out_fhand.name) as result_fhand:
                for seq in read_seqs([result_fhand]):
                    assert get_str_seq(seq) in expected_seqs
                    counts += 1
            assert counts != 0

        cmd = [trim_chimeras_bin, in_fhand.name, '-r', index_fpath,
               '-o', out_fhand.name]
        check_output(cmd, stdin=in_fhand)
        check_result()

        # With several threads
        check_output(cmd + ['-p', '2'], stdin=in_fhand)
        check_result()
Ejemplo n.º 16
0
    def test_seqitems_io(self):
        'It reads and writes fasta streams with the different seq classes'
        content = '>s1\nACTG\n>s2 desc\nACTG\n'

        def written(seqs, *args):
            'It returns what write_seqs writes for the given sequences'
            out_fhand = StringIO()
            write_seqs(seqs, out_fhand, *args)
            return out_fhand.getvalue()

        # SeqItem
        in_fhand = StringIO(content)
        seq_items = list(read_seqs([in_fhand],
                                   prefered_seq_classes=[SEQITEM]))
        assert seq_items[0].kind == SEQITEM
        assert written(seq_items) == content
        assert seq_items[0].object.name == 's1'

        # SeqRecord
        in_fhand = StringIO(content)
        seq_records = list(read_seqs([in_fhand],
                                     prefered_seq_classes=[SEQRECORD]))
        assert seq_records[0].kind == SEQRECORD
        assert written(seq_records, 'fasta') == content

        # seqitem not possible with different input and output formats
        in_fhand = StringIO(content)
        try:
            list(read_seqs([in_fhand], out_format='fastq',
                           prefered_seq_classes=[SEQITEM]))
        except ValueError:
            pass
        else:
            self.fail('ValueError expected')

        # the same format for the input and the output is accepted
        in_fhand = StringIO(content)
        seq_items = list(read_seqs([in_fhand], out_format='fasta',
                                   prefered_seq_classes=[SEQITEM]))
        assert written(seq_items) == content
Ejemplo n.º 17
0
    def test_interleave(self):
        'It interleaves two iterators with paired reads'
        file1 = os.path.join(TEST_DATA_DIR, 'pairend1.sfastq')
        file2 = os.path.join(TEST_DATA_DIR, 'pairend2.sfastq')
        # these reads are kept in lists because they are used twice
        with open(file1) as fhand1, open(file2) as fhand2:
            fwd_seqs = list(read_seqs([fhand1], 'fastq'))
            rev_seqs = list(read_seqs([fhand2], 'fastq'))

        # with the checks enabled these two files should not interleave
        try:
            list(interleave_pairs(fwd_seqs, rev_seqs))
        except InterleaveError:
            pass
        else:
            self.fail('InterleaveError expected')

        # we skip the tests
        seqs = list(interleave_pairs(fwd_seqs, rev_seqs, skip_checks=True))
        assert len(seqs) == 8

        # these two files interleave with the checks enabled
        file1 = os.path.join(TEST_DATA_DIR, 'pairend1.sfastq')
        file2 = os.path.join(TEST_DATA_DIR, 'pairend1b.sfastq')
        with open(file1) as fhand1, open(file2) as fhand2:
            fwd_seqs = read_seqs([fhand1], 'fastq')
            rev_seqs = read_seqs([fhand2], 'fastq')
            # the reads are consumed before the files are closed
            seqs = list(interleave_pairs(fwd_seqs, rev_seqs))
        assert len(seqs) == 8
Ejemplo n.º 18
0
    def test_interleave(self):
        'It interleaves two iterators with paired reads'
        fwd_fpath = os.path.join(TEST_DATA_DIR, 'pairend1.sfastq')
        rev_fpath = os.path.join(TEST_DATA_DIR, 'pairend2.sfastq')
        # the reads are loaded in memory, they are interleaved twice
        with open(fwd_fpath) as fwd_fhand:
            fwd_seqs = list(read_seqs([fwd_fhand], 'fastq'))
        with open(rev_fpath) as rev_fhand:
            rev_seqs = list(read_seqs([rev_fhand], 'fastq'))

        # the checks are expected to reject these two files
        try:
            list(interleave_pairs(fwd_seqs, rev_seqs))
        except InterleaveError:
            pass
        else:
            self.fail('InterleaveError expected')

        # we skip the tests
        seqs = list(interleave_pairs(fwd_seqs, rev_seqs, skip_checks=True))
        assert len(seqs) == 8

        # a different file with the reverse reads, now the checks are passed
        rev_fpath = os.path.join(TEST_DATA_DIR, 'pairend1b.sfastq')
        with open(fwd_fpath) as fwd_fhand, open(rev_fpath) as rev_fhand:
            fwd_seqs = read_seqs([fwd_fhand], 'fastq')
            rev_seqs = read_seqs([rev_fhand], 'fastq')
            # everything is read while the files are open
            seqs = list(interleave_pairs(fwd_seqs, rev_seqs))
        assert len(seqs) == 8
Ejemplo n.º 19
0
 def test_calculate_stats_seqitems():
     'It tests the calculate stat function with seqitems'
     fpaths = [join(TEST_DATA_DIR, 'pairend{0}.sfastq'.format(val))
               for val in range(1, 6)]
     in_fhands = [open(fpath) for fpath in fpaths]
     try:
         seqs = read_seqs(in_fhands, prefered_seq_classes=[SEQITEM])
         results = calculate_sequence_stats(seqs, nxs=[50])
     finally:
         # the input files are closed even if the calculation fails
         for fhand in in_fhands:
             fhand.close()
     assert 'maximum: 4' in results['length']
     assert 'N50' in results['length']
     assert '1:30.0,30.0,30.0,30.0,30.0 <[|]>' in results['qual_boxplot']
     assert '[30 , 31[ (96): **********' in results['quality']
     assert 'Q30: 100.0' in results['quality']
     assert '0 (A: 1.00, C: 0.00, G: 0.00, T: 0.00' in results['nucl_freq']
     # no kmer size was given, the kmer report is expected to be empty
     assert results['kmer'] == ''
Ejemplo n.º 20
0
 def test_orf_annotator(self):
     'It tests orf annotator'
     fpath = os.path.join(TEST_DATA_DIR, 'orf_test.fasta')
     estscan_matrix = os.path.join(TEST_DATA_DIR,
                                   'Arabidopsis_thaliana.smat')
     with open(fpath) as in_fhand:
         seq_records = list(
             read_seqs([in_fhand], prefered_seq_classes=[SEQRECORD]))
     orf_annotator = EstscanOrfAnnotator(estscan_matrix)
     seq_records = orf_annotator(seq_records)

     # the first two sequences should have an orf in the same location,
     # the first one in the plus strand and the second in the minus one
     for index, strand in enumerate((1, -1)):
         orf = seq_records[index].object.features[0]
         assert orf.strand == strand
         assert orf.location.start.position == 0
         assert orf.location.end.position == 541
     # the third sequence should not have any feature
     assert not seq_records[2].object.features
Ejemplo n.º 21
0
 def test_orf_annotator(self):
     'It tests orf annotator'
     fpath = os.path.join(TEST_DATA_DIR, 'orf_test.fasta')
     estscan_matrix = os.path.join(TEST_DATA_DIR,
                                   'Arabidopsis_thaliana.smat')
     # the records are loaded in a list, so the file can be closed
     with open(fpath) as seq_fhand:
         seq_records = list(read_seqs([seq_fhand],
                                      prefered_seq_classes=[SEQRECORD]))
     orf_annotator = EstscanOrfAnnotator(estscan_matrix)
     seq_records = orf_annotator(seq_records)

     # orf expected in the first sequence
     orf1 = seq_records[0].object.features[0]
     assert orf1.strand == 1
     assert orf1.location.start.position == 0
     assert orf1.location.end.position == 541

     # orf expected in the second sequence, same location, other strand
     orf2 = seq_records[1].object.features[0]
     assert orf2.strand == -1
     assert orf2.location.start.position == 0
     assert orf2.location.end.position == 541

     # no feature is expected in the third sequence
     assert not seq_records[2].object.features
Ejemplo n.º 22
0
def _read_estcan_result(fhand, result, file_type):
    """It reads a dna or pep ESTscan result file.

    Nothing is returned, the result dict is updated in place. For every
    sequence read from fhand its sequence string is stored in
    result[seqid][(start, end, strand)][file_type]. The start and the end
    are parsed from the first ';'-separated item of the description, and
    the strand is -1 when one of the items is 'minus strand', 1 otherwise.
    """
    for seq in read_seqs([fhand]):
        items = [item.strip() for item in get_description(seq).split(';')]
        if 'minus strand' in items:
            strand = -1
        else:
            strand = 1
        # the start and the end are the second and third words
        start, end = items[0].split(' ', 3)[1:3]
        # estscan changes the name, we have to fix it
        seqid = get_name(seq).strip(';')
        seq_orfs = result.setdefault(seqid, {})
        orf_key = (int(start), int(end), strand)
        # the same orf can already be there, added from the other file type
        orf = seq_orfs.setdefault(orf_key, {})
        orf[file_type] = get_str_seq(seq)
Ejemplo n.º 23
0
def _read_estcan_result(fhand, result, file_type):
    """It adds the orfs found in a dna or pep ESTscan file to result.

    result is modified in place and nothing is returned. Its structure is:
    result[seqid][(start, end, strand)][file_type] = sequence string.
    start and end come from the first ';'-separated item of the sequence
    description. strand is -1 if 'minus strand' is one of those items and 1
    otherwise.
    """
    for seq in read_seqs([fhand]):
        description = get_description(seq)
        items = [field.strip() for field in description.split(';')]
        strand = -1 if 'minus strand' in items else 1
        words = items[0].split(' ', 3)
        start, end = words[1:3]
        # estscan changes the name, we have to fix it
        seqid = get_name(seq).strip(';')
        if seqid not in result:
            result[seqid] = {}
        orfs_in_seq = result[seqid]
        location = (int(start), int(end), strand)
        if location not in orfs_in_seq:
            orfs_in_seq[location] = {}
        orfs_in_seq[location][file_type] = get_str_seq(seq)
Ejemplo n.º 24
0
    def _get_chrom_lengths(self):
        chrom_lens = OrderedDict()
        if self._ref_fhand is None:
            vcf_fhand = gzip.open(self._reader.fhand.name)
            for line in vcf_fhand:
                line = line.strip()
                if line.startswith('#'):
                    continue
                items = line.split()
                chrom = items[0]
                loc = int(items[1])
                if chrom not in chrom_lens:
                    chrom_lens[chrom] = loc
                else:
                    if loc > chrom_lens[chrom]:
                        chrom_lens[chrom] = loc

        else:
            for read in read_seqs([self._ref_fhand]):
                chrom_lens[get_name(read)] = get_length(read)
        return chrom_lens
Ejemplo n.º 25
0
def _get_seq_lengths(fhand):
    'It returns a dict with the length of every sequence indexed by name'
    lengths = {}
    for seq in read_seqs([fhand]):
        lengths[get_name(seq)] = get_length(seq)
    return lengths
Ejemplo n.º 26
0
    def test_bin_transcrip_orientator(self):
        'It runs the orientate_transcripts binary with several option sets'
        binary = os.path.join(SEQ_BIN_DIR, 'orientate_transcripts')
        assert 'usage' in check_output([binary, '-h'])

        input_fasta = os.path.join(TEST_DATA_DIR, 'seqs_to_orientate.fasta')
        matrix = os.path.join(TEST_DATA_DIR, 'Arabidopsis_thaliana.smat')
        genes_db = os.path.join(TEST_DATA_DIR, 'blastdbs', 'arabidopsis_genes')
        calabaza_db = os.path.join(TEST_DATA_DIR, 'blastdbs', 'calabaza')

        # options for estscan plus the two blast databases, note that only
        # one of the two '-v' values is included here
        annot_opts = ['-u', matrix, '-d', genes_db, '-d', calabaza_db,
                      '-g', 'blastn', '-g', 'blastn', '-v', '0.0001']
        polya_opts = ['--polya_min_len', '4']

        # (sequence index, reverse complemented?, text in the description)
        all_annotators = ((1, True, 'polyA'),
                          (3, False, None),
                          (4, True, 'estscan_orf'),
                          (5, False, None),
                          (6, True, 'blast arabidopsis_genes'))

        def load(fpath):
            'It reads all the sequences found in the given path'
            return list(read_seqs([open(fpath)],
                                  prefered_seq_classes=[SEQRECORD]))

        def check(oriented, originals, expectations):
            'It compares the output seqs with the input ones'
            for index, is_reversed, tag in expectations:
                result_seq = oriented[index].object.seq
                if is_reversed:
                    result_seq = result_seq.reverse_complement()
                assert str(originals[index].object.seq) == str(result_seq)
                if tag is not None:
                    assert tag in oriented[index].object.description

        # all the annotators
        out_fhand = NamedTemporaryFile()
        check_output([binary] + annot_opts +
                     ['-v', '0.0001', input_fasta, '-o', out_fhand.name] +
                     polya_opts)
        oriented = load(out_fhand.name)
        originals = load(input_fasta)
        assert get_str_seq(originals[0]) == get_str_seq(oriented[0])
        check(oriented, originals, all_annotators)

        # two blast databases, but only one '-v' value: the binary has to
        # exit with an error
        stderr = NamedTemporaryFile()
        try:
            check_output([binary] + annot_opts + [input_fasta], stderr=stderr)
        except CalledProcessError:
            stde = open(stderr.name).read()
            assert 'Blast parameters are not well defined' in stde
        else:
            self.fail()

        # without parameters
        out_fhand = NamedTemporaryFile()
        check_output([binary, input_fasta, '-o', out_fhand.name] + polya_opts)
        check(load(out_fhand.name), load(input_fasta),
              ((0, False, None), (1, True, None), (3, False, None),
               (4, False, None), (5, False, None), (6, False, None)))

        # only with orf annotator, the previous output file is reused
        check_output([binary, input_fasta, '-o', out_fhand.name,
                      '-u', matrix] + polya_opts)
        check(load(out_fhand.name), load(input_fasta),
              ((0, False, None), (1, True, None), (3, False, None),
               (4, True, None), (5, False, None), (6, False, None)))

        # multiprocessor
        out_fhand = NamedTemporaryFile()
        check_output([binary] + annot_opts +
                     ['-v', '0.0001', input_fasta, '-o', out_fhand.name,
                      '-p', '2'] + polya_opts)
        check(load(out_fhand.name), load(input_fasta),
              ((0, False, None),) + all_annotators)
    def test_bin_transcrip_orientator(self):
        'It runs the orientate_transcripts binary with several option sets'
        binary = os.path.join(BIN_DIR, 'orientate_transcripts')
        assert 'usage' in check_output([binary, '-h'])

        input_fasta = os.path.join(TEST_DATA_DIR, 'seqs_to_orientate.fasta')
        matrix = os.path.join(TEST_DATA_DIR, 'Arabidopsis_thaliana.smat')
        genes_db = os.path.join(TEST_DATA_DIR, 'blastdbs', 'arabidopsis_genes')
        calabaza_db = os.path.join(TEST_DATA_DIR, 'blastdbs', 'calabaza')

        # options for estscan plus the two blast databases, note that only
        # one of the two '-v' values is included here
        annot_opts = ['-u', matrix, '-d', genes_db, '-d', calabaza_db,
                      '-g', 'blastn', '-g', 'blastn', '-v', '0.0001']
        polya_opts = ['--polya_min_len', '4']

        # (sequence index, reverse complemented?, text in the description)
        all_annotators = ((1, True, 'polyA'),
                          (3, False, None),
                          (4, True, 'estscan_orf'),
                          (5, False, None),
                          (6, True, 'blast arabidopsis_genes'))

        def load(fpath):
            'It reads all the sequences found in the given path'
            return list(read_seqs([open(fpath)],
                                  prefered_seq_classes=[SEQRECORD]))

        def check(oriented, originals, expectations):
            'It compares the output seqs with the input ones'
            for index, is_reversed, tag in expectations:
                result_seq = oriented[index].object.seq
                if is_reversed:
                    result_seq = result_seq.reverse_complement()
                assert str(originals[index].object.seq) == str(result_seq)
                if tag is not None:
                    assert tag in oriented[index].object.description

        # all the annotators
        out_fhand = NamedTemporaryFile()
        check_output([binary] + annot_opts +
                     ['-v', '0.0001', input_fasta, '-o', out_fhand.name] +
                     polya_opts)
        oriented = load(out_fhand.name)
        originals = load(input_fasta)
        assert get_str_seq(originals[0]) == get_str_seq(oriented[0])
        check(oriented, originals, all_annotators)

        # two blast databases, but only one '-v' value: the binary has to
        # exit with an error
        stderr = NamedTemporaryFile()
        try:
            check_output([binary] + annot_opts + [input_fasta], stderr=stderr)
        except CalledProcessError:
            stde = open(stderr.name).read()
            assert 'Blast parameters are not well defined' in stde
        else:
            self.fail()

        # without parameters
        out_fhand = NamedTemporaryFile()
        check_output([binary, input_fasta, '-o', out_fhand.name] + polya_opts)
        check(load(out_fhand.name), load(input_fasta),
              ((0, False, None), (1, True, None), (3, False, None),
               (4, False, None), (5, False, None), (6, False, None)))

        # only with orf annotator, the previous output file is reused
        check_output([binary, input_fasta, '-o', out_fhand.name,
                      '-u', matrix] + polya_opts)
        check(load(out_fhand.name), load(input_fasta),
              ((0, False, None), (1, True, None), (3, False, None),
               (4, True, None), (5, False, None), (6, False, None)))

        # multiprocessor
        out_fhand = NamedTemporaryFile()
        check_output([binary] + annot_opts +
                     ['-v', '0.0001', input_fasta, '-o', out_fhand.name,
                      '-p', '2'] + polya_opts)
        check(load(out_fhand.name), load(input_fasta),
              ((0, False, None),) + all_annotators)
Ejemplo n.º 28
0
def _get_seq_lengths(fhand):
    'It returns a dict with the length of every sequence indexed by name'
    names_and_lengths = ((get_name(seq), get_length(seq))
                         for seq in read_seqs([fhand]))
    return dict(names_and_lengths)
Ejemplo n.º 29
0
    def test_mate_pair_unorderer_checker():
        'It tests match_pairs with ordered=False on concatenated files'

        def match_concatenated(first_name, second_name):
            'It joins two test files, matches them and returns both outputs'
            fhand = NamedTemporaryFile()
            for fname in (first_name, second_name):
                fhand.write(open(os.path.join(TEST_DATA_DIR, fname)).read())
            fhand.flush()
            seqs = read_seqs([fhand])

            paired_fhand = StringIO()
            orphan_fhand = StringIO()
            match_pairs(seqs, paired_fhand, orphan_fhand, 'fastq',
                        ordered=False)
            return paired_fhand.getvalue(), orphan_fhand.getvalue()

        # (first file, second file, titles expected in the paired output,
        #  titles expected in the orphan output)
        cases = (
            # with equal seqs but the last ones
            ('pairend1.sfastq', 'pairend2.sfastq',
             ('@seq1:136:FC706VJ:2:2104:15343:197393 1:Y:18:ATCACG',
              '@seq2:136:FC706VJ:2:2104:15343:197393 2:Y:18:ATCACG'),
             ('@seq8:136:FC706VJ:2:2104:15343:197393 2:Y:18:ATCACG',)),
            # with the firsts seqs different
            ('pairend1.sfastq', 'pairend3.sfastq',
             ('@seq4:136:FC706VJ:2:2104:15343:197393 1:Y:18:ATCACG',
              '@seq5:136:FC706VJ:2:2104:15343:197393 2:Y:18:ATCACG'),
             ('@seq1:136:FC706VJ:2:2104:15343:197393 1:Y:18:ATCACG',
              '@seq3:136:FC706VJ:2:2104:15343:197393 2:Y:18:ATCACG',
              '@seq6:136:FC706VJ:2:2104:15343:197393 2:Y:18:ATCACG')),
            ('pairend4.sfastq', 'pairend2.sfastq',
             ('@seq8:136:FC706VJ:2:2104:15343:197393 1:Y:18:ATCACG',
              '@seq8:136:FC706VJ:2:2104:15343:197393 2:Y:18:ATCACG'),
             ('@seq1:136:FC706VJ:2:2104:15343:197393 2:Y:18:ATCACG',
              '@seq2:136:FC706VJ:2:2104:15343:197393 2:Y:18:ATCACG')),
            # unordered file
            ('pairend1.sfastq', 'pairend2_unordered.sfastq',
             ('@seq1:136:FC706VJ:2:2104:15343:197393 1:Y:18:ATCACG',
              '@seq2:136:FC706VJ:2:2104:15343:197393 2:Y:18:ATCACG'),
             ('@seq8:136:FC706VJ:2:2104:15343:197393 2:Y:18:ATCACG',)),
            # with reads with no direction
            ('pairend7.sfastq', 'pairend2.sfastq',
             ('@seq8:136:FC706VJ:2:2104:15343:197393 1:Y:18:ATCACG',
              '@seq8:136:FC706VJ:2:2104:15343:197393 2:Y:18:ATCACG',
              '@seq1:136:FC706VJ:2:2104:15343:197393 2:Y:18:ATCACG',
              '@seq1:136:FC706VJ:2:2104:15343:197393 2:Y:18:ATCACG'),
             ('@seq6:136:FC706VJ:2:2104:15343:197393.mpl_1',
              '@seq7:136:FC706VJ:2:2104:15343:197393.hhhh',
              '@seq2:136:FC706VJ:2:2104:15343:197393 2:Y:18:ATCAC')),
        )

        for first_name, second_name, in_paired, in_orphans in cases:
            output, orp = match_concatenated(first_name, second_name)
            for title in in_paired:
                assert title in output
            for title in in_orphans:
                assert title in orp
Ejemplo n.º 30
0
    def test_pair_matcher(self):
        'It test the pair matcher function'
        # with equal seqs but the last ones
        file1 = os.path.join(TEST_DATA_DIR, 'pairend1.sfastq')
        file2 = os.path.join(TEST_DATA_DIR, 'pairend2.sfastq')
        fwd_seqs = read_seqs([open(file1)])
        rev_seqs = read_seqs([open(file2)])

        out_fhand = StringIO()
        orphan_out_fhand = StringIO()
        out_format = 'fastq'
        seqs = flat_zip_longest(fwd_seqs, rev_seqs)
        match_pairs(seqs, out_fhand, orphan_out_fhand, out_format)

        output = out_fhand.getvalue()
        assert '@seq1:136:FC706VJ:2:2104:15343:197393 1:Y:18:ATCACG' in output
        assert '@seq2:136:FC706VJ:2:2104:15343:197393 2:Y:18:ATCACG' in output
        orp = orphan_out_fhand.getvalue()
        assert '@seq8:136:FC706VJ:2:2104:15343:197393 2:Y:18:ATCACG' in orp

        # with the firsts seqs different
        file1 = os.path.join(TEST_DATA_DIR, 'pairend1.sfastq')
        file2 = os.path.join(TEST_DATA_DIR, 'pairend3.sfastq')
        fwd_seqs = read_seqs([open(file1)])
        rev_seqs = read_seqs([open(file2)])
        out_fhand = StringIO()
        orphan_out_fhand = StringIO()
        out_format = 'fastq'
        seqs = flat_zip_longest(fwd_seqs, rev_seqs)
        match_pairs(seqs, out_fhand, orphan_out_fhand, out_format)

        output = out_fhand.getvalue()
        assert '@seq4:136:FC706VJ:2:2104:15343:197393 1:Y:18:ATCACG' in output
        assert '@seq5:136:FC706VJ:2:2104:15343:197393 2:Y:18:ATCACG' in output
        orp = orphan_out_fhand.getvalue()
        assert '@seq1:136:FC706VJ:2:2104:15343:197393 1:Y:18:ATCACG' in orp
        assert '@seq3:136:FC706VJ:2:2104:15343:197393 2:Y:18:ATCACG' in orp
        assert '@seq6:136:FC706VJ:2:2104:15343:197393 2:Y:18:ATCACG' in orp

        file1 = os.path.join(TEST_DATA_DIR, 'pairend4.sfastq')
        file2 = os.path.join(TEST_DATA_DIR, 'pairend2.sfastq')
        fwd_seqs = read_seqs([open(file1)])
        rev_seqs = read_seqs([open(file2)])
        out_fhand = StringIO()
        orphan_out_fhand = StringIO()
        out_format = 'fastq'

        seqs = flat_zip_longest(fwd_seqs, rev_seqs)
        match_pairs(seqs, out_fhand, orphan_out_fhand, out_format)

        output = out_fhand.getvalue()
        assert '@seq8:136:FC706VJ:2:2104:15343:197393 1:Y:18:ATCACG' in output
        assert '@seq8:136:FC706VJ:2:2104:15343:197393 2:Y:18:ATCACG' in output
        orp = orphan_out_fhand.getvalue()
        assert '@seq1:136:FC706VJ:2:2104:15343:197393 2:Y:18:ATCACG' in orp
        assert '@seq2:136:FC706VJ:2:2104:15343:197393 2:Y:18:ATCACG' in orp

        # with reads with no direcction
        file1 = os.path.join(TEST_DATA_DIR, 'pairend7.sfastq')
        file2 = os.path.join(TEST_DATA_DIR, 'pairend2.sfastq')
        fwd_seqs = read_seqs([open(file1)])
        rev_seqs = read_seqs([open(file2)])
        out_fhand = StringIO()
        orphan_out_fhand = StringIO()
        out_format = 'fastq'

        seqs = flat_zip_longest(fwd_seqs, rev_seqs)
        match_pairs(seqs, out_fhand, orphan_out_fhand, out_format)
        output = out_fhand.getvalue()
        assert '@seq8:136:FC706VJ:2:2104:15343:197393 1:Y:18:ATCACG' in output
        assert '@seq8:136:FC706VJ:2:2104:15343:197393 2:Y:18:ATCACG' in output
        assert '@seq1:136:FC706VJ:2:2104:15343:197393 2:Y:18:ATCACG' in output
        assert '@seq1:136:FC706VJ:2:2104:15343:197393 2:Y:18:ATCACG' in output

        orp = orphan_out_fhand.getvalue()
        assert '@seq6:136:FC706VJ:2:2104:15343:197393.mpl_1' in orp
        assert '@seq7:136:FC706VJ:2:2104:15343:197393.hhhh' in orp
        assert '@seq2:136:FC706VJ:2:2104:15343:197393 2:Y:18:ATCAC' in orp

        # File is not sorted
        file1 = '''@s1.f
AACCAGTCAAC
+
CCCFFFFFGHH
@s2.f
AACCAGTCAAC
+
CCCFFFFFGHH
@s1.r
AACCAGTCAAC
+
CCCFFFFFGHH
'''
        file1 = StringIO(file1)
        set_format(file1, 'fastq')
        seqs = read_seqs([file1])
        out_fhand = StringIO()
        orphan_out_fhand = StringIO()
        out_format = 'fastq'

        try:
            match_pairs(seqs, out_fhand, orphan_out_fhand, out_format,
                        check_order_buffer_size=10)
            output = out_fhand.getvalue()
            self.fail('ItemsNotSortedError error expected')
        except ItemsNotSortedError:
            pass
Ejemplo n.º 31
0
    def test_pair_matcher(self):
        'It test the pair matcher function'
        # with equal seqs but the last ones
        file1 = os.path.join(TEST_DATA_DIR, 'pairend1.sfastq')
        file2 = os.path.join(TEST_DATA_DIR, 'pairend2.sfastq')
        fwd_seqs = read_seqs([open(file1)])
        rev_seqs = read_seqs([open(file2)])

        out_fhand = StringIO()
        orphan_out_fhand = StringIO()
        out_format = 'fastq'
        seqs = flat_zip_longest(fwd_seqs, rev_seqs)
        match_pairs(seqs, out_fhand, orphan_out_fhand, out_format)

        output = out_fhand.getvalue()
        assert '@seq1:136:FC706VJ:2:2104:15343:197393 1:Y:18:ATCACG' in output
        assert '@seq2:136:FC706VJ:2:2104:15343:197393 2:Y:18:ATCACG' in output
        orp = orphan_out_fhand.getvalue()
        assert '@seq8:136:FC706VJ:2:2104:15343:197393 2:Y:18:ATCACG' in orp

        # with the firsts seqs different
        file1 = os.path.join(TEST_DATA_DIR, 'pairend1.sfastq')
        file2 = os.path.join(TEST_DATA_DIR, 'pairend3.sfastq')
        fwd_seqs = read_seqs([open(file1)])
        rev_seqs = read_seqs([open(file2)])
        out_fhand = StringIO()
        orphan_out_fhand = StringIO()
        out_format = 'fastq'
        seqs = flat_zip_longest(fwd_seqs, rev_seqs)
        match_pairs(seqs, out_fhand, orphan_out_fhand, out_format)

        output = out_fhand.getvalue()
        assert '@seq4:136:FC706VJ:2:2104:15343:197393 1:Y:18:ATCACG' in output
        assert '@seq5:136:FC706VJ:2:2104:15343:197393 2:Y:18:ATCACG' in output
        orp = orphan_out_fhand.getvalue()
        assert '@seq1:136:FC706VJ:2:2104:15343:197393 1:Y:18:ATCACG' in orp
        assert '@seq3:136:FC706VJ:2:2104:15343:197393 2:Y:18:ATCACG' in orp
        assert '@seq6:136:FC706VJ:2:2104:15343:197393 2:Y:18:ATCACG' in orp

        file1 = os.path.join(TEST_DATA_DIR, 'pairend4.sfastq')
        file2 = os.path.join(TEST_DATA_DIR, 'pairend2.sfastq')
        fwd_seqs = read_seqs([open(file1)])
        rev_seqs = read_seqs([open(file2)])
        out_fhand = StringIO()
        orphan_out_fhand = StringIO()
        out_format = 'fastq'

        seqs = flat_zip_longest(fwd_seqs, rev_seqs)
        match_pairs(seqs, out_fhand, orphan_out_fhand, out_format)

        output = out_fhand.getvalue()
        assert '@seq8:136:FC706VJ:2:2104:15343:197393 1:Y:18:ATCACG' in output
        assert '@seq8:136:FC706VJ:2:2104:15343:197393 2:Y:18:ATCACG' in output
        orp = orphan_out_fhand.getvalue()
        assert '@seq1:136:FC706VJ:2:2104:15343:197393 2:Y:18:ATCACG' in orp
        assert '@seq2:136:FC706VJ:2:2104:15343:197393 2:Y:18:ATCACG' in orp

        # with reads with no direcction
        file1 = os.path.join(TEST_DATA_DIR, 'pairend7.sfastq')
        file2 = os.path.join(TEST_DATA_DIR, 'pairend2.sfastq')
        fwd_seqs = read_seqs([open(file1)])
        rev_seqs = read_seqs([open(file2)])
        out_fhand = StringIO()
        orphan_out_fhand = StringIO()
        out_format = 'fastq'

        seqs = flat_zip_longest(fwd_seqs, rev_seqs)
        match_pairs(seqs, out_fhand, orphan_out_fhand, out_format)
        output = out_fhand.getvalue()
        assert '@seq8:136:FC706VJ:2:2104:15343:197393 1:Y:18:ATCACG' in output
        assert '@seq8:136:FC706VJ:2:2104:15343:197393 2:Y:18:ATCACG' in output
        assert '@seq1:136:FC706VJ:2:2104:15343:197393 2:Y:18:ATCACG' in output
        assert '@seq1:136:FC706VJ:2:2104:15343:197393 2:Y:18:ATCACG' in output

        orp = orphan_out_fhand.getvalue()
        assert '@seq6:136:FC706VJ:2:2104:15343:197393.mpl_1' in orp
        assert '@seq7:136:FC706VJ:2:2104:15343:197393.hhhh' in orp
        assert '@seq2:136:FC706VJ:2:2104:15343:197393 2:Y:18:ATCAC' in orp

        # File is not sorted
        file1 = '''@s1.f
AACCAGTCAAC
+
CCCFFFFFGHH
@s2.f
AACCAGTCAAC
+
CCCFFFFFGHH
@s1.r
AACCAGTCAAC
+
CCCFFFFFGHH
'''
        file1 = StringIO(file1)
        set_format(file1, 'fastq')
        seqs = read_seqs([file1])
        out_fhand = StringIO()
        orphan_out_fhand = StringIO()
        out_format = 'fastq'

        try:
            match_pairs(seqs,
                        out_fhand,
                        orphan_out_fhand,
                        out_format,
                        check_order_buffer_size=10)
            output = out_fhand.getvalue()
            self.fail('ItemsNotSortedError error expected')
        except ItemsNotSortedError:
            pass
Ejemplo n.º 32
0
    def test_mate_pair_unorderer_checker():
        'It tests match_pairs with ordered=False on concatenated files'

        def match_concatenated(first_name, second_name):
            'It joins two test files, matches them and returns both outputs'
            fhand = NamedTemporaryFile()
            for fname in (first_name, second_name):
                fhand.write(open(os.path.join(TEST_DATA_DIR, fname)).read())
            fhand.flush()
            seqs = read_seqs([fhand])

            paired_fhand = StringIO()
            orphan_fhand = StringIO()
            match_pairs(seqs, paired_fhand, orphan_fhand, 'fastq',
                        ordered=False)
            return paired_fhand.getvalue(), orphan_fhand.getvalue()

        # (first file, second file, titles expected in the paired output,
        #  titles expected in the orphan output)
        cases = (
            # with equal seqs but the last ones
            ('pairend1.sfastq', 'pairend2.sfastq',
             ('@seq1:136:FC706VJ:2:2104:15343:197393 1:Y:18:ATCACG',
              '@seq2:136:FC706VJ:2:2104:15343:197393 2:Y:18:ATCACG'),
             ('@seq8:136:FC706VJ:2:2104:15343:197393 2:Y:18:ATCACG',)),
            # with the firsts seqs different
            ('pairend1.sfastq', 'pairend3.sfastq',
             ('@seq4:136:FC706VJ:2:2104:15343:197393 1:Y:18:ATCACG',
              '@seq5:136:FC706VJ:2:2104:15343:197393 2:Y:18:ATCACG'),
             ('@seq1:136:FC706VJ:2:2104:15343:197393 1:Y:18:ATCACG',
              '@seq3:136:FC706VJ:2:2104:15343:197393 2:Y:18:ATCACG',
              '@seq6:136:FC706VJ:2:2104:15343:197393 2:Y:18:ATCACG')),
            ('pairend4.sfastq', 'pairend2.sfastq',
             ('@seq8:136:FC706VJ:2:2104:15343:197393 1:Y:18:ATCACG',
              '@seq8:136:FC706VJ:2:2104:15343:197393 2:Y:18:ATCACG'),
             ('@seq1:136:FC706VJ:2:2104:15343:197393 2:Y:18:ATCACG',
              '@seq2:136:FC706VJ:2:2104:15343:197393 2:Y:18:ATCACG')),
            # unordered file
            ('pairend1.sfastq', 'pairend2_unordered.sfastq',
             ('@seq1:136:FC706VJ:2:2104:15343:197393 1:Y:18:ATCACG',
              '@seq2:136:FC706VJ:2:2104:15343:197393 2:Y:18:ATCACG'),
             ('@seq8:136:FC706VJ:2:2104:15343:197393 2:Y:18:ATCACG',)),
            # with reads with no direction
            ('pairend7.sfastq', 'pairend2.sfastq',
             ('@seq8:136:FC706VJ:2:2104:15343:197393 1:Y:18:ATCACG',
              '@seq8:136:FC706VJ:2:2104:15343:197393 2:Y:18:ATCACG',
              '@seq1:136:FC706VJ:2:2104:15343:197393 2:Y:18:ATCACG',
              '@seq1:136:FC706VJ:2:2104:15343:197393 2:Y:18:ATCACG'),
             ('@seq6:136:FC706VJ:2:2104:15343:197393.mpl_1',
              '@seq7:136:FC706VJ:2:2104:15343:197393.hhhh',
              '@seq2:136:FC706VJ:2:2104:15343:197393 2:Y:18:ATCAC')),
        )

        for first_name, second_name, in_paired, in_orphans in cases:
            output, orp = match_concatenated(first_name, second_name)
            for title in in_paired:
                assert title in output
            for title in in_orphans:
                assert title in orp