Esempio n. 1
0
    def test_sort_by_position_in_ref(self):
        """Check the read order yielded by sort_fastx_files.

        The same six reads are written first as fasta and then as fastq and
        sorted with the 'coordinate' key against the ref_example.fasta
        index.  The fastq file is finally sorted with the 'seq' key.
        """
        index_fpath = os.path.join(TEST_DATA_DIR, 'ref_example.fasta')

        # Records listed in the order in which they are written to the file.
        fasta_records = (
            '>seq6\nAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA\n',
            '>seq1\nGAGAATTAAGCCTATCTGGAGAGCGGTACCAACAGGGAAACACCGACTCA\n',
            '>seq2\nTAACAGTATGTGCCGTAGGGCCGTCGCCGCATCCACGTTATCGGAAGGGC\n',
            '>seq3\nTACGGCCGTCCCCCTGCTGCTTATCATCAGGCGACGATAGTCAGCTCCGC\n',
            '>seq4\nTGCAGAGACCGACATGCGAAAGGAGTGACTATCACCGTCAATGGCGTGCC\n',
            '>seq5\nAATAAATAATCTGGGTATGTACTCGGAGTCTACGTAAGCGCGCTTAAATT\n',
        )
        # Separator plus quality line appended to turn a fasta record
        # into a fastq one.
        quality = '+\n??????????????????????????????????????????????????\n'
        fastq_records = tuple(record + quality for record in fasta_records)

        by_coordinate = ['seq2', 'seq3', 'seq1', 'seq5', 'seq4', 'seq6']

        # First pass uses the fasta records, second pass the fastq ones.
        # Note kept from the original author (written after the fasta
        # check): it fails because bwa somehow gives a position to an
        # unmapped seq
        for records in (fasta_records, fastq_records):
            in_fhand = NamedTemporaryFile()
            in_fhand.write(''.join(records))
            in_fhand.flush()

            assert [get_name(read)
                    for read in sort_fastx_files(in_fhand, 'coordinate',
                                                 index_fpath)] == by_coordinate

        # sort by sequence; in_fhand is still the fastq file written in the
        # last pass.  The expected names follow the lexicographic order of
        # the sequences.
        by_sequence = ['seq6', 'seq5', 'seq1', 'seq2', 'seq3', 'seq4']
        assert [get_name(read)
                for read in sort_fastx_files([in_fhand],
                                             key='seq',
                                             directory=None,
                                             max_items_in_memory=None,
                                             tempdir=None)] == by_sequence
Esempio n. 2
0
    def test_sort_by_position_in_ref(self):
        """Check the read order yielded by sort_fastx_files.

        GENOME is written to a temporary file that is used as the reference.
        Six reads are sorted with the 'coordinate' key, first from a fasta
        file and then from a fastq file, and the fastq file is finally
        sorted with the 'seq' key.
        """
        def flushed_tempfile(content):
            # Temporary file holding content, flushed so that it can be
            # read through its name.
            fhand = NamedTemporaryFile()
            fhand.write(content)
            fhand.flush()
            return fhand

        ref_fhand = flushed_tempfile(GENOME)

        # Records listed in the order in which they are written to the file.
        fasta_records = [
            '>seq6\nAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA\n',
            '>seq1\nGAGAATTAAGCCTATCTGGAGAGCGGTACCAACAGGGAAACACCGACTCA\n',
            '>seq2\nTAACAGTATGTGCCGTAGGGCCGTCGCCGCATCCACGTTATCGGAAGGGC\n',
            '>seq3\nTACGGCCGTCCCCCTGCTGCTTATCATCAGGCGACGATAGTCAGCTCCGC\n',
            '>seq4\nTGCAGAGACCGACATGCGAAAGGAGTGACTATCACCGTCAATGGCGTGCC\n',
            '>seq5\nAATAAATAATCTGGGTATGTACTCGGAGTCTACGTAAGCGCGCTTAAATT\n',
        ]
        by_coordinate = ['seq2', 'seq3', 'seq1', 'seq5', 'seq4', 'seq6']

        # with fasta format
        in_fhand = flushed_tempfile(''.join(fasta_records))
        fasta_order = [get_name(read)
                       for read in sort_fastx_files([in_fhand], 'coordinate',
                                                    ref_fhand.name)]
        assert fasta_order == by_coordinate

        # with fastq format: every record gets a separator and a quality line
        quality = '+\n??????????????????????????????????????????????????\n'
        fastq_records = [record + quality for record in fasta_records]
        in_fhand = flushed_tempfile(''.join(fastq_records))
        fastq_order = [get_name(read)
                       for read in sort_fastx_files([in_fhand], 'coordinate',
                                                    ref_fhand.name)]
        assert fastq_order == by_coordinate

        # sort by sequence, reusing the fastq file.  The expected names
        # follow the lexicographic order of the sequences.
        seq_order = [get_name(read)
                     for read in sort_fastx_files([in_fhand], key='seq',
                                                  directory=None,
                                                  max_items_in_memory=None,
                                                  tempdir=None)]
        assert seq_order == ['seq6', 'seq5', 'seq1', 'seq2', 'seq3', 'seq4']
Esempio n. 3
0
    def test_sort_by_position_in_ref(self):
        """Check the read order yielded by sort_fastx_files.

        Six reads are sorted with the 'coordinate' key against the
        ref_example.fasta index, first from a fasta file and then from a
        fastq file.  The fastq file is finally sorted with the 'seq' key.
        """
        def names_sorted(*args, **kwargs):
            # Names of the reads, in the order sort_fastx_files yields them.
            return [get_name(read)
                    for read in sort_fastx_files(*args, **kwargs)]

        def flushed_tempfile(records):
            # Temporary file with the concatenated records, flushed to disk.
            fhand = NamedTemporaryFile()
            fhand.write(''.join(records))
            fhand.flush()
            return fhand

        index_fpath = os.path.join(TEST_DATA_DIR, 'ref_example.fasta')
        coordinate_order = ['seq2', 'seq3', 'seq1', 'seq5', 'seq4', 'seq6']

        # with fasta format; records are listed in the order in which they
        # are written to the file
        records = [
            '>seq6\nAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA\n',
            '>seq1\nGAGAATTAAGCCTATCTGGAGAGCGGTACCAACAGGGAAACACCGACTCA\n',
            '>seq2\nTAACAGTATGTGCCGTAGGGCCGTCGCCGCATCCACGTTATCGGAAGGGC\n',
            '>seq3\nTACGGCCGTCCCCCTGCTGCTTATCATCAGGCGACGATAGTCAGCTCCGC\n',
            '>seq4\nTGCAGAGACCGACATGCGAAAGGAGTGACTATCACCGTCAATGGCGTGCC\n',
            '>seq5\nAATAAATAATCTGGGTATGTACTCGGAGTCTACGTAAGCGCGCTTAAATT\n',
        ]
        in_fhand = flushed_tempfile(records)
        assert (names_sorted(in_fhand, 'coordinate', index_fpath) ==
                coordinate_order)
        # Note kept from the original author: it fails because bwa somehow
        # gives a position to an unmapped seq

        # with fastq format: append a separator and a quality line to
        # every record
        quality = '+\n??????????????????????????????????????????????????\n'
        records = [record + quality for record in records]
        in_fhand = flushed_tempfile(records)
        assert (names_sorted(in_fhand, 'coordinate', index_fpath) ==
                coordinate_order)

        # sort by sequence, reusing the fastq file.  The expected names
        # follow the lexicographic order of the sequences.
        sequence_order = ['seq6', 'seq5', 'seq1', 'seq2', 'seq3', 'seq4']
        assert names_sorted([in_fhand], key='seq', directory=None,
                            max_items_in_memory=None,
                            tempdir=None) == sequence_order