def test_sort_by_position_in_ref(self):
    """Check the read order produced by sort_fastx_files.

    The same six reads are sorted by 'coordinate' against the reference
    at TEST_DATA_DIR/ref_example.fasta, first from a FASTA file and then
    from a FASTQ file, and finally the FASTQ reads are sorted by 'seq'.
    """
    index_fpath = os.path.join(TEST_DATA_DIR, 'ref_example.fasta')

    def write_temp(text):
        # The data is flushed so the file can be read through its path
        # while the handle stays open.
        fhand = NamedTemporaryFile()
        fhand.write(text)
        fhand.flush()
        return fhand

    # Records are listed in the order they are written to the input
    # file: seq6 first, then seq1 to seq5.
    fasta_records = [
        '>seq6\nAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA\n',
        '>seq1\nGAGAATTAAGCCTATCTGGAGAGCGGTACCAACAGGGAAACACCGACTCA\n',
        '>seq2\nTAACAGTATGTGCCGTAGGGCCGTCGCCGCATCCACGTTATCGGAAGGGC\n',
        '>seq3\nTACGGCCGTCCCCCTGCTGCTTATCATCAGGCGACGATAGTCAGCTCCGC\n',
        '>seq4\nTGCAGAGACCGACATGCGAAAGGAGTGACTATCACCGTCAATGGCGTGCC\n',
        '>seq5\nAATAAATAATCTGGGTATGTACTCGGAGTCTACGTAAGCGCGCTTAAATT\n',
    ]
    by_coordinate = ['seq2', 'seq3', 'seq1', 'seq5', 'seq4', 'seq6']

    # with fasta format
    in_fhand = write_temp(''.join(fasta_records))
    sorted_names = [get_name(seq) for seq in
                    sort_fastx_files(in_fhand, 'coordinate', index_fpath)]
    assert sorted_names == by_coordinate
    # it fails because bwa somehow gives a position to an unmapped seq

    # with fastq format: every record gets the same quality block
    quality = '+\n??????????????????????????????????????????????????\n'
    fastq_records = [record + quality for record in fasta_records]
    in_fhand = write_temp(''.join(fastq_records))
    sorted_names = [get_name(seq) for seq in
                    sort_fastx_files(in_fhand, 'coordinate', index_fpath)]
    assert sorted_names == by_coordinate

    # sort by sequence, reusing the fastq file
    by_sequence = ['seq6', 'seq5', 'seq1', 'seq2', 'seq3', 'seq4']
    sorted_names = [get_name(seq) for seq in
                    sort_fastx_files([in_fhand], key='seq', directory=None,
                                     max_items_in_memory=None,
                                     tempdir=None)]
    assert sorted_names == by_sequence
def test_sort_by_position_in_ref(self):
    """Check the read order produced by sort_fastx_files.

    GENOME is written to a temporary file that serves as the reference.
    The same six reads are sorted by 'coordinate' against it, first from
    a FASTA file and then from a FASTQ file, and finally the FASTQ reads
    are sorted by 'seq'.
    """
    def write_temp(text):
        # The data is flushed so the file can be read through its path
        # while the handle stays open.
        fhand = NamedTemporaryFile()
        fhand.write(text)
        fhand.flush()
        return fhand

    ref_fhand = write_temp(GENOME)

    # Records are listed in the order they are written to the input
    # file: seq6 first, then seq1 to seq5.
    fasta_records = [
        '>seq6\nAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA\n',
        '>seq1\nGAGAATTAAGCCTATCTGGAGAGCGGTACCAACAGGGAAACACCGACTCA\n',
        '>seq2\nTAACAGTATGTGCCGTAGGGCCGTCGCCGCATCCACGTTATCGGAAGGGC\n',
        '>seq3\nTACGGCCGTCCCCCTGCTGCTTATCATCAGGCGACGATAGTCAGCTCCGC\n',
        '>seq4\nTGCAGAGACCGACATGCGAAAGGAGTGACTATCACCGTCAATGGCGTGCC\n',
        '>seq5\nAATAAATAATCTGGGTATGTACTCGGAGTCTACGTAAGCGCGCTTAAATT\n',
    ]
    by_coordinate = ['seq2', 'seq3', 'seq1', 'seq5', 'seq4', 'seq6']

    # with fasta format
    in_fhand = write_temp(''.join(fasta_records))
    sorted_names = [get_name(seq) for seq in
                    sort_fastx_files([in_fhand], 'coordinate',
                                     ref_fhand.name)]
    assert sorted_names == by_coordinate

    # with fastq format: every record gets the same quality block
    quality = '+\n??????????????????????????????????????????????????\n'
    fastq_records = [record + quality for record in fasta_records]
    in_fhand = write_temp(''.join(fastq_records))
    sorted_names = [get_name(seq) for seq in
                    sort_fastx_files([in_fhand], 'coordinate',
                                     ref_fhand.name)]
    assert sorted_names == by_coordinate

    # sort by sequence, reusing the fastq file
    by_sequence = ['seq6', 'seq5', 'seq1', 'seq2', 'seq3', 'seq4']
    sorted_names = [get_name(seq) for seq in
                    sort_fastx_files([in_fhand], key='seq', directory=None,
                                     max_items_in_memory=None,
                                     tempdir=None)]
    assert sorted_names == by_sequence
def test_sort_by_position_in_ref(self):
    """Verify coordinate and sequence ordering from sort_fastx_files.

    Six reads are written to a temporary FASTA file and sorted by
    'coordinate' using the reference at TEST_DATA_DIR/ref_example.fasta.
    The check is repeated with the reads in FASTQ format, and the FASTQ
    file is then sorted with key='seq'.
    """
    index_fpath = os.path.join(TEST_DATA_DIR, 'ref_example.fasta')

    def names_in_output_order(seqs):
        # Collect the name of every sequence in the order it is yielded.
        return [get_name(seq) for seq in seqs]

    # Input order is seq6, seq1, seq2, seq3, seq4, seq5.
    reads = (
        '>seq6\nAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA\n',
        '>seq1\nGAGAATTAAGCCTATCTGGAGAGCGGTACCAACAGGGAAACACCGACTCA\n',
        '>seq2\nTAACAGTATGTGCCGTAGGGCCGTCGCCGCATCCACGTTATCGGAAGGGC\n',
        '>seq3\nTACGGCCGTCCCCCTGCTGCTTATCATCAGGCGACGATAGTCAGCTCCGC\n',
        '>seq4\nTGCAGAGACCGACATGCGAAAGGAGTGACTATCACCGTCAATGGCGTGCC\n',
        '>seq5\nAATAAATAATCTGGGTATGTACTCGGAGTCTACGTAAGCGCGCTTAAATT\n',
    )
    expected_by_coordinate = ['seq2', 'seq3', 'seq1', 'seq5', 'seq4',
                              'seq6']

    # with fasta format
    in_fhand = NamedTemporaryFile()
    in_fhand.write(''.join(reads))
    in_fhand.flush()
    found = names_in_output_order(
        sort_fastx_files(in_fhand, 'coordinate', index_fpath))
    assert found == expected_by_coordinate
    # it fails because bwa somehow gives a position to an unmapped seq

    # with fastq format: append the same quality block to each read
    quality = '+\n??????????????????????????????????????????????????\n'
    in_fhand = NamedTemporaryFile()
    in_fhand.write(''.join(read + quality for read in reads))
    in_fhand.flush()
    found = names_in_output_order(
        sort_fastx_files(in_fhand, 'coordinate', index_fpath))
    assert found == expected_by_coordinate

    # sort by sequence, reading the fastq file written above
    expected_by_sequence = ['seq6', 'seq5', 'seq1', 'seq2', 'seq3', 'seq4']
    found = names_in_output_order(
        sort_fastx_files([in_fhand], key='seq', directory=None,
                         max_items_in_memory=None, tempdir=None))
    assert found == expected_by_sequence