def test_split_fasta_diff_num_seqs_per_file_alt(self):
    """split_fasta never drops sequences, whatever the chunk size."""
    # Start with 59 seqs: 59 is prime, so most chunk sizes leave an
    # uneven final file.
    in_seqs = LoadSeqs(data=[('seq%s' % k, 'AACCTTAA')
                             for k in range(59)])
    infile = in_seqs.toFasta().split('\n')

    # Test seqs_per_file from 1 to 999.
    for i in range(1, 1000):
        fd, filename_prefix = mkstemp(dir=get_qiime_temp_dir(),
                                      prefix='split_fasta_tests',
                                      suffix='')
        close(fd)
        # mkstemp leaves an empty placeholder file on disk; track it so it
        # is deleted together with the split outputs.
        created = [filename_prefix]
        try:
            actual = split_fasta(infile, i, filename_prefix)
            created.extend(actual)
            actual_seqs = []
            for fp in actual:
                # Close each split file promptly instead of leaking the
                # handle.
                with open(fp) as split_file:
                    actual_seqs.extend(split_file)
        finally:
            # Clean up before asserting (and even if reading fails), so a
            # failing test does not leave files behind.
            remove_files(created)

        # Building seq collections from infile and from the split files
        # must give equivalent collections.
        self.assertEqual(LoadSeqs(data=infile, aligned=False),
                         LoadSeqs(data=actual_seqs, aligned=False))
def test_split_fasta_diff_num_seqs_per_file_alt(self):
    """split_fasta always accounts for every input sequence."""
    # 59 input seqs: a prime count, so chunk sizes rarely divide it evenly.
    in_seqs = LoadSeqs(
        data=[('seq%s' % k, 'AACCTTAA') for k in range(59)])
    infile = in_seqs.toFasta().split('\n')

    # Exercise every seqs_per_file value from 1 to 999.
    for i in range(1, 1000):
        fd, filename_prefix = mkstemp(dir=get_qiime_temp_dir(),
                                      prefix='split_fasta_tests',
                                      suffix='')
        close(fd)
        # The placeholder created by mkstemp must be removed too, otherwise
        # every iteration leaves a stray temp file.
        to_remove = [filename_prefix]
        actual_seqs = []
        try:
            actual = split_fasta(infile, i, filename_prefix)
            to_remove.extend(actual)
            for fp in actual:
                # Context manager closes the handle as soon as it is read.
                with open(fp) as handle:
                    actual_seqs.extend(handle)
        finally:
            # Remove the files before asserting, so they are cleaned up
            # even if the test fails.
            remove_files(to_remove)

        # Seq collections built from the input and from the split files
        # must be equivalent.
        self.assertEqual(
            LoadSeqs(data=infile, aligned=False),
            LoadSeqs(data=actual_seqs, aligned=False))
def test_split_fasta_diff_num_seqs_per_file(self):
    """split_fasta funcs as expected when diff num seqs go to each file."""
    fd, filename_prefix = mkstemp(dir=get_qiime_temp_dir(),
                                  prefix='split_fasta_tests',
                                  suffix='')
    close(fd)
    # Three records (seq2 spans two lines) split two-per-file: the files
    # end up holding different numbers of sequences.
    infile = [
        '>seq1', 'AACCTTAA',
        '>seq2', 'TTAACC', 'AATTAA',
        '>seq3', 'CCTT--AA',
    ]

    # mkstemp leaves an empty placeholder on disk; delete it along with
    # the split outputs.
    created = [filename_prefix]
    try:
        actual = split_fasta(infile, 2, filename_prefix)
        created.extend(actual)
        actual_seqs = []
        for fp in actual:
            # Close each split file promptly rather than leaking the handle.
            with open(fp) as split_file:
                actual_seqs.extend(split_file)
    finally:
        # Clean up before asserting so failures leave nothing behind.
        remove_files(created)

    expected = ['%s.%d.fasta' % (filename_prefix, i) for i in range(2)]
    # List of file paths is as expected.
    self.assertEqual(actual, expected)
    # Building seq collections from infile and from the split files
    # results in equivalent seq collections.
    self.assertEqual(LoadSeqs(data=infile, aligned=False),
                     LoadSeqs(data=actual_seqs, aligned=False))
def test_split_fasta_diff_num_seqs_per_file(self):
    """split_fasta funcs as expected when diff num seqs go to each file."""
    fd, filename_prefix = mkstemp(dir=get_qiime_temp_dir(),
                                  prefix='split_fasta_tests',
                                  suffix='')
    close(fd)
    infile = ['>seq1', 'AACCTTAA', '>seq2', 'TTAACC', 'AATTAA',
              '>seq3', 'CCTT--AA']

    # Track everything written to disk, including the mkstemp placeholder,
    # so a single cleanup call removes it all.
    to_remove = [filename_prefix]
    actual_seqs = []
    try:
        actual = split_fasta(infile, 2, filename_prefix)
        to_remove.extend(actual)
        for fp in actual:
            # Context manager closes the handle as soon as it is read.
            with open(fp) as handle:
                actual_seqs.extend(handle)
    finally:
        # Runs even if splitting or reading raises.
        remove_files(to_remove)

    expected = ['%s.%d.fasta' % (filename_prefix, i) for i in range(2)]
    # List of file paths is as expected.
    self.assertEqual(actual, expected)
    # Seq collections built from infile and from the split files must be
    # equivalent.
    self.assertEqual(
        LoadSeqs(data=infile, aligned=False),
        LoadSeqs(data=actual_seqs, aligned=False))
def test_split_fasta_equal_num_seqs_per_file(self):
    """split_fasta funcs as expected when equal num seqs go to each file."""
    fd, filename_prefix = mkstemp(dir=get_qiime_temp_dir(),
                                  prefix='split_fasta_tests',
                                  suffix='')
    close(fd)
    infile = [
        '>seq1', 'AACCTTAA',
        '>seq2', 'TTAACC', 'AATTAA',
        '>seq3', 'CCTT--AA',
    ]

    # mkstemp leaves an empty placeholder on disk; delete it along with
    # the split outputs.
    created = [filename_prefix]
    try:
        actual = split_fasta(infile, 1, filename_prefix)
        created.extend(actual)
        actual_seqs = []
        for fp in actual:
            # Close each split file promptly rather than leaking the handle.
            with open(fp) as split_file:
                actual_seqs.extend(split_file)
    finally:
        # Clean up before asserting so failures leave nothing behind.
        remove_files(created)

    # One sequence per file, so three output files are expected.
    expected = ['%s.%d.fasta' % (filename_prefix, i) for i in range(3)]
    self.assertEqual(actual, expected)
    # The split files, read back together, must hold the same sequences
    # as the input.
    self.assertEqual(
        SequenceCollection.from_fasta_records(parse_fasta(infile), DNA),
        SequenceCollection.from_fasta_records(parse_fasta(actual_seqs),
                                              DNA))
def _split_fasta(self, input_fp, params, jobs_to_start, job_prefix,
                 output_dir):
    """Split the fasta file at input_fp into per-job chunk files.

    input_fp: path to the fasta file to split
    params: not used by this method
    jobs_to_start: passed to self._compute_seqs_per_file to determine how
     many sequences go into each chunk
    job_prefix: filename prefix handed to split_fasta
    output_dir: passed to split_fasta as working_dir

    Returns a tuple (tmp_fasta_fps, True), where tmp_fasta_fps is whatever
    split_fasta returns (the list of resulting files).
    NOTE(review): the constant True presumably tells the caller the chunk
    files are temporary and may be removed -- confirm against the caller.
    """
    # Compute the number of sequences that should be included in each
    # file after splitting the input fasta file.
    num_seqs_per_file = self._compute_seqs_per_file(input_fp,
                                                    jobs_to_start)

    # Split the fasta file and get the list of resulting files. The input
    # handle is closed deterministically instead of being left to the
    # garbage collector.
    # NOTE(review): relies on split_fasta consuming the handle fully
    # before it returns.
    with open(input_fp) as input_f:
        tmp_fasta_fps = split_fasta(input_f,
                                    num_seqs_per_file,
                                    job_prefix,
                                    working_dir=output_dir)

    return tmp_fasta_fps, True
def _split_fasta(self, input_fp, params, jobs_to_start, job_prefix,
                 output_dir):
    """Break the input fasta file into smaller files, one set per job.

    input_fp: path of the fasta file to be split
    params: accepted but not read here
    jobs_to_start: forwarded to self._compute_seqs_per_file, which decides
     the number of sequences per output file
    job_prefix: prefix given to split_fasta for naming its output
    output_dir: given to split_fasta as its working_dir

    Returns (tmp_fasta_fps, True): the value returned by split_fasta (the
    list of resulting files) paired with a constant True.
    NOTE(review): the True looks like a "clean these up afterwards" flag --
    verify with the code that consumes this return value.
    """
    # Number of sequences each split file should contain.
    num_seqs_per_file = self._compute_seqs_per_file(
        input_fp, jobs_to_start)

    # Use a context manager so the input file handle is always closed,
    # including when split_fasta raises.
    # NOTE(review): assumes split_fasta finishes reading the handle before
    # returning.
    with open(input_fp) as fasta_f:
        tmp_fasta_fps = split_fasta(fasta_f, num_seqs_per_file,
                                    job_prefix, working_dir=output_dir)

    return tmp_fasta_fps, True
def test_split_fasta_equal_num_seqs_per_file(self):
    """split_fasta funcs as expected when equal num seqs go to each file."""
    fd, filename_prefix = mkstemp(dir=get_qiime_temp_dir(),
                                  prefix='split_fasta_tests',
                                  suffix='')
    close(fd)
    infile = ['>seq1', 'AACCTTAA', '>seq2', 'TTAACC', 'AATTAA',
              '>seq3', 'CCTT--AA']

    # Track everything written to disk, including the mkstemp placeholder,
    # so a single cleanup call removes it all.
    to_remove = [filename_prefix]
    actual_seqs = []
    try:
        actual = split_fasta(infile, 1, filename_prefix)
        to_remove.extend(actual)
        for fp in actual:
            # Context manager closes the handle as soon as it is read.
            with open(fp) as handle:
                actual_seqs.extend(handle)
    finally:
        # Runs even if splitting or reading raises.
        remove_files(to_remove)

    # One sequence per file, so three output files are expected.
    expected = ['%s.%d.fasta' % (filename_prefix, i) for i in range(3)]
    self.assertEqual(actual, expected)
    # Seq collections built from infile and from the split files must be
    # equivalent.
    self.assertEqual(
        LoadSeqs(data=infile, aligned=False),
        LoadSeqs(data=actual_seqs, aligned=False))
def test_split_fasta_equal_num_seqs_per_file(self):
    """split_fasta funcs as expected when equal num seqs go to each file."""
    fd, filename_prefix = mkstemp(dir=get_qiime_temp_dir(),
                                  prefix='split_fasta_tests',
                                  suffix='')
    close(fd)
    infile = ['>seq1', 'AACCTTAA', '>seq2', 'TTAACC', 'AATTAA',
              '>seq3', 'CCTT--AA']

    # The empty file created by mkstemp is only used for its unique name;
    # remove it together with the split outputs.
    paths_to_clean = [filename_prefix]
    actual_seqs = []
    try:
        actual = split_fasta(infile, 1, filename_prefix)
        paths_to_clean.extend(actual)
        for fp in actual:
            # Read via a context manager so no file handle is leaked.
            with open(fp) as fasta_f:
                actual_seqs.extend(fasta_f)
    finally:
        # Cleanup happens before the assertions and on any error.
        remove_files(paths_to_clean)

    # Splitting one sequence per file yields three files.
    expected = ['%s.%d.fasta' % (filename_prefix, i) for i in range(3)]
    self.assertEqual(actual, expected)
    # Input and concatenated split output must parse to equal collections.
    self.assertEqual(
        SequenceCollection.from_fasta_records(parse_fasta(infile), DNA),
        SequenceCollection.from_fasta_records(parse_fasta(actual_seqs),
                                              DNA))
def test_split_fasta_equal_num_seqs_per_file(self):
    """split_fasta funcs as expected when equal num seqs go to each file."""
    filename_prefix = get_tmp_filename(tmp_dir=get_qiime_temp_dir(),
                                       prefix='split_fasta_tests',
                                       suffix='',
                                       result_constructor=str)
    infile = ['>seq1', 'AACCTTAA', '>seq2', 'TTAACC', 'AATTAA',
              '>seq3', 'CCTT--AA']

    actual = split_fasta(infile, 1, filename_prefix)
    actual_seqs = []
    try:
        for fp in actual:
            # Read via a context manager so no file handle is leaked.
            with open(fp) as fasta_f:
                actual_seqs.extend(fasta_f)
    finally:
        # Remove the split files before asserting, and even if reading
        # one of them fails.
        remove_files(actual)

    # Splitting one sequence per file yields three files.
    expected = ['%s.%d.fasta' % (filename_prefix, i) for i in range(3)]
    self.assertEqual(actual, expected)
    # Seq collections built from infile and from the split files must be
    # equivalent.
    self.assertEqual(
        LoadSeqs(data=infile, aligned=False),
        LoadSeqs(data=actual_seqs, aligned=False))