def __init__(self, target_files_dict, num_threads_to_use=1, progress=progress, run=run):
    """Set up an HMM run by splitting every target FASTA into parts.

    For each entry of `target_files_dict` (source name -> FASTA path) a fresh
    temporary directory is requested from `filesnpaths`, remembered in
    `self.tmp_dirs`, and handed to `utils.split_fasta` as the location/prefix
    of the part files. Whatever `utils.split_fasta` returns is stored under
    the same source name in `self.target_files_dict`.

    Parameters
    ==========
    target_files_dict : dict
        Maps a source name to the path of its FASTA file.
    num_threads_to_use : int
        Passed to `utils.split_fasta` as `parts`.
    progress, run
        Stored on the instance as-is.
    """
    self.run = run
    self.progress = progress
    self.num_threads_to_use = num_threads_to_use

    self.tmp_dirs = []
    self.target_files_dict = {}

    for source, fasta_path in target_files_dict.items():
        # every source gets its own scratch directory for its part files
        scratch_dir = filesnpaths.get_temp_directory_path()
        self.tmp_dirs.append(scratch_dir)

        # part files keep the original file name, but live in the scratch dir
        part_prefix = os.path.join(scratch_dir, os.path.basename(fasta_path))
        self.target_files_dict[source] = utils.split_fasta(
            fasta_path,
            parts=self.num_threads_to_use,
            prefix=part_prefix,
        )
def __init__(self, target_files_dict, num_threads_to_use=1, program_to_use='hmmscan', progress=progress, run=run):
    """Set up an HMM run: validate the program and split the target FASTAs.

    The requested program must be one of `hmmscan` or `hmmsearch`. After
    that check, each entry of `target_files_dict` (source name -> FASTA
    path) gets its own temporary directory (tracked in `self.tmp_dirs`) and
    is passed to `utils.split_fasta`; the value it returns is stored under
    the same source name in `self.target_files_dict`.

    Parameters
    ==========
    target_files_dict : dict
        Maps a source name to the path of its FASTA file.
    num_threads_to_use : int
        Passed to `utils.split_fasta` as `parts`.
    program_to_use : str
        Either 'hmmscan' or 'hmmsearch'.
    progress, run
        Stored on the instance as-is.

    Raises
    ======
    ConfigError
        If `program_to_use` is not one of the supported programs.

    Notes
    =====
    - HMMer user guide: http://eddylab.org/software/hmmer/Userguide.pdf
    """
    self.run = run
    self.progress = progress
    self.program_to_use = program_to_use
    self.num_threads_to_use = num_threads_to_use

    # These are set before validation on purpose, so the instance carries
    # the same attributes as before if the check below raises.
    self.tmp_dirs = []
    self.target_files_dict = {}

    supported_programs = ["hmmscan", "hmmsearch"]
    if self.program_to_use not in supported_programs:
        details = (self.program_to_use, ", ".join(supported_programs))
        raise ConfigError("HMMer class here. You are attempting to use the program %s to run HMMs, but we don't recognize it. The currently "
                          "supported programs are: %s" % details)

    for source, fasta_path in target_files_dict.items():
        # every source gets its own scratch directory for its part files
        scratch_dir = filesnpaths.get_temp_directory_path()
        self.tmp_dirs.append(scratch_dir)

        # part files keep the original file name, but live in the scratch dir
        part_prefix = os.path.join(scratch_dir, os.path.basename(fasta_path))
        self.target_files_dict[source] = utils.split_fasta(
            fasta_path,
            parts=self.num_threads_to_use,
            prefix=part_prefix,
        )
def test_custom_prefix(self):
    """A one-part split with an explicit `prefix` must yield `<prefix>.0`."""
    custom_prefix = os.path.join(self.this_dir, 'silly')

    produced = split_fasta(self.five_seq_fasta, parts=1, prefix=custom_prefix)

    wanted = [os.path.join(self.this_dir, 'silly.0')]
    self.assertEqual(produced, wanted)

    # tidy up the files the split left behind
    for path in produced:
        os.remove(path)
def test_more_parts_than_sequences(self):
    """Asking for more parts than sequences must yield one file per sequence.

    Ten parts are requested for the five-sequence fixture; the test expects
    exactly five output files, suffixed `.0` through `.4`.
    """
    requested_parts = 10
    sequence_count = 5

    fasta_name = self.five_seq_fasta
    wanted = []
    for index in range(sequence_count):
        wanted.append(os.path.join(self.test_files, f'{fasta_name}.{index}'))

    produced = split_fasta(self.five_seq_fasta, parts=requested_parts)
    self.assertEqual(produced, wanted)

    # tidy up the files the split left behind
    for path in produced:
        os.remove(path)
def test_custom_prefix(self):
    """A one-part split with `file_name_prefix` and `output_dir` must yield
    `<output_dir>/<file_name_prefix>.0`.
    """
    produced = split_fasta(
        self.five_seq_fasta,
        parts=1,
        file_name_prefix='silly',
        output_dir=self.this_dir,
    )

    wanted = [os.path.join(self.this_dir, 'silly.0')]
    self.assertEqual(produced, wanted)

    # tidy up the files the split left behind
    for path in produced:
        os.remove(path)
def test_shuffle_mode(self):
    """Check which records land in which part when `shuffle=True`.

    The five-sequence fixture is split into two parts. The first part is
    expected to hold seq1, seq3 and seq5, the second seq2 and seq4.

    The reader handles and the split files are released in `finally`
    blocks so that a failing assertion does not leave open handles or
    stray files behind.
    """
    def check_contents(path, ids, sequences):
        # Close the reader even when one of the assertions fails.
        fasta = ReadFasta(path)
        try:
            self.assertEqual(fasta.ids, ids)
            self.assertEqual(fasta.sequences, sequences)
        finally:
            fasta.close()

    parts = 2
    out_files = split_fasta(self.five_seq_fasta, parts=parts, shuffle=True)

    try:
        check_contents(out_files[0],
                       ['seq1 apple', 'seq3 cat', 'seq5 extra'],
                       ['AA', 'ACTACT', 'ACTGAACTGA'])
        check_contents(out_files[1],
                       ['seq2 banana', 'seq4 dog'],
                       ['ACAC', 'ACTGACTG'])
    finally:
        # Remove the split files whether or not the checks passed.
        for f in out_files:
            os.remove(f)
def test_single_fasta_gives_one_split(self):
    """Splitting a one-sequence FASTA with default arguments must give one file.

    The test expects a single output path ending in `.0`, that the file
    exists, and that it contains the one record of the fixture.

    The reader handle and the produced file(s) are released in `finally`
    blocks so that a failing assertion does not leave an open handle or a
    stray file behind.
    """
    out_files = split_fasta(self.single_seq_fasta)

    try:
        expected_out_file = os.path.join(self.test_files, f'{self.single_seq_fasta}.0')

        self.assertEqual(out_files, [expected_out_file])
        self.assertTrue(os.path.exists(expected_out_file))

        fasta = ReadFasta(expected_out_file)
        try:
            self.assertEqual(fasta.ids, ['seq1 apple'])
            self.assertEqual(fasta.sequences, ['AA'])
        finally:
            # Close the reader even when one of the assertions fails.
            fasta.close()
    finally:
        # On success this is exactly `expected_out_file`; on failure it
        # still removes whatever the split actually produced.
        for f in out_files:
            os.remove(f)
def _split_input_file(self):
    """Split input fasta into the correct number of splits.

    Calls `utils.split_fasta` on `self.input_file_path` with
    `parts=self.number_of_splits` and `shuffle=True`, stores the result in
    `self.input_file_splits`, and returns a `State` built from it.

    Returns `State` with the paths to each of the fasta splits.

    Raises `ValueError` if `self.number_of_splits <= 0`.

    See Superclass `_split_input_file` for more info.
    """
    # Todo: probably should move this check into the constructor.
    if self.number_of_splits <= 0:
        # The exception has to be raised, not merely constructed; otherwise
        # an invalid split count is silently handed to `split_fasta`.
        raise ValueError(
            f'number_of_splits must be > 0.  Got {self.number_of_splits}')

    # Todo are there errors to catch here?
    self.input_file_splits = utils.split_fasta(self.input_file_path,
                                               parts=self.number_of_splits,
                                               shuffle=True)

    return State(input_file_splits=self.input_file_splits)
def test_fasta_splitting(self):
    """Check file names and contents of a plain two-way split.

    The five-sequence fixture is split into two parts. The test expects
    output paths suffixed `.0` and `.1`, with seq1-seq2 in the first file
    and seq3-seq5 in the second.

    The reader handles and the split files are released in `finally`
    blocks so that a failing assertion does not leave open handles or
    stray files behind.
    """
    def check_contents(path, ids, sequences):
        # Close the reader even when one of the assertions fails.
        fasta = ReadFasta(path)
        try:
            self.assertEqual(fasta.ids, ids)
            self.assertEqual(fasta.sequences, sequences)
        finally:
            fasta.close()

    parts = 2
    expected_out_files = [
        os.path.join(self.test_files, f'{self.five_seq_fasta}.{i}')
        for i in range(parts)
    ]

    out_files = split_fasta(self.five_seq_fasta, parts=parts)

    try:
        self.assertEqual(out_files, expected_out_files)

        check_contents(out_files[0],
                       ['seq1 apple', 'seq2 banana'],
                       ['AA', 'ACAC'])
        check_contents(out_files[1],
                       ['seq3 cat', 'seq4 dog', 'seq5 extra'],
                       ['ACTACT', 'ACTGACTG', 'ACTGAACTGA'])
    finally:
        # Remove the split files whether or not the checks passed.
        for f in out_files:
            os.remove(f)