예제 #1
0
    def __init__(self,
                 target_files_dict,
                 num_threads_to_use=1,
                 progress=progress,
                 run=run):
        """Prepare an HMM run by splitting every target FASTA file into parts.

        Parameters
        ==========
        target_files_dict : dict
            Maps a source name to the path of its FASTA file.
        num_threads_to_use : int
            Stored on the instance and passed to `utils.split_fasta` as
            the number of parts to create per FASTA file.
        progress, run
            Stored on the instance as `self.progress` and `self.run`.

        Notes
        =====
        - One temporary directory is requested per source and recorded in
          `self.tmp_dirs`.
        - `self.target_files_dict[source]` holds whatever
          `utils.split_fasta` returns for that source (presumably the paths
          of the part files -- confirm against `utils.split_fasta`).
        """
        self.num_threads_to_use = num_threads_to_use
        self.progress = progress
        self.run = run

        self.tmp_dirs = []
        self.target_files_dict = {}

        for source, fasta_path in target_files_dict.items():
            # each source gets its own scratch directory
            scratch_dir = filesnpaths.get_temp_directory_path()
            self.tmp_dirs.append(scratch_dir)

            # part files reuse the original file name as their prefix, but
            # are placed inside the scratch directory
            part_prefix = os.path.join(scratch_dir, os.path.basename(fasta_path))

            self.target_files_dict[source] = utils.split_fasta(
                fasta_path,
                parts=self.num_threads_to_use,
                prefix=part_prefix)
예제 #2
0
파일: hmmer.py 프로젝트: FBBJBB/anvio
    def __init__(self, target_files_dict, num_threads_to_use=1, program_to_use='hmmscan', progress=progress, run=run):
        """Prepare an HMM run by splitting every target FASTA file into parts.

        Parameters
        ==========
        target_files_dict : dict
            Maps a source name to the path of its FASTA file.
        num_threads_to_use : int
            Stored on the instance and passed to `utils.split_fasta` as
            the number of parts to create per FASTA file.
        program_to_use : str
            Either 'hmmscan' or 'hmmsearch'; anything else raises
            `ConfigError`.
        progress, run
            Stored on the instance as `self.progress` and `self.run`.

        Notes
        =====
        - HMMer user guide: http://eddylab.org/software/hmmer/Userguide.pdf
        - One temporary directory is requested per source and recorded in
          `self.tmp_dirs`.
        - `self.target_files_dict[source]` holds whatever
          `utils.split_fasta` returns for that source (presumably the paths
          of the part files -- confirm against `utils.split_fasta`).
        """

        self.num_threads_to_use = num_threads_to_use
        self.program_to_use = program_to_use
        self.progress = progress
        self.run = run

        self.tmp_dirs = []
        self.target_files_dict = {}

        # reject unknown programs before any temporary directory is created
        supported_programs = ("hmmscan", "hmmsearch")
        program_is_supported = self.program_to_use in supported_programs
        if not program_is_supported:
            raise ConfigError("HMMer class here. You are attempting to use the program %s to run HMMs, but we don't recognize it. The currently "
                              "supported programs are: %s" % (self.program_to_use, ", ".join(supported_programs)))

        for source, fasta_path in target_files_dict.items():
            # each source gets its own scratch directory
            scratch_dir = filesnpaths.get_temp_directory_path()
            self.tmp_dirs.append(scratch_dir)

            # part files reuse the original file name as their prefix, but
            # are placed inside the scratch directory
            part_prefix = os.path.join(scratch_dir, os.path.basename(fasta_path))

            self.target_files_dict[source] = utils.split_fasta(fasta_path,
                                                               parts=self.num_threads_to_use,
                                                               prefix=part_prefix)
예제 #3
0
    def test_custom_prefix(self):
        """A custom `prefix` decides the path of the single generated split.

        With `parts=1` and a prefix of `<this_dir>/silly`, `split_fasta` is
        expected to return exactly `[<this_dir>/silly.0]`.
        """
        parts = 1
        prefix = os.path.join(self.this_dir, 'silly')
        expected_out_files = [os.path.join(self.this_dir, 'silly.0')]

        out_files = split_fasta(self.five_seq_fasta,
                                parts=parts,
                                prefix=prefix)

        try:
            self.assertEqual(out_files, expected_out_files)
        finally:
            # Clean up even when the assertion fails, so a failing run does
            # not leave split files behind for later runs.
            for f in out_files:
                os.remove(f)
예제 #4
0
    def test_more_parts_than_sequences(self):
        """Asking for more parts than sequences yields one split per sequence.

        Ten parts are requested for the five-sequence fixture; the expected
        result is five files, suffixed `.0` through `.4`.
        """
        parts = 10
        num_sequences = 5
        expected_out_files = [
            os.path.join(self.test_files, f'{self.five_seq_fasta}.{i}')
            for i in range(num_sequences)
        ]

        out_files = split_fasta(self.five_seq_fasta, parts=parts)

        try:
            self.assertEqual(out_files, expected_out_files)
        finally:
            # Clean up even when the assertion fails, so a failing run does
            # not leave split files behind for later runs.
            for f in out_files:
                os.remove(f)
예제 #5
0
    def test_custom_prefix(self):
        """`file_name_prefix` and `output_dir` decide the path of the split.

        With `parts=1`, a file name prefix of `silly` and `this_dir` as the
        output directory, `split_fasta` is expected to return exactly
        `[<this_dir>/silly.0]`.
        """
        parts = 1
        file_name_prefix = 'silly'
        expected_out_files = [os.path.join(self.this_dir, 'silly.0')]

        out_files = split_fasta(self.five_seq_fasta,
                                parts=parts,
                                file_name_prefix=file_name_prefix,
                                output_dir=self.this_dir)

        try:
            self.assertEqual(out_files, expected_out_files)
        finally:
            # Clean up even when the assertion fails, so a failing run does
            # not leave split files behind for later runs.
            for f in out_files:
                os.remove(f)
예제 #6
0
    def test_shuffle_mode(self):
        """`shuffle=True` with two parts puts alternating sequences per split.

        The first split is expected to hold sequences 1, 3 and 5 of the
        five-sequence fixture, the second split sequences 2 and 4.
        """
        parts = 2
        # (ids, sequences) expected in each split, in split order
        expected_contents = [
            (['seq1 apple', 'seq3 cat', 'seq5 extra'],
             ['AA', 'ACTACT', 'ACTGAACTGA']),
            (['seq2 banana', 'seq4 dog'],
             ['ACAC', 'ACTGACTG']),
        ]

        out_files = split_fasta(self.five_seq_fasta, parts=parts, shuffle=True)

        try:
            for index, (ids, sequences) in enumerate(expected_contents):
                # Index explicitly so a missing split raises instead of
                # being silently skipped.
                fasta = ReadFasta(out_files[index])
                try:
                    self.assertEqual(fasta.ids, ids)
                    self.assertEqual(fasta.sequences, sequences)
                finally:
                    # Close the reader even when an assertion fails.
                    fasta.close()
        finally:
            # Clean up even when an assertion fails, so a failing run does
            # not leave split files behind for later runs.
            for f in out_files:
                os.remove(f)
예제 #7
0
    def test_single_fasta_gives_one_split(self):
        """Splitting a one-sequence fasta with defaults yields one `.0` file.

        Checks the returned path list, that the file exists on disk, and
        that it contains the single sequence of the fixture.
        """
        expected_out_file = os.path.join(self.test_files,
                                         f'{self.single_seq_fasta}.0')

        out_files = split_fasta(self.single_seq_fasta)

        try:
            self.assertEqual(out_files, [expected_out_file])
            self.assertTrue(os.path.exists(expected_out_file))

            fasta = ReadFasta(expected_out_file)
            try:
                self.assertEqual(fasta.ids, ['seq1 apple'])
                self.assertEqual(fasta.sequences, ['AA'])
            finally:
                # Close the reader even when an assertion fails.
                fasta.close()
        finally:
            # Remove what was actually produced (on success this is exactly
            # `expected_out_file`), even when an assertion fails.
            for f in out_files:
                os.remove(f)
예제 #8
0
    def _split_input_file(self):
        """Split input fasta into the correct number of splits.

        Returns `State` with the paths to each of the fasta splits.

        Raises `ValueError` if `self.number_of_splits < 0.

        See Superclass `_split_input_file` for more info.
        """
        # Todo: probably should move this check into the constructor.
        if self.number_of_splits <= 0:
            ValueError(
                f'number_of_splits muts be > 0.  Got {self.number_of_splits}')

        # Todo are there errors to catch here?
        self.input_file_splits = utils.split_fasta(self.input_file_path,
                                                   parts=self.number_of_splits,
                                                   shuffle=True)

        return State(input_file_splits=self.input_file_splits)
예제 #9
0
    def test_fasta_splitting(self):
        """Splitting five sequences into two parts keeps input order.

        The first split is expected to hold the first two sequences of the
        fixture and the second split the remaining three; the returned
        paths are expected to be the input path suffixed `.0` and `.1`.
        """
        parts = 2
        expected_out_files = [
            os.path.join(self.test_files, f'{self.five_seq_fasta}.{i}')
            for i in range(parts)
        ]
        # (ids, sequences) expected in each split, in split order
        expected_contents = [
            (['seq1 apple', 'seq2 banana'],
             ['AA', 'ACAC']),
            (['seq3 cat', 'seq4 dog', 'seq5 extra'],
             ['ACTACT', 'ACTGACTG', 'ACTGAACTGA']),
        ]

        out_files = split_fasta(self.five_seq_fasta, parts=parts)

        try:
            self.assertEqual(out_files, expected_out_files)

            for index, (ids, sequences) in enumerate(expected_contents):
                fasta = ReadFasta(out_files[index])
                try:
                    self.assertEqual(fasta.ids, ids)
                    self.assertEqual(fasta.sequences, sequences)
                finally:
                    # Close the reader even when an assertion fails.
                    fasta.close()
        finally:
            # Clean up even when an assertion fails, so a failing run does
            # not leave split files behind for later runs.
            for f in out_files:
                os.remove(f)