Beispiel #1
0
    def _simulate_reads(self, dict_id_abundance, dict_id_file_path, factor,
                        directory_output):
        """
        Build one read simulator command per genome and run them in parallel.

        Genomes with an abundance of zero are skipped. Failed commands are
        logged and reported, but no exception is raised for them.

        @param dict_id_abundance: Dictionary of genome id to abundance
        @type dict_id_abundance: dict[str|unicode, float]
        @param dict_id_file_path: Dictionary of genome id to file path
        @type dict_id_file_path: dict[str|unicode, str|unicode]
        @param factor: Factor abundances will be multiplied by
        @type factor: float | int | long
        @param directory_output: Directory for the sam and fastq files output
        @type directory_output: str | unicode

        @return: Nothing
        @rtype: None
        """
        self._logger.info("Simulating reads using %s readsimulator..." %
                          self._label)
        assert isinstance(dict_id_file_path, dict), \
            "Expected dictionary, genome id as key, file path as value"
        assert isinstance(dict_id_abundance, dict), \
            "Expected dictionary, genome id as key, abundance as value"
        assert isinstance(factor, (int, long, float)), \
            "Factor must be numerical"
        assert self.validate_dir(directory_output)

        # wgsim and nanosim expect a number of reads instead of a fold coverage
        read_count_labels = ("ReadSimulationWgsim", "ReadSimulationNanosim")

        # collect all commands first so they can be executed in parallel
        commands = []
        for genome_id in dict_id_abundance:
            genome_path = dict_id_file_path[genome_id]
            genome_abundance = dict_id_abundance[genome_id]
            if genome_abundance == 0:
                continue

            # despite its name, this value is a read count for wgsim/nanosim
            fold_coverage = genome_abundance * factor
            if self._label in read_count_labels:
                fold_coverage = int(
                    round(fold_coverage / self._fragment_size_mean))

            output_prefix = os.path.join(directory_output, str(genome_id))
            self._logger.debug("{id}\t{fold_coverage}".format(
                id=genome_id, fold_coverage=fold_coverage))
            command = self._get_sys_cmd(
                file_path_input=genome_path,
                fold_coverage=fold_coverage,
                file_path_output_prefix=output_prefix)
            self._logger.debug("SysCmd: '{}'".format(command))
            self._logger.info("Simulating reads from {}: '{}'".format(
                genome_id, genome_path))
            commands.append(TaskCmd(command))

        failed = runCmdParallel(commands, maxProc=self._max_processes)
        if failed is not None:
            self._logger.error("{} commands returned errors!".format(
                len(failed)))
            reportFailedCmd(failed)
        self._logger.info("Simulating reads finished")
    def convert_sam_to_bam_by_list(self, list_of_sam_files, output_dir="./"):
        """
        Convert each listed SAM file to BAM format, running the conversions in parallel.

        @param list_of_sam_files: list of sam file paths
        @type list_of_sam_files: list[str|unicode]
        @param output_dir: output directory
        @type output_dir: str | unicode

        @return: None
        @rtype: None

        @raises: OSError | AssertionError
        """
        output_dir_is_valid = self.validate_dir(output_dir, silent=True)
        assert isinstance(list_of_sam_files, list), \
            "Expected list of file paths"
        assert output_dir_is_valid, \
            "Invalid file or directory: '{}'".format(output_dir)

        # one conversion task per sam file, executed in parallel below
        conversion_tasks = [
            TaskCmd(self._get_sam_to_bam_cmd(path_sam, output_dir))
            for path_sam in list_of_sam_files
        ]

        failed = runCmdParallel(conversion_tasks, maxProc=self._max_processes)
        if failed is None:
            return

        for failure_message in reportFailedCmd(failed):
            self._logger.error(failure_message)
        msg = "Converting sam files to bam files failed."
        self._logger.error(msg)
        raise OSError(msg)
Beispiel #3
0
    def multiprocessing_run(self):
        """
        Run all stored command lines as parallel tasks.

        If any task fails, the failures are reported and the negated number
        of failed tasks is stored in self._CUM_RETVALS.

        @rtype: None
        """
        job_count = len(self._cmd_lines)
        self._logger.info(
            "Running {} jobs with multiprocessing".format(job_count))

        tasks = []
        for command_line in self._cmd_lines:
            tasks.append(parallel.TaskCmd(command_line, self._tmp_dir))

        failed_tasks = parallel.runCmdParallel(tasks, self._max_processors)
        if failed_tasks is not None:
            parallel.reportFailedCmd(failed_tasks)
            self._CUM_RETVALS = -len(failed_tasks)
        self._logger.info("Multiprocessing jobs completed")
Beispiel #4
0
    def _simulate_strains(self,
                          genome_id_to_amounts,
                          genome_id_to_file_path_genome,
                          genome_id_to_file_path_gff=None):
        """
        Use sgEvolver to generate strain-level diversity around an isolate assembly.

        One simulation command is created per genome and all commands are run in parallel.
        Genomes are skipped if self._keep_original is set and only one strain is requested.
        If no gene annotations are given, a temporary empty file is used as annotation
        for every genome and removed afterwards, even if the simulation raises.

        @attention genome_id_to_file_path_genome: Will be extended with IDs and file paths to the strains

        @param genome_id_to_amounts: Mapping from genome id to the amount of strains
        @type genome_id_to_amounts: dict[str, int]
        @param genome_id_to_file_path_genome: Mapping from genome id to the file path of the genome
        @type genome_id_to_file_path_genome: dict[str, str]
        @param genome_id_to_file_path_gff: Mapping from genome id to the file path of the gene annotations of a genome
        @type genome_id_to_file_path_gff: dict[str, str]

        @return: Nothing
        @rtype: None

        @raises: OSError
        """
        file_path_empty_file = None
        if genome_id_to_file_path_gff is None:
            # mkstemp creates the file atomically; the deprecated mktemp only
            # returned a name, leaving a race until the file was touched.
            file_descriptor, file_path_tmp = tempfile.mkstemp(
                dir=self._tmp_dir)
            os.close(file_descriptor)
            file_path_empty_file = self.get_full_path(file_path_tmp)

        try:
            tasks = []
            # iterate over a snapshot of the ids, the mapping is documented as being extended
            for genome_id in list(genome_id_to_file_path_genome):
                if self._keep_original and genome_id_to_amounts[genome_id] == 1:
                    continue
                directory_strain = self._directory_strain.format(gid=genome_id)
                self._prepare_simulation_subfolder(directory_strain)
                file_path_genome = genome_id_to_file_path_genome[genome_id]
                if genome_id_to_file_path_gff is None:
                    file_path_gff = file_path_empty_file
                else:
                    file_path_gff = genome_id_to_file_path_gff[genome_id]
                self._logger.info(
                    "Simulating strain evolution of '{}'".format(genome_id))
                tasks.append(
                    TaskCmd(
                        self._get_simulate_cmd(
                            directory_strains=directory_strain,
                            filepath_genome=file_path_genome,
                            filepath_gff=file_path_gff)))
            list_of_fails = runCmdParallel(tasks, maxProc=self._max_processors)
        finally:
            # always clean up the placeholder annotation file
            if file_path_empty_file is not None and os.path.exists(
                    file_path_empty_file):
                os.remove(file_path_empty_file)

        if list_of_fails is not None:
            for message in reportFailedCmd(list_of_fails):
                self._logger.error(message)
            msg = "Simulation of strains failed."
            self._logger.error(msg)
            raise OSError(msg)
    def merge_bam_files_by_dict(self, dict_of_bam_files, output_dir):
        """
        Merge lists of bam files into one bam file per dictionary key.

        A list containing a single bam file is not merged; that file is copied
        to the output directory instead. All other merges are run in parallel.

        @attention: dictionary keys used as file names

        @param dict_of_bam_files: dictionary list of bam file paths as value
        @type dict_of_bam_files: dict[str|unicode, list[str|unicode]]
        @param output_dir: output directory
        @type output_dir: str | unicode

        @return: None
        @rtype: None

        @raises: OSError | AssertionError
        """
        output_dir = self.get_full_path(output_dir)
        bam_is_folder = self.validate_dir(output_dir, silent=True)
        assert isinstance(dict_of_bam_files,
                          dict), "Expected dictionary of file paths"
        assert bam_is_folder, "Invalid file or directory: '{}'".format(
            output_dir)
        # validate every input file before any output is written
        for list_of_bam_paths in dict_of_bam_files.values():
            for file_path in list_of_bam_paths:
                assert self.validate_file(
                    file_path), "Invalid file: '{}'".format(file_path)

        # add commands to a list of tasks to run them in parallel
        tasks = []
        for filename, list_of_bam_paths in dict_of_bam_files.items():
            if len(list_of_bam_paths) == 1:
                # copy bam instead of merge, if only one
                file_path = list_of_bam_paths[0]
                self._logger.warning(
                    "List contains only one file: '{}'".format(file_path))
                out_file_path = os.path.join(
                    output_dir, filename + self._bam_file_extension)
                shutil.copy2(file_path, out_file_path)
                continue
            cmd = self._get_merge_bam_cmd(list_of_bam_paths,
                                          os.path.join(output_dir, filename))
            tasks.append(TaskCmd(cmd))
        fail_list = runCmdParallel(tasks, maxProc=self._max_processes)
        if fail_list is not None:
            for message in reportFailedCmd(fail_list):
                self._logger.error(message)
            # this message used to be a copy of the sam-to-bam conversion error
            msg = "Merging bam files failed."
            self._logger.error(msg)
            raise OSError(msg)
Beispiel #6
0
    def gather_markergenes(self, hmmer, mg_type, file_path_output,
                           file_path_map_uid_sid):
        """
        Find and extract marker genes from genomes

        The search commands are run in parallel, the per-genome results are merged
        and the accepted sequences are copied to 'file_path_output'. Rejected sequences,
        if any, are copied to 'file_path_output' + ".rejected.fna". If reference marker
        genes are available, they are appended to the output and a copy without them
        is kept as 'file_path_output' + ".no_ref".

        @param hmmer: hmmer2 or hmmer3
        @type hmmer: int | long
        @param mg_type: '16S', '5S' or '23S' etc
        @type mg_type: str | unicode
        @param file_path_output: Output for list of extracted marker genes sequences in fasta format
        @type file_path_output: str | unicode
        @param file_path_map_uid_sid: Passed on to the merging of marker gene files;
            presumably a file path of a unique id to sequence id mapping (TODO confirm)
        @type file_path_map_uid_sid: str | unicode

        @rtype: None

        @raises: OSError | AssertionError
        """
        assert isinstance(hmmer, (int, long))
        assert isinstance(file_path_output, basestring)
        assert self.validate_number(hmmer, minimum=2, maximum=3)
        assert self.validate_dir(file_path_output, only_parent=True)
        assert mg_type in self._suffixes, "Marker gene '{}' is not supported.".format(
            mg_type)

        self._logger.info("Searching and extracting marker genes")
        start = time.time()
        query_genome_file_paths = self._get_genome_id_to_path_map(
            self._file_path_query_genome_file_paths)
        has_reference_genomes = self._file_path_reference_genome_file_paths is not None
        has_reference_marker_genes = self._file_path_reference_marker_genes is not None
        if has_reference_genomes and not has_reference_marker_genes:
            # no precomputed marker genes, so search the reference genomes as well
            reference_genome_file_paths = self._get_genome_id_to_path_map(
                self._file_path_reference_genome_file_paths)
            query_genome_file_paths.update(reference_genome_file_paths)
        elif has_reference_genomes and has_reference_marker_genes:
            self._logger.warning(
                "Ignoring reference genome file paths and using previous reference marker genes!"
            )

        cmd_list = self._get_cmd_list(hmmer=hmmer,
                                      dict_of_fasta=query_genome_file_paths)
        list_of_tasks = [parallel.TaskCmd(cmd) for cmd in cmd_list]

        fail_list = parallel.runCmdParallel(list_of_tasks,
                                            self._max_processors)
        if fail_list is not None:
            for message in parallel.reportFailedCmd(fail_list):
                self._logger.error(message)
            msg = "Extracting marker genes failed."
            self._logger.error(msg)
            raise OSError(msg)

        # NOTE: only names are reserved here on purpose; whether the files exist
        # afterwards is used below to detect if any sequence was written.
        tmp_out_file_path = tempfile.mktemp(suffix="_accepted",
                                            dir=self._temp_directory)
        tmp_out_file_bin_path = tempfile.mktemp(suffix="_rejected",
                                                dir=self._temp_directory)

        self._merge_marker_genes_files(
            query_genome_file_paths,
            tmp_out_file_path,
            file_path_out_bin=tmp_out_file_bin_path,
            file_path_map_uid_sid=file_path_map_uid_sid,
            mg_type=mg_type)
        if os.path.exists(tmp_out_file_path):
            shutil.copy2(tmp_out_file_path, file_path_output)
        else:
            self._logger.warning("No valid marker gene found!")
        if os.path.exists(tmp_out_file_bin_path):
            shutil.copy2(tmp_out_file_bin_path,
                         file_path_output + ".rejected.fna")

        if has_reference_marker_genes:
            # keep a copy without reference genes; there is nothing to back up
            # if no output file exists (e.g. no marker gene was extracted)
            if os.path.exists(file_path_output):
                shutil.copy(file_path_output, file_path_output + ".no_ref")
            # append reference genome marker genes
            with open(file_path_output, 'a') as write_handler, open(
                    self._file_path_reference_marker_genes) as read_handler:
                write_handler.writelines(read_handler)

        end = time.time()
        self._logger.info("Extracting marker genes finished ({}s)".format(
            round(end - start, 1)))

        # working directories are kept in debug mode for inspection
        for directory in self._working_dirs.values():
            if self._debug:
                self._logger.warning("Remove manually: '{}'".format(directory))
            else:
                shutil.rmtree(directory)