Ejemplo n.º 1
0
    def _design_community(self):
        """
        Design the communities of all samples and persist the resulting metadata.

        A fresh metadata table and a CommunityDesign are created, the
        distribution file paths for the configured number of samples are
        requested, the samples are designed from the configured communities,
        the profile gold standard is written and finally the metadata table is
        written (with column names) to the project's genome metadata file.

        @return: map genome id to genome file path and list of distribution file paths
        @rtype: tuple[dict[str|unicode, str|unicode], list[str|unicode]]
        """
        metadata = MetadataTable(
            separator=self._separator, logfile=self._logfile, verbose=self._verbose)

        # Keyword arguments are gathered first; the dict is evaluated in the
        # same order in which the arguments are passed on.
        design_options = {
            "column_name_genome_id": self._column_name_genome_id,
            "column_name_otu": self._column_name_otu,
            "column_name_novelty_category": self._column_name_novelty_category,
            "column_name_ncbi": self._column_name_ncbi,
            "column_name_source": self._column_name_source,
            "max_processors": self._max_processors,
            "tmp_dir": self._project_file_folder_handler.get_tmp_wd(),
            "logfile": self._logfile,
            "verbose": self._verbose,
            "debug": self._debug,
            "seed": None,
        }
        designer = CommunityDesign(**design_options)

        folder_handler = self._project_file_folder_handler
        distribution_dir = folder_handler.get_distribution_dir()
        distribution_paths = designer.get_distribution_file_paths(
            distribution_dir, self._number_of_samples)
        metadata_dir = folder_handler.get_meta_data_dir()
        template_dir = self._strain_simulation_template

        genome_id_to_path = designer.design_samples(
            list_of_communities=self._list_of_communities,
            metadata_table=metadata,
            list_of_file_paths_distribution=distribution_paths,
            directory_out_metadata=metadata_dir,
            directory_in_template=template_dir)

        self.write_profile_gold_standard(metadata, distribution_paths)

        metadata.write(folder_handler.get_genome_metadata_file_path(), column_names=True)
        return genome_id_to_path, distribution_paths
Ejemplo n.º 2
0
    def run_pipeline(self):
        """
        Run the metagenome simulation pipeline.

        If the configuration is not valid, the run is aborted immediately.
        Otherwise the phases are executed in order, each one only if its flag
        is set: genome validation, community design (or reuse of given
        distribution files), moving genomes, read simulation, gold standard
        assembly, anonymisation / binning gold standard and compression.

        Exceptions raised by any phase (including KeyboardInterrupt and
        SystemExit) are caught and logged; they are not propagated to the
        caller. Unless debug mode is active, the temporary directory is
        removed afterwards.

        @rtype: None
        """
        if not self.is_valid():
            self._logger.info("Metagenome simulation aborted")
            return
        self._logger.info("Metagenome simulation starting")
        try:
            # Validate Genomes
            if self._phase_validate_raw_genomes:
                self._logger.info("Validating Genomes")
                self._validate_raw_genomes()

            # Design Communities
            if self._input_list_of_file_paths_distributions:
                number_of_inputs = len(self._input_list_of_file_paths_distributions)
                # The message is what ends up in the "assertion ... failed" log line.
                assert number_of_inputs == self._number_of_samples, \
                    "number of input distribution files (%s) == number of samples (%s)" % (
                        number_of_inputs, self._number_of_samples)

                meta_data_table = MetadataTable(separator=self._separator, logfile=self._logfile, verbose=self._verbose)
                file_path_genome_locations = self._project_file_folder_handler.get_genome_location_file_path()

                # Collect the genome locations of ALL communities. The dict is
                # created once, before the loop, so that earlier communities
                # are not discarded and an empty community list is handled.
                out_locations = {}
                for community in self._list_of_communities:
                    meta_data_table.read(community.file_path_metadata_table, column_names=True)
                    file_path_metadata = self._project_file_folder_handler.get_genome_metadata_file_path()
                    meta_data_table.write(file_path_metadata, column_names=True)
                    with open(community.file_path_genome_locations, 'r') as in_locations:
                        for line in in_locations:
                            genome, path = line.strip().split('\t')
                            # A genome id listed more than once keeps the path seen last.
                            out_locations[genome] = path
                # Write the complete collection, so no genome appears multiple times.
                with open(file_path_genome_locations, 'a') as locations:
                    for gen_id, gen_path in out_locations.items():
                        locations.write("%s\t%s\n" % (gen_id, gen_path))

                genome_id_to_path_map = self.get_dict_gid_to_genome_file_path()
                directory_out_distributions = self._project_file_folder_handler.get_distribution_dir()
                list_of_file_paths_distributions = CommunityDesign.get_distribution_file_paths(
                    directory_out_distributions, self._number_of_samples)
                # Copy the user supplied distribution files to the project's distribution file paths.
                for file_path_src, file_path_dst in zip(self._input_list_of_file_paths_distributions, list_of_file_paths_distributions):
                    shutil.copy2(file_path_src, file_path_dst)
                self.write_profile_gold_standard(meta_data_table, list_of_file_paths_distributions)
            elif self._phase_design_community:
                self._logger.info("Design Communities")
                genome_id_to_path_map, list_of_file_paths_distributions = self._design_community()
            else:
                genome_id_to_path_map = self.get_dict_gid_to_genome_file_path()
                directory_out_distributions = self._project_file_folder_handler.get_distribution_dir()
                list_of_file_paths_distributions = CommunityDesign.get_distribution_file_paths(
                    directory_out_distributions, self._number_of_samples)

            # Move Genomes
            if self._phase_move_and_clean_genomes:
                self._logger.info("Move Genomes")
                self._move_and_cleanup_genomes(genome_id_to_path_map)

            # Read simulation (Art Illumina)
            if self._phase_simulate_reads:
                self._logger.info("Read simulation")
                for sample_index, file_path_distribution in enumerate(list_of_file_paths_distributions):
                    self._simulate_reads(file_path_distribution, sample_index)

            # Generate gold standard assembly
            list_of_output_gsa = None
            file_path_output_gsa_pooled = None
            # NOTE(review): this step and the pooled step below are gated by the
            # same flag (_phase_pooled_gsa); a separate flag may have been
            # intended here - confirm against the class attributes.
            if self._phase_pooled_gsa:
                self._logger.info("Generate gold standard assembly")
                list_of_output_gsa = self._generate_gsa()

            # Generate gold standard assembly from pooled reads of all samples
            if self._phase_pooled_gsa:
                self._logger.info("Generate pooled strains gold standard assembly")
                file_path_output_gsa_pooled = self._generate_gsa_pooled()

            # NOTE(review): list_of_output_gsa is still None here if no gold
            # standard assembly was generated; the join below then raises a
            # TypeError, which is reported by the generic handler further down.
            # Whether the helpers accept None is not visible here - confirm.
            # Anonymize Data (gsa)
            if self._phase_anonymize:
                self._logger.info("Anonymize Data")
                self._logger.debug(", ".join(list_of_output_gsa))
                self._anonymize_data(list_of_output_gsa, file_path_output_gsa_pooled)
            else:  # in any case create binning gold standard
                self._logger.info("Creating binning gold standard")
                self._logger.debug(", ".join(list_of_output_gsa))
                self._create_binning_gs(list_of_output_gsa)

            # Compress Data
            if self._phase_compress:
                self._logger.info("Compress Data")
                self._compress_data()

        except AssertionError as e:
            # Must precede the generic handler: AssertionError is a subclass of
            # Exception and would otherwise never reach this branch.
            self._logger.debug("\n{}\n".format(traceback.format_exc()))
            self._logger.info("Metagenome simulation aborted, assertion %s failed" % e)
        except (KeyboardInterrupt, SystemExit, Exception) as e:
            self._logger.debug("\n{}\n".format(traceback.format_exc()))
            exc_tb = sys.exc_info()[-1]
            self._logger.error("%s in line %s" % (e, exc_tb.tb_lineno))
            self._logger.info("Metagenome simulation aborted")
        else:
            self._logger.info("Metagenome simulation finished")

        # Clean up temporary data unless it is kept for debugging.
        if not self._debug:
            self._project_file_folder_handler.remove_directory_temp()
        else:
            self._logger.info("Temporary data stored at:\n{}".format(self._project_file_folder_handler.get_tmp_wd()))