Ejemplo n.º 1
0
    def marker_gene_extraction(self):
        """
        Find and extract 16S marker gene sequences (first pipeline step).

        The sequences are found using "hmmsearch" and extracted based on the
        given positions. Two hmmer setups can currently be used: HMMER3.0 with
        a profile from 2010 and "rnammer" using HMMER2.0 with a profile from
        2006. A third one using HMMER3.0 is still to be evaluated. So far it
        seems like rnammer provides better (more) results, but is also very slow.

        input:
        - file containing a list of fasta file paths, for example the genomes that someone wants to cluster.
        - file containing a list of reference fasta file paths or alternatively, a fasta formated file containing the already extracted marker genes of the reference genomes.
        - working directory where the results will be saved (intermediate files will be worked on in the designated /tmp folder)
        - the number of processors that are available for parallel processing. The program "parallel" is used to process several genomes at the same time.
        output:
        - fasta formatted file containing the extracted marker genes of all genomes

        @rtype: None
        """
        assert isinstance(self, ArgumentHandler)
        # Configure the extraction run from the argument-handler state; the
        # awkward "self.\n_attr" line breaks are the file's original formatting.
        mg_extract = MGExtract(
            mg_analyse_executable=self._get_mg_analyse_executable(),
            file_path_query_genome_file_paths=self.
            _file_path_query_genomes_location_file,
            file_path_reference_genome_file_paths=self.
            _file_path_reference_genome_locations,
            file_path_name_reference_marker_genes=self.
            _file_path_reference_markergene,
            config_path=self._file_path_config,
            file_path_map_reference_genome_id_to_tax_id=self.
            _file_path_map_reference_genome_id_to_tax_id,
            max_processors=self._max_processors,
            temp_directory=self._project_file_folder_handler.get_tmp_wd(),
            separator=self._separator,
            logfile=self._logfile,
            verbose=self._verbose,
            debug=self._debug)

        # Run the actual 16S search/extraction; writes the marker-gene fasta
        # and an internal-id (uid -> sid) map into the project folder.
        mg_extract.gather_markergenes(
            hmmer=self._hmmer,
            mg_type="16S",
            file_path_output=self._project_file_folder_handler.
            get_file_path_mg_16s(),
            file_path_map_uid_sid=self._project_file_folder_handler.
            get_file_path_internal_id_map())

        # merge silva iid with genome iid: the silva reference id map is
        # concatenated (non-strict) onto the freshly written internal id map
        # and the combined table is written back to the same file.
        data_table_iid_mapping_silva = MetadataTable(separator=self._separator,
                                                     logfile=self._logfile,
                                                     verbose=self._verbose)
        file_path_silva_map = os.path.join(self._silva_reference_directory,
                                           MGCluster.get_file_name_of_map())
        data_table_iid_mapping_silva.read(file_path_silva_map)
        data_table_iid_mapping = MetadataTable(separator=self._separator,
                                               logfile=self._logfile,
                                               verbose=self._verbose)
        data_table_iid_mapping.read(
            self._project_file_folder_handler.get_file_path_internal_id_map())
        data_table_iid_mapping.concatenate(data_table_iid_mapping_silva,
                                           strict=False)
        data_table_iid_mapping.write(
            self._project_file_folder_handler.get_file_path_internal_id_map())
	def create_meta_table(self, file_path_metadata_table):
		"""
		Write a minimal input metadata table that contains genome ids only.

		The ids are taken from the first column of the reference genome
		location file; all other columns are discarded.

		@param file_path_metadata_table: output path of the generated table
		@type file_path_metadata_table: str|unicode

		@rtype: None
		"""
		table = MetadataTable(
			separator=self._separator, logfile=self._logfile, verbose=self._verbose)
		table.read(self._file_path_reference_genome_locations, column_names=False)
		if table.get_number_of_rows() == 0:
			raise ValueError("Invalid file content")
		genome_ids = table.get_column(0)
		# keep only the id column, under the configured header name
		table.clear()
		table.insert_column(genome_ids, self._column_name_genome_id)
		table.write(file_path_metadata_table, column_names=True)
Ejemplo n.º 3
0
    def _design_community(self):
        """
        Start designing sample a community

        @return: map genome id to genome file path and list of distribution file paths
        @rtype: tuple[dict[str|unicode, str|unicode], list[str|unicode]]]
        """
        metadata_table = MetadataTable(
            separator=self._separator,
            logfile=self._logfile,
            verbose=self._verbose)

        community_design = CommunityDesign(
            column_name_genome_id=self._column_name_genome_id,
            column_name_otu=self._column_name_otu,
            column_name_novelty_category=self._column_name_novelty_category,
            column_name_ncbi=self._column_name_ncbi,
            column_name_source=self._column_name_source,
            max_processors=self._max_processors,
            tmp_dir=self._project_file_folder_handler.get_tmp_wd(),
            logfile=self._logfile,
            verbose=self._verbose,
            debug=self._debug,
            seed=None)

        # one distribution file per sample
        dir_out_distributions = self._project_file_folder_handler.get_distribution_dir()
        distribution_file_paths = community_design.get_distribution_file_paths(
            dir_out_distributions, self._number_of_samples)

        # draw genomes for every community; metadata_table is filled in-place
        dir_out_metadata = self._project_file_folder_handler.get_meta_data_dir()
        merged_genome_id_to_path_map = community_design.design_samples(
            list_of_communities=self._list_of_communities,
            metadata_table=metadata_table,
            list_of_file_paths_distribution=distribution_file_paths,
            directory_out_metadata=dir_out_metadata,
            directory_in_template=self._strain_simulation_template)
        self.write_profile_gold_standard(metadata_table, distribution_file_paths)

        # persist the collected genome metadata
        metadata_table.write(
            self._project_file_folder_handler.get_genome_metadata_file_path(),
            column_names=True)
        return merged_genome_id_to_path_map, distribution_file_paths
Ejemplo n.º 4
0
    def compute_novelty_for_metafile(self, in_meta_file, out_meta_file):
        """
        Compute the novelty_category for every NCBI id of a metadata file.

        The input metadata file must include a header with the column name
        'NCBI_ID'; a novelty_category column is added if it does not exist.
        The updated table is written to the output file.

        @param in_meta_file: filepath to file named 'metadata_table_[version].csv'
        @type in_meta_file: str | unicode
        @param out_meta_file: file path of the output
        @type out_meta_file: str | unicode

        @rtype: None
        """
        assert self.validate_file(in_meta_file)
        # NOTE(review): validating the *output* path with validate_file looks
        # odd if the file does not exist yet -- confirm validate_file accepts
        # a not-yet-existing path.
        assert self.validate_file(out_meta_file)
        self._logger.info(
            "Processing information from metafile: '{}'".format(in_meta_file))
        table = MetadataTable(
            separator=self._separator,
            logfile=self._logfile,
            verbose=self._verbose)
        table.read(in_meta_file, column_names=True)
        self.compute_novelty(table)
        table.write(out_meta_file, column_names=True)
Ejemplo n.º 5
0
    def marker_gene_annotation(self):
        """
        Classify unpublished genomes based on the clusters they are found in
        (third pipeline step).

        Since clusters were made in 0.01 distance steps, the classification can
        be done using the smallest clusters first, using bigger ones if a
        classification can not be made. If a marker gene of an unpublished
        genome is found in a cluster together with references, a common taxon
        that 90% of sequences agree with will be the predicted taxon. The 90%
        is arbitrarily chosen and is required because of taxonomic
        inconsistencies. When a specific rank is checked for agreement,
        sequences with unknown classification on that rank are ignored.
        TODO: check for taxonomic consistency on higher ranks for those!
        Novelty prediction is based on the predicted taxon's rank. A high rank
        (phylum, order, class) with low distance can be a strong indicator for
        taxonomic inconsistencies. But it could also be caused by sequences
        that are not fully classified, yet.

        input:
        - meta data table with a list of the genomes that are to be classified
        - working directory where the results will be saved and which contains the mothur formatted file with the clusters
        output:
        - meta data table with a list of the genomes, with columns added that contain cluster based tax prediction, rank and novelty prediction

        @rtype: None
        """
        # set of taxonomic ids of well known genomes
        data_table = MetadataTable(separator=self._separator,
                                   logfile=self._logfile,
                                   verbose=self._verbose)
        data_table.read(self._file_path_map_reference_genome_id_to_tax_id)
        # column 1 of the map holds the NCBI taxonomic ids of the references
        list_of_refernce_ncbi_id = data_table.get_column(1)

        # mapping of all internal ids
        # data_table_iid_mapping_silva = MetadataTable(
        # 	separator=self._separator, logfile=self._logfile, verbose=self._verbose)
        # file_path_silva_map = os.path.join(self._silva_reference_directory, MGCluster.get_file_name_of_map())
        # data_table_iid_mapping_silva.read(file_path_silva_map)
        data_table_iid_mapping = MetadataTable(separator=self._separator,
                                               logfile=self._logfile,
                                               verbose=self._verbose)
        data_table_iid_mapping.read(
            self._project_file_folder_handler.get_file_path_internal_id_map())
        # data_table_iid_mapping.concatenate(data_table_iid_mapping_silva, strict=False)

        # Annotation engine: bundles all column names and file locations used
        # by the classification, novelty, OTU and ANI steps below.
        mg_annotate = MGAnnotate(
            # ncbi_reference_directory=self._ncbi_reference_directory,
            file_path_query_genomes_location=self.
            _file_path_query_genomes_location_file,
            file_path_reference_genomes_location=self.
            _file_path_reference_genome_locations,
            file_path_reference_taxid_map=self.
            _file_path_map_reference_genome_id_to_tax_id,
            file_path_nucmer=self._file_path_nucmer,
            column_name_genome_id=self._column_name_genome_id,
            column_name_otu=self._column_name_otu_id,
            column_name_novelty_category=self._column_name_cluster_novelty,
            column_name_ncbi=self._column_name_ncbi,
            column_name_scientific_name=self.
            _column_name_cluster_scientific_name,
            column_name_ani=self._column_name_ani,
            column_name_ani_novelty=self._column_name_ani_novelty,
            column_name_ani_ncbi=self._column_name_ani_compare,
            column_name_ani_scientific_name=self.
            _column_name_ani_scientific_name,
            temp_directory=self._directory_temp,
            max_processors=self._max_processors,
            separator=self._separator,
            logfile=self._logfile,
            verbose=self._verbose,
            debug=self._debug)

        # The user supplied metadata table; results are added as new columns.
        metadata_table = MetadataTable(separator=self._separator,
                                       logfile=self._logfile,
                                       verbose=self._verbose)
        metadata_table.read(self._metadata_table_in, column_names=True)
        metadata_table.remove_empty_columns()

        # The genome id column is mandatory; abort early if it is missing.
        list_query_gid = metadata_table.get_column(self._column_name_genome_id)
        if list_query_gid is None:
            msg = "Meta data file does not contain the required header '{}'".format(
                self._column_name_genome_id)
            self._logger.error(msg)
            raise IOError(msg)

        taxonomy = NcbiTaxonomy(self._ncbi_reference_directory,
                                verbose=self._verbose,
                                logfile=self._logfile)

        # Load the mothur clustering; iids are resolved to genome ids via the
        # internal id map (columns 0 -> 1).
        mothur_cluster = MothurCluster(
            self._precision,
            iid_gid_mapping=data_table_iid_mapping.get_map(0, 1),
            logfile=self._logfile,
            verbose=self._verbose,
            debug=self._debug)
        mothur_cluster.read(
            self._project_file_folder_handler.get_file_path_cluster_mg_16s(),
            list_query_gid)

        # Taxonomic view of the clusters (iid -> tax id is columns 0 -> 2).
        taxonomy_cluster = TaxonomicCluster(
            mothur_cluster,
            taxonomy,
            iid_tid_map=data_table_iid_mapping.get_map(0, 2),
            set_reference_genome_ncbi=set(list_of_refernce_ncbi_id),
            logfile=self._logfile,
            verbose=self._verbose,
            debug=self._debug)

        if self._annotate_classify:
            self._logger.info("Taxonomic classification")
            # also, novelty based clustering
            mg_annotate.taxonomic_classification(
                metadata_table, mothur_cluster, taxonomy_cluster, taxonomy,
                self._classification_distance_minimum)
            self._logger.info("Taxonomic classification Done")

        if self._annotate_novelty:
            self._logger.info("Novelty categorisation")
            # novelty by comparing with reference taxonomic ids
            mg_annotate.novelty_categorisation(taxonomy,
                                               set(list_of_refernce_ncbi_id),
                                               metadata_table)
            self._logger.info("Novelty categorisation Done")

        if self._annotate_otu:
            self._logger.info("OTU")
            mg_annotate.set_otu_id(metadata_table, mothur_cluster,
                                   self._otu_distance)
            self._logger.info("OTU Done")

        if self._annotate_ani:
            self._logger.info("Calculating ANI")
            mg_annotate.calculate_ani(mothur_cluster, taxonomy, metadata_table,
                                      self._distance_cutoff,
                                      self._ani_minimum_alignment)
            self._logger.info("Calculating ANI Done")
        # Persist the annotated table into the project folder.
        metadata_table.write(
            self._project_file_folder_handler.get_file_path_meta_data_table(),
            column_names=True)
Ejemplo n.º 6
0
    def run_pipeline(self):
        """
        Run the metagenome simulation pipeline.

        Executes the enabled phases in order: genome validation, community
        design (or reuse of user supplied distribution files), moving of
        genomes, read simulation, gold standard assembly (per sample and
        pooled), anonymization / binning gold standard, and compression.
        The temporary working directory is removed afterwards unless debug
        mode is active.

        @rtype: None
        """
        if not self.is_valid():
            self._logger.info("Metagenome simulation aborted")
            return
        self._logger.info("Metagenome simulation starting")
        try:
            # Validate Genomes
            if self._phase_validate_raw_genomes:
                self._logger.info("Validating Genomes")
                self._validate_raw_genomes()

            # Design Communities
            if self._input_list_of_file_paths_distributions:
                # Distribution files were supplied by the user: one per sample.
                assert len(self._input_list_of_file_paths_distributions) == self._number_of_samples

                meta_data_table = MetadataTable(separator=self._separator, logfile=self._logfile, verbose=self._verbose)
                file_path_genome_locations = self._project_file_folder_handler.get_genome_location_file_path()

                # BUGFIX: initialise the collection once, BEFORE the loop.
                # It was re-initialised per community, so only the last
                # community's genome paths survived (and a NameError was
                # raised below when the community list was empty).
                out_locations = {}
                for community in self._list_of_communities:
                    meta_data_table.read(community.file_path_metadata_table, column_names=True)
                    file_path_metadata = self._project_file_folder_handler.get_genome_metadata_file_path()
                    meta_data_table.write(file_path_metadata, column_names=True)
                    # collect all paths
                    with open(community.file_path_genome_locations, 'r') as in_locations:
                        for line in in_locations:
                            genome, path = line.strip().split('\t')
                            # might overwrite path for genomes appearing multiple
                            # times and having been assigned different genomes
                            out_locations[genome] = path
                # and write complete collection, so no genome appears multiple times
                with open(file_path_genome_locations, 'a') as locations:
                    for gen_id in out_locations:
                        locations.write("%s\t%s\n" % (gen_id, out_locations[gen_id]))

                genome_id_to_path_map = self.get_dict_gid_to_genome_file_path()
                directory_out_distributions = self._project_file_folder_handler.get_distribution_dir()
                list_of_file_paths_distributions = CommunityDesign.get_distribution_file_paths(
                    directory_out_distributions, self._number_of_samples)
                # copy the user supplied distributions into the project layout
                for file_path_src, file_path_dst in zip(self._input_list_of_file_paths_distributions, list_of_file_paths_distributions):
                    shutil.copy2(file_path_src, file_path_dst)
                self.write_profile_gold_standard(meta_data_table, list_of_file_paths_distributions)
            elif self._phase_design_community:
                self._logger.info("Design Communities")
                genome_id_to_path_map, list_of_file_paths_distributions = self._design_community()
            else:
                genome_id_to_path_map = self.get_dict_gid_to_genome_file_path()
                directory_out_distributions = self._project_file_folder_handler.get_distribution_dir()
                list_of_file_paths_distributions = CommunityDesign.get_distribution_file_paths(
                    directory_out_distributions, self._number_of_samples)

            # Move Genomes
            if self._phase_move_and_clean_genomes:
                self._logger.info("Move Genomes")
                self._move_and_cleanup_genomes(genome_id_to_path_map)

            # Read simulation (Art Illumina)
            if self._phase_simulate_reads:
                self._logger.info("Read simulation")
                for sample_index, file_path_distribution in enumerate(list_of_file_paths_distributions):
                    self._simulate_reads(file_path_distribution, sample_index)

            # Generate gold standard assembly
            list_of_output_gsa = None
            file_path_output_gsa_pooled = None
            if self._phase_pooled_gsa:
                self._logger.info("Generate gold standard assembly")
                list_of_output_gsa = self._generate_gsa()

            # Generate gold standard assembly from pooled reads of all samples
            if self._phase_pooled_gsa:
                self._logger.info("Generate pooled strains gold standard assembly")
                file_path_output_gsa_pooled = self._generate_gsa_pooled()

            # Anonymize Data (gsa)
            if self._phase_anonymize:
                self._logger.info("Anonymize Data")
                self._logger.debug(", ".join(list_of_output_gsa))
                self._anonymize_data(list_of_output_gsa, file_path_output_gsa_pooled)
            else:  # in any case create binning gold standard
                # NOTE(review): list_of_output_gsa stays None when
                # self._phase_pooled_gsa is False; the join below would then
                # raise TypeError -- confirm these phases imply each other.
                self._logger.info("Creating binning gold standard")
                self._logger.debug(", ".join(list_of_output_gsa))
                self._create_binning_gs(list_of_output_gsa)

            # Compress Data
            if self._phase_compress:
                self._logger.info("Compress Data")
                self._compress_data()

        # BUGFIX: the AssertionError handler must come before the generic one
        # (AssertionError subclasses Exception, so it was unreachable) and
        # must bind the exception -- 'e' was previously undefined here.
        except AssertionError as e:
            self._logger.info("Metagenome simulation aborted, assertion %s failed" % e)
        except (KeyboardInterrupt, SystemExit, Exception, ValueError, RuntimeError) as e:
            self._logger.debug("\n{}\n".format(traceback.format_exc()))
            exc_tb = sys.exc_info()[-1]
            self._logger.error("%s in line %s" % (e, exc_tb.tb_lineno))
            self._logger.info("Metagenome simulation aborted")
        else:
            self._logger.info("Metagenome simulation finished")

        # keep the temporary working directory only in debug mode
        if not self._debug:
            self._project_file_folder_handler.remove_directory_temp()
        else:
            self._logger.info("Temporary data stored at:\n{}".format(self._project_file_folder_handler.get_tmp_wd()))
    def design_community(self,
                         file_path_distributions,
                         community,
                         number_of_samples,
                         metadata_table,
                         directory_out_metadata,
                         directory_in_template=None):
        """
        Design artificial community, of a specific design, with different distributions for each sample

        @param file_path_distributions: File path where distributions will be written to
        @type file_path_distributions: str | unicode
        @param community: Input data for the creation of a community
        @type community: Community
        @param number_of_samples: Amount of samples to be simulated
        @type number_of_samples: int
        @param metadata_table: Will contain metadata of all (simulated) genomes/plasmids drawn
        @type metadata_table: MetadataTable
        @param directory_out_metadata: Metadata tables of separated by chosen and not chosen genomes are written to here
        @type directory_out_metadata: str | unicode
        @param directory_in_template: contains template data for strain simulation
        @type directory_in_template: str | unicode

        @return: Dictionary with drawn genome ids as key and file paths as value
        @rtype: dict[str|unicode, str|unicode]
        """
        assert isinstance(community, Community)
        assert isinstance(metadata_table, MetadataTable)

        number_of_strains = community.genomes_total

        # pick how much a strain will be simulated
        genome_amounts = []
        strain_simulation = None
        if community.simulate_strains:
            strain_simulation = StrainSimulationWrapper(
                executable_sim=None,
                directory_template=directory_in_template,
                column_name_gid=self._column_name_genome_id,
                column_name_ncbi=self._column_name_ncbi,
                column_name_source=self._column_name_source,
                separator='\t',
                filename_prefix="simulated_",
                keep_original=True,
                max_processors=self._max_processors,
                tmp_dir=self._tmp_dir,
                logfile=self._logfile,
                verbose=self._verbose,
                debug=self._debug,
                # seed=self._seed
            )

            probability = None  # 1-options.communities[community_id]["evolve"]
            genome_amounts = strain_simulation.get_genome_amounts(
                probability=probability,
                max_genome_amount=community.genomes_total,
                num_real_genomes=community.genomes_real,
                silent=not community.verbose)
            number_of_strains = len(genome_amounts)

        # draw strains
        self._logger.info("Drawing strains.")
        metadata_table_community = MetadataTable(logfile=self._logfile,
                                                 verbose=self._verbose)
        metadata_table_community.read(community.file_path_metadata_table,
                                      column_names=True)
        strain_selector = StrainSelector(
            column_name_genome_id=self._column_name_genome_id,
            column_name_otu=self._column_name_otu,
            column_name_novelty_category=self._column_name_novelty_category,
            logfile=self._logfile,
            verbose=self._verbose,
            debug=self._debug)
        list_of_drawn_genome_id = strain_selector.get_drawn_genome_id(
            metadata_table=metadata_table_community,
            number_of_strains=number_of_strains,
            number_of_strains_per_otu=community.limit_per_otu)

        # write unused data to separate file
        old_base_name = os.path.basename(community.file_path_metadata_table)
        file_prefix, extension = os.path.splitext(old_base_name)
        new_file_name = "unused_c{index}_{prefix}{ext}".format(
            prefix=file_prefix, index=community.id, ext=extension)
        metadata_new_file_path = os.path.join(directory_out_metadata,
                                              new_file_name)
        metadata_table_community.write(
            metadata_new_file_path,
            exclude=True,
            value_list=list_of_drawn_genome_id,
            key_column_name=self._column_name_genome_id,
            column_names=True)

        # get path for every genome
        genome_id_to_file_path_gff = None
        if community.file_path_gff_locations:
            genome_id_to_file_path_gff = self._get_genome_id_to_path_map(
                community.file_path_gff_locations, list_of_drawn_genome_id)
        genome_id_to_path_map = self._get_genome_id_to_path_map(
            community.file_path_genome_locations, list_of_drawn_genome_id)

        # concatenate: append the drawn subset to the caller's table
        metadata_table_community.reduce_rows_to_subset(
            list_of_drawn_genome_id, self._column_name_genome_id)
        metadata_table.concatenate(metadata_table_community, strict=False)

        # validate correct format of files
        self._logger.info("Validating raw sequence files!")
        assert self.validate_format(
            list_of_file_paths=list(genome_id_to_path_map.values()),
            file_format="fasta",
            sequence_type="dna",
            ambiguous=True), "Validation of file format failed!"

        # simulate diversity around strains
        if community.simulate_strains:
            genome_id_to_amounts = strain_simulation.get_genome_id_to_amounts(
                list_of_drawn_genome_id, genome_amounts)
            strain_simulation.simulate_strains(
                meta_table=metadata_table,
                genome_id_to_amounts=genome_id_to_amounts,
                genome_id_to_file_path_genome=genome_id_to_path_map,
                genome_id_to_file_path_gff=genome_id_to_file_path_gff)
            # adopt new list that includes simulated strains
            self._logger.info("Validating simulated sequence files!")
            # BUGFIX: use items()/list(keys()) instead of the Python-2-only
            # iteritems() so the method also runs on Python 3; behaviour on
            # Python 2 is unchanged.
            for genome_id, file_path in genome_id_to_path_map.items():
                if genome_id in list_of_drawn_genome_id:
                    continue
                assert self.validate_sequence_file(file_path,
                                                   file_format="fasta",
                                                   sequence_type="dna",
                                                   ambiguous=True)
            list_of_drawn_genome_id = list(genome_id_to_path_map.keys())

        # get community distributions
        population_distribution = PopulationDistribution(logfile=self._logfile,
                                                         verbose=self._verbose,
                                                         debug=self._debug)
        list_of_distributions = population_distribution.get_lists_of_distributions(
            size_of_population=len(list_of_drawn_genome_id),
            number_of_samples=number_of_samples,
            modus=community.mode,
            log_mu=community.log_mu,
            log_sigma=community.log_sigma,
            gauss_mu=community.gauss_mu,
            gauss_sigma=community.gauss_sigma,
            view_distribution=community.verbose)

        # write distribution file: pair every drawn genome with its
        # per-sample abundance distribution
        assert len(list_of_drawn_genome_id) == len(list_of_distributions)
        genome_id_to_distributions = dict(
            zip(list_of_drawn_genome_id, list_of_distributions))

        with open(file_path_distributions, 'w') as stream_out:
            self._write_distribution_file(
                stream_out=stream_out,
                genome_id_to_abundance=genome_id_to_distributions)
        return genome_id_to_path_map