Ejemplo n.º 1
0
    def _validate_raw_genomes(self):
        """
        Check the format of the raw genomes of every community

        The genome location file of each community is read and the files listed
        in its second column are validated as fasta files containing dna.

        @return: True if the genomes of all communities are valid
        @rtype: bool
        """
        genome_checker = GenomePreparation(logfile=self._logfile,
                                           verbose=self._verbose)
        location_table = MetadataTable(separator=self._separator,
                                       logfile=self._logfile,
                                       verbose=self._verbose)

        all_valid = True
        for community in self._list_of_communities:
            location_table.read(community.file_path_genome_locations)
            community_is_valid = genome_checker.validate_format(
                location_table.get_column(1),
                file_format="fasta",  # TODO: should be done dynamically
                sequence_type="dna",
                ambiguous=True)
            # No early exit: every community is still validated after a failure.
            if not community_is_valid:
                all_valid = False
        return all_valid
Ejemplo n.º 2
0
    def _read_distribution_file(self, file_path):
        """
        Read the distribution file of a sample and normalise the abundances

        Each row of the file holds a genome id and its abundance. The abundances
        are divided by their total, so the returned values sum to 1.

        @param file_path: File with genome ids associated with the abundance of a genome
        @type file_path: str | unicode

        @return: Dictionary of genome id to relative abundance, empty if the file has no rows
        @rtype: dict[str|unicode, float]

        @raise ZeroDivisionError: If genomes are listed but their abundances sum to zero
        """
        self._logger.info('Reading distribution file')
        assert self.validate_file(file_path)
        dict_id_abundance = {}
        metadata_table = MetadataTable(logfile=self._logfile, verbose=self._verbose, separator=self._separator)
        iterator_distributions = metadata_table.parse_file(file_path, as_list=True)
        abundance_sum = 0.
        for genome_id, abundance in iterator_distributions:
            assert genome_id != '', "Invalid genom id: '{}'".format(genome_id)
            assert abundance != '', "Invalid abundance: '{}'".format(genome_id)
            abundance = float(abundance)
            assert self.validate_number(abundance, zero=True), "Invalid abundance: '{}'".format(genome_id)

            assert genome_id not in dict_id_abundance, "Genome '{}' not unique in the distribution file!".format(genome_id)
            dict_id_abundance[genome_id] = abundance
            abundance_sum += abundance

        if not dict_id_abundance:
            # Nothing to normalise; an empty file yields an empty mapping.
            return {}
        if abundance_sum == 0:
            # Same exception type the plain division would raise, but with a
            # message that points at the offending file.
            raise ZeroDivisionError(
                "Abundances in '{}' sum to zero, unable to normalise".format(file_path))
        # normalise to 1
        return {
            genome_id: abundance / abundance_sum
            for genome_id, abundance in dict_id_abundance.items()
        }
Ejemplo n.º 3
0
    def write_taxonomic_profile_from_abundance_files(self,
                                                     metadata_table,
                                                     list_of_file_paths,
                                                     directory_output,
                                                     sample_id=""):
        """
        Write one taxonomic profile file per relative abundance file

        The name of each profile is built from the profile file name template
        of this object, filled with the position of the abundance file in the list.

        @param metadata_table: Contains metadata of all communities
        @type metadata_table: MetadataTable
        @param list_of_file_paths: List of abundance file paths
        @type list_of_file_paths: list[str | unicode]
        @param directory_output: Profiles are written in this directory
        @type directory_output: str | unicode
        @param sample_id: Identifier of a sample
        @type sample_id: str | unicode
        """
        abundance_reader = MetadataTable(logfile=self._logfile,
                                         verbose=self._verbose)
        for sample_index, abundance_file_path in enumerate(list_of_file_paths):
            abundance_rows = abundance_reader.parse_file(abundance_file_path,
                                                         column_names=False)
            profile_name = self._filename_taxonomic_profile.format(
                sample_index=sample_index)
            profile_path = os.path.join(directory_output, profile_name)
            with open(profile_path, 'w') as profile_stream:
                self.write_taxonomic_profile(abundance_rows, profile_stream,
                                             metadata_table, sample_id)
Ejemplo n.º 4
0
    def _read_genome_location_file(self, file_path):
        """
        Read the file listing the file paths of genomes

        Every row must hold a non empty genome id and a valid genome file path,
        and each genome id may appear only once.

        @param file_path: File with genome ids associated with the file path of a genome
        @type file_path: str | unicode

        @return: Dictionary of genome id to file path
        @rtype: dict[str|unicode, str|unicode]
        """
        self._logger.info('Reading genome location file')
        assert self.validate_file(file_path)
        location_table = MetadataTable(logfile=self._logfile,
                                       verbose=self._verbose,
                                       separator=self._separator)
        genome_paths = {}
        for genome_id, file_path_genome in location_table.parse_file(
                file_path, as_list=True):
            assert genome_id != '', "Invalid genomid: '{}'".format(genome_id)
            # An empty path and a path failing validation report the same message.
            assert file_path_genome != '' and self.validate_file(
                file_path_genome), "Invalid file path: '{}'".format(genome_id)

            # check uniqueness
            assert genome_id not in genome_paths, "Genome '{}' not unique in the distribution file!".format(
                genome_id)
            genome_paths[genome_id] = file_path_genome
        return genome_paths
Ejemplo n.º 5
0
    def _generate_gsa_pooled(self):
        """
        Create a pooled gold standard assembly from the bam directories of all samples.

        The genome location file is read to map genome ids to fasta files, then
        the gold standard assembly handler pools the bam directories. Unless the
        anonymize phase is active, the result is either queued for compression
        or moved to the pooled assembly path of the project.

        NOTE(review): the path returned is the one produced by the assembly
        handler, even in the case where the file was just moved away from it
        - confirm callers do not use the return value in that case.

        @return: file path of the assembly as produced by the assembly handler
        @rtype: str|unicode
        """
        folder_handler = self._project_file_folder_handler
        location_table = MetadataTable(separator=self._separator,
                                       logfile=self._logfile,
                                       verbose=self._verbose)

        assembler = GoldStandardAssembly(
            file_path_samtools=self._executable_samtools,
            max_processes=self._max_processors,
            tmp_dir=folder_handler.get_tmp_wd(),
            logfile=self._logfile,
            verbose=self._verbose)

        location_table.read(folder_handler.get_genome_location_file_path())
        genome_id_to_fasta = location_table.get_map(0, 1)

        bam_directories = folder_handler.get_bam_dirs()
        sample_names = []
        for bam_directory in bam_directories:
            # the sample folder is the parent folder of the bam directory
            sample_names.append(
                os.path.basename(os.path.dirname(bam_directory)))
        self._logger.info("Samples used for pooled assembly: '{}'".format(
            "', '".join(sample_names)))

        pooled_assembly_path = assembler.pooled_gold_standard_by_dir(
            bam_directories, genome_id_to_fasta)

        if self._phase_anonymize:
            return pooled_assembly_path

        final_path = folder_handler.get_gsa_pooled_file_path()
        if self._phase_compress:
            self._list_tuple_archive_files.append(
                (pooled_assembly_path, final_path + ".gz"))
        else:
            shutil.move(pooled_assembly_path, final_path)
        return pooled_assembly_path
Ejemplo n.º 6
0
    def _validate_raw_genomes(self):
        """
        Validate the format of the query genomes and of the reference genomes

        Both location files are always checked, the error is raised afterwards.

        @raise RuntimeError: If any genome file failed the validation

        @rtype: None
        """
        self._logger.info("Validating Genomes")
        location_table = MetadataTable(separator=self._separator,
                                       logfile=self._logfile,
                                       verbose=self._verbose)

        location_files = (
            self._file_path_query_genomes_location_file,
            self._file_path_reference_genome_locations,
        )
        invalid_found = False
        for location_file in location_files:
            location_table.read(location_file, column_names=False)
            # genome file paths are expected in the second column
            if not self._validate_format(location_table.get_column(1),
                                         file_format="fasta",
                                         sequence_type="dna",
                                         ambiguous=True):
                invalid_found = True

        if invalid_found:
            msg = "Invalid genomes found!"
            self._logger.error(msg)
            raise RuntimeError(msg)
        self._logger.info("Validating Genomes Done")
Ejemplo n.º 7
0
    def _design_community(self):
        """
        Design the samples of the communities and write the gathered metadata

        The community design fills a metadata table, from which the gold
        standard profile and the genome metadata file are written afterwards.

        @return: map genome id to genome file path and list of distribution file paths
        @rtype: tuple[dict[str|unicode, str|unicode], list[str|unicode]]
        """
        folder_handler = self._project_file_folder_handler
        genome_metadata = MetadataTable(separator=self._separator,
                                        logfile=self._logfile,
                                        verbose=self._verbose)

        designer = CommunityDesign(
            column_name_genome_id=self._column_name_genome_id,
            column_name_otu=self._column_name_otu,
            column_name_novelty_category=self._column_name_novelty_category,
            column_name_ncbi=self._column_name_ncbi,
            column_name_source=self._column_name_source,
            max_processors=self._max_processors,
            tmp_dir=folder_handler.get_tmp_wd(),
            logfile=self._logfile,
            verbose=self._verbose,
            debug=self._debug,
            seed=None)

        distribution_paths = designer.get_distribution_file_paths(
            folder_handler.get_distribution_dir(), self._number_of_samples)
        genome_id_to_path = designer.design_samples(
            list_of_communities=self._list_of_communities,
            metadata_table=genome_metadata,
            list_of_file_paths_distribution=distribution_paths,
            directory_out_metadata=folder_handler.get_meta_data_dir(),
            directory_in_template=self._strain_simulation_template)

        self.write_profile_gold_standard(genome_metadata, distribution_paths)
        genome_metadata.write(folder_handler.get_genome_metadata_file_path(),
                              column_names=True)
        return genome_id_to_path, distribution_paths
Ejemplo n.º 8
0
    def get_dict_unique_id_to_genome_file_path(self, file_path_mapping):
        """
            Get a map of the first column to the second column of a mapping file.

            NOTE(review): judging by the method name the columns are presumably
            unique id and genome file path - confirm against the file format.

            @param file_path_mapping: File path to mapping file
            @type file_path_mapping: str | unicode

            @return: Mapping of first column values to second column values
            @rtype: dict[str | unicode, str | unicode]
        """
        assert isinstance(file_path_mapping, str)

        mapping_table = MetadataTable(logfile=self._logfile,
                                      verbose=self._verbose)
        mapping_table.read(file_path_mapping, separator=self._separator)
        return mapping_table.get_map(0, 1)
Ejemplo n.º 9
0
    def get_dict_gid_to_genome_file_path(self):
        """
        Get map genome id to genome file path from the genome location file of the project

        @raise RuntimeError: If the genome location file fails the file validation

        @return: Genome id to genome file path
        @rtype: dict[str|unicode, str|unicode]
        """
        location_table = MetadataTable(separator=self._separator,
                                       logfile=self._logfile,
                                       verbose=self._verbose)

        location_file = self._project_file_folder_handler.get_genome_location_file_path()
        file_is_valid = self._validator.validate_file(location_file, silent=True)
        if not file_is_valid:
            raise RuntimeError(
                "Required file not found! Was design of communities not completed?")

        location_table.read(location_file)
        return location_table.get_map(0, 1)
Ejemplo n.º 10
0
    def _get_genome_id_to_path_map(
            self, file_path_of_file_mapping_genome_id_to_paths,
            list_of_drawn_genome_id):
        """
        Get a dictionary mapping the drawn genome ids to the path of their genome

        A file without rows is treated as an empty mapping.

        @param file_path_of_file_mapping_genome_id_to_paths: File path to file with format 'id \t path'
        @type file_path_of_file_mapping_genome_id_to_paths: str | unicode
        @param list_of_drawn_genome_id: List of genome identifiers
        @type list_of_drawn_genome_id: list[str|unicode]

        @return: genome ids mapped to their genome file path
        @rtype: dict[str|unicode, str|unicode]
        """
        mapping_table = MetadataTable(logfile=self._logfile,
                                      verbose=self._verbose)
        mapping_table.read(file_path_of_file_mapping_genome_id_to_paths)
        known_paths = {}
        if mapping_table.get_number_of_rows() > 0:
            known_paths = mapping_table.get_map(0, 1, unique_key=True)

        msg = "'{}' is missing one or more genome id".format(
            os.path.basename(file_path_of_file_mapping_genome_id_to_paths))
        # every drawn genome id must be listed in the mapping file
        missing_ids = set(list_of_drawn_genome_id).difference(
            known_paths.keys())
        assert not missing_ids, msg

        drawn_paths = {}
        for genome_id in list_of_drawn_genome_id:
            drawn_paths[genome_id] = known_paths[genome_id]
        return drawn_paths
Ejemplo n.º 11
0
    def get_dict_sequence_name_to_positions(self, list_of_sam_position_files):
        """
            Collect, per sequence name, the values of the second column of all position files.

            The sequence name is the part of the first column before the first "-".

            @attention: First column are sequence names, second column the start position

            @param list_of_sam_position_files: List of sam position files
            @type list_of_sam_position_files: list[str|unicode]

            @return: Mapping of sequence name to list of starting position
            @rtype: dict[str | unicode, list[long]]
        """
        assert isinstance(list_of_sam_position_files, list)

        positions_by_sequence = {}
        position_table = MetadataTable(logfile=self._logfile,
                                       verbose=self._verbose)
        for position_file in list_of_sam_position_files:
            position_table.read(position_file, separator=self._separator)
            names = position_table.get_column(0)
            positions = position_table.get_column(1)

            for row_index, indexed_name in enumerate(names):
                position = positions[row_index]
                # strip the index suffix that follows the first "-"
                sequence_name = indexed_name.split("-")[0]
                positions_by_sequence.setdefault(sequence_name,
                                                 []).append(position)
        return positions_by_sequence
Ejemplo n.º 12
0
    def get_dict_genome_id_to_tax_id(self, file_path_metadata):
        """
            Get a map, genome id to taxonomic id from a metadata file with a header.

            The genome id column and the taxonomic id column are looked up by
            the column names configured on this object.

            @param file_path_metadata: File path to metadata file
            @type file_path_metadata: str | unicode

            @return: Mapping of genome id to taxonomic id
            @rtype: dict[str | unicode, str | unicode]
        """
        # checked left to right, same order as four separate assertions
        assert (isinstance(file_path_metadata, str)
                and isinstance(self._column_name_gid, str)
                and isinstance(self._column_name_ncbi, str)
                and isinstance(self._separator, str))

        metadata = MetadataTable(logfile=self._logfile, verbose=self._verbose)
        metadata.read(file_path_metadata,
                      separator=self._separator,
                      column_names=True)
        return metadata.get_map(self._column_name_gid, self._column_name_ncbi)
Ejemplo n.º 13
0
    def _get_genome_id_to_path_map(
            self, file_path_of_file_mapping_genome_id_to_paths,
            list_of_drawn_genome_id):
        """
        Get a dictionary mapping the drawn genome ids to the path of their genome

        @param file_path_of_file_mapping_genome_id_to_paths: File path to file with format 'id \t path'
        @type file_path_of_file_mapping_genome_id_to_paths: str | unicode
        @param list_of_drawn_genome_id: List of genome identifiers
        @type list_of_drawn_genome_id: list[str|unicode]

        @return: genome ids mapped to their genome file path
        @rtype: dict[str|unicode, str|unicode]
        """
        mapping_table = MetadataTable(logfile=self._logfile,
                                      verbose=self._verbose)
        mapping_table.read(file_path_of_file_mapping_genome_id_to_paths)
        known_paths = mapping_table.get_map(0, 1, unique_key=True)

        # every drawn genome id must be listed in the mapping file
        missing_ids = set(list_of_drawn_genome_id).difference(
            known_paths.keys())
        assert not missing_ids

        drawn_paths = {}
        for genome_id in list_of_drawn_genome_id:
            drawn_paths[genome_id] = known_paths[genome_id]
        return drawn_paths
Ejemplo n.º 14
0
    def _get_genome_id_to_path_map(self, file_path):
        """
        Get a map of genome_id to genome path from a file without header

        A file without rows is reported with a warning and gives an empty map.

        @param file_path: File path
        @type file_path: str | unicode

        @return: map of genome_id to genome path
        @rtype: dict[str|unicode, str|unicode]
        """
        assert self.validate_file(file_path)

        location_table = MetadataTable(separator=self._separator,
                                       logfile=self._logfile,
                                       verbose=self._verbose)
        location_table.read(file_path, column_names=False)
        if location_table.get_number_of_rows() != 0:
            return location_table.get_map(0, 1)

        self._logger.warning("No data in file '{}'.".format(file_path))
        return {}
Ejemplo n.º 15
0
    def compute_novelty_for_metafile(self, in_meta_file, out_meta_file):
        """
        Compute the novelty_category for each NCBI ID in the metafile and write the result to the output file
        (Note that the metafile must include a header with column name 'NCBI_ID'
        whereas novelty_category is added if it does not exist)

        NOTE(review): both paths have to pass the file validation, so the
        output file presumably must exist beforehand - confirm.

        @param in_meta_file: filepath to file named 'metadata_table_[version].csv'
        @type in_meta_file: str | unicode
        @param out_meta_file: file path of the output
        @type out_meta_file: str | unicode

        @rtype: None
        """
        for meta_file in (in_meta_file, out_meta_file):
            assert self.validate_file(meta_file)

        self._logger.info(
            "Processing information from metafile: '{}'".format(in_meta_file))
        novelty_table = MetadataTable(separator=self._separator,
                                      logfile=self._logfile,
                                      verbose=self._verbose)
        novelty_table.read(in_meta_file, column_names=True)
        self.compute_novelty(novelty_table)
        novelty_table.write(out_meta_file, column_names=True)
Ejemplo n.º 16
0
    def create_meta_table(self, file_path_metadata_table):
        """
        Generate an input metadata file that contains the genome ids only

        The ids are taken from the first column of the reference genome
        location file and written as a single column with a header.

        @param file_path_metadata_table: File path the metadata table is written to
        @type file_path_metadata_table: str|unicode

        @raise ValueError: If the reference genome location file has no rows

        @rtype: None
        """
        table = MetadataTable(separator=self._separator,
                              logfile=self._logfile,
                              verbose=self._verbose)
        table.read(self._file_path_reference_genome_locations,
                   column_names=False)
        number_of_rows = table.get_number_of_rows()
        if number_of_rows == 0:
            raise ValueError("Invalid file content")

        # keep the ids, then reuse the emptied table for the output
        genome_ids = table.get_column(0)
        table.clear()
        table.insert_column(genome_ids, self._column_name_genome_id)
        table.write(file_path_metadata_table, column_names=True)
Ejemplo n.º 17
0
    def marker_gene_annotation(self):
        """
        As the third step, the unpublished genomes are classified based on the clusters they are found in.

        Since clusters were made in 0.01 distance steps, the classification can be done using the
        smallest clusters first, using bigger ones if a classification can not be made.
        If a marker gene of an unpublished genome is found in a cluster together with references,
        a common taxon that 90% of sequences agree with will be the predicted taxon.
        The 90% is arbitrarily chosen and is required because of taxonomic inconsistencies.
        When a specific rank is checked for agreement, sequences with unknown classification
        on that rank are ignored.
        TODO: check for taxonomic consistency on higher ranks for those!
        Novelty prediction is based on the predicted taxon's rank. A high rank (phylum, order, class)
        with low distance can be a strong indicator for taxonomic inconsistencies.
        But it could also be caused by sequences that are not fully classified, yet.

        input:
        - meta data table with a list of the genomes that are to be classified
        - working directory where the results will be saved and which contains
          the mothur formatted file with the clusters
        output:
        - meta data table with a list of the genomes, with columns added that contain
          cluster based tax prediction, rank and novelty prediction

        Each annotation step (classification, novelty, OTU, ANI) runs only if
        its flag on this object is set.

        @raise IOError: If the input meta data table has no genome id column

        @rtype: None
        """
        folder_handler = self._project_file_folder_handler
        table_options = {
            "separator": self._separator,
            "logfile": self._logfile,
            "verbose": self._verbose,
        }

        # taxonomic ids of well known genomes, second column of the reference map
        reference_table = MetadataTable(**table_options)
        reference_table.read(self._file_path_map_reference_genome_id_to_tax_id)
        reference_ncbi_ids = reference_table.get_column(1)

        # mapping of all internal ids
        iid_table = MetadataTable(**table_options)
        iid_table.read(folder_handler.get_file_path_internal_id_map())

        annotator = MGAnnotate(
            file_path_query_genomes_location=self._file_path_query_genomes_location_file,
            file_path_reference_genomes_location=self._file_path_reference_genome_locations,
            file_path_reference_taxid_map=self._file_path_map_reference_genome_id_to_tax_id,
            file_path_nucmer=self._file_path_nucmer,
            column_name_genome_id=self._column_name_genome_id,
            column_name_otu=self._column_name_otu_id,
            column_name_novelty_category=self._column_name_cluster_novelty,
            column_name_ncbi=self._column_name_ncbi,
            column_name_scientific_name=self._column_name_cluster_scientific_name,
            column_name_ani=self._column_name_ani,
            column_name_ani_novelty=self._column_name_ani_novelty,
            column_name_ani_ncbi=self._column_name_ani_compare,
            column_name_ani_scientific_name=self._column_name_ani_scientific_name,
            temp_directory=self._directory_temp,
            max_processors=self._max_processors,
            separator=self._separator,
            logfile=self._logfile,
            verbose=self._verbose,
            debug=self._debug)

        query_table = MetadataTable(**table_options)
        query_table.read(self._metadata_table_in, column_names=True)
        query_table.remove_empty_columns()

        query_genome_ids = query_table.get_column(self._column_name_genome_id)
        if query_genome_ids is None:
            msg = "Meta data file does not contain the required header '{}'".format(
                self._column_name_genome_id)
            self._logger.error(msg)
            raise IOError(msg)

        taxonomy = NcbiTaxonomy(self._ncbi_reference_directory,
                                verbose=self._verbose,
                                logfile=self._logfile)

        clusters = MothurCluster(self._precision,
                                 iid_gid_mapping=iid_table.get_map(0, 1),
                                 logfile=self._logfile,
                                 verbose=self._verbose,
                                 debug=self._debug)
        clusters.read(folder_handler.get_file_path_cluster_mg_16s(),
                      query_genome_ids)

        taxonomy_cluster = TaxonomicCluster(
            clusters,
            taxonomy,
            iid_tid_map=iid_table.get_map(0, 2),
            set_reference_genome_ncbi=set(reference_ncbi_ids),
            logfile=self._logfile,
            verbose=self._verbose,
            debug=self._debug)

        if self._annotate_classify:
            self._logger.info("Taxonomic classification")
            # also, novelty based clustering
            annotator.taxonomic_classification(
                query_table, clusters, taxonomy_cluster, taxonomy,
                self._classification_distance_minimum)
            self._logger.info("Taxonomic classification Done")

        if self._annotate_novelty:
            self._logger.info("Novelty categorisation")
            # novelty by comparing with reference taxonomic ids;
            # a fresh set is passed, separate from the one given to the cluster
            annotator.novelty_categorisation(taxonomy,
                                             set(reference_ncbi_ids),
                                             query_table)
            self._logger.info("Novelty categorisation Done")

        if self._annotate_otu:
            self._logger.info("OTU")
            annotator.set_otu_id(query_table, clusters, self._otu_distance)
            self._logger.info("OTU Done")

        if self._annotate_ani:
            self._logger.info("Calculating ANI")
            annotator.calculate_ani(clusters, taxonomy, query_table,
                                    self._distance_cutoff,
                                    self._ani_minimum_alignment)
            self._logger.info("Calculating ANI Done")

        query_table.write(folder_handler.get_file_path_meta_data_table(),
                          column_names=True)
Ejemplo n.º 18
0
    def marker_gene_extraction(self):
        """
        The first step is to find and extract 16S marker gene sequences.

        The sequences are found using "hmmsearch" and extracted based on the given positions.
        Two hmmer can currently be used. HMMER3.0 with a profile from 2010 and "rnammer"
        using HMMER2.0 with a profile from 2006.
        A third one using HMMER3.0 is still to be evaluated.
        So far it seems like rnammer provides better (more) results, but is also very slow.

        input:
        - file containing a list of fasta file paths, for example the genomes that someone wants to cluster.
        - file containing a list of reference fasta file paths or alternatively, a fasta formatted file
          containing the already extracted marker genes of the reference genomes.
        - working directory where the results will be saved
          (intermediate files will be worked on in the designated /tmp folder)
        - the number of processors that are available for parallel processing.
          The program "parallel" is used to process several genomes at the same time.
        output:
        - fasta formatted file containing the extracted marker genes of all genomes

        After the extraction the silva internal id map is appended to the
        internal id map of the project, which is then written back in place.

        @rtype: None
        """
        assert isinstance(self, ArgumentHandler)
        folder_handler = self._project_file_folder_handler

        extractor = MGExtract(
            mg_analyse_executable=self._get_mg_analyse_executable(),
            file_path_query_genome_file_paths=self._file_path_query_genomes_location_file,
            file_path_reference_genome_file_paths=self._file_path_reference_genome_locations,
            file_path_name_reference_marker_genes=self._file_path_reference_markergene,
            config_path=self._file_path_config,
            file_path_map_reference_genome_id_to_tax_id=self._file_path_map_reference_genome_id_to_tax_id,
            max_processors=self._max_processors,
            temp_directory=folder_handler.get_tmp_wd(),
            separator=self._separator,
            logfile=self._logfile,
            verbose=self._verbose,
            debug=self._debug)

        extractor.gather_markergenes(
            hmmer=self._hmmer,
            mg_type="16S",
            file_path_output=folder_handler.get_file_path_mg_16s(),
            file_path_map_uid_sid=folder_handler.get_file_path_internal_id_map())

        # merge silva iid with genome iid
        silva_iid_table = MetadataTable(separator=self._separator,
                                        logfile=self._logfile,
                                        verbose=self._verbose)
        silva_iid_table.read(
            os.path.join(self._silva_reference_directory,
                         MGCluster.get_file_name_of_map()))

        genome_iid_table = MetadataTable(separator=self._separator,
                                         logfile=self._logfile,
                                         verbose=self._verbose)
        genome_iid_table.read(folder_handler.get_file_path_internal_id_map())
        genome_iid_table.concatenate(silva_iid_table, strict=False)
        genome_iid_table.write(folder_handler.get_file_path_internal_id_map())
    def merge_communities(self, list_of_communities,
                          list_of_comunity_distribution_file_paths,
                          index_sample, file_path_output):
        """
        Join the distribution files of several communities into one sample distribution.

        For every community the factor 'ratio / current share in the sample'
        is computed and handed, together with freshly opened parsers of the
        distribution files, to '_write_joined_community'.

        @param list_of_communities: Community inputs, each providing a 'ratio'
        @type list_of_communities: list[Community]
        @param list_of_comunity_distribution_file_paths: Distribution file of each community, same order as the communities
        @type list_of_comunity_distribution_file_paths: list[str | unicode]
        @param index_sample: Index of the sample, abundances are read from column 'index_sample + 1'
        @type index_sample: int | long
        @param file_path_output: File path the joined sample distribution is written to
        @type file_path_output: str | unicode

        @raises ValueError: If a genome id occurs more than once over all distribution files

        @return: Nothing
        @rtype: None
        """
        assert isinstance(list_of_communities, list)
        assert all(isinstance(entry, Community) for entry in list_of_communities)

        distribution_paths = list_of_comunity_distribution_file_paths
        parser = MetadataTable(logfile=self._logfile, verbose=self._verbose)

        # first pass: total abundance of each community in this sample,
        # genome ids must be unique over all communities
        abundance_totals = [0] * len(list_of_communities)
        known_genome_ids = set()
        for position, path in enumerate(distribution_paths):
            rows = parser.parse_file(path, column_names=False)
            for row in rows:
                identifier = row[0]
                if identifier in known_genome_ids:
                    raise ValueError(
                        "Genome id '{}' not unique".format(identifier))
                known_genome_ids.add(identifier)
                value = row[index_sample + 1]
                abundance_totals[position] += float(value)
            rows.close()

        abundance_of_sample = sum(
            abundance_totals[position]
            for position, _ in enumerate(distribution_paths))

        # factor that moves the share of a community to its requested ratio
        factors = [0.0] * len(list_of_communities)
        for position, _ in enumerate(distribution_paths):
            requested_ratio = float(list_of_communities[position].ratio)
            share_in_sample = float(abundance_totals[position]) / float(
                abundance_of_sample)
            factors[position] = requested_ratio / share_in_sample

        # second pass: one fresh parser per community for the joined output
        parsers = [
            parser.parse_file(path, column_names=False)
            for path in distribution_paths]

        with open(file_path_output, 'w') as stream_output:
            self._write_joined_community(parsers, factors,
                                         index_sample, stream_output)
Ejemplo n.º 20
0
    def __init__(self,
                 file_path_query_genomes_location,
                 file_path_reference_genomes_location,
                 file_path_reference_taxid_map,
                 file_path_nucmer="nucmer",
                 minimum_alignment=0.8,
                 separator='\t',
                 temp_directory=None,
                 max_processors=1,
                 logfile=None,
                 verbose=False,
                 debug=False):
        """
        Constructor

        Validates the arguments, creates a fresh temporary directory and loads
        the three given tables as maps of column 0 to column 1.

        @param file_path_query_genomes_location: Table file, loaded as map of query genome id to location
        @type file_path_query_genomes_location: str|unicode
        @param file_path_reference_genomes_location: Table file, loaded as map of reference genome id to location
        @type file_path_reference_genomes_location: str|unicode
        @param file_path_reference_taxid_map: Table file, loaded as map of reference genome id to taxid
        @type file_path_reference_taxid_map: str|unicode
        @param file_path_nucmer: File path of the nucmer executable
        @type file_path_nucmer: str|unicode
        @param minimum_alignment: Number validated against the range 0 to 1
        @type minimum_alignment: int|float
        @param separator: Column separator of the table files
        @type separator: str|unicode
        @param temp_directory: Directory the temporary directory is created in, None for the system default
        @type temp_directory: str|unicode
        @param max_processors: Number validated to be at least 1
        @type max_processors: int|long
        @param logfile: file handler or file path to a log file
        @type logfile: file | FileIO | StringIO | basestring
        @param verbose: Not verbose means that only warnings and errors will be past to stream
        @type verbose: bool
        @param debug: Display debug messages
        @type debug: bool

        @rtype: None
        """
        for file_path_table in (file_path_query_genomes_location,
                                file_path_reference_genomes_location,
                                file_path_reference_taxid_map):
            assert self.validate_file(file_path_table)
        assert self.validate_file(file_path_nucmer, executable=True)
        assert temp_directory is None or self.validate_dir(temp_directory)
        assert isinstance(minimum_alignment, (int, float))
        assert self.validate_number(minimum_alignment, minimum=0, maximum=1)
        assert isinstance(separator, basestring)
        assert isinstance(max_processors, (int, long))
        assert self.validate_number(max_processors, minimum=1)
        super(ANIm, self).__init__(
            logfile=logfile, verbose=verbose, debug=debug)

        # plain state
        self._CUM_RETVALS = 0
        self._cmd_lines = []
        self._total_lengths = {}
        self._used_file_names = {}
        self._max_processors = max_processors
        self._file_path_nucmer = file_path_nucmer
        self._minimum_alignment = minimum_alignment
        self._separator = separator

        # 'dir=None' lets mkdtemp fall back to the default location
        self._tmp_dir = tempfile.mkdtemp(dir=temp_directory)

        # a single table instance is reused for all three files
        table = MetadataTable(separator=self._separator,
                              logfile=self._logfile,
                              verbose=self._verbose)

        def read_map(file_path):
            table.read(file_path)
            return table.get_map(0, 1)

        self._query_gid_to_location = read_map(
            file_path_query_genomes_location)
        self._reference_gid_to_location = read_map(
            file_path_reference_genomes_location)
        self._reference_gid_to_taxid = read_map(
            file_path_reference_taxid_map)
Ejemplo n.º 21
0
    def run_pipeline(self):
        """
        Run the metagenome simulation pipeline.

        Nothing is done if 'is_valid()' reports an invalid configuration.
        Otherwise the phases run in a fixed order: genome validation,
        community design (or import of given distribution files), moving of
        genomes, read simulation, gold standard assemblies, anonymisation or
        creation of the binning gold standard, and compression.
        Optional phases are switched by the 'self._phase_*' flags.

        Exceptions raised by a phase, including KeyboardInterrupt and
        SystemExit, are logged and not re-raised. Afterwards the temporary
        directory is removed, or only reported if debugging is active.

        @return: Nothing
        @rtype: None
        """
        if not self.is_valid():
            self._logger.info("Metagenome simulation aborted")
            return
        self._logger.info("Metagenome simulation starting")
        try:
            # Validate Genomes
            if self._phase_validate_raw_genomes:
                self._logger.info("Validating Genomes")
                # NOTE(review): the result of the validation is not evaluated here - confirm this is intended
                self._validate_raw_genomes()

            # Design Communities
            if self._input_list_of_file_paths_distributions:
                assert len(self._input_list_of_file_paths_distributions) == self._number_of_samples

                meta_data_table = MetadataTable(separator=self._separator, logfile=self._logfile, verbose=self._verbose)
                file_path_genome_locations = self._project_file_folder_handler.get_genome_location_file_path()

                # Filled over ALL communities, so each genome id is written only once.
                # A genome id appearing again replaces the path stored before.
                out_locations = {}
                for community in self._list_of_communities:
                    # NOTE(review): every community is written to the same metadata file path,
                    # presumably only the last one survives - confirm
                    meta_data_table.read(community.file_path_metadata_table, column_names=True)
                    file_path_metadata = self._project_file_folder_handler.get_genome_metadata_file_path()
                    meta_data_table.write(file_path_metadata, column_names=True)
                    # collect all paths
                    with open(community.file_path_genome_locations, 'r') as in_locations:
                        for line in in_locations:
                            line = line.strip()
                            if not line:
                                # blank lines carry no entry and can not be unpacked
                                continue
                            genome, path = line.split('\t')
                            out_locations[genome] = path
                # and write complete collection, so no genome appears multiple times
                with open(file_path_genome_locations, 'a') as locations:
                    for gen_id, path in out_locations.items():
                        locations.write("%s\t%s\n" % (gen_id, path))

                genome_id_to_path_map = self.get_dict_gid_to_genome_file_path()
                directory_out_distributions = self._project_file_folder_handler.get_distribution_dir()
                list_of_file_paths_distributions = CommunityDesign.get_distribution_file_paths(
                    directory_out_distributions, self._number_of_samples)
                for file_path_src, file_path_dst in zip(self._input_list_of_file_paths_distributions, list_of_file_paths_distributions):
                    shutil.copy2(file_path_src, file_path_dst)
                self.write_profile_gold_standard(meta_data_table, list_of_file_paths_distributions)
            elif self._phase_design_community:
                self._logger.info("Design Communities")
                genome_id_to_path_map, list_of_file_paths_distributions = self._design_community()
            else:
                genome_id_to_path_map = self.get_dict_gid_to_genome_file_path()
                directory_out_distributions = self._project_file_folder_handler.get_distribution_dir()
                list_of_file_paths_distributions = CommunityDesign.get_distribution_file_paths(
                    directory_out_distributions, self._number_of_samples)

            # Move Genomes
            if self._phase_move_and_clean_genomes:
                self._logger.info("Move Genomes")
                self._move_and_cleanup_genomes(genome_id_to_path_map)

            # Read simulation (Art Illumina)
            if self._phase_simulate_reads:
                self._logger.info("Read simulation")
                for sample_index, file_path_distribution in enumerate(list_of_file_paths_distributions):
                    self._simulate_reads(file_path_distribution, sample_index)

            # Generate gold standard assembly
            # NOTE(review): this step and the pooled one below are gated by the same flag - confirm
            list_of_output_gsa = None
            file_path_output_gsa_pooled = None
            if self._phase_pooled_gsa:
                self._logger.info("Generate gold standard assembly")
                list_of_output_gsa = self._generate_gsa()

            # Generate gold standard assembly from pooled reads of all samples
            if self._phase_pooled_gsa:
                self._logger.info("Generate pooled strains gold standard assembly")
                file_path_output_gsa_pooled = self._generate_gsa_pooled()

            # Anonymize Data (gsa)
            # 'list_of_output_gsa' is still None if no assembly was generated,
            # joining None would raise a TypeError and abort the pipeline.
            if self._phase_anonymize:
                self._logger.info("Anonymize Data")
                if list_of_output_gsa is not None:
                    self._logger.debug(", ".join(list_of_output_gsa))
                self._anonymize_data(list_of_output_gsa, file_path_output_gsa_pooled)
            else:  # in any case create binning gold standard
                self._logger.info("Creating binning gold standard")
                if list_of_output_gsa is not None:
                    self._logger.debug(", ".join(list_of_output_gsa))
                self._create_binning_gs(list_of_output_gsa)

            # Compress Data
            if self._phase_compress:
                self._logger.info("Compress Data")
                self._compress_data()

        except (KeyboardInterrupt, SystemExit, Exception) as e:
            # AssertionError is a subclass of Exception, a separate clause
            # after this one could never be reached, so it is told apart here.
            self._logger.debug("\n{}\n".format(traceback.format_exc()))
            exc_tb = sys.exc_info()[-1]
            self._logger.error("%s in line %s" % (e, exc_tb.tb_lineno))
            if isinstance(e, AssertionError):
                self._logger.info("Metagenome simulation aborted, assertion %s failed" % e)
            else:
                self._logger.info("Metagenome simulation aborted")
        else:
            self._logger.info("Metagenome simulation finished")

        if not self._debug:
            self._project_file_folder_handler.remove_directory_temp()
        else:
            self._logger.info("Temporary data stored at:\n{}".format(self._project_file_folder_handler.get_tmp_wd()))
Ejemplo n.º 22
0
    def _validate_genome_ids(self):
        """
        Validate genome ids

        Reference ids (column 0) and query ids (column 0) must each be unique,
        and the query ids must share no id with the reference ids or with
        column 1 of the silva map. On the first violation 'self._valid_args'
        is set to False and an error is logged.

        @return: Nothing
        @rtype: None
        """
        path_reference = self._file_path_reference_genome_locations
        path_query = self._file_path_query_genomes_location_file
        directory_silva = self._silva_reference_directory
        for text_argument in (path_reference, path_query, directory_silva):
            assert isinstance(text_argument, str)

        def new_table():
            return MetadataTable(separator=self._separator,
                                 logfile=self._logfile,
                                 verbose=self._verbose)

        # reference genome ids
        table_reference = new_table()
        table_reference.read(path_reference)
        ids_reference = table_reference.get_column(0)
        unique_ids_reference = set(ids_reference)
        if len(ids_reference) != len(unique_ids_reference):
            self._valid_args = False
            self._logger.error("Reference genome ids are not unique")
            return

        # query genome ids
        table_query = new_table()
        table_query.read(path_query)
        ids_query = table_query.get_column(0)
        unique_ids_query = set(ids_query)
        if len(ids_query) != len(unique_ids_query):
            self._valid_args = False
            self._logger.error("Query genome ids are not unique")
            return

        # silva ids are allowed to be not unique
        table_silva = new_table()
        path_silva_map = os.path.join(directory_silva,
                                      MGCluster.get_file_name_of_map())
        table_silva.read(path_silva_map)
        unique_ids_silva = set(table_silva.get_column(1))

        # query ids must not occur in any of the other collections
        if not unique_ids_query.isdisjoint(unique_ids_reference):
            self._valid_args = False
            self._logger.error(
                "Reference and query genomes ids must be unique!")
        elif not unique_ids_query.isdisjoint(unique_ids_silva):
            self._valid_args = False
            self._logger.error("Silva and query genomes ids must be unique!")
    def design_community(self,
                         file_path_distributions,
                         community,
                         number_of_samples,
                         metadata_table,
                         directory_out_metadata,
                         directory_in_template=None):
        """
        Design artificial community, of a specific design, with different distributions for each sample

        Steps, in order: draw genome ids from the metadata table of the
        community, write the metadata that was not drawn to an
        'unused_c<community id>_<file name>' file, look up the file paths of
        the drawn genomes, append the drawn metadata rows to 'metadata_table',
        validate the sequence files, optionally simulate strains, and write
        one distribution per genome to 'file_path_distributions'.

        @attention: 'metadata_table' is modified, the drawn rows are concatenated to it

        @param file_path_distributions: File path where distributions will be written to
        @type file_path_distributions: str | unicode
        @param community: Input data for the creation of a community
        @type community: Community
        @param number_of_samples: Amount of samples to be simulated
        @type number_of_samples: int
        @param metadata_table: Will contain metadata of all (simulated) genomes/plasmids drawn
        @type metadata_table: MetadataTable
        @param directory_out_metadata: Metadata tables of separated by chosen and not chosen genomes are written to here
        @type directory_out_metadata: str | unicode
        @param directory_in_template: contains template data for strain simulation
        @type directory_in_template: str | unicode

        @raises AssertionError: If an argument has the wrong type or a sequence file fails validation

        @return: Dictionary with drawn genome ids as key and file paths as value
        @rtype: dict[str|unicode, str|unicode]
        """
        assert isinstance(community, Community)
        assert isinstance(metadata_table, MetadataTable)

        # default, replaced below if strains are simulated
        number_of_strains = community.genomes_total

        # pick how much a strain will be simulated
        genome_amounts = []
        strain_simulation = None
        if community.simulate_strains:
            strain_simulation = StrainSimulationWrapper(
                executable_sim=None,
                directory_template=directory_in_template,
                column_name_gid=self._column_name_genome_id,
                column_name_ncbi=self._column_name_ncbi,
                column_name_source=self._column_name_source,
                separator='\t',
                filename_prefix="simulated_",
                keep_original=True,
                max_processors=self._max_processors,
                tmp_dir=self._tmp_dir,
                logfile=self._logfile,
                verbose=self._verbose,
                debug=self._debug,
                # seed=self._seed
            )

            probability = None  # 1-options.communities[community_id]["evolve"]
            genome_amounts = strain_simulation.get_genome_amounts(
                probability=probability,
                max_genome_amount=community.genomes_total,
                num_real_genomes=community.genomes_real,
                silent=not community.verbose)
            # one entry per genome to be drawn
            number_of_strains = len(genome_amounts)

        # draw strains
        self._logger.info("Drawing strains.")
        metadata_table_community = MetadataTable(logfile=self._logfile,
                                                 verbose=self._verbose)
        metadata_table_community.read(community.file_path_metadata_table,
                                      column_names=True)
        strain_selector = StrainSelector(
            column_name_genome_id=self._column_name_genome_id,
            column_name_otu=self._column_name_otu,
            column_name_novelty_category=self._column_name_novelty_category,
            logfile=self._logfile,
            verbose=self._verbose,
            debug=self._debug)
        list_of_drawn_genome_id = strain_selector.get_drawn_genome_id(
            metadata_table=metadata_table_community,
            number_of_strains=number_of_strains,
            number_of_strains_per_otu=community.limit_per_otu)

        # write unused data to separate file
        # file name: "unused_c<community id>_<original base name>"
        old_base_name = os.path.basename(community.file_path_metadata_table)
        file_prefix, extention = os.path.splitext(old_base_name)
        new_file_name = "unused_c{index}_{prefix}{ext}".format(
            prefix=file_prefix, index=community.id, ext=extention)
        metadata_new_file_path = os.path.join(directory_out_metadata,
                                              new_file_name)
        # presumably 'exclude=True' writes only rows whose genome id is NOT in 'value_list' - confirm
        metadata_table_community.write(
            metadata_new_file_path,
            exclude=True,
            value_list=list_of_drawn_genome_id,
            key_column_name=self._column_name_genome_id,
            column_names=True)

        # get path for every genome
        # gff locations are optional, the map stays None without them
        genome_id_to_file_path_gff = None
        if community.file_path_gff_locations:
            genome_id_to_file_path_gff = self._get_genome_id_to_path_map(
                community.file_path_gff_locations, list_of_drawn_genome_id)
        genome_id_to_path_map = self._get_genome_id_to_path_map(
            community.file_path_genome_locations, list_of_drawn_genome_id)

        # concatenate
        # the community table is reduced to the drawn genomes first, then appended to the given table
        metadata_table_community.reduce_rows_to_subset(
            list_of_drawn_genome_id, self._column_name_genome_id)
        metadata_table.concatenate(metadata_table_community, strict=False)

        # validate correct format of files
        self._logger.info("Validating raw sequence files!")
        assert self.validate_format(
            list_of_file_paths=genome_id_to_path_map.values(),
            file_format="fasta",
            sequence_type="dna",
            ambiguous=True), "Validation of file format failed!"

        # simulate diversity around strains
        if community.simulate_strains:
            genome_id_to_amounts = strain_simulation.get_genome_id_to_amounts(
                list_of_drawn_genome_id, genome_amounts)
            # presumably adds the simulated strains to 'metadata_table' and
            # 'genome_id_to_path_map' in place, the loop below relies on new map entries - confirm
            strain_simulation.simulate_strains(
                meta_table=metadata_table,
                genome_id_to_amounts=genome_id_to_amounts,
                genome_id_to_file_path_genome=genome_id_to_path_map,
                genome_id_to_file_path_gff=genome_id_to_file_path_gff)
            # adopt new list that includes simulated strains
            self._logger.info("Validating simulated sequence files!")
            # NOTE: 'iteritems' exists on Python 2 dictionaries only
            for genome_id, file_path in genome_id_to_path_map.iteritems():
                # files of drawn genomes were validated above already
                if genome_id in list_of_drawn_genome_id:
                    continue
                assert self.validate_sequence_file(file_path,
                                                   file_format="fasta",
                                                   sequence_type="dna",
                                                   ambiguous=True)
            list_of_drawn_genome_id = genome_id_to_path_map.keys()

        # get community distributions
        population_distribution = PopulationDistribution(logfile=self._logfile,
                                                         verbose=self._verbose,
                                                         debug=self._debug)
        list_of_distributions = population_distribution.get_lists_of_distributions(
            size_of_population=len(list_of_drawn_genome_id),
            number_of_samples=number_of_samples,
            modus=community.mode,
            log_mu=community.log_mu,
            log_sigma=community.log_sigma,
            gauss_mu=community.gauss_mu,
            gauss_sigma=community.gauss_sigma,
            view_distribution=community.verbose)

        # move and clean up files (removes sequence description)
        # genome_id_to_total_length = self.move_genome_files(
        #     genome_id_to_path_map,
        #     directory_output=directory_out_genomes,
        #     sequence_min_length=min_sequence_length,
        #     set_of_sequence_names=set_of_sequence_names)

        # write distribution file
        # genome_id_to_distributions = self._get_genome_id_to_distributions(list_of_drawn_genome_id, list_of_distributions)
        # one distribution entry is expected per genome id, paired by position
        assert len(list_of_drawn_genome_id) == len(list_of_distributions)
        genome_id_to_distributions = dict(
            zip(list_of_drawn_genome_id, list_of_distributions))

        # genome_id_to_file_name = self._get_genome_id_to_file_name(genome_id_to_path_map)
        with open(file_path_distributions, 'w') as stream_out:
            self._write_distribution_file(
                stream_out=stream_out,
                genome_id_to_abundance=genome_id_to_distributions)
        return genome_id_to_path_map
Ejemplo n.º 24
0
    def __init__(self,
                 mg_analyse_executable,
                 file_path_query_genome_file_paths,
                 file_path_reference_genome_file_paths,
                 file_path_name_reference_marker_genes,
                 config_path,
                 file_path_map_reference_genome_id_to_tax_id=None,
                 max_processors=1,
                 temp_directory=None,
                 separator="\t",
                 logfile=None,
                 verbose=False,
                 debug=False):
        """
        Constructor

        Validates the arguments and stores them. If a genome id to tax id
        mapping file is given, it is loaded as map of column 0 to column 1,
        otherwise 'self._genome_id_to_tax_id' stays None.

        @param mg_analyse_executable: File path to modified tool of Ivan, must be executable
        @type mg_analyse_executable: str | unicode
        @param file_path_query_genome_file_paths: File path to file with the location of genomes to be classified
        @type file_path_query_genome_file_paths: str | unicode
        @param file_path_reference_genome_file_paths: File path to file with the location of reference genomes, may be None
        @type file_path_reference_genome_file_paths: str | unicode | None
        @param file_path_name_reference_marker_genes: File path to fasta file with list of marker gene sequences, may be None
        @type file_path_name_reference_marker_genes: str | unicode | None
        @param config_path: File path to configuration file
        @type config_path: str | unicode
        @param file_path_map_reference_genome_id_to_tax_id: Mapping of Reference genome_id to their taxonomic assignment
        @type file_path_map_reference_genome_id_to_tax_id: str | unicode | None
        @param max_processors: Amount of available processors, at least 1
        @type max_processors: int | long
        @param temp_directory: File path to temporary storage
        @type temp_directory: str | unicode
        @param separator: Separator of metadata files
        @type separator: str | unicode
        @param logfile: Passed on to the parent class and to the metadata table
        @type logfile: file | FileIO | StringIO | basestring
        @param verbose: Passed on to the parent class and to the metadata table
        @type verbose: bool
        @param debug: Passed on to the parent class
        @type debug: bool

        @rtype: None
        """
        super(MGExtract, self).__init__(
            logfile=logfile, verbose=verbose, debug=debug)

        file_path_tax_id_map = file_path_map_reference_genome_id_to_tax_id

        # optional files may be None, all others must exist
        assert file_path_tax_id_map is None or self.validate_file(
            file_path_tax_id_map)
        assert self.validate_file(file_path_query_genome_file_paths)
        assert file_path_reference_genome_file_paths is None or self.validate_file(
            file_path_reference_genome_file_paths)
        assert file_path_name_reference_marker_genes is None or self.validate_file(
            file_path_name_reference_marker_genes)
        assert self.validate_file(config_path)
        assert self.validate_file(mg_analyse_executable, executable=True)
        assert self.validate_number(max_processors, minimum=1)
        # NOTE(review): the default None is validated unguarded here,
        # presumably a directory must always be given - confirm
        assert self.validate_dir(temp_directory)

        self._mg_analyse_executable = mg_analyse_executable
        self._config_path = config_path
        self._file_path_query_genome_file_paths = file_path_query_genome_file_paths
        self._file_path_reference_genome_file_paths = file_path_reference_genome_file_paths
        self._file_path_reference_marker_genes = file_path_name_reference_marker_genes
        self._temp_directory = temp_directory
        self._max_processors = max_processors
        self._separator = separator
        self._debug = debug
        self._working_dirs = {}
        self._genome_id_to_tax_id = None

        if file_path_tax_id_map is not None:
            table = MetadataTable(separator=self._separator,
                                  logfile=logfile,
                                  verbose=verbose)
            table.read(file_path_tax_id_map, column_names=False)
            self._genome_id_to_tax_id = table.get_map(0, 1)