Exemple #1
0
    def _validate_raw_genomes(self):
        """
        Check the format of the query genomes and of the reference genomes.

        The query and the reference location files are read without a header
        line, and column 1 of each is used as the list of genome file paths.
        Every list is handed to self._validate_format as DNA fasta with
        ambiguous characters allowed. Both lists are always checked, even if
        the first one already failed.

        @raise RuntimeError: If at least one of the two lists failed validation

        @rtype: None
        """
        self._logger.info("Validating Genomes")
        # one table instance is shared by both reads
        location_table = MetadataTable(
            separator=self._separator,
            logfile=self._logfile,
            verbose=self._verbose)

        def _genomes_are_valid(file_path_locations):
            # Read one location file and validate the genome files it lists.
            location_table.read(file_path_locations, column_names=False)
            genome_file_paths = location_table.get_column(1)
            return self._validate_format(
                genome_file_paths,
                file_format="fasta",
                sequence_type="dna",
                ambiguous=True)

        # Evaluate both unconditionally; no short-circuit between the two.
        query_ok = _genomes_are_valid(
            self._file_path_query_genomes_location_file)
        reference_ok = _genomes_are_valid(
            self._file_path_reference_genome_locations)

        if not (query_ok and reference_ok):
            msg = "Invalid genomes found!"
            self._logger.error(msg)
            raise RuntimeError(msg)
        self._logger.info("Validating Genomes Done")
    def get_dict_sequence_name_to_positions(self, list_of_sam_position_files):
        """
            Collect, per sequence, the start positions listed in a set of position files.

            Every file is read as a table split by self._separator.
            Column 0 is used as the sequence name, column 1 as the start position.
            Only the part of a sequence name in front of the first '-' is used as key
            (presumably the remainder is an index suffix), so entries that share
            this prefix end up in the same list.

            @attention: Positions are stored exactly as the table returns them,
                no type conversion is done here.

            @param list_of_sam_position_files: List of sam position files
            @type list_of_sam_position_files: list[str|unicode]

            @return: Mapping of sequence name to list of starting positions
            @rtype: dict[str | unicode, list]
        """
        assert isinstance(list_of_sam_position_files, list)

        positions_by_sequence = {}
        position_table = MetadataTable(
            logfile=self._logfile, verbose=self._verbose)

        for file_path in list_of_sam_position_files:
            position_table.read(file_path, separator=self._separator)
            sequence_names = position_table.get_column(0)
            start_positions = position_table.get_column(1)

            for row_index, sequence_name in enumerate(sequence_names):
                start_position = start_positions[row_index]
                original_name = sequence_name.split("-")[0]
                positions_by_sequence.setdefault(
                    original_name, []).append(start_position)
        return positions_by_sequence
Exemple #3
0
    def _validate_genome_ids(self):
        """
        Check the genome ids of the reference, query and silva tables.

        The checks run in this order and stop at the first failure:
            1. ids in column 0 of the reference genome location file are unique
            2. ids in column 0 of the query genome location file are unique
            3. query ids and reference ids do not overlap
            4. query ids and silva ids (column 1 of the silva map) do not overlap
        On failure self._valid_args is set to False and an error is logged.
        Nothing is raised for invalid ids.

        @return: Nothing
        @rtype: None
        """
        path_reference_locations = self._file_path_reference_genome_locations
        path_query_locations = self._file_path_query_genomes_location_file
        directory_silva = self._silva_reference_directory
        assert isinstance(path_reference_locations, str)
        assert isinstance(path_query_locations, str)
        assert isinstance(directory_silva, str)

        def _new_table():
            # Every file gets its own table instance.
            return MetadataTable(
                separator=self._separator,
                logfile=self._logfile,
                verbose=self._verbose)

        reference_table = _new_table()
        reference_table.read(path_reference_locations)
        reference_ids = reference_table.get_column(0)
        unique_reference_ids = set(reference_ids)
        if len(reference_ids) != len(unique_reference_ids):
            self._valid_args = False
            self._logger.error("Reference genome ids are not unique")
            return

        query_table = _new_table()
        query_table.read(path_query_locations)
        query_ids = query_table.get_column(0)
        unique_query_ids = set(query_ids)
        if len(query_ids) != len(unique_query_ids):
            self._valid_args = False
            self._logger.error("Query genome ids are not unique")
            return

        # Duplicates among the silva ids themselves are tolerated,
        # so they are only collected as a set.
        silva_table = _new_table()
        path_silva_map = os.path.join(
            directory_silva, MGCluster.get_file_name_of_map())
        silva_table.read(path_silva_map)
        unique_silva_ids = set(silva_table.get_column(1))

        query_overlaps_reference = not unique_query_ids.isdisjoint(
            unique_reference_ids)
        if query_overlaps_reference:
            self._valid_args = False
            self._logger.error(
                "Reference and query genomes ids must be unique!")
            return

        query_overlaps_silva = not unique_query_ids.isdisjoint(
            unique_silva_ids)
        if query_overlaps_silva:
            self._valid_args = False
            self._logger.error("Silva and query genomes ids must be unique!")
            return
Exemple #4
0
    def _validate_raw_genomes(self):
        """
        Check the format of the genome files of every community.

        For each entry of self._list_of_communities the genome location file
        is read and column 1 is used as the list of genome file paths. That
        list is validated as DNA fasta with ambiguous characters allowed.
        All communities are checked, a failure does not stop the loop.

        @return: True if the genome files of all communities are valid
        @rtype: bool
        """
        genome_validator = GenomePreparation(
            logfile=self._logfile,
            verbose=self._verbose)

        location_table = MetadataTable(
            separator=self._separator,
            logfile=self._logfile,
            verbose=self._verbose)

        all_valid = True
        for community in self._list_of_communities:
            location_table.read(community.file_path_genome_locations)
            genome_file_paths = location_table.get_column(1)

            community_ok = genome_validator.validate_format(
                genome_file_paths,
                file_format="fasta",  # TODO: should be done dynamically
                sequence_type="dna",
                ambiguous=True)
            if community_ok:
                continue
            # Remember the failure but keep validating the remaining ones.
            all_valid = False
        return all_valid
	def create_meta_table(self, file_path_metadata_table):
		"""
		Write a metadata table that consists of the genome id column only

		The ids are taken from column 0 of the reference genome location file,
		which is read without a header line. The output file is written with
		a header, using self._column_name_genome_id as the column name.

		@param file_path_metadata_table: Path of the metadata file to be written
		@type file_path_metadata_table: str|unicode

		@raise ValueError: If the reference genome location file has no rows

		@rtype: None
		"""
		table = MetadataTable(
			separator=self._separator,
			logfile=self._logfile,
			verbose=self._verbose)
		table.read(self._file_path_reference_genome_locations, column_names=False)

		number_of_rows = table.get_number_of_rows()
		if number_of_rows == 0:
			raise ValueError("Invalid file content")

		# Keep the ids, drop everything else, then re-insert them under the header name.
		genome_ids = table.get_column(0)
		table.clear()
		table.insert_column(genome_ids, self._column_name_genome_id)
		table.write(file_path_metadata_table, column_names=True)
Exemple #6
0
    def marker_gene_annotation(self):
        """
        Annotate the genomes of the input metadata table using marker gene clusters.

        Background:
        As the third step, the unpublished genomes are classified based on the
        clusters they are found in. Since clusters were made in 0.01 distance
        steps, the classification can be done using the smallest clusters
        first, using bigger ones if a classification can not be made.
        If a marker gene of an unpublished genome is found in a cluster
        together with references, a common taxon that 90% of sequences agree
        with will be the predicted taxon. The 90% is arbitrarily chosen and is
        required because of taxonomic inconsistencies.
        When a specific rank is checked for agreement, sequences with unknown
        classification on that rank are ignored.
        TODO: check for taxonomic consistency on higher ranks for those!
        Novelty prediction is based on the predicted taxon's rank. A high rank
        (phylum, order, class) with low distance can be a strong indicator for
        taxonomic inconsistencies. But it could also be caused by sequences
        that are not fully classified, yet.

        Input:
        - metadata table (self._metadata_table_in) listing the genomes to be classified
        - project files: internal id map and the 16S marker gene cluster file
        Output:
        - metadata table written to the project's metadata table path

        The steps classification, novelty categorisation, OTU assignment and
        ANI calculation are each run only if the matching self._annotate_*
        flag is set. The table is written in any case.

        @raise IOError: If the metadata table has no genome id column

        @rtype: None
        """
        def _new_table():
            # All tables of this method share separator and logging settings.
            return MetadataTable(
                separator=self._separator,
                logfile=self._logfile,
                verbose=self._verbose)

        # taxonomic ids of the well known (reference) genomes
        reference_table = _new_table()
        reference_table.read(self._file_path_map_reference_genome_id_to_tax_id)
        reference_ncbi_ids = reference_table.get_column(1)

        # mapping of internal ids; only the project's own map is loaded
        internal_id_table = _new_table()
        path_internal_id_map = \
            self._project_file_folder_handler.get_file_path_internal_id_map()
        internal_id_table.read(path_internal_id_map)

        annotate_options = {
            "file_path_query_genomes_location":
                self._file_path_query_genomes_location_file,
            "file_path_reference_genomes_location":
                self._file_path_reference_genome_locations,
            "file_path_reference_taxid_map":
                self._file_path_map_reference_genome_id_to_tax_id,
            "file_path_nucmer": self._file_path_nucmer,
            "column_name_genome_id": self._column_name_genome_id,
            "column_name_otu": self._column_name_otu_id,
            "column_name_novelty_category": self._column_name_cluster_novelty,
            "column_name_ncbi": self._column_name_ncbi,
            "column_name_scientific_name":
                self._column_name_cluster_scientific_name,
            "column_name_ani": self._column_name_ani,
            "column_name_ani_novelty": self._column_name_ani_novelty,
            "column_name_ani_ncbi": self._column_name_ani_compare,
            "column_name_ani_scientific_name":
                self._column_name_ani_scientific_name,
            "temp_directory": self._directory_temp,
            "max_processors": self._max_processors,
            "separator": self._separator,
            "logfile": self._logfile,
            "verbose": self._verbose,
            "debug": self._debug,
        }
        annotator = MGAnnotate(**annotate_options)

        genome_table = _new_table()
        genome_table.read(self._metadata_table_in, column_names=True)
        genome_table.remove_empty_columns()

        query_genome_ids = genome_table.get_column(self._column_name_genome_id)
        if query_genome_ids is None:
            msg = "Meta data file does not contain the required header '{}'".format(
                self._column_name_genome_id)
            self._logger.error(msg)
            raise IOError(msg)

        ncbi_taxonomy = NcbiTaxonomy(
            self._ncbi_reference_directory,
            verbose=self._verbose,
            logfile=self._logfile)

        clusters = MothurCluster(
            self._precision,
            iid_gid_mapping=internal_id_table.get_map(0, 1),
            logfile=self._logfile,
            verbose=self._verbose,
            debug=self._debug)
        path_cluster_file = \
            self._project_file_folder_handler.get_file_path_cluster_mg_16s()
        clusters.read(path_cluster_file, query_genome_ids)

        taxonomic_clusters = TaxonomicCluster(
            clusters,
            ncbi_taxonomy,
            iid_tid_map=internal_id_table.get_map(0, 2),
            set_reference_genome_ncbi=set(reference_ncbi_ids),
            logfile=self._logfile,
            verbose=self._verbose,
            debug=self._debug)

        if self._annotate_classify:
            # also, novelty based clustering
            self._logger.info("Taxonomic classification")
            annotator.taxonomic_classification(
                genome_table,
                clusters,
                taxonomic_clusters,
                ncbi_taxonomy,
                self._classification_distance_minimum)
            self._logger.info("Taxonomic classification Done")

        if self._annotate_novelty:
            # novelty by comparing with reference taxonomic ids;
            # a fresh set is built so it is not shared with taxonomic_clusters
            self._logger.info("Novelty categorisation")
            annotator.novelty_categorisation(
                ncbi_taxonomy, set(reference_ncbi_ids), genome_table)
            self._logger.info("Novelty categorisation Done")

        if self._annotate_otu:
            self._logger.info("OTU")
            annotator.set_otu_id(genome_table, clusters, self._otu_distance)
            self._logger.info("OTU Done")

        if self._annotate_ani:
            self._logger.info("Calculating ANI")
            annotator.calculate_ani(
                clusters,
                ncbi_taxonomy,
                genome_table,
                self._distance_cutoff,
                self._ani_minimum_alignment)
            self._logger.info("Calculating ANI Done")

        path_metadata_out = \
            self._project_file_folder_handler.get_file_path_meta_data_table()
        genome_table.write(path_metadata_out, column_names=True)