def _validate_raw_genomes(self):
    """
    Validate the format of the query genomes and of the reference genomes.

    The genome file paths are taken from column index 1 of each location
    file. Both location files are always checked; a failure in the query
    genomes does not skip the check of the reference genomes.

    @raise RuntimeError: If the validation of either location file failed

    @rtype: None
    """
    self._logger.info("Validating Genomes")
    location_table = MetadataTable(
        separator=self._separator,
        logfile=self._logfile,
        verbose=self._verbose)

    def listed_genomes_are_valid(file_path_locations):
        # The location files are read without a header line.
        location_table.read(file_path_locations, column_names=False)
        genome_file_paths = location_table.get_column(1)
        return self._validate_format(
            genome_file_paths,
            file_format="fasta",
            sequence_type="dna",
            ambiguous=True)

    # Evaluate both before testing, so neither check is short-circuited.
    query_is_valid = listed_genomes_are_valid(
        self._file_path_query_genomes_location_file)
    reference_is_valid = listed_genomes_are_valid(
        self._file_path_reference_genome_locations)

    if query_is_valid and reference_is_valid:
        self._logger.info("Validating Genomes Done")
        return

    msg = "Invalid genomes found!"
    self._logger.error(msg)
    raise RuntimeError(msg)
def get_dict_sequence_name_to_positions(self, list_of_sam_position_files):
    """
    Build a map from sequence name to its start positions, gathered from
    all given position files.

    @attention: First column are sequence names, second column the start position.
        Only the part of a name in front of the first "-" is used as key.
        Positions are stored exactly as the table returns them, no conversion
        is done here.

    @param list_of_sam_position_files: List of sam position files
    @type list_of_sam_position_files: list[str|unicode]

    @return: Mapping of sequence name to list of starting position
    @rtype: dict[str | unicode, list[long]]
    """
    assert isinstance(list_of_sam_position_files, list)
    positions_by_sequence = {}
    position_table = MetadataTable(logfile=self._logfile, verbose=self._verbose)
    for file_path in list_of_sam_position_files:
        position_table.read(file_path, separator=self._separator)
        sequence_names = position_table.get_column(0)
        start_positions = position_table.get_column(1)
        for row, indexed_name in enumerate(sequence_names):
            start_position = start_positions[row]
            # Drop the "-<suffix>" part so all entries of one sequence share a key.
            original_name = indexed_name.split("-", 1)[0]
            positions_by_sequence.setdefault(original_name, []).append(start_position)
    return positions_by_sequence
def _validate_genome_ids(self):
    """
    Validate genome ids

    Checks are done in this order:
      1. reference genome ids (column 0) contain no duplicates
      2. query genome ids (column 0) contain no duplicates
      3. query ids and reference ids do not overlap
      4. query ids and silva ids (column 1 of the silva map) do not overlap

    On the first failed check 'self._valid_args' is set to False, an error
    is logged and the method returns without running the remaining checks.

    @rtype: None
    """
    path_reference_locations = self._file_path_reference_genome_locations
    path_query_locations = self._file_path_query_genomes_location_file
    directory_silva = self._silva_reference_directory
    assert isinstance(path_reference_locations, str)
    assert isinstance(path_query_locations, str)
    assert isinstance(directory_silva, str)

    def new_table():
        return MetadataTable(
            separator=self._separator,
            logfile=self._logfile,
            verbose=self._verbose)

    def reject(message):
        self._valid_args = False
        self._logger.error(message)

    table_reference = new_table()
    table_reference.read(path_reference_locations)
    reference_ids = table_reference.get_column(0)
    unique_reference_ids = set(reference_ids)
    if len(reference_ids) != len(unique_reference_ids):
        reject("Reference genome ids are not unique")
        return

    table_query = new_table()
    table_query.read(path_query_locations)
    query_ids = table_query.get_column(0)
    unique_query_ids = set(query_ids)
    if len(query_ids) != len(unique_query_ids):
        reject("Query genome ids are not unique")
        return

    table_silva = new_table()
    path_silva_map = os.path.join(directory_silva, MGCluster.get_file_name_of_map())
    table_silva.read(path_silva_map)
    # silva ids are allowed to be not unique
    silva_ids = set(table_silva.get_column(1))

    overlap_checks = (
        (unique_reference_ids, "Reference and query genomes ids must be unique!"),
        (silva_ids, "Silva and query genomes ids must be unique!"),
    )
    for other_ids, message in overlap_checks:
        if not unique_query_ids.isdisjoint(other_ids):
            reject(message)
            return
def _validate_raw_genomes(self):
    """
    Validate the format of the raw genomes of every community.

    For each community the genome file paths are taken from column index 1
    of its genome locations file. All communities are checked; validation
    does not stop at the first community with invalid genomes.

    @return: True if all genomes valid
    @rtype: bool
    """
    genome_preparation = GenomePreparation(
        logfile=self._logfile,
        verbose=self._verbose)
    location_table = MetadataTable(
        separator=self._separator,
        logfile=self._logfile,
        verbose=self._verbose)

    number_of_invalid_communities = 0
    for community in self._list_of_communities:
        location_table.read(community.file_path_genome_locations)
        genome_file_paths = location_table.get_column(1)
        # TODO: the file format should be determined dynamically
        community_is_valid = genome_preparation.validate_format(
            genome_file_paths,
            file_format="fasta",
            sequence_type="dna",
            ambiguous=True)
        if not community_is_valid:
            number_of_invalid_communities += 1
    return number_of_invalid_communities == 0
def create_meta_table(self, file_path_metadata_table):
    """
    Write a metadata table that contains nothing but the genome id column.

    The ids are taken from column index 0 of the reference genome locations
    file, which is read without a header line. The output is written with a
    header, using 'self._column_name_genome_id' as column name.

    @param file_path_metadata_table: File path the metadata table is written to
    @type file_path_metadata_table: str|unicode

    @raise ValueError: If the reference genome locations file has no rows

    @rtype: None
    """
    table = MetadataTable(
        separator=self._separator,
        logfile=self._logfile,
        verbose=self._verbose)
    table.read(self._file_path_reference_genome_locations, column_names=False)

    number_of_rows = table.get_number_of_rows()
    if number_of_rows == 0:
        raise ValueError("Invalid file content")

    genome_ids = table.get_column(0)
    # Reuse the same table object: drop what was read, keep only the ids.
    table.clear()
    table.insert_column(genome_ids, self._column_name_genome_id)
    table.write(file_path_metadata_table, column_names=True)
def marker_gene_annotation(self):
    """
    As the third step, the unpublished genomes are classified based on the clusters they are found in.
    Since clusters were made in 0.01 distance steps, the classification can be done using the smallest clusters first,
    using bigger ones if a classification can not be made.
    If a marker gene of an unpublished genome is found in a cluster together with references,
    a common taxon that 90% of sequences agree with will be the predicted taxon.
    The 90% is arbitrarily chosen and is required because of taxonomic inconsistencies.
    When a specific rank is checked for agreement, sequences with unknown classification on that rank are ignored.
    TODO: check for taxonomic consistency on higher ranks for those!
    Novelty prediction is based on the predicted taxon's rank. A high rank (phylum, order, class) with low distance
    can be a strong indicator for taxonomic inconsistencies.
    But it could also be caused by sequences that are not fully classified, yet.

    Each annotation step (classification, novelty, OTU, ANI) is only run if its
    'self._annotate_*' flag is set. The metadata table is written in any case.

    input:
    - meta data table with a list of the genomes that are to be classified
    - working directory where the results will be saved and which contains the mothur formatted file with the clusters

    output:
    - meta data table with a list of the genomes, with columns added that contain cluster based tax prediction,
      rank and novelty prediction

    @raise IOError: If the input metadata table has no column named 'self._column_name_genome_id'

    @rtype: None
    """
    # set of taxonomic ids of well known genomes
    # (column index 1 of the reference genome id to tax id map)
    data_table = MetadataTable(separator=self._separator, logfile=self._logfile, verbose=self._verbose)
    data_table.read(self._file_path_map_reference_genome_id_to_tax_id)
    list_of_refernce_ncbi_id = data_table.get_column(1)

    # mapping of all internal ids
    # NOTE(review): the silva id mapping below is disabled; only the project's internal id map is used.
    # data_table_iid_mapping_silva = MetadataTable(
    #     separator=self._separator, logfile=self._logfile, verbose=self._verbose)
    # file_path_silva_map = os.path.join(self._silva_reference_directory, MGCluster.get_file_name_of_map())
    # data_table_iid_mapping_silva.read(file_path_silva_map)
    data_table_iid_mapping = MetadataTable(separator=self._separator, logfile=self._logfile, verbose=self._verbose)
    data_table_iid_mapping.read(self._project_file_folder_handler.get_file_path_internal_id_map())
    # data_table_iid_mapping.concatenate(data_table_iid_mapping_silva, strict=False)

    # The annotator receives the input file locations and the names of the
    # metadata columns used by the annotation steps below.
    mg_annotate = MGAnnotate(
        # ncbi_reference_directory=self._ncbi_reference_directory,
        file_path_query_genomes_location=self._file_path_query_genomes_location_file,
        file_path_reference_genomes_location=self._file_path_reference_genome_locations,
        file_path_reference_taxid_map=self._file_path_map_reference_genome_id_to_tax_id,
        file_path_nucmer=self._file_path_nucmer,
        column_name_genome_id=self._column_name_genome_id,
        column_name_otu=self._column_name_otu_id,
        column_name_novelty_category=self._column_name_cluster_novelty,
        column_name_ncbi=self._column_name_ncbi,
        column_name_scientific_name=self._column_name_cluster_scientific_name,
        column_name_ani=self._column_name_ani,
        column_name_ani_novelty=self._column_name_ani_novelty,
        column_name_ani_ncbi=self._column_name_ani_compare,
        column_name_ani_scientific_name=self._column_name_ani_scientific_name,
        temp_directory=self._directory_temp,
        max_processors=self._max_processors,
        separator=self._separator,
        logfile=self._logfile,
        verbose=self._verbose,
        debug=self._debug)

    # Input metadata table; the genome id column lists the genomes to annotate.
    metadata_table = MetadataTable(separator=self._separator, logfile=self._logfile, verbose=self._verbose)
    metadata_table.read(self._metadata_table_in, column_names=True)
    metadata_table.remove_empty_columns()
    list_query_gid = metadata_table.get_column(self._column_name_genome_id)
    # get_column yields None here when the header is missing, so fail early.
    if list_query_gid is None:
        msg = "Meta data file does not contain the required header '{}'".format(self._column_name_genome_id)
        self._logger.error(msg)
        raise IOError(msg)

    taxonomy = NcbiTaxonomy(self._ncbi_reference_directory, verbose=self._verbose, logfile=self._logfile)

    # Clusters of the 16S marker genes; columns 0 -> 1 of the internal id map
    # are passed as the internal id to genome id mapping.
    mothur_cluster = MothurCluster(
        self._precision,
        iid_gid_mapping=data_table_iid_mapping.get_map(0, 1),
        logfile=self._logfile,
        verbose=self._verbose,
        debug=self._debug)
    mothur_cluster.read(self._project_file_folder_handler.get_file_path_cluster_mg_16s(), list_query_gid)

    # Columns 0 -> 2 of the internal id map are passed as the internal id to
    # taxonomic id mapping.
    taxonomy_cluster = TaxonomicCluster(
        mothur_cluster,
        taxonomy,
        iid_tid_map=data_table_iid_mapping.get_map(0, 2),
        set_reference_genome_ncbi=set(list_of_refernce_ncbi_id),
        logfile=self._logfile,
        verbose=self._verbose,
        debug=self._debug)

    if self._annotate_classify:
        self._logger.info("Taxonomic classification")
        # also, novelty based clustering
        mg_annotate.taxonomic_classification(
            metadata_table, mothur_cluster, taxonomy_cluster, taxonomy, self._classification_distance_minimum)
        self._logger.info("Taxonomic classification Done")

    if self._annotate_novelty:
        self._logger.info("Novelty categorisation")
        # novelty by comparing with reference taxonomic ids
        mg_annotate.novelty_categorisation(taxonomy, set(list_of_refernce_ncbi_id), metadata_table)
        self._logger.info("Novelty categorisation Done")

    if self._annotate_otu:
        self._logger.info("OTU")
        mg_annotate.set_otu_id(metadata_table, mothur_cluster, self._otu_distance)
        self._logger.info("OTU Done")

    if self._annotate_ani:
        self._logger.info("Calculating ANI")
        mg_annotate.calculate_ani(
            mothur_cluster, taxonomy, metadata_table, self._distance_cutoff, self._ani_minimum_alignment)
        self._logger.info("Calculating ANI Done")

    # Written regardless of which annotation steps were enabled.
    metadata_table.write(self._project_file_folder_handler.get_file_path_meta_data_table(), column_names=True)