def __init__(self, db_path, run=run, progress=progress):
    """Set up the table wrapper for the database at `db_path`.

    The base `Table` is initialized with the version required for this
    database, after which the next available ID is set for each of the
    three collections tables.
    """
    self.db_path = db_path

    required_version = utils.get_required_version_for_db(db_path)
    Table.__init__(self, self.db_path, required_version, run, progress)

    # prime the unique ID bookkeeping for every collections table
    for table_name in (t.collections_bins_info_table_name,
                       t.collections_contigs_table_name,
                       t.collections_splits_table_name):
        self.set_next_available_id(table_name)
def __init__(self, db_path):
    """Initialize access to the states table of the database at `db_path`.

    Raises:
        ConfigError: if the database type is not one of 'profile', 'pan',
            'structure', or 'genes'.
    """
    self.db_path = db_path
    self.states = {}

    if utils.get_db_type(self.db_path) not in ['profile', 'pan', 'structure', 'genes']:
        # the message has a '%s' placeholder, so it must be formatted with the
        # path; otherwise the user is shown a literal '%s'
        raise ConfigError("Your database '%s' does not seem to have states table, which anvi'o tries to access." % self.db_path)

    Table.__init__(self, self.db_path, utils.get_required_version_for_db(db_path), run, progress)

    self.init()
def __init__(self, db_path, run=run, progress=progress):
    """Set up the table wrapper for variable codons in the database at `db_path`.

    Keeps `run` and `progress` on the instance, initializes the base `Table`
    with the version required for this database, starts with an empty list
    of entries, and sets the next available ID for the variable codons table.
    """
    self.db_path = db_path
    self.run = run
    self.progress = progress

    required_version = utils.get_required_version_for_db(db_path)
    Table.__init__(self, self.db_path, required_version, run=self.run, progress=self.progress)

    # nothing has been collected yet
    self.num_entries = 0
    self.db_entries = []

    self.set_next_available_id(t.variable_codons_table_name)
def call_genes_and_populate_genes_in_contigs_table(self, gene_caller='prodigal'):
    """Run `gene_caller` and store its results in the contigs database.

    The gene calls returned by `self.run_gene_caller` are checked with
    `self.check_gene_calls_dict` and then handed, together with the amino
    acid sequences, to `self.populate_genes_in_contigs_table`.
    """
    Table.__init__(self, self.db_path, anvio.__contigs__version__, self.run, self.progress, simple=True)

    # the gene caller hands back the gene calls along with their amino acid sequences
    gene_caller_output = self.run_gene_caller(gene_caller)
    gene_calls_dict, amino_acid_sequences = gene_caller_output

    # refuse to go any further with an improper gene calls dict
    self.check_gene_calls_dict(gene_calls_dict)

    # fill in the genes_in_contigs and gene_amino_acid_sequences tables of the contigs db
    self.populate_genes_in_contigs_table(gene_calls_dict, amino_acid_sequences)
def __init__(self, db_path, run=run, progress=progress):
    """Set up the table wrapper for gene function calls in a contigs database.

    `utils.is_contigs_db` is called on the path before the base `Table` is
    initialized with the contigs database version; the next available ID
    is then set for the gene function calls table.
    """
    self.db_path = db_path
    self.run = run
    self.progress = progress

    # only contigs databases are expected here
    utils.is_contigs_db(self.db_path)

    contigs_db_version = anvio.__contigs__version__
    Table.__init__(self, self.db_path, contigs_db_version, run, progress)

    self.set_next_available_id(t.gene_function_calls_table_name)
def __init__(self, db_path, run=run, progress=progress):
    """Set up the table wrapper for gene clusters in a pan database.

    `utils.is_pan_db` is called on the path, the base `Table` is initialized
    with the pan database version, the next available ID is set for the pan
    gene clusters table, and the list of entries starts out empty.
    """
    self.db_path = db_path

    # only pan databases are expected here
    utils.is_pan_db(db_path)

    self.run = run
    self.progress = progress

    pan_db_version = anvio.__pan__version__
    Table.__init__(self, self.db_path, pan_db_version, run, progress)

    self.set_next_available_id(t.pan_gene_clusters_table_name)

    # entries accumulate here
    self.entries = []
def __init__(self, db_path, run=run, progress=progress):
    """Set up the table wrapper on top of a contigs database.

    After `utils.is_contigs_db` is called on the path, both `Table` and
    `TaxonNamesTable` are initialized, and a fresh `GenesInSplits` instance
    is stored on `self.genes_in_splits`.
    """
    self.db_path = db_path
    self.run = run
    self.progress = progress

    # only contigs databases are expected here
    utils.is_contigs_db(self.db_path)

    contigs_db_version = anvio.__contigs__version__
    Table.__init__(self, self.db_path, contigs_db_version, self.run, self.progress)
    TaxonNamesTable.__init__(self, self.db_path, self.run, self.progress)

    # bookkeeping object for the genes that occur in splits; it is what the
    # corresponding table in the contigs database is generated from
    self.genes_in_splits = GenesInSplits()
def __init__(self, db_path, num_threads_to_use=1, run=run, progress=progress):
    """Set up the table wrapper for HMM hits in a contigs database.

    Reads the `contigs_db_hash` meta value from the database, initializes
    the base `Table` with the contigs database version, loads the gene
    calls, and sets the next available IDs for the two HMM hits tables.

    Raises:
        ConfigError: if `self.genes_are_called` is false after the base
            `Table` is initialized, or if the gene calls dictionary is
            empty after `self.init_gene_calls_dict()`.
    """
    self.num_threads_to_use = num_threads_to_use
    self.db_path = db_path

    utils.is_contigs_db(self.db_path)

    # the connection is only needed for a single meta value; release it as soon
    # as the value is read rather than leaving it open for the life of the process
    database = db.DB(self.db_path, utils.get_required_version_for_db(self.db_path))
    try:
        self.contigs_db_hash = database.get_meta_value('contigs_db_hash')
    finally:
        database.disconnect()

    Table.__init__(self, self.db_path, anvio.__contigs__version__, run, progress)

    if not self.genes_are_called:
        raise ConfigError("It seems the contigs database '%s' was created with '--skip-gene-calling' flag.\
                           Nothing to do here :/" % (self.db_path))

    self.init_gene_calls_dict()

    if not len(self.gene_calls_dict):
        raise ConfigError("Tables that should contain gene calls are empty. Which probably means the gene\
                           caller reported no genes for your contigs.")

    self.set_next_available_id(t.hmm_hits_table_name)
    self.set_next_available_id(t.hmm_hits_splits_table_name)
def __init__(self, db_path, run=run, progress=progress):
    """Store `db_path` and initialize the base `Table` for that database.

    The version handed to `Table` is whatever
    `utils.get_required_version_for_db` reports for the path.
    """
    self.db_path = db_path

    required_version = utils.get_required_version_for_db(db_path)
    Table.__init__(self, self.db_path, required_version, run, progress)
def use_external_gene_calls_to_populate_genes_in_contigs_table(self, input_file_path, gene_calls_dict=None, ignore_internal_stop_codons=False, skip_amino_acid_sequences=False):
    """Store externally defined gene calls in the contigs database.

    Exactly one of `input_file_path` (a TAB-delimited file of external gene
    calls) or `gene_calls_dict` must be provided. A gene calls dictionary is
    expected to look like this:

        {
          "1": {
              "contig": "contig_name",
              "start": 20,
              "stop": 1544,
              "direction": "f",
              "partial": 0,
              "source": "source_name",
              "version": "unknown"
          },

          "2": {
            (...)
          },

        (...)
        }

    Gene calls that arrive through `gene_calls_dict` are APPENDED to the
    database, so it is up to the caller to make sure the gene caller ids in
    the dictionary do not overlap with those already stored. An empty
    dictionary only produces a notice, and the function returns early.

    Translated sequences are recovered for every gene call unless
    `skip_amino_acid_sequences` is True, in which case an empty sequence is
    passed on for each gene call instead. This may be useful when adding genes
    that are not translated, such as ribosomal RNA genes.

    Raises ConfigError when `gene_calls_dict` is not a dict, when both or
    neither of the two inputs are given, or when the input file yields no
    gene calls.
    """

    # anything other than None or False counts as 'the user sent us gene calls'
    gene_calls_found = gene_calls_dict is not None and gene_calls_dict is not False

    if gene_calls_found:
        if not isinstance(gene_calls_dict, dict):
            raise ConfigError("'Use external gene calls' function received a non-empty gene_calls_dict object,\
                                but it is of type '%s', and not '%s'" % (type(gene_calls_dict), dict))

        if not len(gene_calls_dict):
            # a proper dict, but with nothing in it
            self.run.info_single("'Use external gene calls' function found an empty gene calls dict, returning\
                                  prematurely and assuming you know what's up. If you don't, stop here and try to\
                                  identify what decisions you've made might have led you to this weird point your\
                                  workflow (or 'life', totally up to you and your mood, but anvi'o thinks you've\
                                  done great so far.", nl_before=1, nl_after=1)
            return

    # exactly one of the two input sources must be there
    if bool(input_file_path) == gene_calls_found:
        raise ConfigError("You must provide either an input file, or an gene calls dict to process external\
                           gene calls. You called `use_external_gene_calls_to_populate_genes_in_contigs_table`\
                           with wrong parameters.")

    Table.__init__(self, self.db_path, anvio.__contigs__version__, self.run, self.progress, simple=True)

    if gene_calls_found:
        # FIXME: we need to make sure the gene caller ids in the incoming directory is not going to
        #        overwrite an existing gene call. Something like this would have returned the
        #        current max, which could be cross-checked with what's in the dict:
        #
        #            contigs_db = ContigsDatabase(self.db_path)
        #            next_id = contigs_db.db.get_max_value_in_column('genes_in_contigs', 'gene_callers_id') + 1
        #            contigs_db.disconnect()
        append_to_the_db = True
    else:
        # a pristine run: the gene calls come from the file
        append_to_the_db = False

        gene_calls_dict = utils.get_TAB_delimited_file_as_dictionary(input_file_path,
                                                                     expected_fields=t.genes_in_contigs_table_structure,
                                                                     only_expected_fields=True,
                                                                     column_mapping=[int, str, int, int, str, int, str, str])

        if not len(gene_calls_dict):
            raise ConfigError("You provided an external gene calls file, but it returned zero gene calls. Assuming that\
                               this is an error, anvi'o will stop here and complain. If this is not an error and you\
                               in fact expected this, the proper way of doing this is to use `--skip-gene-calls` flag,\
                               instead of providing an emtpy external gene calls file. You don't agree? You need this\
                               for some weird step for you weird pipeline? Let us know, and we will consider changing\
                               this.")

        self.run.info("External gene calls", "%d gene calls recovered and will be processed." % len(gene_calls_dict))

    # either blank sequences for everyone, or the real thing
    if skip_amino_acid_sequences:
        amino_acid_sequences = {gene_callers_id: '' for gene_callers_id in gene_calls_dict}
    else:
        amino_acid_sequences = self.get_amino_acid_sequences_for_genes_in_gene_calls_dict(gene_calls_dict, ignore_internal_stop_codons=ignore_internal_stop_codons)

    # fill in the genes_in_contigs and gene_amino_acid_sequences tables of the contigs db
    self.populate_genes_in_contigs_table(gene_calls_dict, amino_acid_sequences, append_to_the_db=append_to_the_db)
def populate_genes_in_splits_tables(self, gene_calls_dict=None):
    """Work out which genes occur in which splits and store them in the database.

    Every gene call that overlaps a split of its contig is registered in a
    `GenesInSplits` instance, and the collected entries are inserted into
    the genes in splits table.

    Parameters
    ==========
    gene_calls_dict : dict, None
        Gene calls to process. When empty or None, `self.gene_calls_dict`
        (as set up by `self.init_gene_calls_dict()`) is used instead.
    """
    utils.is_contigs_db(self.db_path)

    Table.__init__(self, self.db_path, anvio.__contigs__version__, self.run, self.progress)
    self.set_next_available_id(t.genes_in_splits_table_name)
    self.init_gene_calls_dict()

    if not gene_calls_dict:
        gene_calls_dict = self.gene_calls_dict

    genes_in_splits = GenesInSplits(entry_id_start=self.next_id(t.genes_in_splits_table_name))

    # build a dictionary for fast access to all genes identified within a contig
    gene_calls_in_contigs_dict = {}
    for gene_callers_id in gene_calls_dict:
        contig = gene_calls_dict[gene_callers_id]['contig']
        gene_calls_in_contigs_dict.setdefault(contig, set()).add(gene_callers_id)

    # report the fraction of contigs with gene calls; guard against a
    # database without any contigs so the report cannot divide by zero
    num_contigs = len(self.contigs_info)
    num_contigs_with_gene_calls = len(gene_calls_in_contigs_dict)
    percent_contigs_with_gene_calls = num_contigs_with_gene_calls * 100.0 / num_contigs if num_contigs else 0.0
    self.run.info('Contigs with at least one gene call',
                  '%d of %d (%.1f%%)' % (num_contigs_with_gene_calls, num_contigs, percent_contigs_with_gene_calls))

    # NOTE: the per-split summary (number of genes, average gene length, ratio
    # of coding nucleotides) used to be computed here too, but nothing ever read
    # it, so it is no longer computed. The `genes_in_splits.add` call is what
    # populates the GenesInSplits instance.
    for contig in self.contigs_info:
        # contigs without any gene calls simply have nothing to go through
        gene_callers_ids_in_contig = gene_calls_in_contigs_dict.get(contig, ())

        for split_name in self.contig_name_to_splits[contig]:
            start = self.splits_info[split_name]['start']
            stop = self.splits_info[split_name]['end']

            # a gene is associated with a split even if one base of the gene spills into
            # the defined start or stop of a split, which means, split N, will include
            # genes A, B and C in this scenario:
            #
            # contig: (...)------[ gene A ]--------[ gene B ]----[gene C]---------[ gene D ]-----(...)
            #         (...)----------x---------------------------------------x--------------------(...)
            #                        ^ (split N start)                       ^ (split N stop)
            #                        |                                       |
            #                        |<-              split N              ->|
            for gene_callers_id in gene_callers_ids_in_contig:
                gene_call = gene_calls_dict[gene_callers_id]

                if gene_call['stop'] > start and gene_call['start'] < stop:
                    genes_in_splits.add(split_name, start, stop, gene_callers_id, gene_call['start'], gene_call['stop'])

    # entries for genes in splits table
    db_entries = [tuple([entry_id] + [genes_in_splits.splits_to_prots[entry_id][h] for h in t.genes_in_splits_table_structure[1:]])
                  for entry_id in genes_in_splits.splits_to_prots]

    # open connection, push the entries, and disconnect even if the insert fails
    database = db.DB(self.db_path, utils.get_required_version_for_db(self.db_path))
    try:
        database._exec_many('''INSERT INTO %s VALUES (?,?,?,?,?,?)''' % t.genes_in_splits_table_name, db_entries)
    finally:
        database.disconnect()
def use_external_gene_calls_to_populate_genes_in_contigs_table(self, input_file_path, gene_calls_dict=None, ignore_internal_stop_codons=False):
    """Add genes to the contigs database.

    Either provide an `input_file_path` for external gene calls, or provide an
    external gene calls dictionary. The format should follow this:

        {
          "1": {
              "contig": "contig_name",
              "start": 20,
              "stop": 1544,
              "direction": "f",
              "partial": 0,
              "source": "source_name",
              "version": "unknown"
          },

          "2": {
            (...)
          },

        (...)
        }

    If you provide a `gene_calls_dict`, they will be APPENDED to the database. So you
    need to make sure gene caller ids in your dict does not overlap with the ones in
    the database. An empty dictionary only produces a notice and an early return.

    Gene calls flagged as partial get an empty amino acid sequence. All others are
    translated from the contig sequences, which are read from `self.contigs_fasta`
    when it is set, and from the contig sequences table of the database otherwise.

    Raises ConfigError when `gene_calls_dict` is not a dict, when both or neither of
    the two inputs are given, when the input file yields no gene calls, when a gene
    call refers to an unknown contig, or when a translated sequence contains an
    internal stop codon and `ignore_internal_stop_codons` is False. In the last two
    cases the file at `self.db_path` is removed before the error is raised.
    """

    # by default we assume that this is a pristine run. but if the user sends a dictionary
    # of gene calls, they are appended to the database instead
    append_to_the_db = False

    gene_calls_found = False
    # let's do a rigorous check whether the user provided a gene_calls_dict.
    if (gene_calls_dict is not None and gene_calls_dict is not False):
        if not isinstance(gene_calls_dict, dict):
            raise ConfigError("'Use external gene calls' function received a non-empty gene_calls_dict object,\
                                but it is of type '%s', and not '%s'" % (type(gene_calls_dict), type({})))

        # congrats, we have a dict.
        gene_calls_found = True

        if not len(gene_calls_dict):
            # but it is empty ... silly user.
            self.run.info_single("'Use external gene calls' function found an empty gene calls dict, returning\
                                  prematurely and assuming you know what's up. If you don't, stop here and try to\
                                  identify what decisions you've made might have led you to this weird point your\
                                  workflow (or 'life', totally up to you and your mood, but anvi'o thinks you've\
                                  done great so far.", nl_before=1, nl_after=1)
            return

    if (not input_file_path and not gene_calls_found) or (input_file_path and gene_calls_found):
        raise ConfigError("You must provide either an input file, or an gene calls dict to process external\
                           gene calls. You called `use_external_gene_calls_to_populate_genes_in_contigs_table`\
                           with wrong parameters.")

    Table.__init__(self, self.db_path, anvio.__contigs__version__, self.run, self.progress, simple=True)

    # take care of gene calls dict
    if not gene_calls_found:
        gene_calls_dict = utils.get_TAB_delimited_file_as_dictionary(input_file_path,
                                                                     expected_fields=t.genes_in_contigs_table_structure,
                                                                     only_expected_fields=True,
                                                                     column_mapping=[int, str, int, int, str, int, str, str])

        if not len(gene_calls_dict):
            raise ConfigError("You provided an external gene calls file, but it returned zero gene calls. Assuming that\
                               this is an error, anvi'o will stop here and complain. If this is not an error and you\
                               in fact expected this, the proper way of doing this is to use `--skip-gene-calls` flag,\
                               instead of providing an emtpy external gene calls file. You don't agree? You need this\
                               for some weird step for you weird pipeline? Let us know, and we will consider changing\
                               this.")

        self.run.info("External gene calls", "%d gene calls recovered and will be processed." % len(gene_calls_dict))
    else:
        # FIXME: we need to make sure the gene caller ids in the incoming directory is not going to
        #        overwrite an existing gene call. Something like this would have returned the
        #        current max, which could be cross-checked with what's in the dict:
        #
        #            contigs_db = ContigsDatabase(self.db_path)
        #            next_id = contigs_db.db.get_max_value_in_column('genes_in_contigs', 'gene_callers_id') + 1
        #            contigs_db.disconnect()
        append_to_the_db = True

    # recover amino acid sequences. during this operation we are going to have to read all contig sequences
    # into the damn memory. anvi'o is doing a pretty bad job with memory management :(
    amino_acid_sequences = {}

    contig_sequences = {}
    if self.contigs_fasta:
        fasta = u.SequenceSource(self.contigs_fasta)
        while next(fasta):
            contig_sequences[fasta.id] = {'sequence': fasta.seq}
        fasta.close()
    else:
        # release the connection as soon as the table is in memory, so it is not left
        # open (in particular not while the database file is removed in the error
        # paths below)
        database = db.DB(self.db_path, utils.get_required_version_for_db(self.db_path))
        try:
            contig_sequences = database.get_table_as_dict(t.contig_sequences_table_name)
        finally:
            database.disconnect()

    num_genes_with_internal_stops = 0
    number_of_impartial_gene_calls = 0
    for gene_callers_id in gene_calls_dict:
        gene_call = gene_calls_dict[gene_callers_id]
        contig_name = gene_call['contig']

        if contig_name not in contig_sequences:
            # remove the partial contigs database so things don't get screwed later
            os.remove(self.db_path)
            raise ConfigError("You are in big trouble :( The contig name '%s' in your external gene callers file\
                                does not appear to be in the contigs FASTA file. How did this happen?" % contig_name)

        if gene_call['partial']:
            # no translation for partial gene calls
            amino_acid_sequences[gene_callers_id] = ''
            number_of_impartial_gene_calls += 1
            continue

        sequence = contig_sequences[contig_name]['sequence'][gene_call['start']:gene_call['stop']]
        if gene_call['direction'] == 'r':
            sequence = utils.rev_comp(sequence)

        amino_acid_sequence = utils.get_DNA_sequence_translated(sequence, gene_callers_id)

        # check if there are any internal stops:
        if '*' in amino_acid_sequence:
            if ignore_internal_stop_codons:
                amino_acid_sequence = amino_acid_sequence.replace('*', 'X')
                num_genes_with_internal_stops += 1
            else:
                os.remove(self.db_path)
                raise ConfigError("Oops. Anvi'o run into an amino acid seqeunce (that corresponds to the gene callers id '%s')\
                                   which had an internal stop codon :/ This usually indicates that your external gene calls\
                                   have problems. If you still want to continue, you can ask anvi'o to ignore internal stop\
                                   codons on your own risk. It will probably look very ugly on your screen, but here is the\
                                   DNA sequence for that gene in case you don't trust anvi'o (which only would be fair since\
                                   anvi'o does not trust you either): %s" % (str(gene_callers_id), sequence))

        amino_acid_sequences[gene_callers_id] = amino_acid_sequence

    # populate genes_in_contigs, and gene_amino_acid_sequences table in contigs db.
    self.populate_genes_in_contigs_table(gene_calls_dict, amino_acid_sequences, append_to_the_db=append_to_the_db)

    if num_genes_with_internal_stops:
        percent_genes_with_internal_stops = num_genes_with_internal_stops * 100.0 / len(gene_calls_dict)
        self.run.warning("Please read this carefully: Your external gene calls contained open reading frames with internal\
                          stop codons, and you asked anvi'o to ignore those. Anvi'o replaced internal stop codons with 'X'\
                          characters, and stored them in the contigs database that way. %d of your genes, which corresponded\
                          to %.2f%% of the total %d genes, had internal stop codons. We hope you are happy." % \
                                    (num_genes_with_internal_stops, percent_genes_with_internal_stops, len(gene_calls_dict)))

    if number_of_impartial_gene_calls:
        self.run.warning('%d of your %d gene calls were impartial, hence the translated amino acid sequences for those\
                          were not stored in the database.' % (number_of_impartial_gene_calls, len(gene_calls_dict)))
def use_external_gene_calls_to_populate_genes_in_contigs_table(
        self,
        input_file_path,
        gene_calls_dict=None,
        ignore_internal_stop_codons=False,
        skip_amino_acid_sequences=False):
    """Store externally defined gene calls in the contigs database.

    The gene calls come either from the TAB-delimited file at
    `input_file_path`, or from `gene_calls_dict`; exactly one of the two must
    be provided. The dictionary form maps gene caller ids to entries such as:

        {
          "1": {
              "contig": "contig_name",
              "start": 20,
              "stop": 1544,
              "direction": "f",
              "partial": 0,
              "source": "source_name",
              "version": "unknown"
          },

          "2": {
            (...)
          },

        (...)
        }

    Entries of a `gene_calls_dict` are APPENDED to the database, hence the
    gene caller ids in it should not overlap with the ones already in the
    database. If the dictionary is empty, a notice is printed and nothing
    else happens.

    Unless `skip_amino_acid_sequences` is True, translated sequences are
    recovered for the gene calls; otherwise each gene call is paired with an
    empty sequence (useful for genes that are not translated, such as
    ribosomal RNA genes).

    Raises ConfigError if `gene_calls_dict` is not a dict, if both or neither
    of the inputs are provided, or if the input file contains no gene calls.
    """

    # None and False both mean 'no gene calls dict was sent'
    gene_calls_found = not (gene_calls_dict is None or gene_calls_dict is False)

    if gene_calls_found and not isinstance(gene_calls_dict, dict):
        raise ConfigError(
            "'Use external gene calls' function received a non-empty gene_calls_dict object,\
              but it is of type '%s', and not '%s'" % (type(gene_calls_dict), dict))

    if gene_calls_found and not len(gene_calls_dict):
        # we were given a dict, but there is nothing in it
        self.run.info_single(
            "'Use external gene calls' function found an empty gene calls dict, returning\
              prematurely and assuming you know what's up. If you don't, stop here and try to\
              identify what decisions you've made might have led you to this weird point your\
              workflow (or 'life', totally up to you and your mood, but anvi'o thinks you've\
              done great so far.",
            nl_before=1,
            nl_after=1)
        return

    # one, and only one, source of gene calls is acceptable
    file_path_given = bool(input_file_path)
    if file_path_given == gene_calls_found:
        raise ConfigError(
            "You must provide either an input file, or an gene calls dict to process external\
              gene calls. You called `use_external_gene_calls_to_populate_genes_in_contigs_table`\
              with wrong parameters.")

    Table.__init__(self, self.db_path, anvio.__contigs__version__, self.run, self.progress, simple=True)

    # gene calls sent as a dict are appended; gene calls read from a file are a pristine run
    append_to_the_db = gene_calls_found

    if not gene_calls_found:
        gene_calls_dict = utils.get_TAB_delimited_file_as_dictionary(
            input_file_path,
            expected_fields=t.genes_in_contigs_table_structure,
            only_expected_fields=True,
            column_mapping=[int, str, int, int, str, int, str, str])

        if not len(gene_calls_dict):
            raise ConfigError(
                "You provided an external gene calls file, but it returned zero gene calls. Assuming that\
                  this is an error, anvi'o will stop here and complain. If this is not an error and you\
                  in fact expected this, the proper way of doing this is to use `--skip-gene-calls` flag,\
                  instead of providing an emtpy external gene calls file. You don't agree? You need this\
                  for some weird step for you weird pipeline? Let us know, and we will consider changing\
                  this.")

        self.run.info(
            "External gene calls",
            "%d gene calls recovered and will be processed." % len(gene_calls_dict))

    # FIXME: when appending, we need to make sure the gene caller ids in the incoming directory
    #        is not going to overwrite an existing gene call. Something like this would have
    #        returned the current max, which could be cross-checked with what's in the dict:
    #
    #            contigs_db = ContigsDatabase(self.db_path)
    #            next_id = contigs_db.db.get_max_value_in_column('genes_in_contigs', 'gene_callers_id') + 1
    #            contigs_db.disconnect()

    # recover amino acid sequences, or pair every gene call with a blank one
    if skip_amino_acid_sequences:
        amino_acid_sequences = {gene_callers_id: '' for gene_callers_id in gene_calls_dict}
    else:
        amino_acid_sequences = self.get_amino_acid_sequences_for_genes_in_gene_calls_dict(
            gene_calls_dict, ignore_internal_stop_codons=ignore_internal_stop_codons)

    # fill in the genes_in_contigs and gene_amino_acid_sequences tables of the contigs db
    self.populate_genes_in_contigs_table(gene_calls_dict,
                                         amino_acid_sequences,
                                         append_to_the_db=append_to_the_db)
def use_external_gene_calls_to_populate_genes_in_contigs_table(
        self,
        input_file_path,
        gene_calls_dict=None,
        ignore_internal_stop_codons=False):
    """Add genes to the contigs database.

    Either provide an `input_file_path` for external gene calls, or provide an
    external gene calls dictionary. The format should follow this:

        {
          "1": {
              "contig": "contig_name",
              "start": 20,
              "stop": 1544,
              "direction": "f",
              "partial": 0,
              "source": "source_name",
              "version": "unknown"
          },

          "2": {
            (...)
          },

        (...)
        }

    If you provide a `gene_calls_dict`, they will be APPENDED to the database. So you
    need to make sure gene caller ids in your dict does not overlap with the ones in
    the database. An empty dictionary only produces a notice and an early return.

    Gene calls flagged as partial get an empty amino acid sequence. All others are
    translated from the contig sequences, which are read from `self.contigs_fasta`
    when it is set, and from the contig sequences table of the database otherwise.

    Raises ConfigError when `gene_calls_dict` is not a dict, when both or neither of
    the two inputs are given, when the input file yields no gene calls, when a gene
    call refers to an unknown contig, or when a translated sequence contains an
    internal stop codon and `ignore_internal_stop_codons` is False. In the last two
    cases the file at `self.db_path` is removed before the error is raised.
    """

    # by default we assume that this is a pristine run. but if the user sends a dictionary
    # of gene calls, they are appended to the database instead
    append_to_the_db = False

    gene_calls_found = False
    # let's do a rigorous check whether the user provided a gene_calls_dict.
    if (gene_calls_dict is not None and gene_calls_dict is not False):
        if not isinstance(gene_calls_dict, dict):
            raise ConfigError(
                "'Use external gene calls' function received a non-empty gene_calls_dict object,\
                  but it is of type '%s', and not '%s'" % (type(gene_calls_dict), type({})))

        # congrats, we have a dict.
        gene_calls_found = True

        if not len(gene_calls_dict):
            # but it is empty ... silly user.
            self.run.info_single(
                "'Use external gene calls' function found an empty gene calls dict, returning\
                  prematurely and assuming you know what's up. If you don't, stop here and try to\
                  identify what decisions you've made might have led you to this weird point your\
                  workflow (or 'life', totally up to you and your mood, but anvi'o thinks you've\
                  done great so far.",
                nl_before=1,
                nl_after=1)
            return

    if (not input_file_path and not gene_calls_found) or (input_file_path and gene_calls_found):
        raise ConfigError(
            "You must provide either an input file, or an gene calls dict to process external\
              gene calls. You called `use_external_gene_calls_to_populate_genes_in_contigs_table`\
              with wrong parameters.")

    Table.__init__(self, self.db_path, anvio.__contigs__version__, self.run, self.progress, simple=True)

    # take care of gene calls dict
    if not gene_calls_found:
        gene_calls_dict = utils.get_TAB_delimited_file_as_dictionary(
            input_file_path,
            expected_fields=t.genes_in_contigs_table_structure,
            only_expected_fields=True,
            column_mapping=[int, str, int, int, str, int, str, str])

        if not len(gene_calls_dict):
            raise ConfigError(
                "You provided an external gene calls file, but it returned zero gene calls. Assuming that\
                  this is an error, anvi'o will stop here and complain. If this is not an error and you\
                  in fact expected this, the proper way of doing this is to use `--skip-gene-calls` flag,\
                  instead of providing an emtpy external gene calls file. You don't agree? You need this\
                  for some weird step for you weird pipeline? Let us know, and we will consider changing\
                  this.")

        self.run.info(
            "External gene calls",
            "%d gene calls recovered and will be processed." % len(gene_calls_dict))
    else:
        # FIXME: we need to make sure the gene caller ids in the incoming directory is not going to
        #        overwrite an existing gene call. Something like this would have returned the
        #        current max, which could be cross-checked with what's in the dict:
        #
        #            contigs_db = ContigsDatabase(self.db_path)
        #            next_id = contigs_db.db.get_max_value_in_column('genes_in_contigs', 'gene_callers_id') + 1
        #            contigs_db.disconnect()
        append_to_the_db = True

    # recover amino acid sequences. during this operation we are going to have to read all contig sequences
    # into the damn memory. anvi'o is doing a pretty bad job with memory management :(
    amino_acid_sequences = {}

    contig_sequences = {}
    if self.contigs_fasta:
        fasta = u.SequenceSource(self.contigs_fasta)
        while next(fasta):
            contig_sequences[fasta.id] = {'sequence': fasta.seq}
        fasta.close()
    else:
        # release the connection as soon as the table is in memory, so it is not left
        # open (in particular not while the database file is removed in the error
        # paths below)
        database = db.DB(self.db_path, utils.get_required_version_for_db(self.db_path))
        try:
            contig_sequences = database.get_table_as_dict(t.contig_sequences_table_name)
        finally:
            database.disconnect()

    num_genes_with_internal_stops = 0
    number_of_impartial_gene_calls = 0
    for gene_callers_id in gene_calls_dict:
        gene_call = gene_calls_dict[gene_callers_id]
        contig_name = gene_call['contig']

        if contig_name not in contig_sequences:
            # remove the partial contigs database so things don't get screwed later
            os.remove(self.db_path)
            raise ConfigError(
                "You are in big trouble :( The contig name '%s' in your external gene callers file\
                  does not appear to be in the contigs FASTA file. How did this happen?" % contig_name)

        if gene_call['partial']:
            # no translation for partial gene calls
            amino_acid_sequences[gene_callers_id] = ''
            number_of_impartial_gene_calls += 1
            continue

        sequence = contig_sequences[contig_name]['sequence'][gene_call['start']:gene_call['stop']]
        if gene_call['direction'] == 'r':
            sequence = utils.rev_comp(sequence)

        amino_acid_sequence = utils.get_DNA_sequence_translated(sequence, gene_callers_id)

        # check if there are any internal stops:
        if '*' in amino_acid_sequence:
            if ignore_internal_stop_codons:
                amino_acid_sequence = amino_acid_sequence.replace('*', 'X')
                num_genes_with_internal_stops += 1
            else:
                os.remove(self.db_path)
                raise ConfigError(
                    "Oops. Anvi'o run into an amino acid seqeunce (that corresponds to the gene callers id '%s')\
                      which had an internal stop codon :/ This usually indicates that your external gene calls\
                      have problems. If you still want to continue, you can ask anvi'o to ignore internal stop\
                      codons on your own risk. It will probably look very ugly on your screen, but here is the\
                      DNA sequence for that gene in case you don't trust anvi'o (which only would be fair since\
                      anvi'o does not trust you either): %s" % (str(gene_callers_id), sequence))

        amino_acid_sequences[gene_callers_id] = amino_acid_sequence

    # populate genes_in_contigs, and gene_amino_acid_sequences table in contigs db.
    self.populate_genes_in_contigs_table(gene_calls_dict,
                                         amino_acid_sequences,
                                         append_to_the_db=append_to_the_db)

    if num_genes_with_internal_stops:
        percent_genes_with_internal_stops = num_genes_with_internal_stops * 100.0 / len(gene_calls_dict)
        self.run.warning("Please read this carefully: Your external gene calls contained open reading frames with internal\
                          stop codons, and you asked anvi'o to ignore those. Anvi'o replaced internal stop codons with 'X'\
                          characters, and stored them in the contigs database that way. %d of your genes, which corresponded\
                          to %.2f%% of the total %d genes, had internal stop codons. We hope you are happy." % \
                                    (num_genes_with_internal_stops, percent_genes_with_internal_stops, len(gene_calls_dict)))

    if number_of_impartial_gene_calls:
        self.run.warning(
            '%d of your %d gene calls were impartial, hence the translated amino acid sequences for those\
              were not stored in the database.' % (number_of_impartial_gene_calls, len(gene_calls_dict)))
def populate_genes_in_splits_tables(self):
    """Summarize genes per split and store the results in the contigs database.

    For every split of every contig, the gene calls that overlap the split
    are registered in a `GenesInSplits` instance, and a small summary (number
    of genes, average gene length, ratio of coding nucleotides) is put
    together. Both the summaries and the gene-to-split entries are then
    inserted into their respective tables.
    """
    utils.is_contigs_db(self.db_path)

    Table.__init__(self, self.db_path, anvio.__contigs__version__, run, progress)
    self.init_gene_calls_dict()

    genes_in_splits = GenesInSplits()

    # contig name -> set of gene caller ids, for fast access to the genes of a contig
    genes_per_contig = {}
    for gene_id, gene_call in self.gene_calls_dict.items():
        genes_per_contig.setdefault(gene_call['contig'], set()).add(gene_id)

    # at this point only contigs with at least one gene call are in the dictionary
    num_contigs_with_genes = len(genes_per_contig)
    num_contigs = len(self.contigs_info)
    run.info('Contigs with at least one gene call',
             '%d of %d (%.1f%%)' % (num_contigs_with_genes,
                                    num_contigs,
                                    num_contigs_with_genes * 100.0 / num_contigs))

    # contigs without any gene calls get an empty set so lookups below never fail
    for contig in self.contigs_info:
        if contig not in genes_per_contig:
            genes_per_contig[contig] = set([])

    splits_dict = {}
    for contig in self.contigs_info:
        for split_name in self.contig_name_to_splits[contig]:
            split_info = self.splits_info[split_name]
            start = split_info['start']
            stop = split_info['end']

            # a gene belongs to a split even if only a single base of it spills
            # over the start or the stop of the split. in the scenario below,
            # split N includes genes A, B and C:
            #
            # contig: (...)------[ gene A ]--------[ gene B ]----[gene C]---------[ gene D ]-----(...)
            #         (...)----------x---------------------------------------x--------------------(...)
            #                        ^ (split N start)                       ^ (split N stop)
            #                        |                                       |
            #                        |<-              split N              ->|
            #
            # besides collecting the start/stop positions for the summary, this
            # loop is where the GenesInSplits instance gets populated.
            overlapping_genes = []
            for gene_id in genes_per_contig[contig]:
                gene_call = self.gene_calls_dict[gene_id]

                if gene_call['stop'] <= start or gene_call['start'] >= stop:
                    continue

                overlapping_genes.append((gene_call['start'], gene_call['stop']))
                genes_in_splits.add(split_name, start, stop, gene_id, gene_call['start'], gene_call['stop'])

            # for the coding versus non-coding ratio, only the portion of a
            # gene that actually falls within the split is counted
            total_coding_nts = sum(min(gene_stop, stop) - max(gene_start, start)
                                   for gene_start, gene_stop in overlapping_genes)

            if len(overlapping_genes):
                avg_gene_length = numpy.mean([gene_stop - gene_start for gene_start, gene_stop in overlapping_genes])
            else:
                avg_gene_length = 0.0

            splits_dict[split_name] = {'num_genes': len(overlapping_genes),
                                       'avg_gene_length': avg_gene_length,
                                       'ratio_coding': total_coding_nts * 1.0 / (stop - start)}

    # open connection
    database = db.DB(self.db_path, utils.get_required_version_for_db(self.db_path))

    # raw entries for the splits summary table
    summary_fields = t.genes_in_splits_summary_table_structure[1:]
    db_entries = []
    for split in splits_dict:
        db_entries.append(tuple([split] + [splits_dict[split][field] for field in summary_fields]))
    database._exec_many('''INSERT INTO %s VALUES (?,?,?,?)''' % t.genes_in_splits_summary_table_name, db_entries)

    # entries for the genes in splits table
    entry_fields = t.genes_in_splits_table_structure[1:]
    db_entries = []
    for entry_id in genes_in_splits.splits_to_prots:
        entry = genes_in_splits.splits_to_prots[entry_id]
        db_entries.append(tuple([entry_id] + [entry[field] for field in entry_fields]))
    database._exec_many('''INSERT INTO %s VALUES (?,?,?,?,?,?)''' % t.genes_in_splits_table_name, db_entries)

    # disconnect
    database.disconnect()
def use_external_gene_calls_to_populate_genes_in_contigs_table(
        self,
        input_file_path,
        gene_calls_dict=None,
        ignore_internal_stop_codons=False,
        skip_predict_frame=False,
        skip_amino_acid_sequences=False):
    """Add genes to the contigs database.

    Primary input is either an `input_file_path` for external gene calls, or an
    external `gene_calls_dict` dictionary object. Exactly one of the two must be
    provided.

    Parameters
    ==========
    input_file_path : str
        Path to file with one of the following structures.

        Option 1:
            gene_callers_id  contig          start  stop  direction  partial  call_type  source    version
            0                CACHJY01_00016  0      693   r          1        1          prodigal  v2.6.3
            1                CACHJY01_00016  711    1140  r          0        1          prodigal  v2.6.3

        Option 2:
            gene_callers_id  contig          start  stop  direction  partial  call_type  source    version  aa_sequence
            0                CACHJY01_00016  0      693   r          1        1          prodigal  v2.6.3   MSKKIYFTEYSKVNRLQTISNFTGSA
            1                CACHJY01_00016  711    1140  r          0        1          prodigal  v2.6.3   MVNVDYHGLIAGAGSGKTKVLTSRIAHIIK

    gene_calls_dict : dict, None
        Alternative to `input_file_path`. If provided, entries will be APPENDED to the
        database. So you need to make sure gene caller ids in your dict does not overlap
        with the ones in the database. Should look like:

            {
                "1": {
                    "contig": "contig_name",
                    "start": 20,
                    "stop": 1544,
                    "direction": "f",
                    "partial": 0,
                    "call_type": 1,
                    "source": "source_name",
                    "version": "unknown",
                    "aa_sequence": "MSKKIYFTEYSKVNRLQTISNFTGSA"
                },

                "2": {
                  (...)
                },

                (...)
            }

        All entries are required except "aa_sequence", which is optional. If provided, it
        should be present for ALL entries, even if it is an empty string. Its presence
        will be used to populate `gene_amino_acid_sequences`.

    ignore_internal_stop_codons : bool, False
        If False, ConfigError will be raised if a stop codon is found inside any gene. If
        True, this is suppressed and the stop codon is replaced with the character `X`.

    skip_predict_frame : bool, False
        If True, ConfigError will be raised if a gene is not divisible by 3. If False,
        anvi'o predicts the most likely open reading frame and trims the start/stop of
        the gene call to reflect this change so that the gene *is* divisible by 3. This
        flag allows the retention of amino acid sequences even if genes are not divisible
        by 3, or when it is flagged as partial.

    skip_amino_acid_sequences : bool, False
        Should the gene_amino_acid_sequences table be populated? This may be useful if
        genes that are not translated are being added, such as ribosomal RNA genes, etc.

    Returns
    =======
    None
        Returns early (after printing a notice) when an empty `gene_calls_dict` is given.

    Raises
    ======
    ConfigError
        If `gene_calls_dict` is given but is not a dict; if only some of its entries
        carry 'aa_sequence'; if neither or both of `input_file_path` and
        `gene_calls_dict` are provided; or if the input file yields zero gene calls.
    """

    # by default we assume that this is a pristine run. but if the user sends a dictionary,
    # its entries are appended to whatever is already in the database.
    append_to_the_db = False
    gene_calls_found = False

    # let's do a rigorous check whether the user provided a gene_calls_dict.
    if gene_calls_dict is not None and gene_calls_dict is not False:
        if not isinstance(gene_calls_dict, dict):
            # adjacent string literals are used here (rather than a backslash continuation
            # inside the literal) so no stray backslash / indentation ends up in the message.
            raise ConfigError(
                "'Use external gene calls' function received a non-empty gene_calls_dict object, "
                "but it is of type '%s', and not '%s'" % (type(gene_calls_dict), dict))

        # congrats, we have a dict.
        gene_calls_found = True

        # 'aa_sequence' must be present in either none or all of the entries
        num_gene_calls = len(gene_calls_dict)
        num_with_aa_seqs = sum('aa_sequence' in gene_call for gene_call in gene_calls_dict.values())

        if num_with_aa_seqs not in (0, num_gene_calls):
            raise ConfigError(
                "The gene_calls_dict passed to use_external_gene_calls_to_populate_genes_in_contigs_table "
                "has %d entries with 'aa_sequence' and %d without. Either 0 or all (%d) should have "
                "'aa_sequence'" % (num_with_aa_seqs, num_gene_calls - num_with_aa_seqs, num_gene_calls))

        if not gene_calls_dict:
            # but it is empty ... silly user.
            self.run.info_single(
                "'Use external gene calls' function found an empty gene calls dict, returning "
                "prematurely and assuming you know what's up. If you don't, stop here and try to "
                "identify what decisions you've made might have led you to this weird point your "
                "workflow (or 'life', totally up to you and your mood, but anvi'o thinks you've "
                "done great so far.",
                nl_before=1,
                nl_after=1)
            return

    # exactly one of the two input sources must be in play
    if (not input_file_path and not gene_calls_found) or (input_file_path and gene_calls_found):
        raise ConfigError(
            "You must provide either an input file, or an gene calls dict to process external "
            "gene calls. You called `use_external_gene_calls_to_populate_genes_in_contigs_table` "
            "with wrong parameters.")

    Table.__init__(self, self.db_path, anvio.__contigs__version__, self.run, self.progress, simple=True)

    # take care of gene calls dict
    if not gene_calls_found:
        expected_fields = t.genes_in_contigs_table_structure
        column_mapping = [int, str, int, int, str, int, int, str, str]

        # the optional 'aa_sequence' column extends both the expected fields and the mapping
        if 'aa_sequence' in utils.get_columns_of_TAB_delim_file(input_file_path):
            expected_fields = t.genes_in_contigs_table_structure + ['aa_sequence']
            # str(None) is 'None', amazingly
            column_mapping.append(lambda x: '' if x is None else str(x))

        gene_calls_dict = utils.get_TAB_delimited_file_as_dictionary(
            input_file_path,
            expected_fields=expected_fields,
            only_expected_fields=True,
            column_mapping=column_mapping)

        if not gene_calls_dict:
            raise ConfigError(
                "You provided an external gene calls file, but it returned zero gene calls. Assuming that "
                "this is an error, anvi'o will stop here and complain. If this is not an error and you "
                "in fact expected this, the proper way of doing this is to use `--skip-gene-calls` flag, "
                "instead of providing an emtpy external gene calls file. You don't agree? You need this "
                "for some weird step for you weird pipeline? Let us know, and we will consider changing "
                "this.")

        self.run.info("External gene calls",
                      "%d gene calls recovered and will be processed." % len(gene_calls_dict))
    else:
        # FIXME: we need to make sure the gene caller ids in the incoming dictionary is not going to
        #        overwrite an existing gene call. Something like this would have returned the
        #        current max, which could be cross-checked with what's in the dict:
        #
        #            contigs_db = ContigsDatabase(self.db_path)
        #            next_id = contigs_db.db.get_max_value_in_column('genes_in_contigs', 'gene_callers_id') + 1
        #            contigs_db.disconnect()
        append_to_the_db = True

    # recover amino acid sequences or create a blank dictionary
    if skip_amino_acid_sequences:
        amino_acid_sequences = {gene_callers_id: '' for gene_callers_id in gene_calls_dict}
    else:
        gene_calls_dict, amino_acid_sequences = self.get_amino_acid_sequences_for_genes_in_gene_calls_dict(
            gene_calls_dict,
            ignore_internal_stop_codons=ignore_internal_stop_codons,
            skip_predict_frame=skip_predict_frame,
        )

    # populate genes_in_contigs, and gene_amino_acid_sequences table in contigs db.
    self.populate_genes_in_contigs_table(gene_calls_dict,
                                         amino_acid_sequences,
                                         append_to_the_db=append_to_the_db)