def __init__(self, db_path, run=run, progress=progress):
    """Set up table access for `db_path` and prime ID counters for the collections tables.

    Parameters
    ==========
    db_path
        Path of the database this instance works on.
    run, progress
        Reporting objects, forwarded to `Table.__init__` untouched.
    """
    self.db_path = db_path

    required_version = utils.get_required_version_for_db(db_path)
    Table.__init__(self, self.db_path, required_version, run, progress)

    # register every collections table so unique IDs are at hand for new entries
    for table_name in (t.collections_bins_info_table_name,
                       t.collections_contigs_table_name,
                       t.collections_splits_table_name):
        self.set_next_available_id(table_name)
def __init__(self, db_path, run=run, progress=progress):
    """Prepare an empty entry buffer for the variable codons table of `db_path`.

    Parameters
    ==========
    db_path
        Path of the database this instance works on.
    run, progress
        Reporting objects; stored on the instance and passed to `Table.__init__`.
    """
    self.db_path, self.run, self.progress = db_path, run, progress

    required_version = utils.get_required_version_for_db(db_path)
    Table.__init__(self, self.db_path, required_version, run=self.run, progress=self.progress)

    # nothing has been collected yet
    self.db_entries = []
    self.num_entries = 0

    self.set_next_available_id(t.variable_codons_table_name)
def __init__(self, db_path):
    """Initialize access to the states table of the database at `db_path`.

    Parameters
    ==========
    db_path
        Path of the database. Its type, as reported by `utils.get_db_type`,
        must be one of 'profile', 'pan', 'structure', or 'genes'.

    Raises
    ======
    ConfigError
        If the database type is not one of the types listed above.
    """
    self.db_path = db_path
    self.states = {}

    if utils.get_db_type(self.db_path) not in ['profile', 'pan', 'structure', 'genes']:
        # the message carries a '%s' placeholder, so the path must actually be
        # interpolated; otherwise the user sees a literal '%s'
        raise ConfigError("Your database '%s' does not seem to have states table, which anvi'o tries to "
                          "access." % self.db_path)

    # `run` and `progress` are the module-level objects here, this constructor takes none
    Table.__init__(self, self.db_path, utils.get_required_version_for_db(db_path), run, progress)

    self.init()
def call_genes_and_populate_genes_in_contigs_table(self, gene_caller='prodigal'):
    """Run `gene_caller`, validate what it reports, and store the result in the contigs db.

    Parameters
    ==========
    gene_caller : str, 'prodigal'
        Name of the gene caller, handed to `self.run_gene_caller`.
    """
    Table.__init__(self,
                   self.db_path,
                   anvio.__contigs__version__,
                   self.run,
                   self.progress,
                   simple=True)

    # the caller hands back gene calls along with their amino acid sequences
    calls, aa_sequences = self.run_gene_caller(gene_caller)

    # refuse to go any further with an improper gene calls dict
    self.check_gene_calls_dict(calls)

    # fill genes_in_contigs and gene_amino_acid_sequences tables in the contigs db
    self.populate_genes_in_contigs_table(calls, aa_sequences)
def __init__(self, db_path, run=run, progress=progress):
    """Set up access to the gene function calls table of the contigs database at `db_path`.

    Parameters
    ==========
    db_path
        Path of the contigs database.
    run, progress
        Reporting objects; stored on the instance and passed to `Table.__init__`.
    """
    self.db_path, self.run, self.progress = db_path, run, progress

    # database type check; its return value is not used here
    utils.is_contigs_db(self.db_path)

    Table.__init__(self, self.db_path, anvio.__contigs__version__, run=run, progress=progress)

    self.set_next_available_id(t.gene_function_calls_table_name)
def __init__(self, db_path, run=run, progress=progress):
    """Initialize both parent table classes for the contigs database at `db_path`.

    Parameters
    ==========
    db_path
        Path of the contigs database.
    run, progress
        Reporting objects; stored on the instance and passed to both parent
        initializers.
    """
    self.db_path, self.run, self.progress = db_path, run, progress

    # database type check; its return value is not used here
    utils.is_contigs_db(self.db_path)

    Table.__init__(self, self.db_path, anvio.__contigs__version__, self.run, self.progress)
    TaxonNamesTable.__init__(self, self.db_path, self.run, self.progress)

    # tracker for genes that occur in splits; it is what generates the
    # corresponding table in the contigs database
    self.genes_in_splits = GenesInSplits()
def __init__(self, db_path, run=run, progress=progress):
    """Set up access to the gene clusters table of the pan database at `db_path`.

    Parameters
    ==========
    db_path
        Path of the pan database.
    run, progress
        Reporting objects; stored on the instance and passed to `Table.__init__`.
    """
    self.db_path = db_path

    # database type check; its return value is not used here
    utils.is_pan_db(db_path)

    self.run, self.progress = run, progress

    Table.__init__(self, self.db_path, anvio.__pan__version__, run=run, progress=progress)

    self.set_next_available_id(t.pan_gene_clusters_table_name)

    # entries accumulate here
    self.entries = []
def __init__(self, db_path, run=run, progress=progress):
    """Prepare an empty entry buffer for the variable nucleotides table of `db_path`.

    Parameters
    ==========
    db_path
        Path of the database this instance works on.
    run, progress
        Reporting objects; stored on the instance and passed to `Table.__init__`.
    """
    self.db_path, self.run, self.progress = db_path, run, progress

    required_version = utils.get_required_version_for_db(db_path)
    Table.__init__(self, self.db_path, required_version, run=self.run, progress=self.progress)

    # nothing has been collected yet
    self.db_entries = []
    self.num_entries = 0

    self.set_next_available_id(t.variable_nts_table_name)
def __init__(self, db_path):
    """Initialize access to the states table of the database at `db_path`.

    Parameters
    ==========
    db_path
        Path of the database. Its type, as reported by `utils.get_db_type`,
        must be one of 'profile', 'pan', 'structure', or 'genes'.

    Raises
    ======
    ConfigError
        If the database type is not one of the types listed above.
    """
    self.db_path = db_path
    self.states = {}

    db_type = utils.get_db_type(self.db_path)
    if db_type not in ['profile', 'pan', 'structure', 'genes']:
        # interpolate the path into the '%s' placeholder, which was previously
        # shown to the user verbatim
        raise ConfigError("Your database '%s' does not seem to have states table, which anvi'o tries to "
                          "access." % self.db_path)

    # `run` and `progress` are the module-level objects here, this constructor takes none
    Table.__init__(self, self.db_path, utils.get_required_version_for_db(db_path), run, progress)

    self.init()
def __init__(self, db_path, run=run, progress=progress, profile_db_path=False):
    """Set up access to the SCG taxonomy table of the contigs database at `db_path`.

    Parameters
    ==========
    db_path
        Path of the contigs database.
    run, progress
        Reporting objects; stored on the instance and passed to `Table.__init__`.
    profile_db_path : False
        Accepted by the signature, but not read anywhere in this method.
    """
    self.db_path, self.run, self.progress = db_path, run, progress

    # database type check; its return value is not used here
    utils.is_contigs_db(self.db_path)

    Table.__init__(self, self.db_path, anvio.__contigs__version__, self.run, self.progress)

    self.set_next_available_id(t.scg_taxonomy_table_name)
def __init__(self, db_path, run=run, progress=progress):
    """Initialize the table and start with an empty entry buffer.

    Parameters
    ==========
    db_path
        Path of the database this instance works on.
    run, progress
        Reporting objects; stored on the instance and passed to `Table.__init__`.
    """
    self.db_path, self.run, self.progress = db_path, run, progress

    required_version = utils.get_required_version_for_db(db_path)
    Table.__init__(self, self.db_path, required_version, run=self.run, progress=self.progress)

    # nothing has been collected yet
    self.db_entries = []
    self.num_entries = 0

    # NOTE(review): presumably the number of buffered entries that triggers a
    # write to the database -- the code consuming this value is not in view
    self.max_num_entries_in_storage_buffer = 15000
def __init__(self, db_path, run=run, progress=progress):
    """Initialize the table, picking the entry count up from `self.get_num_entries()`.

    Parameters
    ==========
    db_path
        Path of the database this instance works on.
    run, progress
        Reporting objects; stored on the instance and passed to `Table.__init__`.
    """
    self.db_path, self.run, self.progress = db_path, run, progress

    required_version = utils.get_required_version_for_db(db_path)
    Table.__init__(self, self.db_path, required_version, run=self.run, progress=self.progress)

    self.num_entries = self.get_num_entries()
    self.db_entries = []

    # an instance should not let entries pile up in memory forever. the value
    # below is the limit for `self.db_entries`: once it holds more entries
    # than this, they are meant to be written to the database and the list
    # emptied, which saves a significant amount of memory
    self.max_num_entries_in_storage_buffer = 50000
def __init__(self, db_path, num_threads_to_use=1, run=run, progress=progress, initializing_for_deletion=False, just_do_it=False, hmm_program_to_use='hmmscan'):
    """Set up access to the HMM hits tables of the contigs database at `db_path`.

    Parameters
    ==========
    db_path
        Path of the contigs database.
    num_threads_to_use : int, 1
        Stored on the instance as is.
    run, progress
        Reporting objects, forwarded to `Table.__init__`.
    initializing_for_deletion : bool, False
        If True, the ID counters of the HMM hits tables are not set.
    just_do_it : bool, False
        Stored on the instance as is.
    hmm_program_to_use : str, 'hmmscan'
        Program to use; any falsy value falls back to 'hmmscan'. Checked
        with `filesnpaths.is_program_exists`.
    """
    self.num_threads_to_use = num_threads_to_use
    self.db_path = db_path
    self.just_do_it = just_do_it
    self.hmm_program = hmm_program_to_use or 'hmmscan'

    utils.is_contigs_db(self.db_path)
    filesnpaths.is_program_exists(self.hmm_program)

    # read the hash through a short-lived connection, and make sure that
    # connection is released instead of being left dangling
    contigs_db = db.DB(self.db_path, utils.get_required_version_for_db(self.db_path))
    try:
        self.contigs_db_hash = contigs_db.get_meta_value('contigs_db_hash')
    finally:
        contigs_db.disconnect()

    Table.__init__(self, self.db_path, anvio.__contigs__version__, run, progress)

    self.init_gene_calls_dict()

    if not len(self.gene_calls_dict):
        # no gene calls: warn, but carry on. which warning is shown depends on
        # whether gene calling was skipped when the database was generated
        if self.genes_are_called:
            self.run.warning("Tables in this contigs database that should contain gene calls are empty despite the fact that "
                             "you didn't skip the gene calling step while generating this contigs database. This probably means "
                             "that the gene caller did not find any genes among contigs. This is OK for now. But might explode "
                             "later. If it does explode and you decide to let us know about that problem, please remember to mention "
                             "this warning. By the way, this warning probably has been seen by like only 2 people on the planet. Who "
                             "works with contigs with no gene calls? A better implementation of anvi'o will unite researchers who "
                             "study weird stuff.")
        else:
            self.run.warning("It seems you have skipped gene calling step while generating your contigs database, and you have no "
                             "genes calls in tables that should contain gene calls. Anvi'o will let you go with this since some HMM "
                             "sources only operate on DNA sequences, and at this point it doesn't know which HMMs you wish to run. "
                             "If the lack of genes causes a problem, you will get another error message later probably :/")

    if not initializing_for_deletion:
        self.set_next_available_id(t.hmm_hits_table_name)
        self.set_next_available_id(t.hmm_hits_splits_table_name)
def call_genes_and_populate_genes_in_contigs_table(self, gene_caller='prodigal'):
    """Call genes with `gene_caller` and write them into the contigs database.

    Parameters
    ==========
    gene_caller : str, 'prodigal'
        Name of the gene caller, handed to `self.run_gene_caller`.
    """
    Table.__init__(self,
                   self.db_path,
                   anvio.__contigs__version__,
                   self.run,
                   self.progress,
                   simple=True)

    # gene calls and their amino acid sequences come back together
    called_genes, translated_sequences = self.run_gene_caller(gene_caller)

    # the returned gene calls dict has to be proper before it is stored
    self.check_gene_calls_dict(called_genes)

    # fill genes_in_contigs and gene_amino_acid_sequences tables in the contigs db
    self.populate_genes_in_contigs_table(called_genes, translated_sequences)
def __init__(self, db_path, num_threads_to_use=1, run=run, progress=progress):
    """Set up access to the HMM hits tables of the contigs database at `db_path`.

    Parameters
    ==========
    db_path
        Path of the contigs database.
    num_threads_to_use : int, 1
        Stored on the instance as is.
    run, progress
        Reporting objects, forwarded to `Table.__init__`.

    Raises
    ======
    ConfigError
        If `self.genes_are_called` is falsy, or if the gene calls dict is
        empty after `self.init_gene_calls_dict()`.
    """
    self.num_threads_to_use = num_threads_to_use
    self.db_path = db_path

    utils.is_contigs_db(self.db_path)

    Table.__init__(self, self.db_path, anvio.__contigs__version__, run, progress)

    # messages are built from adjacent literals: a backslash continuation
    # inside a literal leaks the source indentation into the text users see
    if not self.genes_are_called:
        raise ConfigError("It seems the contigs database '%s' was created with '--skip-gene-calling' flag. "
                          "Nothing to do here :/" % (self.db_path))

    self.init_gene_calls_dict()

    if not len(self.gene_calls_dict):
        raise ConfigError("Tables that should contain gene calls are empty. Which probably means the gene "
                          "caller reported no genes for your contigs.")

    self.set_next_available_id(t.hmm_hits_table_name)
    self.set_next_available_id(t.hmm_hits_splits_table_name)
def __init__(self, db_path, num_threads_to_use=1, run=run, progress=progress):
    """Set up access to the HMM hits tables of the contigs database at `db_path`.

    Besides initializing the table, this reads the 'contigs_db_hash' meta
    value of the database into `self.contigs_db_hash`.

    Parameters
    ==========
    db_path
        Path of the contigs database.
    num_threads_to_use : int, 1
        Stored on the instance as is.
    run, progress
        Reporting objects, forwarded to `Table.__init__`.

    Raises
    ======
    ConfigError
        If `self.genes_are_called` is falsy, or if the gene calls dict is
        empty after `self.init_gene_calls_dict()`.
    """
    self.num_threads_to_use = num_threads_to_use
    self.db_path = db_path

    utils.is_contigs_db(self.db_path)

    # read the hash through a short-lived connection, and make sure that
    # connection is released instead of being left dangling
    contigs_db = db.DB(self.db_path, utils.get_required_version_for_db(self.db_path))
    try:
        self.contigs_db_hash = contigs_db.get_meta_value('contigs_db_hash')
    finally:
        contigs_db.disconnect()

    Table.__init__(self, self.db_path, anvio.__contigs__version__, run, progress)

    # messages are built from adjacent literals: a backslash continuation
    # inside a literal leaks the source indentation into the text users see
    if not self.genes_are_called:
        raise ConfigError("It seems the contigs database '%s' was created with '--skip-gene-calling' flag. "
                          "Nothing to do here :/" % (self.db_path))

    self.init_gene_calls_dict()

    if not len(self.gene_calls_dict):
        raise ConfigError("Tables that should contain gene calls are empty. Which probably means the gene "
                          "caller reported no genes for your contigs.")

    self.set_next_available_id(t.hmm_hits_table_name)
    self.set_next_available_id(t.hmm_hits_splits_table_name)
def __init__(self, db_path, parameters, split_names=None, ignore_splits_name_check=False, run=run, progress=progress):
    """Set up access to the gene-level coverage stats table of the database at `db_path`.

    Parameters
    ==========
    db_path
        Path of the database.
    parameters : dict
        Parameters used to establish the gene-level coverage data.
    split_names : None
        Stored on the instance as is.
    ignore_splits_name_check : bool, False
        Stored on the instance as is.
    run, progress
        Reporting objects; stored on the instance and passed to `Table.__init__`.

    Raises
    ======
    ConfigError
        If `parameters` is not a dict.
    """
    self.db_path = db_path
    self.parameters = parameters
    self.split_names = split_names
    self.ignore_splits_name_check = ignore_splits_name_check

    if not isinstance(parameters, dict):
        # adjacent literals rather than backslash continuations, so the source
        # indentation does not end up inside the message
        raise ConfigError("Parameters must be of type dict. These are basically the parameters such as "
                          "min_cov_for_detection, outliers_threshold, or zeros_are_outliers, that "
                          "are used to establish gene-level coverage data. Anvi'o stores them in "
                          "the gene database, so it can warn the user what they're about to read "
                          "from the database is not what they actually want to read (because the "
                          "parameters have changed at some point).")

    self.run = run
    self.progress = progress

    Table.__init__(self, self.db_path, utils.get_required_version_for_db(db_path), run=self.run, progress=self.progress)

    self.num_entries = 0
    self.set_next_available_id(t.gene_level_coverage_stats_table_name)

    # a single connection serves both meta values, and it is released afterwards
    # (two separate connections used to be opened here and never closed)
    database = db.DB(self.db_path, None, ignore_version=True)
    try:
        self.collection_name = database.get_meta_value('collection_name')
        self.bin_name = database.get_meta_value('bin_name')
    finally:
        database.disconnect()
def __init__(self, args):
    """Resolve the target database from `args` and initialize the table.

    NOTE(review): this method reads `self.table_name`, `self.run` and
    `self.progress` without ever assigning them -- presumably the inheriting
    class sets them before calling this initializer; verify against callers.

    Parameters
    ==========
    args
        Object whose `__dict__` may contain `pan_or_profile_db`, `profile_db`,
        `pan_db` (the first non-empty one, in this order, is used as the
        database path), and `just_do_it`.

    Raises
    ======
    ConfigError
        If none of the database path attributes is set, or if
        `self.table_name` is empty.
    """
    # attributes missing from `args` resolve to None
    A = lambda x: args.__dict__.get(x)

    self.db_path = A('pan_or_profile_db') or A('profile_db') or A('pan_db')
    self.just_do_it = A('just_do_it')

    # messages are built from adjacent literals: a backslash continuation
    # inside a literal leaks the source indentation into the text users see
    if not self.db_path:
        raise ConfigError("The AdditionalAndOrderDataBaseClass is inherited with an args object that did not "
                          "contain any database path :/ Even though any of the following would "
                          "have worked: `pan_or_profile_db`, `profile_db`, `pan_db` :(")

    if not self.table_name:
        raise ConfigError("The AdditionalAndOrderDataBaseClass does not know anything about the table it should "
                          "be working with.")

    utils.is_pan_or_profile_db(self.db_path)
    self.db_type = utils.get_db_type(self.db_path)
    self.db_version = utils.get_required_version_for_db(self.db_path)

    # the connection is released even if reading the column fails
    database = db.DB(self.db_path, self.db_version)
    try:
        self.additional_data_keys = database.get_single_column_from_table(self.table_name, 'data_key')
    finally:
        database.disconnect()

    Table.__init__(self, self.db_path, self.db_version, self.run, self.progress)

    # null value for each data type
    self.nulls_per_type = {'str': '',
                           'int': 0,
                           'float': 0,
                           'stackedbar': None,
                           'unknown': None}
def __init__(self, db_path, parameters, mode, split_names=None, ignore_splits_name_check=False, run=run, progress=progress):
    """Set up access to the gene-level stats table that matches `mode`.

    Parameters
    ==========
    db_path
        Path of the database.
    parameters : dict
        Parameters used to establish the gene-level coverage data.
    mode : str
        Either 'INSEQ' or 'STANDARD'; determines `self.table_name` and
        `self.table_structure`.
    split_names : None
        Stored on the instance as is.
    ignore_splits_name_check : bool, False
        Stored on the instance as is.
    run, progress
        Reporting objects; stored on the instance and passed to `Table.__init__`.

    Raises
    ======
    ConfigError
        If `mode` is neither 'INSEQ' nor 'STANDARD', or if `parameters` is
        not a dict.
    """
    self.run = run
    self.progress = progress

    self.db_path = db_path
    self.parameters = parameters
    self.split_names = split_names
    self.ignore_splits_name_check = ignore_splits_name_check
    self.mode = mode

    if self.mode == 'INSEQ':
        self.table_name = t.gene_level_inseq_stats_table_name
        self.table_structure = t.gene_level_inseq_stats_table_structure
    elif self.mode == 'STANDARD':
        self.table_name = t.gene_level_coverage_stats_table_name
        self.table_structure = t.gene_level_coverage_stats_table_structure
    else:
        raise ConfigError("TableForGeneLevelCoverages class is speaking: you came here with an improper 'mode' "
                          "when you were expected to come with 'INSEQ' or 'STANDARD' modes. Your mode, '%s', "
                          "is not welcome here :(" % (self.mode))

    if not isinstance(parameters, dict):
        raise ConfigError("Parameters must be of type dict. These are basically the parameters such as "
                          "min_cov_for_detection, outliers_threshold, or zeros_are_outliers, that "
                          "are used to establish gene-level coverage data. Anvi'o stores them in "
                          "the gene database, so it can warn the user what they're about to read "
                          "from the database is not what they actually want to read (because the "
                          "parameters have changed at some point).")

    Table.__init__(self, self.db_path, utils.get_required_version_for_db(db_path), run=self.run, progress=self.progress)

    self.num_entries = 0

    # a single connection serves both meta values, and it is released afterwards
    # (two separate connections used to be opened here and never closed)
    database = db.DB(self.db_path, None, ignore_version=True)
    try:
        self.collection_name = database.get_meta_value('collection_name')
        self.bin_name = database.get_meta_value('bin_name')
    finally:
        database.disconnect()
def populate_genes_in_splits_tables(self, gene_calls_dict=None):
    """Work out which genes occur in which splits and store that in the contigs database.

    Parameters
    ==========
    gene_calls_dict : dict, None
        Gene calls keyed by gene callers id; the 'contig', 'start' and 'stop'
        of each entry are read. When falsy, `self.gene_calls_dict` (as set up
        by `self.init_gene_calls_dict()`) is used instead.
    """
    utils.is_contigs_db(self.db_path)
    Table.__init__(self, self.db_path, anvio.__contigs__version__, self.run, self.progress)

    self.init_gene_calls_dict()

    if not gene_calls_dict:
        gene_calls_dict = self.gene_calls_dict

    genes_in_splits = GenesInSplits()

    # build a dictionary for fast access to all genes identified within a contig
    gene_calls_in_contigs_dict = {}
    for gene_callers_id in gene_calls_dict:
        contig = gene_calls_dict[gene_callers_id]['contig']
        gene_calls_in_contigs_dict.setdefault(contig, set()).add(gene_callers_id)

    contigs_without_any_gene_calls = list(set(self.contigs_info.keys()) - set(gene_calls_in_contigs_dict.keys()))

    # a database with no contigs must not take this summary line down with a
    # division by zero
    num_contigs = len(self.contigs_info)
    num_contigs_with_genes = len(gene_calls_in_contigs_dict)
    percent_with_genes = num_contigs_with_genes * 100.0 / num_contigs if num_contigs else 0.0
    self.run.info('Contigs with at least one gene call',
                  '%d of %d (%.1f%%)' % (num_contigs_with_genes, num_contigs, percent_with_genes))

    for contig in contigs_without_any_gene_calls:
        gene_calls_in_contigs_dict[contig] = set([])

    # NOTE(review): `splits_dict` is filled below, but nothing in this method
    # reads it afterwards -- confirm whether it is still needed
    splits_dict = {}
    for contig in self.contigs_info:
        for split_name in self.contig_name_to_splits[contig]:
            start = self.splits_info[split_name]['start']
            stop = self.splits_info[split_name]['end']

            gene_start_stops = []
            # here we go through all genes in the contig and identify all the ones that happen to be in
            # this particular split to generate summarized info for each split. BUT one important thing
            # that is done in the following loop is the genes_in_splits.add call, which populates the
            # GenesInSplits class.
            for gene_callers_id in gene_calls_in_contigs_dict[contig]:
                gene_call = gene_calls_dict[gene_callers_id]
                if gene_call['stop'] > start and gene_call['start'] < stop:
                    gene_start_stops.append((gene_call['start'], gene_call['stop']), )
                    genes_in_splits.add(split_name, start, stop, gene_callers_id, gene_call['start'], gene_call['stop'])

            # here we identify genes that are associated with a split even if one base of the gene spills into
            # the defined start or stop of a split, which means, split N, will include genes A, B and C in this
            # scenario:
            #
            # contig: (...)------[ gene A ]--------[ gene B ]----[gene C]---------[ gene D ]-----(...)
            #         (...)----------x---------------------------------------x--------------------------------(...)
            #                        ^ (split N start)                       ^ (split N stop)
            #                        |                                       |
            #                        |<-              split N              ->|
            #
            # however, when looking at the coding versus non-coding nucleotide ratios in a split, we have to make
            # sure that only the relevant portion of gene A and gene C is counted:
            total_coding_nts = 0
            for gene_start, gene_stop in gene_start_stops:
                total_coding_nts += min(gene_stop, stop) - max(gene_start, start)

            splits_dict[split_name] = {'num_genes': len(gene_start_stops),
                                       'avg_gene_length': numpy.mean([(l[1] - l[0]) for l in gene_start_stops]) if len(gene_start_stops) else 0.0,
                                       'ratio_coding': total_coding_nts * 1.0 / (stop - start),
                                       }

    # open connection
    database = db.DB(self.db_path, utils.get_required_version_for_db(self.db_path))
    try:
        # push entries for genes in splits table
        db_entries = [[d[h] for h in t.genes_in_splits_table_structure] for d in genes_in_splits.d]
        database._exec_many('''INSERT INTO %s VALUES (?,?,?,?,?)''' % t.genes_in_splits_table_name, db_entries)
    finally:
        # disconnect, also when the insert fails
        database.disconnect()
def use_external_gene_calls_to_populate_genes_in_contigs_table(self, input_file_path, gene_calls_dict=None, ignore_internal_stop_codons=False, skip_predict_frame=False, skip_amino_acid_sequences=False):
    """Add genes to the contigs database.

    Primary input is either an `input_file_path` for external gene calls, or an
    external `gene_calls_dict` dictionary object.

    Parameters
    ==========
    input_file_path : str
        Path to file with one of the following structures.

        Option 1:
            gene_callers_id    contig            start    stop    direction    partial    call_type    source      version
            0                  CACHJY01_00016    0        693     r            1          1            prodigal    v2.6.3
            1                  CACHJY01_00016    711      1140    r            0          1            prodigal    v2.6.3

        Option 2:
            gene_callers_id    contig            start    stop    direction    partial    call_type    source      version    aa_sequence
            0                  CACHJY01_00016    0        693     r            1          1            prodigal    v2.6.3     MSKKIYFTEYSKVNRLQTISNFTGSA
            1                  CACHJY01_00016    711      1140    r            0          1            prodigal    v2.6.3     MVNVDYHGLIAGAGSGKTKVLTSRIAHIIK

    gene_calls_dict : dict, None
        Alternative to `input_file_path`. If provided, entries will be APPENDED to the database.
        So you need to make sure gene caller ids in your dict do not overlap with the ones in
        the database. Should look like:

            {
                "1": {
                    "contig": "contig_name",
                    "start": 20,
                    "stop": 1544,
                    "direction": "f",
                    "partial": 0,
                    "call_type": 1,
                    "source": "source_name",
                    "version": "unknown",
                    "aa_sequence": "MSKKIYFTEYSKVNRLQTISNFTGSA"
                },

                "2": {
                  (...)
                },

                (...)
            }

        All entries are required except "aa_sequence", which is optional. If provided, it should
        be present for ALL entries, even if it is an empty string. Its presence will be used to
        populate `gene_amino_acid_sequences`.

    ignore_internal_stop_codons : bool, False
        If False, ConfigError will be raised if a stop codon is found inside any gene. If True,
        this is suppressed and the stop codon is replaced with the character `X`.

    skip_predict_frame : bool, False
        If True, ConfigError will be raised if a gene is not divisible by 3. If False, anvi'o predicts
        the most likely open reading frame and trims the start/stop of the gene call to reflect this
        change so that the gene *is* divisible by 3. This flag allows the retention of amino acid
        sequences even if genes are not divisible by 3, or when it is flagged as partial.

    skip_amino_acid_sequences : bool, False
        Should the gene_amino_acid_sequences table be populated? This may be useful if genes that are
        not translated are being added, such as ribosomal RNA genes, etc.
    """

    # by default we assume that this is a pristine run. but if the user sends a dictionary
    append_to_the_db = False

    gene_calls_found = False
    # let's do a rigorous check whether the user provided a gene_calls_dict.
    if (gene_calls_dict is not None and gene_calls_dict is not False):
        if not isinstance(gene_calls_dict, dict):
            # adjacent literals rather than a backslash continuation, so the
            # source indentation does not end up inside the message
            raise ConfigError("'Use external gene calls' function received a non-empty gene_calls_dict object, "
                              "but it is of type '%s', and not '%s'" % (type(gene_calls_dict), type({})))

        # congrats, we have a dict.
        gene_calls_found = True

        # 'aa_sequence' has to be there for every single gene call, or for none of them
        num_with_aa_seqs = sum(1 for gene_call in gene_calls_dict.values() if 'aa_sequence' in gene_call)
        num_gene_calls = len(gene_calls_dict)
        if num_with_aa_seqs != 0 and num_with_aa_seqs != num_gene_calls:
            raise ConfigError("The gene_calls_dict passed to use_external_gene_calls_to_populate_genes_in_contigs_table "
                              "has %d entries with 'aa_sequence' and %d without. Either 0 or all (%d) should have "
                              "'aa_sequence'" % (num_with_aa_seqs, num_gene_calls - num_with_aa_seqs, num_gene_calls))

        if not gene_calls_dict:
            # but it is empty ... silly user.
            self.run.info_single("'Use external gene calls' function found an empty gene calls dict, returning "
                                 "prematurely and assuming you know what's up. If you don't, stop here and try to "
                                 "identify what decisions you've made might have led you to this weird point your "
                                 "workflow (or 'life', totally up to you and your mood, but anvi'o thinks you've "
                                 "done great so far.", nl_before=1, nl_after=1)
            return

    if (not input_file_path and not gene_calls_found) or (input_file_path and gene_calls_found):
        raise ConfigError("You must provide either an input file, or an gene calls dict to process external "
                          "gene calls. You called `use_external_gene_calls_to_populate_genes_in_contigs_table` "
                          "with wrong parameters.")

    Table.__init__(self, self.db_path, anvio.__contigs__version__, self.run, self.progress, simple=True)

    # take care of gene calls dict
    if not gene_calls_found:
        expected_fields = t.genes_in_contigs_table_structure
        column_mapping = [int, str, int, int, str, int, int, str, str]

        if 'aa_sequence' in utils.get_columns_of_TAB_delim_file(input_file_path):
            expected_fields = t.genes_in_contigs_table_structure + ['aa_sequence']
            column_mapping.append(lambda x: '' if x is None else str(x))  # str(None) is 'None', amazingly

        gene_calls_dict = utils.get_TAB_delimited_file_as_dictionary(input_file_path,
                                                                     expected_fields=expected_fields,
                                                                     only_expected_fields=True,
                                                                     column_mapping=column_mapping)

        if not len(gene_calls_dict):
            raise ConfigError("You provided an external gene calls file, but it returned zero gene calls. Assuming that "
                              "this is an error, anvi'o will stop here and complain. If this is not an error and you "
                              "in fact expected this, the proper way of doing this is to use `--skip-gene-calls` flag, "
                              "instead of providing an emtpy external gene calls file. You don't agree? You need this "
                              "for some weird step for you weird pipeline? Let us know, and we will consider changing "
                              "this.")

        self.run.info("External gene calls", "%d gene calls recovered and will be processed." % len(gene_calls_dict))
    else:
        # FIXME: we need to make sure the gene caller ids in the incoming dictionary are not going to
        #        overwrite an existing gene call. Something like this would have returned the
        #        current max, which could be cross-checked with what's in the dict:
        #
        #            contigs_db = ContigsDatabase(self.db_path)
        #            next_id = contigs_db.db.get_max_value_in_column('genes_in_contigs', 'gene_callers_id') + 1
        #            contigs_db.disconnect()
        append_to_the_db = True

    # recover amino acid sequences or create a blank dictionary
    if skip_amino_acid_sequences:
        amino_acid_sequences = {g: '' for g in gene_calls_dict}
    else:
        gene_calls_dict, amino_acid_sequences = self.get_amino_acid_sequences_for_genes_in_gene_calls_dict(
            gene_calls_dict,
            ignore_internal_stop_codons=ignore_internal_stop_codons,
            skip_predict_frame=skip_predict_frame,
        )

    # populate genes_in_contigs, and gene_amino_acid_sequences table in contigs db.
    self.populate_genes_in_contigs_table(gene_calls_dict, amino_acid_sequences, append_to_the_db=append_to_the_db)
def __init__(self, db_path, run=run, progress=progress):
    """Initialize the table for the database at `db_path`.

    Parameters
    ==========
    db_path
        Path of the database this instance works on.
    run, progress
        Reporting objects, forwarded to `Table.__init__` untouched.
    """
    self.db_path = db_path

    required_version = utils.get_required_version_for_db(db_path)
    Table.__init__(self, self.db_path, required_version, run, progress)
def use_external_gene_calls_to_populate_genes_in_contigs_table(self, input_file_path, gene_calls_dict=None, ignore_internal_stop_codons=False, skip_amino_acid_sequences=False):
    """Add genes to the contigs database.

    Either provide an `input_file_path` for external gene calls, or provide an
    external gene calls dictionary. The format should follow this:

        {
          "1": {
              "contig": "contig_name",
              "start": 20,
              "stop": 1544,
              "direction": "f",
              "partial": 0,
              "source": "source_name",
              "version": "unknown"
          },

          "2": {
            (...)
          },

          (...)
        }

    If you provide a `gene_calls_dict`, they will be APPENDED to the database. So you
    need to make sure gene caller ids in your dict do not overlap with the ones in
    the database.

    By default this function will also attempt to add translated DNA sequences into the
    corresponding table per gene call. Unless the `skip_amino_acid_sequences` flag is True.
    This may be useful if genes that are not translated are being added, such as ribosomal
    RNA genes, etc.

    Raises
    ======
    ConfigError
        If `gene_calls_dict` is given but is not a dict, if neither or both of
        `input_file_path` and `gene_calls_dict` are provided, or if the input
        file yields zero gene calls.
    """

    # by default we assume that this is a pristine run. but if the user sends a dictionary
    append_to_the_db = False

    # NOTE: every message below is built from adjacent literals; a backslash
    # continuation inside a literal leaks the source indentation into the text

    gene_calls_found = False
    # let's do a rigorous check whether the user provided a gene_calls_dict.
    if (gene_calls_dict is not None and gene_calls_dict is not False):
        if not isinstance(gene_calls_dict, dict):
            raise ConfigError("'Use external gene calls' function received a non-empty gene_calls_dict object, "
                              "but it is of type '%s', and not '%s'" % (type(gene_calls_dict), type({})))

        # congrats, we have a dict.
        gene_calls_found = True

        if not gene_calls_dict:
            # but it is empty ... silly user.
            self.run.info_single("'Use external gene calls' function found an empty gene calls dict, returning "
                                 "prematurely and assuming you know what's up. If you don't, stop here and try to "
                                 "identify what decisions you've made might have led you to this weird point your "
                                 "workflow (or 'life', totally up to you and your mood, but anvi'o thinks you've "
                                 "done great so far.", nl_before=1, nl_after=1)
            return

    if (not input_file_path and not gene_calls_found) or (input_file_path and gene_calls_found):
        raise ConfigError("You must provide either an input file, or an gene calls dict to process external "
                          "gene calls. You called `use_external_gene_calls_to_populate_genes_in_contigs_table` "
                          "with wrong parameters.")

    Table.__init__(self, self.db_path, anvio.__contigs__version__, self.run, self.progress, simple=True)

    # take care of gene calls dict
    if not gene_calls_found:
        gene_calls_dict = utils.get_TAB_delimited_file_as_dictionary(input_file_path,
                                                                     expected_fields=t.genes_in_contigs_table_structure,
                                                                     only_expected_fields=True,
                                                                     column_mapping=[int, str, int, int, str, int, str, str])

        if not len(gene_calls_dict):
            raise ConfigError("You provided an external gene calls file, but it returned zero gene calls. Assuming that "
                              "this is an error, anvi'o will stop here and complain. If this is not an error and you "
                              "in fact expected this, the proper way of doing this is to use `--skip-gene-calls` flag, "
                              "instead of providing an emtpy external gene calls file. You don't agree? You need this "
                              "for some weird step for you weird pipeline? Let us know, and we will consider changing "
                              "this.")

        self.run.info("External gene calls", "%d gene calls recovered and will be processed." % len(gene_calls_dict))
    else:
        # FIXME: we need to make sure the gene caller ids in the incoming dictionary are not going to
        #        overwrite an existing gene call. Something like this would have returned the
        #        current max, which could be cross-checked with what's in the dict:
        #
        #            contigs_db = ContigsDatabase(self.db_path)
        #            next_id = contigs_db.db.get_max_value_in_column('genes_in_contigs', 'gene_callers_id') + 1
        #            contigs_db.disconnect()
        append_to_the_db = True

    # recover amino acid sequences or create a blank dictionary
    if skip_amino_acid_sequences:
        amino_acid_sequences = {g: '' for g in gene_calls_dict}
    else:
        amino_acid_sequences = self.get_amino_acid_sequences_for_genes_in_gene_calls_dict(gene_calls_dict, ignore_internal_stop_codons=ignore_internal_stop_codons)

    # populate genes_in_contigs, and gene_amino_acid_sequences table in contigs db.
    self.populate_genes_in_contigs_table(gene_calls_dict, amino_acid_sequences, append_to_the_db=append_to_the_db)
def use_external_gene_calls_to_populate_genes_in_contigs_table(self, input_file_path, gene_calls_dict=None, ignore_internal_stop_codons=False):
    """Add genes to the contigs database.

    Either provide an `input_file_path` for external gene calls, or provide an
    external gene calls dictionary. The format should follow this:

        {
          "1": {
              "contig": "contig_name",
              "start": 20,
              "stop": 1544,
              "direction": "f",
              "partial": 0,
              "source": "source_name",
              "version": "unknown"
          },

          "2": {
            (...)
          },

          (...)
        }

    If you provide a `gene_calls_dict`, they will be APPENDED to the database. So you
    need to make sure gene caller ids in your dict do not overlap with the ones in
    the database.

    Gene calls flagged as partial get an empty amino acid sequence. With
    `ignore_internal_stop_codons`, internal stop codons are replaced with 'X'
    rather than raising an error.

    Raises
    ======
    ConfigError
        If `gene_calls_dict` is given but is not a dict, if neither or both of
        `input_file_path` and `gene_calls_dict` are provided, if the input file
        yields zero gene calls, if a gene call refers to an unknown contig, or
        if a translated sequence has an internal stop codon while
        `ignore_internal_stop_codons` is False.
    """

    # by default we assume that this is a pristine run. but if the user sends a dictionary
    append_to_the_db = False

    # NOTE: every message below is built from adjacent literals; a backslash
    # continuation inside a literal leaks the source indentation into the text

    gene_calls_found = False
    # let's do a rigorous check whether the user provided a gene_calls_dict.
    if (gene_calls_dict is not None and gene_calls_dict is not False):
        if not isinstance(gene_calls_dict, dict):
            raise ConfigError("'Use external gene calls' function received a non-empty gene_calls_dict object, "
                              "but it is of type '%s', and not '%s'" % (type(gene_calls_dict), type({})))

        # congrats, we have a dict.
        gene_calls_found = True

        if not gene_calls_dict:
            # but it is empty ... silly user.
            self.run.info_single("'Use external gene calls' function found an empty gene calls dict, returning "
                                 "prematurely and assuming you know what's up. If you don't, stop here and try to "
                                 "identify what decisions you've made might have led you to this weird point your "
                                 "workflow (or 'life', totally up to you and your mood, but anvi'o thinks you've "
                                 "done great so far.", nl_before=1, nl_after=1)
            return

    if (not input_file_path and not gene_calls_found) or (input_file_path and gene_calls_found):
        raise ConfigError("You must provide either an input file, or an gene calls dict to process external "
                          "gene calls. You called `use_external_gene_calls_to_populate_genes_in_contigs_table` "
                          "with wrong parameters.")

    Table.__init__(self, self.db_path, anvio.__contigs__version__, self.run, self.progress, simple=True)

    # take care of gene calls dict
    if not gene_calls_found:
        gene_calls_dict = utils.get_TAB_delimited_file_as_dictionary(input_file_path,
                                                                     expected_fields=t.genes_in_contigs_table_structure,
                                                                     only_expected_fields=True,
                                                                     column_mapping=[int, str, int, int, str, int, str, str])

        if not len(gene_calls_dict):
            raise ConfigError("You provided an external gene calls file, but it returned zero gene calls. Assuming that "
                              "this is an error, anvi'o will stop here and complain. If this is not an error and you "
                              "in fact expected this, the proper way of doing this is to use `--skip-gene-calls` flag, "
                              "instead of providing an emtpy external gene calls file. You don't agree? You need this "
                              "for some weird step for you weird pipeline? Let us know, and we will consider changing "
                              "this.")

        self.run.info("External gene calls", "%d gene calls recovered and will be processed." % len(gene_calls_dict))
    else:
        # FIXME: we need to make sure the gene caller ids in the incoming dictionary are not going to
        #        overwrite an existing gene call. Something like this would have returned the
        #        current max, which could be cross-checked with what's in the dict:
        #
        #            contigs_db = ContigsDatabase(self.db_path)
        #            next_id = contigs_db.db.get_max_value_in_column('genes_in_contigs', 'gene_callers_id') + 1
        #            contigs_db.disconnect()
        append_to_the_db = True

    # recover amino acid sequences. during this operation we are going to have to read all contig sequences
    # into the damn memory. anvi'o is doing a pretty bad job with memory management :(
    amino_acid_sequences = {}

    contig_sequences = {}
    if self.contigs_fasta:
        fasta = u.SequenceSource(self.contigs_fasta)
        while next(fasta):
            contig_sequences[fasta.id] = {'sequence': fasta.seq}
        fasta.close()
    else:
        # release the connection once the sequences are read: it used to be left
        # open, even while `os.remove(self.db_path)` below could be deleting the file
        database = db.DB(self.db_path, utils.get_required_version_for_db(self.db_path))
        try:
            contig_sequences = database.get_table_as_dict(t.contig_sequences_table_name)
        finally:
            database.disconnect()

    num_genes_with_internal_stops = 0
    number_of_partial_gene_calls = 0
    for gene_callers_id in gene_calls_dict:
        gene_call = gene_calls_dict[gene_callers_id]
        contig_name = gene_call['contig']

        if contig_name not in contig_sequences:
            # remove the partial contigs database so things don't get screwed later
            # NOTE(review): this deletes the database file also when gene calls were
            # meant to be appended to an existing database -- confirm this is intended
            os.remove(self.db_path)
            raise ConfigError("You are in big trouble :( The contig name '%s' in your external gene callers file "
                              "does not appear to be in the contigs FASTA file. How did this happen?" % contig_name)

        if gene_call['partial']:
            # partial gene calls are not translated
            amino_acid_sequences[gene_callers_id] = ''
            number_of_partial_gene_calls += 1
            continue

        sequence = contig_sequences[contig_name]['sequence'][gene_call['start']:gene_call['stop']]
        if gene_call['direction'] == 'r':
            sequence = utils.rev_comp(sequence)

        amino_acid_sequence = utils.get_DNA_sequence_translated(sequence, gene_callers_id)

        # check if there are any internal stops:
        if '*' in amino_acid_sequence:
            if ignore_internal_stop_codons:
                amino_acid_sequence = amino_acid_sequence.replace('*', 'X')
                num_genes_with_internal_stops += 1
            else:
                os.remove(self.db_path)
                raise ConfigError("Oops. Anvi'o run into an amino acid seqeunce (that corresponds to the gene callers id '%s') "
                                  "which had an internal stop codon :/ This usually indicates that your external gene calls "
                                  "have problems. If you still want to continue, you can ask anvi'o to ignore internal stop "
                                  "codons on your own risk. It will probably look very ugly on your screen, but here is the "
                                  "DNA sequence for that gene in case you don't trust anvi'o (which only would be fair since "
                                  "anvi'o does not trust you either): %s" % (str(gene_callers_id), sequence))

        amino_acid_sequences[gene_callers_id] = amino_acid_sequence

    # populate genes_in_contigs, and gene_amino_acid_sequences table in contigs db.
    self.populate_genes_in_contigs_table(gene_calls_dict, amino_acid_sequences, append_to_the_db=append_to_the_db)

    if num_genes_with_internal_stops:
        percent_genes_with_internal_stops = num_genes_with_internal_stops * 100.0 / len(gene_calls_dict)
        self.run.warning("Please read this carefully: Your external gene calls contained open reading frames with internal "
                         "stop codons, and you asked anvi'o to ignore those. Anvi'o replaced internal stop codons with 'X' "
                         "characters, and stored them in the contigs database that way. %d of your genes, which corresponded "
                         "to %.2f%% of the total %d genes, had internal stop codons. We hope you are happy." %
                         (num_genes_with_internal_stops, percent_genes_with_internal_stops, len(gene_calls_dict)))

    if number_of_partial_gene_calls:
        # the flag tested above is 'partial'; the message used to say 'impartial'
        self.run.warning("%d of your %d gene calls were partial, hence the translated amino acid sequences for those "
                         "were not stored in the database." % (number_of_partial_gene_calls, len(gene_calls_dict)))
def use_external_gene_calls_to_populate_genes_in_contigs_table(self, input_file_path, gene_calls_dict=None,
                                                               ignore_internal_stop_codons=False,
                                                               skip_amino_acid_sequences=False):
    """Add genes to the contigs database.

    Either provide an `input_file_path` for external gene calls, or provide an
    external gene calls dictionary. The format should follow this:

        {
          "1": {
              "contig": "contig_name",
              "start": 20,
              "stop": 1544,
              "direction": "f",
              "partial": 0,
              "source": "source_name",
              "version": "unknown"
          },

          "2": {
            (...)
          },

        (...)
        }

    If you provide a `gene_calls_dict`, they will be APPENDED to the database. So you
    need to make sure gene caller ids in your dict does not overlap with the ones in
    the database.

    By default this function will also attempt to add translated DNA sequences into the
    corresponding table per gene call. Unless the `skip_amino_acid_sequences` flag is
    True. This may be useful if genes that are not translated are being added, such as
    ribosomal RNA genes, etc.

    Args:
        input_file_path: path of a TAB-delimited external gene calls file. Must be
            falsy when `gene_calls_dict` is provided.
        gene_calls_dict: gene calls keyed by gene caller id. `None` and `False` both
            mean "not provided".
        ignore_internal_stop_codons: forwarded to
            `get_amino_acid_sequences_for_genes_in_gene_calls_dict`.
        skip_amino_acid_sequences: when True, an empty amino acid sequence is stored
            for every gene call instead of a translation.

    Raises:
        ConfigError: if `gene_calls_dict` is provided but is not a dict, if both or
            neither of the two inputs are provided, or if the input file yields
            zero gene calls.
    """

    # by default we assume that this is a pristine run. but if the user sends a dictionary
    # we switch to append mode further below.
    append_to_the_db = False
    gene_calls_found = False

    # let's do a rigorous check whether the user provided a gene_calls_dict.
    if gene_calls_dict is not None and gene_calls_dict is not False:
        if not isinstance(gene_calls_dict, dict):
            raise ConfigError("'Use external gene calls' function received a non-empty gene_calls_dict object, "
                              "but it is of type '%s', and not '%s'" % (type(gene_calls_dict), type({})))

        # congrats, we have a dict.
        gene_calls_found = True

        if not gene_calls_dict:
            # but it is empty ... silly user.
            self.run.info_single("'Use external gene calls' function found an empty gene calls dict, returning "
                                 "prematurely and assuming you know what's up. If you don't, stop here and try to "
                                 "identify what decisions you've made might have led you to this weird point your "
                                 "workflow (or 'life', totally up to you and your mood, but anvi'o thinks you've "
                                 "done great so far.", nl_before=1, nl_after=1)
            return

    # exactly one of the two input sources must be in play
    if (not input_file_path and not gene_calls_found) or (input_file_path and gene_calls_found):
        raise ConfigError("You must provide either an input file, or an gene calls dict to process external "
                          "gene calls. You called `use_external_gene_calls_to_populate_genes_in_contigs_table` "
                          "with wrong parameters.")

    Table.__init__(self, self.db_path, anvio.__contigs__version__, self.run, self.progress, simple=True)

    # take care of gene calls dict
    if not gene_calls_found:
        gene_calls_dict = utils.get_TAB_delimited_file_as_dictionary(input_file_path,
                                                                     expected_fields=t.genes_in_contigs_table_structure,
                                                                     only_expected_fields=True,
                                                                     column_mapping=[int, str, int, int, str, int, str, str])

        if not gene_calls_dict:
            raise ConfigError("You provided an external gene calls file, but it returned zero gene calls. Assuming that "
                              "this is an error, anvi'o will stop here and complain. If this is not an error and you "
                              "in fact expected this, the proper way of doing this is to use `--skip-gene-calls` flag, "
                              "instead of providing an emtpy external gene calls file. You don't agree? You need this "
                              "for some weird step for you weird pipeline? Let us know, and we will consider changing "
                              "this.")

        self.run.info("External gene calls", "%d gene calls recovered and will be processed." % len(gene_calls_dict))
    else:
        # FIXME: we need to make sure the gene caller ids in the incoming dictionary is not going to
        #        overwrite an existing gene call. Something like this would have returned the
        #        current max, which could be cross-checked with what's in the dict:
        #
        #            contigs_db = ContigsDatabase(self.db_path)
        #            next_id = contigs_db.db.get_max_value_in_column('genes_in_contigs', 'gene_callers_id') + 1
        #            contigs_db.disconnect()
        append_to_the_db = True

    # recover amino acid sequences or create a blank dictionary
    if skip_amino_acid_sequences:
        amino_acid_sequences = {gene_callers_id: '' for gene_callers_id in gene_calls_dict}
    else:
        amino_acid_sequences = self.get_amino_acid_sequences_for_genes_in_gene_calls_dict(
            gene_calls_dict, ignore_internal_stop_codons=ignore_internal_stop_codons)

    # populate genes_in_contigs, and gene_amino_acid_sequences table in contigs db.
    self.populate_genes_in_contigs_table(gene_calls_dict, amino_acid_sequences, append_to_the_db=append_to_the_db)
def populate_genes_in_splits_tables(self, gene_calls_dict=None):
    """Record which gene calls fall into which splits in the genes-in-splits table.

    For every split of every contig in `self.contigs_info`, each gene call on that
    contig that overlaps the split is registered through `GenesInSplits.add`, and
    the collected entries are inserted into `t.genes_in_splits_table_name`.

    Args:
        gene_calls_dict: gene calls keyed by gene caller id; each value must provide
            'contig', 'start' and 'stop'. When falsy (None or empty),
            `self.gene_calls_dict` is used instead.
    """

    utils.is_contigs_db(self.db_path)
    Table.__init__(self, self.db_path, anvio.__contigs__version__, self.run, self.progress)
    self.set_next_available_id(t.genes_in_splits_table_name)
    self.init_gene_calls_dict()

    if not gene_calls_dict:
        gene_calls_dict = self.gene_calls_dict

    genes_in_splits = GenesInSplits(entry_id_start=self.next_id(t.genes_in_splits_table_name))

    # build a dictionary for fast access to all genes identified within a contig
    gene_calls_in_contigs_dict = {}
    for gene_callers_id in gene_calls_dict:
        contig = gene_calls_dict[gene_callers_id]['contig']
        gene_calls_in_contigs_dict.setdefault(contig, set()).add(gene_callers_id)

    num_contigs = len(self.contigs_info)
    num_contigs_with_gene_calls = len(gene_calls_in_contigs_dict)

    # an empty contigs_info must not crash the report with a ZeroDivisionError
    percent_with_gene_calls = num_contigs_with_gene_calls * 100.0 / num_contigs if num_contigs else 0.0

    self.run.info('Contigs with at least one gene call',
                  '%d of %d (%.1f%%)' % (num_contigs_with_gene_calls, num_contigs, percent_with_gene_calls))

    for contig in self.contigs_info:
        # contigs without any gene calls simply have nothing to iterate over
        gene_callers_ids_in_contig = gene_calls_in_contigs_dict.get(contig, ())

        for split_name in self.contig_name_to_splits[contig]:
            start = self.splits_info[split_name]['start']
            stop = self.splits_info[split_name]['end']

            # a gene is associated with a split even if only one base of the gene spills into
            # the defined start or stop of a split, which means, split N, will include genes
            # A, B and C in this scenario:
            #
            # contig: (...)------[ gene A ]--------[ gene B ]----[gene C]---------[ gene D ]-----(...)
            #         (...)----------x---------------------------------------x--------------------(...)
            #                        ^ (split N start)                       ^ (split N stop)
            #                        |                                       |
            #                        |<-              split N              ->|
            for gene_callers_id in gene_callers_ids_in_contig:
                gene_call = gene_calls_dict[gene_callers_id]

                if gene_call['stop'] > start and gene_call['start'] < stop:
                    genes_in_splits.add(split_name, start, stop, gene_callers_id, gene_call['start'], gene_call['stop'])

    # one row per entry: the entry id followed by the remaining columns of the table structure
    entries = genes_in_splits.splits_to_prots
    db_entries = [tuple([entry_id] + [entries[entry_id][h] for h in t.genes_in_splits_table_structure[1:]])
                  for entry_id in entries]

    # open connection
    database = db.DB(self.db_path, utils.get_required_version_for_db(self.db_path))

    try:
        # push entries for genes in splits table
        database._exec_many('''INSERT INTO %s VALUES (?,?,?,?,?,?)''' % t.genes_in_splits_table_name, db_entries)
    finally:
        # disconnect, even if the insert failed
        database.disconnect()
def use_external_gene_calls_to_populate_genes_in_contigs_table(self, input_file_path, gene_calls_dict=None,
                                                               ignore_internal_stop_codons=False):
    """Add genes to the contigs database.

    Either provide an `input_file_path` for external gene calls, or provide an
    external gene calls dictionary. The format should follow this:

        {
          "1": {
              "contig": "contig_name",
              "start": 20,
              "stop": 1544,
              "direction": "f",
              "partial": 0,
              "source": "source_name",
              "version": "unknown"
          },

          "2": {
            (...)
          },

        (...)
        }

    If you provide a `gene_calls_dict`, they will be APPENDED to the database. So you
    need to make sure gene caller ids in your dict does not overlap with the ones in
    the database.

    Each gene call that is not flagged as partial is translated from its contig
    sequence (reverse-complemented first when its direction is 'r'). Gene calls
    flagged as partial get an empty amino acid sequence.

    Args:
        input_file_path: path of a TAB-delimited external gene calls file. Must be
            falsy when `gene_calls_dict` is provided.
        gene_calls_dict: gene calls keyed by gene caller id. `None` and `False` both
            mean "not provided".
        ignore_internal_stop_codons: when True, internal stop codons ('*') in a
            translation are replaced with 'X' instead of raising.

    Raises:
        ConfigError: if `gene_calls_dict` is provided but is not a dict, if both or
            neither of the two inputs are provided, if the input file yields zero
            gene calls, if a gene call refers to an unknown contig, or if a
            translation contains an internal stop codon and
            `ignore_internal_stop_codons` is False. In the last two cases the file
            at `self.db_path` is removed before raising.
    """

    # by default we assume that this is a pristine run. but if the user sends a dictionary
    # we switch to append mode further below.
    append_to_the_db = False
    gene_calls_found = False

    # let's do a rigorous check whether the user provided a gene_calls_dict.
    if gene_calls_dict is not None and gene_calls_dict is not False:
        if not isinstance(gene_calls_dict, dict):
            raise ConfigError("'Use external gene calls' function received a non-empty gene_calls_dict object, "
                              "but it is of type '%s', and not '%s'" % (type(gene_calls_dict), type({})))

        # congrats, we have a dict.
        gene_calls_found = True

        if not gene_calls_dict:
            # but it is empty ... silly user.
            self.run.info_single("'Use external gene calls' function found an empty gene calls dict, returning "
                                 "prematurely and assuming you know what's up. If you don't, stop here and try to "
                                 "identify what decisions you've made might have led you to this weird point your "
                                 "workflow (or 'life', totally up to you and your mood, but anvi'o thinks you've "
                                 "done great so far.", nl_before=1, nl_after=1)
            return

    # exactly one of the two input sources must be in play
    if (not input_file_path and not gene_calls_found) or (input_file_path and gene_calls_found):
        raise ConfigError("You must provide either an input file, or an gene calls dict to process external "
                          "gene calls. You called `use_external_gene_calls_to_populate_genes_in_contigs_table` "
                          "with wrong parameters.")

    Table.__init__(self, self.db_path, anvio.__contigs__version__, self.run, self.progress, simple=True)

    # take care of gene calls dict
    if not gene_calls_found:
        gene_calls_dict = utils.get_TAB_delimited_file_as_dictionary(input_file_path,
                                                                     expected_fields=t.genes_in_contigs_table_structure,
                                                                     only_expected_fields=True,
                                                                     column_mapping=[int, str, int, int, str, int, str, str])

        if not gene_calls_dict:
            raise ConfigError("You provided an external gene calls file, but it returned zero gene calls. Assuming that "
                              "this is an error, anvi'o will stop here and complain. If this is not an error and you "
                              "in fact expected this, the proper way of doing this is to use `--skip-gene-calls` flag, "
                              "instead of providing an emtpy external gene calls file. You don't agree? You need this "
                              "for some weird step for you weird pipeline? Let us know, and we will consider changing "
                              "this.")

        self.run.info("External gene calls", "%d gene calls recovered and will be processed." % len(gene_calls_dict))
    else:
        # FIXME: we need to make sure the gene caller ids in the incoming dictionary is not going to
        #        overwrite an existing gene call. Something like this would have returned the
        #        current max, which could be cross-checked with what's in the dict:
        #
        #            contigs_db = ContigsDatabase(self.db_path)
        #            next_id = contigs_db.db.get_max_value_in_column('genes_in_contigs', 'gene_callers_id') + 1
        #            contigs_db.disconnect()
        append_to_the_db = True

    # recover amino acid sequences. during this operation we are going to have to read all contig sequences
    # into the damn memory. anvi'o is doing a pretty bad job with memory management :(
    amino_acid_sequences = {}

    contig_sequences = {}
    if self.contigs_fasta:
        fasta = u.SequenceSource(self.contigs_fasta)
        try:
            while next(fasta):
                contig_sequences[fasta.id] = {'sequence': fasta.seq}
        finally:
            fasta.close()
    else:
        database = db.DB(self.db_path, utils.get_required_version_for_db(self.db_path))
        try:
            contig_sequences = database.get_table_as_dict(t.contig_sequences_table_name)
        finally:
            # release the handle before the database is written to (or removed) below
            database.disconnect()

    num_genes_with_internal_stops = 0
    number_of_impartial_gene_calls = 0
    for gene_callers_id in gene_calls_dict:
        gene_call = gene_calls_dict[gene_callers_id]
        contig_name = gene_call['contig']

        if contig_name not in contig_sequences:
            # remove the partial contigs database so things don't get screwed later
            # NOTE(review): this also runs in append mode, where self.db_path presumably is
            # a pre-existing database rather than a partial one -- confirm this is intended.
            os.remove(self.db_path)
            raise ConfigError("You are in big trouble :( The contig name '%s' in your external gene callers file "
                              "does not appear to be in the contigs FASTA file. How did this happen?" % contig_name)

        if gene_call['partial']:
            amino_acid_sequences[gene_callers_id] = ''
            number_of_impartial_gene_calls += 1
            continue

        sequence = contig_sequences[contig_name]['sequence'][gene_call['start']:gene_call['stop']]
        if gene_call['direction'] == 'r':
            sequence = utils.rev_comp(sequence)

        amino_acid_sequence = utils.get_DNA_sequence_translated(sequence, gene_callers_id)

        # check if there are any internal stops:
        if '*' in amino_acid_sequence:
            if ignore_internal_stop_codons:
                amino_acid_sequence = amino_acid_sequence.replace('*', 'X')
                num_genes_with_internal_stops += 1
            else:
                os.remove(self.db_path)
                raise ConfigError("Oops. Anvi'o run into an amino acid seqeunce (that corresponds to the gene callers id '%s') "
                                  "which had an internal stop codon :/ This usually indicates that your external gene calls "
                                  "have problems. If you still want to continue, you can ask anvi'o to ignore internal stop "
                                  "codons on your own risk. It will probably look very ugly on your screen, but here is the "
                                  "DNA sequence for that gene in case you don't trust anvi'o (which only would be fair since "
                                  "anvi'o does not trust you either): %s" % (str(gene_callers_id), sequence))

        amino_acid_sequences[gene_callers_id] = amino_acid_sequence

    # populate genes_in_contigs, and gene_amino_acid_sequences table in contigs db.
    self.populate_genes_in_contigs_table(gene_calls_dict, amino_acid_sequences, append_to_the_db=append_to_the_db)

    if num_genes_with_internal_stops:
        percent_genes_with_internal_stops = num_genes_with_internal_stops * 100.0 / len(gene_calls_dict)
        self.run.warning("Please read this carefully: Your external gene calls contained open reading frames with internal "
                         "stop codons, and you asked anvi'o to ignore those. Anvi'o replaced internal stop codons with 'X' "
                         "characters, and stored them in the contigs database that way. %d of your genes, which corresponded "
                         "to %.2f%% of the total %d genes, had internal stop codons. We hope you are happy."
                         % (num_genes_with_internal_stops, percent_genes_with_internal_stops, len(gene_calls_dict)))

    if number_of_impartial_gene_calls:
        self.run.warning('%d of your %d gene calls were impartial, hence the translated amino acid sequences for those '
                         'were not stored in the database.' % (number_of_impartial_gene_calls, len(gene_calls_dict)))