Beispiel #1
0
    def __init__(self, db_path, run=run, progress=progress):
        self.db_path = db_path

        Table.__init__(self, self.db_path, utils.get_required_version_for_db(db_path), run, progress)

        # set these dudes so we have access to unique IDs:
        self.set_next_available_id(t.collections_bins_info_table_name)
        self.set_next_available_id(t.collections_contigs_table_name)
        self.set_next_available_id(t.collections_splits_table_name)
Beispiel #2
0
    def __init__(self, db_path):
        self.db_path = db_path
        self.states = {}

        if utils.get_db_type(self.db_path) not in ['profile', 'pan', 'structure', 'genes']:
            raise ConfigError("Your database '%s' does not seem to have states table, which anvi'o tries to access.")

        Table.__init__(self, self.db_path, utils.get_required_version_for_db(db_path), run, progress)

        self.init()
Beispiel #3
0
    def __init__(self, db_path, run=run, progress=progress):
        self.db_path = db_path
        self.run = run
        self.progress = progress

        Table.__init__(self, self.db_path, utils.get_required_version_for_db(db_path), run=self.run, progress=self.progress)

        self.num_entries = 0
        self.db_entries = []
        self.set_next_available_id(t.variable_codons_table_name)
Beispiel #4
0
    def call_genes_and_populate_genes_in_contigs_table(self, gene_caller='prodigal'):
        Table.__init__(self, self.db_path, anvio.__contigs__version__, self.run, self.progress, simple=True)

        # get gene calls and amino acid sequences
        gene_calls_dict, amino_acid_sequences = self.run_gene_caller(gene_caller)

        # make sure the returning gene calls dict is proper
        self.check_gene_calls_dict(gene_calls_dict)

        # populate genes_in_contigs, and gene_amino_acid_sequences table in contigs db.
        self.populate_genes_in_contigs_table(gene_calls_dict, amino_acid_sequences)
Beispiel #5
0
    def __init__(self, db_path, run=run, progress=progress):
        self.db_path = db_path

        self.run = run
        self.progress = progress

        utils.is_contigs_db(self.db_path)

        Table.__init__(self, self.db_path, anvio.__contigs__version__, run, progress)

        self.set_next_available_id(t.gene_function_calls_table_name)
Beispiel #6
0
    def __init__(self, db_path, run=run, progress=progress):
        self.db_path = db_path

        utils.is_pan_db(db_path)

        self.run = run
        self.progress = progress

        Table.__init__(self, self.db_path, anvio.__pan__version__, run, progress)

        self.set_next_available_id(t.pan_gene_clusters_table_name)

        self.entries = []
Beispiel #7
0
    def __init__(self, db_path, run=run, progress=progress):
        self.db_path = db_path
        self.run = run
        self.progress = progress

        utils.is_contigs_db(self.db_path)

        Table.__init__(self, self.db_path, anvio.__contigs__version__, self.run, self.progress)
        TaxonNamesTable.__init__(self, self.db_path, self.run, self.progress)

        # this class keeps track of genes that occur in splits, and responsible
        # for generating the necessary table in the contigs database
        self.genes_in_splits = GenesInSplits()
Beispiel #8
0
    def __init__(self, db_path, num_threads_to_use=1, run=run, progress=progress):
        self.num_threads_to_use = num_threads_to_use
        self.db_path = db_path

        utils.is_contigs_db(self.db_path)

        self.contigs_db_hash = db.DB(self.db_path, utils.get_required_version_for_db(self.db_path)).get_meta_value('contigs_db_hash')

        Table.__init__(self, self.db_path, anvio.__contigs__version__, run, progress)

        if not self.genes_are_called:
            raise ConfigError("It seems the contigs database '%s' was created with '--skip-gene-calling' flag.\
                                Nothing to do here :/" % (self.db_path))

        self.init_gene_calls_dict()

        if not len(self.gene_calls_dict):
            raise ConfigError("Tables that should contain gene calls are empty. Which probably means the gene\
                                caller reported no genes for your contigs.")

        self.set_next_available_id(t.hmm_hits_table_name)
        self.set_next_available_id(t.hmm_hits_splits_table_name)
Beispiel #9
0
    def __init__(self, db_path, run=run, progress=progress):
        self.db_path = db_path

        Table.__init__(self, self.db_path, utils.get_required_version_for_db(db_path), run, progress)
Beispiel #10
0
    def use_external_gene_calls_to_populate_genes_in_contigs_table(self, input_file_path, gene_calls_dict=None, ignore_internal_stop_codons=False, skip_amino_acid_sequences=False):
        """Add genes to the contigs database.

           Either provide an `input_file_path` for external gene calls, or provide an
           external gene calls dictionary. The format should follow this:

                {
                  "1": {
                      "contig": "contig_name",
                      "start": 20,
                      "stop": 1544,
                      "direction": "f",
                      "partial": 0,
                      "source": "source_name",
                      "version": "unknown"
                  },

                  "2": {
                    (...)
                  },

                (...)
                }

            If you provide a `gene_calls_dict`, they will be APPENDED to the database. So you
            need to make sure gene caller ids in your dict does not overlap with the ones in
            the database.

            By default this function will also attempt to add translated DNA sequences into the
            corresponding table per gene call. Unless the `skip_amino_acid_sequences` flag is
            True. This may be useful if genes that are not translated are being added, such as
            ribosomal RNA genes, etc.
        """

        # by default we assume that this is a pristine run. but if the user sends a dictionary
        append_to_the_db = False

        gene_calls_found = False
        # let's do a rigorous check whether the user provided a gene_calls_dict.
        if (gene_calls_dict is not None and gene_calls_dict is not False):
            if not isinstance(gene_calls_dict, dict):
                raise ConfigError("'Use external gene calls' function received a non-empty gene_calls_dict object,\
                                    but it is of type '%s', and not '%s'" % (type(gene_calls_dict), type({})))

            # congrats, we have a dict.
            gene_calls_found = True

            if not len(gene_calls_dict):
                # but it is empty ... silly user.
                self.run.info_single("'Use external gene calls' function found an empty gene calls dict, returning\
                                      prematurely and assuming you know what's up. If you don't, stop here and try to\
                                      identify what decisions you've made might have led you to this weird point your\
                                      workflow (or 'life', totally up to you and your mood, but anvi'o thinks you've\
                                      done great so far.", nl_before=1, nl_after=1)
                return


        if (not input_file_path and not gene_calls_found) or (input_file_path and gene_calls_found):
            raise ConfigError("You must provide either an input file, or an gene calls dict to process external\
                               gene calls. You called `use_external_gene_calls_to_populate_genes_in_contigs_table`\
                               with wrong parameters.")

        Table.__init__(self, self.db_path, anvio.__contigs__version__, self.run, self.progress, simple=True)

        # take care of gene calls dict
        if not gene_calls_found:
            gene_calls_dict = utils.get_TAB_delimited_file_as_dictionary(input_file_path,
                                                                         expected_fields=t.genes_in_contigs_table_structure,
                                                                         only_expected_fields=True,
                                                                         column_mapping=[int, str, int, int, str, int, str, str])

            if not len(gene_calls_dict):
                raise ConfigError("You provided an external gene calls file, but it returned zero gene calls. Assuming that\
                                   this is an error, anvi'o will stop here and complain. If this is not an error and you\
                                   in fact expected this, the proper way of doing this is to use `--skip-gene-calls` flag,\
                                   instead of providing an emtpy external gene calls file. You don't agree? You need this\
                                   for some weird step for you weird pipeline? Let us know, and we will consider changing\
                                   this.")

            self.run.info("External gene calls", "%d gene calls recovered and will be processed." % len(gene_calls_dict))
        else:
            # FIXME: we need to make sure the gene caller ids in the incoming directory is not going to
            #        overwrite an existing gene call. Something like this would have returned the
            #        current max, which could be cross-checked with what's in the dict:
            #
            #            contigs_db = ContigsDatabase(self.db_path)
            #            next_id = contigs_db.db.get_max_value_in_column('genes_in_contigs', 'gene_callers_id') + 1
            #            contigs_db.disconnect()
            append_to_the_db = True

        # recover amino acid seqeunces or create a blank dictionary
        if skip_amino_acid_sequences:
            amino_acid_sequences = dict([(g, '') for g in gene_calls_dict])
        else:
            amino_acid_sequences = self.get_amino_acid_sequences_for_genes_in_gene_calls_dict(gene_calls_dict, ignore_internal_stop_codons=ignore_internal_stop_codons)

        # populate genes_in_contigs, and gene_amino_acid_sequences table in contigs db.
        self.populate_genes_in_contigs_table(gene_calls_dict, amino_acid_sequences, append_to_the_db=append_to_the_db)
Beispiel #11
0
    def populate_genes_in_splits_tables(self, gene_calls_dict=None):
        utils.is_contigs_db(self.db_path)
        Table.__init__(self, self.db_path, anvio.__contigs__version__, self.run, self.progress)
        self.set_next_available_id(t.genes_in_splits_table_name)
        self.init_gene_calls_dict()

        if not gene_calls_dict:
            gene_calls_dict = self.gene_calls_dict

        genes_in_splits = GenesInSplits(entry_id_start=self.next_id(t.genes_in_splits_table_name))
        # build a dictionary for fast access to all genes identified within a contig
        gene_calls_in_contigs_dict = {}
        for gene_callers_id in gene_calls_dict:
            contig = gene_calls_dict[gene_callers_id]['contig']
            if contig in gene_calls_in_contigs_dict:
                gene_calls_in_contigs_dict[contig].add(gene_callers_id)
            else:
                gene_calls_in_contigs_dict[contig] = set([gene_callers_id])

        contigs_without_any_gene_calls = list(set(self.contigs_info.keys()) - set(gene_calls_in_contigs_dict.keys()))
        self.run.info('Contigs with at least one gene call', '%d of %d (%.1f%%)' % (len(gene_calls_in_contigs_dict),
                                                                                    len(self.contigs_info),
                                                                                    len(gene_calls_in_contigs_dict) * 100.0 / len(self.contigs_info)))

        for contig in contigs_without_any_gene_calls:
            gene_calls_in_contigs_dict[contig] = set([])

        splits_dict = {}
        for contig in self.contigs_info:
            for split_name in self.contig_name_to_splits[contig]:
                start = self.splits_info[split_name]['start']
                stop = self.splits_info[split_name]['end']

                gene_start_stops = []
                # here we go through all genes in the contig and identify the all the ones that happen to be in
                # this particular split to generate summarized info for each split. BUT one important that is done
                # in the following loop is genes_in_splits.add call, which populates GenesInSplits class.
                for gene_callers_id in gene_calls_in_contigs_dict[contig]:
                    if gene_calls_dict[gene_callers_id]['stop'] > start and gene_calls_dict[gene_callers_id]['start'] < stop:
                        gene_start_stops.append((gene_calls_dict[gene_callers_id]['start'], gene_calls_dict[gene_callers_id]['stop']), )
                        genes_in_splits.add(split_name, start, stop, gene_callers_id, gene_calls_dict[gene_callers_id]['start'], gene_calls_dict[gene_callers_id]['stop'])

                # here we identify genes that are associated with a split even if one base of the gene spills into
                # the defined start or stop of a split, which means, split N, will include genes A, B and C in this
                # scenario:
                #
                # contig: (...)------[ gene A ]--------[     gene B    ]----[gene C]---------[    gene D    ]-----(...)
                #         (...)----------x---------------------------------------x--------------------------------(...)
                #                        ^ (split N start)                       ^ (split N stop)
                #                        |                                       |
                #                        |<-              split N              ->|
                #
                # however, when looking at the coding versus non-coding nucleotide ratios in a split, we have to make
                # sure that only the relevant portion of gene A and gene C is counted:
                total_coding_nts = 0
                for gene_start, gene_stop in gene_start_stops:
                    total_coding_nts += (gene_stop if gene_stop < stop else stop) - (gene_start if gene_start > start else start)

                splits_dict[split_name] = {'num_genes': len(gene_start_stops),
                                           'avg_gene_length': numpy.mean([(l[1] - l[0]) for l in gene_start_stops]) if len(gene_start_stops) else 0.0,
                                           'ratio_coding': total_coding_nts * 1.0 / (stop - start),
                                           }

        # open connection
        database = db.DB(self.db_path, utils.get_required_version_for_db(self.db_path))

        # push entries for genes in splits table
        db_entries = [tuple([entry_id] + [genes_in_splits.splits_to_prots[entry_id][h] for h in t.genes_in_splits_table_structure[1:]]) for entry_id in genes_in_splits.splits_to_prots]
        database._exec_many('''INSERT INTO %s VALUES (?,?,?,?,?,?)''' % t.genes_in_splits_table_name, db_entries)

        # disconnect
        database.disconnect()
Beispiel #12
0
    def use_external_gene_calls_to_populate_genes_in_contigs_table(self, input_file_path, gene_calls_dict=None, ignore_internal_stop_codons=False):
        """Add genes to the contigs database.

           Either provide an `input_file_path` for external gene calls, or provide an
           external gene calls dictionary. The format should follow this:

                {
                  "1": {
                      "contig": "contig_name",
                      "start": 20,
                      "stop": 1544,
                      "direction": "f",
                      "partial": 0,
                      "source": "source_name",
                      "version": "unknown"
                  },

                  "2": {
                    (...)
                  },

                (...)
                }

            If you provide a `gene_calls_dict`, they will be APPENDED to the database. So you
            need to make sure gene caller ids in your dict does not overlap with the ones in
            the database.

        """

        # by default we assume that this is a pristine run. but if the user sends a dictionary
        append_to_the_db = False

        gene_calls_found = False
        # let's do a rigorous check whether the user provided a gene_calls_dict.
        if (gene_calls_dict is not None and gene_calls_dict is not False):
            if not isinstance(gene_calls_dict, dict):
                raise ConfigError("'Use external gene calls' function received a non-empty gene_calls_dict object,\
                                    but it is of type '%s', and not '%s'" % (type(gene_calls_dict), type({})))

            # congrats, we have a dict.
            gene_calls_found = True

            if not len(gene_calls_dict):
                # but it is empty ... silly user.
                self.run.info_single("'Use external gene calls' function found an empty gene calls dict, returning\
                                      prematurely and assuming you know what's up. If you don't, stop here and try to\
                                      identify what decisions you've made might have led you to this weird point your\
                                      workflow (or 'life', totally up to you and your mood, but anvi'o thinks you've\
                                      done great so far.", nl_before=1, nl_after=1)
                return


        if (not input_file_path and not gene_calls_found) or (input_file_path and gene_calls_found):
            raise ConfigError("You must provide either an input file, or an gene calls dict to process external\
                               gene calls. You called `use_external_gene_calls_to_populate_genes_in_contigs_table`\
                               with wrong parameters.")

        Table.__init__(self, self.db_path, anvio.__contigs__version__, self.run, self.progress, simple=True)

        # take care of gene calls dict
        if not gene_calls_found:
            gene_calls_dict = utils.get_TAB_delimited_file_as_dictionary(input_file_path,
                                                                         expected_fields=t.genes_in_contigs_table_structure,
                                                                         only_expected_fields=True,
                                                                         column_mapping=[int, str, int, int, str, int, str, str])

            if not len(gene_calls_dict):
                raise ConfigError("You provided an external gene calls file, but it returned zero gene calls. Assuming that\
                                   this is an error, anvi'o will stop here and complain. If this is not an error and you\
                                   in fact expected this, the proper way of doing this is to use `--skip-gene-calls` flag,\
                                   instead of providing an emtpy external gene calls file. You don't agree? You need this\
                                   for some weird step for you weird pipeline? Let us know, and we will consider changing\
                                   this.")

            self.run.info("External gene calls", "%d gene calls recovered and will be processed." % len(gene_calls_dict))
        else:
            # FIXME: we need to make sure the gene caller ids in the incoming directory is not going to
            #        overwrite an existing gene call. Something like this would have returned the
            #        current max, which could be cross-checked with what's in the dict:
            #
            #            contigs_db = ContigsDatabase(self.db_path)
            #            next_id = contigs_db.db.get_max_value_in_column('genes_in_contigs', 'gene_callers_id') + 1
            #            contigs_db.disconnect()
            append_to_the_db = True

        # recover amino acid sequences. during this operation we are going to have to read all contig sequences
        # into the damn memory. anvi'o is doing a pretty bad job with memory management :(
        amino_acid_sequences = {}

        contig_sequences = {}
        if self.contigs_fasta:
            fasta = u.SequenceSource(self.contigs_fasta)
            while next(fasta):
                contig_sequences[fasta.id] = {'sequence': fasta.seq}
            fasta.close()
        else:
            database = db.DB(self.db_path, utils.get_required_version_for_db(self.db_path))
            contig_sequences = database.get_table_as_dict(t.contig_sequences_table_name)

        num_genes_with_internal_stops = 0
        number_of_impartial_gene_calls = 0
        for gene_callers_id in gene_calls_dict:
            gene_call = gene_calls_dict[gene_callers_id]
            contig_name = gene_call['contig']

            if contig_name not in contig_sequences:
                # remove the partial contigs database so things don't get screwed later
                os.remove(self.db_path)
                raise ConfigError("You are in big trouble :( The contig name '%s' in your external gene callers file\
                                    does not appear to be in the contigs FASTA file. How did this happen?" % contig_name)

            if gene_call['partial']:
                amino_acid_sequences[gene_callers_id] = ''
                number_of_impartial_gene_calls += 1
                continue

            sequence = contig_sequences[contig_name]['sequence'][gene_call['start']:gene_call['stop']]
            if gene_call['direction'] == 'r':
                sequence = utils.rev_comp(sequence)

            amino_acid_sequence = utils.get_DNA_sequence_translated(sequence, gene_callers_id)

            # check if there are any internal stops:
            if amino_acid_sequence.find('*') > -1:
                if ignore_internal_stop_codons:
                    amino_acid_sequence = amino_acid_sequence.replace('*', 'X')
                    num_genes_with_internal_stops += 1
                else:
                    os.remove(self.db_path)
                    raise ConfigError("Oops. Anvi'o run into an amino acid seqeunce (that corresponds to the gene callers id '%s')\
                                       which had an internal stop codon :/ This usually indicates that your external gene calls\
                                       have problems. If you still want to continue, you can ask anvi'o to ignore internal stop\
                                       codons on your own risk. It will probably look very ugly on your screen, but here is the\
                                       DNA sequence for that gene in case you don't trust anvi'o (which only would be fair since\
                                       anvi'o does not trust you either): %s" % (str(gene_callers_id), sequence))

            amino_acid_sequences[gene_callers_id] = amino_acid_sequence

        # populate genes_in_contigs, and gene_amino_acid_sequences table in contigs db.
        self.populate_genes_in_contigs_table(gene_calls_dict, amino_acid_sequences, append_to_the_db=append_to_the_db)

        if num_genes_with_internal_stops:
            percent_genes_with_internal_stops = num_genes_with_internal_stops * 100.0 / len(gene_calls_dict)
            self.run.warning("Please read this carefully: Your external gene calls contained open reading frames with internal\
                              stop codons, and you asked anvi'o to ignore those. Anvi'o replaced internal stop codons with 'X'\
                              characters, and stored them in the contigs database that way. %d of your genes, which corresponded\
                              to %.2f%% of the total %d genes, had internal stop codons. We hope you are happy." % \
                                        (num_genes_with_internal_stops, percent_genes_with_internal_stops, len(gene_calls_dict)))

        if number_of_impartial_gene_calls:
            self.run.warning('%d of your %d gene calls were impartial, hence the translated amino acid sequences for those\
                              were not stored in the database.' % (number_of_impartial_gene_calls, len(gene_calls_dict)))
Beispiel #13
0
    def use_external_gene_calls_to_populate_genes_in_contigs_table(
            self,
            input_file_path,
            gene_calls_dict=None,
            ignore_internal_stop_codons=False,
            skip_amino_acid_sequences=False):
        """Add genes to the contigs database.

           Either provide an `input_file_path` for external gene calls, or provide an
           external gene calls dictionary. The format should follow this:

                {
                  "1": {
                      "contig": "contig_name",
                      "start": 20,
                      "stop": 1544,
                      "direction": "f",
                      "partial": 0,
                      "source": "source_name",
                      "version": "unknown"
                  },

                  "2": {
                    (...)
                  },

                (...)
                }

            If you provide a `gene_calls_dict`, they will be APPENDED to the database. So you
            need to make sure gene caller ids in your dict does not overlap with the ones in
            the database.

            By default this function will also attempt to add translated DNA sequences into the
            corresponding table per gene call. Unless the `skip_amino_acid_sequences` flag is
            True. This may be useful if genes that are not translated are being added, such as
            ribosomal RNA genes, etc.
        """

        # by default we assume that this is a pristine run. but if the user sends a dictionary
        append_to_the_db = False

        gene_calls_found = False
        # let's do a rigorous check whether the user provided a gene_calls_dict.
        if (gene_calls_dict is not None and gene_calls_dict is not False):
            if not isinstance(gene_calls_dict, dict):
                raise ConfigError(
                    "'Use external gene calls' function received a non-empty gene_calls_dict object,\
                                    but it is of type '%s', and not '%s'" %
                    (type(gene_calls_dict), type({})))

            # congrats, we have a dict.
            gene_calls_found = True

            if not len(gene_calls_dict):
                # but it is empty ... silly user.
                self.run.info_single(
                    "'Use external gene calls' function found an empty gene calls dict, returning\
                                      prematurely and assuming you know what's up. If you don't, stop here and try to\
                                      identify what decisions you've made might have led you to this weird point your\
                                      workflow (or 'life', totally up to you and your mood, but anvi'o thinks you've\
                                      done great so far.",
                    nl_before=1,
                    nl_after=1)
                return

        if (not input_file_path
                and not gene_calls_found) or (input_file_path
                                              and gene_calls_found):
            raise ConfigError(
                "You must provide either an input file, or an gene calls dict to process external\
                               gene calls. You called `use_external_gene_calls_to_populate_genes_in_contigs_table`\
                               with wrong parameters.")

        Table.__init__(self,
                       self.db_path,
                       anvio.__contigs__version__,
                       self.run,
                       self.progress,
                       simple=True)

        # take care of gene calls dict
        if not gene_calls_found:
            gene_calls_dict = utils.get_TAB_delimited_file_as_dictionary(
                input_file_path,
                expected_fields=t.genes_in_contigs_table_structure,
                only_expected_fields=True,
                column_mapping=[int, str, int, int, str, int, str, str])

            if not len(gene_calls_dict):
                raise ConfigError(
                    "You provided an external gene calls file, but it returned zero gene calls. Assuming that\
                                   this is an error, anvi'o will stop here and complain. If this is not an error and you\
                                   in fact expected this, the proper way of doing this is to use `--skip-gene-calls` flag,\
                                   instead of providing an emtpy external gene calls file. You don't agree? You need this\
                                   for some weird step for you weird pipeline? Let us know, and we will consider changing\
                                   this.")

            self.run.info(
                "External gene calls",
                "%d gene calls recovered and will be processed." %
                len(gene_calls_dict))
        else:
            # FIXME: we need to make sure the gene caller ids in the incoming directory is not going to
            #        overwrite an existing gene call. Something like this would have returned the
            #        current max, which could be cross-checked with what's in the dict:
            #
            #            contigs_db = ContigsDatabase(self.db_path)
            #            next_id = contigs_db.db.get_max_value_in_column('genes_in_contigs', 'gene_callers_id') + 1
            #            contigs_db.disconnect()
            append_to_the_db = True

        # recover amino acid sequences or create a blank dictionary
        if skip_amino_acid_sequences:
            amino_acid_sequences = dict([(g, '') for g in gene_calls_dict])
        else:
            amino_acid_sequences = self.get_amino_acid_sequences_for_genes_in_gene_calls_dict(
                gene_calls_dict,
                ignore_internal_stop_codons=ignore_internal_stop_codons)

        # populate genes_in_contigs, and gene_amino_acid_sequences table in contigs db.
        self.populate_genes_in_contigs_table(gene_calls_dict,
                                             amino_acid_sequences,
                                             append_to_the_db=append_to_the_db)
Beispiel #14
0
    def use_external_gene_calls_to_populate_genes_in_contigs_table(
            self,
            input_file_path,
            gene_calls_dict=None,
            ignore_internal_stop_codons=False):
        """Add genes to the contigs database.

           Either provide an `input_file_path` for external gene calls, or provide an
           external gene calls dictionary. The format should follow this:

                {
                  "1": {
                      "contig": "contig_name",
                      "start": 20,
                      "stop": 1544,
                      "direction": "f",
                      "partial": 0,
                      "source": "source_name",
                      "version": "unknown"
                  },

                  "2": {
                    (...)
                  },

                (...)
                }

            If you provide a `gene_calls_dict`, they will be APPENDED to the database. So you
            need to make sure gene caller ids in your dict does not overlap with the ones in
            the database.

        """

        # by default we assume that this is a pristine run. but if the user sends a dictionary
        append_to_the_db = False

        gene_calls_found = False
        # let's do a rigorous check whether the user provided a gene_calls_dict.
        if (gene_calls_dict is not None and gene_calls_dict is not False):
            if not isinstance(gene_calls_dict, dict):
                raise ConfigError(
                    "'Use external gene calls' function received a non-empty gene_calls_dict object,\
                                    but it is of type '%s', and not '%s'" %
                    (type(gene_calls_dict), type({})))

            # congrats, we have a dict.
            gene_calls_found = True

            if not len(gene_calls_dict):
                # but it is empty ... silly user.
                self.run.info_single(
                    "'Use external gene calls' function found an empty gene calls dict, returning\
                                      prematurely and assuming you know what's up. If you don't, stop here and try to\
                                      identify what decisions you've made might have led you to this weird point your\
                                      workflow (or 'life', totally up to you and your mood, but anvi'o thinks you've\
                                      done great so far.",
                    nl_before=1,
                    nl_after=1)
                return

        if (not input_file_path
                and not gene_calls_found) or (input_file_path
                                              and gene_calls_found):
            raise ConfigError(
                "You must provide either an input file, or an gene calls dict to process external\
                               gene calls. You called `use_external_gene_calls_to_populate_genes_in_contigs_table`\
                               with wrong parameters.")

        Table.__init__(self,
                       self.db_path,
                       anvio.__contigs__version__,
                       self.run,
                       self.progress,
                       simple=True)

        # take care of gene calls dict
        if not gene_calls_found:
            gene_calls_dict = utils.get_TAB_delimited_file_as_dictionary(
                input_file_path,
                expected_fields=t.genes_in_contigs_table_structure,
                only_expected_fields=True,
                column_mapping=[int, str, int, int, str, int, str, str])

            if not len(gene_calls_dict):
                raise ConfigError(
                    "You provided an external gene calls file, but it returned zero gene calls. Assuming that\
                                   this is an error, anvi'o will stop here and complain. If this is not an error and you\
                                   in fact expected this, the proper way of doing this is to use `--skip-gene-calls` flag,\
                                   instead of providing an emtpy external gene calls file. You don't agree? You need this\
                                   for some weird step for you weird pipeline? Let us know, and we will consider changing\
                                   this.")

            self.run.info(
                "External gene calls",
                "%d gene calls recovered and will be processed." %
                len(gene_calls_dict))
        else:
            # FIXME: we need to make sure the gene caller ids in the incoming directory is not going to
            #        overwrite an existing gene call. Something like this would have returned the
            #        current max, which could be cross-checked with what's in the dict:
            #
            #            contigs_db = ContigsDatabase(self.db_path)
            #            next_id = contigs_db.db.get_max_value_in_column('genes_in_contigs', 'gene_callers_id') + 1
            #            contigs_db.disconnect()
            append_to_the_db = True

        # recover amino acid sequences. during this operation we are going to have to read all contig sequences
        # into the damn memory. anvi'o is doing a pretty bad job with memory management :(
        amino_acid_sequences = {}

        contig_sequences = {}
        if self.contigs_fasta:
            fasta = u.SequenceSource(self.contigs_fasta)
            while next(fasta):
                contig_sequences[fasta.id] = {'sequence': fasta.seq}
            fasta.close()
        else:
            database = db.DB(self.db_path,
                             utils.get_required_version_for_db(self.db_path))
            contig_sequences = database.get_table_as_dict(
                t.contig_sequences_table_name)

        num_genes_with_internal_stops = 0
        number_of_impartial_gene_calls = 0
        for gene_callers_id in gene_calls_dict:
            gene_call = gene_calls_dict[gene_callers_id]
            contig_name = gene_call['contig']

            if contig_name not in contig_sequences:
                # remove the partial contigs database so things don't get screwed later
                os.remove(self.db_path)
                raise ConfigError(
                    "You are in big trouble :( The contig name '%s' in your external gene callers file\
                                    does not appear to be in the contigs FASTA file. How did this happen?"
                    % contig_name)

            if gene_call['partial']:
                amino_acid_sequences[gene_callers_id] = ''
                number_of_impartial_gene_calls += 1
                continue

            sequence = contig_sequences[contig_name]['sequence'][
                gene_call['start']:gene_call['stop']]
            if gene_call['direction'] == 'r':
                sequence = utils.rev_comp(sequence)

            amino_acid_sequence = utils.get_DNA_sequence_translated(
                sequence, gene_callers_id)

            # check if there are any internal stops:
            if amino_acid_sequence.find('*') > -1:
                if ignore_internal_stop_codons:
                    amino_acid_sequence = amino_acid_sequence.replace('*', 'X')
                    num_genes_with_internal_stops += 1
                else:
                    os.remove(self.db_path)
                    raise ConfigError(
                        "Oops. Anvi'o run into an amino acid seqeunce (that corresponds to the gene callers id '%s')\
                                       which had an internal stop codon :/ This usually indicates that your external gene calls\
                                       have problems. If you still want to continue, you can ask anvi'o to ignore internal stop\
                                       codons on your own risk. It will probably look very ugly on your screen, but here is the\
                                       DNA sequence for that gene in case you don't trust anvi'o (which only would be fair since\
                                       anvi'o does not trust you either): %s" %
                        (str(gene_callers_id), sequence))

            amino_acid_sequences[gene_callers_id] = amino_acid_sequence

        # populate genes_in_contigs, and gene_amino_acid_sequences table in contigs db.
        self.populate_genes_in_contigs_table(gene_calls_dict,
                                             amino_acid_sequences,
                                             append_to_the_db=append_to_the_db)

        if num_genes_with_internal_stops:
            percent_genes_with_internal_stops = num_genes_with_internal_stops * 100.0 / len(
                gene_calls_dict)
            self.run.warning("Please read this carefully: Your external gene calls contained open reading frames with internal\
                              stop codons, and you asked anvi'o to ignore those. Anvi'o replaced internal stop codons with 'X'\
                              characters, and stored them in the contigs database that way. %d of your genes, which corresponded\
                              to %.2f%% of the total %d genes, had internal stop codons. We hope you are happy."                                                                                                                 % \
                                        (num_genes_with_internal_stops, percent_genes_with_internal_stops, len(gene_calls_dict)))

        if number_of_impartial_gene_calls:
            self.run.warning(
                '%d of your %d gene calls were impartial, hence the translated amino acid sequences for those\
                              were not stored in the database.' %
                (number_of_impartial_gene_calls, len(gene_calls_dict)))
Beispiel #15
0
    def populate_genes_in_splits_tables(self):
        utils.is_contigs_db(self.db_path)
        Table.__init__(self, self.db_path, anvio.__contigs__version__, run,
                       progress)
        self.init_gene_calls_dict()

        genes_in_splits = GenesInSplits()
        # build a dictionary for fast access to all genes identified within a contig
        gene_calls_in_contigs_dict = {}
        for gene_callers_id in self.gene_calls_dict:
            contig = self.gene_calls_dict[gene_callers_id]['contig']
            if contig in gene_calls_in_contigs_dict:
                gene_calls_in_contigs_dict[contig].add(gene_callers_id)
            else:
                gene_calls_in_contigs_dict[contig] = set([gene_callers_id])

        contigs_without_any_gene_calls = list(
            set(self.contigs_info.keys()) -
            set(gene_calls_in_contigs_dict.keys()))
        run.info(
            'Contigs with at least one gene call', '%d of %d (%.1f%%)' %
            (len(gene_calls_in_contigs_dict), len(self.contigs_info),
             len(gene_calls_in_contigs_dict) * 100.0 / len(self.contigs_info)))

        for contig in contigs_without_any_gene_calls:
            gene_calls_in_contigs_dict[contig] = set([])

        splits_dict = {}
        for contig in self.contigs_info:
            for split_name in self.contig_name_to_splits[contig]:
                start = self.splits_info[split_name]['start']
                stop = self.splits_info[split_name]['end']

                gene_start_stops = []
                # here we go through all genes in the contig and identify the all the ones that happen to be in
                # this particular split to generate summarized info for each split. BUT one important that is done
                # in the following loop is genes_in_splits.add call, which populates GenesInSplits class.
                for gene_callers_id in gene_calls_in_contigs_dict[contig]:
                    if self.gene_calls_dict[gene_callers_id][
                            'stop'] > start and self.gene_calls_dict[
                                gene_callers_id]['start'] < stop:
                        gene_start_stops.append(
                            (self.gene_calls_dict[gene_callers_id]['start'],
                             self.gene_calls_dict[gene_callers_id]['stop']), )
                        genes_in_splits.add(
                            split_name, start, stop, gene_callers_id,
                            self.gene_calls_dict[gene_callers_id]['start'],
                            self.gene_calls_dict[gene_callers_id]['stop'])

                # here we identify genes that are associated with a split even if one base of the gene spills into
                # the defined start or stop of a split, which means, split N, will include genes A, B and C in this
                # scenario:
                #
                # contig: (...)------[ gene A ]--------[     gene B    ]----[gene C]---------[    gene D    ]-----(...)
                #         (...)----------x---------------------------------------x--------------------------------(...)
                #                        ^ (split N start)                       ^ (split N stop)
                #                        |                                       |
                #                        |<-              split N              ->|
                #
                # however, when looking at the coding versus non-coding nucleotide ratios in a split, we have to make
                # sure that only the relevant portion of gene A and gene C is counted:
                total_coding_nts = 0
                for gene_start, gene_stop in gene_start_stops:
                    total_coding_nts += (
                        gene_stop if gene_stop < stop else
                        stop) - (gene_start if gene_start > start else start)

                splits_dict[split_name] = {
                    'num_genes':
                    len(gene_start_stops),
                    'avg_gene_length':
                    numpy.mean([(l[1] - l[0]) for l in gene_start_stops])
                    if len(gene_start_stops) else 0.0,
                    'ratio_coding':
                    total_coding_nts * 1.0 / (stop - start),
                }

        # open connection
        database = db.DB(self.db_path,
                         utils.get_required_version_for_db(self.db_path))
        # push raw entries for splits table
        db_entries = [
            tuple([split] + [
                splits_dict[split][h]
                for h in t.genes_in_splits_summary_table_structure[1:]
            ]) for split in splits_dict
        ]
        database._exec_many(
            '''INSERT INTO %s VALUES (?,?,?,?)''' %
            t.genes_in_splits_summary_table_name, db_entries)

        # push entries for genes in splits table
        db_entries = [
            tuple([entry_id] + [
                genes_in_splits.splits_to_prots[entry_id][h]
                for h in t.genes_in_splits_table_structure[1:]
            ]) for entry_id in genes_in_splits.splits_to_prots
        ]
        database._exec_many(
            '''INSERT INTO %s VALUES (?,?,?,?,?,?)''' %
            t.genes_in_splits_table_name, db_entries)

        # disconnect
        database.disconnect()
Beispiel #16
0
    def use_external_gene_calls_to_populate_genes_in_contigs_table(
            self,
            input_file_path,
            gene_calls_dict=None,
            ignore_internal_stop_codons=False,
            skip_predict_frame=False,
            skip_amino_acid_sequences=False):
        """Add genes to the contigs database.

        Primary input is either an `input_file_path` for external gene calls, or an
        external `gene_calls_dict` dictionary object.

        Parameters
        ==========
        input_file_path : str
            Path to file with one of the following structures.

            Option 1:
                gene_callers_id contig          start  stop  direction  partial  call_type  source    version
                0               CACHJY01_00016  0      693   r          1        1          prodigal  v2.6.3
                1               CACHJY01_00016  711    1140  r          0        1          prodigal  v2.6.3

            Option 2:
                gene_callers_id contig          start  stop  direction  partial  call_type  source    version  aa_sequence
                0               CACHJY01_00016  0      693   r          1        1          prodigal  v2.6.3   MSKKIYFTEYSKVNRLQTISNFTGSA
                1               CACHJY01_00016  711    1140  r          0        1          prodigal  v2.6.3   MVNVDYHGLIAGAGSGKTKVLTSRIAHIIK

        gene_calls_dict : dict, None
            Alternative to `input_file_path`. If provided, entries will be APPENDED to the database.
            So you need to make sure gene caller ids in your dict does not overlap with the ones in
            the database. Should look like:

                {
                    "1": {
                        "contig": "contig_name",
                        "start": 20,
                        "stop": 1544,
                        "direction": "f",
                        "partial": 0,
                        "call_type": 1,
                        "source": "source_name",
                        "version": "unknown",
                        "aa_sequence": "MSKKIYFTEYSKVNRLQTISNFTGSA"
                    },

                    "2": {
                      (...)
                    },

                    (...)
                }

            All entries are required except "aa_sequence", which is optional. If provided, it should
            be present for ALL entries, even if it is an empty string. It's presence will be used to
            populate `gene_amino_acid_sequences`.

        ignore_internal_stop_codons : bool, False
            If False, ConfigError will be raised if a stop codon is found inside any gene. If True,
            this is suppressed and the stop codon is replaced with the character `X`.

        skip_predict_frame : bool, False
            If True, ConfigError will be raised if a gene is not divisible by 3. If False, anvi'o predicts
            the most likley open reading frame and trims the start/stop of the gene call to reflect this
            change so that the gene *is* divisible by 3. This flag allows the retention of amino acid
            sequences even if genes are not divisible by 3, or when it is flagged as partial.

        skip_amino_acid_sequences : bool, False
            Should the gene_amino_acid_sequences table be populated? This may be useful if genes
            that are not translated are being added, such as ribosomal RNA genes, etc.
        """

        # by default we assume that this is a pristine run. but if the user sends a dictionary
        append_to_the_db = False

        gene_calls_found = False
        # let's do a rigorous check whether the user provided a gene_calls_dict.
        if (gene_calls_dict is not None and gene_calls_dict is not False):
            if not isinstance(gene_calls_dict, dict):
                raise ConfigError(
                    "'Use external gene calls' function received a non-empty gene_calls_dict object,\
                                    but it is of type '%s', and not '%s'" %
                    (type(gene_calls_dict), type({})))

            # congrats, we have a dict.
            gene_calls_found = True

            has_aa_seq = lambda x: True if 'aa_sequence' in x else False
            num_with_aa_seqs = sum([
                has_aa_seq(gene_call)
                for gene_call in gene_calls_dict.values()
            ])
            num_gene_calls = len(gene_calls_dict)
            if num_with_aa_seqs != 0 and num_with_aa_seqs != num_gene_calls:
                raise ConfigError(
                    "The gene_calls_dict passed to use_external_gene_calls_to_populate_genes_in_contigs_table "
                    "has %d entries with 'aa_sequence' and %d without. Either 0 or all (%d) should have "
                    "'aa_sequence'" % (num_with_aa_seqs, num_gene_calls -
                                       num_with_aa_seqs, num_gene_calls))

            if not len(gene_calls_dict):
                # but it is empty ... silly user.
                self.run.info_single(
                    "'Use external gene calls' function found an empty gene calls dict, returning "
                    "prematurely and assuming you know what's up. If you don't, stop here and try to "
                    "identify what decisions you've made might have led you to this weird point your "
                    "workflow (or 'life', totally up to you and your mood, but anvi'o thinks you've "
                    "done great so far.",
                    nl_before=1,
                    nl_after=1)
                return

        if (not input_file_path
                and not gene_calls_found) or (input_file_path
                                              and gene_calls_found):
            raise ConfigError(
                "You must provide either an input file, or an gene calls dict to process external "
                "gene calls. You called `use_external_gene_calls_to_populate_genes_in_contigs_table` "
                "with wrong parameters.")

        Table.__init__(self,
                       self.db_path,
                       anvio.__contigs__version__,
                       self.run,
                       self.progress,
                       simple=True)

        # take care of gene calls dict
        if not gene_calls_found:
            expected_fields = t.genes_in_contigs_table_structure
            column_mapping = [int, str, int, int, str, int, int, str, str]

            if 'aa_sequence' in utils.get_columns_of_TAB_delim_file(
                    input_file_path):
                expected_fields = t.genes_in_contigs_table_structure + [
                    'aa_sequence'
                ]
                column_mapping.append(
                    lambda x: ''
                    if x is None else str(x))  # str(None) is 'None', amazingly

            gene_calls_dict = utils.get_TAB_delimited_file_as_dictionary(
                input_file_path,
                expected_fields=expected_fields,
                only_expected_fields=True,
                column_mapping=column_mapping)

            if not len(gene_calls_dict):
                raise ConfigError(
                    "You provided an external gene calls file, but it returned zero gene calls. Assuming that "
                    "this is an error, anvi'o will stop here and complain. If this is not an error and you "
                    "in fact expected this, the proper way of doing this is to use `--skip-gene-calls` flag, "
                    "instead of providing an emtpy external gene calls file. You don't agree? You need this "
                    "for some weird step for you weird pipeline? Let us know, and we will consider changing "
                    "this.")

            self.run.info(
                "External gene calls",
                "%d gene calls recovered and will be processed." %
                len(gene_calls_dict))
        else:
            # FIXME: we need to make sure the gene caller ids in the incoming directory is not going to
            #        overwrite an existing gene call. Something like this would have returned the
            #        current max, which could be cross-checked with what's in the dict:
            #
            #            contigs_db = ContigsDatabase(self.db_path)
            #            next_id = contigs_db.db.get_max_value_in_column('genes_in_contigs', 'gene_callers_id') + 1
            #            contigs_db.disconnect()
            append_to_the_db = True

        # recover amino acid sequences or create a blank dictionary
        if skip_amino_acid_sequences:
            amino_acid_sequences = dict([(g, '') for g in gene_calls_dict])
        else:
            gene_calls_dict, amino_acid_sequences = self.get_amino_acid_sequences_for_genes_in_gene_calls_dict(
                gene_calls_dict,
                ignore_internal_stop_codons=ignore_internal_stop_codons,
                skip_predict_frame=skip_predict_frame,
            )

        # populate genes_in_contigs, and gene_amino_acid_sequences table in contigs db.
        self.populate_genes_in_contigs_table(gene_calls_dict,
                                             amino_acid_sequences,
                                             append_to_the_db=append_to_the_db)
Beispiel #17
0
    def __init__(self, db_path, run=run, progress=progress):
        self.db_path = db_path

        Table.__init__(self, self.db_path,
                       utils.get_required_version_for_db(db_path), run,
                       progress)