Example #1
    def sanity_check_for_fasta_txt(self):
        """ Run sanity checks on the fasta txt file"""

        for name in self.contigs_information.keys():
            u.is_this_name_OK_for_database('fasta.txt entry name',
                                           name,
                                           stringent=True)

        columns = next(iter(self.contigs_information.values()))
        bad_columns = [
            c for c in columns
            if c not in w.get_fields_for_fasta_information()
        ]
        if bad_columns:
            raise ConfigError("Your fasta_txt file contains columns that are \
                               not familiar to us. These are the only columns \
                               that we accept: '%s'. These are the columns that \
                               we don't like in your file: '%s'."                                                                  % (", ".join(w.get_fields_for_fasta_information()), \
                                                                   ", ".join(bad_columns)))

        contigs_with_external_functions_and_no_external_gene_calls = \
                [c for c in self.contigs_information \
                    if self.contigs_information[c].get('gene_functional_annotation')
                    and not self.contigs_information[c].get('external_gene_calls')]
        if contigs_with_external_functions_and_no_external_gene_calls:
            raise ConfigError('You can only provide gene_functional_annotation in your fasta_txt '
                              'if you also provide external_gene_calls. The following entries in "%s" '
                              'only have functions, but no gene calls: "%s".'
                              % (self.fasta_txt_file, ', '.join(contigs_with_external_functions_and_no_external_gene_calls)))
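A minimal, self-contained sketch of the column check above, with made-up column names standing in for whatever w.get_fields_for_fasta_information() actually returns inside anvio. Note that, as written, the check only inspects the columns of the first entry in the dictionary.

    # Hypothetical stand-in for w.get_fields_for_fasta_information(); the real
    # accepted fields live in anvio and are not reproduced here.
    accepted_fields = {'path', 'external_gene_calls', 'gene_functional_annotation'}

    # A made-up fasta_txt-style mapping: entry name -> column/value pairs.
    contigs_information = {
        'genome_01': {'path': 'genome_01.fa', 'coverage': 'cov.txt'},  # 'coverage' is not accepted
        'genome_02': {'path': 'genome_02.fa'},
    }

    # Same logic as the snippet: only the first entry's columns are examined.
    columns = next(iter(contigs_information.values()))
    bad_columns = [c for c in columns if c not in accepted_fields]
    print(bad_columns)  # ['coverage']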
Example #2
    def check_params(self):
        # check the project name:
        if not self.project_name:
            raise ConfigError("Please set a project name, and be prepared to see it around as (1) anvi'o will use\
                                that name to set the output directory and to name various output files such as the\
                                databases that will be generated at the end of the process. If you set your own output\
                                directory name, you can have multiple projects in it and all of those projects can use\
                                the same intermediate files whenever possible.")

        utils.is_this_name_OK_for_database('pan project name', self.project_name, stringent=False)

        # if the user did not set a specific output directory name, use the project name
        # for it:
        self.output_dir = self.output_dir if self.output_dir else self.project_name

        # deal with the output directory:
        try:
            filesnpaths.is_file_exists(self.output_dir)
        except FilesNPathsError:
            filesnpaths.gen_output_directory(self.output_dir, delete_if_exists=self.overwrite_output_destinations)

        filesnpaths.is_output_dir_writable(self.output_dir)
        self.output_dir = os.path.abspath(self.output_dir)

        if not self.log_file_path:
            self.log_file_path = self.get_output_file_path('log.txt')

        filesnpaths.is_output_file_writable(self.log_file_path)
        if os.path.exists(self.log_file_path):
            os.remove(self.log_file_path)

        if not isinstance(self.maxbit, float):
            raise ConfigError("maxbit value must be of type float :(")

        if self.maxbit < 0 or self.maxbit > 1:
            raise ConfigError("Well. maxbit must be between 0 and 1. Yes. Very boring.")

        if not isinstance(self.min_percent_identity, float):
            raise ConfigError("Minimum percent identity value must be of type float :(")

        if self.min_percent_identity < 0 or self.min_percent_identity > 100:
            raise ConfigError("Minimum percent identity must be between 0%% and 100%%. Although your %.2f%% is\
                               pretty cute, too." % self.min_percent_identity)


        if len([c for c in list(self.genomes.values()) if 'genome_hash' not in c]):
            raise ConfigError("self.genomes does not seem to be a properly formatted dictionary for\
                               the anvi'o class Pangenome.")

        if self.enforce_hierarchical_clustering and self.skip_hierarchical_clustering:
            raise ConfigError("You are confusing anvi'o :/ You can't tell anvi'o to skip hierarchical clustering\
                               while also asking it to enforce it.")

        if self.description_file_path:
            filesnpaths.is_file_plain_text(self.description_file_path)
            with open(os.path.abspath(self.description_file_path)) as description_file:
                self.description = description_file.read()

        self.pan_db_path = self.get_output_file_path(self.project_name + '-PAN.db')
Example #3
    def delete(self, collection_name):
        utils.is_this_name_OK_for_database('collection name',
                                           collection_name,
                                           stringent=False)

        # remove any pre-existing information for 'collection_name'
        self.delete_entries_for_key('collection_name', collection_name, [
            t.collections_info_table_name, t.collections_contigs_table_name,
            t.collections_splits_table_name, t.collections_bins_info_table_name
        ])
Example #4
File: panops.py Project: meren/anvio
    def check_project_name(self):
        # check the project name:
        if not self.project_name:
            raise ConfigError("Please set a project name using the `--project-name` parameter, and be prepared to see\
                               it around as anvi'o will use it for multiple things, such as setting the output directory\
                               and naming various output files including the database file that will be generated at the\
                               end of the process. If you set your own output directory name, you can have multiple\
                               projects in it and all of those projects can use the same intermediate files whenever\
                               possible.")

        utils.is_this_name_OK_for_database('pan project name', self.project_name, stringent=False)
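All of these snippets lean on the same validate-then-fail-early pattern. A small usage sketch follows; the import paths (anvio.utils and anvio.errors.ConfigError) are assumptions about a typical anvio installation, and the project name is made up.

    import anvio.utils as utils
    from anvio.errors import ConfigError

    project_name = 'My_Pangenome_01'  # made-up example

    try:
        # Same call as above: the first argument is only a human-readable label
        # used in error messages, the second is the value being validated.
        utils.is_this_name_OK_for_database('pan project name', project_name, stringent=False)
    except ConfigError as e:
        # The function is used for its side effect in the snippets above, i.e.
        # it is expected to raise on a bad name rather than return False.
        print('Not a database-friendly name: %s' % e)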
Example #5
    def sanity_check(self):
        """Make sure self.genomes is good to go"""

        # depending on whether args requested such behavior.
        self.list_HMM_info_and_quit()

        # make sure genes are called in every contigs db:
        genomes_missing_gene_calls = [
            g for g in self.genomes if not self.genomes[g]['genes_are_called']
        ]
        if len(genomes_missing_gene_calls):
            raise ConfigError(
                'Genes must have been called during the generation of contigs database for this workflow to work. However,\
                                these external genomes do not have gene calls: %s'
                % (', '.join(genomes_missing_gene_calls)))

        # if two contigs db has the same hash, we are kinda f'd:
        if len(
                set([
                    self.genomes[genome_name]['genome_hash']
                    for genome_name in self.external_genome_names
                ])) != len(self.external_genome_names):
            raise ConfigError(
                'Not all hash values are unique across all contig databases you provided. Something\
                                very fishy is going on :/')

        if len(
                set([
                    self.genomes[genome_name]['genome_hash']
                    for genome_name in self.internal_genome_names
                ])) != len(self.internal_genome_names):
            raise ConfigError(
                "Not all hash values are unique across internal genomes. This is almost impossible unless "
                "something is very wrong with your workflow :/ Please let the developers know if you can't "
                "figure this one out.")

        # make sure HMMs for SCGs were run for every contigs db:
        genomes_missing_hmms_for_scgs = [
            g for g in self.genomes
            if not self.genomes[g]['hmms_for_scgs_were_run']
        ]
        if len(genomes_missing_hmms_for_scgs):
            if len(genomes_missing_hmms_for_scgs) == len(self.genomes):
                self.run.warning(
                    "The contigs databases you are using for this analysis are missing HMMs for single-copy core genes.\
                                  Maybe you haven't run `anvi-run-hmms` on your contigs database, or they didn't contain any hits.\
                                  It is perfectly legal to have anvi'o contigs databases without HMMs or SCGs for things to work,\
                                  but we wanted to give you heads up so you can have your 'aha' moment if you see funny things in\
                                  the interface.")
            else:
                raise ConfigError("Some of the genomes you have for this analysis are missing HMM hits for SCGs (%d of %d of them, to be precise). You\
                                    can run `anvi-run-hmms` on them to recover from this. Here is the list: %s"                                                                                                                % \
                                                    (len(genomes_missing_hmms_for_scgs), len(self.genomes), ','.join(genomes_missing_hmms_for_scgs)))

        # make sure genome names are not funny (since they are going to end up being db variables soon)
        for genome_name in self.genomes:
            utils.is_this_name_OK_for_database('genome name "%s"' % genome_name, genome_name)
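The duplicate-hash check in the examples above reduces to comparing the size of a set against the size of the original collection; a tiny standalone illustration with made-up hash values:

    # Made-up genome hashes; two of them collide.
    genome_hashes = ['hash_a1b2', 'hash_c3d4', 'hash_a1b2']

    # A set drops duplicates, so a size mismatch means at least two genomes
    # share the same hash, which is exactly what the sanity check above tests.
    if len(set(genome_hashes)) != len(genome_hashes):
        print('duplicate genome hashes detected')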
Example #6
    def sanity_check(self):
        """Make sure self.genomes is good to go"""

        # make sure genes are called in every contigs db:
        genomes_missing_gene_calls = [
            g for g in self.genomes if not self.genomes[g]['genes_are_called']
        ]
        if len(genomes_missing_gene_calls):
            raise ConfigError(
                'Genes must have been called during the generation of contigs database for this workflow to work. However,\
                                these external genomes do not have gene calls: %s'
                % (', '.join(genomes_missing_gene_calls)))

        # if two contigs db has the same hash, we are kinda f'd:
        if len(
                set([
                    self.genomes[genome_name]['genome_hash']
                    for genome_name in self.external_genome_names
                ])) != len(self.external_genome_names):
            raise ConfigError(
                'Not all hash values are unique across all contig databases you provided. Something\
                                very fishy is going on :/')

        if len(
                set([
                    self.genomes[genome_name]['genome_hash']
                    for genome_name in self.internal_genome_names
                ])) != len(self.internal_genome_names):
            raise ConfigError(
                "Not all hash values are unique across internal genomes. This is almost impossible unless "
                "something is very wrong with your workflow :/ Please let the developers know if you can't "
                "figure this one out.")

        # make sure HMMs for SCGs were run for every contigs db:
        genomes_missing_hmms_for_scgs = [
            g for g in self.genomes
            if not self.genomes[g]['hmms_for_scgs_were_run']
        ]
        if len(genomes_missing_hmms_for_scgs):
            if len(genomes_missing_hmms_for_scgs) == len(self.genomes):
                raise ConfigError(
                    "The contigs databases you are using for this analysis are missing HMMs for single-copy core genes. In other words,\
                                    you don't seem to have run `anvi-run-hmms` on them. Although it is perfectly legal to have anvi'o contigs databases\
                                    without HMMs run on SCGs, the current pangenomic workflow does not want to deal with this :( Sorry!"
                )
            else:
                raise ConfigError("Some of the genomes you have for this analysis are missing HMM hits for SCGs (%d of %d of them, to be precise). You\
                                    can run `anvi-run-hmms` on them to recover from this. Here is the list: %s"                                                                                                                % \
                                                    (len(genomes_missing_hmms_for_scgs), len(self.genomes), ','.join(genomes_missing_hmms_for_scgs)))

        # make sure genome names are not funny (since they are going to end up being db variables soon)
        for genome_name in self.genomes:
            utils.is_this_name_OK_for_database('genome name "%s"' % genome_name, genome_name)
Example #7
    def sanity_check(self):
        """Make sure self.genomes is good to go"""

        self.progress.new('Sanity checks')

        # depending on whether args requested such behavior.
        self.progress.update("...")
        self.list_HMM_info_and_quit()

        # make sure genes are called in every contigs db:
        self.progress.update("Checking gene calls ..")
        genomes_missing_gene_calls = [
            g for g in self.genomes if not self.genomes[g]['genes_are_called']
        ]
        if len(genomes_missing_gene_calls):
            self.progress.end()
            raise ConfigError(
                'Genes must have been called during the generation of contigs database for this workflow to work. However,\
                                these external genomes do not have gene calls: %s'
                % (', '.join(genomes_missing_gene_calls)))

        if not self.full_init:
            # if this is not full init, stop the sanity check here.
            self.progress.end()
            self.run.warning(
                "You (or the programmer) requested genome descriptions for your internal and/or external "
                "genomes to be loaded without a 'full init'. There is nothing for you to be concerned. "
                "This is just a friendly reminder to make sure if something goes terribly wrong (like your "
                "computer sets itself on fire), this may be the reason.")

            return

        self.progress.update("Checking HMMs and SCGs ..")
        # make sure HMMs for SCGs were run for every contigs db:
        genomes_missing_hmms_for_scgs = [
            g for g in self.genomes
            if not self.genomes[g]['hmms_for_scgs_were_run']
        ]
        if len(genomes_missing_hmms_for_scgs):
            if len(genomes_missing_hmms_for_scgs) == len(self.genomes):
                self.progress.reset()
                self.run.warning(
                    "The contigs databases you are using for this analysis are missing HMMs for single-copy core genes. "
                    "Maybe you haven't run `anvi-run-hmms` on your contigs database, or they didn't contain any hits. "
                    "It is perfectly legal to have anvi'o contigs databases without HMMs or SCGs for things to work, "
                    "but we wanted to give you heads up so you can have your 'aha' moment if you see funny things in "
                    "the interface.")
            else:
                self.progress.end()
                raise ConfigError("Some of the genomes you have for this analysis are missing HMM hits for SCGs (%d of %d of them, to be precise). You "
                                   "can run `anvi-run-hmms` on them to recover from this. Here is the list: %s" % \
                                                    (len(genomes_missing_hmms_for_scgs), len(self.genomes), ','.join(genomes_missing_hmms_for_scgs)))

        # make sure genome names are not funny (since they are going to end up being db variables soon)
        self.progress.update("Checking genome names ..")
        for genome_name in self.genomes:
            utils.is_this_name_OK_for_database('genome name "%s"' % genome_name, genome_name)

        # figure out whether there are genomes with gene calls that are NOT processed
        self.progress.update("Checking gene calls that are not processed ..")
        genomes_with_non_reported_gene_calls_from_other_gene_callers = []
        for genome_name in self.genomes:
            if self.genomes[genome_name]['gene_calls_from_other_gene_callers']:
                genomes_with_non_reported_gene_calls_from_other_gene_callers.append(
                    genome_name)

        if len(genomes_with_non_reported_gene_calls_from_other_gene_callers):
            info = []
            for genome_name in genomes_with_non_reported_gene_calls_from_other_gene_callers:
                info.append('%s (%s)' % (genome_name,
                                         ', '.join(['%d gene calls by "%s"' % (tpl[1], tpl[0]) for \
                                                         tpl in self.genomes[genome_name]['gene_calls_from_other_gene_callers'].items()])))

            gene_caller = list(self.genomes.values())[0]['gene_caller']
            if anvio.DEBUG:
                self.progress.reset()
                self.run.warning("Some of your genomes had gene calls identified by gene callers other than "
                                 "the gene caller anvi'o used, which was set to '%s' either by default, or because you asked for it. "
                                 "The following genomes contained genes that were not processed (this may be exactly what you expect "
                                 "to happen, but if was not, you may need to use the `--gene-caller` flag to make sure anvi'o is using "
                                 "the gene caller it should be using): %s." % \
                                                (gene_caller, ', '.join(info)), header="PLEASE READ CAREFULLY", lc='green')
            else:
                self.progress.reset()
                self.run.warning("Some of your genomes had gene calls identified by gene callers other than "
                                 "the anvi'o default, '%s', and will not be processed. Use the `--debug` flag "
                                 "if this sounds important and you would like to see more of this message." % \
                                                (gene_caller), header="JUST FYI", lc='green')

        # check whether every genome has at least one gene call.
        self.progress.update(
            "Making sure each genome has at least one gene call ..")
        genomes_with_no_gene_calls = [
            g for g in self.genomes if not self.genomes[g]['num_genes']
        ]
        if len(genomes_with_no_gene_calls):
            self.progress.reset()
            raise ConfigError(
                "Well, %d of your %d genomes had 0 gene calls. We can't think of any reason to include genomes that "
                "contain no gene calls into a genomes, hence, we are going to stop here and ask you to remove these "
                "genomes from your analysis first: %s. If you think this is a dumb thing to do, and they should be "
                "in the genomes storage for reasons you know and we don't, please get in touch with us, and we will "
                "be happy to reconsider. If you think this is happening because you didn't set the right gene caller "
                "you can always take a look at the gene caller sources in a given contigs database by running the "
                "program `anvi-db-info`" %
                (len(genomes_with_no_gene_calls), len(
                    self.genomes), ', '.join(genomes_with_no_gene_calls)))
Example #8
    def sanity_check(self):
        """Make sure self.genomes is good to go"""

        # depending on whether args requested such behavior.
        self.list_HMM_info_and_quit()

        # make sure genes are called in every contigs db:
        genomes_missing_gene_calls = [
            g for g in self.genomes if not self.genomes[g]['genes_are_called']
        ]
        if len(genomes_missing_gene_calls):
            raise ConfigError(
                'Genes must have been called during the generation of contigs database for this workflow to work. However,\
                                these external genomes do not have gene calls: %s'
                % (', '.join(genomes_missing_gene_calls)))

        # if two contigs db has the same hash, we are kinda f'd:
        if len(
                set([
                    self.genomes[genome_name]['genome_hash']
                    for genome_name in self.external_genome_names
                ])) != len(self.external_genome_names):
            raise ConfigError(
                'Not all hash values are unique across all contig databases you provided. Something\
                                very fishy is going on :/')

        if len(
                set([
                    self.genomes[genome_name]['genome_hash']
                    for genome_name in self.internal_genome_names
                ])) != len(self.internal_genome_names):
            raise ConfigError(
                "Not all hash values are unique across internal genomes. This is almost impossible unless something "
                "is very wrong with your workflow :/ It is most likely that you managed to list the same information "
                "for different genome names. Please double check whether your internal genomes file looks perfectly "
                "fine. If it does, then perhaps let the developers know about the problem.")

        if not self.full_init:
            # if this is not full init, stop the sanity check here.
            self.run.warning(
                "You (or the programmer) requested genome descriptions for your internal and/or external "
                "genomes to be loaded without a 'full init'. There is nothing for you to be concerned about. "
                "This is just a friendly reminder so that if something goes terribly wrong (like your "
                "computer setting itself on fire), you know this may be the reason.")
            return

        # make sure HMMs for SCGs were run for every contigs db:
        genomes_missing_hmms_for_scgs = [
            g for g in self.genomes
            if not self.genomes[g]['hmms_for_scgs_were_run']
        ]
        if len(genomes_missing_hmms_for_scgs):
            if len(genomes_missing_hmms_for_scgs) == len(self.genomes):
                self.run.warning(
                    "The contigs databases you are using for this analysis are missing HMMs for single-copy core genes.\
                                  Maybe you haven't run `anvi-run-hmms` on your contigs database, or they didn't contain any hits.\
                                  It is perfectly legal to have anvi'o contigs databases without HMMs or SCGs for things to work,\
                                  but we wanted to give you heads up so you can have your 'aha' moment if you see funny things in\
                                  the interface.")
            else:
                raise ConfigError("Some of the genomes you have for this analysis are missing HMM hits for SCGs (%d of %d of them, to be precise). You\
                                    can run `anvi-run-hmms` on them to recover from this. Here is the list: %s"                                                                                                                % \
                                                    (len(genomes_missing_hmms_for_scgs), len(self.genomes), ','.join(genomes_missing_hmms_for_scgs)))

        # make sure genome names are not funny (since they are going to end up being db variables soon)
        for genome_name in self.genomes:
            utils.is_this_name_OK_for_database('genome name "%s"' % genome_name, genome_name)

        # figure out whether there are genomes with gene calls that are NOT processed
        genomes_with_non_reported_gene_calls_from_other_gene_callers = []
        for genome_name in self.genomes:
            if self.genomes[genome_name]['gene_calls_from_other_gene_callers']:
                genomes_with_non_reported_gene_calls_from_other_gene_callers.append(
                    genome_name)

        if len(genomes_with_non_reported_gene_calls_from_other_gene_callers):
            info = []
            for genome_name in genomes_with_non_reported_gene_calls_from_other_gene_callers:
                info.append('%s (%s)' % (genome_name,
                                         ', '.join(['%d gene calls by "%s"' % (tpl[1], tpl[0]) for \
                                                         tpl in self.genomes[genome_name]['gene_calls_from_other_gene_callers'].items()])))

            gene_caller = list(self.genomes.values())[0]['gene_caller']
            if anvio.DEBUG:
                self.run.warning("Some of your genomes had gene calls identified by gene callers other than\
                                  the gene caller anvi'o used, which was set to '%s' either by default, or because you asked for it.\
                                  The following genomes contained genes that were not processed (this may be exactly what you expect\
                                  to happen, but if was not, you may need to use the `--gene-caller` flag to make sure anvi'o is using\
                                  the gene caller it should be using): %s."                                                                            % \
                                                (gene_caller, ', '.join(info)), header="PLEASE READ CAREFULLY", lc='green')
            else:
                self.run.warning("Some of your genomes had gene calls identified by gene callers other than\
                                  the anvi'o default, '%s', and will not be processed. Use the `--debug` flag\
                                  if this sounds important and you would like to see more of this message."                                                                                                            % \
                                                (gene_caller), header="JUST FYI", lc='green')

        # check whether every genome has at least one gene call.
        genomes_with_no_gene_calls = [
            g for g in self.genomes if not self.genomes[g]['num_genes']
        ]
        if len(genomes_with_no_gene_calls):
            raise ConfigError(
                "Well, %d of your %d genomes had 0 gene calls. We can't think of any reason to include genomes that\
                               contain no gene calls into a genomes storage, hence, we are going to stop here and ask you to remove these\
                               genomes from your analysis first: %s. If you think this is a dumb thing to do, and they should be\
                               in the genomes storage for reasons you know and we don't, please get in touch with us, and we will\
                               be happy to reconsider." %
                (len(genomes_with_no_gene_calls), len(
                    self.genomes), ', '.join(genomes_with_no_gene_calls)))
Example #9
    def append(self, collection_name, collection_dict, bins_info_dict={}):
        utils.is_this_name_OK_for_database('collection name', collection_name, stringent=False)

        for bin_name in collection_dict:
            utils.is_this_name_OK_for_database('bin name', bin_name, stringent=False)

        if bins_info_dict:
            if set(collection_dict.keys()) - set(bins_info_dict.keys()):
                raise ConfigError('Bins in the collection dict do not match the ones in the bins info dict.\
                                    They do not have to be identical, but for each bin id, there must be a unique\
                                    entry in the bins information dict. There is something wrong with your input :/')

        # remove any pre-existing information for 'collection_name'
        self.delete(collection_name)

        num_splits_in_collection_dict = sum([len(splits) for splits in list(collection_dict.values())])
        splits_in_collection_dict = set(list(chain.from_iterable(list(collection_dict.values()))))
        if len(splits_in_collection_dict) != num_splits_in_collection_dict:
            raise ConfigError("TablesForCollections::append: %d of the split or contig IDs appear more than once in\
                                your collections input. It is unclear to anvi'o how you managed to do this, but we\
                                cannot go anywhere with this :/" % (num_splits_in_collection_dict - len(splits_in_collection_dict)))

        database = db.DB(self.db_path, utils.get_required_version_for_db(self.db_path))

        # how many clusters are defined in 'collection_dict'?
        bin_names = list(collection_dict.keys())

        # push information about this search result into the search_info table.
        db_entries = tuple([collection_name, num_splits_in_collection_dict, len(bin_names), ','.join(bin_names)])
        database._exec('''INSERT INTO %s VALUES (?,?,?,?)''' % t.collections_info_table_name, db_entries)

        if not bins_info_dict:
            colors = utils.get_random_colors_dict(bin_names)
            for bin_name in bin_names:
                bins_info_dict[bin_name] = {'html_color': colors[bin_name], 'source': 'UNKNOWN'}

        # populate bins info table.
        db_entries = [(self.next_id(t.collections_bins_info_table_name), collection_name, b, bins_info_dict[b]['source'], bins_info_dict[b]['html_color']) for b in bin_names]
        database._exec_many('''INSERT INTO %s VALUES (?,?,?,?,?)''' % t.collections_bins_info_table_name, db_entries)

        # populate splits table
        db_entries = []
        for bin_name in collection_dict:
            for split_name in collection_dict[bin_name]:
                db_entries.append(tuple([self.next_id(t.collections_splits_table_name), collection_name, split_name, bin_name]))
        database._exec_many('''INSERT INTO %s VALUES (?,?,?,?)''' % t.collections_splits_table_name, db_entries)
        num_splits = len(db_entries)


        # FIXME: This function can be called to populate the contigs database (via anvi-populate-collections), or
        # the profile database. when it is contigs database, the superclass Table has the self.splits_info variable
        # set when it is initialized. however, the Table instance is missing self.splits_info when it is initialized with
        # the profile database. hence some special controls for contigs db (note that collections_contigs_table is
        # only populated in the contigs database):
        if self.db_type == 'contigs':
            splits_only_in_collection_dict = [c for c in splits_in_collection_dict if c not in self.splits_info]
            splits_only_in_db = [c for c in self.splits_info if c not in splits_in_collection_dict]

            if len(splits_only_in_collection_dict):
                self.run.warning('%d of %d splits found in "%s" results are not in the database. This may be OK,\
                                          but you must be the judge of it. If this is somewhat surprising, please use caution\
                                          and make sure all is fine before going forward with your analysis.'\
                                                % (len(splits_only_in_collection_dict), len(splits_in_collection_dict), collection_name))

            if len(splits_only_in_db):
                self.run.warning('%d of %d splits found in the database were missing from the "%s" results. If this\
                                          does not make any sense, please make sure you know why before going any further.'\
                                                % (len(splits_only_in_db), len(self.splits_info), collection_name))

            # then populate contigs table.
            db_entries = self.process_contigs(collection_name, collection_dict)
            database._exec_many('''INSERT INTO %s VALUES (?,?,?,?)''' % t.collections_contigs_table_name, db_entries)

        database.disconnect()

        num_bins = len(bin_names)
        num_bins_to_report = 50
        if num_bins <= num_bins_to_report:
            bins_to_report = bin_names
            bin_report_msg = "Here is a full list of the bin names in this collection: {}.".format(",".join(bins_to_report))
        else:
            bins_to_report = bin_names[:num_bins_to_report]
            bin_report_msg = "Here is a list of the first {} bin names in this collection: {}.".format(num_bins_to_report, ",".join(bins_to_report))

        self.run.info('Collections', 'The collection "%s" that describes %s splits and %s bins has been successfully added to the\
                                      database at "%s". %s' % (collection_name, pp(num_splits), pp(num_bins), self.db_path, bin_report_msg), mc='green')
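For reference, the input shapes append() expects, with made-up bin and split names; the two assertions mirror the sanity checks performed at the top of the method.

    from itertools import chain

    # collection_dict maps a bin name to the list of split names it contains.
    collection_dict = {
        'Bin_1': ['split_00001', 'split_00002'],
        'Bin_2': ['split_00003'],
    }

    # bins_info_dict is optional; when given, every bin in collection_dict must
    # have an entry here.
    bins_info_dict = {
        'Bin_1': {'html_color': '#ff0000', 'source': 'CONCOCT'},
        'Bin_2': {'html_color': '#00ff00', 'source': 'CONCOCT'},
    }

    # Mirrors the checks in append(): bin info covers every bin, and no split
    # name is listed under more than one bin.
    assert not (set(collection_dict) - set(bins_info_dict))
    all_splits = list(chain.from_iterable(collection_dict.values()))
    assert len(all_splits) == len(set(all_splits))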
Example #10
    def delete(self, collection_name):
        utils.is_this_name_OK_for_database('collection name', collection_name, stringent=False)

        # remove any pre-existing information for 'collection_name'
        self.delete_entries_for_key('collection_name', collection_name, [t.collections_info_table_name, t.collections_contigs_table_name, t.collections_splits_table_name, t.collections_bins_info_table_name])
Example #11
    def sanity_check(self):
        """Make sure self.genomes is good to go"""

        # depending on whether args requested such behavior.
        self.list_HMM_info_and_quit()

        # make sure genes are called in every contigs db:
        genomes_missing_gene_calls = [g for g in self.genomes if not self.genomes[g]['genes_are_called']]
        if len(genomes_missing_gene_calls):
            raise ConfigError('Genes must have been called during the generation of contigs database for this workflow to work. However,\
                                these external genomes do not have gene calls: %s' % (', '.join(genomes_missing_gene_calls)))

        # if two contigs db has the same hash, we are kinda f'd:
        if len(set([self.genomes[genome_name]['genome_hash'] for genome_name in self.external_genome_names])) != len(self.external_genome_names):
            raise ConfigError('Not all hash values are unique across all contig databases you provided. Something\
                                very fishy is going on :/')


        if len(set([self.genomes[genome_name]['genome_hash'] for genome_name in self.internal_genome_names])) != len(self.internal_genome_names):
            raise ConfigError("Not all hash values are unique across internal genomes. This is almost impossible to happen unless something very\
                                wrong with your workflow :/ Please let the developers know if you can't figure this one out")

        if not self.full_init:
            # if this is not full init, stop the sanity check here.
            self.run.warning("You (or the programmer) requested genome descriptions for your internal and/or external\
                              genomes to be loaded without a 'full init'. There is nothing for you to be concerned.\
                              This is just a friendly reminder to make sure if something goes terribly wrong (like your\
                              computer sets itself on fire), this may be the reason.")
            return

        # make sure HMMs for SCGs were run for every contigs db:
        genomes_missing_hmms_for_scgs =  [g for g in self.genomes if not self.genomes[g]['hmms_for_scgs_were_run']]
        if len(genomes_missing_hmms_for_scgs):
            if len(genomes_missing_hmms_for_scgs) == len(self.genomes):
                self.run.warning("The contigs databases you are using for this analysis are missing HMMs for single-copy core genes.\
                                  Maybe you haven't run `anvi-run-hmms` on your contigs database, or they didn't contain any hits.\
                                  It is perfectly legal to have anvi'o contigs databases without HMMs or SCGs for things to work,\
                                  but we wanted to give you heads up so you can have your 'aha' moment if you see funny things in\
                                  the interface.")
            else:
                raise ConfigError("Some of the genomes you have for this analysis are missing HMM hits for SCGs (%d of %d of them, to be precise). You\
                                    can run `anvi-run-hmms` on them to recover from this. Here is the list: %s" % \
                                                    (len(genomes_missing_hmms_for_scgs), len(self.genomes), ','.join(genomes_missing_hmms_for_scgs)))

        # make sure genome names are not funny (since they are going to end up being db variables soon)
        [utils.is_this_name_OK_for_database('genome name "%s"' % genome_name, genome_name) for genome_name in self.genomes]

        # figure out whether there are genomes with gene calls that are NOT processed
        genomes_with_non_reported_gene_calls_from_other_gene_callers = []
        for genome_name in self.genomes:
            if self.genomes[genome_name]['gene_calls_from_other_gene_callers']:
                genomes_with_non_reported_gene_calls_from_other_gene_callers.append(genome_name)

        if len(genomes_with_non_reported_gene_calls_from_other_gene_callers):
            info = []
            for genome_name in genomes_with_non_reported_gene_calls_from_other_gene_callers:
                info.append('%s (%s)' % (genome_name,
                                         ', '.join(['%d gene calls by "%s"' % (tpl[1], tpl[0]) for \
                                                         tpl in self.genomes[genome_name]['gene_calls_from_other_gene_callers'].items()])))

            gene_caller = list(self.genomes.values())[0]['gene_caller']
            self.run.warning("Some of your genomes had gene calls identified by gene callers other than\
                              the gene caller anvi'o used, which was set to '%s' either by default, or because you asked for it.\
                              The following genomes contained genes that were not processed (this may be exactly what you expect\
                              to happen, but if it was not, you may need to use the `--gene-caller` flag to make sure anvi'o is using\
                              the gene caller it should be using): %s." % \
                                            (gene_caller, ', '.join(info)), header="PLEASE READ CAREFULLY", lc='green')

        # check whether every genome has at least one gene call.
        genomes_with_no_gene_calls = [g for g in self.genomes if not self.genomes[g]['num_genes']]
        if len(genomes_with_no_gene_calls):
            raise ConfigError("Well, %d of your %d genomes had 0 gene calls. We can't think of any reason to include genomes that\
                               contain no gene calls into a genomes storage, hence, we are going to stop here and ask you to remove these\
                               genomes from your analysis first: %s. If you think this is a dumb thing to do, and they should be\
                               in the genomes storage for reasons you know and we don't, please get in touch with us, and we will\
                               be happy to reconsider." % (len(genomes_with_no_gene_calls), len(self.genomes), ', '.join(genomes_with_no_gene_calls)))
Example #12
    def append(self,
               collection_name,
               collection_dict,
               bins_info_dict={},
               drop_collection=True):
        utils.is_this_name_OK_for_database('collection name',
                                           collection_name,
                                           stringent=False)

        for bin_name in collection_dict:
            utils.is_this_name_OK_for_database('bin name',
                                               bin_name,
                                               stringent=False)

        if bins_info_dict:
            if set(collection_dict.keys()) - set(bins_info_dict.keys()):
                raise ConfigError(
                    'Bins in the collection dict do not match the ones in the bins info dict. '
                    'They do not have to be identical, but for each bin id, there must be a unique '
                    'entry in the bins information dict. There is something wrong with your input :/'
                )

        if drop_collection:
            # remove any pre-existing information for 'collection_name'
            self.delete(collection_name)

        num_splits_in_collection_dict = sum(
            [len(splits) for splits in list(collection_dict.values())])
        splits_in_collection_dict = set(
            list(chain.from_iterable(list(collection_dict.values()))))
        if len(splits_in_collection_dict) != num_splits_in_collection_dict:
            raise ConfigError(
                "TablesForCollections::append: %d of the split or contig IDs appear more than once in "
                "your collections input. It is unclear to anvi'o how did you manage to do this, but we "
                "cannot go anywhere with this :/" %
                (num_splits_in_collection_dict -
                 len(splits_in_collection_dict)))

        database = db.DB(self.db_path,
                         utils.get_required_version_for_db(self.db_path))

        # how many clusters are defined in 'collection_dict'?
        bin_names = list(collection_dict.keys())

        if drop_collection:
            db_entries = tuple([
                collection_name, num_splits_in_collection_dict,
                len(bin_names), ','.join(bin_names)
            ])
            database._exec(
                '''INSERT INTO %s VALUES (?,?,?,?)''' %
                t.collections_info_table_name, db_entries)

        if not bins_info_dict:
            colors = utils.get_random_colors_dict(bin_names)
            for bin_name in bin_names:
                bins_info_dict[bin_name] = {
                    'html_color': colors[bin_name],
                    'source': 'UNKNOWN'
                }

        # populate bins info table.
        db_entries = [(collection_name, b, bins_info_dict[b]['source'],
                       bins_info_dict[b]['html_color']) for b in bin_names]
        database._exec_many(
            '''INSERT INTO %s VALUES (?,?,?,?)''' %
            t.collections_bins_info_table_name, db_entries)

        # populate splits table
        db_entries = []
        for bin_name in collection_dict:
            for split_name in collection_dict[bin_name]:
                db_entries.append(
                    tuple([collection_name, split_name, bin_name]))
        database._exec_many(
            '''INSERT INTO %s VALUES (?,?,?)''' %
            t.collections_splits_table_name, db_entries)
        num_splits = len(db_entries)

        # FIXME: This function can be called to populate the contigs database (via anvi-populate-collections), or
        # the profile database. when it is contigs database, the superclass Table has the self.splits_info variable
        # set when it is initialized. however, the Table instance is missing self.splits_info when it is initialized with
        # the profile database. hence some special controls for contigs db (note that collections_contigs_table is
        # only populated in the contigs database):
        if self.db_type == 'contigs':
            splits_only_in_collection_dict = [
                c for c in splits_in_collection_dict
                if c not in self.splits_info
            ]
            splits_only_in_db = [
                c for c in self.splits_info
                if c not in splits_in_collection_dict
            ]

            if len(splits_only_in_collection_dict):
                self.run.warning('%d of %d splits found in "%s" results are not in the database. This may be OK,\
                                          but you must be the judge of it. If this is somewhat surprising, please use caution\
                                          and make sure all is fine before going forward with your analysis.'\
                                                % (len(splits_only_in_collection_dict), len(splits_in_collection_dict), collection_name))

            if len(splits_only_in_db):
                self.run.warning('%d of %d splits found in the database were missing from the "%s" results. If this '
                                         'does not make any sense, please make sure you know why before going any further.'\
                                                % (len(splits_only_in_db), len(self.splits_info), collection_name))

            # then populate contigs table.
            db_entries = self.process_contigs(collection_name, collection_dict)
            database._exec_many(
                '''INSERT INTO %s VALUES (?,?,?,?)''' %
                t.collections_contigs_table_name, db_entries)

        database.disconnect()

        num_bins = len(bin_names)
        num_bins_to_report = 50
        if not drop_collection:
            bins_to_report = bin_names
            bin_report_msg = "Here is a full list of the bin names added to this collection: {}.".format(
                ", ".join(bins_to_report))
        elif num_bins <= num_bins_to_report:
            bins_to_report = bin_names
            bin_report_msg = "Here is a full list of the bin names in this collection: {}.".format(
                ", ".join(bins_to_report))
        else:
            bins_to_report = bin_names[:num_bins_to_report]
            bin_report_msg = "Here is a list of the first {} bin names in this collection: {}.".format(
                num_bins_to_report, ", ".join(bins_to_report))

        if drop_collection:
            self.run.info(
                'Collections',
                'The collection "%s" that describes %s splits and %s bins has been successfully added to the\
                                          database at "%s". %s' %
                (collection_name, pp(num_splits), pp(num_bins), self.db_path,
                 bin_report_msg),
                mc='green')
        else:
            self.run.info(
                'Collections',
                'The existing collection "%s" updated, %s splits and %s bins has been successfully added to the\
                                          database at "%s". %s' %
                (collection_name, pp(num_splits), pp(num_bins), self.db_path,
                 bin_report_msg),
                mc='green')
Example #13
    def append(self,
               collection_name,
               collection_dict,
               bins_info_dict={},
               drop_collection=True):
        utils.is_this_name_OK_for_database('collection name',
                                           collection_name,
                                           stringent=False)

        for bin_name in collection_dict:
            utils.is_this_name_OK_for_database('bin name',
                                               bin_name,
                                               stringent=False)

        if bins_info_dict:
            if set(collection_dict.keys()) - set(bins_info_dict.keys()):
                raise ConfigError(
                    f"Bins in the collection dict do not match to the ones in the bins info dict. "
                    f"They do not have to be identical, but for each bin id, there must be a unique "
                    f"entry in the bins informaiton dict. There is something wrong with your input :/"
                )

        if drop_collection:
            # remove any pre-existing information for 'collection_name'
            self.delete(collection_name)

        num_splits_in_collection_dict = sum(
            [len(splits) for splits in list(collection_dict.values())])
        splits_in_collection_dict = set(
            list(chain.from_iterable(list(collection_dict.values()))))
        if len(splits_in_collection_dict) != num_splits_in_collection_dict:
            raise ConfigError(
                f"TablesForCollections::append: {(num_splits_in_collection_dict - len(splits_in_collection_dict))} "
                f"split names or contig IDs appear more than once in your input for this collection. This part of "
                f"the code is unable to predict how you may have ended up here, but check your input file maybe? :/"
            )

        database = db.DB(self.db_path,
                         utils.get_required_version_for_db(self.db_path))

        # how many clusters are defined in 'collection_dict'?
        bin_names = list(collection_dict.keys())

        if drop_collection:
            db_entries = tuple([
                collection_name, num_splits_in_collection_dict,
                len(bin_names), ','.join(bin_names)
            ])
            database._exec(
                '''INSERT INTO %s VALUES (?,?,?,?)''' %
                t.collections_info_table_name, db_entries)

        if not bins_info_dict:
            colors = utils.get_random_colors_dict(bin_names)
            for bin_name in bin_names:
                bins_info_dict[bin_name] = {
                    'html_color': colors[bin_name],
                    'source': 'UNKNOWN'
                }

        # populate bins info table.
        db_entries = [(collection_name, b, bins_info_dict[b]['source'],
                       bins_info_dict[b]['html_color']) for b in bin_names]
        database._exec_many(
            '''INSERT INTO %s VALUES (?,?,?,?)''' %
            t.collections_bins_info_table_name, db_entries)

        # populate splits table
        db_entries = []
        for bin_name in collection_dict:
            for split_name in collection_dict[bin_name]:
                db_entries.append(
                    tuple([collection_name, split_name, bin_name]))
        database._exec_many(
            '''INSERT INTO %s VALUES (?,?,?)''' %
            t.collections_splits_table_name, db_entries)
        num_splits = len(db_entries)

        # FIXME: This function can be called to populate the contigs database (via anvi-populate-collections), or
        # the profile database. when it is contigs database, the superclass Table has the self.splits_info variable
        # set when it is initialized. however, the Table instance is missing self.splits_info when it is initialized with
        # the profile database. hence some special controls for contigs db (note that collections_contigs_table is
        # only populated in the contigs database):
        if self.db_type == 'contigs':
            splits_only_in_collection_dict = [
                c for c in splits_in_collection_dict
                if c not in self.splits_info
            ]
            splits_only_in_db = [
                c for c in self.splits_info
                if c not in splits_in_collection_dict
            ]

            if len(splits_only_in_collection_dict):
                self.run.warning(
                    f"{len(splits_only_in_collection_dict)} of {len(splits_in_collection_dict)} splits found in "
                    f"collection '{collection_name}' are not known to the contigs database. This may be OK, but "
                    f"you must be the judge of it. If this surprises you, please use caution and make sure all "
                    f"is fine before going forward with you analysis.")

            if len(splits_only_in_db):
                self.run.warning('%d of %d splits found in the database were missing from the "%s" results. If this '
                                         'does not make any sense, please make sure you know why before going any further.'\
                                                % (len(splits_only_in_db), len(self.splits_info), collection_name))

            # then populate contigs table.
            db_entries = self.process_contigs(collection_name, collection_dict)
            database._exec_many(
                '''INSERT INTO %s VALUES (?,?,?,?)''' %
                t.collections_contigs_table_name, db_entries)

        database.disconnect()

        num_bins = len(bin_names)
        num_bins_to_report = 50
        if not drop_collection:
            bins_to_report = bin_names
            bin_report_msg = f"Here is a full list of the bin names added to this collection: {', '.join(bins_to_report)}."
        elif num_bins <= num_bins_to_report:
            bins_to_report = bin_names
            bin_report_msg = f"Here is a full list of the bin names in this collection: {', '.join(bins_to_report)}."
        else:
            bins_to_report = bin_names[:num_bins_to_report]
            bin_report_msg = f"Here is a list of the first {P('bin name', num_bins_to_report)} in this collection: {', '.join(bins_to_report)}."

        if drop_collection:
            self.run.info(
                'Collections',
                f"The collection '{collection_name}' that describes {P('split', num_splits)} in {P('bin', num_bins)} was successfully "
                f"added to the to the database at '{self.db_path}'. {bin_report_msg}",
                mc='green')
        else:
            self.run.info(
                'Collections',
                f"The existing collection '{collection_name}' updated and {P('split', num_splits)} in {P('bin', num_bins)} were successfully "
                f"added to the to the database at '{self.db_path}'. {bin_report_msg}",
                mc='green')
Example #14
    def sanity_check(self):
        """Make sure self.genomes is good to go"""

        # depending on whether args requested such behavior.
        self.list_HMM_info_and_quit()

        # make sure genes are called in every contigs db:
        genomes_missing_gene_calls = [g for g in self.genomes if not self.genomes[g]['genes_are_called']]
        if len(genomes_missing_gene_calls):
            raise ConfigError('Genes must have been called during the generation of contigs database for this workflow to work. However,\
                                these external genomes do not have gene calls: %s' % (', '.join(genomes_missing_gene_calls)))

        # if two contigs db has the same hash, we are kinda f'd:
        if len(set([self.genomes[genome_name]['genome_hash'] for genome_name in self.external_genome_names])) != len(self.external_genome_names):
            raise ConfigError('Not all hash values are unique across all contig databases you provided. Something\
                                very fishy is going on :/')


        if len(set([self.genomes[genome_name]['genome_hash'] for genome_name in self.internal_genome_names])) != len(self.internal_genome_names):
            raise ConfigError("Not all hash values are unique across internal genomes. This is almost impossible to happen unless something very\
                                wrong with your workflow :/ Please let the developers know if you can't figure this one out")

        # make sure HMMs for SCGs were run for every contigs db:
        genomes_missing_hmms_for_scgs =  [g for g in self.genomes if not self.genomes[g]['hmms_for_scgs_were_run']]
        if len(genomes_missing_hmms_for_scgs):
            if len(genomes_missing_hmms_for_scgs) == len(self.genomes):
                self.run.warning("The contigs databases you are using for this analysis are missing HMMs for single-copy core genes.\
                                  Maybe you haven't run `anvi-run-hmms` on your contigs database, or they didn't contain any hits.\
                                  It is perfectly legal to have anvi'o contigs databases without HMMs or SCGs for things to work,\
                                  but we wanted to give you heads up so you can have your 'aha' moment if you see funny things in\
                                  the interface.")
            else:
                raise ConfigError("Some of the genomes you have for this analysis are missing HMM hits for SCGs (%d of %d of them, to be precise). You\
                                    can run `anvi-run-hmms` on them to recover from this. Here is the list: %s" % \
                                                    (len(genomes_missing_hmms_for_scgs), len(self.genomes), ','.join(genomes_missing_hmms_for_scgs)))

        # make sure genome names are not funny (since they are going to end up being db variables soon)
        [utils.is_this_name_OK_for_database('genome name "%s"' % genome_name, genome_name) for genome_name in self.genomes]

        # figure out whether there are genomes with gene calls that are NOT processed
        genomes_with_non_reported_gene_calls_from_other_gene_callers = []
        for genome_name in self.genomes:
            if self.genomes[genome_name]['gene_calls_from_other_gene_callers']:
                genomes_with_non_reported_gene_calls_from_other_gene_callers.append(genome_name)

        if len(genomes_with_non_reported_gene_calls_from_other_gene_callers):
            info = []
            for genome_name in genomes_with_non_reported_gene_calls_from_other_gene_callers:
                info.append('%s (%s)' % (genome_name,
                                         ', '.join(['%d gene calls by "%s"' % (tpl[1], tpl[0]) for \
                                                         tpl in self.genomes[genome_name]['gene_calls_from_other_gene_callers'].items()])))

            self.run.warning("PLEASE READ CAREFULLY. Some of your genomes had gene calls identified by gene callers other than\
                              the gene caller anvi'o used (which should be 'prodigal' unless you specified another one). As a\
                              result, the following genomes contained gene calls coming from other gene callers that did not\
                              get processed. This may be exactly what you expected to happen, but if it was not, you may need to\
                              use the `--gene-caller` flag to make sure anvi'o is using the gene caller it should be using. Here\
                              is the list: %s." % (', '.join(info)), lc='green')

        # check whether every genome has at least one gene call.
        genomes_with_no_gene_calls = [g for g in self.genomes if not self.genomes[g]['num_genes']]
        if len(genomes_with_no_gene_calls):
            raise ConfigError("Well, %d of your %d genomes had 0 gene calls. We can't think of any reason to include genomes that\
                               contain no gene calls into a genomes storage, hence, we are going to stop here and ask you to remove these\
                               genomes from your analysis first: %s. If you think this is a dumb thing to do, and they should be\
                               in the genomes storage for reasons you know and we don't, please get in touch with us, and we will\
                               be happy to reconsider." % (len(genomes_with_no_gene_calls), len(self.genomes), ', '.join(genomes_with_no_gene_calls)))