Exemple #1
0
    def __init__(self, db_path, num_threads_to_use=1, run=run, progress=progress):
        """Set up the table handler on top of the contigs database at `db_path`.

        The path is checked with `utils.is_contigs_db`, `Table.__init__` is run
        with `anvio.__contigs__version__`, gene calls are loaded through
        `init_gene_calls_dict`, and next available ids are requested for the
        HMM hits and HMM hits-in-splits tables.

        Raises ConfigError when `self.genes_are_called` is false, or when no
        gene calls end up in `self.gene_calls_dict`.
        """
        self.db_path = db_path
        self.num_threads_to_use = num_threads_to_use

        # bail out early if the path does not point to a contigs database
        utils.is_contigs_db(self.db_path)

        Table.__init__(self, self.db_path, anvio.__contigs__version__, run, progress)

        # presumably `genes_are_called` is populated by Table.__init__ -- it is
        # not assigned anywhere in this method
        if not self.genes_are_called:
            raise ConfigError(
                "It seems the contigs database '%s' was created with '--skip-gene-calling' flag.\
                                Nothing to do here :/" % (self.db_path))

        self.init_gene_calls_dict()

        if len(self.gene_calls_dict) == 0:
            raise ConfigError(
                "Tables that should contain gene calls are empty. Which probably means the gene\
                                caller reported no genes for your contigs.")

        for table_name in (t.hmm_hits_table_name, t.hmm_hits_splits_table_name):
            self.set_next_available_id(table_name)
Exemple #2
0
    def populate_genes_in_contigs_table(self,
                                        gene_calls_dict,
                                        amino_acid_sequences,
                                        append_to_the_db=False):
        """Store gene calls and their amino acid sequences in the contigs database.

        `gene_calls_dict` is keyed by gene caller id; each value must provide
        every column named in `t.genes_in_contigs_table_structure[1:]` as well
        as a 'source' key. `amino_acid_sequences` is indexed by the same gene
        caller ids.

        When `append_to_the_db` is False, the genes-in-contigs and the amino
        acid sequences tables are emptied first. When it is True, only the
        entries that belong to the incoming sources are removed from the
        genes-in-contigs and genes-in-splits tables before inserting.
        """
        utils.is_contigs_db(self.db_path)
        required_version = utils.get_required_version_for_db(self.db_path)
        database = db.DB(self.db_path, required_version)

        if append_to_the_db:
            # append mode: entries already stored for any of the incoming sources
            # have to go first, otherwise the tables would collect duplicates.
            incoming_sources = {call['source'] for call in gene_calls_dict.values()}

            for source in incoming_sources:
                stale_gene_ids = database.get_single_column_from_table(
                    t.genes_in_contigs_table_name,
                    'gene_callers_id',
                    where_clause="""source='%s'""" % source)

                if not stale_gene_ids:
                    continue

                # the same id list is used for both tables
                id_list = ','.join([str(g) for g in stale_gene_ids])
                for table_name in (t.genes_in_contigs_table_name, t.genes_in_splits_table_name):
                    database._exec('''DELETE FROM %s WHERE gene_callers_id IN (%s)''' % (table_name, id_list))
        else:
            # fresh start: wipe both tables this method populates
            database._exec('''DELETE FROM %s''' % (t.genes_in_contigs_table_name))
            database._exec('''DELETE FROM %s''' % (t.gene_amino_acid_sequences_table_name))

        self.progress.new('Processing')
        self.progress.update('Entering %d gene calls into the db ...' % (len(gene_calls_dict)))

        # one row per gene call: the id followed by the remaining table columns
        value_columns = t.genes_in_contigs_table_structure[1:]
        gene_rows = [
            tuple([gene_id] + [gene_calls_dict[gene_id][column] for column in value_columns])
            for gene_id in gene_calls_dict
        ]
        database._exec_many(
            '''INSERT INTO %s VALUES (?,?,?,?,?,?,?,?,?)''' % t.genes_in_contigs_table_name,
            gene_rows)

        sequence_rows = [(gene_id, amino_acid_sequences[gene_id]) for gene_id in gene_calls_dict]
        database._exec_many(
            '''INSERT INTO %s VALUES (?,?)''' % t.gene_amino_acid_sequences_table_name,
            sequence_rows)

        self.progress.end()

        database.disconnect()
Exemple #3
0
    def init(self):
        """Initializes information about the contigs databases.

        For every domain in `self.SCG_domains`, the paths matched by `*` under
        `<self.genomes_dir>/<domain>` are stored in `self.contigs_dbs[domain]`
        and each of them is checked with `utils.is_contigs_db`. A warning is
        shown for any domain that has fewer than 20 entries.

        Raises ConfigError if a domain subdirectory is missing or empty.
        """

        D = lambda domain: os.path.join(self.genomes_dir, domain)

        self.progress.new('Training')
        self.progress.update("Making sure all domain subdirectories are present")
        missing_domain_dirs = [domain for domain in self.SCG_domains if not os.path.exists(D(domain))]
        if len(missing_domain_dirs):
            # close the progress display before raising, the same way the
            # empty-directory check further down does
            self.progress.end()
            raise ConfigError("Genomes directory is missing subdirectories for these domains: '%s'." % ', '.join(missing_domain_dirs))

        self.progress.update("Learning about the number of contigs databases in each domain subdirectory")
        for domain in self.SCG_domains:
            self.contigs_dbs[domain] = glob.glob(os.path.join(D(domain), '*'))

            if len(self.contigs_dbs[domain]) == 0:
                self.progress.end()
                raise ConfigError("Each domain subdirectory must include at least one contigs database in it :/")

            if len(self.contigs_dbs[domain]) < 20:
                # reset the progress display before printing the warning
                self.progress.reset()
                self.run.warning("The number of contigs databases found for the domain '%s' is %d. You should consider\
                             increasing the number of genomes you include for this domain. A robust classifier\
                             will require similar number of genomes for each domain that capture the diversity\
                             of the domain they represent. Say, at least 20 gneomes per domain is a good start." \
                                    % (domain, len(self.contigs_dbs[domain])))

            self.progress.update("Making sure contigs dbs are contigs dbs")
            for contigs_db_path in self.contigs_dbs[domain]:
                utils.is_contigs_db(contigs_db_path)

        self.progress.end()
Exemple #4
0
def migrate(db_path):
    """Upgrade the contigs database at `db_path` from `current_version` to `next_version`.

    The value stored under the 'taxonomy_source' meta key is moved to the
    'gene_level_taxonomy_source' key, and the version is bumped.

    Raises ConfigError if no path is given or if the database is not at
    `current_version`.
    """
    if db_path is None:
        raise ConfigError("No database path is given.")

    utils.is_contigs_db(db_path)

    contigs_db = db.DB(db_path, None, ignore_version = True)
    if str(contigs_db.get_version()) != current_version:
        # do not leave the connection open when refusing to migrate
        contigs_db.disconnect()
        raise ConfigError("Version of this contigs database is not %s (hence, this script cannot really do anything)." % current_version)

    progress.new("Updating the self table")
    progress.update("...")
    # rename the meta key while keeping its value
    taxonomy_source = contigs_db.get_meta_value('taxonomy_source')
    contigs_db.remove_meta_key_value_pair('taxonomy_source')
    contigs_db.set_meta_value('gene_level_taxonomy_source', taxonomy_source)

    progress.update("Updating version")
    contigs_db.remove_meta_key_value_pair('version')
    contigs_db.set_version(next_version)

    progress.update("Committing changes")
    contigs_db.disconnect()

    progress.end()
    run.info_single("The contigs database is now %s." % (next_version), nl_after=1, nl_before=1, mc='green')
Exemple #5
0
    def sanity_check(self):
        """Check sanity while straightening some input variables

        Besides validating the inputs, this method rewrites two attributes:
        `self.hmm_sources` becomes a set of stripped names when it is a
        non-empty comma-separated string, and `self.num_genes_list` is set to
        a two-item list of `[genes before the match, genes after the match]`
        parsed from the comma-separated `self.num_genes`.

        Raises ConfigError when the gene caller ids / search term combination
        is invalid, when HMMs are requested without a search term, or when
        `self.num_genes` holds more than two values.
        """

        filesnpaths.is_output_dir_writable(self.output_dir)

        # exactly one of the two ways of defining the locus must be used
        if (not (self.gene_caller_ids or self.search_term)) or (self.gene_caller_ids and self.search_term):
            raise ConfigError("You must specify exacly one of the following: --gene-caller-ids or --search-term")

        if self.use_hmm and not self.search_term:
            raise ConfigError("If you want to use HMMs to find the gene of interest that will define your locus,\
                               you must also specify a --search-term.")

        utils.is_contigs_db(self.input_contigs_db_path)

        if len(self.hmm_sources):
            self.hmm_sources = set([s.strip() for s in self.hmm_sources.split(',')])

        self.num_genes_list = [int(x) for x in self.num_genes.split(',')]
        if len(self.num_genes_list) > 2:
            raise ConfigError("The block size you provided, \"%s\", is not valid.\
                                The gene block size is defined by only one or two integers for either \
                                a block following the search match or a block preceding and following \
                                the search match respectively." % self.num_genes)

        # a single value means "N genes after the match, none before it"
        if len(self.num_genes_list) == 1:
            self.num_genes_list = [0, self.num_genes_list[0]]

        self.run.warning(None, header="Input / Output", lc="cyan")
        self.run.info('Contigs DB', os.path.abspath(self.input_contigs_db_path))
        self.run.info('Output directory', self.output_dir)
        if ',' in self.num_genes:
            self.run.info('Genes to report', '%d genes before the matching gene, and %d that follow' % (self.num_genes_list[0], self.num_genes_list[1]))
        else:
            # the single value was normalized to [0, N] above, so the number of
            # genes after the match is at index 1 (index 0 is always 0 here)
            self.run.info('Genes to report', 'Matching gene, and %d genes after it' % (self.num_genes_list[1]))
        self.run.info('Rev-comp the locus sequence if necessary', self.reverse_complement_if_necessary)
    def get_metagenome_hash(self, entry):
        """Return the 'contigs_db_hash' meta value of the contigs database in `entry`.

        `entry` must provide the database path under the 'contigs_db_path' key.
        The path is checked with `utils.is_contigs_db` before it is opened.
        """
        utils.is_contigs_db(entry['contigs_db_path'])

        contigs_db = db.DB(entry['contigs_db_path'], None, ignore_version=True)
        try:
            contigs_db_hash = contigs_db.get_meta_value('contigs_db_hash')
        finally:
            # the handle is only needed for this single lookup; do not leak it
            contigs_db.disconnect()

        return contigs_db_hash
Exemple #7
0
    def __init__(self, args, run=run, progress=progress):
        """Read Pfam-related parameters from `args` and load the Pfam catalog.

        Parameters missing from `args` are stored as None. The HMM program
        falls back to 'hmmsearch', and the Pfam data directory falls back to
        'data/misc/Pfam' next to the anvio package. The constructor checks that
        the HMM program exists and that the contigs database path is a contigs
        database, then calls `is_database_exists`, `get_version` and
        `load_catalog`.
        """

        self.run = run
        self.progress = progress

        def A(x, t):
            # value of `x` in args passed through `t`, or None when args has no `x`
            if x in args.__dict__:
                return t(args.__dict__[x])
            return None

        def null(x):
            return x

        self.contigs_db_path = A('contigs_db', null)
        self.num_threads = A('num_threads', null)
        self.hmm_program = A('hmmer_program', null) or 'hmmsearch'
        self.pfam_data_dir = A('pfam_data_dir', null)

        # load_catalog will populate this
        self.function_catalog = {}

        filesnpaths.is_program_exists(self.hmm_program)
        utils.is_contigs_db(self.contigs_db_path)

        if not self.pfam_data_dir:
            anvio_dir = os.path.dirname(anvio.__file__)
            self.pfam_data_dir = os.path.join(anvio_dir, 'data/misc/Pfam')

        # here, in the process of checking whether Pfam has been downloaded into the pfam_data_dir,
        # we also decompress and hmmpress the profile if it is currently gzipped
        self.is_database_exists()

        self.run.info('Pfam database directory', self.pfam_data_dir)

        self.get_version()
        self.load_catalog()
Exemple #8
0
def migrate(db_path):
    """Upgrade the contigs database at `db_path` from `current_version` to `next_version`.

    The nucleotide position info stored per contig in the auxiliary '.h5' file
    that sits next to the database is copied into a new table inside the
    contigs database, the `gene_protein_sequences` table is renamed to
    `gene_amino_acid_sequences`, the version is bumped, and the '.h5' file is
    deleted.

    Raises ConfigError if no path is given, if the database is not at
    `current_version`, or if the auxiliary file is missing.
    """

    utils.check_h5py_module()
    import h5py

    if db_path is None:
        raise ConfigError("No database path is given.")

    utils.is_contigs_db(db_path)

    contigs_db = db.DB(db_path, None, ignore_version = True)
    if str(contigs_db.get_version()) != current_version:
        # do not leave the connection open when refusing to migrate
        contigs_db.disconnect()
        raise ConfigError("Version of this contigs database is not %s (hence, this script cannot really do anything)." % current_version)

    # the auxiliary file shares the database path, with the last three
    # characters (presumably '.db') swapped for '.h5'
    auxiliary_path = db_path[:-3] + '.h5'

    if not os.path.exists(auxiliary_path):
        contigs_db.disconnect()
        raise ConfigError("%s, the target of this script does not seem to be where it should have been :/" % auxiliary_path)

    # the context manager closes the h5 file even if copying fails midway
    with h5py.File(auxiliary_path, 'r') as fp:
        contigs_db.create_table(nt_position_info_table_name, nt_position_info_table_structure, nt_position_info_table_types)

        contig_names_in_db = list(fp['/data/nt_position_info'].keys())

        run.info("Auxiliary data file found", auxiliary_path)
        run.info("Contigs found", len(contig_names_in_db))

        progress.new('Processing the auxiliary data file')
        counter, total = 0, len(contig_names_in_db)

        entries = []
        for contig_name in contig_names_in_db:
            # `dataset[()]` reads the whole dataset; it replaces the `.value`
            # attribute, which newer h5py releases no longer provide
            nt_position_info = fp['/data/nt_position_info/%s' % (contig_name)][()]
            entries.append((contig_name, convert_numpy_array_to_binary_blob(nt_position_info),))

            counter += 1
            progress.update('contig %d of %d ...' % (counter, total))

            # flush the buffer every 10 contigs
            if counter % 10 == 0:
                progress.update("Writing buffer to the new database ...")
                contigs_db.insert_many(nt_position_info_table_name, entries=entries)
                entries = []

        # whatever is left in the buffer
        contigs_db.insert_many(nt_position_info_table_name, entries=entries)

        progress.end()

    # we also want to upgrade this table name, which was renamed within #654 re:
    # merenlab/pc_to_gene_cluster PR
    contigs_db._exec('ALTER TABLE gene_protein_sequences RENAME TO gene_amino_acid_sequences;')

    contigs_db.remove_meta_key_value_pair('version')
    contigs_db.set_version(next_version)
    contigs_db.disconnect()

    os.remove(auxiliary_path)

    run.info_single("The contigs database is now %s, and the now-obsolete '.h5' file is gone forever "
                    "and ever." % (next_version), nl_after=1, nl_before=1, mc='green')
Exemple #9
0
def migrate(db_path):
    """Upgrade the contigs database at `db_path` from `current_version` to `next_version`.

    All rows of the gene function calls table are deleted, the
    'gene_function_sources' meta value is reset to None, and the version is
    bumped.

    Raises ConfigError if no path is given or if the database is not at
    `current_version`.
    """
    if db_path is None:
        raise ConfigError("No database path is given.")

    # make sure someone is not being funny
    utils.is_contigs_db(db_path)

    # make sure the database is at the version this script upgrades from
    contigs_db = db.DB(db_path, None, ignore_version=True)
    if str(contigs_db.get_version()) != current_version:
        # do not leave the connection open when refusing to migrate
        contigs_db.disconnect()
        raise ConfigError(
            "Version of this contigs database is not %s (hence, this script cannot really do anything)."
            % current_version)

    progress.new("Trying to upgrade the contigs database")
    progress.update('...')

    # bye, gene_functions content
    contigs_db._exec('''DELETE FROM %s''' % t.gene_function_calls_table_name)
    contigs_db.remove_meta_key_value_pair('gene_function_sources')
    contigs_db.set_meta_value('gene_function_sources', None)
    contigs_db.commit()

    # set the version
    contigs_db.remove_meta_key_value_pair('version')
    contigs_db.set_version(next_version)

    # bye
    contigs_db.disconnect()

    # bye
    progress.end()
    run.info_single("The contigs database is now %s! The only thing this upgrade did was to reset your "
                    "functional annotations :/ But you know, `anvi-run-ncbi-cogs` is pretty fast!" \
                                        % (next_version), nl_after=1, nl_before=1, mc='green')
Exemple #10
0
def migrate(db_path):
    """Upgrade the contigs database at `db_path` from `current_version` to `next_version`.

    All rows of the gene function calls table are deleted, the
    'gene_function_sources' meta value is reset to None, and the version is
    bumped.

    Raises ConfigError if no path is given or if the database is not at
    `current_version`.
    """
    if db_path is None:
        raise ConfigError("No database path is given.")

    # make sure someone is not being funny
    utils.is_contigs_db(db_path)

    # make sure the database is at the version this script upgrades from
    contigs_db = db.DB(db_path, None, ignore_version = True)
    if str(contigs_db.get_version()) != current_version:
        # do not leave the connection open when refusing to migrate
        contigs_db.disconnect()
        raise ConfigError("Version of this contigs database is not %s (hence, this script cannot really do anything)." % current_version)

    progress.new("Trying to upgrade the contigs database")
    progress.update('...')

    # bye, gene_functions content
    contigs_db._exec('''DELETE FROM %s''' % t.gene_function_calls_table_name)
    contigs_db.remove_meta_key_value_pair('gene_function_sources')
    contigs_db.set_meta_value('gene_function_sources', None)
    contigs_db.commit()

    # set the version
    contigs_db.remove_meta_key_value_pair('version')
    contigs_db.set_version(next_version)

    # bye
    contigs_db.disconnect()

    # bye
    progress.end()
    run.info_single("The contigs database is now %s! The only thing this upgrade did was to reset your\
                     functional annotations :/ But you know, `anvi-run-ncbi-cogs` is pretty fast!" \
                                        % (next_version), nl_after=1, nl_before=1, mc='green')
Exemple #11
0
def migrate(db_path):
    """Upgrade the contigs database at `db_path` starting from `current_version`.

    Depending on the 'genes_are_called' meta value, the database is handed to
    `update_with_gene_calls` or `update_without_gene_calls`; whatever that
    helper returns is then disconnected.

    Raises ConfigError if no path is given or if the database is not at
    `current_version`.
    """
    if db_path is None:
        raise ConfigError("No database path is given.")

    utils.is_contigs_db(db_path)

    contigs_db = db.DB(db_path, None, ignore_version=True)
    if str(contigs_db.get_version()) != current_version:
        # do not leave the connection open when refusing to migrate
        contigs_db.disconnect()
        raise ConfigError(
            "Version of this contigs database is not %s (hence, this script cannot really do anything)."
            % current_version)

    genes_are_called = contigs_db.get_meta_value('genes_are_called')

    # NOTE(review): the version bump is presumably done inside these helpers,
    # it does not happen in this function -- confirm
    if genes_are_called:
        contigs_db = update_with_gene_calls(contigs_db)
    else:
        contigs_db = update_without_gene_calls(contigs_db)

    contigs_db.disconnect()

    run.info_single(
        "Your contigs db is now v16, and the `genes_in_contigs` table in it now has a new column for `coding_type`!",
        nl_after=1,
        nl_before=1,
        mc='green')
Exemple #12
0
def migrate(db_path):
    """Upgrade the contigs database at `db_path` from `current_version` to `next_version`.

    Rows whose source is "Rinke_et_al", "Campbell_et_al" or
    "BUSCO_83_Protista" are removed from the three HMM tables, and the version
    is bumped.

    Raises ConfigError if no path is given or if the database is not at
    `current_version`.
    """
    if db_path is None:
        raise ConfigError("No database path is given.")

    utils.is_contigs_db(db_path)

    contigs_db = db.DB(db_path, None, ignore_version = True)
    if str(contigs_db.get_version()) != current_version:
        # do not leave the connection open when refusing to migrate
        contigs_db.disconnect()
        raise ConfigError("Version of this contigs database is not %s (hence, this script cannot really do anything)." % current_version)

    progress.new("Dropping the HMMs ")
    progress.update("...")
    # the same three sources are purged from every HMM-related table
    for table_name in ['hmm_hits_info', 'hmm_hits', 'hmm_hits_in_splits']:
        contigs_db.remove_some_rows_from_table(table_name, 'source IN ("Rinke_et_al", "Campbell_et_al", "BUSCO_83_Protista")')

    progress.update("Updating version")
    contigs_db.remove_meta_key_value_pair('version')
    contigs_db.set_version(next_version)

    progress.update("Committing changes")
    contigs_db.disconnect()

    progress.end()
    run.info_single("The contigs database is now %s. Unfortunatly this update removed all single-copy core gene\
                     HMMs from your contigs database :( We are very sorry about this, but we only did it to be\
                     able to offer you  nicer things. It is best if you re-run `anvi-run-hmms` program from scratch.\
                     Doing that will not remove any 'non-default' HMM profiles you may have added in this contigs\
                     database, so you have nothing to worry about." % (next_version), nl_after=1, nl_before=1, mc='green')
Exemple #13
0
def migrate(db_path):
    """Upgrade the contigs database at `db_path` from `current_version` to `next_version`.

    `drop_entry_id_column_from_table` is called for every table listed in
    `tables`, and the version is bumped afterwards.

    Raises ConfigError if no path is given or if the database is not at
    `current_version`.
    """
    if db_path is None:
        raise ConfigError("No database path is given.")

    utils.is_contigs_db(db_path)

    # the version check only needs a short-lived connection, which is closed
    # whether or not the version is the expected one
    version_probe = db.DB(db_path, None, ignore_version=True)
    version_is_expected = str(version_probe.get_version()) == current_version
    version_probe.disconnect()

    if not version_is_expected:
        raise ConfigError(
            "Version of this contigs database is not %s (hence, this script cannot really do anything)."
            % current_version)

    # drop entry ids one by one
    for table_name in tables:
        table_properties = tables[table_name]
        drop_entry_id_column_from_table(db_path, table_name, table_properties=table_properties)

    # a fresh connection, used only to bump the version
    upgraded_db = db.DB(db_path, None, ignore_version=True)
    upgraded_db.remove_meta_key_value_pair('version')
    upgraded_db.set_version(next_version)
    upgraded_db.disconnect()

    run.info_single("Your contigs db is now %s. This update carried one more issue into the graveyard "
                    "of bad design decisions we've made years ago by altering %d tables in your database." \
                            % (next_version, len(tables)), nl_after=1, nl_before=1, mc='green')
Exemple #14
0
    def __init__(self, db_path, num_threads_to_use=1, run=run, progress=progress):
        """Set up the table handler on top of the contigs database at `db_path`.

        The path is checked with `utils.is_contigs_db`, the 'contigs_db_hash'
        meta value is stored in `self.contigs_db_hash`, `Table.__init__` is run
        with `anvio.__contigs__version__`, gene calls are loaded through
        `init_gene_calls_dict`, and next available ids are requested for the
        HMM hits and HMM hits-in-splits tables.

        A database without gene calls is not an error here; a warning is shown
        instead.
        """
        self.num_threads_to_use = num_threads_to_use
        self.db_path = db_path

        utils.is_contigs_db(self.db_path)

        # this handle is needed for a single lookup only; close it right away
        # instead of leaving the connection dangling
        contigs_db = db.DB(self.db_path, utils.get_required_version_for_db(self.db_path))
        try:
            self.contigs_db_hash = contigs_db.get_meta_value('contigs_db_hash')
        finally:
            contigs_db.disconnect()

        Table.__init__(self, self.db_path, anvio.__contigs__version__, run, progress)

        self.init_gene_calls_dict()

        if not len(self.gene_calls_dict):
            # presumably `genes_are_called` is populated by Table.__init__ --
            # it is not assigned anywhere in this method
            if self.genes_are_called:
                self.run.warning("Tables in this contigs database that should contain gene calls are empty despite the fact that\
                                  you didn't skip the gene calling step while generating this contigs database. This probably means\
                                  that the gene caller did not find any genes among contigs. This is OK for now. But might explode\
                                  later. If it does explode and you decide to let us know about that problem, please remember to mention\
                                  this warning. By the way, this warning probably has been seen by like only 2 people on the planet. Who\
                                  works with contigs with no gene calls? A better implementation of anvi'o will unite researchers who\
                                  study weird stuff.")
            else:
                self.run.warning("It seems you have skipped gene calling step while generating your contigs database, and you have no\
                                  genes calls in tables that should contain gene calls. Anvi'o will let you go with this since some HMM\
                                  sources only operate on DNA sequences, and at this point it doesn't know which HMMs you wish to run.\
                                  If the lack of genes causes a problem, you will get another error message later probably :/")

        self.set_next_available_id(t.hmm_hits_table_name)
        self.set_next_available_id(t.hmm_hits_splits_table_name)
Exemple #15
0
    def get_genome_hash_for_external_genome(self, entry):
        """Return the 'contigs_db_hash' stored in the contigs database of `entry`.

        `entry` must provide the database path under the 'contigs_db_path' key;
        the path is checked with `utils.is_contigs_db` before it is opened.
        """
        contigs_db_path = entry['contigs_db_path']
        utils.is_contigs_db(contigs_db_path)

        external_db = dbops.ContigsDatabase(contigs_db_path)
        # read the value first, so the handle can be closed before returning
        stored_hash = external_db.meta['contigs_db_hash']
        external_db.disconnect()

        return stored_hash
Exemple #16
0
def migrate(db_path):
    """Upgrade the contigs database at `db_path` from `current_version` to `next_version`.

    A 'scg_taxonomy_database_version' meta value is added: 'v89' when the
    'scg_taxonomy_was_run' meta value is truthy, None otherwise. The version is
    bumped afterwards.

    Raises ConfigError if no path is given or if the database is not at
    `current_version`.
    """
    if db_path is None:
        raise ConfigError("No database path is given.")

    utils.is_contigs_db(db_path)

    contigs_db = db.DB(db_path, None, ignore_version = True)
    if str(contigs_db.get_version()) != current_version:
        # do not leave the connection open when refusing to migrate
        contigs_db.disconnect()
        raise ConfigError("Version of this contigs database is not %s (hence, this script cannot really do anything)." % current_version)

    progress.new("Updating the self table")
    progress.update("...")

    scg_taxonomy_was_run = contigs_db.get_meta_value('scg_taxonomy_was_run')
    if scg_taxonomy_was_run:
        contigs_db.set_meta_value('scg_taxonomy_database_version', 'v89')
    else:
        contigs_db.set_meta_value('scg_taxonomy_database_version', None)

    progress.update("Updating version")
    contigs_db.remove_meta_key_value_pair('version')
    contigs_db.set_version(next_version)

    progress.update("Committing changes")
    contigs_db.disconnect()

    progress.end()
    run.info_single("The contigs database is now %s. We just updated the self table of the contigs database with a small "
                    "bit of information to be able to track SCG taxonomy version changes more accurately going forward" % \
                                    (next_version), nl_after=1, nl_before=1, mc='green')
Exemple #17
0
    def get_genome_hash_for_external_genome(self, entry):
        """Return the 'contigs_db_hash' stored in the contigs database of `entry`.

        `entry` must provide the database path under the 'contigs_db_path' key;
        the path is checked with `utils.is_contigs_db` before it is opened.
        """
        contigs_db_path = entry['contigs_db_path']
        utils.is_contigs_db(contigs_db_path)

        external_db = dbops.ContigsDatabase(contigs_db_path)
        # read the value first, so the handle can be closed before returning
        stored_hash = external_db.meta['contigs_db_hash']
        external_db.disconnect()

        return stored_hash
Exemple #18
0
    def __init__(self, args, run=terminal.Run(), progress=terminal.Progress()):
        """Add genes to and/or remove genes from a structure database, as requested in `args`.

        Parameters missing from `args` are stored as None. The contigs database
        is opened and kept in `self.contigs_db`, and a `ContigsSuperclass` is
        built from `args`. Removal runs first (if requested), followed by
        addition (if requested).

        Raises ConfigError when there is nothing to add or remove, or when
        genes are given both directly and through a file for the same
        operation.
        """
        self.args = args
        self.run = run
        self.progress = progress

        # initialize self.arg parameters
        def A(x, t):
            if x in self.args.__dict__:
                return t(args.__dict__[x])
            return None

        def null(x):
            return x

        self.contigs_db_path = A('contigs_db', null)
        self.structure_db_path = A('structure_db', null)
        self.genes_to_remove = A('genes_to_remove', null)
        self.genes_to_remove_path = A('genes_to_remove_file', null)
        self.genes_to_add = A('genes_to_add', null)
        self.genes_to_add_path = A('genes_to_add_file', null)
        self.full_modeller_output = A('dump_dir', null)
        self.modeller_executable = A('modeller_executable', null)
        self.DSSP_executable = None

        utils.is_contigs_db(self.contigs_db_path)
        self.contigs_db = dbops.ContigsDatabase(self.contigs_db_path)
        self.contigs_db_hash = self.contigs_db.meta['contigs_db_hash']

        # init ContigsSuperClass
        self.contigs_super = ContigsSuperclass(self.args)

        # at least one of the four gene inputs is required
        nothing_requested = not (self.genes_to_remove or self.genes_to_remove_path
                                 or self.genes_to_add or self.genes_to_add_path)
        if nothing_requested:
            raise ConfigError(
                "Please specify some genes to add or remove to your database.")

        if self.genes_to_remove and self.genes_to_remove_path:
            raise ConfigError(
                "Provide either --genes-to-remove or --genes-to-remove-path. You provided both."
            )

        if self.genes_to_add and self.genes_to_add_path:
            raise ConfigError(
                "Provide either --genes-to-add or --genes-to-add-path. You provided both."
            )

        if self.genes_to_remove or self.genes_to_remove_path:
            self.run.warning("Removing genes...", header="Updating %s" % self.structure_db_path, lc='green')
            self.load_structure_db()
            genes_marked_for_removal = self.parse_genes(self.genes_to_remove, self.genes_to_remove_path)
            self.remove_genes(genes_marked_for_removal)
            self.structure_db.disconnect()

        if self.genes_to_add or self.genes_to_add_path:
            self.run.warning("Adding genes...", header="Updating %s" % self.structure_db_path, lc='green')
            # the structure db is loaded again here, since the removal step
            # above disconnects from it when it runs
            self.load_structure_db()
            self.add_genes()
Exemple #19
0
    def get_genome_hash_for_internal_genome(self, entry):
        """Return a 12-character hash for the internal genome described by `entry`.

        The hash is the beginning of the SHA-224 hex digest of the genome's
        split names (joined without a separator), an underscore, and the
        'contigs_db_hash' of the contigs database found under
        `entry['contigs_db_path']`.
        """
        utils.is_contigs_db(entry['contigs_db_path'])
        split_names = self.get_split_names_of_interest_for_internal_genome(entry)

        internal_db = dbops.ContigsDatabase(entry['contigs_db_path'])
        hash_input = '_'.join([''.join(split_names), internal_db.meta['contigs_db_hash']])
        digest = hashlib.sha224(hash_input.encode('utf-8')).hexdigest()
        genome_hash = digest[0:12]
        internal_db.disconnect()

        return genome_hash
Exemple #20
0
    def get_genome_hash_for_internal_genome(self, entry):
        """Return a 12-character hash for the internal genome described by `entry`.

        The hash is the beginning of the SHA-224 hex digest of the genome's
        split names (joined without a separator), an underscore, and the
        'contigs_db_hash' of the contigs database found under
        `entry['contigs_db_path']`.
        """
        utils.is_contigs_db(entry['contigs_db_path'])
        split_names = self.get_split_names_of_interest_for_internal_genome(entry)

        internal_db = dbops.ContigsDatabase(entry['contigs_db_path'])
        hash_input = '_'.join([''.join(split_names), internal_db.meta['contigs_db_hash']])
        digest = hashlib.sha224(hash_input.encode('utf-8')).hexdigest()
        genome_hash = digest[0:12]
        internal_db.disconnect()

        return genome_hash
Exemple #21
0
def migrate(db_path):
    """Upgrade the contigs database at `db_path` from `current_version` to `next_version`.

    The genes-in-splits summary table is dropped, and the genes-in-splits
    table is rebuilt through a temporary table so that rows which agree on
    every column but the first one are stored once, with `entry_id`
    renumbered from 0. The database is vacuumed and the version is bumped.

    Raises ConfigError if no path is given or if the database is not at
    `current_version`.
    """
    if db_path is None:
        raise ConfigError("No database path is given.")

    utils.is_contigs_db(db_path)

    contigs_db = db.DB(db_path, None, ignore_version=True)
    if str(contigs_db.get_version()) != current_version:
        # do not leave the connection open when refusing to migrate
        contigs_db.disconnect()
        raise ConfigError(
            "Version of this contigs database is not %s (hence, this script cannot really do anything)."
            % current_version)

    progress.new("Removing '" + genes_in_splits_summary_table_name + "'")
    contigs_db._exec("DROP TABLE %s;" % genes_in_splits_summary_table_name)
    progress.end()

    progress.new("Upgrading '" + genes_in_splits_table_name + "'")

    progress.update("Creating temporary table")
    contigs_db.create_table(genes_in_splits_table_name + '_temp',
                            genes_in_splits_table_structure,
                            genes_in_splits_table_types)

    progress.update("Moving unique records")
    # grouping by every column except the first keeps a single row per
    # distinct combination of the remaining columns
    contigs_db._exec(
        "INSERT INTO %s SELECT * FROM %s GROUP BY %s;" %
        (genes_in_splits_table_name + '_temp', genes_in_splits_table_name,
         ", ".join(genes_in_splits_table_structure[1:])))
    progress.update("Updating entry_id")
    contigs_db._exec("UPDATE %s SET entry_id = rowid - 1;" %
                     (genes_in_splits_table_name + '_temp'))

    progress.update("Swapping temporary table with actual table")
    contigs_db._exec(
        "ALTER TABLE %s RENAME TO %s;" %
        (genes_in_splits_table_name, genes_in_splits_table_name + '_old'))
    contigs_db._exec(
        "ALTER TABLE %s RENAME TO %s;" %
        (genes_in_splits_table_name + '_temp', genes_in_splits_table_name))

    progress.update("Removing old table")
    contigs_db._exec("DROP TABLE %s;" % (genes_in_splits_table_name + '_old'))

    progress.update("Optimizing the database")
    contigs_db._exec("VACUUM;")

    progress.update("Updating version")
    contigs_db.remove_meta_key_value_pair('version')
    contigs_db.set_version(next_version)

    progress.update("Committing changes")
    contigs_db.disconnect()

    progress.end()
    run.info_single("The contigs database is now %s." % (next_version),
                    nl_after=1,
                    nl_before=1,
                    mc='green')
Exemple #22
0
    def __init__(self, db_path, run=run, progress=progress):
        """Set up the table handler on top of the contigs database at `db_path`.

        The path is checked with `utils.is_contigs_db` before `Table.__init__`
        is run with `anvio.__contigs__version__`.
        """
        self.run = run
        self.progress = progress
        self.db_path = db_path

        # refuse anything that is not a contigs database
        utils.is_contigs_db(self.db_path)

        Table.__init__(
            self,
            self.db_path,
            anvio.__contigs__version__,
            run,
            progress,
        )
Exemple #23
0
    def process(self, aa_sequences_file_path=None):
        """Search amino acid sequences against COGs and store the hits in the contigs database.

        The sequences come either from `aa_sequences_file_path` or, when that
        is not given, from an export of `self.contigs_db_path`; exactly one of
        the two must be available. The search runs with the method named by
        `self.search_with`. A temporary directory created by this method is
        removed at the end, while a user-provided one is left in place.

        Raises ConfigError when the search method or its formatted database is
        not available, or when the inputs are missing or ambiguous.
        """
        if self.search_with not in self.available_search_methods:
            raise ConfigError("Let us start by making it clear that we probably like '%s' as much as you do, but it doesn't\
                               seem to be available on your system OR recognized by the COGs class since anvi'o couldn't\
                               find it among the available search methods. You probably need to try something else :/" \
                                                                                                    % self.search_with)

        if self.search_with not in self.available_db_search_program_targets:
            raise ConfigError("Anvi'o understands that you want to use '%s' to search for COGs, however, there is no\
                               database formatted under the COGs data directory for that program :/ You may need to\
                               re-run the COGs setup, UNLESS, you set up your COG data directory somewhere else than what\
                               anvi'o attempts to use at the moment ('%s'). If that is the case, this may be the best\
                               time to point the right directory using the --cog-data-dir parameter, or the environmental\
                               variable 'ANVIO_COG_DATA_DIR'." % (self.search_with, self.COG_data_dir))

        # exactly one source of amino acid sequences is accepted
        if not aa_sequences_file_path and not self.contigs_db_path:
            raise ConfigError("You either need to provide an anvi'o contigs database path, or a FASTA file for AA\
                               sequences")

        if aa_sequences_file_path and self.contigs_db_path:
            raise ConfigError("You can't provide both an AA sequences file and a contigs database. Choose one!")

        if self.contigs_db_path:
            utils.is_contigs_db(self.contigs_db_path)

        if self.temp_dir_path:
            # a directory set by the user is checked, used, and left alone afterwards
            filesnpaths.is_file_exists(self.temp_dir_path)
            filesnpaths.is_output_dir_writable(self.temp_dir_path)

            self.run.warning("Because you set the temporary directory path by hand, anvi'o will not remove its content\
                              when it is done. But she certainly hopes that you will clean those files later.")

            self.remove_temp_dir_path = False
        else:
            self.temp_dir_path = filesnpaths.get_temp_directory_path()
            self.remove_temp_dir_path = True

        self.run.info('COG data directory', self.COG_data_dir)
        self.run.info('Searching with', self.search_with)
        self.run.info('Directory to store temporary files', self.temp_dir_path)
        self.run.info('Directory will be removed after the run', self.remove_temp_dir_path)

        if not aa_sequences_file_path:
            export_path = J(self.temp_dir_path, 'aa_sequences.fa')
            aa_sequences_file_path = dbops.export_aa_sequences_from_contigs_db(self.contigs_db_path, export_path)

        # do the search
        run_search = self.search_methods_factory[self.search_with]
        search_results_tabular = run_search(aa_sequences_file_path)

        # convert the output to a hits dict; the target id is the second
        # '|'-separated field of the id found in the search output
        def second_pipe_field(x):
            return x.split('|')[1]

        self.hits = utils.get_BLAST_tabular_output_as_dict(search_results_tabular, target_id_parser_func=second_pipe_field)

        # store hits into the contigs database
        self.store_hits_into_contigs_db()

        if self.remove_temp_dir_path:
            shutil.rmtree(self.temp_dir_path)
Exemple #24
0
def migrate(db_path):
    """Move nucleotide position info from the auxiliary '.h5' file into the contigs database.

    Every dataset found under '/data/nt_position_info' in the auxiliary file is converted with
    `convert_numpy_array_to_binary_blob` and inserted into the `nt_position_info_table_name`
    table. Afterwards the 'gene_protein_sequences' table is renamed to 'gene_amino_acid_sequences',
    the database version is set to `next_version`, and the auxiliary file is deleted.

    Parameters
    ==========
    db_path : str
        Path to the contigs database. The auxiliary file is expected at the same path with
        the last three characters replaced by '.h5'.

    Raises
    ======
    ConfigError
        If `db_path` is None, if the database version differs from `current_version`, or if
        the auxiliary file does not exist.
    """
    if db_path is None:
        raise ConfigError("No database path is given.")

    utils.is_contigs_db(db_path)

    contigs_db = db.DB(db_path, None, ignore_version = True)
    if str(contigs_db.get_version()) != current_version:
        raise ConfigError("Version of this contigs database is not %s (hence, this script cannot really do anything)." % current_version)

    # swap the three-character extension of the database path for '.h5'
    auxiliary_path = db_path[:-3] + '.h5'

    if not os.path.exists(auxiliary_path):
        raise ConfigError("%s, the target of this script does not seem to be where it should have been :/" % auxiliary_path)

    fp = h5py.File(auxiliary_path, 'r')

    # make sure the HDF5 handle is released even if something below fails
    try:
        contigs_db.create_table(nt_position_info_table_name, nt_position_info_table_structure, nt_position_info_table_types)

        contig_names_in_db = list(fp['/data/nt_position_info'].keys())

        run.info("Auxiliary data file found", auxiliary_path)
        run.info("Contigs found", len(contig_names_in_db))

        progress.new('Processing the auxiliary data file')
        total = len(contig_names_in_db)

        entries = []
        for counter, contig_name in enumerate(contig_names_in_db, start=1):
            # `dataset[()]` reads the whole dataset; the older `.value` property no longer
            # exists in h5py 3.0 and later
            nt_position_info = fp['/data/nt_position_info/%s' % (contig_name)][()]
            entries.append((contig_name, convert_numpy_array_to_binary_blob(nt_position_info),))

            progress.update('contig %d of %d ...' % (counter, total))

            # flush the buffer every 10 contigs so it does not grow without bound
            if counter % 10 == 0:
                progress.update("Writing buffer to the new database ...")
                contigs_db.insert_many(nt_position_info_table_name, entries=entries)
                entries = []

        # whatever is left in the buffer after the last full batch
        contigs_db.insert_many(nt_position_info_table_name, entries=entries)

        progress.end()
    finally:
        fp.close()

    # we also want to upgrade this table name, which was renamed within #654 re:
    # merenlab/pc_to_gene_cluster PR
    contigs_db._exec('ALTER TABLE gene_protein_sequences RENAME TO gene_amino_acid_sequences;')

    contigs_db.remove_meta_key_value_pair('version')
    contigs_db.set_version(next_version)
    contigs_db.disconnect()

    os.remove(auxiliary_path)

    run.info_single("The contigs database is now %s, and the now-obsolete '.h5' file is gone forever\
                     and ever." % (next_version), nl_after=1, nl_before=1, mc='green')
Exemple #25
0
    def __init__(self, db_path, run=run, progress=progress):
        """Prepare this object to work on the gene function calls table of a contigs database.

        Stores `db_path`, `run` and `progress` on the instance, runs the `utils.is_contigs_db`
        check on the path, initializes the `Table` base class, and finally calls
        `set_next_available_id` for the gene function calls table.
        """
        self.run, self.progress = run, progress
        self.db_path = db_path

        utils.is_contigs_db(db_path)

        Table.__init__(self,
                       db_path,
                       anvio.__contigs__version__,
                       run,
                       progress)

        self.set_next_available_id(t.gene_function_calls_table_name)
Exemple #26
0
    def __init__(self, db_path, run=run, progress=progress):
        """Set up access to the gene function calls table of the contigs database at `db_path`.

        The path goes through `utils.is_contigs_db` before the `Table` base class is
        initialized with it; `set_next_available_id` is then called for the gene function
        calls table.
        """
        self.db_path = db_path
        self.run, self.progress = run, progress

        utils.is_contigs_db(db_path)

        table_args = (db_path, anvio.__contigs__version__, run, progress)
        Table.__init__(self, *table_args)

        self.set_next_available_id(t.gene_function_calls_table_name)
Exemple #27
0
    def __init__(self, db_path, contigs_fasta=None, run=run, progress=progress, debug=False):
        """Store the inputs and check them.

        `db_path` is passed to `utils.is_contigs_db`. When `contigs_fasta` is truthy it is
        additionally passed to `filesnpaths.is_file_exists` and
        `filesnpaths.is_file_fasta_formatted`.
        """
        self.db_path, self.contigs_fasta = db_path, contigs_fasta
        self.run, self.progress = run, progress
        self.debug = debug

        utils.is_contigs_db(db_path)

        # the FASTA file is optional; only check it when one was given
        if contigs_fasta:
            for check in (filesnpaths.is_file_exists, filesnpaths.is_file_fasta_formatted):
                check(contigs_fasta)
Exemple #28
0
    def __init__(self, db_path, contigs_fasta=None, run=terminal.Run(), progress=terminal.Progress(), debug=False):
        """Keep the given arguments on the instance and check the input paths.

        The contigs database path always goes through `utils.is_contigs_db`. A FASTA path,
        if one is given, goes through `filesnpaths.is_file_exists` and then
        `filesnpaths.is_file_fasta_formatted`.
        """
        self.debug = debug
        self.run, self.progress = run, progress
        self.db_path, self.contigs_fasta = db_path, contigs_fasta

        utils.is_contigs_db(db_path)

        fasta_path = contigs_fasta
        if fasta_path:
            filesnpaths.is_file_exists(fasta_path)
            filesnpaths.is_file_fasta_formatted(fasta_path)
Exemple #29
0
    def __init__(self, db_path, run=run, progress=progress):
        """Initialize the `Table` base class for the contigs database at `db_path`.

        The path is first passed to `utils.is_contigs_db`. A fresh `GenesInSplits` instance
        is stored as `self.genes_in_splits`.
        """
        self.run, self.progress = run, progress
        self.db_path = db_path

        utils.is_contigs_db(db_path)

        Table.__init__(self, db_path, anvio.__contigs__version__, run, progress)

        # GenesInSplits tracks the genes that occur in splits and is responsible for
        # generating the corresponding table in the contigs database
        self.genes_in_splits = GenesInSplits()
Exemple #30
0
    def __init__(self, db_path, run=run, progress=progress):
        """Initialize both base classes (`Table` and `TaxonNamesTable`) for a contigs database.

        `db_path` is passed to `utils.is_contigs_db` before either base class is initialized.
        A fresh `GenesInSplits` instance is stored as `self.genes_in_splits`.
        """
        self.run, self.progress = run, progress
        self.db_path = db_path

        utils.is_contigs_db(db_path)

        # order matters here: Table first, TaxonNamesTable second
        Table.__init__(self, db_path, anvio.__contigs__version__, run, progress)
        TaxonNamesTable.__init__(self, db_path, run, progress)

        # GenesInSplits tracks the genes that occur in splits and is responsible for
        # generating the corresponding table in the contigs database
        self.genes_in_splits = GenesInSplits()
Exemple #31
0
def migrate(db_path):
    """Upgrade a contigs database by replacing its HMM hits tables with new, empty ones.

    The three HMM tables (hits info, hits, and hits in splits) are dropped if they exist and
    re-created from the structures and types defined in `t`. The database version is then
    set to `next_version`.

    Parameters
    ==========
    db_path : str
        Path to the contigs database to upgrade.

    Raises
    ======
    ConfigError
        If `db_path` is None or the database version differs from `current_version`.
    """
    if db_path is None:
        raise ConfigError("No database path is given.")

    # make sure someone is not being funny
    utils.is_contigs_db(db_path)

    # make sure the database is at the version this script upgrades from
    contigs_db = db.DB(db_path, None, ignore_version=True)
    if str(contigs_db.get_version()) != current_version:
        raise ConfigError(
            "Version of this contigs database is not %s (hence, this script cannot really do anything)."
            % current_version)

    progress.new("Trying to upgrade the contigs database")
    progress.update('...')

    hmm_tables = [
        (t.hmm_hits_info_table_name, t.hmm_hits_info_table_structure, t.hmm_hits_info_table_types),
        (t.hmm_hits_table_name, t.hmm_hits_table_structure, t.hmm_hits_table_types),
        (t.hmm_hits_splits_table_name, t.hmm_hits_splits_table_structure, t.hmm_hits_splits_table_types),
    ]

    # drop the old tables. each one is dropped on its own with IF EXISTS, so a single
    # missing table can no longer prevent the remaining ones from being dropped (which
    # would make the re-creation below fail on a table that is still there)
    for table_name, _, _ in hmm_tables:
        contigs_db._exec('''DROP TABLE IF EXISTS %s''' % (table_name))
    contigs_db.commit()

    # create new empty ones
    for table_name, table_structure, table_types in hmm_tables:
        contigs_db.create_table(table_name, table_structure, table_types)

    # set the version
    contigs_db.remove_meta_key_value_pair('version')
    contigs_db.set_version(next_version)

    # bye
    contigs_db.disconnect()

    # bye
    progress.end()
    run.info_single(
        "The contigs database is now %s! It no longer has any HMM hits :/ Don't \
                     forget to run `anvi-run-hmms` on it!" % (next_version),
        nl_after=1,
        nl_before=1,
        mc='green')
Exemple #32
0
    def process(self, aa_sequences_file_path=None):
        """Search amino acid sequences with the selected program and store the hits.

        The sequences come either from `aa_sequences_file_path` or, when that is not given,
        from an export of `self.contigs_db_path` into the temporary directory. Exactly one of
        the two sources must be available. The tabular search output is turned into
        `self.hits` and `store_hits_into_contigs_db` is called. A temporary directory that
        was created here is removed at the end; one set by the user is left in place.

        Raises
        ======
        ConfigError
            If `self.search_with` is not among the available search targets, or if neither or
            both of the sequence sources are provided.
        """
        if self.search_with not in self.available_db_search_program_targets:
            raise ConfigError("Anvi'o understands that you want to use '%s' to search for COGs, however, there is no\
                                database formatted under the COGs data directory for that program :/ You may need to\
                                re-run the COGs setup, UNLESS, you set up your COG data directory somewhere else than what\
                                anvi'o attempts to use at the moment ('%s'). If that is the case, this may be the best\
                                time to point the right directory using the --cog-data-dir parameter, or the environmental\
                                variable 'ANVIO_COG_DATA_DIR'." % (self.search_with, self.COG_data_dir))

        fasta_given = bool(aa_sequences_file_path)
        db_given = bool(self.contigs_db_path)

        if not (fasta_given or db_given):
            raise ConfigError("You either need to provide an anvi'o contigs database path, or a FASTA file for AA\
                                sequences")

        if fasta_given and db_given:
            raise ConfigError("You can't provide both an AA sequences file and a contigs database. Choose one!")

        if db_given:
            utils.is_contigs_db(self.contigs_db_path)

        if self.temp_dir_path:
            # a user-supplied directory is checked, used, and never deleted
            filesnpaths.is_file_exists(self.temp_dir_path)
            filesnpaths.is_output_dir_writable(self.temp_dir_path)

            self.run.warning("Because you set the temporary directory path by hand, anvi'o will not remove its content\
                              when it is done. But she certainly hopes that you will clean those files later.")

            self.remove_temp_dir_path = False
        else:
            self.temp_dir_path = filesnpaths.get_temp_directory_path()
            self.remove_temp_dir_path = True

        for label, value in (('COG data directory', self.COG_data_dir),
                             ('Searching with', self.search_with),
                             ('Directory to store temporary files', self.temp_dir_path),
                             ('Directory will be removed after the run', self.remove_temp_dir_path)):
            self.run.info(label, value)

        if not fasta_given:
            exported_fasta_path = J(self.temp_dir_path, 'aa_sequences.fa')
            aa_sequences_file_path = dbops.export_aa_sequences_from_contigs_db(self.contigs_db_path, exported_fasta_path)

        # run the search with the function registered for the chosen program
        search_function = self.search_factory[self.search_with]
        search_results_tabular = search_function(aa_sequences_file_path)

        # turn the tabular output into a hits dict; target ids are the second '|'-separated field
        self.hits = utils.get_BLAST_tabular_output_as_dict(search_results_tabular, target_id_parser_func=lambda x: x.split('|')[1])

        self.store_hits_into_contigs_db()

        if self.remove_temp_dir_path:
            shutil.rmtree(self.temp_dir_path)
Exemple #33
0
    def __init__(self,
                 db_path,
                 num_threads_to_use=1,
                 run=run,
                 progress=progress,
                 initializing_for_deletion=False,
                 just_do_it=False,
                 hmm_program_to_use='hmmscan',
                 hmmer_output_directory=None,
                 get_domain_table_output=False):
        """Prepare this object to work on the HMM hits of a contigs database.

        Parameters
        ==========
        db_path : str
            Path to the contigs database; it is passed to `utils.is_contigs_db`.
        num_threads_to_use : int
            Stored as `self.num_threads_to_use`.
        run, progress
            Passed on to `Table.__init__`.
        initializing_for_deletion : bool
            When True, `set_next_available_id` is not called for the HMM hits table.
        just_do_it : bool
            Stored as `self.just_do_it`.
        hmm_program_to_use : str
            Program name checked with `filesnpaths.is_program_exists`; a falsy value falls
            back to 'hmmscan'.
        hmmer_output_directory : str or None
            Stored as `self.hmmer_output_dir`.
        get_domain_table_output : bool
            When True the desired output is ('table', 'domtable'), otherwise 'table'.
        """
        self.num_threads_to_use = num_threads_to_use
        self.db_path = db_path
        self.just_do_it = just_do_it
        self.hmm_program = hmm_program_to_use or 'hmmscan'
        self.hmmer_output_dir = hmmer_output_directory
        self.hmmer_desired_output = (
            'table', 'domtable') if get_domain_table_output else 'table'

        utils.is_contigs_db(self.db_path)
        filesnpaths.is_program_exists(self.hmm_program)

        # this connection exists only to read the hash, so close it right away instead of
        # leaving it open for the lifetime of the process
        contigs_db = db.DB(self.db_path,
                           utils.get_required_version_for_db(self.db_path))
        try:
            self.contigs_db_hash = contigs_db.get_meta_value('contigs_db_hash')
        finally:
            contigs_db.disconnect()

        Table.__init__(self, self.db_path, anvio.__contigs__version__, run,
                       progress)

        self.init_gene_calls_dict()

        # an empty gene calls dict is tolerated here; the user is only warned about it
        if not len(self.gene_calls_dict):
            if self.genes_are_called:
                self.run.warning(
                    "Tables in this contigs database that should contain gene calls are empty despite the fact that "
                    "you didn't skip the gene calling step while generating this contigs database. This probably means "
                    "that the gene caller did not find any genes among contigs. This is OK for now. But might explode "
                    "later. If it does explode and you decide to let us know about that problem, please remember to mention "
                    "this warning. By the way, this warning probably has been seen by like only 2 people on the planet. Who "
                    "works with contigs with no gene calls? A better implementation of anvi'o will unite researchers who "
                    "study weird stuff.")
            else:
                self.run.warning(
                    "It seems you have skipped gene calling step while generating your contigs database, and you have no "
                    "genes calls in tables that should contain gene calls. Anvi'o will let you go with this since some HMM "
                    "sources only operate on DNA sequences, and at this point it doesn't know which HMMs you wish to run. "
                    "If the lack of genes causes a problem, you will get another error message later probably :/"
                )

        if not initializing_for_deletion:
            self.set_next_available_id(t.hmm_hits_table_name)
Exemple #34
0
    def sanity_check(self):
        """Check sanity while straightening some input variables.

        Besides the checks, this turns `self.hmm_sources` (when non-empty) into a set of
        stripped names and parses `self.num_genes` into `self.num_genes_list`, a two-item
        list of [genes before the match, genes after the match]. A single number in
        `self.num_genes` means "no genes before, that many after".

        Raises
        ======
        ConfigError
            If not exactly one of gene caller ids / search term is set, if HMMs are requested
            without a search term, or if more than two block sizes are given.
        """

        filesnpaths.is_output_dir_writable(self.output_dir)

        if (not (self.gene_caller_ids or self.search_term)) or (
                self.gene_caller_ids and self.search_term):
            raise ConfigError(
                "You must specify exacly one of the following: --gene-caller-ids or --search-term"
            )

        if self.use_hmm and not self.search_term:
            raise ConfigError(
                "If you want to use HMMs to find the gene of interest that will define your locus,\
                               you must also specify a --search-term.")

        utils.is_contigs_db(self.input_contigs_db_path)

        if len(self.hmm_sources):
            self.hmm_sources = set(
                [s.strip() for s in self.hmm_sources.split(',')])

        self.num_genes_list = [int(x) for x in self.num_genes.split(',')]
        if len(self.num_genes_list) > 2:
            raise ConfigError(
                "The block size you provided, \"%s\", is not valid.\
                                The gene block size is defined by only one or two integers for either \
                                a block following the search match or a block preceding and following \
                                the search match respectively." %
                self.num_genes)

        # a single number means: nothing before the match, that many genes after it
        if len(self.num_genes_list) == 1:
            self.num_genes_list = [0, self.num_genes_list[0]]

        genes_before, genes_after = self.num_genes_list

        self.run.warning(None, header="Input / Output", lc="cyan")
        self.run.info('Contigs DB',
                      os.path.abspath(self.input_contigs_db_path))
        self.run.info('Output directory', self.output_dir)
        if ',' in self.num_genes:
            self.run.info(
                'Genes to report',
                '%d genes before the matching gene, and %d that follow' %
                (genes_before, genes_after))
        else:
            # report the second item: after the normalization above the first one is
            # always 0 in the single-number case
            self.run.info(
                'Genes to report', 'Matching gene, and %d genes after it' %
                (genes_after))
        self.run.info('Rev-comp the locus sequence if necessary',
                      self.reverse_complement_if_necessary)
Exemple #35
0
    def __init__(self,
                 db_path,
                 run=run,
                 progress=progress,
                 profile_db_path=False):
        """Prepare this object to work on the SCG taxonomy table of a contigs database.

        `db_path` is passed to `utils.is_contigs_db`, the `Table` base class is initialized,
        and `set_next_available_id` is called for the SCG taxonomy table. `profile_db_path`
        is accepted but not used in this method.
        """
        self.run, self.progress = run, progress
        self.db_path = db_path

        utils.is_contigs_db(db_path)

        Table.__init__(self, db_path, anvio.__contigs__version__, run, progress)

        self.set_next_available_id(t.scg_taxonomy_table_name)
    def init(self):
        """Initialize information about the contigs databases found in the genomes directory.

        Every domain in `self.SCG_domains` must have a subdirectory of the same name under
        `self.genomes_dir`; the files found in each are stored in `self.contigs_dbs[domain]`
        and each one is passed to `utils.is_contigs_db`.

        Raises
        ======
        ConfigError
            If a domain subdirectory is missing, or if one contains no files at all.
        """

        found_dirs = []
        for dir_entry in os.scandir(self.genomes_dir):
            if dir_entry.is_dir():
                found_dirs.append(os.path.basename(os.path.abspath(dir_entry.path)))

        self.progress.new('Training')
        self.progress.update("Making sure all domain subdirectories are present")
        missing_domain_dirs = [domain for domain in self.SCG_domains if domain not in found_dirs]
        if missing_domain_dirs:
            raise ConfigError("Genomes directory is missing subdirectories for these domains: '%s'." % ', '.join(missing_domain_dirs))

        # directories that match no known domain are reported, but they do not stop the run
        unexpected_domain_dirs = [domain for domain in found_dirs if domain not in self.SCG_domains]
        if unexpected_domain_dirs:
            self.progress.reset()
            self.run.warning("THIS IS VERY IMPORTANT! In the directory where you have all the domain directories to\
                              train a new domain classifier, anvi'o found domain directories that did not match any\
                              known domains. Here is the list of orphan domains we are talking about here: \"%s.\".\
                              This process will continue to train a classifier, but this is a serious problem as anvi'o\
                              will simply ignore all orphan domains. This is becuase the current single copy-core gene\
                              collections anvi'o knows and cares about do not include those orphan domains. Hence, the\
                              training step will not take these orphan domains into consideration :/ If you don't care\
                              about this, you should feel free to move on. If you want to include those one or more\
                              orphan domains, you should first copy the HMM directory you have for that domain into\
                              the directory (which seems to be at '%s' for your anvi'o instance) so it looks just\
                              like another HMM profile for anvi'o, and then re-run the training." % (', '.join(unexpected_domain_dirs), hmm_data.dir_path))

        self.progress.update("Learning about the number of contigs databases in each domain subdirectory")
        for domain in self.SCG_domains:
            domain_dir = os.path.join(self.genomes_dir, domain)
            paths_in_domain = glob.glob(os.path.join(domain_dir, '*'))
            self.contigs_dbs[domain] = paths_in_domain

            if not paths_in_domain:
                self.progress.end()
                raise ConfigError("Each domain subdirectory must include at least one contigs database in it :/")

            if len(paths_in_domain) < 20:
                self.progress.reset()
                self.run.warning("The number of contigs databases found for the domain '%s' is %d. You should consider\
                             increasing the number of genomes you include for this domain. A robust classifier\
                             will require similar number of genomes for each domain that capture the diversity\
                             of the domain they represent. Say, at least 20 gneomes per domain is a good start." \
                                    % (domain, len(paths_in_domain)))

            self.progress.update("Making sure contigs dbs are contigs dbs")
            for contigs_db_path in paths_in_domain:
                utils.is_contigs_db(contigs_db_path)

        self.progress.end()
Exemple #37
0
    def init(self):
        """Initialize information about the contigs databases found in the genomes directory.

        Checks that `self.genomes_dir` has one subdirectory per domain in `self.SCG_domains`,
        warns about subdirectories that match no known domain, fills `self.contigs_dbs` with
        the files found for each domain, and passes every one of them to
        `utils.is_contigs_db`.

        Raises
        ======
        ConfigError
            If a domain subdirectory is missing, or if one contains no files at all.
        """

        def domain_path(domain):
            return os.path.join(self.genomes_dir, domain)

        subdirs = [entry for entry in os.scandir(self.genomes_dir) if entry.is_dir()]
        domain_dirs = [os.path.basename(os.path.abspath(entry.path)) for entry in subdirs]

        self.progress.new('Training')
        self.progress.update("Making sure all domain subdirectories are present")
        missing_domain_dirs = [domain for domain in self.SCG_domains if domain not in domain_dirs]
        if missing_domain_dirs:
            raise ConfigError("Genomes directory is missing subdirectories for these domains: '%s'." % ', '.join(missing_domain_dirs))

        # unknown directories only trigger a warning; training continues without them
        unexpected_domain_dirs = [domain for domain in domain_dirs if domain not in self.SCG_domains]
        if unexpected_domain_dirs:
            self.progress.reset()
            self.run.warning("THIS IS VERY IMPORTANT! In the directory where you have all the domain directories to\
                              train a new domain classifier, anvi'o found domain directories that did not match any\
                              known domains. Here is the list of orphan domains we are talking about here: \"%s.\".\
                              This process will continue to train a classifier, but this is a serious problem as anvi'o\
                              will simply ignore all orphan domains. This is becuase the current single copy-core gene\
                              collections anvi'o knows and cares about do not include those orphan domains. Hence, the\
                              training step will not take these orphan domains into consideration :/ If you don't care\
                              about this, you should feel free to move on. If you want to include those one or more\
                              orphan domains, you should first copy the HMM directory you have for that domain into\
                              the directory (which seems to be at '%s' for your anvi'o instance) so it looks just\
                              like another HMM profile for anvi'o, and then re-run the training." % (', '.join(unexpected_domain_dirs), hmm_data.dir_path))

        self.progress.update("Learning about the number of contigs databases in each domain subdirectory")
        for domain in self.SCG_domains:
            self.contigs_dbs[domain] = glob.glob(os.path.join(domain_path(domain), '*'))
            num_found = len(self.contigs_dbs[domain])

            if num_found == 0:
                self.progress.end()
                raise ConfigError("Each domain subdirectory must include at least one contigs database in it :/")

            if num_found < 20:
                self.progress.reset()
                self.run.warning("The number of contigs databases found for the domain '%s' is %d. You should consider\
                             increasing the number of genomes you include for this domain. A robust classifier\
                             will require similar number of genomes for each domain that capture the diversity\
                             of the domain they represent. Say, at least 20 gneomes per domain is a good start." \
                                    % (domain, num_found))

            self.progress.update("Making sure contigs dbs are contigs dbs")
            for contigs_db_path in self.contigs_dbs[domain]:
                utils.is_contigs_db(contigs_db_path)

        self.progress.end()
Exemple #38
0
    def init(self):
        """Collect the bins and split names of interest and report the inputs.

        The dictionary returned by `ccollections.GetSplitNamesInBins(self.args).get_dict()`
        provides the bin names (its keys, stored as `self.bins`) and the split names (its
        values, added to `self.split_names_of_interest`). Sets `self.initialized` to True
        when done.
        """
        utils.is_contigs_db(self.contigs_db_path)

        bam_file_names = [os.path.basename(bam_path) for bam_path in self.input_bam_files]
        self.run.info('Input BAM file(s)', ', '.join(bam_file_names))

        splits_in_bins = ccollections.GetSplitNamesInBins(self.args).get_dict()
        self.bins = list(splits_in_bins)

        for bin_name in splits_in_bins:
            self.split_names_of_interest.update(splits_in_bins[bin_name])

        self.run.info('Collection ID', self.collection_name)
        self.run.info('Bin(s)', ', '.join(self.bins))
        self.run.info('Number of splits', pp(len(self.split_names_of_interest)))

        self.initialized = True
Exemple #39
0
def migrate(db_path):
    """Upgrade a contigs database: add a project name and rename some 'Rinke_et_al' genes.

    The 'project_name' meta value is set to "NO_NAME", the version is set to `next_version`,
    three text replacements are applied to the `genes` column of the HMM hits info table for
    the 'Rinke_et_al' source, and finally `dbops.update_description_in_db` is called with a
    placeholder description.

    Raises
    ======
    ConfigError
        If `db_path` is None or the database version differs from `current_version`.
    """
    if db_path is None:
        raise ConfigError("No database path is given.")

    # refuse to touch anything that is not a contigs database
    utils.is_contigs_db(db_path)

    # this script only upgrades from `current_version`
    contigs_db = db.DB(db_path, None, ignore_version=True)
    version_in_db = str(contigs_db.get_version())
    if version_in_db != current_version:
        raise ConfigError("Version of this contigs database is not %s (hence, this script cannot really do anything)." % current_version)

    progress.new("Trying to upgrade the contigs database")
    progress.update('...')

    # project name
    contigs_db.remove_meta_key_value_pair('project_name')
    contigs_db.set_meta_value('project_name', "NO_NAME")
    contigs_db.commit()

    # version
    contigs_db.remove_meta_key_value_pair('version')
    contigs_db.set_version(next_version)

    # gene name changes
    hmm_info_table = t.hmm_hits_info_table_name

    rename_ribosomal = '''UPDATE %s SET genes = replace(genes, '%s', '%s') WHERE source LIKE 'Rinke_et_al';''' % (hmm_info_table, 'Ribosomal_S12', 'Ribosom_S12_S23')
    contigs_db._exec(rename_ribosomal)

    rename_upf = '''UPDATE %s SET genes = replace(genes, '%s', '%s') Where source LIKE 'Rinke_et_al';''' % (hmm_info_table, 'UPF0027', 'RtcB')
    contigs_db._exec(rename_upf)

    # the first of the two dashes below is not ASCII
    fix_dashes = '''UPDATE %s SET genes = replace(genes, '%s', '%s') Where source LIKE 'Rinke_et_al';''' % (hmm_info_table, '‐', '-')
    contigs_db._exec(fix_dashes)

    contigs_db.disconnect()

    progress.end()

    dbops.update_description_in_db(db_path, 'No description is given')

    final_message = "The contigs database is now %s! All this upgrade did was to associate your contigs db with a\
                     project name (which happened to be 'NO_NAME', because anvi'o likes you very much)" % (next_version)
    run.info_single(final_message, nl_after=1, nl_before=1, mc='green')
Exemple #40
0
    def populate_genes_in_contigs_table(self,
                                        gene_calls_dict,
                                        amino_acid_sequences,
                                        append_to_the_db=False):
        """Write gene calls and their amino acid sequences into the contigs database.

        Parameters
        ==========
        gene_calls_dict : dict
            Maps an entry id to a dict that holds a value for every column of the genes in
            contigs table after the first one (including 'source').
        amino_acid_sequences : dict
            Maps the same entry ids to amino acid sequences.
        append_to_the_db : bool
            When False, both tables are emptied first. When True, only the rows of the genes
            in contigs table whose source matches an incoming source are deleted.
        """
        utils.is_contigs_db(self.db_path)
        required_version = utils.get_required_version_for_db(self.db_path)
        database = db.DB(self.db_path, required_version)

        if append_to_the_db:
            # append mode: remove the previous entries in genes in contigs that match the
            # incoming sources, otherwise we may end up with many duplicates in the db
            incoming_sources = {call['source'] for call in gene_calls_dict.values()}
            for source in incoming_sources:
                database._exec('''DELETE FROM %s WHERE source = "%s"''' % (t.genes_in_contigs_table_name, source))
        else:
            for table_name in (t.genes_in_contigs_table_name, t.gene_amino_acid_sequences_table_name):
                database._exec('''DELETE FROM %s''' % (table_name))

        self.progress.new('Processing')
        self.progress.update('Entering %d gene calls into the db ...' % (len(gene_calls_dict)))

        # one row per gene call: the entry id followed by the remaining columns in table order
        value_columns = t.genes_in_contigs_table_structure[1:]
        gene_call_rows = [(entry_id, *(gene_call[column] for column in value_columns))
                          for entry_id, gene_call in gene_calls_dict.items()]
        database._exec_many('''INSERT INTO %s VALUES (?,?,?,?,?,?,?,?)''' % t.genes_in_contigs_table_name,
                            gene_call_rows)

        sequence_rows = [(entry_id, amino_acid_sequences[entry_id]) for entry_id in gene_calls_dict]
        database._exec_many('''INSERT INTO %s VALUES (?,?)''' % t.gene_amino_acid_sequences_table_name,
                            sequence_rows)

        self.progress.end()

        database.disconnect()
Exemple #41
0
    def __init__(self, args, run=terminal.Run(), progress=terminal.Progress()):
        """Read the arguments and immediately remove genes from / add genes to a structure database.

        All work happens here: after reading the parameters from `args` and opening the
        contigs database, the requested removals are carried out first (`remove_genes`),
        then the requested additions (`add_genes`).

        Raises
        ======
        ConfigError
            If no genes to add or remove are specified, or if genes are given both directly
            and through a file for the same operation.
        """
        self.args = args
        self.run = run
        self.progress = progress

        # read the parameters from args; anything not present in args becomes None
        def from_args(key, convert=None):
            if key not in self.args.__dict__:
                return None
            value = args.__dict__[key]
            return value if convert is None else convert(value)

        self.contigs_db_path = from_args('contigs_db')
        self.structure_db_path = from_args('structure_db')
        self.genes_to_remove = from_args('genes_to_remove')
        self.genes_to_remove_path = from_args('genes_to_remove_file')
        self.genes_to_add = from_args('genes_to_add')
        self.genes_to_add_path = from_args('genes_to_add_file')
        self.full_modeller_output = from_args('dump_dir')
        self.modeller_executable = from_args('modeller_executable')
        self.skip_genes_if_already_present = from_args('skip_genes_if_already_present', bool)
        self.DSSP_executable = None

        utils.is_contigs_db(self.contigs_db_path)
        self.contigs_db = dbops.ContigsDatabase(self.contigs_db_path)
        self.contigs_db_hash = self.contigs_db.meta['contigs_db_hash']

        # init ContigsSuperClass
        self.contigs_super = ContigsSuperclass(self.args)

        gene_inputs = [self.genes_to_remove, self.genes_to_remove_path, self.genes_to_add, self.genes_to_add_path]
        if not any(gene_inputs):
            raise ConfigError("Please specify some genes to add or remove to your database.")

        if self.genes_to_remove and self.genes_to_remove_path:
            raise ConfigError("Provide either --genes-to-remove or --genes-to-remove-path. You provided both.")

        if self.genes_to_add and self.genes_to_add_path:
            raise ConfigError("Provide either --genes-to-add or --genes-to-add-path. You provided both.")

        header = "Updating %s" % self.structure_db_path

        # removals go first
        if self.genes_to_remove or self.genes_to_remove_path:
            self.run.warning("Removing genes...", header=header, lc='green')
            self.load_structure_db()
            remove = self.parse_genes(self.genes_to_remove, self.genes_to_remove_path)
            self.remove_genes(remove)
            self.structure_db.disconnect()

        if self.genes_to_add or self.genes_to_add_path:
            self.run.warning("Adding genes...", header=header, lc='green')
            self.load_structure_db()
            self.add_genes()
Exemple #42
0
def migrate(db_path):
    """Upgrade a contigs database: drop the genes in splits summary table and rebuild genes in splits.

    The genes in splits table is rebuilt through a temporary table that receives one row per
    distinct combination of all columns after the first one; `entry_id` is then renumbered
    from the row ids, and the temporary table replaces the original. The database is
    vacuumed and its version set to `next_version`.

    Raises
    ======
    ConfigError
        If `db_path` is None or the database version differs from `current_version`.
    """
    if db_path is None:
        raise ConfigError("No database path is given.")

    utils.is_contigs_db(db_path)

    contigs_db = db.DB(db_path, None, ignore_version = True)
    version_in_db = str(contigs_db.get_version())
    if version_in_db != current_version:
        raise ConfigError("Version of this contigs database is not %s (hence, this script cannot really do anything)." % current_version)

    temp_table_name = genes_in_splits_table_name + '_temp'
    old_table_name = genes_in_splits_table_name + '_old'

    progress.new("Removing '" + genes_in_splits_summary_table_name + "'")
    contigs_db._exec("DROP TABLE %s;" % genes_in_splits_summary_table_name)
    progress.end()

    progress.new("Upgrading '" + genes_in_splits_table_name + "'")

    progress.update("Creating temporary table")
    contigs_db.create_table(temp_table_name, genes_in_splits_table_structure, genes_in_splits_table_types)

    # grouping by every column except the first keeps one row per distinct record
    progress.update("Moving unique records")
    grouping_columns = ", ".join(genes_in_splits_table_structure[1:])
    contigs_db._exec("INSERT INTO %s SELECT * FROM %s GROUP BY %s;" % (temp_table_name, genes_in_splits_table_name, grouping_columns))

    progress.update("Updating entry_id")
    contigs_db._exec("UPDATE %s SET entry_id = rowid - 1;" % (temp_table_name))

    progress.update("Swapping temporary table with actual table")
    for source_name, target_name in ((genes_in_splits_table_name, old_table_name),
                                     (temp_table_name, genes_in_splits_table_name)):
        contigs_db._exec("ALTER TABLE %s RENAME TO %s;" % (source_name, target_name))

    progress.update("Removing old table")
    contigs_db._exec("DROP TABLE %s;" % (old_table_name))

    progress.update("Optimizing the database")
    contigs_db._exec("VACUUM;")

    progress.update("Updating version")
    contigs_db.remove_meta_key_value_pair('version')
    contigs_db.set_version(next_version)

    progress.update("Committing changes")
    contigs_db.disconnect()

    progress.end()
    run.info_single("The contigs database is now %s." % (next_version), nl_after=1, nl_before=1, mc='green')
Exemple #43
0
def migrate(db_path):
    """Upgrade a contigs database from `current_version` to `next_version`.

    The three HMM hits tables are dropped and recreated empty, so the upgraded
    database no longer contains any HMM hits (the final message reminds the
    user to run `anvi-run-hmms` again).

    Parameters
    ----------
    db_path : str
        Path to the contigs database to upgrade.

    Raises
    ------
    ConfigError
        If `db_path` is None, or if the database is not at `current_version`.
    """
    if db_path is None:
        raise ConfigError("No database path is given.")

    # make sure someone is not being funny
    utils.is_contigs_db(db_path)

    # make sure the database is at the version this script upgrades from
    contigs_db = db.DB(db_path, None, ignore_version = True)
    if str(contigs_db.get_version()) != current_version:
        raise ConfigError("Version of this contigs database is not %s (hence, this script cannot really do anything)." % current_version)

    progress.new("Trying to upgrade the contigs database")
    progress.update('...')

    # drop the old tables. `IF EXISTS` keeps every drop independent of the others: a
    # missing table is simply skipped, while the remaining tables are still dropped
    # (a single try/except around all three would stop at the first missing table
    # and leave the rest in place, breaking the `create_table` calls below).
    for table_name in (t.hmm_hits_info_table_name, t.hmm_hits_table_name, t.hmm_hits_splits_table_name):
        contigs_db._exec('''DROP TABLE IF EXISTS %s''' % (table_name))
    contigs_db.commit()

    # create new empty ones
    contigs_db.create_table(t.hmm_hits_info_table_name, t.hmm_hits_info_table_structure, t.hmm_hits_info_table_types)
    contigs_db.create_table(t.hmm_hits_table_name, t.hmm_hits_table_structure, t.hmm_hits_table_types)
    contigs_db.create_table(t.hmm_hits_splits_table_name, t.hmm_hits_splits_table_structure, t.hmm_hits_splits_table_types)

    # set the version
    contigs_db.remove_meta_key_value_pair('version')
    contigs_db.set_version(next_version)

    # bye
    contigs_db.disconnect()

    # bye
    progress.end()
    run.info_single("The contigs database is now %s! It no longer has any HMM hits :/ Don't \
                     forget to run `anvi-run-hmms` on it!" % (next_version), nl_after=1, nl_before=1, mc='green')
Exemple #44
0
def migrate(db_path):
    """Upgrade a contigs database from version 5 to version 6.

    The splits taxonomy, taxon names and genes taxonomy tables are dropped and
    recreated empty using the current table definitions.

    Parameters
    ----------
    db_path : str
        Path to the contigs database to upgrade.

    Raises
    ------
    ConfigError
        If `db_path` is None, or if the database is not at version 5.
    """
    if db_path is None:
        raise ConfigError("No database path is given.")

    # make sure someone is not being funny
    utils.is_contigs_db(db_path)

    # make sure the version is 5
    contigs_db = db.DB(db_path, None, ignore_version = True)
    if str(contigs_db.get_version()) != '5':
        raise ConfigError("Version of this contigs database is not 5 (hence, this script cannot really do anything).")

    progress.new("Trying to upgrade the contigs database")
    progress.update('...')

    # drop the old tables. `IF EXISTS` keeps every drop independent of the others: a
    # missing table is simply skipped, while the remaining tables are still dropped
    # (a single try/except around all three would stop at the first missing table
    # and leave the rest in place, breaking the `create_table` calls below).
    for table_name in (t.splits_taxonomy_table_name, t.taxon_names_table_name, t.genes_taxonomy_table_name):
        contigs_db._exec('''DROP TABLE IF EXISTS %s''' % (table_name))
    contigs_db.commit()

    # create new empty ones
    contigs_db.create_table(t.splits_taxonomy_table_name, t.splits_taxonomy_table_structure, t.splits_taxonomy_table_types)
    contigs_db.create_table(t.taxon_names_table_name, t.taxon_names_table_structure, t.taxon_names_table_types)
    contigs_db.create_table(t.genes_taxonomy_table_name, t.genes_taxonomy_table_structure, t.genes_taxonomy_table_types)

    # set the version
    contigs_db.remove_meta_key_value_pair('version')
    contigs_db.set_version('6')

    # bye
    contigs_db.disconnect()

    # bye
    progress.end()
    run.info_single("The contigs database successfully upgraded from version 5 to 6!")
Exemple #45
0
def migrate(db_path):
    """Upgrade a contigs database from `current_version` to `next_version`.

    The upgrade stores the project name 'NO_NAME' in the database, bumps its
    version, rewrites a few gene names in the HMM hits info table for the
    'Rinke_et_al' source, and finally sets a default description.

    Raises ConfigError when `db_path` is None or when the database is not at
    `current_version`.
    """
    if db_path is None:
        raise ConfigError("No database path is given.")

    # refuse anything that is not a contigs database
    utils.is_contigs_db(db_path)

    # this script only knows how to upgrade from `current_version`
    database = db.DB(db_path, None, ignore_version = True)
    version_found = str(database.get_version())
    if version_found != current_version:
        raise ConfigError("Version of this contigs database is not %s (hence, this script cannot really do anything)." % current_version)

    progress.new("Trying to upgrade the contigs database")
    progress.update('...')

    # (re)set the project name
    database.remove_meta_key_value_pair('project_name')
    database.set_meta_value('project_name', "NO_NAME")
    database.commit()

    # bump the version
    database.remove_meta_key_value_pair('version')
    database.set_version(next_version)

    # rename genes of the Rinke et al. collection
    hmm_info_table = t.hmm_hits_info_table_name
    database._exec('''UPDATE %s SET genes = replace(genes, '%s', '%s') WHERE source LIKE 'Rinke_et_al';''' % (hmm_info_table, 'Ribosomal_S12', 'Ribosom_S12_S23'))
    database._exec('''UPDATE %s SET genes = replace(genes, '%s', '%s') Where source LIKE 'Rinke_et_al';''' % (hmm_info_table, 'UPF0027', 'RtcB'))
    # the character being replaced below is a non-ASCII hyphen, the replacement is a plain ASCII one
    database._exec('''UPDATE %s SET genes = replace(genes, '%s', '%s') Where source LIKE 'Rinke_et_al';''' % (hmm_info_table, '‐', '-'))

    # close the database before it is touched again through dbops
    database.disconnect()

    progress.end()

    dbops.update_description_in_db(db_path, 'No description is given')

    farewell = "The contigs database is now %s! All this upgrade did was to associate your contigs db with a\
                     project name (which happened to be 'NO_NAME', because anvi'o likes you very much)" % (next_version)
    run.info_single(farewell, nl_after=1, nl_before=1, mc='green')
Exemple #46
0
    def __init__(self, db_path, num_threads_to_use=1, run=run, progress=progress):
        """Prepare the object to work on the HMM hits tables of a contigs database.

        Parameters
        ----------
        db_path : str
            Path to the contigs database.
        num_threads_to_use : int
            Stored on the instance as `num_threads_to_use`.
        run, progress
            Terminal reporting objects handed over to `Table.__init__`.

        Raises
        ------
        ConfigError
            If genes were not called for this contigs database, or if the gene
            calls dictionary is empty after initialization.
        """
        self.num_threads_to_use = num_threads_to_use
        self.db_path = db_path

        utils.is_contigs_db(self.db_path)

        # read the hash through a short-lived connection and close it explicitly, so
        # the database handle does not stay open for the lifetime of the process
        contigs_db = db.DB(self.db_path, utils.get_required_version_for_db(self.db_path))
        try:
            self.contigs_db_hash = contigs_db.get_meta_value('contigs_db_hash')
        finally:
            contigs_db.disconnect()

        Table.__init__(self, self.db_path, anvio.__contigs__version__, run, progress)

        if not self.genes_are_called:
            raise ConfigError("It seems the contigs database '%s' was created with '--skip-gene-calling' flag.\
                                Nothing to do here :/" % (self.db_path))

        self.init_gene_calls_dict()

        if not len(self.gene_calls_dict):
            raise ConfigError("Tables that should contain gene calls are empty. Which probably means the gene\
                                caller reported no genes for your contigs.")

        # entry ids for new rows continue from what is already stored in these tables
        self.set_next_available_id(t.hmm_hits_table_name)
        self.set_next_available_id(t.hmm_hits_splits_table_name)
Exemple #47
0
    def __init__(self, args, run=run, progress=progress):
        """Set up a Pfam annotation run from the command line arguments.

        Reads `contigs_db`, `num_threads` and `pfam_data_dir` from `args`,
        checks that `hmmscan` and the contigs database are usable, falls back
        to the Pfam directory under the anvi'o installation when none is
        given, and then loads the Pfam version and catalog.
        """
        self.args, self.run, self.progress = args, run, progress

        self.contigs_db_path = args.contigs_db
        self.num_threads = args.num_threads
        self.pfam_data_dir = args.pfam_data_dir

        # filled in by load_catalog()
        self.function_catalog = {}

        # fail early when the program or the contigs database is not usable
        filesnpaths.is_program_exists('hmmscan')
        utils.is_contigs_db(self.contigs_db_path)

        # no directory given: use the one under the anvi'o installation
        if not self.pfam_data_dir:
            anvio_root = os.path.dirname(anvio.__file__)
            self.pfam_data_dir = os.path.join(anvio_root, 'data/misc/Pfam')

        self.is_database_exists()

        self.run.info('Pfam database directory', self.pfam_data_dir)

        self.get_version()
        self.load_catalog()
Exemple #48
0
    def populate_genes_in_contigs_table(self, gene_calls_dict, amino_acid_sequences, append_to_the_db=False):
        """Store gene calls and their amino acid sequences in the contigs database.

        Parameters
        ----------
        gene_calls_dict : dict
            Maps each gene caller id to a dict that holds one value per column of
            `t.genes_in_contigs_table_structure` (except the first column, which
            is the gene caller id itself), including a 'source' key.
        amino_acid_sequences : dict
            Maps each gene caller id found in `gene_calls_dict` to its sequence.
        append_to_the_db : bool
            When False, both tables are emptied first. When True, only the
            entries that belong to the incoming sources are removed from the
            genes in contigs and genes in splits tables before inserting.
        """
        utils.is_contigs_db(self.db_path)
        database = db.DB(self.db_path, utils.get_required_version_for_db(self.db_path))

        if not append_to_the_db:
            database._exec('''DELETE FROM %s''' % (t.genes_in_contigs_table_name))
            database._exec('''DELETE FROM %s''' % (t.gene_amino_acid_sequences_table_name))
        else:
            # so we are in the append mode. We must remove all the previous entries from genes in contigs
            # that matches to the incoming sources. otherwise we may end up with many duplicates in the db.
            sources = {v['source'] for v in gene_calls_dict.values()}

            # basically here we will go through those sources, find gene caller ids associated with them in
            # the genes in contigs table, and then remove entries for those gene caller ids both from the
            # genes in contigs and genes in splits tables.
            for source in sources:
                # the source name ends up inside a SQL string literal: double any single quote in it
                # so that a name with an apostrophe cannot break (or alter) the query
                escaped_source = str(source).replace("'", "''")
                gene_caller_ids_for_source = database.get_single_column_from_table(t.genes_in_contigs_table_name,
                                                                                   'gene_callers_id',
                                                                                   where_clause="""source='%s'""" % escaped_source)

                if gene_caller_ids_for_source:
                    ids_to_remove = ','.join(str(g) for g in gene_caller_ids_for_source)
                    for table_name in [t.genes_in_contigs_table_name, t.genes_in_splits_table_name]:
                        database._exec('''DELETE FROM %s WHERE gene_callers_id IN (%s)''' % (table_name, ids_to_remove))

        self.progress.new('Processing')
        self.progress.update('Entering %d gene calls into the db ...' % (len(gene_calls_dict)))

        # one placeholder per column, so the statement follows the table structure rather than
        # a hard-coded column count
        columns = t.genes_in_contigs_table_structure
        placeholders = ','.join('?' * len(columns))
        db_entries = [tuple([entry_id] + [gene_calls_dict[entry_id][h] for h in columns[1:]]) for entry_id in gene_calls_dict]
        database._exec_many('''INSERT INTO %s VALUES (%s)''' % (t.genes_in_contigs_table_name, placeholders), db_entries)

        db_entries = [(entry_id, amino_acid_sequences[entry_id]) for entry_id in gene_calls_dict]
        database._exec_many('''INSERT INTO %s VALUES (?,?)''' % t.gene_amino_acid_sequences_table_name, db_entries)

        self.progress.end()

        database.disconnect()
Exemple #49
0
    def __init__(self, args, database='bact', executable = 'emapper.py', usemem=True, use_version=None, progress=progress, run=run):
        """Set up a driver for eggnog-mapper.

        `contigs_db`, `num_threads` and `annotation` are read from `args` (each
        one is None when it is not there). The COG data must be initialized,
        `database` must be one of 'euk', 'bact' or 'arch', and the contigs
        database (when given) must be a valid one; otherwise a ConfigError is
        raised. When no number of threads is set, a warning is shown and the
        process pauses so the user has a chance to cancel with CTRL + C.
        """
        self.executable = executable
        self.progress = progress
        self.run = run

        # arguments that are missing from `args` resolve to None
        given = args.__dict__
        self.contigs_db_path = given.get('contigs_db')
        self.num_threads = given.get('num_threads')
        self.annotation = given.get('annotation')
        self.use_version = use_version
        self.usemem = usemem

        self.COGs_data = cogs.COGsData(args)

        if not self.COGs_data.initialized:
            raise ConfigError("It seems you don't have your COG data set up on this system. Unfortunately EggNOGmapper class\
                                depends on it, so this is the end of the road for you. If you set up your COG directory to\
                                a specific path, you can use `--cog-data-dir` parameter to show anvi'o where it is. If you\
                                never set up one, then maybe it is time for you to take a look at the program\
                                `anvi-setup-ncbi-cogs`.")

        # anything falsy means "not set"; everything else has to be convertible to an integer
        try:
            if self.num_threads:
                self.num_threads = int(self.num_threads)
            else:
                self.num_threads = None
        except Exception as e:
            raise ConfigError("Someone didn't like the number of threads, and said [%s]. Shame on you :/" % e)

        if database not in ('euk', 'bact', 'arch'):
            raise ConfigError("Wrong database (%s). eggnog-mapper knows only about euk, bact, or arch db types..." % (database))
        self.database = database

        if self.contigs_db_path:
            utils.is_contigs_db(self.contigs_db_path)

        self.parser = None
        self.entry_id = 0
        self.version_to_use = None
        self.annotations_dict = {}

        # file names used for the run
        self.aa_sequences_file_name = 'aa_sequences.fa'
        self.log_file_path = 'log.txt'
        self.output_file_prefix = 'project'
        self.annotations_file_name = self.output_file_prefix + '.emapper.annotations'

        # gene caller ids are integers in anvi'o; this prefix keeps downstream programs from
        # treating them as numerical data (and turning them into 1.0, 2.0, etc).
        self.gene_caller_id_prefix = 'g'

        # output parsers, keyed by version string
        self.available_parsers = {'0.12.6': self.__parser_1,
                                  '1.0.3': self.__parser_2}

        self.check_version()

        if not self.num_threads:
            try:
                run.warning("You have not set the number of threads, and the default is whatever the default is for eggnog-mapper. You\
                             may really want to change that since if you have a large number of genes to annotate, this may take a very\
                             long time. If you don't want to see this message again, just set the number of threads you want eggnog-mapper\
                             to use explicitly. You can press CTRL + C to cancel this run, or simply do nothing since your operation\
                             will contine in probably like 2 seconds or less ... depending how fast you read.")
                # give the user time to read the warning (and to cancel)
                time.sleep(25)
            except KeyboardInterrupt:
                sys.exit()
Exemple #50
0
    def populate_genes_in_splits_tables(self, gene_calls_dict=None):
        """Associate gene calls with splits and store the result in the genes in splits table.

        A gene is associated with a split even if a single base of the gene
        falls within the start and stop positions of that split, so a gene that
        spans a split boundary is reported for both splits.

        Parameters
        ----------
        gene_calls_dict : dict, optional
            Maps gene caller ids to dicts with (at least) 'contig', 'start' and
            'stop' keys. When it is not given (or is empty), the gene calls
            stored in the contigs database are used.
        """
        utils.is_contigs_db(self.db_path)
        Table.__init__(self, self.db_path, anvio.__contigs__version__, self.run, self.progress)
        self.set_next_available_id(t.genes_in_splits_table_name)
        self.init_gene_calls_dict()

        if not gene_calls_dict:
            gene_calls_dict = self.gene_calls_dict

        genes_in_splits = GenesInSplits(entry_id_start=self.next_id(t.genes_in_splits_table_name))

        # build a dictionary for fast access to all genes identified within a contig
        gene_calls_in_contigs_dict = {}
        for gene_callers_id in gene_calls_dict:
            contig = gene_calls_dict[gene_callers_id]['contig']
            gene_calls_in_contigs_dict.setdefault(contig, set()).add(gene_callers_id)

        # report how many contigs have gene calls (a database without any contigs
        # must not crash the report with a division by zero)
        num_contigs = len(self.contigs_info)
        num_contigs_with_genes = len(gene_calls_in_contigs_dict)
        percent_with_genes = num_contigs_with_genes * 100.0 / num_contigs if num_contigs else 0.0
        self.run.info('Contigs with at least one gene call', '%d of %d (%.1f%%)' % (num_contigs_with_genes,
                                                                                    num_contigs,
                                                                                    percent_with_genes))

        for contig in self.contigs_info:
            # contigs without any gene calls simply have nothing to go through
            gene_caller_ids_in_contig = gene_calls_in_contigs_dict.get(contig, ())

            for split_name in self.contig_name_to_splits[contig]:
                start = self.splits_info[split_name]['start']
                stop = self.splits_info[split_name]['end']

                # go through all genes in the contig and register the ones that overlap with this
                # split. the `genes_in_splits.add` call is what populates the GenesInSplits instance.
                for gene_callers_id in gene_caller_ids_in_contig:
                    gene_call = gene_calls_dict[gene_callers_id]
                    if gene_call['stop'] > start and gene_call['start'] < stop:
                        genes_in_splits.add(split_name, start, stop, gene_callers_id, gene_call['start'], gene_call['stop'])

        # open connection
        database = db.DB(self.db_path, utils.get_required_version_for_db(self.db_path))

        # push entries for genes in splits table, with one placeholder per column so the
        # statement follows the table structure rather than a hard-coded column count
        columns = t.genes_in_splits_table_structure
        placeholders = ','.join('?' * len(columns))
        db_entries = [tuple([entry_id] + [genes_in_splits.splits_to_prots[entry_id][h] for h in columns[1:]]) for entry_id in genes_in_splits.splits_to_prots]
        database._exec_many('''INSERT INTO %s VALUES (%s)''' % (t.genes_in_splits_table_name, placeholders), db_entries)

        # disconnect
        database.disconnect()
Exemple #51
0
    def __init__(self, args, run=terminal.Run(), progress=terminal.Progress()):
        """Set up everything needed to model protein structures for genes in a contigs database.

        Reads the parameters from `args`, validates the contigs database and
        the output paths, determines the genes of interest, runs the sanity
        check, creates a new structure database and initializes the contigs
        super class.

        Raises ConfigError when the output database path does not end with '.db'.

        NOTE(review): the default `run` and `progress` objects are created once,
        when the function is defined, and are therefore shared by every call
        that does not pass its own.
        """
        self.args = args
        self.run = run
        self.progress = progress

        # initialize self.arg parameters
        # `A(name, type)` returns the argument converted with `type`, or None when `args` has no
        # such attribute; `null` is the identity conversion (the value is taken as is)
        A                             = lambda x, t: t(args.__dict__[x]) if x in self.args.__dict__ else None
        null                          = lambda x: x
        self.contigs_db_path          = A('contigs_db', null)
        self.genes_of_interest_path   = A('genes_of_interest', null)
        self.splits_of_interest_path  = A('splits_of_interest', null)
        self.bin_id                   = A('bin_id', null)
        self.collection_name          = A('collection_name', null)
        self.gene_caller_ids          = A('gene_caller_ids', null)
        self.output_db_path           = A('output_db_path', null)
        self.full_modeller_output     = A('dump_dir', null)
        self.skip_DSSP                = A('skip_DSSP', bool)
        self.modeller_executable      = A('modeller_executable', null)
        self.DSSP_executable          = None

        # the hash of the contigs database is handed to the structure database created below
        utils.is_contigs_db(self.contigs_db_path)
        self.contigs_db                = dbops.ContigsDatabase(self.contigs_db_path)
        self.contigs_db_hash           = self.contigs_db.meta['contigs_db_hash']

        # MODELLER params
        self.modeller_database        = A('modeller_database', null)
        self.scoring_method           = A('scoring_method', null)
        self.max_number_templates     = A('max_number_templates', null)
        self.percent_identical_cutoff = A('percent_identical_cutoff', null)
        self.num_models               = A('num_models', null)
        self.deviation                = A('deviation', null)
        self.very_fast                = A('very_fast', bool)

        # check database output
        # (falls back to "STRUCTURE.db" when no output path is given)
        if not self.output_db_path:
            self.output_db_path = "STRUCTURE.db"
        if not self.output_db_path.endswith('.db'):
            raise ConfigError("The structure database output file (`-o / --output`) must end with '.db'")
        filesnpaths.is_output_file_writable(self.output_db_path)

        # check modeller output
        # (only when a dump directory is requested; `ok_if_exists=False` is passed to the check)
        if self.full_modeller_output:
            self.full_modeller_output = filesnpaths.check_output_directory(self.full_modeller_output, ok_if_exists=False)

        # identify which genes user wants to model structures for
        self.genes_of_interest = self.get_genes_of_interest(self.genes_of_interest_path, self.gene_caller_ids)

        self.sanity_check()

        # residue annotation
        self.residue_annotation_sources_info = self.get_residue_annotation_sources_info()
        self.residue_info_table_structure, self.residue_info_table_types = self.get_residue_info_table_structure()
        # starts out as an empty data frame
        self.residue_annotation_df = pd.DataFrame({})

        # initialize StructureDatabase
        self.structure_db = StructureDatabase(self.output_db_path,
                                              self.contigs_db_hash,
                                              residue_info_structure_extras = self.residue_info_table_structure,
                                              residue_info_types_extras = self.residue_info_table_types,
                                              create_new=True)

        # init ContigsSuperClass
        self.contigs_super = ContigsSuperclass(self.args)