Code Example #1
File: splitter.py  Project: gelomerase/anvio
    def do_contigs_db(self):
        self.progress.new('Splitting "%s"' % self.bin_id)
        self.progress.update('Subsetting the contigs database')

        bin_contigs_db = dbops.ContigsDatabase(self.bin_contigs_db_path)
        bin_contigs_db.touch()

        # copy-paste tables that will largely stay the same from the parent
        bin_contigs_db.db.copy_paste(table_name='self', source_db_path=self.contigs_db_path)
        bin_contigs_db.db.copy_paste(table_name='hmm_hits_info', source_db_path=self.contigs_db_path)
        bin_contigs_db.db.copy_paste(table_name='taxon_names', source_db_path=self.contigs_db_path)

        # update some variables in the self table:
        self.contigs_db_hash = bin_contigs_db.get_hash()
        bin_contigs_db.db.update_meta_value('num_contigs', self.num_contigs)
        bin_contigs_db.db.update_meta_value('num_splits', self.num_splits)
        bin_contigs_db.db.update_meta_value('total_length', self.total_length)
        bin_contigs_db.db.update_meta_value('creation_date', bin_contigs_db.get_date())
        bin_contigs_db.db.update_meta_value('contigs_db_hash', self.contigs_db_hash)

        # the empty contigs db is ready
        bin_contigs_db.disconnect()

        # touch does not create the k-mers tables, so the resulting contigs db is missing them. we
        # will add them to the db here.
        bin_contigs_db = dbops.ContigsDatabase(self.bin_contigs_db_path)
        k = KMerTablesForContigsAndSplits(None, k=bin_contigs_db.meta['kmer_size'])
        for table_name in ['kmer_contigs', 'kmer_splits']:
            bin_contigs_db.db.create_table(table_name, k.kmers_table_structure, k.kmers_table_types)
        bin_contigs_db.disconnect()

        # setup the filtering rules for migrating data:
        tables = {
                    t.contig_sequences_table_name: ('contig', self.contig_names),
                    t.contigs_info_table_name: ('contig', self.contig_names),
                    t.gene_function_calls_table_name: ('gene_callers_id', self.gene_caller_ids),
                    t.gene_amino_acid_sequences_table_name: ('gene_callers_id', self.gene_caller_ids),
                    t.genes_in_contigs_table_name: ('gene_callers_id', self.gene_caller_ids),
                    t.genes_in_splits_table_name: ('gene_callers_id', self.gene_caller_ids),
                    t.genes_in_splits_summary_table_name: ('split', self.split_names),
                    t.genes_taxonomy_table_name: ('gene_callers_id', self.gene_caller_ids),
                    t.hmm_hits_table_name: ('gene_callers_id', self.gene_caller_ids),
                    t.hmm_hits_splits_table_name: ('split', self.split_names),
                    t.splits_info_table_name: ('split', self.split_names),
                    t.splits_taxonomy_table_name: ('split', self.split_names),
                    t.nt_position_info_table_name: ('contig_name', self.contig_names),
                    'kmer_contigs': ('contig', self.split_names),
                    'kmer_splits': ('contig', self.split_names),
                }

        self.migrate_data(tables, self.contigs_db_path, self.bin_contigs_db_path)

        self.progress.end()
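
The `tables` dict above maps each table name to a `(column, allowed_values)` filtering rule, and `migrate_data` (not shown on this page) applies those rules while copying rows into the new database. As a rough illustration of that contract, here is a minimal sketch using plain sqlite3; it is not anvi'o's actual implementation, and only the shape of the `tables` dict is taken from the example above:

import sqlite3

def migrate_data_sketch(tables, source_db_path, target_db_path):
    """Copy only the rows whose filter column value is in the allowed set,
    table by table. `tables` maps table_name -> (column, values), mirroring
    the filtering rules built in do_contigs_db() above. Illustrative plain
    sqlite3, not anvi'o's actual migrate_data()."""
    source = sqlite3.connect(source_db_path)
    target = sqlite3.connect(target_db_path)

    for table_name, (column, values) in tables.items():
        values = list(values)
        if not values:
            continue
        placeholders = ', '.join('?' * len(values))
        rows = source.execute('SELECT * FROM %s WHERE %s IN (%s)'
                              % (table_name, column, placeholders),
                              values).fetchall()
        if rows:
            slots = ', '.join('?' * len(rows[0]))
            target.executemany('INSERT INTO %s VALUES (%s)'
                               % (table_name, slots), rows)

    target.commit()
    source.close()
    target.close()
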
Code Example #2
    def gen_combined_proteins_fasta(self):
        self.progress.new('Storing combined protein sequences')
        output_file_path = self.get_output_file_path('combined_proteins.fa',
                                                     temp_file=True)
        output_file = open(output_file_path, 'w')

        for genome_name in self.genomes:
            g = self.genomes[genome_name]
            self.progress.update('Working on %s ...' % genome_name)
            contigs_db = dbops.ContigsDatabase(g['contigs_db_path'])
            protein_sequences_dict = contigs_db.db.get_table_as_dict(
                t.gene_protein_sequences_table_name)
            for gene_caller_id in g['gene_caller_ids']:
                output_file.write('>%s_%d\n' %
                                  (g['genome_entry_hash'], gene_caller_id))
                output_file.write(
                    '%s\n' %
                    protein_sequences_dict[gene_caller_id]['sequence'])
            contigs_db.disconnect()

        output_file.close()
        self.progress.end()

        self.run.info(
            'ORFs', '%s protein sequences are stored for analysis.' %
            pp(sum([g['num_genes'] for g in self.genomes.values()])))

        return output_file_path
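
Each defline written above is the genome entry hash and the gene caller id joined by an underscore. A tiny hypothetical helper for reading such deflines back (not part of anvi'o; it only assumes the `>hash_geneid` pattern produced by this function, where the gene caller id is the final integer):

def parse_combined_defline(defline):
    """Split a '>hash_geneid' defline written by gen_combined_proteins_fasta()
    back into its parts. Hypothetical helper, not part of anvi'o; assumes the
    gene caller id is the integer after the last underscore."""
    name = defline.lstrip('>').strip()
    genome_entry_hash, gene_caller_id = name.rsplit('_', 1)
    return genome_entry_hash, int(gene_caller_id)
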
Code Example #3
    def get_HMM_sources_common_to_all_genomes(self):
        """Returns True if all HMM sources in all genomes are comparable"""

        hmm_sources_info_per_genome = {}

        # first recover hmm sources info per genome
        for genome_name in self.genomes:
            if 'hmm_sources_info' not in self.genomes[genome_name]:
                # someone did not run the expensive `init` function. but we can recover this
                # here quite cheaply
                contigs_db = dbops.ContigsDatabase(
                    self.genomes[genome_name]['contigs_db_path'])
                hmm_sources_info = contigs_db.db.get_table_as_dict(
                    t.hmm_hits_info_table_name)
            else:
                hmm_sources_info = self.genomes[genome_name][
                    'hmm_sources_info']

            hmm_sources_info_per_genome[genome_name] = hmm_sources_info

        hmm_sources_found = set([])
        for genome_name in self.genomes:
            hmm_sources_found.update(hmm_sources_info_per_genome[genome_name].keys())

        # find out hmm_sources that occur in all genomes
        hmm_sources_in_all_genomes = copy.deepcopy(hmm_sources_found)
        for genome_name in self.genomes:
            for hmm_source in hmm_sources_found:
                if hmm_source not in hmm_sources_info_per_genome[
                        genome_name] and hmm_source in hmm_sources_in_all_genomes:
                    hmm_sources_in_all_genomes.remove(hmm_source)

        return hmm_sources_in_all_genomes
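
The two loops above amount to intersecting the per-genome source sets. The same result can be expressed directly with `set.intersection`; a standalone sketch, not anvi'o code:

def common_hmm_sources(hmm_sources_info_per_genome):
    """Return the HMM source names present in every genome; equivalent to the
    accumulate-then-prune loops above. `hmm_sources_info_per_genome` maps
    genome_name to a dict keyed by source name."""
    per_genome = [set(info) for info in hmm_sources_info_per_genome.values()]
    return set.intersection(*per_genome) if per_genome else set()
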
Code Example #4
    def get_functions_and_sequences_dicts_from_contigs_db(self, genome_name):
        g = self.genomes[genome_name]

        args = argparse.Namespace(contigs_db=g['contigs_db_path'])
        contigs_super = dbops.ContigsSuperclass(
            args, r=anvio.terminal.Run(verbose=False))

        if self.functions_are_available:
            contigs_super.init_functions(
                requested_sources=list(self.function_annotation_sources))
            function_calls_dict = contigs_super.gene_function_calls_dict
        else:
            function_calls_dict = {}

        # get dna sequences
        gene_caller_ids_list, dna_sequences_dict = contigs_super.get_sequences_for_gene_callers_ids(
            gene_caller_ids_list=list(g['gene_caller_ids']))

        # get amino acid sequences.
        # FIXME: this should be done in the contigs super.
        contigs_db = dbops.ContigsDatabase(g['contigs_db_path'])
        aa_sequences_dict = contigs_db.db.get_table_as_dict(
            t.gene_amino_acid_sequences_table_name)
        contigs_db.disconnect()

        return (function_calls_dict, aa_sequences_dict, dna_sequences_dict)
Code Example #5
File: genomedescriptions.py  Project: mschecht/anvio
    def list_HMM_info_and_quit(self):
        hmm_sources_in_all_genomes = self.get_HMM_sources_common_to_all_genomes()

        # since we know hmm sources in `hmm_sources_in_all_genomes` are common to all genomes,
        # we could use any of those genomes to learn about the specifics of them. here we take
        # the first one from `self.genomes`
        hmm_sources_info = dbops.ContigsDatabase(list(self.genomes.values())[0]['contigs_db_path']).db.get_table_as_dict(t.hmm_hits_info_table_name)

        if self.list_hmm_sources or self.list_available_gene_names:
            if not len(hmm_sources_in_all_genomes):
                raise ConfigError("There are no HMM sources among your external genomes that occur in every genome :/")


        if self.list_hmm_sources:
            self.run.warning(None, 'HMM SOURCES COMMON TO ALL %d GENOMES' % (len(self.genomes)), lc='yellow')
            for source in hmm_sources_in_all_genomes:
                s = hmm_sources_info[source]
                self.run.info_single('%s [type: %s] [num genes: %d]' % (source, s['search_type'], len(s['genes'])))
            sys.exit(0)

        if self.list_available_gene_names:
            self.run.warning(None, 'GENES IN HMM SOURCES COMMON TO ALL %d GENOMES' % (len(self.genomes)), lc='yellow')
            for source in hmm_sources_in_all_genomes:
                s = hmm_sources_info[source]
                gene_names = ', '.join(sorted([g.strip() for g in s['genes'].split(',')]))
                self.run.info_single('%s [type: %s]: %s' % (source, s['search_type'], gene_names), nl_after=2)
            sys.exit(0)
Code Example #6
    def get_functions_and_sequences_dicts_from_contigs_db(
            self, contigs_db_path, gene_caller_ids=None):
        """Returns function calls, dna and amino acid sequences for `gene_caller_ids`
           from a contigs database"""

        args = argparse.Namespace(contigs_db=contigs_db_path)
        contigs_super = dbops.ContigsSuperclass(
            args, r=anvio.terminal.Run(verbose=False))

        # get functions
        if self.functions_are_available:
            contigs_super.init_functions(
                requested_sources=self.function_annotation_sources)
            function_calls_dict = contigs_super.gene_function_calls_dict
        else:
            function_calls_dict = {}

        # get dna sequences
        gene_caller_ids_list, dna_sequences_dict = contigs_super.get_sequences_for_gene_callers_ids(
            gene_caller_ids_list=list(gene_caller_ids))

        # get amino acid sequences.
        # FIXME: this should be done in the contigs super.
        contigs_db = dbops.ContigsDatabase(contigs_db_path)
        aa_sequences_dict = contigs_db.db.get_table_as_dict(
            t.gene_protein_sequences_table_name)
        contigs_db.disconnect()

        return (function_calls_dict, aa_sequences_dict, dna_sequences_dict)
Code Example #7
File: interacdome.py  Project: genomewalker/anvio
    def potentially_remove_previous_interacdome_data(self):
        database = dbops.ContigsDatabase(self.contigs_db_path)

        entries = database.db.get_single_column_from_table(
            tables.amino_acid_additional_data_table_name,
            'data_group',
            unique=True,
            where_clause="data_group='InteracDome'")

        if 'InteracDome' in entries:
            if not self.just_do_it:
                raise ConfigError(
                    "anvi-run-interacdome has already been run on this database. If you want to delete "
                    "this data and run anvi-run-interacdome again, provide the --just-do-it flag."
                )
            else:
                self.run.warning(
                    "You already have InteracDome data in this database, but since you provided the flag "
                    "--just-do-it, anvi'o is in the process of deleting this data, then things will be re-ran."
                )

                self.progress.new("Deleting previous data")
                self.progress.update('...')

                database.db.remove_some_rows_from_table(
                    tables.amino_acid_additional_data_table_name,
                    where_clause='''data_group="InteracDome"''')

                self.progress.end()

        database.disconnect()
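
The guarded delete above goes through anvi'o's database wrapper. For readers unfamiliar with what `remove_some_rows_from_table` boils down to, here is a rough plain-sqlite3 equivalent; the function itself is illustrative only, not anvi'o's implementation:

import sqlite3

def delete_data_group(db_path, table_name, data_group):
    """Rough plain-sqlite3 stand-in for the guarded delete above (the call to
    remove_some_rows_from_table with a where_clause). Illustrative only."""
    conn = sqlite3.connect(db_path)
    conn.execute('DELETE FROM %s WHERE data_group = ?' % table_name,
                 (data_group,))
    conn.commit()
    conn.close()
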
Code Example #8
    def get_genome_hash_for_external_genome(self, entry):
        utils.is_contigs_db(entry['contigs_db_path'])
        contigs_db = dbops.ContigsDatabase(entry['contigs_db_path'])
        genome_hash = contigs_db.meta['contigs_db_hash']
        contigs_db.disconnect()

        return genome_hash
Code Example #9
    def __init__(self, args, run=terminal.Run(), progress=terminal.Progress()):
        self.args = args
        self.run = run
        self.progress = progress

        # initialize self.args parameters
        A = lambda x, t: t(args.__dict__[x]) if x in args.__dict__ else None
        null = lambda x: x
        self.contigs_db_path = A('contigs_db', null)
        self.structure_db_path = A('structure_db', null)
        self.genes_to_remove = A('genes_to_remove', null)
        self.genes_to_remove_path = A('genes_to_remove_file', null)
        self.genes_to_add = A('genes_to_add', null)
        self.genes_to_add_path = A('genes_to_add_file', null)
        self.full_modeller_output = A('dump_dir', null)
        self.modeller_executable = A('modeller_executable', null)
        self.DSSP_executable = None

        utils.is_contigs_db(self.contigs_db_path)
        self.contigs_db = dbops.ContigsDatabase(self.contigs_db_path)
        self.contigs_db_hash = self.contigs_db.meta['contigs_db_hash']

        # init ContigsSuperClass
        self.contigs_super = ContigsSuperclass(self.args)

        if not any([
                self.genes_to_remove, self.genes_to_remove_path,
                self.genes_to_add, self.genes_to_add_path
        ]):
            raise ConfigError(
                "Please specify some genes to add or remove to your database.")

        if self.genes_to_remove and self.genes_to_remove_path:
            raise ConfigError(
                "Provide either --genes-to-remove or --genes-to-remove-path. You provided both."
            )

        if self.genes_to_add and self.genes_to_add_path:
            raise ConfigError(
                "Provide either --genes-to-add or --genes-to-add-path. You provided both."
            )

        if self.genes_to_remove or self.genes_to_remove_path:
            self.run.warning("Removing genes...",
                             header="Updating %s" % self.structure_db_path,
                             lc='green')
            self.load_structure_db()
            remove = self.parse_genes(self.genes_to_remove,
                                      self.genes_to_remove_path)
            self.remove_genes(remove)
            self.structure_db.disconnect()

        if self.genes_to_add or self.genes_to_add_path:
            self.run.warning("Adding genes...",
                             header="Updating %s" % self.structure_db_path,
                             lc='green')
            self.load_structure_db()
            self.add_genes()
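
The `A = lambda x, t: ...` accessor at the top of this constructor appears throughout these examples: it returns None when an argument was never supplied instead of raising. A self-contained sketch of the same idiom:

import argparse

args = argparse.Namespace(contigs_db='CONTIGS.db')

null = lambda x: x  # identity cast, as in the constructor above

def A(x, t=null):
    # null-safe accessor: None when the attribute was never defined,
    # otherwise the value cast through `t`
    return t(getattr(args, x)) if hasattr(args, x) else None

print(A('contigs_db'))             # CONTIGS.db
print(A('structure_db'))           # None
print(A('contigs_db', str.upper))  # CONTIGS.DB
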
Code Example #10
File: genomedescriptions.py  Project: mschecht/anvio
    def get_genome_hash_for_internal_genome(self, entry):
        utils.is_contigs_db(entry['contigs_db_path'])
        split_names_of_interest = self.get_split_names_of_interest_for_internal_genome(entry)
        contigs_db = dbops.ContigsDatabase(entry['contigs_db_path'])
        genome_hash = hashlib.sha224('_'.join([''.join(split_names_of_interest), contigs_db.meta['contigs_db_hash']]).encode('utf-8')).hexdigest()[0:12]
        contigs_db.disconnect()

        return genome_hash
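
The hash above is a truncated SHA-224 over the concatenated split names plus the parent database hash, which makes an internal genome's identity depend on both the collection content and the contigs database it came from. The same recipe, pulled out in isolation:

import hashlib

def internal_genome_hash(split_names_of_interest, contigs_db_hash):
    """Mirror of the hashing recipe above: SHA-224 over the joined split
    names plus the parent contigs db hash, truncated to 12 hex characters."""
    text = '_'.join([''.join(split_names_of_interest), contigs_db_hash])
    return hashlib.sha224(text.encode('utf-8')).hexdigest()[0:12]
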
Code Example #11
    def get_genes_and_functions_from_contigs_db(self, contigs_db_path):
        """This method will extract a list of gene attributes from each contig within a contigsDB.

        Returns
        =======
        output : list of lists
            first element is gene_caller_id, second is function accession, third is the contig name
        """

        # get contigsDB
        contigs_db = dbops.ContigsDatabase(contigs_db_path)

        # extract contigs names
        genes_in_contigs = contigs_db.db.get_table_as_dict(
            t.genes_in_contigs_table_name)

        # extract annotations and filter for the sources designated by user using self.annotation_source
        annotations_dict = contigs_db.db.get_table_as_dict(
            t.gene_function_calls_table_name)
        annotations_dict = utils.get_filtered_dict(
            annotations_dict, 'source', set([self.annotation_source]))

        # Make dict with gene-caller-id:accession
        gene_callers_id_to_accession_dict = {
            entry['gene_callers_id']: entry['accession']
            for entry in annotations_dict.values()
        }

        # Make a list of lists containing gene attributes. If there is no annotation, add one in!
        genes_and_functions_list = []  # List of lists: [gene-caller-id, accession, contig-name]
        for gene_callers_id in genes_in_contigs:
            list_of_gene_attributes = []

            if gene_callers_id in gene_callers_id_to_accession_dict:
                accession = gene_callers_id_to_accession_dict[gene_callers_id]
                accession = accession.replace(" ", "")
                contig_name = genes_in_contigs[gene_callers_id]['contig']
                list_of_gene_attributes.extend(
                    (gene_callers_id, accession, contig_name))
                genes_and_functions_list.append(list_of_gene_attributes)
            else:
                # adding in "unknown annotation" if there is none
                accession = "unknown-function"
                contig_name = genes_in_contigs[gene_callers_id]['contig']
                list_of_gene_attributes.extend(
                    (gene_callers_id, accession, contig_name))
                genes_and_functions_list.append(list_of_gene_attributes)

        return genes_and_functions_list
Code Example #12
    def load_metagenome_descriptions(self, skip_functions=False, init=True):
        """Load metagenome descriptions"""

        # start with a sanity check to make sure names are distinct
        self.names_check()

        self.metagenome_names = list(self.metagenomes_dict.keys())

        for metagenome_name in self.metagenomes_dict:
            self.metagenomes[metagenome_name] = self.metagenomes_dict[
                metagenome_name]
            for db_path_var in ['contigs_db_path', 'profile_db_path']:
                if db_path_var not in self.metagenomes[metagenome_name]:
                    continue
                path = self.metagenomes[metagenome_name][db_path_var]

                if not path:
                    raise ConfigError(
                        "Bad news: anvi'o was loading metagenome desriptions, and it run into an empty path for "
                        "the metagenome %s. How did this happen? HOW? :(" %
                        metagenome_name)

                if not path.startswith('/'):
                    self.metagenomes[metagenome_name][
                        db_path_var] = os.path.abspath(
                            os.path.join(
                                os.path.dirname(
                                    self.input_file_for_metagenomes), path))

            # while we are going through all genomes and reconstructing self.metagenomes for the first time,
            # let's add the 'name' attribute in it as well.
            self.metagenomes[metagenome_name]['name'] = metagenome_name

        # add hashes for each metagenome in the self.metagenomes dict.
        self.metagenome_hash_to_metagenome_name = {}
        for metagenome_name in self.metagenome_names:
            g_hash = self.get_metagenome_hash(
                self.metagenomes[metagenome_name])
            self.metagenomes[metagenome_name]['metagenome_hash'] = g_hash
            self.metagenome_hash_to_metagenome_name[g_hash] = metagenome_name

        for metagenome_name in self.metagenomes:
            g = self.metagenomes[metagenome_name]
            contigs_db = dbops.ContigsDatabase(g['contigs_db_path'])
            for key in contigs_db.meta:
                g[key] = contigs_db.meta[key]

        # make sure it is OK to go with self.metagenomes
        self.sanity_check()
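
Note how relative database paths are resolved against the directory of the metagenomes input file, not the current working directory. That logic on its own (keeping the `startswith('/')` test the original uses; `os.path.isabs` would be the portable check):

import os

def resolve_db_path(path, input_file_for_metagenomes):
    """Resolve `path` the way load_metagenome_descriptions() does: absolute
    paths pass through, relative ones are anchored at the directory of the
    input file that mentioned them."""
    if path.startswith('/'):
        return path
    return os.path.abspath(
        os.path.join(os.path.dirname(input_file_for_metagenomes), path))
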
Code Example #13
    def __init__(self, args, r=run, p=progress):
        self.run = r
        self.progress = p

        A = lambda x: args.__dict__[x] if x in args.__dict__ else None
        self.profile_db_path = A('profile_db')
        self.contigs_db_path = A('contigs_db')
        self.num_clusters_requested = A('num_clusters_requested') or 80

        dbops.is_profile_db_and_contigs_db_compatible(self.profile_db_path,
                                                      self.contigs_db_path)

        self.clusters = {}

        self.lengths = {}
        self.kmers = {}
        self.coverages = {}

        self.debug = args.debug

        self.progress.new('Init')

        self.progress.update('accessing the profile database ...')
        profile_db = dbops.ProfileDatabase(args.profile_db)

        if not int(profile_db.meta['merged']):
            self.progress.end()
            raise ConfigError(
                'CONCOCT can only be used to cluster merged runs...')

        self.coverages = profile_db.db.get_table_as_dict(
            'mean_coverage_contigs', columns_of_interest=profile_db.samples)
        profile_db.disconnect()

        self.progress.update('accessing the contigs database ...')
        contigs_db = dbops.ContigsDatabase(args.contigs_db, quiet=True)
        self.kmers = contigs_db.db.get_table_as_dict(
            'kmer_contigs', keys_of_interest=list(self.coverages.keys()))
        splits_basic_info = contigs_db.db.get_table_as_dict(
            'splits_basic_info', keys_of_interest=list(self.coverages.keys()))
        contigs_db.disconnect()

        self.progress.update('computing split lengths ...')
        for split_name in splits_basic_info:
            self.lengths[split_name] = splits_basic_info[split_name]['length']

        self.progress.end()
Code Example #14
File: interacdome.py  Project: shiyi-pan/anvio
    def store(self):
        self.progress.new("Storing")
        self.progress.update("binding frequencies in contigs database")

        contigs_db = dbops.ContigsDatabase(self.contigs_db_path, run=self.run, progress=self.progress)

        # This is horrible. But we hijacked the miscdata framework to create
        # amino_acid_additional_data table in the contigs DB, so we must live with the consequences
        self.amino_acid_additional_data = self.avg_bind_freq.rename(columns={
            'ligand': 'data_key',
            'binding_freq': 'data_value',
        })
        # Prepend LIG_ to the data_keys
        self.amino_acid_additional_data['data_key'] = 'LIG_' + self.amino_acid_additional_data['data_key']
        self.amino_acid_additional_data['item_name'] = (self.amino_acid_additional_data['gene_callers_id'].astype(str) + ':' +
                                                        self.amino_acid_additional_data['codon_order_in_gene'].astype(str))
        self.amino_acid_additional_data.drop(['gene_callers_id', 'codon_order_in_gene'], axis=1, inplace=True)
        self.amino_acid_additional_data['data_type'] = 'float'
        self.amino_acid_additional_data['data_group'] = 'InteracDome'

        contigs_db.db.insert_rows_from_dataframe(
            tables.amino_acid_additional_data_table_name,
            self.amino_acid_additional_data
        )

        contigs_db.disconnect()
        self.progress.reset()
        num_residues = self.amino_acid_additional_data['item_name'].nunique()
        self.run.info_single(f"Binding frequencies for {num_residues} unique positions successfully stored in {self.contigs_db_path}", mc='green', nl_before=1)

        self.progress.update("HMMER hit table")
        dom_hit_filepath = self.output_prefix + '-domain_hits.txt'
        self.hmm_out.dom_hits.to_csv(dom_hit_filepath, sep='\t', index=False)
        self.progress.reset()
        self.run.info_single(f"Domain hit summaries stored in {dom_hit_filepath}", mc='green')

        self.progress.update("Match state contributors")
        match_state_contributors_filepath = self.output_prefix + '-match_state_contributors.txt'
        self.bind_freq.to_csv(match_state_contributors_filepath, sep='\t', index=False)
        self.progress.reset()
        self.run.info_single(f"Match state contributors stored in {match_state_contributors_filepath}", mc='green')

        self.run.warning(f"You got here, which is great! That means any binding frequencies were stored in your contigs database. "
                         f"In addition, a couple of files were stored in your current directory (with the prefix '{self.output_prefix}') "
                         f"that you may want to keep around if you plan on doing very refined analyses.", header='THINGS HAPPENED', lc='green')

        self.progress.end()
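
The pandas reshaping above converts a wide binding-frequency table into the key/value rows the misc-data table expects. A tiny self-contained illustration of the same moves, with made-up numbers rather than real InteracDome output:

import pandas as pd

# made-up binding frequencies standing in for self.avg_bind_freq
avg_bind_freq = pd.DataFrame({
    'gene_callers_id':     [1, 1, 2],
    'codon_order_in_gene': [0, 5, 3],
    'ligand':              ['ADP', 'ZN', 'ADP'],
    'binding_freq':        [0.91, 0.42, 0.77],
})

df = avg_bind_freq.rename(columns={'ligand': 'data_key',
                                   'binding_freq': 'data_value'})
df['data_key'] = 'LIG_' + df['data_key']
df['item_name'] = (df['gene_callers_id'].astype(str) + ':' +
                   df['codon_order_in_gene'].astype(str))
df = df.drop(['gene_callers_id', 'codon_order_in_gene'], axis=1)
df['data_type'] = 'float'
df['data_group'] = 'InteracDome'
print(df)  # one key/value row per residue/ligand pair, e.g. item_name '1:0'
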
Code Example #15
File: panops.py  Project: psaxcode/anvio
    def gen_protein_sequences_dict(self):
        self.run.info('Exclude partial gene calls',
                      self.exclude_partial_gene_calls,
                      nl_after=1)

        total_num_protein_sequences = 0
        total_num_excluded_protein_sequences = 0

        for genome_name in self.genomes:
            self.progress.new('Reading protein sequences into memory')
            self.progress.update('...')
            g = self.genomes[genome_name]

            self.protein_sequences_dict[genome_name] = {}

            self.progress.update('Working on %s ...' % genome_name)
            contigs_db = dbops.ContigsDatabase(g['contigs_db_path'])
            protein_sequences_dict = contigs_db.db.get_table_as_dict(
                t.gene_protein_sequences_table_name)

            total_num_excluded_protein_sequences += len(g['excluded_gene_ids'])

            for gene_caller_id in g['gene_caller_ids']:
                self.protein_sequences_dict[genome_name][
                    gene_caller_id] = protein_sequences_dict[gene_caller_id][
                        'sequence']
                total_num_protein_sequences += 1

            self.progress.end()

            self.run.info_single(
                '%s is initialized with %s genes (%s were excluded)' %
                (genome_name, pp(len(
                    g['gene_caller_ids'])), pp(len(g['excluded_gene_ids']))),
                cut_after=120)

            contigs_db.disconnect()

        self.run.info('Num protein sequences',
                      '%s' % pp(total_num_protein_sequences),
                      nl_before=1)
        self.run.info('Num excluded gene calls',
                      '%s' % pp(total_num_excluded_protein_sequences))
Code Example #16
    def process(self):
        """Processes all sequences in a given contigs database or a FASTA file.

        What this function does depends on the configuration of the class. Member functions `find_gapless`
        or `find_with_gaps` may be more appropriate to call if there is a single sequence to process.
        """

        if self.contigs_db_path:
            contigs_db = dbops.ContigsDatabase(self.contigs_db_path)
            contig_sequences_dict = contigs_db.db.get_table_as_dict(
                anvio.tables.contig_sequences_table_name)

            self.progress.new('Searching',
                              progress_total_items=len(contig_sequences_dict))
            for sequence_name in contig_sequences_dict:
                self.progress.update(
                    f"{sequence_name} ({pp(len(contig_sequences_dict[sequence_name]['sequence']))} nts)",
                    increment=True)
                self.find(contig_sequences_dict[sequence_name]['sequence'],
                          sequence_name=sequence_name)
            self.progress.end()

        elif self.fasta_file_path:
            num_sequences = utils.get_num_sequences_in_fasta(
                self.fasta_file_path)
            fasta = anvio.fastalib.SequenceSource(self.fasta_file_path)
            self.progress.new('Searching', progress_total_items=num_sequences)

            while next(fasta):
                self.progress.update(f"{fasta.id} ({pp(len(fasta.seq))} nts)",
                                     increment=True)
                self.find(fasta.seq, sequence_name=fasta.id)
            self.progress.end()

        else:
            raise ConfigError(
                "You called the `process` function of the class `Palindromes` without a FASTA "
                "file or contigs database to process :(")

        self.report()
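
The `while next(fasta)` loop is specific to anvi'o's fastalib `SequenceSource`. For comparison, a dependency-free sketch of the same walk over a FASTA file, invoking a callback per record (illustrative only):

def iterate_fasta(fasta_file_path, callback):
    """Call callback(sequence_name, sequence) for every record in a FASTA
    file; a plain-Python stand-in for the fastalib loop in process()."""
    name, chunks = None, []
    with open(fasta_file_path) as fp:
        for line in fp:
            line = line.strip()
            if line.startswith('>'):
                if name is not None:
                    callback(name, ''.join(chunks))
                name, chunks = line[1:].split()[0], []
            elif line:
                chunks.append(line)
    if name is not None:
        callback(name, ''.join(chunks))
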
Code Example #17
    def get_HMM_sources_common_to_all_genomes(self, dont_raise=False):
        """Returns True if all HMM sources in all genomes are comparable"""
        hmm_sources_found = set([])
        for genome_name in self.genomes:
            if 'hmm_sources_info' not in self.genomes[genome_name]:
                # someone did not run the expensive `init` function. but we can recover this
                # here quite cheaply
                contigs_db = dbops.ContigsDatabase(
                    self.genomes[genome_name]['contigs_db_path'])
                hmm_sources_info = contigs_db.db.get_table_as_dict(
                    t.hmm_hits_info_table_name)
                for hmm_source in hmm_sources_info:
                    hmm_sources_info[hmm_source]['genes'] = sorted([
                        g.strip() for g in hmm_sources_info[hmm_source]
                        ['genes'].split(',')
                    ])
            else:
                hmm_sources_info = self.genomes[genome_name][
                    'hmm_sources_info']

            hmm_sources_info_per_genome[genome_name] = hmm_sources_info
            hmm_sources_found.update(hmm_sources_info.keys())

        # find out hmm_sources that occur in all genomes
        hmm_sources_in_all_genomes = copy.deepcopy(hmm_sources_found)
        for genome_name in self.genomes:
            for hmm_source in hmm_sources_found:
                if hmm_source not in hmm_sources_info_per_genome[genome_name] and hmm_source in hmm_sources_in_all_genomes:
                    hmm_sources_in_all_genomes.remove(hmm_source)

        if not len(hmm_sources_in_all_genomes):
            if dont_raise:
                return None

            raise ConfigError(
                "There are no HMM sources among your external genomes that occur in every genome :/"
            )

        return hmm_sources_in_all_genomes
Code Example #18
File: bamops.py  Project: paczian/anvio
    def get_short_reads_for_splits_dict(self):
        short_reads_for_splits_dict = {}

        self.progress.new('Accessing reads')
        self.progress.update('Reading splits info from the contigs database ...')
        contigs_db = dbops.ContigsDatabase(self.contigs_db_path)
        splits_basic_info = contigs_db.db.get_table_as_dict(t.splits_info_table_name)
        contigs_db.disconnect()

        self.progress.update('Identifying contigs associated with splits ...')
        contigs_involved = utils.get_contigs_splits_dict(self.split_names_of_interest, splits_basic_info)

        # this variable will hold a list of (contig_id, start, stop) tuples
        # for each contig and the start and stop positions of sequential blocks
        # of splits identified within them
        contig_start_stops = []

        self.progress.update('Computing start/stops positions of interest in %d contigs ...' % (len(contigs_involved)))
        for contig_id in contigs_involved:
            splits_order = contigs_involved[contig_id].keys()
            sequential_blocks = ccollections.GetSequentialBlocksOfSplits(splits_order).process()

            for sequential_block in sequential_blocks:
                first_split = contigs_involved[contig_id][sequential_block[0]]
                last_split = contigs_involved[contig_id][sequential_block[-1]]

                contig_start_stops.append((contig_id,
                                           splits_basic_info[first_split]['start'],
                                           splits_basic_info[last_split]['end']),)

        # at this point contig_start_stops knows every contig we are interested in, and
        # their start and stop positions based on what split ids were requested. we
        # shall go through each bam file the user is interested, and get those short reads
        # that map to regions of interest:
        for bam_file_path in self.input_bam_files:
            bam_file_name = '.'.join(os.path.basename(bam_file_path).split('.')[:-1])

            self.progress.update('Creating a dictionary of matching short reads in %s ...' % bam_file_name)

            bam_file = pysam.Samfile(bam_file_path, 'rb')
            for contig_id, start, stop in contig_start_stops:
                for entry in bam_file.fetch(contig_id, start, stop):
                    '''
                    here's what's available in the entry object:
                    
                    ['aend', 'alen', 'aligned_pairs', 'bin', 'blocks', 'cigar', 'cigarstring', 'cigartuples', 'compare',
                     'flag', 'get_aligned_pairs', 'get_blocks', 'get_overlap', 'get_reference_positions', 'get_tag',
                     'get_tags', 'has_tag', 'infer_query_length', 'inferred_length', 'is_duplicate', 'is_paired', 
                     'is_proper_pair', 'is_qcfail', 'is_read1', 'is_read2', 'is_reverse', 'is_secondary', 'is_supplementary',
                     'is_unmapped', 'isize', 'mapping_quality', 'mapq', 'mate_is_reverse', 'mate_is_unmapped', 'mpos', 'mrnm',
                     'next_reference_id', 'next_reference_start', 'opt', 'overlap', 'pnext', 'pos', 'positions', 'qend', 
                     'qlen', 'qname', 'qqual', 'qstart', 'qual', 'query', 'query_alignment_end', 'query_alignment_length',
                     'query_alignment_qualities', 'query_alignment_sequence', 'query_alignment_start', 'query_length',
                     'query_name', 'query_qualities', 'query_sequence', 'reference_end', 'reference_id', 'reference_length',
                     'reference_start', 'rlen', 'rname', 'rnext', 'seq', 'setTag', 'set_tag', 'set_tags', 'tags', 'template_length', 'tid', 'tlen']'''

                    # we are doing this only for single reads, but it should probably take the paired-end case into account as well.
                    short_reads_for_splits_dict['_'.join([contig_id, str(start), str(stop), entry.query_name, bam_file_name])] = entry.query_sequence

        self.progress.end()

        return short_reads_for_splits_dict
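
The inner loop uses pysam's standard region-fetch pattern; `pysam.AlignmentFile` is the current name for the `pysam.Samfile` alias used above. A minimal standalone version of that pattern (path and coordinates are placeholders):

import pysam

def reads_in_region(bam_file_path, contig_id, start, stop):
    """Collect query name -> sequence for reads overlapping a region,
    mirroring the inner loop of get_short_reads_for_splits_dict()."""
    reads = {}
    with pysam.AlignmentFile(bam_file_path, 'rb') as bam_file:
        for entry in bam_file.fetch(contig_id, start, stop):
            reads[entry.query_name] = entry.query_sequence
    return reads
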
Code Example #19
File: merger.py  Project: semiller10/anvio
    def sanity_check(self):
        self.output_directory = filesnpaths.check_output_directory(
            self.output_directory,
            ok_if_exists=self.overwrite_output_destinations)

        if not self.contigs_db_path:
            raise ConfigError(
                "You must provide a contigs database for this operation.")

        if not os.path.exists(self.contigs_db_path):
            raise ConfigError(
                "Anvi'o couldn't find the contigs database where you said it would be :/"
            )

        if self.enforce_hierarchical_clustering and self.skip_hierarchical_clustering:
            raise ConfigError(
                "You are confusing anvi'o :/ You can't tell anvi'o to skip hierarchical clustering\
                                while also asking it to enforce it.")

        self.check_dbs_to_be_merged()

        self.populate_profile_dbs_info_dict()

        self.populate_layer_additional_data_dict()

        self.sample_ids_found_in_input_dbs = sorted([
            v['sample_id'] for v in list(self.profile_dbs_info_dict.values())
        ])
        if len(self.profile_dbs_info_dict) != len(
                set(self.sample_ids_found_in_input_dbs)):
            raise ConfigError(
                "Sample ids in each single profile database to be merged must be unique. But it is not the case\
                               with your input :/ Here are the sample names in case you would like to find out which ones occur\
                               more than once: '%s'" %
                (', '.join(self.sample_ids_found_in_input_dbs)))

        # test open the contigs database (and learn its hash while doing it) to make sure we don't have
        # a deal breaker just yet
        contigs_db = dbops.ContigsDatabase(self.contigs_db_path, quiet=True)
        contigs_db_hash = contigs_db.meta['contigs_db_hash']
        contigs_db.disconnect()

        for k, p in [
            ('total_length', 'The number of nucleotides described'),
            ('num_contigs', 'The number of contigs'),
            ('version', 'The version number'),
            ('num_splits', 'The number of splits'),
            ('min_contig_length', 'The minimum contig length (-M) values'),
            ('max_contig_length',
             'The maximum contig length (--max-contig-length) values'),
            ('min_coverage_for_variability',
             'The minimum coverage values to report variability (-V)'),
            ('report_variability_full',
             'Whether to report full variability (--report-variability-full) flags'
             ), ('SCVs_profiled', 'Profile SCVs flags (--profile-SCVs)'),
            ('SNVs_profiled', 'SNV profiling flags (--skip-SNV-profiling)')
        ]:
            v = set([r[k] for r in list(self.profile_dbs_info_dict.values())])
            if len(v) > 1:
                if anvio.FORCE:
                    self.run.warning(
                        "Anvio'o found out that %s is not identical across all your profiles, but since you\
                                      have used the `--force` flag, it will continue with the merge. This is very\
                                      dangerous, and even if merging finishes successfully, it does not mean you can trust\
                                      your results to be error free. We believe you are prepared to deal with potential\
                                      implications of forcing things because you are awesome."
                        % p,
                        lc="cyan")
                else:
                    raise ConfigError(
                        "Ouch. %s are not identical for all profiles to be merged, which is a \
                                       deal breaker. All profiles that are going to be merged must be\
                                       run with identical flags and parameters :/ You really shouldn't but if you want to\
                                       try to force things because you believe this is due to a misunderstanding, you can\
                                       use the flag --force. While you are considering this as an option, please also\
                                       remember that we advise against it."
                        % p)

        # get split names from one of the profile databases. split names must be identical across all
        self.split_names = sorted(
            list(
                utils.get_all_item_names_from_the_database(
                    list(self.profile_dbs_info_dict.keys())[0])))

        # make sure all runs were profiled using the same contigs database (if one used):
        hashes_for_profile_dbs = set([
            r['contigs_db_hash'] for r in self.profile_dbs_info_dict.values()
        ])
        if len(hashes_for_profile_dbs) != 1:
            if None in hashes_for_profile_dbs:
                raise ConfigError(
                    "It seems there is at least one run in the mix that was profiled using an\
                                          contigs database, and at least one other that was profiled without using\
                                          one. This is not good. All runs must be profiled using the same contigs\
                                          database, or all runs must be profiled without a contigs database :/"
                )
            else:
                raise ConfigError(
                    "It seems these runs were profiled using different contigs databases (or\
                                          different versions of the same contigs database). All runs must be\
                                          profiled using the same contigs database, or all runs must be profiled\
                                          without a contigs database :/")

        # make sure the hash for contigs db is identical across all profile databases:
        if list(hashes_for_profile_dbs)[0] != contigs_db_hash:
            raise ConfigError(
                "The contigs database you provided, which is identified with hash '%s', does\
                                      not seem to match the run profiles you are trying to merge, which share the\
                                      hash identifier of '%s'. What's up with that?"
                % (contigs_db_hash, list(hashes_for_profile_dbs)[0]))

        # do we have a description file?
        if self.description_file_path:
            filesnpaths.is_file_plain_text(self.description_file_path)
            self.description = open(
                os.path.abspath(self.description_file_path), 'r').read()
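
Each metadata consistency check above boils down to "the set of values for key k across all profiles has size one". That core test, in isolation:

def keys_with_conflicting_values(profile_dbs_info_dict, keys):
    """Return the metadata keys whose values differ across the profile
    databases to be merged -- the same test the loop above performs."""
    return [k for k in keys
            if len(set(r[k] for r in profile_dbs_info_dict.values())) > 1]
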
Code Example #20
    def process(self, output_dir, drop_previous_annotations=False):
        """Takes an anvi'o contigs database, and does its magic.
        
        Which involves exporting amino acid sequences for gene calls, running emapper.py on them,\
        parsing the output, and storing the results in the contigs database.
        """

        if not self.contigs_db_path:
            raise ConfigError, "EggNOGMapper::process() is speaking: you can't really call this function if you inherited\
                                this class without a contigs database path :/ What are you doing?"

        filesnpaths.is_output_dir_writable(output_dir)

        contigs_db = dbops.ContigsDatabase(self.contigs_db_path)
        if not contigs_db.meta['genes_are_called']:
            raise ConfigError("It seems genes were not called for this contigs database (%s). This is a "
                              "total no-no since we will need them to get amino acid sequences for functional "
                              "annotation :/" % self.contigs_db_path)

        aa_sequences_list = contigs_db.db.get_table_as_list_of_tuples(
            t.gene_protein_sequences_table_name)
        num_aa_sequences = len(aa_sequences_list)
        contigs_db.disconnect()

        # change the current work directory
        work_dir = os.getcwd()
        os.chdir(output_dir)

        self.run.info('Work directory for temporary files', output_dir)
        self.run.info('Num threads to use', self.num_threads)
        self.run.info('Target database', self.database, mc='red')
        self.run.info('Use memory', self.usemem)
        self.run.info('Genes found', num_aa_sequences, mc='green')
        self.run.info('AA sequences', self.aa_sequences_file_name)

        self.progress.new('Processing')
        self.progress.update('Storing gene sequences ...')

        aa_sequences_fp = open(self.aa_sequences_file_name, 'w')
        for gene_callers_id, aa_sequence in aa_sequences_list:
            aa_sequences_fp.write(
                '>%s%d\n%s\n' %
                (self.gene_caller_id_prefix, gene_callers_id, aa_sequence))
        aa_sequences_fp.close()
        del aa_sequences_list

        cmd_line = [
            self.executable, '-i', self.aa_sequences_file_name, '--output',
            self.output_file_prefix
        ]

        # num threads
        cmd_line.extend(['--cpu', self.num_threads
                         ]) if self.num_threads else None

        # usemem
        cmd_line.extend(['--usemem']) if self.usemem else None

        # database
        cmd_line.extend(['--database', self.database])

        self.progress.update(
            'Running eggnog-mapper on %d sequences. This may take a while ...'
            % num_aa_sequences)
        utils.run_command(cmd_line, self.log_file_path)

        if not os.path.exists(self.annotations_file_name):
            self.progress.end()
            raise ConfigError("Something went wrong with eggnog-mapper :( The annotations file is not where it is supposed to be. "
                              "If you are lucky, this log file will have enough output information for you to make sense of "
                              "what went wrong: '%s'. Due to this error, the output directory will be kept as is, and you "
                              "will have to remove it manually. Sorry about the inconvenience! Anvi'o developers know how much "
                              "it sucks when things just don't work." % os.path.join(output_dir, self.log_file_path))

        self.progress.end()

        # we are done, and the annotations file is there.
        self.populate_annotations_dict(
            os.path.join(output_dir, self.annotations_file_name))
        os.chdir(work_dir)

        # alright. store annotations into the database
        self.store_annotations_in_db(
            drop_previous_annotations=drop_previous_annotations)
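
The `cmd_line.extend(...) if flag else None` expressions above are valid Python but hide mutation inside expression statements. The same command assembly with plain conditionals (a sketch; the 'bact' default is a placeholder, not an emapper.py recommendation):

def build_emapper_cmd(executable, input_fasta, output_prefix,
                      num_threads=None, usemem=False, database='bact'):
    # mirrors the command built in process() above, with explicit `if`s
    cmd_line = [executable, '-i', input_fasta, '--output', output_prefix]
    if num_threads:
        cmd_line.extend(['--cpu', str(num_threads)])
    if usemem:
        cmd_line.append('--usemem')
    cmd_line.extend(['--database', database])
    return cmd_line
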
Code Example #21
    def __init__(self,
                 args,
                 skip_sanity_check=False,
                 run=terminal.Run(),
                 progress=terminal.Progress()):
        self.args = args
        self.run = run
        self.progress = progress

        A = lambda x: args.__dict__[x] if x in args.__dict__ else None
        self.bams_and_profiles_file_path = A('bams_and_profiles')

        if not self.bams_and_profiles_file_path:
            raise ConfigError(
                "Sorry, you can't get an instance of this class without a `--bams-and-profiles` argument."
            )

        # get these filled in immediately
        self.contigs_db_path, self.profile_db_bam_file_pairs = utils.get_bams_and_profiles_txt_as_data(
            self.bams_and_profiles_file_path)
        self.profile_db_paths = [
            e['profile_db_path']
            for e in self.profile_db_bam_file_pairs.values()
        ]

        # params to identify regions of interest. if you are studying the code, don't forget to read
        # the information stored in the help menu of the program about these parameters
        self.min_coverage_to_define_stretches = A(
            'min_coverage_to_define_stretches') or 10
        self.min_stretch_length = A('min_stretch_length') or 50
        self.min_distance_between_independent_stretches = A(
            'min_distance_between_independent_stretches') or 2000
        self.num_nts_to_pad_a_stretch = A('num_nts_to_pad_a_stretch') or 100

        # palindrome search parameters
        self.min_palindrome_length = A('min_palindrome_length') or 10
        self.max_num_mismatches = A('max_num_mismatches') or 0
        self.min_distance_palindrome = A('min_distance') or 50

        # parameters to survey inversions
        self.process_only_inverted_reads = A('process_only_inverted_reads')

        # be talkative or not
        self.verbose = A('verbose')

        # debugging mode:
        self.only_report_from = A('only_report_from')

        if self.only_report_from:
            self.verbose = True

        if not skip_sanity_check:
            self.sanity_check()

        # we will generate our splits info and contigs to splits dicts here.
        split_names = utils.get_all_item_names_from_the_database(
            self.profile_db_paths[0])
        contigs_db = dbops.ContigsDatabase(self.contigs_db_path,
                                           run=run_quiet,
                                           progress=progress_quiet)
        self.splits_basic_info = contigs_db.db.smart_get(
            t.splits_info_table_name, column='split', data=split_names)
        self.contig_sequences = contigs_db.db.get_table_as_dict(
            t.contig_sequences_table_name)
        contigs_db.disconnect()

        # next, we will generate a dictionary to convert contig names to split names
        self.contig_name_to_split_names = {}
        for split_name in sorted(self.splits_basic_info.keys()):
            contig_name = self.splits_basic_info[split_name]['parent']

            if contig_name not in self.contig_name_to_split_names:
                self.contig_name_to_split_names[contig_name] = []

            self.contig_name_to_split_names[contig_name].append(split_name)

        # let's have a variable of convenience:
        self.contig_names = sorted(list(
            self.contig_name_to_split_names.keys()))
Code Example #22
File: merger.py  Project: mruehlemann/anvio
    def sanity_check(self):
        self.output_directory = filesnpaths.check_output_directory(
            self.output_directory,
            ok_if_exists=self.overwrite_output_destinations)

        if not self.contigs_db_path:
            raise ConfigError(
                "You must provide a contigs database for this operation.")

        if not os.path.exists(self.contigs_db_path):
            raise ConfigError(
                "Anvi'o couldn't find the contigs database where you said it would be :/"
            )

        if self.enforce_hierarchical_clustering and self.skip_hierarchical_clustering:
            raise ConfigError(
                "You are confusing anvi'o :/ You can't tell anvi'o to skip hierarchical clustering\
                                while also asking it to enforce it.")

        self.populate_profile_dbs_info_dict()

        self.sample_ids_found_in_input_dbs = sorted([
            v['sample_id'] for v in list(self.profile_dbs_info_dict.values())
        ])
        if len(self.profile_dbs_info_dict) != len(
                set(self.sample_ids_found_in_input_dbs)):
            raise ConfigError(
                "Sample ids in each single profile database to be merged must be unique. But it is not the case\
                               with your input :/ Here are the sample names in case you would like to find out which ones occur\
                               more than once: '%s'" %
                (', '.join(self.sample_ids_found_in_input_dbs)))

        # test open the contigs database (and learn its hash while doing it) to make sure we don't have
        # a deal breaker just yet
        contigs_db = dbops.ContigsDatabase(self.contigs_db_path, quiet=True)
        contigs_db_hash = contigs_db.meta['contigs_db_hash']
        contigs_db.disconnect()

        for k, p in [
            ('total_length', 'The number of nucleotides described'),
            ('num_contigs', 'The number of contigs'),
            ('version', 'The version number'),
            ('num_splits', 'The number of splits'),
            ('min_contig_length', 'The minimum contig length (-M) values'),
            ('min_coverage_for_variability',
             'The minimum coverage values to report variability (-V)'),
            ('report_variability_full',
             'Whether to report full variability (--report-variability-full) flags'
             ),
            ('AA_frequencies_profiled',
             'Profile AA frequencies flags (--profile-AA-frequencies)'),
            ('SNVs_profiled', 'SNV profiling flags (--skip-SNV-profiling)')
        ]:
            v = set([r[k] for r in list(self.profile_dbs_info_dict.values())])
            if len(v) > 1:
                raise ConfigError(
                    "%s are not identical for all profiles to be merged, which is a \
                                    deal breaker. All profiles that are going to be merged must be\
                                    run with identical flags and parameters :/"
                    % p)

        # get split names from one of the profile databases. split names must be identical across all
        self.split_names = sorted(
            list(
                dbops.get_split_names_in_profile_db(
                    list(self.profile_dbs_info_dict.keys())[0])))

        # make sure all runs were profiled using the same contigs database (if one used):
        hashes_for_profile_dbs = set([
            r['contigs_db_hash'] for r in self.profile_dbs_info_dict.values()
        ])
        if len(hashes_for_profile_dbs) != 1:
            if None in hashes_for_profile_dbs:
                raise ConfigError(
                    "It seems there is at least one run in the mix that was profiled using an\
                                          contigs database, and at least one other that was profiled without using\
                                          one. This is not good. All runs must be profiled using the same contigs\
                                          database, or all runs must be profiled without a contigs database :/"
                )
            else:
                raise ConfigError(
                    "It seems these runs were profiled using different contigs databases (or\
                                          different versions of the same contigs database). All runs must be\
                                          profiled using the same contigs database, or all runs must be profiled\
                                          without a contigs database :/")

        # make sure the hash for contigs db is identical across all profile databases:
        if list(hashes_for_profile_dbs)[0] != contigs_db_hash:
            raise ConfigError(
                "The contigs database you provided, which is identified with hash '%s', does\
                                      not seem to match the run profiles you are trying to merge, which share the\
                                      hash identifier of '%s'. What's up with that?"
                % (contigs_db_hash, list(hashes_for_profile_dbs)[0]))

        # do we have a description file?
        if self.description_file_path:
            filesnpaths.is_file_plain_text(self.description_file_path)
            self.description = open(
                os.path.abspath(self.description_file_path), 'r').read()
Code Example #23
    def init_functions(self):
        # check whether function calls are available for all genomes involved, and whether function sources for each genome are identical
        genomes_with_no_functional_annotation = []
        function_annotation_sources_per_genome = {}
        all_function_annotation_sources_observed = set([])
        for genome_name in self.genomes:
            g = self.genomes[genome_name]
            contigs_db = dbops.ContigsDatabase(g['contigs_db_path'])
            sources = contigs_db.meta['gene_function_sources']
            contigs_db.disconnect()

            if not sources:
                genomes_with_no_functional_annotation.append(genome_name)
            else:
                function_annotation_sources_per_genome[genome_name] = sources
                all_function_annotation_sources_observed.update(sources)

        if genomes_with_no_functional_annotation:
            if len(genomes_with_no_functional_annotation) == len(self.genomes):
                self.run.warning(
                    "None of your genomes seem to have any functional annotation. No biggie. Things will continue to work. But "
                    "then your genomes have no functional annotation. SAD.")
            else:
                self.run.warning("Some of your genomes (%d of the %d, to be precise) seem to have no functional annotation. Since this workflow "
                                 "can only use matching functional annotations across all genomes involved, having even one genome without "
                                 "any functions means that there will be no matching function across all. Things will continue to work, but "
                                 "you will have no functions at the end for your gene clusters." % \
                                                (len(genomes_with_no_functional_annotation), len(self.genomes)))

            # make sure it is clear.
            function_annotation_sources_per_genome = {}
            all_function_annotation_sources_observed = set([])
        elif not len(all_function_annotation_sources_observed):
            self.run.warning(
                "None of your genomes seem to have any functional annotation. No biggie. Things will continue to work. But "
                "then your genomes have no functional annotation. It is sad.")
        else:
            # this guy down below fills in the self.function_annotation_sources with function annotation sources
            # that are common to all genomes.
            for sources in list(
                    function_annotation_sources_per_genome.values()):
                if not sources:
                    continue

                if not (self.function_annotation_sources):
                    self.function_annotation_sources.update(sources)
                else:
                    self.function_annotation_sources = self.function_annotation_sources.intersection(
                        sources)

            function_annotation_sources_some_genomes_miss = all_function_annotation_sources_observed.difference(
                self.function_annotation_sources)

            if not len(self.function_annotation_sources):
                # none of the functions are common
                self.run.warning("Although some of your genomes had some functional annotations, none of them were common to all genomes :/ "
                                 "Anvi'o will continue working with them, but you will have no functions available to you downstream. Just "
                                 "so you know, these are the annotation sources observed at least once in at least one of your genomes: '%s'" % \
                                                                    (', '.join(all_function_annotation_sources_observed)))
                self.functions_are_available = False
            else:
                self.functions_are_available = True

                # good. here we know some functions are available, but let's get some further understanding, and report it to the user, you know,
                # because we're nice:
                if len(function_annotation_sources_some_genomes_miss):
                    # some functions were missing from some genomes
                    self.run.warning("Anvi'o has good news and bad news for you (very balanced, as usual). The good news is that there are some "
                                     "function annotation sources that are common to all of your genomes, and they will be used whenever "
                                     "it will be appropriate. Here they are: '%s'. The bad news is you had more functiona annotation sources, "
                                     "but they were not common to all genomes. Here they are so you can say your goodbyes to them (because "
                                     "they will not be used): '%s'" % \
                                            (', '.join(self.function_annotation_sources), ', '.join(function_annotation_sources_some_genomes_miss)))
                else:
                    # every function ever observed is common to all genomes.
                    self.run.warning(
                        "Good news! Anvi'o found all these functions that are common to all of your genomes and will use them for "
                        "downstream analyses and is very proud of you: '%s'." %
                        (', '.join(self.function_annotation_sources)),
                        lc='green')
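The intersection logic above is easy to see in isolation. Here is a minimal, self-contained sketch (the genome names and annotation sources are hypothetical) of how per-genome sources reduce to the set common to all genomes, plus the leftover set that gets reported as unusable:

# Hypothetical per-genome annotation sources:
sources_per_genome = {
    'genome_01': {'COG_FUNCTION', 'KOfam', 'Pfam'},
    'genome_02': {'COG_FUNCTION', 'KOfam'},
    'genome_03': {'COG_FUNCTION', 'KOfam', 'CAZyme'},
}

# sources observed at least once in at least one genome:
all_observed = set.union(*sources_per_genome.values())

# sources common to every genome (what self.function_annotation_sources ends up holding):
common = set.intersection(*sources_per_genome.values())

# sources that at least one genome misses (reported to the user, then dropped):
missing_from_some = all_observed - common

print(sorted(common))             # ['COG_FUNCTION', 'KOfam']
print(sorted(missing_from_some))  # ['CAZyme', 'Pfam']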
Code example #24
    def load_genomes_descriptions(self, skip_functions=False, init=True):
        """Load genome descriptions from int/ext genome dictionaries"""

        # start with a sanity check to make sure names are distinct
        self.names_check()

        self.internal_genome_names = list(self.internal_genomes_dict.keys())
        self.external_genome_names = list(self.external_genomes_dict.keys())

        # let us know if the user did not want a full init.
        self.full_init = init

        # convert relative paths to absolute paths and MERGE internal and external genomes into self.genomes:
        for source, input_file in [
            (self.external_genomes_dict, self.input_file_for_external_genomes),
            (self.internal_genomes_dict, self.input_file_for_internal_genomes)
        ]:
            for genome_name in source:
                self.genomes[genome_name] = source[genome_name]
                for db_path_var in ['contigs_db_path', 'profile_db_path']:
                    if db_path_var not in self.genomes[genome_name]:
                        continue
                    path = self.genomes[genome_name][db_path_var]

                    if not path:
                        raise ConfigError(
                            "Bad news: anvi'o was loading genome descriptions, and it ran into an empty path for "
                            "the genome %s. How did this happen? HOW? :(" % genome_name)

                    if not path.startswith('/'):
                        self.genomes[genome_name][
                            db_path_var] = os.path.abspath(
                                os.path.join(os.path.dirname(input_file),
                                             path))

                # while we are going through all genomes and reconstructing self.genomes for the first time,
                # let's add the 'name' attribute to it as well.
                self.genomes[genome_name]['name'] = genome_name

        # add hashes for each genome in the self.genomes dict. this will allow us to see whether the HDF file already contains
        # all the information we need.
        self.genome_hash_to_genome_name = {}
        for genome_name in self.external_genome_names:
            g_hash = self.get_genome_hash_for_external_genome(
                self.genomes[genome_name])
            self.genomes[genome_name]['genome_hash'] = g_hash
            self.genome_hash_to_genome_name[g_hash] = genome_name
        for genome_name in self.internal_genome_names:
            g_hash = self.get_genome_hash_for_internal_genome(
                self.genomes[genome_name])
            self.genomes[genome_name]['genome_hash'] = g_hash
            self.genome_hash_to_genome_name[g_hash] = genome_name

        # if the client is not interested in functions, skip the rest.
        if skip_functions:
            self.functions_are_available = False
        else:
            self.init_functions()

        # this will populate self.genomes with relevant data that can be learned about these genomes such as 'avg_gene_length',
        # 'num_splits', 'num_contigs', 'num_genes', 'percent_redundancy', 'gene_caller_ids', 'total_length', 'partial_gene_calls',
        # 'percent_completion', 'num_genes_per_kb', 'gc_content'.
        if self.full_init:
            self.init_internal_genomes()
            self.init_external_genomes()
        else:
            # init will do everything. but it is very expensive. if the user does not want to
            # init all the bulky stuff, we still can give them the contents of the meta tables.
            for genome_name in self.genomes:
                g = self.genomes[genome_name]
                contigs_db = dbops.ContigsDatabase(g['contigs_db_path'])
                for key in contigs_db.meta:
                    g[key] = contigs_db.meta[key]

        # make sure it is OK to go with self.genomes
        self.sanity_check()
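The relative-to-absolute path conversion above resolves paths against the directory of the genomes file itself, not the current working directory. A minimal sketch, assuming a hypothetical genomes file location and a relative database path:

import os

# Hypothetical inputs: a genomes file and a relative path read from it.
input_file = '/data/project/external-genomes.txt'
path = 'dbs/genome_01-contigs.db'

if not path.startswith('/'):
    path = os.path.abspath(os.path.join(os.path.dirname(input_file), path))

print(path)  # /data/project/dbs/genome_01-contigs.db

This is why moving a genomes file without moving the databases it points to breaks relative entries.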
Code example #25
    def load_genomes_descriptions(self, skip_functions=False, init=True):
        """Load genome descriptions from int/ext genome dictionaries"""

        # start with a sanity check to make sure names are distinct
        self.names_check()

        self.internal_genome_names = list(self.internal_genomes_dict.keys())
        self.external_genome_names = list(self.external_genomes_dict.keys())

        # let us know if the user did not want a full init.
        self.full_init = init

        # convert relative paths to absolute paths and MERGE internal and external genomes into self.genomes:
        for source, input_file in [
            (self.external_genomes_dict, self.input_file_for_external_genomes),
            (self.internal_genomes_dict, self.input_file_for_internal_genomes)
        ]:
            for genome_name in source:
                self.genomes[genome_name] = source[genome_name]
                for db_path_var in ['contigs_db_path', 'profile_db_path']:
                    if db_path_var not in self.genomes[genome_name]:
                        continue
                    path = self.genomes[genome_name][db_path_var]

                    if not path:
                        raise ConfigError(
                            "Bad news: anvi'o was loading genome desriptions, and it run into an empty path for "
                            "the genome %s. How did this happen? HOW? :(" %
                            genome_name)

                    if not path.startswith('/'):
                        self.genomes[genome_name][
                            db_path_var] = os.path.abspath(
                                os.path.join(os.path.dirname(input_file),
                                             path))

                # while we are going through all genomes and reconstructing self.genomes for the first time,
                # let's add the 'name' attribute to it as well.
                self.genomes[genome_name]['name'] = genome_name

        # add hashes for each genome in the self.genomes dict.
        self.genome_hash_to_genome_name = {}

        self.progress.new('Setting up genome hash dicts',
                          progress_total_items=len(self.genomes))
        for genome_name in self.external_genome_names:
            self.progress.update("working on %s (external)" % (genome_name),
                                 increment=True)
            g_hash = str(
                self.get_genome_hash_for_external_genome(
                    self.genomes[genome_name]))
            self.genomes[genome_name]['genome_hash'] = g_hash
            self.genome_hash_to_genome_name[g_hash] = genome_name
        for genome_name in self.internal_genome_names:
            self.progress.update("working on %s (internal)" % (genome_name),
                                 increment=True)
            g_hash = str(
                self.get_genome_hash_for_internal_genome(
                    self.genomes[genome_name]))
            self.genomes[genome_name]['genome_hash'] = g_hash
            self.genome_hash_to_genome_name[g_hash] = genome_name
        self.progress.end()

        # if the user wanted anvi'o to not care about checking genome hashes and we ended up
        # finding genomes with identical hashes, let them know
        if self.skip_checking_genome_hashes and (
                len(self.internal_genomes_with_identical_hashes)
                or len(self.external_genomes_with_identical_hashes)):
            self.run.warning(
                "While processing internal and/or external genomes files you have provided, "
                "anvi'o found genomes with indentical hashes (which means they were practically "
                "identical to each other). But since you have instructed anvi'o to ignore that "
                "it is now continuing with the flow (even %d hashes for your internal genomes and %d) "
                "hashes for your external gneomes appeared more than once). See below the genome names "
                "with identical hashes:" %
                (len(self.internal_genomes_with_identical_hashes),
                 len(self.external_genomes_with_identical_hashes)),
                overwrite_verbose=True)

            for _t, _d in [
                ('Internal', self.internal_genomes_with_identical_hashes),
                ('External', self.external_genomes_with_identical_hashes)
            ]:
                all_genome_hashes = list(_d.keys())
                for genome_hash in all_genome_hashes:
                    self.run.info(
                        "%s genomes with hash %s" % (_t, genome_hash),
                        "%s" % ", ".join(_d[genome_hash]),
                        overwrite_verbose=True,
                        nl_after=1
                        if genome_hash == all_genome_hashes[-1] else 0,
                        lc='red')

        # if the client is not interested in functions, skip the rest.
        if skip_functions:
            self.functions_are_available = False
        else:
            self.init_functions()

        # this will populate self.genomes with relevant data that can be learned about these genomes such as 'avg_gene_length',
        # 'num_splits', 'num_contigs', 'num_genes', 'percent_redundancy', 'gene_caller_ids', 'total_length', 'partial_gene_calls',
        # 'percent_completion', 'num_genes_per_kb', 'gc_content'.
        if self.full_init:
            self.init_internal_genomes()
            self.init_external_genomes()
        else:
            # init will do everything. but it is very expensive. if the user does not want to
            # init all the bulky stuff, we still can give them the contents of the meta tables.
            for genome_name in self.genomes:
                g = self.genomes[genome_name]
                contigs_db = dbops.ContigsDatabase(g['contigs_db_path'])
                for key in contigs_db.meta:
                    g[key] = contigs_db.meta[key]

        # make sure it is OK to go with self.genomes
        self.sanity_check()
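The identical-hash bookkeeping above boils down to grouping genome names by their hash and keeping only the groups with more than one member. A sketch with hypothetical hashes:

# Hypothetical genome-name-to-hash mapping:
genome_hashes = {
    'genome_01': 'hash_aaa',
    'genome_02': 'hash_bbb',
    'genome_03': 'hash_aaa',  # practically identical to genome_01
}

hash_to_names = {}
for name, g_hash in genome_hashes.items():
    hash_to_names.setdefault(g_hash, []).append(name)

genomes_with_identical_hashes = {h: names for h, names in hash_to_names.items() if len(names) > 1}
print(genomes_with_identical_hashes)  # {'hash_aaa': ['genome_01', 'genome_03']}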
Code example #26
File: splitter.py Project: satish162/anvio
    def store_locus_as_contigs_db(self,
                                  contig_name,
                                  sequence,
                                  gene_calls,
                                  output_path_prefix,
                                  reverse_complement=False):
        """Generates a contigs database and a blank profile for a given locus"""

        temporary_files = []

        # dealing with some output file business.
        E = lambda e: output_path_prefix + e
        locus_output_db_path = E(".db")
        locus_sequence_fasta = E("_sequence.fa")
        locus_external_gene_calls = E("_external_gene_calls.txt")
        temporary_files.extend(
            [locus_external_gene_calls, locus_sequence_fasta])

        # we will generate a blank profile database at the end of this. let's get the directory
        # business sorted.
        profile_output_dir = output_path_prefix + '-PROFILE'
        if os.path.exists(profile_output_dir):
            if self.overwrite_output_destinations:
                filesnpaths.shutil.rmtree(profile_output_dir)
            else:
                raise ConfigError(
                    "The directory %s exists, which kinda messes things up here. Either remove "
                    "it manually, or use the flag --overwrite-output-destinations so anvi'o can "
                    "do it for you." % profile_output_dir)

        # sort out the contigs database output path
        if filesnpaths.is_file_exists(locus_output_db_path, dont_raise=True):
            if self.overwrite_output_destinations:
                os.remove(locus_output_db_path)
            else:
                raise ConfigError(
                    "There is already a contigs database at the output file path :( Either remove it first, "
                    "or use the --overwrite-output-destinations flag to give anvi'o full authority to wipe "
                    "your disk.")

        # do we need to reverse complement this guy? if yes, we will take care of the contigs sequence and
        # gene calls here, and remember this for later.
        gene_calls_list = list(gene_calls.keys())
        if reverse_complement:
            sequence = utils.rev_comp(sequence)
            gene_calls, gene_caller_id_conversion_dict = utils.rev_comp_gene_calls_dict(
                gene_calls, sequence)
        else:
            gene_caller_id_conversion_dict = dict([
                (gene_calls_list[g], g) for g in range(0, len(gene_calls_list))
            ])
            new_gene_calls = {}
            for g in range(0, len(gene_calls_list)):
                gene_call = copy.deepcopy(gene_calls[gene_calls_list[g]])
                new_gene_calls[g] = gene_call
            gene_calls = new_gene_calls

        # write the sequence as a temporary FASTA file since the design of ContigsDatabase::create
        # will work seamlessly with this approach:
        with open(locus_sequence_fasta, 'w') as f:
            f.write('>%s\n%s\n' % (contig_name, sequence))

        # similarly, here we will store external gene calls so there will be no gene calling during
        # the generation of the contigs database
        headers = [
            'gene_callers_id', 'contig', 'start', 'stop', 'direction',
            'partial', 'source', 'version'
        ]
        utils.store_dict_as_TAB_delimited_file(gene_calls,
                                               locus_external_gene_calls,
                                               headers=headers)

        # this is where magic happens. we ask anvi'o to create a contigs database for us.
        args = argparse.Namespace(
            contigs_fasta=locus_sequence_fasta,
            project_name=os.path.basename(output_path_prefix),
            split_length=sys.maxsize,
            kmer_size=4,
            external_gene_calls=locus_external_gene_calls)
        dbops.ContigsDatabase(locus_output_db_path,
                              run=self.run_object).create(args)

        # while we are at it, here we generate a blank profile, too, so the new
        # contigs database can be visualized through anvi'o for debugging or other purposes.
        args = argparse.Namespace(
            blank_profile=True,
            contigs_db=locus_output_db_path,
            skip_hierarchical_clustering=False,
            output_dir=profile_output_dir,
            sample_name=os.path.basename(output_path_prefix))
        profiler.BAMProfiler(args, r=self.run_object)._run()

        # so we have a contigs database! but there isn't much in it. the following where clause will
        # help us read from the tables of the original contigs database, and store their contents in
        # the new one throughout the following sections of the code.
        where_clause = "gene_callers_id in (%s)" % ', '.join(
            ['"%d"' % g for g in gene_caller_id_conversion_dict])

        # a lousy anonymous function to read data from tables given the gene calls of interest
        R = lambda table_name: db.DB(self.input_contigs_db_path, None, ignore_version=True) \
                                              .get_some_rows_from_table_as_dict(table_name,
                                                                                where_clause=where_clause,
                                                                                error_if_no_data=False)

        G = lambda g: gene_caller_id_conversion_dict[g]

        ############################################################################################
        # DO FUNCTIONS
        ###########################################################################################
        function_calls = R(t.gene_function_calls_table_name)

        for entry_id in function_calls:
            function_calls[entry_id]['gene_callers_id'] = G(
                function_calls[entry_id]['gene_callers_id'])

        gene_function_calls_table = TableForGeneFunctions(locus_output_db_path,
                                                          run=self.run_object)
        gene_function_calls_table.create(function_calls)

        self.run.info("Output contigs DB path", locus_output_db_path)
        self.run.info("Output blank profile DB path",
                      os.path.join(profile_output_dir, 'PROFILE.db'))

        ############################################################################################
        # DO AMINO ACID SEQUENCES -- we are using external gene calls to generate the new contigs
        #                            database, but amino acid sequences are kept in a different table
        #                            and anvi'o checks whether provided gene calls resolve to amino
        #                            acid sequences with proper starts and stops. if not, it skips
        #                            them. but amino acid sequences for each gene call were stored
        #                            in the original contigs database, and the best practice is to
        #                            carry them into the new one. so here we will remove all data
        #                            from the amino acid sequences table in the new database, and
        #                            copy the contents from the original one.
        ############################################################################################
        amino_acid_sequences = R(t.gene_amino_acid_sequences_table_name)

        entries = [(gene_caller_id_conversion_dict[g],
                    amino_acid_sequences[g]['sequence'])
                   for g in amino_acid_sequences]
        db.DB(locus_output_db_path, None, ignore_version=True).insert_many(
            t.gene_amino_acid_sequences_table_name, entries=entries)

        ############################################################################################
        # REMOVE TEMP FILES
        ###########################################################################################
        if anvio.DEBUG:
            self.run.info_single(
                "Temp output files were kept for inspection due to --debug")
        else:
            for f in temporary_files:
                os.remove(f)
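Two details of store_locus_as_contigs_db above are worth isolating: the renumbering of gene caller ids to a clean 0..N-1 range, and the where clause built from the conversion dict. A minimal sketch with hypothetical ids:

# Hypothetical original gene caller ids, in their order of appearance in the locus:
gene_calls_list = [488, 489, 490]

# old id -> new id, as built in the non-reverse-complement branch above:
gene_caller_id_conversion_dict = {old: new for new, old in enumerate(gene_calls_list)}
print(gene_caller_id_conversion_dict)  # {488: 0, 489: 1, 490: 2}

# the where clause used to pull only these genes from the parent contigs database:
where_clause = "gene_callers_id in (%s)" % ', '.join('"%d"' % g for g in gene_caller_id_conversion_dict)
print(where_clause)  # gene_callers_id in ("488", "489", "490")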
Code example #27
        default=False,
        help='Show available single-copy gene search results and exit.')
    parser.add_argument(
        '--source',
        default=None,
        help='Source to focus on. If none declared, all single-copy gene '
             'sources are going to be listed.')

    args, unknown = parser.parse_known_args()

    contigs = set([])
    contig_lengths = {}
    contig_genes = {}
    genes = {}

    db = dbops.ContigsDatabase(args.contigs_db, quiet=False)
    search_contigs_dict = db.db.get_table_as_dict(t.hmm_hits_table_name)
    genes_in_contigs_dict = db.db.get_table_as_dict(
        t.genes_in_contigs_table_name)
    search_info_dict = db.db.get_table_as_dict(t.hmm_hits_info_table_name)
    contig_lengths_table = db.db.get_table_as_dict(t.contigs_info_table_name)
    contig_lengths = dict([(c, contig_lengths_table[c]['length'])
                           for c in contig_lengths_table])
    db.disconnect()

    sources = {}
    irrelevant_sources = []
    for source in search_info_dict:
        if search_info_dict[source]['search_type'] == "singlecopy":
            sources[source] = [
                g.strip() for g in search_info_dict[source]['genes'].split(',')
            ]
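The single-copy filtering that this snippet begins can be sketched end to end with a hypothetical info dict; only sources whose search_type is 'singlecopy' survive, and their comma-separated gene lists are split into clean names:

# Hypothetical HMM info table contents:
search_info_dict = {
    'Campbell_et_al': {'search_type': 'singlecopy', 'genes': 'RecA, RpoB, UvrC_HhH_N'},
    'Pfam':           {'search_type': 'function',   'genes': ''},
}

sources = {}
irrelevant_sources = []
for source, info in search_info_dict.items():
    if info['search_type'] == 'singlecopy':
        sources[source] = [g.strip() for g in info['genes'].split(',')]
    else:
        irrelevant_sources.append(source)

print(sources)             # {'Campbell_et_al': ['RecA', 'RpoB', 'UvrC_HhH_N']}
print(irrelevant_sources)  # ['Pfam']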
Code example #28
File: completeness.py Project: paczian/anvio
    def __init__(self,
                 contigs_db_path,
                 source=None,
                 run=run,
                 progress=progress):
        # hi db
        contigs_db = dbops.ContigsDatabase(contigs_db_path)

        # read info table to get what is available in the db
        info_table = contigs_db.db.get_table_as_dict(
            t.hmm_hits_info_table_name)

        # identify and remove non-single-copy sources of hmm search results:
        non_singlecopy_sources = set([
            k for k in info_table.keys()
            if info_table[k]['search_type'] != 'singlecopy'
        ])
        singlecopy_sources = set([
            k for k in info_table.keys()
            if info_table[k]['search_type'] == 'singlecopy'
        ])
        for non_singlecopy_source in non_singlecopy_sources:
            info_table.pop(non_singlecopy_source)

        # read search table (which holds hmmscan hits for splits).
        self.search_table = utils.get_filtered_dict(
            contigs_db.db.get_table_as_dict(t.hmm_hits_splits_table_name),
            'source', singlecopy_sources)

        # an example entry in self.search_table looks like this:
        #
        # {
        #    'percentage_in_split'   : 100,
        #    'source'                : u'Campbell_et_al',
        #    'gene_unique_identifier': u'c70c1cc3025b636100fd8a910b5b7f0dd09752fc78e2a1f10ee60954',
        #    'e_value'               : 0.0013,
        #    'gene_name'             : u'UvrC_HhH_N',
        #    'split'                 : u'ANTARCTICAAQUATIC_SMPL_SITE231_3.0UMcontig18439_split_00001'
        # }
        #

        # a little convenience for potential clients:
        self.http_refs = {}
        for source_in_db in info_table:
            self.http_refs[source_in_db] = [
                h for h in info_table[source_in_db]['ref'].split()
                if h.startswith('http')
            ][0]

        self.genes_in_db = dict([(s, info_table[s]['genes'].split(', '))
                                 for s in info_table])

        # we're done with the db
        contigs_db.disconnect()

        self.sources = info_table.keys()

        if source:
            if source not in self.sources:
                raise ConfigError('Source "%s" is not one of the single-copy gene sources found in the database.' % source)

            # filter out sources that are not requested
            self.sources = [source]
            self.genes_in_db = {source: self.genes_in_db[source]}
            self.search_table = utils.get_filtered_dict(
                self.search_table, 'source', set([source]))

        self.unique_gene_id_to_gene_name = {}
        self.splits_unique_gene_id_occurs = {}
        # these will be very useful later. trust me.
        for entry in self.search_table.values():
            if entry[
                    'gene_unique_identifier'] not in self.unique_gene_id_to_gene_name:
                self.unique_gene_id_to_gene_name[
                    entry['gene_unique_identifier']] = entry['gene_name']

            if entry[
                    'gene_unique_identifier'] not in self.splits_unique_gene_id_occurs:
                self.splits_unique_gene_id_occurs[
                    entry['gene_unique_identifier']] = [entry['split']]
            else:
                self.splits_unique_gene_id_occurs[
                    entry['gene_unique_identifier']].append(entry['split'])
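The http_refs convenience above simply takes the first whitespace-separated token of each source's 'ref' field that starts with 'http'. A sketch with a hypothetical ref string:

# Hypothetical 'ref' value from the info table:
ref = 'Campbell et al. (2013) http://example.com/campbell_et_al_2013'
http_ref = [h for h in ref.split() if h.startswith('http')][0]
print(http_ref)  # http://example.com/campbell_et_al_2013

Note that the [0] index assumes every source's ref contains at least one URL; a ref without one would raise an IndexError here.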
Code example #29
    def __init__(self,
                 contigs_db_path,
                 source_requested=None,
                 run=run,
                 progress=progress):
        self.run = run
        self.progress = progress

        # hi db
        contigs_db = dbops.ContigsDatabase(contigs_db_path)

        # read info table to get what is available in the db
        info_table = contigs_db.db.get_table_as_dict(
            t.hmm_hits_info_table_name)

        # identify and remove non-single-copy sources of hmm search results:
        non_singlecopy_sources = set([
            k for k in list(info_table.keys())
            if info_table[k]['search_type'] != 'singlecopy'
        ])
        singlecopy_sources = set([
            k for k in list(info_table.keys())
            if info_table[k]['search_type'] == 'singlecopy'
        ])
        for non_singlecopy_source in non_singlecopy_sources:
            info_table.pop(non_singlecopy_source)

        # get the hmm hits table
        self.hmm_hits_table = contigs_db.db.get_table_as_dict(
            t.hmm_hits_table_name)

        # read search table (which holds hmmscan hits for splits).
        self.hmm_hits_splits_table = utils.get_filtered_dict(
            contigs_db.db.get_table_as_dict(t.hmm_hits_splits_table_name),
            'source', singlecopy_sources)

        # an example entry in self.hmm_hits_splits_table looks like this:
        #
        # {
        #    'percentage_in_split'   : 69.6763202725724,
        #    'source'                : u'Campbell_et_al',
        #    'split'                 : u'ANTARCTICAAQUATIC_SMPL_SITE231_3.0UMcontig18439_split_00001',
        #    'hmm_hit_entry_id'      : 1
        # }
        #

        # a little convenience for potential clients:
        self.http_refs = {}
        for source_in_db in info_table:
            self.http_refs[source_in_db] = [
                h for h in info_table[source_in_db]['ref'].split()
                if h.startswith('http')
            ][0]

        self.genes_in_db = dict([(s, info_table[s]['genes'].split(', '))
                                 for s in info_table])

        # we're done with the db
        contigs_db.disconnect()

        self.sources = list(info_table.keys())
        self.domains = set(
            [info_table[source]['domain'] for source in self.sources])
        self.source_to_domain = dict([(source, info_table[source]['domain'])
                                      for source in self.sources])
        self.domain_to_sources = [(domain, [
            source for source in self.sources
            if info_table[source]['domain'] == domain
        ]) for domain in self.domains]

        if source_requested:
            if source_requested not in self.sources:
                raise ConfigError(
                    'Requested source "%s" is not one of the single-copy gene sources found in the database.'
                    % source_requested)

            # filter out sources that are not requested
            self.sources = [source_requested]
            self.genes_in_db = {
                source_requested: self.genes_in_db[source_requested]
            }
            self.hmm_hits_splits_table = utils.get_filtered_dict(
                self.hmm_hits_splits_table, 'source', set([source_requested]))

        self.unique_gene_id_to_gene_name = {}
        self.splits_unique_gene_id_occurs = {}
        # these will be very useful later. trust me.
        for entry in list(self.hmm_hits_splits_table.values()):
            hmm_hit = self.hmm_hits_table[entry['hmm_hit_entry_id']]
            gene_unique_identifier = hmm_hit['gene_unique_identifier']

            if gene_unique_identifier not in self.unique_gene_id_to_gene_name:
                self.unique_gene_id_to_gene_name[
                    gene_unique_identifier] = hmm_hit['gene_name']

            if gene_unique_identifier not in self.splits_unique_gene_id_occurs:
                self.splits_unique_gene_id_occurs[gene_unique_identifier] = [
                    entry['split']
                ]
            else:
                self.splits_unique_gene_id_occurs[
                    gene_unique_identifier].append(entry['split'])
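The two lookup dictionaries built at the end of the constructor above can also be expressed with collections.defaultdict, which avoids the explicit membership checks. A sketch over hypothetical, already-joined hit entries:

from collections import defaultdict

# Hypothetical entries after resolving each split hit to its hmm hit:
entries = [
    {'gene_unique_identifier': 'uid_1', 'gene_name': 'RecA', 'split': 'contig_x_split_00001'},
    {'gene_unique_identifier': 'uid_1', 'gene_name': 'RecA', 'split': 'contig_x_split_00002'},
    {'gene_unique_identifier': 'uid_2', 'gene_name': 'RpoB', 'split': 'contig_y_split_00001'},
]

unique_gene_id_to_gene_name = {}
splits_unique_gene_id_occurs = defaultdict(list)
for entry in entries:
    uid = entry['gene_unique_identifier']
    unique_gene_id_to_gene_name.setdefault(uid, entry['gene_name'])
    splits_unique_gene_id_occurs[uid].append(entry['split'])

print(dict(splits_unique_gene_id_occurs))
# {'uid_1': ['contig_x_split_00001', 'contig_x_split_00002'], 'uid_2': ['contig_y_split_00001']}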
Code example #30
File: completeness.py Project: dagahren/anvio
    def __init__(self,
                 contigs_db_path,
                 scg_domain_classifier_path=None,
                 source_requested=None,
                 run=run,
                 progress=progress):
        self.run = run
        self.progress = progress
        self.initialized_properly = True

        self.SCG_domain_predictor = scgdomainclassifier.Predict(
            argparse.Namespace(),
            run=terminal.Run(verbose=False),
            progress=self.progress)

        # hi db
        contigs_db = dbops.ContigsDatabase(contigs_db_path)

        # read info table to get what is available in the db
        info_table = contigs_db.db.get_table_as_dict(
            t.hmm_hits_info_table_name)

        # identify and remove non-single-copy sources of hmm search results:
        non_singlecopy_sources = set([
            k for k in list(info_table.keys())
            if info_table[k]['search_type'] != 'singlecopy'
        ])
        singlecopy_sources = set([
            k for k in list(info_table.keys())
            if info_table[k]['search_type'] == 'singlecopy'
        ])
        for non_singlecopy_source in non_singlecopy_sources:
            info_table.pop(non_singlecopy_source)

        # get the hmm hits table
        self.hmm_hits_table = contigs_db.db.get_table_as_dict(
            t.hmm_hits_table_name)

        # read search table (which holds hmmscan hits for splits).
        self.hmm_hits_splits_table = utils.get_filtered_dict(
            contigs_db.db.get_table_as_dict(t.hmm_hits_splits_table_name),
            'source', singlecopy_sources)

        # an example entry in self.hmm_hits_splits_table looks like this:
        #
        # {
        #    'percentage_in_split'   : 69.6763202725724,
        #    'source'                : u'Bacteria_74',
        #    'split'                 : u'ANTARCTICAAQUATIC_SMPL_SITE231_3.0UMcontig18439_split_00001',
        #    'hmm_hit_entry_id'      : 1
        # }
        #

        # a little convenience for potential clients:
        self.http_refs = {}
        for source_in_db in info_table:
            self.http_refs[source_in_db] = [
                h for h in info_table[source_in_db]['ref'].split()
                if h.startswith('http')
            ][0]

        self.genes_in_db = dict([(s, info_table[s]['genes'].split(', '))
                                 for s in info_table])

        # we're done with the db
        contigs_db.disconnect()

        self.sources = list(info_table.keys())
        self.domains = set(
            [info_table[source]['domain'] for source in self.sources])
        self.source_to_domain = dict([(source, info_table[source]['domain'])
                                      for source in self.sources])
        self.domain_to_sources = [(domain, [
            source for source in self.sources
            if info_table[source]['domain'] == domain
        ]) for domain in self.domains]

        # compatibility sanity checks 1/2: make sure domains between domain predictor and the contigs database match
        self.domains_missing_in_SCG_domain_predictor = [
            d for d in self.domains
            if d not in self.SCG_domain_predictor.SCG_domains
        ]
        self.domains_missing_in_SCGs_run_for_contigs = [
            d for d in self.SCG_domain_predictor.SCG_domains
            if d not in self.domains
        ]

        if len(self.domains_missing_in_SCG_domain_predictor):
            num_domains_missing = len(
                self.domains_missing_in_SCG_domain_predictor)
            self.progress.reset()
            self.run.warning("OK. We have a problem. You seem to have single-copy core gene collections for among your HMM hits %s that\
                              are not included when the anvi'o domain predictor was trained :/ Here is the list of domains that are making\
                              us upset here: \"%s\". This means either you put a new HMM single-copy core gene collection to the anvi'o HMMs\
                              directory, or gave it as a parameter, and run `anvi-run-hmms` without updating the classifier anvi'o uses to\
                              resolve domains for proper completion/redundancy estimates."                                                                                           % \
                                           ('a domain' if num_domains_missing == 1 else '%s domains' % num_domains_missing,
                                            ', '.join(self.domains_missing_in_SCG_domain_predictor)))
            self.initialized_properly = False

        if len(self.domains_missing_in_SCGs_run_for_contigs):
            num_domains_missing = len(
                self.domains_missing_in_SCGs_run_for_contigs)
            self.progress.reset()
            self.run.warning("Things are not quite OK. It seems %d of the domains that are known to the classifier anvi'o uses to predict\
                              domains for completion estimation are missing from your contigs database. This means, you didn't run the\
                              program `anvi-run-hmms` with default parameters, or you removed some essential SCG domains from it later. Or\
                              you did something else. Who knows. Here is the list of domains that are making us upset here: \"%s\". We hope\
                              you are happy. If you want to get rid of this warning you can run `anvi-run-hmms` on this your contigs database\
                              whenever it is convenient to you, so anvi'o can make sure you have everything in the right place."                                                                                                                                 % \
                                           (num_domains_missing, ', '.join(self.domains_missing_in_SCG_domain_predictor)))

            # since we just established that the user did not run these domains for their contigs database,
            # we will update our self.domains variable to keep the mess that will likely take place later
            # to a convenient minimum (note: set.discard() would silently do nothing here, since it looks
            # for a single element; we need a set difference):
            self.domains = self.domains - set(self.domains_missing_in_SCGs_run_for_contigs)

            self.initialized_properly = False

        # compatibility sanity checks 2/2: make sure sources in the domain predictor match those in the contigs database
        self.sources_missing_in_SCGs_run_for_contigs = [
            s for s in self.SCG_domain_predictor.SCG_sources
            if s not in self.sources
        ]
        self.sources_missing_in_SCG_domain_predictor = [
            s for s in self.sources
            if s not in self.SCG_domain_predictor.SCG_sources
        ]
        if len(self.sources_missing_in_SCGs_run_for_contigs):
            num_sources_missing = len(
                self.sources_missing_in_SCGs_run_for_contigs)
            self.progress.reset()
            self.run.warning("OK. We have a VERY interesting problem. You have all the SCG domains necessary to run the predictor covered\
                              in your contigs database, however, %s that are used during the training of the domain predictor does not seem\
                              to occur in your contigs database :/ Here is the list of HMM sources that are making us upset here: \"%s\".\
                              This most likely means you are using a new version of anvi'o with older single-copy core gene sources, or you are\
                              exploring new single-copy core gene sources to see how they behave. That's all good and very exciting, but unfortunately\
                              anvi'o will not be able to predict domains due to this incompatibility here. You could solve this problem by running\
                              `anvi-run-hmms` on your contigs database, but you can also live without solving it as anvi'o will continue running\
                              by not utilizing domain-specific HMMs for completion/redundancy estimates, but giving you all the results all at once."                                                                                                                                                      % \
                                           ('an HMM source' if num_sources_missing == 1 else '%s HMM sources' % num_sources_missing,
                                            ', '.join(self.sources_missing_in_SCGs_run_for_contigs)))
            self.initialized_properly = False

        if source_requested:
            if source_requested not in self.sources:
                raise ConfigError(
                    'Requested source "%s" is not one of the single-copy gene sources found in the database.'
                    % source_requested)

            # filter out sources that are not requested
            self.sources = [source_requested]
            self.genes_in_db = {
                source_requested: self.genes_in_db[source_requested]
            }
            self.hmm_hits_splits_table = utils.get_filtered_dict(
                self.hmm_hits_splits_table, 'source', set([source_requested]))

        # these will be very useful later. trust me.
        self.unique_gene_id_to_gene_name = {}
        self.splits_unique_gene_id_occurs = {}
        for entry in list(self.hmm_hits_splits_table.values()):
            hmm_hit = self.hmm_hits_table[entry['hmm_hit_entry_id']]
            gene_unique_identifier = hmm_hit['gene_unique_identifier']

            if gene_unique_identifier not in self.unique_gene_id_to_gene_name:
                self.unique_gene_id_to_gene_name[
                    gene_unique_identifier] = hmm_hit['gene_name']

            if gene_unique_identifier not in self.splits_unique_gene_id_occurs:
                self.splits_unique_gene_id_occurs[gene_unique_identifier] = [
                    entry['split']
                ]
            else:
                self.splits_unique_gene_id_occurs[
                    gene_unique_identifier].append(entry['split'])
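The two-way compatibility checks in the constructor above are plain set differences between the domains (or sources) known to the predictor and those present in the contigs database. A minimal sketch with hypothetical domain sets:

# Hypothetical domain sets:
domains_in_contigs_db = {'bacteria', 'archaea', 'protista'}
domains_in_predictor = {'bacteria', 'archaea', 'eukarya'}

missing_in_predictor = sorted(domains_in_contigs_db - domains_in_predictor)
missing_in_contigs_db = sorted(domains_in_predictor - domains_in_contigs_db)

print(missing_in_predictor)   # ['protista']
print(missing_in_contigs_db)  # ['eukarya']

# as in the fixed code above, the usable set is the overlap:
usable_domains = domains_in_contigs_db & domains_in_predictor
print(sorted(usable_domains))  # ['archaea', 'bacteria']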