Example #1
    def run_search_and_parse_results(self):
        """Align the protein against the database based on only sequence"""

        if not self.percent_identical_cutoff or not self.max_number_templates:
            raise ConfigError(
                "run_search_and_parse_results :: You initiated this class without providing values for percent_identical_cutoff "
                "and max_number_templates, which is required for this function."
            )

        # Change to MODELLER working directory
        os.chdir(self.directory)

        driver = diamond.Diamond(
            query_fasta=self.target_fasta_path,
            target_fasta=J(self.database_dir, 'pdb_95.dmnd'),
            run=terminal.Run(verbose=False),
            progress=terminal.Progress(verbose=False),
        )
        driver.blastp()

        # Change back to user directory
        os.chdir(self.start_dir)

        search_df = driver.view_as_dataframe(
            J(self.directory, driver.tabular_output_path))

        matches_found = search_df.shape[0]

        if not matches_found:
            self.run.warning(
                "No proteins with homologous sequence were found for {}. No structure will be modelled"
                .format(self.corresponding_gene_call))
            raise self.EndModeller

        # We need the gene length for proper_pident. The target FASTA contains only
        # the single gene being modelled, so this loop just records its length.
        target_fasta = u.SequenceSource(self.target_fasta_path,
                                        lazy_init=False)
        while next(target_fasta):
            gene_length = len(target_fasta.seq)

        # add some useful columns
        search_df["proper_pident"] = search_df["pident"] * search_df[
            "length"] / gene_length
        search_df["code"] = search_df["sseqid"].str[:-1]
        search_df["chain"] = search_df["sseqid"].str[-1]

        # filter results by self.percent_identical_cutoff.
        max_pident_found = search_df["proper_pident"].max()
        id_of_max_pident = tuple(
            search_df.loc[search_df["proper_pident"].idxmax(),
                          ["code", "chain"]].values)
        search_df = search_df[
            search_df["proper_pident"] >= self.percent_identical_cutoff]

        search_df = search_df.sort_values("proper_pident", ascending=False)

        # If more than one template comes from the same PDB id, keep just one
        search_df = search_df.drop_duplicates('code', keep='first')

        # Order them and take the first self.max_number_templates.
        matches_after_filter = len(search_df)
        if not matches_after_filter:
            self.run.warning("Gene {} did not have a search result with proper percent identicalness above or equal "
                             "to {}%. The best match was chain {} of https://www.rcsb.org/structure/{}, which had a "
                             "proper percent identicalness of {:.2f}%. No structure will be modelled.".\
                              format(self.corresponding_gene_call,
                                     self.percent_identical_cutoff,
                                     id_of_max_pident[1],
                                     id_of_max_pident[0],
                                     max_pident_found))
            raise self.EndModeller

        # get up to self.max_number_templates of those with the highest proper_pident scores.
        search_df = search_df.iloc[:min(
            [len(search_df), self.max_number_templates])]

        # Get their chain and 4-letter ids
        self.list_of_template_code_and_chain_ids = list(
            zip(search_df["code"], search_df["chain"]))

        self.run.info("Max number of templates allowed",
                      self.max_number_templates)
        self.run.info("Number of candidate templates", matches_found)
        self.run.info(
            "After >{}% identical filter".format(
                self.percent_identical_cutoff), matches_after_filter)
        self.run.info("Number accepted as templates",
                      len(self.list_of_template_code_and_chain_ids))

        # update user on which templates are used, and write the templates to self.out
        for i, (pdb_id, chain_id) in enumerate(self.list_of_template_code_and_chain_ids):
            ppi = search_df["proper_pident"].iloc[i]

            self.out["templates"]["pdb_id"].append(pdb_id)
            self.out["templates"]["chain_id"].append(chain_id)
            self.out["templates"]["ppi"].append(ppi)

            self.run.info(
                "Template {}".format(i + 1),
                "Protein ID: {}, Chain {} ({:.1f}% identical)".format(
                    pdb_id, chain_id, ppi))
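
A quick way to sanity-check the filtering arithmetic above is to run it on a toy table. A minimal sketch with an invented DataFrame (column names mirror DIAMOND's tabular output; the cutoff and template cap are hypothetical values):

import pandas as pd

# Invented hits for a query gene of length 100 residues.
search_df = pd.DataFrame({
    "sseqid": ["1abcA", "2xyzB", "2xyzC", "9fooA"],
    "pident": [90.0, 80.0, 78.0, 40.0],  # percent identity over the alignment
    "length": [100, 95, 95, 50],         # alignment length
})
gene_length = 100
percent_identical_cutoff, max_number_templates = 30, 2  # hypothetical settings

# Rescale identity by how much of the full gene the alignment covers.
search_df["proper_pident"] = search_df["pident"] * search_df["length"] / gene_length
search_df["code"] = search_df["sseqid"].str[:-1]   # 4-letter PDB id
search_df["chain"] = search_df["sseqid"].str[-1]   # chain letter

search_df = search_df[search_df["proper_pident"] >= percent_identical_cutoff]
search_df = search_df.sort_values("proper_pident", ascending=False)
search_df = search_df.drop_duplicates("code", keep="first")  # one chain per PDB id
print(search_df.head(max_number_templates)[["code", "chain", "proper_pident"]])
# keeps 1abc chain A (90.0) and 2xyz chain B (76.0); 9foo falls below the cutoff
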
Example #2
from anvio.errors import ConfigError
from anvio.tables.views import TablesForViews
from anvio.tables.codonfrequencies import TableForCodonFrequencies
from anvio.tables.variability import TableForVariability
from anvio.tables.miscdata import TableForLayerAdditionalData

__author__ = "Developers of anvi'o (see AUTHORS.txt)"
__copyright__ = "Copyleft 2015-2018, the Meren Lab (http://merenlab.org/)"
__credits__ = []
__license__ = "GPL 3.0"
__version__ = anvio.__version__
__maintainer__ = "A. Murat Eren"
__email__ = "*****@*****.**"

null_progress = terminal.Progress(verbose=False)
null_run = terminal.Run(verbose=False)
pp = terminal.pretty_print


class BAMProfiler(dbops.ContigsSuperclass):
    """Creates an über class for BAM file operations"""
    def __init__(self, args, r=terminal.Run(width=35), p=terminal.Progress()):
        self.args = args
        self.progress = p
        self.run = r

        A = lambda x: args.__dict__[x] if x in args.__dict__ else None
        self.input_file_path = A('input_file')
        self.contigs_db_path = A('contigs_db')
        self.serialized_profile_path = A('serialized_profile')
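
The `A = lambda x: ...` line above is a recurring anvi'o idiom for reading attributes off an argparse namespace without raising on missing keys; several of the examples below use it too. A self-contained sketch (the attribute names are illustrative):

import argparse

args = argparse.Namespace(input_file="reads.bam", contigs_db=None)

# Return the attribute if the namespace has it, else None -- never AttributeError.
A = lambda x: args.__dict__[x] if x in args.__dict__ else None

print(A('input_file'))          # 'reads.bam'
print(A('contigs_db'))          # None (present but unset)
print(A('serialized_profile'))  # None (absent entirely)

The same behavior is more commonly spelled `getattr(args, x, None)`.
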
Example #3
    def __init__(self, args, r=terminal.Run(width=35), p=terminal.Progress()):
        self.args = args
        self.progress = p
        self.run = r

        A = lambda x: args.__dict__[x] if x in args.__dict__ else None
        self.input_file_path = A('input_file')
        self.contigs_db_path = A('contigs_db')
        self.serialized_profile_path = A('serialized_profile')
        self.output_directory = A('output_dir')
        self.list_contigs_and_exit = A('list_contigs')
        self.min_contig_length = A('min_contig_length') or 0
        self.max_contig_length = A('max_contig_length') or sys.maxsize
        self.min_mean_coverage = A('min_mean_coverage')
        self.min_coverage_for_variability = A('min_coverage_for_variability')
        self.contigs_shall_be_clustered = A('cluster_contigs')
        self.skip_hierarchical_clustering = A('skip_hierarchical_clustering')
        self.sample_id = A('sample_name')
        self.report_variability_full = A('report_variability_full')
        self.overwrite_output_destinations = A('overwrite_output_destinations')
        self.skip_SNV_profiling = A('skip_SNV_profiling')
        self.profile_SCVs = A('profile_SCVs')
        self.ignore_orphans = A('ignore_orphans')
        self.max_coverage_depth = A('max_coverage_depth') or 8000
        self.gen_serialized_profile = A('gen_serialized_profile')
        self.distance = A('distance') or constants.distance_metric_default
        self.linkage = A('linkage') or constants.linkage_method_default
        self.num_threads = int(A('num_threads') or 1)
        self.queue_size = int(
            A('queue_size') if A('queue_size') is not None else 0)
        self.write_buffer_size = int(
            A('write_buffer_size') if A('write_buffer_size') is not None else 500)
        self.total_length_of_all_contigs = 0
        self.total_coverage_values_for_all_contigs = 0
        self.description_file_path = A('description')

        # make sure early on that both the distance and linkage is OK.
        clustering.is_distance_and_linkage_compatible(self.distance,
                                                      self.linkage)

        # whether the profile database is a blank (without any BAM files or reads):
        self.blank = A('blank_profile')

        if not self.blank and self.contigs_shall_be_clustered and self.skip_hierarchical_clustering:
            raise ConfigError(
                "You are confused, and confusing anvi'o, too. You can't as hierarchical clustering\
                               to be performed with one flag, and try to skip it with another one :("
            )

        if self.blank and self.contigs_shall_be_clustered and self.skip_hierarchical_clustering:
            raise ConfigError(
                "So you want to generate a blank profile, and you both want hierarchical clustering\
                               of your contigs to be performed, and skipped. No."
            )

        if self.blank and self.contigs_shall_be_clustered:
            raise ConfigError(
                "When the blank profile is asked to be generated, there is no need to ask for the\
                               hierarchical clustering of contigs. It is going to be done by default. If it is\
                               not changing anything, why is anvi'o upset with you? Because. Let's not use flags\
                               we don't need.")

        if self.max_coverage_depth >= auxiliarydataops.COVERAGE_MAX_VALUE:
            raise ConfigError("The value %s for the maximum coverage depth is not going to work :/ While the maximum\
                               depth of coverage for anvi'o to care about is a soft cut-off (hence you have some level\
                               of freedom through the parameter `--max-coverage-depth`), there are database limitations\
                               anvi'o must consider and cannot change. The maximum value allowed in the database for\
                               coverage information is 65536. Hence, you should set your depth of coverage to something \
                               that is less than this value. In addition, it is also recommended to leave a little gap\
                               and not go beyond 90%% of this hard limit (that's why anvi'o will keep telling you,\
                               \"%s is nice, but %s is the best I can do\" when you try to exceed that)." \
                                        % (pp(self.max_coverage_depth), pp(self.max_coverage_depth), pp(auxiliarydataops.COVERAGE_MAX_VALUE)))

        if self.blank and not self.skip_hierarchical_clustering:
            self.contigs_shall_be_clustered = True

        if A('contigs_of_interest'):
            filesnpaths.is_file_exists(args.contigs_of_interest)
            self.contig_names_of_interest = set([c.strip() for c in open(args.contigs_of_interest).readlines()\
                                                                           if c.strip() and not c.startswith('#')])
        else:
            self.contig_names_of_interest = None

        if self.list_contigs_and_exit:
            self.list_contigs()
            sys.exit()

        if not self.contigs_db_path:
            raise ConfigError("No contigs database, no profilin'. Bye.")

        # Initialize contigs db
        dbops.ContigsSuperclass.__init__(self,
                                         self.args,
                                         r=self.run,
                                         p=self.progress)
        self.init_contig_sequences()
        self.contig_names_in_contigs_db = set(self.contigs_basic_info.keys())

        self.bam = None
        self.contigs = []

        self.database_paths = {
            'CONTIGS.db': os.path.abspath(self.contigs_db_path)
        }

        self.profile_db_path = None

        self.clustering_configs = constants.clustering_configs[
            'blank' if self.blank else 'single']

        # following variable will be populated during the profiling, and its content will eventually
        # be stored in t.variable_nts_table_name
        self.variable_nts_table_entries = []

        # if genes are not called, yet the user is asking for codon frequencies to be profiled, we give
        # a warning and force-turn that flag off.
        if (not self.a_meta['genes_are_called']) and self.profile_SCVs:
            self.run.warning(
                "You asked the codon frequencies to be profiled, but genes were not called\
                              for your contigs database. Anvi'o is assigning `False` to the profile-codon-frequncies\
                              flag, overruling your request like a boss.")
            self.profile_SCVs = False

        # following variable will be populated while the variable positions table is computed
        self.codons_in_genes_to_profile_SCVs = set([])

        # we don't know what we are about
        self.description = None

        # additional layer data will be filled later
        self.layer_additional_keys = []
        self.layer_additional_data = {}
Example #4
    def __init__(self,
                 args=None,
                 run=terminal.Run(),
                 progress=terminal.Progress()):
        self.init_workflow_super_class(args, workflow_name='pangenomics')

        # TODO: once we update all other workflows, this will be initiated in WorkflowSuperClass
        self.target_files = []
        self.pan_project_name = None
        self.valid_sequence_sources_for_phylogeny = ['gene_clusters', 'hmm']
        self.sequence_source_for_phylogeny = None
        self.tree_name = None
        self.phylogeny_imported_flag = None

        # initialize the base class
        PhylogenomicsWorkflow.__init__(self)

        self.rules.extend([
            'anvi_gen_genomes_storage', 'anvi_pan_genome',
            'anvi_get_sequences_for_gene_clusters',
            'import_phylogenetic_tree_to_pangenome'
        ])

        self.general_params.extend([
            "project_name", "fasta_txt", "internal_genomes",
            "external_genomes", "sequence_source_for_phylogeny"
        ])

        self.dirs_dict.update({
            "FASTA_DIR": "01_FASTA",
            "CONTIGS_DIR": "02_CONTIGS",
            "PAN_DIR": "03_PAN"
        })

        self.default_config.update({
            "fasta_txt": "fasta.txt",
            "anvi_pan_genome": {
                "threads": 7
            },
            "import_phylogenetic_tree_to_pangenome": {
                'tree_name': 'phylogeny'
            }
        })

        pan_params = ["--project-name", "--genome-names", "--skip-alignments",\
                     "--align-with", "--exclude-partial-gene-calls", "--use-ncbi-blast",\
                     "--minbit", "--mcl-inflation", "--min-occurrence",\
                     "--min-percent-identity", "--sensitive", "--description",\
                     "--overwrite-output-destinations", "--skip-hierarchical-clustering",\
                     "--enforce-hierarchical-clustering", "--distance", "--linkage"]
        self.rule_acceptable_params_dict['anvi_pan_genome'] = pan_params

        storage_params = ["--gene-caller"]
        self.rule_acceptable_params_dict[
            'anvi_gen_genomes_storage'] = storage_params

        seq_params = [
            "--gene-cluster-id", "--gene-cluster-ids-file",
            "--collection-name", "--bin-id",
            "--min-num-genomes-gene-cluster-occurs",
            "--max-num-genomes-gene-cluster-occurs",
            "--min-num-genes-from-each-genome",
            "--max-num-genes-from-each-genome",
            "--max-num-gene-clusters-missing-from-genome",
            "--min-functional-homogeneity-index",
            "--max-functional-homogeneity-index",
            "--min-geometric-homogeneity-index",
            "--max-geometric-homogeneity-index",
            "--add-into-items-additional-data-table",
            "--concatenate-gene-clusters", "--separator", "--align-with"
        ]
        self.rule_acceptable_params_dict[
            'anvi_get_sequences_for_gene_clusters'] = seq_params

        import_params = ['--just-do-it', 'tree_name']
        self.rule_acceptable_params_dict[
            'import_phylogenetic_tree_to_pangenome'] = import_params
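
In these workflow classes, `rule_acceptable_params_dict` whitelists which config-file keys may be forwarded to a given rule's command line. A hedged sketch of how such a whitelist could be applied when assembling a command (the helper below is hypothetical, not the actual anvi'o implementation):

def build_param_string(rule_config, acceptable_params):
    """Turn whitelisted config entries into a CLI fragment (hypothetical helper)."""
    parts = []
    for param, value in rule_config.items():
        if param not in acceptable_params:
            continue  # anything not whitelisted for this rule is dropped
        if value is True:
            parts.append(param)                     # bare boolean flag
        elif value not in (False, None, ""):
            parts.append("%s %s" % (param, value))  # flag with a value
    return " ".join(parts)

pan_config = {"--project-name": "MYPAN", "--use-ncbi-blast": True, "--bogus": 1}
print(build_param_string(pan_config, ["--project-name", "--use-ncbi-blast"]))
# --project-name MYPAN --use-ncbi-blast
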
Example #5
    def __init__(self, args):
        self.args = args

        A = lambda x: args.__dict__[x] if x in args.__dict__ else None
        self.input_file_path = A('input_file')
        self.contigs_db_path = A('contigs_db')
        self.serialized_profile_path = A('serialized_profile')
        self.output_directory = A('output_dir')
        self.list_contigs_and_exit = A('list_contigs')
        self.min_contig_length = A('min_contig_length')
        self.min_mean_coverage = A('min_mean_coverage')
        self.min_coverage_for_variability = A('min_coverage_for_variability')
        self.contigs_shall_be_clustered = A('cluster_contigs')
        self.sample_id = A('sample_name')
        self.report_variability_full = A('report_variability_full')
        self.overwrite_output_destinations = A('overwrite_output_destinations')
        self.skip_SNV_profiling = A('skip_SNV_profiling')
        self.profile_AA_frequencies = A('profile_AA_frequencies')
        self.gen_serialized_profile = A('gen_serialized_profile')
        self.distance = A('distance') or constants.distance_metric_default
        self.linkage = A('linkage') or constants.linkage_method_default
        self.num_threads = int(A('num_threads'))
        self.queue_size = int(A('queue_size'))
        self.write_buffer_size = int(A('write_buffer_size'))
        self.total_length_of_all_contigs = 0
        self.total_coverage_values_for_all_contigs = 0
        self.description_file_path = A('description')

        # make sure early on that both the distance and linkage is OK.
        clustering.is_distance_and_linkage_compatible(self.distance,
                                                      self.linkage)

        # whether the profile database is a blank (without any BAM files or reads):
        self.blank = A('blank_profile')

        if self.blank:
            self.contigs_shall_be_clustered = True

        if args.contigs_of_interest:
            filesnpaths.is_file_exists(args.contigs_of_interest)
            self.contig_names_of_interest = set([c.strip() for c in open(args.contigs_of_interest).readlines()\
                                                                           if c.strip() and not c.startswith('#')])
        else:
            self.contig_names_of_interest = None

        self.progress = terminal.Progress()
        self.run = terminal.Run(width=35)

        if self.list_contigs_and_exit:
            self.list_contigs()
            sys.exit()

        if not self.contigs_db_path:
            raise ConfigError("No contigs database, no profilin'. Bye.")

        # Initialize contigs db
        dbops.ContigsSuperclass.__init__(self,
                                         self.args,
                                         r=self.run,
                                         p=self.progress)
        self.init_contig_sequences()
        self.contig_names_in_contigs_db = set(self.contigs_basic_info.keys())

        self.bam = None
        self.contigs = []

        self.database_paths = {'CONTIGS.db': self.contigs_db_path}

        self.profile_db_path = None

        self.clustering_configs = constants.clustering_configs[
            'blank' if self.blank else 'single']

        # following variable will be populated during the profiling, and its content will eventually
        # be stored in t.variable_nts_table_name
        self.variable_nts_table_entries = []

        # following variable will be populated while the variable positions table is computed
        self.codons_in_genes_to_profile_AA_frequencies = set([])

        # we don't know what we are about
        self.description = None
Example #6
    def __init__(self, args=None):
        self.args = None
        self.input_file_path = None
        self.contigs_db_path = None
        self.serialized_profile_path = None
        self.output_directory = None
        self.list_contigs_and_exit = None
        self.min_contig_length = 10000
        self.min_mean_coverage = 0
        self.min_coverage_for_variability = 10  # if a nucleotide position is covered less than this, don't bother
        self.contig_names_of_interest = None
        self.contigs_shall_be_clustered = False
        self.report_variability_full = False  # don't apply any noise filtering, and simply report ALL base frequencies
        self.overwrite_output_destinations = False
        self.skip_SNV_profiling = False

        if args:
            self.args = args
            self.input_file_path = args.input_file
            self.contigs_db_path = args.contigs_db
            self.serialized_profile_path = args.serialized_profile
            self.output_directory = args.output_dir
            self.list_contigs_and_exit = args.list_contigs
            self.min_contig_length = args.min_contig_length
            self.min_mean_coverage = args.min_mean_coverage
            self.min_coverage_for_variability = args.min_coverage_for_variability
            self.contigs_shall_be_clustered = args.cluster_contigs
            self.number_of_threads = 4
            self.no_threading = True
            self.sample_id = args.sample_name
            self.report_variability_full = args.report_variability_full
            self.overwrite_output_destinations = args.overwrite_output_destinations
            self.skip_SNV_profiling = args.skip_SNV_profiling

            if args.contigs_of_interest:
                if not os.path.exists(args.contigs_of_interest):
                    raise ConfigError("Contigs file (%s) is missing..." %
                                      args.contigs_of_interest)

                self.contig_names_of_interest = set([c.strip() for c in open(args.contigs_of_interest).readlines()\
                                                                               if c.strip() and not c.startswith('#')])

        self.progress = terminal.Progress()
        self.run = terminal.Run(width=35)

        if self.list_contigs_and_exit:
            self.list_contigs()
            sys.exit()

        # Initialize contigs db
        dbops.ContigsSuperclass.__init__(self,
                                         self.args,
                                         r=self.run,
                                         p=self.progress)
        self.init_contig_sequences()
        self.contig_names_in_contigs_db = set(self.contigs_basic_info.keys())

        self.bam = None
        self.contigs = {}

        self.database_paths = {'CONTIGS.db': self.contigs_db_path}

        self.profile_db_path = None

        self.clustering_configs = constants.clustering_configs['single']

        self.atomic_contig_split_data = contigops.AtomicContigSplitData(
            self.progress)

        # following variable will be populated during the profiling, and its content will eventually
        # be stored in t.variable_nts_table_name
        self.variable_nts_table_entries = []
Example #7
    def __init__(self, args, run=terminal.Run(), progress=terminal.Progress()):
        self.args = args
        self.run = run
        self.progress = progress

        # know thyself.
        self.name = 'metagenomics'
        self.samples_information = {}
        self.kraken_annotation_dict = {}

        # initialize the base class
        ContigsDBWorkflow.__init__(self)

        self.rules.extend(['iu_gen_configs', 'iu_filter_quality_minoche', 'gen_qc_report', 'gzip_fastqs',\
                     'fq2fa', 'merge_fastas_for_co_assembly', 'megahit',\
                     'anvi_gen_contigs_database', 'anvi_export_gene_calls', 'centrifuge',\
                     'anvi_import_taxonomy', 'anvi_run_hmms', 'anvi_run_ncbi_cogs',\
                     'bowtie_build', 'bowtie', 'samtools_view', 'anvi_init_bam', 'idba_ud', \
                     'anvi_profile', 'annotate_contigs_database', 'anvi_merge', 'import_percent_of_reads_mapped', \
                     'krakenhll', 'krakenhll_mpa_report', 'import_kraken_hll_taxonomy'])

        self.general_params.extend(["samples_txt", "references_mode", "all_against_all", \
                                    "kraken_txt"])

        rule_acceptable_params_dict = {}

        # defining the accessible params per rule
        rule_acceptable_params_dict['iu_gen_configs'] = [
            "--r1-prefix", "--r2-prefix"
        ]
        rule_acceptable_params_dict['iu_filter_quality_minoche'] = [
            'run', '--visualize-quality-curves', '--ignore-deflines',
            '--limit-num-pairs', '--print-qual-scores', '--store-read-fate'
        ]
        rule_acceptable_params_dict['gzip_fastqs'] = ["run"]
        rule_acceptable_params_dict['megahit'] = [
            "run", "--min-contig-len", "--min-count", "--k-min", "--k-max",
            "--k-step", "--k-list", "--no-mercy", "--no-bubble",
            "--merge-level", "--prune-level", "--prune-depth",
            "--low-local-ratio", "--max-tip-len", "--no-local", "--kmin-1pass",
            "--presets", "--memory", "--mem-flag", "--use-gpu", "--gpu-mem",
            "--keep-tmp-files", "--tmp-dir", "--continue", "--verbose"
        ]
        rule_acceptable_params_dict['idba_ud'] = [
            "run", "--mink", "--maxk", "--step", "--inner_mink",
            "--inner_step", "--prefix", "--min_count", "--min_support",
            "--seed_kmer", "--min_contig", "--similar", "--max_mismatch",
            "--min_pairs", "--no_bubble", "--no_local", "--no_coverage",
            "--no_correct", "--pre_correction"
        ]
        rule_acceptable_params_dict['bowtie'] = ["additional_params"]
        rule_acceptable_params_dict['samtools_view'] = ["additional_params"]
        rule_acceptable_params_dict['anvi_profile'] = [
            "--overwrite-output-destinations", "--sample-name",
            "--report-variability-full", "--skip-SNV-profiling",
            "--profile-SCVs", "--description",
            "--skip-hierarchical-clustering", "--distance", "--linkage",
            "--min-contig-length", "--min-mean-coverage",
            "--min-coverage-for-variability", "--cluster-contigs",
            "--contigs-of-interest", "--queue-size", "--write-buffer-size"
        ]
        rule_acceptable_params_dict['annotate_contigs_database'] = []
        rule_acceptable_params_dict['anvi_merge'] = [
            "--sample-name", "--description", "--skip-hierarchical-clustering",
            "--enforce-hierarchical-clustering", "--distance", "--linkage",
            "--skip-concoct-binning", "--overwrite-output-destinations"
        ]
        rule_acceptable_params_dict['import_percent_of_reads_mapped'] = ["run"]
        rule_acceptable_params_dict['krakenhll'] = [
            "additional_params", "run", "--db", "--gzip-compressed"
        ]
        rule_acceptable_params_dict['krakenhll_mpa_report'] = [
            "additional_params"
        ]
        rule_acceptable_params_dict['import_kraken_hll_taxonomy'] = [
            "--min-abundance"
        ]

        self.rule_acceptable_params_dict.update(rule_acceptable_params_dict)

        forbidden_params = {}
        forbidden_params['krakenhll'] = [
            '--fastq-input', '--paired', '--output'
        ]

        self.forbidden_params.update(forbidden_params)

        self.dirs_dict.update({
            "QC_DIR": "01_QC",
            "FASTA_DIR": "02_FASTA",
            "CONTIGS_DIR": "03_CONTIGS",
            "MAPPING_DIR": "04_MAPPING",
            "PROFILE_DIR": "05_ANVIO_PROFILE",
            "MERGE_DIR": "06_MERGED",
            "TAXONOMY_DIR": "07_TAXONOMY"
        })

        self.default_config.update({
            'samples_txt': "samples.txt",
            'megahit': {
                "--min-contig-len": min_contig_length_for_assembly,
                "--memory": 0.4,
                "threads": 11
            },
            'idba_ud': {
                "--min_contig": min_contig_length_for_assembly,
                "threads": 11
            },
            'iu_filter_quality_minoche': {
                "run": True,
                "--ignore-deflines": True,
                "threads": 2
            },
            "gzip_fastqs": {
                "run": True
            },
            "bowtie_build": {
                "threads": 10
            },
            "bowtie": {
                "additional_params": "--no-unal",
                "threads": 10
            },
            "samtools_view": {
                "additional_params": "-F 4",
                "threads": 4
            },
            "anvi_init_bam": {
                "threads": 4
            },
            "anvi_profile": {
                "threads": 5,
                "--sample-name": "{sample}",
                "--overwrite-output-destinations": True
            },
            "anvi_merge": {
                "--sample-name": "{group}",
                "--overwrite-output-destinations": True
            },
            "import_percent_of_reads_mapped": {
                "run": True
            },
            "krakenhll": {
                "threads": 12,
                "--gzip-compressed": True,
                "additional_params": "--preload"
            }
        })
Example #8
    def __init__(self, args, run=terminal.Run(), progress=terminal.Progress()):

        self.run = run
        self.progress = progress

        self.run.warning(
            "Anvi'o will use 'InteracDome' by Kobren and Singh (DOI: 10.1093/nar/gky1224) to attribute binding frequencies. "
            "If you publish your findings, please do not forget to properly credit their work.",
            lc='green',
            header="CITATION")

        A = lambda x, t: t(args.__dict__[x]) if x in args.__dict__ else None
        null = lambda x: x
        self.interacdome_data_dir = A(
            'interacdome_data_dir',
            null) or constants.default_interacdome_data_path
        self.information_content_cutoff = A('information_content_cutoff',
                                            null) or 4
        self.min_binding_frequency = A('min_binding_frequency', null) or 0
        self.min_hit_fraction = A('min_hit_fraction', null) or 0.8
        self.interacdome_dataset = A('interacdome_dataset',
                                     null) or 'representable'
        self.output_prefix = A('output_file_prefix', null)
        self.just_do_it = A('just_do_it', null)

        self.run.warning("", header='INITIALIZATION', lc='green')
        self.run.info("Interacdome dataset used", self.interacdome_dataset)
        self.run.info("Minimum hit fraction", self.min_hit_fraction)

        self.hmm_filepath = os.path.join(self.interacdome_data_dir,
                                         'Pfam-A.hmm')

        # Init the InteracDome table
        self.interacdome_table = InteracDomeTableData(
            kind=self.interacdome_dataset,
            interacdome_data_dir=self.interacdome_data_dir)
        self.interacdome_table.load()

        # Init the Pfam baseclass
        args.hmmer_program = 'hmmsearch'  # Force use of hmmsearch
        args.pfam_data_dir = self.interacdome_data_dir
        Pfam.__init__(self, args, run=self.run, progress=self.progress)

        # Init contigs database
        args = argparse.Namespace(contigs_db=self.contigs_db_path)
        self.contigs_db = dbops.ContigsSuperclass(args)

        self.potentially_remove_previous_interacdome_data()

        # Init the HMM profile
        self.hmms = pfam.HMMProfile(self.hmm_filepath)

        # This dictionary is populated and cast as a dataframe. It contains all of the per-residue
        # binding frequency information for each hit
        self.bind_freq = {}

        # This dictionary (eventual dataframe) is just like self.bind_freq, except has averaged
        # binding frequencies for residue-ligand combos that have multiple contributing hits. It
        # also drops all contributing match state information
        self.avg_bind_freq = {}

        # This is a modified version of self.avg_bind_freq that is compatible with the
        # amino_acid_additional_data table structure, i.e.
        # tables.amino_acid_additional_data_table_structure
        self.amino_acid_additional_data = {}
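
Note the two-argument `A` here, which also casts the value, and the `or`-based defaults. One subtlety of that idiom: `or` treats every falsy value as missing, so an explicit 0 from the user is silently replaced by the default. A minimal sketch with an invented args dict:

args_dict = {'information_content_cutoff': 0, 'min_hit_fraction': None}

A = lambda x, t: t(args_dict[x]) if x in args_dict else None
null = lambda x: x  # identity "cast"

print(A('min_hit_fraction', null) or 0.8)                 # 0.8: None falls back to the default
print(A('interacdome_dataset', null) or 'representable')  # 'representable': key absent
print(A('information_content_cutoff', null) or 4)         # 4: the explicit 0 is replaced too!
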
Example #9
    def __init__(self,
                 args,
                 hmm_sources,
                 run=terminal.Run(),
                 progress=terminal.Progress()):
        self.args = args
        self.run = run
        self.progress = progress
        self.hmm_sources = hmm_sources

        self.splits_dict = {}

        # initialize the super
        SequencesForHMMHits.__init__(self,
                                     None,
                                     sources=hmm_sources,
                                     run=self.run,
                                     progress=self.progress)

        # process genome descriptions
        GenomeDescriptions.__init__(self,
                                    args,
                                    run=self.run,
                                    progress=self.progress)
        self.load_genomes_descriptions(skip_functions=True, init=False)

        hmm_sources_in_all_genomes = self.get_HMM_sources_common_to_all_genomes()

        if not len(hmm_sources_in_all_genomes):
            raise ConfigError(
                "There are no HMM sources among your external genomes that occur in every genome :/"
            )

        num_internal_genomes = len(
            set([
                g['genome_hash'] for g in self.genomes.values()
                if 'profile_db_path' in g
            ]))
        collection_names = set([
            g['collection_id'] for g in self.genomes.values()
            if 'collection_id' in g
        ])

        if num_internal_genomes:
            self.run.warning(
                "SequencesForHMMHitsWrapperForMultipleContigs class is speaking (yes, the class is "
                "quite aware of its very long name thankyouverymuch). Of the total %d genome descriptions "
                "it was given, %d seem to represent internal genomes with bins in collection(s) '%s'. Anvi'o "
                "will make sure HMM hits to be used for downstream analyses are only those that match to contigs "
                "that were included in those selections." %
                (len(self.genomes), num_internal_genomes,
                 ', '.join(collection_names)),
                lc="green")

        # very hacky code follows. here we populate this instance (itself a
        # SequencesForHMMHits object) with slightly modified information so multiple
        # contigs databases can be processed by this talented class seamlessly.
        hmm_hits_splits_counter = 0
        for genome_name in self.genomes:
            g = self.genomes[genome_name]
            contigs_db_path = g['contigs_db_path']
            contigs_db_hash = g['contigs_db_hash']

            # this is an important variable and allows us to track origins of HMM hits for bins
            # and individual contigs databases seamlessly. if you truly want to understand what
            # the hell this means, look at `get_genome_hash_for_external_genome` and
            # `get_genome_hash_for_internal_genome` functions in `genomedescriptions.py`.
            genome_hash = None

            # here we check if the genome descriptions contain reference to a collection name,
            # because if it is the case, we need to focus only on hmm hits that are relevant
            # to splits in this collection:
            if 'collection_id' in g:
                if ('bin_id' not in g) or ('profile_db_path' not in g):
                    raise ConfigError(
                        "There is something VERY weird going on. Your genome descriptions object contains "
                        "a collection name, yet it doesn't know anything about a bin name or profile database "
                        "path. While this is very interesting because it should never happen, anvi'o will say "
                        "goodbye and abruptly quit in confusion :(")

                # setup an args object, and recover the split names of interest
                args = argparse.Namespace(profile_db=g['profile_db_path'],
                                          contigs_db=g['contigs_db_path'],
                                          bin_id=g['bin_id'],
                                          collection_name=g['collection_id'])
                split_names_of_interest = ccollections.GetSplitNamesInBins(
                    args).get_split_names_only()
                genome_hash = hashlib.sha224('_'.join(
                    [''.join(split_names_of_interest),
                     contigs_db_hash]).encode('utf-8')).hexdigest()[0:12]

                # current hmm hits now will match to the collection
                current = SequencesForHMMHits(
                    contigs_db_path,
                    sources=hmm_sources,
                    split_names_of_interest=split_names_of_interest)
            else:
                current = SequencesForHMMHits(contigs_db_path,
                                              sources=hmm_sources)
                genome_hash = contigs_db_hash

            for hmm_hit_id in current.hmm_hits:
                hit = current.hmm_hits[hmm_hit_id]
                hit['gene_callers_id'] = '%s_%d' % (contigs_db_hash,
                                                    hit['gene_callers_id'])
                hit['genome_hash'] = genome_hash
                self.hmm_hits['%s_%d' % (contigs_db_hash, hmm_hit_id)] = hit

            if not self.hmm_hits_info:
                for hmm_source in hmm_sources_in_all_genomes:
                    self.hmm_hits_info[hmm_source] = current.hmm_hits_info[
                        hmm_source]

            for hit in current.hmm_hits_splits.values():
                hit['split'] = '%s_%s' % (contigs_db_hash, hit['split'])
                hit['hmm_hit_entry_id'] = '%s_%d' % (contigs_db_hash,
                                                     hit['hmm_hit_entry_id'])
                self.hmm_hits_splits[hmm_hits_splits_counter] = hit
                hmm_hits_splits_counter += 1

            for seq in current.contig_sequences:
                self.contig_sequences['%s_%s' %
                                      (contigs_db_hash,
                                       seq)] = current.contig_sequences[seq]

            for seq in current.aa_sequences:
                self.aa_sequences['%s_%s' % (contigs_db_hash,
                                             seq)] = current.aa_sequences[seq]

            for gene_callers_id in current.genes_in_contigs:
                entry = current.genes_in_contigs[gene_callers_id]
                entry['contig'] = '%s_%s' % (contigs_db_hash, entry['contig'])
                self.genes_in_contigs['%s_%d' % (contigs_db_hash,
                                                 gene_callers_id)] = entry

            self.splits_dict[genome_name] = [
                '%s_%s' % (contigs_db_hash, s)
                for s in current.splits_in_contigs
            ]
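
The `genome_hash` computed above is just the first 12 hex characters of a SHA-224 digest over the concatenated split names plus the contigs database hash, so identical bin contents always map to the same id. A standalone sketch with invented inputs:

import hashlib

split_names_of_interest = ['contig_001_split_00001', 'contig_001_split_00002']
contigs_db_hash = 'hashd41d8cd9'  # invented

genome_hash = hashlib.sha224(
    '_'.join([''.join(split_names_of_interest), contigs_db_hash]).encode('utf-8')
).hexdigest()[0:12]
print(genome_hash)  # deterministic 12-character id for this bin
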
Example #10
    def __init__(self, program_name='fastANI', args={}, run=terminal.Run(), progress=terminal.Progress()):
        FastANIDriver.__init__(self, program_name, args, run, progress)
Example #11
    def __init__(self,
                 args=None,
                 run=terminal.Run(),
                 progress=terminal.Progress()):
        self.init_workflow_super_class(args, workflow_name='trnaseq')

        self.rules.extend([
            'make_iu_input', 'iu_gen_configs', 'iu_merge_pairs',
            'gen_qc_report', 'anvi_reformat_fasta', 'anvi_trnaseq',
            'anvi_convert_trnaseq_database', 'anvi_run_trna_taxonomy'
        ])

        # "General" section of the workflow config file.
        self.general_params.extend(['samples_txt'])

        # Parameters for each rule that are accessible in the config file.
        rule_acceptable_params_dict = {}
        rule_acceptable_params_dict['iu_merge_pairs'] = [
            'run', '--gzip-output', '--marker-gene-stringent',
            '--max-num-mismatches', '--report-r1-prefix', '--report-r2-prefix'
        ]
        rule_acceptable_params_dict['anvi_reformat_fasta'] = [
            'run', '--gzip-output', '--simplify-names'
        ]
        rule_acceptable_params_dict['anvi_trnaseq'] = [
            'run', '--treatment', '--overwrite-output-destinations',
            '--description', '--write-checkpoints', '--load-checkpoint',
            '--feature-param-file', '--min-length-long-fiveprime',
            '--min-trna-fragment-size', '--agglomeration-max-mismatch-freq',
            '--fiveprimemost-deletion-start',
            '--threeprimemost-deletion-start', '--fiveprimemost-deletion-stop',
            '--threeprimemost-deletion-stop', '--max-distinct-deletions',
            '--skip-fasta-check', '--write-buffer-size',
            '--alignment-target-chunk-size',
            '--fragment-mapping-query-chunk-length',
            '--alignment-progress-interval',
            '--agglomeration-progress-interval'
        ]
        rule_acceptable_params_dict['anvi_convert_trnaseq_database'] = [
            'run', '--project-name', '--max-reported-trna-seeds',
            '--overwrite-output-destinations', '--description',
            '--feature-threshold', '--preferred-treatment',
            '--nonspecific-output', '--min-variation', '--min-third-fourth-nt',
            '--distance', '--linkage'
        ]
        rule_acceptable_params_dict['anvi_run_trna_taxonomy'] = [
            'run', '--trna-taxonomy-data-dir', '--min-percent-identity',
            '--max-num-target-sequences', '--num-parallel-processes',
            '--write-buffer-size'
        ]
        self.rule_acceptable_params_dict.update(rule_acceptable_params_dict)

        # Default values for accessible parameters: all defaults are written to the config file so
        # the user can see them succinctly.

        # Though the workflow superclass automatically adds a threads argument of "" to each
        # workflow, here we make explicit that the default is 1 and the user does not need to
        # enclose the value in quotes. Likewise, the superclass adds mandatory arguments at the end
        # of the list for each rule in the config file, but we explicitly add them here to ensure
        # they appear in the order of each script's help display.
        self.default_config.update({
            'samples_txt': 'samples.txt',
            'iu_merge_pairs': {
                'run': True,
                '--gzip-output': False,
                '--marker-gene-stringent': True,
                '--max-num-mismatches': 0,
                '--report-r1-prefix': False,
                '--report-r2-prefix': False,
                'threads': 1
            },
            'anvi_reformat_fasta': {
                'run': True,
                '--gzip-output':
                False,  # not an argument of anvi-script-reformat-fasta
                '--simplify-names':
                True,  # not the default in anvi-script-reformat-fasta
                'threads': 1
            },
            'anvi_trnaseq': {
                'run':
                True,
                '--treatment':
                "",  # if provided in the config file, the treatment is assumed to be for all samples
                '--overwrite-output-destinations':
                anvio.D['overwrite-output-destinations'][1]['default'],
                '--description':
                "",
                '--write-checkpoints':
                anvio.D['write-checkpoints'][1]['default'],
                '--load-checkpoint':
                "",
                '--feature-param-file':
                "",
                '--min-length-long-fiveprime':
                anvio.D['min-length-long-fiveprime'][1]['default'],
                '--min-trna-fragment-size':
                anvio.D['min-trna-fragment-size'][1]['default'],
                '--agglomeration-max-mismatch-freq':
                anvio.D['agglomeration-max-mismatch-freq'][1]['default'],
                '--fiveprimemost-deletion-start':
                anvio.D['fiveprimemost-deletion-start'][1]['default'],
                '--threeprimemost-deletion-start':
                anvio.D['threeprimemost-deletion-start'][1]['default'],
                '--fiveprimemost-deletion-stop':
                anvio.D['fiveprimemost-deletion-stop'][1]['default'],
                '--threeprimemost-deletion-stop':
                anvio.D['threeprimemost-deletion-stop'][1]['default'],
                '--max-distinct-deletions':
                anvio.D['max-distinct-deletions'][1]['default'],
                '--skip-fasta-check':
                True,  # not the default in anvi-trnaseq
                '--write-buffer-size':
                100000,  # the default set in anvi-trnaseq (not the anvi'o-wide default)
                '--alignment-target-chunk-size':
                anvio.D['alignment-target-chunk-size'][1]['default'],
                '--fragment-mapping-query-chunk-length':
                anvio.D['fragment-mapping-query-chunk-length'][1]['default'],
                '--alignment-progress-interval':
                anvio.D['alignment-progress-interval'][1]['default'],
                '--agglomeration-progress-interval':
                anvio.D['agglomeration-progress-interval'][1]['default'],
                'threads':
                1
            },
            'anvi_convert_trnaseq_database': {
                'run':
                True,
                '--project-name':
                "",
                '--max-reported-trna-seeds':
                anvio.D['max-reported-trna-seeds'][1]['default'],
                '--overwrite-output-destinations':
                anvio.D['overwrite-output-destinations'][1]['default'],
                '--description':
                "",
                '--feature-threshold':
                anvio.D['feature-threshold'][1]['default'],
                '--preferred-treatment':
                "",
                '--nonspecific-output':
                anvio.D['nonspecific-output'][1]['default'],
                '--min-variation':
                anvio.D['min-variation'][1]['default'],
                '--min-third-fourth-nt':
                anvio.D['min-third-fourth-nt'][1]['default'],
                '--distance':
                anvio.D['distance'][1]['default'],
                '--linkage':
                anvio.D['linkage'][1]['default'],
                'threads':
                1
            },
            'anvi_run_trna_taxonomy': {
                'run':
                True,
                '--trna-taxonomy-data-dir':
                "",
                '--min-percent-identity':
                anvio.D['min-percent-identity'][1]['default'],
                '--max-num-target-sequences':
                anvio.D['max-num-target-sequences'][1]['default'],
                '--num-parallel-processes':
                anvio.D['num-parallel-processes'][1]['default'],
                '--write-buffer-size':
                anvio.D['write-buffer-size'][1]['default'],
                'threads':
                1
            },
            'make_iu_input': {
                'threads': 1
            },
            'iu_gen_configs': {
                'threads': 1
            },
            'gen_qc_report': {
                'threads': 1
            },
            'output_dirs':
            {},  # This ensures that output_dirs comes before max_threads in the file
            'max_threads': 1
        })

        self.dirs_dict.update({
            'QC_DIR': '01_QC',
            'IDENT_DIR': '02_IDENT',
            'CONVERT_DIR': '03_CONVERT'
        })
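
Most of the defaults above are pulled from `anvio.D`, anvi'o's central registry of command-line arguments. Judging from the `anvio.D[name][1]['default']` lookups, each entry pairs the flag spec with the argparse keyword dict; the mock below illustrates that assumed shape with invented values:

D = {
    'distance': (
        ['--distance'],  # assumed flag spec
        {'metavar': 'DISTANCE_METRIC', 'default': 'euclidean',
         'help': "The distance metric for hierarchical clustering."},  # argparse kwargs
    ),
}
print(D['distance'][1]['default'])  # euclidean
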
Example #12
from anvio.drivers.blast import BLAST
from anvio.dbops import ContigsDatabase

from anvio.taxonomyops import AccessionIdToTaxonomy
from anvio.taxonomyops import TaxonomyEstimatorSingle
from anvio.taxonomyops import PopulateContigsDatabaseWithTaxonomy

__author__ = "Developers of anvi'o (see AUTHORS.txt)"
__copyright__ = "Copyleft 2015-2018, the Meren Lab (http://merenlab.org/)"
__license__ = "GPL 3.0"
__version__ = anvio.__version__
__maintainer__ = "A. Murat Eren"
__email__ = "*****@*****.**"

run_quiet = terminal.Run(log_file_path=None, verbose=False)
progress_quiet = terminal.Progress(verbose=False)
pp = terminal.pretty_print

HASH = lambda d: str(
    hashlib.sha224(''.join([
        str(d[level]) for level in constants.levels_of_taxonomy
    ]).encode('utf-8')).hexdigest()[0:8])
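
`HASH` reduces a full taxonomy dict to a short, deterministic id. A standalone sketch with the taxonomy levels written out explicitly (assumed here; the real list comes from `constants.levels_of_taxonomy`):

import hashlib

levels_of_taxonomy = ['t_domain', 't_phylum', 't_class', 't_order',
                      't_family', 't_genus', 't_species']  # assumed ordering
HASH = lambda d: str(hashlib.sha224(''.join(
    [str(d[level]) for level in levels_of_taxonomy]
).encode('utf-8')).hexdigest()[0:8])

taxonomy = dict.fromkeys(levels_of_taxonomy)
taxonomy.update({'t_domain': 'Bacteria', 't_phylum': 'Firmicutes'})
print(HASH(taxonomy))  # the same dict contents always yield the same 8-char id
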


class TRNATaxonomyContext(AccessionIdToTaxonomy):
    """The purpose of this base class is ot define file paths and constants for trna taxonomy ops."""
    def __init__(self,
                 trna_taxonomy_data_dir=None,
                 scgs_taxonomy_remote_database_url=None,
                 run=terminal.Run(),
                 progress=terminal.Progress()):
Example #13
    def __init__(self,
                 args=None,
                 run=terminal.Run(),
                 progress=terminal.Progress()):
        self.init_workflow_super_class(args, workflow_name='contigs')

        self.group_names = []
        self.contigs_information = {}
        self.fasta_txt_file = None
        self.fasta_information = {}
        # we have external_genomes_file defined here for the sake of pangenomics and phylogenomics workflows
        self.external_genomes_file = ''
        # we have references_mode defined here for the sake of the metagenomics workflow (it is only used when this workflow is inherited)
        self.references_mode = None
        self.import_external_functions_flags = []

        self.rules.extend([
            'gen_external_genome_file', 'anvi_script_reformat_fasta',
            'anvi_gen_contigs_database', 'export_gene_calls_for_centrifuge',
            'centrifuge', 'anvi_import_taxonomy_for_genes',
            'anvi_run_scg_taxonomy', 'anvi_run_trna_scan', 'anvi_run_hmms',
            'anvi_run_ncbi_cogs', 'annotate_contigs_database',
            'anvi_get_sequences_for_gene_calls', 'emapper',
            'anvi_script_run_eggnog_mapper', 'gunzip_fasta',
            'reformat_external_gene_calls_table',
            'reformat_external_functions', 'import_external_functions',
            'anvi_run_pfams'
        ])

        self.general_params.extend(["fasta_txt"])

        self.dirs_dict.update({
            "FASTA_DIR": "01_FASTA",
            "CONTIGS_DIR": "02_CONTIGS"
        })

        self.default_config.update({
            "fasta_txt": "fasta.txt",
            "anvi_gen_contigs_database": {
                "--project-name": "{group}"
            },
            "centrifuge": {
                "threads": 2
            },
            "anvi_run_hmms": {
                "run": True,
                "threads": 5
            },
            "anvi_run_ncbi_cogs": {
                "run": True,
                "threads": 5
            },
            "anvi_run_scg_taxonomy": {
                "run": True,
                "threads": 6
            },
            'anvi_run_trna_scan': {
                "run": True,
                "threads": 6
            },
            "anvi_script_reformat_fasta": {
                "run": True,
                "--prefix": "{group}",
                "--simplify-names": True
            },
            "emapper": {
                "--database": "bact",
                "--usemem": True,
                "--override": True
            },
            "anvi_script_run_eggnog_mapper": {
                "--use-version": "0.12.6"
            }
        })

        self.rule_acceptable_params_dict['anvi_run_ncbi_cogs'] = [
            'run', '--cog-data-dir', '--sensitive', '--temporary-dir-path',
            '--search-with'
        ]

        self.rule_acceptable_params_dict['anvi_run_scg_taxonomy'] = [
            'run', '--scgs-taxonomy-data-dir'
        ]

        self.rule_acceptable_params_dict['anvi_run_trna_scan'] = [
            'run', '--trna-cutoff-score'
        ]

        self.rule_acceptable_params_dict['anvi_run_hmms'] = [
            'run', '--installed-hmm-profile', '--hmm-profile-dir',
            '--also-scan-trnas'
        ]

        self.rule_acceptable_params_dict['anvi_run_pfams'] = [
            'run', '--pfam-data-dir'
        ]

        self.rule_acceptable_params_dict['centrifuge'] = ['run', 'db']

        self.rule_acceptable_params_dict['emapper'] = [
            '--database', '--usemem', '--override', 'path_to_emapper_dir'
        ]

        self.rule_acceptable_params_dict['anvi_script_run_eggnog_mapper'] = [
            'run', '--cog-data-dir', '--drop-previous-annotations',
            '--use-version'
        ]

        self.rule_acceptable_params_dict['anvi_script_reformat_fasta'] = \
                    ['run', '--keep-ids', '--exclude-ids', '--min-len', "--prefix", "--simplify-names"]


        gen_contigs_params = ['--description', '--skip-gene-calling', '--external-gene-calls',
                              '--ignore-internal-stop-codons', '--skip-mindful-splitting',
                              '--contigs-fasta', '--project-name', '--split-length',
                              '--kmer-size', '--skip-predict-frame',
                              '--prodigal-translation-table']

        self.rule_acceptable_params_dict[
            'anvi_gen_contigs_database'] = gen_contigs_params
Example #14
    def __init__(self, input_file_paths, taxonomy_table_structure, run=terminal.Run(), progress=terminal.Progress()):
        self.run = run
        self.progress = progress

        self.min_hit_score = 250

        files_expected = {'report': 'centrifuge_report.tsv', 'hits': 'centrifuge_hits.tsv'}

        files_structure = {'report':
                                {'col_names': ['t_species', 'taxon_id', 'f1', 'f2', 'f3', 'f4', 'f5'],
                                 'col_mapping': [str, int, str, str, str, str, str],
                                 'indexing_field': 1},
                           'hits':
                                {'col_names': ['gene_callers_id', 'f1', 'taxon_id', 'score', 'f2', 'f3', 'f4', 'f5'],
                                 'col_mapping': [lambda x: int(x.split('|')[0]), str, int, int, str, str, str, str],
                                 'indexing_field': -1},
                          }

        self.taxonomy_table_structure = taxonomy_table_structure
        Parser.__init__(self, 'centrifuge', input_file_paths, files_expected, files_structure)
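
The `files_structure` dict declares, for each expected file, its column names, a per-column cast, and which column to index on. A hedged sketch of how such a spec could drive parsing a hits line (illustrative only, not the anvi'o `Parser` internals):

import csv, io

hits_tsv = "42|gene\treadA\t562\t300\t.\t.\t.\t.\n"
spec = {'col_names': ['gene_callers_id', 'f1', 'taxon_id', 'score', 'f2', 'f3', 'f4', 'f5'],
        'col_mapping': [lambda x: int(x.split('|')[0]), str, int, int, str, str, str, str]}

rows = []
for fields in csv.reader(io.StringIO(hits_tsv), delimiter='\t'):
    rows.append({name: cast(value) for name, cast, value
                 in zip(spec['col_names'], spec['col_mapping'], fields)})
print(rows[0]['gene_callers_id'], rows[0]['score'])  # 42 300
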
Example #15
    def check_database(self):
        """Setup the database files

        Downloads the .pir file if it is missing
        Binarizes .pir file if .bin is missing
        Creates the .dmnd file if it is missing
        """

        extensionless, extension = os.path.splitext(self.modeller_database)
        if extension not in [".bin", ".pir", ""]:
            raise ConfigError(
                "MODELLER :: The only possible database extensions are .bin and .pir"
            )

        bin_db_path = J(self.database_dir, extensionless + ".bin")
        pir_db_path = J(self.database_dir, extensionless + ".pir")
        bin_exists = utils.filesnpaths.is_file_exists(bin_db_path,
                                                      dont_raise=True)
        pir_exists = utils.filesnpaths.is_file_exists(pir_db_path,
                                                      dont_raise=True)

        self.database_path = bin_db_path

        if not (bin_exists and pir_exists):
            if not pir_exists:
                # Download .pir
                self.run.warning(
                    "Anvi'o looked in {} for a database with the name {} and with an extension "
                    "of either .bin or .pir, but didn't find anything matching that "
                    "criteria. Anvi'o will try and download the best database it knows of from "
                    "https://salilab.org/modeller/downloads/pdb_95.pir.gz and use that. "
                    "You can checkout https://salilab.org/modeller/ for more info about the pdb_95 "
                    "database".format(self.database_dir,
                                      self.modeller_database))

                db_download_path = os.path.join(self.database_dir,
                                                "pdb_95.pir.gz")
                utils.download_file(
                    "https://salilab.org/modeller/downloads/pdb_95.pir.gz",
                    db_download_path)
                utils.run_command(
                    ['gzip', '-d', db_download_path],
                    log_file_path=filesnpaths.get_temp_file_path())

            # Binarize .pir (make .bin)
            self.run.warning(
                "Your database is not in binary format. That means accessing its contents is slower "
                "than it could be. Anvi'o is going to make a binary version. Just FYI."
            )
            self.run_binarize_database(pir_db_path, bin_db_path)

        dmnd_db_path = J(self.database_dir, 'pdb_95.dmnd')

        if os.path.exists(dmnd_db_path):
            return

        self.run.warning(
            "Your diamond database does not exist. It will be created.")

        script_name = "pir_to_fasta.py"

        self.copy_script_to_directory(script_name)

        input_pir_path = J(self.database_dir, 'pdb_95.pir')
        fasta_path = J(self.database_dir, 'pdb_95.fa')
        dmnd_path = J(self.database_dir, 'pdb_95')

        command = [self.executable, script_name, input_pir_path, fasta_path]

        self.run_command(command, script_name=script_name, rename_log=False)

        temp = u.FastaOutput(filesnpaths.get_temp_file_path())
        fasta = u.SequenceSource(fasta_path)

        while next(fasta):
            temp.write_id(fasta.id)
            temp.write_seq(fasta.seq.replace('-', '').replace('.', 'X'))

        shutil.move(temp.output_file_path, fasta_path)
        fasta.close()
        temp.close()

        driver = diamond.Diamond(
            query_fasta=fasta_path,
            run=terminal.Run(verbose=False),
            progress=terminal.Progress(verbose=False),
        )
        driver.makedb(output_file_path=dmnd_path)

        os.remove(fasta_path)
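The gap-stripping pass above is easy to miss but matters: the .pir sequences carry alignment characters that a protein database builder like DIAMOND presumably would not accept as residue letters. A minimal, self-contained sketch of that clean-up (the function name is ours, not anvi'o's):

def clean_pir_sequence(seq):
    """Strip alignment gaps ('-') and mask '.' placeholders as 'X'."""
    return seq.replace('-', '').replace('.', 'X')

assert clean_pir_sequence('MK-L..AV') == 'MKLXXAV'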
Example No. 16
    def __init__(self,
                 args,
                 run=terminal.Run(),
                 progress=terminal.Progress(),
                 skip_sanity_check=False):
        """Parses arguments and run sanity_check"""

        self.args = args
        self.run = run
        self.progress = progress

        # Parse arguments
        A = lambda x: args.__dict__[x] if x in args.__dict__ else None
        self.annotation_source = A('annotation_source')
        self.window_range = A('ngram_window_range') or "2:3"
        self.is_in_unknowns_mode = A('analyze_unknown_functions')
        self.output_file = A('output_file')
        self.skip_init_functions = A('skip_init_functions')
        self.genome_names_to_focus = A('genome_names')
        self.ngram_source = A("ngram_source")

        self.annotation_source_dict = {}

        self.pan_db_path = A('pan_db')

        if self.annotation_source and self.pan_db_path:
            self.annotation_sources = [self.annotation_source, 'gene_clusters']

        if self.pan_db_path:
            self.pan_db = PanDatabase(self.pan_db_path)

            self.p_meta = self.pan_db.meta

            self.p_meta['creation_date'] = utils.get_time_to_date(
                self.p_meta['creation_date']
            ) if 'creation_date' in self.p_meta else 'unknown'
            self.p_meta['genome_names'] = sorted([
                s.strip()
                for s in self.p_meta['external_genome_names'].split(',') +
                self.p_meta['internal_genome_names'].split(',') if s
            ])
            self.p_meta['num_genomes'] = len(self.p_meta['genome_names'])
            self.genome_names = self.p_meta['genome_names']
            self.gene_clusters_gene_alignments_available = self.p_meta[
                'gene_alignments_computed']
        else:
            self.pan_db = None

        self.genomes_storage_path = A('genomes_storage')

        # confirm genome-storage and pangenome hashes match if a pangenome is provided
        if self.pan_db:
            self.genomes_storage = genomestorage.GenomeStorage(
                self.genomes_storage_path,
                self.p_meta['genomes_storage_hash'],
                genome_names_to_focus=self.p_meta['genome_names'],
                skip_init_functions=self.skip_init_functions,
                run=self.run,
                progress=self.progress)
        else:
            self.genomes_storage = genomestorage.GenomeStorage(
                self.genomes_storage_path,
                skip_init_functions=self.skip_init_functions,
                run=self.run,
                progress=self.progress)

        # list-annotation-resources
        self.list_annotation_sources = A('list_annotation_sources')
        self.gene_function_source_set = self.genomes_storage.db.get_table_as_dataframe(
            'gene_function_calls').source.unique()
        if self.list_annotation_sources:
            self.run.info('Available functional annotation sources',
                          ', '.join(self.gene_function_source_set))
            sys.exit()

        # This houses the ngrams' data
        self.ngram_attributes_list = []

        # Focus on a specific set of genomes
        if self.genome_names_to_focus:
            if filesnpaths.is_file_exists(self.genome_names_to_focus,
                                          dont_raise=True):
                self.genome_names_to_focus = utils.get_column_data_from_TAB_delim_file(
                    self.genome_names_to_focus,
                    column_indices=[0],
                    expected_number_of_fields=1)[0]
            else:
                self.genome_names_to_focus = [
                    g.strip() for g in self.genome_names_to_focus.split(',')
                ]

            self.run.warning(
                "A subset of genome names is found, and anvi'o will focus only on to those."
            )

            self.genomes_storage = genomestorage.GenomeStorage(
                self.genomes_storage_path,
                storage_hash=None,
                genome_names_to_focus=self.genome_names_to_focus)
            self.genomes = self.genomes_storage.get_genomes_dict()

            self.external_genome_names = [
                g for g in self.genomes if self.genomes[g]['external_genome']
            ]
            self.internal_genome_names = [
                g for g in self.genomes
                if not self.genomes[g]['external_genome']
            ]

            self.hash_to_genome_name = {}
            for genome_name in self.genomes:
                self.hash_to_genome_name[self.genomes[genome_name]
                                         ['genome_hash']] = genome_name

        # number of genomes in genome-storage
        if self.genome_names_to_focus:
            self.num_contigs_in_external_genomes_with_genes = len(
                self.genome_names_to_focus)
        else:
            self.num_contigs_in_external_genomes_with_genes = len(
                self.genomes_storage.get_all_genome_names())

        if not skip_sanity_check:
            self.sanity_check()

        # unless we are in debug mode, let's keep things quiet.
        if anvio.DEBUG:
            self.run_object = terminal.Run()
        else:
            self.run_object = terminal.Run(verbose=False)
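A note on the `ngram_window_range` default of "2:3" above: the snippet stores it as a raw string, so somewhere downstream it must be split into integer window sizes. A plausible sketch of that parsing (an assumption on our part; the parsing itself is not shown in this snippet):

window_range = "2:3"  # format used above; the parsing below is our guess
start, end = map(int, window_range.split(':'))
ngram_window_sizes = list(range(start, end + 1))
print(ngram_window_sizes)  # [2, 3]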
Example No. 17
    def __init__(self,
                 args,
                 run=terminal.Run(),
                 progress=terminal.Progress(),
                 progress_title=None):

        self.args = args
        self.run = run
        self.progress = progress

        up_to_date_modeller_exec = "mod9.21"  # default exec to use

        A = lambda x, t: t(args.__dict__[x]) if x in args.__dict__ else None
        null = lambda x: x
        self.scoring_method = A('scoring_method', str)
        self.directory = A('directory', str)
        self.very_fast = A('very_fast', bool)
        self.executable = A('modeller_executable',
                            null) or up_to_date_modeller_exec
        self.num_models = A('num_models', int)
        self.target_fasta_path = A('target_fasta_path', str)
        self.modeller_database = A('modeller_database', str) or "pdb_95"
        self.max_number_templates = A('max_number_templates', null)
        self.percent_identical_cutoff = A('percent_identical_cutoff', null)
        self.deviation = A('deviation', null)

        self.alignment_pap_path = None
        self.alignment_pir_path = None
        self.get_template_path = None
        self.search_results_path = None
        self.target_pir_path = None
        self.template_family_matrix_path = None
        self.template_info_path = None
        self.template_pdbs = None
        self.model_info = None

        self.logs = {}
        self.scripts = {}

        self.sanity_check()

        # as a reward, whoever calls this class will receive self.out when they run self.process()
        self.out = {
            "templates": {
                "pdb_id": [],
                "chain_id": [],
                "ppi": []
            },
            "models": {
                "molpdf": [],
                "GA341_score": [],
                "DOPE_score": [],
                "picked_as_best": []
            },
            "corresponding_gene_call": self.corresponding_gene_call,
            "structure_exists": False,
            "best_model_path": None,
            "best_score": None,
            "scoring_method": self.scoring_method,
            "percent_identical_cutoff": self.percent_identical_cutoff,
            "very_fast": self.very_fast,
            "deviation": self.deviation,
        }

        # All MODELLER databases are housed in self.database_dir
        self.database_dir = J(os.path.dirname(anvio.__file__),
                              'data/misc/MODELLER/db')

        # copy fasta into the working directory
        try:
            shutil.copy2(self.target_fasta_path, self.directory)
            self.target_fasta_path = J(self.directory, self.target_fasta_path)
        except shutil.SameFileError:
            pass

        # store the original directory so we can cd back and forth between
        # self.directory and self.start_dir
        self.start_dir = os.getcwd()

        self.progress_title = progress_title
        if not self.progress_title:
            self.progress_title = "Running MODELLER for gene id {}".format(
                self.corresponding_gene_call)
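The two-argument accessor in this example adds a cast on top of the one-argument idiom from the previous one. A standalone sketch with a throwaway Namespace; note the cast is applied whenever the key exists, so a key explicitly set to None would raise in the `float` case, which is presumably why `null` is used for optional values:

import argparse

args = argparse.Namespace(num_models=5, deviation='4.0')
A = lambda x, t: t(args.__dict__[x]) if x in args.__dict__ else None
null = lambda x: x  # identity cast for values that should pass through untouched

print(A('num_models', int))      # 5
print(A('deviation', float))     # 4.0
print(A('missing_param', null))  # None: absent keys fall back quietly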
Example No. 18
    def __init__(self, args=None, run=terminal.Run(), progress=terminal.Progress()):
        self.init_workflow_super_class(args, workflow_name='metagenomics')

        self.samples_information = {}
        self.kraken_annotation_dict = {}
        self.run_krakenuniq = None
        self.run_metaspades = None
        self.use_scaffold_from_metaspades = None
        self.use_scaffold_from_idba_ud = None
        self.remove_short_reads_based_on_references = None
        self.references_for_removal_txt = None
        self.references_for_removal = {}
        self.references_mode = None
        self.fasta_txt_file = None
        self.samples_txt_file = None
        self.sample_names = None
        self.group_sizes = None
        self.collections_txt = None
        self.collections = None

        # initialize the base class
        ContigsDBWorkflow.__init__(self)

        self.rules.extend(['iu_gen_configs', 'iu_filter_quality_minoche', 'gen_qc_report', 'gzip_fastqs',\
                     'merge_fastqs_for_co_assembly', 'megahit', 'merge_fastas_for_co_assembly',\
                     'bowtie_build', 'bowtie', 'samtools_view', 'anvi_init_bam', 'idba_ud',\
                     'anvi_profile', 'anvi_merge', 'import_percent_of_reads_mapped', 'anvi_cluster_contigs',\
                     'krakenuniq', 'krakenuniq_mpa_report', 'import_krakenuniq_taxonomy', 'metaspades',\
                     'remove_short_reads_based_on_references', 'anvi_summarize', 'anvi_split'])

        self.general_params.extend(['samples_txt', "references_mode", "all_against_all",\
                                    "kraken_txt", "collections_txt"])

        rule_acceptable_params_dict = {}

        # defining the accessible params per rule. NOTE: --threads is a parameter for every rule
        # and is not explicitly provided in what follows
        rule_acceptable_params_dict['iu_gen_configs'] = ["--r1-prefix", "--r2-prefix"]
        rule_acceptable_params_dict['iu_filter_quality_minoche'] = ['run', '--visualize-quality-curves', '--ignore-deflines', '--limit-num-pairs', '--print-qual-scores', '--store-read-fate']
        rule_acceptable_params_dict['gzip_fastqs'] = ["run"]

        # add parameters for modifying binning algorithms
        additional_params_for_anvi_cluster_contigs = [self.get_param_name_for_binning_driver(d) for d in driver_modules['binning'].keys()]
        rule_acceptable_params_dict['anvi_cluster_contigs'] = ["run", "--collection-name", "--driver", "--just-do-it"]
        rule_acceptable_params_dict['anvi_cluster_contigs'].extend(additional_params_for_anvi_cluster_contigs)

        rule_acceptable_params_dict['anvi_summarize'] = ["additional_params", "run"]
        rule_acceptable_params_dict['anvi_split'] = ["additional_params", "run"]
        rule_acceptable_params_dict['metaspades'] = ["run", "additional_params", "use_scaffolds"]
        rule_acceptable_params_dict['megahit'] = ["run", "--min-contig-len", "--min-count", "--k-min",
                                                  "--k-max", "--k-step", "--k-list",
                                                  "--no-mercy", "--no-bubble", "--merge-level",
                                                  "--prune-level", "--prune-depth", "--low-local-ratio",
                                                  "--max-tip-len", "--no-local", "--kmin-1pass",
                                                  "--presets", "--memory", "--mem-flag",
                                                  "--use-gpu", "--gpu-mem", "--keep-tmp-files",
                                                  "--tmp-dir", "--continue", "--verbose"]
        rule_acceptable_params_dict['idba_ud'] = ["run", "--mink", "--maxk", "--step", "--inner_mink",
                                                  "--inner_step", "--prefix", "--min_count",
                                                  "--min_support", "--seed_kmer", "--min_contig",
                                                  "--similar", "--max_mismatch", "--min_pairs",
                                                  "--no_bubble", "--no_local", "--no_coverage",
                                                  "--no_correct", "--pre_correction", "use_scaffolds"]
        rule_acceptable_params_dict['bowtie'] = ["additional_params"]
        rule_acceptable_params_dict['bowtie_build'] = ["additional_params"]
        rule_acceptable_params_dict['samtools_view'] = ["additional_params"]
        rule_acceptable_params_dict['anvi_profile'] = ["--overwrite-output-destinations", "--sample-name", "--report-variability-full",
                                                        "--skip-SNV-profiling", "--profile-SCVs", "--description",
                                                        "--skip-hierarchical-clustering", "--distance", "--linkage", "--min-contig-length",
                                                        "--min-mean-coverage", "--min-coverage-for-variability", "--cluster-contigs",
                                                        "--contigs-of-interest", "--queue-size", "--write-buffer-size-per-thread", "--max-contig-length"]
        rule_acceptable_params_dict['merge_fastas_for_co_assembly'] = []
        rule_acceptable_params_dict['merge_fastqs_for_co_assembly'] = []
        rule_acceptable_params_dict['anvi_merge'] = ["--sample-name", "--description", "--skip-hierarchical-clustering",
                                                     "--enforce-hierarchical-clustering", "--distance", "--linkage",
                                                     "--overwrite-output-destinations"]
        rule_acceptable_params_dict['import_percent_of_reads_mapped'] = ["run"]
        rule_acceptable_params_dict['krakenuniq'] = ["additional_params", "run", "--db", "--gzip-compressed"]
        rule_acceptable_params_dict['import_krakenuniq_taxonomy'] = ["--min-abundance"]
        rule_acceptable_params_dict['remove_short_reads_based_on_references'] = ["dont_remove_just_map", \
                                                                                 "references_for_removal_txt", \
                                                                                 "delimiter-for-iu-remove-ids-from-fastq"]

        self.rule_acceptable_params_dict.update(rule_acceptable_params_dict)

        forbidden_params = {}
        forbidden_params['krakenuniq'] = ['--fastq-input', '--paired', '--output']

        self.forbidden_params.update(forbidden_params)

        self.dirs_dict.update({"QC_DIR": "01_QC",
                               "FASTA_DIR": "02_FASTA",
                               "CONTIGS_DIR": "03_CONTIGS",
                               "MAPPING_DIR": "04_MAPPING",
                               "PROFILE_DIR": "05_ANVIO_PROFILE",
                               "MERGE_DIR": "06_MERGED",
                               "TAXONOMY_DIR": "07_TAXONOMY",
                               "SUMMARY_DIR": "08_SUMMARY",
                               "SPLIT_PROFILES_DIR": "09_SPLIT_PROFILES"})

        self.default_config.update({'samples_txt': "samples.txt",
                                    'metaspades': {"additional_params": "--only-assembler", "threads": 7},
                                    'megahit': {"--min-contig-len": min_contig_length_for_assembly, "--memory": 0.4, "threads": 7},
                                    'idba_ud': {"--min_contig": min_contig_length_for_assembly, "threads": 7},
                                    'iu_filter_quality_minoche': {"run": True, "--ignore-deflines": True},
                                    "gzip_fastqs": {"run": True},
                                    "bowtie": {"additional_params": "--no-unal", "threads": 3},
                                    "samtools_view": {"additional_params": "-F 4"},
                                    "anvi_profile": {"threads": 3, "--sample-name": "{sample}", "--overwrite-output-destinations": True},
                                    "anvi_merge": {"--sample-name": "{group}", "--overwrite-output-destinations": True},
                                    "import_percent_of_reads_mapped": {"run": True},
                                    "krakenuniq": {"threads": 3, "--gzip-compressed": True, "additional_params": ""},
                                    "remove_short_reads_based_on_references": {"delimiter-for-iu-remove-ids-from-fastq": " "},
                                    "anvi_cluster_contigs": {"--collection-name": "{driver}"}})
Example No. 19
    def __init__(self, args, run=terminal.Run(), progress=terminal.Progress()):
        self.run = run
        self.progress = progress

        A = lambda x: (args.__dict__[x] if x in args.__dict__ else None) if args else None

        if self.mode == 'train':
            self.genomes_dir = os.path.abspath(A('genomes_dir'))
            self.classifier_output_path = os.path.abspath(A('output'))

            if A('classifier'):
                raise ConfigError("You should not initialize the domain training class with a input classifier path (`args.classifier`).")

            if not self.genomes_dir:
                raise ConfigError("You must provide a genomes directory. Please read the help menu if you are not sure\
                                   what the contents of this directory should look like.")

            filesnpaths.is_output_file_writable(self.classifier_output_path)
            filesnpaths.is_file_exists(self.genomes_dir)

        elif self.mode == 'predict':
            if A('output'):
                raise ConfigError("You should not initialize the domain prediction class with an output classifier path (`args.output`).")

            default_classifier_path = 'misc/SCGDOMAINCLASSIFIER.rf'
            self.input_classifier_path = A('classifier') or os.path.join(os.path.dirname(anvio.data.__file__), default_classifier_path)

            if A('classifier'):
                filesnpaths.is_file_exists(self.input_classifier_path)
            else:
                if not filesnpaths.is_file_exists(self.input_classifier_path, dont_raise=True):
                    raise ConfigError("Somehow, this anvi'o installation dose not seem to have a SCG domain classifier. This is one of\
                                       those anvi'o things that should never happen. If you are an anvi'o user, please feel free to panic :(\
                                       If you are an anvi'o developer, what you need to do is to follow the instructions in \
                                       `anvi-script-gen-scg-domain-classifier` with a reasonable set of genomes and store the resulting\
                                       classifier at the default anvi'o path of /blah/blah/anvio/data/%s." % (default_classifier_path))

            self.rf = RF(self.input_classifier_path, r=self.run, p=self.progress)
            self.rf.initialize_classifier()

        else:
            raise ConfigError("Someone initialized the SCG domain classifier class without an explicit mode :(")

        self.SCG_sources = [d for d in hmm_data.sources if hmm_data.sources[d]['kind'] == 'singlecopy']
        self.SCG_domains = sorted([hmm_data.sources[source]['domain'] for source in self.SCG_sources])
        self.SCG_domain_to_source = dict([(hmm_data.sources[source]['domain'], source) for source in self.SCG_sources])

        if not len(self.SCG_sources):
            raise ConfigError("There is something wrong :( There is not even a single SCG source found. Usually\
                               anvi'o comes with multiple of them :/")

        if len(self.SCG_sources) == 1:
            raise ConfigError("There is only a single SCG source in your anvi'o installation. It is OK if you are\
                               being a hacker and playing with things, but there is no logic behind creating a\
                               classifier with a single class.")

        if len(self.SCG_domains) != len(set(self.SCG_domains)):
            raise ConfigError("Something is wrong. For each domain, there must be a single sinlge-copy core gene\
                               source.")

        self.data, self.labels, self.features = [], [], []

        for domain in self.SCG_domains:
            self.features.extend(sorted(hmm_data.sources[self.SCG_domain_to_source[domain]]['genes']))

        self.run.info('SCG domain classifier mode', self.mode)
        self.run.info("SCG domains found", ', '.join(self.SCG_domains))
        self.run.info("Num features", len(self.features))
Example No. 20
    def __init__(self, db_path, client_version, new_database=False, ignore_version=False, run=terminal.Run(), progress=terminal.Progress()):
        self.db_path = db_path
        self.version = None

        self.run = run
        self.progress = progress

        if new_database:
            filesnpaths.is_output_file_writable(db_path)
        else:
            filesnpaths.is_file_exists(db_path)

        if new_database and os.path.exists(self.db_path):
            os.remove(self.db_path)

        self.check_if_db_writable()

        self.conn = sqlite3.connect(self.db_path)
        self.conn.text_factory = str

        self.cursor = self.conn.cursor()

        if new_database:
            self.create_self()
            self.set_version(client_version)
        else:
            self.version = self.get_version()
            if str(self.version) != str(client_version) and not ignore_version:
                if int(self.version) > int(client_version):
                    raise ConfigError("Bad news of the day: the database at %s was generated with an anvi'o version that is 'newer' than\
                                       the one you are actively using right now. We know, you hate to hear this, but you need to upgrade\
                                       your anvi'o :(" % self.db_path)
                else:
                    raise ConfigError("The database at '%s' is outdated (its version is v%s, but your anvi'o installation only knows how to\
                                       deal with v%s). You can migrate your database without losing any data using the program `anvi-migrate`."\
                                               % (self.db_path, self.version, client_version))
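The version gate in this example distinguishes two failure directions, and the distinction matters: a database newer than the client cannot be handled at all, while an older one can be migrated. A compact, standalone sketch of just that decision logic:

def version_status(db_version, client_version):
    # mirrors the comparison above: newer DBs need a newer anvi'o,
    # older DBs can be migrated with `anvi-migrate`
    if int(db_version) == int(client_version):
        return "ok"
    elif int(db_version) > int(client_version):
        return "database is newer than this anvi'o installation: upgrade anvi'o"
    else:
        return "database is outdated: run `anvi-migrate`"

print(version_status(13, 14))  # database is outdated: run `anvi-migrate`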
Example No. 21
from anvio.drivers.mcl import MCL
from anvio.drivers import Aligners

from anvio.errors import ConfigError, FilesNPathsError
from anvio.genomestorage import GenomeStorage

__author__ = "A. Murat Eren"
__copyright__ = "Copyright 2017, The anvio Project"
__credits__ = []
__license__ = "GPL 3.0"
__version__ = anvio.__version__
__maintainer__ = "A. Murat Eren"
__email__ = "*****@*****.**"

run = terminal.Run()
progress = terminal.Progress()
pp = terminal.pretty_print
aligners = Aligners()


class Pangenome(GenomeStorage):
    def __init__(self, args=None, run=run, progress=progress):
        GenomeStorage.__init__(self, args, run, progress)
        self.init_genomes_data_storage()

        self.args = args
        self.run = run
        self.progress = progress

        self.max_num_PCs_for_hierarchical_clustering = constants.max_num_items_for_hierarchical_clustering
Example No. 22
    def populate_search_tables(self, contigs_db_path):
        utils.is_contigs_db(contigs_db_path)
        filesnpaths.is_output_file_writable(contigs_db_path, ok_if_exists=True)

        contig_sequences_fasta_path = os.path.join(self.tmp_directory_path,
                                                   'contig_sequences.fa')

        utils.export_sequences_from_contigs_db(contigs_db_path,
                                               contig_sequences_fasta_path)

        search_results_dict = self.run_trnascan_on_FASTA(
            fasta_file_path=contig_sequences_fasta_path)

        # At this point we need to turn this search_results_dict into one that matches how it is used
        # in HMM operations. Here is an entry from tRNA results dict:
        #
        # {1: {'contig': 'Bfragilis_0100_000000000001',
        #      'trna_no': '1',
        #      'start': 135361,
        #      'stop': 135433,
        #      'amino_acid': 'Thr',
        #      'anticodon': 'CGT',
        #      'score': 67.6}}
        #
        # and here is one example from the rRNA HMMs results dict:
        #
        # {1: {'entry_id': 0,
        #      'gene_name': 'Bacterial_23S_rRNA',
        #      'gene_hmm_id': '-',
        #      'contig_name': 'Bfragilis_0100_000000000001',
        #      'start': 1110877,
        #      'stop': 1113757,
        #      'e_value': 0.0}}
        #
        # so we will have to make the former look like the latter. I have the feeling that the
        # score / e_value will cause issues later :(

        missing_amino_acids = Counter()
        missing_codons = Counter()
        entries_to_remove = set([])
        for entry_id in search_results_dict:
            entry = search_results_dict[entry_id]

            aa, codon = entry['amino_acid'], utils.rev_comp(entry['anticodon'])

            if codon not in self.codons:
                missing_codons[codon] += 1
                entries_to_remove.add(entry_id)
                continue

            if aa not in self.amino_acids:
                missing_amino_acids[aa] += 1
                entries_to_remove.add(entry_id)
                continue

            aa_codon = '%s_%s' % (aa, codon)

            entry['gene_name'] = aa_codon
            entry['e_value'] = entry['score']
            entry['gene_hmm_id'] = '-'
            if entry['stop'] > entry['start']:
                # forward strand: set the pythonic start
                entry['start'] = entry['start'] - 1
            else:
                # reverse strand
                entry['stop'] = entry['stop'] - 1

        for entry_id in entries_to_remove:
            search_results_dict.pop(entry_id)

        self.run.info("Num tRNA genes recovered", len(search_results_dict))

        if len(missing_codons):
            self.run.warning(
                "While anvi'o was trying to parse the output from tRNAScan-SE, it "
                "became clear that some of the codons the tool identified was not "
                "known to anvi'o, so we conservatively discareded those entries. "
                "Here is the list of codons that were discareded and their frequency "
                "among your contigs: '%s'." % (', '.join([
                    '%s (%d)' % (codon, missing_codons[codon])
                    for codon in missing_codons
                ])),
                header="WEIRD CODONS ALERT")

        if len(missing_amino_acids):
            self.run.warning(
                "While anvi'o was trying to parse the output from tRNAScan-SE, it "
                "run into some amino acid names that were not known to anvi'o. "
                "All those entries are now gone :/ But here is the list of amino "
                "acids and their frequencies: '%s'." % (', '.join([
                    '%s (%d)' % (amino_acid, missing_amino_acids[amino_acid])
                    for amino_acid in missing_amino_acids
                ])),
                header="WEIRD AMINO ACIDS ALERT")

        search_results_dict = utils.get_pruned_HMM_hits_dict(
            search_results_dict)

        tables_for_hmm_hits = TablesForHMMHits(contigs_db_path,
                                               run=self.run,
                                               progress=self.progress)
        search_results_dict = tables_for_hmm_hits.add_new_gene_calls_to_contigs_db_and_update_serach_results_dict(
            self.kind_of_search,
            search_results_dict,
            skip_amino_acid_sequences=True)
        tables_for_hmm_hits.append(self.source_name, self.reference,
                                   self.kind_of_search, self.domain,
                                   self.all_genes_searched_against,
                                   search_results_dict)

        # when the code comes all the way here, the entries in the search results dict already look like
        # this, so we have a gene callers id for the newly generated genes for tRNAs. we will use it
        # to populate a functions dict and submit it to the contigs database as well:
        #
        #     {'contig_name': 'Bfragilis_0100_000000000001',
        #      'trna_no': '1',
        #      'start': 135361,
        #      'stop': 135433,
        #      'amino_acid': 'Thr',
        #      'anticodon': 'CGT',
        #      'score': 67.6,
        #      'gene_name': 'Thr_ACG',
        #      'e_value': 67.6,
        #      'gene_hmm_id': '-',
        #      'gene_callers_id': 4502}
        #
        functions_dict = {}
        for entry_id in search_results_dict:
            entry = search_results_dict[entry_id]

            # `codon` must be recomputed for each entry here; otherwise the stale
            # value left over from the earlier filtering loop would be reused silently
            codon = utils.rev_comp(entry['anticodon'])
            function_text = 'tRNA gene for amino acid %s (codon: %s; anticodon:%s; score:%.1f; intron_start:%d; intron_end:%d)' \
                                            % (entry['amino_acid'], codon, entry['anticodon'], \
                                               entry['score'], entry['intron_start'], entry['intron_end'])

            functions_dict[entry_id] = {
                'gene_callers_id': entry['gene_callers_id'],
                'source': self.source_name,
                'accession': '%s_%d' % (entry['gene_name'], entry['gene_callers_id']),
                'function': function_text,
                'e_value': 0.0
            }

        gene_function_calls_table = TableForGeneFunctions(
            contigs_db_path,
            run=terminal.Run(verbose=False),
            progress=terminal.Progress(verbose=False))
        gene_function_calls_table.create(functions_dict)

        if not anvio.DEBUG:
            self.clean_tmp_directory()
            self.run.info_single(
                "Temp directory is now cleaned (if you would like to keep it the "
                "next time use the flag `--debug`).",
                nl_before=1)
        else:
            self.run.info_single(
                "Due to the `--debug` flag, anvi'o did not remove the temoporary files "
                "directory (which is still at '%s')." %
                (self.tmp_directory_path),
                nl_before=1)