Example #1
    def __init__(self,
                 args,
                 run=terminal.Run(),
                 progress=terminal.Progress(),
                 skip_sanity_check=False):
        """Parses arguments and run sanity_check"""

        self.args = args
        self.run = run
        self.progress = progress

        # Parse arguments
        A = lambda x: args.__dict__[x] if x in args.__dict__ else None
        self.annotation_source = A('annotation_source')
        self.window_range = A('ngram_window_range') or "2:3"
        self.is_in_unknowns_mode = A('analyze_unknown_functions')
        self.output_file = A('output_file')
        self.skip_init_functions = A('skip_init_functions')
        self.genome_names_to_focus = A('genome_names')
        self.ngram_source = A("ngram_source")

        self.annotation_source_dict = {}

        self.pan_db_path = A('pan_db')

        if self.annotation_source and self.pan_db_path:
            self.annotation_sources = [self.annotation_source, 'gene_clusters']

        if self.pan_db_path:
            self.pan_db = PanDatabase(self.pan_db_path)

            self.p_meta = self.pan_db.meta

            self.p_meta['creation_date'] = utils.get_time_to_date(
                self.p_meta['creation_date']
            ) if 'creation_date' in self.p_meta else 'unknown'
            self.p_meta['genome_names'] = sorted([
                s.strip()
                for s in self.p_meta['external_genome_names'].split(',') +
                self.p_meta['internal_genome_names'].split(',') if s
            ])
            self.p_meta['num_genomes'] = len(self.p_meta['genome_names'])
            self.genome_names = self.p_meta['genome_names']
            self.gene_clusters_gene_alignments_available = self.p_meta[
                'gene_alignments_computed']
        else:
            self.pan_db = None

        self.genomes_storage_path = A('genomes_storage')

        # confirm genome-storage and pangenome hashes match if a pangenome is provided
        if self.pan_db:
            self.genomes_storage = genomestorage.GenomeStorage(
                self.genomes_storage_path,
                self.p_meta['genomes_storage_hash'],
                genome_names_to_focus=self.p_meta['genome_names'],
                skip_init_functions=self.skip_init_functions,
                run=self.run,
                progress=self.progress)
        else:
            self.genomes_storage = genomestorage.GenomeStorage(
                self.genomes_storage_path,
                skip_init_functions=self.skip_init_functions,
                run=self.run,
                progress=self.progress)

        # list-annotation-resources
        self.list_annotation_sources = A('list_annotation_sources')
        self.gene_function_source_set = self.genomes_storage.db.get_table_as_dataframe(
            'gene_function_calls').source.unique()
        if self.list_annotation_sources:
            self.run.info('Available functional annotation sources',
                          ', '.join(self.gene_function_source_set))
            sys.exit()

        # This houses the ngrams' data
        self.ngram_attributes_list = []

        # Focus on a specific set of genomes
        if self.genome_names_to_focus:
            if filesnpaths.is_file_exists(self.genome_names_to_focus,
                                          dont_raise=True):
                self.genome_names_to_focus = utils.get_column_data_from_TAB_delim_file(
                    self.genome_names_to_focus,
                    column_indices=[0],
                    expected_number_of_fields=1)[0]
            else:
                self.genome_names_to_focus = [
                    g.strip() for g in self.genome_names_to_focus.split(',')
                ]

            self.run.warning(
                "A subset of genome names is found, and anvi'o will focus only on those."
            )

            self.genomes_storage = genomestorage.GenomeStorage(
                self.genomes_storage_path,
                storage_hash=None,
                genome_names_to_focus=self.genome_names_to_focus)
            self.genomes = self.genomes_storage.get_genomes_dict()

            self.external_genome_names = [
                g for g in self.genomes if self.genomes[g]['external_genome']
            ]
            self.internal_genome_names = [
                g for g in self.genomes
                if not self.genomes[g]['external_genome']
            ]

            self.hash_to_genome_name = {}
            for genome_name in self.genomes:
                self.hash_to_genome_name[self.genomes[genome_name]
                                         ['genome_hash']] = genome_name


        # number of genomes in genome-storage
        if self.genome_names_to_focus:
            self.num_contigs_in_external_genomes_with_genes = len(
                self.genome_names_to_focus)
        else:
            self.num_contigs_in_external_genomes_with_genes = len(
                self.genomes_storage.get_all_genome_names())

        if not skip_sanity_check:
            self.sanity_check()

        # unless we are in debug mode, let's keep things quiet.
        if anvio.DEBUG:
            self.run_object = terminal.Run()
        else:
            self.run_object = terminal.Run(verbose=False)
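
The `A` lambda at the top of this constructor is the recurring anvi'o idiom for reading optional attributes off an argparse namespace without raising `AttributeError`. A minimal, self-contained sketch of the pattern (the class and attribute names below are illustrative, not anvi'o's):

import argparse

class ArgsConsumer:
    def __init__(self, args):
        # absent attributes fall back to None instead of raising an error
        A = lambda x: args.__dict__[x] if x in args.__dict__ else None
        self.window_range = A('ngram_window_range') or "2:3"  # default when absent
        self.output_file = A('output_file')

args = argparse.Namespace(output_file='ngrams.txt')  # no ngram_window_range set
consumer = ArgsConsumer(args)
print(consumer.window_range, consumer.output_file)  # -> 2:3 ngrams.txt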
Example #2
    def __init__(self, args, r=terminal.Run(), p=terminal.Progress()):
        self.args = args
        self.run = r
        self.progress = p

        configs.PairedEndReadsConfiguration.__init__(self, args)
Example #3
    def __init__(self,
                 args=None,
                 run=terminal.Run(),
                 progress=terminal.Progress()):
        self.init_workflow_super_class(args, workflow_name='contigs')

        self.group_names = []
        self.contigs_information = {}
        self.fasta_txt_file = None
        self.fasta_information = {}
        # we have external_genomes_file defined here for the sake of pangenomics and phylogenomics workflows
        self.external_genomes_file = ''
        # we have references_mode defined here for the sake of the metagenomics workflow (it is only used when this workflow is inherited)
        self.references_mode = None
        self.import_external_functions_flags = []

        self.rules.extend([
            'gen_external_genome_file', 'anvi_script_reformat_fasta',
            'anvi_gen_contigs_database', 'export_gene_calls_for_centrifuge',
            'centrifuge', 'anvi_import_taxonomy_for_genes',
            'anvi_run_scg_taxonomy', 'anvi_run_trna_scan', 'anvi_run_hmms',
            'anvi_run_ncbi_cogs', 'annotate_contigs_database',
            'anvi_get_sequences_for_gene_calls', 'emapper',
            'anvi_script_run_eggnog_mapper', 'gunzip_fasta',
            'reformat_external_gene_calls_table',
            'reformat_external_functions', 'import_external_functions',
            'anvi_run_pfams', 'anvi_run_kegg_kofams'
        ])

        self.general_params.extend(["fasta_txt"])

        self.dirs_dict.update({
            "FASTA_DIR": "01_FASTA",
            "CONTIGS_DIR": "02_CONTIGS"
        })

        self.default_config.update({
            "fasta_txt": "fasta.txt",
            "anvi_gen_contigs_database": {
                "--project-name": "{group}"
            },
            "centrifuge": {
                "threads": 2
            },
            "anvi_run_hmms": {
                "run": True,
                "threads": 5,
                "--also-scan-trnas": True
            },
            "anvi_run_kegg_kofams": {
                "run": True,
                "threads": 4
            },
            "anvi_run_ncbi_cogs": {
                "run": True,
                "threads": 5
            },
            "anvi_run_scg_taxonomy": {
                "run": True,
                "threads": 6
            },
            "anvi_run_trna_scan": {
                "run": False,
                "threads": 6
            },
            "anvi_script_reformat_fasta": {
                "run": True,
                "--prefix": "{group}",
                "--simplify-names": True
            },
            "emapper": {
                "--database": "bact",
                "--usemem": True,
                "--override": True
            },
            "anvi_script_run_eggnog_mapper": {
                "--use-version": "0.12.6"
            }
        })

        self.rule_acceptable_params_dict['anvi_run_ncbi_cogs'] = [
            'run', '--cog-data-dir', '--sensitive', '--temporary-dir-path',
            '--search-with'
        ]

        self.rule_acceptable_params_dict['anvi_run_scg_taxonomy'] = [
            'run', '--scgs-taxonomy-data-dir'
        ]

        self.rule_acceptable_params_dict['anvi_run_trna_scan'] = [
            'run', '--trna-cutoff-score'
        ]

        self.rule_acceptable_params_dict['anvi_run_hmms'] = [
            'run', '--installed-hmm-profile', '--hmm-profile-dir',
            '--also-scan-trnas'
        ]

        self.rule_acceptable_params_dict['anvi_run_pfams'] = [
            'run', '--pfam-data-dir'
        ]

        self.rule_acceptable_params_dict['anvi_run_kegg_kofams'] = [
            'run', '--kegg-data-dir', '--hmmer-program', '--keep-all-hits',
            '--log-bitscores', '--just-do-it'
        ]

        self.rule_acceptable_params_dict['centrifuge'] = ['run', 'db']

        self.rule_acceptable_params_dict['emapper'] = [
            '--database', '--usemem', '--override', 'path_to_emapper_dir'
        ]

        self.rule_acceptable_params_dict['anvi_script_run_eggnog_mapper'] = [
            'run', '--cog-data-dir', '--drop-previous-annotations',
            '--use-version'
        ]

        self.rule_acceptable_params_dict['anvi_script_reformat_fasta'] = [
            'run', '--keep-ids', '--exclude-ids', '--min-len', '--prefix',
            '--simplify-names', '--seq-type'
        ]


        gen_contigs_params = [
            '--description', '--skip-gene-calling', '--ignore-internal-stop-codons',
            '--skip-mindful-splitting', '--contigs-fasta', '--project-name',
            '--split-length', '--kmer-size', '--skip-predict-frame',
            '--prodigal-translation-table'
        ]

        self.rule_acceptable_params_dict['anvi_gen_contigs_database'] = gen_contigs_params
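
A sketch of how `rule_acceptable_params_dict` presumably gets used downstream: parameters a user puts in the workflow config file are checked against the whitelist for each rule. The validation below is illustrative, not the actual `WorkflowSuperClass` logic:

# hypothetical whitelist check over a user-supplied workflow config
rule_acceptable_params_dict = {'anvi_run_pfams': ['run', '--pfam-data-dir']}

user_config = {
    'anvi_run_pfams': {'run': True, '--pfam-data-dir': '/data/pfams', '--evalue': 1e-5}
}

for rule, params in user_config.items():
    acceptable = set(rule_acceptable_params_dict.get(rule, [])) | {'threads'}
    for param in params:
        if param not in acceptable:
            print(f"config error: '{param}' is not an acceptable parameter for rule '{rule}'")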
Example #4
    def do_profile_db(self):
        # are we working with a merged profile database?
        merged = self.summary.p_meta['merged']
        self.run.info('Merged database', 'True' if merged else 'False')

        self.progress.new('Splitting "%s"' % self.bin_id)
        self.progress.update('Subsetting the %s profile database' %
                             ('merged' if merged else 'single'))

        bin_profile_db = dbops.ProfileDatabase(self.bin_profile_db_path)
        bin_profile_db.touch()

        # copy-paste tables that will largely stay the same from the parent
        bin_profile_db.db.copy_paste(table_name='self',
                                     source_db_path=self.profile_db_path)
        bin_profile_db.db.copy_paste(table_name='views',
                                     source_db_path=self.profile_db_path)
        bin_profile_db.db.copy_paste(table_name='states',
                                     source_db_path=self.profile_db_path)

        # update some values
        bin_profile_db.db.update_meta_value('contigs_db_hash',
                                            self.contigs_db_hash)
        bin_profile_db.db.update_meta_value('available_clusterings', None)
        bin_profile_db.db.update_meta_value('sample_id', self.bin_id)

        # set up the filtering rules for migrating data:
        tables = {}

        # this is to deal with the merged atomic data tables that are stored in merged profiles.
        # they are being created on the fly during merge, so bin_profile_db.touch() did not
        # create them, and we have to do it here ourselves. while creating them in the target
        # db, we will also populate the tables dictionary for data migration:
        sample_names = self.summary.p_meta['samples']
        if merged:
            for table_name in t.atomic_data_table_structure[1:-1]:
                for target in ['splits', 'contigs']:
                    new_table_name = '_'.join([table_name, target])
                    new_table_structure = ['contig'] + sample_names + ['__parent__']
                    new_table_types = ['text'] + ['numeric'] * len(sample_names) + ['text']
                    bin_profile_db.db.create_table(new_table_name,
                                                   new_table_structure,
                                                   new_table_types)

                    tables[new_table_name] = ('contig', self.split_names)
        else:
            profile_db = dbops.ProfileDatabase(self.profile_db_path)
            table_structure = profile_db.db.get_table_structure(
                'atomic_data_contigs')
            table_types = profile_db.db.get_table_column_types(
                'atomic_data_contigs')
            for table_name in ['atomic_data_splits', 'atomic_data_contigs']:
                bin_profile_db.db.create_table(table_name, table_structure,
                                               table_types)

                tables[table_name] = ('contig', self.split_names)

        # we need to migrate these guys, too. unless we don't need to... if we are migrating,
        # the values in the self table are already accurate. if we are skipping, regardless
        # of what the values were, we will set the absolutely correct ones.
        if self.skip_variability_tables:
            bin_profile_db.db.update_meta_value('SNVs_profiled', False)
            bin_profile_db.db.update_meta_value('SCVs_profiled', False)
        else:
            tables[t.variable_nts_table_name] = ('split_name',
                                                 self.split_names)
            tables[t.variable_codons_table_name] = ('corresponding_gene_call',
                                                    self.gene_caller_ids)

        bin_profile_db.disconnect()

        self.migrate_data(tables, self.profile_db_path,
                          self.bin_profile_db_path)

        self.progress.end()

        if not self.skip_hierarchical_clustering:
            dbops.do_hierarchical_clustering_of_items(self.bin_profile_db_path,
                                                      constants.clustering_configs['merged' if merged else 'single'],
                                                      self.split_names,
                                                      self.database_paths,
                                                      input_directory=self.bin_output_directory,
                                                      default_clustering_config=constants.merged_default,
                                                      distance=self.distance,
                                                      linkage=self.linkage,
                                                      run=terminal.Run(verbose=False),
                                                      progress=self.progress)

        # add a collection
        collection_dict = {'ALL_SPLITS': self.split_names}
        bins_info_dict = {
            'ALL_SPLITS': {
                'html_color': '#FF0000',
                'source': 'anvi-split'
            }
        }
        collections = TablesForCollections(self.bin_profile_db_path)
        collections.append('DEFAULT',
                           collection_dict,
                           bins_info_dict=bins_info_dict)
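
`copy_paste` presumably duplicates a table wholesale between two SQLite databases. A standalone sketch of that operation with the standard `sqlite3` module (the function and table names here are made up, not anvi'o's API):

import sqlite3

def copy_table(source_db_path, target_db_path, table_name):
    """Copy a table's schema and rows from one SQLite db to another (illustrative)."""
    src = sqlite3.connect(source_db_path)
    dst = sqlite3.connect(target_db_path)
    # recover the original CREATE TABLE statement from the source
    create_sql = src.execute(
        "SELECT sql FROM sqlite_master WHERE type='table' AND name=?", (table_name,)
    ).fetchone()[0]
    dst.execute(create_sql)
    rows = src.execute(f"SELECT * FROM {table_name}").fetchall()
    if rows:
        placeholders = ','.join('?' * len(rows[0]))
        dst.executemany(f"INSERT INTO {table_name} VALUES ({placeholders})", rows)
    dst.commit()
    src.close()
    dst.close()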
Example #5
    def __init__(self, args):
        self.args = args

        A = lambda x: args.__dict__[x] if x in args.__dict__ else None
        self.input_file_path = A('input_file')
        self.contigs_db_path = A('contigs_db')
        self.serialized_profile_path = A('serialized_profile')
        self.output_directory = A('output_dir')
        self.list_contigs_and_exit = A('list_contigs')
        self.min_contig_length = A('min_contig_length')
        self.min_mean_coverage = A('min_mean_coverage')
        self.min_coverage_for_variability = A('min_coverage_for_variability')
        self.contigs_shall_be_clustered = A('cluster_contigs')
        self.skip_hierarchical_clustering = A('skip_hierarchical_clustering')
        self.sample_id = A('sample_name')
        self.report_variability_full = A('report_variability_full')
        self.overwrite_output_destinations = A('overwrite_output_destinations')
        self.skip_SNV_profiling = A('skip_SNV_profiling')
        self.profile_SCVs = A('profile_SCVs')
        self.gen_serialized_profile = A('gen_serialized_profile')
        self.distance = A('distance') or constants.distance_metric_default
        self.linkage = A('linkage') or constants.linkage_method_default
        self.num_threads = int(A('num_threads'))
        self.queue_size = int(A('queue_size'))
        self.write_buffer_size = int(A('write_buffer_size'))
        self.total_length_of_all_contigs = 0
        self.total_coverage_values_for_all_contigs = 0
        self.description_file_path = A('description')

        # make sure early on that both the distance and the linkage are OK.
        clustering.is_distance_and_linkage_compatible(self.distance, self.linkage)

        # whether the profile database is a blank one (without any BAM files or reads):
        self.blank = A('blank_profile')

        if not self.blank and self.contigs_shall_be_clustered and self.skip_hierarchical_clustering:
            raise ConfigError("You are confused, and confusing anvi'o, too. You can't as hierarchical clustering\
                               to be performed with one flag, and try to skip it with another one :(")

        if self.blank and self.contigs_shall_be_clustered and self.skip_hierarchical_clustering:
            raise ConfigError("So you want to generate a blank profile, and you both want hierarchical clustering\
                               of your contigs to be performed, and skipped. No.")

        if self.blank and self.contigs_shall_be_clustered:
            raise ConfigError("When the blank profile is asked to be generated, there is no need to ask for the\
                               hierarchical clustering of contigs. It is going to be done by default. If it is\
                               not changing anything, why is anvi'o upset with you? Because. Let's not use flags\
                               we don't need.")

        if self.blank and not self.skip_hierarchical_clustering:
            self.contigs_shall_be_clustered = True

        if args.contigs_of_interest:
            filesnpaths.is_file_exists(args.contigs_of_interest)
            self.contig_names_of_interest = set([c.strip() for c in open(args.contigs_of_interest).readlines()
                                                 if c.strip() and not c.startswith('#')])
        else:
            self.contig_names_of_interest = None

        self.progress = terminal.Progress()
        self.run = terminal.Run(width=35)

        if self.list_contigs_and_exit:
            self.list_contigs()
            sys.exit()

        if not self.contigs_db_path:
            raise ConfigError("No contigs database, no profilin'. Bye.")

        # Initialize contigs db
        dbops.ContigsSuperclass.__init__(self, self.args, r=self.run, p=self.progress)
        self.init_contig_sequences()
        self.contig_names_in_contigs_db = set(self.contigs_basic_info.keys())

        self.bam = None
        self.contigs = []

        self.database_paths = {'CONTIGS.db': os.path.abspath(self.contigs_db_path)}

        self.profile_db_path = None

        self.clustering_configs = constants.clustering_configs['blank' if self.blank else 'single']

        # following variable will be populated during the profiling, and its content will eventually
        # be stored in t.variable_nts_table_name
        self.variable_nts_table_entries = []

        # if genes are not called, yet the user is asking for codon frequencies to be profiled, we give
        # a warning and force-turn that flag off.
        if (not self.a_meta['genes_are_called']) and self.profile_SCVs:
            self.run.warning("You asked the codon frequencies to be profiled, but genes were not called\
                              for your contigs database. Anvi'o is assigning `False` to the profile-codon-frequncies\
                              flag, overruling your request like a boss.")
            self.profile_SCVs = False

        # following variable will be populated while the variable positions table is computed
        self.codons_in_genes_to_profile_SCVs = set([])

        # we don't know what we are about
        self.description = None

        # additional layer data will be filled later
        self.layer_additional_keys = []
        self.layer_additional_data = {}
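
The three `ConfigError` guards above implement mutually exclusive flags by hand; the simplest of them maps directly onto argparse. A minimal sketch using the same flag names (everything else is illustrative):

import argparse

parser = argparse.ArgumentParser()
group = parser.add_mutually_exclusive_group()
group.add_argument('--cluster-contigs', action='store_true')
group.add_argument('--skip-hierarchical-clustering', action='store_true')
parser.add_argument('--blank-profile', action='store_true')

# argparse rejects the first conflict on its own; the blank-profile rule
# (clustering is implied unless explicitly skipped) still needs a manual step:
args = parser.parse_args(['--blank-profile'])
if args.blank_profile and not args.skip_hierarchical_clustering:
    args.cluster_contigs = True  # mirrors the behavior in the code above
print(args.cluster_contigs)  # -> True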
Example #6
    def __init__(self,
                 args=None,
                 run=terminal.Run(),
                 progress=terminal.Progress()):
        self.init_workflow_super_class(args, workflow_name='trnaseq')

        self.rules.extend([
            'iu_merge_pairs', 'anvi_reformat_fasta', 'anvi_trnaseq',
            'anvi_merge_trnaseq', 'anvi_run_trna_taxonomy',
            'anvi_tabulate_trnaseq'
        ])

        # "General" section of the workflow config file.
        self.general_params.extend(['samples_txt'])

        # Parameters for each rule that are accessible in the config file.
        rule_acceptable_params_dict = {}
        rule_acceptable_params_dict['iu_merge_pairs'] = [
            'run', '--gzip-output', '--marker-gene-stringent',
            '--max-num-mismatches', '--report-r1-prefix', '--report-r2-prefix'
        ]
        rule_acceptable_params_dict['anvi_reformat_fasta'] = [
            'run', '--gzip-output', '--simplify-names'
        ]
        rule_acceptable_params_dict['anvi_trnaseq'] = [
            'run', '--treatment', '--overwrite-output-destinations',
            '--description', '--write-checkpoints', '--load-checkpoint',
            '--feature-param-file', '--threeprime-termini',
            '--min-length-long-fiveprime', '--min-trna-fragment-size',
            '--agglomeration-max-mismatch-freq', '--skip-INDEL-profiling',
            '--max-indel-freq', '--left-indel-buffer', '--right-indel-buffer',
            '--skip-fasta-check', '--alignment-target-chunk-size',
            '--profiling-chunk-size'
        ]
        rule_acceptable_params_dict['anvi_merge_trnaseq'] = [
            'run', '--project-name', '--max-reported-trna-seeds',
            '--overwrite-output-destinations', '--description',
            '--feature-threshold', '--preferred-treatment',
            '--nonspecific-output', '--min-variation', '--min-third-fourth-nt',
            '--min-indel-fraction', '--distance', '--linkage'
        ]
        rule_acceptable_params_dict['anvi_run_trna_taxonomy'] = [
            'run', '--trna-taxonomy-data-dir', '--min-percent-identity',
            '--max-num-target-sequences', '--num-parallel-processes',
            '--write-buffer-size'
        ]
        rule_acceptable_params_dict['anvi_tabulate_trnaseq'] = [
            'run', '--overwrite-output-destinations'
        ]
        self.rule_acceptable_params_dict.update(rule_acceptable_params_dict)

        # Default values for accessible parameters: all defaults are written to the config file so
        # the user can see them succinctly.

        # Though the workflow superclass automatically adds a threads argument of "" to each
        # workflow, here we make explicit that the default is 1 and the user does not need to
        # enclose the value in quotes. Likewise, the superclass adds mandatory arguments at the end
        # of the list for each rule in the config file, but we explicitly add them here to ensure
        # they appear in the order of each script's help display.
        self.default_config.update({
            'samples_txt': 'samples.txt',
            'iu_merge_pairs': {
                'run': True,
                '--gzip-output': False,
                '--marker-gene-stringent': True,
                '--max-num-mismatches': 0,
                '--report-r1-prefix': False,
                '--report-r2-prefix': False,
                'threads': 1
            },
            'anvi_reformat_fasta': {
                'run': True,
                '--gzip-output': False,  # not an argument of anvi-script-reformat-fasta
                '--simplify-names': True,  # not the default in anvi-script-reformat-fasta
                'threads': 1
            },
            'anvi_trnaseq': {
                'run': True,
                '--treatment': "",  # if provided in the config file, the treatment is assumed to be for all samples
                '--overwrite-output-destinations': anvio.D['overwrite-output-destinations'][1]['default'],
                '--description': "",
                '--write-checkpoints': anvio.D['write-checkpoints'][1]['default'],
                '--load-checkpoint': "",
                '--feature-param-file': "",
                '--threeprime-termini': anvio.D['threeprime-termini'][1]['default'],
                '--min-length-long-fiveprime': anvio.D['min-length-long-fiveprime'][1]['default'],
                '--min-trna-fragment-size': anvio.D['min-trna-fragment-size'][1]['default'],
                '--agglomeration-max-mismatch-freq': anvio.D['agglomeration-max-mismatch-freq'][1]['default'],
                '--skip-INDEL-profiling': anvio.D['skip-INDEL-profiling'][1]['default'],
                '--max-indel-freq': anvio.D['max-indel-freq'][1]['default'],
                '--left-indel-buffer': anvio.D['left-indel-buffer'][1]['default'],
                '--right-indel-buffer': anvio.D['right-indel-buffer'][1]['default'],
                '--skip-fasta-check': True,  # not the default in anvi-trnaseq
                '--profiling-chunk-size': anvio.D['profiling-chunk-size'][1]['default'],
                '--alignment-target-chunk-size': anvio.D['alignment-target-chunk-size'][1]['default'],
                'threads': 1
            },
            'anvi_merge_trnaseq': {
                'run': True,
                '--project-name': "",
                '--max-reported-trna-seeds': anvio.D['max-reported-trna-seeds'][1]['default'],
                '--overwrite-output-destinations': anvio.D['overwrite-output-destinations'][1]['default'],
                '--description': "",
                '--feature-threshold': anvio.D['feature-threshold'][1]['default'],
                '--preferred-treatment': "",
                '--nonspecific-output': anvio.D['nonspecific-output'][1]['default'],
                '--min-variation': anvio.D['min-variation'][1]['default'],
                '--min-third-fourth-nt': anvio.D['min-third-fourth-nt'][1]['default'],
                '--min-indel-fraction': anvio.D['min-indel-fraction'][1]['default'],
                '--distance': anvio.D['distance'][1]['default'],
                '--linkage': anvio.D['linkage'][1]['default'],
                'threads': 1
            },
            'anvi_run_trna_taxonomy': {
                'run': True,
                '--trna-taxonomy-data-dir': "",
                '--min-percent-identity': 90,  # default in anvi-run-trna-taxonomy
                '--max-num-target-sequences': 100,  # default in anvi-run-trna-taxonomy
                '--num-parallel-processes': anvio.D['num-parallel-processes'][1]['default'],
                '--write-buffer-size': anvio.D['write-buffer-size'][1]['default'],
                'threads': 1
            },
            'anvi_tabulate_trnaseq': {
                'run': True,
                '--overwrite-output-destinations': anvio.D['overwrite-output-destinations'][1]['default'],
                'threads': 1
            },
            'output_dirs': {},  # This ensures that output_dirs comes before max_threads in the file
            'max_threads': 1
        })

        self.dirs_dict.update({
            'QC_DIR': '01_QC',
            'IDENT_DIR': '02_IDENT',
            'CONVERT_DIR': '03_CONVERT'
        })
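
The repeated `anvio.D[...][1]['default']` lookups suggest `anvio.D` maps a parameter name to an `(args, kwargs)` pair destined for argparse, with the defaults living in the kwargs dict at index 1. A toy stand-in (the real `anvio.D` is much larger, and the default value below is made up):

# toy stand-in for anvio.D: parameter name -> (argparse args, argparse kwargs)
D = {
    'max-indel-freq': (['--max-indel-freq'], {'type': float, 'default': 0.05}),
}

# one source of truth shared by the CLI and the workflow config defaults
print(D['max-indel-freq'][1]['default'])  # -> 0.05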
Example #7
    def populate_misc_data_tables(self):
        self.run.info_single("Additional data and layer orders...",
                             nl_before=1,
                             nl_after=1,
                             mc="blue")

        essential_fields = [
            f for f in self.atomic_data_fields
            if constants.IS_ESSENTIAL_FIELD(f)
        ]

        # initialize views.
        args = argparse.Namespace(profile_db=self.merged_profile_db_path)
        profile_db_super = dbops.ProfileSuperclass(args)
        profile_db_super.load_views(omit_parent_column=True)

        # figure out layer orders dictionary
        layer_orders_data_dict = {}
        failed_attempts = []
        self.progress.new('Working on layer orders')
        for essential_field in essential_fields:
            self.progress.update('recovering order for "%s"' %
                                 (essential_field))
            try:
                data_value = clustering.get_newick_tree_data_for_dict(
                    profile_db_super.views[essential_field]['dict'],
                    distance=self.distance,
                    linkage=self.linkage,
                    transpose=True)

                layer_orders_data_dict[essential_field] = {
                    'data_value': data_value,
                    'data_type': 'newick'
                }
            except Exception:
                failed_attempts.append(essential_field)
        self.progress.end()

        if not len(layer_orders_data_dict):
            self.run.warning(
                "This may or may not be important: anvi'o attempted to generate orders for your\
                              samples based on the view data, however, it failed :/"
            )
            return

        if len(failed_attempts):
            self.run.warning("While anvi'o was trying to generate clusterings of samples based on view data\
                              available in the merged profile, clustering of some of the essential data\
                              failed. It is likely not a very big deal, but you shall be the judge of it.\
                              Anvi'o now proceeds to store layers order information for those view items\
                              the clustering in fact worked. Here is the list of stuff that failed: '%s'"\
                              % (', '.join(failed_attempts)))

        # add the layer orders quietly
        TableForLayerOrders(
            args, r=terminal.Run(verbose=False)).add(layer_orders_data_dict)
        self.run.warning(None, header="Layer orders added", lc='cyan')
        for layer_order in layer_orders_data_dict:
            self.run.info_single(layer_order, mc='cyan')

        # done with layer orders. let's add our layer additional data and call it a day.
        for data_group_name in self.layer_additional_data_dict:
            TableForLayerAdditionalData(
                args, r=terminal.Run(verbose=False)).add(
                    self.layer_additional_data_dict[data_group_name],
                    list(self.layer_additional_data_keys[data_group_name]),
                    data_group=data_group_name)

        self.run.warning(None, header="Data groups added", lc='cyan')
        for data_group in self.layer_additional_data_dict:
            self.run.info_single(
                '%s (w/%d items)' %
                (data_group, len(self.layer_additional_data_keys[data_group])),
                mc='cyan')
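
A guess at what `clustering.get_newick_tree_data_for_dict` boils down to: cluster one axis of a dict-of-dicts and serialize the result as a newick string. The sketch below uses scipy directly and treats merge heights as branch lengths, which is a simplification of whatever the real helper does:

import scipy.cluster.hierarchy as hier
import scipy.spatial.distance as dist

view = {'item_1': {'s1': 1.0, 's2': 9.0, 's3': 2.0},
        'item_2': {'s1': 2.0, 's2': 8.0, 's3': 1.0},
        'item_3': {'s1': 9.0, 's2': 1.0, 's3': 8.0}}

samples = sorted(next(iter(view.values())))               # the leaves (transpose=True)
matrix = [[view[i][s] for i in sorted(view)] for s in samples]
linkage = hier.linkage(dist.pdist(matrix), method='ward')

def to_newick(node, labels):
    if node.is_leaf():
        return labels[node.id]
    return '(%s,%s):%.4f' % (to_newick(node.left, labels),
                             to_newick(node.right, labels), node.dist)

print(to_newick(hier.to_tree(linkage), samples) + ';')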
Example #8
    def __init__(self,
                 args=None,
                 run=terminal.Run(),
                 progress=terminal.Progress()):
        # if a regular instance of `PhylogenomicsWorkflow` is being generated, we
        # expect it to have a parameter `args`. if there is no `args` given, we
        # assume the class is being inherited as a base class from within another
        if args:
            if len(self.__dict__):
                raise ConfigError(
                    "Something is wrong. You are inheriting `PhylogenomicsWorkflow` from \
                                   within another class, yet you are providing an `args` parameter.\
                                   This is not alright.")
            self.args = args
            self.name = 'phylogenomics'
        else:
            if not len(self.__dict__):
                raise ConfigError(
                    "When you are *not* inheriting `PhylogenomicsWorkflow` from within\
                                   a super class, you must provide an `args` parameter."
                )

            if 'name' not in self.__dict__:
                raise ConfigError(
                    "The super class trying to inherit `PhylogenomicsWorkflow` does not\
                                   have a set `self.name`. Which means there may be other things\
                                   wrong with it, hence anvi'o refuses to continue."
                )

        self.run = run
        self.progress = progress

        self.input_for_anvi_get_sequences_for_hmm_hits = {}
        self.internal_genomes_file = ''
        self.external_genomes_file = ''

        # initialize the base class
        WorkflowSuperClass.__init__(self)

        self.rules.extend(
            ['anvi_get_sequences_for_hmm_hits', 'trimal', 'iqtree'])

        self.general_params.extend(['project_name'])

        self.dirs_dict.update({"PHYLO_DIR": "01_PHYLOGENOMICS"})

        self.default_config.update({
            'anvi_get_sequences_for_hmm_hits': {
                '--return-best-hit': True,
                '--align-with': 'famsa',
                '--concatenate-genes': True,
                '--get-aa-sequences': True,
                '--hmm-sources': 'Campbell_et_al'
            },
            'trimal': {
                '-gt': 0.5
            },
            'iqtree': {
                'threads': 8,
                '-m': 'WAG',
                '-bb': 1000
            }
        })

        get_sequences_params = [
            '--external-genomes', '--internal-genomes', '--return-best-hit',
            '--separator', '--align-with', '--min-num-bins-gene-occurs',
            '--max-num-genes-missing-from-bin', '--concatenate-genes',
            '--get-aa-sequences', '--gene-names', '--hmm-sources'
        ]
        self.rule_acceptable_params_dict['anvi_get_sequences_for_hmm_hits'] = get_sequences_params
        self.rule_acceptable_params_dict['trimal'] = [
            '-gt', 'additional_params'
        ]
        self.rule_acceptable_params_dict['iqtree'] = [
            '-m', '-bb', 'additional_params'
        ]
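
The `len(self.__dict__)` checks above are a trick to tell "standalone instance" apart from "used as a base class": a subclass that ran its own `__init__` first will already have populated the instance dict. The trick in isolation (all names here are illustrative):

class Base:
    def __init__(self, args=None):
        # an empty self.__dict__ means no subclass __init__ ran before us
        if args and len(self.__dict__):
            raise RuntimeError("Base is being inherited; don't also pass `args`.")
        if not args and not len(self.__dict__):
            raise RuntimeError("Standalone use of Base requires `args`.")
        if args:
            self.args = args

class Child(Base):
    def __init__(self, args):
        self.args = args      # self.__dict__ is already populated here...
        self.name = 'child'
        Base.__init__(self)   # ...so the base class takes the 'inherited' path

Child(args={'x': 1})   # fine: inherited path
Base(args={'x': 1})    # fine too: standalone path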
Example #9
import anvio.auxiliarydataops as auxiliarydataops

from anvio.errors import ConfigError
from anvio.constants import codon_to_AA

__author__ = "A. Murat Eren"
__copyright__ = "Copyright 2015, The anvio Project"
__credits__ = []
__license__ = "GPL 3.0"
__version__ = anvio.__version__
__maintainer__ = "A. Murat Eren"
__email__ = "*****@*****.**"

pp = terminal.pretty_print
progress = terminal.Progress()
run = terminal.Run(width=62)


class VariabilitySuper(object):
    def __init__(self, args={}, p=progress, r=run):
        self.args = args

        self.data = {}

        self.splits_of_interest = set([])
        self.samples_of_interest = set([])

        A = lambda x, t: t(args.__dict__[x]) if x in args.__dict__ else None
        null = lambda x: x
        self.bin_id = A('bin_id', null)
        self.collection_name = A('collection_name', null)
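
This variant of the accessor takes a casting function as its second argument, with `null` as the identity for values that should pass through untouched. In isolation:

import argparse

args = argparse.Namespace(min_coverage='10', bin_id='Bin_1')

A = lambda x, t: t(args.__dict__[x]) if x in args.__dict__ else None
null = lambda x: x

print(A('min_coverage', int))      # -> 10 (cast from the string '10')
print(A('bin_id', null))           # -> Bin_1 (passed through)
print(A('collection_name', null))  # -> None (absent attribute)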
Example #10
    def __store_concatenated_hmm_sequences_into_FASTA(
            self,
            hmm_sequences_dict_for_splits,
            output_file_path,
            wrap=120,
            concatenate_genes=False,
            separator='XXX',
            genes_order=None,
            align_with=None):
        """Generates concatenated sequences from `hmm_sequences_dict_for_splits` dict.

           Please do NOT directly access to this function, and use `store_hmm_sequences_into_FASTA`
           instead.
        """

        if len(self.sources) != 1:
            raise ConfigError(
                "If you want your genes to be concatenated, you should be requesting a single HMM source. Why?\
                               In fact we are not exactly sure why. But when we think of it, we couldn't come up with a \
                               scenario where the user might truly be interested in concatenating genes from multiple HMM\
                               sources, and we wanted to add a control in case they are making a mistake w/o realizing. If you\
                               are sure this is what you must do for the question you are interested in, please send an\
                               e-mail to the anvi'o discussion group, and convince us .. or you can just delete this if block\
                               to avoid this check if you are not in the mood. We know the feeling."
            )

        hmm_source = self.sources.pop()
        gene_names_in_source = [
            g.strip()
            for g in self.hmm_hits_info[hmm_source]['genes'].split(',')
        ]

        # the user wants to play rough. FINE. we will concatenate genes for phylogenomic analyses.
        gene_names = None

        # let's get an instance of the aligner early on so we learn about issues before it's too late.
        aligner = self.get_aligner(align_with)

        # let's learn about what we have in this dictionary first.
        bin_names_in_dict = list(
            set([x['bin_id'] for x in hmm_sequences_dict_for_splits.values()]))
        gene_names_in_dict = sorted(
            list(
                set([
                    x['gene_name']
                    for x in hmm_sequences_dict_for_splits.values()
                ])))

        # if the function is called with a particular set and order of genes, use those, otherwise
        # stick with the gene names / order we found in the dictionary.
        if genes_order:
            genes_in_genes_order_but_missing_in_hmm_source = [
                g for g in genes_order if g not in gene_names_in_source
            ]
            if len(genes_in_genes_order_but_missing_in_hmm_source):
                raise ConfigError("One or more gene names in the genes order list does seem to appear among the genes described\
                                   by the HMM source %s (which translates to 'terrible news'). Here are the genes that cause this\
                                   issue if you want to fix this: '%s'" \
                                              % (hmm_source, ', '.join(genes_in_genes_order_but_missing_in_hmm_source)))
            gene_names = genes_order
        else:
            self.run.warning(
                "You did not define any gene names. Bold move. Now anvi'o will attempt to report a file with all\
                              genes defined in the HMM source '%s'." %
                hmm_source)

            gene_names = gene_names_in_dict

        # gene lengths are especially important to accommodate missing genes with the proper
        # number of gap characters
        gene_lengths = {}

        # build a simpler dict that keeps gene sequences for each bin for a given gene name
        genes_in_bins_dict = {}
        for entry in hmm_sequences_dict_for_splits.values():
            gene_name = entry['gene_name']
            bin_name = entry['bin_id']
            sequence = entry['sequence']
            if gene_name in genes_in_bins_dict:
                genes_in_bins_dict[gene_name][bin_name] = sequence
            else:
                genes_in_bins_dict[gene_name] = {bin_name: sequence}

        # align homolog sequences across bins
        self.progress.new('Aligning homolog gene sequences pre-concatenation')
        all_gene_names = list(genes_in_bins_dict.keys())
        num_genes = len(all_gene_names)
        for i, gene_name in enumerate(all_gene_names):
            self.progress.update('working on %s (%d of %d) ...' %
                                 (gene_name, i + 1, num_genes))
            genes_list = [(bin_name, genes_in_bins_dict[gene_name][bin_name])
                          for bin_name in genes_in_bins_dict[gene_name]]
            genes_in_bins_dict[gene_name] = aligner(
                run=terminal.Run(verbose=False)).run_stdin(genes_list)
            gene_lengths[gene_name] = len(
                list(genes_in_bins_dict[gene_name].values())[0])
        self.progress.end()

        # concatenate all of them and write them in a file
        f = open(output_file_path, 'w')
        gene_names_missing_from_everywhere = []
        for bin_name in bin_names_in_dict:
            sequences_list = []

            for gene_name in gene_names:
                if gene_name in genes_in_bins_dict:
                    if bin_name in genes_in_bins_dict[gene_name]:
                        sequences_list.append(
                            genes_in_bins_dict[gene_name][bin_name])
                    else:
                        sequences_list.append('-' * gene_lengths[gene_name])
                else:
                    # if we are here, it means this is a gene that has been missing from the hmm hits dict, since it
                    # was not in any of the bins the dict described, but the user requested to have it in the
                    # alignment anyway. This can happen when the user wants to concatenate genes from one or more
                    # low-completion bins. We will keep track of them, and tell the user.
                    sequences_list.append('-' * 42)
                    gene_names_missing_from_everywhere.append(gene_name)

            sequence = separator.join(sequences_list)

            if wrap:
                sequence = textwrap.fill(sequence,
                                         wrap,
                                         break_on_hyphens=False)

            f.write('>%s genes:%s|separator:%s\n' %
                    (bin_name, ','.join(gene_names), separator))
            f.write('%s\n' % sequence)

        if len(gene_names_missing_from_everywhere):
            run.warning("You asked for some genes that were missing from all bins this class had in the\
            HMM hits dictionary (here is a list of them: '%s'). Not knowing what to do with this werid\
            situation, anvi'o put gap characters for all of them and retained your order. Here are those\
            genes that missed the party: '%s'"                                               % \
                (', '.join(bin_names_in_dict), ', '.join(gene_names_missing_from_everywhere)))

        f.close()
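
The heart of the method is the padding rule: a gene absent from a bin contributes a run of gap characters as long as that gene's alignment, so every concatenated sequence stays the same length. The rule in isolation, on toy data:

genes_in_bins_dict = {
    'RecA': {'bin_1': 'MKLV', 'bin_2': 'MKIV'},
    'RpoB': {'bin_1': 'AGGT'},                    # missing from bin_2
}
gene_lengths = {g: len(next(iter(seqs.values())))
                for g, seqs in genes_in_bins_dict.items()}

separator = 'XXX'
for bin_name in ['bin_1', 'bin_2']:
    pieces = [genes_in_bins_dict[g].get(bin_name, '-' * gene_lengths[g])
              for g in sorted(genes_in_bins_dict)]
    print('>%s\n%s' % (bin_name, separator.join(pieces)))
# bin_2 comes out as MKIVXXX----: gap characters where RpoB is missing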
Example #11
# some tests for SCG taxonomy string processing

import argparse

import anvio.terminal as terminal
import anvio.scgtaxonomyops as scgtaxonomyops

levels_of_taxonomy = ["t_domain", "t_phylum", "t_class", "t_order", "t_family", "t_genus", "t_species"]

c = scgtaxonomyops.PopulateContigsDatabaseWithSCGTaxonomy(argparse.Namespace(skip_sanity_check=True), run=terminal.Run(verbose=False))

p = scgtaxonomyops.SCGTaxonomyEstimatorSingle(argparse.Namespace(skip_sanity_check=True, skip_init=True), run=terminal.Run(verbose=False))

cX = lambda: c.get_consensus_hit(scg_raw_hits)
cT = lambda level: cX()[level]

def pX(scg_dict):
    for i in scg_dict:
        scg_dict[i]['tax_hash'] = scgtaxonomyops.HASH(scg_dict[i])
    return p.get_consensus_taxonomy(scg_dict)

pT = lambda level: pX(scg_dict)[level]


#########################################

scg_raw_hits = [{'percent_identity': 100.0, 't_domain': 'A', 't_phylum': 'B', 't_class': 'C', 't_order': 'D', 't_family': 'E', 't_genus': 'F', 't_species': 'G x'},
                {'percent_identity': 100.0, 't_domain': 'A', 't_phylum': 'B', 't_class': 'C', 't_order': 'D', 't_family': 'E', 't_genus': 'F', 't_species': 'G x'},
                {'percent_identity': 100.0, 't_domain': 'A', 't_phylum': 'B', 't_class': 'C', 't_order': 'D', 't_family': 'E', 't_genus': 'F', 't_species': 'G x'}]

assert cT('t_species') == 'G x'
Example #12
    def __init__(self, args, run=terminal.Run(), progress=terminal.Progress()):
        self.run = run
        self.progress = progress

        A = lambda x: (args.__dict__[x] if x in args.__dict__ else None) if args else None

        if self.mode == 'train':
            # resolve paths only after checking they were provided at all;
            # os.path.abspath(None) would otherwise raise a TypeError
            self.genomes_dir = A('genomes_dir')
            self.classifier_output_path = A('output')

            if A('classifier'):
                raise ConfigError("You should not initialize the domain training class with an input classifier path (`args.classifier`).")

            if not self.genomes_dir:
                raise ConfigError("You must provide a genomes directory. Please read the help menu if you are not sure\
                                   what the contents of this directory should look like.")

            self.genomes_dir = os.path.abspath(self.genomes_dir)
            self.classifier_output_path = os.path.abspath(self.classifier_output_path)

            filesnpaths.is_output_file_writable(self.classifier_output_path)
            filesnpaths.is_file_exists(self.genomes_dir)

        elif self.mode == 'predict':
            if A('output'):
                raise ConfigError("You should not initialize the domain prediction class with an output classifier path (`args.output`).")

            default_classifier_path = 'misc/SCGDOMAINCLASSIFIER.rf'
            self.input_classifier_path = A('classifier') or os.path.join(os.path.dirname(anvio.data.__file__), default_classifier_path)

            if A('classifier'):
                filesnpaths.is_file_exists(self.input_classifier_path)
            else:
                if not filesnpaths.is_file_exists(self.input_classifier_path, dont_raise=True):
                    raise ConfigError("Somehow, this anvi'o installation dose not seem to have a SCG domain classifier. This is one of\
                                       those anvi'o things that should never happen. If you are an anvi'o user, please feel free to panic :(\
                                       If you are an anvi'o developer, what you need to do is to follow the instructions in \
                                       `anvi-script-gen-scg-domain-classifier` with a reasonable set of genomes and store the resulting\
                                       classifier at the default anvi'o path of /blah/blah/anvio/data/%s." % (default_classifier_path))

            self.rf = RF(self.input_classifier_path, r=self.run, p=self.progress)
            self.rf.initialize_classifier()

        else:
            raise ConfigError("Someone initialized the SCG domain classifier class without an explicit mode :(")

        self.SCG_sources = [d for d in hmm_data.sources if hmm_data.sources[d]['kind'] == 'singlecopy']
        self.SCG_domains = sorted([hmm_data.sources[source]['domain'] for source in self.SCG_sources])
        self.SCG_domain_to_source = dict([(hmm_data.sources[source]['domain'], source) for source in self.SCG_sources])

        if not len(self.SCG_sources):
            raise ConfigError("There is something wrong :( There is not even a single SCG source found. Usually\
                               anvi'o comes with multiple of them :/")

        if len(self.SCG_sources) == 1:
            raise ConfigError("There is only a single SCG source in your anvi'o installation. It is OK if you are\
                               being a hacker and playing with things, but there is no logic behind creating a\
                               classifier with a single class.")

        if len(self.SCG_domains) != len(set(self.SCG_domains)):
            raise ConfigError("Something is wrong. For each domain, there must be a single sinlge-copy core gene\
                               source.")

        self.data, self.labels, self.features = [], [], []

        for domain in self.SCG_domains:
            self.features.extend(sorted(hmm_data.sources[self.SCG_domain_to_source[domain]]['genes']))

        self.run.info('SCG domain classifier mode', self.mode)
        self.run.info("SCG domains found", ', '.join(self.SCG_domains))
        self.run.info("Num features", len(self.features))
Example #13
    def __init__(self,
                 db_path,
                 client_version,
                 new_database=False,
                 ignore_version=False,
                 read_only=False,
                 skip_rowid_prepend=False,
                 run=terminal.Run(),
                 progress=terminal.Progress()):
        self.db_path = db_path
        self.read_only = read_only
        self.version = None

        self.run = run
        self.progress = progress

        # these anonymous functions report whether the ROWID will be added
        # to rows read from the database or not. if the first column of a given
        # table does not contain unique values, anvi'o prepends the ROWID of each
        # row at index 0, unless `skip_rowid_prepend` is True
        self.ROWID_PREPENDS_ROW_DATA = lambda table_name: False if skip_rowid_prepend else tables.requires_unique_entry_id[table_name]
        self.PROPER_SELECT_STATEMENT = lambda table_name: 'ROWID as "entry_id", *' if self.ROWID_PREPENDS_ROW_DATA(table_name) else '*'

        if new_database:
            filesnpaths.is_output_file_writable(db_path)
        else:
            filesnpaths.is_file_exists(db_path)

        if new_database and os.path.exists(self.db_path):
            os.remove(self.db_path)

        if self.read_only and new_database:
            raise ConfigError(
                "One cannot create a new database that is read-only.")

        if not self.read_only:
            self.check_if_db_writable()

        try:
            self.conn = sqlite3.connect(self.db_path)
        except Exception as e:
            raise ConfigError(
                f"This one time someone was not happy with '{self.db_path}' and '{e}', they said."
            )

        self.conn.text_factory = str

        self.cursor = self.conn.cursor()

        self.table_names_in_db = self.get_table_names()

        self.db_connected = True

        if new_database:
            self.create_self()
            self.set_version(client_version)
        else:
            self.version = self.get_version()
            if str(self.version) != str(client_version) and not ignore_version:
                if int(self.version) > int(client_version):
                    progress.reset()
                    raise ConfigError(
                        "Bad news of the day: the database at %s was generated with an anvi'o version that is 'newer' than "
                        "the one you are actively using right now. We know, you hate to hear this, but you need to upgrade "
                        "your anvi'o :(" % self.db_path)
                else:
                    progress.reset()
                    raise ConfigError(
                        f"The database at '{self.db_path}' is outdated (this database is v{self.version} and your anvi'o installation "
                        f"wants to work with v{client_version}). You can migrate your database without losing any data using the "
                        f"program `anvi-migrate` with either of the flags `--migrate-dbs-safely` or `--migrate-dbs-quickly`."
                    )

            bad_tables = [
                table_name for table_name in self.table_names_in_db
                if table_name not in tables.requires_unique_entry_id
            ]
            if len(bad_tables):
                raise ConfigError(
                    "You better be a programmer tinkering with anvi'o databases adding new tables or something. Otherwise we "
                    "have quite a serious problem :/ Each table in a given anvi'o database must have an entry in the "
                    "anvio/tables/__init__.py dictionary `requires_unique_entry_id` to explicitly define whether anvi'o "
                    "should add a unique entry id for its contents upon retrieval as a dictionary. The following tables "
                    "in this database do not satisfy that: '%s'. You can solve this problem by adding an entry into that "
                    "dictionary." % (', '.join(bad_tables)))
Example #14
    def process(self):
        self.sanity_check()

        self.run.info('Input metadata file', self.metadata_file_path)
        self.run.info('Output directory', self.output_directory_path)

        columns = utils.get_columns_of_TAB_delim_file(self.metadata_file_path)
        if 'organism_name' not in columns or 'local_filename' not in columns:
            raise ConfigError(
                "The metadata file you provided does not look like a metadata\
                               file output from the program `ncbi-genome-download` :/ Why?\
                               Because anvi'o expects that file to have at least the following\
                               two columns in it: 'organism_name' and 'local_filename'."
            )

        metadata = utils.get_TAB_delimited_file_as_dictionary(
            self.metadata_file_path)

        for entry in metadata:
            if not os.path.exists(metadata[entry]['local_filename']):
                raise ConfigError(
                    "At least one of the files in your metadata input does not seem to be\
                                   where they think they are :/ Please make sure the entry %s and others\
                                   point to proper local file paths..." %
                    entry)

        self.run.info('Num entries in metadata', len(metadata))

        output_fasta_dict = {}
        self.progress.new("GenBank to anvi'o",
                          progress_total_items=len(metadata))
        for entry in metadata:
            self.progress.increment()
            self.progress.update('Processing %s ...' % entry)

            # set the organism name and accession id and clean them from weird
            # characters.
            organism_name = metadata[entry]['organism_name']
            for char in [
                    c for c in organism_name
                    if c not in OK_CHARS_FOR_ORGANISM_NAME
            ]:
                organism_name = organism_name.replace(char, '_')

            accession_id = entry
            for char in [
                    c for c in accession_id if c not in OK_CHARS_FOR_ACCESSION
            ]:
                accession_id = accession_id.replace(char, '_')

            final_name = '_'.join([organism_name, accession_id])

            args = argparse.Namespace(
                input_genbank=metadata[entry]['local_filename'],
                output_file_prefix=os.path.join(self.output_directory_path,
                                                final_name))
            g = GenbankToAnvio(args,
                               run=terminal.Run(verbose=False),
                               progress=terminal.Progress(verbose=False))

            if final_name in output_fasta_dict:
                raise ConfigError(
                    "The final name '%s' for your genome has already been used by\
                                   another one :/ This should never happen unless your metadata\
                                   contains entries with identical accession numbers..." % final_name
                )
            output_fasta_dict[final_name] = g.process()

        self.progress.end()

        headers = ['name', 'path']
        if not self.exclude_gene_calls_from_fasta_txt:
            headers.extend(
                ['external_gene_calls', 'gene_functional_annotation'])

        utils.store_dict_as_TAB_delimited_file(output_fasta_dict,
                                               self.output_fasta_descriptor,
                                               headers=headers)

        self.run.info('Output FASTA descriptor', self.output_fasta_descriptor)
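A note on the name cleaning above: both loops follow the same allowlist pattern, replacing every character outside an approved set with an underscore. A self-contained sketch of that idea (the allowlist below is an assumption for illustration; anvi'o defines its own OK_CHARS_FOR_ORGANISM_NAME and OK_CHARS_FOR_ACCESSION constants elsewhere):

import string

# Assumed allowlist for illustration; anvi'o ships its own OK_CHARS_* constants.
OK_CHARS = string.ascii_letters + string.digits + '_'

def sanitize(name, ok_chars=OK_CHARS):
    """Replace every character not in `ok_chars` with an underscore."""
    return ''.join(c if c in ok_chars else '_' for c in name)

print(sanitize('Bacillus subtilis subsp. spizizenii'))  # Bacillus_subtilis_subsp__spizizenii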
Example #15
    def add_new_gene_calls_to_contigs_db_and_update_serach_results_dict(self, source, search_results_dict, skip_amino_acid_sequences=False):
        """Add new gene calls to the contigs database and update the HMM `search_results_dict`.

           When we are looking for HMM hits in the context of CONTIGS, our hits do not
           relate to the gene calls we already have in a given contigs database. One
           solution is to add additional gene calls for a given set of HMM hits to keep
           them in the database."""

        if not len(search_results_dict):
            return search_results_dict

        # we will first learn the next available id in the gene callers table
        database = db.DB(self.db_path, utils.get_required_version_for_db(self.db_path))
        next_id = database.get_max_value_in_column('genes_in_contigs', 'gene_callers_id', value_if_empty=0) + 1
        database.disconnect()

        additional_gene_calls = {}
        for e in search_results_dict.values():
            start = e['start']
            stop = e['stop']

            if stop > start:
                direction = 'f'
            else:
                direction = 'r'
                stop, start = start, stop

            partial = 0 if ((stop - start) % 3 == 0) else 1

            # add a new gene call into the dictionary
            additional_gene_calls[next_id] = {'contig': e['contig_name'],
                                              'start': start,
                                              'stop': stop,
                                              'direction': direction,
                                              'partial': partial,
                                              'source': source,
                                              'version': 'unknown'
                                            }

            # update the search results dictionary with gene callers id:
            e['gene_callers_id'] = next_id

            # update the next available gene callers id:
            next_id += 1

        if not len(additional_gene_calls):
            return search_results_dict

        # update the contigs db with the gene calls in `additional_gene_calls` dict.
        gene_calls_table = TablesForGeneCalls(self.db_path, run=terminal.Run(verbose=False))
        gene_calls_table.use_external_gene_calls_to_populate_genes_in_contigs_table(input_file_path=None,
                                                                                    gene_calls_dict=additional_gene_calls,
                                                                                    ignore_internal_stop_codons=True,
                                                                                    skip_amino_acid_sequences=skip_amino_acid_sequences)
        gene_calls_table.populate_genes_in_splits_tables(gene_calls_dict=additional_gene_calls)

        # refresh the gene calls dict
        self.init_gene_calls_dict()

        self.run.info('Gene calls added to db', '%d (from source "%s")' % (len(additional_gene_calls), source))

        return search_results_dict
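The coordinate handling in the loop above (direction from start/stop order, partial flag when the hit length is not a multiple of three) is worth isolating; a minimal, self-contained sketch of the same logic:

def normalize_hit(start, stop):
    """Return (start, stop, direction, partial) with start < stop.

    Direction is 'f' when the original coordinates were ascending, 'r'
    otherwise; a hit is flagged partial when its length is not a multiple
    of three (i.e., it cannot be a clean open reading frame).
    """
    direction = 'f' if stop > start else 'r'
    if direction == 'r':
        start, stop = stop, start
    partial = 0 if (stop - start) % 3 == 0 else 1
    return start, stop, direction, partial

assert normalize_hit(10, 100) == (10, 100, 'f', 0)
assert normalize_hit(100, 10) == (10, 100, 'r', 0)
assert normalize_hit(0, 50) == (0, 50, 'f', 1)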
Example #16
    def run_search_and_parse_results(self):
        """Align the protein against the database based on only sequence"""

        if not self.percent_identical_cutoff or not self.max_number_templates:
            raise ConfigError(
                "run_search_and_parse_results :: You initiated this class without providing values for percent_identical_cutoff "
                "and max_number_templates, which is required for this function."
            )

        # Change to MODELLER working directory
        os.chdir(self.directory)

        driver = diamond.Diamond(
            query_fasta=self.target_fasta_path,
            target_fasta=J(self.database_dir,
                           self.modeller_database + '.dmnd'),
            run=terminal.Run(verbose=False),
            progress=terminal.Progress(verbose=False),
        )
        driver.blastp()

        # Change back to user directory
        os.chdir(self.start_dir)

        search_df = driver.view_as_dataframe(
            J(self.directory, driver.tabular_output_path))

        matches_found = search_df.shape[0]

        if not matches_found:
            self.run.warning(
                "No proteins with homologous sequence were found for {}. No structure will be modelled"
                .format(self.corresponding_gene_call))
            raise self.EndModeller

        # We need the gene length for proper_pident
        target_fasta = u.SequenceSource(self.target_fasta_path,
                                        lazy_init=False)
        while next(target_fasta):
            gene_length = len(target_fasta.seq)

        # add some useful columns
        search_df["proper_pident"] = search_df["pident"] * search_df[
            "length"] / gene_length
        search_df["code"] = search_df["sseqid"].str[:-1]
        search_df["chain"] = search_df["sseqid"].str[-1]

        # filter results by self.percent_identical_cutoff.
        max_pident_found = search_df["proper_pident"].max()
        id_of_max_pident = tuple(search_df.loc[search_df["proper_pident"].idxmax(), ["code", "chain"]].values)
        search_df = search_df[search_df["proper_pident"] >= self.percent_identical_cutoff]

        search_df = search_df.sort_values("proper_pident", ascending=False)

        # If more than 1 template in 1 PDB id, just choose 1
        search_df = search_df.drop_duplicates('code', keep='first')

        # Order them and take the first self.modeller.max_number_templates.
        matches_after_filter = len(search_df)
        if not matches_after_filter:
            self.run.warning("Gene {} did not have a search result with proper percent identicalness above or equal "
                             "to {}%. The best match was chain {} of https://www.rcsb.org/structure/{}, which had a "
                             "proper percent identicalness of {:.2f}%. No structure will be modelled.".\
                              format(self.corresponding_gene_call,
                                     self.percent_identical_cutoff,
                                     id_of_max_pident[1],
                                     id_of_max_pident[0],
                                     max_pident_found))
            raise self.EndModeller

        # get up to self.modeller.max_number_templates of those with the highest proper_ident scores.
        search_df = search_df.iloc[:min([len(search_df), self.max_number_templates])]

        # Get their chain and 4-letter ids
        self.list_of_template_code_and_chain_ids = list(
            zip(search_df["code"], search_df["chain"]))

        self.run.info("Max number of templates allowed",
                      self.max_number_templates)
        self.run.info("Number of candidate templates", matches_found)
        self.run.info(
            "After >{}% identical filter".format(
                self.percent_identical_cutoff), matches_after_filter)
        self.run.info("Number accepted as templates",
                      len(self.list_of_template_code_and_chain_ids))

        # update user on which templates are used, and write the templates to self.out
        for i in range(len(self.list_of_template_code_and_chain_ids)):
            pdb_id, chain_id = self.list_of_template_code_and_chain_ids[i]
            ppi = search_df["proper_pident"].iloc[i]

            self.out["templates"]["pdb_id"].append(pdb_id)
            self.out["templates"]["chain_id"].append(chain_id)
            self.out["templates"]["ppi"].append(ppi)

            self.run.info(
                "Template {}".format(i + 1),
                "Protein ID: {}, Chain {} ({:.1f}% identical)".format(
                    pdb_id, chain_id, ppi))
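The template selection above boils down to a short pandas pipeline: length-normalize the percent identity, threshold it, keep one chain per PDB code, and take the top N. A toy sketch with made-up numbers (only `pandas` is needed):

import pandas as pd

# Toy DIAMOND-like hits; the values are made up for illustration.
hits = pd.DataFrame({
    'sseqid': ['1abcA', '1abcB', '2xyzA'],
    'pident': [90.0, 88.0, 45.0],
    'length': [100, 100, 50],
})
gene_length = 100
cutoff, max_templates = 30, 5

hits['proper_pident'] = hits['pident'] * hits['length'] / gene_length
hits['code'] = hits['sseqid'].str[:-1]   # 4-letter PDB id
hits['chain'] = hits['sseqid'].str[-1]   # trailing chain letter

templates = (hits[hits['proper_pident'] >= cutoff]
             .sort_values('proper_pident', ascending=False)
             .drop_duplicates('code', keep='first')   # one chain per PDB id
             .head(max_templates))

print(list(zip(templates['code'], templates['chain'])))  # [('1abc', 'A')]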
Example #17
    def populate_search_tables(self, sources={}):
        # make sure the output file is OK to write.
        filesnpaths.is_output_file_writable(self.db_path, ok_if_exists=True)

        # if no sources were passed in, fall back to the default HMM sources anvi'o ships with:
        if not len(sources):
            import anvio.data.hmm
            sources = anvio.data.hmm.sources

        if not sources:
            return

        target_files_dict = {}

        tmp_directory_path = filesnpaths.get_temp_directory_path()

        # here we will go through targets and populate target_files_dict based on what we find among them.
        targets = set([s['target'] for s in list(sources.values())])
        for target in targets:
            alphabet, context = utils.anvio_hmm_target_term_to_alphabet_and_context(target)

            if not self.genes_are_called and context != "CONTIG":
                raise ConfigError("You are in trouble. The gene calling was skipped for this contigs database, yet anvi'o asked to run an "
                                  "HMM profile that wishes to operate on %s context using the %s alphabet. It is not OK. You still could run "
                                  "HMM profiles that does not require gene calls to be present (such as the HMM profile that identifies Ribosomal "
                                  "RNAs in contigs, but for that you would have to explicitly ask for it by using the additional parameter "
                                  "'--installed-hmm-profile Ribosomal_RNAs')." % (context, alphabet))

            self.run.info('Target found', '%s:%s' % (alphabet, context))

            class Args: pass
            args = Args()
            args.contigs_db = self.db_path
            contigs_db = ContigsSuperclass(args, r=terminal.Run(verbose=False))

            if context == 'GENE':
                target_files_dict['%s:GENE' % alphabet] = os.path.join(tmp_directory_path, '%s_gene_sequences.fa' % alphabet)
                contigs_db.gen_FASTA_file_of_sequences_for_gene_caller_ids(output_file_path=target_files_dict['%s:GENE' % alphabet],
                                                                           simple_headers=True,
                                                                           rna_alphabet=True if alphabet=='RNA' else False,
                                                                           report_aa_sequences=True if alphabet=='AA' else False)
            elif context == 'CONTIG':
                if alphabet == 'AA':
                    raise ConfigError("You are somewhere you shouldn't be. You came here because you thought it would be OK "
                                      "to ask for AA sequences in the CONTIG context. The answer to that is 'no, thanks'. If "
                                      "you think this is dumb, please let us know.")
                else:
                    target_files_dict['%s:CONTIG' % alphabet] = os.path.join(tmp_directory_path, '%s_contig_sequences.fa' % alphabet)
                    utils.export_sequences_from_contigs_db(self.db_path,
                                                           target_files_dict['%s:CONTIG' % alphabet],
                                                           rna_alphabet=True if alphabet=='RNA' else False)

        commander = HMMer(target_files_dict, num_threads_to_use=self.num_threads_to_use)

        for source in sources:
            alphabet, context = utils.anvio_hmm_target_term_to_alphabet_and_context(sources[source]['target'])

            kind_of_search = sources[source]['kind']
            domain = sources[source]['domain']
            all_genes_searched_against = sources[source]['genes']
            hmm_model = sources[source]['model']
            reference = sources[source]['ref']
            noise_cutoff_terms = sources[source]['noise_cutoff_terms']

            hmm_scan_hits_txt = commander.run_hmmscan(source,
                                                      alphabet,
                                                      context,
                                                      kind_of_search,
                                                      domain,
                                                      len(all_genes_searched_against),
                                                      hmm_model,
                                                      reference,
                                                      noise_cutoff_terms)

            if not hmm_scan_hits_txt:
                search_results_dict = {}
            else:
                parser = parser_modules['search']['hmmscan'](hmm_scan_hits_txt, alphabet=alphabet, context=context)
                search_results_dict = parser.get_search_results()

            if not len(search_results_dict):
                run.info_single("The HMM source '%s' returned 0 hits. SAD (but it's stil OK)." % source, nl_before=1)


            if context == 'CONTIG':
                # we are in trouble here. because our search results dictionary contains no gene calls, but contig
                # names contain our hits. on the other hand, the rest of the code outside of this if statement
                # expects a `search_results_dict` with gene caller ids in it. so there are two things we need to do.
                # one is to come up with some new gene calls and add them to the contigs database. so things
                # will go smoothly downstream. two, we will need to update our `search_results_dict` so it looks
                # like a dictionary the rest of the code expects with `gene_callers_id` fields. both of these
                # steps are going to be taken care of in the following function. magic.

                if source != "Ribosomal_RNAs":
                    self.run.warning("You just called an HMM profile that runs on contigs and not genes. Because this HMM "
                                     "operation is not directly working with gene calls anvi'o already knows about, the resulting "
                                     "hits will need to be added as 'new gene calls' into the contigs database. So far so good. "
                                     "But because we are in the contigs realm rater than genes realm, it is likely that "
                                     "resulting hits will not correspond to open reading frames that are supposed to be "
                                     "translated (such as ribosomal RNAs), because otherwise you would be working with genes "
                                     "instad of defining CONTIGS as your context in that HMM profile you just used unless you "
                                     "not sure what you are doing. Hence, anvi'o will not report amino acid sequences for the "
                                     "new gene calls it will recover through these HMMs. Please take a moment and you be the "
                                     "judge of whether this will influence your pangenomic analyses or other things you thought "
                                     "you would be doing with the result of this HMM search downstream. If you do not feel like "
                                     "being the judge of anything today you can move on yet remember to remember this if things "
                                     "look somewhat weird later on.",
                                     header="Psst. Your fancy HMM profile '%s' speaking" % source,
                                     lc="green")

                num_hits_before = len(search_results_dict)
                search_results_dict = utils.get_pruned_HMM_hits_dict(search_results_dict)
                num_hits_after = len(search_results_dict)

                if num_hits_before != num_hits_after:
                    self.run.info('Pruned', '%d out of %d hits were removed due to redundancy' % (num_hits_before - num_hits_after, num_hits_before))

                search_results_dict = self.add_new_gene_calls_to_contigs_db_and_update_serach_results_dict(kind_of_search,
                                                                                                           search_results_dict,
                                                                                                           skip_amino_acid_sequences=True)

            self.append(source, reference, kind_of_search, domain, all_genes_searched_against, search_results_dict)

        # FIXME: I have no clue why importing the anvio module is necessary at this point,
        #        but without this, mini test fails because "`anvio.DEBUG` is being used
        #        before initialization". nonsense.
        import anvio
        if not anvio.DEBUG:
            commander.clean_tmp_dirs()
            for v in list(target_files_dict.values()):
                os.remove(v)
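The `target` terms consumed above pair an alphabet with a context, e.g. `AA:GENE` or `RNA:CONTIG`. A minimal sketch of what a parser such as `utils.anvio_hmm_target_term_to_alphabet_and_context` presumably does; the exact validation rules in anvi'o may differ:

def target_term_to_alphabet_and_context(target):
    """Split an HMM target term like 'AA:GENE' into ('AA', 'GENE').

    Illustrative sketch only; anvi'o's own implementation performs
    stricter validation than shown here.
    """
    alphabet, _, context = target.partition(':')
    if alphabet not in ('AA', 'DNA', 'RNA'):
        raise ValueError("unknown alphabet '%s'" % alphabet)
    if context not in ('GENE', 'CONTIG'):
        raise ValueError("unknown context '%s'" % context)
    return alphabet, context

assert target_term_to_alphabet_and_context('RNA:CONTIG') == ('RNA', 'CONTIG')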
Example #18
    def __init__(self,
                 args,
                 target_fasta_path,
                 directory=None,
                 run=terminal.Run(),
                 lazy_init=False,
                 skip_warnings=False,
                 check_db_only=False):

        self.args = args
        self.run = run
        if skip_warnings and not anvio.DEBUG:
            self.run.verbose = False
        self.lazy_init = lazy_init

        self.target_fasta_path = target_fasta_path
        self.directory = directory if directory else filesnpaths.get_temp_directory_path()

        A = lambda x, t: t(args.__dict__[x]) if x in self.args.__dict__ else None
        null = lambda x: x
        self.scoring_method = A('scoring_method', str) or 'DOPE_score'
        self.very_fast = A('very_fast', bool) or False
        self.executable = A('modeller_executable', null) or up_to_date_modeller_exec
        self.num_models = A('num_models', int) or 5
        self.modeller_database = A('modeller_database', str) or 'pdb_95'
        self.max_number_templates = A('max_number_templates', null) or 5
        self.percent_identical_cutoff = A('percent_identical_cutoff', null) or 30
        self.deviation = A('deviation', null) or 4
        self.pdb_db_path = A('pdb_db', null)
        self.offline_mode = A('offline_mode', null)

        # All MODELLER scripts are housed in self.scripts_folder
        self.scripts_folder = constants.default_modeller_scripts_dir

        self.alignment_pap_path = None
        self.alignment_pir_path = None
        self.get_template_path = None
        self.target_pir_path = None
        self.template_family_matrix_path = None
        self.template_info_path = None
        self.template_pdb_dir = None
        self.model_info = None
        self.pdb_db = None
        self.use_pdb_db = False

        self.logs = {}
        self.scripts = {}

        # All MODELLER databases are housed in self.database_dir
        self.database_dir = constants.default_modeller_database_dir

        # store the original directory so we can cd back and forth between
        # self.directory and self.start_dir
        self.start_dir = os.getcwd()

        if check_db_only:
            self.check_database()
            return

        self.sanity_check()
        self.corresponding_gene_call = self.get_corresponding_gene_call_from_target_fasta_path()

        # as reward, whoever called this class will receive self.out when they run self.process()
        self.out = {
            "templates": {
                "pdb_id": [],
                "chain_id": [],
                "ppi": []
            },
            "models": {
                "molpdf": [],
                "GA341_score": [],
                "DOPE_score": [],
                "picked_as_best": []
            },
            "corresponding_gene_call": self.corresponding_gene_call,
            "structure_exists": False,
            "best_model_path": None,
            "best_score": None,
            "scoring_method": self.scoring_method,
            "percent_identical_cutoff": self.percent_identical_cutoff,
            "very_fast": self.very_fast,
            "deviation": self.deviation,
            "directory": self.directory,
        }

        # copy fasta into the working directory
        try:
            shutil.copy2(self.target_fasta_path, self.directory)
            self.target_fasta_path = J(self.directory, os.path.basename(self.target_fasta_path))
        except shutil.SameFileError:
            pass
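The `A('key', type) or default` idiom used throughout this constructor is a compact way to read optionally-present arguments from a namespace, with one caveat: a legitimately falsy value (False, 0, '') also falls through to the default. A small demonstration:

import argparse

args = argparse.Namespace(num_models=10)

A = lambda x, t: t(args.__dict__[x]) if x in args.__dict__ else None
null = lambda x: x

num_models = A('num_models', int) or 5   # 10: key present, default ignored
deviation = A('deviation', null) or 4    # 4: key absent, default used

# Caveat: a legitimate falsy value also falls through to the default.
args.very_fast = False
very_fast = A('very_fast', bool) or True  # True, even though False was given!

print(num_models, deviation, very_fast)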
Example #19
    def __init__(self,
                 hmmer_std_out,
                 context=None,
                 run=terminal.Run(),
                 progress=terminal.Progress()):
        self.run = run
        self.progress = progress

        self.hmmer_std_out = hmmer_std_out
        self.context = context

        self.set_names()

        self.ali_info = {}

        # This is converted to a dataframe after populating
        self.seq_hits = {
            self.query_col: [],
            self.acc_col: [],
            self.target_col: [],
            self.query_len_col: [],
            'evalue': [],
            'score': [],
            'bias': [],
            'best_dom_evalue': [],
            'best_dom_score': [],
            'best_dom_bias': [],
            'expected_doms': [],
            'num_doms': [],
        }

        self.seq_hits_dtypes = {
            self.query_col: str,
            self.acc_col: str,
            self.target_col: str,
            self.query_len_col: int,
            'evalue': float,
            'score': float,
            'bias': float,
            'best_dom_evalue': float,
            'best_dom_score': float,
            'best_dom_bias': float,
            'expected_doms': float,
            'num_doms': int,
        }

        # This is converted to a dataframe after populating
        self.dom_hits = {
            self.query_col: [],
            self.acc_col: [],
            self.target_col: [],
            'domain': [],
            'qual': [],
            'score': [],
            'bias': [],
            'c-evalue': [],
            'i-evalue': [],
            'hmm_start': [],
            'hmm_stop': [],
            'hmm_bounds': [],
            'ali_start': [],
            'ali_stop': [],
            'ali_bounds': [],
            'env_start': [],
            'env_stop': [],
            'env_bounds': [],
            'mean_post_prob': [],
            'match_state_align': [],
            'comparison_align': [],
            'sequence_align': [],
        }

        self.dom_hits_dtypes = {
            self.query_col: str,
            self.acc_col: str,
            self.target_col: str,
            'domain': int,
            'qual': str,
            'score': float,
            'bias': float,
            'c-evalue': float,
            'i-evalue': float,
            'hmm_start': int,
            'hmm_stop': int,
            'hmm_bounds': str,
            'ali_start': int,
            'ali_stop': int,
            'ali_bounds': str,
            'env_start': int,
            'env_stop': int,
            'env_bounds': str,
            'mean_post_prob': float,
            'match_state_align': str,
            'comparison_align': str,
            'sequence_align': str,
        }

        self.delim_query = '//\n'
        self.delim_seq = '>>'
        self.delim_domain = '=='

        self.load()
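As the comments above note, `seq_hits` and `dom_hits` are column-oriented dicts of lists that become dataframes once parsing is done; the dtype maps make that conversion a one-liner. A toy sketch (the column names here are illustrative, since the real ones depend on `set_names()`):

import pandas as pd

# Toy column-oriented store, mirroring the seq_hits layout above.
seq_hits = {'query': ['geneA', 'geneB'], 'evalue': ['1e-30', '2e-5'], 'num_doms': ['1', '2']}
seq_hits_dtypes = {'query': str, 'evalue': float, 'num_doms': int}

# One append per parsed hit while reading the file, then a single typed conversion:
df = pd.DataFrame(seq_hits).astype(seq_hits_dtypes)
print(df.dtypes)  # query: object, evalue: float64, num_doms: int64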
Example #20
    def check_database(self):
        """Setup the database files

        Downloads the .pir file if it is missing
        Binarizes .pir file if .bin is missing
        Creates the .dmnd file if it is missing
        """

        bin_db_path = J(self.database_dir, self.modeller_database + ".bin")
        pir_db_path = J(self.database_dir, self.modeller_database + ".pir")
        bin_exists = utils.filesnpaths.is_file_exists(bin_db_path,
                                                      dont_raise=True)
        pir_exists = utils.filesnpaths.is_file_exists(pir_db_path,
                                                      dont_raise=True)

        if bin_exists and pir_exists:
            # We good
            pass
        else:
            if not pir_exists:
                # Download .pir
                self.run.warning(
                    "Anvi'o looked in {} for a database with the name {} and with an extension "
                    "of either .bin or .pir, but didn't find anything matching that "
                    "criteria. Anvi'o will try and download the best database it knows of from "
                    "https://salilab.org/modeller/downloads/pdb_95.pir.gz and use that. "
                    "You can checkout https://salilab.org/modeller/ for more info about the pdb_95 "
                    "database".format(self.database_dir,
                                      self.modeller_database))

                db_download_path = os.path.join(self.database_dir,
                                                "pdb_95.pir.gz")
                utils.download_file(
                    "https://salilab.org/modeller/downloads/pdb_95.pir.gz",
                    db_download_path)
                utils.run_command(
                    ['gzip', '-d', db_download_path],
                    log_file_path=filesnpaths.get_temp_file_path())

            # Binarize .pir (make .bin)
            self.run.warning(
                "Your database is not in binary format, which means accessing its contents is slower "
                "than it could be. Anvi'o is going to make a binary version of it. Just FYI."
            )
            self.run_binarize_database(pir_db_path, bin_db_path)

        dmnd_db_path = J(self.database_dir, self.modeller_database + '.dmnd')

        if os.path.exists(dmnd_db_path):
            return

        self.run.warning(
            "Your diamond database does not exist. It will be created.")

        script_name = "pir_to_fasta.py"

        self.copy_script_to_directory(script_name)

        input_pir_path = J(self.database_dir, self.modeller_database + '.pir')
        fasta_path = J(self.database_dir, self.modeller_database + '.fa')
        dmnd_path = J(self.database_dir, self.modeller_database)

        command = [self.executable, script_name, input_pir_path, fasta_path]

        self.run_command(command, script_name=script_name, rename_log=False)

        temp = u.FastaOutput(filesnpaths.get_temp_file_path())
        fasta = u.SequenceSource(fasta_path)

        while next(fasta):
            temp.write_id(fasta.id)
            temp.write_seq(fasta.seq.replace('-', '').replace('.', 'X'))

        shutil.move(temp.output_file_path, fasta_path)
        fasta.close()
        temp.close()

        driver = diamond.Diamond(
            query_fasta=fasta_path,
            run=terminal.Run(verbose=False),
            progress=terminal.Progress(verbose=False),
        )
        driver.makedb(output_file_path=dmnd_path)

        os.remove(fasta_path)
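The download-and-decompress step above relies on anvi'o helpers (`utils.download_file`, and `utils.run_command` shelling out to `gzip -d`); an equivalent standard-library-only sketch of the same flow, with the database directory as an assumed, illustrative path:

import gzip
import os
import shutil
import urllib.request

# Assumed, illustrative location; anvi'o uses constants.default_modeller_database_dir.
db_dir = os.path.expanduser('~/modeller_db')
os.makedirs(db_dir, exist_ok=True)
gz_path = os.path.join(db_dir, 'pdb_95.pir.gz')
pir_path = os.path.join(db_dir, 'pdb_95.pir')

# download the .pir database...
urllib.request.urlretrieve('https://salilab.org/modeller/downloads/pdb_95.pir.gz', gz_path)

# ...and decompress it in-process instead of shelling out to `gzip -d`:
with gzip.open(gz_path, 'rb') as src, open(pir_path, 'wb') as dst:
    shutil.copyfileobj(src, dst)
os.remove(gz_path)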
Example #21
    def __init__(self, authors_yaml_file_path=os.path.join(os.path.dirname(anvio.__file__), 'data/misc/PEOPLE/DEVELOPERS.yaml'), skip_init=False, r=terminal.Run(), p=terminal.Progress()):
        self.run = r
        self.progress = p

        self.authors_yaml_file_path = authors_yaml_file_path
        self.author_avatars_directory = os.path.join(os.path.dirname(authors_yaml_file_path), 'AVATARS')

        self.essential_author_info_keys = ['github', 'name', 'email']

        self.authors = {}

        if not skip_init:
            self.init_authors()
Example #22
    def generate_pages_for_programs(self):
        """Generates static pages for programs in the output directory"""

        self.progress.new("Rendering program pages",
                          progress_total_items=len(self.programs))
        self.progress.update('...')

        program_provides_requires_dict = self.get_program_requires_provides_dict()

        for program_name in self.programs:
            self.progress.update(f"'{program_name}' ...", increment=True)

            program = self.programs[program_name]
            d = {
                'program': {},
                'meta': {
                    'summary_type': 'program',
                    'version': '\n'.join(['|%s|%s|' % (t[0], t[1]) for t in anvio.get_version_tuples()]),
                    'date': utils.get_date(),
                    'version_short_identifier': self.version_short_identifier
                }
            }

            d['program']['name'] = program_name
            d['program']['usage'] = program.usage
            d['program']['description'] = program.meta_info['description']['value']
            d['program']['resources'] = program.meta_info['resources']['value']
            d['program']['requires'] = program_provides_requires_dict[program_name]['requires']
            d['program']['provides'] = program_provides_requires_dict[program_name]['provides']
            d['program']['icon'] = '../../images/icons/%s.png' % 'PROGRAM'
            d['program']['authors'] = self.get_HTML_formatted_authors_data(program)
            d['artifacts'] = self.artifacts_info

            if anvio.DEBUG:
                self.progress.reset()
                run.warning(None, 'THE OUTPUT DICT')
                import json
                print(json.dumps(d, indent=2))

            self.progress.update(f"'{program_name}' ... rendering ...",
                                 increment=False)
            program_output_dir = filesnpaths.gen_output_directory(
                os.path.join(self.programs_output_dir, program_name))
            output_file_path = os.path.join(program_output_dir, 'index.md')
            with open(output_file_path, 'w') as output_file:
                output_file.write(SummaryHTMLOutput(d, r=run, p=progress).render())

            # create the program network, too
            self.progress.update(f"'{program_name}' ... rendering ... network json ...", increment=False)
            program_network = ProgramsNetwork(argparse.Namespace(output_file=os.path.join(program_output_dir, "network.json"),
                                                                 program_names_to_focus=program_name),
                                              r=terminal.Run(verbose=False))
            program_network.generate()

        self.progress.end()
Example #23
    def __init__(self,
                 program_name='fastANI',
                 args={},
                 run=terminal.Run(),
                 progress=terminal.Progress()):
        FastANIDriver.__init__(self, program_name, args, run, progress)
Example #24
    def __init__(self, contigs_db_path, scg_domain_classifier_path=None, source_requested=None, run=run, progress=progress):
        self.run = run
        self.progress = progress
        self.initialized_properly = True

        self.SCG_domain_predictor = scgdomainclassifier.Predict(argparse.Namespace(), run=terminal.Run(verbose=False), progress=self.progress)

        # hi db
        contigs_db = dbops.ContigsDatabase(contigs_db_path)

        # read info table to get what is available in the db
        info_table = contigs_db.db.get_table_as_dict(t.hmm_hits_info_table_name)

        # identify and remove non-single-copy sources of hmm search results:
        non_singlecopy_sources = set([k for k in list(info_table.keys()) if info_table[k]['search_type'] != 'singlecopy'])
        singlecopy_sources = set([k for k in list(info_table.keys()) if info_table[k]['search_type'] == 'singlecopy'])
        for non_singlecopy_source in non_singlecopy_sources:
            info_table.pop(non_singlecopy_source)

        # get the hmm hits table
        self.hmm_hits_table = contigs_db.db.get_table_as_dict(t.hmm_hits_table_name)

        # read search table (which holds hmmscan hits for splits).
        self.hmm_hits_splits_table = utils.get_filtered_dict(contigs_db.db.get_table_as_dict(t.hmm_hits_splits_table_name), 'source', singlecopy_sources)

        # an example entry in self.hmm_hits_splits_table looks like this:
        #
        # {
        #    'percentage_in_split'   : 69.6763202725724,
        #    'source'                : u'Bacteria_74',
        #    'split'                 : u'ANTARCTICAAQUATIC_SMPL_SITE231_3.0UMcontig18439_split_00001',
        #    'hmm_hit_entry_id'      : 1
        # }
        #

        # a little convenience for potential clients:
        self.http_refs = {}
        for source_in_db in info_table:
            self.http_refs[source_in_db] = [h for h in info_table[source_in_db]['ref'].split() if h.startswith('http')][0]

        self.genes_in_db = dict([(s, info_table[s]['genes'].split(', ')) for s in info_table])

        # we're done with the db
        contigs_db.disconnect()

        self.sources = list(info_table.keys())
        self.domains = set([info_table[source]['domain'] for source in self.sources])
        self.source_to_domain = dict([(source, info_table[source]['domain']) for source in self.sources])
        self.domain_to_sources = [(domain, [source for source in self.sources if info_table[source]['domain'] == domain]) for domain in self.domains]

        # compatibility sanity checks 1/2: make sure domains between domain predictor and the contigs database match
        self.domains_missing_in_SCG_domain_predictor = [d for d in self.domains if d not in self.SCG_domain_predictor.SCG_domains]
        self.domains_missing_in_SCGs_run_for_contigs = [d for d in self.SCG_domain_predictor.SCG_domains if d not in self.domains]

        if len(self.domains_missing_in_SCG_domain_predictor):
            num_domains_missing = len(self.domains_missing_in_SCG_domain_predictor)
            self.progress.reset()
            self.run.warning("OK. We have a problem. You seem to have single-copy core gene collections for among your HMM hits %s that "
                             "are not included when the anvi'o domain predictor was trained :/ Here is the list of domains that are making "
                             "us upset here: \"%s\". This means either you put a new HMM single-copy core gene collection to the anvi'o HMMs "
                             "directory, or gave it as a parameter, and run `anvi-run-hmms` without updating the classifier anvi'o uses to "
                             "resolve domains for proper completion/redundancy estimates." % \
                                           ('a domain' if num_domains_missing == 1 else '%s domains' % num_domains_missing,
                                            ', '.join(self.domains_missing_in_SCG_domain_predictor)))
            self.initialized_properly = False

        if len(self.domains_missing_in_SCGs_run_for_contigs):
            num_domains_missing = len(self.domains_missing_in_SCGs_run_for_contigs)

            if anvio.DEBUG:
                self.progress.reset()
                self.run.warning("It seems %d of the domains that are known to the classifier anvi'o uses to predict "
                                 "domains for completion estimation are missing from this contigs database. This means, the user didn't run the "
                                 "program `anvi-run-hmms` with default parameters, or removed some essential SCG domains from it later. Here is "
                                 "the list of domains that are making us upset here: \"%s\". Running `anvi-run-hmms` on this your contigs database "
                                 "will likely address this warning." % (num_domains_missing, ', '.join(self.domains_missing_in_SCG_domain_predictor)))

            # since we just established that the user did not run these domains for their contigs database,
            # we will update our self.domains variable to keep the mess that will likely take place later
            # to a convenient minimum. note that `set.discard` removes a single element, so we use
            # `difference_update` to drop every missing domain at once:
            self.domains.difference_update(set(self.domains_missing_in_SCGs_run_for_contigs))

            self.initialized_properly = False

        # compatibility sanity checks 2/2: make sure sources in the domain predictor match those in the contigs database
        self.sources_missing_in_SCGs_run_for_contigs = [s for s in self.SCG_domain_predictor.SCG_sources if s not in self.sources]
        self.sources_missing_in_SCG_domain_predictor = [s for s in self.sources if s not in self.SCG_domain_predictor.SCG_sources]
        if len(self.sources_missing_in_SCGs_run_for_contigs):
            num_sources_missing = len(self.sources_missing_in_SCGs_run_for_contigs)

            if anvio.DEBUG:
                self.progress.reset()
                self.run.warning("All the SCG domains necessary to run the predictor covered in the contigs database, however, %s that are used "
                                 "during the training of the domain predictor does not seem to occur in it :/ Here is the list of HMM sources "
                                 "that are making us upset here: \"%s\". This most likely means that either a new version of anvi'o are used with "
                                 "an older set of single-copy core gene sources, or someone is exploring new single-copy core gene sources to see "
                                 "how they behave. That's all good and very exciting, but unfortunately anvi'o will not be able to predict domains "
                                 "due to this incompatibility here. Running `anvi-run-hmms` on this contigs database would've solved this problem "
                                 "but it is not an absolute necessity as anvi'o will continue running by not utilizing domain-specific HMMs for "
                                 "completion/redundancy estimates, and report all the results all at once without prioritizing a single domain." % \
                                               ('an HMM source' if num_sources_missing == 1 else '%s HMM sources' % num_sources_missing,
                                                ', '.join(self.sources_missing_in_SCGs_run_for_contigs)))
            self.initialized_properly = False


        if source_requested:
            if source_requested not in self.sources:
                raise ConfigError('Requested source "%s" is not one of the single-copy gene sources found in the database.' % source_requested)

            # filter out sources that are not requested
            self.sources = [source_requested]
            self.genes_in_db = {source_requested: self.genes_in_db[source_requested]}
            self.hmm_hits_splits_table = utils.get_filtered_dict(self.hmm_hits_splits_table, 'source', set([source_requested]))

        # these will be very useful later. trust me.
        self.unique_gene_id_to_gene_name = {}
        self.splits_unique_gene_id_occurs = {}
        for entry in list(self.hmm_hits_splits_table.values()):
            hmm_hit = self.hmm_hits_table[entry['hmm_hit_entry_id']]
            gene_unique_identifier = hmm_hit['gene_unique_identifier']

            if gene_unique_identifier not in self.unique_gene_id_to_gene_name:
                self.unique_gene_id_to_gene_name[gene_unique_identifier] = hmm_hit['gene_name']

            if gene_unique_identifier not in self.splits_unique_gene_id_occurs:
                self.splits_unique_gene_id_occurs[gene_unique_identifier] = [entry['split']]
            else:
                self.splits_unique_gene_id_occurs[gene_unique_identifier].append(entry['split'])
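The final loop above builds two lookup tables: gene id to gene name, and gene id to the splits it occurs in. The same inverted index reads a bit more compactly with `collections.defaultdict`; a minimal equivalent sketch over toy rows:

from collections import defaultdict

# Toy rows mimicking hmm_hits_splits_table entries joined with hmm_hits_table.
hits = [
    {'gene_unique_identifier': 'g1', 'gene_name': 'Ribosomal_L2', 'split': 'contig_1_split_00001'},
    {'gene_unique_identifier': 'g1', 'gene_name': 'Ribosomal_L2', 'split': 'contig_1_split_00002'},
    {'gene_unique_identifier': 'g2', 'gene_name': 'Ribosomal_L3', 'split': 'contig_2_split_00001'},
]

unique_gene_id_to_gene_name = {}
splits_unique_gene_id_occurs = defaultdict(list)
for hit in hits:
    unique_gene_id_to_gene_name.setdefault(hit['gene_unique_identifier'], hit['gene_name'])
    splits_unique_gene_id_occurs[hit['gene_unique_identifier']].append(hit['split'])

print(dict(splits_unique_gene_id_occurs))
# {'g1': ['contig_1_split_00001', 'contig_1_split_00002'], 'g2': ['contig_2_split_00001']}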
Example #25
    def __init__(self, args, run=terminal.Run(), progress=terminal.Progress(), progress_title=None):

        self.args = args
        self.run = run
        self.progress = progress

        up_to_date_modeller_exec = "mod9.20" # default exec to use

        A = lambda x, t: t(args.__dict__[x]) if x in self.args.__dict__ else None
        null = lambda x: x
        self.scoring_method = A('scoring_method', str)
        self.deviation = A('deviation', float)
        self.directory = A('directory', str)
        self.very_fast = A('very_fast', bool)
        self.executable = A('modeller_executable', null) or up_to_date_modeller_exec
        self.num_models = A('num_models', int)
        self.target_fasta_path = A('target_fasta_path', str)
        self.modeller_database = A('modeller_database', str) or "pdb_95"
        self.max_number_templates = A('max_number_templates', null)
        self.percent_identical_cutoff = A('percent_identical_cutoff', null)

        self.alignment_pap_path = None
        self.alignment_pir_path = None
        self.get_template_path = None
        self.search_results_path = None
        self.target_pir_path = None
        self.template_family_matrix_path = None
        self.template_info_path = None
        self.template_pdbs = None
        self.model_info = None

        self.logs = {}
        self.scripts = {}

        self.sanity_check()

        # as reward, whoever called this class will receive self.out when they run self.process()
        self.out = {
            "templates"                : {"pdb_id": [],"chain_id": [],"ppi": []},
            "models"                   : {"molpdf": [],"GA341_score": [],"DOPE_score": [],"picked_as_best": []},
            "corresponding_gene_call"  : self.corresponding_gene_call,
            "structure_exists"         : False,
            "best_model_path"          : None,
            "best_score"               : None,
            "scoring_method"           : self.scoring_method,
            "percent_identical_cutoff" : self.percent_identical_cutoff,
            "very_fast"                : self.very_fast,
            "deviation"                : self.deviation,
            }

        # All MODELLER databases are housed in self.database_dir
        self.database_dir = J(os.path.dirname(anvio.__file__), 'data/misc/MODELLER/db')

        # copy fasta into the working directory
        try:
            shutil.copy2(self.target_fasta_path, self.directory)
            self.target_fasta_path = J(self.directory, os.path.basename(self.target_fasta_path))
        except shutil.SameFileError:
            pass

        # store the original directory so we can cd back and forth between
        # self.directory and self.start_dir
        self.start_dir = os.getcwd()

        self.progress_title = progress_title
        if not self.progress_title:
            self.progress_title = "Running MODELLER for gene id {}".format(self.corresponding_gene_call)
Example #26
    def __init__(self,
                 args=None,
                 run=terminal.Run(),
                 progress=terminal.Progress()):
        # if a regular instance of `ContigsDBWorkflow` is being generated, we
        # expect it to have a parameter `args`. if there is no `args` given, we
        # assume the class is being inherited as a base class from within another
        if args:
            if len(self.__dict__):
                raise ConfigError(
                    "Something is wrong. You are inheriting `ContigsDBWorkflow` from "
                    "within another class, yet you are providing an `args` parameter. "
                    "This is not alright.")
            self.args = args
            self.name = 'contigs'
        else:
            if not len(self.__dict__):
                raise ConfigError(
                    "When you are *not* inheriting `ContigsDBWorkflow` from within "
                    "a super class, you must provide an `args` parameter."
                )

            if 'name' not in self.__dict__:
                raise ConfigError(
                    "The super class trying to inherit `ContigsDBWorkflow` does not "
                    "have a set `self.name`, which means there may be other things "
                    "wrong with it, hence anvi'o refuses to continue."
                )

        self.run = run
        self.progress = progress

        # initialize the base class
        WorkflowSuperClass.__init__(self)

        self.rules.extend([
            'anvi_script_reformat_fasta', 'remove_human_dna_using_centrifuge',
            'anvi_gen_contigs_database', 'export_gene_calls_for_centrifuge',
            'centrifuge', 'anvi_import_taxonomy', 'anvi_run_hmms',
            'anvi_run_ncbi_cogs', 'annotate_contigs_database',
            'anvi_get_sequences_for_gene_calls', 'emapper',
            'anvi_script_run_eggnog_mapper'
        ])

        self.general_params.extend(["fasta_txt"])

        self.dirs_dict.update({
            "FASTA_DIR": "01_FASTA",
            "CONTIGS_DIR": "02_CONTIGS"
        })

        self.default_config.update({
            "fasta_txt": "fasta.txt",
            "anvi_gen_contigs_database": {
                "--project-name": "{group}",
                "threads": 5
            },
            "centrifuge": {
                "threads": 5
            },
            "anvi_run_hmms": {
                "run": True,
                "threads": 20
            },
            "anvi_run_ncbi_cogs": {
                "run": True,
                "threads": 5
            },
            "anvi_script_reformat_fasta": {
                "run": True,
                "--simplify-names": True
            },
            "emapper": {
                "--database": "bact",
                "--usemem": True,
                "--override": True
            },
            "anvi_script_run_eggnog_mapper": {
                "--use-version": "0.12.6"
            }
        })

        self.rule_acceptable_params_dict['anvi_run_ncbi_cogs'] = [
            'run', '--cog-data-dir', '--sensitive', '--temporary-dir-path',
            '--search-with'
        ]

        self.rule_acceptable_params_dict['anvi_run_hmms'] = [
            'run', '--installed-hmm-profile', '--hmm-profile-dir'
        ]

        self.rule_acceptable_params_dict['centrifuge'] = ['run', 'db']

        self.rule_acceptable_params_dict['emapper'] = [
            '--database', '--usemem', '--override', 'path_to_emapper_dir'
        ]

        self.rule_acceptable_params_dict['anvi_script_run_eggnog_mapper'] = [
            'run', '--cog-data-dir', '--drop-previous-annotations',
            '--use-version'
        ]

        self.rule_acceptable_params_dict['anvi_script_reformat_fasta'] = \
                    ['run', '--simplify-names', '--keep-ids', '--exclude-ids', '--min-len']

        self.rule_acceptable_params_dict[
            'remove_human_dna_using_centrifuge'] = ['run']

        gen_contigs_params = ['--description', '--skip-gene-calling', '--external-gene-calls',
                              '--ignore-internal-stop-codons', '--skip-mindful-splitting',
                              '--contigs-fasta', '--project-name',
                              '--split-length', '--kmer-size']

        self.rule_acceptable_params_dict[
            'anvi_gen_contigs_database'] = gen_contigs_params
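The checks at the top of this constructor encode a small contract: either instantiate `ContigsDBWorkflow` directly with `args`, or inherit it from a subclass that populates its own attributes (including `self.name`) before delegating. A hedged sketch of the second pattern; the `MyWorkflow` class and its name are hypothetical:

# Illustrative subclassing sketch; `MyWorkflow` is hypothetical.
class MyWorkflow(ContigsDBWorkflow):
    def __init__(self, args):
        # populate self.__dict__ *before* delegating, so the base class
        # takes the "inherited from within another class" branch...
        self.args = args
        self.name = 'my_workflow'

        # ...and call the base __init__ without `args`, per the contract above.
        ContigsDBWorkflow.__init__(self)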
Example #27
# some tests for SCG taxonomy string processing

import argparse

import anvio.terminal as terminal
import anvio.scgtaxonomyops as scgtaxonomyops

levels_of_taxonomy = [
    "t_domain", "t_phylum", "t_class", "t_order", "t_family", "t_genus",
    "t_species"
]

c = scgtaxonomyops.PopulateContigsDatabaseWithSCGTaxonomy(
    argparse.Namespace(skip_sanity_check=True),
    run=terminal.Run(verbose=False))

p = scgtaxonomyops.SCGTaxonomyEstimator(argparse.Namespace(
    skip_sanity_check=True, skip_init=True),
                                        run=terminal.Run(verbose=False))

# `scg_raw_hits` is expected to be defined by the surrounding test code before these
# helpers are called: cX returns the consensus hit, cT one taxonomic level of it.
cX = lambda: c.get_consensus_hit(scg_raw_hits)
cT = lambda level: cX()[level]


def pX(scg_dict):
    for i in scg_dict:
        scg_dict[i]['tax_hash'] = scgtaxonomyops.HASH(scg_dict[i])
    return p.get_consensus_taxonomy(scg_dict)


# likewise, `scg_dict` is expected to be defined by the caller before pT is used.
pT = lambda level: pX(scg_dict)[level]
Example #28
    def do_profile_db(self):
        self.progress.update('Subsetting the profile database')

        bin_profile_db_path = os.path.join(self.bin_output_directory,
                                           'PROFILE.db')

        bin_profile_db = dbops.ProfileDatabase(bin_profile_db_path)
        bin_profile_db.touch()

        # copy-paste tables that will largely stay the same from the parent
        bin_profile_db.db.copy_paste(table_name='self',
                                     source_db_path=self.profile_db_path)
        bin_profile_db.db.copy_paste(table_name='views',
                                     source_db_path=self.profile_db_path)
        bin_profile_db.db.copy_paste(table_name='states',
                                     source_db_path=self.profile_db_path)

        # update some values
        bin_profile_db.db.update_meta_value('contigs_db_hash',
                                            self.contigs_db_hash)
        bin_profile_db.db.update_meta_value('available_clusterings', None)

        # setup the filtering rules for migrating data:
        tables = {}

        # this is to deal with merge atomic data tables that are stored in merged profiles.
        # they are being created on the fly during merge, so bin_profile_db.touch() did not
        # create them, and we have to do it here ourselves. while creating them in the target
        # db, we will also populate the tables dictionary for data migration:
        sample_names = self.summary.p_meta['samples']
        for table_name in t.atomic_data_table_structure[1:-1]:
            for target in ['splits', 'contigs']:
                new_table_name = '_'.join([table_name, target])
                new_table_structure = ['contig'] + sample_names + ['__parent__']
                new_table_types = ['text'] + ['numeric'] * len(sample_names) + ['text']
                bin_profile_db.db.create_table(new_table_name,
                                               new_table_structure,
                                               new_table_types)

                tables[new_table_name] = ('contig', self.split_names)

        bin_profile_db.disconnect()

        self.migrate_data(tables, self.profile_db_path, bin_profile_db_path)

        self.progress.end()

        if not self.skip_hierarchical_clustering:
            dbops.do_hierarchical_clusterings(self.split_names, bin_profile_db_path, constants.clustering_configs['merged'], self.database_paths,\
                                              self.bin_output_directory, default_clustering_config=constants.merged_default, \
                                              distance=self.distance, linkage=self.linkage, run=terminal.Run(verbose=False), progress=self.progress)
Example #29
import anvio
import anvio.utils as utils
import anvio.terminal as terminal

from anvio.tables.tableops import Table
from anvio.errors import ConfigError

__author__ = "Developers of anvi'o (see AUTHORS.txt)"
__copyright__ = "Copyleft 2015-2018, the Meren Lab (http://merenlab.org/)"
__credits__ = []
__license__ = "GPL 3.0"
__version__ = anvio.__version__
__maintainer__ = "A. Murat Eren"
__email__ = "*****@*****.**"
__status__ = "Development"

run = terminal.Run()
progress = terminal.Progress()
pp = terminal.pretty_print


class TableForGeneLevelCoverages(Table):
    def __init__(self,
                 db_path,
                 parameters,
                 mode,
                 split_names=None,
                 ignore_splits_name_check=False,
                 run=run,
                 progress=progress):
        self.run = run
        self.progress = progress
Example #30
    def __store_concatenated_hmm_sequences_into_FASTA(
            self,
            hmm_sequences_dict_for_splits,
            output_file_path,
            partition_file_path=None,
            wrap=120,
            separator='XXX',
            genes_order=None,
            align_with=None,
            just_do_it=False):
        """Generates concatenated sequences from `hmm_sequences_dict_for_splits` dict.

           Please do NOT access this function directly; use `store_hmm_sequences_into_FASTA`
           instead.
        """

        if len(self.sources) != 1:
            if just_do_it:
                self.run.warning(
                    "You have asked anvi'o to not pay attention to the fact that you are asking for genes to be concatenated "
                    "that are coming from different HMM collections. Fingers crossed. Please check the deflines of the "
                    "resulting FASTA file carefully.")
            else:
                raise ConfigError(
                    "In theory you should be requesting a single HMM source if you want your genes to be concatenated. "
                    "But in practice everyone has different needs, so we don't know. If this is not due to an error on "
                    "your part, and if you think you know what you are doing, you can ask anvi'o to let you concatenate "
                    "genes from multiple HMM sources by using the flag `--just-do-it`. In that case you will not see this "
                    "error, but you must be extremely careful to make sure the resulting file looks like it should, and "
                    "the information it contains makes sense. Since this not the common practice, you may run into other "
                    "errors downstream, for which we apologize in advance.")

        # if the user did not define a single HMM source, then it will recover all genes in all HMM sources.
        gene_names_in_source = []
        for _hmm_source in self.sources:
            gene_names_in_source.extend([
                g.strip()
                for g in self.hmm_hits_info[_hmm_source]['genes'].split(',')
            ])

        # this will hold the final set and order of gene names we will concatenate:
        gene_names = None

        # let's get an instance of the aligner early on so we learn about issues before it's too late.
        aligner = self.get_aligner(align_with)

        # let's learn about what we have in this dictionary first.
        bin_names_in_dict = list(set([x['bin_id'] for x in hmm_sequences_dict_for_splits.values()]))
        gene_names_in_dict = sorted(list(set([x['gene_name'] for x in hmm_sequences_dict_for_splits.values()])))

        # if the function is called with a particular set and order of genes, use those, otherwise
        # stick with the gene names / order we found in the dictionary.
        if genes_order:
            genes_in_genes_order_but_missing_in_hmm_source = [
                g for g in genes_order if g not in gene_names_in_source
            ]
            if len(genes_in_genes_order_but_missing_in_hmm_source):
                raise ConfigError("One or more gene names in the genes order list does seem to appear among the genes described "
                                  "by the HMM sources (which translates to 'terrible news'). Here are the genes that cause this "
                                  "issue if you want to fix this: '%s' (and here are the HMM sources you have been using for this "
                                  "operation in case it helps: '%s')." \
                                              % (', '.join(genes_in_genes_order_but_missing_in_hmm_source), ', '.join(self.sources)))
            gene_names = genes_order
        else:
            self.run.warning(
                "You did not define any gene names. Bold move. Now anvi'o will attempt to report a file with all "
                "genes defined in your HMM source(s). This will likely be quite ugly, so please brace yourself."
            )

            gene_names = gene_names_in_dict

        # gene lengths are especially important to accommodate missing genes with proper number of
        # gap characters
        gene_lengths = {}

        # build a simpler dict that keeps gene sequences for each bin for a given gene name
        genes_in_bins_dict = {}
        for entry in hmm_sequences_dict_for_splits.values():
            gene_name = entry['gene_name']
            bin_name = entry['bin_id']
            sequence = entry['sequence']
            if gene_name in genes_in_bins_dict:
                genes_in_bins_dict[gene_name][bin_name] = sequence
            else:
                genes_in_bins_dict[gene_name] = {bin_name: sequence}

        # align homolog sequences across bins
        self.progress.new('Aligning homolog gene sequences pre-concatenation')
        all_gene_names = list(genes_in_bins_dict.keys())
        num_genes = len(all_gene_names)
        for i in range(0, num_genes):
            gene_name = all_gene_names[i]
            self.progress.update('working on %s (%d of %d) ...' %
                                 (gene_name, i + 1, num_genes))
            genes_list = [(bin_name, genes_in_bins_dict[gene_name][bin_name])
                          for bin_name in genes_in_bins_dict[gene_name]]
            genes_in_bins_dict[gene_name] = aligner(run=terminal.Run(verbose=False)).run_stdin(genes_list)
            gene_lengths[gene_name] = len(list(genes_in_bins_dict[gene_name].values())[0])
        self.progress.end()

        # concatenate all of them and write them in a file
        f = open(output_file_path, 'w')
        gene_names_missing_from_everywhere = []
        for bin_name in bin_names_in_dict:
            sequences_list = []

            for gene_name in gene_names:
                if gene_name in genes_in_bins_dict:
                    if bin_name in genes_in_bins_dict[gene_name]:
                        sequences_list.append(
                            genes_in_bins_dict[gene_name][bin_name])
                    else:
                        sequences_list.append('-' * gene_lengths[gene_name])
                else:
                    # if we are here, it means this is a gene that has been missing from the hmm hits dict, since it
                    # was not in any of the bins the dict described, but the user requested to have it in the
                    # alignment anyway. This can happen when the user wants to concatenate genes from one or more
                    # low-completion bins. We will keep track of them, and tell the user.
                    sequences_list.append('-' * 42)
                    gene_names_missing_from_everywhere.append(gene_name)

            sequence = separator.join(sequences_list)

            if wrap:
                sequence = textwrap.fill(sequence,
                                         wrap,
                                         break_on_hyphens=False)

            f.write(
                '>%s num_genes:%d|genes:%s|separator:%s\n' %
                (bin_name, len(gene_names), ','.join(gene_names), separator))
            f.write('%s\n' % sequence)

        if len(gene_names_missing_from_everywhere):
            self.run.warning("You asked for some genes that were missing from all bins this class had in the "
            "HMM hits dictionary (the bins in question: '%s'). Not knowing what to do with this weird "
            "situation, anvi'o put gap characters for all of them and retained your order. Here are those "
            "genes that missed the party: '%s'" % \
                (', '.join(bin_names_in_dict), ', '.join(set(gene_names_missing_from_everywhere))))

        f.close()

        if partition_file_path:
            utils.gen_NEXUS_format_partition_file_for_phylogenomics(
                partition_file_path,
                [(g, gene_lengths[g]) for g in gene_names],
                separator,
                run=self.run,
                progress=self.progress)
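The concatenation step above reduces to: for each bin, walk the requested gene order, substitute the aligned sequence where the bin has the gene and a gap run where it does not, then join the pieces with the separator. A toy, self-contained sketch of that logic:

import textwrap

# Toy aligned sequences: gene -> {bin -> aligned sequence}, same length per gene.
genes_in_bins = {
    'RpsA': {'bin_1': 'ATG---AAA', 'bin_2': 'ATGCCCAAA'},
    'RpsB': {'bin_1': 'TTTGGG'},          # missing from bin_2
}
gene_lengths = {g: len(next(iter(bins.values()))) for g, bins in genes_in_bins.items()}
gene_order, separator = ['RpsA', 'RpsB'], 'XXX'

for bin_name in ['bin_1', 'bin_2']:
    pieces = [genes_in_bins[g].get(bin_name, '-' * gene_lengths[g]) for g in gene_order]
    concatenated = separator.join(pieces)
    print('>%s num_genes:%d|genes:%s|separator:%s' % (bin_name, len(gene_order), ','.join(gene_order), separator))
    print(textwrap.fill(concatenated, 120, break_on_hyphens=False))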