def __init__(self, args, run=terminal.Run(), progress=terminal.Progress(), skip_sanity_check=False):
    """Read the arguments, open the pan database / genomes storage, and run the sanity check.

    Parameters
    ==========
    args : object
        Any object that exposes its options through `__dict__` (e.g. an
        `argparse.Namespace`). Options that are missing are read as `None`.
    run : terminal.Run
        Stored as `self.run` and handed to the genomes storage.
    progress : terminal.Progress
        Stored as `self.progress` and handed to the genomes storage.
    skip_sanity_check : bool
        When True, `self.sanity_check()` is not called.

    Notes
    =====
    If `list_annotation_sources` is set, the annotation sources found in the genomes
    storage are reported and the process ends through `sys.exit()`.
    """

    self.args = args
    self.run = run
    self.progress = progress

    # every option is optional from the point of view of this lookup
    def A(option_name):
        return args.__dict__.get(option_name)

    self.annotation_source = A('annotation_source')
    self.window_range = A('ngram_window_range') or "2:3"
    self.is_in_unknowns_mode = A('analyze_unknown_functions')
    self.output_file = A('output_file')
    self.skip_init_functions = A('skip_init_functions')
    self.genome_names_to_focus = A('genome_names')
    self.ngram_source = A("ngram_source")

    self.annotation_source_dict = {}

    self.pan_db_path = A('pan_db')

    # `self.annotation_sources` is only set when both an annotation source and a pan db are given
    if self.annotation_source and self.pan_db_path:
        self.annotation_sources = [self.annotation_source, 'gene_clusters']

    if not self.pan_db_path:
        self.pan_db = None
    else:
        self.pan_db = PanDatabase(self.pan_db_path)
        self.p_meta = self.pan_db.meta

        if 'creation_date' in self.p_meta:
            self.p_meta['creation_date'] = utils.get_time_to_date(self.p_meta['creation_date'])
        else:
            self.p_meta['creation_date'] = 'unknown'

        # external names first, then internal ones; empty entries are dropped before sorting
        names_in_pan_db = self.p_meta['external_genome_names'].split(',') + \
                          self.p_meta['internal_genome_names'].split(',')
        self.p_meta['genome_names'] = sorted(name.strip() for name in names_in_pan_db if name)
        self.p_meta['num_genomes'] = len(self.p_meta['genome_names'])

        self.genome_names = self.p_meta['genome_names']
        self.gene_clusters_gene_alignments_available = self.p_meta['gene_alignments_computed']

    self.genomes_storage_path = A('genomes_storage')

    shared_storage_kwargs = {'skip_init_functions': self.skip_init_functions,
                             'run': self.run,
                             'progress': self.progress}

    if self.pan_db:
        # with a pangenome, the storage is opened with the hash recorded in the pan database
        # and restricted to the genomes of the pangenome
        self.genomes_storage = genomestorage.GenomeStorage(self.genomes_storage_path,
                                                           self.p_meta['genomes_storage_hash'],
                                                           genome_names_to_focus=self.p_meta['genome_names'],
                                                           **shared_storage_kwargs)
    else:
        self.genomes_storage = genomestorage.GenomeStorage(self.genomes_storage_path,
                                                           **shared_storage_kwargs)

    # list-annotation-resources
    self.list_annotation_sources = A('list_annotation_sources')
    function_calls = self.genomes_storage.db.get_table_as_dataframe('gene_function_calls')
    self.gene_function_source_set = function_calls.source.unique()

    if self.list_annotation_sources:
        self.run.info('Available functional annotation sources', ', '.join(self.gene_function_source_set))
        sys.exit()

    # this houses the ngrams' data
    self.ngram_attributes_list = []

    # focus on a specific set of genomes
    # NOTE(review): the extent of this `if` body was reconstructed from a whitespace-collapsed
    # source -- confirm the nesting against the original file.
    if self.genome_names_to_focus:
        if filesnpaths.is_file_exists(self.genome_names_to_focus, dont_raise=True):
            # the value is a path to a single-column file of genome names
            self.genome_names_to_focus = utils.get_column_data_from_TAB_delim_file(self.genome_names_to_focus,
                                                                                   column_indices=[0],
                                                                                   expected_number_of_fields=1)[0]
        else:
            # otherwise the value is a comma-separated list of genome names
            self.genome_names_to_focus = [name.strip() for name in self.genome_names_to_focus.split(',')]

        self.run.warning("A subset of genome names is found, and anvi'o will focus only on to those.")

        # the storage is re-opened for the subset (without a storage hash)
        self.genomes_storage = genomestorage.GenomeStorage(self.genomes_storage_path,
                                                           storage_hash=None,
                                                           genome_names_to_focus=self.genome_names_to_focus)
        self.genomes = self.genomes_storage.get_genomes_dict()

        self.external_genome_names = [name for name in self.genomes if self.genomes[name]['external_genome']]
        self.internal_genome_names = [name for name in self.genomes if not self.genomes[name]['external_genome']]

        self.hash_to_genome_name = {self.genomes[name]['genome_hash']: name for name in self.genomes}

        # number of genomes in genome-storage
        self.num_contigs_in_external_genomes_with_genes = len(self.genomes)

    # number of genomes in genome-storage
    if self.genome_names_to_focus:
        num_genomes = len(self.genome_names_to_focus)
    else:
        num_genomes = len(self.genomes_storage.get_all_genome_names())
    self.num_contigs_in_external_genomes_with_genes = num_genomes

    if not skip_sanity_check:
        self.sanity_check()

    # unless we are in debug mode, let's keep things quiet.
    self.run_object = terminal.Run() if anvio.DEBUG else terminal.Run(verbose=False)
def __init__(self, args, r=terminal.Run(), p=terminal.Progress()):
    """Keep the arguments and terminal helpers, then initialize the paired-end reads configuration.

    Parameters
    ==========
    args : object
        Stored as `self.args` and passed on to `configs.PairedEndReadsConfiguration.__init__`.
    r : terminal.Run
        Stored as `self.run`.
    p : terminal.Progress
        Stored as `self.progress`.
    """

    self.args, self.run, self.progress = args, r, p

    # the configuration initializer is called last, once the attributes above exist
    configs.PairedEndReadsConfiguration.__init__(self, args)
def __init__(self, args=None, run=terminal.Run(), progress=terminal.Progress()):
    """Set up the rules, directories, default config and acceptable parameters of the contigs workflow.

    Parameters
    ==========
    args : object, optional
        Passed on to `self.init_workflow_super_class`.
    run : terminal.Run
        Not used by this initializer (kept for interface compatibility).
    progress : terminal.Progress
        Not used by this initializer (kept for interface compatibility).
    """

    self.init_workflow_super_class(args, workflow_name='contigs')

    self.group_names = []
    self.contigs_information = {}
    self.fasta_txt_file = None
    self.fasta_information = {}

    # we have external_genomes_file defined here for the sake of pangenomics and
    # phylogenomics workflows
    self.external_genomes_file = ''

    # we have references_mode defined here for the sake of the metagenomics workflow
    # (it is only used when this workflow is inherited)
    self.references_mode = None

    self.import_external_functions_flags = []

    self.rules.extend(['gen_external_genome_file',
                       'anvi_script_reformat_fasta',
                       'anvi_gen_contigs_database',
                       'export_gene_calls_for_centrifuge',
                       'centrifuge',
                       'anvi_import_taxonomy_for_genes',
                       'anvi_run_scg_taxonomy',
                       'anvi_run_trna_scan',
                       'anvi_run_hmms',
                       'anvi_run_ncbi_cogs',
                       'annotate_contigs_database',
                       'anvi_get_sequences_for_gene_calls',
                       'emapper',
                       'anvi_script_run_eggnog_mapper',
                       'gunzip_fasta',
                       'reformat_external_gene_calls_table',
                       'reformat_external_functions',
                       'import_external_functions',
                       'anvi_run_pfams',
                       'anvi_run_kegg_kofams'])

    self.general_params.extend(["fasta_txt"])

    self.dirs_dict.update({"FASTA_DIR": "01_FASTA",
                           "CONTIGS_DIR": "02_CONTIGS"})

    self.default_config.update({
        "fasta_txt": "fasta.txt",
        "anvi_gen_contigs_database": {"--project-name": "{group}"},
        "centrifuge": {"threads": 2},
        "anvi_run_hmms": {"run": True, "threads": 5, "--also-scan-trnas": True},
        "anvi_run_kegg_kofams": {"run": True, "threads": 4},
        "anvi_run_ncbi_cogs": {"run": True, "threads": 5},
        "anvi_run_scg_taxonomy": {"run": True, "threads": 6},
        "anvi_run_trna_scan": {"run": False, "threads": 6},
        "anvi_script_reformat_fasta": {"run": True, "--prefix": "{group}", "--simplify-names": True},
        "emapper": {"--database": "bact", "--usemem": True, "--override": True},
        "anvi_script_run_eggnog_mapper": {"--use-version": "0.12.6"},
    })

    # parameters each rule accepts from the config file
    self.rule_acceptable_params_dict['anvi_run_ncbi_cogs'] = ['run', '--cog-data-dir', '--sensitive',
                                                              '--temporary-dir-path', '--search-with']
    self.rule_acceptable_params_dict['anvi_run_scg_taxonomy'] = ['run', '--scgs-taxonomy-data-dir']
    self.rule_acceptable_params_dict['anvi_run_trna_scan'] = ['run', '--trna-cutoff-score']
    self.rule_acceptable_params_dict['anvi_run_hmms'] = ['run', '--installed-hmm-profile',
                                                         '--hmm-profile-dir', '--also-scan-trnas']
    self.rule_acceptable_params_dict['anvi_run_pfams'] = ['run', '--pfam-data-dir']
    self.rule_acceptable_params_dict['anvi_run_kegg_kofams'] = ['run', '--kegg-data-dir', '--hmmer-program',
                                                                '--keep-all-hits', '--log-bitscores',
                                                                '--just-do-it']
    self.rule_acceptable_params_dict['centrifuge'] = ['run', 'db']
    self.rule_acceptable_params_dict['emapper'] = ['--database', '--usemem', '--override',
                                                   'path_to_emapper_dir']
    self.rule_acceptable_params_dict['anvi_script_run_eggnog_mapper'] = ['run', '--cog-data-dir',
                                                                         '--drop-previous-annotations',
                                                                         '--use-version']
    self.rule_acceptable_params_dict['anvi_script_reformat_fasta'] = ['run', '--keep-ids', '--exclude-ids',
                                                                      '--min-len', "--prefix",
                                                                      "--simplify-names", "--seq-type"]

    # each flag is listed exactly once (the list used to repeat `--description`,
    # `--skip-gene-calling`, `--ignore-internal-stop-codons` and `--skip-mindful-splitting`);
    # the order of first appearance is kept.
    gen_contigs_params = ['--description', '--skip-gene-calling',
                          '--ignore-internal-stop-codons', '--skip-mindful-splitting',
                          '--contigs-fasta', '--project-name',
                          '--split-length', '--kmer-size',
                          '--skip-predict-frame', '--prodigal-translation-table']

    self.rule_acceptable_params_dict['anvi_gen_contigs_database'] = gen_contigs_params
def do_profile_db(self):
    """Create the profile database of the bin being split and fill it from the parent profile.

    The new database at `self.bin_profile_db_path` gets the `self`, `views` and `states`
    tables of the parent copied over, a few meta values updated, the atomic data tables
    created, and the relevant rows migrated through `self.migrate_data`. Unless
    `self.skip_hierarchical_clustering` is set, items are then clustered, and finally a
    `DEFAULT` collection with a single `ALL_SPLITS` bin is added.
    """

    # are we working with a merged profile database?
    merged = self.summary.p_meta['merged']
    self.run.info('Merged database', 'True' if merged else 'False')

    self.progress.new('Splitting "%s"' % self.bin_id)
    # the parentheses are essential: `%` binds tighter than the conditional expression, and
    # without them the message for a non-merged profile is just the string 'single'.
    self.progress.update('Subsetting the %s profile database' % ('merged' if merged else 'single'))

    bin_profile_db = dbops.ProfileDatabase(self.bin_profile_db_path)
    bin_profile_db.touch()

    # copy-paste tables that will largely stay the same from the parent
    for table_name in ['self', 'views', 'states']:
        bin_profile_db.db.copy_paste(table_name=table_name, source_db_path=self.profile_db_path)

    # update some values
    bin_profile_db.db.update_meta_value('contigs_db_hash', self.contigs_db_hash)
    bin_profile_db.db.update_meta_value('available_clusterings', None)
    bin_profile_db.db.update_meta_value('sample_id', self.bin_id)

    # setup the filtering rules for migrating data: table name -> (column to filter on, values to keep)
    tables = {}

    # this is to deal with merge atomic data tables that are stored in merged profiles.
    # they are being created on the fly during merge, so bin_profile_db.touch() did not
    # create them, and we have to do it here ourselves. while creating them in the target
    # db, we will also populate the tables dictionary for data migration:
    sample_names = self.summary.p_meta['samples']
    if merged:
        # one column per sample between the `contig` and `__parent__` columns
        new_table_structure = ['contig'] + sample_names + ['__parent__']
        new_table_types = ['text'] + ['numeric'] * len(sample_names) + ['text']

        for table_name in t.atomic_data_table_structure[1:-1]:
            for target in ['splits', 'contigs']:
                new_table_name = '_'.join([table_name, target])
                bin_profile_db.db.create_table(new_table_name, new_table_structure, new_table_types)
                tables[new_table_name] = ('contig', self.split_names)
    else:
        # both tables are created with the structure and types of `atomic_data_contigs` in
        # the parent. the parent connection is only needed for these two lookups, so it is
        # closed right after.
        profile_db = dbops.ProfileDatabase(self.profile_db_path)
        try:
            table_structure = profile_db.db.get_table_structure('atomic_data_contigs')
            table_types = profile_db.db.get_table_column_types('atomic_data_contigs')
        finally:
            profile_db.disconnect()

        for table_name in ['atomic_data_splits', 'atomic_data_contigs']:
            bin_profile_db.db.create_table(table_name, table_structure, table_types)
            tables[table_name] = ('contig', self.split_names)

    # we need to migrate these guys, too. unless we don't need to... if we are migrating,
    # the values in the self table are already accurate. if we are skipping, regardless
    # of what the values were, we will set the absolute correct ones.
    if self.skip_variability_tables:
        bin_profile_db.db.update_meta_value('SNVs_profiled', False)
        bin_profile_db.db.update_meta_value('SCVs_profiled', False)
    else:
        tables[t.variable_nts_table_name] = ('split_name', self.split_names)
        tables[t.variable_codons_table_name] = ('corresponding_gene_call', self.gene_caller_ids)

    bin_profile_db.disconnect()

    self.migrate_data(tables, self.profile_db_path, self.bin_profile_db_path)

    self.progress.end()

    if not self.skip_hierarchical_clustering:
        # NOTE(review): `default_clustering_config` is `constants.merged_default` for single
        # profiles as well -- confirm this is intended.
        dbops.do_hierarchical_clustering_of_items(self.bin_profile_db_path,
                                                  constants.clustering_configs['merged' if merged else 'single'],
                                                  self.split_names,
                                                  self.database_paths,
                                                  input_directory=self.bin_output_directory,
                                                  default_clustering_config=constants.merged_default,
                                                  distance=self.distance,
                                                  linkage=self.linkage,
                                                  run=terminal.Run(verbose=False),
                                                  progress=self.progress)

    # add a collection
    collection_dict = {'ALL_SPLITS': self.split_names}
    bins_info_dict = {'ALL_SPLITS': {'html_color': '#FF0000', 'source': 'anvi-split'}}
    collections = TablesForCollections(self.bin_profile_db_path)
    collections.append('DEFAULT', collection_dict, bins_info_dict=bins_info_dict)
def __init__(self, args):
    """Read the profiling arguments, validate flag combinations, and initialize the contigs database.

    Parameters
    ==========
    args : object
        Any object that exposes its options through `__dict__` (e.g. an
        `argparse.Namespace`). Options that are missing are read as `None`; note that
        `num_threads`, `queue_size` and `write_buffer_size` are converted with `int()`
        and therefore must be present.

    Raises
    ======
    ConfigError
        On contradictory clustering flags, or when no contigs database is given.

    Notes
    =====
    If `list_contigs` is set, `self.list_contigs()` is called and the process ends
    through `sys.exit()`.
    """

    self.args = args

    A = lambda x: args.__dict__[x] if x in args.__dict__ else None
    self.input_file_path = A('input_file')
    self.contigs_db_path = A('contigs_db')
    self.serialized_profile_path = A('serialized_profile')
    self.output_directory = A('output_dir')
    self.list_contigs_and_exit = A('list_contigs')
    self.min_contig_length = A('min_contig_length')
    self.min_mean_coverage = A('min_mean_coverage')
    self.min_coverage_for_variability = A('min_coverage_for_variability')
    self.contigs_shall_be_clustered = A('cluster_contigs')
    self.skip_hierarchical_clustering = A('skip_hierarchical_clustering')
    self.sample_id = A('sample_name')
    self.report_variability_full = A('report_variability_full')
    self.overwrite_output_destinations = A('overwrite_output_destinations')
    self.skip_SNV_profiling = A('skip_SNV_profiling')
    self.profile_SCVs = A('profile_SCVs')
    self.gen_serialized_profile = A('gen_serialized_profile')
    self.distance = A('distance') or constants.distance_metric_default
    self.linkage = A('linkage') or constants.linkage_method_default
    self.num_threads = int(A('num_threads'))
    self.queue_size = int(A('queue_size'))
    self.write_buffer_size = int(A('write_buffer_size'))
    self.total_length_of_all_contigs = 0
    self.total_coverage_values_for_all_contigs = 0
    self.description_file_path = A('description')

    # make sure early on that both the distance and linkage is OK.
    clustering.is_distance_and_linkage_compatible(self.distance, self.linkage)

    # whether the profile database is a blank (without any BAM files or reads):
    self.blank = A('blank_profile')

    if not self.blank and self.contigs_shall_be_clustered and self.skip_hierarchical_clustering:
        raise ConfigError("You are confused, and confusing anvi'o, too. You can't as hierarchical clustering\
                            to be performed with one flag, and try to skip it with another one :(")

    if self.blank and self.contigs_shall_be_clustered and self.skip_hierarchical_clustering:
        raise ConfigError("So you want to generate a blank profile, and you both want hierarchical clustering\
                            of your contigs to be performed, and skipped. No.")

    if self.blank and self.contigs_shall_be_clustered:
        raise ConfigError("When the blank profile is asked to be generated, there is no need to ask for the\
                            hierarchical clustering of contigs. It is going to be done by default. If it is\
                            not changing anything, why is anvi'o upset with you? Because. Let's don't use flags\
                            we don't need.")

    # a blank profile is clustered by default
    if self.blank and not self.skip_hierarchical_clustering:
        self.contigs_shall_be_clustered = True

    # read like every other option, so a namespace without this attribute means "no filter"
    contigs_of_interest_path = A('contigs_of_interest')
    if contigs_of_interest_path:
        filesnpaths.is_file_exists(contigs_of_interest_path)

        # one contig name per line; empty lines and lines starting with `#` are ignored. the
        # context manager makes sure the file handle is closed once the names are read.
        with open(contigs_of_interest_path) as contigs_of_interest_file:
            self.contig_names_of_interest = set(c.strip() for c in contigs_of_interest_file
                                                if c.strip() and not c.startswith('#'))
    else:
        self.contig_names_of_interest = None

    self.progress = terminal.Progress()
    self.run = terminal.Run(width=35)

    if self.list_contigs_and_exit:
        self.list_contigs()
        sys.exit()

    if not self.contigs_db_path:
        raise ConfigError("No contigs database, no profilin'. Bye.")

    # Initialize contigs db
    dbops.ContigsSuperclass.__init__(self, self.args, r=self.run, p=self.progress)
    self.init_contig_sequences()
    self.contig_names_in_contigs_db = set(self.contigs_basic_info.keys())

    self.bam = None
    self.contigs = []

    self.database_paths = {'CONTIGS.db': os.path.abspath(self.contigs_db_path)}

    self.profile_db_path = None

    self.clustering_configs = constants.clustering_configs['blank' if self.blank else 'single']

    # following variable will be populated during the profiling, and its content will eventually
    # be stored in t.variable_nts_table_name
    self.variable_nts_table_entries = []

    # if genes are not called, yet the user is asking for codon frequencies to be profiled, we give
    # a warning and force-turn that flag off.
    if (not self.a_meta['genes_are_called']) and self.profile_SCVs:
        self.run.warning("You asked the codon frequencies to be profiled, but genes were not called\
                          for your contigs database. Anvi'o is assigning `False` to the profile-codon-frequncies\
                          flag, overruling your request like a boss.")
        self.profile_SCVs = False

    # following variable will be populated while the variable positions table is computed
    self.codons_in_genes_to_profile_SCVs = set()

    # we don't know what we are about
    self.description = None

    # additional layer data will be filled later
    self.layer_additional_keys = []
    self.layer_additional_data = {}
def __init__(self, args=None, run=terminal.Run(), progress=terminal.Progress()):
    """Declare the rules, acceptable parameters, defaults and directories of the trnaseq workflow.

    Parameters
    ==========
    args : object, optional
        Passed on to `self.init_workflow_super_class`.
    run : terminal.Run
        Not used by this initializer.
    progress : terminal.Progress
        Not used by this initializer.
    """

    self.init_workflow_super_class(args, workflow_name='trnaseq')

    self.rules.extend(['iu_merge_pairs',
                       'anvi_reformat_fasta',
                       'anvi_trnaseq',
                       'anvi_merge_trnaseq',
                       'anvi_run_trna_taxonomy',
                       'anvi_tabulate_trnaseq'])

    # "General" section of the workflow config file.
    self.general_params.extend(['samples_txt'])

    # Parameters for each rule that are accessible in the config file.
    self.rule_acceptable_params_dict.update({
        'iu_merge_pairs': [
            'run',
            '--gzip-output',
            '--marker-gene-stringent',
            '--max-num-mismatches',
            '--report-r1-prefix',
            '--report-r2-prefix',
        ],
        'anvi_reformat_fasta': [
            'run',
            '--gzip-output',
            '--simplify-names',
        ],
        'anvi_trnaseq': [
            'run',
            '--treatment',
            '--overwrite-output-destinations',
            '--description',
            '--write-checkpoints',
            '--load-checkpoint',
            '--feature-param-file',
            '--threeprime-termini',
            '--min-length-long-fiveprime',
            '--min-trna-fragment-size',
            '--agglomeration-max-mismatch-freq',
            '--skip-INDEL-profiling',
            '--max-indel-freq',
            '--left-indel-buffer',
            '--right-indel-buffer',
            '--skip-fasta-check',
            '--alignment-target-chunk-size',
            '--profiling-chunk-size',
        ],
        'anvi_merge_trnaseq': [
            'run',
            '--project-name',
            '--max-reported-trna-seeds',
            '--overwrite-output-destinations',
            '--description',
            '--feature-threshold',
            '--preferred-treatment',
            '--nonspecific-output',
            '--min-variation',
            '--min-third-fourth-nt',
            '--min-indel-fraction',
            '--distance',
            '--linkage',
        ],
        'anvi_run_trna_taxonomy': [
            'run',
            '--trna-taxonomy-data-dir',
            '--min-percent-identity',
            '--max-num-target-sequences',
            '--num-parallel-processes',
            '--write-buffer-size',
        ],
        'anvi_tabulate_trnaseq': [
            'run',
            '--overwrite-output-destinations',
        ],
    })

    def default_of(argument_name):
        """Return the `default` recorded for `argument_name` in `anvio.D`."""
        return anvio.D[argument_name][1]['default']

    # Default values for accessible parameters: all defaults are written to the config file so
    # the user can see them succinctly.
    # Though the workflow superclass automatically adds a threads argument of "" to each
    # workflow, here we make explicit that the default is 1 and the user does not need to
    # enclose the value in quotes. Likewise, the superclass adds mandatory arguments at the end
    # of the list for each rule in the config file, but we explicitly add them here to ensure
    # they appear in the order of each script's help display.
    self.default_config.update({
        'samples_txt': 'samples.txt',
        'iu_merge_pairs': {
            'run': True,
            '--gzip-output': False,
            '--marker-gene-stringent': True,
            '--max-num-mismatches': 0,
            '--report-r1-prefix': False,
            '--report-r2-prefix': False,
            'threads': 1,
        },
        'anvi_reformat_fasta': {
            'run': True,
            '--gzip-output': False,  # not an argument of anvi-script-reformat-fasta
            '--simplify-names': True,  # not the default in anvi-script-reformat-fasta
            'threads': 1,
        },
        'anvi_trnaseq': {
            'run': True,
            '--treatment': "",  # if provided in the config file, the treatment is assumed to be for all samples
            '--overwrite-output-destinations': default_of('overwrite-output-destinations'),
            '--description': "",
            '--write-checkpoints': default_of('write-checkpoints'),
            '--load-checkpoint': "",
            '--feature-param-file': "",
            '--threeprime-termini': default_of('threeprime-termini'),
            '--min-length-long-fiveprime': default_of('min-length-long-fiveprime'),
            '--min-trna-fragment-size': default_of('min-trna-fragment-size'),
            '--agglomeration-max-mismatch-freq': default_of('agglomeration-max-mismatch-freq'),
            '--skip-INDEL-profiling': default_of('skip-INDEL-profiling'),
            '--max-indel-freq': default_of('max-indel-freq'),
            '--left-indel-buffer': default_of('left-indel-buffer'),
            '--right-indel-buffer': default_of('right-indel-buffer'),
            '--skip-fasta-check': True,  # not the default in anvi-trnaseq
            '--profiling-chunk-size': default_of('profiling-chunk-size'),
            '--alignment-target-chunk-size': default_of('alignment-target-chunk-size'),
            'threads': 1,
        },
        'anvi_merge_trnaseq': {
            'run': True,
            '--project-name': "",
            '--max-reported-trna-seeds': default_of('max-reported-trna-seeds'),
            '--overwrite-output-destinations': default_of('overwrite-output-destinations'),
            '--description': "",
            '--feature-threshold': default_of('feature-threshold'),
            '--preferred-treatment': "",
            '--nonspecific-output': default_of('nonspecific-output'),
            '--min-variation': default_of('min-variation'),
            '--min-third-fourth-nt': default_of('min-third-fourth-nt'),
            '--min-indel-fraction': default_of('min-indel-fraction'),
            '--distance': default_of('distance'),
            '--linkage': default_of('linkage'),
            'threads': 1,
        },
        'anvi_run_trna_taxonomy': {
            'run': True,
            '--trna-taxonomy-data-dir': "",
            '--min-percent-identity': 90,  # default in anvi-run-trna-taxonomy
            '--max-num-target-sequences': 100,  # default in anvi-run-trna-taxonomy
            '--num-parallel-processes': default_of('num-parallel-processes'),
            '--write-buffer-size': default_of('write-buffer-size'),
            'threads': 1,
        },
        'anvi_tabulate_trnaseq': {
            'run': True,
            '--overwrite-output-destinations': default_of('overwrite-output-destinations'),
            'threads': 1,
        },
        'output_dirs': {},  # This ensures that output_dirs comes before max_threads in the file
        'max_threads': 1,
    })

    self.dirs_dict.update({'QC_DIR': '01_QC',
                           'IDENT_DIR': '02_IDENT',
                           'CONVERT_DIR': '03_CONVERT'})
def populate_misc_data_tables(self):
    """Add layer orders and additional layer data to the merged profile database.

    For every essential atomic data field a newick tree is computed from the matching
    view of the merged profile and stored as a layer order. Fields for which this fails
    are collected and reported in a warning rather than stopping the process. If no
    order could be computed at all, the function warns and returns without adding the
    additional layer data.
    """

    self.run.info_single("Additional data and layer orders...", nl_before=1, nl_after=1, mc="blue")

    essential_fields = [f for f in self.atomic_data_fields if constants.IS_ESSENTIAL_FIELD(f)]

    # initialize views.
    args = argparse.Namespace(profile_db=self.merged_profile_db_path)
    profile_db_super = dbops.ProfileSuperclass(args)
    profile_db_super.load_views(omit_parent_column=True)

    # figure out layer orders dictionary
    layer_orders_data_dict = {}
    failed_attempts = []
    self.progress.new('Working on layer orders')
    for essential_field in essential_fields:
        self.progress.update('recovering order for "%s"' % (essential_field))
        try:
            data_value = clustering.get_newick_tree_data_for_dict(profile_db_super.views[essential_field]['dict'],
                                                                  distance=self.distance,
                                                                  linkage=self.linkage,
                                                                  transpose=True)
        except Exception:
            # this is a deliberate best-effort: a field that can't be clustered is reported
            # below. `Exception` (rather than a bare `except`) keeps KeyboardInterrupt and
            # SystemExit from being silently swallowed.
            failed_attempts.append(essential_field)
        else:
            layer_orders_data_dict[essential_field] = {'data_value': data_value, 'data_type': 'newick'}
    self.progress.end()

    if not layer_orders_data_dict:
        self.run.warning("This may or may not be important: anvi'o attempted to generate orders for your\
                          samples based on the view data, however, it failed :/")
        return

    if failed_attempts:
        self.run.warning("While anvi'o was trying to generate clusterings of samples based on view data\
                          available in the merged profile, clustering of some of the essential data\
                          failed. It is likely not a very big deal, but you shall be the judge of it.\
                          Anvi'o now proceeds to store layers order information for those view items\
                          the clustering in fact worked. Here is the list of stuff that failed: '%s'"\
                          % (', '.join(failed_attempts)))

    # add the layer orders quietly
    TableForLayerOrders(args, r=terminal.Run(verbose=False)).add(layer_orders_data_dict)
    self.run.warning(None, header="Layer orders added", lc='cyan')
    for layer_order in layer_orders_data_dict:
        self.run.info_single(layer_order, mc='cyan')

    # done with layer orders. let's add our layer additional data and call it a day.
    for data_group_name in self.layer_additional_data_dict:
        TableForLayerAdditionalData(args, r=terminal.Run(verbose=False)).add(self.layer_additional_data_dict[data_group_name],
                                                                             list(self.layer_additional_data_keys[data_group_name]),
                                                                             data_group=data_group_name)

    self.run.warning(None, header="Data groups added", lc='cyan')
    for data_group in self.layer_additional_data_dict:
        self.run.info_single('%s (w/%d items)' % (data_group, len(self.layer_additional_data_keys[data_group])), mc='cyan')
def __init__(self, args=None, run=terminal.Run(), progress=terminal.Progress()):
    """Set up the phylogenomics workflow, either on its own or as a base of another workflow.

    Parameters
    ==========
    args : object, optional
        Expected when the class is instantiated directly; must be omitted when another
        class initializes this one as its base.
    run : terminal.Run
        Stored as `self.run`.
    progress : terminal.Progress
        Stored as `self.progress`.

    Raises
    ======
    ConfigError
        When `args` is given although the instance already carries attributes, when
        `args` is missing although the instance carries none, or when an inheriting
        class did not set `self.name`.
    """

    # a regular instance of `PhylogenomicsWorkflow` is expected to come with `args` and
    # an instance without any attributes yet. without `args`, we assume an inheriting
    # class is initializing us, in which case it must have populated the instance already.
    instance_has_attributes = bool(len(self.__dict__))

    if args:
        if instance_has_attributes:
            raise ConfigError("Something is wrong. You are ineriting `PhylogenomicsWorkflow` from \
                               within another class, yet you are providing an `args` parameter.\
                               This is not alright.")

        self.args = args
        self.name = 'phylogenomics'
    else:
        if not instance_has_attributes:
            raise ConfigError("When you are *not* inheriting `PhylogenomicsWorkflow` from within\
                               a super class, you must provide an `args` parameter.")

        if 'name' not in self.__dict__:
            raise ConfigError("The super class trying to inherit `PhylogenomicsWorkflow` does not\
                               have a set `self.name`. Which means there may be other things\
                               wrong with it, hence anvi'o refuses to continue.")

    self.run = run
    self.progress = progress

    self.input_for_anvi_get_sequences_for_hmm_hits = {}
    self.internal_genomes_file = ''
    self.external_genomes_file = ''

    # initialize the base class
    WorkflowSuperClass.__init__(self)

    self.rules.extend(['anvi_get_sequences_for_hmm_hits', 'trimal', 'iqtree'])

    self.general_params.extend(['project_name'])

    self.dirs_dict.update({"PHYLO_DIR": "01_PHYLOGENOMICS"})

    self.default_config.update({
        'anvi_get_sequences_for_hmm_hits': {
            '--return-best-hit': True,
            '--align-with': 'famsa',
            '--concatenate-genes': True,
            '--get-aa-sequences': True,
            '--hmm-sources': 'Campbell_et_al',
        },
        'trimal': {'-gt': 0.5},
        'iqtree': {'threads': 8, '-m': 'WAG', '-bb': 1000},
    })

    # parameters each rule accepts from the config file
    self.rule_acceptable_params_dict['anvi_get_sequences_for_hmm_hits'] = [
        '--external-genomes',
        '--internal-genomes',
        '--return-best-hit',
        '--separator',
        '--align-with',
        '--min-num-bins-gene-occurs',
        '--max-num-genes-missing-from-bin',
        '--concatenate-genes',
        '--get-aa-sequences',
        '--gene-names',
        '--hmm-sources',
    ]
    self.rule_acceptable_params_dict['trimal'] = ['-gt', 'additional_params']
    self.rule_acceptable_params_dict['iqtree'] = ['-m', '-bb', 'additional_params']
import anvio.auxiliarydataops as auxiliarydataops

from anvio.errors import ConfigError
from anvio.constants import codon_to_AA


__author__ = "A. Murat Eren"
__copyright__ = "Copyright 2015, The anvio Project"
__credits__ = []
__license__ = "GPL 3.0"
__version__ = anvio.__version__
__maintainer__ = "A. Murat Eren"
__email__ = "*****@*****.**"


pp = terminal.pretty_print
progress = terminal.Progress()
run = terminal.Run(width=62)


class VariabilitySuper(object):
    """Base class holding the arguments and data containers for variability analyses."""

    def __init__(self, args={}, p=progress, r=run):
        """Store `args` and read the options this class needs from it.

        Parameters
        ==========
        args : dict or object
            Either a plain dictionary of options, or any object that exposes its options
            through `__dict__` (e.g. an `argparse.Namespace`). Options that are missing
            are read as `None`.
        p : terminal.Progress
            Not used in the part of the initializer shown here.
        r : terminal.Run
            Not used in the part of the initializer shown here.
        """

        # NOTE: the `{}` default is shared between calls; it is never mutated here.
        self.args = args

        self.data = {}

        self.splits_of_interest = set([])
        self.samples_of_interest = set([])

        # a plain dict (such as the default value of `args`) has no `__dict__`, so reading
        # `args.__dict__` unconditionally raises AttributeError for it. `vars()` returns the
        # very same `__dict__` object for namespace-like inputs.
        arg_values = args if isinstance(args, dict) else vars(args)

        # `t` converts the raw value to the desired type; missing options resolve to None
        A = lambda x, t: t(arg_values[x]) if x in arg_values else None
        null = lambda x: x
        self.bin_id = A('bin_id', null)
        self.collection_name = A('collection_name', null)
def __store_concatenated_hmm_sequences_into_FASTA(self, hmm_sequences_dict_for_splits, output_file_path, wrap=120,
                                                  concatenate_genes=False, separator='XXX', genes_order=None,
                                                  align_with=None):
    """Generates concatenated sequences from `hmm_sequences_dict_for_splits` dict.

    Please do NOT directly access to this function, and use `store_hmm_sequences_into_FASTA`
    instead.

    Parameters
    ==========
    hmm_sequences_dict_for_splits : dict
        Its values are dictionaries with (at least) the keys `gene_name`, `bin_id` and
        `sequence`.
    output_file_path : str
        FASTA file to write, one entry per bin.
    wrap : int
        Line width passed to `textwrap.fill`; a falsy value disables wrapping.
    concatenate_genes : bool
        Not used by this function.
    separator : str
        String placed between the sequences of consecutive genes.
    genes_order : list, optional
        Gene names to report, in this order. Every name must be described by the HMM
        source. When not given, all gene names found in the dictionary are used (sorted).
    align_with : str, optional
        Passed on to `self.get_aligner`.

    Raises
    ======
    ConfigError
        If `self.sources` does not hold exactly one HMM source, or if `genes_order`
        contains genes the HMM source does not describe.
    """

    if len(self.sources) != 1:
        raise ConfigError("If you want your genes to be concatenated, you should be requesting a single HMM source. Why?\
                            In fact we are not exactly sure why. But when we think of it, we couldn't come up with a \
                            scenario where the user might truly be interested in concatenating genes from multiple HMM\
                            sources, and we wanted to add a control in case they are making a mistake w/o realizing. If you\
                            are sure this is what you must do for the question you are interested in, please send an\
                            e-mail to the anvi'o discussion group, and convince us .. or you can just delete this if block\
                            to avoid this check if you are not in the mood. We know the feeling.")

    # read the only source without removing it from `self.sources`, so the instance can be
    # used again after this call.
    hmm_source = next(iter(self.sources))
    gene_names_in_source = [g.strip() for g in self.hmm_hits_info[hmm_source]['genes'].split(',')]

    # the user wants to play rough. FINE. we will concatenate genes for phylogenomic analyses.
    gene_names = None

    # let's get an instance of the aligner early on so we learn about issues before its too late.
    aligner = self.get_aligner(align_with)

    # lets learn about what we have in this dictionary first.
    bin_names_in_dict = list(set(x['bin_id'] for x in hmm_sequences_dict_for_splits.values()))
    gene_names_in_dict = sorted(set(x['gene_name'] for x in hmm_sequences_dict_for_splits.values()))

    # if the function is called with a particular set and order of genes, use those, otherwise
    # stick with the gene names / order we found in the dictionary.
    if genes_order:
        genes_in_genes_order_but_missing_in_hmm_source = [g for g in genes_order if g not in gene_names_in_source]
        if genes_in_genes_order_but_missing_in_hmm_source:
            raise ConfigError("One or more gene names in the genes order list does not seem to appear among the genes described\
                               by the HMM source %s (which translates to 'terrible news'). Here are the genes that cause this\
                               issue if you want to fix this: '%s'" \
                                      % (hmm_source, ', '.join(genes_in_genes_order_but_missing_in_hmm_source)))
        gene_names = genes_order
    else:
        self.run.warning("You did not define any gene names. Bold move. Now anvi'o will attempt to report a file with all\
                          genes defined in the HMM source '%s'." % hmm_source)
        gene_names = gene_names_in_dict

    # gene lengths are especially important to accommodate missing genes with proper number of
    # gap characters
    gene_lengths = {}

    # build a simpler dict that keeps genes sequences for each bin for a given gene name
    genes_in_bins_dict = {}
    for entry in hmm_sequences_dict_for_splits.values():
        genes_in_bins_dict.setdefault(entry['gene_name'], {})[entry['bin_id']] = entry['sequence']

    # align homolog sequences across bins
    self.progress.new('Aligning homolog gene sequences pre-concatenation')
    all_gene_names = list(genes_in_bins_dict.keys())
    num_genes = len(all_gene_names)
    for i, gene_name in enumerate(all_gene_names):
        self.progress.update('working on %s (%d of %d) ...' % (gene_name, i + 1, num_genes))
        genes_list = list(genes_in_bins_dict[gene_name].items())

        # presumably the aligner returns a dictionary of aligned sequences keyed by bin name;
        # the code below relies on that -- verify against the aligner driver.
        genes_in_bins_dict[gene_name] = aligner(run=terminal.Run(verbose=False)).run_stdin(genes_list)
        gene_lengths[gene_name] = len(next(iter(genes_in_bins_dict[gene_name].values())))
    self.progress.end()

    # concatenate all of them and write them in a file. each missing gene is recorded once,
    # no matter how many bins there are.
    gene_names_missing_from_everywhere = []
    with open(output_file_path, 'w') as f:
        for bin_name in bin_names_in_dict:
            sequences_list = []

            for gene_name in gene_names:
                if gene_name in genes_in_bins_dict:
                    if bin_name in genes_in_bins_dict[gene_name]:
                        sequences_list.append(genes_in_bins_dict[gene_name][bin_name])
                    else:
                        # the gene is missing from this bin only: fill its aligned length with gaps
                        sequences_list.append('-' * gene_lengths[gene_name])
                else:
                    # if we are here, it means this is a gene that has been missing form the hmm hits dict, since it
                    # was not in any of the bins the dict described, but the user requested to have it in the
                    # alignment anyway. This can happen when the user wants to concatanate genes from one or more
                    # low-completion bins. We will keep track of them, and tell the user. There is no alignment
                    # to learn the length from, so a fixed number of gap characters is used.
                    sequences_list.append('-' * 42)
                    if gene_name not in gene_names_missing_from_everywhere:
                        gene_names_missing_from_everywhere.append(gene_name)

            sequence = separator.join(sequences_list)

            if wrap:
                sequence = textwrap.fill(sequence, wrap, break_on_hyphens=False)

            f.write('>%s genes:%s|separator:%s\n' % (bin_name, ','.join(gene_names), separator))
            f.write('%s\n' % sequence)

    if gene_names_missing_from_everywhere:
        self.run.warning("You asked for some genes that were missing from all bins this class had in the\
                          HMM hits dictionary (here is a list of them: '%s'). Not knowing what to do with this werid\
                          situation, anvi'o put gap characters for all of them and retained your order. Here are those\
                          genes that missed the party: '%s'" % \
                                    (', '.join(bin_names_in_dict), ', '.join(gene_names_missing_from_everywhere)))
# some tests for SCG taxonomy string processing

import argparse

import anvio.terminal as terminal
import anvio.scgtaxonomyops as scgtaxonomyops

levels_of_taxonomy = ["t_domain", "t_phylum", "t_class", "t_order", "t_family", "t_genus", "t_species"]

# both objects are created quietly and with their sanity checks skipped
c = scgtaxonomyops.PopulateContigsDatabaseWithSCGTaxonomy(argparse.Namespace(skip_sanity_check=True),
                                                          run=terminal.Run(verbose=False))
p = scgtaxonomyops.SCGTaxonomyEstimatorSingle(argparse.Namespace(skip_sanity_check=True, skip_init=True),
                                              run=terminal.Run(verbose=False))


def cX():
    """Consensus hit for the module-level `scg_raw_hits` as it is defined at call time."""
    return c.get_consensus_hit(scg_raw_hits)


def cT(level):
    """Value of the consensus hit at the taxonomic `level`."""
    return cX()[level]


def pX(scg_dict):
    """Add a `tax_hash` to every entry of `scg_dict` (in place) and return the consensus taxonomy."""
    for entry in scg_dict.values():
        entry['tax_hash'] = scgtaxonomyops.HASH(entry)

    return p.get_consensus_taxonomy(scg_dict)


def pT(level):
    """Value of the consensus taxonomy at `level`, for the module-level `scg_dict` at call time."""
    return pX(scg_dict)[level]


#########################################

# three identical, independent hits
_hit = {'percent_identity': 100.0,
        't_domain': 'A',
        't_phylum': 'B',
        't_class': 'C',
        't_order': 'D',
        't_family': 'E',
        't_genus': 'F',
        't_species': 'G x'}
scg_raw_hits = [dict(_hit) for _ in range(3)]

assert cT('t_species') == 'G x'
def __init__(self, args, run=terminal.Run(), progress=terminal.Progress()):
    """Set up the SCG domain classifier for training or prediction.

    `self.mode` is read here but not set, so it must already be defined
    (either 'train' or 'predict') by the time this runs.

    Parameters
    ==========
    args : object with a `__dict__` (e.g. argparse.Namespace), or a falsy value
        In 'train' mode `genomes_dir` and `output` are required and
        `classifier` must not be set. In 'predict' mode `output` must not be
        set and `classifier` is optional (the classifier under the anvi'o data
        directory is used when it is missing).
    run, progress : terminal.Run, terminal.Progress

    Raises
    ======
    ConfigError
        For an unknown mode, for missing/conflicting arguments, or when the
        installed SCG HMM sources cannot support a classifier.
    """

    self.run = run
    self.progress = progress

    def A(x):
        # value of `x` in args, or None if args is falsy or has no such member
        if not args:
            return None
        return args.__dict__[x] if x in args.__dict__ else None

    if self.mode == 'train':
        genomes_dir = A('genomes_dir')
        output_path = A('output')

        if A('classifier'):
            raise ConfigError("You should not initialize the domain training class with a input classifier path (`args.classifier`).")

        # validate BEFORE normalizing: os.path.abspath(None) raises a TypeError,
        # which used to mask the explicit error below.
        if not genomes_dir:
            raise ConfigError("You must provide a genomes directory. Please read the help menu if you are not sure\
                               how the contents of this directory should look like.")

        if not output_path:
            raise ConfigError("You must provide an output file path to store the resulting classifier.")

        self.genomes_dir = os.path.abspath(genomes_dir)
        self.classifier_output_path = os.path.abspath(output_path)

        filesnpaths.is_output_file_writable(self.classifier_output_path)
        filesnpaths.is_file_exists(self.genomes_dir)

    elif self.mode == 'predict':
        if A('output'):
            raise ConfigError("You should not initialize the domain prediction class with an output classifier path (`args.output`).")

        default_classifier_path = 'misc/SCGDOMAINCLASSIFIER.rf'
        self.input_classifier_path = A('classifier') or os.path.join(os.path.dirname(anvio.data.__file__), default_classifier_path)

        if A('classifier'):
            # a user-provided classifier must exist
            filesnpaths.is_file_exists(self.input_classifier_path)
        else:
            # the default classifier is missing from the installation
            if not filesnpaths.is_file_exists(self.input_classifier_path, dont_raise=True):
                raise ConfigError("Somehow, this anvi'o installation dose not seem to have a SCG domain classifier. This is one of\
                                   those anvi'o things that should never happen. If you are an anvi'o user, please feel free to panic :(\
                                   If you are an anvi'o developer, what you need to do is to follow the instructions in \
                                   `anvi-script-gen-scg-domain-classifier` with a reasonable set of genomes and store the resulting\
                                   classifier at the default anvi'o path of /blah/blah/anvio/data/%s." % (default_classifier_path))

        self.rf = RF(self.input_classifier_path, r=self.run, p=self.progress)
        self.rf.initialize_classifier()

    else:
        raise ConfigError("Someone initialized the SCG domain classifier class without an explicit mode :(")

    # single-copy core gene sources, their domains, and the domain -> source lookup
    self.SCG_sources = [d for d in hmm_data.sources if hmm_data.sources[d]['kind'] == 'singlecopy']
    self.SCG_domains = sorted([hmm_data.sources[source]['domain'] for source in self.SCG_sources])
    self.SCG_domain_to_source = {hmm_data.sources[source]['domain']: source for source in self.SCG_sources}

    if not len(self.SCG_sources):
        raise ConfigError("There is something wrong :( There is not even a single SCG source found. Usually\
                           anvi'o comes with multiple of them :/")

    if len(self.SCG_sources) == 1:
        raise ConfigError("There is only a single SCG source in your anvi'o installation. It is OK if you are\
                           being a hacker and playing with things, but there is no logic behind creating a\
                           classifier with a single class.")

    # two sources sharing one domain would make the domain -> source lookup ambiguous
    if len(self.SCG_domains) != len(set(self.SCG_domains)):
        raise ConfigError("Something is wrong. For each domain, there must be a single sinlge-copy core gene\
                           source.")

    self.data, self.labels, self.features = [], [], []

    # features are the sorted gene names of each source, in sorted domain order
    for domain in self.SCG_domains:
        self.features.extend(sorted(hmm_data.sources[self.SCG_domain_to_source[domain]]['genes']))

    self.run.info('SCG domain classifier mode', self.mode)
    self.run.info("SCG domains found", ', '.join(self.SCG_domains))
    self.run.info("Num features", len(self.features))
def __init__(self, db_path, client_version, new_database=False, ignore_version=False, read_only=False, skip_rowid_prepend=False, run=terminal.Run(), progress=terminal.Progress()):
    """Connect to the SQLite database at `db_path`, creating it if requested.

    Parameters
    ==========
    db_path : str
        Path to the database file.
    client_version : str or int
        A new database is stamped with this version. For an existing
        database it is compared with the version stored in the database.
    new_database : bool
        If True, any existing file at `db_path` is removed and a fresh
        database is initialized.
    ignore_version : bool
        If True, a version mismatch for an existing database is tolerated.
    read_only : bool
        If False, `check_if_db_writable` is called before connecting.
    skip_rowid_prepend : bool
        If True, ROWID is never prepended to rows read from the database.
    run, progress : terminal.Run, terminal.Progress

    Raises
    ======
    ConfigError
        If a new read-only database is requested, if the connection fails,
        on a version mismatch, or if the database contains tables unknown
        to `tables.requires_unique_entry_id`.
    """

    self.db_path = db_path
    self.read_only = read_only
    self.version = None

    self.run = run
    self.progress = progress

    # these anonymous functions report whether the ROWID will be added
    # to its rows read from the database or not. if the first column of a given
    # table does not contain unique variables, anvi'o prepends the ROWID of each
    # column to index 0, unless `skip_rowid_prepend` is True
    self.ROWID_PREPENDS_ROW_DATA = lambda table_name: False if skip_rowid_prepend else tables.requires_unique_entry_id[table_name]
    self.PROPER_SELECT_STATEMENT = lambda table_name: 'ROWID as "entry_id", *' if self.ROWID_PREPENDS_ROW_DATA(table_name) else '*'

    if new_database:
        filesnpaths.is_output_file_writable(db_path)
    else:
        filesnpaths.is_file_exists(db_path)

    # reject the contradictory request BEFORE touching the disk, so that an
    # invalid call can never delete an existing database file.
    if self.read_only and new_database:
        raise ConfigError("One cannot create a new database that is read-only.")

    if new_database and os.path.exists(self.db_path):
        os.remove(self.db_path)

    if not self.read_only:
        self.check_if_db_writable()

    try:
        self.conn = sqlite3.connect(self.db_path)
    except Exception as e:
        raise ConfigError(f"This one time someone was not happy with '{self.db_path}' and '{e}', they said.") from e

    self.conn.text_factory = str

    self.cursor = self.conn.cursor()

    self.table_names_in_db = self.get_table_names()

    self.db_connected = True

    if new_database:
        self.create_self()
        self.set_version(client_version)
    else:
        self.version = self.get_version()

        if str(self.version) != str(client_version) and not ignore_version:
            progress.reset()

            if int(self.version) > int(client_version):
                raise ConfigError("Bad news of the day: the database at %s was generated with an anvi'o version that is 'newer' than "
                                  "the one you are actively using right now. We know, you hate to hear this, but you need to upgrade "
                                  "your anvi'o :(" % self.db_path)
            else:
                raise ConfigError(f"The database at '{self.db_path}' is outdated (this database is v{self.version} and your anvi'o installation "
                                  f"wants to work with v{client_version}). You can migrate your database without losing any data using the "
                                  f"program `anvi-migrate` with either of the flags `--migrate-dbs-safely` or `--migrate-dbs-quickly`.")

        # every table found in the database must be registered in
        # `tables.requires_unique_entry_id`. (`self.table_names_in_db` was read
        # before `create_self`, so it is only ever populated for existing
        # databases.)
        bad_tables = [table_name for table_name in self.table_names_in_db if table_name not in tables.requires_unique_entry_id]
        if len(bad_tables):
            raise ConfigError("You better be a programmer tinkering with anvi'o databases adding new tables or something. Otherwise we "
                              "have quite a serious problem :/ Each table in a given anvi'o database must have an entry in the "
                              "anvio/tables/__init__.py dictionary `requires_unique_entry_id` to explicitly define whether anvi'o "
                              "should add a unique entry id for its contents upon retrieval as a dictionary. The following tables "
                              "in this database do not satisfy that: '%s'. You can solve this problem by adding an entry into that "
                              "dictionary." % (', '.join(bad_tables)))
def process(self):
    """Convert every GenBank file listed in the metadata file and write a FASTA descriptor.

    Reads the TAB-delimited metadata file at `self.metadata_file_path`
    (which must contain the columns 'organism_name' and 'local_filename'),
    runs `GenbankToAnvio` on each entry, and stores the collected results as
    a TAB-delimited file at `self.output_fasta_descriptor`.

    Raises
    ======
    ConfigError
        If the metadata file lacks the expected columns, if a listed local
        file does not exist, or if two entries end up with the same final
        name.
    """

    self.sanity_check()

    self.run.info('Input metadata file', self.metadata_file_path)
    self.run.info('Output directory', self.output_directory_path)

    columns = utils.get_columns_of_TAB_delim_file(self.metadata_file_path)
    if 'organism_name' not in columns or 'local_filename' not in columns:
        raise ConfigError("The metadata file you provided does not look like a metadata\
                           file output from the program `ncbi-genome-download` :/ Why?\
                           Because anvi'o expects that file to have at least the following\
                           two columns in it: 'organism_name' and 'local_filename'.")

    metadata = utils.get_TAB_delimited_file_as_dictionary(self.metadata_file_path)

    # make sure every file exists before doing any work
    for entry in metadata:
        if not os.path.exists(metadata[entry]['local_filename']):
            raise ConfigError("At least one of the files in your metadata input does not seem to be\
                               where they think they are :/ Please make sure the entry %s and others\
                               point to proper local file paths..." % entry)

    self.run.info('Num entries in metadata', len(metadata))

    output_fasta_dict = {}
    self.progress.new("GenBank to anvi'o", progress_total_items=len(metadata))
    for entry in metadata:
        self.progress.increment()
        self.progress.update('Processing %s ...' % entry)

        # set the organism name and accession id and clean them from weird
        # characters: anything outside the allowed sets becomes an underscore.
        organism_name = ''.join(c if c in OK_CHARS_FOR_ORGANISM_NAME else '_' for c in metadata[entry]['organism_name'])
        accession_id = ''.join(c if c in OK_CHARS_FOR_ACCESSION else '_' for c in entry)

        final_name = '_'.join([organism_name, accession_id])

        args = argparse.Namespace(input_genbank=metadata[entry]['local_filename'],
                                  output_file_prefix=os.path.join(self.output_directory_path, final_name))
        g = GenbankToAnvio(args, run=terminal.Run(verbose=False), progress=terminal.Progress(verbose=False))

        if final_name in output_fasta_dict:
            # the message has a placeholder for the name, so it has to be formatted
            raise ConfigError("The final name '%s' for your genome has alrady been used by\
                               another one :/ This should never happen unless your metadata\
                               contains entries with identical accession numbers..." % final_name)

        output_fasta_dict[final_name] = g.process()

    self.progress.end()

    headers = ['name', 'path']
    if not self.exclude_gene_calls_from_fasta_txt:
        headers.extend(['external_gene_calls', 'gene_functional_annotation'])

    utils.store_dict_as_TAB_delimited_file(output_fasta_dict, self.output_fasta_descriptor, headers=headers)

    self.run.info('Output FASTA descriptor', self.output_fasta_descriptor)
def add_new_gene_calls_to_contigs_db_and_update_serach_results_dict(self, source, search_results_dict, skip_amino_acid_sequences=False):
    """Register HMM hits found in contigs as new gene calls.

    HMM hits recovered in the CONTIG context are not tied to any gene call
    already stored in the contigs database. This method creates one new gene
    call per hit, stores the new gene calls in the database, and writes the
    new `gene_callers_id` into each entry of `search_results_dict`.

    Parameters
    ==========
    source : str
        Stored as the 'source' of every new gene call.
    search_results_dict : dict
        Each value must have 'start', 'stop', and 'contig_name'. Entries are
        updated in place with 'gene_callers_id'.
    skip_amino_acid_sequences : bool
        Passed on to the gene calls table when the new calls are stored.

    Returns
    =======
    search_results_dict : dict
        The same dictionary that was passed in.
    """

    if not len(search_results_dict):
        return search_results_dict

    # learn the largest gene callers id in use: new ids start right after it
    contigs_database = db.DB(self.db_path, utils.get_required_version_for_db(self.db_path))
    gene_callers_id = contigs_database.get_max_value_in_column('genes_in_contigs', 'gene_callers_id', value_if_empty=0) + 1
    contigs_database.disconnect()

    new_gene_calls = {}

    for hit in search_results_dict.values():
        begin, end = hit['start'], hit['stop']

        # a hit whose stop does not exceed its start is reported in reverse
        # direction, with its coordinates swapped
        if end > begin:
            strand = 'f'
        else:
            strand = 'r'
            begin, end = end, begin

        # a length that is not a multiple of three is flagged as partial
        is_partial = 1 if (end - begin) % 3 != 0 else 0

        new_gene_calls[gene_callers_id] = {'contig': hit['contig_name'],
                                           'start': begin,
                                           'stop': end,
                                           'direction': strand,
                                           'partial': is_partial,
                                           'source': source,
                                           'version': 'unknown'}

        # let the hit know about its gene call, and move on to the next id
        hit['gene_callers_id'] = gene_callers_id
        gene_callers_id += 1

    if not len(new_gene_calls):
        return search_results_dict

    # store the new gene calls in the contigs database
    calls_table = TablesForGeneCalls(self.db_path, run=terminal.Run(verbose=False))
    calls_table.use_external_gene_calls_to_populate_genes_in_contigs_table(input_file_path=None,
                                                                           gene_calls_dict=new_gene_calls,
                                                                           ignore_internal_stop_codons=True,
                                                                           skip_amino_acid_sequences=skip_amino_acid_sequences)
    calls_table.populate_genes_in_splits_tables(gene_calls_dict=new_gene_calls)

    # refresh the gene calls dict
    self.init_gene_calls_dict()

    self.run.info('Gene calls added to db', '%d (from source "%s")' % (len(new_gene_calls), source))

    return search_results_dict
def run_search_and_parse_results(self):
    """Align the protein against the database based on only sequence.

    Runs a DIAMOND blastp search of the target FASTA against the MODELLER
    database, computes a 'proper' percent identity for each hit
    (pident * alignment length / gene length), filters the hits by
    `self.percent_identical_cutoff`, keeps one chain per PDB code and at most
    `self.max_number_templates` hits, and records the accepted templates in
    `self.list_of_template_code_and_chain_ids` and `self.out["templates"]`.

    Raises
    ======
    ConfigError
        If `percent_identical_cutoff` or `max_number_templates` is not set.
    self.EndModeller
        If there are no hits at all, or none that pass the identity filter.
    """

    if not self.percent_identical_cutoff or not self.max_number_templates:
        raise ConfigError("run_search_and_parse_results :: You initiated this class without providing values for percent_identical_cutoff "
                          "and max_number_templates, which is required for this function.")

    # Change to MODELLER working directory, and make sure we always change
    # back to the user directory, even if the search fails
    os.chdir(self.directory)
    try:
        driver = diamond.Diamond(
            query_fasta=self.target_fasta_path,
            target_fasta=J(self.database_dir, self.modeller_database + '.dmnd'),
            run=terminal.Run(verbose=False),
            progress=terminal.Progress(verbose=False),
        )
        driver.blastp()
    finally:
        os.chdir(self.start_dir)

    search_df = driver.view_as_dataframe(J(self.directory, driver.tabular_output_path))

    matches_found = search_df.shape[0]

    if not matches_found:
        self.run.warning("No proteins with homologous sequence were found for {}. No structure will be modelled".format(self.corresponding_gene_call))
        raise self.EndModeller

    # We need the gene length for proper_pident (the length of the last
    # sequence in the target FASTA is the one that is used)
    target_fasta = u.SequenceSource(self.target_fasta_path, lazy_init=False)
    try:
        while next(target_fasta):
            gene_length = len(target_fasta.seq)
    finally:
        target_fasta.close()

    # add some useful columns
    search_df["proper_pident"] = search_df["pident"] * search_df["length"] / gene_length
    search_df["code"] = search_df["sseqid"].str[:-1]
    search_df["chain"] = search_df["sseqid"].str[-1]

    # remember the best hit before filtering, so it can be reported if
    # nothing survives the filter
    max_pident_found = search_df["proper_pident"].max()
    id_of_max_pident = tuple(search_df.loc[search_df["proper_pident"].idxmax(), ["code", "chain"]].values)

    # filter results by self.percent_identical_cutoff.
    search_df = search_df[search_df["proper_pident"] >= self.percent_identical_cutoff]

    search_df = search_df.sort_values("proper_pident", ascending=False)

    # If more than 1 template in 1 PDB id, just choose 1
    search_df = search_df.drop_duplicates('code', keep='first')

    matches_after_filter = len(search_df)
    if not matches_after_filter:
        self.run.warning("Gene {} did not have a search result with proper percent identicalness above or equal "
                         "to {}%. The best match was chain {} of https://www.rcsb.org/structure/{}, which had a "
                         "proper percent identicalness of {:.2f}%. No structure will be modelled.".\
                         format(self.corresponding_gene_call,
                                self.percent_identical_cutoff,
                                id_of_max_pident[1],
                                id_of_max_pident[0],
                                max_pident_found))
        raise self.EndModeller

    # get up to self.max_number_templates of those with the highest proper_pident scores.
    search_df = search_df.iloc[:min([len(search_df), self.max_number_templates])]

    # Get their chain and 4-letter ids
    self.list_of_template_code_and_chain_ids = list(zip(search_df["code"], search_df["chain"]))

    self.run.info("Max number of templates allowed", self.max_number_templates)
    self.run.info("Number of candidate templates", matches_found)
    self.run.info("After >{}% identical filter".format(self.percent_identical_cutoff), matches_after_filter)
    self.run.info("Number accepted as templates", len(self.list_of_template_code_and_chain_ids))

    # update user on which templates are used, and write the templates to self.out
    for i, (pdb_id, chain_id) in enumerate(self.list_of_template_code_and_chain_ids):
        ppi = search_df["proper_pident"].iloc[i]

        self.out["templates"]["pdb_id"].append(pdb_id)
        self.out["templates"]["chain_id"].append(chain_id)
        self.out["templates"]["ppi"].append(ppi)

        self.run.info("Template {}".format(i + 1),
                      "Protein ID: {}, Chain {} ({:.1f}% identical)".format(pdb_id, chain_id, ppi))
def populate_search_tables(self, sources={}):
    """Run HMM searches for `sources` and store the hits in the database.

    Parameters
    ==========
    sources : dict
        HMM sources keyed by source name. Each value must provide 'target',
        'kind', 'domain', 'genes', 'model', 'ref', and 'noise_cutoff_terms'.
        When empty, `anvio.data.hmm.sources` is used instead. The dictionary
        is never modified here.

    Raises
    ======
    ConfigError
        If gene calls are needed but were skipped for this contigs database,
        or if a source asks for amino acid sequences in the CONTIG context.
    """

    # `import anvio.data.hmm` further below binds the name `anvio` LOCALLY for
    # this entire function. Binding it here, unconditionally, guarantees that
    # `anvio.DEBUG` at the end of the function is defined even when the
    # conditional import is skipped because the caller provided `sources`.
    import anvio

    # make sure the output file is OK to write.
    filesnpaths.is_output_file_writable(self.db_path, ok_if_exists=True)

    # fall back to the default HMM sources if the caller did not provide any
    if not len(sources):
        import anvio.data.hmm
        sources = anvio.data.hmm.sources

    if not sources:
        return

    target_files_dict = {}

    tmp_directory_path = filesnpaths.get_temp_directory_path()

    # here we will go through targets and populate target_files_dict based on what we find among them.
    targets = {s['target'] for s in sources.values()}
    for target in targets:
        alphabet, context = utils.anvio_hmm_target_term_to_alphabet_and_context(target)

        if not self.genes_are_called and context != "CONTIG":
            raise ConfigError("You are in trouble. The gene calling was skipped for this contigs database, yet anvi'o asked to run an "
                              "HMM profile that wishes to operate on %s context using the %s alphabet. It is not OK. You still could run "
                              "HMM profiles that does not require gene calls to be present (such as the HMM profile that identifies Ribosomal "
                              "RNAs in contigs, but for that you would have to explicitly ask for it by using the additional parameter "
                              "'--installed-hmm-profile Ribosomal_RNAs')." % (context, alphabet))

        self.run.info('Target found', '%s:%s' % (alphabet, context))

        class Args: pass
        args = Args()
        args.contigs_db = self.db_path
        contigs_db = ContigsSuperclass(args, r=terminal.Run(verbose=False))

        if context == 'GENE':
            target_files_dict['%s:GENE' % alphabet] = os.path.join(tmp_directory_path, '%s_gene_sequences.fa' % alphabet)
            contigs_db.gen_FASTA_file_of_sequences_for_gene_caller_ids(output_file_path=target_files_dict['%s:GENE' % alphabet],
                                                                       simple_headers=True,
                                                                       rna_alphabet=True if alphabet=='RNA' else False,
                                                                       report_aa_sequences=True if alphabet=='AA' else False)
        elif context == 'CONTIG':
            if alphabet == 'AA':
                raise ConfigError("You are somewhere you shouldn't be. You came here because you thought it would be OK "
                                  "to ask for AA sequences in the CONTIG context. The answer to that is 'no, thanks'. If "
                                  "you think this is dumb, please let us know.")
            else:
                target_files_dict['%s:CONTIG' % alphabet] = os.path.join(tmp_directory_path, '%s_contig_sequences.fa' % alphabet)
                utils.export_sequences_from_contigs_db(self.db_path,
                                                       target_files_dict['%s:CONTIG' % alphabet],
                                                       rna_alphabet=True if alphabet=='RNA' else False)

    commander = HMMer(target_files_dict, num_threads_to_use=self.num_threads_to_use)

    for source in sources:
        alphabet, context = utils.anvio_hmm_target_term_to_alphabet_and_context(sources[source]['target'])

        kind_of_search = sources[source]['kind']
        domain = sources[source]['domain']
        all_genes_searched_against = sources[source]['genes']
        hmm_model = sources[source]['model']
        reference = sources[source]['ref']
        noise_cutoff_terms = sources[source]['noise_cutoff_terms']

        hmm_scan_hits_txt = commander.run_hmmscan(source,
                                                  alphabet,
                                                  context,
                                                  kind_of_search,
                                                  domain,
                                                  len(all_genes_searched_against),
                                                  hmm_model,
                                                  reference,
                                                  noise_cutoff_terms)

        if not hmm_scan_hits_txt:
            search_results_dict = {}
        else:
            parser = parser_modules['search']['hmmscan'](hmm_scan_hits_txt, alphabet=alphabet, context=context)
            search_results_dict = parser.get_search_results()

        if not len(search_results_dict):
            # report through the same `Run` object as the rest of this method
            self.run.info_single("The HMM source '%s' returned 0 hits. SAD (but it's stil OK)." % source, nl_before=1)

        if context == 'CONTIG':
            # we are in trouble here. because our search results dictionary contains no gene calls, but contig
            # names contain our hits. on the other hand, the rest of the code outside of this if statement
            # expects a `search_results_dict` with gene caller ids in it. so there are two things we need to do.
            # one is to come up with some new gene calls and add them to the contigs database. so things
            # will go smoothly downstream. two, we will need to update our `search_results_dict` so it looks
            # like a a dictionary the rest of the code expects with `gene_callers_id` fields. both of these
            # steps are going to be taken care of in the following function. magic.

            if source != "Ribosomal_RNAs":
                self.run.warning("You just called an HMM profile that runs on contigs and not genes. Because this HMM "
                                 "operation is not directly working with gene calls anvi'o already knows about, the resulting "
                                 "hits will need to be added as 'new gene calls' into the contigs database. So far so good. "
                                 "But because we are in the contigs realm rater than genes realm, it is likely that "
                                 "resulting hits will not correspond to open reading frames that are supposed to be "
                                 "translated (such as ribosomal RNAs), because otherwise you would be working with genes "
                                 "instad of defining CONTIGS as your context in that HMM profile you just used unless you "
                                 "not sure what you are doing. Hence, anvi'o will not report amino acid sequences for the "
                                 "new gene calls it will recover through these HMMs. Please take a moment and you be the "
                                 "judge of whether this will influence your pangenomic analyses or other things you thought "
                                 "you would be doing with the result of this HMM search downstream. If you do not feel like "
                                 "being the judge of anything today you can move on yet remember to remember this if things "
                                 "look somewhat weird later on.",
                                 header="Psst. Your fancy HMM profile '%s' speaking" % source,
                                 lc="green")

            num_hits_before = len(search_results_dict)
            search_results_dict = utils.get_pruned_HMM_hits_dict(search_results_dict)
            num_hits_after = len(search_results_dict)

            if num_hits_before != num_hits_after:
                self.run.info('Pruned', '%d out of %d hits were removed due to redundancy' % (num_hits_before - num_hits_after, num_hits_before))

            search_results_dict = self.add_new_gene_calls_to_contigs_db_and_update_serach_results_dict(kind_of_search,
                                                                                                       search_results_dict,
                                                                                                       skip_amino_acid_sequences=True)

        self.append(source, reference, kind_of_search, domain, all_genes_searched_against, search_results_dict)

    # unless we are debugging, remove the temporary search files
    if not anvio.DEBUG:
        commander.clean_tmp_dirs()
        for v in list(target_files_dict.values()):
            os.remove(v)
def __init__(self, args, target_fasta_path, directory=None, run=terminal.Run(), lazy_init=False, skip_warnings=False, check_db_only=False):
    """Prepare a MODELLER run for the sequence in `target_fasta_path`.

    Parameters
    ==========
    args : object with a `__dict__` (e.g. argparse.Namespace)
        Optional members: scoring_method, very_fast, modeller_executable,
        num_models, modeller_database, max_number_templates,
        percent_identical_cutoff, deviation, pdb_db, offline_mode. A member
        that is missing or None falls back to the default set below.
    target_fasta_path : str
        FASTA file of the protein to model. It is copied into `directory`.
    directory : str, optional
        Working directory. A temporary directory is created if not given.
    run : terminal.Run
    lazy_init : bool
    skip_warnings : bool
        If True (and anvi'o is not in debug mode), `run` is silenced.
    check_db_only : bool
        If True, only `check_database` is called and initialization stops.
    """

    self.args = args
    self.run = run

    # NOTE(review): this mutates the `run` object that was passed in. When the
    # caller relies on the default argument, that object is shared between all
    # calls that rely on it.
    if skip_warnings and not anvio.DEBUG:
        self.run.verbose = False

    self.lazy_init = lazy_init

    self.target_fasta_path = target_fasta_path
    self.directory = directory if directory else filesnpaths.get_temp_directory_path()

    def A(x, t):
        # value of `x` in args cast with `t`; None if `x` is missing or None.
        # None is never cast, so that `str(None)` can not sneak in as the
        # string 'None' and `int(None)` can not raise: the `or <default>`
        # fallbacks below take over instead.
        value = self.args.__dict__[x] if x in self.args.__dict__ else None
        return None if value is None else t(value)

    null = lambda x: x
    self.scoring_method = A('scoring_method', str) or 'DOPE_score'
    self.very_fast = A('very_fast', bool) or False
    self.executable = A('modeller_executable', null) or up_to_date_modeller_exec
    self.num_models = A('num_models', int) or 5
    self.modeller_database = A('modeller_database', str) or 'pdb_95'
    self.max_number_templates = A('max_number_templates', null) or 5
    self.percent_identical_cutoff = A('percent_identical_cutoff', null) or 30
    self.deviation = A('deviation', null) or 4
    self.pdb_db_path = A('pdb_db', null)
    self.offline_mode = A('offline_mode', null)

    # All MODELLER scripts are housed in self.script_folder
    self.scripts_folder = constants.default_modeller_scripts_dir

    self.alignment_pap_path = None
    self.alignment_pir_path = None
    self.get_template_path = None
    self.target_pir_path = None
    self.template_family_matrix_path = None
    self.template_info_path = None
    self.template_pdb_dir = None
    self.model_info = None
    self.pdb_db = None
    self.use_pdb_db = False

    self.logs = {}
    self.scripts = {}

    # All MODELLER databases are housed in self.database_dir
    self.database_dir = constants.default_modeller_database_dir

    # store the original directory so we can cd back and forth between
    # self.directory and self.start_dir
    self.start_dir = os.getcwd()

    if check_db_only:
        self.check_database()
        return

    self.sanity_check()
    self.corresponding_gene_call = self.get_corresponding_gene_call_from_target_fasta_path()

    # as reward, whoever called this class will receive self.out when they run self.process()
    self.out = {
        "templates": {"pdb_id": [], "chain_id": [], "ppi": []},
        "models": {"molpdf": [], "GA341_score": [], "DOPE_score": [], "picked_as_best": []},
        "corresponding_gene_call": self.corresponding_gene_call,
        "structure_exists": False,
        "best_model_path": None,
        "best_score": None,
        "scoring_method": self.scoring_method,
        "percent_identical_cutoff": self.percent_identical_cutoff,
        "very_fast": self.very_fast,
        "deviation": self.deviation,
        "directory": self.directory,
    }

    # copy fasta into the working directory
    try:
        shutil.copy2(self.target_fasta_path, self.directory)
        # the copy is named after the file name alone, so join the working
        # directory with the basename to point at the copy
        self.target_fasta_path = J(self.directory, os.path.basename(self.target_fasta_path))
    except shutil.SameFileError:
        pass
def __init__(self, hmmer_std_out, context=None, run=terminal.Run(), progress=terminal.Progress()):
    """Set up the containers for sequence- and domain-level hits, then load.

    Parameters
    ==========
    hmmer_std_out : stored as `self.hmmer_std_out`
    context : stored as `self.context`
    run, progress : terminal.Run, terminal.Progress

    `self.set_names()` is called before the containers are built, since the
    column names `self.query_col`, `self.acc_col`, `self.target_col`, and
    `self.query_len_col` are needed as keys. `self.load()` is called last.
    """

    self.run, self.progress = run, progress

    self.hmmer_std_out = hmmer_std_out
    self.context = context

    self.set_names()

    self.ali_info = {}

    # column -> dtype for the sequence-level hits
    self.seq_hits_dtypes = {
        self.query_col: str,
        self.acc_col: str,
        self.target_col: str,
        self.query_len_col: int,
        'evalue': float,
        'score': float,
        'bias': float,
        'best_dom_evalue': float,
        'best_dom_score': float,
        'best_dom_bias': float,
        'expected_doms': float,
        'num_doms': int,
    }

    # one empty list per column, in the same order as the dtypes.
    # This is converted to a dataframe after populating
    self.seq_hits = {column: [] for column in self.seq_hits_dtypes}

    # column -> dtype for the domain-level hits
    self.dom_hits_dtypes = {
        self.query_col: str,
        self.acc_col: str,
        self.target_col: str,
        'domain': int,
        'qual': str,
        'score': float,
        'bias': float,
        'c-evalue': float,
        'i-evalue': float,
        'hmm_start': int,
        'hmm_stop': int,
        'hmm_bounds': str,
        'ali_start': int,
        'ali_stop': int,
        'ali_bounds': str,
        'env_start': int,
        'env_stop': int,
        'env_bounds': str,
        'mean_post_prob': float,
        'match_state_align': str,
        'comparison_align': str,
        'sequence_align': str,
    }

    # one empty list per column, in the same order as the dtypes.
    # This is converted to a dataframe after populating
    self.dom_hits = {column: [] for column in self.dom_hits_dtypes}

    # delimiters used in the output
    self.delim_query = '//\n'
    self.delim_seq = '>>'
    self.delim_domain = '=='

    self.load()
def check_database(self):
    """Setup the database files

    Downloads the .pir file if it is missing
    Binarizes .pir file if .bin is missing (or if the .pir file had to be downloaded)
    Creates the .dmnd file if it is missing
    """

    bin_db_path = J(self.database_dir, self.modeller_database + ".bin")
    pir_db_path = J(self.database_dir, self.modeller_database + ".pir")
    bin_exists = utils.filesnpaths.is_file_exists(bin_db_path, dont_raise=True)
    pir_exists = utils.filesnpaths.is_file_exists(pir_db_path, dont_raise=True)

    if not (bin_exists and pir_exists):
        if not pir_exists:
            # Download .pir
            self.run.warning("Anvi'o looked in {} for a database with the name {} and with an extension "
                             "of either .bin or .pir, but didn't find anything matching that "
                             "criteria. Anvi'o will try and download the best database it knows of from "
                             "https://salilab.org/modeller/downloads/pdb_95.pir.gz and use that. "
                             "You can checkout https://salilab.org/modeller/ for more info about the pdb_95 "
                             "database".format(self.database_dir, self.modeller_database))

            db_download_path = os.path.join(self.database_dir, "pdb_95.pir.gz")
            utils.download_file("https://salilab.org/modeller/downloads/pdb_95.pir.gz", db_download_path)
            utils.run_command(['gzip', '-d', db_download_path], log_file_path=filesnpaths.get_temp_file_path())

        # Binarize .pir (make .bin)
        self.run.warning("Your database is not in binary format. That means accessing its contents is slower "
                         "than it could be. Anvi'o is going to make a binary format. Just FYI")
        self.run_binarize_database(pir_db_path, bin_db_path)

    dmnd_db_path = J(self.database_dir, self.modeller_database + '.dmnd')

    if os.path.exists(dmnd_db_path):
        return

    self.run.warning("Your diamond database does not exist. It will be created.")

    # convert the .pir file into a FASTA file using a MODELLER script
    script_name = "pir_to_fasta.py"

    self.copy_script_to_directory(script_name)

    input_pir_path = J(self.database_dir, self.modeller_database + '.pir')
    fasta_path = J(self.database_dir, self.modeller_database + '.fa')
    dmnd_path = J(self.database_dir, self.modeller_database)

    command = [self.executable, script_name, input_pir_path, fasta_path]

    self.run_command(command, script_name=script_name, rename_log=False)

    # rewrite the FASTA without '-' characters and with '.' replaced by 'X'
    temp = u.FastaOutput(filesnpaths.get_temp_file_path())
    fasta = u.SequenceSource(fasta_path)

    while next(fasta):
        temp.write_id(fasta.id)
        temp.write_seq(fasta.seq.replace('-', '').replace('.', 'X'))

    # close both files BEFORE moving: content that is still buffered in the
    # writer must be on disk by the time the file is moved into place.
    fasta.close()
    temp.close()

    shutil.move(temp.output_file_path, fasta_path)

    driver = diamond.Diamond(
        query_fasta=fasta_path,
        run=terminal.Run(verbose=False),
        progress=terminal.Progress(verbose=False),
    )
    driver.makedb(output_file_path=dmnd_path)

    # the FASTA file was only needed to build the diamond database
    os.remove(fasta_path)
def __init__(self, authors_yaml_file_path=os.path.join(os.path.dirname(anvio.__file__), 'data/misc/PEOPLE/DEVELOPERS.yaml'), skip_init=False, r=terminal.Run(), p=terminal.Progress()):
    """Remember where the authors data lives and, unless told otherwise, load it.

    Parameters
    ==========
    authors_yaml_file_path : str
        YAML file with the authors. The 'AVATARS' directory is expected to
        sit next to it.
    skip_init : bool
        If True, `init_authors` is not called and `self.authors` stays empty.
    r, p : terminal.Run, terminal.Progress
    """

    self.run, self.progress = r, p

    self.authors_yaml_file_path = authors_yaml_file_path

    # avatars live in a sibling directory of the YAML file
    people_directory = os.path.dirname(authors_yaml_file_path)
    self.author_avatars_directory = os.path.join(people_directory, 'AVATARS')

    self.essential_author_info_keys = ['github', 'name', 'email']

    self.authors = {}

    if skip_init:
        return

    self.init_authors()
def generate_pages_for_programs(self):
    """Generates static pages for programs in the output directory

    For every program in `self.programs`, an `index.md` file and a
    `network.json` file are written into a directory named after the program
    under `self.programs_output_dir`.
    """

    self.progress.new("Rendering program pages", progress_total_items=len(self.programs))
    self.progress.update('...')

    program_provides_requires_dict = self.get_program_requires_provides_dict()

    for program_name in self.programs:
        self.progress.update(f"'{program_name}' ...", increment=True)

        program = self.programs[program_name]
        d = {'program': {},
             'meta': {'summary_type': 'program',
                      'version': '\n'.join(['|%s|%s|' % (t[0], t[1]) for t in anvio.get_version_tuples()]),
                      'date': utils.get_date(),
                      'version_short_identifier': self.version_short_identifier}
            }

        d['program']['name'] = program_name
        d['program']['usage'] = program.usage
        d['program']['description'] = program.meta_info['description']['value']
        d['program']['resources'] = program.meta_info['resources']['value']
        d['program']['requires'] = program_provides_requires_dict[program_name]['requires']
        d['program']['provides'] = program_provides_requires_dict[program_name]['provides']
        d['program']['icon'] = '../../images/icons/%s.png' % 'PROGRAM'
        d['program']['authors'] = self.get_HTML_formatted_authors_data(program)
        d['artifacts'] = self.artifacts_info

        if anvio.DEBUG:
            self.progress.reset()
            run.warning(None, 'THE OUTPUT DICT')
            import json
            print(json.dumps(d, indent=2))

        self.progress.update(f"'{program_name}' ... rendering ...", increment=False)
        program_output_dir = filesnpaths.gen_output_directory(os.path.join(self.programs_output_dir, program_name))
        output_file_path = os.path.join(program_output_dir, 'index.md')

        # use a context manager so the file is flushed and closed right away
        with open(output_file_path, 'w') as output_file:
            output_file.write(SummaryHTMLOutput(d, r=run, p=progress).render())

        # create the program network, too (it goes into the same directory
        # as the page that was just rendered)
        self.progress.update(f"'{program_name}' ... rendering ... network json ...", increment=False)
        program_network = ProgramsNetwork(argparse.Namespace(output_file=os.path.join(program_output_dir, "network.json"),
                                                             program_names_to_focus=program_name),
                                          r=terminal.Run(verbose=False))
        program_network.generate()

    self.progress.end()
def __init__(self, program_name='fastANI', args={}, run=terminal.Run(), progress=terminal.Progress()):
    """Initialize by handing every argument over to `FastANIDriver.__init__`.

    Nothing else is done here: `program_name`, `args`, `run`, and `progress`
    are forwarded positionally, in that order. `program_name` defaults to
    'fastANI'.
    """
    # NOTE(review): `args`, `run`, and `progress` defaults are created once, when
    # the function is defined, and are shared by every call that relies on them.
    FastANIDriver.__init__(self, program_name, args, run, progress)
def __init__(self, contigs_db_path, scg_domain_classifier_path=None, source_requested=None, run=run, progress=progress):
    """Reads single-copy core gene HMM hits from a contigs database and checks that they
    are compatible with the SCG domain predictor.

    Parameters
    ==========
    contigs_db_path : str
        Path of the contigs database to read HMM information from.
    scg_domain_classifier_path : str or None
        Accepted for interface compatibility; it is not used in this constructor.
    source_requested : str or None
        When set, only this single-copy gene source is kept.
    run, progress
        Terminal objects used for reporting.

    Raises
    ======
    ConfigError
        If `source_requested` is not among the single-copy gene sources in the database.

    Notes
    =====
    `self.initialized_properly` is set to False whenever the domains or sources in the
    database do not match those known to the domain predictor.
    """

    self.run = run
    self.progress = progress

    self.initialized_properly = True

    self.SCG_domain_predictor = scgdomainclassifier.Predict(argparse.Namespace(), run=terminal.Run(verbose=False), progress=self.progress)

    # hi db
    contigs_db = dbops.ContigsDatabase(contigs_db_path)

    # make sure we let go of the database even if one of the reads below fails
    try:
        # read info table to get what is available in the db
        info_table = contigs_db.db.get_table_as_dict(t.hmm_hits_info_table_name)

        # identify and remove non-single-copy sources of hmm search results:
        non_singlecopy_sources = {k for k in info_table if info_table[k]['search_type'] != 'singlecopy'}
        singlecopy_sources = {k for k in info_table if info_table[k]['search_type'] == 'singlecopy'}
        for non_singlecopy_source in non_singlecopy_sources:
            info_table.pop(non_singlecopy_source)

        # get the hmm hits table
        self.hmm_hits_table = contigs_db.db.get_table_as_dict(t.hmm_hits_table_name)

        # read search table (which holds hmmscan hits for splits).
        self.hmm_hits_splits_table = utils.get_filtered_dict(contigs_db.db.get_table_as_dict(t.hmm_hits_splits_table_name), 'source', singlecopy_sources)

        # an example entry in self.hmm_hits_splits_table looks like this:
        #
        # {
        #    'percentage_in_split'   : 69.6763202725724,
        #    'source'                : u'Bacteria_74',
        #    'split'                 : u'ANTARCTICAAQUATIC_SMPL_SITE231_3.0UMcontig18439_split_00001',
        #    'hmm_hit_entry_id'      : 1
        # }
        #

        # a little convenience for potential clients:
        self.http_refs = {}
        for source_in_db in info_table:
            self.http_refs[source_in_db] = [h for h in info_table[source_in_db]['ref'].split() if h.startswith('http')][0]

        self.genes_in_db = {s: info_table[s]['genes'].split(', ') for s in info_table}
    finally:
        # we're done with the db
        contigs_db.disconnect()

    self.sources = list(info_table.keys())
    self.domains = {info_table[source]['domain'] for source in self.sources}
    self.source_to_domain = {source: info_table[source]['domain'] for source in self.sources}
    self.domain_to_sources = [(domain, [source for source in self.sources if info_table[source]['domain'] == domain]) for domain in self.domains]

    # compatibility sanity checks 1/2: make sure domains between domain predictor and the contigs database match
    self.domains_missing_in_SCG_domain_predictor = [d for d in self.domains if d not in self.SCG_domain_predictor.SCG_domains]
    self.domains_missing_in_SCGs_run_for_contigs = [d for d in self.SCG_domain_predictor.SCG_domains if d not in self.domains]

    if len(self.domains_missing_in_SCG_domain_predictor):
        num_domains_missing = len(self.domains_missing_in_SCG_domain_predictor)
        self.progress.reset()
        self.run.warning("OK. We have a problem. You seem to have single-copy core gene collections for among your HMM hits %s that "
                         "are not included when the anvi'o domain predictor was trained :/ Here is the list of domains that are making "
                         "us upset here: \"%s\". This means either you put a new HMM single-copy core gene collection to the anvi'o HMMs "
                         "directory, or gave it as a parameter, and run `anvi-run-hmms` without updating the classifier anvi'o uses to "
                         "resolve domains for proper completion/redundancy estimates." % \
                                   ('a domain' if num_domains_missing == 1 else '%s domains' % num_domains_missing,
                                    ', '.join(self.domains_missing_in_SCG_domain_predictor)))
        self.initialized_properly = False

    if len(self.domains_missing_in_SCGs_run_for_contigs):
        num_domains_missing = len(self.domains_missing_in_SCGs_run_for_contigs)

        if anvio.DEBUG:
            self.progress.reset()
            # the list reported here must be the domains the contigs database lacks, i.e.
            # the same list `num_domains_missing` was computed from.
            self.run.warning("It seems %d of the domains that are known to the classifier anvi'o uses to predict "
                             "domains for completion estimation are missing from this contigs database. This means, the user didn't run the "
                             "program `anvi-run-hmms` with default parameters, or removed some essential SCG domains from it later. Here is "
                             "the list of domains that are making us upset here: \"%s\". Running `anvi-run-hmms` on this your contigs database "
                             "will likely address this warning." % (num_domains_missing, ', '.join(self.domains_missing_in_SCGs_run_for_contigs)))

        # since we just established that the user did not run these domains for their contigs
        # database, make sure none of them lingers in `self.domains` to keep downstream trouble
        # to a minimum. `set.discard(set(...))` would look for a single frozenset element and
        # never remove anything, hence `difference_update`. (by construction these domains are
        # not in `self.domains`, so this is only a safeguard.)
        self.domains.difference_update(self.domains_missing_in_SCGs_run_for_contigs)

        self.initialized_properly = False

    # compatibility sanity checks 2/2: make sure sources in domain predictor to those in the contigs database
    self.sources_missing_in_SCGs_run_for_contigs = [s for s in self.SCG_domain_predictor.SCG_sources if s not in self.sources]
    self.sources_missing_in_SCG_domain_predictor = [s for s in self.sources if s not in self.SCG_domain_predictor.SCG_sources]

    if len(self.sources_missing_in_SCGs_run_for_contigs):
        num_sources_missing = len(self.sources_missing_in_SCGs_run_for_contigs)

        if anvio.DEBUG:
            self.progress.reset()
            self.run.warning("All the SCG domains necessary to run the predictor covered in the contigs database, however, %s that are used "
                             "during the training of the domain predictor does not seem to occur in it :/ Here is the list of HMM sources "
                             "that are making us upset here: \"%s\". This most likely means that either a new version of anvi'o are used with "
                             "an older set of single-copy core gene sources, or someone is exploring new single-copy core gene sources to see "
                             "how they behave. That's all good and very exciting, but unfortunately anvi'o will not be able to predict domains "
                             "due to this incompatibility here. Running `anvi-run-hmms` on this contigs database would've solved this problem "
                             "but it is not an absolute necessity as anvi'o will continue running by not utilizing domain-specific HMMs for "
                             "completion/redundancy estimates, and report all the results all at once without prioritizing a single domain." % \
                                       ('an HMM source' if num_sources_missing == 1 else '%s HMM sources' % num_sources_missing,
                                        ', '.join(self.sources_missing_in_SCGs_run_for_contigs)))

        self.initialized_properly = False

    if source_requested:
        if source_requested not in self.sources:
            raise ConfigError('Requested source "%s" is not one of the single-copy gene sources found in the database.' % source_requested)

        # filter out sources that are not requested
        self.sources = [source_requested]
        self.genes_in_db = {source_requested: self.genes_in_db[source_requested]}
        self.hmm_hits_splits_table = utils.get_filtered_dict(self.hmm_hits_splits_table, 'source', set([source_requested]))

    # these will be very useful later. trust me.
    self.unique_gene_id_to_gene_name = {}
    self.splits_unique_gene_id_occurs = {}
    for entry in self.hmm_hits_splits_table.values():
        hmm_hit = self.hmm_hits_table[entry['hmm_hit_entry_id']]
        gene_unique_identifier = hmm_hit['gene_unique_identifier']

        # the first gene name seen for an identifier wins
        self.unique_gene_id_to_gene_name.setdefault(gene_unique_identifier, hmm_hit['gene_name'])
        self.splits_unique_gene_id_occurs.setdefault(gene_unique_identifier, []).append(entry['split'])
def __init__(self, args, run=terminal.Run(), progress=terminal.Progress(), progress_title=None):
    """Collects MODELLER parameters from `args`, runs the sanity check, and copies the
    target FASTA file into the working directory.

    Parameters
    ==========
    args : namespace-like
        Object whose `__dict__` may carry `scoring_method`, `deviation`, `directory`,
        `very_fast`, `modeller_executable`, `num_models`, `target_fasta_path`,
        `modeller_database`, `max_number_templates`, and `percent_identical_cutoff`.
        Keys that are absent result in `None` (or a default where one is given below).
    run, progress
        Terminal objects used for reporting.
    progress_title : str or None
        Title for progress reporting; a default that mentions the gene id is used
        when it is not provided.
    """

    self.args = args
    self.run = run
    self.progress = progress

    up_to_date_modeller_exec = "mod9.20" # default exec to use

    A = lambda x, t: t(args.__dict__[x]) if x in self.args.__dict__ else None
    null = lambda x: x
    self.scoring_method = A('scoring_method', str)
    self.directory = A('directory', str)
    self.very_fast = A('very_fast', bool)
    self.executable = A('modeller_executable', null) or up_to_date_modeller_exec
    self.num_models = A('num_models', int)
    self.target_fasta_path = A('target_fasta_path', str)
    self.modeller_database = A('modeller_database', str) or "pdb_95"
    self.max_number_templates = A('max_number_templates', null)
    self.percent_identical_cutoff = A('percent_identical_cutoff', null)
    # `deviation` used to be assigned twice (first through `float`, then unconverted);
    # only the second assignment had any lasting effect, so that one is kept.
    self.deviation = A('deviation', null)

    self.alignment_pap_path = None
    self.alignment_pir_path = None
    self.get_template_path = None
    self.search_results_path = None
    self.target_pir_path = None
    self.template_family_matrix_path = None
    self.template_info_path = None
    self.template_pdbs = None
    self.model_info = None

    self.logs = {}
    self.scripts = {}

    self.sanity_check()

    # as reward, whoever called this class will receive self.out when they run self.process()
    # NOTE(review): `self.corresponding_gene_call` is not assigned in this constructor --
    # presumably `self.sanity_check()` sets it; confirm.
    self.out = {
        "templates"                : {"pdb_id": [], "chain_id": [], "ppi": []},
        "models"                   : {"molpdf": [], "GA341_score": [], "DOPE_score": [], "picked_as_best": []},
        "corresponding_gene_call"  : self.corresponding_gene_call,
        "structure_exists"         : False,
        "best_model_path"          : None,
        "best_score"               : None,
        "scoring_method"           : self.scoring_method,
        "percent_identical_cutoff" : self.percent_identical_cutoff,
        "very_fast"                : self.very_fast,
        "deviation"                : self.deviation,
    }

    # All MODELLER databases are housed in self.database_dir
    self.database_dir = J(os.path.dirname(anvio.__file__), 'data/misc/MODELLER/db')

    # copy fasta into the working directory
    try:
        shutil.copy2(self.target_fasta_path, self.directory)
    except shutil.SameFileError:
        # the file already lives in the working directory, the path is good as it is
        pass
    else:
        # `copy2` places the file directly under `self.directory` using its base name,
        # so the new path must be built from the base name as well. joining with the
        # full original path would point to a non-existent location for relative paths
        # with directories, and back to the original file for absolute paths.
        self.target_fasta_path = J(self.directory, os.path.basename(self.target_fasta_path))

    # store the original directory so we can cd back and forth between
    # self.directory and self.start_dir
    self.start_dir = os.getcwd()

    self.progress_title = progress_title
    if not self.progress_title:
        self.progress_title = "Running MODELLER for gene id {}".format(self.corresponding_gene_call)
def __init__(self, args=None, run=terminal.Run(), progress=terminal.Progress()):
    """Sets up the rules, directories, default config, and acceptable parameters of the
    contigs workflow.

    Parameters
    ==========
    args : namespace-like or None
        Must be provided when a regular `ContigsDBWorkflow` instance is generated, and
        must be omitted when this constructor is called from an inheriting class (which
        is expected to have set `self.name` already).
    run, progress
        Terminal objects stored on the instance.

    Raises
    ======
    ConfigError
        If `args` is given although the instance already has attributes, if `args` is
        missing and the instance has no attributes, or if the inheriting class did not
        set `self.name`.
    """

    # if a regular instance of `ContigsDBWorkflow` is being generated, we
    # expect it to have a parameter `args`. if there is no `args` given, we
    # assume the class is being inherited as a base class from within another
    if args:
        if self.__dict__:
            raise ConfigError("Something is wrong. You are inheriting `ContigsDBWorkflow` from "
                              "within another class, yet you are providing an `args` parameter. "
                              "This is not alright.")
        self.args = args
        self.name = 'contigs'
    else:
        if not self.__dict__:
            raise ConfigError("When you are *not* inheriting `ContigsDBWorkflow` from within "
                              "a super class, you must provide an `args` parameter.")

        if 'name' not in self.__dict__:
            raise ConfigError("The super class trying to inherit `ContigsDBWorkflow` does not "
                              "have a set `self.name`. Which means there may be other things "
                              "wrong with it, hence anvi'o refuses to continue.")

    self.run = run
    self.progress = progress

    # initialize the base class
    WorkflowSuperClass.__init__(self)

    self.rules.extend(['anvi_script_reformat_fasta',
                       'remove_human_dna_using_centrifuge',
                       'anvi_gen_contigs_database',
                       'export_gene_calls_for_centrifuge',
                       'centrifuge',
                       'anvi_import_taxonomy',
                       'anvi_run_hmms',
                       'anvi_run_ncbi_cogs',
                       'annotate_contigs_database',
                       'anvi_get_sequences_for_gene_calls',
                       'emapper',
                       'anvi_script_run_eggnog_mapper'])

    self.general_params.extend(["fasta_txt"])

    self.dirs_dict.update({"FASTA_DIR": "01_FASTA",
                           "CONTIGS_DIR": "02_CONTIGS"})

    self.default_config.update({"fasta_txt": "fasta.txt",
                                "anvi_gen_contigs_database": {"--project-name": "{group}", "threads": 5},
                                "centrifuge": {"threads": 5},
                                "anvi_run_hmms": {"run": True, "threads": 20},
                                "anvi_run_ncbi_cogs": {"run": True, "threads": 5},
                                "anvi_script_reformat_fasta": {"run": True, "--simplify-names": True},
                                "emapper": {"--database": "bact", "--usemem": True, "--override": True},
                                "anvi_script_run_eggnog_mapper": {"--use-version": "0.12.6"}})

    self.rule_acceptable_params_dict['anvi_run_ncbi_cogs'] = ['run', '--cog-data-dir', '--sensitive',
                                                              '--temporary-dir-path', '--search-with']

    self.rule_acceptable_params_dict['anvi_run_hmms'] = ['run', '--installed-hmm-profile', '--hmm-profile-dir']

    self.rule_acceptable_params_dict['centrifuge'] = ['run', 'db']

    self.rule_acceptable_params_dict['emapper'] = ['--database', '--usemem', '--override', 'path_to_emapper_dir']

    self.rule_acceptable_params_dict['anvi_script_run_eggnog_mapper'] = ['run', '--cog-data-dir', '--drop-previous-annotations',
                                                                         '--use-version']

    self.rule_acceptable_params_dict['anvi_script_reformat_fasta'] = ['run', '--simplify-names', '--keep-ids',
                                                                      '--exclude-ids', '--min-len']

    self.rule_acceptable_params_dict['remove_human_dna_using_centrifuge'] = ['run']

    # each acceptable parameter is listed once (the list used to repeat several of them)
    gen_contigs_params = ['--description', '--skip-gene-calling', '--external-gene-calls',
                          '--ignore-internal-stop-codons', '--skip-mindful-splitting',
                          '--contigs-fasta', '--project-name', '--split-length', '--kmer-size']

    self.rule_acceptable_params_dict['anvi_gen_contigs_database'] = gen_contigs_params
# some tests for SCG taxonomy string processing

import argparse

import anvio.terminal as terminal
import anvio.scgtaxonomyops as scgtaxonomyops

levels_of_taxonomy = ["t_domain", "t_phylum", "t_class", "t_order", "t_family", "t_genus", "t_species"]

# quiet instances of the two classes under test, both created with sanity checks skipped
c = scgtaxonomyops.PopulateContigsDatabaseWithSCGTaxonomy(argparse.Namespace(skip_sanity_check=True),
                                                          run=terminal.Run(verbose=False))
p = scgtaxonomyops.SCGTaxonomyEstimator(argparse.Namespace(skip_sanity_check=True, skip_init=True),
                                        run=terminal.Run(verbose=False))


def cX():
    """Consensus hit for the module-level `scg_raw_hits`, which is looked up at call time."""
    return c.get_consensus_hit(scg_raw_hits)


def cT(level):
    """The `level` entry of the consensus hit returned by `cX`."""
    consensus_hit = cX()
    return consensus_hit[level]


def pX(scg_dict):
    """Adds a `tax_hash` to every entry of `scg_dict` (in place) and returns its consensus taxonomy."""
    for entry_key in scg_dict:
        entry = scg_dict[entry_key]
        entry['tax_hash'] = scgtaxonomyops.HASH(entry)

    return p.get_consensus_taxonomy(scg_dict)


def pT(level):
    """The `level` entry of the consensus taxonomy for the module-level `scg_dict`."""
    consensus_taxonomy = pX(scg_dict)
    return consensus_taxonomy[level]
def do_profile_db(self):
    """Creates a `PROFILE.db` in the bin output directory and fills it from the parent
    profile database, then runs hierarchical clusterings on it unless
    `self.skip_hierarchical_clustering` is set."""

    self.progress.update('Subsetting the profile database')

    bin_profile_db_path = os.path.join(self.bin_output_directory, 'PROFILE.db')

    subset_db = dbops.ProfileDatabase(bin_profile_db_path)
    subset_db.touch()

    # tables that will largely stay the same are copied over from the parent as they are
    for unchanged_table in ('self', 'views', 'states'):
        subset_db.db.copy_paste(table_name=unchanged_table, source_db_path=self.profile_db_path)

    # update some values
    subset_db.db.update_meta_value('contigs_db_hash', self.contigs_db_hash)
    subset_db.db.update_meta_value('available_clusterings', None)

    # filtering rules for migrating data. the merged atomic data tables are created on the
    # fly during merge, hence `touch()` above did not create them and they have to be created
    # here. each table created in the target db is also registered for data migration.
    migration_rules = {}
    sample_names = self.summary.p_meta['samples']

    for atomic_data_field in t.atomic_data_table_structure[1:-1]:
        for level in ('splits', 'contigs'):
            merged_table_name = '_'.join((atomic_data_field, level))

            column_names = ['contig'] + sample_names + ['__parent__']
            column_types = ['text'] + ['numeric'] * len(sample_names) + ['text']

            subset_db.db.create_table(merged_table_name, column_names, column_types)
            migration_rules[merged_table_name] = ('contig', self.split_names)

    subset_db.disconnect()

    self.migrate_data(migration_rules, self.profile_db_path, bin_profile_db_path)

    self.progress.end()

    if self.skip_hierarchical_clustering:
        return

    dbops.do_hierarchical_clusterings(self.split_names,
                                      bin_profile_db_path,
                                      constants.clustering_configs['merged'],
                                      self.database_paths,
                                      self.bin_output_directory,
                                      default_clustering_config=constants.merged_default,
                                      distance=self.distance,
                                      linkage=self.linkage,
                                      run=terminal.Run(verbose=False),
                                      progress=self.progress)
import anvio.utils as utils
import anvio.terminal as terminal

from anvio.tables.tableops import Table
from anvio.errors import ConfigError


__author__ = "Developers of anvi'o (see AUTHORS.txt)"
__copyright__ = "Copyleft 2015-2018, the Meren Lab (http://merenlab.org/)"
__credits__ = []
__license__ = "GPL 3.0"
# NOTE(review): the bare name `anvio` is not bound by the imports visible here --
# confirm that `import anvio` precedes this line in the full file.
__version__ = anvio.__version__
__maintainer__ = "A. Murat Eren"
__email__ = "*****@*****.**"
__status__ = "Development"


# module-level terminal helpers; they also serve as the default `run` / `progress`
# arguments of the class below
run = terminal.Run()
progress = terminal.Progress()
pp = terminal.pretty_print


class TableForGeneLevelCoverages(Table):
    """A `Table` subclass for gene-level coverages (going by its name)."""

    def __init__(self, db_path, parameters, mode, split_names=None, ignore_splits_name_check=False, run=run, progress=progress):
        """Stores the terminal objects on the instance.

        NOTE(review): in the lines visible here `db_path`, `parameters`, `mode`,
        `split_names`, and `ignore_splits_name_check` are not used, and `Table.__init__`
        is not called -- confirm whether the constructor continues beyond this point.
        """

        self.run = run
        self.progress = progress
def __store_concatenated_hmm_sequences_into_FASTA(self, hmm_sequences_dict_for_splits, output_file_path, partition_file_path=None,
                                                  wrap=120, separator='XXX', genes_order=None, align_with=None, just_do_it=False):
    """Generates concatenated sequences from `hmm_sequences_dict_for_splits` dict.

    Please do NOT directly access to this function, and use `store_hmm_sequences_into_FASTA`
    instead.

    Parameters
    ==========
    hmm_sequences_dict_for_splits : dict
        Its values are dicts that carry (at least) `gene_name`, `bin_id`, and `sequence`.
    output_file_path : str
        FASTA file to write; one entry per bin.
    partition_file_path : str or None
        When set, a NEXUS partition file is generated at this path.
    wrap : int
        Width to wrap sequences at; a false value disables wrapping.
    separator : str
        String placed between the sequences of consecutive genes.
    genes_order : list or None
        Genes to concatenate, in this order. When not set, all gene names found in the
        input dictionary are used in sorted order.
    align_with : str or None
        Passed to `self.get_aligner` to pick the aligner.
    just_do_it : bool
        Allow concatenating genes when `self.sources` does not hold exactly one source.

    Raises
    ======
    ConfigError
        If the number of HMM sources is not one and `just_do_it` is not set, or if
        `genes_order` has genes that are not described by the HMM sources.
    """

    # number of gap characters used for a gene that is missing from every bin
    num_gaps_for_gene_missing_everywhere = 42

    if len(self.sources) != 1:
        if just_do_it:
            self.run.warning("You have asked anvi'o to not pay attention to the fact that you are asking for genes to be concatenated "
                             "that are coming from different HMM collections. Fingers crossed. Please check the deflines of the "
                             "resulting FASTA file carefully.")
        else:
            raise ConfigError("In theory you should be requesting a single HMM source if you want your genes to be concatenated. "
                              "But in practice everyone has different needs, so we don't know. If this is not due to an error on "
                              "your part, and if you think you know what you are doing, you can ask anvi'o to let you concatenate "
                              "genes from multiple HMM sources by using the flag `--just-do-it`. In that case you will not see this "
                              "error, but you must be extremely careful to make sure the resulting file looks like it should, and "
                              "the information it contains makes sense. Since this not the common practice, you may run into other "
                              "errors downstream, for which we apologize in advance.")

    # if the user did not define a single HMM source, then it will recover all genes in all HMM sources.
    gene_names_in_source = [g.strip()
                            for _hmm_source in self.sources
                            for g in self.hmm_hits_info[_hmm_source]['genes'].split(',')]

    # let's get an instance of the aligner early on so we learn about issues before its too late.
    aligner = self.get_aligner(align_with)

    # lets learn about what we have in this dictionary first.
    bin_names_in_dict = list({x['bin_id'] for x in hmm_sequences_dict_for_splits.values()})
    gene_names_in_dict = sorted({x['gene_name'] for x in hmm_sequences_dict_for_splits.values()})

    # if the function is called with a particular set and order of genes, use those, otherwise
    # stick with the gene names / order we found in the dictionary.
    if genes_order:
        genes_in_genes_order_but_missing_in_hmm_source = [g for g in genes_order if g not in gene_names_in_source]
        if len(genes_in_genes_order_but_missing_in_hmm_source):
            raise ConfigError("One or more gene names in the genes order list does not seem to appear among the genes described "
                              "by the HMM sources (which translates to 'terrible news'). Here are the genes that cause this "
                              "issue if you want to fix this: '%s' (and here are the HMM sources you have been using for this "
                              "operation in case it helps: '%s')." \
                                          % (', '.join(genes_in_genes_order_but_missing_in_hmm_source), ', '.join(self.sources)))
        gene_names = genes_order
    else:
        self.run.warning("You did not define any gene names. Bold move. Now anvi'o will attempt to report a file with all "
                         "genes defined in your HMM source(s). This will likely be quite ugly, so please brace yourself.")
        gene_names = gene_names_in_dict

    # gene lengths are especially important to accommodate missing genes with proper number of
    # gap characters
    gene_lengths = {}

    # build a simpler dict that keeps genes sequences for each bin for a given gene name
    genes_in_bins_dict = {}
    for entry in hmm_sequences_dict_for_splits.values():
        genes_in_bins_dict.setdefault(entry['gene_name'], {})[entry['bin_id']] = entry['sequence']

    # align homolog sequences across bins
    self.progress.new('Aligning homolog gene sequences pre-concatenation')
    all_gene_names = list(genes_in_bins_dict.keys())
    num_genes = len(all_gene_names)
    for i, gene_name in enumerate(all_gene_names):
        self.progress.update('working on %s (%d of %d) ...' % (gene_name, i + 1, num_genes))
        genes_list = list(genes_in_bins_dict[gene_name].items())
        genes_in_bins_dict[gene_name] = aligner(run=terminal.Run(verbose=False)).run_stdin(genes_list)
        # every sequence in an alignment has the same length, so any of them will do
        gene_lengths[gene_name] = len(next(iter(genes_in_bins_dict[gene_name].values())))
    self.progress.end()

    # some of the requested genes may be missing from the hmm hits dict altogether, since they
    # were not in any of the bins the dict described, but the user requested to have them in the
    # alignment anyway. This can happen when the user wants to concatenate genes from one or more
    # low-completion bins. We keep track of them to tell the user, and we register the length of
    # their gap-only placeholder so the partition file can describe them as well (without this,
    # generating the partition file fails with a KeyError for these genes).
    gene_names_missing_from_everywhere = []
    for gene_name in gene_names:
        if gene_name not in genes_in_bins_dict and gene_name not in gene_lengths:
            gene_lengths[gene_name] = num_gaps_for_gene_missing_everywhere
            gene_names_missing_from_everywhere.append(gene_name)

    # concatenate all of them and write them in a file
    with open(output_file_path, 'w') as output:
        for bin_name in bin_names_in_dict:
            sequences_list = []
            for gene_name in gene_names:
                sequences_for_gene = genes_in_bins_dict.get(gene_name, {})
                if bin_name in sequences_for_gene:
                    sequences_list.append(sequences_for_gene[bin_name])
                else:
                    # this bin does not have this gene: gaps all the way
                    sequences_list.append('-' * gene_lengths[gene_name])

            sequence = separator.join(sequences_list)

            if wrap:
                sequence = textwrap.fill(sequence, wrap, break_on_hyphens=False)

            output.write('>%s num_genes:%d|genes:%s|separator:%s\n' % (bin_name, len(gene_names), ','.join(gene_names), separator))
            output.write('%s\n' % sequence)

    if len(gene_names_missing_from_everywhere):
        self.run.warning("You asked for some genes that were missing from all bins this class had in the "
                         "HMM hits dictionary (here is a list of them: '%s'). Not knowing what to do with this werid "
                         "situation, anvi'o put gap characters for all of them and retained your order. Here are those "
                         "genes that missed the party: '%s'" % \
                                    (', '.join(bin_names_in_dict), ', '.join(gene_names_missing_from_everywhere)))

    if partition_file_path:
        utils.gen_NEXUS_format_partition_file_for_phylogenomics(partition_file_path,
                                                                [(g, gene_lengths[g]) for g in gene_names],
                                                                separator,
                                                                run=self.run,
                                                                progress=self.progress)