def __init__(self, args, run=terminal.Run(), progress=terminal.Progress()):
    """Read the user arguments and immediately add and/or remove genes in a structure database.

    Parameters
    ----------
    args : object
        Namespace-like object; its `__dict__` is searched for `contigs_db`, `structure_db`,
        `genes_to_remove`, `genes_to_remove_file`, `genes_to_add`, `genes_to_add_file`,
        `dump_dir` and `modeller_executable`. Any missing entry is stored as None.
    run, progress :
        Terminal reporting objects stored on the instance.

    Raises
    ------
    ConfigError
        If no gene to add or remove is given, or if the same set of genes is given both
        directly and through a file.
    """
    self.args = args
    self.run = run
    self.progress = progress

    # all parameters are optional: an absent key simply yields None
    user_params = self.args.__dict__
    self.contigs_db_path = user_params.get('contigs_db')
    self.structure_db_path = user_params.get('structure_db')
    self.genes_to_remove = user_params.get('genes_to_remove')
    self.genes_to_remove_path = user_params.get('genes_to_remove_file')
    self.genes_to_add = user_params.get('genes_to_add')
    self.genes_to_add_path = user_params.get('genes_to_add_file')
    self.full_modeller_output = user_params.get('dump_dir')
    self.modeller_executable = user_params.get('modeller_executable')
    self.DSSP_executable = None

    # make sure we are dealing with a contigs database, then remember its hash
    utils.is_contigs_db(self.contigs_db_path)
    self.contigs_db = dbops.ContigsDatabase(self.contigs_db_path)
    self.contigs_db_hash = self.contigs_db.meta['contigs_db_hash']

    # init ContigsSuperClass
    self.contigs_super = ContigsSuperclass(self.args)

    wants_removal = self.genes_to_remove or self.genes_to_remove_path
    wants_addition = self.genes_to_add or self.genes_to_add_path

    if not (wants_removal or wants_addition):
        raise ConfigError("Please specify some genes to add or remove to your database.")

    if self.genes_to_remove and self.genes_to_remove_path:
        raise ConfigError("Provide either --genes-to-remove or --genes-to-remove-path. You provided both.")

    if self.genes_to_add and self.genes_to_add_path:
        raise ConfigError("Provide either --genes-to-add or --genes-to-add-path. You provided both.")

    # removal is carried out first, on its own database connection
    if wants_removal:
        self.run.warning("Removing genes...", header="Updating %s" % self.structure_db_path, lc='green')
        self.load_structure_db()
        genes_marked_for_removal = self.parse_genes(self.genes_to_remove, self.genes_to_remove_path)
        self.remove_genes(genes_marked_for_removal)
        self.structure_db.disconnect()

    if wants_addition:
        self.run.warning("Adding genes...", header="Updating %s" % self.structure_db_path, lc='green')
        self.load_structure_db()
        self.add_genes()
def __init__(self, args, run=terminal.Run(), progress=terminal.Progress()):
    """Collect the user arguments, validate them, and create a new structure database.

    Parameters
    ----------
    args : object
        Namespace-like object whose `__dict__` holds the user parameters. Any parameter
        that is not present is stored as None.
    run, progress :
        Terminal reporting objects stored on the instance.

    Raises
    ------
    ConfigError
        If the output database path does not end with '.db' (other checks are delegated
        to the helpers called below).
    """
    self.args = args
    self.run = run
    self.progress = progress

    given = self.args.__dict__

    def fetch(name, cast=None):
        # None when the parameter was not provided, otherwise the (optionally cast) value
        if name not in given:
            return None
        return given[name] if cast is None else cast(given[name])

    # general parameters
    self.contigs_db_path = fetch('contigs_db')
    self.genes_of_interest_path = fetch('genes_of_interest')
    self.splits_of_interest_path = fetch('splits_of_interest')
    self.bin_id = fetch('bin_id')
    self.collection_name = fetch('collection_name')
    self.gene_caller_ids = fetch('gene_caller_ids')
    self.output_db_path = fetch('output_db_path')
    self.full_modeller_output = fetch('dump_dir')
    self.skip_DSSP = fetch('skip_DSSP', bool)
    self.modeller_executable = fetch('modeller_executable')
    self.DSSP_executable = None

    utils.is_contigs_db(self.contigs_db_path)
    self.contigs_db = dbops.ContigsDatabase(self.contigs_db_path)
    self.contigs_db_hash = self.contigs_db.meta['contigs_db_hash']

    # MODELLER params
    self.modeller_database = fetch('modeller_database')
    self.scoring_method = fetch('scoring_method')
    self.max_number_templates = fetch('max_number_templates')
    self.percent_identical_cutoff = fetch('percent_identical_cutoff')
    self.num_models = fetch('num_models')
    self.deviation = fetch('deviation')
    self.very_fast = fetch('very_fast', bool)

    # the output database falls back to a default name and must carry the '.db' suffix
    self.output_db_path = self.output_db_path or "STRUCTURE.db"
    if not self.output_db_path.endswith('.db'):
        raise ConfigError("The structure database output file (`-o / --output`) must end with '.db'")
    filesnpaths.is_output_file_writable(self.output_db_path)

    # the optional MODELLER dump directory must not exist yet
    if self.full_modeller_output:
        self.full_modeller_output = filesnpaths.check_output_directory(self.full_modeller_output,
                                                                       ok_if_exists=False)

    # identify which genes user wants to model structures for
    self.genes_of_interest = self.get_genes_of_interest(self.genes_of_interest_path, self.gene_caller_ids)

    self.sanity_check()

    # residue annotation: the sources determine the extra columns of the residue info table
    self.residue_annotation_sources_info = self.get_residue_annotation_sources_info()
    extra_columns, extra_types = self.get_residue_info_table_structure()
    self.residue_info_table_structure = extra_columns
    self.residue_info_table_types = extra_types
    self.residue_annotation_df = pd.DataFrame({})

    # initialize StructureDatabase
    self.structure_db = StructureDatabase(self.output_db_path,
                                          self.contigs_db_hash,
                                          residue_info_structure_extras=self.residue_info_table_structure,
                                          residue_info_types_extras=self.residue_info_table_types,
                                          create_new=True)

    # init ContigsSuperClass
    self.contigs_super = ContigsSuperclass(self.args)
class Structure(object):
    """Model protein structures for genes of a contigs database and store them in a structure database.

    For every gene of interest the amino acid sequence is exported, MODELLER is run on it, the
    residues of the best model are annotated (DSSP unless skipped, contact map, residue
    identities), and the results are appended to the tables of a newly created structure
    database.
    """

    def __init__(self, args, run=terminal.Run(), progress=terminal.Progress()):
        """Collect the user arguments, validate them, and create the structure database.

        Parameters
        ----------
        args : object
            Namespace-like object whose `__dict__` holds the user parameters. Parameters that
            are not present are stored as None.
        run, progress :
            Terminal reporting objects stored on the instance.
        """
        self.args = args
        self.run = run
        self.progress = progress

        # initialize self.arg parameters
        A = lambda x, t: t(args.__dict__[x]) if x in self.args.__dict__ else None
        null = lambda x: x
        self.contigs_db_path = A('contigs_db', null)
        self.genes_of_interest_path = A('genes_of_interest', null)
        self.splits_of_interest_path = A('splits_of_interest', null)
        self.bin_id = A('bin_id', null)
        self.collection_name = A('collection_name', null)
        self.gene_caller_ids = A('gene_caller_ids', null)
        self.output_db_path = A('output_db_path', null)
        self.full_modeller_output = A('dump_dir', null)
        self.skip_DSSP = A('skip_DSSP', bool)
        self.modeller_executable = A('modeller_executable', null)
        self.DSSP_executable = None

        utils.is_contigs_db(self.contigs_db_path)
        self.contigs_db = dbops.ContigsDatabase(self.contigs_db_path)
        self.contigs_db_hash = self.contigs_db.meta['contigs_db_hash']

        # MODELLER params
        self.modeller_database = A('modeller_database', null)
        self.scoring_method = A('scoring_method', null)
        self.max_number_templates = A('max_number_templates', null)
        self.percent_identical_cutoff = A('percent_identical_cutoff', null)
        self.num_models = A('num_models', null)
        self.deviation = A('deviation', null)
        self.very_fast = A('very_fast', bool)

        # check database output
        if not self.output_db_path:
            self.output_db_path = "STRUCTURE.db"
        if not self.output_db_path.endswith('.db'):
            raise ConfigError("The structure database output file (`-o / --output`) must end with '.db'")
        filesnpaths.is_output_file_writable(self.output_db_path)

        # check modeller output
        if self.full_modeller_output:
            self.full_modeller_output = filesnpaths.check_output_directory(self.full_modeller_output,
                                                                           ok_if_exists=False)

        # identify which genes user wants to model structures for
        self.genes_of_interest = self.get_genes_of_interest(self.genes_of_interest_path, self.gene_caller_ids)

        self.sanity_check()

        # residue annotation
        self.residue_annotation_sources_info = self.get_residue_annotation_sources_info()
        self.residue_info_table_structure, self.residue_info_table_types = self.get_residue_info_table_structure()
        self.residue_annotation_df = pd.DataFrame({})

        # initialize StructureDatabase
        self.structure_db = StructureDatabase(self.output_db_path,
                                              self.contigs_db_hash,
                                              residue_info_structure_extras=self.residue_info_table_structure,
                                              residue_info_types_extras=self.residue_info_table_types,
                                              create_new=True)

        # init ContigsSuperClass
        self.contigs_super = ContigsSuperclass(self.args)

    def get_residue_info_table_structure(self):
        """Return the extra column names and types of the residue info table.

        Table structure is dependent on which residue annotation sources are available or of
        interest. That's why it is defined on the fly when db is created. To generate on the
        fly, the columns from each source are added, but only if skip=False for the residue
        annotation source. codon_order_in_gene is ignored since it is common to each residue
        annotation source and is already present in t.structure_residue_info_table_structure.

        Returns
        -------
        (list, list)
            Column names and, in the same order, their types.
        """
        structure = []
        types = []
        for info in self.residue_annotation_sources_info.values():
            # sources without a `structure` entry contribute no extra columns
            if info["skip"] or not info.get("structure"):
                continue

            for column, column_type in info["structure"].items():
                if column != "codon_order_in_gene":
                    structure.append(column)
                    types.append(column_type)

        return structure, types

    def get_residue_annotation_sources_info(self):
        """Describe every residue annotation source.

        The residue_annotation_sources_info is a dictionary spelling out all column names
        relevant to each annotation source, the method which returns the annotation dataframe,
        and the boolean stating whether or not the annotation source will be called. Those
        without a `structure` key are necessarily run and the columns they produce are
        statically present in t.structure_residue_info_table_structure.
        """
        dssp_source = t.residue_info_sources["DSSP"]

        residue_annotation_sources_info = {
            "DSSP": {
                "method": self.run_DSSP,
                "skip": self.skip_DSSP,
                "structure": dict(zip(dssp_source["structure"], dssp_source["types"])),
            },
            "contact_map": {
                "method": self.run_contact_map,
                "skip": False,
            },
            "residue_identities": {
                "method": self.run_residue_identity_annotation,
                "skip": False,
            },
        }
        return residue_annotation_sources_info

    def sanity_check(self):
        """Validate the requested genes and locate the DSSP executable.

        Raises
        ------
        ConfigError
            If a requested gene caller id is unknown to the contigs database, or if DSSP is
            required but neither `mkdssp` nor `dssp` is found.
        """
        # check for genes that do not appear in the contigs database
        bad_gene_caller_ids = [g for g in self.genes_of_interest if g not in self.genes_in_contigs_database]
        if bad_gene_caller_ids:
            if len(bad_gene_caller_ids) == 1:
                subject = "This gene caller id you provided is"
            else:
                subject = "These gene caller ids you provided are"

            raise ConfigError(subject + (" not known to this contigs database: {}. "
                                         "You have only 2 lives left. 2 more mistakes, and anvi'o will "
                                         "automatically uninstall itself. Yes, seriously :("
                                         ).format(", ".join([str(x) for x in bad_gene_caller_ids])))

        # Finally, raise warning if number of genes is greater than 20
        if len(self.genes_of_interest) > 20:
            self.run.warning(("Modelling protein structures is no joke. The number of genes you want "
                              "protein structures for is {}, which is a lot (of time!). If its taking too "
                              "long, consider using the --very-fast flag. CTRL + C to cancel."
                              ).format(len(self.genes_of_interest)))

        # if self.percent_identical_cutoff is < 25, you should be careful about accuracy of models.
        # The cutoff is None when the argument was not supplied; comparing None with an int would
        # raise a TypeError, so the warning is only considered when a value is present.
        if self.percent_identical_cutoff is not None and self.percent_identical_cutoff < 25:
            self.run.warning(("You selected a percent identical cutoff of {}%. Below 25%, you should pay "
                              "close attention to the quality of the proteins..."
                              ).format(self.percent_identical_cutoff))

        # check that DSSP exists
        if self.skip_DSSP:
            self.run.warning("You requested to skip amino acid residue annotation with DSSP. A bold move "
                             "only an expert could justify... Anvi'o's respect for you increases slightly.")
        else:
            if utils.is_program_exists("mkdssp", dont_raise=True):  # mkdssp is newer and preferred
                self.DSSP_executable = "mkdssp"

            if not self.DSSP_executable:
                if utils.is_program_exists("dssp", dont_raise=True):
                    self.DSSP_executable = "dssp"
                else:
                    raise ConfigError("An anvi'o function needs 'mkdssp' or 'dssp' to be installed on your system, but "
                                      "neither seem to appear in your path :/ If you are certain you have either on your "
                                      "system (for instance you can run either by typing 'mkdssp' or 'dssp' in your terminal "
                                      "window), you may want to send a detailed bug report. If you want to install DSSP, "
                                      "check out http://merenlab.org/2016/06/18/installing-third-party-software/#dssp. "
                                      "If you want to skip secondary structure and solvent accessibility annotation, "
                                      "provide the flag --skip-DSSP.")

            self.run.info_single("Anvi'o found the DSSP executable `%s`, and will use it."
                                 % self.DSSP_executable, nl_before=1, nl_after=1)

    def get_genes_of_interest(self, genes_of_interest_path=None, gene_caller_ids=None):
        """Return the set of gene caller ids the user wants structures for.

        As a side effect `self.genes_in_contigs_database` is set to the ids of all genes found
        in the contigs database.

        Parameters
        ----------
        genes_of_interest_path : str, optional
            Path to a single-column file with one gene caller id per line.
        gene_caller_ids : str, optional
            Comma-separated gene caller ids.

        Returns
        -------
        set
            Integer gene caller ids; all genes of the contigs database if none were specified.

        Raises
        ------
        ConfigError
            If the contigs database has no genes, if both parameters are given, or if an id
            cannot be interpreted as an integer.
        """
        genes_of_interest = None

        # identify the gene caller ids of all genes available
        self.genes_in_contigs_database = set(dbops.ContigsSuperclass(self.args).genes_in_splits.keys())

        if not self.genes_in_contigs_database:
            raise ConfigError("This contigs database does not contain any identified genes...")

        # settling genes of interest
        if genes_of_interest_path and gene_caller_ids:
            raise ConfigError("You can't provide a gene caller id from the command line, and a list of gene caller ids "
                              "as a file at the same time, obviously.")

        if gene_caller_ids:
            genes_of_interest = set()
            for gene in set(x.strip() for x in gene_caller_ids.split(',')):
                try:
                    genes_of_interest.add(int(gene))
                except ValueError:
                    # only a failed int() conversion is a user error; anything else should surface
                    raise ConfigError("Anvi'o does not like your gene caller id '%s'..." % str(gene))

        elif genes_of_interest_path:
            filesnpaths.is_file_tab_delimited(genes_of_interest_path, expected_number_of_fields=1)

            try:
                # the context manager guarantees the file handle is closed
                with open(genes_of_interest_path) as genes_file:
                    genes_of_interest = set(int(line.strip()) for line in genes_file)
            except ValueError:
                raise ConfigError("Well. Anvi'o was working on your genes of interest ... and ... those gene IDs did not "
                                  "look like anvi'o gene caller ids :/ Anvi'o is now sad.")

        if not genes_of_interest:
            # no genes of interest are specified. Assuming all, which could be innumerable--raise warning
            genes_of_interest = self.genes_in_contigs_database
            self.run.warning("You did not specify any genes of interest, so anvi'o will assume all of them are of interest.")

        return genes_of_interest

    def process(self):
        """Model, annotate and store every gene of interest, one gene at a time.

        Raises
        ------
        ConfigError
            If no structure could be modelled for any gene.
        """
        # will be empty if all sources in self.residue_annotation_sources_info have "skip": True
        residue_annotation_methods = [info["method"]
                                      for info in self.residue_annotation_sources_info.values()
                                      if not info["skip"]]

        # which genes had structures and which did not. this information is added to the structure database self table
        has_structure = {True: [], False: []}

        num_genes_tried = 0
        num_genes_to_try = len(self.genes_of_interest)

        for corresponding_gene_call in self.genes_of_interest:
            # MODELLER outputs a lot of stuff into its working directory. A temporary directory is
            # made for each instance of MODELLER (i.e. each protein), And bits and pieces of this
            # directory are used in the creation of the structure database. If self.full_modeller_output is
            # provided, these directories and their contents are moved into self.full_modeller_output.
            self.args.directory = filesnpaths.get_temp_directory_path()
            self.args.target_fasta_path = filesnpaths.get_temp_file_path()

            # Export sequence
            dbops.export_aa_sequences_from_contigs_db(self.contigs_db_path,
                                                      self.args.target_fasta_path,
                                                      set([corresponding_gene_call]),
                                                      quiet=True)

            # Model structure
            progress_title = 'Modelling gene ID %d; (%d of %d processed)' % (corresponding_gene_call,
                                                                              num_genes_tried,
                                                                              num_genes_to_try)
            modeller_out = self.run_modeller(corresponding_gene_call, progress_title)
            structure_exists = modeller_out["structure_exists"]

            if structure_exists:
                self.run.info_single("Gene successfully modelled!", nl_after=1, mc="green")

            has_structure[structure_exists].append(str(corresponding_gene_call))

            # Annotate residues
            residue_info_dataframe = None
            if structure_exists:
                residue_info_dataframe = self.run_residue_annotation_for_gene(residue_annotation_methods,
                                                                              corresponding_gene_call,
                                                                              modeller_out["best_model_path"])

            # Append info to tables
            self.append_gene_info_to_tables(modeller_out, residue_info_dataframe)

            # Append metadata to self
            self.update_structure_database_meta_table(has_structure)

            if self.full_modeller_output:
                self.dump_results_to_full_output()

            num_genes_tried += 1

        if not has_structure[True]:
            raise ConfigError("Well this is really sad. No structures were modelled, so there is nothing to do. Bye :'(")

        self.structure_db.disconnect()
        self.run.info("Structure database", self.output_db_path)

    def update_structure_database_meta_table(self, has_structure):
        """Write which genes were queried / modelled, plus the run parameters, to the meta table.

        Parameters
        ----------
        has_structure : dict
            Maps True / False to the list of gene caller ids (as strings) with / without a
            modelled structure.
        """
        meta = self.structure_db.db

        if self.structure_db.create_new:
            meta.set_meta_value('genes_queried', ",".join([str(g) for g in self.genes_of_interest]))
            meta.set_meta_value('genes_with_structure', ",".join(has_structure[True]))
            meta.set_meta_value('genes_without_structure', ",".join(has_structure[False]))
            meta.set_meta_value('modeller_database', self.modeller.modeller_database)
            meta.set_meta_value('scoring_method', self.scoring_method)
            meta.set_meta_value('percent_identical_cutoff', str(self.percent_identical_cutoff))
            meta.set_meta_value('very_fast', str(int(self.very_fast)))
            meta.set_meta_value('deviation', self.deviation)
            meta.set_meta_value('max_number_templates', self.max_number_templates)
            meta.set_meta_value('num_models', self.num_models)
            for key, val in self.residue_annotation_sources_info.items():
                meta.set_meta_value("skip_" + key, str(int(val["skip"])))
        else:
            # an existing database: extend what is already recorded
            new_genes_queried = list(self.structure_db.genes_queried) + list(self.genes_of_interest)
            new_genes_with_structure = list(self.structure_db.genes_with_structure) + has_structure[True]
            new_genes_without_structure = list(self.structure_db.genes_without_structure) + has_structure[False]

            meta.update_meta_value('genes_queried', ",".join([str(x) for x in new_genes_queried]))
            meta.update_meta_value('genes_with_structure', ",".join([str(x) for x in new_genes_with_structure]))
            meta.update_meta_value('genes_without_structure', ",".join([str(x) for x in new_genes_without_structure]))

    def run_residue_annotation_for_gene(self, residue_annotation_methods, corresponding_gene_call, pdb_filepath):
        """Run every annotation method on one gene and merge the results column-wise.

        Parameters
        ----------
        residue_annotation_methods : list of callables
            Each is called as `method(corresponding_gene_call, pdb_filepath)` and returns a
            dataframe.
        corresponding_gene_call : int
        pdb_filepath : str

        Returns
        -------
        pandas.DataFrame
            The merged annotations with `entry_id`, `corresponding_gene_call` and
            `codon_order_in_gene` inserted as the first three columns.
        """
        # residue_annotation_for_gene is a dataframe that stores residue annotations made by all residue
        # annotation methods (e.g. DSSP) for the current corresponding_gene_call. Each time a
        # residue annotation source is ran, its results are appended as columns to
        # residue_annotation_for_gene. All annotation sources must have the index called
        # "codon_order_in_gene" whose values are anvi'o-indexed, i.e. the methionine has index 0.
        # Each annotation source does NOT have to annotate each residue in the gene.
        residue_annotation_for_gene = pd.DataFrame({})
        for method in residue_annotation_methods:
            residue_annotation_for_gene = pd.concat([residue_annotation_for_gene,
                                                     method(corresponding_gene_call, pdb_filepath)],
                                                    axis=1,
                                                    sort=True)

        # add entry_id, corresponding_gene_call and codon_order_in_gene as the leading columns
        residue_annotation_for_gene.insert(0, "entry_id", list(range(residue_annotation_for_gene.shape[0])))
        residue_annotation_for_gene.insert(1, "corresponding_gene_call", corresponding_gene_call)
        residue_annotation_for_gene.insert(2, "codon_order_in_gene", residue_annotation_for_gene.index)

        return residue_annotation_for_gene

    def dump_results_to_full_output(self):
        """Move the working directory of the latest MODELLER run into the dump directory.

        The directory ends up at `<self.full_modeller_output>/<gene caller id>`.
        """
        # os.path.join only accepts path-like components; str() keeps this working whether the
        # gene call is held as an int or as a string
        output_gene_dir = os.path.join(self.full_modeller_output, str(self.modeller.corresponding_gene_call))
        filesnpaths.check_output_directory(output_gene_dir)
        shutil.move(self.modeller.directory, output_gene_dir)

    def run_residue_identity_annotation(self, corresponding_gene_call, pdb_filepath):
        """Return codon number, codon and amino acid for each codon of the gene (stop codon excluded).

        `pdb_filepath` is unused; it is accepted so all annotation methods share one signature.
        """
        nt_sequence = self.contigs_super.get_sequences_for_gene_callers_ids([corresponding_gene_call],
                                                                            reverse_complement_if_necessary=True)
        nt_sequence = nt_sequence[1][corresponding_gene_call]['sequence']

        seq_dict = {"codon_order_in_gene": [],
                    "codon_number": [],
                    "codon": [],
                    "amino_acid": []}

        gene_length_in_codons = len(nt_sequence) // 3 - 1  # subtract 1 because it's the stop codon
        for codon_order_in_gene in range(gene_length_in_codons):
            codon = nt_sequence[3 * codon_order_in_gene:3 * (codon_order_in_gene + 1)]
            seq_dict["codon_order_in_gene"].append(codon_order_in_gene)
            seq_dict["codon_number"].append(codon_order_in_gene + 1)
            seq_dict["codon"].append(codon)
            seq_dict["amino_acid"].append(constants.codon_to_AA[codon])

        return pd.DataFrame(seq_dict).set_index("codon_order_in_gene")

    def run_contact_map(self, corresponding_gene_call, pdb_filepath):
        """Return, for each residue, the comma-separated 1-based numbers of the residues it contacts.

        `corresponding_gene_call` is unused; it is accepted so all annotation methods share one
        signature.
        """
        contact_map_matrix = ContactMap(pdb_filepath).compute_contact_map()

        contacts_dict = {"codon_order_in_gene": [],
                         "contact_numbers": []}

        for codon_order_in_gene in range(contact_map_matrix.shape[0]):
            # positions flagged with 1 in this row, shifted by one to become residue numbers
            contacts = np.add(np.where(contact_map_matrix[codon_order_in_gene, :] == 1)[0], 1).astype(str)

            contacts_dict["codon_order_in_gene"].append(codon_order_in_gene)
            contacts_dict["contact_numbers"].append(",".join(contacts))

        return pd.DataFrame(contacts_dict).set_index("codon_order_in_gene")

    def run_DSSP(self, corresponding_gene_call, pdb_filepath):
        """Annotate the residues of a structure with DSSP.

        DSSP is ran using the API developed in Biopython. That means we don't work directly
        from the text output of DSSP, but rather a Biopython object.

        Raises
        ------
        ConfigError
            If the DSSP executable returns no residue annotation.
        """
        # Determine the model name by loading the structure file
        p = PDBParser()
        structure = p.get_structure(corresponding_gene_call, pdb_filepath)
        model = structure[0]  # pdb files can have multiple models. DSSP assumes the first.

        # run DSSP
        residue_annotation = DSSP(model, pdb_filepath, dssp=self.DSSP_executable, acc_array="Wilke")

        if not len(residue_annotation.keys()):
            raise ConfigError(("Your executable of DSSP, `{}`, exists but didn't return any meaningful output. This "
                               "is a known issue with certain distributions of DSSP. For information on how to test "
                               "that your version is working correctly, please visit "
                               "http://merenlab.org/2016/06/18/installing-third-party-software/#dssp"
                               ).format(self.DSSP_executable))

        # convert to a digestible format
        return self.convert_DSSP_output_from_biopython_to_dataframe(residue_annotation)

    def convert_DSSP_output_from_biopython_to_dataframe(self, dssp_biopython_object):
        """Convert the Biopython DSSP object into an anvi'o-formatted dataframe.

        From the DSSP module in Biopython:
            ============ ==================== ================
            Tuple Index  Biopython            Anvi'o
            ============ ==================== ================
            0            DSSP index           codon_order_in_gene
            1            Amino acid           aa
            2            Secondary structure  sec_struct
            3            Relative ASA         rel_solvent_acc
            4            Phi                  phi
            5            Psi                  psi
            6            NH__>O_1_relidx      NH_O_1_index
            7            NH__>O_1_energy      NH_O_1_energy
            8            O__>NH_1_relidx      O_NH_1_index
            9            O__>NH_1_energy      O_NH_1_energy
            10           NH__>O_2_relidx      NH_O_2_index
            11           NH__>O_2_energy      NH_O_2_energy
            12           O__>NH_2_relidx      O_NH_2_index
            13           O__>NH_2_energy      O_NH_2_energy
            ============ ==================== ================

        Changes from Biopython format to anvi'o format:
            - residue index converted from 1Met to 0Met
            - aa converted to 3-letter code
            - ss type "-" is converted to coil (C)
            - relative indicies for h-bonds replaced with absolute residue indices
              (e.g. if relative index = -1 for residue 4, the absolute residue index is 3)
        """
        one_to_three = {v: k for k, v in constants.AA_to_single_letter_code.items()}
        columns = list(self.residue_annotation_sources_info["DSSP"]["structure"].keys())

        # position of every column within a DSSP tuple, looked up once instead of per residue
        position = {name: index for index, name in enumerate(columns)}
        residue_pos = position["codon_order_in_gene"]
        aa_pos = position["aa"]
        sec_struct_pos = position["sec_struct"]

        # convert biopython object to dictionary d
        d = {}
        for key in dssp_biopython_object.keys():
            row = list(dssp_biopython_object[key])

            row[residue_pos] = utils.convert_sequence_indexing(row[residue_pos], source="M1", destination="M0")
            row[aa_pos] = one_to_three[row[aa_pos]]

            if row[sec_struct_pos] == "-":
                row[sec_struct_pos] = "C"

            for hbond in ["NH_O_1", "O_NH_1", "NH_O_2", "O_NH_2"]:
                index_pos = position[hbond + "_index"]
                energy_pos = position[hbond + "_energy"]

                if row[index_pos] == 0:
                    # a relative index of 0 is stored as a missing h-bond
                    row[index_pos] = np.nan
                    row[energy_pos] = np.nan
                else:
                    row[index_pos] = row[residue_pos] + row[index_pos]

            d[key] = row

        # convert dictionary d to dataframe df
        return pd.DataFrame(d, index=columns).T.set_index("codon_order_in_gene")

    def run_modeller(self, corresponding_gene_call, progress_title):
        """Run MODELLER with the current `self.args` and return its output dictionary.

        The MODELLER instance is kept as `self.modeller`.
        """
        self.modeller = MODELLER.MODELLER(self.args, run=self.run, progress=self.progress, progress_title=progress_title)
        modeller_out = self.modeller.process()

        return modeller_out

    def append_gene_info_to_tables(self, modeller_out, residue_info_dataframe):
        """Append the results for one gene to the structure database tables and store them.

        Modeller and residue annotation sources have been called, now it is time to wrangle
        these data into formats that can be appended to their respective structure database
        tables.

        Parameters
        ----------
        modeller_out : dict
            Output of MODELLER; the keys `corresponding_gene_call`, `templates`,
            `structure_exists`, `models` and `best_model_path` are read.
        residue_info_dataframe : pandas.DataFrame or None
            Residue annotations; only used when a structure exists.
        """
        corresponding_gene_call = modeller_out["corresponding_gene_call"]
        entries = self.structure_db.entries

        # templates is always added, even when structure was not modelled
        templates = pd.DataFrame(modeller_out["templates"])
        templates.insert(0, "corresponding_gene_call", corresponding_gene_call)
        templates = templates.reset_index().rename(columns={"index": "entry_id"})
        # DataFrame.append no longer exists in pandas >= 2.0; pd.concat is the equivalent
        entries[t.structure_templates_table_name] = pd.concat([entries[t.structure_templates_table_name], templates],
                                                              sort=False)
        self.structure_db.store(t.structure_templates_table_name, key="entry_id")

        # entries that are only added if a structure was modelled
        if modeller_out["structure_exists"]:

            # models
            models = pd.DataFrame(modeller_out["models"])
            models.insert(0, "corresponding_gene_call", corresponding_gene_call)
            models = models.reset_index().rename(columns={"index": "entry_id"})
            entries[t.structure_models_table_name] = pd.concat([entries[t.structure_models_table_name], models],
                                                               sort=False)
            self.structure_db.store(t.structure_models_table_name, key="entry_id")

            # pdb file data
            with open(modeller_out["best_model_path"], 'rb') as pdb_file:
                pdb_contents = pdb_file.read()
            pdb_table_entry = (corresponding_gene_call, pdb_contents)
            # the pdb data entries are list-appended in place, unlike the dataframes above
            entries[t.structure_pdb_data_table_name].append(pdb_table_entry)
            self.structure_db.store(t.structure_pdb_data_table_name)

            # residue_info
            entries[t.structure_residue_info_table_name] = pd.concat([entries[t.structure_residue_info_table_name],
                                                                      residue_info_dataframe],
                                                                     sort=False)
            self.structure_db.store(t.structure_residue_info_table_name, key="entry_id")
get_taxo_line = "anvi-estimate-scg-taxonomy --quiet -T {threads} -c {temp_folder}/clean_bins/{bin_id}/{bin_id}.db -o {tempfile}" head = ["d__", "p__", "c__", "o__", "f__", "g__", "s__"] for bin_id in tqdm(os.listdir(pjoin(temp_folder, "clean_bins"))): if not os.path.exists(pjoin(temp_folder, "clean_bins", bin_id, bin_id + ".db")): call(f"anvi-gen-contigs-database --ignore-internal-stop-codons --quiet -n {binset_name} -f {temp_folder}/clean_bins/{bin_id}/{bin_id}.fna -o {temp_folder}/clean_bins/{bin_id/{bin_id}.db -T {threads} --skip-gene-calling") if os.path.isdir(pjoin(temp_folder, "clean_bins", bin_id)) and bin_id not in stats: tt = ContigSummarizer(pjoin(temp_folder, "clean_bins", bin_id, bin_id + ".db")).get_contigs_db_info_dict(gene_caller_to_use="Prodigal") t_file = NamedTemporaryFile() formating_dat['bin_id'] = bin_id formating_dat['tempfile'] = t_file.name call(get_taxo_line.format(**formating_dat), shell = True) with open(t_file.name) as handle: handle.readline().split() scg_taxo = handle.readline().strip().split("\t") params.__dict__['contigs_db'] = pjoin(temp_folder, "clean_bins", bin_id, bin_id + ".db") c = ContigsSuperclass(params) calls = c.get_sequences_for_gene_callers_ids(simple_headers=False)[1] with open(pjoin(temp_folder, binset_name + ".faa"), "a") as handle: seqs = [] for k,v in calls.items(): ss = Seq(v['sequence']).translate() seqs.append(SeqRecord(ss, id = bin_id + ";" + str(k), description = "")) SeqIO.write(seqs, handle, "fasta") export_sequences_from_contigs_db(pjoin(temp_folder, "clean_bins", bin_id, bin_id + ".db"), t_file.name) call("cat {tempfile} >> {temp_folder}/{binset_name}.fna".format(**formating_dat), shell = True) t_file.close() est_coding = tt['avg_gene_length']*tt['num_genes']/tt['total_length'] tt = {k : v for k,v in tt.items() if k in fields} stats[bin_id] = tt if scg_taxo[0] != '':
def populate_search_tables(self, sources=None):
    """Run every HMM source against this contigs database and store the resulting hits.

    Parameters
    ----------
    sources : dict, optional
        Maps a source name to a dictionary from which the keys `target`, `kind`, `domain`,
        `genes`, `model`, `ref` and `noise_cutoff_terms` are read. When empty or None, the
        sources shipped in `anvio.data.hmm.sources` are used.

    Raises
    ------
    ConfigError
        If a source needs gene calls but gene calling was skipped for this database, or if a
        source asks for amino acid sequences in the CONTIG context.
    """
    # Importing a submodule inside a function (`import anvio.data.hmm` below) binds `anvio` as
    # a *local* name for the whole function body. If that import does not run (because the
    # caller supplied `sources`), a later `anvio.DEBUG` would fail with "local variable
    # referenced before assignment". Importing the package unconditionally up front makes
    # the local name always bound.
    import anvio

    if not sources:
        import anvio.data.hmm
        sources = anvio.data.hmm.sources

    if not sources:
        return

    target_files_dict = {}

    tmp_directory_path = filesnpaths.get_temp_directory_path()

    # minimal argument container for ContigsSuperclass
    class Args: pass

    # here we will go through targets and populate target_files_dict based on what we find among them.
    targets = set(s['target'] for s in sources.values())
    for target in targets:
        alphabet, context = utils.anvio_hmm_target_term_to_alphabet_and_context(target)

        if not self.genes_are_called and context != "CONTIG":
            raise ConfigError("You are in trouble. The gene calling was skipped for this contigs database, yet anvi'o asked to run an "
                              "HMM profile that wishes to operate on %s context using the %s alphabet. It is not OK. You still could run "
                              "HMM profiles that does not require gene calls to be present (such as the HMM profile that identifies Ribosomal "
                              "RNAs in contigs, but for that you would have to explicitly ask for it by using the additional parameter "
                              "'--installed-hmm-profile Ribosomal_RNAs')." % (context, alphabet))

        self.run.info('Target found', '%s:%s' % (alphabet, context))

        args = Args()
        args.contigs_db = self.db_path
        contigs_db = ContigsSuperclass(args, r=terminal.Run(verbose=False))

        if context == 'GENE':
            gene_target = '%s:GENE' % alphabet
            target_files_dict[gene_target] = os.path.join(tmp_directory_path, '%s_gene_sequences.fa' % alphabet)
            contigs_db.gen_FASTA_file_of_sequences_for_gene_caller_ids(output_file_path=target_files_dict[gene_target],
                                                                       simple_headers=True,
                                                                       rna_alphabet=alphabet == 'RNA',
                                                                       report_aa_sequences=alphabet == 'AA')
        elif context == 'CONTIG':
            if alphabet == 'AA':
                raise ConfigError("You are somewhere you shouldn't be. You came here because you thought it would be OK "
                                  "to ask for AA sequences in the CONTIG context. The answer to that is 'no, thanks'. If "
                                  "you think this is dumb, please let us know.")
            else:
                contig_target = '%s:CONTIG' % alphabet
                target_files_dict[contig_target] = os.path.join(tmp_directory_path, '%s_contig_sequences.fa' % alphabet)
                utils.export_sequences_from_contigs_db(self.db_path,
                                                       target_files_dict[contig_target],
                                                       rna_alphabet=alphabet == 'RNA')

    commander = HMMer(target_files_dict, num_threads_to_use=self.num_threads_to_use)

    for source, source_info in sources.items():
        alphabet, context = utils.anvio_hmm_target_term_to_alphabet_and_context(source_info['target'])

        kind_of_search = source_info['kind']
        domain = source_info['domain']
        all_genes_searched_against = source_info['genes']
        hmm_model = source_info['model']
        reference = source_info['ref']
        noise_cutoff_terms = source_info['noise_cutoff_terms']

        hmm_scan_hits_txt = commander.run_hmmscan(source,
                                                  alphabet,
                                                  context,
                                                  kind_of_search,
                                                  domain,
                                                  len(all_genes_searched_against),
                                                  hmm_model,
                                                  reference,
                                                  noise_cutoff_terms)

        if not hmm_scan_hits_txt:
            search_results_dict = {}
        else:
            parser = parser_modules['search']['hmmscan'](hmm_scan_hits_txt, alphabet=alphabet, context=context)
            search_results_dict = parser.get_search_results()

        if not len(search_results_dict):
            # reported through self.run, like every other message of this method
            self.run.info_single("The HMM source '%s' returned 0 hits. SAD (but it's stil OK)." % source, nl_before=1)

        if context == 'CONTIG':
            # we are in trouble here. because our search results dictionary contains no gene calls, but contig
            # names that contain our hits. on the other hand, the rest of the code outside of this if statement
            # expects a `search_results_dict` with gene callers id in it. so there are two things we need
            # to do. one is to come up with some new gene calls and add them to the contigs database. so things
            # will go smoothly downstream. two, we will need to update our `search_results_dict` so it looks
            # like a dictionary the rest of the code expects with `gene_callers_id` fields. both of these
            # steps are going to be taken care of in the following function. magic.

            if source != "Ribosomal_RNAs":
                self.run.warning("You just called an HMM profile that runs on contigs and not genes. Because this HMM "
                                 "operation is not directly working with gene calls anvi'o already knows about, the resulting "
                                 "hits will need to be added as 'new gene calls' into the contigs database. So far so good. "
                                 "But because we are in the contigs realm rater than genes realm, it is likely that "
                                 "resulting hits will not correspond to open reading frames that are supposed to be "
                                 "translated (such as ribosomal RNAs), because otherwise you would be working with genes "
                                 "instad of defining CONTIGS as your context in that HMM profile you just used unless you "
                                 "not sure what you are doing. Hence, anvi'o will not report amino acid sequences for the "
                                 "new gene calls it will recover through these HMMs. Please take a moment and you be the "
                                 "judge of whether this will influence your pangenomic analyses or other things you thought "
                                 "you would be doing with the result of this HMM search downstream. If you do not feel like "
                                 "being the judge of anything today you can move on yet remember to remember this if things "
                                 "look somewhat weird later on.",
                                 header="Psst. Your fancy HMM profile '%s' speaking" % source,
                                 lc="green")

            num_hits_before = len(search_results_dict)
            search_results_dict = utils.get_pruned_HMM_hits_dict(search_results_dict)
            num_hits_after = len(search_results_dict)

            if num_hits_before != num_hits_after:
                self.run.info('Pruned', '%d out of %d hits were removed due to redundancy' % (num_hits_before - num_hits_after, num_hits_before))

            search_results_dict = self.add_new_gene_calls_to_contigs_db_and_update_serach_results_dict(kind_of_search,
                                                                                                       search_results_dict,
                                                                                                       skip_amino_acid_sequences=True)

        self.append(source, reference, kind_of_search, domain, all_genes_searched_against, search_results_dict)

    # temporary files are kept in debug mode so they can be inspected
    if not anvio.DEBUG:
        commander.clean_tmp_dirs()
        for v in target_files_dict.values():
            os.remove(v)
def populate_search_tables(self, sources=None):
    """Run every HMM source against this contigs database and store the hits.

    Parameters
    ----------
    sources : dict, optional
        Maps a source name to a dict with the keys 'target', 'kind', 'domain',
        'genes', 'model', 'ref' and 'noise_cutoff_terms'. When omitted (or
        empty), `anvio.data.hmm.sources` is used instead.

    For each distinct target (alphabet:context) a FASTA file is exported into a
    temporary directory, every source is then searched with HMMer, and the
    results are handed to `self.append`. Hits found in the CONTIG context are
    first pruned and converted into new gene calls.

    Raises
    ------
    ConfigError
        If a source asks for amino acid sequences in the CONTIG context.
    """

    # `None` (the default) or an empty dict both mean "use the default HMM sources"
    if not sources:
        import anvio.data.hmm
        sources = anvio.data.hmm.sources

    if not sources:
        return

    target_files_dict = {}

    tmp_directory_path = filesnpaths.get_temp_directory_path()

    # here we will go through targets and populate target_files_dict based on what we find among them.
    targets = set(s['target'] for s in sources.values())
    for target in targets:
        alphabet, context = utils.anvio_hmm_target_term_to_alphabet_and_context(target)

        self.run.info('Target found', '%s:%s' % (alphabet, context))

        if context == 'GENE':
            # the contigs superclass is only needed to export gene sequences, so it is
            # not instantiated for CONTIG targets
            class Args: pass
            args = Args()
            args.contigs_db = self.db_path
            contigs_db = ContigsSuperclass(args)

            gene_fasta_path = os.path.join(tmp_directory_path, '%s_gene_sequences.fa' % alphabet)
            target_files_dict['%s:GENE' % alphabet] = gene_fasta_path
            contigs_db.gen_FASTA_file_of_sequences_for_gene_caller_ids(output_file_path=gene_fasta_path,
                                                                       simple_headers=True,
                                                                       rna_alphabet=alphabet == 'RNA',
                                                                       report_aa_sequences=alphabet == 'AA')
        elif context == 'CONTIG':
            if alphabet == 'AA':
                raise ConfigError("You are somewhere you shouldn't be. You came here because you thought it would be OK\
                                   to ask for AA sequences in the CONTIG context. The answer to that is 'no, thanks'. If\
                                   you think this is dumb, please let us know.")

            contig_fasta_path = os.path.join(tmp_directory_path, '%s_contig_sequences.fa' % alphabet)
            target_files_dict['%s:CONTIG' % alphabet] = contig_fasta_path
            utils.export_sequences_from_contigs_db(self.db_path,
                                                   contig_fasta_path,
                                                   rna_alphabet=alphabet == 'RNA')

    commander = HMMer(target_files_dict, num_threads_to_use=self.num_threads_to_use)

    for source in sources:
        source_info = sources[source]

        alphabet, context = utils.anvio_hmm_target_term_to_alphabet_and_context(source_info['target'])

        kind_of_search = source_info['kind']
        domain = source_info['domain']
        all_genes_searched_against = source_info['genes']
        hmm_model = source_info['model']
        reference = source_info['ref']
        noise_cutoff_terms = source_info['noise_cutoff_terms']

        hmm_scan_hits_txt = commander.run_hmmscan(source,
                                                  alphabet,
                                                  context,
                                                  kind_of_search,
                                                  domain,
                                                  all_genes_searched_against,
                                                  hmm_model,
                                                  reference,
                                                  noise_cutoff_terms)

        if not hmm_scan_hits_txt:
            search_results_dict = {}
        else:
            parser = parser_modules['search']['hmmscan'](hmm_scan_hits_txt, alphabet=alphabet, context=context)
            search_results_dict = parser.get_search_results()

        if not search_results_dict:
            self.run.info_single("The HMM source '%s' returned 0 hits. SAD (but it's stil OK)." % source, nl_before=1)

        if context == 'CONTIG':
            # we are in trouble here. because our search results dictionary contains no gene calls, but contig
            # names that contain our hits. on the other hand, the rest of the code outside of this if statement
            # expects a `search_results_dict` with gene callers id in it. so there are two things we need to do.
            # one is to come up with some new gene calls and add them to the contigs database, so things
            # will go smoothly downstream. two, we will need to update our `search_results_dict` so it looks
            # like a dictionary the rest of the code expects with `gene_callers_id` fields. both of these
            # steps are going to be taken care of in the following function. magic.
            self.run.warning("Alright! You just called an HMM profile that runs on contigs. Because it is not\
                              working with anvi'o gene calls directly, the resulting hits will need to be added\
                              as 'new gene calls' into the contigs database. This is a new feature, and if it\
                              starts screwing things up for you please let us know. Other than that you're pretty\
                              much golden. Carry on.",
                             header="Psst. Your fancy HMM profile '%s' speaking" % source,
                             lc="green")

            num_hits_before = len(search_results_dict)
            search_results_dict = utils.get_pruned_HMM_hits_dict(search_results_dict)
            num_hits_after = len(search_results_dict)

            if num_hits_before != num_hits_after:
                self.run.info('Pruned', '%d out of %d hits were removed due to redundancy' % (num_hits_before - num_hits_after, num_hits_before))

            search_results_dict = self.add_new_gene_calls_to_contigs_db_and_update_serach_results_dict(kind_of_search, search_results_dict)

        self.append(source, reference, kind_of_search, domain, all_genes_searched_against, search_results_dict)

    # the function-level `import anvio.data.hmm` above makes `anvio` a *local* name for
    # this whole function. when that branch is skipped the local name is unbound, so
    # `anvio` has to be imported again here before `anvio.DEBUG` can be read.
    import anvio
    if not anvio.DEBUG:
        commander.clean_tmp_dirs()
        for v in list(target_files_dict.values()):
            os.remove(v)
class Structure(object):
    """Model protein structures for genes of a contigs database and store them in a structure database.

    Construction parses `args`, validates the output paths, settles the genes of
    interest, runs `sanity_check` and creates a new `StructureDatabase`.
    `process` then runs MODELLER for every gene of interest, annotates the residues
    of each modelled structure and appends the results to the database tables.
    """

    def __init__(self, args, run=terminal.Run(), progress=terminal.Progress()):
        self.args = args
        self.run = run
        self.progress = progress

        # initialize self.arg parameters. `A` returns None for arguments missing from `args`
        A = lambda x, t: t(args.__dict__[x]) if x in self.args.__dict__ else None
        null = lambda x: x
        self.contigs_db_path = A('contigs_db', null)
        self.genes_of_interest_path = A('genes_of_interest', null)
        self.splits_of_interest_path = A('splits_of_interest', null)
        self.bin_id = A('bin_id', null)
        self.collection_name = A('collection_name', null)
        self.gene_caller_ids = A('gene_caller_ids', null)
        self.output_db_path = A('output_db_path', null)
        self.full_modeller_output = A('dump_dir', null)
        self.skip_DSSP = A('skip_DSSP', bool)
        self.modeller_executable = A('modeller_executable', null)
        self.DSSP_executable = None  # settled in `sanity_check`

        utils.is_contigs_db(self.contigs_db_path)
        self.contigs_db = dbops.ContigsDatabase(self.contigs_db_path)
        self.contigs_db_hash = self.contigs_db.meta['contigs_db_hash']

        # MODELLER params
        self.modeller_database = A('modeller_database', null)
        self.scoring_method = A('scoring_method', null)
        self.max_number_templates = A('max_number_templates', null)
        self.percent_identical_cutoff = A('percent_identical_cutoff', null)
        self.num_models = A('num_models', null)
        self.deviation = A('deviation', null)
        self.very_fast = A('very_fast', bool)

        # check database output
        if not self.output_db_path:
            self.output_db_path = "STRUCTURE.db"
        if not self.output_db_path.endswith('.db'):
            raise ConfigError("The structure database output file (`-o / --output`) must end with '.db'")
        filesnpaths.is_output_file_writable(self.output_db_path)

        # check modeller output
        if self.full_modeller_output:
            self.full_modeller_output = filesnpaths.check_output_directory(self.full_modeller_output, ok_if_exists=False)

        # identify which genes user wants to model structures for
        self.genes_of_interest = self.get_genes_of_interest(self.genes_of_interest_path, self.gene_caller_ids)

        self.sanity_check()

        # residue annotation
        self.annotation_sources_info = self.get_annotation_sources_info()
        self.residue_info_table_structure, self.residue_info_table_types = self.get_residue_info_table_structure()
        self.res_annotation_df = pd.DataFrame({})

        # initialize StructureDatabase
        self.structure_db = StructureDatabase(self.output_db_path,
                                              self.contigs_db_hash,
                                              residue_info_structure_extras=self.residue_info_table_structure,
                                              residue_info_types_extras=self.residue_info_table_types,
                                              create_new=True)

        # init ContigsSuperClass
        self.contigs_super = ContigsSuperclass(self.args)


    def get_residue_info_table_structure(self):
        """Return the extra column names and types of the residue info table.

        Table structure is dependent on which annotation sources are available or of
        interest. That's why it is defined on the fly when the db is created. The
        columns of each source are added only if the source is not skipped and has a
        `structure` entry. `codon_order_in_gene` is ignored since it is common to each
        annotation source and is already present in
        t.structure_residue_info_table_structure.

        Returns a tuple `(structure, types)` of two parallel lists.
        """
        structure = []
        types = []

        for info in self.annotation_sources_info.values():
            if info["skip"] or not info.get("structure"):
                continue

            for column, column_type in info["structure"].items():
                if column == "codon_order_in_gene":
                    continue
                structure.append(column)
                types.append(column_type)

        return structure, types


    def get_annotation_sources_info(self):
        """Return the dictionary that describes every residue annotation source.

        For each annotation source the dictionary spells out the method which returns
        the annotation dataframe (`method`), the boolean stating whether or not the
        annotation source will be skipped (`skip`) and, optionally, the column names
        and types it contributes (`structure`). Those without a `structure` key are
        necessarily run and the columns they produce are statically present in
        t.structure_residue_info_table_structure.
        """
        annotation_sources_info = {
            "DSSP": {
                "method": self.run_DSSP,
                "skip": self.skip_DSSP,
                "structure": dict(zip(t.residue_info_sources["DSSP"]["structure"],
                                      t.residue_info_sources["DSSP"]["types"])),
            },
            "contact_map": {
                "method": self.run_contact_map,
                "skip": False,
            },
            "residue_identities": {
                "method": self.run_residue_identity_annotation,
                "skip": False,
            },
        }
        return annotation_sources_info


    def sanity_check(self):
        """Validate the genes of interest and settle which DSSP executable to use.

        Relies on `self.genes_in_contigs_database`, which `get_genes_of_interest` sets.
        Raises ConfigError for unknown gene caller ids, or when DSSP is requested but
        neither `mkdssp` nor `dssp` is found.
        """
        # check for genes that do not appear in the contigs database
        bad_gene_caller_ids = [g for g in self.genes_of_interest if g not in self.genes_in_contigs_database]
        if bad_gene_caller_ids:
            subject = "This gene caller id you provided is" if len(bad_gene_caller_ids) == 1 else \
                      "These gene caller ids you provided are"
            raise ConfigError(subject + " not known to this contigs database: {}.\
                              You have only 2 lives left. 2 more mistakes, and anvi'o will automatically uninstall \
                              itself. Yes, seriously :(".format(", ".join([str(x) for x in bad_gene_caller_ids])))

        # Finally, raise warning if number of genes is greater than 20
        if len(self.genes_of_interest) > 20:
            self.run.warning("Modelling protein structures is no joke. The number of genes you want protein structures for is \
                              {}, which is a lot (of time!). If its taking too long, consider using the --very-fast flag. \
                              CTRL + C to cancel.".format(len(self.genes_of_interest)))

        # if self.percent_identical_cutoff is < 25, you should be careful about accuracy of models.
        # the cutoff is None when the argument was not provided, and None cannot be compared to an int
        if self.percent_identical_cutoff is not None and self.percent_identical_cutoff < 25:
            self.run.warning("You selected a percent identical cutoff of {}%. Below 25%, you should pay close attention \
                              to the quality of the proteins...".format(self.percent_identical_cutoff))

        # check that DSSP exists
        if self.skip_DSSP:
            self.run.warning("You requested to skip amino acid residue annotation with DSSP. A bold move only an expert could justify... \
                              Anvi'o's respect for you increases slightly.")
        else:
            if utils.is_program_exists("mkdssp", dont_raise=True): # mkdssp is newer and preferred
                self.DSSP_executable = "mkdssp"

            if not self.DSSP_executable:
                if utils.is_program_exists("dssp", dont_raise=True):
                    self.DSSP_executable = "dssp"
                else:
                    raise ConfigError("An anvi'o function needs 'mkdssp' or 'dssp' to be installed on your system, but\
                                       neither seem to appear in your path :/ If you are certain you have either on your\
                                       system (for instance you can run either by typing 'mkdssp' or 'dssp' in your terminal\
                                       window), you may want to send a detailed bug report. If you want to install DSSP,\
                                       check out http://merenlab.org/2016/06/18/installing-third-party-software/#dssp.\
                                       If you want to skip secondary structure and solvent accessibility annotation,\
                                       provide the flag --skip-DSSP.")

            self.run.info_single("Anvi'o found the DSSP executable `%s`, and will use it." % self.DSSP_executable, nl_before=1, nl_after=1)


    def get_genes_of_interest(self, genes_of_interest_path=None, gene_caller_ids=None):
        """Return the set of gene caller ids to model, based on user arguments.

        Parameters
        ----------
        genes_of_interest_path : str, optional
            Path to a single-column file with one gene caller id per line.
        gene_caller_ids : str, optional
            Comma-separated gene caller ids.

        Also sets `self.genes_in_contigs_database`. When neither argument yields any
        gene, every gene in the contigs database is returned and a warning is issued.

        Raises ConfigError if the database has no genes, if both arguments are given,
        or if an id cannot be converted to an integer.
        """
        genes_of_interest = None

        # identify the gene caller ids of all genes available
        self.genes_in_contigs_database = set(dbops.ContigsSuperclass(self.args).genes_in_splits.keys())

        if not self.genes_in_contigs_database:
            raise ConfigError("This contigs database does not contain any identified genes...")

        # settling genes of interest
        if genes_of_interest_path and gene_caller_ids:
            raise ConfigError("You can't provide a gene caller id from the command line, and a list of gene caller ids\
                               as a file at the same time, obviously.")

        if gene_caller_ids:
            gene_caller_ids = set([x.strip() for x in gene_caller_ids.split(',')])

            genes_of_interest = []
            for gene in gene_caller_ids:
                try:
                    genes_of_interest.append(int(gene))
                except ValueError:
                    raise ConfigError("Anvi'o does not like your gene caller id '%s'..." % str(gene))

            genes_of_interest = set(genes_of_interest)

        elif genes_of_interest_path:
            filesnpaths.is_file_tab_delimited(genes_of_interest_path, expected_number_of_fields=1)

            try:
                # the context manager makes sure the file handle is closed
                with open(genes_of_interest_path) as genes_file:
                    genes_of_interest = set([int(s.strip()) for s in genes_file.readlines()])
            except ValueError:
                raise ConfigError("Well. Anvi'o was working on your genes of interest ... and ... those gene IDs did not\
                                   look like anvi'o gene caller ids :/ Anvi'o is now sad.")

        if not genes_of_interest:
            # no genes of interest are specified. Assuming all, which could be innumerable--raise warning
            genes_of_interest = self.genes_in_contigs_database
            self.run.warning("You did not specify any genes of interest, so anvi'o will assume all of them are of interest.")

        return genes_of_interest


    def process(self):
        """Model, annotate and store every gene of interest, then disconnect from the database.

        Raises ConfigError if no structure could be modelled for any gene.
        """
        # will be empty if all sources in self.annotation_sources_info have "skip": True
        residue_annotation_methods = [info["method"] for info in self.annotation_sources_info.values() if not info["skip"]]

        # which genes had structures and which did not. this information is added to the structure database self table
        has_structure = {True: [], False: []}

        num_genes_tried = 0
        num_genes_to_try = len(self.genes_of_interest)

        for corresponding_gene_call in self.genes_of_interest:
            # MODELLER outputs a lot of stuff into its working directory. A temporary directory is
            # made for each instance of MODELLER (i.e. each protein), And bits and pieces of this
            # directory are used in the creation of the structure database. If self.full_modeller_output is
            # provided, these directories and their contents are moved into self.full_modeller_output.
            self.args.directory = filesnpaths.get_temp_directory_path()
            self.args.target_fasta_path = filesnpaths.get_temp_file_path()

            # Export sequence
            dbops.export_aa_sequences_from_contigs_db(self.contigs_db_path,
                                                      self.args.target_fasta_path,
                                                      set([corresponding_gene_call]),
                                                      quiet=True)

            # Model structure
            progress_title = 'Modelling gene ID %d; (%d of %d processed)' % (corresponding_gene_call, num_genes_tried, num_genes_to_try)
            modeller_out = self.run_modeller(corresponding_gene_call, progress_title)
            if modeller_out["structure_exists"]:
                self.run.info_single("Gene successfully modelled!", nl_after=1, mc="green")

            has_structure[modeller_out["structure_exists"]].append(str(corresponding_gene_call))

            # Annotate residues
            residue_info_dataframe = None
            if modeller_out["structure_exists"]:
                residue_info_dataframe = self.run_residue_annotation_for_gene(residue_annotation_methods,
                                                                              corresponding_gene_call,
                                                                              modeller_out["best_model_path"])

            # Append info to tables
            self.append_gene_info_to_tables(modeller_out, residue_info_dataframe)

            # Append metadata to self
            self.update_structure_database_meta_table(has_structure)

            if self.full_modeller_output:
                self.dump_results_to_full_output()

            num_genes_tried += 1

        if not has_structure[True]:
            raise ConfigError("Well this is really sad. No structures were modelled, so there is nothing to do. Bye :'(")

        self.structure_db.disconnect()


    def update_structure_database_meta_table(self, has_structure):
        """Write the run parameters and the with/without-structure gene lists to the meta table.

        `has_structure` maps True/False to lists of gene caller ids (as strings). For a
        newly created database all values are set; otherwise only the three gene lists
        are extended and updated.
        """
        db = self.structure_db.db

        if self.structure_db.create_new:
            db.set_meta_value('genes_queried', ",".join([str(g) for g in self.genes_of_interest]))
            db.set_meta_value('genes_with_structure', ",".join(has_structure[True]))
            db.set_meta_value('genes_without_structure', ",".join(has_structure[False]))
            db.set_meta_value('modeller_database', self.modeller.modeller_database)
            db.set_meta_value('scoring_method', self.scoring_method)
            db.set_meta_value('percent_identical_cutoff', str(self.percent_identical_cutoff))
            db.set_meta_value('very_fast', str(int(self.very_fast)))
            db.set_meta_value('deviation', self.deviation)
            db.set_meta_value('max_number_templates', self.max_number_templates)
            db.set_meta_value('num_models', self.num_models)
            for key, val in self.annotation_sources_info.items():
                db.set_meta_value("skip_" + key, str(int(val["skip"])))
        else:
            new_genes_queried = list(self.structure_db.genes_queried) + list(self.genes_of_interest)
            new_genes_with_structure = list(self.structure_db.genes_with_structure) + has_structure[True]
            new_genes_without_structure = list(self.structure_db.genes_without_structure) + has_structure[False]

            db.update_meta_value('genes_queried', ",".join([str(x) for x in new_genes_queried]))
            db.update_meta_value('genes_with_structure', ",".join([str(x) for x in new_genes_with_structure]))
            db.update_meta_value('genes_without_structure', ",".join([str(x) for x in new_genes_without_structure]))


    def run_residue_annotation_for_gene(self, residue_annotation_methods, corresponding_gene_call, pdb_filepath):
        """Run every annotation method for one gene and return the combined dataframe.

        Each method is called as `method(corresponding_gene_call, pdb_filepath)` and its
        result is appended as columns. The columns `entry_id`,
        `corresponding_gene_call` and `codon_order_in_gene` are inserted in front.
        """
        # res_annotation_for_gene is a dataframe that stores annotations made by all
        # annotation methods (e.g. DSSP) for the current corresponding_gene_call. Each time an annotation
        # source is ran, its results are appended as columns to res_annotation_for_gene.
        # All annotation sources must have the index called "codon_order_in_gene" whose values are
        # anvi'o-indexed, i.e. the methionine has index 0. Each annotation source does NOT have
        # to annotate each residue in the gene.
        res_annotation_for_gene = pd.DataFrame({})
        for method in residue_annotation_methods:
            res_annotation_for_gene = pd.concat([res_annotation_for_gene, method(corresponding_gene_call, pdb_filepath)], axis=1)

        # add entry_id, corresponding_gene_call and codon_order_in_gene as the first three columns
        res_annotation_for_gene.insert(0, "entry_id", list(range(res_annotation_for_gene.shape[0])))
        res_annotation_for_gene.insert(1, "corresponding_gene_call", corresponding_gene_call)
        res_annotation_for_gene.insert(2, "codon_order_in_gene", res_annotation_for_gene.index)

        return res_annotation_for_gene


    def dump_results_to_full_output(self):
        """Move MODELLER's working directory of the current gene into `self.full_modeller_output`.

        The destination is a sub-directory named after `self.modeller.corresponding_gene_call`.
        """
        output_gene_dir = os.path.join(self.full_modeller_output, self.modeller.corresponding_gene_call)
        filesnpaths.check_output_directory(output_gene_dir)
        shutil.move(self.modeller.directory, output_gene_dir)


    def run_residue_identity_annotation(self, corresponding_gene_call, pdb_filepath):
        """Return a dataframe with codon number, codon and amino acid for every codon of the gene.

        The dataframe is indexed by `codon_order_in_gene`. `pdb_filepath` is unused; it is
        accepted so all annotation methods share one signature.
        """
        nt_sequence = self.contigs_super.get_sequences_for_gene_callers_ids([corresponding_gene_call],
                                                                            reverse_complement_if_necessary=True)
        nt_sequence = nt_sequence[1][corresponding_gene_call]['sequence']

        seq_dict = {"codon_order_in_gene": [],
                    "codon_number": [],
                    "codon": [],
                    "amino_acid": []}

        gene_length_in_codons = len(nt_sequence) // 3 - 1 # subtract 1 because it's the stop codon
        for codon_order_in_gene in range(0, gene_length_in_codons):
            codon = nt_sequence[3 * codon_order_in_gene:3 * (codon_order_in_gene + 1)]
            seq_dict["codon_order_in_gene"].append(codon_order_in_gene)
            seq_dict["codon_number"].append(codon_order_in_gene + 1)
            seq_dict["codon"].append(codon)
            seq_dict["amino_acid"].append(constants.codon_to_AA[codon])

        return pd.DataFrame(seq_dict).set_index("codon_order_in_gene")


    def run_contact_map(self, corresponding_gene_call, pdb_filepath):
        """Return a dataframe listing, per residue, the 1-based numbers of residues it contacts.

        Contacts are the positions where the matrix computed by `ContactMap` equals 1;
        they are stored as a comma-separated string in the column `contact_numbers`.
        """
        contact_map_matrix = ContactMap(pdb_filepath).compute_contact_map()

        contacts_dict = {"codon_order_in_gene": [],
                         "contact_numbers": []}

        for codon_order_in_gene in range(contact_map_matrix.shape[0]):
            # +1 converts the 0-based matrix positions into codon numbers
            contacts = np.add(np.where(contact_map_matrix[codon_order_in_gene, :] == 1)[0], 1).astype(str)

            contacts_dict["codon_order_in_gene"].append(codon_order_in_gene)
            contacts_dict["contact_numbers"].append(",".join(contacts))

        return pd.DataFrame(contacts_dict).set_index("codon_order_in_gene")


    def run_DSSP(self, corresponding_gene_call, pdb_filepath):
        """Annotate the structure in `pdb_filepath` with DSSP and return the result as a dataframe.

        DSSP is ran using the API developed in Biopython. That means we don't work
        directly from the text output of DSSP, but rather a Biopython object.

        Raises ConfigError if DSSP returns no annotation at all.
        """
        # Determine the model name by loading the structure file
        p = PDBParser()
        structure = p.get_structure(corresponding_gene_call, pdb_filepath)
        model = structure[0] # pdb files can have multiple models. DSSP assumes the first.

        # run DSSP
        residue_annotation = DSSP(model, pdb_filepath, dssp=self.DSSP_executable, acc_array="Wilke")

        if not len(residue_annotation.keys()):
            raise ConfigError("Your executable of DSSP, `{}`, exists but didn't return any meaningful output. This\
                               is a known issue with certain distributions of DSSP. For information on how to test\
                               that your version is working correctly, please visit\
                               http://merenlab.org/2016/06/18/installing-third-party-software/#dssp"\
                               .format(self.DSSP_executable))

        # convert to a digestible format
        return self.convert_DSSP_output_from_biopython_to_dataframe(residue_annotation)


    def convert_DSSP_output_from_biopython_to_dataframe(self, dssp_biopython_object):
        """Convert the Biopython DSSP object into a dataframe indexed by `codon_order_in_gene`.

        From the DSSP module in Biopython:
            ============ ==================== ================
            Tuple Index  Biopython            Anvi'o
            ============ ==================== ================
            0            DSSP index           codon_order_in_gene
            1            Amino acid           aa
            2            Secondary structure  sec_struct
            3            Relative ASA         rel_solvent_acc
            4            Phi                  phi
            5            Psi                  psi
            6            NH__>O_1_relidx      NH_O_1_index
            7            NH__>O_1_energy      NH_O_1_energy
            8            O__>NH_1_relidx      O_NH_1_index
            9            O__>NH_1_energy      O_NH_1_energy
            10           NH__>O_2_relidx      NH_O_2_index
            11           NH__>O_2_energy      NH_O_2_energy
            12           O__>NH_2_relidx      O_NH_2_index
            13           O__>NH_2_energy      O_NH_2_energy
            ============ ==================== ================

        Changes from Biopython format to anvi'o format:
            - residue index converted from 1Met to 0Met
            - aa converted to 3-letter code
            - ss type "-" is converted to coil (C)
            - relative indicies for h-bonds replaced with absolute residue indices
              (e.g. if relative index = -1 for residue 4, the absolute residue index is 3)
            - h-bonds with a relative index of 0 get NaN for both index and energy
        """
        one_to_three = {v: k for k, v in constants.AA_to_single_letter_code.items()}
        columns = list(self.annotation_sources_info["DSSP"]["structure"].keys())

        # position of every column within a DSSP tuple, looked up once instead of per residue
        position = {name: i for i, name in enumerate(columns)}
        order_pos = position["codon_order_in_gene"]
        aa_pos = position["aa"]
        sec_struct_pos = position["sec_struct"]

        # convert biopython object to dictionary d
        d = {}
        for key in dssp_biopython_object.keys():
            row = list(dssp_biopython_object[key])
            row[order_pos] = utils.convert_sequence_indexing(row[order_pos], source="M1", destination="M0")
            row[aa_pos] = one_to_three[row[aa_pos]]

            if row[sec_struct_pos] == "-":
                row[sec_struct_pos] = "C"

            res_index = row[order_pos]
            for hbond in ["NH_O_1", "O_NH_1", "NH_O_2", "O_NH_2"]:
                index_pos = position[hbond + "_index"]
                rel_index = row[index_pos]

                if rel_index == 0:
                    row[index_pos] = np.nan
                    row[position[hbond + "_energy"]] = np.nan
                else:
                    row[index_pos] = res_index + rel_index

            d[key] = row

        # convert dictionary d to dataframe df
        return pd.DataFrame(d, index=columns).T.set_index("codon_order_in_gene")


    def run_modeller(self, corresponding_gene_call, progress_title):
        """Run MODELLER with `self.args` and return what its `process` method returns.

        The MODELLER instance is kept as `self.modeller`.
        """
        self.modeller = MODELLER.MODELLER(self.args, run=self.run, progress=self.progress, progress_title=progress_title)
        modeller_out = self.modeller.process()

        return modeller_out


    def append_gene_info_to_tables(self, modeller_out, residue_info_dataframe):
        """Append the results of one gene to the structure database tables and store them.

        Modeller and residue annotation sources have been called, now it is time to
        wrangle these data into formats that can be appended to their respective
        structure database tables. Templates are always added; models, the pdb file
        contents and the residue info are only added if a structure was modelled.
        """
        corresponding_gene_call = modeller_out["corresponding_gene_call"]
        entries = self.structure_db.entries

        # templates is always added, even when structure was not modelled
        templates = pd.DataFrame(modeller_out["templates"])
        templates.insert(0, "corresponding_gene_call", corresponding_gene_call)
        templates = templates.reset_index().rename(columns={"index": "entry_id"})
        # pd.concat is used because DataFrame.append no longer exists in recent pandas
        entries[t.structure_templates_table_name] = pd.concat([entries[t.structure_templates_table_name], templates])
        self.structure_db.store(t.structure_templates_table_name, key="entry_id")

        # entries that are only added if a structure was modelled
        if modeller_out["structure_exists"]:

            # models
            models = pd.DataFrame(modeller_out["models"])
            models.insert(0, "corresponding_gene_call", corresponding_gene_call)
            models = models.reset_index().rename(columns={"index": "entry_id"})
            entries[t.structure_models_table_name] = pd.concat([entries[t.structure_models_table_name], models])
            self.structure_db.store(t.structure_models_table_name, key="entry_id")

            # pdb file data (this table entry is appended in place, without reassignment)
            with open(modeller_out["best_model_path"], 'rb') as pdb_file:
                pdb_contents = pdb_file.read()
            pdb_table_entry = (corresponding_gene_call, pdb_contents)
            entries[t.structure_pdb_data_table_name].append(pdb_table_entry)
            self.structure_db.store(t.structure_pdb_data_table_name)

            # residue_info
            entries[t.structure_residue_info_table_name] = pd.concat([entries[t.structure_residue_info_table_name], residue_info_dataframe])
            self.structure_db.store(t.structure_residue_info_table_name, key="entry_id")
def get_contigs_db_info_dict(contigs_db_path, run = run, progress = progress, include_AA_counts = False, split_names = None):
    """Returns an info dict for a given contigs db

    Parameters
    ----------
    contigs_db_path : str
        Path of the contigs database.
    run, progress
        Terminal objects handed to `ContigsSuperclass`. NOTE: their `verbose`
        attribute is set to False on the objects themselves, so it stays False for
        the caller (and for the module-level defaults) after this call.
    include_AA_counts : bool
        If True, the keys 'AA_counts' and 'total_AAs' are added.
    split_names : iterable, optional
        If given (and not empty), the statistics are computed for these splits only.

    The dict holds 'path', every key of the database's `a_meta`, 'gc_content',
    'num_genes', 'gene_caller_ids', 'avg_gene_length' and 'num_genes_per_kb'.
    With `split_names`, 'total_length' is recomputed and 'num_splits' is added.
    'percent_complete' and 'percent_redundancy' are added when the completeness
    results contain 'Campbell_et_al'.
    """

    class Args:
        def __init__(self):
            self.contigs_db = contigs_db_path

    args = Args()
    run.verbose = False
    progress.verbose = False
    c = ContigsSuperclass(args, r = run, p = progress)

    info_dict = {'path': contigs_db_path}

    for key in c.a_meta:
        info_dict[key] = c.a_meta[key]

    if split_names:
        split_names = set(split_names)

        c.init_split_sequences()
        seq = ''.join([c.split_sequences[split_name] for split_name in split_names])
        info_dict['total_length'] = len(seq)
        info_dict['gc_content'] = sequence.Composition(seq).GC_content
        info_dict['gene_caller_ids'] = set([e['gene_callers_id'] for e in c.genes_in_splits.values() if e['split'] in split_names])
        info_dict['num_genes'] = len(info_dict['gene_caller_ids'])
        info_dict['avg_gene_length'] = numpy.mean([(c.genes_in_contigs_dict[gene_caller_id]['stop'] - c.genes_in_contigs_dict[gene_caller_id]['start']) for gene_caller_id in info_dict['gene_caller_ids']])
        info_dict['num_genes_per_kb'] = info_dict['num_genes'] * 1000.0 / info_dict['total_length']
        info_dict['num_splits'] = len(split_names)

        splits_for_completeness = split_names
    else:
        c.init_contig_sequences()
        seq = ''.join([e['sequence'] for e in c.contig_sequences.values()])
        info_dict['gc_content'] = sequence.Composition(seq).GC_content
        info_dict['num_genes'] = len(c.genes_in_contigs_dict)
        info_dict['gene_caller_ids'] = set(c.genes_in_contigs_dict.keys())
        # only complete gene calls contribute to the average here
        info_dict['avg_gene_length'] = numpy.mean([(gene['stop'] - gene['start']) for gene in c.genes_in_contigs_dict.values() if not gene['partial']])
        # 'total_length' comes from the a_meta values copied above
        info_dict['num_genes_per_kb'] = info_dict['num_genes'] * 1000.0 / info_dict['total_length']

        splits_for_completeness = set(c.splits_basic_info.keys())

    # get completeness / contamination estimates
    comp = completeness.Completeness(contigs_db_path).get_info_for_splits(splits_for_completeness)

    # `in` works on both Python 2 and 3, unlike `dict.has_key`
    if 'Campbell_et_al' in comp:
        info_dict['percent_complete'] = comp['Campbell_et_al']['percent_complete']
        info_dict['percent_redundancy'] = comp['Campbell_et_al']['percent_redundancy']

    # lets get all amino acids used in all complete gene calls:
    if include_AA_counts:
        if split_names:
            AA_counts_dict = c.get_AA_counts_dict(split_names = split_names)
        else:
            AA_counts_dict = c.get_AA_counts_dict()

        info_dict['AA_counts'] = AA_counts_dict['AA_counts']
        info_dict['total_AAs'] = AA_counts_dict['total_AAs']

    return info_dict
def __init__(self, args, external_clustering=None):
    """Parse `args`, load the data for the requested mode and prepare the view data.

    Parameters
    ----------
    args
        Namespace-like object; arguments missing from `args.__dict__` are read as None.
    external_clustering : dict, optional
        If given, its 'clusterings' and 'default_clustering' entries replace the ones
        in `self.p_meta` once the mode-specific loading is done.

    Raises
    ------
    ConfigError
        If a collection name is combined with manual mode, or if no hierarchical
        clustering is available after loading.
    """
    self.args = args
    self.views = {}
    self.states_table = None
    self.p_meta = {}
    self.title = 'Unknown Project'

    A = lambda x: args.__dict__[x] if x in args.__dict__ else None
    self.mode = A('mode')
    self.profile_db_path = A('profile_db')
    self.contigs_db_path = A('contigs_db')
    self.collection_name = A('collection_name')
    self.manual_mode = A('manual_mode')
    self.split_hmm_layers = A('split_hmm_layers')
    self.additional_layers_path = A('additional_layers')
    self.additional_view_path = A('additional_view')
    self.samples_information_db_path = A('samples_information_db')
    self.view = A('view')
    self.fasta_file = A('fasta_file')
    self.view_data_path = A('view_data')
    self.tree = A('tree')
    self.title = A('title')
    self.output_dir = A('output_dir')
    self.show_views = A('show_views')
    self.state = A('state')
    self.show_states = A('show_states')
    self.skip_check_names = A('skip_check_names')
    self.list_collections = A('list_collections')
    self.distance = A('distance') or constants.distance_metric_default
    self.linkage = A('linkage') or constants.linkage_method_default

    # make sure early on that both the distance and linkage is OK.
    clustering.is_distance_and_linkage_compatible(self.distance, self.linkage)

    self.split_names_ordered = None
    self.additional_layers = None
    self.auxiliary_profile_data_available = False

    self.samples_information_dict = {}
    self.samples_order_dict = {}
    self.samples_information_default_layer_order = {}

    # make sure the mode will be set properly
    if self.collection_name and self.manual_mode:
        raise ConfigError("You can't anvi-interactive in manual mode with a collection name.")

    self.external_clustering = external_clustering

    self.collections = ccollections.Collections()

    ContigsSuperclass.__init__(self, self.args)
    self.init_splits_taxonomy()

    if self.samples_information_db_path:
        samples_information_db = SamplesInformationDatabase(self.samples_information_db_path)
        self.samples_information_dict, self.samples_order_dict = samples_information_db.get_samples_information_and_order_dicts()
        self.samples_information_default_layer_order = samples_information_db.get_samples_information_default_layer_order()
        samples_information_db.disconnect()

    if self.contigs_db_path:
        self.completeness = Completeness(self.contigs_db_path)
        self.collections.populate_collections_dict(self.contigs_db_path, anvio.__contigs__version__)
    else:
        self.completeness = None

    if 'skip_init_functions' in args and not args.skip_init_functions:
        self.init_functions()

    # make sure we are not dealing with apples and oranges here.
    if self.contigs_db_path and self.profile_db_path:
        is_profile_db_and_contigs_db_compatible(self.profile_db_path, self.contigs_db_path)

    self.P = lambda x: os.path.join(self.p_meta['output_dir'], x)
    self.cwd = os.getcwd()

    # here is where the big deal stuff takes place:
    if not self.mode and self.manual_mode:
        self.mode = 'manual'
        self.run.info('Mode', self.mode, mc='red')
        self.load_manual_mode(args)
    elif self.mode == 'refine':
        self.load_full_mode(args)
    elif self.collection_name or self.list_collections:
        self.mode = 'collection'
        self.run.info('Mode', self.mode, mc='green')
        self.load_collection_mode(args)
    else:
        self.mode = 'full'
        self.load_full_mode(args)

    # make sure the samples information database, if there is one, is in fact compatible with the profile database
    # the reason we are doing this here is because when we are in 'self.manual_mode', the self.p_meta['samples'] is
    # being filled within the self.load_manual_mode function based on the headers of the view data.
    if self.profile_db_path and self.samples_information_db_path:
        is_profile_db_and_samples_db_compatible(self.profile_db_path, self.samples_information_db_path, manual_mode_exception=self.manual_mode)

    if self.external_clustering:
        self.p_meta['clusterings'] = self.clusterings = self.external_clustering['clusterings']
        # list() so the value is a plain list on Python 3 as well, where keys() is a view
        self.p_meta['available_clusterings'] = list(self.clusterings.keys())
        self.p_meta['default_clustering'] = self.external_clustering['default_clustering']

    if not self.state and 'default' in self.states_table.states:
        self.state = 'default'

    if not self.p_meta['clusterings']:
        if self.p_meta['merged']:
            raise ConfigError("This merged profile database does not seem to have any hierarchical clustering\
                               of splits that is required by the interactive interface. It may have been generated\
                               by anvi-merge with the `--skip-hierarchical-clustering` flag, or hierarchical\
                               clustering step may have been skipped by anvi-merge because you had too many stplits\
                               to get the clustering in a reasonable amount of time. Please read the help menu for\
                               anvi-merge, and/or refer to the tutorial: \
                               http://merenlab.org/2015/05/01/anvio-tutorial/#clustering-during-merging")
        else:
            raise ConfigError("This single profile database does not seem to have any hierarchical clustering\
                               that is required by the interactive interface. You must use `--cluster-contigs`\
                               flag for single profiles to access to this functionality. Please read the help\
                               menu for anvi-profile, and/or refer to the tutorial.")

    # self.split_names_ordered is going to be the 'master' names list. everything else is going to
    # need to match these names:
    default_clustering = self.p_meta['default_clustering']
    self.split_names_ordered = utils.get_names_order_from_newick_tree(self.p_meta['clusterings'][default_clustering]['newick'])

    # now we know what splits we are interested in (self.split_names_ordered), we can get rid of all the
    # unnecessary splits stored in views dicts.
    self.prune_view_dicts()

    # if there are any HMM search results in the contigs database other than 'singlecopy' sources,
    # we would like to visualize them as additional layers. following function is inherited from
    # Contigs DB superclass and will fill self.hmm_searches_dict if appropriate data is found in
    # search tables:
    if self.mode == 'full':
        self.init_non_singlecopy_gene_hmm_sources(self.split_names_ordered, return_each_gene_as_a_layer=self.split_hmm_layers)

    if self.additional_layers_path:
        filesnpaths.is_file_tab_delimited(self.additional_layers_path)
        self.additional_layers = self.additional_layers_path

    self.check_names_consistency()
    self.convert_view_data_into_json()
def __init__(self, args, external_clustering=None):
    """Prepare everything the interactive interface needs to display a project.

    Reads options from `args`, initializes the contigs superclass, loads the
    optional samples information database, picks a mode ('manual', 'refine',
    'collection' or 'full') and calls the matching loader, then resolves the
    clustering to use and converts the view data into JSON.

    Parameters
    ----------
    args : object
        Options container. Options are looked up in `args.__dict__`; missing
        ones are treated as None. It must also support the `in` operator
        (see the 'skip_init_functions' check below).
    external_clustering : dict, optional
        When given, its 'clusterings' and 'default_clustering' entries
        replace whatever the loader put into `self.p_meta`.

    Raises
    ------
    ConfigError
        If both a collection name and manual mode are requested, or if no
        hierarchical clustering is available once loading is done.
    """
    self.args = args
    self.views = {}
    self.states_table = None
    self.p_meta = {}
    self.title = 'Unknown Project'

    # missing options resolve to None rather than raising
    A = lambda x: args.__dict__[x] if x in args.__dict__ else None
    self.mode = A('mode')
    self.profile_db_path = A('profile_db')
    self.contigs_db_path = A('contigs_db')
    self.collection_name = A('collection_name')
    self.manual_mode = A('manual_mode')
    self.split_hmm_layers = A('split_hmm_layers')
    self.taxonomic_level = A('taxonomic_level')
    self.additional_layers_path = A('additional_layers')
    self.additional_view_path = A('additional_view')
    self.samples_information_db_path = A('samples_information_db')
    self.view = A('view')
    self.fasta_file = A('fasta_file')
    self.view_data_path = A('view_data')
    self.tree = A('tree')
    self.title = A('title')
    self.output_dir = A('output_dir')
    self.show_views = A('show_views')
    self.state_autoload = A('state_autoload')
    self.collection_autoload = A('collection_autoload')
    self.show_states = A('show_states')
    self.skip_check_names = A('skip_check_names')
    self.list_collections = A('list_collections')
    self.distance = A('distance') or constants.distance_metric_default
    self.linkage = A('linkage') or constants.linkage_method_default

    # make sure early on that both the distance and linkage is OK.
    clustering.is_distance_and_linkage_compatible(self.distance, self.linkage)

    self.split_names_ordered = None
    self.additional_layers = None
    self.auxiliary_profile_data_available = False

    self.samples_information_dict = {}
    self.samples_order_dict = {}
    self.samples_information_default_layer_order = {}

    # make sure the mode will be set properly
    if self.collection_name and self.manual_mode:
        raise ConfigError("You can't anvi-interactive in manual mode with a collection name.")

    self.external_clustering = external_clustering

    self.collections = ccollections.Collections()

    ContigsSuperclass.__init__(self, self.args)
    self.init_splits_taxonomy(self.taxonomic_level)

    if self.samples_information_db_path:
        samples_information_db = SamplesInformationDatabase(self.samples_information_db_path)
        self.samples_information_dict, self.samples_order_dict = samples_information_db.get_samples_information_and_order_dicts()
        self.samples_information_default_layer_order = samples_information_db.get_samples_information_default_layer_order()
        samples_information_db.disconnect()

    if self.contigs_db_path:
        self.completeness = Completeness(self.contigs_db_path)
        self.collections.populate_collections_dict(self.contigs_db_path, anvio.__contigs__version__)
    else:
        self.completeness = None

    if 'skip_init_functions' in args and not args.skip_init_functions:
        self.init_functions()

    # make sure we are not dealing with apples and oranges here.
    if self.contigs_db_path and self.profile_db_path:
        is_profile_db_and_contigs_db_compatible(self.profile_db_path, self.contigs_db_path)

    self.P = lambda x: os.path.join(self.p_meta['output_dir'], x)
    self.cwd = os.getcwd()

    # here is where the big deal stuff takes place:
    if not self.mode and self.manual_mode:
        self.mode = 'manual'
        self.run.info('Mode', self.mode, mc='red')
        self.load_manual_mode(args)
    elif self.mode == 'refine':
        self.load_full_mode(args)
    elif self.collection_name or self.list_collections:
        self.mode = 'collection'
        self.run.info('Mode', self.mode, mc='green')
        self.load_collection_mode(args)
    else:
        self.mode = 'full'
        self.load_full_mode(args)

    # make sure the samples information database, if there is one, is in fact compatible with the profile database
    # the reason we are doing this here is because when we are in 'self.manual_mode', the self.p_meta['samples'] is
    # being filled within the self.load_manual_mode function based on the headers of the view data.
    if self.profile_db_path and self.samples_information_db_path:
        is_profile_db_and_samples_db_compatible(self.profile_db_path, self.samples_information_db_path, manual_mode_exception=self.manual_mode)

    if self.external_clustering:
        self.p_meta['clusterings'] = self.clusterings = self.external_clustering['clusterings']
        self.p_meta['available_clusterings'] = self.clusterings.keys()
        self.p_meta['default_clustering'] = self.external_clustering['default_clustering']

    # NOTE(review): self.states_table starts out as None above; presumably one of the
    # load_* calls replaces it before we get here -- confirm.
    if not self.state_autoload and 'default' in self.states_table.states:
        self.state_autoload = 'default'

    if not self.collection_autoload and 'default' in self.collections.collections_dict:
        self.collection_autoload = 'default'

    if not self.p_meta['clusterings']:
        if self.p_meta['merged']:
            raise ConfigError("This merged profile database does not seem to have any hierarchical clustering\
                               of splits that is required by the interactive interface. It may have been generated\
                               by anvi-merge with the `--skip-hierarchical-clustering` flag, or hierarchical\
                               clustering step may have been skipped by anvi-merge because you had too many stplits\
                               to get the clustering in a reasonable amount of time. Please read the help menu for\
                               anvi-merge, and/or refer to the tutorial: \
                               http://merenlab.org/2015/05/01/anvio-tutorial/#clustering-during-merging")
        else:
            raise ConfigError("This single profile database does not seem to have any hierarchical clustering\
                               that is required by the interactive interface. You must use `--cluster-contigs`\
                               flag for single profiles to access to this functionality. Please read the help\
                               menu for anvi-profile, and/or refer to the tutorial.")

    # self.split_names_ordered is going to be the 'master' names list. everything else is going to
    # need to match these names:
    self.split_names_ordered = utils.get_names_order_from_newick_tree(self.p_meta['clusterings'][self.p_meta['default_clustering']]['newick'])

    # now we know what splits we are interested in (self.split_names_ordered), we can get rid of all the
    # unnecessary splits stored in views dicts.
    self.prune_view_dicts()

    # if there are any HMM search results in the contigs database other than 'singlecopy' sources,
    # we would like to visualize them as additional layers. following function is inherited from
    # Contigs DB superclass and will fill self.hmm_searches_dict if appropriate data is found in
    # search tables:
    if self.mode == 'full':
        self.init_non_singlecopy_gene_hmm_sources(self.split_names_ordered, return_each_gene_as_a_layer=self.split_hmm_layers)

    if self.additional_layers_path:
        filesnpaths.is_file_tab_delimited(self.additional_layers_path)
        self.additional_layers = self.additional_layers_path

    self.check_names_consistency()
    self.convert_view_data_into_json()
def populate_search_tables(self, sources={}):
    """Run HMM searches for `sources` and store the hits through `self.append`.

    Sequences are exported from the contigs database at `self.db_path` into a
    temporary directory, one FASTA file per alphabet/context target found in
    `sources`. Each source is then searched with HMMer, and its parsed hits
    are appended. For sources that operate in the 'CONTIG' context the hits
    are pruned for redundancy and turned into new gene calls first.

    Parameters
    ----------
    sources : dict
        Maps a source name to a dict with the keys 'target', 'kind', 'domain',
        'genes', 'model', 'ref' and 'noise_cutoff_terms'. When empty, the
        sources shipped in `anvio.data.hmm` are used. The default dict is
        never mutated, only rebound.

    Raises
    ------
    ConfigError
        If a source asks for amino acid sequences in the 'CONTIG' context.
    """
    # fall back to the HMM sources shipped with anvi'o when none are given
    if not len(sources):
        import anvio.data.hmm
        sources = anvio.data.hmm.sources

    if not sources:
        return

    target_files_dict = {}

    tmp_directory_path = filesnpaths.get_temp_directory_path()

    # here we will go through targets and populate target_files_dict based on what we find among them.
    targets = {s['target'] for s in sources.values()}
    for target in targets:
        alphabet, context = utils.anvio_hmm_target_term_to_alphabet_and_context(target)

        self.run.info('Target found', '%s:%s' % (alphabet, context))

        class Args: pass
        args = Args()
        args.contigs_db = self.db_path
        contigs_db = ContigsSuperclass(args, r=terminal.Run(verbose=False))

        if context == 'GENE':
            target_files_dict['%s:GENE' % alphabet] = os.path.join(tmp_directory_path, '%s_gene_sequences.fa' % alphabet)
            contigs_db.gen_FASTA_file_of_sequences_for_gene_caller_ids(output_file_path=target_files_dict['%s:GENE' % alphabet],
                                                                       simple_headers=True,
                                                                       rna_alphabet=alphabet == 'RNA',
                                                                       report_aa_sequences=alphabet == 'AA')
        elif context == 'CONTIG':
            if alphabet == 'AA':
                raise ConfigError("You are somewhere you shouldn't be. You came here because you thought it would be OK\
                                   to ask for AA sequences in the CONTIG context. The answer to that is 'no, thanks'. If\
                                   you think this is dumb, please let us know.")
            else:
                target_files_dict['%s:CONTIG' % alphabet] = os.path.join(tmp_directory_path, '%s_contig_sequences.fa' % alphabet)
                utils.export_sequences_from_contigs_db(self.db_path,
                                                       target_files_dict['%s:CONTIG' % alphabet],
                                                       rna_alphabet=alphabet == 'RNA')

    commander = HMMer(target_files_dict, num_threads_to_use=self.num_threads_to_use)

    for source in sources:
        alphabet, context = utils.anvio_hmm_target_term_to_alphabet_and_context(sources[source]['target'])

        kind_of_search = sources[source]['kind']
        domain = sources[source]['domain']
        all_genes_searched_against = sources[source]['genes']
        hmm_model = sources[source]['model']
        reference = sources[source]['ref']
        noise_cutoff_terms = sources[source]['noise_cutoff_terms']

        hmm_scan_hits_txt = commander.run_hmmscan(source,
                                                  alphabet,
                                                  context,
                                                  kind_of_search,
                                                  domain,
                                                  len(all_genes_searched_against),
                                                  hmm_model,
                                                  reference,
                                                  noise_cutoff_terms)

        # a falsy return value from the search is treated as "no hits"
        if not hmm_scan_hits_txt:
            search_results_dict = {}
        else:
            parser = parser_modules['search']['hmmscan'](hmm_scan_hits_txt, alphabet=alphabet, context=context)
            search_results_dict = parser.get_search_results()

        if not len(search_results_dict):
            run.info_single("The HMM source '%s' returned 0 hits. SAD (but it's stil OK)." % source, nl_before=1)

        if context == 'CONTIG':
            # we are in trouble here. because our search results dictionary contains no gene calls, but contig
            # names that contain our hits. on the other hand, the rest of the code outside of this if statement
            # expects a `search_results_dict` with gene callers id in it. so there are two things we need to do.
            # one is to come up with some new gene calls and add them to the contigs database. so things
            # will go smoothly downstream. two, we will need to update our `search_results_dict` so it looks
            # like a dictionary the rest of the code expects with `gene_callers_id` fields. both of these
            # steps are going to be taken care of in the following function. magic.
            if source != "Ribosomal_RNAs":
                self.run.warning("You just called an HMM profile that runs on contigs and not genes. Because this HMM\
                                  operation is not directly working with gene calls anvi'o already knows about, the resulting\
                                  hits will need to be added as 'new gene calls' into the contigs database. So far so good.\
                                  But blecause we are in the contigs realm rater than genes realm, it is likely that\
                                  resulting hits will not correspond to open reading frames that are supposed to be\
                                  translated (such as ribosomal RNAs), because otherwise you would be working with genes\
                                  instad of defining CONTIGS as your context in that HMM profile you just used unless you\
                                  not sure what you are doing. Hence, anvi'o will not report amino acid sequences for the\
                                  new gene calls it will recover through these HMMs. Please take a moment and you be the\
                                  judge of whether this will influence your pangenomic analyses or other things you thought\
                                  you would be doing with the result of this HMM search downstream. If you do not feel like\
                                  being the judge of anything today you can move on yet remember to remember this if things\
                                  look somewhat weird later on.",
                                 header="Psst. Your fancy HMM profile '%s' speaking" % source,
                                 lc="green")

            num_hits_before = len(search_results_dict)
            search_results_dict = utils.get_pruned_HMM_hits_dict(search_results_dict)
            num_hits_after = len(search_results_dict)

            if num_hits_before != num_hits_after:
                self.run.info('Pruned', '%d out of %d hits were removed due to redundancy' % (num_hits_before - num_hits_after, num_hits_before))

            search_results_dict = self.add_new_gene_calls_to_contigs_db_and_update_serach_results_dict(kind_of_search,
                                                                                                       search_results_dict,
                                                                                                       skip_amino_acid_sequences=True)

        self.append(source, reference, kind_of_search, domain, all_genes_searched_against, search_results_dict)

    # This import is required, not redundant: the conditional `import anvio.data.hmm`
    # at the top of this function makes `anvio` a *local* name for the whole function
    # body. When the caller supplies `sources`, that import never runs, and without the
    # line below `anvio.DEBUG` would raise UnboundLocalError.
    import anvio
    if not anvio.DEBUG:
        import shutil

        commander.clean_tmp_dirs()
        for v in list(target_files_dict.values()):
            os.remove(v)

        # the exported FASTA files are gone; remove the now-empty temporary
        # directory as well so repeated runs do not leave directories behind.
        shutil.rmtree(tmp_directory_path)
def get_contigs_db_info_dict(contigs_db_path, run=run, progress=progress, include_AA_counts=False, split_names=None):
    """Returns an info dict for a given contigs db

    The dict starts with 'path' plus every entry of the contigs database meta
    (`c.a_meta`). Gene and sequence statistics are then added, restricted to
    `split_names` when they are given, followed by completeness estimates
    when a 'Campbell_et_al' entry is available, and optionally amino acid
    counts.

    Parameters
    ----------
    contigs_db_path : str
        Path to the contigs database.
    run, progress
        Terminal objects handed to ContigsSuperclass. NOTE: their `verbose`
        attribute is set to False on the objects that were passed in.
    include_AA_counts : bool
        Also report 'AA_counts' and 'total_AAs'.
    split_names : iterable, optional
        Limit the statistics to these splits.

    Returns
    -------
    dict
    """

    class Args:
        def __init__(self):
            self.contigs_db = contigs_db_path

    args = Args()
    run.verbose = False
    progress.verbose = False
    c = ContigsSuperclass(args, r=run, p=progress)

    info_dict = {'path': contigs_db_path}

    for key in c.a_meta:
        info_dict[key] = c.a_meta[key]

    if split_names:
        split_names = set(split_names)

    if split_names:
        c.init_split_sequences()
        seq = ''.join([c.split_sequences[split_name] for split_name in split_names])
        info_dict['total_length'] = len(seq)
        info_dict['gc_content'] = sequence.Composition(seq).GC_content
        info_dict['gene_caller_ids'] = set([e['gene_callers_id'] for e in c.genes_in_splits.values() if e['split'] in split_names])
        info_dict['num_genes'] = len(info_dict['gene_caller_ids'])
        info_dict['avg_gene_length'] = numpy.mean([(c.genes_in_contigs_dict[gene_caller_id]['stop'] - c.genes_in_contigs_dict[gene_caller_id]['start']) for gene_caller_id in info_dict['gene_caller_ids']])
        info_dict['num_genes_per_kb'] = info_dict['num_genes'] * 1000.0 / info_dict['total_length']
        info_dict['num_splits'] = len(split_names)
    else:
        c.init_contig_sequences()
        seq = ''.join([e['sequence'] for e in c.contig_sequences.values()])
        info_dict['gc_content'] = sequence.Composition(seq).GC_content
        info_dict['num_genes'] = len(c.genes_in_contigs_dict)
        info_dict['gene_caller_ids'] = set(c.genes_in_contigs_dict.keys())
        # only this branch leaves partial gene calls out of the average
        info_dict['avg_gene_length'] = numpy.mean([(gene['stop'] - gene['start']) for gene in c.genes_in_contigs_dict.values() if not gene['partial']])
        # 'total_length' is not computed in this branch; presumably it was copied
        # from c.a_meta above -- TODO confirm
        info_dict['num_genes_per_kb'] = info_dict['num_genes'] * 1000.0 / info_dict['total_length']

    # get completeness / contamination estimates
    if split_names:
        comp = completeness.Completeness(contigs_db_path).get_info_for_splits(split_names)
    else:
        comp = completeness.Completeness(contigs_db_path).get_info_for_splits(set(c.splits_basic_info.keys()))

    # `in` instead of dict.has_key(), which does not exist on Python 3
    if 'Campbell_et_al' in comp:
        info_dict['percent_complete'] = comp['Campbell_et_al']['percent_complete']
        info_dict['percent_redundancy'] = comp['Campbell_et_al']['percent_redundancy']

    # lets get all amino acids used in all complete gene calls:
    if include_AA_counts:
        if split_names:
            AA_counts_dict = c.get_AA_counts_dict(split_names=split_names)
        else:
            AA_counts_dict = c.get_AA_counts_dict()

        info_dict['AA_counts'] = AA_counts_dict['AA_counts']
        info_dict['total_AAs'] = AA_counts_dict['total_AAs']

    return info_dict
def get_contigs_db_info_dict(contigs_db_path, run=run, progress=progress, include_AA_counts=False, split_names=None, exclude_partial_gene_calls=True):
    """Returns an info dict for a given contigs db

    The dict starts with 'path' plus every entry of the contigs database meta
    (`c.a_meta`). Sequence and gene statistics are then added, restricted to
    `split_names` when they are given, followed by completeness estimates
    when a 'Campbell_et_al' entry is available, and optionally amino acid
    counts.

    Parameters
    ----------
    contigs_db_path : str
        Path to the contigs database.
    run, progress
        Terminal objects handed to ContigsSuperclass. NOTE: their `verbose`
        attribute is set to False on the objects that were passed in.
    include_AA_counts : bool
        Also report 'AA_counts' and 'total_AAs'.
    split_names : iterable, optional
        Limit the statistics to these splits.
    exclude_partial_gene_calls : bool
        When True, gene calls flagged 'partial' go to 'excluded_gene_ids'
        and are left out of every gene statistic.

    Returns
    -------
    dict
    """

    class Args:
        def __init__(self):
            self.contigs_db = contigs_db_path

    args = Args()
    run.verbose = False
    progress.verbose = False
    c = ContigsSuperclass(args, r=run, p=progress)

    info_dict = {'path': contigs_db_path}

    for key in c.a_meta:
        info_dict[key] = c.a_meta[key]

    # Two different strategies here depending on whether we work with a given set of split ids or
    # everything in the contigs database.
    if split_names:
        split_names = set(split_names)
        c.init_split_sequences()
        seq = ''.join([c.split_sequences[split_name] for split_name in split_names])
        candidate_gene_caller_ids = set([e['gene_callers_id'] for e in c.genes_in_splits.values() if e['split'] in split_names])
    else:
        c.init_contig_sequences()
        seq = ''.join([e['sequence'] for e in c.contig_sequences.values()])
        candidate_gene_caller_ids = c.genes_in_contigs_dict.keys()

    # the sequence stats must be in place before anything is derived from them:
    # 'num_genes_per_kb' below divides by the length of *this* sequence, not by
    # a 'total_length' that may have been copied from the database meta.
    info_dict['gc_content'] = sequence.Composition(seq).GC_content
    info_dict['total_length'] = len(seq)

    gene_caller_ids = set([])
    excluded_gene_ids = set([])
    for gene_caller_id in candidate_gene_caller_ids:
        if c.genes_in_contigs_dict[gene_caller_id]['partial'] and exclude_partial_gene_calls:
            excluded_gene_ids.add(gene_caller_id)
        else:
            gene_caller_ids.add(gene_caller_id)

    info_dict['gene_caller_ids'] = gene_caller_ids
    info_dict['excluded_gene_ids'] = excluded_gene_ids
    info_dict['num_genes'] = len(gene_caller_ids)
    info_dict['gene_lengths'] = dict([(gene_caller_id, (c.genes_in_contigs_dict[gene_caller_id]['stop'] - c.genes_in_contigs_dict[gene_caller_id]['start'])) for gene_caller_id in gene_caller_ids])
    # list(): numpy.mean cannot consume a Python 3 dict view directly
    info_dict['avg_gene_length'] = numpy.mean(list(info_dict['gene_lengths'].values()))
    info_dict['num_genes_per_kb'] = info_dict['num_genes'] * 1000.0 / info_dict['total_length']

    # get completeness / contamination estimates
    if split_names:
        comp = completeness.Completeness(contigs_db_path).get_info_for_splits(split_names)
    else:
        comp = completeness.Completeness(contigs_db_path).get_info_for_splits(set(c.splits_basic_info.keys()))

    if 'Campbell_et_al' in comp:
        info_dict['percent_complete'] = comp['Campbell_et_al']['percent_complete']
        info_dict['percent_redundancy'] = comp['Campbell_et_al']['percent_redundancy']

    # lets get all amino acids used in all complete gene calls:
    if include_AA_counts:
        if split_names:
            AA_counts_dict = c.get_AA_counts_dict(split_names=split_names)
        else:
            AA_counts_dict = c.get_AA_counts_dict()

        info_dict['AA_counts'] = AA_counts_dict['AA_counts']
        info_dict['total_AAs'] = AA_counts_dict['total_AAs']

    return info_dict
def __init__(self, args, external_clustering = None):
    """Prepare everything the interactive interface needs to display a project.

    Reads options from `args`, initializes the contigs superclass, loads the
    optional samples information database, loads the data either from user
    files (manual mode) or from anvi'o files, resolves the clustering to use
    and converts the view data into JSON.

    Parameters
    ----------
    args : object
        Options container. Options are looked up in `args.__dict__`; missing
        ones are treated as None. It must also support the `in` operator
        (see the 'skip_init_functions' check below).
    external_clustering : dict, optional
        When given, its 'clusterings' and 'default_clustering' entries
        replace whatever the loader put into `self.p_meta`.

    Raises
    ------
    ConfigError
        If no hierarchical clustering is available once loading is done.
    """
    self.args = args
    self.views = {}
    self.states_table = None
    self.p_meta = {}
    self.title = 'Unknown Project'

    # missing options resolve to None rather than raising. `in` replaces
    # dict.has_key(), which does not exist on Python 3.
    A = lambda x: args.__dict__[x] if x in args.__dict__ else None
    self.profile_db_path = A('profile_db')
    self.contigs_db_path = A('contigs_db')
    self.manual_mode = A('manual_mode')
    self.split_hmm_layers = A('split_hmm_layers')
    self.additional_layers_path = A('additional_layers')
    self.additional_view_path = A('additional_view')
    self.samples_information_db_path = A('samples_information_db')
    self.view = A('view')
    self.fasta_file = A('fasta_file')
    self.view_data_path = A('view_data')
    self.tree = A('tree')
    self.title = A('title')
    self.output_dir = A('output_dir')
    self.show_views = A('show_views')
    self.state = A('state')
    self.show_states = A('show_states')
    self.skip_check_names = A('skip_check_names')

    self.split_names_ordered = None
    self.additional_layers = None

    self.samples_information_dict = {}
    self.samples_order_dict = {}
    self.samples_information_default_layer_order = {}

    self.external_clustering = external_clustering

    self.collections = ccollections.Collections()

    ContigsSuperclass.__init__(self, self.args)

    if self.samples_information_db_path:
        samples_information_db = SamplesInformationDatabase(self.samples_information_db_path)
        self.samples_information_dict, self.samples_order_dict = samples_information_db.get_samples_information_and_order_dicts()
        self.samples_information_default_layer_order = samples_information_db.get_samples_information_default_layer_order()
        samples_information_db.disconnect()

    if self.contigs_db_path:
        self.completeness = completeness.Completeness(self.contigs_db_path)
        self.collections.populate_sources_dict(self.contigs_db_path, anvio.__contigs__version__)
    else:
        self.completeness = None

    if 'skip_init_functions' in args and not args.skip_init_functions:
        self.init_functions()

    # make sure we are not dealing with apples and oranges here.
    if self.contigs_db_path and self.profile_db_path:
        is_profile_db_and_contigs_db_compatible(self.profile_db_path, self.contigs_db_path)

    self.P = lambda x: os.path.join(self.p_meta['output_dir'], x)
    self.cwd = os.getcwd()

    # here is where the big deal stuff takes place:
    if self.manual_mode:
        self.load_from_user_files(args)
    else:
        self.load_from_anvio_files(args)

    # make sure the samples information database, if there is one, is in fact compatible with the profile database
    # the reason we are doing this here is because when we are in 'self.manual_mode', the self.p_meta['samples'] is
    # being filled within the self.load_from_user_files function based on the headers of the view data.
    if self.profile_db_path and self.samples_information_db_path:
        is_profile_db_and_samples_db_compatible(self.profile_db_path, self.samples_information_db_path)

    if self.external_clustering:
        self.p_meta['clusterings'] = self.clusterings = self.external_clustering['clusterings']
        self.p_meta['available_clusterings'] = self.clusterings.keys()
        self.p_meta['default_clustering'] = self.external_clustering['default_clustering']

    if not self.p_meta['clusterings']:
        if self.p_meta['merged']:
            raise ConfigError("This merged profile database does not seem to have any hierarchical clustering\
                               that is required by the interactive interface. It may have been generated\
                               by anvi-merge with `--skip-hierarchical-clustering` flag, or hierarchical\
                               clustering step may have been skipped automatically by the platform. Please\
                               read the help menu for anvi-merge, and/or refer to the tutorial: \
                               http://merenlab.org/2015/05/01/anvio-tutorial/#clustering-during-merging")
        else:
            raise ConfigError("This single profile database does not seem to have any hierarchical clustering\
                               that is required by the interactive interface. You must use `--cluster-contigs`\
                               flag for single profiles to access to this functionality. Please read the help\
                               menu for anvi-profile, and/or refer to the tutorial.")

    tree = Tree(self.p_meta['clusterings'][self.p_meta['default_clustering']]['newick'], format = 1)

    # self.split_names_ordered is going to be the 'master' names list. everything else is going to
    # need to match these names:
    self.split_names_ordered = [n.name for n in tree.get_leaves()]

    # now we know what splits we are interested in (self.split_names_ordered), we can get rid of all the
    # unnecessary splits stored in views dicts.
    self.prune_view_dicts()

    # if there are any HMM search results in the contigs database other than 'singlecopy' sources,
    # we would like to visualize them as additional layers. following function is inherited from
    # Contigs DB superclass and will fill self.hmm_searches_dict if appropriate data is found in
    # search tables:
    self.init_non_singlecopy_gene_hmm_sources(self.split_names_ordered, return_each_gene_as_a_layer = self.split_hmm_layers)

    if self.additional_layers_path:
        filesnpaths.is_file_tab_delimited(self.additional_layers_path)
        self.additional_layers = self.additional_layers_path

    self.check_names_consistency()
    self.convert_view_data_into_json()
def __init__(self, args, run=terminal.Run(), progress=terminal.Progress()):
    """Collect the options, validate the outputs and create a new structure database.

    Options are read from `args.__dict__` (missing ones become None). The
    contigs database is opened to get its hash, the output paths are checked,
    the genes of interest are resolved, and a StructureDatabase is created at
    `self.output_db_path` with `create_new=True`.

    Raises
    ------
    ConfigError
        If the output database path does not end with '.db'.
    """
    self.args = args
    self.run = run
    self.progress = progress

    def option(name, cast=None):
        # value of `name` in args (optionally passed through `cast`), or None when absent
        if name not in self.args.__dict__:
            return None
        value = args.__dict__[name]
        return value if cast is None else cast(value)

    # general parameters: (attribute on self, option name, cast)
    general_parameters = (
        ('contigs_db_path', 'contigs_db', None),
        ('genes_of_interest_path', 'genes_of_interest', None),
        ('splits_of_interest_path', 'splits_of_interest', None),
        ('bin_id', 'bin_id', None),
        ('collection_name', 'collection_name', None),
        ('gene_caller_ids', 'gene_caller_ids', None),
        ('output_db_path', 'output_db_path', None),
        ('full_modeller_output', 'dump_dir', None),
        ('skip_DSSP', 'skip_DSSP', bool),
        ('modeller_executable', 'modeller_executable', None),
    )
    for attribute, option_name, cast in general_parameters:
        setattr(self, attribute, option(option_name, cast))
    self.DSSP_executable = None

    utils.is_contigs_db(self.contigs_db_path)
    self.contigs_db = dbops.ContigsDatabase(self.contigs_db_path)
    self.contigs_db_hash = self.contigs_db.meta['contigs_db_hash']

    # MODELLER params; the attribute and the option share the same name
    for name in ('modeller_database',
                 'scoring_method',
                 'max_number_templates',
                 'percent_identical_cutoff',
                 'num_models',
                 'deviation'):
        setattr(self, name, option(name))
    self.very_fast = option('very_fast', bool)

    # check database output (falls back to a default file name)
    self.output_db_path = self.output_db_path or "STRUCTURE.db"
    has_db_extension = self.output_db_path.endswith('.db')
    if not has_db_extension:
        raise ConfigError("The structure database output file (`-o / --output`) must end with '.db'")
    filesnpaths.is_output_file_writable(self.output_db_path)

    # check modeller output
    if self.full_modeller_output:
        checked_directory = filesnpaths.check_output_directory(self.full_modeller_output, ok_if_exists=False)
        self.full_modeller_output = checked_directory

    # identify which genes user wants to model structures for
    self.genes_of_interest = self.get_genes_of_interest(self.genes_of_interest_path, self.gene_caller_ids)

    self.sanity_check()

    # residue annotation
    self.annotation_sources_info = self.get_annotation_sources_info()
    table_structure, table_types = self.get_residue_info_table_structure()
    self.residue_info_table_structure = table_structure
    self.residue_info_table_types = table_types
    self.res_annotation_df = pd.DataFrame({})

    # initialize StructureDatabase
    self.structure_db = StructureDatabase(
        self.output_db_path,
        self.contigs_db_hash,
        residue_info_structure_extras=self.residue_info_table_structure,
        residue_info_types_extras=self.residue_info_table_types,
        create_new=True,
    )

    # init ContigsSuperClass
    self.contigs_super = ContigsSuperclass(self.args)
def populate_search_tables(self, sources={}):
    """Run HMM searches for `sources` and store the hits through `self.append`.

    Sequences are exported from the contigs database at `self.db_path` into a
    temporary directory, one FASTA file per alphabet/context target found in
    `sources`. Each source is then searched with HMMer (`self.hmm_program`),
    and its parsed hits are appended. For sources that operate in the
    'CONTIG' context the hits are pruned for redundancy and turned into new
    gene calls first.

    Parameters
    ----------
    sources : dict
        Maps a source name to a dict with (at least) the keys 'target',
        'kind', 'domain', 'genes', 'ref' and 'noise_cutoff_terms'. When
        empty, the sources shipped in `anvio.data.hmm` are used. The default
        dict is never mutated, only rebound.

    Raises
    ------
    ConfigError
        If domain table output is requested for a nucleotide-alphabet source,
        if a gene-context source is used although gene calling was skipped,
        if amino acid sequences are requested in the 'CONTIG' context, or if
        the HMMER output cannot be parsed.
    """
    # make sure the output file is OK to write.
    filesnpaths.is_output_file_writable(self.db_path, ok_if_exists=True)

    # fall back to the HMM sources shipped with anvi'o when none are given
    if not len(sources):
        import anvio.data.hmm
        sources = anvio.data.hmm.sources

    if not sources:
        return

    self.check_sources(sources)

    # Validate the domtable request for *every* source before doing any work. This check
    # used to live inside the search loop below, where it could fire only after earlier
    # sources had already been searched and appended to the database.
    for source in sources:
        alphabet, context = utils.anvio_hmm_target_term_to_alphabet_and_context(sources[source]['target'])

        if alphabet in ['DNA', 'RNA'] and 'domtable' in self.hmmer_desired_output:
            raise ConfigError("Domain table output was requested (probably with the --get-domtable-output flag, "
                              "does that look familiar?) but unfortunately this option is incompatible with the "
                              f"current source of HMM profiles, {source}, because this source uses a nucleotide "
                              "alphabet.")

    target_files_dict = {}

    tmp_directory_path = filesnpaths.get_temp_directory_path()

    hmmpressed_files = self.hmmpress_sources(sources, tmp_directory_path)

    self.run.info("Contigs DB", self.db_path)
    self.run.info("HMM sources", ', '.join(sources.keys()))

    # here we will go through targets and populate target_files_dict based on what we find among them.
    targets = set([s['target'] for s in list(sources.values())])
    have_hmm_sources_with_non_RNA_contig_context = False
    for target in targets:
        alphabet, context = utils.anvio_hmm_target_term_to_alphabet_and_context(target)

        if not self.genes_are_called and context != "CONTIG":
            raise ConfigError("You are in trouble. The gene calling was skipped for this contigs database, yet anvi'o asked to run an "
                              "HMM profile that wishes to operate on %s context using the %s alphabet. It is not OK. You still could run "
                              "HMM profiles that does not require gene calls to be present (such as the HMM profile that identifies Ribosomal "
                              "RNAs in contigs, but for that you would have to explicitly ask for it by using the additional parameter "
                              "'--installed-hmm-profile PROFILE_NAME_HERE')." % (context, alphabet))

        self.run.info('Alphabet/context target found', '%s:%s' % (alphabet, context))

        if context == 'CONTIG' and alphabet != 'RNA':
            have_hmm_sources_with_non_RNA_contig_context = True

        class Args: pass
        args = Args()
        args.contigs_db = self.db_path
        contigs_db = ContigsSuperclass(args, r=terminal.Run(verbose=False))

        if context == 'GENE':
            target_files_dict['%s:GENE' % alphabet] = os.path.join(tmp_directory_path, '%s_gene_sequences.fa' % alphabet)
            contigs_db.get_sequences_for_gene_callers_ids(output_file_path=target_files_dict['%s:GENE' % alphabet],
                                                          simple_headers=True,
                                                          rna_alphabet=True if alphabet == 'RNA' else False,
                                                          report_aa_sequences=True if alphabet == 'AA' else False)
        elif context == 'CONTIG':
            if alphabet == 'AA':
                raise ConfigError("You are somewhere you shouldn't be. You came here because you thought it would be OK "
                                  "to ask for AA sequences in the CONTIG context. The answer to that is 'no, thanks'. If "
                                  "you think this is dumb, please let us know.")
            else:
                target_files_dict['%s:CONTIG' % alphabet] = os.path.join(tmp_directory_path, '%s_contig_sequences.fa' % alphabet)
                utils.export_sequences_from_contigs_db(self.db_path,
                                                       target_files_dict['%s:CONTIG' % alphabet],
                                                       rna_alphabet=True if alphabet == 'RNA' else False)

    if have_hmm_sources_with_non_RNA_contig_context:
        # in that case, we should remind people what's up.
        self.run.warning("The HMM profiles that are about to be run includes at least one HMM profile that runs on "
                         "contigs and not genes. Thus, this HMM operation will not be working with gene calls anvi'o "
                         "already knows about. Which means, the resulting hits will need to be added as 'new gene calls' "
                         "into the contigs database. So far so good. But because we are in the realm of contigs rather "
                         "than genes, the resulting HMM hits will unlikely correspond to open reading frames that are "
                         "supposed to be translated (such as ribosomal RNAs). While anvi'o adds new gene calls to your "
                         "contigs database for these hits, it will NOT report amino acid sequences for the "
                         "new gene calls that will emerge from these HMMs, expecting you to judge whether this will "
                         "influence your pangenomic analyses or other things you thought you would be doing with the "
                         "result of this HMM search downstream. If you do not feel like being the judge of anything today "
                         "you can move on yet remember to remember this if things look somewhat weird later on.",
                         header="THE MORE YOU KNOW 🌈", lc="green")

    commander = HMMer(target_files_dict, num_threads_to_use=self.num_threads_to_use, program_to_use=self.hmm_program)

    for source in sources:
        alphabet, context = utils.anvio_hmm_target_term_to_alphabet_and_context(sources[source]['target'])

        kind_of_search = sources[source]['kind']
        domain = sources[source]['domain']
        all_genes_searched_against = sources[source]['genes']
        hmm_model = hmmpressed_files[source]
        reference = sources[source]['ref']
        noise_cutoff_terms = sources[source]['noise_cutoff_terms']

        hmmer_output = commander.run_hmmer(source,
                                           alphabet,
                                           context,
                                           kind_of_search,
                                           domain,
                                           len(all_genes_searched_against),
                                           hmm_model,
                                           reference,
                                           noise_cutoff_terms,
                                           desired_output=self.hmmer_desired_output,
                                           hmmer_output_dir=self.hmmer_output_dir)

        if self.hmmer_output_dir:
            self.run.info("HMMER output directory", self.hmmer_output_dir)

        # run_hmmer hands back either a single value (the hits table) or a tuple
        # that is unpacked as (hits table, domain table)
        if not isinstance(hmmer_output, tuple):
            hmm_scan_hits_txt = hmmer_output
        else:
            hmm_scan_hits_txt, domain_hits_txt = hmmer_output
            self.run.info("Domain table output", domain_hits_txt)

        if not hmm_scan_hits_txt:
            search_results_dict = {}
        else:
            try:
                parser = parser_modules['search']['hmmer_table_output'](hmm_scan_hits_txt, alphabet=alphabet, context=context, program=self.hmm_program)
            except StupidHMMError as e:
                # `from e` records the parser error as the explicit cause of the ConfigError
                raise ConfigError(f"Unfortunately something went wrong while anvi'o was trying to parse some HMM output for your data. "
                                  f"This error is typically due to contig names that are long and variable in length, which that "
                                  f"confuses HMMER and so it generates output tables that are simply unparseable. Anvi'o does its best, "
                                  f"but occasionally fails, which leads to this error. If you are curious why is this happening, you can take a "
                                  f"look at this issue where this issue is described: https://github.com/merenlab/anvio/issues/1564. "
                                  f"Solution to this is relatively easy: use `anvi-script-reformat-fasta` with `--simplify-names` flag "
                                  f"BEFORE generating your contigs database as we advice you to. Sorry you came all this way just to "
                                  f"find out about this :/ Here is the origial error message anvi'o produced from the code beneath: {e}.") from e

            search_results_dict = parser.get_search_results()

        if not len(search_results_dict):
            run.info_single("The HMM source '%s' returned 0 hits. SAD (but it's stil OK)." % source, nl_before=1)

        if context == 'CONTIG':
            # we are in trouble here. because our search results dictionary contains no gene calls, but contig
            # names contain our hits. on the other hand, the rest of the code outside of this if statement
            # expects a `search_results_dict` with gene caller ids in it. so there are two things we need to do.
            # one is to come up with some new gene calls and add them to the contigs database. so things
            # will go smoothly downstream. two, we will need to update our `search_results_dict` so it looks
            # like a dictionary the rest of the code expects with `gene_callers_id` fields. both of these
            # steps are going to be taken care of in the following function. magic.
            num_hits_before = len(search_results_dict)
            search_results_dict = utils.get_pruned_HMM_hits_dict(search_results_dict)
            num_hits_after = len(search_results_dict)

            if num_hits_before != num_hits_after:
                self.run.info('Pruned', '%d out of %d hits were removed due to redundancy' % (num_hits_before - num_hits_after, num_hits_before))

            search_results_dict = self.add_new_gene_calls_to_contigs_db_and_update_serach_results_dict(kind_of_search,
                                                                                                       search_results_dict,
                                                                                                       skip_amino_acid_sequences=True)

        self.append(source, reference, kind_of_search, domain, all_genes_searched_against, search_results_dict)

    # This import is required, not redundant: the conditional `import anvio.data.hmm`
    # at the top of this function makes `anvio` a *local* name for the whole function
    # body. When the caller supplies `sources`, that import never runs, and without the
    # line below `anvio.DEBUG` would raise UnboundLocalError.
    import anvio
    if not anvio.DEBUG:
        commander.clean_tmp_dirs()
        for v in list(target_files_dict.values()):
            os.remove(v)

        shutil.rmtree(tmp_directory_path)
def get_contigs_db_info_dict(contigs_db_path, run=run, progress=progress, include_AA_counts=False, split_names=None, exclude_partial_gene_calls=True):
    """Return an info dict for a given contigs database.

    Parameters
    ==========
    contigs_db_path : str
        Path to the contigs database. It is stored under the 'path' key of the result.
    run, progress :
        Terminal objects handed to `ContigsSuperclass`. NOTE: their `verbose` attribute
        is set to False here and is not restored afterwards, so this is a lasting side
        effect on whatever objects are passed in (including the defaults).
    include_AA_counts : bool
        If True, also add the 'AA_counts' and 'total_AAs' keys to the result.
    split_names : iterable, optional
        If given (and non-empty), sequence and gene statistics are limited to these
        splits. Otherwise everything in the contigs database is used.
    exclude_partial_gene_calls : bool
        If True, gene calls flagged as 'partial' are reported under 'excluded_gene_ids'
        instead of 'gene_caller_ids' and do not contribute to the gene statistics.

    Returns
    =======
    info_dict : dict
        Contains 'path', every key found in the `a_meta` of the contigs super class,
        and 'gc_content', 'total_length', 'gene_caller_ids', 'excluded_gene_ids',
        'num_genes', 'gene_lengths', 'avg_gene_length', 'num_genes_per_kb',
        'percent_complete', 'percent_redundancy', 'scg_domain', 'scg_domain_confidence'
        (plus 'AA_counts' and 'total_AAs' when `include_AA_counts` is True).
    """

    class Args:
        def __init__(self):
            self.contigs_db = contigs_db_path

    args = Args()

    # keep the contigs super class quiet while it initializes
    run.verbose = False
    progress.verbose = False
    c = ContigsSuperclass(args, r=run, p=progress)

    info_dict = {'path': contigs_db_path}

    for key in c.a_meta:
        info_dict[key] = c.a_meta[key]

    # Two different strategies here depending on whether we work with a given set of split ids or
    # everything in the contigs database.
    if split_names:
        split_names = set(split_names)
        c.init_split_sequences()
        seq = ''.join([c.split_sequences[split_name] for split_name in split_names])
        candidate_gene_caller_ids = {e['gene_callers_id'] for e in c.genes_in_splits.values() if e['split'] in split_names}
    else:
        c.init_contig_sequences()
        seq = ''.join([e['sequence'] for e in c.contig_sequences.values()])
        candidate_gene_caller_ids = c.genes_in_contigs_dict.keys()

    info_dict['gc_content'] = sequence.Composition(seq).GC_content
    info_dict['total_length'] = len(seq)

    # split the candidate gene calls into those we keep and those we exclude for being partial
    gene_caller_ids = set()
    excluded_gene_ids = set()
    for gene_caller_id in candidate_gene_caller_ids:
        if c.genes_in_contigs_dict[gene_caller_id]['partial'] and exclude_partial_gene_calls:
            excluded_gene_ids.add(gene_caller_id)
        else:
            gene_caller_ids.add(gene_caller_id)

    info_dict['gene_caller_ids'] = gene_caller_ids
    info_dict['excluded_gene_ids'] = excluded_gene_ids
    info_dict['num_genes'] = len(gene_caller_ids)
    info_dict['gene_lengths'] = {
        gene_caller_id: (c.genes_in_contigs_dict[gene_caller_id]['stop'] - c.genes_in_contigs_dict[gene_caller_id]['start'])
        for gene_caller_id in gene_caller_ids
    }

    # `dict.values()` is a view in Python 3, which numpy does not treat as a sequence of
    # numbers -- it has to be turned into a list before the mean can be computed.
    info_dict['avg_gene_length'] = numpy.mean(list(info_dict['gene_lengths'].values()))
    info_dict['num_genes_per_kb'] = info_dict['num_genes'] * 1000.0 / info_dict['total_length']

    # get completeness / contamination estimates (the last item returned is not used here)
    splits_to_consider = split_names if split_names else set(c.splits_basic_info.keys())
    p_completion, p_redundancy, domain, domain_confidence, _ = completeness.Completeness(contigs_db_path).get_info_for_splits(splits_to_consider)

    info_dict['percent_complete'] = p_completion
    info_dict['percent_redundancy'] = p_redundancy
    info_dict['scg_domain'] = domain
    info_dict['scg_domain_confidence'] = domain_confidence

    # lets get all amino acids used in all complete gene calls:
    if include_AA_counts:
        if split_names:
            AA_counts_dict = c.get_AA_counts_dict(split_names=split_names)
        else:
            AA_counts_dict = c.get_AA_counts_dict()

        info_dict['AA_counts'] = AA_counts_dict['AA_counts']
        info_dict['total_AAs'] = AA_counts_dict['total_AAs']

    return info_dict