def run_search_and_parse_results(self):
    """Align the protein against the database based on only sequence.

    Runs a DIAMOND blastp search of ``self.target_fasta_path`` against the
    ``.dmnd`` database, scores every hit by its "proper percent identity"
    (``pident * length / gene_length``), keeps the hits at or above
    ``self.percent_identical_cutoff`` (one per PDB code), and stores up to
    ``self.max_number_templates`` of the best ones in
    ``self.list_of_template_code_and_chain_ids`` and ``self.out["templates"]``.

    Raises
    ------
    ConfigError
        If ``self.percent_identical_cutoff`` or ``self.max_number_templates``
        is falsy.
    self.EndModeller
        If the search returns no hits, or no hit passes the cutoff.
    """
    if not self.percent_identical_cutoff or not self.max_number_templates:
        raise ConfigError(
            "run_search_and_parse_results :: You initiated this class without providing values for percent_identical_cutoff "
            "and max_number_templates, which is required for this function."
        )

    # The search runs from inside the MODELLER working directory. The
    # try/finally restores the user's directory even when the search fails,
    # so an error here cannot leave the process stranded elsewhere.
    os.chdir(self.directory)
    try:
        driver = diamond.Diamond(
            query_fasta=self.target_fasta_path,
            target_fasta=J(self.database_dir, self.modeller_database + '.dmnd'),
            run=terminal.Run(verbose=False),
            progress=terminal.Progress(verbose=False),
        )
        driver.blastp()
    finally:
        os.chdir(self.start_dir)

    search_df = driver.view_as_dataframe(
        J(self.directory, driver.tabular_output_path))

    matches_found = search_df.shape[0]
    if not matches_found:
        self.run.warning(
            "No proteins with homologous sequence were found for {}. No structure will be modelled"
            .format(self.corresponding_gene_call))
        raise self.EndModeller

    # We need the gene length for proper_pident. The loop keeps the length
    # of the last sequence read from the target FASTA.
    # NOTE(review): gene_length stays unbound if the FASTA yields no
    # sequence -- presumably the target FASTA always holds one entry; confirm.
    target_fasta = u.SequenceSource(self.target_fasta_path, lazy_init=False)
    try:
        while next(target_fasta):
            gene_length = len(target_fasta.seq)
    finally:
        # Release the file handle once the length is known.
        target_fasta.close()

    # add some useful columns
    search_df["proper_pident"] = (
        search_df["pident"] * search_df["length"] / gene_length)
    search_df["code"] = search_df["sseqid"].str[:-1]
    search_df["chain"] = search_df["sseqid"].str[-1]

    # Remember the best hit before filtering so it can still be reported
    # when nothing passes the cutoff.
    max_pident_found = search_df["proper_pident"].max()
    id_of_max_pident = tuple(
        search_df.loc[search_df["proper_pident"].idxmax(),
                      ["code", "chain"]].values)

    # filter results by self.percent_identical_cutoff.
    search_df = search_df[
        search_df["proper_pident"] >= self.percent_identical_cutoff]
    search_df = search_df.sort_values("proper_pident", ascending=False)

    # If more than 1 template in 1 PDB id, just choose 1 (the best-ranked,
    # because the frame is already sorted).
    search_df = search_df.drop_duplicates('code', keep='first')

    matches_after_filter = len(search_df)
    if not matches_after_filter:
        self.run.warning(
            "Gene {} did not have a search result with proper percent identicalness above or equal "
            "to {}%. The best match was chain {} of https://www.rcsb.org/structure/{}, which had a "
            "proper percent identicalness of {:.2f}%. No structure will be modelled."
            .format(self.corresponding_gene_call,
                    self.percent_identical_cutoff,
                    id_of_max_pident[1],
                    id_of_max_pident[0],
                    max_pident_found))
        raise self.EndModeller

    # get up to self.max_number_templates of those with the highest
    # proper_pident scores (slicing past the end is clamped).
    search_df = search_df.iloc[:self.max_number_templates]

    # Get their chain and 4-letter ids
    self.list_of_template_code_and_chain_ids = list(
        zip(search_df["code"], search_df["chain"]))

    self.run.info("Max number of templates allowed",
                  self.max_number_templates)
    self.run.info("Number of candidate templates", matches_found)
    self.run.info(
        "After >{}% identical filter".format(self.percent_identical_cutoff),
        matches_after_filter)
    self.run.info("Number accepted as templates",
                  len(self.list_of_template_code_and_chain_ids))

    # update user on which templates are used, and write the templates to self.out
    templates = self.out["templates"]
    ranked_templates = zip(self.list_of_template_code_and_chain_ids,
                           search_df["proper_pident"].values)
    for rank, ((pdb_id, chain_id), ppi) in enumerate(ranked_templates, start=1):
        templates["pdb_id"].append(pdb_id)
        templates["chain_id"].append(chain_id)
        templates["ppi"].append(ppi)
        self.run.info(
            "Template {}".format(rank),
            "Protein ID: {}, Chain {} ({:.1f}% identical)".format(
                pdb_id, chain_id, ppi))
def check_database(self):
    """Setup the database files

    Downloads the .pir file if it is missing
    Binarizes .pir file if .bin is missing
    Creates the .dmnd file if it is missing
    """
    bin_db_path = J(self.database_dir, self.modeller_database + ".bin")
    pir_db_path = J(self.database_dir, self.modeller_database + ".pir")

    bin_exists = utils.filesnpaths.is_file_exists(bin_db_path, dont_raise=True)
    pir_exists = utils.filesnpaths.is_file_exists(pir_db_path, dont_raise=True)

    # Nothing to do for .bin/.pir when both are already present.
    if not (bin_exists and pir_exists):
        if not pir_exists:
            # Download .pir
            self.run.warning(
                "Anvi'o looked in {} for a database with the name {} and with an extension "
                "of either .bin or .pir, but didn't find anything matching that "
                "criteria. Anvi'o will try and download the best database it knows of from "
                "https://salilab.org/modeller/downloads/pdb_95.pir.gz and use that. "
                "You can checkout https://salilab.org/modeller/ for more info about the pdb_95 "
                "database".format(self.database_dir, self.modeller_database))

            # NOTE(review): the archive is always saved as pdb_95.pir.gz, while
            # the binarize step below reads pir_db_path -- presumably
            # self.modeller_database is "pdb_95" whenever this branch runs; confirm.
            db_download_path = os.path.join(self.database_dir, "pdb_95.pir.gz")
            utils.download_file(
                "https://salilab.org/modeller/downloads/pdb_95.pir.gz",
                db_download_path)
            utils.run_command(
                ['gzip', '-d', db_download_path],
                log_file_path=filesnpaths.get_temp_file_path())

        # Binarize .pir (make .bin)
        self.run.warning(
            "Your database is not in binary format. That means accessing its contents is slower "
            "than it could be. Anvi'o is going to make a binary format. Just FYI"
        )
        self.run_binarize_database(pir_db_path, bin_db_path)

    dmnd_db_path = J(self.database_dir, self.modeller_database + '.dmnd')
    if os.path.exists(dmnd_db_path):
        return

    self.run.warning("Your diamond database does not exist. It will be created.")

    # Convert the .pir database to FASTA with the helper script.
    script_name = "pir_to_fasta.py"
    self.copy_script_to_directory(script_name)

    input_pir_path = J(self.database_dir, self.modeller_database + '.pir')
    fasta_path = J(self.database_dir, self.modeller_database + '.fa')
    dmnd_path = J(self.database_dir, self.modeller_database)

    command = [self.executable, script_name, input_pir_path, fasta_path]
    self.run_command(command, script_name=script_name, rename_log=False)

    # Rewrite the FASTA into a temp file with '-' removed and '.' replaced
    # by 'X' in every sequence.
    temp = u.FastaOutput(filesnpaths.get_temp_file_path())
    fasta = u.SequenceSource(fasta_path)
    try:
        while next(fasta):
            temp.write_id(fasta.id)
            temp.write_seq(fasta.seq.replace('-', '').replace('.', 'X'))
    finally:
        # Close both handles BEFORE moving: shutil.move copies the file when
        # source and destination are on different filesystems, so any output
        # still buffered in `temp` would otherwise be lost.
        fasta.close()
        temp.close()

    shutil.move(temp.output_file_path, fasta_path)

    driver = diamond.Diamond(
        query_fasta=fasta_path,
        run=terminal.Run(verbose=False),
        progress=terminal.Progress(verbose=False),
    )
    driver.makedb(output_file_path=dmnd_path)

    # The intermediate FASTA is only needed to build the diamond database.
    os.remove(fasta_path)
def run_search_and_parse_results(self):
    """Align the protein against the database based on only sequence.

    Runs a DIAMOND blastp search of ``self.target_fasta_path`` against the
    ``.dmnd`` database and derives, for every hit, an alignment fraction
    (``(length - gaps) / gene_length``) and a "proper percent identity"
    (``pident * align_fraction``). Hits must reach ``self.percent_cutoff``
    and ``self.alignment_fraction_cutoff``; of those, one hit per PDB code is
    kept, hits scoring more than 5 below the best are dropped, and up to
    ``self.max_number_templates`` are stored in
    ``self.list_of_template_code_and_chain_ids`` and ``self.out["templates"]``.

    Raises
    ------
    self.EndModeller
        If the search returns no hits, or no hit passes both cutoffs.
    """
    columns = [
        'qseqid', 'sseqid', 'pident', 'length', 'mismatch', 'gaps',
        'gapopen', 'qstart', 'qend', 'sstart', 'send', 'evalue', 'bitscore'
    ]

    # The search runs from inside the MODELLER working directory. The
    # try/finally restores the user's directory even when the search fails,
    # so an error here cannot leave the process stranded elsewhere.
    os.chdir(self.directory)
    try:
        driver = diamond.Diamond(
            query_fasta=self.target_fasta_path,
            target_fasta=J(self.database_dir, self.modeller_database + '.dmnd'),
            outfmt=' '.join(['6'] + columns),
            run=terminal.Run(verbose=False),
            progress=terminal.Progress(verbose=False),
        )
        driver.blastp()
    finally:
        os.chdir(self.start_dir)

    search_df = driver.view_as_dataframe(
        J(self.directory, driver.tabular_output_path))

    matches_found = search_df.shape[0]
    if not matches_found:
        self.run.warning(
            "No proteins with homologous sequence were found for {}. No structure will be modelled"
            .format(self.corresponding_gene_call))
        raise self.EndModeller

    # We need the gene length for the alignment fraction. The loop keeps the
    # length of the last sequence read from the target FASTA.
    # NOTE(review): gene_length stays unbound if the FASTA yields no
    # sequence -- presumably the target FASTA always holds one entry; confirm.
    target_fasta = u.SequenceSource(self.target_fasta_path, lazy_init=False)
    try:
        while next(target_fasta):
            gene_length = len(target_fasta.seq)
    finally:
        # Release the file handle once the length is known.
        target_fasta.close()

    # add some useful columns
    search_df["code"] = search_df["sseqid"].str[:-1]
    search_df["chain"] = search_df["sseqid"].str[-1]
    search_df["align_fraction"] = (
        search_df["length"] - search_df["gaps"]) / gene_length
    search_df["proper_pident"] = (
        search_df["pident"] * search_df["align_fraction"])

    # Find best match for align fraction and pident. Both the id and the
    # reported statistics are read from the SAME row, so a chain with several
    # hits cannot cause the statistics of a weaker hit to be reported.
    best_row = search_df.iloc[search_df['proper_pident'].argmax()]
    code_chain_id_of_best = (best_row['code'], best_row['chain'])
    best_hit = best_row[['pident', 'align_fraction']]

    # filter results by self.percent_cutoff and self.alignment_fraction_cutoff
    search_df = search_df[search_df["pident"] >= self.percent_cutoff]
    search_df = search_df[
        search_df["align_fraction"] >= self.alignment_fraction_cutoff]

    # Rank by the alignment fraction times the percent id
    search_df = search_df.sort_values("proper_pident", ascending=False)

    # If more than 1 template in 1 PDB id, just choose 1 (the best-ranked,
    # because the frame is already sorted).
    search_df = search_df.drop_duplicates('code', keep='first')

    matches_after_filter = len(search_df)
    if not matches_after_filter:
        # The alignment-fraction cutoff is compared against a 0-1 fraction,
        # so it is reported without a percent sign.
        self.run.warning(
            "Gene {} did not have a search result with percent identicalness above or equal "
            "to {}% and alignment fraction above or equal to {}. The best match was chain {} of https://www.rcsb.org/structure/{}, which had a "
            "percent identicalness of {:.2f}% and an alignment fraction of {:.3f}. No structure will be modelled."
            .format(self.corresponding_gene_call,
                    self.percent_cutoff,
                    self.alignment_fraction_cutoff,
                    code_chain_id_of_best[1],
                    code_chain_id_of_best[0],
                    best_hit['pident'],
                    best_hit['align_fraction']))
        raise self.EndModeller

    # Filter out templates with proper_pident more than 5% less than best match
    # http://merenlab.org/2018/09/04/getting-started-with-anvi-structure/#how-much-do-templates-matter
    search_df = search_df[
        search_df['proper_pident'] >= (search_df['proper_pident'].max() - 5)]

    # get up to self.max_number_templates of those with the highest
    # proper_pident scores (slicing past the end is clamped).
    search_df = search_df.iloc[:self.max_number_templates]

    # Get their chain and 4-letter ids
    self.list_of_template_code_and_chain_ids = list(
        zip(search_df["code"], search_df["chain"]))

    self.run.info("Max number of templates allowed",
                  self.max_number_templates)
    self.run.info("Number of candidate templates", matches_found)
    self.run.info(
        "After >{}% identical filter".format(self.percent_cutoff),
        matches_after_filter)
    self.run.info("Number accepted as templates",
                  len(self.list_of_template_code_and_chain_ids))

    # update user on which templates are used, and write the templates to self.out
    templates = self.out["templates"]
    ranked_templates = zip(
        self.list_of_template_code_and_chain_ids,
        search_df["proper_pident"].values,
        search_df["pident"].values,
        search_df["align_fraction"].values,
    )
    for rank, row in enumerate(ranked_templates, start=1):
        (pdb_id, chain_id), proper_percent_similarity, percent_similarity, align_fraction = row

        templates["pdb_id"].append(pdb_id)
        templates["chain_id"].append(chain_id)
        templates["proper_percent_similarity"].append(proper_percent_similarity)
        templates["percent_similarity"].append(percent_similarity)
        templates["align_fraction"].append(align_fraction)

        self.run.info(
            "Template {}".format(rank),
            "Protein ID: {}, Chain {} ({:.1f}% identical, {:.2f} align fraction)"
            .format(pdb_id, chain_id, percent_similarity, align_fraction))