def get_kic_run_details(output_directory, pdb_id, loop_sets, test_mode = False):
    '''This function returns the details required to set up the analysis for the Rosetta KIC and NGK methods.'''
    details = []
    c = 0
    for sc_file in glob.glob(os.path.join(output_directory, '{0}*.sc'.format(pdb_id))):

        # Determine the id
        sc_filename = os.path.split(sc_file)[1]
        assert(sc_filename.startswith('{0}_score'.format(pdb_id)))
        run_id = int(sc_filename[10:-3])

        # Determine the score
        sc_lines = [l.strip() for l in get_file_lines(sc_file) if l.strip()]
        assert(sc_lines[0] == 'SEQUENCE:')
        assert(sc_lines[1].split()[:2] == ['SCORE:', 'total_score'])
        assert(sc_lines[2].split()[0] == 'SCORE:')
        total_score = float(sc_lines[2].split()[1])

        # Determine the filepath of the predicted structure
        associated_pdb_file = os.path.join(output_directory, '{0}_{0}{1}_0001.pdb'.format(pdb_id, run_id))

        # Extract the PDB coordinates into a pandas dataframe (HDF5 format)
        assert(os.path.exists(associated_pdb_file))
        hdf5_file = os.path.splitext(associated_pdb_file)[0] + '.hdf5'
        if os.path.exists(hdf5_file):
            store = pandas.HDFStore(hdf5_file)
            pdb_loop_residue_matrix = store['dataframe']
            store.close()
        else:
            pdb_loop_residue_matrix = PDB.extract_xyz_matrix_from_loop_json(PDB.from_filepath(associated_pdb_file).structure_lines, loop_sets, atoms_of_interest = backbone_atoms, expected_num_residues = 12, expected_num_residue_atoms = 4)
            store = pandas.HDFStore(hdf5_file)
            store['dataframe'] = pdb_loop_residue_matrix
            store.close()

        details.append(dict(
            id = run_id,
            score = total_score,
            predicted_structure = associated_pdb_file,
            pdb_loop_residue_matrix = pdb_loop_residue_matrix,
        ))
        if test_mode:
            c += 1
            if c >= 10:
                break
    return details
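# Hedged usage sketch for get_kic_run_details(): gather the run details for one benchmark
# case and print the five best-scoring models. The paths are hypothetical; read_file, json
# and backbone_atoms are assumed to be available in this module as they are elsewhere in the script.
loop_sets = json.loads(read_file('input/1a2p.loop.json'))
kic_details = get_kic_run_details('output/kic_runs', '1a2p', loop_sets, test_mode = True)
for d in sorted(kic_details, key = lambda x: x['score'])[:5]:
    print(d['id'], d['score'], d['predicted_structure'])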
def setup():
    global pdb_file_paths             # RCSB PDB_ID -> PDB file
    global rcsb_pdb_objects           # RCSB PDB_ID -> PDB object
    global tina_pdb_objects           # Tina's PDB_ID -> PDB object
    global tina_pdb_id_to_rcsb_pdb_id # Tina's PDB_ID -> RCSB PDB_ID
    global mutations_dataframe
    if not mutations_dataframe:
        setup_mutations_dataframe()

    # old_mutations_csv is missing some cases but has the mapping from pdb -> partner 1 name, partner 2 name
    old_mutations_csv = os.path.join('temp', 'mutations_Gsp1_old.txt')
    assert(os.path.exists('temp'))
    assert(os.path.exists(old_mutations_csv))
    df = pandas.read_csv(old_mutations_csv, sep = '\t')

    tina_pdb_ids = sorted(set([p for p in df['pdb'].values]))
    rcsb_pdb_ids = set()
    for pdb_id in tina_pdb_ids:
        rcsb_pdb_ids.add(pdb_id[:4])
        tina_pdb_id_to_rcsb_pdb_id[pdb_id] = pdb_id[:4]
    rcsb_pdb_ids = sorted(rcsb_pdb_ids)
    assert(rcsb_pdb_ids == sorted(set([p[:4] for p in mutations_dataframe['pdb'].values])))

    rcsb_file_dir = '../../rawdata'
    for pdb_id in tina_pdb_ids:
        tina_pdb_objects[pdb_id] = PDB.from_filepath(os.path.join('temp', 'pdbs', '{0}.pdb'.format(pdb_id)), parse_ligands = True)
    for pdb_id in rcsb_pdb_ids:
        filename = '{0}.pdb'.format(pdb_id.upper())
        pdb_file_paths[pdb_id.upper()] = os.path.join(rcsb_file_dir, filename)
        pdb_contents = download_pdb(pdb_id, rcsb_file_dir, silent = True, filename = filename)
        p = PDB(pdb_contents, parse_ligands = True)
        rcsb_pdb_objects[pdb_id] = p

    print('\nRosetta files ({0}) : {1}'.format(str(len(tina_pdb_ids)).rjust(2), ', '.join([s.rjust(5) for s in tina_pdb_ids])))
    print('Original files ({0}) : {1}\n'.format(str(len(rcsb_pdb_ids)).rjust(2), ', '.join([s.rjust(5) for s in rcsb_pdb_ids])))

    ppi_api = get_ppi_api()
    for pdb_id, pdb_file_path in pdb_file_paths.iteritems():
        existing_records = ppi_api.DDG_db.execute_select('SELECT * FROM PDBFile WHERE ID=%s', parameters=(pdb_id,))
        if existing_records:
            colortext.warning('The PDB file {0} exists in the database.'.format(pdb_id))
        complex_ids = ppi_api.search_complexes_by_pdb_id(pdb_id)
        if complex_ids:
            colortext.warning('The PDB file {0} has associated complexes: {1}'.format(pdb_id, ', '.join(map(str, complex_ids))))
    print('')
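# setup() assumes the following module-level globals. The names come from its global
# statements; the empty-container defaults are an assumption and would normally sit near
# the top of the module, before setup() is called.
pdb_file_paths = {}                # RCSB PDB_ID -> PDB file path
rcsb_pdb_objects = {}              # RCSB PDB_ID -> PDB object
tina_pdb_objects = {}              # Tina's PDB_ID -> PDB object
tina_pdb_id_to_rcsb_pdb_id = {}    # Tina's PDB_ID -> RCSB PDB_ID
mutations_dataframe = None         # populated by setup_mutations_dataframe()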
def regenerate_mutfile(PredictionID):
    '''I needed to write this function as I forgot to add a *.mutfile mask to the ProtocolCleaner at first so mutfiles were not kept.'''
    raise Exception("We should never need to call this")
    KeepHETATMLines = False

    results = ddGdb.execute_select("SELECT ExperimentID, UserDataSetExperimentID FROM Prediction WHERE ID=%s", parameters = (PredictionID,))
    assert(len(results) == 1)
    ExperimentID = results[0]['ExperimentID']
    UserDataSetExperimentID = results[0]['UserDataSetExperimentID']

    results = ddGdb.execute_select("SELECT PDBFileID FROM UserDataSetExperiment WHERE ID=%s", parameters = (UserDataSetExperimentID,))
    assert(len(results) == 1)
    PDB_ID = results[0]['PDBFileID']

    results = ddGdb.execute_select("SELECT PDBFileID, Content FROM Experiment INNER JOIN PDBFile WHERE Experiment.PDBFileID=PDBFile.ID AND Experiment.ID=%s", parameters = (ExperimentID,))
    assert(len(results) == 1)
    experimentPDB_ID = results[0]["PDBFileID"]

    qry = "SELECT ID, Content FROM PDBFile WHERE ID=%s"
    results = ddGdb.execute_select(qry, parameters = (PDB_ID,))
    if len(results) != 1:
        raise colortext.Exception("The SQL query '%s' returned %d results where 1 result was expected." % (qry, len(results)))
    predictionPDB_ID = results[0]["ID"]

    # Get the related PDB ID and file
    assert(len(results) == 1)
    result = results[0]
    pdbID = result["ID"]
    contents = result["Content"]
    pdb = PDB(contents.split("\n"))

    # Check that the mutated positions exist and that the wild-type matches the PDB
    mutations = ddGdb.call_select_proc("GetMutations", parameters = (ExperimentID,))

    # todo: Hack. This should be removed when PDB homologs are dealt with properly.
    for mutation in mutations:
        if experimentPDB_ID == "1AJ3" and predictionPDB_ID == "1U5P":
            assert(int(mutation['ResidueID']) < 1000)
            mutation['ResidueID'] = str(int(mutation['ResidueID']) + 1762)
    pdb.validate_mutations(mutations)

    # Strip the PDB to the list of chains. This also renumbers residues in the PDB for Rosetta.
    chains = [result['Chain'] for result in ddGdb.call_select_proc("GetChains", parameters = (ExperimentID,))]
    pdb.stripForDDG(chains, KeepHETATMLines, numberOfModels = 1)

    # - Post stripping checks -
    # Get the 'Chain ResidueID' PDB-formatted identifier for each mutation mapped to Rosetta numbering
    # then check again that the mutated positions exist and that the wild-type matches the PDB
    remappedMutations = pdb.remapMutations(mutations, pdbID)
    remappedMutations = [[m[0], PDB.ResidueID2String(m[1]), m[2], m[3]] for m in remappedMutations]

    #resfile = self._createResfile(pdb, remappedMutations)
    return(_createMutfile(pdb, remappedMutations))
def static_get_pdb_object(pdb_id, bio_cache = None, cache_dir = None):
    '''This method does not necessarily use a BioCache but it seems to fit here.'''
    pdb_id = pdb_id.upper()

    if bio_cache:
        return bio_cache.get_pdb_object(pdb_id)

    if cache_dir:
        # Check to see whether we have a cached copy of the PDB file
        filepath = os.path.join(cache_dir, '{0}.pdb'.format(pdb_id))
        if os.path.exists(filepath):
            return PDB.from_filepath(filepath)

    # Get any missing files from the RCSB and create cached copies if appropriate
    pdb_contents = retrieve_pdb(pdb_id)
    if cache_dir:
        write_file(os.path.join(cache_dir, "%s.pdb" % pdb_id), pdb_contents)
    return PDB(pdb_contents)
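# Hedged usage sketch for static_get_pdb_object(): fetch 1A2C, preferring a local cache
# over a round trip to the RCSB. The cache path is hypothetical and must already exist.
p = static_get_pdb_object('1a2c', cache_dir = '/tmp/pdb_cache')
print(p.get_resolution())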
def main(FixedIDs = [], radii = [6.0, 7.0, 8.0, 9.0]):
    max_processors = get_number_of_processors()
    rescore_process_file = "/tmp/klab_rescore.txt"

    parser = OptionParser()
    parser.add_option("-n", "--numprocesses", default=1, type='int', dest="num_processes", help="The number of processes used for the rescoring. The cases are split according to this number.", metavar="NUM_PROCESSES")
    parser.add_option("-p", "--process", default=1, type='int', dest="process", help="The ID of this process. This should be an integer between 1 and the number of processes used for the rescoring.", metavar="PROCESS_ID")
    parser.add_option("-d", "--delete", action="store_true", dest="delete", help="Delete the process tracking file %s." % rescore_process_file)
    parser.add_option("-s", "--set", type='string', dest="prediction_set", help="The prediction set to rescore.")
    (options, args) = parser.parse_args()

    if options.delete and os.path.exists(rescore_process_file):
        print("Removing %s." % rescore_process_file)
        os.remove(rescore_process_file)

    num_processes = options.num_processes
    prediction_set = options.prediction_set
    process_id = options.process

    for i in FixedIDs:
        assert(type(i) == type(1))

    # SELECT * FROM `Prediction` WHERE `PredictionSet`= 'RosCon2013_P16_score12prime' AND Status='done' LIMIT 1

    # Check prediction set
    if not prediction_set:
        raise colortext.Exception("A prediction set must be specified.")
    else:
        if FixedIDs:
            results = ddGdb.execute("SELECT DISTINCT PredictionSet FROM Prediction WHERE ID IN (%s)" % ",".join(map(str, FixedIDs)))
            if len(results) != 1:
                raise colortext.Exception("Error: The fixed IDs cover %d different prediction sets." % len(results))
        else:
            results = ddGdb.execute("SELECT ID FROM PredictionSet WHERE ID=%s", parameters=(prediction_set,))
            if not results:
                raise colortext.Exception("The prediction set '%s' does not exist in the database." % prediction_set)

    if num_processes < 1:
        raise colortext.Exception("At least 1 processor must be used.")
    if num_processes > max_processors:
        raise colortext.Exception("Only %d processors/cores were detected. Cannot run with %d processes." % (max_processors, num_processes))
    if num_processes > (max_processors * 0.75):
        colortext.warning("Warning: Using %d processors/cores out of %d which is %0.2f%% of the total available." % (num_processes, max_processors, (100.0*float(num_processes)/float(max_processors))))
    if not(1 <= process_id <= min(max_processors, num_processes)):
        raise colortext.Exception("The process ID %d must be between 1 and the number of processes, %d." % (process_id, num_processes))

    if os.path.exists(rescore_process_file):
        lines = readFileLines(rescore_process_file)
        idx = lines[0].find("numprocesses")
        if idx == -1:
            raise Exception("Badly formatted %s." % rescore_process_file)
        existing_num_processes = int(lines[0][idx+len("numprocesses"):])
        if existing_num_processes != num_processes:
            raise colortext.Exception("You specified the number of processes to be %d but %s already specifies it as %d." % (num_processes, rescore_process_file, existing_num_processes))
        for line in [line for line in lines[1:] if line.strip()]:
            idx = line.find("process")
            if idx == -1:
                raise colortext.Exception("Badly formatted %s. Line is '%s'." % (rescore_process_file, line))
            existing_process = int(line[idx+len('process'):])
            if process_id == existing_process:
                raise colortext.Exception("Process %d is already logged as running. Check if this is so and edit %s." % (process_id, rescore_process_file))
        F = open(rescore_process_file, 'a')
        F.write("process %d\n" % process_id)
        F.close()
    else:
        F = open(rescore_process_file, 'w')
        F.write("numprocesses %d\n" % num_processes)
        F.write("process %d\n" % process_id)
        F.close()

    output_dir = os.path.join('rescoring', str(process_id))
    if not(os.path.exists(output_dir)):
        os.makedirs(output_dir)
    abs_output_dir = os.path.abspath(os.path.join(os.getcwd(), output_dir))
    print("Running process in %s.\n" % abs_output_dir)

    ReallyFixedIDs = False

    results = ddGdb.execute("SELECT ID, ExperimentID, Scores FROM Prediction WHERE PredictionSet=%s AND Status='done' AND ScoreVersion <> %s", parameters=(prediction_set, float(current_score_revision),))
    if not(FixedIDs) and results:
        raise WrongScoreRevisionException("Score versions found which are not %s. Need to update table structure." % current_score_revision)
    else:
        # Hacky way to run multiple processes
        if ReallyFixedIDs:
            num_to_score = len(remaining_unscored)
            num_for_this_to_score = num_to_score / num_processes
            IDs_to_score = remaining_unscored[(process_id-1) * num_for_this_to_score : (process_id) * num_for_this_to_score]
            results = ddGdb.execute("SELECT ID, ExperimentID, Scores, UserDataSetExperimentID FROM Prediction WHERE ID IN (%s)" % (",".join(map(str, IDs_to_score))))
        elif FixedIDs:
            results = ddGdb.execute("SELECT ID, ExperimentID, Scores, UserDataSetExperimentID FROM Prediction WHERE ID IN (%s) AND MOD(ID,%s)=%s" % (",".join(map(str, FixedIDs)), num_processes, process_id-1))
        else:
            results = ddGdb.execute("SELECT ID, ExperimentID, Scores, UserDataSetExperimentID FROM Prediction WHERE PredictionSet=%s AND Status='done' AND ScoreVersion=%s AND MOD(ID,%s)=%s", parameters=(prediction_set, float(current_score_revision), num_processes, process_id-1))

    count = 0
    cases_computed = 0
    total_time_in_secs = 0

    number_of_cases_left = len(results) * len(radii)

    failed_cases = []
    colortext.printf("Rescoring %d predictions over %d radii...\n" % (len(results), len(radii)), 'lightgreen')
    for r in results:
        t = Timer()
        t.add('Preamble')
        inner_count = 0

        mutations = ddGdb.execute('SELECT * FROM ExperimentMutation WHERE ExperimentID=%s', parameters=(r['ExperimentID'],))
        mutation_str = ', '.join(['%s %s%s%s' % (m['Chain'], m['WildTypeAA'], m['ResidueID'], m['MutantAA']) for m in mutations])
        extracted_data = False

        details = ddGdb.execute_select('SELECT Prediction.ID, PDBFileID, Chain FROM Prediction INNER JOIN Experiment ON Prediction.ExperimentID=Experiment.ID INNER JOIN ExperimentChain ON Prediction.ExperimentID=ExperimentChain.ExperimentID WHERE Prediction.ID=%s', parameters=(r['ID'],))
        colortext.message("Prediction: %d, %s chain %s. Mutations: %s. Experiment ID #%d. UserDataSetExperimentID #%d." % (details[0]['ID'], details[0]['PDBFileID'], details[0]['Chain'], mutation_str, r['ExperimentID'], r['UserDataSetExperimentID']))

        experiment_pdbID = ddGdb.execute('SELECT PDBFileID FROM Experiment WHERE ID=%s', parameters=(r['ExperimentID'],))[0]['PDBFileID']
        print('Experiment PDB file ID = %s' % experiment_pdbID)
        pdbID = ddGdb.execute('SELECT UserDataSetExperiment.PDBFileID FROM Prediction INNER JOIN UserDataSetExperiment ON UserDataSetExperimentID=UserDataSetExperiment.ID WHERE Prediction.ID=%s', parameters=(r['ID'],))[0]['PDBFileID']
        print('UserDataSetExperiment PDB file ID = %s' % pdbID)

        count += 1
        if True: #len(mutations) == 1:
            timestart = time.time()

            #mutation = mutations[0]
            dbchains = sorted(set([mutation['Chain'] for mutation in mutations]))
            # todo: note: assuming monomeric structures here
            assert(len(dbchains) == 1)
            dbchain = dbchains[0]
            #mutantaa = mutation['MutantAA']

            ddG_dict = json.loads(r['Scores'])
            kellogg_ddG = ddG_dict['data']['kellogg']['total']['ddG']
            #assert(ddG_dict['version'] == current_score_revision)

            all_done = True
            for radius in radii:
                score_name = ('noah_%0.1fA' % radius).replace(".", ",")
                if not(ddG_dict['data'].get(score_name)):
                    all_done = False
                else:
                    cases_computed += 1
                    number_of_cases_left -= 1
            if all_done:
                print('Prediction %d: done.' % r["ID"])
                continue

            # Extract data
            t.add('Grab data')
            #archivefile = None
            #prediction_data_path = ddGdb.execute('SELECT Value FROM _DBCONSTANTS WHERE VariableName="PredictionDataPath"')[0]['Value']
            #job_data_path = os.path.join(prediction_data_path, '%d.zip' % r['ID'])
            #print(job_data_path)
            #assert(os.path.exists(job_data_path))
            #archivefile = readBinaryFile(job_data_path)
            archivefile = DDG_interface.getData(r['ID'])
            zipfilename = os.path.join(output_dir, "%d.zip" % r['ID'])
            F = open(zipfilename, "wb")
            F.write(archivefile)
            F.close()

            t.add('Extract data')
            zipped_content = zipfile.ZipFile(zipfilename, 'r', zipfile.ZIP_DEFLATED)
            tmpdir = None
            repacked_files = []
            mutant_files = []
            rosetta_resids = []
            try:
                tmpdir = makeTemp755Directory(output_dir)
                highestIndex = -1
                foundResfile = False
                foundMutfile = False
                presumed_mutation = None

                for fname in sorted(zipped_content.namelist()):
                    if fname.endswith(".pdb"):
                        if fname.startswith("%s/mut_" % r['ID']) or fname.startswith("%s/repacked_" % r['ID']):
                            structnum = int(fname[fname.rindex('_')+1:-4])
                            if fname.startswith("%s/mut_" % r['ID']):
                                if presumed_mutation:
                                    assert(presumed_mutation == os.path.split(fname)[1].split('_')[1])
                                else:
                                    presumed_mutation = os.path.split(fname)[1].split('_')[1]
                                newfname = 'mutant_%02d' % structnum
                            if fname.startswith("%s/repacked_" % r['ID']):
                                newfname = 'repacked_%02d' % structnum
                            highestIndex = max(highestIndex, structnum)
                            newfilepath = os.path.join(tmpdir, newfname)
                            writeFile(newfilepath, zipped_content.read(fname))
                            if fname.startswith("%s/mut_" % r['ID']):
                                mutant_files.append(newfilepath)
                            if fname.startswith("%s/repacked_" % r['ID']):
                                repacked_files.append(newfilepath)
                    #elif fname.startswith("%s/%s-%s" % (r['ID'],r['ExperimentID'],pdbID)) or fname.startswith("%s/repacked_" % r['ID']):
                    #    writeFile(os.path.join(tmpdir, '%s.pdb' % pdbID), zipped_content.read(fname))
                    if fname.startswith("%s/%s-%s.resfile" % (r['ID'], r['ExperimentID'], experiment_pdbID)):
                        raise Exception('This case needs to be updated (see the mutfile section below). We mainly use mutfiles now so I did not update this section.')
                        foundResfile = True
                        lines = zipped_content.read(fname).split("\n")
                        assert(len(lines) == 3)
                        assert(lines[0] == "NATAA")
                        assert(lines[1] == "start")
                        resfile_mutation = lines[2].split(" ")
                        assert(len(resfile_mutation) == 4)
                        rosetta_resid = resfile_mutation[0]
                        rosetta_chain = resfile_mutation[1]
                        rosetta_mutaa = resfile_mutation[3]
                        assert(mutantaa == rosetta_mutaa)
                        assert(dbchain == rosetta_chain)
                        assert(resfile_mutation[2] == 'PIKAA')
                        assert(len(rosetta_mutaa) == 1)
                    if fname.startswith("%s/%s-%s.mutfile" % (r['ID'], r['ExperimentID'], experiment_pdbID)):
                        foundMutfile = True
                        lines = zipped_content.read(fname).split("\n")
                        assert(lines[0].startswith('total '))
                        num_mutations = int(lines[0][6:])
                        assert(lines[1] == str(num_mutations))
                        # todo: note: assuming monomeric structures here
                        rosetta_chain = ddGdb.execute("SELECT Chain FROM ExperimentChain WHERE ExperimentID=%s", parameters=(r['ExperimentID'],))
                        assert(len(rosetta_chain) == 1)
                        rosetta_chain = rosetta_chain[0]['Chain']
                        resfile_mutations = lines[2:]
                        for resfile_mutation in resfile_mutations:
                            resfile_mutation = resfile_mutation.split(" ")
                            assert(len(resfile_mutation) == 3)
                            rosetta_resids.append(resfile_mutation[1])
                            rosetta_mutaa = resfile_mutation[2]
                            assert(dbchain == rosetta_chain)
                            assert(len(rosetta_mutaa) == 1)

                # Make sure the wtaa->mutantaa types match the structures
                assert(not(foundResfile))
                if not foundMutfile:
                    raise Exception('This case needs to be updated (see the mutfile section below). This was added as a hack for cases where I did not store the mutfile so I did not update this section.')
                    input_files = ddGdb.execute_select('SELECT InputFiles FROM Prediction WHERE ID=%s', parameters=(r['ID'],))
                    assert(len(input_files) == 1)
                    lines = pickle.loads(input_files[0]['InputFiles'])['MUTFILE'].split("\n")
                    #lines = regenerate_mutfile(r['ID']).split("\n")
                    assert(len(lines) == 3)
                    assert(lines[0] == "total 1")
                    assert(lines[1] == "1")
                    resfile_mutation = lines[2].split(" ")
                    assert(len(resfile_mutation) == 3)
                    rosetta_resid = resfile_mutation[1]
                    rosetta_chain = ddGdb.execute("SELECT Chain FROM ExperimentChain WHERE ExperimentID=%s", parameters=(r['ExperimentID'],))
                    assert(len(rosetta_chain) == 1)
                    rosetta_chain = rosetta_chain[0]['Chain']
                    rosetta_mutaa = resfile_mutation[2]
                    assert(dbchain == rosetta_chain)
                    assert(len(rosetta_mutaa) == 1)
                    assert("%s%s%s" % (resfile_mutation[0], resfile_mutation[1], resfile_mutation[2]) == presumed_mutation)

                fullresids = []
                for rosetta_resid in rosetta_resids:
                    fullresid = None
                    if rosetta_resid.isdigit():
                        fullresid = '%s%s%s ' % (rosetta_chain, (4-len(rosetta_resid)) * ' ', rosetta_resid)
                    else:
                        assert(False)
                        fullresid = '%s%s%s' % (rosetta_chain, (5-len(rosetta_resid)) * ' ', rosetta_resid)
                    fullresids.append(fullresid)

                resultst1 = ddGdb.execute_select("SELECT ExperimentID, UserDataSetExperimentID FROM Prediction WHERE ID=%s", parameters = (r['ID'],))
                assert(len(resultst1) == 1)
                ExperimentIDt1 = resultst1[0]['ExperimentID']
                UserDataSetExperimentIDt1 = resultst1[0]['UserDataSetExperimentID']
                if UserDataSetExperimentIDt1:
                    resultst2 = ddGdb.execute_select("SELECT PDBFileID FROM UserDataSetExperiment WHERE ID=%s", parameters = (UserDataSetExperimentIDt1,))
                else:
                    resultst2 = ddGdb.execute_select("SELECT PDBFileID FROM Experiment WHERE ID=%s", parameters = (ExperimentIDt1,))
                assert(len(resultst2) == 1)
                prediction_PDB_ID = resultst2[0]['PDBFileID']

                if False and prediction_PDB_ID not in ['1TEN', '1AYE', '1H7M'] + ['1A2P', '1BNI', '1STN']:
                    for fullresid in fullresids:
                        wtaa = None
                        for m in mutations:
                            # Hack for ub_RPN13
                            if prediction_PDB_ID == 'ub_RPN13' and m['Chain'] == fullresid[0] and m['ResidueID'] == str(int(fullresid[1:].strip()) - 109):
                                wtaa = m['WildTypeAA']
                            # Hack for ub_RPN13_yeast
                            elif prediction_PDB_ID == 'uby_RPN13' and m['Chain'] == fullresid[0] and m['ResidueID'] == str(int(fullresid[1:].strip()) - 109):
                                wtaa = m['WildTypeAA']
                            # Hack for ub_OTU
                            elif prediction_PDB_ID == 'ub_OTU' and m['Chain'] == fullresid[0] and m['ResidueID'] == str(int(fullresid[1:].strip()) - 172):
                                wtaa = m['WildTypeAA']
                            # Hack for ub_OTU_yeast
                            elif prediction_PDB_ID == 'uby_OTU' and m['Chain'] == fullresid[0] and m['ResidueID'] == str(int(fullresid[1:].strip()) - 172):
                                wtaa = m['WildTypeAA']
                            # Hack for ub_UQcon
                            elif prediction_PDB_ID == 'ub_UQcon' and m['Chain'] == fullresid[0] and m['ResidueID'] == str(int(fullresid[1:].strip()) + 213): # starts at 501
                                wtaa = m['WildTypeAA']
                            # Hack for uby_UQcon
                            elif prediction_PDB_ID == 'uby_UQcon' and m['Chain'] == fullresid[0] and m['ResidueID'] == str(int(fullresid[1:].strip()) - 287):
                                wtaa = m['WildTypeAA']
                            elif m['Chain'] == fullresid[0] and m['ResidueID'] == fullresid[1:].strip():
                                wtaa = m['WildTypeAA']
                        if (wtaa == None):
                            colortext.error(prediction_PDB_ID)
                            colortext.error('wtaa == None')
                            colortext.error('fullresid = %s' % str(fullresid))
                            colortext.error(str(mutations))
                            colortext.warning([rosetta_resid.strip() for rosetta_resid in rosetta_resids])
                            #sys.exit(0)
                        assert(wtaa != None)
                        assert(PDB.from_filepath(repacked_files[0]).get_residue_id_to_type_map()[fullresid] == wtaa)
                    #assert(PDB(mutant_files[0]).get_residue_id_to_type_map()[fullresid] == mutantaa)

                for radius in radii:
                    score_name = ('noah_%0.1fA' % radius).replace(".", ",")
                    if ddG_dict['data'].get(score_name):
                        print('Radius %0.1f: done.' % radius)
                        continue
                    cases_computed += 1
                    number_of_cases_left -= 1

                    t.add('Radius %0.3f: repacked' % radius)
                    colortext.printf("Prediction ID: %d. Calculating radius %0.1f. Calculation #%d of %d." % (r['ID'], radius, cases_computed, len(results) * len(radii)), 'orange')

                    repacked_score = NoahScore()
                    repacked_score.calculate(repacked_files, rosetta_chain, sorted([rosetta_resid.strip() for rosetta_resid in rosetta_resids]), radius = radius)
                    colortext.message("Repacked")
                    print(repacked_score)

                    t.add('Radius %0.3f: mutant' % radius)
                    mutant_score = NoahScore()
                    mutant_score.calculate(mutant_files, rosetta_chain, sorted([rosetta_resid.strip() for rosetta_resid in rosetta_resids]), radius = radius)
                    colortext.printf("Mutant", color = 'cyan')
                    print(mutant_score)

                    t.add('Radius %0.3f: postamble' % radius)
                    colortext.printf("ddG", color = 'lightpurple')
                    ddg_score = repacked_score.ddg(mutant_score)
                    print(ddg_score)

                    colortext.printf("Liz's ddG", color = 'yellow')
                    print("Total score: %0.3f" % kellogg_ddG)

                    ddG_dict['version'] = '0.23'
                    if ddG_dict['version'] == '0.1':
                        ddG_dict['version'] = '0.21'
                        ddG_dict['data'] = {
                            'kellogg' : {
                                'total' : ddG_dict['data'],
                            },
                            'noah': {
                                'total' : {'ddG' : ddg_score.total},
                                'positional' : {'ddG' : ddg_score.positional},
                                'positional_twoscore' : {'ddG' : ddg_score.positional_twoscore},
                            },
                        }
                    elif ddG_dict['version'] == '0.2':
                        ddG_dict['version'] = '0.21'
                        ddG_dict['data']['noah']['total']['ddG'] = ddg_score.total
                        ddG_dict['data']['noah']['positional']['ddG'] = ddg_score.positional
                        ddG_dict['data']['noah']['positional_twoscore']['ddG'] = ddg_score.positional_twoscore
                    elif ddG_dict['version'] == '0.22':
                        ddG_dict['data'][score_name] = {'total' : {}, 'positional' : {}, 'positional_twoscore' : {}}
                        ddG_dict['data'][score_name]['total']['ddG'] = ddg_score.total
                        ddG_dict['data'][score_name]['positional']['ddG'] = ddg_score.positional
                        ddG_dict['data'][score_name]['positional_twoscore']['ddG'] = ddg_score.positional_twoscore
                    elif ddG_dict['version'] == '0.23':
                        ddG_dict['data'][score_name] = {'total' : {}, 'positional' : {}, 'positional_twoscore' : {}}
                        ddG_dict['data'][score_name]['total']['ddG'] = ddg_score.total
                        ddG_dict['data'][score_name]['positional']['ddG'] = ddg_score.positional
                        ddG_dict['data'][score_name]['positional_twoscore']['ddG'] = ddg_score.positional_twoscore

                    jsonified_ddG = json.dumps(ddG_dict)
                    ddGdb.execute('UPDATE Prediction SET Scores=%s WHERE ID=%s', parameters=(jsonified_ddG, r['ID'],))

                t.add('Cleanup')
                shutil.rmtree(tmpdir)
                os.remove(zipfilename)

            except Exception, e:
                print("Exception! In prediction %d" % r['ID'], str(e))
                failed_cases.append(r['ID'])
                import traceback
                print(traceback.format_exc())
                if tmpdir:
                    shutil.rmtree(tmpdir)

        total_time_in_secs += t.sum()
        average_time_taken = float(total_time_in_secs)/float(cases_computed or 1)
        estimate_remaining_time = number_of_cases_left * average_time_taken

        t.stop()
        colortext.printf("**Profile**", 'orange')
        print(t)
        colortext.message("Time taken for this case: %0.2fs." % t.sum())
        colortext.message("Average time taken per case: %0.2fs." % average_time_taken)
        colortext.message("Estimated time remaining: %dh%dm%ds." % (int(estimate_remaining_time/3600), int((estimate_remaining_time/60) % 60), estimate_remaining_time % 60))
        print("\n")
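# Hedged example of driving the rescoring main() above. The script name and prediction set
# are illustrative ('RosCon2013_P16_score12prime' appears in a comment inside main()); main()
# reads its options from sys.argv via OptionParser, so we set sys.argv before calling it.
import sys
sys.argv = ['rescore.py', '--set', 'RosCon2013_P16_score12prime', '--numprocesses', '4', '--process', '1']
main()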
pdb_prefix = os.path.splitext(os.path.split(pdb_file)[1])[0].lower()

file_prefix = os.path.splitext(pdb_file)[0]
fasta_file = file_prefix + '.fasta'
loop_file = file_prefix + '.loop.json'
assert(os.path.exists(fasta_file))
assert(os.path.exists(loop_file))

# Convert the FASTA headers back into PDB residue IDs
fasta_contents = read_file(fasta_file)
headers = [l for l in fasta_contents.split('\n') if l.startswith('>')]
assert(len(headers) == 1)
header = headers[0]
pdb_residue_ids = [PDB.ChainResidueID2String(l[0], l[1:]) for l in header[header.find('Residues ') + 9:].split(';')]

# Add the missing backbone atoms back into the PDB file
spackler = Spackler.from_filepath(pdb_file)
new_pdb_content = spackler.add_backbone_atoms_linearly_from_loop_filepaths(loop_file, fasta_file, pdb_residue_ids)
write_file(os.path.join(output_directory, '{0}.pdb'.format(pdb_prefix)), new_pdb_content)

# Create a Rosetta .loop file
loop_set = json.loads(read_file(loop_file)).get('LoopSet')
assert(len(loop_set) == 1)
start_res = '{chainID}{resSeq:>4d}{iCode}'.format(
class SIFTS(xml.sax.handler.ContentHandler):

    def __init__(self, xml_contents, pdb_contents, acceptable_sequence_percentage_match = 70.0, cache_dir = None, domain_overlap_cutoff = 0.88, require_uniprot_residue_mapping = True, bio_cache = None, pdb_id = None):
        ''' The PDB contents should be passed so that we can deal with HETATM records as the XML does not contain the necessary information.
            If require_uniprot_residue_mapping is set and there is no PDB residue -> UniProt sequence index mapping (e.g. 2IMM at the time of writing) then we raise an exception.
            Otherwise, we store the information we can which can still be useful e.g. SCOP domain data.
            bio_cache should be a klab.bio.cache.py::BioCache object and is used to avoid reading/downloading cached files repeatedly.
        '''
        self.atom_to_uniparc_sequence_maps = {} # PDB Chain -> PDBUniParcSequenceMap(PDB ResidueID -> (UniParc ID, UniParc sequence index)) where the UniParc sequence index is 1-based (first element has index 1)
        # Note: These maps map from PDB residue IDs to PDBe residue IDs
        self.atom_to_seqres_sequence_maps = {} # PDB Chain -> SequenceMap(PDB ResidueID -> SEQRES sequence index) where the SEQRES sequence index is 1-based (first element has index 1)
        self.seqres_to_uniparc_sequence_maps = {} # PDB Chain -> PDBUniParcSequenceMap(SEQRES index -> (UniParc ID, UniParc sequence index)) where the SEQRES index and UniParc sequence index is 1-based (first element has index 1)
        self.counters = {}
        self.pdb_id = pdb_id
        self.bio_cache = bio_cache
        self.acceptable_sequence_percentage_match = acceptable_sequence_percentage_match
        self.tag_data = []
        self.cache_dir = cache_dir
        self.uniparc_sequences = {}
        self.uniparc_objects = {}
        self.pdb_chain_to_uniparc_id_map = {}
        self.region_mapping = {}
        self.region_map_coordinate_systems = {}
        self.domain_overlap_cutoff = domain_overlap_cutoff # the percentage (measured in the range [0, 1.0]) at which we consider two domains to be the same e.g. if a Pfam domain of length 60 overlaps with a SCOP domain on 54 residues then the overlap would be 54/60 = 0.9
        self.require_uniprot_residue_mapping = require_uniprot_residue_mapping
        self.xml_contents = xml_contents
        if bio_cache and pdb_id:
            self.modified_residues = bio_cache.get_pdb_object(pdb_id).modified_residues
        else:
            self.modified_residues = PDB(pdb_contents).modified_residues
        self._STACK = [] # This is used to create a simple FSA for the parsing
        self.current_residue = None
        self.residues = []
        self.reading_unobserved_property = False
        self.uniparc_ids = set()
        assert(0 <= acceptable_sequence_percentage_match <= 100)
        assert(xml_contents.find("encoding='UTF-8'") != -1)

    def get_pdb_chain_to_uniparc_id_map(self):
        if self.pdb_chain_to_uniparc_id_map:
            return self.pdb_chain_to_uniparc_id_map
        else:
            self.pdb_chain_to_uniparc_id_map = {}
            for c, mp in self.atom_to_uniparc_sequence_maps.iteritems():
                self.pdb_chain_to_uniparc_id_map[c] = self.pdb_chain_to_uniparc_id_map.get(c, set())
                for _, v, _ in mp:
                    self.pdb_chain_to_uniparc_id_map[c].add(v[0])
            for c, mp in self.seqres_to_uniparc_sequence_maps.iteritems():
                self.pdb_chain_to_uniparc_id_map[c] = self.pdb_chain_to_uniparc_id_map.get(c, set())
                for _, v, _ in mp:
                    self.pdb_chain_to_uniparc_id_map[c].add(v[0])
            for c, s in self.pdb_chain_to_uniparc_id_map.iteritems():
                self.pdb_chain_to_uniparc_id_map[c] = sorted(s)
            return self.pdb_chain_to_uniparc_id_map

    def get_uniparc_sequences(self):
        if self.uniparc_sequences:
            return self.uniparc_sequences
        else:
            self.uniparc_sequences = {}
            self.uniparc_objects = {}
            for UniParcID in self.uniparc_ids:
                entry = UniParcEntry(UniParcID, cache_dir = self.cache_dir)
                self.uniparc_sequences[entry.UniParcID] = Sequence.from_sequence(entry.UniParcID, entry.sequence)
                self.uniparc_objects[entry.UniParcID] = entry
            return self.uniparc_sequences

    @staticmethod
    def retrieve(pdb_id, cache_dir = None, acceptable_sequence_percentage_match = 70.0, require_uniprot_residue_mapping = True, bio_cache = None):
        '''Creates a PDBML object by using a cached copy of the files if they exist or by retrieving the files from the RCSB.
           bio_cache should be a klab.bio.cache.py::BioCache object and is used to avoid reading/downloading cached files repeatedly.
        '''
        pdb_contents = None
        xml_contents = None
        pdb_id = pdb_id.upper()
        l_pdb_id = pdb_id.lower()

        if len(pdb_id) != 4 or not pdb_id.isalnum():
            raise Exception("Bad PDB identifier '%s'." % pdb_id)

        if bio_cache:
            pdb_contents = bio_cache.get_pdb_contents(pdb_id)
            xml_contents = bio_cache.get_sifts_xml_contents(pdb_id)

        if cache_dir:
            if not pdb_contents:
                # Check to see whether we have a cached copy of the PDB file
                filename = os.path.join(cache_dir, "%s.pdb" % pdb_id)
                if os.path.exists(filename):
                    pdb_contents = read_file(filename)
            if not xml_contents:
                # Check to see whether we have a cached copy of the XML file
                filename = os.path.join(cache_dir, "%s.sifts.xml.gz" % l_pdb_id)
                if os.path.exists(filename):
                    xml_contents = read_file(filename)

        # Get any missing files from the RCSB and create cached copies if appropriate
        if not pdb_contents:
            pdb_contents = rcsb.retrieve_pdb(pdb_id)
            if cache_dir:
                write_file(os.path.join(cache_dir, "%s.pdb" % pdb_id), pdb_contents)
        if not xml_contents:
            try:
                xml_contents = retrieve_xml(pdb_id, silent = False)
                if cache_dir:
                    write_file(os.path.join(cache_dir, "%s.sifts.xml.gz" % l_pdb_id), xml_contents)
            except FTPException550:
                raise MissingSIFTSRecord('The file "%s.sifts.xml.gz" could not be found on the EBI FTP server.' % l_pdb_id)

        # Return the object
        handler = SIFTS(xml_contents, pdb_contents, acceptable_sequence_percentage_match = acceptable_sequence_percentage_match, cache_dir = cache_dir, require_uniprot_residue_mapping = require_uniprot_residue_mapping, bio_cache = bio_cache, pdb_id = pdb_id)
        xml.sax.parseString(xml_contents, handler)
        return handler

    def stack_push(self, lvl, data):
        if lvl == 0:
            assert(not(self._STACK))
        else:
            assert(self._STACK and (len(self._STACK) == lvl))
            for x in range(lvl):
                assert(self._STACK[x][0] == x)
        self._STACK.append((lvl, data))

    def stack_pop(self, lvl):
        num_levels = lvl + 1
        assert(self._STACK and (len(self._STACK) == num_levels))
        for x in range(num_levels):
            assert(self._STACK[x][0] == x)
        self._STACK.pop()
        if lvl == 0:
            assert(not(self._STACK))

    def check_stack(self, lvl):
        assert(self._STACK and (len(self._STACK) == lvl))
        for x in range(lvl):
            assert(self._STACK[x][0] == x)

    def start_document(self):
        '''"The SAX parser will invoke this method only once, before any other methods in this interface or in DTDHandler (except for setDocumentLocator())."'''
        pass

    a = '''
        <entity type="protein" entityId="A">
          <segment segId="1aqt_A_1_2" start="1" end="2">
            <listResidue>
            <listMapRegion>
              <mapRegion start="3" end="138">
                <db dbSource="PDB" dbCoordSys="PDBresnum" dbAccessionId="1aqt" dbChainId="A" start="3" end="138"/>
              </mapRegion>'''

    def add_region_mapping(self, attributes):
        chain_id = (self._get_current_PDBe_chain())
        mapRegion_attributes = self._STACK[3][1]
        segment_range = (int(mapRegion_attributes['start']), int(mapRegion_attributes['end']))
        dbSource = attributes['dbSource']
        dbAccessionId = attributes['dbAccessionId']
        self.region_mapping[chain_id] = self.region_mapping.get(chain_id, {})
        self.region_mapping[chain_id][dbSource] = self.region_mapping[chain_id].get(dbSource, {})
        self.region_mapping[chain_id][dbSource][dbAccessionId] = self.region_mapping[chain_id][dbSource].get(dbAccessionId, [])
        self.region_mapping[chain_id][dbSource][dbAccessionId].append(segment_range)
        # Note: I do not currently store the coordinate system type on a range level since I am assuming that each mapping uses one coordinate system
        if attributes.get('dbCoordSys'):
            self.region_map_coordinate_systems[dbSource] = self.region_map_coordinate_systems.get(dbSource, set())
            self.region_map_coordinate_systems[dbSource].add(attributes['dbCoordSys'])

    def start_element(self, name, attributes):
        self.tag_data = ''

        # Residue details and mappings
        if name == 'crossRefDb':
            self.start_crossRefDb(attributes)
        elif name == 'residueDetail':
            self.stack_push(4, None)
            self.start_residueDetail(attributes)
        elif name == 'residue':
            self.stack_push(3, None)
            assert(attributes.get('dbSource'))
            assert(attributes.get('dbCoordSys'))
            assert(attributes.get('dbResNum'))
            assert(attributes.get('dbResName'))
            assert(attributes['dbSource'] == 'PDBe')
            assert(attributes['dbCoordSys'] == 'PDBe')
            self.current_residue = SIFTSResidue(self._get_current_PDBe_chain(), attributes['dbResNum'], attributes['dbResName'])
        elif name == 'listResidue':
            self.stack_push(2, None)

        # Region mappings
        elif name == 'db':
            if len(self._STACK) == 4 and self._STACK[3][1].get('nodeType') == 'mapRegion':
                assert(attributes.get('dbSource'))
                assert(attributes.get('dbAccessionId'))
                self.add_region_mapping(attributes)
        elif name == 'mapRegion':
            assert(attributes.get('start'))
            assert(attributes.get('end'))
            self.stack_push(3, dict(start = attributes['start'], end = attributes['end'], nodeType = 'mapRegion'))
        elif name == 'listMapRegion':
            self.stack_push(2, None)

        # Entities and segments
        elif name == 'segment':
            assert(attributes.get('segId'))
            assert(attributes.get('start'))
            assert(attributes.get('end'))
            self.stack_push(1, dict(segId = attributes['segId'], start = attributes['start'], end = attributes['end']))
        elif name == 'entity':
            assert(attributes.get('type'))
            entityId = None
            if attributes['type'] == 'protein':
                entityId = attributes.get('entityId')
            self.stack_push(0, entityId)
        elif name == 'entry':
            self.counters['entry'] = self.counters.get('entry', 0) + 1
            self.parse_header(attributes)

    def parse_header(self, attributes):
        if attributes.get('dbAccessionId'):
            pdb_id = attributes.get('dbAccessionId').upper()
            if self.pdb_id:
                assert(self.pdb_id.upper() == pdb_id)
            self.pdb_id = pdb_id
        else:
            raise Exception('Could not verify the PDB ID from the <entry> tag.')

    def start_residueDetail(self, attributes):
        self.check_stack(5)
        self.reading_unobserved_property = False
        dbSource = attributes.get('dbSource')
        assert(dbSource)
        if dbSource == 'PDBe':
            residue_detail_property = attributes.get('property')
            if residue_detail_property and residue_detail_property == 'Annotation':
                self.reading_unobserved_property = True

    def start_crossRefDb(self, attributes):
        self.check_stack(4)
        dbSource = attributes.get('dbSource')
        assert(dbSource)
        if dbSource == 'PDB' or dbSource == 'UniProt':
            current_residue = self.current_residue
            dbCoordSys = attributes.get('dbCoordSys')
            dbAccessionId = attributes.get('dbAccessionId')
            dbResNum = attributes.get('dbResNum')
            dbResName = attributes.get('dbResName')
            if dbSource == 'PDB':
                dbChainId = attributes.get('dbChainId')
                assert(dbCoordSys == "PDBresnum")
                assert(dbAccessionId.upper() == self.pdb_id.upper())
                #assert(dbChainId == self._STACK[0][1]) # this is not always true e.g. 1lmb has entityId="C" but dbChainId="3"
                if not dbChainId == self._STACK[0][1]:
                    # use the dbChainId chain ID since that is what is used in the logic later on. Note: this may introduce bugs if the dbChainIds differ amongst themselves
                    self._STACK[0] = (0, dbChainId)
                assert(dbCoordSys and dbAccessionId and dbResNum and dbResName and dbChainId)
                current_residue.add_pdb_residue(dbChainId, dbResNum, dbResName)
            elif dbSource == 'UniProt':
                assert(dbCoordSys and dbAccessionId and dbResNum and dbResName)
                assert(dbCoordSys == "UniProt")
                current_residue.add_uniprot_residue(dbAccessionId, dbResNum, dbResName)

    def _get_current_PDBe_chain(self):
        return self._STACK[0][1]

    def _get_current_segment_range(self):
        return (self._STACK[1][1]['start'], self._STACK[1][1]['end'])

    def end_element(self, name):
        tag_content = self.tag_data

        # Residue details and mappings
        if name == 'residueDetail':
            self.stack_pop(4)
            if self.reading_unobserved_property and (tag_content == 'Not_Observed'):
                self.current_residue.WasNotObserved = True
            self.reading_unobserved_property = False
        elif name == 'residue':
            self.stack_pop(3)
            current_residue = self.current_residue
            #assert(self._get_current_PDBe_chain() == current_residue.PDBChainID) # this is not always true e.g. 1lmb has entityId="C" but dbChainId="3"
            self.residues.append(current_residue)
            self.current_residue = None
        elif name == 'listResidue':
            self.stack_pop(2)

        # Region mappings
        elif name == 'mapRegion':
            self.stack_pop(3)
        elif name == 'listMapRegion':
            self.stack_pop(2)

        # Entities and segments
        elif name == 'segment':
            self.stack_pop(1)
        elif name == 'entity':
            self.stack_pop(0)

    def end_document(self):
        assert(self.counters['entry'] == 1)

        residue_count = 0
        residues_matched = {}
        residues_encountered = set()
        atom_to_uniparc_residue_map = {}
        atom_to_seqres_residue_map = {}
        seqres_to_uniparc_residue_map = {}

        UniProtACs = set()
        for r in self.residues:
            if r.UniProtAC:
                UniProtACs.add(r.UniProtAC)

        ACC_to_UPARC_mapping = uniprot_map('ACC', 'UPARC', list(UniProtACs), cache_dir = self.cache_dir)
        assert(sorted(ACC_to_UPARC_mapping.keys()) == sorted(list(UniProtACs)))
        for k, v in ACC_to_UPARC_mapping.iteritems():
            assert(len(v) == 1)
            ACC_to_UPARC_mapping[k] = v[0]

        map_chains = set()
        for r in self.residues:
            if not(r.PDBResidueID.isalnum() and int(r.PDBResidueID.isalnum()) < 0):
                # These are not valid PDB residue IDs - the SIFTS XML convention sometimes assigns negative residue IDs to unobserved residues before the first ATOM record
                # (only if the first residue ID is 1?)
                pass

            # Store the PDB->UniProt mapping
            if r.has_pdb_to_uniprot_mapping():
                UniProtAC = r.UniProtAC
                UniParcID = ACC_to_UPARC_mapping[UniProtAC]
                self.uniparc_ids.add(UniParcID)

            full_pdb_residue_ID = r.get_pdb_residue_id()
            PDBChainID = r.PDBChainID
            map_chains.add(PDBChainID)
            residues_matched[PDBChainID] = residues_matched.get(PDBChainID, 0)

            if not r.WasNotObserved:
                # Do not add ATOM mappings when the ATOM data does not exist
                if r.has_pdb_to_uniprot_mapping():
                    atom_to_uniparc_residue_map[PDBChainID] = atom_to_uniparc_residue_map.get(PDBChainID, {})
                    atom_to_uniparc_residue_map[PDBChainID][full_pdb_residue_ID] = (UniParcID, r.UniProtResidueIndex)
                atom_to_seqres_residue_map[PDBChainID] = atom_to_seqres_residue_map.get(PDBChainID, {})
                atom_to_seqres_residue_map[PDBChainID][full_pdb_residue_ID] = r.PDBeResidueID

            if r.has_pdb_to_uniprot_mapping():
                seqres_to_uniparc_residue_map[PDBChainID] = seqres_to_uniparc_residue_map.get(PDBChainID, {})
                seqres_to_uniparc_residue_map[PDBChainID][r.PDBeResidueID] = (UniParcID, r.UniProtResidueIndex)

            # Make sure we only have at most one match per PDB residue
            assert(full_pdb_residue_ID not in residues_encountered)
            residues_encountered.add(full_pdb_residue_ID)

            # Count the number of exact sequence matches
            PDBResidue3AA = r.PDBResidue3AA
            pdb_residue_type = residue_type_3to1_map.get(PDBResidue3AA) or self.modified_residues.get(PDBResidue3AA) or protonated_residue_type_3to1_map.get(PDBResidue3AA) or non_canonical_amino_acids.get(PDBResidue3AA)
            if r.has_pdb_to_uniprot_mapping():
                if pdb_residue_type == r.UniProtResidue1AA:
                    residues_matched[PDBChainID] += 1
            residue_count += 1

        # Create the SequenceMaps
        for c in map_chains:
            if residues_matched[c] > 0:
                # 1IR3 has chains A,
                # Chain A has mappings from atom and seqres (PDBe) residues to UniParc as usual
                # Chain B (18 residues long) has mappings from atom to seqres residues but not to UniParc residues
                self.atom_to_uniparc_sequence_maps[c] = PDBUniParcSequenceMap.from_dict(atom_to_uniparc_residue_map[c])
                self.seqres_to_uniparc_sequence_maps[c] = PDBUniParcSequenceMap.from_dict(seqres_to_uniparc_residue_map[c])
            self.atom_to_seqres_sequence_maps[c] = SequenceMap.from_dict(atom_to_seqres_residue_map[c])

        # Check the match percentage
        total_residues_matched = sum([residues_matched[c] for c in residues_matched.keys()])
        if total_residues_matched == 0:
            if self.pdb_id and self.pdb_id in NoSIFTSPDBUniParcMappingCases:
                if self.require_uniprot_residue_mapping:
                    raise NoSIFTSPDBUniParcMapping('The PDB file %s has a bad or missing SIFTS mapping at the time of writing.' % self.pdb_id)
                else:
                    colortext.error('Warning: The PDB file %s has a bad or missing SIFTS mapping at the time of writing so there is no PDB -> UniProt residue mapping.' % self.pdb_id)
            else:
                if self.require_uniprot_residue_mapping:
                    raise Exception('No residue information matching PDB residues to UniProt residues was found.')
                else:
                    colortext.error('Warning: No residue information matching PDB residues to UniProt residues was found.')
        else:
            percentage_matched = float(total_residues_matched)*100.0/float(residue_count)
            if percentage_matched < self.acceptable_sequence_percentage_match:
                if self.pdb_id and self.pdb_id in BadSIFTSMappingCases:
                    raise BadSIFTSMapping('The PDB file %s has a known bad SIFTS mapping at the time of writing.' % self.pdb_id)
                else:
                    raise Exception('Expected %.2f%% sequence match on matched residues but the SIFTS results only gave us %.2f%%.' % (self.acceptable_sequence_percentage_match, percentage_matched))

        # Merge the ranges for the region mappings i.e. so [1-3],[3-86] becomes [1-86]
        region_mapping = self.region_mapping
        for chain_id, chain_details in region_mapping.iteritems():
            for dbSource, source_details in chain_details.iteritems():
                for dbAccessionId, range_list in source_details.iteritems():
                    source_details[dbAccessionId] = merge_range_pairs(range_list)

        # Check to see if the expected numbering schemes hold
        for k, v in expected_residue_numbering_schemes.iteritems():
            if self.region_map_coordinate_systems.get(k):
                assert(self.region_map_coordinate_systems[k] == set([v]))

        pfam_scop_mapping = {}
        scop_pfam_mapping = {}
        for chain_id, chain_details in region_mapping.iteritems():
            if chain_details.get('Pfam') and chain_details.get('SCOP'):
                for pfamAccessionId, pfam_range_lists in chain_details['Pfam'].iteritems():
                    pfam_residues = parse_range(','.join(['%d-%d' % (r[0], r[1]) for r in pfam_range_lists]))
                    for scopAccessionId, scop_range_lists in chain_details['SCOP'].iteritems():
                        scop_residues = parse_range(','.join(['%d-%d' % (r[0], r[1]) for r in scop_range_lists]))
                        num_same_residues = len(set(pfam_residues).intersection(set(scop_residues)))
                        if num_same_residues > 10:
                            Pfam_match_quality = float(num_same_residues) / float(len(pfam_residues))
                            SCOP_match_quality = float(num_same_residues) / float(len(scop_residues))
                            if (Pfam_match_quality >= self.domain_overlap_cutoff) or (SCOP_match_quality >= self.domain_overlap_cutoff):
                                pfam_scop_mapping[pfamAccessionId] = pfam_scop_mapping.get(pfamAccessionId, DomainMatch(pfamAccessionId, 'Pfam'))
                                pfam_scop_mapping[pfamAccessionId].add(scopAccessionId, 'SCOP', SCOP_match_quality)
                                scop_pfam_mapping[scopAccessionId] = scop_pfam_mapping.get(scopAccessionId, DomainMatch(scopAccessionId, 'SCOP'))
                                scop_pfam_mapping[scopAccessionId].add(pfamAccessionId, 'Pfam', Pfam_match_quality)

        self.pfam_scop_mapping = pfam_scop_mapping
        self.scop_pfam_mapping = scop_pfam_mapping

        self._validate()

    def _validate(self):
        '''Tests that the maps agree through composition.'''
        # I used to use the assertion "self.atom_to_uniparc_sequence_maps.keys() == self.atom_to_seqres_sequence_maps.keys() == self.seqres_to_uniparc_sequence_maps.keys()"
        # but that failed for 2IMM where "self.atom_to_uniparc_sequence_maps.keys() == self.seqres_to_uniparc_sequence_maps.keys() == []" but THAT fails for 1IR3 so I removed
        # the assertions entirely.
        for c, m in self.atom_to_seqres_sequence_maps.iteritems():
            if self.seqres_to_uniparc_sequence_maps.keys():
                atom_uniparc_keys = set(self.atom_to_uniparc_sequence_maps.get(c, {}).keys())
                atom_seqres_keys = set(self.atom_to_seqres_sequence_maps.get(c, {}).keys())
                assert(atom_uniparc_keys.intersection(atom_seqres_keys) == atom_uniparc_keys)
                for k, v in m.map.iteritems():
                    uparc_id_1, uparc_id_2 = None, None
                    try:
                        uparc_id_1 = self.seqres_to_uniparc_sequence_maps[c].map[v]
                        uparc_id_2 = self.atom_to_uniparc_sequence_maps[c].map[k]
                    except:
                        continue
                    assert(uparc_id_1 == uparc_id_2)

    def characters(self, chrs):
        self.tag_data += chrs

    startDocument = start_document
    endDocument = end_document
    startElement = start_element
    endElement = end_element
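# A minimal usage sketch for the SIFTS class above, assuming a writable cache directory
# (the path here is hypothetical) and that the EBI FTP server has a SIFTS record for the entry.
sifts_map = SIFTS.retrieve('1a2c', cache_dir = '/tmp/sifts_cache')
print(sifts_map.get_pdb_chain_to_uniparc_id_map())
for chain_id, sequence_map in sifts_map.atom_to_seqres_sequence_maps.iteritems():
    print(chain_id, sequence_map)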
def extract_analysis_data(dataset_list_file, output_directory, data_extraction_method, expectn, top_x, prefix, test_mode = False):
    '''This is the main function in this script and is where the basic analysis is compiled.

       output_directory should contain the results of the prediction run.
       data_extraction_method should be a function pointer to the method-specific function used to retrieve the prediction results e.g. get_kic_run_details
       expectn specifies how many predictions we expect to find (useful in case some jobs failed).
       top_x specifies how many of the best-scoring predictions should be used to generate the TopX metric results e.g. the Top5 RMSD metric value measures the lowest RMSD amongst the five best-scoring structures.
       prefix is used to name the output files.
    '''

    # Sanity check
    assert(top_x <= expectn)

    # Set up reference structures
    structures_folder = os.path.join('..', 'input', 'structures', '12_res')
    rcsb_references = os.path.join(structures_folder, 'rcsb', 'reference')
    rosetta_references = os.path.join(structures_folder, 'rosetta', 'reference')

    # Set up the per-case statistics dicts
    best_scoring_structures = {}
    median_scoring_structures = {}
    worst_scoring_structures = {}
    total_percent_subanstrom = {}
    top_x_percent_subanstrom = {}
    top_x_loop_prediction_sets = {}

    # Set up the input file used to generate the graph plotting the "percentage of subangstrom models" metric over
    # varying values of X used to select the TopX structures
    percentage_subangstrom_over_top_X_plot_input = ['PDB\tX\tPercentage of subangstrom cases for TopX']
    percent_subangrom_by_top_x = {}

    # Set up the summary analysis file
    csv_file = ['\t'.join(['PDB ID', 'Models', '%<1.0A', 'Top{0} %<1.0A'.format(top_x), 'Best score', 'Top{0} score'.format(top_x), 'Median score', 'Worst score', 'Closest score', 'Top1 RMSD', 'Top{0} RMSD'.format(top_x), 'Closest RMSD'])]

    # Read in the benchmark input
    pdb_ids = [os.path.splitext(os.path.split(s.strip())[1])[0] for s in get_file_lines(dataset_list_file) if s.strip()]

    # Truncate the benchmark input for test mode
    if test_mode:
        pdb_ids = pdb_ids[:10]

    # Analyze the performance for each case in the benchmark
    for pdb_id in pdb_ids:

        rcsb_reference_pdb = os.path.join(rcsb_references, pdb_id + '.pdb')
        assert(os.path.exists(rcsb_reference_pdb))
        rosetta_reference_pdb = os.path.join(rosetta_references, pdb_id + '.pdb')
        assert(os.path.exists(rosetta_reference_pdb))
        assert(len(pdb_id) == 4)
        loops_file = os.path.join(structures_folder, 'rosetta', 'pruned', '{0}.loop.json'.format(pdb_id))
        loop_sets = json.loads(read_file(loops_file))
        assert(len(loop_sets['LoopSet']) == 1)

        # Create a container for loop predictions
        loop_prediction_set = LoopPredictionSet()

        # Read the coordinates from the reference PDB files
        rcsb_reference_matrix = PDB.extract_xyz_matrix_from_loop_json(PDB.from_filepath(rcsb_reference_pdb).structure_lines, loop_sets, atoms_of_interest = backbone_atoms, expected_num_residues = 12, expected_num_residue_atoms = 4)
        rosetta_reference_matrix = PDB.extract_xyz_matrix_from_loop_json(PDB.from_filepath(rosetta_reference_pdb).structure_lines, loop_sets, atoms_of_interest = backbone_atoms, expected_num_residues = 12, expected_num_residue_atoms = 4)

        colortext.wgreen('\n\nReading in the run details for {0}:'.format(pdb_id))
        details = data_extraction_method(output_directory, pdb_id, loop_sets, test_mode = test_mode)
        for d in details:
            loop_prediction = loop_prediction_set.add(d['id'], d['score'], pdb_id = pdb_id, rmsd = None, pdb_path = d['predicted_structure'], pdb_loop_residue_matrix = d['pdb_loop_residue_matrix'])
        print(' Done')

        # Compute the RMSD for this case for the structure using the pandas dataframe
        # It is more efficient to do this after truncation if truncating by score but in the general case users will
        # probably want to consider all predictions. If not (e.g. for testing) then arbitrary subsets can be chosen
        # in the loop above
        colortext.wgreen('Computing RMSDs for {0}:'.format(pdb_id))
        loop_prediction_set.compute_rmsds(rcsb_reference_matrix)
        loop_prediction_set.check_rmsds(rosetta_reference_matrix)
        print(' Done\n')

        # Truncate the structures to the top expectn-scoring files
        loop_prediction_set.sort_by_score()
        loop_prediction_set.truncate(expectn)
        if len(loop_prediction_set) != expectn:
            print('Error: Expected {0} structures but only found {1}.'.format(expectn, len(loop_prediction_set)))
            sys.exit(1)

        # Create a new set containing the top-X-scoring structures and identify the median-scoring structure
        top_x_loop_prediction_sets[pdb_id] = loop_prediction_set[:top_x]
        median_scoring_structures[pdb_id] = loop_prediction_set[int(expectn / 2)]

        # Determine the lowest-/best-scoring structure
        best_scoring_structures[pdb_id] = loop_prediction_set[0]
        best_score = best_scoring_structures[pdb_id].score
        worst_scoring_structures[pdb_id] = loop_prediction_set[-1]
        worst_score = worst_scoring_structures[pdb_id].score
        assert(top_x_loop_prediction_sets[pdb_id][0] == best_scoring_structures[pdb_id])

        # Print structures
        colortext.warning('Top{0} structures'.format(top_x))
        print(top_x_loop_prediction_sets[pdb_id])
        colortext.warning('Top1 structure')
        print(best_scoring_structures[pdb_id])
        colortext.warning('Median (by score) structure')
        print(median_scoring_structures[pdb_id])
        colortext.warning('Lowest-scoring structures')
        print(worst_scoring_structures[pdb_id])

        # Create values for TopX variable plot
        loop_prediction_set.sort_by_score()
        for top_x_var in range(1, len(loop_prediction_set) + 1):
            new_subset = loop_prediction_set[:top_x_var]
            percent_subangstrom = 100 * new_subset.fraction_with_rmsd_lt(1.0)
            percentage_subangstrom_over_top_X_plot_input.append('{0}\t{1}\t{2}'.format(pdb_id, top_x_var, percent_subangstrom))
            percent_subangrom_by_top_x[top_x_var] = percent_subangrom_by_top_x.get(top_x_var, {})
            percent_subangrom_by_top_x[top_x_var][pdb_id] = percent_subangstrom

        total_percent_subanstrom[pdb_id] = 100 * loop_prediction_set.fraction_with_rmsd_lt(1.0)
        top_x_percent_subanstrom[pdb_id] = 100 * top_x_loop_prediction_sets[pdb_id].fraction_with_rmsd_lt(1.0)
        colortext.warning('Percentage of sub-angstrom cases in the full set of {0}: {1}'.format(expectn, total_percent_subanstrom[pdb_id]))
        colortext.warning('Percentage of sub-angstrom cases in the Top{0} structures: {1}'.format(top_x, top_x_percent_subanstrom[pdb_id]))

        loop_prediction_set.sort_by_rmsd()
        closest_rmsd = loop_prediction_set[0].rmsd
        closest_score = loop_prediction_set[0].score
        colortext.warning('RMSD of closest model: {0}'.format(closest_rmsd))
        colortext.warning('Score of closest model: {0}'.format(closest_score))

        top_1_rmsd = best_scoring_structures[pdb_id].rmsd
        top_x_rmsd = best_scoring_structures[pdb_id].rmsd
        top_x_score = best_scoring_structures[pdb_id].score
        for s in top_x_loop_prediction_sets[pdb_id]:
            if (s.rmsd < top_x_rmsd) or (s.rmsd == top_x_rmsd and s.score < top_x_score):
                top_x_rmsd = s.rmsd
                top_x_score = s.score
        assert(top_x_score <= worst_score)
        assert(top_x_rmsd <= top_1_rmsd)

        print('Top 1 RMSD (predicted vs Rosetta/RCSB reference structure): {0}'.format(top_1_rmsd))
        print('Top {0} RMSD (predicted vs Rosetta/RCSB reference structure): {1}'.format(top_x, top_x_rmsd))

        csv_file.append('\t'.join(map(str, [pdb_id, expectn, total_percent_subanstrom[pdb_id], top_x_percent_subanstrom[pdb_id], best_score, top_x_score, median_scoring_structures[pdb_id].score, worst_score, closest_score, top_1_rmsd, top_x_rmsd, closest_rmsd])))

    # Add a row of median percent subangstrom values for each TopX cutoff
    for top_x_var, values_by_pdb in sorted(percent_subangrom_by_top_x.iteritems()):
        assert(sorted(values_by_pdb.keys()) == sorted(pdb_ids))
        median_value = sorted(values_by_pdb.values())[len(pdb_ids) / 2]
        percentage_subangstrom_over_top_X_plot_input.append('Median\t{0}\t{1}'.format(top_x_var, median_value))

    write_file('{0}analysis.csv'.format(prefix), '\n'.join(csv_file))
    write_file('{0}analysis.tsv'.format(prefix), '\n'.join(csv_file))
    write_file('{0}percentage_subangstrom_over_top_X.tsv'.format(prefix), '\n'.join(percentage_subangstrom_over_top_X_plot_input))
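# Hedged usage sketch for extract_analysis_data(), wiring in get_kic_run_details as the
# data_extraction_method (as suggested by its docstring). The dataset list, output path,
# expectn and prefix values are hypothetical.
extract_analysis_data('../input/dataset.txt', 'output/kic_runs', get_kic_run_details,
                      expectn = 500, top_x = 5, prefix = 'kic_', test_mode = True)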
def setup_jobs(outpath, options, input_files):
    ''' This function sets up the jobs by creating the necessary input files as expected.
          - outpath is where the output is to be stored.
          - options is the optparse options object.
          - input_files is a list of paths to input files.
    '''

    job_inputs = None
    reverse_mapping = None
    fasta_file_contents = {}

    # Generate FASTA files for PDB inputs
    # fasta_file_contents is a mapping from a file path to a pair (FASTA contents, file type). We remember the file type
    # since we offset residue IDs depending on file type i.e. for FASTA files, we treat each sequence separately and do
    # not renumber the fragments in postprocessing. For PDB files, however, we need to respect the order and length of
    # sequences so that we renumber the fragments appropriately in postprocessing - we assume that if a PDB file is passed in
    # then all chains (protein, RNA, or DNA) will be used in a Rosetta run.
    for input_file in input_files:
        assert(not(fasta_file_contents.get(input_file)))
        if any(fnmatch(input_file, x) for x in pdb_file_wildcards):
            pdb = PDB.from_filepath(input_file, strict = True)
            pdb.pdb_id = os.path.basename(input_file).split('.')[0]
            if pdb.pdb_id.startswith('pdb') and len(pdb.pdb_id) >= 7:
                # Hack to rename FASTA identifiers for pdb*.ent files which are present in mirrors of the PDB
                pdb.pdb_id = pdb.pdb_id.replace('pdb', '')
            fasta_file_contents[input_file] = (pdb.create_fasta(prefer_seqres_order = False), 'PDB')
        else:
            fasta_file_contents[input_file] = (read_file(input_file), 'FASTA')

    # Extract sequences from the input FASTA files.
    found_sequences, reverse_mapping, errors = get_sequences(options, fasta_file_contents)
    if found_sequences:
        reformat(found_sequences)
    if errors:
        return None, False, errors

    # Discard sequences that are the wrong chain.
    desired_sequences = {}
    for key, sequence in found_sequences.iteritems():
        pdb_id, chain, file_name = key
        if options.chain is None or chain == options.chain:
            desired_sequences[key] = sequence

    # Create the input FASTA and script files.
    job_inputs, errors = create_inputs(options, outpath, desired_sequences)

    # Create the reverse mapping file
    if reverse_mapping:
        segment_mapping_file = os.path.join(outpath, "segment_map.json")
        colorprinter.message("Creating a reverse mapping file %s." % segment_mapping_file)
        write_file(segment_mapping_file, json.dumps(reverse_mapping))

    # Create the post-processing script file
    post_processing_script = read_file(os.path.join(os.path.split(os.path.realpath(__file__))[0], 'post_processing.py'))
    write_file(os.path.join(outpath, 'post_processing.py'), post_processing_script, 'w')

    # Create the secondary structure filter file
    if options.secondary_structure_file:
        write_file(os.path.join(outpath, 'ss_filter.json'), json.dumps({'secondary_structure_filter' : SecondaryStructureDefinition.from_filepath(options.secondary_structure_file).data}), 'w')

    return job_inputs, reverse_mapping != None, errors
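# Hedged sketch of calling setup_jobs() with a minimal optparse-style options object.
# The real fragment-generation script defines many more options; only 'chain' and
# 'secondary_structure_file' are shown here, and the paths/values are hypothetical.
from optparse import Values
options = Values(dict(chain = 'A', secondary_structure_file = None))
job_inputs, has_segment_map, errors = setup_jobs('fragment_job_output', options, ['input/1a2c.pdb'])
if errors:
    print(errors)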
from klab.bio.basics import ChainMutation
from klab.fs.fsio import read_file, write_temp_file, open_temp_file, write_file
from klab.bio.pfam import Pfam
from klab.bio.dssp import MonomerDSSP, ComplexDSSP, MissingAtomException
from klab.bio.ligand import Ligand, PDBLigand
from klab.bio.pdbtm import PDBTM
from klab.db.sqlalchemy_interface import get_single_record_from_query, get_or_create_in_transaction

from kddg.api.schema import test_schema_against_database_instance
from kddg.api.schema import PDBFile, PDBChain, PDBMolecule, PDBMoleculeChain, PDBResidue, LigandDescriptor, LigandIdentifier, LigandSynonym, PDBLigand
from kddg.api.schema import Ligand as DBLigand
#from kddg.api.schema import Publication, PublicationAuthor, PublicationIdentifier
from kddg.api.layers import *
from kddg.api.db import ddG, PartialDataException, SanityCheckException
import kddg.api.dbi as dbi

rosetta_scripts_path = '/home/oconchus/t14benchmarking/r57934/main/source/bin/rosetta_scripts.linuxgccrelease'
rosetta_database_path = '/home/oconchus/t14benchmarking/r57934/main/database'

p = PDB(read_file('/kortemmelab/data/kyleb/ddg_numbering_for_shane/24548-data/1CBW_FGHI.pdb'))
#p.construct_pdb_to_rosetta_residue_map(rosetta_scripts_path, rosetta_database_path)
p.construct_pdb_to_rosetta_residue_map(rosetta_scripts_path, rosetta_database_path, extra_command_flags = '-ignore_zero_occupancy false -ignore_unrecognized_res')
pprint.pprint(p.get_atom_sequence_to_rosetta_map())
pprint.pprint(p.rosetta_sequences)

from kddg.api.ppi import get_interface as get_ppi_interface
ppi_api = get_ppi_interface(read_file('../misc/ddgdb.pw'),
                            rosetta_scripts_path = '/home/oconchus/t14benchmarking/r57934/main/source/bin/rosetta_scripts.linuxgccrelease',
                            rosetta_database_path = '/home/oconchus/t14benchmarking/r57934/main/database')

content = ppi_api.DDG_db.execute_select('SELECT Content FROM PDBFile WHERE ID="1CBW"')[0]['Content']
print(content)
write_file('/tmp/ddginterface/1CBW_FGHI_db.pdb', content)
# align_two_simple_sequences(fasta_sequence, uniparc_sequence, sequence1name = '%s:%s|PDBID|CHAIN|SEQUENCE' % (pdb_id, c), sequence2name = uniparc_id)
# sanity check - see if uniprotAC in pdb is in the list of the matched uniprot id
print(chains)
sys.exit(0)

px = PDBML.retrieve('1A2C', cache_dir='/home/oconchus/temp')
for k, v in sorted(px.atom_to_seqres_sequence_maps.iteritems(), key=lambda x: (x[0], x[1])):
    print(k, v)

p = PDB.from_filepath('../.testdata/1H38.pdb') # has protein, DNA, RNA
p = PDB.from_filepath('../.testdata/1ZC8.pdb')
p = PDB.from_filepath('../.testdata/4IHY.pdb')
#p = PDB('../.testdata/2GRB.pdb')
p = PDB.from_filepath('../.testdata/1J1M.pdb')
p = PDB.from_filepath('../.testdata/1H38.pdb')
p = PDB.from_filepath('../.testdata/1A2C.pdb')
#print(p.structure_lines)

colortext.message("Resolution")
print(p.get_resolution())
colortext.message("Techniques")
print(p.get_techniques())