def mk_native_files(self, native_pdb, native_mtz):
    """Write the SHELXE input files derived from the native structure

    Two files are placed in ``self.work_dir``, both named after
    ``self.stem``: a ``.hkl`` file converted from the native MTZ and a
    ``.ent`` file that is a plain copy of the native PDB.

    Parameters
    ----------
    native_pdb : str
       Path to the native PDB file
    native_mtz : str
       Path to the native MTZ file

    """
    # Reflection data: <work_dir>/<stem>.hkl
    hkl_path = os.path.join(self.work_dir, self.stem + ".hkl")
    mtz_util.to_hkl(native_mtz, hkl_file=hkl_path)

    # Native coordinates: <work_dir>/<stem>.ent
    ent_path = os.path.join(self.work_dir, self.stem + ".ent")
    shutil.copyfile(native_pdb, ent_path)
def analyse(amoptd, newroot=None):
    """Collate the benchmark data of a run and save it as ``results.csv``

    Every entry of ``amoptd['mrbump_results']`` is merged with the matching
    entry of ``amoptd['ensembles_data']`` and, when a native PDB is
    available, with the native structure information and the output of
    ``analyseSolution``. The merged records are written to ``results.csv``
    in the benchmark directory and stored in ``amoptd['benchmark_results']``.

    Side effects: creates the benchmark directory if required, changes the
    current working directory to it, writes the SHELXE ``.hkl``/``.ent``
    files when ``amoptd['native_pdb_std']`` is set and, when `newroot` is
    given, rebinds the module-level ``_oldroot`` and ``_newroot``.

    Parameters
    ----------
    amoptd : dict
       Options/results mapping, read by string key and updated in place
    newroot : str, optional
       Existing directory; when given ``_newroot`` is set to it and
       ``_oldroot`` to ``amoptd['work_dir']`` before any path is passed
       through ``fixpath``

    Returns
    -------
    None
       Also returns early (after logging a critical message) when there is
       no ensemble data or there are no MRBUMP results

    Raises
    ------
    RuntimeError
       If TM-scores are calculated with maxcluster and no entry of
       ``amoptd['models']`` matches a subcluster centroid model

    """
    global _oldroot, _newroot
    if newroot:
        assert os.path.isdir(newroot)
        _newroot = newroot
        _oldroot = amoptd['work_dir']

    if not os.path.isdir(fixpath(amoptd['benchmark_dir'])):
        os.mkdir(fixpath(amoptd['benchmark_dir']))
    os.chdir(fixpath(amoptd['benchmark_dir']))

    # AnalysePdb may have already been called from the main script
    if amoptd['native_pdb'] and 'native_pdb_std' not in amoptd:
        analysePdb(amoptd)

    if amoptd['native_pdb_std']:
        # Generate an SHELXE HKL and ENT file so that we can calculate phase errors
        # NOTE(review): these paths are not passed through fixpath, unlike the
        # directory created above - confirm this is intended when newroot is set
        mtz_util.to_hkl(amoptd['mtz'], hkl_file=os.path.join(amoptd['benchmark_dir'], SHELXE_STEM + ".hkl"))
        shutil.copyfile(amoptd['native_pdb_std'], os.path.join(amoptd['benchmark_dir'], SHELXE_STEM + ".ent"))

    # The models are only analysed when none of these modes is active
    if amoptd['native_pdb'] and \
       not (amoptd['homologs'] or amoptd['ideal_helices']
            or amoptd['import_ensembles'] or amoptd['single_model_mode']):
        analyseModels(amoptd)

    # Get the ensembling data
    if 'ensembles_data' not in amoptd or not len(amoptd['ensembles_data']):
        logger.critical("Benchmark cannot find any ensemble data!")
        return

    # Get dict of ensemble name -> ensemble result
    ensemble_results = {e['name']: e for e in amoptd['ensembles_data']}

    # Get mrbump_results for cluster
    if 'mrbump_results' not in amoptd or not len(amoptd['mrbump_results']):
        logger.critical("Benchmark cannot find any mrbump results!")
        return

    # Keys copied from amoptd into every record when a native PDB is available
    native_keys = [
        'native_pdb_code', 'native_pdb_title', 'native_pdb_resolution',
        'native_pdb_solvent_content', 'native_pdb_space_group',
        'native_pdb_num_chains', 'native_pdb_num_atoms',
        'native_pdb_num_residues',
    ]

    data = []
    mrinfo = shelxe.MRinfo(amoptd['shelxe_exe'], amoptd['native_pdb_info'].pdb, amoptd['mtz'])
    for result in amoptd['mrbump_results']:
        # use mrbump dict as basis for result object
        d = copy.copy(result)

        # Add in the data from the ensemble
        d.update(ensemble_results[d['ensemble_name']])
        assert d['ensemble_name'] == d['name'], d

        # Hack for old results
        if 'truncation_num_residues' in d:
            d['num_residues'] = d['truncation_num_residues']
            del d['truncation_num_residues']

        # Hack for ideal helices where num_residues are missing
        # NOTE: str.lstrip removes a *set of characters*, not a prefix; the
        # digits that follow are kept because they are not in that set
        if amoptd['ideal_helices'] and ('num_residues' not in d or d['num_residues'] is None):
            d['num_residues'] = int(d['ensemble_name'].lstrip('polyala'))

        # Get the ensemble data and add to the MRBUMP data
        d['ensemble_percent_model'] = int((float(d['num_residues']) / float(amoptd['fasta_length'])) * 100)

        if amoptd['native_pdb']:
            # Add in stuff we've cleaned from the pdb
            d.update({key: amoptd[key] for key in native_keys})
            # Analyse the solution
            analyseSolution(amoptd, d, mrinfo)
        data.append(d)

    # Put everything in a pandas DataFrame
    dframe = pd.DataFrame(data)

    # General stuff
    dframe['ample_version'] = amoptd['ample_version']
    dframe['fasta_length'] = amoptd['fasta_length']

    # Analyse subcluster centroid models
    if 'subcluster_centroid_model' in dframe.columns and amoptd['native_pdb']:
        centroid_index = dframe.index
        centroid_models = [fixpath(f) for f in dframe.subcluster_centroid_model]
        native_pdb_std = fixpath(amoptd['native_pdb_std'])
        fasta = fixpath(amoptd['fasta'])

        # Calculation of TMscores for subcluster centroid models
        if amoptd['have_tmscore']:
            tm = tm_util.TMscore(amoptd['tmscore_exe'], wdir=fixpath(amoptd['benchmark_dir']), **amoptd)
            tm_results = tm.compare_structures(centroid_models, [native_pdb_std], [fasta])
            centroid_tmscores = [r['tmscore'] for r in tm_results]
            centroid_rmsds = [r['rmsd'] for r in tm_results]
        else:
            # Use maxcluster
            centroid_tmscores = []
            for centroid_model in centroid_models:
                # Find the first model whose file stem is a prefix of the
                # centroid model's file stem
                n = os.path.splitext(os.path.basename(centroid_model))[0]
                cm = None
                for pdb in amoptd['models']:
                    if n.startswith(os.path.splitext(os.path.basename(pdb))[0]):
                        cm = pdb
                        break
                if not cm:
                    # Report the centroid model that failed. The previous
                    # lookup, dframe[0, 'subcluster_centroid_model'], is a
                    # tuple-key column access that raises KeyError and so
                    # masked this RuntimeError.
                    msg = "Cannot find model for subcluster_centroid_model {0}".format(centroid_model)
                    raise RuntimeError(msg)
                centroid_tmscores.append(_MAXCLUSTERER.tm(cm))
            # There is an issue here as the maxcluster RMSD is the RMSD over the
            # TM-aligned residues NOT the global RMSD, which it is for the
            # tm_util results, so no RMSD is recorded on this path
            centroid_rmsds = [None for _ in centroid_index]

        dframe['subcluster_centroid_model_TM'] = pd.Series(centroid_tmscores, index=centroid_index)
        dframe['subcluster_centroid_model_RMSD'] = pd.Series(centroid_rmsds, index=centroid_index)

    # Save the data
    file_name = os.path.join(fixpath(amoptd['benchmark_dir']), 'results.csv')
    dframe.to_csv(file_name, columns=_CSV_KEYLIST, index=False, na_rep="N/A")
    amoptd['benchmark_results'] = dframe.to_dict('records')

    return
def analyse(amoptd, newroot=None):
    """Gather the per-result benchmark data, tabulate it and save it as CSV

    One record is built for each entry of ``amoptd['mrbump_results']`` by
    merging it with the ensemble entry of the same name and, when a native
    PDB is available, with the native structure keys and whatever
    ``analyseSolution`` adds. The table is written to ``results.csv`` in the
    benchmark directory and stored under ``amoptd['benchmark_results']``.

    The benchmark directory is created if missing and becomes the current
    working directory.

    Parameters
    ----------
    amoptd : dict
       Options/results mapping, read by string key and updated in place
    newroot : str, optional
       Existing directory; when given the module-level ``_newroot`` is set
       to it and ``_oldroot`` to ``amoptd['work_dir']``

    Returns
    -------
    None
       Returns early, after logging a critical message, when the ensemble
       data or the MRBUMP results are missing or empty

    Raises
    ------
    RuntimeError
       If subcluster centroid models are to be scored but
       ``amoptd['have_tmscore']`` is false

    """
    global _oldroot, _newroot
    if newroot:
        assert os.path.isdir(newroot)
        _newroot = newroot
        _oldroot = amoptd['work_dir']

    # Make sure the benchmark directory exists and work from inside it
    bench_dir = fixpath(amoptd['benchmark_dir'])
    if not os.path.isdir(bench_dir):
        os.mkdir(bench_dir)
    os.chdir(bench_dir)

    # The main script may already have run analysePdb
    if amoptd['native_pdb'] and 'native_pdb_std' not in amoptd:
        analysePdb(amoptd)

    native_std = amoptd['native_pdb_std']
    if native_std:
        # SHELXE needs an HKL and an ENT file so that phase errors can be calculated
        hkl_path = os.path.join(amoptd['benchmark_dir'], SHELXE_STEM + ".hkl")
        mtz_util.to_hkl(amoptd['mtz'], hkl_file=hkl_path)
        ent_path = os.path.join(amoptd['benchmark_dir'], SHELXE_STEM + ".ent")
        shutil.copyfile(amoptd['native_pdb_std'], ent_path)

    # Models are only analysed when none of these flags is set
    skip_flags = ('homologs', 'ideal_helices', 'import_ensembles', 'single_model_mode')
    if amoptd['native_pdb'] and not any(amoptd[flag] for flag in skip_flags):
        analyseModels(amoptd)

    # Nothing to do without ensemble data
    if 'ensembles_data' not in amoptd or len(amoptd['ensembles_data']) == 0:
        logger.critical("Benchmark cannot find any ensemble data!")
        return

    # Index the ensemble results by ensemble name
    ensembles_by_name = {ens['name']: ens for ens in amoptd['ensembles_data']}

    # Nothing to do without MRBUMP results either
    if 'mrbump_results' not in amoptd or len(amoptd['mrbump_results']) == 0:
        logger.critical("Benchmark cannot find any mrbump results!")
        return

    # Values cleaned from the native PDB that are copied into every record
    native_keys = (
        'native_pdb_code',
        'native_pdb_title',
        'native_pdb_resolution',
        'native_pdb_solvent_content',
        'native_pdb_space_group',
        'native_pdb_num_chains',
        'native_pdb_num_atoms',
        'native_pdb_num_residues',
    )

    mrinfo = shelxe.MRinfo(amoptd['shelxe_exe'], amoptd['native_pdb_info'].pdb, amoptd['mtz'])
    records = []
    for mrbump_result in amoptd['mrbump_results']:
        # Start from a copy of the MRBUMP result and layer the ensemble data on top
        record = copy.copy(mrbump_result)
        record.update(ensembles_by_name[record['ensemble_name']])
        assert record['ensemble_name'] == record['name'], record

        # Old results stored the residue count under a different key
        if 'truncation_num_residues' in record:
            record['num_residues'] = record.pop('truncation_num_residues')

        # Ideal helices can lack num_residues, so take it from the ensemble name
        if amoptd['ideal_helices'] and record.get('num_residues') is None:
            record['num_residues'] = int(record['ensemble_name'].lstrip('polyala'))

        # Size of the ensemble as a whole-number percentage of the sequence length
        fraction = float(record['num_residues']) / float(amoptd['fasta_length'])
        record['ensemble_percent_model'] = int(fraction * 100)

        if amoptd['native_pdb']:
            record.update({key: amoptd[key] for key in native_keys})
            analyseSolution(amoptd, record, mrinfo)
        records.append(record)

    # Tabulate the records and add the run-wide columns
    dframe = pd.DataFrame(records)
    dframe['ample_version'] = amoptd['ample_version']
    dframe['fasta_length'] = amoptd['fasta_length']

    # Score the subcluster centroid models against the native structure
    if 'subcluster_centroid_model' in dframe.columns and amoptd['native_pdb']:
        centroid_index = dframe.index
        centroid_models = [fixpath(path) for path in dframe['subcluster_centroid_model']]
        native_pdb_std = fixpath(amoptd['native_pdb_std'])
        fasta = fixpath(amoptd['fasta'])

        if not amoptd['have_tmscore']:
            raise RuntimeError("No program to calculate tmscores!")

        scorer = tm_util.TMscore(amoptd['tmscore_exe'], wdir=fixpath(amoptd['benchmark_dir']), **amoptd)
        comparisons = scorer.compare_structures(centroid_models, [native_pdb_std], [fasta])
        centroid_tmscores = [entry['tmscore'] for entry in comparisons]
        centroid_rmsds = [entry['rmsd'] for entry in comparisons]

        dframe['subcluster_centroid_model_TM'] = pd.Series(centroid_tmscores, index=centroid_index)
        dframe['subcluster_centroid_model_RMSD'] = pd.Series(centroid_rmsds, index=centroid_index)

    # Write the table to disk and hand the records back through amoptd
    csv_path = os.path.join(fixpath(amoptd['benchmark_dir']), 'results.csv')
    dframe.to_csv(csv_path, columns=_CSV_KEYLIST, index=False, na_rep="N/A")
    amoptd['benchmark_results'] = dframe.to_dict('records')