def gather_templates_from_pdb(pdbids, uniprot_domain_regex=None, chainids=None, structure_dirs=None, loglevel=None):
    """Gather template sequences and structures for a set of PDB entries.

    :param pdbids: list of str
    :param uniprot_domain_regex: str
    :param chainids: dict {pdbid (str): [chainid (str)]}
    :param structure_dirs: list of str
    :return:
    """
    ensembler.utils.set_loglevel(loglevel)
    manual_overrides = ensembler.core.ManualOverrides()
    selected_pdbchains = None
    # Only MPI rank 0 fetches structure files, maps PDB IDs to UniProt ACs via
    # the SIFTS files, queries UniProt, and selects template chains. The result
    # is broadcast to all ranks below.
    if mpistate.rank == 0:
        for pdbid in pdbids:
            get_structure_files_for_single_pdbchain(pdbid, structure_dirs)
        uniprot_acs = extract_uniprot_acs_from_sifts_files(pdbids)
        logger.debug('Extracted UniProt ACs: {0}'.format(uniprot_acs))
        uniprot_ac_query_string = ensembler.uniprot.build_uniprot_query_string_from_acs(uniprot_acs)
        uniprotxml = ensembler.uniprot.get_uniprot_xml(uniprot_ac_query_string)
        selected_pdbchains = extract_template_pdbchains_from_uniprot_xml(uniprotxml, uniprot_domain_regex=uniprot_domain_regex, manual_overrides=manual_overrides, specified_pdbids=pdbids, specified_chainids=chainids)
    # Collective call: executed by every rank, with rank 0 as the source.
    selected_pdbchains = mpistate.comm.bcast(selected_pdbchains, root=0)
    logger.debug('Selected PDB chains: {0}'.format([pdbchain['templateid'] for pdbchain in selected_pdbchains]))
    # Extract per-chain residues/structures and write template files + metadata.
    selected_templates = extract_template_pdb_chain_residues(selected_pdbchains)
    write_template_seqs_to_fasta_file(selected_templates)
    extract_template_structures_from_pdb_files(selected_templates)
    write_gather_templates_from_pdb_metadata(pdbids, uniprot_domain_regex, len(selected_templates), chainids, structure_dirs)
def __init__(self, targetid, traj_filepath=None, topol_filepath=None,
             models_data_filepath=None, process_only_these_templates=None,
             loglevel=None, run_main=True):
    """
    Makes trajectory of the model files with added hydrogens, but prior to
    any refinement.

    For the specified target, makes a single topology pdb file, a single
    trajectory xtc file, and individual pdb files for each model.

    See docs on `MkTraj` for further info on parameters.

    Examples
    --------
    MkTrajImplicitStart(targetid='EGFR_HUMAN_D0')
    """
    ensembler.utils.set_loglevel(loglevel)
    ensembler.core.check_project_toplevel_dir()
    self.models_target_dir = os.path.join(default_project_dirnames.models, targetid)

    logger.debug('Working on target %s' % targetid)

    self.ensembler_stage = 'implicit-start'
    self.model_filename = 'implicit-start.pdb.gz'

    # Output paths default to stage-named files in the target's models dir.
    if traj_filepath is None:
        self.traj_filepath = os.path.join(
            self.models_target_dir, 'traj-{0}.xtc'.format(self.ensembler_stage)
        )
    else:
        self.traj_filepath = traj_filepath

    if topol_filepath is None:
        self.topol_filepath = os.path.join(
            self.models_target_dir, 'traj-{0}-topol.pdb'.format(self.ensembler_stage)
        )
    else:
        self.topol_filepath = topol_filepath

    if models_data_filepath is None:
        self.models_data_filepath = os.path.join(
            self.models_target_dir, 'traj-{0}-data.csv'.format(self.ensembler_stage)
        )
    else:
        self.models_data_filepath = models_data_filepath

    if process_only_these_templates:
        self.templateids = process_only_these_templates
    else:
        # Only the immediate subdirectories (one per template) are needed.
        # Take the first os.walk() entry instead of materializing the entire
        # recursive walk of the models directory, which the original did.
        self.templateids = next(os.walk(self.models_target_dir))[1]

    if run_main:
        self._gen_implicit_start_models()
        self._gen_df(model_filename=self.model_filename)
        self.df.to_csv(self.models_data_filepath, columns=['templateid', 'seqid'])
        self._construct_traj()
        self._superpose()
        self._write_traj()
def _construct_traj(self):
    """Load every model PDB listed in self.df and concatenate them into one
    mdtraj Trajectory stored on self.traj.

    Disulfide bonds are stripped from each model's topology before
    concatenation (mdtraj requires identical topologies for `+`).
    """
    n_models = len(self.df.model_filepath)
    logger.debug('Loading Trajectory object for model {0} ({1}/{2})'.format(self.df.templateid.iloc[0], 0, n_models))
    # Use positional indexing (.iloc) throughout: label-based access like
    # `self.df.model_filepath[0]` fails if the DataFrame index is not the
    # default RangeIndex (e.g. after filtering rows).
    traj = mdtraj.load_pdb(self.df.model_filepath.iloc[0])
    remove_disulfide_bonds_from_topology(traj.topology)
    self.traj = traj
    for m, model_filepath in enumerate(self.df.model_filepath.iloc[1:]):
        logger.debug('Loading Trajectory object for model {0} ({1}/{2})'.format(self.df.templateid.iloc[m + 1], m + 1, n_models))
        traj = mdtraj.load_pdb(model_filepath)
        remove_disulfide_bonds_from_topology(traj.topology)
        self.traj += traj
def calc_pme_parameters(system):
    """Calculate PME parameters using scheme similar to OpenMM OpenCL platform.

    Parameters
    ----------
    system : simtk.openmm.System
        The system for which parameters are to be computed.

    Returns
    -------
    alpha : float
        The PME alpha parameter
    nx, ny, nz : int
        The grid numbers in each dimension
    """
    # Find nonbonded force.
    forces = {
        system.getForce(index).__class__.__name__: system.getForce(index)
        for index in range(system.getNumForces())
    }
    force = forces['NonbondedForce']
    tol = force.getEwaldErrorTolerance()
    boxVectors = system.getDefaultPeriodicBoxVectors()
    from numpy import sqrt, log, ceil
    from math import pow
    alpha = (1.0 / force.getCutoffDistance()) * sqrt(-log(2.0 * tol))
    # The tol**0.2 factor is invariant across the three axes; compute it once.
    denominator = 3 * pow(tol, 0.2)
    xsize = int(ceil(2 * alpha * boxVectors[0][0] / denominator))
    ysize = int(ceil(2 * alpha * boxVectors[1][1] / denominator))
    zsize = int(ceil(2 * alpha * boxVectors[2][2] / denominator))
    logger.debug('xsize = %d, ysize = %d, zsize = %d' % (xsize, ysize, zsize))

    def findLegalDimension(minimum):
        """Return the smallest integer >= minimum whose prime factors are all < 8."""
        while True:
            # Attempt to factor the current value.
            unfactored = minimum
            for factor in range(2, 8):
                while (unfactored > 1) and (unfactored % factor == 0):
                    # Floor division keeps the arithmetic in exact integers;
                    # `/=` performs float true-division on Python 3.
                    unfactored //= factor
            if unfactored == 1:
                return int(minimum)
            minimum += 1

    nx = findLegalDimension(xsize)
    ny = findLegalDimension(ysize)
    nz = findLegalDimension(zsize)

    return (alpha, nx, ny, nz)
def create_dir(dirpath, quiet=True):
    """Create a directory, including any missing intermediate directories.

    An already-existing directory is left untouched (and logged at debug
    level); any other OS error is re-raised.

    :param dirpath: str
    :param quiet: bool — if False, log an info message on successful creation
    """
    import errno
    try:
        os.makedirs(dirpath)
        if not quiet:
            logger.info('Created directory "%s"' % dirpath)
    except OSError as e:
        # Use the symbolic errno instead of the magic number 17 (EEXIST).
        if e.errno == errno.EEXIST:
            logger.debug('Directory "%s" already exists - will not overwrite' % dirpath)
        else:
            raise
def _mk_traj(self):
    """Decompress each gzipped model PDB into a temporary directory and load
    them all as a single mdtraj Trajectory stored on self.traj."""
    with ensembler.utils.mk_temp_dir() as tmpdir:
        model_filepaths = []
        for m, model_filepath_gz in enumerate(self.model_filepaths):
            logger.debug('Unzipping model {0}/{1}'.format(m, len(self.model_filepaths)))
            with gzip.open(model_filepath_gz) as model_file:
                model_filepath = os.path.join(tmpdir, '{0}.pdb'.format(m))
                model_filepaths.append(model_filepath)
                model_text = model_file.read()
                # gzip.open defaults to binary mode, so model_text is bytes;
                # write it back in binary mode ('w' text mode raises
                # TypeError on Python 3).
                with open(model_filepath, 'wb') as out_file:
                    out_file.write(model_text)
        self.traj = mdtraj.load(model_filepaths)
def get_valid_templates_for_target(
        target,
        templates_resolved_seq,
        process_only_these_templates=None,
        model_seqid_cutoff=None,
        model_validation_score_cutoff=None,
        model_validation_score_percentile=None,
):
    """Return the template IDs for this target that have every file required
    for Folding@Home packaging.

    The candidate set is chosen by the first applicable selector: sequence
    identity cutoff, validation-score cutoff/percentile, an explicit template
    list, or (by default) all resolved-sequence templates.
    """
    logger.debug("Building list of valid templates...")
    models_target_dir = os.path.join(default_project_dirnames.models, target.id)
    # Pick the candidate template set according to the selection options given.
    if model_seqid_cutoff:
        candidate_ids = select_templates_by_seqid_cutoff(
            target.id, seqid_cutoff=model_seqid_cutoff)
    elif model_validation_score_cutoff or model_validation_score_percentile:
        candidate_ids = select_templates_by_validation_score(
            targetid=target.id,
            validation_score_cutoff=model_validation_score_cutoff,
            validation_score_percentile=model_validation_score_percentile,
        )
    elif process_only_these_templates:
        candidate_ids = [
            seq_obj.id for seq_obj in templates_resolved_seq
            if seq_obj.id in process_only_these_templates
        ]
    else:
        candidate_ids = [seq_obj.id for seq_obj in templates_resolved_seq]

    def _has_all_required_files(template_id):
        # A template is valid only if every required file exists, either
        # plain or gzipped.
        for filename in filenames_necessary_for_fah_packaging:
            fullpath = os.path.join(models_target_dir, template_id, filename)
            if not (os.path.exists(fullpath) or os.path.exists(fullpath + '.gz')):
                return False
        return True

    valid_templates = [
        template_id for template_id in candidate_ids
        if _has_all_required_files(template_id)
    ]
    logger.debug('{} valid unique initial starting conditions found'.format(
        len(valid_templates)))
    return valid_templates
def _mk_traj(self):
    """Unzip the gzipped model PDBs into a temp dir, then load them together
    as one mdtraj Trajectory (self.traj)."""
    with ensembler.utils.mk_temp_dir() as tmpdir:
        model_filepaths = []
        for m, model_filepath_gz in enumerate(self.model_filepaths):
            logger.debug('Unzipping model {0}/{1}'.format(
                m, len(self.model_filepaths)))
            with gzip.open(model_filepath_gz) as model_file:
                model_filepath = os.path.join(tmpdir, '{0}.pdb'.format(m))
                model_filepaths.append(model_filepath)
                model_text = model_file.read()
                # gzip.open without a mode reads bytes; the original wrote
                # them to a text-mode file, which raises TypeError on
                # Python 3. Write in binary mode instead.
                with open(model_filepath, 'wb') as out_file:
                    out_file.write(model_text)
        self.traj = mdtraj.load(model_filepaths)
def calc_pme_parameters(system):
    """Calculate PME parameters using scheme similar to OpenMM OpenCL platform.

    Parameters
    ----------
    system : simtk.openmm.System
        The system for which parameters are to be computed.

    Returns
    -------
    alpha : float
        The PME alpha parameter
    nx, ny, nz : int
        The grid numbers in each dimension
    """
    # Find nonbonded force.
    forces = {
        system.getForce(index).__class__.__name__: system.getForce(index)
        for index in range(system.getNumForces())
    }
    force = forces['NonbondedForce']
    tol = force.getEwaldErrorTolerance()
    boxVectors = system.getDefaultPeriodicBoxVectors()
    from numpy import sqrt, log, ceil
    from math import pow
    alpha = (1.0 / force.getCutoffDistance()) * sqrt(-log(2.0 * tol))
    # tol**0.2 is the same for all three axes; hoist it out.
    denominator = 3 * pow(tol, 0.2)
    xsize = int(ceil(2 * alpha * boxVectors[0][0] / denominator))
    ysize = int(ceil(2 * alpha * boxVectors[1][1] / denominator))
    zsize = int(ceil(2 * alpha * boxVectors[2][2] / denominator))
    logger.debug('xsize = %d, ysize = %d, zsize = %d' % (xsize, ysize, zsize))

    def findLegalDimension(minimum):
        """Smallest integer >= minimum whose prime factorization uses only primes < 8."""
        while True:
            # Attempt to factor the current value.
            unfactored = minimum
            for factor in range(2, 8):
                while (unfactored > 1) and (unfactored % factor == 0):
                    # Integer floor division: `/=` would give floats on Python 3.
                    unfactored //= factor
            if unfactored == 1:
                return int(minimum)
            minimum += 1

    nx = findLegalDimension(xsize)
    ny = findLegalDimension(ysize)
    nz = findLegalDimension(zsize)

    return (alpha, nx, ny, nz)
def molprobity_validation(targetid, ensembler_stage=None, loglevel=None):
    """Run MolProbity validation for all valid models of a target, distributed
    over MPI ranks.

    Models are assigned round-robin to ranks; each rank scores its share, the
    (model_id, score) pairs are gathered on rank 0, sorted by score (ascending),
    and written to 'validation_scores_sorted-molprobity-<stage>' in the
    target's models directory.

    :param targetid: str
    :param ensembler_stage: str or None — defaults to the most advanced
        modeling stage found for the target
    :param loglevel: passed to set_loglevel
    """
    set_loglevel(loglevel)
    valid_model_ids = []
    if mpistate.rank == 0:
        if ensembler_stage is None:
            ensembler_stage = get_most_advanced_ensembler_modeling_stage(targetid)
        valid_model_ids = get_valid_model_ids(ensembler_stage, targetid)
    # bcast is a collective operation: every rank must call it. The original
    # guarded this bcast with `if ensembler_stage is None`, but at this point
    # rank 0 has already resolved the stage (non-None) while the other ranks
    # still hold None — so rank 0 would skip the collective call and
    # desynchronize the communicator. Call it unconditionally; when the caller
    # supplied an explicit stage it is simply broadcast unchanged.
    ensembler_stage = mpistate.comm.bcast(ensembler_stage, root=0)
    valid_model_ids = mpistate.comm.bcast(valid_model_ids, root=0)
    nvalid_model_ids = len(valid_model_ids)
    model_structure_filename = model_filenames_by_ensembler_stage[ensembler_stage]
    models_target_dir = os.path.join(default_project_dirnames.models, targetid)
    molprobity_results_filepath = os.path.join(
        models_target_dir,
        'validation_scores_sorted-molprobity-{}'.format(ensembler_stage))
    molprobity_scores_sublist = []
    # Round-robin distribution of model indices over ranks.
    for model_index in range(mpistate.rank, nvalid_model_ids, mpistate.size):
        model_id = valid_model_ids[model_index]
        logger.debug('MPI process {} working on model {}'.format(
            mpistate.rank, model_id))
        molprobity_score = run_molprobity_oneline_analysis_and_write_results(
            targetid,
            model_id,
            ensembler_stage,
            model_structure_filename=model_structure_filename,
            models_target_dir=models_target_dir,
        )
        molprobity_scores_sublist.append((model_id, molprobity_score))
    molprobity_scores_gathered_list = mpistate.comm.gather(
        molprobity_scores_sublist, root=0)
    if mpistate.rank == 0:
        # Flatten the per-rank sublists, then sort best (lowest) score first.
        molprobity_scores_list_of_tuples = [
            item for sublist in molprobity_scores_gathered_list
            for item in sublist
        ]
        molprobity_scores_sorted = sorted(molprobity_scores_list_of_tuples,
                                          key=lambda x: x[1])
        write_molprobity_scores_list(molprobity_scores_sorted,
                                     molprobity_results_filepath)
def _construct_traj(self):
    """Concatenate all model PDBs listed in self.df into one mdtraj
    Trajectory on self.traj, stripping disulfide bonds from each topology
    first (the `+` concatenation requires matching topologies)."""
    n_models = len(self.df.model_filepath)
    logger.debug(
        'Loading Trajectory object for model {0} ({1}/{2})'.format(
            self.df.templateid.iloc[0], 0, n_models))
    # `.iloc` everywhere: label-based `self.df.model_filepath[0]` raises
    # KeyError when the DataFrame index is not a default RangeIndex.
    traj = mdtraj.load_pdb(self.df.model_filepath.iloc[0])
    remove_disulfide_bonds_from_topology(traj.topology)
    self.traj = traj
    for m, model_filepath in enumerate(self.df.model_filepath.iloc[1:]):
        logger.debug(
            'Loading Trajectory object for model {0} ({1}/{2})'.format(
                self.df.templateid.iloc[m + 1], m + 1, n_models))
        traj = mdtraj.load_pdb(model_filepath)
        remove_disulfide_bonds_from_topology(traj.topology)
        self.traj += traj
def __init__(self, targetid, traj_filepath=None, topol_filepath=None,
             models_data_filepath=None, process_only_these_templates=None,
             loglevel=None, run_main=True):
    """Build an implicit-start trajectory for the given target.

    Sets up output paths (xtc trajectory, topology pdb, data csv) under the
    target's models directory, determines the template IDs to process, and —
    when run_main is True — generates the models, dataframe, and trajectory.

    :param targetid: str
    :param traj_filepath: str or None — defaults to traj-implicit-start.xtc
    :param topol_filepath: str or None — defaults to traj-implicit-start-topol.pdb
    :param models_data_filepath: str or None — defaults to traj-implicit-start-data.csv
    :param process_only_these_templates: list of str or None
    :param loglevel: passed to set_loglevel
    :param run_main: bool — run the full pipeline on construction
    """
    ensembler.utils.set_loglevel(loglevel)
    ensembler.core.check_project_toplevel_dir()
    self.models_target_dir = os.path.join(default_project_dirnames.models, targetid)

    logger.debug('Working on target %s' % targetid)

    self.ensembler_stage = 'implicit-start'
    self.model_filename = 'implicit-start.pdb.gz'

    if traj_filepath is None:
        self.traj_filepath = os.path.join(
            self.models_target_dir,
            'traj-{0}.xtc'.format(self.ensembler_stage)
        )
    else:
        self.traj_filepath = traj_filepath

    if topol_filepath is None:
        self.topol_filepath = os.path.join(
            self.models_target_dir,
            'traj-{0}-topol.pdb'.format(self.ensembler_stage)
        )
    else:
        self.topol_filepath = topol_filepath

    if models_data_filepath is None:
        self.models_data_filepath = os.path.join(
            self.models_target_dir,
            'traj-{0}-data.csv'.format(self.ensembler_stage)
        )
    else:
        self.models_data_filepath = models_data_filepath

    if process_only_these_templates:
        self.templateids = process_only_these_templates
    else:
        # `os.walk(...).next()` is Python-2-only; use the builtin next() to
        # get the first walk entry (dirpath, dirnames, filenames) and take
        # the immediate subdirectory names.
        self.templateids = next(os.walk(self.models_target_dir))[1]

    if run_main:
        self._gen_implicit_start_models()
        self._gen_df(model_filename=self.model_filename)
        self.df.to_csv(self.models_data_filepath, columns=['templateid', 'seqid'])
        self._construct_traj()
        self._superpose()
        self._write_traj()
def setup_system_and_integrator_files(target, template, temperature,
                                      collision_rate, timestep):
    """Write serialized system.xml and integrator.xml into the target's
    Folding@Home project directory.

    Deserializes the template's explicit-solvent System and State, copies the
    State's periodic box vectors into the System, makes PME parameters
    explicit, builds a Langevin integrator, and matches any MonteCarloBarostat
    to the requested temperature before serializing both objects.

    :return: the deserialized (and modified) OpenMM System
    """
    logger.debug('Copying system and integrator files for template {}'.format(template))
    template_dir = os.path.join(default_project_dirnames.models, target.id, template)
    project_dir = os.path.join(fah_projects_dir, target.id)

    system = mm.XmlSerializer.deserialize(
        read_file_contents_gz_or_not(os.path.join(template_dir, 'explicit-system.xml')))
    state = mm.XmlSerializer.deserialize(
        read_file_contents_gz_or_not(os.path.join(template_dir, 'explicit-state.xml')))

    # The State's box vectors override the System's defaults.
    system.setDefaultPeriodicBoxVectors(*state.getPeriodicBoxVectors())

    # Set PME parameters explicitly to minimize discrepancy between Reference
    # and OpenCL/CUDA if not already set explicitly.
    ensure_pme_parameters_are_explicit(system)

    integrator = mm.LangevinIntegrator(temperature, collision_rate, timestep)

    # Keep the MonteCarloBarostat (if present) at the integrator temperature.
    force_by_name = {
        system.getForce(idx).__class__.__name__: system.getForce(idx)
        for idx in range(system.getNumForces())
    }
    if 'MonteCarloBarostat' in force_by_name:
        force_by_name['MonteCarloBarostat'].setTemperature(temperature)

    with open(os.path.join(project_dir, 'system.xml'), 'w') as system_file:
        system_file.write(mm.XmlSerializer.serialize(system))
    with open(os.path.join(project_dir, 'integrator.xml'), 'w') as integrator_file:
        integrator_file.write(mm.XmlSerializer.serialize(integrator))

    return system
def sort_valid_templates_by_seqid(target, valid_templates):
    """Sort template IDs by decreasing target-template sequence identity.

    :param target: object with an `id` attribute
    :param valid_templates: iterable of template ID strings
    :return: list of template IDs, highest sequence identity first
    """
    logger.debug(
        "Sorting templates in order of decreasing sequence identity...")
    models_target_dir = os.path.join(default_project_dirnames.models, target.id)
    seqids = [get_seqid_for_model(models_target_dir, template)
              for template in valid_templates]
    # Stable sort on seqid, highest first; ties keep their original order.
    sorted_pairs = sorted(zip(valid_templates, seqids),
                          reverse=True,
                          key=lambda x: x[1])
    # The original used `zip(*pairs)[0]`, which worked on Python 2 (zip
    # returned a list) but raises TypeError on Python 3, where zip objects
    # are not subscriptable. Extract the template IDs explicitly.
    sorted_valid_templates = [template for template, seqid in sorted_pairs]
    return sorted_valid_templates
def setup_system_and_integrator_files(target, template, temperature,
                                      collision_rate, timestep):
    """Prepare the system.xml and integrator.xml files for a FAH project.

    Reads the template's explicit-solvent System/State XML, transfers the
    State's box vectors onto the System, forces explicit PME parameters,
    constructs a Langevin integrator at the given temperature, synchronizes
    any MonteCarloBarostat temperature, and serializes both to disk.

    :return: the modified OpenMM System object
    """
    logger.debug('Copying system and integrator files for template {}'.format(template))
    models_target_dir = os.path.join(default_project_dirnames.models, target.id)
    template_dir = os.path.join(models_target_dir, template)
    target_project_dir = os.path.join(fah_projects_dir, target.id)

    system_xml_in = os.path.join(template_dir, 'explicit-system.xml')
    state_xml_in = os.path.join(template_dir, 'explicit-state.xml')
    system_xml_out = os.path.join(target_project_dir, 'system.xml')
    integrator_xml_out = os.path.join(target_project_dir, 'integrator.xml')

    system = mm.XmlSerializer.deserialize(read_file_contents_gz_or_not(system_xml_in))
    state = mm.XmlSerializer.deserialize(read_file_contents_gz_or_not(state_xml_in))

    # Substitute default box vectors in system with those from state.
    box_vectors = state.getPeriodicBoxVectors()
    system.setDefaultPeriodicBoxVectors(*box_vectors)

    # Set PME parameters explicitly to minimize discrepancy between Reference
    # and OpenCL/CUDA if not already set explicitly.
    ensure_pme_parameters_are_explicit(system)

    # Create new integrator to use.
    integrator = mm.LangevinIntegrator(temperature, collision_rate, timestep)

    # Make sure MonteCarloBarostat temperature matches set temperature.
    named_forces = {
        system.getForce(i).__class__.__name__: system.getForce(i)
        for i in range(system.getNumForces())
    }
    if 'MonteCarloBarostat' in named_forces:
        named_forces['MonteCarloBarostat'].setTemperature(temperature)

    # Serialize System and Integrator.
    with open(system_xml_out, 'w') as outfile:
        outfile.write(mm.XmlSerializer.serialize(system))
    with open(integrator_xml_out, 'w') as outfile:
        outfile.write(mm.XmlSerializer.serialize(integrator))

    return system
def sort_valid_templates_by_seqid(target, valid_templates):
    """Return the template IDs ordered by decreasing sequence identity to the
    target.

    :param target: object with an `id` attribute
    :param valid_templates: iterable of template ID strings
    :return: list of template IDs, highest seqid first
    """
    logger.debug("Sorting templates in order of decreasing sequence identity...")
    models_target_dir = os.path.join(default_project_dirnames.models, target.id)
    seqids = []
    for template in valid_templates:
        seqids.append(get_seqid_for_model(models_target_dir, template))
    sorted_valid_templates_and_seqids = sorted(
        zip(valid_templates, seqids),
        reverse=True,
        key=lambda x: x[1]
    )
    # `zip(*pairs)[0]` is a Python-2 idiom: on Python 3 a zip object is not
    # subscriptable and this raised TypeError. Unpack explicitly instead.
    sorted_valid_templates = [
        template for template, seqid in sorted_valid_templates_and_seqids
    ]
    return sorted_valid_templates
def get_valid_templates_for_target(target,
                                   templates_resolved_seq,
                                   process_only_these_templates=None,
                                   model_seqid_cutoff=None,
                                   model_validation_score_cutoff=None,
                                   model_validation_score_percentile=None,
                                   ):
    """Collect the template IDs for a target whose model directories contain
    every file needed for Folding@Home packaging (plain or gzipped).

    Candidates come from exactly one selector, checked in order: seqid
    cutoff, validation-score cutoff/percentile, an explicit template list,
    or all resolved-sequence templates by default.
    """
    logger.debug("Building list of valid templates...")
    models_target_dir = os.path.join(default_project_dirnames.models, target.id)
    if model_seqid_cutoff:
        selected_template_ids = select_templates_by_seqid_cutoff(
            target.id, seqid_cutoff=model_seqid_cutoff)
    elif model_validation_score_cutoff or model_validation_score_percentile:
        selected_template_ids = select_templates_by_validation_score(
            targetid=target.id,
            validation_score_cutoff=model_validation_score_cutoff,
            validation_score_percentile=model_validation_score_percentile,
        )
    elif process_only_these_templates:
        selected_template_ids = [
            seq_obj.id for seq_obj in templates_resolved_seq
            if seq_obj.id in process_only_these_templates
        ]
    else:
        selected_template_ids = [seq_obj.id for seq_obj in templates_resolved_seq]

    # Keep a template only if every required file is present; all() mirrors
    # the original early-exit on the first missing file.
    valid_templates = [
        template_id for template_id in selected_template_ids
        if all(
            os.path.exists(os.path.join(models_target_dir, template_id, required))
            or os.path.exists(os.path.join(models_target_dir, template_id, required) + '.gz')
            for required in filenames_necessary_for_fah_packaging
        )
    ]

    logger.debug('{} valid unique initial starting conditions found'.format(len(valid_templates)))
    return valid_templates
def gather_templates_from_pdb(pdbids, uniprot_domain_regex=None, chainids=None, structure_dirs=None, loglevel=None):
    """Gather template sequences and structures for a set of PDB entries.

    :param pdbids: list of str
    :param uniprot_domain_regex: str
    :param chainids: dict {pdbid (str): [chainid (str)]}
    :param structure_dirs: list of str
    :return:
    """
    ensembler.utils.set_loglevel(loglevel)
    manual_overrides = ensembler.core.ManualOverrides()
    selected_pdbchains = None
    # Rank 0 does the file retrieval, SIFTS-based PDB→UniProt mapping,
    # UniProt query, and chain selection; results are broadcast below.
    if mpistate.rank == 0:
        for pdbid in pdbids:
            get_structure_files_for_single_pdbchain(pdbid, structure_dirs)
        uniprot_acs = extract_uniprot_acs_from_sifts_files(pdbids)
        logger.debug('Extracted UniProt ACs: {0}'.format(uniprot_acs))
        uniprot_ac_query_string = ensembler.uniprot.build_uniprot_query_string_from_acs(
            uniprot_acs)
        uniprotxml = ensembler.uniprot.get_uniprot_xml(uniprot_ac_query_string)
        selected_pdbchains = extract_template_pdbchains_from_uniprot_xml(
            uniprotxml,
            uniprot_domain_regex=uniprot_domain_regex,
            manual_overrides=manual_overrides,
            specified_pdbids=pdbids,
            specified_chainids=chainids)
    # Collective call executed on every rank; rank 0 is the source.
    selected_pdbchains = mpistate.comm.bcast(selected_pdbchains, root=0)
    logger.debug('Selected PDB chains: {0}'.format(
        [pdbchain['templateid'] for pdbchain in selected_pdbchains]))
    # Extract residues/structures per chain and write template files + metadata.
    selected_templates = extract_template_pdb_chain_residues(
        selected_pdbchains)
    write_template_seqs_to_fasta_file(selected_templates)
    extract_template_structures_from_pdb_files(selected_templates)
    write_gather_templates_from_pdb_metadata(pdbids, uniprot_domain_regex,
                                             len(selected_templates),
                                             chainids, structure_dirs)
def gather_templates_from_uniprot(uniprot_query_string, uniprot_domain_regex=None, structure_dirs=None, pdbids=None, chainids=None, loglevel=None):
    """Search UniProt with a user-defined query string, then save template
    IDs, sequences and structures.

    :param uniprot_query_string: str — UniProt search query
    :param uniprot_domain_regex: str or None — select domains by name
    :param structure_dirs: list of str or None — local dirs to search for structure files
    :param pdbids: list of str or None — restrict selection to these PDB IDs
    :param chainids: dict {pdbid (str): [chainid (str)]} or None — restrict to these chains
    :param loglevel: passed to set_loglevel
    """
    ensembler.utils.set_loglevel(loglevel)
    manual_overrides = ensembler.core.ManualOverrides()
    selected_pdbchains = None
    # Rank 0 performs the UniProt query, chain selection, and structure-file
    # retrieval; the selection is broadcast to the other ranks below.
    if mpistate.rank == 0:
        uniprotxml = ensembler.uniprot.get_uniprot_xml(uniprot_query_string)
        log_unique_domain_names(uniprot_query_string, uniprotxml)
        if uniprot_domain_regex is not None:
            log_unique_domain_names_selected_by_regex(uniprot_domain_regex, uniprotxml)
        selected_pdbchains = extract_template_pdbchains_from_uniprot_xml(uniprotxml, uniprot_domain_regex=uniprot_domain_regex, manual_overrides=manual_overrides, specified_pdbids=pdbids, specified_chainids=chainids)
        get_structure_files(selected_pdbchains, structure_dirs)
    # Collective call executed by all ranks, rank 0 as the source.
    selected_pdbchains = mpistate.comm.bcast(selected_pdbchains, root=0)
    logger.debug('Selected PDB chains: {0}'.format([pdbchain['templateid'] for pdbchain in selected_pdbchains]))
    selected_templates = extract_template_pdb_chain_residues(selected_pdbchains)
    write_template_seqs_to_fasta_file(selected_templates)
    extract_template_structures_from_pdb_files(selected_templates)
    write_gather_templates_from_uniprot_metadata(uniprot_query_string, uniprot_domain_regex, len(selected_templates), structure_dirs)
def molprobity_validation(targetid, ensembler_stage=None, loglevel=None):
    """Run MolProbity validation over all valid models for a target, in
    parallel across MPI ranks.

    Each rank scores a round-robin share of the models; rank 0 gathers the
    (model_id, score) pairs, sorts them ascending by score, and writes the
    sorted list to the target's models directory.

    :param targetid: str
    :param ensembler_stage: str or None — defaults to the most advanced
        modeling stage found for the target
    :param loglevel: passed to set_loglevel
    """
    set_loglevel(loglevel)
    valid_model_ids = []
    if mpistate.rank == 0:
        if ensembler_stage is None:
            ensembler_stage = get_most_advanced_ensembler_modeling_stage(targetid)
        valid_model_ids = get_valid_model_ids(ensembler_stage, targetid)
    # bcast is collective — all ranks must call it. The original wrapped this
    # bcast in `if ensembler_stage is None`, but rank 0 has already resolved
    # the stage here while the other ranks still hold None, so rank 0 would
    # skip the call and desynchronize the communicator. Broadcast
    # unconditionally (harmless when the stage was passed in explicitly).
    ensembler_stage = mpistate.comm.bcast(ensembler_stage, root=0)
    valid_model_ids = mpistate.comm.bcast(valid_model_ids, root=0)
    nvalid_model_ids = len(valid_model_ids)
    model_structure_filename = model_filenames_by_ensembler_stage[ensembler_stage]
    models_target_dir = os.path.join(default_project_dirnames.models, targetid)
    molprobity_results_filepath = os.path.join(
        models_target_dir, "validation_scores_sorted-molprobity-{}".format(ensembler_stage)
    )
    molprobity_scores_sublist = []
    # Round-robin assignment of models to ranks.
    for model_index in range(mpistate.rank, nvalid_model_ids, mpistate.size):
        model_id = valid_model_ids[model_index]
        logger.debug("MPI process {} working on model {}".format(mpistate.rank, model_id))
        molprobity_score = run_molprobity_oneline_analysis_and_write_results(
            targetid,
            model_id,
            ensembler_stage,
            model_structure_filename=model_structure_filename,
            models_target_dir=models_target_dir,
        )
        molprobity_scores_sublist.append((model_id, molprobity_score))
    molprobity_scores_gathered_list = mpistate.comm.gather(molprobity_scores_sublist, root=0)
    if mpistate.rank == 0:
        # Flatten per-rank sublists, then sort best (lowest) score first.
        molprobity_scores_list_of_tuples = [item for sublist in molprobity_scores_gathered_list for item in sublist]
        molprobity_scores_sorted = sorted(molprobity_scores_list_of_tuples, key=lambda x: x[1])
        write_molprobity_scores_list(molprobity_scores_sorted, molprobity_results_filepath)
def gather_templates_from_uniprot(uniprot_query_string, uniprot_domain_regex=None, structure_dirs=None, pdbids=None, chainids=None, loglevel=None):
    """Search UniProt with a user-defined query string, then save template
    IDs, sequences and structures.

    :param uniprot_query_string: str — UniProt search query
    :param uniprot_domain_regex: str or None — select domains by name
    :param structure_dirs: list of str or None — local dirs to search for structure files
    :param pdbids: list of str or None — restrict selection to these PDB IDs
    :param chainids: dict {pdbid (str): [chainid (str)]} or None — restrict to these chains
    :param loglevel: passed to set_loglevel
    """
    ensembler.utils.set_loglevel(loglevel)
    manual_overrides = ensembler.core.ManualOverrides()
    selected_pdbchains = None
    # Rank 0 queries UniProt, selects template chains, and fetches structure
    # files; the selection is broadcast to all ranks below.
    if mpistate.rank == 0:
        uniprotxml = ensembler.uniprot.get_uniprot_xml(uniprot_query_string)
        log_unique_domain_names(uniprot_query_string, uniprotxml)
        if uniprot_domain_regex is not None:
            log_unique_domain_names_selected_by_regex(uniprot_domain_regex, uniprotxml)
        selected_pdbchains = extract_template_pdbchains_from_uniprot_xml(
            uniprotxml,
            uniprot_domain_regex=uniprot_domain_regex,
            manual_overrides=manual_overrides,
            specified_pdbids=pdbids,
            specified_chainids=chainids)
        get_structure_files(selected_pdbchains, structure_dirs)
    # Collective call executed by all ranks, rank 0 as the source.
    selected_pdbchains = mpistate.comm.bcast(selected_pdbchains, root=0)
    logger.debug('Selected PDB chains: {0}'.format(
        [pdbchain['templateid'] for pdbchain in selected_pdbchains]))
    selected_templates = extract_template_pdb_chain_residues(
        selected_pdbchains)
    write_template_seqs_to_fasta_file(selected_templates)
    extract_template_structures_from_pdb_files(selected_templates)
    write_gather_templates_from_uniprot_metadata(uniprot_query_string,
                                                 uniprot_domain_regex,
                                                 len(selected_templates),
                                                 structure_dirs)
def run_molprobity_oneline_analysis_and_write_results(
        targetid,
        model_id,
        ensembler_stage,
        model_structure_filename=None,
        models_target_dir=None,
        check_for_existing_results=True,
):
    """Run MolProbity's oneline analysis on one model, write the results to
    disk, and return the MolProbity score.

    :param targetid: str
    :param model_id: str
    :param ensembler_stage: str — determines the model structure filename and
        the names of the results files
    :param model_structure_filename: str or None — defaults to the standard
        filename for the given stage
    :param models_target_dir: str or None — defaults to the project models
        directory for targetid
    :param check_for_existing_results: bool — if True, reuse a previously
        written score rather than rerunning MolProbity
    :return: the MolProbity score, or None if MolProbity produced no results
    """
    if model_structure_filename is None:
        model_structure_filename = model_filenames_by_ensembler_stage[
            ensembler_stage]
    if models_target_dir is None:
        models_target_dir = os.path.join(default_project_dirnames.models,
                                         targetid)
    results_output_filepath = os.path.join(
        models_target_dir, model_id,
        'molprobity-{}.yaml'.format(ensembler_stage))
    if check_for_existing_results:
        # Short-circuit: reuse a cached score from a previous run if present.
        if os.path.exists(results_output_filepath):
            with open(results_output_filepath) as results_output_file:
                prev_results = yaml.load(stream=results_output_file,
                                         Loader=YamlLoader)
                prev_molprobity_score = prev_results.get('MolProbityScore')
                if prev_molprobity_score is not None:
                    logger.debug(
                        'Existing MolProbity score of {} found for model {}'.
                        format(prev_molprobity_score, model_id))
                    return prev_molprobity_score
    molprobity_results = run_molprobity_oneline_analysis(
        targetid, model_id, model_structure_filename)
    if molprobity_results is None:
        logger.debug(
            'MolProbity returned no results for model {}'.format(model_id))
        return None
    logger.debug('MolProbity score of {} calculated for model {}'.format(
        molprobity_results.get('MolProbityScore'), model_id))
    molprobity_score = molprobity_results.get('MolProbityScore')
    # Only persist results that actually include a score.
    if molprobity_score is not None:
        write_molprobity_results_for_target(molprobity_results,
                                            models_target_dir, model_id,
                                            ensembler_stage)
    return molprobity_score
def run_molprobity_oneline_analysis_and_write_results(
    targetid,
    model_id,
    ensembler_stage,
    model_structure_filename=None,
    models_target_dir=None,
    check_for_existing_results=True,
):
    """Run MolProbity's oneline analysis on one model, write the results to
    disk, and return the MolProbity score.

    :param targetid: str
    :param model_id: str
    :param ensembler_stage: str — determines the model structure filename and
        the names of the results files
    :param model_structure_filename: str or None — defaults to the standard
        filename for the given stage
    :param models_target_dir: str or None — defaults to the project models
        directory for targetid
    :param check_for_existing_results: bool — if True, reuse a previously
        written score rather than rerunning MolProbity
    :return: the MolProbity score, or None if MolProbity produced no results
    """
    if model_structure_filename is None:
        model_structure_filename = model_filenames_by_ensembler_stage[ensembler_stage]
    if models_target_dir is None:
        models_target_dir = os.path.join(default_project_dirnames.models, targetid)
    results_output_filepath = os.path.join(models_target_dir, model_id, "molprobity-{}.yaml".format(ensembler_stage))
    if check_for_existing_results:
        # Short-circuit: reuse a cached score from a previous run if present.
        if os.path.exists(results_output_filepath):
            with open(results_output_filepath) as results_output_file:
                prev_results = yaml.load(stream=results_output_file, Loader=YamlLoader)
                prev_molprobity_score = prev_results.get("MolProbityScore")
                if prev_molprobity_score is not None:
                    logger.debug(
                        "Existing MolProbity score of {} found for model {}".format(prev_molprobity_score, model_id)
                    )
                    return prev_molprobity_score
    molprobity_results = run_molprobity_oneline_analysis(targetid, model_id, model_structure_filename)
    if molprobity_results is None:
        logger.debug("MolProbity returned no results for model {}".format(model_id))
        return None
    logger.debug(
        "MolProbity score of {} calculated for model {}".format(molprobity_results.get("MolProbityScore"), model_id)
    )
    molprobity_score = molprobity_results.get("MolProbityScore")
    # Only persist results that actually include a score.
    if molprobity_score is not None:
        write_molprobity_results_for_target(molprobity_results, models_target_dir, model_id, ensembler_stage)
    return molprobity_score
def build_model(target, template_resolved_seq, target_setup_data,
                write_modeller_restraints_file=False, loglevel=None):
    """Uses Modeller to build a homology model for a given target and
    template.

    Will not run Modeller if the output files already exist.

    Parameters
    ----------
    target : BioPython SeqRecord
    template_resolved_seq : BioPython SeqRecord
        Must be a corresponding .pdb template file with the same ID in the
        templates/structures directory.
    target_setup_data : TargetSetupData obj
    write_modeller_restraints_file : bool
        Write file containing restraints used by Modeller - note that this
        file can be relatively large, e.g. ~300KB per model for a protein
        kinase domain target.
    loglevel : bool
    """
    ensembler.utils.set_loglevel(loglevel)
    template_structure_dir = os.path.abspath(
        ensembler.core.default_project_dirnames.templates_structures_modeled_loops)
    # Prefer a loop-remodeled template (structure + pdbfixed sequence) when one
    # exists; otherwise fall back to the resolved-residues template.
    if os.path.exists(
            os.path.join(template_structure_dir, template_resolved_seq.id + '.pdb')):
        remodeled_seq_filepath = os.path.join(
            ensembler.core.default_project_dirnames.templates_structures_modeled_loops,
            template_resolved_seq.id + '-pdbfixed.fasta')
        template = list(Bio.SeqIO.parse(remodeled_seq_filepath, 'fasta'))[0]
    else:
        template = template_resolved_seq
        template_structure_dir = os.path.abspath(
            ensembler.core.default_project_dirnames.templates_structures_resolved)
    model_dir = os.path.abspath(
        os.path.join(target_setup_data.models_target_dir, template.id))
    if not os.path.exists(model_dir):
        ensembler.utils.create_dir(model_dir)
    model_pdbfilepath = os.path.abspath(os.path.join(model_dir, 'model.pdb.gz'))
    modeling_log_filepath = os.path.abspath(
        os.path.join(model_dir, 'modeling-log.yaml'))
    check_model_pdbfilepath_ends_in_pdbgz(model_pdbfilepath)
    # Path of the uncompressed output: strip the trailing '.gz'.
    model_pdbfilepath_uncompressed = model_pdbfilepath[:-3]
    if check_all_model_files_present(model_dir):
        logger.debug(
            "Output files already exist for target '%s' // template '%s'; files were not overwritten."
            % (target.id, template.id))
        return
    logger.info(
        '-------------------------------------------------------------------------\n'
        'Modelling "%s" => "%s"\n'
        '-------------------------------------------------------------------------'
        % (target.id, template.id))
    # NOTE(review): alignment generation is commented out below, so
    # alignment.pir must already exist in model_dir — confirm against the
    # pipeline stage that writes it.
    # aln = align_target_template(target, template)
    aln_filepath = os.path.abspath(os.path.join(model_dir, 'alignment.pir'))
    # write_modeller_pir_aln_file(aln, target, template, pir_aln_filepath=aln_filepath)
    log_file = init_build_model_logfile(modeling_log_filepath)
    # Run Modeller in a scratch directory; the alignment is copied in first.
    with ensembler.utils.enter_temp_dir():
        try:
            start = datetime.datetime.utcnow()
            shutil.copy(aln_filepath, 'alignment.pir')
            run_modeller(
                target, template, model_dir, model_pdbfilepath,
                model_pdbfilepath_uncompressed, template_structure_dir,
                write_modeller_restraints_file=write_modeller_restraints_file)
            if os.path.getsize(model_pdbfilepath) < 1:
                raise Exception('Output PDB file is empty.')
            end_successful_build_model_logfile(log_file, start)
        except Exception as e:
            # Failures are recorded in the modeling log rather than re-raised,
            # so a single bad template does not abort the whole build.
            end_exception_build_model_logfile(e, log_file)
def package_for_fah(process_only_these_targets=None,
                    process_only_these_templates=None,
                    model_seqid_cutoff=None,
                    model_validation_score_cutoff=None,
                    model_validation_score_percentile=None,
                    nclones=1,
                    archive=False,
                    openmm_platform='Reference',
                    temperature=300.0 * unit.kelvin,
                    collision_rate=1.0 / unit.picosecond,
                    timestep=2.0 * unit.femtoseconds,
                    loglevel=None):
    """
    Create the input files and directory structure necessary to start a
    Folding@Home project.

    MPI-enabled.

    Parameters
    ----------
    archive : Bool
        A .tgz compressed archive will be created for each individual RUN
        directory.
    """
    set_loglevel(loglevel)
    # Only rank 0 creates the top-level FAH projects dir; everyone waits.
    if mpistate.rank == 0:
        if not os.path.exists(fah_projects_dir):
            os.mkdir(fah_projects_dir)
    mpistate.comm.Barrier()

    targets, templates_resolved_seq = get_targets_and_templates()

    for target in targets:
        if process_only_these_targets and (target.id not in process_only_these_targets):
            continue

        target_project_dir = os.path.join(fah_projects_dir, target.id)
        models_target_dir = os.path.join(default_project_dirnames.models, target.id)

        if not os.path.exists(models_target_dir):
            continue

        mpistate.comm.Barrier()

        # Rank 0 selects/sorts templates and builds the System; results are
        # broadcast to all ranks below.
        sorted_valid_templates = []
        system = None
        renumbered_resnums = {}

        if mpistate.rank == 0:
            logger.info(
                '-------------------------------------------------------------------------'
            )
            logger.info('Building FAH OpenMM project for target {}'.format(
                target.id))
            logger.info(
                '-------------------------------------------------------------------------'
            )
            valid_templates = get_valid_templates_for_target(
                target,
                templates_resolved_seq,
                process_only_these_templates=process_only_these_templates,
                model_seqid_cutoff=model_seqid_cutoff,
                model_validation_score_cutoff=model_validation_score_cutoff,
                model_validation_score_percentile=
                model_validation_score_percentile)

            sorted_valid_templates = sort_valid_templates_by_seqid(
                target, valid_templates)

            create_target_project_dir(target)

            # System/integrator files are set up from the highest-seqid
            # template.
            system = setup_system_and_integrator_files(
                target, sorted_valid_templates[0], temperature, collision_rate,
                timestep)

            renumbered_resnums = get_renumbered_topol_resnums(target)

        sorted_valid_templates = mpistate.comm.bcast(sorted_valid_templates, root=0)
        system = mpistate.comm.bcast(system, root=0)
        renumbered_resnums = mpistate.comm.bcast(renumbered_resnums, root=0)

        logger.debug("Building RUNs in parallel...")
        # Stride the RUN indices across MPI ranks.
        for run_index in range(mpistate.rank, len(sorted_valid_templates),
                               mpistate.size):
            template = sorted_valid_templates[run_index]

            logger.info(
                '-------------------------------------------------------------------------'
            )
            logger.info('Building RUN{} for template {}'.format(
                run_index, template))
            logger.info(
                '-------------------------------------------------------------------------'
            )

            source_dir = os.path.join(models_target_dir, template)
            generate_fah_run(
                target_project_dir,
                template,
                source_dir,
                system,
                run_index,
                nclones,
                temperature,
                collision_rate,
                timestep,
                openmm_platform,
                renumbered_resnums,
            )

            if archive:
                tgz_fah_run(target, run_index)

    mpistate.comm.Barrier()
    if mpistate.rank == 0:
        logger.info('Done.')
def mktraj(targetid, ensembler_stage=None, traj_filepath=None, topol_filepath=None,
           models_data_filepath=None, process_only_these_templates=None):
    """Makes a trajectory for a given target, using mdtraj. The trajectory can be used with other
    software, e.g. for visualization with PyMOL or VMD.

    Parameters
    ----------
    targetid : str
        e.g. 'EGFR_HUMAN_D0'
    ensembler_stage : str
        The Ensembler stage from which to build models, e.g. 'build_models' results in a trajectory
        built from the 'model.pdb.gz' files output by the build_models command.
        options: build_models|refine_implicit_md|refine_explicit_md
        default: most advanced stage for which model files are available
    traj_filepath : str
        default: models/[targetid]/traj-[ensembler_stage].xtc
    topol_filepath : str
        default: models/[targetid]/traj-[ensembler_stage]-topol.pdb
    models_data_filepath :
        default: models/[targetid]/traj-[ensembler_stage]-data.csv
    process_only_these_templates : list of str

    Returns
    -------
    traj : mdtraj.Trajectory
    df : pandas.DataFrame
        models data (e.g. sequence identities)

    Raises
    ------
    Exception
        If no modeling stage has completed for this target yet.
    """
    ensembler.core.check_project_toplevel_dir()
    models_target_dir = os.path.join(ensembler.core.default_project_dirnames.models, targetid)

    logger.debug('Working on target %s' % targetid)

    # Auto-detect the most advanced completed modeling stage if not specified.
    if ensembler_stage is None:
        for stagename in ['refine_explicit_md', 'refine_implicit_md', 'build_models']:
            if check_ensembler_modeling_stage_complete(stagename, targetid):
                ensembler_stage = stagename
                break

    if ensembler_stage is None:
        raise Exception('Models have not yet been built for this Ensembler project.')

    if traj_filepath is None:
        traj_filepath = os.path.join(models_target_dir, 'traj-{0}.xtc'.format(ensembler_stage))
    if topol_filepath is None:
        topol_filepath = os.path.join(models_target_dir, 'traj-{0}-topol.pdb'.format(ensembler_stage))
    if models_data_filepath is None:
        models_data_filepath = os.path.join(models_target_dir, 'traj-{0}-data.csv'.format(ensembler_stage))

    if process_only_these_templates:
        templateids = process_only_these_templates
    else:
        # FIX: os.walk(...).next() was Python-2-only; next() works on both.
        # Only the immediate subdirectories of the target dir are wanted.
        subdirs = next(os.walk(models_target_dir))[1]
        templateids = [subdir for subdir in subdirs if '_D' in subdir]

    model_filename = ensembler.core.model_filenames_by_ensembler_stage[ensembler_stage]

    # Keep only templates whose model file for the chosen stage exists on disk.
    valid_model_templateids = [
        templateid for templateid in templateids
        if os.path.exists(os.path.join(models_target_dir, templateid, model_filename))
    ]
    valid_model_filepaths = [
        os.path.join(models_target_dir, templateid, model_filename)
        for templateid in valid_model_templateids
    ]

    # Sequence identities may be missing for some models; record None there.
    seqid_filepaths = [
        os.path.join(models_target_dir, templateid, 'sequence-identity.txt')
        for templateid in valid_model_templateids
    ]
    seqids = [
        float(open(seqid_filepath).read().strip()) if os.path.exists(seqid_filepath) else None
        for seqid_filepath in seqid_filepaths
    ]

    df = pd.DataFrame({
        'templateid': valid_model_templateids,
        'model_filepath': valid_model_filepaths,
        'seqid': seqids,
    })
    # FIX: DataFrame.sort was removed from pandas (>=0.20); use sort_values.
    df.sort_values(by='seqid', ascending=False, inplace=True)
    df.reset_index(drop=True, inplace=True)

    df.to_csv(models_data_filepath, columns=['templateid', 'seqid'])

    # construct traj: load each model PDB and concatenate frames
    traj = mdtraj.load_pdb(df.model_filepath[0])
    for model_filepath in df.model_filepath[1:]:
        traj += mdtraj.load_pdb(model_filepath)

    # superpose structured C-alphas (residues assigned helix or strand by DSSP
    # in the first frame)
    dssp = mdtraj.compute_dssp(traj[0])[0]
    structured_resis_bool = (dssp == 'H') + (dssp == 'E')
    alpha_indices = traj.topology.select_atom_indices('alpha')
    structured_alpha_indices = np.array([
        alpha_indices[x] for x in range(traj.n_residues) if structured_resis_bool[x]
    ])
    traj.superpose(reference=traj, frame=0, atom_indices=structured_alpha_indices)

    # write traj, and write first frame as pdb file (topology)
    traj[0].save(topol_filepath)
    traj.save(traj_filepath)
    return traj, df
def build_model(target, template_resolved_seq, target_setup_data,
                write_modeller_restraints_file=False, loglevel=None):
    """Uses Modeller to build a homology model for a given target and
    template.

    Will not run Modeller if the output files already exist.

    NOTE(review): this file contains two identical definitions of
    ``build_model``; this later one is the definition in effect at import
    time. Consider removing the duplicate.

    Parameters
    ----------
    target : BioPython SeqRecord
    template_resolved_seq : BioPython SeqRecord
        Must be a corresponding .pdb template file with the same ID in the
        templates/structures directory.
    target_setup_data : TargetSetupData obj
    write_modeller_restraints_file : bool
        Write file containing restraints used by Modeller - note that this
        file can be relatively large, e.g. ~300KB per model for a protein
        kinase domain target.
    loglevel : str
        Logging level name passed to ensembler.utils.set_loglevel.
    """
    ensembler.utils.set_loglevel(loglevel)
    # Prefer the loop-remodeled template (and its pdbfixed sequence) if
    # present; otherwise use the resolved-residues template structure.
    template_structure_dir = os.path.abspath(
        ensembler.core.default_project_dirnames.templates_structures_modeled_loops
    )

    if os.path.exists(os.path.join(template_structure_dir, template_resolved_seq.id + '.pdb')):
        remodeled_seq_filepath = os.path.join(
            ensembler.core.default_project_dirnames.templates_structures_modeled_loops,
            template_resolved_seq.id + '-pdbfixed.fasta'
        )
        template = list(Bio.SeqIO.parse(remodeled_seq_filepath, 'fasta'))[0]
    else:
        template = template_resolved_seq
        template_structure_dir = os.path.abspath(
            ensembler.core.default_project_dirnames.templates_structures_resolved
        )

    model_dir = os.path.abspath(os.path.join(target_setup_data.models_target_dir, template.id))
    if not os.path.exists(model_dir):
        ensembler.utils.create_dir(model_dir)
    model_pdbfilepath = os.path.abspath(os.path.join(model_dir, 'model.pdb.gz'))
    modeling_log_filepath = os.path.abspath(os.path.join(model_dir, 'modeling-log.yaml'))

    check_model_pdbfilepath_ends_in_pdbgz(model_pdbfilepath)
    # Uncompressed output path: strip the trailing '.gz'.
    model_pdbfilepath_uncompressed = model_pdbfilepath[:-3]

    # Skip work entirely if all expected outputs are already on disk.
    if check_all_model_files_present(model_dir):
        logger.debug(
            "Output files already exist for target '%s' // template '%s'; files were not overwritten."
            % (target.id, template.id)
        )
        return

    logger.info(
        '-------------------------------------------------------------------------\n'
        'Modelling "%s" => "%s"\n'
        '-------------------------------------------------------------------------'
        % (target.id, template.id)
    )

    # aln = align_target_template(target, template)
    aln_filepath = os.path.abspath(os.path.join(model_dir, 'alignment.pir'))
    # write_modeller_pir_aln_file(aln, target, template, pir_aln_filepath=aln_filepath)
    log_file = init_build_model_logfile(modeling_log_filepath)

    # Modeller runs inside a scratch temp dir; run_modeller copies outputs
    # back to model_dir.
    with ensembler.utils.enter_temp_dir():
        try:
            start = datetime.datetime.utcnow()
            shutil.copy(aln_filepath, 'alignment.pir')
            run_modeller(target, template, model_dir, model_pdbfilepath,
                         model_pdbfilepath_uncompressed, template_structure_dir,
                         write_modeller_restraints_file=write_modeller_restraints_file)
            if os.path.getsize(model_pdbfilepath) < 1:
                raise Exception('Output PDB file is empty.')
            end_successful_build_model_logfile(log_file, start)
        except Exception as e:
            # Failures are recorded in the per-model YAML log rather than
            # propagated, so one bad template does not abort the whole run.
            end_exception_build_model_logfile(e, log_file)
def cluster_models(process_only_these_targets=None, cutoff=0.06, loglevel=None):
    """Cluster models based on RMSD, and filter out non-unique models as
    determined by a given cutoff.

    Parameters
    ----------
    cutoff : float
        Minimum distance cutoff for RMSD clustering (nm)
    loglevel : str
        Logging level name passed to ensembler.utils.set_loglevel.

    Runs serially.
    """
    # TODO refactor
    ensembler.utils.set_loglevel(loglevel)
    targets, templates_resolved_seq = get_targets_and_templates()
    templates = templates_resolved_seq

    for target in targets:
        if process_only_these_targets and (target.id not in process_only_these_targets):
            continue

        models_target_dir = os.path.join(ensembler.core.default_project_dirnames.models, target.id)
        if not os.path.exists(models_target_dir):
            continue

        # =============================
        # Construct a mdtraj trajectory containing all models
        # =============================

        starttime = datetime.datetime.utcnow()

        logger.debug('Building a list of valid models...')

        model_pdbfilenames_compressed = {
            template.id: os.path.join(models_target_dir, template.id, 'model.pdb.gz')
            for template in templates
        }
        model_pdbfilenames_uncompressed = {
            template.id: os.path.join(models_target_dir, template.id, 'model.pdb')
            for template in templates
        }
        valid_templateids = [
            templateid for templateid in model_pdbfilenames_compressed
            if os.path.exists(model_pdbfilenames_compressed[templateid])
        ]

        # Write uncompressed model.pdb files from model.pdb.gz if necessary
        # (mdtraj.load below reads the uncompressed files).
        for templateid in valid_templateids:
            if not os.path.exists(model_pdbfilenames_uncompressed[templateid]) or os.path.getsize(model_pdbfilenames_uncompressed[templateid]) == 0:
                with gzip.open(model_pdbfilenames_compressed[templateid]) as model_pdbfile_compressed:
                    with open(model_pdbfilenames_uncompressed[templateid], 'w') as model_pdbfile:
                        model_pdbfile.write(model_pdbfile_compressed.read())

        logger.info('Constructing a trajectory containing all valid models...')

        if len(valid_templateids) == 0:
            logger.info('No models found for target {0}.'.format(target.id))
            continue

        valid_model_pdbfilenames_uncompressed = [
            model_pdbfilenames_uncompressed[templateid]
            for templateid in valid_templateids
        ]

        traj = mdtraj.load(valid_model_pdbfilenames_uncompressed)

        # =============================
        # Clustering
        # =============================

        logger.info('Conducting RMSD-based clustering...')

        # Remove any existing unique_by_clustering files
        for f in glob.glob(models_target_dir+'/*_PK_*/unique_by_clustering'):
            os.unlink(f)

        # Cluster on C-alpha atoms only.
        CAatoms = [a.index for a in traj.topology.atoms if a.name == 'CA']
        unique_templateids = models_regular_spatial_clustering(
            valid_templateids, traj, atom_indices=CAatoms, cutoff=cutoff
        )
        write_unique_by_clustering_files(unique_templateids, models_target_dir)

        with open(os.path.join(models_target_dir, 'unique-models.txt'), 'w') as uniques_file:
            for u in unique_templateids:
                uniques_file.write(u+'\n')
            logger.info(
                '%d unique models (from original set of %d) using cutoff of %.3f nm'
                % (len(unique_templateids), len(valid_templateids), cutoff)
            )

        # Clean up the temporary uncompressed model files.
        for template in templates:
            model_dir = os.path.join(models_target_dir, template.id)
            model_pdbfilename = os.path.join(model_dir, 'model.pdb')
            if os.path.exists(model_pdbfilename):
                os.remove(model_pdbfilename)

        # ========
        # Metadata
        # ========

        project_metadata = ensembler.core.ProjectMetadata(
            project_stage='cluster_models', target_id=target.id
        )

        datestamp = ensembler.core.get_utcnow_formatted()

        timedelta = datetime.datetime.utcnow() - starttime

        metadata = {
            'target_id': target.id,
            'datestamp': datestamp,
            'nunique_models': len(unique_templateids),
            'python_version': sys.version.split('|')[0].strip(),
            'python_full_version': ensembler.core.literal_str(sys.version),
            'ensembler_version': ensembler.version.short_version,
            'ensembler_commit': ensembler.version.git_revision,
            'biopython_version': Bio.__version__,
            'mdtraj_version': mdtraj.version.short_version,
            'mdtraj_commit': mdtraj.version.git_revision,
            'timing': ensembler.core.strf_timedelta(timedelta),
        }

        project_metadata.add_data(metadata)
        project_metadata.write()
def _gen_implicit_start_models(self, ff='amber99sbildn.xml',
                               implicit_water_model='amber99_obc.xml',
                               ph=8.0):
    """Generate 'implicit-start' models: build_models output with disulfide
    bonds removed and hydrogens added, using protonation-state variants
    derived from a single reference model so all models share a consistent
    topology.

    Parameters
    ----------
    ff : str
        OpenMM force field XML filename.
    implicit_water_model : str
        OpenMM implicit solvent XML filename.
    ph : float
        pH used for hydrogen addition (stored on self.ph).
    """
    self.ph = ph
    from simtk.openmm import app
    # Templates which have completed implicit-solvent refinement...
    valid_model_templateids = [
        templateid for templateid in self.templateids
        if os.path.exists(
            os.path.join(
                self.models_target_dir, templateid,
                ensembler.core.model_filenames_by_ensembler_stage['refine_implicit_md']))
    ]
    # ...for which the implicit-start model has not yet been generated.
    gen_model_templateids = [
        templateid for templateid in valid_model_templateids
        if not os.path.exists(
            os.path.join(self.models_target_dir, templateid, self.model_filename))
    ]

    # make reference model
    forcefield = app.ForceField(ff, implicit_water_model)
    reference_model_id = get_highest_seqid_existing_model(
        models_target_dir=self.models_target_dir)
    logger.debug('Using {0} as reference model'.format(reference_model_id))
    reference_model_path = os.path.join(
        self.models_target_dir, reference_model_id,
        model_filenames_by_ensembler_stage['build_models'])

    with gzip.open(reference_model_path) as reference_pdb_file:
        reference_pdb = app.PDBFile(reference_pdb_file)

    remove_disulfide_bonds_from_topology(reference_pdb.topology)
    reference_topology = reference_pdb.topology
    reference_modeller = app.Modeller(reference_pdb.topology,
                                      reference_pdb.positions)
    # Variants returned here are reused for every model so protonation
    # states stay consistent across the ensemble.
    reference_variants = reference_modeller.addHydrogens(forcefield,
                                                         pH=self.ph)

    # Stride the remaining templates across MPI ranks.
    for template_index in range(mpistate.rank, len(gen_model_templateids),
                                mpistate.size):
        templateid = gen_model_templateids[template_index]
        logger.debug(
            'Generating implicit-start model for {0}'.format(templateid))
        try:
            input_model_filepath = os.path.join(
                self.models_target_dir, templateid,
                model_filenames_by_ensembler_stage['build_models'])
            output_model_filepath = os.path.join(self.models_target_dir,
                                                 templateid,
                                                 self.model_filename)

            with gzip.open(input_model_filepath) as pdb_file:
                pdb = app.PDBFile(pdb_file)

            remove_disulfide_bonds_from_topology(pdb.topology)
            # Reference topology + this model's positions keeps topologies
            # identical across models.
            modeller = app.Modeller(reference_topology, pdb.positions)
            modeller.addHydrogens(forcefield, pH=self.ph,
                                  variants=reference_variants)
            topology = modeller.getTopology()
            positions = modeller.getPositions()

            with gzip.open(output_model_filepath, 'wt') as output_model_file:
                app.PDBFile.writeHeader(topology, file=output_model_file)
                app.PDBFile.writeFile(topology, positions,
                                      file=output_model_file)
                app.PDBFile.writeFooter(topology, file=output_model_file)

        except Exception as e:
            # Best-effort: report the failure and continue with the next
            # template.
            print('Error for model {0}: {1}'.format(templateid, e))
            continue
def pdbfix_template(template_full_seq, overwrite_structures=False):
    """
    Parameters
    ----------
    template_full_seq: BioPython SeqRecord
        full UniProt sequence for span of the template (including unresolved
        residues)
    overwrite_structures: bool

    Returns
    -------
    fixer.missingResidues
    """
    try:
        template_pdbfixed_filepath = os.path.join(
            ensembler.core.default_project_dirnames.templates_structures_modeled_loops,
            template_full_seq.id + '-pdbfixed.pdb'
        )
        seq_pdbfixed_filepath = os.path.join(
            ensembler.core.default_project_dirnames.templates_structures_modeled_loops,
            template_full_seq.id + '-pdbfixed.fasta'
        )
        import pdbfixer
        import simtk.openmm.app
        template_filepath = os.path.join(
            ensembler.core.default_project_dirnames.templates_structures_resolved,
            template_full_seq.id + '.pdb'
        )
        fixer = pdbfixer.PDBFixer(filename=template_filepath)
        # Register the full UniProt sequence (as 3-letter residue codes) on
        # the structure so PDBFixer can detect the missing residues.
        # NOTE(review): uses PDBFixer internals (fixer.structure, internal
        # pdbstructure.Sequence) — tied to a specific PDBFixer version.
        chainid = next(fixer.structure.iter_chains()).chain_id
        seq_obj = simtk.openmm.app.internal.pdbstructure.Sequence(chainid)
        for r in template_full_seq.seq:
            resi3 = Bio.SeqUtils.seq3(r).upper()
            seq_obj.residues.append(resi3)
        fixer.structure.sequences.append(seq_obj)
        fixer.findMissingResidues()
        remove_missing_residues_at_termini(fixer, len_full_seq=len(template_full_seq.seq))
        # Early return with the missing-residue map if the pdbfixed structure
        # already exists and overwriting was not requested.
        if not overwrite_structures and os.path.exists(template_pdbfixed_filepath):
            return fixer.missingResidues
        fixer.findMissingAtoms()
        (newTopology, newPositions, newAtoms, existingAtomMap) = fixer._addAtomsToTopology(True, True)
        fixer.topology = newTopology
        fixer.positions = newPositions
        with open(template_pdbfixed_filepath, 'w') as template_pdbfixed_file:
            simtk.openmm.app.PDBFile.writeFile(
                fixer.topology, fixer.positions, file=template_pdbfixed_file
            )

        # Write sequence to file
        seq_pdbfixed = ''.join([Bio.SeqUtils.seq1(r.name) for r in fixer.topology.residues()])
        seq_record_pdbfixed = SeqRecord(Seq(seq_pdbfixed), id=template_full_seq.id,
                                        description=template_full_seq.id)
        Bio.SeqIO.write([seq_record_pdbfixed], seq_pdbfixed_filepath, 'fasta')

        return fixer.missingResidues
    except (KeyboardInterrupt, ImportError):
        # Interrupts and missing optional dependencies must propagate.
        raise
    except Exception as e:
        # Any other failure is logged to a per-template YAML file; the
        # function returns None in this case.
        trbk = traceback.format_exc()
        log_filepath = os.path.abspath(os.path.join(
            ensembler.core.default_project_dirnames.templates_structures_modeled_loops,
            template_full_seq.id + '-pdbfixer-log.yaml'
        ))
        logfile = ensembler.core.LogFile(log_filepath)
        logfile.log({
            'templateid': str(template_full_seq.id),
            'exception': e,
            'traceback': ensembler.core.literal_str(trbk),
            'mpi_rank': mpistate.rank,
        })
        logger.error(
            'MPI rank %d pdbfixer error for template %s - see logfile'
            % (mpistate.rank, template_full_seq.id)
        )
        logger.debug(e)
        logger.debug(trbk)
def refine_implicit_md(
        openmm_platform=None, gpupn=1, process_only_these_targets=None,
        process_only_these_templates=None, model_seqid_cutoff=None,
        write_trajectory=False,
        include_disulfide_bonds=False,
        custom_residue_variants=None,
        ff='amber99sbildn',
        implicit_water_model='amber99_obc',
        sim_length=100.0 * unit.picoseconds,
        timestep=2.0 * unit.femtoseconds,   # timestep
        temperature=300.0 * unit.kelvin,   # simulation temperature
        collision_rate=20.0 / unit.picoseconds,   # Langevin collision rate
        cutoff=None,   # nonbonded cutoff
        minimization_tolerance=10.0 * unit.kilojoules_per_mole / unit.nanometer,
        minimization_steps=20,
        nsteps_per_iteration=500,
        ph=None,
        retry_failed_runs=False,
        cpu_platform_threads=1,
        loglevel=None):
    # TODO - refactor
    """Run MD refinement in implicit solvent.

    MPI-enabled.

    For each selected target/template model that is unique by clustering,
    builds an OpenMM implicit-solvent system (using protonation variants
    from the highest-seqid reference model), minimizes, runs Langevin
    dynamics, and writes 'implicit-refined.pdb.gz' plus per-model YAML logs.
    """
    ensembler.utils.set_loglevel(loglevel)
    # Each MPI rank is assigned a GPU id modulo the GPUs-per-node count.
    gpuid = mpistate.rank % gpupn
    manual_overrides = ManualOverrides()
    if ph is None:
        if manual_overrides.refinement.ph is not None:
            ph = manual_overrides.refinement.ph
        else:
            ph = 7.0
    if custom_residue_variants is None:
        # deepcopy so later per-target mutation cannot affect the overrides.
        custom_residue_variants = deepcopy(
            manual_overrides.refinement.custom_residue_variants_by_targetid
        )

    # Clamp the iteration size to the total number of steps requested.
    if (sim_length / timestep) < nsteps_per_iteration:
        nsteps_per_iteration = int(sim_length / timestep)
    niterations = int((sim_length / timestep) / nsteps_per_iteration)

    models_dir = os.path.abspath(ensembler.core.default_project_dirnames.models)

    targets, templates_resolved_seq = ensembler.core.get_targets_and_templates()

    if process_only_these_templates:
        selected_template_indices = [
            i for i, seq in enumerate(templates_resolved_seq)
            if seq.id in process_only_these_templates
        ]
    else:
        selected_template_indices = range(len(templates_resolved_seq))

    if not openmm_platform:
        openmm_platform = auto_select_openmm_platform()

    if openmm_platform == 'CPU':
        platform_properties = {'CpuThreads': str(cpu_platform_threads)}
    else:
        platform_properties = {}

    ff_files = [ff+'.xml', implicit_water_model+'.xml']
    forcefield = app.ForceField(*ff_files)

    kB = unit.MOLAR_GAS_CONSTANT_R
    kT = kB * temperature

    def simulate_implicit_md():
        # Closure: reads model_filename, model_dir, pdb_filename,
        # reference_topology and reference_variants from the enclosing
        # per-target/per-template loop scope.

        logger.debug("Reading model...")
        with gzip.open(model_filename) as model_file:
            pdb = app.PDBFile(model_file)

        # Set up Platform
        platform = openmm.Platform.getPlatformByName(openmm_platform)
        if 'CUDA_VISIBLE_DEVICES' not in os.environ:
            # Set GPU id.
            if openmm_platform == 'CUDA':
                platform.setPropertyDefaultValue('CudaDeviceIndex', '%d' % gpuid)
            elif openmm_platform == 'OpenCL':
                platform.setPropertyDefaultValue('OpenCLDeviceIndex', '%d' % gpuid)

        # Construct Modeller object with same topology as ref structure
        # (necessary to keep disulfide bonds consistent)
        modeller = app.Modeller(reference_topology, pdb.positions)
        # set_openmm_topology_bonds_from_atom_indices(modeller.topology, reference_bonds)
        # Add missing protons.
        modeller.addHydrogens(forcefield, pH=ph, variants=reference_variants)
        topology = modeller.getTopology()
        positions = modeller.getPositions()

        logger.debug("Constructing System object...")
        if cutoff is None:
            system = forcefield.createSystem(topology, nonbondedMethod=app.NoCutoff, constraints=app.HBonds)
        else:
            system = forcefield.createSystem(topology, nonbondedMethod=app.CutoffNonPeriodic, nonbondedCutoff=cutoff, constraints=app.HBonds)

        logger.debug("Creating Context...")
        integrator = openmm.LangevinIntegrator(temperature, collision_rate, timestep)
        context = openmm.Context(system, integrator, platform, platform_properties)
        context.setPositions(positions)

        logger.debug("Minimizing structure...")
        openmm.LocalEnergyMinimizer.minimize(context, minimization_tolerance, minimization_steps)

        if write_trajectory:
            # Open trajectory for writing.
            logger.debug("Opening trajectory for writing...")
            trajectory_filename = os.path.join(model_dir, 'implicit-trajectory.pdb.gz')
            trajectory_outfile = gzip.open(trajectory_filename, 'w')
            app.PDBFile.writeHeader(topology, file=trajectory_outfile)

        # Open energy trajectory for writing
        energy_filename = os.path.join(model_dir, 'implicit-energies.txt')
        energy_outfile = open(energy_filename, 'w')
        energy_outfile.write('# iteration | simulation time (ps) | potential_energy (kT) | kinetic_energy (kT) | ns per day\n')

        logger.debug("Running dynamics...")
        import time
        initial_time = time.time()
        for iteration in range(niterations):
            # integrate dynamics
            integrator.step(nsteps_per_iteration)
            # get current state
            state = context.getState(getEnergy=True, getPositions=True)
            simulation_time = state.getTime()
            potential_energy = state.getPotentialEnergy()
            kinetic_energy = state.getKineticEnergy()
            final_time = time.time()
            elapsed_time = (final_time - initial_time) * unit.seconds
            ns_per_day = (simulation_time / elapsed_time) / (unit.nanoseconds / unit.day)
            logger.debug(
                "  %8.1f ps : potential %8.3f kT | kinetic %8.3f kT | %.3f ns/day | %.3f s remain"
                % (
                    simulation_time / unit.picoseconds,
                    potential_energy / kT,
                    kinetic_energy / kT,
                    ns_per_day,
                    elapsed_time * (niterations-iteration-1) / (iteration+1) / unit.seconds
                )
            )

            # Check energies are still finite.
            if np.isnan(potential_energy/kT) or np.isnan(kinetic_energy/kT):
                raise Exception("Potential or kinetic energies are nan.")

            if write_trajectory:
                app.PDBFile.writeModel(topology, state.getPositions(), file=trajectory_outfile, modelIndex=iteration)

            # write data
            energy_outfile.write("  %8d %8.1f %8.3f %8.3f %.3f\n" % (iteration, simulation_time / unit.picoseconds, potential_energy / kT, kinetic_energy / kT, ns_per_day))
            energy_outfile.flush()

        if write_trajectory:
            app.PDBFile.writeFooter(topology, file=trajectory_outfile)
            trajectory_outfile.close()

        energy_outfile.close()

        # Write final PDB file.
        pdb_outfile = gzip.open(pdb_filename, 'wt')
        app.PDBFile.writeHeader(topology, file=pdb_outfile)
        app.PDBFile.writeFile(topology, state.getPositions(), file=pdb_outfile)
        app.PDBFile.writeFooter(topology, file=pdb_outfile)
        pdb_outfile.close()

    # Process targets
    print('Processing targets...')  # DEBUG
    for target in targets:
        if (process_only_these_targets is not None) and (target.id not in process_only_these_targets):
            print('Skipping because %s is not in process_only_these_targets' % target.id)
            print(process_only_these_targets)
            continue

        logger.info('Processing %s' % target)
        models_target_dir = os.path.join(models_dir, target.id)
        if mpistate.rank == 0:
            target_starttime = datetime.datetime.utcnow()
            if not os.path.exists(models_target_dir):
                print('%s does not exist, skipping' % models_target_dir)
                continue

        mpistate.comm.Barrier()

        # ========
        # Determine topology (including protonation state) to use throughout
        # ========

        reference_model_id = get_highest_seqid_existing_model(models_target_dir=models_target_dir)
        if reference_model_id is None:
            continue

        reference_model_path = os.path.join(models_target_dir, reference_model_id, 'model.pdb.gz')

        with gzip.open(reference_model_path) as reference_pdb_file:
            reference_pdb = app.PDBFile(reference_pdb_file)

        logger.debug("Using %s as highest identity model" % (reference_model_id))

        if not include_disulfide_bonds:
            remove_disulfide_bonds_from_topology(reference_pdb.topology)

        # Build topology for reference model
        logger.debug("Creating app.Modeller instance...")
        modeller = app.Modeller(reference_pdb.topology, reference_pdb.positions)
        reference_topology = modeller.topology
        logger.debug("Adding hydrogens...")
        reference_variants = modeller.addHydrogens(forcefield, pH=ph)
        if target.id in custom_residue_variants:
            apply_custom_residue_variants(reference_variants, custom_residue_variants[target.id])
        logger.debug("Reference variants extracted:")
        if reference_variants is not None:
            for (residue_index, residue) in enumerate(reference_variants):
                if residue is not None:
                    logger.debug("%8d %s" % (residue_index+1, residue))
            logger.debug("")
        else:
            logger.debug(reference_variants)

        if model_seqid_cutoff:
            process_only_these_templates = ensembler.core.select_templates_by_seqid_cutoff(target.id, seqid_cutoff=model_seqid_cutoff)
            selected_template_indices = [i for i, seq in enumerate(templates_resolved_seq) if seq.id in process_only_these_templates]

        ntemplates_selected = len(selected_template_indices)

        # Stride templates across MPI ranks.
        for template_index in range(mpistate.rank, ntemplates_selected, mpistate.size):
            template = templates_resolved_seq[selected_template_indices[template_index]]

            model_dir = os.path.join(models_target_dir, template.id)
            if not os.path.exists(model_dir):
                continue

            # Only simulate models that are unique following filtering by clustering.
            unique_by_clustering = os.path.exists(os.path.join(model_dir, 'unique_by_clustering'))
            if not unique_by_clustering:
                continue

            # Pass if this simulation has already been run.
            log_filepath = os.path.join(model_dir, 'implicit-log.yaml')
            if os.path.exists(log_filepath):
                with open(log_filepath) as log_file:
                    log_data = yaml.load(log_file, Loader=ensembler.core.YamlLoader)
                if log_data.get('successful') is True:
                    continue
                if log_data.get('finished') is True and (retry_failed_runs is False and log_data.get('successful') is False):
                    continue

            # Check to make sure the initial model file is present.
            model_filename = os.path.join(model_dir, 'model.pdb.gz')
            if not os.path.exists(model_filename):
                logger.debug('model.pdb.gz not present: target %s template %s rank %d gpuid %d' % (target.id, template.id, mpistate.rank, gpuid))
                continue

            pdb_filename = os.path.join(model_dir, 'implicit-refined.pdb.gz')

            logger.info("-------------------------------------------------------------------------")
            logger.info("Simulating %s => %s in implicit solvent for %.1f ps (MPI rank: %d, GPU ID: %d)" % (target.id, template.id, niterations * nsteps_per_iteration * timestep / unit.picoseconds, mpistate.rank, gpuid))
            logger.info("-------------------------------------------------------------------------")

            # Open log file
            log_data = {
                'mpi_rank': mpistate.rank,
                'gpuid': gpuid if 'CUDA_VISIBLE_DEVICES' not in os.environ else os.environ['CUDA_VISIBLE_DEVICES'],
                'openmm_platform': openmm_platform,
                'finished': False,
                'sim_length': str(sim_length),
                'timestep': str(timestep),
                'temperature': str(temperature),
                'ph': ph,
            }
            log_file = ensembler.core.LogFile(log_filepath)
            log_file.log(new_log_data=log_data)

            try:
                start = datetime.datetime.utcnow()
                simulate_implicit_md()
                timing = ensembler.core.strf_timedelta(datetime.datetime.utcnow() - start)
                log_data = {
                    'finished': True,
                    'timing': timing,
                    'successful': True,
                }
                log_file.log(new_log_data=log_data)
            except Exception as e:
                # Record the failure in the per-model log and keep going;
                # failed runs can be re-attempted with retry_failed_runs.
                trbk = traceback.format_exc()
                warnings.warn(
                    '= ERROR start: MPI rank {0} hostname {1} gpuid {2} =\n{3}\n{4}\n= ERROR end: MPI rank {0} hostname {1} gpuid {2}'.format(
                        mpistate.rank, socket.gethostname(), gpuid, e, trbk
                    )
                )
                timing = ensembler.core.strf_timedelta(datetime.datetime.utcnow() - start)
                log_data = {
                    'exception': e,
                    'traceback': ensembler.core.literal_str(trbk),
                    'timing': timing,
                    'finished': True,
                    'successful': False,
                }
                log_file.log(new_log_data=log_data)

        logger.debug('Finished template loop: rank %d' % mpistate.rank)

        mpistate.comm.Barrier()

        # Rank 0 writes the per-target stage metadata.
        if mpistate.rank == 0:
            project_metadata = ensembler.core.ProjectMetadata(project_stage='refine_implicit_md', target_id=target.id)
            datestamp = ensembler.core.get_utcnow_formatted()

            command = ['find', models_target_dir, '-name', 'implicit-refined.pdb.gz']
            output = subprocess.check_output(command)
            nsuccessful_refinements = output.decode('UTF-8').count('\n')

            target_timedelta = datetime.datetime.utcnow() - target_starttime

            metadata = {
                'target_id': target.id,
                'datestamp': datestamp,
                'timing': ensembler.core.strf_timedelta(target_timedelta),
                'openmm_platform': openmm_platform,
                'process_only_these_targets': process_only_these_targets,
                'process_only_these_templates': process_only_these_templates,
                'model_seqid_cutoff': model_seqid_cutoff,
                'write_trajectory': write_trajectory,
                'include_disulfide_bonds': include_disulfide_bonds,
                'custom_residue_variants': custom_residue_variants,
                'ff': ff,
                'implicit_water_model': implicit_water_model,
                'sim_length': str(sim_length),
                'timestep': str(timestep),
                'temperature': str(temperature),
                'collision_rate': str(collision_rate),
                'cutoff': str(cutoff),
                'nsteps_per_iteration': nsteps_per_iteration,
                'ph': ph,
                'nsuccessful_refinements': nsuccessful_refinements,
                'python_version': sys.version.split('|')[0].strip(),
                'python_full_version': ensembler.core.literal_str(sys.version),
                'ensembler_version': ensembler.version.short_version,
                'ensembler_commit': ensembler.version.git_revision,
                'biopython_version': Bio.__version__,
                'openmm_version': simtk.openmm.version.short_version,
                'openmm_commit': simtk.openmm.version.git_revision,
            }

            project_metadata.add_data(metadata)
            project_metadata.write()

        mpistate.comm.Barrier()

    mpistate.comm.Barrier()
    if mpistate.rank == 0:
        logger.info('Done.')
def __init__(self, targetid, traj_filepath=None, topol_filepath=None,
             models_data_filepath=None, process_only_these_templates=None,
             loglevel=None, run_main=True):
    """
    Makes trajectory of the model files with added hydrogens, but prior to
    any refinement.

    For the specified target, makes a single topology pdb file, a single
    trajectory xtc file, and individual pdb files for each model.

    See docs on `MkTraj` for further info on parameters.

    Examples
    --------
    MkTrajImplicitStart(targetid='EGFR_HUMAN_D0')
    """
    ensembler.utils.set_loglevel(loglevel)
    ensembler.core.check_project_toplevel_dir()
    self.models_target_dir = os.path.join(default_project_dirnames.models,
                                          targetid)

    logger.debug('Working on target %s' % targetid)

    self.ensembler_stage = 'implicit-start'
    self.model_filename = 'implicit-start.pdb.gz'

    # Default output paths are derived from the stage name.
    if traj_filepath is None:
        self.traj_filepath = os.path.join(
            self.models_target_dir,
            'traj-{0}.xtc'.format(self.ensembler_stage))
    else:
        self.traj_filepath = traj_filepath

    if topol_filepath is None:
        self.topol_filepath = os.path.join(
            self.models_target_dir,
            'traj-{0}-topol.pdb'.format(self.ensembler_stage))
    else:
        self.topol_filepath = topol_filepath

    if models_data_filepath is None:
        self.models_data_filepath = os.path.join(
            self.models_target_dir,
            'traj-{0}-data.csv'.format(self.ensembler_stage))
    else:
        self.models_data_filepath = models_data_filepath

    if process_only_these_templates:
        self.templateids = process_only_these_templates
    else:
        # FIX: only the immediate subdirectory names are needed, so take the
        # first os.walk entry directly instead of materializing the entire
        # recursive walk into a list.
        self.templateids = next(os.walk(self.models_target_dir))[1]

    if run_main:
        self._gen_implicit_start_models()
        self._gen_df(model_filename=self.model_filename)
        self.df.to_csv(self.models_data_filepath,
                       columns=['templateid', 'seqid'])
        self._construct_traj()
        self._superpose()
        self._write_traj()
def __init__(self, targetid, ensembler_stage=None, traj_filepath=None,
             topol_filepath=None, models_data_filepath=None,
             process_only_these_templates=None, loglevel=None, run_main=True):
    """Makes a trajectory for a given target, using mdtraj.

    The trajectory can be used with other software, e.g. for visualization
    with PyMOL or VMD.

    Parameters
    ----------
    targetid : str
        e.g. 'EGFR_HUMAN_D0'
    ensembler_stage : str
        The Ensembler stage from which to build models, e.g. 'build_models'
        results in a trajectory built from the 'model.pdb.gz' files output
        by the build_models command.
        options: build_models|refine_implicit_md|refine_explicit_md
        default: most advanced stage for which model files are available
    traj_filepath : str
        default: models/[targetid]/traj-[ensembler_stage].xtc
    topol_filepath : str
        default: models/[targetid]/traj-[ensembler_stage]-topol.pdb
    models_data_filepath :
        default: models/[targetid]/traj-[ensembler_stage]-data.csv
    process_only_these_templates : list of str

    Returns
    -------
    traj : mdtraj.Trajectory
    df : pandas.DataFrame
        models data (e.g. sequence identities)
    """
    ensembler.utils.set_loglevel(loglevel)
    ensembler.core.check_project_toplevel_dir()
    self.models_target_dir = os.path.join(default_project_dirnames.models, targetid)

    logger.debug('Working on target %s' % targetid)

    # If no stage is specified, use the most advanced stage for which model
    # files exist for this target.
    if ensembler_stage is None:
        self.ensembler_stage = get_most_advanced_ensembler_modeling_stage(targetid)
    else:
        self.ensembler_stage = ensembler_stage

    # Default output paths are derived from the target dir and stage name.
    if traj_filepath is None:
        self.traj_filepath = os.path.join(
            self.models_target_dir,
            'traj-{0}.xtc'.format(self.ensembler_stage))
    else:
        self.traj_filepath = traj_filepath

    if topol_filepath is None:
        self.topol_filepath = os.path.join(
            self.models_target_dir,
            'traj-{0}-topol.pdb'.format(self.ensembler_stage))
    else:
        self.topol_filepath = topol_filepath

    if models_data_filepath is None:
        self.models_data_filepath = os.path.join(
            self.models_target_dir,
            'traj-{0}-data.csv'.format(self.ensembler_stage))
    else:
        self.models_data_filepath = models_data_filepath

    if process_only_these_templates:
        self.templateids = process_only_these_templates
    else:
        # os.walk yields (dirpath, dirnames, filenames); only the top-level
        # dirnames (the template IDs) are needed, so take the first tuple
        # rather than materializing the entire directory tree walk.
        self.templateids = next(os.walk(self.models_target_dir))[1]

    if run_main:
        self._gen_df()
        self.df.to_csv(self.models_data_filepath, columns=['templateid', 'seqid'])
        self._construct_traj()
        self._superpose()
        self._write_traj()
def pdbfix_template(template_full_seq, overwrite_structures=False):
    """
    Run PDBFixer on a resolved template structure to build in missing
    (unresolved) internal residues, writing the fixed structure and its
    sequence to the templates/structures-modeled-loops directory.

    Parameters
    ----------
    template_full_seq: BioPython SeqRecord
        full UniProt sequence for span of the template (including unresolved
        residues)
    overwrite_structures: bool
        If False, an existing pdbfixed structure is kept and only the
        missing-residue dict is returned.

    Returns
    -------
    fixer.missingResidues
        (None is returned implicitly if an unexpected exception occurs;
        the failure is written to a per-template YAML logfile.)
    """
    try:
        template_pdbfixed_filepath = os.path.join(
            ensembler.core.default_project_dirnames.templates_structures_modeled_loops,
            template_full_seq.id + '-pdbfixed.pdb')
        seq_pdbfixed_filepath = os.path.join(
            ensembler.core.default_project_dirnames.templates_structures_modeled_loops,
            template_full_seq.id + '-pdbfixed.fasta')
        # Imported lazily; ImportError is re-raised below rather than logged.
        import pdbfixer
        import simtk.openmm.app
        template_filepath = os.path.join(
            ensembler.core.default_project_dirnames.templates_structures_resolved,
            template_full_seq.id + '.pdb')
        fixer = pdbfixer.PDBFixer(filename=template_filepath)
        chainid = next(fixer.topology.chains()).id
        # Register the full target sequence (three-letter residue codes) so
        # PDBFixer can determine which residues are missing from the
        # resolved structure.
        sequence = [
            Bio.SeqUtils.seq3(r).upper() for r in template_full_seq.seq
        ]
        seq_obj = pdbfixer.pdbfixer.Sequence(chainid, sequence)
        fixer.sequences.append(seq_obj)
        fixer.findMissingResidues()
        # Missing termini are not modeled — only internal loops are built.
        remove_missing_residues_at_termini(fixer, len_full_seq=len(template_full_seq.seq))
        # The overwrite check comes only after findMissingResidues because
        # the return value (fixer.missingResidues) requires that call.
        if not overwrite_structures and os.path.exists(template_pdbfixed_filepath):
            return fixer.missingResidues
        fixer.findMissingAtoms()
        # NOTE(review): uses the private PDBFixer API _addAtomsToTopology —
        # presumably to add missing residues/atoms without the heavy-atom
        # replacement done by the public addMissingAtoms; confirm against the
        # pinned pdbfixer version.
        (newTopology, newPositions, newAtoms, existingAtomMap) = fixer._addAtomsToTopology(True, True)
        fixer.topology = newTopology
        fixer.positions = newPositions
        with open(template_pdbfixed_filepath, 'w') as template_pdbfixed_file:
            simtk.openmm.app.PDBFile.writeFile(
                fixer.topology, fixer.positions, file=template_pdbfixed_file)

        # Write sequence to file
        seq_pdbfixed = ''.join(
            [Bio.SeqUtils.seq1(r.name) for r in fixer.topology.residues()])
        seq_record_pdbfixed = SeqRecord(
            Seq(seq_pdbfixed), id=template_full_seq.id,
            description=template_full_seq.id)
        Bio.SeqIO.write([seq_record_pdbfixed], seq_pdbfixed_filepath, 'fasta')

        return fixer.missingResidues
    except (KeyboardInterrupt, ImportError):
        raise
    except Exception as e:
        # Any other failure is logged per-template (YAML) so an MPI batch run
        # over many templates can continue.
        trbk = traceback.format_exc()
        log_filepath = os.path.abspath(
            os.path.join(
                ensembler.core.default_project_dirnames.templates_structures_modeled_loops,
                template_full_seq.id + '-pdbfixer-log.yaml'))
        logfile = ensembler.core.LogFile(log_filepath)
        logfile.log({
            'templateid': str(template_full_seq.id),
            'exception': e,
            'traceback': ensembler.core.literal_str(trbk),
            'mpi_rank': mpistate.rank,
        })
        logger.error(
            'MPI rank %d pdbfixer error for template %s - see logfile'
            % (mpistate.rank, template_full_seq.id))
        logger.debug(e)
        logger.debug(trbk)
def package_for_fah(process_only_these_targets=None,
                    process_only_these_templates=None,
                    model_seqid_cutoff=None,
                    model_validation_score_cutoff=None,
                    model_validation_score_percentile=None,
                    nclones=1,
                    archive=False,
                    openmm_platform='Reference',
                    temperature=300.0 * unit.kelvin,
                    collision_rate=1.0 / unit.picosecond,
                    timestep=2.0 * unit.femtoseconds,
                    loglevel=None):
    """
    Create the input files and directory structure necessary to start a
    Folding@Home project.

    MPI-enabled.

    Parameters
    ----------
    archive : Bool
        A .tgz compressed archive will be created for each individual RUN
        directory.
    """
    set_loglevel(loglevel)

    # Root rank creates the top-level FAH projects directory; all ranks wait.
    if mpistate.rank == 0:
        if not os.path.exists(fah_projects_dir):
            os.mkdir(fah_projects_dir)
    mpistate.comm.Barrier()

    targets, templates_resolved_seq = get_targets_and_templates()

    for target in targets:
        if process_only_these_targets and (target.id not in process_only_these_targets):
            continue

        target_project_dir = os.path.join(fah_projects_dir, target.id)
        models_target_dir = os.path.join(default_project_dirnames.models, target.id)

        # Skip targets with no model output.
        if not os.path.exists(models_target_dir):
            continue

        mpistate.comm.Barrier()

        # Placeholders so non-root ranks have names to receive broadcasts into.
        sorted_valid_templates = []
        system = None
        renumbered_resnums = {}

        # Root rank selects/ranks templates and builds the shared System;
        # results are broadcast to all ranks below.
        if mpistate.rank == 0:
            logger.info('-------------------------------------------------------------------------')
            logger.info('Building FAH OpenMM project for target {}'.format(target.id))
            logger.info('-------------------------------------------------------------------------')

            valid_templates = get_valid_templates_for_target(
                target,
                templates_resolved_seq,
                process_only_these_templates=process_only_these_templates,
                model_seqid_cutoff=model_seqid_cutoff,
                model_validation_score_cutoff=model_validation_score_cutoff,
                model_validation_score_percentile=model_validation_score_percentile
            )

            sorted_valid_templates = sort_valid_templates_by_seqid(
                target,
                valid_templates
            )

            create_target_project_dir(target)

            # System/integrator files are built once from the top-ranked
            # (highest seqid) template and reused for every RUN.
            system = setup_system_and_integrator_files(
                target,
                sorted_valid_templates[0],
                temperature,
                collision_rate,
                timestep
            )

            renumbered_resnums = get_renumbered_topol_resnums(target)

        sorted_valid_templates = mpistate.comm.bcast(sorted_valid_templates, root=0)
        system = mpistate.comm.bcast(system, root=0)
        renumbered_resnums = mpistate.comm.bcast(renumbered_resnums, root=0)

        logger.debug("Building RUNs in parallel...")

        # Round-robin work division: each rank builds every size-th RUN.
        for run_index in range(mpistate.rank, len(sorted_valid_templates), mpistate.size):
            template = sorted_valid_templates[run_index]

            logger.info('-------------------------------------------------------------------------')
            logger.info(
                'Building RUN{} for template {}'.format(
                    run_index, template
                )
            )
            logger.info('-------------------------------------------------------------------------')

            source_dir = os.path.join(models_target_dir, template)
            generate_fah_run(
                target_project_dir,
                template,
                source_dir,
                system,
                run_index,
                nclones,
                temperature,
                collision_rate,
                timestep,
                openmm_platform,
                renumbered_resnums,
            )

            if archive:
                tgz_fah_run(target, run_index)

    mpistate.comm.Barrier()
    if mpistate.rank == 0:
        logger.info('Done.')
def generate_fah_run(target_project_dir, template, source_dir, system,
                     run_index, nclones, temperature, collision_rate,
                     timestep, openmm_platform, renumbered_resnums,
                     ):
    """
    Build Folding@Home RUN and CLONE subdirectories from (possibly compressed)
    OpenMM serialized XML files.

    ARGUMENTS

    run (int) - run index
    """
    logger.debug("Building RUN %d" % run_index)

    try:
        # Determine directory and pathnames.
        run_dir = os.path.join(target_project_dir, 'RUN%d' % run_index)
        run_template_id_filepath = os.path.join(run_dir, 'template.txt')
        run_seqid_filepath = os.path.join(run_dir, 'sequence-identity.txt')
        run_protein_structure_filepath = os.path.join(run_dir, 'protein.pdb')
        run_system_structure_filepath = os.path.join(run_dir, 'system.pdb')
        # The last CLONE's state file doubles as a completion marker below.
        run_final_state_filepath = os.path.join(run_dir, 'state%d.xml' % (nclones - 1))
        source_seqid_filepath = os.path.join(source_dir, 'sequence-identity.txt')
        source_protein_structure_filepath = os.path.join(source_dir, 'implicit-refined.pdb.gz')
        source_system_structure_filepath = os.path.join(source_dir, 'explicit-refined.pdb.gz')
        source_openmm_state_filepath = os.path.join(source_dir, 'explicit-state.xml')

        # Return if this directory has already been set up.
        if os.path.exists(run_dir):
            if (
                os.path.exists(run_template_id_filepath)
                and os.path.exists(run_seqid_filepath)
                and os.path.exists(run_protein_structure_filepath)
                and os.path.exists(run_system_structure_filepath)
                and os.path.exists(run_final_state_filepath)
            ):
                return
        else:
            # Construct run directory if it does not exist.
            if not os.path.exists(run_dir):
                os.makedirs(run_dir)

        # Write template ID
        with open(run_template_id_filepath, 'w') as outfile:
            outfile.write(template + '\n')

        # Write the protein and system structure pdbs.
        # If renumbered residue numbers are available for a stage, rewrite the
        # structure with them; otherwise copy the (possibly gzipped) source.
        if 'implicit' in renumbered_resnums:
            write_renumbered_structure(
                source_protein_structure_filepath,
                run_protein_structure_filepath,
                renumbered_resnums['implicit'],
            )
        else:
            with open(run_protein_structure_filepath, 'w') as protein_structure_file:
                protein_structure_file.write(
                    read_file_contents_gz_or_not(source_protein_structure_filepath)
                )

        if 'explicit' in renumbered_resnums:
            write_renumbered_structure(
                source_system_structure_filepath,
                run_system_structure_filepath,
                renumbered_resnums['explicit'],
            )
        else:
            with open(run_system_structure_filepath, 'w') as system_structure_file:
                system_structure_file.write(
                    read_file_contents_gz_or_not(source_system_structure_filepath)
                )

        # The explicit-solvent refined state supplies positions and box vectors.
        state = mm.XmlSerializer.deserialize(
            read_file_contents_gz_or_not(source_openmm_state_filepath)
        )

        # Write sequence identity.
        with open(run_seqid_filepath, 'w') as run_seqid_file:
            run_seqid_file.write(read_file_contents_gz_or_not(source_seqid_filepath))

        # Create new integrator to use.
        integrator = mm.LangevinIntegrator(temperature, collision_rate, timestep)

        # Create Context so we can randomize velocities.
        platform = mm.Platform.getPlatformByName(openmm_platform)
        context = mm.Context(system, integrator, platform)
        context.setPositions(state.getPositions())
        box_vectors = state.getPeriodicBoxVectors()
        context.setPeriodicBoxVectors(*box_vectors)

        # Create clones with different random initial velocities.
        for clone_index in range(nclones):
            state_filename = os.path.join(run_dir, 'state%d.xml' % clone_index)
            if os.path.exists(state_filename):
                # Existing clone state files are kept (supports resuming).
                continue
            context.setVelocitiesToTemperature(temperature)
            state = context.getState(
                getPositions=True,
                getVelocities=True,
                getForces=True,
                getEnergy=True,
                getParameters=True,
                enforcePeriodicBox=True
            )
            with open(state_filename, 'w') as state_file:
                state_file.write(mm.XmlSerializer.serialize(state))

    except Exception as e:
        # NOTE(review): failures are printed (not raised) so sibling RUNs can
        # continue under MPI; a failed RUN directory may be left incomplete.
        import traceback
        print(traceback.format_exc())
        print(str(e))
def cluster_models(process_only_these_targets=None, cutoff=0.06, loglevel=None):
    """Cluster models based on RMSD, and filter out non-unique models as
    determined by a given cutoff.

    Parameters
    ----------
    process_only_these_targets : list of str, optional
        If set, only targets with these IDs are processed.
    cutoff : float
        Minimum distance cutoff for RMSD clustering (nm)
    loglevel : optional
        Logging level passed to ensembler.utils.set_loglevel.

    Runs serially.
    """
    # TODO refactor
    ensembler.utils.set_loglevel(loglevel)
    targets, templates_resolved_seq = get_targets_and_templates()
    templates = templates_resolved_seq

    for target in targets:
        if process_only_these_targets and (target.id not in process_only_these_targets):
            continue

        models_target_dir = os.path.join(
            ensembler.core.default_project_dirnames.models, target.id)
        if not os.path.exists(models_target_dir):
            continue

        # =============================
        # Construct a mdtraj trajectory containing all models
        # =============================

        starttime = datetime.datetime.utcnow()

        logger.debug('Building a list of valid models...')

        model_pdbfilenames_compressed = {
            template.id: os.path.join(models_target_dir, template.id, 'model.pdb.gz')
            for template in templates
        }
        model_pdbfilenames_uncompressed = {
            template.id: os.path.join(models_target_dir, template.id, 'model.pdb')
            for template in templates
        }

        # A model is valid if its compressed pdb exists.
        valid_templateids = [
            templateid for templateid in model_pdbfilenames_compressed
            if os.path.exists(model_pdbfilenames_compressed[templateid])
        ]

        # Write uncompressed model.pdb files from model.pdb.gz if necessary
        # (mdtraj.load reads the uncompressed files).
        for templateid in valid_templateids:
            if not os.path.exists(
                    model_pdbfilenames_uncompressed[templateid]
            ) or os.path.getsize(
                    model_pdbfilenames_uncompressed[templateid]) == 0:
                # gzip.open with the default mode yields bytes, so the output
                # file must be opened in binary mode ('wb'); text mode would
                # raise TypeError under Python 3.
                with gzip.open(model_pdbfilenames_compressed[templateid]
                               ) as model_pdbfile_compressed:
                    with open(model_pdbfilenames_uncompressed[templateid],
                              'wb') as model_pdbfile:
                        model_pdbfile.write(model_pdbfile_compressed.read())

        logger.info('Constructing a trajectory containing all valid models...')

        if len(valid_templateids) == 0:
            logger.info('No models found for target {0}.'.format(target.id))
            continue

        valid_model_pdbfilenames_uncompressed = [
            model_pdbfilenames_uncompressed[templateid]
            for templateid in valid_templateids
        ]

        traj = mdtraj.load(valid_model_pdbfilenames_uncompressed)

        # =============================
        # Clustering
        # =============================

        logger.info('Conducting RMSD-based clustering...')

        # Remove any existing unique_by_clustering files
        for f in glob.glob(models_target_dir + '/*_PK_*/unique_by_clustering'):
            os.unlink(f)

        # Cluster on C-alpha atom positions only.
        CAatoms = [a.index for a in traj.topology.atoms if a.name == 'CA']
        unique_templateids = models_regular_spatial_clustering(
            valid_templateids, traj, atom_indices=CAatoms, cutoff=cutoff)
        write_unique_by_clustering_files(unique_templateids, models_target_dir)

        with open(os.path.join(models_target_dir, 'unique-models.txt'),
                  'w') as uniques_file:
            for u in unique_templateids:
                uniques_file.write(u + '\n')
        logger.info(
            '%d unique models (from original set of %d) using cutoff of %.3f nm'
            % (len(unique_templateids), len(valid_templateids), cutoff))

        # Remove the temporary uncompressed model files.
        for template in templates:
            model_dir = os.path.join(models_target_dir, template.id)
            model_pdbfilename = os.path.join(model_dir, 'model.pdb')
            if os.path.exists(model_pdbfilename):
                os.remove(model_pdbfilename)

        # ========
        # Metadata
        # ========

        project_metadata = ensembler.core.ProjectMetadata(
            project_stage='cluster_models', target_id=target.id)
        datestamp = ensembler.core.get_utcnow_formatted()

        timedelta = datetime.datetime.utcnow() - starttime

        metadata = {
            'target_id': target.id,
            'datestamp': datestamp,
            'nunique_models': len(unique_templateids),
            'python_version': sys.version.split('|')[0].strip(),
            'python_full_version': ensembler.core.literal_str(sys.version),
            'ensembler_version': ensembler.version.short_version,
            'ensembler_commit': ensembler.version.git_revision,
            'biopython_version': Bio.__version__,
            'mdtraj_version': mdtraj.version.short_version,
            'mdtraj_commit': mdtraj.version.git_revision,
            'timing': ensembler.core.strf_timedelta(timedelta),
        }
        project_metadata.add_data(metadata)
        project_metadata.write()
def _gen_implicit_start_models(
        self, ff='amber99sbildn.xml',
        implicit_water_model='amber99_obc.xml',
        ph=8.0):
    """
    Generate 'implicit-start' model files: for each valid model, strip
    disulfide bonds and add hydrogens at the given pH, then write the
    result to <model dir>/<self.model_filename> (gzipped PDB).

    Work is divided across MPI ranks (round-robin over templates).

    Parameters
    ----------
    ff : str
        OpenMM force field XML used for hydrogen placement.
    implicit_water_model : str
        OpenMM implicit-solvent parameter XML.
    ph : float
        pH passed to Modeller.addHydrogens to select protonation variants.
    """
    self.ph = ph
    from simtk.openmm import app
    # A model is valid only if the refine_implicit_md output file exists.
    valid_model_templateids = [
        templateid for templateid in self.templateids
        if os.path.exists(
            os.path.join(
                self.models_target_dir, templateid,
                ensembler.core.model_filenames_by_ensembler_stage['refine_implicit_md']
            )
        )
    ]
    # Skip models whose implicit-start output already exists.
    gen_model_templateids = [
        templateid for templateid in valid_model_templateids
        if not os.path.exists(
            os.path.join(self.models_target_dir, templateid, self.model_filename)
        )
    ]

    # make reference model
    forcefield = app.ForceField(ff, implicit_water_model)
    reference_model_id = get_highest_seqid_existing_model(models_target_dir=self.models_target_dir)
    logger.debug('Using {0} as reference model'.format(reference_model_id))
    reference_model_path = os.path.join(self.models_target_dir, reference_model_id, model_filenames_by_ensembler_stage['build_models'])

    with gzip.open(reference_model_path) as reference_pdb_file:
        reference_pdb = app.PDBFile(reference_pdb_file)
    remove_disulfide_bonds_from_topology(reference_pdb.topology)
    reference_topology = reference_pdb.topology
    reference_modeller = app.Modeller(reference_pdb.topology, reference_pdb.positions)
    # The reference's protonation variants are reused for every model so all
    # models end up with identical atom sets.
    reference_variants = reference_modeller.addHydrogens(forcefield, pH=self.ph)

    # Round-robin MPI work division over the templates needing generation.
    for template_index in range(mpistate.rank, len(gen_model_templateids), mpistate.size):
        templateid = gen_model_templateids[template_index]
        logger.debug('Generating implicit-start model for {0}'.format(templateid))
        try:
            input_model_filepath = os.path.join(self.models_target_dir, templateid, model_filenames_by_ensembler_stage['build_models'])
            output_model_filepath = os.path.join(self.models_target_dir, templateid, self.model_filename)
            with gzip.open(input_model_filepath) as pdb_file:
                pdb = app.PDBFile(pdb_file)
            remove_disulfide_bonds_from_topology(pdb.topology)
            # Topology is taken from the reference model; only the positions
            # come from this model — keeps topologies consistent across models.
            modeller = app.Modeller(reference_topology, pdb.positions)
            modeller.addHydrogens(forcefield, pH=self.ph, variants=reference_variants)
            topology = modeller.getTopology()
            positions = modeller.getPositions()
            with gzip.open(output_model_filepath, 'wt') as output_model_file:
                app.PDBFile.writeHeader(topology, file=output_model_file)
                app.PDBFile.writeFile(topology, positions, file=output_model_file)
                app.PDBFile.writeFooter(topology, file=output_model_file)
        except Exception as e:
            # Best-effort per model: report and move on to the next template.
            print('Error for model {0}: {1}'.format(templateid, e))
            continue
def simulate_implicit_md():
    """
    Minimize and run implicit-solvent Langevin dynamics for a single model,
    writing the refined structure (and optionally a PDB trajectory and an
    energy log) into the model directory.

    Closure: relies on names from the enclosing scope, including
    model_filename, openmm_platform, gpuid, reference_topology,
    reference_variants, forcefield, ph, cutoff, temperature, collision_rate,
    timestep, platform_properties, minimization_tolerance,
    minimization_steps, niterations, nsteps_per_iteration, kT,
    write_trajectory, model_dir and pdb_filename.
    """
    logger.debug("Reading model...")
    with gzip.open(model_filename) as model_file:
        pdb = app.PDBFile(model_file)

    # Set up Platform
    platform = openmm.Platform.getPlatformByName(openmm_platform)
    if 'CUDA_VISIBLE_DEVICES' not in os.environ:
        # Set GPU id.
        if openmm_platform == 'CUDA':
            platform.setPropertyDefaultValue('CudaDeviceIndex', '%d' % gpuid)
        elif openmm_platform == 'OpenCL':
            platform.setPropertyDefaultValue('OpenCLDeviceIndex', '%d' % gpuid)

    # Construct Modeller object with same topology as ref structure
    # (necessary to keep disulfide bonds consistent)
    modeller = app.Modeller(reference_topology, pdb.positions)
    # set_openmm_topology_bonds_from_atom_indices(modeller.topology, reference_bonds)
    # Add missing protons.
    modeller.addHydrogens(forcefield, pH=ph, variants=reference_variants)
    topology = modeller.getTopology()
    positions = modeller.getPositions()

    logger.debug("Constructing System object...")
    # cutoff is None => no nonbonded cutoff; otherwise non-periodic cutoff.
    if cutoff is None:
        system = forcefield.createSystem(topology, nonbondedMethod=app.NoCutoff, constraints=app.HBonds)
    else:
        system = forcefield.createSystem(topology, nonbondedMethod=app.CutoffNonPeriodic, nonbondedCutoff=cutoff, constraints=app.HBonds)

    logger.debug("Creating Context...")
    integrator = openmm.LangevinIntegrator(temperature, collision_rate, timestep)
    context = openmm.Context(system, integrator, platform, platform_properties)
    context.setPositions(positions)

    logger.debug("Minimizing structure...")
    openmm.LocalEnergyMinimizer.minimize(context, minimization_tolerance, minimization_steps)

    if write_trajectory:
        # Open trajectory for writing.
        logger.debug("Opening trajectory for writing...")
        trajectory_filename = os.path.join(model_dir, 'implicit-trajectory.pdb.gz')
        # NOTE(review): gzip mode 'w' is binary, but PDBFile.writeHeader emits
        # str; the final PDB below uses 'wt' — confirm this should too.
        trajectory_outfile = gzip.open(trajectory_filename, 'w')
        app.PDBFile.writeHeader(topology, file=trajectory_outfile)

    # Open energy trajectory for writing
    energy_filename = os.path.join(model_dir, 'implicit-energies.txt')
    energy_outfile = open(energy_filename, 'w')
    energy_outfile.write('# iteration | simulation time (ps) | potential_energy (kT) | kinetic_energy (kT) | ns per day\n')

    logger.debug("Running dynamics...")
    import time
    initial_time = time.time()
    for iteration in range(niterations):
        # integrate dynamics
        integrator.step(nsteps_per_iteration)
        # get current state
        state = context.getState(getEnergy=True, getPositions=True)
        simulation_time = state.getTime()
        potential_energy = state.getPotentialEnergy()
        kinetic_energy = state.getKineticEnergy()
        final_time = time.time()
        elapsed_time = (final_time - initial_time) * unit.seconds
        ns_per_day = (simulation_time / elapsed_time) / (unit.nanoseconds / unit.day)
        logger.debug(
            " %8.1f ps : potential %8.3f kT | kinetic %8.3f kT | %.3f ns/day | %.3f s remain"
            % (
                simulation_time / unit.picoseconds,
                potential_energy / kT,
                kinetic_energy / kT,
                ns_per_day,
                elapsed_time * (niterations-iteration-1) / (iteration+1) / unit.seconds
            )
        )

        # Check energies are still finite.
        if np.isnan(potential_energy/kT) or np.isnan(kinetic_energy/kT):
            raise Exception("Potential or kinetic energies are nan.")

        if write_trajectory:
            app.PDBFile.writeModel(topology, state.getPositions(), file=trajectory_outfile, modelIndex=iteration)

        # write data
        energy_outfile.write(" %8d %8.1f %8.3f %8.3f %.3f\n" % (iteration, simulation_time / unit.picoseconds, potential_energy / kT, kinetic_energy / kT, ns_per_day))
        energy_outfile.flush()

    if write_trajectory:
        app.PDBFile.writeFooter(topology, file=trajectory_outfile)
        trajectory_outfile.close()

    energy_outfile.close()

    # Write final PDB file.
    pdb_outfile = gzip.open(pdb_filename, 'wt')
    app.PDBFile.writeHeader(topology, file=pdb_outfile)
    app.PDBFile.writeFile(topology, state.getPositions(), file=pdb_outfile)
    app.PDBFile.writeFooter(topology, file=pdb_outfile)
    pdb_outfile.close()
def generate_fah_run(
        target_project_dir, template, source_dir, system, run_index, nclones,
        temperature, collision_rate, timestep, openmm_platform,
        renumbered_resnums,
        ):
    """
    Build Folding@Home RUN and CLONE subdirectories from (possibly compressed)
    OpenMM serialized XML files.

    ARGUMENTS

    run (int) - run index
    """
    logger.debug("Building RUN %d" % run_index)

    try:
        # Determine directory and pathnames.
        run_dir = os.path.join(target_project_dir, 'RUN%d' % run_index)
        run_template_id_filepath = os.path.join(run_dir, 'template.txt')
        run_seqid_filepath = os.path.join(run_dir, 'sequence-identity.txt')
        run_protein_structure_filepath = os.path.join(run_dir, 'protein.pdb')
        run_system_structure_filepath = os.path.join(run_dir, 'system.pdb')
        # The last CLONE's state file doubles as a completion marker below.
        run_final_state_filepath = os.path.join(run_dir, 'state%d.xml' % (nclones - 1))
        source_seqid_filepath = os.path.join(source_dir, 'sequence-identity.txt')
        source_protein_structure_filepath = os.path.join(
            source_dir, 'implicit-refined.pdb.gz')
        source_system_structure_filepath = os.path.join(
            source_dir, 'explicit-refined.pdb.gz')
        source_openmm_state_filepath = os.path.join(source_dir, 'explicit-state.xml')

        # Return if this directory has already been set up.
        if os.path.exists(run_dir):
            if (os.path.exists(run_template_id_filepath)
                    and os.path.exists(run_seqid_filepath)
                    and os.path.exists(run_protein_structure_filepath)
                    and os.path.exists(run_system_structure_filepath)
                    and os.path.exists(run_final_state_filepath)):
                return
        else:
            # Construct run directory if it does not exist.
            if not os.path.exists(run_dir):
                os.makedirs(run_dir)

        # Write template ID
        with open(run_template_id_filepath, 'w') as outfile:
            outfile.write(template + '\n')

        # Write the protein and system structure pdbs.
        # If renumbered residue numbers are available for a stage, rewrite the
        # structure with them; otherwise copy the (possibly gzipped) source.
        if 'implicit' in renumbered_resnums:
            write_renumbered_structure(
                source_protein_structure_filepath,
                run_protein_structure_filepath,
                renumbered_resnums['implicit'],
            )
        else:
            with open(run_protein_structure_filepath, 'w') as protein_structure_file:
                protein_structure_file.write(
                    read_file_contents_gz_or_not(
                        source_protein_structure_filepath))

        if 'explicit' in renumbered_resnums:
            write_renumbered_structure(
                source_system_structure_filepath,
                run_system_structure_filepath,
                renumbered_resnums['explicit'],
            )
        else:
            with open(run_system_structure_filepath, 'w') as system_structure_file:
                system_structure_file.write(
                    read_file_contents_gz_or_not(
                        source_system_structure_filepath))

        # The explicit-solvent refined state supplies positions and box vectors.
        state = mm.XmlSerializer.deserialize(
            read_file_contents_gz_or_not(source_openmm_state_filepath))

        # Write sequence identity.
        with open(run_seqid_filepath, 'w') as run_seqid_file:
            run_seqid_file.write(
                read_file_contents_gz_or_not(source_seqid_filepath))

        # Create new integrator to use.
        integrator = mm.LangevinIntegrator(temperature, collision_rate, timestep)

        # Create Context so we can randomize velocities.
        platform = mm.Platform.getPlatformByName(openmm_platform)
        context = mm.Context(system, integrator, platform)
        context.setPositions(state.getPositions())
        box_vectors = state.getPeriodicBoxVectors()
        context.setPeriodicBoxVectors(*box_vectors)

        # Create clones with different random initial velocities.
        for clone_index in range(nclones):
            state_filename = os.path.join(run_dir, 'state%d.xml' % clone_index)
            if os.path.exists(state_filename):
                # Existing clone state files are kept (supports resuming).
                continue
            context.setVelocitiesToTemperature(temperature)
            state = context.getState(getPositions=True,
                                     getVelocities=True,
                                     getForces=True,
                                     getEnergy=True,
                                     getParameters=True,
                                     enforcePeriodicBox=True)
            with open(state_filename, 'w') as state_file:
                state_file.write(mm.XmlSerializer.serialize(state))

    except Exception as e:
        # NOTE(review): failures are printed (not raised) so sibling RUNs can
        # continue under MPI; a failed RUN directory may be left incomplete.
        import traceback
        print(traceback.format_exc())
        print(str(e))
def __init__(self, targetid, ensembler_stage=None, traj_filepath=None,
             topol_filepath=None, models_data_filepath=None,
             process_only_these_templates=None, loglevel=None, run_main=True):
    """Build an mdtraj trajectory over the models of a single target.

    The resulting trajectory can be loaded into other tools, e.g. PyMOL or
    VMD, for visualization.

    Parameters
    ----------
    targetid : str
        Target identifier, e.g. 'EGFR_HUMAN_D0'.
    ensembler_stage : str
        Modeling stage whose model files the trajectory is built from
        (options: build_models|refine_implicit_md|refine_explicit_md).
        Defaults to the most advanced stage with model files present.
    traj_filepath : str
        Defaults to models/[targetid]/traj-[ensembler_stage].xtc
    topol_filepath : str
        Defaults to models/[targetid]/traj-[ensembler_stage]-topol.pdb
    models_data_filepath :
        Defaults to models/[targetid]/traj-[ensembler_stage]-data.csv
    process_only_these_templates : list of str

    Returns
    -------
    traj : mdtraj.Trajectory
    df : pandas.DataFrame
        models data (e.g. sequence identities)
    """
    ensembler.utils.set_loglevel(loglevel)
    ensembler.core.check_project_toplevel_dir()
    self.models_target_dir = os.path.join(default_project_dirnames.models, targetid)

    logger.debug('Working on target %s' % targetid)

    # Resolve the modeling stage first, since default paths depend on it.
    if ensembler_stage is None:
        ensembler_stage = get_most_advanced_ensembler_modeling_stage(targetid)
    self.ensembler_stage = ensembler_stage

    stage = self.ensembler_stage
    default_traj = os.path.join(
        self.models_target_dir, 'traj-{0}.xtc'.format(stage))
    default_topol = os.path.join(
        self.models_target_dir, 'traj-{0}-topol.pdb'.format(stage))
    default_data = os.path.join(
        self.models_target_dir, 'traj-{0}-data.csv'.format(stage))

    # Fall back to stage-derived defaults for any path not supplied.
    self.traj_filepath = default_traj if traj_filepath is None else traj_filepath
    self.topol_filepath = default_topol if topol_filepath is None else topol_filepath
    self.models_data_filepath = (
        default_data if models_data_filepath is None else models_data_filepath)

    # Template IDs come either from the caller or from the target dir's
    # immediate subdirectories.
    if process_only_these_templates:
        self.templateids = process_only_these_templates
    else:
        walked = list(os.walk(self.models_target_dir))
        self.templateids = walked[0][1]

    if run_main:
        self._gen_df()
        self.df.to_csv(self.models_data_filepath, columns=['templateid', 'seqid'])
        self._construct_traj()
        self._superpose()
        self._write_traj()