def build_models(process_only_these_targets=None,
                 process_only_these_templates=None,
                 model_seqid_cutoff=None,
                 write_modeller_restraints_file=False,
                 loglevel=None):
    """Uses the build_model method to build homology models for a given set of
    targets and templates.

    MPI-enabled.
    """
    # Note that this code uses an os.chdir call to switch into a temp directory before running
    # Modeller. Modeller writes various output files into the current working directory, and
    # there is no way to define where these files are written other than to chdir beforehand.
    # If this routine is run in parallel, occasional exceptions may therefore occur when
    # concurrent processes make os.chdir calls.
    ensembler.utils.set_loglevel(loglevel)
    targets, templates_resolved_seq = get_targets_and_templates()

    if process_only_these_templates:
        selected_template_indices = [
            i for i, seq in enumerate(templates_resolved_seq)
            if seq.id in process_only_these_templates
        ]
    else:
        selected_template_indices = range(len(templates_resolved_seq))

    for target in targets:
        if process_only_these_targets and target.id not in process_only_these_targets:
            continue

        target_setup_data = build_models_target_setup(target)

        if model_seqid_cutoff:
            process_only_these_templates = ensembler.core.select_templates_by_seqid_cutoff(
                target.id, seqid_cutoff=model_seqid_cutoff)
            selected_template_indices = [
                i for i, seq in enumerate(templates_resolved_seq)
                if seq.id in process_only_these_templates
            ]

        ntemplates_selected = len(selected_template_indices)

        # Distribute the selected templates across MPI ranks in round-robin fashion.
        for template_index in range(mpistate.rank, ntemplates_selected, mpistate.size):
            template_resolved_seq = templates_resolved_seq[selected_template_indices[template_index]]
            if process_only_these_templates and template_resolved_seq.id not in process_only_these_templates:
                continue

            build_model(
                target,
                template_resolved_seq,
                target_setup_data,
                write_modeller_restraints_file=write_modeller_restraints_file,
                loglevel=loglevel,
            )

        write_build_models_metadata(
            target,
            target_setup_data,
            process_only_these_targets,
            process_only_these_templates,
            model_seqid_cutoff,
            write_modeller_restraints_file,
        )
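# Illustrative usage sketch (not part of the original module). A driver script might
# call build_models like this, typically launched under MPI, e.g.
# `mpirun -np 4 python run_modeling.py`. The target id below is hypothetical, and
# model_seqid_cutoff is assumed to be a percent sequence identity.
def _example_build_models():
    build_models(
        process_only_these_targets=['EGFR_HUMAN_D0'],  # hypothetical target id
        model_seqid_cutoff=30.0,                        # assumed percent identity cutoff
        write_modeller_restraints_file=False,
        loglevel='info',                                # assumed log-level string
    )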
def package_for_fah(process_only_these_targets=None,
                    process_only_these_templates=None,
                    model_seqid_cutoff=None,
                    model_validation_score_cutoff=None,
                    model_validation_score_percentile=None,
                    nclones=1,
                    archive=False,
                    openmm_platform='Reference',
                    temperature=300.0 * unit.kelvin,
                    collision_rate=1.0 / unit.picosecond,
                    timestep=2.0 * unit.femtoseconds,
                    loglevel=None):
    """Create the input files and directory structure necessary to start a Folding@Home project.

    MPI-enabled.

    Parameters
    ----------
    archive : bool
        A .tgz compressed archive will be created for each individual RUN directory.
    """
    set_loglevel(loglevel)

    if mpistate.rank == 0:
        if not os.path.exists(fah_projects_dir):
            os.mkdir(fah_projects_dir)
    mpistate.comm.Barrier()

    targets, templates_resolved_seq = get_targets_and_templates()

    for target in targets:
        if process_only_these_targets and (target.id not in process_only_these_targets):
            continue

        target_project_dir = os.path.join(fah_projects_dir, target.id)
        models_target_dir = os.path.join(default_project_dirnames.models, target.id)
        if not os.path.exists(models_target_dir):
            continue

        mpistate.comm.Barrier()

        sorted_valid_templates = []
        system = None
        renumbered_resnums = {}

        if mpistate.rank == 0:
            logger.info('-------------------------------------------------------------------------')
            logger.info('Building FAH OpenMM project for target {}'.format(target.id))
            logger.info('-------------------------------------------------------------------------')

            valid_templates = get_valid_templates_for_target(
                target,
                templates_resolved_seq,
                process_only_these_templates=process_only_these_templates,
                model_seqid_cutoff=model_seqid_cutoff,
                model_validation_score_cutoff=model_validation_score_cutoff,
                model_validation_score_percentile=model_validation_score_percentile,
            )
            sorted_valid_templates = sort_valid_templates_by_seqid(target, valid_templates)
            create_target_project_dir(target)
            system = setup_system_and_integrator_files(
                target,
                sorted_valid_templates[0],
                temperature,
                collision_rate,
                timestep,
            )
            renumbered_resnums = get_renumbered_topol_resnums(target)

        sorted_valid_templates = mpistate.comm.bcast(sorted_valid_templates, root=0)
        system = mpistate.comm.bcast(system, root=0)
        renumbered_resnums = mpistate.comm.bcast(renumbered_resnums, root=0)

        logger.debug("Building RUNs in parallel...")
        for run_index in range(mpistate.rank, len(sorted_valid_templates), mpistate.size):
            template = sorted_valid_templates[run_index]

            logger.info('-------------------------------------------------------------------------')
            logger.info('Building RUN{} for template {}'.format(run_index, template))
            logger.info('-------------------------------------------------------------------------')

            source_dir = os.path.join(models_target_dir, template)
            generate_fah_run(
                target_project_dir,
                template,
                source_dir,
                system,
                run_index,
                nclones,
                temperature,
                collision_rate,
                timestep,
                openmm_platform,
                renumbered_resnums,
            )

            if archive:
                tgz_fah_run(target, run_index)

    mpistate.comm.Barrier()
    if mpistate.rank == 0:
        logger.info('Done.')
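# Illustrative usage sketch (not part of the original module). Packages models for one
# hypothetical target into a Folding@Home project with five CLONEs per RUN and a .tgz
# archive per RUN directory; `unit` is the same units module used in the defaults above.
# The OpenMM platform name and the seqid cutoff units are assumptions.
def _example_package_for_fah():
    package_for_fah(
        process_only_these_targets=['ABL1_HUMAN_D0'],  # hypothetical target id
        model_seqid_cutoff=40.0,                       # assumed percent identity cutoff
        nclones=5,                                     # five CLONEs per RUN
        archive=True,                                  # write a .tgz archive per RUN
        openmm_platform='CUDA',                        # assumes a CUDA-capable OpenMM build
        temperature=300.0 * unit.kelvin,
        collision_rate=1.0 / unit.picosecond,
        timestep=2.0 * unit.femtoseconds,
    )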
def cluster_models(process_only_these_targets=None, cutoff=0.06, loglevel=None):
    """Cluster models based on RMSD, and filter out non-unique models as
    determined by a given cutoff.

    Runs serially.

    Parameters
    ----------
    cutoff : float
        Minimum distance cutoff for RMSD clustering (nm)
    """
    # TODO refactor
    ensembler.utils.set_loglevel(loglevel)
    targets, templates_resolved_seq = get_targets_and_templates()
    templates = templates_resolved_seq

    for target in targets:
        if process_only_these_targets and (target.id not in process_only_these_targets):
            continue

        models_target_dir = os.path.join(ensembler.core.default_project_dirnames.models, target.id)
        if not os.path.exists(models_target_dir):
            continue

        # =============================
        # Construct an mdtraj trajectory containing all models
        # =============================

        starttime = datetime.datetime.utcnow()

        logger.debug('Building a list of valid models...')
        model_pdbfilenames_compressed = {
            template.id: os.path.join(models_target_dir, template.id, 'model.pdb.gz')
            for template in templates
        }
        model_pdbfilenames_uncompressed = {
            template.id: os.path.join(models_target_dir, template.id, 'model.pdb')
            for template in templates
        }
        valid_templateids = [
            templateid for templateid in model_pdbfilenames_compressed
            if os.path.exists(model_pdbfilenames_compressed[templateid])
        ]

        # Write uncompressed model.pdb files from model.pdb.gz if necessary
        for templateid in valid_templateids:
            if not os.path.exists(model_pdbfilenames_uncompressed[templateid]) \
                    or os.path.getsize(model_pdbfilenames_uncompressed[templateid]) == 0:
                with gzip.open(model_pdbfilenames_compressed[templateid]) as model_pdbfile_compressed:
                    with open(model_pdbfilenames_uncompressed[templateid], 'w') as model_pdbfile:
                        model_pdbfile.write(model_pdbfile_compressed.read())

        logger.info('Constructing a trajectory containing all valid models...')
        if len(valid_templateids) == 0:
            logger.info('No models found for target {0}.'.format(target.id))
            continue
        valid_model_pdbfilenames_uncompressed = [
            model_pdbfilenames_uncompressed[templateid]
            for templateid in valid_templateids
        ]
        traj = mdtraj.load(valid_model_pdbfilenames_uncompressed)

        # =============================
        # Clustering
        # =============================

        logger.info('Conducting RMSD-based clustering...')

        # Remove any existing unique_by_clustering files
        for f in glob.glob(models_target_dir + '/*_PK_*/unique_by_clustering'):
            os.unlink(f)

        # Cluster on C-alpha atoms only
        CAatoms = [a.index for a in traj.topology.atoms if a.name == 'CA']
        unique_templateids = models_regular_spatial_clustering(
            valid_templateids, traj, atom_indices=CAatoms, cutoff=cutoff
        )
        write_unique_by_clustering_files(unique_templateids, models_target_dir)

        with open(os.path.join(models_target_dir, 'unique-models.txt'), 'w') as uniques_file:
            for u in unique_templateids:
                uniques_file.write(u + '\n')

        logger.info(
            '%d unique models (from original set of %d) using cutoff of %.3f nm' %
            (len(unique_templateids), len(valid_templateids), cutoff)
        )

        # Remove the temporary uncompressed model.pdb files
        for template in templates:
            model_dir = os.path.join(models_target_dir, template.id)
            model_pdbfilename = os.path.join(model_dir, 'model.pdb')
            if os.path.exists(model_pdbfilename):
                os.remove(model_pdbfilename)

        # ========
        # Metadata
        # ========

        project_metadata = ensembler.core.ProjectMetadata(
            project_stage='cluster_models', target_id=target.id
        )
        datestamp = ensembler.core.get_utcnow_formatted()
        timedelta = datetime.datetime.utcnow() - starttime

        metadata = {
            'target_id': target.id,
            'datestamp': datestamp,
            'nunique_models': len(unique_templateids),
            'python_version': sys.version.split('|')[0].strip(),
            'python_full_version': ensembler.core.literal_str(sys.version),
            'ensembler_version': ensembler.version.short_version,
            'ensembler_commit': ensembler.version.git_revision,
            'biopython_version': Bio.__version__,
            'mdtraj_version': mdtraj.version.short_version,
            'mdtraj_commit': mdtraj.version.git_revision,
            'timing': ensembler.core.strf_timedelta(timedelta),
        }
        project_metadata.add_data(metadata)
        project_metadata.write()
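# Illustrative usage sketch (not part of the original module). Clusters the models built
# for one hypothetical target with a tighter 0.03 nm RMSD cutoff; a smaller cutoff filters
# out fewer models, so more models are retained as unique. This stage runs serially (no MPI).
def _example_cluster_models():
    cluster_models(
        process_only_these_targets=['SRC_HUMAN_D0'],  # hypothetical target id
        cutoff=0.03,                                  # RMSD cutoff in nm
        loglevel='debug',                             # assumed log-level string
    )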