def log_unique_domain_names(uniprot_query_string, uniprotxml):
    # Example query string: 'domain:"Protein kinase" AND reviewed:yes'
    domain_match = re.search('domain:([\"\'].*[\"\'])', uniprot_query_string)
    if domain_match and len(domain_match.groups()) > 0:
        query_string_domain_selection = domain_match.groups()[0].replace('\'', '').replace('\"', '')
        uniprot_query_string_domains = uniprotxml.xpath(
            'entry/feature[@type="domain"][match_regex(@description, "%s")]' % query_string_domain_selection,
            extensions={(None, 'match_regex'): ensembler.core.xpath_match_regex_case_insensitive}
        )
        uniprot_unique_domain_names = set([domain.get('description') for domain in uniprot_query_string_domains])
        logger.info(
            'Set of unique domain names selected by the domain selector \'%s\' during the initial UniProt search:\n%s\n'
            % (query_string_domain_selection, uniprot_unique_domain_names)
        )
    else:
        uniprot_domains = uniprotxml.xpath('entry/feature[@type="domain"]')
        uniprot_unique_domain_names = set([domain.get('description') for domain in uniprot_domains])
        logger.info(
            'Set of unique domain names returned from the initial UniProt search using the query string \'%s\':\n%s\n'
            % (uniprot_query_string, uniprot_unique_domain_names)
        )

def loopmodel_templates(templates, missing_residues, process_only_these_templates=None, overwrite_structures=False):
    """
    Parameters
    ----------
    templates: list of BioPython SeqRecord
        only the id is used
    missing_residues: list of list of OpenMM Residue
    process_only_these_templates: list of str
    overwrite_structures: bool
    """
    for template_index in range(mpistate.rank, len(templates), mpistate.size):
        template = templates[template_index]
        if process_only_these_templates and template.id not in process_only_these_templates:
            continue
        if mpistate.size > 1:
            logger.info('MPI rank %d modeling missing loops for template %s' % (mpistate.rank, template.id))
        else:
            logger.info('Modeling missing loops for template %s' % template.id)
        loopmodel_template(template, missing_residues[template_index], overwrite_structures=overwrite_structures)

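# The template loop above stripes work across MPI ranks in round-robin
# fashion. A minimal self-contained sketch of the same idiom, with explicit
# rank/size arguments standing in for mpistate (illustrative only):
def _example_round_robin_partition(rank, size, n_items):
    # e.g. _example_round_robin_partition(0, 2, 4) -> [0, 2]
    #      _example_round_robin_partition(1, 2, 4) -> [1, 3]
    return list(range(rank, n_items, size))
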
def molprobity_validation_multiple_targets(targetids=None, modeling_stage=None, loglevel=None):
    """
    Calculate model quality using the MolProbity ``oneline-analysis`` command.

    For each target, this function outputs a text file named
    ``models/[targetid]/validation_scores_sorted-[method]-[ensembler_stage]``
    which contains a list of templateids sorted by validation score.

    This can be used by the subsequent ``package_models`` command to filter
    out models below a specified quality threshold.

    Typically, this should be run after models have been refined to the
    desired extent (e.g. after implicit or explicit MD refinement).

    More detailed validation results are written to the individual model
    directories.

    MPI-enabled.

    Parameters
    ----------
    targetids: list of str or str
    modeling_stage: str {None|build_models|refine_implicit_md|refine_explicit_md}
        Default: None (automatically selects most advanced stage)
    """
    set_loglevel(loglevel)

    if targetids is None:
        targetids = [target.id for target in get_targets()]
    elif isinstance(targetids, str):
        targetids = [targetids]

    for targetid in targetids:
        logger.info('Working on target {}'.format(targetid))
        molprobity_validation(targetid=targetid, ensembler_stage=modeling_stage, loglevel=loglevel)

def log_unique_domain_names_selected_by_regex(uniprot_domain_regex, uniprotxml):
    regex_matched_domains = uniprotxml.xpath(
        'entry/feature[@type="domain"][match_regex(@description, "%s")]' % uniprot_domain_regex,
        extensions={(None, 'match_regex'): ensembler.core.xpath_match_regex_case_sensitive}
    )
    regex_matched_domains_unique_names = set([domain.get('description') for domain in regex_matched_domains])
    logger.info(
        'Unique domain names selected after searching with the case-sensitive regex string \'%s\':\n%s\n'
        % (uniprot_domain_regex, regex_matched_domains_unique_names)
    )

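# The match_regex XPath extension functions referenced above live in
# ensembler.core. A minimal sketch of what such an lxml extension could look
# like is given below for illustration (an assumption about the
# implementation, not a copy of it): lxml passes the evaluation context
# first, then the XPath arguments, with @description arriving as a list of
# attribute-value strings.
def _example_xpath_match_regex_case_sensitive(context, attrib_values, regex_str):
    import re
    # True if the first matched attribute value contains the regex.
    return len(attrib_values) > 0 and re.search(regex_str, attrib_values[0]) is not None
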
def align_targets_and_templates(process_only_these_targets=None, process_only_these_templates=None, loglevel=None):
    """
    Conducts pairwise alignments of target sequences against template
    sequences. Stores Modeller-compatible 'alignment.pir' files in each
    model directory, and also outputs a table of model IDs, sorted by
    sequence identity.

    :param process_only_these_targets:
    :param process_only_these_templates:
    :param loglevel:
    :return:
    """
    ensembler.utils.set_loglevel(loglevel)
    targets, templates_resolved_seq = ensembler.core.get_targets_and_templates()
    ntemplates = len(templates_resolved_seq)
    nselected_templates = len(process_only_these_templates) if process_only_these_templates else ntemplates
    for target in targets:
        if process_only_these_targets and target.id not in process_only_these_targets:
            continue

        if mpistate.rank == 0:
            logger.info('Working on target %s...' % target.id)

        models_target_dir = os.path.join(ensembler.core.default_project_dirnames.models, target.id)
        ensembler.utils.create_dir(models_target_dir)

        seq_identity_data_sublist = []

        for template_index in range(mpistate.rank, ntemplates, mpistate.size):
            template_id = templates_resolved_seq[template_index].id
            if os.path.exists(os.path.join(ensembler.core.default_project_dirnames.templates_structures_modeled_loops, template_id + '.pdb')):
                remodeled_seq_filepath = os.path.join(
                    ensembler.core.default_project_dirnames.templates_structures_modeled_loops,
                    template_id + '-pdbfixed.fasta'
                )
                template = list(Bio.SeqIO.parse(remodeled_seq_filepath, 'fasta'))[0]
            else:
                template = templates_resolved_seq[template_index]

            if process_only_these_templates and template_id not in process_only_these_templates:
                continue

            model_dir = os.path.abspath(os.path.join(ensembler.core.default_project_dirnames.models, target.id, template_id))
            ensembler.utils.create_dir(model_dir)
            aln = align_target_template(target, template)
            aln_filepath = os.path.join(model_dir, 'alignment.pir')
            write_modeller_pir_aln_file(aln, target, template, pir_aln_filepath=aln_filepath)
            seq_identity_data_sublist.append({
                'templateid': template_id,
                'seq_identity': calculate_seq_identity(aln),
            })

        seq_identity_data_gathered = mpistate.comm.gather(seq_identity_data_sublist, root=0)

        seq_identity_data = []
        if mpistate.rank == 0:
            seq_identity_data = [None] * nselected_templates
            for i in range(nselected_templates):
                seq_identity_data[i] = seq_identity_data_gathered[i % mpistate.size][i // mpistate.size]
        seq_identity_data = mpistate.comm.bcast(seq_identity_data, root=0)

        seq_identity_data = sorted(seq_identity_data, key=lambda x: x['seq_identity'], reverse=True)
        write_sorted_seq_identities(target, seq_identity_data)

def dep_extract_template_pdbchains_from_uniprot_xml(uniprotxml, uniprot_domain_regex=None, manual_overrides=None, specified_pdbids=None, specified_chainids=None):
    selected_pdbchains = []
    all_uniprot_entries = uniprotxml.findall('entry')
    for entry in all_uniprot_entries:
        entry_name = entry.find('name').text
        if uniprot_domain_regex:
            selected_domains = entry.xpath(
                'feature[@type="domain"][match_regex(@description, "%s")]' % uniprot_domain_regex,
                extensions={(None, 'match_regex'): ensembler.core.xpath_match_regex_case_sensitive}
            )
        else:
            selected_domains = entry.findall('feature[@type="domain"]')

        domain_iter = 0
        for domain in selected_domains:
            domain_id = '%s_D%d' % (entry_name, domain_iter)
            domain_span = [
                int(domain.find('location/begin').get('position')),
                int(domain.find('location/end').get('position'))
            ]
            if manual_overrides and domain_id in manual_overrides.template.domain_spans:
                domain_span = [int(x) for x in manual_overrides.template.domain_spans[domain_id].split('-')]
            domain_len = domain_span[1] - domain_span[0] + 1
            if manual_overrides and manual_overrides.template.min_domain_len is not None and domain_len < manual_overrides.template.min_domain_len:
                continue
            if manual_overrides and manual_overrides.template.max_domain_len is not None and domain_len > manual_overrides.template.max_domain_len:
                continue

            domain_iter += 1
            pdbs = domain.getparent().xpath(
                'dbReference[@type="PDB"]/property[@type="method"][@value="X-ray" or @value="NMR"]/..'
            )

            for pdb in pdbs:
                pdbid = pdb.get('id')
                if manual_overrides and pdbid in manual_overrides.template.skip_pdbs:
                    continue
                if specified_pdbids and pdbid not in specified_pdbids:
                    continue
                pdb_chain_span_nodes = pdb.findall('property[@type="chains"]')

                for pdb_chain_span_node in pdb_chain_span_nodes:
                    chain_span_string = pdb_chain_span_node.get('value')
                    chain_spans = ensembler.UniProt.parse_uniprot_pdbref_chains(chain_span_string)

                    for chainid in chain_spans.keys():
                        if specified_chainids and len(specified_chainids[pdbid]) > 0 and chainid not in specified_chainids[pdbid]:
                            continue
                        span = chain_spans[chainid]
                        if span[0] < domain_span[0] + 30 and span[1] > domain_span[1] - 30:
                            templateid = '%s_%s_%s' % (domain_id, pdbid, chainid)
                            data = {
                                'templateid': templateid,
                                'pdbid': pdbid,
                                'chainid': chainid,
                                'domain_span': domain_span
                            }
                            selected_pdbchains.append(data)
    logger.info('%d PDB chains selected.' % len(selected_pdbchains))
    return selected_pdbchains

def auto_select_openmm_platform():
    for platform_name in ['CUDA', 'OpenCL', 'CPU', 'Reference']:
        try:
            platform = openmm.Platform.getPlatformByName(platform_name)
            if isinstance(platform, openmm.Platform):
                logger.info('Auto-selected OpenMM platform: %s' % platform_name)
                return platform_name
        except Exception:
            continue
    raise Exception('No OpenMM platform found')

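# Minimal usage sketch for the helper above: the returned platform *name* is
# later passed back to openmm.Platform.getPlatformByName when creating a
# Context, as done in refine_implicit_md below.
#
#     platform_name = auto_select_openmm_platform()
#     platform = openmm.Platform.getPlatformByName(platform_name)
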
def extract_template_structures_from_pdb_files(selected_templates):
    logger.info('Writing template structures...')
    for template in selected_templates:
        pdb_filename = os.path.join(ensembler.core.default_project_dirnames.structures_pdb, template.pdbid + '.pdb.gz')
        template_resolved_filename = os.path.join(ensembler.core.default_project_dirnames.templates_structures_resolved, template.templateid + '.pdb')
        ensembler.pdb.extract_residues_by_resnum(template_resolved_filename, pdb_filename, template)

def extract_template_pdb_chain_residues(selected_pdbchains):
    selected_templates = None
    if mpistate.rank == 0:
        logger.info('Extracting residues from PDB chains...')
        selected_templates = []
        for pdbchain in selected_pdbchains:
            extracted_pdb_template_seq_data = extract_pdb_template_seq(pdbchain)
            if extracted_pdb_template_seq_data is not None:
                selected_templates.append(extracted_pdb_template_seq_data)
        logger.info('%d templates selected.\n' % len(selected_templates))
    selected_templates = mpistate.comm.bcast(selected_templates, root=0)
    return selected_templates

def get_renumbered_topol_resnums(target):
    models_target_dir = os.path.join(default_project_dirnames.models, target.id)
    renumbered_resnums = {}
    for topol_type in ['implicit', 'explicit']:
        topol_path = os.path.join(models_target_dir, 'topol-renumbered-{}.pdb'.format(topol_type))
        if not os.path.exists(topol_path):
            continue
        traj = mdtraj.load_pdb(topol_path)
        res_numbers = [resi.resSeq for resi in traj.top.residues]
        renumbered_resnums[topol_type] = res_numbers
        logger.info('Will use renumbered residues from {} for target {}'.format(topol_path, target.id))
    return renumbered_resnums

def create_dir(dirpath, quiet=True):
    """
    Creates a directory, silently ignoring the case where it already exists.

    :param dirpath: str
    """
    try:
        os.makedirs(dirpath)
        if not quiet:
            logger.info('Created directory "%s"' % dirpath)
    except OSError as e:
        if e.errno == 17:  # errno 17: file exists
            logger.debug('Directory "%s" already exists - will not overwrite' % dirpath)
        else:
            raise

def build_models_target_setup(target):
    target_setup_data = None
    if mpistate.rank == 0:
        models_target_dir = os.path.join(ensembler.core.default_project_dirnames.models, target.id)
        target_starttime = datetime.datetime.utcnow()
        logger.info(
            '=========================================================================\n'
            'Working on target "%s"\n'
            '=========================================================================' % target.id
        )
        target_setup_data = TargetSetupData(
            target_starttime=target_starttime,
            models_target_dir=models_target_dir
        )
    target_setup_data = mpistate.comm.bcast(target_setup_data, root=0)
    return target_setup_data

def _gather_targets(self):
    logger.info('Querying UniProt web server...')

    get_uniprot_xml_args = {}
    if self._save_uniprot_xml:
        get_uniprot_xml_args['write_to_filepath'] = 'targets-uniprot.xml'

    uniprotxml = ensembler.UniProt.get_uniprot_xml(self.uniprot_query_string, **get_uniprot_xml_args)
    logger.info('Number of entries returned from initial UniProt search: %r\n' % len(uniprotxml))
    log_unique_domain_names(self.uniprot_query_string, uniprotxml)
    if self.uniprot_domain_regex is not None:
        log_unique_domain_names_selected_by_regex(self.uniprot_domain_regex, uniprotxml)

    fasta_ofilepath = os.path.join(ensembler.core.default_project_dirnames.targets, 'targets.fa')
    self.targets = self._extract_targets_from_uniprot_xml(uniprotxml)
    Bio.SeqIO.write(self.targets, fasta_ofilepath, 'fasta')
    self._write_metadata()

def _gather_targets(self, write_output_files=True):
    logger.info('Querying UniProt web server...')

    get_uniprot_xml_args = {}
    if self._save_uniprot_xml:
        get_uniprot_xml_args['write_to_filepath'] = 'targets-uniprot.xml'

    self.uniprotxml = ensembler.uniprot.get_uniprot_xml(self.uniprot_query_string, **get_uniprot_xml_args)
    logger.info('Number of entries returned from initial UniProt search: %r\n' % len(self.uniprotxml))
    log_unique_domain_names(self.uniprot_query_string, self.uniprotxml)
    if self.uniprot_domain_regex:
        log_unique_domain_names_selected_by_regex(self.uniprot_domain_regex, self.uniprotxml)

    fasta_ofilepath = os.path.join(ensembler.core.default_project_dirnames.targets, 'targets.fa')
    self._extract_targets_from_uniprot_xml()
    if write_output_files:
        Bio.SeqIO.write(self.targets, fasta_ofilepath, 'fasta')
        self._write_metadata()

def build_model(target, template_resolved_seq, target_setup_data,
                write_modeller_restraints_file=False, loglevel=None):
    """Uses Modeller to build a homology model for a given target and
    template.

    Will not run Modeller if the output files already exist.

    Parameters
    ----------
    target : BioPython SeqRecord
    template_resolved_seq : BioPython SeqRecord
        Must be a corresponding .pdb template file with the same ID in the
        templates/structures directory.
    target_setup_data : TargetSetupData obj
    write_modeller_restraints_file : bool
        Write file containing restraints used by Modeller - note that this
        file can be relatively large, e.g. ~300KB per model for a protein
        kinase domain target.
    loglevel : str
    """
    ensembler.utils.set_loglevel(loglevel)

    template_structure_dir = os.path.abspath(
        ensembler.core.default_project_dirnames.templates_structures_modeled_loops
    )
    if os.path.exists(os.path.join(template_structure_dir, template_resolved_seq.id + '.pdb')):
        remodeled_seq_filepath = os.path.join(
            ensembler.core.default_project_dirnames.templates_structures_modeled_loops,
            template_resolved_seq.id + '-pdbfixed.fasta'
        )
        template = list(Bio.SeqIO.parse(remodeled_seq_filepath, 'fasta'))[0]
    else:
        template = template_resolved_seq
        template_structure_dir = os.path.abspath(
            ensembler.core.default_project_dirnames.templates_structures_resolved
        )

    model_dir = os.path.abspath(os.path.join(target_setup_data.models_target_dir, template.id))
    if not os.path.exists(model_dir):
        ensembler.utils.create_dir(model_dir)

    model_pdbfilepath = os.path.abspath(os.path.join(model_dir, 'model.pdb.gz'))
    modeling_log_filepath = os.path.abspath(os.path.join(model_dir, 'modeling-log.yaml'))

    check_model_pdbfilepath_ends_in_pdbgz(model_pdbfilepath)
    model_pdbfilepath_uncompressed = model_pdbfilepath[:-3]

    if check_all_model_files_present(model_dir):
        logger.debug(
            "Output files already exist for target '%s' // template '%s'; files were not overwritten."
            % (target.id, template.id)
        )
        return

    logger.info(
        '-------------------------------------------------------------------------\n'
        'Modelling "%s" => "%s"\n'
        '-------------------------------------------------------------------------'
        % (target.id, template.id)
    )

    # aln = align_target_template(target, template)
    aln_filepath = os.path.abspath(os.path.join(model_dir, 'alignment.pir'))
    # write_modeller_pir_aln_file(aln, target, template, pir_aln_filepath=aln_filepath)

    log_file = init_build_model_logfile(modeling_log_filepath)

    with ensembler.utils.enter_temp_dir():
        try:
            start = datetime.datetime.utcnow()
            shutil.copy(aln_filepath, 'alignment.pir')
            run_modeller(target, template, model_dir, model_pdbfilepath,
                         model_pdbfilepath_uncompressed, template_structure_dir,
                         write_modeller_restraints_file=write_modeller_restraints_file)
            if os.path.getsize(model_pdbfilepath) < 1:
                raise Exception('Output PDB file is empty.')

            end_successful_build_model_logfile(log_file, start)

        except Exception as e:
            end_exception_build_model_logfile(e, log_file)

def _finish(self):
    logger.info('Done.')

def download_sifts_file(pdbid, project_sifts_filepath):
    logger.info('Downloading SIFTS file for: %s', pdbid)
    sifts_page = ensembler.pdb.retrieve_sifts(pdbid)
    with gzip.open(project_sifts_filepath, 'wb') as project_sifts_file:
        project_sifts_file.write(sifts_page)

def refine_implicit_md(
        openmm_platform=None, gpupn=1, process_only_these_targets=None,
        process_only_these_templates=None, model_seqid_cutoff=None,
        write_trajectory=False,
        include_disulfide_bonds=False,
        custom_residue_variants=None,
        ff='amber99sbildn',
        implicit_water_model='amber99_obc',
        sim_length=100.0 * unit.picoseconds,
        timestep=2.0 * unit.femtoseconds,
        temperature=300.0 * unit.kelvin,          # simulation temperature
        collision_rate=20.0 / unit.picoseconds,   # Langevin collision rate
        cutoff=None,                              # nonbonded cutoff
        minimization_tolerance=10.0 * unit.kilojoules_per_mole / unit.nanometer,
        minimization_steps=20,
        nsteps_per_iteration=500,
        ph=None,
        retry_failed_runs=False,
        cpu_platform_threads=1,
        loglevel=None):  # TODO - refactor
    """Run MD refinement in implicit solvent.

    MPI-enabled.
    """
    ensembler.utils.set_loglevel(loglevel)
    gpuid = mpistate.rank % gpupn

    manual_overrides = ManualOverrides()
    if ph is None:
        if manual_overrides.refinement.ph is not None:
            ph = manual_overrides.refinement.ph
        else:
            ph = 7.0
    if custom_residue_variants is None:
        custom_residue_variants = deepcopy(
            manual_overrides.refinement.custom_residue_variants_by_targetid
        )

    if (sim_length / timestep) < nsteps_per_iteration:
        nsteps_per_iteration = int(sim_length / timestep)
    niterations = int((sim_length / timestep) / nsteps_per_iteration)

    models_dir = os.path.abspath(ensembler.core.default_project_dirnames.models)

    targets, templates_resolved_seq = ensembler.core.get_targets_and_templates()

    if process_only_these_templates:
        selected_template_indices = [i for i, seq in enumerate(templates_resolved_seq) if seq.id in process_only_these_templates]
    else:
        selected_template_indices = range(len(templates_resolved_seq))

    if not openmm_platform:
        openmm_platform = auto_select_openmm_platform()

    if openmm_platform == 'CPU':
        platform_properties = {'CpuThreads': str(cpu_platform_threads)}
    else:
        platform_properties = {}

    ff_files = [ff + '.xml', implicit_water_model + '.xml']
    forcefield = app.ForceField(*ff_files)

    kB = unit.MOLAR_GAS_CONSTANT_R
    kT = kB * temperature

    def simulate_implicit_md():

        logger.debug("Reading model...")
        with gzip.open(model_filename) as model_file:
            pdb = app.PDBFile(model_file)

        # Set up the Platform.
        platform = openmm.Platform.getPlatformByName(openmm_platform)
        if 'CUDA_VISIBLE_DEVICES' not in os.environ:
            # Set GPU id.
            if openmm_platform == 'CUDA':
                platform.setPropertyDefaultValue('CudaDeviceIndex', '%d' % gpuid)
            elif openmm_platform == 'OpenCL':
                platform.setPropertyDefaultValue('OpenCLDeviceIndex', '%d' % gpuid)

        # Construct Modeller object with same topology as reference structure
        # (necessary to keep disulfide bonds consistent).
        modeller = app.Modeller(reference_topology, pdb.positions)
        # set_openmm_topology_bonds_from_atom_indices(modeller.topology, reference_bonds)
        # Add missing protons.
        modeller.addHydrogens(forcefield, pH=ph, variants=reference_variants)
        topology = modeller.getTopology()
        positions = modeller.getPositions()

        logger.debug("Constructing System object...")
        if cutoff is None:
            system = forcefield.createSystem(topology, nonbondedMethod=app.NoCutoff, constraints=app.HBonds)
        else:
            system = forcefield.createSystem(topology, nonbondedMethod=app.CutoffNonPeriodic, nonbondedCutoff=cutoff, constraints=app.HBonds)

        logger.debug("Creating Context...")
        integrator = openmm.LangevinIntegrator(temperature, collision_rate, timestep)
        context = openmm.Context(system, integrator, platform, platform_properties)
        context.setPositions(positions)

        logger.debug("Minimizing structure...")
        openmm.LocalEnergyMinimizer.minimize(context, minimization_tolerance, minimization_steps)

        if write_trajectory:
            # Open trajectory for writing ('wt': PDB text must be written to
            # the gzip stream in text mode).
            logger.debug("Opening trajectory for writing...")
            trajectory_filename = os.path.join(model_dir, 'implicit-trajectory.pdb.gz')
            trajectory_outfile = gzip.open(trajectory_filename, 'wt')
            app.PDBFile.writeHeader(topology, file=trajectory_outfile)

        # Open energy trajectory for writing.
        energy_filename = os.path.join(model_dir, 'implicit-energies.txt')
        energy_outfile = open(energy_filename, 'w')
        energy_outfile.write('# iteration | simulation time (ps) | potential_energy (kT) | kinetic_energy (kT) | ns per day\n')

        logger.debug("Running dynamics...")
        import time
        initial_time = time.time()
        for iteration in range(niterations):
            # Integrate dynamics.
            integrator.step(nsteps_per_iteration)
            # Get current state.
            state = context.getState(getEnergy=True, getPositions=True)
            simulation_time = state.getTime()
            potential_energy = state.getPotentialEnergy()
            kinetic_energy = state.getKineticEnergy()
            final_time = time.time()
            elapsed_time = (final_time - initial_time) * unit.seconds
            ns_per_day = (simulation_time / elapsed_time) / (unit.nanoseconds / unit.day)
            logger.debug(
                "  %8.1f ps : potential %8.3f kT | kinetic %8.3f kT | %.3f ns/day | %.3f s remain" % (
                    simulation_time / unit.picoseconds,
                    potential_energy / kT,
                    kinetic_energy / kT,
                    ns_per_day,
                    elapsed_time * (niterations - iteration - 1) / (iteration + 1) / unit.seconds
                )
            )

            # Check that energies are still finite.
            if np.isnan(potential_energy / kT) or np.isnan(kinetic_energy / kT):
                raise Exception("Potential or kinetic energies are nan.")

            if write_trajectory:
                app.PDBFile.writeModel(topology, state.getPositions(), file=trajectory_outfile, modelIndex=iteration)

            # Write data.
            energy_outfile.write("  %8d %8.1f %8.3f %8.3f %.3f\n" % (iteration, simulation_time / unit.picoseconds, potential_energy / kT, kinetic_energy / kT, ns_per_day))
            energy_outfile.flush()

        if write_trajectory:
            app.PDBFile.writeFooter(topology, file=trajectory_outfile)
            trajectory_outfile.close()

        energy_outfile.close()

        # Write final PDB file.
        pdb_outfile = gzip.open(pdb_filename, 'wt')
        app.PDBFile.writeHeader(topology, file=pdb_outfile)
        app.PDBFile.writeFile(topology, state.getPositions(), file=pdb_outfile)
        app.PDBFile.writeFooter(topology, file=pdb_outfile)
        pdb_outfile.close()

    # Process targets.
    print('Processing targets...')  # DEBUG
    for target in targets:
        if (process_only_these_targets is not None) and (target.id not in process_only_these_targets):
            print('Skipping because %s is not in process_only_these_targets' % target.id)
            print(process_only_these_targets)
            continue

        logger.info('Processing %s' % target)
        models_target_dir = os.path.join(models_dir, target.id)
        if mpistate.rank == 0:
            target_starttime = datetime.datetime.utcnow()
        if not os.path.exists(models_target_dir):
            print('%s does not exist, skipping' % models_target_dir)
            continue

        mpistate.comm.Barrier()

        # ========
        # Determine topology (including protonation state) to use throughout.
        # ========

        reference_model_id = get_highest_seqid_existing_model(models_target_dir=models_target_dir)
        if reference_model_id is None:
            continue

        reference_model_path = os.path.join(models_target_dir, reference_model_id, 'model.pdb.gz')

        with gzip.open(reference_model_path) as reference_pdb_file:
            reference_pdb = app.PDBFile(reference_pdb_file)

        logger.debug("Using %s as highest identity model" % reference_model_id)

        if not include_disulfide_bonds:
            remove_disulfide_bonds_from_topology(reference_pdb.topology)

        # Build topology for reference model.
        logger.debug("Creating app.Modeller instance...")
        modeller = app.Modeller(reference_pdb.topology, reference_pdb.positions)
        reference_topology = modeller.topology
        logger.debug("Adding hydrogens...")
        reference_variants = modeller.addHydrogens(forcefield, pH=ph)
        if target.id in custom_residue_variants:
            apply_custom_residue_variants(reference_variants, custom_residue_variants[target.id])
        logger.debug("Reference variants extracted:")
        if reference_variants is not None:
            for (residue_index, residue) in enumerate(reference_variants):
                if residue is not None:
                    logger.debug("%8d %s" % (residue_index + 1, residue))
            logger.debug("")
        else:
            logger.debug(reference_variants)

        if model_seqid_cutoff:
            process_only_these_templates = ensembler.core.select_templates_by_seqid_cutoff(target.id, seqid_cutoff=model_seqid_cutoff)
            selected_template_indices = [i for i, seq in enumerate(templates_resolved_seq) if seq.id in process_only_these_templates]

        ntemplates_selected = len(selected_template_indices)

        for template_index in range(mpistate.rank, ntemplates_selected, mpistate.size):
            template = templates_resolved_seq[selected_template_indices[template_index]]

            model_dir = os.path.join(models_target_dir, template.id)
            if not os.path.exists(model_dir):
                continue

            # Only simulate models that are unique following filtering by clustering.
            unique_by_clustering = os.path.exists(os.path.join(model_dir, 'unique_by_clustering'))
            if not unique_by_clustering:
                continue

            # Pass if this simulation has already been run.
            log_filepath = os.path.join(model_dir, 'implicit-log.yaml')
            if os.path.exists(log_filepath):
                with open(log_filepath) as log_file:
                    log_data = yaml.load(log_file, Loader=ensembler.core.YamlLoader)
                if log_data.get('successful') is True:
                    continue
                if log_data.get('finished') is True and (retry_failed_runs is False and log_data.get('successful') is False):
                    continue

            # Check to make sure the initial model file is present.
            model_filename = os.path.join(model_dir, 'model.pdb.gz')
            if not os.path.exists(model_filename):
                logger.debug('model.pdb.gz not present: target %s template %s rank %d gpuid %d' % (target.id, template.id, mpistate.rank, gpuid))
                continue

            pdb_filename = os.path.join(model_dir, 'implicit-refined.pdb.gz')

            logger.info("-------------------------------------------------------------------------")
            logger.info("Simulating %s => %s in implicit solvent for %.1f ps (MPI rank: %d, GPU ID: %d)" % (target.id, template.id, niterations * nsteps_per_iteration * timestep / unit.picoseconds, mpistate.rank, gpuid))
            logger.info("-------------------------------------------------------------------------")

            # Open log file.
            log_data = {
                'mpi_rank': mpistate.rank,
                'gpuid': gpuid if 'CUDA_VISIBLE_DEVICES' not in os.environ else os.environ['CUDA_VISIBLE_DEVICES'],
                'openmm_platform': openmm_platform,
                'finished': False,
                'sim_length': str(sim_length),
                'timestep': str(timestep),
                'temperature': str(temperature),
                'ph': ph,
            }
            log_file = ensembler.core.LogFile(log_filepath)
            log_file.log(new_log_data=log_data)

            try:
                start = datetime.datetime.utcnow()
                simulate_implicit_md()
                timing = ensembler.core.strf_timedelta(datetime.datetime.utcnow() - start)
                log_data = {
                    'finished': True,
                    'timing': timing,
                    'successful': True,
                }
                log_file.log(new_log_data=log_data)
            except Exception as e:
                trbk = traceback.format_exc()
                warnings.warn(
                    '= ERROR start: MPI rank {0} hostname {1} gpuid {2} =\n{3}\n{4}\n= ERROR end: MPI rank {0} hostname {1} gpuid {2}'.format(
                        mpistate.rank, socket.gethostname(), gpuid, e, trbk
                    )
                )
                timing = ensembler.core.strf_timedelta(datetime.datetime.utcnow() - start)
                log_data = {
                    'exception': e,
                    'traceback': ensembler.core.literal_str(trbk),
                    'timing': timing,
                    'finished': True,
                    'successful': False,
                }
                log_file.log(new_log_data=log_data)

        logger.debug('Finished template loop: rank %d' % mpistate.rank)

        mpistate.comm.Barrier()

        if mpistate.rank == 0:
            project_metadata = ensembler.core.ProjectMetadata(project_stage='refine_implicit_md', target_id=target.id)
            datestamp = ensembler.core.get_utcnow_formatted()

            command = ['find', models_target_dir, '-name', 'implicit-refined.pdb.gz']
            output = subprocess.check_output(command)
            nsuccessful_refinements = output.decode('UTF-8').count('\n')
            target_timedelta = datetime.datetime.utcnow() - target_starttime

            metadata = {
                'target_id': target.id,
                'datestamp': datestamp,
                'timing': ensembler.core.strf_timedelta(target_timedelta),
                'openmm_platform': openmm_platform,
                'process_only_these_targets': process_only_these_targets,
                'process_only_these_templates': process_only_these_templates,
                'model_seqid_cutoff': model_seqid_cutoff,
                'write_trajectory': write_trajectory,
                'include_disulfide_bonds': include_disulfide_bonds,
                'custom_residue_variants': custom_residue_variants,
                'ff': ff,
                'implicit_water_model': implicit_water_model,
                'sim_length': str(sim_length),
                'timestep': str(timestep),
                'temperature': str(temperature),
                'collision_rate': str(collision_rate),
                'cutoff': str(cutoff),
                'nsteps_per_iteration': nsteps_per_iteration,
                'ph': ph,
                'nsuccessful_refinements': nsuccessful_refinements,
                'python_version': sys.version.split('|')[0].strip(),
                'python_full_version': ensembler.core.literal_str(sys.version),
                'ensembler_version': ensembler.version.short_version,
                'ensembler_commit': ensembler.version.git_revision,
                'biopython_version': Bio.__version__,
                'openmm_version': simtk.openmm.version.short_version,
                'openmm_commit': simtk.openmm.version.git_revision,
            }
            project_metadata.add_data(metadata)
            project_metadata.write()

        mpistate.comm.Barrier()

    mpistate.comm.Barrier()
    if mpistate.rank == 0:
        logger.info('Done.')

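# Worked example of the iteration arithmetic in refine_implicit_md: with the
# defaults sim_length = 100 ps and timestep = 2 fs, sim_length / timestep =
# 50,000 integration steps; with nsteps_per_iteration = 500, niterations =
# 100, so energies are logged (and trajectory frames optionally written)
# 100 times per model.
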
def log_done():
    logger.info('Done.')

def cluster_models(process_only_these_targets=None, cutoff=0.06, loglevel=None):
    """Cluster models based on RMSD, and filter out non-unique models as
    determined by a given cutoff.

    Parameters
    ----------
    cutoff : float
        Minimum distance cutoff for RMSD clustering (nm)

    Runs serially.
    """
    # TODO refactor
    ensembler.utils.set_loglevel(loglevel)
    targets, templates_resolved_seq = get_targets_and_templates()
    templates = templates_resolved_seq

    for target in targets:
        if process_only_these_targets and (target.id not in process_only_these_targets):
            continue

        models_target_dir = os.path.join(ensembler.core.default_project_dirnames.models, target.id)
        if not os.path.exists(models_target_dir):
            continue

        # =============================
        # Construct an mdtraj trajectory containing all models
        # =============================

        starttime = datetime.datetime.utcnow()

        logger.debug('Building a list of valid models...')

        model_pdbfilenames_compressed = {
            template.id: os.path.join(models_target_dir, template.id, 'model.pdb.gz')
            for template in templates
        }
        model_pdbfilenames_uncompressed = {
            template.id: os.path.join(models_target_dir, template.id, 'model.pdb')
            for template in templates
        }
        valid_templateids = [
            templateid for templateid in model_pdbfilenames_compressed
            if os.path.exists(model_pdbfilenames_compressed[templateid])
        ]

        # Write uncompressed model.pdb files from model.pdb.gz if necessary.
        for templateid in valid_templateids:
            if not os.path.exists(model_pdbfilenames_uncompressed[templateid]) or os.path.getsize(model_pdbfilenames_uncompressed[templateid]) == 0:
                with gzip.open(model_pdbfilenames_compressed[templateid]) as model_pdbfile_compressed:
                    # gzip yields bytes, so write the output in binary mode.
                    with open(model_pdbfilenames_uncompressed[templateid], 'wb') as model_pdbfile:
                        model_pdbfile.write(model_pdbfile_compressed.read())

        logger.info('Constructing a trajectory containing all valid models...')

        if len(valid_templateids) == 0:
            logger.info('No models found for target {0}.'.format(target.id))
            continue

        valid_model_pdbfilenames_uncompressed = [
            model_pdbfilenames_uncompressed[templateid]
            for templateid in valid_templateids
        ]

        traj = mdtraj.load(valid_model_pdbfilenames_uncompressed)

        # =============================
        # Clustering
        # =============================

        logger.info('Conducting RMSD-based clustering...')

        # Remove any existing unique_by_clustering files.
        for f in glob.glob(models_target_dir + '/*_PK_*/unique_by_clustering'):
            os.unlink(f)

        CAatoms = [a.index for a in traj.topology.atoms if a.name == 'CA']
        unique_templateids = models_regular_spatial_clustering(
            valid_templateids, traj, atom_indices=CAatoms, cutoff=cutoff
        )
        write_unique_by_clustering_files(unique_templateids, models_target_dir)

        with open(os.path.join(models_target_dir, 'unique-models.txt'), 'w') as uniques_file:
            for u in unique_templateids:
                uniques_file.write(u + '\n')
        logger.info(
            '%d unique models (from original set of %d) using cutoff of %.3f nm'
            % (len(unique_templateids), len(valid_templateids), cutoff)
        )

        for template in templates:
            model_dir = os.path.join(models_target_dir, template.id)
            model_pdbfilename = os.path.join(model_dir, 'model.pdb')
            if os.path.exists(model_pdbfilename):
                os.remove(model_pdbfilename)

        # ========
        # Metadata
        # ========

        project_metadata = ensembler.core.ProjectMetadata(
            project_stage='cluster_models', target_id=target.id
        )
        datestamp = ensembler.core.get_utcnow_formatted()

        timedelta = datetime.datetime.utcnow() - starttime

        metadata = {
            'target_id': target.id,
            'datestamp': datestamp,
            'nunique_models': len(unique_templateids),
            'python_version': sys.version.split('|')[0].strip(),
            'python_full_version': ensembler.core.literal_str(sys.version),
            'ensembler_version': ensembler.version.short_version,
            'ensembler_commit': ensembler.version.git_revision,
            'biopython_version': Bio.__version__,
            'mdtraj_version': mdtraj.version.short_version,
            'mdtraj_commit': mdtraj.version.git_revision,
            'timing': ensembler.core.strf_timedelta(timedelta),
        }
        project_metadata.add_data(metadata)
        project_metadata.write()

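# models_regular_spatial_clustering is defined elsewhere in this module. A
# minimal sketch of leader-style regular spatial clustering with mdtraj is
# shown below for illustration; this is an assumption about its behavior,
# not the actual implementation: the first model seeds a cluster, and each
# subsequent model becomes a new leader only if its CA RMSD to every
# existing leader exceeds the cutoff.
def _example_regular_spatial_clustering(templateids, traj, atom_indices, cutoff=0.06):
    import mdtraj
    leader_frame_indices = [0]
    for frame_index in range(1, traj.n_frames):
        # mdtraj.rmsd returns one RMSD (in nm) per frame of the first argument,
        # computed against the reference frame given as the second argument.
        rmsds = [
            mdtraj.rmsd(traj[frame_index], traj[leader], atom_indices=atom_indices)[0]
            for leader in leader_frame_indices
        ]
        if min(rmsds) > cutoff:
            leader_frame_indices.append(frame_index)
    return [templateids[i] for i in leader_frame_indices]
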
def extract_template_pdbchains_from_uniprot_xml(uniprotxml, uniprot_domain_regex=None, manual_overrides=None, specified_pdbids=None, specified_chainids=None):
    """
    Parameters
    ----------
    uniprotxml: lxml.etree.Element
    uniprot_domain_regex: str
    manual_overrides: ensembler.core.ManualOverrides
    specified_pdbids: list of str
        e.g. ['2QR8', '4GU9']
    specified_chainids: dict of list of str
        e.g. {'2QR8': ['A'], '4GU9': ['A', 'B']}

    Returns
    -------
    selected_pdbchains: list of dict
        [
            {
                'templateid': str,
                'pdbid': str,
                'chainid': str,
                'residue_span': [
                    start (int),  # 1-based inclusive
                    end (int)     # 1-based inclusive
                ]
            }
        ]
    """
    selected_pdbchains = []
    all_uniprot_entries = uniprotxml.findall('entry')
    for entry in all_uniprot_entries:
        entry_name = entry.find('name').text
        if uniprot_domain_regex:
            selected_domains = entry.xpath(
                'feature[@type="domain"][match_regex(@description, "%s")]' % uniprot_domain_regex,
                extensions={(None, 'match_regex'): ensembler.core.xpath_match_regex_case_sensitive}
            )

            domain_iter = 0
            for domain in selected_domains:
                domain_id = '%s_D%d' % (entry_name, domain_iter)
                domain_span = [
                    int(domain.find('location/begin').get('position')),
                    int(domain.find('location/end').get('position'))
                ]
                if manual_overrides and domain_id in manual_overrides.template.domain_spans:
                    domain_span = [int(x) for x in manual_overrides.template.domain_spans[domain_id].split('-')]
                domain_len = domain_span[1] - domain_span[0] + 1
                if manual_overrides and manual_overrides.template.min_domain_len is not None and domain_len < manual_overrides.template.min_domain_len:
                    continue
                if manual_overrides and manual_overrides.template.max_domain_len is not None and domain_len > manual_overrides.template.max_domain_len:
                    continue

                domain_iter += 1
                pdbs = domain.getparent().xpath(
                    'dbReference[@type="PDB"]/property[@type="method"][@value="X-ray" or @value="NMR"]/..'
                )

                for pdb in pdbs:
                    pdbid = pdb.get('id')
                    if manual_overrides and pdbid in manual_overrides.template.skip_pdbs:
                        continue
                    if specified_pdbids and pdbid not in specified_pdbids:
                        continue
                    pdb_chain_span_nodes = pdb.findall('property[@type="chains"]')

                    for pdb_chain_span_node in pdb_chain_span_nodes:
                        chain_span_string = pdb_chain_span_node.get('value')
                        chain_spans = ensembler.uniprot.parse_uniprot_pdbref_chains(chain_span_string)

                        for chainid in chain_spans.keys():
                            if specified_chainids and len(specified_chainids[pdbid]) > 0 and chainid not in specified_chainids[pdbid]:
                                continue
                            span = chain_spans[chainid]
                            if span[0] < domain_span[0] + 30 and span[1] > domain_span[1] - 30:
                                templateid = '%s_%s_%s' % (domain_id, pdbid, chainid)
                                data = {
                                    'templateid': templateid,
                                    'pdbid': pdbid,
                                    'chainid': chainid,
                                    'residue_span': domain_span
                                }
                                selected_pdbchains.append(data)
        else:
            pdbs = entry.xpath(
                'dbReference[@type="PDB"]/property[@type="method"][@value="X-ray" or @value="NMR"]/..'
            )

            for pdb in pdbs:
                pdbid = pdb.get('id')
                if manual_overrides and pdbid in manual_overrides.template.skip_pdbs:
                    continue
                if specified_pdbids and pdbid not in specified_pdbids:
                    continue
                pdb_chain_span_nodes = pdb.findall('property[@type="chains"]')

                for pdb_chain_span_node in pdb_chain_span_nodes:
                    chain_span_string = pdb_chain_span_node.get('value')
                    chain_spans = ensembler.uniprot.parse_uniprot_pdbref_chains(chain_span_string)

                    for chainid in chain_spans.keys():
                        if specified_chainids and len(specified_chainids[pdbid]) > 0 and chainid not in specified_chainids[pdbid]:
                            continue
                        span = chain_spans[chainid]
                        templateid = '%s_%s_%s' % (entry_name, pdbid, chainid)
                        data = {
                            'templateid': templateid,
                            'pdbid': pdbid,
                            'chainid': chainid,
                            'residue_span': span
                        }
                        selected_pdbchains.append(data)

    logger.info('%d PDB chains selected.' % len(selected_pdbchains))
    return selected_pdbchains

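# Worked example of the 30-residue tolerance check in
# extract_template_pdbchains_from_uniprot_xml: for a domain span of
# [100, 400], a chain spanning [125, 385] is accepted (125 < 130 and
# 385 > 370), while one spanning [140, 400] is rejected (140 >= 130).
# In other words, the chain must cover the domain to within 30 residues
# at each end.
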
def package_for_fah(process_only_these_targets=None,
                    process_only_these_templates=None,
                    model_seqid_cutoff=None,
                    model_validation_score_cutoff=None,
                    model_validation_score_percentile=None,
                    nclones=1, archive=False,
                    openmm_platform='Reference',
                    temperature=300.0 * unit.kelvin,
                    collision_rate=1.0 / unit.picosecond,
                    timestep=2.0 * unit.femtoseconds,
                    loglevel=None):
    """
    Create the input files and directory structure necessary to start a
    Folding@home project.

    MPI-enabled.

    Parameters
    ----------
    archive : bool
        A .tgz compressed archive will be created for each individual RUN
        directory.
    """
    set_loglevel(loglevel)

    if mpistate.rank == 0:
        if not os.path.exists(fah_projects_dir):
            os.mkdir(fah_projects_dir)
    mpistate.comm.Barrier()

    targets, templates_resolved_seq = get_targets_and_templates()

    for target in targets:
        if process_only_these_targets and (target.id not in process_only_these_targets):
            continue

        target_project_dir = os.path.join(fah_projects_dir, target.id)
        models_target_dir = os.path.join(default_project_dirnames.models, target.id)
        if not os.path.exists(models_target_dir):
            continue

        mpistate.comm.Barrier()

        sorted_valid_templates = []
        system = None
        renumbered_resnums = {}

        if mpistate.rank == 0:
            logger.info('-------------------------------------------------------------------------')
            logger.info('Building FAH OpenMM project for target {}'.format(target.id))
            logger.info('-------------------------------------------------------------------------')

            valid_templates = get_valid_templates_for_target(
                target,
                templates_resolved_seq,
                process_only_these_templates=process_only_these_templates,
                model_seqid_cutoff=model_seqid_cutoff,
                model_validation_score_cutoff=model_validation_score_cutoff,
                model_validation_score_percentile=model_validation_score_percentile
            )

            sorted_valid_templates = sort_valid_templates_by_seqid(target, valid_templates)

            create_target_project_dir(target)

            system = setup_system_and_integrator_files(
                target,
                sorted_valid_templates[0],
                temperature,
                collision_rate,
                timestep
            )

            renumbered_resnums = get_renumbered_topol_resnums(target)

        sorted_valid_templates = mpistate.comm.bcast(sorted_valid_templates, root=0)
        system = mpistate.comm.bcast(system, root=0)
        renumbered_resnums = mpistate.comm.bcast(renumbered_resnums, root=0)

        logger.debug("Building RUNs in parallel...")
        for run_index in range(mpistate.rank, len(sorted_valid_templates), mpistate.size):
            template = sorted_valid_templates[run_index]

            logger.info('-------------------------------------------------------------------------')
            logger.info('Building RUN{} for template {}'.format(run_index, template))
            logger.info('-------------------------------------------------------------------------')

            source_dir = os.path.join(models_target_dir, template)
            generate_fah_run(
                target_project_dir,
                template,
                source_dir,
                system,
                run_index,
                nclones,
                temperature,
                collision_rate,
                timestep,
                openmm_platform,
                renumbered_resnums,
            )

            if archive:
                tgz_fah_run(target, run_index)

    mpistate.comm.Barrier()
    if mpistate.rank == 0:
        logger.info('Done.')

def download_pdb_file(pdbid, project_pdb_filepath):
    logger.info('Downloading PDB file for: %s' % pdbid)
    pdbgz_page = ensembler.pdb.retrieve_pdb(pdbid, compressed='yes')
    # retrieve_pdb returns gzip-compressed content, so write in binary mode.
    with open(project_pdb_filepath, 'wb') as pdbgz_file:
        pdbgz_file.write(pdbgz_page)

def align_targets_and_templates(process_only_these_targets=None,
                                process_only_these_templates=None,
                                substitution_matrix='gonnet',
                                gap_open=-10,
                                gap_extend=-0.5,
                                loglevel=None):
    """
    Conducts pairwise alignments of target sequences against template
    sequences. Stores Modeller-compatible 'alignment.pir' files in each
    model directory, and also outputs a table of model IDs, sorted by
    sequence identity.

    Parameters
    ----------
    process_only_these_targets:
    process_only_these_templates:
    substitution_matrix: str
        Specify an amino acid substitution matrix available from
        Bio.SubsMat.MatrixInfo
    """
    ensembler.utils.set_loglevel(loglevel)
    targets, templates_resolved_seq = ensembler.core.get_targets_and_templates()
    ntemplates = len(templates_resolved_seq)
    nselected_templates = len(process_only_these_templates) if process_only_these_templates else ntemplates
    for target in targets:
        if process_only_these_targets and target.id not in process_only_these_targets:
            continue

        if mpistate.rank == 0:
            logger.info('Working on target %s...' % target.id)

        models_target_dir = os.path.join(ensembler.core.default_project_dirnames.models, target.id)
        ensembler.utils.create_dir(models_target_dir)

        seq_identity_data_sublist = []

        for template_index in range(mpistate.rank, ntemplates, mpistate.size):
            template_id = templates_resolved_seq[template_index].id
            if os.path.exists(os.path.join(ensembler.core.default_project_dirnames.templates_structures_modeled_loops, template_id + '.pdb')):
                remodeled_seq_filepath = os.path.join(
                    ensembler.core.default_project_dirnames.templates_structures_modeled_loops,
                    template_id + '-pdbfixed.fasta'
                )
                template = list(Bio.SeqIO.parse(remodeled_seq_filepath, 'fasta'))[0]
            else:
                template = templates_resolved_seq[template_index]

            if process_only_these_templates and template_id not in process_only_these_templates:
                continue

            model_dir = os.path.abspath(os.path.join(ensembler.core.default_project_dirnames.models, target.id, template_id))
            ensembler.utils.create_dir(model_dir)
            aln = align_target_template(
                target,
                template,
                substitution_matrix=substitution_matrix,
                gap_open=gap_open,
                gap_extend=gap_extend
            )
            aln_filepath = os.path.join(model_dir, 'alignment.pir')
            write_modeller_pir_aln_file(aln, target, template, pir_aln_filepath=aln_filepath)
            seq_identity_data_sublist.append({
                'templateid': template_id,
                'seq_identity': calculate_seq_identity(aln),
            })

        seq_identity_data_gathered = mpistate.comm.gather(seq_identity_data_sublist, root=0)

        seq_identity_data = []
        if mpistate.rank == 0:
            seq_identity_data = [None] * nselected_templates
            for i in range(nselected_templates):
                seq_identity_data[i] = seq_identity_data_gathered[i % mpistate.size][i // mpistate.size]
        seq_identity_data = mpistate.comm.bcast(seq_identity_data, root=0)

        seq_identity_data = sorted(seq_identity_data, key=lambda x: x['seq_identity'], reverse=True)
        write_sorted_seq_identities(target, seq_identity_data)

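# The substitution_matrix parameter of the variant above names a matrix in
# Bio.SubsMat.MatrixInfo (available in the Biopython versions contemporary
# with this module). Presumably it is resolved by attribute lookup inside
# align_target_template, along these lines (an assumption, shown only for
# illustration):
#
#     from Bio.SubsMat import MatrixInfo
#     matrix = getattr(MatrixInfo, 'gonnet')  # dict mapping residue pairs to scores
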
def determine_nwaters(process_only_these_targets=None, process_only_these_templates=None, model_seqid_cutoff=None, verbose=False, select_at_percentile=None):
    """Determine the distribution of nwaters, and select the value at a
    certain percentile.

    If not user-specified, the percentile is set to 100 if there are fewer
    than 10 templates, otherwise it is set to 68.
    """
    # Run serially.
    if mpistate.rank == 0:
        models_dir = os.path.abspath(ensembler.core.default_project_dirnames.models)

        targets, templates_resolved_seq = ensembler.core.get_targets_and_templates()

        if process_only_these_templates:
            selected_template_indices = [i for i, seq in enumerate(templates_resolved_seq) if seq.id in process_only_these_templates]
        else:
            selected_template_indices = range(len(templates_resolved_seq))

        for target in targets:
            # Process only specified targets if directed.
            if process_only_these_targets and (target.id not in process_only_these_targets):
                continue

            models_target_dir = os.path.join(models_dir, target.id)
            if not os.path.exists(models_target_dir):
                continue

            if model_seqid_cutoff:
                process_only_these_templates = ensembler.core.select_templates_by_seqid_cutoff(target.id, seqid_cutoff=model_seqid_cutoff)
                selected_template_indices = [i for i, seq in enumerate(templates_resolved_seq) if seq.id in process_only_these_templates]

            ntemplates_selected = len(selected_template_indices)

            if not select_at_percentile:
                select_at_percentile = 100 if ntemplates_selected < 10 else 68

            if verbose:
                print("Determining number of waters in each system from target '%s'..." % target.id)

            nwaters_list = []
            for template_index in range(ntemplates_selected):
                template = templates_resolved_seq[selected_template_indices[template_index]]
                if process_only_these_templates and template.id not in process_only_these_templates:
                    continue

                model_dir = os.path.join(models_target_dir, template.id)
                if not os.path.exists(model_dir):
                    continue

                try:
                    nwaters_filename = os.path.join(model_dir, 'nwaters.txt')
                    with open(nwaters_filename, 'r') as nwaters_file:
                        firstline = nwaters_file.readline()
                    nwaters = int(firstline)
                    nwaters_list.append(nwaters)
                except Exception:
                    pass

            nwaters_array = np.array(nwaters_list)
            nwaters_array.sort()

            nwaters_list_filename = os.path.join(models_target_dir, 'nwaters-list.txt')
            with open(nwaters_list_filename, 'w') as nwaters_list_file:
                for nwaters in nwaters_array:
                    nwaters_list_file.write('%12d\n' % nwaters)

            # Display statistics.
            index_selected = int((len(nwaters_array) - 1) * (float(select_at_percentile) / 100.0))
            index68 = int((len(nwaters_array) - 1) * 0.68)
            index95 = int((len(nwaters_array) - 1) * 0.95)
            if len(nwaters_array) > 0:
                logger.info(
                    'Number of waters in solvated models (target: %s): min = %d, max = %d, '
                    'mean = %.1f, 68%% = %.0f, 95%% = %.0f, chosen_percentile (%d%%) = %.0f'
                    % (
                        target.id,
                        nwaters_array.min(),
                        nwaters_array.max(),
                        nwaters_array.mean(),
                        nwaters_array[index68],
                        nwaters_array[index95],
                        select_at_percentile,
                        nwaters_array[index_selected]
                    )
                )

                filename = os.path.join(models_target_dir, 'nwaters-max.txt')
                with open(filename, 'w') as outfile:
                    outfile.write('%d\n' % nwaters_array.max())

                filename = os.path.join(models_target_dir, 'nwaters-use.txt')
                with open(filename, 'w') as outfile:
                    outfile.write('%d\n' % nwaters_array[index_selected])
            else:
                logger.info('No nwaters information found.')

            project_metadata = ensembler.core.ProjectMetadata(project_stage='determine_nwaters', target_id=target.id)
            datestamp = ensembler.core.get_utcnow_formatted()

            metadata = {
                'target_id': target.id,
                'datestamp': datestamp,
                'model_seqid_cutoff': model_seqid_cutoff,
                'select_at_percentile': select_at_percentile,
                'process_only_these_targets': process_only_these_targets,
                'process_only_these_templates': process_only_these_templates,
                'python_version': sys.version.split('|')[0].strip(),
                'python_full_version': ensembler.core.literal_str(sys.version),
                'ensembler_version': ensembler.version.short_version,
                'ensembler_commit': ensembler.version.git_revision,
                'biopython_version': Bio.__version__,
            }
            project_metadata.add_data(metadata)
            project_metadata.write()

    mpistate.comm.Barrier()
    mpistate.comm.Barrier()
    if mpistate.rank == 0:
        print('Done.')

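# Worked example of the percentile-index arithmetic above: with 11 sorted
# nwaters values and select_at_percentile = 68, index_selected =
# int((11 - 1) * 0.68) = int(6.8) = 6, i.e. the 7th-smallest value is
# written to nwaters-use.txt, while the largest value is always written to
# nwaters-max.txt.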