def annotate_with_assay_data(mols, assay_data_filename):
    """
    Attach activity data to molecules as SD tags, matching molecules by title.

    Every molecule read from ``assay_data_filename`` is copied and indexed by
    its title (a later record with the same title replaces an earlier one).
    Each entry of ``mols`` whose title is in that index receives the SD data
    of the matching assayed molecule.

    Parameters
    ----------
    mols : list of OEMol
        List of molecules to annotate; matched molecules are updated in place
    assay_data_filename
        Filename of CSV file containing activity data

    """
    # Build the title -> assayed molecule index
    with oechem.oemolistream(assay_data_filename) as ifs:
        assayed_molecules = {
            record.GetTitle(): oechem.OEGraphMol(record)
            for record in ifs.GetOEGraphMols()
        }
    logging.info(f'Loaded data for {len(assayed_molecules)} assayed molecules')

    # Transfer SD data onto every molecule that has a title match
    nmols_with_assay_data = 0
    for target in mols:
        source = assayed_molecules.get(target.GetTitle())
        if source is None:
            continue
        oechem.OECopySDData(target, source)
        nmols_with_assay_data += 1
    logging.info(
        f'Found assay data for {nmols_with_assay_data} / {len(mols)} molecules'
    )
def get_torsional_confs(mol):
    """Return one standalone molecule per torsion-scan conformer of ``mol``.

    Starting conformers are generated with ``gen_starting_confs``, the
    dihedral named by the ``TORSION_ATOMS_FRAGMENT`` SD tag is looked up, and
    ``get_best_conf`` is asked for 24 scan points about it. Each resulting
    conformer is copied into its own ``OEMol`` that also receives the SD data
    of the input molecule.

    Parameters
    ----------
    mol : OEMol
        Molecule carrying a ``TORSION_ATOMS_FRAGMENT`` SD tag whose value is a
        whitespace-separated list of integer atom indices.

    Returns
    -------
    list of OEMol
        One molecule per conformer, in conformer order.
    """
    starting_confs = gen_starting_confs(mol, TORSION_LIBRARY, True, 20)

    # Each index in the tag is reduced by one (presumably converting a
    # 1-based numbering to 0-based -- confirm against the tag's producer).
    tag_value = get_sd_data(mol, 'TORSION_ATOMS_FRAGMENT')
    atom_indices = [int(token) - 1 for token in tag_value.split()]

    dihedral, _ = get_dihedral(starting_confs, atom_indices)
    scanned = get_best_conf(starting_confs, dihedral, 24)

    def _detach(conformer):
        # Promote a single conformer to its own molecule, keeping mol's SD data
        single = oechem.OEMol(conformer)
        oechem.OECopySDData(single, mol)
        return single

    return [_detach(conformer) for conformer in scanned.GetConfs()]
def get_best_conf(mol, dih, num_points):
    """Drive the primary torsion in the molecule and select the lowest
    energy conformer to represent each dihedral angle.

    A torsion scan (MMFF94 force field, no solvent model) is run about the
    first four atoms of ``dih`` with a step of ``360 / num_points``. Each
    resulting conformer is then set to the exact target angle and labelled
    with ``CONFORMER_LABEL`` / ``TORSION_ANGLE`` SD tags, a ``TORSION_ANGLE``
    double-data entry, and a descriptive title.

    Parameters
    ----------
    mol : OEMol
        Multi-conformer input molecule; also the source of the SD data and
        title copied to the result.
    dih
        Object whose ``GetAtoms()`` yields the four torsion atoms in order.
    num_points : int
        Number of evenly spaced angles over a full turn.

    Returns
    -------
    OEMol
        New molecule holding the labelled scan conformers.
    """
    scan_step = 360.0 / num_points
    target_angles = [2 * i * oechem.Pi / num_points for i in range(num_points)]
    atoms = list(dih.GetAtoms())
    base_title = mol.GetTitle()

    scan_opts = oeszybki.OETorsionScanOptions()
    scan_opts.SetDelta(scan_step)
    scan_opts.SetForceFieldType(oeszybki.OEForceFieldType_MMFF94)
    scan_opts.SetSolvationType(oeszybki.OESolventModel_NoSolv)

    atom_a, atom_b, atom_c, atom_d = atoms[0], atoms[1], atoms[2], atoms[3]
    torsion = oechem.OETorsion(atom_a, atom_b, atom_c, atom_d, 0.0)

    # Create new output OEMol
    tor_mol = oechem.OEMol()
    oeszybki.OETorsionScan(tor_mol, mol, torsion, scan_opts)
    oechem.OECopySDData(tor_mol, mol)

    # if 0 and 360 sampled because of rounding: walk to the final conformer
    # and drop it
    if tor_mol.NumConfs() > num_points:
        for trailing_conf in tor_mol.GetConfs():
            pass
        tor_mol.DeleteConf(trailing_conf)

    for angle_rad, conformer in zip(target_angles, tor_mol.GetConfs()):
        angle_deg = int(round(angle_rad * oechem.Rad2Deg))
        tor_mol.SetActive(conformer)
        oechem.OESetTorsion(conformer, atom_a, atom_b, atom_c, atom_d, angle_rad)

        label = base_title + '_{:02d}'.format(conformer.GetIdx())
        angle_text = "{:.0f}".format(angle_deg)
        oechem.OESetSDData(conformer, 'CONFORMER_LABEL', label)
        oechem.OESetSDData(conformer, 'TORSION_ANGLE', angle_text)
        conformer.SetDoubleData('TORSION_ANGLE', angle_deg)
        conformer.SetTitle('{}: Angle {:.0f}'.format(label, angle_deg))

    return tor_mol
def enumerate_conformations(name, smiles=None, pdbname=None):
    """Run Epik to get protonation states using PDB residue templates for naming.

    The input structure is either downloaded from RCSB Ligand Expo
    (``pdbname``) or built from ``smiles``. It is written to mol2, passed
    through Epik, and every resulting state is charged and written out
    together with its Epik state penalty.

    All files are written to ``<output_dir>/<name>/`` with the prefix
    ``<name>``: ``-input.pdb``/``-input.sdf`` (only for ``pdbname``),
    ``-input.mol2``, ``-epik.mae``, ``-epik.sdf``, ``-epik.mol2``,
    ``-state-penalties.out``, ``-epik-charged.pdb`` and ``-epik-charged.mol2``.

    Parameters
    ----------
    name : str
       Common name of molecule (used to create subdirectory)
    smiles : str
       Isomeric SMILES string; only used when ``pdbname`` is not given
    pdbname : str
       Three-letter PDB code (e.g. 'DB8'). If it contains spaces, only the
       first space-separated entry is used.

    Returns
    -------
    None
        Also returns early (after printing a traceback) when the initial
        charging of a SMILES-derived molecule raises ``RuntimeError``.

    Raises
    ------
    ValueError
        If neither ``smiles`` nor ``pdbname`` is provided.
    """
    # Create output subfolder (exist_ok avoids the isdir/mkdir race)
    output_subdir = os.path.join(output_dir, name)
    os.makedirs(output_subdir, exist_ok=True)
    output_basepath = os.path.join(output_subdir, name)

    if pdbname:
        # Make sure to only use one entry if there are multiple
        if ' ' in pdbname:
            pdbnames = pdbname.split(' ')
            print("Splitting '%s' into first entry only: '%s'" % (pdbname, pdbnames[0]))
            pdbname = pdbnames[0]

        # Retrieve PDB (for atom names)
        url = 'http://ligand-expo.rcsb.org/reports/%s/%s/%s_model.pdb' % (pdbname[0], pdbname, pdbname)
        pdb_filename = output_basepath + '-input.pdb'
        retrieve_url(url, pdb_filename)
        pdb_molecule = read_molecule(pdb_filename)

        # Retrieve SDF (for everything else)
        url = 'http://ligand-expo.rcsb.org/reports/%s/%s/%s_model.sdf' % (pdbname[0], pdbname, pdbname)
        sdf_filename = output_basepath + '-input.sdf'
        retrieve_url(url, sdf_filename)
        sdf_molecule = read_molecule(sdf_filename)

        # Replace atom names in SDF.
        # NOTE(review): zip() silently stops at the shorter atom list; this
        # assumes both files list the same atoms in the same order.
        for (sdf_atom, pdb_atom) in zip(sdf_molecule.GetAtoms(), pdb_molecule.GetAtoms()):
            sdf_atom.SetName(pdb_atom.GetName())

        # Assign Tripos atom types
        oechem.OETriposAtomTypeNames(sdf_molecule)
        oechem.OETriposBondTypeNames(sdf_molecule)
        oe_molecule = sdf_molecule

        # We already know the residue name
        residue_name = pdbname
    elif smiles:
        # Generate molecule geometry with OpenEye
        print("Generating molecule {}".format(name))
        oe_molecule = openeye.smiles_to_oemol(smiles)

        # Assign Tripos atom types
        oechem.OETriposAtomTypeNames(oe_molecule)
        oechem.OETriposBondTypeNames(oe_molecule)
        try:
            oe_molecule = openeye.get_charges(oe_molecule, keep_confs=1)
        except RuntimeError:
            traceback.print_exc()
            print("Skipping molecule " + name)
            return

        # Residue name: first three letters of the upper-cased name
        residue_name = re.sub('[^A-Za-z]+', '', name.upper())[:3]
    else:
        raise ValueError('Must provide SMILES string or pdbname')

    # Save mol2 file, preserving atom names
    print("Running epik on molecule {}".format(name))
    mol2_file_path = output_basepath + '-input.mol2'
    write_mol2_preserving_atomnames(mol2_file_path, oe_molecule, residue_name)

    # Run epik on mol2 file
    mae_file_path = output_basepath + '-epik.mae'
    schrodinger.run_epik(mol2_file_path, mae_file_path, tautomerize=False,
                         max_structures=100,
                         min_probability=np.exp(-MAX_ENERGY_PENALTY), ph=7.4)

    # Convert maestro file to sdf and mol2
    output_sdf_filename = output_basepath + '-epik.sdf'
    output_mol2_filename = output_basepath + '-epik.mol2'
    schrodinger.run_structconvert(mae_file_path, output_sdf_filename)
    schrodinger.run_structconvert(mae_file_path, output_mol2_filename)

    # Open the SDF (source of the Epik tags) and MOL2 (structure to charge)
    # files; they are read in lockstep below.
    ifs_sdf = oechem.oemolistream()
    ifs_sdf.SetFormat(oechem.OEFormat_SDF)
    ifs_sdf.open(output_sdf_filename)
    sdf_molecule = oechem.OEGraphMol()

    ifs_mol2 = oechem.oemolistream()
    ifs_mol2.open(output_mol2_filename)
    mol2_molecule = oechem.OEMol()

    # Assign charges.
    charged_molecules = list()
    index = 0
    try:
        while oechem.OEReadMolecule(ifs_sdf, sdf_molecule):
            # NOTE(review): the result of this read is not checked; the two
            # files are assumed to hold the same states in the same order.
            oechem.OEReadMolecule(ifs_mol2, mol2_molecule)

            index += 1
            print("Charging molecule %d" % (index))
            try:
                # Charge molecule.
                charged_molecule = openeye.get_charges(mol2_molecule, max_confs=800,
                                                       strictStereo=False, normalize=True,
                                                       keep_confs=None)

                # Assign Tripos types
                oechem.OETriposAtomTypeNames(charged_molecule)
                oechem.OETriposBondTypeNames(charged_molecule)

                # Store tags.
                oechem.OECopySDData(charged_molecule, sdf_molecule)

                # Store molecule
                charged_molecules.append(charged_molecule)
            except Exception as e:
                # Best effort: a state that cannot be charged is dropped
                print(e)
                print("Skipping protomer/tautomer because of failed charging.")
    finally:
        # Clean up, even if reading a record raised
        ifs_sdf.close()
        ifs_mol2.close()

    # Write state penalties (one per line, in the order of charged_molecules).
    with open(output_basepath + '-state-penalties.out', 'w') as outfile:
        for charged_molecule in charged_molecules:
            epik_State_Penalty = float(oechem.OEGetSDData(charged_molecule, "r_epik_State_Penalty"))
            outfile.write('%16.8f\n' % epik_State_Penalty)

    # Write as PDB
    charged_pdb_filename = output_basepath + '-epik-charged.pdb'
    ofs = oechem.oemolostream(charged_pdb_filename)
    try:
        flavor = (oechem.OEOFlavor_PDB_CurrentResidues
                  | oechem.OEOFlavor_PDB_ELEMENT
                  | oechem.OEOFlavor_PDB_BONDS
                  | oechem.OEOFlavor_PDB_HETBONDS
                  | oechem.OEOFlavor_PDB_BOTH)
        ofs.SetFlavor(oechem.OEFormat_PDB, flavor)
        for charged_molecule in charged_molecules:
            # Fix residue names
            for atom in charged_molecule.GetAtoms():
                residue = oechem.OEAtomGetResidue(atom)
                residue.SetName(residue_name)
                oechem.OEAtomSetResidue(atom, residue)
            oechem.OEWriteMolecule(ofs, charged_molecule)
    finally:
        ofs.close()

    # Write molecules as mol2.
    charged_mol2_filename = output_basepath + '-epik-charged.mol2'
    write_mol2_preserving_atomnames(charged_mol2_filename, charged_molecules, residue_name)
# Load the first molecule of the bound-ligand file as the reference molecule
refmol_filename = f'../receptors/monomer/Mpro-{fragment}_0_bound-ligand.mol2'
refmol = None
with oechem.oemolistream(refmol_filename) as ifs:
    for mol in ifs.GetOEGraphMols():
        refmol = mol
        break
if refmol is None:
    raise Exception(f'Could not read {refmol_filename}')
print(f'Reference molecule has {refmol.NumAtoms()} atoms')

# Replace title
refmol.SetTitle(fragments[fragment])

# Copy data from assayed molecules (if present)
# NOTE(review): the message below says "target_molecules" although the loop
# searches assayed_molecules; the runtime text is left unchanged.
for mol in assayed_molecules:
    if mol.GetTitle() == fragments[fragment]:
        print(f'{refmol.GetTitle()} found in target_molecules; copying SDData')
        oechem.OECopySDData(refmol, mol)
        break

# Read target molecules
target_molecules_filename = prefix + '.csv'
print('Reading target molecules...')
from openeye import oechem
# NOTE(review): nothing in this view appends to target_molecules; presumably
# the list is filled by code that follows -- confirm.
target_molecules = list()
with oechem.oemolistream(target_molecules_filename) as ifs:
    for mol in ifs.GetOEGraphMols():
        # Copy data from assayed molecules (if present)
        for assayed_mol in assayed_molecules:
            if assayed_mol.GetTitle() == mol.GetTitle():
                print(f'{mol.GetTitle()} found in assayed data; copying SDData')
                # Destination is the target molecule and source is the matching
                # assayed molecule (previously the target's data was copied
                # onto refmol and the assayed data was never used).
                oechem.OECopySDData(mol, assayed_mol)
                break
def enumerate_conformations(name, pdbfile=None, smiles=None, pdbname=None, pH=7.4):
    """Run Epik to get protonation states using PDB residue templates for naming.

    The ligand is downloaded from RCSB Ligand Expo, written to mol2, passed
    through Epik at the requested pH, and every resulting state is charged
    and written out together with its Epik state penalty.

    All files are written to the current working directory with the prefix
    ``name``: ``-rcsb_download.pdb``, ``-rcsb_download.sdf``,
    ``-before_epik.mol2``, ``-epik.mae``, ``-after_epik.sdf``,
    ``-after_epik.mol2``, ``-state-penalties.out``, ``-charged_output.pdb``,
    ``-charged_output.mol2`` and, for each state that failed charging,
    ``-conformers-failed-state-<N>-.mol2`` (when conformers were available).
    The accumulated log is also written to ``log.txt``.

    Parameters
    ----------
    name : str
       Common name of molecule (used as the prefix of every output file)
    pdbfile : str, optional
       Currently ignored; input from a local PDB file is disabled.
    smiles : str, optional
       Currently ignored; input from a SMILES string is disabled.
    pdbname : str
       Three-letter PDB code (e.g. 'DB8'). If it contains spaces, only the
       first space-separated entry is used. Required.
    pH : float
       pH value passed to Epik.

    Returns
    -------
    log : str
        Text log of the run (identical to what is written to ``log.txt``).
    success_status : bool
        False if any state raised during charging, True otherwise.

    Raises
    ------
    ValueError
        If ``pdbname`` is empty or None.
    """
    oehandler = openeye.oechem.OEThrow
    # Capture OpenEye messages in a string stream so they can be logged.
    # NOTE(review): the handler's output stream is not restored afterwards.
    oss = oechem.oeosstream()
    oehandler.SetOutputStream(oss)
    log = "New run:\nPDB code: {pdbname}; Molecule: {name}; pH {pH}\n".format(
        pdbname=pdbname, name=name, pH=pH)
    success_status = True

    # NOTE: the SMILES and pdbfile input paths are disabled for the moment;
    # only pdbname is supported.
    if not pdbname:
        raise ValueError('Must provide SMILES string or pdbname, or pdbfile')

    # Make sure to only use one entry if there are multiple
    if ' ' in pdbname:
        pdbnames = pdbname.split(' ')
        log += "Splitting '%s' into first entry only: '%s'" % (pdbname, pdbnames[0])
        pdbname = pdbnames[0]

    # Retrieve PDB (for atom names)
    url = 'http://ligand-expo.rcsb.org/reports/%s/%s/%s_model.pdb' % (
        pdbname[0], pdbname, pdbname)
    pdb_filename = name + '-rcsb_download.pdb'
    log += "Retrieving PDB structure from RCSB ligand expo: {}.\n".format(
        pdb_filename)
    retrieve_url(url, pdb_filename)
    log += "Parsing PDB file.\n"
    pdb_molecule = read_molecule(pdb_filename)

    # Retrieve SDF (for everything else)
    url = 'http://ligand-expo.rcsb.org/reports/%s/%s/%s_model.sdf' % (
        pdbname[0], pdbname, pdbname)
    sdf_filename = name + '-rcsb_download.sdf'
    log += "Retrieving SDF structure from RCSB ligand expo: {}.\n".format(
        sdf_filename)
    retrieve_url(url, sdf_filename)
    log += "Parsing SDF file.\n"
    sdf_molecule = read_molecule(sdf_filename)

    # Replace atom names in SDF.
    # NOTE(review): zip() silently stops at the shorter atom list; this
    # assumes both files list the same atoms in the same order.
    log += "Canonicalizing atom names.\n"
    for (sdf_atom, pdb_atom) in zip(sdf_molecule.GetAtoms(), pdb_molecule.GetAtoms()):
        sdf_atom.SetName(pdb_atom.GetName())

    # Assign Tripos atom types
    log += "Assign atom type names.\n"
    oechem.OETriposAtomTypeNames(sdf_molecule)
    oechem.OETriposBondTypeNames(sdf_molecule)
    oe_molecule = sdf_molecule

    # We already know the residue name
    residue_name = pdbname

    # Save mol2 file, preserving atom names
    log += "Running Epik.\n"
    mol2_file_path = name + '-before_epik.mol2'
    write_mol2_preserving_atomnames(mol2_file_path, oe_molecule, residue_name)

    # Run epik on mol2 file
    mae_file_path = name + '-epik.mae'
    schrodinger.run_epik(mol2_file_path, mae_file_path, tautomerize=False,
                         max_structures=50,
                         min_probability=np.exp(-MAX_ENERGY_PENALTY), ph=pH)
    log += "Epik run completed.\n"

    # Convert maestro file to sdf and mol2
    output_sdf_filename = name + '-after_epik.sdf'
    output_mol2_filename = name + '-after_epik.mol2'
    schrodinger.run_structconvert(mae_file_path, output_sdf_filename)
    schrodinger.run_structconvert(mae_file_path, output_mol2_filename)

    # Open the SDF (source of the Epik tags) and MOL2 (structure to charge)
    # files; they are read in lockstep below.
    ifs_sdf = oechem.oemolistream()
    ifs_sdf.SetFormat(oechem.OEFormat_SDF)
    ifs_sdf.open(output_sdf_filename)
    sdf_molecule = oechem.OEGraphMol()

    ifs_mol2 = oechem.oemolistream()
    ifs_mol2.open(output_mol2_filename)
    mol2_molecule = oechem.OEMol()

    # Assign charges.
    # reset count of error handler
    oehandler.Clear()
    log += "Assigning charges to protonation states.\n"
    charged_molecules = list()
    index = 0
    failed_states = set()
    try:
        while oechem.OEReadMolecule(ifs_sdf, sdf_molecule):
            # NOTE(review): the result of this read is not checked; the two
            # files are assumed to hold the same states in the same order.
            oechem.OEReadMolecule(ifs_mol2, mol2_molecule)

            index += 1
            log += "State {0:d}\n".format(index)

            # Reset per state so that the failure handler can never write the
            # conformers left over from a previous state.
            charged_molecule_conformers = None
            try:
                # Charge molecule.
                charged_molecule_conformers = omtoe.get_charges(
                    mol2_molecule, max_confs=800, strictStereo=False,
                    normalize=True, keep_confs=-1)
                log += "Charging stage output:\n"
                OEOutput = str(oss)
                log += OEOutput
                log += "\nCharging state completed.\n"

                # Restore coordinates to original
                charged_molecule = select_conformers(
                    charged_molecule_conformers, mol2_molecule, keep_confs=None)

                # Assign Tripos types
                oechem.OETriposAtomTypeNames(charged_molecule)
                oechem.OETriposBondTypeNames(charged_molecule)

                # Store tags.
                oechem.OECopySDData(charged_molecule, sdf_molecule)

                # Store molecule
                charged_molecules.append(charged_molecule)

                # Check for failure in the log.
                # NOTE(review): if this raises, the state has already been
                # appended above and is also recorded as failed.
                openeye_charge_log_parser(OEOutput, True)
                oehandler.Clear()
            except Exception as e:
                failed_states.add(index)
                logging.info(e)
                log += "State failed charging.\n"
                log += str(e)
                log += "\n"

                # Keep the conformers of the failed state for inspection
                filename_failure = name + '-conformers-failed-state-{}-.mol2'.format(
                    index)
                stored = False
                if charged_molecule_conformers is not None:
                    try:
                        write_mol2_preserving_atomnames(
                            filename_failure, charged_molecule_conformers,
                            residue_name)
                        stored = True
                    except Exception as write_error:
                        logging.info(write_error)
                if not stored:
                    log += "Could not store result, most likely failed during Omega step!\n"

                success_status = False
                oehandler.Clear()
    finally:
        # Clean up, even if reading a record raised
        ifs_sdf.close()
        ifs_mol2.close()

    # Write state penalties (one per line, in the order of charged_molecules).
    with open(name + '-state-penalties.out', 'w') as outfile:
        for state_number, charged_molecule in enumerate(charged_molecules, start=1):
            log += "Writing Epik data for state {:d}\n".format(state_number)
            epik_State_Penalty = float(
                oechem.OEGetSDData(charged_molecule, "r_epik_State_Penalty"))
            outfile.write('%16.8f\n' % epik_State_Penalty)

    # Write as PDB
    charged_pdb_filename = name + '-charged_output.pdb'
    ofs = oechem.oemolostream(charged_pdb_filename)
    try:
        flavor = (oechem.OEOFlavor_PDB_CurrentResidues
                  | oechem.OEOFlavor_PDB_ELEMENT
                  | oechem.OEOFlavor_PDB_BONDS
                  | oechem.OEOFlavor_PDB_HETBONDS
                  | oechem.OEOFlavor_PDB_BOTH)
        ofs.SetFlavor(oechem.OEFormat_PDB, flavor)
        for charged_molecule in charged_molecules:
            # Fix residue names
            for atom in charged_molecule.GetAtoms():
                residue = oechem.OEAtomGetResidue(atom)
                residue.SetName(residue_name)
                oechem.OEAtomSetResidue(atom, residue)
            oechem.OEWriteMolecule(ofs, charged_molecule)
    finally:
        ofs.close()

    # Write molecules as mol2.
    charged_mol2_filename = name + '-charged_output.mol2'
    write_mol2_preserving_atomnames(charged_mol2_filename, charged_molecules,
                                    residue_name)

    log += "Run completed.\n"
    if success_status:
        log += "Status: Success\n"
    else:
        log += "Status: Failure\n"
        log += "Failed states: {}\n".format(
            " ".join(str(state) for state in sorted(failed_states)))

    with open("log.txt", 'w') as logfile:
        logfile.write(log)

    return log, success_status
def GetBestOverlays(self, querymolstr, options, iformat, oformat):
    """ Return a string of the format specified by 'oformat'
    containing the hits overlaid on the query, one conformer per hit,
    using querymolstr as the query interpreted as iformat.

    querymolstr - a string containing a molecule to use as the query

    options - an instance of OEShapeDatabaseOptions

    iformat - a string representing the file extension to parse
              the querymolstr as. Note: old clients could be passing
              .sq files, so iformat == '.oeb' will try to interpret
              the file as a .sq file.

    oformat - file format to write the results as

    Raises ValueError when the query cannot be opened or parsed, or when
    the output string stream cannot be opened.
    """
    stopwatch = oechem.OEWallTimer()

    # make sure to wait for the load to finish (blocking call)
    is_ready = self.IsLoaded(True)
    assert is_ready

    if iformat.startswith(".sq"):
        query = ReadShapeQuery(querymolstr)
    else:
        # read in query
        query_stream = SetupStream(oechem.oemolistream(), iformat)
        if not query_stream.openstring(querymolstr):
            raise ValueError("Unable to open input molecule string")

        query = oechem.OEGraphMol()
        if not oechem.OEReadMolecule(query_stream, query):
            if iformat != ".oeb":
                raise ValueError(
                    "Unable to read a molecule from the string of format '%s'"
                    % iformat)
            # could be an old client trying to send a .sq file.
            query = ReadShapeQuery(querymolstr)

    result_stream = SetupStream(oechem.oemolostream(), oformat)
    if not result_stream.openstring():
        raise ValueError("Unable to openstring for output")

    # this is a "Write" lock to be paranoid and not overload the GPU
    self.rwlock.AcquireWriteLock()
    try:
        # do search
        scores = self.shapedb.GetSortedScores(query, options)
        sys.stderr.write("%f seconds to do search\n" % stopwatch.Elapsed())
    finally:
        self.rwlock.ReleaseWriteLock()

    stopwatch.Start()

    # write results
    for score in scores:
        multi_conf = oechem.OEMol()
        if not self.moldb.GetMolecule(multi_conf, score.GetMolIdx()):
            oechem.OEThrow.Warning(
                "Can't retrieve molecule %i from the OEMolDatabase, "
                "skipping..." % score.GetMolIdx())
            continue

        # remove hydrogens to make output smaller, this also
        # ensures OEPrepareFastROCSMol will have the same output
        oechem.OESuppressHydrogens(multi_conf)

        # keep only the conformer that produced this score
        conf_filter = oechem.OEHasConfIdx(score.GetConfIdx())
        hit = oechem.OEGraphMol(multi_conf.GetConf(conf_filter))
        oechem.OECopySDData(hit, multi_conf)

        # SD tag name -> score accessor, chosen by similarity function
        if options.GetSimFunc() == oefastrocs.OEShapeSimFuncType_Tanimoto:
            tagged_scores = (
                ("ShapeTanimoto", score.GetShapeTanimoto),
                ("ColorTanimoto", score.GetColorTanimoto),
                ("TanimotoCombo", score.GetTanimotoCombo),
            )
        else:
            tagged_scores = (
                ("ShapeTversky", score.GetShapeTversky),
                ("ColorTversky", score.GetColorTversky),
                ("TverskyCombo", score.GetTverskyCombo),
            )
        for tag, read_score in tagged_scores:
            oechem.OESetSDData(hit, tag, "%.4f" % read_score())

        # record the starting orientation only when it is not the inertial one
        if options.GetInitialOrientation() != oefastrocs.OEFastROCSOrientation_Inertial:
            oechem.OEAddSDData(
                hit, "Opt. Starting Pos.",
                GetAltStartsString(options.GetInitialOrientation()))

        score.Transform(hit)
        oechem.OEWriteMolecule(result_stream, hit)

    output = result_stream.GetString()
    sys.stderr.write("%f seconds to write hitlist\n" % stopwatch.Elapsed())
    sys.stderr.flush()
    result_stream.close()
    return output
def split_confs(mol):
    """Lazily yield each conformer of ``mol`` as its own molecule.

    Every yielded ``OEMol`` is built from a single conformer and receives the
    SD data of the parent molecule.
    """
    def _standalone(conformer):
        single = oechem.OEMol(conformer)
        oechem.OECopySDData(single, mol)
        return single

    yield from map(_standalone, mol.GetConfs())