def test_repeating_molecules(self):
    """
    Test methods used to create minidrugbank

    For every index, the isomeric SMILES generated from the parm@frosst
    molecule (``TestMiniDrugBank.ff_mols``) must equal the isomeric SMILES
    generated from the tripos molecule at the same index
    (``TestMiniDrugBank.tripos_mols``), and no isomeric SMILES may be seen
    more than once across the whole set.
    """
    seen_smiles = set()
    for idx, ff_mol in enumerate(TestMiniDrugBank.ff_mols):
        # get SMILES information for both representations of the molecule
        ff_smile = oechem.OECreateIsoSmiString(ff_mol)
        tri_mol = TestMiniDrugBank.tripos_mols[idx]
        tri_smile = oechem.OECreateIsoSmiString(tri_mol)

        # SMILES should be the same for the two force fields.
        # Both titles are *called* and both conversions are "%s" so the
        # failure message actually names the offending molecules.
        mismatch_msg = (
            "SMILES for tripos molecule %s and parm@frosst molecule %s should agree and don't"
            % (tri_mol.GetTitle(), ff_mol.GetTitle()))
        self.assertEqual(ff_smile, tri_smile, msg=mismatch_msg)

        # there should also be no repeating smiles
        self.assertFalse(
            (ff_smile in seen_smiles),
            msg="Found repeating SMILES string for %s" % ff_mol.GetTitle())

        # remember this SMILES for the remaining molecules
        seen_smiles.add(ff_smile)
def descriptorToMol(self, descr, descrType, limitPerceptions=False, messageTag=None):
    """Build a molecule object from the input descriptor string.

    SMILES and InChI descriptors are parsed, re-expressed as an OE SMILES
    string (isomeric for ISO SMILES and InChI input, canonical for any other
    SMILES input) and that string is parsed again to give the returned
    molecule. SMARTS descriptors are handed directly to smartsToQmol().

    Args:
        descr (str): descriptor
        descrType (str): descriptor type, matched case-insensitively on the
            substrings SMILES (optionally with ISO), INCHI and SMARTS
        limitPerceptions (bool): flag to limit the perceptions/transformations of input descriptor
        messageTag (str, optional): prefix string for error messages. Defaults to None.

    Returns:
        object: result of smilesToMol()/smartsToQmol(), or None for an
            unrecognized type, an unparsable descriptor or any exception
    """
    try:
        typeU = descrType.upper()
        if "SMILES" in typeU:
            parsedMol = self.smilesToMol(descr, limitPerceptions=limitPerceptions, messageTag=messageTag)
            if not parsedMol:
                return None
            if "ISO" in typeU:
                roundTripSmiles = oechem.OECreateIsoSmiString(parsedMol)
            else:
                roundTripSmiles = oechem.OECreateCanSmiString(parsedMol)
            return self.smilesToMol(roundTripSmiles, limitPerceptions=limitPerceptions, messageTag=messageTag)
        if "INCHI" in typeU:
            parsedMol = self.inchiToMol(descr, limitPerceptions=limitPerceptions, messageTag=messageTag)
            if not parsedMol:
                return None
            roundTripSmiles = oechem.OECreateIsoSmiString(parsedMol)
            return self.smilesToMol(roundTripSmiles, limitPerceptions=limitPerceptions, messageTag=messageTag)
        if "SMARTS" in typeU:
            return self.smartsToQmol(descr, messageTag=messageTag)
    except Exception as e:
        logger.exception("Failing with %s", str(e))
    return None
def canonicalize_SMILES(smiles_list):
    """Standardize SMILES strings by a round trip through a Topology.

    Each SMILES string is converted to a molecule, the molecule to a
    Topology, the first residue of that Topology back to a molecule, and the
    result is written out as an isomeric SMILES string.
    Stereochemistry must already have been expanded.

    Parameters
    ----------
    smiles_list : list of str
        List of SMILES strings

    Returns
    -------
    canonical_smiles_list : list of str
        One round-tripped SMILES string per input entry, in input order.
    """
    from openmoltools.forcefield_generators import generateOEMolFromTopologyResidue, generateTopologyFromOEMol
    from perses.utils.openeye import smiles_to_oemol
    from openeye import oechem

    def _round_trip(smiles):
        # SMILES -> molecule -> Topology -> first residue -> molecule -> SMILES
        topology = generateTopologyFromOEMol(smiles_to_oemol(smiles))
        first_residue = list(topology.residues())[0]
        rebuilt = generateOEMolFromTopologyResidue(first_residue)
        return oechem.OECreateIsoSmiString(rebuilt)

    return [_round_trip(smiles) for smiles in smiles_list]
def find_parameter_ids(filename: str, indices: set) -> \
        ({int : {"smiles": str, "ids": {"id": ["atom_indices"]}}}, set):
    """Collect the parameter ids assigned to each selected molecule.

    Every (molecule, index) pair produced by ``read_index_mols_from_file`` is
    given explicit hydrogens, converted to an isomeric SMILES string and
    passed to ``get_smirnoff_params``.

    Returns:
      - params_by_molecule: mapping from molecule index to a dict holding the
        molecule's "smiles" string and its parameter "ids"
      - param_ids: set of all parameter ids found across the molecules
    """
    logging.info("Finding parameters for molecules")

    per_molecule = {}
    all_ids = set()
    for mol, index in read_index_mols_from_file(filename, indices):
        oechem.OEAddExplicitHydrogens(mol)
        iso_smiles = oechem.OECreateIsoSmiString(mol)
        logging.info("Looking at molecule %d => %s", index, iso_smiles)

        mol_params = get_smirnoff_params(mol)
        logging.info("Parameter IDs: %s", list(mol_params.keys()))

        all_ids |= mol_params.keys()
        per_molecule[index] = dict(smiles=iso_smiles, ids=mol_params)

    return per_molecule, all_ids
def frag_to_smiles(frags, mol):
    """
    Group fragments (AtomBondSet) by the isomeric SMILES of their sub-molecule

    Parameters
    ----------
    frags : iterable of fragments exposing GetAtoms() and GetBonds()
    mol : parent molecule the fragments were taken from

    Returns
    -------
    smiles : dict mapping each isomeric SMILES string to the list of
        fragments that produced it, in input order
    """
    grouped = {}
    for frag in frags:
        atom_pred = oechem.OEIsAtomMember(frag.GetAtoms())
        bond_pred = oechem.OEIsBondMember(frag.GetBonds())
        submol = oechem.OEGraphMol()
        # final argument is the adjustHCount flag
        oechem.OESubsetMol(submol, mol, atom_pred, bond_pred, True)
        grouped.setdefault(oechem.OECreateIsoSmiString(submol), []).append(frag)
    return grouped
def descriptorToSmiles(self, descr, descrType, limitPerceptions=False, messageTag=None):
    """Convert the input descriptor string into an OE SMILES string.

    ISO SMILES and InChI input yield an isomeric SMILES; any other SMILES
    input yields a canonical SMILES.

    Args:
        descr (str): descriptor
        descrType (str): descriptor type, matched case-insensitively on the
            substrings SMILES (optionally with ISO) and INCHI
        limitPerceptions (bool): flag to limit the perceptions/transformations of input descriptor
        messageTag (str, optional): prefix string for error messages. Defaults to None.

    Returns:
        str: SMILES string, or None for an unrecognized type, an unparsable
            descriptor or any exception
    """
    try:
        typeU = descrType.upper()
        if "SMILES" in typeU:
            parsedMol = self.smilesToMol(descr, limitPerceptions=limitPerceptions, messageTag=messageTag)
            if not parsedMol:
                return None
            if "ISO" in typeU:
                return oechem.OECreateIsoSmiString(parsedMol)
            return oechem.OECreateCanSmiString(parsedMol)
        if "INCHI" in typeU:
            parsedMol = self.inchiToMol(descr, limitPerceptions=limitPerceptions, messageTag=messageTag)
            return oechem.OECreateIsoSmiString(parsedMol) if parsedMol else None
    except Exception as e:
        logger.exception("Failing with %s", str(e))
    return None
def test_molecular_atom_mapping():
    """
    Build and render atom maps between molecules of the JACS benchmark set.

    The first ligand of each dataset is mapped onto every other ligand; each
    mapping is rendered to a PNG file and a short report is printed so the
    result can be inspected by hand.
    """
    from openeye import oechem
    from perses.rjmc.topology_proposal import SmallMoleculeSetProposalEngine
    from perses.tests.utils import createOEMolFromSMILES
    from perses.tests.utils import render_atom_mapping
    from itertools import combinations

    # Only CDK2 is enabled; the other JACS sets are
    # 'p38', 'Tyk2', 'Thrombin', 'PTP1B', 'MCL1', 'Jnk1', 'Bace'
    for dataset_name in ['CDK2']:
        # Read molecules (each one is copied while the stream is iterated)
        dataset_path = 'data/schrodinger-jacs-datasets/%s_ligands.sdf' % dataset_name
        mol2_filename = resource_filename('perses', dataset_path)
        ifs = oechem.oemolistream(mol2_filename)
        molecules = [oechem.OEGraphMol(mol) for mol in ifs.GetOEGraphMols()]

        # Mapping all pairwise combinations is too slow, so only the first
        # molecule is mapped onto each of the others.
        molecule1 = molecules[0]
        for i, molecule2 in enumerate(molecules[1:]):
            new_to_old_atom_map = SmallMoleculeSetProposalEngine._get_mol_atom_map(
                molecule1, molecule2)

            # Atom lists for the (currently disabled) check that hydrogens
            # are not mapped onto anything else.
            atoms1 = list(molecule1.GetAtoms())
            atoms2 = list(molecule2.GetAtoms())

            filename = 'mapping-error-%d.png' % i
            render_atom_mapping(filename, molecule1, molecule2, new_to_old_atom_map)

            report = ''.join([
                'molecule 1 : %s\n' % oechem.OECreateIsoSmiString(molecule1),
                'molecule 2 : %s\n' % oechem.OECreateIsoSmiString(molecule2),
                'Wrote atom mapping to %s for inspection; please check this.' % filename,
                str(new_to_old_atom_map),
            ])
            print(report)
def filter_molecules(input_molstream, output_molstream, allow_repeats=False,
                     allow_warnings=False, max_heavy_atoms=100,
                     remove_smirks=None, max_metals=0, explicitHs=True,
                     elements=None, check_type=None):
    """
    Takes input file and removes molecules using given criteria then
    writes a new output file

    Parameters
    ----------
    input_molstream : open molecule input stream read with OEReadMolecule
    output_molstream : open molecule output stream written with OEWriteMolecule
    allow_repeats : bool
        if False, a molecule whose isomeric SMILES was already saved is skipped
    allow_warnings : bool
        if False, a molecule is skipped when the captured OEChem log contains
        the word "warning" after it was read
    max_heavy_atoms, remove_smirks, max_metals, elements, check_type :
        forwarded unchanged to keep_molecule(); remove_smirks defaults to an
        empty list
    explicitHs : bool
        if True, explicit hydrogens are added to the copy that is tested and
        written
    """
    if remove_smirks is None:
        remove_smirks = []

    # Capture OEChem messages so parse warnings can be detected per molecule
    errs = oechem.oeosstream()
    oechem.OEThrow.SetOutputStream(errs)

    molecule = oechem.OECreateOEGraphMol()
    # A set gives constant-time duplicate checks; only membership is needed
    seen_smiles = set()
    count = 0
    warnings = 0
    smile_count = 0
    saved = 0

    while oechem.OEReadMolecule(input_molstream, molecule):
        count += 1

        log_text = errs.str()
        # NOTE(review): the stream content may be returned as bytes under
        # Python 3 - decoded defensively so the substring test cannot fail.
        if isinstance(log_text, bytes):
            log_text = log_text.decode("utf-8", errors="replace")
        if ("warning" in log_text.lower()) and not allow_warnings:
            warnings += 1
            errs.clear()
            continue

        smi = oechem.OECreateIsoSmiString(molecule)
        # Work on a copy so the reusable read buffer is left untouched
        mol_copy = oechem.OEMol(molecule)
        if explicitHs:
            oechem.OEAddExplicitHydrogens(mol_copy)

        new_smile = smi not in seen_smiles
        if not new_smile:
            smile_count += 1

        if new_smile or allow_repeats:
            keep = keep_molecule(mol_copy, max_heavy_atoms, remove_smirks,
                                 max_metals, elements, check_type)
            if keep:
                seen_smiles.add(smi)
                oechem.OEWriteMolecule(output_molstream, mol_copy)
                saved += 1

        # Start the next molecule with an empty message log
        errs.clear()

    print(f"{count} molecules in input stream")
    print(f"{warnings} molecules resulted in warnings when parsing")
    print(f"{smile_count} molecules were had repeated isomeric SMILES")
    print(f"{saved} molecules saved")
def __makeChemCompDescriptorCategory(self, ccId, oeMol):
    """Build the rows of the pdbx_chem_comp_descriptor category for one component.

    Four rows are produced, in this order, all attributed to the program
    "OpenEye OEToolkits" at version self.__oeVersion:

        type              descriptor source
        SMILES_CANONICAL  oechem.OECreateIsoSmiString
        SMILES            oechem.OECreateCanSmiString
        InChI             oechem.OECreateInChI
        InChIKey          oechem.OECreateInChIKey

    Example of the target category:

        loop_
        _pdbx_chem_comp_descriptor.comp_id
        _pdbx_chem_comp_descriptor.type
        _pdbx_chem_comp_descriptor.program
        _pdbx_chem_comp_descriptor.program_version
        _pdbx_chem_comp_descriptor.descriptor
        ARG SMILES_CANONICAL "OpenEye OEToolkits" 1.5.0 "C(C[C@@H](C(=O)O)N)CNC(=[NH2+])N"
        ARG SMILES           "OpenEye OEToolkits" 1.5.0 "C(CC(C(=O)O)N)CNC(=[NH2+])N"
        ARG InChIKey         InChI                1.03  ODKSFYDXXFIFQN-BYPYZUCNSA-O

    Returns:
        list: one dictionary per descriptor row
    """
    descriptorBuilders = (
        ("SMILES_CANONICAL", oechem.OECreateIsoSmiString),
        ("SMILES", oechem.OECreateCanSmiString),
        ("InChI", oechem.OECreateInChI),
        ("InChIKey", oechem.OECreateInChIKey),
    )
    rowL = []
    for descrType, buildDescriptor in descriptorBuilders:
        rowL.append(
            {
                "comp_id": ccId,
                "type": descrType,
                "program": "OpenEye OEToolkits",
                "program_version": self.__oeVersion,
                "descriptor": buildDescriptor(oeMol),
            }
        )
    return rowL
def standardizeSmiles(self, smiles, type="ISOMERIC"):  # pylint: disable=redefined-builtin
    """Return a standardized SMILES (type) or None

    The input is parsed, aromatic flags are assigned, and the molecule is
    written back out as a canonical or isomeric SMILES string.

    Args:
        smiles (str): input SMILES string
        type (str, optional): "ISOMERIC" (default) or "CANONICAL". The
            historical misspelling "CANNONICAL" is still accepted so that
            existing callers keep working.

    Returns:
        str: standardized SMILES, or None if the input cannot be parsed, the
            type is not supported, or an exception occurs
    """
    smilesOut = None
    try:
        mol = oechem.OEGraphMol()
        if oechem.OEParseSmiles(mol, smiles) == 1:
            oechem.OEAssignAromaticFlags(mol)
            if type in ("CANONICAL", "CANNONICAL"):
                smilesOut = oechem.OECreateCanSmiString(mol)
            elif type == "ISOMERIC":
                smilesOut = oechem.OECreateIsoSmiString(mol)
            else:
                # Previously an unknown type returned None without a trace
                logger.error("Unsupported SMILES type '%s'", type)
        else:
            logger.error("Unable to parse input SMILES '%s'", smiles)
    except Exception as e:
        logger.exception("Error '%s' occured. Arguments %s.", str(e), e.args)
    return smilesOut
# Reusable molecule buffer; each OEReadMolecule call below refills it.
c_mol = oechem.OECreateOEGraphMol()
while oechem.OEReadMolecule(ifs, c_mol):
    index += 1
    # process molecules individually, storing less
    # genConfs runs in its own process so that a molecule which takes too
    # long can be abandoned instead of blocking the whole run.
    p = multiprocessing.Process(target=genConfs, args=(
        c_mol,
        ofsff,
        ofsTri,
        index,
    ))
    p.start()
    # Wait at most 24 seconds for the worker to finish.
    p.join(24)
    if p.is_alive():
        # Still running after the timeout: record the molecule as failed.
        print("TIMED OUT %s" % oechem.OECreateIsoSmiString(c_mol))
        oechem.OEWriteConstMolecule(ofsFail, oechem.OEMol(c_mol))
        time_out += 1
        p.terminate()
        p.join()
    # NOTE(review): a truthy (non-zero) exit code is counted as success here
    # and a zero exit code falls through to "CONF FAIL". That is the reverse
    # of the usual process convention - confirm how genConfs reports success.
    elif p.exitcode:
        success += 1
        p.terminate()
        p.join()
    else:
        print("CONF FAIL %s" % oechem.OECreateIsoSmiString(c_mol))
        oechem.OEWriteConstMolecule(ofsFail, oechem.OEMol(c_mol))
        conf_fail += 1
        p.terminate()
        p.join()
def __init__(self, refmol):
    """Store a copy of the reference molecule and prepare its substructure search.

    The search pattern is the isomeric SMILES generated from ``refmol``; the
    match of the stored copy (via ``self.get_match``) is kept in
    ``self.ref_match``.
    """
    self.refmol = oechem.OEGraphMol(refmol)
    ref_pattern = oechem.OECreateIsoSmiString(refmol)
    self.ss = oechem.OESubSearch(ref_pattern)
    self.ref_match = self.get_match(self.refmol)
def eMolecules_filtering(input_f, current_smiles=None):
    """
    This function was used to filter eMolecules database
    and the eMolecules_incremental database.

    Molecules accepted by keep_molecule() are written to numbered sdf files
    holding 1000 molecules each (numbering starts at 1000). Molecule-ID to
    SMILES pairs are appended to a numbered text file; a new text file is
    started whenever the sdf file number is a multiple of 100.

    Parameter
    ---------
    input_f : string
        "path/to/inputfile.sdf"
    current_smiles : iterable of strings, optional
        smiles already in your molecule sets; molecules whose isomeric
        SMILES is in this collection are skipped. Defaults to none.

    Raises
    ------
    Exception
        if the input file or any output sdf file cannot be opened
    """
    # Build the lookup once: membership is tested for every input molecule
    known_smiles = set(current_smiles) if current_smiles is not None else set()

    # NOTE(review): everything after the FIRST '.' is dropped, so a path such
    # as "./data/mols.sdf" yields an empty base name. Left unchanged because
    # fixing it would change where existing workflows write their output.
    set_name = input_f.split('.')[0]
    output_f = set_name + "_%i.sdf"
    smiles_base = set_name + "_%i.txt"
    molecule_name = set_name + "_%i_%i"

    # Load and check input file
    ifs = oechem.oemolistream(input_f)
    if not ifs.IsValid():
        raise Exception("Error: input_file (%s) was not valid" % input_f)

    # Redirect OEChem messages into an in-memory stream
    errs = oechem.oeosstream()
    oechem.OEThrow.SetOutputStream(errs)

    molecule = oechem.OECreateOEGraphMol()
    count = 0
    smile_count = 0
    saved = 0
    switch = False

    # first output file
    current_letter = 1000
    ofs_file = output_f % current_letter
    ofs = oechem.oemolostream(ofs_file)
    if not ofs.IsValid():
        raise Exception("output file %s is not valid" % ofs_file)
    add_smiles = open(smiles_base % current_letter, 'a')

    try:
        while oechem.OEReadMolecule(ifs, molecule):
            # count input file molecules
            count += 1
            # The captured messages are never inspected here; drop them so
            # the buffer does not grow with every molecule read.
            errs.clear()

            if switch:
                # The previous sdf file is full: move on to the next one
                switch = False
                ofs.close()
                current_letter += 1
                ofs_file = output_f % current_letter
                # Load and check output file
                ofs = oechem.oemolostream(ofs_file)
                if not ofs.IsValid():
                    raise Exception("output file %s is not valid" % ofs_file)
                print("Switching to file %s, currently saved %i molecules" % (ofs_file, saved))
                if current_letter % 100 == 0:
                    add_smiles.close()
                    add_smiles = open(smiles_base % current_letter, 'a')

            # If the SMILES is already known, skip the molecule
            smi = oechem.OECreateIsoSmiString(molecule)
            if smi in known_smiles:
                smile_count += 1
                continue

            # Make copy of molecule before making changes
            mol_copy = oechem.OEMol(molecule)
            oechem.OEAddExplicitHydrogens(mol_copy)

            # if the molecule meets our requirements save to current output
            if keep_molecule(mol_copy):
                mol_title = molecule_name % (current_letter, count)
                mol_copy.SetTitle(mol_title)
                add_smiles.write("%s\t\t%s\n" % (mol_title, smi))
                oechem.OEWriteMolecule(ofs, mol_copy)
                saved += 1
                if saved % 1000 == 0:
                    switch = True
    finally:
        # The SMILES text file used to be left open; close every handle,
        # including when an output file turns out to be invalid mid-run.
        add_smiles.close()
        ifs.close()
        ofs.close()

    print("%i molecules in input file" % (count))
    print("%i molecules were had repeated isomeric SMILES" % smile_count)
    print("%i molecules saved to output files" % (saved))
def check_valence(mol):
    """
    Checks for hypervalency

    Parameter
    ---------
    mol - OEMol()

    Return
    ------
    boolean - True (no inappropriate valency)
              False (an atom with atomic number <= 10 reports a valence > 4;
              the offending atom is printed and the scan stops there)
    """
    for atom in mol.GetAtoms():
        atomic_number = atom.GetAtomicNum()
        reported_valence = atom.GetValence()
        # Heavier elements and anything with valence up to 4 are acceptable
        if atomic_number > 10 or reported_valence <= 4:
            continue
        print("Found a #%i atom with valence %i in molecule %s" %
              (atomic_number, reported_valence, oechem.OECreateIsoSmiString(mol)))
        return False
    return True
# count input file count +=1 if switch: # If True, open new output file switch = False ofs.close() current_letter = letters.pop(0) ofs_file = output_f % current_letter # Load and check output file ofs = oechem.oemolostream(ofs_file) if not ofs.IsValid(): raise Exception("output file %s is not valid" % ofs_file) print("Switching to file %s, currently saved %i molecules" % (ofs_file, saved)) # get isomeric smiles string smi = oechem.OECreateIsoSmiString(molecule) # if it isn't a new molecule skip it if smi in current_smiles: smile_count += 1 continue # create and save molecule name in form DrugBank_[letter][number] mol_title = molecule_name % (current_letter,count) # Make copy before making changes to molecule mol_copy = oechem.OEMol(molecule) mol_copy.SetTitle(mol_title) oechem.OEAddExplicitHydrogens(mol_copy) # Determine if molecule meets requirements keep = keep_molecule(mol_copy) if keep:
def get_molecule_parameterIDs(oemols, ffxml):
    """Process a list of oemols with a specified SMIRNOFF ffxml file and
    determine which parameters are used by which molecules, returning
    collated results.

    Parameters
    ----------
    oemols : list
        List of OpenEye OEChem molecules to parse; must have explicit
        hydrogens. The collection is traversed twice (once to generate
        SMILES, once for labeling), so it must be re-iterable.
    ffxml : object
        Force field specification handed unchanged to ``ForceField``.

    Returns
    -------
    parameters_by_molecule : dict
        Parameter IDs used in each molecule, keyed by isomeric SMILES
        generated from provided OEMols. Each entry in the dict is a list
        which does not necessarily have unique entries; i.e. parameter IDs
        which are used more than once will occur multiple times.
    parameters_by_ID : dict
        Molecules in which each parameter ID occur, keyed by parameter ID.
        Each entry in the dict is a set of isomeric SMILES for molecules in
        which that parameter occurs. No frequency information is stored.

    Raises
    ------
    ValueError
        If two of the provided molecules have the same isomeric SMILES.
    """
    parameters_by_molecule = {}
    parameters_by_ID = {}

    # Generate isomeric SMILES. The list keeps input order so it lines up
    # with the labels; the set makes the duplicate check constant-time.
    isosmiles = []
    seen_smiles = set()
    for mol in oemols:
        smi = oechem.OECreateIsoSmiString(mol)
        if smi in seen_smiles:
            raise ValueError(
                "Error: get_molecule_parameterIDs has been provided a list of oemols which contains the same molecule, having isomeric smiles %s, more than once."
                % smi)
        seen_smiles.add(smi)
        isosmiles.append(smi)

    # Label molecules
    ff = ForceField(ffxml)
    labels = ff.labelMolecules(oemols)

    # Organize labels into the output dictionaries, molecule by molecule
    for idx, smi in enumerate(isosmiles):
        molecule_pids = parameters_by_molecule[smi] = []
        data = labels[idx]
        for force_type in data:
            for (_atom_indices, pid, _smirks) in data[force_type]:
                # Every occurrence is recorded per molecule ...
                molecule_pids.append(pid)
                # ... while per parameter only membership is recorded
                parameters_by_ID.setdefault(pid, set()).add(smi)

    return parameters_by_molecule, parameters_by_ID
def check_atomtype(mol, types):
    """Return False as soon as an atom of ``mol`` has a type listed in ``types``.

    The first offending atom type is printed together with the isomeric
    SMILES of the molecule. True is returned when no atom type is in
    ``types``.
    """
    for atom in mol.GetAtoms():
        atom_type = atom.GetType()
        if atom_type not in types:
            continue
        print("Found type %s atom in molecule %s" %
              (atom_type, oechem.OECreateIsoSmiString(mol)))
        return False
    return True
def __testReproduceDescriptors(self, molBuildType, limitPerceptions=True):
    """Rebuild each chemical component and compare its descriptors with the archived index values.

    For every (ccId, ccDef) pair returned by self.__getChemCompDefs() a
    molecule is built with OeMoleculeFactory. The SMILES, formula, InChIKey
    and InChI reported by the factory are compared with the matching entry
    of the component index; differences are logged and tallied, and the
    tallies are logged once all components have been processed.

    Args:
        molBuildType: passed through to OeMoleculeFactory.build()
        limitPerceptions (bool, optional): passed through to
            OeMoleculeFactory.build(). Defaults to True.
    """
    # ccMolD: component definitions keyed by component id
    # ccIdxD: archived descriptor index keyed by component id
    ccMolD, ccIdxD = self.__getChemCompDefs()
    oemf = OeMoleculeFactory()
    # Tally of processed components and of each kind of difference
    countD = defaultdict(int)
    for ccId, ccDef in ccMolD.items():
        tId = oemf.setChemCompDef(ccDef)
        # Skip definitions for which the factory reports a different id
        # (presumably a rejected/mismatched definition - TODO confirm)
        if ccId != tId:
            continue
        oemf.build(molBuildType=molBuildType, limitPerceptions=limitPerceptions)
        oeMol = oemf.getMol()
        #
        countD["total components"] += 1
        if ccId not in ccIdxD:
            logger.info("Missing ccIndex entry for %s", ccId)
            continue
        ccdD = ccIdxD[ccId]
        # Components flagged ambiguous in the index are counted but not compared
        if ccdD["ambiguous"]:
            countD["ambiguous component"] += 1
            continue
        #
        countD["total molecules"] += 1
        # SMILES calculated directly with oechem versus those reported by the factory
        nativeCanIsoSmiles = oechem.OECreateIsoSmiString(oeMol)
        canIsoSmiles = oechem.OEMolToSmiles(oeMol)
        isoSmiles = oemf.getIsoSMILES()
        canSmiles = oemf.getCanSMILES()
        # check internal consistency
        if nativeCanIsoSmiles != isoSmiles:
            logger.error("%s stored and calculated OE smiles differ %s %s", ccId, nativeCanIsoSmiles, isoSmiles)
        if canIsoSmiles != isoSmiles:
            logger.error(
                "%s calculated OE ISO and canonical smiles differ %s %s", ccId, isoSmiles, canIsoSmiles)
        # compare with archived values
        if isoSmiles != ccdD["oe-iso-smiles"]:
            logger.info("%s ISO SMILES differ \nccd: %r \nOE: %r", ccId, ccdD["oe-iso-smiles"], isoSmiles)
            countD["iso_smiles_diff"] += 1
        # ----------
        if canSmiles != ccdD["oe-smiles"]:
            logger.info("%s CAN SMILES differ \nccd: %r \nOE: %r", ccId, ccdD["oe-smiles"], canSmiles)
            countD["smiles_diff"] += 1
        # Formulas are compared case-insensitively
        formula = oemf.getFormula()
        if formula.upper() != ccdD["formula"].upper():
            logger.debug("%s formulas differ \nccd: %r \nOE: %r", ccId, ccdD["formula"], formula)
            countD["formula_diff"] += 1
        # ---------
        inchiKey = oemf.getInChIKey()
        if inchiKey != ccdD["inchikey"]:
            logger.debug("%s InChI keys differ \nccd: %r \nOE: %r", ccId, ccdD["inchikey"], inchiKey)
            countD["inchikey_diff"] += 1
        #
        inchi = oemf.getInChI()
        if inchi != ccdD["inchi"]:
            logger.debug("%s InChIs differ \nccd: %r \nOE: %r", ccId, ccdD["inchi"], inchi)
            countD["inchi_diff"] += 1
        #
    # Summary of all tallies
    for ky, vl in countD.items():
        logger.info("%-12s %6d", ky, vl)