def getCCDefObj(self, dataContainer, molBuildType="model-xyz", suppressHydrogens=False):
    """Build an OE molecule from the input chemical component definition object.

    Args:
        dataContainer (object): chemical component definition passed to
            OeMoleculeFactory.setChemCompDef()
        molBuildType (str, optional): build type passed to OeMoleculeFactory.build().
            Defaults to "model-xyz".
        suppressHydrogens (bool, optional): return the molecule from getGraphMolSuppressH()
            rather than getMol(). Defaults to False.

    Returns:
        (object, object, dict): identifier returned by setChemCompDef(), the OE molecule, and a
            feature dictionary with keys Formula, SMILES, SMILES_STEREO, InChI, InChIKey,
            OEMOL and xyz
    """
    factory = OeMoleculeFactory()
    if not self.__verbose:
        factory.setQuiet()
    ccId = factory.setChemCompDef(dataContainer)
    # NOTE(review): the build() status is not checked here.
    factory.build(molBuildType=molBuildType)
    #
    if self.__verbose:
        logger.info(" CCId = %s", ccId)
        # Getters are invoked lazily, one per log record, in the listed order.
        for fmt, getter in (
            (" Title = %s", factory.getTitle),
            (" SMILES = %s", factory.getCanSMILES),
            (" SMILES (stereo) = %s", factory.getIsoSMILES),
            (" Formula (Hill) = %s", factory.getFormula),
            (" InChI key = %s", factory.getInChIKey),
            (" InChI = %s", factory.getInChI),
        ):
            logger.info(fmt, getter())
    #
    featureD = {
        "Formula": factory.getFormula(),
        "SMILES": factory.getCanSMILES(),
        "SMILES_STEREO": factory.getIsoSMILES(),
        "InChI": factory.getInChI(),
        "InChIKey": factory.getInChIKey(),
    }
    tMol = factory.getGraphMolSuppressH() if suppressHydrogens else factory.getMol()
    featureD["OEMOL"] = tMol
    featureD["xyz"] = factory.getAtomDetails(xyzType="model")
    return (ccId, tMol, featureD)
def testBuildCifFromOE(self):
    """Build chemical component definitions from OE Mol objects and write them as CIF.

    The first ten definitions returned by __getChemCompDefs() are each built with
    the "model-xyz" build type, loaded into a fresh OeChemCompUtils instance and
    written to "<ccId>-gen.cif" under the test CIF output path. Every step's status
    is asserted, including the final write.
    """
    try:
        ccMolD = self.__getChemCompDefs()
        oemf = OeMoleculeFactory()
        #
        for ccId, ccObj in list(ccMolD.items())[:10]:
            # ----
            tId = oemf.setChemCompDef(ccObj)
            self.assertEqual(tId, ccId)
            ok = oemf.build(molBuildType="model-xyz")
            self.assertTrue(ok)
            fp = os.path.join(self.__ccCifPath, ccId + "-gen.cif")
            oeMol = oemf.getMol()
            oeccU = OeChemCompUtils()
            ok = oeccU.addOeMol(ccId, oeMol, missingModelXyz=False, writeIdealXyz=False)
            self.assertTrue(ok)
            ok = oeccU.write(fp)
            # The write status was previously discarded, so a failed export went unnoticed.
            self.assertTrue(ok)
            # ----
    except Exception as e:
        # AssertionError is an Exception subclass, so failed assertions also land here.
        logger.exception("Failing with %s", str(e))
        self.fail()
def getCCDefFile(self, ccFilePath, molBuildType="model-xyz", suppressHydrogens=False):
    """Read a chemical component definition file and build an OE molecule for comparison.

    Only the first definition returned by the mmCIF import is used.

    Args:
        ccFilePath (str): path of the chemical component definition file (read as mmCIF)
        molBuildType (str, optional): build type passed to OeMoleculeFactory.build().
            Defaults to "model-xyz".
        suppressHydrogens (bool, optional): return the molecule from getGraphMolSuppressH()
            rather than getMol(). Defaults to False.

    Returns:
        (object, object, dict): identifier returned by setChemCompDef(), the OE molecule, and a
            feature dictionary with keys Formula, SMILES, SMILES_STEREO, InChI, InChIKey,
            OEMOL and xyz
    """
    marshal = MarshalUtil(workPath=self.__workPath)
    definitionL = marshal.doImport(ccFilePath, fmt="mmcif")
    factory = OeMoleculeFactory()
    if not self.__verbose:
        factory.setQuiet()
    # An empty import result raises here, exactly as an unguarded [0] lookup does.
    ccId = factory.setChemCompDef(definitionL[0])
    # NOTE(review): the build() status is not checked here.
    factory.build(molBuildType=molBuildType)
    #
    if self.__verbose:
        logger.info(" CCId = %s", ccId)
        # Getters are invoked lazily, one per log record, in the listed order.
        for fmt, getter in (
            (" Title = %s", factory.getTitle),
            (" SMILES = %s", factory.getCanSMILES),
            (" SMILES (stereo) = %s", factory.getIsoSMILES),
            (" Formula (Hill) = %s", factory.getFormula),
            (" InChI key = %s", factory.getInChIKey),
            (" InChI = %s", factory.getInChI),
        ):
            logger.info(fmt, getter())
    #
    featureD = {
        "Formula": factory.getFormula(),
        "SMILES": factory.getCanSMILES(),
        "SMILES_STEREO": factory.getIsoSMILES(),
        "InChI": factory.getInChI(),
        "InChIKey": factory.getInChIKey(),
    }
    tMol = factory.getGraphMolSuppressH() if suppressHydrogens else factory.getMol()
    featureD["OEMOL"] = tMol
    featureD["xyz"] = factory.getAtomDetails(xyzType="model")
    return (ccId, tMol, featureD)
def __displayAlignedDescriptorPair(self, ccId, descrRef, buildTypeRef, descrFit, buildTypeFit, title=None, limitPerceptions=True):
    """Depict the MCS alignment of two molecules built from descriptors of one component.

    Args:
        ccId (str): component identifier used for both molecules and in the image file name
        descrRef (str): descriptor for the reference molecule
        buildTypeRef (str): build type of the reference descriptor
        descrFit (str): descriptor for the fit molecule
        buildTypeFit (str): build type of the fit descriptor
        title (str, optional): image file name prefix. Defaults to
            "<buildTypeRef>-<buildTypeFit>" when empty or None.
        limitPerceptions (bool, optional): passed through to OeMoleculeFactory.build().
            Defaults to True.

    The SVG image is written to "<workPath>/<title>-<ccId>.svg"; aligned atom pairs
    are logged at debug level. Nothing is returned.
    """
    refFactory = OeMoleculeFactory()
    refFactory.setDescriptor(descrRef, buildTypeRef, ccId)
    refFactory.build(molBuildType=buildTypeRef, limitPerceptions=limitPerceptions)
    refMol = refFactory.getMol()
    #
    fitFactory = OeMoleculeFactory()
    fitFactory.setDescriptor(descrFit, buildTypeFit, ccId)
    fitFactory.build(molBuildType=buildTypeFit, limitPerceptions=limitPerceptions)
    fitMol = fitFactory.getMol()
    #
    depictor = OeDepictMCSAlignPage()
    depictor.setSearchType(sType="graph-relaxed", minAtomMatchFraction=0.50)
    depictor.setDisplayOptions(
        labelAtomName=True,
        labelAtomCIPStereo=True,
        labelAtomIndex=False,
        labelBondIndex=False,
        highlightStyleFit="ballAndStickInverse",
        bondDisplayWidth=0.5,
    )
    depictor.setRefMol(refMol, ccId)
    depictor.setFitMol(fitMol, ccId)
    #
    prefix = title or (buildTypeRef + "-" + buildTypeFit)
    imgPath = os.path.join(self.__workPath, prefix + "-" + ccId + ".svg")
    logger.info("Using image path %r", imgPath)
    alignedL = depictor.alignPair(imagePath=imgPath)
    if not alignedL:
        return
    logger.info("%s aligned image path %r", ccId, imgPath)
    for refCc, refAtom, fitCc, fitAtom in alignedL:
        logger.debug("%5s %-5s %5s %-5s", refCc, refAtom, fitCc, fitAtom)
def write(self, filePath, oeMol, constantMol=False, addSdTags=True):
    """Write an oeMol with format type inferred from the filePath extension (e.g. .mol)

    Args:
        filePath (str): filepath with a chemical type extension
        oeMol (object): OE molecule to be written
        constantMol (bool, optional): copies molecule before performing format specific
            perceptions (uses OEWriteConstMolecule). Defaults to False.
        addSdTags (bool, optional): pass the molecule through OeMoleculeFactory.addSdTags()
            before writing, using the file base name as the molecule identifier.
            Defaults to True.

    Returns:
        bool: True for success or False otherwise
    """
    try:
        molId, ext = os.path.splitext(os.path.basename(filePath))
        fmt = ext[1:].lower()
        #
        if addSdTags:
            oemf = OeMoleculeFactory()
            oemf.setOeMol(oeMol, molId)
            oemf.addSdTags()
            oeMol = oemf.getMol()
        #
        self.__mU.mkdir(os.path.dirname(filePath))
        ofs = oechem.oemolostream()
        if not ofs.open(filePath):
            # Previously ignored: an unopenable path was reported as success.
            logger.error("Unable to open output file %s", filePath)
            return False
        try:
            logger.debug("Writing (fmt=%s) molId %s path %s title %s", fmt, molId, filePath, oeMol.GetTitle())
            #
            if constantMol:
                oechem.OEWriteConstMolecule(ofs, oeMol)
            else:
                oechem.OEWriteMolecule(ofs, oeMol)
        finally:
            # Close (and flush) the stream before the file is re-read below; otherwise the
            # mol2 post-processing may see partial content or be overwritten by a late flush.
            ofs.close()
        #
        # If this is a mol2 file, we need to replace the resname
        if fmt.startswith("mol2"):
            # If this is a mol2/mol2h substitute the default substructure id
            with open(filePath, "r", encoding="utf-8") as ifh:
                lines = ifh.readlines()
            lines = [line.replace("<0>", molId) for line in lines]
            with open(filePath, "w", encoding="utf-8") as ofh:
                ofh.writelines(lines)
        return True
    except Exception as e:
        logger.exception("Failing for %s with %s", filePath, str(e))
    return False
def chemCompToMol(self, ccdFilePath, molBuildType="model-xyz", quietFlag=False):
    """Build OE molecules for every chemical component definition in an mmCIF file.

    Definitions for which setChemCompDef() returns an empty identifier, or for which
    the build fails, are skipped. Any exception is logged and the molecules collected
    so far are returned.

    Args:
        ccdFilePath (str): path of the chemical component definition file (read as mmCIF)
        molBuildType (str, optional): build type passed to OeMoleculeFactory.build().
            Defaults to "model-xyz".
        quietFlag (bool, optional): call setQuiet() on the molecule factory. Defaults to False.

    Returns:
        list: OE molecules for the definitions that built successfully
    """
    molList = []
    try:
        ccObjList = self.__mU.doImport(ccdFilePath, fmt="mmcif")
        logger.info("Read %s with %d definitions", ccdFilePath, len(ccObjList))
        factory = OeMoleculeFactory()
        if quietFlag:
            factory.setQuiet()
        for ccObj in ccObjList:
            if not factory.setChemCompDef(ccObj):
                continue
            if factory.build(molBuildType=molBuildType):
                molList.append(factory.getMol())
    except Exception as e:
        logger.exception("Loading %s failing with %s", ccdFilePath, str(e))
    return molList
def buildOeBinaryMolCacheFromIndex(self, filePath, ccIdxD, quietFlag=False, fpTypeList=None, limitPerceptions=False, suppressHydrogens=False):
    """Build cache of OEGraphMol() objects from the input chemical component search index.

    Each index entry's "smiles" value is built as an "oe-iso-smiles" descriptor and the
    resulting molecule is appended to an OEB format output stream.

    Args:
        filePath (str): output cache file path
        ccIdxD (dict): search index dictionary
        quietFlag (bool, optional): suppress OE output. Defaults to False.
        fpTypeList (list, optional): list of fingerprint types. Defaults to None.
        limitPerceptions (bool, optional): suppress automatic chemical perceptions. Defaults to False.
        suppressHydrogens (bool, optional): suppress explicit hydrogen count. Defaults to False.

    Returns:
        (int, int, list): chem comp success count, error count, chem comp identifier failure list
    """
    failIdList = []
    ccCount = 0
    errCount = 0
    startTime = time.time()
    try:
        ofs = oechem.oemolostream()
        ofs.SetFormat(oechem.OEFormat_OEB)
        if ofs.open(filePath):
            try:
                oemf = OeMoleculeFactory()
                if quietFlag:
                    oemf.setQuiet()
                for searchCcId, ccIdx in ccIdxD.items():
                    oemf.setDescriptor(ccIdx["smiles"], "oe-iso-smiles", searchCcId)
                    ok = oemf.build(molBuildType="oe-iso-smiles", limitPerceptions=limitPerceptions)
                    if ok and fpTypeList:
                        # A fingerprint failure is logged but does not fail the component.
                        fpOk = oemf.addFingerPrints(fpTypeList)
                        if not fpOk:
                            logger.info("Fingerprint generation fails for %r", searchCcId)
                    if ok:
                        if not suppressHydrogens:
                            oemf.addExplicitHydrogens()
                            oemf.setSimpleAtomNames()
                        oeMol = oemf.getMol(suppressHydrogens=suppressHydrogens)
                        oechem.OEWriteMolecule(ofs, oeMol)
                        ccCount += 1
                    if not ok:
                        # build failed incomplete component (e.g. missing atoms or bonds)
                        errCount += 1
                        failIdList.append(searchCcId)
            finally:
                # Close explicitly so the cache file is flushed even if a build raises.
                ofs.close()
        else:
            logger.error("Unable to open cache database %s", filePath)
            errCount += 1
    except Exception as e:
        logger.exception("Failing with %s", str(e))
    #
    endTime = time.time()
    logger.info("Completed operation at %s (%.4f seconds)", time.strftime("%Y %m %d %H:%M:%S", time.localtime()), endTime - startTime)
    return ccCount, errCount, failIdList
def buildOeBinaryMolCache(self, filePath, ccObjD, molBuildType="model-xyz", quietFlag=False, fpTypeList=None, limitPerceptions=False, suppressHydrogens=False):
    """Build cache of OEMol() objects from the input chemical component definition list.

    A component counts as a failure when setChemCompDef() returns an empty identifier,
    when the returned identifier differs from its dictionary key, or when the build fails.

    Args:
        filePath (str): output cache file path
        ccObjD (dict): chemical component object dictionary
        molBuildType (str, optional): build type passed to OeMoleculeFactory.build().
            Defaults to "model-xyz".
        quietFlag (bool, optional): suppress OE output. Defaults to False.
        fpTypeList (list, optional): fingerprint type list. Defaults to None.
        limitPerceptions (bool, optional): suppress automatic chemical perceptions. Defaults to False.
        suppressHydrogens (bool, optional): suppress explicit hydrogen count. Defaults to False.

    Returns:
        (int, int, list): chem comp success count, error count, chem comp identifier failure list
    """
    startTime = time.time()
    failIdList = []
    ccCount = 0
    errCount = 0
    try:
        ofs = oechem.oemolostream()
        ofs.SetFormat(oechem.OEFormat_OEB)
        if ofs.open(filePath):
            try:
                oemf = OeMoleculeFactory()
                if quietFlag:
                    oemf.setQuiet()
                for ccId, ccObj in ccObjD.items():
                    # Reset per component: a stale True from the previous iteration would
                    # otherwise hide an identifier mismatch from the failure accounting.
                    ok = False
                    tId = oemf.setChemCompDef(ccObj)
                    if tId and tId == ccId:
                        ok = oemf.build(molBuildType=molBuildType, limitPerceptions=limitPerceptions)
                        if ok and fpTypeList:
                            # A fingerprint failure is logged but does not fail the component.
                            fpOk = oemf.addFingerPrints(fpTypeList)
                            if not fpOk:
                                logger.info("Fingerprint generation fails for %r", ccId)
                        if ok:
                            oeMol = oemf.getMol(suppressHydrogens=suppressHydrogens)
                            oechem.OEWriteMolecule(ofs, oeMol)
                            ccCount += 1
                    if not ok:
                        # build failed incomplete component (e.g. missing atoms or bonds)
                        errCount += 1
                        failIdList.append(ccId)
            finally:
                # Close explicitly so the cache file is flushed even if a build raises.
                ofs.close()
        else:
            logger.error("Unable to open cache database %s", filePath)
            errCount += 1
    except Exception as e:
        logger.exception("Failing with %s", str(e))
    #
    endTime = time.time()
    logger.info("Completed operation at %s (%.4f seconds)", time.strftime("%Y %m %d %H:%M:%S", time.localtime()), endTime - startTime)
    return ccCount, errCount, failIdList
def __testReproduceDescriptors(self, molBuildType, limitPerceptions=True):
    """Rebuild each component and tally where regenerated descriptors differ from the index.

    Components whose identifier from setChemCompDef() differs from their key are
    skipped, as are components missing from the index or flagged "ambiguous" there.
    For the rest, the SMILES, formula, InChIKey and InChI produced by the factory are
    compared with the archived index values. The difference counters are logged at
    the end; nothing is returned or asserted.

    Args:
        molBuildType (str): build type passed to OeMoleculeFactory.build()
        limitPerceptions (bool, optional): passed through to OeMoleculeFactory.build().
            Defaults to True.
    """
    ccMolD, ccIdxD = self.__getChemCompDefs()
    factory = OeMoleculeFactory()
    tallyD = defaultdict(int)
    for ccId, ccDef in ccMolD.items():
        if ccId != factory.setChemCompDef(ccDef):
            continue
        # NOTE(review): the build() status is not checked before getMol().
        factory.build(molBuildType=molBuildType, limitPerceptions=limitPerceptions)
        builtMol = factory.getMol()
        #
        tallyD["total components"] += 1
        if ccId not in ccIdxD:
            logger.info("Missing ccIndex entry for %s", ccId)
            continue
        archivedD = ccIdxD[ccId]
        if archivedD["ambiguous"]:
            tallyD["ambiguous component"] += 1
            continue
        #
        tallyD["total molecules"] += 1
        oeNativeIso = oechem.OECreateIsoSmiString(builtMol)
        oeMolSmiles = oechem.OEMolToSmiles(builtMol)
        factoryIso = factory.getIsoSMILES()
        factoryCan = factory.getCanSMILES()
        # Internal consistency between the toolkit calls and the factory values.
        if oeNativeIso != factoryIso:
            logger.error("%s stored and calculated OE smiles differ %s %s", ccId, oeNativeIso, factoryIso)
        if oeMolSmiles != factoryIso:
            logger.error("%s calculated OE ISO and canonical smiles differ %s %s", ccId, factoryIso, oeMolSmiles)
        # Comparison against the archived index values.
        archivedIso = archivedD["oe-iso-smiles"]
        if factoryIso != archivedIso:
            logger.info("%s ISO SMILES differ \nccd: %r \nOE: %r", ccId, archivedIso, factoryIso)
            tallyD["iso_smiles_diff"] += 1
        # ----------
        archivedCan = archivedD["oe-smiles"]
        if factoryCan != archivedCan:
            logger.info("%s CAN SMILES differ \nccd: %r \nOE: %r", ccId, archivedCan, factoryCan)
            tallyD["smiles_diff"] += 1
        # Formulas are compared case-insensitively.
        oeFormula = factory.getFormula()
        archivedFormula = archivedD["formula"]
        if oeFormula.upper() != archivedFormula.upper():
            logger.debug("%s formulas differ \nccd: %r \nOE: %r", ccId, archivedFormula, oeFormula)
            tallyD["formula_diff"] += 1
        # ---------
        oeInchiKey = factory.getInChIKey()
        archivedKey = archivedD["inchikey"]
        if oeInchiKey != archivedKey:
            logger.debug("%s InChI keys differ \nccd: %r \nOE: %r", ccId, archivedKey, oeInchiKey)
            tallyD["inchikey_diff"] += 1
        #
        oeInchi = factory.getInChI()
        archivedInchi = archivedD["inchi"]
        if oeInchi != archivedInchi:
            logger.debug("%s InChIs differ \nccd: %r \nOE: %r", ccId, archivedInchi, oeInchi)
            tallyD["inchi_diff"] += 1
    #
    for label, total in tallyD.items():
        logger.info("%-12s %6d", label, total)
def testSssWithFingerPrintFromDescriptor(self):
    """Run fingerprint-screened substructure searches from each archived descriptor.

    For the first 20 index entries, a query molecule is built from each available
    descriptor type (SMILES variants and InChI) and searched with every
    (fingerprint type, minimum score) pair in the fixture's cutoff list. Each search
    must report success; descriptors that fail to retrieve their own component are
    collected and logged.
    """
    oemp = OeMoleculeProvider(**self.__myKwargs)
    # NOTE(review): this status is not asserted (the default cache threshold is not
    # visible here) — confirm whether it should be.
    ok = oemp.testCache()
    ccmP = ChemCompIndexProvider(**self.__myKwargs)
    ccIdxD = ccmP.getIndex()
    ok = ccmP.testCache(minCount=self.__minCount)
    self.assertTrue(ok)
    limitPerceptions = False
    #
    maxFpResults = 50
    matchOpts = "graph-relaxed"
    numMols = 20
    oeioU = OeIoUtils()
    oesU = OeSearchUtils(oemp, fpTypeList=self.__fpTypeList)
    missedD = {}  # ccId -> build types for which no fingerprint search found ccId
    missedFpD = {}  # ccId -> (buildType, fpType, hit count) for each search missing ccId
    buildTypeL = ["oe-iso-smiles", "oe-smiles", "acdlabs-smiles", "cactvs-iso-smiles", "cactvs-smiles", "inchi"]
    # ----
    startTime = time.time()
    for ccId, ccD in list(ccIdxD.items())[:numMols]:
        for buildType in buildTypeL:
            if buildType not in ccD:
                logger.info("%s missing descriptor %r", ccId, buildType)
                continue
            logger.debug("Search %s %r", ccId, ccD[buildType])
            # Reset for every descriptor so a failed InChI build cannot silently reuse
            # the molecule left over from the previous build type.
            oeMol = None
            if buildType == "inchi":
                oemf = OeMoleculeFactory()
                oemf.setDescriptor(ccD["inchi"], "inchi", ccId)
                ok = oemf.build(molBuildType="inchi", limitPerceptions=limitPerceptions)
                if not ok:
                    logger.info("%s build failed with InChI %r", ccId, ccD["inchi"])
                else:
                    oeMol = oemf.getMol()
                    if oemf.getInChI() != ccD["inchi"]:
                        logger.info("%s regenerated InChI differs\n%r\n%s", ccId, ccD["inchi"], oemf.getInChI())
            else:
                oeMol = oeioU.smilesToMol(ccD[buildType], limitPerceptions=limitPerceptions)
            if not oeMol:
                continue
            maxHits = 0
            minHits = maxFpResults
            selfHit = False
            for fpType, minFpScore in self.__fpTypeCuttoffList:
                retStatus, mL = oesU.searchSubStructureWithFingerPrint(oeMol, fpType, minFpScore, maxFpResults, matchOpts=matchOpts)
                self.assertTrue(retStatus)
                logger.debug("%s fpType %r hits %d", ccId, fpType, len(mL))
                maxHits = max(maxHits, len(mL))
                minHits = min(minHits, len(mL))
                matchedSelf = self.__resultContains(ccId, mL)
                selfHit = selfHit or matchedSelf
                if not matchedSelf:
                    missedFpD.setdefault(ccId, []).append((buildType, fpType, len(mL)))
            if not selfHit:
                missedD.setdefault(ccId, []).append(buildType)
            logger.info("%s (%r) buildType %r min hits %d max hits %d", ccId, selfHit, buildType, minHits, maxHits)
    #
    for ccId, missL in missedD.items():
        logger.info("%s missed list %r", ccId, missL)
        if ccId in missedFpD:
            logger.info("%s unmatched for fpTypes %r", ccId, missedFpD[ccId])
    # ----
    # Optional depiction of the misses (disabled). missedD already has the
    # ccId -> build type list shape this loop needs.
    doDepict = False
    if doDepict:
        for ccId, missedBuildTypeL in missedD.items():
            idxD = ccIdxD[ccId]
            if "oe-iso-smiles" in idxD:
                for buildType in missedBuildTypeL:
                    self.__displayAlignedDescriptorPair(ccId, idxD["oe-iso-smiles"], "oe-iso-smiles", idxD[buildType], buildType, title=None, limitPerceptions=True)
    logger.info("%s fingerprints search on %d in (%.4f seconds)", len(self.__fpTypeList), numMols, time.time() - startTime)
def __getMiscFile(self, filePath, suppressHydrogens=False, importType="2D", title=None, largestPart=False):
    """Fetch a miscellaneous chemical file (ccPath) and build OE molecules for comparison.

    Only the first molecule read from the file is used.

    Args:
        filePath (str): path of the chemical file, read with OeIoUtils.fileToMols()
        suppressHydrogens (bool, optional): return the molecule from getGraphMolSuppressH()
            rather than getMol(). Defaults to False.
        importType (str, optional): "3D" reads 3D coordinates and fills the "xyz" feature;
            any other value leaves "xyz" empty. Defaults to "2D".
        title (str, optional): when given, overrides the molecule title. Defaults to None.
        largestPart (bool, optional): passed through to OeIoUtils.fileToMols(). Defaults to False.

    Returns:
        (object, object, dict): the factory title, the OE molecule, and a feature dictionary with
            keys Formula, SMILES, SMILES_STEREO, InChI, InChIKey, xyz and OEMOL; or
            (None, None, None) if any step raises
    """
    try:
        oeioU = OeIoUtils()
        oeMolL = oeioU.fileToMols(filePath, use3D=importType == "3D", largestPart=largestPart)
        logger.info("Read (%d) from %s ", len(oeMolL), filePath)
        # An empty result raises here and is reported through the handler below.
        oeMol = oeMolL[0]
        ccId = title if title else oeMol.GetTitle()
        if title:
            oeMol.SetTitle(ccId)
        #
        oemf = OeMoleculeFactory()
        if not self.__verbose:
            oemf.setQuiet()
        oemf.setOeMol(oeMol, ccId)
        #
        if self.__verbose:
            logger.info(" Title = %s", title)
            logger.info(" Title OEMF = %s", oemf.getTitle())
            logger.info(" SMILES = %s", oemf.getCanSMILES())
            logger.info(" SMILES (stereo) = %s", oemf.getIsoSMILES())
            logger.info(" Formula (Hill) = %s", oemf.getFormula())
            logger.info(" InChI key = %s", oemf.getInChIKey())
            logger.info(" InChI = %s", oemf.getInChI())
        #
        ccId = oemf.getTitle()
        tMol = oemf.getGraphMolSuppressH() if suppressHydrogens else oemf.getMol()
        #
        molXyzL = []
        if importType == "3D":
            for atm in tMol.GetAtoms():
                xyzL = oechem.OEFloatArray(3)
                tMol.GetCoords(atm, xyzL)
                molXyzL.append(
                    ComponentAtomDetails(
                        atIdx=atm.GetIdx(),
                        atNo=atm.GetAtomicNum(),
                        atName=atm.GetName(),
                        atType=atm.GetType(),
                        x=xyzL[0],
                        y=xyzL[1],
                        z=xyzL[2],
                        atFormalCharge=atm.GetFormalCharge(),
                    )
                )
        fD = {
            "Formula": oemf.getFormula(),
            "SMILES": oemf.getCanSMILES(),
            "SMILES_STEREO": oemf.getIsoSMILES(),
            "InChI": oemf.getInChI(),
            "InChIKey": oemf.getInChIKey(),
            "xyz": molXyzL,
        }
        # The per-atom dump is diagnostic only, so skip the coordinate fetches
        # entirely unless verbose output was requested.
        if self.__verbose:
            for atm in tMol.GetAtoms():
                xyzL = oechem.OEFloatArray(3)
                tMol.GetCoords(atm, xyzL)
                logger.debug("atom %s %s %s %s %r", atm.GetIdx(), atm.GetAtomicNum(), atm.GetName(), atm.GetType(), xyzL)
        fD["OEMOL"] = tMol
        return (ccId, tMol, fD)
    except Exception as e:
        logger.exception("Failing with %s", str(e))
    return None, None, None