def test_RDKit(self):
    """Check the first three cinfony/RDKit descriptors for the SMILES "CCC".

    Any descriptor value that is NaN is replaced by the placeholder '?'
    before the comparison, because NaN never compares equal to anything
    and would otherwise make the dictionary comparison fail.
    """
    from cinfony import rdk

    molecule = rdk.readstring("smi", "CCC")
    computed = molecule.calcdesc(rdk.descs[:3])

    # A value that differs from itself is NaN; collect those keys first,
    # then overwrite them with the placeholder.
    nan_keys = [key for key in computed if computed[key] != computed[key]]
    for key in nan_keys:
        computed[key] = '?'

    self.assertEqual(
        computed,
        {'fr_Ar_COO': 0, 'Chi4v': 0.0, 'fr_C_O_noCOO': 0},
    )
def validate(fn, typ):
    """Write the molecules of an SDF file that rdk can parse to a new SDF.

    The output path is ``fn`` with ``.sdf.gz`` replaced by
    ``_sanitized.sdf``.  If that file already exists the function does
    nothing.  Otherwise every molecule read from ``fn`` with pybel is
    retitled to ``<typ>_<original title>_<index>`` (index counts from 0),
    converted to a MOL block, re-parsed with rdk and written out.
    Molecules that raise during this round trip are reported on stdout and
    skipped.  A summary line with the percentage of kept molecules is
    printed at the end.

    Parameters:
        fn  -- path of the input SDF file.
        typ -- prefix put in front of every molecule title.

    Returns:
        None.
    """
    ofn = fn.replace('.sdf.gz', '_sanitized.sdf')
    if os.path.isfile(ofn):
        return

    good = 0
    total = 0
    o = rdk.Outputfile('sdf', ofn, overwrite=True)
    try:
        for mol in pybel.readfile('sdf', fn):
            # 'total' still holds the zero-based index of this molecule here.
            mol.title = typ + "_" + mol.title + "_" + str(total)
            total += 1
            try:
                o.write(rdk.readstring('mol', mol.write('mol')))
                good += 1
            except Exception as e:
                # Best effort: report the molecule rdk rejected and go on.
                print(e)
    finally:
        # Release the output handle even if reading the input fails.
        o.close()

    # An input without molecules must not crash the summary.
    percent = (good * 100. / total) if total else 0.0
    print("%s %% good mols (%s/%s)" % (percent, good, total))
def clean(fn):
    """Write the molecules of ``fn`` that are not in the reference ligand set.

    The reference set is ``../PDB/ligands_allgood.sdf`` relative to the
    directory of ``fn``; if that file is missing the function returns
    without doing anything.  The output path is ``fn`` with ``.sdf``
    replaced by ``_filtered.sdf``; if it already exists nothing is done.
    Molecules are compared by InChIKey.  Molecules whose InChIKey is not in
    the reference set are re-parsed with rdk and written out; those that
    fail the conversion are skipped silently.  A summary line with the
    percentage of written molecules is printed at the end.

    Parameters:
        fn -- path of the input SDF file.

    Returns:
        None.
    """
    ligands = os.path.join(os.path.dirname(fn), "..", "PDB",
                           "ligands_allgood.sdf")
    if not os.path.isfile(ligands):
        return

    ofn = fn.replace('.sdf', '_filtered.sdf')
    if os.path.isfile(ofn):
        # Output already present: skip reading the reference set as well.
        return

    inchikeys = set(mol.write('inchikey')
                    for mol in pybel.readfile('sdf', ligands))

    good = 0
    total = 0
    o = rdk.Outputfile('sdf', ofn, overwrite=True)
    try:
        for mol in pybel.readfile('sdf', fn):
            total += 1
            if mol.write('inchikey') in inchikeys:
                continue
            try:
                o.write(rdk.readstring('mol', mol.write('mol')))
                good += 1
            except Exception:
                # Deliberate best effort: molecules rdk cannot convert are
                # dropped without a message and show up only in the summary.
                pass
    finally:
        # Release the output handle even if reading the input fails.
        o.close()

    # An input without molecules must not crash the summary.
    percent = (good * 100. / total) if total else 0.0
    print("%s %% remaining mols (%s/%s)" % (percent, good, total))
def getRdkDescResult(data,descList, radius = 1):
    """Calculate rdk descriptors, and optionally Morgan fingerprints.

    The SMILES attribute of ``data`` is resolved with getSMILESAttr (per
    the original documentation, its name is one of those defined in
    AZOrangeConfig.SMILESNAMES).

    Parameters:
        data     -- example table holding the SMILES attribute.
        descList -- descriptor names; only names containing ``rdkTag`` are
                    used, with the tag removed.  The entry "FingerPrints"
                    requests Morgan fingerprint counts.
        radius   -- radius passed to GetMorganFingerprint.

    Returns:
        None when the rdk toolkit is not enabled, when no SMILES attribute
        is found, or when ``descList`` holds no rdk descriptor.  Otherwise
        a new example table with the SMILES attribute, one float attribute
        per requested descriptor and one float attribute per fingerprint
        identifier seen in ``data``.  Compounds whose descriptors or
        fingerprints cannot be computed are left out and counted in the
        printed summary.
    """
    if "rdk" not in toolkitsEnabled:
        return None
    smilesName = getSMILESAttr(data)
    if not smilesName:
        return None
    myDescList = [desc.replace(rdkTag, "") for desc in descList
                  if rdkTag in desc]
    if not myDescList:
        return None

    FingerPrints = "FingerPrints" in myDescList
    if FingerPrints:
        myDescList.remove("FingerPrints")

    def _toChemMol(smiles):
        # First attempt passes True as the second argument, the retry passes
        # False (presumably RDKit's 'sanitize' flag -- confirm).
        chemMol = rdk.Chem.MolFromSmiles(smiles, True)
        if not chemMol:
            chemMol = rdk.Chem.MolFromSmiles(smiles, False)
        return chemMol

    # Get fingerprints in advance so that the domain can be built with one
    # attribute per fingerprint identifier seen anywhere in the data.
    fingerPrintsAttrs = []
    fingerPrintsRes = {}
    if FingerPrints:
        knownNames = set()  # constant-time duplicate check for attribute names
        for ex in data:
            smiles = str(ex[smilesName].value)
            try:
                fingerPrint = rdk.AllChem.GetMorganFingerprint(
                    _toChemMol(smiles), radius)
                resDict = fingerPrint.GetNonzeroElements()
            except Exception:
                # No fingerprint for this compound; it is rejected below.
                continue
            counts = fingerPrintsRes[smiles] = {}
            for ID in resDict:
                name = rdkTag + "FP_" + str(ID)
                if name not in knownNames:
                    knownNames.add(name)
                    fingerPrintsAttrs.append(orange.FloatVariable(name))
                counts[name] = int(resDict[ID])

    resData = orange.ExampleTable(orange.Domain(
        [data.domain[smilesName]]
        + [orange.FloatVariable(rdkTag + name) for name in myDescList]
        + fingerPrintsAttrs, 0))

    badCompounds = 0
    for ex in data:
        newEx = orange.Example(resData.domain)
        newEx[smilesName] = ex[smilesName]
        molStr = str(newEx[smilesName].value)
        try:
            mol = rdk.readstring(
                "mol", rdk.Chem.MolToMolBlock(_toChemMol(molStr)))
            moldesc = mol.calcdesc(myDescList)
            for desc in myDescList:
                newEx[rdkTag + desc] = moldesc[desc]
            if FingerPrints:
                for fpAttr in fingerPrintsAttrs:
                    # A KeyError on molStr (fingerprint failed earlier)
                    # rejects the compound; absent identifiers count as 0.
                    newEx[fpAttr.name] = fingerPrintsRes[molStr].get(
                        fpAttr.name, 0)
            resData.append(newEx)
        except Exception:
            badCompounds += 1

    print("%s %s" % ("Compounds in original data: ", len(data)))
    print("%s %s" % ("Compounds able to calculate descs:", len(resData)))
    print("%s %s" % ("Ignored Compounds: ", badCompounds))
    return resData
def get_RMSD_value(refmol, probemol):
    """Return rdk.Chem.AllChem.GetBestRMS for a reference and a probe.

    Both arguments are converted with str() and parsed as 'mol' format
    text by rdk.readstring (presumably MOL blocks rather than file paths --
    confirm against callers).  The underlying ``.Mol`` objects are passed
    to GetBestRMS, reference first.
    """
    reference, probe = [
        rdk.readstring('mol', str(source)).Mol
        for source in (refmol, probemol)
    ]
    return rdk.Chem.AllChem.GetBestRMS(reference, probe)
def getRdkDescResult(data,descList, radius = 1):
    """Calculate rdk descriptors, and optionally Morgan fingerprints.

    The SMILES attribute of ``data`` is resolved with getSMILESAttr (per
    the original documentation, its name is one of those defined in
    AZOrangeConfig.SMILESNAMES).

    Parameters:
        data     -- example table holding the SMILES attribute.
        descList -- descriptor names; only names containing the rdk tag
                    (toolkitsDef["rdk"]["tag"]) are used, with the tag
                    removed.  The entry "FingerPrints", or any entry
                    containing "FP_", requests Morgan fingerprint counts.
                    Entries containing "FP_" always get an attribute, even
                    when no compound in ``data`` has that identifier.
        radius   -- radius passed to GetMorganFingerprint.

    Returns:
        None when the rdk toolkit is not enabled, when no SMILES attribute
        is found, or when ``descList`` holds no rdk descriptor.  Otherwise
        a new example table with the SMILES attribute, one attribute per
        descriptor (string typed when the first computable compound yields
        a str value, float typed otherwise) and one float attribute per
        fingerprint identifier.  Compounds whose descriptors or
        fingerprints cannot be computed are left out and counted in the
        printed summary.  If no compound can be used to determine the
        attribute types (including empty ``data``), the table has only the
        SMILES attribute.
    """
    if "rdk" not in toolkitsEnabled:
        return None
    smilesName = getSMILESAttr(data)
    if not smilesName:
        return None

    tag = toolkitsDef["rdk"]["tag"]
    requested = [desc.replace(tag, "") for desc in descList if tag in desc]
    if not requested:
        return None

    FingerPrints = "FingerPrints" in requested
    if FingerPrints:
        requested.remove("FingerPrints")
    # Explicitly named fingerprint attributes are handled separately from
    # the descriptors passed to calcdesc.
    FP_desc = [name for name in requested if "FP_" in name]
    if FP_desc:
        FingerPrints = True
    myDescList = [name for name in requested if "FP_" not in name]

    def _toChemMol(smiles):
        # First attempt passes True as the second argument, the retry passes
        # False (presumably RDKit's 'sanitize' flag -- confirm).
        chemMol = rdk.Chem.MolFromSmiles(smiles, True)
        if not chemMol:
            chemMol = rdk.Chem.MolFromSmiles(smiles, False)
        return chemMol

    # Get fingerprints in advance so that the domain can be built with one
    # attribute per fingerprint identifier seen anywhere in the data.
    fingerPrintsAttrs = []
    fingerPrintsRes = {}
    if FingerPrints:
        knownNames = set()  # constant-time duplicate check for attribute names
        for ex in data:
            smiles = str(ex[smilesName].value)
            try:
                fingerPrint = rdk.AllChem.GetMorganFingerprint(
                    _toChemMol(smiles), radius)
                resDict = fingerPrint.GetNonzeroElements()
            except Exception:
                # No fingerprint for this compound; it is rejected below.
                continue
            counts = fingerPrintsRes[smiles] = {}
            for ID in resDict:
                name = tag + "FP_" + str(ID)
                if name not in knownNames:
                    knownNames.add(name)
                    fingerPrintsAttrs.append(orange.FloatVariable(name))
                counts[name] = float(resDict[ID])
        # Add FP attributes even if no compound referenced them.  Models
        # will need them as "FP not present", i.e. equal to 0.0.
        for fpDesc in FP_desc:
            name = tag + fpDesc
            if name not in knownNames:
                knownNames.add(name)
                fingerPrintsAttrs.append(orange.FloatVariable(name))

    # Determine the attribute types from the first compound whose
    # descriptors can be computed.  The list is only committed after a
    # complete success, so a failing compound never leaves partial results.
    attrObj = []
    for ex in data:
        try:
            molStr = str(ex[smilesName].value)
            mol = rdk.readstring(
                "mol", rdk.Chem.MolToMolBlock(_toChemMol(molStr)))
            moldesc = mol.calcdesc(myDescList)
            candidate = []
            for desc in myDescList:
                if isinstance(moldesc[desc], str):
                    candidate.append(orange.StringVariable(tag + desc))
                else:
                    candidate.append(orange.FloatVariable(tag + desc))
        except Exception:
            continue
        if FingerPrints:
            # Skip fingerprint attributes whose name is already taken.
            present = set(attr.name for attr in candidate)
            candidate.extend(fp for fp in fingerPrintsAttrs
                             if fp.name not in present)
        attrObj = candidate
        break

    resData = orange.ExampleTable(
        orange.Domain([data.domain[smilesName]] + attrObj, 0))

    badCompounds = 0
    for ex in data:
        newEx = orange.Example(resData.domain)
        newEx[smilesName] = ex[smilesName]
        molStr = str(newEx[smilesName].value)
        try:
            mol = rdk.readstring(
                "mol", rdk.Chem.MolToMolBlock(_toChemMol(molStr)))
            moldesc = mol.calcdesc(myDescList)
            for desc in myDescList:
                newEx[tag + desc] = moldesc[desc]
            if FingerPrints:
                for fpAttr in fingerPrintsAttrs:
                    # A KeyError on molStr (fingerprint failed earlier)
                    # rejects the compound; absent identifiers count as 0.0.
                    newEx[fpAttr.name] = fingerPrintsRes[molStr].get(
                        fpAttr.name, 0.0)
            resData.append(newEx)
        except Exception:
            badCompounds += 1

    print("%s %s" % ("Compounds in original data: ", len(data)))
    print("%s %s" % ("Compounds able to calculate descs:", len(resData)))
    print("%s %s" % ("Ignored Compounds: ", badCompounds))
    return resData
def getRdkDescResult(data,descList, radius = 1):
    """Calculate rdk descriptors, and optionally Morgan fingerprints.

    The SMILES attribute of ``data`` is resolved with getSMILESAttr (per
    the original documentation, its name is one of those defined in
    AZOrangeConfig.SMILESNAMES).

    Parameters:
        data     -- example table holding the SMILES attribute.
        descList -- descriptor names; only names containing the rdk tag
                    (toolkitsDef["rdk"]["tag"]) are used, with the tag
                    removed.  The entry "FingerPrints", or any entry
                    containing "FP_", requests Morgan fingerprint counts.
                    Entries containing "FP_" always get an attribute, even
                    when no compound in ``data`` has that identifier.
        radius   -- radius passed to GetMorganFingerprint.

    Returns:
        None when the rdk toolkit is not enabled, when no SMILES attribute
        is found, or when ``descList`` holds no rdk descriptor.  Otherwise
        a new example table with the SMILES attribute, one attribute per
        descriptor (string typed when the first computable compound yields
        a str value, float typed otherwise) and one float attribute per
        fingerprint identifier.  Compounds whose descriptors or
        fingerprints cannot be computed are left out and counted in the
        printed summary.  If no compound can be used to determine the
        attribute types (including empty ``data``), the table has only the
        SMILES attribute.
    """
    if "rdk" not in toolkitsEnabled:
        return None
    smilesName = getSMILESAttr(data)
    if not smilesName:
        return None

    tag = toolkitsDef["rdk"]["tag"]
    requested = [desc.replace(tag, "") for desc in descList if tag in desc]
    if not requested:
        return None

    FingerPrints = "FingerPrints" in requested
    if FingerPrints:
        requested.remove("FingerPrints")
    # Explicitly named fingerprint attributes are handled separately from
    # the descriptors passed to calcdesc.
    FP_desc = [name for name in requested if "FP_" in name]
    if FP_desc:
        FingerPrints = True
    myDescList = [name for name in requested if "FP_" not in name]

    def _toChemMol(smiles):
        # First attempt passes True as the second argument, the retry passes
        # False (presumably RDKit's 'sanitize' flag -- confirm).
        chemMol = rdk.Chem.MolFromSmiles(smiles, True)
        if not chemMol:
            chemMol = rdk.Chem.MolFromSmiles(smiles, False)
        return chemMol

    # Get fingerprints in advance so that the domain can be built with one
    # attribute per fingerprint identifier seen anywhere in the data.
    fingerPrintsAttrs = []
    fingerPrintsRes = {}
    if FingerPrints:
        knownNames = set()  # constant-time duplicate check for attribute names
        for ex in data:
            smiles = str(ex[smilesName].value)
            try:
                fingerPrint = rdk.AllChem.GetMorganFingerprint(
                    _toChemMol(smiles), radius)
                resDict = fingerPrint.GetNonzeroElements()
            except Exception:
                # No fingerprint for this compound; it is rejected below.
                continue
            counts = fingerPrintsRes[smiles] = {}
            for ID in resDict:
                name = tag + "FP_" + str(ID)
                if name not in knownNames:
                    knownNames.add(name)
                    fingerPrintsAttrs.append(orange.FloatVariable(name))
                counts[name] = float(resDict[ID])
        # Add FP attributes even if no compound referenced them.  Models
        # will need them as "FP not present", i.e. equal to 0.0.
        for fpDesc in FP_desc:
            name = tag + fpDesc
            if name not in knownNames:
                knownNames.add(name)
                fingerPrintsAttrs.append(orange.FloatVariable(name))

    # Determine the attribute types from the first compound whose
    # descriptors can be computed.  The list is only committed after a
    # complete success, so a failing compound never leaves partial results.
    attrObj = []
    for ex in data:
        try:
            molStr = str(ex[smilesName].value)
            mol = rdk.readstring(
                "mol", rdk.Chem.MolToMolBlock(_toChemMol(molStr)))
            moldesc = mol.calcdesc(myDescList)
            candidate = []
            for desc in myDescList:
                if isinstance(moldesc[desc], str):
                    candidate.append(orange.StringVariable(tag + desc))
                else:
                    candidate.append(orange.FloatVariable(tag + desc))
        except Exception:
            continue
        if FingerPrints:
            # Skip fingerprint attributes whose name is already taken.
            present = set(attr.name for attr in candidate)
            candidate.extend(fp for fp in fingerPrintsAttrs
                             if fp.name not in present)
        attrObj = candidate
        break

    resData = orange.ExampleTable(
        orange.Domain([data.domain[smilesName]] + attrObj, 0))

    badCompounds = 0
    for ex in data:
        newEx = orange.Example(resData.domain)
        newEx[smilesName] = ex[smilesName]
        molStr = str(newEx[smilesName].value)
        try:
            mol = rdk.readstring(
                "mol", rdk.Chem.MolToMolBlock(_toChemMol(molStr)))
            moldesc = mol.calcdesc(myDescList)
            for desc in myDescList:
                newEx[tag + desc] = moldesc[desc]
            if FingerPrints:
                for fpAttr in fingerPrintsAttrs:
                    # A KeyError on molStr (fingerprint failed earlier)
                    # rejects the compound; absent identifiers count as 0.0.
                    newEx[fpAttr.name] = fingerPrintsRes[molStr].get(
                        fpAttr.name, 0.0)
            resData.append(newEx)
        except Exception:
            badCompounds += 1

    print("%s %s" % ("Compounds in original data: ", len(data)))
    print("%s %s" % ("Compounds able to calculate descs:", len(resData)))
    print("%s %s" % ("Ignored Compounds: ", badCompounds))
    return resData