def getDescriptors(self, smiles):
    """Append the descriptors required by the model to ``self.smilesData``.

    ``self.getSmilesData(smiles)`` is called first; the descriptor names are
    taken from ``self.model.varNames`` and split by
    ``descUtilities.getDescTypes``.  Each descriptor family (signatures,
    C-Lab, Cinfony, BBRC) is then calculated when it is requested and, for
    all but Cinfony, listed in ``DescMethodsAvailable``.

    The calculation is attempted at most three times.  An attempt fails when
    an exception is raised or when at least one compound ends up with every
    attribute unknown.  After any failed attempt ``self.smilesData`` is reset
    to the copy taken before the first attempt, so that a retry never starts
    from a partially augmented table.
    """
    self.getSmilesData(smiles)

    # Calculate descriptors defined in the model files
    descList = self.model.varNames

    # Clean copy used to restart a failed attempt.
    savedSmilesData = dataUtilities.DataTable(self.smilesData)

    # Try 3 times to get all compounds' descriptors
    nTry = 3
    # NOTE(review): errorDesc is only written, never read, in the visible
    # part of this function.
    errorDesc = ""
    # Pre-bound so the except handler can always reference it.
    traceLog = ""
    while nTry > 0:
        try:
            traceLog = "Model Location:" + str(self.modelLocation) + "\n"

            # Determine Signature and non-Signature descriptor names
            cinfonyDesc, clabDesc, signatureHeight, bbrcDesc, _ = \
                descUtilities.getDescTypes(descList)

            # Signatures
            if "sign" in DescMethodsAvailable and signatureHeight:
                traceLog += "Calculating signatures...\n"
                print("Calculating signatures....")
                preCalcData = dataUtilities.DataTable(self.preDefSignatureFile)
                startHeight = 0  # Not used desc ignored in model prediction
                endHeight = signatureHeight
                self.smilesData = getSignatures.getSignatures(
                    self.smilesData, startHeight, endHeight, preCalcData)

            # C-Lab desc
            if "clab" in DescMethodsAvailable and clabDesc:
                traceLog += "Calculating C-Lab...\n"
                print("Calculating C-Lab desc....")
                self.smilesData = ClabUtilities.appendCLabDesc(
                    clabDesc, self.smilesData)

            # Cinfony
            if cinfonyDesc:
                traceLog += "Calculating Cinfony...\n"
                print("Calculating Cinfony desc...")
                self.smilesData = getCinfonyDesc.getCinfonyDescResults(
                    self.smilesData, cinfonyDesc, radius=5)

            # bbrcDesc
            if "bbrc" in DescMethodsAvailable and bbrcDesc:
                traceLog += "Calculating BBRC...\n"
                print("Calculating BBRC desc...")
                self.smilesData = getBBRCDesc.getBBRCDescResult(
                    self.smilesData, algo="FTM", minSupPar=1,
                    descList=bbrcDesc)

            # Detect if the descriptor calculation or something else went
            # wrong: a compound whose attributes are all unknown is "bad".
            attributes = self.smilesData.domain.attributes
            nBadEx = sum(
                1 for ex in self.smilesData
                if all(ex[attr].isSpecial() for attr in attributes))

            if nBadEx:
                warning = ("WARNING: Desc. Calculation: From the "
                           + str(len(self.smilesData)) + " compounds, "
                           + str(nBadEx) + " could not be calculated!")
                traceLog += warning + "\n"
                print(warning)
                print("WARNING: Tying again...")
                self.smilesData = dataUtilities.DataTable(savedSmilesData)
                nTry -= 1
            else:
                nTry = 0
        except Exception as e:
            errorDesc = ("Error Calculating Descriptors:;" + traceLog
                         + str(e) + ";")
            # Same reset as the bad-example path: drop whatever descriptor
            # families were appended before the failure.
            self.smilesData = dataUtilities.DataTable(savedSmilesData)
            nTry -= 1
def getClabDescSignList(self, smiles, getMolFile=False):
    """Return the model's C-Lab descriptor names and the signatures of ``smiles``.

    The descriptor names come from ``self.model.varNames`` or, when the model
    has no such attribute, from the names of ``self.model.domain.variables``.
    Signatures are only calculated when "sign" is listed in
    ``DescMethodsAvailable`` and the model uses a signature height.

    Returns:
        ``(clabDesc, signList)`` when ``getMolFile`` is false.
        ``(clabDesc, signList, "", "")`` when ``getMolFile`` is true but no
        SDF text was produced.
        ``(clabDesc, signList, molFile, molStr)`` otherwise, where ``molFile``
        is the path of a newly written file and ``molStr`` its content: the
        entries of the first SDF record up to (not including) the first one
        containing ``$$$$``.
    """
    # Create an Orange ExampleTable with a smiles attribute
    smilesAttr = orange.EnumVariable("SMILEStoPred", values=[smiles])
    myDomain = orange.Domain([smilesAttr], 0)
    smilesData = dataUtilities.DataTable(myDomain, [[smiles]])

    # Calculate descriptors defined in the model files
    try:
        descList = self.model.varNames
    except AttributeError:
        # Consensus object different
        descList = [attr.name for attr in self.model.domain.variables]

    # Determine Signature and non-Signature descriptor names
    _, clabDesc, signatureHeight, _, _ = descUtilities.getDescTypes(descList)

    # Signatures
    if "sign" in DescMethodsAvailable and signatureHeight:
        print("Calculating signatures...")
        preCalcData = dataUtilities.DataTable(self.preDefSignatureFile)
        startHeight = 0  # Not used desc ignored in model prediction
        endHeight = signatureHeight
        _, _, cmpdSignList, sdfStr = getSignatures.getSignatures(
            smilesData, startHeight, endHeight, preCalcData,
            returnAtomID=True)
    else:
        cmpdSignList = [[]]
        sdfStr = ""

    if not getMolFile:
        return (clabDesc, cmpdSignList[0])
    if not sdfStr:
        return (clabDesc, cmpdSignList[0], "", "")

    # Keep everything before the SDF record terminator.
    molLines = []
    for line in sdfStr[0]:
        if "$$$$" in line:
            break
        molLines.append(line)
    molStr = "".join(molLines)

    # create a mol file; the context manager closes it even if write fails
    molFile = miscUtilities.generateUniqueFile(desc="NN", ext="mol")
    with open(molFile, "w") as molHandle:
        molHandle.write(molStr)
    return (clabDesc, cmpdSignList[0], molFile, molStr)
def getClabDescSignList(self, smiles, getMolFile=False):
    """Return the model's C-Lab descriptor names and the signatures of ``smiles``.

    The descriptor names come from ``self.model.varNames`` or, when the model
    has no such attribute, from the names of ``self.model.domain.variables``.
    Signatures are only calculated when "sign" is listed in
    ``DescMethodsAvailable`` and the model uses a signature height.

    Returns:
        ``(clabDesc, signList)`` when ``getMolFile`` is false.
        ``(clabDesc, signList, "", "")`` when ``getMolFile`` is true but no
        SDF text was produced.
        ``(clabDesc, signList, molFile, molStr)`` otherwise, where ``molFile``
        is the path of a newly written file and ``molStr`` its content: the
        entries of the first SDF record up to (not including) the first one
        containing ``$$$$``.
    """
    # Create an Orange ExampleTable with a smiles attribute
    smilesAttr = orange.EnumVariable("SMILEStoPred", values=[smiles])
    myDomain = orange.Domain([smilesAttr], 0)
    smilesData = dataUtilities.DataTable(myDomain, [[smiles]])

    # Calculate descriptors defined in the model files
    try:
        descList = self.model.varNames
    except AttributeError:
        # Consensus object different
        descList = [attr.name for attr in self.model.domain.variables]

    # Determine Signature and non-Signature descriptor names
    _, clabDesc, signatureHeight, _, _ = descUtilities.getDescTypes(descList)

    # Signatures
    if "sign" in DescMethodsAvailable and signatureHeight:
        print("Calculating signatures...")
        preCalcData = dataUtilities.DataTable(self.preDefSignatureFile)
        startHeight = 0  # Not used desc ignored in model prediction
        endHeight = signatureHeight
        _, _, cmpdSignList, sdfStr = getSignatures.getSignatures(
            smilesData, startHeight, endHeight, preCalcData,
            returnAtomID=True)
    else:
        cmpdSignList = [[]]
        sdfStr = ""

    if not getMolFile:
        return (clabDesc, cmpdSignList[0])
    if not sdfStr:
        return (clabDesc, cmpdSignList[0], "", "")

    # Keep everything before the SDF record terminator.
    molLines = []
    for line in sdfStr[0]:
        if "$$$$" in line:
            break
        molLines.append(line)
    molStr = "".join(molLines)

    # create a mol file; the context manager closes it even if write fails
    molFile = miscUtilities.generateUniqueFile(desc="NN", ext="mol")
    with open(molFile, "w") as molHandle:
        molHandle.write(molStr)
    return (clabDesc, cmpdSignList[0], molFile, molStr)
def processSignificance(self, smi, prediction, orderedDesc, res, resultsPath,
                        exWithDesc=None, idx=0, topN=1, regMinIsDesired=True):
    """Fill ``res`` with the most significant descriptors of one prediction.

    ``res`` is updated in place with the keys::

        "signature"           most significant signature(s)
        "signarure_deriv_val" derivative value(s) of those signatures
                              (the key spelling is kept as callers see it)
        "non-signature"       text advice built from non-signature descriptors
        "imgPath", "molStr", "atoms", "color"
                              highlight information (not set when
                              ``exWithDesc`` is given)

    For classification ``self.predictionOutcomes`` must define
    ``[BADlabel, GOODlabel]`` in this order; for regression a GOOD prediction
    is one BELOW ``self.significanceThreshold``.

    ``orderedDesc`` has the layout::

        {"molStr": "...",      # only on specialType=1
         "height": 2,          # only on specialType=1
         "atoms": "[1,2,3]",   # only on specialType=1
         'Continuous': {'DOWN': [[('[F]', -0.0088), ...], [('[O3]', -0.0073)]],
                        'UP':   [[('[Car]([Car][Nar])', 0.0097)], ...]},
         'Discrete':   {'DOWN': [], 'UP': []}}

    i.e. per attribute type and direction a ranked list of groups of
    ``(descriptor name, derivative)`` pairs.  The ranked lists may be
    consumed (popped) by this method when ``topN`` is greater than 1.

    Parameters:
        smi: SMILES used for the signature calculation and the image.
        prediction: predicted value, compared with the GOOD label or the
            threshold to choose the highlight color ('g' or 'r').
        resultsPath: existing directory for the image; otherwise no image
            path is used.
        exWithDesc: example with precalculated signatures, if any.
        idx: index used in the image file name.
        topN: number of significant descriptors to report.
        regMinIsDesired: for regression, when false the UP/DOWN meaning of
            the non-signature advice is swapped.

    Returns:
        ``res`` after a complete run.  ``None`` when the required
        ``predictionOutcomes`` / ``significanceThreshold`` is missing, or when
        no significant signature was found (in which case ``res`` has already
        been filled with empty signature fields).
    """
    isSpecialModel = (hasattr(self.model, "specialType")
                      and self.model.specialType == 1)
    isClassification = (self.model.classVar.varType
                        == orange.VarTypes.Discrete)

    atomColor = None
    orderedDesc_sign = {'Continuous': {'DOWN': [], 'UP': []},
                        'Discrete': {'DOWN': [], 'UP': []}}
    orderedDesc_nonSign = {'Continuous': {'DOWN': [], 'UP': []},
                           'Discrete': {'DOWN': [], 'UP': []}}

    if isSpecialModel:
        print("This is a special model nr 1: It calculates itself the Significant Signatures")
        endHeight = orderedDesc["height"]
        try:
            molStr = orderedDesc["molStr"]
            # NOTE(review): eval() on text supplied by the model; consider
            # ast.literal_eval if that text can ever be untrusted.
            atoms = eval(orderedDesc["atoms"])
        except Exception:
            atoms = None
            molStr = None
        if not molStr or not isinstance(atoms, list) or not atoms:
            atoms = None
            molStr = None
        orderedDesc_sign = orderedDesc
    else:
        atoms = None
        endHeight = None
        molStr = None
        _, _, _, _, signDesc = descUtilities.getDescTypes(
            [attr.name for attr in self.model.domain.attributes])
        # Split every ranked group into its signature and non-signature
        # members, keeping the ranking order and dropping empty groups.
        for attrType in ('Continuous', 'Discrete'):
            for vector in ('UP', 'DOWN'):
                for group in orderedDesc[attrType][vector]:
                    signPart = [d for d in group if d[0] in signDesc]
                    nonSignPart = [d for d in group if d[0] not in signDesc]
                    if signPart:
                        orderedDesc_sign[attrType][vector].append(signPart)
                    if nonSignPart:
                        orderedDesc_nonSign[attrType][vector].append(
                            nonSignPart)

    # Process color to use if highlight is used
    outComeIsRev = None
    if isClassification:
        if self.predictionOutcomes is None:
            print("WARNING: Cannot process Significance, Missing definition of predictionOutcomes for the EndPoint")
            return
        theGoodPred = str(self.predictionOutcomes[1])
        modelOutcomes = [str(p) for p in self.model.classVar.values]
        if modelOutcomes == self.predictionOutcomes:
            outComeIsRev = False
        elif modelOutcomes[::-1] == self.predictionOutcomes:
            outComeIsRev = True
        else:
            # Joined with single spaces to match the output of the original
            # multi-argument print statement.
            print(" ".join([
                "ERROR: User outcome ordered list is not consistens toth model: ",
                str(self.predictionOutcomes), "<-->",
                str(self.model.classVar.values)]))
        atomColor = 'g' if prediction == theGoodPred else 'r'
    else:
        if self.significanceThreshold is None:
            print("WARNING: Cannot process Significance, Missing definition of significanceThreshold for the EndPoint")
            return
        # Below the threshold it is a GOOD prediction
        atomColor = 'g' if prediction < self.significanceThreshold else 'r'

    # Process Signatures
    if exWithDesc:
        # Precalculated signatures
        _, cmpdSignDict, _, _ = self.getSignDataStruct(exWithDesc)
    else:
        # OBS Hard coded for signatures 0 to 1.
        smilesData = self.getAZOdata(smi)
        if "sign" in DescMethodsAvailable:
            _, cmpdSignDict, _, _ = getSignatures.getSignatures(
                smilesData, 0, 1, returnAtomID=True, useClabSmiles=False)
        else:
            cmpdSignDict = [{}] * len(smilesData)

    contSign = orderedDesc_sign["Continuous"]

    def locateTop(ranked):
        """Return (abs derivative, rank index, element index) of the top entry.

        A special model already reports its most significant signature
        first.  Otherwise, rank by rank, the first signature present in the
        molecule is taken; the scan stops once a non-zero derivative has
        been found.
        """
        if isSpecialModel:
            if len(ranked):
                return abs(ranked[0][0][1]), 0, 0
            return 0.0, 0, 0
        topAbs, topRank, topElem = 0.0, 0, 0
        for rankIdx, group in enumerate(ranked):
            if topAbs != 0.0:
                break
            for elemIdx, desc in enumerate(group):
                # Test that the signature exists in the molecule
                if desc[0] in cmpdSignDict[0]:
                    topAbs = abs(desc[1])
                    topRank = rankIdx
                    topElem = elemIdx
                    break
        return topAbs, topRank, topElem

    downAbs, rankIdxDown, elemIdxDown = locateTop(contSign["DOWN"])
    upAbs, rankIdxUp, elemIdxUp = locateTop(contSign["UP"])

    # Defaults used when no significant signature can be selected.
    MSDsign = None
    MSDdv = 0
    # Could happen that all derivatives are smaller than epsilon
    if contSign["UP"] or contSign["DOWN"]:
        if topN == 1:
            # Preserve syntax for Plato, only one significant signature
            if upAbs > downAbs:
                top = contSign["UP"][rankIdxUp][elemIdxUp]
                MSDsign, MSDdv = top[0], top[1]
            elif downAbs != 0.0:
                # DOWN wins when larger, and also on a non-zero tie.
                top = contSign["DOWN"][rankIdxDown][elemIdxDown]
                MSDsign, MSDdv = top[0], top[1]
        else:
            MSDsign = []
            MSDdv = []

            def takeTop(vector):
                """Move the first entry of the top group of ``vector`` to the result."""
                ranked = contSign[vector]
                MSDsign.append(ranked[0][0][0])
                MSDdv.append(ranked.pop(0)[0][1])

            # The loop variable must not be called idx: the idx parameter is
            # needed later for the image file name.
            for _ in range(topN):
                try:
                    if (abs(contSign["DOWN"][0][0][1])
                            > abs(contSign["UP"][0][0][1])):
                        takeTop("DOWN")
                    else:
                        takeTop("UP")
                except Exception:
                    # One of the directions is exhausted: take from whichever
                    # still has entries, DOWN first; best effort otherwise.
                    for vector in ("DOWN", "UP"):
                        try:
                            takeTop(vector)
                            break
                        except Exception:
                            continue

    # Process non-signatures
    if isClassification and outComeIsRev:
        UP, DOWN = "DOWN", "UP"
    elif not isClassification and not regMinIsDesired:
        UP, DOWN = "DOWN", "UP"
    else:
        UP, DOWN = "UP", "DOWN"

    # Process DiscreteAttrs
    MSDnonSign = ""
    discDown = orderedDesc_nonSign["Discrete"][DOWN]
    for n in range(min(topN, len(discDown))):
        if topN > 1:
            MSDnonSign += str(n + 1) + ": "
        MSDnonSign += '\n'.join(
            ["Change " + x[0] for x in discDown[n]]) + '\n'

    # Process Continuous attributes
    contUp = orderedDesc_nonSign["Continuous"][UP]
    contDown = orderedDesc_nonSign["Continuous"][DOWN]
    for n in range(topN):
        order = str(n + 1) + ": " if topN > 1 else ""
        downAbs = abs(contDown[0][0][1]) if len(contDown) else 0.0
        upAbs = abs(contUp[0][0][1]) if len(contUp) else 0.0
        # Both branches run on a tie, as in the original.
        if contUp and upAbs >= downAbs:
            TOPmsd = contUp.pop(0)
            MSDnonSign += order + '\n'.join(
                ["Decrease " + x[0] for x in TOPmsd]) + '\n'
        if contDown and downAbs >= upAbs:
            TOPmsd = contDown.pop(0)
            MSDnonSign += order + '\n'.join(
                ["Increase " + x[0] for x in TOPmsd]) + '\n'
    res["non-signature"] = MSDnonSign

    # Most probably Signatures will always be associated with Discrete
    # attributes. Nevertheless, it happens that some are Continuous, and
    # therefore we will be using signatures reported as Continuous if any
    if not MSDsign:
        res["imgPath"] = ""
        res["signature"] = ""
        res["signarure_deriv_val"] = 0
        return

    if resultsPath and os.path.isdir(resultsPath):
        imgPath = os.path.join(
            resultsPath,
            "significance_" + str(idx) + "_"
            + str(time.time()).replace(".", '') + ".png")
    else:
        imgPath = ""

    # Call the method to create the image/mol specifying the color of the
    # highlighted atoms
    if exWithDesc is None:  # Don't use with precalc desc
        if molStr and atoms and endHeight is not None and not imgPath:
            print("Using molStr and atoms from Learner Significant Signature")
            res["imgPath"] = ''
            res["molStr"] = molStr
            allAtoms = self.getNNAtoms(molStr, atoms, endHeight)
            res["atoms"] = allAtoms
            res["color"] = [atomColor] * len(allAtoms)
        else:
            (res["imgPath"], res["molStr"], res["atoms"],
             res["color"]) = self.createSignImg(
                smi, MSDsign, atomColor, imgPath, endHeight)

    res["signature"] = MSDsign
    res["signarure_deriv_val"] = MSDdv
    return res
def getDescriptors(self, smiles):
    """Append the descriptors required by the model to ``self.smilesData``.

    ``self.getSmilesData(smiles)`` is called first; the descriptor names are
    taken from ``self.model.varNames`` and split by
    ``descUtilities.getDescTypes``.  Each descriptor family (signatures,
    C-Lab, Cinfony, BBRC) is then calculated when it is requested and, for
    all but Cinfony, listed in ``DescMethodsAvailable``.

    The calculation is attempted at most three times.  An attempt fails when
    an exception is raised or when at least one compound ends up with every
    attribute unknown.  After any failed attempt ``self.smilesData`` is reset
    to the copy taken before the first attempt, so that a retry never starts
    from a partially augmented table.
    """
    self.getSmilesData(smiles)

    # Calculate descriptors defined in the model files
    descList = self.model.varNames

    # Clean copy used to restart a failed attempt.
    savedSmilesData = dataUtilities.DataTable(self.smilesData)

    # Try 3 times to get all compounds' descriptors
    nTry = 3
    # NOTE(review): errorDesc is only written, never read, in the visible
    # part of this function.
    errorDesc = ""
    # Pre-bound so the except handler can always reference it.
    traceLog = ""
    while nTry > 0:
        try:
            traceLog = "Model Location:" + str(self.modelLocation) + "\n"

            # Determine Signature and non-Signature descriptor names
            cinfonyDesc, clabDesc, signatureHeight, bbrcDesc, _ = \
                descUtilities.getDescTypes(descList)

            # Signatures
            if "sign" in DescMethodsAvailable and signatureHeight:
                traceLog += "Calculating signatures...\n"
                print("Calculating signatures....")
                preCalcData = dataUtilities.DataTable(self.preDefSignatureFile)
                startHeight = 0  # Not used desc ignored in model prediction
                endHeight = signatureHeight
                self.smilesData = getSignatures.getSignatures(
                    self.smilesData, startHeight, endHeight, preCalcData)

            # C-Lab desc
            if "clab" in DescMethodsAvailable and clabDesc:
                traceLog += "Calculating C-Lab...\n"
                print("Calculating C-Lab desc....")
                self.smilesData = ClabUtilities.appendCLabDesc(
                    clabDesc, self.smilesData)

            # Cinfony
            if cinfonyDesc:
                traceLog += "Calculating Cinfony...\n"
                print("Calculating Cinfony desc...")
                self.smilesData = getCinfonyDesc.getCinfonyDescResults(
                    self.smilesData, cinfonyDesc, radius=5)

            # bbrcDesc
            if "bbrc" in DescMethodsAvailable and bbrcDesc:
                traceLog += "Calculating BBRC...\n"
                print("Calculating BBRC desc...")
                self.smilesData = getBBRCDesc.getBBRCDescResult(
                    self.smilesData, algo="FTM", minSupPar=1,
                    descList=bbrcDesc)

            # Detect if the descriptor calculation or something else went
            # wrong: a compound whose attributes are all unknown is "bad".
            attributes = self.smilesData.domain.attributes
            nBadEx = sum(
                1 for ex in self.smilesData
                if all(ex[attr].isSpecial() for attr in attributes))

            if nBadEx:
                warning = ("WARNING: Desc. Calculation: From the "
                           + str(len(self.smilesData)) + " compounds, "
                           + str(nBadEx) + " could not be calculated!")
                traceLog += warning + "\n"
                print(warning)
                print("WARNING: Tying again...")
                self.smilesData = dataUtilities.DataTable(savedSmilesData)
                nTry -= 1
            else:
                nTry = 0
        except Exception as e:
            errorDesc = ("Error Calculating Descriptors:;" + traceLog
                         + str(e) + ";")
            # Same reset as the bad-example path: drop whatever descriptor
            # families were appended before the failure.
            self.smilesData = dataUtilities.DataTable(savedSmilesData)
            nTry -= 1
def getTopImportantVars(self, inEx, nVars=1, gradRef=None, absGradient=True,
                        c_step=None, getGrad=False):
    """Return the most important variables for the given example.

    Importance is a numerical derivative of the decision function value
    (second item of ``self(ex, returnDFV=True)``) with respect to each
    attribute of ``self.domain``.

    Parameters:
        inEx: example to analyse; it is first converted with
            ``dataUtilities.ExFix`` for ``self.domain``.
        nVars: number of top entries to return per list; 0 returns all.
        gradRef: decision function value of the example; computed here when
            None.
        absGradient: when true, rank by absolute derivative only; otherwise
            split the variables into "UP" (positive derivative) and "DOWN"
            (negative derivative).
        c_step: debugging only.  Passing a value makes the continuous
            gradient calculation raise ``Exception``.
        getGrad: forwarded to ``getVarNames``.

    Returns:
        None when ``self.basicStat`` is None, ``self.NTrainEx`` is falsy, or
        the class is discrete without exactly two values.  Otherwise
        ``{"Continuous": [...], "Discrete": [...]}`` when ``absGradient`` is
        true, else ``{"Continuous": {"UP": [...], "DOWN": [...]},
        "Discrete": {"UP": [...], "DOWN": [...]}}``.  Every list is the
        output of ``getVarNames(groupTiedScores(...))`` cut to ``nVars``.
    """
    # Determine Signature and non-Signature descriptor names
    _, _, _, _, signDesc = descUtilities.getDescTypes(
        [attr.name for attr in self.domain.attributes])

    ExFix = dataUtilities.ExFix()
    ExFix.set_domain(self.domain)
    ex = ExFix.fixExample(inEx)

    if self.basicStat is None or not self.NTrainEx:
        return None
    isClassification = (self.domain.classVar.varType
                        == orange.VarTypes.Discrete)
    if isClassification and len(self.domain.classVar.values) != 2:
        return None
    if gradRef is None:
        gradRef = self(ex, returnDFV=True)[1]

    def calcDiscVarGrad(var):
        """Return ([f(a), f(x)], step): f(a) is the prediction farthest from f(x)."""
        step = 1  # MUST be 1!!
        if ex[var].isSpecial():
            return ([gradRef, gradRef], step)
        localMaxDiff = 0
        localMaxPred = gradRef
        for val in self.domain[var].values:
            localEx = orange.Example(ex)
            localEx[var] = val
            pred = self(localEx, returnDFV=True)[1]
            if abs(pred - gradRef) > localMaxDiff:
                localMaxDiff = abs(pred - gradRef)
                localMaxPred = pred
        return ([localMaxPred, gradRef], step)

    def calcContVarGrad(var):
        """Return ([f(x+step), f(x-step)], step); step 0 means no statistics."""
        localEx = orange.Example(ex)
        if c_step is None:
            if isClassification:
                coef_step = 1.0
            else:
                coef_step = 0.08  # Needs confirmation! Coefficient step: c
        else:
            # used for testing significance only
            raise Exception(
                "This mode should only be used for debugging! Comment this line if debugging.")
        if var in signDesc:
            step = 1  # Set step to one in case of signatures
        elif "dev" in self.basicStat[var]:
            # dev - Standard deviation
            step = self.basicStat[var]["dev"] * coef_step
        else:
            return ([gradRef, gradRef], 0)
        if ex[var].isSpecial():
            return ([gradRef, gradRef], step)
        # step UP
        localEx[var] = ex[var] + step
        ResUp = self(localEx, returnDFV=True)[1]
        # step DOWN
        localEx[var] = ex[var] - step
        ResDown = self(localEx, returnDFV=True)[1]
        return ([ResUp, ResDown], step)

    def calcVarGrad(attr):
        """Return (derivative, f(a)); f(a) is None for continuous attributes."""
        if attr.varType == orange.VarTypes.Discrete:
            # res = [f(a), f(x)]; dividing by step is a no-op since step is 1
            res, step = calcDiscVarGrad(attr.name)
            return (res[0] - res[1], res[0])
        res, step = calcContVarGrad(attr.name)
        if step == 0:
            return (0, None)
        return ((res[0] - res[1]) / (2.0 * step), None)

    def rank(entries):
        """Order by decreasing |derivative|, group ties and cut to nVars."""
        ordered = sorted(entries, key=lambda v: abs(v[0]), reverse=True)
        named = getVarNames(groupTiedScores(ordered, 0), getGrad=getGrad)
        return named[:nRet]

    # epsilon: amplitude of derivatives that will be considered 0.
    # Attributes with derivative amplitude not above epsilon are ignored;
    # for signatures the threshold is 0.
    eps = 1E-5
    # Entries are (derivative, attribute name, f(a)); f(a) is only set for
    # discrete attributes.
    contVars = []
    discVars = []
    for attr in self.domain.attributes:
        grad, faMax = calcVarGrad(attr)
        actualEps = 0 if attr.name in signDesc else eps
        if abs(grad) > actualEps:
            entry = (grad, attr.name, faMax)
            if attr.varType == orange.VarTypes.Discrete:
                discVars.append(entry)
            else:
                contVars.append(entry)

    # A None slice bound returns everything.
    nRet = None if nVars == 0 else nVars

    # Order the vars in terms of importance
    if absGradient:
        rankedCont = rank(contVars)
        rankedDisc = rank(discVars)
        return {"Continuous": rankedCont, "Discrete": rankedDisc}

    if isClassification:
        # Exclude attributes for which f(a) was between 0 and f(x):
        # |f(a)| < |f(x)| AND f(x)*f(a) > 0
        discVars = [v for v in discVars
                    if not (abs(v[2]) < abs(gradRef) and gradRef * v[2] > 0)]

    # Now we will be looking only to the actual derivative value
    UPd = rank([v for v in discVars if v[0] > 0])
    DOWNd = rank([v for v in discVars if v[0] < 0])
    UPc = rank([v for v in contVars if v[0] > 0])
    DOWNc = rank([v for v in contVars if v[0] < 0])
    return {"Continuous": {"UP": UPc, "DOWN": DOWNc},
            "Discrete": {"UP": UPd, "DOWN": DOWNd}}
def getTopImportantVars(self, inEx, nVars=1, gradRef=None, absGradient=True,
                        c_step=None, getGrad=False):
    """Return the most important variables for the given example.

    Importance is a numerical derivative of the decision function value
    (second item of ``self(ex, returnDFV=True)``) with respect to each
    attribute of ``self.domain``.

    Parameters:
        inEx: example to analyse; it is first converted with
            ``dataUtilities.ExFix`` for ``self.domain``.
        nVars: number of top entries to return per list; 0 returns all.
        gradRef: decision function value of the example; computed here when
            None.
        absGradient: when true, rank by absolute derivative only; otherwise
            split the variables into "UP" (positive derivative) and "DOWN"
            (negative derivative).
        c_step: debugging only.  Passing a value makes the continuous
            gradient calculation raise ``Exception``.
        getGrad: forwarded to ``getVarNames``.

    Returns:
        None when ``self.basicStat`` is None, ``self.NTrainEx`` is falsy, or
        the class is discrete without exactly two values.  Otherwise
        ``{"Continuous": [...], "Discrete": [...]}`` when ``absGradient`` is
        true, else ``{"Continuous": {"UP": [...], "DOWN": [...]},
        "Discrete": {"UP": [...], "DOWN": [...]}}``.  Every list is the
        output of ``getVarNames(groupTiedScores(...))`` cut to ``nVars``.
    """
    # Determine Signature and non-Signature descriptor names
    _, _, _, _, signDesc = descUtilities.getDescTypes(
        [attr.name for attr in self.domain.attributes])

    ExFix = dataUtilities.ExFix()
    ExFix.set_domain(self.domain)
    ex = ExFix.fixExample(inEx)

    if self.basicStat is None or not self.NTrainEx:
        return None
    isClassification = (self.domain.classVar.varType
                        == orange.VarTypes.Discrete)
    if isClassification and len(self.domain.classVar.values) != 2:
        return None
    if gradRef is None:
        gradRef = self(ex, returnDFV=True)[1]

    def calcDiscVarGrad(var):
        """Return ([f(a), f(x)], step): f(a) is the prediction farthest from f(x)."""
        step = 1  # MUST be 1!!
        if ex[var].isSpecial():
            return ([gradRef, gradRef], step)
        localMaxDiff = 0
        localMaxPred = gradRef
        for val in self.domain[var].values:
            localEx = orange.Example(ex)
            localEx[var] = val
            pred = self(localEx, returnDFV=True)[1]
            if abs(pred - gradRef) > localMaxDiff:
                localMaxDiff = abs(pred - gradRef)
                localMaxPred = pred
        return ([localMaxPred, gradRef], step)

    def calcContVarGrad(var):
        """Return ([f(x+step), f(x-step)], step); step 0 means no statistics."""
        localEx = orange.Example(ex)
        if c_step is None:
            coef_step = 0.5  # Needs confirmation! Coefficient step: c
        else:
            # used for testing significance only
            raise Exception(
                "This mode should only be used for debugging! Comment this line if debugging.")
        if var in signDesc:
            step = 1  # Set step to one in case of signatures
        elif "dev" in self.basicStat[var]:
            # dev - Standard deviation
            step = self.basicStat[var]["dev"] * coef_step
        else:
            return ([gradRef, gradRef], 0)
        if ex[var].isSpecial():
            return ([gradRef, gradRef], step)
        # step UP
        localEx[var] = ex[var] + step
        ResUp = self(localEx, returnDFV=True)[1]
        # step DOWN
        localEx[var] = ex[var] - step
        ResDown = self(localEx, returnDFV=True)[1]
        return ([ResUp, ResDown], step)

    def calcVarGrad(attr):
        """Return (derivative, f(a)); f(a) is None for continuous attributes."""
        if attr.varType == orange.VarTypes.Discrete:
            # res = [f(a), f(x)]; dividing by step is a no-op since step is 1
            res, step = calcDiscVarGrad(attr.name)
            return (res[0] - res[1], res[0])
        res, step = calcContVarGrad(attr.name)
        if step == 0:
            return (0, None)
        return ((res[0] - res[1]) / (2.0 * step), None)

    def rank(entries):
        """Order by decreasing |derivative|, group ties and cut to nVars."""
        ordered = sorted(entries, key=lambda v: abs(v[0]), reverse=True)
        named = getVarNames(groupTiedScores(ordered, 0), getGrad=getGrad)
        return named[:nRet]

    # epsilon: amplitude of derivatives that will be considered 0.
    # Attributes with derivative amplitude not above epsilon are ignored;
    # for signatures the threshold is 0.
    eps = 1E-5
    # Entries are (derivative, attribute name, f(a)); f(a) is only set for
    # discrete attributes.
    contVars = []
    discVars = []
    for attr in self.domain.attributes:
        grad, faMax = calcVarGrad(attr)
        actualEps = 0 if attr.name in signDesc else eps
        if abs(grad) > actualEps:
            entry = (grad, attr.name, faMax)
            if attr.varType == orange.VarTypes.Discrete:
                discVars.append(entry)
            else:
                contVars.append(entry)

    # A None slice bound returns everything.
    nRet = None if nVars == 0 else nVars

    # Order the vars in terms of importance
    if absGradient:
        rankedCont = rank(contVars)
        rankedDisc = rank(discVars)
        return {"Continuous": rankedCont, "Discrete": rankedDisc}

    if isClassification:
        # Exclude attributes for which f(a) was between 0 and f(x):
        # |f(a)| < |f(x)| AND f(x)*f(a) > 0
        discVars = [v for v in discVars
                    if not (abs(v[2]) < abs(gradRef) and gradRef * v[2] > 0)]

    # Now we will be looking only to the actual derivative value
    UPd = rank([v for v in discVars if v[0] > 0])
    DOWNd = rank([v for v in discVars if v[0] < 0])
    UPc = rank([v for v in contVars if v[0] > 0])
    DOWNc = rank([v for v in contVars if v[0] < 0])
    return {"Continuous": {"UP": UPc, "DOWN": DOWNc},
            "Discrete": {"UP": UPd, "DOWN": DOWNd}}
def processSignificance(self, smi, prediction, orderedDesc, res, resultsPath, idx = 0, topN = 1):
    """Select the most significant descriptors for one prediction and store them in `res`.

    Parameters:
        smi          -- SMILES of the compound; passed to self.getAZOdata and
                        self.createSignImg.
        prediction   -- the model prediction. For classification it is compared
                        with str(self.predictionOutcomes[1]); for regression it
                        is compared with self.significanceThreshold.
        orderedDesc  -- ranked descriptors, grouped by tied score:
                          {"molStr": '...',     # only on specialType=1
                           "height": 2,         # only on specialType=1
                           "atoms": [1,2,3],    # only on specialType=1
                           'Continuous': {'DOWN': [[('[F]', -0.0088), ('[F]', -0.0001)],
                                                   [('[O3]', -0.0073)]],
                                          'UP':   [[('[Car]([Car][Nar])', 0.0097)],
                                                   [('[HC]([C3])', 0.0091)]]},
                           'Discrete':   {'DOWN': [], 'UP': []}}
        res          -- dict updated in place (see below).
        resultsPath  -- directory for the generated image; when it is empty or
                        not an existing directory an empty image path is used.
        idx          -- index embedded in the image file name.
        topN         -- number of ranks of non-signature descriptors to report.

    Keys written to `res`:
        "non-signature"       -- text with "Change"/"Decrease"/"Increase" hints
        "signature"           -- the most significant signature ("" if none)
        "signarure_deriv_val" -- its derivative value (0 if none); the key
                                 spelling is kept as-is for existing callers
        "imgPath", "molStr", "atoms", "color"
                              -- highlight information ("imgPath" only when no
                                 significant signature was found)

    Requirements on self:
        Classification: self.predictionOutcomes must define
        [BADlabel, GOODlabel] in this same order.
        Regression: self.significanceThreshold, for which a GOOD prediction is
        BELOW the threshold.
        If the required attribute is None a warning is printed and the method
        returns without touching `res`.

    Returns None.
    """
    def _topSignature(ranked, signaturesByCmpd):
        """Return (abs(deriv), name, deriv) of the leading signature in `ranked`.

        With signaturesByCmpd set to None the very first entry is taken as is.
        Otherwise the ranks are scanned in order and, within each rank, the
        first entry whose name is a key of signaturesByCmpd[0] is taken; the
        scan stops as soon as such an entry has a non-zero derivative.
        (0.0, None, 0) is returned when nothing qualifies.
        """
        if signaturesByCmpd is None:
            if len(ranked):
                first = ranked[0][0]
                return abs(first[1]), first[0], first[1]
            return 0.0, None, 0
        best = (0.0, None, 0)
        for group in ranked:
            for entry in group:
                # Test that the signature exists in the molecule
                if entry[0] in signaturesByCmpd[0]:
                    best = (abs(entry[1]), entry[0], entry[1])
                    break
            if best[0] != 0.0:
                break
        return best

    # NOTE: print calls below use the single-argument parenthesised form, which
    # produces the same output as the print statement under Python 2.
    atomColor = None
    isSpecialModel = hasattr(self.model, "specialType") and self.model.specialType == 1

    orderedDesc_sign = {'Continuous': {'DOWN': [], 'UP': []}, 'Discrete': {'DOWN': [], 'UP': []}}
    orderedDesc_nonSign = {'Continuous': {'DOWN': [], 'UP': []}, 'Discrete': {'DOWN': [], 'UP': []}}
    if isSpecialModel:
        print("This is a special model nr 1: It calculates itself the Significant Signatures")
        endHeight = orderedDesc["height"]
        try:
            molStr = orderedDesc["molStr"]
            # SECURITY NOTE(review): eval() executes arbitrary expressions. If
            # orderedDesc can ever carry untrusted data, ast.literal_eval
            # should be used here instead - confirm with the model owners.
            atoms = eval(orderedDesc["atoms"])
        except Exception:
            atoms = None
            molStr = None
        # Both pieces are needed together; discard them if either is unusable
        if not molStr or not isinstance(atoms, list) or not atoms:
            atoms = None
            molStr = None
        orderedDesc_sign = orderedDesc
    else:
        atoms = None
        endHeight = None
        molStr = None
        _cinfonyDesc, _clabDesc, _signatureHeight, _bbrcDesc, signDesc = \
            descUtilities.getDescTypes([attr.name for attr in self.model.domain.attributes])
        # Split every tied-score group into its signature and non-signature
        # members, keeping the ranking order and the grouping.
        for attrType in ['Continuous', 'Discrete']:
            for vector in ['UP', 'DOWN']:
                for group in orderedDesc[attrType][vector]:
                    signEmpty = True
                    nonSignEmpty = True
                    for attr in group:
                        if attr[0] in signDesc:
                            if signEmpty:
                                signEmpty = False
                                orderedDesc_sign[attrType][vector].append([])
                            orderedDesc_sign[attrType][vector][-1].append(attr)
                        else:
                            if nonSignEmpty:
                                nonSignEmpty = False
                                orderedDesc_nonSign[attrType][vector].append([])
                            orderedDesc_nonSign[attrType][vector][-1].append(attr)

    #Process color to use if highlight is used
    outComeIsRev = None
    isClassification = self.model.classVar.varType == orange.VarTypes.Discrete
    if isClassification:
        if self.predictionOutcomes is None:
            print("WARNING: Cannot process Significance, Missing definition of predictionOutcomes for the EndPoint")
            return
        theGoodPred = str(self.predictionOutcomes[1])
        modelOutcomes = [str(p) for p in self.model.classVar.values]
        if modelOutcomes == self.predictionOutcomes:
            outComeIsRev = False
        elif modelOutcomes[::-1] == self.predictionOutcomes:
            outComeIsRev = True
        else:
            # Processing continues with outComeIsRev left as None
            print("ERROR: User outcome ordered list is not consistens toth model: " + " " +
                  str(self.predictionOutcomes) + " " + "<-->" + " " + str(self.model.classVar.values))
        if prediction == theGoodPred:
            atomColor = 'g'
        else:
            atomColor = 'r'
    else:
        if self.significanceThreshold is None:
            print("WARNING: Cannot process Significance, Missing definition of significanceThreshold for the EndPoint")
            return
        if prediction < self.significanceThreshold:  # It is a GOOD prediction
            atomColor = 'g'
        else:
            atomColor = 'r'

    #Process Signatures
    # OBS Hard coded for signatures 0 to 1.
    smilesData = self.getAZOdata(smi)
    _dataSign, cmpdSignDict, _cmpdSignList, _sdfStr = getSignatures.getSignatures(
        smilesData, 0, 1, returnAtomID = True, useClabSmiles = False)

    # A special (signSVM) model already returns one signature as the most
    # significant, so no check against the molecule's signatures is done for it.
    if isSpecialModel:
        presentSigns = None
    else:
        presentSigns = cmpdSignDict
    downAbs, downSign, downDv = _topSignature(orderedDesc_sign["Continuous"]["DOWN"], presentSigns)
    upAbs, upSign, upDv = _topSignature(orderedDesc_sign["Continuous"]["UP"], presentSigns)

    if upAbs > downAbs:
        MSDsign = upSign
        MSDdv = upDv
    elif downAbs != 0.0:
        # DOWN is strictly larger, or ties with a non-zero UP
        MSDsign = downSign
        MSDdv = downDv
    else:
        MSDsign = None
        MSDdv = 0

    #Process non-signatures
    # With a reversed outcome order the meaning of the two directions swaps
    if isClassification and outComeIsRev:
        UP = "DOWN"
        DOWN = "UP"
    else:
        UP = "UP"
        DOWN = "DOWN"

    #Process DiscreteAttrs
    MSDnonSign = ""
    discreteDown = orderedDesc_nonSign["Discrete"][DOWN]
    for n in range(min(topN, len(discreteDown))):
        if topN > 1:
            MSDnonSign += str(n+1)+": "
        MSDnonSign += '\n'.join(["Change "+x[0] for x in discreteDown[n]])+'\n'

    #Process Continuous attributes
    contUp = orderedDesc_nonSign["Continuous"][UP]
    contDown = orderedDesc_nonSign["Continuous"][DOWN]
    for n in range(topN):
        if topN > 1:
            order = str(n+1)+": "
        else:
            order = ""
        # Both magnitudes are read before anything is popped, so a tie
        # reports the UP group and the DOWN group under the same rank.
        if len(contDown):
            nonSignDownAbs = abs(contDown[0][0][1])
        else:
            nonSignDownAbs = 0.0
        if len(contUp):
            nonSignUpAbs = abs(contUp[0][0][1])
        else:
            nonSignUpAbs = 0.0
        if contUp and nonSignUpAbs >= nonSignDownAbs:
            TOPmsd = contUp.pop(0)
            MSDnonSign += order + '\n'.join(["Decrease "+x[0] for x in TOPmsd])+'\n'
        if contDown and nonSignDownAbs >= nonSignUpAbs:
            TOPmsd = contDown.pop(0)
            MSDnonSign += order + '\n'.join(["Increase "+x[0] for x in TOPmsd])+'\n'
    res["non-signature"] = MSDnonSign

    # Most probably Signatures will always be associated with Discrete attributes.
    # Nevertheless, it happens that some are Continuous, and therefore
    # we will be using signatures reported as Continuous if any
    if not MSDsign:
        res["imgPath"] = ""
        res["signature"] = ""
        res["signarure_deriv_val"] = 0
        return

    if resultsPath and os.path.isdir(resultsPath):
        imgPath = os.path.join(resultsPath, "significance_"+str(idx)+"_"+str(time.time()).replace(".",'')+".png")
    else:
        imgPath = ""
    # Call the method to create the image/mol specifying the color of the hilighted atoms
    if molStr and atoms and endHeight is not None and not imgPath:
        print("Using molStr and atoms from Learner Significant Signature")
        res["imgPath"] = ''
        res["molStr"] = molStr
        allAtoms = self.getNNAtoms(molStr, atoms, endHeight)
        res["atoms"] = allAtoms
        res["color"] = [atomColor]*len(allAtoms)
    else:
        res["imgPath"], res["molStr"], res["atoms"], res["color"] = \
            self.createSignImg(smi, MSDsign, atomColor, imgPath, endHeight)
    #Fix the significant descriptors so that it is a formated string
    res["signature"] = MSDsign
    res["signarure_deriv_val"] = MSDdv