def selectModel(MLStatistics, logFile=None):
    """Return the model with the highest Q2/CA among the methods whose
       StabilityValue is below the applicable AZOC stability threshold.
       If no method is considered stable, select the method with the
       greatest Q2/CA anyway."""
    log(logFile, "Selecting MLmethod...")
    bestModelName = None
    bestRes = None
    bestStableVal = None
    # Select only from stable models
    for modelName in MLStatistics:
        StabilityValue = MLStatistics[modelName]["StabilityValue"]
        if StabilityValue is not None:
            if MLStatistics[modelName]["responseType"] == "Classification":
                if statc.mean(MLStatistics[modelName]["foldStat"]["nTestCmpds"]) > 50:
                    stableTH = AZOC.QSARSTABILITYTHRESHOLD_CLASS_L
                else:
                    stableTH = AZOC.QSARSTABILITYTHRESHOLD_CLASS_H
            elif MLStatistics[modelName]["responseType"] == "Regression":
                if statc.mean(MLStatistics[modelName]["foldStat"]["nTestCmpds"]) > 50:
                    stableTH = AZOC.QSARSTABILITYTHRESHOLD_REG_L
                else:
                    stableTH = AZOC.QSARSTABILITYTHRESHOLD_REG_H
            if StabilityValue < stableTH:
                valRes = max(MLStatistics[modelName]["Q2"],
                             MLStatistics[modelName]["CA"])  # One of them is always None
                if bestRes is None or valRes > bestRes:
                    bestRes = valRes
                    bestModelName = modelName
                    bestStableVal = StabilityValue
                elif valRes == bestRes and StabilityValue < bestStableVal:
                    # Tie on Q2/CA: prefer the more stable model
                    bestRes = valRes
                    bestModelName = modelName
                    bestStableVal = StabilityValue
    # No stable models found! Select the one with the best result anyway.
    if bestModelName is None:
        log(logFile, "  No stable models found! Selecting the one with best result still...")
        for modelName in MLStatistics:
            valRes = max(MLStatistics[modelName]["Q2"],
                         MLStatistics[modelName]["CA"])  # One of them is always None
            if bestRes is None or valRes > bestRes:
                bestRes = valRes
                bestModelName = modelName
        log(logFile, "  Selected the non-stable MLmethod: " + bestModelName)
    else:
        log(logFile, "  Selected the stable MLmethod: " + bestModelName)
    MLMethod = copy.deepcopy(MLStatistics[bestModelName])
    MLMethod["MLMethod"] = bestModelName
    MLStatistics[bestModelName]["selected"] = True
    return MLMethod
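# A minimal usage sketch, not from the original module: selectModel expects
# one statObj per ML method, shaped like the output of createStatObj below.
# The dict literal and outcome are illustrative assumptions only.
# exampleStats = {
#     "RF":  {"responseType": "Regression", "StabilityValue": 0.02,
#             "Q2": 0.81, "CA": None, "foldStat": {"nTestCmpds": [60, 60, 60]}},
#     "SVM": {"responseType": "Regression", "StabilityValue": 0.30,
#             "Q2": 0.85, "CA": None, "foldStat": {"nTestCmpds": [60, 60, 60]}},
# }
# best = selectModel(exampleStats)
# print best["MLMethod"]  # -> "RF", assuming AZOC.QSARSTABILITYTHRESHOLD_REG_L
#                         #    lies between 0.02 and 0.30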
def calcRsqrt(exp_pred_Val):
    """Calculates the Rsqrt (coefficient of determination, R^2) of the
       predicted values in exp_pred_Val[n][1] against the respective
       experimental values in exp_pred_Val[n][0]
       Input example:
            [
              (ExperimentalValue1, PredictedValue1),  # In respect to 1st Ex
              (ExperimentalValue2, PredictedValue2),  # In respect to 2nd Ex
              (ExperimentalValue3, PredictedValue3),  # In respect to 3rd Ex
              (ExperimentalValue4, PredictedValue4),  # In respect to 4th Ex
              ...                                     # ...
            ]
    """
    # Calc mean of the experimental response variable
    actualValuesList = []
    for val in exp_pred_Val:
        actualValuesList.append(val[0])
    testMean = statc.mean(actualValuesList)

    errSum = 0.0
    meanSum = 0.0
    for val in exp_pred_Val:
        errSum = errSum + math.pow(val[0] - string.atof(str(val[1])), 2)
        meanSum = meanSum + math.pow(testMean - val[0], 2)
    if not meanSum:
        # All experimental values are identical; R^2 is undefined
        Rsqrt = -999999
    else:
        Rsqrt = 1 - errSum / meanSum
    return Rsqrt
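# Worked example (pure arithmetic, values made up): for
# [(1.0, 1.1), (2.0, 1.9), (3.0, 3.0)] the experimental mean is 2.0,
# errSum = 0.01 + 0.01 + 0.0 = 0.02 and meanSum = 1.0 + 0.0 + 1.0 = 2.0,
# so calcRsqrt returns 1 - 0.02/2.0 = 0.99.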
def _get_par(self, datao):
    gaussiane = [
        estimate_gaussian_per_class(datao, at, common_if_extreme=True)
        for at in range(len(datao.domain.attributes))
    ]
    normalizec = []
    for i, g in zip(range(len(datao.domain.attributes)), gaussiane):
        r = [_llrlogratio(ex[i].value, *g) for ex in datao]
        normalizec.append((mean(r), std(r)))
    return gaussiane, normalizec
def getRMSEstd(res, nFolds):
    """ Method for calculating the std of RMSE of nFolds in a crossvalidation (returned).
        res is the object containing the results from orngTest methods such as crossValidation.
    """
    # Initialize a list to contain lists of errors for each fold.
    errorList = []
    for idx in range(nFolds):
        errorList.append([])
    # ex contains info on the fold number, prediction and actual responses for each example used in the CV
    # Append ex error to correct fold list
    for ex in res.results:
        error = (ex.classes[0] - ex.actualClass)**2
        errorList[ex.iterationNumber].append(error)
    # RMSE of the different folds
    RMSElist = []
    for idx in range(nFolds):
        average = sum(errorList[idx]) / len(errorList[idx])
        RMSElist.append(math.sqrt(average))
    RMSEstd = stats.stdev(RMSElist)
    RMSEmean = statc.mean(RMSElist)
    if verbose > 0:
        print str(RMSEmean) + "\t" + str(RMSEstd) + "\t" + \
              string.join([str(x) for x in RMSElist], "\t")
    return RMSEstd, RMSElist
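# Numeric illustration: with nFolds = 2 and per-fold squared errors
# [[1.0, 1.0], [4.0, 4.0]], the per-fold RMSEs are [1.0, 2.0]; the function
# returns (stats.stdev([1.0, 2.0]), [1.0, 2.0]), and the mean RMSE 1.5 is
# only printed when verbose > 0.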
def Rsqrt_obsolete(res=None):
    """ Calculates the R-squared (Coefficient of determination) of orngTest.ExperimentResults in res
        The results res must be from a learner
    """
    # If called without arguments, return the type of problems this method can be used for:
    # 1 - Classification problems (Discrete Class)
    # 2 - Regression problems (Continuous Class)
    # 3 - Both Regression and Classification problems (Continuous or Discrete Class)
    if res == None:
        return {"type": REGRESSION}

    if res.numberOfIterations > 1:
        Rs = [[0.0] * res.numberOfIterations for i in range(res.numberOfLearners)]
        errSum = [[0.0] * res.numberOfIterations for i in range(res.numberOfLearners)]
        meanSum = [[0.0] * res.numberOfIterations for i in range(res.numberOfLearners)]
        means = [[0.0] * res.numberOfIterations for i in range(res.numberOfLearners)]
        nIter = [0] * res.numberOfIterations
        for tex in res.results:
            ac = float(tex.actualClass)
            nIter[tex.iterationNumber] += 1
            for i, cls in enumerate(tex.classes):
                means[i][tex.iterationNumber] += ac
        # Note: this loop reuses `tex` left over from the loop above (the last
        # result) only to iterate over the learner indices.
        for nit, it in enumerate(nIter):
            for i, cls in enumerate(tex.classes):
                means[i][nit] /= it
        for tex in res.results:
            ac = float(tex.actualClass)
            for i, cls in enumerate(tex.classes):
                errSum[i][tex.iterationNumber] += (float(cls) - ac)**2
                meanSum[i][tex.iterationNumber] += (means[i][tex.iterationNumber] - ac)**2
        for learner in range(res.numberOfLearners):
            for it in range(len(nIter)):
                if meanSum[learner][it] == 0:
                    return "N/A"
                Rs[learner][it] = 1 - (errSum[learner][it] / meanSum[learner][it])
        return [statc.mean(x) for x in Rs]
    else:
        RsqrtList = []
        for nLearner in range(len(res.results[0].classes)):
            # Calc average of the response variable
            testMean = 0
            for ex in res.results:
                testMean = testMean + ex.actualClass
            testMean = testMean / len(res.results)
            errSum = 0.0
            meanSum = 0.0
            for ex in res.results:
                errSum = errSum + math.pow(ex.actualClass - ex.classes[nLearner], 2)
                meanSum = meanSum + math.pow(testMean - ex.actualClass, 2)
            if meanSum == 0:
                return "N/A"
            RsqrtList.append(1 - errSum / meanSum)
        return RsqrtList
def RMSE_obsolete(res=None):
    """ Calculates the Root Mean Squared Error of orngTest.ExperimentResults in res
        The results res must be from a regressor
    """
    # If called without arguments, return the type of problems this method can be used for:
    # 1 - Classification problems (Discrete Class)
    # 2 - Regression problems (Continuous Class)
    # 3 - Both Regression and Classification problems (Continuous or Discrete Class)
    if res == None:
        return {"type": REGRESSION}

    if res.numberOfIterations > 1:
        MSEs = [[0.0] * res.numberOfIterations for i in range(res.numberOfLearners)]
        nIter = [0] * res.numberOfIterations
        for tex in res.results:
            ac = float(tex.actualClass)
            nIter[tex.iterationNumber] += 1
            for i, cls in enumerate(tex.classes):
                MSEs[i][tex.iterationNumber] += (float(cls) - ac)**2
        MSEs = [[x / ni for x, ni in zip(y, nIter)] for y in MSEs]
        MSEs = [[math.sqrt(x) for x in y] for y in MSEs]
        # Print output from each fold to temp file
        RMSEfoldList = MSEs
        RMSE = [statc.mean(x) for x in RMSEfoldList]
        RMSEstd = stats.stdev(RMSEfoldList[0])
        #print str(RMSE[0])+"\t"+str(RMSEstd)+"\t"+string.join([str(x) for x in RMSEfoldList[0]], "\t")
        return [round(statc.mean(x), 2) for x in MSEs]
    else:
        MSEs = [0.0] * res.numberOfLearners
        for tex in res.results:
            MSEs = map(lambda res, cls, ac=float(tex.actualClass):
                       res + (float(cls) - ac)**2, MSEs, tex.classes)
        MSEs = [x / (len(res.results)) for x in MSEs]
        return [round(math.sqrt(x), 2) for x in MSEs]
def CA_obsolete(res=None, returnFoldStat=False):
    """ Calculates the classification Accuracy of orngTest.ExperimentResults in res
        The results res must be from a classifier
    """
    # If called without arguments, return the type of problems this method can be used for:
    # 1 - Classification problems (Discrete Class)
    # 2 - Regression problems (Continuous Class)
    # 3 - Both Regression and Classification problems (Continuous or Discrete Class)
    if res == None:
        return {"type": CLASSIFICATION}

    if res.numberOfIterations > 1:
        CAs = [[0.0] * res.numberOfIterations for i in range(res.numberOfLearners)]
        nIter = [0] * res.numberOfIterations
        for tex in res.results:
            ac = tex.actualClass
            nIter[tex.iterationNumber] += 1
            for i, cls in enumerate(tex.classes):
                if cls == ac:
                    CAs[i][tex.iterationNumber] += 1
        CAs = [[x / ni for x, ni in zip(y, nIter)] for y in CAs]
        CAfoldList = CAs
        CA = [statc.mean(x) for x in CAs]
        CAstd = stats.stdev(CAfoldList[0])
        if returnFoldStat:
            return [round(statc.mean(x), 3) for x in CAs], CAfoldList
        else:
            return [round(statc.mean(x), 3) for x in CAs]
    else:
        CAs = [0.0] * res.numberOfLearners
        for tex in res.results:
            CAs = map(lambda res, cls, ac=tex.actualClass:
                      res + types.IntType(cls == ac), CAs, tex.classes)
        return [round(x / (len(res.results)), 3) for x in CAs]
def estimate_gaussian_per_class(data, i, a=None, b=None, common_if_extreme=False):
    cv = data.domain.class_var
    if a == None:
        a = cv.values[0]
    if b == None:
        b = cv.values[1]

    def avWCVal(value):
        return [ex[i].value for ex in data
                if ex[-1].value == value and not ex[i].isSpecial()]

    list1 = avWCVal(a)
    list2 = avWCVal(b)

    mi1 = mi2 = st1 = st2 = None
    try:
        mi1 = statc.mean(list1)
        st1 = statc.std(list1)
    except:
        # Too few (or no) values for this class; leave mean/std as None
        pass
    try:
        mi2 = statc.mean(list2)
        st2 = statc.std(list2)
    except:
        pass

    def extreme():
        return st1 == 0 or st2 == 0

    if common_if_extreme and extreme():
        # A zero within-class std would make the Gaussian degenerate;
        # fall back to the pooled std of both classes
        st1 = st2 = statc.std(list1 + list2)

    return mi1, st1, mi2, st2
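# _llrlogratio is called throughout this section but not defined in it; in the
# LLR gene-set method it is the log-likelihood ratio of a value under the two
# class Gaussians estimated above. A minimal sketch under that assumption,
# named _llrlogratio_sketch so it cannot shadow the real implementation:
import math

def _llrlogratio_sketch(x, mi1, st1, mi2, st2):
    def logpdf(x, m, s):
        # log of the normal density N(m, s) evaluated at x
        return -math.log(s * math.sqrt(2 * math.pi)) - (x - m)**2 / (2.0 * s**2)
    # positive -> x is more likely under the first class's Gaussian
    return logpdf(x, mi1, st1) - logpdf(x, mi2, st2)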
def build_feature(self, data, gs):
    at = Orange.feature.Continuous(name=str(gs))

    geneset = list(gs.genes)
    nm, name_ind, genes, takegenes, to_geneset = self._match_data(data, geneset, odic=True)
    gsi = [name_ind[g] for g in genes]
    gausse = compute_llr(data, gsi, self._gauss_cache)
    genes_gs = [to_geneset[g] for g in genes]

    if self.normalize:  # per (3) in the paper
        #compute log ratios for all samples and genes from this gene set
        for i, gene_gs, g in zip(gsi, genes_gs, gausse):
            if gene_gs not in self._normalizec:  #skip if computed already
                r = [_llrlogratio(ex[i].value, *g) for ex in data]
                self._normalizec[gene_gs] = (mean(r), std(r))

    def t(ex, w, genes_gs=genes_gs, gausse=gausse, normalizec=self._normalizec):
        nm2, name_ind2, genes2 = self._match_instance(ex, genes_gs, None)
        gsvalues = [vou(ex, gn, name_ind2) for gn in genes2]

        vals = [_llrlogratio(v, *g) if v != "?" else 0.0
                for v, g in zip(gsvalues, gausse)]

        if len(normalizec):  #normalize according to (3)
            vals2 = []
            for v, g in zip(vals, genes_gs):
                m, s = normalizec[g]
                if s == 0:  #disregard attributes without differences
                    vals2.append(0.)
                else:
                    vals2.append((v - m) / s)
            vals = vals2

        return sum(vals)

    at.get_value_from = t
    return at
def getRsqrt(testData, predictor):
    """Calculate the coefficient of determination (R-squared) for the orange
       model predictor on the data set testData. This uses the Test Set Activity Mean:
       R^2 = 1 - sum((pred - actual)^2) / sum((testMean - actual)^2)"""
    # Calc average of the response variable
    actualValuesList = []
    for ex in testData:
        actualValuesList.append(ex.getclass().value)
    testMean = statc.mean(actualValuesList)

    errSum = 0.0
    meanSum = 0.0
    for ex in testData:
        errSum = errSum + math.pow(ex.getclass() - string.atof(str(predictor(ex))), 2)
        meanSum = meanSum + math.pow(testMean - ex.getclass(), 2)
    # Note: raises ZeroDivisionError if all actual values equal testMean
    Rsqrt = 1 - errSum / meanSum
    return Rsqrt
def createStatObj(self, results=None, exp_pred=None, nTrainCmpds=None, nTestCmpds=None,
                  responseType=None, nExtFolds=None, userAlert="", rocs=None):
    # Initialize res (statObj) for statistic results
    res = {}
    self.__log("Starting to create Stat Obj")
    # Classification
    res["CA"] = None
    res["CM"] = None
    res["MCC"] = None
    res["ROC"] = None
    # Regression
    res["Q2"] = None
    res["RMSE"] = None
    # Both
    res["StabilityValue"] = None
    res["userAlert"] = userAlert
    res["selected"] = False
    res["stable"] = False
    res["responseType"] = False
    res["foldStat"] = {
        "nTrainCmpds": None,
        "nTestCmpds": None,
        # Regression
        "Q2": None,
        "RMSE": None,
        # Classification
        "CM": None,
        "CA": None,
        "MCC": None,
        "ROC": None,
    }
    if results is None:  # or exp_pred is None or responseType is None or nExtFolds is None or nTestCmpds is None or nTrainCmpds is None:
        self.__log("    NONE...")
        return res

    res["responseType"] = responseType
    # Calculate the (Q2, RMSE) or (CM, CA) results depending on Classification or regression
    if responseType == "Classification":
        # Compute CA
        res["CA"] = sum(r[0] for r in results) / nExtFolds
        # Compute CM
        res["CM"] = copy.deepcopy(results[0][1])  # Get the first ConfMat
        for r in results[1:]:
            for Lidx, line in enumerate(r[1]):
                for idx, val in enumerate(line):
                    res["CM"][Lidx][idx] = res["CM"][Lidx][idx] + val  # Add each same ConfMat position
        # Compute MCC
        res["MCC"] = evalUtilities.calcMCC(res["CM"])
        # Compute ROC
        res["ROC"] = sum(ro[0] for ro in rocs) / self.nExtFolds
        # Compute foldStat
        res["foldStat"]["nTrainCmpds"] = [n for n in nTrainCmpds]
        res["foldStat"]["nTestCmpds"] = [n for n in nTestCmpds]
        res["foldStat"]["CA"] = [r[0] for r in results]
        res["foldStat"]["CM"] = [r[1] for r in results]
        res["foldStat"]["MCC"] = [evalUtilities.calcMCC(r[1]) for r in results]
        res["foldStat"]["ROC"] = [ro for ro in rocs]
        # Compute Stability
        res["StabilityValue"] = evalUtilities.stability(res["foldStat"]["CA"])
    else:
        # compute Q2
        res["Q2"] = evalUtilities.calcRsqrt(exp_pred)
        # compute RMSE
        res["RMSE"] = evalUtilities.calcRMSE(exp_pred)
        # Compute foldStat
        res["foldStat"]["nTrainCmpds"] = [n for n in nTrainCmpds]
        res["foldStat"]["nTestCmpds"] = [n for n in nTestCmpds]
        res["foldStat"]["RMSE"] = [r[0] for r in results]
        res["foldStat"]["Q2"] = [r[1] for r in results]
        # Compute Stability value
        res["StabilityValue"] = evalUtilities.stability(res["foldStat"]["Q2"])

    # Evaluate stability of ML
    StabilityValue = res["StabilityValue"]
    if StabilityValue is not None:
        if responseType == "Classification":
            if statc.mean(res["foldStat"]["nTestCmpds"]) > 50:
                stableTH = AZOC.QSARSTABILITYTHRESHOLD_CLASS_L
            else:
                stableTH = AZOC.QSARSTABILITYTHRESHOLD_CLASS_H
        else:
            if statc.mean(res["foldStat"]["nTestCmpds"]) > 50:
                stableTH = AZOC.QSARSTABILITYTHRESHOLD_REG_L
            else:
                stableTH = AZOC.QSARSTABILITYTHRESHOLD_REG_H
        if StabilityValue < stableTH:  # Select only stable models
            res["stable"] = True

    return res
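# Illustration of the ConfMat pooling above (values are made up): fold
# matrices [[8, 2], [1, 9]] and [[7, 3], [2, 8]] accumulate elementwise into
# res["CM"] == [[15, 5], [3, 17]], which evalUtilities.calcMCC then receives.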
def setHubs(self, i=None):
    if i is not None:
        self.hubs = i
    self.graph.tooltipNeighbours = self.hubs == 2 and self.markDistance or 0
    self.graph.markWithRed = False

    if not self.visualize or not self.visualize.graph:
        return

    hubs = self.hubs
    vgraph = self.visualize.graph

    if hubs == 0:
        return
    elif hubs == 1:  # mark nodes whose labels match the search string
        txt = self.markSearchString
        labelText = self.graph.labelText
        self.graph.markWithRed = self.graph.nVertices > 200
        self.graph.setMarkedNodes([i for i, values in enumerate(vgraph.items)
                                   if txt in " ".join([str(values[ndx]) for ndx in labelText])])
        # Debug output of the marked node indices
        print [i for i, values in enumerate(vgraph.items)
               if txt in " ".join([str(values[ndx]) for ndx in labelText])]
        return
    elif hubs == 2:
        self.graph.setMarkedNodes([])
        self.graph.tooltipNeighbours = self.markDistance
        return
    elif hubs == 3:
        self.graph.setMarkedNodes([])
        self.graph.selectionNeighbours = self.markDistance
        self.graph.markSelectionNeighbours()
        return

    self.graph.tooltipNeighbours = self.graph.selectionNeighbours = 0
    powers = vgraph.getDegrees()

    if hubs == 4:  # at least N connections
        N = self.markNConnections
        self.graph.setMarkedNodes([i for i, power in enumerate(powers) if power >= N])
    elif hubs == 5:  # at most N connections
        N = self.markNConnections
        self.graph.setMarkedNodes([i for i, power in enumerate(powers) if power <= N])
    elif hubs == 6:  # more connections than any neighbour
        self.graph.setMarkedNodes([i for i, power in enumerate(powers)
                                   if power > max([0] + [powers[nn] for nn in vgraph.getNeighbours(i)])])
    elif hubs == 7:  # more connections than the neighbour average
        self.graph.setMarkedNodes([i for i, power in enumerate(powers)
                                   if power > mean([0] + [powers[nn] for nn in vgraph.getNeighbours(i)])])
    elif hubs == 8:  # the most connected nodes, ties included
        sortedIdx = range(len(powers))
        sortedIdx.sort(lambda x, y: -cmp(powers[x], powers[y]))
        cutP = self.markNumber
        cutPower = powers[sortedIdx[cutP]]
        while cutP < len(powers) and powers[sortedIdx[cutP]] == cutPower:
            cutP += 1
        self.graph.setMarkedNodes(sortedIdx[:cutP - 1])
def mean(l):
    return statc.mean(l)
def stability(res):
    """Mean absolute deviation of the per-fold results from their mean."""
    mean = statc.mean(res)
    dists = [abs(x - mean) for x in res]
    return statc.mean(dists)
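# Worked example: stability([0.7, 0.8, 0.9]) takes mean 0.8 and absolute
# deviations [0.1, 0.0, 0.1], returning their mean, ~0.0667. Lower values
# mean the per-fold results agree more; selectModel (above) and createStatObj
# (below) compare this value against the AZOC.QSARSTABILITYTHRESHOLD_* constants.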
fh = open("RES_out"+os.environ["SGE_TASK_ID"]+".pkl","w") pickle.dump(evaluateMethod(res)[0], fh) fh.close() """ # Assess the memory requirements memSize = dataUtilities.getApproxMemReq(dataSet) evalResList = sgeUtilities.arrayJob(jobName = "EvalJob", jobNumber = %(nExtFolds)s, jobParams = [learner,%(nFolds)s,dataSet,%(evalMethodFunc)s], jobQueue = "batch.q", jobScript = jobScript, memSize = str(memSize)+"M") else: for idx in range(%(nExtFolds)s): MyRandom = orange.RandomGenerator(1000*idx+1) res = %(sMethod)s evalResList.append(%(evalMethodFunc)s(res)[0]) if isClassifier: evalRes = [round(statc.mean(evalResList),3)] else: evalRes = [round(statc.mean(evalResList),2)] if verbose > 0: print evalRes else: res = %(sMethod)s evalRes = %(evalMethodFunc)s(res) # Save intermediate result #if os.path.exists("%(runPath)sintRes.txt"): if [os.path.basename(f) for f in glob("%(runPath)s"+"*intRes.txt")] != []: tmpNew=False else: tmpNew=True #tmp=miscUtilities.lockFile("%(runPath)sintRes.txt","a") #tmp=open("%(runPath)sintRes.txt","a")
def getAcc(self):
    """ For regression problems, it returns the RMSE and the Q2.
        For Classification problems, it returns CA and the ConfMat.
        The return is made in a Dict: {"RMSE":0.2, "Q2":0.1, "CA":0.98, "CM":[[TP, FP],[FN, TN]]}
        For the EvalResults not supported for a specific learner/dataset, the respective result will be None.

        If the learner is a dict {"LearnerName":learner, ...}, the results will be a dict with results
        for all Learners and for a consensus made out of those that were stable.

        If some error occurred, the respective values in the Dict will be None.
    """
    self.__log("Starting Calculating MLStatistics")
    statistics = {}
    if not self.__areInputsOK():
        return None
    # Set the response type
    self.responseType = self.data.domain.classVar.varType == orange.VarTypes.Discrete and "Classification" or "Regression"
    self.__log("  " + str(self.responseType))

    # Create the Train and test sets
    DataIdxs = dataUtilities.SeedDataSampler(self.data, self.nExtFolds)

    # Vars for saving each Fold result
    results = {}
    exp_pred = {}
    nTrainEx = {}
    nTestEx = {}

    # Set a dict of learners
    MLmethods = {}
    if type(self.learner) == dict:
        for ml in self.learner:
            MLmethods[ml] = self.learner[ml]
    else:
        MLmethods[self.learner.name] = self.learner

    models = {}
    self.__log("Calculating Statistics for MLmethods:")
    self.__log("  " + str([x for x in MLmethods]))

    # Check the data in advance so that it will not, by chance, fail at the last fold!
    for foldN in range(self.nExtFolds):
        trainData = self.data.select(DataIdxs[foldN], negate=1)
        self.__checkTrainData(trainData)

    for ml in MLmethods:
        self.__log("    > " + str(ml) + "...")
        try:
            # Vars for saving each Fold result
            results[ml] = []
            exp_pred[ml] = []
            models[ml] = []
            nTrainEx[ml] = []
            nTestEx[ml] = []
            logTxt = ""
            for foldN in range(self.nExtFolds):
                if type(self.learner) == dict:
                    self.paramList = None

                trainData = self.data.select(DataIdxs[foldN], negate=1)
                testData = self.data.select(DataIdxs[foldN])
                nTrainEx[ml].append(len(trainData))
                nTestEx[ml].append(len(testData))

                # Test if the train sets used inside the optimizer will respect the dataSize criteria;
                # if not, don't optimize, but still train the model
                dontOptimize = False
                if self.responseType != "Classification" and (len(trainData) * (1 - 1.0 / self.nInnerFolds) < 20):
                    dontOptimize = True
                else:
                    tmpDataIdxs = dataUtilities.SeedDataSampler(trainData, self.nInnerFolds)
                    tmpTrainData = trainData.select(tmpDataIdxs[0], negate=1)
                    if not self.__checkTrainData(tmpTrainData, False):
                        dontOptimize = True

                if dontOptimize:
                    logTxt += "  Fold " + str(foldN) + ": Too few compounds to optimize model hyper-parameters\n"
                    self.__log(logTxt)
                else:
                    runPath = miscUtilities.createScratchDir(baseDir=AZOC.NFS_SCRATCHDIR, desc="AccWOptParam")
                    trainData.save(os.path.join(runPath, "trainData.tab"))

                    paramOptUtilities.getOptParam(
                        learner=MLmethods[ml],
                        trainDataFile=os.path.join(runPath, "trainData.tab"),
                        paramList=self.paramList,
                        useGrid=False,
                        verbose=self.verbose,
                        queueType=self.queueType,
                        runPath=runPath,
                        nExtFolds=None,
                        nFolds=self.nInnerFolds)
                    if not MLmethods[ml].optimized:
                        self.__log("       The learner " + str(ml) + " was not optimized.")
                        raise Exception("The learner " + str(ml) + " was not optimized.")
                    miscUtilities.removeDir(runPath)

                # Train the model
                model = MLmethods[ml](trainData)
                models[ml].append(model)
                # Test the model
                if self.responseType == "Classification":
                    results[ml].append((evalUtilities.getClassificationAccuracy(testData, model),
                                        evalUtilities.getConfMat(testData, model)))
                else:
                    local_exp_pred = []
                    for ex in testData:
                        local_exp_pred.append((ex.getclass(), model(ex)))
                    results[ml].append((evalUtilities.calcRMSE(local_exp_pred),
                                        evalUtilities.calcRsqrt(local_exp_pred)))
                    # Save the experimental value and correspondent predicted value
                    exp_pred[ml] += local_exp_pred

            res = self.createStatObj(results[ml], exp_pred[ml], nTrainEx[ml], nTestEx[ml],
                                     self.responseType, self.nExtFolds, logTxt)
            if self.verbose > 0:
                print "AccWOptParamGetter!Results " + ml + ":\n"
                pprint(res)
            if not res:
                raise Exception("No results available!")
            statistics[ml] = res.copy()
            self.__writeResults(statistics)
            self.__log("       OK")
        except:
            self.__log("       Learner " + str(ml) + " failed to create/optimize the model!")
            res = self.createStatObj()
            statistics[ml] = res.copy()

    if not statistics or len(statistics) < 1:
        self.__log("ERROR: No statistics to return!")
        return None
    elif len(statistics) > 1:
        # We still need to build a consensus model out of the stable models,
        # ONLY if there is more than one stable model!
        stableML = {}
        for modelName in statistics:
            StabilityValue = statistics[modelName]["StabilityValue"]
            if StabilityValue is not None:
                if self.responseType == "Classification":
                    if statc.mean(statistics[modelName]["foldStat"]["nTestCmpds"]) > 50:
                        stableTH = AZOC.QSARSTABILITYTHRESHOLD_CLASS_L
                    else:
                        stableTH = AZOC.QSARSTABILITYTHRESHOLD_CLASS_H
                else:
                    if statc.mean(statistics[modelName]["foldStat"]["nTestCmpds"]) > 50:
                        stableTH = AZOC.QSARSTABILITYTHRESHOLD_REG_L
                    else:
                        stableTH = AZOC.QSARSTABILITYTHRESHOLD_REG_H
                if StabilityValue < stableTH:  # Select only stable models
                    stableML[modelName] = statistics[modelName].copy()
        if len(stableML) >= 2:
            self.__log("Found " + str(len(stableML)) + " stable MLmethods out of " + str(len(statistics)) + " MLmethods.")
            if self.responseType == "Classification":
                CLASS0 = str(self.data.domain.classVar.values[0])
                CLASS1 = str(self.data.domain.classVar.values[1])
                exprTest0 = "(0"
                for ml in stableML:
                    exprTest0 += "+( " + ml + " == " + CLASS0 + " )*" + str(stableML[ml]["CA"]) + " "
                exprTest0 += ")/IF0(sum([False"
                for ml in stableML:
                    exprTest0 += ", " + ml + " == " + CLASS0 + " "
                exprTest0 += "]),1)"
                exprTest1 = exprTest0.replace(CLASS0, CLASS1)
                expression = [exprTest0 + " >= " + exprTest1 + " -> " + CLASS0, " -> " + CLASS1]
            else:
                Q2sum = sum([stableML[ml]["Q2"] for ml in stableML])
                expression = "(1 / " + str(Q2sum) + ") * (0"
                for ml in stableML:
                    expression += " + " + str(stableML[ml]["Q2"]) + " * " + ml + " "
                expression += ")"

            # Vars for saving each Fold result
            Cresults = []
            Cexp_pred = []
            CnTrainEx = []
            CnTestEx = []
            self.__log("Calculating the statistics for a Consensus model")
            for foldN in range(self.nExtFolds):
                testData = self.data.select(DataIdxs[foldN])
                CnTestEx.append(len(testData))
                consensusClassifiers = {}
                for learnerName in stableML:
                    consensusClassifiers[learnerName] = models[learnerName][foldN]

                model = AZorngConsensus.ConsensusClassifier(classifiers=consensusClassifiers, expression=expression)
                CnTrainEx.append(model.NTrainEx)
                # Test the model
                if self.responseType == "Classification":
                    Cresults.append((evalUtilities.getClassificationAccuracy(testData, model),
                                     evalUtilities.getConfMat(testData, model)))
                else:
                    local_exp_pred = []
                    for ex in testData:
                        local_exp_pred.append((ex.getclass(), model(ex)))
                    Cresults.append((evalUtilities.calcRMSE(local_exp_pred),
                                     evalUtilities.calcRsqrt(local_exp_pred)))
                    # Save the experimental value and correspondent predicted value
                    Cexp_pred += local_exp_pred

            res = self.createStatObj(Cresults, Cexp_pred, CnTrainEx, CnTestEx, self.responseType, self.nExtFolds)
            statistics["Consensus"] = res.copy()
            statistics["Consensus"]["IndividualStatistics"] = stableML.copy()
            self.__writeResults(statistics)
        self.__log("Returned multiple ML methods statistics.")
        return statistics

    # By default return the only existing statistics!
    self.__writeResults(statistics)
    self.__log("Returned only one ML method statistics.")
    return statistics[statistics.keys()[0]]
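# Illustration (names and numbers are made up): for two stable regressors
# "RF" and "SVM" with Q2 values 0.6 and 0.8, the loop above builds the
# Q2-weighted consensus expression
#     "(1 / 1.4) * (0 + 0.6 * RF  + 0.8 * SVM )"
# in which AZorngConsensus presumably substitutes each learner's prediction
# for its name before evaluating.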
def createStatObj(results=None, exp_pred=None, nTrainCmpds=None, nTestCmpds=None,
                  responseType=None, nExtFolds=None, userAlert="", foldSelectedML=None):
    # Initialize res (statObj) for statistic results
    res = {}
    # Classification
    res["CA"] = None
    res["CM"] = None
    res["MCC"] = None
    # Regression
    res["Q2"] = None
    res["RMSE"] = None
    # Both
    res["StabilityValue"] = None
    res["userAlert"] = userAlert
    res["selected"] = False
    res["stable"] = False
    res["responseType"] = False
    res["foldStat"] = {
        "nTrainCmpds": None,
        "nTestCmpds": None,
        # Regression
        "Q2": None,
        "RMSE": None,
        # Classification
        "CM": None,
        "CA": None,
        "MCC": None
    }
    if not results or results is None or exp_pred is None or responseType is None or \
       nExtFolds is None or nTestCmpds is None or nTrainCmpds is None:
        return res

    res["responseType"] = responseType
    # Calculate the (Q2, RMSE) or (CM, CA) results depending on Classification or regression
    if responseType == "Classification":
        # Compute CA
        res["CA"] = sum(r[0] for r in results) / nExtFolds
        # Compute CM
        res["CM"] = copy.deepcopy(results[0][1])  # Get the first ConfMat
        for r in results[1:]:
            for Lidx, line in enumerate(r[1]):
                for idx, val in enumerate(line):
                    res["CM"][Lidx][idx] = res["CM"][Lidx][idx] + val  # Add each same ConfMat position
        # Compute MCC
        res["MCC"] = evalUtilities.calcMCC(res["CM"])
        # Compute foldStat
        res["foldStat"]["nTrainCmpds"] = [n for n in nTrainCmpds]
        res["foldStat"]["nTestCmpds"] = [n for n in nTestCmpds]
        res["foldStat"]["CA"] = [r[0] for r in results]
        res["foldStat"]["CM"] = [r[1] for r in results]
        res["foldStat"]["MCC"] = [evalUtilities.calcMCC(r[1]) for r in results]
        # Compute Stability
        res["StabilityValue"] = evalUtilities.stability(res["foldStat"]["CA"])
    else:
        # compute Q2
        res["Q2"] = evalUtilities.calcRsqrt(exp_pred)
        # compute RMSE
        res["RMSE"] = evalUtilities.calcRMSE(exp_pred)
        # Compute foldStat
        res["foldStat"]["nTrainCmpds"] = [n for n in nTrainCmpds]
        res["foldStat"]["nTestCmpds"] = [n for n in nTestCmpds]
        res["foldStat"]["RMSE"] = [r[0] for r in results]
        res["foldStat"]["Q2"] = [r[1] for r in results]
        # Compute Stability value
        res["StabilityValue"] = evalUtilities.stability(res["foldStat"]["Q2"])

    # Save selectedMLs if passed
    if foldSelectedML:
        res["foldStat"]["foldSelectedML"] = [ml for ml in foldSelectedML]

    # Evaluate stability of ML
    StabilityValue = res["StabilityValue"]
    if StabilityValue is not None:
        if responseType == "Classification":
            if statc.mean(res["foldStat"]["nTestCmpds"]) > 50:
                stableTH = AZOC.QSARSTABILITYTHRESHOLD_CLASS_L
            else:
                stableTH = AZOC.QSARSTABILITYTHRESHOLD_CLASS_H
        else:
            if statc.mean(res["foldStat"]["nTestCmpds"]) > 50:
                stableTH = AZOC.QSARSTABILITYTHRESHOLD_REG_L
            else:
                stableTH = AZOC.QSARSTABILITYTHRESHOLD_REG_H
        if StabilityValue < stableTH:  # Select only stable models
            res["stable"] = True

    return res
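# The stability-threshold rule above appears four times in this section (in
# selectModel, in both createStatObj variants and in getAcc). A hypothetical
# helper capturing it once; the function name is an assumption, only the AZOC
# constants come from the original code:
def pickStabilityThreshold(responseType, meanNTestCmpds):
    # Per the _L/_H suffixes, folds with > 50 test compounds presumably get
    # the lower (stricter) threshold.
    if responseType == "Classification":
        if meanNTestCmpds > 50:
            return AZOC.QSARSTABILITYTHRESHOLD_CLASS_L
        return AZOC.QSARSTABILITYTHRESHOLD_CLASS_H
    else:
        if meanNTestCmpds > 50:
            return AZOC.QSARSTABILITYTHRESHOLD_REG_L
        return AZOC.QSARSTABILITYTHRESHOLD_REG_H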