def createStatObj(self, results=None, exp_pred=None, responseType=None, nExtFolds=None):
    # Initialize res (statObj) for statistic results
    res = {}
    # Classification
    res["CA"] = None
    res["CM"] = None
    res["MCC"] = None
    # Regression
    res["R2"] = None
    res["RMSE"] = None
    # Both
    res["StabilityValue"] = None
    res["foldStat"] = {
        # Regression
        "R2": None,
        "RMSE": None,
        # Classification
        "CM": None,
        "CA": None,
        "MCC": None}
    if results is None or exp_pred is None or responseType is None or nExtFolds is None:
        return res
    # Calculate the (R2, RMSE) or (CM, CA) results depending on Classification or Regression
    if responseType == "Classification":
        # Compute CA averaged over the external folds
        res["CA"] = sum(r[0] for r in results) / nExtFolds
        # Compute CM: copy the first ConfMat so the per-fold matrices kept in foldStat are not modified in place
        res["CM"] = copy.deepcopy(results[0][1])
        for r in results[1:]:
            for Lidx, line in enumerate(r[1]):
                for idx, val in enumerate(line):
                    res["CM"][Lidx][idx] = res["CM"][Lidx][idx] + val  # Add each same ConfMat position
        # Compute MCC
        res["MCC"] = evalUtilities.calcMCC(res["CM"])
        # Compute foldStat
        res["foldStat"]["CA"] = [r[0] for r in results]
        res["foldStat"]["CM"] = [r[1] for r in results]
        res["foldStat"]["MCC"] = [evalUtilities.calcMCC(r[1]) for r in results]
        # Compute Stability
        res["StabilityValue"] = evalUtilities.stability(res["foldStat"]["CA"])
    else:
        # Compute R2
        res["R2"] = evalUtilities.calcRsqrt(exp_pred)
        # Compute RMSE
        res["RMSE"] = evalUtilities.calcRMSE(exp_pred)
        # Compute foldStat
        res["foldStat"]["RMSE"] = [r[0] for r in results]
        res["foldStat"]["R2"] = [r[1] for r in results]
        # Compute Stability
        res["StabilityValue"] = evalUtilities.stability(res["foldStat"]["R2"])
    return res
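# Illustrative sketch (not part of the original module): how createStatObj
# aggregates per-fold classification results. The fold values below are made up.
#
#     folds = [(0.80, [[40, 10], [5, 45]]),   # (CA, ConfMat) of fold 0
#              (0.90, [[45,  5], [4, 46]])]   # (CA, ConfMat) of fold 1
#     CA = sum(r[0] for r in folds) / 2       # -> 0.85
#     CM = copy.deepcopy(folds[0][1])
#     for ca, cm in folds[1:]:
#         for i, line in enumerate(cm):
#             for j, val in enumerate(line):
#                 CM[i][j] += val             # -> [[85, 15], [9, 91]]
#     # res["MCC"] is then computed from the pooled CM via evalUtilities.calcMCC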
def getProbabilitiesAsAttribute(self, algorithm=None, minsup=None, atts=None):
    """ For regression problems, it returns the RMSE and the Q2
        For Classification problems, it returns CA and the ConfMat
        The return is made in a Dict: {"RMSE":0.2,"Q2":0.1,"CA":0.98,"CM":[[TP, FP],[FN,TN]]}
        For the EvalResults not supported for a specific learner/dataset, the respective result will be None

        If the learner is a dict {"LearnerName":learner, ...} the results will be a dict with results
        for all Learners and for a consensus made out of those that were stable

        If some error occurred, the respective values in the Dict will be None

        Parameters:
            algorithm - key for the structural feature generation algorithm (set dependent structural
                        features that have to be calculated inside the cross-validation)
            minsup    - minimum support for the algorithm
            atts      - attributes to be removed before learning (e.g. meta etc...)
    """
    self.__log("Starting Calculating MLStatistics")
    statistics = {}
    if not self.__areInputsOK():
        return None
    if algorithm:
        self.__log(" Additional features to be calculated inside of cross-validation")
        self.__log(" Algorithm for structural features: " + str(algorithm))
        self.__log(" Minimum support parameter: " + str(minsup))

    # Set the response type
    self.responseType = (
        self.data.domain.classVar.varType == orange.VarTypes.Discrete and "Classification" or "Regression"
    )
    self.__log(" " + str(self.responseType))

    # Create the Train and test sets
    DataIdxs = dataUtilities.SeedDataSampler(self.data, self.nExtFolds)

    # Vars for saving each fold's results
    optAcc = {}
    results = {}
    exp_pred = {}
    nTrainEx = {}
    nTestEx = {}

    # Set a dict of learners
    MLmethods = {}
    if type(self.learner) == dict:
        for ml in self.learner:
            MLmethods[ml] = self.learner[ml]
    else:
        MLmethods[self.learner.name] = self.learner

    models = {}
    rocs = {}
    self.__log("Calculating Statistics for MLmethods:")
    self.__log(" " + str([x for x in MLmethods]))

    # Check the data in advance so that, by chance, it will not fail at the last fold!
    for foldN in range(self.nExtFolds):
        trainData = self.data.select(DataIdxs[foldN], negate=1)
        self.__checkTrainData(trainData)  # Optional!!
# Order Learners so that PLS is the first sortedML = [ml for ml in MLmethods] if "PLS" in sortedML: sortedML.remove("PLS") sortedML.insert(0, "PLS") for ml in sortedML: self.__log(" > " + str(ml) + "...") try: # Var for saving each Fols result results[ml] = [] exp_pred[ml] = [] models[ml] = [] rocs[ml] = [] nTrainEx[ml] = [] nTestEx[ml] = [] optAcc[ml] = [] ### mods TG prediction_attribute = orange.FloatVariable("class_prob") domain = [data.domain.attributes, prediction_attribute, data.domain.classvar] data_new = orange.ExampleTable(domain) logTxt = "" for foldN in range(self.nExtFolds): if type(self.learner) == dict: self.paramList = None trainData = self.data.select(DataIdxs[foldN], negate=1) orig_len = len(trainData.domain.attributes) # add structural descriptors to the training data (TG) if algorithm: trainData_structDesc = getStructuralDesc.getStructuralDescResult(trainData, algorithm, minsup) trainData = dataUtilities.attributeDeselectionData(trainData_structDesc, atts) testData = self.data.select(DataIdxs[foldN]) # print "IDX: ", # print DataIdxs[foldN] # calculate the feature values for the test data (TG) if algorithm: cut_off = orig_len - len(atts) smarts = trainData.domain.attributes[cut_off:] self.__log(" Number of structural features added: " + str(len(smarts))) testData_structDesc = getStructuralDesc.getSMARTSrecalcDesc(testData, smarts) testData = dataUtilities.attributeDeselectionData(testData_structDesc, atts) nTrainEx[ml].append(len(trainData)) nTestEx[ml].append(len(testData)) # Test if trainsets inside optimizer will respect dataSize criterias. # if not, don't optimize, but still train the model dontOptimize = False if self.responseType != "Classification" and (len(trainData) * (1 - 1.0 / self.nInnerFolds) < 20): dontOptimize = True else: tmpDataIdxs = dataUtilities.SeedDataSampler(trainData, self.nInnerFolds) tmpTrainData = trainData.select(tmpDataIdxs[0], negate=1) if not self.__checkTrainData(tmpTrainData, False): dontOptimize = True if dontOptimize: logTxt += ( " Fold " + str(foldN) + ": Too few compounds to optimize model hyper-parameters\n" ) self.__log(logTxt) if trainData.domain.classVar.varType == orange.VarTypes.Discrete: res = orngTest.crossValidation( [MLmethods[ml]], trainData, folds=5, strat=orange.MakeRandomIndices.StratifiedIfPossible, randomGenerator=random.randint(0, 100), ) CA = evalUtilities.CA(res)[0] optAcc[ml].append(CA) else: res = orngTest.crossValidation( [MLmethods[ml]], trainData, folds=5, strat=orange.MakeRandomIndices.StratifiedIfPossible, randomGenerator=random.randint(0, 100), ) R2 = evalUtilities.R2(res)[0] optAcc[ml].append(R2) else: runPath = miscUtilities.createScratchDir( baseDir=AZOC.NFS_SCRATCHDIR, desc="AccWOptParam", seed=id(trainData) ) trainData.save(os.path.join(runPath, "trainData.tab")) tunedPars = paramOptUtilities.getOptParam( learner=MLmethods[ml], trainDataFile=os.path.join(runPath, "trainData.tab"), paramList=self.paramList, useGrid=False, verbose=self.verbose, queueType=self.queueType, runPath=runPath, nExtFolds=None, nFolds=self.nInnerFolds, logFile=self.logFile, getTunedPars=True, ) if not MLmethods[ml] or not MLmethods[ml].optimized: self.__log( " WARNING: GETACCWOPTPARAM: The learner " + str(ml) + " was not optimized." 
) self.__log(" It will be ignored") # self.__log(" It will be set to default parameters") self.__log(" DEBUG can be done in: " + runPath) # Set learner back to default # MLmethods[ml] = MLmethods[ml].__class__() raise Exception("The learner " + str(ml) + " was not optimized.") else: if trainData.domain.classVar.varType == orange.VarTypes.Discrete: optAcc[ml].append(tunedPars[0]) else: res = orngTest.crossValidation( [MLmethods[ml]], trainData, folds=5, strat=orange.MakeRandomIndices.StratifiedIfPossible, randomGenerator=random.randint(0, 100), ) R2 = evalUtilities.R2(res)[0] optAcc[ml].append(R2) miscUtilities.removeDir(runPath) # Train the model model = MLmethods[ml](trainData) models[ml].append(model) # Test the model if self.responseType == "Classification": results[ml].append( ( evalUtilities.getClassificationAccuracy(testData, model), evalUtilities.getConfMat(testData, model), ) ) roc = self.aroc(testData, [model]) rocs[ml].append(roc) # save the prediction probabilities else: local_exp_pred = [] for ex in testData: local_exp_pred.append((ex.getclass(), model(ex))) results[ml].append( (evalUtilities.calcRMSE(local_exp_pred), evalUtilities.calcRsqrt(local_exp_pred)) ) # Save the experimental value and correspondent predicted value exp_pred[ml] += local_exp_pred res = self.createStatObj( results[ml], exp_pred[ml], nTrainEx[ml], nTestEx[ml], self.responseType, self.nExtFolds, logTxt, rocs[ml], ) if self.verbose > 0: print "UnbiasedAccuracyGetter!Results " + ml + ":\n" pprint(res) if not res: raise Exception("No results available!") statistics[ml] = copy.deepcopy(res) self.__writeResults(statistics) self.__log(" OK") except: self.__log(" Learner " + str(ml) + " failed to create/optimize the model!") res = self.createStatObj() statistics[ml] = copy.deepcopy(res) self.__writeResults(statistics) if not statistics or len(statistics) < 1: self.__log("ERROR: No statistics to return!") return None elif len(statistics) > 1: # We still need to build a consensus model out of the stable models # ONLY if there are more that one model stable! # When only one or no stable models, build a consensus based on all models consensusMLs = {} for modelName in statistics: StabilityValue = statistics[modelName]["StabilityValue"] if StabilityValue is not None and statistics[modelName]["stable"]: consensusMLs[modelName] = copy.deepcopy(statistics[modelName]) self.__log( "Found " + str(len(consensusMLs)) + " stable MLmethods out of " + str(len(statistics)) + " MLmethods." ) if len(consensusMLs) <= 1: # we need more models to build a consensus! 
consensusMLs = {} for modelName in statistics: consensusMLs[modelName] = copy.deepcopy(statistics[modelName]) if len(consensusMLs) >= 2: # Var for saving each Fols result Cresults = [] Cexp_pred = [] CnTrainEx = [] CnTestEx = [] self.__log( "Calculating the statistics for a Consensus model based on " + str([ml for ml in consensusMLs]) ) for foldN in range(self.nExtFolds): if self.responseType == "Classification": CLASS0 = str(self.data.domain.classVar.values[0]) CLASS1 = str(self.data.domain.classVar.values[1]) exprTest0 = "(0" for ml in consensusMLs: exprTest0 += "+( " + ml + " == " + CLASS0 + " )*" + str(optAcc[ml][foldN]) + " " exprTest0 += ")/IF0(sum([False" for ml in consensusMLs: exprTest0 += ", " + ml + " == " + CLASS0 + " " exprTest0 += "]),1)" exprTest1 = exprTest0.replace(CLASS0, CLASS1) expression = [exprTest0 + " >= " + exprTest1 + " -> " + CLASS0, " -> " + CLASS1] else: Q2sum = sum([optAcc[ml][foldN] for ml in consensusMLs]) expression = "(1 / " + str(Q2sum) + ") * (0" for ml in consensusMLs: expression += " + " + str(optAcc[ml][foldN]) + " * " + ml + " " expression += ")" testData = self.data.select(DataIdxs[foldN]) CnTestEx.append(len(testData)) consensusClassifiers = {} for learnerName in consensusMLs: consensusClassifiers[learnerName] = models[learnerName][foldN] model = AZorngConsensus.ConsensusClassifier(classifiers=consensusClassifiers, expression=expression) CnTrainEx.append(model.NTrainEx) # Test the model if self.responseType == "Classification": Cresults.append( ( evalUtilities.getClassificationAccuracy(testData, model), evalUtilities.getConfMat(testData, model), ) ) else: local_exp_pred = [] for ex in testData: local_exp_pred.append((ex.getclass(), model(ex))) Cresults.append( (evalUtilities.calcRMSE(local_exp_pred), evalUtilities.calcRsqrt(local_exp_pred)) ) # Save the experimental value and correspondent predicted value Cexp_pred += local_exp_pred res = self.createStatObj(Cresults, Cexp_pred, CnTrainEx, CnTestEx, self.responseType, self.nExtFolds) statistics["Consensus"] = copy.deepcopy(res) statistics["Consensus"]["IndividualStatistics"] = copy.deepcopy(consensusMLs) self.__writeResults(statistics) self.__log("Returned multiple ML methods statistics.") return statistics # By default return the only existing statistics! self.__writeResults(statistics) self.__log("Returned only one ML method statistics.") return statistics[statistics.keys()[0]]
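# Illustrative sketch (not part of the original module): the shape of the consensus
# "expression" strings built above, assuming two hypothetical learners named RF and
# PLS with made-up per-fold accuracies (optAcc) of 0.81 and 0.77 and class values POS/NEG.
#
#   Classification (accuracy-weighted vote per class, normalized by the number of
#   learners predicting that class):
#     exprTest0 = "(0+( RF == POS )*0.81 +( PLS == POS )*0.77 )/IF0(sum([False, RF == POS , PLS == POS ]),1)"
#     expression = [exprTest0 + " >= " + exprTest1 + " -> POS", " -> NEG"]
#
#   Regression (Q2-weighted average of the individual predictions, Q2sum = 1.58):
#     expression = "(1 / 1.58) * (0 + 0.81 * RF + 0.77 * PLS )"
#
# The strings are later interpreted by AZorngConsensus.ConsensusClassifier.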
def getAcc(self, callBack=None, algorithm=None, params=None, atts=None, holdout=None): """ For regression problems, it returns the RMSE and the Q2 For Classification problems, it returns CA and the ConfMat The return is made in a Dict: {"RMSE":0.2,"Q2":0.1,"CA":0.98,"CM":[[TP, FP],[FN,TN]]} For the EvalResults not supported for a specific learner/datase, the respective result will be None if the learner is a dict {"LearnerName":learner, ...} the results will be a dict with results for all Learners and for a consensus made out of those that were stable It some error occurred, the respective values in the Dict will be None parameters: algorithm - list of feature generation algorithms (set dependent features that have to be calculated inside the crossvalidation) params - dictionary of parameters atts - attributes to be removed before learning (e.g. meta etc...) """ self.__log("Starting Calculating MLStatistics") statistics = {} if not self.__areInputsOK(): return None if holdout: self.nExtFolds = 1 if algorithm: self.__log(" Additional features to be calculated inside of cross-validation") for i in algorithm: self.__log(" Algorithm: " + str(i)) for j, v in params.iteritems(): self.__log(" Parameter: " + str(j) + " = " + str(v)) # Set the response type self.responseType = ( self.data.domain.classVar.varType == orange.VarTypes.Discrete and "Classification" or "Regression" ) self.__log(" " + str(self.responseType)) # Create the Train and test sets DataIdxs = None if holdout: self.__log("Using hold out evaluation with " + str(holdout) + "*100 % of data for training") DataIdxs = dataUtilities.SeedDataSampler_holdOut(self.data, holdout) else: DataIdxs = dataUtilities.SeedDataSampler(self.data, self.nExtFolds) # Var for saving each Fols result optAcc = {} results = {} exp_pred = {} nTrainEx = {} nTestEx = {} # Set a dict of learners MLmethods = {} if type(self.learner) == dict: for ml in self.learner: MLmethods[ml] = self.learner[ml] else: MLmethods[self.learner.name] = self.learner models = {} rocs = {} self.__log("Calculating Statistics for MLmethods:") self.__log(" " + str([x for x in MLmethods])) # Check data in advance so that, by chance, it will not fail at the last fold! for foldN in range(self.nExtFolds): trainData = self.data.select(DataIdxs[foldN], negate=1) self.__checkTrainData(trainData) # Optional!! # Order Learners so that PLS is the first sortedML = [ml for ml in MLmethods] if "PLS" in sortedML: sortedML.remove("PLS") sortedML.insert(0, "PLS") stepsDone = 0 nTotalSteps = len(sortedML) * self.nExtFolds for ml in sortedML: self.__log(" > " + str(ml) + "...") try: # Var for saving each Fols result results[ml] = [] exp_pred[ml] = [] models[ml] = [] rocs[ml] = [] nTrainEx[ml] = [] nTestEx[ml] = [] optAcc[ml] = [] logTxt = "" for foldN in range(self.nExtFolds): if type(self.learner) == dict: self.paramList = None trainData = self.data.select(DataIdxs[foldN], negate=1) orig_len = len(trainData.domain.attributes) refs = None methods = [ "rdk_MACCS_keys", "rdk_topo_fps", "rdk_morgan_fps", "rdk_morgan_features_fps", "rdk_atompair_fps", ] train_domain = None # add structural descriptors to the training data (TG) if algorithm: for i in range(len(algorithm)): if algorithm[i] == "structClust": self.__log("Algorithm " + str(i) + ": " + str(algorithm[i])) actData = orange.ExampleTable(trainData.domain) for d in trainData: # only valid for simboosted qsar paper experiments!? 
if d.getclass() == "2": actData.append(d) refs = structuralClustering.getReferenceStructures( actData, threshold=params["threshold"], minClusterSize=params["minClusterSize"], numThreads=2, ) self.__log( " found " + str(len(refs)) + " reference structures in " + str(len(actData)) + " active structures" ) orig_len = orig_len + (len(refs) * len(methods)) trainData_sim = SimBoostedQSAR.getSimDescriptors(refs, trainData, methods) if i == (len(algorithm) - 1): trainData = dataUtilities.attributeDeselectionData(trainData_sim, atts) else: trainData = dataUtilities.attributeDeselectionData(trainData_sim, []) elif algorithm[i] == "ECFP": self.__log("Algorithm " + str(i) + ": " + str(algorithm[i])) trainData_ecfp = getCinfonyDesc.getCinfonyDescResults(trainData, ["rdk.FingerPrints"]) train_domain = trainData_ecfp.domain if i == (len(algorithm) - 1): trainData = dataUtilities.attributeDeselectionData(trainData_ecfp, atts) else: trainData = dataUtilities.attributeDeselectionData(trainData_ecfp, []) else: self.__log("Algorithm " + str(i) + ": " + str(algorithm[i])) trainData_structDesc = getStructuralDesc.getStructuralDescResult( trainData, algorithm[i], params["minsup"] ) if i == (len(algorithm) - 1): trainData = dataUtilities.attributeDeselectionData(trainData_structDesc, atts) else: trainData = dataUtilities.attributeDeselectionData(trainData_structDesc, []) # trainData.save("/home/girschic/proj/AZ/ProjDev/train.tab") testData = self.data.select(DataIdxs[foldN]) # calculate the feature values for the test data (TG) if algorithm: for i in range(len(algorithm)): if algorithm[i] == "structClust": self.__log(str(algorithm[i])) testData_sim = SimBoostedQSAR.getSimDescriptors(refs, testData, methods) if i == (len(algorithm) - 1): testData = dataUtilities.attributeDeselectionData(testData_sim, atts) else: testData = dataUtilities.attributeDeselectionData(testData_sim, []) elif algorithm[i] == "ECFP": self.__log(str(algorithm[i])) # testData_ecfp = orange.ExampleTable(train_domain) tmp_dat = [] for d in testData: tmp = getCinfonyDesc.getRdkFPforTestInstance(train_domain, d) tmp_dat.append(tmp) testData_ecfp = orange.ExampleTable(tmp_dat[0].domain, tmp_dat) if i == (len(algorithm) - 1): # print "removing atts" testData = dataUtilities.attributeDeselectionData(testData_ecfp, atts) else: # print "removing no atts" testData = dataUtilities.attributeDeselectionData(testData_ecfp, []) else: cut_off = orig_len - len(atts) smarts = trainData.domain.attributes[cut_off:] self.__log(" Number of structural features added: " + str(len(smarts))) testData_structDesc = getStructuralDesc.getSMARTSrecalcDesc(testData, smarts) if i == (len(algorithm) - 1): testData = dataUtilities.attributeDeselectionData(testData_structDesc, atts) else: testData = dataUtilities.attributeDeselectionData(testData_structDesc, []) # testData.save("/home/girschic/proj/AZ/ProjDev/test.tab") nTrainEx[ml].append(len(trainData)) nTestEx[ml].append(len(testData)) # Test if trainsets inside optimizer will respect dataSize criterias. 
# if not, don't optimize, but still train the model dontOptimize = False if self.responseType != "Classification" and (len(trainData) * (1 - 1.0 / self.nInnerFolds) < 20): dontOptimize = True else: tmpDataIdxs = dataUtilities.SeedDataSampler(trainData, self.nInnerFolds) tmpTrainData = trainData.select(tmpDataIdxs[0], negate=1) if not self.__checkTrainData(tmpTrainData, False): dontOptimize = True if dontOptimize: logTxt += ( " Fold " + str(foldN) + ": Too few compounds to optimize model hyper-parameters\n" ) self.__log(logTxt) if trainData.domain.classVar.varType == orange.VarTypes.Discrete: res = orngTest.crossValidation( [MLmethods[ml]], trainData, folds=5, strat=orange.MakeRandomIndices.StratifiedIfPossible, randomGenerator=random.randint(0, 100), ) CA = evalUtilities.CA(res)[0] optAcc[ml].append(CA) else: res = orngTest.crossValidation( [MLmethods[ml]], trainData, folds=5, strat=orange.MakeRandomIndices.StratifiedIfPossible, randomGenerator=random.randint(0, 100), ) R2 = evalUtilities.R2(res)[0] optAcc[ml].append(R2) else: runPath = miscUtilities.createScratchDir( baseDir=AZOC.NFS_SCRATCHDIR, desc="AccWOptParam", seed=id(trainData) ) # self.__log(" run path:"+str(runPath)) trainData.save(os.path.join(runPath, "trainData.tab")) tunedPars = paramOptUtilities.getOptParam( learner=MLmethods[ml], trainDataFile=os.path.join(runPath, "trainData.tab"), paramList=self.paramList, useGrid=False, verbose=self.verbose, queueType=self.queueType, runPath=runPath, nExtFolds=None, nFolds=self.nInnerFolds, logFile=self.logFile, getTunedPars=True, ) if not MLmethods[ml] or not MLmethods[ml].optimized: self.__log( " WARNING: GETACCWOPTPARAM: The learner " + str(ml) + " was not optimized." ) self.__log(" It will be ignored") # self.__log(" It will be set to default parameters") self.__log(" DEBUG can be done in: " + runPath) # Set learner back to default # MLmethods[ml] = MLmethods[ml].__class__() raise Exception("The learner " + str(ml) + " was not optimized.") else: if trainData.domain.classVar.varType == orange.VarTypes.Discrete: optAcc[ml].append(tunedPars[0]) else: res = orngTest.crossValidation( [MLmethods[ml]], trainData, folds=5, strat=orange.MakeRandomIndices.StratifiedIfPossible, randomGenerator=random.randint(0, 100), ) R2 = evalUtilities.R2(res)[0] optAcc[ml].append(R2) miscUtilities.removeDir(runPath) # Train the model model = MLmethods[ml](trainData) models[ml].append(model) # Test the model if self.responseType == "Classification": results[ml].append( ( evalUtilities.getClassificationAccuracy(testData, model), evalUtilities.getConfMat(testData, model), ) ) roc = self.aroc(testData, [model]) rocs[ml].append(roc) else: local_exp_pred = [] for ex in testData: local_exp_pred.append((ex.getclass(), model(ex))) results[ml].append( (evalUtilities.calcRMSE(local_exp_pred), evalUtilities.calcRsqrt(local_exp_pred)) ) # Save the experimental value and correspondent predicted value exp_pred[ml] += local_exp_pred if callBack: stepsDone += 1 if not callBack((100 * stepsDone) / nTotalSteps): return None res = self.createStatObj( results[ml], exp_pred[ml], nTrainEx[ml], nTestEx[ml], self.responseType, self.nExtFolds, logTxt, rocs[ml], ) if self.verbose > 0: print "UnbiasedAccuracyGetter!Results " + ml + ":\n" pprint(res) if not res: raise Exception("No results available!") statistics[ml] = copy.deepcopy(res) self.__writeResults(statistics) self.__log(" OK") except: print "Unexpected error:", print sys.exc_info()[0] print sys.exc_info()[1] self.__log(" Learner " + str(ml) + " failed to create/optimize the 
model!") res = self.createStatObj( results[ml], exp_pred[ml], nTrainEx[ml], nTestEx[ml], self.responseType, self.nExtFolds, logTxt, rocs[ml], ) statistics[ml] = copy.deepcopy(res) self.__writeResults(statistics) if not statistics or len(statistics) < 1: self.__log("ERROR: No statistics to return!") return None elif len(statistics) > 1: # We still need to build a consensus model out of the stable models # ONLY if there are more that one model stable! # When only one or no stable models, build a consensus based on all models consensusMLs = {} for modelName in statistics: StabilityValue = statistics[modelName]["StabilityValue"] if StabilityValue is not None and statistics[modelName]["stable"]: consensusMLs[modelName] = copy.deepcopy(statistics[modelName]) self.__log( "Found " + str(len(consensusMLs)) + " stable MLmethods out of " + str(len(statistics)) + " MLmethods." ) if len(consensusMLs) <= 1: # we need more models to build a consensus! consensusMLs = {} for modelName in statistics: consensusMLs[modelName] = copy.deepcopy(statistics[modelName]) if len(consensusMLs) >= 2: # Var for saving each Fols result Cresults = [] Cexp_pred = [] CnTrainEx = [] CnTestEx = [] self.__log( "Calculating the statistics for a Consensus model based on " + str([ml for ml in consensusMLs]) ) for foldN in range(self.nExtFolds): if self.responseType == "Classification": CLASS0 = str(self.data.domain.classVar.values[0]) CLASS1 = str(self.data.domain.classVar.values[1]) exprTest0 = "(0" for ml in consensusMLs: exprTest0 += "+( " + ml + " == " + CLASS0 + " )*" + str(optAcc[ml][foldN]) + " " exprTest0 += ")/IF0(sum([False" for ml in consensusMLs: exprTest0 += ", " + ml + " == " + CLASS0 + " " exprTest0 += "]),1)" exprTest1 = exprTest0.replace(CLASS0, CLASS1) expression = [exprTest0 + " >= " + exprTest1 + " -> " + CLASS0, " -> " + CLASS1] else: Q2sum = sum([optAcc[ml][foldN] for ml in consensusMLs]) expression = "(1 / " + str(Q2sum) + ") * (0" for ml in consensusMLs: expression += " + " + str(optAcc[ml][foldN]) + " * " + ml + " " expression += ")" testData = self.data.select(DataIdxs[foldN]) CnTestEx.append(len(testData)) consensusClassifiers = {} for learnerName in consensusMLs: consensusClassifiers[learnerName] = models[learnerName][foldN] model = AZorngConsensus.ConsensusClassifier(classifiers=consensusClassifiers, expression=expression) CnTrainEx.append(model.NTrainEx) # Test the model if self.responseType == "Classification": Cresults.append( ( evalUtilities.getClassificationAccuracy(testData, model), evalUtilities.getConfMat(testData, model), ) ) else: local_exp_pred = [] for ex in testData: local_exp_pred.append((ex.getclass(), model(ex))) Cresults.append( (evalUtilities.calcRMSE(local_exp_pred), evalUtilities.calcRsqrt(local_exp_pred)) ) # Save the experimental value and correspondent predicted value Cexp_pred += local_exp_pred res = self.createStatObj(Cresults, Cexp_pred, CnTrainEx, CnTestEx, self.responseType, self.nExtFolds) statistics["Consensus"] = copy.deepcopy(res) statistics["Consensus"]["IndividualStatistics"] = copy.deepcopy(consensusMLs) self.__writeResults(statistics) self.__log("Returned multiple ML methods statistics.") return statistics # By default return the only existing statistics! self.__writeResults(statistics) self.__log("Returned only one ML method statistics.") return statistics[statistics.keys()[0]]
def createStatObj( self, results=None, exp_pred=None, nTrainCmpds=None, nTestCmpds=None, responseType=None, nExtFolds=None, userAlert="", rocs=None, ): # Initialize res (statObj) for statistic results res = {} self.__log("Starting to create Stat Obj") # Classification res["CA"] = None res["CM"] = None res["MCC"] = None res["ROC"] = None # Regression res["Q2"] = None res["RMSE"] = None # Both res["StabilityValue"] = None res["userAlert"] = userAlert res["selected"] = False res["stable"] = False res["responseType"] = False res["foldStat"] = { "nTrainCmpds": None, "nTestCmpds": None, # Regression "Q2": None, "RMSE": None, # Classification "CM": None, "CA": None, "MCC": None, "ROC": None, } if ( results is None ): # or exp_pred is None or responseType is None or nExtFolds is None or nTestCmpds is None or nTrainCmpds is None: self.__log(" NONE...") return res res["responseType"] = responseType # Calculate the (Q2, RMSE) or (CM, CA) results depending on Classification or regression if responseType == "Classification": # Compute CA res["CA"] = sum(r[0] for r in results) / nExtFolds # Compute CM res["CM"] = copy.deepcopy(results[0][1]) # Get the first ConfMat for r in results[1:]: for Lidx, line in enumerate(r[1]): for idx, val in enumerate(line): res["CM"][Lidx][idx] = res["CM"][Lidx][idx] + val # Add each same ConfMat position # Compute MCC res["MCC"] = evalUtilities.calcMCC(res["CM"]) # Compute ROC res["ROC"] = sum(ro[0] for ro in rocs) / self.nExtFolds # Compute foldStat res["foldStat"]["nTrainCmpds"] = [n for n in nTrainCmpds] res["foldStat"]["nTestCmpds"] = [n for n in nTestCmpds] res["foldStat"]["CA"] = [r[0] for r in results] res["foldStat"]["CM"] = [r[1] for r in results] res["foldStat"]["MCC"] = [evalUtilities.calcMCC(r[1]) for r in results] res["foldStat"]["ROC"] = [ro for ro in rocs] # Compute Stability res["StabilityValue"] = evalUtilities.stability(res["foldStat"]["CA"]) else: # compute Q2 res["Q2"] = evalUtilities.calcRsqrt(exp_pred) # compute RMSE res["RMSE"] = evalUtilities.calcRMSE(exp_pred) # Compute foldStat res["foldStat"]["nTrainCmpds"] = [n for n in nTrainCmpds] res["foldStat"]["nTestCmpds"] = [n for n in nTestCmpds] res["foldStat"]["RMSE"] = [r[0] for r in results] res["foldStat"]["Q2"] = [r[1] for r in results] # Compute Stability value res["StabilityValue"] = evalUtilities.stability(res["foldStat"]["Q2"]) # Evaluate stability of ML StabilityValue = res["StabilityValue"] if StabilityValue is not None: if responseType == "Classification": if statc.mean(res["foldStat"]["nTestCmpds"]) > 50: stableTH = AZOC.QSARSTABILITYTHRESHOLD_CLASS_L else: stableTH = AZOC.QSARSTABILITYTHRESHOLD_CLASS_H else: if statc.mean(res["foldStat"]["nTestCmpds"]) > 50: stableTH = AZOC.QSARSTABILITYTHRESHOLD_REG_L else: stableTH = AZOC.QSARSTABILITYTHRESHOLD_REG_H if StabilityValue < stableTH: # Select only stable models res["stable"] = True return res
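# Illustrative sketch (not part of the original module): the stability check at the
# end of createStatObj, condensed. The threshold constants come from AZOrangeConfig
# (AZOC) and their values are not reproduced here.
#
#     meanTest = statc.mean(res["foldStat"]["nTestCmpds"])
#     if responseType == "Classification":
#         stableTH = AZOC.QSARSTABILITYTHRESHOLD_CLASS_L if meanTest > 50 else AZOC.QSARSTABILITYTHRESHOLD_CLASS_H
#     else:
#         stableTH = AZOC.QSARSTABILITYTHRESHOLD_REG_L if meanTest > 50 else AZOC.QSARSTABILITYTHRESHOLD_REG_H
#     res["stable"] = res["StabilityValue"] < stableTH   # lower fold-to-fold variation => stable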
def getStatistics( dataset, runningDir, resultsFile, mlList=[ml for ml in MLMETHODS if AZOC.MLMETHODS[ml]["useByDefault"]], queueType="NoSGE", verbose=0, getAllModels=False, callBack=None): """ runningDir (An existing dir for creating one job dir per fold) | +---- status (The overall status: "started", "finished" or the progress "1/10", "2/10", ...) | +---- fold_1 | +---- fold_2 | . . . The running will be monitorized by this method. Whenever a MLMethod fails the respective fold job is restarted """ if dataset.domain.classVar.varType == orange.VarTypes.Discrete: responseType = "Classification" else: responseType = "Regression" #Create the Train and test sets DataIdxs = dataUtilities.SeedDataSampler(dataset, AZOC.QSARNEXTFOLDS) #Check data in advance so that, by chance, it will not faill at the last fold! #for foldN in range(AZOC.QSARNEXTFOLDS): #trainData = dataset.select(DataIdxs,foldN,negate=1) #checkTrainData(trainData) jobs = {} thisDir = os.getcwd() os.chdir(runningDir) #PID = os.getpid() #print "Started getStatistics in Process with PID: "+str(PID) #os.system('echo "'+str(PID)+'" > '+os.path.join(runningDir,"PID")) os.system('echo "started" > ' + os.path.join(runningDir, "status")) # Start all Fold jobs stepsDone = 0 nTotalSteps = AZOC.QSARNEXTFOLDS for fold in range(AZOC.QSARNEXTFOLDS): job = str(fold) print "Starting job for fold ", job trainData = dataset.select(DataIdxs, fold, negate=1) jobs[job] = { "job": job, "path": os.path.join(runningDir, "fold_" + job), "running": False, "failed": False, "finished": False } # Uncomment next 3 lines for running in finished jobs dirs #st, jID = commands.getstatusoutput("cat "+os.path.join(runningDir, "fold_"+job,"jID")) #jobs[job]["jID"] = jID #continue os.system("rm -rf " + jobs[job]["path"]) os.system("mkdir -p " + jobs[job]["path"]) trainData.save(os.path.join(jobs[job]["path"], "trainData.tab")) file_h = open(os.path.join(jobs[job]["path"], "run.sh"), "w") file_h.write("#!/bin/tcsh\n") file_h.write( "source " + os.path.join(os.environ["AZORANGEHOME"], "templateProfile") + "\n") file_h.write("python " + os.path.join(jobs[job]["path"], "QsubScript.py") + "\n") file_h.close() file_h = open(os.path.join(jobs[job]["path"], "QsubScript.py"), "w") file_h.write("import os\n") file_h.write("from AZutilities import dataUtilities\n") file_h.write("from AZutilities import competitiveWorkflow\n") file_h.write("data = dataUtilities.DataTable('" + os.path.join(jobs[job]["path"], "trainData.tab") + "')\n") file_h.write('os.system(\'echo "running" > ' + os.path.join(jobs[job]["path"], "status") + ' \')\n') file_h.write("models = competitiveWorkflow.getModel(data, mlList=" + str(mlList) + ", savePath = '" + os.path.join(jobs[job]["path"], "results.pkl") + "', queueType = '" + queueType + "', getAllModels = " + str(getAllModels) + ")\n") file_h.write("nModelsSaved = 0\n") file_h.write("for model in models:\n") file_h.write(" if not models[model] is None:\n") file_h.write(" models[model].write('" + os.path.join(jobs[job]["path"], "model") + "'+'_'+model)\n") file_h.write(' nModelsSaved += 1\n') file_h.write( 'if nModelsSaved == len([m for m in models if not models[m] is None ]):\n' ) file_h.write(' os.system(\'echo "finished" > ' + os.path.join(jobs[job]["path"], "status") + ' \')\n') file_h.write('else:\n') file_h.write(' os.system(\'echo "failed" > ' + os.path.join(jobs[job]["path"], "status") + ' \')\n') file_h.close() os.chdir(os.path.join(jobs[job]["path"])) if queueType == "NoSGE": # Serial mode status, out = commands.getstatusoutput( "tcsh " + 
os.path.join(jobs[job]["path"], "run.sh")) if status: print "ERROR on Job " + str( job) + " (will be restarted latter)" print out else: statusFile = os.path.join(jobs[job]["path"], "status") if os.path.isfile(statusFile): st, status = commands.getstatusoutput("cat " + statusFile) else: print "ERROR: Missing status file" status = None if not status: print "ERROR! job " + job + " has no status!" jobs[job]["failed"] = True elif status == "failed": print "Job " + job + " failed to build all models" jobs[job]["failed"] = True elif status == "finished": jobs[job]["finished"] = True if not isJobProgressingOK(jobs[job]): print "Job " + job + " failed to build one or more models in getMLStatistics" jobs[job]["failed"] = True jobs[job]["finished"] = False if jobs[job]["failed"]: print "Job " + job + " FAILED" else: print "Finished Job " + str(job) + " with success" if callBack: stepsDone += 1 if not callBack((100 * stepsDone) / nTotalSteps): return None else: cmd = "qsub -cwd -q batch.q" + AZOC.SGE_QSUB_ARCH_OPTION_CURRENT + os.path.join( jobs[job]["path"], "run.sh") status, out = commands.getstatusoutput(cmd) if status: print "ERROR on Job " + str(job) + " (will be skipped)" print out #raise Exception("ERROR starting job for folder "+str(job)) # Your job 955801 ("template_run.sh") has been submitted jID = out.strip().split(" ")[2] print " jID: ", jID os.system('echo "' + jID + '" > ' + os.path.join(jobs[job]["path"], "jID")) jobs[job]["running"] = True jobs[job]["jID"] = jID os.chdir(runningDir) os.chdir(thisDir) finished = [] if queueType == "NoSGE": failed = [] #Report failed Jobs for job in jobs: if jobs[job]["finished"]: finished.append(job) for job in jobs: if jobs[job]["failed"]: failed.append(job) print "Successful finished Jobs: ", finished print "Failed Jobs: ", failed else: # Monitor SGE jobs untill all are finished #Monitor Fold jobs updateJobsStatus(jobs) for job in jobs: if jobs[job]["finished"]: finished.append(job) print "Jobs already finished: ", finished os.system(' echo "' + str(len(finished)) + '/' + str(AZOC.QSARNEXTFOLDS) + '" > ' + os.path.join(runningDir, "status")) while len(finished) < AZOC.QSARNEXTFOLDS: print ".", sys.stdout.flush() updateJobsStatus(jobs) for job in jobs: if jobs[job]["finished"] and job not in finished: finished.append(job) if callBack: stepsDone += 1 if not callBack((100 * stepsDone) / nTotalSteps): return None print time.asctime() + ": Finished job " + str(job) os.system(' echo "' + str(len(finished)) + '/' + str(AZOC.QSARNEXTFOLDS) + '" > ' + os.path.join(runningDir, "status")) for job in [j for j in jobs if jobs[j]["failed"]]: jobs[job] = restartJob(jobs[job]) time.sleep(5) print "All fold jobs finished!" # Gather the results print "Gathering results..." 
#Var for saving each Fols result results = {} exp_pred = {} nTrainEx = {} nTestEx = {} # Var for saving the statistics results statistics = {} mlMethods = [ml for ml in AZOC.MLMETHODS] + ["Consensus"] sortedJobs = [job for job in jobs] sortedJobs.sort(cmp=lambda x, y: int(x) > int(y) and 1 or -1) # Place for storing the selected models results results["selectedML"] = [] exp_pred["selectedML"] = [] nTrainEx["selectedML"] = [] nTestEx["selectedML"] = [] foldSelectedML = [] for ml in mlMethods: # Loop over each MLMethod try: #Var for saving each Fols result results[ml] = [] exp_pred[ml] = [] nTrainEx[ml] = [] nTestEx[ml] = [] logTxt = "" for job in sortedJobs: #loop over each fold modelPath = os.path.join(jobs[job]["path"], "model_" + ml) if not os.path.isdir(modelPath): if getAllModels: print "MLMethod " + ml + " not available in fold " + job continue resFile = os.path.join(jobs[job]["path"], "results.pkl") statFile_h = open(resFile) foldStat = pickle.load(statFile_h) statFile_h.close() #load model model = AZBaseClasses.modelRead(modelPath) #Test the model testData = dataset.select(DataIdxs, int(job)) nTrainEx[ml].append(model.NTrainEx) nTestEx[ml].append(len(testData)) if foldStat[ml]["selected"]: foldSelectedML.append(ml) nTrainEx["selectedML"].append(model.NTrainEx) nTestEx["selectedML"].append(len(testData)) if responseType == "Classification": results[ml].append( (evalUtilities.getClassificationAccuracy( testData, model), evalUtilities.getConfMat(testData, model))) if foldStat[ml]["selected"]: results["selectedML"].append(results[ml][-1]) else: local_exp_pred = [] for ex in testData: local_exp_pred.append((ex.getclass(), model(ex))) results[ml].append( (evalUtilities.calcRMSE(local_exp_pred), evalUtilities.calcRsqrt(local_exp_pred))) #Save the experimental value and correspondent predicted value exp_pred[ml] += local_exp_pred if foldStat[ml]["selected"]: results["selectedML"].append(results[ml][-1]) exp_pred["selectedML"] += local_exp_pred res = createStatObj(results[ml], exp_pred[ml], nTrainEx[ml], nTestEx[ml], responseType, len(sortedJobs), logTxt) if not res: raise Exception("No results available!") if getAllModels: statistics[ml] = copy.deepcopy(res) writeResults(statistics, resultsFile) print " OK", ml except: print "Error on MLmethod " + ml + ". It will be skipped" ml = "selectedML" res = createStatObj(results[ml], exp_pred[ml], nTrainEx[ml], nTestEx[ml], responseType, len(sortedJobs), logTxt, foldSelectedML) if not res: raise Exception("No results available!") statistics[ml] = copy.deepcopy(res) writeResults(statistics, resultsFile) os.system(' echo "finished" > ' + os.path.join(runningDir, "status")) return statistics
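# Illustrative sketch (not part of the original module): a typical call of
# getStatistics. The paths are made up for this example; the dataset is expected to
# be an Orange/dataUtilities table with a class variable.
#
#     data = dataUtilities.DataTable("/path/to/trainData.tab")
#     stats = getStatistics(data,
#                           runningDir="/scratch/qsarFolds",                 # one job dir created per fold
#                           resultsFile="/scratch/qsarFolds/results.pkl",
#                           queueType="NoSGE",                               # run folds serially instead of via SGE
#                           getAllModels=False)
#     # stats["selectedML"] holds the pooled statObj (CA/CM or RMSE/Q2) for the
#     # per-fold selected models; with getAllModels=True each ML method is also included.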
def getAcc(self, algorithm = None, minsup = None, atts = None): """ For regression problems, it returns the RMSE and the R2 For Classification problems, it returns CA and the ConfMat The return is made in a Dict: {"RMSE":0.2,"R2":0.1,"CA":0.98,"CM":[[TP, FP],[FN,TN]]} For the EvalResults not supported for a specific learner/datase, the respective result will be None if the learner is a dict {"LearnerName":learner, ...} the results will be a dict with results for all Learners and for a consensus made out of those that were stable It some error occurred, the respective values in the Dict will be None """ self.__log("Starting Calculating MLStatistics") statistics = {} if not self.__areInputsOK(): return None if (self.algorithm): self.__log(" Additional structural features to be calculated inside of cross-validation") self.__log(" Algorithm for structural features: "+str(self.algorithm)) self.__log(" Minimum support parameter: "+str(self.minsup)) # Set the response type responseType = self.data.domain.classVar.varType == orange.VarTypes.Discrete and "Classification" or "Regression" self.__log(" "+str(responseType)) #Create the Train and test sets DataIdxs = dataUtilities.SeedDataSampler(self.data, self.nExtFolds) #Var for saving each Fols result results = {} exp_pred = {} #Set a dict of learners MLmethods = {} if type(self.learner) == dict: for ml in self.learner: MLmethods[ml] = self.learner[ml] else: MLmethods[self.learner.name] = self.learner models={} self.__log("Calculating Statistics for MLmethods:") self.__log(" "+str([x for x in MLmethods])) for ml in MLmethods: self.__log(" > "+str(ml)+"...") try: #Var for saving each Fols result results[ml] = [] exp_pred[ml] = [] models[ml] = [] for foldN in range(self.nExtFolds): if type(self.learner) == dict: self.paramList = None trainData = self.data.select(DataIdxs[foldN],negate=1) orig_len = len(trainData.domain.attributes) if (self.algorithm): # add structural descriptors to the training data (TG) trainData_structDesc = getStructuralDesc.getStructuralDescResult(trainData, self.algorithm, self.minsup) trainData = dataUtilities.attributeDeselectionData(trainData_structDesc, self.atts) runPath = miscUtilities.createScratchDir(baseDir = AZOC.NFS_SCRATCHDIR, desc = "AccWOptParam") trainData.save(os.path.join(runPath,"trainData.tab")) testData = self.data.select(DataIdxs[foldN]) if (self.algorithm): # calculate the feature values for the test data (TG) cut_off = orig_len - len(self.atts) smarts = trainData.domain.attributes[cut_off:] self.__log(" Number of structural features added: "+str(len(smarts))) testData_structDesc = getStructuralDesc.getSMARTSrecalcDesc(testData,smarts) testData = dataUtilities.attributeDeselectionData(testData_structDesc, self.atts) paramOptUtilities.getOptParam( learner = MLmethods[ml], trainDataFile = os.path.join(runPath,"trainData.tab"), paramList = self.paramList, useGrid = False, verbose = self.verbose, queueType = self.queueType, runPath = runPath, nExtFolds = None, nFolds = self.nInnerFolds ) if not MLmethods[ml].optimized: self.__log(" The learner "+str(ml)+" was not optimized.") raise Exception("The learner "+str(ml)+" was not optimized.") miscUtilities.removeDir(runPath) #Train the model model = MLmethods[ml](trainData) models[ml].append(model) #Test the model if responseType == "Classification": results[ml].append((evalUtilities.getClassificationAccuracy(testData, model), evalUtilities.getConfMat(testData, model) ) ) else: local_exp_pred = [] for ex in testData: local_exp_pred.append((ex.getclass(), model(ex))) 
results[ml].append((evalUtilities.calcRMSE(local_exp_pred), evalUtilities.calcRsqrt(local_exp_pred) ) ) #Save the experimental value and correspondent predicted value exp_pred[ml] += local_exp_pred res = self.createStatObj(results[ml], exp_pred[ml], responseType, self.nExtFolds) if self.verbose > 0: print "AccWOptParamGetter!Results "+ml+":\n" pprint(res) if not res: raise Exception("No results available!") statistics[ml] = res.copy() self.__writeResults(res) self.__log(" OK") except: self.__log(" Learner "+str(ml)+" failed to optimize!") res = self.createStatObj() statistics[ml] = res.copy() if not statistics or len(statistics) < 1: self.__log("ERROR: No statistics to return!") return None elif len(statistics) > 1: #We still need to build a consensus model out of the stable models # ONLY if there are more that one model stable! stableML={} for modelName in statistics: if statistics[modelName]["StabilityValue"] < AZOC.QSARSTABILITYTHRESHOLD: # Select only stable models stableML[modelName] = statistics[modelName].copy() if len(stableML) >= 2: self.__log("Found "+str(len(stableML))+" stable MLmethods out of "+str(len(statistics))+" MLmethods.") if responseType == "Classification": CLASS0 = str(self.data.domain.classVar.values[0]) CLASS1 = str(self.data.domain.classVar.values[1]) exprTest0 = "(0" for ml in stableML: exprTest0 += "+( "+ml+" == "+CLASS0+" )*"+str(stableML[ml]["CA"])+" " exprTest0 += ")/IF0(sum([False" for ml in stableML: exprTest0 += ", "+ml+" == "+CLASS0+" " exprTest0 += "]),1)" exprTest1 = exprTest0.replace(CLASS0,CLASS1) expression = [exprTest0+" >= "+exprTest1+" -> "+CLASS0," -> "+CLASS1] else: R2sum = sum([stableML[ml]["R2"] for ml in stableML]) expression = "(1 / "+str(R2sum)+") * (0" for ml in stableML: expression += " + "+str(stableML[ml]["R2"])+" * "+ml+" " expression += ")" #Var for saving each Fols result Cresults = [] Cexp_pred = [] self.__log("Calculating the statistics for a Consensus model") for foldN in range(self.nExtFolds): testData = self.data.select(DataIdxs[foldN]) consensusClassifiers = {} for learnerName in stableML: consensusClassifiers[learnerName] = models[learnerName][foldN] model = AZorngConsensus.ConsensusClassifier(classifiers = consensusClassifiers, expression = expression) #Test the model if responseType == "Classification": Cresults.append((evalUtilities.getClassificationAccuracy(testData, model), evalUtilities.getConfMat(testData, model) ) ) else: local_exp_pred = [] for ex in testData: local_exp_pred.append((ex.getclass(), model(ex))) Cresults.append((evalUtilities.calcRMSE(local_exp_pred), evalUtilities.calcRsqrt(local_exp_pred) ) ) #Save the experimental value and correspondent predicted value Cexp_pred += local_exp_pred res = self.createStatObj(Cresults, Cexp_pred, responseType, self.nExtFolds) statistics["Consensus"] = res.copy() statistics["Consensus"]["IndividualStatistics"] = stableML.copy() self.__writeResults(statistics) self.__log("Returned multiple ML methods statistics.") return statistics #By default return the only existing statistics! self.__writeResults(statistics) self.__log("Returned only one ML method statistics.") return statistics[statistics.keys()[0]]
def createStatObj(results=None, exp_pred=None, nTrainCmpds=None, nTestCmpds=None, responseType=None, nExtFolds=None, userAlert="", foldSelectedML=None): #Initialize res (statObj) for statistic results res = {} # Classification res["CA"] = None res["CM"] = None res["MCC"] = None #Regression res["Q2"] = None res["RMSE"] = None #Both res["StabilityValue"] = None res["userAlert"] = userAlert res["selected"] = False res["stable"] = False res["responseType"] = False res["foldStat"] = { "nTrainCmpds": None, "nTestCmpds": None, #Regression "Q2": None, "RMSE": None, #Classification "CM": None, "CA": None, "MCC": None } if not results or results is None or exp_pred is None or responseType is None or nExtFolds is None or nTestCmpds is None or nTrainCmpds is None: return res res["responseType"] = responseType #Calculate the (Q2, RMSE) or (CM, CA) results depending on Classification or regression if responseType == "Classification": #Compute CA res["CA"] = sum(r[0] for r in results) / nExtFolds #Compute CM res["CM"] = copy.deepcopy(results[0][1]) # Get the first ConfMat for r in results[1:]: for Lidx, line in enumerate(r[1]): for idx, val in enumerate(line): res["CM"][Lidx][idx] = res["CM"][Lidx][ idx] + val #Add each same ConfMat position #Compute MCC res["MCC"] = evalUtilities.calcMCC(res["CM"]) #Compute foldStat res["foldStat"]["nTrainCmpds"] = [n for n in nTrainCmpds] res["foldStat"]["nTestCmpds"] = [n for n in nTestCmpds] res["foldStat"]["CA"] = [r[0] for r in results] res["foldStat"]["CM"] = [r[1] for r in results] res["foldStat"]["MCC"] = [evalUtilities.calcMCC(r[1]) for r in results] #Compute Stability res["StabilityValue"] = evalUtilities.stability(res["foldStat"]["CA"]) else: #compute Q2 res["Q2"] = evalUtilities.calcRsqrt(exp_pred) #compute RMSE res["RMSE"] = evalUtilities.calcRMSE(exp_pred) #Compute foldStat res["foldStat"]["nTrainCmpds"] = [n for n in nTrainCmpds] res["foldStat"]["nTestCmpds"] = [n for n in nTestCmpds] res["foldStat"]["RMSE"] = [r[0] for r in results] res["foldStat"]["Q2"] = [r[1] for r in results] #Compute Stability value res["StabilityValue"] = evalUtilities.stability(res["foldStat"]["Q2"]) # Save selectedMLs if passed if foldSelectedML: res["foldStat"]["foldSelectedML"] = [ml for ml in foldSelectedML] #Evaluate stability of ML StabilityValue = res["StabilityValue"] if StabilityValue is not None: if responseType == "Classification": if statc.mean(res["foldStat"]["nTestCmpds"]) > 50: stableTH = AZOC.QSARSTABILITYTHRESHOLD_CLASS_L else: stableTH = AZOC.QSARSTABILITYTHRESHOLD_CLASS_H else: if statc.mean(res["foldStat"]["nTestCmpds"]) > 50: stableTH = AZOC.QSARSTABILITYTHRESHOLD_REG_L else: stableTH = AZOC.QSARSTABILITYTHRESHOLD_REG_H if StabilityValue < stableTH: # Select only stable models res["stable"] = True return res
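# Illustrative sketch (not part of the original module): calling the module-level
# createStatObj with made-up regression results for 2 folds. Each results entry is
# the (RMSE, Q2) tuple of one fold; exp_pred pools the (experimental, predicted)
# pairs of all folds.
#
#     results  = [(0.45, 0.71), (0.52, 0.66)]
#     exp_pred = [(5.1, 4.8), (6.3, 6.6), (4.9, 5.2), (7.0, 6.4)]
#     statObj = createStatObj(results, exp_pred,
#                             nTrainCmpds=[90, 90], nTestCmpds=[10, 10],
#                             responseType="Regression", nExtFolds=2)
#     # statObj["RMSE"], statObj["Q2"]  -> pooled values computed over exp_pred
#     # statObj["foldStat"]["Q2"]       -> [0.71, 0.66]
#     # statObj["stable"]               -> True if evalUtilities.stability([0.71, 0.66])
#     #                                    is below the configured AZOC threshold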
def getAcc(self, callBack=None, callBackWithFoldModel=None):
    """ For regression problems, it returns the RMSE and the Q2
        For Classification problems, it returns CA and the ConfMat
        The return is made in a Dict: {"RMSE":0.2,"Q2":0.1,"CA":0.98,"CM":[[TP, FP],[FN,TN]]}
        For the EvalResults not supported for a specific learner/dataset, the respective result will be None

        If the learner is a dict {"LearnerName":learner, ...} the results will be a dict with results
        for all Learners and for a consensus made out of those that were stable

        If some error occurred, the respective values in the Dict will be None
    """
    self.__log("Starting Calculating MLStatistics")
    statistics = {}
    if not self.__areInputsOK():
        return None
    # Set the response type
    self.responseType = self.data.domain.classVar.varType == orange.VarTypes.Discrete and "Classification" or "Regression"
    self.__log(" " + str(self.responseType))

    # Create the Train and test sets
    if self.usePreDefFolds:
        DataIdxs = self.preDefIndices
    else:
        DataIdxs = self.sampler(self.data, self.nExtFolds)
    foldsN = [f for f in dict.fromkeys(DataIdxs) if f != 0]  # Folds used only from 1 on ... 0 is for the fixed train Bias
    nFolds = len(foldsN)
    # Fix the Indexes based on DataIdxs
    # (0s) represent the train set, (>= 1s) represent the test set folds
    if self.useVarCtrlCV:
        nShifted = [0] * nFolds
        for idx, isTest in enumerate(self.preDefIndices):
            # self.preDefIndices == 0 are to be used in TrainBias
            if not isTest:
                if DataIdxs[idx]:
                    nShifted[DataIdxs[idx]] += 1
                    DataIdxs[idx] = 0
        for idx, shift in enumerate(nShifted):
            self.__log("In fold " + str(idx) + ", " + str(shift) + " examples were shifted to the train set.")

    # Vars for saving each fold's results
    optAcc = {}
    results = {}
    exp_pred = {}
    nTrainEx = {}
    nTestEx = {}

    # Set a dict of learners
    MLmethods = {}
    if type(self.learner) == dict:
        for ml in self.learner:
            MLmethods[ml] = self.learner[ml]
    else:
        MLmethods[self.learner.name] = self.learner

    models = {}
    self.__log("Calculating Statistics for MLmethods:")
    self.__log(" " + str([x for x in MLmethods]))

    # Check the data in advance so that, by chance, it will not fail at the last fold!
    for foldN in foldsN:
        trainData = self.data.select(DataIdxs, foldN, negate=1)
        self.__checkTrainData(trainData)  # Optional!!
# Order Learners so that PLS is the first sortedML = [ml for ml in MLmethods] if "PLS" in sortedML: sortedML.remove("PLS") sortedML.insert(0,"PLS") stepsDone = 0 nTotalSteps = len(sortedML) * self.nExtFolds for ml in sortedML: startTime = time.time() self.__log(" > "+str(ml)+"...") try: #Var for saving each Fols result results[ml] = [] exp_pred[ml] = [] models[ml] = [] nTrainEx[ml] = [] nTestEx[ml] = [] optAcc[ml] = [] logTxt = "" for foldN in foldsN: if type(self.learner) == dict: self.paramList = None trainData = self.data.select(DataIdxs,foldN,negate=1) testData = self.data.select(DataIdxs,foldN) smilesAttr = dataUtilities.getSMILESAttr(trainData) if smilesAttr: self.__log("Found SMILES attribute:"+smilesAttr) if MLmethods[ml].specialType == 1: trainData = dataUtilities.attributeSelectionData(trainData, [smilesAttr, trainData.domain.classVar.name]) testData = dataUtilities.attributeSelectionData(testData, [smilesAttr, testData.domain.classVar.name]) self.__log("Selected attrs: "+str([attr.name for attr in trainData.domain])) else: trainData = dataUtilities.attributeDeselectionData(trainData, [smilesAttr]) testData = dataUtilities.attributeDeselectionData(testData, [smilesAttr]) self.__log("Selected attrs: "+str([attr.name for attr in trainData.domain[0:3]] + ["..."] + [attr.name for attr in trainData.domain[len(trainData.domain)-3:]])) nTrainEx[ml].append(len(trainData)) nTestEx[ml].append(len(testData)) #Test if trainsets inside optimizer will respect dataSize criterias. # if not, don't optimize, but still train the model dontOptimize = False if self.responseType != "Classification" and (len(trainData)*(1-1.0/self.nInnerFolds) < 20): dontOptimize = True else: tmpDataIdxs = self.sampler(trainData, self.nInnerFolds) tmpTrainData = trainData.select(tmpDataIdxs,1,negate=1) if not self.__checkTrainData(tmpTrainData, False): dontOptimize = True SpecialModel = None if dontOptimize: logTxt += " Fold "+str(foldN)+": Too few compounds to optimize model hyper-parameters\n" self.__log(logTxt) if trainData.domain.classVar.varType == orange.VarTypes.Discrete: res = evalUtilities.crossValidation([MLmethods[ml]], trainData, folds=5, stratified=orange.MakeRandomIndices.StratifiedIfPossible, random_generator = random.randint(0, 100)) CA = evalUtilities.CA(res)[0] optAcc[ml].append(CA) else: res = evalUtilities.crossValidation([MLmethods[ml]], trainData, folds=5, stratified=orange.MakeRandomIndices.StratifiedIfPossible, random_generator = random.randint(0, 100)) R2 = evalUtilities.R2(res)[0] optAcc[ml].append(R2) else: if MLmethods[ml].specialType == 1: if trainData.domain.classVar.varType == orange.VarTypes.Discrete: optInfo, SpecialModel = MLmethods[ml].optimizePars(trainData, folds = 5) optAcc[ml].append(optInfo["Acc"]) else: res = evalUtilities.crossValidation([MLmethods[ml]], trainData, folds=5, stratified=orange.MakeRandomIndices.StratifiedIfPossible, random_generator = random.randint(0, 100)) R2 = evalUtilities.R2(res)[0] optAcc[ml].append(R2) else: runPath = miscUtilities.createScratchDir(baseDir = AZOC.NFS_SCRATCHDIR, desc = "AccWOptParam", seed = id(trainData)) trainData.save(os.path.join(runPath,"trainData.tab")) tunedPars = paramOptUtilities.getOptParam( learner = MLmethods[ml], trainDataFile = os.path.join(runPath,"trainData.tab"), paramList = self.paramList, useGrid = False, verbose = self.verbose, queueType = self.queueType, runPath = runPath, nExtFolds = None, nFolds = self.nInnerFolds, logFile = self.logFile, getTunedPars = True, fixedParams = self.fixedParams) if not MLmethods[ml] or 
not MLmethods[ml].optimized: self.__log(" WARNING: GETACCWOPTPARAM: The learner "+str(ml)+" was not optimized.") self.__log(" It will be ignored") #self.__log(" It will be set to default parameters") self.__log(" DEBUG can be done in: "+runPath) #Set learner back to default #MLmethods[ml] = MLmethods[ml].__class__() raise Exception("The learner "+str(ml)+" was not optimized.") else: if trainData.domain.classVar.varType == orange.VarTypes.Discrete: optAcc[ml].append(tunedPars[0]) else: res = evalUtilities.crossValidation([MLmethods[ml]], trainData, folds=5, stratified=orange.MakeRandomIndices.StratifiedIfPossible, random_generator = random.randint(0, 100)) R2 = evalUtilities.R2(res)[0] optAcc[ml].append(R2) miscUtilities.removeDir(runPath) #Train the model if SpecialModel is not None: model = SpecialModel else: model = MLmethods[ml](trainData) models[ml].append(model) #Test the model if self.responseType == "Classification": results[ml].append((evalUtilities.getClassificationAccuracy(testData, model), evalUtilities.getConfMat(testData, model) ) ) else: local_exp_pred = [] # Predict using bulk-predict predictions = model(testData) # Gather predictions for n,ex in enumerate(testData): local_exp_pred.append((ex.getclass().value, predictions[n].value)) results[ml].append((evalUtilities.calcRMSE(local_exp_pred), evalUtilities.calcRsqrt(local_exp_pred) ) ) #Save the experimental value and correspondent predicted value exp_pred[ml] += local_exp_pred if callBack: stepsDone += 1 if not callBack((100*stepsDone)/nTotalSteps): return None if callBackWithFoldModel: callBackWithFoldModel(model) res = self.createStatObj(results[ml], exp_pred[ml], nTrainEx[ml], nTestEx[ml],self.responseType, self.nExtFolds, logTxt, labels = hasattr(self.data.domain.classVar,"values") and list(self.data.domain.classVar.values) or None ) if self.verbose > 0: print "UnbiasedAccuracyGetter!Results "+ml+":\n" pprint(res) if not res: raise Exception("No results available!") res["runningTime"] = time.time() - startTime statistics[ml] = copy.deepcopy(res) self.__writeResults(statistics) self.__log(" OK") except: self.__log(" Learner "+str(ml)+" failed to create/optimize the model!") error = str(sys.exc_info()[0]) +" "+\ str(sys.exc_info()[1]) +" "+\ str(traceback.extract_tb(sys.exc_info()[2])) self.__log(error) res = self.createStatObj() statistics[ml] = copy.deepcopy(res) self.__writeResults(statistics) if not statistics or len(statistics) < 1: self.__log("ERROR: No statistics to return!") return None elif len(statistics) > 1: #We still need to build a consensus model out of the stable models # ONLY if there are more that one model stable! # When only one or no stable models, build a consensus based on all models # ALWAYS exclude specialType models (MLmethods[ml].specialType > 0) consensusMLs={} for modelName in statistics: StabilityValue = statistics[modelName]["StabilityValue"] if StabilityValue is not None and statistics[modelName]["stable"]: consensusMLs[modelName] = copy.deepcopy(statistics[modelName]) self.__log("Found "+str(len(consensusMLs))+" stable MLmethods out of "+str(len(statistics))+" MLmethods.") if len(consensusMLs) <= 1: # we need more models to build a consensus! 
consensusMLs={} for modelName in statistics: consensusMLs[modelName] = copy.deepcopy(statistics[modelName]) # Exclude specialType models excludeThis = [] for learnerName in consensusMLs: if models[learnerName][0].specialType > 0: excludeThis.append(learnerName) for learnerName in excludeThis: consensusMLs.pop(learnerName) self.__log(" > Excluded special model " + learnerName) self.__log(" > Stable modules: " + str(consensusMLs.keys())) if len(consensusMLs) >= 2: #Var for saving each Fols result startTime = time.time() Cresults = [] Cexp_pred = [] CnTrainEx = [] CnTestEx = [] self.__log("Calculating the statistics for a Consensus model based on "+str([ml for ml in consensusMLs])) for foldN in range(self.nExtFolds): if self.responseType == "Classification": CLASS0 = str(self.data.domain.classVar.values[0]) CLASS1 = str(self.data.domain.classVar.values[1]) # exprTest0 exprTest0 = "(0" for ml in consensusMLs: exprTest0 += "+( "+ml+" == "+CLASS0+" )*"+str(optAcc[ml][foldN])+" " exprTest0 += ")/IF0(sum([False" for ml in consensusMLs: exprTest0 += ", "+ml+" == "+CLASS0+" " exprTest0 += "]),1)" # exprTest1 exprTest1 = "(0" for ml in consensusMLs: exprTest1 += "+( "+ml+" == "+CLASS1+" )*"+str(optAcc[ml][foldN])+" " exprTest1 += ")/IF0(sum([False" for ml in consensusMLs: exprTest1 += ", "+ml+" == "+CLASS1+" " exprTest1 += "]),1)" # Expression expression = [exprTest0+" >= "+exprTest1+" -> "+CLASS0," -> "+CLASS1] else: Q2sum = sum([optAcc[ml][foldN] for ml in consensusMLs]) expression = "(1 / "+str(Q2sum)+") * (0" for ml in consensusMLs: expression += " + "+str(optAcc[ml][foldN])+" * "+ml+" " expression += ")" testData = self.data.select(DataIdxs,foldN+1) # fold 0 if for the train Bias!! smilesAttr = dataUtilities.getSMILESAttr(testData) if smilesAttr: self.__log("Found SMILES attribute:"+smilesAttr) testData = dataUtilities.attributeDeselectionData(testData, [smilesAttr]) self.__log("Selected attrs: "+str([attr.name for attr in trainData.domain[0:3]] + ["..."] + [attr.name for attr in trainData.domain[len(trainData.domain)-3:]])) CnTestEx.append(len(testData)) consensusClassifiers = {} for learnerName in consensusMLs: consensusClassifiers[learnerName] = models[learnerName][foldN] model = AZorngConsensus.ConsensusClassifier(classifiers = consensusClassifiers, expression = expression) CnTrainEx.append(model.NTrainEx) #Test the model if self.responseType == "Classification": Cresults.append((evalUtilities.getClassificationAccuracy(testData, model), evalUtilities.getConfMat(testData, model) ) ) else: local_exp_pred = [] # Predict using bulk-predict predictions = model(testData) # Gather predictions for n,ex in enumerate(testData): local_exp_pred.append((ex.getclass().value, predictions[n].value)) Cresults.append((evalUtilities.calcRMSE(local_exp_pred), evalUtilities.calcRsqrt(local_exp_pred) ) ) #Save the experimental value and correspondent predicted value Cexp_pred += local_exp_pred res = self.createStatObj(Cresults, Cexp_pred, CnTrainEx, CnTestEx, self.responseType, self.nExtFolds, labels = hasattr(self.data.domain.classVar,"values") and list(self.data.domain.classVar.values) or None ) res["runningTime"] = time.time() - startTime statistics["Consensus"] = copy.deepcopy(res) statistics["Consensus"]["IndividualStatistics"] = copy.deepcopy(consensusMLs) self.__writeResults(statistics) self.__log("Returned multiple ML methods statistics.") return statistics #By default return the only existing statistics! 
self.__writeResults(statistics)
self.__log("Returned only one ML method statistics.")
return statistics[statistics.keys()[0]]
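# --- Illustrative sketch (not part of the original module) ---------------
# The classification consensus above is a weighted vote: each learner's vote
# for a class is multiplied by that learner's per-fold optimization accuracy
# (optAcc[ml][foldN]), and the sum is normalized by the number of learners
# voting for that class; IF0(x, 1) is assumed, as in the expressions built
# above, to fall back to 1 when no learner votes for the class. The helper
# below rebuilds the same expression strings; the learner names and weights
# in the usage comment are hypothetical.
def buildTwoClassVoteExpression(learnerWeights, class0, class1):
    def weightedScore(classValue):
        expr = "(0"
        for ml, weight in learnerWeights.items():
            expr += "+( " + ml + " == " + classValue + " )*" + str(weight) + " "
        expr += ")/IF0(sum([False"
        for ml in learnerWeights:
            expr += ", " + ml + " == " + classValue + " "
        expr += "]),1)"
        return expr
    score0 = weightedScore(class0)
    score1 = weightedScore(class1)
    # Predict class0 when its weighted vote is at least as large as class1's.
    return [score0 + " >= " + score1 + " -> " + class0, " -> " + class1]
# Usage (hypothetical): buildTwoClassVoteExpression({"RF": 0.81, "SVM": 0.78}, "POS", "NEG")
# -------------------------------------------------------------------------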
def getAcc(self): """ For regression problems, it returns the RMSE and the Q2 For Classification problems, it returns CA and the ConfMat The return is made in a Dict: {"RMSE":0.2,"Q2":0.1,"CA":0.98,"CM":[[TP, FP],[FN,TN]]} For the EvalResults not supported for a specific learner/datase, the respective result will be None if the learner is a dict {"LearnerName":learner, ...} the results will be a dict with results for all Learners and for a consensus made out of those that were stable It some error occurred, the respective values in the Dict will be None """ self.__log("Starting Calculating MLStatistics") statistics = {} if not self.__areInputsOK(): return None # Set the response type self.responseType = self.data.domain.classVar.varType == orange.VarTypes.Discrete and "Classification" or "Regression" self.__log(" "+str(self.responseType)) #Create the Train and test sets DataIdxs = dataUtilities.SeedDataSampler(self.data, self.nExtFolds) #Var for saving each Fols result results = {} exp_pred = {} nTrainEx = {} nTestEx = {} #Set a dict of learners MLmethods = {} if type(self.learner) == dict: for ml in self.learner: MLmethods[ml] = self.learner[ml] else: MLmethods[self.learner.name] = self.learner models={} self.__log("Calculating Statistics for MLmethods:") self.__log(" "+str([x for x in MLmethods])) #Check data in advance so that, by chance, it will not faill at the last fold! for foldN in range(self.nExtFolds): trainData = self.data.select(DataIdxs[foldN],negate=1) self.__checkTrainData(trainData) for ml in MLmethods: self.__log(" > "+str(ml)+"...") try: #Var for saving each Fols result results[ml] = [] exp_pred[ml] = [] models[ml] = [] nTrainEx[ml] = [] nTestEx[ml] = [] logTxt = "" for foldN in range(self.nExtFolds): if type(self.learner) == dict: self.paramList = None trainData = self.data.select(DataIdxs[foldN],negate=1) testData = self.data.select(DataIdxs[foldN]) nTrainEx[ml].append(len(trainData)) nTestEx[ml].append(len(testData)) #Test if trainsets inside optimizer will respect dataSize criterias. 
# if not, don't optimize, but still train the model dontOptimize = False if self.responseType != "Classification" and (len(trainData)*(1-1.0/self.nInnerFolds) < 20): dontOptimize = True else: tmpDataIdxs = dataUtilities.SeedDataSampler(trainData, self.nInnerFolds) tmpTrainData = trainData.select(tmpDataIdxs[0],negate=1) if not self.__checkTrainData(tmpTrainData, False): dontOptimize = True if dontOptimize: logTxt += " Fold "+str(foldN)+": Too few compounds to optimize model hyper-parameters\n" self.__log(logTxt) else: runPath = miscUtilities.createScratchDir(baseDir = AZOC.NFS_SCRATCHDIR, desc = "AccWOptParam") trainData.save(os.path.join(runPath,"trainData.tab")) paramOptUtilities.getOptParam( learner = MLmethods[ml], trainDataFile = os.path.join(runPath,"trainData.tab"), paramList = self.paramList, useGrid = False, verbose = self.verbose, queueType = self.queueType, runPath = runPath, nExtFolds = None, nFolds = self.nInnerFolds) if not MLmethods[ml].optimized: self.__log(" The learner "+str(ml)+" was not optimized.") raise Exception("The learner "+str(ml)+" was not optimized.") miscUtilities.removeDir(runPath) #Train the model model = MLmethods[ml](trainData) models[ml].append(model) #Test the model if self.responseType == "Classification": results[ml].append((evalUtilities.getClassificationAccuracy(testData, model), evalUtilities.getConfMat(testData, model) ) ) else: local_exp_pred = [] for ex in testData: local_exp_pred.append((ex.getclass(), model(ex))) results[ml].append((evalUtilities.calcRMSE(local_exp_pred), evalUtilities.calcRsqrt(local_exp_pred) ) ) #Save the experimental value and correspondent predicted value exp_pred[ml] += local_exp_pred res = self.createStatObj(results[ml], exp_pred[ml], nTrainEx[ml], nTestEx[ml],self.responseType, self.nExtFolds, logTxt) if self.verbose > 0: print "AccWOptParamGetter!Results "+ml+":\n" pprint(res) if not res: raise Exception("No results available!") statistics[ml] = res.copy() self.__writeResults(statistics) self.__log(" OK") except: self.__log(" Learner "+str(ml)+" failed to create/optimize the model!") res = self.createStatObj() statistics[ml] = res.copy() if not statistics or len(statistics) < 1: self.__log("ERROR: No statistics to return!") return None elif len(statistics) > 1: #We still need to build a consensus model out of the stable models # ONLY if there are more that one model stable! 
stableML={} for modelName in statistics: StabilityValue = statistics[modelName]["StabilityValue"] if StabilityValue is not None: if self.responseType == "Classification": if statc.mean(statistics[modelName]["foldStat"]["nTestCmpds"]) > 50: stableTH = AZOC.QSARSTABILITYTHRESHOLD_CLASS_L else: stableTH = AZOC.QSARSTABILITYTHRESHOLD_CLASS_H else: if statc.mean(statistics[modelName]["foldStat"]["nTestCmpds"]) > 50: stableTH = AZOC.QSARSTABILITYTHRESHOLD_REG_L else: stableTH = AZOC.QSARSTABILITYTHRESHOLD_REG_H if StabilityValue < stableTH: # Select only stable models stableML[modelName] = statistics[modelName].copy() if len(stableML) >= 2: self.__log("Found "+str(len(stableML))+" stable MLmethods out of "+str(len(statistics))+" MLmethods.") if self.responseType == "Classification": CLASS0 = str(self.data.domain.classVar.values[0]) CLASS1 = str(self.data.domain.classVar.values[1]) exprTest0 = "(0" for ml in stableML: exprTest0 += "+( "+ml+" == "+CLASS0+" )*"+str(stableML[ml]["CA"])+" " exprTest0 += ")/IF0(sum([False" for ml in stableML: exprTest0 += ", "+ml+" == "+CLASS0+" " exprTest0 += "]),1)" exprTest1 = exprTest0.replace(CLASS0,CLASS1) expression = [exprTest0+" >= "+exprTest1+" -> "+CLASS0," -> "+CLASS1] else: Q2sum = sum([stableML[ml]["Q2"] for ml in stableML]) expression = "(1 / "+str(Q2sum)+") * (0" for ml in stableML: expression += " + "+str(stableML[ml]["Q2"])+" * "+ml+" " expression += ")" #Var for saving each Fols result Cresults = [] Cexp_pred = [] CnTrainEx = [] CnTestEx = [] self.__log("Calculating the statistics for a Consensus model") for foldN in range(self.nExtFolds): testData = self.data.select(DataIdxs[foldN]) CnTestEx.append(len(testData)) consensusClassifiers = {} for learnerName in stableML: consensusClassifiers[learnerName] = models[learnerName][foldN] model = AZorngConsensus.ConsensusClassifier(classifiers = consensusClassifiers, expression = expression) CnTrainEx.append(model.NTrainEx) #Test the model if self.responseType == "Classification": Cresults.append((evalUtilities.getClassificationAccuracy(testData, model), evalUtilities.getConfMat(testData, model) ) ) else: local_exp_pred = [] for ex in testData: local_exp_pred.append((ex.getclass(), model(ex))) Cresults.append((evalUtilities.calcRMSE(local_exp_pred), evalUtilities.calcRsqrt(local_exp_pred) ) ) #Save the experimental value and correspondent predicted value Cexp_pred += local_exp_pred res = self.createStatObj(Cresults, Cexp_pred, CnTrainEx, CnTestEx, self.responseType, self.nExtFolds) statistics["Consensus"] = res.copy() statistics["Consensus"]["IndividualStatistics"] = stableML.copy() self.__writeResults(statistics) self.__log("Returned multiple ML methods statistics.") return statistics #By default return the only existing statistics! self.__writeResults(statistics) self.__log("Returned only one ML method statistics.") return statistics[statistics.keys()[0]]
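# --- Illustrative sketch (not part of the original module) ---------------
# The stability filter above keeps a model only when its StabilityValue is
# below a threshold that depends on the response type and on whether the
# folds averaged more than 50 test compounds. The AZOC threshold constants
# are assumed to exist as referenced above; the numeric values in the usage
# comment are hypothetical placeholders, not the real configuration.
def selectStableModels(statistics, responseType, thresholds):
    """thresholds is keyed by (responseType, "large"/"small"), where "large"
       means the folds averaged more than 50 test compounds."""
    stable = {}
    for name, stat in statistics.items():
        value = stat["StabilityValue"]
        if value is None:
            continue
        nTest = stat["foldStat"]["nTestCmpds"]
        size = "large" if sum(nTest) / float(len(nTest)) > 50 else "small"
        if value < thresholds[(responseType, size)]:
            stable[name] = stat
    return stable
# Usage (hypothetical thresholds):
#   thresholds = {("Classification", "large"): 0.05, ("Classification", "small"): 0.10,
#                 ("Regression", "large"): 0.10, ("Regression", "small"): 0.20}
#   stableML = selectStableModels(statistics, "Regression", thresholds)
# -------------------------------------------------------------------------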
def getStatistics(dataset, runningDir, resultsFile, queueType = "NoSGE", verbose = 0, getAllModels = False, callBack = None): """ runningDir (An existing dir for creating one job dir per fold) | +---- status (The overall status: "started", "finished" or the progress "1/10", "2/10", ...) | +---- fold_1 | +---- fold_2 | . . . The running will be monitorized by this method. Whenever a MLMethod fails the respective fold job is restarted """ if dataset.domain.classVar.varType == orange.VarTypes.Discrete: responseType = "Classification" else: responseType = "Regression" #Create the Train and test sets DataIdxs = dataUtilities.SeedDataSampler(dataset, AZOC.QSARNEXTFOLDS ) #Check data in advance so that, by chance, it will not faill at the last fold! #for foldN in range(AZOC.QSARNEXTFOLDS): #trainData = dataset.select(DataIdxs[foldN],negate=1) #checkTrainData(trainData) jobs = {} thisDir = os.getcwd() os.chdir(runningDir) #PID = os.getpid() #print "Started getStatistics in Process with PID: "+str(PID) #os.system('echo "'+str(PID)+'" > '+os.path.join(runningDir,"PID")) os.system('echo "started" > '+os.path.join(runningDir,"status")) # Start all Fold jobs stepsDone = 0 nTotalSteps = AZOC.QSARNEXTFOLDS for fold in range(AZOC.QSARNEXTFOLDS): job = str(fold) print "Starting job for fold ",job trainData = dataset.select(DataIdxs[fold],negate=1) jobs[job] = {"job":job,"path":os.path.join(runningDir, "fold_"+job), "running":False, "failed":False, "finished":False} # Uncomment next 3 lines for running in finished jobs dirs #st, jID = commands.getstatusoutput("cat "+os.path.join(runningDir, "fold_"+job,"jID")) #jobs[job]["jID"] = jID #continue os.system("rm -rf "+jobs[job]["path"]) os.system("mkdir -p "+jobs[job]["path"]) trainData.save(os.path.join(jobs[job]["path"],"trainData.tab")) file_h = open(os.path.join(jobs[job]["path"],"run.sh"),"w") file_h.write("#!/bin/tcsh\n") file_h.write("source "+os.path.join(os.environ["AZORANGEHOME"], "templateProfile") + "\n") file_h.write("python "+os.path.join(jobs[job]["path"],"QsubScript.py")+"\n") file_h.close() file_h = open(os.path.join(jobs[job]["path"],"QsubScript.py"),"w") file_h.write("import os\n") file_h.write("from AZutilities import dataUtilities\n") file_h.write("from AZutilities import competitiveWorkflow\n") file_h.write("data = dataUtilities.DataTable('"+os.path.join(jobs[job]["path"],"trainData.tab")+"')\n") file_h.write('os.system(\'echo "running" > '+os.path.join(jobs[job]["path"],"status")+' \')\n') file_h.write("models = competitiveWorkflow.getModel(data, savePath = '"+os.path.join(jobs[job]["path"],"results.pkl")+"', queueType = '"+queueType+"', getAllModels = "+str(getAllModels)+")\n") file_h.write("nModelsSaved = 0\n") file_h.write("for model in models:\n") file_h.write(" if not models[model] is None:\n") file_h.write(" models[model].write('"+os.path.join(jobs[job]["path"],"model")+"'+'_'+model)\n") file_h.write(' nModelsSaved += 1\n') file_h.write('if nModelsSaved == len([m for m in models if not models[m] is None ]):\n') file_h.write(' os.system(\'echo "finished" > '+os.path.join(jobs[job]["path"],"status")+' \')\n') file_h.write('else:\n') file_h.write(' os.system(\'echo "failed" > '+os.path.join(jobs[job]["path"],"status")+' \')\n') file_h.close() os.chdir(os.path.join(jobs[job]["path"])) if queueType == "NoSGE": # Serial mode status, out = commands.getstatusoutput("tcsh " + os.path.join(jobs[job]["path"],"run.sh")) if status: print "ERROR on Job "+str(job)+" (will be restarted latter)" print out else: statusFile = 
os.path.join(jobs[job]["path"],"status") if os.path.isfile(statusFile): st, status = commands.getstatusoutput("cat "+statusFile) else: print "ERROR: Missing status file" status = None if not status: print "ERROR! job "+job+" has no status!" jobs[job]["failed"] = True elif status == "failed": print "Job "+job+" failed to build all models" jobs[job]["failed"] = True elif status == "finished": jobs[job]["finished"] = True if not isJobProgressingOK(jobs[job]): print "Job "+job+" failed to build one or more models in getMLStatistics" jobs[job]["failed"] = True jobs[job]["finished"] = False if jobs[job]["failed"]: print "Job "+job+" FAILED" else: print "Finished Job "+str(job)+" with success" if callBack: stepsDone += 1 if not callBack((100*stepsDone)/nTotalSteps): return None else: cmd = "qsub -cwd -q batch.q" + AZOC.SGE_QSUB_ARCH_OPTION_CURRENT + os.path.join(jobs[job]["path"],"run.sh") status, out = commands.getstatusoutput(cmd) if status: print "ERROR on Job "+str(job)+" (will be skipped)" print out #raise Exception("ERROR starting job for folder "+str(job)) # Your job 955801 ("template_run.sh") has been submitted jID = out.strip().split(" ")[2] print " jID: ",jID os.system('echo "'+jID+'" > '+os.path.join(jobs[job]["path"], "jID")) jobs[job]["running"] = True jobs[job]["jID"] = jID os.chdir(runningDir) os.chdir(thisDir) finished = [] if queueType == "NoSGE": failed = [] #Report failed Jobs for job in jobs: if jobs[job]["finished"]: finished.append(job) for job in jobs: if jobs[job]["failed"]: failed.append(job) print "Successful finished Jobs: ",finished print "Failed Jobs: ",failed else: # Monitor SGE jobs untill all are finished #Monitor Fold jobs updateJobsStatus(jobs) for job in jobs: if jobs[job]["finished"]: finished.append(job) print "Jobs already finished: ",finished os.system(' echo "'+str(len(finished))+'/'+str(AZOC.QSARNEXTFOLDS)+'" > '+os.path.join(runningDir,"status")) while len(finished) < AZOC.QSARNEXTFOLDS: print ".", sys.stdout.flush() updateJobsStatus(jobs) for job in jobs: if jobs[job]["finished"] and job not in finished: finished.append(job) if callBack: stepsDone += 1 if not callBack((100*stepsDone)/nTotalSteps): return None print time.asctime()+": Finished job "+str(job) os.system(' echo "'+str(len(finished))+'/'+str(AZOC.QSARNEXTFOLDS)+'" > '+os.path.join(runningDir,"status")) for job in [j for j in jobs if jobs[j]["failed"]]: jobs[job] = restartJob(jobs[job]) time.sleep(5) print "All fold jobs finished!" # Gather the results print "Gathering results..." 
#Var for saving each Fols result results = {} exp_pred = {} nTrainEx = {} nTestEx = {} # Var for saving the statistics results statistics = {} mlMethods = [ml for ml in AZOC.MLMETHODS] + ["Consensus"] sortedJobs = [job for job in jobs] sortedJobs.sort(cmp = lambda x,y:int(x)>int(y) and 1 or -1) # Place for storing the selected models results results["selectedML"] = [] exp_pred["selectedML"] = [] nTrainEx["selectedML"] = [] nTestEx["selectedML"] = [] foldSelectedML = [] for ml in mlMethods: # Loop over each MLMethod try: #Var for saving each Fols result results[ml] = [] exp_pred[ml] = [] nTrainEx[ml] = [] nTestEx[ml] = [] logTxt = "" for job in sortedJobs: #loop over each fold modelPath = os.path.join(jobs[job]["path"], "model_"+ml) if not os.path.isdir(modelPath): if getAllModels: print "MLMethod "+ml+" not available in fold "+job continue resFile = os.path.join(jobs[job]["path"], "results.pkl") statFile_h = open(resFile) foldStat = pickle.load(statFile_h) statFile_h.close() #load model model = AZBaseClasses.modelRead(modelPath) #Test the model testData = dataset.select(DataIdxs[int(job)]) nTrainEx[ml].append(model.NTrainEx) nTestEx[ml].append(len(testData)) if foldStat[ml]["selected"]: foldSelectedML.append(ml) nTrainEx["selectedML"].append(model.NTrainEx) nTestEx["selectedML"].append(len(testData)) if responseType == "Classification": results[ml].append((evalUtilities.getClassificationAccuracy(testData, model), evalUtilities.getConfMat(testData, model) ) ) if foldStat[ml]["selected"]: results["selectedML"].append(results[ml][-1]) else: local_exp_pred = [] for ex in testData: local_exp_pred.append((ex.getclass(), model(ex))) results[ml].append((evalUtilities.calcRMSE(local_exp_pred), evalUtilities.calcRsqrt(local_exp_pred) ) ) #Save the experimental value and correspondent predicted value exp_pred[ml] += local_exp_pred if foldStat[ml]["selected"]: results["selectedML"].append(results[ml][-1]) exp_pred["selectedML"]+= local_exp_pred res = createStatObj(results[ml], exp_pred[ml], nTrainEx[ml], nTestEx[ml],responseType, len(sortedJobs), logTxt) if not res: raise Exception("No results available!") if getAllModels: statistics[ml] = copy.deepcopy(res) writeResults(statistics, resultsFile) print " OK",ml except: print "Error on MLmethod "+ml+". It will be skipped" ml = "selectedML" res = createStatObj(results[ml], exp_pred[ml], nTrainEx[ml], nTestEx[ml],responseType, len(sortedJobs), logTxt, foldSelectedML) if not res: raise Exception("No results available!") statistics[ml] = copy.deepcopy(res) writeResults(statistics, resultsFile) os.system(' echo "finished" > '+os.path.join(runningDir,"status")) return statistics
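# --- Illustrative sketch (not part of the original module) ---------------
# getStatistics above drives one job directory per fold and polls a one-word
# "status" file in each ("started"/"running"/"finished"/"failed"). A minimal
# reader for that protocol could look like the helper below; the function
# name is hypothetical, and the restart/isJobProgressingOK handling used
# above is intentionally left out.
import os

def readFoldStatus(foldPath):
    """Return "finished", "failed", or None (still pending / no status yet)."""
    statusFile = os.path.join(foldPath, "status")
    if not os.path.isfile(statusFile):
        return None
    status = open(statusFile).read().strip()
    if status in ("finished", "failed"):
        return status
    return None
# -------------------------------------------------------------------------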
def getAcc(self, callBack=None, callBackWithFoldModel=None): """ For regression problems, it returns the RMSE and the Q2 For Classification problems, it returns CA and the ConfMat The return is made in a Dict: {"RMSE":0.2,"Q2":0.1,"CA":0.98,"CM":[[TP, FP],[FN,TN]]} For the EvalResults not supported for a specific learner/datase, the respective result will be None if the learner is a dict {"LearnerName":learner, ...} the results will be a dict with results for all Learners and for a consensus made out of those that were stable It some error occurred, the respective values in the Dict will be None """ self.__log("Starting Calculating MLStatistics") statistics = {} if not self.__areInputsOK(): return None # Set the response type self.responseType = self.data.domain.classVar.varType == orange.VarTypes.Discrete and "Classification" or "Regression" self.__log(" " + str(self.responseType)) #Create the Train and test sets if self.usePreDefFolds: DataIdxs = self.preDefIndices else: DataIdxs = self.sampler(self.data, self.nExtFolds) foldsN = [f for f in dict.fromkeys(DataIdxs) if f != 0 ] #Folds used only from 1 on ... 0 are for fixed train Bias nFolds = len(foldsN) #Fix the Indexes based on DataIdxs # (0s) represents the train set ( >= 1s) represents the test set folds if self.useVarCtrlCV: nShifted = [0] * nFolds for idx, isTest in enumerate( self.preDefIndices ): # self.preDefIndices == 0 are to be used in TrainBias if not isTest: if DataIdxs[idx]: nShifted[DataIdxs[idx]] += 1 DataIdxs[idx] = 0 for idx, shift in enumerate(nShifted): self.__log("In fold " + str(idx) + ", " + str(shift) + " examples were shifted to the train set.") #Var for saving each Fols result optAcc = {} results = {} exp_pred = {} nTrainEx = {} nTestEx = {} #Set a dict of learners MLmethods = {} if type(self.learner) == dict: for ml in self.learner: MLmethods[ml] = self.learner[ml] else: MLmethods[self.learner.name] = self.learner models = {} self.__log("Calculating Statistics for MLmethods:") self.__log(" " + str([x for x in MLmethods])) #Check data in advance so that, by chance, it will not faill at the last fold! for foldN in foldsN: trainData = self.data.select(DataIdxs, foldN, negate=1) self.__checkTrainData(trainData) #Optional!! 
# Order Learners so that PLS is the first sortedML = [ml for ml in MLmethods] if "PLS" in sortedML: sortedML.remove("PLS") sortedML.insert(0, "PLS") stepsDone = 0 nTotalSteps = len(sortedML) * self.nExtFolds for ml in sortedML: startTime = time.time() self.__log(" > " + str(ml) + "...") try: #Var for saving each Fols result results[ml] = [] exp_pred[ml] = [] models[ml] = [] nTrainEx[ml] = [] nTestEx[ml] = [] optAcc[ml] = [] logTxt = "" for foldN in foldsN: if type(self.learner) == dict: self.paramList = None trainData = self.data.select(DataIdxs, foldN, negate=1) testData = self.data.select(DataIdxs, foldN) smilesAttr = dataUtilities.getSMILESAttr(trainData) if smilesAttr: self.__log("Found SMILES attribute:" + smilesAttr) if MLmethods[ml].specialType == 1: trainData = dataUtilities.attributeSelectionData( trainData, [smilesAttr, trainData.domain.classVar.name]) testData = dataUtilities.attributeSelectionData( testData, [smilesAttr, testData.domain.classVar.name]) self.__log( "Selected attrs: " + str([attr.name for attr in trainData.domain])) else: trainData = dataUtilities.attributeDeselectionData( trainData, [smilesAttr]) testData = dataUtilities.attributeDeselectionData( testData, [smilesAttr]) self.__log("Selected attrs: " + str( [attr.name for attr in trainData.domain[0:3]] + ["..."] + [ attr.name for attr in trainData. domain[len(trainData.domain) - 3:] ])) nTrainEx[ml].append(len(trainData)) nTestEx[ml].append(len(testData)) #Test if trainsets inside optimizer will respect dataSize criterias. # if not, don't optimize, but still train the model dontOptimize = False if self.responseType != "Classification" and ( len(trainData) * (1 - 1.0 / self.nInnerFolds) < 20): dontOptimize = True else: tmpDataIdxs = self.sampler(trainData, self.nInnerFolds) tmpTrainData = trainData.select(tmpDataIdxs, 1, negate=1) if not self.__checkTrainData(tmpTrainData, False): dontOptimize = True SpecialModel = None if dontOptimize: logTxt += " Fold " + str( foldN ) + ": Too few compounds to optimize model hyper-parameters\n" self.__log(logTxt) if trainData.domain.classVar.varType == orange.VarTypes.Discrete: res = evalUtilities.crossValidation( [MLmethods[ml]], trainData, folds=5, stratified=orange.MakeRandomIndices. StratifiedIfPossible, random_generator=random.randint(0, 100)) CA = evalUtilities.CA(res)[0] optAcc[ml].append(CA) else: res = evalUtilities.crossValidation( [MLmethods[ml]], trainData, folds=5, stratified=orange.MakeRandomIndices. StratifiedIfPossible, random_generator=random.randint(0, 100)) R2 = evalUtilities.R2(res)[0] optAcc[ml].append(R2) else: if MLmethods[ml].specialType == 1: if trainData.domain.classVar.varType == orange.VarTypes.Discrete: optInfo, SpecialModel = MLmethods[ ml].optimizePars(trainData, folds=5) optAcc[ml].append(optInfo["Acc"]) else: res = evalUtilities.crossValidation( [MLmethods[ml]], trainData, folds=5, stratified=orange.MakeRandomIndices. 
StratifiedIfPossible, random_generator=random.randint(0, 100)) R2 = evalUtilities.R2(res)[0] optAcc[ml].append(R2) else: runPath = miscUtilities.createScratchDir( baseDir=AZOC.NFS_SCRATCHDIR, desc="AccWOptParam", seed=id(trainData)) trainData.save( os.path.join(runPath, "trainData.tab")) tunedPars = paramOptUtilities.getOptParam( learner=MLmethods[ml], trainDataFile=os.path.join( runPath, "trainData.tab"), paramList=self.paramList, useGrid=False, verbose=self.verbose, queueType=self.queueType, runPath=runPath, nExtFolds=None, nFolds=self.nInnerFolds, logFile=self.logFile, getTunedPars=True, fixedParams=self.fixedParams) if not MLmethods[ml] or not MLmethods[ml].optimized: self.__log( " WARNING: GETACCWOPTPARAM: The learner " + str(ml) + " was not optimized.") self.__log( " It will be ignored") #self.__log(" It will be set to default parameters") self.__log( " DEBUG can be done in: " + runPath) #Set learner back to default #MLmethods[ml] = MLmethods[ml].__class__() raise Exception("The learner " + str(ml) + " was not optimized.") else: if trainData.domain.classVar.varType == orange.VarTypes.Discrete: optAcc[ml].append(tunedPars[0]) else: res = evalUtilities.crossValidation( [MLmethods[ml]], trainData, folds=5, stratified=orange.MakeRandomIndices. StratifiedIfPossible, random_generator=random.randint( 0, 100)) R2 = evalUtilities.R2(res)[0] optAcc[ml].append(R2) miscUtilities.removeDir(runPath) #Train the model if SpecialModel is not None: model = SpecialModel else: model = MLmethods[ml](trainData) models[ml].append(model) #Test the model if self.responseType == "Classification": results[ml].append( (evalUtilities.getClassificationAccuracy( testData, model), evalUtilities.getConfMat(testData, model))) else: local_exp_pred = [] # Predict using bulk-predict predictions = model(testData) # Gather predictions for n, ex in enumerate(testData): local_exp_pred.append( (ex.getclass().value, predictions[n].value)) results[ml].append( (evalUtilities.calcRMSE(local_exp_pred), evalUtilities.calcRsqrt(local_exp_pred))) #Save the experimental value and correspondent predicted value exp_pred[ml] += local_exp_pred if callBack: stepsDone += 1 if not callBack((100 * stepsDone) / nTotalSteps): return None if callBackWithFoldModel: callBackWithFoldModel(model) res = self.createStatObj( results[ml], exp_pred[ml], nTrainEx[ml], nTestEx[ml], self.responseType, self.nExtFolds, logTxt, labels=hasattr(self.data.domain.classVar, "values") and list(self.data.domain.classVar.values) or None) if self.verbose > 0: print "UnbiasedAccuracyGetter!Results " + ml + ":\n" pprint(res) if not res: raise Exception("No results available!") res["runningTime"] = time.time() - startTime statistics[ml] = copy.deepcopy(res) self.__writeResults(statistics) self.__log(" OK") except: self.__log(" Learner " + str(ml) + " failed to create/optimize the model!") error = str(sys.exc_info()[0]) +" "+\ str(sys.exc_info()[1]) +" "+\ str(traceback.extract_tb(sys.exc_info()[2])) self.__log(error) res = self.createStatObj() statistics[ml] = copy.deepcopy(res) self.__writeResults(statistics) if not statistics or len(statistics) < 1: self.__log("ERROR: No statistics to return!") return None elif len(statistics) > 1: #We still need to build a consensus model out of the stable models # ONLY if there are more that one model stable! 
# When only one or no stable models, build a consensus based on all models # ALWAYS exclude specialType models (MLmethods[ml].specialType > 0) consensusMLs = {} for modelName in statistics: StabilityValue = statistics[modelName]["StabilityValue"] if StabilityValue is not None and statistics[modelName][ "stable"]: consensusMLs[modelName] = copy.deepcopy( statistics[modelName]) self.__log("Found " + str(len(consensusMLs)) + " stable MLmethods out of " + str(len(statistics)) + " MLmethods.") if len(consensusMLs ) <= 1: # we need more models to build a consensus! consensusMLs = {} for modelName in statistics: consensusMLs[modelName] = copy.deepcopy( statistics[modelName]) # Exclude specialType models excludeThis = [] for learnerName in consensusMLs: if models[learnerName][0].specialType > 0: excludeThis.append(learnerName) for learnerName in excludeThis: consensusMLs.pop(learnerName) self.__log(" > Excluded special model " + learnerName) self.__log(" > Stable modules: " + str(consensusMLs.keys())) if len(consensusMLs) >= 2: #Var for saving each Fols result startTime = time.time() Cresults = [] Cexp_pred = [] CnTrainEx = [] CnTestEx = [] self.__log( "Calculating the statistics for a Consensus model based on " + str([ml for ml in consensusMLs])) for foldN in range(self.nExtFolds): if self.responseType == "Classification": CLASS0 = str(self.data.domain.classVar.values[0]) CLASS1 = str(self.data.domain.classVar.values[1]) # exprTest0 exprTest0 = "(0" for ml in consensusMLs: exprTest0 += "+( " + ml + " == " + CLASS0 + " )*" + str( optAcc[ml][foldN]) + " " exprTest0 += ")/IF0(sum([False" for ml in consensusMLs: exprTest0 += ", " + ml + " == " + CLASS0 + " " exprTest0 += "]),1)" # exprTest1 exprTest1 = "(0" for ml in consensusMLs: exprTest1 += "+( " + ml + " == " + CLASS1 + " )*" + str( optAcc[ml][foldN]) + " " exprTest1 += ")/IF0(sum([False" for ml in consensusMLs: exprTest1 += ", " + ml + " == " + CLASS1 + " " exprTest1 += "]),1)" # Expression expression = [ exprTest0 + " >= " + exprTest1 + " -> " + CLASS0, " -> " + CLASS1 ] else: Q2sum = sum([optAcc[ml][foldN] for ml in consensusMLs]) expression = "(1 / " + str(Q2sum) + ") * (0" for ml in consensusMLs: expression += " + " + str( optAcc[ml][foldN]) + " * " + ml + " " expression += ")" testData = self.data.select( DataIdxs, foldN + 1) # fold 0 if for the train Bias!! 
smilesAttr = dataUtilities.getSMILESAttr(testData) if smilesAttr: self.__log("Found SMILES attribute:" + smilesAttr) testData = dataUtilities.attributeDeselectionData( testData, [smilesAttr]) self.__log("Selected attrs: " + str( [attr.name for attr in trainData.domain[0:3]] + ["..."] + [ attr.name for attr in trainData.domain[len(trainData.domain) - 3:] ])) CnTestEx.append(len(testData)) consensusClassifiers = {} for learnerName in consensusMLs: consensusClassifiers[learnerName] = models[ learnerName][foldN] model = AZorngConsensus.ConsensusClassifier( classifiers=consensusClassifiers, expression=expression) CnTrainEx.append(model.NTrainEx) #Test the model if self.responseType == "Classification": Cresults.append( (evalUtilities.getClassificationAccuracy( testData, model), evalUtilities.getConfMat(testData, model))) else: local_exp_pred = [] # Predict using bulk-predict predictions = model(testData) # Gather predictions for n, ex in enumerate(testData): local_exp_pred.append( (ex.getclass().value, predictions[n].value)) Cresults.append( (evalUtilities.calcRMSE(local_exp_pred), evalUtilities.calcRsqrt(local_exp_pred))) #Save the experimental value and correspondent predicted value Cexp_pred += local_exp_pred res = self.createStatObj( Cresults, Cexp_pred, CnTrainEx, CnTestEx, self.responseType, self.nExtFolds, labels=hasattr(self.data.domain.classVar, "values") and list(self.data.domain.classVar.values) or None) res["runningTime"] = time.time() - startTime statistics["Consensus"] = copy.deepcopy(res) statistics["Consensus"][ "IndividualStatistics"] = copy.deepcopy(consensusMLs) self.__writeResults(statistics) self.__log("Returned multiple ML methods statistics.") return statistics #By default return the only existing statistics! self.__writeResults(statistics) self.__log("Returned only one ML method statistics.") return statistics[statistics.keys()[0]]
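# --- Illustrative sketch (not part of the original module) ---------------
# The regression consensus expression built above is simply a weighted
# average: each model's prediction is multiplied by its inner-CV accuracy
# (the Q2/R2 stored in optAcc) and the sum is divided by the total weight.
# The two helpers below show the string form used above and its numeric
# equivalent; learner names and weights are hypothetical.
def buildRegressionConsensusExpression(learnerWeights):
    weightSum = sum(learnerWeights.values())
    expression = "(1 / " + str(weightSum) + ") * (0"
    for ml, weight in learnerWeights.items():
        expression += " + " + str(weight) + " * " + ml + " "
    expression += ")"
    return expression

def weightedConsensusPrediction(predictions, learnerWeights):
    """predictions: {"RF": 5.2, "PLS": 4.9}; learnerWeights: {"RF": 0.71, "PLS": 0.64}"""
    weightSum = sum(learnerWeights.values())
    return sum(learnerWeights[ml] * predictions[ml] for ml in learnerWeights) / weightSum
# Usage (hypothetical): buildRegressionConsensusExpression({"RF": 0.71, "PLS": 0.64})
# -------------------------------------------------------------------------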