def getSMARTSrecalcDesc(data, smarts): """ Calculates structural descriptors for test and training data. In other words, checks for the substructure occurrence (0/1) in the test or prediction molecules. Uses RDK. Expects the test/prediction data and a list of SMARTS strings. Returns the data including the new features. """ smilesName = dataUtilities.getSMILESAttr(data) if not smilesName or type(smarts) != list or not len(smarts): print "Please check the input parameters" return None existingAttrs = [attr for attr in smarts if attr in data.domain] if existingAttrs: print "The input data cannot contain the smarts to be calculated!" return None newdomain = orange.Domain(data.domain.attributes + \ [orange.FloatVariable(attr, numberOfDecimals=1) for attr in smarts],\ data.domain.classVar ) newdata = orange.ExampleTable(newdomain, data) for ex in newdata: smile = str(ex[smilesName].value) mol = rdk.Chem.MolFromSmiles(smile) if mol is None: continue for smrt in smarts: patt = rdk.Chem.MolFromSmarts(smrt) if mol.HasSubstructMatch(patt): ex[smrt] = 1.0 else: ex[smrt] = 0.0 return newdata
def getMLStatistics(trainData, mlList=[ml for ml in MLMETHODS if AZOC.MLMETHODS[ml]["useByDefault"]], savePath = None, queueType = "NoSGE", verbose = 0, logFile = None, callBack = None):
    """
    Evaluate every usable ML method named in mlList on trainData.

    A method is left out (and the reason logged) when its learner is not
    compatible with the class variable of trainData, or when it is a special
    learner (specialType == 1) and trainData has no SMILES attribute.

    The statistics returned by UnbiasedAccuracyGetter.getAcc are handed to
    saveMLStatistics together with savePath and are then returned.

    The queueType argument is accepted but the evaluator is always created
    with queueType "NoSGE".
    """
    log(logFile, "Running getMLStatistics...")
    hasSmiles = dataUtilities.getSMILESAttr(trainData)

    usableLearners = {}
    for mlName in mlList:
        candidate = MLMETHODS[mlName](name = mlName)
        if not candidate.isCompatible(trainData.domain.classVar):
            log(logFile, "Ignored learner "+str(mlName)+" since it's not compatible with this class.")
        elif candidate.specialType == 1 and not hasSmiles:
            log(logFile, "Ignored learner "+str(mlName)+" since it's special and requires a SMILES attribute.")
        else:
            usableLearners[mlName] = candidate

    # The queue type is forced to NoSGE so that appspack does not flood the cluster
    evaluator = getUnbiasedAccuracy.UnbiasedAccuracyGetter(
        data = trainData,
        learner = usableLearners,
        paramList = None,
        nExtFolds = AZOC.QSARNINNERFOLDS,
        nInnerFolds = AZOC.QSARNCVFOLDS,
        queueType = "NoSGE",
        verbose = verbose,
        logFile = logFile,
        resultsFile = savePath)
    MLStatistics = evaluator.getAcc(callBack = callBack)
    saveMLStatistics(savePath, MLStatistics, logFile)
    return MLStatistics
def getSMILESAttr(data): # Check that the data contains a SMILES attribute smilesName = dataUtilities.getSMILESAttr(data) if not smilesName: print "Warning: The data set does not contain any known smiles attribute!" print "No similarity descriptors added!" return None else: return smilesName
def getSMILESAttr(data): # Check that the data contains a SMILES attribute smilesName = dataUtilities.getSMILESAttr(data) if not smilesName: print "Warning: The data set does not contain any known smiles attribute!" print " Expected SMILES attribute names: "+str(AZOC.SMILESNAMES) print "No Cinfony descriptors added!" return None else: return smilesName
def buildConsensus(trainData, learners, MLMethods, logFile = None):
    """
    Train a consensus model out of learners, weighted by the "optAcc" values
    found in MLMethods.

    Discrete class: the expression is a two element list. The first element
    compares the average optAcc of the methods voting for the first class
    value with the average optAcc of the methods voting for the second class
    value; the second element is the fallback that selects the second class.
    Otherwise: the expression is the optAcc-weighted mean of the predictions
    of the individual methods.

    When trainData holds a SMILES attribute, the data is reduced to the SMILES
    and the class attributes for a special consensus learner
    (specialType == 1); for any other learner the SMILES attribute is removed.

    Returns what the ConsensusLearner returns when called with the train data.
    """
    log(logFile, "Building a consensus model based on optimized MLmethods: "+str([ml for ml in MLMethods])+"...")
    classVar = trainData.domain.classVar
    if classVar.varType == orange.VarTypes.Discrete:
        #Expression: If CAavg_{POS} ge CAavg_{NEG} -> POS else -> NEG
        #    where CAavg_{POS} is the average of classification accuracies of all models predicting POS.
        def avgAccExpr(label):
            # Sum of the accuracies of the methods predicting 'label' divided by the number of such methods
            weighted = ["+( "+ml+" == "+label+" )*"+str(MLMethods[ml]["optAcc"])+" " for ml in MLMethods]
            counted = [", "+ml+" == "+label+" " for ml in MLMethods]
            return "(0" + "".join(weighted) + ")/IF0(sum([False" + "".join(counted) + "]),1)"

        CLASS0 = str(classVar.values[0])
        CLASS1 = str(classVar.values[1])
        expression = [avgAccExpr(CLASS0)+" >= "+avgAccExpr(CLASS1)+" -> "+CLASS0, " -> "+CLASS1]
    else:
        accPerML = [(ml, MLMethods[ml]["optAcc"]) for ml in MLMethods]
        Q2sum = sum([acc for _, acc in accPerML])
        terms = [" + "+str(acc)+" * "+ml+" " for ml, acc in accPerML]
        expression = "(1 / "+str(Q2sum)+") * (0" + "".join(terms) + ")"

    consensusLearners = dict((name, learners[name]) for name in learners)
    learner = AZorngConsensus.ConsensusLearner(learners = consensusLearners, expression = expression)

    log(logFile, " Training Consensus Learner")
    smilesAttr = dataUtilities.getSMILESAttr(trainData)
    if not smilesAttr:
        return learner(trainData)

    log(logFile,"Found SMILES attribute:"+smilesAttr)
    if learner.specialType == 1:
        keepAttrs = [smilesAttr, trainData.domain.classVar.name]
        trainData = dataUtilities.attributeSelectionData(trainData, keepAttrs)
        log(logFile,"Selected attrs: "+str([attr.name for attr in trainData.domain]))
    else:
        trainData = dataUtilities.attributeDeselectionData(trainData, [smilesAttr])
        # Only the first and the last three attribute names are logged
        firstNames = [attr.name for attr in trainData.domain[0:3]]
        lastNames = [attr.name for attr in trainData.domain[len(trainData.domain)-3:]]
        log(logFile,"Selected attrs: "+str(firstNames + ["..."] + lastNames))
    return learner(trainData)
def __createBBRCInputs(self): if not self.data: print "ERROR: Data must be loaded first!" return None if self.active not in self.data.domain.classVar.values: print "ERROR: '" + str(self.active) + "' is not part of the class values!" return None smilesName = dataUtilities.getSMILESAttr(self.data) for idx, ex in enumerate(self.data): if ex.getclass().value == self.active: activity = 1 else: activity = 0 ID = idx + 1 # ID is the number of coumpound in self.data which is the number os the example (1 based!) self.MyFminer.AddCompound(str(ex[smilesName].value), ID) self.MyFminer.AddActivity(activity, ID)
def getMLStatistics(
        trainData,
        mlList=[ml for ml in MLMETHODS if AZOC.MLMETHODS[ml]["useByDefault"]],
        savePath=None,
        queueType="NoSGE",
        verbose=0,
        logFile=None,
        callBack=None):
    """
    Compute the statistics of every usable ML method listed in mlList.

    Learners that are not compatible with the class variable of trainData, and
    special learners (specialType == 1) when trainData has no SMILES
    attribute, are ignored and the reason is logged.

    The result of UnbiasedAccuracyGetter.getAcc is passed to saveMLStatistics
    along with savePath and is returned afterwards.

    queueType is part of the signature only: the evaluator is always built
    with queueType "NoSGE".
    """
    log(logFile, "Running getMLStatistics...")
    smilesAttr = dataUtilities.getSMILESAttr(trainData)

    learners = {}
    for mlName in mlList:
        learner = MLMETHODS[mlName](name=mlName)
        # Work out why (if at all) this learner has to be skipped
        skipReason = None
        if not learner.isCompatible(trainData.domain.classVar):
            skipReason = " since it's not compatible with this class."
        elif learner.specialType == 1 and not smilesAttr:
            skipReason = " since it's special and requires a SMILES attribute."

        if skipReason is None:
            learners[mlName] = learner
        else:
            log(logFile, "Ignored learner " + str(mlName) + skipReason)

    # The queue type is forced to NoSGE so that appspack does not flood the cluster
    evaluatorArgs = {
        "data": trainData,
        "learner": learners,
        "paramList": None,
        "nExtFolds": AZOC.QSARNINNERFOLDS,
        "nInnerFolds": AZOC.QSARNCVFOLDS,
        "queueType": "NoSGE",
        "verbose": verbose,
        "logFile": logFile,
        "resultsFile": savePath,
    }
    evaluator = getUnbiasedAccuracy.UnbiasedAccuracyGetter(**evaluatorArgs)
    MLStatistics = evaluator.getAcc(callBack=callBack)
    saveMLStatistics(savePath, MLStatistics, logFile)
    return MLStatistics
def __createBBRCInputs(self): if not self.data: print "ERROR: Data must be loaded first!" return None if self.active and self.active not in self.data.domain.classVar.values: print "ERROR: '" + str( self.active) + "' is not part of the class values!" return None smilesName = dataUtilities.getSMILESAttr(self.data) print "SMILES attr detected: ", smilesName for idx, ex in enumerate(self.data): if not self.active: activity = 0 #It is unknown elif ex.getclass().value == self.active: activity = 1 else: activity = 0 ID = idx + 1 # ID is the number of coumpound in self.data which is the number os the example (1 based!) self.MyFminer.AddCompound(str(ex[smilesName].value), ID) self.MyFminer.AddActivity(activity, ID)
def buildModel(trainData, MLMethod, queueType="NoSGE", verbose=0, logFile=None): """ Buld the method passed in MLMethod and optimize ( "IndividualStatistics" not in MLMethod) if MLMethod is a Consensus ("individualStatistics" in MLMethod) , build each and optimize first all models and after build the consensus! """ log(logFile, "Building and optimizing learner: " + MLMethod["MLMethod"] + "...") learners = {} MLMethods = {} if "IndividualStatistics" in MLMethod: #It is a consensus and will certaily not contain any #special model as it was filtered in the getUnbiasedAcc for ML in MLMethod["IndividualStatistics"]: MLMethods[ML] = copy.deepcopy(MLMethod["IndividualStatistics"][ML]) else: ML = MLMethod["MLMethod"] if MLMETHODS[ML]( name=ML ).specialType == 1: # If is a special model and has a built-in optimizaer log(logFile, " This is a special model") smilesAttr = dataUtilities.getSMILESAttr(trainData) if smilesAttr: log(logFile, "Found SMILES attribute:" + smilesAttr) trainData = dataUtilities.attributeSelectionData( trainData, [smilesAttr, trainData.domain.classVar.name]) optInfo, SpecialModel = MLMETHODS[ML](name=ML).optimizePars( trainData, folds=5) return SpecialModel else: MLMethods[MLMethod["MLMethod"]] = MLMethod smilesAttr = dataUtilities.getSMILESAttr(trainData) if smilesAttr: trainData = dataUtilities.attributeDeselectionData( trainData, [smilesAttr]) # optimize all MLMethods for ML in MLMethods: log(logFile, " Optimizing MLmethod: " + ML) learners[ML] = MLMETHODS[ML](name=ML) runPath = miscUtilities.createScratchDir( baseDir=AZOC.NFS_SCRATCHDIR, desc="competitiveWorkflow_BuildModel") trainData.save(os.path.join(runPath, "trainData.tab")) tunedPars = paramOptUtilities.getOptParam(learner=learners[ML], trainDataFile=os.path.join( runPath, "trainData.tab"), useGrid=False, verbose=verbose, queueType=queueType, runPath=runPath, nExtFolds=None, logFile=logFile, getTunedPars=True) if not learners[ML].optimized: print "WARNING: competitiveWorkflow: The learner " + 
str( learners[ML]) + " was not optimized." #print " Using default parameters" print " The " + str(learners[ML]) + " will not be included" #print " Returning None" print " DEBUG can be made in: " + runPath #Setting default parameters #learners[ML] = learners[ML].__class__() #return None learners.pop(ML) continue else: print "Optimized learner ", learners[ML] if trainData.domain.classVar.varType == orange.VarTypes.Discrete: MLMethods[ML]["optAcc"] = tunedPars[0] else: res = orngTest.crossValidation( [learners[ML]], trainData, folds=5, strat=orange.MakeRandomIndices.StratifiedIfPossible, randomGenerator=random.randint(0, 100)) R2 = evalUtilities.R2(res)[0] MLMethods[ML]["optAcc"] = R2 miscUtilities.removeDir(runPath) #Train the model if len(learners) == 1: log(logFile, " Building the model:" + learners.keys()[0]) model = learners[learners.keys()[0]](trainData) elif len(learners) >= 1: model = buildConsensus(trainData, learners, MLMethods) else: print "ERROR: No Learners were selected!" return None return model
def buildConsensus(trainData, learners, MLMethods, logFile=None):
    """
    Train a consensus model out of learners, weighted by the "optAcc" values
    found in MLMethods.

    Discrete class: the expression is a two element list. The first element
    compares the average optAcc of the methods voting for the first class
    value with the average optAcc of the methods voting for the second class
    value; the second element is the fallback that selects the second class.
    Otherwise: the expression is the optAcc-weighted mean of the predictions
    of the individual methods.

    When trainData holds a SMILES attribute, the data is reduced to the SMILES
    and the class attributes for a special consensus learner
    (specialType == 1); for any other learner the SMILES attribute is removed.

    Returns what the ConsensusLearner returns when called with the train data.
    """
    log(
        logFile, "Building a consensus model based on optimized MLmethods: " +
        str([ml for ml in MLMethods]) + "...")
    classVar = trainData.domain.classVar
    if classVar.varType == orange.VarTypes.Discrete:
        #Expression: If CAavg_{POS} ge CAavg_{NEG} -> POS else -> NEG
        #    where CAavg_{POS} is the average of classification accuracies of all models predicting POS.
        classLabels = [str(classVar.values[0]), str(classVar.values[1])]
        avgAccTests = []
        for label in classLabels:
            # Numerator: summed accuracies of the methods predicting this label
            accSum = "(0"
            # Denominator: number of methods predicting this label
            nVoters = ")/IF0(sum([False"
            for ml in MLMethods:
                accSum += "+( " + ml + " == " + label + " )*" + str(
                    MLMethods[ml]["optAcc"]) + " "
                nVoters += ", " + ml + " == " + label + " "
            avgAccTests.append(accSum + nVoters + "]),1)")
        expression = [
            avgAccTests[0] + " >= " + avgAccTests[1] + " -> " +
            classLabels[0], " -> " + classLabels[1]
        ]
    else:
        Q2sum = sum([MLMethods[ml]["optAcc"] for ml in MLMethods])
        weightedTerms = [
            " + " + str(MLMethods[ml]["optAcc"]) + " * " + ml + " "
            for ml in MLMethods
        ]
        expression = "(1 / " + str(Q2sum) + ") * (0" + "".join(
            weightedTerms) + ")"

    consensusLearners = {}
    consensusLearners.update(
        (learnerName, learners[learnerName]) for learnerName in learners)
    learner = AZorngConsensus.ConsensusLearner(learners=consensusLearners,
                                               expression=expression)

    log(logFile, " Training Consensus Learner")
    smilesAttr = dataUtilities.getSMILESAttr(trainData)
    if smilesAttr:
        log(logFile, "Found SMILES attribute:" + smilesAttr)
        if learner.specialType != 1:
            trainData = dataUtilities.attributeDeselectionData(
                trainData, [smilesAttr])
            # Only the first and the last three attribute names are logged
            nAttrs = len(trainData.domain)
            headNames = [attr.name for attr in trainData.domain[0:3]]
            tailNames = [attr.name for attr in trainData.domain[nAttrs - 3:]]
            log(logFile,
                "Selected attrs: " + str(headNames + ["..."] + tailNames))
        else:
            trainData = dataUtilities.attributeSelectionData(
                trainData, [smilesAttr, trainData.domain.classVar.name])
            log(
                logFile, "Selected attrs: " +
                str([attr.name for attr in trainData.domain]))
    return learner(trainData)
def getAcc(self, callBack = None, callBackWithFoldModel = None):
    """ Compute the statistics of self.learner over the external folds of self.data.

        For regression problems, it returns the RMSE and the Q2
        For Classification problems, it returns CA and the ConfMat
        The return is made in a Dict: {"RMSE":0.2,"Q2":0.1,"CA":0.98,"CM":[[TP, FP],[FN,TN]]}
        For the EvalResults not supported for a specific learner/dataset, the respective result will be None

        If the learner is a dict {"LearnerName":learner, ...} the results will be a dict with results for all Learners and for a consensus
            made out of those that were stable

        If some error occurred, the respective values in the Dict will be None

        callBack              - optional callable; after each fold of each learner it is called with the
                                percentage of steps done, and a false return value aborts the run (None is returned).
        callBackWithFoldModel - optional callable; called with the model built in each fold.

        Returns None when the inputs are not OK, when callBack asked to abort or when no statistics exist;
        the dict of all statistics when more than one learner was evaluated; otherwise the statistics of the
        single learner.
    """
    self.__log("Starting Calculating MLStatistics")
    statistics = {}
    if not self.__areInputsOK():
        return None
    # Set the response type
    self.responseType = self.data.domain.classVar.varType == orange.VarTypes.Discrete and "Classification" or "Regression"
    self.__log(" "+str(self.responseType))

    #Create the Train and test sets
    if self.usePreDefFolds:
        DataIdxs = self.preDefIndices
    else:
        DataIdxs = self.sampler(self.data, self.nExtFolds)
    foldsN = [f for f in dict.fromkeys(DataIdxs) if f != 0] #Folds used only from 1 on ... 0 are for fixed train Bias
    nFolds = len(foldsN)
    #Fix the Indexes based on DataIdxs
    # (0s) represents the train set ( >= 1s) represents the test set folds
    if self.useVarCtrlCV:
        # NOTE(review): nShifted has nFolds entries but is indexed with the fold number taken
        #   from DataIdxs; if the fold numbers run from 1 to nFolds the last fold would be out of range - confirm.
        nShifted = [0] * nFolds
        for idx,isTest in enumerate(self.preDefIndices): # self.preDefIndices == 0 are to be used in TrainBias
            if not isTest:
                if DataIdxs[idx]:
                    nShifted[DataIdxs[idx]] += 1
                    DataIdxs[idx] = 0
        for idx,shift in enumerate(nShifted):
            self.__log("In fold "+str(idx)+", "+str(shift)+" examples were shifted to the train set.")

    #Vars for saving each fold's result (one list per learner, one entry per fold)
    optAcc = {}
    results = {}
    exp_pred = {}
    nTrainEx = {}
    nTestEx = {}

    #Set a dict of learners
    MLmethods = {}
    if type(self.learner) == dict:
        for ml in self.learner:
            MLmethods[ml] = self.learner[ml]
    else:
        MLmethods[self.learner.name] = self.learner

    models={}
    self.__log("Calculating Statistics for MLmethods:")
    self.__log(" "+str([x for x in MLmethods]))

    #Check data in advance so that, by chance, it will not fail at the last fold!
    for foldN in foldsN:
        trainData = self.data.select(DataIdxs,foldN,negate=1)
        self.__checkTrainData(trainData)

    #Optional!!
    # Order Learners so that PLS is the first
    sortedML = [ml for ml in MLmethods]
    if "PLS" in sortedML:
        sortedML.remove("PLS")
        sortedML.insert(0,"PLS")

    # Progress bookkeeping for callBack: one step per learner and per external fold
    stepsDone = 0
    nTotalSteps = len(sortedML) * self.nExtFolds
    for ml in sortedML:
        startTime = time.time()
        self.__log(" > "+str(ml)+"...")
        try:
            #Vars for saving each fold's result
            results[ml] = []
            exp_pred[ml] = []
            models[ml] = []
            nTrainEx[ml] = []
            nTestEx[ml] = []
            optAcc[ml] = []
            logTxt = ""
            for foldN in foldsN:
                if type(self.learner) == dict:
                    self.paramList = None

                # The examples of fold foldN are the test set, all the others are the train set
                trainData = self.data.select(DataIdxs,foldN,negate=1)
                testData = self.data.select(DataIdxs,foldN)
                smilesAttr = dataUtilities.getSMILESAttr(trainData)
                if smilesAttr:
                    self.__log("Found SMILES attribute:"+smilesAttr)
                    if MLmethods[ml].specialType == 1:
                        # Special learners only get the SMILES and the class attributes
                        trainData = dataUtilities.attributeSelectionData(trainData, [smilesAttr, trainData.domain.classVar.name])
                        testData = dataUtilities.attributeSelectionData(testData, [smilesAttr, testData.domain.classVar.name])
                        self.__log("Selected attrs: "+str([attr.name for attr in trainData.domain]))
                    else:
                        # All the other learners get the data without the SMILES attribute
                        trainData = dataUtilities.attributeDeselectionData(trainData, [smilesAttr])
                        testData = dataUtilities.attributeDeselectionData(testData, [smilesAttr])
                        self.__log("Selected attrs: "+str([attr.name for attr in trainData.domain[0:3]] + ["..."] + [attr.name for attr in trainData.domain[len(trainData.domain)-3:]]))

                nTrainEx[ml].append(len(trainData))
                nTestEx[ml].append(len(testData))
                #Test if trainsets inside optimizer will respect dataSize criteria.
                # if not, don't optimize, but still train the model
                dontOptimize = False
                if self.responseType != "Classification" and (len(trainData)*(1-1.0/self.nInnerFolds) < 20):
                    dontOptimize = True
                else:
                    tmpDataIdxs = self.sampler(trainData, self.nInnerFolds)
                    tmpTrainData = trainData.select(tmpDataIdxs,1,negate=1)
                    if not self.__checkTrainData(tmpTrainData, False):
                        dontOptimize = True

                SpecialModel = None
                if dontOptimize:
                    logTxt += " Fold "+str(foldN)+": Too few compounds to optimize model hyper-parameters\n"
                    self.__log(logTxt)
                    # No optimization: the fold weight (optAcc) comes from a 5-fold cross validation of the learner as it is
                    if trainData.domain.classVar.varType == orange.VarTypes.Discrete:
                        res = evalUtilities.crossValidation([MLmethods[ml]], trainData, folds=5, stratified=orange.MakeRandomIndices.StratifiedIfPossible, random_generator = random.randint(0, 100))
                        CA = evalUtilities.CA(res)[0]
                        optAcc[ml].append(CA)
                    else:
                        res = evalUtilities.crossValidation([MLmethods[ml]], trainData, folds=5, stratified=orange.MakeRandomIndices.StratifiedIfPossible, random_generator = random.randint(0, 100))
                        R2 = evalUtilities.R2(res)[0]
                        optAcc[ml].append(R2)
                else:
                    if MLmethods[ml].specialType == 1:
                        # Special learners: optimizePars is only used for classification and also returns the model
                        if trainData.domain.classVar.varType == orange.VarTypes.Discrete:
                            optInfo, SpecialModel = MLmethods[ml].optimizePars(trainData, folds = 5)
                            optAcc[ml].append(optInfo["Acc"])
                        else:
                            res = evalUtilities.crossValidation([MLmethods[ml]], trainData, folds=5, stratified=orange.MakeRandomIndices.StratifiedIfPossible, random_generator = random.randint(0, 100))
                            R2 = evalUtilities.R2(res)[0]
                            optAcc[ml].append(R2)
                    else:
                        # Regular learners: optimize the parameters on the train set saved in a scratch dir
                        runPath = miscUtilities.createScratchDir(baseDir = AZOC.NFS_SCRATCHDIR, desc = "AccWOptParam", seed = id(trainData))
                        trainData.save(os.path.join(runPath,"trainData.tab"))
                        tunedPars = paramOptUtilities.getOptParam(
                            learner = MLmethods[ml],
                            trainDataFile = os.path.join(runPath,"trainData.tab"),
                            paramList = self.paramList,
                            useGrid = False,
                            verbose = self.verbose,
                            queueType = self.queueType,
                            runPath = runPath,
                            nExtFolds = None,
                            nFolds = self.nInnerFolds,
                            logFile = self.logFile,
                            getTunedPars = True,
                            fixedParams = self.fixedParams)
                        if not MLmethods[ml] or not MLmethods[ml].optimized:
                            self.__log(" WARNING: GETACCWOPTPARAM: The learner "+str(ml)+" was not optimized.")
                            self.__log(" It will be ignored")
                            #self.__log(" It will be set to default parameters")
                            self.__log(" DEBUG can be done in: "+runPath)
                            #Set learner back to default
                            #MLmethods[ml] = MLmethods[ml].__class__()
                            # Caught by the except below; runPath is not removed in this case
                            raise Exception("The learner "+str(ml)+" was not optimized.")
                        else:
                            if trainData.domain.classVar.varType == orange.VarTypes.Discrete:
                                optAcc[ml].append(tunedPars[0])
                            else:
                                res = evalUtilities.crossValidation([MLmethods[ml]], trainData, folds=5, stratified=orange.MakeRandomIndices.StratifiedIfPossible, random_generator = random.randint(0, 100))
                                R2 = evalUtilities.R2(res)[0]
                                optAcc[ml].append(R2)

                        miscUtilities.removeDir(runPath)
                #Train the model
                if SpecialModel is not None:
                    model = SpecialModel
                else:
                    model = MLmethods[ml](trainData)
                models[ml].append(model)
                #Test the model
                if self.responseType == "Classification":
                    results[ml].append((evalUtilities.getClassificationAccuracy(testData, model), evalUtilities.getConfMat(testData, model) ) )
                else:
                    local_exp_pred = []
                    # Predict using bulk-predict
                    predictions = model(testData)
                    # Gather predictions
                    for n,ex in enumerate(testData):
                        local_exp_pred.append((ex.getclass().value, predictions[n].value))
                    results[ml].append((evalUtilities.calcRMSE(local_exp_pred), evalUtilities.calcRsqrt(local_exp_pred) ) )
                    #Save the experimental value and correspondent predicted value
                    exp_pred[ml] += local_exp_pred
                if callBack:
                    stepsDone += 1
                    # A false return value from callBack aborts the whole calculation
                    if not callBack((100*stepsDone)/nTotalSteps): return None
                if callBackWithFoldModel:
                    callBackWithFoldModel(model)

            res = self.createStatObj(results[ml], exp_pred[ml], nTrainEx[ml], nTestEx[ml],self.responseType, self.nExtFolds, logTxt, labels = hasattr(self.data.domain.classVar,"values") and list(self.data.domain.classVar.values) or None )
            if self.verbose > 0:
                print "UnbiasedAccuracyGetter!Results "+ml+":\n"
                pprint(res)
            if not res:
                raise Exception("No results available!")
            res["runningTime"] = time.time() - startTime
            statistics[ml] = copy.deepcopy(res)
            self.__writeResults(statistics)
            self.__log(" OK")
        except:
            # Any failure of this learner is logged and an argument-less stat object is stored
            #   for it, so that the remaining learners are still evaluated
            self.__log(" Learner "+str(ml)+" failed to create/optimize the model!")
            error = str(sys.exc_info()[0]) +" "+\
                    str(sys.exc_info()[1]) +" "+\
                    str(traceback.extract_tb(sys.exc_info()[2]))
            self.__log(error)

            res = self.createStatObj()
            statistics[ml] = copy.deepcopy(res)
            self.__writeResults(statistics)

    if not statistics or len(statistics) < 1:
        self.__log("ERROR: No statistics to return!")
        return None
    elif len(statistics) > 1:
        #We still need to build a consensus model out of the stable models
        # ONLY if there are more than one model stable!
        # When only one or no stable models, build a consensus based on all models
        # ALWAYS exclude specialType models (MLmethods[ml].specialType > 0)
        consensusMLs={}
        for modelName in statistics:
            StabilityValue = statistics[modelName]["StabilityValue"]
            if StabilityValue is not None and statistics[modelName]["stable"]:
                consensusMLs[modelName] = copy.deepcopy(statistics[modelName])

        self.__log("Found "+str(len(consensusMLs))+" stable MLmethods out of "+str(len(statistics))+" MLmethods.")

        if len(consensusMLs) <= 1: # we need more models to build a consensus!
            consensusMLs={}
            for modelName in statistics:
                consensusMLs[modelName] = copy.deepcopy(statistics[modelName])

        # Exclude specialType models
        # NOTE(review): a learner that failed before its first fold model was stored has an empty
        #   models[...] list, so indexing [0] below would fail for it - verify this cannot be reached.
        excludeThis = []
        for learnerName in consensusMLs:
            if models[learnerName][0].specialType > 0:
                excludeThis.append(learnerName)
        for learnerName in excludeThis:
            consensusMLs.pop(learnerName)
            self.__log(" > Excluded special model " + learnerName)
        self.__log(" > Stable modules: " + str(consensusMLs.keys()))

        if len(consensusMLs) >= 2:
            #Vars for saving each fold's result
            startTime = time.time()
            Cresults = []
            Cexp_pred = []
            CnTrainEx = []
            CnTestEx = []
            self.__log("Calculating the statistics for a Consensus model based on "+str([ml for ml in consensusMLs]))
            for foldN in range(self.nExtFolds):
                # The consensus expression of each fold weights every method by its optAcc in that fold
                if self.responseType == "Classification":
                    CLASS0 = str(self.data.domain.classVar.values[0])
                    CLASS1 = str(self.data.domain.classVar.values[1])
                    # exprTest0
                    exprTest0 = "(0"
                    for ml in consensusMLs:
                        exprTest0 += "+( "+ml+" == "+CLASS0+" )*"+str(optAcc[ml][foldN])+" "
                    exprTest0 += ")/IF0(sum([False"
                    for ml in consensusMLs:
                        exprTest0 += ", "+ml+" == "+CLASS0+" "
                    exprTest0 += "]),1)"
                    # exprTest1
                    exprTest1 = "(0"
                    for ml in consensusMLs:
                        exprTest1 += "+( "+ml+" == "+CLASS1+" )*"+str(optAcc[ml][foldN])+" "
                    exprTest1 += ")/IF0(sum([False"
                    for ml in consensusMLs:
                        exprTest1 += ", "+ml+" == "+CLASS1+" "
                    exprTest1 += "]),1)"
                    # Expression
                    expression = [exprTest0+" >= "+exprTest1+" -> "+CLASS0," -> "+CLASS1]
                else:
                    Q2sum = sum([optAcc[ml][foldN] for ml in consensusMLs])
                    expression = "(1 / "+str(Q2sum)+") * (0"
                    for ml in consensusMLs:
                        expression += " + "+str(optAcc[ml][foldN])+" * "+ml+" "
                    expression += ")"

                testData = self.data.select(DataIdxs,foldN+1) # fold 0 is for the train Bias!!
                smilesAttr = dataUtilities.getSMILESAttr(testData)
                if smilesAttr:
                    self.__log("Found SMILES attribute:"+smilesAttr)
                    testData = dataUtilities.attributeDeselectionData(testData, [smilesAttr])
                    # NOTE(review): the attributes logged here are those of trainData (left over from the
                    #   per-learner loop above), not those of testData - confirm this is intended.
                    self.__log("Selected attrs: "+str([attr.name for attr in trainData.domain[0:3]] + ["..."] + [attr.name for attr in trainData.domain[len(trainData.domain)-3:]]))

                CnTestEx.append(len(testData))
                # The consensus of this fold reuses the models already trained for this fold
                consensusClassifiers = {}
                for learnerName in consensusMLs:
                    consensusClassifiers[learnerName] = models[learnerName][foldN]

                model = AZorngConsensus.ConsensusClassifier(classifiers = consensusClassifiers, expression = expression)
                CnTrainEx.append(model.NTrainEx)
                #Test the model
                if self.responseType == "Classification":
                    Cresults.append((evalUtilities.getClassificationAccuracy(testData, model), evalUtilities.getConfMat(testData, model) ) )
                else:
                    local_exp_pred = []
                    # Predict using bulk-predict
                    predictions = model(testData)
                    # Gather predictions
                    for n,ex in enumerate(testData):
                        local_exp_pred.append((ex.getclass().value, predictions[n].value))
                    Cresults.append((evalUtilities.calcRMSE(local_exp_pred), evalUtilities.calcRsqrt(local_exp_pred) ) )
                    #Save the experimental value and correspondent predicted value
                    Cexp_pred += local_exp_pred

            res = self.createStatObj(Cresults, Cexp_pred, CnTrainEx, CnTestEx, self.responseType, self.nExtFolds, labels = hasattr(self.data.domain.classVar,"values") and list(self.data.domain.classVar.values) or None )
            res["runningTime"] = time.time() - startTime
            statistics["Consensus"] = copy.deepcopy(res)
            statistics["Consensus"]["IndividualStatistics"] = copy.deepcopy(consensusMLs)
            self.__writeResults(statistics)
        self.__log("Returned multiple ML methods statistics.")
        return statistics

    #By default return the only existing statistics!
    self.__writeResults(statistics)
    self.__log("Returned only one ML method statistics.")
    return statistics[statistics.keys()[0]]
def buildModel(trainData, MLMethod, queueType = "NoSGE", verbose = 0, logFile = None): """ Buld the method passed in MLMethod and optimize ( "IndividualStatistics" not in MLMethod) if MLMethod is a Consensus ("individualStatistics" in MLMethod) , build each and optimize first all models and after build the consensus! """ log(logFile, "Building and optimizing learner: "+MLMethod["MLMethod"]+"...") learners = {} MLMethods = {} if "IndividualStatistics" in MLMethod: #It is a consensus and will certaily not contain any #special model as it was filtered in the getUnbiasedAcc for ML in MLMethod["IndividualStatistics"]: MLMethods[ML] = copy.deepcopy(MLMethod["IndividualStatistics"][ML]) else: ML = MLMethod["MLMethod"] if MLMETHODS[ML](name = ML).specialType == 1: # If is a special model and has a built-in optimizaer log(logFile, " This is a special model") smilesAttr = dataUtilities.getSMILESAttr(trainData) if smilesAttr: log(logFile,"Found SMILES attribute:"+smilesAttr) trainData = dataUtilities.attributeSelectionData(trainData, [smilesAttr, trainData.domain.classVar.name]) optInfo, SpecialModel = MLMETHODS[ML](name = ML).optimizePars(trainData, folds = 5) return SpecialModel else: MLMethods[MLMethod["MLMethod"]] = MLMethod smilesAttr = dataUtilities.getSMILESAttr(trainData) if smilesAttr: trainData = dataUtilities.attributeDeselectionData(trainData, [smilesAttr]) # optimize all MLMethods for ML in MLMethods: log(logFile, " Optimizing MLmethod: "+ML) learners[ML] = MLMETHODS[ML](name = ML) runPath = miscUtilities.createScratchDir(baseDir = AZOC.NFS_SCRATCHDIR, desc = "competitiveWorkflow_BuildModel") trainData.save(os.path.join(runPath,"trainData.tab")) tunedPars = paramOptUtilities.getOptParam( learner = learners[ML], trainDataFile = os.path.join(runPath,"trainData.tab"), useGrid = False, verbose = verbose, queueType = queueType, runPath = runPath, nExtFolds = None, logFile = logFile, getTunedPars = True) if not learners[ML].optimized: print "WARNING: 
competitiveWorkflow: The learner "+str(learners[ML])+" was not optimized." #print " Using default parameters" print " The "+str(learners[ML])+" will not be included" #print " Returning None" print " DEBUG can be made in: "+runPath #Setting default parameters #learners[ML] = learners[ML].__class__() #return None learners.pop(ML) continue else: print "Optimized learner ",learners[ML] if trainData.domain.classVar.varType == orange.VarTypes.Discrete: MLMethods[ML]["optAcc"] = tunedPars[0] else: res = orngTest.crossValidation([learners[ML]], trainData, folds=5, strat=orange.MakeRandomIndices.StratifiedIfPossible, randomGenerator = random.randint(0, 100)) R2 = evalUtilities.R2(res)[0] MLMethods[ML]["optAcc"] = R2 miscUtilities.removeDir(runPath) #Train the model if len(learners) == 1: log(logFile, " Building the model:"+learners.keys()[0]) model = learners[learners.keys()[0]](trainData) elif len(learners) >= 1: model = buildConsensus(trainData,learners,MLMethods) else: print "ERROR: No Learners were selected!" return None return model
def getAcc(self, callBack=None, callBackWithFoldModel=None):
    """
    Evaluate the learner(s) over the external folds and return the statistics.

    For regression problems each fold is scored with RMSE and R2 (calcRsqrt);
    for classification problems with CA and the confusion matrix. The per-fold
    results are turned into a statistics object by self.createStatObj, e.g.
        {"RMSE":0.2,"Q2":0.1,"CA":0.98,"CM":[[TP, FP],[FN,TN]]}
    (the exact keys are defined by createStatObj, not here).

    If self.learner is a dict {"LearnerName": learner, ...} the result is a
    dict with one entry per learner plus, when at least two usable non-special
    learners exist, a "Consensus" entry built from the stable learners (or
    from all learners when fewer than two are stable).
    A learner that fails gets the empty statistics object returned by
    self.createStatObj() and is left out of the consensus if it did not
    produce a model for every fold.

    callBack              - optional; called with the percentage of work done
                            after each fold, a false return value aborts the
                            run and getAcc returns None.
    callBackWithFoldModel - optional; called with each fold's trained model.

    Returns None when the inputs are not OK, when callBack aborts or when no
    statistics were produced; the single statistics object when only one
    learner was evaluated; otherwise the dict of statistics.
    """
    self.__log("Starting Calculating MLStatistics")
    statistics = {}
    if not self.__areInputsOK():
        return None

    # Set the response type
    classVar = self.data.domain.classVar
    if classVar.varType == orange.VarTypes.Discrete:
        self.responseType = "Classification"
    else:
        self.responseType = "Regression"
    self.__log(" " + str(self.responseType))

    # Class labels handed to createStatObj (None when there are none)
    labels = None
    if hasattr(classVar, "values"):
        labels = list(classVar.values) or None

    def crossValidate(learner, data):
        # 5-fold CV used to estimate the accuracy that weights the consensus
        return evalUtilities.crossValidation(
            [learner], data, folds=5,
            stratified=orange.MakeRandomIndices.StratifiedIfPossible,
            random_generator=random.randint(0, 100))

    def logDomainEnds(data):
        # Log the first and the last three attribute names of the domain
        self.__log("Selected attrs: " + str(
            [attr.name for attr in data.domain[0:3]] + ["..."] +
            [attr.name for attr in data.domain[len(data.domain) - 3:]]))

    def evaluate(model, testData):
        # Return (fold result tuple, [(experimental, predicted), ...]).
        # The pair list is only filled for regression.
        if self.responseType == "Classification":
            foldResult = (evalUtilities.getClassificationAccuracy(testData, model),
                          evalUtilities.getConfMat(testData, model))
            return foldResult, []
        # Predict using bulk-predict
        predictions = model(testData)
        pairs = [(ex.getclass().value, predictions[n].value)
                 for n, ex in enumerate(testData)]
        return (evalUtilities.calcRMSE(pairs), evalUtilities.calcRsqrt(pairs)), pairs

    # Create the Train and test sets
    if self.usePreDefFolds:
        DataIdxs = self.preDefIndices
    else:
        DataIdxs = self.sampler(self.data, self.nExtFolds)
    # Folds are used only from 1 on; 0 marks the fixed train bias.
    # Sorted so that position i of every per-fold list below always refers to
    # the i-th fold label (plain dict iteration order is not guaranteed).
    foldsN = sorted(f for f in dict.fromkeys(DataIdxs) if f != 0)
    nFolds = len(foldsN)

    # Fix the Indexes based on DataIdxs
    # (0s) represents the train set ( >= 1s) represents the test set folds
    if self.useVarCtrlCV:
        # Keyed by fold label: labels start at 1, so a list of length nFolds
        # indexed by label would overflow on the last fold.
        nShifted = dict.fromkeys(foldsN, 0)
        for idx, isTest in enumerate(self.preDefIndices):
            # self.preDefIndices == 0 are to be used in TrainBias
            if not isTest and DataIdxs[idx]:
                nShifted[DataIdxs[idx]] += 1
                DataIdxs[idx] = 0
        for foldN in foldsN:
            self.__log("In fold " + str(foldN) + ", " + str(nShifted[foldN]) +
                       " examples were shifted to the train set.")

    # Vars for saving each fold's result
    optAcc = {}
    results = {}
    exp_pred = {}
    nTrainEx = {}
    nTestEx = {}
    models = {}

    # Set a dict of learners
    multiLearner = isinstance(self.learner, dict)
    MLmethods = {}
    if multiLearner:
        for ml in self.learner:
            MLmethods[ml] = self.learner[ml]
    else:
        MLmethods[self.learner.name] = self.learner

    self.__log("Calculating Statistics for MLmethods:")
    self.__log(" " + str([x for x in MLmethods]))

    # Check data in advance so that, by chance, it will not fail at the last fold!
    for foldN in foldsN:
        trainData = self.data.select(DataIdxs, foldN, negate=1)
        self.__checkTrainData(trainData)

    # Optional: order learners so that PLS is the first
    sortedML = [ml for ml in MLmethods]
    if "PLS" in sortedML:
        sortedML.remove("PLS")
        sortedML.insert(0, "PLS")

    stepsDone = 0
    nTotalSteps = len(sortedML) * self.nExtFolds
    for ml in sortedML:
        startTime = time.time()
        self.__log(" > " + str(ml) + "...")
        learner = MLmethods[ml]
        try:
            results[ml] = []
            exp_pred[ml] = []
            models[ml] = []
            nTrainEx[ml] = []
            nTestEx[ml] = []
            optAcc[ml] = []
            logTxt = ""
            for foldN in foldsN:
                if multiLearner:
                    self.paramList = None

                trainData = self.data.select(DataIdxs, foldN, negate=1)
                testData = self.data.select(DataIdxs, foldN)
                smilesAttr = dataUtilities.getSMILESAttr(trainData)
                if smilesAttr:
                    self.__log("Found SMILES attribute:" + smilesAttr)
                    if learner.specialType == 1:
                        # Special learners only get the SMILES and the class
                        trainData = dataUtilities.attributeSelectionData(
                            trainData, [smilesAttr, trainData.domain.classVar.name])
                        testData = dataUtilities.attributeSelectionData(
                            testData, [smilesAttr, testData.domain.classVar.name])
                        self.__log("Selected attrs: " +
                                   str([attr.name for attr in trainData.domain]))
                    else:
                        trainData = dataUtilities.attributeDeselectionData(
                            trainData, [smilesAttr])
                        testData = dataUtilities.attributeDeselectionData(
                            testData, [smilesAttr])
                        logDomainEnds(trainData)

                nTrainEx[ml].append(len(trainData))
                nTestEx[ml].append(len(testData))
                discreteClass = (trainData.domain.classVar.varType ==
                                 orange.VarTypes.Discrete)

                # Test if trainsets inside optimizer will respect dataSize criterias.
                # if not, don't optimize, but still train the model
                dontOptimize = False
                if self.responseType != "Classification" and (
                        len(trainData) * (1 - 1.0 / self.nInnerFolds) < 20):
                    dontOptimize = True
                else:
                    tmpDataIdxs = self.sampler(trainData, self.nInnerFolds)
                    tmpTrainData = trainData.select(tmpDataIdxs, 1, negate=1)
                    if not self.__checkTrainData(tmpTrainData, False):
                        dontOptimize = True

                SpecialModel = None
                if dontOptimize:
                    logTxt += " Fold " + str(foldN) + \
                        ": Too few compounds to optimize model hyper-parameters\n"
                    self.__log(logTxt)
                    cvRes = crossValidate(learner, trainData)
                    if discreteClass:
                        optAcc[ml].append(evalUtilities.CA(cvRes)[0])
                    else:
                        optAcc[ml].append(evalUtilities.R2(cvRes)[0])
                elif learner.specialType == 1:
                    if discreteClass:
                        optInfo, SpecialModel = learner.optimizePars(trainData, folds=5)
                        optAcc[ml].append(optInfo["Acc"])
                    else:
                        cvRes = crossValidate(learner, trainData)
                        optAcc[ml].append(evalUtilities.R2(cvRes)[0])
                else:
                    runPath = miscUtilities.createScratchDir(
                        baseDir=AZOC.NFS_SCRATCHDIR,
                        desc="AccWOptParam",
                        seed=id(trainData))
                    dataFile = os.path.join(runPath, "trainData.tab")
                    trainData.save(dataFile)
                    tunedPars = paramOptUtilities.getOptParam(
                        learner=learner,
                        trainDataFile=dataFile,
                        paramList=self.paramList,
                        useGrid=False,
                        verbose=self.verbose,
                        queueType=self.queueType,
                        runPath=runPath,
                        nExtFolds=None,
                        nFolds=self.nInnerFolds,
                        logFile=self.logFile,
                        getTunedPars=True,
                        fixedParams=self.fixedParams)
                    if not learner or not learner.optimized:
                        # The scratch dir is kept on purpose for debugging
                        self.__log(" WARNING: GETACCWOPTPARAM: The learner " +
                                   str(ml) + " was not optimized.")
                        self.__log(" It will be ignored")
                        self.__log(" DEBUG can be done in: " + runPath)
                        raise Exception("The learner " + str(ml) +
                                        " was not optimized.")
                    if discreteClass:
                        optAcc[ml].append(tunedPars[0])
                    else:
                        cvRes = crossValidate(learner, trainData)
                        optAcc[ml].append(evalUtilities.R2(cvRes)[0])
                    miscUtilities.removeDir(runPath)

                # Train the model
                if SpecialModel is not None:
                    model = SpecialModel
                else:
                    model = learner(trainData)
                models[ml].append(model)

                # Test the model; keep the experimental/predicted pairs
                foldResult, foldExpPred = evaluate(model, testData)
                results[ml].append(foldResult)
                exp_pred[ml] += foldExpPred

                if callBack:
                    stepsDone += 1
                    if not callBack((100 * stepsDone) / nTotalSteps):
                        return None
                if callBackWithFoldModel:
                    callBackWithFoldModel(model)

            res = self.createStatObj(
                results[ml], exp_pred[ml], nTrainEx[ml], nTestEx[ml],
                self.responseType, self.nExtFolds, logTxt, labels=labels)
            if self.verbose > 0:
                print("UnbiasedAccuracyGetter!Results " + ml + ":\n")
                pprint(res)
            if not res:
                raise Exception("No results available!")
            res["runningTime"] = time.time() - startTime
            statistics[ml] = copy.deepcopy(res)
            self.__writeResults(statistics)
            self.__log(" OK")
        except Exception:
            # Best effort: a failing learner must not stop the other learners.
            # KeyboardInterrupt/SystemExit are deliberately not swallowed.
            self.__log(" Learner " + str(ml) +
                       " failed to create/optimize the model!")
            error = str(sys.exc_info()[0]) + " " + \
                str(sys.exc_info()[1]) + " " + \
                str(traceback.extract_tb(sys.exc_info()[2]))
            self.__log(error)

            res = self.createStatObj()
            statistics[ml] = copy.deepcopy(res)
            self.__writeResults(statistics)

    if not statistics:
        self.__log("ERROR: No statistics to return!")
        return None

    if len(statistics) > 1:
        # We still need to build a consensus model out of the stable models
        # ONLY if there are more that one model stable!
        # When only one or no stable models, build a consensus based on all models
        # ALWAYS exclude specialType models
        consensusMLs = {}
        for modelName in statistics:
            StabilityValue = statistics[modelName]["StabilityValue"]
            if StabilityValue is not None and statistics[modelName]["stable"]:
                consensusMLs[modelName] = copy.deepcopy(statistics[modelName])

        self.__log("Found " + str(len(consensusMLs)) +
                   " stable MLmethods out of " + str(len(statistics)) +
                   " MLmethods.")

        if len(consensusMLs) <= 1:  # we need more models to build a consensus!
            consensusMLs = {}
            for modelName in statistics:
                consensusMLs[modelName] = copy.deepcopy(statistics[modelName])

        # Exclude learners that cannot take part in the consensus
        for learnerName in list(consensusMLs):
            foldModels = models.get(learnerName, [])
            if len(foldModels) != nFolds:
                # The learner failed before training every fold, so there is
                # no model (and maybe no optAcc) to use for some fold.
                consensusMLs.pop(learnerName)
                self.__log(" > Excluded " + learnerName +
                           ": no model available for every fold")
            elif foldModels[0].specialType > 0:
                consensusMLs.pop(learnerName)
                self.__log(" > Excluded special model " + learnerName)
        self.__log(" > Stable modules: " + str(consensusMLs.keys()))

        def classVoteExpr(className, foldPos):
            # optAcc-weighted share of the votes given to className
            expr = "(0"
            for ml in consensusMLs:
                expr += "+( " + ml + " == " + className + " )*" + str(
                    optAcc[ml][foldPos]) + " "
            expr += ")/IF0(sum([False"
            for ml in consensusMLs:
                expr += ", " + ml + " == " + className + " "
            expr += "]),1)"
            return expr

        if len(consensusMLs) >= 2:
            startTime = time.time()
            Cresults = []
            Cexp_pred = []
            CnTrainEx = []
            CnTestEx = []
            self.__log(
                "Calculating the statistics for a Consensus model based on " +
                str([ml for ml in consensusMLs]))
            # foldPos indexes the per-fold lists, foldN is the fold label
            for foldPos, foldN in enumerate(foldsN):
                if self.responseType == "Classification":
                    CLASS0 = str(self.data.domain.classVar.values[0])
                    CLASS1 = str(self.data.domain.classVar.values[1])
                    exprTest0 = classVoteExpr(CLASS0, foldPos)
                    exprTest1 = classVoteExpr(CLASS1, foldPos)
                    expression = [
                        exprTest0 + " >= " + exprTest1 + " -> " + CLASS0,
                        " -> " + CLASS1
                    ]
                else:
                    Q2sum = sum([optAcc[ml][foldPos] for ml in consensusMLs])
                    expression = "(1 / " + str(Q2sum) + ") * (0"
                    for ml in consensusMLs:
                        expression += " + " + str(
                            optAcc[ml][foldPos]) + " * " + ml + " "
                    expression += ")"

                testData = self.data.select(DataIdxs, foldN)
                smilesAttr = dataUtilities.getSMILESAttr(testData)
                if smilesAttr:
                    self.__log("Found SMILES attribute:" + smilesAttr)
                    testData = dataUtilities.attributeDeselectionData(
                        testData, [smilesAttr])
                    logDomainEnds(testData)

                CnTestEx.append(len(testData))
                consensusClassifiers = {}
                for learnerName in consensusMLs:
                    consensusClassifiers[learnerName] = models[learnerName][foldPos]

                model = AZorngConsensus.ConsensusClassifier(
                    classifiers=consensusClassifiers, expression=expression)
                CnTrainEx.append(model.NTrainEx)

                # Test the model
                foldResult, foldExpPred = evaluate(model, testData)
                Cresults.append(foldResult)
                Cexp_pred += foldExpPred

            res = self.createStatObj(
                Cresults, Cexp_pred, CnTrainEx, CnTestEx,
                self.responseType, self.nExtFolds, labels=labels)
            res["runningTime"] = time.time() - startTime
            statistics["Consensus"] = copy.deepcopy(res)
            statistics["Consensus"]["IndividualStatistics"] = copy.deepcopy(
                consensusMLs)
            self.__writeResults(statistics)
        self.__log("Returned multiple ML methods statistics.")
        return statistics

    # By default return the only existing statistics!
    self.__writeResults(statistics)
    self.__log("Returned only one ML method statistics.")
    return statistics[statistics.keys()[0]]