def test_FeedLearnersReg(self):
    """Test the creation of Consensus feeding Learners for regression"""
    # The Learners can be individually customized before passing them to the Consensus
    learners = [AZorngCvSVM.CvSVMLearner(), AZorngCvANN.CvANNLearner(), AZorngRF.RFLearner()]
    # Passing now the learnersObj instead
    learner = AZorngConsensus.ConsensusLearner(learnersObj = learners)
    classifier = learner(self.DataReg)
    predictions = []
    for ex in self.DataReg:
        predictions.append(classifier(ex))
    scratchdir = miscUtilities.createScratchDir(desc="ConsensusSaveLoadTest")
    classifier.write(os.path.join(scratchdir,"./CM.model"))

    predictionsL = []
    Loaded = AZorngConsensus.Consensusread(os.path.join(scratchdir,"./CM.model"))
    for ex in self.DataReg:
        predictionsL.append(Loaded(ex))
    self.assertEqual([round(pred.value,4) for pred in predictions],
                     [round(pred.value,4) for pred in predictionsL],
                     "Loaded model predictions differ: Pred. 1 (saved/loaded):" + str(predictions[0]) + " / " + str(predictionsL[0]))
    self.assertEqual(len(Loaded.domain), len(self.DataReg.domain))
    self.assertEqual(len(Loaded.imputeData), len(Loaded.domain))
    self.assertEqual(len(Loaded.basicStat), len(Loaded.domain))
    self.assertEqual(Loaded.NTrainEx, len(self.DataReg))
    miscUtilities.removeDir(scratchdir)
def test_FeedLearnersClass(self):
    """Test the creation of Consensus feeding Learners for classification"""
    # The Learners can be individually customized before passing them to the Consensus
    learners = [AZorngCvSVM.CvSVMLearner(), AZorngCvANN.CvANNLearner(), AZorngRF.RFLearner()]
    # Passing now the learnersObj instead
    learner = AZorngConsensus.ConsensusLearner(learnersObj = learners)
    classifier = learner(self.irisData)
    predictions = []
    for ex in self.irisData:
        predictions.append(classifier(ex))
    scratchdir = miscUtilities.createScratchDir(desc="ConsensusSaveLoadTest")
    classifier.write(os.path.join(scratchdir,"./CM.model"))

    predictionsL = []
    Loaded = AZorngConsensus.Consensusread(os.path.join(scratchdir,"./CM.model"))
    for ex in self.irisData:
        predictionsL.append(Loaded(ex))
    self.assertEqual(predictions, predictionsL)
    self.assertEqual(len(Loaded.domain), len(self.irisData.domain))
    self.assertEqual(len(Loaded.imputeData), len(Loaded.domain))
    self.assertEqual(len(Loaded.basicStat), len(Loaded.domain))
    self.assertEqual(Loaded.NTrainEx, len(self.irisData))
    miscUtilities.removeDir(scratchdir)
def test_FeedClassifiersReg(self):
    """Test the feeding of regression classifiers"""
    #DataSet = dataUtilities.DataTable("/home/palmeida/dev/OpenAZOTesteInstall/tests/source/data/linearTrain.tab")
    DataSet = self.DataReg
    learners = [AZorngCvSVM.CvSVMLearner(), AZorngCvANN.CvANNLearner(), AZorngRF.RFLearner()]
    classifiers = [l(DataSet) for l in learners]
    classifier = AZorngConsensus.ConsensusClassifier(classifiers = classifiers)
    predictions = []
    for ex in DataSet:
        predictions.append(classifier(ex))
    scratchdir = miscUtilities.createScratchDir(desc="ConsensusSaveLoadTest")
    classifier.write(os.path.join(scratchdir,"./CM.model"))

    predictionsL = []
    Loaded = AZorngConsensus.Consensusread(os.path.join(scratchdir,"./CM.model"))
    for ex in DataSet:
        predictionsL.append(Loaded(ex))
    self.assertEqual([round(pred.value,4) for pred in predictions],
                     [round(pred.value,4) for pred in predictionsL],
                     "Loaded model predictions differ: Pred. 1 (saved/loaded):" + str(predictions[0]) + " / " + str(predictionsL[0]))
    self.assertEqual(len(Loaded.domain), len(DataSet.domain))
    self.assertEqual(len(Loaded.imputeData), len(Loaded.domain))
    self.assertEqual(len(Loaded.basicStat), len(Loaded.domain))
    self.assertEqual(Loaded.NTrainEx, len(DataSet))
    miscUtilities.removeDir(scratchdir)
def TopVarImportanceTest(data, expectNone=False):
    resA = []
    resB = []
    learner = AZorngCvSVM.CvSVMLearner(
        gamma=1.0, svm_type=103, C=1, coef0=0, degree=3, epsR=0.001,
        kernel_type=2, nu=0.5, p=0.1, probability=0, shrinking=1)
    CvSVM = learner(data)
    for ex in data:
        resA.append(CvSVM.getTopImportantVars(ex, 1))

    scratchdir = miscUtilities.createScratchDir(desc="TopVarImportanceTest")
    modelPath = os.path.join(scratchdir, "CvSVNModel")
    CvSVM.write(modelPath)
    LoadedCvSVM = AZorngCvSVM.CvSVMread(modelPath)
    miscUtilities.removeDir(scratchdir)
    for ex in data:
        resB.append(LoadedCvSVM.getTopImportantVars(ex, 1))
    if expectNone:
        return resA == resB == [None] * len(data)
    else:
        return resA == resB and None not in resA and resA.count(resA[0]) != len(resA)
def test_saveloadReg(self):
    """Test the save/load for a regression model - Using average of N classifiers"""
    learnersNames = ["CvANN","CvSVM","RF"]
    learner = AZorngConsensus.ConsensusLearner(learnersNames = learnersNames)
    classifier = learner(self.DataReg)
    predictions = []
    for ex in self.DataReg:
        predictions.append(classifier(ex))
    scratchdir = miscUtilities.createScratchDir(desc="ConsensusSaveLoadTest")
    classifier.write(os.path.join(scratchdir,"./CM.model"))

    predictionsL = []
    Loaded = AZorngConsensus.Consensusread(os.path.join(scratchdir,"./CM.model"))
    for ex in self.DataReg:
        predictionsL.append(Loaded(ex))
    self.assertEqual([round(pred.value,4) for pred in predictions],
                     [round(pred.value,4) for pred in predictionsL],
                     "Loaded model predictions differ: Pred. 1 (saved/loaded):" + str(predictions[0]) + " / " + str(predictionsL[0]))
    self.assertEqual(len(Loaded.domain), len(self.DataReg.domain))
    self.assertEqual(len(Loaded.imputeData), len(Loaded.domain))
    self.assertEqual(len(Loaded.basicStat), len(Loaded.domain))
    self.assertEqual(Loaded.NTrainEx, len(self.DataReg) - 66)
    miscUtilities.removeDir(scratchdir)
def test_CanPersistClassificationModelMajority(self):
    """Test the save/load for a classification model - Using Majority"""
    # Arrange
    learners = self.createTestLearners()
    learner = AZorngConsensus.ConsensusLearner(learners = learners)
    classifier = learner(self.getClassificationTrainingData())

    # Act
    predictions = []
    for ex in self.irisData:
        predictions.append(classifier(ex))
    scratchdir = miscUtilities.createScratchDir(desc="ConsensusSaveLoadTest")
    classifier.write(os.path.join(scratchdir,"./CM.model"))

    # Assert
    predictionsL = []
    Loaded = AZorngConsensus.Consensusread(os.path.join(scratchdir,"./CM.model"))
    self.assertEqual(len(Loaded.domain), len(self.irisData.domain))
    self.assertEqual(len(Loaded.imputeData), len(Loaded.domain))
    self.assertEqual(len(Loaded.basicStat), len(Loaded.domain))
    self.assertEqual(Loaded.NTrainEx, 0.8*len(self.irisData))
    for ex in self.irisData:
        predictionsL.append(Loaded(ex))
    self.assertEqual(predictions, predictionsL)
    miscUtilities.removeDir(scratchdir)
def test_CanPersistClassificationModelProbabilities(self):
    """Test the save/load for a classification model - Using probabilities average"""
    # Arrange
    learners = [AZorngRF.RFLearner(), AZorngCvANN.CvANNLearner()]
    learner = AZorngConsensus.ConsensusLearner(learners = learners)
    classifier = learner(self.irisData)

    # Act
    predictions = []
    for ex in self.irisData:
        predictions.append(classifier(ex))
    scratchdir = miscUtilities.createScratchDir(desc="ConsensusSaveLoadTest")
    classifier.write(os.path.join(scratchdir,"./CM.model"))

    # Assert
    predictionsL = []
    Loaded = AZorngConsensus.Consensusread(os.path.join(scratchdir,"./CM.model"))
    for ex in self.irisData:
        predictionsL.append(Loaded(ex))
    self.assertEqual(predictions, predictionsL)
    self.assertEqual(len(Loaded.domain), len(self.irisData.domain))
    self.assertEqual(len(Loaded.imputeData), len(Loaded.domain))
    self.assertEqual(len(Loaded.basicStat), len(Loaded.domain))
    self.assertEqual(Loaded.NTrainEx, len(self.irisData) - int(0.2 * len(self.irisData)))
    miscUtilities.removeDir(scratchdir)
def test_CanPersistRegressionModelUsingClassifiers(self):
    """Test the save/load for a regression model - Using average of N classifiers"""
    # Arrange
    learners = [AZorngRF.RFLearner(), AZorngCvSVM.CvSVMLearner(), AZorngCvANN.CvANNLearner()]
    learner = AZorngConsensus.ConsensusLearner(learners = learners)
    classifier = learner(self.DataReg)

    # Act
    predictions = []
    for ex in self.DataReg:
        predictions.append(classifier(ex))
    scratchdir = miscUtilities.createScratchDir(desc="ConsensusSaveLoadTest")
    classifier.write(os.path.join(scratchdir,"./CM.model"))

    # Assert
    predictionsL = []
    Loaded = AZorngConsensus.Consensusread(os.path.join(scratchdir,"./CM.model"))
    for ex in self.DataReg:
        predictionsL.append(Loaded(ex))
    self.assertEqual([round(pred.value,4) for pred in predictions],
                     [round(pred.value,4) for pred in predictionsL],
                     "Loaded model predictions differ: Pred. 1 (saved/loaded):" + str(predictions[0]) + " / " + str(predictionsL[0]))
    self.assertEqual(len(Loaded.domain), len(self.DataReg.domain))
    self.assertEqual(len(Loaded.imputeData), len(Loaded.domain))
    self.assertEqual(len(Loaded.basicStat), len(Loaded.domain))
    self.assertEqual(Loaded.NTrainEx, len(self.DataReg)*0.8)
    miscUtilities.removeDir(scratchdir)
def test_FeedClassifiersClass(self):
    """Test the creation of Consensus feeding Classifiers"""
    learners = [AZorngCvSVM.CvSVMLearner(), AZorngCvANN.CvANNLearner(), AZorngRF.RFLearner()]
    classifiers = [l(self.irisData) for l in learners]
    classifier = AZorngConsensus.ConsensusClassifier(classifiers = classifiers)
    predictions = []
    for ex in self.irisData:
        predictions.append(classifier(ex))
    scratchdir = miscUtilities.createScratchDir(desc="ConsensusSaveLoadTest")
    classifier.write(os.path.join(scratchdir,"./CM.model"))

    predictionsL = []
    Loaded = AZorngConsensus.Consensusread(os.path.join(scratchdir,"./CM.model"))
    for ex in self.irisData:
        predictionsL.append(Loaded(ex))
    self.assertEqual(predictions, predictionsL)
    self.assertEqual(len(Loaded.domain), len(self.irisData.domain))
    self.assertEqual(len(Loaded.imputeData), len(Loaded.domain))
    self.assertEqual(len(Loaded.basicStat), len(Loaded.domain))
    self.assertEqual(Loaded.NTrainEx, len(self.irisData))
    miscUtilities.removeDir(scratchdir)
def test_saveloadClass2(self):
    """Test the save/load for a classification model - Using probabilities average"""
    learnersNames = ["RF","CvANN"]
    learner = AZorngConsensus.ConsensusLearner(learnersNames = learnersNames)
    classifier = learner(self.irisData)
    predictions = []
    for ex in self.irisData:
        predictions.append(classifier(ex))
    scratchdir = miscUtilities.createScratchDir(desc="ConsensusSaveLoadTest")
    classifier.write(os.path.join(scratchdir,"./CM.model"))

    predictionsL = []
    Loaded = AZorngConsensus.Consensusread(os.path.join(scratchdir,"./CM.model"))
    for ex in self.irisData:
        predictionsL.append(Loaded(ex))
    self.assertEqual(predictions, predictionsL)
    self.assertEqual(len(Loaded.domain), len(self.irisData.domain))
    self.assertEqual(len(Loaded.imputeData), len(Loaded.domain))
    self.assertEqual(len(Loaded.basicStat), len(Loaded.domain))
    self.assertEqual(Loaded.NTrainEx, len(self.irisData))
    miscUtilities.removeDir(scratchdir)
def testSVM_MPI_3(self):
    ###################################################################
    #   Test other way of setting appspack
    ###################################################################
    # Classification accuracy:
    ExpectedCA = 0.847  #orange1: 0.837619047619

    optimizer = paramOptUtilities.Appspack()

    learner = AZorngCvSVM.CvSVMLearner()
    learnerName = "CvSVMLearner"

    # Create an interface for setting optimizer parameters
    pars = AZLearnersParamsConfig.API(learnerName)
    # Set all parameters to not be optimized
    pars.setOptimizeAllParameters(False)

    parameterList = ["C", "gamma"]
    # Set the parameters in parameterList to be optimized
    for parameter in parameterList:
        pars.setParameter(parameter, "optimize", True)
    # Change the range
    pars.setParameter("C", "range", miscUtilities.power2Range(-5,2,1))

    trainFile = self.discTrainDataPath

    # Create a directory for running the appspack (if not defined it will use the present working directory)
    runPath = miscUtilities.createScratchDir(desc="ParamOptTest_SVM_MPI_3")
    evalM = "AZutilities.evalUtilities.CA"
    fMin = False

    # [<minimum of objective function found>, <optimized parameters>]
    tunedPars = optimizer(learner=learner,
                          dataSet=trainFile,
                          evaluateMethod = evalM,
                          findMin=fMin,
                          runPath = runPath,
                          useParameters = pars.getParametersDict(),
                          verbose = 0,
                          useStd = False,
                          advancedMPIoptions = "-v -np 4",
                          machinefile = ["localhost:2","localhost:2"])

    print "Returned: ", tunedPars
    print "====================== optimization Done ==========================="
    print "Learner optimized flag = ", learner.optimized
    print "Tuned parameters = ", tunedPars[1]
    print "Best optimization result = ", tunedPars[0]
    print "check the file intRes.txt to see the intermediate results of optimizer!"

    # Check that the learner was optimized
    self.assertEqual(learner.optimized, True)

    # Check if the MPI version was used
    self.assertEqual(optimizer.usedMPI, True)

    self.assertEqual(round(tunedPars[0],3), round(ExpectedCA,3))
    self.assert_(len(dataUtilities.DataTable(os.path.join(runPath,"optimizationLog.txt"))) >= 12)  # (orig 14) Must be > 2
    #print runPath
    miscUtilities.removeDir(runPath)
def test_RFRegression(self):
    """RF - Test of optimizer with continuous class data"""
    # Create the appspack instance
    opt = paramOptUtilities.Appspack()
    # Learner to be optimized
    learner = AZorngRF.RFLearner()
    # Dataset to use in the parameters optimization (continuous class in this example)
    dataSet = self.contTrainDataPath
    # Define the objective function. This requires:
    #    defining the extreme to find (the min or max): findMin=True or findMin=False
    fMin = True
    #    defining the method for evaluation (must be a method that accepts as input an orngTest.ExperimentResults):
    #       evaluateMethod="AZutilities.evalUtilities.R2"
    evalM = "AZutilities.evalUtilities.RMSE"

    # Create an interface for setting optimizer parameters
    pars = AZLearnersParamsConfig.API("RFLearner")
    # Exclude NumThreads from the optimization
    pars.setParameter("NumThreads", "optimize", False)
    # Change the default
    pars.setParameter("NumThreads", "default", "1")

    # Create a directory for running the appspack (if not defined it will use the present working directory)
    runPath = miscUtilities.createScratchDir(desc="ParamOptTest")

    # Run the appspack which will configure the input learner and additionally return
    # [<minimum of objective function found>, <optimized parameters>]
    tunedPars = opt(learner=learner,
                    dataSet=dataSet,
                    evaluateMethod = evalM,
                    findMin=fMin,
                    runPath = runPath,
                    useStd = False,
                    useParameters = pars.getParametersDict(),
                    verbose = 0)

    print "Returned: ", tunedPars
    print "====================== optimization Done ==========================="
    print "Learner optimized flag = ", learner.optimized
    print "Tuned parameters = ", tunedPars[1]
    print "Best optimization result = ", tunedPars[0]
    print "check the file intRes.txt to see the intermediate results of optimizer!"

    self.assertEqual(opt.usedMPI, False)
    self.assertEqual(learner.optimized, True)
    self.assertEqual(round(tunedPars[0],2), round(3.1499999999999999,2))

    # The learner now has its optimized parameters already set, so we can make a classifier out of it
    classifier = learner(self.contTrain)
    RMSE = evalUtilities.getRMSE(self.contTest, classifier)
    self.assertEqual(round(RMSE,2), round(2.02,2))  #Ver 0.3

    # Check that the best result was not obtained with NumThreads different from 1, since that way
    # we could get different results among runs
    self.assertEqual(int(tunedPars[1]["NumThreads"]), 1)

    miscUtilities.removeDir(runPath)
def arrayJob(jobName = "AZOarray", jobNumber = 1, jobParams = [], jobParamFile = "Params.pkl", jobQueue = "quick.q", jobScript = "", memSize = "150M"):
    runPath = miscUtilities.createScratchDir(desc = "optQsub"+jobName, baseDir = AZOC.NFS_SCRATCHDIR)
    cwd = os.getcwd()
    os.chdir(runPath)

    paramFile = open(jobParamFile,"w")
    cPickle.dump(jobParams, paramFile)
    paramFile.close()

    jobFile = open(jobName + ".py","w")
    jobFile.write(jobScript)
    jobFile.close()

    cmd = "echo python " + os.path.join(runPath, str(jobName) + ".py") + \
          " | qsub -cwd -V -q " + str(jobQueue) + \
          " -p -800 -t 1-" + str(jobNumber) + \
          " -N " + str(jobName) + \
          " -S /bin/sh -sync yes" + \
          AZOC.SGE_QSUB_ARCH_OPTION_CURRENT + \
          " -l mf=" + str(memSize)
    # specify shell /bin/sh so not to get warning: no access to tty in output file.
    (status, output) = commands.getstatusoutput(cmd)

    # Check exit status of all our jobs
    if status != 0:
        print jobName + " failed! Code = " + str(status)
        print output
        raise ValueError
    for line in output.split("\n"):
        if not "exit code 0" in line:
            if not "Your job-array" in line:
                print jobName + " failed! " + line
                raise ValueError

    # Check if error files exist that are not empty.
    for part in sorted(glob(os.path.join(runPath, jobName+".e*"))):
        if os.path.getsize(part) != 0:
            print jobName + " failed! file " + str(part)
            raise ValueError

    # Build result list from pickle objects
    resList = []
    for part in sorted(glob(os.path.join(runPath, jobName+".o*"))):
        file = open(part,"r")
        resList.append(cPickle.load(file))
        file.close()

    os.chdir(cwd)
    miscUtilities.removeDir(runPath)
    return resList
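# Minimal usage sketch for arrayJob (the script body and parameter names below are illustrative
# assumptions, not the real callers). Each SGE array task is expected to read the pickled
# parameters from its working directory, do its share of the work, and print a pickled result to
# stdout, which arrayJob then collects from the <jobName>.o* files:
#
#   jobScript = (
#       "import cPickle, os\n"
#       "params = cPickle.load(open('Params.pkl'))\n"
#       "idx = int(os.environ['SGE_TASK_ID']) - 1\n"
#       "print cPickle.dumps(params[idx])\n"
#   )
#   results = arrayJob(jobName = "AZOarray", jobNumber = len(myParams), jobParams = myParams,
#                      jobQueue = "quick.q", jobScript = jobScript, memSize = "150M")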
def buildModel(trainData, MLMethod, queueType = "NoSGE", verbose = 0, logFile = None):
    """
        Build the method passed in MLMethod and optimize it ("IndividualStatistics" not in MLMethod).
        If MLMethod is a Consensus ("IndividualStatistics" in MLMethod), build and optimize all
        individual models first and then build the consensus!
    """
    log(logFile, "Building and optimizing learner: "+MLMethod["MLMethod"]+"...")
    learners = {}
    MLMethods = {}
    if "IndividualStatistics" in MLMethod:        # It is a consensus
        for ML in MLMethod["IndividualStatistics"]:
            MLMethods[ML] = MLMethod["IndividualStatistics"][ML]
    else:
        MLMethods[MLMethod["MLMethod"]] = MLMethod

    # Optimize all MLMethods
    for ML in MLMethods:
        log(logFile, " Optimizing MLmethod: "+ML)
        learners[ML] = MLMETHODS[ML](name = ML)

        runPath = miscUtilities.createScratchDir(baseDir = AZOC.NFS_SCRATCHDIR, desc = "AutoQSAR")
        trainData.save(os.path.join(runPath,"trainData.tab"))

        paramOptUtilities.getOptParam(
            learner = learners[ML],
            trainDataFile = os.path.join(runPath,"trainData.tab"),
            useGrid = False,
            verbose = verbose,
            queueType = queueType,
            runPath = runPath,
            nExtFolds = None)

        if not learners[ML].optimized:
            print "ERROR: AutoQSAR: The learner was not optimized."
            return None
        else:
            print "Optimized learner ", learners[ML]
        miscUtilities.removeDir(runPath)

    # Train the model
    if len(learners) == 1:
        log(logFile, " Building the optimized learner:"+learners.keys()[0])
        model = learners[learners.keys()[0]](trainData)
    elif len(learners) >= 1:
        model = buildConsensus(trainData, learners, MLMethods)
    else:
        print "ERROR: No Learners were selected!"
        return None

    return model
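# Minimal usage sketch for buildModel (illustrative only; the MLMethod dict normally comes from
# the statistics produced upstream in AutoQSAR, and the exact contents of each entry are
# assumptions here). A single method carries just its name, while a consensus additionally maps
# each member learner to its individual statistics:
#
#   singleMethod = {"MLMethod": "RF"}
#   model = buildModel(trainData, singleMethod, queueType = "NoSGE", verbose = 0)
#
#   consensusMethod = {"MLMethod": "Consensus",
#                      "IndividualStatistics": {"RF": {...}, "CvSVM": {...}, "CvANN": {...}}}
#   model = buildModel(trainData, consensusMethod)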
def test_PLS_Classification(self):
    """PLS - Test of optimizer with discrete class data"""
    expectedAcc = [0.57999999999999996, 0.58999999999999997]  #Ver 0.3 - Artifact: The second value can be expected on other Systems
    # Create the appspack instance
    opt = paramOptUtilities.Appspack()
    # Learner to be optimized
    learner = AZorngPLS.PLSLearner()
    # Dataset to use in the parameters optimization (Discrete class in this example)
    dataSet = self.discTrainDataPath
    # Define the objective function. This requires:
    #    defining the extreme to find (the min or max): findMin=True or findMin=False
    fMin = False
    #    defining the method for evaluation (must be a method that accepts as input an orngTest.ExperimentResults):
    #       evaluateMethod="AZutilities.evalUtilities.CA"
    evalM = "AZutilities.evalUtilities.CA"

    # Create a directory for running the appspack (if not defined it will use the present working directory)
    runPath = miscUtilities.createScratchDir(desc="ParamOptTest")

    # Run the appspack which will configure the input learner and additionally return
    # [<minimum of objective function found>, <optimized parameters>]
    tunedPars = opt(learner=learner,
                    dataSet=dataSet,
                    evaluateMethod = evalM,
                    findMin=fMin,
                    runPath = runPath,
                    useStd = False,
                    verbose = 0)

    print "Returned: ", tunedPars
    print "====================== optimization Done ==========================="
    print "Learner optimized flag = ", learner.optimized
    print "Tuned parameters = ", tunedPars[1]
    print "Best optimization result = ", tunedPars[0]
    print "check the file intRes.txt to see the intermediate results of optimizer!"

    self.assertEqual(opt.usedMPI, False)
    self.assertEqual(learner.optimized, True)
    self.assert_(round(tunedPars[0],2) in [round(x,2) for x in expectedAcc])  #Ver 0.3

    # The learner now has its optimized parameters already set, so we can make a classifier out of it
    classifier = learner(self.discTrain)
    CA = evalUtilities.getClassificationAccuracy(self.discTest, classifier)
    expectedCA = [0.58999999999999997, 2, 0.57999999999999996]  # Artifact: Second value expected in UBUNTU 10.10
    self.assert_(round(CA,2) in [round(ca,2) for ca in expectedCA])  # Ver 0.3

    miscUtilities.removeDir(runPath)
def TopVarImportanceTest(data, expectNone = False):
    resA = []
    resB = []
    CvANN = AZorngCvANN.CvANNLearner(data, stopUPs=0)
    for ex in data:
        resA.append(CvANN.getTopImportantVars(ex, 1))

    scratchdir = miscUtilities.createScratchDir(desc="TopVarImportanceTest")
    modelPath = os.path.join(scratchdir, "CvANNModel")
    CvANN.write(modelPath)
    LoadedCvANN = AZorngCvANN.CvANNread(modelPath)
    miscUtilities.removeDir(scratchdir)
    for ex in data:
        resB.append(LoadedCvANN.getTopImportantVars(ex, 1))
    if expectNone:
        return resA == resB == [None]*len(data)
    else:
        return resA == resB and None not in resA and resA.count(resA[0]) != len(resA)
def test_PLS_Regression(self):
    """PLS - Test of optimizer with continuous class data"""
    # Create the appspack instance
    opt = paramOptUtilities.Appspack()
    # Learner to be optimized
    learner = AZorngPLS.PLSLearner()
    # Dataset to use in the parameters optimization
    dataSet = self.contTrainDataPath
    # Define the objective function. This requires:
    #    defining the extreme to find (the min or max): findMin=True or findMin=False
    fMin = True
    #    defining the method for evaluation (must be a method that accepts as input an orngTest.ExperimentResults):
    #       evaluateMethod="AZutilities.evalUtilities.R2"
    evalM = "AZutilities.evalUtilities.RMSE"

    # Create a directory for running the appspack (if not defined it will use the present working directory)
    runPath = miscUtilities.createScratchDir(desc="ParamOptTest")

    # Run the appspack which will configure the input learner and additionally return
    # [<minimum of objective function found>, <optimized parameters>]
    tunedPars = opt(learner=learner,
                    dataSet=dataSet,
                    evaluateMethod = evalM,
                    findMin=fMin,
                    runPath = runPath,
                    useStd = False,
                    verbose = 0)

    print "Returned: ", tunedPars
    print "====================== optimization Done ==========================="
    print "Learner optimized flag = ", learner.optimized
    print "Tuned parameters = ", tunedPars[1]
    print "Best optimization result = ", tunedPars[0]
    print "check the file intRes.txt to see the intermediate results of optimizer!"

    self.assertEqual(opt.usedMPI, False)
    self.assertEqual(learner.optimized, True)
    self.assertEqual(round(tunedPars[0],2), round(0.858060000000,2))

    # The learner now has its optimized parameters already set, so we can make a classifier out of it
    classifier = learner(self.contTrain)
    RMSE = evalUtilities.getRMSE(self.contTest, classifier)
    self.assertEqual(round(RMSE,2), round(0.656979500000,2))

    miscUtilities.removeDir(runPath)
def test_saveloadReg(self):
    """Test the save/load for a regression model - Using average of N classifiers"""
    learnersNames = ["CvANN","CvSVM","RF"]
    learner = AZorngConsensus.ConsensusLearner(learnersNames = learnersNames)
    classifier = learner(self.DataSol)
    predictions = []
    for ex in self.DataSol:
        predictions.append(classifier(ex))
    scratchdir = miscUtilities.createScratchDir(desc="ConsensusSaveLoadTest")
    classifier.write(os.path.join(scratchdir,"./CM.model"))

    predictionsL = []
    Loaded = AZorngConsensus.Consensusread(os.path.join(scratchdir,"./CM.model"))
    for ex in self.DataSol:
        predictionsL.append(Loaded(ex))
    self.assertEqual(predictions, predictionsL)
    miscUtilities.removeDir(scratchdir)
def testTopRankedNoGrid(self):
    """
    Test the TopRanked method in getBestModel with a test set and without grid computing.
    """
    # Fix input arg
    resultsDir = miscUtilities.createScratchDir(desc="GetBestModelTest")
    descList = [5, 10]
    grid = False
    batchQueue = False
    optParam = True

    # Method to test
    getBestModel.getBestModelTopRank(self.trainPath, self.testPath, resultsDir, descList, grid, optParam, batchQueue)

    # Assert the existence of a results file
    resultsFile = resultsDir+"/batchResults.tex"
    self.assert_(os.path.exists(resultsFile), "No results file created with getBestModelTopRank")
    resultsFile = resultsDir+"/batchResults.pdf"
    self.assert_(os.path.exists(resultsFile), "No pdf file created with getBestModelTopRank")

    miscUtilities.removeDir(resultsDir)
def test_SaveLoadCustomRegressionExpression(self):
    """ Test save/load custom expression using average N regression with object map """
    # Arrange
    learners = {'firstLearner': AZorngCvSVM.CvSVMLearner(),
                'secondLearner': AZorngCvANN.CvANNLearner(),
                'thirdLearner': AZorngRF.RFLearner()}

    # Construct expression learner/classifier
    regressionExpression = "(firstLearner + secondLearner + thirdLearner) / 3"
    expressionLearner = AZorngConsensus.ConsensusLearner(learners = learners, expression = regressionExpression)
    expressionClassifier = expressionLearner(self.DataReg)

    # Collect the predictions of the original classifier
    result = []
    for ex in self.DataReg:
        result.append(expressionClassifier(ex))

    # Act
    scratchdir = miscUtilities.createScratchDir(desc="ConsensusSaveLoadTest")
    expressionClassifier.write(os.path.join(scratchdir,"./CM.model"))

    resultLoaded = []
    loaded = AZorngConsensus.Consensusread(os.path.join(scratchdir,"./CM.model"))
    self.assertNotEqual(loaded, None)
    for ex in self.DataReg:
        resultLoaded.append(loaded(ex))

    # Assert
    for index, item in enumerate(result):
        if not float_compare(result[index].value, resultLoaded[index].value):
            print "Not equal on index: ", index
        self.assertEqual(float_compare(result[index].value, resultLoaded[index].value), True)

    self.assertEqual(len(loaded.domain), len(self.DataReg.domain))
    self.assertEqual(len(loaded.imputeData), len(loaded.domain))
    self.assertEqual(len(loaded.basicStat), len(loaded.domain))
    self.assertEqual(loaded.NTrainEx, len(self.DataReg))
    miscUtilities.removeDir(scratchdir)
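# float_compare, used above and below, is assumed to be a small tolerance-based helper defined
# elsewhere in this test module. A minimal sketch of what it is expected to do (the actual
# implementation and tolerance may differ):
#
#   def float_compare(a, b, tolerance = 1e-4):
#       return abs(a - b) < tolerance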
def testDescSetNoGrid(self):
    """
    Test the descSet method in getBestModel without a test set and without grid computing.
    """
    # Fix input arg
    resultsDir = miscUtilities.createScratchDir(desc="GetBestModelTest")
    descList = ["AZ_descriptors"]  #, "SELMA"]
    grid = False
    batchQueue = False
    optParam = False

    # Method to test
    getBestModel.getBestModelDescSet(self.trainPath2, "noTest", resultsDir, descList, grid, optParam, batchQueue)

    # Assert the existence of a results file
    resultsFile = resultsDir+"/batchResults.tex"
    self.assert_(os.path.exists(resultsFile), "No results file created with getBestModelDescSet")
    resultsFile = resultsDir+"/batchResults.pdf"
    self.assert_(os.path.exists(resultsFile), "No pdf file created with getBestModelDescSet")

    miscUtilities.removeDir(resultsDir)
def test_SaveLoadCustomLogicalExpression(self):
    """ Test save/load functionality with a custom logical expression """
    # Arrange
    # Construct expression learner/classifier
    learners = {'firstLearner': AZorngCvSVM.CvSVMLearner(),
                'secondLearner': AZorngCvANN.CvANNLearner(),
                'thirdLearner': AZorngRF.RFLearner()}
    discreteExpression = ["firstLearner == Iris-setosa -> Iris-setosa", "-> Iris-virginica"]
    discreteLearner = AZorngConsensus.ConsensusLearner(learners = learners, expression = discreteExpression)
    discreteClassifier = discreteLearner(self.irisData)

    result = []
    for ex in self.irisData:
        result.append(discreteClassifier(ex))

    # Act
    scratchdir = miscUtilities.createScratchDir(desc="ConsensusSaveLoadTest")
    discreteClassifier.write(os.path.join(scratchdir,"./CM.model"))

    resultLoaded = []
    loaded = AZorngConsensus.Consensusread(os.path.join(scratchdir,"./CM.model"))
    self.assertNotEqual(loaded, None)
    for ex in self.irisData:
        resultLoaded.append(loaded(ex))

    # Assert
    for index, item in enumerate(result):
        if not result[index].value == resultLoaded[index].value:
            print "Not equal on index: ", index
        self.assertEqual(result[index].value, resultLoaded[index].value)

    self.assertEqual(len(loaded.domain), len(self.irisData.domain))
    self.assertEqual(len(loaded.imputeData), len(loaded.domain))
    self.assertEqual(len(loaded.basicStat), len(loaded.domain))
    self.assertEqual(loaded.NTrainEx, len(self.irisData))
    miscUtilities.removeDir(scratchdir)
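# Reading of the discrete consensus expression used above, based on how these tests (and the
# consensus-building code further down) use it; this is not a full description of the expression
# grammar: each list entry is a rule of the form "<condition> -> <class value>", rules are
# evaluated in order, and an entry with no condition ("-> Iris-virginica") acts as the default
# when no earlier rule matched.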
def test_FeedClassifiersReg(self):
    """Test the feeding of regression classifiers"""
    DataSet = dataUtilities.DataTable(os.path.join(AZOC.AZORANGEHOME, "tests/source/data/dummy.tab"))
    #DataSet = self.DataSol
    learners = [AZorngCvSVM.CvSVMLearner(), AZorngCvANN.CvANNLearner(), AZorngRF.RFLearner()]
    classifiers = [l(DataSet) for l in learners]
    classifier = AZorngConsensus.ConsensusClassifier(classifiers = classifiers)
    predictions = []
    for ex in DataSet:
        predictions.append(classifier(ex))
    scratchdir = miscUtilities.createScratchDir(desc="ConsensusSaveLoadTest")
    classifier.write(os.path.join(scratchdir,"./CM.model"))

    predictionsL = []
    Loaded = AZorngConsensus.Consensusread(os.path.join(scratchdir,"./CM.model"))
    for ex in DataSet:
        predictionsL.append(Loaded(ex))
    self.assertEqual(predictions, predictionsL)
    miscUtilities.removeDir(scratchdir)
def test_FeedLearnersReg(self):
    """Test the creation of Consensus feeding Learners for regression"""
    # The Learners can be individually customized before passing them to the Consensus
    learners = [AZorngCvSVM.CvSVMLearner(), AZorngCvANN.CvANNLearner(), AZorngRF.RFLearner()]
    # Passing now the learnersObj instead
    learner = AZorngConsensus.ConsensusLearner(learnersObj = learners)
    classifier = learner(self.DataSol)
    predictions = []
    for ex in self.DataSol:
        predictions.append(classifier(ex))
    scratchdir = miscUtilities.createScratchDir(desc="ConsensusSaveLoadTest")
    classifier.write(os.path.join(scratchdir,"./CM.model"))

    predictionsL = []
    Loaded = AZorngConsensus.Consensusread(os.path.join(scratchdir,"./CM.model"))
    for ex in self.DataSol:
        predictionsL.append(Loaded(ex))
    self.assertEqual(predictions, predictionsL)
    miscUtilities.removeDir(scratchdir)
def __call__(self, trainingData, weight = None):
    """Creates a PLS model from the data in trainingData."""
    if not AZBaseClasses.AZLearner.__call__(self, trainingData, weight):
        return None
    # Remove from the domain any unused values of discrete attributes, including the class
    trainingData = dataUtilities.getDataWithoutUnusedValues(trainingData, True)

    # Create path for the Orange data
    scratchdir = miscUtilities.createScratchDir(desc="PLS")
    OrngFile = os.path.join(scratchdir, "OrngData.tab")

    # Remove meta attributes from training data to make the imputer work with examples without the meta attributes.
    #dataUtilities.rmAllMeta(trainingData)
    if len(trainingData.domain.getmetas()) == 0:
        trainData = trainingData
    else:
        trainData = dataUtilities.getCopyWithoutMeta(trainingData)

    # Create the imputer
    self.imputer = orange.ImputerConstructor_average(trainData)
    # Impute the data
    trainData = self.imputer(trainData)

    # Save the data, already imputed, to an Orange formatted file
    if self.verbose > 1: print time.asctime(), "Saving Orange Data to a tab file..."
    orange.saveTabDelimited(OrngFile, trainData)
    if self.verbose > 1: print time.asctime(), "done"

    # Create the PLS instance
    if self.verbose > 1: print time.asctime(), "Creating PLS Object..."
    learner = pls.PlsAPI()
    if self.verbose > 1: print time.asctime(), "done"

    # Assign the PLS parameters
    learner.SetParameter('v', str(self.verbose))
    learner.SetParameter('debug', str(int(self.verbose > 0)))
    learner.SetParameter('method', self.method)
    if types.IntType(self.k) > len(trainData.domain.attributes):
        learner.SetParameter('k', str(len(trainData.domain.attributes)))
        if self.verbose > 0: print "Warning! The number of components was greater than the number of attributes."
        if self.verbose > 0: print "         Components were set to ", len(trainData.domain.attributes)
    else:
        learner.SetParameter('k', self.k)
    learner.SetParameter('precision', self.precision)
    learner.SetParameter('sDir', scratchdir)  #AZOC.SCRATCHDIR

    # Read the Orange formatted file and train the algorithm
    # TRAIN
    if self.verbose > 1: print time.asctime(), "Training..."
    learner.Train(OrngFile)
    if self.verbose > 1:
        print "Train finished at ", time.asctime()
        print "PLS trained in: " + str(learner.GetCPUTrainTime()) + " seconds"
        print "Method:     " + learner.GetParameter("method")
        print "Components: " + learner.GetParameter("k")
        print "Precision:  " + learner.GetParameter("precision")

    # Remove the scratch directory
    if self.verbose == 0:
        miscUtilities.removeDir(scratchdir)
    else:
        print "The directory " + scratchdir + " was not deleted because DEBUG flag is ON"
    del trainData
    impData = self.imputer.defaults

    return PLSClassifier(classifier = learner,
                         name = "Classifier of " + self.name,
                         classVar = trainingData.domain.classVar,
                         imputeData = impData,
                         verbose = self.verbose,
                         varNames = [attr.name for attr in trainingData.domain.attributes],
                         NTrainEx = len(trainingData),
                         basicStat = self.basicStat,
                         parameters = self.parameters)  #learner.GetClassVarName()
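# Minimal usage sketch for this learner (the variable names are illustrative; the tests elsewhere
# in this section instantiate it as AZorngPLS.PLSLearner):
#
#   learner = AZorngPLS.PLSLearner()
#   classifier = learner(trainData)        # runs the scratch-dir workflow above and trains pls.PlsAPI
#   prediction = classifier(trainData[0])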
def test_PLS_MPI_2(self):
    ###################################################################
    #   Test other way of setting appspack
    ###################################################################
    # Classification accuracy:
    ExpectedCA = [0.851851851852, 0.865]
    ExpectedCAwithTest = [0.865238095238, 0.884285714286, 0.85619047619, 0.837]  #New at orange2.0

    # Create the appspack instance
    opt = paramOptUtilities.Appspack()
    # Learner to be optimized
    learner = AZorngPLS.PLSLearner()
    # Dataset to use in the parameters optimization (Discrete class in this example)
    dataSet = self.discTrainDataPath
    # Define the objective function. This requires:
    #    defining the extreme to find (the min or max): findMin=True or findMin=False
    fMin = False
    #    defining the method for evaluation (must be a method that accepts as input an orngTest.ExperimentResults):
    #       evaluateMethod="AZutilities.evalUtilities.CA"
    evalM = "AZutilities.evalUtilities.CA"

    # Create a directory for running the appspack (if not defined it will use the present working directory)
    runPath = miscUtilities.createScratchDir(desc="ParamOptTest_PLS_MPI_2")

    # Load the optimization parameters from the default configuration (AZLearnersParamsConfig.py)
    parameters = AZLearnersParamsConfig.API("PLSLearner")
    parameters.setParameter("method", "default", 'pls1')

    # change the optimization parameters
    parameters.setParameter("method", "default", 'pls1')       # make the method fixed (do not optimize) to be pls1
    parameters.setParameter("method", "optimize", False)
    parameters.setParameter("method", "rangeType", "values")   # assure that the keyword for the values range type is
                                                                # set correctly for values instead of interval
    parameters.setParameter("k", "range", [1, 3, 5, 6, 10])    # set the candidate values for k to be evaluated
    parameters.setParameter("k", "optimize", True)
    parameters.setParameter("k", "rangeType", "values")        # assure that the keyword for the values range type is
                                                                # set correctly for values instead of interval

    # [<minimum of objective function found>, <optimized parameters>]
    tunedPars = opt(learner=learner,
                    dataSet=dataSet,
                    evaluateMethod = evalM,
                    findMin=fMin,
                    runPath = runPath,
                    useParameters = parameters.getParametersDict(),
                    verbose = 0,
                    useStd = False,
                    advancedMPIoptions = None,
                    np = 4,
                    machinefile = ["localhost:2","localhost:2"])

    print "Returned: ", tunedPars
    print "====================== optimization Done ==========================="
    print "Learner optimized flag = ", learner.optimized
    print "Tuned parameters = ", tunedPars[1]
    print "Best optimization result = ", tunedPars[0]
    print "check the file intRes.txt to see the intermediate results of optimizer!"

    self.assertEqual(learner.optimized, True)
    # Check if the MPI version was used
    self.assertEqual(opt.usedMPI, True)
    self.assert_(round(tunedPars[0],3) in [round(x,3) for x in ExpectedCAwithTest], "Got:" + str(tunedPars[0]))

    # The learner now has its optimized parameters already set, so we can make a classifier out of it
    classifier = learner(self.discTrain)
    CA = evalUtilities.getClassificationAccuracy(self.discTest, classifier)
    self.assert_(round(CA,3) in [round(x,3) for x in ExpectedCA])

    resData2 = dataUtilities.DataTable(os.path.join(runPath,"optimizationLog.txt"))
    self.assert_(len(resData2) >= 4)  # (orig 5) Must be > 2
    #print runPath
    miscUtilities.removeDir(runPath)
def testRF_MPI(self):
    """
    Tests changing the default range of the optimizer.
    Uses the MPI version of appspack.
    """
    # Classification accuracy:
    ExpectedCA = [0.903]  #opencv1.1: 0.90480000000000005

    optimizer = paramOptUtilities.Appspack()

    learner = AZorngRF.RFLearner()
    learnerName = "RFLearner"

    # Create an interface for setting optimizer parameters
    pars = AZLearnersParamsConfig.API(learnerName)
    # Set all parameters to not be optimized
    pars.setOptimizeAllParameters(False)

    parameterList = ["nActVars"]
    # Set the parameters in parameterList to be optimized
    for parameter in parameterList:
        pars.setParameter(parameter, "optimize", True)

    # Set the NumThreads
    pars.setParameter("NumThreads", "optimize", False)
    # Change the default
    pars.setParameter("NumThreads", "default", "1")

    trainFile = self.discTrainDataPath

    # Create a directory for running the appspack (if not defined it will use the present working directory)
    runPath = miscUtilities.createScratchDir(desc="ParamOptTest_RF_MPI")
    evalM = "AZutilities.evalUtilities.CA"
    fMin = False

    # Calculate the optimal parameters. This can take a long period of time!
    tunedPars = optimizer(learner=learner,
                          dataSet=trainFile,
                          evaluateMethod = evalM,
                          useParameters = pars.getParametersDict(),
                          useDefaultPoint = False,
                          findMin=fMin,
                          runPath = runPath,
                          useStd = False,
                          verbose = 0,
                          #advancedMPIoptions = "-all-local -allcpus",  # to use this, the file
                          # "<MPICHDIR>/share/machines.LINUX" must be properly configured.
                          # Alternatively, we can set machinefile=0 to also use all available cores
                          machinefile = 0)

    print "Returned: ", tunedPars
    print "====================== optimization Done ==========================="
    print "Learner optimized flag = ", learner.optimized
    print "Tuned parameters = ", tunedPars[1]
    print "Best optimization result = ", tunedPars[0]
    print "check the file intRes.txt to see the intermediate results of optimizer!"
    print "Number of cores used: ", optimizer.np

    verbTunedPars = optimizer.getTunedParameters()

    # Check that the learner was optimized
    self.assertEqual(learner.optimized, True)

    # Check if the number of processors used equals all the cores available
    status, out = commands.getstatusoutput("cat /proc/cpuinfo | grep processor")
    self.assertEqual(optimizer.np, len(out.split("\n")))

    # Check if the MPI version was used
    self.assertEqual(optimizer.usedMPI, True)

    # Check the number of optimized parameters
    self.assert_(len(verbTunedPars["optParam"]) in [8, 9, 10])

    # Check the accuracy
    self.assert_(round(verbTunedPars["bestRes"],3) in [round(x,3) for x in ExpectedCA], "Got:" + str(verbTunedPars["bestRes"]))
    self.assert_(len(dataUtilities.DataTable(os.path.join(runPath,"optimizationLog.txt"))) >= 3)  # Must be > 2

    miscUtilities.removeDir(runPath)
def getAcc(self, callBack=None, algorithm=None, params=None, atts=None, holdout=None): """ For regression problems, it returns the RMSE and the Q2 For Classification problems, it returns CA and the ConfMat The return is made in a Dict: {"RMSE":0.2,"Q2":0.1,"CA":0.98,"CM":[[TP, FP],[FN,TN]]} For the EvalResults not supported for a specific learner/datase, the respective result will be None if the learner is a dict {"LearnerName":learner, ...} the results will be a dict with results for all Learners and for a consensus made out of those that were stable It some error occurred, the respective values in the Dict will be None parameters: algorithm - list of feature generation algorithms (set dependent features that have to be calculated inside the crossvalidation) params - dictionary of parameters atts - attributes to be removed before learning (e.g. meta etc...) """ self.__log("Starting Calculating MLStatistics") statistics = {} if not self.__areInputsOK(): return None if holdout: self.nExtFolds = 1 if algorithm: self.__log(" Additional features to be calculated inside of cross-validation") for i in algorithm: self.__log(" Algorithm: " + str(i)) for j, v in params.iteritems(): self.__log(" Parameter: " + str(j) + " = " + str(v)) # Set the response type self.responseType = ( self.data.domain.classVar.varType == orange.VarTypes.Discrete and "Classification" or "Regression" ) self.__log(" " + str(self.responseType)) # Create the Train and test sets DataIdxs = None if holdout: self.__log("Using hold out evaluation with " + str(holdout) + "*100 % of data for training") DataIdxs = dataUtilities.SeedDataSampler_holdOut(self.data, holdout) else: DataIdxs = dataUtilities.SeedDataSampler(self.data, self.nExtFolds) # Var for saving each Fols result optAcc = {} results = {} exp_pred = {} nTrainEx = {} nTestEx = {} # Set a dict of learners MLmethods = {} if type(self.learner) == dict: for ml in self.learner: MLmethods[ml] = self.learner[ml] else: MLmethods[self.learner.name] = self.learner models = {} rocs = {} self.__log("Calculating Statistics for MLmethods:") self.__log(" " + str([x for x in MLmethods])) # Check data in advance so that, by chance, it will not fail at the last fold! for foldN in range(self.nExtFolds): trainData = self.data.select(DataIdxs[foldN], negate=1) self.__checkTrainData(trainData) # Optional!! # Order Learners so that PLS is the first sortedML = [ml for ml in MLmethods] if "PLS" in sortedML: sortedML.remove("PLS") sortedML.insert(0, "PLS") stepsDone = 0 nTotalSteps = len(sortedML) * self.nExtFolds for ml in sortedML: self.__log(" > " + str(ml) + "...") try: # Var for saving each Fols result results[ml] = [] exp_pred[ml] = [] models[ml] = [] rocs[ml] = [] nTrainEx[ml] = [] nTestEx[ml] = [] optAcc[ml] = [] logTxt = "" for foldN in range(self.nExtFolds): if type(self.learner) == dict: self.paramList = None trainData = self.data.select(DataIdxs[foldN], negate=1) orig_len = len(trainData.domain.attributes) refs = None methods = [ "rdk_MACCS_keys", "rdk_topo_fps", "rdk_morgan_fps", "rdk_morgan_features_fps", "rdk_atompair_fps", ] train_domain = None # add structural descriptors to the training data (TG) if algorithm: for i in range(len(algorithm)): if algorithm[i] == "structClust": self.__log("Algorithm " + str(i) + ": " + str(algorithm[i])) actData = orange.ExampleTable(trainData.domain) for d in trainData: # only valid for simboosted qsar paper experiments!? 
if d.getclass() == "2": actData.append(d) refs = structuralClustering.getReferenceStructures( actData, threshold=params["threshold"], minClusterSize=params["minClusterSize"], numThreads=2, ) self.__log( " found " + str(len(refs)) + " reference structures in " + str(len(actData)) + " active structures" ) orig_len = orig_len + (len(refs) * len(methods)) trainData_sim = SimBoostedQSAR.getSimDescriptors(refs, trainData, methods) if i == (len(algorithm) - 1): trainData = dataUtilities.attributeDeselectionData(trainData_sim, atts) else: trainData = dataUtilities.attributeDeselectionData(trainData_sim, []) elif algorithm[i] == "ECFP": self.__log("Algorithm " + str(i) + ": " + str(algorithm[i])) trainData_ecfp = getCinfonyDesc.getCinfonyDescResults(trainData, ["rdk.FingerPrints"]) train_domain = trainData_ecfp.domain if i == (len(algorithm) - 1): trainData = dataUtilities.attributeDeselectionData(trainData_ecfp, atts) else: trainData = dataUtilities.attributeDeselectionData(trainData_ecfp, []) else: self.__log("Algorithm " + str(i) + ": " + str(algorithm[i])) trainData_structDesc = getStructuralDesc.getStructuralDescResult( trainData, algorithm[i], params["minsup"] ) if i == (len(algorithm) - 1): trainData = dataUtilities.attributeDeselectionData(trainData_structDesc, atts) else: trainData = dataUtilities.attributeDeselectionData(trainData_structDesc, []) # trainData.save("/home/girschic/proj/AZ/ProjDev/train.tab") testData = self.data.select(DataIdxs[foldN]) # calculate the feature values for the test data (TG) if algorithm: for i in range(len(algorithm)): if algorithm[i] == "structClust": self.__log(str(algorithm[i])) testData_sim = SimBoostedQSAR.getSimDescriptors(refs, testData, methods) if i == (len(algorithm) - 1): testData = dataUtilities.attributeDeselectionData(testData_sim, atts) else: testData = dataUtilities.attributeDeselectionData(testData_sim, []) elif algorithm[i] == "ECFP": self.__log(str(algorithm[i])) # testData_ecfp = orange.ExampleTable(train_domain) tmp_dat = [] for d in testData: tmp = getCinfonyDesc.getRdkFPforTestInstance(train_domain, d) tmp_dat.append(tmp) testData_ecfp = orange.ExampleTable(tmp_dat[0].domain, tmp_dat) if i == (len(algorithm) - 1): # print "removing atts" testData = dataUtilities.attributeDeselectionData(testData_ecfp, atts) else: # print "removing no atts" testData = dataUtilities.attributeDeselectionData(testData_ecfp, []) else: cut_off = orig_len - len(atts) smarts = trainData.domain.attributes[cut_off:] self.__log(" Number of structural features added: " + str(len(smarts))) testData_structDesc = getStructuralDesc.getSMARTSrecalcDesc(testData, smarts) if i == (len(algorithm) - 1): testData = dataUtilities.attributeDeselectionData(testData_structDesc, atts) else: testData = dataUtilities.attributeDeselectionData(testData_structDesc, []) # testData.save("/home/girschic/proj/AZ/ProjDev/test.tab") nTrainEx[ml].append(len(trainData)) nTestEx[ml].append(len(testData)) # Test if trainsets inside optimizer will respect dataSize criterias. 
# if not, don't optimize, but still train the model dontOptimize = False if self.responseType != "Classification" and (len(trainData) * (1 - 1.0 / self.nInnerFolds) < 20): dontOptimize = True else: tmpDataIdxs = dataUtilities.SeedDataSampler(trainData, self.nInnerFolds) tmpTrainData = trainData.select(tmpDataIdxs[0], negate=1) if not self.__checkTrainData(tmpTrainData, False): dontOptimize = True if dontOptimize: logTxt += ( " Fold " + str(foldN) + ": Too few compounds to optimize model hyper-parameters\n" ) self.__log(logTxt) if trainData.domain.classVar.varType == orange.VarTypes.Discrete: res = orngTest.crossValidation( [MLmethods[ml]], trainData, folds=5, strat=orange.MakeRandomIndices.StratifiedIfPossible, randomGenerator=random.randint(0, 100), ) CA = evalUtilities.CA(res)[0] optAcc[ml].append(CA) else: res = orngTest.crossValidation( [MLmethods[ml]], trainData, folds=5, strat=orange.MakeRandomIndices.StratifiedIfPossible, randomGenerator=random.randint(0, 100), ) R2 = evalUtilities.R2(res)[0] optAcc[ml].append(R2) else: runPath = miscUtilities.createScratchDir( baseDir=AZOC.NFS_SCRATCHDIR, desc="AccWOptParam", seed=id(trainData) ) # self.__log(" run path:"+str(runPath)) trainData.save(os.path.join(runPath, "trainData.tab")) tunedPars = paramOptUtilities.getOptParam( learner=MLmethods[ml], trainDataFile=os.path.join(runPath, "trainData.tab"), paramList=self.paramList, useGrid=False, verbose=self.verbose, queueType=self.queueType, runPath=runPath, nExtFolds=None, nFolds=self.nInnerFolds, logFile=self.logFile, getTunedPars=True, ) if not MLmethods[ml] or not MLmethods[ml].optimized: self.__log( " WARNING: GETACCWOPTPARAM: The learner " + str(ml) + " was not optimized." ) self.__log(" It will be ignored") # self.__log(" It will be set to default parameters") self.__log(" DEBUG can be done in: " + runPath) # Set learner back to default # MLmethods[ml] = MLmethods[ml].__class__() raise Exception("The learner " + str(ml) + " was not optimized.") else: if trainData.domain.classVar.varType == orange.VarTypes.Discrete: optAcc[ml].append(tunedPars[0]) else: res = orngTest.crossValidation( [MLmethods[ml]], trainData, folds=5, strat=orange.MakeRandomIndices.StratifiedIfPossible, randomGenerator=random.randint(0, 100), ) R2 = evalUtilities.R2(res)[0] optAcc[ml].append(R2) miscUtilities.removeDir(runPath) # Train the model model = MLmethods[ml](trainData) models[ml].append(model) # Test the model if self.responseType == "Classification": results[ml].append( ( evalUtilities.getClassificationAccuracy(testData, model), evalUtilities.getConfMat(testData, model), ) ) roc = self.aroc(testData, [model]) rocs[ml].append(roc) else: local_exp_pred = [] for ex in testData: local_exp_pred.append((ex.getclass(), model(ex))) results[ml].append( (evalUtilities.calcRMSE(local_exp_pred), evalUtilities.calcRsqrt(local_exp_pred)) ) # Save the experimental value and correspondent predicted value exp_pred[ml] += local_exp_pred if callBack: stepsDone += 1 if not callBack((100 * stepsDone) / nTotalSteps): return None res = self.createStatObj( results[ml], exp_pred[ml], nTrainEx[ml], nTestEx[ml], self.responseType, self.nExtFolds, logTxt, rocs[ml], ) if self.verbose > 0: print "UnbiasedAccuracyGetter!Results " + ml + ":\n" pprint(res) if not res: raise Exception("No results available!") statistics[ml] = copy.deepcopy(res) self.__writeResults(statistics) self.__log(" OK") except: print "Unexpected error:", print sys.exc_info()[0] print sys.exc_info()[1] self.__log(" Learner " + str(ml) + " failed to create/optimize the 
model!") res = self.createStatObj( results[ml], exp_pred[ml], nTrainEx[ml], nTestEx[ml], self.responseType, self.nExtFolds, logTxt, rocs[ml], ) statistics[ml] = copy.deepcopy(res) self.__writeResults(statistics) if not statistics or len(statistics) < 1: self.__log("ERROR: No statistics to return!") return None elif len(statistics) > 1: # We still need to build a consensus model out of the stable models # ONLY if there are more that one model stable! # When only one or no stable models, build a consensus based on all models consensusMLs = {} for modelName in statistics: StabilityValue = statistics[modelName]["StabilityValue"] if StabilityValue is not None and statistics[modelName]["stable"]: consensusMLs[modelName] = copy.deepcopy(statistics[modelName]) self.__log( "Found " + str(len(consensusMLs)) + " stable MLmethods out of " + str(len(statistics)) + " MLmethods." ) if len(consensusMLs) <= 1: # we need more models to build a consensus! consensusMLs = {} for modelName in statistics: consensusMLs[modelName] = copy.deepcopy(statistics[modelName]) if len(consensusMLs) >= 2: # Var for saving each Fols result Cresults = [] Cexp_pred = [] CnTrainEx = [] CnTestEx = [] self.__log( "Calculating the statistics for a Consensus model based on " + str([ml for ml in consensusMLs]) ) for foldN in range(self.nExtFolds): if self.responseType == "Classification": CLASS0 = str(self.data.domain.classVar.values[0]) CLASS1 = str(self.data.domain.classVar.values[1]) exprTest0 = "(0" for ml in consensusMLs: exprTest0 += "+( " + ml + " == " + CLASS0 + " )*" + str(optAcc[ml][foldN]) + " " exprTest0 += ")/IF0(sum([False" for ml in consensusMLs: exprTest0 += ", " + ml + " == " + CLASS0 + " " exprTest0 += "]),1)" exprTest1 = exprTest0.replace(CLASS0, CLASS1) expression = [exprTest0 + " >= " + exprTest1 + " -> " + CLASS0, " -> " + CLASS1] else: Q2sum = sum([optAcc[ml][foldN] for ml in consensusMLs]) expression = "(1 / " + str(Q2sum) + ") * (0" for ml in consensusMLs: expression += " + " + str(optAcc[ml][foldN]) + " * " + ml + " " expression += ")" testData = self.data.select(DataIdxs[foldN]) CnTestEx.append(len(testData)) consensusClassifiers = {} for learnerName in consensusMLs: consensusClassifiers[learnerName] = models[learnerName][foldN] model = AZorngConsensus.ConsensusClassifier(classifiers=consensusClassifiers, expression=expression) CnTrainEx.append(model.NTrainEx) # Test the model if self.responseType == "Classification": Cresults.append( ( evalUtilities.getClassificationAccuracy(testData, model), evalUtilities.getConfMat(testData, model), ) ) else: local_exp_pred = [] for ex in testData: local_exp_pred.append((ex.getclass(), model(ex))) Cresults.append( (evalUtilities.calcRMSE(local_exp_pred), evalUtilities.calcRsqrt(local_exp_pred)) ) # Save the experimental value and correspondent predicted value Cexp_pred += local_exp_pred res = self.createStatObj(Cresults, Cexp_pred, CnTrainEx, CnTestEx, self.responseType, self.nExtFolds) statistics["Consensus"] = copy.deepcopy(res) statistics["Consensus"]["IndividualStatistics"] = copy.deepcopy(consensusMLs) self.__writeResults(statistics) self.__log("Returned multiple ML methods statistics.") return statistics # By default return the only existing statistics! self.__writeResults(statistics) self.__log("Returned only one ML method statistics.") return statistics[statistics.keys()[0]]
def getProbabilitiesAsAttribute(self, algorithm=None, minsup=None, atts=None): """ For regression problems, it returns the RMSE and the Q2 For Classification problems, it returns CA and the ConfMat The return is made in a Dict: {"RMSE":0.2,"Q2":0.1,"CA":0.98,"CM":[[TP, FP],[FN,TN]]} For the EvalResults not supported for a specific learner/datase, the respective result will be None if the learner is a dict {"LearnerName":learner, ...} the results will be a dict with results for all Learners and for a consensus made out of those that were stable It some error occurred, the respective values in the Dict will be None parameters: algo - key for the structural feature generation algorithm (set dependent structural features that have to be calculated inside the crossvalidation) minsup - minimum support for the algorithm atts - attributes to be removed before learning (e.g. meta etc...) """ self.__log("Starting Calculating MLStatistics") statistics = {} if not self.__areInputsOK(): return None if algorithm: self.__log(" Additional features to be calculated inside of cross-validation") self.__log(" Algorithm for structural features: " + str(algorithm)) self.__log(" Minimum support parameter: " + str(minsup)) # Set the response type self.responseType = ( self.data.domain.classVar.varType == orange.VarTypes.Discrete and "Classification" or "Regression" ) self.__log(" " + str(self.responseType)) # Create the Train and test sets DataIdxs = dataUtilities.SeedDataSampler(self.data, self.nExtFolds) # Var for saving each Fols result optAcc = {} results = {} exp_pred = {} nTrainEx = {} nTestEx = {} # Set a dict of learners MLmethods = {} if type(self.learner) == dict: for ml in self.learner: MLmethods[ml] = self.learner[ml] else: MLmethods[self.learner.name] = self.learner models = {} rocs = {} self.__log("Calculating Statistics for MLmethods:") self.__log(" " + str([x for x in MLmethods])) # Check data in advance so that, by chance, it will not faill at the last fold! for foldN in range(self.nExtFolds): trainData = self.data.select(DataIdxs[foldN], negate=1) self.__checkTrainData(trainData) # Optional!! 
# Order Learners so that PLS is the first sortedML = [ml for ml in MLmethods] if "PLS" in sortedML: sortedML.remove("PLS") sortedML.insert(0, "PLS") for ml in sortedML: self.__log(" > " + str(ml) + "...") try: # Var for saving each Fols result results[ml] = [] exp_pred[ml] = [] models[ml] = [] rocs[ml] = [] nTrainEx[ml] = [] nTestEx[ml] = [] optAcc[ml] = [] ### mods TG prediction_attribute = orange.FloatVariable("class_prob") domain = [data.domain.attributes, prediction_attribute, data.domain.classvar] data_new = orange.ExampleTable(domain) logTxt = "" for foldN in range(self.nExtFolds): if type(self.learner) == dict: self.paramList = None trainData = self.data.select(DataIdxs[foldN], negate=1) orig_len = len(trainData.domain.attributes) # add structural descriptors to the training data (TG) if algorithm: trainData_structDesc = getStructuralDesc.getStructuralDescResult(trainData, algorithm, minsup) trainData = dataUtilities.attributeDeselectionData(trainData_structDesc, atts) testData = self.data.select(DataIdxs[foldN]) # print "IDX: ", # print DataIdxs[foldN] # calculate the feature values for the test data (TG) if algorithm: cut_off = orig_len - len(atts) smarts = trainData.domain.attributes[cut_off:] self.__log(" Number of structural features added: " + str(len(smarts))) testData_structDesc = getStructuralDesc.getSMARTSrecalcDesc(testData, smarts) testData = dataUtilities.attributeDeselectionData(testData_structDesc, atts) nTrainEx[ml].append(len(trainData)) nTestEx[ml].append(len(testData)) # Test if trainsets inside optimizer will respect dataSize criterias. # if not, don't optimize, but still train the model dontOptimize = False if self.responseType != "Classification" and (len(trainData) * (1 - 1.0 / self.nInnerFolds) < 20): dontOptimize = True else: tmpDataIdxs = dataUtilities.SeedDataSampler(trainData, self.nInnerFolds) tmpTrainData = trainData.select(tmpDataIdxs[0], negate=1) if not self.__checkTrainData(tmpTrainData, False): dontOptimize = True if dontOptimize: logTxt += ( " Fold " + str(foldN) + ": Too few compounds to optimize model hyper-parameters\n" ) self.__log(logTxt) if trainData.domain.classVar.varType == orange.VarTypes.Discrete: res = orngTest.crossValidation( [MLmethods[ml]], trainData, folds=5, strat=orange.MakeRandomIndices.StratifiedIfPossible, randomGenerator=random.randint(0, 100), ) CA = evalUtilities.CA(res)[0] optAcc[ml].append(CA) else: res = orngTest.crossValidation( [MLmethods[ml]], trainData, folds=5, strat=orange.MakeRandomIndices.StratifiedIfPossible, randomGenerator=random.randint(0, 100), ) R2 = evalUtilities.R2(res)[0] optAcc[ml].append(R2) else: runPath = miscUtilities.createScratchDir( baseDir=AZOC.NFS_SCRATCHDIR, desc="AccWOptParam", seed=id(trainData) ) trainData.save(os.path.join(runPath, "trainData.tab")) tunedPars = paramOptUtilities.getOptParam( learner=MLmethods[ml], trainDataFile=os.path.join(runPath, "trainData.tab"), paramList=self.paramList, useGrid=False, verbose=self.verbose, queueType=self.queueType, runPath=runPath, nExtFolds=None, nFolds=self.nInnerFolds, logFile=self.logFile, getTunedPars=True, ) if not MLmethods[ml] or not MLmethods[ml].optimized: self.__log( " WARNING: GETACCWOPTPARAM: The learner " + str(ml) + " was not optimized." 
) self.__log(" It will be ignored") # self.__log(" It will be set to default parameters") self.__log(" DEBUG can be done in: " + runPath) # Set learner back to default # MLmethods[ml] = MLmethods[ml].__class__() raise Exception("The learner " + str(ml) + " was not optimized.") else: if trainData.domain.classVar.varType == orange.VarTypes.Discrete: optAcc[ml].append(tunedPars[0]) else: res = orngTest.crossValidation( [MLmethods[ml]], trainData, folds=5, strat=orange.MakeRandomIndices.StratifiedIfPossible, randomGenerator=random.randint(0, 100), ) R2 = evalUtilities.R2(res)[0] optAcc[ml].append(R2) miscUtilities.removeDir(runPath) # Train the model model = MLmethods[ml](trainData) models[ml].append(model) # Test the model if self.responseType == "Classification": results[ml].append( ( evalUtilities.getClassificationAccuracy(testData, model), evalUtilities.getConfMat(testData, model), ) ) roc = self.aroc(testData, [model]) rocs[ml].append(roc) # save the prediction probabilities else: local_exp_pred = [] for ex in testData: local_exp_pred.append((ex.getclass(), model(ex))) results[ml].append( (evalUtilities.calcRMSE(local_exp_pred), evalUtilities.calcRsqrt(local_exp_pred)) ) # Save the experimental value and correspondent predicted value exp_pred[ml] += local_exp_pred res = self.createStatObj( results[ml], exp_pred[ml], nTrainEx[ml], nTestEx[ml], self.responseType, self.nExtFolds, logTxt, rocs[ml], ) if self.verbose > 0: print "UnbiasedAccuracyGetter!Results " + ml + ":\n" pprint(res) if not res: raise Exception("No results available!") statistics[ml] = copy.deepcopy(res) self.__writeResults(statistics) self.__log(" OK") except: self.__log(" Learner " + str(ml) + " failed to create/optimize the model!") res = self.createStatObj() statistics[ml] = copy.deepcopy(res) self.__writeResults(statistics) if not statistics or len(statistics) < 1: self.__log("ERROR: No statistics to return!") return None elif len(statistics) > 1: # We still need to build a consensus model out of the stable models # ONLY if there are more that one model stable! # When only one or no stable models, build a consensus based on all models consensusMLs = {} for modelName in statistics: StabilityValue = statistics[modelName]["StabilityValue"] if StabilityValue is not None and statistics[modelName]["stable"]: consensusMLs[modelName] = copy.deepcopy(statistics[modelName]) self.__log( "Found " + str(len(consensusMLs)) + " stable MLmethods out of " + str(len(statistics)) + " MLmethods." ) if len(consensusMLs) <= 1: # we need more models to build a consensus! 
            consensusMLs = {}
            for modelName in statistics:
                consensusMLs[modelName] = copy.deepcopy(statistics[modelName])

        if len(consensusMLs) >= 2:
            # Vars for saving each fold's results
            Cresults = []
            Cexp_pred = []
            CnTrainEx = []
            CnTestEx = []
            self.__log(
                "Calculating the statistics for a Consensus model based on " + str([ml for ml in consensusMLs])
            )
            for foldN in range(self.nExtFolds):
                if self.responseType == "Classification":
                    CLASS0 = str(self.data.domain.classVar.values[0])
                    CLASS1 = str(self.data.domain.classVar.values[1])
                    exprTest0 = "(0"
                    for ml in consensusMLs:
                        exprTest0 += "+( " + ml + " == " + CLASS0 + " )*" + str(optAcc[ml][foldN]) + " "
                    exprTest0 += ")/IF0(sum([False"
                    for ml in consensusMLs:
                        exprTest0 += ", " + ml + " == " + CLASS0 + " "
                    exprTest0 += "]),1)"
                    exprTest1 = exprTest0.replace(CLASS0, CLASS1)
                    expression = [exprTest0 + " >= " + exprTest1 + " -> " + CLASS0, " -> " + CLASS1]
                else:
                    Q2sum = sum([optAcc[ml][foldN] for ml in consensusMLs])
                    expression = "(1 / " + str(Q2sum) + ") * (0"
                    for ml in consensusMLs:
                        expression += " + " + str(optAcc[ml][foldN]) + " * " + ml + " "
                    expression += ")"

                testData = self.data.select(DataIdxs[foldN])
                CnTestEx.append(len(testData))
                consensusClassifiers = {}
                for learnerName in consensusMLs:
                    consensusClassifiers[learnerName] = models[learnerName][foldN]

                model = AZorngConsensus.ConsensusClassifier(classifiers=consensusClassifiers, expression=expression)
                CnTrainEx.append(model.NTrainEx)
                # Test the model
                if self.responseType == "Classification":
                    Cresults.append(
                        (
                            evalUtilities.getClassificationAccuracy(testData, model),
                            evalUtilities.getConfMat(testData, model),
                        )
                    )
                else:
                    local_exp_pred = []
                    for ex in testData:
                        local_exp_pred.append((ex.getclass(), model(ex)))
                    Cresults.append(
                        (evalUtilities.calcRMSE(local_exp_pred), evalUtilities.calcRsqrt(local_exp_pred))
                    )
                    # Save the experimental values and the corresponding predicted values
                    Cexp_pred += local_exp_pred

            res = self.createStatObj(Cresults, Cexp_pred, CnTrainEx, CnTestEx, self.responseType, self.nExtFolds)
            statistics["Consensus"] = copy.deepcopy(res)
            statistics["Consensus"]["IndividualStatistics"] = copy.deepcopy(consensusMLs)
            self.__writeResults(statistics)

        self.__log("Returned multiple ML methods statistics.")
        return statistics

    # By default, return the only existing statistics!
    self.__writeResults(statistics)
    self.__log("Returned only one ML method statistics.")
    return statistics[statistics.keys()[0]]
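# Illustrative example of the consensus `expression` strings built above (and in getAcc
# below). The learner names "RF" and "CvANN", the class labels POS/NEG and the per-fold
# accuracies 0.9 / 0.8 are made up for this sketch:
#
#   Classification - accuracy-weighted vote per class, then a threshold rule:
#     exprTest0  = "(0+( RF == POS )*0.9 +( CvANN == POS )*0.8 )/IF0(sum([False, RF == POS , CvANN == POS ]),1)"
#     exprTest1  = the same string with POS replaced by NEG
#     expression = [exprTest0 + " >= " + exprTest1 + " -> POS", " -> NEG"]
#
#   Regression - accuracy-weighted average of the individual predictions:
#     expression = "(1 / 1.7) * (0 + 0.9 * RF  + 0.8 * CvANN )"
#
# As implied by the construction above, AZorngConsensus.ConsensusClassifier evaluates these
# expressions with each named classifier's prediction substituted for its name.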
def getAcc(self, algorithm=None, minsup=None, atts=None):
    """ For regression problems, it returns the RMSE and the R2.
        For classification problems, it returns the CA and the ConfMat.
        The return is made in a dict: {"RMSE":0.2,"R2":0.1,"CA":0.98,"CM":[[TP, FP],[FN,TN]]}
        For EvalResults not supported by a specific learner/dataset, the respective result will be None.
        If the learner is a dict {"LearnerName":learner, ...}, the results will be a dict with results
        for all learners and for a consensus made out of those that were stable.
        If some error occurred, the respective values in the dict will be None.
    """
    self.__log("Starting Calculating MLStatistics")
    statistics = {}
    if not self.__areInputsOK():
        return None
    if self.algorithm:
        self.__log(" Additional structural features to be calculated inside of cross-validation")
        self.__log(" Algorithm for structural features: " + str(self.algorithm))
        self.__log(" Minimum support parameter: " + str(self.minsup))

    # Set the response type
    responseType = self.data.domain.classVar.varType == orange.VarTypes.Discrete and "Classification" or "Regression"
    self.__log("  " + str(responseType))

    # Create the train and test sets
    DataIdxs = dataUtilities.SeedDataSampler(self.data, self.nExtFolds)

    # Vars for saving each fold's results
    results = {}
    exp_pred = {}

    # Set a dict of learners
    MLmethods = {}
    if type(self.learner) == dict:
        for ml in self.learner:
            MLmethods[ml] = self.learner[ml]
    else:
        MLmethods[self.learner.name] = self.learner

    models = {}
    self.__log("Calculating Statistics for MLmethods:")
    self.__log("  " + str([x for x in MLmethods]))
    for ml in MLmethods:
        self.__log("  > " + str(ml) + "...")
        try:
            # Vars for saving each fold's results
            results[ml] = []
            exp_pred[ml] = []
            models[ml] = []
            for foldN in range(self.nExtFolds):
                if type(self.learner) == dict:
                    self.paramList = None

                trainData = self.data.select(DataIdxs[foldN], negate=1)
                orig_len = len(trainData.domain.attributes)
                if self.algorithm:
                    # Add structural descriptors to the training data (TG)
                    trainData_structDesc = getStructuralDesc.getStructuralDescResult(trainData, self.algorithm, self.minsup)
                    trainData = dataUtilities.attributeDeselectionData(trainData_structDesc, self.atts)

                runPath = miscUtilities.createScratchDir(baseDir=AZOC.NFS_SCRATCHDIR, desc="AccWOptParam")
                trainData.save(os.path.join(runPath, "trainData.tab"))

                testData = self.data.select(DataIdxs[foldN])
                if self.algorithm:
                    # Calculate the feature values for the test data (TG)
                    cut_off = orig_len - len(self.atts)
                    smarts = trainData.domain.attributes[cut_off:]
                    self.__log("  Number of structural features added: " + str(len(smarts)))
                    testData_structDesc = getStructuralDesc.getSMARTSrecalcDesc(testData, smarts)
                    testData = dataUtilities.attributeDeselectionData(testData_structDesc, self.atts)

                paramOptUtilities.getOptParam(
                    learner=MLmethods[ml],
                    trainDataFile=os.path.join(runPath, "trainData.tab"),
                    paramList=self.paramList,
                    useGrid=False,
                    verbose=self.verbose,
                    queueType=self.queueType,
                    runPath=runPath,
                    nExtFolds=None,
                    nFolds=self.nInnerFolds,
                )
                if not MLmethods[ml].optimized:
                    self.__log("    The learner " + str(ml) + " was not optimized.")
                    raise Exception("The learner " + str(ml) + " was not optimized.")

                miscUtilities.removeDir(runPath)

                # Train the model
                model = MLmethods[ml](trainData)
                models[ml].append(model)
                # Test the model
                if responseType == "Classification":
                    results[ml].append(
                        (
                            evalUtilities.getClassificationAccuracy(testData, model),
                            evalUtilities.getConfMat(testData, model),
                        )
                    )
                else:
                    local_exp_pred = []
                    for ex in testData:
                        local_exp_pred.append((ex.getclass(), model(ex)))
                    results[ml].append(
                        (evalUtilities.calcRMSE(local_exp_pred), evalUtilities.calcRsqrt(local_exp_pred))
                    )
                    # Save the experimental values and the corresponding predicted values
                    exp_pred[ml] += local_exp_pred

            res = self.createStatObj(results[ml], exp_pred[ml], responseType, self.nExtFolds)
            if self.verbose > 0:
                print "AccWOptParamGetter!Results " + ml + ":\n"
                pprint(res)
            if not res:
                raise Exception("No results available!")
            statistics[ml] = res.copy()
            self.__writeResults(res)
            self.__log("    OK")
        except:
            self.__log("    Learner " + str(ml) + " failed to optimize!")
            res = self.createStatObj()
            statistics[ml] = res.copy()

    if not statistics or len(statistics) < 1:
        self.__log("ERROR: No statistics to return!")
        return None
    elif len(statistics) > 1:
        # We still need to build a consensus model out of the stable models,
        # but ONLY if more than one model is stable!
        stableML = {}
        for modelName in statistics:
            if statistics[modelName]["StabilityValue"] < AZOC.QSARSTABILITYTHRESHOLD:  # Select only stable models
                stableML[modelName] = statistics[modelName].copy()
        if len(stableML) >= 2:
            self.__log("Found " + str(len(stableML)) + " stable MLmethods out of " + str(len(statistics)) + " MLmethods.")
            if responseType == "Classification":
                CLASS0 = str(self.data.domain.classVar.values[0])
                CLASS1 = str(self.data.domain.classVar.values[1])
                exprTest0 = "(0"
                for ml in stableML:
                    exprTest0 += "+( " + ml + " == " + CLASS0 + " )*" + str(stableML[ml]["CA"]) + " "
                exprTest0 += ")/IF0(sum([False"
                for ml in stableML:
                    exprTest0 += ", " + ml + " == " + CLASS0 + " "
                exprTest0 += "]),1)"
                exprTest1 = exprTest0.replace(CLASS0, CLASS1)
                expression = [exprTest0 + " >= " + exprTest1 + " -> " + CLASS0, " -> " + CLASS1]
            else:
                R2sum = sum([stableML[ml]["R2"] for ml in stableML])
                expression = "(1 / " + str(R2sum) + ") * (0"
                for ml in stableML:
                    expression += " + " + str(stableML[ml]["R2"]) + " * " + ml + " "
                expression += ")"

            # Vars for saving each fold's results
            Cresults = []
            Cexp_pred = []
            self.__log("Calculating the statistics for a Consensus model")
            for foldN in range(self.nExtFolds):
                testData = self.data.select(DataIdxs[foldN])
                consensusClassifiers = {}
                for learnerName in stableML:
                    consensusClassifiers[learnerName] = models[learnerName][foldN]

                model = AZorngConsensus.ConsensusClassifier(classifiers=consensusClassifiers, expression=expression)
                # Test the model
                if responseType == "Classification":
                    Cresults.append(
                        (
                            evalUtilities.getClassificationAccuracy(testData, model),
                            evalUtilities.getConfMat(testData, model),
                        )
                    )
                else:
                    local_exp_pred = []
                    for ex in testData:
                        local_exp_pred.append((ex.getclass(), model(ex)))
                    Cresults.append(
                        (evalUtilities.calcRMSE(local_exp_pred), evalUtilities.calcRsqrt(local_exp_pred))
                    )
                    # Save the experimental values and the corresponding predicted values
                    Cexp_pred += local_exp_pred

            res = self.createStatObj(Cresults, Cexp_pred, responseType, self.nExtFolds)
            statistics["Consensus"] = res.copy()
            statistics["Consensus"]["IndividualStatistics"] = stableML.copy()
            self.__writeResults(statistics)

        self.__log("Returned multiple ML methods statistics.")
        return statistics

    # By default, return the only existing statistics!
    self.__writeResults(statistics)
    self.__log("Returned only one ML method statistics.")
    return statistics[statistics.keys()[0]]
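# Hedged usage sketch. The wrapper class name below is an assumption inferred from the
# "AccWOptParamGetter!Results" log message above; the constructor arguments mirror the
# attributes referenced in getAcc (data, learner, paramList, verbose, queueType,
# nExtFolds, nInnerFolds) and may differ in the actual class:
#
#   learners = {"RF": AZorngRF.RFLearner(), "CvANN": AZorngCvANN.CvANNLearner()}
#   getter = AccWOptParamGetter(data=trainData, learner=learners, paramList=None,
#                               nExtFolds=5, nInnerFolds=5)
#   statistics = getter.getAcc()
#   # With a dict of learners, `statistics` maps each learner name (plus "Consensus",
#   # when at least two models are stable) to a dict such as
#   # {"CA": ..., "CM": ...} for classification or {"RMSE": ..., "R2": ...} for regression.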