def myGridSearch(data,RBound,MBound): bestlogistic = None best_acc = -float('inf') class bestValues(object): m = float('nan') r = float('nan') for r in range(RBound[0],RBound[1]+RBound[2],RBound[2]): for m in range(MBound[0],MBound[1]+MBound[2],MBound[2]): logistic = Logistic() logistic.setMaxIts(int(m)) logistic.setRidge(pow(10,r)) evaluation = Evaluation(data) output = util.get_buffer_for_predictions()[0] attRange = Range() # no additional attributes output outputDistribution = Boolean(False) # we don't want distribution random = Random(1) numFolds = min(10,data.numInstances()) evaluation.crossValidateModel(logistic,data,numFolds,random,[output, attRange, outputDistribution]) acc = evaluation.pctCorrect() if (acc>best_acc): bestlogistic = logistic best_acc = acc bestValues.m = int(m) bestValues.r = pow(10,r) print "Best accuracy: ", best_acc print "Best values: M = ", bestValues.m, ", Ridge = ", bestValues.r print "-----------------------------------------" return bestlogistic, bestValues.r, bestValues.m, best_acc
def myGridSearch(data,NTreeBounds,NFeaturesBounds): best_acc = -float('inf') bestrandomforest = None class bestValues(object): t = float('nan') f = float('nan') for t in range(NTreeBounds[0],NTreeBounds[1]+NTreeBounds[2],NTreeBounds[2]): for f in range(NFeaturesBounds[0],NFeaturesBounds[1]+NFeaturesBounds[2],NFeaturesBounds[2]): randomforest = RandomForest() randomforest.setNumTrees(int(t)) randomforest.setNumFeatures(int(f)) evaluation = Evaluation(data) output = output = util.get_buffer_for_predictions()[0] attRange = Range() # no additional attributes output outputDistribution = Boolean(False) # we don't want distribution random = Random(1) numFolds = min(10,data.numInstances()) evaluation.crossValidateModel(randomforest,data,numFolds,random,[output, attRange, outputDistribution]) acc = evaluation.pctCorrect() if (acc>best_acc): bestrandomforest = randomforest best_acc = acc bestValues.t = t bestValues.f = f print "Best accuracy:", best_acc print "Best values: NTreeBounds = ", bestValues.t, ", NFeaturesBounds = ", bestValues.f print "-----------------------------------------" return bestrandomforest, bestValues.t, bestValues.f, best_acc
def Logistic_ParamFinder(data): # Possible set for Ridge-value RBounds = [-10,2,1] # possible set for maximum Iteration MBounds = [-1,10,1] if (data.numInstances()>10): # grid search does 10-fold cross validation; hence number of samples must be more than 10 gridsearch = GridSearch() acctag = gridsearch.getEvaluation() acctag = SelectedTag('ACC',acctag.getTags()) gridsearch.setEvaluation(acctag) allfilters = AllFilters() gridsearch.setFilter(allfilters) gridsearch.setGridIsExtendable(Boolean(True)) logistic = Logistic() gridsearch.setClassifier(logistic) gridsearch.setXProperty(String('classifier.maxIts')) gridsearch.setYProperty(String('classifier.ridge')) gridsearch.setXExpression(String('I')) gridsearch.setYExpression(String('pow(BASE,I)')) gridsearch.setXMin(MBounds[0]) gridsearch.setXMax(MBounds[1]) gridsearch.setXStep(MBounds[2]) gridsearch.setYMin(RBounds[0]) gridsearch.setYMax(RBounds[1]) gridsearch.setYStep(RBounds[2]) gridsearch.setYBase(10) print "searching for logistic lcassifier Max Iteration = [", MBounds[0], ",", MBounds[1], "], Ridge = [ 10E", RBounds[0], ",10E", RBounds[1], "] ...." 
gridsearch.buildClassifier(data) bestValues = gridsearch.getValues() # ----------------------- Evaluation bestlogistic = Logistic() bestlogistic.setMaxIts(int(bestValues.x)) bestlogistic.setRidge(pow(10,bestValues.y)) evaluation = Evaluation(data) output = util.get_buffer_for_predictions()[0] attRange = Range() # no additional attributes output outputDistribution = Boolean(False) # we don't want distribution random = Random(1) numFolds = min(10,data.numInstances()) evaluation.crossValidateModel(bestlogistic,data,numFolds,random,[output, attRange, outputDistribution]) acc = evaluation.pctCorrect() print "best accuracy: ", acc print "best logistic classifier with Ridge = ", bestlogistic.getRidge(), " Max Iteration = ", bestlogistic.getMaxIts() OptLog = bestlogistic OptLogp1 = bestlogistic.getRidge() OptLogp2 = bestlogistic.getMaxIts() OptLogAcc = acc else: OptLog, OptLogp1, OptLogp2, OptLogAcc = myGridSearch(data,RBounds,MBounds) Description = 'Logistic classifier OptRidge = ' + str(OptLogp1) + \ ', OptMaxIts = ' + str(OptLogp2) + ', OptAcc = ' + str(OptLogAcc) print "-----------------------------------------" return OptLog, OptLogp1, OptLogp2, OptLogAcc, Description
def RandomForest_ParamFinder(data): # possible set for Number of trees NTreeBounds = [1,20,1] # possible set for number of features NFeaturesBounds = [0,20,1] if (data.numInstances()>10): # grid search does 10-fold cross validation; hence number of samples must be more than 10 gridsearch = GridSearch() acctag = gridsearch.getEvaluation() acctag = SelectedTag('ACC',acctag.getTags()) gridsearch.setEvaluation(acctag) allfilters = AllFilters() gridsearch.setFilter(allfilters) gridsearch.setGridIsExtendable(Boolean(True)) randomforest = RandomForest() gridsearch.setClassifier(randomforest) gridsearch.setXProperty(String('classifier.numTrees')) gridsearch.setYProperty(String('classifier.numFeatures')) gridsearch.setXExpression(String('I')) gridsearch.setYExpression(String('I')) gridsearch.setXMin(NTreeBounds[0]) gridsearch.setXMax(NTreeBounds[1]) gridsearch.setXStep(NTreeBounds[2]) gridsearch.setYMin(NFeaturesBounds[0]) gridsearch.setYMax(NFeaturesBounds[1]) gridsearch.setYStep(NFeaturesBounds[2]) gridsearch.setYBase(10) print "searching for random-forest NumTrees = [", NTreeBounds[0], ",", NTreeBounds[1], "], NumFeatures = [ ", NFeaturesBounds[0], ",", NFeaturesBounds[1], "] ...." 
gridsearch.buildClassifier(data) bestValues = gridsearch.getValues() # ----------------------- Evaluation bestrandomforest = RandomForest() bestrandomforest.setNumTrees(int(bestValues.x)) bestrandomforest.setNumFeatures(int(bestValues.y)) evaluation = Evaluation(data) output = output = util.get_buffer_for_predictions()[0] attRange = Range() # no additional attributes output outputDistribution = Boolean(False) # we don't want distribution random = Random(1) numFolds = min(10,data.numInstances()) evaluation.crossValidateModel(bestrandomforest,data,numFolds,random,[output, attRange, outputDistribution]) acc = evaluation.pctCorrect() print "best accuracy: ", acc print "best random-forest classifier with NumTrees=",bestValues.x , ", NumFeatures = ", bestValues.y OptRndFrst = bestrandomforest OptRndFrstp1 = bestValues.x OptRndFrstp2 = bestValues.y OptRndFrstAcc = acc else: OptRndFrst, OptRndFrstp1, OptRndFrstp2, OptRndFrstAcc = myGridSearch(data,NTreeBounds,NFeaturesBounds) Description = 'Random-Forest classifier: OptNumTrees = ' + str(OptRndFrstp1) + \ ', OptNumFeatures = ' + str(OptRndFrstp2) + ', OptAcc = ' + str(OptRndFrstAcc) print "-----------------------------------------" return OptRndFrst, OptRndFrstp1, OptRndFrstp2, OptRndFrstAcc, Description
def myGridSearch(data,cBounds,GBound,eBounds): IsBestRBFKernel = False best_acc_poly = -float('inf') best_acc_rbf = -float('inf') # Poly Kernel class bestValues_poly(object): x = float('nan') y = float('nan') for Cbnd in cBounds: for c in range(Cbnd[0],Cbnd[1]+Cbnd[2],Cbnd[2]): for e in range(eBounds[0],eBounds[1]+eBounds[2],eBounds[2]): smo = SMO() kernel = PolyKernel() kernel.setExponent(e) smo.setC(c) smo.setKernel(kernel) evaluation = Evaluation(data) output = util.get_buffer_for_predictions()[0] attRange = Range() # no additional attributes output outputDistribution = Boolean(False) # we don't want distribution random = Random(1) numFolds = min(10,data.numInstances()) evaluation.crossValidateModel(smo,data,numFolds,random,[output, attRange, outputDistribution]) acc = evaluation.pctCorrect() if (acc>best_acc_poly): best_smo_poly = smo best_acc_poly = acc bestValues_poly.x = c bestValues_poly.y = e print "Best accuracy (Poly Kernel): ", best_acc_poly print "Best values (Poly Kernel): C = ", bestValues_poly.x, ", exponent = ", bestValues_poly.y print "-----------------------------------------" # RBF Kernel class bestValues_rbf(object): x = float('nan') y = float('nan') for Cbnd in cBounds: for c in range(Cbnd[0],Cbnd[1]+Cbnd[2],Cbnd[2]): for g in range(GBound[0],GBound[1]+GBound[2],GBound[2]): smo = SMO() kernel = RBFKernel() kernel.setGamma(pow(10,g)) smo.setC(c) smo.setKernel(kernel) evaluation = Evaluation(data) output = util.get_buffer_for_predictions()[0] attRange = Range() # no additional attributes output outputDistribution = Boolean(False) # we don't want distribution random = Random(1) numFolds = min(10,data.numInstances()) evaluation.crossValidateModel(smo,data,numFolds,random,[output, attRange, outputDistribution]) acc = evaluation.pctCorrect() if (acc>best_acc_rbf): best_smo_rbf = smo best_acc_rbf = acc bestValues_rbf.x = c bestValues_rbf.y = g print "Best accuracy (RBF Kernel): ", best_acc_rbf print "Best values (RBF Kernel): C = ", bestValues_rbf.x, 
", gamma = ", bestValues_rbf.y if (best_acc_rbf > best_acc_poly): IsBestRBFKernel = True print "best smo classifier is RBF kernel with C = ", bestValues_rbf.x," and gamma = ", pow(10,bestValues_rbf.y) best_smo = best_smo_rbf OptSMOp1 = bestValues_rbf.x OptSMOp2 = pow(10,bestValues_rbf.y) OptSMOAcc = best_acc_rbf OptSMOIsRBF = IsBestRBFKernel else: IsBestRBFKernel = False print "best smo classifier is Poly kernel with C = ", bestValues_poly.x," and exponent = ", bestValues_poly.y best_smo = best_smo_poly OptSMOp1 = bestValues_poly.x OptSMOp2 = bestValues_poly.y OptSMOAcc = best_acc_poly OptSMOIsRBF = IsBestRBFKernel return IsBestRBFKernel, best_smo, OptSMOp1, OptSMOp2, OptSMOAcc
def SMO_ParamFinder(data): # Possible set for C-value cBounds = [[1,10,1],[10,100,10],[100,300,20]] # possible set for exponents eBounds = [1,3,1] # possible set for Gamma GBound = [-5,2,1] if (data.numInstances()>10): # grid search does 10-fold cross validation; hence number of samples must be more than 10 # Polynomials Kernel gridsearch = GridSearch() acctag = gridsearch.getEvaluation() acctag = SelectedTag('ACC',acctag.getTags()) gridsearch.setEvaluation(acctag) allfilters = AllFilters() gridsearch.setFilter(allfilters) gridsearch.setGridIsExtendable(Boolean(True)) smo = SMO() kernel = PolyKernel() smo.setKernel(kernel) gridsearch.setClassifier(smo) gridsearch.setXProperty(String('classifier.c')) gridsearch.setYProperty(String('classifier.kernel.Exponent')) gridsearch.setXExpression(String('I')) gridsearch.setYExpression(String('I')) best_acc_poly = -float('inf') for cnt in range(0,len(cBounds)): cbound = cBounds[cnt] cmin = cbound[0] cmax = cbound[1] cstep = cbound[2] gridsearch.setXMin(cmin) gridsearch.setXMax(cmax) gridsearch.setXStep(cstep) gridsearch.setYMin(eBounds[0]) gridsearch.setYMax(eBounds[1]) gridsearch.setYStep(eBounds[2]) print "searching for Polykernel C = [", cmin, ",", cmax, "], exponent = [", eBounds[0], ",", eBounds[1], "] ...." 
gridsearch.buildClassifier(data) bestValues = gridsearch.getValues() # --------------------------------- Evaluation bestsmo = SMO() kernel = PolyKernel() kernel.setExponent(bestValues.y) bestsmo.setC(bestValues.x) bestsmo.setKernel(kernel) evaluation = Evaluation(data) output = util.get_buffer_for_predictions()[0] attRange = Range() # no additional attributes output outputDistribution = Boolean(False) # we don't want distribution random = Random(1) numFolds = min(10,data.numInstances()) print "numFolds : ", numFolds evaluation.crossValidateModel(bestsmo,data,numFolds,random,[output, attRange, outputDistribution]) acc = evaluation.pctCorrect() if (acc>best_acc_poly): best_smo_poly = bestsmo best_acc_poly = acc bestValues_poly = bestValues print "Best accuracy so far: ",best_acc_poly print "Best values so far: ",bestValues_poly print "Best accuracy (Poly Kernel): ", best_acc_poly print "Best values (Poly Kernel): ", bestValues_poly print "-----------------------------------------" # RBF Kernel smo = SMO() kernel = RBFKernel() smo.setKernel(kernel) gridsearch.setClassifier(smo) gridsearch.setXProperty(String('classifier.c')) gridsearch.setYProperty(String('classifier.kernel.gamma')) gridsearch.setXExpression(String('I')) gridsearch.setYExpression(String('pow(BASE,I)')) gridsearch.setYBase(10) best_acc_rbf = -float('inf') for cnt in range(0,len(cBounds)): cbound = cBounds[cnt] cmin = cbound[0] cmax = cbound[1] cstep = cbound[2] gridsearch.setXMin(cmin) gridsearch.setXMax(cmax) gridsearch.setXStep(cstep) gridsearch.setYMin(GBound[0]) gridsearch.setYMax(GBound[1]) gridsearch.setYStep(GBound[2]) gridsearch.setYBase(10) print "searching for RBF Kernel C = [", cmin, ",", cmax, "], gamma = [10^", GBound[0], ",10^", GBound[1], "] ...." 
gridsearch.buildClassifier(data) bestValues = gridsearch.getValues() # ----------------------------------- Evaluation bestsmo = SMO() kernel = RBFKernel() kernel.setGamma(pow(10,bestValues.y)) bestsmo.setC(bestValues.x) bestsmo.setKernel(kernel) evaluation = Evaluation(data) output = util.get_buffer_for_predictions()[0] attRange = Range() # no additional attributes output outputDistribution = Boolean(False) # we don't want distribution random = Random(1) numFolds = min(10,data.numInstances()) evaluation.crossValidateModel(bestsmo,data,numFolds,random,[output, attRange, outputDistribution]) acc = evaluation.pctCorrect() if (acc>best_acc_rbf): best_smo_rbf = bestsmo best_acc_rbf = acc bestValues_rbf = bestValues print "Best accuracy so far: ",best_acc_rbf print "Best values so far: ",bestValues_rbf print "Best accuracy (RBF Kernel): ", best_acc_rbf print "Best values (RBF Kernel): ", bestValues_rbf print "-----------------------------------------" if (best_acc_rbf > best_acc_poly): IsBestRBFKernel = True print "best smo classifier is RBF kernel with C = ", bestValues_rbf.x, " and gamma = ", pow(10,bestValues.y) best_smo = best_smo_rbf OptSMOp1 = bestValues_rbf.x OptSMOp2 = pow(10,bestValues.y) OptSMOAcc = best_acc_rbf OptSMOIsRBF = IsBestRBFKernel else: IsBestRBFKernel = False print "best smo classifier is Poly kernel with C = ", bestValues_poly.x, " and exponent = ", bestValues_poly.y best_smo = best_smo_poly OptSMOp1 = bestValues_poly.x OptSMOp2 = bestValues_poly.y OptSMOAcc = best_acc_poly OptSMOIsRBF = IsBestRBFKernel else: # we have very small ssample size OptSMOIsRBF, best_smo, OptSMOp1, OptSMOp2, OptSMOAcc = myGridSearch(data,cBounds,GBound,eBounds) if OptSMOIsRBF: Description = 'SMO classifier(RBF kernel): OptC=' + str(OptSMOp1) + \ ', OptGamma=' + str(OptSMOp2) + ', OptAcc=' + str(OptSMOAcc) else: Description = 'SMO classifier(Poly kernel): OptC=' + str(OptSMOp1) + \ ', OptExponent=' + str(OptSMOp2) + ', OptAcc=' + str(OptSMOAcc) return OptSMOIsRBF, 
best_smo, OptSMOp1, OptSMOp2, OptSMOAcc, Description
def runClassifierAlgo(algo, class_index, training_filename, test_filename, do_model, do_eval, do_predict): """ If <test_filename> Run classifier algorithm <algo> on training data in <training_filename> to build a model then test on data in <test_filename> (equivalent of Weka "Supplied test set") else do 10 fold CV lassifier algorithm <algo> on data in <training_filename> <class_index> is the column containing the dependent variable http://weka.wikispaces.com/Generating+classifier+evaluation+output+manually http://weka.sourceforge.net/doc.dev/weka/classifiers/Evaluation.html """ print ' runClassifierAlgo: training_filename= ', training_filename, ', test_filename=', test_filename misc.checkExists(training_filename) training_file = FileReader(training_filename) training_data = Instances(training_file) if test_filename: test_file = FileReader(test_filename) test_data = Instances(test_file) else: test_data = training_data # set the class Index - the index of the dependent variable training_data.setClassIndex(class_index) test_data.setClassIndex(class_index) # create the model if test_filename: algo.buildClassifier(training_data) evaluation = None # only a trained classifier can be evaluated if do_eval or do_predict: evaluation = Evaluation(test_data) buffer = StringBuffer() # buffer for the predictions attRange = Range() # no additional attributes output outputDistribution = Boolean(False) # we don't want distribution if test_filename: evaluation.evaluateModel(algo, test_data, [buffer, attRange, outputDistribution]) else: # evaluation.evaluateModel(algo, [String('-t ' + training_filename), String('-c 1')]) # print evaluation.toSummaryString() rand = Random(1) evaluation.crossValidateModel(algo, training_data, 4, rand) if False: print 'percentage correct =', evaluation.pctCorrect() print 'area under ROC =', evaluation.areaUnderROC(class_index) confusion_matrix = evaluation.confusionMatrix() for l in confusion_matrix: print '** ', ','.join('%2d'%int(x) for x in l) if 
verbose: if do_model: print '--> Generated model:\n' print algo.toString() if do_eval: print '--> Evaluation:\n' print evaluation.toSummaryString() if do_predict: print '--> Predictions:\n' print buffer return {'model':str(algo), 'eval':str(evaluation.toSummaryString()), 'predict':str(buffer) }
def Bayes_ParamFinder(data): # ----------------------- Evaluation of Naive Bayes without kernel estimation naivebayes = NaiveBayes() evaluation = Evaluation(data) output = util.get_buffer_for_predictions()[0] attRange = Range() # no additional attributes output outputDistribution = Boolean(False) # we don't want distribution random = Random(1) numFolds = min(10,data.numInstances()) evaluation.crossValidateModel(naivebayes,data,numFolds,random,[output, attRange, outputDistribution]) acc_naivebayes = evaluation.pctCorrect() print "Naive Bayesisn accuracy (without kernel density estimation): ", acc_naivebayes # ----------------------- Evaluation of Naive Bayes with kernel estimation naivebayes = NaiveBayes() naivebayes.setUseKernelEstimator(Boolean(True)) # use kernel density estimation evaluation = Evaluation(data) attRange = Range() # no additional attributes output outputDistribution = Boolean(False) # we don't want distribution random = Random(1) numFolds = min(10,data.numInstances()) evaluation.crossValidateModel(naivebayes,data,numFolds,random,[output, attRange, outputDistribution]) acc_naivebayes_withkernel = evaluation.pctCorrect() print "Naive Bayesisn accuracy (with kernel density estimation): ", acc_naivebayes_withkernel # ----------------------- Evaluation of Naive bayes multinomial naivebayesmultinomial = NaiveBayesMultinomial() evaluation = Evaluation(data) attRange = Range() # no additional attributes output outputDistribution = Boolean(False) # we don't want distribution random = Random(1) if (allAttributesPositive(data)): # multinomial bayes classifier only work on positive attributes numFolds = min(10,data.numInstances()) evaluation.crossValidateModel(naivebayesmultinomial,data,numFolds,random,[output, attRange, outputDistribution]) acc_naivemultinomialbayes = evaluation.pctCorrect() else: acc_naivemultinomialbayes = 0 print "Naive Multinomial Bayesisn accuracy : ", acc_naivemultinomialbayes # ------------------------- Comparision if 
(acc_naivemultinomialbayes > acc_naivebayes): if (acc_naivemultinomialbayes > acc_naivebayes_withkernel): IsOptMultinomialBayes = True IsOptNaiveKernelDensity = False acc = acc_naivemultinomialbayes else: IsOptMultinomialBayes = False IsOptNaiveKernelDensity = True acc = acc_naivebayes_withkernel else: if (acc_naivebayes > acc_naivebayes_withkernel): IsOptMultinomialBayes = False IsOptNaiveKernelDensity = False acc = acc_naivebayes else: IsOptMultinomialBayes = False IsOptNaiveKernelDensity = True acc = acc_naivebayes_withkernel print "-----------------------------------------" OptBayesAcc = acc if IsOptMultinomialBayes: Description = 'Optimal Bayes classifier is Multinomial Bayes: OptAcc = ' + str(OptBayesAcc) elif IsOptNaiveKernelDensity: Description = 'Optimal Bayes classifier is Naive Bayes with kernel density estimation: OptAcc = ' +\ str(OptBayesAcc) else: Description = 'Optimal Bayes classifier is Naive Bayes: OptAcc = ' + str(OptBayesAcc) return IsOptMultinomialBayes, IsOptNaiveKernelDensity, OptBayesAcc, Description
def AdaBoostedSimpleLogistic_ParamFinder(data, param1, param2):
    """Tune AdaBoostM1 wrapped around a SimpleLogistic base classifier.

    Two strategies are compared:
      1. Boost the ALREADY-OPTIMAL SimpleLogistic (configured from param1/param2),
         grid searching AdaBoost's weightThreshold x numIterations.
      2. Boost a fresh SimpleLogistic, grid searching the base classifier's
         numBoostingIterations x AdaBoost's numIterations.

    data   -- Weka Instances with the class index already set
    param1 -- heuristicStop for the optimal SimpleLogistic (strategy 1)
    param2 -- numBoostingIterations for the optimal SimpleLogistic (strategy 1)

    Returns (is_strategy1_best, classifier, opt_p1, opt_p2, opt_accuracy,
    description); opt_p1 is the weight threshold (strategy 1) or the number
    of boosting iterations (strategy 2).
    """
    # Adaboost params: Possible set for Weight Threshold
    WeightThresholdBounds = [99, 100, 1]
    # Adaboost params: possible set for NumIteration
    NumItrBound = [5, 50, 5]
    # Simple Logisitic params: Possible set for num of boosting
    NumBoostIterationBounds = [0, 200, 10]
    # ---- Strategy 1: boost the optimal simple logistic ----
    print "searching for the best parameters to boosting on the optimal simple Logistic ...."
    gridsearch = GridSearch()
    # score grid cells by accuracy
    acctag = gridsearch.getEvaluation()
    acctag = SelectedTag('ACC', acctag.getTags())
    gridsearch.setEvaluation(acctag)
    allfilters = AllFilters()
    gridsearch.setFilter(allfilters)
    gridsearch.setGridIsExtendable(Boolean(True))
    simplelogistic = SimpleLogistic()
    adaboostm = AdaBoostM1()
    # configure the base classifier with the caller-supplied optimal params
    simplelogistic.setHeuristicStop(param1)
    simplelogistic.setNumBoostingIterations(param2)
    adaboostm.setClassifier(simplelogistic)
    gridsearch.setClassifier(adaboostm)
    # X: AdaBoost weightThreshold, Y: AdaBoost numIterations (both linear)
    gridsearch.setXProperty(String('classifier.weightThreshold'))
    gridsearch.setYProperty(String('classifier.numIterations'))
    gridsearch.setXExpression(String('I'))
    gridsearch.setYExpression(String('I'))
    gridsearch.setXMin(WeightThresholdBounds[0])
    gridsearch.setXMax(WeightThresholdBounds[1])
    gridsearch.setXStep(WeightThresholdBounds[2])
    gridsearch.setYMin(NumItrBound[0])
    gridsearch.setYMax(NumItrBound[1])
    gridsearch.setYStep(NumItrBound[2])
    print "searching for best parameters for boosting simple Logistic weightThreshold = [", WeightThresholdBounds[0], ",", WeightThresholdBounds[1], "], # Iterations = [", NumItrBound[0], ",", NumItrBound[1], "] ...."
    gridsearch.buildClassifier(data)
    bestValues1 = gridsearch.getValues()
    # ------------------------------ Evaluation: re-score strategy 1's winner
    simplelogistic = SimpleLogistic()
    bestadaboostm1 = AdaBoostM1()
    simplelogistic.setHeuristicStop(param1)
    simplelogistic.setNumBoostingIterations(param2)
    bestadaboostm1.setWeightThreshold(int(bestValues1.x))
    bestadaboostm1.setNumIterations(int(bestValues1.y))
    bestadaboostm1.setClassifier(simplelogistic)
    evaluation = Evaluation(data)
    output = util.get_buffer_for_predictions()[0]
    attRange = Range()                   # no additional attributes output
    outputDistribution = Boolean(False)  # we don't want distribution
    random = Random(1)
    numFolds = min(10, data.numInstances())
    evaluation.crossValidateModel(bestadaboostm1, data, numFolds, random, [output, attRange, outputDistribution])
    best_acc1 = evaluation.pctCorrect()
    print "best accuracy by boosting the optimal simple Logistic classifier: ", best_acc1
    print "Optimal weight Threshold Percent : ", bestValues1.x, "Optimal number of Iterations : ", bestValues1.y
    print "-----------------------------------------"
    # -------------------------------------------------------------------------------------------------------------------------
    # ---- Strategy 2: boost a fresh SimpleLogistic, searching the base
    # classifier's numBoostingIterations against AdaBoost's numIterations ----
    simplelogistic = SimpleLogistic()
    adaboostm = AdaBoostM1()
    adaboostm.setClassifier(simplelogistic)
    gridsearch.setClassifier(adaboostm)
    # nested property path reaches through AdaBoost into the base classifier
    gridsearch.setXProperty(String('classifier.classifier.numBoostingIterations'))
    gridsearch.setYProperty(String('classifier.numIterations'))
    gridsearch.setXExpression(String('I'))
    gridsearch.setYExpression(String('I'))
    # NOTE(review): XBase only matters for pow(BASE,I) expressions; with the
    # linear 'I' expression above this call appears inert -- confirm.
    gridsearch.setXBase(10)
    gridsearch.setXMin(NumBoostIterationBounds[0])
    gridsearch.setXMax(NumBoostIterationBounds[1])
    gridsearch.setXStep(NumBoostIterationBounds[2])
    gridsearch.setYMin(NumItrBound[0])
    gridsearch.setYMax(NumItrBound[1])
    gridsearch.setYStep(NumItrBound[2])
    print "searching for number of boosting Iterations bound = [", NumBoostIterationBounds[0], ",", NumBoostIterationBounds[1], "], # Iteration = [", NumItrBound[0], ",", NumItrBound[1], "] ...."
    gridsearch.buildClassifier(data)
    bestValues2 = gridsearch.getValues()
    # ------------------ Evaluation: re-score strategy 2's winner
    simplelogistic = SimpleLogistic()
    bestadaboostm2 = AdaBoostM1()
    simplelogistic.setNumBoostingIterations(int(bestValues2.x))
    bestadaboostm2.setNumIterations(int(bestValues2.y))
    bestadaboostm2.setClassifier(simplelogistic)
    evaluation = Evaluation(data)
    output = util.get_buffer_for_predictions()[0]
    attRange = Range()                   # no additional attributes output
    outputDistribution = Boolean(False)  # we don't want distribution
    random = Random(1)
    numFolds = min(10, data.numInstances())
    evaluation.crossValidateModel(bestadaboostm2, data, numFolds, random, [output, attRange, outputDistribution])
    best_acc2 = evaluation.pctCorrect()
    print "best accuracy by boosting the Simple Logistic classifier (with optimization over ridge): ", best_acc2
    print "Optimal number of boosting Iteration : ", bestValues2.x, "Optimal number of Iteration : ", bestValues2.y
    print "-----------------------------------------"
    # ---- pick the winning strategy (ties favor strategy 1) ----
    print "Final optimal boosting classifier:"
    if (best_acc2 > best_acc1):
        print " Best boosting is based on simple logistic with optimal numBoostingIterations :", \
            bestValues2.x, " optimal numIteration :", bestValues2.y
        print " optimal accuracy: ", best_acc2
        IsOptimalBoostingOnOptSimpleLogistic = False  # is optimal boosting based on optimal simple Logistic ?
        IsOptBoostOnOptSimpLog = IsOptimalBoostingOnOptSimpleLogistic
        OptBoostSimpLog = bestadaboostm2
        OptBoostSimpLogp1 = bestValues2.x
        OptBoostSimpLogp2 = bestValues2.y
        OptBoostSimpLogAcc = best_acc2
    else:
        print " Best boosting is based on optimal simple Logistic with optimal weight Threshold :", \
            bestValues1.x, " optimal numIteration :", bestValues1.y
        print " optimal accuracy: ", best_acc1
        IsOptimalBoostingOnOptSimpleLogistic = True  # is optimal boosting based on optimal simple Logistic ?
        IsOptBoostOnOptSimpLog = IsOptimalBoostingOnOptSimpleLogistic
        OptBoostSimpLog = bestadaboostm1
        OptBoostSimpLogp1 = bestValues1.x
        OptBoostSimpLogp2 = bestValues1.y
        OptBoostSimpLogAcc = best_acc1
    if IsOptBoostOnOptSimpLog:
        Description = 'Boosting optimal simple logistic classifier: OptWeightThreshold = ' + \
            str(OptBoostSimpLogp1) + ', OptNumIterations=' + \
            str(OptBoostSimpLogp2) + ', OptAcc = ' + str(OptBoostSimpLogAcc)
    else:
        Description = 'Boosting simple logistic classifier: OptNumBoostingIterations = ' + \
            str(OptBoostSimpLogp1) + ', OptNumIterations=' + \
            str(OptBoostSimpLogp2) + ', OptAcc = ' + str(OptBoostSimpLogAcc)
    return IsOptBoostOnOptSimpLog, OptBoostSimpLog, OptBoostSimpLogp1, OptBoostSimpLogp2, \
        OptBoostSimpLogAcc, Description
def BaggingSMO_ParamFinder(data, BestSMOIsRBFKernel, param1, param2): # Possible set for C-value cBounds = [[1,10,1],[10,100,10],[100,300,20]] # possible set bag size percent BagSizePercentBound = [ max(10, int(float(1)/float(data.numInstances())*100)+1 ) ,100,10] # max operation is to make sure that least number of samples are provided to the classifier # possible set for Iteration ItrBound = [5,50,5] # This section tries to boost the best smo print "searching for the best parameters to Bag the best SMO ...." gridsearch = GridSearch() acctag = gridsearch.getEvaluation() acctag = SelectedTag('ACC',acctag.getTags()) gridsearch.setEvaluation(acctag) allfilters = AllFilters() gridsearch.setFilter(allfilters) gridsearch.setGridIsExtendable(Boolean(False)) smo = SMO() bagging = Bagging() if BestSMOIsRBFKernel: kernel = RBFKernel() kernel.setGamma(param2) smo.setKernel(kernel) smo.setC(param1) else: kernel = PolyKernel() kernel.setExponent(param2) smo.setKernel(kernel) smo.setC(param1) bagging.setClassifier(smo) gridsearch.setClassifier(bagging) gridsearch.setXProperty(String('classifier.bagSizePercent')) gridsearch.setYProperty(String('classifier.numIterations')) gridsearch.setXExpression(String('I')) gridsearch.setYExpression(String('I')) gridsearch.setXMin(BagSizePercentBound[0]) gridsearch.setXMax(BagSizePercentBound[1]) gridsearch.setXStep(BagSizePercentBound[2]) gridsearch.setYMin(ItrBound[0]) gridsearch.setYMax(ItrBound[1]) gridsearch.setYStep(ItrBound[2]) print "searching for best parameters for bagging SMO bagSizePercent = [", BagSizePercentBound[0], ",", BagSizePercentBound[1], "], # Iteration = [", ItrBound[0], ",", ItrBound[1], "] ...." 
gridsearch.buildClassifier(data) #bestbagging1 = gridsearch.getBestClassifier() bestValues1 = gridsearch.getValues() # ------------------ Evaluation smo = SMO() bestbagging1 = Bagging() smo.setKernel(kernel) smo.setC(param1) bestbagging1.setBagSizePercent(int(bestValues1.x)) bestbagging1.setNumIterations(int(bestValues1.y)) bestbagging1.setClassifier(smo) evaluation = Evaluation(data) output = util.get_buffer_for_predictions()[0] attRange = Range() # no additional attributes output outputDistribution = Boolean(False) # we don't want distribution random = Random(1) numFolds = min(10,data.numInstances()) evaluation.crossValidateModel(bestbagging1,data,numFolds,random,[output, attRange, outputDistribution]) best_acc1 = evaluation.pctCorrect() bestValues1 = gridsearch.getValues() print "best accuracy by bagging the optimal SMO classifier: ", best_acc1 print "Optimal Bag size Percent : ", bestValues1.x , "Optimal number of Iteration : ", bestValues1.y print "-----------------------------------------" # ------------------------------------------------------------------------------------------------------------------------ # in this section we set the weak classifier to the linear SMO and optimize over c-value of the SMO and number of iteration smo = SMO() kernel = PolyKernel() smo.setKernel(kernel) bagging.setClassifier(smo) gridsearch.setClassifier(bagging) gridsearch.setXProperty(String('classifier.classifier.c')) gridsearch.setYProperty(String('classifier.numIterations')) gridsearch.setXExpression(String('I')) gridsearch.setYExpression(String('I')) gridsearch.setGridIsExtendable(Boolean(True)) best_acc2 = -float('inf') for cnt in range(0,len(cBounds)): cbound = cBounds[cnt] cmin = cbound[0] cmax = cbound[1] cstep = cbound[2] gridsearch.setXMin(cmin) gridsearch.setXMax(cmax) gridsearch.setXStep(cstep) gridsearch.setYMin(ItrBound[0]) gridsearch.setYMax(ItrBound[1]) gridsearch.setYStep(ItrBound[2]) print "searching for RBF Kernel C = [", cmin, ",", cmax, "], # Iteration 
= [", ItrBound[0], ",", ItrBound[1], "] ...." gridsearch.buildClassifier(data) bestValues = gridsearch.getValues() # ------------ Evaluation smo = SMO() bestbagging = Bagging() kernel = PolyKernel() smo.setKernel(kernel) smo.setC(bestValues.x) bestbagging.setNumIterations(int(bestValues.y)) bestbagging.setClassifier(smo) evaluation = Evaluation(data) output = util.get_buffer_for_predictions()[0] attRange = Range() # no additional attributes output outputDistribution = Boolean(False) # we don't want distribution random = Random(1) numFolds = min(10,data.numInstances()) evaluation.crossValidateModel(bestbagging,data,numFolds,random,[output, attRange, outputDistribution]) acc = evaluation.pctCorrect() if (acc>best_acc2): bestbagging2 = bestbagging best_acc2 = acc bestValues2 = bestValues print "Best accuracy so far by bagging linear SMO: ", best_acc2 print "Best values so far by bagging linear SMO: ", bestValues2 print "Best accuracy by bagging linear SMO: ", best_acc2 print "Best values by bagging linear SMO: ", bestValues2 print "-----------------------------------------" print "Final optimal bagging classifier:" if (best_acc2 > best_acc1): print " Best bagging is based on linear SMO with optimal c-value :", bestValues2.x, " optimal numIteration = ", bestValues2.y print " optimal accuracy: ", best_acc2 IsOptimalBaggingIsOptSMO = False # is optimal bagging based on optimal SMO ? IsOptBagOnOptSMO = IsOptimalBaggingIsOptSMO OptBagSMO = bestbagging2 OptBagSMOp1 = bestValues2.x OptBagSMOp2 = bestValues2.y OptBagSMOAcc = best_acc2 else: print " Best bagging is based on optimal SMO with optimal bagSizePercent :", bestValues1.x, " optimal numIteration = ", bestValues1.y print " optimal accuracy: ", best_acc1 IsOptimalBaggingIsOptSMO = True # is optimal bagging based on optimal SMO ? 
IsOptBagOnOptSMO = IsOptimalBaggingIsOptSMO OptBagSMO = bestbagging1 OptBagSMOp1 = bestValues1.x OptBagSMOp2 = bestValues1.y OptBagSMOAcc = best_acc1 if IsOptBagOnOptSMO: Description = 'Bagging on optimal SMO classifier: OptBagSizePercent=' + str(OptBagSMOp1) + \ ', OptNumIterations=' + str(OptBagSMOp2) + ', OptAcc=' + str(OptBagSMOAcc) else: Description = 'Bagging on linear SMO classifier: OptC=' + str(OptBagSMOp1) + \ ', OptNumIterations=' + str(OptBagSMOp2) + ', OptAcc=' + str(OptBagSMOAcc) return IsOptBagOnOptSMO, OptBagSMO, OptBagSMOp1, OptBagSMOp2, OptBagSMOAcc, Description
def BaggingLogistic_ParamFinder(data, param1, param2): # Possible set for Ridge-value RBounds = [-10,2,1] # possible set bag size percent BagSizePercentBound = [ max(10, int(float(1)/float(data.numInstances())*100)+1 ) ,100,10] # max operation is to make sure that least number of samples are provided to the classifier # possible set for Iteration ItrBound = [5,50,5] # This section tries to boost the best logistic print "searching for the best parameters to Bag the optimal Logistic ...." gridsearch = GridSearch() acctag = gridsearch.getEvaluation() acctag = SelectedTag('ACC',acctag.getTags()) gridsearch.setEvaluation(acctag) allfilters = AllFilters() gridsearch.setFilter(allfilters) gridsearch.setGridIsExtendable(Boolean(False)) logistic = Logistic() bagging = Bagging() logistic.setRidge(param1) logistic.setMaxIts(param2) bagging.setClassifier(logistic) gridsearch.setClassifier(bagging) gridsearch.setXProperty(String('classifier.bagSizePercent')) gridsearch.setYProperty(String('classifier.numIterations')) gridsearch.setXExpression(String('I')) gridsearch.setYExpression(String('I')) gridsearch.setXMin(BagSizePercentBound[0]) gridsearch.setXMax(BagSizePercentBound[1]) gridsearch.setXStep(BagSizePercentBound[2]) gridsearch.setYMin(ItrBound[0]) gridsearch.setYMax(ItrBound[1]) gridsearch.setYStep(ItrBound[2]) print "searching for best parameters for bagging Logistic bagSizePercent = [", BagSizePercentBound[0], ",", BagSizePercentBound[1], "], # Iteration = [", ItrBound[0], ",", ItrBound[1], "] ...." 
gridsearch.buildClassifier(data) #bestbagging1 = gridsearch.getBestClassifier() bestValues1 = gridsearch.getValues() # ------------------------------ Evaluation logistic = Logistic() bestbagging1 = Bagging() logistic.setRidge(param1) logistic.setMaxIts(param2) bestbagging1.setBagSizePercent(int(bestValues1.x)) bestbagging1.setNumIterations(int(bestValues1.y)) bestbagging1.setClassifier(logistic) evaluation = Evaluation(data) output = output = util.get_buffer_for_predictions()[0] attRange = Range() # no additional attributes output outputDistribution = Boolean(False) # we don't want distribution random = Random(1) numFolds = min(10,data.numInstances()) evaluation.crossValidateModel(bestbagging1,data,numFolds,random,[output, attRange, outputDistribution]) best_acc1 = evaluation.pctCorrect() print "best accuracy by bagging the optimal Logistic classifier: ", best_acc1 print "Optimal Bag size Percent: ", bestValues1.x, " Optimal number of Iterations: ", bestValues1.y print "-----------------------------------------" # ------------------------------------------------------------------------------------------------------------------------- # in this section we set the weak classifier to the linear SMO and optimize over c-value of the SMO and number of iteration logistic = Logistic() bagging = Bagging() bagging.setClassifier(logistic) gridsearch.setClassifier(bagging) gridsearch.setXProperty(String('classifier.classifier.ridge')) gridsearch.setYProperty(String('classifier.numIterations')) gridsearch.setXExpression(String('pow(BASE,I)')) gridsearch.setYExpression(String('I')) gridsearch.setXBase(10) gridsearch.setGridIsExtendable(Boolean(True)) gridsearch.setXMin(RBounds[0]) gridsearch.setXMax(RBounds[1]) gridsearch.setXStep(RBounds[2]) gridsearch.setYMin(ItrBound[0]) gridsearch.setYMax(ItrBound[1]) gridsearch.setYStep(ItrBound[2]) print "searching for ridge bound = [10^", RBounds[0], ",10^", RBounds[1], "], # Iteration = [", ItrBound[0], ",", ItrBound[1], "] ...." 
gridsearch.buildClassifier(data) #bestbagging = gridsearch.getBestClassifier() bestValues2 = gridsearch.getValues() # ------------------ Evaluation logistic = Logistic() bestbagging2 = Bagging() logistic.setRidge(pow(10,bestValues2.x)) bestbagging2.setNumIterations(int(bestValues2.y)) bestbagging2.setClassifier(logistic) evaluation = Evaluation(data) output = output = util.get_buffer_for_predictions()[0] attRange = Range() # no additional attributes output outputDistribution = Boolean(False) # we don't want distribution random = Random(1) numFolds = min(10,data.numInstances()) evaluation.crossValidateModel(bestbagging2,data,numFolds,random,[output, attRange, outputDistribution]) best_acc2 = evaluation.pctCorrect() print "best accuracy by bagging the Logistic classifier (with optimization over ridge): ", best_acc2 print "Optimal Ridge value : ", bestValues2.x , "Optimal number of Iteration : ", bestValues2.y print "-----------------------------------------" print "Final optimal bagging classifier:" if (best_acc2 > best_acc1): print " Best bagging is based on logistic with optimal ridge-value :", bestValues2.x, " optimal numIteration :", bestValues2.y print " optimal accuracy: ", best_acc2 IsOptimalBaggingIsOptLogistic = False # is optimal bagging based on optimal Logistic ? IsOptBagOnOptLog = IsOptimalBaggingIsOptLogistic OptBagLog = bestbagging2 OptBagLogp1 = pow(10,bestValues2.x) OptBagLogp2 = bestValues2.y OptBagLogAcc = best_acc2 else: print " Best bagging is based on optimal Logistic with optimal bagSizePercent :", bestValues1.x, " optimal numIteration :", bestValues1.y print " optimal accuracy: ", best_acc1 IsOptimalBaggingIsOptLogistic = True # is optimal bagging based on optimal Logistic ? 
IsOptBagOnOptLog = IsOptimalBaggingIsOptLogistic OptBagLog = bestbagging1 OptBagLogp1 = bestValues1.x OptBagLogp2 = bestValues1.y OptBagLogAcc = best_acc1 if IsOptBagOnOptLog: Description = 'Bagging on optimal logistic classifier: OptBagSizePercent= ' + str(OptBagLogp1) + \ ', OptNumIterations=' + str(OptBagLogp2) + ', OptAcc = ' + str(OptBagLogAcc) else: Description = 'Bagging on logistic classifier: OptRidge= ' + str(OptBagLogp1) + \ ', OptNumIterations=' + str(OptBagLogp2) + ', OptAcc = ' + str(OptBagLogAcc) return IsOptBagOnOptLog, OptBagLog, OptBagLogp1, OptBagLogp2, OptBagLogAcc, Description