def myGridSearch(data,RBound,MBound):
    bestlogistic = None
    best_acc     = -float('inf')
    class bestValues(object):
        m = float('nan')
        r = float('nan')
    for r in range(RBound[0],RBound[1]+RBound[2],RBound[2]):
        for m in range(MBound[0],MBound[1]+MBound[2],MBound[2]):
            logistic = Logistic()
            logistic.setMaxIts(int(m))
            logistic.setRidge(pow(10,r))
            evaluation = Evaluation(data)
            output = util.get_buffer_for_predictions()[0]
            attRange = Range()  # no additional attributes output
            outputDistribution = Boolean(False)  # we don't want distribution
            random = Random(1)
            numFolds = min(10,data.numInstances())
            evaluation.crossValidateModel(logistic,data,numFolds,random,[output, attRange, outputDistribution])
            acc = evaluation.pctCorrect()
            if (acc>best_acc):
                bestlogistic = logistic
                best_acc = acc
                bestValues.m = int(m)
                bestValues.r = pow(10,r)
    print "Best accuracy: ", best_acc
    print "Best values:   M = ", bestValues.m, ", Ridge = ", bestValues.r
    print "-----------------------------------------"
    return bestlogistic, bestValues.r, bestValues.m, best_acc
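# Each bound argument to these manual grid searches is a [start, stop, step] triple,
# expanded with range(start, stop + step, step). For example, with the Logistic
# defaults below (RBounds = [-10, 2, 1], MBounds = [-1, 10, 1]) the search covers
# ridge values 10**-10 .. 10**2 and maxIts values -1 .. 10.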
def myGridSearch(data,NTreeBounds,NFeaturesBounds):
    best_acc = -float('inf')
    bestrandomforest = None
    class bestValues(object):
        t = float('nan')
        f = float('nan')
    for t in range(NTreeBounds[0],NTreeBounds[1]+NTreeBounds[2],NTreeBounds[2]):
        for f in range(NFeaturesBounds[0],NFeaturesBounds[1]+NFeaturesBounds[2],NFeaturesBounds[2]):
            randomforest = RandomForest()
            randomforest.setNumTrees(int(t))
            randomforest.setNumFeatures(int(f))
            evaluation = Evaluation(data)
            output = util.get_buffer_for_predictions()[0]
            attRange = Range()  # no additional attributes output
            outputDistribution = Boolean(False)  # we don't want distribution
            random = Random(1)
            numFolds = min(10,data.numInstances())
            evaluation.crossValidateModel(randomforest,data,numFolds,random,[output, attRange, outputDistribution])
            acc = evaluation.pctCorrect()
            if (acc>best_acc):
                bestrandomforest = randomforest
                best_acc = acc
                bestValues.t = t
                bestValues.f = f
    print "Best accuracy:", best_acc
    print "Best values:  NTreeBounds = ", bestValues.t, ", NFeaturesBounds = ", bestValues.f
    print "-----------------------------------------"
    return bestrandomforest, bestValues.t, bestValues.f, best_acc
def Logistic_ParamFinder(data): 
    # Possible set for Ridge-value
    RBounds = [-10,2,1]
    # possible set for maximum Iteration
    MBounds = [-1,10,1]
    if (data.numInstances()>10):     # grid search does 10-fold cross validation; hence number of samples must be more than 10
        gridsearch = GridSearch()
        acctag = gridsearch.getEvaluation()
        acctag = SelectedTag('ACC',acctag.getTags())
        gridsearch.setEvaluation(acctag)
        allfilters = AllFilters()
        gridsearch.setFilter(allfilters)
        gridsearch.setGridIsExtendable(Boolean(True))
        logistic = Logistic()
        gridsearch.setClassifier(logistic)
        gridsearch.setXProperty(String('classifier.maxIts'))
        gridsearch.setYProperty(String('classifier.ridge'))
        gridsearch.setXExpression(String('I'))
        gridsearch.setYExpression(String('pow(BASE,I)'))
        gridsearch.setXMin(MBounds[0])
        gridsearch.setXMax(MBounds[1])
        gridsearch.setXStep(MBounds[2])
        gridsearch.setYMin(RBounds[0])
        gridsearch.setYMax(RBounds[1])
        gridsearch.setYStep(RBounds[2])
        gridsearch.setYBase(10)
        print "searching for logistic lcassifier Max Iteration = [", MBounds[0], ",", MBounds[1], "], Ridge = [ 10E", RBounds[0], ",10E", RBounds[1], "] ...."
        gridsearch.buildClassifier(data)
        bestValues = gridsearch.getValues()
        # -----------------------  Evaluation
        bestlogistic = Logistic()
        bestlogistic.setMaxIts(int(bestValues.x))
        bestlogistic.setRidge(pow(10,bestValues.y))
        evaluation = Evaluation(data)
        output = util.get_buffer_for_predictions()[0]
        attRange = Range()  # no additional attributes output
        outputDistribution = Boolean(False)  # we don't want distribution
        random = Random(1)
        numFolds = min(10,data.numInstances())
        evaluation.crossValidateModel(bestlogistic,data,numFolds,random,[output, attRange, outputDistribution])
        acc = evaluation.pctCorrect()
        print "best accuracy: ", acc
        print "best logistic classifier with Ridge = ", bestlogistic.getRidge(), " Max Iteration = ", bestlogistic.getMaxIts()
        OptLog = bestlogistic
        OptLogp1 = bestlogistic.getRidge()
        OptLogp2 = bestlogistic.getMaxIts()
        OptLogAcc = acc
    else:
        OptLog, OptLogp1, OptLogp2, OptLogAcc = myGridSearch(data,RBounds,MBounds)
    Description = 'Logistic classifier OptRidge = ' + str(OptLogp1) + \
            ', OptMaxIts = ' + str(OptLogp2) + ', OptAcc = ' + str(OptLogAcc)
    print "-----------------------------------------"
    return OptLog, OptLogp1, OptLogp2, OptLogAcc, Description
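# A minimal usage sketch for the parameter finder above, assuming a Jython
# environment with Weka on the classpath; 'train.arff' is a hypothetical path
# and the class attribute is assumed to be the last column:
def _example_logistic_paramfinder():
    from java.io import FileReader
    from weka.core import Instances
    data = Instances(FileReader('train.arff'))
    data.setClassIndex(data.numAttributes() - 1)
    clf, ridge, maxIts, acc, desc = Logistic_ParamFinder(data)
    print desc
    return clf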
def RandomForest_ParamFinder(data): 
    # possible set for Number of trees
    NTreeBounds = [1,20,1]
    # possible set for number of features
    NFeaturesBounds = [0,20,1]
    if (data.numInstances()>10):     # grid search does 10-fold cross validation; hence number of samples must be more than 10
        gridsearch = GridSearch()
        acctag = gridsearch.getEvaluation()
        acctag = SelectedTag('ACC',acctag.getTags())
        gridsearch.setEvaluation(acctag)
        allfilters = AllFilters()
        gridsearch.setFilter(allfilters)
        gridsearch.setGridIsExtendable(Boolean(True))
        randomforest = RandomForest()
        gridsearch.setClassifier(randomforest)
        gridsearch.setXProperty(String('classifier.numTrees'))
        gridsearch.setYProperty(String('classifier.numFeatures'))
        gridsearch.setXExpression(String('I'))
        gridsearch.setYExpression(String('I'))
        gridsearch.setXMin(NTreeBounds[0])
        gridsearch.setXMax(NTreeBounds[1])
        gridsearch.setXStep(NTreeBounds[2])
        gridsearch.setYMin(NFeaturesBounds[0])
        gridsearch.setYMax(NFeaturesBounds[1])
        gridsearch.setYStep(NFeaturesBounds[2])
        gridsearch.setYBase(10)
        print "searching for random-forest NumTrees = [", NTreeBounds[0], ",", NTreeBounds[1], "], NumFeatures = [ ", NFeaturesBounds[0], ",", NFeaturesBounds[1], "] ...."
        gridsearch.buildClassifier(data)
        bestValues = gridsearch.getValues()
        # -----------------------  Evaluation
        bestrandomforest = RandomForest()
        bestrandomforest.setNumTrees(int(bestValues.x))
        bestrandomforest.setNumFeatures(int(bestValues.y))
        evaluation = Evaluation(data)
        output = util.get_buffer_for_predictions()[0]
        attRange = Range()  # no additional attributes output
        outputDistribution = Boolean(False)  # we don't want distribution
        random = Random(1)
        numFolds = min(10,data.numInstances())
        evaluation.crossValidateModel(bestrandomforest,data,numFolds,random,[output, attRange, outputDistribution])
        acc = evaluation.pctCorrect()
        print "best accuracy: ", acc
        print "best random-forest classifier with NumTrees=",bestValues.x , ", NumFeatures = ", bestValues.y
        OptRndFrst = bestrandomforest
        OptRndFrstp1 = bestValues.x
        OptRndFrstp2 = bestValues.y
        OptRndFrstAcc = acc
    else:
        OptRndFrst, OptRndFrstp1, OptRndFrstp2, OptRndFrstAcc = myGridSearch(data,NTreeBounds,NFeaturesBounds) 
    Description = 'Random-Forest classifier: OptNumTrees = ' + str(OptRndFrstp1) + \
            ', OptNumFeatures = ' + str(OptRndFrstp2) + ', OptAcc = ' + str(OptRndFrstAcc)
    print "-----------------------------------------"
    return OptRndFrst, OptRndFrstp1, OptRndFrstp2, OptRndFrstAcc, Description
def myGridSearch(data,cBounds,GBound,eBounds):
    IsBestRBFKernel = False
    best_acc_poly = -float('inf')
    best_acc_rbf = -float('inf')
    # Poly Kernel 
    class bestValues_poly(object):
        x = float('nan')
        y = float('nan')
    for Cbnd in cBounds:
        for c in range(Cbnd[0],Cbnd[1]+Cbnd[2],Cbnd[2]):
            for e in range(eBounds[0],eBounds[1]+eBounds[2],eBounds[2]):
                smo = SMO()
                kernel = PolyKernel()
                kernel.setExponent(e)
                smo.setC(c)
                smo.setKernel(kernel)
                evaluation = Evaluation(data)
                output = util.get_buffer_for_predictions()[0]
                attRange = Range()  # no additional attributes output
                outputDistribution = Boolean(False)  # we don't want distribution
                random = Random(1)
                numFolds = min(10,data.numInstances())
                evaluation.crossValidateModel(smo,data,numFolds,random,[output, attRange, outputDistribution])
                acc = evaluation.pctCorrect()
                if (acc>best_acc_poly):
                    best_smo_poly = smo
                    best_acc_poly = acc
                    bestValues_poly.x = c
                    bestValues_poly.y = e
    print "Best accuracy (Poly Kernel): ", best_acc_poly
    print "Best values (Poly Kernel):   C = ", bestValues_poly.x, ", exponent = ", bestValues_poly.y
    print "-----------------------------------------"
    # RBF Kernel
    class bestValues_rbf(object):
        x = float('nan')
        y = float('nan')
    for Cbnd in cBounds:
        for c in range(Cbnd[0],Cbnd[1]+Cbnd[2],Cbnd[2]):
            for g in range(GBound[0],GBound[1]+GBound[2],GBound[2]):
                smo = SMO()
                kernel = RBFKernel()
                kernel.setGamma(pow(10,g))
                smo.setC(c)
                smo.setKernel(kernel)
                evaluation = Evaluation(data)
                output = util.get_buffer_for_predictions()[0]
                attRange = Range()  # no additional attributes output
                outputDistribution = Boolean(False)  # we don't want distribution
                random = Random(1)
                numFolds = min(10,data.numInstances())
                evaluation.crossValidateModel(smo,data,numFolds,random,[output, attRange, outputDistribution])
                acc = evaluation.pctCorrect()
                if (acc>best_acc_rbf):
                    best_smo_rbf = smo
                    best_acc_rbf = acc
                    bestValues_rbf.x = c
                    bestValues_rbf.y = g 
    print "Best accuracy (RBF Kernel): ", best_acc_rbf
    print "Best values (RBF Kernel):   C = ", bestValues_rbf.x, ", gamma = ", bestValues_rbf.y
    if (best_acc_rbf > best_acc_poly):
        IsBestRBFKernel = True
        print "best smo classifier is RBF kernel with C = ", bestValues_rbf.x," and gamma = ", pow(10,bestValues_rbf.y)
        best_smo = best_smo_rbf
        OptSMOp1 = bestValues_rbf.x
        OptSMOp2 = pow(10,bestValues_rbf.y)
        OptSMOAcc = best_acc_rbf
        OptSMOIsRBF = IsBestRBFKernel
    else:
        IsBestRBFKernel = False
        print "best smo classifier is Poly kernel with C = ", bestValues_poly.x," and exponent = ", bestValues_poly.y
        best_smo = best_smo_poly
        OptSMOp1 = bestValues_poly.x
        OptSMOp2 = bestValues_poly.y
        OptSMOAcc = best_acc_poly
        OptSMOIsRBF = IsBestRBFKernel
    return IsBestRBFKernel, best_smo, OptSMOp1, OptSMOp2, OptSMOAcc
def SMO_ParamFinder(data):
    # Possible set for C-value
    cBounds = [[1,10,1],[10,100,10],[100,300,20]]
    # possible set for exponents
    eBounds = [1,3,1]
    # possible set for Gamma
    GBound = [-5,2,1]
    if (data.numInstances()>10):     # grid search does 10-fold cross validation; hence number of samples must be more than 10
        # Polynomial Kernel
        gridsearch = GridSearch()
        acctag = gridsearch.getEvaluation()
        acctag = SelectedTag('ACC',acctag.getTags())
        gridsearch.setEvaluation(acctag)
        allfilters = AllFilters()
        gridsearch.setFilter(allfilters)
        gridsearch.setGridIsExtendable(Boolean(True))
        smo = SMO()
        kernel = PolyKernel()
        smo.setKernel(kernel)
        gridsearch.setClassifier(smo)
        gridsearch.setXProperty(String('classifier.c'))
        gridsearch.setYProperty(String('classifier.kernel.Exponent'))
        gridsearch.setXExpression(String('I'))
        gridsearch.setYExpression(String('I'))
        best_acc_poly = -float('inf')
        for cnt in range(0,len(cBounds)):
            cbound = cBounds[cnt]
            cmin =  cbound[0]
            cmax =  cbound[1]
            cstep = cbound[2]           
            gridsearch.setXMin(cmin)
            gridsearch.setXMax(cmax)
            gridsearch.setXStep(cstep)
            gridsearch.setYMin(eBounds[0])
            gridsearch.setYMax(eBounds[1])
            gridsearch.setYStep(eBounds[2])
            print "searching for Polykernel C = [", cmin, ",", cmax, "], exponent = [", eBounds[0], ",", eBounds[1], "] ...."
            gridsearch.buildClassifier(data)
            bestValues = gridsearch.getValues()
            # --------------------------------- Evaluation
            bestsmo = SMO()
            kernel = PolyKernel()
            kernel.setExponent(bestValues.y)
            bestsmo.setC(bestValues.x)
            bestsmo.setKernel(kernel)
            evaluation = Evaluation(data)
            output = util.get_buffer_for_predictions()[0]
            attRange = Range()  # no additional attributes output
            outputDistribution = Boolean(False)  # we don't want distribution
            random = Random(1)
            numFolds = min(10,data.numInstances())
            print "numFolds : ", numFolds
            evaluation.crossValidateModel(bestsmo,data,numFolds,random,[output, attRange, outputDistribution])
            acc = evaluation.pctCorrect()
            if (acc>best_acc_poly):
                best_smo_poly = bestsmo
                best_acc_poly = acc
                bestValues_poly = bestValues
                print "Best accuracy so far: ",best_acc_poly
                print "Best values so far:   ",bestValues_poly 
        print "Best accuracy (Poly Kernel): ", best_acc_poly
        print "Best values (Poly Kernel):   ", bestValues_poly
        print "-----------------------------------------"
        # RBF Kernel
        smo = SMO()
        kernel = RBFKernel()
        smo.setKernel(kernel)
        gridsearch.setClassifier(smo)
        gridsearch.setXProperty(String('classifier.c'))
        gridsearch.setYProperty(String('classifier.kernel.gamma'))
        gridsearch.setXExpression(String('I'))
        gridsearch.setYExpression(String('pow(BASE,I)'))
        gridsearch.setYBase(10)
        best_acc_rbf = -float('inf')
        for cnt in range(0,len(cBounds)):
            cbound = cBounds[cnt]
            cmin =  cbound[0]
            cmax =  cbound[1]
            cstep = cbound[2]           
            gridsearch.setXMin(cmin)
            gridsearch.setXMax(cmax)
            gridsearch.setXStep(cstep)
            gridsearch.setYMin(GBound[0])
            gridsearch.setYMax(GBound[1])
            gridsearch.setYStep(GBound[2])
            gridsearch.setYBase(10)
            print "searching for RBF Kernel C = [", cmin, ",", cmax, "], gamma = [10^", GBound[0], ",10^", GBound[1], "] ...."
            gridsearch.buildClassifier(data)
            bestValues = gridsearch.getValues()
            # ----------------------------------- Evaluation
            bestsmo = SMO()
            kernel = RBFKernel()
            kernel.setGamma(pow(10,bestValues.y))
            bestsmo.setC(bestValues.x)
            bestsmo.setKernel(kernel)
            evaluation = Evaluation(data)
            output = util.get_buffer_for_predictions()[0]
            attRange = Range()  # no additional attributes output
            outputDistribution = Boolean(False)  # we don't want distribution
            random = Random(1)
            numFolds = min(10,data.numInstances())
            evaluation.crossValidateModel(bestsmo,data,numFolds,random,[output, attRange, outputDistribution])
            acc = evaluation.pctCorrect()
            if (acc>best_acc_rbf):
                best_smo_rbf = bestsmo
                best_acc_rbf = acc
                bestValues_rbf = bestValues
                print "Best accuracy so far: ",best_acc_rbf
                print "Best values so far:   ",bestValues_rbf 
        print "Best accuracy (RBF Kernel): ", best_acc_rbf
        print "Best values (RBF Kernel):   ", bestValues_rbf
        print "-----------------------------------------" 
        if (best_acc_rbf > best_acc_poly):
            IsBestRBFKernel = True
            print "best smo classifier is RBF kernel with C = ", bestValues_rbf.x, " and gamma = ", pow(10,bestValues.y)
            best_smo = best_smo_rbf
            OptSMOp1 = bestValues_rbf.x
            OptSMOp2 = pow(10,bestValues_rbf.y)
            OptSMOAcc = best_acc_rbf
            OptSMOIsRBF = IsBestRBFKernel
        else:
            IsBestRBFKernel = False
            print "best smo classifier is Poly kernel with C = ", bestValues_poly.x, " and exponent = ", bestValues_poly.y
            best_smo = best_smo_poly
            OptSMOp1 = bestValues_poly.x
            OptSMOp2 = bestValues_poly.y
            OptSMOAcc = best_acc_poly
            OptSMOIsRBF = IsBestRBFKernel
    else:    # very small sample size; fall back to the manual grid search
        OptSMOIsRBF, best_smo, OptSMOp1, OptSMOp2, OptSMOAcc  = myGridSearch(data,cBounds,GBound,eBounds)
    if OptSMOIsRBF:
        Description = 'SMO classifier(RBF kernel): OptC=' + str(OptSMOp1) + \
                ', OptGamma=' + str(OptSMOp2) + ', OptAcc=' + str(OptSMOAcc) 
    else:
        Description = 'SMO classifier(Poly kernel): OptC=' + str(OptSMOp1) + \
                ', OptExponent=' + str(OptSMOp2) + ', OptAcc=' + str(OptSMOAcc)
    return OptSMOIsRBF, best_smo, OptSMOp1, OptSMOp2, OptSMOAcc, Description
def runClassifierAlgo(algo, class_index, training_filename, test_filename, do_model, do_eval, do_predict):
    """ If <test_filename>
            Run classifier algorithm <algo> on training data in <training_filename> to build a model
            then test on data in <test_filename> (equivalent of Weka "Supplied test set") 
        else
            do 10 fold CV lassifier algorithm <algo> on data in <training_filename>
        
        <class_index> is the column containing the dependent variable 
        
        http://weka.wikispaces.com/Generating+classifier+evaluation+output+manually
        http://weka.sourceforge.net/doc.dev/weka/classifiers/Evaluation.html
    """
    print ' runClassifierAlgo: training_filename= ', training_filename, ', test_filename=', test_filename
    misc.checkExists(training_filename)

    training_file = FileReader(training_filename)
    training_data = Instances(training_file)
    if test_filename:
        test_file = FileReader(test_filename)
        test_data = Instances(test_file)
    else:
        test_data = training_data

    # set the class index - the index of the dependent variable
    training_data.setClassIndex(class_index)
    test_data.setClassIndex(class_index)

    # create the model
    if test_filename:
        algo.buildClassifier(training_data)

    evaluation = None
    # only a trained classifier can be evaluated
    if do_eval or do_predict:
        evaluation = Evaluation(test_data)
        buffer = StringBuffer()             # buffer for the predictions
        attRange = Range()                  # no additional attributes output
        outputDistribution = Boolean(False) # we don't want distribution
        if test_filename:
            evaluation.evaluateModel(algo, test_data, [buffer, attRange, outputDistribution])
        else:
           # evaluation.evaluateModel(algo, [String('-t ' + training_filename), String('-c 1')])
           # print evaluation.toSummaryString()
            rand = Random(1)
            evaluation.crossValidateModel(algo, training_data, 4, rand)
            if False:
                print 'percentage correct =', evaluation.pctCorrect()
                print 'area under ROC =', evaluation.areaUnderROC(class_index)
                confusion_matrix = evaluation.confusionMatrix()
                for l in confusion_matrix:
                    print '** ', ','.join('%2d'%int(x) for x in l)

    if verbose:
        if do_model:
            print '--> Generated model:\n'
            print algo.toString()
        if do_eval:
            print '--> Evaluation:\n'
            print evaluation.toSummaryString()
        if do_predict:
            print '--> Predictions:\n'
            print buffer

    return {'model':str(algo), 'eval':str(evaluation.toSummaryString()), 'predict':str(buffer) }
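# A minimal usage sketch for runClassifierAlgo (hypothetical file names; NaiveBayes
# is just one example of a Weka classifier to pass in, do_eval must be true so the
# returned 'eval' and 'predict' entries are populated, and the module-level
# 'verbose' flag used above is assumed to be defined):
def _example_run_classifier():
    from weka.classifiers.bayes import NaiveBayes
    result = runClassifierAlgo(NaiveBayes(), 0, 'train.arff', 'test.arff',
                               do_model=True, do_eval=True, do_predict=True)
    print result['eval']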
def Bayes_ParamFinder(data):
    # -----------------------  Evaluation of Naive Bayes without kernel estimation
    naivebayes = NaiveBayes()
    evaluation = Evaluation(data)
    output = util.get_buffer_for_predictions()[0]
    attRange = Range()  # no additional attributes output
    outputDistribution = Boolean(False)  # we don't want distribution
    random = Random(1)
    numFolds = min(10,data.numInstances())
    evaluation.crossValidateModel(naivebayes,data,numFolds,random,[output, attRange, outputDistribution])
    acc_naivebayes = evaluation.pctCorrect()
    print "Naive Bayesisn accuracy (without kernel density estimation): ", acc_naivebayes
    # -----------------------  Evaluation of Naive Bayes with kernel estimation
    naivebayes = NaiveBayes()
    naivebayes.setUseKernelEstimator(Boolean(True))   # use kernel density estimation
    evaluation = Evaluation(data)
    attRange = Range()  # no additional attributes output
    outputDistribution = Boolean(False)  # we don't want distribution
    random = Random(1)
    numFolds = min(10,data.numInstances())
    evaluation.crossValidateModel(naivebayes,data,numFolds,random,[output, attRange, outputDistribution])
    acc_naivebayes_withkernel = evaluation.pctCorrect()
    print "Naive Bayesisn accuracy (with kernel density estimation): ", acc_naivebayes_withkernel
    # -----------------------  Evaluation of Naive bayes multinomial
    naivebayesmultinomial = NaiveBayesMultinomial()
    evaluation = Evaluation(data)
    attRange = Range()  # no additional attributes output
    outputDistribution = Boolean(False)  # we don't want distribution
    random = Random(1)
    if (allAttributesPositive(data)):  # multinomial bayes classifier only work on positive attributes
        numFolds = min(10,data.numInstances())
        evaluation.crossValidateModel(naivebayesmultinomial,data,numFolds,random,[output, attRange, outputDistribution])
        acc_naivemultinomialbayes = evaluation.pctCorrect()
    else:
        acc_naivemultinomialbayes = 0
    print "Naive Multinomial Bayesisn accuracy : ", acc_naivemultinomialbayes
    # ------------------------- Comparision
    if (acc_naivemultinomialbayes > acc_naivebayes):
        if (acc_naivemultinomialbayes > acc_naivebayes_withkernel):
            IsOptMultinomialBayes = True
            IsOptNaiveKernelDensity = False
            acc = acc_naivemultinomialbayes
        else:
            IsOptMultinomialBayes = False
            IsOptNaiveKernelDensity = True
            acc = acc_naivebayes_withkernel
    else:
        if (acc_naivebayes > acc_naivebayes_withkernel):
            IsOptMultinomialBayes = False
            IsOptNaiveKernelDensity = False
            acc = acc_naivebayes
        else:
            IsOptMultinomialBayes = False
            IsOptNaiveKernelDensity = True
            acc = acc_naivebayes_withkernel
    print "-----------------------------------------"
    OptBayesAcc = acc
    if IsOptMultinomialBayes:
        Description = 'Optimal Bayes classifier is Multinomial Bayes: OptAcc = ' + str(OptBayesAcc)
    elif IsOptNaiveKernelDensity:
        Description = 'Optimal Bayes classifier is Naive Bayes with kernel density estimation: OptAcc = ' +\
             str(OptBayesAcc)
    else:
        Description = 'Optimal Bayes classifier is Naive Bayes: OptAcc = ' + str(OptBayesAcc)
    return IsOptMultinomialBayes, IsOptNaiveKernelDensity, OptBayesAcc, Description
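# A minimal follow-up sketch (a judgment call, not part of the original API): since
# Bayes_ParamFinder only returns flags, the caller rebuilds the winning model itself,
# reusing the same module-level Weka imports as above:
def _example_build_best_bayes(data):
    isMultinomial, useKernelDensity, acc, desc = Bayes_ParamFinder(data)
    if isMultinomial:
        clf = NaiveBayesMultinomial()
    else:
        clf = NaiveBayes()
        if useKernelDensity:
            clf.setUseKernelEstimator(Boolean(True))
    clf.buildClassifier(data)
    print desc
    return clf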
def AdaBoostedSimpleLogistic_ParamFinder(data, param1, param2):
    # Adaboost params: Possible set for Weight Threshold 
    WeightThresholdBounds = [99,100,1]
    # Adaboost params: possible set for NumIteration
    NumItrBound = [5,50,5]
    # Simple Logisitic params: Possible set for num of boosting
    NumBoostIterationBounds = [0,200,10]
    # This section tries to boost the best simple logistic
    print "searching for the best parameters to boosting on the optimal simple Logistic ...."
    gridsearch = GridSearch()
    acctag = gridsearch.getEvaluation()
    acctag = SelectedTag('ACC',acctag.getTags())
    gridsearch.setEvaluation(acctag)
    allfilters = AllFilters()
    gridsearch.setFilter(allfilters)
    gridsearch.setGridIsExtendable(Boolean(True))
    simplelogistic = SimpleLogistic()
    adaboostm = AdaBoostM1()
    simplelogistic.setHeuristicStop(param1)
    simplelogistic.setNumBoostingIterations(param2)
    adaboostm.setClassifier(simplelogistic)
    gridsearch.setClassifier(adaboostm)
    gridsearch.setXProperty(String('classifier.weightThreshold'))
    gridsearch.setYProperty(String('classifier.numIterations'))
    gridsearch.setXExpression(String('I'))
    gridsearch.setYExpression(String('I'))
    gridsearch.setXMin(WeightThresholdBounds[0])
    gridsearch.setXMax(WeightThresholdBounds[1])
    gridsearch.setXStep(WeightThresholdBounds[2])
    gridsearch.setYMin(NumItrBound[0])
    gridsearch.setYMax(NumItrBound[1])
    gridsearch.setYStep(NumItrBound[2])
    print "searching for best parameters for boosting simple Logistic weightThreshold = [", WeightThresholdBounds[0], ",", WeightThresholdBounds[1], "], # Iterations = [", NumItrBound[0], ",", NumItrBound[1], "] ...."
    gridsearch.buildClassifier(data)
    bestValues1 = gridsearch.getValues()
    # ------------------------------ Evaluation
    simplelogistic = SimpleLogistic()
    bestadaboostm1 = AdaBoostM1()
    simplelogistic.setHeuristicStop(param1)
    simplelogistic.setNumBoostingIterations(param2)
    bestadaboostm1.setWeightThreshold(int(bestValues1.x))
    bestadaboostm1.setNumIterations(int(bestValues1.y))
    bestadaboostm1.setClassifier(simplelogistic)
    evaluation = Evaluation(data)
    output = util.get_buffer_for_predictions()[0]
    attRange = Range()  # no additional attributes output
    outputDistribution = Boolean(False)  # we don't want distribution
    random = Random(1)
    numFolds = min(10,data.numInstances())
    evaluation.crossValidateModel(bestadaboostm1,data,numFolds,random,[output, attRange, outputDistribution])
    best_acc1 = evaluation.pctCorrect()
    print "best accuracy by boosting the optimal simple Logistic classifier: ", best_acc1
    print "Optimal weight Threshold  Percent : ", bestValues1.x , "Optimal number of Iterations : ", bestValues1.y
    print "-----------------------------------------"
    # -------------------------------------------------------------------------------------------------------------------------
    # in this section we keep SimpleLogistic as the weak classifier and optimize over its number of boosting iterations and the number of AdaBoost iterations
    simplelogistic = SimpleLogistic()
    adaboostm = AdaBoostM1()
    adaboostm.setClassifier(simplelogistic)
    gridsearch.setClassifier(adaboostm)
    gridsearch.setXProperty(String('classifier.classifier.numBoostingIterations'))
    gridsearch.setYProperty(String('classifier.numIterations'))
    gridsearch.setXExpression(String('I'))
    gridsearch.setYExpression(String('I'))
    gridsearch.setXBase(10)
    gridsearch.setXMin(NumBoostIterationBounds[0])
    gridsearch.setXMax(NumBoostIterationBounds[1])
    gridsearch.setXStep(NumBoostIterationBounds[2])
    gridsearch.setYMin(NumItrBound[0])
    gridsearch.setYMax(NumItrBound[1])
    gridsearch.setYStep(NumItrBound[2])
    print "searching for number of boosting Iterations bound  = [", NumBoostIterationBounds[0], ",", NumBoostIterationBounds[1], "], # Iteration = [", NumItrBound[0], ",", NumItrBound[1], "] ...."
    gridsearch.buildClassifier(data)
    bestValues2 = gridsearch.getValues()
    # ------------------ Evaluation
    simplelogistic = SimpleLogistic()
    bestadaboostm2 = AdaBoostM1()
    simplelogistic.setNumBoostingIterations(int(bestValues2.x))
    bestadaboostm2.setNumIterations(int(bestValues2.y))
    bestadaboostm2.setClassifier(simplelogistic)    
    evaluation = Evaluation(data)
    output = util.get_buffer_for_predictions()[0]
    attRange = Range()  # no additional attributes output
    outputDistribution = Boolean(False)  # we don't want distribution
    random = Random(1)
    numFolds = min(10,data.numInstances())
    evaluation.crossValidateModel(bestadaboostm2,data,numFolds,random,[output, attRange, outputDistribution])
    best_acc2 = evaluation.pctCorrect()
    print "best accuracy by boosting the Simple Logistic classifier (with optimization over ridge): ", best_acc2
    print "Optimal number of boosting Iteration : ", bestValues2.x , "Optimal number of Iteration : ", bestValues2.y
    print "-----------------------------------------"
    print "Final optimal boosting classifier:"
    if (best_acc2 > best_acc1):
        print "     Best boosting is based on simple logistic with optimal numBoostingIterations :",\
             bestValues2.x, " optimal numIteration :", bestValues2.y
        print "     optimal accuracy: ", best_acc2
        IsOptimalBoostingOnOptSimpleLogistic = False    # is optimal boosting based on optimal simple Logistic ?
        IsOptBoostOnOptSimpLog = IsOptimalBoostingOnOptSimpleLogistic
        OptBoostSimpLog = bestadaboostm2
        OptBoostSimpLogp1 = bestValues2.x
        OptBoostSimpLogp2 = bestValues2.y
        OptBoostSimpLogAcc = best_acc2
    else:
        print "     Best boosting is based on optimal simple Logistic with optimal weight Threshold :",\
             bestValues1.x, " optimal numIteration :", bestValues1.y
        print "     optimal accuracy: ", best_acc1
        IsOptimalBoostingOnOptSimpleLogistic = True # is optimal boosting based on optimal simple Logistic ?
        IsOptBoostOnOptSimpLog = IsOptimalBoostingOnOptSimpleLogistic
        OptBoostSimpLog = bestadaboostm1
        OptBoostSimpLogp1 = bestValues1.x
        OptBoostSimpLogp2 = bestValues1.y
        OptBoostSimpLogAcc = best_acc1
    if IsOptBoostOnOptSimpLog:
        Description = 'Boosting optimal simple logistic classifier: OptWeightThreshold = ' + \
            str(OptBoostSimpLogp1) + ', OptNumIterations=' + \
            str(OptBoostSimpLogp2) + ', OptAcc = ' + str(OptBoostSimpLogAcc)
    else:
        Description = 'Boosting simple logistic classifier: OptNumBoostingIterations = ' + \
            str(OptBoostSimpLogp1) + ', OptNumIterations=' + \
            str(OptBoostSimpLogp2) + ', OptAcc = ' + str(OptBoostSimpLogAcc)
    return IsOptBoostOnOptSimpLog, OptBoostSimpLog, OptBoostSimpLogp1, OptBoostSimpLogp2, \
            OptBoostSimpLogAcc, Description
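# A minimal usage sketch (the heuristicStop / numBoostingIterations values passed as
# param1 / param2 are illustrative placeholders, e.g. taken from a prior SimpleLogistic
# tuning step; 'data' is a prepared Weka Instances object):
def _example_boosted_simple_logistic(data):
    isOnOptimal, booster, p1, p2, acc, desc = \
        AdaBoostedSimpleLogistic_ParamFinder(data, 50, 0)
    print desc
    return booster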
def BaggingSMO_ParamFinder(data, BestSMOIsRBFKernel, param1, param2):
    # Possible set for C-value
    cBounds = [[1,10,1],[10,100,10],[100,300,20]]
    # possible set bag size percent
    BagSizePercentBound = [ max(10, int(float(1)/float(data.numInstances())*100)+1 )  ,100,10]    # the lower bound ensures each bag still contains at least a minimal number of samples
    # possible set for Iteration
    ItrBound = [5,50,5]
    # This section tries to boost the best smo
    print "searching for the best parameters to Bag the best SMO ...."
    gridsearch = GridSearch()
    acctag = gridsearch.getEvaluation()
    acctag = SelectedTag('ACC',acctag.getTags())
    gridsearch.setEvaluation(acctag)
    allfilters = AllFilters()
    gridsearch.setFilter(allfilters)
    gridsearch.setGridIsExtendable(Boolean(False))
    smo = SMO()
    bagging = Bagging()
    if BestSMOIsRBFKernel:
        kernel = RBFKernel()
        kernel.setGamma(param2)
        smo.setKernel(kernel)
        smo.setC(param1)
    else:
        kernel = PolyKernel()
        kernel.setExponent(param2)
        smo.setKernel(kernel)
        smo.setC(param1)
    bagging.setClassifier(smo)
    gridsearch.setClassifier(bagging)
    gridsearch.setXProperty(String('classifier.bagSizePercent'))
    gridsearch.setYProperty(String('classifier.numIterations'))
    gridsearch.setXExpression(String('I'))
    gridsearch.setYExpression(String('I'))
    gridsearch.setXMin(BagSizePercentBound[0])
    gridsearch.setXMax(BagSizePercentBound[1])
    gridsearch.setXStep(BagSizePercentBound[2])
    gridsearch.setYMin(ItrBound[0])
    gridsearch.setYMax(ItrBound[1])
    gridsearch.setYStep(ItrBound[2])
    print "searching for best parameters for bagging SMO bagSizePercent = [", BagSizePercentBound[0], ",", BagSizePercentBound[1], "], # Iteration = [", ItrBound[0], ",", ItrBound[1], "] ...."
    gridsearch.buildClassifier(data)
    #bestbagging1 = gridsearch.getBestClassifier()
    bestValues1 = gridsearch.getValues()
    # ------------------ Evaluation
    smo = SMO()
    bestbagging1 = Bagging()
    smo.setKernel(kernel)
    smo.setC(param1)
    bestbagging1.setBagSizePercent(int(bestValues1.x))
    bestbagging1.setNumIterations(int(bestValues1.y))
    bestbagging1.setClassifier(smo)
    evaluation = Evaluation(data)
    output = util.get_buffer_for_predictions()[0]
    attRange = Range()  # no additional attributes output
    outputDistribution = Boolean(False)  # we don't want distribution
    random = Random(1)
    numFolds = min(10,data.numInstances())
    evaluation.crossValidateModel(bestbagging1,data,numFolds,random,[output, attRange, outputDistribution])
    best_acc1 = evaluation.pctCorrect()
    print "best accuracy by bagging the optimal SMO classifier: ", best_acc1
    print "Optimal Bag size Percent : ", bestValues1.x , "Optimal number of Iteration : ", bestValues1.y
    print "-----------------------------------------"
    # ------------------------------------------------------------------------------------------------------------------------
    # in this section we set the weak classifier to a linear (Poly kernel) SMO and optimize over the C-value of the SMO and the number of bagging iterations
    smo = SMO()
    kernel = PolyKernel()
    smo.setKernel(kernel)
    bagging.setClassifier(smo)
    gridsearch.setClassifier(bagging)
    gridsearch.setXProperty(String('classifier.classifier.c'))
    gridsearch.setYProperty(String('classifier.numIterations'))
    gridsearch.setXExpression(String('I'))
    gridsearch.setYExpression(String('I'))
    gridsearch.setGridIsExtendable(Boolean(True))
    best_acc2 = -float('inf')
    for cnt in range(0,len(cBounds)):
        cbound = cBounds[cnt]
        cmin =  cbound[0]
        cmax =  cbound[1]
        cstep = cbound[2]           
        gridsearch.setXMin(cmin)
        gridsearch.setXMax(cmax)
        gridsearch.setXStep(cstep)
        gridsearch.setYMin(ItrBound[0])
        gridsearch.setYMax(ItrBound[1])
        gridsearch.setYStep(ItrBound[2])
        print "searching for RBF Kernel C = [", cmin, ",", cmax, "], # Iteration = [", ItrBound[0], ",", ItrBound[1], "] ...."
        gridsearch.buildClassifier(data)
        bestValues = gridsearch.getValues()
        # ------------ Evaluation
        smo = SMO()
        bestbagging = Bagging()
        kernel = PolyKernel()
        smo.setKernel(kernel)
        smo.setC(bestValues.x)
        bestbagging.setNumIterations(int(bestValues.y))
        bestbagging.setClassifier(smo)
        evaluation = Evaluation(data)
        output = util.get_buffer_for_predictions()[0]
        attRange = Range()  # no additional attributes output
        outputDistribution = Boolean(False)  # we don't want distribution
        random = Random(1)
        numFolds = min(10,data.numInstances())
        evaluation.crossValidateModel(bestbagging,data,numFolds,random,[output, attRange, outputDistribution])
        acc = evaluation.pctCorrect()
        if (acc>best_acc2):
            bestbagging2 = bestbagging
            best_acc2 = acc
            bestValues2 = bestValues
            print "Best accuracy so far by bagging linear SMO: ", best_acc2 
            print "Best values so far by bagging linear SMO:   ", bestValues2 
    print "Best accuracy by bagging linear SMO: ", best_acc2
    print "Best values by bagging linear SMO:   ", bestValues2
    print "-----------------------------------------"   
    print "Final optimal bagging classifier:"
    if (best_acc2 > best_acc1):
        print "     Best bagging is based on linear SMO with optimal c-value :", bestValues2.x, " optimal numIteration = ", bestValues2.y
        print "     optimal accuracy: ", best_acc2
        IsOptimalBaggingIsOptSMO = False    # is optimal bagging based on optimal SMO ?
        IsOptBagOnOptSMO = IsOptimalBaggingIsOptSMO
        OptBagSMO = bestbagging2
        OptBagSMOp1 = bestValues2.x
        OptBagSMOp2 = bestValues2.y
        OptBagSMOAcc = best_acc2
    else:
        print "     Best bagging is based on optimal SMO with optimal bagSizePercent :", bestValues1.x, " optimal numIteration = ", bestValues1.y
        print "     optimal accuracy: ", best_acc1
        IsOptimalBaggingIsOptSMO = True     # is optimal bagging based on optimal SMO ?
        IsOptBagOnOptSMO = IsOptimalBaggingIsOptSMO
        OptBagSMO = bestbagging1
        OptBagSMOp1 = bestValues1.x
        OptBagSMOp2 = bestValues1.y
        OptBagSMOAcc = best_acc1
    if IsOptBagOnOptSMO:
        Description = 'Bagging on optimal SMO classifier: OptBagSizePercent=' + str(OptBagSMOp1) + \
                ', OptNumIterations=' + str(OptBagSMOp2) + ', OptAcc=' + str(OptBagSMOAcc)
    else:
        Description = 'Bagging on linear SMO classifier: OptC=' + str(OptBagSMOp1) + \
                 ', OptNumIterations=' + str(OptBagSMOp2) + ', OptAcc=' + str(OptBagSMOAcc)
    return IsOptBagOnOptSMO, OptBagSMO,  OptBagSMOp1, OptBagSMOp2, OptBagSMOAcc, Description
def BaggingLogistic_ParamFinder(data, param1, param2):
    # Possible set for Ridge-value
    RBounds = [-10,2,1]
    # possible set bag size percent
    BagSizePercentBound = [ max(10, int(float(1)/float(data.numInstances())*100)+1 )  ,100,10]    # the lower bound ensures each bag still contains at least a minimal number of samples
    # possible set for Iteration
    ItrBound = [5,50,5]
    # This section tries to boost the best logistic
    print "searching for the best parameters to Bag the optimal Logistic ...."
    gridsearch = GridSearch()
    acctag = gridsearch.getEvaluation()
    acctag = SelectedTag('ACC',acctag.getTags())
    gridsearch.setEvaluation(acctag)
    allfilters = AllFilters()
    gridsearch.setFilter(allfilters)
    gridsearch.setGridIsExtendable(Boolean(False))
    logistic = Logistic()
    bagging = Bagging()
    logistic.setRidge(param1)
    logistic.setMaxIts(param2)
    bagging.setClassifier(logistic)
    gridsearch.setClassifier(bagging)
    gridsearch.setXProperty(String('classifier.bagSizePercent'))
    gridsearch.setYProperty(String('classifier.numIterations'))
    gridsearch.setXExpression(String('I'))
    gridsearch.setYExpression(String('I'))
    gridsearch.setXMin(BagSizePercentBound[0])
    gridsearch.setXMax(BagSizePercentBound[1])
    gridsearch.setXStep(BagSizePercentBound[2])
    gridsearch.setYMin(ItrBound[0])
    gridsearch.setYMax(ItrBound[1])
    gridsearch.setYStep(ItrBound[2])
    print "searching for best parameters for bagging Logistic bagSizePercent = [", BagSizePercentBound[0], ",", BagSizePercentBound[1], "], # Iteration = [", ItrBound[0], ",", ItrBound[1], "] ...."
    gridsearch.buildClassifier(data)
    #bestbagging1 = gridsearch.getBestClassifier()
    bestValues1 = gridsearch.getValues()
    # ------------------------------ Evaluation
    logistic = Logistic()
    bestbagging1 = Bagging()
    logistic.setRidge(param1)
    logistic.setMaxIts(param2)
    bestbagging1.setBagSizePercent(int(bestValues1.x))
    bestbagging1.setNumIterations(int(bestValues1.y))
    bestbagging1.setClassifier(logistic)
    evaluation = Evaluation(data)
    output = util.get_buffer_for_predictions()[0]
    attRange = Range()  # no additional attributes output
    outputDistribution = Boolean(False)  # we don't want distribution
    random = Random(1)
    numFolds = min(10,data.numInstances())
    evaluation.crossValidateModel(bestbagging1,data,numFolds,random,[output, attRange, outputDistribution])
    best_acc1 = evaluation.pctCorrect()
    print "best accuracy by bagging the optimal Logistic classifier: ", best_acc1
    print "Optimal Bag size Percent: ", bestValues1.x, " Optimal number of Iterations: ", bestValues1.y
    print "-----------------------------------------"
    # -------------------------------------------------------------------------------------------------------------------------
    # in this section we keep Logistic as the weak classifier and optimize over its ridge value and the number of bagging iterations
    logistic = Logistic()
    bagging = Bagging()
    bagging.setClassifier(logistic)
    gridsearch.setClassifier(bagging)
    gridsearch.setXProperty(String('classifier.classifier.ridge'))
    gridsearch.setYProperty(String('classifier.numIterations'))
    gridsearch.setXExpression(String('pow(BASE,I)'))
    gridsearch.setYExpression(String('I'))
    gridsearch.setXBase(10)
    gridsearch.setGridIsExtendable(Boolean(True))
    gridsearch.setXMin(RBounds[0])
    gridsearch.setXMax(RBounds[1])
    gridsearch.setXStep(RBounds[2])
    gridsearch.setYMin(ItrBound[0])
    gridsearch.setYMax(ItrBound[1])
    gridsearch.setYStep(ItrBound[2])
    print "searching for ridge bound  = [10^", RBounds[0], ",10^", RBounds[1], "], # Iteration = [", ItrBound[0], ",", ItrBound[1], "] ...."
    gridsearch.buildClassifier(data)
    #bestbagging = gridsearch.getBestClassifier()
    bestValues2 = gridsearch.getValues()
    # ------------------ Evaluation
    logistic = Logistic()
    bestbagging2 = Bagging()
    logistic.setRidge(pow(10,bestValues2.x))
    bestbagging2.setNumIterations(int(bestValues2.y))
    bestbagging2.setClassifier(logistic)    
    evaluation = Evaluation(data)
    output = util.get_buffer_for_predictions()[0]
    attRange = Range()  # no additional attributes output
    outputDistribution = Boolean(False)  # we don't want distribution
    random = Random(1)
    numFolds = min(10,data.numInstances())
    evaluation.crossValidateModel(bestbagging2,data,numFolds,random,[output, attRange, outputDistribution])
    best_acc2 = evaluation.pctCorrect()
    print "best accuracy by bagging the Logistic classifier (with optimization over ridge): ", best_acc2
    print "Optimal Ridge value : ", bestValues2.x , "Optimal number of Iteration : ", bestValues2.y
    print "-----------------------------------------"
    print "Final optimal bagging classifier:"
    if (best_acc2 > best_acc1):
        print "     Best bagging is based on logistic with optimal ridge-value :", bestValues2.x, " optimal numIteration :", bestValues2.y
        print "     optimal accuracy: ", best_acc2
        IsOptimalBaggingIsOptLogistic = False   # is optimal bagging based on optimal Logistic ?
        IsOptBagOnOptLog = IsOptimalBaggingIsOptLogistic
        OptBagLog = bestbagging2
        OptBagLogp1 = pow(10,bestValues2.x)
        OptBagLogp2 = bestValues2.y
        OptBagLogAcc = best_acc2
    else:
        print "     Best bagging is based on optimal Logistic with optimal bagSizePercent :", bestValues1.x, " optimal numIteration :", bestValues1.y
        print "     optimal accuracy: ", best_acc1
        IsOptimalBaggingIsOptLogistic = True        # is optimal bagging based on optimal Logistic ?
        IsOptBagOnOptLog = IsOptimalBaggingIsOptLogistic
        OptBagLog = bestbagging1
        OptBagLogp1 = bestValues1.x
        OptBagLogp2 = bestValues1.y
        OptBagLogAcc = best_acc1
    if IsOptBagOnOptLog:
        Description = 'Bagging on optimal logistic classifier: OptBagSizePercent= ' + str(OptBagLogp1) + \
                ', OptNumIterations=' + str(OptBagLogp2) + ', OptAcc = ' + str(OptBagLogAcc)
    else:
        Description = 'Bagging on logistic classifier: OptRidge= ' + str(OptBagLogp1) + \
                ', OptNumIterations=' + str(OptBagLogp2) + ', OptAcc = ' + str(OptBagLogAcc)
    return IsOptBagOnOptLog, OptBagLog,  OptBagLogp1, OptBagLogp2, OptBagLogAcc, Description
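# A minimal end-to-end sketch chaining the SMO tuning and bagging helpers above
# (assumes 'data' is a Weka Instances object with its class index already set):
def _example_smo_pipeline(data):
    isRBF, smo, c_value, kernel_param, acc, desc = SMO_ParamFinder(data)
    print desc
    isBagOnOpt, bagger, bag_p1, bag_p2, bag_acc, bag_desc = \
        BaggingSMO_ParamFinder(data, isRBF, c_value, kernel_param)
    print bag_desc
    if bag_acc > acc:
        return bagger
    return smo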