def readCross(num,type,numtrees):

    filename=resultFile+'_'+type+'_'+num+'_all.csv'
    loader=CSVLoader()
    loader.setSource(File(filename))
    data=loader.getDataSet()
    #print data.numAttributes()    
    
    data.setClassIndex(data.numAttributes()-1)

    rf=RF()
    rf.setNumTrees(numtrees)
    #pred_output = PredictionOutput( classname="weka.classifiers.evaluation.output.prediction.PlainText", options=["-distribution"]) 
    buffer = StringBuffer()  # buffer for the predictions
    output=PlainText()
    output.setHeader(data)
    output.setBuffer(buffer)
    output.setOutputDistribution(True) 
    attRange = Range()  # attributes to output
    outputDistributions = Boolean(True)
    evaluator=Evaluation(data) 
    
    evaluator.crossValidateModel(rf,data,10, Random(1),[output,attRange,outputDistributions])
    

    print evaluator.toSummaryString()
    print evaluator.toClassDetailsString()
    print evaluator.toMatrixString()
    return [evaluator.weightedPrecision(),evaluator.weightedRecall(),evaluator.weightedFMeasure(),evaluator.weightedMatthewsCorrelation(),evaluator.weightedFalseNegativeRate(),evaluator.weightedFalsePositiveRate(),evaluator.weightedTruePositiveRate(),evaluator.weightedTrueNegativeRate(),evaluator.weightedAreaUnderROC()]
def myGridSearch(data,RBound,MBound):
    bestlogistic = None
    best_acc     = -float('inf')
    class bestValues(object):
        m = float('nan')
        r = float('nan')
    for r in range(RBound[0],RBound[1]+RBound[2],RBound[2]):
        for m in range(MBound[0],MBound[1]+MBound[2],MBound[2]):
            logistic = Logistic()
            logistic.setMaxIts(int(m))
            logistic.setRidge(pow(10,r))
            evaluation = Evaluation(data)
            output = util.get_buffer_for_predictions()[0]
            attRange = Range()  # no additional attributes output
            outputDistribution = Boolean(False)  # we don't want distribution
            random = Random(1)
            numFolds = min(10,data.numInstances())
            evaluation.crossValidateModel(logistic,data,numFolds,random,[output, attRange, outputDistribution])
            acc = evaluation.pctCorrect()
            if (acc>best_acc):
                bestlogistic = logistic
                best_acc = acc
                bestValues.m = int(m)
                bestValues.r = pow(10,r)
    print "Best accuracy: ", best_acc
    print "Best values:   M = ", bestValues.m, ", Ridge = ", bestValues.r
    print "-----------------------------------------"
    return bestlogistic, bestValues.r, bestValues.m, best_acc
def myGridSearch(data,NTreeBounds,NFeaturesBounds):
    best_acc = -float('inf')
    bestrandomforest = None
    class bestValues(object):
        t = float('nan')
        f = float('nan')
    for t in range(NTreeBounds[0],NTreeBounds[1]+NTreeBounds[2],NTreeBounds[2]):
        for f in range(NFeaturesBounds[0],NFeaturesBounds[1]+NFeaturesBounds[2],NFeaturesBounds[2]):
            randomforest = RandomForest()
            randomforest.setNumTrees(int(t))
            randomforest.setNumFeatures(int(f))
            evaluation = Evaluation(data)
            output = output = util.get_buffer_for_predictions()[0]
            attRange = Range()  # no additional attributes output
            outputDistribution = Boolean(False)  # we don't want distribution
            random = Random(1)
            numFolds = min(10,data.numInstances())
            evaluation.crossValidateModel(randomforest,data,numFolds,random,[output, attRange, outputDistribution])
            acc = evaluation.pctCorrect()
            if (acc>best_acc):
                bestrandomforest = randomforest
                best_acc = acc
                bestValues.t = t
                bestValues.f = f
    print "Best accuracy:", best_acc
    print "Best values:  NTreeBounds = ", bestValues.t, ", NFeaturesBounds = ", bestValues.f
    print "-----------------------------------------"
    return bestrandomforest, bestValues.t, bestValues.f, best_acc
def Logistic_ParamFinder(data): 
    # Possible set for Ridge-value
    RBounds = [-10,2,1]
    # possible set for maximum Iteration
    MBounds = [-1,10,1]
    if (data.numInstances()>10):     # grid search does 10-fold cross validation; hence number of samples must be more than 10
        gridsearch = GridSearch()
        acctag = gridsearch.getEvaluation()
        acctag = SelectedTag('ACC',acctag.getTags())
        gridsearch.setEvaluation(acctag)
        allfilters = AllFilters()
        gridsearch.setFilter(allfilters)
        gridsearch.setGridIsExtendable(Boolean(True))
        logistic = Logistic()
        gridsearch.setClassifier(logistic)
        gridsearch.setXProperty(String('classifier.maxIts'))
        gridsearch.setYProperty(String('classifier.ridge'))
        gridsearch.setXExpression(String('I'))
        gridsearch.setYExpression(String('pow(BASE,I)'))
        gridsearch.setXMin(MBounds[0])
        gridsearch.setXMax(MBounds[1])
        gridsearch.setXStep(MBounds[2])
        gridsearch.setYMin(RBounds[0])
        gridsearch.setYMax(RBounds[1])
        gridsearch.setYStep(RBounds[2])
        gridsearch.setYBase(10)
        print "searching for logistic lcassifier Max Iteration = [", MBounds[0], ",", MBounds[1], "], Ridge = [ 10E", RBounds[0], ",10E", RBounds[1], "] ...."
        gridsearch.buildClassifier(data)
        bestValues = gridsearch.getValues()
        # -----------------------  Evaluation
        bestlogistic = Logistic()
        bestlogistic.setMaxIts(int(bestValues.x))
        bestlogistic.setRidge(pow(10,bestValues.y))
        evaluation = Evaluation(data)
        output = util.get_buffer_for_predictions()[0]
        attRange = Range()  # no additional attributes output
        outputDistribution = Boolean(False)  # we don't want distribution
        random = Random(1)
        numFolds = min(10,data.numInstances())
        evaluation.crossValidateModel(bestlogistic,data,numFolds,random,[output, attRange, outputDistribution])
        acc = evaluation.pctCorrect()
        print "best accuracy: ", acc
        print "best logistic classifier with Ridge = ", bestlogistic.getRidge(), " Max Iteration = ", bestlogistic.getMaxIts()
        OptLog = bestlogistic
        OptLogp1 = bestlogistic.getRidge()
        OptLogp2 = bestlogistic.getMaxIts()
        OptLogAcc = acc
    else:
        OptLog, OptLogp1, OptLogp2, OptLogAcc = myGridSearch(data,RBounds,MBounds)
    Description = 'Logistic classifier OptRidge = ' + str(OptLogp1) + \
            ', OptMaxIts = ' + str(OptLogp2) + ', OptAcc = ' + str(OptLogAcc)
    print "-----------------------------------------"
    return OptLog, OptLogp1, OptLogp2, OptLogAcc, Description
def RandomForest_ParamFinder(data): 
    # possible set for Number of trees
    NTreeBounds = [1,20,1]
    # possible set for number of features
    NFeaturesBounds = [0,20,1]
    if (data.numInstances()>10):     # grid search does 10-fold cross validation; hence number of samples must be more than 10
        gridsearch = GridSearch()
        acctag = gridsearch.getEvaluation()
        acctag = SelectedTag('ACC',acctag.getTags())
        gridsearch.setEvaluation(acctag)
        allfilters = AllFilters()
        gridsearch.setFilter(allfilters)
        gridsearch.setGridIsExtendable(Boolean(True))
        randomforest = RandomForest()
        gridsearch.setClassifier(randomforest)
        gridsearch.setXProperty(String('classifier.numTrees'))
        gridsearch.setYProperty(String('classifier.numFeatures'))
        gridsearch.setXExpression(String('I'))
        gridsearch.setYExpression(String('I'))
        gridsearch.setXMin(NTreeBounds[0])
        gridsearch.setXMax(NTreeBounds[1])
        gridsearch.setXStep(NTreeBounds[2])
        gridsearch.setYMin(NFeaturesBounds[0])
        gridsearch.setYMax(NFeaturesBounds[1])
        gridsearch.setYStep(NFeaturesBounds[2])
        gridsearch.setYBase(10)
        print "searching for random-forest NumTrees = [", NTreeBounds[0], ",", NTreeBounds[1], "], NumFeatures = [ ", NFeaturesBounds[0], ",", NFeaturesBounds[1], "] ...."
        gridsearch.buildClassifier(data)
        bestValues = gridsearch.getValues()
        # -----------------------  Evaluation
        bestrandomforest = RandomForest()
        bestrandomforest.setNumTrees(int(bestValues.x))
        bestrandomforest.setNumFeatures(int(bestValues.y))
        evaluation = Evaluation(data)
        output = output = util.get_buffer_for_predictions()[0]
        attRange = Range()  # no additional attributes output
        outputDistribution = Boolean(False)  # we don't want distribution
        random = Random(1)
        numFolds = min(10,data.numInstances())
        evaluation.crossValidateModel(bestrandomforest,data,numFolds,random,[output, attRange, outputDistribution])
        acc = evaluation.pctCorrect()
        print "best accuracy: ", acc
        print "best random-forest classifier with NumTrees=",bestValues.x , ", NumFeatures = ", bestValues.y
        OptRndFrst = bestrandomforest
        OptRndFrstp1 = bestValues.x
        OptRndFrstp2 = bestValues.y
        OptRndFrstAcc = acc
    else:
        OptRndFrst, OptRndFrstp1, OptRndFrstp2, OptRndFrstAcc = myGridSearch(data,NTreeBounds,NFeaturesBounds) 
    Description = 'Random-Forest classifier: OptNumTrees = ' + str(OptRndFrstp1) + \
            ', OptNumFeatures = ' + str(OptRndFrstp2) + ', OptAcc = ' + str(OptRndFrstAcc)
    print "-----------------------------------------"
    return OptRndFrst, OptRndFrstp1, OptRndFrstp2, OptRndFrstAcc, Description
# Benchmark IBk (k-NN) build/evaluation time for each neighbour-search
# algorithm, for odd K from 1 to 29.
# NOTE(review): `tree_algorithms`, `data`, `file`, `log` and `rand` are
# defined earlier in the original script (not visible in this chunk);
# `file` also shadows the Python 2 builtin.
cover = CoverTree()
cover.setDistanceFunction(EuclideanDistance())  # only Euclidean Distance function
tree_algorithms.append(cover)
data.setClassIndex(data.numAttributes() - 1)
for num in range(1,30,2):
   # one CSV row per K value: K, then one RMSE column per algorithm
   file.write(str(num))
   for algoknn in tree_algorithms :
      log.write("---------------------------------\nK: " + str(num) + ", Search Algorithm: " + algoknn.__class__.__name__ + "\n")
      algo = IBk()
      algo.setNearestNeighbourSearchAlgorithm(algoknn)
      algo.setKNN(num)
      x = time.time()
      algo.buildClassifier(data)
      log.write("Time to build classifier: " + str(time.time() - x) + "\n")
      evaluation = Evaluation(data)
      output = PlainText()  # plain text output for predictions
      output.setHeader(data)
      buffer = StringBuffer() # buffer to use
      output.setBuffer(buffer)
      attRange = Range()                  # no additional attributes output
      outputDistribution = Boolean(False) # we don't want distribution
      x = time.time()
      #evaluation.evaluateModel(algo, data, [output, attRange, outputDistribution])
      evaluation.crossValidateModel(algo, data, 10, rand, [output, attRange, outputDistribution])
      log.write("Time to evaluate model: " + str(time.time() - x) + "\n")
      log.write(evaluation.toSummaryString())
      file.write("," + str(evaluation.rootMeanSquaredError()))
   file.write("\n")
file.close()
log.close()
    # NOTE(review): these three indented lines appear to be orphaned
    # residue from a function elsewhere in the original script —
    # `confusion_matrix` is not defined in this chunk.
    print "Confusion Matrix:"
    for l in confusion_matrix:
        print '** ', ','.join('%2d' % int(x) for x in l)

# example to collect an individual statistic for all evaluated classifiers
print "------------------------------------"
print "Example to collect an individual statistic for all evaluated classifiers"
print "Kappa"
# `algo_keys`, `my_evaluations` and `algo_dict` come from earlier in the
# original script (not visible in this chunk).
for index in range(len(algo_keys)):
    evaluation = my_evaluations[index]
    key = algo_keys[index]
    algo = algo_dict[key]
    print algo.__class__.__name__ + ": " + str(evaluation.kappa())

# Example K fold cross validate model against training data
# NOTE:  This should be done against test data not training data.
print "Cross validation with 10 folds"
for index in range(len(algo_keys)):
    evaluation = my_evaluations[index]
    key = algo_keys[index]
    algo = algo_dict[key]
    output = PlainText()  # plain text output for predictions
    output.setHeader(data)
    buffer = StringBuffer()  # buffer to use
    output.setBuffer(buffer)
    rand = Random(1)        # fixed seed so fold splits are reproducible
    attRange = Range()  # no additional attributes output
    outputDistribution = Boolean(False)  # we don't want distribution
    evaluation.crossValidateModel(algo, data, 10, rand,
                                  [output, attRange, outputDistribution])
# ----- Exemple #8 (snippet-collection separator; not executable code) -----
def myGridSearch(data,cBounds,GBound,eBounds):
    IsBestRBFKernel = False
    best_acc_poly = -float('inf')
    best_acc_rbf = -float('inf')
    # Poly Kernel 
    class bestValues_poly(object):
        x = float('nan')
        y = float('nan')
    for Cbnd in cBounds:
        for c in range(Cbnd[0],Cbnd[1]+Cbnd[2],Cbnd[2]):
            for e in range(eBounds[0],eBounds[1]+eBounds[2],eBounds[2]):
                smo = SMO()
                kernel = PolyKernel()
                kernel.setExponent(e)
                smo.setC(c)
                smo.setKernel(kernel)
                evaluation = Evaluation(data)
                output = util.get_buffer_for_predictions()[0]
                attRange = Range()  # no additional attributes output
                outputDistribution = Boolean(False)  # we don't want distribution
                random = Random(1)
                numFolds = min(10,data.numInstances())
                evaluation.crossValidateModel(smo,data,numFolds,random,[output, attRange, outputDistribution])
                acc = evaluation.pctCorrect()
                if (acc>best_acc_poly):
                    best_smo_poly = smo
                    best_acc_poly = acc
                    bestValues_poly.x = c
                    bestValues_poly.y = e
    print "Best accuracy (Poly Kernel): ", best_acc_poly
    print "Best values (Poly Kernel):   C = ", bestValues_poly.x, ", exponent = ", bestValues_poly.y
    print "-----------------------------------------"
    # RBF Kernel
    class bestValues_rbf(object):
        x = float('nan')
        y = float('nan')
    for Cbnd in cBounds:
        for c in range(Cbnd[0],Cbnd[1]+Cbnd[2],Cbnd[2]):
            for g in range(GBound[0],GBound[1]+GBound[2],GBound[2]):
                smo = SMO()
                kernel = RBFKernel()
                kernel.setGamma(pow(10,g))
                smo.setC(c)
                smo.setKernel(kernel)
                evaluation = Evaluation(data)
                output = util.get_buffer_for_predictions()[0]
                attRange = Range()  # no additional attributes output
                outputDistribution = Boolean(False)  # we don't want distribution
                random = Random(1)
                numFolds = min(10,data.numInstances())
                evaluation.crossValidateModel(smo,data,numFolds,random,[output, attRange, outputDistribution])
                acc = evaluation.pctCorrect()
                if (acc>best_acc_rbf):
                    best_smo_rbf = smo
                    best_acc_rbf = acc
                    bestValues_rbf.x = c
                    bestValues_rbf.y = g 
    print "Best accuracy (RBF Kernel): ", best_acc_rbf
    print "Best values (RBF Kernel):   C = ", bestValues_rbf.x, ", gamma = ", bestValues_rbf.y
    if (best_acc_rbf > best_acc_poly):
        IsBestRBFKernel = True
        print "best smo classifier is RBF kernel with C = ", bestValues_rbf.x," and gamma = ", pow(10,bestValues_rbf.y)
        best_smo = best_smo_rbf
        OptSMOp1 = bestValues_rbf.x
        OptSMOp2 = pow(10,bestValues_rbf.y)
        OptSMOAcc = best_acc_rbf
        OptSMOIsRBF = IsBestRBFKernel
    else:
        IsBestRBFKernel = False
        print "best smo classifier is Poly kernel with C = ", bestValues_poly.x," and exponent = ", bestValues_poly.y
        best_smo = best_smo_poly
        OptSMOp1 = bestValues_poly.x
        OptSMOp2 = bestValues_poly.y
        OptSMOAcc = best_acc_poly
        OptSMOIsRBF = IsBestRBFKernel
    return IsBestRBFKernel, best_smo, OptSMOp1, OptSMOp2, OptSMOAcc
# ----- Exemple #9 (snippet-collection separator; not executable code) -----
def SMO_ParamFinder(data):
    # Possible set for C-value
    cBounds = [[1,10,1],[10,100,10],[100,300,20]]
    # possible set for exponents
    eBounds = [1,3,1]
    # possible set for Gamma
    GBound = [-5,2,1]
    if (data.numInstances()>10):     # grid search does 10-fold cross validation; hence number of samples must be more than 10
        # Polynomials Kernel
        gridsearch = GridSearch()
        acctag = gridsearch.getEvaluation()
        acctag = SelectedTag('ACC',acctag.getTags())
        gridsearch.setEvaluation(acctag)
        allfilters = AllFilters()
        gridsearch.setFilter(allfilters)
        gridsearch.setGridIsExtendable(Boolean(True))
        smo = SMO()
        kernel = PolyKernel()
        smo.setKernel(kernel)
        gridsearch.setClassifier(smo)
        gridsearch.setXProperty(String('classifier.c'))
        gridsearch.setYProperty(String('classifier.kernel.Exponent'))
        gridsearch.setXExpression(String('I'))
        gridsearch.setYExpression(String('I'))
        best_acc_poly = -float('inf')
        for cnt in range(0,len(cBounds)):
            cbound = cBounds[cnt]
            cmin =  cbound[0]
            cmax =  cbound[1]
            cstep = cbound[2]           
            gridsearch.setXMin(cmin)
            gridsearch.setXMax(cmax)
            gridsearch.setXStep(cstep)
            gridsearch.setYMin(eBounds[0])
            gridsearch.setYMax(eBounds[1])
            gridsearch.setYStep(eBounds[2])
            print "searching for Polykernel C = [", cmin, ",", cmax, "], exponent = [", eBounds[0], ",", eBounds[1], "] ...."
            gridsearch.buildClassifier(data)
            bestValues = gridsearch.getValues()
            # --------------------------------- Evaluation
            bestsmo = SMO()
            kernel = PolyKernel()
            kernel.setExponent(bestValues.y)
            bestsmo.setC(bestValues.x)
            bestsmo.setKernel(kernel)
            evaluation = Evaluation(data)
            output = util.get_buffer_for_predictions()[0]
            attRange = Range()  # no additional attributes output
            outputDistribution = Boolean(False)  # we don't want distribution
            random = Random(1)
            numFolds = min(10,data.numInstances())
            print "numFolds : ", numFolds
            evaluation.crossValidateModel(bestsmo,data,numFolds,random,[output, attRange, outputDistribution])
            acc = evaluation.pctCorrect()
            if (acc>best_acc_poly):
                best_smo_poly = bestsmo
                best_acc_poly = acc
                bestValues_poly = bestValues
                print "Best accuracy so far: ",best_acc_poly
                print "Best values so far:   ",bestValues_poly 
        print "Best accuracy (Poly Kernel): ", best_acc_poly
        print "Best values (Poly Kernel):   ", bestValues_poly
        print "-----------------------------------------"
        # RBF Kernel
        smo = SMO()
        kernel = RBFKernel()
        smo.setKernel(kernel)
        gridsearch.setClassifier(smo)
        gridsearch.setXProperty(String('classifier.c'))
        gridsearch.setYProperty(String('classifier.kernel.gamma'))
        gridsearch.setXExpression(String('I'))
        gridsearch.setYExpression(String('pow(BASE,I)'))
        gridsearch.setYBase(10)
        best_acc_rbf = -float('inf')
        for cnt in range(0,len(cBounds)):
            cbound = cBounds[cnt]
            cmin =  cbound[0]
            cmax =  cbound[1]
            cstep = cbound[2]           
            gridsearch.setXMin(cmin)
            gridsearch.setXMax(cmax)
            gridsearch.setXStep(cstep)
            gridsearch.setYMin(GBound[0])
            gridsearch.setYMax(GBound[1])
            gridsearch.setYStep(GBound[2])
            gridsearch.setYBase(10)
            print "searching for RBF Kernel C = [", cmin, ",", cmax, "], gamma = [10^", GBound[0], ",10^", GBound[1], "] ...."
            gridsearch.buildClassifier(data)
            bestValues = gridsearch.getValues()
            # ----------------------------------- Evaluation
            bestsmo = SMO()
            kernel = RBFKernel()
            kernel.setGamma(pow(10,bestValues.y))
            bestsmo.setC(bestValues.x)
            bestsmo.setKernel(kernel)
            evaluation = Evaluation(data)
            output = util.get_buffer_for_predictions()[0]
            attRange = Range()  # no additional attributes output
            outputDistribution = Boolean(False)  # we don't want distribution
            random = Random(1)
            numFolds = min(10,data.numInstances())
            evaluation.crossValidateModel(bestsmo,data,numFolds,random,[output, attRange, outputDistribution])
            acc = evaluation.pctCorrect()
            if (acc>best_acc_rbf):
                best_smo_rbf = bestsmo
                best_acc_rbf = acc
                bestValues_rbf = bestValues
                print "Best accuracy so far: ",best_acc_rbf
                print "Best values so far:   ",bestValues_rbf 
        print "Best accuracy (RBF Kernel): ", best_acc_rbf
        print "Best values (RBF Kernel):   ", bestValues_rbf
        print "-----------------------------------------" 
        if (best_acc_rbf > best_acc_poly):
            IsBestRBFKernel = True
            print "best smo classifier is RBF kernel with C = ", bestValues_rbf.x, " and gamma = ", pow(10,bestValues.y)
            best_smo = best_smo_rbf
            OptSMOp1 = bestValues_rbf.x
            OptSMOp2 = pow(10,bestValues.y)
            OptSMOAcc = best_acc_rbf
            OptSMOIsRBF = IsBestRBFKernel
        else:
            IsBestRBFKernel = False
            print "best smo classifier is Poly kernel with C = ", bestValues_poly.x, " and exponent = ", bestValues_poly.y
            best_smo = best_smo_poly
            OptSMOp1 = bestValues_poly.x
            OptSMOp2 = bestValues_poly.y
            OptSMOAcc = best_acc_poly
            OptSMOIsRBF = IsBestRBFKernel
    else:    # we have very small ssample size
        OptSMOIsRBF, best_smo, OptSMOp1, OptSMOp2, OptSMOAcc  = myGridSearch(data,cBounds,GBound,eBounds)
    if OptSMOIsRBF:
        Description = 'SMO classifier(RBF kernel): OptC=' + str(OptSMOp1) + \
                ', OptGamma=' + str(OptSMOp2) + ', OptAcc=' + str(OptSMOAcc) 
    else:
        Description = 'SMO classifier(Poly kernel): OptC=' + str(OptSMOp1) + \
                ', OptExponent=' + str(OptSMOp2) + ', OptAcc=' + str(OptSMOAcc)
    return OptSMOIsRBF, best_smo, OptSMOp1, OptSMOp2, OptSMOAcc, Description
def runClassifierAlgo(algo, class_index, training_filename, test_filename, do_model, do_eval, do_predict):
    """ If <test_filename>
            Run classifier algorithm <algo> on training data in <training_filename> to build a model
            then test on data in <test_filename> (equivalent of Weka "Supplied test set") 
        else
            do 10 fold CV lassifier algorithm <algo> on data in <training_filename>
        
        <class_index> is the column containing the dependent variable 
        
        http://weka.wikispaces.com/Generating+classifier+evaluation+output+manually
        http://weka.sourceforge.net/doc.dev/weka/classifiers/Evaluation.html
    """
    print ' runClassifierAlgo: training_filename= ', training_filename, ', test_filename=', test_filename
    misc.checkExists(training_filename)

    training_file = FileReader(training_filename)
    training_data = Instances(training_file)
    if test_filename:
        test_file = FileReader(test_filename)
        test_data = Instances(test_file)
    else:
        test_data = training_data

   # set the class Index - the index of the dependent variable
    training_data.setClassIndex(class_index)
    test_data.setClassIndex(class_index)

    # create the model
    if test_filename:
        algo.buildClassifier(training_data)

    evaluation = None
    # only a trained classifier can be evaluated
    if do_eval or do_predict:
        evaluation = Evaluation(test_data)
        buffer = StringBuffer()             # buffer for the predictions
        attRange = Range()                  # no additional attributes output
        outputDistribution = Boolean(False) # we don't want distribution
        if test_filename:
            evaluation.evaluateModel(algo, test_data, [buffer, attRange, outputDistribution])
        else:
           # evaluation.evaluateModel(algo, [String('-t ' + training_filename), String('-c 1')])
           # print evaluation.toSummaryString()
            rand = Random(1)
            evaluation.crossValidateModel(algo, training_data, 4, rand)
            if False:
                print 'percentage correct =', evaluation.pctCorrect()
                print 'area under ROC =', evaluation.areaUnderROC(class_index)
                confusion_matrix = evaluation.confusionMatrix()
                for l in confusion_matrix:
                    print '** ', ','.join('%2d'%int(x) for x in l)

    if verbose:
        if do_model:
            print '--> Generated model:\n'
            print algo.toString()
        if do_eval:
            print '--> Evaluation:\n'
            print evaluation.toSummaryString()
        if do_predict:
            print '--> Predictions:\n'
            print buffer

    return {'model':str(algo), 'eval':str(evaluation.toSummaryString()), 'predict':str(buffer) }
def Bayes_ParamFinder(data):
    """Compare three Bayes variants by cross-validated accuracy.

    Evaluates NaiveBayes, NaiveBayes with kernel density estimation,
    and NaiveBayesMultinomial (the latter only when all attributes are
    positive), then reports which variant won.

    Returns (IsOptMultinomialBayes, IsOptNaiveKernelDensity,
    OptBayesAcc, Description).
    """
    # -----------------------  Evaluation of Naive Bayes without kernel estimation
    naivebayes = NaiveBayes()
    evaluation = Evaluation(data)
    output = util.get_buffer_for_predictions()[0]
    attRange = Range()  # no additional attributes output
    outputDistribution = Boolean(False)  # we don't want distribution
    random = Random(1)
    numFolds = min(10,data.numInstances())
    evaluation.crossValidateModel(naivebayes,data,numFolds,random,[output, attRange, outputDistribution])
    acc_naivebayes = evaluation.pctCorrect()
    print "Naive Bayesisn accuracy (without kernel density estimation): ", acc_naivebayes
    # -----------------------  Evaluation of Naive Bayes with kernel estimation
    # NOTE(review): the `output` buffer created above is reused by this
    # and the next evaluation (no fresh buffer is fetched) — predictions
    # from all three runs presumably accumulate in it; verify intended.
    naivebayes = NaiveBayes()
    naivebayes.setUseKernelEstimator(Boolean(True))   # use kernel density estimation
    evaluation = Evaluation(data)
    attRange = Range()  # no additional attributes output
    outputDistribution = Boolean(False)  # we don't want distribution
    random = Random(1)
    numFolds = min(10,data.numInstances())
    evaluation.crossValidateModel(naivebayes,data,numFolds,random,[output, attRange, outputDistribution])
    acc_naivebayes_withkernel = evaluation.pctCorrect()
    print "Naive Bayesisn accuracy (with kernel density estimation): ", acc_naivebayes_withkernel
    # -----------------------  Evaluation of Naive bayes multinomial
    naivebayesmultinomial = NaiveBayesMultinomial()
    evaluation = Evaluation(data)
    attRange = Range()  # no additional attributes output
    outputDistribution = Boolean(False)  # we don't want distribution
    random = Random(1)
    if (allAttributesPositive(data)):  # multinomial bayes classifier only work on positive attributes
        numFolds = min(10,data.numInstances())
        evaluation.crossValidateModel(naivebayesmultinomial,data,numFolds,random,[output, attRange, outputDistribution])
        acc_naivemultinomialbayes = evaluation.pctCorrect()
    else:
        # negative attributes present: multinomial variant is disqualified
        acc_naivemultinomialbayes = 0
    print "Naive Multinomial Bayesisn accuracy : ", acc_naivemultinomialbayes
    # ------------------------- Comparision: pick the highest accuracy;
    # ties favour the kernel-density variant over plain NaiveBayes, and
    # plain/kernel NaiveBayes over the multinomial variant.
    if (acc_naivemultinomialbayes > acc_naivebayes):
        if (acc_naivemultinomialbayes > acc_naivebayes_withkernel):
            IsOptMultinomialBayes = True
            IsOptNaiveKernelDensity = False
            acc = acc_naivemultinomialbayes
        else:
            IsOptMultinomialBayes = False
            IsOptNaiveKernelDensity = True
            acc = acc_naivebayes_withkernel
    else:
        if (acc_naivebayes > acc_naivebayes_withkernel):
            IsOptMultinomialBayes = False
            IsOptNaiveKernelDensity = False
            acc = acc_naivebayes
        else:
            IsOptMultinomialBayes = False
            IsOptNaiveKernelDensity = True
            acc = acc_naivebayes_withkernel
    print "-----------------------------------------"
    OptBayesAcc = acc
    if IsOptMultinomialBayes:
        Description = 'Optimal Bayes classifier is Multinomial Bayes: OptAcc = ' + str(OptBayesAcc)
    elif IsOptNaiveKernelDensity:
        Description = 'Optimal Bayes classifier is Naive Bayes with kernel density estimation: OptAcc = ' +\
             str(OptBayesAcc)
    else:
        Description = 'Optimal Bayes classifier is Naive Bayes: OptAcc = ' + str(OptBayesAcc)
    return IsOptMultinomialBayes, IsOptNaiveKernelDensity, OptBayesAcc, Description
def feat_trimming(cl_list, config, f, fe, min_feat, new_instances, num_feat, pos_class_weight, progress,
                  progress_per_iteration, result_list, split_ratio, iterative = False):
    """Iteratively trim the feature set and evaluate classifiers at each size.

    Builds one classifier per code in cl_list (0=Liblinear, 1=BayesNet/K2,
    2=J48, 3=JRip), then repeatedly selects the top `num_feat` features,
    cross-validates each classifier on the reduced data, and reports
    results, shrinking num_feat by `cut_amount` each round down to
    `min_feat`.  With iterative=False a single pass at min_feat is done.

    Returns (new_instances, progress).

    NOTE(review): relies on module-level helpers not visible here
    (select_features, compute_cut_amount, report_results, tSelector,
    variance_analysis, do_temporal_cv, update_progress, is_slow_fs,
    debug) — behavior claims about them are assumptions.
    """
#    print "num_feat:%s"%num_feat
#    print "min_feat:%s"%min_feat
    min_feat = int(min_feat)
    num_feat = int(num_feat)
    if debug:
        print "num_feat:%s"%num_feat
        print "min_feat:%s"%min_feat
        print "In feat_trimming"
    if split_ratio == 0:
        # degenerate ratio: shrink by one feature per round
        cut_amount = 1
    else:
        cut_amount = compute_cut_amount(min_feat, num_feat, split_ratio)

    if not iterative:
        # non-iterative mode: evaluate only at the final (smallest) size
        num_feat = min_feat

    if num_feat > new_instances.numAttributes():
        # nothing to trim — fewer attributes than requested features
        return new_instances, 30

#    else:
#        num_feat -= cut_amount

    # map classifier codes to configured Weka classifier instances
    classifier_list = []
    for cl in cl_list:
        if cl == 0:
            liblinear = Liblinear()
            liblinear.setConvertNominalToBinary(True)
            liblinear.setWeights(str(pos_class_weight) + " 1")
            classifier_list.append(liblinear)
        elif cl == 1:
            # BayesNet with K2 structure search, at most one parent per node
            k2 = weka.classifiers.bayes.net.search.local.K2()
            k2. setMaxNrOfParents(1)
            bayesNet = BayesNet()
            bayesNet.setSearchAlgorithm(k2)
            classifier_list.append(bayesNet)
        elif cl == 2:
            j48 = J48()
            classifier_list.append(j48)
        elif cl == 3:
            jRip = JRip()
            classifier_list.append(jRip)
        else:
            raise ValueError('Unknown Classifier number -- %d given' % cl)

    while num_feat >= min_feat:
        if debug:
            print "Num_feat:%d, min_feat:%d"%(num_feat, min_feat)

        start = time.time()

        for classifier in classifier_list:
            if debug:
                print "Before selecting Features"


        # Assigns to t_selector the classifier
            if config.optimize:
                master_map, header_rows, new_instances, t_selector = select_features(classifier, config.fmeasure, fe,
                    new_instances, num_feat,
                    config.optimize)
            else:
                master_map, header_rows, new_instances, t_selector = select_features(classifier, config.fmeasure, fe,
                    new_instances, num_feat)

            if debug:
                print "After selecting Features"
            print "Num_selected_features:%d"%(new_instances.numAttributes()-1)

            # t_selector may be either a wrapped (tSelector) or a bare classifier
            if isinstance(t_selector, tSelector):
                classifier_name = t_selector.getClassifier().getClass().__name__
            else:
                classifier_name = t_selector.getClass().__name__

            evaluation = Evaluation(new_instances)
            variance_analysis(config, evaluation, new_instances, t_selector)
            if config.temporal_folds:
                do_temporal_cv(t_selector, new_instances, config.folds)
            else:
                evaluation.crossValidateModel(t_selector, new_instances, config.folds, Random(1), [])

            # Add to candidate feature list only if its in the iterative stage
            report_results(classifier_name, config, evaluation, f, fe, new_instances, num_feat, result_list,
                t_selector, add_to_list=not iterative)
        progress = update_progress(progress, progress_per_iteration)
        cut_amount = compute_cut_amount(min_feat, num_feat, split_ratio)

        num_feat -= cut_amount

        # Break slow feature selection after first iteration
        if is_slow_fs(fe):
            break

        if debug:
            elapsed = (time.time() - start)
            print "Time elapsed:%d ms for num_feat=%d, min_feat=%d"%(elapsed, num_feat, min_feat)

    return new_instances, progress
def AdaBoostedSimpleLogistic_ParamFinder(data, param1, param2):
    # Adaboost params: Possible set for Weight Threshold 
    WeightThresholdBounds = [99,100,1]
    # Adaboost params: possible set for NumIteration
    NumItrBound = [5,50,5]
    # Simple Logisitic params: Possible set for num of boosting
    NumBoostIterationBounds = [0,200,10]
    # This section tries to boost the best simple logistic
    print "searching for the best parameters to boosting on the optimal simple Logistic ...."
    gridsearch = GridSearch()
    acctag = gridsearch.getEvaluation()
    acctag = SelectedTag('ACC',acctag.getTags())
    gridsearch.setEvaluation(acctag)
    allfilters = AllFilters()
    gridsearch.setFilter(allfilters)
    gridsearch.setGridIsExtendable(Boolean(True))
    simplelogistic = SimpleLogistic()
    adaboostm = AdaBoostM1()
    simplelogistic.setHeuristicStop(param1)
    simplelogistic.setNumBoostingIterations(param2)
    adaboostm.setClassifier(simplelogistic)
    gridsearch.setClassifier(adaboostm)
    gridsearch.setXProperty(String('classifier.weightThreshold'))
    gridsearch.setYProperty(String('classifier.numIterations'))
    gridsearch.setXExpression(String('I'))
    gridsearch.setYExpression(String('I'))
    gridsearch.setXMin(WeightThresholdBounds[0])
    gridsearch.setXMax(WeightThresholdBounds[1])
    gridsearch.setXStep(WeightThresholdBounds[2])
    gridsearch.setYMin(NumItrBound[0])
    gridsearch.setYMax(NumItrBound[1])
    gridsearch.setYStep(NumItrBound[2])
    print "searching for best parameters for boosting simple Logistic weightThreshold = [", WeightThresholdBounds[0], ",", WeightThresholdBounds[1], "], # Iterations = [", NumItrBound[0], ",", NumItrBound[1], "] ...."
    gridsearch.buildClassifier(data)
    bestValues1 = gridsearch.getValues()
    # ------------------------------ Evaluation
    simplelogistic = SimpleLogistic()
    bestadaboostm1 = AdaBoostM1()
    simplelogistic.setHeuristicStop(param1)
    simplelogistic.setNumBoostingIterations(param2)
    bestadaboostm1.setWeightThreshold(int(bestValues1.x))
    bestadaboostm1.setNumIterations(int(bestValues1.y))
    bestadaboostm1.setClassifier(simplelogistic)
    evaluation = Evaluation(data)
    output = util.get_buffer_for_predictions()[0]
    attRange = Range()  # no additional attributes output
    outputDistribution = Boolean(False)  # we don't want distribution
    random = Random(1)
    numFolds = min(10,data.numInstances())
    evaluation.crossValidateModel(bestadaboostm1,data,numFolds,random,[output, attRange, outputDistribution])
    best_acc1 = evaluation.pctCorrect()
    print "best accuracy by boosting the optimal simple Logistic classifier: ", best_acc1
    print "Optimal weight Threshold  Percent : ", bestValues1.x , "Optimal number of Iterations : ", bestValues1.y
    print "-----------------------------------------"
    # -------------------------------------------------------------------------------------------------------------------------
    # in this section we set the weak classifier to the linear SMO and optimize over c-value of the SMO and number of iteration  
    simplelogistic = SimpleLogistic()
    adaboostm = AdaBoostM1()
    adaboostm.setClassifier(simplelogistic)
    gridsearch.setClassifier(adaboostm)
    gridsearch.setXProperty(String('classifier.classifier.numBoostingIterations'))
    gridsearch.setYProperty(String('classifier.numIterations'))
    gridsearch.setXExpression(String('I'))
    gridsearch.setYExpression(String('I'))
    gridsearch.setXBase(10)
    gridsearch.setXMin(NumBoostIterationBounds[0])
    gridsearch.setXMax(NumBoostIterationBounds[1])
    gridsearch.setXStep(NumBoostIterationBounds[2])
    gridsearch.setYMin(NumItrBound[0])
    gridsearch.setYMax(NumItrBound[1])
    gridsearch.setYStep(NumItrBound[2])
    print "searching for number of boosting Iterations bound  = [", NumBoostIterationBounds[0], ",", NumBoostIterationBounds[1], "], # Iteration = [", NumItrBound[0], ",", NumItrBound[1], "] ...."
    gridsearch.buildClassifier(data)
    bestValues2 = gridsearch.getValues()
    # ------------------ Evaluation
    simplelogistic = SimpleLogistic()
    bestadaboostm2 = AdaBoostM1()
    simplelogistic.setNumBoostingIterations(int(bestValues2.x))
    bestadaboostm2.setNumIterations(int(bestValues2.y))
    bestadaboostm2.setClassifier(simplelogistic)    
    evaluation = Evaluation(data)
    output = util.get_buffer_for_predictions()[0]
    attRange = Range()  # no additional attributes output
    outputDistribution = Boolean(False)  # we don't want distribution
    random = Random(1)
    numFolds = min(10,data.numInstances())
    evaluation.crossValidateModel(bestadaboostm2,data,numFolds,random,[output, attRange, outputDistribution])
    best_acc2 = evaluation.pctCorrect()
    print "best accuracy by boosting the Simple Logistic classifier (with optimization over ridge): ", best_acc2
    print "Optimal number of boosting Iteration : ", bestValues2.x , "Optimal number of Iteration : ", bestValues2.y
    print "-----------------------------------------"
    print "Final optimal boosting classifier:"
    if (best_acc2 > best_acc1):
        print "     Best boosting is based on simple logistic with optimal numBoostingIterations :",\
             bestValues2.x, " optimal numIteration :", bestValues2.y
        print "     optimal accuracy: ", best_acc2
        IsOptimalBoostingOnOptSimpleLogistic = False    # is optimal boosting based on optimal simple Logistic ?
        IsOptBoostOnOptSimpLog = IsOptimalBoostingOnOptSimpleLogistic
        OptBoostSimpLog = bestadaboostm2
        OptBoostSimpLogp1 = bestValues2.x
        OptBoostSimpLogp2 = bestValues2.y
        OptBoostSimpLogAcc = best_acc2
    else:
        print "     Best boosting is based on optimal simple Logistic with optimal weight Threshold :",\
             bestValues1.x, " optimal numIteration :", bestValues1.y
        print "     optimal accuracy: ", best_acc1
        IsOptimalBoostingOnOptSimpleLogistic = True # is optimal boosting based on optimal simple Logistic ?
        IsOptBoostOnOptSimpLog = IsOptimalBoostingOnOptSimpleLogistic
        OptBoostSimpLog = bestadaboostm1
        OptBoostSimpLogp1 = bestValues1.x
        OptBoostSimpLogp2 = bestValues1.y
        OptBoostSimpLogAcc = best_acc1
    if IsOptBoostOnOptSimpLog:
        Description = 'Boosting optimal simple logistic classifier: OptWeightThreshold = ' + \
            str(OptBoostSimpLogp1) + ', OptNumIterations=' + \
            str(OptBoostSimpLogp2) + ', OptAcc = ' + str(OptBoostSimpLogAcc)
    else:
        Description = 'Boosting simple logistic classifier: OptNumBoostingIterations = ' + \
            str(OptBoostSimpLogp1) + ', OptNumIterations=' + \
            str(OptBoostSimpLogp2) + ', OptAcc = ' + str(OptBoostSimpLogAcc)
    return IsOptBoostOnOptSimpLog, OptBoostSimpLog, OptBoostSimpLogp1, OptBoostSimpLogp2, \
            OptBoostSimpLogAcc, Description
def BaggingSMO_ParamFinder(data, BestSMOIsRBFKernel, param1, param2):
    # Possible set for C-value
    cBounds = [[1,10,1],[10,100,10],[100,300,20]]
    # possible set bag size percent
    BagSizePercentBound = [ max(10, int(float(1)/float(data.numInstances())*100)+1 )  ,100,10]    # max operation is to make sure that least number of samples are provided to the classifier
    # possible set for Iteration
    ItrBound = [5,50,5]
    # This section tries to boost the best smo
    print "searching for the best parameters to Bag the best SMO ...."
    gridsearch = GridSearch()
    acctag = gridsearch.getEvaluation()
    acctag = SelectedTag('ACC',acctag.getTags())
    gridsearch.setEvaluation(acctag)
    allfilters = AllFilters()
    gridsearch.setFilter(allfilters)
    gridsearch.setGridIsExtendable(Boolean(False))
    smo = SMO()
    bagging = Bagging()
    if BestSMOIsRBFKernel:
        kernel = RBFKernel()
        kernel.setGamma(param2)
        smo.setKernel(kernel)
        smo.setC(param1)
    else:
        kernel = PolyKernel()
        kernel.setExponent(param2)
        smo.setKernel(kernel)
        smo.setC(param1)
    bagging.setClassifier(smo)
    gridsearch.setClassifier(bagging)
    gridsearch.setXProperty(String('classifier.bagSizePercent'))
    gridsearch.setYProperty(String('classifier.numIterations'))
    gridsearch.setXExpression(String('I'))
    gridsearch.setYExpression(String('I'))
    gridsearch.setXMin(BagSizePercentBound[0])
    gridsearch.setXMax(BagSizePercentBound[1])
    gridsearch.setXStep(BagSizePercentBound[2])
    gridsearch.setYMin(ItrBound[0])
    gridsearch.setYMax(ItrBound[1])
    gridsearch.setYStep(ItrBound[2])
    print "searching for best parameters for bagging SMO bagSizePercent = [", BagSizePercentBound[0], ",", BagSizePercentBound[1], "], # Iteration = [", ItrBound[0], ",", ItrBound[1], "] ...."
    gridsearch.buildClassifier(data)
    #bestbagging1 = gridsearch.getBestClassifier()
    bestValues1 = gridsearch.getValues()
    # ------------------ Evaluation
    smo = SMO()
    bestbagging1 = Bagging()
    smo.setKernel(kernel)
    smo.setC(param1)
    bestbagging1.setBagSizePercent(int(bestValues1.x))
    bestbagging1.setNumIterations(int(bestValues1.y))
    bestbagging1.setClassifier(smo)
    evaluation = Evaluation(data)
    output = util.get_buffer_for_predictions()[0]
    attRange = Range()  # no additional attributes output
    outputDistribution = Boolean(False)  # we don't want distribution
    random = Random(1)
    numFolds = min(10,data.numInstances())
    evaluation.crossValidateModel(bestbagging1,data,numFolds,random,[output, attRange, outputDistribution])
    best_acc1 = evaluation.pctCorrect()
    bestValues1 = gridsearch.getValues()
    print "best accuracy by bagging the optimal SMO classifier: ", best_acc1
    print "Optimal Bag size Percent : ", bestValues1.x , "Optimal number of Iteration : ", bestValues1.y
    print "-----------------------------------------"
    # ------------------------------------------------------------------------------------------------------------------------
    # in this section we set the weak classifier to the linear SMO and optimize over c-value of the SMO and number of iteration  
    smo = SMO()
    kernel = PolyKernel()
    smo.setKernel(kernel)
    bagging.setClassifier(smo)
    gridsearch.setClassifier(bagging)
    gridsearch.setXProperty(String('classifier.classifier.c'))
    gridsearch.setYProperty(String('classifier.numIterations'))
    gridsearch.setXExpression(String('I'))
    gridsearch.setYExpression(String('I'))
    gridsearch.setGridIsExtendable(Boolean(True))
    best_acc2 = -float('inf')
    for cnt in range(0,len(cBounds)):
        cbound = cBounds[cnt]
        cmin =  cbound[0]
        cmax =  cbound[1]
        cstep = cbound[2]           
        gridsearch.setXMin(cmin)
        gridsearch.setXMax(cmax)
        gridsearch.setXStep(cstep)
        gridsearch.setYMin(ItrBound[0])
        gridsearch.setYMax(ItrBound[1])
        gridsearch.setYStep(ItrBound[2])
        print "searching for RBF Kernel C = [", cmin, ",", cmax, "], # Iteration = [", ItrBound[0], ",", ItrBound[1], "] ...."
        gridsearch.buildClassifier(data)
        bestValues = gridsearch.getValues()
        # ------------ Evaluation
        smo = SMO()
        bestbagging = Bagging()
        kernel = PolyKernel()
        smo.setKernel(kernel)
        smo.setC(bestValues.x)
        bestbagging.setNumIterations(int(bestValues.y))
        bestbagging.setClassifier(smo)
        evaluation = Evaluation(data)
        output = util.get_buffer_for_predictions()[0]
        attRange = Range()  # no additional attributes output
        outputDistribution = Boolean(False)  # we don't want distribution
        random = Random(1)
        numFolds = min(10,data.numInstances())
        evaluation.crossValidateModel(bestbagging,data,numFolds,random,[output, attRange, outputDistribution])
        acc = evaluation.pctCorrect()
        if (acc>best_acc2):
            bestbagging2 = bestbagging
            best_acc2 = acc
            bestValues2 = bestValues
            print "Best accuracy so far by bagging linear SMO: ", best_acc2 
            print "Best values so far by bagging linear SMO:   ", bestValues2 
    print "Best accuracy by bagging linear SMO: ", best_acc2
    print "Best values by bagging linear SMO:   ", bestValues2
    print "-----------------------------------------"   
    print "Final optimal bagging classifier:"
    if (best_acc2 > best_acc1):
        print "     Best bagging is based on linear SMO with optimal c-value :", bestValues2.x, " optimal numIteration = ", bestValues2.y
        print "     optimal accuracy: ", best_acc2
        IsOptimalBaggingIsOptSMO = False    # is optimal bagging based on optimal SMO ?
        IsOptBagOnOptSMO = IsOptimalBaggingIsOptSMO
        OptBagSMO = bestbagging2
        OptBagSMOp1 = bestValues2.x
        OptBagSMOp2 = bestValues2.y
        OptBagSMOAcc = best_acc2
    else:
        print "     Best bagging is based on optimal SMO with optimal bagSizePercent :", bestValues1.x, " optimal numIteration = ", bestValues1.y
        print "     optimal accuracy: ", best_acc1
        IsOptimalBaggingIsOptSMO = True     # is optimal bagging based on optimal SMO ?
        IsOptBagOnOptSMO = IsOptimalBaggingIsOptSMO
        OptBagSMO = bestbagging1
        OptBagSMOp1 = bestValues1.x
        OptBagSMOp2 = bestValues1.y
        OptBagSMOAcc = best_acc1
    if IsOptBagOnOptSMO:
        Description = 'Bagging on optimal SMO classifier: OptBagSizePercent=' + str(OptBagSMOp1) + \
                ', OptNumIterations=' + str(OptBagSMOp2) + ', OptAcc=' + str(OptBagSMOAcc)
    else:
        Description = 'Bagging on linear SMO classifier: OptC=' + str(OptBagSMOp1) + \
                 ', OptNumIterations=' + str(OptBagSMOp2) + ', OptAcc=' + str(OptBagSMOAcc)
    return IsOptBagOnOptSMO, OptBagSMO,  OptBagSMOp1, OptBagSMOp2, OptBagSMOAcc, Description
def BaggingLogistic_ParamFinder(data, param1, param2):
    # Possible set for Ridge-value
    RBounds = [-10,2,1]
    # possible set bag size percent
    BagSizePercentBound = [ max(10, int(float(1)/float(data.numInstances())*100)+1 )  ,100,10]    # max operation is to make sure that least number of samples are provided to the classifier
    # possible set for Iteration
    ItrBound = [5,50,5]
    # This section tries to boost the best logistic
    print "searching for the best parameters to Bag the optimal Logistic ...."
    gridsearch = GridSearch()
    acctag = gridsearch.getEvaluation()
    acctag = SelectedTag('ACC',acctag.getTags())
    gridsearch.setEvaluation(acctag)
    allfilters = AllFilters()
    gridsearch.setFilter(allfilters)
    gridsearch.setGridIsExtendable(Boolean(False))
    logistic = Logistic()
    bagging = Bagging()
    logistic.setRidge(param1)
    logistic.setMaxIts(param2)
    bagging.setClassifier(logistic)
    gridsearch.setClassifier(bagging)
    gridsearch.setXProperty(String('classifier.bagSizePercent'))
    gridsearch.setYProperty(String('classifier.numIterations'))
    gridsearch.setXExpression(String('I'))
    gridsearch.setYExpression(String('I'))
    gridsearch.setXMin(BagSizePercentBound[0])
    gridsearch.setXMax(BagSizePercentBound[1])
    gridsearch.setXStep(BagSizePercentBound[2])
    gridsearch.setYMin(ItrBound[0])
    gridsearch.setYMax(ItrBound[1])
    gridsearch.setYStep(ItrBound[2])
    print "searching for best parameters for bagging Logistic bagSizePercent = [", BagSizePercentBound[0], ",", BagSizePercentBound[1], "], # Iteration = [", ItrBound[0], ",", ItrBound[1], "] ...."
    gridsearch.buildClassifier(data)
    #bestbagging1 = gridsearch.getBestClassifier()
    bestValues1 = gridsearch.getValues()
    # ------------------------------ Evaluation
    logistic = Logistic()
    bestbagging1 = Bagging()
    logistic.setRidge(param1)
    logistic.setMaxIts(param2)
    bestbagging1.setBagSizePercent(int(bestValues1.x))
    bestbagging1.setNumIterations(int(bestValues1.y))
    bestbagging1.setClassifier(logistic)
    evaluation = Evaluation(data)
    output = output = util.get_buffer_for_predictions()[0]
    attRange = Range()  # no additional attributes output
    outputDistribution = Boolean(False)  # we don't want distribution
    random = Random(1)
    numFolds = min(10,data.numInstances())
    evaluation.crossValidateModel(bestbagging1,data,numFolds,random,[output, attRange, outputDistribution])
    best_acc1 = evaluation.pctCorrect()
    print "best accuracy by bagging the optimal Logistic classifier: ", best_acc1
    print "Optimal Bag size Percent: ", bestValues1.x, " Optimal number of Iterations: ", bestValues1.y
    print "-----------------------------------------"
    # -------------------------------------------------------------------------------------------------------------------------
    # in this section we set the weak classifier to the linear SMO and optimize over c-value of the SMO and number of iteration  
    logistic = Logistic()
    bagging = Bagging()
    bagging.setClassifier(logistic)
    gridsearch.setClassifier(bagging)
    gridsearch.setXProperty(String('classifier.classifier.ridge'))
    gridsearch.setYProperty(String('classifier.numIterations'))
    gridsearch.setXExpression(String('pow(BASE,I)'))
    gridsearch.setYExpression(String('I'))
    gridsearch.setXBase(10)
    gridsearch.setGridIsExtendable(Boolean(True))
    gridsearch.setXMin(RBounds[0])
    gridsearch.setXMax(RBounds[1])
    gridsearch.setXStep(RBounds[2])
    gridsearch.setYMin(ItrBound[0])
    gridsearch.setYMax(ItrBound[1])
    gridsearch.setYStep(ItrBound[2])
    print "searching for ridge bound  = [10^", RBounds[0], ",10^", RBounds[1], "], # Iteration = [", ItrBound[0], ",", ItrBound[1], "] ...."
    gridsearch.buildClassifier(data)
    #bestbagging = gridsearch.getBestClassifier()
    bestValues2 = gridsearch.getValues()
    # ------------------ Evaluation
    logistic = Logistic()
    bestbagging2 = Bagging()
    logistic.setRidge(pow(10,bestValues2.x))
    bestbagging2.setNumIterations(int(bestValues2.y))
    bestbagging2.setClassifier(logistic)    
    evaluation = Evaluation(data)
    output = output = util.get_buffer_for_predictions()[0]
    attRange = Range()  # no additional attributes output
    outputDistribution = Boolean(False)  # we don't want distribution
    random = Random(1)
    numFolds = min(10,data.numInstances())
    evaluation.crossValidateModel(bestbagging2,data,numFolds,random,[output, attRange, outputDistribution])
    best_acc2 = evaluation.pctCorrect()
    print "best accuracy by bagging the Logistic classifier (with optimization over ridge): ", best_acc2
    print "Optimal Ridge value : ", bestValues2.x , "Optimal number of Iteration : ", bestValues2.y
    print "-----------------------------------------"
    print "Final optimal bagging classifier:"
    if (best_acc2 > best_acc1):
        print "     Best bagging is based on logistic with optimal ridge-value :", bestValues2.x, " optimal numIteration :", bestValues2.y
        print "     optimal accuracy: ", best_acc2
        IsOptimalBaggingIsOptLogistic = False   # is optimal bagging based on optimal Logistic ?
        IsOptBagOnOptLog = IsOptimalBaggingIsOptLogistic
        OptBagLog = bestbagging2
        OptBagLogp1 = pow(10,bestValues2.x)
        OptBagLogp2 = bestValues2.y
        OptBagLogAcc = best_acc2
    else:
        print "     Best bagging is based on optimal Logistic with optimal bagSizePercent :", bestValues1.x, " optimal numIteration :", bestValues1.y
        print "     optimal accuracy: ", best_acc1
        IsOptimalBaggingIsOptLogistic = True        # is optimal bagging based on optimal Logistic ?
        IsOptBagOnOptLog = IsOptimalBaggingIsOptLogistic
        OptBagLog = bestbagging1
        OptBagLogp1 = bestValues1.x
        OptBagLogp2 = bestValues1.y
        OptBagLogAcc = best_acc1
    if IsOptBagOnOptLog:
        Description = 'Bagging on optimal logistic classifier: OptBagSizePercent= ' + str(OptBagLogp1) + \
                ', OptNumIterations=' + str(OptBagLogp2) + ', OptAcc = ' + str(OptBagLogAcc)
    else:
        Description = 'Bagging on logistic classifier: OptRidge= ' + str(OptBagLogp1) + \
                ', OptNumIterations=' + str(OptBagLogp2) + ', OptAcc = ' + str(OptBagLogAcc)
    return IsOptBagOnOptLog, OptBagLog,  OptBagLogp1, OptBagLogp2, OptBagLogAcc, Description