def main():
    """
    Shows how to use the CostSensitiveClassifier.
    """

    # load a dataset
    data_file = helper.get_data_dir() + os.sep + "diabetes.arff"
    helper.print_info("Loading dataset: " + data_file)
    loader = Loader("weka.core.converters.ArffLoader")
    data = loader.load_file(data_file)
    data.class_is_last()

    # classifier
    classifier = SingleClassifierEnhancer(
        classname="weka.classifiers.meta.CostSensitiveClassifier",
        options=["-cost-matrix", "[0 1; 2 0]", "-S", "2"])
    base = Classifier(classname="weka.classifiers.trees.J48", options=["-C", "0.3"])
    classifier.classifier = base

    folds = 10
    evaluation = Evaluation(data)
    evaluation.crossvalidate_model(classifier, data, folds, Random(1))


    print("")
    print("=== Setup ===")
    print("Classifier: " + classifier.to_commandline())
    print("Dataset: " + data.relationname)
    print("")
    print(evaluation.summary("=== " + str(folds) + " -fold Cross-Validation ==="))
def use_classifier(data, cli, args):
    cli = cli.format(cli, **args)
    cls = from_commandline(cli, classname="weka.classifiers.Classifier")
    cls.build_classifier(data)
    evaluation = Evaluation(data)
    evaluation.crossvalidate_model(cls, data, 10, Random(1))
    return cls, evaluation
def myGridSearch(data,RBound,MBound):
    bestlogistic = None
    best_acc     = -float('inf')
    class bestValues(object):
        m = float('nan')
        r = float('nan')
    for r in range(RBound[0],RBound[1]+RBound[2],RBound[2]):
        for m in range(MBound[0],MBound[1]+MBound[2],MBound[2]):
            logistic = Logistic()
            logistic.setMaxIts(int(m))
            logistic.setRidge(pow(10,r))
            evaluation = Evaluation(data)
            output = util.get_buffer_for_predictions()[0]
            attRange = Range()  # no additional attributes output
            outputDistribution = Boolean(False)  # we don't want distribution
            random = Random(1)
            numFolds = min(10,data.numInstances())
            evaluation.crossValidateModel(logistic,data,numFolds,random,[output, attRange, outputDistribution])
            acc = evaluation.pctCorrect()
            if (acc>best_acc):
                bestlogistic = logistic
                best_acc = acc
                bestValues.m = int(m)
                bestValues.r = pow(10,r)
    print "Best accuracy: ", best_acc
    print "Best values:   M = ", bestValues.m, ", Ridge = ", bestValues.r
    print "-----------------------------------------"
    return bestlogistic, bestValues.r, bestValues.m, best_acc
def myGridSearch(data,NTreeBounds,NFeaturesBounds):
    best_acc = -float('inf')
    bestrandomforest = None
    class bestValues(object):
        t = float('nan')
        f = float('nan')
    for t in range(NTreeBounds[0],NTreeBounds[1]+NTreeBounds[2],NTreeBounds[2]):
        for f in range(NFeaturesBounds[0],NFeaturesBounds[1]+NFeaturesBounds[2],NFeaturesBounds[2]):
            randomforest = RandomForest()
            randomforest.setNumTrees(int(t))
            randomforest.setNumFeatures(int(f))
            evaluation = Evaluation(data)
            output = output = util.get_buffer_for_predictions()[0]
            attRange = Range()  # no additional attributes output
            outputDistribution = Boolean(False)  # we don't want distribution
            random = Random(1)
            numFolds = min(10,data.numInstances())
            evaluation.crossValidateModel(randomforest,data,numFolds,random,[output, attRange, outputDistribution])
            acc = evaluation.pctCorrect()
            if (acc>best_acc):
                bestrandomforest = randomforest
                best_acc = acc
                bestValues.t = t
                bestValues.f = f
    print "Best accuracy:", best_acc
    print "Best values:  NTreeBounds = ", bestValues.t, ", NFeaturesBounds = ", bestValues.f
    print "-----------------------------------------"
    return bestrandomforest, bestValues.t, bestValues.f, best_acc
def main(args):
    """
    Loads a dataset, shuffles it, splits it into train/test set. Trains J48 with training set and
    evaluates the built model on the test set.
    :param args: the commandline arguments (optional, can be dataset filename)
    :type args: list
    """

    # load a dataset
    if len(args) <= 1:
        data_file = helper.get_data_dir() + os.sep + "vote.arff"
    else:
        data_file = args[1]
    helper.print_info("Loading dataset: " + data_file)
    loader = Loader(classname="weka.core.converters.ArffLoader")
    data = loader.load_file(data_file)
    data.class_is_last()

    # generate train/test split of randomized data
    train, test = data.train_test_split(66.0, Random(1))

    # build classifier
    cls = Classifier(classname="weka.classifiers.trees.J48")
    cls.build_classifier(train)
    print(cls)

    # evaluate
    evl = Evaluation(train)
    evl.test_model(cls, test)
    print(evl.summary())
def use_classifier(data_filename, cli):
    loader = Loader(classname="weka.core.converters.ArffLoader")
    data = loader.load_file(data_filename)
    data.class_is_last()
    cls = from_commandline(cli, classname="weka.classifiers.Classifier")
    cls.build_classifier(data)
    evaluation = Evaluation(data)
    evaluation.crossvalidate_model(cls, data, 10, Random(1))
    return cls, evaluation
def RandomForest_ParamFinder(data): 
    # possible set for Number of trees
    NTreeBounds = [1,20,1]
    # possible set for number of features
    NFeaturesBounds = [0,20,1]
    if (data.numInstances()>10):     # grid search does 10-fold cross validation; hence number of samples must be more than 10
        gridsearch = GridSearch()
        acctag = gridsearch.getEvaluation()
        acctag = SelectedTag('ACC',acctag.getTags())
        gridsearch.setEvaluation(acctag)
        allfilters = AllFilters()
        gridsearch.setFilter(allfilters)
        gridsearch.setGridIsExtendable(Boolean(True))
        randomforest = RandomForest()
        gridsearch.setClassifier(randomforest)
        gridsearch.setXProperty(String('classifier.numTrees'))
        gridsearch.setYProperty(String('classifier.numFeatures'))
        gridsearch.setXExpression(String('I'))
        gridsearch.setYExpression(String('I'))
        gridsearch.setXMin(NTreeBounds[0])
        gridsearch.setXMax(NTreeBounds[1])
        gridsearch.setXStep(NTreeBounds[2])
        gridsearch.setYMin(NFeaturesBounds[0])
        gridsearch.setYMax(NFeaturesBounds[1])
        gridsearch.setYStep(NFeaturesBounds[2])
        gridsearch.setYBase(10)
        print "searching for random-forest NumTrees = [", NTreeBounds[0], ",", NTreeBounds[1], "], NumFeatures = [ ", NFeaturesBounds[0], ",", NFeaturesBounds[1], "] ...."
        gridsearch.buildClassifier(data)
        bestValues = gridsearch.getValues()
        # -----------------------  Evaluation
        bestrandomforest = RandomForest()
        bestrandomforest.setNumTrees(int(bestValues.x))
        bestrandomforest.setNumFeatures(int(bestValues.y))
        evaluation = Evaluation(data)
        output = output = util.get_buffer_for_predictions()[0]
        attRange = Range()  # no additional attributes output
        outputDistribution = Boolean(False)  # we don't want distribution
        random = Random(1)
        numFolds = min(10,data.numInstances())
        evaluation.crossValidateModel(bestrandomforest,data,numFolds,random,[output, attRange, outputDistribution])
        acc = evaluation.pctCorrect()
        print "best accuracy: ", acc
        print "best random-forest classifier with NumTrees=",bestValues.x , ", NumFeatures = ", bestValues.y
        OptRndFrst = bestrandomforest
        OptRndFrstp1 = bestValues.x
        OptRndFrstp2 = bestValues.y
        OptRndFrstAcc = acc
    else:
        OptRndFrst, OptRndFrstp1, OptRndFrstp2, OptRndFrstAcc = myGridSearch(data,NTreeBounds,NFeaturesBounds) 
    Description = 'Random-Forest classifier: OptNumTrees = ' + str(OptRndFrstp1) + \
            ', OptNumFeatures = ' + str(OptRndFrstp2) + ', OptAcc = ' + str(OptRndFrstAcc)
    print "-----------------------------------------"
    return OptRndFrst, OptRndFrstp1, OptRndFrstp2, OptRndFrstAcc, Description
def Logistic_ParamFinder(data): 
    # Possible set for Ridge-value
    RBounds = [-10,2,1]
    # possible set for maximum Iteration
    MBounds = [-1,10,1]
    if (data.numInstances()>10):     # grid search does 10-fold cross validation; hence number of samples must be more than 10
        gridsearch = GridSearch()
        acctag = gridsearch.getEvaluation()
        acctag = SelectedTag('ACC',acctag.getTags())
        gridsearch.setEvaluation(acctag)
        allfilters = AllFilters()
        gridsearch.setFilter(allfilters)
        gridsearch.setGridIsExtendable(Boolean(True))
        logistic = Logistic()
        gridsearch.setClassifier(logistic)
        gridsearch.setXProperty(String('classifier.maxIts'))
        gridsearch.setYProperty(String('classifier.ridge'))
        gridsearch.setXExpression(String('I'))
        gridsearch.setYExpression(String('pow(BASE,I)'))
        gridsearch.setXMin(MBounds[0])
        gridsearch.setXMax(MBounds[1])
        gridsearch.setXStep(MBounds[2])
        gridsearch.setYMin(RBounds[0])
        gridsearch.setYMax(RBounds[1])
        gridsearch.setYStep(RBounds[2])
        gridsearch.setYBase(10)
        print "searching for logistic lcassifier Max Iteration = [", MBounds[0], ",", MBounds[1], "], Ridge = [ 10E", RBounds[0], ",10E", RBounds[1], "] ...."
        gridsearch.buildClassifier(data)
        bestValues = gridsearch.getValues()
        # -----------------------  Evaluation
        bestlogistic = Logistic()
        bestlogistic.setMaxIts(int(bestValues.x))
        bestlogistic.setRidge(pow(10,bestValues.y))
        evaluation = Evaluation(data)
        output = util.get_buffer_for_predictions()[0]
        attRange = Range()  # no additional attributes output
        outputDistribution = Boolean(False)  # we don't want distribution
        random = Random(1)
        numFolds = min(10,data.numInstances())
        evaluation.crossValidateModel(bestlogistic,data,numFolds,random,[output, attRange, outputDistribution])
        acc = evaluation.pctCorrect()
        print "best accuracy: ", acc
        print "best logistic classifier with Ridge = ", bestlogistic.getRidge(), " Max Iteration = ", bestlogistic.getMaxIts()
        OptLog = bestlogistic
        OptLogp1 = bestlogistic.getRidge()
        OptLogp2 = bestlogistic.getMaxIts()
        OptLogAcc = acc
    else:
        OptLog, OptLogp1, OptLogp2, OptLogAcc = myGridSearch(data,RBounds,MBounds)
    Description = 'Logistic classifier OptRidge = ' + str(OptLogp1) + \
            ', OptMaxIts = ' + str(OptLogp2) + ', OptAcc = ' + str(OptLogAcc)
    print "-----------------------------------------"
    return OptLog, OptLogp1, OptLogp2, OptLogAcc, Description
def bagging_logistic(trainData,testData,params,exparams):
    IsOptBagOnOptLog = str2bool(params[0])
    logistic = Logistic()
    bagging = Bagging()
    if IsOptBagOnOptLog:    # optimal bagging is based on optimal logistic
        ridge = float(exparams[0])
        maxIt = int(float(exparams[1]))
        logistic.setMaxIts(maxIt)
        bagSizePercent = int(float(params[1]))
        bagging.setBagSizePercent(bagSizePercent)
    else:   # ridge parameter is also optimized in the process
        ridge = float(params[1])
    numIterations = int(float(params[2]))
    bagging.setNumIterations(numIterations)
    logistic.setRidge(ridge)
    bagging.setClassifier(logistic)
    bagging.buildClassifier(trainData)  # only a trained classifier can be evaluated
    # evaluate it on the training
    evaluation = Evaluation(trainData)
    (trainOutput, trainBuffer) = util.get_buffer_for_predictions(trainData)
    attRange = Range()  # no additional attributes output
    outputDistribution = Boolean(False)  # we don't want distribution
    evaluation.evaluateModel(bagging, trainData, [trainOutput, attRange, outputDistribution])
    print "--> Evaluation:\n"
    print evaluation.toSummaryString()
    trainSummary = makeTrainEvalSummary(evaluation)
    # evaluate it on testing
    evaluation = Evaluation(testData)
    (testOutput, testBuffer) = util.get_buffer_for_predictions(testData)
    attRange = Range()  # no additional attributes output
    outputDistribution = Boolean(False)  # we don't want distribution
    evaluation.evaluateModel(bagging, testData, [testOutput, attRange, outputDistribution])
    return trainBuffer, testBuffer, trainSummary
def smo(trainData,testData,params,exparams):
    kerType = str2bool(params[0]) 
    cValue = float(params[1])
    kerParam = float(params[2])
    if kerType:     # RBF kernel
        kernel = RBFKernel()
        kernel.setGamma(kerParam)
    else:       # Polynomial kernel
        kernel = PolyKernel()
        kernel.setExponent(kerParam)
    smo = SMO()
    smo.setKernel(kernel)
    smo.setC(cValue)
    smo.buildClassifier(trainData)  # only a trained classifier can be evaluated
    # evaluate it on the training
    evaluation = Evaluation(trainData)
    (trainOutput, trainBuffer) = util.get_buffer_for_predictions(trainData)
    attRange = Range()  # no additional attributes output
    outputDistribution = Boolean(False)  # we don't want distribution
    evaluation.evaluateModel(smo, trainData, [trainOutput, attRange, outputDistribution])
    print "--> Evaluation:\n"
    print evaluation.toSummaryString()
    trainSummary = makeTrainEvalSummary(evaluation)
    # evaluate it on testing
    evaluation = Evaluation(testData)
    (testOutput, testBuffer) = util.get_buffer_for_predictions(testData)
    attRange = Range()  # no additional attributes output
    outputDistribution = Boolean(False)  # we don't want distribution
    evaluation.evaluateModel(smo, testData, [testOutput, attRange, outputDistribution])
    return trainBuffer, testBuffer, trainSummary
def simple_logistic(trainData,testData,params,exparams):
    heuristicStop = int(float(params[0]))
    numBoostingIterations = int(float(params[1]))
    simplelogistic = SimpleLogistic()
    simplelogistic.setHeuristicStop(heuristicStop)
    simplelogistic.setNumBoostingIterations(numBoostingIterations)
    if (trainData.numInstances()<5):   # special case for small sample size
        simplelogistic.setUseCrossValidation(False) 
    simplelogistic.buildClassifier(trainData)  # only a trained classifier can be evaluated
    # evaluate it on the training
    evaluation = Evaluation(trainData)
    (trainOutput, trainBuffer) = util.get_buffer_for_predictions(trainData)
    attRange = Range()  # no additional attributes output
    outputDistribution = Boolean(False)  # we don't want distribution
    evaluation.evaluateModel(simplelogistic, trainData, [trainOutput, attRange, outputDistribution])
    print "--> Evaluation:\n"
    print evaluation.toSummaryString()
    trainSummary = makeTrainEvalSummary(evaluation)
    # evaluate it on testing
    evaluation = Evaluation(testData)
    (testOutput, testBuffer) = util.get_buffer_for_predictions(testData)
    attRange = Range()  # no additional attributes output
    outputDistribution = Boolean(False)  # we don't want distribution
    evaluation.evaluateModel(simplelogistic, testData, [testOutput, attRange, outputDistribution])
    return trainBuffer, testBuffer, trainSummary
Exemple #12
0
    def test_model(self, test_data, empty_solution, evaluate = False):
        model_weka = None
        if os.path.isfile(self.prediction_file):
            print 'Model ' + self.name + ' already tested.'
        elif not os.path.isfile(self.model_file):
            print 'Impossible testing this model. It should be trained first.'
            return
        else: 
            print 'Starting to test_model model ' + self.name + '.'
            model_weka = Classifier(jobject = serialization.read(self.model_file)) 
            evaluation = Evaluation(data = test_data)
            evaluation.test_model(classifier = model_weka, data = test_data)
            
            predictions = evaluation.predictions()
            rows        = read_sheet(file_name = empty_solution)
            solutions   = []

            for row in rows:
                solution = [row['userid'], row['tweetid'], predictions.pop(0).predicted()]
                solutions.append(solution)
            write_the_solution_file(solutions, self.prediction_file)
            print 'Model ' + self.name + ' tested.'
        
        if evaluate == True:
            if os.path.isfile(self.evaluation_file):
                print 'Model ' + self.name + ' already evaluated.'
                return
            elif model_weka == None:
                model_weka = Classifier(jobject = serialization.read(self.model_file)) 
                evaluation = Evaluation(data = test_data)
                evaluation.test_model(classifier = model_weka, data = test_data)
            save_file(file_name = self.evaluation_file, content = evaluation.to_summary())
            print 'Model ' + self.name + ' evaluated.'
def bayesian(trainData,testData,params,exparams):
    IsOptMultinomialBayes   = str2bool(params[0]) 
    IsOptNaiveKernelDensity = str2bool(params[1]) 
    if IsOptMultinomialBayes:    # optimal bayesian classifier is multinomial
        bayes = NaiveBayesMultinomial()
    else:
        bayes = NaiveBayes()
        if IsOptNaiveKernelDensity:   # use kernel density estimation
            bayes.setUseKernelEstimator(Boolean(True))   
    bayes.buildClassifier(trainData)  # only a trained classifier can be evaluated
    # evaluate it on the training
    evaluation = Evaluation(trainData)
    (trainOutput, trainBuffer) = util.get_buffer_for_predictions(trainData)
    attRange = Range()  # no additional attributes output
    outputDistribution = Boolean(False)  # we don't want distribution
    evaluation.evaluateModel(bayes, trainData, [trainOutput, attRange, outputDistribution])
    print "--> Evaluation:\n"
    print evaluation.toSummaryString()
    trainSummary = makeTrainEvalSummary(evaluation)
    # evaluate it on testing
    evaluation = Evaluation(testData)
    (testOutput, testBuffer) = util.get_buffer_for_predictions(testData)
    attRange = Range()  # no additional attributes output
    outputDistribution = Boolean(False)  # we don't want distribution
    evaluation.evaluateModel(bayes, testData, [testOutput, attRange, outputDistribution])   
    return trainBuffer, testBuffer, trainSummary
def build_and_classify(classifier, classifier_name, approach_name, infile, percentage='10'):
    """
    Creates model and classifies against input data. Returns accuracy statistics
    """
    # set seed so results are consistent
    random.seed('iot')

    # load data
    loader = Loader(classname='weka.core.converters.CSVLoader')
    data = loader.load_file(infile)
    data.class_is_last()

    # convert all numeric attributes to nominal
    to_nominal = Filter(classname='weka.filters.unsupervised.attribute.NumericToNominal',
                        options=['-R', 'first-last'])
    to_nominal.inputformat(data)
    data = to_nominal.filter(data)

    # randomize data with constant seed
    randomize = Filter(classname='weka.filters.unsupervised.instance.Randomize',
                       options=['-S', '42'])
    randomize.inputformat(data)

    data = randomize.filter(data)

    # create training set and testing set
    train_percent_filter = Filter(classname='weka.filters.unsupervised.instance.RemovePercentage',
                                  options=['-P', percentage, '-V'])
    train_percent_filter.inputformat(data)

    train = train_percent_filter.filter(data)
    test = data

    # build and test classifier
    classifier.build_classifier(train)
    evaluation = Evaluation(train)
    evaluation.test_model(classifier, test)

    # return results as array
    results = [
        approach_name,
        classifier_name,
        percentage,
        evaluation.percent_correct,
        evaluation.weighted_f_measure
    ]
    return results
Exemple #15
0
def test_weka_classifier(clf, train, test):

    clf.build_classifier(train)

    evl = Evaluation(train)
    evl.test_model(clf, test)

    acc = evl.percent_correct
    auc = evl.weighted_area_under_roc
    err = evl.error_rate
    log = evl.sf_mean_scheme_entropy

    print(
        "# testing  | loss: {:.2}, accuracy: {:.4}, AUC: {:.2}, error: {:.2}".
        format(log, acc, auc, err))

    return {'loss': log, 'accuracy': acc, 'auc': auc, 'err': err}
    def crossValidate(self, arrfFile = None, classname="weka.classifiers.trees.J48", options=["-C", "0.3"]):
        
        if arrfFile is not None:
            self.initData( arrfFile )
            
        if self.data is None:
            return 

        print 'Classificador ' + str(classname) + ' ' + ' '.join(options)
        cls = Classifier(classname=classname, options=options)
        
        evl = Evaluation(self.data)
        evl.crossvalidate_model(cls, self.data, 10, Random(1))

        print(evl.percent_correct)
        print(evl.summary())
        print(evl.class_details())
    def cross_validate(self, detail = True):
        """Perform cross validation using trained data. 
        
        Parameters
        ----------
        detail : boolean, optional, default = True
            If true return a detailed information of cross validation.
            
        Returns
        -------
        info : string
            Info with results of cross validation.
        """
        
        #print 'cross_validation'
        
        start_time = TimeUtils.get_time()
        
        info =  "Scheme:\t%s %s\n" % (str(self.classifier.classname) , " ".join([str(option) for option in self.classifier.options]))
        
        if detail == True:
            info += "Relation:\t%s\n" % (self.data.relationname)
            info += "Instances:\t%d\n" % (self.data.num_instances)
            info += "Attributes:\t%d\n\n" % (self.data.num_attributes)
        
        evl = WEvaluation(self.data)
        evl.crossvalidate_model(self.classifier, self.data, 10, WRandom(1))
        
        if detail == False:
            info += "Correctly Classified Instances: %0.4f%%\n" % (evl.percent_correct)

        info += "Time taken to build model: %0.5f seconds\n\n" % (TimeUtils.get_time() - start_time)
        #info += str(evl.percent_correct) + "\n\n"
        
        if detail == True:
            info += "=== Stratified cross-validation ===\n"
            info += evl.summary() + "\n\n"
            
            info += str(evl.class_details()) + "\n\n"
            
            classes = [str(self.data.class_attribute.value(i)) for i in range(0, self.data.class_attribute.num_values)]
            cm = evl.confusion_matrix
            info += Classifier.confusion_matrix(classes, cm)

        return info
Exemple #18
0
    def runCV(this, arffFile, classifier, folds):

        loader = Loader(classname="weka.core.converters.ArffLoader")
        data = loader.load_file(arffFile)
        data.class_is_last()

        classes = [str(code) for code in data.class_attribute.values]
        header = ["Accuracy"]
        for name in classes:
            header += [name + " TP", name + " FP", name + " AUC ROC"]
        values = []

        cls = Classifier(classname=classifier)

        evl = Evaluation(data)
        evl.crossvalidate_model(cls, data, folds, Random(1))

        values.append(evl.percent_correct)
        for name in classes:
            index = classes.index(name)
            values += [
                evl.true_positive_rate(index) * 100,
                evl.false_positive_rate(index) * 100,
                evl.area_under_roc(index)
            ]

        this.values = values
        this.header = header
Exemple #19
0
    def crossTest(this, trainingFile, classifier, testFile):

        loader = Loader(classname="weka.core.converters.ArffLoader")
        data1 = loader.load_file(trainingFile)
        data1.class_is_last()

        cls = Classifier(classname=classifier)
        cls.build_classifier(data1)

        data2 = loader.load_file(testFile)
        data2.class_is_last()

        classes = [str(code) for code in data2.class_attribute.values]
        header = ["Accuracy"]
        for name in classes:
            header += [name + " TP", name + " FP", name + " AUC ROC"]
        values = []

        evl = Evaluation(data2)
        evl.test_model(cls, data2)

        values.append(evl.percent_correct)
        for name in classes:
            index = classes.index(name)
            values += [
                evl.true_positive_rate(index) * 100,
                evl.false_positive_rate(index) * 100,
                evl.area_under_roc(index)
            ]

        this.values = values
        this.header = header
Exemple #20
0
    def predict(self, X):
        evaluation = Evaluation(self.train_data)

        # Add class column (we can't copy X, because this is a large object, so we add the column and remove it later)
        X['class'] = None

        filename = self.to_arff(X, True)

        # Remove class column
        del X['class']

        loader = Loader("weka.core.converters.ArffLoader")
        test_data = loader.load_file(filename)
        test_data.class_is_last()

        preds = evaluation.test_model(self.classifier, test_data)

        return preds
def use_classifier(data):
    """
    Uses the meta-classifier AttributeSelectedClassifier for attribute selection.
    :param data: the dataset to use
    :type data: Instances
    """
    print("\n1. Meta-classifier")
    classifier = Classifier(classname="weka.classifiers.meta.AttributeSelectedClassifier")
    aseval = ASEvaluation(classname="weka.attributeSelection.CfsSubsetEval")
    assearch = ASSearch(classname="weka.attributeSelection.GreedyStepwise", options=["-B"])
    base = Classifier(classname="weka.classifiers.trees.J48")
    # setting nested options is always a bit tricky, getting all the escaped double quotes right
    # simply using the bean property for setting Java objects is often easier and less error prone
    classifier.set_property("classifier", base.jobject)
    classifier.set_property("evaluator", aseval.jobject)
    classifier.set_property("search", assearch.jobject)
    evaluation = Evaluation(data)
    evaluation.crossvalidate_model(classifier, data, 10, Random(1))
    print(evaluation.summary())
def use_classifier(data):
    """
    Uses the meta-classifier AttributeSelectedClassifier for attribute selection.
    :param data: the dataset to use
    :type data: Instances
    """
    print("\n1. Meta-classifier")
    classifier = Classifier(classname="weka.classifiers.meta.AttributeSelectedClassifier")
    aseval = ASEvaluation(classname="weka.attributeSelection.CfsSubsetEval")
    assearch = ASSearch(classname="weka.attributeSelection.GreedyStepwise", options=["-B"])
    base = Classifier(classname="weka.classifiers.trees.J48")
    # setting nested options is always a bit tricky, getting all the escaped double quotes right
    # simply using the bean property for setting Java objects is often easier and less error prone
    classifier.set_property("classifier", base.jobject)
    classifier.set_property("evaluator", aseval.jobject)
    classifier.set_property("search", assearch.jobject)
    evaluation = Evaluation(data)
    evaluation.crossvalidate_model(classifier, data, 10, Random(1))
    print(evaluation.summary())
Exemple #23
0
def run():
    jvm.start()
    load_csv = Loader("weka.core.converters.CSVLoader")
    data_csv = load_csv.load_file(
        "/Users/imeiliasantoso/web_graduate_project5/register_page/bank-full_input.csv"
    )

    saver = Saver("weka.core.converters.ArffSaver")
    saver.save_file(
        data_csv,
        "/Users/imeiliasantoso/web_graduate_project5/register_page/bank-full_input.arff"
    )

    load_arff = Loader("weka.core.converters.ArffLoader")
    data_arff = load_arff.load_file(
        "/Users/imeiliasantoso/web_graduate_project5/register_page/bank-full_input.arff"
    )
    data_arff.class_is_last()

    cls = Classifier(classname="weka.classifiers.trees.J48",
                     options=["-C", "0.5"])
    cls.build_classifier(data_arff)
    for index, inst in enumerate(data_arff):
        pred = cls.classify_instance(inst)
        dist = cls.distribution_for_instance(inst)
        # save tree prune in txt file

    saveFile = open(
        "/Users/imeiliasantoso/web_graduate_project5/register_page/bank-full_input.txt",
        "w")
    saveFile.write(str(cls))
    # print(cls)
    saveFile.close()

    global j48
    J48_class = Classifier(classname="weka.classifiers.trees.J48",
                           options=["-C", "0.25", "-M", "2"])
    J48_class.build_classifier(data_arff)
    evaluationj48 = Evaluation(data_arff)
    evaluationj48.crossvalidate_model(J48_class, data_arff, 10, Random(100))
    j48 = str(evaluationj48.percent_correct)
    jvm.stop()
    return j48
Exemple #24
0
def runBayes(file, bound):
    loader = Loader(classname="weka.core.converters.CSVLoader")
    data = loader.load_file(file)
    data.class_is_first()

    remove = Filter(classname="weka.filters.unsupervised.attribute.Remove",
                    options=["-R", bound])
    cls = Classifier(classname="weka.classifiers.bayes.NaiveBayes")

    remove.inputformat(data)
    filtered = remove.filter(data)

    evl = Evaluation(filtered)
    evl.crossvalidate_model(cls, filtered, 10, Random(1))

    print(evl.percent_correct)
    #print(evl.summary())
    result = evl.class_details()
    print(result)
    return result
Exemple #25
0
def CV10(dataset, algo):
    print "inside 10cv"
    print("dataset ----" + dataset)
    print("algorithm ----" + algo)

    #Executing 10FCV

    #	jvm.start(packages=True)
    loader = Loader(classname="weka.core.converters.ArffLoader")
    data = loader.load_file(dataset)
    data.class_is_last()

    #print(data)

    cls = Classifier(classname=algo)

    evl = Evaluation(data)
    evl.crossvalidate_model(cls, data, 2, Random(5))

    print("areaUnderROC/1: " + str(evl.area_under_roc(1)))
Exemple #26
0
    def executeKFoldClassifier(self, featureInclusion, kFold):
        deleteFeatures = 0
        for i in range(0, len(featureInclusion)):
            if featureInclusion[i]:
                self.instances.deleteAttributeAt(i - deleteFeatures)
                deleteFeatures += 1
        self.instances.setClassIndex(self.instances.numAttributes - 1)

        cvParameterSelection = javabridge.make_instance(
            "weka/classifiers/meta/CVParameterSelection", "()V")
        javabridge.call(cvParameterSelection, "setNumFolds", "(I)V", kFold)
        javabridge.call(cvParameterSelection,
                        "buildClassifier(weka/core/Instances)V",
                        self.instances)

        eval = Evaluation(self.instances)
        eval.crossvalidate_model(cvParameterSelection, self.instances, kFold,
                                 random())

        return eval.percent_correct()
Exemple #27
0
def train_and_eval_weka_classifier(clf, train, valid, n_instances):

    # total_inst = train.num_instances

    total_train_inst = train.num_instances

    percentage = (n_instances * 100) / total_train_inst

    if percentage == 100:
        opt = train
    else:
        opt, extra = train.train_test_split(percentage, Random(1))

    # inst_train2 = train2.num_instances

    print('total_train_inst:    ', total_train_inst, '| percentage:    ',
          percentage, '| used_inst:     ', opt.num_instances)

    import signal

    class AlarmException(Exception):
        pass

    def alarmHandler(signum, frame):
        raise AlarmException

    clf.build_classifier(opt)

    evl = Evaluation(opt)
    evl.test_model(clf, valid)

    acc = evl.percent_correct
    auc = evl.weighted_area_under_roc
    err = evl.error_rate
    log = evl.sf_mean_scheme_entropy

    print(
        "# validating  | loss: {:.2}, accuracy: {:.4}, AUC: {:.2}, error: {:.2}"
        .format(log, acc, auc, err))

    return {'loss': log, 'accuracy': acc, 'auc': auc, 'err': err}
Exemple #28
0
def HOV(dataset, algo):
    print "inside hov"
    print("dataset ----" + dataset)
    print("algorithm ----" + algo)

    #Executing HOV \_*-*_/

    #	jvm.start(packages=True)
    loader = Loader(classname="weka.core.converters.ArffLoader")
    data = loader.load_file(dataset)
    data.class_is_last()

    train, test = data.train_test_split(70.0, Random(10))

    cls = Classifier(classname=algo)
    cls.build_classifier(train)

    evl = Evaluation(train)
    evl.test_model(cls, test)

    return (str(evl.area_under_roc(1)))
Exemple #29
0
def SMOreg():
    loader = Loader(classname="weka.core.converters.ArffLoader")
    data = loader.load_file("First_trial_regression.arff")
    data.class_is_last()

    cls = KernelClassifier(classname="weka.classifiers.functions.SMOreg",
                           options=["-N", "0"])
    kernel = Kernel(
        classname="weka.classifiers.functions.supportVector.RBFKernel",
        options=["-G", "0.2"])
    cls.kernel = kernel
    pout = PredictionOutput(
        classname="weka.classifiers.evaluation.output.prediction.PlainText")
    evl = Evaluation(data)
    evl.crossvalidate_model(cls, data, 10, Random(486), pout)

    print(evl.summary())
    print(pout.buffer_content())

    # save model
    serialization.write_all("SMOreg.model2", cls)
Exemple #30
0
def weka_bayesnet(filearffpath='data/datatobayes.arff'):
    """Simple calling of the bayesian network from python.
    """
    #Preparing the data
    loader = Loader(classname="weka.core.converters.ArffLoader")
    data = loader.load_file('data/datatobayes.arff')
    #data = loader.load_file('data/Full.arff')
    remove = Filter(classname="weka.filters.unsupervised.attribute.Remove",
                    options=["-R", "first"])
    remove.inputformat(data)
    filtered = data  #remove.filter(data)

    #Classifier test
    from weka.classifiers import Classifier, Evaluation
    from weka.core.classes import Random
    filtered.class_is_last()
    classifier = Classifier(classname="weka.classifiers.bayes.BayesNet",
                            options=['-D'])  #
    evaluation = Evaluation(filtered)
    evaluation.crossvalidate_model(classifier, filtered, 10, Random(42))
    return evaluation.area_under_roc(class_index=0)  #ROC, no std of kfold
 def evaluation(self, classifier, trainingData, testingData = None):
     trainingData.set_class_index(trainingData.num_attributes() - 1)
     if testingData == None:
         evaluation = Evaluation(trainingData) 
                             # initialize with priors
         evaluation.crossvalidate_model(classifier, trainingData, 10, Random(42))  # 10-fold CV
         return evaluation
     else:
         print "testing data exists"
         if testingData.num_attributes() == trainingData.num_attributes():
             testingData.set_class_index(testingData.num_attributes() - 1)
             evaluation = Evaluation(trainingData)   
             
             classifier.build_classifier(trainingData)
             evaluation.test_model(classifier, testingData)
             
             #for attribute in trainingData.attributes():
             #    print "train:" + str(attribute)
             #for attribute in testingData.attributes():
             #    print "test:" + str(attribute)
                 
                 
             return evaluation
         else:
             print "testing Data doesn't have same attribute with training data"
             for attribute in trainingData.attributes():
                 print "train:" + str(attribute)
             for attribute in testingData.attributes():
                 print "test:" + str(attribute)
Exemple #32
0
def test_classifier(dataset: Instances, classifier: Classifier, params: dict):
    vars = params.keys()
    vals = params.values()

    results = defaultdict(list)

    for val_combo in itertools.product(*vals):
        results["numInstances"].append(dataset.num_instances)
        results["numAttributes"].append(dataset.num_attributes)
        opts = dict(zip(vars, val_combo))

        for opt in opts:
            results[opt].append(opts[opt])
            classifier.set_property(
                opt, opts[opt] if not isinstance(opts[opt], float) else
                typeconv.double_to_float(opts[opt]))

        evl = Evaluation(dataset)
        classifier.build_classifier(dataset)
        evl.test_model(classifier, dataset)
        results["Training_Accuracy"].append(evl.percent_correct)
        results["size"].append(
            int(javabridge.call(classifier.jobject, "measureTreeSize", "()D")))
        evl.crossvalidate_model(classifier, dataset, 10, Random(1))
        results["CV_Accuracy"].append(evl.percent_correct)

    return results
Exemple #33
0
def vote_classifier_train(dicrectory, nameOfDataSet, flag):
    loader = Loader(classname="weka.core.converters.CSVLoader")
    data = loader.load_file(dicrectory)
    data.class_is_last()
    meta = MultipleClassifiersCombiner(
        classname="weka.classifiers.meta.Vote",
        options=[
            '-S', '1', '-B', 'weka.classifiers.trees.J48 -C 0.25 -M 2', '-B',
            'weka.classifiers.trees.RandomTree -K 6 -M 1.0 -V 0.001 -S 1',
            '-B',
            'weka.classifiers.meta.Bagging -P 100 -S 1 -num-slots 1 -I 10 -W weka.classifiers.trees.REPTree -- '
            '-M 2 -V 0.001 -N 3 -S 1 -L -1 -I 0.0', '-B',
            'weka.classifiers.meta.AdaBoostM1 -P 100 -S 1 -I 10 -W weka.classifiers.trees.DecisionStump',
            '-B',
            'weka.classifiers.meta.Bagging -P 100 -S 1 -num-slots 1 -I 10 -W weka.classifiers.trees.REPTree -- '
            '-M 2 -V 0.001 -N 3 -S 1 -L -1 -I 0.0', '-B',
            'weka.classifiers.bayes.NaiveBayes ', '-R', 'AVG'
        ])
    eval = Evaluation(data)
    pout = PredictionOutput(
        classname="weka.classifiers.evaluation.output.prediction.PlainText")
    if flag:
        eval.crossvalidate_model(meta, data, 10, Random(1), pout)
    else:
        eval.evaluate_train_test_split(meta, data, 80.0, Random(1), pout)
    gc.collect()
    print_and_save('Proposed model', flag, nameOfDataSet, eval)
Exemple #34
0
    def run_naive_bayes_crossval(self, output_directory):
        # build classifier
        print("\nBuilding Classifier on training data.")
        buildTimeStart = time.time()
        cls = Classifier(classname="weka.classifiers.bayes.NaiveBayes")
        cls.build_classifier(self.training_data)

        resultsString = ""
        resultsString = self.print_both(str(cls), resultsString)

        buildTimeString = "NB Cross Eval Classifier Built in " + str(
            time.time() - buildTimeStart) + " secs.\n"
        resultsString = self.print_both(buildTimeString, resultsString)

        #Evaluate Classifier
        resultsString = self.print_both("\nCross Evaluating on test data.",
                                        resultsString)

        buildTimeStart = time.time()
        evl = Evaluation(self.training_data)
        evl.crossvalidate_model(cls, self.training_data, 10, Random(1))

        resultsString = self.print_both(str(evl.summary()), resultsString)
        resultsString = self.print_both(str(evl.class_details()),
                                        resultsString)
        resultsString = self.print_both(str(evl.confusion_matrix),
                                        resultsString)
        buildTimeString = "\nNB Cross Eval Classifier Evaluated in " + str(
            time.time() - buildTimeStart) + " secs.\n"
        resultsString = self.print_both(buildTimeString, resultsString)

        #Save Results and Cleanup
        self.save_results("Naive_Bayes_Crossval", resultsString,
                          output_directory)
def Boost_J48(data, rnm):
    data.class_is_last()
    fc1 = FilteredClassifier()
    fc1.classifier = Classifier(classname="weka.classifiers.trees.J48", options=["-C", "0.25", "-M", "2"])
    fc1.filter = Filter(classname="weka.filters.unsupervised.attribute.Remove", options=["-R", "first"])
    fc2 = SingleClassifierEnhancer(classname="weka.classifiers.meta.AdaBoostM1", options=["-P", "100", "-S", "1", "-I", "10"])
    fc2.classifier = fc1
    pred_output = PredictionOutput(classname="weka.classifiers.evaluation.output.prediction.CSV", options=["-p", "1"])
    folds = 10
    fc2.build_classifier(data)
    evaluation = Evaluation(data)
    evaluation.crossvalidate_model(fc2, data, folds, Random(1), pred_output)
    f0 = open(rnm + '_Boost_J48_Tree.txt', 'w')
    print >> f0, "Filename: ", rnm
    print >> f0, '\n\n'
    print >> f0, str(fc2)
    f0.close()
    f1 = open(rnm + '_Boost_J48_Prediction.txt', 'w')
    print >> f1, 'Filename:', rnm
    print >> f1, 'Prediction Summary:', (pred_output.buffer_content())
    f1.close()
    f2 = open(rnm + '_Boost_j48_Evaluation.txt', 'w')
    print >> f2, 'Filename:', rnm
    print >> f2, 'Evaluation Summary:', (evaluation.summary())
    print >> f2, '\n\n\n'
    print >> f2, (evaluation.class_details())
    f2.close()
    plot_roc(evaluation, class_index=[0,1], title=rnm, key_loc='best', outfile=rnm + '_Boost_J48_ROC.png', wait=False)
    value_Boost_J48 = str(evaluation.percent_correct)
    return value_Boost_J48
def RandomTree(data, rnm):
    data.class_is_last()
    fc = FilteredClassifier()
    fc.classifier = Classifier(classname="weka.classifiers.trees.RandomTree", options=["-K", "0", "-M", "1.0", "-V", "0.001", "-S", "1"])
    fc.filter = Filter(classname="weka.filters.unsupervised.attribute.Remove", options=["-R", "first"])
    pred_output = PredictionOutput(classname="weka.classifiers.evaluation.output.prediction.CSV", options=["-p", "1"])
    folds = 10
    evl = Evaluation(data)
    evl.crossvalidate_model(fc, data, folds, Random(1), pred_output)
    fc.build_classifier(data)
    f0 = open(rnm + '_RT_Tree.txt', 'w')
    print >> f0, "Filename: ", rnm
    print >> f0, '\n\n'
    print >> f0, str(fc)
    f0.close()
    f1 = open(rnm + '_RT_Prediction.txt', 'w')
    print >> f1, 'Filename:', rnm
    print >> f1, 'Prediction Summary:', (pred_output.buffer_content())
    f1.close()
    f2 = open(rnm + '_RT_Evaluation.txt', 'w')
    print >> f2, 'Filename:', rnm
    print >> f2, 'Evaluation Summary:', (evl.summary())
    print >> f2, '\n\n\n'
    print >> f2, (evl.class_details())
    f2.close()
    plot_roc(evl, class_index=[0,1], title=rnm, key_loc='best', outfile=rnm+'_RT_ROC.png', wait=False)
    value_RT = str(evl.percent_correct)
    return value_RT
Exemple #37
0
def run_ibk(file):
    # Get filename from Pathlib object
    filename = file.parts[-1]
    dir = file.parents[0]

    print("Running IBk on %s" % filename)

    if not filename.endswith(".arff"):
        print("%s not ARFF file." % filename)
        return

    # Removes '.arff' from filename
    filename_base = filename[:-5]

    # Load data with class as first attr
    data = load_Arff_file(file)
    data.class_is_first()

    # Use IBk and set options
    cls = Classifier(classname="weka.classifiers.lazy.IBk",
                     options=["-K", "3"])
    # print(cls.options)

    # Predictions stored in pout
    pout = PredictionOutput(
        classname="weka.classifiers.evaluation.output.prediction.PlainText")

    # Evaluate data
    evaluation = Evaluation(data)
    evaluation.crossvalidate_model(cls, data, 10, Random(1), output=pout)

    # Save summary, class details and confusion matrix to file
    result_output = filename_base + "_eval_results.txt"
    output_eval(evaluation, dir / result_output)

    # Save the predicited results to file
    prediction_output = filename_base + "_pred_results.txt"
    output_pred(pout, dir / prediction_output)

    print("IBk complete")
def do_temporal_cv(t_selector, instances, num_folds):
    num_instances = instances.numInstances()
    results = []
    # Split folds
    for f in xrange(2, num_folds+1):
        print "fold:%d"%f
        for pair in split_temporal_train_test(f, num_instances):
    #        train_start = pair.train_start
    #        train_end = pair.train_end

            train_set = Instances(instances, int(pair.train_start), int(pair.train_end - pair.train_start+1))
            test_set = Instances(instances, int(pair.test_start), int(pair.test_end - pair.test_start +1))

            t_selector.buildClassifier(train_set)
            e = Evaluation(train_set)
            e.evaluateModel(t_selector, test_set)

            if e.recall(0) > 0 and e.precision(0) > 0:
                results.append(Result(instances.numAttributes(), e))


            #            print "precision: %.2f"%evalTest.precision(0)
#            print "recall: %.2f"%evalTest.recall(0)
#            print evalTest.toSummaryString()
    #        System.out.println(strSummary);
    sum_precision = 0
    sum_recall = 0
    for r in results:
#        print "precision:"
#        print r.precision
#        print "recall:"
#        print r.recall
        sum_precision += r.precision
        sum_recall +=r.recall


    precision = sum_precision*1.0/len(results)
    recall = sum_recall*1.0/len(results)
    avg_fmeasure = harmonic_mean([precision, recall])
    print "f_measure:%.2f"%avg_fmeasure
def DecisionTree(rnd_data, folds, seed, data):

    data_size = rnd_data.num_instances
    fold_size = math.floor(data_size / folds)

    # cross-validation
    evaluation = Evaluation(rnd_data)
    for i in range(folds):
        this_fold = fold_size
        test_start = i * fold_size
        test_end = (test_start + fold_size)
        if ((data_size - test_end) / fold_size < 1):
            this_fold = data_size - test_start
        test = Instances.copy_instances(rnd_data, test_start,
                                        this_fold)  # generate validation fold
        if i == 0:
            train = Instances.copy_instances(rnd_data, test_end,
                                             data_size - test_end)
        else:
            train_1 = Instances.copy_instances(rnd_data, 0, test_start)
            train_2 = Instances.copy_instances(rnd_data, test_end,
                                               data_size - test_end)
            train = Instances.append_instances(
                train_1, train_2)  # generate training fold

        # build and evaluate classifier
        cls = Classifier(classname="weka.classifiers.trees.J48")
        cls.build_classifier(train)  # build classifier on training set
        evaluation.test_model(cls,
                              test)  # test classifier on validation/test set

    print("")
    print("=== Decision Tree ===")
    print("Classifier: " + cls.to_commandline())
    print("Dataset: " + data.relationname)
    print("Folds: " + str(folds))
    print("Seed: " + str(seed))
    print("")
    print(
        evaluation.summary("=== " + str(folds) + "-fold Cross-Validation ==="))
	def training(self):
		# Preparação dos dados
		self.imp = Imputation(self.data)

		# Seleciona as caracteristicas
		self.features = FeatureSelection(self.imp.imputed_data)
		data_selected = self.features.data_selected
		self.selected_features = self.features.selected_features

		# Encontra os padrões ausentes
		self.missing_patterns = MissingPatterns(self.data, self.selected_features).missing_patterns

		# Realiza o treinamento dos classificadores
		#print('test train')
		for mpi in self.missing_patterns:

			# Seleciona as caracteristicas
			cpi = set(self.selected_features) - set(mpi)
			data_temp = Instances.copy_instances(data_selected, from_row=0, num_rows=data_selected.num_instances)
			data_temp.class_is_last()

			# Separa os dados de treinamento
			data_temp = self.reduceData(data_temp, cpi, self.data)

			
			# Treina os classificadores com os dados imputados
			classifier = Classifier(classname=self.learn_class, options=self.options)
			classifier.build_classifier(data_temp)
			
			#print(classifier.distribution_for_instance(data_selected.get_instance(30)))
			

			#!!!!!! Verica o peso de cada classificador (sua acuracia de classificação)
			evl = Evaluation(data_temp)
			evl.crossvalidate_model(classifier, data_temp, 15, Random(1))

			# Adiciona os classificadores treinados ao conjunto de classificadores
			my_classifier = MyClassifier(classifier, cpi, 1 - evl.mean_absolute_error)
			self.classifiers.add(my_classifier)
    def crossEvaluate(self):
        """
        Evaluate classifier using cross-validation using K folds
        :return:
        """
        if self.classifierInstance is not None:
            print '[Cross-validate data]'

            try:
                # Cross validation evaluation
                evaluatorInstance = Evaluation(self.classificationData)
                evaluatorInstance.crossvalidate_model(self.classifierInstance,
                                                      self.classificationData,
                                                      self.evaluationNumFolds,
                                                      Random(1))

                # Store evaluation results
                self.setEvaluationResults(evaluatorInstance)
                return True
            except:
                return False
        return False
def baggin_smo(trainData,testData,params,exparams):
    IsOptBagOnOptSMO =  str2bool(params[0]) 
    if IsOptBagOnOptSMO:    # optimal bagging is based on optimal SMO thus I should use extra params
        kerType =  str2bool(params[0]) 
        cValue = float(exparams[1])
        kerParam = float(exparams[2])
        if kerType:     # RBF kernel
            kernel = RBFKernel()
            kernel.setGamma(kerParam)
        else:       # Polynomial kernel
            kernel = PolyKernel()
            kernel.setExponent(kerParam)
        bagSizePercent = int(float(params[1]))
        numIterations = int(float(params[2]))
        smo = SMO()
        bagging = Bagging()
        smo.setKernel(kernel)
        smo.setC(cValue)
        bagging.setBagSizePercent(bagSizePercent)
        bagging.setNumIterations(numIterations)
        bagging.setClassifier(smo)
    else:   # optimal bagging is based on linear SMO
        cValue = float(params[1])
        numIterations = int(float(params[2]))
        smo = SMO()
        bagging = Bagging()
        kernel = PolyKernel()
        smo.setKernel(kernel)
        smo.setC(cValue)
        bagging.setNumIterations(numIterations)
        bagging.setClassifier(smo)
    bagging.buildClassifier(trainData)  # only a trained classifier can be evaluated
    # evaluate it on the training
    evaluation = Evaluation(trainData)
    (trainOutput, trainBuffer) = util.get_buffer_for_predictions(trainData)
    attRange = Range()  # no additional attributes output
    outputDistribution = Boolean(False)  # we don't want distribution
    evaluation.evaluateModel(bagging, trainData, [trainOutput, attRange, outputDistribution])
    print "--> Evaluation:\n"
    print evaluation.toSummaryString()
    trainSummary = makeTrainEvalSummary(evaluation)
    # evaluate it on testing
    evaluation = Evaluation(testData)
    (testOutput, testBuffer) = util.get_buffer_for_predictions(testData)
    attRange = Range()  # no additional attributes output
    outputDistribution = Boolean(False)  # we don't want distribution
    evaluation.evaluateModel(bagging, testData, [testOutput, attRange, outputDistribution])
    return trainBuffer, testBuffer, trainSummary
Exemple #43
0
def CV5x2(dataset,  algo, num_datasets):

	loader = Loader(classname="weka.core.converters.ArffLoader")
	data = loader.load_file(dataset)
	data.class_is_last()

	cls = Classifier(classname=algo)

	evl = Evaluation(data)
	evl.crossvalidate_model(cls, data, 2, Random(5))

	print(evl.summary("=== " +str(algo)+ " on" + str(dataset) + " ===",False))
        print(evl.matrix("=== on click prediction(confusion matrix) ==="))
	print("For Algo"+ str(algo)+"areaUnderROC/1: for CV5x2 " + str(evl.area_under_roc(1)))

	return evl.area_under_roc(1)
Exemple #44
0
def train(option, sym, num):
    # load dataset given the symbol
    path = os.path.join('HistSet', 'histSet_%s.arff' % sym)
    loader = Loader("weka.core.converters.ArffLoader")
    dataset = loader.load_file(path)
    dataset.class_is_last()  # set the last attribute as class attribute

    # load testset
    # testset = loader.load_file(os.path.join('HistSet', 'testSet_LTC.arff'))
    # testset.class_is_last()

    # define classifier
    cmd = {
        'DecisionTable':
        'weka.classifiers.rules.DecisionTable -X 1 -S "weka.attributeSelection.BestFirst -D 1 -N 5"',
        'SMOreg':
        'weka.classifiers.functions.SMOreg -C 1.0 -N 0 -I "weka.classifiers.functions.supportVector.RegSMOImproved -L 0.001 -W 1 -P 1.0E-12 -T 0.001 -V" -K "weka.classifiers.functions.supportVector.PolyKernel -C 250007 -E 1.0"',
        'LinearRegression':
        'weka.classifiers.functions.LinearRegression -S 0 -R 1.0E-8',
        'GaussianProcesses':
        'weka.classifiers.functions.GaussianProcesses -L 1.0 -N 0 -K "weka.classifiers.functions.supportVector.RBFKernel -C 250007 -G 1.0"',
    }

    cls = from_commandline(cmd[option],
                           classname='weka.classifiers.Classifier')
    cls.build_classifier(dataset)

    # begin evaluating
    evaluation = Evaluation(dataset)

    # evaluation.evaluate_train_test_split(cls, dataset, 90, Random(1))   # evaluate by splitting train/test set
    evl = evaluation.test_model(cls, dataset)  # evaluate on test set
    print('predictions (' + str(len(evl)) + '): ')
    for i in range(num):
        print(evl[i - num], end=' ')
    # print(evaluation.summary())
    return evl[-num:]
Exemple #45
0
def run_bayesNet(file):
    # Get filename from Pathlib object
    filename = file.parts[-1]
    dir = file.parents[0]

    print("Running BayesNet on %s" % filename)

    if not filename.endswith(".arff"):
        print("%s not ARFF file." % filename)
        return

    # Removes '.arff' from filename
    filename_base = filename[:-5]

    # Load data with class as first attr
    data = load_Arff_file(file)
    data.class_is_first()

    # Use BayesNet and set options
    cls = Classifier(classname="weka.classifiers.bayes.BayesNet",
                     options=[
                         "-D", "-Q",
                         "weka.classifiers.bayes.net.search.local.TAN", "--",
                         "-P", "1", "-S", "BAYES", "-E",
                         "weka.classifiers.bayes.net.estimate.SimpleEstimator",
                         "--", "-A", "0.5"
                     ])

    # Predictions stored in pout
    pout = PredictionOutput(
        classname="weka.classifiers.evaluation.output.prediction.PlainText")

    # Evaluate data
    evaluation = Evaluation(data)
    evaluation.crossvalidate_model(cls, data, 10, Random(1), output=pout)

    print(evaluation.summary())
    print(evaluation.class_details())
    print(evaluation.confusion_matrix)

    # Generate grid for ROC
    # plcls.plot_roc(evaluation, class_index=[0,1], wait=True)

    # mk dirs for output
    dir = dir / "bayesNet_results"
    dir.mkdir(parents=True, exist_ok=True)

    # Save summary, class details and confusion matrix to file
    result_output = filename_base + "_bayesNet_eval_results_TAN.txt"
    output_eval(evaluation, dir / result_output)

    # Save the predicited results to file
    prediction_output = filename_base + "_bayesNet_pred_results_TAN.txt"
    output_pred(pout, dir / prediction_output)

    print("BayesNet complete")
Exemple #46
0
def experiment_file_random(path_features, path_folder_save_results, options,
                           classifier, fold, random, name):
    print(name + "  Start: " + str(datetime.datetime.now()))
    time = datetime.datetime.now()
    cls = Classifier(classname=classifier,
                     options=weka.core.classes.split_options(options))
    d_results = {
        'percent_correct': [],
        'percent_incorrect': [],
        'confusion_matrix': []
    }
    data = converters.load_any_file(path_features)
    data.class_is_last()
    pout = PredictionOutput(
        classname="weka.classifiers.evaluation.output.prediction.CSV")
    evl = Evaluation(data)
    evl.crossvalidate_model(cls, data, fold, Random(random), pout)
    d_results['percent_correct'].append(evl.percent_correct)
    d_results['percent_incorrect'].append(evl.percent_incorrect)
    d_results['confusion_matrix'].append(
        evl.matrix())  # Generates the confusion matrix.

    d_results = pd.DataFrame(data=d_results)

    d_results.to_csv(path_folder_save_results + '/' + str(name) + '.csv',
                     index=False)

    save = pout.buffer_content()

    check_folder_or_create(path_folder_save_results + '/' + 'prediction')

    with open(
            path_folder_save_results + '/' + 'prediction/' + str(name) +
            '.csv', 'w') as f:
        f.write(save)
    print(name + "  End: " + str(datetime.datetime.now() - time))
def train_trees(data, attributes):

    clfs = []
    evls = []
    dt_y_hat = []

    unused_attributes = []

    for i, att in enumerate(attributes):

        data.class_index = i

        count_non_nans = np.count_nonzero(~np.isnan(data.values(i)))

        if count_non_nans < 5:

            unused_attributes.append(i)
            print('Not using attribute {}, only {} real values\n\n'.format(
                att, count_non_nans))
            clfs.append(None)
            evls.append(None)
            dt_y_hat.append(None)
            continue

        this_clf = Classifier(classname='weka.classifiers.trees.J48',
                              options=['-U', '-B', '-M', '2'])
        this_clf.build_classifier(data)

        this_evl = Evaluation(data)
        this_evl.crossvalidate_model(this_clf, data, 5, Random(1))

        dt_y_hat.append(this_clf.distributions_for_instances(data))
        clfs.append(this_clf)
        evls.append(this_evl)

    return clfs, evls, dt_y_hat, unused_attributes
Exemple #48
0
def obtainBayesNet(file):
    #The path of the arff extension file must be put.
    data = converters.load_any_file(folderPathOfArffFiles + file + ".arff")

    #In the case of this specific data set, the first two attributes were removed since they
    #   represent the name and ranking which are unique values that would affect the classification.
    #   Depending on the data set, certain attributes must be removed.
    remove = Filter(classname="weka.filters.unsupervised.attribute.Remove",
                    options=["-R", "1-2"])
    remove.inputformat(data)
    data = remove.filter(data)
    #It is specified that the class value is the last attribute.
    data.class_is_last()

    #Define the classifier to be used.
    classifier = Classifier(classname="weka.classifiers.bayes.BayesNet")
    evaluation = Evaluation(data)
    evaluation.crossvalidate_model(classifier, data, kFold, Random(42))

    #The ROC-AUC is extracted from the string that is received from Weka.
    info = evaluation.class_details()
    roc_area = float(info[406:411])

    return roc_area
def ClassifyParam(mode, binWidths):
	if not os.path.exists("classificationResults"):
		os.makedirs("classificationResults")

	if("normal" in mode):
		file = open("classificationResults/AllVsAll.csv","w") 

		file.write("BinWidth, Accuracy\n")

		for binWidth in binWidths:

			train_set = "Data/arff/TrainSet_%s.arff"%(binWidth)
			test_set = "Data/arff/TestSet_%s.arff"%(binWidth)
			print "Loading Datasets..."

			train_data = converters.load_any_file(train_set)
			test_data = converters.load_any_file(test_set)
			#Set class attribute
			train_data.class_is_last()
			test_data.class_is_last()
			print "Dataset Loaded!"


			classifier_name = "weka.classifiers.meta.FilteredClassifier"

			classifier = Classifier(classname=classifier_name, options=[
				"-F", "weka.filters.unsupervised.attribute.StringToWordVector -R first-last -W 1000 -C -T -N 1 -stemmer weka.core.stemmers.NullStemmer -M 1 -tokenizer \"weka.core.tokenizers.WordTokenizer -delimiters \\\" \\\\r\\\\n\\\\t.,;:\\\\\\\'\\\\\\\"()?!\\\"\"",
				"-W", "weka.classifiers.bayes.NaiveBayesMultinomial"])


			start_train = time.time()
			classifier.build_classifier(train_data)
			end_train = time.time()
			print "Train\t%s\t%s"%(binWidth, end_train-start_train)

			for index, inst in enumerate(test_data):
				if(index == 0):
					start_sample = time.time()
					classifier.classify_instance(inst)
					end_sample = time.time()
					print "Sample\t%s\t%s"%(binWidth, end_sample-start_sample)

			print "Evaluating w/ Multinomial Naive Bayes classifier. BinWidth = %s"%(binWidth)
			evaluation = Evaluation(test_data)
			start_batch = time.time()
			evaluation.test_model(classifier, test_data)
			end_batch = time.time()
			print "Batch\t%s\t%s"%(binWidth,end_batch-start_batch)

			
			print evaluation.summary()
			acc = evaluation.percent_correct/100.0
			print "Percent correct: " + str(acc)

			file.write("%s, %s\n"%(binWidth, acc))
		file.close()
def random_forest(trainData,testData,params,exparams):
    numTrees = int(float(params[0]))
    numFeatures = int(float(params[1]))
    randomforest = RandomForest()
    randomforest.setNumTrees(numTrees)
    randomforest.setNumFeatures(numFeatures)
    randomforest.buildClassifier(trainData)  # only a trained classifier can be evaluated
    # evaluate it on the training
    evaluation = Evaluation(trainData)
    (trainOutput, trainBuffer) = util.get_buffer_for_predictions(trainData)
    attRange = Range()  # no additional attributes output
    outputDistribution = Boolean(False)  # we don't want distribution
    evaluation.evaluateModel(randomforest, trainData, [trainOutput, attRange, outputDistribution])
    print "--> Evaluation:\n"
    print evaluation.toSummaryString()
    trainSummary = makeTrainEvalSummary(evaluation)
    # evaluate it on testing
    evaluation = Evaluation(testData)
    (testOutput, testBuffer) = util.get_buffer_for_predictions(testData)
    attRange = Range()  # no additional attributes output
    outputDistribution = Boolean(False)  # we don't want distribution
    evaluation.evaluateModel(randomforest, testData, [testOutput, attRange, outputDistribution])
    return trainBuffer, testBuffer, trainSummary
def logistic(trainData,testData,params,exparams):
    ridge = float(params[0])
    maxIt = int(float(params[1]))
    print "Ridge=%s, maxIt=%s" %(str(ridge),str(maxIt))
    logistic = Logistic()
    logistic.setMaxIts(maxIt)
    logistic.setRidge(ridge)
    logistic.buildClassifier(trainData)  # only a trained classifier can be evaluated
    # evaluate it on the training
    evaluation = Evaluation(trainData)
    (trainOutput, trainBuffer) = util.get_buffer_for_predictions(trainData)
    attRange = Range()  # no additional attributes output
    outputDistribution = Boolean(False)  # we don't want distribution
    evaluation.evaluateModel(logistic, trainData, [trainOutput, attRange, outputDistribution])
    print "--> Evaluation:\n"
    print evaluation.toSummaryString()
    trainSummary = makeTrainEvalSummary(evaluation)
    # evaluate it on testing
    evaluation = Evaluation(testData)
    (testOutput, testBuffer) = util.get_buffer_for_predictions(testData)
    attRange = Range()  # no additional attributes output
    outputDistribution = Boolean(False)  # we don't want distribution
    evaluation.evaluateModel(logistic, testData, [testOutput, attRange, outputDistribution])
    return trainBuffer, testBuffer, trainSummary
def adaboostM1_simple_logistic(trainData,testData,params,exparams):
    IsOptBoostOnOptSimpLog = str2bool(params[0])  
    simplelogistic = SimpleLogistic()
    adaboostm = AdaBoostM1()
    if IsOptBoostOnOptSimpLog:  # optimal adaboost is based on optimal simple logisatic 
        heuristicStop = int(float(exparams[0]))
        numBoostingIterations = int(float(exparams[1]))
        weightThreshold = int(float(params[1]))
        numIterations = int(float(params[2]))
        simplelogistic.setHeuristicStop(heuristicStop)
        simplelogistic.setNumBoostingIterations(numBoostingIterations)
        adaboostm.setWeightThreshold(weightThreshold)
        adaboostm.setNumIterations(numIterations)       
    else:
        numBoostingIterations = int(float(params[1]))
        numIterations = int(float(params[2]))
        simplelogistic.setNumBoostingIterations(numBoostingIterations)
        adaboostm.setNumIterations(numIterations)       
    adaboostm.setClassifier(simplelogistic)
    adaboostm.buildClassifier(trainData)  # only a trained classifier can be evaluated
    # evaluate it on the training
    evaluation = Evaluation(trainData)
    (trainOutput, trainBuffer) = util.get_buffer_for_predictions(trainData)
    attRange = Range()  # no additional attributes output
    outputDistribution = Boolean(False)  # we don't want distribution
    evaluation.evaluateModel(adaboostm, trainData, [trainOutput, attRange, outputDistribution])
    print "--> Evaluation:\n"
    print evaluation.toSummaryString()
    trainSummary = makeTrainEvalSummary(evaluation)
    # evaluate it on testing
    evaluation = Evaluation(testData)
    (testOutput, testBuffer) = util.get_buffer_for_predictions(testData)
    attRange = Range()  # no additional attributes output
    outputDistribution = Boolean(False)  # we don't want distribution
    evaluation.evaluateModel(adaboostm, testData, [testOutput, attRange, outputDistribution])
    return trainBuffer, testBuffer, trainSummary
def runClassifierAlgo(algo, training_filename, test_filename, do_model, do_eval, do_predict):
    """ Run classifier algorithm <algo> on training data in <training_filename> to build a model
        then run in on data in <test_filename> (equivalent of WEKA "Supplied test set") """
    training_file = FileReader(training_filename)
    training_data = Instances(training_file)
    test_file = FileReader(test_filename)
    test_data = Instances(test_file)

    # set the class Index - the index of the dependent variable
    training_data.setClassIndex(class_index)
    test_data.setClassIndex(class_index)

    # create the model
    algo.buildClassifier(training_data)

    evaluation = None
    # only a trained classifier can be evaluated
    if do_eval or do_predict:
        evaluation = Evaluation(test_data)
        buffer = StringBuffer()  # buffer for the predictions
        attRange = Range()  # no additional attributes output
        outputDistribution = Boolean(False)  # we don't want distribution
        evaluation.evaluateModel(algo, test_data, [buffer, attRange, outputDistribution])

    if verbose:
        if do_model:
            print "--> Generated model:\n"
            print algo.toString()
        if do_eval:
            print "--> Evaluation:\n"
            print evaluation.toSummaryString()
        if do_predict:
            print "--> Predictions:\n"
            print buffer

    return {"model": str(algo), "eval": str(evaluation.toSummaryString()), "predict": str(buffer)}
Exemple #54
0
print("\nLoading dataset: " + fname + "\n")
loader = Loader(classname="weka.core.converters.ArffLoader")
data = loader.load_file(fname)
data.class_is_last()

classifiers = [
    "weka.classifiers.bayes.NaiveBayes",
    "weka.classifiers.lazy.IBk",
    "weka.classifiers.trees.J48"
]

# cross-validate classifiers
for classifier in classifiers:
    # classifier itself
    cls = Classifier(classname=classifier)
    evl = Evaluation(data)
    evl.crossvalidate_model(cls, data, 10, Random(1))
    print("%s: %0.0f%%" % (classifier, evl.percent_correct))
    # meta with cfssubseteval
    meta = SingleClassifierEnhancer(classname="weka.classifiers.meta.AttributeSelectedClassifier")
    meta.options = \
        ["-E", "weka.attributeSelection.CfsSubsetEval",
         "-S", "weka.attributeSelection.BestFirst",
         "-W", classifier]
    evl = Evaluation(data)
    evl.crossvalidate_model(meta, data, 10, Random(1))
    print("%s (cfs): %0.0f%%" % (classifier, evl.percent_correct))
    # meta with wrapper
    meta = SingleClassifierEnhancer(classname="weka.classifiers.meta.AttributeSelectedClassifier")
    meta.options = \
        ["-E", "weka.attributeSelection.WrapperSubsetEval -B " + classifier,
Exemple #55
0
if data_dir is None:
  data_dir = "." + os.sep + "data"

import os
import weka.core.jvm as jvm
from weka.core.converters import Loader
from weka.core.classes import Random
from weka.classifiers import Classifier, Evaluation
from weka.filters import Filter

jvm.start()

# load weather.nominal
loader = Loader(classname="weka.core.converters.ArffLoader")
fname = data_dir + os.sep + "weather.nominal.arff"
print("\nLoading dataset: " + fname + "\n")
data = loader.load_file(fname)
data.set_class_index(data.num_attributes() - 1)

# perform 10-fold cross-validation
cls = Classifier(classname="weka.classifiers.rules.OneR")
evl = Evaluation(data)
evl.crossvalidate_model(cls, data, 10, Random(1))
print("10-fold cross-validation:\n" + evl.to_summary())

# build model on full dataset and output it
cls.build_classifier(data)
print("Model:\n\n" + str(cls))

jvm.stop()
Exemple #56
0
# check commandline parameters
if (not (len(sys.argv) == 2)):
    print "Usage: UsingJ48Ext.py <ARFF-file>"
    sys.exit()

# load data file
print "Loading data..."
file = FileReader(sys.argv[1])
data = Instances(file)

# set the class Index - the index of the dependent variable
data.setClassIndex(data.numAttributes() - 1)

# create the model
evaluation = Evaluation(data)
buffer = StringBuffer()  # buffer for the predictions
attRange = Range()  # no additional attributes output
outputDistribution = Boolean(False)  # we don't want distribution
j48 = J48()
j48.buildClassifier(data)  # only a trained classifier can be evaluated
evaluation.evaluateModel(j48, data, [buffer, attRange, outputDistribution])

# print out the built model
print "--> Generated model:\n"
print j48

print "--> Evaluation:\n"
print evaluation.toSummaryString()

print "--> Predictions:\n"
Exemple #57
0
from weka.classifiers import Classifier, Evaluation, CostMatrix, PredictionOutput

jvm.start()

datasets = [
    "ionosphere.arff",
    "credit-g.arff",
    "breast-cancer.arff",
    "diabetes.arff"
]
classifiers = [
    "weka.classifiers.functions.VotedPerceptron",
    "weka.classifiers.functions.SMO",
]

for dataset in datasets:
    # load dataset
    fname = data_dir + os.sep + dataset
    loader = Loader(classname="weka.core.converters.ArffLoader")
    data = loader.load_file(fname)
    data.set_class_index(data.num_attributes() - 1)

    for classifier in classifiers:
        # cross-validate classifier
        cls = Classifier(classname=classifier)
        evl = Evaluation(data)
        evl.crossvalidate_model(cls, data, 10, Random(1))
        print("%s / %s: %0.1f%%" % (dataset, classifier, evl.percent_correct()))

jvm.stop()
Exemple #58
0
print("\nLoading dataset: " + fname + "\n")
loader = Loader(classname="weka.core.converters.ArffLoader")
data = loader.load_file(fname)
data.set_class_index(data.num_attributes() - 1)

# plot
pld.scatter_plot(
    data, data.get_attribute_by_name("petalwidth").get_index(),
    data.get_attribute_by_name("petallength").get_index(),
    wait=False)

# add classifier errors to dataset
addcls = Filter(
    classname="weka.filters.supervised.attribute.AddClassification",
    options=["-W", "weka.classifiers.trees.J48", "-classification", "-error"])
addcls.set_inputformat(data)
filtered = addcls.filter(data)
print(filtered)

# build J48
cls = Classifier(classname="weka.classifiers.trees.J48")
cls.build_classifier(data)
evl = Evaluation(data)
evl.test_model(cls, data)

# plot classifier errors
plc.plot_classifier_errors(evl.predictions(), wait=True)

jvm.stop()

Exemple #59
0
	def run(self):
		# Attach JVM
		javabridge.attach()

		# Debug

		print "Classifier"
		print self.classifier
		print "Params"
		print self.parameters
		print "Model Params"
		print self.modelParams

		# Get data for testing and learning
		learnerData = self.retrieveData(self.questionID, "learner")
		testData = self.retrieveData(self.questionID, 'test')
		masterData = self.retrieveData(self.questionID, 'all')
		masterData = self.addNominals(masterData)

		# Check if there is enough correct data to run
		if (learnerData.num_instances < 1 or testData.num_instances < 1):
			self.status = self.config.NOT_ENOUGH_DATA
			return False

		# If this is a prediction and there is a valid patient, change masterData header
		patientObj = self.buildPatientObject()
		patientInstance = None
		if ((patientObj is not None) and (self.predict == 1)):
			masterData = self.addPatientNominals(patientObj, masterData)
			patientInstance = self.createPatientInstance(patientObj, masterData)
			masterData.add_instance(patientInstance)

		elif (patientObj is None) and (self.predict == 1):
			print 'No patient defined for prediction. Exiting'
			return True
		# Fix dataset headers up to match and fix instances to match headers
		masterData.delete()
		learner = masterData.copy_instances(masterData, 0, 0)
		test = masterData.copy_instances(masterData, 0, 0)
		self.addInstancesToDataset(learnerData, learner)
		self.addInstancesToDataset(testData, test)

		# Comparison of data for testing purposes
		# print 'learnerData'
		# print learnerData

		# print 'learner'
		# print learner

		# print 'testData'
		# print testData

		# print 'test'
		# print test

		# pdb.set_trace()
		# Instantiate classifier
		self.cls = Classifier(classname=self.classifier, options=self.parameters)

		# Run classifier
		self.cls.build_classifier(learner)
		# for index, inst in enumerate(learnerData):
			# prediction = self.cls.classify_instance(inst)
			# distribution = self.cls.distribution_for_instance(inst)

		# Test classifier
		evl = Evaluation(learner)
		evl.test_model(self.cls, test)

		# Store information about matrix
		self.acc = evl.percent_correct
		self.val = None

		# Convert numpy array into simple array
		confusionMatrix = []
		confusionMatrix.append([evl.confusion_matrix[0][0], evl.confusion_matrix[0][1]])
		confusionMatrix.append([evl.confusion_matrix[1][0], evl.confusion_matrix[1][1]])

		# Convert matrix into json format
		self.matrix = json.dumps(confusionMatrix)

		
		# print 'Classifier: ', self.classifier
		# print 'ID: ', self.questionID
		# print 'ACC: ', self.acc
		# print(evl.summary())

		# If this is a prediction... make the prediction
		if ((patientObj is not None) and (self.predict == 1)):
			masterData.add_instance(patientInstance)
			print "Running prediction on patient: "
			print masterData.get_instance(0)
			self.prediction = self.cls.classify_instance(masterData.get_instance(0))
			#self.uploadPrediction()

		# Temporarily store file to serialize to
		fileName = str(self.questionID) + self.algorithm + ".model"
		serialization.write(fileName, self.cls)

		# Open that file and store it
		self.model = None
		with open(fileName, 'rb') as f:
			self.model = f.read()

		# Remove temporary file
		os.remove(fileName)

		# Set status to awaiting feedback
		self.status = self.config.AWAITING_FEEDBACK_STATUS
		return True
Exemple #60
0
data_dir = os.environ.get("WEKAMOOC_DATA")
if data_dir is None:
    data_dir = "." + os.sep + "data"

import weka.core.jvm as jvm
from weka.core.converters import Loader
from weka.classifiers import Classifier, Evaluation, PredictionOutput
from weka.core.classes import Random
import weka.plot.classifiers as plc

jvm.start()

# load weather.nominal
fname = data_dir + os.sep + "weather.nominal.arff"
print("\nLoading dataset: " + fname + "\n")
loader = Loader(classname="weka.core.converters.ArffLoader")
data = loader.load_file(fname)
data.class_is_last()

# cross-validate NaiveBayes
cls = Classifier(classname="weka.classifiers.bayes.NaiveBayes")
pout = PredictionOutput(classname="weka.classifiers.evaluation.output.prediction.PlainText", options=["-distribution"])
evl = Evaluation(data)
evl.crossvalidate_model(cls, data, 10, Random(1), pout)
print(evl.summary())
print(evl.matrix())
print(pout)
plc.plot_roc(evl, wait=True)

jvm.stop()