def score(self, testExamples, labels):
    """Evaluate the trained classifier on the given test data.

    Serializes testExamples/labels to an ARFF file, reloads it through
    Weka's ArffLoader and returns the percentage of correctly
    classified instances.

    :param testExamples: list of numeric feature vectors, all of the
        same length
    :param labels: parallel list of labels; 1 is written as TRUE,
        anything else as FALSE
    :return: percent correct reported by weka Evaluation
    """
    # Write the test data in ARFF format. A context manager guarantees
    # the file is closed even if a write fails (the original used a
    # bare open()/close() pair).
    with open("testingweka.arff", "w") as f:
        f.write("@relation randomset\n")
        for j in range(len(testExamples[0])):
            f.write("@attribute feature%d real\n" % j)
        f.write("@attribute class {TRUE, FALSE}\n")
        f.write("@data\n")
        for example, label in zip(testExamples, labels):
            for feature in example:
                f.write("%f," % feature)
            if label == 1:
                f.write("TRUE\n")
            else:
                f.write("FALSE\n")
    loader = Loader(classname="weka.core.converters.ArffLoader")
    self.testingData = loader.load_file("testingweka.arff")
    # The class attribute is the last one written above.
    self.testingData.set_class_index(self.testingData.num_attributes() - 1)
    # Priors come from the training data; the model (self.classifier)
    # was built earlier.
    evaluation = Evaluation(self.trainingData)
    evaluation.test_model(self.classifier, self.testingData)
    return evaluation.percent_correct()
def execute(self, featureInclusion, kFold, classIndex):
    """Cross-validate CVParameterSelection on a feature subset.

    Deletes every attribute whose flag in featureInclusion is False,
    sets the class attribute, then k-fold cross-validates Weka's
    CVParameterSelection meta-classifier on the remaining data.

    :param featureInclusion: list of booleans, one per attribute
    :param kFold: number of cross-validation folds
    :param classIndex: index of the class attribute (after deletions)
    :return: percent correct reported by weka Evaluation
    """
    # Deleting an attribute shifts the ones behind it to the left, so
    # keep an offset to translate original indices to current ones.
    deletedFeatures = 0
    for i in range(len(featureInclusion)):
        if not featureInclusion[i]:
            self.instances.deleteAttributeAt(i - deletedFeatures)
            deletedFeatures += 1
    self.instances.setClassIndex(classIndex)
    cvParameterSelection = javabridge.make_instance(
        "Lweka/classifiers/meta/CVParameterSelection", "()V")
    javabridge.call(cvParameterSelection, "setNumFolds", "(I)V", kFold)
    # BUG FIX: javabridge.call takes the method name and the JNI
    # signature as separate arguments, and an object type signature
    # needs a trailing ';'. The original fused them into one string
    # ("buildClassifier(Lweka/core/Instances)V"), which cannot resolve.
    javabridge.call(cvParameterSelection, "buildClassifier",
                    "(Lweka/core/Instances;)V", self.instances)
    # Renamed from 'eval' to avoid shadowing the builtin.
    evaluation = Evaluation(self.instances)
    evaluation.crossvalidate_model(cvParameterSelection, self.instances,
                                   kFold, Random(1))
    return evaluation.percent_correct()
# NOTE(review): this chunk was whitespace-mangled into one line; the
# structure below was restored from its syntax.
if plot.matplotlib_available:
    import matplotlib.pyplot as plt
jvm.start()
# load glass
fname = data_dir + os.sep + "glass.arff"
print("\nLoading dataset: " + fname + "\n")
loader = Loader(classname="weka.core.converters.ArffLoader")
data = loader.load_file(fname)
data.set_class_index(data.num_attributes() - 1)
# compute baseline accuracy with ZeroR via 10-fold cross-validation
evl = Evaluation(data)
evl.crossvalidate_model(Classifier("weka.classifiers.rules.ZeroR"), data, 10, Random(1))
baseline = evl.percent_correct()
# generate learning curves
percentages = [1, 5, 10, 20, 30, 40, 50, 60, 70, 80, 90, 100]
repetitions = [1, 10, 100]
curves = {}
for repetition in repetitions:
    # progress info
    sys.stdout.write("Repetitions=" + str(repetition))
    # initialize curve: percentage -> accumulated percent correct
    curve = {}
    for percentage in percentages:
        curve[percentage] = 0
    curves[repetition] = curve
    # run and add up percentage correct from repetition
    for seed in xrange(repetition):
        # TODO(review): loop body truncated in this chunk of the
        # original file; it continues in a following chunk.
        pass
# load diabetes
loader = Loader(classname="weka.core.converters.ArffLoader")
fname = data_dir + os.sep + "diabetes.arff"
print("\nLoading dataset: " + fname + "\n")
data = loader.load_file(fname)
data.set_class_index(data.num_attributes() - 1)
# evaluate three classifiers on a 90% train/test split, then print the
# model each builds on the full dataset
for classifier in ["weka.classifiers.bayes.NaiveBayes",
                   "weka.classifiers.rules.ZeroR",
                   "weka.classifiers.trees.J48"]:
    # train/test split 90% using classifier
    cls = Classifier(classname=classifier)
    evl = Evaluation(data)
    evl.evaluate_train_test_split(cls, data, 90.0, Random(1))
    print("\n" + classifier + " train/test split (90%):\n" + evl.to_summary())
    cls.build_classifier(data)
    print(classifier + " model:\n\n" + str(cls))
# calculate mean/stdev over 10 cross-validations (seeds 1..10)
for classifier in [
        "weka.classifiers.meta.ClassificationViaRegression",
        "weka.classifiers.bayes.NaiveBayes",
        "weka.classifiers.rules.ZeroR",
        "weka.classifiers.trees.J48",
        "weka.classifiers.functions.Logistic"]:
    accuracy = []
    for i in xrange(1, 11):
        cls = Classifier(classname=classifier)
        evl = Evaluation(data)
        evl.crossvalidate_model(cls, data, 10, Random(i))
        accuracy.append(evl.percent_correct())
    nacc = numpy.array(accuracy)
    print("%s: %0.2f +/-%0.2f" % (classifier, numpy.mean(nacc), numpy.std(nacc)))
jvm.stop()
# NOTE(review): chunk starts mid-script; preds/evl/data come from a
# previous chunk of the original file.
# sort predictions by the probability of the 'good' class
preds.sort(preds.get_attribute_by_name("distribution-good").get_index())
print(evl.to_summary())
print(evl.to_matrix())
print(preds)
# cross-validate CostSensitiveClassifier with J48 (minimize cost)
classifier = "weka.classifiers.meta.CostSensitiveClassifier"
base = "weka.classifiers.trees.J48"
print("\n--> " + classifier + "/" + base + "\n")
cost = array([[0, 1], [5, 0]])
matrx = CostMatrix(matrx=cost)
# -M: minimize expected misclassification cost instead of reweighting
cls = Classifier(classname=classifier,
                 options=["-M", "-W", base, "-cost-matrix", matrx.to_matlab()])
evl = Evaluation(data)
evl.crossvalidate_model(cls, data, 10, Random(1))
print("Accuracy: %0.1f" % evl.percent_correct())
print(evl.to_matrix())
# cross-validate Bagging with J48
classifier = "weka.classifiers.meta.Bagging"
base = "weka.classifiers.trees.J48"
print("\n--> " + classifier + "/" + base + "\n")
cls = Classifier(classname=classifier, options=["-W", base])
evl = Evaluation(data)
evl.crossvalidate_model(cls, data, 10, Random(1))
print("Accuracy: %0.1f" % evl.percent_correct())
print(evl.to_matrix())
# cross-validate CostSensitiveClassifier with NaiveBayes
classifier = "weka.classifiers.meta.CostSensitiveClassifier"
# NOTE(review): the first three statements below almost certainly sat
# inside a conditional (e.g. "package just installed, restart needed")
# in the original file; the guard was lost when the chunk was flattened
# onto one line -- confirm against the full script.
print("Please restart")
jvm.stop()
exit()
# load diabetes
fname = data_dir + os.sep + "diabetes.arff"
print("\nLoading dataset: " + fname + "\n")
loader = Loader(classname="weka.core.converters.ArffLoader")
data = loader.load_file(fname)
data.set_class_index(data.num_attributes() - 1)
# J48 with default parameters as reference
cls = Classifier(classname="weka.classifiers.trees.J48")
evl = Evaluation(data)
evl.crossvalidate_model(cls, data, 10, Random(1))
print("J48: %0.1f%%" % evl.percent_correct())
# CVParameterSelection with J48 - confidenceFactor
# "-P C 0.1 0.9 9" sweeps J48's -C from 0.1 to 0.9 in 9 steps
cls = Classifier(classname="weka.classifiers.meta.CVParameterSelection",
                 options=["-W", "weka.classifiers.trees.J48",
                          "-P", "C 0.1 0.9 9"])
evl = Evaluation(data)
evl.crossvalidate_model(cls, data, 10, Random(1))
print("CVParameterSelection (confidenceFactor): %0.1f%%" % evl.percent_correct())
# CVParameterSelection with J48 - confidenceFactor+minNumObj
cls = Classifier(classname="weka.classifiers.meta.CVParameterSelection",
                 options=["-W", "weka.classifiers.trees.J48",
                          "-P", "C 0.1 0.9 9",
                          "-P", "M 1 10 10"])
evl = Evaluation(data)
evl.crossvalidate_model(cls, data, 10, Random(1))
print("CVParameterSelection (confidenceFactor+minNumObj): %0.1f%%" % evl.percent_correct())
# load the held-out Reuters test set; the training set ('data') was
# loaded in a previous chunk of the original file
fname = data_dir + os.sep + "ReutersGrain-test.arff"
print("\nLoading dataset: " + fname + "\n")
loader = Loader(classname="weka.core.converters.ArffLoader")
test = loader.load_file(fname)
test.set_class_index(test.num_attributes() - 1)
# classifier / StringToWordVector option combinations to compare
# (-C: word counts, -L: lowercase, -S: stopwords)
setups = (
    ("weka.classifiers.trees.J48", []),
    ("weka.classifiers.bayes.NaiveBayes", []),
    ("weka.classifiers.bayes.NaiveBayesMultinomial", []),
    ("weka.classifiers.bayes.NaiveBayesMultinomial", ["-C"]),
    ("weka.classifiers.bayes.NaiveBayesMultinomial", ["-C", "-L", "-S"])
)
# build each setup on the training data and evaluate it on the test set
for setup in setups:
    classifier, opt = setup
    print("\n--> %s (filter options: %s)\n" % (classifier, " ".join(opt)))
    cls = FilteredClassifier()
    cls.set_classifier(Classifier(classname=classifier))
    cls.set_filter(Filter(
        classname="weka.filters.unsupervised.attribute.StringToWordVector",
        options=opt))
    cls.build_classifier(data)
    evl = Evaluation(test)
    evl.test_model(cls, test)
    print("Accuracy: %0.0f%%" % evl.percent_correct())
    tcdata = plc.generate_thresholdcurve_data(evl, 0)
    print("AUC: %0.3f" % plc.get_auc(tcdata))
    print(evl.to_matrix("Matrix:"))
jvm.stop()
jvm.start()
# load diabetes
loader = Loader(classname="weka.core.converters.ArffLoader")
fname = data_dir + os.sep + "diabetes.arff"
print("\nLoading dataset: " + fname + "\n")
data = loader.load_file(fname)
data.set_class_index(data.num_attributes() - 1)
# determine baseline with ZeroR (evaluated on the training data itself)
zeror = Classifier(classname="weka.classifiers.rules.ZeroR")
zeror.build_classifier(data)
evl = Evaluation(data)
evl.test_model(zeror, data)
print("Baseline accuracy (ZeroR): %0.1f%%" % evl.percent_correct())
print("\nHoldout 10%...")
# use seed 1-10 and perform random split with 90% for training
perc = []
for i in xrange(1, 11):
    evl = Evaluation(data)
    evl.evaluate_train_test_split(
        Classifier(classname="weka.classifiers.trees.J48"),
        data, 90.0, Random(i))
    perc.append(round(evl.percent_correct(), 1))
    print("Accuracy with seed %i: %0.1f%%" % (i, evl.percent_correct()))
# calculate mean and standard deviation over the 10 runs
nperc = numpy.array(perc)
print("mean=%0.2f stdev=%0.2f" % (numpy.mean(nperc), numpy.std(nperc)))
# load ionosphere
fname = data_dir + os.sep + "ionosphere.arff"
print("\nLoading dataset: " + fname + "\n")
loader = Loader(classname="weka.core.converters.ArffLoader")
data = loader.load_file(fname)
data.set_class_index(data.num_attributes() - 1)
# 1. "cheating": discretize the full dataset BEFORE cross-validation,
# which leaks test-fold information into the supervised filter
fltr = Filter(classname="weka.filters.supervised.attribute.Discretize", options=[])
fltr.set_inputformat(data)
filtered = fltr.filter(data)
cls = Classifier(classname="weka.classifiers.trees.J48")
evl = Evaluation(filtered)
evl.crossvalidate_model(cls, filtered, 10, Random(1))
cls.build_classifier(filtered)
print("cheating (default): accuracy=%0.1f nodes=%s" % (evl.percent_correct(), get_nodes(str(cls))))
# 2. using FilteredClassifier with default filter: the filter is
# rebuilt inside each training fold, so there is no leakage
cls = FilteredClassifier()
cls.set_classifier(Classifier(classname="weka.classifiers.trees.J48"))
cls.set_filter(Filter(classname="weka.filters.supervised.attribute.Discretize", options=[]))
evl = Evaluation(data)
evl.crossvalidate_model(cls, data, 10, Random(1))
cls.build_classifier(data)
print("FilteredClassifier (default): accuracy=%0.1f nodes=%s" % (evl.percent_correct(), get_nodes(str(cls))))
# 3. using FilteredClassifier with binary splits (-D)
cls = FilteredClassifier()
cls.set_classifier(Classifier(classname="weka.classifiers.trees.J48"))
cls.set_filter(Filter(classname="weka.filters.supervised.attribute.Discretize", options=["-D"]))
evl = Evaluation(data)
# (chunk truncated here in the original file; evaluation 3 continues)
# load weather.nominal
loader = Loader(classname="weka.core.converters.ArffLoader")
fname = data_dir + os.sep + "weather.nominal.arff"
print("\nLoading dataset: " + fname + "\n")
data = loader.load_file(fname)
data.set_class_index(data.num_attributes() - 1)
# define classifiers
classifiers = ["weka.classifiers.rules.OneR", "weka.classifiers.trees.J48"]
# cross-validate original dataset
for classifier in classifiers:
    cls = Classifier(classname=classifier)
    evl = Evaluation(data)
    evl.crossvalidate_model(cls, data, 10, Random(1))
    print("%s (original): %0.0f%%" % (classifier, evl.percent_correct()))
# set attribute 0 ('outlook') to missing in the first 4 'no' instances;
# work on a copy so 'data' stays intact
modified = Instances.copy_instances(data)
count = 0
for i in xrange(modified.num_instances()):
    if modified.get_instance(i).get_string_value(modified.get_class_index()) == "no":
        count += 1
        modified.get_instance(i).set_missing(0)
        if count == 4:
            break
# cross-validate modified dataset
for classifier in classifiers:
    cls = Classifier(classname=classifier)
    evl = Evaluation(modified)
    # (chunk truncated here in the original file; the loop body continues)
# load training and test sets (loader comes from a previous chunk)
fname = data_dir + os.sep + "segment-challenge.arff"
print("\nLoading dataset: " + fname + "\n")
train = loader.load_file(fname)
train.set_class_index(train.num_attributes() - 1)
fname = data_dir + os.sep + "segment-test.arff"
print("\nLoading dataset: " + fname + "\n")
test = loader.load_file(fname)
# BUG FIX: the test set's class index was derived from the TRAINING
# set (train.num_attributes() - 1); derive it from the test set itself.
test.set_class_index(test.num_attributes() - 1)
# build J48 on the training data
cls = Classifier(classname="weka.classifiers.trees.J48")
cls.build_classifier(train)
# evaluate on test
evl = Evaluation(train)
evl.test_model(cls, test)
print("Test set accuracy: %0.0f%%" % evl.percent_correct())
# evaluate on train (optimistic estimate)
evl = Evaluation(train)
evl.test_model(cls, train)
print("Train set accuracy: %0.0f%%" % evl.percent_correct())
# evaluate on a random 66% split
evl = Evaluation(train)
evl.evaluate_train_test_split(cls, train, 66.0, Random(1))
print("Random split accuracy: %0.0f%%" % evl.percent_correct())
jvm.stop()
# NOTE(review): this chunk starts mid-statement in the original file; the
# opening of the first Filter(...) call was reconstructed from the
# parallel test-set call below ("-V" inverts the fold selection so the
# training set keeps 9 of the 10 folds). These statements were
# presumably inside a loop over folds i, and 'evl' was created before
# the loop -- confirm against the full script.
remove = Filter(
    classname="weka.filters.supervised.instance.StratifiedRemoveFolds",
    options=["-N", "10", "-F", str(i), "-S", "1", "-V"])
remove.set_inputformat(data)
train = remove.filter(data)
# create test set: fold i itself
remove = Filter(
    classname="weka.filters.supervised.instance.StratifiedRemoveFolds",
    options=["-N", "10", "-F", str(i), "-S", "1"])
remove.set_inputformat(data)
test = remove.filter(data)
cls = Classifier(classname="weka.classifiers.trees.J48")
cls.build_classifier(train)
evl.test_model(cls, test)
print("Simulated CV accuracy: %0.1f%%" % (evl.percent_correct()))
# perform actual cross-validation for comparison
evl = Evaluation(data)
cls = Classifier(classname="weka.classifiers.trees.J48")
evl.crossvalidate_model(cls, data, 10, Random(1))
print("Actual CV accuracy: %0.1f%%" % (evl.percent_correct()))
# deploy: build the final model on the full dataset
print("Build model on full dataset:\n")
cls = Classifier(classname="weka.classifiers.trees.J48")
cls.build_classifier(data)
print(cls)
jvm.stop()
# continuation of previous chunk: cls/data were prepared without the
# 'outlook' attribute
evl.crossvalidate_model(cls, data, 10, Random(1))
print("10-fold cross-validation (without 'outlook'):\n" + evl.to_summary())
cls.build_classifier(data)
print("Model:\n\n" + str(cls))
# load diabetes
loader = Loader(classname="weka.core.converters.ArffLoader")
fname = data_dir + os.sep + "diabetes.arff"
print("\nLoading dataset: " + fname + "\n")
data = loader.load_file(fname)
data.set_class_index(data.num_attributes() - 1)
# ZeroR baseline
cls = Classifier(classname="weka.classifiers.rules.ZeroR")
evl = Evaluation(data)
evl.crossvalidate_model(cls, data, 10, Random(1))
print("Accuracy 10-fold cross-validation (ZeroR): %0.1f%%" % evl.percent_correct())
# OneR with default bucket size
cls = Classifier(classname="weka.classifiers.rules.OneR")
evl = Evaluation(data)
evl.crossvalidate_model(cls, data, 10, Random(1))
print("Accuracy 10-fold cross-validation (OneR): %0.1f%%" % evl.percent_correct())
cls.build_classifier(data)
print(cls)
# OneR with minimum bucket size 1 (-B 1)
cls = Classifier(classname="weka.classifiers.rules.OneR", options=["-B", "1"])
evl = Evaluation(data)
evl.crossvalidate_model(cls, data, 10, Random(1))
print("Accuracy 10-fold cross-validation (OneR -B 1): %0.1f%%" % evl.percent_correct())
cls = Classifier(classname="weka.classifiers.rules.OneR", options=["-B", "1"])
cls.build_classifier(data)
# install stackingC if necessary; newly installed packages only become
# visible after a JVM restart, hence the stop/exit
if not packages.is_installed("stackingC"):
    print("Installing stackingC...")
    packages.install_package("stackingC")
    jvm.stop()
    print("Please restart")
    exit()
# load glass
loader = Loader(classname="weka.core.converters.ArffLoader")
fname = data_dir + os.sep + "glass.arff"
print("\nLoading dataset: " + fname + "\n")
data = loader.load_file(fname)
data.set_class_index(data.num_attributes() - 1)
# compare several meta-classifiers with J48
for classifier in [("weka.classifiers.trees.J48", []),
                   ("weka.classifiers.meta.Bagging", []),
                   ("weka.classifiers.trees.RandomForest", []),
                   ("weka.classifiers.meta.AdaBoostM1", []),
                   ("weka.classifiers.meta.Stacking", []),
                   ("weka.classifiers.meta.StackingC",
                    ["-B", "weka.classifiers.lazy.IBk",
                     "-B", "weka.classifiers.rules.PART",
                     "-B", "weka.classifiers.trees.J48"])]:
    # cross-validate classifier
    cname, coptions = classifier
    cls = Classifier(classname=cname, options=coptions)
    evl = Evaluation(data)
    evl.crossvalidate_model(cls, data, 10, Random(1))
    print(cname + " cross-validated accuracy: %0.2f" % evl.percent_correct())
jvm.stop()
# load a dataset
# NOTE(review): despite the 'iris' naming, the file loaded is
# HairEyeColor.csv -- confirm which dataset is intended.
iris_file = "HairEyeColor.csv"
print("Loading dataset: " + iris_file)
loader = Loader(classname="weka.core.converters.CSVLoader")
iris_data = loader.load_file(iris_file)
# BUG FIX: num_attributes is a method in this API; without the call
# parentheses the bound method object was printed instead of the count.
print(iris_data.num_attributes())
iris_data.set_class_index(iris_data.num_attributes() - 1)
# build a classifier and output model
print("Training J48 classifier on iris")
classifier = Classifier(classname="weka.test.Regression")
#classifier = Classifier(classname="weka.classifiers.trees.J48", options=["-C", "0.5"])
# Instead of using 'options=["-C", "0.3"]' in the constructor, we can also set the "confidenceFactor"
# property of the J48 classifier itself. However, being of type float rather than double, we need
# to convert it to the correct type first using the double_to_float function:
#classifier.set_property("confidenceFactor", types.double_to_float(0.3))
classifier.build_classifier(iris_data)
print(classifier)
print(classifier.graph())
#plot_graph.plot_dot_graph(classifier.graph())
evaluation = Evaluation(iris_data)  # initialize with priors
evaluation.crossvalidate_model(classifier, iris_data, 10, Random(42))  # 10-fold CV
print(evaluation.to_summary())
print("pctCorrect: " + str(evaluation.percent_correct()))
print("incorrect: " + str(evaluation.incorrect()))
jvm.stop()
# load dataset (fname comes from a previous chunk)
print("\nLoading dataset: " + fname + "\n")
loader = Loader(classname="weka.core.converters.ArffLoader")
data = loader.load_file(fname)
data.set_class_index(data.num_attributes() - 1)
classifiers = [
    "weka.classifiers.trees.J48",
    "weka.classifiers.lazy.IBk"
]
# cross-validate classifiers on the full attribute set
for classifier in classifiers:
    cls = Classifier(classname=classifier)
    evl = Evaluation(data)
    evl.crossvalidate_model(cls, data, 10, Random(1))
    print("%s: %0.0f%%" % (classifier, evl.percent_correct()))
# wrapper-based attribute selection; -B names the classifier the
# wrapper evaluates candidate subsets with
for classifier in classifiers:
    aseval = ASEvaluation(classname="weka.attributeSelection.WrapperSubsetEval",
                          options=["-B", classifier])
    assearch = ASSearch(classname="weka.attributeSelection.BestFirst",
                        options=[])
    attsel = AttributeSelection()
    attsel.set_evaluator(aseval)
    attsel.set_search(assearch)
    attsel.select_attributes(data)
    reduced = attsel.reduce_dimensionality(data)
    cls = Classifier(classname=classifier)
    evl = Evaluation(reduced)
    # (chunk truncated here in the original file; the loop body continues)
# load dataset (loader/fname come from a previous chunk)
data = loader.load_file(fname)
data.set_class_index(data.num_attributes() - 1)
classifiers = [
    "weka.classifiers.bayes.NaiveBayes",
    "weka.classifiers.lazy.IBk",
    "weka.classifiers.trees.J48"
]
# cross-validate each classifier plain and wrapped in
# AttributeSelectedClassifier
for classifier in classifiers:
    # classifier on its own
    cls = Classifier(classname=classifier)
    evl = Evaluation(data)
    evl.crossvalidate_model(cls, data, 10, Random(1))
    print("%s: %0.0f%%" % (classifier, evl.percent_correct()))
    # meta with CfsSubsetEval
    meta = SingleClassifierEnhancer(classname="weka.classifiers.meta.AttributeSelectedClassifier")
    meta.set_options(
        ["-E", "weka.attributeSelection.CfsSubsetEval",
         "-S", "weka.attributeSelection.BestFirst",
         "-W", classifier])
    evl = Evaluation(data)
    evl.crossvalidate_model(meta, data, 10, Random(1))
    print("%s (cfs): %0.0f%%" % (classifier, evl.percent_correct()))
    # meta with wrapper evaluation (the wrapper uses the same classifier)
    meta = SingleClassifierEnhancer(classname="weka.classifiers.meta.AttributeSelectedClassifier")
    meta.set_options(
        ["-E", "weka.attributeSelection.WrapperSubsetEval -B " + classifier,
         "-S", "weka.attributeSelection.BestFirst",
         "-W", classifier])
    # (chunk truncated here in the original file; the loop body continues)
# load dataset (loader/fname come from a previous chunk)
data = loader.load_file(fname)
data.set_class_index(data.num_attributes() - 1)
# cross-validate classifiers
classifiers = [
    "weka.classifiers.functions.MultilayerPerceptron",
    "weka.classifiers.trees.J48",
    "weka.classifiers.bayes.NaiveBayes",
    "weka.classifiers.functions.SMO",
    "weka.classifiers.lazy.IBk"
]
for classifier in classifiers:
    cls = Classifier(classname=classifier)
    evl = Evaluation(data)
    evl.crossvalidate_model(cls, data, 10, Random(1))
    print("%s: %0.0f%%" % (classifier, evl.percent_correct()))
# configure experiment
print("This will take some time, so grab a cuppa... And a muffin... And read the paper...")
datasets = [
    data_dir + os.sep + "iris.arff",
    data_dir + os.sep + "breast-cancer.arff",
    data_dir + os.sep + "credit-g.arff",
    data_dir + os.sep + "diabetes.arff",
    data_dir + os.sep + "glass.arff",
    data_dir + os.sep + "ionosphere.arff"
]
classifiers = [
    Classifier(classname="weka.classifiers.functions.MultilayerPerceptron"),
    Classifier(classname="weka.classifiers.rules.ZeroR"),
    Classifier(classname="weka.classifiers.rules.OneR"),
    # TODO(review): this list is truncated in this chunk of the original
    # file; the remaining entries (and the real closing bracket) follow
    # in the next chunk. Closed here only to keep the fragment parseable.
]
# load dataset (fname comes from a previous chunk)
print("\nLoading dataset: " + fname + "\n")
loader = Loader(classname="weka.core.converters.ArffLoader")
data = loader.load_file(fname)
data.set_class_index(data.num_attributes() - 1)
# compare J48 on the raw data (bins=0) against equal-width and
# equal-frequency (-F) discretization with various bin counts
for equal in ["", "-F"]:
    print("\nEqual frequency binning? " + str(equal == "-F") + "\n")
    for bins in [0, 40, 10, 5, 2]:
        if bins > 0:
            # NOTE(review): when equal == "" an empty-string option is
            # passed to the filter; it appears to be tolerated -- confirm.
            fltr = Filter(classname="weka.filters.unsupervised.attribute.Discretize",
                          options=["-B", str(bins), equal])
            fltr.set_inputformat(data)
            filtered = fltr.filter(data)
        else:
            filtered = data
        cls = Classifier(classname="weka.classifiers.trees.J48")
        # cross-validate
        evl = Evaluation(filtered)
        evl.crossvalidate_model(cls, filtered, 10, Random(1))
        # build classifier on full dataset
        cls.build_classifier(filtered)
        # get size of tree from the model's string representation
        lines = str(cls).split("\n")
        nodes = "N/A"
        for line in lines:
            if line.find("Size of the tree :") > -1:
                nodes = line.replace("Size of the tree :", "").strip()
        # output stats
        print("bins=%i accuracy=%0.1f nodes=%s" % (bins, evl.percent_correct(), nodes))
jvm.stop()
from weka.classifiers import Classifier, Evaluation, CostMatrix, PredictionOutput

jvm.start()
# datasets and classifiers to compare
datasets = [
    "ionosphere.arff",
    "credit-g.arff",
    "breast-cancer.arff",
    "diabetes.arff"
]
classifiers = [
    "weka.classifiers.functions.VotedPerceptron",
    "weka.classifiers.functions.SMO",
]
# 10-fold cross-validate every classifier on every dataset
for dataset in datasets:
    # load dataset
    fname = data_dir + os.sep + dataset
    loader = Loader(classname="weka.core.converters.ArffLoader")
    data = loader.load_file(fname)
    data.set_class_index(data.num_attributes() - 1)
    for classifier in classifiers:
        # cross-validate classifier
        cls = Classifier(classname=classifier)
        evl = Evaluation(data)
        evl.crossvalidate_model(cls, data, 10, Random(1))
        print("%s / %s: %0.1f%%" % (dataset, classifier, evl.percent_correct()))
jvm.stop()
import numpy
import weka.core.jvm as jvm
from weka.core.converters import Loader
from weka.core.classes import Random
from weka.classifiers import Classifier, Evaluation

jvm.start()
# load segment-challenge
loader = Loader(classname="weka.core.converters.ArffLoader")
fname = data_dir + os.sep + "segment-challenge.arff"
print("\nLoading dataset: " + fname + "\n")
train = loader.load_file(fname)
train.set_class_index(train.num_attributes() - 1)
# use seed 1-10 and perform random split with 90% for training
perc = []
for i in xrange(1, 11):
    evl = Evaluation(train)
    evl.evaluate_train_test_split(
        Classifier(classname="weka.classifiers.trees.J48"),
        train, 90.0, Random(i))
    perc.append(round(evl.percent_correct(), 1))
    print("Accuracy with seed %i: %0.1f%%" % (i, evl.percent_correct()))
# calculate mean and standard deviation over the 10 runs
nperc = numpy.array(perc)
print("mean=%0.2f stdev=%0.2f" % (numpy.mean(nperc), numpy.std(nperc)))
jvm.stop()