def test_model(self, test_data, empty_solution, evaluate=False):
    """Test a previously trained model on *test_data* and write predictions.

    Parameters:
        test_data: Weka Instances the serialized model is evaluated on.
        empty_solution: sheet file whose rows supply 'userid'/'tweetid';
            one prediction is written per row, in order.
        evaluate: when True, additionally write an evaluation summary to
            ``self.evaluation_file`` (skipped if it already exists).

    Does nothing when the prediction file already exists (except the
    optional evaluation step); returns early when no trained model file
    is found on disk.
    """
    model_weka = None
    if os.path.isfile(self.prediction_file):
        print('Model ' + self.name + ' already tested.')
    elif not os.path.isfile(self.model_file):
        # No serialized model on disk: nothing we can test.
        print('Impossible testing this model. It should be trained first.')
        return
    else:
        print('Starting to test_model model ' + self.name + '.')
        model_weka = Classifier(jobject=serialization.read(self.model_file))
        evaluation = Evaluation(data=test_data)
        evaluation.test_model(classifier=model_weka, data=test_data)
        predictions = evaluation.predictions()
        rows = read_sheet(file_name=empty_solution)
        # Pair each sheet row with its prediction in order; zip() avoids the
        # O(n^2) cost of calling predictions.pop(0) once per row.
        solutions = [[row['userid'], row['tweetid'], pred.predicted()]
                     for row, pred in zip(rows, predictions)]
        write_the_solution_file(solutions, self.prediction_file)
        print('Model ' + self.name + ' tested.')
    if evaluate:
        if os.path.isfile(self.evaluation_file):
            print('Model ' + self.name + ' already evaluated.')
            return
        elif model_weka is None:
            # Prediction file existed, so the model was never loaded above;
            # reload and re-run the evaluation to produce the summary.
            model_weka = Classifier(jobject=serialization.read(self.model_file))
            evaluation = Evaluation(data=test_data)
            evaluation.test_model(classifier=model_weka, data=test_data)
        save_file(file_name=self.evaluation_file, content=evaluation.to_summary())
        print('Model ' + self.name + ' evaluated.')
# Dataset location: WEKAMOOC_DATA environment variable overrides "./data".
# Fix: the original referenced `data_dir` and `os.sep` before `import os` ran
# and never initialized `data_dir`, raising NameError on the first line.
import os

data_dir = os.environ.get("WEKAMOOC_DATA")
if data_dir is None:
    data_dir = "." + os.sep + "data"

import weka.core.jvm as jvm
from weka.core.converters import Loader
from weka.core.classes import Random
from weka.classifiers import Classifier, Evaluation
from weka.filters import Filter

jvm.start()

# load weather.nominal
loader = Loader(classname="weka.core.converters.ArffLoader")
fname = data_dir + os.sep + "weather.nominal.arff"
print("\nLoading dataset: " + fname + "\n")
data = loader.load_file(fname)
data.set_class_index(data.num_attributes() - 1)

# perform 10-fold cross-validation
cls = Classifier(classname="weka.classifiers.rules.OneR")
evl = Evaluation(data)
evl.crossvalidate_model(cls, data, 10, Random(1))
print("10-fold cross-validation:\n" + evl.to_summary())

# build model on full dataset and output it
cls.build_classifier(data)
print("Model:\n\n" + str(cls))

jvm.stop()
# Use the WEKAMOOC_DATA environment variable to set the location
# for the datasets
import os

data_dir = os.environ.get("WEKAMOOC_DATA")
if data_dir is None:
    data_dir = "." + os.sep + "data"

import weka.core.jvm as jvm
from weka.core.converters import Loader
from weka.core.classes import Random
from weka.classifiers import Classifier, Evaluation

jvm.start()

# load glass
arff_loader = Loader(classname="weka.core.converters.ArffLoader")
dataset_path = data_dir + os.sep + "glass.arff"
print("\nLoading dataset: " + dataset_path + "\n")
instances = arff_loader.load_file(dataset_path)
instances.set_class_index(instances.num_attributes() - 1)

# cross-validate IBk for several neighbourhood sizes, show each summary
for k in [1, 5, 20]:
    nearest = Classifier(classname="weka.classifiers.lazy.IBk",
                         options=["-K", str(k)])
    result = Evaluation(instances)
    result.crossvalidate_model(nearest, instances, 10, Random(1))
    print("10-fold cross-validation (k=" + str(k) + "):\n" + result.to_summary())

jvm.stop()
# NOTE(review): `data`, `data_dir`, `plg`, `FilteredClassifier` and
# `PredictionOutput` are defined/imported earlier in this script, outside
# this chunk.
data.set_class_index(data.num_attributes() - 1)

# 1a filter data
print("Filtering data...")
# Fix: pass the classname via keyword, consistent with every other
# Filter(...) construction in this file.
fltr = Filter(classname="weka.filters.unsupervised.attribute.StringToWordVector")
fltr.set_inputformat(data)
filtered = fltr.filter(data)
filtered.set_class_index(0)

# 1b build classifier
print("Building/evaluating classifier...")
cls = Classifier(classname="weka.classifiers.trees.J48")
cls.build_classifier(filtered)
evl = Evaluation(filtered)
evl.test_model(cls, filtered)
print(evl.to_summary())
print(str(cls))
plg.plot_dot_graph(cls.graph())

# 2. filtered classifier
fname = data_dir + os.sep + "simpletext-test.arff"
print("\nLoading dataset: " + fname + "\n")
loader = Loader(classname="weka.core.converters.ArffLoader")
test = loader.load_file(fname)
test.set_class_index(test.num_attributes() - 1)
print("Building/evaluating filtered classifier...")
cls = FilteredClassifier()
cls.set_classifier(Classifier(classname="weka.classifiers.trees.J48"))
cls.set_filter(Filter(classname="weka.filters.unsupervised.attribute.StringToWordVector"))
cls.build_classifier(data)
pout = PredictionOutput(classname="weka.classifiers.evaluation.output.prediction.PlainText")
jvm.start()

# load diabetes
loader = Loader(classname="weka.core.converters.ArffLoader")
fname = data_dir + os.sep + "diabetes.arff"
print("\nLoading dataset: " + fname + "\n")
data = loader.load_file(fname)
data.set_class_index(data.num_attributes() - 1)

# evaluate several classifiers on a single 90% train/test split
for classifier in ["weka.classifiers.bayes.NaiveBayes",
                   "weka.classifiers.rules.ZeroR",
                   "weka.classifiers.trees.J48"]:
    # train/test split 90% using classifier
    cls = Classifier(classname=classifier)
    evl = Evaluation(data)
    evl.evaluate_train_test_split(cls, data, 90.0, Random(1))
    print("\n" + classifier + " train/test split (90%):\n" + evl.to_summary())
    cls.build_classifier(data)
    print(classifier + " model:\n\n" + str(cls))

# calculate mean/stdev over 10 cross-validations
for classifier in [
        "weka.classifiers.meta.ClassificationViaRegression",
        "weka.classifiers.bayes.NaiveBayes",
        "weka.classifiers.rules.ZeroR",
        "weka.classifiers.trees.J48",
        "weka.classifiers.functions.Logistic"]:
    accuracy = []
    # Fix: range() instead of Python-2-only xrange(); identical behavior
    # for this small loop and keeps the script runnable on Python 3.
    for i in range(1, 11):
        cls = Classifier(classname=classifier)
        evl = Evaluation(data)
        evl.crossvalidate_model(cls, data, 10, Random(i))
        accuracy.append(evl.percent_correct())
    nacc = numpy.array(accuracy)
    print("%s: %0.2f +/-%0.2f" % (classifier, numpy.mean(nacc), numpy.std(nacc)))
from weka.core.classes import Random
from weka.classifiers import Classifier, Evaluation

jvm.start()

# Compare default (pruned) vs unpruned J48 on two datasets.
for dataset in ["diabetes.arff", "breast-cancer.arff"]:
    # load dataset
    arff_loader = Loader(classname="weka.core.converters.ArffLoader")
    dataset_path = data_dir + os.sep + dataset
    print("\nLoading dataset: " + dataset_path + "\n")
    instances = arff_loader.load_file(dataset_path)
    instances.set_class_index(instances.num_attributes() - 1)

    # cross-validate default J48, display model
    tree = Classifier(classname="weka.classifiers.trees.J48")
    result = Evaluation(instances)
    result.crossvalidate_model(tree, instances, 10, Random(1))
    print("10-fold cross-validation (default):\n" + result.to_summary())
    tree.build_classifier(instances)
    print("Model (default):\n\n" + str(tree))

    # cross-validate unpruned J48 (-U), display model
    tree = Classifier(classname="weka.classifiers.trees.J48", options=["-U"])
    result = Evaluation(instances)
    result.crossvalidate_model(tree, instances, 10, Random(1))
    print("10-fold cross-validation (unpruned):\n" + result.to_summary())
    tree.build_classifier(instances)
    print("Model (unpruned):\n\n" + str(tree))

jvm.stop()
# Convert the last attribute (the label) from string to nominal.
to_nominal = Filter(classname="weka.filters.unsupervised.attribute.StringToNominal",
                    options=["-R", "last"])
to_nominal.set_inputformat(data)
data = to_nominal.filter(data)

# Convert the first attribute (the text content) back to a string type.
to_string = Filter(classname="weka.filters.unsupervised.attribute.NominalToString",
                   options=["-C", "first"])
to_string.set_inputformat(data)
data = to_string.filter(data)

# The class is the last attribute.
data.set_class_index(data.num_attributes() - 1)

# Baseline accuracy: majority-class ZeroR, 10-fold cross-validation.
baseline = Classifier(classname="weka.classifiers.rules.ZeroR")
evaluation = Evaluation(data)
evaluation.crossvalidate_model(baseline, data, 10, Random(1))
print("\nBaseline:\n" + evaluation.to_summary())

# Text mining: turn attribute 1 into word-vector attributes ("att-" prefix),
# then cross-validate a J48 tree on the result.
tree = Classifier(classname="weka.classifiers.trees.J48")
word_vector = Filter(
    classname="weka.filters.unsupervised.attribute.StringToWordVector",
    options=["-R", "1", "-P", "att-"])
word_vector.set_inputformat(data)
data = word_vector.filter(data)
evaluation = Evaluation(data)
evaluation.crossvalidate_model(tree, data, 10, Random(1))
print("\nJ48:\n" + evaluation.to_summary())

# stop JVM
jvm.stop()
# load a dataset (CSV, not ARFF — hence the CSVLoader below)
iris_file = "HairEyeColor.csv"
print("Loading dataset: " + iris_file)
loader = Loader(classname="weka.core.converters.CSVLoader")
iris_data = loader.load_file(iris_file)
# Fix: num_attributes is a method — without the call parentheses the bound
# method object itself was printed instead of the attribute count.
print(iris_data.num_attributes())
iris_data.set_class_index(iris_data.num_attributes() - 1)

# build a classifier and output model
print("Training J48 classifier on iris")
classifier = Classifier(classname="weka.test.Regression")
#classifier = Classifier(classname="weka.classifiers.trees.J48", options=["-C", "0.5"])
# Instead of using 'options=["-C", "0.3"]' in the constructor, we can also set the "confidenceFactor"
# property of the J48 classifier itself. However, being of type float rather than double, we need
# to convert it to the correct type first using the double_to_float function:
#classifier.set_property("confidenceFactor", types.double_to_float(0.3))
classifier.build_classifier(iris_data)
print(classifier)
print(classifier.graph())
#plot_graph.plot_dot_graph(classifier.graph())

evaluation = Evaluation(iris_data)                     # initialize with priors
evaluation.crossvalidate_model(classifier, iris_data, 10, Random(42))  # 10-fold CV
print(evaluation.to_summary())
print("pctCorrect: " + str(evaluation.percent_correct()))
print("incorrect: " + str(evaluation.incorrect()))

jvm.stop()
# Apply the filter configured earlier in the script (outside this chunk).
data = wfilter.filter(data)

# Convert the first attribute (the text content) to a string type.
to_string = Filter(
    classname="weka.filters.unsupervised.attribute.NominalToString",
    options=["-C", "first"])
to_string.set_inputformat(data)
data = to_string.filter(data)

# The class is the last attribute.
data.set_class_index(data.num_attributes() - 1)

# Baseline accuracy: majority-class ZeroR, 10-fold cross-validation.
baseline = Classifier(classname="weka.classifiers.rules.ZeroR")
evaluation = Evaluation(data)
evaluation.crossvalidate_model(baseline, data, 10, Random(1))
print("\nBaseline:\n" + evaluation.to_summary())

# Text mining: word-vector attributes from attribute 1 ("att-" prefix),
# then a J48 tree cross-validated on the transformed data.
tree = Classifier(classname="weka.classifiers.trees.J48")
word_vector = Filter(
    classname="weka.filters.unsupervised.attribute.StringToWordVector",
    options=["-R", "1", "-P", "att-"])
word_vector.set_inputformat(data)
data = word_vector.filter(data)
evaluation = Evaluation(data)
evaluation.crossvalidate_model(tree, data, 10, Random(1))
print("\nJ48:\n" + evaluation.to_summary())

# stop JVM
jvm.stop()
from weka.classifiers import Classifier, Evaluation

jvm.start()

# load weather.nominal
arff_loader = Loader(classname="weka.core.converters.ArffLoader")
dataset_path = data_dir + os.sep + "weather.nominal.arff"
print("\nLoading dataset: " + dataset_path + "\n")
instances = arff_loader.load_file(dataset_path)
instances.set_class_index(instances.num_attributes() - 1)

# perform 10-fold cross-validation on the full attribute set
rule = Classifier(classname="weka.classifiers.rules.OneR")
result = Evaluation(instances)
result.crossvalidate_model(rule, instances, 10, Random(1))
print("10-fold cross-validation (full):\n" + result.to_summary())
rule.build_classifier(instances)
print("Model:\n\n" + str(rule))

# remove attribute "outlook"
print("Removing attribute 'outlook'")
outlook_index = instances.get_attribute_by_name("outlook").get_index()
instances.delete_attribute(outlook_index)

# perform 10-fold cross-validation on the reduced dataset
rule = Classifier(classname="weka.classifiers.rules.OneR")
result = Evaluation(instances)
result.crossvalidate_model(rule, instances, 10, Random(1))
print("10-fold cross-validation (without 'outlook'):\n" + result.to_summary())
rule.build_classifier(instances)
print("Model:\n\n" + str(rule))