def J48(data, rnm):
    """Cross-validate and train a J48 decision tree on ``data``.

    Writes four artifacts, each prefixed with ``rnm``: the printable tree
    model, per-instance CV predictions, the evaluation summary/class details,
    and a ROC plot PNG.

    Args:
        data: weka ``Instances``; the last attribute is used as the class.
        rnm: str prefix for every output file name.

    Returns:
        str: percent of correctly classified instances from 10-fold CV.
    """
    data.class_is_last()
    # J48 behind a Remove filter so the first attribute (an ID column,
    # presumably -- TODO confirm) is excluded from training.
    fc = FilteredClassifier()
    fc.classifier = Classifier(classname="weka.classifiers.trees.J48",
                               options=["-C", "0.25", "-M", "2"])
    fc.filter = Filter(classname="weka.filters.unsupervised.attribute.Remove",
                       options=["-R", "first"])
    pred_output = PredictionOutput(
        classname="weka.classifiers.evaluation.output.prediction.CSV",
        options=["-p", "1"])
    folds = 10
    evl = Evaluation(data)
    evl.crossvalidate_model(fc, data, folds, Random(1), pred_output)
    # Re-train on the full data set so the printable model reflects all data.
    fc.build_classifier(data)

    # Python-3 file writing with context managers; the original used the
    # Python-2 ``print >> f`` statement, which is a SyntaxError under
    # Python 3 (the rest of this file is Python 3).
    with open(rnm + '_J48_Tree.txt', 'w') as f0:
        print("Filename: ", rnm, file=f0)
        print('\n\n', file=f0)
        print(str(fc), file=f0)

    with open(rnm + '_J48_Prediction.txt', 'w') as f1:
        print('Filename:', rnm, file=f1)
        print('Prediction Summary:', pred_output.buffer_content(), file=f1)

    with open(rnm + '_J48_Evaluation.txt', 'w') as f2:
        print('Filename:', rnm, file=f2)
        print('Evaluation Summary:', evl.summary(), file=f2)
        print('\n\n\n', file=f2)
        print(evl.class_details(), file=f2)

    plot_roc(evl, class_index=[0, 1], title=rnm, key_loc='best',
             outfile=rnm + '_J48_ROC.png', wait=False)
    return str(evl.percent_correct)
# Evaluate one data-set group: load the calibration (train), test and
# validation ARFF files, fit a filtered linear regression on the calibration
# data and report its performance on the test set.
# NOTE(review): `group` and `data_dir` are defined earlier in the script
# (this chunk appears to be a loop body) -- confirm against the full file.
print(group)
# File naming convention: <group>_Cal / _Test / _Val .arff
train = data_dir + os.sep + group + "_Cal.arff"
test = data_dir + os.sep + group + "_Test.arff"
pred = data_dir + os.sep + group + "_Val.arff"
loader = Loader(classname="weka.core.converters.ArffLoader")
print(train)
train_data = loader.load_file(train)
# "reference value" is the target attribute in these ARFF files.
train_data.class_index = train_data.attribute_by_name(
    "reference value").index
print(test)
test_data = loader.load_file(test)
test_data.class_index = test_data.attribute_by_name(
    "reference value").index
print(pred)
# Validation set is loaded here but not evaluated in this chunk.
pred_data = loader.load_file(pred)
pred_data.class_index = pred_data.attribute_by_name(
    "reference value").index
# Linear regression behind a Remove filter that drops the first attribute
# (presumably a sample ID -- TODO confirm).
cls = FilteredClassifier()
cls.classifier = Classifier(
    classname="weka.classifiers.functions.LinearRegression",
    options=["-S", "1", "-C"])
cls.filter = Filter(classname="weka.filters.unsupervised.attribute.Remove",
                    options=["-R", "first"])
cls.build_classifier(train_data)
# Evaluate the calibration model on the held-out test set.
evl = Evaluation(train_data)
evl.test_model(cls, test_data)
print(evl.summary())
jvm.stop()
# Compare several text classifiers (each wrapped with a StringToWordVector
# filter) on the Reuters grain test set.
fname = data_dir + os.sep + "ReutersGrain-test.arff"
print("\nLoading dataset: " + fname + "\n")
loader = Loader(classname="weka.core.converters.ArffLoader")
test = loader.load_file(fname)
# Property-style API, consistent with the rest of this file: the old
# set_class_index()/num_attributes() setter API was removed from
# python-weka-wrapper3 and would raise AttributeError.
test.class_is_last()

# (classifier classname, StringToWordVector filter options) pairs to compare.
setups = (
    ("weka.classifiers.trees.J48", []),
    ("weka.classifiers.bayes.NaiveBayes", []),
    ("weka.classifiers.bayes.NaiveBayesMultinomial", []),
    ("weka.classifiers.bayes.NaiveBayesMultinomial", ["-C"]),
    ("weka.classifiers.bayes.NaiveBayesMultinomial", ["-C", "-L", "-S"])
)

# cross-validate classifiers
for setup in setups:
    classifier, opt = setup
    print("\n--> %s (filter options: %s)\n" % (classifier, " ".join(opt)))
    cls = FilteredClassifier()
    # Properties replace the removed set_classifier()/set_filter() methods.
    cls.classifier = Classifier(classname=classifier)
    cls.filter = Filter(
        classname="weka.filters.unsupervised.attribute.StringToWordVector",
        options=opt)
    # NOTE(review): `data` (the training set) must be loaded earlier in the
    # script -- it is not visible in this chunk.
    cls.build_classifier(data)
    evl = Evaluation(test)
    evl.test_model(cls, test)
    # percent_correct is a property and matrix() the current accessor in
    # python-weka-wrapper3 (percent_correct()/to_matrix() no longer exist).
    print("Accuracy: %0.0f%%" % evl.percent_correct)
    tcdata = plc.generate_thresholdcurve_data(evl, 0)
    print("AUC: %0.3f" % plc.get_auc(tcdata))
    print(evl.matrix("Matrix:"))
jvm.stop()
# Per-group train/test workflow: fit a linear regression (behind a Remove
# filter that drops the first attribute) on each group's calibration set and
# report its accuracy on the matching test set.
print("Train/test/predict...")
groups = ["DataSet1", "DataSet2"]
# groups = ["DataSet2"]
for group in groups:
    print(group)
    # File naming convention: <group>_Cal / _Test / _Val .arff
    train = data_dir + os.sep + group + "_Cal.arff"
    test = data_dir + os.sep + group + "_Test.arff"
    pred = data_dir + os.sep + group + "_Val.arff"
    loader = Loader(classname="weka.core.converters.ArffLoader")

    def _load(path):
        # Echo the path, load the ARFF file and mark "reference value" as
        # the class attribute.
        print(path)
        dataset = loader.load_file(path)
        dataset.class_index = dataset.attribute_by_name(
            "reference value").index
        return dataset

    train_data = _load(train)
    test_data = _load(test)
    pred_data = _load(pred)  # validation data; loaded but not evaluated here

    cls = FilteredClassifier()
    cls.classifier = Classifier(
        classname="weka.classifiers.functions.LinearRegression",
        options=["-S", "1", "-C"])
    cls.filter = Filter(
        classname="weka.filters.unsupervised.attribute.Remove",
        options=["-R", "first"])
    cls.build_classifier(train_data)

    # Evaluate the calibration model on the held-out test set.
    evl = Evaluation(train_data)
    evl.test_model(cls, test_data)
    print(evl.summary())
jvm.stop()
jvm.start(system_cp=True, packages=True, max_heap_size="512m") # Train classifier loader = Loader(classname="weka.core.converters.ArffLoader", options=["-charset", "UTF-8"]) train_data = loader.load_file(os.path.dirname(os.path.realpath(__file__)) + "/datasets/train.arff") train_data.class_is_last() string_to_word_vector_filter = Filter(classname="weka.filters.unsupervised.attribute.StringToWordVector") cls = Classifier(classname="weka.classifiers.bayes.NaiveBayesMultinomial") fc = FilteredClassifier() fc.filter = string_to_word_vector_filter fc.classifier = cls fc.build_classifier(train_data) # Create test data class_att = Attribute.create_nominal("class", ["good", "neutral", "bad"]) str_att = Attribute.create_string("title") test_dataset = Instances.create_instances( name="test_news_set", atts=[str_att, class_att], capacity=1 ) inst = Instance.create_instance([Instance.missing_value(), Instance.missing_value()]) test_dataset.add_instance(inst) test_dataset.get_instance(0).set_string_value(0, article['processed']['title'])
# Compare several text classifiers, each wrapped in a FilteredClassifier
# with a StringToWordVector filter, on the Reuters grain test set.
fname = data_dir + os.sep + "ReutersGrain-test.arff"
print("\nLoading dataset: " + fname + "\n")
loader = Loader(classname="weka.core.converters.ArffLoader")
test = loader.load_file(fname)
test.class_is_last()

# (classifier classname, StringToWordVector filter options) combinations.
setups = (
    ("weka.classifiers.trees.J48", []),
    ("weka.classifiers.bayes.NaiveBayes", []),
    ("weka.classifiers.bayes.NaiveBayesMultinomial", []),
    ("weka.classifiers.bayes.NaiveBayesMultinomial", ["-C"]),
    ("weka.classifiers.bayes.NaiveBayesMultinomial",
     ["-C", "-L", "-stopwords-handler", "weka.core.stopwords.Rainbow"])
)

# Train each setup and report accuracy, AUC and the confusion matrix.
for classname, filter_options in setups:
    print("\n--> %s (filter options: %s)\n"
          % (classname, " ".join(filter_options)))
    clf = FilteredClassifier()
    clf.classifier = Classifier(classname=classname)
    clf.filter = Filter(
        classname="weka.filters.unsupervised.attribute.StringToWordVector",
        options=filter_options)
    # NOTE(review): `data` (the training set) is loaded earlier in the
    # script -- not visible in this chunk.
    clf.build_classifier(data)
    evaluation = Evaluation(test)
    evaluation.test_model(clf, test)
    print("Accuracy: %0.0f%%" % evaluation.percent_correct)
    curve = plc.generate_thresholdcurve_data(evaluation, 0)
    print("AUC: %0.3f" % plc.get_auc(curve))
    print(evaluation.matrix("Matrix:"))
jvm.stop()
evl.crossvalidate_model(fc, train, 5, Random(1)) # Prints Out Confusion Matrix along with other summary statistics print("LMT (SMOTE balanced classes) CV = 5 Error: %.2f%%" % (evl.percent_incorrect)) print(evl.matrix()) #Confusion Matrix # Plots ROC plcls.plot_roc(evl, class_index=[0, 1], wait=True) # Extra Summary print(evl.summary()) print(evl.class_details()) # Evaluate the classifier on test set fc.build_classifier(train) tevl = Evaluation(test) tevl.test_model(fc, test) # Prints Out Confusion Matrix along with other summary statistics print("LMT (SMOTE balanced classes) Test Error: %.2f%%" % (tevl.percent_incorrect)) print(tevl.matrix()) #Confusion Matrix # Plots ROC plcls.plot_roc(tevl, class_index=[0, 1], wait=True) # Extra Summary print(tevl.summary()) print(tevl.class_details())