def vote_classifier_train(dicrectory, nameOfDataSet, flag):
    """Train and evaluate a Vote ensemble on a CSV dataset.

    The ensemble averages the probability distributions (-R AVG) of five
    base schemes: J48, RandomTree, Bagging/REPTree, AdaBoostM1/DecisionStump
    and a second Bagging/REPTree, plus NaiveBayes.

    :param dicrectory: path of the CSV file to load
    :param nameOfDataSet: dataset label forwarded to print_and_save
    :param flag: True -> 10-fold cross-validation, False -> 80/20 split
    """
    csv_loader = Loader(classname="weka.core.converters.CSVLoader")
    dataset = csv_loader.load_file(dicrectory)
    dataset.class_is_last()

    ensemble = MultipleClassifiersCombiner(
        classname="weka.classifiers.meta.Vote",
        options=[
            '-S', '1',
            '-B', 'weka.classifiers.trees.J48 -C 0.25 -M 2',
            '-B', 'weka.classifiers.trees.RandomTree -K 6 -M 1.0 -V 0.001 -S 1',
            '-B', 'weka.classifiers.meta.Bagging -P 100 -S 1 -num-slots 1 -I 10 -W weka.classifiers.trees.REPTree -- -M 2 -V 0.001 -N 3 -S 1 -L -1 -I 0.0',
            '-B', 'weka.classifiers.meta.AdaBoostM1 -P 100 -S 1 -I 10 -W weka.classifiers.trees.DecisionStump',
            '-B', 'weka.classifiers.meta.Bagging -P 100 -S 1 -num-slots 1 -I 10 -W weka.classifiers.trees.REPTree -- -M 2 -V 0.001 -N 3 -S 1 -L -1 -I 0.0',
            '-B', 'weka.classifiers.bayes.NaiveBayes ',
            '-R', 'AVG',
        ])

    evaluation = Evaluation(dataset)
    pred_output = PredictionOutput(
        classname="weka.classifiers.evaluation.output.prediction.PlainText")

    # Fixed seed (1) keeps both evaluation modes reproducible.
    if flag:
        evaluation.crossvalidate_model(ensemble, dataset, 10, Random(1), pred_output)
    else:
        evaluation.evaluate_train_test_split(ensemble, dataset, 80.0, Random(1), pred_output)

    gc.collect()
    print_and_save('Proposed model', flag, nameOfDataSet, evaluation)
def experimenter(self):
    """Perform a test using all classifiers available.

    Every Weka alias (except a few very slow schemes) is evaluated on a
    66% train / 34% test split of ``self.data``; accuracy and build time
    are appended to a plain-text report.

    Returns
    -------
    info : string
        Info with results of experimenter.
    """
    info = ""
    aliases = sorted(WekaAlias.get_aliases())
    for alias in aliases:
        try:
            # Ignore very slow classifiers.
            if alias in ('KStar', 'LWL', 'MultilayerPerceptron'):
                continue

            start_time = TimeUtils.get_time()
            classifier = WClassifier(classname=WekaAlias.get_classifier(alias))

            info += "Scheme:\t%s %s\n" % (
                str(classifier.classname),
                " ".join([str(option) for option in classifier.options]))

            evl = WEvaluation(self.data)
            evl.evaluate_train_test_split(classifier, self.data, 66, WRandom(1))

            info += "Correctly Classified Instances: %0.4f%%\n" % (evl.percent_correct)
            info += "Time taken to build model: %0.5f seconds\n\n" % (TimeUtils.get_time() - start_time)
        except Exception as e:
            if str(e) != 'Object does not implement or subclass weka.classifiers.Classifier: __builtin__.NoneType':
                # BUG FIX: 'WekaAlias.get_aliases()' returns a list, so indexing
                # it with the alias *string* raised TypeError and masked the real
                # error; report the alias name directly instead.
                info += "Exception in %s: %s\n\n" % (alias, str(e))
    return info
def naive_bayse(dicrectory, nameOfDataSet, flag):
    """Evaluate a plain NaiveBayes classifier on a CSV dataset.

    :param dicrectory: path of the CSV file to load
    :param nameOfDataSet: dataset label forwarded to print_and_save
    :param flag: True -> 10-fold cross-validation, False -> 80/20 split
    """
    instances = Loader(classname="weka.core.converters.CSVLoader").load_file(dicrectory)
    instances.class_is_last()

    bayes = Classifier(classname='weka.classifiers.bayes.NaiveBayes')
    evaluation = Evaluation(instances)
    pred_output = PredictionOutput(
        classname="weka.classifiers.evaluation.output.prediction.PlainText")

    # Seed 1 so repeated runs produce identical folds/splits.
    if flag:
        evaluation.crossvalidate_model(bayes, instances, 10, Random(1), pred_output)
    else:
        evaluation.evaluate_train_test_split(bayes, instances, 80.0, Random(1), pred_output)

    print_and_save('Naive Bayes model', flag, nameOfDataSet, evaluation)
    gc.collect()
def evaluate(classifier, data):
    """
    Private function that makes evaluation of classifier on given data.
    With command line arguments we can chose which evaluation to use.
    :param classifier: Classifier
    :param data: weka arff data
    :return: Evaluation
    """
    args = evaluate_parser()
    evl = Evaluation(data)
    mode = args['evaluation']
    if mode == 'train_test':
        evl.evaluate_train_test_split(
            classifier, data, int(args['train_size']), Random(1))
    elif mode == 'cross_validate':
        evl.crossvalidate_model(
            classifier, data, int(args['folds']), Random(42))
    else:
        # Default: test the classifier on the full dataset itself.
        evl.test_model(classifier, data)
    return evl
# Stand-alone script (Python 2): evaluates NaiveBayes on the grid training set.
from utilities import *
import weka.core.jvm as jvm
from weka.core.converters import Loader, Saver
from weka.classifiers import Classifier, Evaluation
from weka.core.classes import Random

# The JVM must be running before any Weka wrapper class is used.
jvm.start(max_heap_size="3072m")

loader = Loader(classname="weka.core.converters.ArffLoader")
data = loader.load_file("./Dataset/trainGrid.arff")
data.class_is_last()

#classifier = Classifier(classname="weka.classifiers.trees.J48", options=["-C", "0.25", "-M", "2"])
classifier = Classifier(classname="weka.classifiers.bayes.NaiveBayes")

evaluation = Evaluation(data)
#evaluation.crossvalidate_model(classifier, data, 10, Random(42))
# 66% train / 34% test split with a fixed seed for reproducibility.
evaluation.evaluate_train_test_split(classifier, data, 66, Random(42))

# Summary plus confusion matrix; file output is currently disabled.
res = evaluation.summary()
res += "\n" + evaluation.matrix()
#f = open('./Dataset/resultsGrid.txt', 'w')
#f.write(res)
print res  # Python 2 print statement
jvm.stop()
# NOTE(review): 'loader', 'fname', 'numpy', 'Classifier', 'Evaluation' and
# 'Random' are defined earlier in this script (not visible in this chunk);
# 'xrange' marks this as Python 2 code.
data = loader.load_file(fname)
data.set_class_index(data.num_attributes() - 1)

# determine baseline with ZeroR
zeror = Classifier(classname="weka.classifiers.rules.ZeroR")
zeror.build_classifier(data)
evl = Evaluation(data)
evl.test_model(zeror, data)
print("Baseline accuracy (ZeroR): %0.1f%%" % evl.percent_correct())

print("\nHoldout 10%...")
# use seed 1-10 and perform random split with 90%
perc = []
for i in xrange(1, 11):
    evl = Evaluation(data)
    evl.evaluate_train_test_split(
        Classifier(classname="weka.classifiers.trees.J48"), data, 90.0, Random(i))
    perc.append(round(evl.percent_correct(), 1))
    print("Accuracy with seed %i: %0.1f%%" % (i, evl.percent_correct()))
# calculate mean and standard deviation
nperc = numpy.array(perc)
print("mean=%0.2f stdev=%0.2f" % (numpy.mean(nperc), numpy.std(nperc)))

print("\n10-fold Cross-validation...")
# use seed 1-10 and perform 10-fold CV
perc = []
for i in xrange(1, 11):
    evl = Evaluation(data)
    evl.crossvalidate_model(Classifier(classname="weka.classifiers.trees.J48"), data, 10, Random(i))
    perc.append(round(evl.percent_correct(), 1))
    print("Accuracy with seed %i: %0.1f%%" % (i, evl.percent_correct()))
def main():
    """
    Just runs some example code.

    Walks through the python-weka-wrapper classifier API: construction
    (classname, partial name, command line, kernel), training, evaluation
    (test set, split, cross-validation with many metrics), incremental
    learning, meta-classifiers, plotting and direct Java API access.
    NOTE(review): uses 'xrange' and the 'types.double_to_float' helper —
    Python 2-era example code.
    """
    # load a dataset
    iris_file = helper.get_data_dir() + os.sep + "iris.arff"
    helper.print_info("Loading dataset: " + iris_file)
    loader = Loader("weka.core.converters.ArffLoader")
    iris_data = loader.load_file(iris_file)
    iris_data.class_is_last()

    # classifier help
    helper.print_title("Creating help string")
    classifier = Classifier(classname="weka.classifiers.trees.J48")
    print(classifier.to_help())

    # partial classname
    helper.print_title("Creating classifier from partial classname")
    clsname = ".J48"
    classifier = Classifier(classname=clsname)
    print(clsname + " --> " + classifier.classname)

    # classifier from commandline
    helper.print_title("Creating SMO from command-line string")
    cmdline = 'weka.classifiers.functions.SMO -K "weka.classifiers.functions.supportVector.NormalizedPolyKernel -E 3.0"'
    classifier = from_commandline(cmdline, classname="weka.classifiers.Classifier")
    classifier.build_classifier(iris_data)
    print("input: " + cmdline)
    print("output: " + classifier.to_commandline())
    print("model:\n" + str(classifier))

    # kernel classifier
    helper.print_title("Creating SMO as KernelClassifier")
    kernel = Kernel(
        classname="weka.classifiers.functions.supportVector.RBFKernel",
        options=["-G", "0.001"])
    classifier = KernelClassifier(classname="weka.classifiers.functions.SMO", options=["-M"])
    classifier.kernel = kernel
    classifier.build_classifier(iris_data)
    print("classifier: " + classifier.to_commandline())
    print("model:\n" + str(classifier))

    # build a classifier and output model
    helper.print_title("Training J48 classifier on iris")
    classifier = Classifier(classname="weka.classifiers.trees.J48")
    # Instead of using 'options=["-C", "0.3"]' in the constructor, we can also set the "confidenceFactor"
    # property of the J48 classifier itself. However, being of type float rather than double, we need
    # to convert it to the correct type first using the double_to_float function:
    classifier.set_property("confidenceFactor", types.double_to_float(0.3))
    classifier.build_classifier(iris_data)
    print(classifier)
    print(classifier.graph)
    print(classifier.to_source("MyJ48"))
    plot_graph.plot_dot_graph(classifier.graph)

    # evaluate model on test set
    helper.print_title("Evaluating J48 classifier on iris")
    evaluation = Evaluation(iris_data)
    evl = evaluation.test_model(classifier, iris_data)
    print(evl)
    print(evaluation.summary())

    # evaluate model on train/test split
    helper.print_title("Evaluating J48 classifier on iris (random split 66%)")
    classifier = Classifier(classname="weka.classifiers.trees.J48", options=["-C", "0.3"])
    evaluation = Evaluation(iris_data)
    evaluation.evaluate_train_test_split(classifier, iris_data, 66.0, Random(1))
    print(evaluation.summary())

    # load a dataset incrementally and build classifier incrementally
    helper.print_title("Build classifier incrementally on iris")
    helper.print_info("Loading dataset: " + iris_file)
    loader = Loader("weka.core.converters.ArffLoader")
    iris_inc = loader.load_file(iris_file, incremental=True)
    iris_inc.class_is_last()
    classifier = Classifier(
        classname="weka.classifiers.bayes.NaiveBayesUpdateable")
    classifier.build_classifier(iris_inc)
    for inst in loader:
        classifier.update_classifier(inst)
    print(classifier)

    # construct meta-classifiers
    helper.print_title("Meta classifiers")

    # generic FilteredClassifier instantiation
    print("generic FilteredClassifier instantiation")
    meta = SingleClassifierEnhancer(
        classname="weka.classifiers.meta.FilteredClassifier")
    meta.classifier = Classifier(
        classname="weka.classifiers.functions.LinearRegression")
    flter = Filter("weka.filters.unsupervised.attribute.Remove")
    flter.options = ["-R", "first"]
    meta.set_property("filter", flter.jobject)
    print(meta.to_commandline())

    # direct FilteredClassifier instantiation
    print("direct FilteredClassifier instantiation")
    meta = FilteredClassifier()
    meta.classifier = Classifier(
        classname="weka.classifiers.functions.LinearRegression")
    flter = Filter("weka.filters.unsupervised.attribute.Remove")
    flter.options = ["-R", "first"]
    meta.filter = flter
    print(meta.to_commandline())

    # generic Vote
    print("generic Vote instantiation")
    meta = MultipleClassifiersCombiner(classname="weka.classifiers.meta.Vote")
    classifiers = [
        Classifier(classname="weka.classifiers.functions.SMO"),
        Classifier(classname="weka.classifiers.trees.J48")
    ]
    meta.classifiers = classifiers
    print(meta.to_commandline())

    # cross-validate nominal classifier
    helper.print_title("Cross-validating NaiveBayes on diabetes")
    diabetes_file = helper.get_data_dir() + os.sep + "diabetes.arff"
    helper.print_info("Loading dataset: " + diabetes_file)
    loader = Loader("weka.core.converters.ArffLoader")
    diabetes_data = loader.load_file(diabetes_file)
    diabetes_data.class_is_last()
    classifier = Classifier(classname="weka.classifiers.bayes.NaiveBayes")
    pred_output = PredictionOutput(
        classname="weka.classifiers.evaluation.output.prediction.PlainText",
        options=["-distribution"])
    evaluation = Evaluation(diabetes_data)
    evaluation.crossvalidate_model(classifier, diabetes_data, 10, Random(42), output=pred_output)
    print(evaluation.summary())
    print(evaluation.class_details())
    print(evaluation.matrix())
    # Dump every metric the Evaluation wrapper exposes.
    print("areaUnderPRC/0: " + str(evaluation.area_under_prc(0)))
    print("weightedAreaUnderPRC: " + str(evaluation.weighted_area_under_prc))
    print("areaUnderROC/1: " + str(evaluation.area_under_roc(1)))
    print("weightedAreaUnderROC: " + str(evaluation.weighted_area_under_roc))
    print("avgCost: " + str(evaluation.avg_cost))
    print("totalCost: " + str(evaluation.total_cost))
    print("confusionMatrix: " + str(evaluation.confusion_matrix))
    print("correct: " + str(evaluation.correct))
    print("pctCorrect: " + str(evaluation.percent_correct))
    print("incorrect: " + str(evaluation.incorrect))
    print("pctIncorrect: " + str(evaluation.percent_incorrect))
    print("unclassified: " + str(evaluation.unclassified))
    print("pctUnclassified: " + str(evaluation.percent_unclassified))
    print("coverageOfTestCasesByPredictedRegions: " + str(evaluation.coverage_of_test_cases_by_predicted_regions))
    print("sizeOfPredictedRegions: " + str(evaluation.size_of_predicted_regions))
    print("falseNegativeRate: " + str(evaluation.false_negative_rate(1)))
    print("weightedFalseNegativeRate: " + str(evaluation.weighted_false_negative_rate))
    print("numFalseNegatives: " + str(evaluation.num_false_negatives(1)))
    print("trueNegativeRate: " + str(evaluation.true_negative_rate(1)))
    print("weightedTrueNegativeRate: " + str(evaluation.weighted_true_negative_rate))
    print("numTrueNegatives: " + str(evaluation.num_true_negatives(1)))
    print("falsePositiveRate: " + str(evaluation.false_positive_rate(1)))
    print("weightedFalsePositiveRate: " + str(evaluation.weighted_false_positive_rate))
    print("numFalsePositives: " + str(evaluation.num_false_positives(1)))
    print("truePositiveRate: " + str(evaluation.true_positive_rate(1)))
    print("weightedTruePositiveRate: " + str(evaluation.weighted_true_positive_rate))
    print("numTruePositives: " + str(evaluation.num_true_positives(1)))
    print("fMeasure: " + str(evaluation.f_measure(1)))
    print("weightedFMeasure: " + str(evaluation.weighted_f_measure))
    print("unweightedMacroFmeasure: " + str(evaluation.unweighted_macro_f_measure))
    print("unweightedMicroFmeasure: " + str(evaluation.unweighted_micro_f_measure))
    print("precision: " + str(evaluation.precision(1)))
    print("weightedPrecision: " + str(evaluation.weighted_precision))
    print("recall: " + str(evaluation.recall(1)))
    print("weightedRecall: " + str(evaluation.weighted_recall))
    print("kappa: " + str(evaluation.kappa))
    print("KBInformation: " + str(evaluation.kb_information))
    print("KBMeanInformation: " + str(evaluation.kb_mean_information))
    print("KBRelativeInformation: " + str(evaluation.kb_relative_information))
    print("SFEntropyGain: " + str(evaluation.sf_entropy_gain))
    print("SFMeanEntropyGain: " + str(evaluation.sf_mean_entropy_gain))
    print("SFMeanPriorEntropy: " + str(evaluation.sf_mean_prior_entropy))
    print("SFMeanSchemeEntropy: " + str(evaluation.sf_mean_scheme_entropy))
    print("matthewsCorrelationCoefficient: " + str(evaluation.matthews_correlation_coefficient(1)))
    print("weightedMatthewsCorrelation: " + str(evaluation.weighted_matthews_correlation))
    print("class priors: " + str(evaluation.class_priors))
    print("numInstances: " + str(evaluation.num_instances))
    print("meanAbsoluteError: " + str(evaluation.mean_absolute_error))
    print("meanPriorAbsoluteError: " + str(evaluation.mean_prior_absolute_error))
    print("relativeAbsoluteError: " + str(evaluation.relative_absolute_error))
    print("rootMeanSquaredError: " + str(evaluation.root_mean_squared_error))
    print("rootMeanPriorSquaredError: " + str(evaluation.root_mean_prior_squared_error))
    print("rootRelativeSquaredError: " + str(evaluation.root_relative_squared_error))
    print("prediction output:\n" + str(pred_output))
    plot_cls.plot_roc(evaluation, title="ROC diabetes", class_index=range(
        0, diabetes_data.class_attribute.num_values), wait=False)
    plot_cls.plot_prc(evaluation, title="PRC diabetes", class_index=range(
        0, diabetes_data.class_attribute.num_values), wait=False)

    # train 2nd classifier on diabetes dataset
    classifier2 = Classifier(classname="weka.classifiers.trees.RandomForest")
    evaluation2 = Evaluation(diabetes_data)
    evaluation2.crossvalidate_model(classifier2, diabetes_data, 10, Random(42))
    plot_cls.plot_rocs({
        "NB": evaluation,
        "RF": evaluation2
    }, title="ROC diabetes", class_index=0, wait=False)
    plot_cls.plot_prcs({
        "NB": evaluation,
        "RF": evaluation2
    }, title="PRC diabetes", class_index=0, wait=False)

    # load a numeric dataset
    bolts_file = helper.get_data_dir() + os.sep + "bolts.arff"
    helper.print_info("Loading dataset: " + bolts_file)
    loader = Loader("weka.core.converters.ArffLoader")
    bolts_data = loader.load_file(bolts_file)
    bolts_data.class_is_last()

    # build a classifier and output model
    helper.print_title("Training LinearRegression on bolts")
    classifier = Classifier(
        classname="weka.classifiers.functions.LinearRegression",
        options=["-S", "1", "-C"])
    classifier.build_classifier(bolts_data)
    print(classifier)

    # cross-validate numeric classifier
    helper.print_title("Cross-validating LinearRegression on bolts")
    classifier = Classifier(
        classname="weka.classifiers.functions.LinearRegression",
        options=["-S", "1", "-C"])
    evaluation = Evaluation(bolts_data)
    evaluation.crossvalidate_model(classifier, bolts_data, 10, Random(42))
    print(evaluation.summary())
    print("correlationCoefficient: " + str(evaluation.correlation_coefficient))
    print("errorRate: " + str(evaluation.error_rate))
    helper.print_title("Header - bolts")
    print(str(evaluation.header))
    helper.print_title("Predictions on bolts")
    for index, pred in enumerate(evaluation.predictions):
        print(
            str(index + 1) + ": " + str(pred) + " -> error=" + str(pred.error))
    plot_cls.plot_classifier_errors(evaluation.predictions, wait=False)

    # train 2nd classifier and show errors in same plot
    classifier2 = Classifier(classname="weka.classifiers.functions.SMOreg")
    evaluation2 = Evaluation(bolts_data)
    evaluation2.crossvalidate_model(classifier2, bolts_data, 10, Random(42))
    plot_cls.plot_classifier_errors(
        {
            "LR": evaluation.predictions,
            "SMOreg": evaluation2.predictions
        }, wait=False)

    # learning curve
    cls = [
        Classifier(classname="weka.classifiers.trees.J48"),
        Classifier(classname="weka.classifiers.bayes.NaiveBayesUpdateable")
    ]
    plot_cls.plot_learning_curve(cls, diabetes_data, increments=0.05,
                                 label_template="[#] !", metric="percent_correct", wait=True)

    # access classifier's Java API
    labor_file = helper.get_data_dir() + os.sep + "labor.arff"
    helper.print_info("Loading dataset: " + labor_file)
    loader = Loader("weka.core.converters.ArffLoader")
    labor_data = loader.load_file(labor_file)
    labor_data.class_is_last()
    helper.print_title("Using JRip's Java API to access rules")
    jrip = Classifier(classname="weka.classifiers.rules.JRip")
    jrip.build_classifier(labor_data)
    rset = jrip.jwrapper.getRuleset()
    for i in xrange(rset.size()):
        r = rset.get(i)
        print(str(r.toString(labor_data.class_attribute.jobject)))
from weka.classifiers import Classifier, Evaluation

# NOTE(review): 'jvm', 'Loader', 'Random', 'data_dir', 'os' and 'numpy' come
# from parts of this script not shown in this chunk; 'xrange' and the old
# 'to_summary' API mark it as Python 2 / early wrapper code.
jvm.start()

# load diabetes
loader = Loader(classname="weka.core.converters.ArffLoader")
fname = data_dir + os.sep + "diabetes.arff"
print("\nLoading dataset: " + fname + "\n")
data = loader.load_file(fname)
data.set_class_index(data.num_attributes() - 1)

for classifier in ["weka.classifiers.bayes.NaiveBayes", "weka.classifiers.rules.ZeroR", "weka.classifiers.trees.J48"]:
    # train/test split 90% using classifier
    cls = Classifier(classname=classifier)
    evl = Evaluation(data)
    evl.evaluate_train_test_split(cls, data, 90.0, Random(1))
    print("\n" + classifier + " train/test split (90%):\n" + evl.to_summary())
    cls.build_classifier(data)
    print(classifier + " model:\n\n" + str(cls))

# calculate mean/stdev over 10 cross-validations
for classifier in [
        "weka.classifiers.meta.ClassificationViaRegression",
        "weka.classifiers.bayes.NaiveBayes",
        "weka.classifiers.rules.ZeroR",
        "weka.classifiers.trees.J48",
        "weka.classifiers.functions.Logistic"]:
    accuracy = []
    # seeds 1..10, one 10-fold CV per seed
    for i in xrange(1, 11):
        cls = Classifier(classname=classifier)
        evl = Evaluation(data)
        evl.crossvalidate_model(cls, data, 10, Random(i))
        accuracy.append(evl.percent_correct())
    nacc = numpy.array(accuracy)
def main():
    """
    Just runs some example code.

    Second variant of the classifier API walkthrough: construction,
    training, evaluation (test set, split, cross-validation with many
    metrics), incremental learning, meta-classifiers, plotting and direct
    Java API access. Uses the 'typeconv.double_to_float' helper and
    'range' (newer wrapper / Python 3-compatible version).
    """
    # load a dataset
    iris_file = helper.get_data_dir() + os.sep + "iris.arff"
    helper.print_info("Loading dataset: " + iris_file)
    loader = Loader("weka.core.converters.ArffLoader")
    iris_data = loader.load_file(iris_file)
    iris_data.class_is_last()

    # classifier help
    helper.print_title("Creating help string")
    classifier = Classifier(classname="weka.classifiers.trees.J48")
    print(classifier.to_help())

    # partial classname
    helper.print_title("Creating classifier from partial classname")
    clsname = ".J48"
    classifier = Classifier(classname=clsname)
    print(clsname + " --> " + classifier.classname)

    # classifier from commandline
    helper.print_title("Creating SMO from command-line string")
    cmdline = 'weka.classifiers.functions.SMO -K "weka.classifiers.functions.supportVector.NormalizedPolyKernel -E 3.0"'
    classifier = from_commandline(cmdline, classname="weka.classifiers.Classifier")
    classifier.build_classifier(iris_data)
    print("input: " + cmdline)
    print("output: " + classifier.to_commandline())
    print("model:\n" + str(classifier))

    # kernel classifier
    helper.print_title("Creating SMO as KernelClassifier")
    kernel = Kernel(classname="weka.classifiers.functions.supportVector.RBFKernel", options=["-G", "0.001"])
    classifier = KernelClassifier(classname="weka.classifiers.functions.SMO", options=["-M"])
    classifier.kernel = kernel
    classifier.build_classifier(iris_data)
    print("classifier: " + classifier.to_commandline())
    print("model:\n" + str(classifier))

    # build a classifier and output model
    helper.print_title("Training J48 classifier on iris")
    classifier = Classifier(classname="weka.classifiers.trees.J48")
    # Instead of using 'options=["-C", "0.3"]' in the constructor, we can also set the "confidenceFactor"
    # property of the J48 classifier itself. However, being of type float rather than double, we need
    # to convert it to the correct type first using the double_to_float function:
    classifier.set_property("confidenceFactor", typeconv.double_to_float(0.3))
    classifier.build_classifier(iris_data)
    print(classifier)
    print(classifier.graph)
    print(classifier.to_source("MyJ48"))
    plot_graph.plot_dot_graph(classifier.graph)

    # evaluate model on test set
    helper.print_title("Evaluating J48 classifier on iris")
    evaluation = Evaluation(iris_data)
    evl = evaluation.test_model(classifier, iris_data)
    print(evl)
    print(evaluation.summary())

    # evaluate model on train/test split
    helper.print_title("Evaluating J48 classifier on iris (random split 66%)")
    classifier = Classifier(classname="weka.classifiers.trees.J48", options=["-C", "0.3"])
    evaluation = Evaluation(iris_data)
    evaluation.evaluate_train_test_split(classifier, iris_data, 66.0, Random(1))
    print(evaluation.summary())

    # load a dataset incrementally and build classifier incrementally
    helper.print_title("Build classifier incrementally on iris")
    helper.print_info("Loading dataset: " + iris_file)
    loader = Loader("weka.core.converters.ArffLoader")
    iris_inc = loader.load_file(iris_file, incremental=True)
    iris_inc.class_is_last()
    classifier = Classifier(classname="weka.classifiers.bayes.NaiveBayesUpdateable")
    classifier.build_classifier(iris_inc)
    for inst in loader:
        classifier.update_classifier(inst)
    print(classifier)

    # construct meta-classifiers
    helper.print_title("Meta classifiers")

    # generic FilteredClassifier instantiation
    print("generic FilteredClassifier instantiation")
    meta = SingleClassifierEnhancer(classname="weka.classifiers.meta.FilteredClassifier")
    meta.classifier = Classifier(classname="weka.classifiers.functions.LinearRegression")
    flter = Filter("weka.filters.unsupervised.attribute.Remove")
    flter.options = ["-R", "first"]
    meta.set_property("filter", flter.jobject)
    print(meta.to_commandline())

    # direct FilteredClassifier instantiation
    print("direct FilteredClassifier instantiation")
    meta = FilteredClassifier()
    meta.classifier = Classifier(classname="weka.classifiers.functions.LinearRegression")
    flter = Filter("weka.filters.unsupervised.attribute.Remove")
    flter.options = ["-R", "first"]
    meta.filter = flter
    print(meta.to_commandline())

    # generic Vote
    print("generic Vote instantiation")
    meta = MultipleClassifiersCombiner(classname="weka.classifiers.meta.Vote")
    classifiers = [
        Classifier(classname="weka.classifiers.functions.SMO"),
        Classifier(classname="weka.classifiers.trees.J48")
    ]
    meta.classifiers = classifiers
    print(meta.to_commandline())

    # cross-validate nominal classifier
    helper.print_title("Cross-validating NaiveBayes on diabetes")
    diabetes_file = helper.get_data_dir() + os.sep + "diabetes.arff"
    helper.print_info("Loading dataset: " + diabetes_file)
    loader = Loader("weka.core.converters.ArffLoader")
    diabetes_data = loader.load_file(diabetes_file)
    diabetes_data.class_is_last()
    classifier = Classifier(classname="weka.classifiers.bayes.NaiveBayes")
    pred_output = PredictionOutput(
        classname="weka.classifiers.evaluation.output.prediction.PlainText",
        options=["-distribution"])
    evaluation = Evaluation(diabetes_data)
    evaluation.crossvalidate_model(classifier, diabetes_data, 10, Random(42), output=pred_output)
    print(evaluation.summary())
    print(evaluation.class_details())
    print(evaluation.matrix())
    # Dump every metric the Evaluation wrapper exposes.
    print("areaUnderPRC/0: " + str(evaluation.area_under_prc(0)))
    print("weightedAreaUnderPRC: " + str(evaluation.weighted_area_under_prc))
    print("areaUnderROC/1: " + str(evaluation.area_under_roc(1)))
    print("weightedAreaUnderROC: " + str(evaluation.weighted_area_under_roc))
    print("avgCost: " + str(evaluation.avg_cost))
    print("totalCost: " + str(evaluation.total_cost))
    print("confusionMatrix: " + str(evaluation.confusion_matrix))
    print("correct: " + str(evaluation.correct))
    print("pctCorrect: " + str(evaluation.percent_correct))
    print("incorrect: " + str(evaluation.incorrect))
    print("pctIncorrect: " + str(evaluation.percent_incorrect))
    print("unclassified: " + str(evaluation.unclassified))
    print("pctUnclassified: " + str(evaluation.percent_unclassified))
    print("coverageOfTestCasesByPredictedRegions: " + str(evaluation.coverage_of_test_cases_by_predicted_regions))
    print("sizeOfPredictedRegions: " + str(evaluation.size_of_predicted_regions))
    print("falseNegativeRate: " + str(evaluation.false_negative_rate(1)))
    print("weightedFalseNegativeRate: " + str(evaluation.weighted_false_negative_rate))
    print("numFalseNegatives: " + str(evaluation.num_false_negatives(1)))
    print("trueNegativeRate: " + str(evaluation.true_negative_rate(1)))
    print("weightedTrueNegativeRate: " + str(evaluation.weighted_true_negative_rate))
    print("numTrueNegatives: " + str(evaluation.num_true_negatives(1)))
    print("falsePositiveRate: " + str(evaluation.false_positive_rate(1)))
    print("weightedFalsePositiveRate: " + str(evaluation.weighted_false_positive_rate))
    print("numFalsePositives: " + str(evaluation.num_false_positives(1)))
    print("truePositiveRate: " + str(evaluation.true_positive_rate(1)))
    print("weightedTruePositiveRate: " + str(evaluation.weighted_true_positive_rate))
    print("numTruePositives: " + str(evaluation.num_true_positives(1)))
    print("fMeasure: " + str(evaluation.f_measure(1)))
    print("weightedFMeasure: " + str(evaluation.weighted_f_measure))
    print("unweightedMacroFmeasure: " + str(evaluation.unweighted_macro_f_measure))
    print("unweightedMicroFmeasure: " + str(evaluation.unweighted_micro_f_measure))
    print("precision: " + str(evaluation.precision(1)))
    print("weightedPrecision: " + str(evaluation.weighted_precision))
    print("recall: " + str(evaluation.recall(1)))
    print("weightedRecall: " + str(evaluation.weighted_recall))
    print("kappa: " + str(evaluation.kappa))
    print("KBInformation: " + str(evaluation.kb_information))
    print("KBMeanInformation: " + str(evaluation.kb_mean_information))
    print("KBRelativeInformation: " + str(evaluation.kb_relative_information))
    print("SFEntropyGain: " + str(evaluation.sf_entropy_gain))
    print("SFMeanEntropyGain: " + str(evaluation.sf_mean_entropy_gain))
    print("SFMeanPriorEntropy: " + str(evaluation.sf_mean_prior_entropy))
    print("SFMeanSchemeEntropy: " + str(evaluation.sf_mean_scheme_entropy))
    print("matthewsCorrelationCoefficient: " + str(evaluation.matthews_correlation_coefficient(1)))
    print("weightedMatthewsCorrelation: " + str(evaluation.weighted_matthews_correlation))
    print("class priors: " + str(evaluation.class_priors))
    print("numInstances: " + str(evaluation.num_instances))
    print("meanAbsoluteError: " + str(evaluation.mean_absolute_error))
    print("meanPriorAbsoluteError: " + str(evaluation.mean_prior_absolute_error))
    print("relativeAbsoluteError: " + str(evaluation.relative_absolute_error))
    print("rootMeanSquaredError: " + str(evaluation.root_mean_squared_error))
    print("rootMeanPriorSquaredError: " + str(evaluation.root_mean_prior_squared_error))
    print("rootRelativeSquaredError: " + str(evaluation.root_relative_squared_error))
    print("prediction output:\n" + str(pred_output))
    plot_cls.plot_roc(
        evaluation, title="ROC diabetes",
        class_index=range(0, diabetes_data.class_attribute.num_values), wait=False)
    plot_cls.plot_prc(
        evaluation, title="PRC diabetes",
        class_index=range(0, diabetes_data.class_attribute.num_values), wait=False)

    # load a numeric dataset
    bolts_file = helper.get_data_dir() + os.sep + "bolts.arff"
    helper.print_info("Loading dataset: " + bolts_file)
    loader = Loader("weka.core.converters.ArffLoader")
    bolts_data = loader.load_file(bolts_file)
    bolts_data.class_is_last()

    # build a classifier and output model
    helper.print_title("Training LinearRegression on bolts")
    classifier = Classifier(classname="weka.classifiers.functions.LinearRegression", options=["-S", "1", "-C"])
    classifier.build_classifier(bolts_data)
    print(classifier)

    # cross-validate numeric classifier
    helper.print_title("Cross-validating LinearRegression on bolts")
    classifier = Classifier(classname="weka.classifiers.functions.LinearRegression", options=["-S", "1", "-C"])
    evaluation = Evaluation(bolts_data)
    evaluation.crossvalidate_model(classifier, bolts_data, 10, Random(42))
    print(evaluation.summary())
    print("correlationCoefficient: " + str(evaluation.correlation_coefficient))
    print("errorRate: " + str(evaluation.error_rate))
    helper.print_title("Header - bolts")
    print(str(evaluation.header))
    helper.print_title("Predictions on bolts")
    for index, pred in enumerate(evaluation.predictions):
        print(str(index+1) + ": " + str(pred) + " -> error=" + str(pred.error))
    plot_cls.plot_classifier_errors(evaluation.predictions, wait=False)

    # learning curve
    cls = [
        Classifier(classname="weka.classifiers.trees.J48"),
        Classifier(classname="weka.classifiers.bayes.NaiveBayesUpdateable")]
    plot_cls.plot_learning_curve(
        cls, diabetes_data, increments=0.05, label_template="[#] !",
        metric="percent_correct", wait=True)

    # access classifier's Java API
    labor_file = helper.get_data_dir() + os.sep + "labor.arff"
    helper.print_info("Loading dataset: " + labor_file)
    loader = Loader("weka.core.converters.ArffLoader")
    labor_data = loader.load_file(labor_file)
    labor_data.class_is_last()
    helper.print_title("Using JRip's Java API to access rules")
    jrip = Classifier(classname="weka.classifiers.rules.JRip")
    jrip.build_classifier(labor_data)
    rset = jrip.jwrapper.getRuleset()
    for i in range(rset.size()):
        r = rset.get(i)
        print(str(r.toString(labor_data.class_attribute.jobject)))
def testing():
    """Run the full C4.5 (J48) classification experiment grid.

    Iterates over three nested settings:
      * pruning: 0 = unpruned J48 (-U), 1 = pruned (-C 0.25)
      * persen_train: 0..3 -> 40/50/60/70% training split
      * fitur_hapus: attribute index to remove, 15 down to 0 (0 = none)

    Each combination is evaluated 100 times on a freshly loaded and
    randomised copy of hasil.arff; the per-run weighted accuracy, recall,
    precision, F-measure and ROC area plus their average/max/min are
    written to a report file under hasilTest/ and echoed to stdout.
    Report text and console messages are in Indonesian (kept verbatim,
    as they are runtime output).  Python 2 syntax (print statements).
    """
    # NOTE(review): logging.disable() expects a numeric level, not the
    # string "weka" -- confirm this does what the author intended.
    logging.disable("weka")
    print "PROSES KLASIFIKASI\n------------------"
    jvm.start()
    pruning = 0
    while pruning < 2:
        persen_train = 0
        while persen_train < 4:
            fitur_hapus = 15
            while fitur_hapus >= 0:
                # per-combination accumulators for the 100 runs
                list_akurasi = []
                list_recall = []
                list_presisi = []
                list_fmeasure = []
                list_roc = []
                count = 0
                # build the report file name from the current settings
                nama = "hasilTest/"
                if(pruning == 0):
                    nama += "unpruning"
                    if(persen_train == 0):
                        nama += "40"
                    elif(persen_train == 1):
                        nama += "50"
                    elif(persen_train == 2):
                        nama += "60"
                    else:
                        nama += "70"
                else:
                    nama += "pruning"
                    if(persen_train == 0):
                        nama += "40"
                    elif(persen_train == 1):
                        nama += "50"
                    elif(persen_train == 2):
                        nama += "60"
                    else:
                        nama += "70"
                if(fitur_hapus > 0):
                    nama += "removeF" + str(fitur_hapus) + ".txt"
                else:
                    nama += "normal.txt"
                f = open(nama, "w")
                # write the report header; `nama` is rebuilt here but never
                # read again afterwards -- dead assignments (NOTE(review))
                if(pruning == 0):
                    nama = "unpruning"
                    print "Tanpa Pruning"
                    f.write("Hasil Decision Tree C4.5 tanpa Pruning (unpruning)\n")
                    if(persen_train == 0):
                        nama += "40"
                        f.write("Dengan Training Set sebesar 40%\n")
                    elif(persen_train == 1):
                        nama += "50"
                        f.write("Dengan Training Set sebesar 50%\n")
                    elif(persen_train == 2):
                        nama += "60"
                        f.write("Dengan Training Set sebesar 60%\n")
                    else:
                        nama += "70"
                        f.write("Dengan Training Set sebesar 70%\n")
                else:
                    nama = "pruning"
                    print "Dengan Pruning"
                    f.write("Hasil Decision Tree C4.5 Pruning\n")
                    if(persen_train == 0):
                        nama += "40"
                        f.write("Dengan Training Set sebesar 40%\n")
                    elif(persen_train == 1):
                        nama += "50"
                        f.write("Dengan Training Set sebesar 50%\n")
                    elif(persen_train == 2):
                        nama += "60"
                        f.write("Dengan Training Set sebesar 60%\n")
                    else:
                        nama += "70"
                        f.write("Dengan Training Set sebesar 70%\n")
                if(fitur_hapus > 0):
                    f.write("Menggunakan remove pada fitur " + str(fitur_hapus) + "\n\n")
                else:
                    f.write("\n")
                # NOTE(review): the column spacing inside this header literal
                # was whitespace-mangled in the source -- confirm the original widths
                f.write("No. Akurasi Recall Presisi F-Measure ROC\n")
                if persen_train == 0:
                    print "40% Data Training"
                elif persen_train == 1:
                    print "50% Data Training"
                elif persen_train == 2:
                    print "60% Data Training"
                else:
                    print "70% Data Training"
                print "Fitur yang dihapus:", fitur_hapus
                print "\nNo.\tAkurasi\tRecall\tPresisi\tF-Measure\tROC"
                # 100 evaluation runs for this parameter combination
                while count < 100:
                    loader = Loader(classname = "weka.core.converters.ArffLoader")
                    data = loader.load_file("hasil.arff")
                    data.class_is_last()
                    if(fitur_hapus > 0):
                        # drop attribute number `fitur_hapus` before training
                        remove = Filter(classname = "weka.filters.unsupervised.attribute.Remove", options = ["-R", str(fitur_hapus)])
                        remove.inputformat(data)
                        data_baru = remove.filter(data)
                        data_baru.class_is_last()
                    else:
                        data_baru = loader.load_file("hasil.arff")
                        data_baru.class_is_last()
                    # shuffle the instances, seeded with the current clock time;
                    # NOTE(review): `filter` shadows the Python builtin
                    filter = Filter(classname = "weka.filters.unsupervised.instance.Randomize", options = ["-S", str(int(time.time()))])
                    filter.inputformat(data_baru)
                    data_random = filter.filter(data_baru)
                    data_random.class_is_last()
                    # J48 options: -U = unpruned, -C 0.25 = pruning confidence
                    if(pruning == 0):
                        classifier = Classifier(classname = "weka.classifiers.trees.J48", options = ["-U"])
                    else:
                        classifier = Classifier(classname = "weka.classifiers.trees.J48", options = ["-C", "0.25"])
                    evaluation = Evaluation(data_random)
                    if(persen_train == 0):
                        evaluation.evaluate_train_test_split(classifier, data_random, percentage = 40)
                    elif(persen_train == 1):
                        evaluation.evaluate_train_test_split(classifier, data_random, percentage = 50)
                    elif(persen_train == 2):
                        evaluation.evaluate_train_test_split(classifier, data_random, percentage = 60)
                    else:
                        evaluation.evaluate_train_test_split(classifier, data_random, percentage = 70)
                    # one result row per run; NOTE(review): the whitespace inside
                    # the separator literals was mangled in the source
                    f.write(str(count + 1) + str( ". " ) + str(evaluation.weighted_true_positive_rate) + str( " " ) + str(evaluation.weighted_recall) + str( " " ) + str(evaluation.weighted_precision) + str( " " ) + str(evaluation.weighted_f_measure) + str( " " ) + str(evaluation.weighted_area_under_roc) + "\n")
                    print count + 1, evaluation.weighted_true_positive_rate, evaluation.weighted_recall, evaluation.weighted_precision, evaluation.weighted_f_measure, evaluation.weighted_area_under_roc
                    list_akurasi.append(evaluation.weighted_true_positive_rate)
                    list_recall.append(evaluation.weighted_recall)
                    list_presisi.append(evaluation.weighted_precision)
                    list_fmeasure.append(evaluation.weighted_f_measure)
                    list_roc.append(evaluation.weighted_area_under_roc)
                    count += 1
                    time.sleep(1)
                # sort so the minimum is at index 0 and the maximum at -1
                list_akurasi.sort()
                list_recall.sort()
                list_presisi.sort()
                list_fmeasure.sort()
                list_roc.sort()
                # summary: averages over the 100 runs
                f.write( "" + "\n")
                f.write( "Rata-Rata" + "\n")
                f.write( "Akurasi:" + str(sum(list_akurasi) / 100.0) + "\n")
                f.write( "Recall:" + str(sum(list_recall) / 100.0) + "\n")
                f.write( "Presisi:" + str(sum(list_presisi) / 100.0) + "\n")
                f.write( "F-Measure:" + str(sum(list_fmeasure) / 100.0) + "\n")
                f.write( "ROC:" + str(sum(list_roc) / 100.0) + "\n")
                f.write( "" + "\n")
                f.write( "Max" + "\n")
                f.write( "Akurasi:" + str(list_akurasi[-1] ) + "\n")
                f.write( "Recall:" + str(list_recall[-1] ) + "\n")
                f.write( "Presisi:" + str(list_presisi[-1] ) + "\n")
                f.write( "F-Measure:" + str(list_fmeasure[-1] ) + "\n")
                f.write( "ROC:" + str(list_roc[-1] ) + "\n")
                f.write( "" + "\n")
                f.write( "Min" + "\n")
                f.write( "Akurasi:" + str(list_akurasi[0] ) + "\n")
                f.write( "Recall:" + str(list_recall[0] ) + "\n")
                f.write( "Presisi:" + str(list_presisi[0] ) + "\n")
                f.write( "F-Measure:" + str(list_fmeasure[0] ) + "\n")
                f.write( "ROC:" + str(list_roc[0] ) + "\n")
                f.write( "" + "\n")
                # echo the same summary to stdout
                print ""
                print "Rata-Rata"
                print "Akurasi:", sum(list_akurasi) / 100.0
                print "Recall:", sum(list_recall) / 100.0
                print "Presisi:", sum(list_presisi) / 100.0
                print "F-Measure:", sum(list_fmeasure) / 100.0
                print "ROC:", sum(list_roc) / 100.0
                print ""
                print "Max"
                print "Akurasi:", list_akurasi[-1]
                print "Recall:", list_recall[-1]
                print "Presisi:", list_presisi[-1]
                print "F-Measure:", list_fmeasure[-1]
                print "ROC:", list_roc[-1]
                print ""
                print "Min"
                print "Akurasi:", list_akurasi[0]
                print "Recall:", list_recall[0]
                print "Presisi:", list_presisi[0]
                print "F-Measure:", list_fmeasure[0]
                print "ROC:", list_roc[0]
                print ""
                f.close()
                fitur_hapus -= 1
            persen_train += 1
        pruning += 1
    jvm.stop()
from weka.classifiers import Classifier, Evaluation jvm.start() # load diabetes loader = Loader(classname="weka.core.converters.ArffLoader") fname = data_dir + os.sep + "diabetes.arff" print("\nLoading dataset: " + fname + "\n") data = loader.load_file(fname) data.class_is_last() for classifier in ["weka.classifiers.bayes.NaiveBayes", "weka.classifiers.rules.ZeroR", "weka.classifiers.trees.J48"]: # train/test split 90% using classifier cls = Classifier(classname=classifier) evl = Evaluation(data) evl.evaluate_train_test_split(cls, data, 90.0, Random(1)) print("\n" + classifier + " train/test split (90%):\n" + evl.summary()) cls.build_classifier(data) print(classifier + " model:\n\n" + str(cls)) # calculate mean/stdev over 10 cross-validations for classifier in [ "weka.classifiers.meta.ClassificationViaRegression", "weka.classifiers.bayes.NaiveBayes", "weka.classifiers.rules.ZeroR", "weka.classifiers.trees.J48", "weka.classifiers.functions.Logistic"]: accuracy = [] for i in xrange(1,11): cls = Classifier(classname=classifier) evl = Evaluation(data) evl.crossvalidate_model(cls, data, 10, Random(i)) accuracy.append(evl.percent_correct) nacc = numpy.array(accuracy)
fname = data_dir + os.sep + "segment-challenge.arff" print("\nLoading dataset: " + fname + "\n") train = loader.load_file(fname) train.set_class_index(train.num_attributes() - 1) fname = data_dir + os.sep + "segment-test.arff" print("\nLoading dataset: " + fname + "\n") test = loader.load_file(fname) test.set_class_index(train.num_attributes() - 1) # build J48 cls = Classifier(classname="weka.classifiers.trees.J48") cls.build_classifier(train) # evaluate on test evl = Evaluation(train) evl.test_model(cls, test) print("Test set accuracy: %0.0f%%" % evl.percent_correct()) # evaluate on train evl = Evaluation(train) evl.test_model(cls, train) print("Train set accuracy: %0.0f%%" % evl.percent_correct()) # evaluate on random split evl = Evaluation(train) evl.evaluate_train_test_split(cls, train, 66.0, Random(1)) print("Random split accuracy: %0.0f%%" % evl.percent_correct()) jvm.stop()
data.class_is_last() # determine baseline with ZeroR zeror = Classifier(classname="weka.classifiers.rules.ZeroR") zeror.build_classifier(data) evl = Evaluation(data) evl.test_model(zeror, data) print("Baseline accuracy (ZeroR): %0.1f%%" % evl.percent_correct) print("\nHoldout 10%...") # use seed 1-10 and perform random split with 90% perc = [] for i in xrange(1, 11): evl = Evaluation(data) evl.evaluate_train_test_split( Classifier(classname="weka.classifiers.trees.J48"), data, 90.0, Random(i)) perc.append(round(evl.percent_correct, 1)) print("Accuracy with seed %i: %0.1f%%" % (i, evl.percent_correct)) # calculate mean and standard deviation nperc = numpy.array(perc) print("mean=%0.2f stdev=%0.2f" % (numpy.mean(nperc), numpy.std(nperc))) print("\n10-fold Cross-validation...") # use seed 1-10 and perform 10-fold CV perc = [] for i in xrange(1, 11): evl = Evaluation(data) evl.crossvalidate_model(Classifier(classname="weka.classifiers.trees.J48"), data, 10, Random(i))
fname = data_dir + os.sep + "segment-challenge.arff" print("\nLoading dataset: " + fname + "\n") train = loader.load_file(fname) train.class_is_last() fname = data_dir + os.sep + "segment-test.arff" print("\nLoading dataset: " + fname + "\n") test = loader.load_file(fname) test.class_is_last() # build J48 cls = Classifier(classname="weka.classifiers.trees.J48") cls.build_classifier(train) # evaluate on test evl = Evaluation(train) evl.test_model(cls, test) print("Test set accuracy: %0.0f%%" % evl.percent_correct) # evaluate on train evl = Evaluation(train) evl.test_model(cls, train) print("Train set accuracy: %0.0f%%" % evl.percent_correct) # evaluate on random split evl = Evaluation(train) evl.evaluate_train_test_split(cls, train, 66.0, Random(1)) print("Random split accuracy: %0.0f%%" % evl.percent_correct) jvm.stop()