def main(): """ Shows how to use the CostSensitiveClassifier. """ # load a dataset data_file = helper.get_data_dir() + os.sep + "diabetes.arff" helper.print_info("Loading dataset: " + data_file) loader = Loader("weka.core.converters.ArffLoader") data = loader.load_file(data_file) data.class_is_last() # classifier classifier = SingleClassifierEnhancer( classname="weka.classifiers.meta.CostSensitiveClassifier", options=["-cost-matrix", "[0 1; 2 0]", "-S", "2"]) base = Classifier(classname="weka.classifiers.trees.J48", options=["-C", "0.3"]) classifier.classifier = base folds = 10 evaluation = Evaluation(data) evaluation.crossvalidate_model(classifier, data, folds, Random(1)) print("") print("=== Setup ===") print("Classifier: " + classifier.to_commandline()) print("Dataset: " + data.relationname) print("") print(evaluation.summary("=== " + str(folds) + " -fold Cross-Validation ==="))
def naiveBayes(data):
    """
    Cross-validates a NaiveBayes classifier (supervised discretization, -D)
    on the dataset, prints precision/recall/F-measure for class index 1,
    writes the evaluation to 'naiveeval.txt' and plots the ROC curve.

    :param data: the dataset to evaluate on (class attribute must be set)
    :return: the percentage of correctly classified instances
    """
    classifier = Classifier(classname="weka.classifiers.bayes.NaiveBayes", options=["-D"])
    nfolds = 13
    rnd = Random(0)
    evaluation = Evaluation(data)
    evaluation.crossvalidate_model(classifier, data, nfolds, rnd)
    print(" Naive Bayes Cross-validation information")
    print(evaluation.summary())
    print("precision: " + str(evaluation.precision(1)))
    print("recall: " + str(evaluation.recall(1)))
    print("F-measure: " + str(evaluation.f_measure(1)))
    print("==confusion matrix==")
    print("  a  b")
    print(evaluation.confusion_matrix)
    # BUG FIX: the original had a bare `print` expression, which is a
    # no-op in Python 3; an empty print() emits the intended blank line
    print()

    # write to file; `with` guarantees the handle is closed (the original
    # leaked it on any exception between open() and close())
    with open("naiveeval.txt", "w") as f:
        f.write(evaluation.summary())
        f.write("\n")
        f.write("==confusion matrix==\n")
        f.write("  a  b\n")
        for item in evaluation.confusion_matrix:
            f.write("%s\n" % item)

    # plot roc graph
    plcls.plot_roc(evaluation, title="Naive Bayes ROC", outfile="NBROC", wait=True)
    return evaluation.percent_correct
def vote_classifier_train(dicrectory, nameOfDataSet, flag):
    """
    Builds a Vote ensemble (J48, RandomTree, two Bagging+REPTree members,
    AdaBoostM1+DecisionStump, NaiveBayes; combination rule AVG) on the CSV
    dataset and evaluates it.

    :param dicrectory: path of the CSV file to load (typo kept — renaming
                       would break keyword callers)
    :param nameOfDataSet: label passed through to print_and_save()
    :param flag: truthy -> 10-fold cross-validation; falsy -> 80/20 split
    """
    loader = Loader(classname="weka.core.converters.CSVLoader")
    data = loader.load_file(dicrectory)
    data.class_is_last()
    # each '-B' entry is one base classifier's complete command line
    meta = MultipleClassifiersCombiner(
        classname="weka.classifiers.meta.Vote",
        options=[
            '-S', '1',
            '-B', 'weka.classifiers.trees.J48 -C 0.25 -M 2',
            '-B', 'weka.classifiers.trees.RandomTree -K 6 -M 1.0 -V 0.001 -S 1',
            '-B', 'weka.classifiers.meta.Bagging -P 100 -S 1 -num-slots 1 -I 10 -W weka.classifiers.trees.REPTree -- '
                  '-M 2 -V 0.001 -N 3 -S 1 -L -1 -I 0.0',
            '-B', 'weka.classifiers.meta.AdaBoostM1 -P 100 -S 1 -I 10 -W weka.classifiers.trees.DecisionStump',
            '-B', 'weka.classifiers.meta.Bagging -P 100 -S 1 -num-slots 1 -I 10 -W weka.classifiers.trees.REPTree -- '
                  '-M 2 -V 0.001 -N 3 -S 1 -L -1 -I 0.0',
            '-B', 'weka.classifiers.bayes.NaiveBayes ',
            '-R', 'AVG'
        ])
    eval = Evaluation(data)
    pout = PredictionOutput(
        classname="weka.classifiers.evaluation.output.prediction.PlainText")
    if flag:
        eval.crossvalidate_model(meta, data, 10, Random(1), pout)
    else:
        eval.evaluate_train_test_split(meta, data, 80.0, Random(1), pout)
    # free JVM-side wrappers eagerly before reporting
    gc.collect()
    print_and_save('Proposed model', flag, nameOfDataSet, eval)
def main():
    """
    Shows how to use the CostSensitiveClassifier.
    """
    # load a dataset
    data_file = helper.get_data_dir() + os.sep + "diabetes.arff"
    helper.print_info("Loading dataset: " + data_file)
    data = Loader("weka.core.converters.ArffLoader").load_file(data_file)
    data.class_is_last()

    # cost-sensitive meta-classifier wrapping a J48 base learner
    meta = SingleClassifierEnhancer(
        classname="weka.classifiers.meta.CostSensitiveClassifier",
        options=["-cost-matrix", "[0 1; 2 0]", "-S", "2"])
    meta.classifier = Classifier(
        classname="weka.classifiers.trees.J48", options=["-C", "0.3"])

    # cross-validate with a fixed seed
    num_folds = 10
    evl = Evaluation(data)
    evl.crossvalidate_model(meta, data, num_folds, Random(1))

    for line in ("",
                 "=== Setup ===",
                 "Classifier: " + meta.to_commandline(),
                 "Dataset: " + data.relationname,
                 ""):
        print(line)
    print(evl.summary("=== " + str(num_folds) + " -fold Cross-Validation ==="))
def evaluation(self, classifier, trainingData, testingData = None):
    """
    Evaluates the classifier: 10-fold cross-validation (seed 42) on the
    training data when no test set is given, otherwise builds on the
    training data and evaluates on the test set.

    :param classifier: the classifier to evaluate
    :param trainingData: training instances; class is set to last attribute
    :param testingData: optional test instances; must have the same number
                        of attributes as the training data
    :return: the Evaluation object, or None when attribute counts differ
    """
    trainingData.set_class_index(trainingData.num_attributes() - 1)
    # FIX: `is None` instead of `== None`; Python-2 `print` statements
    # replaced with the function form (valid in both Python 2 and 3)
    if testingData is None:
        evaluation = Evaluation(trainingData)  # initialize with priors
        evaluation.crossvalidate_model(classifier, trainingData, 10, Random(42))  # 10-fold CV
        return evaluation
    print("testing data exists")
    if testingData.num_attributes() == trainingData.num_attributes():
        testingData.set_class_index(testingData.num_attributes() - 1)
        evaluation = Evaluation(trainingData)
        classifier.build_classifier(trainingData)
        evaluation.test_model(classifier, testingData)
        return evaluation
    # attribute counts differ: report the mismatch, implicitly return None
    print("testing Data doesn't have same attribute with training data")
    for attribute in trainingData.attributes():
        print("train:" + str(attribute))
    for attribute in testingData.attributes():
        print("test:" + str(attribute))
def experiment_file_random(path_features, path_folder_save_results, options,
                           classifier, fold, random, name):
    """
    Cross-validates a classifier on one feature file, saving accuracy /
    confusion-matrix results and the per-instance predictions as CSVs.

    :param path_features: path of the dataset file to load
    :param path_folder_save_results: output folder (must already contain a
                                     'prediction/' subfolder)
    :param options: classifier option string, split via split_options
    :param classifier: classifier class name
    :param fold: number of cross-validation folds
    :param random: seed for the CV randomization
    :param name: base name of the output CSV files
    """
    print("start weka")
    cls = Classifier(classname=classifier,
                     options=weka.core.classes.split_options(options))
    d_results = {
        'percent_correct': [],
        'percent_incorrect': [],
        'confusion_matrix': []
    }
    data = converters.load_any_file(path_features)
    data.class_is_last()
    pout = PredictionOutput(
        classname="weka.classifiers.evaluation.output.prediction.CSV")
    evl = Evaluation(data)
    evl.crossvalidate_model(cls, data, fold, Random(random), pout)
    d_results['percent_correct'].append(evl.percent_correct)
    d_results['percent_incorrect'].append(evl.percent_incorrect)
    d_results['confusion_matrix'].append(
        evl.matrix())  # Generates the confusion matrix.
    # aggregate metrics as a one-row CSV
    d_results = pd.DataFrame(data=d_results)
    d_results.to_csv(path_folder_save_results + '/' + str(name) + '.csv',
                     index=False)
    # raw per-instance predictions captured by the PredictionOutput
    save = pout.buffer_content()
    with open(
            path_folder_save_results + '/' + 'prediction/' + str(name) +
            '.csv', 'w') as f:
        f.write(save)
def evaluation(self, classifier, trainingData, testingData=None):
    """
    Evaluates the classifier: 10-fold cross-validation (seed 42) on the
    training data when no test set is given, otherwise builds on the
    training data and evaluates on the test set.

    :param classifier: the classifier to evaluate
    :param trainingData: training instances; class is set to last attribute
    :param testingData: optional test instances; must have the same number
                        of attributes as the training data
    :return: the Evaluation object, or None when attribute counts differ
    """
    trainingData.set_class_index(trainingData.num_attributes() - 1)
    # FIX: `is None` instead of `== None`; Python-2 `print` statements
    # replaced with the function form (valid in both Python 2 and 3)
    if testingData is None:
        evaluation = Evaluation(trainingData)  # initialize with priors
        evaluation.crossvalidate_model(classifier, trainingData, 10, Random(42))  # 10-fold CV
        return evaluation
    print("testing data exists")
    if testingData.num_attributes() == trainingData.num_attributes():
        testingData.set_class_index(testingData.num_attributes() - 1)
        evaluation = Evaluation(trainingData)
        classifier.build_classifier(trainingData)
        evaluation.test_model(classifier, testingData)
        return evaluation
    # attribute counts differ: report the mismatch, implicitly return None
    print("testing Data doesn't have same attribute with training data")
    for attribute in trainingData.attributes():
        print("train:" + str(attribute))
    for attribute in testingData.attributes():
        print("test:" + str(attribute))
def naivebay_classifier_weka(data):
    """
    10-fold cross-validates (seed 42) a Naive Bayes classifier on `data`,
    prints the summary and confusion matrix, and returns the classifier.
    """
    nb = Classifier("weka.classifiers.bayes.NaiveBayes")
    evl = Evaluation(data)
    evl.crossvalidate_model(nb, data, 10, Random(42))
    for part in (evl.summary(), evl.confusion_matrix):
        print(part)
    return nb
def Boost_J48(data, rnm):
    """
    Cross-validates an AdaBoostM1-boosted, filtered J48 classifier (the
    first attribute is removed) and writes the model, predictions and
    evaluation to '<rnm>_Boost_J48_*' text files; also plots the ROC curve.

    :param data: the dataset (class attribute is set to last here)
    :param rnm: base name used for all output files
    :return: the percent of correctly classified instances, as a string
    """
    data.class_is_last()
    # J48 behind a Remove filter that drops the first attribute
    fc1 = FilteredClassifier()
    fc1.classifier = Classifier(classname="weka.classifiers.trees.J48",
                                options=["-C", "0.25", "-M", "2"])
    fc1.filter = Filter(classname="weka.filters.unsupervised.attribute.Remove",
                        options=["-R", "first"])
    # boost the filtered classifier with AdaBoostM1
    fc2 = SingleClassifierEnhancer(classname="weka.classifiers.meta.AdaBoostM1",
                                   options=["-P", "100", "-S", "1", "-I", "10"])
    fc2.classifier = fc1
    pred_output = PredictionOutput(
        classname="weka.classifiers.evaluation.output.prediction.CSV",
        options=["-p", "1"])
    folds = 10
    fc2.build_classifier(data)
    evaluation = Evaluation(data)
    evaluation.crossvalidate_model(fc2, data, folds, Random(1), pred_output)
    # FIX: the Python-2-only `print >> f, ...` statements are replaced with
    # print(..., file=f); `with` guarantees the files are closed on error
    with open(rnm + '_Boost_J48_Tree.txt', 'w') as f0:
        print("Filename: ", rnm, file=f0)
        print('\n\n', file=f0)
        print(str(fc2), file=f0)
    with open(rnm + '_Boost_J48_Prediction.txt', 'w') as f1:
        print('Filename:', rnm, file=f1)
        print('Prediction Summary:', (pred_output.buffer_content()), file=f1)
    with open(rnm + '_Boost_j48_Evaluation.txt', 'w') as f2:
        print('Filename:', rnm, file=f2)
        print('Evaluation Summary:', (evaluation.summary()), file=f2)
        print('\n\n\n', file=f2)
        print((evaluation.class_details()), file=f2)
    plot_roc(evaluation, class_index=[0, 1], title=rnm, key_loc='best',
             outfile=rnm + '_Boost_J48_ROC.png', wait=False)
    value_Boost_J48 = str(evaluation.percent_correct)
    return value_Boost_J48
def run():
    """
    Converts the prediction CSV to ARFF, 10-fold cross-validates (seed 100)
    a J48 tree on it, stores the accuracy in the global `j48` and returns it
    as a string.
    """
    jvm.start()
    # CSV -> ARFF round trip
    csv_loader = Loader("weka.core.converters.CSVLoader")
    data_csv = csv_loader.load_file(
        "/Users/imeiliasantoso/web_graduate_project4/predict_page/predict_data.csv"
    )
    arff_saver = Saver("weka.core.converters.ArffSaver")
    arff_saver.save_file(
        data_csv,
        "/Users/imeiliasantoso/web_graduate_project4/predict_page/predict_data.arff"
    )
    arff_loader = Loader("weka.core.converters.ArffLoader")
    data_arff = arff_loader.load_file(
        "/Users/imeiliasantoso/web_graduate_project4/predict_page/predict_data.arff"
    )
    data_arff.class_is_last()

    global j48
    J48_class = Classifier(classname="weka.classifiers.trees.J48",
                           options=["-C", "0.25", "-M", "2"])
    J48_class.build_classifier(data_arff)
    evaluationj48 = Evaluation(data_arff)
    evaluationj48.crossvalidate_model(J48_class, data_arff, 10, Random(100))
    j48 = str(evaluationj48.percent_correct)
    jvm.stop()
    return j48
def test_classifier(dataset: Instances, classifier: Classifier, params: dict):
    """
    Grid-searches over `params` (property name -> list of candidate values):
    for every value combination it configures the classifier via bean
    properties, then records training accuracy, tree size and 10-fold CV
    accuracy (seed 1).

    :param dataset: the instances to train and evaluate on
    :param classifier: a tree classifier exposing measureTreeSize()
    :param params: mapping of classifier property name to candidate values
    :return: defaultdict(list) of result columns, one entry per combination
    """
    vars = params.keys()
    vals = params.values()
    results = defaultdict(list)
    for val_combo in itertools.product(*vals):
        results["numInstances"].append(dataset.num_instances)
        results["numAttributes"].append(dataset.num_attributes)
        opts = dict(zip(vars, val_combo))
        for opt in opts:
            results[opt].append(opts[opt])
            # floats are narrowed to Java float — presumably the targeted
            # setters take float rather than double; TODO confirm
            classifier.set_property(
                opt,
                opts[opt] if not isinstance(opts[opt], float) else
                typeconv.double_to_float(opts[opt]))
        evl = Evaluation(dataset)
        classifier.build_classifier(dataset)
        # training accuracy: tested on the same data the model was built on
        evl.test_model(classifier, dataset)
        results["Training_Accuracy"].append(evl.percent_correct)
        # tree size via a direct call on the underlying Java object
        results["size"].append(
            int(javabridge.call(classifier.jobject, "measureTreeSize", "()D")))
        evl.crossvalidate_model(classifier, dataset, 10, Random(1))
        results["CV_Accuracy"].append(evl.percent_correct)
    return results
def RandomTree(data, rnm):
    """
    Cross-validates a filtered RandomTree classifier (the first attribute is
    removed) and writes the model, predictions and evaluation to
    '<rnm>_RT_*' text files; also plots the ROC curve.

    :param data: the dataset (class attribute is set to last here)
    :param rnm: base name used for all output files
    :return: the percent of correctly classified instances, as a string
    """
    data.class_is_last()
    fc = FilteredClassifier()
    fc.classifier = Classifier(classname="weka.classifiers.trees.RandomTree",
                               options=["-K", "0", "-M", "1.0", "-V", "0.001", "-S", "1"])
    fc.filter = Filter(classname="weka.filters.unsupervised.attribute.Remove",
                       options=["-R", "first"])
    pred_output = PredictionOutput(
        classname="weka.classifiers.evaluation.output.prediction.CSV",
        options=["-p", "1"])
    folds = 10
    evl = Evaluation(data)
    evl.crossvalidate_model(fc, data, folds, Random(1), pred_output)
    fc.build_classifier(data)
    # FIX: the Python-2-only `print >> f, ...` statements are replaced with
    # print(..., file=f); `with` guarantees the files are closed on error
    with open(rnm + '_RT_Tree.txt', 'w') as f0:
        print("Filename: ", rnm, file=f0)
        print('\n\n', file=f0)
        print(str(fc), file=f0)
    with open(rnm + '_RT_Prediction.txt', 'w') as f1:
        print('Filename:', rnm, file=f1)
        print('Prediction Summary:', (pred_output.buffer_content()), file=f1)
    with open(rnm + '_RT_Evaluation.txt', 'w') as f2:
        print('Filename:', rnm, file=f2)
        print('Evaluation Summary:', (evl.summary()), file=f2)
        print('\n\n\n', file=f2)
        print((evl.class_details()), file=f2)
    plot_roc(evl, class_index=[0, 1], title=rnm, key_loc='best',
             outfile=rnm + '_RT_ROC.png', wait=False)
    value_RT = str(evl.percent_correct)
    return value_RT
def runSMO(file, bound):
    """
    10-fold cross-validates (seed 1) an SMO classifier with a polynomial
    kernel on the CSV file, after removing the attribute range `bound`;
    prints the accuracy and the per-class details.

    :param file: path of the CSV file (class attribute is the first one)
    :param bound: attribute range passed to the Remove filter's -R option
    :return: the per-class details string
    """
    data = Loader(classname="weka.core.converters.CSVLoader").load_file(file)
    data.class_is_first()

    cls = KernelClassifier(
        classname="weka.classifiers.functions.SMO",
        options=["-C", "1.0", "-L", "0.001", "-P", "1.0E-12", "-N", "0"])
    cls.kernel = Kernel(
        classname="weka.classifiers.functions.supportVector.PolyKernel",
        options=["-C", "250007", "-E", "1.0"])

    remove = Filter(classname="weka.filters.unsupervised.attribute.Remove",
                    options=["-R", bound])
    remove.inputformat(data)
    filtered = remove.filter(data)

    pout = PredictionOutput(
        classname="weka.classifiers.evaluation.output.prediction.PlainText")
    evl = Evaluation(filtered)
    evl.crossvalidate_model(cls, filtered, 10, Random(1), pout)
    print(evl.percent_correct)
    result = evl.class_details()
    print(result)
    return result
def create_model(input_file, output_file):
    """
    Discretizes the input data, trains a RandomForest on the filtered data,
    serializes classifier + filter together, and prints 10-fold CV stats.

    :param input_file: dataset to load (any format converters supports)
    :param output_file: destination for the serialized model and filter
    """
    # Load data
    data = converters.load_any_file(input_file)
    data.class_is_last()  # set class attribute

    # filter data
    print_title("Filtering Data")
    discretize = Filter(
        classname="weka.filters.unsupervised.attribute.Discretize",
        options=["-B", "10", "-M", "-1.0", "-R", "first-last"])
    discretize.inputformat(
        data)  # let the filter know about the type of data to filter
    filtered_data = discretize.filter(data)
    print("Done! (believe it or not)")

    print_title("Build Classifier")
    classifier = Classifier(classname="weka.classifiers.trees.RandomForest",
                            options=["-I", "100", "-K", "0", "-S", "1"])
    classifier.build_classifier(filtered_data)
    print("Done! (believe it or not)")
    # filter is stored next to the model so it can be re-applied at
    # prediction time
    serialization.write_all(output_file, [classifier, discretize])
    print("Model and filter saved to ", output_file)

    # NOTE(review): priors come from the unfiltered `data` while CV runs on
    # `filtered_data` — confirm this asymmetry is intended.
    evaluation = Evaluation(data)  # initialize with priors
    evaluation.crossvalidate_model(classifier, filtered_data, 10,
                                   Random(42))  # 10-fold CV
    print(evaluation.summary())
    print("pctCorrect: " + str(evaluation.percent_correct))
    print("incorrect: " + str(evaluation.incorrect))
def run_naive_bayes_crossval(self, output_directory):
    """
    Builds a NaiveBayes classifier on the training data, 10-fold
    cross-validates it (seed 1), echoes timing/summary/confusion-matrix
    text via print_both(), and saves everything via save_results().

    :param output_directory: directory the results file is written to
    """
    # build classifier
    print("\nBuilding Classifier on training data.")
    buildTimeStart = time.time()
    cls = Classifier(classname="weka.classifiers.bayes.NaiveBayes")
    cls.build_classifier(self.training_data)
    resultsString = ""
    # print_both prints the text and appends it to the accumulator string
    resultsString = self.print_both(str(cls), resultsString)

    buildTimeString = "NB Cross Eval Classifier Built in " + str(
        time.time() - buildTimeStart) + " secs.\n"
    resultsString = self.print_both(buildTimeString, resultsString)

    #Evaluate Classifier
    resultsString = self.print_both("\nCross Evaluating on test data.",
                                    resultsString)
    buildTimeStart = time.time()
    # cross-validation runs on the training data with a fixed seed
    evl = Evaluation(self.training_data)
    evl.crossvalidate_model(cls, self.training_data, 10, Random(1))
    resultsString = self.print_both(str(evl.summary()), resultsString)
    resultsString = self.print_both(str(evl.class_details()), resultsString)
    resultsString = self.print_both(str(evl.confusion_matrix), resultsString)

    buildTimeString = "\nNB Cross Eval Classifier Evaluated in " + str(
        time.time() - buildTimeStart) + " secs.\n"
    resultsString = self.print_both(buildTimeString, resultsString)

    #Save Results and Cleanup
    self.save_results("Naive_Bayes_Crossval", resultsString, output_directory)
def proses():
    """
    10-fold cross-validates an SMOreg regressor with an RBF kernel on the
    global `anomali` dataset, appending actual class values to `data_inst`,
    predicted values to `data_pred`, and a std-dev-based threshold to
    `data_std`.

    NOTE(review): relies on globals defined elsewhere in the file
    (`anomali`, `data_inst`, `data_pred`, `data_std`, `idx`) — confirm.
    """
    #diluar def
    index = 0
    import math
    from weka.classifiers import Kernel, KernelClassifier
    from weka.classifiers import PredictionOutput
    import numpy as np
    klasifi = KernelClassifier(classname="weka.classifiers.functions.SMOreg",
                               options=["-N", "0"])
    vm = Kernel(classname="weka.classifiers.functions.supportVector.RBFKernel",
                options=["-G", "0.1"])
    # BUG FIX: the kernel must be assigned to the `kernel` property;
    # `klasifi.vm = vm` only attached an unused Python attribute, so SMOreg
    # silently kept its default kernel instead of the RBF one.
    klasifi.kernel = vm
    output_x = PredictionOutput(
        classname="weka.classifiers.evaluation.output.prediction.PlainText")
    kelola = Evaluation(anomali)
    kelola.crossvalidate_model(klasifi, anomali, 10, Random(0), output=output_x)
    process = 0
    # actual values of the class attribute
    for x in anomali.values(anomali.class_index):
        data_inst.append(x)
    # parse the predicted value (third whitespace-separated token) out of
    # each prediction's string form
    for x in kelola.predictions:
        i = str(x)
        index = i.split()
        data_pred.append(float(index[2]))
    # anomaly threshold: 10% of the (ceiled) std dev of the actual values
    data_std.insert(idx, math.ceil(np.std(data_inst)) * 0.1)
    print('\n DONE PROCESSING DATASET ATTRIBUTE ',
          anomali.attribute(anomali.class_index), '...')
def run(dataset_path):
    """
    Loads the dataset, converts its last (class) attribute from numeric to
    nominal, 10-fold cross-validates (seed 42) a multinomial Naive Bayes
    classifier on it, and prints the elapsed wall-clock time.

    :param dataset_path: path of the dataset to load
    """
    start = time.time()

    ### load a dataset ###
    train_data = model.load_dataset_weka(dataset_path)

    to_nomial_class_filter = Filter(
        classname="weka.filters.unsupervised.attribute.NumericToNominal",
        options=["-R", "last"])
    to_nomial_class_filter.inputformat(train_data)
    # FIX: filter once and reuse the result — the original ran the same
    # filter over the same data twice (for Evaluation and for CV)
    filtered_data = to_nomial_class_filter.filter(train_data)

    ### Naive Bayes ### Choose what you want
    classifier = Classifier("weka.classifiers.bayes.NaiveBayesMultinomial")
    # classifier = Classifier("weka.classifiers.bayes.NaiveBayes")

    evaluation = Evaluation(filtered_data)
    evaluation.crossvalidate_model(classifier, filtered_data, 10, Random(42))
    # print(evaluation.summary())
    # print(evaluation.class_details())
    # print(evaluation.matrix())

    print(time.time() - start)
def use_classifier(data, cli, args):
    """
    Instantiates a classifier from a command-line template, builds it on
    `data` and 10-fold cross-validates it (seed 1).

    :param data: the dataset to build and evaluate on
    :param cli: command-line template expanded with str.format
    :param args: keyword substitutions applied to the template
    :return: tuple of (built classifier, Evaluation object)
    """
    # NOTE(review): `cli` is also passed as the first *positional* format
    # argument, so a bare `{0}`/`{}` in the template would substitute the
    # template into itself — confirm this is intended.
    cli = cli.format(cli, **args)
    cls = from_commandline(cli, classname="weka.classifiers.Classifier")
    cls.build_classifier(data)
    evaluation = Evaluation(data)
    evaluation.crossvalidate_model(cls, data, 10, Random(1))
    return cls, evaluation
def runCV(this, arffFile, classifier, folds):
    """
    Cross-validates `classifier` on the ARFF file and stores, on `this`,
    a header row plus matching metric values: overall accuracy, then
    TP rate, FP rate and AUC-ROC per class.

    :param this: object the `values` and `header` lists are attached to
    :param arffFile: path of the ARFF file (class attribute is the last)
    :param classifier: classifier class name
    :param folds: number of cross-validation folds
    """
    data = Loader(classname="weka.core.converters.ArffLoader").load_file(arffFile)
    data.class_is_last()

    classes = [str(code) for code in data.class_attribute.values]
    header = ["Accuracy"]
    for name in classes:
        header.extend([name + " TP", name + " FP", name + " AUC ROC"])

    evl = Evaluation(data)
    evl.crossvalidate_model(Classifier(classname=classifier), data, folds, Random(1))

    values = [evl.percent_correct]
    for idx, name in enumerate(classes):
        values.extend([
            evl.true_positive_rate(idx) * 100,
            evl.false_positive_rate(idx) * 100,
            evl.area_under_roc(idx),
        ])
    this.values = values
    this.header = header
def fitness(toeval: Individual):
    """
    Fitness of an individual: 10-fold CV accuracy (seed 1) of a
    MultilayerPerceptron configured from the individual's settings,
    wrapped behind the module-level `remove` filter on the global `data`.
    """
    mlp = Classifier(
        classname="weka.classifiers.functions.MultilayerPerceptron",
        options=toeval.settings())
    wrapped = FilteredClassifier()
    wrapped.filter = remove
    wrapped.classifier = mlp

    evl = Evaluation(data)
    evl.crossvalidate_model(wrapped, data, 10, Random(1))
    return evl.percent_correct
def f_smote():
    """
    Balances the first 10k rows of the training set with SMOTE, trains an
    LMT classifier on the balanced data, reports 5-fold CV performance on
    it plus holdout performance on the test set, and saves ROC plots.
    """
    jvm.start()
    train_data, test_data = b_i_impute_data()
    train_data = train_data[:10000]
    y_train = train_data["class"]
    x_train = train_data.drop("class", axis=1)
    # oversample the minority class to balance the labels
    sm = SMOTE(ratio="minority")
    x_train_sm, y_train_sm = sm.fit_sample(x_train, y_train)
    x_train_sm_df = pd.DataFrame(x_train_sm, columns=x_train.columns)
    y_train_sm_df = pd.DataFrame(y_train_sm, columns=["class"])
    train_data_sm_df = pd.concat([y_train_sm_df, x_train_sm_df], axis=1)
    print_f("smote train data shape", train_data_sm_df.shape)
    # round-trip through CSV so weka can load the pandas frame
    train_data_sm_df.to_csv("./train_data_sm.csv", index=False)
    train_data_sm = converters.load_any_file("train_data_sm.csv")
    train_data_sm.class_is_first()
    # NOTE(review): test_data.csv is assumed to already exist on disk — the
    # test_data frame returned above is not written here; confirm.
    test_data = converters.load_any_file("test_data.csv")
    test_data.class_is_first()
    print_f("1")
    cls = Classifier(classname="weka.classifiers.trees.LMT")
    print_f("bulding classifier")
    cls.build_classifier(train_data_sm)
    print_f("Evaluating")
    # 5-fold CV on the SMOTEd training data
    evl = Evaluation(train_data_sm)
    evl.crossvalidate_model(cls, train_data_sm, 5, Random(1))
    print_f("Train Accuracy:", evl.percent_correct)
    print_f("Train summary")
    print_f(evl.summary())
    print_f("Train class details")
    print_f(evl.class_details())
    print_f("Train confusion matrix")
    print_f(evl.confusion_matrix)
    plcls.plot_roc(evl, class_index=[0, 1], wait=True,
                   outfile="./plots/2_f_smote_10k.png")
    plt.suptitle("Train ROC Curve", fontsize=20, y=0.95)
    # holdout evaluation on the untouched test set
    evl = Evaluation(test_data)
    print_f("testing model")
    evl.test_model(cls, test_data)
    print_f("Test Accuracy:", evl.percent_correct)
    print_f("Test summary")
    print_f(evl.summary())
    print_f(" Testclass details")
    print_f(evl.class_details())
    print_f("Testconfusion matrix")
    print_f(evl.confusion_matrix)
    plcls.plot_roc(evl, class_index=[0, 1], wait=True)
    plt.suptitle("Test ROC Curve", fontsize=20, y=0.95)
    savefig("./plots/f_test_roc_curve.png")
def use_classifier(data_filename, cli):
    """
    Loads an ARFF file (class = last attribute), builds the classifier
    described by the command line and 10-fold cross-validates it (seed 1).

    :param data_filename: path of the ARFF file to load
    :param cli: full classifier command line
    :return: tuple of (built classifier, Evaluation object)
    """
    arff_loader = Loader(classname="weka.core.converters.ArffLoader")
    dataset = arff_loader.load_file(data_filename)
    dataset.class_is_last()

    built = from_commandline(cli, classname="weka.classifiers.Classifier")
    built.build_classifier(dataset)

    evl = Evaluation(dataset)
    evl.crossvalidate_model(built, dataset, 10, Random(1))
    return built, evl
def run_bayesNet(file):
    """
    10-fold cross-validates (seed 1) a BayesNet classifier with TAN
    structure search on the ARFF file, then saves the evaluation and the
    per-instance predictions under a 'bayesNet_results' subdirectory.

    :param file: pathlib.Path of the ARFF file (class = first attribute)
    """
    # Get filename from Pathlib object
    filename = file.parts[-1]
    dir = file.parents[0]
    print("Running BayesNet on %s" % filename)
    if not filename.endswith(".arff"):
        print("%s not ARFF file." % filename)
        return
    # Removes '.arff' from filename
    filename_base = filename[:-5]
    # Load data with class as first attr
    data = load_Arff_file(file)
    data.class_is_first()
    # Use BayesNet and set options
    cls = Classifier(classname="weka.classifiers.bayes.BayesNet",
                     options=[
                         "-D", "-Q",
                         "weka.classifiers.bayes.net.search.local.TAN", "--",
                         "-P", "1", "-S", "BAYES", "-E",
                         "weka.classifiers.bayes.net.estimate.SimpleEstimator",
                         "--", "-A", "0.5"
                     ])
    # Predictions stored in pout
    pout = PredictionOutput(
        classname="weka.classifiers.evaluation.output.prediction.PlainText")
    # Evaluate data
    evaluation = Evaluation(data)
    evaluation.crossvalidate_model(cls, data, 10, Random(1), output=pout)
    print(evaluation.summary())
    print(evaluation.class_details())
    print(evaluation.confusion_matrix)
    # Generate grid for ROC
    # plcls.plot_roc(evaluation, class_index=[0,1], wait=True)
    # mk dirs for output
    dir = dir / "bayesNet_results"
    dir.mkdir(parents=True, exist_ok=True)
    # Save summary, class details and confusion matrix to file
    result_output = filename_base + "_bayesNet_eval_results_TAN.txt"
    output_eval(evaluation, dir / result_output)
    # Save the predicited results to file
    prediction_output = filename_base + "_bayesNet_pred_results_TAN.txt"
    output_pred(pout, dir / prediction_output)
    print("BayesNet complete")
def experiment_more_file(path_files, path_folder_save_results, fold, options,
                         classifier, random, name):
    """
    Cross-validates the classifier on every CSV file in `path_files`,
    writing per-file prediction CSVs and one aggregate results CSV.

    :param path_files: folder containing the CSV datasets
    :param path_folder_save_results: output folder (must already contain a
                                     'prediction/' subfolder)
    :param fold: number of cross-validation folds
    :param options: classifier option string, split via split_options
    :param classifier: classifier class name
    :param random: seed for the CV randomization
    :param name: base name of the output files
    """
    cls = Classifier(classname=classifier,
                     options=weka.core.classes.split_options(options))
    # BUG FIX: the original removed entries from file_list while iterating
    # it, which skips the element following each removed one (consecutive
    # non-CSV files slipped through); build a filtered list instead.
    file_list = [f for f in os.listdir(path_files) if ".csv" in f]
    d_results = {
        'name_file': [],
        'percent_correct': [],
        'percent_incorrect': [],
        'confusion_matrix': []
    }
    print(file_list)
    for file in file_list:
        print(str(file))
        data = converters.load_any_file(path_files + "/" + file)
        data.class_is_last()
        pout = PredictionOutput(
            classname="weka.classifiers.evaluation.output.prediction.CSV")
        evl = Evaluation(data)
        evl.crossvalidate_model(cls, data, fold, Random(random), pout)
        d_results['name_file'].append(str(file))
        d_results['percent_correct'].append(evl.percent_correct)
        d_results['percent_incorrect'].append(evl.percent_incorrect)
        d_results['confusion_matrix'].append(
            evl.matrix())  # Generates the confusion matrix.
        save = pout.buffer_content()
        with open(
                path_folder_save_results + '/' + 'prediction/' + str(name) +
                str(file)[:-4] + 'pred_data.csv', 'w') as f:
            f.write(save)
    d_results = pd.DataFrame(data=d_results)
    d_results.to_csv(path_folder_save_results + '/' + str(name) + ".csv",
                     index=False)
def naive_bayse(dicrectory, nameOfDataSet, flag):
    """
    Evaluates a Naive Bayes classifier on the CSV dataset: 10-fold
    cross-validation when `flag` is truthy, otherwise an 80/20 train/test
    split; results are reported through print_and_save().

    :param dicrectory: path of the CSV file (typo kept for compatibility)
    :param nameOfDataSet: label passed through to print_and_save()
    :param flag: truthy -> cross-validation; falsy -> percentage split
    """
    csv_loader = Loader(classname="weka.core.converters.CSVLoader")
    dataset = csv_loader.load_file(dicrectory)
    dataset.class_is_last()

    nb = Classifier(classname='weka.classifiers.bayes.NaiveBayes')
    evl = Evaluation(dataset)
    pout = PredictionOutput(
        classname="weka.classifiers.evaluation.output.prediction.PlainText")
    if flag:
        evl.crossvalidate_model(nb, dataset, 10, Random(1), pout)
    else:
        evl.evaluate_train_test_split(nb, dataset, 80.0, Random(1), pout)
    print_and_save('Naive Bayes model', flag, nameOfDataSet, evl)
    gc.collect()
def CV5x2(dataset, algo, num_datasets):
    """
    Runs a 2-fold cross-validation (seed 5) of `algo` on the ARFF dataset
    and prints the summary, confusion matrix and AUC for class index 1.

    NOTE(review): despite the name, this performs a single 2-fold CV with
    seed 5 — not the 5x2cv procedure (five repetitions of 2-fold CV);
    confirm whether the repetitions are handled by the caller.
    `num_datasets` is unused.

    :return: area under the ROC curve for class index 1
    """
    loader = Loader(classname="weka.core.converters.ArffLoader")
    data = loader.load_file(dataset)
    data.class_is_last()
    cls = Classifier(classname=algo)
    evl = Evaluation(data)
    evl.crossvalidate_model(cls, data, 2, Random(5))
    print(evl.summary("=== " + str(algo) + " on" + str(dataset) + " ===", False))
    print(evl.matrix("=== on click prediction(confusion matrix) ==="))
    print("For Algo" + str(algo) + "areaUnderROC/1: for CV5x2 " + str(evl.area_under_roc(1)))
    return evl.area_under_roc(1)
def evaluate_classifier(cls, data, crossvalidate=False, n_folds=10):
    """
    Evaluates `cls` on `data`: n-fold cross-validation (seed 5) when
    requested, otherwise a direct test of the already-trained model.

    :param cls: trained classifier
    :param data: data to test the model on
    :param crossvalidate: True to use cross-validation
    :param n_folds: number of folds to cross-validate with
    :return: the Evaluation object
    """
    evl = Evaluation(data)
    if not crossvalidate:
        evl.test_model(cls, data)
        return evl
    evl.crossvalidate_model(cls, data, n_folds, Random(5))
    return evl
def obtainSVM(file):
    """
    Cross-validates (kFold folds, seed 42) a LibSVM classifier on the ARFF
    file, after removing the first two attributes, and returns an ROC-area
    value parsed out of the formatted class-details report.

    :param file: base file name (without '.arff') under the global
                 folderPathOfArffFiles
    :return: the parsed ROC area as a float
    """
    data = converters.load_any_file(folderPathOfArffFiles + file + ".arff")
    remove = Filter(classname="weka.filters.unsupervised.attribute.Remove",
                    options=["-R", "1-2"])
    remove.inputformat(data)
    data = remove.filter(data)
    data.class_is_last()
    classifier = Classifier(classname="weka.classifiers.functions.LibSVM")
    evaluation = Evaluation(data)
    evaluation.crossvalidate_model(classifier, data, kFold, Random(42))
    info = evaluation.class_details()
    # HACK: extracts the ROC area by fixed character offsets into the
    # report text; this silently breaks if the report layout changes.
    # NOTE(review): evaluation.area_under_roc(<class index>) would be the
    # robust alternative — confirm which class this slice targets.
    roc_area = float(info[406:411])
    return roc_area
def execute(self, featureInclusion, kFold, classIndex):
    """
    Deletes every attribute whose flag in `featureInclusion` is False, sets
    the class index, then builds a CVParameterSelection meta-classifier via
    javabridge and returns its k-fold cross-validation accuracy.

    :param featureInclusion: list of booleans, one per original attribute
    :param kFold: number of cross-validation folds
    :param classIndex: index of the class attribute (after deletion)
    :return: percentage of correctly classified instances
    """
    # indices shift left after each deletion, hence the running offset
    deletedFeatures = 0
    for i in range(0, len(featureInclusion)):
        if not featureInclusion[i]:
            self.instances.deleteAttributeAt(i - deletedFeatures)
            deletedFeatures += 1
    self.instances.setClassIndex(classIndex)

    cvParameterSelection = javabridge.make_instance(
        "weka/classifiers/meta/CVParameterSelection", "()V")
    javabridge.call(cvParameterSelection, "setNumFolds", "(I)V", kFold)
    # BUG FIX: the original fused the method name and JNI signature into one
    # string ("buildClassifier(Lweka/core/Instances)V"), omitted the ';' in
    # the signature and never passed the Instances argument.
    javabridge.call(cvParameterSelection, "buildClassifier",
                    "(Lweka/core/Instances;)V", self.instances)

    evl = Evaluation(self.instances)
    evl.crossvalidate_model(cvParameterSelection, self.instances, kFold, Random(1))
    # BUG FIX: percent_correct is a property in python-weka-wrapper;
    # calling it as a method raised "'float' object is not callable".
    return evl.percent_correct
def crossValidate(self, arrfFile=None, classname="weka.classifiers.trees.J48", options=None):
    """
    Runs 10-fold cross-validation (seed 1) of the given classifier on the
    instance's data, printing accuracy, summary and per-class details.

    :param arrfFile: optional ARFF file to (re)load via initData() first
    :param classname: classifier class name
    :param options: classifier options; defaults to ["-C", "0.3"]
    """
    if options is None:
        options = ["-C", "0.3"]  # avoid a shared mutable default argument

    if arrfFile is not None:
        self.initData(arrfFile)

    if self.data is None:
        return

    # FIX: was a Python-2-only `print` statement
    print('Classificador ' + str(classname) + ' ' + ' '.join(options))
    cls = Classifier(classname=classname, options=options)

    evl = Evaluation(self.data)
    evl.crossvalidate_model(cls, self.data, 10, Random(1))

    print(evl.percent_correct)
    print(evl.summary())
    print(evl.class_details())
def SimpleLogistic():
    """
    10-fold cross-validates (seed 486) a SimpleLogistic classifier on
    'First_trial_classification.arff', prints the summary and per-instance
    predictions, then serializes the classifier to disk.
    """
    # load a dataset
    data = Loader(classname="weka.core.converters.ArffLoader").load_file(
        "First_trial_classification.arff")
    data.class_is_last()  # set class attribute

    cls = Classifier(classname="weka.classifiers.functions.SimpleLogistic")
    pout = PredictionOutput(
        classname="weka.classifiers.evaluation.output.prediction.PlainText")

    evl = Evaluation(data)
    evl.crossvalidate_model(cls, data, 10, Random(486), pout)
    for text in (evl.summary(), pout.buffer_content()):
        print(text)

    # save model
    serialization.write_all("SimpleLogistic2.model", cls)
def cross_validate(self, detail = True): """Perform cross validation using trained data. Parameters ---------- detail : boolean, optional, default = True If true return a detailed information of cross validation. Returns ------- info : string Info with results of cross validation. """ #print 'cross_validation' start_time = TimeUtils.get_time() info = "Scheme:\t%s %s\n" % (str(self.classifier.classname) , " ".join([str(option) for option in self.classifier.options])) if detail == True: info += "Relation:\t%s\n" % (self.data.relationname) info += "Instances:\t%d\n" % (self.data.num_instances) info += "Attributes:\t%d\n\n" % (self.data.num_attributes) evl = WEvaluation(self.data) evl.crossvalidate_model(self.classifier, self.data, 10, WRandom(1)) if detail == False: info += "Correctly Classified Instances: %0.4f%%\n" % (evl.percent_correct) info += "Time taken to build model: %0.5f seconds\n\n" % (TimeUtils.get_time() - start_time) #info += str(evl.percent_correct) + "\n\n" if detail == True: info += "=== Stratified cross-validation ===\n" info += evl.summary() + "\n\n" info += str(evl.class_details()) + "\n\n" classes = [str(self.data.class_attribute.value(i)) for i in range(0, self.data.class_attribute.num_values)] cm = evl.confusion_matrix info += Classifier.confusion_matrix(classes, cm) return info
def run_crossval(self, output_directory, classifier_name, classifier_weka_spec, options_list):
    """
    Builds the given classifier on the training data, 10-fold
    cross-validates it (seed 1), echoes timing/summary text via
    print_both(), and saves the results via save_results().

    :param output_directory: directory the results file is written to
    :param classifier_name: human-readable name used in messages/filenames
    :param classifier_weka_spec: weka classifier class name
    :param options_list: list of option strings for the classifier
    """
    # build classifier
    print("\nBuilding " + classifier_name + " Classifier on training data.")
    buildTimeStart = time.time()
    cls = Classifier(classname=classifier_weka_spec, options=options_list)
    cls.build_classifier(self.training_data)
    resultsString = ""
    # print_both prints the text and appends it to the accumulator string
    resultsString = self.print_both(str(cls), resultsString)

    buildTimeString = classifier_name + " Cross Eval Classifier Built in " + str(
        time.time() - buildTimeStart) + " secs.\n"
    resultsString = self.print_both(buildTimeString, resultsString)

    #Evaluate Classifier
    resultsString = self.print_both("\nCross Evaluating on test data.",
                                    resultsString)
    buildTimeStart = time.time()
    evl = Evaluation(self.training_data)
    evl.crossvalidate_model(cls, self.training_data, 10, Random(1))
    resultsString = self.print_both(str(evl.summary()), resultsString)
    resultsString += "\n"
    resultsString = self.print_both(str(evl.class_details()), resultsString)
    resultsString += "\n"
    resultsString = self.print_both(str(evl.confusion_matrix), resultsString)

    buildTimeString = "\n\n" + classifier_name + " Cross Eval Classifier Evaluated in " + str(
        time.time() - buildTimeStart) + " secs.\n"
    resultsString = self.print_both(buildTimeString, resultsString)

    # sanitize the option list for use in the results filename;
    # note both '.' and '-' end up as '_' (dots are first mapped to dashes)
    options_string = ""
    for option in options_list:
        options_string = options_string + str(option)
    options_string = options_string.replace(".", "-")
    options_string = options_string.replace("-", "_")

    #Save Results and Cleanup
    self.save_results(classifier_name + options_string + "_Crossval",
                      resultsString, output_directory)
def use_classifier(data):
    """
    Uses the meta-classifier AttributeSelectedClassifier for attribute selection.

    :param data: the dataset to use
    :type data: Instances
    """
    print("\n1. Meta-classifier")
    # CFS subset evaluator + backward greedy stepwise search, wrapped
    # around a J48 base classifier
    classifier = Classifier(classname="weka.classifiers.meta.AttributeSelectedClassifier")
    aseval = ASEvaluation(classname="weka.attributeSelection.CfsSubsetEval")
    assearch = ASSearch(classname="weka.attributeSelection.GreedyStepwise", options=["-B"])
    base = Classifier(classname="weka.classifiers.trees.J48")
    # setting nested options is always a bit tricky, getting all the escaped double quotes right
    # simply using the bean property for setting Java objects is often easier and less error prone
    classifier.set_property("classifier", base.jobject)
    classifier.set_property("evaluator", aseval.jobject)
    classifier.set_property("search", assearch.jobject)
    # 10-fold cross-validation, seed 1
    evaluation = Evaluation(data)
    evaluation.crossvalidate_model(classifier, data, 10, Random(1))
    print(evaluation.summary())
def run():
    """
    Converts the bank-full CSV to ARFF, trains a J48 tree (C=0.5) and dumps
    its pruned-tree text to a file, then cross-validates a second J48
    (C=0.25, M=2; 10 folds, seed 100), stores its accuracy in the global
    `j48` and returns it as a string.
    """
    jvm.start()
    load_csv = Loader("weka.core.converters.CSVLoader")
    data_csv = load_csv.load_file(
        "/Users/imeiliasantoso/web_graduate_project5/register_page/bank-full_input.csv"
    )
    saver = Saver("weka.core.converters.ArffSaver")
    saver.save_file(
        data_csv,
        "/Users/imeiliasantoso/web_graduate_project5/register_page/bank-full_input.arff"
    )
    load_arff = Loader("weka.core.converters.ArffLoader")
    data_arff = load_arff.load_file(
        "/Users/imeiliasantoso/web_graduate_project5/register_page/bank-full_input.arff"
    )
    data_arff.class_is_last()
    cls = Classifier(classname="weka.classifiers.trees.J48", options=["-C", "0.5"])
    cls.build_classifier(data_arff)
    # BUG FIX: the original re-opened and rewrote this file once per
    # instance inside a prediction loop whose pred/dist results were never
    # used; writing the tree once after training yields the same final file.
    with open(
            "/Users/imeiliasantoso/web_graduate_project5/register_page/bank-full_input.txt",
            "w") as saveFile:
        saveFile.write(str(cls))
    global j48
    J48_class = Classifier(classname="weka.classifiers.trees.J48",
                           options=["-C", "0.25", "-M", "2"])
    J48_class.build_classifier(data_arff)
    evaluationj48 = Evaluation(data_arff)
    evaluationj48.crossvalidate_model(J48_class, data_arff, 10, Random(100))
    j48 = str(evaluationj48.percent_correct)
    jvm.stop()
    return j48
# NOTE(review): this fragment uses Python-2-only `xrange` and calls
# `percent_correct()` as a method (it is a property in python-weka-wrapper3);
# it appears to target the old Python-2 wrapper API — confirm before porting.
print("Baseline accuracy (ZeroR): %0.1f%%" % evl.percent_correct())

print("\nHoldout 10%...")

# use seed 1-10 and perform random split with 90%
perc = []
for i in xrange(1, 11):
    evl = Evaluation(data)
    evl.evaluate_train_test_split(
        Classifier(classname="weka.classifiers.trees.J48"), data, 90.0, Random(i))
    perc.append(round(evl.percent_correct(), 1))
    print("Accuracy with seed %i: %0.1f%%" % (i, evl.percent_correct()))

# calculate mean and standard deviation
nperc = numpy.array(perc)
print("mean=%0.2f stdev=%0.2f" % (numpy.mean(nperc), numpy.std(nperc)))

print("\n10-fold Cross-validation...")

# use seed 1-10 and perform 10-fold CV
perc = []
for i in xrange(1, 11):
    evl = Evaluation(data)
    evl.crossvalidate_model(Classifier(classname="weka.classifiers.trees.J48"),
                            data, 10, Random(i))
    perc.append(round(evl.percent_correct(), 1))
    print("Accuracy with seed %i: %0.1f%%" % (i, evl.percent_correct()))

# calculate mean and standard deviation
nperc = numpy.array(perc)
print("mean=%0.2f stdev=%0.2f" % (numpy.mean(nperc), numpy.std(nperc)))

jvm.stop()
# load weather.nominal loader = Loader(classname="weka.core.converters.ArffLoader") fname = data_dir + os.sep + "weather.nominal.arff" print("\nLoading dataset: " + fname + "\n") data = loader.load_file(fname) data.class_is_last() # define classifiers classifiers = ["weka.classifiers.rules.OneR", "weka.classifiers.trees.J48"] # cross-validate original dataset for classifier in classifiers: cls = Classifier(classname=classifier) evl = Evaluation(data) evl.crossvalidate_model(cls, data, 10, Random(1)) print("%s (original): %0.0f%%" % (classifier, evl.percent_correct)) # replace 'outlook' in first 4 'no' instances with 'missing' modified = Instances.copy_instances(data) count = 0 for i in xrange(modified.num_instances): if modified.get_instance(i).get_string_value(modified.class_index) == "no": count += 1 modified.get_instance(i).set_missing(0) if count == 4: break # cross-validate modified dataset for classifier in classifiers: cls = Classifier(classname=classifier)
def test(self, folds=10):
    """Cross-validate ``self.classifier`` on ``self.data`` and print statistics.

    Runs stratified cross-validation with a fixed seed (42) and reports the
    instance count plus correct/incorrect percentages and absolute counts.

    :param folds: number of cross-validation folds (default 10)
    """
    evl = Evaluation(self.data)  # initialized with the dataset's class priors
    evl.crossvalidate_model(self.classifier, self.data, folds, Random(42))
    total = evl.num_instances
    print('Total number of instances: ' + str(total) + '.')
    pct_ok, n_ok = round(evl.percent_correct, 2), round(evl.correct, 2)
    pct_bad, n_bad = round(evl.percent_incorrect, 2), round(evl.incorrect, 2)
    print(str(pct_ok) + '% / ' + str(n_ok) + ' correct.')
    print(str(pct_bad) + '% / ' + str(n_bad) + ' incorrect.')
# Text-mining demo: convert the raw data into a nominal class + string content,
# establish a ZeroR baseline, then turn the text into a word vector and
# cross-validate J48 on it.
# NOTE(review): `data` is loaded earlier in the script; the `set_inputformat`/
# `set_class_index`/`to_summary` calls are the old python-weka-wrapper API.
wfilter = Filter(classname="weka.filters.unsupervised.attribute.StringToNominal",
                 options=["-R", "last"])
wfilter.set_inputformat(data)
data = wfilter.filter(data)

# convert content to string
wfilter = Filter(classname="weka.filters.unsupervised.attribute.NominalToString",
                 options=["-C", "first"])
wfilter.set_inputformat(data)
data = wfilter.filter(data)

# set class attribute
data.set_class_index(data.num_attributes() - 1)

# generate baseline (majority-class predictor)
zeror = Classifier(classname="weka.classifiers.rules.ZeroR")
evaluation = Evaluation(data)
evaluation.crossvalidate_model(zeror, data, 10, Random(1))
print("\nBaseline:\n" + evaluation.to_summary())

# perform text mining: expand the string attribute into word-presence
# attributes (prefix "att-"), then cross-validate J48 on the result
j48 = Classifier(classname="weka.classifiers.trees.J48")
stwv = Filter(
    classname="weka.filters.unsupervised.attribute.StringToWordVector",
    options=["-R", "1", "-P", "att-"])
stwv.set_inputformat(data)
data = stwv.filter(data)
evaluation = Evaluation(data)
evaluation.crossvalidate_model(j48, data, 10, Random(1))
print("\nJ48:\n" + evaluation.to_summary())

# stop JVM
jvm.stop()
def process_classifier(runType, cls, occ, devList, fewCats, label, subtract):
    """Run classifier `cls` over the device dataset in one of three modes.

    runType == 'unseen': leave-one-device-out — for each device, train on all
        other devices, predict the held-out device, accumulate a confusion
        matrix and per-device prediction samples, then derive empirical
        P(device), P(evidence) and P(evidence|device) tables and print/write
        extensive confidence reports.
    runType == 'seen': single 10-fold cross-validation over all devices.
    anything else: pairwise device-vs-device cross-validation, summarized as
        mean and percentiles of the per-pair accuracies.

    Results are stored into the module-level `total_results[label]`.

    NOTE(review): relies on many module-level names (aws_c, total_conf, loader,
    gen_arff, mv, print_conf_matrix, pyrandom, item_start, totalDevs, ...) that
    are defined elsewhere in the file — confirm before refactoring.
    """
    global devCount
    global save_orig
    global save_subtract
    conf_matrix = {}
    # choose the source table depending on whether occupancy features are used
    if occ:
        table = 'temp_dat_occ_vector_occ'
    else:
        table = 'temp_dat_occ_vector_2'
    writeStr = '=========================================================================================\n' + \
        'Running ' + runType + ' classifier for \'' + label + '\''
    sys.stdout.write(writeStr + '\r')
    total_conf.write(writeStr + '\n')
    sys.stdout.flush()
    if runType == 'unseen':
        i = 0
        indiv_results = {}
        # leave-one-device-out loop
        for dev in devList:
            devCount += 1
            # crude ETA based on average time per device so far
            remaining = chop_microseconds(((datetime.utcnow() - item_start)*totalDevs/devCount)-(datetime.utcnow() - item_start))
            sys.stdout.write('Running ' + runType + ' classifier for \'' + label + '\' - ' + \
                str(round(100*float(devCount)/totalDevs,2)) + ' pct complete (' + str(remaining) + ' remaining) \r')
            sys.stdout.flush()
            # training rows: every accepted device except `dev`
            if fewCats:
                aws_c.execute('select * from ' + table + ' ' \
                    'where duty!=0 and deviceMAC not in (select * from vector_reject) ' \
                    'and deviceMAC in (select * from id_fewcats_mac) '
                    'and deviceMAC!=\'' + dev + '\';')
            else:
                aws_c.execute('select * from ' + table + ' ' \
                    'where duty!=0 and deviceMAC not in (select * from vector_reject) ' \
                    'and deviceMAC!=\'' + dev + '\';')
            results = aws_c.fetchall()
            # Generate type list (ARFF nominal spec from last column of rows)
            total_types = ['{']
            for data in results:
                if(data[-1] not in total_types):
                    total_types.append('\"')
                    total_types.append(data[-1])
                    total_types.append('\"')
                    total_types.append(',')
            total_types[-1] = '}'
            typeStr = ''.join(total_types)
            arff_train = label + '_' + dev + '_train'
            arff_test = label + '_' + dev + '_test'
            gen_arff(arff_train, typeStr, results, occ, arff_idcol)
            # test rows: only the held-out device
            if fewCats:
                aws_c.execute('select * from ' + table + ' ' \
                    'where duty!=0 and deviceMAC not in (select * from vector_reject) ' \
                    'and deviceMAC in (select * from id_fewcats_mac) '
                    'and deviceMAC=\'' + dev + '\';')
            else:
                aws_c.execute('select * from ' + table + ' ' \
                    'where duty!=0 and deviceMAC not in (select * from vector_reject) ' \
                    'and deviceMAC=\'' + dev + '\';')
            gen_arff(arff_test, typeStr, aws_c.fetchall(), occ, arff_idcol)
            train = loader.load_file(arff_train + '.arff')
            train.class_is_last()
            mv(arff_train + '.arff', master_saveDir)
            test = loader.load_file(arff_test + '.arff')
            test.class_is_last()
            mv(arff_test + '.arff', master_saveDir)
            cls.build_classifier(train)
            # output predictions; all test instances must share one class label
            testName = ''
            predictions = []
            for index, inst in enumerate(test):
                if testName != '':
                    # sanity check: a single device must not mix class labels
                    if testName != inst.get_string_value(inst.class_index):
                        print(str(testName) + ' ' + str(inst.get_string_value(inst.class_index)))
                        exit()
                    else:
                        testName = inst.get_string_value(inst.class_index)
                else:
                    testName = inst.get_string_value(inst.class_index)
                if testName not in conf_matrix:
                    conf_matrix[testName] = {}
                pred = cls.classify_instance(inst)
                # dist = cls.distribution_for_instance(inst)
                # if(pred == inst.get_value(inst.class_index)):
                predName = inst.class_attribute.value(int(pred))
                if predName not in conf_matrix[testName]:
                    conf_matrix[testName][predName] = 0
                conf_matrix[testName][predName] += 1
                predictions.append(predName)
            # tally per-class totals for this device's true label
            total = 0
            if testName != '':
                for predName in conf_matrix[testName]:
                    if predName == testName:
                        correct = conf_matrix[testName][predName]
                        total += correct
                    else:
                        total += conf_matrix[testName][predName]
            # while (len(predictions) * 2) <= 100:
            #     predictions += pyrandom.sample(predictions, len(predictions))
            # if len(predictions) < 100:
            #     predictions += pyrandom.sample(predictions, 100 - len(predictions))
            # bootstrap-resample the device's predictions to a fixed size
            lots_predictions = []
            while len(lots_predictions) < 10000:
                lots_predictions += pyrandom.sample(predictions, 1)
            #indiv_results[dev] = [testName, pyrandom.sample(predictions, 100)]
            indiv_results[dev] = [testName, lots_predictions]
            # while len(predictions) < 100:
            #     predictions += pyrandom.sample(predictions, 1)
            # indiv_results[dev] = [testName, predictions]
            # indiv_results[dev] = [testName, predictions]
            # Prep to print the how-many-days graph
            # days_output.write('\n\n\"' + dev + '\"\n')
            #print(str(testName) + ' ' + str(correct) + ' ' + str(total) + ' ' + str(float(correct)/total))
            # i += 1
            # if i == 10:
            #     break
        # overall confusion matrix to console and report file
        correct, total = print_conf_matrix(conf_matrix, sys.stdout, False, False, False)
        correct, total = print_conf_matrix(conf_matrix, total_conf, False, False, False)
        if subtract == 'orig':
            save_orig = copy.deepcopy(conf_matrix)
        elif subtract == 'subtract':
            save_subtract = copy.deepcopy(conf_matrix)
        final_result = round(100*float(correct)/total,2)
        writeStr = '\nCorrectly Classified Instances\t\t' + str(correct) + '\t\t' + str(final_result) + '\n' + \
            'Incorrectly Classified Instances\t' + str(total-correct) + '\t\t' + str(round(100*float(total-correct)/total,2)) + '\n' + \
            'Total Number of Instances\t\t' + str(total) + '\n'
        print(writeStr)
        total_conf.write(writeStr + '\n')
        conf_interval = 10
        # empirical probabilities from the confusion matrix:
        #   p_d: P(true class), p_e: P(predicted class), p_e_given_d: P(pred|true)
        total_instances = float(sum([sum([conf_matrix[test][pred] for pred in conf_matrix[test]]) for test in conf_matrix]))
        p_d = {}
        p_e = {}
        p_e_given_d = {}
        for testName in conf_matrix:
            count_d = float(sum([conf_matrix[testName][label] for label in conf_matrix[testName]]))
            p_d[testName] = count_d / total_instances
            p_e[testName] = float(sum([conf_matrix[label][testName] for label in conf_matrix if testName in conf_matrix[label]]) / total_instances)
            p_e_given_d[testName] = {}
            for predName in conf_matrix:
                if predName in conf_matrix[testName]:
                    p_e_given_d[testName][predName] = conf_matrix[testName][predName] / count_d
                else:
                    p_e_given_d[testName][predName] = 0
        # Bayesian confidence after 1..conf_interval consistent observations
        confidence = open('confidence.dat', 'w')
        for testName in conf_matrix:
            confidence.write('\n\n\"' + testName + '\"\n')
            print(testName)
            for classEvents in range(1, (conf_interval+1)):
                numerator = math.pow(p_e_given_d[testName][testName], classEvents) * p_d[testName]
                demoninator = 0
                for otherName in conf_matrix:
                    demoninator += math.pow(p_e_given_d[otherName][testName], classEvents) * p_d[otherName]
                confidence.write(str(classEvents) + '\t' + str(numerator/demoninator) + '\n')
                print(str(classEvents) + '\t' + str(numerator/demoninator))
            print('')
        for predName in p_e_given_d['Router/Modem']:
            print('P( ' + predName + ' | Router/Modem ):\t' + str(p_e_given_d['Router/Modem'][predName]))
        for predName in p_e_given_d['Cable Box']:
            print('P( ' + predName + ' | Cable Box ):\t' + str(p_e_given_d['Cable Box'][predName]))
        #router = open('router', 'w')
        print('Router Stuff:')
        routerDev = 'Router/Modem'
        lampDev = 'Lamp'
        cableDev = 'Cable Box'
        origClassList = ['Router/Modem', 'Cable Box', 'Lamp', 'Router/Modem', 'Router/Modem', 'Cable Box', 'Router/Modem', 'Lamp', 'Router/Modem']
        classListList = [['Router/Modem'] + list(listItem) for listItem in set(itertools.permutations(origClassList))]
        # NOTE(review): the hand-picked list below immediately overwrites the
        # permutation-derived list above — presumably intentional for this run.
        classListList = [
            ['Router/Modem', 'Router/Modem', 'Lamp', 'Lamp', 'Router/Modem', 'Router/Modem', 'Router/Modem', 'Router/Modem', 'Cable Box', 'Router/Modem'],
            ['Router/Modem', 'Cable Box', 'Lamp', 'Lamp', 'Router/Modem', 'Router/Modem', 'Router/Modem', 'Router/Modem', 'Cable Box', 'Router/Modem'],
            ['Router/Modem', 'Router/Modem', 'Lamp', 'Lamp', 'Cable Box', 'Router/Modem', 'Router/Modem', 'Router/Modem', 'Cable Box', 'Router/Modem'],
            ['Router/Modem', 'Cable Box', 'Lamp', 'Lamp', 'Cable Box', 'Router/Modem', 'Router/Modem', 'Router/Modem', 'Cable Box', 'Router/Modem'],
            ['Router/Modem', 'Cable Box', 'Router/Modem', 'Lamp', 'Cable Box', 'Router/Modem', 'Router/Modem', 'Lamp', 'Router/Modem', 'Router/Modem'],
            ['Router/Modem', 'Router/Modem', 'Router/Modem', 'Lamp', 'Router/Modem', 'Router/Modem', 'Router/Modem', 'Lamp', 'Router/Modem', 'Router/Modem'],
            ['Router/Modem', 'Router/Modem', 'Router/Modem', 'Router/Modem', 'Router/Modem', 'Router/Modem', 'Router/Modem', 'Router/Modem', 'Router/Modem', 'Lamp']
        ]
        # posterior for router/lamp/cable after observing classList prefixes
        for idClass, classList in enumerate(classListList):
            print(idClass)
            for classEvents in range(1, (conf_interval+1)):
                numerator_router = p_d[routerDev]
                numerator_lamp = p_d[lampDev]
                numerator_cable = p_d[cableDev]
                for idx, classInst in enumerate(classList):
                    if idx < classEvents:
                        numerator_router *= p_e_given_d[routerDev][classInst]
                        numerator_lamp *= p_e_given_d[lampDev][classInst]
                        numerator_cable *= p_e_given_d[cableDev][classInst]
                demoninator = 0
                for otherName in conf_matrix:
                    obsValue = p_d[otherName]
                    for idx, classInst in enumerate(classList):
                        if idx < classEvents:
                            obsValue *= p_e_given_d[otherName][classInst]
                    demoninator += obsValue
                # NOTE(review): Python-2 print statement; the trailing + '\"'
                # is concatenated onto the printed expression
                print(str(classEvents) + '\t' + str(numerator_router/demoninator) + '\t' + str(numerator_lamp/demoninator) + '\t' + str(numerator_cable/demoninator) + '\t\"' + classList[classEvents-1]) + '\"'
            print('')
        numberDevList(indiv_results)
        eachDev = open('indiv_results.dat', 'w')
        newIDStream = open('new_id.dat', 'w')
        for devItem in indiv_results:
            print_obsResults(conf_matrix, conf_interval, p_d, p_e, p_e_given_d, indiv_results[devItem], eachDev, devItem, newIDStream)
        print('')
        print('total devices: ' + str(len(indiv_results)))
        # print('total devices: ' + str(total_devices))
        # print('total correct: ' + str(total_correct))
        # print(' pct correct: ' + str(round(100*float(total_correct)/total_devices,2)) + '\n')
        print('initial confidence: ' + str(round(100*float(sum(initial_confidence))/len(initial_confidence),2)))
        print('initial accuracy: ' + str(round(100*float(sum(initial_accuracy))/len(initial_accuracy),2)) + '\n')
        # print('final confidence (correct): ' + str(round(100*float(sum(final_confidence_correct))/len(final_confidence_correct),2)))
        # print('final confidence (correct): ' + str(round(100*float(sum(final_confidence_incorrect))/len(final_confidence_incorrect),2)))
        # print('final accuracy: ' + str(round(100*float(total_correct)/total_devices,2)))
        for devType in final_accuracy:
            print('final accuracy ' + devType + ' : ' + str(round(float(sum(final_accuracy[devType]))/len(final_accuracy[devType]),6)))
            print('final confidence (correct) ' + devType + ' : ' + str(round(float(sum(final_confidence_correct[devType]))/len(final_confidence_correct[devType]),6)))
            if len(final_confidence_incorrect[devType]) > 0:
                print('final confidence (incorrect) ' + devType + ' : ' + str(round(float(sum(final_confidence_incorrect[devType]))/len(final_confidence_incorrect[devType]),6)))
            else:
                print('final confidence (incorrect) ' + devType + ' : ' + str(0))
            print('final confidence ' + devType + ' : ' + str(round(float(sum(final_confidence_correct[devType])+sum(final_confidence_incorrect[devType]))/(len(final_confidence_correct[devType])+len(final_confidence_incorrect[devType])),2)))
        print_conf_matrix(new_conf_matrix, sys.stdout, False, False, False)
        # collapse per-cell confidence sample lists into their means
        for topType in actual_confidence_matrix:
            for botType in actual_confidence_matrix[topType]:
                storeArray = actual_confidence_matrix[topType][botType]
                if len(storeArray) > 0:
                    actual_confidence_matrix[topType][botType] = round(sum(storeArray)/len(storeArray),2)
                else:
                    actual_confidence_matrix[topType][botType] = 0
        print_conf_matrix(conf_matrix, sys.stdout, False, False, False)
        print_conf_matrix(actual_confidence_matrix, sys.stdout, False, False, False)
        print_conf_matrix(actual_confidence_matrix, sys.stdout, True, False, True)
        for devType in acc_over_time_dev:
            printOverTime(devType, acc_over_time_dev[devType], conf_over_time_dev[devType])
        printOverTime('total', acc_over_time, conf_over_time)
    elif runType == 'seen':
        # single model over all accepted devices, 10-fold cross-validated
        if fewCats:
            aws_c.execute('select * from ' + table + ' ' \
                'where duty!=0 and deviceMAC not in (select * from vector_reject) ' \
                'and deviceMAC in (select * from id_fewcats_mac);')
        else:
            aws_c.execute('select * from ' + table + ' ' \
                'where duty!=0 and deviceMAC not in (select * from vector_reject);')
        results = aws_c.fetchall()
        devCount += 1
        remaining = chop_microseconds(((datetime.utcnow() - item_start)*totalDevs/devCount)-(datetime.utcnow() - item_start))
        sys.stdout.write('Running ' + runType + ' classifier for \'' + label + '\' - ' + \
            str(round(100*float(devCount)/totalDevs,2)) + ' pct complete (' + str(remaining) + ' remaining) \r')
        sys.stdout.flush()
        # Generate type list
        total_types = ['{']
        for data in results:
            if(data[-1] not in total_types):
                total_types.append('\"')
                total_types.append(data[-1])
                total_types.append('\"')
                total_types.append(',')
        total_types[-1] = '}'
        typeStr = ''.join(total_types)
        arff_file = label + '_train'
        gen_arff(arff_file, typeStr, results, occ, arff_idcol)
        train = loader.load_file(arff_file + '.arff')
        train.class_is_last()
        mv(arff_file + '.arff', master_saveDir)
        cls.build_classifier(train)
        evl = Evaluation(train)
        evl.crossvalidate_model(cls, train, 10, Random(1))
        print('\n')
        #print(evl.percent_correct)
        #print(evl.class_details())
        print(evl.matrix())
        total_conf.write('\n' + evl.matrix())
        print(evl.summary())
        total_conf.write(evl.summary() + '\n')
        final_result = round(evl.percent_correct, 2)
    else:
        # pairwise device-vs-device discrimination; class label is the MAC
        success = []
        for startDev in devList:
            for changeToDev in devList:
                if startDev != changeToDev:
                    devCount += 1
                    remaining = chop_microseconds(((datetime.utcnow() - item_start)*totalDevs/devCount)-(datetime.utcnow() - item_start))
                    sys.stdout.write('Running ' + runType + ' classifier for \'' + label + '\' - ' + \
                        str(round(100*float(devCount)/totalDevs,2)) + ' pct complete (' + str(remaining) + ' remaining) \r')
                    sys.stdout.flush()
                    aws_c.execute('select * from temp_dat_occ_vector_2 ' \
                        'where duty!=0 and deviceMAC in (\'' + startDev + '\',\'' + changeToDev + '\');')
                    results = [x[:-1] + (x[1],) for x in aws_c.fetchall()] # Class label is just the deviceMAC
                    if len(results) > 10:
                        # Generate type list
                        typeStr = '{' + startDev + ',' + changeToDev + '}'
                        arff_file = label + '_' + startDev + '_' + changeToDev + '_train'
                        gen_arff(arff_file, typeStr, results, occ, arff_idcol)
                        train = loader.load_file(arff_file + '.arff')
                        train.class_is_last()
                        mv(arff_file + '.arff', master_saveDir)
                        cls.build_classifier(train)
                        evl = Evaluation(train)
                        evl.crossvalidate_model(cls, train, 10, Random(1))
                        print('\n')
                        #print(evl.percent_correct)
                        #print(evl.class_details())
                        print(evl.matrix())
                        total_conf.write('\n' + evl.matrix())
                        print(evl.summary())
                        total_conf.write(evl.summary() + '\n')
                        success.append(evl.percent_correct)
        # summarize pairwise accuracies: mean plus 5th/10th/95th percentiles
        if len(success) > 0:
            final_result = [sum(success)/len(success), percentile(success, 5), percentile(success, 10), percentile(success, 95)]
        else:
            final_result = False
    if label in total_results:
        print('Warning label ' + label + ' exists twice, overwriting...')
    if final_result != False:
        total_results[label] = final_result
# Experiment: study the effect of Discretize bin counts (and equal-frequency
# vs equal-width binning) on J48 accuracy and tree size.
# NOTE(review): `fname` comes from earlier in the script — not visible here.
print("\nLoading dataset: " + fname + "\n")
loader = Loader(classname="weka.core.converters.ArffLoader")
data = loader.load_file(fname)
data.set_class_index(data.num_attributes() - 1)

# outer loop: "" = equal-width binning, "-F" = equal-frequency binning
# NOTE(review): when equal == "" an empty-string option is passed to the
# filter's options list — appears tolerated here; verify with the wrapper.
for equal in ["", "-F"]:
    print("\nEqual frequency binning? " + str(equal == "-F") + "\n")
    # bins == 0 means "no discretization" (use the raw dataset)
    for bins in [0, 40, 10, 5, 2]:
        if bins > 0:
            fltr = Filter(classname="weka.filters.unsupervised.attribute.Discretize",
                          options=["-B", str(bins), equal])
            fltr.set_inputformat(data)
            filtered = fltr.filter(data)
        else:
            filtered = data
        cls = Classifier(classname="weka.classifiers.trees.J48")
        # cross-validate
        evl = Evaluation(filtered)
        evl.crossvalidate_model(cls, filtered, 10, Random(1))
        # build classifier on full dataset
        cls.build_classifier(filtered)
        # get size of tree from model strings (parsed from J48's text output)
        lines = str(cls).split("\n")
        nodes = "N/A"
        for line in lines:
            if line.find("Size of the tree :") > -1:
                nodes = line.replace("Size of the tree :", "").strip()
        # output stats
        print("bins=%i accuracy=%0.1f nodes=%s" % (bins, evl.percent_correct(), nodes))

jvm.stop()
# Learning-curve experiment setup on the glass dataset: compute a ZeroR
# baseline, then prepare accumulator dicts keyed by training percentage for
# several repetition counts. The accumulation loop continues past this chunk.
import weka.plot as plot
if plot.matplotlib_available:
    import matplotlib.pyplot as plt

jvm.start()

# load glass
fname = data_dir + os.sep + "glass.arff"
print("\nLoading dataset: " + fname + "\n")
loader = Loader(classname="weka.core.converters.ArffLoader")
data = loader.load_file(fname)
data.set_class_index(data.num_attributes() - 1)

# compute baseline (ZeroR = majority class accuracy)
evl = Evaluation(data)
evl.crossvalidate_model(Classifier("weka.classifiers.rules.ZeroR"), data, 10, Random(1))
baseline = evl.percent_correct()

# generate learning curves
percentages = [1, 5, 10, 20, 30, 40, 50, 60, 70, 80, 90, 100]
repetitions = [1, 10, 100]
curves = {}
for repetition in repetitions:
    # progress info
    sys.stdout.write("Repetitions=" + str(repetition))
    # initialize curve: accuracy accumulator per training-set percentage
    curve = {}
    for percentage in percentages:
        curve[percentage] = 0
    curves[repetition] = curve
    # run and add up percentage correct from repetition
# load diabetes loader = Loader(classname="weka.core.converters.ArffLoader") fname = data_dir + os.sep + "diabetes.arff" print("\nLoading dataset: " + fname + "\n") data = loader.load_file(fname) data.set_class_index(data.num_attributes() - 1) for classifier in ["weka.classifiers.bayes.NaiveBayes", "weka.classifiers.rules.ZeroR", "weka.classifiers.trees.J48"]: # train/test split 90% using classifier cls = Classifier(classname=classifier) evl = Evaluation(data) evl.evaluate_train_test_split(cls, data, 90.0, Random(1)) print("\n" + classifier + " train/test split (90%):\n" + evl.to_summary()) cls.build_classifier(data) print(classifier + " model:\n\n" + str(cls)) # calculate mean/stdev over 10 cross-validations for classifier in [ "weka.classifiers.meta.ClassificationViaRegression", "weka.classifiers.bayes.NaiveBayes", "weka.classifiers.rules.ZeroR", "weka.classifiers.trees.J48", "weka.classifiers.functions.Logistic"]: accuracy = [] for i in xrange(1,11): cls = Classifier(classname=classifier) evl = Evaluation(data) evl.crossvalidate_model(cls, data, 10, Random(i)) accuracy.append(evl.percent_correct()) nacc = numpy.array(accuracy) print("%s: %0.2f +/-%0.2f" % (classifier, numpy.mean(nacc), numpy.std(nacc))) jvm.stop()
def main():
    """
    Just runs some example code.

    Walks through the python-weka-wrapper classifier API: constructing
    classifiers (by classname, partial name, command line, kernel), training
    and evaluating them (test set, train/test split, cross-validation,
    incremental learning), meta-classifiers, the full set of Evaluation
    statistics, plotting, and direct access to the underlying Java objects.
    """
    # load a dataset
    iris_file = helper.get_data_dir() + os.sep + "iris.arff"
    helper.print_info("Loading dataset: " + iris_file)
    loader = Loader("weka.core.converters.ArffLoader")
    iris_data = loader.load_file(iris_file)
    iris_data.class_is_last()

    # classifier help
    helper.print_title("Creating help string")
    classifier = Classifier(classname="weka.classifiers.trees.J48")
    print(classifier.to_help())

    # partial classname: resolved against weka's package hierarchy
    helper.print_title("Creating classifier from partial classname")
    clsname = ".J48"
    classifier = Classifier(classname=clsname)
    print(clsname + " --> " + classifier.classname)

    # classifier from commandline
    helper.print_title("Creating SMO from command-line string")
    cmdline = 'weka.classifiers.functions.SMO -K "weka.classifiers.functions.supportVector.NormalizedPolyKernel -E 3.0"'
    classifier = from_commandline(cmdline, classname="weka.classifiers.Classifier")
    classifier.build_classifier(iris_data)
    print("input: " + cmdline)
    print("output: " + classifier.to_commandline())
    print("model:\n" + str(classifier))

    # kernel classifier
    helper.print_title("Creating SMO as KernelClassifier")
    kernel = Kernel(classname="weka.classifiers.functions.supportVector.RBFKernel",
                    options=["-G", "0.001"])
    classifier = KernelClassifier(classname="weka.classifiers.functions.SMO", options=["-M"])
    classifier.kernel = kernel
    classifier.build_classifier(iris_data)
    print("classifier: " + classifier.to_commandline())
    print("model:\n" + str(classifier))

    # build a classifier and output model
    helper.print_title("Training J48 classifier on iris")
    classifier = Classifier(classname="weka.classifiers.trees.J48")
    # Instead of using 'options=["-C", "0.3"]' in the constructor, we can also set the "confidenceFactor"
    # property of the J48 classifier itself. However, being of type float rather than double, we need
    # to convert it to the correct type first using the double_to_float function:
    classifier.set_property("confidenceFactor", typeconv.double_to_float(0.3))
    classifier.build_classifier(iris_data)
    print(classifier)
    print(classifier.graph)
    print(classifier.to_source("MyJ48"))
    plot_graph.plot_dot_graph(classifier.graph)

    # evaluate model on test set (here: the training data itself)
    helper.print_title("Evaluating J48 classifier on iris")
    evaluation = Evaluation(iris_data)
    evl = evaluation.test_model(classifier, iris_data)
    print(evl)
    print(evaluation.summary())

    # evaluate model on train/test split
    helper.print_title("Evaluating J48 classifier on iris (random split 66%)")
    classifier = Classifier(classname="weka.classifiers.trees.J48", options=["-C", "0.3"])
    evaluation = Evaluation(iris_data)
    evaluation.evaluate_train_test_split(classifier, iris_data, 66.0, Random(1))
    print(evaluation.summary())

    # load a dataset incrementally and build classifier incrementally
    helper.print_title("Build classifier incrementally on iris")
    helper.print_info("Loading dataset: " + iris_file)
    loader = Loader("weka.core.converters.ArffLoader")
    iris_inc = loader.load_file(iris_file, incremental=True)
    iris_inc.class_is_last()
    classifier = Classifier(classname="weka.classifiers.bayes.NaiveBayesUpdateable")
    classifier.build_classifier(iris_inc)
    # feed remaining instances one at a time via the loader's iterator
    for inst in loader:
        classifier.update_classifier(inst)
    print(classifier)

    # construct meta-classifiers
    helper.print_title("Meta classifiers")
    # generic FilteredClassifier instantiation
    print("generic FilteredClassifier instantiation")
    meta = SingleClassifierEnhancer(classname="weka.classifiers.meta.FilteredClassifier")
    meta.classifier = Classifier(classname="weka.classifiers.functions.LinearRegression")
    flter = Filter("weka.filters.unsupervised.attribute.Remove")
    flter.options = ["-R", "first"]
    # generic wrapper has no 'filter' property, so set it via the Java object
    meta.set_property("filter", flter.jobject)
    print(meta.to_commandline())

    # direct FilteredClassifier instantiation
    print("direct FilteredClassifier instantiation")
    meta = FilteredClassifier()
    meta.classifier = Classifier(classname="weka.classifiers.functions.LinearRegression")
    flter = Filter("weka.filters.unsupervised.attribute.Remove")
    flter.options = ["-R", "first"]
    meta.filter = flter
    print(meta.to_commandline())

    # generic Vote
    print("generic Vote instantiation")
    meta = MultipleClassifiersCombiner(classname="weka.classifiers.meta.Vote")
    classifiers = [
        Classifier(classname="weka.classifiers.functions.SMO"),
        Classifier(classname="weka.classifiers.trees.J48")
    ]
    meta.classifiers = classifiers
    print(meta.to_commandline())

    # cross-validate nominal classifier
    helper.print_title("Cross-validating NaiveBayes on diabetes")
    diabetes_file = helper.get_data_dir() + os.sep + "diabetes.arff"
    helper.print_info("Loading dataset: " + diabetes_file)
    loader = Loader("weka.core.converters.ArffLoader")
    diabetes_data = loader.load_file(diabetes_file)
    diabetes_data.class_is_last()
    classifier = Classifier(classname="weka.classifiers.bayes.NaiveBayes")
    pred_output = PredictionOutput(
        classname="weka.classifiers.evaluation.output.prediction.PlainText",
        options=["-distribution"])
    evaluation = Evaluation(diabetes_data)
    evaluation.crossvalidate_model(classifier, diabetes_data, 10, Random(42), output=pred_output)
    print(evaluation.summary())
    print(evaluation.class_details())
    print(evaluation.matrix())
    # dump the complete set of statistics the Evaluation class exposes
    print("areaUnderPRC/0: " + str(evaluation.area_under_prc(0)))
    print("weightedAreaUnderPRC: " + str(evaluation.weighted_area_under_prc))
    print("areaUnderROC/1: " + str(evaluation.area_under_roc(1)))
    print("weightedAreaUnderROC: " + str(evaluation.weighted_area_under_roc))
    print("avgCost: " + str(evaluation.avg_cost))
    print("totalCost: " + str(evaluation.total_cost))
    print("confusionMatrix: " + str(evaluation.confusion_matrix))
    print("correct: " + str(evaluation.correct))
    print("pctCorrect: " + str(evaluation.percent_correct))
    print("incorrect: " + str(evaluation.incorrect))
    print("pctIncorrect: " + str(evaluation.percent_incorrect))
    print("unclassified: " + str(evaluation.unclassified))
    print("pctUnclassified: " + str(evaluation.percent_unclassified))
    print("coverageOfTestCasesByPredictedRegions: " + str(evaluation.coverage_of_test_cases_by_predicted_regions))
    print("sizeOfPredictedRegions: " + str(evaluation.size_of_predicted_regions))
    print("falseNegativeRate: " + str(evaluation.false_negative_rate(1)))
    print("weightedFalseNegativeRate: " + str(evaluation.weighted_false_negative_rate))
    print("numFalseNegatives: " + str(evaluation.num_false_negatives(1)))
    print("trueNegativeRate: " + str(evaluation.true_negative_rate(1)))
    print("weightedTrueNegativeRate: " + str(evaluation.weighted_true_negative_rate))
    print("numTrueNegatives: " + str(evaluation.num_true_negatives(1)))
    print("falsePositiveRate: " + str(evaluation.false_positive_rate(1)))
    print("weightedFalsePositiveRate: " + str(evaluation.weighted_false_positive_rate))
    print("numFalsePositives: " + str(evaluation.num_false_positives(1)))
    print("truePositiveRate: " + str(evaluation.true_positive_rate(1)))
    print("weightedTruePositiveRate: " + str(evaluation.weighted_true_positive_rate))
    print("numTruePositives: " + str(evaluation.num_true_positives(1)))
    print("fMeasure: " + str(evaluation.f_measure(1)))
    print("weightedFMeasure: " + str(evaluation.weighted_f_measure))
    print("unweightedMacroFmeasure: " + str(evaluation.unweighted_macro_f_measure))
    print("unweightedMicroFmeasure: " + str(evaluation.unweighted_micro_f_measure))
    print("precision: " + str(evaluation.precision(1)))
    print("weightedPrecision: " + str(evaluation.weighted_precision))
    print("recall: " + str(evaluation.recall(1)))
    print("weightedRecall: " + str(evaluation.weighted_recall))
    print("kappa: " + str(evaluation.kappa))
    print("KBInformation: " + str(evaluation.kb_information))
    print("KBMeanInformation: " + str(evaluation.kb_mean_information))
    print("KBRelativeInformation: " + str(evaluation.kb_relative_information))
    print("SFEntropyGain: " + str(evaluation.sf_entropy_gain))
    print("SFMeanEntropyGain: " + str(evaluation.sf_mean_entropy_gain))
    print("SFMeanPriorEntropy: " + str(evaluation.sf_mean_prior_entropy))
    print("SFMeanSchemeEntropy: " + str(evaluation.sf_mean_scheme_entropy))
    print("matthewsCorrelationCoefficient: " + str(evaluation.matthews_correlation_coefficient(1)))
    print("weightedMatthewsCorrelation: " + str(evaluation.weighted_matthews_correlation))
    print("class priors: " + str(evaluation.class_priors))
    print("numInstances: " + str(evaluation.num_instances))
    print("meanAbsoluteError: " + str(evaluation.mean_absolute_error))
    print("meanPriorAbsoluteError: " + str(evaluation.mean_prior_absolute_error))
    print("relativeAbsoluteError: " + str(evaluation.relative_absolute_error))
    print("rootMeanSquaredError: " + str(evaluation.root_mean_squared_error))
    print("rootMeanPriorSquaredError: " + str(evaluation.root_mean_prior_squared_error))
    print("rootRelativeSquaredError: " + str(evaluation.root_relative_squared_error))
    print("prediction output:\n" + str(pred_output))
    # plot ROC and precision-recall curves for all class labels
    plot_cls.plot_roc(
        evaluation, title="ROC diabetes",
        class_index=range(0, diabetes_data.class_attribute.num_values), wait=False)
    plot_cls.plot_prc(
        evaluation, title="PRC diabetes",
        class_index=range(0, diabetes_data.class_attribute.num_values), wait=False)

    # load a numeric dataset
    bolts_file = helper.get_data_dir() + os.sep + "bolts.arff"
    helper.print_info("Loading dataset: " + bolts_file)
    loader = Loader("weka.core.converters.ArffLoader")
    bolts_data = loader.load_file(bolts_file)
    bolts_data.class_is_last()

    # build a classifier and output model
    helper.print_title("Training LinearRegression on bolts")
    classifier = Classifier(classname="weka.classifiers.functions.LinearRegression",
                            options=["-S", "1", "-C"])
    classifier.build_classifier(bolts_data)
    print(classifier)

    # cross-validate numeric classifier
    helper.print_title("Cross-validating LinearRegression on bolts")
    classifier = Classifier(classname="weka.classifiers.functions.LinearRegression",
                            options=["-S", "1", "-C"])
    evaluation = Evaluation(bolts_data)
    evaluation.crossvalidate_model(classifier, bolts_data, 10, Random(42))
    print(evaluation.summary())
    print("correlationCoefficient: " + str(evaluation.correlation_coefficient))
    print("errorRate: " + str(evaluation.error_rate))
    helper.print_title("Header - bolts")
    print(str(evaluation.header))
    helper.print_title("Predictions on bolts")
    for index, pred in enumerate(evaluation.predictions):
        print(str(index+1) + ": " + str(pred) + " -> error=" + str(pred.error))
    plot_cls.plot_classifier_errors(evaluation.predictions, wait=False)

    # learning curve
    cls = [
        Classifier(classname="weka.classifiers.trees.J48"),
        Classifier(classname="weka.classifiers.bayes.NaiveBayesUpdateable")]
    plot_cls.plot_learning_curve(
        cls, diabetes_data, increments=0.05, label_template="[#] !",
        metric="percent_correct", wait=True)

    # access classifier's Java API directly through the jwrapper
    labor_file = helper.get_data_dir() + os.sep + "labor.arff"
    helper.print_info("Loading dataset: " + labor_file)
    loader = Loader("weka.core.converters.ArffLoader")
    labor_data = loader.load_file(labor_file)
    labor_data.class_is_last()
    helper.print_title("Using JRip's Java API to access rules")
    jrip = Classifier(classname="weka.classifiers.rules.JRip")
    jrip.build_classifier(labor_data)
    rset = jrip.jwrapper.getRuleset()
    for i in range(rset.size()):
        r = rset.get(i)
        print(str(r.toString(labor_data.class_attribute.jobject)))
# Cross-validate NaiveBayes on weather.nominal with per-instance prediction
# output (including class distributions) and display the ROC curve.
# The dataset directory can be overridden via the WEKAMOOC_DATA env variable.
data_dir = os.environ.get("WEKAMOOC_DATA")
if data_dir is None:
    data_dir = "." + os.sep + "data"

import weka.core.jvm as jvm
from weka.core.converters import Loader
from weka.classifiers import Classifier, Evaluation, PredictionOutput
from weka.core.classes import Random
import weka.plot.classifiers as plc

jvm.start()

# load weather.nominal
loader = Loader(classname="weka.core.converters.ArffLoader")
fname = data_dir + os.sep + "weather.nominal.arff"
print("\nLoading dataset: " + fname + "\n")
data = loader.load_file(fname)
data.class_is_last()

# cross-validate NaiveBayes, capturing plain-text predictions with distributions
nb = Classifier(classname="weka.classifiers.bayes.NaiveBayes")
pred_out = PredictionOutput(
    classname="weka.classifiers.evaluation.output.prediction.PlainText",
    options=["-distribution"])
evaluation = Evaluation(data)
evaluation.crossvalidate_model(nb, data, 10, Random(1), pred_out)

# report results and show the ROC curve
print(evaluation.summary())
print(evaluation.matrix())
print(pred_out)
plc.plot_roc(evaluation, wait=True)

jvm.stop()
# Numeric-prediction demo on diabetes: binarize nominal attributes, evaluate
# LinearRegression via cross-validation, then append its predictions as a new
# attribute using the AddClassification filter.
# NOTE(review): `data_dir` and `loader` are defined earlier in the script.
fname = data_dir + os.sep + "diabetes.arff"
print("\nLoading dataset: " + fname + "\n")
data = loader.load_file(fname)
# we'll set the class attribute after filtering

# apply NominalToBinary filter and set class attribute
fltr = Filter("weka.filters.unsupervised.attribute.NominalToBinary")
fltr.inputformat(data)
filtered = fltr.filter(data)
filtered.class_is_last()

# cross-validate LinearRegression on filtered data, display model
cls = Classifier(classname="weka.classifiers.functions.LinearRegression")
pout = PredictionOutput(classname="weka.classifiers.evaluation.output.prediction.PlainText")
evl = Evaluation(filtered)
evl.crossvalidate_model(cls, filtered, 10, Random(1), pout)
print("10-fold cross-validation:\n" + evl.summary())
print("Predictions:\n\n" + str(pout))
cls.build_classifier(filtered)
print("Model:\n\n" + str(cls))

# use AddClassification filter with LinearRegression on filtered data
# (the filter trains its own copy of the classifier internally)
print("Applying AddClassification to filtered data:\n")
fltr = Filter(
    classname="weka.filters.supervised.attribute.AddClassification",
    options=["-W", "weka.classifiers.functions.LinearRegression", "-classification"])
fltr.inputformat(filtered)
classified = fltr.filter(filtered)
print(classified)

# convert class back to nominal
# Consolidated imports for this snippet (same modules the original pulled in
# piecemeal mid-stream).
from weka.classifiers import Classifier, Evaluation, PredictionOutput
from weka.core.classes import Random

# NaiveBayes with default settings -- no options of interest to adjust.
cls = Classifier(classname="weka.classifiers.bayes.NaiveBayes")

# Fit the model on the training data.
cls.build_classifier(train)
# print(cls)
#import weka.plot.graph as graph
#graph.plot_dot_graph(cls.graph)

# 10-fold cross-validation on the training set.
evl = Evaluation(train)
evl.crossvalidate_model(cls, train, 10, Random(1))
print("Kappa Score")
print(evl.kappa)  # 0.50 - Not bad
print("Evaluation Summary")
print(evl.summary())  # Accuracy: 83%

## Test model on new data ##
# Cross-validate on the test set, capturing per-instance predictions
# (with class distributions) as plain text.
evl = Evaluation(test)
pred_output = PredictionOutput(
    classname="weka.classifiers.evaluation.output.prediction.PlainText",
    options=["-distribution"])
evl.crossvalidate_model(cls, test, 10, Random(1), pred_output)
from weka.filters import Filter

# convert csv into arff format (weka compatible)
# use convertcsvtoarff.py file

# Load the ARFF file once; the class attribute is the last column.
# (The original reloaded into iris_data from an undefined "iris_file"
# variable, which raised NameError -- the duplicate load is removed.)
loader = Loader("weka.core.converters.ArffLoader")
iris_data = loader.load_file("reviewsinformation_task2.arff")
iris_data.class_is_last()

# kernel classifier: SMO with an RBF kernel
helper.print_title("Creating SMO as KernelClassifier")
kernel = Kernel(
    classname="weka.classifiers.functions.supportVector.RBFKernel",
    options=["-G", "0.001"])
classifier = KernelClassifier(
    classname="weka.classifiers.functions.SMO", options=["-M"])
classifier.kernel = kernel
classifier.build_classifier(iris_data)
print("classifier: " + classifier.to_commandline())
print("model:\n" + str(classifier))

# Cross-validate on the dataset loaded above.  Evaluation requires an
# Instances object (the original passed a filename string) and the data
# being evaluated must be the loaded iris_data (the original referenced an
# undefined "diabetes_data").
evaluation = Evaluation(iris_data)
evaluation.crossvalidate_model(
    classifier, iris_data, 10, Random(42), output=pred_output)
print(evaluation.summary())
print(evaluation.class_details())
print(evaluation.matrix())
loader = Loader(classname="weka.core.converters.ArffLoader") data = loader.load_file(fname) data.class_is_last() classifiers = [ "weka.classifiers.bayes.NaiveBayes", "weka.classifiers.lazy.IBk", "weka.classifiers.trees.J48" ] # cross-validate classifiers for classifier in classifiers: # classifier itself cls = Classifier(classname=classifier) evl = Evaluation(data) evl.crossvalidate_model(cls, data, 10, Random(1)) print("%s: %0.0f%%" % (classifier, evl.percent_correct)) # meta with cfssubseteval meta = SingleClassifierEnhancer(classname="weka.classifiers.meta.AttributeSelectedClassifier") meta.options = \ ["-E", "weka.attributeSelection.CfsSubsetEval", "-S", "weka.attributeSelection.BestFirst", "-W", classifier] evl = Evaluation(data) evl.crossvalidate_model(meta, data, 10, Random(1)) print("%s (cfs): %0.0f%%" % (classifier, evl.percent_correct)) # meta with wrapper meta = SingleClassifierEnhancer(classname="weka.classifiers.meta.AttributeSelectedClassifier") meta.options = \ ["-E", "weka.attributeSelection.WrapperSubsetEval -B " + classifier, "-S", "weka.attributeSelection.BestFirst",
# load a dataset
iris_file = "HairEyeColor.csv"
print("Loading dataset: " + iris_file)
loader = Loader(classname="weka.core.converters.CSVLoader")
iris_data = loader.load_file(iris_file)
# num_attributes is a property (printed as one on the previous line, but the
# original then called it as a method, which raised TypeError); set the class
# attribute the same way the other snippets in this file do.
print(iris_data.num_attributes)
iris_data.class_is_last()

# build a classifier and output model
# "weka.test.Regression" is not a classifier class; the print statement and
# the original commented-out line show J48 was intended.
print("Training J48 classifier on iris")
classifier = Classifier(classname="weka.classifiers.trees.J48", options=["-C", "0.5"])
# Instead of using 'options=["-C", "0.3"]' in the constructor, we can also set the "confidenceFactor"
# property of the J48 classifier itself. However, being of type float rather than double, we need
# to convert it to the correct type first using the double_to_float function:
#classifier.set_property("confidenceFactor", types.double_to_float(0.3))
classifier.build_classifier(iris_data)
print(classifier)
print(classifier.graph)  # graph is a property, not a method
#plot_graph.plot_dot_graph(classifier.graph)

evaluation = Evaluation(iris_data)  # initialize with priors
evaluation.crossvalidate_model(classifier, iris_data, 10, Random(42))  # 10-fold CV
# summary() is the correct method name (there is no to_summary(), and the
# rest of this file uses summary()); percent_correct/incorrect are
# properties, consistent with their use elsewhere in this file.
print(evaluation.summary())
print("pctCorrect: " + str(evaluation.percent_correct))
print("incorrect: " + str(evaluation.incorrect))

jvm.stop()