def experiment_file_random(path_features, path_folder_save_results, options,
                           classifier, fold, random, name):
    """Cross-validate one Weka classifier on one dataset and save the outcome.

    The last attribute of the loaded dataset is used as the class. A one-row
    summary (accuracy, error rate, confusion matrix) is written to
    ``<path_folder_save_results>/<name>.csv`` and the buffered prediction
    output to ``<path_folder_save_results>/prediction/<name>.csv``.

    :param path_features: dataset path, loaded with ``converters.load_any_file``
    :param path_folder_save_results: folder that receives both output files
    :param options: classifier options as one command-line style string
    :param classifier: Weka class name of the classifier
    :param fold: number of cross-validation folds
    :param random: seed handed to ``Random``
    :param name: base name of the output files (converted with ``str``)
    """
    print("start weka")
    model = Classifier(classname=classifier,
                       options=weka.core.classes.split_options(options))

    dataset = converters.load_any_file(path_features)
    dataset.class_is_last()

    prediction_sink = PredictionOutput(
        classname="weka.classifiers.evaluation.output.prediction.CSV")
    evaluation = Evaluation(dataset)
    evaluation.crossvalidate_model(model, dataset, fold, Random(random),
                                   prediction_sink)

    # One row per metric column; the confusion matrix is stored as returned
    # by Evaluation.matrix().
    summary = pd.DataFrame(data={
        'percent_correct': [evaluation.percent_correct],
        'percent_incorrect': [evaluation.percent_incorrect],
        'confusion_matrix': [evaluation.matrix()],
    })
    csv_name = str(name) + '.csv'
    summary.to_csv(path_folder_save_results + '/' + csv_name, index=False)

    predictions_text = prediction_sink.buffer_content()
    predictions_path = path_folder_save_results + '/' + 'prediction/' + csv_name
    with open(predictions_path, 'w') as handle:
        handle.write(predictions_text)
def Boost_J48(data, rnm):
    """Evaluate AdaBoostM1 wrapped around a filtered J48 tree and write reports.

    The last attribute of ``data`` is used as the class. The boosted model is
    built on the full dataset, then evaluated with 10-fold cross-validation
    (seed 1). Four files are written, all prefixed with ``rnm``: the model
    text, the prediction output, the evaluation summary with class details,
    and a ROC plot.

    :param data: Weka dataset to train and evaluate on
    :param rnm: prefix (string) used for every output file name
    :return: percentage of correctly classified instances, as a string
    """
    data.class_is_last()

    # Base learner: J48 behind a Remove filter configured with "-R first".
    fc1 = FilteredClassifier()
    fc1.classifier = Classifier(classname="weka.classifiers.trees.J48",
                                options=["-C", "0.25", "-M", "2"])
    fc1.filter = Filter(classname="weka.filters.unsupervised.attribute.Remove",
                        options=["-R", "first"])

    fc2 = SingleClassifierEnhancer(classname="weka.classifiers.meta.AdaBoostM1",
                                   options=["-P", "100", "-S", "1", "-I", "10"])
    fc2.classifier = fc1

    pred_output = PredictionOutput(
        classname="weka.classifiers.evaluation.output.prediction.CSV",
        options=["-p", "1"])
    folds = 10
    fc2.build_classifier(data)
    evaluation = Evaluation(data)
    evaluation.crossvalidate_model(fc2, data, folds, Random(1), pred_output)

    # The reports were produced with the Python 2 "print >> file" statement,
    # which raises TypeError on Python 3. Explicit write() calls emit the same
    # text on either version, and "with" closes the files even on error.
    with open(rnm + '_Boost_J48_Tree.txt', 'w') as f0:
        # Two spaces after the colon: the literal ends with a space and the
        # print statement added its own separator.
        f0.write("Filename:  %s\n" % (rnm,))
        f0.write('\n\n\n')
        f0.write('%s\n' % (str(fc2),))

    with open(rnm + '_Boost_J48_Prediction.txt', 'w') as f1:
        f1.write('Filename: %s\n' % (rnm,))
        f1.write('Prediction Summary: %s\n' % (pred_output.buffer_content(),))

    # NOTE: lower-case "j48" in this file name is kept as-is because existing
    # consumers may look for exactly this name.
    with open(rnm + '_Boost_j48_Evaluation.txt', 'w') as f2:
        f2.write('Filename: %s\n' % (rnm,))
        f2.write('Evaluation Summary: %s\n' % (evaluation.summary(),))
        f2.write('\n\n\n\n')
        f2.write('%s\n' % (evaluation.class_details(),))

    plot_roc(evaluation, class_index=[0, 1], title=rnm, key_loc='best',
             outfile=rnm + '_Boost_J48_ROC.png', wait=False)
    return str(evaluation.percent_correct)
def RandomTree(data, rnm):
    """Evaluate a filtered RandomTree classifier and write report files.

    The last attribute of ``data`` is used as the class. The classifier is
    evaluated with 10-fold cross-validation (seed 1) and afterwards built on
    the full dataset so its textual model can be saved. Four files are
    written, all prefixed with ``rnm``: the model text, the prediction output,
    the evaluation summary with class details, and a ROC plot.

    :param data: Weka dataset to train and evaluate on
    :param rnm: prefix (string) used for every output file name
    :return: percentage of correctly classified instances, as a string
    """
    data.class_is_last()

    # RandomTree behind a Remove filter configured with "-R first".
    fc = FilteredClassifier()
    fc.classifier = Classifier(classname="weka.classifiers.trees.RandomTree",
                               options=["-K", "0", "-M", "1.0", "-V", "0.001", "-S", "1"])
    fc.filter = Filter(classname="weka.filters.unsupervised.attribute.Remove",
                       options=["-R", "first"])

    pred_output = PredictionOutput(
        classname="weka.classifiers.evaluation.output.prediction.CSV",
        options=["-p", "1"])
    folds = 10
    evl = Evaluation(data)
    evl.crossvalidate_model(fc, data, folds, Random(1), pred_output)
    fc.build_classifier(data)

    # The reports were produced with the Python 2 "print >> file" statement,
    # which raises TypeError on Python 3. Explicit write() calls emit the same
    # text on either version, and "with" closes the files even on error.
    with open(rnm + '_RT_Tree.txt', 'w') as f0:
        # Two spaces after the colon: the literal ends with a space and the
        # print statement added its own separator.
        f0.write("Filename:  %s\n" % (rnm,))
        f0.write('\n\n\n')
        f0.write('%s\n' % (str(fc),))

    with open(rnm + '_RT_Prediction.txt', 'w') as f1:
        f1.write('Filename: %s\n' % (rnm,))
        f1.write('Prediction Summary: %s\n' % (pred_output.buffer_content(),))

    with open(rnm + '_RT_Evaluation.txt', 'w') as f2:
        f2.write('Filename: %s\n' % (rnm,))
        f2.write('Evaluation Summary: %s\n' % (evl.summary(),))
        f2.write('\n\n\n\n')
        f2.write('%s\n' % (evl.class_details(),))

    plot_roc(evl, class_index=[0, 1], title=rnm, key_loc='best',
             outfile=rnm + '_RT_ROC.png', wait=False)
    return str(evl.percent_correct)
def experiment_more_file(path_files, path_folder_save_results, fold, options,
                         classifier, random, name):
    """Cross-validate one classifier on every CSV file found in a folder.

    For each file whose name contains ``.csv`` the last attribute is used as
    the class, the classifier is cross-validated, and the buffered prediction
    output is written to
    ``<path_folder_save_results>/prediction/<name><file stem>pred_data.csv``.
    A summary with one row per file is written to
    ``<path_folder_save_results>/<name>.csv``.

    :param path_files: folder containing the input files
    :param path_folder_save_results: folder that receives the outputs
    :param fold: number of cross-validation folds
    :param options: classifier options as one command-line style string
    :param classifier: Weka class name of the classifier
    :param random: seed handed to ``Random``
    :param name: base name of the outputs (converted with ``str``)
    """
    cls = Classifier(classname=classifier,
                     options=weka.core.classes.split_options(options))

    # Build a new list rather than calling remove() on the list being
    # iterated: removing during iteration skips the entry after each removed
    # one, so two consecutive non-CSV entries let the second one through.
    file_list = [entry for entry in os.listdir(path_files) if ".csv" in entry]

    d_results = {
        'name_file': [],
        'percent_correct': [],
        'percent_incorrect': [],
        'confusion_matrix': []
    }
    print(file_list)
    for file_name in file_list:
        print(str(file_name))
        data = converters.load_any_file(path_files + "/" + file_name)
        data.class_is_last()
        pout = PredictionOutput(
            classname="weka.classifiers.evaluation.output.prediction.CSV")
        evl = Evaluation(data)
        evl.crossvalidate_model(cls, data, fold, Random(random), pout)

        d_results['name_file'].append(str(file_name))
        d_results['percent_correct'].append(evl.percent_correct)
        d_results['percent_incorrect'].append(evl.percent_incorrect)
        d_results['confusion_matrix'].append(evl.matrix())

        save = pout.buffer_content()
        # [:-4] strips the trailing four characters (the ".csv" suffix).
        prediction_path = (path_folder_save_results + '/' + 'prediction/' +
                           str(name) + str(file_name)[:-4] + 'pred_data.csv')
        with open(prediction_path, 'w') as f:
            f.write(save)

    d_results = pd.DataFrame(data=d_results)
    d_results.to_csv(path_folder_save_results + '/' + str(name) + ".csv",
                     index=False)
def runSMO(file, bound):
    """Cross-validate an SMO classifier on a CSV file and return class details.

    The first attribute of the loaded data is used as the class. A Remove
    filter configured with ``-R <bound>`` is applied before the 10-fold
    cross-validation (seed 1). The accuracy and the per-class details are
    printed.

    :param file: path of the CSV file to load
    :param bound: attribute range passed to the Remove filter's ``-R`` option
    :return: the text produced by ``Evaluation.class_details()``
    """
    dataset = Loader(classname="weka.core.converters.CSVLoader").load_file(file)
    dataset.class_is_first()

    attribute_remover = Filter(
        classname="weka.filters.unsupervised.attribute.Remove",
        options=["-R", bound])
    attribute_remover.inputformat(dataset)
    reduced = attribute_remover.filter(dataset)

    smo = KernelClassifier(
        classname="weka.classifiers.functions.SMO",
        options=["-C", "1.0", "-L", "0.001", "-P", "1.0E-12", "-N", "0"])
    smo.kernel = Kernel(
        classname="weka.classifiers.functions.supportVector.PolyKernel",
        options=["-C", "250007", "-E", "1.0"])

    prediction_sink = PredictionOutput(
        classname="weka.classifiers.evaluation.output.prediction.PlainText")
    evaluation = Evaluation(reduced)
    evaluation.crossvalidate_model(smo, reduced, 10, Random(1), prediction_sink)

    print(evaluation.percent_correct)
    class_report = evaluation.class_details()
    print(class_report)
    return class_report
def proses():
    """Cross-validate SMOreg (RBF kernel) on the module-level dataset ``anomali``.

    Reads the module-level names ``anomali`` and ``idx`` and mutates the
    module-level lists ``data_inst``, ``data_pred`` and ``data_std``:

    * the class-attribute values of ``anomali`` are appended to ``data_inst``;
    * one float per cross-validation prediction is appended to ``data_pred``;
    * ``ceil(std(data_inst)) * 0.1`` is inserted into ``data_std`` at ``idx``.

    NOTE(review): the source layout was flattened; the ``data_std`` insert and
    the final print are assumed to run once, after the prediction loop.
    """
    # The original comment here read "diluar def" (Indonesian: "outside the
    # def"); presumably these imports originally lived at module level.
    import math
    import numpy as np
    from weka.classifiers import Kernel, KernelClassifier
    from weka.classifiers import PredictionOutput

    klasifi = KernelClassifier(classname="weka.classifiers.functions.SMOreg",
                               options=["-N", "0"])
    # The wrapper exposes the kernel through its `kernel` property. The
    # previous assignment to `klasifi.vm` only created an unrelated Python
    # attribute, so the RBF kernel never reached the classifier.
    klasifi.kernel = Kernel(
        classname="weka.classifiers.functions.supportVector.RBFKernel",
        options=["-G", "0.1"])

    output_x = PredictionOutput(
        classname="weka.classifiers.evaluation.output.prediction.PlainText")
    kelola = Evaluation(anomali)
    kelola.crossvalidate_model(klasifi, anomali, 10, Random(0), output=output_x)

    data_inst.extend(anomali.values(anomali.class_index))

    for prediction in kelola.predictions:
        # The third whitespace-separated token of the prediction's string
        # form is taken as the value to record (presumably the predicted
        # value -- confirm against the wrapper's prediction format).
        fields = str(prediction).split()
        data_pred.append(float(fields[2]))

    data_std.insert(idx, math.ceil(np.std(data_inst)) * 0.1)
    print('\n DONE PROCESSING DATASET ATTRIBUTE ',
          anomali.attribute(anomali.class_index), '...')
def vote_classifier_train(dicrectory, nameOfDataSet, flag):
    """Evaluate a Weka ``Vote`` ensemble on a CSV dataset and report the result.

    The last attribute is used as the class. The ensemble combines six base
    learners (the bagged REPTree configuration appears twice) with the
    ``-R AVG`` combination rule.

    :param dicrectory: path of the CSV file to load
    :param nameOfDataSet: dataset label forwarded to ``print_and_save``
    :param flag: truthy for 10-fold cross-validation (seed 1), otherwise an
        80% train/test split (seed 1)
    """
    dataset = Loader(classname="weka.core.converters.CSVLoader").load_file(dicrectory)
    dataset.class_is_last()

    bagged_rep_tree = (
        'weka.classifiers.meta.Bagging -P 100 -S 1 -num-slots 1 -I 10 -W weka.classifiers.trees.REPTree -- '
        '-M 2 -V 0.001 -N 3 -S 1 -L -1 -I 0.0')
    base_learners = [
        'weka.classifiers.trees.J48 -C 0.25 -M 2',
        'weka.classifiers.trees.RandomTree -K 6 -M 1.0 -V 0.001 -S 1',
        bagged_rep_tree,
        'weka.classifiers.meta.AdaBoostM1 -P 100 -S 1 -I 10 -W weka.classifiers.trees.DecisionStump',
        bagged_rep_tree,
        'weka.classifiers.bayes.NaiveBayes ',
    ]
    # Each base learner is introduced by its own "-B" switch.
    vote_options = ['-S', '1']
    for spec in base_learners:
        vote_options.extend(['-B', spec])
    vote_options.extend(['-R', 'AVG'])

    ensemble = MultipleClassifiersCombiner(
        classname="weka.classifiers.meta.Vote", options=vote_options)

    evaluation = Evaluation(dataset)
    prediction_sink = PredictionOutput(
        classname="weka.classifiers.evaluation.output.prediction.PlainText")
    if not flag:
        evaluation.evaluate_train_test_split(ensemble, dataset, 80.0, Random(1),
                                             prediction_sink)
    else:
        evaluation.crossvalidate_model(ensemble, dataset, 10, Random(1),
                                       prediction_sink)

    gc.collect()
    print_and_save('Proposed model', flag, nameOfDataSet, evaluation)
def main(args):
    """
    Train J48 on a train split and record its test-set predictions twice.

    The dataset is loaded, split 66/34 with a fixed seed, and a J48 tree is
    built on the training part. Predictions on the test part are then
    recorded:

    1. in memory, while evaluating with ``test_model``
    2. straight to a CSV file in a separate prediction pass

    :param args: the commandline arguments (optional, can be dataset filename)
    :type args: list
    """
    # pick the dataset: second commandline argument if given, bundled vote.arff otherwise
    if len(args) > 1:
        data_file = args[1]
    else:
        data_file = helper.get_data_dir() + os.sep + "vote.arff"
    helper.print_info("Loading dataset: " + data_file)
    data = Loader(classname="weka.core.converters.ArffLoader").load_file(data_file)
    data.class_is_last()

    # split the data into train and test parts
    train, test = data.train_test_split(66.0, Random(1))

    # train the tree
    tree = Classifier(classname="weka.classifiers.trees.J48")
    tree.build_classifier(train)
    print(tree)

    # pass 1: predictions are buffered in memory during evaluation
    helper.print_title("recording predictions in-memory")
    csv_class = "weka.classifiers.evaluation.output.prediction.CSV"
    in_memory = PredictionOutput(classname=csv_class, options=["-distribution"])
    evaluation = Evaluation(train)
    evaluation.test_model(tree, test, output=in_memory)
    print(evaluation.summary())
    helper.print_info("Predictions:")
    print(in_memory.buffer_content())

    # pass 2: predictions are written directly to a file
    helper.print_title("recording/outputting predictions separately")
    outputfile = helper.get_tmp_dir() + "/j48_vote.csv"
    to_file = PredictionOutput(
        classname=csv_class,
        options=["-distribution", "-suppress", "-file", outputfile])
    to_file.header = test
    to_file.print_all(tree, test)
    helper.print_info("Predictions stored in:" + outputfile)
    # with "-suppress" nothing is kept in memory, so this prints no predictions
    print(to_file.buffer_content())
def SimpleLogistic():
    """Cross-validate SimpleLogistic and save a trained model to disk.

    Loads ``First_trial_classification.arff`` (last attribute is the class),
    runs 10-fold cross-validation with seed 486, prints the summary and the
    buffered predictions, then trains the classifier on the full dataset and
    serializes it to ``SimpleLogistic2.model``.
    """
    # load a dataset
    loader = Loader(classname="weka.core.converters.ArffLoader")
    data = loader.load_file("First_trial_classification.arff")
    data.class_is_last()  # set class attribute

    cls = Classifier(classname="weka.classifiers.functions.SimpleLogistic")
    pout = PredictionOutput(
        classname="weka.classifiers.evaluation.output.prediction.PlainText")
    evl = Evaluation(data)
    evl.crossvalidate_model(cls, data, 10, Random(486), pout)
    print(evl.summary())
    print(pout.buffer_content())

    # Cross-validation trains copies of the classifier, so `cls` itself is
    # still unbuilt here; train it on the full data before saving it.
    cls.build_classifier(data)

    # save model -- write_all() takes a list of objects, not a single object
    serialization.write_all("SimpleLogistic2.model", [cls])
def index():
    """Serve the form page (GET) or classify one submitted record (POST).

    On POST the form fields are turned into one ARFF data row (BMI is derived
    from ``weight`` and ``height``; the two status fields become ``'1'`` for
    ``"Yes"`` and ``'0'`` otherwise) with an unknown class (``?``). The row is
    appended to ``instances.arff``, the whole file is scored with the model
    stored in ``J48.model``, and the prediction of the last row is returned as
    JSON.

    :return: the rendered ``bot.html`` template for GET, a JSON ``Response``
        for POST, ``None`` for any other method
    :raises KeyError: if an expected form field is missing
    :raises ValueError: if ``weight`` or ``height`` is not numeric
    """
    if request.method == "GET":
        return render_template('bot.html')

    if request.method == "POST":
        # jvm.start() is called on every POST; the JVM is not stopped here.
        jvm.start()

        args = request.form.to_dict()
        weight_lb = float(args['weight']) * 2.20462
        bmi = (weight_lb / pow(float(args['height']), 2)) * 703
        hypertensive_status = '1' if args['hypertensive_status'] == "Yes" else '0'
        heart_disease_status = '1' if args['heart_disease_status'] == "Yes" else '0'

        # SECURITY NOTE(review): form values are written into the ARFF file
        # verbatim. A value containing a comma, quote or newline changes the
        # row structure; validate or escape these fields before relying on
        # this endpoint with untrusted clients.
        fields = [
            args['gender'],
            args['age'],
            hypertensive_status,
            heart_disease_status,
            args['marrital_status'],
            args['work_type'],
            args['residence'],
            args['hypertension'],
            str(bmi),
            "'" + args['smoking_status'].lower() + "'",
            "?",
        ]
        st = "\n" + ",".join(fields)
        print(st)

        # Open the file only after the row has been built, and let "with"
        # close it: previously a bad form value raised while the handle was
        # already open and it was never closed.
        with open("instances.arff", "a") as f:
            f.write(st)

        objects = serialization.read_all("J48.model")
        loader = Loader(classname="weka.core.converters.ArffLoader")
        csr = Classifier(jobject=objects[0])
        output_results = PredictionOutput(
            classname="weka.classifiers.evaluation.output.prediction.CSV")
        data1 = loader.load_file("instances.arff")
        data1.class_is_last()
        ev2 = Evaluation(data1)
        ev2.test_model(csr, data1, output_results)

        # The buffered CSV text is prefixed with extra column names so that
        # pandas exposes a "Predicted" column.
        csv_buffer = StringIO("Instance,Actual,Predicted," +
                              output_results.buffer_content())
        df = pd.read_csv(csv_buffer)
        # Last row = the record just appended; the text after ":" in its
        # "Predicted" cell is returned.
        prediction = list(df.Predicted).pop().split(":")[1]
        print(prediction)

        response = {"status": "200", "prediction": prediction}
        return Response(json.dumps(response, indent=2),
                        mimetype="application/json")
def run_bayesNet(file):
    """Cross-validate a TAN BayesNet on an ARFF file and save the reports.

    Files whose name does not end in ``.arff`` are reported and skipped. The
    first attribute is used as the class. After a 10-fold cross-validation
    (seed 1) the summary, class details and confusion matrix are printed, and
    the evaluation and prediction reports are written into a
    ``bayesNet_results`` folder next to the input file.

    :param file: pathlib path of the ARFF file
    """
    arff_name = file.parts[-1]
    parent_dir = file.parents[0]

    print("Running BayesNet on %s" % arff_name)
    if not arff_name.endswith(".arff"):
        print("%s not ARFF file." % arff_name)
        return
    # drop the 5-character ".arff" suffix
    stem = arff_name[:-5]

    dataset = load_Arff_file(file)
    dataset.class_is_first()

    bayes_net = Classifier(
        classname="weka.classifiers.bayes.BayesNet",
        options=[
            "-D",
            "-Q", "weka.classifiers.bayes.net.search.local.TAN",
            "--", "-P", "1", "-S", "BAYES",
            "-E", "weka.classifiers.bayes.net.estimate.SimpleEstimator",
            "--", "-A", "0.5",
        ])

    # per-instance predictions are collected here during cross-validation
    prediction_sink = PredictionOutput(
        classname="weka.classifiers.evaluation.output.prediction.PlainText")

    evaluation = Evaluation(dataset)
    evaluation.crossvalidate_model(bayes_net, dataset, 10, Random(1),
                                   output=prediction_sink)
    for report in (evaluation.summary(), evaluation.class_details(),
                   evaluation.confusion_matrix):
        print(report)

    results_dir = parent_dir / "bayesNet_results"
    results_dir.mkdir(parents=True, exist_ok=True)

    # evaluation report first, then the predictions
    output_eval(evaluation,
                results_dir / (stem + "_bayesNet_eval_results_TAN.txt"))
    output_pred(prediction_sink,
                results_dir / (stem + "_bayesNet_pred_results_TAN.txt"))
    print("BayesNet complete")
def SMOreg():
    """Cross-validate SMOreg (RBF kernel) and save a trained model to disk.

    Loads ``First_trial_regression.arff`` (last attribute is the class), runs
    10-fold cross-validation with seed 486, prints the summary and the
    buffered predictions, then trains the classifier on the full dataset and
    serializes it to ``SMOreg.model2``.
    """
    loader = Loader(classname="weka.core.converters.ArffLoader")
    data = loader.load_file("First_trial_regression.arff")
    data.class_is_last()

    cls = KernelClassifier(classname="weka.classifiers.functions.SMOreg",
                           options=["-N", "0"])
    kernel = Kernel(
        classname="weka.classifiers.functions.supportVector.RBFKernel",
        options=["-G", "0.2"])
    cls.kernel = kernel

    pout = PredictionOutput(
        classname="weka.classifiers.evaluation.output.prediction.PlainText")
    evl = Evaluation(data)
    evl.crossvalidate_model(cls, data, 10, Random(486), pout)
    print(evl.summary())
    print(pout.buffer_content())

    # Cross-validation trains copies of the classifier, so `cls` itself is
    # still unbuilt here; train it on the full data before saving it.
    cls.build_classifier(data)

    # save model -- write_all() takes a list of objects, not a single object
    serialization.write_all("SMOreg.model2", [cls])
def naive_bayse(dicrectory, nameOfDataSet, flag):
    """Evaluate NaiveBayes on a CSV dataset and report the result.

    The last attribute is used as the class.

    :param dicrectory: path of the CSV file to load
    :param nameOfDataSet: dataset label forwarded to ``print_and_save``
    :param flag: truthy for 10-fold cross-validation (seed 1), otherwise an
        80% train/test split (seed 1)
    """
    dataset = Loader(classname="weka.core.converters.CSVLoader").load_file(dicrectory)
    dataset.class_is_last()

    bayes = Classifier(classname='weka.classifiers.bayes.NaiveBayes')
    evaluation = Evaluation(dataset)
    prediction_sink = PredictionOutput(
        classname="weka.classifiers.evaluation.output.prediction.PlainText")

    if not flag:
        evaluation.evaluate_train_test_split(bayes, dataset, 80.0, Random(1),
                                             prediction_sink)
    else:
        evaluation.crossvalidate_model(bayes, dataset, 10, Random(1),
                                       prediction_sink)

    print_and_save('Naive Bayes model', flag, nameOfDataSet, evaluation)
    gc.collect()
def _train_report_predict(classname, options, train, test, train_banner,
                          prediction_banner):
    """Train one classifier, print its training summary, return test predictions."""
    classifier = Classifier(classname=classname, options=options)
    classifier.build_classifier(train)

    # Score the training data only to print the summary.
    train_evaluation = Evaluation(train)
    train_evaluation.test_model(classifier, train)
    print(train_banner)
    print(train_evaluation.summary())

    pred_output = PredictionOutput(
        classname="weka.classifiers.evaluation.output.prediction.CSV")
    test_evaluation = Evaluation(test)
    predictions = test_evaluation.test_model(classifier, test, pred_output)
    print(prediction_banner)
    print(pred_output)
    return predictions


def trainAndMakePred(train, test, output_file="predict.csv", start_id=901):
    """Train IBk and NaiveBayes and write their test predictions side by side.

    Both classifiers are built on ``train``; a training-set summary and the
    prediction output object are printed for each. The values returned by
    ``test_model`` on ``test`` are written to a CSV file with one row per
    instance: a running ID, the IBk prediction and the NaiveBayes prediction.

    :param train: Weka dataset used for training
    :param test: Weka dataset to predict
    :param output_file: path of the CSV file to write (default ``predict.csv``)
    :param start_id: ID assigned to the first row (default 901)
    """
    # IBk test and prediction
    predicted_indicesIBK = _train_report_predict(
        "weka.classifiers.lazy.IBk", ["-K", "5"], train, test,
        " IBKTraining information ", " IBK Prediction information ")

    # Naive Bayes test and prediction
    predicted_indicesNB = _train_report_predict(
        "weka.classifiers.bayes.NaiveBayes", ["-D"], train, test,
        " Naive Bayes Training information ",
        " Naive Bayes Prediction information ")

    # Output predictions to file; "with" closes it even if a write fails.
    with open(output_file, "w") as f:
        f.write("ID,Predict 1,Predict 2\n")
        paired = zip(predicted_indicesIBK, predicted_indicesNB)
        for row_id, (pred1, pred2) in enumerate(paired, start=start_id):
            f.write("%s,%s,%s\n" % (row_id, pred1, pred2))
def run_ibk(file):
    """Cross-validate IBk (K=3) on an ARFF file and save the reports.

    Files whose name does not end in ``.arff`` are reported and skipped. The
    first attribute is used as the class. After a 10-fold cross-validation
    (seed 1) the evaluation and prediction reports are written next to the
    input file.

    :param file: pathlib path of the ARFF file
    """
    arff_name = file.parts[-1]
    parent_dir = file.parents[0]

    print("Running IBk on %s" % arff_name)
    if not arff_name.endswith(".arff"):
        print("%s not ARFF file." % arff_name)
        return
    # drop the 5-character ".arff" suffix
    stem = arff_name[:-5]

    dataset = load_Arff_file(file)
    dataset.class_is_first()

    ibk = Classifier(classname="weka.classifiers.lazy.IBk",
                     options=["-K", "3"])

    # per-instance predictions are collected here during cross-validation
    prediction_sink = PredictionOutput(
        classname="weka.classifiers.evaluation.output.prediction.PlainText")

    evaluation = Evaluation(dataset)
    evaluation.crossvalidate_model(ibk, dataset, 10, Random(1),
                                   output=prediction_sink)

    # evaluation report first, then the predictions
    output_eval(evaluation, parent_dir / (stem + "_eval_results.txt"))
    output_pred(prediction_sink, parent_dir / (stem + "_pred_results.txt"))
    print("IBk complete")
def run_weka_csv_train_test(train_file_path, test_file_path):
    """
    Train JRip on one CSV file and evaluate it on the train and test files.

    1) Both CSV files are loaded through the Weka CSV loader.
    2) The attribute selected by the filter option ``-R last`` is converted
       from numeric to nominal; the attribute named "Label" becomes the class.
    3) JRip is built on the training data and its rules are printed.
    4) The model is evaluated on the test data and on the training data.

    :param train_file_path: path of the training CSV file
    :param test_file_path: path of the test CSV file
    :return: ``(get_interesting(train evaluation), get_interesting(test evaluation))``
    :raises Exception: if the train and test headers are not compatible
    """
    # NOTE(review): these DataFrames are not used below; the calls are kept
    # because read_df_csv is defined elsewhere and may have effects.
    train_df = read_df_csv(train_file_path)
    test_df = read_df_csv(test_file_path)

    # JRip with its default options
    rule_learner = Classifier(classname="weka.classifiers.rules.JRip")
    csv_loader = Loader(classname="weka.core.converters.CSVLoader")

    raw_train = read_weka_csv(train_file_path, csv_loader)
    raw_test = read_weka_csv(test_file_path, csv_loader)

    train_label = raw_train.attribute_by_name("Label")
    test_label = raw_test.attribute_by_name("Label")

    # Numeric -> nominal conversion; the filter is initialised on the training
    # data and then re-used unchanged for the test data.
    to_nominal = Filter(
        classname="weka.filters.unsupervised.attribute.NumericToNominal",
        options=["-R", "last"])
    to_nominal.inputformat(raw_train)

    train_set = to_nominal.filter(raw_train)
    train_set.class_index = train_label.index
    test_set = to_nominal.filter(raw_test)
    test_set.class_index = test_label.index

    mismatch = train_set.equal_headers(test_set)
    if mismatch is not None:
        raise Exception("Train and test not compatible:\n" + mismatch)

    rule_learner.build_classifier(train_set)
    # print the learned rules
    print(rule_learner.jwrapper)

    prediction_sink = PredictionOutput(
        classname="weka.classifiers.evaluation.output.prediction.PlainText",
        options=["-distribution"])

    test_evaluation = Evaluation(train_set)
    test_evaluation.test_model(rule_learner, test_set, output=prediction_sink)
    train_evaluation = Evaluation(train_set)
    train_evaluation.test_model(rule_learner, train_set, output=prediction_sink)

    # train metrics first, test metrics second
    train_metrics = get_interesting(train_evaluation)
    test_metrics = get_interesting(test_evaluation)
    return train_metrics, test_metrics
# Regression branch: runs when isreal() reports the class variable as real.
# isreal, classvarStr and dataA are defined outside this fragment.
# NOTE(review): the source layout was flattened; every statement below is
# assumed to belong to this `if` body -- confirm against the original file.
if isreal(classvarStr) == True:
    # M5P tree configured with the options "-U -M 500.0".
    classifier = Classifier(classname="weka.classifiers.trees.M5P", options=["-U", "-M", "500.0"])
    print("\n--> building:")
    print(classifier.to_commandline())
    classifier.build_classifier(dataA)
    print("\n--> classifier:\n")
    print(classifier)
    print("\n--> graph:\n")
    print(classifier.graph)
    # Predictions on the training data are written straight to a CSV file in
    # the temp directory ("-file"); "-suppress" is also set.
    outputfile = helper.get_tmp_dir() + "/result.csv"
    output = PredictionOutput(
        classname='weka.classifiers.evaluation.output.prediction.CSV',
        options=["-distribution", "-suppress", "-file", outputfile])
    print("\n--> Output:\n")
    output.header = dataA
    output.print_all(classifier, dataA)
    helper.print_info("Predictions stored in:" + outputfile)
    # NOTE(review): with "-suppress" the buffer is presumably empty here.
    print(output.buffer_content())
    # Evaluate the model on the same data it was trained on, reusing `output`.
    Eval = Evaluation(dataA)
    Eval.test_model(classifier, dataA, output=output)
    print(Eval.summary())
    ListEval = []
    Corr = []
    Corrf = []
    # Split the summary text around the "Mean absolute error" label.
    ListEval = Eval.summary().split('Mean absolute error')
    print("ListEval :")
    print(ListEval)
def experiment_sequential_file(path_indices, path_features,
                               path_folder_save_results, options, classifier,
                               name, indicator_col, images):
    """Run a sequential (block-wise) hold-out experiment on one dataset.

    The index array loaded from ``path_indices`` (shifted by one to 1-based
    row numbers, with the dataset length appended) marks where each block
    starts. Every block is used once as the test set while the classifier is
    trained on all remaining rows. Per-block metrics are written to
    ``<path_folder_save_results>/<name>results.csv``; the accumulated
    prediction output is written to
    ``<path_folder_save_results>/prediction/<name>pred_data.csv``, read back
    and handed to ``create_prediction``.

    NOTE(review): the source layout was flattened; the loop over the archive
    entries is assumed to contain only the ``ind`` assignment, and the
    prediction file is assumed to be written once, after all blocks.

    :param path_indices: path of the index archive opened with ``load``
    :param path_features: dataset path, loaded with ``converters.load_any_file``
    :param path_folder_save_results: folder that receives the outputs
    :param options: classifier options as one command-line style string
    :param classifier: Weka class name of the classifier
    :param name: prefix (string) of the output file names
    :param indicator_col: forwarded to ``create_prediction``
    :param images: forwarded to ``create_prediction``
    """
    index_archive = load(path_indices)
    for key in index_archive.files:
        # the array of the last entry wins
        ind = index_archive[key] + 1

    cls = Classifier(classname=classifier,
                     options=weka.core.classes.split_options(options))
    data = converters.load_any_file(path_features)
    ind = np.append(ind, len(data))
    data.class_is_last()

    # one shared sink: predictions of all blocks accumulate in it
    pout = PredictionOutput(
        classname="weka.classifiers.evaluation.output.prediction.CSV")

    d_results = {
        'index': [],
        'percent_correct': [],
        'percent_incorrect': [],
        'precision': [],
        'recall': [],
        'f-score': [],
        'confusion_matrix': []
    }

    block_count = len(ind) - 1
    final_block = block_count - 1
    end_row = ind[-1]
    for j in range(block_count):
        first = ind[j]
        # the final block runs up to the appended dataset length; every other
        # block stops one row before the next block starts
        last = ind[j + 1] if j == final_block else ind[j + 1] - 1
        d_test = data.subset(row_range=str(first) + '-' + str(last))

        if j == 0:  # first block: train on everything after it
            after = str(last + 1) + '-' + str(end_row)
            d_train = data.subset(row_range=after)
            print(after)
        elif j == final_block:  # last block: train on everything before it
            before = '1-' + str(first - 1)
            d_train = data.subset(row_range=before)
            print(before)
        else:  # central block: train on both sides
            around = '1-' + str(first - 1) + ',' + str(last + 1) + '-' + str(
                end_row)
            print(around)
            d_train = data.subset(row_range=around)

        cls.build_classifier(d_train)
        evl = Evaluation(data)
        evl.test_model(cls, d_test, pout)

        d_results['index'].append(str(ind[j]))
        d_results['percent_correct'].append(evl.percent_correct)
        d_results['percent_incorrect'].append(evl.percent_incorrect)
        d_results['precision'].append(evl.precision(1))
        d_results['recall'].append(evl.recall(1))
        d_results['f-score'].append(evl.f_measure(1))
        d_results['confusion_matrix'].append(evl.matrix())

    save = pout.buffer_content()
    check_folder_or_create(path_folder_save_results + '/' + 'prediction')
    prediction_file = (path_folder_save_results + '/' + 'prediction/' + name +
                       'pred_data.csv')
    with open(prediction_file, 'w') as f:
        f.write(save)

    # read the predictions back without a header row; columns 1, 2 and 3 are
    # forwarded as label, prediction and difference
    buffer_save = pd.read_csv(prediction_file, index_col=False, header=None)
    create_prediction(buffer_save[1], buffer_save[2], buffer_save[3],
                      indicator_col, images, name,
                      path_folder_save_results + '/prediction/')

    d_results = pd.DataFrame(data=d_results)
    d_results.to_csv(path_folder_save_results + '/' + name + 'results.csv',
                     index=False)
def experiment_more_file(path_files, path_folder_save_results, fold, options,
                         classifier, random, name, voting=False):
    """Cross-validate one classifier on every CSV file found in a folder.

    For each file whose name contains ``.csv``: the matching indicator table
    (``<path_files>/indicator/<first character of the file name>_indicator.csv``)
    is read, the classifier is cross-validated with the last attribute as the
    class, the prediction output is written to
    ``<path_folder_save_results>/<name>/prediction/pred_data.csv``
    (overwritten for each file), read back and handed to ``create_prediction``.
    A summary with one row per file is written to
    ``<path_folder_save_results>/<name>.csv``.

    :param path_files: folder containing the input files
    :param path_folder_save_results: folder that receives the outputs
    :param fold: number of cross-validation folds
    :param options: classifier options as one command-line style string
    :param classifier: Weka class name of the classifier
    :param random: seed handed to ``Random``
    :param name: name (string) of the result sub-folder and summary file
    :param voting: not used by this function; kept for caller compatibility
    """
    cls = Classifier(classname=classifier,
                     options=weka.core.classes.split_options(options))

    # Build a new list rather than calling remove() on the list being
    # iterated: removing during iteration skips the entry after each removed
    # one, so e.g. the "indicator" folder could survive the filter.
    file_list = [entry for entry in os.listdir(path_files) if ".csv" in entry]

    d_results = {
        'name_file': [],
        'percent_correct': [],
        'percent_incorrect': [],
        'precision': [],
        'recall': [],
        'f-score': [],
        'confusion_matrix': []
    }
    prediction_dir = path_folder_save_results + '/' + name + '/' + 'prediction'
    prediction_file = prediction_dir + '/pred_data.csv'

    for file_name in file_list:
        indicator_table = pd.read_csv(path_files + '/indicator/' +
                                      file_name[0] + '_indicator.csv')
        indicator = list(indicator_table['indicator'])
        images = list(indicator_table['image'])

        data = converters.load_any_file(path_files + "/" + file_name)
        data.class_is_last()
        pout = PredictionOutput(
            classname="weka.classifiers.evaluation.output.prediction.CSV")
        evl = Evaluation(data)
        evl.crossvalidate_model(cls, data, fold, Random(random), pout)

        d_results['name_file'].append(str(file_name))
        d_results['percent_correct'].append(evl.percent_correct)
        d_results['percent_incorrect'].append(evl.percent_incorrect)
        d_results['precision'].append(evl.precision(1))
        d_results['recall'].append(evl.recall(1))
        d_results['f-score'].append(evl.f_measure(1))
        d_results['confusion_matrix'].append(evl.matrix())

        save = pout.buffer_content()
        check_folder_or_create(prediction_dir)
        with open(prediction_file, 'w') as f:
            f.write(save)

        # Read the predictions back by column name and forward them.
        buffer_save = pd.read_csv(prediction_file, index_col=False)
        col_label = buffer_save['actual']
        col_prediction = buffer_save['predicted']
        col_different = buffer_save['error']
        # file_name[:-4] strips the trailing four characters (".csv").
        create_prediction(col_label, col_prediction, col_different, indicator,
                          images, file_name[:-4], prediction_dir + '/')

    d_results = pd.DataFrame(data=d_results)
    d_results.to_csv(path_folder_save_results + '/' + str(name) + ".csv")
def experiment_sequential_file(path_indices, path_features,
                               path_folder_save_results, options, classifier,
                               name):
    """Run a sequential (block-wise) hold-out experiment on one dataset.

    The index array loaded from ``path_indices`` (shifted by one to 1-based
    row numbers, with the dataset length appended) marks where each block
    starts. Every block is used once as the test set while the classifier is
    trained on all remaining rows. After each block the accumulated prediction
    output is written to
    ``<path_folder_save_results>//prediction/<name><block number>pred_data.csv``;
    the per-block metrics are written to
    ``<path_folder_save_results>/<name>results.csv``.

    NOTE(review): the source layout was flattened; the loop over the archive
    entries is assumed to contain only the ``ind`` assignment.

    :param path_indices: path of the index archive opened with ``load``
    :param path_features: dataset path, loaded with ``converters.load_any_file``
    :param path_folder_save_results: folder that receives the outputs
    :param options: classifier options as one command-line style string
    :param classifier: Weka class name of the classifier
    :param name: prefix (string) of the output file names
    """
    ind_f = load(path_indices)
    for item in ind_f.files:
        # the array of the last entry wins
        ind = ind_f[item] + 1

    cls = Classifier(classname=classifier,
                     options=weka.core.classes.split_options(options))
    data = converters.load_any_file(path_features)
    ind = np.append(ind, len(data))
    data.class_is_last()

    # one shared sink: predictions of all blocks accumulate in it
    pout = PredictionOutput(
        classname="weka.classifiers.evaluation.output.prediction.CSV")
    d_results = {
        'index': [],
        'percent_correct': [],
        'percent_incorrect': [],
        'confusion_matrix': []
    }

    last_block = len(ind) - 2
    for j in range(len(ind) - 1):
        first = ind[j]
        # ind[j + 1] is the FIRST row of the next block, so a block ends one
        # row earlier -- except the final block, whose upper bound is the
        # appended dataset length. Using ind[j + 1] itself as the end put
        # that boundary row into two consecutive test sets and left it out of
        # the first block's training data.
        last = ind[j + 1] if j == last_block else ind[j + 1] - 1

        print(j)
        print(str(first) + '-' + str(last))
        d_test = data.subset(row_range=str(first) + '-' + str(last))

        if j == 0:  # first block: train on everything after it
            train_range = str(last + 1) + '-' + str(ind[-1])
        elif j == last_block:  # last block: train on everything before it
            train_range = '1-' + str(first - 1)
        else:  # central block: train on both sides
            train_range = ('1-' + str(first - 1) + ',' + str(last + 1) + '-' +
                           str(ind[-1]))
        d_train = data.subset(row_range=train_range)

        cls.build_classifier(d_train)
        evl = Evaluation(data)
        evl.test_model(cls, d_test, pout)

        save = pout.buffer_content()
        with open(
                path_folder_save_results + '/' + '/prediction/' + name +
                str(j) + 'pred_data.csv', 'w') as f:
            f.write(save)

        d_results['index'].append(str(ind[j]))
        d_results['percent_correct'].append(evl.percent_correct)
        d_results['percent_incorrect'].append(evl.percent_incorrect)
        d_results['confusion_matrix'].append(evl.matrix())

    d_results = pd.DataFrame(data=d_results)
    d_results.to_csv(path_folder_save_results + '/' + name + 'results.csv',
                     index=False)
def _mlp_options(learning_rate="0.3", momentum="0.2",
                 validation_threshold="20", hidden_layers="1"):
    """Return the MultilayerPerceptron option list for one configuration."""
    return [
        "-L", learning_rate, "-M", momentum, "-N", "500", "-V", "0", "-S",
        "0", "-E", validation_threshold, "-H", hidden_layers
    ]


def run_multilayerPercepton(file, file2=None):
    """Train and evaluate 21 MultilayerPerceptron configurations.

    Each configuration varies one of the ``-H``, ``-L``, ``-M`` or ``-E``
    options around the baseline ``-L 0.3 -M 0.2 -N 500 -V 0 -S 0 -E 20 -H 1``.
    Every model is built on the data in ``file`` and evaluated on the data in
    ``file2``; the first attribute is the class in both. For each
    configuration the evaluation and prediction reports are written to
    ``<folder of file>/Results/MP-ALL_N-500_results/<config>_results/``.

    Files whose name does not end in ``.arff`` are reported and skipped.

    :param file: pathlib path of the training ARFF file
    :param file2: path of the test ARFF file. The default is kept for
        signature compatibility, but a test file is required.
    :raises ValueError: if ``file2`` is not given
    """
    # Get filename from Pathlib object
    filename = file.parts[-1]
    base_dir = file.parents[0]

    print("Running Multilayer Percepton on %s" % filename)

    if not filename.endswith(".arff"):
        print("%s not ARFF file." % filename)
        return

    # Every iteration evaluates on the test set. Without one, the run used to
    # die with a NameError only after the first model had been trained, so
    # fail immediately with a clear message instead.
    if not file2:
        raise ValueError(
            "run_multilayerPercepton needs a test file (file2) to evaluate on")

    print("loading data...")
    # Load data with class as first attr
    data = load_Arff_file(file)
    data.class_is_first()

    print("Loading test...")
    test = load_Arff_file(file2)
    test.class_is_first()

    # (result name, options) pairs; each varies one option from the baseline.
    configurations = [
        ("MP_N-500_default_H-1", _mlp_options()),  # DEFAULT
        ("MP_N-500_H-3", _mlp_options(hidden_layers="3")),  # -H START
        ("MP_N-500_H-5", _mlp_options(hidden_layers="5")),
        ("MP_N-500_H-7", _mlp_options(hidden_layers="7")),
        ("MP_N-500_H-3-5", _mlp_options(hidden_layers="3, 5")),
        ("MP_N-500_H-5-3", _mlp_options(hidden_layers="5, 3")),
        ("MP_N-500_H-3-5-7", _mlp_options(hidden_layers="3, 5, 7")),
        ("MP_N-500_H-7-3-5", _mlp_options(hidden_layers="7, 3, 5")),
        ("MP_N-500_H-5-7-3", _mlp_options(hidden_layers="5, 7, 3")),  # -H END
        ("MP_N-500_L-01", _mlp_options(learning_rate="0.1")),  # -L START
        ("MP_N-500_L-02", _mlp_options(learning_rate="0.2")),
        ("MP_N-500_L-04", _mlp_options(learning_rate="0.4")),
        ("MP_N-500_L-05", _mlp_options(learning_rate="0.5")),  # -L END
        ("MP_N-500_M-01", _mlp_options(momentum="0.1")),  # -M START
        ("MP_N-500_M-03", _mlp_options(momentum="0.3")),
        ("MP_N-500_M-04", _mlp_options(momentum="0.4")),
        ("MP_N-500_M-05", _mlp_options(momentum="0.5")),  # -M END
        ("MP_N-500_E-5", _mlp_options(validation_threshold="5")),  # -E START
        ("MP_N-500_E-10", _mlp_options(validation_threshold="10")),
        ("MP_N-500_E-15", _mlp_options(validation_threshold="15")),
        ("MP_N-500_E-25", _mlp_options(validation_threshold="25")),  # -E END
    ]

    for i, (config_name, config_options) in enumerate(configurations):
        start = time.time()
        print("Beginning iteration " + str(i) + ": " + config_name)

        # Use MultilayerPercepton and set options
        cls = Classifier(
            classname="weka.classifiers.functions.MultilayerPerceptron",
            options=config_options)

        # Build classifier with train data
        cls.build_classifier(data)

        # Predictions stored in pout
        pout = PredictionOutput(
            classname="weka.classifiers.evaluation.output.prediction.PlainText")

        # Evaluate on the test data
        evaluation = Evaluation(data)
        evaluation.test_model(cls, test, output=pout)

        print(evaluation.summary())
        print(evaluation.class_details())
        print(evaluation.confusion_matrix)

        # mk dirs for output
        tempdir = (base_dir / "Results/" / "MP-ALL_N-500_results/" /
                   (config_name + "_results/"))
        tempdir.mkdir(parents=True, exist_ok=True)

        # Save summary, class details and confusion matrix to file
        result_output = config_name + "_results.txt"
        print(tempdir)
        print(result_output)
        print((tempdir / result_output).absolute())
        output_eval(evaluation, tempdir / result_output)

        # Save the predicted results to file
        prediction_output = config_name + "_prediction.txt"
        output_pred(pout, tempdir / prediction_output)

        end = time.time()
        timetaken = round(end - start, 2)
        print("Time taken to run iteration " + str(i) + ": %s seconds" %
              (timetaken))

    print("Multilayer Percepton complete")
def main():
    """
    Run the classifier example walkthrough.

    The function takes no arguments and returns nothing; everything it
    produces is printed or plotted. In order, it:

    * loads ``iris.arff`` from ``helper.get_data_dir()`` and shows several
      ways of instantiating classifiers (help string, partial classname,
      command-line string, kernel classifier),
    * trains and evaluates J48 on iris (full data, 66% split, incremental
      NaiveBayesUpdateable),
    * builds meta classifiers (FilteredClassifier, Vote),
    * cross-validates NaiveBayes and RandomForest on ``diabetes.arff`` and
      prints the evaluation statistics plus ROC/PRC plots,
    * trains and cross-validates LinearRegression and SMOreg on
      ``bolts.arff`` and plots the classifier errors,
    * plots a learning curve on the diabetes data (this call uses
      ``wait=True``),
    * accesses JRip's rule set through its Java API on ``labor.arff``.
    """

    # load a dataset
    iris_file = helper.get_data_dir() + os.sep + "iris.arff"
    helper.print_info("Loading dataset: " + iris_file)
    loader = Loader("weka.core.converters.ArffLoader")
    iris_data = loader.load_file(iris_file)
    iris_data.class_is_last()

    # classifier help
    helper.print_title("Creating help string")
    classifier = Classifier(classname="weka.classifiers.trees.J48")
    print(classifier.to_help())

    # partial classname
    helper.print_title("Creating classifier from partial classname")
    clsname = ".J48"
    classifier = Classifier(classname=clsname)
    print(clsname + " --> " + classifier.classname)

    # classifier from commandline
    helper.print_title("Creating SMO from command-line string")
    cmdline = 'weka.classifiers.functions.SMO -K "weka.classifiers.functions.supportVector.NormalizedPolyKernel -E 3.0"'
    classifier = from_commandline(cmdline,
                                  classname="weka.classifiers.Classifier")
    classifier.build_classifier(iris_data)
    print("input: " + cmdline)
    print("output: " + classifier.to_commandline())
    print("model:\n" + str(classifier))

    # kernel classifier
    helper.print_title("Creating SMO as KernelClassifier")
    kernel = Kernel(
        classname="weka.classifiers.functions.supportVector.RBFKernel",
        options=["-G", "0.001"])
    classifier = KernelClassifier(classname="weka.classifiers.functions.SMO",
                                  options=["-M"])
    classifier.kernel = kernel
    classifier.build_classifier(iris_data)
    print("classifier: " + classifier.to_commandline())
    print("model:\n" + str(classifier))

    # build a classifier and output model
    helper.print_title("Training J48 classifier on iris")
    classifier = Classifier(classname="weka.classifiers.trees.J48")
    # Instead of using 'options=["-C", "0.3"]' in the constructor, we can also set the "confidenceFactor"
    # property of the J48 classifier itself. However, being of type float rather than double, we need
    # to convert it to the correct type first using the double_to_float function:
    classifier.set_property("confidenceFactor", types.double_to_float(0.3))
    classifier.build_classifier(iris_data)
    print(classifier)
    print(classifier.graph)
    print(classifier.to_source("MyJ48"))
    plot_graph.plot_dot_graph(classifier.graph)

    # evaluate model on test set
    helper.print_title("Evaluating J48 classifier on iris")
    evaluation = Evaluation(iris_data)
    evl = evaluation.test_model(classifier, iris_data)
    print(evl)
    print(evaluation.summary())

    # evaluate model on train/test split
    helper.print_title("Evaluating J48 classifier on iris (random split 66%)")
    classifier = Classifier(classname="weka.classifiers.trees.J48",
                            options=["-C", "0.3"])
    evaluation = Evaluation(iris_data)
    evaluation.evaluate_train_test_split(classifier, iris_data, 66.0,
                                         Random(1))
    print(evaluation.summary())

    # load a dataset incrementally and build classifier incrementally
    helper.print_title("Build classifier incrementally on iris")
    helper.print_info("Loading dataset: " + iris_file)
    loader = Loader("weka.core.converters.ArffLoader")
    iris_inc = loader.load_file(iris_file, incremental=True)
    iris_inc.class_is_last()
    classifier = Classifier(
        classname="weka.classifiers.bayes.NaiveBayesUpdateable")
    classifier.build_classifier(iris_inc)
    # the loader itself is iterated to feed the instances one at a time
    for inst in loader:
        classifier.update_classifier(inst)
    print(classifier)

    # construct meta-classifiers
    helper.print_title("Meta classifiers")
    # generic FilteredClassifier instantiation
    print("generic FilteredClassifier instantiation")
    meta = SingleClassifierEnhancer(
        classname="weka.classifiers.meta.FilteredClassifier")
    meta.classifier = Classifier(
        classname="weka.classifiers.functions.LinearRegression")
    flter = Filter("weka.filters.unsupervised.attribute.Remove")
    flter.options = ["-R", "first"]
    meta.set_property("filter", flter.jobject)
    print(meta.to_commandline())
    # direct FilteredClassifier instantiation
    print("direct FilteredClassifier instantiation")
    meta = FilteredClassifier()
    meta.classifier = Classifier(
        classname="weka.classifiers.functions.LinearRegression")
    flter = Filter("weka.filters.unsupervised.attribute.Remove")
    flter.options = ["-R", "first"]
    meta.filter = flter
    print(meta.to_commandline())
    # generic Vote
    print("generic Vote instantiation")
    meta = MultipleClassifiersCombiner(classname="weka.classifiers.meta.Vote")
    classifiers = [
        Classifier(classname="weka.classifiers.functions.SMO"),
        Classifier(classname="weka.classifiers.trees.J48")
    ]
    meta.classifiers = classifiers
    print(meta.to_commandline())

    # cross-validate nominal classifier
    helper.print_title("Cross-validating NaiveBayes on diabetes")
    diabetes_file = helper.get_data_dir() + os.sep + "diabetes.arff"
    helper.print_info("Loading dataset: " + diabetes_file)
    loader = Loader("weka.core.converters.ArffLoader")
    diabetes_data = loader.load_file(diabetes_file)
    diabetes_data.class_is_last()
    classifier = Classifier(classname="weka.classifiers.bayes.NaiveBayes")
    pred_output = PredictionOutput(
        classname="weka.classifiers.evaluation.output.prediction.PlainText",
        options=["-distribution"])
    evaluation = Evaluation(diabetes_data)
    evaluation.crossvalidate_model(classifier,
                                   diabetes_data,
                                   10,
                                   Random(42),
                                   output=pred_output)
    print(evaluation.summary())
    print(evaluation.class_details())
    print(evaluation.matrix())
    print("areaUnderPRC/0: " + str(evaluation.area_under_prc(0)))
    print("weightedAreaUnderPRC: " + str(evaluation.weighted_area_under_prc))
    print("areaUnderROC/1: " + str(evaluation.area_under_roc(1)))
    print("weightedAreaUnderROC: " + str(evaluation.weighted_area_under_roc))
    print("avgCost: " + str(evaluation.avg_cost))
    print("totalCost: " + str(evaluation.total_cost))
    print("confusionMatrix: " + str(evaluation.confusion_matrix))
    print("correct: " + str(evaluation.correct))
    print("pctCorrect: " + str(evaluation.percent_correct))
    print("incorrect: " + str(evaluation.incorrect))
    print("pctIncorrect: " + str(evaluation.percent_incorrect))
    print("unclassified: " + str(evaluation.unclassified))
    print("pctUnclassified: " + str(evaluation.percent_unclassified))
    print("coverageOfTestCasesByPredictedRegions: " +
          str(evaluation.coverage_of_test_cases_by_predicted_regions))
    print("sizeOfPredictedRegions: " +
          str(evaluation.size_of_predicted_regions))
    print("falseNegativeRate: " + str(evaluation.false_negative_rate(1)))
    print("weightedFalseNegativeRate: " +
          str(evaluation.weighted_false_negative_rate))
    print("numFalseNegatives: " + str(evaluation.num_false_negatives(1)))
    print("trueNegativeRate: " + str(evaluation.true_negative_rate(1)))
    print("weightedTrueNegativeRate: " +
          str(evaluation.weighted_true_negative_rate))
    print("numTrueNegatives: " + str(evaluation.num_true_negatives(1)))
    print("falsePositiveRate: " + str(evaluation.false_positive_rate(1)))
    print("weightedFalsePositiveRate: " +
          str(evaluation.weighted_false_positive_rate))
    print("numFalsePositives: " + str(evaluation.num_false_positives(1)))
    print("truePositiveRate: " + str(evaluation.true_positive_rate(1)))
    print("weightedTruePositiveRate: " +
          str(evaluation.weighted_true_positive_rate))
    print("numTruePositives: " + str(evaluation.num_true_positives(1)))
    print("fMeasure: " + str(evaluation.f_measure(1)))
    print("weightedFMeasure: " + str(evaluation.weighted_f_measure))
    print("unweightedMacroFmeasure: " +
          str(evaluation.unweighted_macro_f_measure))
    print("unweightedMicroFmeasure: " +
          str(evaluation.unweighted_micro_f_measure))
    print("precision: " + str(evaluation.precision(1)))
    print("weightedPrecision: " + str(evaluation.weighted_precision))
    print("recall: " + str(evaluation.recall(1)))
    print("weightedRecall: " + str(evaluation.weighted_recall))
    print("kappa: " + str(evaluation.kappa))
    print("KBInformation: " + str(evaluation.kb_information))
    print("KBMeanInformation: " + str(evaluation.kb_mean_information))
    print("KBRelativeInformation: " + str(evaluation.kb_relative_information))
    print("SFEntropyGain: " + str(evaluation.sf_entropy_gain))
    print("SFMeanEntropyGain: " + str(evaluation.sf_mean_entropy_gain))
    print("SFMeanPriorEntropy: " + str(evaluation.sf_mean_prior_entropy))
    print("SFMeanSchemeEntropy: " + str(evaluation.sf_mean_scheme_entropy))
    print("matthewsCorrelationCoefficient: " +
          str(evaluation.matthews_correlation_coefficient(1)))
    print("weightedMatthewsCorrelation: " +
          str(evaluation.weighted_matthews_correlation))
    print("class priors: " + str(evaluation.class_priors))
    print("numInstances: " + str(evaluation.num_instances))
    print("meanAbsoluteError: " + str(evaluation.mean_absolute_error))
    print("meanPriorAbsoluteError: " +
          str(evaluation.mean_prior_absolute_error))
    print("relativeAbsoluteError: " + str(evaluation.relative_absolute_error))
    print("rootMeanSquaredError: " + str(evaluation.root_mean_squared_error))
    print("rootMeanPriorSquaredError: " +
          str(evaluation.root_mean_prior_squared_error))
    print("rootRelativeSquaredError: " +
          str(evaluation.root_relative_squared_error))
    print("prediction output:\n" + str(pred_output))
    plot_cls.plot_roc(evaluation,
                      title="ROC diabetes",
                      class_index=range(
                          0, diabetes_data.class_attribute.num_values),
                      wait=False)
    plot_cls.plot_prc(evaluation,
                      title="PRC diabetes",
                      class_index=range(
                          0, diabetes_data.class_attribute.num_values),
                      wait=False)

    # train 2nd classifier on diabetes dataset
    classifier2 = Classifier(classname="weka.classifiers.trees.RandomForest")
    evaluation2 = Evaluation(diabetes_data)
    evaluation2.crossvalidate_model(classifier2, diabetes_data, 10,
                                    Random(42))
    plot_cls.plot_rocs({
        "NB": evaluation,
        "RF": evaluation2
    },
                       title="ROC diabetes",
                       class_index=0,
                       wait=False)
    plot_cls.plot_prcs({
        "NB": evaluation,
        "RF": evaluation2
    },
                       title="PRC diabetes",
                       class_index=0,
                       wait=False)

    # load a numeric dataset
    bolts_file = helper.get_data_dir() + os.sep + "bolts.arff"
    helper.print_info("Loading dataset: " + bolts_file)
    loader = Loader("weka.core.converters.ArffLoader")
    bolts_data = loader.load_file(bolts_file)
    bolts_data.class_is_last()

    # build a classifier and output model
    helper.print_title("Training LinearRegression on bolts")
    classifier = Classifier(
        classname="weka.classifiers.functions.LinearRegression",
        options=["-S", "1", "-C"])
    classifier.build_classifier(bolts_data)
    print(classifier)

    # cross-validate numeric classifier
    helper.print_title("Cross-validating LinearRegression on bolts")
    classifier = Classifier(
        classname="weka.classifiers.functions.LinearRegression",
        options=["-S", "1", "-C"])
    evaluation = Evaluation(bolts_data)
    evaluation.crossvalidate_model(classifier, bolts_data, 10, Random(42))
    print(evaluation.summary())
    print("correlationCoefficient: " +
          str(evaluation.correlation_coefficient))
    print("errorRate: " + str(evaluation.error_rate))
    helper.print_title("Header - bolts")
    print(str(evaluation.header))
    helper.print_title("Predictions on bolts")
    for index, pred in enumerate(evaluation.predictions):
        print(
            str(index + 1) + ": " + str(pred) + " -> error=" +
            str(pred.error))
    plot_cls.plot_classifier_errors(evaluation.predictions, wait=False)

    # train 2nd classifier and show errors in same plot
    classifier2 = Classifier(classname="weka.classifiers.functions.SMOreg")
    evaluation2 = Evaluation(bolts_data)
    evaluation2.crossvalidate_model(classifier2, bolts_data, 10, Random(42))
    plot_cls.plot_classifier_errors(
        {
            "LR": evaluation.predictions,
            "SMOreg": evaluation2.predictions
        },
        wait=False)

    # learning curve
    cls = [
        Classifier(classname="weka.classifiers.trees.J48"),
        Classifier(classname="weka.classifiers.bayes.NaiveBayesUpdateable")
    ]
    plot_cls.plot_learning_curve(cls,
                                 diabetes_data,
                                 increments=0.05,
                                 label_template="[#] !",
                                 metric="percent_correct",
                                 wait=True)

    # access classifier's Java API
    labor_file = helper.get_data_dir() + os.sep + "labor.arff"
    helper.print_info("Loading dataset: " + labor_file)
    loader = Loader("weka.core.converters.ArffLoader")
    labor_data = loader.load_file(labor_file)
    labor_data.class_is_last()

    helper.print_title("Using JRip's Java API to access rules")
    jrip = Classifier(classname="weka.classifiers.rules.JRip")
    jrip.build_classifier(labor_data)
    rset = jrip.jwrapper.getRuleset()
    # range (not xrange) so the loop also runs on Python 3
    for i in range(rset.size()):
        r = rset.get(i)
        print(str(r.toString(labor_data.class_attribute.jobject)))
def handle_message(message):
    """
    Buffer one sensor message and classify the gesture once enough data exists.

    ``message`` is indexed by ``'sensorName'`` and, for accelerometer and
    gyroscope messages, by ``'x'``, ``'y'`` and ``'z'`` (each converted with
    ``float``). A ``"stop"`` sensor name calls ``stop()``.

    Once both the gyroscope and accelerometer buffers hold at least 25
    samples, the buffered data is written via ``processDataToArff``, an LMT
    classifier is trained on ``train.arff`` and applied to ``classify.arff``,
    the matching chord function is played, and all six module-level buffers
    are reset to empty lists.
    """
    global accel_x
    global accel_y
    global accel_z
    global gyro_x
    global gyro_y
    global gyro_z

    sensor_name = message['sensorName']
    if sensor_name == 'accelerometer':
        accel_x.append(float(message['x']))
        accel_y.append(float(message['y']))
        accel_z.append(float(message['z']))
    elif sensor_name == 'gyroscope':
        gyro_x.append(float(message['x']))
        gyro_y.append(float(message['y']))
        gyro_z.append(float(message['z']))
    elif sensor_name == "stop":  # stop signal
        stop()

    # only classify when both gyroscope and accelerometer data have at
    # least 25 samples
    if len(gyro_x) < 25 or len(accel_x) < 25:
        return

    processDataToArff(accel_x, accel_y, accel_z, gyro_x, gyro_y, gyro_z)

    jvm.start()
    loader = Loader(classname="weka.core.converters.ArffLoader")

    # load the training data
    train = loader.load_file("train.arff")
    train.class_is_last()

    # train the classifier
    cls = Classifier(classname="weka.classifiers.trees.LMT")
    cls.build_classifier(train)

    pout = PredictionOutput(
        classname="weka.classifiers.evaluation.output.prediction.PlainText")
    evl = Evaluation(train)

    # load the classify data
    test = loader.load_file("classify.arff")
    test.class_is_last()
    evl.test_model(cls, test, pout)

    # Labels are tested in this fixed order; the first one found in a line
    # wins for that line.
    gesture_labels = ("upDown", "leftRight", "inOut", "rotation")
    gesture = None
    # NOTE(review): the last prediction line decides the gesture (a line
    # without a known label resets it) — confirm this is intended when
    # classify.arff holds more than one instance.
    for line in pout.buffer_content().splitlines():
        gesture = next(
            (label for label in gesture_labels if label in line), None)

    chord_players = {
        "upDown": playD,
        "leftRight": playBm,
        "inOut": playA,
        "rotation": playG,
    }
    play_chord = chord_players.get(gesture)
    if play_chord is not None:
        # silence whatever is currently playing before starting the chord
        stop()
        play_chord()

    # clear the arrays for new data
    gyro_x = []
    gyro_y = []
    gyro_z = []
    accel_x = []
    accel_y = []
    accel_z = []
processDataToArff("train.arff", False) processDataToArff("test.arff", True) # setup training model loader = Loader(classname="weka.core.converters.ArffLoader") train = loader.load_file("train.arff") train.class_is_last() test = loader.load_file("test.arff") test.class_is_last() # print(train) cls = Classifier( classname="weka.classifiers.trees.LMT") #use LMT as our algorithm cls.build_classifier(train) #train the model using train.arff pout = PredictionOutput( classname="weka.classifiers.evaluation.output.prediction.PlainText") evl = Evaluation(train) evl.test_model(cls, test, pout) # print the result result = pout.buffer_content() #print(result) # split the result and only print the gesture resultLines = result.splitlines() for i in range(len(resultLines)): if (resultLines[i].find("upDown") != -1): print("%d upDown" % (i + 1)) elif (resultLines[i].find("leftRight") != -1): print("%d leftRight" % (i + 1)) elif (resultLines[i].find("inOut") != -1):
# Join the resampled features and labels on their indexes and show the result.
sampled = pd.merge_asof(X_resampled,y_resampled,right_index=True,left_index=True)
print(sampled)

# Disabled: writing the resampled frame to a CSV on Google Drive.
#corpus_name = 'data/bugs/resampled.csv'
#corpus = os.path.join("/content/gdrive/My Drive", corpus_name)
#sampled.to_csv(corpus,index = False)

# Load the CSV at `corpus` into Weka and use the last attribute as the class.
# NOTE(review): `corpus` is only assigned in the commented-out lines above;
# it must already be defined elsewhere — confirm.
loader = Loader(classname="weka.core.converters.CSVLoader")
data = loader.load_file(corpus)
data.class_is_last()

"""Naive Bayes Classifier for Bug Prediction"""

# 10-fold cross-validation of NaiveBayes (seed 42), collecting the
# predictions in pred_output, then ROC and PRC plots over every class index.
classifier = Classifier(classname="weka.classifiers.bayes.NaiveBayes")
pred_output = PredictionOutput(classname="weka.classifiers.evaluation.output.prediction.PlainText", options=["-distribution"])
evaluation = Evaluation(data)
evaluation.crossvalidate_model(classifier, data, 10, Random(42), output=pred_output)
plot_cls.plot_roc(evaluation, title="ROC bugs",class_index=range(0, data.class_attribute.num_values), wait=False)
plot_cls.plot_prc(evaluation, title="PRC bugs - NaiveBayes",class_index=range(0, data.class_attribute.num_values), wait=False)

"""Performance Metrics - Naive Bayes Classifier"""

# Summary statistics, then f-measure / precision / recall for class index 1.
print(evaluation.summary())
print(evaluation.class_details())
print(evaluation.matrix())
print("confusionMatrix: " + str(evaluation.confusion_matrix))
print("fMeasure: " + str(evaluation.f_measure(1)))
print("precision: " + str(evaluation.precision(1)))
print("recall: " + str(evaluation.recall(1)))
# load diabetes loader = Loader(classname="weka.core.converters.ArffLoader") fname = data_dir + os.sep + "diabetes.arff" print("\nLoading dataset: " + fname + "\n") data = loader.load_file(fname) # we'll set the class attribute after filtering # apply NominalToBinary filter and set class attribute fltr = Filter("weka.filters.unsupervised.attribute.NominalToBinary") fltr.inputformat(data) filtered = fltr.filter(data) filtered.class_is_last() # cross-validate LinearRegression on filtered data, display model cls = Classifier(classname="weka.classifiers.functions.LinearRegression") pout = PredictionOutput( classname="weka.classifiers.evaluation.output.prediction.PlainText") evl = Evaluation(filtered) evl.crossvalidate_model(cls, filtered, 10, Random(1), pout) print("10-fold cross-validation:\n" + evl.summary()) print("Predictions:\n\n" + str(pout)) cls.build_classifier(filtered) print("Model:\n\n" + str(cls)) # use AddClassification filter with LinearRegression on filtered data print("Applying AddClassification to filtered data:\n") fltr = Filter(classname="weka.filters.supervised.attribute.AddClassification", options=[ "-W", "weka.classifiers.functions.LinearRegression", "-classification" ]) fltr.inputformat(filtered)
# Print the summary, model and tree graph of the evaluation/classifier that
# were set up before this point.
print(evl.to_summary())
print(str(cls))
plg.plot_dot_graph(cls.graph())

# 2. filtered classifier
# Load the test set and use its last attribute as the class.
fname = data_dir + os.sep + "simpletext-test.arff"
print("\nLoading dataset: " + fname + "\n")
loader = Loader(classname="weka.core.converters.ArffLoader")
test = loader.load_file(fname)
test.set_class_index(test.num_attributes() - 1)
print("Building/evaluating filtered classifier...")
# J48 wrapped in a FilteredClassifier with a StringToWordVector filter,
# trained on the previously loaded `data`.
cls = FilteredClassifier()
cls.set_classifier(Classifier(classname="weka.classifiers.trees.J48"))
cls.set_filter(Filter(classname="weka.filters.unsupervised.attribute.StringToWordVector"))
cls.build_classifier(data)
# Evaluate on the test set, collecting the predictions in pout.
pout = PredictionOutput(classname="weka.classifiers.evaluation.output.prediction.PlainText")
pout.set_header(test)
evl = Evaluation(data)
evl.test_model(cls, test, pout)
print(str(pout))
print(str(cls))

# load ReutersCorn-train
# (rebinds `data`; the last attribute becomes the class)
fname = data_dir + os.sep + "ReutersCorn-train.arff"
print("\nLoading dataset: " + fname + "\n")
loader = Loader(classname="weka.core.converters.ArffLoader")
data = loader.load_file(fname)
data.set_class_index(data.num_attributes() - 1)

# load ReutersCorn-test
fname = data_dir + os.sep + "ReutersCorn-test.arff"
f.close()


# In[4]:

# Show the ARFF file; the with-block closes the handle even if reading or
# printing raises.
with open("instances.arff", "r") as f:
    print(f.read())


# In[10]:

from io import StringIO

# Run the classifier `csr` on instances.arff, collecting the predictions in
# CSV form.
output_results = PredictionOutput(classname="weka.classifiers.evaluation.output.prediction.CSV")
data1 = loader.load_file("instances.arff")
data1.class_is_last()
ev2 = Evaluation(data1)
ev2.test_model(csr, data1, output_results)

# The predicted class is picked out of the buffer by fixed character offsets.
# NOTE(review): the [-13:-10] slice presumably relies on a fixed row layout
# and label width — confirm against the actual CSV output.
print("Class prediction: ", output_results.buffer_content()[-13:-10])
print("\n\n Instance", " Actual", " Predicted")
print(output_results.buffer_content())

# Prefix a header so pandas can parse the buffered predictions.
TESTDATA = StringIO("Instance,Actual,Predicted," + output_results.buffer_content())
# jvm.stop()
x = pd.read_csv(TESTDATA)


# In[14]: