def test_model(self, test_data, empty_solution, evaluate=False):
    model_weka = None
    if os.path.isfile(self.prediction_file):
        print('Model ' + self.name + ' already tested.')
    elif not os.path.isfile(self.model_file):
        print('Impossible to test this model. It should be trained first.')
        return
    else:
        print('Starting to test model ' + self.name + '.')
        model_weka = Classifier(jobject=serialization.read(self.model_file))
        evaluation = Evaluation(data=test_data)
        evaluation.test_model(classifier=model_weka, data=test_data)
        predictions = evaluation.predictions()
        rows = read_sheet(file_name=empty_solution)
        solutions = []
        for row in rows:
            solution = [row['userid'], row['tweetid'], predictions.pop(0).predicted()]
            solutions.append(solution)
        write_the_solution_file(solutions, self.prediction_file)
        print('Model ' + self.name + ' tested.')
    if evaluate:
        if os.path.isfile(self.evaluation_file):
            print('Model ' + self.name + ' already evaluated.')
            return
        elif model_weka is None:
            model_weka = Classifier(jobject=serialization.read(self.model_file))
            evaluation = Evaluation(data=test_data)
            evaluation.test_model(classifier=model_weka, data=test_data)
        save_file(file_name=self.evaluation_file, content=evaluation.to_summary())
        print('Model ' + self.name + ' evaluated.')
def run_naive_bayes_split(self, output_directory):
    # build classifier
    print("\nBuilding Classifier on training data.")
    buildTimeStart = time.time()
    cls = Classifier(classname="weka.classifiers.bayes.NaiveBayes")
    cls.build_classifier(self.training_data)

    resultsString = ""
    resultsString = self.print_both(str(cls), resultsString)

    buildTimeString = "NB Split Classifier Built in " + str(time.time() - buildTimeStart) + " secs.\n"
    resultsString = self.print_both(buildTimeString, resultsString)

    # Evaluate Classifier
    resultsString = self.print_both("\nEvaluating on test data.", resultsString)
    buildTimeStart = time.time()
    evl = Evaluation(self.training_data)
    evl.test_model(cls, self.testing_data)

    resultsString = self.print_both(str(evl.summary()), resultsString)
    resultsString = self.print_both(str(evl.class_details()), resultsString)
    resultsString = self.print_both(str(evl.confusion_matrix), resultsString)

    buildTimeString = "\nNB Split Classifier Evaluated in " + str(time.time() - buildTimeStart) + " secs.\n"
    resultsString = self.print_both(buildTimeString, resultsString)

    # Save Results and Cleanup
    self.save_results("Naive_Bayes", resultsString, output_directory)
def testNB(training_data, testing_data):
    train_data = Instances.copy_instances(training_data)
    test_data = Instances.copy_instances(testing_data)

    evaluation = Evaluation(train_data)
    classifier = Classifier(classname="weka.classifiers.bayes.NaiveBayes")
    classifier.build_classifier(train_data)    # build classifier on the training data
    evaluation.test_model(classifier, test_data)  # test and evaluate model on the test set

    print("")
    print("")
    print(evaluation.summary("--------------Naive Bayes Evaluation--------------"))
    print("Accuracy: " + str(evaluation.percent_correct))
    print("")
    print("Label\tPrecision\t\tRecall\t\t\tF-Measure")
    print("<=50K\t" + str(evaluation.precision(0)) + "\t" + str(evaluation.recall(0))
          + "\t" + str(evaluation.f_measure(0)))
    print(">50K\t" + str(evaluation.precision(1)) + "\t" + str(evaluation.recall(1))
          + "\t" + str(evaluation.f_measure(1)))
    print("Mean\t" + str((evaluation.precision(1) + evaluation.precision(0)) / 2)
          + "\t" + str((evaluation.recall(1) + evaluation.recall(0)) / 2)
          + "\t" + str((evaluation.f_measure(1) + evaluation.f_measure(0)) / 2))
def main(args):
    """
    Loads a dataset, shuffles it, splits it into train/test set. Trains J48 with training set and
    evaluates the built model on the test set.
    :param args: the commandline arguments (optional, can be dataset filename)
    :type args: list
    """
    # load a dataset
    if len(args) <= 1:
        data_file = helper.get_data_dir() + os.sep + "vote.arff"
    else:
        data_file = args[1]
    helper.print_info("Loading dataset: " + data_file)
    loader = Loader(classname="weka.core.converters.ArffLoader")
    data = loader.load_file(data_file)
    data.class_is_last()

    # generate train/test split of randomized data
    train, test = data.train_test_split(66.0, Random(1))

    # build classifier
    cls = Classifier(classname="weka.classifiers.trees.J48")
    cls.build_classifier(train)
    print(cls)

    # evaluate
    evl = Evaluation(train)
    evl.test_model(cls, test)
    print(evl.summary())
def testDataEvaluate(self, testDataArffFileName):
    """
    Evaluation using test data
    :param testDataArffFileName: File name for testing ARFF
    :return: TRUE if evaluation was achievable
    """
    if self.classifierInstance is not None:
        print('[Using test data for evaluation]')
        try:
            testFileFullPath = dirconfig.arffPath + testDataArffFileName + '.arff'
            testData = self.loadArffData(testFileFullPath)
            if testData is not None:
                # Evaluate using test data
                evaluatorInstance = Evaluation(data=self.classificationData)
                evaluatorInstance.test_model(classifier=self.classifierInstance, data=testData)
                # Store evaluation results
                self.setEvaluationResults(evaluatorInstance)
                return True
        except Exception:
            return False
    return False
def predict_proba(self, X):
    evaluation = Evaluation(self.train_data)
    # Add class column (we can't copy X, because this is a large object,
    # so we add the column and remove it later)
    X['class'] = None
    filename = self.to_arff(X, True)
    # Remove class column
    del X['class']
    loader = Loader("weka.core.converters.ArffLoader")
    test_data = loader.load_file(filename)
    test_data.class_is_last()
    evaluation.test_model(self.classifier, test_data)
    probas = None
    # Return probabilities
    for pred in evaluation.predictions:
        if probas is None:
            probas = pred.distribution
        else:
            probas = np.vstack([probas, pred.distribution])
    return probas
def train_and_eval_weka_classifier(clf, train, valid, n_instances):
    total_train_inst = train.num_instances
    percentage = (n_instances * 100) / total_train_inst
    if percentage == 100:
        opt = train
    else:
        opt, residual = train.train_test_split(percentage, Random(1))
    print('total_train_inst: ', total_train_inst, '| percentage: ', percentage,
          '| used_inst: ', opt.num_instances)

    clf.build_classifier(opt)
    evl = Evaluation(opt)
    evl.test_model(clf, valid)
    # evl.crossvalidate_model(clf, opt, 10, Random(1))

    acc = evl.percent_correct
    auc = evl.weighted_area_under_roc
    err = evl.error_rate
    log = evl.sf_mean_scheme_entropy
    print("# validating | loss: {:.2}, accuracy: {:.4}, AUC: {:.2}, error: {:.2}"
          .format(log, acc, auc, err))
    return {'loss': log, 'accuracy': acc, 'auc': auc, 'err': err}
def test_classifier(dataset: Instances, classifier: Classifier, params: dict):
    vars = params.keys()
    vals = params.values()
    results = defaultdict(list)
    for val_combo in itertools.product(*vals):
        results["numInstances"].append(dataset.num_instances)
        results["numAttributes"].append(dataset.num_attributes)
        opts = dict(zip(vars, val_combo))
        for opt in opts:
            results[opt].append(opts[opt])
            classifier.set_property(
                opt,
                opts[opt] if not isinstance(opts[opt], float)
                else typeconv.double_to_float(opts[opt]))
        evl = Evaluation(dataset)
        classifier.build_classifier(dataset)
        evl.test_model(classifier, dataset)
        results["Training_Accuracy"].append(evl.percent_correct)
        results["size"].append(
            int(javabridge.call(classifier.jobject, "measureTreeSize", "()D")))
        evl.crossvalidate_model(classifier, dataset, 10, Random(1))
        results["CV_Accuracy"].append(evl.percent_correct)
    return results
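# Usage sketch for test_classifier above (not from the original source): a small
# grid over J48's "confidenceFactor". The dataset path and the option values are
# illustrative assumptions; any tree classifier exposing measureTreeSize works.
import weka.core.jvm as jvm
from weka.core.converters import Loader
from weka.classifiers import Classifier

jvm.start()
data = Loader(classname="weka.core.converters.ArffLoader").load_file("diabetes.arff")
data.class_is_last()
j48 = Classifier(classname="weka.classifiers.trees.J48")
grid = test_classifier(data, j48, {"confidenceFactor": [0.1, 0.25, 0.5]})
print(grid["Training_Accuracy"], grid["CV_Accuracy"])
jvm.stop()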
def evaluation(self, classifier, trainingData, testingData=None):
    trainingData.set_class_index(trainingData.num_attributes() - 1)
    if testingData is None:
        evaluation = Evaluation(trainingData)  # initialize with priors
        evaluation.crossvalidate_model(classifier, trainingData, 10, Random(42))  # 10-fold CV
        return evaluation
    else:
        print("testing data exists")
        if testingData.num_attributes() == trainingData.num_attributes():
            testingData.set_class_index(testingData.num_attributes() - 1)
            evaluation = Evaluation(trainingData)
            classifier.build_classifier(trainingData)
            evaluation.test_model(classifier, testingData)
            # for attribute in trainingData.attributes():
            #     print("train:" + str(attribute))
            # for attribute in testingData.attributes():
            #     print("test:" + str(attribute))
            return evaluation
        else:
            print("testing data doesn't have the same attributes as the training data")
            for attribute in trainingData.attributes():
                print("train:" + str(attribute))
            for attribute in testingData.attributes():
                print("test:" + str(attribute))
def score(self, testExamples, labels):
    f = open("testingweka.arff", "w")
    f.write("@relation randomset\n")
    for j in range(len(testExamples[0])):
        f.write("@attribute feature%d real\n" % j)
    f.write("@attribute class {TRUE, FALSE}\n")
    f.write("@data\n")
    for (example, label) in zip(testExamples, labels):
        for feature in example:
            f.write("%f," % feature)
        if label == 1:
            f.write("TRUE\n")
        else:
            f.write("FALSE\n")
    f.close()

    loader = Loader(classname="weka.core.converters.ArffLoader")
    # options=["-H", "-B", "10000"])
    self.testingData = loader.load_file("testingweka.arff")
    self.testingData.set_class_index(self.testingData.num_attributes() - 1)
    evaluation = Evaluation(self.trainingData)
    evaluation.test_model(self.classifier, self.testingData)
    # print(evaluation.percent_correct())
    # jvm.stop()
    return evaluation.percent_correct()
def calcError(self, newModel, test_data_of_kfold):
    '''Return the error from the model with test data from k fold cross validation'''
    evl = Evaluation(test_data_of_kfold)
    evl.test_model(newModel, test_data_of_kfold)
    return 100 - evl.percent_correct
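# A minimal sketch of driving calcError above from a manual k-fold loop; the J48
# learner, the fold count, and the "owner" object holding calcError are
# illustrative assumptions, not part of the original snippet.
from weka.classifiers import Classifier
from weka.core.classes import Random
from weka.core.dataset import Instances

def cv_error(owner, data, folds=10):
    data = Instances.copy_instances(data)
    data.randomize(Random(1))
    errors = []
    for i in range(folds):
        train = data.train_cv(folds, i)  # training portion of fold i
        test = data.test_cv(folds, i)    # held-out portion of fold i
        model = Classifier(classname="weka.classifiers.trees.J48")
        model.build_classifier(train)
        errors.append(owner.calcError(model, test))
    return sum(errors) / folds           # mean error over the k folds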
def crossTest(this, trainingFile, classifier, testFile):
    loader = Loader(classname="weka.core.converters.ArffLoader")
    data1 = loader.load_file(trainingFile)
    data1.class_is_last()
    cls = Classifier(classname=classifier)
    cls.build_classifier(data1)

    data2 = loader.load_file(testFile)
    data2.class_is_last()
    classes = [str(code) for code in data2.class_attribute.values]
    header = ["Accuracy"]
    for name in classes:
        header += [name + " TP", name + " FP", name + " AUC ROC"]

    values = []
    evl = Evaluation(data2)
    evl.test_model(cls, data2)
    values.append(evl.percent_correct)
    for name in classes:
        index = classes.index(name)
        values += [
            evl.true_positive_rate(index) * 100,
            evl.false_positive_rate(index) * 100,
            evl.area_under_roc(index)
        ]

    this.values = values
    this.header = header
def f_smote():
    jvm.start()
    train_data, test_data = b_i_impute_data()
    train_data = train_data[:10000]
    y_train = train_data["class"]
    x_train = train_data.drop("class", axis=1)

    sm = SMOTE(ratio="minority")
    x_train_sm, y_train_sm = sm.fit_sample(x_train, y_train)
    x_train_sm_df = pd.DataFrame(x_train_sm, columns=x_train.columns)
    y_train_sm_df = pd.DataFrame(y_train_sm, columns=["class"])
    train_data_sm_df = pd.concat([y_train_sm_df, x_train_sm_df], axis=1)
    print_f("smote train data shape", train_data_sm_df.shape)
    train_data_sm_df.to_csv("./train_data_sm.csv", index=False)

    train_data_sm = converters.load_any_file("train_data_sm.csv")
    train_data_sm.class_is_first()
    test_data = converters.load_any_file("test_data.csv")
    test_data.class_is_first()
    print_f("1")

    cls = Classifier(classname="weka.classifiers.trees.LMT")
    print_f("building classifier")
    cls.build_classifier(train_data_sm)
    print_f("Evaluating")
    evl = Evaluation(train_data_sm)
    evl.crossvalidate_model(cls, train_data_sm, 5, Random(1))
    print_f("Train Accuracy:", evl.percent_correct)
    print_f("Train summary")
    print_f(evl.summary())
    print_f("Train class details")
    print_f(evl.class_details())
    print_f("Train confusion matrix")
    print_f(evl.confusion_matrix)
    plcls.plot_roc(evl, class_index=[0, 1], wait=True,
                   outfile="./plots/2_f_smote_10k.png")
    plt.suptitle("Train ROC Curve", fontsize=20, y=0.95)

    evl = Evaluation(test_data)
    print_f("testing model")
    evl.test_model(cls, test_data)
    print_f("Test Accuracy:", evl.percent_correct)
    print_f("Test summary")
    print_f(evl.summary())
    print_f("Test class details")
    print_f(evl.class_details())
    print_f("Test confusion matrix")
    print_f(evl.confusion_matrix)
    plcls.plot_roc(evl, class_index=[0, 1], wait=True)
    plt.suptitle("Test ROC Curve", fontsize=20, y=0.95)
    savefig("./plots/f_test_roc_curve.png")
def ClassifyParam(mode, binWidths):
    if not os.path.exists("classificationResults"):
        os.makedirs("classificationResults")
    if "normal" in mode:
        file = open("classificationResults/AllVsAll.csv", "w")
        file.write("BinWidth, Accuracy\n")
        for binWidth in binWidths:
            train_set = "Data/arff/TrainSet_%s.arff" % binWidth
            test_set = "Data/arff/TestSet_%s.arff" % binWidth
            print("Loading Datasets...")
            train_data = converters.load_any_file(train_set)
            test_data = converters.load_any_file(test_set)
            # Set class attribute
            train_data.class_is_last()
            test_data.class_is_last()
            print("Dataset Loaded!")

            classifier_name = "weka.classifiers.meta.FilteredClassifier"
            classifier = Classifier(classname=classifier_name, options=[
                "-F", "weka.filters.unsupervised.attribute.StringToWordVector -R first-last -W 1000 -C -T -N 1 -stemmer weka.core.stemmers.NullStemmer -M 1 -tokenizer \"weka.core.tokenizers.WordTokenizer -delimiters \\\" \\\\r\\\\n\\\\t.,;:\\\\\\\'\\\\\\\"()?!\\\"\"",
                "-W", "weka.classifiers.bayes.NaiveBayesMultinomial"])

            start_train = time.time()
            classifier.build_classifier(train_data)
            end_train = time.time()
            print("Train\t%s\t%s" % (binWidth, end_train - start_train))

            # time classification of a single sample
            for index, inst in enumerate(test_data):
                if index == 0:
                    start_sample = time.time()
                    classifier.classify_instance(inst)
                    end_sample = time.time()
                    print("Sample\t%s\t%s" % (binWidth, end_sample - start_sample))

            print("Evaluating w/ Multinomial Naive Bayes classifier. BinWidth = %s" % binWidth)
            evaluation = Evaluation(test_data)
            start_batch = time.time()
            evaluation.test_model(classifier, test_data)
            end_batch = time.time()
            print("Batch\t%s\t%s" % (binWidth, end_batch - start_batch))
            print(evaluation.summary())
            acc = evaluation.percent_correct / 100.0
            print("Percent correct: " + str(acc))
            file.write("%s, %s\n" % (binWidth, acc))
        file.close()
def evaluate_classifier(cls, train_data, test_data):
    """
    Evaluation
    :param cls: trained classifier
    :param train_data: data to initialize priors with
    :param test_data: data to test the model on
    :return: evaluation object
    """
    evl = Evaluation(train_data)
    evl.test_model(cls, test_data)
    return evl
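# Hedged usage sketch for evaluate_classifier above; the ARFF paths and the
# NaiveBayes choice are assumptions for illustration (a running JVM is assumed).
from weka.core.converters import Loader
from weka.classifiers import Classifier

loader = Loader(classname="weka.core.converters.ArffLoader")
train_data = loader.load_file("train.arff")
train_data.class_is_last()
test_data = loader.load_file("test.arff")
test_data.class_is_last()
cls = Classifier(classname="weka.classifiers.bayes.NaiveBayes")
cls.build_classifier(train_data)
print(evaluate_classifier(cls, train_data, test_data).summary())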
def main(args):
    """
    Loads a dataset, shuffles it, splits it into train/test set. Trains J48 with training set and
    evaluates the built model on the test set.
    The predictions get recorded in two different ways:
    1. in-memory via the test_model method
    2. directly to file (more memory efficient), but a separate run of making predictions
    :param args: the commandline arguments (optional, can be dataset filename)
    :type args: list
    """
    # load a dataset
    if len(args) <= 1:
        data_file = helper.get_data_dir() + os.sep + "vote.arff"
    else:
        data_file = args[1]
    helper.print_info("Loading dataset: " + data_file)
    loader = Loader(classname="weka.core.converters.ArffLoader")
    data = loader.load_file(data_file)
    data.class_is_last()

    # generate train/test split of randomized data
    train, test = data.train_test_split(66.0, Random(1))

    # build classifier
    cls = Classifier(classname="weka.classifiers.trees.J48")
    cls.build_classifier(train)
    print(cls)

    # evaluate and record predictions in memory
    helper.print_title("recording predictions in-memory")
    output = PredictionOutput(
        classname="weka.classifiers.evaluation.output.prediction.CSV",
        options=["-distribution"])
    evl = Evaluation(train)
    evl.test_model(cls, test, output=output)
    print(evl.summary())
    helper.print_info("Predictions:")
    print(output.buffer_content())

    # record/output predictions separately
    helper.print_title("recording/outputting predictions separately")
    outputfile = helper.get_tmp_dir() + "/j48_vote.csv"
    output = PredictionOutput(
        classname="weka.classifiers.evaluation.output.prediction.CSV",
        options=["-distribution", "-suppress", "-file", outputfile])
    output.header = test
    output.print_all(cls, test)
    helper.print_info("Predictions stored in: " + outputfile)
    # by using "-suppress" we don't store the output in memory, the following statement won't output anything
    print(output.buffer_content())
def ClassifyWithDT(f3, test, tree, fileOut):
    # "evl" avoids shadowing the builtin eval
    evl = Evaluation(f3)
    tree.build_classifier(f3)
    evl.test_model(tree, test)
    fileOut.write("\n\nSelf-Training data========" + str((1 - evl.error_rate) * 100)
                  + " number of instances==" + str(f3.num_instances) + "\n")
    fileOut.write("\n Error Rate==" + str(evl.error_rate) + "\n")
    fileOut.write("\n precision recall areaUnderROC \n\n")
    for i in range(test.get_instance(0).num_classes):
        fileOut.write(str(evl.precision(i)) + " " + str(evl.recall(i))
                      + " " + str(evl.area_under_roc(i)) + "\n")
    return evl
def evaluate_classifier(cls, data, crossvalidate=False, n_folds=10):
    """
    Evaluation
    :param cls: trained classifier
    :param data: data to test the model on
    :param crossvalidate: True to use crossvalidation
    :param n_folds: number of folds to cross validate for
    :return: evaluation object
    """
    evl = Evaluation(data)
    if crossvalidate:
        evl.crossvalidate_model(cls, data, n_folds, Random(5))
    else:
        evl.test_model(cls, data)
    return evl
def build_and_classify(classifier, classifier_name, approach_name, infile, percentage='10'):
    """
    Creates model and classifies against input data. Returns accuracy statistics
    """
    # set seed so results are consistent
    random.seed('iot')

    # load data
    loader = Loader(classname='weka.core.converters.CSVLoader')
    data = loader.load_file(infile)
    data.class_is_last()

    # convert all numeric attributes to nominal
    to_nominal = Filter(classname='weka.filters.unsupervised.attribute.NumericToNominal',
                        options=['-R', 'first-last'])
    to_nominal.inputformat(data)
    data = to_nominal.filter(data)

    # randomize data with constant seed
    randomize = Filter(classname='weka.filters.unsupervised.instance.Randomize',
                       options=['-S', '42'])
    randomize.inputformat(data)
    data = randomize.filter(data)

    # create training set and testing set
    train_percent_filter = Filter(classname='weka.filters.unsupervised.instance.RemovePercentage',
                                  options=['-P', percentage, '-V'])
    train_percent_filter.inputformat(data)
    train = train_percent_filter.filter(data)
    test = data

    # build and test classifier
    classifier.build_classifier(train)
    evaluation = Evaluation(train)
    evaluation.test_model(classifier, test)

    # return results as array
    results = [
        approach_name,
        classifier_name,
        percentage,
        evaluation.percent_correct,
        evaluation.weighted_f_measure
    ]
    return results
def test_weka_classifier(clf, train, test):
    clf.build_classifier(train)
    evl = Evaluation(train)
    evl.test_model(clf, test)
    acc = evl.percent_correct
    auc = evl.weighted_area_under_roc
    err = evl.error_rate
    log = evl.sf_mean_scheme_entropy
    print("# testing | loss: {:.2}, accuracy: {:.4}, AUC: {:.2}, error: {:.2}"
          .format(log, acc, auc, err))
    return {'loss': log, 'accuracy': acc, 'auc': auc, 'err': err}
def case2():
    loader1 = Loader(classname="weka.core.converters.ArffLoader")
    test_file = input("Enter the name of the test file:")
    data1 = loader1.load_file(test_file)
    data1.class_is_last()
    evaluation = Evaluation(data1)
    # NOTE: "cls" is assumed to be a classifier built elsewhere in this script
    evaluation.test_model(cls, data1)
    print(evaluation.matrix("=== (confusion matrix) ==="))
def index():
    if request.method == "GET":
        return render_template('bot.html')
    if request.method == "POST":
        # jvm.stop()
        jvm.start()
        f = open("instances.arff", "a")
        args = request.form.to_dict()
        weight_lb = float(args['weight']) * 2.20462
        bmi = (weight_lb / pow(float(args['height']), 2)) * 703
        hypertensive_status = args['hypertensive_status']
        heart_disease_status = args['heart_disease_status']
        if heart_disease_status == "Yes":
            heart_disease_status = '1'
        else:
            heart_disease_status = '0'
        if hypertensive_status == "Yes":
            hypertensive_status = '1'
        else:
            hypertensive_status = '0'
        st = "\n" + args['gender'] + "," + args['age'] + "," + hypertensive_status + "," \
             + heart_disease_status + "," + args['marrital_status'] + "," + args['work_type'] \
             + "," + args['residence'] + "," + args['hypertension'] + "," + str(bmi) \
             + ",'" + args['smoking_status'].lower() + "',?"
        print(st)
        f.write(st)
        f.close()

        objects = serialization.read_all("J48.model")
        loader = Loader(classname="weka.core.converters.ArffLoader")
        csr = Classifier(jobject=objects[0])
        output_results = PredictionOutput(
            classname="weka.classifiers.evaluation.output.prediction.CSV")
        data1 = loader.load_file("instances.arff")
        data1.class_is_last()
        ev2 = Evaluation(data1)
        ev2.test_model(csr, data1, output_results)
        TESTDATA = StringIO("Instance,Actual,Predicted," + output_results.buffer_content())
        df = pd.read_csv(TESTDATA)
        prediction = list(df.Predicted).pop().split(":")[1]
        print(prediction)
        # jvm.stop()
        response = {"status": "200", "prediction": prediction}
        return Response(json.dumps(response, indent=2), mimetype="application/json")
def run_bayes_hill_split(self, output_directory, parents=1):
    # build classifier
    print("\nBuilding Bayes Classifier on training data. Parents = " + str(parents) + "\n")
    buildTimeStart = time.time()
    cls = Classifier(
        classname="weka.classifiers.bayes.BayesNet",
        options=[
            "-D",
            "-Q", "weka.classifiers.bayes.net.search.local.HillClimber",
            "--", "-P", "" + str(parents), "-S", "BAYES",
            "-E", "weka.classifiers.bayes.net.estimate.SimpleEstimator",
            "--", "-A", "0.5"
        ])
    cls.build_classifier(self.training_data)

    resultsString = ""
    resultsString = self.print_both(str(cls), resultsString)

    buildTimeString = "Bayes Split Classifier Built in " + str(time.time() - buildTimeStart) + " secs.\n"
    resultsString = self.print_both(buildTimeString, resultsString)

    # Evaluate Classifier
    resultsString = self.print_both("\nEvaluating on test data.", resultsString)
    buildTimeStart = time.time()
    evl = Evaluation(self.training_data)
    evl.test_model(cls, self.testing_data)

    resultsString = self.print_both(str(evl.summary()), resultsString)
    resultsString = self.print_both(str(evl.class_details()), resultsString)
    resultsString = self.print_both(str(evl.confusion_matrix), resultsString)

    buildTimeString = "\nBayes Split Classifier Evaluated in " + str(time.time() - buildTimeStart) + " secs.\n"
    resultsString = self.print_both(buildTimeString, resultsString)

    # Save Results and Cleanup
    self.save_results("Bayes_Hill_P" + str(parents) + "_", resultsString, output_directory)
    self.save_results("Bayes_Hill_P" + str(parents) + "_Graph", cls.graph, output_directory, True)
def train_weka_model(self, training_data_dir, save_model_dir, log_file, mimic_env=None):
    """
    Trains an M5P model tree on CSV data, evaluates it on the training set and
    derives return values via the mimic environment.
    """
    loader = Loader(classname="weka.core.converters.CSVLoader")
    training_data = loader.load_file(training_data_dir)
    training_data.class_is_last()
    # classifier help: https://weka.sourceforge.io/doc.dev/weka/classifiers/trees/M5P.html
    self.classifier = Classifier(classname="weka.classifiers.trees.M5P", options=self.options)
    self.classifier.build_classifier(training_data)
    # print(classifier)
    graph = self.classifier.graph
    node_number = float(graph.split('\n')[-3].split()[0].replace('N', ''))
    leaves_number = node_number / 2
    serialization.write(save_model_dir, self.classifier)
    # print('Leaves number is {0}'.format(leaves_number), file=log_file)

    evaluation = Evaluation(training_data)
    predicts = evaluation.test_model(self.classifier, training_data)
    # return_value = None
    # if mimic_env is not None:
    predict_dictionary = {}
    for predict_index in range(len(predicts)):
        predict_value = predicts[predict_index]
        if predict_value in predict_dictionary.keys():
            predict_dictionary[predict_value].append(predict_index)
        else:
            predict_dictionary.update({predict_value: [predict_index]})

    # return_value = mimic_env.get_return(state=list(predict_dictionary.values()))
    return_value_log = mimic_env.get_return(state=list(predict_dictionary.values()))
    return_value_log_struct = mimic_env.get_return(
        state=list(predict_dictionary.values()), apply_structure_cost=True)
    return_value_var_reduction = mimic_env.get_return(
        state=list(predict_dictionary.values()), apply_variance_reduction=True)
    # print("Training return is {0}".format(return_value), file=log_file)

    summary = evaluation.summary()
    numbers = summary.split('\n')
    corr = float(numbers[1].split()[-1])
    mae = float(numbers[2].split()[-1])
    rmse = float(numbers[3].split()[-1])
    rae = float(numbers[4].split()[-2]) / 100
    rrse = float(numbers[5].split()[-2]) / 100
    # print("Training summary is " + summary, file=log_file)

    return return_value_log, return_value_log_struct, \
        return_value_var_reduction, mae, rmse, leaves_number
def run_split(self, output_directory, classifier_name, classifier_weka_spec, options_list):
    # build classifier
    print("\nBuilding " + classifier_name + " Classifier on training data.")
    buildTimeStart = time.time()
    cls = Classifier(classname=classifier_weka_spec, options=options_list)
    cls.build_classifier(self.training_data)

    resultsString = ""
    resultsString = self.print_both(str(cls), resultsString)

    buildTimeString = classifier_name + " Split Classifier Built in " + str(time.time() - buildTimeStart) + " secs.\n"
    resultsString = self.print_both(buildTimeString, resultsString)

    # Evaluate Classifier
    resultsString = self.print_both("\nEvaluating on test data.", resultsString)
    buildTimeStart = time.time()
    evl = Evaluation(self.training_data)
    evl.test_model(cls, self.testing_data)

    resultsString = self.print_both(str(evl.summary()), resultsString)
    resultsString += "\n"
    resultsString = self.print_both(str(evl.class_details()), resultsString)
    resultsString += "\n"
    resultsString = self.print_both(str(evl.confusion_matrix), resultsString)

    buildTimeString = "\n\n" + classifier_name + " Classifier Evaluated in " + str(time.time() - buildTimeStart) + " secs.\n"
    resultsString = self.print_both(buildTimeString, resultsString)

    # sanitize the options for use in a file name
    options_string = ""
    for option in options_list:
        options_string = options_string + str(option)
    options_string = options_string.replace(".", "-")
    options_string = options_string.replace("-", "_")

    # Save Results and Cleanup
    self.save_results(classifier_name + options_string + "_Split", resultsString, output_directory)
def e_model_tree():
    # train_data, test_data = b_i_impute_data()
    # train_data.to_csv("./train_data.csv", index=False)
    # test_data.to_csv("./test_data.csv", index=False)
    jvm.start()
    train_data = converters.load_any_file("train_data.csv")
    train_data.class_is_first()
    test_data = converters.load_any_file("test_data.csv")
    test_data.class_is_first()
    print("1")
    cls = Classifier(classname="weka.classifiers.trees.LMT")
    print("2")
    cls.build_classifier(train_data)
    print("3")

    evl = Evaluation(train_data)
    evl.crossvalidate_model(cls, train_data, 5, Random(1))
    print("Train Accuracy:", evl.percent_correct)
    print("Train summary")
    print(evl.summary())
    print("Train class details")
    print(evl.class_details())
    print("Train confusion matrix")
    print(evl.confusion_matrix)
    plcls.plot_roc(evl, class_index=[0, 1], wait=True)
    plt.suptitle("Train ROC Curve", fontsize=20, y=0.95)
    savefig("./plots/e_train_roc_curve.png")

    evl = Evaluation(test_data)
    evl.test_model(cls, test_data)
    print("Test Accuracy:", evl.percent_correct)
    print("Test summary")
    print(evl.summary())
    print("Test class details")
    print(evl.class_details())
    print("Test confusion matrix")
    print(evl.confusion_matrix)
    plcls.plot_roc(evl, class_index=[0, 1], wait=True)
    plt.suptitle("Test ROC Curve", fontsize=20, y=0.95)
    savefig("./plots/e_test_roc_curve.png")
def case2():
    loader1 = Loader(classname="weka.core.converters.ArffLoader")
    file = input("Enter the name of the model file:")
    cls2 = Classifier(jobject=serialization.read(file))
    test_file = input("Enter the name of the test file:")
    data1 = loader1.load_file(test_file)
    data1.class_is_last()
    evaluation = Evaluation(data1)
    evaluation.test_model(cls2, data1)
    print(evaluation.matrix("=== (confusion matrix) ==="))
def HOV(dataset, algo, num_datasets):
    # Executing HOV \_*-*_/
    loader = Loader(classname="weka.core.converters.ArffLoader")
    data = loader.load_file(dataset)
    data.class_is_last()
    train, test = data.train_test_split(70.0, Random(10))
    cls = Classifier(classname=algo)
    cls.build_classifier(train)
    evl = Evaluation(train)
    evl.test_model(cls, test)
    print(evl.summary("=== " + str(algo) + " on " + str(dataset) + " ===", False))
    print(evl.matrix("=== on click prediction (confusion matrix) ==="))
    print("For Algo " + str(algo) + " areaUnderROC/1 for HOV: " + str(evl.area_under_roc(1)))
    return evl.area_under_roc(1)
def train_and_eval_weka_classifier(clf, train, valid, n_instances):
    # total_inst = train.num_instances
    total_train_inst = train.num_instances
    percentage = (n_instances * 100) / total_train_inst
    if percentage == 100:
        opt = train
    else:
        opt, extra = train.train_test_split(percentage, Random(1))
    # inst_train2 = train2.num_instances
    print('total_train_inst: ', total_train_inst, '| percentage: ', percentage,
          '| used_inst: ', opt.num_instances)

    import signal

    class AlarmException(Exception):
        pass

    def alarmHandler(signum, frame):
        raise AlarmException

    clf.build_classifier(opt)
    evl = Evaluation(opt)
    evl.test_model(clf, valid)
    acc = evl.percent_correct
    auc = evl.weighted_area_under_roc
    err = evl.error_rate
    log = evl.sf_mean_scheme_entropy
    print("# validating | loss: {:.2}, accuracy: {:.4}, AUC: {:.2}, error: {:.2}"
          .format(log, acc, auc, err))
    return {'loss': log, 'accuracy': acc, 'auc': auc, 'err': err}
def HOV(dataset, algo):
    print("inside hov")
    print("dataset ----" + dataset)
    print("algorithm ----" + algo)
    # Executing HOV \_*-*_/
    # jvm.start(packages=True)
    loader = Loader(classname="weka.core.converters.ArffLoader")
    data = loader.load_file(dataset)
    data.class_is_last()
    train, test = data.train_test_split(70.0, Random(10))
    cls = Classifier(classname=algo)
    cls.build_classifier(train)
    evl = Evaluation(train)
    evl.test_model(cls, test)
    return str(evl.area_under_roc(1))
def get_action(temp_data_dir, classifier, loader):
    data = loader.load_file(temp_data_dir)
    data.class_is_last()
    evaluation = Evaluation(data)
    # test_model returns the array of (numeric) predictions;
    # "preds" avoids shadowing the builtin eval
    preds = evaluation.test_model(classifier, data)
    Q_list = preds.tolist()
    act = ACTION_LIST[Q_list.index(max(Q_list))]
    if act == 0:
        return [1, 0]
    else:
        return [0, 1]
def DecisionTree(rnd_data, folds, seed, data):
    data_size = rnd_data.num_instances
    fold_size = math.floor(data_size / folds)

    # cross-validation
    evaluation = Evaluation(rnd_data)
    for i in range(folds):
        this_fold = fold_size
        test_start = i * fold_size
        test_end = test_start + fold_size
        if (data_size - test_end) / fold_size < 1:
            this_fold = data_size - test_start
        test = Instances.copy_instances(rnd_data, test_start, this_fold)  # generate validation fold
        if i == 0:
            train = Instances.copy_instances(rnd_data, test_end, data_size - test_end)
        else:
            train_1 = Instances.copy_instances(rnd_data, 0, test_start)
            train_2 = Instances.copy_instances(rnd_data, test_end, data_size - test_end)
            train = Instances.append_instances(train_1, train_2)  # generate training fold

        # build and evaluate classifier
        cls = Classifier(classname="weka.classifiers.trees.J48")
        cls.build_classifier(train)      # build classifier on training set
        evaluation.test_model(cls, test)  # test classifier on validation/test set

    print("")
    print("=== Decision Tree ===")
    print("Classifier: " + cls.to_commandline())
    print("Dataset: " + data.relationname)
    print("Folds: " + str(folds))
    print("Seed: " + str(seed))
    print("")
    print(evaluation.summary("=== " + str(folds) + "-fold Cross-Validation ==="))
def main():
    """
    Just runs some example code.
    """
    # load a dataset
    data_file = helper.get_data_dir() + os.sep + "vote.arff"
    helper.print_info("Loading dataset: " + data_file)
    loader = Loader("weka.core.converters.ArffLoader")
    data = loader.load_file(data_file)
    data.class_is_last()

    # classifier
    classifier = Classifier(classname="weka.classifiers.trees.J48")

    # randomize data
    folds = 10
    seed = 1
    rnd = Random(seed)
    rand_data = Instances.copy_instances(data)
    rand_data.randomize(rnd)
    if rand_data.class_attribute.is_nominal:
        rand_data.stratify(folds)

    # perform cross-validation and add predictions
    predicted_data = None
    evaluation = Evaluation(rand_data)
    for i in range(folds):
        train = rand_data.train_cv(folds, i)
        # the above code is used by the StratifiedRemoveFolds filter,
        # the following code is used by the Explorer/Experimenter:
        # train = rand_data.train_cv(folds, i, rnd)
        test = rand_data.test_cv(folds, i)

        # build and evaluate classifier
        cls = Classifier.make_copy(classifier)
        cls.build_classifier(train)
        evaluation.test_model(cls, test)

        # add predictions
        addcls = Filter(
            classname="weka.filters.supervised.attribute.AddClassification",
            options=["-classification", "-distribution", "-error"])
        # setting the java object directly avoids issues with correct quoting in option array
        addcls.set_property("classifier", Classifier.make_copy(classifier))
        addcls.inputformat(train)
        addcls.filter(train)  # trains the classifier
        pred = addcls.filter(test)
        if predicted_data is None:
            predicted_data = Instances.template_instances(pred, 0)
        for n in range(pred.num_instances):
            predicted_data.add_instance(pred.get_instance(n))

    print("")
    print("=== Setup ===")
    print("Classifier: " + classifier.to_commandline())
    print("Dataset: " + data.relationname)
    print("Folds: " + str(folds))
    print("Seed: " + str(seed))
    print("")
    print(evaluation.summary("=== " + str(folds) + "-fold Cross-Validation ==="))
    print("")
    print(predicted_data)
def runner(self, cdat, heap_size=16384, seed=None, verbose=True):
    self.set_status(Pipeline.RUNNING)
    self.logs.append('Initializing Pipeline')
    para = self.config

    self.logs.append('Reading Pipeline Configuration')
    head = ''
    name = get_rand_uuid_str()

    self.logs.append('Reading Input File')
    for i, stage in enumerate(self.stages):
        if stage.code in ('dat.fle', 'prp.bgc', 'prp.nrm', 'prp.pmc', 'prp.sum'):
            self.stages[i].status = Pipeline.RUNNING
        if stage.code == 'dat.fle':
            head = os.path.abspath(stage.value.path)
            name, _ = os.path.splitext(stage.value.name)

    self.logs.append('Parsing to ARFF')
    path = os.path.join(head, '{name}.arff'.format(name=name))
    # This bug, I don't know why, using Config.schema instead.
    # cdat.toARFF(path, express_config=para.Preprocess.schema, verbose=verbose)

    for i, stage in enumerate(self.stages):
        if stage.code in ('dat.fle', 'prp.bgc', 'prp.nrm', 'prp.pmc', 'prp.sum'):
            self.stages[i].status = Pipeline.COMPLETE

    self.logs.append('Saved ARFF at {path}'.format(path=path))
    self.logs.append('Splitting to Training and Testing Sets')

    JVM.start(max_heap_size='{size}m'.format(size=heap_size))

    load = Loader(classname='weka.core.converters.ArffLoader')
    # data = load.load_file(path)
    # save = Saver(classname='weka.core.converters.ArffSaver')
    data = load.load_file(os.path.join(head, 'iris.arff'))  # For Debugging Purposes Only
    data.class_is_last()  # For Debugging Purposes Only
    # data.class_index = cdat.iclss

    for i, stage in enumerate(self.stages):
        if stage.code == 'prp.kcv':
            self.stages[i].status = Pipeline.RUNNING

    self.logs.append('Splitting Training Set')
    # TODO - Check if this seed is worth it.
    seed = assign_if_none(seed, random.randint(0, 1000))
    opts = ['-S', str(seed), '-N', str(para.Preprocess.FOLDS)]
    wobj = Filter(classname='weka.filters.supervised.instance.StratifiedRemoveFolds',
                  options=opts + ['-V'])
    wobj.inputformat(data)
    tran = wobj.filter(data)

    self.logs.append('Splitting Testing Set')
    wobj.options = opts
    test = wobj.filter(data)

    for i, stage in enumerate(self.stages):
        if stage.code == 'prp.kcv':
            self.stages[i].status = Pipeline.COMPLETE

    self.logs.append('Performing Feature Selection')
    feat = []
    for comb in para.FEATURE_SELECTION:
        if comb.USE:
            for i, stage in enumerate(self.stages):
                if stage.code == 'ats':
                    search = stage.value.search.name
                    evaluator = stage.value.evaluator.name
                    if search == comb.Search.NAME and evaluator == comb.Evaluator.NAME:
                        self.stages[i].status = Pipeline.RUNNING

            srch = ASSearch(classname='weka.attributeSelection.{classname}'.format(
                classname=comb.Search.NAME,
                options=assign_if_none(comb.Search.OPTIONS, [])
            ))
            ewal = ASEvaluation(classname='weka.attributeSelection.{classname}'.format(
                classname=comb.Evaluator.NAME,
                options=assign_if_none(comb.Evaluator.OPTIONS, [])
            ))

            attr = AttributeSelection()
            attr.search(srch)
            attr.evaluator(ewal)
            attr.select_attributes(tran)

            meta = addict.Dict()
            meta.search = comb.Search.NAME
            meta.evaluator = comb.Evaluator.NAME
            meta.features = [tran.attribute(index).name for index in attr.selected_attributes]

            feat.append(meta)

            for i, stage in enumerate(self.stages):
                if stage.code == 'ats':
                    search = stage.value.search.name
                    evaluator = stage.value.evaluator.name
                    if search == comb.Search.NAME and evaluator == comb.Evaluator.NAME:
                        self.stages[i].status = Pipeline.COMPLETE

    models = []
    for model in para.MODEL:
        if model.USE:
            summary = addict.Dict()

            self.logs.append('Modelling {model}'.format(model=model.LABEL))

            summary.label = model.LABEL
            summary.name = model.NAME
            summary.options = assign_if_none(model.OPTIONS, [])

            for i, stage in enumerate(self.stages):
                if stage.code == 'lrn' and stage.value.name == model.NAME:
                    self.stages[i].status = Pipeline.RUNNING

            for i, instance in enumerate(data):
                iclass = list(range(instance.num_classes))

            options = assign_if_none(model.OPTIONS, [])
            classifier = Classifier(classname='weka.classifiers.{classname}'.format(classname=model.NAME),
                                    options=options)
            classifier.build_classifier(tran)

            serializer.write(os.path.join(head, '{name}.{classname}.model'.format(
                name=name,
                classname=model.NAME
            )), classifier)

            self.logs.append('Testing model {model}'.format(model=model.LABEL))

            evaluation = Evaluation(tran)
            evaluation.test_model(classifier, test)

            summary.summary = evaluation.summary()

            frame = pd.DataFrame(data=evaluation.confusion_matrix)
            axes = sns.heatmap(frame, cbar=False, annot=True)
            b64str = get_b64_plot(axes)
            summary.confusion_matrix = addict.Dict({
                'value': evaluation.confusion_matrix.tolist(),
                'plot': b64str
            })

            self.logs.append('Plotting Learning Curve for {model}'.format(model=model.LABEL))
            buffer = io.BytesIO()
            plot_classifier_errors(evaluation.predictions, tran, test, outfile=buffer, wait=False)
            b64str = buffer_to_b64(buffer)
            summary.learning_curve = b64str

            buffer = io.BytesIO()
            plot_roc(evaluation, class_index=iclass, outfile=buffer, wait=False)
            b64str = buffer_to_b64(buffer)
            summary.roc_curve = b64str

            buffer = io.BytesIO()
            plot_prc(evaluation, class_index=iclass, outfile=buffer, wait=False)
            b64str = buffer_to_b64(buffer)
            summary.prc_curve = b64str

            if classifier.graph:
                summary.graph = classifier.graph

            for i, instance in enumerate(test):
                prediction = classifier.classify_instance(instance)

            for i, stage in enumerate(self.stages):
                if stage.code == 'lrn' and stage.value.name == model.NAME:
                    self.stages[i].status = Pipeline.COMPLETE

            models.append(summary)

    self.gist.models = models

    JVM.stop()

    JSON.write(os.path.join(head, '{name}.cgist'.format(name=name)), self.gist)

    self.logs.append('Pipeline Complete')
    self.set_status(Pipeline.COMPLETE)
print("\nLoading dataset: " + fname + "\n") loader = Loader(classname="weka.core.converters.ArffLoader") data = loader.load_file(fname) data.set_class_index(data.num_attributes() - 1) # plot pld.scatter_plot( data, data.get_attribute_by_name("petalwidth").get_index(), data.get_attribute_by_name("petallength").get_index(), wait=False) # add classifier errors to dataset addcls = Filter( classname="weka.filters.supervised.attribute.AddClassification", options=["-W", "weka.classifiers.trees.J48", "-classification", "-error"]) addcls.set_inputformat(data) filtered = addcls.filter(data) print(filtered) # build J48 cls = Classifier(classname="weka.classifiers.trees.J48") cls.build_classifier(data) evl = Evaluation(data) evl.test_model(cls, data) # plot classifier errors plc.plot_classifier_errors(evl.predictions(), wait=True) jvm.stop()
def run(self):
    # Attach JVM
    javabridge.attach()

    # Debug
    print("Classifier")
    print(self.classifier)
    print("Params")
    print(self.parameters)
    print("Model Params")
    print(self.modelParams)

    # Get data for testing and learning
    learnerData = self.retrieveData(self.questionID, "learner")
    testData = self.retrieveData(self.questionID, 'test')
    masterData = self.retrieveData(self.questionID, 'all')
    masterData = self.addNominals(masterData)

    # Check if there is enough correct data to run
    if learnerData.num_instances < 1 or testData.num_instances < 1:
        self.status = self.config.NOT_ENOUGH_DATA
        return False

    # If this is a prediction and there is a valid patient, change masterData header
    patientObj = self.buildPatientObject()
    patientInstance = None
    if (patientObj is not None) and (self.predict == 1):
        masterData = self.addPatientNominals(patientObj, masterData)
        patientInstance = self.createPatientInstance(patientObj, masterData)
        masterData.add_instance(patientInstance)
    elif (patientObj is None) and (self.predict == 1):
        print('No patient defined for prediction. Exiting')
        return True

    # Fix dataset headers up to match and fix instances to match headers
    masterData.delete()
    learner = masterData.copy_instances(masterData, 0, 0)
    test = masterData.copy_instances(masterData, 0, 0)
    self.addInstancesToDataset(learnerData, learner)
    self.addInstancesToDataset(testData, test)

    # Comparison of data for testing purposes
    # print(learnerData); print(learner)
    # print(testData); print(test)
    # pdb.set_trace()

    # Instantiate classifier
    self.cls = Classifier(classname=self.classifier, options=self.parameters)

    # Run classifier
    self.cls.build_classifier(learner)
    # for index, inst in enumerate(learnerData):
    #     prediction = self.cls.classify_instance(inst)
    #     distribution = self.cls.distribution_for_instance(inst)

    # Test classifier
    evl = Evaluation(learner)
    evl.test_model(self.cls, test)

    # Store information about matrix
    self.acc = evl.percent_correct
    self.val = None

    # Convert numpy array into simple array
    confusionMatrix = []
    confusionMatrix.append([evl.confusion_matrix[0][0], evl.confusion_matrix[0][1]])
    confusionMatrix.append([evl.confusion_matrix[1][0], evl.confusion_matrix[1][1]])

    # Convert matrix into json format
    self.matrix = json.dumps(confusionMatrix)

    # print('Classifier: ', self.classifier)
    # print('ID: ', self.questionID)
    # print('ACC: ', self.acc)
    # print(evl.summary())

    # If this is a prediction... make the prediction
    if (patientObj is not None) and (self.predict == 1):
        masterData.add_instance(patientInstance)
        print("Running prediction on patient: ")
        print(masterData.get_instance(0))
        self.prediction = self.cls.classify_instance(masterData.get_instance(0))
        # self.uploadPrediction()

    # Temporarily store file to serialize to
    fileName = str(self.questionID) + self.algorithm + ".model"
    serialization.write(fileName, self.cls)

    # Open that file and store it
    self.model = None
    with open(fileName, 'rb') as f:
        self.model = f.read()

    # Remove temporary file
    os.remove(fileName)

    # Set status to awaiting feedback
    self.status = self.config.AWAITING_FEEDBACK_STATUS
    return True
fname = data_dir + os.sep + "ReutersGrain-test.arff" print("\nLoading dataset: " + fname + "\n") loader = Loader(classname="weka.core.converters.ArffLoader") test = loader.load_file(fname) test.set_class_index(test.num_attributes() - 1) setups = ( ("weka.classifiers.trees.J48", []), ("weka.classifiers.bayes.NaiveBayes", []), ("weka.classifiers.bayes.NaiveBayesMultinomial", []), ("weka.classifiers.bayes.NaiveBayesMultinomial", ["-C"]), ("weka.classifiers.bayes.NaiveBayesMultinomial", ["-C", "-L", "-S"]) ) # cross-validate classifiers for setup in setups: classifier, opt = setup print("\n--> %s (filter options: %s)\n" % (classifier, " ".join(opt))) cls = FilteredClassifier() cls.set_classifier(Classifier(classname=classifier)) cls.set_filter(Filter(classname="weka.filters.unsupervised.attribute.StringToWordVector", options=opt)) cls.build_classifier(data) evl = Evaluation(test) evl.test_model(cls, test) print("Accuracy: %0.0f%%" % evl.percent_correct()) tcdata = plc.generate_thresholdcurve_data(evl, 0) print("AUC: %0.3f" % plc.get_auc(tcdata)) print(evl.to_matrix("Matrix:")) jvm.stop()
data = loader.load_file(fname)
data.set_class_index(data.num_attributes() - 1)

# 1a filter data
print("Filtering data...")
fltr = Filter("weka.filters.unsupervised.attribute.StringToWordVector")
fltr.set_inputformat(data)
filtered = fltr.filter(data)
filtered.set_class_index(0)

# 1b build classifier
print("Building/evaluating classifier...")
cls = Classifier(classname="weka.classifiers.trees.J48")
cls.build_classifier(filtered)
evl = Evaluation(filtered)
evl.test_model(cls, filtered)
print(evl.to_summary())
print(str(cls))
plg.plot_dot_graph(cls.graph())

# 2. filtered classifier
fname = data_dir + os.sep + "simpletext-test.arff"
print("\nLoading dataset: " + fname + "\n")
loader = Loader(classname="weka.core.converters.ArffLoader")
test = loader.load_file(fname)
test.set_class_index(test.num_attributes() - 1)

print("Building/evaluating filtered classifier...")
cls = FilteredClassifier()
cls.set_classifier(Classifier(classname="weka.classifiers.trees.J48"))
cls.set_filter(Filter(classname="weka.filters.unsupervised.attribute.StringToWordVector"))
cls.build_classifier(data)
def plot_learning_curve(classifiers, train, test=None, increments=100, metric="percent_correct",
                        title="Learning curve", label_template="[#] @ $", key_loc="lower right",
                        outfile=None, wait=True):
    """
    Plots a learning curve for the given classifiers.

    :param classifiers: list of Classifier template objects
    :type classifiers: list of Classifier
    :param train: dataset to use for building the classifiers, also used for evaluating them if test is None
    :type train: Instances
    :param test: optional dataset to use for testing the built classifiers
    :type test: Instances
    :param increments: the increments (>= 1: # of instances, < 1: percentage of dataset)
    :type increments: float
    :param metric: the name of the numeric metric to plot (Evaluation.<metric>)
    :type metric: str
    :param title: the title for the plot
    :type title: str
    :param label_template: the template for the label in the plot
                           (#: 1-based index, @: full classname, !: simple classname, $: options)
    :type label_template: str
    :param key_loc: the location string for the key
    :type key_loc: str
    :param outfile: the output file, ignored if None
    :type outfile: str
    :param wait: whether to wait for the user to close the plot
    :type wait: bool
    """
    if not plot.matplotlib_available:
        logger.error("Matplotlib is not installed, plotting unavailable!")
        return
    if not train.has_class():
        logger.error("Training set has no class attribute set!")
        return
    if (test is not None) and (train.equal_headers(test) is not None):
        logger.error("Training and test set are not compatible: " + train.equal_headers(test))
        return

    if increments >= 1:
        inc = increments
    else:
        inc = round(train.num_instances * increments)

    steps = []
    cls = []
    evls = {}
    for classifier in classifiers:
        cl = Classifier.make_copy(classifier)
        cls.append(cl)
        evls[cl] = []

    if test is None:
        tst = train
    else:
        tst = test

    for i in range(train.num_instances):
        if (i > 0) and (i % inc == 0):
            steps.append(i + 1)
        for cl in cls:
            # train
            if cl.is_updateable:
                if i == 0:
                    tr = Instances.copy_instances(train, 0, 1)
                    cl.build_classifier(tr)
                else:
                    cl.update_classifier(train.get_instance(i))
            else:
                if (i > 0) and (i % inc == 0):
                    tr = Instances.copy_instances(train, 0, i + 1)
                    cl.build_classifier(tr)
            # evaluate
            if (i > 0) and (i % inc == 0):
                evl = Evaluation(tst)
                evl.test_model(cl, tst)
                evls[cl].append(getattr(evl, metric))

    fig, ax = plt.subplots()
    ax.set_xlabel("# of instances")
    ax.set_ylabel(metric)
    ax.set_title(title)
    fig.canvas.set_window_title(title)
    ax.grid(True)
    i = 0
    for cl in cls:
        evl = evls[cl]
        i += 1
        plot_label = label_template. \
            replace("#", str(i)). \
            replace("@", cl.classname). \
            replace("!", cl.classname[cl.classname.rfind(".") + 1:]). \
            replace("$", join_options(cl.config))
        ax.plot(steps, evl, label=plot_label)
    plt.draw()
    plt.legend(loc=key_loc, shadow=True)
    if outfile is not None:
        plt.savefig(outfile)
    if wait:
        plt.show()
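# Hedged usage sketch for plot_learning_curve above, mirroring the call made in
# the larger example near the end of this collection; the dataset path is an
# assumption and a running JVM is assumed.
from weka.core.converters import Loader
from weka.classifiers import Classifier

loader = Loader(classname="weka.core.converters.ArffLoader")
data = loader.load_file("diabetes.arff")
data.class_is_last()
classifiers = [
    Classifier(classname="weka.classifiers.trees.J48"),
    Classifier(classname="weka.classifiers.bayes.NaiveBayesUpdateable"),
]
plot_learning_curve(classifiers, data, increments=0.05,
                    metric="percent_correct", wait=True)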
trainData = loader.load_file('segment-challenge.arff')
trainData.class_is_last()
testData = loader.load_file('segment-test.arff')
testData.class_is_last()

# Default C4.5 tree
classifier = Classifier(classname="weka.classifiers.trees.J48")
# Build the classifier on the training data
classifier.build_classifier(trainData)
print("\n\n=========== Classifier information ================\n\n")
print(classifier.options)
print(classifier)

print("\n\n=========== Train results ================\n\n")
evaluation = Evaluation(trainData)
evaluation.test_model(classifier, trainData)
print(classifier.to_commandline())
print(evaluation.matrix())
print("Train recognition: %0.2f%%" % evaluation.percent_correct)

print("\n\n=========== Test results ================\n\n")
evaluation = Evaluation(testData)
evaluation.test_model(classifier, testData)
print(classifier.to_commandline())
print(evaluation.matrix())
print("Test recognition: %0.2f%%" % evaluation.percent_correct)

jvm.stop()
print("Train/test/predict...") groups = ["DataSet1", "DataSet2"] # groups = ["DataSet2"] for group in groups: print(group) train = data_dir + os.sep + group + "_Cal.arff" test = data_dir + os.sep + group + "_Test.arff" pred = data_dir + os.sep + group + "_Val.arff" loader = Loader(classname="weka.core.converters.ArffLoader") print(train) train_data = loader.load_file(train) train_data.class_index = train_data.attribute_by_name("reference value").index print(test) test_data = loader.load_file(test) test_data.class_index = test_data.attribute_by_name("reference value").index print(pred) pred_data = loader.load_file(pred) pred_data.class_index = pred_data.attribute_by_name("reference value").index cls = FilteredClassifier() cls.classifier = Classifier(classname="weka.classifiers.functions.LinearRegression", options=["-S", "1", "-C"]) cls.filter = Filter(classname="weka.filters.unsupervised.attribute.Remove", options=["-R", "first"]) cls.build_classifier(train_data) evl = Evaluation(train_data) evl.test_model(cls, test_data) print(evl.summary()) jvm.stop()
from weka.classifiers import Classifier, Evaluation

jvm.start()

# load diabetes
loader = Loader(classname="weka.core.converters.ArffLoader")
fname = data_dir + os.sep + "diabetes.arff"
print("\nLoading dataset: " + fname + "\n")
data = loader.load_file(fname)
data.set_class_index(data.num_attributes() - 1)

# determine baseline with ZeroR
zeror = Classifier(classname="weka.classifiers.rules.ZeroR")
zeror.build_classifier(data)
evl = Evaluation(data)
evl.test_model(zeror, data)
print("Baseline accuracy (ZeroR): %0.1f%%" % evl.percent_correct())

print("\nHoldout 10%...")
# use seed 1-10 and perform random split with 90%
perc = []
for i in range(1, 11):
    evl = Evaluation(data)
    evl.evaluate_train_test_split(
        Classifier(classname="weka.classifiers.trees.J48"), data, 90.0, Random(i))
    perc.append(round(evl.percent_correct(), 1))
    print("Accuracy with seed %i: %0.1f%%" % (i, evl.percent_correct()))

# calculate mean and standard deviation
nperc = numpy.array(perc)
print("mean=%0.2f stdev=%0.2f" % (numpy.mean(nperc), numpy.std(nperc)))
def classify_and_save(classifier, name, outfile):
    random.seed("ML349")

    csv_header = [
        "Game Name",
        "SteamID",
        "Algorithm",
        "Number Players",
        "%Players of Training Set",
        "Accuracy",
        "Precision (0)",
        "Recall (0)",
        "F1 (0)",
        "Precision (1)",
        "Recall (1)",
        "F1 (1)"
    ]
    game_results = []

    with open("data/games_by_username_all.csv", "r") as f:
        game_list = next(f).rstrip().split(",")

    loader = Loader(classname="weka.core.converters.ArffLoader")
    train = loader.load_file("data/final_train.arff")
    test = loader.load_file("data/final_test.arff")

    count = 0
    for i in itertools.chain(range(0, 50), random.sample(range(50, len(game_list)), 450)):
        train.class_index = i
        test.class_index = i
        count += 1

        classifier.build_classifier(train)
        evaluation = Evaluation(train)
        evaluation.test_model(classifier, test)

        confusion = evaluation.confusion_matrix
        num_players = sum(confusion[1])
        steam_id = repr(train.class_attribute).split(" ")[1]
        result = [
            game_list[i],
            steam_id,
            name,
            int(num_players),
            num_players / 1955,
            evaluation.percent_correct,
            evaluation.precision(0),
            evaluation.recall(0),
            evaluation.f_measure(0),
            evaluation.precision(1),
            evaluation.recall(1),
            evaluation.f_measure(1)
        ]
        game_results.append(result)

        print("\nResult #{2}/500 for {0} (SteamID {1}):".format(game_list[i], steam_id, count), end=" ")
        print(evaluation.summary())

    # "w" (text mode) for the csv writer under Python 3
    with open(outfile, "w") as f:
        csv_writer = csv.writer(f, delimiter=",")
        csv_writer.writerow(csv_header)
        for r in game_results:
            csv_writer.writerow(r)
fname = data_dir + os.sep + "segment-challenge.arff" print("\nLoading dataset: " + fname + "\n") train = loader.load_file(fname) train.set_class_index(train.num_attributes() - 1) fname = data_dir + os.sep + "segment-test.arff" print("\nLoading dataset: " + fname + "\n") test = loader.load_file(fname) test.set_class_index(train.num_attributes() - 1) # build J48 cls = Classifier(classname="weka.classifiers.trees.J48") cls.build_classifier(train) # evaluate on test evl = Evaluation(train) evl.test_model(cls, test) print("Test set accuracy: %0.0f%%" % evl.percent_correct()) # evaluate on train evl = Evaluation(train) evl.test_model(cls, train) print("Train set accuracy: %0.0f%%" % evl.percent_correct()) # evaluate on random split evl = Evaluation(train) evl.evaluate_train_test_split(cls, train, 66.0, Random(1)) print("Random split accuracy: %0.0f%%" % evl.percent_correct()) jvm.stop()
def main():
    """
    Just runs some example code.
    """
    # load a dataset
    iris_file = helper.get_data_dir() + os.sep + "iris.arff"
    helper.print_info("Loading dataset: " + iris_file)
    loader = Loader("weka.core.converters.ArffLoader")
    iris_data = loader.load_file(iris_file)
    iris_data.class_is_last()

    # classifier help
    helper.print_title("Creating help string")
    classifier = Classifier(classname="weka.classifiers.trees.J48")
    print(classifier.to_help())

    # partial classname
    helper.print_title("Creating classifier from partial classname")
    clsname = ".J48"
    classifier = Classifier(classname=clsname)
    print(clsname + " --> " + classifier.classname)

    # classifier from commandline
    helper.print_title("Creating SMO from command-line string")
    cmdline = 'weka.classifiers.functions.SMO -K "weka.classifiers.functions.supportVector.NormalizedPolyKernel -E 3.0"'
    classifier = from_commandline(cmdline, classname="weka.classifiers.Classifier")
    classifier.build_classifier(iris_data)
    print("input: " + cmdline)
    print("output: " + classifier.to_commandline())
    print("model:\n" + str(classifier))

    # kernel classifier
    helper.print_title("Creating SMO as KernelClassifier")
    kernel = Kernel(classname="weka.classifiers.functions.supportVector.RBFKernel", options=["-G", "0.001"])
    classifier = KernelClassifier(classname="weka.classifiers.functions.SMO", options=["-M"])
    classifier.kernel = kernel
    classifier.build_classifier(iris_data)
    print("classifier: " + classifier.to_commandline())
    print("model:\n" + str(classifier))

    # build a classifier and output model
    helper.print_title("Training J48 classifier on iris")
    classifier = Classifier(classname="weka.classifiers.trees.J48")
    # Instead of using 'options=["-C", "0.3"]' in the constructor, we can also set the "confidenceFactor"
    # property of the J48 classifier itself. However, being of type float rather than double, we need
    # to convert it to the correct type first using the double_to_float function:
    classifier.set_property("confidenceFactor", typeconv.double_to_float(0.3))
    classifier.build_classifier(iris_data)
    print(classifier)
    print(classifier.graph)
    print(classifier.to_source("MyJ48"))
    plot_graph.plot_dot_graph(classifier.graph)

    # evaluate model on test set
    helper.print_title("Evaluating J48 classifier on iris")
    evaluation = Evaluation(iris_data)
    evl = evaluation.test_model(classifier, iris_data)
    print(evl)
    print(evaluation.summary())

    # evaluate model on train/test split
    helper.print_title("Evaluating J48 classifier on iris (random split 66%)")
    classifier = Classifier(classname="weka.classifiers.trees.J48", options=["-C", "0.3"])
    evaluation = Evaluation(iris_data)
    evaluation.evaluate_train_test_split(classifier, iris_data, 66.0, Random(1))
    print(evaluation.summary())

    # load a dataset incrementally and build classifier incrementally
    helper.print_title("Build classifier incrementally on iris")
    helper.print_info("Loading dataset: " + iris_file)
    loader = Loader("weka.core.converters.ArffLoader")
    iris_inc = loader.load_file(iris_file, incremental=True)
    iris_inc.class_is_last()
    classifier = Classifier(classname="weka.classifiers.bayes.NaiveBayesUpdateable")
    classifier.build_classifier(iris_inc)
    for inst in loader:
        classifier.update_classifier(inst)
    print(classifier)

    # construct meta-classifiers
    helper.print_title("Meta classifiers")
    # generic FilteredClassifier instantiation
    print("generic FilteredClassifier instantiation")
    meta = SingleClassifierEnhancer(classname="weka.classifiers.meta.FilteredClassifier")
    meta.classifier = Classifier(classname="weka.classifiers.functions.LinearRegression")
    flter = Filter("weka.filters.unsupervised.attribute.Remove")
    flter.options = ["-R", "first"]
    meta.set_property("filter", flter.jobject)
    print(meta.to_commandline())

    # direct FilteredClassifier instantiation
    print("direct FilteredClassifier instantiation")
    meta = FilteredClassifier()
    meta.classifier = Classifier(classname="weka.classifiers.functions.LinearRegression")
    flter = Filter("weka.filters.unsupervised.attribute.Remove")
    flter.options = ["-R", "first"]
    meta.filter = flter
    print(meta.to_commandline())

    # generic Vote
    print("generic Vote instantiation")
    meta = MultipleClassifiersCombiner(classname="weka.classifiers.meta.Vote")
    classifiers = [
        Classifier(classname="weka.classifiers.functions.SMO"),
        Classifier(classname="weka.classifiers.trees.J48")
    ]
    meta.classifiers = classifiers
    print(meta.to_commandline())

    # cross-validate nominal classifier
    helper.print_title("Cross-validating NaiveBayes on diabetes")
    diabetes_file = helper.get_data_dir() + os.sep + "diabetes.arff"
    helper.print_info("Loading dataset: " + diabetes_file)
    loader = Loader("weka.core.converters.ArffLoader")
    diabetes_data = loader.load_file(diabetes_file)
    diabetes_data.class_is_last()
    classifier = Classifier(classname="weka.classifiers.bayes.NaiveBayes")
    pred_output = PredictionOutput(
        classname="weka.classifiers.evaluation.output.prediction.PlainText", options=["-distribution"])
    evaluation = Evaluation(diabetes_data)
    evaluation.crossvalidate_model(classifier, diabetes_data, 10, Random(42), output=pred_output)
    print(evaluation.summary())
    print(evaluation.class_details())
    print(evaluation.matrix())
    print("areaUnderPRC/0: " + str(evaluation.area_under_prc(0)))
    print("weightedAreaUnderPRC: " + str(evaluation.weighted_area_under_prc))
    print("areaUnderROC/1: " + str(evaluation.area_under_roc(1)))
print("weightedAreaUnderROC: " + str(evaluation.weighted_area_under_roc)) print("avgCost: " + str(evaluation.avg_cost)) print("totalCost: " + str(evaluation.total_cost)) print("confusionMatrix: " + str(evaluation.confusion_matrix)) print("correct: " + str(evaluation.correct)) print("pctCorrect: " + str(evaluation.percent_correct)) print("incorrect: " + str(evaluation.incorrect)) print("pctIncorrect: " + str(evaluation.percent_incorrect)) print("unclassified: " + str(evaluation.unclassified)) print("pctUnclassified: " + str(evaluation.percent_unclassified)) print("coverageOfTestCasesByPredictedRegions: " + str(evaluation.coverage_of_test_cases_by_predicted_regions)) print("sizeOfPredictedRegions: " + str(evaluation.size_of_predicted_regions)) print("falseNegativeRate: " + str(evaluation.false_negative_rate(1))) print("weightedFalseNegativeRate: " + str(evaluation.weighted_false_negative_rate)) print("numFalseNegatives: " + str(evaluation.num_false_negatives(1))) print("trueNegativeRate: " + str(evaluation.true_negative_rate(1))) print("weightedTrueNegativeRate: " + str(evaluation.weighted_true_negative_rate)) print("numTrueNegatives: " + str(evaluation.num_true_negatives(1))) print("falsePositiveRate: " + str(evaluation.false_positive_rate(1))) print("weightedFalsePositiveRate: " + str(evaluation.weighted_false_positive_rate)) print("numFalsePositives: " + str(evaluation.num_false_positives(1))) print("truePositiveRate: " + str(evaluation.true_positive_rate(1))) print("weightedTruePositiveRate: " + str(evaluation.weighted_true_positive_rate)) print("numTruePositives: " + str(evaluation.num_true_positives(1))) print("fMeasure: " + str(evaluation.f_measure(1))) print("weightedFMeasure: " + str(evaluation.weighted_f_measure)) print("unweightedMacroFmeasure: " + str(evaluation.unweighted_macro_f_measure)) print("unweightedMicroFmeasure: " + str(evaluation.unweighted_micro_f_measure)) print("precision: " + str(evaluation.precision(1))) print("weightedPrecision: " + str(evaluation.weighted_precision)) print("recall: " + str(evaluation.recall(1))) print("weightedRecall: " + str(evaluation.weighted_recall)) print("kappa: " + str(evaluation.kappa)) print("KBInformation: " + str(evaluation.kb_information)) print("KBMeanInformation: " + str(evaluation.kb_mean_information)) print("KBRelativeInformation: " + str(evaluation.kb_relative_information)) print("SFEntropyGain: " + str(evaluation.sf_entropy_gain)) print("SFMeanEntropyGain: " + str(evaluation.sf_mean_entropy_gain)) print("SFMeanPriorEntropy: " + str(evaluation.sf_mean_prior_entropy)) print("SFMeanSchemeEntropy: " + str(evaluation.sf_mean_scheme_entropy)) print("matthewsCorrelationCoefficient: " + str(evaluation.matthews_correlation_coefficient(1))) print("weightedMatthewsCorrelation: " + str(evaluation.weighted_matthews_correlation)) print("class priors: " + str(evaluation.class_priors)) print("numInstances: " + str(evaluation.num_instances)) print("meanAbsoluteError: " + str(evaluation.mean_absolute_error)) print("meanPriorAbsoluteError: " + str(evaluation.mean_prior_absolute_error)) print("relativeAbsoluteError: " + str(evaluation.relative_absolute_error)) print("rootMeanSquaredError: " + str(evaluation.root_mean_squared_error)) print("rootMeanPriorSquaredError: " + str(evaluation.root_mean_prior_squared_error)) print("rootRelativeSquaredError: " + str(evaluation.root_relative_squared_error)) print("prediction output:\n" + str(pred_output)) plot_cls.plot_roc( evaluation, title="ROC diabetes", class_index=range(0, 
    plot_cls.plot_roc(
        evaluation, title="ROC diabetes",
        class_index=range(0, diabetes_data.class_attribute.num_values), wait=False)
    plot_cls.plot_prc(
        evaluation, title="PRC diabetes",
        class_index=range(0, diabetes_data.class_attribute.num_values), wait=False)

    # load a numeric dataset
    bolts_file = helper.get_data_dir() + os.sep + "bolts.arff"
    helper.print_info("Loading dataset: " + bolts_file)
    loader = Loader("weka.core.converters.ArffLoader")
    bolts_data = loader.load_file(bolts_file)
    bolts_data.class_is_last()

    # build a classifier and output model
    helper.print_title("Training LinearRegression on bolts")
    classifier = Classifier(classname="weka.classifiers.functions.LinearRegression", options=["-S", "1", "-C"])
    classifier.build_classifier(bolts_data)
    print(classifier)

    # cross-validate numeric classifier
    helper.print_title("Cross-validating LinearRegression on bolts")
    classifier = Classifier(classname="weka.classifiers.functions.LinearRegression", options=["-S", "1", "-C"])
    evaluation = Evaluation(bolts_data)
    evaluation.crossvalidate_model(classifier, bolts_data, 10, Random(42))
    print(evaluation.summary())
    print("correlationCoefficient: " + str(evaluation.correlation_coefficient))
    print("errorRate: " + str(evaluation.error_rate))
    helper.print_title("Header - bolts")
    print(str(evaluation.header))
    helper.print_title("Predictions on bolts")
    for index, pred in enumerate(evaluation.predictions):
        print(str(index + 1) + ": " + str(pred) + " -> error=" + str(pred.error))
    plot_cls.plot_classifier_errors(evaluation.predictions, wait=False)

    # learning curve
    cls = [
        Classifier(classname="weka.classifiers.trees.J48"),
        Classifier(classname="weka.classifiers.bayes.NaiveBayesUpdateable")]
    plot_cls.plot_learning_curve(
        cls, diabetes_data, increments=0.05, label_template="[#] !", metric="percent_correct", wait=True)

    # access classifier's Java API
    labor_file = helper.get_data_dir() + os.sep + "labor.arff"
    helper.print_info("Loading dataset: " + labor_file)
    loader = Loader("weka.core.converters.ArffLoader")
    labor_data = loader.load_file(labor_file)
    labor_data.class_is_last()
    helper.print_title("Using JRip's Java API to access rules")
    jrip = Classifier(classname="weka.classifiers.rules.JRip")
    jrip.build_classifier(labor_data)
    rset = jrip.jwrapper.getRuleset()
    for i in range(rset.size()):
        r = rset.get(i)
        print(str(r.toString(labor_data.class_attribute.jobject)))


# entry point added so the example is runnable; the JVM must be started
# before any Weka class is instantiated and stopped afterwards
if __name__ == "__main__":
    try:
        jvm.start()
        main()
    finally:
        jvm.stop()
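# ----------------------------------------------------------------------
# The snippet below starts mid-example: "grid", "gamma", "cost",
# "classifier", "trainData" and "testData" are used without being defined.
# A minimal, assumed setup is sketched here using python-weka-wrapper's
# MultiSearch wrapper (which matches the "parameters"/"best" API used
# below and requires the multisearch Weka package) together with LibSVM;
# the ARFF paths, property names and parameter ranges are hypothetical.
import weka.core.jvm as jvm
from weka.core.converters import Loader
from weka.core.classes import MathParameter
from weka.classifiers import Classifier, Evaluation, MultiSearch

jvm.start(packages=True)

loader = Loader("weka.core.converters.ArffLoader")
trainData = loader.load_file("train.arff")  # hypothetical path
trainData.class_is_last()
testData = loader.load_file("test.arff")    # hypothetical path
testData.class_is_last()

classifier = Classifier(classname="weka.classifiers.functions.LibSVM")

# explore 10^-3 .. 10^3 for the RBF gamma (-G) and the cost C (-C);
# the property names are assumed to be LibSVM's "gamma" and "cost"
gamma = MathParameter()
gamma.prop = "gamma"
gamma.minimum = -3.0
gamma.maximum = 3.0
gamma.step = 1.0
gamma.base = 10.0
gamma.expression = "pow(BASE,I)"

cost = MathParameter()
cost.prop = "cost"
cost.minimum = -3.0
cost.maximum = 3.0
cost.step = 1.0
cost.base = 10.0
cost.expression = "pow(BASE,I)"

grid = MultiSearch()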
grid.evaluation = "ACC" grid.parameters = [gamma, cost] # LibSVM is added to grid configuration grid.classifier = classifier # Search for the best parameters and build a classifier with them grid.build_classifier(trainData) best = grid.best best.build_classifier(trainData) print(best.options) print("C", best.options[best.options.index("-C")+1]) print("gamma", best.options[best.options.index("-G")+1]) print("\n\n=========== Train results ================\n\n") print(grid) evaluation = Evaluation(trainData) evaluation.test_model(best, trainData) print(best.to_commandline()) print(evaluation.matrix()) print("Train recognition: %0.2f%%" % evaluation.percent_correct) print("\n\n=========== Test results ================\n\n") evaluation = Evaluation(testData) evaluation.test_model(best, testData) print(best.to_commandline()) print(evaluation.matrix()) print("Test recognition: %0.2f%%" % evaluation.percent_correct) jvm.stop()