def f_smote():
    jvm.start()
    train_data, test_data = b_i_impute_data()
    train_data = train_data[:10000]
    y_train = train_data["class"]
    x_train = train_data.drop("class", axis=1)
    # oversample the minority class with SMOTE before handing the data to Weka
    sm = SMOTE(ratio="minority")
    x_train_sm, y_train_sm = sm.fit_sample(x_train, y_train)
    x_train_sm_df = pd.DataFrame(x_train_sm, columns=x_train.columns)
    y_train_sm_df = pd.DataFrame(y_train_sm, columns=["class"])
    train_data_sm_df = pd.concat([y_train_sm_df, x_train_sm_df], axis=1)
    print_f("smote train data shape", train_data_sm_df.shape)
    train_data_sm_df.to_csv("./train_data_sm.csv", index=False)
    train_data_sm = converters.load_any_file("train_data_sm.csv")
    train_data_sm.class_is_first()
    test_data = converters.load_any_file("test_data.csv")
    test_data.class_is_first()
    cls = Classifier(classname="weka.classifiers.trees.LMT")
    print_f("building classifier")
    cls.build_classifier(train_data_sm)
    print_f("evaluating")
    evl = Evaluation(train_data_sm)
    evl.crossvalidate_model(cls, train_data_sm, 5, Random(1))
    print_f("Train Accuracy:", evl.percent_correct)
    print_f("Train summary")
    print_f(evl.summary())
    print_f("Train class details")
    print_f(evl.class_details())
    print_f("Train confusion matrix")
    print_f(evl.confusion_matrix)
    plcls.plot_roc(evl, class_index=[0, 1], wait=True,
                   outfile="./plots/2_f_smote_10k.png")
    plt.suptitle("Train ROC Curve", fontsize=20, y=0.95)
    evl = Evaluation(test_data)
    print_f("testing model")
    evl.test_model(cls, test_data)
    print_f("Test Accuracy:", evl.percent_correct)
    print_f("Test summary")
    print_f(evl.summary())
    print_f("Test class details")
    print_f(evl.class_details())
    print_f("Test confusion matrix")
    print_f(evl.confusion_matrix)
    plcls.plot_roc(evl, class_index=[0, 1], wait=True)
    plt.suptitle("Test ROC Curve", fontsize=20, y=0.95)
    savefig("./plots/f_test_roc_curve.png")
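# Note: SMOTE(ratio=...) and fit_sample() above are the pre-0.4
# imbalanced-learn API and fail on current releases. A minimal sketch of the
# same oversampling step with the modern API (ratio -> sampling_strategy,
# fit_sample -> fit_resample); the "class" column layout follows f_smote above.
import pandas as pd
from imblearn.over_sampling import SMOTE


def smote_minority(x_train, y_train):
    sm = SMOTE(sampling_strategy="minority", random_state=1)
    x_sm, y_sm = sm.fit_resample(x_train, y_train)
    # rebuild the training frame with the class column first, as f_smote does
    return pd.concat([pd.DataFrame(y_sm, columns=["class"]),
                      pd.DataFrame(x_sm, columns=x_train.columns)], axis=1)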
def run_naive_bayes_crossval(self, output_directory):
    # build classifier
    print("\nBuilding Classifier on training data.")
    buildTimeStart = time.time()
    cls = Classifier(classname="weka.classifiers.bayes.NaiveBayes")
    cls.build_classifier(self.training_data)
    resultsString = ""
    resultsString = self.print_both(str(cls), resultsString)
    buildTimeString = "NB Cross Eval Classifier Built in " + str(time.time() - buildTimeStart) + " secs.\n"
    resultsString = self.print_both(buildTimeString, resultsString)

    # evaluate classifier by cross-validation on the training data
    resultsString = self.print_both("\nCross Evaluating on training data.", resultsString)
    buildTimeStart = time.time()
    evl = Evaluation(self.training_data)
    evl.crossvalidate_model(cls, self.training_data, 10, Random(1))
    resultsString = self.print_both(str(evl.summary()), resultsString)
    resultsString = self.print_both(str(evl.class_details()), resultsString)
    resultsString = self.print_both(str(evl.confusion_matrix), resultsString)
    buildTimeString = "\nNB Cross Eval Classifier Evaluated in " + str(time.time() - buildTimeStart) + " secs.\n"
    resultsString = self.print_both(buildTimeString, resultsString)

    # save results and clean up
    self.save_results("Naive_Bayes_Crossval", resultsString, output_directory)
def Boost_J48(data, rnm):
    data.class_is_last()
    fc1 = FilteredClassifier()
    fc1.classifier = Classifier(classname="weka.classifiers.trees.J48",
                                options=["-C", "0.25", "-M", "2"])
    fc1.filter = Filter(classname="weka.filters.unsupervised.attribute.Remove",
                        options=["-R", "first"])
    fc2 = SingleClassifierEnhancer(classname="weka.classifiers.meta.AdaBoostM1",
                                   options=["-P", "100", "-S", "1", "-I", "10"])
    fc2.classifier = fc1
    pred_output = PredictionOutput(classname="weka.classifiers.evaluation.output.prediction.CSV",
                                   options=["-p", "1"])
    folds = 10
    fc2.build_classifier(data)
    evaluation = Evaluation(data)
    evaluation.crossvalidate_model(fc2, data, folds, Random(1), pred_output)
    # write the boosted tree model, the predictions and the evaluation reports
    with open(rnm + '_Boost_J48_Tree.txt', 'w') as f0:
        print("Filename: ", rnm, file=f0)
        print('\n\n', file=f0)
        print(str(fc2), file=f0)
    with open(rnm + '_Boost_J48_Prediction.txt', 'w') as f1:
        print('Filename:', rnm, file=f1)
        print('Prediction Summary:', pred_output.buffer_content(), file=f1)
    with open(rnm + '_Boost_j48_Evaluation.txt', 'w') as f2:
        print('Filename:', rnm, file=f2)
        print('Evaluation Summary:', evaluation.summary(), file=f2)
        print('\n\n\n', file=f2)
        print(evaluation.class_details(), file=f2)
    plot_roc(evaluation, class_index=[0, 1], title=rnm, key_loc='best',
             outfile=rnm + '_Boost_J48_ROC.png', wait=False)
    value_Boost_J48 = str(evaluation.percent_correct)
    return value_Boost_J48
def RandomTree(data, rnm):
    data.class_is_last()
    fc = FilteredClassifier()
    fc.classifier = Classifier(classname="weka.classifiers.trees.RandomTree",
                               options=["-K", "0", "-M", "1.0", "-V", "0.001", "-S", "1"])
    fc.filter = Filter(classname="weka.filters.unsupervised.attribute.Remove",
                       options=["-R", "first"])
    pred_output = PredictionOutput(classname="weka.classifiers.evaluation.output.prediction.CSV",
                                   options=["-p", "1"])
    folds = 10
    evl = Evaluation(data)
    evl.crossvalidate_model(fc, data, folds, Random(1), pred_output)
    fc.build_classifier(data)
    # write the tree model, the predictions and the evaluation reports
    with open(rnm + '_RT_Tree.txt', 'w') as f0:
        print("Filename: ", rnm, file=f0)
        print('\n\n', file=f0)
        print(str(fc), file=f0)
    with open(rnm + '_RT_Prediction.txt', 'w') as f1:
        print('Filename:', rnm, file=f1)
        print('Prediction Summary:', pred_output.buffer_content(), file=f1)
    with open(rnm + '_RT_Evaluation.txt', 'w') as f2:
        print('Filename:', rnm, file=f2)
        print('Evaluation Summary:', evl.summary(), file=f2)
        print('\n\n\n', file=f2)
        print(evl.class_details(), file=f2)
    plot_roc(evl, class_index=[0, 1], title=rnm, key_loc='best',
             outfile=rnm + '_RT_ROC.png', wait=False)
    value_RT = str(evl.percent_correct)
    return value_RT
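# A hypothetical driver for the two report functions above; the dataset path
# and the result-name prefix are placeholders, not from the original code.
import weka.core.jvm as jvm
from weka.core.converters import Loader

jvm.start()
try:
    data = Loader(classname="weka.core.converters.ArffLoader").load_file("dataset.arff")
    acc_boost = Boost_J48(data, "run1")  # writes run1_Boost_J48_*.txt and a ROC png
    acc_rt = RandomTree(data, "run1")    # writes run1_RT_*.txt and a ROC png
    print("AdaBoost+J48: %s%%, RandomTree: %s%%" % (acc_boost, acc_rt))
finally:
    jvm.stop()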
def runSMO(file, bound):
    loader = Loader(classname="weka.core.converters.CSVLoader")
    data = loader.load_file(file)
    data.class_is_first()
    remove = Filter(classname="weka.filters.unsupervised.attribute.Remove",
                    options=["-R", bound])
    cls = KernelClassifier(classname="weka.classifiers.functions.SMO",
                           options=["-C", "1.0", "-L", "0.001", "-P", "1.0E-12", "-N", "0"])
    kernel = Kernel(classname="weka.classifiers.functions.supportVector.PolyKernel",
                    options=["-C", "250007", "-E", "1.0"])
    cls.kernel = kernel
    pout = PredictionOutput(classname="weka.classifiers.evaluation.output.prediction.PlainText")
    remove.inputformat(data)
    filtered = remove.filter(data)
    evl = Evaluation(filtered)
    evl.crossvalidate_model(cls, filtered, 10, Random(1), pout)
    # print(pout.buffer_content())
    print(evl.percent_correct)
    # print(evl.summary())
    result = evl.class_details()
    print(result)
    return result
def run_bayesNet(file):
    # Get filename from Pathlib object
    filename = file.parts[-1]
    dir = file.parents[0]
    print("Running BayesNet on %s" % filename)
    if not filename.endswith(".arff"):
        print("%s not ARFF file." % filename)
        return
    # Removes '.arff' from filename
    filename_base = filename[:-5]
    # Load data with class as first attr
    data = load_Arff_file(file)
    data.class_is_first()
    # Use BayesNet with a TAN search and set options
    cls = Classifier(classname="weka.classifiers.bayes.BayesNet", options=[
        "-D", "-Q", "weka.classifiers.bayes.net.search.local.TAN", "--",
        "-P", "1", "-S", "BAYES", "-E",
        "weka.classifiers.bayes.net.estimate.SimpleEstimator", "--", "-A", "0.5"])
    # Predictions stored in pout
    pout = PredictionOutput(classname="weka.classifiers.evaluation.output.prediction.PlainText")
    # Evaluate data
    evaluation = Evaluation(data)
    evaluation.crossvalidate_model(cls, data, 10, Random(1), output=pout)
    print(evaluation.summary())
    print(evaluation.class_details())
    print(evaluation.confusion_matrix)
    # Generate grid for ROC
    # plcls.plot_roc(evaluation, class_index=[0, 1], wait=True)
    # mk dirs for output
    dir = dir / "bayesNet_results"
    dir.mkdir(parents=True, exist_ok=True)
    # Save summary, class details and confusion matrix to file
    result_output = filename_base + "_bayesNet_eval_results_TAN.txt"
    output_eval(evaluation, dir / result_output)
    # Save the predicted results to file
    prediction_output = filename_base + "_bayesNet_pred_results_TAN.txt"
    output_pred(pout, dir / prediction_output)
    print("BayesNet complete")
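# A possible invocation of run_bayesNet; the ARFF path is a placeholder, and
# load_Arff_file / output_eval / output_pred are assumed to be the project's
# own helpers defined elsewhere.
from pathlib import Path
import weka.core.jvm as jvm

jvm.start()
try:
    run_bayesNet(Path("data") / "reviews.arff")  # results land in data/bayesNet_results/
finally:
    jvm.stop()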
def e_model_tree():
    # train_data, test_data = b_i_impute_data()
    # train_data.to_csv("./train_data.csv", index=False)
    # test_data.to_csv("./test_data.csv", index=False)
    jvm.start()
    train_data = converters.load_any_file("train_data.csv")
    train_data.class_is_first()
    test_data = converters.load_any_file("test_data.csv")
    test_data.class_is_first()
    cls = Classifier(classname="weka.classifiers.trees.LMT")
    print("building classifier")
    cls.build_classifier(train_data)
    print("cross-validating")
    evl = Evaluation(train_data)
    evl.crossvalidate_model(cls, train_data, 5, Random(1))
    print("Train Accuracy:", evl.percent_correct)
    print("Train summary")
    print(evl.summary())
    print("Train class details")
    print(evl.class_details())
    print("Train confusion matrix")
    print(evl.confusion_matrix)
    plcls.plot_roc(evl, class_index=[0, 1], wait=True)
    plt.suptitle("Train ROC Curve", fontsize=20, y=0.95)
    savefig("./plots/e_train_roc_curve.png")
    evl = Evaluation(test_data)
    evl.test_model(cls, test_data)
    print("Test Accuracy:", evl.percent_correct)
    print("Test summary")
    print(evl.summary())
    print("Test class details")
    print(evl.class_details())
    print("Test confusion matrix")
    print(evl.confusion_matrix)
    plcls.plot_roc(evl, class_index=[0, 1], wait=True)
    plt.suptitle("Test ROC Curve", fontsize=20, y=0.95)
    savefig("./plots/e_test_roc_curve.png")
def obtainSVM(file):
    data = converters.load_any_file(folderPathOfArffFiles + file + ".arff")
    remove = Filter(classname="weka.filters.unsupervised.attribute.Remove",
                    options=["-R", "1-2"])
    remove.inputformat(data)
    data = remove.filter(data)
    data.class_is_last()
    classifier = Classifier(classname="weka.classifiers.functions.LibSVM")
    evaluation = Evaluation(data)
    evaluation.crossvalidate_model(classifier, data, kFold, Random(42))
    # The ROC-AUC is sliced out of Weka's class-details string at fixed
    # offsets, which only works while the report layout stays the same.
    info = evaluation.class_details()
    roc_area = float(info[406:411])
    return roc_area
def crossValidate(self, arrfFile=None, classname="weka.classifiers.trees.J48", options=["-C", "0.3"]):
    if arrfFile is not None:
        self.initData(arrfFile)
    if self.data is None:
        return
    print('Classifier ' + str(classname) + ' ' + ' '.join(options))
    cls = Classifier(classname=classname, options=options)
    evl = Evaluation(self.data)
    evl.crossvalidate_model(cls, self.data, 10, Random(1))
    print(evl.percent_correct)
    print(evl.summary())
    print(evl.class_details())
def run_bayes_hill_split(self, output_directory, parents=1):
    # build classifier
    print("\nBuilding Bayes Classifier on training data. Parents = " + str(parents) + "\n")
    buildTimeStart = time.time()
    cls = Classifier(
        classname="weka.classifiers.bayes.BayesNet",
        options=["-D", "-Q", "weka.classifiers.bayes.net.search.local.HillClimber", "--",
                 "-P", "" + str(parents), "-S", "BAYES", "-E",
                 "weka.classifiers.bayes.net.estimate.SimpleEstimator", "--", "-A", "0.5"])
    cls.build_classifier(self.training_data)
    resultsString = ""
    resultsString = self.print_both(str(cls), resultsString)
    buildTimeString = "Bayes Split Classifier Built in " + str(time.time() - buildTimeStart) + " secs.\n"
    resultsString = self.print_both(buildTimeString, resultsString)

    # evaluate classifier on the held-out test data
    resultsString = self.print_both("\nEvaluating on test data.", resultsString)
    buildTimeStart = time.time()
    evl = Evaluation(self.training_data)
    evl.test_model(cls, self.testing_data)
    resultsString = self.print_both(str(evl.summary()), resultsString)
    resultsString = self.print_both(str(evl.class_details()), resultsString)
    resultsString = self.print_both(str(evl.confusion_matrix), resultsString)
    buildTimeString = "\nBayes Split Classifier Evaluated in " + str(time.time() - buildTimeStart) + " secs.\n"
    resultsString = self.print_both(buildTimeString, resultsString)

    # save results and clean up
    self.save_results("Bayes_Hill_P" + str(parents) + "_", resultsString, output_directory)
    self.save_results("Bayes_Hill_P" + str(parents) + "_Graph", cls.graph, output_directory, True)
def cross_validate(self, detail=True):
    """Perform cross validation using trained data.

    Parameters
    ----------
    detail : boolean, optional, default = True
        If true return detailed information of cross validation.

    Returns
    -------
    info : string
        Info with results of cross validation.
    """
    start_time = TimeUtils.get_time()
    info = "Scheme:\t%s %s\n" % (str(self.classifier.classname),
                                 " ".join([str(option) for option in self.classifier.options]))
    if detail:
        info += "Relation:\t%s\n" % self.data.relationname
        info += "Instances:\t%d\n" % self.data.num_instances
        info += "Attributes:\t%d\n\n" % self.data.num_attributes

    evl = WEvaluation(self.data)
    evl.crossvalidate_model(self.classifier, self.data, 10, WRandom(1))

    if not detail:
        info += "Correctly Classified Instances: %0.4f%%\n" % evl.percent_correct
        info += "Time taken to build model: %0.5f seconds\n\n" % (TimeUtils.get_time() - start_time)
    else:
        info += "=== Stratified cross-validation ===\n"
        info += evl.summary() + "\n\n"
        info += str(evl.class_details()) + "\n\n"
        classes = [str(self.data.class_attribute.value(i))
                   for i in range(self.data.class_attribute.num_values)]
        cm = evl.confusion_matrix
        info += Classifier.confusion_matrix(classes, cm)
    return info
def run_crossval(self, output_directory, classifier_name, classifier_weka_spec, options_list):
    # build classifier
    print("\nBuilding " + classifier_name + " Classifier on training data.")
    buildTimeStart = time.time()
    cls = Classifier(classname=classifier_weka_spec, options=options_list)
    cls.build_classifier(self.training_data)
    resultsString = ""
    resultsString = self.print_both(str(cls), resultsString)
    buildTimeString = classifier_name + " Cross Eval Classifier Built in " + str(time.time() - buildTimeStart) + " secs.\n"
    resultsString = self.print_both(buildTimeString, resultsString)

    # evaluate classifier by cross-validation on the training data
    resultsString = self.print_both("\nCross Evaluating on training data.", resultsString)
    buildTimeStart = time.time()
    evl = Evaluation(self.training_data)
    evl.crossvalidate_model(cls, self.training_data, 10, Random(1))
    resultsString = self.print_both(str(evl.summary()), resultsString)
    resultsString += "\n"
    resultsString = self.print_both(str(evl.class_details()), resultsString)
    resultsString += "\n"
    resultsString = self.print_both(str(evl.confusion_matrix), resultsString)
    buildTimeString = "\n\n" + classifier_name + " Cross Eval Classifier Evaluated in " + str(time.time() - buildTimeStart) + " secs.\n"
    resultsString = self.print_both(buildTimeString, resultsString)

    # build a filesystem-safe suffix from the options
    options_string = ""
    for option in options_list:
        options_string = options_string + str(option)
    options_string = options_string.replace(".", "-")
    options_string = options_string.replace("-", "_")

    # save results and clean up
    self.save_results(classifier_name + options_string + "_Crossval", resultsString, output_directory)
def runBayes(file, bound):
    loader = Loader(classname="weka.core.converters.CSVLoader")
    data = loader.load_file(file)
    data.class_is_first()
    remove = Filter(classname="weka.filters.unsupervised.attribute.Remove",
                    options=["-R", bound])
    cls = Classifier(classname="weka.classifiers.bayes.NaiveBayes")
    remove.inputformat(data)
    filtered = remove.filter(data)
    evl = Evaluation(filtered)
    evl.crossvalidate_model(cls, filtered, 10, Random(1))
    print(evl.percent_correct)
    # print(evl.summary())
    result = evl.class_details()
    print(result)
    return result
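# A hypothetical driver for runSMO and runBayes above: both take a CSV path
# and an attribute range for the Remove filter's "-R" option. The path and
# range here are placeholders.
import weka.core.jvm as jvm

jvm.start()
try:
    svm_details = runSMO("features.csv", "1")   # drop attribute 1 before CV
    nb_details = runBayes("features.csv", "1")  # same range, NaiveBayes
finally:
    jvm.stop()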
def TrainingModel(arff, modelOutput, clsfier):
    # start the JVM
    jvm.start()
    # load the training set
    loader = Loader(classname="weka.core.converters.ArffLoader")
    train = loader.load_file(arff)
    train.class_is_first()
    # Train with the given algorithm (RandomForest was chosen originally:
    # after trying several learners in the Weka GUI, it gave the highest
    # TPR and TNR).
    cls_name = "weka.classifiers." + clsfier
    clsf = Classifier(classname=cls_name)
    clsf.build_classifier(train)
    print(clsf)
    # build the model and cross-validate it
    fc = FilteredClassifier()
    fc.classifier = clsf
    evl = Evaluation(train)
    evl.crossvalidate_model(fc, train, 10, Random(1))
    print(evl.percent_correct)
    print(evl.summary())
    print(evl.class_details())
    print(evl.matrix())
    # tally the confusion-matrix results
    matrixResults = evl.confusion_matrix
    TN = float(matrixResults[0][0])
    FP = float(matrixResults[0][1])
    FN = float(matrixResults[1][0])
    TP = float(matrixResults[1][1])
    TPR = TP / (TP + FN)
    TNR = TN / (FP + TN)
    PPV = TP / (TP + FP)
    NPV = TN / (TN + FN)
    print("Algorithm: " + clsfier)
    print("Sensitivity TPR: " + str(TPR))
    print("Specificity TNR: " + str(TNR))
    print("PPV: " + str(PPV))
    print("NPV: " + str(NPV))
    # save the model
    clsf.serialize(modelOutput, header=train)
    # stop the JVM
    jvm.stop()
    print("Model building complete")
def run_ibk_crossval(self, output_directory):
    # build classifier
    print("\nBuilding Classifier on training data.")
    buildTimeStart = time.time()
    cls = Classifier(
        classname="weka.classifiers.lazy.IBk",
        options=["-K", "3", "-W", "0", "-A",
                 "weka.core.neighboursearch.LinearNNSearch -A \"weka.core.EuclideanDistance -R first-last\""])
    cls.build_classifier(self.training_data)
    resultsString = ""
    resultsString = self.print_both(str(cls), resultsString)
    buildTimeString = "IBK Cross Eval Classifier Built in " + str(time.time() - buildTimeStart) + " secs.\n"
    resultsString = self.print_both(buildTimeString, resultsString)

    # evaluate classifier by cross-validation on the training data
    resultsString = self.print_both("\nCross Evaluating on training data.", resultsString)
    buildTimeStart = time.time()
    evl = Evaluation(self.training_data)
    evl.crossvalidate_model(cls, self.training_data, 10, Random(1))
    resultsString = self.print_both(str(evl.summary()), resultsString)
    resultsString = self.print_both(str(evl.class_details()), resultsString)
    resultsString = self.print_both(str(evl.confusion_matrix), resultsString)
    buildTimeString = "\nIBK Cross Eval Classifier Evaluated in " + str(time.time() - buildTimeStart) + " secs.\n"
    resultsString = self.print_both(buildTimeString, resultsString)

    # save results and clean up
    self.save_results("IBK_Crossval", resultsString, output_directory)
def obtainBayesNet(file):
    # The path of the arff extension file must be put.
    data = converters.load_any_file(folderPathOfArffFiles + file + ".arff")
    # In the case of this specific data set, the first two attributes were
    # removed since they represent the name and ranking, which are unique
    # values that would affect the classification. Depending on the data set,
    # certain attributes must be removed.
    remove = Filter(classname="weka.filters.unsupervised.attribute.Remove",
                    options=["-R", "1-2"])
    remove.inputformat(data)
    data = remove.filter(data)
    # It is specified that the class value is the last attribute.
    data.class_is_last()
    # Define the classifier to be used.
    classifier = Classifier(classname="weka.classifiers.bayes.BayesNet")
    evaluation = Evaluation(data)
    evaluation.crossvalidate_model(classifier, data, kFold, Random(42))
    # The ROC-AUC is extracted from the string that is received from Weka.
    info = evaluation.class_details()
    roc_area = float(info[406:411])
    return roc_area
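# Slicing class_details() at fixed character offsets (here and in obtainSVM
# above) breaks as soon as attribute or class names change width. The
# Evaluation API exposes the same statistic directly; a sketch, assuming a
# binary class whose value of interest sits at index 1:
def obtainRocArea(evaluation, class_index=1):
    # per-class AUC, the same number the class-details report prints
    return evaluation.area_under_roc(class_index)

# alternatively, averaged over all class values:
# roc_area = evaluation.weighted_area_under_roc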
def train(training_dataset_path, model_cache_file_name, evaluation_is_on, summary_file_path):
    """Model Training function

    The function uses the WEKA machine learning library, implemented by the
    python-weka-wrapper Python library. Divides the data into given folds,
    and does the training and evaluation. The trained model is copied to the
    __predictors global variable and also saved (together with the training
    data set) to the model_cache_file_name file. The evaluation summary is
    written to summary_file_path.

    Args:
        :param training_dataset_path: the path of the input arff file.
        :param model_cache_file_name: base name of the model cache file.
        :param evaluation_is_on: run evaluation after training (true / false)
        :param summary_file_path: the path of the model evaluation summary file.

    Returns:
        None
    """
    global __classifiers
    global __predictors
    training_data = converters.load_any_file(training_dataset_path)
    training_data.class_is_last()
    lines = []
    summaries = []
    summary_line = ['Model'.ljust(16), 'Precision'.ljust(12), 'Recall'.ljust(12),
                    'F-measure'.ljust(12), 'Accuracy'.ljust(12), 'FPR'.ljust(12)]
    summaries.append('\t'.join(summary_line))
    for classifier, option_str in __classifiers.items():
        option_list = re.findall(r'"(?:[^"]+)"|(?:[^ ]+)', option_str)
        option_list = [s.replace('"', '') for s in option_list]
        classifier_name = classifier.split('.')[-1]
        info_str = "Using classifier: {classifier}, options: {options}".format(
            classifier=classifier_name, options=str(option_list))
        localizer_log.msg(info_str)
        lines.append(info_str)

        # Train
        cls = Classifier(classname=classifier, options=option_list)
        localizer_log.msg("Start building classifier")
        cls.build_classifier(training_data)
        localizer_log.msg("Completed building classifier")
        localizer_log.msg("Saving trained model to {model_cache_name}".format(
            model_cache_name=model_cache_file_name))
        # localizer_config.save_model(cls, training_data, model_cache_file_name)
        path = os.path.join('caches', 'model')
        if not os.path.exists(path):
            os.makedirs(path, exist_ok=True)
        path = os.path.join(path, model_cache_file_name + '.cache')
        cls.serialize(path)
        localizer_log.msg("Trained model saved")
        classifier2, _ = Classifier.deserialize(path)
        print(classifier2)
        __predictors[classifier_name] = cls

        if evaluation_is_on:
            # Model evaluation via 10-fold cross-validation
            localizer_log.msg("Start evaluating classifier")
            evl = Evaluation(training_data)
            localizer_log.msg("Start cross-validating classifier")
            evl.crossvalidate_model(cls, training_data, 10, Random(1))
            localizer_log.msg("Complete cross-validating classifier")
            lines.append(evl.summary())
            lines.append(evl.class_details())
            summary_line = []
            summary_line.append(classifier_name.ljust(16))
            summary_line.append("{:.3f}".format(evl.weighted_precision * 100).ljust(12))
            summary_line.append("{:.3f}".format(evl.weighted_recall * 100).ljust(12))
            summary_line.append("{:.3f}".format(evl.weighted_f_measure * 100).ljust(12))
            summary_line.append("{:.3f}".format(evl.percent_correct).ljust(12))
            summary_line.append("{:.3f}".format(evl.weighted_false_positive_rate * 100).ljust(12))
            summaries.append('\t'.join(summary_line))

    # Save evaluation summary to file
    with open(summary_file_path, 'w') as f:
        f.writelines('\n'.join(lines))
        f.writelines('\n' * 5)
        f.writelines('\n'.join(summaries))
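# A possible call to train(); paths are placeholders, and __classifiers is
# assumed to be populated elsewhere in the module, mapping Weka class names
# to option strings, e.g.:
# __classifiers = {"weka.classifiers.trees.J48": '-C 0.25 -M 2'}
import weka.core.jvm as jvm

jvm.start()
try:
    train("data/train.arff", "localizer_v1", True, "results/summary.txt")
finally:
    jvm.stop()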
def run_multilayerPercepton(file, file2=None):
    # Get filename from Pathlib object
    filename = file.parts[-1]
    dir = file.parents[0]
    print("Running Multilayer Perceptron on %s" % filename)
    if not filename.endswith(".arff"):
        print("%s not ARFF file." % filename)
        return
    # Removes '.arff' from filename
    filename_base = filename[:-5]
    print("loading data...")
    # Load data with class as first attr
    data = load_Arff_file(file)
    data.class_is_first()
    # If a 2nd file is given, load it as the test set
    # (the evaluation loop below expects it, so file2 is effectively required)
    if file2:
        print("Loading test...")
        test = load_Arff_file(file2)
        test.class_is_first()
    file_names = [
        "MP_N-500_default_H-1",
        "MP_N-500_H-3", "MP_N-500_H-5", "MP_N-500_H-7",
        "MP_N-500_H-3-5", "MP_N-500_H-5-3",
        "MP_N-500_H-3-5-7", "MP_N-500_H-7-3-5", "MP_N-500_H-5-7-3",
        "MP_N-500_L-01", "MP_N-500_L-02", "MP_N-500_L-04", "MP_N-500_L-05",
        "MP_N-500_M-01", "MP_N-500_M-03", "MP_N-500_M-04", "MP_N-500_M-05",
        "MP_N-500_E-5", "MP_N-500_E-10", "MP_N-500_E-15", "MP_N-500_E-25",
    ]
    options_list = [
        ["-L", "0.3", "-M", "0.2", "-N", "500", "-V", "0", "-S", "0", "-E", "20", "-H", "1"],        # DEFAULT
        ["-L", "0.3", "-M", "0.2", "-N", "500", "-V", "0", "-S", "0", "-E", "20", "-H", "3"],        # -H START
        ["-L", "0.3", "-M", "0.2", "-N", "500", "-V", "0", "-S", "0", "-E", "20", "-H", "5"],
        ["-L", "0.3", "-M", "0.2", "-N", "500", "-V", "0", "-S", "0", "-E", "20", "-H", "7"],
        ["-L", "0.3", "-M", "0.2", "-N", "500", "-V", "0", "-S", "0", "-E", "20", "-H", "3, 5"],
        ["-L", "0.3", "-M", "0.2", "-N", "500", "-V", "0", "-S", "0", "-E", "20", "-H", "5, 3"],
        ["-L", "0.3", "-M", "0.2", "-N", "500", "-V", "0", "-S", "0", "-E", "20", "-H", "3, 5, 7"],
        ["-L", "0.3", "-M", "0.2", "-N", "500", "-V", "0", "-S", "0", "-E", "20", "-H", "7, 3, 5"],
        ["-L", "0.3", "-M", "0.2", "-N", "500", "-V", "0", "-S", "0", "-E", "20", "-H", "5, 7, 3"],  # -H END
        ["-L", "0.1", "-M", "0.2", "-N", "500", "-V", "0", "-S", "0", "-E", "20", "-H", "1"],        # -L START
        ["-L", "0.2", "-M", "0.2", "-N", "500", "-V", "0", "-S", "0", "-E", "20", "-H", "1"],
        ["-L", "0.4", "-M", "0.2", "-N", "500", "-V", "0", "-S", "0", "-E", "20", "-H", "1"],
        ["-L", "0.5", "-M", "0.2", "-N", "500", "-V", "0", "-S", "0", "-E", "20", "-H", "1"],        # -L END
        ["-L", "0.3", "-M", "0.1", "-N", "500", "-V", "0", "-S", "0", "-E", "20", "-H", "1"],        # -M START
        ["-L", "0.3", "-M", "0.3", "-N", "500", "-V", "0", "-S", "0", "-E", "20", "-H", "1"],
        ["-L", "0.3", "-M", "0.4", "-N", "500", "-V", "0", "-S", "0", "-E", "20", "-H", "1"],
        ["-L", "0.3", "-M", "0.5", "-N", "500", "-V", "0", "-S", "0", "-E", "20", "-H", "1"],        # -M END
        ["-L", "0.3", "-M", "0.2", "-N", "500", "-V", "0", "-S", "0", "-E", "5", "-H", "1"],         # -E START
        ["-L", "0.3", "-M", "0.2", "-N", "500", "-V", "0", "-S", "0", "-E", "10", "-H", "1"],
        ["-L", "0.3", "-M", "0.2", "-N", "500", "-V", "0", "-S", "0", "-E", "15", "-H", "1"],
        ["-L", "0.3", "-M", "0.2", "-N", "500", "-V", "0", "-S", "0", "-E", "25", "-H", "1"],        # -E END
    ]
    for i in range(len(options_list)):
        start = time.time()
        print("Beginning iteration " + str(i) + ": " + file_names[i])
        # Use MultilayerPerceptron and set options
        cls = Classifier(classname="weka.classifiers.functions.MultilayerPerceptron",
                         options=options_list[i])
        # Build classifier with train data
        cls.build_classifier(data)
        # Predictions stored in pout
        pout = PredictionOutput(classname="weka.classifiers.evaluation.output.prediction.PlainText")
        # Evaluate on test data
        evaluation = Evaluation(data)
        evaluation.test_model(cls, test, output=pout)
        print(evaluation.summary())
        print(evaluation.class_details())
        print(evaluation.confusion_matrix)
        # Generate grid for ROC
        # plcls.plot_roc(evaluation, class_index=[0, 1], wait=True)
        # mk dirs for output
        tempdir = dir / "Results/" / "MP-ALL_N-500_results/" / (file_names[i] + "_results/")
        tempdir.mkdir(parents=True, exist_ok=True)
        # Save summary, class details and confusion matrix to file
        result_output = file_names[i] + "_results.txt"
        output_eval(evaluation, tempdir / result_output)
        # Save the predicted results to file
        prediction_output = file_names[i] + "_prediction.txt"
        output_pred(pout, tempdir / prediction_output)
        end = time.time()
        timetaken = round(end - start, 2)
        print("Time taken to run iteration " + str(i) + ": %s seconds" % timetaken)
    print("Multilayer Perceptron complete")
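# A hypothetical invocation; paths are placeholders. file2 supplies the
# held-out test ARFF that the evaluation loop expects.
from pathlib import Path
import weka.core.jvm as jvm

jvm.start()
try:
    run_multilayerPercepton(Path("data/train.arff"), Path("data/test.arff"))
finally:
    jvm.stop()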
import weka.core.jvm as jvm
from weka.classifiers import Classifier, Evaluation
from weka.core.classes import Random
from weka.core import converters

# Weka classifiers need Weka Instances, not a pandas DataFrame, so the CSV is
# loaded through python-weka-wrapper's converters; the original shuffled the
# built-in `input` by mistake, so the data is randomized via Weka instead.
jvm.start()
data = converters.load_any_file("data2.csv")
data.class_is_last()
data.randomize(Random(1))
# LMT does not accept the J48-style "-C 0.3" pair (LMT's -C is a flag),
# so the classifier is built with its defaults here.
cls = Classifier(classname="weka.classifiers.trees.LMT")
cls.build_classifier(data)
evl = Evaluation(data)
evl.crossvalidate_model(cls, data, 10, Random(1))  # was `fc`, an undefined name
print(evl.percent_correct)
print(evl.summary())
print(evl.class_details())
jvm.stop()
from weka.core.converters import Loader
from weka.classifiers import Evaluation, Kernel, KernelClassifier, PredictionOutput
from weka.core.classes import Random

loader = Loader("weka.core.converters.ArffLoader")
review_data = loader.load_file("reviewsinformation_task2.arff")
review_data.class_is_last()

# kernel classifier
print("Creating SMO as KernelClassifier")
kernel = Kernel(classname="weka.classifiers.functions.supportVector.RBFKernel",
                options=["-G", "0.001"])
classifier = KernelClassifier(classname="weka.classifiers.functions.SMO", options=["-M"])
classifier.kernel = kernel
classifier.build_classifier(review_data)
print("classifier: " + classifier.to_commandline())
print("model:\n" + str(classifier))

# cross-validate on the loaded reviews data; Evaluation takes the Instances
# object, not a filename (the original passed 'test_data.arff' and referenced
# an undefined diabetes_data / pred_output)
pred_output = PredictionOutput(classname="weka.classifiers.evaluation.output.prediction.PlainText")
evaluation = Evaluation(review_data)
evaluation.crossvalidate_model(classifier, review_data, 10, Random(42), output=pred_output)
print(evaluation.summary())
print(evaluation.class_details())
print(evaluation.matrix())
def main():
    """
    Just runs some example code.
    """

    # load a dataset
    iris_file = helper.get_data_dir() + os.sep + "iris.arff"
    helper.print_info("Loading dataset: " + iris_file)
    loader = Loader("weka.core.converters.ArffLoader")
    iris_data = loader.load_file(iris_file)
    iris_data.class_is_last()

    # classifier help
    helper.print_title("Creating help string")
    classifier = Classifier(classname="weka.classifiers.trees.J48")
    print(classifier.to_help())

    # partial classname
    helper.print_title("Creating classifier from partial classname")
    clsname = ".J48"
    classifier = Classifier(classname=clsname)
    print(clsname + " --> " + classifier.classname)

    # classifier from commandline
    helper.print_title("Creating SMO from command-line string")
    cmdline = 'weka.classifiers.functions.SMO -K "weka.classifiers.functions.supportVector.NormalizedPolyKernel -E 3.0"'
    classifier = from_commandline(cmdline, classname="weka.classifiers.Classifier")
    classifier.build_classifier(iris_data)
    print("input: " + cmdline)
    print("output: " + classifier.to_commandline())
    print("model:\n" + str(classifier))

    # kernel classifier
    helper.print_title("Creating SMO as KernelClassifier")
    kernel = Kernel(classname="weka.classifiers.functions.supportVector.RBFKernel",
                    options=["-G", "0.001"])
    classifier = KernelClassifier(classname="weka.classifiers.functions.SMO", options=["-M"])
    classifier.kernel = kernel
    classifier.build_classifier(iris_data)
    print("classifier: " + classifier.to_commandline())
    print("model:\n" + str(classifier))

    # build a classifier and output model
    helper.print_title("Training J48 classifier on iris")
    classifier = Classifier(classname="weka.classifiers.trees.J48")
    # Instead of using 'options=["-C", "0.3"]' in the constructor, we can also set the
    # "confidenceFactor" property of the J48 classifier itself. However, being of type
    # float rather than double, we need to convert it to the correct type first using
    # the double_to_float function:
    classifier.set_property("confidenceFactor", types.double_to_float(0.3))
    classifier.build_classifier(iris_data)
    print(classifier)
    print(classifier.graph)
    print(classifier.to_source("MyJ48"))
    plot_graph.plot_dot_graph(classifier.graph)

    # evaluate model on test set
    helper.print_title("Evaluating J48 classifier on iris")
    evaluation = Evaluation(iris_data)
    evl = evaluation.test_model(classifier, iris_data)
    print(evl)
    print(evaluation.summary())

    # evaluate model on train/test split
    helper.print_title("Evaluating J48 classifier on iris (random split 66%)")
    classifier = Classifier(classname="weka.classifiers.trees.J48", options=["-C", "0.3"])
    evaluation = Evaluation(iris_data)
    evaluation.evaluate_train_test_split(classifier, iris_data, 66.0, Random(1))
    print(evaluation.summary())

    # load a dataset incrementally and build classifier incrementally
    helper.print_title("Build classifier incrementally on iris")
    helper.print_info("Loading dataset: " + iris_file)
    loader = Loader("weka.core.converters.ArffLoader")
    iris_inc = loader.load_file(iris_file, incremental=True)
    iris_inc.class_is_last()
    classifier = Classifier(classname="weka.classifiers.bayes.NaiveBayesUpdateable")
    classifier.build_classifier(iris_inc)
    for inst in loader:
        classifier.update_classifier(inst)
    print(classifier)

    # construct meta-classifiers
    helper.print_title("Meta classifiers")
    # generic FilteredClassifier instantiation
    print("generic FilteredClassifier instantiation")
    meta = SingleClassifierEnhancer(classname="weka.classifiers.meta.FilteredClassifier")
    meta.classifier = Classifier(classname="weka.classifiers.functions.LinearRegression")
    flter = Filter("weka.filters.unsupervised.attribute.Remove")
    flter.options = ["-R", "first"]
    meta.set_property("filter", flter.jobject)
    print(meta.to_commandline())
    # direct FilteredClassifier instantiation
    print("direct FilteredClassifier instantiation")
    meta = FilteredClassifier()
    meta.classifier = Classifier(classname="weka.classifiers.functions.LinearRegression")
    flter = Filter("weka.filters.unsupervised.attribute.Remove")
    flter.options = ["-R", "first"]
    meta.filter = flter
    print(meta.to_commandline())
    # generic Vote
    print("generic Vote instantiation")
    meta = MultipleClassifiersCombiner(classname="weka.classifiers.meta.Vote")
    classifiers = [
        Classifier(classname="weka.classifiers.functions.SMO"),
        Classifier(classname="weka.classifiers.trees.J48")
    ]
    meta.classifiers = classifiers
    print(meta.to_commandline())

    # cross-validate nominal classifier
    helper.print_title("Cross-validating NaiveBayes on diabetes")
    diabetes_file = helper.get_data_dir() + os.sep + "diabetes.arff"
    helper.print_info("Loading dataset: " + diabetes_file)
    loader = Loader("weka.core.converters.ArffLoader")
    diabetes_data = loader.load_file(diabetes_file)
    diabetes_data.class_is_last()
    classifier = Classifier(classname="weka.classifiers.bayes.NaiveBayes")
    pred_output = PredictionOutput(
        classname="weka.classifiers.evaluation.output.prediction.PlainText",
        options=["-distribution"])
    evaluation = Evaluation(diabetes_data)
    evaluation.crossvalidate_model(classifier, diabetes_data, 10, Random(42), output=pred_output)
    print(evaluation.summary())
    print(evaluation.class_details())
    print(evaluation.matrix())
    print("areaUnderPRC/0: " + str(evaluation.area_under_prc(0)))
    print("weightedAreaUnderPRC: " + str(evaluation.weighted_area_under_prc))
    print("areaUnderROC/1: " + str(evaluation.area_under_roc(1)))
    print("weightedAreaUnderROC: " + str(evaluation.weighted_area_under_roc))
    print("avgCost: " + str(evaluation.avg_cost))
    print("totalCost: " + str(evaluation.total_cost))
    print("confusionMatrix: " + str(evaluation.confusion_matrix))
    print("correct: " + str(evaluation.correct))
    print("pctCorrect: " + str(evaluation.percent_correct))
    print("incorrect: " + str(evaluation.incorrect))
    print("pctIncorrect: " + str(evaluation.percent_incorrect))
    print("unclassified: " + str(evaluation.unclassified))
    print("pctUnclassified: " + str(evaluation.percent_unclassified))
    print("coverageOfTestCasesByPredictedRegions: " + str(evaluation.coverage_of_test_cases_by_predicted_regions))
    print("sizeOfPredictedRegions: " + str(evaluation.size_of_predicted_regions))
    print("falseNegativeRate: " + str(evaluation.false_negative_rate(1)))
    print("weightedFalseNegativeRate: " + str(evaluation.weighted_false_negative_rate))
    print("numFalseNegatives: " + str(evaluation.num_false_negatives(1)))
    print("trueNegativeRate: " + str(evaluation.true_negative_rate(1)))
    print("weightedTrueNegativeRate: " + str(evaluation.weighted_true_negative_rate))
    print("numTrueNegatives: " + str(evaluation.num_true_negatives(1)))
    print("falsePositiveRate: " + str(evaluation.false_positive_rate(1)))
    print("weightedFalsePositiveRate: " + str(evaluation.weighted_false_positive_rate))
    print("numFalsePositives: " + str(evaluation.num_false_positives(1)))
    print("truePositiveRate: " + str(evaluation.true_positive_rate(1)))
    print("weightedTruePositiveRate: " + str(evaluation.weighted_true_positive_rate))
    print("numTruePositives: " + str(evaluation.num_true_positives(1)))
    print("fMeasure: " + str(evaluation.f_measure(1)))
    print("weightedFMeasure: " + str(evaluation.weighted_f_measure))
    print("unweightedMacroFmeasure: " + str(evaluation.unweighted_macro_f_measure))
    print("unweightedMicroFmeasure: " + str(evaluation.unweighted_micro_f_measure))
    print("precision: " + str(evaluation.precision(1)))
    print("weightedPrecision: " + str(evaluation.weighted_precision))
    print("recall: " + str(evaluation.recall(1)))
    print("weightedRecall: " + str(evaluation.weighted_recall))
    print("kappa: " + str(evaluation.kappa))
    print("KBInformation: " + str(evaluation.kb_information))
    print("KBMeanInformation: " + str(evaluation.kb_mean_information))
    print("KBRelativeInformation: " + str(evaluation.kb_relative_information))
    print("SFEntropyGain: " + str(evaluation.sf_entropy_gain))
    print("SFMeanEntropyGain: " + str(evaluation.sf_mean_entropy_gain))
    print("SFMeanPriorEntropy: " + str(evaluation.sf_mean_prior_entropy))
    print("SFMeanSchemeEntropy: " + str(evaluation.sf_mean_scheme_entropy))
    print("matthewsCorrelationCoefficient: " + str(evaluation.matthews_correlation_coefficient(1)))
    print("weightedMatthewsCorrelation: " + str(evaluation.weighted_matthews_correlation))
    print("class priors: " + str(evaluation.class_priors))
    print("numInstances: " + str(evaluation.num_instances))
    print("meanAbsoluteError: " + str(evaluation.mean_absolute_error))
    print("meanPriorAbsoluteError: " + str(evaluation.mean_prior_absolute_error))
    print("relativeAbsoluteError: " + str(evaluation.relative_absolute_error))
    print("rootMeanSquaredError: " + str(evaluation.root_mean_squared_error))
    print("rootMeanPriorSquaredError: " + str(evaluation.root_mean_prior_squared_error))
    print("rootRelativeSquaredError: " + str(evaluation.root_relative_squared_error))
    print("prediction output:\n" + str(pred_output))
    plot_cls.plot_roc(evaluation, title="ROC diabetes",
                      class_index=range(0, diabetes_data.class_attribute.num_values),
                      wait=False)
    plot_cls.plot_prc(evaluation, title="PRC diabetes",
                      class_index=range(0, diabetes_data.class_attribute.num_values),
                      wait=False)

    # train 2nd classifier on diabetes dataset
    classifier2 = Classifier(classname="weka.classifiers.trees.RandomForest")
    evaluation2 = Evaluation(diabetes_data)
    evaluation2.crossvalidate_model(classifier2, diabetes_data, 10, Random(42))
    plot_cls.plot_rocs({"NB": evaluation, "RF": evaluation2},
                       title="ROC diabetes", class_index=0, wait=False)
    plot_cls.plot_prcs({"NB": evaluation, "RF": evaluation2},
                       title="PRC diabetes", class_index=0, wait=False)

    # load a numeric dataset
    bolts_file = helper.get_data_dir() + os.sep + "bolts.arff"
    helper.print_info("Loading dataset: " + bolts_file)
    loader = Loader("weka.core.converters.ArffLoader")
    bolts_data = loader.load_file(bolts_file)
    bolts_data.class_is_last()

    # build a classifier and output model
    helper.print_title("Training LinearRegression on bolts")
    classifier = Classifier(classname="weka.classifiers.functions.LinearRegression",
                            options=["-S", "1", "-C"])
    classifier.build_classifier(bolts_data)
    print(classifier)

    # cross-validate numeric classifier
    helper.print_title("Cross-validating LinearRegression on bolts")
    classifier = Classifier(classname="weka.classifiers.functions.LinearRegression",
                            options=["-S", "1", "-C"])
    evaluation = Evaluation(bolts_data)
    evaluation.crossvalidate_model(classifier, bolts_data, 10, Random(42))
    print(evaluation.summary())
    print("correlationCoefficient: " + str(evaluation.correlation_coefficient))
    print("errorRate: " + str(evaluation.error_rate))
    helper.print_title("Header - bolts")
    print(str(evaluation.header))
    helper.print_title("Predictions on bolts")
    for index, pred in enumerate(evaluation.predictions):
        print(str(index + 1) + ": " + str(pred) + " -> error=" + str(pred.error))
    plot_cls.plot_classifier_errors(evaluation.predictions, wait=False)

    # train 2nd classifier and show errors in same plot
    classifier2 = Classifier(classname="weka.classifiers.functions.SMOreg")
    evaluation2 = Evaluation(bolts_data)
    evaluation2.crossvalidate_model(classifier2, bolts_data, 10, Random(42))
    plot_cls.plot_classifier_errors(
        {"LR": evaluation.predictions, "SMOreg": evaluation2.predictions}, wait=False)

    # learning curve
    cls = [
        Classifier(classname="weka.classifiers.trees.J48"),
        Classifier(classname="weka.classifiers.bayes.NaiveBayesUpdateable")
    ]
    plot_cls.plot_learning_curve(cls, diabetes_data, increments=0.05,
                                 label_template="[#] !", metric="percent_correct",
                                 wait=True)

    # access classifier's Java API
    labor_file = helper.get_data_dir() + os.sep + "labor.arff"
    helper.print_info("Loading dataset: " + labor_file)
    loader = Loader("weka.core.converters.ArffLoader")
    labor_data = loader.load_file(labor_file)
    labor_data.class_is_last()
    helper.print_title("Using JRip's Java API to access rules")
    jrip = Classifier(classname="weka.classifiers.rules.JRip")
    jrip.build_classifier(labor_data)
    rset = jrip.jwrapper.getRuleset()
    for i in range(rset.size()):  # was xrange, which is Python 2 only
        r = rset.get(i)
        print(str(r.toString(labor_data.class_attribute.jobject)))
def createTrainedModel():
    from weka.core.converters import Loader
    from weka.classifiers import Classifier, Evaluation
    from weka.core.classes import Random
    from weka.attribute_selection import ASSearch, ASEvaluation, AttributeSelection
    import weka.core.serialization as serialization
    from weka.core.dataset import Instances

    folderList = os.listdir(outputModel)
    loader = Loader(classname="weka.core.converters.ArffLoader")

    # gender model
    data = loader.load_file(os.path.join(outputModel, "genderTrain.arff"))
    data.class_is_last()
    classi = "weka.classifiers.bayes.NaiveBayes"
    cls = Classifier(classname=classi)
    # NOTE: Weka's Ranker normally takes its threshold and count via "-T" and
    # "-N"; the bare values are kept from the original, and the search is
    # unused while the attribute-selection block below stays commented out.
    search = ASSearch(classname="weka.attributeSelection.Ranker",
                      options=["-1.7976931348623157E308", "-1"])
    # evaluator = ASEvaluation(classname="weka.attributeSelection.ChiSquaredAttributeEval")
    # attsel = AttributeSelection()
    # attsel.search(search)
    # attsel.evaluator(evaluator)
    # attsel.select_attributes(data)
    cls.build_classifier(data)
    serialization.write_all(os.path.join(outputModel, "GenderModel" + ".model"),
                            [cls, Instances.template_instances(data)])
    evl = Evaluation(data)
    evl.crossvalidate_model(cls, data, 10, Random(1))
    print("Gender model predictions")
    print(cls)
    # print(evl.percent_correct)
    print(evl.summary())
    print(evl.class_details())

    # age model
    data = loader.load_file(os.path.join(outputModel, "ageTrain.arff"))
    data.class_is_last()
    classi = "weka.classifiers.bayes.NaiveBayes"
    cls = Classifier(classname=classi)
    search = ASSearch(classname="weka.attributeSelection.Ranker",
                      options=["-1.7976931348623157E308", "-1"])
    # evaluator = ASEvaluation(classname="weka.attributeSelection.ChiSquaredAttributeEval")
    # attsel = AttributeSelection()
    # attsel.search(search)
    # attsel.evaluator(evaluator)
    # attsel.select_attributes(data)
    # other classifiers tried for the age model:
    # classi = "weka.classifiers.trees.J48"
    # classi = "weka.classifiers.functions.Logistic"
    # classi = "weka.classifiers.trees.RandomForest"
    # classi = "weka.classifiers.functions.SMOreg"
    cls.build_classifier(data)
    print("Age model predictions")
    print(cls)
    serialization.write_all(os.path.join(outputModel, "AgeModel" + ".model"),
                            [cls, Instances.template_instances(data)])
    evl = Evaluation(data)
    evl.crossvalidate_model(cls, data, 10, Random(1))
    # print(evl.percent_correct)
    print(evl.summary())
    print(evl.class_details())
    os._exit(0)
def call_weka(file_dir, ml_opt, ofile_dir):
    loader = Loader(classname="weka.core.converters.CSVLoader")
    data = loader.load_file(file_dir)
    data.class_is_last()
    filtered = data
    ml_id = ''
    if ml_opt != '0':
        # dispatch on the option code (branches reordered numerically)
        if ml_opt == '1':
            classifier = Classifier(
                classname="weka.classifiers.functions.LibSVM",
                options=["-S", "0", "-K", "2", "-D", "3", "-G", "0.0", "-R", "0.0",
                         "-N", "0.5", "-M", "40.0", "-C", "1.0", "-E", "0.001",
                         "-P", "0.1", "-seed", "1"])
            ml_id = 'SVM'
        elif ml_opt == '2':
            classifier = Classifier(
                classname="weka.classifiers.meta.Bagging",
                options=["-P", "100", "-S", "1", "-I", "10",
                         "-W", "weka.classifiers.trees.M5P", "--", "-M", "4.0"])
            ml_id = 'BagM5P'
        elif ml_opt == '3':
            classifier = Classifier(
                classname="weka.classifiers.functions.MLPClassifier",
                options=['-N', '2', '-R', '0.01', '-O', '1.0E-6', '-P', '1', '-E', '1', '-S', '1'])
            ml_id = 'MLPC'
        elif ml_opt == '4':
            classifier = Classifier(
                classname="weka.classifiers.trees.RandomForest",
                options=["-I", "100", "-K", "0", "-S", "1", "-num-slots", "1"])
            ml_id = 'RF'
        elif ml_opt == '5':
            classifier = Classifier(classname="weka.classifiers.trees.J48",
                                    options=["-C", "0.25", "-M", "2"])
            ml_id = 'J48'
        elif ml_opt == '6':
            classifier = Classifier(classname="weka.classifiers.bayes.NaiveBayes")
            ml_id = 'NaiveBayes'
        elif ml_opt == '7':
            classifier = Classifier(
                classname="weka.classifiers.functions.RBFNetwork",
                options=["-B", "2", "-S", "1", "-R", "1.0E-8", "-M", "-1", "-W", "0.1"])
            ml_id = 'RBFNet'
        elif ml_opt == '8':
            classifier = Classifier(
                classname="weka.classifiers.bayes.BayesNet",
                options=["-D", "-Q", "weka.classifiers.bayes.net.search.local.K2", "--",
                         "-P", "1", "-S", "BAYES", "-E",
                         "weka.classifiers.bayes.net.estimate.SimpleEstimator", "--", "-A", "0.5"])
            ml_id = 'BayesNet'
        elif ml_opt == '9':
            classifier = Classifier(
                classname="weka.classifiers.functions.SimpleLogistic",
                options=["-I", "0", "-M", "500", "-H", "50", "-W", "0.0"])
            ml_id = 'LogReg'
        filtered.class_is_last()
        evaluation = Evaluation(filtered)
        evaluation.crossvalidate_model(classifier, filtered, 10, Random(42))
        print("Evaluation: Done.")
        with open(ofile_dir + ml_id + "_results.txt", 'w') as ofile:
            print(evaluation.summary(), file=ofile)
            print(evaluation.class_details(), file=ofile)
            print(evaluation.matrix(), file=ofile)
        serialization.write(ofile_dir + ml_id + ".model", classifier)
        print("Saving " + ml_id + " Model: Done.")
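# Hypothetical call to call_weka; paths are placeholders. The option codes map
# to learners as in the dispatch above (1=LibSVM, 2=Bagging+M5P,
# 3=MLPClassifier, 4=RandomForest, 5=J48, 6=NaiveBayes, 7=RBFNetwork,
# 8=BayesNet, 9=SimpleLogistic).
import weka.core.jvm as jvm

jvm.start(packages=True)  # LibSVM/MLPClassifier/RBFNetwork come from Weka packages
try:
    call_weka("features.csv", "4", "results/")  # writes results/RF_results.txt and RF.model
finally:
    jvm.stop()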
def main():
    """
    Just runs some example code.
    """

    # load a dataset
    iris_file = helper.get_data_dir() + os.sep + "iris.arff"
    helper.print_info("Loading dataset: " + iris_file)
    loader = Loader("weka.core.converters.ArffLoader")
    iris_data = loader.load_file(iris_file)
    iris_data.class_is_last()

    # classifier help
    helper.print_title("Creating help string")
    classifier = Classifier(classname="weka.classifiers.trees.J48")
    print(classifier.to_help())

    # partial classname
    helper.print_title("Creating classifier from partial classname")
    clsname = ".J48"
    classifier = Classifier(classname=clsname)
    print(clsname + " --> " + classifier.classname)

    # classifier from commandline
    helper.print_title("Creating SMO from command-line string")
    cmdline = 'weka.classifiers.functions.SMO -K "weka.classifiers.functions.supportVector.NormalizedPolyKernel -E 3.0"'
    classifier = from_commandline(cmdline, classname="weka.classifiers.Classifier")
    classifier.build_classifier(iris_data)
    print("input: " + cmdline)
    print("output: " + classifier.to_commandline())
    print("model:\n" + str(classifier))

    # kernel classifier
    helper.print_title("Creating SMO as KernelClassifier")
    kernel = Kernel(classname="weka.classifiers.functions.supportVector.RBFKernel",
                    options=["-G", "0.001"])
    classifier = KernelClassifier(classname="weka.classifiers.functions.SMO", options=["-M"])
    classifier.kernel = kernel
    classifier.build_classifier(iris_data)
    print("classifier: " + classifier.to_commandline())
    print("model:\n" + str(classifier))

    # build a classifier and output model
    helper.print_title("Training J48 classifier on iris")
    classifier = Classifier(classname="weka.classifiers.trees.J48")
    # Instead of using 'options=["-C", "0.3"]' in the constructor, we can also set the
    # "confidenceFactor" property of the J48 classifier itself. However, being of type
    # float rather than double, we need to convert it to the correct type first using
    # the double_to_float function:
    classifier.set_property("confidenceFactor", typeconv.double_to_float(0.3))
    classifier.build_classifier(iris_data)
    print(classifier)
    print(classifier.graph)
    print(classifier.to_source("MyJ48"))
    plot_graph.plot_dot_graph(classifier.graph)

    # evaluate model on test set
    helper.print_title("Evaluating J48 classifier on iris")
    evaluation = Evaluation(iris_data)
    evl = evaluation.test_model(classifier, iris_data)
    print(evl)
    print(evaluation.summary())

    # evaluate model on train/test split
    helper.print_title("Evaluating J48 classifier on iris (random split 66%)")
    classifier = Classifier(classname="weka.classifiers.trees.J48", options=["-C", "0.3"])
    evaluation = Evaluation(iris_data)
    evaluation.evaluate_train_test_split(classifier, iris_data, 66.0, Random(1))
    print(evaluation.summary())

    # load a dataset incrementally and build classifier incrementally
    helper.print_title("Build classifier incrementally on iris")
    helper.print_info("Loading dataset: " + iris_file)
    loader = Loader("weka.core.converters.ArffLoader")
    iris_inc = loader.load_file(iris_file, incremental=True)
    iris_inc.class_is_last()
    classifier = Classifier(classname="weka.classifiers.bayes.NaiveBayesUpdateable")
    classifier.build_classifier(iris_inc)
    for inst in loader:
        classifier.update_classifier(inst)
    print(classifier)

    # construct meta-classifiers
    helper.print_title("Meta classifiers")
    # generic FilteredClassifier instantiation
    print("generic FilteredClassifier instantiation")
    meta = SingleClassifierEnhancer(classname="weka.classifiers.meta.FilteredClassifier")
    meta.classifier = Classifier(classname="weka.classifiers.functions.LinearRegression")
    flter = Filter("weka.filters.unsupervised.attribute.Remove")
    flter.options = ["-R", "first"]
    meta.set_property("filter", flter.jobject)
    print(meta.to_commandline())
    # direct FilteredClassifier instantiation
    print("direct FilteredClassifier instantiation")
    meta = FilteredClassifier()
    meta.classifier = Classifier(classname="weka.classifiers.functions.LinearRegression")
    flter = Filter("weka.filters.unsupervised.attribute.Remove")
    flter.options = ["-R", "first"]
    meta.filter = flter
    print(meta.to_commandline())
    # generic Vote
    print("generic Vote instantiation")
    meta = MultipleClassifiersCombiner(classname="weka.classifiers.meta.Vote")
    classifiers = [
        Classifier(classname="weka.classifiers.functions.SMO"),
        Classifier(classname="weka.classifiers.trees.J48")
    ]
    meta.classifiers = classifiers
    print(meta.to_commandline())

    # cross-validate nominal classifier
    helper.print_title("Cross-validating NaiveBayes on diabetes")
    diabetes_file = helper.get_data_dir() + os.sep + "diabetes.arff"
    helper.print_info("Loading dataset: " + diabetes_file)
    loader = Loader("weka.core.converters.ArffLoader")
    diabetes_data = loader.load_file(diabetes_file)
    diabetes_data.class_is_last()
    classifier = Classifier(classname="weka.classifiers.bayes.NaiveBayes")
    pred_output = PredictionOutput(
        classname="weka.classifiers.evaluation.output.prediction.PlainText",
        options=["-distribution"])
    evaluation = Evaluation(diabetes_data)
    evaluation.crossvalidate_model(classifier, diabetes_data, 10, Random(42), output=pred_output)
    print(evaluation.summary())
    print(evaluation.class_details())
    print(evaluation.matrix())
    print("areaUnderPRC/0: " + str(evaluation.area_under_prc(0)))
    print("weightedAreaUnderPRC: " + str(evaluation.weighted_area_under_prc))
    print("areaUnderROC/1: " + str(evaluation.area_under_roc(1)))
    print("weightedAreaUnderROC: " + str(evaluation.weighted_area_under_roc))
    print("avgCost: " + str(evaluation.avg_cost))
    print("totalCost: " + str(evaluation.total_cost))
    print("confusionMatrix: " + str(evaluation.confusion_matrix))
    print("correct: " + str(evaluation.correct))
    print("pctCorrect: " + str(evaluation.percent_correct))
    print("incorrect: " + str(evaluation.incorrect))
    print("pctIncorrect: " + str(evaluation.percent_incorrect))
    print("unclassified: " + str(evaluation.unclassified))
    print("pctUnclassified: " + str(evaluation.percent_unclassified))
    print("coverageOfTestCasesByPredictedRegions: " + str(evaluation.coverage_of_test_cases_by_predicted_regions))
    print("sizeOfPredictedRegions: " + str(evaluation.size_of_predicted_regions))
    print("falseNegativeRate: " + str(evaluation.false_negative_rate(1)))
    print("weightedFalseNegativeRate: " + str(evaluation.weighted_false_negative_rate))
    print("numFalseNegatives: " + str(evaluation.num_false_negatives(1)))
    print("trueNegativeRate: " + str(evaluation.true_negative_rate(1)))
    print("weightedTrueNegativeRate: " + str(evaluation.weighted_true_negative_rate))
    print("numTrueNegatives: " + str(evaluation.num_true_negatives(1)))
    print("falsePositiveRate: " + str(evaluation.false_positive_rate(1)))
    print("weightedFalsePositiveRate: " + str(evaluation.weighted_false_positive_rate))
    print("numFalsePositives: " + str(evaluation.num_false_positives(1)))
    print("truePositiveRate: " + str(evaluation.true_positive_rate(1)))
    print("weightedTruePositiveRate: " + str(evaluation.weighted_true_positive_rate))
    print("numTruePositives: " + str(evaluation.num_true_positives(1)))
    print("fMeasure: " + str(evaluation.f_measure(1)))
    print("weightedFMeasure: " + str(evaluation.weighted_f_measure))
    print("unweightedMacroFmeasure: " + str(evaluation.unweighted_macro_f_measure))
    print("unweightedMicroFmeasure: " + str(evaluation.unweighted_micro_f_measure))
    print("precision: " + str(evaluation.precision(1)))
    print("weightedPrecision: " + str(evaluation.weighted_precision))
    print("recall: " + str(evaluation.recall(1)))
    print("weightedRecall: " + str(evaluation.weighted_recall))
    print("kappa: " + str(evaluation.kappa))
    print("KBInformation: " + str(evaluation.kb_information))
    print("KBMeanInformation: " + str(evaluation.kb_mean_information))
    print("KBRelativeInformation: " + str(evaluation.kb_relative_information))
    print("SFEntropyGain: " + str(evaluation.sf_entropy_gain))
    print("SFMeanEntropyGain: " + str(evaluation.sf_mean_entropy_gain))
    print("SFMeanPriorEntropy: " + str(evaluation.sf_mean_prior_entropy))
    print("SFMeanSchemeEntropy: " + str(evaluation.sf_mean_scheme_entropy))
    print("matthewsCorrelationCoefficient: " + str(evaluation.matthews_correlation_coefficient(1)))
    print("weightedMatthewsCorrelation: " + str(evaluation.weighted_matthews_correlation))
    print("class priors: " + str(evaluation.class_priors))
    print("numInstances: " + str(evaluation.num_instances))
    print("meanAbsoluteError: " + str(evaluation.mean_absolute_error))
    print("meanPriorAbsoluteError: " + str(evaluation.mean_prior_absolute_error))
    print("relativeAbsoluteError: " + str(evaluation.relative_absolute_error))
    print("rootMeanSquaredError: " + str(evaluation.root_mean_squared_error))
    print("rootMeanPriorSquaredError: " + str(evaluation.root_mean_prior_squared_error))
    print("rootRelativeSquaredError: " + str(evaluation.root_relative_squared_error))
    print("prediction output:\n" + str(pred_output))
    plot_cls.plot_roc(evaluation, title="ROC diabetes",
                      class_index=range(0, diabetes_data.class_attribute.num_values),
                      wait=False)
    plot_cls.plot_prc(evaluation, title="PRC diabetes",
                      class_index=range(0, diabetes_data.class_attribute.num_values),
                      wait=False)

    # load a numeric dataset
    bolts_file = helper.get_data_dir() + os.sep + "bolts.arff"
    helper.print_info("Loading dataset: " + bolts_file)
    loader = Loader("weka.core.converters.ArffLoader")
    bolts_data = loader.load_file(bolts_file)
    bolts_data.class_is_last()

    # build a classifier and output model
    helper.print_title("Training LinearRegression on bolts")
    classifier = Classifier(classname="weka.classifiers.functions.LinearRegression",
                            options=["-S", "1", "-C"])
    classifier.build_classifier(bolts_data)
    print(classifier)

    # cross-validate numeric classifier
    helper.print_title("Cross-validating LinearRegression on bolts")
    classifier = Classifier(classname="weka.classifiers.functions.LinearRegression",
                            options=["-S", "1", "-C"])
    evaluation = Evaluation(bolts_data)
    evaluation.crossvalidate_model(classifier, bolts_data, 10, Random(42))
    print(evaluation.summary())
    print("correlationCoefficient: " + str(evaluation.correlation_coefficient))
    print("errorRate: " + str(evaluation.error_rate))
    helper.print_title("Header - bolts")
    print(str(evaluation.header))
    helper.print_title("Predictions on bolts")
    for index, pred in enumerate(evaluation.predictions):
        print(str(index + 1) + ": " + str(pred) + " -> error=" + str(pred.error))
    plot_cls.plot_classifier_errors(evaluation.predictions, wait=False)

    # learning curve
    cls = [
        Classifier(classname="weka.classifiers.trees.J48"),
        Classifier(classname="weka.classifiers.bayes.NaiveBayesUpdateable")
    ]
    plot_cls.plot_learning_curve(cls, diabetes_data, increments=0.05,
                                 label_template="[#] !", metric="percent_correct",
                                 wait=True)

    # access classifier's Java API
    labor_file = helper.get_data_dir() + os.sep + "labor.arff"
    helper.print_info("Loading dataset: " + labor_file)
    loader = Loader("weka.core.converters.ArffLoader")
    labor_data = loader.load_file(labor_file)
    labor_data.class_is_last()
    helper.print_title("Using JRip's Java API to access rules")
    jrip = Classifier(classname="weka.classifiers.rules.JRip")
    jrip.build_classifier(labor_data)
    rset = jrip.jwrapper.getRuleset()
    for i in range(rset.size()):
        r = rset.get(i)
        print(str(r.toString(labor_data.class_attribute.jobject)))
import weka.core.serialization as serialization
from weka.core.converters import Loader
from weka.classifiers import Classifier, Evaluation
from weka.core.classes import Random

# The original referenced dataSet20x20 and dataSet20x50 without loading them;
# the loads below assume matching ARFF files next to dataSet50x20.arff.
loader = Loader(classname="weka.core.converters.ArffLoader")
dataSet20x20 = loader.load_file("trainingSet/dataSet20x20.arff")
dataSet20x20.class_is_last()
dataSet20x50 = loader.load_file("trainingSet/dataSet20x50.arff")
dataSet20x50.class_is_last()
dataSet50x20 = loader.load_file("trainingSet/dataSet50x20.arff")
dataSet50x20.class_is_last()

classifier1 = Classifier(classname="weka.classifiers.functions.MultilayerPerceptron",
                         options=["-L", "0.3", "-M", "0.2", "-N", "500", "-V", "0",
                                  "-S", "0", "-E", "20", "-H", "9"])
classifier2 = Classifier(classname="weka.classifiers.functions.MultilayerPerceptron",
                         options=["-L", "0.3", "-M", "0.2", "-N", "500", "-V", "0",
                                  "-S", "0", "-E", "20", "-H", "11"])
classifier3 = Classifier(classname="weka.classifiers.functions.MultilayerPerceptron",
                         options=["-L", "0.3", "-M", "0.2", "-N", "500", "-V", "0",
                                  "-S", "0", "-E", "20", "-H", "9"])

print("\n\nTraining neural network 1")
evaluation1 = Evaluation(dataSet20x20)
evaluation1.crossvalidate_model(classifier1, dataSet20x20, 10, Random(42))
classifier1.build_classifier(dataSet20x20)
serialization.write("trainingSet/nn1.model", classifier1)
print("\n\n====================================================== NEURAL NETWORK 1 ======================================================")
print(evaluation1.summary())
print(evaluation1.class_details())

print("Training neural network 2")
evaluation2 = Evaluation(dataSet20x50)
evaluation2.crossvalidate_model(classifier2, dataSet20x50, 10, Random(42))
classifier2.build_classifier(dataSet20x50)
serialization.write("trainingSet/nn2.model", classifier2)
print("\n\n====================================================== NEURAL NETWORK 2 ======================================================")
print(evaluation2.summary())
print(evaluation2.class_details())

print("Training neural network 3")
evaluation3 = Evaluation(dataSet50x20)
evaluation3.crossvalidate_model(classifier3, dataSet50x20, 10, Random(42))
classifier3.build_classifier(dataSet50x20)
serialization.write("trainingSet/nn3.model", classifier3)
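# The serialized networks can be reloaded later; a minimal sketch, assuming
# the model paths written above. serialization.read() returns the wrapped
# Java object, which Classifier adopts via its jobject parameter.
import weka.core.serialization as serialization
from weka.classifiers import Classifier

nn1 = Classifier(jobject=serialization.read("trainingSet/nn1.model"))
print(nn1)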
from weka.core.converters import Loader
from weka.classifiers import Evaluation, Kernel, KernelClassifier, PredictionOutput
from weka.core.classes import Random

# convert csv into arff format (weka compatible)
# use convertcsvtoarff.py file

# load arff file
loader = Loader("weka.core.converters.ArffLoader")
review_data = loader.load_file("reviewsinformation_task2.arff")
review_data.class_is_last()

# kernel classifier
print("Creating SMO as KernelClassifier")
kernel = Kernel(classname="weka.classifiers.functions.supportVector.RBFKernel",
                options=["-G", "0.001"])
classifier = KernelClassifier(classname="weka.classifiers.functions.SMO", options=["-M"])
classifier.kernel = kernel
classifier.build_classifier(review_data)
print("classifier: " + classifier.to_commandline())
print("model:\n" + str(classifier))

# cross-validate on the loaded reviews data; Evaluation takes the Instances
# object, not a filename (the original passed 'test_data.arff' and referenced
# an undefined diabetes_data / pred_output)
pred_output = PredictionOutput(classname="weka.classifiers.evaluation.output.prediction.PlainText")
evaluation = Evaluation(review_data)
evaluation.crossvalidate_model(classifier, review_data, 10, Random(42), output=pred_output)
print(evaluation.summary())
print(evaluation.class_details())
print(evaluation.matrix())