def Boost_J48(data, rnm):
    """Cross-validate an AdaBoost.M1-boosted J48 tree and dump the results.

    The first attribute is removed via a FilteredClassifier before J48 runs.
    Writes four artifacts named after *rnm*: the model dump, the CSV
    predictions, the evaluation summary/class details, and a ROC plot.

    :param data: WEKA Instances; the class attribute is set to the last column.
    :param rnm: base name (prefix) for all output files.
    :return: 10-fold cross-validation percent-correct, as a string.
    """
    data.class_is_last()

    # Inner classifier: J48 behind a Remove filter that drops attribute 1
    # (typically an identifier column that must not be used for learning).
    fc1 = FilteredClassifier()
    fc1.classifier = Classifier(classname="weka.classifiers.trees.J48",
                                options=["-C", "0.25", "-M", "2"])
    fc1.filter = Filter(classname="weka.filters.unsupervised.attribute.Remove",
                        options=["-R", "first"])

    # Outer booster: AdaBoostM1 with 10 iterations over the filtered J48.
    fc2 = SingleClassifierEnhancer(classname="weka.classifiers.meta.AdaBoostM1",
                                   options=["-P", "100", "-S", "1", "-I", "10"])
    fc2.classifier = fc1

    # Collect per-instance predictions (CSV, with attribute 1 echoed via -p 1).
    pred_output = PredictionOutput(
        classname="weka.classifiers.evaluation.output.prediction.CSV",
        options=["-p", "1"])

    folds = 10
    # Build the model on the full data (for the printed dump); the
    # cross-validation below trains its own per-fold copies.
    fc2.build_classifier(data)
    evaluation = Evaluation(data)
    evaluation.crossvalidate_model(fc2, data, folds, Random(1), pred_output)

    # FIX: replaced Python 2 `print >> f` statements with Python 3
    # `print(..., file=f)` and `with` blocks so files close even on error.
    with open(rnm + '_Boost_J48_Tree.txt', 'w') as f0:
        print("Filename: ", rnm, file=f0)
        print('\n\n', file=f0)
        print(str(fc2), file=f0)

    with open(rnm + '_Boost_J48_Prediction.txt', 'w') as f1:
        print('Filename:', rnm, file=f1)
        print('Prediction Summary:', (pred_output.buffer_content()), file=f1)

    with open(rnm + '_Boost_j48_Evaluation.txt', 'w') as f2:
        print('Filename:', rnm, file=f2)
        print('Evaluation Summary:', (evaluation.summary()), file=f2)
        print('\n\n\n', file=f2)
        print((evaluation.class_details()), file=f2)

    plot_roc(evaluation, class_index=[0, 1], title=rnm, key_loc='best',
             outfile=rnm + '_Boost_J48_ROC.png', wait=False)
    value_Boost_J48 = str(evaluation.percent_correct)
    return value_Boost_J48
def RandomTree(data, rnm):
    """Cross-validate a RandomTree (first attribute removed) and dump results.

    Writes four artifacts named after *rnm*: the tree dump, the CSV
    predictions, the evaluation summary/class details, and a ROC plot.

    :param data: WEKA Instances; the class attribute is set to the last column.
    :param rnm: base name (prefix) for all output files.
    :return: 10-fold cross-validation percent-correct, as a string.
    """
    data.class_is_last()

    # RandomTree behind a Remove filter dropping attribute 1 (e.g. an ID).
    fc = FilteredClassifier()
    fc.classifier = Classifier(classname="weka.classifiers.trees.RandomTree",
                               options=["-K", "0", "-M", "1.0",
                                        "-V", "0.001", "-S", "1"])
    fc.filter = Filter(classname="weka.filters.unsupervised.attribute.Remove",
                       options=["-R", "first"])

    # Per-instance predictions in CSV form (-p 1 echoes attribute 1).
    pred_output = PredictionOutput(
        classname="weka.classifiers.evaluation.output.prediction.CSV",
        options=["-p", "1"])

    folds = 10
    evl = Evaluation(data)
    evl.crossvalidate_model(fc, data, folds, Random(1), pred_output)
    # Build on the full data afterwards so str(fc) prints a complete model.
    fc.build_classifier(data)

    # FIX: replaced Python 2 `print >> f` statements with Python 3
    # `print(..., file=f)` and `with` blocks so files close even on error.
    with open(rnm + '_RT_Tree.txt', 'w') as f0:
        print("Filename: ", rnm, file=f0)
        print('\n\n', file=f0)
        print(str(fc), file=f0)

    with open(rnm + '_RT_Prediction.txt', 'w') as f1:
        print('Filename:', rnm, file=f1)
        print('Prediction Summary:', (pred_output.buffer_content()), file=f1)

    with open(rnm + '_RT_Evaluation.txt', 'w') as f2:
        print('Filename:', rnm, file=f2)
        print('Evaluation Summary:', (evl.summary()), file=f2)
        print('\n\n\n', file=f2)
        print((evl.class_details()), file=f2)

    plot_roc(evl, class_index=[0, 1], title=rnm, key_loc='best',
             outfile=rnm + '_RT_ROC.png', wait=False)
    value_RT = str(evl.percent_correct)
    return value_RT
def fitness(toeval: Individual):
    """Score *toeval* by 10-fold CV accuracy of the MLP it configures.

    Uses the module-level ``data`` instances and ``remove`` filter.
    Returns the percent-correct of the cross-validation.
    """
    wrapped = FilteredClassifier()
    wrapped.filter = remove  # module-level attribute-removal filter
    wrapped.classifier = Classifier(
        classname="weka.classifiers.functions.MultilayerPerceptron",
        options=toeval.settings())
    evaluator = Evaluation(data)
    evaluator.crossvalidate_model(wrapped, data, 10, Random(1))
    return evaluator.percent_correct
def TrainingModel(arff, modelOutput, clsfier):
    """Train a WEKA classifier on an ARFF file, report CV metrics, save model.

    Loads *arff* (class attribute = first column), builds
    ``weka.classifiers.<clsfier>``, runs 10-fold cross-validation, prints a
    2x2 confusion-matrix breakdown (TPR/TNR/PPV/NPV), and serializes the
    trained model to *modelOutput*.

    :param arff: path to the training ARFF file.
    :param modelOutput: path the serialized model is written to.
    :param clsfier: classifier name relative to ``weka.classifiers.``
                    (e.g. ``"trees.RandomForest"``).
    """
    # Start the Java VM.
    jvm.start()
    # FIX: guarantee the JVM is stopped even if training/evaluation raises.
    try:
        # Load the training set; the class attribute is the FIRST column.
        loader = Loader(classname="weka.core.converters.ArffLoader")
        train = loader.load_file(arff)
        train.class_is_first()

        # Original note: RandomForest was chosen after GUI experiments showed
        # it gave the best TPR/TNR, but any classifier name works here.
        cls_name = "weka.classifiers." + clsfier
        clsf = Classifier(classname=cls_name)
        clsf.build_classifier(train)
        print(clsf)

        # Cross-validate via a FilteredClassifier wrapper (no filter is set,
        # so this behaves like cross-validating the classifier directly).
        fc = FilteredClassifier()
        fc.classifier = clsf
        evl = Evaluation(train)
        evl.crossvalidate_model(fc, train, 10, Random(1))
        print(evl.percent_correct)
        print(evl.summary())
        print(evl.class_details())
        print(evl.matrix())

        # Derive binary-classification statistics from the confusion matrix.
        # Layout assumed: row 0 = actual negative, row 1 = actual positive.
        matrixResults = evl.confusion_matrix
        TN = float(matrixResults[0][0])
        FP = float(matrixResults[0][1])
        FN = float(matrixResults[1][0])
        TP = float(matrixResults[1][1])
        TPR = TP / (TP + FN)  # sensitivity
        TNR = TN / (FP + TN)  # specificity
        PPV = TP / (TP + FP)  # positive predictive value
        NPV = TN / (TN + FN)  # negative predictive value
        print("算法: " + clsfier)
        print("敏感度 TPR: " + str(TPR))
        print("特异度 TNR: " + str(TNR))
        print("PPV: " + str(PPV))
        print("NPV: " + str(NPV))

        # Persist the trained model together with the dataset header.
        clsf.serialize(modelOutput, header=train)
    finally:
        # Shut the JVM down.
        jvm.stop()
    print("分析模型建立完成")
print(group)
# Per-group file paths: calibration (train), test, and validation sets.
train = data_dir + os.sep + group + "_Cal.arff"
test = data_dir + os.sep + group + "_Test.arff"
pred = data_dir + os.sep + group + "_Val.arff"
loader = Loader(classname="weka.core.converters.ArffLoader")
print(train)
train_data = loader.load_file(train)
# The regression target is the attribute named "reference value".
train_data.class_index = train_data.attribute_by_name(
    "reference value").index
print(test)
test_data = loader.load_file(test)
test_data.class_index = test_data.attribute_by_name(
    "reference value").index
print(pred)
# NOTE(review): pred_data is loaded but not used in this span — presumably
# consumed by later code outside this view.
pred_data = loader.load_file(pred)
pred_data.class_index = pred_data.attribute_by_name(
    "reference value").index
# LinearRegression behind a Remove filter that drops the first attribute.
cls = FilteredClassifier()
cls.classifier = Classifier(
    classname="weka.classifiers.functions.LinearRegression",
    options=["-S", "1", "-C"])
cls.filter = Filter(classname="weka.filters.unsupervised.attribute.Remove",
                    options=["-R", "first"])
# Train on the calibration set, evaluate on the held-out test set.
cls.build_classifier(train_data)
evl = Evaluation(train_data)
evl.test_model(cls, test_data)
print(evl.summary())
jvm.stop()
# Evaluate an LMT (logistic model tree) on the APS-failure data set:
# 10-fold CV on the training CSV, then a hold-out evaluation on the test CSV.
jvm.start()
loader = Loader(classname="weka.core.converters.CSVLoader")
data = loader.load_file("C:/Arpit/aps.failure_training_set.csv")
data_test = loader.load_file("C:/Arpit/aps.failure_test_set.csv")
data.class_is_last()
data_test.class_is_last()
cls = Classifier(classname="weka.classifiers.trees.LMT")
fc = FilteredClassifier()
fc.classifier = cls
evl = Evaluation(data)
evl.crossvalidate_model(fc, data, 10, Random(1))
# FIX: crossvalidate_model only trains internal per-fold copies, so neither
# `fc` nor `cls` is built afterwards; the original called
# evl.test_model(cls, ...) on the untrained classifier. Build the wrapper on
# the training data and evaluate that instead.
# NOTE(review): `evl` now accumulates both CV and hold-out statistics, as in
# the original flow — confirm that mixing is intended.
fc.build_classifier(data)
preds = evl.test_model(fc, data_test)
conf = evl.confusion_matrix
print(evl.percent_incorrect)
sns.heatmap(conf, cmap="YlGnBu", annot=True, linewidths=.5, fmt='d')
print("AUC", evl.area_under_prc)
import weka.plot.classifiers as plcls  # NB: matplotlib is required
plcls.plot_roc(evl, class_index=[0, 1], wait=True)
def main():
    """Run every configured WEKA classifier over each selected-feature ARFF.

    For each feature-selection variant in ``case``, cross-validates every
    classifier in ``alg`` (10-fold) and writes the evaluation summaries to a
    plain-text file and a spreadsheet under ``./selected_vectors30/risultati/``.
    """
    # Feature-selection variants to evaluate (one ARFF file each).
    case = [
        "selected_chi100", "selected_chi150", "selected_chi200",
        "selected_chi250", "selected_chi300", "selected_chi350",
        "selected_fe100", "selected_fe150", "selected_fe200",
        "selected_fe250", "selected_fe300", "selected_fe350",
        "selected_sfm100", "selected_sfm150", "selected_sfm200",
        "selected_sfm250", "selected_sfm300", "selected_sfm350",
        "selected_sfmt100", "selected_sfmt150", "selected_sfmt200",
        "selected_sfmt250", "selected_sfmt300", "selected_sfmt350"
    ]
    for nomefile in case:
        print(nomefile)
        feature_vector = "./selected_vectors30/" + nomefile + ".arff"
        # Read the ARFF file; class attribute is the last column.
        loader = Loader("weka.core.converters.ArffLoader")
        data = loader.load_file(feature_vector)
        data.class_is_last()

        # Spreadsheet header row (summary-statistic column titles).
        intest = [
            "Correlation coefficient", "Mean absolute error",
            "Root mean squared error", "Relative absolute error",
            "Root relative squared error", "Total Number of Instances"
        ]
        workbook = xlsxwriter.Workbook("./selected_vectors30/risultati/" +
                                       nomefile + ".xlsx")  # Excel output
        worksheet = workbook.add_worksheet()
        for col_num, dati in enumerate(intest):
            worksheet.write(0, col_num + 1, dati)
        riga = 1

        # Classifiers to run, named relative to ``weka.classifiers.``.
        alg = [
            "bayes.NaiveBayes", "bayes.NaiveBayesUpdateable",
            "functions.Logistic", "functions.SGD",
            "functions.SimpleLogistic", "functions.SMO",
            "functions.VotedPerceptron", "meta.AdaBoostM1",
            "meta.AttributeSelectedClassifier", "meta.Bagging",
            "meta.ClassificationViaRegression",
            "meta.IterativeClassifierOptimizer", "meta.LogitBoost",
            "meta.RandomCommittee", "meta.RandomSubSpace",
            "rules.DecisionTable", "rules.JRip", "rules.OneR",
            "trees.DecisionStump", "trees.J48", "trees.RandomForest",
            "trees.REPTree"
        ]
        for row_num, dati in enumerate(alg):
            worksheet.write(row_num + 1, 0, dati)

        # FIX: open the per-dataset text results file with a context manager
        # so it is closed even if a classifier raises mid-loop.
        with open("./selected_vectors30/risultati/" + nomefile + ".txt",
                  "w+") as f:
            for i in alg:
                remove = Filter(
                    classname="weka.filters.unsupervised.attribute.Remove")
                cls = Classifier(classname="weka.classifiers." + i)
                fc = FilteredClassifier()
                fc.filter = remove
                fc.classifier = cls
                evl = Evaluation(data)
                # 10-fold cross-validation.
                evl.crossvalidate_model(fc, data, 10, Random(1))
                k = evl.summary()
                # Write the raw summary to the text file...
                f.write(i + "\n")
                f.write(k + "\n")
                # ...and the last 10 characters of each summary line (the
                # numeric values) to the spreadsheet row for this classifier.
                my_list = k.split('\n')
                for col_num, dati in enumerate(my_list):
                    worksheet.write(riga, col_num, dati[-10:])
                print(i)
                riga += 1
        workbook.close()
print("Train/test/predict...")
# Evaluate a LinearRegression model per data-set group.
groups = ["DataSet1", "DataSet2"]
# groups = ["DataSet2"]
for group in groups:
    print(group)
    # Per-group file paths: calibration (train), test, and validation sets.
    train = data_dir + os.sep + group + "_Cal.arff"
    test = data_dir + os.sep + group + "_Test.arff"
    pred = data_dir + os.sep + group + "_Val.arff"
    loader = Loader(classname="weka.core.converters.ArffLoader")
    print(train)
    train_data = loader.load_file(train)
    # The regression target is the attribute named "reference value".
    train_data.class_index = train_data.attribute_by_name("reference value").index
    print(test)
    test_data = loader.load_file(test)
    test_data.class_index = test_data.attribute_by_name("reference value").index
    print(pred)
    # NOTE(review): pred_data is loaded but not used in this span — presumably
    # consumed by later code outside this view.
    pred_data = loader.load_file(pred)
    pred_data.class_index = pred_data.attribute_by_name("reference value").index
    # LinearRegression behind a Remove filter that drops the first attribute.
    cls = FilteredClassifier()
    cls.classifier = Classifier(classname="weka.classifiers.functions.LinearRegression", options=["-S", "1", "-C"])
    cls.filter = Filter(classname="weka.filters.unsupervised.attribute.Remove", options=["-R", "first"])
    # Train on the calibration set, evaluate on the held-out test set.
    cls.build_classifier(train_data)
    evl = Evaluation(train_data)
    evl.test_model(cls, test_data)
    print(evl.summary())
# Shut down the JVM after all groups are processed.
jvm.stop()
# Compare several text classifiers on the ReutersGrain data, each behind a
# StringToWordVector filter with different options.
fname = data_dir + os.sep + "ReutersGrain-test.arff"
print("\nLoading dataset: " + fname + "\n")
loader = Loader(classname="weka.core.converters.ArffLoader")
test = loader.load_file(fname)
test.class_is_last()
# (classifier classname, StringToWordVector filter options) pairs to compare.
setups = (
    ("weka.classifiers.trees.J48", []),
    ("weka.classifiers.bayes.NaiveBayes", []),
    ("weka.classifiers.bayes.NaiveBayesMultinomial", []),
    ("weka.classifiers.bayes.NaiveBayesMultinomial", ["-C"]),
    ("weka.classifiers.bayes.NaiveBayesMultinomial",
     ["-C", "-L", "-stopwords-handler", "weka.core.stopwords.Rainbow"])
)
# cross-validate classifiers
for setup in setups:
    classifier, opt = setup
    print("\n--> %s (filter options: %s)\n" % (classifier, " ".join(opt)))
    cls = FilteredClassifier()
    cls.classifier = Classifier(classname=classifier)
    cls.filter = Filter(classname="weka.filters.unsupervised.attribute.StringToWordVector", options=opt)
    # NOTE(review): `data` is not defined in this span — presumably the
    # training set loaded by earlier code outside this view; confirm.
    cls.build_classifier(data)
    # Evaluate the trained model on the held-out test set.
    evl = Evaluation(test)
    evl.test_model(cls, test)
    print("Accuracy: %0.0f%%" % evl.percent_correct)
    # AUC from the threshold curve for class index 0.
    tcdata = plc.generate_thresholdcurve_data(evl, 0)
    print("AUC: %0.3f" % plc.get_auc(tcdata))
    print(evl.matrix("Matrix:"))
# Shut down the JVM once all setups have been evaluated.
jvm.stop()
# NOTE(review): this span depends on names defined earlier outside this view
# (tevlmt, Wtrain, pd, sns, plt, plcls, packages) — confirm against the full file.
# Report the error rate and confusion matrix of the earlier LMT evaluation.
print("Error is",tevlmt.error_rate)
tcm2e = tevlmt.confusion_matrix
tcm2E = pd.DataFrame(tcm2e, index = ["neg","pos"],columns = ["neg","pos"])
plt.figure(figsize = (7,7))
axis = sns.heatmap(tcm2E, annot=True, cbar=False, cmap="Reds")
plcls.plot_roc(tevlmt,class_index=[1])
# Re-run LMT with SMOTE oversampling (minority class inflated by 4800%)
# applied inside the cross-validation via a FilteredClassifier.
packages.install_package("SMOTE")
smote = Filter(classname="weka.filters.supervised.instance.SMOTE",options=["-P", "4800"])
smt = Classifier(classname="weka.classifiers.trees.LMT")
fc = FilteredClassifier()
fc.filter = smote
fc.classifier = smt
fc.build_classifier(Wtrain)
# 5-fold cross-validation of the SMOTE+LMT pipeline on the training data.
evsmt = Evaluation(Wtrain)
evsmt.crossvalidate_model(fc, Wtrain, 5, Random(1))
print("Error is",evsmt.error_rate)
cm2f = evsmt.confusion_matrix
cm2F = pd.DataFrame(cm2f, index = ["neg","pos"],columns = ["neg","pos"])
plt.figure(figsize = (7,7))
axis = sns.heatmap(cm2F, annot=True, cbar=False, cmap="Reds")
plcls.plot_roc(evsmt,class_index=[1])
# Fresh Evaluation object — presumably used by later code outside this view.
tevsmt = Evaluation(Wtrain)