def Boost_J48(data, rnm):
    # AdaBoostM1-boosted J48; the first attribute (e.g. an ID column) is removed inside the FilteredClassifier
    data.class_is_last()
    fc1 = FilteredClassifier()
    fc1.classifier = Classifier(classname="weka.classifiers.trees.J48",
                                options=["-C", "0.25", "-M", "2"])
    fc1.filter = Filter(classname="weka.filters.unsupervised.attribute.Remove",
                        options=["-R", "first"])
    fc2 = SingleClassifierEnhancer(classname="weka.classifiers.meta.AdaBoostM1",
                                   options=["-P", "100", "-S", "1", "-I", "10"])
    fc2.classifier = fc1
    pred_output = PredictionOutput(classname="weka.classifiers.evaluation.output.prediction.CSV",
                                   options=["-p", "1"])
    folds = 10
    fc2.build_classifier(data)
    evaluation = Evaluation(data)
    evaluation.crossvalidate_model(fc2, data, folds, Random(1), pred_output)
    with open(rnm + '_Boost_J48_Tree.txt', 'w') as f0:
        print("Filename:", rnm, file=f0)
        print('\n\n', file=f0)
        print(str(fc2), file=f0)
    with open(rnm + '_Boost_J48_Prediction.txt', 'w') as f1:
        print('Filename:', rnm, file=f1)
        print('Prediction Summary:', pred_output.buffer_content(), file=f1)
    with open(rnm + '_Boost_J48_Evaluation.txt', 'w') as f2:
        print('Filename:', rnm, file=f2)
        print('Evaluation Summary:', evaluation.summary(), file=f2)
        print('\n\n\n', file=f2)
        print(evaluation.class_details(), file=f2)
    plot_roc(evaluation, class_index=[0, 1], title=rnm, key_loc='best',
             outfile=rnm + '_Boost_J48_ROC.png', wait=False)
    value_Boost_J48 = str(evaluation.percent_correct)
    return value_Boost_J48
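These snippets generally assume the usual python-weka-wrapper3 imports and a running JVM; a minimal sketch of that common setup (module paths are from python-weka-wrapper3, version details may vary):

import weka.core.jvm as jvm
from weka.core.classes import Random
from weka.core.converters import Loader
from weka.filters import Filter
from weka.classifiers import (Classifier, FilteredClassifier, SingleClassifierEnhancer,
                              Evaluation, PredictionOutput)
from weka.plot.classifiers import plot_roc

jvm.start(packages=True)  # package support is only needed for add-ons such as SMOTE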
def fitness(toeval: Individual):
    # fitness = 10-fold CV accuracy of an MLP configured from the individual's genome
    cls = Classifier(classname="weka.classifiers.functions.MultilayerPerceptron",
                     options=toeval.settings())
    fc = FilteredClassifier()
    fc.filter = remove
    fc.classifier = cls
    evl = Evaluation(data)
    evl.crossvalidate_model(fc, data, 10, Random(1))
    return evl.percent_correct
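fitness() closes over module-level data and remove, and Individual comes from the surrounding GA code; a sketch of what that context might look like (the Individual class and its option mapping are assumptions, not from the original):

from weka.core.converters import Loader
from weka.filters import Filter

loader = Loader(classname="weka.core.converters.ArffLoader")
data = loader.load_file("dataset.arff")  # placeholder path
data.class_is_last()
remove = Filter(classname="weka.filters.unsupervised.attribute.Remove",
                options=["-R", "first"])  # e.g. drop an ID column

class Individual:
    """Hypothetical genome: maps evolved parameters to MLP command-line options."""
    def __init__(self, learning_rate=0.3, momentum=0.2, hidden="a"):
        self.learning_rate = learning_rate
        self.momentum = momentum
        self.hidden = hidden

    def settings(self):
        return ["-L", str(self.learning_rate),
                "-M", str(self.momentum),
                "-H", self.hidden]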
def TrainingModel(arff, modelOutput, clsfier):
    # start the JVM
    jvm.start()
    # load the training set
    loader = Loader(classname="weka.core.converters.ArffLoader")
    train = loader.load_file(arff)
    train.class_is_first()
    # train with RandomForest: after trying several schemes in the Weka GUI,
    # this one gave the highest TPR and TNR
    cls_name = "weka.classifiers." + clsfier
    clsf = Classifier(classname=cls_name)
    clsf.build_classifier(train)
    print(clsf)
    # build the model (no filter is set, so FilteredClassifier keeps its default)
    fc = FilteredClassifier()
    fc.classifier = clsf
    evl = Evaluation(train)
    evl.crossvalidate_model(fc, train, 10, Random(1))
    print(evl.percent_correct)
    print(evl.summary())
    print(evl.class_details())
    print(evl.matrix())
    # tally the results
    matrixResults = evl.confusion_matrix
    TN = float(matrixResults[0][0])
    FP = float(matrixResults[0][1])
    FN = float(matrixResults[1][0])
    TP = float(matrixResults[1][1])
    TPR = TP / (TP + FN)
    TNR = TN / (FP + TN)
    PPV = TP / (TP + FP)
    NPV = TN / (TN + FN)
    print("Algorithm: " + clsfier)
    print("Sensitivity TPR: " + str(TPR))
    print("Specificity TNR: " + str(TNR))
    print("PPV: " + str(PPV))
    print("NPV: " + str(NPV))
    # save the model
    clsf.serialize(modelOutput, header=train)
    # shut down the JVM
    jvm.stop()
    print("Model training complete")
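A hedged usage sketch for TrainingModel (paths are placeholders; the classifier name follows the weka.classifiers.* shorthand the function expects):

TrainingModel(arff="train.arff",
              modelOutput="rf.model",
              clsfier="trees.RandomForest")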
def RandomTree(data, rnm):
    data.class_is_last()
    fc = FilteredClassifier()
    fc.classifier = Classifier(classname="weka.classifiers.trees.RandomTree",
                               options=["-K", "0", "-M", "1.0", "-V", "0.001", "-S", "1"])
    fc.filter = Filter(classname="weka.filters.unsupervised.attribute.Remove",
                       options=["-R", "first"])
    pred_output = PredictionOutput(classname="weka.classifiers.evaluation.output.prediction.CSV",
                                   options=["-p", "1"])
    folds = 10
    evl = Evaluation(data)
    evl.crossvalidate_model(fc, data, folds, Random(1), pred_output)
    fc.build_classifier(data)
    with open(rnm + '_RT_Tree.txt', 'w') as f0:
        print("Filename:", rnm, file=f0)
        print('\n\n', file=f0)
        print(str(fc), file=f0)
    with open(rnm + '_RT_Prediction.txt', 'w') as f1:
        print('Filename:', rnm, file=f1)
        print('Prediction Summary:', pred_output.buffer_content(), file=f1)
    with open(rnm + '_RT_Evaluation.txt', 'w') as f2:
        print('Filename:', rnm, file=f2)
        print('Evaluation Summary:', evl.summary(), file=f2)
        print('\n\n\n', file=f2)
        print(evl.class_details(), file=f2)
    plot_roc(evl, class_index=[0, 1], title=rnm, key_loc='best',
             outfile=rnm + '_RT_ROC.png', wait=False)
    value_RT = str(evl.percent_correct)
    return value_RT
# NOTE: this snippet targets the legacy python-weka-wrapper 0.x API (setter
# methods, metrics as method calls); a later snippet shows the property-based API.
fname = data_dir + os.sep + "ReutersGrain-test.arff"
print("\nLoading dataset: " + fname + "\n")
loader = Loader(classname="weka.core.converters.ArffLoader")
test = loader.load_file(fname)
test.set_class_index(test.num_attributes() - 1)

setups = (
    ("weka.classifiers.trees.J48", []),
    ("weka.classifiers.bayes.NaiveBayes", []),
    ("weka.classifiers.bayes.NaiveBayesMultinomial", []),
    ("weka.classifiers.bayes.NaiveBayesMultinomial", ["-C"]),
    ("weka.classifiers.bayes.NaiveBayesMultinomial", ["-C", "-L", "-S"])
)

# build the classifiers on the training data (`data`, loaded earlier) and evaluate on the test set
for setup in setups:
    classifier, opt = setup
    print("\n--> %s (filter options: %s)\n" % (classifier, " ".join(opt)))
    cls = FilteredClassifier()
    cls.set_classifier(Classifier(classname=classifier))
    cls.set_filter(Filter(classname="weka.filters.unsupervised.attribute.StringToWordVector", options=opt))
    cls.build_classifier(data)
    evl = Evaluation(test)
    evl.test_model(cls, test)
    print("Accuracy: %0.0f%%" % evl.percent_correct())
    tcdata = plc.generate_thresholdcurve_data(evl, 0)
    print("AUC: %0.3f" % plc.get_auc(tcdata))
    print(evl.to_matrix("Matrix:"))
jvm.stop()
data.class_is_last()

# 1. cheating with default filter: discretize the full dataset up front, then cross-validate
fltr = Filter(classname="weka.filters.supervised.attribute.Discretize", options=[])
fltr.inputformat(data)
filtered = fltr.filter(data)
cls = Classifier(classname="weka.classifiers.trees.J48")
evl = Evaluation(filtered)
evl.crossvalidate_model(cls, filtered, 10, Random(1))
cls.build_classifier(filtered)
print("cheating (default): accuracy=%0.1f nodes=%s" % (evl.percent_correct, get_nodes(str(cls))))

# 2. using FilteredClassifier with default filter: discretization happens inside each fold
cls = FilteredClassifier()
cls.classifier = Classifier(classname="weka.classifiers.trees.J48")
cls.filter = Filter(classname="weka.filters.supervised.attribute.Discretize", options=[])
evl = Evaluation(data)
evl.crossvalidate_model(cls, data, 10, Random(1))
cls.build_classifier(data)
print("FilteredClassifier (default): accuracy=%0.1f nodes=%s" % (evl.percent_correct, get_nodes(str(cls))))

# 3. using FilteredClassifier (make binary)
cls = FilteredClassifier()
cls.classifier = Classifier(classname="weka.classifiers.trees.J48")
cls.filter = Filter(classname="weka.filters.supervised.attribute.Discretize", options=["-D"])
evl = Evaluation(data)
evl.crossvalidate_model(cls, data, 10, Random(1))
cls.build_classifier(data)
print("FilteredClassifier (make binary): accuracy=%0.1f nodes=%s" % (evl.percent_correct, get_nodes(str(cls))))
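get_nodes is defined elsewhere in this script; a plausible sketch, assuming it parses the node count out of J48's textual model output (the regex and fallback are guesses):

import re

def get_nodes(model_str):
    # J48's text output ends with a 'Size of the tree : <n>' line
    match = re.search(r"Size of the tree\s*:\s*(\d+)", model_str)
    return match.group(1) if match else "?"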
def main():
    """
    Just runs some example code.
    """
    # load a dataset
    iris_file = helper.get_data_dir() + os.sep + "iris.arff"
    helper.print_info("Loading dataset: " + iris_file)
    loader = Loader("weka.core.converters.ArffLoader")
    iris_data = loader.load_file(iris_file)
    iris_data.class_is_last()

    # classifier help
    helper.print_title("Creating help string")
    classifier = Classifier(classname="weka.classifiers.trees.J48")
    print(classifier.to_help())

    # partial classname
    helper.print_title("Creating classifier from partial classname")
    clsname = ".J48"
    classifier = Classifier(classname=clsname)
    print(clsname + " --> " + classifier.classname)

    # classifier from commandline
    helper.print_title("Creating SMO from command-line string")
    cmdline = 'weka.classifiers.functions.SMO -K "weka.classifiers.functions.supportVector.NormalizedPolyKernel -E 3.0"'
    classifier = from_commandline(cmdline, classname="weka.classifiers.Classifier")
    classifier.build_classifier(iris_data)
    print("input: " + cmdline)
    print("output: " + classifier.to_commandline())
    print("model:\n" + str(classifier))

    # kernel classifier
    helper.print_title("Creating SMO as KernelClassifier")
    kernel = Kernel(classname="weka.classifiers.functions.supportVector.RBFKernel",
                    options=["-G", "0.001"])
    classifier = KernelClassifier(classname="weka.classifiers.functions.SMO", options=["-M"])
    classifier.kernel = kernel
    classifier.build_classifier(iris_data)
    print("classifier: " + classifier.to_commandline())
    print("model:\n" + str(classifier))

    # build a classifier and output model
    helper.print_title("Training J48 classifier on iris")
    classifier = Classifier(classname="weka.classifiers.trees.J48")
    # Instead of using 'options=["-C", "0.3"]' in the constructor, we can also
    # set the "confidenceFactor" property of the J48 classifier itself.
    # However, being of type float rather than double, we need to convert it
    # to the correct type first using the double_to_float function:
    classifier.set_property("confidenceFactor", types.double_to_float(0.3))
    classifier.build_classifier(iris_data)
    print(classifier)
    print(classifier.graph)
    print(classifier.to_source("MyJ48"))
    plot_graph.plot_dot_graph(classifier.graph)

    # evaluate model on test set
    helper.print_title("Evaluating J48 classifier on iris")
    evaluation = Evaluation(iris_data)
    evl = evaluation.test_model(classifier, iris_data)
    print(evl)
    print(evaluation.summary())

    # evaluate model on train/test split
    helper.print_title("Evaluating J48 classifier on iris (random split 66%)")
    classifier = Classifier(classname="weka.classifiers.trees.J48", options=["-C", "0.3"])
    evaluation = Evaluation(iris_data)
    evaluation.evaluate_train_test_split(classifier, iris_data, 66.0, Random(1))
    print(evaluation.summary())

    # load a dataset incrementally and build classifier incrementally
    helper.print_title("Build classifier incrementally on iris")
    helper.print_info("Loading dataset: " + iris_file)
    loader = Loader("weka.core.converters.ArffLoader")
    iris_inc = loader.load_file(iris_file, incremental=True)
    iris_inc.class_is_last()
    classifier = Classifier(classname="weka.classifiers.bayes.NaiveBayesUpdateable")
    classifier.build_classifier(iris_inc)
    for inst in loader:
        classifier.update_classifier(inst)
    print(classifier)

    # construct meta-classifiers
    helper.print_title("Meta classifiers")

    # generic FilteredClassifier instantiation
    print("generic FilteredClassifier instantiation")
    meta = SingleClassifierEnhancer(classname="weka.classifiers.meta.FilteredClassifier")
    meta.classifier = Classifier(classname="weka.classifiers.functions.LinearRegression")
    flter = Filter("weka.filters.unsupervised.attribute.Remove")
    flter.options = ["-R", "first"]
    meta.set_property("filter", flter.jobject)
    print(meta.to_commandline())

    # direct FilteredClassifier instantiation
    print("direct FilteredClassifier instantiation")
    meta = FilteredClassifier()
    meta.classifier = Classifier(classname="weka.classifiers.functions.LinearRegression")
    flter = Filter("weka.filters.unsupervised.attribute.Remove")
    flter.options = ["-R", "first"]
    meta.filter = flter
    print(meta.to_commandline())

    # generic Vote
    print("generic Vote instantiation")
    meta = MultipleClassifiersCombiner(classname="weka.classifiers.meta.Vote")
    classifiers = [
        Classifier(classname="weka.classifiers.functions.SMO"),
        Classifier(classname="weka.classifiers.trees.J48")
    ]
    meta.classifiers = classifiers
    print(meta.to_commandline())

    # cross-validate nominal classifier
    helper.print_title("Cross-validating NaiveBayes on diabetes")
    diabetes_file = helper.get_data_dir() + os.sep + "diabetes.arff"
    helper.print_info("Loading dataset: " + diabetes_file)
    loader = Loader("weka.core.converters.ArffLoader")
    diabetes_data = loader.load_file(diabetes_file)
    diabetes_data.class_is_last()
    classifier = Classifier(classname="weka.classifiers.bayes.NaiveBayes")
    pred_output = PredictionOutput(
        classname="weka.classifiers.evaluation.output.prediction.PlainText",
        options=["-distribution"])
    evaluation = Evaluation(diabetes_data)
    evaluation.crossvalidate_model(classifier, diabetes_data, 10, Random(42), output=pred_output)
    print(evaluation.summary())
    print(evaluation.class_details())
    print(evaluation.matrix())
    print("areaUnderPRC/0: " + str(evaluation.area_under_prc(0)))
    print("weightedAreaUnderPRC: " + str(evaluation.weighted_area_under_prc))
    print("areaUnderROC/1: " + str(evaluation.area_under_roc(1)))
print("weightedAreaUnderROC: " + str(evaluation.weighted_area_under_roc)) print("avgCost: " + str(evaluation.avg_cost)) print("totalCost: " + str(evaluation.total_cost)) print("confusionMatrix: " + str(evaluation.confusion_matrix)) print("correct: " + str(evaluation.correct)) print("pctCorrect: " + str(evaluation.percent_correct)) print("incorrect: " + str(evaluation.incorrect)) print("pctIncorrect: " + str(evaluation.percent_incorrect)) print("unclassified: " + str(evaluation.unclassified)) print("pctUnclassified: " + str(evaluation.percent_unclassified)) print("coverageOfTestCasesByPredictedRegions: " + str(evaluation.coverage_of_test_cases_by_predicted_regions)) print("sizeOfPredictedRegions: " + str(evaluation.size_of_predicted_regions)) print("falseNegativeRate: " + str(evaluation.false_negative_rate(1))) print("weightedFalseNegativeRate: " + str(evaluation.weighted_false_negative_rate)) print("numFalseNegatives: " + str(evaluation.num_false_negatives(1))) print("trueNegativeRate: " + str(evaluation.true_negative_rate(1))) print("weightedTrueNegativeRate: " + str(evaluation.weighted_true_negative_rate)) print("numTrueNegatives: " + str(evaluation.num_true_negatives(1))) print("falsePositiveRate: " + str(evaluation.false_positive_rate(1))) print("weightedFalsePositiveRate: " + str(evaluation.weighted_false_positive_rate)) print("numFalsePositives: " + str(evaluation.num_false_positives(1))) print("truePositiveRate: " + str(evaluation.true_positive_rate(1))) print("weightedTruePositiveRate: " + str(evaluation.weighted_true_positive_rate)) print("numTruePositives: " + str(evaluation.num_true_positives(1))) print("fMeasure: " + str(evaluation.f_measure(1))) print("weightedFMeasure: " + str(evaluation.weighted_f_measure)) print("unweightedMacroFmeasure: " + str(evaluation.unweighted_macro_f_measure)) print("unweightedMicroFmeasure: " + str(evaluation.unweighted_micro_f_measure)) print("precision: " + str(evaluation.precision(1))) print("weightedPrecision: " + str(evaluation.weighted_precision)) print("recall: " + str(evaluation.recall(1))) print("weightedRecall: " + str(evaluation.weighted_recall)) print("kappa: " + str(evaluation.kappa)) print("KBInformation: " + str(evaluation.kb_information)) print("KBMeanInformation: " + str(evaluation.kb_mean_information)) print("KBRelativeInformation: " + str(evaluation.kb_relative_information)) print("SFEntropyGain: " + str(evaluation.sf_entropy_gain)) print("SFMeanEntropyGain: " + str(evaluation.sf_mean_entropy_gain)) print("SFMeanPriorEntropy: " + str(evaluation.sf_mean_prior_entropy)) print("SFMeanSchemeEntropy: " + str(evaluation.sf_mean_scheme_entropy)) print("matthewsCorrelationCoefficient: " + str(evaluation.matthews_correlation_coefficient(1))) print("weightedMatthewsCorrelation: " + str(evaluation.weighted_matthews_correlation)) print("class priors: " + str(evaluation.class_priors)) print("numInstances: " + str(evaluation.num_instances)) print("meanAbsoluteError: " + str(evaluation.mean_absolute_error)) print("meanPriorAbsoluteError: " + str(evaluation.mean_prior_absolute_error)) print("relativeAbsoluteError: " + str(evaluation.relative_absolute_error)) print("rootMeanSquaredError: " + str(evaluation.root_mean_squared_error)) print("rootMeanPriorSquaredError: " + str(evaluation.root_mean_prior_squared_error)) print("rootRelativeSquaredError: " + str(evaluation.root_relative_squared_error)) print("prediction output:\n" + str(pred_output)) plot_cls.plot_roc(evaluation, title="ROC diabetes", class_index=range( 0, 
    plot_cls.plot_roc(evaluation, title="ROC diabetes",
                      class_index=range(0, diabetes_data.class_attribute.num_values),
                      wait=False)
    plot_cls.plot_prc(evaluation, title="PRC diabetes",
                      class_index=range(0, diabetes_data.class_attribute.num_values),
                      wait=False)

    # train 2nd classifier on diabetes dataset
    classifier2 = Classifier(classname="weka.classifiers.trees.RandomForest")
    evaluation2 = Evaluation(diabetes_data)
    evaluation2.crossvalidate_model(classifier2, diabetes_data, 10, Random(42))
    plot_cls.plot_rocs({"NB": evaluation, "RF": evaluation2},
                       title="ROC diabetes", class_index=0, wait=False)
    plot_cls.plot_prcs({"NB": evaluation, "RF": evaluation2},
                       title="PRC diabetes", class_index=0, wait=False)

    # load a numeric dataset
    bolts_file = helper.get_data_dir() + os.sep + "bolts.arff"
    helper.print_info("Loading dataset: " + bolts_file)
    loader = Loader("weka.core.converters.ArffLoader")
    bolts_data = loader.load_file(bolts_file)
    bolts_data.class_is_last()

    # build a classifier and output model
    helper.print_title("Training LinearRegression on bolts")
    classifier = Classifier(classname="weka.classifiers.functions.LinearRegression",
                            options=["-S", "1", "-C"])
    classifier.build_classifier(bolts_data)
    print(classifier)

    # cross-validate numeric classifier
    helper.print_title("Cross-validating LinearRegression on bolts")
    classifier = Classifier(classname="weka.classifiers.functions.LinearRegression",
                            options=["-S", "1", "-C"])
    evaluation = Evaluation(bolts_data)
    evaluation.crossvalidate_model(classifier, bolts_data, 10, Random(42))
    print(evaluation.summary())
    print("correlationCoefficient: " + str(evaluation.correlation_coefficient))
    print("errorRate: " + str(evaluation.error_rate))
    helper.print_title("Header - bolts")
    print(str(evaluation.header))
    helper.print_title("Predictions on bolts")
    for index, pred in enumerate(evaluation.predictions):
        print(str(index + 1) + ": " + str(pred) + " -> error=" + str(pred.error))
    plot_cls.plot_classifier_errors(evaluation.predictions, wait=False)

    # train 2nd classifier and show errors in same plot
    classifier2 = Classifier(classname="weka.classifiers.functions.SMOreg")
    evaluation2 = Evaluation(bolts_data)
    evaluation2.crossvalidate_model(classifier2, bolts_data, 10, Random(42))
    plot_cls.plot_classifier_errors(
        {"LR": evaluation.predictions, "SMOreg": evaluation2.predictions},
        wait=False)

    # learning curve
    cls = [
        Classifier(classname="weka.classifiers.trees.J48"),
        Classifier(classname="weka.classifiers.bayes.NaiveBayesUpdateable")
    ]
    plot_cls.plot_learning_curve(cls, diabetes_data, increments=0.05,
                                 label_template="[#] !", metric="percent_correct",
                                 wait=True)

    # access classifier's Java API
    labor_file = helper.get_data_dir() + os.sep + "labor.arff"
    helper.print_info("Loading dataset: " + labor_file)
    loader = Loader("weka.core.converters.ArffLoader")
    labor_data = loader.load_file(labor_file)
    labor_data.class_is_last()
    helper.print_title("Using JRip's Java API to access rules")
    jrip = Classifier(classname="weka.classifiers.rules.JRip")
    jrip.build_classifier(labor_data)
    rset = jrip.jwrapper.getRuleset()
    for i in range(rset.size()):  # xrange in the Python 2 original
        r = rset.get(i)
        print(str(r.toString(labor_data.class_attribute.jobject)))
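This main() follows the layout of the python-weka-wrapper example scripts; the entry-point boilerplate that usually surrounds it looks like this (a sketch of that standard pattern):

import traceback
import weka.core.jvm as jvm

if __name__ == "__main__":
    try:
        jvm.start()
        main()
    except Exception:
        print(traceback.format_exc())
    finally:
        jvm.stop()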
# NOTE: legacy python-weka-wrapper (Python 2) variant of the learning-curve loop;
# `data`, `percentages`, `repetitions`, `curves`, `baseline` and `plot` are set up earlier.
# progress info
sys.stdout.write("Repetitions=" + str(repetition))
# initialize curve
curve = {}
for percentage in percentages:
    curve[percentage] = 0
curves[repetition] = curve
# run and add up percentage correct from repetition
for seed in xrange(repetition):
    seed += 1
    sys.stdout.write(".")
    for percentage in percentages:
        cls = Classifier(classname="weka.classifiers.trees.J48")
        flt = Filter(classname="weka.filters.unsupervised.instance.Resample",
                     options=["-Z", str(percentage), "-no-replacement"])
        fc = FilteredClassifier()
        fc.set_classifier(cls)
        fc.set_filter(flt)
        evl = Evaluation(data)
        evl.crossvalidate_model(fc, data, 10, Random(seed))
        curve[percentage] += (evl.percent_correct() / repetition)
# progress info
sys.stdout.write("\n")

# output the results
if not plot.matplotlib_available:
    print("ZeroR: " + str(baseline))
    for repetition in repetitions:
        y = []
        for percentage in percentages:
            y.append(curves[repetition][percentage])
def main():
    """
    Just runs some example code.
    """
    # alternative batches of feature-selected datasets, kept for reference:
    #case=["selected_chi100","selected_chi150","selected_chi200","selected_chi250","selected_chi300","selected_chi350","selected_fe100","selected_fe150","selected_fe200","selected_fe250","selected_fe300","selected_fe350","selected_sfm100","selected_sfm150","selected_sfm200","selected_sfm250","selected_sfm300","selected_sfm350","selected_sfmt100","selected_sfmt150","selected_sfmt200","selected_sfmt250","selected_sfmt300","selected_sfmt350","selected_cs100","selected_cs150","selected_cs200","selected_cs250","selected_cs300","selected_cs350"]
    #case=["feature_vector100","feature_vector110","feature_vector120","feature_vector130","feature_vector140","feature_vector150","feature_vector90","feature_vector80","feature_vector70","feature_vector60","feature_vector50","feature_vector40","feature_vector30","feature_vector20","feature_vector10","feature_vector5"]
    case = [
        "selected_chi100", "selected_chi150", "selected_chi200", "selected_chi250",
        "selected_chi300", "selected_chi350", "selected_fe100", "selected_fe150",
        "selected_fe200", "selected_fe250", "selected_fe300", "selected_fe350",
        "selected_sfm100", "selected_sfm150", "selected_sfm200", "selected_sfm250",
        "selected_sfm300", "selected_sfm350", "selected_sfmt100", "selected_sfmt150",
        "selected_sfmt200", "selected_sfmt250", "selected_sfmt300", "selected_sfmt350"
    ]
    #case=["selected_chi100","selected_chi150","selected_chi200","selected_fe100","selected_fe150","selected_fe200","selected_sfm100","selected_sfm150","selected_sfm200","selected_sfmt100","selected_sfmt150","selected_sfmt200","selected_cs100","selected_cs150","selected_cs200",]
    for nomefile in case:
        print(nomefile)
        feature_vector = "./selected_vectors30/" + nomefile + ".arff"
        # read the ARFF file
        loader = Loader("weka.core.converters.ArffLoader")
        data = loader.load_file(feature_vector)
        data.class_is_last()
        #print(data)
        # plain-text results file
        f = open("./selected_vectors30/risultati/" + nomefile + ".txt", "w+")
        # Excel header row
        intest = [
            "Correlation coefficient", "Mean absolute error", "Root mean squared error",
            "Relative absolute error", "Root relative squared error",
            "Total Number of Instances"
        ]
        # Excel results file
        workbook = xlsxwriter.Workbook("./selected_vectors30/risultati/" + nomefile + ".xlsx")
        worksheet = workbook.add_worksheet()
        for col_num, dati in enumerate(intest):
            worksheet.write(0, col_num + 1, dati)
        riga = 1
        # list of algorithms to run
        #alg=["meta.Bagging","meta.RandomSubSpace","rules.M5Rules","trees.M5P","trees.RandomForest"]
        alg = [
            "bayes.NaiveBayes", "bayes.NaiveBayesUpdateable", "functions.Logistic",
            "functions.SGD", "functions.SimpleLogistic", "functions.SMO",
            "functions.VotedPerceptron", "meta.AdaBoostM1",
            "meta.AttributeSelectedClassifier", "meta.Bagging",
            "meta.ClassificationViaRegression", "meta.IterativeClassifierOptimizer",
            "meta.LogitBoost", "meta.RandomCommittee", "meta.RandomSubSpace",
            "rules.DecisionTable", "rules.JRip", "rules.OneR", "trees.DecisionStump",
            "trees.J48", "trees.RandomForest", "trees.REPTree"
        ]
        for row_num, dati in enumerate(alg):
            worksheet.write(row_num + 1, 0, dati)
        for i in alg:
            remove = Filter(classname="weka.filters.unsupervised.attribute.Remove")
            cls = Classifier(classname="weka.classifiers." + i)
            fc = FilteredClassifier()
            fc.filter = remove
            fc.classifier = cls
            evl = Evaluation(data)
            evl.crossvalidate_model(fc, data, 10, Random(1))  # 10-fold cross-validation
            #evl.evaluate_train_test_split(fc, data, 50, None, None)  # 50% train/test split
            k = evl.summary()
            # write results to the files
            f.write(i + "\n")
            f.write(k + "\n")
            my_list = k.split('\n')
            for col_num, dati in enumerate(my_list):
                worksheet.write(riga, col_num, dati[-10:])
            print(i)
            riga += 1
        f.close()
        workbook.close()
train = converters.load_any_file("imbalanced_train.arff")
test = converters.load_any_file("imbalanced_test.arff")
train.class_is_last()
test.class_is_last()

# oversample the minority class by 500% (SMOTE -P 500)
smote = Filter(classname="weka.filters.supervised.instance.SMOTE",
               options=["-P", "500.0"])
# base classifier
cls = Classifier(classname="weka.classifiers.trees.LMT", options=["-B", "-I", "10"])
# filtered classifier: SMOTE is applied to the training folds only
fc = FilteredClassifier()
fc.filter = smote
fc.classifier = cls

# 5-fold cross-validation
evl = Evaluation(train)
evl.crossvalidate_model(fc, train, 5, Random(1))

# print confusion matrix along with other summary statistics
print("LMT (SMOTE balanced classes) CV = 5 Error: %.2f%%" % evl.percent_incorrect)
print(evl.matrix())  # confusion matrix

# plot ROC
plcls.plot_roc(evl, class_index=[0, 1], wait=True)
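SMOTE is a Weka package rather than part of the core distribution, so it has to be installed once before the filter can be instantiated; a minimal sketch (the JVM must be started with package support):

import weka.core.jvm as jvm
import weka.core.packages as packages

jvm.start(packages=True)
if not packages.is_installed("SMOTE"):
    packages.install_package("SMOTE")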
sys.stdout.write("Repetitions=" + str(repetition)) # initialize curve curve = {} for percentage in percentages: curve[percentage] = 0 curves[repetition] = curve # run and add up percentage correct from repetition for seed in xrange(repetition): seed += 1 sys.stdout.write(".") for percentage in percentages: cls = Classifier(classname="weka.classifiers.trees.J48") flt = Filter( classname="weka.filters.unsupervised.instance.Resample", options=["-Z", str(percentage), "-no-replacement"]) fc = FilteredClassifier() fc.classifier = cls fc.filter = flt evl = Evaluation(data) evl.crossvalidate_model(fc, data, 10, Random(seed)) curve[percentage] += (evl.percent_correct / repetition) # progress info sys.stdout.write("\n") # output the results if not plot.matplotlib_available: print("ZeroR: " + str(baseline)) for repetition in repetitions: y = [] for percentage in percentages: y.append(curves[repetition][percentage])
db = client["news-scraper"] articles_collection = db.articles article = articles_collection.find_one({"_id": article_id}) jvm.start(system_cp=True, packages=True, max_heap_size="512m") # Train classifier loader = Loader(classname="weka.core.converters.ArffLoader", options=["-charset", "UTF-8"]) train_data = loader.load_file(os.path.dirname(os.path.realpath(__file__)) + "/datasets/train.arff") train_data.class_is_last() string_to_word_vector_filter = Filter(classname="weka.filters.unsupervised.attribute.StringToWordVector") cls = Classifier(classname="weka.classifiers.bayes.NaiveBayesMultinomial") fc = FilteredClassifier() fc.filter = string_to_word_vector_filter fc.classifier = cls fc.build_classifier(train_data) # Create test data class_att = Attribute.create_nominal("class", ["good", "neutral", "bad"]) str_att = Attribute.create_string("title") test_dataset = Instances.create_instances( name="test_news_set", atts=[str_att, class_att], capacity=1 )
fname = data_dir + os.sep + "ReutersGrain-test.arff"
print("\nLoading dataset: " + fname + "\n")
loader = Loader(classname="weka.core.converters.ArffLoader")
test = loader.load_file(fname)
test.class_is_last()

setups = (
    ("weka.classifiers.trees.J48", []),
    ("weka.classifiers.bayes.NaiveBayes", []),
    ("weka.classifiers.bayes.NaiveBayesMultinomial", []),
    ("weka.classifiers.bayes.NaiveBayesMultinomial", ["-C"]),
    ("weka.classifiers.bayes.NaiveBayesMultinomial",
     ["-C", "-L", "-stopwords-handler", "weka.core.stopwords.Rainbow"])
)

# build the classifiers on the training data (`data`, loaded earlier) and evaluate on the test set
for setup in setups:
    classifier, opt = setup
    print("\n--> %s (filter options: %s)\n" % (classifier, " ".join(opt)))
    cls = FilteredClassifier()
    cls.classifier = Classifier(classname=classifier)
    cls.filter = Filter(classname="weka.filters.unsupervised.attribute.StringToWordVector", options=opt)
    cls.build_classifier(data)
    evl = Evaluation(test)
    evl.test_model(cls, test)
    print("Accuracy: %0.0f%%" % evl.percent_correct)
    tcdata = plc.generate_thresholdcurve_data(evl, 0)
    print("AUC: %0.3f" % plc.get_auc(tcdata))
    print(evl.matrix("Matrix:"))
jvm.stop()
import matplotlib.pyplot as plt
import seaborn as sns
import weka.core.jvm as jvm
from weka.core.classes import Random
from weka.core.converters import Loader
from weka.classifiers import Classifier, FilteredClassifier, Evaluation

jvm.start()
loader = Loader(classname="weka.core.converters.CSVLoader")
data = loader.load_file("C:/Arpit/aps.failure_training_set.csv")
data_test = loader.load_file("C:/Arpit/aps.failure_test_set.csv")
# print(str(data))
data.class_is_last()
data_test.class_is_last()

# remove = Filter(classname="weka.filters.unsupervised.attribute.Remove", options=["-R", "1-3"])
cls = Classifier(classname="weka.classifiers.trees.LMT")
fc = FilteredClassifier()
# fc.filter = remove
fc.classifier = cls

# cross-validate on the training data
evl = Evaluation(data)
evl.crossvalidate_model(fc, data, 10, Random(1))
print(evl.percent_incorrect)
# print(evl.summary())
# print(evl.class_details())

# train on the full training set, then evaluate on the held-out test set
fc.build_classifier(data)
evl_test = Evaluation(data)
evl_test.test_model(fc, data_test)
conf = evl_test.confusion_matrix
sns.heatmap(conf, cmap="YlGnBu", annot=True, linewidths=.5, fmt=".0f")  # counts are floats, so fmt='d' would fail
plt.show()
print("AUPRC", evl_test.area_under_prc(1))  # area_under_prc takes a class index; 1 assumed to be the positive class
print("Train/test/predict...") groups = ["DataSet1", "DataSet2"] # groups = ["DataSet2"] for group in groups: print(group) train = data_dir + os.sep + group + "_Cal.arff" test = data_dir + os.sep + group + "_Test.arff" pred = data_dir + os.sep + group + "_Val.arff" loader = Loader(classname="weka.core.converters.ArffLoader") print(train) train_data = loader.load_file(train) train_data.class_index = train_data.attribute_by_name("reference value").index print(test) test_data = loader.load_file(test) test_data.class_index = test_data.attribute_by_name("reference value").index print(pred) pred_data = loader.load_file(pred) pred_data.class_index = pred_data.attribute_by_name("reference value").index cls = FilteredClassifier() cls.classifier = Classifier(classname="weka.classifiers.functions.LinearRegression", options=["-S", "1", "-C"]) cls.filter = Filter(classname="weka.filters.unsupervised.attribute.Remove", options=["-R", "first"]) cls.build_classifier(train_data) evl = Evaluation(train_data) evl.test_model(cls, test_data) print(evl.summary()) jvm.stop()
print("Error is",tevlmt.error_rate) tcm2e = tevlmt.confusion_matrix tcm2E = pd.DataFrame(tcm2e, index = ["neg","pos"],columns = ["neg","pos"]) plt.figure(figsize = (7,7)) axis = sns.heatmap(tcm2E, annot=True, cbar=False, cmap="Reds") plcls.plot_roc(tevlmt,class_index=[1]) packages.install_package("SMOTE") smote = Filter(classname="weka.filters.supervised.instance.SMOTE",options=["-P", "4800"]) smt = Classifier(classname="weka.classifiers.trees.LMT") fc = FilteredClassifier() fc.filter = smote fc.classifier = smt fc.build_classifier(Wtrain) evsmt = Evaluation(Wtrain) evsmt.crossvalidate_model(fc, Wtrain, 5, Random(1)) print("Error is",evsmt.error_rate) cm2f = evsmt.confusion_matrix cm2F = pd.DataFrame(cm2f, index = ["neg","pos"],columns = ["neg","pos"]) plt.figure(figsize = (7,7)) axis = sns.heatmap(cm2F, annot=True, cbar=False, cmap="Reds") plcls.plot_roc(evsmt,class_index=[1])