def Bag_J48(data, rnm):
    # Bagged J48: a J48 tree wrapped in a FilteredClassifier (drops the first attribute),
    # then wrapped in Bagging with 10 iterations.
    data.class_is_last()
    fc1 = FilteredClassifier()
    fc1.classifier = Classifier(classname="weka.classifiers.trees.J48",
                                options=["-C", "0.25", "-M", "2"])
    fc1.filter = Filter(classname="weka.filters.unsupervised.attribute.Remove",
                        options=["-R", "first"])
    fc2 = SingleClassifierEnhancer(classname="weka.classifiers.meta.Bagging",
                                   options=["-P", "100", "-S", "1", "-I", "10"])
    fc2.classifier = fc1
    pred_output = PredictionOutput(
        classname="weka.classifiers.evaluation.output.prediction.CSV",
        options=["-p", "1"])
    folds = 10
    fc2.build_classifier(data)
    evaluation = Evaluation(data)
    evaluation.crossvalidate_model(fc2, data, folds, Random(1), pred_output)

    with open(rnm + '_Bag_J48_Tree.txt', 'w') as f0:
        print("Filename: ", rnm, file=f0)
        print('\n\n', file=f0)
        print(str(fc2), file=f0)
    with open(rnm + '_Bag_J48_Prediction.txt', 'w') as f1:
        print('Filename:', rnm, file=f1)
        print('Prediction Summary:', pred_output.buffer_content(), file=f1)
    with open(rnm + '_Bag_J48_Evaluation.txt', 'w') as f2:
        print('Filename:', rnm, file=f2)
        print('Evaluation Summary:', evaluation.summary(), file=f2)
        print('\n\n\n', file=f2)
        print(evaluation.class_details(), file=f2)

    plot_roc(evaluation, class_index=[0, 1], title=rnm, key_loc='best',
             outfile=rnm + '_Bag_J48_ROC.png', wait=False)
    value_Bag_J48 = str(evaluation.percent_correct)
    return value_Bag_J48
def RandomForest(data, rnm):
    # Random forest (100 trees) wrapped in a FilteredClassifier that drops the first attribute.
    data.class_is_last()
    fc = FilteredClassifier()
    fc.classifier = Classifier(classname="weka.classifiers.trees.RandomForest",
                               options=["-I", "100", "-K", "0", "-S", "1"])
    fc.filter = Filter(classname="weka.filters.unsupervised.attribute.Remove",
                       options=["-R", "first"])
    pred_output = PredictionOutput(
        classname="weka.classifiers.evaluation.output.prediction.CSV",
        options=["-p", "1"])
    folds = 10
    evl = Evaluation(data)
    evl.crossvalidate_model(fc, data, folds, Random(1), pred_output)
    fc.build_classifier(data)

    with open(rnm + '_RF_Tree.txt', 'w') as f0:
        print("Filename: ", rnm, file=f0)
        print('\n\n', file=f0)
        print(str(fc), file=f0)
    with open(rnm + '_RF_Prediction.txt', 'w') as f1:
        print('Filename:', rnm, file=f1)
        print('Prediction Summary:', pred_output.buffer_content(), file=f1)
    with open(rnm + '_RF_Evaluation.txt', 'w') as f2:
        print('Filename:', rnm, file=f2)
        print('Evaluation Summary:', evl.summary(), file=f2)
        print('\n\n\n', file=f2)
        print(evl.class_details(), file=f2)

    plot_roc(evl, class_index=[0, 1], title=rnm, key_loc='best',
             outfile=rnm + '_RF_ROC.png', wait=False)
    value_RF = str(evl.percent_correct)
    return value_RF
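# A minimal usage sketch for the two helpers above (not from the original source).
# The ARFF path "mydata.arff" is a placeholder; the imports are the python-weka-wrapper3
# classes the helpers rely on.
import weka.core.jvm as jvm
from weka.core.classes import Random
from weka.core.converters import Loader
from weka.classifiers import (Classifier, Evaluation, FilteredClassifier,
                              PredictionOutput, SingleClassifierEnhancer)
from weka.filters import Filter
from weka.plot.classifiers import plot_roc

jvm.start()
try:
    data = Loader(classname="weka.core.converters.ArffLoader").load_file("mydata.arff")
    print("Bagged J48 accuracy:", Bag_J48(data, "mydata"))
    print("Random forest accuracy:", RandomForest(data, "mydata"))
finally:
    jvm.stop()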
def fitness(toeval: Individual):
    # Evaluate one candidate: an MLP configured from the individual's settings,
    # wrapped in a FilteredClassifier and scored by 10-fold cross-validation.
    cls = Classifier(classname="weka.classifiers.functions.MultilayerPerceptron",
                     options=toeval.settings())
    fc = FilteredClassifier()
    fc.filter = remove
    fc.classifier = cls
    evl = Evaluation(data)
    evl.crossvalidate_model(fc, data, 10, Random(1))
    return evl.percent_correct
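# fitness() reads module-level names defined elsewhere in the original script:
# `data` (the loaded Instances), `remove` (a Filter), and the `Individual` class
# whose settings() yields MultilayerPerceptron options. A rough sketch of the
# assumed setup (file name and filter options are placeholders):
from weka.core.converters import Loader
from weka.filters import Filter

data = Loader(classname="weka.core.converters.ArffLoader").load_file("train.arff")
data.class_is_last()
remove = Filter(classname="weka.filters.unsupervised.attribute.Remove",
                options=["-R", "first"])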
def main():
    """
    Just runs some example code.
    """
    #case=["selected_chi100","selected_chi150","selected_chi200","selected_chi250","selected_chi300","selected_chi350","selected_fe100","selected_fe150","selected_fe200","selected_fe250","selected_fe300","selected_fe350","selected_sfm100","selected_sfm150","selected_sfm200","selected_sfm250","selected_sfm300","selected_sfm350","selected_sfmt100","selected_sfmt150","selected_sfmt200","selected_sfmt250","selected_sfmt300","selected_sfmt350","selected_cs100","selected_cs150","selected_cs200","selected_cs250","selected_cs300","selected_cs350"]
    #case=["feature_vector100","feature_vector110","feature_vector120","feature_vector130","feature_vector140","feature_vector150","feature_vector90","feature_vector80","feature_vector70","feature_vector60","feature_vector50","feature_vector40","feature_vector30","feature_vector20","feature_vector10","feature_vector5"]
    case = [
        "selected_chi100", "selected_chi150", "selected_chi200", "selected_chi250",
        "selected_chi300", "selected_chi350", "selected_fe100", "selected_fe150",
        "selected_fe200", "selected_fe250", "selected_fe300", "selected_fe350",
        "selected_sfm100", "selected_sfm150", "selected_sfm200", "selected_sfm250",
        "selected_sfm300", "selected_sfm350", "selected_sfmt100", "selected_sfmt150",
        "selected_sfmt200", "selected_sfmt250", "selected_sfmt300", "selected_sfmt350"
    ]
    #case=["selected_chi100","selected_chi150","selected_chi200","selected_fe100","selected_fe150","selected_fe200","selected_sfm100","selected_sfm150","selected_sfm200","selected_sfmt100","selected_sfmt150","selected_sfmt200","selected_cs100","selected_cs150","selected_cs200",]
    for nomefile in case:
        print(nomefile)
        feature_vector = "./selected_vectors30/" + nomefile + ".arff"
        # read the file in ARFF format
        loader = Loader("weka.core.converters.ArffLoader")
        data = loader.load_file(feature_vector)
        data.class_is_last()
        #print(data)
        f = open("./selected_vectors30/risultati/" + nomefile + ".txt", "w+")  # results file in txt
        # Excel header row
        intest = [
            "Correlation coefficient", "Mean absolute error", "Root mean squared error",
            "Relative absolute error", "Root relative squared error",
            "Total Number of Instances"
        ]
        workbook = xlsxwriter.Workbook(
            "./selected_vectors30/risultati/" + nomefile + ".xlsx")  # Excel file
        worksheet = workbook.add_worksheet()
        for col_num, dati in enumerate(intest):
            worksheet.write(0, col_num + 1, dati)
        riga = 1
        # list of algorithms to run
        #alg=["meta.Bagging","meta.RandomSubSpace","rules.M5Rules","trees.M5P","trees.RandomForest"]
        alg = [
            "bayes.NaiveBayes", "bayes.NaiveBayesUpdateable", "functions.Logistic",
            "functions.SGD", "functions.SimpleLogistic", "functions.SMO",
            "functions.VotedPerceptron", "meta.AdaBoostM1",
            "meta.AttributeSelectedClassifier", "meta.Bagging",
            "meta.ClassificationViaRegression", "meta.IterativeClassifierOptimizer",
            "meta.LogitBoost", "meta.RandomCommittee", "meta.RandomSubSpace",
            "rules.DecisionTable", "rules.JRip", "rules.OneR", "trees.DecisionStump",
            "trees.J48", "trees.RandomForest", "trees.REPTree"
        ]
        for row_num, dati in enumerate(alg):
            worksheet.write(row_num + 1, 0, dati)
        for i in alg:
            remove = Filter(classname="weka.filters.unsupervised.attribute.Remove")
            cls = Classifier(classname="weka.classifiers." + i)
            fc = FilteredClassifier()
            fc.filter = remove
            fc.classifier = cls
            evl = Evaluation(data)
            evl.crossvalidate_model(fc, data, 10, Random(1))  # 10-fold cross-validation
            #evl.evaluate_train_test_split(fc, data, 50, None, None)  # 50% train/test split
            k = evl.summary()
            # write results to the text and Excel files
            f.write(i + "\n")
            f.write(k + "\n")
            my_list = k.split('\n')
            for col_num, dati in enumerate(my_list):
                worksheet.write(riga, col_num, dati[-10:])
            print(i)
            riga += 1
        f.close()
        workbook.close()
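# The loop above scrapes numbers out of the textual summary (dati[-10:]). A hedged
# alternative sketch (not from the original): write the metrics directly from the
# Evaluation object using python-weka-wrapper3's properties. These particular metrics
# are only meaningful when they apply to the class attribute type of the data.
metrics = [
    evl.correlation_coefficient,
    evl.mean_absolute_error,
    evl.root_mean_squared_error,
    evl.relative_absolute_error,
    evl.root_relative_squared_error,
    evl.num_instances,
]
for col_num, value in enumerate(metrics):
    worksheet.write(riga, col_num + 1, value)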
print("Train/test/predict...") groups = ["DataSet1", "DataSet2"] # groups = ["DataSet2"] for group in groups: print(group) train = data_dir + os.sep + group + "_Cal.arff" test = data_dir + os.sep + group + "_Test.arff" pred = data_dir + os.sep + group + "_Val.arff" loader = Loader(classname="weka.core.converters.ArffLoader") print(train) train_data = loader.load_file(train) train_data.class_index = train_data.attribute_by_name("reference value").index print(test) test_data = loader.load_file(test) test_data.class_index = test_data.attribute_by_name("reference value").index print(pred) pred_data = loader.load_file(pred) pred_data.class_index = pred_data.attribute_by_name("reference value").index cls = FilteredClassifier() cls.classifier = Classifier(classname="weka.classifiers.functions.LinearRegression", options=["-S", "1", "-C"]) cls.filter = Filter(classname="weka.filters.unsupervised.attribute.Remove", options=["-R", "first"]) cls.build_classifier(train_data) evl = Evaluation(train_data) evl.test_model(cls, test_data) print(evl.summary()) jvm.stop()
curve = {}
for percentage in percentages:
    curve[percentage] = 0
curves[repetition] = curve
# run and add up percentage correct from repetition
for seed in range(repetition):
    seed += 1
    sys.stdout.write(".")
    for percentage in percentages:
        cls = Classifier(classname="weka.classifiers.trees.J48")
        flt = Filter(classname="weka.filters.unsupervised.instance.Resample",
                     options=["-Z", str(percentage), "-no-replacement"])
        fc = FilteredClassifier()
        fc.classifier = cls
        fc.filter = flt
        evl = Evaluation(data)
        evl.crossvalidate_model(fc, data, 10, Random(seed))
        curve[percentage] += (evl.percent_correct / repetition)
# progress info
sys.stdout.write("\n")

# output the results
if not plot.matplotlib_available:
    print("ZeroR: " + str(baseline))
    for repetition in repetitions:
        y = []
        for percentage in percentages:
            y.append(curves[repetition][percentage])
        print("Repetitions = " + str(repetition) + ":\n" + str(y))
else:
articles_collection = db.articles
article = articles_collection.find_one({"_id": article_id})

jvm.start(system_cp=True, packages=True, max_heap_size="512m")

# Train classifier
loader = Loader(classname="weka.core.converters.ArffLoader", options=["-charset", "UTF-8"])
train_data = loader.load_file(os.path.dirname(os.path.realpath(__file__)) + "/datasets/train.arff")
train_data.class_is_last()
string_to_word_vector_filter = Filter(classname="weka.filters.unsupervised.attribute.StringToWordVector")
cls = Classifier(classname="weka.classifiers.bayes.NaiveBayesMultinomial")
fc = FilteredClassifier()
fc.filter = string_to_word_vector_filter
fc.classifier = cls
fc.build_classifier(train_data)

# Create test data
class_att = Attribute.create_nominal("class", ["good", "neutral", "bad"])
str_att = Attribute.create_string("title")
test_dataset = Instances.create_instances(
    name="test_news_set",
    atts=[str_att, class_att],
    capacity=1
)
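# The snippet stops after creating an empty test dataset. A possible continuation
# (sketch, not from the original): fill one instance with the article title and
# classify it. Assumes the MongoDB document exposes the text as article["title"].
from weka.core.dataset import Instance

values = [test_dataset.attribute(0).add_string_value(article["title"]),
          Instance.missing_value()]
inst = Instance.create_instance(values)
test_dataset.add_instance(inst)
test_dataset.class_is_last()

dist = fc.distribution_for_instance(test_dataset.get_instance(0))
print("good/neutral/bad probabilities:", dist)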
fname = data_dir + os.sep + "ReutersGrain-test.arff"
print("\nLoading dataset: " + fname + "\n")
loader = Loader(classname="weka.core.converters.ArffLoader")
test = loader.load_file(fname)
test.class_is_last()

setups = (
    ("weka.classifiers.trees.J48", []),
    ("weka.classifiers.bayes.NaiveBayes", []),
    ("weka.classifiers.bayes.NaiveBayesMultinomial", []),
    ("weka.classifiers.bayes.NaiveBayesMultinomial", ["-C"]),
    ("weka.classifiers.bayes.NaiveBayesMultinomial",
     ["-C", "-L", "-stopwords-handler", "weka.core.stopwords.Rainbow"])
)

# build each classifier on the training data and evaluate it on the test set
for setup in setups:
    classifier, opt = setup
    print("\n--> %s (filter options: %s)\n" % (classifier, " ".join(opt)))
    cls = FilteredClassifier()
    cls.classifier = Classifier(classname=classifier)
    cls.filter = Filter(classname="weka.filters.unsupervised.attribute.StringToWordVector",
                        options=opt)
    cls.build_classifier(data)
    evl = Evaluation(test)
    evl.test_model(cls, test)
    print("Accuracy: %0.0f%%" % evl.percent_correct)
    tcdata = plc.generate_thresholdcurve_data(evl, 0)
    print("AUC: %0.3f" % plc.get_auc(tcdata))
    print(evl.matrix("Matrix:"))

jvm.stop()
test = converters.load_any_file("imbalanced_test.arff")
train.class_is_last()
test.class_is_last()

# minority class gets oversampled 5x (500% additional synthetic instances)
smote = Filter(classname="weka.filters.supervised.instance.SMOTE", options=["-P", "500.0"])

# base classifier
cls = Classifier(classname="weka.classifiers.trees.LMT", options=["-B", "-I", "10"])

# filtered classifier: SMOTE is applied only to the training folds
fc = FilteredClassifier()
fc.filter = smote
fc.classifier = cls

# 5-fold cross-validation
evl = Evaluation(train)
evl.crossvalidate_model(fc, train, 5, Random(1))

# print confusion matrix along with other summary statistics
print("LMT (SMOTE balanced classes) CV = 5 Error: %.2f%%" % evl.percent_incorrect)
print(evl.matrix())  # confusion matrix

# plot ROC
plcls.plot_roc(evl, class_index=[0, 1], wait=True)
# extra summary
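# SMOTE ships as a separate Weka package rather than with core Weka, so the JVM must
# be started with package support and the package installed once beforehand. A sketch
# of that one-time setup using python-weka-wrapper3's packages module (not from the
# original script):
import weka.core.jvm as jvm
import weka.core.packages as packages

jvm.start(packages=True)
if not packages.is_installed("SMOTE"):
    packages.install_package("SMOTE")
    print("SMOTE installed; restart the script so the package is picked up")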