def main(args): """ Trains a J48 classifier on a training set and outputs the predicted class and class distribution alongside the actual class from a test set. Class attribute is assumed to be the last attribute. :param args: the commandline arguments (train and test datasets) :type args: list """ # load a dataset helper.print_info("Loading train: " + args[1]) loader = Loader(classname="weka.core.converters.ArffLoader") train = loader.load_file(args[1]) train.class_index = train.num_attributes - 1 helper.print_info("Loading test: " + args[2]) test = loader.load_file(args[2]) test.class_is_last() # classifier cls = Classifier(classname="weka.classifiers.trees.J48") cls.build_classifier(train) # output predictions print("# - actual - predicted - error - distribution") for index, inst in enumerate(test): pred = cls.classify_instance(inst) dist = cls.distribution_for_instance(inst) print( "%d - %s - %s - %s - %s" % (index+1, inst.get_string_value(inst.class_index), inst.class_attribute.value(int(pred)), "yes" if pred != inst.get_value(inst.class_index) else "no", str(dist.tolist())))
def main(args): """ Loads a dataset, shuffles it, splits it into train/test set. Trains J48 with training set and evaluates the built model on the test set. :param args: the commandline arguments (optional, can be dataset filename) :type args: list """ # load a dataset if len(args) <= 1: data_file = helper.get_data_dir() + os.sep + "vote.arff" else: data_file = args[1] helper.print_info("Loading dataset: " + data_file) loader = Loader(classname="weka.core.converters.ArffLoader") data = loader.load_file(data_file) data.class_is_last() # generate train/test split of randomized data train, test = data.train_test_split(66.0, Random(1)) # build classifier cls = Classifier(classname="weka.classifiers.trees.J48") cls.build_classifier(train) print(cls) # evaluate evl = Evaluation(train) evl.test_model(cls, test) print(evl.summary())
def main(args): """ Trains a NaiveBayesUpdateable classifier incrementally on a dataset. The dataset can be supplied as parameter. :param args: the commandline arguments :type args: list """ # load a dataset if len(args) <= 1: data_file = helper.get_data_dir() + os.sep + "vote.arff" else: data_file = args[1] helper.print_info("Loading dataset: " + data_file) loader = Loader(classname="weka.core.converters.ArffLoader") data = loader.load_file(data_file, incremental=True) data.class_is_last() # classifier nb = Classifier(classname="weka.classifiers.bayes.NaiveBayesUpdateable") nb.build_classifier(data) # train incrementally for inst in loader: nb.update_classifier(inst) print(nb)
def main(): """ Just runs some example code. """ # load a dataset iris_file = helper.get_data_dir() + os.sep + "iris.arff" helper.print_info("Loading dataset: " + iris_file) loader = Loader("weka.core.converters.ArffLoader") iris_data = loader.load_file(iris_file) iris_data.class_is_last() # train classifier classifier = Classifier("weka.classifiers.trees.J48") classifier.build_classifier(iris_data) # save and read object helper.print_title("I/O: single object") outfile = tempfile.gettempdir() + os.sep + "j48.model" serialization.write(outfile, classifier) model = Classifier(jobject=serialization.read(outfile)) print(model) # save classifier and dataset header (multiple objects) helper.print_title("I/O: single object") serialization.write_all(outfile, [classifier, Instances.template_instances(iris_data)]) objects = serialization.read_all(outfile) for i, obj in enumerate(objects): helper.print_info("Object #" + str(i+1) + ":") if javabridge.get_env().is_instance_of(obj, javabridge.get_env().find_class("weka/core/Instances")): obj = Instances(jobject=obj) elif javabridge.get_env().is_instance_of(obj, javabridge.get_env().find_class("weka/classifiers/Classifier")): obj = Classifier(jobject=obj) print(obj)
class python_weka(object):
    def __init__(self, input_x, input_y, labels):
        self.input_x = input_x
        self.input_y = input_y
        self.labels = labels

    def write_arff(self, filename, relation, train_or_predict, input_x, input_y=None):
        # NOTE: the original reused train_or_predict as a loop counter, which
        # skipped attribute declarations and then always wrote a dummy class
        # value; the flag is kept read-only here (0 = train, 1 = predict) and
        # values are comma-separated as the ARFF format requires
        with open(filename, "w") as f:
            f.write("@relation " + relation + "\n")
            for name in self.labels:
                f.write("@attribute " + name + " " + self.labels[name] + "\n")
            f.write("\n")
            f.write("@data" + "\n")
            for i in range(len(input_x)):
                row = [str(j) for j in input_x[i]]
                if train_or_predict == 0:
                    row.append(str(input_y[i]))
                else:
                    row.append(str(0))  # placeholder class for prediction
                f.write(",".join(row) + "\n")

    def train(self):
        filename = "train.arff"
        self.write_arff(filename, "train", 0, self.input_x, self.input_y)
        loader = Loader(classname="weka.core.converters.ArffLoader")
        data = loader.load_file(filename)
        data.class_is_last()
        self.cls = Classifier(classname="weka.classifiers.meta.Bagging", options=["-S", "5"])
        self.cls.build_classifier(data)
        os.remove(filename)

    def predict(self, test_data):
        filename = "test.arff"
        # pass 1 so a placeholder class value is written (the original passed
        # 0, which would have required labels for unlabelled test data)
        self.write_arff(filename, "test", 1, test_data)
        loader = Loader(classname="weka.core.converters.ArffLoader")
        data = loader.load_file(filename)
        data.class_is_last()
        # evl = Evaluation(data)
        # evl.evaluate_model(self.cls, data)
        result = []
        for index, inst in enumerate(data):
            pred = self.cls.classify_instance(inst)
            dist = self.cls.distribution_for_instance(inst)
            result.append(dist[0])
            # print(str(index+1) + ": label index=" + str(pred) + ", class distribution=" + str(dist))
        os.remove(filename)
        return result
def playback_speed_checker(inputFile, dirRef):
    TRAINING_ARFF = 'dataset_playback.arff'
    inputRef = ""

    # Start JVM once, with classpath/packages and heap size set together (the
    # original called jvm.start() three times, but only the first call takes
    # effect)
    jvm.start(system_cp=True, packages=True, max_heap_size="512m")

    # Find reference file
    for file in os.listdir(dirRef):
        if str(file).find(str(os.path.basename(inputFile))) != -1:
            inputRef = os.path.join(dirRef, file)
            break

    # Calculate distance
    (result, distance) = dtw_checker(inputFile, inputRef)

    # Load data
    loader = Loader(classname="weka.core.converters.ArffLoader")
    data = loader.load_file(TRAINING_ARFF)
    data.class_is_last()  # set class attribute

    # Train the classifier
    # cls = Classifier(classname="weka.classifiers.functions.SMO")
    cls = Classifier(classname="weka.classifiers.trees.J48", options=["-C", "0.3", "-M", "10"])
    cls.build_classifier(data)

    # Build the instance to classify; create_instance expects the attribute
    # values as a sequence (numpy.ndarray(distance) would allocate an
    # uninitialized array with shape `distance`) -- a single-feature layout is
    # assumed here
    speed_instance = Instance.create_instance([distance], classname='weka.core.DenseInstance', weight=1.0)
    speed_instance.dataset = data

    # Classify instance
    speed_flag = cls.classify_instance(speed_instance)

    if distance == 0:
        speed_class = 'nominal'
    elif speed_flag == 0:  # the original repeated `if speed_flag == 0`, so 'up_speed' was unreachable
        speed_class = 'down_speed'
    else:
        speed_class = 'up_speed'

    # print(os.path.basename(inputFile) + ' --- ' + speed_class)

    # Stop JVM
    jvm.stop()

    print("SPEED IS: " + speed_class)
    return speed_class
def main(): """ Just runs some example code. """ # load a dataset bodyfat_file = helper.get_data_dir() + os.sep + "bodyfat.arff" helper.print_info("Loading dataset: " + bodyfat_file) loader = Loader("weka.core.converters.ArffLoader") bodyfat_data = loader.load_file(bodyfat_file) bodyfat_data.class_is_last() # classifier help helper.print_title("Creating help string") classifier = Classifier(classname="weka.classifiers.trees.M5P") classifier.build_classifier(bodyfat_data) print(classifier)
def train_model(self, training_data):
    model_weka = None
    if os.path.isfile(self.model_file):
        print('Model ' + self.name + ' already trained.')
    else:
        print('Starting to train model ' + self.name + '.')
        model_weka = Classifier(classname=self.classname, options=self.options)
        model_weka.build_classifier(data=training_data)
        serialization.write(filename=self.model_file, jobject=model_weka)
        print('Model ' + self.name + ' trained and saved.')
    if os.path.isfile(self.parameter_file):
        print('Parameters of the model ' + self.name + ' already saved.')
    else:
        if model_weka is None:
            model_weka = Classifier(jobject=serialization.read(self.model_file))
        save_file(file_name=self.parameter_file, content=str(model_weka))
        print('Parameters of the model ' + self.name + ' saved.')
def riaa_checker(inputFile):
    # raw string: the original plain string would break on escape sequences
    # such as \U in Python 3
    TRAINING_ARFF = r'C:\Users\ASUS\Desktop\IGNASI\SMC\Workspace\dataset_riaa.arff'

    # Start JVM once, with classpath/packages and heap size set together (the
    # original called jvm.start() three times, but only the first call takes
    # effect)
    jvm.start(system_cp=True, packages=True, max_heap_size="512m")

    # Calculate bark bands information
    (absolute_bark, relative_bark, bark_ratios) = compute_bark_spectrum(inputFile)

    # Load data
    loader = Loader(classname="weka.core.converters.ArffLoader")
    data = loader.load_file(TRAINING_ARFF)
    data.class_is_last()  # set class attribute

    # Train the classifier
    cls = Classifier(classname="weka.classifiers.functions.SMO")
    # cls = Classifier(classname="weka.classifiers.trees.J48", options=["-C", "0.3", "-M", "10"])
    cls.build_classifier(data)

    # Classify instance
    bark_instance = Instance.create_instance(bark_ratios, classname='weka.core.DenseInstance', weight=1.0)
    bark_instance.dataset = data
    riaa_flag = cls.classify_instance(bark_instance)

    if riaa_flag == 0:
        riaa_class = 'riaa_ok'
    else:
        riaa_class = 'riaa_ko'

    # print(os.path.basename(inputFile) + ' --- ' + riaa_class)

    # Stop JVM
    jvm.stop()

    print("RIAA FILTERING?: " + riaa_class)
    return riaa_class
def getDecisionTree(self, inputPath):
    # load arff
    data = self.load_Arff(inputPath)

    # set class attribute; python-weka-wrapper exposes these as properties,
    # not Java-style methods, and setting it once is enough (the original
    # called set_class_index(...) twice)
    data.class_is_last()
    classifier = Classifier(classname="weka.classifiers.trees.J48", options=["-C", "0.3"])
    classifier.build_classifier(data)
    classifierStr = str(classifier)

    for index in range(0, data.num_instances):
        instance = data.get_instance(index)
        # print(instance)
        result = classifier.distribution_for_instance(instance)
        # print(result)

    graph = classifier.graph  # property, not a method
    return graph
def classify(train, test, name="RF", tuning=False): jvm.start() if isinstance(train, list) and isinstance(test, list): train = weka_instance(train) trn_data = converters.load_any_file(train) test = weka_instance(test) tst_data = converters.load_any_file(test) elif os.path.isfile(train) and os.path.isfile(test): trn_data = converters.load_any_file(train) tst_data = converters.load_any_file(test) else: trn = csv_as_ndarray(train) tst = csv_as_ndarray(test) trn_data = converters.ndarray_to_instances(trn, relation="Train") tst_data = converters.ndarray_to_instances(tst, relation="Test") trn_data.class_is_last() tst_data.class_is_last() # t = time() if tuning: opt = tune(train) else: opt = default_opt # print("Time to tune: {} seconds".format(time() - t)) cls = Classifier(classname=classifiers[name.lower()], options=opt) cls.build_classifier(trn_data) distr = [cls.distribution_for_instance(inst)[1] for inst in tst_data] preds = [cls.classify_instance(inst) for inst in tst_data] jvm.stop() return preds, distr
def run_classifier(path, prot, sel, cols, prot_vals, beta):
    DIs = dict()
    jvm.start()

    for i in range(len(cols) - 1):
        loader = Loader(classname="weka.core.converters.CSVLoader")
        data = loader.load_file(path)

        # remove selected attribute from the data
        # NOTE: options are ONE indexed, not ZERO indexed
        remove = Filter(classname="weka.filters.unsupervised.attribute.Remove",
                        options=["-R", str(sel[2] + 1)])
        remove.inputformat(data)
        data = remove.filter(data)

        # if running for only one attribute, remove all others (except protected)
        if i > 0:
            for j in range(1, prot[2] + 1):
                if i != j:
                    remove = Filter(classname="weka.filters.unsupervised.attribute.Remove",
                                    options=["-R", ("1" if i > j else "2")])
                    remove.inputformat(data)
                    data = remove.filter(data)

        # set prot attribute as Class attribute
        data.class_is_last()

        # run classifier
        cls = Classifier(classname="weka.classifiers.bayes.NaiveBayes")
        cls.build_classifier(data)

        # count the number of each combination
        pos_and_pred = 0.0
        pos_and_not_pred = 0.0
        neg_and_pred = 0.0
        neg_and_not_pred = 0.0
        for ind, inst in enumerate(data):
            if cls.classify_instance(inst):
                if prot_vals[ind] == prot[1]:
                    pos_and_pred += 1
                else:
                    neg_and_pred += 1
            else:
                if prot_vals[ind] == prot[1]:
                    pos_and_not_pred += 1
                else:
                    neg_and_not_pred += 1

        # calculate DI
        BER = ((pos_and_not_pred / (pos_and_pred + pos_and_not_pred)) +
               (neg_and_pred / (neg_and_pred + neg_and_not_pred))) * 0.5
        if BER > 0.5:
            BER = 1 - BER
        DI = 1 - ((1 - 2 * BER) / (beta + 1 - 2 * BER))

        if i == 0:  # consider changing this to a 'code word' instead of 'all'
            DIs["all"] = DI
        else:
            DIs[cols[i - 1]] = DI

    jvm.stop()

    return DIs
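# The BER/DI arithmetic above is easy to sanity-check on toy counts; all
# numbers below are made up:
def _di_toy_check():
    pos_and_pred, pos_and_not_pred = 40.0, 10.0
    neg_and_pred, neg_and_not_pred = 20.0, 30.0
    beta = 0.1
    BER = ((pos_and_not_pred / (pos_and_pred + pos_and_not_pred)) +
           (neg_and_pred / (neg_and_pred + neg_and_not_pred))) * 0.5  # (0.2 + 0.4) / 2 = 0.3
    if BER > 0.5:
        BER = 1 - BER
    DI = 1 - ((1 - 2 * BER) / (beta + 1 - 2 * BER))  # 1 - 0.4/0.5 = 0.2
    print(BER, DI)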
def random_forest(train_data):
    cls = Classifier(classname="weka.classifiers.trees.RandomForest")
    cls.build_classifier(train_data)
    return cls
from weka.core.converters import Loader
from weka.core.classes import Random
from weka.classifiers import Classifier, Evaluation

jvm.start()

# load diabetes
loader = Loader(classname="weka.core.converters.ArffLoader")
fname = data_dir + os.sep + "diabetes.arff"
print("\nLoading dataset: " + fname + "\n")
data = loader.load_file(fname)
data.class_is_last()

# determine baseline with ZeroR
zeror = Classifier(classname="weka.classifiers.rules.ZeroR")
zeror.build_classifier(data)
evl = Evaluation(data)
evl.test_model(zeror, data)
print("Baseline accuracy (ZeroR): %0.1f%%" % evl.percent_correct)

print("\nHoldout 10%...")
# use seed 1-10 and perform random split with 90%
perc = []
for i in range(1, 11):  # xrange is Python 2 only
    evl = Evaluation(data)
    evl.evaluate_train_test_split(
        Classifier(classname="weka.classifiers.trees.J48"), data, 90.0, Random(i))
    perc.append(round(evl.percent_correct, 1))
    print("Accuracy with seed %i: %0.1f%%" % (i, evl.percent_correct))
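# The per-seed accuracies collected in `perc` above summarize naturally with
# the standard library; a short sketch:
import statistics

print("mean accuracy: %0.1f%%" % statistics.mean(perc))
print("stdev: %0.2f" % statistics.stdev(perc))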
jvm.start()

loader = Loader(classname="weka.core.converters.CSVLoader")
data = loader.load_file("data_train.csv")
data.class_is_last()

knn_classifier = Classifier(classname="weka.classifiers.lazy.IBk", options=["-K", "3"])
lin_classifier = Classifier(classname="weka.classifiers.functions.LinearRegression", options=["-S", "0"])
svm_classifier = Classifier(classname="weka.classifiers.functions.SMOreg", options=["-C", "1.0"])

knn_classifier.build_classifier(data)
lin_classifier.build_classifier(data)
svm_classifier.build_classifier(data)

classifiers = [knn_classifier, lin_classifier, svm_classifier]

print("###################### Classifiers ######################")
for classifier in classifiers:
    print("~~~~~~~~~~~~~~~~~~~")
    print(classifier)

classifier_names = [
    "KNN Classifier", "LinearRegression Classifier", "SVM Classifier"
]

documents = get_docs(sys.argv[1])
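# Since all three models above are regressors on this data, a hedged sketch
# comparing them with 10-fold cross-validation (assumes Evaluation and Random
# are imported as in the other snippets):
from weka.classifiers import Evaluation
from weka.core.classes import Random

for name, clf in zip(classifier_names, classifiers):
    evl = Evaluation(data)
    evl.crossvalidate_model(clf, data, 10, Random(1))
    print("%s: correlation coefficient = %.3f" % (name, evl.correlation_coefficient))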
jvm.start()

# load diabetes
loader = Loader(classname="weka.core.converters.ArffLoader")
fname = data_dir + os.sep + "diabetes.arff"
print("\nLoading dataset: " + fname + "\n")
data = loader.load_file(fname)
data.class_is_last()

for classifier in ["weka.classifiers.bayes.NaiveBayes",
                   "weka.classifiers.rules.ZeroR",
                   "weka.classifiers.trees.J48"]:
    # train/test split 90% using classifier
    cls = Classifier(classname=classifier)
    evl = Evaluation(data)
    evl.evaluate_train_test_split(cls, data, 90.0, Random(1))
    print("\n" + classifier + " train/test split (90%):\n" + evl.summary())
    cls.build_classifier(data)
    print(classifier + " model:\n\n" + str(cls))

# calculate mean/stdev over 10 cross-validations
for classifier in ["weka.classifiers.meta.ClassificationViaRegression",
                   "weka.classifiers.bayes.NaiveBayes",
                   "weka.classifiers.rules.ZeroR",
                   "weka.classifiers.trees.J48",
                   "weka.classifiers.functions.Logistic"]:
    accuracy = []
    for i in range(1, 11):  # xrange is Python 2 only
        cls = Classifier(classname=classifier)
        evl = Evaluation(data)
        evl.crossvalidate_model(cls, data, 10, Random(i))
        accuracy.append(evl.percent_correct)
    nacc = numpy.array(accuracy)
    print("%s: %0.2f +/-%0.2f" % (classifier, numpy.mean(nacc), numpy.std(nacc)))
def bayes_net(train_data):
    train_data.class_is_last()
    cls = Classifier(classname="weka.classifiers.bayes.BayesNet")
    cls.build_classifier(train_data)
    return cls
def bagging(train_data):
    train_data.class_is_last()
    cls = Classifier(classname="weka.classifiers.meta.Bagging")
    cls.build_classifier(train_data)
    return cls
def classify(fileToClassify, fileToCompare, predictionYear=None, pastResultYears=None, prefix="NFL", classifierFunction=[ "LinearRegression", ["-S", "0", "-R", "1.0E-8", "-num-decimal-places", "4"] ]): # Start Java VM jvm.start(max_heap_size="1024m") # Load CSV files into weka loader loader = Loader(classname="weka.core.converters.CSVLoader") fileToClassifyData = loader.load_file(fileToClassify) fileToClassifyData.class_is_last() fileToCompareData = loader.load_file(fileToCompare) fileToCompareData.class_is_last() predictionYear = "".join(map(str, predictionYear)) pastResultYears = "-".join(map(str, pastResultYears)) # Generate Classifier based on data classifier = Classifier(classname="weka.classifiers.functions.{}".format( classifierFunction[0]), options=classifierFunction[1]) classifier.build_classifier(fileToClassifyData) print(classifier) # Var builder for graph count = 0.0 countPred = 0.0 graphDetails = [ ['TITLE'], [ '{1} Data Ratings (Official) {0}'.format(pastResultYears, prefix), [], [] ], [ '{1} Data Ratings (Predicted) {0}'.format(predictionYear, prefix), [], [] ] ] # Time to predict results based on classifier for index, inst in enumerate(fileToCompareData): pred = classifier.classify_instance(inst) temp = list(enumerate(inst))[-1][1] countPred += pred count += temp # index=list(enumerate(inst))[3+1][1] index += 1 print('YOLO', list(enumerate(inst))[3][1]) print("{0:.3f} accurate compared to results.".format(countPred / count)) dist = classifier.distribution_for_instance(inst) # NFL Results graphDetails[1][1].append(index) graphDetails[1][2].append(temp) # Predicted Results graphDetails[2][1].append(index) graphDetails[2][2].append(pred) print( str(index + 1) + ": label index=" + str(pred) + ", class distribution=" + str(dist) + " , original: " + str(temp)) graphDetails[0][ 0] = 'Player Rating Predictions For {0} ({1:.3f} Accurate)'.format( predictionYear, 100 - (countPred / count)) jvm.stop() BuildGraph(graphDetails)
def classifyTest(fileToClassify, fileToCompare, predictionYear=None, pastResultYears=None, classifier=None): # Start Java VM jvm.start(max_heap_size="1024m") # Load CSV files into weka loader loader = Loader(classname="weka.core.converters.CSVLoader") fileToClassifyData = loader.load_file(fileToClassify) fileToClassifyData.class_is_last() fileToCompareData = loader.load_file(fileToCompare) fileToCompareData.class_is_last() # Generate Classifier based on data classifier = Classifier( classname="weka.classifiers.functions.MultilayerPerceptron", options=[ "-L", "0.3", "-M", "0.2", "-N", "500", "-V", "0", "-S", "0", "-E", "20", "-H", "a" ]) classifier.build_classifier(fileToClassifyData) print(classifier) # Var builder for graph count = 0.0 countPred = 0.0 graphDetails = [ ['TITLE'], ['NFL Data Ratings (Official) {0}'.format(pastResultYears), [], []], ['NFL Data Ratings (Predicted) {0}'.format(predictionYear), [], []] ] # Time to predict results based on classifier for index, inst in enumerate(fileToCompareData): pred = classifier.classify_instance(inst) temp = list(enumerate(inst))[-1][1] countPred += pred count += temp # index=list(enumerate(inst))[3+1][1] index += 1 print('YOLO', list(enumerate(inst))[3][1]) print("{0:.3f} accurate compared to results.".format(countPred / count)) dist = classifier.distribution_for_instance(inst) # NFL Results graphDetails[1][1].append(index) graphDetails[1][2].append(temp) # Predicted Results graphDetails[2][1].append(index) graphDetails[2][2].append(pred) print( str(index + 1) + ": label index=" + str(pred) + ", class distribution=" + str(dist) + " , original: " + str(temp)) graphDetails[0][ 0] = 'Player Rating Predictions For {0} ({1:.3f} Accurate)'.format( predictionYear, 100 - (countPred / count)) jvm.stop() print(graphDetails) BuildGraph(graphDetails)
smote_test_data.class_is_first() # load logistic model tree algorithm log_tree = Classifier(classname="weka.classifiers.trees.LMT") eval_smote_test_obj = Evaluation(smote_test_data) eval_smote_test_obj.crossvalidate_model(classifier=log_tree, data=smote_test_data, num_folds=5, rnd=Random(1)) print("SMOTE Test CV (5-folds) Error = %.2f%%" % (eval_smote_test_obj.percent_incorrect)) print(eval_smote_test_obj.matrix()) print("=================\"Summary\"====================") print(eval_smote_test_obj.summary()) log_tree.build_classifier(smote_test_data) y_predict = eval_smote_test_obj.test_model(log_tree, smote_test_data) y_test = to_binary_numeric(y_test.head(500), classNeg="neg") falsePositiveRate, truePositiveRate, thresholds = roc_curve( y_test, y_predict) # compute Area Under the Curve (AUC) using the trapezoidal rule area = auc(falsePositiveRate, truePositiveRate) plt.plot(falsePositiveRate, truePositiveRate, color='red', label='ROC = ' + str(area)) plt.plot([0, 1], [0, 1], linestyle='dotted') plt.xlabel('False Positive Rate')
def predict(self, to_test_file: str, trained_file: str):
    if to_test_file == "" or trained_file == "":
        raise Exception("Please fill all the fields")
    # if
    # both files must exist (the original used `or`, which only failed when
    # neither file existed)
    if not (os.path.isfile(to_test_file) and os.path.isfile(trained_file)):
        raise Exception("The file to test and the trained file must be paths to existing files")
    # if
    if not (to_test_file.endswith(".arff") and trained_file.endswith(".arff")):
        raise Exception("The file to test and the trained one must be ARFF files")
    # if

    # Checking file headers, they must be the same to do predictions; the
    # loops also stop at end of file, so a missing @data line no longer hangs
    trained_header = ""
    to_test_header = ""
    try:
        with open(trained_file, "r") as tf:
            for line in tf:
                if "@data" in line:
                    break
                trained_header += line
        # with
        with open(to_test_file, "r") as tt:
            for line in tt:
                if "@data" in line:
                    break
                to_test_header += line
        # with
    except OSError:
        raise Exception("Error opening the reference arff file")
    # except

    if not trained_header == to_test_header:
        raise Exception("Files header must be the same")
    # if

    # Loading trained and test data from arff files
    loader = Loader(classname="weka.core.converters.ArffLoader")
    trained_data = loader.load_file(trained_file)
    trained_data.class_index = trained_data.num_attributes - 1
    to_test_data = loader.load_file(to_test_file)
    to_test_data.class_is_last()

    # Building classifier from trained data
    cls = Classifier(classname="weka.classifiers.trees.J48", options=["-C", "0.25", "-M", "2"])
    cls.build_classifier(trained_data)

    # Evaluating predictions
    for i, inst in enumerate(to_test_data):
        pred = cls.classify_instance(inst)
        # dist = cls.distribution_for_instance(inst)
        row = [int(i + 1), int(pred + 1)]
        self._predicted.append(row)
# output arff files processDataToArff("train.arff", False) processDataToArff("test.arff", True) # setup training model loader = Loader(classname="weka.core.converters.ArffLoader") train = loader.load_file("train.arff") train.class_is_last() test = loader.load_file("test.arff") test.class_is_last() # print(train) cls = Classifier( classname="weka.classifiers.trees.LMT") #use LMT as our algorithm cls.build_classifier(train) #train the model using train.arff pout = PredictionOutput( classname="weka.classifiers.evaluation.output.prediction.PlainText") evl = Evaluation(train) evl.test_model(cls, test, pout) # print the result result = pout.buffer_content() #print(result) # split the result and only print the gesture resultLines = result.splitlines() for i in range(len(resultLines)): if (resultLines[i].find("upDown") != -1): print("%d upDown" % (i + 1))
# %%
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()
le.fit(y)
y = le.transform(y)

# %%
nominaldata = nominal.filter(dataset)
nominaldata.class_is_last()

# %%
from weka.classifiers import Classifier, Evaluation
from weka.core.classes import Random

cls = Classifier(classname="weka.classifiers.trees.J48", options=["-C", "0.3"])
cls.build_classifier(nominaldata)
print(cls)

import weka.plot.graph as graph  # NB: pygraphviz and PIL are required

graph.plot_dot_graph(cls.graph)

evaluation = Evaluation(nominaldata)
evaluation.crossvalidate_model(cls, nominaldata, 10, Random(42))  # 10-fold CV
print(evaluation.summary())
print("pctCorrect: " + str(evaluation.percent_correct))
print("incorrect: " + str(evaluation.incorrect))

# %%
jvm.stop()

# %%
class SklearnWekaWrapper(object):

    def __init__(self, class_name, options=None):
        if options is not None:
            self._classifier = Classifier(classname=class_name, options=[option for option in options.split()])
        else:
            self._classifier = Classifier(classname=class_name)

    def fit(self, training_set, ground_truth):
        self.ground_truth = ground_truth
        training_set = self._sklearn2weka(training_set, self.ground_truth)
        training_set.class_is_last()
        self._classifier.build_classifier(training_set)

    def predict(self, testing_set):
        testing_set = self._sklearn2weka(testing_set, self.ground_truth)
        testing_set.class_is_last()
        preds = []
        for index, inst in enumerate(testing_set):
            pred = self._classifier.classify_instance(inst)
            preds.append(pred)
        preds = np.vectorize(self._dict.get)(preds)
        return np.array(preds)

    def predict_proba(self, testing_set):
        testing_set = self._sklearn2weka(testing_set, self.ground_truth)
        testing_set.class_is_last()
        dists = []
        for index, inst in enumerate(testing_set):
            dist = self._classifier.distribution_for_instance(inst)
            dists.append(dist)
        return np.array(dists)

    def _sklearn2weka(self, features, labels=None):
        # OrdinalEncoder replaces the short-lived CategoricalEncoder, which
        # never made it into a scikit-learn release
        encoder = OrdinalEncoder()
        labels_nominal = encoder.fit_transform(np.array(labels).reshape(-1, 1))
        # probe the attribute that is actually set (_dict); the original
        # checked hasattr(self, 'dict') and so rebuilt the mapping every call
        if not hasattr(self, '_dict') and labels is not None:
            mapping = {}
            for label, nominal in zip(labels, labels_nominal):
                if nominal.item(0) not in mapping:
                    mapping[nominal.item(0)] = label
            self._dict = mapping
        labels_column = np.reshape(labels_nominal, [labels_nominal.shape[0], 1])
        weka_dataset = ndarray_to_instances(np.ascontiguousarray(features, dtype=np.float_), 'weka_dataset')
        weka_dataset.insert_attribute(
            Attribute.create_nominal('tag', [str(float(i)) for i in range(len(self._dict))]),
            features.shape[1])
        if labels is not None:
            for index, inst in enumerate(weka_dataset):
                inst.set_value(features.shape[1], labels_column[index])
                weka_dataset.set_instance(index, inst)
        return weka_dataset
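# A usage sketch for the wrapper above, with hypothetical numpy arrays
# standing in for real features and labels (assumes a running JVM and the
# imports the class itself relies on):
import numpy as np

X_train = np.random.rand(100, 4)
y_train = np.random.choice(['a', 'b'], size=100)
X_test = np.random.rand(20, 4)

clf = SklearnWekaWrapper('weka.classifiers.trees.J48', options='-C 0.25')
clf.fit(X_train, y_train)
print(clf.predict(X_test))
print(clf.predict_proba(X_test))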
def ClassifyParam(website, mode, binWidths, truncation_modes=["full", "truncated"]):
    if not os.path.exists("classificationResults"):
        os.makedirs("classificationResults")

    if "normal" in mode:
        for truncation in truncation_modes:
            # renamed from `file` to avoid shadowing the builtin
            results_file = open("classificationResults/SingleWebsite_%s_%s.csv" % (truncation, website), "w")
            results_file.write("BinWidth, Accuracy, FalsePositiveRate, FalseNegativeRate\n")

            for binWidth in binWidths:
                train_set_file = "TrainSet_%s_%s.arff" % (truncation, binWidth)
                train_set = "Data/%s/arff/%s" % (website, train_set_file)
                test_set = "Data/%s/arff/%s" % (website, train_set_file.replace("TrainSet", "TestSet"))

                print("Loading Datasets...")
                print("Train: " + train_set)
                train_data = converters.load_any_file(train_set)
                print("Test: " + test_set)
                test_data = converters.load_any_file(test_set)

                # Set class attribute
                train_data.class_is_last()
                test_data.class_is_last()
                print("Dataset Loaded!")

                classifier_name = "weka.classifiers.meta.FilteredClassifier"
                classifier = Classifier(classname=classifier_name, options=[
                    "-F", "weka.filters.unsupervised.attribute.StringToWordVector -R first-last -W 1000 -C -T -N 1 -stemmer weka.core.stemmers.NullStemmer -M 1 -tokenizer \"weka.core.tokenizers.WordTokenizer -delimiters \\\" \\\\r\\\\n\\\\t.,;:\\\\\\\'\\\\\\\"()?!\\\"\"",
                    "-W", "weka.classifiers.bayes.NaiveBayesMultinomial"])

                start_train = time.time()
                classifier.build_classifier(train_data)
                end_train = time.time()
                print("Train\t%s\t%s" % (binWidth, end_train - start_train))

                for index, inst in enumerate(test_data):
                    if index == 0:
                        start_sample = time.time()
                        classifier.classify_instance(inst)
                        end_sample = time.time()
                        print("Sample\t%s\t%s" % (binWidth, end_sample - start_sample))

                print("Evaluating w/ Multinomial Naive Bayes classifier. BinWidth = %s" % binWidth)
                evaluation = Evaluation(test_data)
                start_batch = time.time()
                evaluation.test_model(classifier, test_data)
                end_batch = time.time()
                print("Batch\t%s\t%s" % (binWidth, end_batch - start_batch))

                print(evaluation.summary())
                print(evaluation.matrix())

                # Just as an example, we're measuring the fpr and fnr of the website indexed as class 1
                tp = evaluation.num_true_positives(1)
                tn = evaluation.num_true_negatives(1)
                fp = evaluation.num_false_positives(1)
                fn = evaluation.num_false_negatives(1)
                acc = (tp + tn) / float(tp + tn + fp + fn)
                fpr = evaluation.false_positive_rate(1)
                fnr = evaluation.false_negative_rate(1)
                print("Accuracy: %s" % acc)
                print("False Positive Rate: %s" % fpr)
                print("False Negative Rate: %s" % fnr)
                results_file.write("%s, %s, %s, %s\n" % (binWidth, acc, fpr, fnr))

            results_file.close()
class SklearnWekaWrapper(object):

    def __init__(self, classifier_name):
        if classifier_name == 'wrf':
            class_name = 'weka.classifiers.trees.RandomForest'
            options = None
        elif classifier_name == 'wj48':
            class_name = 'weka.classifiers.trees.J48'
            options = None
        elif classifier_name == 'wnb':
            class_name = 'weka.classifiers.bayes.NaiveBayes'
            options = '-D'
        elif classifier_name == 'wbn':
            class_name = 'weka.classifiers.bayes.BayesNet'
            options = '-D -Q weka.classifiers.bayes.net.search.local.TAN -- -S BAYES -E weka.classifiers.bayes.net.estimate.SimpleEstimator -- -A 0.5'
        if options is not None:
            # the original dropped this assignment, leaving the wrapper
            # without a classifier whenever options were given
            self._classifier = Classifier(classname=class_name, options=[option for option in options.split()])
        else:
            self._classifier = Classifier(classname=class_name)

    def fit(self, training_set, ground_truth):
        self.ground_truth = ground_truth
        training_set = self._sklearn2weka(training_set, self.ground_truth)
        training_set.class_is_last()
        self._classifier.build_classifier(training_set)

    def predict(self, testing_set):
        testing_set = self._sklearn2weka(testing_set, self.ground_truth)
        testing_set.class_is_last()
        preds = []
        for index, inst in enumerate(testing_set):
            pred = self._classifier.classify_instance(inst)
            preds.append(pred)
        preds = np.vectorize(self._dict.get)(preds)
        return np.array(preds)

    def predict_proba(self, testing_set):
        testing_set = self._sklearn2weka(testing_set, self.ground_truth)
        testing_set.class_is_last()
        dists = []
        for index, inst in enumerate(testing_set):
            dist = self._classifier.distribution_for_instance(inst)
            dists.append(dist)
        return np.array(dists)

    def set_oracle(self, oracle):
        pass

    def _sklearn2weka(self, features, labels=None):
        features_encoder = OrdinalEncoder()
        labels_nominal = features_encoder.fit_transform(np.array(labels).reshape(-1, 1))
        # probe the attribute that is actually set (_dict); the original
        # checked hasattr(self, 'dict') and so rebuilt the mapping every call
        if not hasattr(self, '_dict') and labels is not None:
            mapping = {}
            for label, nominal in zip(labels, labels_nominal):
                if nominal.item(0) not in mapping:
                    mapping[nominal.item(0)] = label
            self._dict = mapping
        labels_column = np.reshape(labels_nominal, [labels_nominal.shape[0], 1])
        weka_dataset = ndarray_to_instances(np.ascontiguousarray(features, dtype=np.float_), 'weka_dataset')
        weka_dataset.insert_attribute(
            Attribute.create_nominal('tag', [str(float(i)) for i in range(len(self._dict))]),
            features.shape[1])
        if labels is not None:
            for index, inst in enumerate(weka_dataset):
                inst.set_value(features.shape[1], labels_column[index])
                weka_dataset.set_instance(index, inst)
        return weka_dataset
class Experiment:
    data = None
    class_index = -1
    classifier = None
    attrs = []

    def __init__(self):
        # jvm.start(max_heap_size="2500M")
        pass

    def out(self, x):
        print(str(x).encode('ascii', 'ignore').decode('ascii'))

    def loadCSV(self, filename, path='/home/sbiastoch/Schreibtisch/csv_files/'):
        weka_loader = Loader(classname="weka.core.converters.CSVLoader")
        self.data = weka_loader.load_file(path + filename)

    def setClassIndex(self, index):
        if index < 0:
            self.data.class_index = self.data.num_attributes + index
        else:
            self.data.class_index = index

    def train_J48(self, min_per_rule=20):
        params = [
            '-C', '0.3',
            '-M', str(min_per_rule),
            # '-N', str(folds),
            # '-R',
        ]
        self.base_classifier = Classifier(classname='weka.classifiers.trees.J48', options=params)
        self._train()

    def train_JRip(self, min_per_rule=20, optimizations=2, folds=3, seed=42):
        params = [
            '-F', str(folds),          # folds
            '-N', str(min_per_rule),   # min elements per rule
            '-O', str(optimizations),  # optimizations
            '-S', str(seed),           # seed
        ]
        self.base_classifier = Classifier(classname='weka.classifiers.rules.JRip', options=params)
        self._train()

    def _train(self):
        params = [
            '-F', 'weka.filters.unsupervised.attribute.RemoveByName -E ^(' + '|'.join(self.attrs) + ')$ -V',
            '-W', self.base_classifier.classname,
            '--',
        ]
        params.extend(self.base_classifier.options)
        self.classifier = FilteredClassifier(options=params)
        self.classifier.build_classifier(self.data)
        self.out(str(self.classifier).split("\n")[-2])

    def test(self, folds=10):
        evaluation = Evaluation(self.data)  # initialize with priors
        evaluation.crossvalidate_model(self.classifier, self.data, folds, Random(42))  # k-fold CV
        print('Total number of instances: ' + str(evaluation.num_instances) + '.')
        print(str(round(evaluation.percent_correct, 2)) + '% / ' + str(round(evaluation.correct, 2)) + ' correct.')
        print(str(round(evaluation.percent_incorrect, 2)) + '% / ' + str(round(evaluation.incorrect, 2)) + ' incorrect.')

    def saveCSV(self, filename, path='/home/sbiastoch/Schreibtisch/csv_files/'):
        saver = Saver(classname="weka.core.converters.CSVSaver")
        saver.save_file(self.data, path + filename)

    def loadClassifier(self, filename, path='/home/sbiastoch/Schreibtisch/classifiers/'):
        objects = serialization.read_all(path + filename)
        self.classifier = Classifier(jobject=objects[0])
        # self.data = Instances(jobject=objects[1])

    def saveClassifier(self, filename, path='/home/sbiastoch/Schreibtisch/classifiers/'):
        serialization.write_all(path + filename, [self.classifier, Instances.template_instances(self.data)])

    def remove_correct_classified(self, invert=False):
        options = [
            '-W', self.classifier.to_commandline(),
            '-C', str(self.class_index),  # class index
            # '-F', '0',    # folds
            # '-T', '0.1',  # threshold for numeric classes
            '-I', '0',      # max iterations
        ]
        if not invert:
            # appended conditionally; the original passed an empty-string
            # option when inverting
            options.append('-V')
        classname = "weka.filters.unsupervised.instance.RemoveMisclassified"
        remove = Filter(classname=classname, options=options)
        remove.inputformat(self.data)
        self.data = remove.filter(self.data)

    def remove_incorrect_classified(self):
        self.remove_correct_classified(True)

    def set_attributes(self, attrs):
        self.attrs = attrs

    def select_missclassified(self):
        remove = Filter(classname="weka.filters.supervised.attribute.AddClassification",
                        options=['-classification', '-error', '-W', self.base_classifier.to_commandline()])
        remove.inputformat(self.data)
        self.data = remove.filter(self.data)

        remove = Filter(classname="weka.filters.unsupervised.instance.RemoveWithValues",
                        options=['-S', '0.0', '-C', 'last', '-L', 'last', '-V'])
        remove.inputformat(self.data)
        # the original never applied this filter before building the next one
        self.data = remove.filter(self.data)

        remove = Filter(classname="weka.filters.unsupervised.attribute.Remove",
                        options=['-R', str(self.data.num_attributes - 2) + ',last'])
        remove.inputformat(self.data)
        self.data = remove.filter(self.data)

    def merge_nominal_attributes(self, significance=0.01):
        remove = Filter(classname="weka.filters.supervised.attribute.MergeNominalValues",
                        options=['-L', str(significance), '-R', 'first-last'])
        remove.inputformat(self.data)
        self.data = remove.filter(self.data)
def evaluate_j48(datasets_path, intermediary_path):
    # for examples on how to use this function, refer to
    # http://pythonhosted.org/python-weka-wrapper/examples.html#build-classifier-on-dataset-output-predictions
    import io
    import weka.core.jvm as jvm
    from weka.core.converters import Loader
    from weka.classifiers import Classifier
    # confusion_matrix added to the import: it is used below
    from sklearn.metrics import precision_score, accuracy_score, f1_score, confusion_matrix
    from networkx.drawing.nx_agraph import graphviz_layout

    jvm.start()

    json_results = {
        'runs': {
            '1': dict()
        }
    }

    try:
        for dataset in os.listdir(datasets_path):
            dataset_name = dataset.split('.')[0]
            json_results['runs']['1'][dataset_name] = dict()

            loader = Loader(classname="weka.core.converters.ArffLoader")

            y_pred_all = []
            y_true_all = []
            heights = []
            n_nodes = []

            for n_fold in it.count():
                try:
                    train_s = loader.load_file(
                        os.path.join(intermediary_path, '%s_fold_%d_train.arff' % (dataset_name, n_fold)))
                    val_s = loader.load_file(
                        os.path.join(intermediary_path, '%s_fold_%d_val.arff' % (dataset_name, n_fold)))
                    test_s = loader.load_file(
                        os.path.join(intermediary_path, '%s_fold_%d_test.arff' % (dataset_name, n_fold)))

                    train_s.relationname = dataset_name
                    val_s.relationname = dataset_name
                    test_s.relationname = dataset_name

                    train_s.class_is_last()
                    val_s.class_is_last()
                    test_s.class_is_last()

                    warnings.warn('WARNING: appending validation set in training set.')
                    for inst in val_s:
                        train_s.add_instance(inst)

                    cls = Classifier(classname="weka.classifiers.trees.J48", options=["-C", "0.25", "-M", "2"])
                    # cls = Classifier(classname="weka.classifiers.trees.REPTree",
                    #                  options=["-M", "2", "-V", "0.001", "-N", "3", "-S", "1", "-L", "-1", "-I", "0.0"])
                    cls.build_classifier(train_s)

                    warnings.warn('WARNING: will only work for binary splits!')
                    # cls.graph is already a str in Python 3; io.StringIO replaces
                    # the Python 2 StringIO module
                    out = io.StringIO(cls.graph)
                    G = nx.Graph(nx.nx_pydot.read_dot(out))

                    # TODO plotting!
                    # fig = plt.figure(figsize=(40, 30))
                    # pos = graphviz_layout(G, root='N0', prog='dot')
                    #
                    # edgelist = G.edges(data=True)
                    # nodelist = G.nodes(data=True)
                    #
                    # edge_labels = {(x1, x2): v['label'] for x1, x2, v in edgelist}
                    # node_colors = {node_id: ('#98FB98' if 'shape' in _dict else '#0099FF') for node_id, _dict in nodelist}
                    # node_colors['N0'] = '#FFFFFF'
                    # node_colors = node_colors.values()
                    #
                    # nx.draw_networkx_nodes(G, pos, node_color=node_colors)
                    # nx.draw_networkx_edges(G, pos, style='dashed', arrows=False)
                    # nx.draw_networkx_labels(G, pos, {k: v['label'] for k, v in G.nodes.items()})
                    # nx.draw_networkx_edge_labels(G, pos, edge_labels=edge_labels)
                    # plt.axis('off')
                    # plt.show()
                    # exit(0)
                    # TODO plotting!

                    heights += [max(map(len, nx.shortest_path(G, source='N0').values()))]
                    n_nodes += [len(G.nodes)]

                    y_test_true = []
                    y_test_pred = []
                    # y_train_true = []
                    # y_train_pred = []
                    # for index, inst in enumerate(train_s):
                    #     y_train_true += [inst.get_value(inst.class_index)]
                    #     y_train_pred += [cls.classify_instance(inst)]
                    for index, inst in enumerate(test_s):
                        y_test_true += [inst.get_value(inst.class_index)]
                        y_test_pred += [cls.classify_instance(inst)]

                    y_true_all += y_test_true
                    y_pred_all += y_test_pred
                except Exception as e:
                    break

            json_results['runs']['1'][dataset_name] = {
                'confusion_matrix': confusion_matrix(y_true_all, y_pred_all).tolist(),
                'height': heights,
                'n_nodes': n_nodes,
            }

        # interprets
        json_results = json.load(open('/home/henry/Desktop/j48/j48_results.json', 'r'))
        n_runs = len(json_results['runs'].keys())
        some_run = list(json_results['runs'].keys())[0]
        n_datasets = len(json_results['runs'][some_run].keys())

        df = pd.DataFrame(
            columns=['run', 'dataset', 'test_acc', 'height mean', 'height std', 'n_nodes mean', 'n_nodes std'],
            index=np.arange(n_runs * n_datasets),
            dtype=np.float32
        )
        df['dataset'] = df['dataset'].astype(object)

        count_row = 0
        for n_run, run in json_results['runs'].items():
            for dataset_name, dataset in run.items():
                conf_matrix = np.array(dataset['confusion_matrix'], dtype=np.float32)
                test_acc = np.diag(conf_matrix).sum() / conf_matrix.sum()
                height_mean = np.mean(dataset['height'])
                height_std = np.std(dataset['height'])
                n_nodes_mean = np.mean(dataset['n_nodes'])
                n_nodes_std = np.std(dataset['n_nodes'])

                df.loc[count_row] = [
                    int(n_run), str(dataset_name), float(test_acc),
                    float(height_mean), float(height_std),
                    float(n_nodes_mean), float(n_nodes_std)
                ]
                count_row += 1

        print(df)
        json.dump(json_results, open('j48_results.json', 'w'), indent=2)
        df.to_csv('j48_results.csv', sep=',', quotechar='\"', index=False)
    finally:
        jvm.stop()
def rbfc(train_data):
    train_data.class_is_last()
    cls = Classifier(classname="weka.classifiers.functions.RBFClassifier")
    cls.build_classifier(train_data)
    return cls
fileOut.write("##################### y => " + str(y)+"\n"); print("##################### y => " + str(y)+"\n"); # f1 = loader.load_file(PathToData + dataset + "/train.arff") train.class_is_last() test.class_is_last() # f1.class_is_last() labledDataSet , UnlabledDataSet = splitTrainSet(train); tree = Classifier(classname="weka.classifiers.trees.J48", options=["-A"]) # tree = Classifier(classname="weka.classifiers.trees.J48") tree.build_classifier(labledDataSet) eval = Evaluation(labledDataSet) eval.test_model(tree, test) fileOut.write("Labeled data======== " + str((1.0 - eval.error_rate )* 100) + " number of instances== " + str(labledDataSet.num_instances) + "\n") Newtrainpool = LabeledUnlabeldata(labledDataSet, UnlabledDataSet, tree, y ) # Newtrainpool = LabeledUnlabeldata(labledDataSet, UnlabledDataSet, tree, y , cal_method=Method) fileOut.write("\n\nLabeled data======== " + str((1.0 - eval.error_rate )* 100) + " number of instances== " + str(labledDataSet.num_instances) + "\n") fileOut.write(" Decision Tree \n") fileOut.write("\n precision recall areaUnderROC \n\n") for i in range(test.get_instance(0).num_classes) :
def adaboost(train_data):
    train_data.class_is_last()
    cls = Classifier(classname="weka.classifiers.meta.AdaBoostM1")
    cls.build_classifier(train_data)
    return cls
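# The one-line factories in this collection (random_forest, bagging,
# adaboost, ...) all follow the same pattern; a sketch of how they might be
# compared on one loaded dataset whose class attribute is already set:
from weka.classifiers import Evaluation
from weka.core.classes import Random

def compare_factories(train_data, factories=(random_forest, bagging, adaboost)):
    for factory in factories:
        model = factory(train_data)
        evl = Evaluation(train_data)
        evl.crossvalidate_model(model, train_data, 10, Random(1))
        print("%s: %.2f%% correct" % (factory.__name__, evl.percent_correct))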
def runner(self, cdat, heap_size = 16384, seed = None, verbose = True): self.set_status(Pipeline.RUNNING) self.logs.append('Initializing Pipeline') para = self.config self.logs.append('Reading Pipeline Configuration') head = '' name = get_rand_uuid_str() self.logs.append('Reading Input File') for i, stage in enumerate(self.stages): if stage.code in ('dat.fle', 'prp.bgc', 'prp.nrm', 'prp.pmc', 'prp.sum'): self.stages[i].status = Pipeline.RUNNING if stage.code == 'dat.fle': head = os.path.abspath(stage.value.path) name, _ = os.path.splitext(stage.value.name) self.logs.append('Parsing to ARFF') path = os.path.join(head, '{name}.arff'.format(name = name)) # This bug, I don't know why, using Config.schema instead. # cdat.toARFF(path, express_config = para.Preprocess.schema, verbose = verbose) for i, stage in enumerate(self.stages): if stage.code in ('dat.fle', 'prp.bgc', 'prp.nrm', 'prp.pmc', 'prp.sum'): self.stages[i].status = Pipeline.COMPLETE self.logs.append('Saved ARFF at {path}'.format(path = path)) self.logs.append('Splitting to Training and Testing Sets') JVM.start(max_heap_size = '{size}m'.format(size = heap_size)) load = Loader(classname = 'weka.core.converters.ArffLoader') # data = load.load_file(path) # save = Saver(classname = 'weka.core.converters.ArffSaver') data = load.load_file(os.path.join(head, 'iris.arff')) # For Debugging Purposes Only data.class_is_last() # For Debugging Purposes Only # data.class_index = cdat.iclss for i, stage in enumerate(self.stages): if stage.code == 'prp.kcv': self.stages[i].status = Pipeline.RUNNING self.logs.append('Splitting Training Set') # TODO - Check if this seed is worth it. seed = assign_if_none(seed, random.randint(0, 1000)) opts = ['-S', str(seed), '-N', str(para.Preprocess.FOLDS)] wobj = Filter(classname = 'weka.filters.supervised.instance.StratifiedRemoveFolds', options = opts + ['-V']) wobj.inputformat(data) tran = wobj.filter(data) self.logs.append('Splitting Testing Set') wobj.options = opts test = wobj.filter(data) for i, stage in enumerate(self.stages): if stage.code == 'prp.kcv': self.stages[i].status = Pipeline.COMPLETE self.logs.append('Performing Feature Selection') feat = [ ] for comb in para.FEATURE_SELECTION: if comb.USE: for i, stage in enumerate(self.stages): if stage.code == 'ats': search = stage.value.search.name evaluator = stage.value.evaluator.name if search == comb.Search.NAME and evaluator == comb.Evaluator.NAME: self.stages[i].status = Pipeline.RUNNING srch = ASSearch(classname = 'weka.attributeSelection.{classname}'.format( classname = comb.Search.NAME, options = assign_if_none(comb.Search.OPTIONS, [ ]) )) ewal = ASEvaluation(classname = 'weka.attributeSelection.{classname}'.format( classname = comb.Evaluator.NAME, options = assign_if_none(comb.Evaluator.OPTIONS, [ ]) )) attr = AttributeSelection() attr.search(srch) attr.evaluator(ewal) attr.select_attributes(tran) meta = addict.Dict() meta.search = comb.Search.NAME meta.evaluator = comb.Evaluator.NAME meta.features = [tran.attribute(index).name for index in attr.selected_attributes] feat.append(meta) for i, stage in enumerate(self.stages): if stage.code == 'ats': search = stage.value.search.name evaluator = stage.value.evaluator.name if search == comb.Search.NAME and evaluator == comb.Evaluator.NAME: self.stages[i].status = Pipeline.COMPLETE models = [ ] for model in para.MODEL: if model.USE: summary = addict.Dict() self.logs.append('Modelling {model}'.format(model = model.LABEL)) summary.label = model.LABEL summary.name = model.NAME summary.options = 
assign_if_none(model.OPTIONS, [ ]) for i, stage in enumerate(self.stages): if stage.code == 'lrn' and stage.value.name == model.NAME: self.stages[i].status = Pipeline.RUNNING for i, instance in enumerate(data): iclass = list(range(instance.num_classes)) options = assign_if_none(model.OPTIONS, [ ]) classifier = Classifier(classname = 'weka.classifiers.{classname}'.format(classname = model.NAME), options = options) classifier.build_classifier(tran) serializer.write(os.path.join(head, '{name}.{classname}.model'.format( name = name, classname = model.NAME )), classifier) self.logs.append('Testing model {model}'.format(model = model.LABEL)) evaluation = Evaluation(tran) evaluation.test_model(classifier, test) summary.summary = evaluation.summary() frame = pd.DataFrame(data = evaluation.confusion_matrix) axes = sns.heatmap(frame, cbar = False, annot = True) b64str = get_b64_plot(axes) summary.confusion_matrix = addict.Dict({ 'value': evaluation.confusion_matrix.tolist(), 'plot': b64str }) self.logs.append('Plotting Learning Curve for {model}'.format(model = model.LABEL)) buffer = io.BytesIO() plot_classifier_errors(evaluation.predictions, tran, test, outfile = buffer, wait = False) b64str = buffer_to_b64(buffer) summary.learning_curve = b64str buffer = io.BytesIO() plot_roc(evaluation, class_index = iclass, outfile = buffer, wait = False) b64str = buffer_to_b64(buffer) summary.roc_curve = b64str buffer = io.BytesIO() plot_prc(evaluation, class_index = iclass, outfile = buffer, wait = False) b64str = buffer_to_b64(buffer) summary.prc_curve = b64str if classifier.graph: summary.graph = classifier.graph for i, instance in enumerate(test): prediction = classifier.classify_instance(instance) for i, stage in enumerate(self.stages): if stage.code == 'lrn' and stage.value.name == model.NAME: self.stages[i].status = Pipeline.COMPLETE models.append(summary) self.gist.models = models JVM.stop() JSON.write(os.path.join(head, '{name}.cgist'.format(name = name)), self.gist) self.logs.append('Pipeline Complete') self.set_status(Pipeline.COMPLETE)
from weka.classifiers import Evaluation from weka.core.classes import Random from weka.classifiers import Classifier if classifier == 0: for kernel in range(0, 2): if kernel == 0: mapper = Classifier( classname= "weka.classifiers.misc.InputMappedClassifier", options=[ "-M", "-W", "weka.classifiers.bayes.NaiveBayes" ]) Class = 'NaiveBayes' mapper.build_classifier(dataTrain) evaluation = Evaluation(dataTrain) evaluation.test_model(mapper, dataTest) roc_NB.append( evaluation.area_under_roc(1) * 100) recall_NB.append(evaluation.recall(1) * 100) precision_NB.append( evaluation.precision(1) * 100) mapper.build_classifier(dataLastTrain) evaluation = Evaluation(dataLastTrain) evaluation.test_model(mapper, dataLastTest) roc_NB_Last.append( evaluation.area_under_roc(1) * 100) recall_NB_Last.append(
def train(request):
    jvm.start()
    d_att1 = Attribute.create_numeric("bodydearword.feature")
    d_att2 = Attribute.create_numeric("bodyform.feature")
    d_att3 = Attribute.create_numeric("bodyhtml.feature")
    d_att4 = Attribute.create_numeric("bodymultipart.feature")
    d_att5 = Attribute.create_numeric("bodynumchars.feature")
    d_att6 = Attribute.create_numeric("bodynumfunctionwords.feature")
    d_att7 = Attribute.create_numeric("bodynumuniqwords.feature")
    d_att8 = Attribute.create_numeric("bodynumwords.feature")
    d_att9 = Attribute.create_numeric("bodyrichness.feature")
    d_att10 = Attribute.create_numeric("bodysuspensionword.feature")
    d_att11 = Attribute.create_numeric("bodyverifyyouraccountphrase.feature")
    d_att12 = Attribute.create_numeric("externalsabinary.feature")
    d_att13 = Attribute.create_numeric("externalsascore.feature")
    d_att14 = Attribute.create_numeric("scriptjavascript.feature")
    d_att15 = Attribute.create_numeric("scriptonclick.feature")
    d_att16 = Attribute.create_numeric("scriptpopup.feature")
    d_att17 = Attribute.create_numeric("scriptstatuschange.feature")
    d_att18 = Attribute.create_numeric("scriptunmodalload.feature")
    d_att19 = Attribute.create_numeric("senddiffreplyto.feature")
    d_att20 = Attribute.create_numeric("sendnumwords.feature")
    d_att21 = Attribute.create_numeric("sendunmodaldomain.feature")
    d_att22 = Attribute.create_numeric("subjectbankword.feature")
    d_att23 = Attribute.create_numeric("subjectdebitword.feature")
    d_att24 = Attribute.create_numeric("subjectfwdword.feature")
    d_att25 = Attribute.create_numeric("subjectnumchars.feature")
    d_att26 = Attribute.create_numeric("subjectnumwords.feature")
    d_att27 = Attribute.create_numeric("subjectreplyword.feature")
    d_att28 = Attribute.create_numeric("subjectrichness.feature")
    d_att29 = Attribute.create_numeric("subjectverifyword.feature")
    d_att30 = Attribute.create_numeric("urlatchar.feature")
    d_att31 = Attribute.create_numeric("urlbaglink.feature")
    d_att32 = Attribute.create_numeric("urlip.feature")
    d_att33 = Attribute.create_numeric("urlnumdomains.feature")
    d_att34 = Attribute.create_numeric("urlnumexternallink.feature")
    d_att35 = Attribute.create_numeric("urlnumimagelink.feature")
    d_att36 = Attribute.create_numeric("urlnuminternallink.feature")
    d_att37 = Attribute.create_numeric("urlnumip.feature")
    d_att38 = Attribute.create_numeric("urlnumlink.feature")
    d_att39 = Attribute.create_numeric("urlnumperiods.feature")
    d_att40 = Attribute.create_numeric("urlnumport.feature")
    d_att41 = Attribute.create_numeric("urlport.feature")
    d_att42 = Attribute.create_numeric("urltwodoains.feature")
    d_att43 = Attribute.create_numeric("urlunmodalbaglink.feature")
    d_att44 = Attribute.create_numeric("urlwordclicklink.feature")
    d_att45 = Attribute.create_numeric("urlwordherelink.feature")
    d_att46 = Attribute.create_numeric("urlwordloginlink.feature")
    d_att47 = Attribute.create_numeric("urlwordupdatelink.feature")
    # nominal labels as an ordered list rather than a set
    d_att48 = Attribute.create_nominal("class", ['phish', 'ham'])

    # these two lines were commented out in the original, but both names are
    # used right below
    data_dir = settings.BASE_DIR + "/phishing/public/datasets/"
    loader = Loader(classname="weka.core.converters.ArffLoader")
    data = loader.load_file(data_dir + "dataset.arff")
    data.class_is_last()

    cls = Classifier(classname="weka.classifiers.trees.J48")
    cls.options = ["-C", "0.3"]
    cls.build_classifier(data)
    serialization.write(data_dir + "out.model", cls)

    classifier = Classifier(jobject=serialization.read(data_dir + "out.model"))
    dataset = Instances.create_instances("test", [
        d_att1, d_att2, d_att3, d_att4, d_att5, d_att6, d_att7, d_att8, d_att9,
        d_att10, d_att11, d_att12, d_att13, d_att14, d_att15, d_att16, d_att17,
        d_att18, d_att19, d_att20, d_att21, d_att22, d_att23, d_att24, d_att25,
        d_att26, d_att27, d_att28, d_att29, d_att30, d_att31, d_att32, d_att33,
        d_att34, d_att35, d_att36, d_att37, d_att38, d_att39, d_att40, d_att41,
        d_att42, d_att43, d_att44, d_att45, d_att46, d_att47, d_att48
    ], 0)
    values = [
        0, 0, 0, 0, 890, 1, 124, 198, 0.22247191011236, 0, 0, 0, 0.0, 0, 0, 0,
        0, 0, 1, 4, 0, 0, 0, 0, 21, 4, 1, 0.19047619047619, 0, 0, 0, 0, 2, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, Instance.missing_value()
    ]
    inst = Instance.create_instance(values)
    dataset.add_instance(inst)
    dataset.class_is_last()
    # print(str(dataset))

    var = ''
    for inst1 in dataset:
        pred = classifier.classify_instance(inst1)
        var = inst1.class_attribute.value(int(pred))
        if var == 'ham':
            print('Not phishing')
            # do something
        else:
            print('Phishing')
            # do something
    print(var)
    jvm.stop()
    return HttpResponse(str(var))
class P1: # INIT for class (runs project) - !arr initializes empty data field def __init__(self, load): self.cls = None # Reuseable self.evl = None self.data = [] self.IBK = {} self.J48 = {} if load: self.loader = Loader(classname="weka.core.converters.ArffLoader") self.parseARFF("data") def run(self): self.run1(1) self.run2() # for i in self.J48: # print "J48 - "+str(i) # for j in self.J48[i]: # print str(j) self.graph() ########################################################################### # GRAPHING METHODS # ########################################################################### def graph(self): if os.path.exists(os.getcwd() + '/graphs'): shutil.rmtree(os.getcwd() + '/graphs') # Removed extant os.makedirs(os.getcwd() + '/graphs') plt1 = pd.DataFrame({ 'features': map(str, range(14, 95, 10)), 'J48acc': self.J48[1], 'IBKacc': self.IBK[1] }) plt2 = pd.DataFrame({ 'examples': map(int, range(50, 501, 50)), 'xaxis': map(float, range(0, 10)), 'J48std14': self.J48["14AVG"]['stdev'], 'J48acc14': self.J48["14AVG"]['acc'], 'IBKstd14': self.IBK["14AVG"]['stdev'], 'IBKacc14': self.IBK["14AVG"]['acc'], 'J48std54': self.J48["54AVG"]['stdev'], 'J48acc54': self.J48["54AVG"]['acc'], 'IBKstd54': self.IBK["54AVG"]['stdev'], 'IBKacc54': self.IBK["54AVG"]['acc'] }) self.graph1(plt1) self.graph2(plt2) def graph1(self, plt1): # 1st graph: IBK accuracy over # of features ax = plt1[['features', 'J48acc']].plot(x='features', linestyle='-', marker='+') plt1[['features', 'J48acc']].plot(x='features', kind='bar', ax=ax) plt.ylim(0.7, 0.8) plt.title("J48 accuracy over features") plt.ylabel("Accuracy") plt.xlabel("# Features") plt.show() plt.savefig('graphs/J48.png') plt.gcf().clear() # 2nd graph: J48 accuracy over # of features ax = plt1[['features', 'IBKacc']].plot(x='features', linestyle='-', marker='+') plt1[['features', 'IBKacc']].plot(x='features', kind='bar', ax=ax) plt.ylim(0.4, 0.8) plt.title("IBK accuracy over features") plt.ylabel("Accuracy") plt.xlabel("# Features") plt.show() plt.savefig('graphs/IBK.png') plt.gcf().clear() def graph2(self, plt2): # Setting the positions and width for the bars pos = list(range(len(plt2['IBKacc14']))) width = 0.2 # Plotting the bars fig, ax = plt.subplots(figsize=(10, 5)) # Create a bar with pre_score data, # in position pos, plt.bar(pos, plt2['IBKacc14'], width, alpha=0.5, color='#EE3224', yerr=plt2['IBKstd14'], label="IBKacc14") plt.bar([p + width for p in pos], plt2['IBKacc54'], width, alpha=0.5, color='#F78F1E', yerr=plt2['IBKstd54'], label="IBKacc54") plt.bar([p + width * 2 for p in pos], plt2['J48acc14'], width, alpha=0.5, color='#FFC222', yerr=plt2['J48std14'], label="J48acc14") plt.bar([p + width * 3 for p in pos], plt2['J48acc54'], width, alpha=0.5, color='green', yerr=plt2['J48std54'], label="J48acc54") # Set the position of the x ticks ax.set_xticks([p + 1.5 * width for p in pos]) # Set the labels for the x ticks ax.set_xticklabels(plt2['examples']) # Setting the x-axis and y-axis limits plt.xlim(min(pos) - width, max(pos) + width * 4) plt.ylim(0.0, 1.3) plt.title("J48 v IBK Performance Over Training Sample Size") plt.ylabel("Accuracy") plt.xlabel("# Examples") # Shrink current axis by 20% box = ax.get_position() ax.set_position([box.x0, box.y0, box.width * 0.8, box.height]) # Put a legend to the right of the current axis ax.legend(loc='center left', bbox_to_anchor=(1, 0.5)) plt.show() plt.savefig('graphs/14_54.png') plt.gcf().clear() ########################################################################### # DATA-GENERATING METHODS # 
########################################################################### def run1(self, num): self.gen1( num) # Has imported saved in init object w /data/ presumption self.data1 = deepcopy(self.data) def run2(self): self.IBK["14AVG"] = {'acc': [0.0] * 10, 'stdev': [0.0] * 10} self.IBK["54AVG"] = {'acc': [0.0] * 10, 'stdev': [0.0] * 10} self.J48["14AVG"] = {'acc': [0.0] * 10, 'stdev': [0.0] * 10} self.J48["54AVG"] = {'acc': [0.0] * 10, 'stdev': [0.0] * 10} # Loops through algorithm 10 times and generates avg, stdev along folds for i in range(0, 10): self.data = self.data1 self.gen2() for i, v in enumerate(self.IBK[14]): self.IBK["14AVG"]['acc'][i] += v['acc'] self.IBK["14AVG"]['stdev'][i] += v['stdev'] for i, v in enumerate(self.IBK[54]): self.IBK["54AVG"]['acc'][i] += v['acc'] self.IBK["54AVG"]['stdev'][i] += v['stdev'] for i, v in enumerate(self.J48[14]): self.J48["14AVG"]['acc'][i] += v['acc'] self.J48["14AVG"]['stdev'][i] += v['stdev'] for i, v in enumerate(self.J48[54]): self.J48["54AVG"]['acc'][i] += v['acc'] self.J48["54AVG"]['stdev'][i] += v['stdev'] self.IBK["14AVG"]['acc'] = [(i / 10.0) for i in self.IBK["14AVG"]['acc']] self.IBK["14AVG"]['stdev'] = [(i / 10.0) for i in self.IBK["14AVG"]['stdev']] self.J48["14AVG"]['acc'] = [(i / 10.0) for i in self.J48["14AVG"]['acc']] self.J48["14AVG"]['stdev'] = [(i / 10.0) for i in self.J48["14AVG"]['stdev']] self.IBK["54AVG"]['acc'] = [(i / 10.0) for i in self.IBK["54AVG"]['acc']] self.IBK["54AVG"]['stdev'] = [(i / 10.0) for i in self.IBK["54AVG"]['stdev']] self.J48["54AVG"]['acc'] = [(i / 10.0) for i in self.J48["54AVG"]['acc']] self.J48["54AVG"]['stdev'] = [(i / 10.0) for i in self.J48["54AVG"]['stdev']] def gen1(self, label): self.IBK[label] = [] self.J48[label] = [] for x in range(0, 9): # Train IBk self.train(True, self.data[x + 9]['data']) stash1 = self.test1(self.data[x]['data']) self.IBK[label].append(stash1) # Train J48 self.train(False, self.data[x + 9]['data']) stash2 = self.test1(self.data[x]['data']) self.J48[label].append(stash2) def gen2(self): # Make temp folder to hold train- and test- .arffs if os.path.exists(os.getcwd() + '/temp'): shutil.rmtree(os.getcwd() + '/temp') # Removed extant os.makedirs(os.getcwd() + '/temp') # For the 14 feature and 54-feature sets ALONE: test = [] for i in [0, 4]: test.append(self.data[i]['file']) # Add file to test set train = self.data[i + 9] raw = self.getARFFpieces(train['file']) temp = {} temp['header'] = raw['header'] for k in range(50, 501, 50): temp['train'] = self.getXrandom(k, raw['train']) self.genARFF(i, k, temp) self.parseARFF("temp") # Changes self.data permanently freshtests = self.parseSomeARFF(test) storeall = self.data self.data = [i for i in self.data if "14" in i['file']] self.data.append(freshtests[0]) # Adds the 14-feature TEST set self.gen3(14) self.data = [i for i in storeall if "54" in i['file']] self.data.append(freshtests[1]) # Adds the 54-feature TEST set self.gen3(54) def gen3(self, label): self.IBK[label] = [] self.J48[label] = [] for x in range(0, 10): # Train IBk self.train(True, self.data[x]['data']) stash1 = self.test2(self.data[10]['data']) self.IBK[label].append(stash1) # Train J48 self.train(False, self.data[x]['data']) stash2 = self.test2(self.data[10]['data']) self.J48[label].append(stash2) ########################################################################### # TRAINING & TESTING METHODS # ########################################################################### # Stashes newly trained classifier into self.cls attribute def train(self, IBk, 
    def train(self, IBk, xtrain):
        if IBk:
            self.cls = Classifier(classname="weka.classifiers.lazy.IBk")  # TODO - options?
        else:
            self.cls = Classifier(classname="weka.classifiers.trees.J48", options=["-B"])
        self.cls.build_classifier(xtrain)  # Builds with the train set for the next step
        self.evl = Evaluation(xtrain)

    def test1(self, xtest):
        if self.cls is None:
            print("Fail: no classifier to test with\n")
            return None
        self.evl.test_model(self.cls, xtest)
        return self.findacc(self.evl.confusion_matrix)

    def test2(self, xtest):
        if self.cls is None:
            print("Fail: no classifier to test with\n")
            return None
        self.evl.test_model(self.cls, xtest)
        return {
            'acc': self.findacc(self.evl.confusion_matrix),
            'stdev': self.evl.root_mean_squared_error
        }

    def findacc(self, cmatrix):
        # (a+d) / (a+b+c+d): parses the numbers back out of the printed
        # confusion matrix - fragile; see the numpy-based sketch after the class
        x = list(map(float, filter(None, [re.sub("[^0-9]", "", i) for i in str(cmatrix).split('.')])))
        return (x[0] + x[3]) / sum(x)

    ###########################################################################
    #                     ARFF PARSING & HANDLING METHODS                     #
    ###########################################################################
    # Pulls the header and the data examples apart (drops empty lines)
    # and returns them as a dict of lists
    def getARFFpieces(self, fname1):  # TODO - clean components
        temp = {'header': [], 'train': []}
        for file in glob.glob('data/*.arff'):
            if file == fname1:  # TRAIN
                with open(os.getcwd() + '/' + file) as f:
                    rest = False
                    for line in f:
                        if line[0] == "@":
                            temp['header'].append(line)
                            if "@DATA" in line:
                                rest = True
                        elif rest and len(line) > 3:  # Cutoff for EOF lines
                            temp['train'].append(line)
        return temp

    # Create new ARFF files from which to import
    def genARFF(self, i, k, data):
        if i == 0:
            path = "14-"
        else:
            path = "54-"
        if k == 50:
            path += "0"
        path += str(k) + ".arff"
        with open(os.getcwd() + "/temp/Train" + path, "w") as train:
            for j in data['header']:
                train.write(j)
            for j in data['train']:
                train.write(j)

    # Get x random examples from a list & return them (no repeats)
    def getXrandom(self, x, arr):
        ret = []
        used = []
        max_idx = len(arr) - 1  # Max randint value ('max' would shadow the builtin)
        if (max_idx + 1) < x:
            print("Err: can't take " + str(x) + " values from a set of " + str(max_idx + 1))
            return ret
        while len(ret) < x:
            new = randint(0, max_idx)
            if new not in used:  # make sure not to repeat a line
                used.append(new)
                ret.append(arr[new])  # Saves that line
        return ret

    # Stashes loaded Instances into the data array, given a subdirectory name
    def parseARFF(self, param):
        self.data = []  # CLEARS FROM PREVIOUS CALL - needed in the special iteration case
        try:
            for file in glob.glob(str(param) + "/*.arff"):
                # with open(os.getcwd() + "/" + file, 'r') as f:
                temp = self.loader.load_file(file)
                temp.class_is_last()
                self.data.append({'file': file, 'data': temp})
        except Exception:
            print("Missing " + str(param) + "/ folder in cwd\n")

    def parseSomeARFF(self, paths):
        ret = []
        for path in paths:
            temp = self.loader.load_file(os.getcwd() + '/' + path)
            temp.class_is_last()
            ret.append({'file': path, 'data': temp})
        return ret
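# --- Sketch (not from the original source): the regex parsing in findacc above
# is fragile. Evaluation exposes the same numbers directly; for a two-class
# problem the accuracy can come straight from the numpy confusion matrix.
def findacc_np(cmatrix):
    # cmatrix is evl.confusion_matrix, a 2x2 numpy array: (TP + TN) / total
    return (cmatrix[0][0] + cmatrix[1][1]) / float(cmatrix.sum())
# For any number of classes, evl.percent_correct / 100.0 gives the same value.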
# we'll set the class attribute after filtering # apply NominalToBinary filter and set class attribute fltr = Filter("weka.filters.unsupervised.attribute.NominalToBinary") fltr.inputformat(data) filtered = fltr.filter(data) filtered.class_is_last() # cross-validate LinearRegression on filtered data, display model cls = Classifier(classname="weka.classifiers.functions.LinearRegression") pout = PredictionOutput(classname="weka.classifiers.evaluation.output.prediction.PlainText") evl = Evaluation(filtered) evl.crossvalidate_model(cls, filtered, 10, Random(1), pout) print("10-fold cross-validation:\n" + evl.summary()) print("Predictions:\n\n" + str(pout)) cls.build_classifier(filtered) print("Model:\n\n" + str(cls)) # use AddClassification filter with LinearRegression on filtered data print("Applying AddClassification to filtered data:\n") fltr = Filter( classname="weka.filters.supervised.attribute.AddClassification", options=["-W", "weka.classifiers.functions.LinearRegression", "-classification"]) fltr.inputformat(filtered) classified = fltr.filter(filtered) print(classified) # convert class back to nominal fltr = Filter(classname="weka.filters.unsupervised.attribute.NumericToNominal", options=["-R", "9"]) fltr.inputformat(classified) nominal = fltr.filter(classified)
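# --- Sketch (assumes python-weka-wrapper's ArffSaver): persist the dataset
# with the class converted back to nominal, so later runs can reload it.
from weka.core.converters import Saver

saver = Saver(classname="weka.core.converters.ArffSaver")
saver.save_file(nominal, "output-nominal.arff")  # output file name is hypothetical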
# Check data in datasets
print(train.num_instances)
print(test.num_instances)
print(train.num_attributes)
print(test.num_attributes)

# Create classifier
from weka.classifiers import Classifier
cls = Classifier(classname="weka.classifiers.bayes.NaiveBayes")  # No options of interest to adjust

# Build classifier on training data
cls.build_classifier(train)
# print(cls)
# import weka.plot.graph as graph
# graph.plot_dot_graph(cls.graph)

from weka.classifiers import Evaluation
from weka.core.classes import Random
evl = Evaluation(train)
evl.crossvalidate_model(cls, train, 10, Random(1))

print("Kappa Score")
print(evl.kappa)  # 0.50 - Not bad
print("Evaluation Summary")
print(evl.summary())  # Accuracy: 83%
test_data.class_is_first()

# load logistic model tree algorithm
log_tree = Classifier(classname="weka.classifiers.trees.LMT")
eval_test_obj = Evaluation(test_data)
eval_test_obj.crossvalidate_model(classifier=log_tree, data=test_data,
                                  num_folds=5, rnd=Random(1))
print("Test CV (5 folds) Error = %.2f%%" % eval_test_obj.percent_incorrect)
print(eval_test_obj.matrix())
print("=================\"Summary\"====================")
print(eval_test_obj.summary())

log_tree.build_classifier(test_data)
y_predict = eval_test_obj.test_model(log_tree, test_data)
y_test = to_binary_numeric(y_test.head(500), classNeg="neg")
falsePositiveRate, truePositiveRate, thresholds = roc_curve(y_test, y_predict,
                                                            pos_label=0)
# compute Area Under the Curve (AUC) using the trapezoidal rule
area = auc(falsePositiveRate, truePositiveRate)
plt.plot(falsePositiveRate, truePositiveRate, color='red',
         label='ROC = ' + str(area))
plt.plot([0, 1], [0, 1], linestyle='dotted')
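# --- Sketch (not the author's code): roc_curve gives a more informative curve
# when fed class probabilities rather than hard 0/1 predictions. log_tree and
# test_data come from above; the positive class is assumed to sit at index 0.
y_score = [log_tree.distribution_for_instance(inst)[0] for inst in test_data]
fpr, tpr, thr = roc_curve(y_test, y_score, pos_label=0)
plt.plot(fpr, tpr, color='blue', label='ROC (probabilities) = ' + str(auc(fpr, tpr)))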
# load a dataset
data_file = "HairEyeColor.csv"
print("Loading dataset: " + data_file)
loader = Loader(classname="weka.core.converters.CSVLoader")
data = loader.load_file(data_file)
print(data.num_attributes)
data.class_is_last()

# build a classifier and output model
print("Training J48 classifier on " + data_file)
classifier = Classifier(classname="weka.classifiers.trees.J48", options=["-C", "0.5"])
# Instead of using 'options=["-C", "0.3"]' in the constructor, we can also set the "confidenceFactor"
# property of the J48 classifier itself. However, being of type float rather than double, we need
# to convert it to the correct type first using the double_to_float function:
# classifier.set_property("confidenceFactor", types.double_to_float(0.3))
classifier.build_classifier(data)
print(classifier)
print(classifier.graph)
# plot_graph.plot_dot_graph(classifier.graph)

evaluation = Evaluation(data)  # initialize with priors
evaluation.crossvalidate_model(classifier, data, 10, Random(42))  # 10-fold CV
print(evaluation.summary())
print("pctCorrect: " + str(evaluation.percent_correct))
print("incorrect: " + str(evaluation.incorrect))

jvm.stop()
def run_multilayerPercepton(file, file2=None):
    # Get filename from Pathlib object
    filename = file.parts[-1]
    parent_dir = file.parents[0]
    print("Running Multilayer Perceptron on %s" % filename)

    if not filename.endswith(".arff"):
        print("%s not ARFF file." % filename)
        return

    # Remove '.arff' from filename
    filename_base = filename[:-5]

    print("Loading data...")
    # Load data with class as first attr
    data = load_Arff_file(file)
    data.class_is_first()

    # If a 2nd file is given, load that as the test set
    if file2:
        print("Loading test...")
        test = load_Arff_file(file2)
        test.class_is_first()

    file_names = [
        "MP_N-500_default_H-1",
        "MP_N-500_H-3", "MP_N-500_H-5", "MP_N-500_H-7",
        "MP_N-500_H-3-5", "MP_N-500_H-5-3",
        "MP_N-500_H-3-5-7", "MP_N-500_H-7-3-5", "MP_N-500_H-5-7-3",
        "MP_N-500_L-01", "MP_N-500_L-02", "MP_N-500_L-04", "MP_N-500_L-05",
        "MP_N-500_M-01", "MP_N-500_M-03", "MP_N-500_M-04", "MP_N-500_M-05",
        "MP_N-500_E-5", "MP_N-500_E-10", "MP_N-500_E-15", "MP_N-500_E-25",
    ]

    options_list = [
        ["-L", "0.3", "-M", "0.2", "-N", "500", "-V", "0", "-S", "0", "-E", "20", "-H", "1"],        # DEFAULT
        ["-L", "0.3", "-M", "0.2", "-N", "500", "-V", "0", "-S", "0", "-E", "20", "-H", "3"],        # -H START
        ["-L", "0.3", "-M", "0.2", "-N", "500", "-V", "0", "-S", "0", "-E", "20", "-H", "5"],
        ["-L", "0.3", "-M", "0.2", "-N", "500", "-V", "0", "-S", "0", "-E", "20", "-H", "7"],
        ["-L", "0.3", "-M", "0.2", "-N", "500", "-V", "0", "-S", "0", "-E", "20", "-H", "3, 5"],
        ["-L", "0.3", "-M", "0.2", "-N", "500", "-V", "0", "-S", "0", "-E", "20", "-H", "5, 3"],
        ["-L", "0.3", "-M", "0.2", "-N", "500", "-V", "0", "-S", "0", "-E", "20", "-H", "3, 5, 7"],
        ["-L", "0.3", "-M", "0.2", "-N", "500", "-V", "0", "-S", "0", "-E", "20", "-H", "7, 3, 5"],
        ["-L", "0.3", "-M", "0.2", "-N", "500", "-V", "0", "-S", "0", "-E", "20", "-H", "5, 7, 3"],  # -H END
        ["-L", "0.1", "-M", "0.2", "-N", "500", "-V", "0", "-S", "0", "-E", "20", "-H", "1"],        # -L START
        ["-L", "0.2", "-M", "0.2", "-N", "500", "-V", "0", "-S", "0", "-E", "20", "-H", "1"],
        ["-L", "0.4", "-M", "0.2", "-N", "500", "-V", "0", "-S", "0", "-E", "20", "-H", "1"],
        ["-L", "0.5", "-M", "0.2", "-N", "500", "-V", "0", "-S", "0", "-E", "20", "-H", "1"],        # -L END
        ["-L", "0.3", "-M", "0.1", "-N", "500", "-V", "0", "-S", "0", "-E", "20", "-H", "1"],        # -M START
        ["-L", "0.3", "-M", "0.3", "-N", "500", "-V", "0", "-S", "0", "-E", "20", "-H", "1"],
        ["-L", "0.3", "-M", "0.4", "-N", "500", "-V", "0", "-S", "0", "-E", "20", "-H", "1"],
        ["-L", "0.3", "-M", "0.5", "-N", "500", "-V", "0", "-S", "0", "-E", "20", "-H", "1"],        # -M END
        ["-L", "0.3", "-M", "0.2", "-N", "500", "-V", "0", "-S", "0", "-E", "5", "-H", "1"],         # -E START
        ["-L", "0.3", "-M", "0.2", "-N", "500", "-V", "0", "-S", "0", "-E", "10", "-H", "1"],
        ["-L", "0.3", "-M", "0.2", "-N", "500", "-V", "0", "-S", "0", "-E", "15", "-H", "1"],
        ["-L", "0.3", "-M", "0.2", "-N", "500", "-V", "0", "-S", "0", "-E", "25", "-H", "1"],        # -E END
    ]

    for i in range(len(options_list)):
        start = time.time()
        print("Beginning iteration " + str(i) + ": " + file_names[i])

        # Use MultilayerPerceptron and set options
        cls = Classifier(
            classname="weka.classifiers.functions.MultilayerPerceptron",
            options=options_list[i])

        # Build classifier with train data
        cls.build_classifier(data)

        # Predictions stored in pout
        pout = PredictionOutput(
            classname="weka.classifiers.evaluation.output.prediction.PlainText")

        # Evaluate on the test data
        evaluation = Evaluation(data)
        evaluation.test_model(cls, test, output=pout)

        print(evaluation.summary())
        print(evaluation.class_details())
        print(evaluation.confusion_matrix)

        # Generate grid for ROC
        # plcls.plot_roc(evaluation, class_index=[0, 1], wait=True)

        # mk dirs for output
        tempdir = parent_dir / "Results/" / "MP-ALL_N-500_results/" / (file_names[i] + "_results/")
        tempdir.mkdir(parents=True, exist_ok=True)

        # Save summary, class details and confusion matrix to file
        result_output = file_names[i] + "_results.txt"
        print(tempdir)
        print(result_output)
        print((tempdir / result_output).absolute())
        output_eval(evaluation, tempdir / result_output)

        # Save the predicted results to file
        prediction_output = file_names[i] + "_prediction.txt"
        output_pred(pout, tempdir / prediction_output)

        end = time.time()
        timetaken = round(end - start, 2)
        print("Time taken to run iteration " + str(i) + ": %s seconds" % timetaken)

    print("Multilayer Perceptron complete")
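# --- Refactoring sketch (not from the original script): file_names and
# options_list must stay index-aligned. Iterating over them together with
# zip() makes the pairing explicit; the names below come from the function above.
for name, options in zip(file_names, options_list):
    cls = Classifier(classname="weka.classifiers.functions.MultilayerPerceptron",
                     options=options)
    cls.build_classifier(data)
    print("Finished " + name)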
from weka.classifiers import Classifier, Evaluation
from weka.core.classes import Random

if classifier == 0:
    SMOTE = Filter(classname="weka.filters.supervised.instance.SMOTE",
                   options=['-P', str(smote)])
    SMOTE.inputformat(dataSlowTrain)
    dataSlowTrain = SMOTE.filter(dataSlowTrain)
    SMOTE.inputformat(dataFastTrain)
    dataFastTrain = SMOTE.filter(dataFastTrain)

    for kernel in range(0, 1):
        if kernel == 0:
            mapper = Classifier(classname="weka.classifiers.misc.InputMappedClassifier",
                                options=["-M", "-W", "weka.classifiers.bayes.NaiveBayes"])
            Class = 'NaiveBayes'

            mapper.build_classifier(dataSlowTrain)
            evaluation = Evaluation(dataSlowTrain)
            evaluation.test_model(mapper, dataSlowTest)
            roc_NB.append(evaluation.area_under_roc(1) * 100)
            recall_NB.append(evaluation.recall(yIndexSlow) * 100)
            precision_NB.append(evaluation.precision(yIndexSlow) * 100)

            mapper.build_classifier(dataFastTrain)
            evaluation = Evaluation(dataFastTrain)
            evaluation.test_model(mapper, dataFastTest)
            roc_NB_Last.append(evaluation.area_under_roc(1) * 100)
            recall_NB_Last.append(evaluation.recall(yIndexFast) * 100)
            precision_NB_Last.append(evaluation.precision(yIndexFast) * 100)

            mapper.build_classifier(dataNeutralTrain)
from weka.core.converters import Loader
from weka.core.classes import Random
from weka.classifiers import Classifier, Evaluation

jvm.start()

# load diabetes
loader = Loader(classname="weka.core.converters.ArffLoader")
fname = data_dir + os.sep + "diabetes.arff"
print("\nLoading dataset: " + fname + "\n")
data = loader.load_file(fname)
data.class_is_last()

# determine baseline with ZeroR
zeror = Classifier(classname="weka.classifiers.rules.ZeroR")
zeror.build_classifier(data)
evl = Evaluation(data)
evl.test_model(zeror, data)
print("Baseline accuracy (ZeroR): %0.1f%%" % evl.percent_correct)

print("\nHoldout 10%...")
# use seeds 1-10 and perform a random split with 90% train
perc = []
for i in range(1, 11):
    evl = Evaluation(data)
    evl.evaluate_train_test_split(
        Classifier(classname="weka.classifiers.trees.J48"), data, 90.0, Random(i))
    perc.append(round(evl.percent_correct, 1))
    print("Accuracy with seed %i: %0.1f%%" % (i, evl.percent_correct))

# calculate mean and standard deviation
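# --- Minimal completion sketch for the step above, using only the perc list:
mean = sum(perc) / len(perc)
stdev = (sum((x - mean) ** 2 for x in perc) / len(perc)) ** 0.5
print("mean accuracy: %0.1f%%  stdev: %0.2f" % (mean, stdev))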
HAM = 0.0
l1 = [0, 0]
l2 = [0, 0]
count_spams = 0
counts_hams = 0

l = sys.argv
tec = l[1]
if tec == "1":
    cls = Classifier(classname="weka.classifiers.bayes.NaiveBayes")
elif tec == "2":
    cls = Classifier(classname="weka.classifiers.trees.J48", options=["-C", "0.25"])
else:
    cls = Classifier(classname="weka.classifiers.trees.RandomForest", options=["-I", "10"])

cls.build_classifier(data_train)
for index, inst in enumerate(data_test):
    pred = cls.classify_instance(inst)
    if index <= 29:
        if pred == SPAM:
            l1[0] += 1
        else:
            l1[1] += 1
    else:
        if pred == SPAM:
            l2[0] += 1
        else:
            l2[1] += 1
print("\nLoading dataset: " + fname + "\n") loader = Loader(classname="weka.core.converters.ArffLoader") data = loader.load_file(fname) data.set_class_index(data.num_attributes() - 1) # 1a filter data print("Filtering data...") fltr = Filter("weka.filters.unsupervised.attribute.StringToWordVector") fltr.set_inputformat(data) filtered = fltr.filter(data) filtered.set_class_index(0) # 1b build classifier print("Building/evaluating classifier...") cls = Classifier(classname="weka.classifiers.trees.J48") cls.build_classifier(filtered) evl = Evaluation(filtered) evl.test_model(cls, filtered) print(evl.to_summary()) print(str(cls)) plg.plot_dot_graph(cls.graph()) # 2. filtered classifier fname = data_dir + os.sep + "simpletext-test.arff" print("\nLoading dataset: " + fname + "\n") loader = Loader(classname="weka.core.converters.ArffLoader") test = loader.load_file(fname) test.set_class_index(test.num_attributes() - 1) print("Building/evaluating filtered classifier...") cls = FilteredClassifier() cls.set_classifier(Classifier(classname="weka.classifiers.trees.J48"))
def main(): """ Just runs some example code. """ # load a dataset iris_file = helper.get_data_dir() + os.sep + "iris.arff" helper.print_info("Loading dataset: " + iris_file) loader = Loader("weka.core.converters.ArffLoader") iris_data = loader.load_file(iris_file) iris_data.class_is_last() # classifier help helper.print_title("Creating help string") classifier = Classifier(classname="weka.classifiers.trees.J48") print(classifier.to_help()) # partial classname helper.print_title("Creating classifier from partial classname") clsname = ".J48" classifier = Classifier(classname=clsname) print(clsname + " --> " + classifier.classname) # classifier from commandline helper.print_title("Creating SMO from command-line string") cmdline = 'weka.classifiers.functions.SMO -K "weka.classifiers.functions.supportVector.NormalizedPolyKernel -E 3.0"' classifier = from_commandline(cmdline, classname="weka.classifiers.Classifier") classifier.build_classifier(iris_data) print("input: " + cmdline) print("output: " + classifier.to_commandline()) print("model:\n" + str(classifier)) # kernel classifier helper.print_title("Creating SMO as KernelClassifier") kernel = Kernel( classname="weka.classifiers.functions.supportVector.RBFKernel", options=["-G", "0.001"]) classifier = KernelClassifier(classname="weka.classifiers.functions.SMO", options=["-M"]) classifier.kernel = kernel classifier.build_classifier(iris_data) print("classifier: " + classifier.to_commandline()) print("model:\n" + str(classifier)) # build a classifier and output model helper.print_title("Training J48 classifier on iris") classifier = Classifier(classname="weka.classifiers.trees.J48") # Instead of using 'options=["-C", "0.3"]' in the constructor, we can also set the "confidenceFactor" # property of the J48 classifier itself. 
However, being of type float rather than double, we need # to convert it to the correct type first using the double_to_float function: classifier.set_property("confidenceFactor", typeconv.double_to_float(0.3)) classifier.build_classifier(iris_data) print(classifier) print(classifier.graph) print(classifier.to_source("MyJ48")) plot_graph.plot_dot_graph(classifier.graph) # evaluate model on test set helper.print_title("Evaluating J48 classifier on iris") evaluation = Evaluation(iris_data) evl = evaluation.test_model(classifier, iris_data) print(evl) print(evaluation.summary()) # evaluate model on train/test split helper.print_title("Evaluating J48 classifier on iris (random split 66%)") classifier = Classifier(classname="weka.classifiers.trees.J48", options=["-C", "0.3"]) evaluation = Evaluation(iris_data) evaluation.evaluate_train_test_split(classifier, iris_data, 66.0, Random(1)) print(evaluation.summary()) # load a dataset incrementally and build classifier incrementally helper.print_title("Build classifier incrementally on iris") helper.print_info("Loading dataset: " + iris_file) loader = Loader("weka.core.converters.ArffLoader") iris_inc = loader.load_file(iris_file, incremental=True) iris_inc.class_is_last() classifier = Classifier( classname="weka.classifiers.bayes.NaiveBayesUpdateable") classifier.build_classifier(iris_inc) for inst in loader: classifier.update_classifier(inst) print(classifier) # construct meta-classifiers helper.print_title("Meta classifiers") # generic FilteredClassifier instantiation print("generic FilteredClassifier instantiation") meta = SingleClassifierEnhancer( classname="weka.classifiers.meta.FilteredClassifier") meta.classifier = Classifier( classname="weka.classifiers.functions.LinearRegression") flter = Filter("weka.filters.unsupervised.attribute.Remove") flter.options = ["-R", "first"] meta.set_property("filter", flter.jobject) print(meta.to_commandline()) # direct FilteredClassifier instantiation print("direct FilteredClassifier instantiation") meta = FilteredClassifier() meta.classifier = Classifier( classname="weka.classifiers.functions.LinearRegression") flter = Filter("weka.filters.unsupervised.attribute.Remove") flter.options = ["-R", "first"] meta.filter = flter print(meta.to_commandline()) # generic Vote print("generic Vote instantiation") meta = MultipleClassifiersCombiner(classname="weka.classifiers.meta.Vote") classifiers = [ Classifier(classname="weka.classifiers.functions.SMO"), Classifier(classname="weka.classifiers.trees.J48") ] meta.classifiers = classifiers print(meta.to_commandline()) # cross-validate nominal classifier helper.print_title("Cross-validating NaiveBayes on diabetes") diabetes_file = helper.get_data_dir() + os.sep + "diabetes.arff" helper.print_info("Loading dataset: " + diabetes_file) loader = Loader("weka.core.converters.ArffLoader") diabetes_data = loader.load_file(diabetes_file) diabetes_data.class_is_last() classifier = Classifier(classname="weka.classifiers.bayes.NaiveBayes") pred_output = PredictionOutput( classname="weka.classifiers.evaluation.output.prediction.PlainText", options=["-distribution"]) evaluation = Evaluation(diabetes_data) evaluation.crossvalidate_model(classifier, diabetes_data, 10, Random(42), output=pred_output) print(evaluation.summary()) print(evaluation.class_details()) print(evaluation.matrix()) print("areaUnderPRC/0: " + str(evaluation.area_under_prc(0))) print("weightedAreaUnderPRC: " + str(evaluation.weighted_area_under_prc)) print("areaUnderROC/1: " + str(evaluation.area_under_roc(1))) 
print("weightedAreaUnderROC: " + str(evaluation.weighted_area_under_roc)) print("avgCost: " + str(evaluation.avg_cost)) print("totalCost: " + str(evaluation.total_cost)) print("confusionMatrix: " + str(evaluation.confusion_matrix)) print("correct: " + str(evaluation.correct)) print("pctCorrect: " + str(evaluation.percent_correct)) print("incorrect: " + str(evaluation.incorrect)) print("pctIncorrect: " + str(evaluation.percent_incorrect)) print("unclassified: " + str(evaluation.unclassified)) print("pctUnclassified: " + str(evaluation.percent_unclassified)) print("coverageOfTestCasesByPredictedRegions: " + str(evaluation.coverage_of_test_cases_by_predicted_regions)) print("sizeOfPredictedRegions: " + str(evaluation.size_of_predicted_regions)) print("falseNegativeRate: " + str(evaluation.false_negative_rate(1))) print("weightedFalseNegativeRate: " + str(evaluation.weighted_false_negative_rate)) print("numFalseNegatives: " + str(evaluation.num_false_negatives(1))) print("trueNegativeRate: " + str(evaluation.true_negative_rate(1))) print("weightedTrueNegativeRate: " + str(evaluation.weighted_true_negative_rate)) print("numTrueNegatives: " + str(evaluation.num_true_negatives(1))) print("falsePositiveRate: " + str(evaluation.false_positive_rate(1))) print("weightedFalsePositiveRate: " + str(evaluation.weighted_false_positive_rate)) print("numFalsePositives: " + str(evaluation.num_false_positives(1))) print("truePositiveRate: " + str(evaluation.true_positive_rate(1))) print("weightedTruePositiveRate: " + str(evaluation.weighted_true_positive_rate)) print("numTruePositives: " + str(evaluation.num_true_positives(1))) print("fMeasure: " + str(evaluation.f_measure(1))) print("weightedFMeasure: " + str(evaluation.weighted_f_measure)) print("unweightedMacroFmeasure: " + str(evaluation.unweighted_macro_f_measure)) print("unweightedMicroFmeasure: " + str(evaluation.unweighted_micro_f_measure)) print("precision: " + str(evaluation.precision(1))) print("weightedPrecision: " + str(evaluation.weighted_precision)) print("recall: " + str(evaluation.recall(1))) print("weightedRecall: " + str(evaluation.weighted_recall)) print("kappa: " + str(evaluation.kappa)) print("KBInformation: " + str(evaluation.kb_information)) print("KBMeanInformation: " + str(evaluation.kb_mean_information)) print("KBRelativeInformation: " + str(evaluation.kb_relative_information)) print("SFEntropyGain: " + str(evaluation.sf_entropy_gain)) print("SFMeanEntropyGain: " + str(evaluation.sf_mean_entropy_gain)) print("SFMeanPriorEntropy: " + str(evaluation.sf_mean_prior_entropy)) print("SFMeanSchemeEntropy: " + str(evaluation.sf_mean_scheme_entropy)) print("matthewsCorrelationCoefficient: " + str(evaluation.matthews_correlation_coefficient(1))) print("weightedMatthewsCorrelation: " + str(evaluation.weighted_matthews_correlation)) print("class priors: " + str(evaluation.class_priors)) print("numInstances: " + str(evaluation.num_instances)) print("meanAbsoluteError: " + str(evaluation.mean_absolute_error)) print("meanPriorAbsoluteError: " + str(evaluation.mean_prior_absolute_error)) print("relativeAbsoluteError: " + str(evaluation.relative_absolute_error)) print("rootMeanSquaredError: " + str(evaluation.root_mean_squared_error)) print("rootMeanPriorSquaredError: " + str(evaluation.root_mean_prior_squared_error)) print("rootRelativeSquaredError: " + str(evaluation.root_relative_squared_error)) print("prediction output:\n" + str(pred_output)) plot_cls.plot_roc(evaluation, title="ROC diabetes", class_index=range( 0, 
diabetes_data.class_attribute.num_values), wait=False) plot_cls.plot_prc(evaluation, title="PRC diabetes", class_index=range( 0, diabetes_data.class_attribute.num_values), wait=False) # train 2nd classifier on diabetes dataset classifier2 = Classifier(classname="weka.classifiers.trees.RandomForest") evaluation2 = Evaluation(diabetes_data) evaluation2.crossvalidate_model(classifier2, diabetes_data, 10, Random(42)) plot_cls.plot_rocs({ "NB": evaluation, "RF": evaluation2 }, title="ROC diabetes", class_index=0, wait=False) plot_cls.plot_prcs({ "NB": evaluation, "RF": evaluation2 }, title="PRC diabetes", class_index=0, wait=False) # load a numeric dataset bolts_file = helper.get_data_dir() + os.sep + "bolts.arff" helper.print_info("Loading dataset: " + bolts_file) loader = Loader("weka.core.converters.ArffLoader") bolts_data = loader.load_file(bolts_file) bolts_data.class_is_last() # build a classifier and output model helper.print_title("Training LinearRegression on bolts") classifier = Classifier( classname="weka.classifiers.functions.LinearRegression", options=["-S", "1", "-C"]) classifier.build_classifier(bolts_data) print(classifier) # cross-validate numeric classifier helper.print_title("Cross-validating LinearRegression on bolts") classifier = Classifier( classname="weka.classifiers.functions.LinearRegression", options=["-S", "1", "-C"]) evaluation = Evaluation(bolts_data) evaluation.crossvalidate_model(classifier, bolts_data, 10, Random(42)) print(evaluation.summary()) print("correlationCoefficient: " + str(evaluation.correlation_coefficient)) print("errorRate: " + str(evaluation.error_rate)) helper.print_title("Header - bolts") print(str(evaluation.header)) helper.print_title("Predictions on bolts") for index, pred in enumerate(evaluation.predictions): print( str(index + 1) + ": " + str(pred) + " -> error=" + str(pred.error)) plot_cls.plot_classifier_errors(evaluation.predictions, wait=False) # train 2nd classifier and show errors in same plot classifier2 = Classifier(classname="weka.classifiers.functions.SMOreg") evaluation2 = Evaluation(bolts_data) evaluation2.crossvalidate_model(classifier2, bolts_data, 10, Random(42)) plot_cls.plot_classifier_errors( { "LR": evaluation.predictions, "SMOreg": evaluation2.predictions }, wait=False) # learning curve cls = [ Classifier(classname="weka.classifiers.trees.J48"), Classifier(classname="weka.classifiers.bayes.NaiveBayesUpdateable") ] plot_cls.plot_learning_curve(cls, diabetes_data, increments=0.05, label_template="[#] !", metric="percent_correct", wait=True) # access classifier's Java API labor_file = helper.get_data_dir() + os.sep + "labor.arff" helper.print_info("Loading dataset: " + labor_file) loader = Loader("weka.core.converters.ArffLoader") labor_data = loader.load_file(labor_file) labor_data.class_is_last() helper.print_title("Using JRip's Java API to access rules") jrip = Classifier(classname="weka.classifiers.rules.JRip") jrip.build_classifier(labor_data) rset = jrip.jwrapper.getRuleset() for i in range(rset.size()): r = rset.get(i) print(str(r.toString(labor_data.class_attribute.jobject)))
for train_index, test_index in sss:
    print("Iter", itr, end=' ')
    X_train, X_test = X[train_index], X[test_index]
    X_test[:, -1] = classes[0]  # mask the true test labels
    y_test = Y[test_index]
    write_to_weka('train.arff', 'training_data', data.columns, X_train, classes)
    write_to_weka('test.arff', 'testing_data', data.columns, X_test, classes)

    loader = Loader(classname="weka.core.converters.ArffLoader")
    trdata = loader.load_file("train.arff")
    trdata.class_is_last()

    classifier = Classifier(classname="weka.classifiers.lazy.IBk")
    classifier.options = ["-K", "10", "-W", "0", "-I", "-A",
                          "weka.core.neighboursearch.LinearNNSearch -A \"weka.core.ManhattanDistance -R first-last\""]
    classifier.build_classifier(trdata)

    tedata = loader.load_file("test.arff")
    tedata.class_is_last()
    for index, inst in enumerate(tedata):
        result = classifier.classify_instance(inst)
        Ypred[test_index[index]] = classes[int(result)]

    accuracy = float(np.sum(y_test == Ypred[test_index])) / float(y_test.shape[0])
    print(" => Accuracy =", accuracy)
    itr += 1

accuracy = float(np.sum(Y == Ypred)) / float(Y.shape[0])
print("Total accuracy =", accuracy)
os.remove('train.arff')
def logit_boost(train_data): train_data.class_is_last() cls = Classifier(classname="weka.classifiers.meta.LogitBoost") cls.build_classifier(train_data) return cls
class Weka(object):

    data = None
    dataDir = None
    classifier = None

    def __init__(self, dataDir='.'):
        self.dataDir = dataDir
        jvm.start()

    # Initialize data with the contents of an ARFF file
    def initData(self, arrfFile):
        loader = Loader(classname="weka.core.converters.ArffLoader")
        print(self.dataDir + '/' + arrfFile)
        self.data = loader.load_file(self.dataDir + '/' + arrfFile)
        self.data.class_is_last()
        print('Loading file ' + self.dataDir + '/' + arrfFile)
        # print(data)

    # Train the classifier
    def trainData(self, arrfFile=None, classname="weka.classifiers.trees.J48", options=["-C", "0.3"]):
        if arrfFile is not None:
            self.initData(arrfFile)
        if self.data is None:
            return
        print('Building classifier ' + str(classname) + ' ' + ' '.join(options))
        self.classifier = Classifier(classname=classname, options=options)
        self.classifier.build_classifier(self.data)

    # Classify the instances of an ARFF file
    def classify(self, predictFile):
        if self.data is None or self.classifier is None:
            return [-1]
        loader = Loader(classname="weka.core.converters.ArffLoader")
        predict_data = loader.load_file(self.dataDir + '/' + predictFile)
        predict_data.class_is_last()
        # Parse the class labels out of the attribute's string representation (fragile)
        values = str(predict_data.class_attribute)[19:-1].split(',')
        classes = []
        for index, inst in enumerate(predict_data):
            # pred = self.classifier.classify_instance(inst)
            prediction = self.classifier.distribution_for_instance(inst)
            cl = int(values[prediction.argmax()][7:])
            print('Class: ' + str(cl))
            classes.append(cl)
        return classes

    # Run a cross-validation and print the results to standard output
    def crossValidate(self, arrfFile=None, classname="weka.classifiers.trees.J48", options=["-C", "0.3"]):
        if arrfFile is not None:
            self.initData(arrfFile)
        if self.data is None:
            return
        print('Classifier ' + str(classname) + ' ' + ' '.join(options))
        cls = Classifier(classname=classname, options=options)
        evl = Evaluation(self.data)
        evl.crossvalidate_model(cls, self.data, 10, Random(1))
        print(evl.percent_correct)
        print(evl.summary())
        print(evl.class_details())
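# --- Usage sketch for the wrapper above (file names are hypothetical):
w = Weka(dataDir='data')
w.trainData('train.arff')                   # builds J48 with the default options
predictions = w.classify('unlabeled.arff')  # distribution-based class per instance
w.crossValidate('train.arff')               # 10-fold CV report on stdout
jvm.stop()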
def rotation_forest(train_data): train_data.class_is_last() cls = Classifier(classname="weka.classifiers.meta.RotationForest") cls.build_classifier(train_data) return cls
jvm.logger.setLevel(jvm.logging.WARNING)
jvm.start(packages=True, max_heap_size="512m")

# Each instance has a nominal class and numeric attributes
loader = Loader(classname="weka.core.converters.ArffLoader")
trainData = loader.load_file('segment-challenge.arff')
trainData.class_is_last()
testData = loader.load_file('segment-test.arff')
testData.class_is_last()

# Default C4.5 tree
classifier = Classifier(classname="weka.classifiers.trees.J48")
# Build the classifier on the training data (no parameter search is done here;
# see the CVParameterSelection sketch below for an actual search)
classifier.build_classifier(trainData)

print("\n\n=========== Classifier information ================\n\n")
print(classifier.options)
print(classifier)

print("\n\n=========== Train results ================\n\n")
evaluation = Evaluation(trainData)
evaluation.test_model(classifier, trainData)
print(classifier.to_commandline())
print(evaluation.matrix())
print("Train recognition: %0.2f%%" % evaluation.percent_correct)

print("\n\n=========== Test results ================\n\n")
evaluation = Evaluation(testData)
evaluation.test_model(classifier, testData)
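# --- Sketch: if an actual parameter search is wanted, Weka's
# CVParameterSelection meta-classifier can sweep an option via internal CV.
# The range "C 0.1 0.5 5" (J48's confidence factor in 5 steps) is an assumption.
search = Classifier(classname="weka.classifiers.meta.CVParameterSelection",
                    options=["-P", "C 0.1 0.5 5", "-S", "1",
                             "-W", "weka.classifiers.trees.J48"])
search.build_classifier(trainData)
print(search)  # the model output includes the selected option values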
def flda(train_data): train_data.class_is_last() cls = Classifier(classname="weka.classifiers.functions.FLDA") cls.build_classifier(train_data) return cls
class WekaWrapper:

    def __init__(self, questionID, algorithm, classifier, parameters, modelParams, optimizer, predict=0):
        self.questionID = questionID
        self.algorithm = algorithm
        self.classifier = classifier
        self.parameters = parameters
        self.modelParams = modelParams
        self.api = nemoApi()
        self.config = nemoConfig()
        self.optimizer = optimizer
        self.predict = predict
        self.prediction = None

    def retrieveData(self, id, dataset):
        query = self.api.getDataQuery(id, dataset)
        iquery = InstanceQuery()
        iquery.db_url = "jdbc:mysql://" + self.config.HOST + ":" + str(self.config.PORT) + "/" + self.config.DB
        iquery.user = self.config.USER
        iquery.password = self.config.PASS
        iquery.query = query
        data = iquery.retrieve_instances()
        data.class_is_last()
        return data

    def uploadData(self):
        # Upload the model to the database
        self.api.addModel(self.questionID, '?', self.acc, self.model, self.algorithm, False, self.matrix, self.optimizer)
        info = self.api.fetchQuestionInfo(self.questionID)
        modelID = info['ID']
        for mParam in self.modelParams:
            mParam.AIModel = modelID
            self.api.addAIModelParam(mParam)

    def uploadPrediction(self):
        # Upload the best classifier's prediction to the database
        if self.prediction is not None:
            # Convert prediction to string
            predStr = 'No prediction'
            if self.prediction == 1.0:
                predStr = "True"
            elif self.prediction == 0.0:
                predStr = "False"
            print('Writing ' + predStr)
            self.api.updatePrediction(self.questionID, predStr)

    def addInstancesToDataset(self, source, dest):
        # Align the instances of a source dataset to the destination's header
        # and add them to the destination dataset
        i = 0
        while i < source.num_instances:
            values = source.get_instance(i).values
            it = np.nditer(values, flags=['f_index'], op_flags=['readwrite'])
            while not it.finished:
                if source.attribute(it.index).is_nominal:
                    stringVal = source.get_instance(i).get_string_value(it.index)
                    # print(stringVal)
                    if stringVal != '?':
                        values[it.index] = dest.attribute(it.index).values.index(stringVal)
                it.iternext()
            dest.add_instance(Instance.create_instance(values))
            i = i + 1

    def buildPatientObject(self):
        # Build a patient to classify
        patient = self.api.fetchPatientJSON(self.questionID)
        if patient is not None:
            newPatient = {}
            demographics = ['race_cd', 'sex_cd', 'age_in_years_num']
            observation_fact_features = ['tval_char', 'nval_num']
            for demo in demographics:
                if demo not in patient:
                    print("Patient definition missing " + demo + ".")
                    newPatient[demo] = float('nan')
                else:
                    if patient[demo] is not None and patient[demo] != '':
                        newPatient[demo] = patient[demo]
                    else:
                        print("Demographic " + demo + " for patient is empty")
                        newPatient[demo] = float('nan')
            for obs in patient['observation_facts']:
                concept_cd = obs['concept_cd']
                for feat in observation_fact_features:
                    if feat in obs:
                        if obs[feat] is not None:
                            newPatient[(concept_cd + feat)] = obs[feat]
                        else:
                            newPatient[(concept_cd + feat)] = float('nan')
                    else:
                        print("Feature " + concept_cd + feat + " missing from patient definition, marking it None")
                        newPatient[(concept_cd + feat)] = float('nan')
            return newPatient
        else:
            return None

    def addPatientNominals(self, patient, dataset):
        # Add the patient's nominal values to the master header, in case they
        # aren't already there; newDataset becomes the new master header.
        # The prediction patient should look like {sex_cd: "m", ...}
        ignoreAttributes = ['readmitted']
        atts = []
        for a in dataset.attributes():
            if (not a.is_nominal) or (a.name in ignoreAttributes):
                atts.append(a)
            else:
                newValues = list(a.values)
                # print(a.name)
                pvalue = patient[a.name]
                if pvalue not in newValues:
                    newValues.append(pvalue)
                atts.append(Attribute.create_nominal(a.name, newValues))
        newDataset = Instances.create_instances("Dataset", atts, 0)
        newDataset.class_is_last()
        return newDataset

    def addNominals(self, dataset):
        # Add a placeholder nominal value to every nominal column, in case a column has none
        ignoreAttributes = ['readmitted']
        atts = []
        for a in dataset.attributes():
            if (not a.is_nominal) or (a.name in ignoreAttributes):
                atts.append(a)
            else:
                newValues = list(a.values)
                pvalue = 'DefaultNominal'
                if pvalue not in newValues:
                    newValues.append(pvalue)
                atts.append(Attribute.create_nominal(a.name, newValues))
        newDataset = Instances.create_instances("Dataset", atts, 0)
        newDataset.class_is_last()
        return newDataset

    def createPatientInstance(self, patient, dataset):
        # Create a patient instance to classify
        ignoreAttributes = ['readmitted']
        values = []
        for a in dataset.attributes():
            if not a.is_nominal:
                values.append(patient[a.name])
            elif a.name in ignoreAttributes:
                values.append(0)
            else:
                values.append(a.values.index(patient[a.name]))
        # print(values)
        newInst = Instance.create_instance(values)
        return newInst

    def run(self):
        # Attach JVM
        javabridge.attach()
        # Debug
        print("Classifier")
        print(self.classifier)
        print("Params")
        print(self.parameters)
        print("Model Params")
        print(self.modelParams)
        # Get data for testing and learning
        learnerData = self.retrieveData(self.questionID, "learner")
        testData = self.retrieveData(self.questionID, 'test')
        masterData = self.retrieveData(self.questionID, 'all')
        masterData = self.addNominals(masterData)
        # Check if there is enough correct data to run
        if learnerData.num_instances < 1 or testData.num_instances < 1:
            self.status = self.config.NOT_ENOUGH_DATA
            return False
        # If this is a prediction and there is a valid patient, change the masterData header
        patientObj = self.buildPatientObject()
        patientInstance = None
        if (patientObj is not None) and (self.predict == 1):
            masterData = self.addPatientNominals(patientObj, masterData)
            patientInstance = self.createPatientInstance(patientObj, masterData)
            masterData.add_instance(patientInstance)
        elif (patientObj is None) and (self.predict == 1):
            print('No patient defined for prediction. Exiting')
            return True
        # Fix the dataset headers to match, and fix instances to match the headers
        masterData.delete()
        learner = masterData.copy_instances(masterData, 0, 0)
        test = masterData.copy_instances(masterData, 0, 0)
        self.addInstancesToDataset(learnerData, learner)
        self.addInstancesToDataset(testData, test)
        # Comparison of data for testing purposes
        # print(learnerData); print(learner)
        # print(testData); print(test)
        # pdb.set_trace()
        # Instantiate classifier
        self.cls = Classifier(classname=self.classifier, options=self.parameters)
        # Run classifier
        self.cls.build_classifier(learner)
        # for index, inst in enumerate(learnerData):
        #     prediction = self.cls.classify_instance(inst)
        #     distribution = self.cls.distribution_for_instance(inst)
        # Test classifier
        evl = Evaluation(learner)
        evl.test_model(self.cls, test)
        # Store information about the evaluation
        self.acc = evl.percent_correct
        self.val = None
        # Convert the numpy confusion matrix into a simple nested list
        confusionMatrix = []
        confusionMatrix.append([evl.confusion_matrix[0][0], evl.confusion_matrix[0][1]])
        confusionMatrix.append([evl.confusion_matrix[1][0], evl.confusion_matrix[1][1]])
        # Convert the matrix into json format
        self.matrix = json.dumps(confusionMatrix)
        # print('Classifier: ', self.classifier)
        # print('ID: ', self.questionID)
        # print('ACC: ', self.acc)
        # print(evl.summary())
        # If this is a prediction... make the prediction
        if (patientObj is not None) and (self.predict == 1):
            masterData.add_instance(patientInstance)
            print("Running prediction on patient:")
            print(masterData.get_instance(0))
            self.prediction = self.cls.classify_instance(masterData.get_instance(0))
            # self.uploadPrediction()
        # Temporarily store a file to serialize the model to
        fileName = str(self.questionID) + self.algorithm + ".model"
        serialization.write(fileName, self.cls)
        # Open that file and store its bytes
        self.model = None
        with open(fileName, 'rb') as f:
            self.model = f.read()
        # Remove the temporary file
        os.remove(fileName)
        # Set status to awaiting feedback
        self.status = self.config.AWAITING_FEEDBACK_STATUS
        return True
def rep_tree(train_data): train_data.class_is_last() cls = Classifier(classname="weka.classifiers.trees.REPTree") cls.build_classifier(train_data) return cls
if data_dir is None:
    data_dir = "." + os.sep + "data"

import os
import weka.core.jvm as jvm
from weka.core.converters import Loader
from weka.core.classes import Random
from weka.classifiers import Classifier, Evaluation
from weka.filters import Filter

jvm.start()

# load weather.nominal
loader = Loader(classname="weka.core.converters.ArffLoader")
fname = data_dir + os.sep + "weather.nominal.arff"
print("\nLoading dataset: " + fname + "\n")
data = loader.load_file(fname)
data.class_is_last()

# perform 10-fold cross-validation
cls = Classifier(classname="weka.classifiers.rules.OneR")
evl = Evaluation(data)
evl.crossvalidate_model(cls, data, 10, Random(1))
print("10-fold cross-validation:\n" + evl.summary())

# build model on full dataset and output it
cls.build_classifier(data)
print("Model:\n\n" + str(cls))

jvm.stop()
def mlpc_10(train_data): train_data.class_is_last() cls = Classifier(classname="weka.classifiers.functions.MLPClassifier", options=["-N", "10"]) cls.build_classifier(train_data) return cls
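# --- Harness sketch (not from the original source): the factory functions
# above all build a classifier on the full training data; this evaluates any
# of them with 10-fold cross-validation instead. Assumes Evaluation and Random
# are imported as in the other snippets.
def evaluate(factory, train_data, folds=10):
    cls = factory(train_data)
    evl = Evaluation(train_data)
    evl.crossvalidate_model(cls, train_data, folds, Random(1))
    return evl.percent_correct

# e.g.: print(evaluate(mlpc_10, train_data))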