def PredecirUnaTemporada(path):
    jvm.start()
    insta = CrearInstanciaParaPredecir(path)
    # Build a single-instance ARFF file: the stored header plus the new instance.
    with open('ModelData/wekaHeader.arff', 'r') as f:
        atributos = f.readlines()
    with open('ModelData/predictionFiles/inst.arff', 'w') as f:
        f.writelines(atributos)
        f.write("\n" + insta + '\n')
    # Deserialize the trained model and classify the instance.
    objects = serialization.read_all("ModelData/77PercentModelPaisajes.model")
    classifier = Classifier(jobject=objects[0])
    loader = Loader()
    data = loader.load_file("ModelData/predictionFiles/inst.arff")
    data.class_is_last()
    clases = ["invierno", "verano", "otono", "primavera"]
    prediccion = ""
    for inst in data:
        pred = classifier.classify_instance(inst)
        prediccion = clases[int(pred)]
    jvm.stop()
    return prediccion
def try_params(n_instances, params, train, test, istest):
    n_instances = int(round(n_instances))
    # print "n_instances:", n_instances
    pprint(params)
    L = []
    if params['unpruned'] == True:
        L.append("-N")  # -N: use unpruned tree/rules
    L.append("-M")
    L.append(str(params['min_inst']))  # -M: minimum number of instances per leaf
    if params['unsmoothed'] == True:
        L.append("-U")  # -U: use unsmoothed predictions
    if params['regression'] == True:
        L.append("-R")  # -R: build regression tree/rules rather than model tree/rules
    clf = Classifier(classname="weka.classifiers.rules.M5Rules", options=L)
    if istest:
        result = test_weka_classifier(clf, train, test)
    else:
        result = train_and_eval_weka_classifier(clf, train, n_instances)
    return result
def autoweka(data, duration, metric, nb_folds):
    # Weka expects all option values as strings, so cast the numeric arguments.
    classifier = Classifier(
        classname="weka.classifiers.meta.AutoWEKAClassifier",
        options=["-x", str(nb_folds), "-timeLimit", str(duration), "-metric", metric])
    # classname="weka.classifiers.functions.Logistic", options=["-R", "1.0E-2"]
    classifier.build_classifier(data)
    print(classifier)
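# A minimal usage sketch for autoweka() above, assuming a started JVM and an
# ARFF file at data/iris.arff (the path and budget values here are illustrative
# assumptions, not part of the original code).
def demo_autoweka():
    loader = Loader(classname="weka.core.converters.ArffLoader")
    data = loader.load_file("data/iris.arff")
    data.class_is_last()
    autoweka(data, duration=5, metric="errorRate", nb_folds=10)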
def try_params(n_instances, params, train, valid, test, istest):
    n_instances = int(round(n_instances))
    pprint(params)
    L = []
    # L.append("-N")
    # L.append(str(params['minNo']))
    L.append("-O")
    L.append(str(params['optimizations']))  # -O: number of optimization runs
    if params['checkerror'] == False:
        L.append("-E")  # -E: do not check that error rate >= 0.5 in the stopping criterion
    if params['pruning'] == False:
        L.append("-P")  # -P: do not use pruning
    clf = Classifier(classname="weka.classifiers.rules.JRip", options=L)
    if istest:
        result = test_weka_classifier(clf, train, test)
    else:
        result = train_and_eval_weka_classifier(clf, train, valid, n_instances)
    return result
def try_params(n_instances, params, train, valid, test, istest):
    n_instances = int(round(n_instances))
    pprint(params)
    # data = load(directory)
    L = []
    L.append("-I")
    L.append(str(params['numInterations']))  # -I: number of iterations (trees)
    L.append("-K")
    L.append(str(params['numattr']))  # -K: number of attributes to randomly investigate
    L.append("-depth")
    L.append(str(params['depth']))  # -depth: maximum tree depth (0 = unlimited)
    clf = Classifier(classname="weka.classifiers.trees.RandomForest", options=L)
    if istest:
        result = test_weka_classifier(clf, train, test)
    else:
        result = train_and_eval_weka_classifier(clf, train, valid, n_instances)
    return result
def main(args):
    """
    Loads a dataset, shuffles it, splits it into train/test set. Trains J48 with training set and
    evaluates the built model on the test set.
    :param args: the commandline arguments (optional, can be dataset filename)
    :type args: list
    """
    # load a dataset
    if len(args) <= 1:
        data_file = helper.get_data_dir() + os.sep + "vote.arff"
    else:
        data_file = args[1]
    helper.print_info("Loading dataset: " + data_file)
    loader = Loader(classname="weka.core.converters.ArffLoader")
    data = loader.load_file(data_file)
    data.class_is_last()

    # generate train/test split of randomized data
    train, test = data.train_test_split(66.0, Random(1))

    # build classifier
    cls = Classifier(classname="weka.classifiers.trees.J48")
    cls.build_classifier(train)
    print(cls)

    # evaluate
    evl = Evaluation(train)
    evl.test_model(cls, test)
    print(evl.summary())
def try_params(n_instances, params, base, train, valid, test, istest):
    n_instances = int(round(n_instances))
    pprint(params)
    L = []
    if params['missingSeparate'] == True:
        L.append("-M")  # -M: treat missing values as a separate value
    if params['locallyPredictive'] == False:
        L.append("-L")  # -L: do not include locally predictive attributes
    if params['search'] == 'GreedyStepwise':
        param_search = gs.get_params()
        search = gs.get_search(param_search)
    else:
        param_search = bf.get_params()
        search = bf.get_search(param_search)
    # search = ASSearch(classname="weka.attributeSelection."+params['search'])
    evaluator = ASEvaluation(classname="weka.attributeSelection.CfsSubsetEval", options=L)
    clf = Classifier(classname="weka.classifiers.meta.AttributeSelectedClassifier")
    clf.set_property("evaluator", evaluator.jobject)
    clf.set_property("search", search.jobject)
    clf.set_property("base", base.jobject)
    if istest:
        result = test_weka_classifier(clf, train, test)
    else:
        result = train_and_eval_weka_classifier(clf, train, valid, n_instances)
    return result
def Boost_J48(data, rnm):
    data.class_is_last()
    fc1 = FilteredClassifier()
    fc1.classifier = Classifier(classname="weka.classifiers.trees.J48", options=["-C", "0.25", "-M", "2"])
    fc1.filter = Filter(classname="weka.filters.unsupervised.attribute.Remove", options=["-R", "first"])
    fc2 = SingleClassifierEnhancer(classname="weka.classifiers.meta.AdaBoostM1",
                                   options=["-P", "100", "-S", "1", "-I", "10"])
    fc2.classifier = fc1
    pred_output = PredictionOutput(classname="weka.classifiers.evaluation.output.prediction.CSV", options=["-p", "1"])
    folds = 10
    fc2.build_classifier(data)
    evaluation = Evaluation(data)
    evaluation.crossvalidate_model(fc2, data, folds, Random(1), pred_output)
    with open(rnm + '_Boost_J48_Tree.txt', 'w') as f0:
        print("Filename: ", rnm, file=f0)
        print('\n\n', file=f0)
        print(str(fc2), file=f0)
    with open(rnm + '_Boost_J48_Prediction.txt', 'w') as f1:
        print('Filename:', rnm, file=f1)
        print('Prediction Summary:', pred_output.buffer_content(), file=f1)
    with open(rnm + '_Boost_J48_Evaluation.txt', 'w') as f2:
        print('Filename:', rnm, file=f2)
        print('Evaluation Summary:', evaluation.summary(), file=f2)
        print('\n\n\n', file=f2)
        print(evaluation.class_details(), file=f2)
    plot_roc(evaluation, class_index=[0, 1], title=rnm, key_loc='best',
             outfile=rnm + '_Boost_J48_ROC.png', wait=False)
    value_Boost_J48 = str(evaluation.percent_correct)
    return value_Boost_J48
def get_class(params):
    # pprint(params)
    L = []
    if params['binary'] == True:
        L.append("-B")  # -B: binary splits on nominal attributes
    if params['residuals'] == True:
        L.append("-R")  # -R: split on residuals
    if params['crossValidated'] == True:
        L.append("-C")  # -C: cross-validate the number of LogitBoost iterations
    if params['probabilities'] == True:
        L.append("-P")  # -P: error on probabilities instead of misclassification error
    L.append("-M")
    L.append(str(params['min_inst']))  # -M: minimum number of instances per node
    if params['weighting'] != 0 and params['probabilities'] == False:
        L.append("-W")
        L.append(str(params['weighting']))  # -W: beta for weight trimming
    if params['aic'] == True:
        L.append("-A")  # -A is a flag: use AIC to determine when to stop LogitBoost
    clf = Classifier(classname="weka.classifiers.trees.LMT", options=L)
    return clf
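# Illustrative call of the LMT get_class() above; the parameter values are
# assumptions showing the expected dictionary keys, not tuned settings.
def demo_lmt():
    params = {
        'binary': False, 'residuals': False, 'crossValidated': True,
        'probabilities': False, 'min_inst': 15, 'weighting': 0, 'aic': False,
    }
    return get_class(params)  # -> LMT with options ["-C", "-M", "15"]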
def retrain(self, examples, labels):
    # Write the training set out as an ARFF file with numeric features and a
    # binary TRUE/FALSE class.
    f = open("trainingweka.arff", "w")
    f.write("@relation randomset\n")
    for j in range(len(examples[0])):
        f.write("@attribute feature%d real\n" % j)
    f.write("@attribute class {TRUE, FALSE}\n")
    f.write("@data\n")
    for (example, label) in zip(examples, labels):
        for feature in example:
            f.write("%f," % feature)
        if label == 1:
            f.write("TRUE\n")
        else:
            f.write("FALSE\n")
    f.close()
    loader = Loader(classname="weka.core.converters.ArffLoader")
    # options=["-H", "-B", "10000"])
    self.trainingData = loader.load_file("trainingweka.arff")
    self.trainingData.class_index = self.trainingData.num_attributes - 1
    self.classifier = Classifier(
        classname="weka.classifiers.functions.Logistic",
        options=["-R", "%f" % (1.0 / self.C)])  # -R: ridge parameter
    self.classifier.build_classifier(self.trainingData)
def RandomTree(data, rnm):
    data.class_is_last()
    fc = FilteredClassifier()
    fc.classifier = Classifier(classname="weka.classifiers.trees.RandomTree",
                               options=["-K", "0", "-M", "1.0", "-V", "0.001", "-S", "1"])
    fc.filter = Filter(classname="weka.filters.unsupervised.attribute.Remove", options=["-R", "first"])
    pred_output = PredictionOutput(classname="weka.classifiers.evaluation.output.prediction.CSV", options=["-p", "1"])
    folds = 10
    evl = Evaluation(data)
    evl.crossvalidate_model(fc, data, folds, Random(1), pred_output)
    fc.build_classifier(data)
    with open(rnm + '_RT_Tree.txt', 'w') as f0:
        print("Filename: ", rnm, file=f0)
        print('\n\n', file=f0)
        print(str(fc), file=f0)
    with open(rnm + '_RT_Prediction.txt', 'w') as f1:
        print('Filename:', rnm, file=f1)
        print('Prediction Summary:', pred_output.buffer_content(), file=f1)
    with open(rnm + '_RT_Evaluation.txt', 'w') as f2:
        print('Filename:', rnm, file=f2)
        print('Evaluation Summary:', evl.summary(), file=f2)
        print('\n\n\n', file=f2)
        print(evl.class_details(), file=f2)
    plot_roc(evl, class_index=[0, 1], title=rnm, key_loc='best',
             outfile=rnm + '_RT_ROC.png', wait=False)
    value_RT = str(evl.percent_correct)
    return value_RT
def load_model(filename):
    """
    Load the model from cache.

    Args:
        filename(str): The target file name (without extension) to load. Example: LMT

    Returns:
        The classifier and data object if the target caching is saved, otherwise None.
    """
    # Path to the cached model (example: caches/model/LMT.cache)
    path = os.path.join(os.path.join('caches', 'model'), filename + '.cache')
    print("Path to the cached model to load:", path)
    if os.path.isfile(path):
        cached_model, cached_data_used_for_training = serialization.read_all(path)
        print("Loading cached classifier")
        trained_classifier = Classifier(jobject=cached_model)
        print("Loading cached data")
        training_data = Instances(jobject=cached_data_used_for_training)
        localizer_log.msg("Loaded model: {filename}".format(filename=filename))
        return [trained_classifier, training_data]
    localizer_log.msg("Failed to load cache of 'model'.")
    return None
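# Hypothetical counterpart to load_model(): writes a classifier and the data it
# was trained on into the same cache layout. The function name is an assumption;
# it is not part of the original module.
def save_model(filename, trained_classifier, training_data):
    path = os.path.join(os.path.join('caches', 'model'), filename + '.cache')
    print("Path to the cached model to save:", path)
    serialization.write_all(path, [trained_classifier, training_data])
    localizer_log.msg("Saved model: {filename}".format(filename=filename))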
def create_model(input_file, output_file):
    # Load data
    data = converters.load_any_file(input_file)
    data.class_is_last()  # set class attribute

    # filter data
    print_title("Filtering Data")
    discretize = Filter(
        classname="weka.filters.unsupervised.attribute.Discretize",
        options=["-B", "10", "-M", "-1.0", "-R", "first-last"])
    discretize.inputformat(data)  # let the filter know about the type of data to filter
    filtered_data = discretize.filter(data)
    print("Done! (believe it or not)")

    print_title("Build Classifier")
    classifier = Classifier(
        classname="weka.classifiers.trees.RandomForest",
        options=["-I", "100", "-K", "0", "-S", "1"])
    classifier.build_classifier(filtered_data)
    print("Done! (believe it or not)")
    serialization.write_all(output_file, [classifier, discretize])
    print("Model and filter saved to ", output_file)

    evaluation = Evaluation(data)  # initialize with priors
    evaluation.crossvalidate_model(classifier, filtered_data, 10, Random(42))  # 10-fold CV
    print(evaluation.summary())
    print("pctCorrect: " + str(evaluation.percent_correct))
    print("incorrect: " + str(evaluation.incorrect))
def load_classifier(lang, tag):
    # Map (language, POS tag) to the corresponding serialized model file.
    model_files = {
        (LANG_ID, "nnp"): ID_MODEL_NNP,
        (LANG_ID, "nn"): ID_MODEL_NN,
        (LANG_ID, "cdp"): ID_MODEL_CDP,
        (LANG_EN, "nnp"): EN_MODEL_NNP,
        (LANG_EN, "jj"): EN_MODEL_JJ,
        (LANG_EN, "nn"): EN_MODEL_NN,
        (LANG_EN, "vbp"): EN_MODEL_VBP,
        (LANG_EN, "cd"): EN_MODEL_CD,
        (LANG_EN, "vb"): EN_MODEL_VB,
    }
    objects = serialization.read_all(model_files[(lang, tag)])
    classifier = {}
    classifier['classifier'] = Classifier(jobject=objects[0])
    classifier['filter'] = Filter(jobject=objects[1])
    return classifier
def try_params(n_instances, params, base, train, valid, test, istest):
    n_instances = int(round(n_instances))
    # print "n_instances:", n_instances
    pprint(params)
    L = []
    if params['missingMerge'] == False:
        L.append("-M")  # -M: treat missing values as a separate value
    if params['binarizeNumericAttributes'] == True:
        L.append("-B")  # -B: binarize numeric attributes instead of discretizing
    # print L
    search = ASSearch(classname="weka.attributeSelection.Ranker")
    evaluator = ASEvaluation(classname="weka.attributeSelection.InfoGainAttributeEval", options=L)
    clf = Classifier(classname="weka.classifiers.meta.AttributeSelectedClassifier")
    clf.set_property("evaluator", evaluator.jobject)
    clf.set_property("search", search.jobject)
    clf.set_property("base", base.jobject)
    if istest:
        result = test_weka_classifier(clf, train, test)
    else:
        result = train_and_eval_weka_classifier(clf, train, valid, n_instances)
    return result
def get_class(params):
    # pprint(params)
    L = []
    if params['unpruned'] == True:
        L.append("-U")  # -U: use unpruned tree
    if params['collapseTree'] == True:
        L.append("-O")  # -O: do not collapse the tree
    if params['unpruned'] == False:
        L.append("-C")
        L.append(str(params['confidenceFactor']))  # -C: pruning confidence threshold
    L.append("-M")
    L.append(str(params['minNumObj']))  # -M: minimum number of instances per leaf
    if params['binarySplits'] == True:
        L.append("-B")  # -B: binary splits on nominal attributes
    if params['subtreeRaising'] == True and params['unpruned'] == False:
        L.append("-S")  # -S: do not perform subtree raising
    if params['useLaplace'] == True:
        L.append("-A")  # -A: Laplace smoothing for predicted probabilities
    if params['useMDL'] == False:
        L.append("-J")  # -J: do not use MDL correction for info gain on numeric attributes
    clf = Classifier(classname="weka.classifiers.trees.J48", options=L)
    return clf
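# Illustrative call of the J48 get_class() above; the values are assumptions
# reproducing Weka's default pruned-tree configuration, not tuned settings.
def demo_j48():
    params = {
        'unpruned': False, 'collapseTree': False, 'confidenceFactor': 0.25,
        'minNumObj': 2, 'binarySplits': False, 'subtreeRaising': False,
        'useLaplace': False, 'useMDL': True,
    }
    return get_class(params)  # -> J48 with options ["-C", "0.25", "-M", "2"]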
def try_params(n_instances, params, train, test, istest):
    n_instances = int(round(n_instances))
    # print "n_instances:", n_instances
    pprint(params)
    L = []
    L.append("-B")
    L.append(str(params['blend']))  # -B: global blending percentage
    if params['entropic'] == True:
        L.append("-E")  # -E: entropic auto-blend
    L.append("-M")
    L.append(str(params['missing']))  # -M: missing-value treatment (a, d, m or n)
    clf = Classifier(classname="weka.classifiers.lazy.KStar", options=L)
    if istest:
        result = test_weka_classifier(clf, train, test)
    else:
        result = train_and_eval_weka_classifier(clf, train, n_instances)
    return result
def get_evaluator(params, base):
    pprint(params)
    L = []
    if params['missingMerge'] == False:
        L.append("-M")
    if params['binarizeNumericAttributes'] == True:
        L.append("-B")
    param_search = rk.get_params()
    search = rk.get_search(param_search)
    # search = ASSearch(classname="weka.attributeSelection.Ranker")
    evaluator = ASEvaluation(
        classname="weka.attributeSelection.InfoGainAttributeEval", options=L)
    clf = Classifier(
        classname="weka.classifiers.meta.AttributeSelectedClassifier")
    clf.set_property("evaluator", evaluator.jobject)
    clf.set_property("search", search.jobject)
    clf.set_property("base", base.jobject)
    return clf
def get_evaluator(params, base):
    pprint(params)
    L = []
    if params['missingSeparate'] == True:
        L.append("-M")
    if params['locallyPredictive'] == False:
        L.append("-L")
    if params['search'] == 'GreedyStepwise':
        param_search = gs.get_params()
        search = gs.get_search(param_search)
    else:
        param_search = bf.get_params()
        search = bf.get_search(param_search)
    # search = ASSearch(classname="weka.attributeSelection."+params['search'])
    evaluator = ASEvaluation(classname="weka.attributeSelection.CfsSubsetEval", options=L)
    clf = Classifier(
        classname="weka.classifiers.meta.AttributeSelectedClassifier")
    clf.set_property("evaluator", evaluator.jobject)
    clf.set_property("search", search.jobject)
    clf.set_property("base", base.jobject)
    return clf
def main(args):
    """
    Trains a J48 classifier on a training set and outputs the predicted class and class distribution
    alongside the actual class from a test set. Class attribute is assumed to be the last attribute.
    :param args: the commandline arguments (train and test datasets)
    :type args: list
    """
    # load a dataset
    helper.print_info("Loading train: " + args[1])
    loader = Loader(classname="weka.core.converters.ArffLoader")
    train = loader.load_file(args[1])
    train.class_index = train.num_attributes - 1
    helper.print_info("Loading test: " + args[2])
    test = loader.load_file(args[2])
    test.class_is_last()

    # classifier
    cls = Classifier(classname="weka.classifiers.trees.J48")
    cls.build_classifier(train)

    # output predictions
    print("# - actual - predicted - error - distribution")
    for index, inst in enumerate(test):
        pred = cls.classify_instance(inst)
        dist = cls.distribution_for_instance(inst)
        print(
            "%d - %s - %s - %s - %s" %
            (index + 1,
             inst.get_string_value(inst.class_index),
             inst.class_attribute.value(int(pred)),
             "yes" if pred != inst.get_value(inst.class_index) else "no",
             str(dist.tolist())))
def main():
    """
    Shows how to use the CostSensitiveClassifier.
    """
    # load a dataset
    data_file = helper.get_data_dir() + os.sep + "diabetes.arff"
    helper.print_info("Loading dataset: " + data_file)
    loader = Loader("weka.core.converters.ArffLoader")
    data = loader.load_file(data_file)
    data.class_is_last()

    # classifier
    classifier = SingleClassifierEnhancer(
        classname="weka.classifiers.meta.CostSensitiveClassifier",
        options=["-cost-matrix", "[0 1; 2 0]", "-S", "2"])
    base = Classifier(classname="weka.classifiers.trees.J48", options=["-C", "0.3"])
    classifier.classifier = base

    folds = 10
    evaluation = Evaluation(data)
    evaluation.crossvalidate_model(classifier, data, folds, Random(1))

    print("")
    print("=== Setup ===")
    print("Classifier: " + classifier.to_commandline())
    print("Dataset: " + data.relationname)
    print("")
    print(evaluation.summary("=== " + str(folds) + "-fold Cross-Validation ==="))
def main():
    """
    Just runs some example code.
    """
    classifier = Classifier("weka.classifiers.trees.J48")

    helper.print_title("Capabilities")
    capabilities = classifier.capabilities
    print(capabilities)

    # load a dataset
    iris_file = helper.get_data_dir() + os.sep + "iris.arff"
    helper.print_info("Loading dataset: " + iris_file)
    loader = Loader("weka.core.converters.ArffLoader")
    iris_data = loader.load_file(iris_file)
    iris_data.class_is_last()
    data_capabilities = Capabilities.for_instances(iris_data)
    print(data_capabilities)
    print("classifier handles dataset: " + str(capabilities.supports(data_capabilities)))

    # disable/enable
    helper.print_title("Disable/Enable")
    capability = Capability(member="UNARY_ATTRIBUTES")
    capabilities.disable(capability)
    capabilities.min_instances = 10
    print("Removing: " + str(capability))
    print(capabilities)
def save_all_scores_on_validate():
    for user in user_list:
        user_validate_dir = os.listdir("../data/arff_files/" + str(user) + "/validate/")
        user_validate_dir.sort()
        n = len(user_validate_dir)
        for expression_index in range(n):
            print(expression_index, "=>", str(expression_list[expression_index]), ':',
                  str(user_validate_dir[expression_index]))
            id = str(expression_list[expression_index]) + '_' + str(user)
            target_dir = '../results/' + str(expression_list[expression_index]) + '/' + str(user) + '/'
            model_dir = '../models/' + str(expression_list[expression_index]) + '/' + str(user) + '/'
            validate_data_file = "../data/arff_files/" + str(user) + "/validate/" + str(user_validate_dir[expression_index])
            print(validate_data_file, "=>", model_dir, "all algos", "=>", target_dir, "\n")
            loader = Loader(classname="weka.core.converters.ArffLoader")
            validate_data = loader.load_file(validate_data_file)
            for algo in algo_func_dict.keys():
                trained_model = Classifier(
                    jobject=serialization.read(model_dir + algo + ".model"))
                scores_matrix = get_classifier_score(trained_model, validate_data)
                out_file = target_dir + algo + "_scores.csv"
                # write scores to target file
                np.savetxt(out_file, scores_matrix, delimiter=",")
def get_evaluator(params, base):
    pprint(params)
    L = []
    if params['missing_merge'] == True:
        L.append("-M")
    # if params['search'] == 'GreedyStepwise':
    #     param_search = gs.get_params()
    #     search = gs.get_search(param_search)
    # elif params['search'] == 'BestFirst':
    #     param_search = bf.get_params()
    #     search = bf.get_search(param_search)
    # elif params['search'] == 'Ranker':
    param_search = rk.get_params()
    search = rk.get_search(param_search)
    # search = ASSearch(classname="weka.attributeSelection."+params['search'])
    evaluator = ASEvaluation(classname="weka.attributeSelection.GainRatioAttributeEval", options=L)
    clf = Classifier(classname="weka.classifiers.meta.AttributeSelectedClassifier")
    clf.set_property("evaluator", evaluator.jobject)
    clf.set_property("search", search.jobject)
    clf.set_property("base", base.jobject)
    return clf
def try_params(n_instances, params, train, valid, test, istest):
    n_instances = int(round(n_instances))
    pprint(params)
    L = []
    L.append("-L")
    L.append(str(params['leaf']))  # -L: leaf prediction strategy
    L.append("-S")
    L.append(str(params['splitCriterion']))  # -S: split criterion
    L.append("-E")
    L.append(str(params['splitConfidence']))  # -E: allowable error in a split decision
    L.append("-H")
    L.append(str(params['hoeffdingTieThreshold']))  # -H: tie-breaking threshold
    L.append("-M")
    L.append(str(params['minimumFractionOfWeightInfoGain']))
    L.append("-G")
    L.append(str(params['gracePeriod']))  # -G: instances a leaf observes between split attempts
    clf = Classifier(classname="weka.classifiers.trees.HoeffdingTree", options=L)
    if istest:
        result = test_weka_classifier(clf, train, test)
    else:
        result = train_and_eval_weka_classifier(clf, train, valid, n_instances)
    return result
def try_params(n_instances, params, base, train, valid, test, istest):
    n_instances = int(round(n_instances))
    pprint(params)
    L = []
    if params['missing_merge'] == True:
        L.append("-M")
    if params['search'] == 'GreedyStepwise':
        param_search = gs.get_params()
        search = gs.get_search(param_search)
    elif params['search'] == 'BestFirst':
        param_search = bf.get_params()
        search = bf.get_search(param_search)
    elif params['search'] == 'Ranker':
        param_search = rk.get_params()
        search = rk.get_search(param_search)
    # search = ASSearch(classname="weka.attributeSelection."+params['search'])
    evaluator = ASEvaluation(classname="weka.attributeSelection.GainRatioAttributeEval", options=L)
    clf = Classifier(classname="weka.classifiers.meta.AttributeSelectedClassifier")
    clf.set_property("evaluator", evaluator.jobject)
    clf.set_property("search", search.jobject)
    clf.set_property("base", base.jobject)
    if istest:
        result = test_weka_classifier(clf, train, test)
    else:
        result = train_and_eval_weka_classifier(clf, train, valid, n_instances)
    return result
def get_class(params):
    # pprint(params)
    L = []
    L.append("-L")
    L.append(str(params['learningRate']))  # -L: learning rate
    L.append("-M")
    L.append(str(params['momentum']))  # -M: momentum
    if params['nominalToBinaryFilter'] == True:
        L.append("-B")  # -B: do not auto-apply the NominalToBinary filter
    L.append("-H")
    L.append(str(params['hiddenLayers']))  # -H: hidden layer spec, e.g. "a" or "4,2"
    if params['normalizeNumClasses'] == True:
        L.append("-C")  # -C: do not normalize a numeric class
    if params['reset'] == True:
        L.append("-R")  # -R: do not allow the network to reset
    if params['decay'] == True:
        L.append("-D")  # -D: apply learning rate decay
    L.append("-S")
    L.append(str(params['seed']))  # -S: random seed
    clf = Classifier(
        classname="weka.classifiers.functions.MultilayerPerceptron", options=L)
    return clf
def logit_PC(df_train, df_test, attr_label):
    '''
    logistic regression with PC members only
    :param df_train: training data, pandas data frame
    :param df_test: testing data, pandas data frame
    :param attr_label: label attribute, string
    :return: PC members, logistic regression model and AUC
    '''
    pcs = RF.learnPC_R(df_train, attr_label)
    if pcs:
        # model = LogisticRegression().fit(df_train[pcs], df_train[attr_label])
        # pred = model.predict_proba(df_test[pcs])
        # pred = [x[1] for x in pred]
        # auc = evaluate_auc(df_test[attr_label].values.tolist(), pred)
        df2Instances = DF2Instances(df_train[pcs + [attr_label]], 'train', attr_label)
        data_train = df2Instances.df_to_instances()
        data_train.class_is_last()  # set class attribute
        model = Classifier(classname="weka.classifiers.functions.Logistic")
        model.build_classifier(data_train)

        df2Instances = DF2Instances(df_test[pcs + [attr_label]], 'test', attr_label)
        data_test = df2Instances.df_to_instances()
        data_test.class_is_last()  # set class attribute
        preds = []
        for index, inst in enumerate(data_test):
            preds.append(model.distribution_for_instance(inst)[1])
        auc = evaluate_auc(df_test[attr_label].values.tolist(), preds)
        return pcs, model, auc
    else:
        return pcs, None, None
def try_params(n_instances, params, train, valid, test, istest):
    n_instances = int(round(n_instances))
    pprint(params)
    L = []
    L.append("-C")
    L.append(str(params['C']))  # -C: complexity constant
    L.append("-N")
    L.append(str(params['filterType']))  # -N: 0=normalize, 1=standardize, 2=neither
    if params['buildCalibrationModels'] == True:
        L.append("-M")  # -M: fit calibration models to SVM outputs
    L.append("-K")
    L.append("weka.classifiers.functions.supportVector." + params['kernel'])
    clf = Classifier(classname="weka.classifiers.functions.SMO", options=L)
    if istest:
        result = test_weka_classifier(clf, train, test)
    else:
        result = train_and_eval_weka_classifier(clf, train, valid, n_instances)
    return result
def main(args):
    """
    Trains a NaiveBayesUpdateable classifier incrementally on a dataset. The dataset can be
    supplied as parameter.
    :param args: the commandline arguments
    :type args: list
    """
    # load a dataset
    if len(args) <= 1:
        data_file = helper.get_data_dir() + os.sep + "vote.arff"
    else:
        data_file = args[1]
    helper.print_info("Loading dataset: " + data_file)
    loader = Loader(classname="weka.core.converters.ArffLoader")
    data = loader.load_file(data_file, incremental=True)
    data.class_is_last()

    # classifier
    nb = Classifier(classname="weka.classifiers.bayes.NaiveBayesUpdateable")
    nb.build_classifier(data)

    # train incrementally
    for inst in loader:
        nb.update_classifier(inst)
    print(nb)
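# Entry-point sketch: python-weka-wrapper needs a running JVM before any of the
# functions above can be used. This mirrors the usual python-weka-wrapper
# example pattern (jvm.start()/jvm.stop() around main).
if __name__ == "__main__":
    import sys
    import traceback
    try:
        jvm.start()
        main(sys.argv)
    except Exception:
        print(traceback.format_exc())
    finally:
        jvm.stop()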