def main():
    """
    Demonstrates the CostSensitiveClassifier meta-classifier (J48 base learner,
    2x2 cost matrix) on the diabetes dataset via 10-fold cross-validation.
    """
    # locate and load the dataset; the class attribute is the last one
    dataset_path = helper.get_data_dir() + os.sep + "diabetes.arff"
    helper.print_info("Loading dataset: " + dataset_path)
    arff_loader = Loader("weka.core.converters.ArffLoader")
    dataset = arff_loader.load_file(dataset_path)
    dataset.class_is_last()

    # meta-classifier wrapping a J48 base learner with a cost matrix
    meta = SingleClassifierEnhancer(
        classname="weka.classifiers.meta.CostSensitiveClassifier",
        options=["-cost-matrix", "[0 1; 2 0]", "-S", "2"])
    meta.classifier = Classifier(
        classname="weka.classifiers.trees.J48", options=["-C", "0.3"])

    # cross-validate and report
    num_folds = 10
    evl = Evaluation(dataset)
    evl.crossvalidate_model(meta, dataset, num_folds, Random(1))

    print("")
    print("=== Setup ===")
    print("Classifier: " + meta.to_commandline())
    print("Dataset: " + dataset.relationname)
    print("")
    print(evl.summary("=== " + str(num_folds) + " -fold Cross-Validation ==="))
def use_classifier(data, cli, args):
    """
    Instantiates a classifier from a formatted command-line template, builds it
    and runs a 10-fold cross-validation.

    :param data: the dataset to train/cross-validate on
    :param cli: classifier command-line template, filled in via str.format
    :param args: mapping of template placeholders to values
    :return: tuple of (built classifier, evaluation)
    """
    # NOTE(review): the template receives itself as positional argument 0 of
    # format() — confirm the placeholders in *cli* really expect that.
    command = cli.format(cli, **args)
    cls = from_commandline(command, classname="weka.classifiers.Classifier")
    cls.build_classifier(data)
    evl = Evaluation(data)
    evl.crossvalidate_model(cls, data, 10, Random(1))
    return cls, evl
def myGridSearch(data,RBound,MBound): bestlogistic = None best_acc = -float('inf') class bestValues(object): m = float('nan') r = float('nan') for r in range(RBound[0],RBound[1]+RBound[2],RBound[2]): for m in range(MBound[0],MBound[1]+MBound[2],MBound[2]): logistic = Logistic() logistic.setMaxIts(int(m)) logistic.setRidge(pow(10,r)) evaluation = Evaluation(data) output = util.get_buffer_for_predictions()[0] attRange = Range() # no additional attributes output outputDistribution = Boolean(False) # we don't want distribution random = Random(1) numFolds = min(10,data.numInstances()) evaluation.crossValidateModel(logistic,data,numFolds,random,[output, attRange, outputDistribution]) acc = evaluation.pctCorrect() if (acc>best_acc): bestlogistic = logistic best_acc = acc bestValues.m = int(m) bestValues.r = pow(10,r) print "Best accuracy: ", best_acc print "Best values: M = ", bestValues.m, ", Ridge = ", bestValues.r print "-----------------------------------------" return bestlogistic, bestValues.r, bestValues.m, best_acc
def myGridSearch(data,NTreeBounds,NFeaturesBounds): best_acc = -float('inf') bestrandomforest = None class bestValues(object): t = float('nan') f = float('nan') for t in range(NTreeBounds[0],NTreeBounds[1]+NTreeBounds[2],NTreeBounds[2]): for f in range(NFeaturesBounds[0],NFeaturesBounds[1]+NFeaturesBounds[2],NFeaturesBounds[2]): randomforest = RandomForest() randomforest.setNumTrees(int(t)) randomforest.setNumFeatures(int(f)) evaluation = Evaluation(data) output = output = util.get_buffer_for_predictions()[0] attRange = Range() # no additional attributes output outputDistribution = Boolean(False) # we don't want distribution random = Random(1) numFolds = min(10,data.numInstances()) evaluation.crossValidateModel(randomforest,data,numFolds,random,[output, attRange, outputDistribution]) acc = evaluation.pctCorrect() if (acc>best_acc): bestrandomforest = randomforest best_acc = acc bestValues.t = t bestValues.f = f print "Best accuracy:", best_acc print "Best values: NTreeBounds = ", bestValues.t, ", NFeaturesBounds = ", bestValues.f print "-----------------------------------------" return bestrandomforest, bestValues.t, bestValues.f, best_acc
def main(args):
    """
    Loads a dataset, shuffles it, splits it into train/test set. Trains J48 with
    training set and evaluates the built model on the test set.

    :param args: the commandline arguments (optional, can be dataset filename)
    :type args: list
    """
    # use the bundled vote dataset unless a filename was passed on the command line
    dataset_path = args[1] if len(args) > 1 else helper.get_data_dir() + os.sep + "vote.arff"
    helper.print_info("Loading dataset: " + dataset_path)
    arff_loader = Loader(classname="weka.core.converters.ArffLoader")
    dataset = arff_loader.load_file(dataset_path)
    dataset.class_is_last()

    # 66/34 split of the randomized data
    train_set, test_set = dataset.train_test_split(66.0, Random(1))

    # train J48 on the training split
    j48 = Classifier(classname="weka.classifiers.trees.J48")
    j48.build_classifier(train_set)
    print(j48)

    # evaluate on the held-out split
    evl = Evaluation(train_set)
    evl.test_model(j48, test_set)
    print(evl.summary())
def use_classifier(data_filename, cli):
    """
    Loads the given ARFF file, builds the classifier described by the
    command-line string and cross-validates it (10 folds).

    :param data_filename: path to the ARFF dataset (class attribute is last)
    :param cli: the classifier command-line string
    :return: tuple of (built classifier, evaluation)
    """
    arff_loader = Loader(classname="weka.core.converters.ArffLoader")
    dataset = arff_loader.load_file(data_filename)
    dataset.class_is_last()
    cls = from_commandline(cli, classname="weka.classifiers.Classifier")
    cls.build_classifier(dataset)
    evl = Evaluation(dataset)
    evl.crossvalidate_model(cls, dataset, 10, Random(1))
    return cls, evl
def RandomForest_ParamFinder(data): # possible set for Number of trees NTreeBounds = [1,20,1] # possible set for number of features NFeaturesBounds = [0,20,1] if (data.numInstances()>10): # grid search does 10-fold cross validation; hence number of samples must be more than 10 gridsearch = GridSearch() acctag = gridsearch.getEvaluation() acctag = SelectedTag('ACC',acctag.getTags()) gridsearch.setEvaluation(acctag) allfilters = AllFilters() gridsearch.setFilter(allfilters) gridsearch.setGridIsExtendable(Boolean(True)) randomforest = RandomForest() gridsearch.setClassifier(randomforest) gridsearch.setXProperty(String('classifier.numTrees')) gridsearch.setYProperty(String('classifier.numFeatures')) gridsearch.setXExpression(String('I')) gridsearch.setYExpression(String('I')) gridsearch.setXMin(NTreeBounds[0]) gridsearch.setXMax(NTreeBounds[1]) gridsearch.setXStep(NTreeBounds[2]) gridsearch.setYMin(NFeaturesBounds[0]) gridsearch.setYMax(NFeaturesBounds[1]) gridsearch.setYStep(NFeaturesBounds[2]) gridsearch.setYBase(10) print "searching for random-forest NumTrees = [", NTreeBounds[0], ",", NTreeBounds[1], "], NumFeatures = [ ", NFeaturesBounds[0], ",", NFeaturesBounds[1], "] ...." 
gridsearch.buildClassifier(data) bestValues = gridsearch.getValues() # ----------------------- Evaluation bestrandomforest = RandomForest() bestrandomforest.setNumTrees(int(bestValues.x)) bestrandomforest.setNumFeatures(int(bestValues.y)) evaluation = Evaluation(data) output = output = util.get_buffer_for_predictions()[0] attRange = Range() # no additional attributes output outputDistribution = Boolean(False) # we don't want distribution random = Random(1) numFolds = min(10,data.numInstances()) evaluation.crossValidateModel(bestrandomforest,data,numFolds,random,[output, attRange, outputDistribution]) acc = evaluation.pctCorrect() print "best accuracy: ", acc print "best random-forest classifier with NumTrees=",bestValues.x , ", NumFeatures = ", bestValues.y OptRndFrst = bestrandomforest OptRndFrstp1 = bestValues.x OptRndFrstp2 = bestValues.y OptRndFrstAcc = acc else: OptRndFrst, OptRndFrstp1, OptRndFrstp2, OptRndFrstAcc = myGridSearch(data,NTreeBounds,NFeaturesBounds) Description = 'Random-Forest classifier: OptNumTrees = ' + str(OptRndFrstp1) + \ ', OptNumFeatures = ' + str(OptRndFrstp2) + ', OptAcc = ' + str(OptRndFrstAcc) print "-----------------------------------------" return OptRndFrst, OptRndFrstp1, OptRndFrstp2, OptRndFrstAcc, Description
def Logistic_ParamFinder(data): # Possible set for Ridge-value RBounds = [-10,2,1] # possible set for maximum Iteration MBounds = [-1,10,1] if (data.numInstances()>10): # grid search does 10-fold cross validation; hence number of samples must be more than 10 gridsearch = GridSearch() acctag = gridsearch.getEvaluation() acctag = SelectedTag('ACC',acctag.getTags()) gridsearch.setEvaluation(acctag) allfilters = AllFilters() gridsearch.setFilter(allfilters) gridsearch.setGridIsExtendable(Boolean(True)) logistic = Logistic() gridsearch.setClassifier(logistic) gridsearch.setXProperty(String('classifier.maxIts')) gridsearch.setYProperty(String('classifier.ridge')) gridsearch.setXExpression(String('I')) gridsearch.setYExpression(String('pow(BASE,I)')) gridsearch.setXMin(MBounds[0]) gridsearch.setXMax(MBounds[1]) gridsearch.setXStep(MBounds[2]) gridsearch.setYMin(RBounds[0]) gridsearch.setYMax(RBounds[1]) gridsearch.setYStep(RBounds[2]) gridsearch.setYBase(10) print "searching for logistic lcassifier Max Iteration = [", MBounds[0], ",", MBounds[1], "], Ridge = [ 10E", RBounds[0], ",10E", RBounds[1], "] ...." 
gridsearch.buildClassifier(data) bestValues = gridsearch.getValues() # ----------------------- Evaluation bestlogistic = Logistic() bestlogistic.setMaxIts(int(bestValues.x)) bestlogistic.setRidge(pow(10,bestValues.y)) evaluation = Evaluation(data) output = util.get_buffer_for_predictions()[0] attRange = Range() # no additional attributes output outputDistribution = Boolean(False) # we don't want distribution random = Random(1) numFolds = min(10,data.numInstances()) evaluation.crossValidateModel(bestlogistic,data,numFolds,random,[output, attRange, outputDistribution]) acc = evaluation.pctCorrect() print "best accuracy: ", acc print "best logistic classifier with Ridge = ", bestlogistic.getRidge(), " Max Iteration = ", bestlogistic.getMaxIts() OptLog = bestlogistic OptLogp1 = bestlogistic.getRidge() OptLogp2 = bestlogistic.getMaxIts() OptLogAcc = acc else: OptLog, OptLogp1, OptLogp2, OptLogAcc = myGridSearch(data,RBounds,MBounds) Description = 'Logistic classifier OptRidge = ' + str(OptLogp1) + \ ', OptMaxIts = ' + str(OptLogp2) + ', OptAcc = ' + str(OptLogAcc) print "-----------------------------------------" return OptLog, OptLogp1, OptLogp2, OptLogAcc, Description
def bagging_logistic(trainData,testData,params,exparams): IsOptBagOnOptLog = str2bool(params[0]) logistic = Logistic() bagging = Bagging() if IsOptBagOnOptLog: # optimal bagging is based on optimal logistic ridge = float(exparams[0]) maxIt = int(float(exparams[1])) logistic.setMaxIts(maxIt) bagSizePercent = int(float(params[1])) bagging.setBagSizePercent(bagSizePercent) else: # ridge parameter is also optimized in the process ridge = float(params[1]) numIterations = int(float(params[2])) bagging.setNumIterations(numIterations) logistic.setRidge(ridge) bagging.setClassifier(logistic) bagging.buildClassifier(trainData) # only a trained classifier can be evaluated # evaluate it on the training evaluation = Evaluation(trainData) (trainOutput, trainBuffer) = util.get_buffer_for_predictions(trainData) attRange = Range() # no additional attributes output outputDistribution = Boolean(False) # we don't want distribution evaluation.evaluateModel(bagging, trainData, [trainOutput, attRange, outputDistribution]) print "--> Evaluation:\n" print evaluation.toSummaryString() trainSummary = makeTrainEvalSummary(evaluation) # evaluate it on testing evaluation = Evaluation(testData) (testOutput, testBuffer) = util.get_buffer_for_predictions(testData) attRange = Range() # no additional attributes output outputDistribution = Boolean(False) # we don't want distribution evaluation.evaluateModel(bagging, testData, [testOutput, attRange, outputDistribution]) return trainBuffer, testBuffer, trainSummary
def smo(trainData,testData,params,exparams): kerType = str2bool(params[0]) cValue = float(params[1]) kerParam = float(params[2]) if kerType: # RBF kernel kernel = RBFKernel() kernel.setGamma(kerParam) else: # Polynomial kernel kernel = PolyKernel() kernel.setExponent(kerParam) smo = SMO() smo.setKernel(kernel) smo.setC(cValue) smo.buildClassifier(trainData) # only a trained classifier can be evaluated # evaluate it on the training evaluation = Evaluation(trainData) (trainOutput, trainBuffer) = util.get_buffer_for_predictions(trainData) attRange = Range() # no additional attributes output outputDistribution = Boolean(False) # we don't want distribution evaluation.evaluateModel(smo, trainData, [trainOutput, attRange, outputDistribution]) print "--> Evaluation:\n" print evaluation.toSummaryString() trainSummary = makeTrainEvalSummary(evaluation) # evaluate it on testing evaluation = Evaluation(testData) (testOutput, testBuffer) = util.get_buffer_for_predictions(testData) attRange = Range() # no additional attributes output outputDistribution = Boolean(False) # we don't want distribution evaluation.evaluateModel(smo, testData, [testOutput, attRange, outputDistribution]) return trainBuffer, testBuffer, trainSummary
def simple_logistic(trainData,testData,params,exparams): heuristicStop = int(float(params[0])) numBoostingIterations = int(float(params[1])) simplelogistic = SimpleLogistic() simplelogistic.setHeuristicStop(heuristicStop) simplelogistic.setNumBoostingIterations(numBoostingIterations) if (trainData.numInstances()<5): # special case for small sample size simplelogistic.setUseCrossValidation(False) simplelogistic.buildClassifier(trainData) # only a trained classifier can be evaluated # evaluate it on the training evaluation = Evaluation(trainData) (trainOutput, trainBuffer) = util.get_buffer_for_predictions(trainData) attRange = Range() # no additional attributes output outputDistribution = Boolean(False) # we don't want distribution evaluation.evaluateModel(simplelogistic, trainData, [trainOutput, attRange, outputDistribution]) print "--> Evaluation:\n" print evaluation.toSummaryString() trainSummary = makeTrainEvalSummary(evaluation) # evaluate it on testing evaluation = Evaluation(testData) (testOutput, testBuffer) = util.get_buffer_for_predictions(testData) attRange = Range() # no additional attributes output outputDistribution = Boolean(False) # we don't want distribution evaluation.evaluateModel(simplelogistic, testData, [testOutput, attRange, outputDistribution]) return trainBuffer, testBuffer, trainSummary
def test_model(self, test_data, empty_solution, evaluate = False): model_weka = None if os.path.isfile(self.prediction_file): print 'Model ' + self.name + ' already tested.' elif not os.path.isfile(self.model_file): print 'Impossible testing this model. It should be trained first.' return else: print 'Starting to test_model model ' + self.name + '.' model_weka = Classifier(jobject = serialization.read(self.model_file)) evaluation = Evaluation(data = test_data) evaluation.test_model(classifier = model_weka, data = test_data) predictions = evaluation.predictions() rows = read_sheet(file_name = empty_solution) solutions = [] for row in rows: solution = [row['userid'], row['tweetid'], predictions.pop(0).predicted()] solutions.append(solution) write_the_solution_file(solutions, self.prediction_file) print 'Model ' + self.name + ' tested.' if evaluate == True: if os.path.isfile(self.evaluation_file): print 'Model ' + self.name + ' already evaluated.' return elif model_weka == None: model_weka = Classifier(jobject = serialization.read(self.model_file)) evaluation = Evaluation(data = test_data) evaluation.test_model(classifier = model_weka, data = test_data) save_file(file_name = self.evaluation_file, content = evaluation.to_summary()) print 'Model ' + self.name + ' evaluated.'
def bayesian(trainData,testData,params,exparams): IsOptMultinomialBayes = str2bool(params[0]) IsOptNaiveKernelDensity = str2bool(params[1]) if IsOptMultinomialBayes: # optimal bayesian classifier is multinomial bayes = NaiveBayesMultinomial() else: bayes = NaiveBayes() if IsOptNaiveKernelDensity: # use kernel density estimation bayes.setUseKernelEstimator(Boolean(True)) bayes.buildClassifier(trainData) # only a trained classifier can be evaluated # evaluate it on the training evaluation = Evaluation(trainData) (trainOutput, trainBuffer) = util.get_buffer_for_predictions(trainData) attRange = Range() # no additional attributes output outputDistribution = Boolean(False) # we don't want distribution evaluation.evaluateModel(bayes, trainData, [trainOutput, attRange, outputDistribution]) print "--> Evaluation:\n" print evaluation.toSummaryString() trainSummary = makeTrainEvalSummary(evaluation) # evaluate it on testing evaluation = Evaluation(testData) (testOutput, testBuffer) = util.get_buffer_for_predictions(testData) attRange = Range() # no additional attributes output outputDistribution = Boolean(False) # we don't want distribution evaluation.evaluateModel(bayes, testData, [testOutput, attRange, outputDistribution]) return trainBuffer, testBuffer, trainSummary
def build_and_classify(classifier, classifier_name, approach_name, infile, percentage='10'):
    """
    Creates model and classifies against input data.

    Returns accuracy statistics
    """
    # set seed so results are consistent
    random.seed('iot')

    # load data
    csv_loader = Loader(classname='weka.core.converters.CSVLoader')
    dataset = csv_loader.load_file(infile)
    dataset.class_is_last()

    # convert all numeric attributes to nominal
    nominal_filter = Filter(classname='weka.filters.unsupervised.attribute.NumericToNominal',
                            options=['-R', 'first-last'])
    nominal_filter.inputformat(dataset)
    dataset = nominal_filter.filter(dataset)

    # randomize data with constant seed
    shuffle_filter = Filter(classname='weka.filters.unsupervised.instance.Randomize',
                            options=['-S', '42'])
    shuffle_filter.inputformat(dataset)
    dataset = shuffle_filter.filter(dataset)

    # create training set and testing set
    split_filter = Filter(classname='weka.filters.unsupervised.instance.RemovePercentage',
                          options=['-P', percentage, '-V'])
    split_filter.inputformat(dataset)
    train = split_filter.filter(dataset)
    test = dataset

    # build and test classifier
    classifier.build_classifier(train)
    evaluation = Evaluation(train)
    evaluation.test_model(classifier, test)

    # return results as array
    return [
        approach_name,
        classifier_name,
        percentage,
        evaluation.percent_correct,
        evaluation.weighted_f_measure,
    ]
def test_weka_classifier(clf, train, test):
    """
    Builds *clf* on *train*, evaluates it on *test* and prints the metrics.

    :return: dict with 'loss', 'accuracy', 'auc' and 'err' entries
    """
    clf.build_classifier(train)
    evl = Evaluation(train)
    evl.test_model(clf, test)
    metrics = {
        'loss': evl.sf_mean_scheme_entropy,
        'accuracy': evl.percent_correct,
        'auc': evl.weighted_area_under_roc,
        'err': evl.error_rate,
    }
    print(
        "# testing | loss: {:.2}, accuracy: {:.4}, AUC: {:.2}, error: {:.2}".
        format(metrics['loss'], metrics['accuracy'], metrics['auc'], metrics['err']))
    return metrics
def crossValidate(self, arrfFile = None, classname="weka.classifiers.trees.J48", options=["-C", "0.3"]): if arrfFile is not None: self.initData( arrfFile ) if self.data is None: return print 'Classificador ' + str(classname) + ' ' + ' '.join(options) cls = Classifier(classname=classname, options=options) evl = Evaluation(self.data) evl.crossvalidate_model(cls, self.data, 10, Random(1)) print(evl.percent_correct) print(evl.summary()) print(evl.class_details())
def cross_validate(self, detail=True):
    """Perform cross validation using trained data.

    Parameters
    ----------
    detail : boolean, optional, default = True
        If true return a detailed information of cross validation.

    Returns
    -------
    info : string
        Info with results of cross validation.
    """
    start_time = TimeUtils.get_time()

    info = "Scheme:\t%s %s\n" % (str(self.classifier.classname), " ".join([str(option) for option in self.classifier.options]))

    if detail:  # idiom fix: was "detail == True"
        info += "Relation:\t%s\n" % (self.data.relationname)
        info += "Instances:\t%d\n" % (self.data.num_instances)
        info += "Attributes:\t%d\n\n" % (self.data.num_attributes)

    evl = WEvaluation(self.data)
    evl.crossvalidate_model(self.classifier, self.data, 10, WRandom(1))

    if not detail:  # idiom fix: was "detail == False"
        info += "Correctly Classified Instances: %0.4f%%\n" % (evl.percent_correct)
        info += "Time taken to build model: %0.5f seconds\n\n" % (TimeUtils.get_time() - start_time)

    if detail:
        info += "=== Stratified cross-validation ===\n"
        info += evl.summary() + "\n\n"
        info += str(evl.class_details()) + "\n\n"
        classes = [str(self.data.class_attribute.value(i)) for i in range(0, self.data.class_attribute.num_values)]
        cm = evl.confusion_matrix
        info += Classifier.confusion_matrix(classes, cm)

    return info
def runCV(this, arffFile, classifier, folds):
    """
    Cross-validates *classifier* on *arffFile* and stores per-class metrics.

    Populates ``this.values`` (accuracy plus TP/FP/AUC per class) and
    ``this.header`` (matching column names).

    :param arffFile: path to the ARFF dataset (class attribute is last)
    :param classifier: the classifier class name
    :param folds: number of cross-validation folds
    """
    loader = Loader(classname="weka.core.converters.ArffLoader")
    data = loader.load_file(arffFile)
    data.class_is_last()
    classes = [str(code) for code in data.class_attribute.values]
    header = ["Accuracy"]
    for name in classes:
        header += [name + " TP", name + " FP", name + " AUC ROC"]
    values = []
    cls = Classifier(classname=classifier)
    evl = Evaluation(data)
    evl.crossvalidate_model(cls, data, folds, Random(1))
    values.append(evl.percent_correct)
    # fix: iterate indices directly instead of classes.index(name) per class,
    # which is O(n^2) and wrong when class labels repeat
    for index in range(len(classes)):
        values += [
            evl.true_positive_rate(index) * 100,
            evl.false_positive_rate(index) * 100,
            evl.area_under_roc(index)
        ]
    this.values = values
    this.header = header
def crossTest(this, trainingFile, classifier, testFile):
    """
    Trains *classifier* on *trainingFile*, tests on *testFile* and stores
    per-class metrics.

    Populates ``this.values`` (accuracy plus TP/FP/AUC per class) and
    ``this.header`` (matching column names).

    :param trainingFile: path to the training ARFF (class attribute is last)
    :param classifier: the classifier class name
    :param testFile: path to the testing ARFF (class attribute is last)
    """
    loader = Loader(classname="weka.core.converters.ArffLoader")
    data1 = loader.load_file(trainingFile)
    data1.class_is_last()
    cls = Classifier(classname=classifier)
    cls.build_classifier(data1)
    data2 = loader.load_file(testFile)
    data2.class_is_last()
    classes = [str(code) for code in data2.class_attribute.values]
    header = ["Accuracy"]
    for name in classes:
        header += [name + " TP", name + " FP", name + " AUC ROC"]
    values = []
    evl = Evaluation(data2)
    evl.test_model(cls, data2)
    values.append(evl.percent_correct)
    # fix: iterate indices directly instead of classes.index(name) per class,
    # which is O(n^2) and wrong when class labels repeat
    for index in range(len(classes)):
        values += [
            evl.true_positive_rate(index) * 100,
            evl.false_positive_rate(index) * 100,
            evl.area_under_roc(index)
        ]
    this.values = values
    this.header = header
def predict(self, X):
    """
    Predicts class values for *X* with the trained classifier.

    A dummy 'class' column is temporarily added to *X* (it is too large to
    copy) so the exported ARFF has a class attribute, and removed afterwards.

    :param X: the feature table (mutated temporarily, restored on exit)
    :return: the predictions from Evaluation.test_model
    """
    evaluation = Evaluation(self.train_data)
    # Add class column (we can't copy X, because this is a large object, so we
    # add the column and remove it later)
    X['class'] = None
    try:
        filename = self.to_arff(X, True)
    finally:
        # fix: remove the class column even if the ARFF export raises,
        # so the caller's X is never left mutated
        del X['class']
    loader = Loader("weka.core.converters.ArffLoader")
    test_data = loader.load_file(filename)
    test_data.class_is_last()
    preds = evaluation.test_model(self.classifier, test_data)
    return preds
def use_classifier(data):
    """
    Uses the meta-classifier AttributeSelectedClassifier for attribute selection.

    :param data: the dataset to use
    :type data: Instances
    """
    print("\n1. Meta-classifier")
    meta = Classifier(classname="weka.classifiers.meta.AttributeSelectedClassifier")
    evaluator = ASEvaluation(classname="weka.attributeSelection.CfsSubsetEval")
    search = ASSearch(classname="weka.attributeSelection.GreedyStepwise", options=["-B"])
    base = Classifier(classname="weka.classifiers.trees.J48")
    # setting nested options is always a bit tricky, getting all the escaped
    # double quotes right; simply using the bean property for setting Java
    # objects is often easier and less error prone
    for prop, obj in (("classifier", base), ("evaluator", evaluator), ("search", search)):
        meta.set_property(prop, obj.jobject)
    evl = Evaluation(data)
    evl.crossvalidate_model(meta, data, 10, Random(1))
    print(evl.summary())
def run():
    """
    Converts the bank-full CSV to ARFF, dumps a J48 model to a text file and
    returns the 10-fold cross-validation accuracy of J48 as a string.
    """
    jvm.start()
    try:
        load_csv = Loader("weka.core.converters.CSVLoader")
        data_csv = load_csv.load_file(
            "/Users/imeiliasantoso/web_graduate_project5/register_page/bank-full_input.csv")

        saver = Saver("weka.core.converters.ArffSaver")
        saver.save_file(
            data_csv,
            "/Users/imeiliasantoso/web_graduate_project5/register_page/bank-full_input.arff")

        load_arff = Loader("weka.core.converters.ArffLoader")
        data_arff = load_arff.load_file(
            "/Users/imeiliasantoso/web_graduate_project5/register_page/bank-full_input.arff")
        data_arff.class_is_last()

        cls = Classifier(classname="weka.classifiers.trees.J48", options=["-C", "0.5"])
        cls.build_classifier(data_arff)
        # NOTE(review): predictions/distributions are computed but never used —
        # confirm this loop is still needed
        for index, inst in enumerate(data_arff):
            pred = cls.classify_instance(inst)
            dist = cls.distribution_for_instance(inst)

        # fix: the pruned-tree dump was re-opened and re-written on every loop
        # iteration; write it once, via a context manager
        with open(
                "/Users/imeiliasantoso/web_graduate_project5/register_page/bank-full_input.txt",
                "w") as saveFile:
            saveFile.write(str(cls))

        global j48
        J48_class = Classifier(classname="weka.classifiers.trees.J48",
                               options=["-C", "0.25", "-M", "2"])
        J48_class.build_classifier(data_arff)
        evaluationj48 = Evaluation(data_arff)
        evaluationj48.crossvalidate_model(J48_class, data_arff, 10, Random(100))
        j48 = str(evaluationj48.percent_correct)
    finally:
        # robustness: always shut the JVM down, even when something above fails
        jvm.stop()
    return j48
def runBayes(file, bound):
    """
    Removes the attribute range *bound*, cross-validates NaiveBayes (10 folds)
    on the filtered data and prints/returns the per-class details.

    :param file: path to the CSV dataset (class attribute is first)
    :param bound: attribute range for the Remove filter
    :return: the class-details string
    """
    csv_loader = Loader(classname="weka.core.converters.CSVLoader")
    dataset = csv_loader.load_file(file)
    dataset.class_is_first()
    remover = Filter(classname="weka.filters.unsupervised.attribute.Remove",
                     options=["-R", bound])
    nb = Classifier(classname="weka.classifiers.bayes.NaiveBayes")
    remover.inputformat(dataset)
    filtered = remover.filter(dataset)
    evl = Evaluation(filtered)
    evl.crossvalidate_model(nb, filtered, 10, Random(1))
    print(evl.percent_correct)
    #print(evl.summary())
    result = evl.class_details()
    print(result)
    return result
def CV10(dataset, algo):
    """
    Cross-validates *algo* on *dataset* and prints the AUC for class index 1.

    NOTE(review): despite the name, only 2 folds are used here — confirm.

    :param dataset: path to the ARFF dataset (class attribute is last)
    :param algo: the classifier class name
    """
    # consistency fix: use print() calls throughout instead of mixing
    # Python-2 print statements with print() calls
    print("inside 10cv")
    print("dataset ----" + dataset)
    print("algorithm ----" + algo)
    # Executing 10FCV
    # jvm.start(packages=True)
    loader = Loader(classname="weka.core.converters.ArffLoader")
    data = loader.load_file(dataset)
    data.class_is_last()
    #print(data)
    cls = Classifier(classname=algo)
    evl = Evaluation(data)
    evl.crossvalidate_model(cls, data, 2, Random(5))
    print("areaUnderROC/1: " + str(evl.area_under_roc(1)))
def executeKFoldClassifier(self, featureInclusion, kFold):
    """
    Deletes a subset of attributes, builds weka's CVParameterSelection
    meta-classifier via javabridge and returns its k-fold CV accuracy.

    :param featureInclusion: list of booleans, one per attribute
    :param kFold: number of cross-validation folds
    :return: percentage of correctly classified instances
    """
    # NOTE(review): attributes are deleted where featureInclusion[i] is True;
    # the parameter name suggests True means "keep this feature" — confirm.
    deleteFeatures = 0
    for i in range(0, len(featureInclusion)):
        if featureInclusion[i]:
            # offset index by the number of attributes already removed
            self.instances.deleteAttributeAt(i - deleteFeatures)
            deleteFeatures += 1
    # class attribute is the last remaining one
    self.instances.setClassIndex(self.instances.numAttributes - 1)
    cvParameterSelection = javabridge.make_instance(
        "weka/classifiers/meta/CVParameterSelection", "()V")
    javabridge.call(cvParameterSelection, "setNumFolds", "(I)V", kFold)
    # NOTE(review): javabridge.call normally takes (obj, method, signature,
    # *args); here the method name and JNI signature appear fused into a single
    # string — verify this call actually works.
    javabridge.call(cvParameterSelection,
                    "buildClassifier(weka/core/Instances)V", self.instances)
    eval = Evaluation(self.instances)
    # NOTE(review): confirm random() resolves to a java.util.Random factory and
    # that percent_correct is callable here (it is a property in
    # python-weka-wrapper).
    eval.crossvalidate_model(cvParameterSelection, self.instances, kFold,
                             random())
    return eval.percent_correct()
def train_and_eval_weka_classifier(clf, train, valid, n_instances):
    """
    Trains *clf* on a split of *train* containing *n_instances* instances and
    evaluates it on *valid*.

    :param clf: the (unbuilt) weka classifier
    :param train: the full training set
    :param valid: the validation set
    :param n_instances: number of training instances to actually use
    :return: dict with 'loss', 'accuracy', 'auc' and 'err' entries
    """
    total_train_inst = train.num_instances
    percentage = (n_instances * 100) / total_train_inst
    if percentage == 100:
        opt = train
    else:
        # keep only the requested share of the training data
        opt, _ = train.train_test_split(percentage, Random(1))
    print('total_train_inst: ', total_train_inst, '| percentage: ', percentage,
          '| used_inst: ', opt.num_instances)

    # fix: removed a dead signal/AlarmException timeout scaffold that was
    # defined but never registered with signal.signal()

    clf.build_classifier(opt)
    evl = Evaluation(opt)
    evl.test_model(clf, valid)
    acc = evl.percent_correct
    auc = evl.weighted_area_under_roc
    err = evl.error_rate
    log = evl.sf_mean_scheme_entropy
    print(
        "# validating | loss: {:.2}, accuracy: {:.4}, AUC: {:.2}, error: {:.2}"
        .format(log, acc, auc, err))
    return {'loss': log, 'accuracy': acc, 'auc': auc, 'err': err}
def HOV(dataset, algo):
    """
    Hold-out validation: 70/30 split of *dataset*, trains *algo* on the larger
    part and returns the AUC (class index 1) on the held-out part as a string.

    :param dataset: path to the ARFF dataset (class attribute is last)
    :param algo: the classifier class name
    :return: AUC for class index 1, as a string
    """
    # consistency fix: use print() calls throughout instead of mixing
    # Python-2 print statements with print() calls
    print("inside hov")
    print("dataset ----" + dataset)
    print("algorithm ----" + algo)
    # Executing HOV \_*-*_/
    # jvm.start(packages=True)
    loader = Loader(classname="weka.core.converters.ArffLoader")
    data = loader.load_file(dataset)
    data.class_is_last()
    train, test = data.train_test_split(70.0, Random(10))
    cls = Classifier(classname=algo)
    cls.build_classifier(train)
    evl = Evaluation(train)
    evl.test_model(cls, test)
    return (str(evl.area_under_roc(1)))
def SMOreg():
    """
    Cross-validates an SMOreg regressor with an RBF kernel on the trial data,
    prints the evaluation summary plus predictions, then saves the model.
    """
    arff_loader = Loader(classname="weka.core.converters.ArffLoader")
    dataset = arff_loader.load_file("First_trial_regression.arff")
    dataset.class_is_last()

    regressor = KernelClassifier(classname="weka.classifiers.functions.SMOreg",
                                 options=["-N", "0"])
    regressor.kernel = Kernel(
        classname="weka.classifiers.functions.supportVector.RBFKernel",
        options=["-G", "0.2"])

    prediction_output = PredictionOutput(
        classname="weka.classifiers.evaluation.output.prediction.PlainText")
    evl = Evaluation(dataset)
    evl.crossvalidate_model(regressor, dataset, 10, Random(486), prediction_output)
    print(evl.summary())
    print(prediction_output.buffer_content())

    # save model
    serialization.write_all("SMOreg.model2", regressor)
def weka_bayesnet(filearffpath='data/datatobayes.arff'):
    """Simple calling of the bayesian network from python.

    :param filearffpath: path to the ARFF dataset (class attribute is last)
    :return: area under the ROC curve for class index 0 (10-fold CV)
    """
    from weka.classifiers import Classifier, Evaluation
    from weka.core.classes import Random
    # Preparing the data
    loader = Loader(classname="weka.core.converters.ArffLoader")
    # fix: the parameter was ignored and a hard-coded path loaded instead
    data = loader.load_file(filearffpath)
    # fix: removed a Remove filter that was constructed but never applied
    # (the code assigned "filtered = data" and discarded the filter output)
    data.class_is_last()
    # Classifier test
    classifier = Classifier(classname="weka.classifiers.bayes.BayesNet",
                            options=['-D'])
    evaluation = Evaluation(data)
    evaluation.crossvalidate_model(classifier, data, 10, Random(42))
    return evaluation.area_under_roc(class_index=0)  # ROC, no std of kfold
def evaluation(self, classifier, trainingData, testingData = None): trainingData.set_class_index(trainingData.num_attributes() - 1) if testingData == None: evaluation = Evaluation(trainingData) # initialize with priors evaluation.crossvalidate_model(classifier, trainingData, 10, Random(42)) # 10-fold CV return evaluation else: print "testing data exists" if testingData.num_attributes() == trainingData.num_attributes(): testingData.set_class_index(testingData.num_attributes() - 1) evaluation = Evaluation(trainingData) classifier.build_classifier(trainingData) evaluation.test_model(classifier, testingData) #for attribute in trainingData.attributes(): # print "train:" + str(attribute) #for attribute in testingData.attributes(): # print "test:" + str(attribute) return evaluation else: print "testing Data doesn't have same attribute with training data" for attribute in trainingData.attributes(): print "train:" + str(attribute) for attribute in testingData.attributes(): print "test:" + str(attribute)
def test_classifier(dataset: Instances, classifier: Classifier, params: dict):
    """
    Evaluates *classifier* on *dataset* for every combination of the given
    hyper-parameter values, collecting training and 10-fold CV accuracy plus
    the tree size.

    :param dataset: the dataset to train/evaluate on
    :param classifier: the classifier whose properties are varied
    :param params: mapping of property name -> iterable of candidate values
    :return: defaultdict(list) with one entry per result column
    """
    names = params.keys()
    candidate_lists = params.values()
    results = defaultdict(list)
    for combo in itertools.product(*candidate_lists):
        results["numInstances"].append(dataset.num_instances)
        results["numAttributes"].append(dataset.num_attributes)
        settings = dict(zip(names, combo))
        for prop in settings:
            results[prop].append(settings[prop])
            # floats must cross the Java bridge as Float rather than Double
            value = settings[prop]
            if isinstance(value, float):
                value = typeconv.double_to_float(value)
            classifier.set_property(prop, value)
        evl = Evaluation(dataset)
        classifier.build_classifier(dataset)
        evl.test_model(classifier, dataset)
        results["Training_Accuracy"].append(evl.percent_correct)
        results["size"].append(
            int(javabridge.call(classifier.jobject, "measureTreeSize", "()D")))
        evl.crossvalidate_model(classifier, dataset, 10, Random(1))
        results["CV_Accuracy"].append(evl.percent_correct)
    return results
def vote_classifier_train(dicrectory, nameOfDataSet, flag):
    """
    Trains and evaluates a Vote ensemble (J48, RandomTree, two Bagging/REPTree
    members, AdaBoostM1/DecisionStump, NaiveBayes; average of probabilities).

    :param dicrectory: path to the CSV dataset (class attribute is last)
    :param nameOfDataSet: dataset name passed through to the report
    :param flag: True -> 10-fold cross-validation, False -> 80/20 split
    """
    loader = Loader(classname="weka.core.converters.CSVLoader")
    data = loader.load_file(dicrectory)
    data.class_is_last()
    meta = MultipleClassifiersCombiner(
        classname="weka.classifiers.meta.Vote",
        options=[
            '-S', '1', '-B', 'weka.classifiers.trees.J48 -C 0.25 -M 2', '-B',
            'weka.classifiers.trees.RandomTree -K 6 -M 1.0 -V 0.001 -S 1', '-B',
            'weka.classifiers.meta.Bagging -P 100 -S 1 -num-slots 1 -I 10 -W weka.classifiers.trees.REPTree -- '
            '-M 2 -V 0.001 -N 3 -S 1 -L -1 -I 0.0', '-B',
            'weka.classifiers.meta.AdaBoostM1 -P 100 -S 1 -I 10 -W weka.classifiers.trees.DecisionStump',
            '-B',
            'weka.classifiers.meta.Bagging -P 100 -S 1 -num-slots 1 -I 10 -W weka.classifiers.trees.REPTree -- '
            '-M 2 -V 0.001 -N 3 -S 1 -L -1 -I 0.0', '-B',
            'weka.classifiers.bayes.NaiveBayes ', '-R', 'AVG'
        ])
    # fix: renamed local "eval" -> "evl" to stop shadowing the builtin eval()
    evl = Evaluation(data)
    pout = PredictionOutput(
        classname="weka.classifiers.evaluation.output.prediction.PlainText")
    if flag:
        evl.crossvalidate_model(meta, data, 10, Random(1), pout)
    else:
        evl.evaluate_train_test_split(meta, data, 80.0, Random(1), pout)
    gc.collect()
    print_and_save('Proposed model', flag, nameOfDataSet, evl)
def run_naive_bayes_crossval(self, output_directory):
    """
    Builds a NaiveBayes classifier on the training data, cross-validates it
    (10 folds) and saves the collected results.

    :param output_directory: directory the results file is written to
    """
    # build classifier
    print("\nBuilding Classifier on training data.")
    build_start = time.time()
    nb = Classifier(classname="weka.classifiers.bayes.NaiveBayes")
    nb.build_classifier(self.training_data)
    results = ""
    results = self.print_both(str(nb), results)
    results = self.print_both(
        "NB Cross Eval Classifier Built in " + str(time.time() - build_start) + " secs.\n",
        results)

    # Evaluate Classifier
    results = self.print_both("\nCross Evaluating on test data.", results)
    build_start = time.time()
    evl = Evaluation(self.training_data)
    evl.crossvalidate_model(nb, self.training_data, 10, Random(1))
    results = self.print_both(str(evl.summary()), results)
    results = self.print_both(str(evl.class_details()), results)
    results = self.print_both(str(evl.confusion_matrix), results)
    results = self.print_both(
        "\nNB Cross Eval Classifier Evaluated in " + str(time.time() - build_start) + " secs.\n",
        results)

    # Save Results and Cleanup
    self.save_results("Naive_Bayes_Crossval", results, output_directory)
def Boost_J48(data, rnm):
    """Cross-validate an AdaBoostM1-boosted, filtered J48 tree and write the model,
    predictions, evaluation and an ROC plot to files prefixed with rnm.

    Fix: the original used Python 2 ``print >> f`` statements (a SyntaxError under
    Python 3, which the rest of this file targets) and left files unclosed on error;
    converted to ``print(..., file=...)`` inside ``with`` blocks. Output is unchanged.

    :param data: weka Instances; class attribute is set to the last attribute here
    :param rnm: filename prefix (also used as the plot title)
    :return: percent-correct accuracy as a string
    """
    data.class_is_last()
    # J48 wrapped in a filter that removes the first attribute (e.g. an ID column).
    fc1 = FilteredClassifier()
    fc1.classifier = Classifier(classname="weka.classifiers.trees.J48",
                                options=["-C", "0.25", "-M", "2"])
    fc1.filter = Filter(classname="weka.filters.unsupervised.attribute.Remove",
                        options=["-R", "first"])
    # AdaBoostM1 on top of the filtered J48.
    fc2 = SingleClassifierEnhancer(classname="weka.classifiers.meta.AdaBoostM1",
                                   options=["-P", "100", "-S", "1", "-I", "10"])
    fc2.classifier = fc1
    pred_output = PredictionOutput(
        classname="weka.classifiers.evaluation.output.prediction.CSV",
        options=["-p", "1"])
    folds = 10
    fc2.build_classifier(data)
    evaluation = Evaluation(data)
    evaluation.crossvalidate_model(fc2, data, folds, Random(1), pred_output)

    with open(rnm + '_Boost_J48_Tree.txt', 'w') as f0:
        print("Filename: ", rnm, file=f0)
        print('\n\n', file=f0)
        print(str(fc2), file=f0)

    with open(rnm + '_Boost_J48_Prediction.txt', 'w') as f1:
        print('Filename:', rnm, file=f1)
        print('Prediction Summary:', (pred_output.buffer_content()), file=f1)

    with open(rnm + '_Boost_j48_Evaluation.txt', 'w') as f2:
        print('Filename:', rnm, file=f2)
        print('Evaluation Summary:', (evaluation.summary()), file=f2)
        print('\n\n\n', file=f2)
        print((evaluation.class_details()), file=f2)

    plot_roc(evaluation, class_index=[0, 1], title=rnm, key_loc='best',
             outfile=rnm + '_Boost_J48_ROC.png', wait=False)
    value_Boost_J48 = str(evaluation.percent_correct)
    return value_Boost_J48
def RandomTree(data, rnm):
    """Cross-validate a filtered RandomTree and write the model, predictions,
    evaluation and an ROC plot to files prefixed with rnm.

    Fix: the original used Python 2 ``print >> f`` statements (a SyntaxError under
    Python 3, which the rest of this file targets) and left files unclosed on error;
    converted to ``print(..., file=...)`` inside ``with`` blocks. Output is unchanged.

    :param data: weka Instances; class attribute is set to the last attribute here
    :param rnm: filename prefix (also used as the plot title)
    :return: percent-correct accuracy as a string
    """
    data.class_is_last()
    # RandomTree wrapped in a filter that removes the first attribute (e.g. an ID column).
    fc = FilteredClassifier()
    fc.classifier = Classifier(classname="weka.classifiers.trees.RandomTree",
                               options=["-K", "0", "-M", "1.0", "-V", "0.001", "-S", "1"])
    fc.filter = Filter(classname="weka.filters.unsupervised.attribute.Remove",
                       options=["-R", "first"])
    pred_output = PredictionOutput(
        classname="weka.classifiers.evaluation.output.prediction.CSV",
        options=["-p", "1"])
    folds = 10
    evl = Evaluation(data)
    evl.crossvalidate_model(fc, data, folds, Random(1), pred_output)
    # Final model built on the full dataset (cross-validation uses per-fold copies).
    fc.build_classifier(data)

    with open(rnm + '_RT_Tree.txt', 'w') as f0:
        print("Filename: ", rnm, file=f0)
        print('\n\n', file=f0)
        print(str(fc), file=f0)

    with open(rnm + '_RT_Prediction.txt', 'w') as f1:
        print('Filename:', rnm, file=f1)
        print('Prediction Summary:', (pred_output.buffer_content()), file=f1)

    with open(rnm + '_RT_Evaluation.txt', 'w') as f2:
        print('Filename:', rnm, file=f2)
        print('Evaluation Summary:', (evl.summary()), file=f2)
        print('\n\n\n', file=f2)
        print((evl.class_details()), file=f2)

    plot_roc(evl, class_index=[0, 1], title=rnm, key_loc='best',
             outfile=rnm + '_RT_ROC.png', wait=False)
    value_RT = str(evl.percent_correct)
    return value_RT
def run_ibk(file):
    """Run a 10-fold cross-validation of IBk (k=3) on an ARFF file and write the
    evaluation summary and per-instance predictions next to the input file.

    :param file: pathlib.Path to the ARFF file (class attribute assumed first)
    """
    # Derive the bare filename and its containing directory from the Path object.
    filename = file.parts[-1]
    parent_dir = file.parents[0]
    print("Running IBk on %s" % filename)

    # Guard clause: only ARFF input is supported.
    if not filename.endswith(".arff"):
        print("%s not ARFF file." % filename)
        return
    filename_base = filename[:-5]  # strip the '.arff' suffix

    # Load the data; the class attribute is the first one.
    data = load_Arff_file(file)
    data.class_is_first()

    # k-nearest-neighbours with k = 3.
    cls = Classifier(classname="weka.classifiers.lazy.IBk", options=["-K", "3"])

    # Collect per-instance predictions during cross-validation.
    pout = PredictionOutput(
        classname="weka.classifiers.evaluation.output.prediction.PlainText")
    evaluation = Evaluation(data)
    evaluation.crossvalidate_model(cls, data, 10, Random(1), output=pout)

    # Persist evaluation summary and predictions.
    output_eval(evaluation, parent_dir / (filename_base + "_eval_results.txt"))
    output_pred(pout, parent_dir / (filename_base + "_pred_results.txt"))

    print("IBk complete")
def do_temporal_cv(t_selector, instances, num_folds):
    """Temporal (forward-chaining) cross-validation: for each fold count f, train on
    earlier instances and evaluate on later ones, then report the average F-measure
    over all splits with non-zero precision and recall.

    Fix: the original was Python 2-only (``xrange`` and print statements), a
    SyntaxError under Python 3 which the rest of this file targets; converted to
    ``range`` and ``print()``. Logic and output are unchanged.

    :param t_selector: classifier exposing buildClassifier (Java API)
    :param instances: full dataset, ordered by time
    :param num_folds: maximum number of temporal folds
    """
    num_instances = instances.numInstances()
    results = []

    for f in range(2, num_folds + 1):
        print("fold:%d" % f)
        for pair in split_temporal_train_test(f, num_instances):
            # Train on the earlier window, test on the later one.
            train_set = Instances(instances, int(pair.train_start),
                                  int(pair.train_end - pair.train_start + 1))
            test_set = Instances(instances, int(pair.test_start),
                                 int(pair.test_end - pair.test_start + 1))
            t_selector.buildClassifier(train_set)
            e = Evaluation(train_set)
            e.evaluateModel(t_selector, test_set)
            # Keep only splits where the positive class was predicted at all.
            if e.recall(0) > 0 and e.precision(0) > 0:
                results.append(Result(instances.numAttributes(), e))

    # Average precision/recall over the retained splits.
    # NOTE(review): if every split has zero precision or recall, results is empty
    # and the divisions below raise ZeroDivisionError — confirm that is intended.
    sum_precision = 0
    sum_recall = 0
    for r in results:
        sum_precision += r.precision
        sum_recall += r.recall
    precision = sum_precision * 1.0 / len(results)
    recall = sum_recall * 1.0 / len(results)
    avg_fmeasure = harmonic_mean([precision, recall])
    print("f_measure:%.2f" % avg_fmeasure)
def DecisionTree(rnd_data, folds, seed, data):
    """Manual k-fold cross-validation of a J48 decision tree.

    The dataset is split into `folds` contiguous slices of rnd_data (assumed
    already randomized with `seed`); each slice serves once as the test fold
    while the remainder is the training fold. All fold results accumulate in
    a single Evaluation object, whose summary is printed at the end.

    :param rnd_data: randomized weka Instances used for the CV splits
    :param folds: number of cross-validation folds
    :param seed: randomization seed (reported only; shuffling happened upstream)
    :param data: original dataset, used only for its relation name in the report
    """
    data_size = rnd_data.num_instances
    fold_size = math.floor(data_size / folds)

    # cross-validation
    evaluation = Evaluation(rnd_data)
    for i in range(folds):
        this_fold = fold_size
        test_start = i * fold_size
        test_end = (test_start + fold_size)
        # Last fold absorbs the remainder when data_size is not divisible by folds.
        if ((data_size - test_end) / fold_size < 1):
            this_fold = data_size - test_start
        test = Instances.copy_instances(rnd_data, test_start, this_fold)  # generate validation fold
        # Training fold = everything outside [test_start, test_end).
        if i == 0:
            train = Instances.copy_instances(rnd_data, test_end, data_size - test_end)
        else:
            train_1 = Instances.copy_instances(rnd_data, 0, test_start)
            train_2 = Instances.copy_instances(rnd_data, test_end, data_size - test_end)
            train = Instances.append_instances(train_1, train_2)  # generate training fold

        # build and evaluate classifier
        cls = Classifier(classname="weka.classifiers.trees.J48")
        cls.build_classifier(train)  # build classifier on training set
        evaluation.test_model(cls, test)  # test classifier on validation/test set

    print("")
    print("=== Decision Tree ===")
    print("Classifier: " + cls.to_commandline())
    print("Dataset: " + data.relationname)
    print("Folds: " + str(folds))
    print("Seed: " + str(seed))
    print("")
    print(
        evaluation.summary("=== " + str(folds) + "-fold Cross-Validation ==="))
def training(self):
    """Train one classifier per missing-data pattern.

    Pipeline: impute the data, select features, enumerate the missing
    patterns, and for each pattern train a classifier on the observed
    features only; each trained classifier is stored with a weight of
    1 - MAE from a 15-fold cross-validation.
    """
    # Data preparation: impute missing values.
    self.imp = Imputation(self.data)
    # Feature selection on the imputed data.
    self.features = FeatureSelection(self.imp.imputed_data)
    data_selected = self.features.data_selected
    self.selected_features = self.features.selected_features
    # Find the missing-value patterns present in the raw data.
    self.missing_patterns = MissingPatterns(self.data, self.selected_features).missing_patterns
    # Train one classifier per missing pattern.
    for mpi in self.missing_patterns:
        # cpi = features observed under this pattern (selected minus missing).
        cpi = set(self.selected_features) - set(mpi)
        data_temp = Instances.copy_instances(data_selected, from_row=0, num_rows=data_selected.num_instances)
        data_temp.class_is_last()
        # Restrict the training data to the observed features.
        data_temp = self.reduceData(data_temp, cpi, self.data)
        # Train the classifier on the imputed, reduced data.
        classifier = Classifier(classname=self.learn_class, options=self.options)
        classifier.build_classifier(data_temp)
        # Classifier weight: 15-fold CV, weight = 1 - mean absolute error.
        evl = Evaluation(data_temp)
        evl.crossvalidate_model(classifier, data_temp, 15, Random(1))
        # Add the trained classifier to the ensemble.
        my_classifier = MyClassifier(classifier, cpi, 1 - evl.mean_absolute_error)
        self.classifiers.add(my_classifier)
def crossEvaluate(self):
    """
    Evaluate classifier using cross-validation using K folds.

    Fixes: the original used a Python 2 print statement (SyntaxError under
    Python 3, which the rest of this file targets) and a bare ``except:``
    that also swallowed SystemExit/KeyboardInterrupt; narrowed to
    ``except Exception`` while keeping the best-effort False-on-failure
    contract.

    :return: True if the evaluation succeeded, False otherwise
    """
    if self.classifierInstance is not None:
        print('[Cross-validate data]')
        try:
            # Cross validation evaluation
            evaluatorInstance = Evaluation(self.classificationData)
            evaluatorInstance.crossvalidate_model(self.classifierInstance,
                                                  self.classificationData,
                                                  self.evaluationNumFolds,
                                                  Random(1))
            # Store evaluation results
            self.setEvaluationResults(evaluatorInstance)
            return True
        except Exception:
            return False
    return False
def baggin_smo(trainData, testData, params, exparams):
    """Build a Bagging ensemble over SMO, evaluate it on the training and test
    sets, and return the prediction buffers plus a training-evaluation summary.

    Fix: the original used Python 2 print statements (SyntaxError under Python 3,
    which the rest of this file targets); converted to ``print()``. Logic and
    output are unchanged.

    :param trainData: training Instances (Java API)
    :param testData: test Instances (Java API)
    :param params: [use-optimal-SMO flag, bag-size%/C, iterations]
    :param exparams: extra params for the optimal-SMO branch [_, C, kernel param]
    :return: (trainBuffer, testBuffer, trainSummary)
    """
    IsOptBagOnOptSMO = str2bool(params[0])
    if IsOptBagOnOptSMO:
        # optimal bagging is based on optimal SMO, thus uses the extra params
        # NOTE(review): kerType is read from params[0], the same flag as
        # IsOptBagOnOptSMO — it looks like it should be str2bool(exparams[0]);
        # behavior kept as-is, confirm with the parameter producer.
        kerType = str2bool(params[0])
        cValue = float(exparams[1])
        kerParam = float(exparams[2])
        if kerType:
            # RBF kernel
            kernel = RBFKernel()
            kernel.setGamma(kerParam)
        else:
            # Polynomial kernel
            kernel = PolyKernel()
            kernel.setExponent(kerParam)
        bagSizePercent = int(float(params[1]))
        numIterations = int(float(params[2]))
        smo = SMO()
        bagging = Bagging()
        smo.setKernel(kernel)
        smo.setC(cValue)
        bagging.setBagSizePercent(bagSizePercent)
        bagging.setNumIterations(numIterations)
        bagging.setClassifier(smo)
    else:
        # optimal bagging is based on linear SMO
        cValue = float(params[1])
        numIterations = int(float(params[2]))
        smo = SMO()
        bagging = Bagging()
        kernel = PolyKernel()
        smo.setKernel(kernel)
        smo.setC(cValue)
        bagging.setNumIterations(numIterations)
        bagging.setClassifier(smo)

    bagging.buildClassifier(trainData)  # only a trained classifier can be evaluated

    # evaluate it on the training set
    evaluation = Evaluation(trainData)
    (trainOutput, trainBuffer) = util.get_buffer_for_predictions(trainData)
    attRange = Range()  # no additional attributes output
    outputDistribution = Boolean(False)  # we don't want distribution
    evaluation.evaluateModel(bagging, trainData, [trainOutput, attRange, outputDistribution])
    print("--> Evaluation:\n")
    print(evaluation.toSummaryString())
    trainSummary = makeTrainEvalSummary(evaluation)

    # evaluate it on the test set
    evaluation = Evaluation(testData)
    (testOutput, testBuffer) = util.get_buffer_for_predictions(testData)
    attRange = Range()  # no additional attributes output
    outputDistribution = Boolean(False)  # we don't want distribution
    evaluation.evaluateModel(bagging, testData, [testOutput, attRange, outputDistribution])

    return trainBuffer, testBuffer, trainSummary
def CV5x2(dataset, algo, num_datasets):
    """Run a 2-fold cross-validation (seed 5) of the given classifier on an ARFF
    dataset, print summary/confusion-matrix/ROC, and return the class-1 ROC AUC.

    NOTE(review): despite the 5x2 name, only a single 2-fold CV with seed 5 is
    performed here — confirm against the caller whether repetition happens upstream.

    :param dataset: path to the ARFF file
    :param algo: weka classifier classname
    :param num_datasets: unused in this function
    :return: area under ROC for class index 1
    """
    arff_loader = Loader(classname="weka.core.converters.ArffLoader")
    data = arff_loader.load_file(dataset)
    data.class_is_last()

    model = Classifier(classname=algo)
    evaluation = Evaluation(data)
    evaluation.crossvalidate_model(model, data, 2, Random(5))

    print(evaluation.summary("=== " +str(algo)+ " on" + str(dataset) + " ===",False))
    print(evaluation.matrix("=== on click prediction(confusion matrix) ==="))
    print("For Algo"+ str(algo)+"areaUnderROC/1: for CV5x2 " + str(evaluation.area_under_roc(1)))
    return evaluation.area_under_roc(1)
def train(option, sym, num):
    """Train a regression model on a symbol's historical ARFF dataset and return
    the predictions for the last `num` instances.

    :param option: one of 'DecisionTable', 'SMOreg', 'LinearRegression',
        'GaussianProcesses' — selects the classifier command line
    :param sym: symbol name; selects HistSet/histSet_<sym>.arff
    :param num: number of trailing predictions to print and return
    :return: list with the last `num` predicted values
    """
    # Load the dataset for the requested symbol; class attribute is last.
    arff_path = os.path.join('HistSet', 'histSet_%s.arff' % sym)
    arff_loader = Loader("weka.core.converters.ArffLoader")
    dataset = arff_loader.load_file(arff_path)
    dataset.class_is_last()

    # Classifier command lines, keyed by the option name.
    cmd = {
        'DecisionTable': 'weka.classifiers.rules.DecisionTable -X 1 -S "weka.attributeSelection.BestFirst -D 1 -N 5"',
        'SMOreg': 'weka.classifiers.functions.SMOreg -C 1.0 -N 0 -I "weka.classifiers.functions.supportVector.RegSMOImproved -L 0.001 -W 1 -P 1.0E-12 -T 0.001 -V" -K "weka.classifiers.functions.supportVector.PolyKernel -C 250007 -E 1.0"',
        'LinearRegression': 'weka.classifiers.functions.LinearRegression -S 0 -R 1.0E-8',
        'GaussianProcesses': 'weka.classifiers.functions.GaussianProcesses -L 1.0 -N 0 -K "weka.classifiers.functions.supportVector.RBFKernel -C 250007 -G 1.0"',
    }
    cls = from_commandline(cmd[option], classname='weka.classifiers.Classifier')
    cls.build_classifier(dataset)

    # Evaluate on the training data itself and collect per-instance predictions.
    evaluation = Evaluation(dataset)
    evl = evaluation.test_model(cls, dataset)

    print('predictions (' + str(len(evl)) + '): ')
    for idx in range(num):
        print(evl[idx - num], end=' ')
    return evl[-num:]
def run_bayesNet(file):
    """Run a 10-fold cross-validation of a TAN-structured BayesNet on an ARFF file
    and write the evaluation and predictions into a bayesNet_results subdirectory.

    :param file: pathlib.Path to the ARFF file (class attribute assumed first)
    """
    # Split the Path into filename and parent directory.
    filename = file.parts[-1]
    parent_dir = file.parents[0]
    print("Running BayesNet on %s" % filename)

    # Guard clause: only ARFF input is supported.
    if not filename.endswith(".arff"):
        print("%s not ARFF file." % filename)
        return
    filename_base = filename[:-5]  # strip the '.arff' suffix

    # Load the data; the class attribute is the first one.
    data = load_Arff_file(file)
    data.class_is_first()

    # BayesNet with TAN structure search and a simple estimator.
    cls = Classifier(
        classname="weka.classifiers.bayes.BayesNet",
        options=[
            "-D", "-Q", "weka.classifiers.bayes.net.search.local.TAN", "--",
            "-P", "1", "-S", "BAYES", "-E",
            "weka.classifiers.bayes.net.estimate.SimpleEstimator", "--", "-A",
            "0.5"
        ])

    # Collect per-instance predictions during cross-validation.
    pout = PredictionOutput(
        classname="weka.classifiers.evaluation.output.prediction.PlainText")
    evaluation = Evaluation(data)
    evaluation.crossvalidate_model(cls, data, 10, Random(1), output=pout)

    print(evaluation.summary())
    print(evaluation.class_details())
    print(evaluation.confusion_matrix)

    # Results go into a dedicated subdirectory next to the input file.
    out_dir = parent_dir / "bayesNet_results"
    out_dir.mkdir(parents=True, exist_ok=True)

    output_eval(evaluation, out_dir / (filename_base + "_bayesNet_eval_results_TAN.txt"))
    output_pred(pout, out_dir / (filename_base + "_bayesNet_pred_results_TAN.txt"))

    print("BayesNet complete")
def experiment_file_random(path_features, path_folder_save_results, options,
                           classifier, fold, random, name):
    """Cross-validate a classifier on a feature file and persist the accuracy
    metrics plus the raw per-instance predictions as CSV files.

    :param path_features: path to the feature data file
    :param path_folder_save_results: output directory for result CSVs
    :param options: classifier option string (split into a list internally)
    :param classifier: weka classifier classname
    :param fold: number of cross-validation folds
    :param random: seed for the fold randomization
    :param name: base name for the output files
    """
    print(name + " Start: " + str(datetime.datetime.now()))
    started = datetime.datetime.now()

    # Build the classifier from classname + option string.
    cls = Classifier(classname=classifier,
                     options=weka.core.classes.split_options(options))

    d_results = {
        'percent_correct': [],
        'percent_incorrect': [],
        'confusion_matrix': []
    }

    # Load the data; class attribute is the last one.
    data = converters.load_any_file(path_features)
    data.class_is_last()

    # Cross-validate, collecting CSV predictions as we go.
    pout = PredictionOutput(
        classname="weka.classifiers.evaluation.output.prediction.CSV")
    evl = Evaluation(data)
    evl.crossvalidate_model(cls, data, fold, Random(random), pout)

    d_results['percent_correct'].append(evl.percent_correct)
    d_results['percent_incorrect'].append(evl.percent_incorrect)
    d_results['confusion_matrix'].append(
        evl.matrix())  # Generates the confusion matrix.

    # Metrics CSV.
    pd.DataFrame(data=d_results).to_csv(
        path_folder_save_results + '/' + str(name) + '.csv', index=False)

    # Predictions CSV.
    save = pout.buffer_content()
    check_folder_or_create(path_folder_save_results + '/' + 'prediction')
    with open(
            path_folder_save_results + '/' + 'prediction/' + str(name) +
            '.csv', 'w') as f:
        f.write(save)

    print(name + " End: " + str(datetime.datetime.now() - started))
def train_trees(data, attributes):
    """Train one unpruned J48 tree per attribute, treating each attribute in turn
    as the class, and cross-validate each tree (5-fold).

    Attributes with fewer than 5 non-NaN values are skipped; their slots in the
    returned lists hold None and their indices are collected separately.

    :param data: weka Instances whose class index is reassigned per attribute
    :param attributes: iterable of attribute names, parallel to data's attributes
    :return: (classifiers, evaluations, per-instance distributions, skipped indices)
    """
    clfs = []
    evls = []
    dt_y_hat = []
    unused_attributes = []

    for idx, att in enumerate(attributes):
        data.class_index = idx
        # How many instances actually carry a value for this attribute?
        n_real = np.count_nonzero(~np.isnan(data.values(idx)))
        if n_real < 5:
            # Too sparse to learn from — keep list positions aligned with None.
            unused_attributes.append(idx)
            print('Not using attribute {}, only {} real values\n\n'.format(
                att, n_real))
            clfs.append(None)
            evls.append(None)
            dt_y_hat.append(None)
            continue

        # Unpruned, binary-split J48 with a minimum of 2 instances per leaf.
        tree = Classifier(classname='weka.classifiers.trees.J48',
                          options=['-U', '-B', '-M', '2'])
        tree.build_classifier(data)
        ev = Evaluation(data)
        ev.crossvalidate_model(tree, data, 5, Random(1))

        dt_y_hat.append(tree.distributions_for_instances(data))
        clfs.append(tree)
        evls.append(ev)

    return clfs, evls, dt_y_hat, unused_attributes
def obtainBayesNet(file):
    """Cross-validate a BayesNet classifier on the named ARFF dataset and return
    its weighted-average ROC AUC.

    Fix: the original scraped the ROC figure out of the formatted class-details
    report with fixed character offsets (``float(info[406:411])``), which breaks
    as soon as the report layout or value widths shift. Read the weighted ROC
    area from the Evaluation API instead — this is the same "Weighted Avg" ROC
    column the report prints.

    :param file: dataset base name (without the .arff extension)
    :return: weighted area under ROC as a float
    """
    # The path of the arff extension file must be put.
    data = converters.load_any_file(folderPathOfArffFiles + file + ".arff")

    # In the case of this specific data set, the first two attributes were removed
    # since they represent the name and ranking which are unique values that would
    # affect the classification. Depending on the data set, certain attributes
    # must be removed.
    remove = Filter(classname="weka.filters.unsupervised.attribute.Remove",
                    options=["-R", "1-2"])
    remove.inputformat(data)
    data = remove.filter(data)

    # It is specified that the class value is the last attribute.
    data.class_is_last()

    # Define the classifier to be used.
    classifier = Classifier(classname="weka.classifiers.bayes.BayesNet")
    evaluation = Evaluation(data)
    evaluation.crossvalidate_model(classifier, data, kFold, Random(42))

    return evaluation.weighted_area_under_roc
def ClassifyParam(mode, binWidths):
    """For each bin width, train a Multinomial Naive Bayes text classifier
    (behind a StringToWordVector filter), time training / single-sample /
    batch evaluation, and append accuracy to classificationResults/AllVsAll.csv.

    Fixes: the original used Python 2 print statements (SyntaxError under
    Python 3, which the rest of this file targets), shadowed the ``file``
    builtin, and left the CSV unclosed on error; converted to ``print()`` and
    a ``with`` block. Output is unchanged.

    :param mode: evaluation mode; only 'normal' is handled here
    :param binWidths: iterable of bin widths selecting the train/test ARFFs
    """
    if not os.path.exists("classificationResults"):
        os.makedirs("classificationResults")

    if ("normal" in mode):
        with open("classificationResults/AllVsAll.csv", "w") as results_file:
            results_file.write("BinWidth, Accuracy\n")
            for binWidth in binWidths:
                train_set = "Data/arff/TrainSet_%s.arff" % (binWidth)
                test_set = "Data/arff/TestSet_%s.arff" % (binWidth)
                print("Loading Datasets...")
                train_data = converters.load_any_file(train_set)
                test_data = converters.load_any_file(test_set)
                # Set class attribute
                train_data.class_is_last()
                test_data.class_is_last()
                print("Dataset Loaded!")

                # FilteredClassifier: StringToWordVector + NaiveBayesMultinomial.
                classifier_name = "weka.classifiers.meta.FilteredClassifier"
                classifier = Classifier(classname=classifier_name, options=[
                    "-F", "weka.filters.unsupervised.attribute.StringToWordVector -R first-last -W 1000 -C -T -N 1 -stemmer weka.core.stemmers.NullStemmer -M 1 -tokenizer \"weka.core.tokenizers.WordTokenizer -delimiters \\\" \\\\r\\\\n\\\\t.,;:\\\\\\\'\\\\\\\"()?!\\\"\"",
                    "-W", "weka.classifiers.bayes.NaiveBayesMultinomial"])

                # Time the training step.
                start_train = time.time()
                classifier.build_classifier(train_data)
                end_train = time.time()
                print("Train\t%s\t%s" % (binWidth, end_train - start_train))

                # Time classification of a single sample (the first test instance).
                for index, inst in enumerate(test_data):
                    if (index == 0):
                        start_sample = time.time()
                        classifier.classify_instance(inst)
                        end_sample = time.time()
                        print("Sample\t%s\t%s" % (binWidth, end_sample - start_sample))

                # Time batch evaluation over the whole test set.
                print("Evaluating w/ Multinomial Naive Bayes classifier. BinWidth = %s" % (binWidth))
                evaluation = Evaluation(test_data)
                start_batch = time.time()
                evaluation.test_model(classifier, test_data)
                end_batch = time.time()
                print("Batch\t%s\t%s" % (binWidth, end_batch - start_batch))

                print(evaluation.summary())
                acc = evaluation.percent_correct / 100.0
                print("Percent correct: " + str(acc))
                results_file.write("%s, %s\n" % (binWidth, acc))
def random_forest(trainData, testData, params, exparams):
    """Build a RandomForest, evaluate it on the training and test sets, and
    return the prediction buffers plus a training-evaluation summary.

    Fix: the original used Python 2 print statements (SyntaxError under
    Python 3, which the rest of this file targets); converted to ``print()``.
    Logic and output are unchanged.

    :param trainData: training Instances (Java API)
    :param testData: test Instances (Java API)
    :param params: [number of trees, number of features]
    :param exparams: unused in this function
    :return: (trainBuffer, testBuffer, trainSummary)
    """
    numTrees = int(float(params[0]))
    numFeatures = int(float(params[1]))
    randomforest = RandomForest()
    randomforest.setNumTrees(numTrees)
    randomforest.setNumFeatures(numFeatures)
    randomforest.buildClassifier(trainData)  # only a trained classifier can be evaluated

    # evaluate it on the training set
    evaluation = Evaluation(trainData)
    (trainOutput, trainBuffer) = util.get_buffer_for_predictions(trainData)
    attRange = Range()  # no additional attributes output
    outputDistribution = Boolean(False)  # we don't want distribution
    evaluation.evaluateModel(randomforest, trainData, [trainOutput, attRange, outputDistribution])
    print("--> Evaluation:\n")
    print(evaluation.toSummaryString())
    trainSummary = makeTrainEvalSummary(evaluation)

    # evaluate it on the test set
    evaluation = Evaluation(testData)
    (testOutput, testBuffer) = util.get_buffer_for_predictions(testData)
    attRange = Range()  # no additional attributes output
    outputDistribution = Boolean(False)  # we don't want distribution
    evaluation.evaluateModel(randomforest, testData, [testOutput, attRange, outputDistribution])

    return trainBuffer, testBuffer, trainSummary
def logistic(trainData, testData, params, exparams):
    """Build a Logistic regression model, evaluate it on the training and test
    sets, and return the prediction buffers plus a training-evaluation summary.

    Fix: the original used Python 2 print statements (SyntaxError under
    Python 3, which the rest of this file targets); converted to ``print()``.
    Logic and output are unchanged.

    :param trainData: training Instances (Java API)
    :param testData: test Instances (Java API)
    :param params: [ridge, max iterations]
    :param exparams: unused in this function
    :return: (trainBuffer, testBuffer, trainSummary)
    """
    ridge = float(params[0])
    maxIt = int(float(params[1]))
    print("Ridge=%s, maxIt=%s" % (str(ridge), str(maxIt)))
    logistic = Logistic()
    logistic.setMaxIts(maxIt)
    logistic.setRidge(ridge)
    logistic.buildClassifier(trainData)  # only a trained classifier can be evaluated

    # evaluate it on the training set
    evaluation = Evaluation(trainData)
    (trainOutput, trainBuffer) = util.get_buffer_for_predictions(trainData)
    attRange = Range()  # no additional attributes output
    outputDistribution = Boolean(False)  # we don't want distribution
    evaluation.evaluateModel(logistic, trainData, [trainOutput, attRange, outputDistribution])
    print("--> Evaluation:\n")
    print(evaluation.toSummaryString())
    trainSummary = makeTrainEvalSummary(evaluation)

    # evaluate it on the test set
    evaluation = Evaluation(testData)
    (testOutput, testBuffer) = util.get_buffer_for_predictions(testData)
    attRange = Range()  # no additional attributes output
    outputDistribution = Boolean(False)  # we don't want distribution
    evaluation.evaluateModel(logistic, testData, [testOutput, attRange, outputDistribution])

    return trainBuffer, testBuffer, trainSummary
def adaboostM1_simple_logistic(trainData, testData, params, exparams):
    """Build an AdaBoostM1 ensemble over SimpleLogistic, evaluate it on the
    training and test sets, and return the prediction buffers plus a
    training-evaluation summary.

    Fix: the original used Python 2 print statements (SyntaxError under
    Python 3, which the rest of this file targets); converted to ``print()``.
    Logic and output are unchanged.

    :param trainData: training Instances (Java API)
    :param testData: test Instances (Java API)
    :param params: [use-optimal-SimpleLogistic flag, threshold/iterations, iterations]
    :param exparams: extra params for the optimal branch [heuristic stop, boosting iters]
    :return: (trainBuffer, testBuffer, trainSummary)
    """
    IsOptBoostOnOptSimpLog = str2bool(params[0])
    simplelogistic = SimpleLogistic()
    adaboostm = AdaBoostM1()
    if IsOptBoostOnOptSimpLog:
        # optimal adaboost is based on optimal simple logistic
        heuristicStop = int(float(exparams[0]))
        numBoostingIterations = int(float(exparams[1]))
        weightThreshold = int(float(params[1]))
        numIterations = int(float(params[2]))
        simplelogistic.setHeuristicStop(heuristicStop)
        simplelogistic.setNumBoostingIterations(numBoostingIterations)
        adaboostm.setWeightThreshold(weightThreshold)
        adaboostm.setNumIterations(numIterations)
    else:
        numBoostingIterations = int(float(params[1]))
        numIterations = int(float(params[2]))
        simplelogistic.setNumBoostingIterations(numBoostingIterations)
        adaboostm.setNumIterations(numIterations)
    adaboostm.setClassifier(simplelogistic)
    adaboostm.buildClassifier(trainData)  # only a trained classifier can be evaluated

    # evaluate it on the training set
    evaluation = Evaluation(trainData)
    (trainOutput, trainBuffer) = util.get_buffer_for_predictions(trainData)
    attRange = Range()  # no additional attributes output
    outputDistribution = Boolean(False)  # we don't want distribution
    evaluation.evaluateModel(adaboostm, trainData, [trainOutput, attRange, outputDistribution])
    print("--> Evaluation:\n")
    print(evaluation.toSummaryString())
    trainSummary = makeTrainEvalSummary(evaluation)

    # evaluate it on the test set
    evaluation = Evaluation(testData)
    (testOutput, testBuffer) = util.get_buffer_for_predictions(testData)
    attRange = Range()  # no additional attributes output
    outputDistribution = Boolean(False)  # we don't want distribution
    evaluation.evaluateModel(adaboostm, testData, [testOutput, attRange, outputDistribution])

    return trainBuffer, testBuffer, trainSummary
def runClassifierAlgo(algo, training_filename, test_filename, do_model, do_eval, do_predict):
    """ Run classifier algorithm <algo> on training data in <training_filename> to build a model
        then run in on data in <test_filename> (equivalent of WEKA "Supplied test set")

    Fix: the original used Python 2 print statements (SyntaxError under Python 3,
    which the rest of this file targets); converted to ``print()``. Logic and
    output are unchanged.

    :param algo: instantiated weka classifier (Java API)
    :param training_filename: ARFF file used to build the model
    :param test_filename: ARFF file used as the supplied test set
    :param do_model: include the model string in the verbose output
    :param do_eval: run the evaluation
    :param do_predict: collect the predictions
    :return: dict with 'model', 'eval' and 'predict' strings
    """
    training_file = FileReader(training_filename)
    training_data = Instances(training_file)
    test_file = FileReader(test_filename)
    test_data = Instances(test_file)

    # set the class Index - the index of the dependent variable
    training_data.setClassIndex(class_index)
    test_data.setClassIndex(class_index)

    # create the model
    algo.buildClassifier(training_data)

    evaluation = None
    # only a trained classifier can be evaluated
    if do_eval or do_predict:
        evaluation = Evaluation(test_data)
        buffer = StringBuffer()  # buffer for the predictions
        attRange = Range()  # no additional attributes output
        outputDistribution = Boolean(False)  # we don't want distribution
        evaluation.evaluateModel(algo, test_data, [buffer, attRange, outputDistribution])

    if verbose:
        if do_model:
            print("--> Generated model:\n")
            print(algo.toString())
        if do_eval:
            print("--> Evaluation:\n")
            print(evaluation.toSummaryString())
        if do_predict:
            print("--> Predictions:\n")
            print(buffer)

    return {"model": str(algo),
            "eval": str(evaluation.toSummaryString()),
            "predict": str(buffer)}
print("\nLoading dataset: " + fname + "\n") loader = Loader(classname="weka.core.converters.ArffLoader") data = loader.load_file(fname) data.class_is_last() classifiers = [ "weka.classifiers.bayes.NaiveBayes", "weka.classifiers.lazy.IBk", "weka.classifiers.trees.J48" ] # cross-validate classifiers for classifier in classifiers: # classifier itself cls = Classifier(classname=classifier) evl = Evaluation(data) evl.crossvalidate_model(cls, data, 10, Random(1)) print("%s: %0.0f%%" % (classifier, evl.percent_correct)) # meta with cfssubseteval meta = SingleClassifierEnhancer(classname="weka.classifiers.meta.AttributeSelectedClassifier") meta.options = \ ["-E", "weka.attributeSelection.CfsSubsetEval", "-S", "weka.attributeSelection.BestFirst", "-W", classifier] evl = Evaluation(data) evl.crossvalidate_model(meta, data, 10, Random(1)) print("%s (cfs): %0.0f%%" % (classifier, evl.percent_correct)) # meta with wrapper meta = SingleClassifierEnhancer(classname="weka.classifiers.meta.AttributeSelectedClassifier") meta.options = \ ["-E", "weka.attributeSelection.WrapperSubsetEval -B " + classifier,
# Script: 10-fold cross-validation of OneR on weather.nominal, then a final
# model built on the full dataset.
# NOTE(review): data_dir and os are referenced before the imports below —
# presumably defined/imported earlier in the original file; verify.
if data_dir is None:
    data_dir = "." + os.sep + "data"

import os
import weka.core.jvm as jvm
from weka.core.converters import Loader
from weka.core.classes import Random
from weka.classifiers import Classifier, Evaluation
from weka.filters import Filter

jvm.start()

# load weather.nominal
loader = Loader(classname="weka.core.converters.ArffLoader")
fname = data_dir + os.sep + "weather.nominal.arff"
print("\nLoading dataset: " + fname + "\n")
data = loader.load_file(fname)
# NOTE(review): set_class_index/to_summary are older-wrapper spellings; other
# snippets in this file use class_is_last()/summary() — confirm wrapper version.
data.set_class_index(data.num_attributes() - 1)

# perform 10-fold cross-validation
cls = Classifier(classname="weka.classifiers.rules.OneR")
evl = Evaluation(data)
evl.crossvalidate_model(cls, data, 10, Random(1))
print("10-fold cross-validation:\n" + evl.to_summary())

# build model on full dataset and output it
cls.build_classifier(data)
print("Model:\n\n" + str(cls))

jvm.stop()
# check commandline parameters if (not (len(sys.argv) == 2)): print "Usage: UsingJ48Ext.py <ARFF-file>" sys.exit() # load data file print "Loading data..." file = FileReader(sys.argv[1]) data = Instances(file) # set the class Index - the index of the dependent variable data.setClassIndex(data.numAttributes() - 1) # create the model evaluation = Evaluation(data) buffer = StringBuffer() # buffer for the predictions attRange = Range() # no additional attributes output outputDistribution = Boolean(False) # we don't want distribution j48 = J48() j48.buildClassifier(data) # only a trained classifier can be evaluated evaluation.evaluateModel(j48, data, [buffer, attRange, outputDistribution]) # print out the built model print "--> Generated model:\n" print j48 print "--> Evaluation:\n" print evaluation.toSummaryString() print "--> Predictions:\n"
# Script: cross-validate VotedPerceptron and SMO on four UCI datasets and
# print each dataset/classifier accuracy.
from weka.classifiers import Classifier, Evaluation, CostMatrix, PredictionOutput

jvm.start()

datasets = [
    "ionosphere.arff",
    "credit-g.arff",
    "breast-cancer.arff",
    "diabetes.arff"
]
classifiers = [
    "weka.classifiers.functions.VotedPerceptron",
    "weka.classifiers.functions.SMO",
]

for dataset in datasets:
    # load dataset
    fname = data_dir + os.sep + dataset
    loader = Loader(classname="weka.core.converters.ArffLoader")
    data = loader.load_file(fname)
    data.set_class_index(data.num_attributes() - 1)
    for classifier in classifiers:
        # cross-validate classifier
        cls = Classifier(classname=classifier)
        evl = Evaluation(data)
        evl.crossvalidate_model(cls, data, 10, Random(1))
        # NOTE(review): percent_correct is called as a method here, but other
        # snippets in this file use it as a property — looks like an older
        # wrapper API; confirm against the installed python-weka-wrapper version.
        print("%s / %s: %0.1f%%" % (dataset, classifier, evl.percent_correct()))

jvm.stop()
# Script: scatter-plot two iris attributes, tag the data with J48
# classifications/errors via the AddClassification filter, then evaluate J48
# on the training data and plot its classifier errors.
print("\nLoading dataset: " + fname + "\n")
loader = Loader(classname="weka.core.converters.ArffLoader")
data = loader.load_file(fname)
# NOTE(review): set_class_index / get_attribute_by_name / set_inputformat /
# evl.predictions() are older-wrapper spellings; other snippets in this file
# use the newer property-style API — confirm wrapper version.
data.set_class_index(data.num_attributes() - 1)

# plot petalwidth vs petallength
pld.scatter_plot(
    data, data.get_attribute_by_name("petalwidth").get_index(),
    data.get_attribute_by_name("petallength").get_index(),
    wait=False)

# add classifier errors to dataset
addcls = Filter(
    classname="weka.filters.supervised.attribute.AddClassification",
    options=["-W", "weka.classifiers.trees.J48", "-classification", "-error"])
addcls.set_inputformat(data)
filtered = addcls.filter(data)
print(filtered)

# build J48 and evaluate it on the training data itself
cls = Classifier(classname="weka.classifiers.trees.J48")
cls.build_classifier(data)
evl = Evaluation(data)
evl.test_model(cls, data)

# plot classifier errors
plc.plot_classifier_errors(evl.predictions(), wait=True)
jvm.stop()
def run(self):
    """Train a classifier for this question, evaluate it on the test split,
    optionally predict for a patient, and serialize the trained model.

    Fixes: the original used Python 2 print statements (SyntaxError under
    Python 3, which the rest of this file targets), contained a string literal
    broken across lines, and read the temporary model file without a context
    manager; converted to ``print()`` and ``with``. Logic is unchanged.

    :return: False when there is not enough data; True otherwise
    """
    # Attach this thread to the JVM.
    javabridge.attach()

    # Debug output of the configured run.
    print("Classifier")
    print(self.classifier)
    print("Params")
    print(self.parameters)
    print("Model Params")
    print(self.modelParams)

    # Get data for testing and learning.
    learnerData = self.retrieveData(self.questionID, "learner")
    testData = self.retrieveData(self.questionID, 'test')
    masterData = self.retrieveData(self.questionID, 'all')
    masterData = self.addNominals(masterData)

    # Check if there is enough correct data to run.
    if learnerData.num_instances < 1 or testData.num_instances < 1:
        self.status = self.config.NOT_ENOUGH_DATA
        return False

    # If this is a prediction and there is a valid patient, extend masterData
    # with the patient's nominal values and instance.
    patientObj = self.buildPatientObject()
    patientInstance = None
    if (patientObj is not None) and (self.predict == 1):
        masterData = self.addPatientNominals(patientObj, masterData)
        patientInstance = self.createPatientInstance(patientObj, masterData)
        masterData.add_instance(patientInstance)
    elif (patientObj is None) and (self.predict == 1):
        print('No patient defined for prediction. Exiting')
        return True

    # Rebuild learner/test datasets on masterData's (now consistent) header.
    masterData.delete()
    learner = masterData.copy_instances(masterData, 0, 0)
    test = masterData.copy_instances(masterData, 0, 0)
    self.addInstancesToDataset(learnerData, learner)
    self.addInstancesToDataset(testData, test)

    # Instantiate and train the classifier.
    self.cls = Classifier(classname=self.classifier, options=self.parameters)
    self.cls.build_classifier(learner)

    # Evaluate on the test split.
    evl = Evaluation(learner)
    evl.test_model(self.cls, test)

    # Store accuracy and the 2x2 confusion matrix (as JSON).
    self.acc = evl.percent_correct
    self.val = None
    confusionMatrix = [
        [evl.confusion_matrix[0][0], evl.confusion_matrix[0][1]],
        [evl.confusion_matrix[1][0], evl.confusion_matrix[1][1]],
    ]
    self.matrix = json.dumps(confusionMatrix)

    # If this is a prediction, classify the patient instance
    # (masterData was emptied above, so the patient is instance 0).
    if (patientObj is not None) and (self.predict == 1):
        masterData.add_instance(patientInstance)
        print("Running prediction on patient: ")
        print(masterData.get_instance(0))
        self.prediction = self.cls.classify_instance(masterData.get_instance(0))

    # Serialize the model to a temporary file, read it back, then remove it.
    fileName = str(self.questionID) + self.algorithm + ".model"
    serialization.write(fileName, self.cls)
    self.model = None
    with open(fileName, 'rb') as f:
        self.model = f.read()
    os.remove(fileName)

    # Set status to awaiting feedback.
    self.status = self.config.AWAITING_FEEDBACK_STATUS
    return True
# Script: 10-fold cross-validation of NaiveBayes on weather.nominal with
# per-instance prediction output (including class distributions) and an ROC plot.
data_dir = os.environ.get("WEKAMOOC_DATA")
if data_dir is None:
    data_dir = "." + os.sep + "data"

import weka.core.jvm as jvm
from weka.core.converters import Loader
from weka.classifiers import Classifier, Evaluation, PredictionOutput
from weka.core.classes import Random
import weka.plot.classifiers as plc

jvm.start()

# load weather.nominal
fname = data_dir + os.sep + "weather.nominal.arff"
print("\nLoading dataset: " + fname + "\n")
loader = Loader(classname="weka.core.converters.ArffLoader")
data = loader.load_file(fname)
data.class_is_last()

# cross-validate NaiveBayes, capturing predictions with class distributions
cls = Classifier(classname="weka.classifiers.bayes.NaiveBayes")
pout = PredictionOutput(classname="weka.classifiers.evaluation.output.prediction.PlainText",
                        options=["-distribution"])
evl = Evaluation(data)
evl.crossvalidate_model(cls, data, 10, Random(1), pout)
print(evl.summary())
print(evl.matrix())
print(pout)

# plot the ROC curve (blocks until the window is closed)
plc.plot_roc(evl, wait=True)
jvm.stop()