def try_params(n_instances, params, base, train, valid, test, istest):
    """Wrap ``base`` in an AttributeSelectedClassifier using CfsSubsetEval
    and evaluate it.

    :param n_instances: training-set size hint; rounded to the nearest int
    :param params: hyperparameter dict; 'missingSeparate' toggles -M,
        'locallyPredictive' == False adds -L, 'search' selects the
        search strategy (GreedyStepwise, otherwise BestFirst)
    :param base: base classifier to wrap
    :param train: training data
    :param valid: validation data
    :param test: test data
    :param istest: if True evaluate on the test set, otherwise
        train/validate with ``n_instances``
    :return: result from the corresponding evaluation helper
    """
    n_instances = int(round(n_instances))
    pprint(params)
    L = list()
    if params['missingSeparate'] == True:
        L.append("-M")
    if params['locallyPredictive'] == False:
        L.append("-L")
    if params['search'] == 'GreedyStepwise':
        param_search = gs.get_params()
        search = gs.get_class(param_search)
    else:
        param_search = bf.get_params()
        search = bf.get_class(param_search)
    evaluator = ASEvaluation(classname="weka.attributeSelection.CfsSubsetEval",
                             options=L)
    clf = Classifier(
        classname="weka.classifiers.meta.AttributeSelectedClassifier")
    clf.set_property("evaluator", evaluator.jobject)
    # BUG FIX: the configured search strategy was built but never attached
    # to the meta-classifier (every sibling helper sets it), so WEKA fell
    # back to its default search. Attach it explicitly.
    clf.set_property("search", search.jobject)
    clf.set_property("base", base.jobject)
    if istest:
        result = test_weka_classifier(clf, train, test)
    else:
        result = train_and_eval_weka_classifier(clf, train, valid, n_instances)
    return result
def test_classifier(dataset: Instances, classifier: Classifier, params: dict):
    """Grid-evaluate ``classifier`` over every combination in ``params``.

    For each combination the classifier's bean properties are set, it is
    built on the full dataset (training accuracy and tree size recorded via
    WEKA's ``measureTreeSize``), and then 10-fold cross-validated.

    :param dataset: data to train and evaluate on
    :param classifier: WEKA classifier whose properties are varied
    :param params: maps property name -> list of candidate values
    :return: defaultdict(list) with one entry per measurement column
    """
    names = list(params.keys())
    candidate_lists = list(params.values())
    results = defaultdict(list)
    for combo in itertools.product(*candidate_lists):
        results["numInstances"].append(dataset.num_instances)
        results["numAttributes"].append(dataset.num_attributes)
        for name, value in zip(names, combo):
            results[name].append(value)
            # WEKA float-typed properties need a Java float, not a double.
            classifier.set_property(
                name,
                typeconv.double_to_float(value) if isinstance(value, float)
                else value)
        evl = Evaluation(dataset)
        classifier.build_classifier(dataset)
        evl.test_model(classifier, dataset)
        results["Training_Accuracy"].append(evl.percent_correct)
        results["size"].append(
            int(javabridge.call(classifier.jobject, "measureTreeSize", "()D")))
        evl.crossvalidate_model(classifier, dataset, 10, Random(1))
        results["CV_Accuracy"].append(evl.percent_correct)
    return results
def get_evaluator(params, base):
    """Wrap ``base`` in an AttributeSelectedClassifier that scores
    attributes with SymmetricalUncertAttributeEval (Ranker search).

    :param params: hyperparameters; 'missing_merge' toggles the -M flag
    :param base: base classifier to wrap
    :return: configured AttributeSelectedClassifier
    """
    pprint(params)
    eval_opts = []
    if params['missing_merge'] == True:
        eval_opts.append("-M")
    # Only the Ranker search strategy is used with this evaluator.
    search = rk.get_search(rk.get_params())
    evaluator = ASEvaluation(
        classname="weka.attributeSelection.SymmetricalUncertAttributeEval",
        options=eval_opts)
    clf = Classifier(
        classname="weka.classifiers.meta.AttributeSelectedClassifier")
    clf.set_property("evaluator", evaluator.jobject)
    clf.set_property("search", search.jobject)
    clf.set_property("base", base.jobject)
    return clf
def try_params(n_instances, params, base, train, valid, test, istest):
    """Evaluate ``base`` wrapped with CorrelationAttributeEval attribute
    selection (Ranker search).

    :param n_instances: training-set size hint; rounded to the nearest int
    :param params: hyperparameters; 'outputDetailedInfo' toggles -D
    :param base: base classifier to wrap
    :param train: training data
    :param valid: validation data
    :param test: test data
    :param istest: True -> evaluate on the test set; False -> train/validate
    :return: result from the corresponding evaluation helper
    """
    n_instances = int(round(n_instances))
    pprint(params)
    eval_opts = []
    if params['outputDetailedInfo'] == True:
        eval_opts.append("-D")
    # NOTE(review): sibling helpers call rk.get_search() here; confirm that
    # rk.get_class() is the intended factory and not a stale copy-paste.
    search = rk.get_class(rk.get_params())
    evaluator = ASEvaluation(
        classname="weka.attributeSelection.CorrelationAttributeEval",
        options=eval_opts)
    clf = Classifier(
        classname="weka.classifiers.meta.AttributeSelectedClassifier")
    clf.set_property("evaluator", evaluator.jobject)
    clf.set_property("search", search.jobject)
    clf.set_property("base", base.jobject)
    if istest:
        return test_weka_classifier(clf, train, test)
    return train_and_eval_weka_classifier(clf, train, valid, n_instances)
def get_evaluator(params, base):
    """Build an AttributeSelectedClassifier around ``base`` using
    CorrelationAttributeEval with the Ranker search strategy.

    :param params: hyperparameters; 'outputDetailedInfo' toggles -D
    :param base: base classifier to wrap
    :return: configured AttributeSelectedClassifier
    """
    pprint(params)
    eval_opts = []
    if params['outputDetailedInfo'] == True:
        eval_opts.append("-D")
    search = rk.get_search(rk.get_params())
    evaluator = ASEvaluation(
        classname="weka.attributeSelection.CorrelationAttributeEval",
        options=eval_opts)
    clf = Classifier(
        classname="weka.classifiers.meta.AttributeSelectedClassifier")
    clf.set_property("evaluator", evaluator.jobject)
    clf.set_property("search", search.jobject)
    clf.set_property("base", base.jobject)
    return clf
def get_evaluator(params, base):
    """Build an AttributeSelectedClassifier around ``base`` using
    CfsSubsetEval.

    :param params: hyperparameters; 'missingSeparate' adds -M,
        'locallyPredictive' == False adds -L, 'search' selects
        GreedyStepwise (otherwise BestFirst)
    :param base: base classifier to wrap
    :return: configured AttributeSelectedClassifier
    """
    pprint(params)
    eval_opts = []
    if params['missingSeparate'] == True:
        eval_opts.append("-M")
    if params['locallyPredictive'] == False:
        eval_opts.append("-L")
    if params['search'] == 'GreedyStepwise':
        search = gs.get_search(gs.get_params())
    else:
        search = bf.get_search(bf.get_params())
    evaluator = ASEvaluation(
        classname="weka.attributeSelection.CfsSubsetEval", options=eval_opts)
    clf = Classifier(
        classname="weka.classifiers.meta.AttributeSelectedClassifier")
    clf.set_property("evaluator", evaluator.jobject)
    clf.set_property("search", search.jobject)
    clf.set_property("base", base.jobject)
    return clf
def try_params(n_instances, params, base, train, valid, test, istest):
    """Evaluate ``base`` wrapped with InfoGainAttributeEval attribute
    selection (plain Ranker search, default options).

    :param n_instances: training-set size hint; rounded to the nearest int
    :param params: hyperparameters; 'missingMerge' == False adds -M,
        'binarizeNumericAttributes' adds -B
    :param base: base classifier to wrap
    :param train: training data
    :param valid: validation data
    :param test: test data
    :param istest: True -> evaluate on the test set; False -> train/validate
    :return: result from the corresponding evaluation helper
    """
    n_instances = int(round(n_instances))
    pprint(params)
    eval_opts = []
    if params['missingMerge'] == False:
        eval_opts.append("-M")
    if params['binarizeNumericAttributes'] == True:
        eval_opts.append("-B")
    search = ASSearch(classname="weka.attributeSelection.Ranker")
    evaluator = ASEvaluation(
        classname="weka.attributeSelection.InfoGainAttributeEval",
        options=eval_opts)
    clf = Classifier(
        classname="weka.classifiers.meta.AttributeSelectedClassifier")
    clf.set_property("evaluator", evaluator.jobject)
    clf.set_property("search", search.jobject)
    clf.set_property("base", base.jobject)
    if istest:
        return test_weka_classifier(clf, train, test)
    return train_and_eval_weka_classifier(clf, train, valid, n_instances)
def try_params(n_instances, params, base, train, valid, test, istest):
    """Evaluate ``base`` wrapped with GainRatioAttributeEval attribute
    selection.

    :param n_instances: training-set size hint; rounded to the nearest int
    :param params: hyperparameters; 'missing_merge' toggles -M, 'search'
        selects GreedyStepwise, BestFirst or Ranker
    :param base: base classifier to wrap
    :param train: training data
    :param valid: validation data
    :param test: test data
    :param istest: True -> evaluate on the test set; False -> train/validate
    :return: result from the corresponding evaluation helper
    :raises ValueError: if params['search'] is not a recognised strategy
    """
    n_instances = int(round(n_instances))
    pprint(params)
    L = list()
    if params['missing_merge'] == True:
        L.append("-M")
    if params['search'] == 'GreedyStepwise':
        param_search = gs.get_params()
        search = gs.get_search(param_search)
    elif params['search'] == 'BestFirst':
        param_search = bf.get_params()
        search = bf.get_search(param_search)
    elif params['search'] == 'Ranker':
        param_search = rk.get_params()
        search = rk.get_search(param_search)
    else:
        # BUG FIX: an unrecognised value previously fell through and raised
        # a confusing NameError on 'search'; fail fast with a clear message.
        raise ValueError("unknown search strategy: %r" % (params['search'],))
    evaluator = ASEvaluation(
        classname="weka.attributeSelection.GainRatioAttributeEval", options=L)
    clf = Classifier(
        classname="weka.classifiers.meta.AttributeSelectedClassifier")
    clf.set_property("evaluator", evaluator.jobject)
    clf.set_property("search", search.jobject)
    clf.set_property("base", base.jobject)
    if istest:
        result = test_weka_classifier(clf, train, test)
    else:
        result = train_and_eval_weka_classifier(clf, train, valid, n_instances)
    return result
def get_evaluator(params, base):
    """Build an AttributeSelectedClassifier around ``base`` using
    InfoGainAttributeEval with the Ranker search strategy.

    :param params: hyperparameters; 'missingMerge' == False adds -M,
        'binarizeNumericAttributes' adds -B
    :param base: base classifier to wrap
    :return: configured AttributeSelectedClassifier
    """
    pprint(params)
    eval_opts = []
    if params['missingMerge'] == False:
        eval_opts.append("-M")
    if params['binarizeNumericAttributes'] == True:
        eval_opts.append("-B")
    search = rk.get_search(rk.get_params())
    evaluator = ASEvaluation(
        classname="weka.attributeSelection.InfoGainAttributeEval",
        options=eval_opts)
    clf = Classifier(
        classname="weka.classifiers.meta.AttributeSelectedClassifier")
    clf.set_property("evaluator", evaluator.jobject)
    clf.set_property("search", search.jobject)
    clf.set_property("base", base.jobject)
    return clf
def get_evaluator(params, base):
    """Build an AttributeSelectedClassifier around ``base`` using
    ReliefFAttributeEval with the Ranker search strategy.

    :param params: hyperparameters; 'weightByDistance' toggles -W,
        'sampleSize' -> -M, 'numNeighbours' -> -K, 'sigma' -> -A
    :param base: base classifier to wrap
    :return: configured AttributeSelectedClassifier
    """
    pprint(params)
    eval_opts = []
    if params['weightByDistance'] == True:
        eval_opts.append("-W")
    eval_opts += ["-M", str(params['sampleSize']),
                  "-K", str(params['numNeighbours']),
                  "-A", str(params['sigma'])]
    search = rk.get_search(rk.get_params())
    evaluator = ASEvaluation(
        classname="weka.attributeSelection.ReliefFAttributeEval",
        options=eval_opts)
    clf = Classifier(
        classname="weka.classifiers.meta.AttributeSelectedClassifier")
    clf.set_property("evaluator", evaluator.jobject)
    clf.set_property("search", search.jobject)
    clf.set_property("base", base.jobject)
    return clf
def use_classifier(data):
    """
    Uses the meta-classifier AttributeSelectedClassifier for attribute selection.

    :param data: the dataset to use
    :type data: Instances
    """
    print("\n1. Meta-classifier")
    meta = Classifier(
        classname="weka.classifiers.meta.AttributeSelectedClassifier")
    evaluator = ASEvaluation(classname="weka.attributeSelection.CfsSubsetEval")
    search = ASSearch(classname="weka.attributeSelection.GreedyStepwise",
                      options=["-B"])
    j48 = Classifier(classname="weka.classifiers.trees.J48")
    # Nesting options via escaped command-line strings is error prone;
    # assigning the Java objects through bean properties is simpler.
    meta.set_property("classifier", j48.jobject)
    meta.set_property("evaluator", evaluator.jobject)
    meta.set_property("search", search.jobject)
    evaluation = Evaluation(data)
    evaluation.crossvalidate_model(meta, data, 10, Random(1))
    print(evaluation.summary())
def use_classifier(data):
    """
    Uses the meta-classifier AttributeSelectedClassifier for attribute selection.

    :param data: the dataset to use
    :type data: Instances
    """
    print("\n1. Meta-classifier")
    # Build the meta-classifier and its three collaborators first, then
    # wire them together via bean properties — far less error prone than
    # escaping nested command-line option strings.
    attr_sel = Classifier(
        classname="weka.classifiers.meta.AttributeSelectedClassifier")
    cfs_eval = ASEvaluation(classname="weka.attributeSelection.CfsSubsetEval")
    greedy = ASSearch(classname="weka.attributeSelection.GreedyStepwise",
                      options=["-B"])
    tree = Classifier(classname="weka.classifiers.trees.J48")
    attr_sel.set_property("classifier", tree.jobject)
    attr_sel.set_property("evaluator", cfs_eval.jobject)
    attr_sel.set_property("search", greedy.jobject)
    evl = Evaluation(data)
    evl.crossvalidate_model(attr_sel, data, 10, Random(1))
    print(evl.summary())
def try_params(n_instances, params, base, train, valid, test, istest):
    """Evaluate ``base`` wrapped with ReliefFAttributeEval attribute
    selection (plain Ranker search, default options).

    :param n_instances: training-set size hint; rounded to the nearest int
    :param params: hyperparameters; 'weightByDistance' toggles -W,
        'sampleSize' -> -M, 'numNeighbours' -> -K, 'sigma' -> -A
    :param base: base classifier to wrap
    :param train: training data
    :param valid: validation data
    :param test: test data
    :param istest: True -> evaluate on the test set; False -> train/validate
    :return: result from the corresponding evaluation helper
    """
    n_instances = int(round(n_instances))
    pprint(params)
    eval_opts = []
    if params['weightByDistance'] == True:
        eval_opts.append("-W")
    eval_opts += ["-M", str(params['sampleSize']),
                  "-K", str(params['numNeighbours']),
                  "-A", str(params['sigma'])]
    search = ASSearch(classname="weka.attributeSelection.Ranker")
    evaluator = ASEvaluation(
        classname="weka.attributeSelection.ReliefFAttributeEval",
        options=eval_opts)
    clf = Classifier(
        classname="weka.classifiers.meta.AttributeSelectedClassifier")
    clf.set_property("evaluator", evaluator.jobject)
    clf.set_property("search", search.jobject)
    clf.set_property("base", base.jobject)
    if istest:
        return test_weka_classifier(clf, train, test)
    return train_and_eval_weka_classifier(clf, train, valid, n_instances)
def get_evaluator(params, base):
    """Build an AttributeSelectedClassifier around ``base`` using
    OneRAttributeEval with the Ranker search strategy.

    :param params: hyperparameters; 'use_training' toggles -D,
        'seed' -> -S, 'minimum_bucket' -> -B
    :param base: base classifier to wrap
    :return: configured AttributeSelectedClassifier
    """
    pprint(params)
    eval_opts = []
    if params['use_training'] == True:
        eval_opts.append("-D")
    eval_opts += ["-S", str(params['seed']),
                  "-B", str(params['minimum_bucket'])]
    search = rk.get_search(rk.get_params())
    evaluator = ASEvaluation(
        classname="weka.attributeSelection.OneRAttributeEval",
        options=eval_opts)
    clf = Classifier(
        classname="weka.classifiers.meta.AttributeSelectedClassifier")
    clf.set_property("evaluator", evaluator.jobject)
    clf.set_property("search", search.jobject)
    clf.set_property("base", base.jobject)
    return clf
def get_evaluator(params, base):
    """Build an AttributeSelectedClassifier around ``base`` using
    PrincipalComponents with the Ranker search strategy.

    :param params: hyperparameters; 'center' toggles -C,
        'max_a' -> -A, 'variance' -> -R
    :param base: base classifier to wrap
    :return: configured AttributeSelectedClassifier
    """
    pprint(params)
    eval_opts = []
    if params['center'] == True:
        eval_opts.append("-C")
    eval_opts += ["-A", str(params['max_a']),
                  "-R", str(params['variance'])]
    search = rk.get_search(rk.get_params())
    evaluator = ASEvaluation(
        classname="weka.attributeSelection.PrincipalComponents",
        options=eval_opts)
    clf = Classifier(
        classname="weka.classifiers.meta.AttributeSelectedClassifier")
    clf.set_property("evaluator", evaluator.jobject)
    clf.set_property("search", search.jobject)
    clf.set_property("base", base.jobject)
    return clf
def get_evaluator(params, base):
    """Build an AttributeSelectedClassifier around ``base`` using
    WrapperSubsetEval.

    :param params: hyperparameters; 'ev_measure' -> -E, 'seed' -> -R,
        'threshold' -> -T, 'search' selects GreedyStepwise, BestFirst
        or Ranker
    :param base: base classifier to wrap
    :return: configured AttributeSelectedClassifier
    :raises ValueError: if params['search'] is not a recognised strategy
    """
    pprint(params)
    L = list()
    L.append("-E")
    L.append(str(params['ev_measure']))
    L.append("-R")
    L.append(str(params['seed']))
    L.append("-T")
    L.append(str(params['threshold']))
    if params['search'] == 'GreedyStepwise':
        param_search = gs.get_params()
        search = gs.get_search(param_search)
    elif params['search'] == 'BestFirst':
        param_search = bf.get_params()
        search = bf.get_search(param_search)
    elif params['search'] == 'Ranker':
        param_search = rk.get_params()
        search = rk.get_search(param_search)
    else:
        # BUG FIX: an unrecognised value previously fell through and raised
        # a confusing NameError on 'search'; fail fast with a clear message.
        raise ValueError("unknown search strategy: %r" % (params['search'],))
    evaluator = ASEvaluation(
        classname="weka.attributeSelection.WrapperSubsetEval", options=L)
    clf = Classifier(
        classname="weka.classifiers.meta.AttributeSelectedClassifier")
    clf.set_property("evaluator", evaluator.jobject)
    clf.set_property("search", search.jobject)
    clf.set_property("base", base.jobject)
    return clf
def main():
    """
    Just runs some example code: loading datasets, building classifiers,
    evaluating them (test set, train/test split, cross-validation),
    meta-classifiers, plotting, and direct Java API access.
    """
    # load a dataset
    iris_file = helper.get_data_dir() + os.sep + "iris.arff"
    helper.print_info("Loading dataset: " + iris_file)
    loader = Loader("weka.core.converters.ArffLoader")
    iris_data = loader.load_file(iris_file)
    iris_data.class_is_last()

    # classifier help
    helper.print_title("Creating help string")
    classifier = Classifier(classname="weka.classifiers.trees.J48")
    print(classifier.to_help())

    # partial classname
    helper.print_title("Creating classifier from partial classname")
    clsname = ".J48"
    classifier = Classifier(classname=clsname)
    print(clsname + " --> " + classifier.classname)

    # classifier from commandline
    helper.print_title("Creating SMO from command-line string")
    cmdline = 'weka.classifiers.functions.SMO -K "weka.classifiers.functions.supportVector.NormalizedPolyKernel -E 3.0"'
    classifier = from_commandline(cmdline, classname="weka.classifiers.Classifier")
    classifier.build_classifier(iris_data)
    print("input: " + cmdline)
    print("output: " + classifier.to_commandline())
    print("model:\n" + str(classifier))

    # kernel classifier
    helper.print_title("Creating SMO as KernelClassifier")
    kernel = Kernel(
        classname="weka.classifiers.functions.supportVector.RBFKernel",
        options=["-G", "0.001"])
    classifier = KernelClassifier(classname="weka.classifiers.functions.SMO",
                                  options=["-M"])
    classifier.kernel = kernel
    classifier.build_classifier(iris_data)
    print("classifier: " + classifier.to_commandline())
    print("model:\n" + str(classifier))

    # build a classifier and output model
    helper.print_title("Training J48 classifier on iris")
    classifier = Classifier(classname="weka.classifiers.trees.J48")
    # Instead of using 'options=["-C", "0.3"]' in the constructor, we can also
    # set the "confidenceFactor" property of the J48 classifier itself.
    # However, being of type float rather than double, we need to convert it
    # to the correct type first using the double_to_float function:
    classifier.set_property("confidenceFactor", types.double_to_float(0.3))
    classifier.build_classifier(iris_data)
    print(classifier)
    print(classifier.graph)
    print(classifier.to_source("MyJ48"))
    plot_graph.plot_dot_graph(classifier.graph)

    # evaluate model on test set
    helper.print_title("Evaluating J48 classifier on iris")
    evaluation = Evaluation(iris_data)
    evl = evaluation.test_model(classifier, iris_data)
    print(evl)
    print(evaluation.summary())

    # evaluate model on train/test split
    helper.print_title("Evaluating J48 classifier on iris (random split 66%)")
    classifier = Classifier(classname="weka.classifiers.trees.J48",
                            options=["-C", "0.3"])
    evaluation = Evaluation(iris_data)
    evaluation.evaluate_train_test_split(classifier, iris_data, 66.0, Random(1))
    print(evaluation.summary())

    # load a dataset incrementally and build classifier incrementally
    helper.print_title("Build classifier incrementally on iris")
    helper.print_info("Loading dataset: " + iris_file)
    loader = Loader("weka.core.converters.ArffLoader")
    iris_inc = loader.load_file(iris_file, incremental=True)
    iris_inc.class_is_last()
    classifier = Classifier(
        classname="weka.classifiers.bayes.NaiveBayesUpdateable")
    classifier.build_classifier(iris_inc)
    for inst in loader:
        classifier.update_classifier(inst)
    print(classifier)

    # construct meta-classifiers
    helper.print_title("Meta classifiers")
    # generic FilteredClassifier instantiation
    print("generic FilteredClassifier instantiation")
    meta = SingleClassifierEnhancer(
        classname="weka.classifiers.meta.FilteredClassifier")
    meta.classifier = Classifier(
        classname="weka.classifiers.functions.LinearRegression")
    flter = Filter("weka.filters.unsupervised.attribute.Remove")
    flter.options = ["-R", "first"]
    meta.set_property("filter", flter.jobject)
    print(meta.to_commandline())
    # direct FilteredClassifier instantiation
    print("direct FilteredClassifier instantiation")
    meta = FilteredClassifier()
    meta.classifier = Classifier(
        classname="weka.classifiers.functions.LinearRegression")
    flter = Filter("weka.filters.unsupervised.attribute.Remove")
    flter.options = ["-R", "first"]
    meta.filter = flter
    print(meta.to_commandline())
    # generic Vote
    print("generic Vote instantiation")
    meta = MultipleClassifiersCombiner(classname="weka.classifiers.meta.Vote")
    classifiers = [
        Classifier(classname="weka.classifiers.functions.SMO"),
        Classifier(classname="weka.classifiers.trees.J48")
    ]
    meta.classifiers = classifiers
    print(meta.to_commandline())

    # cross-validate nominal classifier
    helper.print_title("Cross-validating NaiveBayes on diabetes")
    diabetes_file = helper.get_data_dir() + os.sep + "diabetes.arff"
    helper.print_info("Loading dataset: " + diabetes_file)
    loader = Loader("weka.core.converters.ArffLoader")
    diabetes_data = loader.load_file(diabetes_file)
    diabetes_data.class_is_last()
    classifier = Classifier(classname="weka.classifiers.bayes.NaiveBayes")
    pred_output = PredictionOutput(
        classname="weka.classifiers.evaluation.output.prediction.PlainText",
        options=["-distribution"])
    evaluation = Evaluation(diabetes_data)
    evaluation.crossvalidate_model(classifier, diabetes_data, 10, Random(42),
                                   output=pred_output)
    print(evaluation.summary())
    print(evaluation.class_details())
    print(evaluation.matrix())
    print("areaUnderPRC/0: " + str(evaluation.area_under_prc(0)))
    print("weightedAreaUnderPRC: " + str(evaluation.weighted_area_under_prc))
    print("areaUnderROC/1: " + str(evaluation.area_under_roc(1)))
    print("weightedAreaUnderROC: " + str(evaluation.weighted_area_under_roc))
    print("avgCost: " + str(evaluation.avg_cost))
    print("totalCost: " + str(evaluation.total_cost))
    print("confusionMatrix: " + str(evaluation.confusion_matrix))
    print("correct: " + str(evaluation.correct))
    print("pctCorrect: " + str(evaluation.percent_correct))
    print("incorrect: " + str(evaluation.incorrect))
    print("pctIncorrect: " + str(evaluation.percent_incorrect))
    print("unclassified: " + str(evaluation.unclassified))
    print("pctUnclassified: " + str(evaluation.percent_unclassified))
    print("coverageOfTestCasesByPredictedRegions: "
          + str(evaluation.coverage_of_test_cases_by_predicted_regions))
    print("sizeOfPredictedRegions: "
          + str(evaluation.size_of_predicted_regions))
    print("falseNegativeRate: " + str(evaluation.false_negative_rate(1)))
    print("weightedFalseNegativeRate: "
          + str(evaluation.weighted_false_negative_rate))
    print("numFalseNegatives: " + str(evaluation.num_false_negatives(1)))
    print("trueNegativeRate: " + str(evaluation.true_negative_rate(1)))
    print("weightedTrueNegativeRate: "
          + str(evaluation.weighted_true_negative_rate))
    print("numTrueNegatives: " + str(evaluation.num_true_negatives(1)))
    print("falsePositiveRate: " + str(evaluation.false_positive_rate(1)))
    print("weightedFalsePositiveRate: "
          + str(evaluation.weighted_false_positive_rate))
    print("numFalsePositives: " + str(evaluation.num_false_positives(1)))
    print("truePositiveRate: " + str(evaluation.true_positive_rate(1)))
    print("weightedTruePositiveRate: "
          + str(evaluation.weighted_true_positive_rate))
    print("numTruePositives: " + str(evaluation.num_true_positives(1)))
    print("fMeasure: " + str(evaluation.f_measure(1)))
    print("weightedFMeasure: " + str(evaluation.weighted_f_measure))
    print("unweightedMacroFmeasure: "
          + str(evaluation.unweighted_macro_f_measure))
    print("unweightedMicroFmeasure: "
          + str(evaluation.unweighted_micro_f_measure))
    print("precision: " + str(evaluation.precision(1)))
    print("weightedPrecision: " + str(evaluation.weighted_precision))
    print("recall: " + str(evaluation.recall(1)))
    print("weightedRecall: " + str(evaluation.weighted_recall))
    print("kappa: " + str(evaluation.kappa))
    print("KBInformation: " + str(evaluation.kb_information))
    print("KBMeanInformation: " + str(evaluation.kb_mean_information))
    print("KBRelativeInformation: " + str(evaluation.kb_relative_information))
    print("SFEntropyGain: " + str(evaluation.sf_entropy_gain))
    print("SFMeanEntropyGain: " + str(evaluation.sf_mean_entropy_gain))
    print("SFMeanPriorEntropy: " + str(evaluation.sf_mean_prior_entropy))
    print("SFMeanSchemeEntropy: " + str(evaluation.sf_mean_scheme_entropy))
    print("matthewsCorrelationCoefficient: "
          + str(evaluation.matthews_correlation_coefficient(1)))
    print("weightedMatthewsCorrelation: "
          + str(evaluation.weighted_matthews_correlation))
    print("class priors: " + str(evaluation.class_priors))
    print("numInstances: " + str(evaluation.num_instances))
    print("meanAbsoluteError: " + str(evaluation.mean_absolute_error))
    print("meanPriorAbsoluteError: "
          + str(evaluation.mean_prior_absolute_error))
    print("relativeAbsoluteError: " + str(evaluation.relative_absolute_error))
    print("rootMeanSquaredError: " + str(evaluation.root_mean_squared_error))
    print("rootMeanPriorSquaredError: "
          + str(evaluation.root_mean_prior_squared_error))
    print("rootRelativeSquaredError: "
          + str(evaluation.root_relative_squared_error))
    print("prediction output:\n" + str(pred_output))
    plot_cls.plot_roc(
        evaluation, title="ROC diabetes",
        class_index=range(0, diabetes_data.class_attribute.num_values),
        wait=False)
    plot_cls.plot_prc(
        evaluation, title="PRC diabetes",
        class_index=range(0, diabetes_data.class_attribute.num_values),
        wait=False)

    # train 2nd classifier on diabetes dataset
    classifier2 = Classifier(classname="weka.classifiers.trees.RandomForest")
    evaluation2 = Evaluation(diabetes_data)
    evaluation2.crossvalidate_model(classifier2, diabetes_data, 10, Random(42))
    plot_cls.plot_rocs({"NB": evaluation, "RF": evaluation2},
                       title="ROC diabetes", class_index=0, wait=False)
    plot_cls.plot_prcs({"NB": evaluation, "RF": evaluation2},
                       title="PRC diabetes", class_index=0, wait=False)

    # load a numeric dataset
    bolts_file = helper.get_data_dir() + os.sep + "bolts.arff"
    helper.print_info("Loading dataset: " + bolts_file)
    loader = Loader("weka.core.converters.ArffLoader")
    bolts_data = loader.load_file(bolts_file)
    bolts_data.class_is_last()

    # build a classifier and output model
    helper.print_title("Training LinearRegression on bolts")
    classifier = Classifier(
        classname="weka.classifiers.functions.LinearRegression",
        options=["-S", "1", "-C"])
    classifier.build_classifier(bolts_data)
    print(classifier)

    # cross-validate numeric classifier
    helper.print_title("Cross-validating LinearRegression on bolts")
    classifier = Classifier(
        classname="weka.classifiers.functions.LinearRegression",
        options=["-S", "1", "-C"])
    evaluation = Evaluation(bolts_data)
    evaluation.crossvalidate_model(classifier, bolts_data, 10, Random(42))
    print(evaluation.summary())
    print("correlationCoefficient: " + str(evaluation.correlation_coefficient))
    print("errorRate: " + str(evaluation.error_rate))
    helper.print_title("Header - bolts")
    print(str(evaluation.header))
    helper.print_title("Predictions on bolts")
    for index, pred in enumerate(evaluation.predictions):
        print(str(index + 1) + ": " + str(pred) + " -> error=" + str(pred.error))
    plot_cls.plot_classifier_errors(evaluation.predictions, wait=False)

    # train 2nd classifier and show errors in same plot
    classifier2 = Classifier(classname="weka.classifiers.functions.SMOreg")
    evaluation2 = Evaluation(bolts_data)
    evaluation2.crossvalidate_model(classifier2, bolts_data, 10, Random(42))
    plot_cls.plot_classifier_errors(
        {"LR": evaluation.predictions, "SMOreg": evaluation2.predictions},
        wait=False)

    # learning curve
    cls = [
        Classifier(classname="weka.classifiers.trees.J48"),
        Classifier(classname="weka.classifiers.bayes.NaiveBayesUpdateable")
    ]
    plot_cls.plot_learning_curve(cls, diabetes_data, increments=0.05,
                                 label_template="[#] !",
                                 metric="percent_correct", wait=True)

    # access classifier's Java API
    labor_file = helper.get_data_dir() + os.sep + "labor.arff"
    helper.print_info("Loading dataset: " + labor_file)
    loader = Loader("weka.core.converters.ArffLoader")
    labor_data = loader.load_file(labor_file)
    labor_data.class_is_last()
    helper.print_title("Using JRip's Java API to access rules")
    jrip = Classifier(classname="weka.classifiers.rules.JRip")
    jrip.build_classifier(labor_data)
    rset = jrip.jwrapper.getRuleset()
    # BUG FIX: xrange() is Python 2 only; this module otherwise targets
    # Python 3 (print() function calls), so xrange raised a NameError.
    for i in range(rset.size()):
        r = rset.get(i)
        print(str(r.toString(labor_data.class_attribute.jobject)))
def main():
    """Grid-model J48 cross-validation accuracy on several datasets and
    write linear-regression fits of the results to an Excel workbook,
    one sheet per dataset/pruning configuration plus combined sheets."""
    jvm.start()
    data_dir = r'C:\Program Files\Weka-3-8-4\data'
    datasets = ['breast-cancer.arff', 'credit-g.arff']
    outfile = "Modeling CV Accuracy.xlsx"
    loader = Loader()
    dataset_results = defaultdict(list)
    for datafile in datasets:
        dataset = loader.load_file(os.path.join(data_dir, datafile))
        dataset.class_is_last()
        # Three pruning configurations, each with its own parameter grid:
        #   1. reducedErrorPruning=False, unpruned=False
        #   2. reducedErrorPruning=True,  unpruned=False
        #   3. reducedErrorPruning=False, unpruned=True
        param_template = {
            "binarySplits": [True, False],
            "collapseTree": [True, False],
            "doNotMakeSplitPointActualValue": [True, False],
            "minNumObj": [*range(1, 6), *range(10, 101, 10)],
            "useLaplace": [True, False],
            "useMDLcorrection": [True, False]
        }
        base_name = datafile.split('.')[0]

        # 1. default pruning: grid also varies confidenceFactor
        classifier = Classifier(".J48")
        params = param_template.copy()
        params.update({"confidenceFactor": [x * 0.1 for x in range(1, 6)]})
        sheet_name = base_name + " rEP=F,unp=F"
        print("Modeling", sheet_name)
        eval_results = test_classifier(dataset, classifier, params)
        dataset_results[datafile].append(eval_results)
        fit_results = linear_regression(eval_results)
        write_to_excel(fit_results, outfile, sheet_name)

        # 2. reduced-error pruning: grid also varies numFolds
        classifier = Classifier(".J48")
        classifier.set_property("reducedErrorPruning", True)
        params = param_template.copy()
        params.update({"numFolds": [*range(2, 11)]})
        sheet_name = base_name + " rEP=T,unp=F"
        print("Modeling", sheet_name)
        eval_results = test_classifier(dataset, classifier, params)
        dataset_results[datafile].append(eval_results)
        fit_results = linear_regression(eval_results)
        write_to_excel(fit_results, outfile, sheet_name)

        # 3. unpruned tree: template grid only
        classifier = Classifier(".J48")
        classifier.set_property("reducedErrorPruning", False)
        classifier.set_property("unpruned", True)
        params = param_template.copy()
        sheet_name = base_name + " rEP=F,unp=T"
        print("Modeling", sheet_name)
        eval_results = test_classifier(dataset, classifier, params)
        dataset_results[datafile].append(eval_results)
        fit_results = linear_regression(eval_results)
        write_to_excel(fit_results, outfile, sheet_name)

    # Combined model across all datasets, one sheet per configuration.
    sheet_names = [
        "combined rEP=F,unp=F",
        "combined rEP=T,unp=F",
        "combined rEP=F,unp=T"
    ]
    for i in range(len(list(dataset_results.values())[0])):
        combined_results = defaultdict(list)
        for datafile in datasets:
            for key in dataset_results[datafile][i]:
                combined_results[key] += dataset_results[datafile][i][key]
        print("Modeling", sheet_names[i])
        fit_results = linear_regression(combined_results)
        write_to_excel(fit_results, outfile, sheet_names[i])
    jvm.stop()
def main():
    """Password-gated driver: preprocess the fraud dataset, then evaluate
    a suite of classifiers (most wrapped in InfoGain attribute selection)
    by cross-validation and by a 70/30 train/test split."""
    jvm.start(packages=True, max_heap_size="4g")
    print(
        "Hi! This is a protected command, please insert the password to proceed!"
    )
    # Up to three attempts; bail out entirely after the third failure.
    for attempt in range(3):
        password = input('')
        if password.strip() == 'DMMLproject':
            print("All good!")
            break
        if attempt == 2:
            print(
                "This command is protected and can be used only by an administrator, please use another command."
            )
            return
        print("Wrong password, please provide the correct password")

    loader = Loader(classname="weka.core.converters.ArffLoader")
    data = loader.load_file("dataSources/fraud.arff")
    print("Before Preprocessing: \n")
    classStats = data.attribute_stats(22).nominal_counts
    print("#instances(Class 0): ", classStats[0])
    print("#instances(Class 1): ", classStats[1])
    preProcessedData = preprocess(data)
    print("After Preprocessing: \n")
    classStats = preProcessedData.attribute_stats(
        preProcessedData.class_index).nominal_counts
    print("#instances(Class 0): ", classStats[0])
    print("#instances(Class 1): ", classStats[1])

    # Meta-classifier: keep the top-4 attributes ranked by information gain.
    classifier = Classifier(
        classname="weka.classifiers.meta.AttributeSelectedClassifier")
    aseval = ASEvaluation(
        classname="weka.attributeSelection.InfoGainAttributeEval")
    assearch = ASSearch(classname="weka.attributeSelection.Ranker",
                        options=["-N", "4"])
    classifier.set_property("evaluator", aseval.jobject)
    classifier.set_property("search", assearch.jobject)

    # Base learners to plug into the meta-classifier (base5 is run directly).
    base1 = Classifier(classname="weka.classifiers.bayes.NaiveBayes")
    base2 = Classifier(classname="weka.classifiers.trees.RandomForest",
                       options=[
                           "-P", "70", "-I", "30", "-num-slots", "1", "-K",
                           "0", "-M", "1.0", "-S", "1", "-depth", "50"
                       ])
    base3 = Classifier(classname="weka.classifiers.trees.J48",
                       options=["-C", "0.25", "-M", "2"])
    base4 = Classifier(classname="weka.classifiers.trees.J48",
                       options=["-U", "-M", "2"])
    base5 = Classifier(classname="weka.classifiers.trees.HoeffdingTree",
                       options=[
                           "-L", "2", "-S", "1", "-E", "1.0E7", "-H", "0.05",
                           "-M", "0.01", "-G", "200.0", "-N", "0.0"
                       ])
    base6 = Classifier(classname="weka.classifiers.lazy.IBk",
                       options=['-K', '1', '-W', '0'])
    base7 = Classifier(classname="weka.classifiers.bayes.BayesNet")

    # naive bayes - cross validate - traintestSplit
    print("----------NaiveBayes----------")
    classifier.set_property("classifier", base1.jobject)
    classify(preProcessedData, classifier, True, 'models/naiveBayes.model',
             splitPerc=70, randomSeed=10)
    classify(preProcessedData, classifier, False, 'models/naiveBayes.model',
             splitPerc=70, randomSeed=10)

    # random forest - cross validate - traintestSplit
    print("----------RandomForest----------")
    classifier.set_property("classifier", base2.jobject)
    classify(preProcessedData, classifier, True, 'models/randomForest.model',
             splitPerc=70, randomSeed=10)
    classify(preProcessedData, classifier, False, 'models/randomForest.model',
             splitPerc=70, randomSeed=10)

    # decision tree (with pruning) - cross validate - traintestSplit
    print("----------DecisionTree----------")
    classifier.set_property("classifier", base3.jobject)
    classify(preProcessedData, classifier, True, 'models/prunedJ48.model',
             splitPerc=70, randomSeed=10)
    classify(preProcessedData, classifier, False, 'models/prunedJ48.model',
             splitPerc=70, randomSeed=10)

    # decision tree (without pruning) - cross validate - traintestSplit
    print("----------DecisionTreeUnpruned----------")
    classifier.set_property("classifier", base4.jobject)
    classify(preProcessedData, classifier, True, 'models/unprunedJ48.model',
             splitPerc=70, randomSeed=10)
    classify(preProcessedData, classifier, False, 'models/unprunedJ48.model',
             splitPerc=70, randomSeed=10)

    # Hoeffding tree - cross validate - traintestSplit (run without the
    # attribute-selection wrapper)
    print("----------HoeffdingTree----------")
    classify(preProcessedData, base5, True, 'models/HoeffdingTree.model',
             splitPerc=70, randomSeed=10)
    classify(preProcessedData, base5, False, 'models/HoeffdingTree.model',
             splitPerc=70, randomSeed=10)

    # K-Nearest-Neighbours - cross validate - traintestSplit
    # NOTE(review): unlike the other sections, the second call stores to
    # 'models/preProcessedJ48.model' and the split/CV order is reversed —
    # looks like a copy-paste slip; confirm the intended model filename.
    print("----------KNN----------")
    classifier.set_property("classifier", base6.jobject)
    classify(preProcessedData, classifier, False, 'models/knn.model',
             splitPerc=70, randomSeed=10)
    classify(preProcessedData, classifier, True,
             'models/preProcessedJ48.model', splitPerc=70, randomSeed=10)

    # bayesian belief networks - cross validate - traintestSplit
    print("----------BayesianBelief----------")
    classifier.set_property("classifier", base7.jobject)
    classify(preProcessedData, classifier, True, 'models/bayesianBelief.model',
             splitPerc=70, randomSeed=10)
    classify(preProcessedData, classifier, False,
             'models/bayesianBelief.model', splitPerc=70, randomSeed=10)
def main():
    """
    Just runs some example code.

    A linear walkthrough of the python-weka-wrapper classifier API:
    loading ARFF data, building/evaluating classifiers, meta-classifiers,
    cross-validation with full metric output, plotting, incremental
    learning, and direct access to a classifier's Java API.
    Requires a running JVM (jvm.start() by the caller).
    """
    # load a dataset
    iris_file = helper.get_data_dir() + os.sep + "iris.arff"
    helper.print_info("Loading dataset: " + iris_file)
    loader = Loader("weka.core.converters.ArffLoader")
    iris_data = loader.load_file(iris_file)
    iris_data.class_is_last()

    # classifier help
    helper.print_title("Creating help string")
    classifier = Classifier(classname="weka.classifiers.trees.J48")
    print(classifier.to_help())

    # partial classname: weka resolves ".J48" to the full class name
    helper.print_title("Creating classifier from partial classname")
    clsname = ".J48"
    classifier = Classifier(classname=clsname)
    print(clsname + " --> " + classifier.classname)

    # classifier from commandline
    helper.print_title("Creating SMO from command-line string")
    cmdline = 'weka.classifiers.functions.SMO -K "weka.classifiers.functions.supportVector.NormalizedPolyKernel -E 3.0"'
    classifier = from_commandline(cmdline, classname="weka.classifiers.Classifier")
    classifier.build_classifier(iris_data)
    print("input: " + cmdline)
    print("output: " + classifier.to_commandline())
    print("model:\n" + str(classifier))

    # kernel classifier: attach an RBF kernel to SMO via the kernel property
    helper.print_title("Creating SMO as KernelClassifier")
    kernel = Kernel(classname="weka.classifiers.functions.supportVector.RBFKernel", options=["-G", "0.001"])
    classifier = KernelClassifier(classname="weka.classifiers.functions.SMO", options=["-M"])
    classifier.kernel = kernel
    classifier.build_classifier(iris_data)
    print("classifier: " + classifier.to_commandline())
    print("model:\n" + str(classifier))

    # build a classifier and output model
    helper.print_title("Training J48 classifier on iris")
    classifier = Classifier(classname="weka.classifiers.trees.J48")
    # Instead of using 'options=["-C", "0.3"]' in the constructor, we can also set the "confidenceFactor"
    # property of the J48 classifier itself. However, being of type float rather than double, we need
    # to convert it to the correct type first using the double_to_float function:
    classifier.set_property("confidenceFactor", typeconv.double_to_float(0.3))
    classifier.build_classifier(iris_data)
    print(classifier)
    print(classifier.graph)
    print(classifier.to_source("MyJ48"))
    plot_graph.plot_dot_graph(classifier.graph)

    # evaluate model on test set (here: the training set itself, for demo purposes)
    helper.print_title("Evaluating J48 classifier on iris")
    evaluation = Evaluation(iris_data)
    evl = evaluation.test_model(classifier, iris_data)
    print(evl)
    print(evaluation.summary())

    # evaluate model on train/test split
    helper.print_title("Evaluating J48 classifier on iris (random split 66%)")
    classifier = Classifier(classname="weka.classifiers.trees.J48", options=["-C", "0.3"])
    evaluation = Evaluation(iris_data)
    evaluation.evaluate_train_test_split(classifier, iris_data, 66.0, Random(1))
    print(evaluation.summary())

    # load a dataset incrementally and build classifier incrementally
    helper.print_title("Build classifier incrementally on iris")
    helper.print_info("Loading dataset: " + iris_file)
    loader = Loader("weka.core.converters.ArffLoader")
    iris_inc = loader.load_file(iris_file, incremental=True)
    iris_inc.class_is_last()
    classifier = Classifier(classname="weka.classifiers.bayes.NaiveBayesUpdateable")
    classifier.build_classifier(iris_inc)
    # feed remaining rows one at a time to the updateable classifier
    for inst in loader:
        classifier.update_classifier(inst)
    print(classifier)

    # construct meta-classifiers
    helper.print_title("Meta classifiers")
    # generic FilteredClassifier instantiation
    print("generic FilteredClassifier instantiation")
    meta = SingleClassifierEnhancer(classname="weka.classifiers.meta.FilteredClassifier")
    meta.classifier = Classifier(classname="weka.classifiers.functions.LinearRegression")
    flter = Filter("weka.filters.unsupervised.attribute.Remove")
    flter.options = ["-R", "first"]
    meta.set_property("filter", flter.jobject)
    print(meta.to_commandline())
    # direct FilteredClassifier instantiation
    print("direct FilteredClassifier instantiation")
    meta = FilteredClassifier()
    meta.classifier = Classifier(classname="weka.classifiers.functions.LinearRegression")
    flter = Filter("weka.filters.unsupervised.attribute.Remove")
    flter.options = ["-R", "first"]
    meta.filter = flter
    print(meta.to_commandline())
    # generic Vote
    print("generic Vote instantiation")
    meta = MultipleClassifiersCombiner(classname="weka.classifiers.meta.Vote")
    classifiers = [
        Classifier(classname="weka.classifiers.functions.SMO"),
        Classifier(classname="weka.classifiers.trees.J48")
    ]
    meta.classifiers = classifiers
    print(meta.to_commandline())

    # cross-validate nominal classifier
    helper.print_title("Cross-validating NaiveBayes on diabetes")
    diabetes_file = helper.get_data_dir() + os.sep + "diabetes.arff"
    helper.print_info("Loading dataset: " + diabetes_file)
    loader = Loader("weka.core.converters.ArffLoader")
    diabetes_data = loader.load_file(diabetes_file)
    diabetes_data.class_is_last()
    classifier = Classifier(classname="weka.classifiers.bayes.NaiveBayes")
    pred_output = PredictionOutput(
        classname="weka.classifiers.evaluation.output.prediction.PlainText", options=["-distribution"])
    evaluation = Evaluation(diabetes_data)
    evaluation.crossvalidate_model(classifier, diabetes_data, 10, Random(42), output=pred_output)
    print(evaluation.summary())
    print(evaluation.class_details())
    print(evaluation.matrix())
    # dump every statistic the Evaluation object exposes
    print("areaUnderPRC/0: " + str(evaluation.area_under_prc(0)))
    print("weightedAreaUnderPRC: " + str(evaluation.weighted_area_under_prc))
    print("areaUnderROC/1: " + str(evaluation.area_under_roc(1)))
    print("weightedAreaUnderROC: " + str(evaluation.weighted_area_under_roc))
    print("avgCost: " + str(evaluation.avg_cost))
    print("totalCost: " + str(evaluation.total_cost))
    print("confusionMatrix: " + str(evaluation.confusion_matrix))
    print("correct: " + str(evaluation.correct))
    print("pctCorrect: " + str(evaluation.percent_correct))
    print("incorrect: " + str(evaluation.incorrect))
    print("pctIncorrect: " + str(evaluation.percent_incorrect))
    print("unclassified: " + str(evaluation.unclassified))
    print("pctUnclassified: " + str(evaluation.percent_unclassified))
    print("coverageOfTestCasesByPredictedRegions: " + str(evaluation.coverage_of_test_cases_by_predicted_regions))
    print("sizeOfPredictedRegions: " + str(evaluation.size_of_predicted_regions))
    print("falseNegativeRate: " + str(evaluation.false_negative_rate(1)))
    print("weightedFalseNegativeRate: " + str(evaluation.weighted_false_negative_rate))
    print("numFalseNegatives: " + str(evaluation.num_false_negatives(1)))
    print("trueNegativeRate: " + str(evaluation.true_negative_rate(1)))
    print("weightedTrueNegativeRate: " + str(evaluation.weighted_true_negative_rate))
    print("numTrueNegatives: " + str(evaluation.num_true_negatives(1)))
    print("falsePositiveRate: " + str(evaluation.false_positive_rate(1)))
    print("weightedFalsePositiveRate: " + str(evaluation.weighted_false_positive_rate))
    print("numFalsePositives: " + str(evaluation.num_false_positives(1)))
    print("truePositiveRate: " + str(evaluation.true_positive_rate(1)))
    print("weightedTruePositiveRate: " + str(evaluation.weighted_true_positive_rate))
    print("numTruePositives: " + str(evaluation.num_true_positives(1)))
    print("fMeasure: " + str(evaluation.f_measure(1)))
    print("weightedFMeasure: " + str(evaluation.weighted_f_measure))
    print("unweightedMacroFmeasure: " + str(evaluation.unweighted_macro_f_measure))
    print("unweightedMicroFmeasure: " + str(evaluation.unweighted_micro_f_measure))
    print("precision: " + str(evaluation.precision(1)))
    print("weightedPrecision: " + str(evaluation.weighted_precision))
    print("recall: " + str(evaluation.recall(1)))
    print("weightedRecall: " + str(evaluation.weighted_recall))
    print("kappa: " + str(evaluation.kappa))
    print("KBInformation: " + str(evaluation.kb_information))
    print("KBMeanInformation: " + str(evaluation.kb_mean_information))
    print("KBRelativeInformation: " + str(evaluation.kb_relative_information))
    print("SFEntropyGain: " + str(evaluation.sf_entropy_gain))
    print("SFMeanEntropyGain: " + str(evaluation.sf_mean_entropy_gain))
    print("SFMeanPriorEntropy: " + str(evaluation.sf_mean_prior_entropy))
    print("SFMeanSchemeEntropy: " + str(evaluation.sf_mean_scheme_entropy))
    print("matthewsCorrelationCoefficient: " + str(evaluation.matthews_correlation_coefficient(1)))
    print("weightedMatthewsCorrelation: " + str(evaluation.weighted_matthews_correlation))
    print("class priors: " + str(evaluation.class_priors))
    print("numInstances: " + str(evaluation.num_instances))
    print("meanAbsoluteError: " + str(evaluation.mean_absolute_error))
    print("meanPriorAbsoluteError: " + str(evaluation.mean_prior_absolute_error))
    print("relativeAbsoluteError: " + str(evaluation.relative_absolute_error))
    print("rootMeanSquaredError: " + str(evaluation.root_mean_squared_error))
    print("rootMeanPriorSquaredError: " + str(evaluation.root_mean_prior_squared_error))
    print("rootRelativeSquaredError: " + str(evaluation.root_relative_squared_error))
    print("prediction output:\n" + str(pred_output))
    plot_cls.plot_roc(
        evaluation, title="ROC diabetes",
        class_index=range(0, diabetes_data.class_attribute.num_values), wait=False)
    plot_cls.plot_prc(
        evaluation, title="PRC diabetes",
        class_index=range(0, diabetes_data.class_attribute.num_values), wait=False)

    # load a numeric dataset
    bolts_file = helper.get_data_dir() + os.sep + "bolts.arff"
    helper.print_info("Loading dataset: " + bolts_file)
    loader = Loader("weka.core.converters.ArffLoader")
    bolts_data = loader.load_file(bolts_file)
    bolts_data.class_is_last()

    # build a classifier and output model
    helper.print_title("Training LinearRegression on bolts")
    classifier = Classifier(classname="weka.classifiers.functions.LinearRegression", options=["-S", "1", "-C"])
    classifier.build_classifier(bolts_data)
    print(classifier)

    # cross-validate numeric classifier
    helper.print_title("Cross-validating LinearRegression on bolts")
    classifier = Classifier(classname="weka.classifiers.functions.LinearRegression", options=["-S", "1", "-C"])
    evaluation = Evaluation(bolts_data)
    evaluation.crossvalidate_model(classifier, bolts_data, 10, Random(42))
    print(evaluation.summary())
    print("correlationCoefficient: " + str(evaluation.correlation_coefficient))
    print("errorRate: " + str(evaluation.error_rate))
    helper.print_title("Header - bolts")
    print(str(evaluation.header))
    helper.print_title("Predictions on bolts")
    for index, pred in enumerate(evaluation.predictions):
        print(str(index+1) + ": " + str(pred) + " -> error=" + str(pred.error))
    plot_cls.plot_classifier_errors(evaluation.predictions, wait=False)

    # learning curve
    cls = [
        Classifier(classname="weka.classifiers.trees.J48"),
        Classifier(classname="weka.classifiers.bayes.NaiveBayesUpdateable")]
    plot_cls.plot_learning_curve(
        cls, diabetes_data, increments=0.05, label_template="[#] !", metric="percent_correct", wait=True)

    # access classifier's Java API
    labor_file = helper.get_data_dir() + os.sep + "labor.arff"
    helper.print_info("Loading dataset: " + labor_file)
    loader = Loader("weka.core.converters.ArffLoader")
    labor_data = loader.load_file(labor_file)
    labor_data.class_is_last()
    helper.print_title("Using JRip's Java API to access rules")
    jrip = Classifier(classname="weka.classifiers.rules.JRip")
    jrip.build_classifier(labor_data)
    # jwrapper exposes the underlying Java object's methods directly
    rset = jrip.jwrapper.getRuleset()
    for i in range(rset.size()):
        r = rset.get(i)
        print(str(r.toString(labor_data.class_attribute.jobject)))
import weka.plot.graph as plot_graph
import weka.core.types as types

jvm.start()

# access classifier's Java API
labor_file = '../data/labor.arff'
loader = Loader("weka.core.converters.ArffLoader")
labor_data = loader.load_file(labor_file)
labor_data.class_is_last()

# JRip: walk the induced ruleset via the Java wrapper
jrip = Classifier(classname="weka.classifiers.rules.JRip")
jrip.build_classifier(labor_data)
rset = jrip.jwrapper.getRuleset()
# BUGFIX: 'xrange' is Python 2 only and a NameError under Python 3
# (the rest of this file uses Python 3 syntax); use 'range' instead.
for i in range(rset.size()):
    r = rset.get(i)
    print(str(r.toString(labor_data.class_attribute.jobject)))

prism = Classifier(classname="weka.classifiers.rules.DecisionTable", options=["-R"])
prism.build_classifier(labor_data)
# BUGFIX: Python 2 'print' statements are SyntaxErrors under Python 3;
# converted to print() calls.
print(prism.jwrapper.toString())
# print(prism.jwrapper.m_dtInstances)

j48 = Classifier(classname="weka.classifiers.trees.J48")
# confidenceFactor is a Java float, so the Python double must be converted first
j48.set_property("confidenceFactor", types.double_to_float(0.3))
j48.build_classifier(labor_data)
print(j48)
print(j48.graph)

jvm.stop()