def do_execute(self): """ The actual execution of the actor. :return: None if successful, otherwise error message :rtype: str """ result = None evl = self.input.payload pltclassifier.plot_prc(evl, class_index=self.resolve_option("class_index"), title=self.resolve_option("title"), key_loc=self.resolve_option("key_loc"), outfile=self.resolve_option("outfile"), wait=bool(self.resolve_option("wait"))) return result
def do_execute(self): """ The actual execution of the actor. :return: None if successful, otherwise error message :rtype: str """ result = None evl = self.input.payload pltclassifier.plot_prc( evl, class_index=self.resolve_option("class_index"), title=self.resolve_option("title"), key_loc=self.resolve_option("key_loc"), outfile=self.resolve_option("outfile"), wait=bool(self.resolve_option("wait"))) return result
def test_plot_prc(self): """ Tests the plot_prc method. """ loader = converters.Loader(classname="weka.core.converters.ArffLoader") data = loader.load_file(self.datafile("diabetes.arff")) data.class_is_last() remove = filters.Filter(classname="weka.filters.unsupervised.attribute.Remove", options=["-R", "1-3"]) cls = classifiers.Classifier(classname="weka.classifiers.bayes.NaiveBayes") fc = classifiers.FilteredClassifier() fc.filter = remove fc.classifier = cls evl = classifiers.Evaluation(data) evl.crossvalidate_model(cls, data, 10, Random(1)) plot.plot_prc(evl, class_index=[0, 1], wait=False)
def main(): """ Just runs some example code. """ # load a dataset iris_file = helper.get_data_dir() + os.sep + "iris.arff" helper.print_info("Loading dataset: " + iris_file) loader = Loader("weka.core.converters.ArffLoader") iris_data = loader.load_file(iris_file) iris_data.class_is_last() # classifier help helper.print_title("Creating help string") classifier = Classifier(classname="weka.classifiers.trees.J48") print(classifier.to_help()) # partial classname helper.print_title("Creating classifier from partial classname") clsname = ".J48" classifier = Classifier(classname=clsname) print(clsname + " --> " + classifier.classname) # classifier from commandline helper.print_title("Creating SMO from command-line string") cmdline = 'weka.classifiers.functions.SMO -K "weka.classifiers.functions.supportVector.NormalizedPolyKernel -E 3.0"' classifier = from_commandline(cmdline, classname="weka.classifiers.Classifier") classifier.build_classifier(iris_data) print("input: " + cmdline) print("output: " + classifier.to_commandline()) print("model:\n" + str(classifier)) # kernel classifier helper.print_title("Creating SMO as KernelClassifier") kernel = Kernel( classname="weka.classifiers.functions.supportVector.RBFKernel", options=["-G", "0.001"]) classifier = KernelClassifier(classname="weka.classifiers.functions.SMO", options=["-M"]) classifier.kernel = kernel classifier.build_classifier(iris_data) print("classifier: " + classifier.to_commandline()) print("model:\n" + str(classifier)) # build a classifier and output model helper.print_title("Training J48 classifier on iris") classifier = Classifier(classname="weka.classifiers.trees.J48") # Instead of using 'options=["-C", "0.3"]' in the constructor, we can also set the "confidenceFactor" # property of the J48 classifier itself. However, being of type float rather than double, we need # to convert it to the correct type first using the double_to_float function: classifier.set_property("confidenceFactor", types.double_to_float(0.3)) classifier.build_classifier(iris_data) print(classifier) print(classifier.graph) print(classifier.to_source("MyJ48")) plot_graph.plot_dot_graph(classifier.graph) # evaluate model on test set helper.print_title("Evaluating J48 classifier on iris") evaluation = Evaluation(iris_data) evl = evaluation.test_model(classifier, iris_data) print(evl) print(evaluation.summary()) # evaluate model on train/test split helper.print_title("Evaluating J48 classifier on iris (random split 66%)") classifier = Classifier(classname="weka.classifiers.trees.J48", options=["-C", "0.3"]) evaluation = Evaluation(iris_data) evaluation.evaluate_train_test_split(classifier, iris_data, 66.0, Random(1)) print(evaluation.summary()) # load a dataset incrementally and build classifier incrementally helper.print_title("Build classifier incrementally on iris") helper.print_info("Loading dataset: " + iris_file) loader = Loader("weka.core.converters.ArffLoader") iris_inc = loader.load_file(iris_file, incremental=True) iris_inc.class_is_last() classifier = Classifier( classname="weka.classifiers.bayes.NaiveBayesUpdateable") classifier.build_classifier(iris_inc) for inst in loader: classifier.update_classifier(inst) print(classifier) # construct meta-classifiers helper.print_title("Meta classifiers") # generic FilteredClassifier instantiation print("generic FilteredClassifier instantiation") meta = SingleClassifierEnhancer( classname="weka.classifiers.meta.FilteredClassifier") meta.classifier = Classifier( classname="weka.classifiers.functions.LinearRegression") flter = Filter("weka.filters.unsupervised.attribute.Remove") flter.options = ["-R", "first"] meta.set_property("filter", flter.jobject) print(meta.to_commandline()) # direct FilteredClassifier instantiation print("direct FilteredClassifier instantiation") meta = FilteredClassifier() meta.classifier = Classifier( classname="weka.classifiers.functions.LinearRegression") flter = Filter("weka.filters.unsupervised.attribute.Remove") flter.options = ["-R", "first"] meta.filter = flter print(meta.to_commandline()) # generic Vote print("generic Vote instantiation") meta = MultipleClassifiersCombiner(classname="weka.classifiers.meta.Vote") classifiers = [ Classifier(classname="weka.classifiers.functions.SMO"), Classifier(classname="weka.classifiers.trees.J48") ] meta.classifiers = classifiers print(meta.to_commandline()) # cross-validate nominal classifier helper.print_title("Cross-validating NaiveBayes on diabetes") diabetes_file = helper.get_data_dir() + os.sep + "diabetes.arff" helper.print_info("Loading dataset: " + diabetes_file) loader = Loader("weka.core.converters.ArffLoader") diabetes_data = loader.load_file(diabetes_file) diabetes_data.class_is_last() classifier = Classifier(classname="weka.classifiers.bayes.NaiveBayes") pred_output = PredictionOutput( classname="weka.classifiers.evaluation.output.prediction.PlainText", options=["-distribution"]) evaluation = Evaluation(diabetes_data) evaluation.crossvalidate_model(classifier, diabetes_data, 10, Random(42), output=pred_output) print(evaluation.summary()) print(evaluation.class_details()) print(evaluation.matrix()) print("areaUnderPRC/0: " + str(evaluation.area_under_prc(0))) print("weightedAreaUnderPRC: " + str(evaluation.weighted_area_under_prc)) print("areaUnderROC/1: " + str(evaluation.area_under_roc(1))) print("weightedAreaUnderROC: " + str(evaluation.weighted_area_under_roc)) print("avgCost: " + str(evaluation.avg_cost)) print("totalCost: " + str(evaluation.total_cost)) print("confusionMatrix: " + str(evaluation.confusion_matrix)) print("correct: " + str(evaluation.correct)) print("pctCorrect: " + str(evaluation.percent_correct)) print("incorrect: " + str(evaluation.incorrect)) print("pctIncorrect: " + str(evaluation.percent_incorrect)) print("unclassified: " + str(evaluation.unclassified)) print("pctUnclassified: " + str(evaluation.percent_unclassified)) print("coverageOfTestCasesByPredictedRegions: " + str(evaluation.coverage_of_test_cases_by_predicted_regions)) print("sizeOfPredictedRegions: " + str(evaluation.size_of_predicted_regions)) print("falseNegativeRate: " + str(evaluation.false_negative_rate(1))) print("weightedFalseNegativeRate: " + str(evaluation.weighted_false_negative_rate)) print("numFalseNegatives: " + str(evaluation.num_false_negatives(1))) print("trueNegativeRate: " + str(evaluation.true_negative_rate(1))) print("weightedTrueNegativeRate: " + str(evaluation.weighted_true_negative_rate)) print("numTrueNegatives: " + str(evaluation.num_true_negatives(1))) print("falsePositiveRate: " + str(evaluation.false_positive_rate(1))) print("weightedFalsePositiveRate: " + str(evaluation.weighted_false_positive_rate)) print("numFalsePositives: " + str(evaluation.num_false_positives(1))) print("truePositiveRate: " + str(evaluation.true_positive_rate(1))) print("weightedTruePositiveRate: " + str(evaluation.weighted_true_positive_rate)) print("numTruePositives: " + str(evaluation.num_true_positives(1))) print("fMeasure: " + str(evaluation.f_measure(1))) print("weightedFMeasure: " + str(evaluation.weighted_f_measure)) print("unweightedMacroFmeasure: " + str(evaluation.unweighted_macro_f_measure)) print("unweightedMicroFmeasure: " + str(evaluation.unweighted_micro_f_measure)) print("precision: " + str(evaluation.precision(1))) print("weightedPrecision: " + str(evaluation.weighted_precision)) print("recall: " + str(evaluation.recall(1))) print("weightedRecall: " + str(evaluation.weighted_recall)) print("kappa: " + str(evaluation.kappa)) print("KBInformation: " + str(evaluation.kb_information)) print("KBMeanInformation: " + str(evaluation.kb_mean_information)) print("KBRelativeInformation: " + str(evaluation.kb_relative_information)) print("SFEntropyGain: " + str(evaluation.sf_entropy_gain)) print("SFMeanEntropyGain: " + str(evaluation.sf_mean_entropy_gain)) print("SFMeanPriorEntropy: " + str(evaluation.sf_mean_prior_entropy)) print("SFMeanSchemeEntropy: " + str(evaluation.sf_mean_scheme_entropy)) print("matthewsCorrelationCoefficient: " + str(evaluation.matthews_correlation_coefficient(1))) print("weightedMatthewsCorrelation: " + str(evaluation.weighted_matthews_correlation)) print("class priors: " + str(evaluation.class_priors)) print("numInstances: " + str(evaluation.num_instances)) print("meanAbsoluteError: " + str(evaluation.mean_absolute_error)) print("meanPriorAbsoluteError: " + str(evaluation.mean_prior_absolute_error)) print("relativeAbsoluteError: " + str(evaluation.relative_absolute_error)) print("rootMeanSquaredError: " + str(evaluation.root_mean_squared_error)) print("rootMeanPriorSquaredError: " + str(evaluation.root_mean_prior_squared_error)) print("rootRelativeSquaredError: " + str(evaluation.root_relative_squared_error)) print("prediction output:\n" + str(pred_output)) plot_cls.plot_roc(evaluation, title="ROC diabetes", class_index=range( 0, diabetes_data.class_attribute.num_values), wait=False) plot_cls.plot_prc(evaluation, title="PRC diabetes", class_index=range( 0, diabetes_data.class_attribute.num_values), wait=False) # train 2nd classifier on diabetes dataset classifier2 = Classifier(classname="weka.classifiers.trees.RandomForest") evaluation2 = Evaluation(diabetes_data) evaluation2.crossvalidate_model(classifier2, diabetes_data, 10, Random(42)) plot_cls.plot_rocs({ "NB": evaluation, "RF": evaluation2 }, title="ROC diabetes", class_index=0, wait=False) plot_cls.plot_prcs({ "NB": evaluation, "RF": evaluation2 }, title="PRC diabetes", class_index=0, wait=False) # load a numeric dataset bolts_file = helper.get_data_dir() + os.sep + "bolts.arff" helper.print_info("Loading dataset: " + bolts_file) loader = Loader("weka.core.converters.ArffLoader") bolts_data = loader.load_file(bolts_file) bolts_data.class_is_last() # build a classifier and output model helper.print_title("Training LinearRegression on bolts") classifier = Classifier( classname="weka.classifiers.functions.LinearRegression", options=["-S", "1", "-C"]) classifier.build_classifier(bolts_data) print(classifier) # cross-validate numeric classifier helper.print_title("Cross-validating LinearRegression on bolts") classifier = Classifier( classname="weka.classifiers.functions.LinearRegression", options=["-S", "1", "-C"]) evaluation = Evaluation(bolts_data) evaluation.crossvalidate_model(classifier, bolts_data, 10, Random(42)) print(evaluation.summary()) print("correlationCoefficient: " + str(evaluation.correlation_coefficient)) print("errorRate: " + str(evaluation.error_rate)) helper.print_title("Header - bolts") print(str(evaluation.header)) helper.print_title("Predictions on bolts") for index, pred in enumerate(evaluation.predictions): print( str(index + 1) + ": " + str(pred) + " -> error=" + str(pred.error)) plot_cls.plot_classifier_errors(evaluation.predictions, wait=False) # train 2nd classifier and show errors in same plot classifier2 = Classifier(classname="weka.classifiers.functions.SMOreg") evaluation2 = Evaluation(bolts_data) evaluation2.crossvalidate_model(classifier2, bolts_data, 10, Random(42)) plot_cls.plot_classifier_errors( { "LR": evaluation.predictions, "SMOreg": evaluation2.predictions }, wait=False) # learning curve cls = [ Classifier(classname="weka.classifiers.trees.J48"), Classifier(classname="weka.classifiers.bayes.NaiveBayesUpdateable") ] plot_cls.plot_learning_curve(cls, diabetes_data, increments=0.05, label_template="[#] !", metric="percent_correct", wait=True) # access classifier's Java API labor_file = helper.get_data_dir() + os.sep + "labor.arff" helper.print_info("Loading dataset: " + labor_file) loader = Loader("weka.core.converters.ArffLoader") labor_data = loader.load_file(labor_file) labor_data.class_is_last() helper.print_title("Using JRip's Java API to access rules") jrip = Classifier(classname="weka.classifiers.rules.JRip") jrip.build_classifier(labor_data) rset = jrip.jwrapper.getRuleset() for i in xrange(rset.size()): r = rset.get(i) print(str(r.toString(labor_data.class_attribute.jobject)))
#corpus = os.path.join("/content/gdrive/My Drive", corpus_name) #sampled.to_csv(corpus,index = False) loader = Loader(classname="weka.core.converters.CSVLoader") data = loader.load_file(corpus) data.class_is_last() """Naive Bayes Classifier for Bug Prediction""" classifier = Classifier(classname="weka.classifiers.bayes.NaiveBayes") pred_output = PredictionOutput(classname="weka.classifiers.evaluation.output.prediction.PlainText", options=["-distribution"]) evaluation = Evaluation(data) evaluation.crossvalidate_model(classifier, data, 10, Random(42), output=pred_output) plot_cls.plot_roc(evaluation, title="ROC bugs",class_index=range(0, data.class_attribute.num_values), wait=False) plot_cls.plot_prc(evaluation, title="PRC bugs - NaiveBayes",class_index=range(0, data.class_attribute.num_values), wait=False) """Performance Metrics - Naive Bayes Classifier""" print(evaluation.summary()) print(evaluation.class_details()) print(evaluation.matrix()) print("confusionMatrix: " + str(evaluation.confusion_matrix)) print("fMeasure: " + str(evaluation.f_measure(1))) print("precision: " + str(evaluation.precision(1))) print("recall: " + str(evaluation.recall(1))) """Random Forest Classifier""" classifier2 = Classifier(classname="weka.classifiers.trees.RandomForest")
def main(): """ Just runs some example code. """ # load a dataset iris_file = helper.get_data_dir() + os.sep + "iris.arff" helper.print_info("Loading dataset: " + iris_file) loader = Loader("weka.core.converters.ArffLoader") iris_data = loader.load_file(iris_file) iris_data.class_is_last() # classifier help helper.print_title("Creating help string") classifier = Classifier(classname="weka.classifiers.trees.J48") print(classifier.to_help()) # partial classname helper.print_title("Creating classifier from partial classname") clsname = ".J48" classifier = Classifier(classname=clsname) print(clsname + " --> " + classifier.classname) # classifier from commandline helper.print_title("Creating SMO from command-line string") cmdline = 'weka.classifiers.functions.SMO -K "weka.classifiers.functions.supportVector.NormalizedPolyKernel -E 3.0"' classifier = from_commandline(cmdline, classname="weka.classifiers.Classifier") classifier.build_classifier(iris_data) print("input: " + cmdline) print("output: " + classifier.to_commandline()) print("model:\n" + str(classifier)) # kernel classifier helper.print_title("Creating SMO as KernelClassifier") kernel = Kernel(classname="weka.classifiers.functions.supportVector.RBFKernel", options=["-G", "0.001"]) classifier = KernelClassifier(classname="weka.classifiers.functions.SMO", options=["-M"]) classifier.kernel = kernel classifier.build_classifier(iris_data) print("classifier: " + classifier.to_commandline()) print("model:\n" + str(classifier)) # build a classifier and output model helper.print_title("Training J48 classifier on iris") classifier = Classifier(classname="weka.classifiers.trees.J48") # Instead of using 'options=["-C", "0.3"]' in the constructor, we can also set the "confidenceFactor" # property of the J48 classifier itself. However, being of type float rather than double, we need # to convert it to the correct type first using the double_to_float function: classifier.set_property("confidenceFactor", typeconv.double_to_float(0.3)) classifier.build_classifier(iris_data) print(classifier) print(classifier.graph) print(classifier.to_source("MyJ48")) plot_graph.plot_dot_graph(classifier.graph) # evaluate model on test set helper.print_title("Evaluating J48 classifier on iris") evaluation = Evaluation(iris_data) evl = evaluation.test_model(classifier, iris_data) print(evl) print(evaluation.summary()) # evaluate model on train/test split helper.print_title("Evaluating J48 classifier on iris (random split 66%)") classifier = Classifier(classname="weka.classifiers.trees.J48", options=["-C", "0.3"]) evaluation = Evaluation(iris_data) evaluation.evaluate_train_test_split(classifier, iris_data, 66.0, Random(1)) print(evaluation.summary()) # load a dataset incrementally and build classifier incrementally helper.print_title("Build classifier incrementally on iris") helper.print_info("Loading dataset: " + iris_file) loader = Loader("weka.core.converters.ArffLoader") iris_inc = loader.load_file(iris_file, incremental=True) iris_inc.class_is_last() classifier = Classifier(classname="weka.classifiers.bayes.NaiveBayesUpdateable") classifier.build_classifier(iris_inc) for inst in loader: classifier.update_classifier(inst) print(classifier) # construct meta-classifiers helper.print_title("Meta classifiers") # generic FilteredClassifier instantiation print("generic FilteredClassifier instantiation") meta = SingleClassifierEnhancer(classname="weka.classifiers.meta.FilteredClassifier") meta.classifier = Classifier(classname="weka.classifiers.functions.LinearRegression") flter = Filter("weka.filters.unsupervised.attribute.Remove") flter.options = ["-R", "first"] meta.set_property("filter", flter.jobject) print(meta.to_commandline()) # direct FilteredClassifier instantiation print("direct FilteredClassifier instantiation") meta = FilteredClassifier() meta.classifier = Classifier(classname="weka.classifiers.functions.LinearRegression") flter = Filter("weka.filters.unsupervised.attribute.Remove") flter.options = ["-R", "first"] meta.filter = flter print(meta.to_commandline()) # generic Vote print("generic Vote instantiation") meta = MultipleClassifiersCombiner(classname="weka.classifiers.meta.Vote") classifiers = [ Classifier(classname="weka.classifiers.functions.SMO"), Classifier(classname="weka.classifiers.trees.J48") ] meta.classifiers = classifiers print(meta.to_commandline()) # cross-validate nominal classifier helper.print_title("Cross-validating NaiveBayes on diabetes") diabetes_file = helper.get_data_dir() + os.sep + "diabetes.arff" helper.print_info("Loading dataset: " + diabetes_file) loader = Loader("weka.core.converters.ArffLoader") diabetes_data = loader.load_file(diabetes_file) diabetes_data.class_is_last() classifier = Classifier(classname="weka.classifiers.bayes.NaiveBayes") pred_output = PredictionOutput( classname="weka.classifiers.evaluation.output.prediction.PlainText", options=["-distribution"]) evaluation = Evaluation(diabetes_data) evaluation.crossvalidate_model(classifier, diabetes_data, 10, Random(42), output=pred_output) print(evaluation.summary()) print(evaluation.class_details()) print(evaluation.matrix()) print("areaUnderPRC/0: " + str(evaluation.area_under_prc(0))) print("weightedAreaUnderPRC: " + str(evaluation.weighted_area_under_prc)) print("areaUnderROC/1: " + str(evaluation.area_under_roc(1))) print("weightedAreaUnderROC: " + str(evaluation.weighted_area_under_roc)) print("avgCost: " + str(evaluation.avg_cost)) print("totalCost: " + str(evaluation.total_cost)) print("confusionMatrix: " + str(evaluation.confusion_matrix)) print("correct: " + str(evaluation.correct)) print("pctCorrect: " + str(evaluation.percent_correct)) print("incorrect: " + str(evaluation.incorrect)) print("pctIncorrect: " + str(evaluation.percent_incorrect)) print("unclassified: " + str(evaluation.unclassified)) print("pctUnclassified: " + str(evaluation.percent_unclassified)) print("coverageOfTestCasesByPredictedRegions: " + str(evaluation.coverage_of_test_cases_by_predicted_regions)) print("sizeOfPredictedRegions: " + str(evaluation.size_of_predicted_regions)) print("falseNegativeRate: " + str(evaluation.false_negative_rate(1))) print("weightedFalseNegativeRate: " + str(evaluation.weighted_false_negative_rate)) print("numFalseNegatives: " + str(evaluation.num_false_negatives(1))) print("trueNegativeRate: " + str(evaluation.true_negative_rate(1))) print("weightedTrueNegativeRate: " + str(evaluation.weighted_true_negative_rate)) print("numTrueNegatives: " + str(evaluation.num_true_negatives(1))) print("falsePositiveRate: " + str(evaluation.false_positive_rate(1))) print("weightedFalsePositiveRate: " + str(evaluation.weighted_false_positive_rate)) print("numFalsePositives: " + str(evaluation.num_false_positives(1))) print("truePositiveRate: " + str(evaluation.true_positive_rate(1))) print("weightedTruePositiveRate: " + str(evaluation.weighted_true_positive_rate)) print("numTruePositives: " + str(evaluation.num_true_positives(1))) print("fMeasure: " + str(evaluation.f_measure(1))) print("weightedFMeasure: " + str(evaluation.weighted_f_measure)) print("unweightedMacroFmeasure: " + str(evaluation.unweighted_macro_f_measure)) print("unweightedMicroFmeasure: " + str(evaluation.unweighted_micro_f_measure)) print("precision: " + str(evaluation.precision(1))) print("weightedPrecision: " + str(evaluation.weighted_precision)) print("recall: " + str(evaluation.recall(1))) print("weightedRecall: " + str(evaluation.weighted_recall)) print("kappa: " + str(evaluation.kappa)) print("KBInformation: " + str(evaluation.kb_information)) print("KBMeanInformation: " + str(evaluation.kb_mean_information)) print("KBRelativeInformation: " + str(evaluation.kb_relative_information)) print("SFEntropyGain: " + str(evaluation.sf_entropy_gain)) print("SFMeanEntropyGain: " + str(evaluation.sf_mean_entropy_gain)) print("SFMeanPriorEntropy: " + str(evaluation.sf_mean_prior_entropy)) print("SFMeanSchemeEntropy: " + str(evaluation.sf_mean_scheme_entropy)) print("matthewsCorrelationCoefficient: " + str(evaluation.matthews_correlation_coefficient(1))) print("weightedMatthewsCorrelation: " + str(evaluation.weighted_matthews_correlation)) print("class priors: " + str(evaluation.class_priors)) print("numInstances: " + str(evaluation.num_instances)) print("meanAbsoluteError: " + str(evaluation.mean_absolute_error)) print("meanPriorAbsoluteError: " + str(evaluation.mean_prior_absolute_error)) print("relativeAbsoluteError: " + str(evaluation.relative_absolute_error)) print("rootMeanSquaredError: " + str(evaluation.root_mean_squared_error)) print("rootMeanPriorSquaredError: " + str(evaluation.root_mean_prior_squared_error)) print("rootRelativeSquaredError: " + str(evaluation.root_relative_squared_error)) print("prediction output:\n" + str(pred_output)) plot_cls.plot_roc( evaluation, title="ROC diabetes", class_index=range(0, diabetes_data.class_attribute.num_values), wait=False) plot_cls.plot_prc( evaluation, title="PRC diabetes", class_index=range(0, diabetes_data.class_attribute.num_values), wait=False) # load a numeric dataset bolts_file = helper.get_data_dir() + os.sep + "bolts.arff" helper.print_info("Loading dataset: " + bolts_file) loader = Loader("weka.core.converters.ArffLoader") bolts_data = loader.load_file(bolts_file) bolts_data.class_is_last() # build a classifier and output model helper.print_title("Training LinearRegression on bolts") classifier = Classifier(classname="weka.classifiers.functions.LinearRegression", options=["-S", "1", "-C"]) classifier.build_classifier(bolts_data) print(classifier) # cross-validate numeric classifier helper.print_title("Cross-validating LinearRegression on bolts") classifier = Classifier(classname="weka.classifiers.functions.LinearRegression", options=["-S", "1", "-C"]) evaluation = Evaluation(bolts_data) evaluation.crossvalidate_model(classifier, bolts_data, 10, Random(42)) print(evaluation.summary()) print("correlationCoefficient: " + str(evaluation.correlation_coefficient)) print("errorRate: " + str(evaluation.error_rate)) helper.print_title("Header - bolts") print(str(evaluation.header)) helper.print_title("Predictions on bolts") for index, pred in enumerate(evaluation.predictions): print(str(index+1) + ": " + str(pred) + " -> error=" + str(pred.error)) plot_cls.plot_classifier_errors(evaluation.predictions, wait=False) # learning curve cls = [ Classifier(classname="weka.classifiers.trees.J48"), Classifier(classname="weka.classifiers.bayes.NaiveBayesUpdateable")] plot_cls.plot_learning_curve( cls, diabetes_data, increments=0.05, label_template="[#] !", metric="percent_correct", wait=True) # access classifier's Java API labor_file = helper.get_data_dir() + os.sep + "labor.arff" helper.print_info("Loading dataset: " + labor_file) loader = Loader("weka.core.converters.ArffLoader") labor_data = loader.load_file(labor_file) labor_data.class_is_last() helper.print_title("Using JRip's Java API to access rules") jrip = Classifier(classname="weka.classifiers.rules.JRip") jrip.build_classifier(labor_data) rset = jrip.jwrapper.getRuleset() for i in range(rset.size()): r = rset.get(i) print(str(r.toString(labor_data.class_attribute.jobject)))
def runner(self, cdat, heap_size = 16384, seed = None, verbose = True): self.set_status(Pipeline.RUNNING) self.logs.append('Initializing Pipeline') para = self.config self.logs.append('Reading Pipeline Configuration') head = '' name = get_rand_uuid_str() self.logs.append('Reading Input File') for i, stage in enumerate(self.stages): if stage.code in ('dat.fle', 'prp.bgc', 'prp.nrm', 'prp.pmc', 'prp.sum'): self.stages[i].status = Pipeline.RUNNING if stage.code == 'dat.fle': head = os.path.abspath(stage.value.path) name, _ = os.path.splitext(stage.value.name) self.logs.append('Parsing to ARFF') path = os.path.join(head, '{name}.arff'.format(name = name)) # This bug, I don't know why, using Config.schema instead. # cdat.toARFF(path, express_config = para.Preprocess.schema, verbose = verbose) for i, stage in enumerate(self.stages): if stage.code in ('dat.fle', 'prp.bgc', 'prp.nrm', 'prp.pmc', 'prp.sum'): self.stages[i].status = Pipeline.COMPLETE self.logs.append('Saved ARFF at {path}'.format(path = path)) self.logs.append('Splitting to Training and Testing Sets') JVM.start(max_heap_size = '{size}m'.format(size = heap_size)) load = Loader(classname = 'weka.core.converters.ArffLoader') # data = load.load_file(path) # save = Saver(classname = 'weka.core.converters.ArffSaver') data = load.load_file(os.path.join(head, 'iris.arff')) # For Debugging Purposes Only data.class_is_last() # For Debugging Purposes Only # data.class_index = cdat.iclss for i, stage in enumerate(self.stages): if stage.code == 'prp.kcv': self.stages[i].status = Pipeline.RUNNING self.logs.append('Splitting Training Set') # TODO - Check if this seed is worth it. seed = assign_if_none(seed, random.randint(0, 1000)) opts = ['-S', str(seed), '-N', str(para.Preprocess.FOLDS)] wobj = Filter(classname = 'weka.filters.supervised.instance.StratifiedRemoveFolds', options = opts + ['-V']) wobj.inputformat(data) tran = wobj.filter(data) self.logs.append('Splitting Testing Set') wobj.options = opts test = wobj.filter(data) for i, stage in enumerate(self.stages): if stage.code == 'prp.kcv': self.stages[i].status = Pipeline.COMPLETE self.logs.append('Performing Feature Selection') feat = [ ] for comb in para.FEATURE_SELECTION: if comb.USE: for i, stage in enumerate(self.stages): if stage.code == 'ats': search = stage.value.search.name evaluator = stage.value.evaluator.name if search == comb.Search.NAME and evaluator == comb.Evaluator.NAME: self.stages[i].status = Pipeline.RUNNING srch = ASSearch(classname = 'weka.attributeSelection.{classname}'.format( classname = comb.Search.NAME, options = assign_if_none(comb.Search.OPTIONS, [ ]) )) ewal = ASEvaluation(classname = 'weka.attributeSelection.{classname}'.format( classname = comb.Evaluator.NAME, options = assign_if_none(comb.Evaluator.OPTIONS, [ ]) )) attr = AttributeSelection() attr.search(srch) attr.evaluator(ewal) attr.select_attributes(tran) meta = addict.Dict() meta.search = comb.Search.NAME meta.evaluator = comb.Evaluator.NAME meta.features = [tran.attribute(index).name for index in attr.selected_attributes] feat.append(meta) for i, stage in enumerate(self.stages): if stage.code == 'ats': search = stage.value.search.name evaluator = stage.value.evaluator.name if search == comb.Search.NAME and evaluator == comb.Evaluator.NAME: self.stages[i].status = Pipeline.COMPLETE models = [ ] for model in para.MODEL: if model.USE: summary = addict.Dict() self.logs.append('Modelling {model}'.format(model = model.LABEL)) summary.label = model.LABEL summary.name = model.NAME summary.options = assign_if_none(model.OPTIONS, [ ]) for i, stage in enumerate(self.stages): if stage.code == 'lrn' and stage.value.name == model.NAME: self.stages[i].status = Pipeline.RUNNING for i, instance in enumerate(data): iclass = list(range(instance.num_classes)) options = assign_if_none(model.OPTIONS, [ ]) classifier = Classifier(classname = 'weka.classifiers.{classname}'.format(classname = model.NAME), options = options) classifier.build_classifier(tran) serializer.write(os.path.join(head, '{name}.{classname}.model'.format( name = name, classname = model.NAME )), classifier) self.logs.append('Testing model {model}'.format(model = model.LABEL)) evaluation = Evaluation(tran) evaluation.test_model(classifier, test) summary.summary = evaluation.summary() frame = pd.DataFrame(data = evaluation.confusion_matrix) axes = sns.heatmap(frame, cbar = False, annot = True) b64str = get_b64_plot(axes) summary.confusion_matrix = addict.Dict({ 'value': evaluation.confusion_matrix.tolist(), 'plot': b64str }) self.logs.append('Plotting Learning Curve for {model}'.format(model = model.LABEL)) buffer = io.BytesIO() plot_classifier_errors(evaluation.predictions, tran, test, outfile = buffer, wait = False) b64str = buffer_to_b64(buffer) summary.learning_curve = b64str buffer = io.BytesIO() plot_roc(evaluation, class_index = iclass, outfile = buffer, wait = False) b64str = buffer_to_b64(buffer) summary.roc_curve = b64str buffer = io.BytesIO() plot_prc(evaluation, class_index = iclass, outfile = buffer, wait = False) b64str = buffer_to_b64(buffer) summary.prc_curve = b64str if classifier.graph: summary.graph = classifier.graph for i, instance in enumerate(test): prediction = classifier.classify_instance(instance) for i, stage in enumerate(self.stages): if stage.code == 'lrn' and stage.value.name == model.NAME: self.stages[i].status = Pipeline.COMPLETE models.append(summary) self.gist.models = models JVM.stop() JSON.write(os.path.join(head, '{name}.cgist'.format(name = name)), self.gist) self.logs.append('Pipeline Complete') self.set_status(Pipeline.COMPLETE)