Example #1
import gc

from weka.core.converters import Loader
from weka.classifiers import MultipleClassifiersCombiner, Evaluation, PredictionOutput
from weka.core.classes import Random


def vote_classifier_train(directory, nameOfDataSet, flag):
    loader = Loader(classname="weka.core.converters.CSVLoader")
    data = loader.load_file(directory)
    data.class_is_last()
    meta = MultipleClassifiersCombiner(
        classname="weka.classifiers.meta.Vote",
        options=[
            '-S', '1', '-B', 'weka.classifiers.trees.J48 -C 0.25 -M 2', '-B',
            'weka.classifiers.trees.RandomTree -K 6 -M 1.0 -V 0.001 -S 1',
            '-B',
            'weka.classifiers.meta.Bagging -P 100 -S 1 -num-slots 1 -I 10 -W weka.classifiers.trees.REPTree -- '
            '-M 2 -V 0.001 -N 3 -S 1 -L -1 -I 0.0', '-B',
            'weka.classifiers.meta.AdaBoostM1 -P 100 -S 1 -I 10 -W weka.classifiers.trees.DecisionStump',
            '-B',
            'weka.classifiers.meta.Bagging -P 100 -S 1 -num-slots 1 -I 10 -W weka.classifiers.trees.REPTree -- '
            '-M 2 -V 0.001 -N 3 -S 1 -L -1 -I 0.0', '-B',
            'weka.classifiers.bayes.NaiveBayes', '-R', 'AVG'
        ])
    evl = Evaluation(data)
    pout = PredictionOutput(
        classname="weka.classifiers.evaluation.output.prediction.PlainText")
    if flag:
        evl.crossvalidate_model(meta, data, 10, Random(1), pout)
    else:
        evl.evaluate_train_test_split(meta, data, 80.0, Random(1), pout)
    gc.collect()
    print_and_save('Proposed model', flag, nameOfDataSet, evl)
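A minimal sketch of how this function might be invoked, assuming the JVM has been started and that print_and_save is defined elsewhere in the project; the CSV path and dataset name are hypothetical:

import weka.core.jvm as jvm

jvm.start()
try:
    # flag=True -> 10-fold cross-validation; flag=False -> 80/20 train/test split
    vote_classifier_train("datasets/churn.csv", "churn", True)  # hypothetical path and name
finally:
    jvm.stop()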
Example #2
    def experimenter(self):
        """Perform a test using all available classifiers.

        Returns
        -------
        info : string
            Info with the results of the experimenter.
        """
        info = ""
        aliases = sorted(WekaAlias.get_aliases())
        for alias in aliases:
            try:
                # Skip very slow classifiers.
                if alias in ('KStar', 'LWL', 'MultilayerPerceptron'):
                    continue

                start_time = TimeUtils.get_time()

                classifier = WClassifier(classname=WekaAlias.get_classifier(alias))

                info += "Scheme:\t%s %s\n" % (str(classifier.classname), " ".join([str(option) for option in classifier.options]))

                evl = WEvaluation(self.data)
                evl.evaluate_train_test_split(classifier, self.data, 66, WRandom(1))

                info += "Correctly Classified Instances: %0.4f%%\n" % evl.percent_correct
                info += "Time taken to build model: %0.5f seconds\n\n" % (TimeUtils.get_time() - start_time)

            except Exception as e:
                if str(e) != 'Object does not implement or subclass weka.classifiers.Classifier: __builtin__.NoneType':
                    info += "Exception in %s: %s\n\n" % (alias, str(e))

        return info
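WekaAlias, WClassifier, WEvaluation, WRandom, and TimeUtils are helpers from the surrounding project, not part of python-weka-wrapper. A minimal stand-in, assuming they simply wrap the usual python-weka-wrapper classes and a clock, might look like this (the alias table is a hypothetical subset):

import time

from weka.classifiers import Classifier as WClassifier, Evaluation as WEvaluation
from weka.core.classes import Random as WRandom


class TimeUtils(object):
    @staticmethod
    def get_time():
        # seconds since the epoch
        return time.time()


class WekaAlias(object):
    # short alias -> fully-qualified Weka classname (illustrative subset)
    _aliases = {
        "J48": "weka.classifiers.trees.J48",
        "NaiveBayes": "weka.classifiers.bayes.NaiveBayes",
        "KStar": "weka.classifiers.lazy.KStar",
    }

    @staticmethod
    def get_aliases():
        return list(WekaAlias._aliases.keys())

    @staticmethod
    def get_classifier(alias):
        return WekaAlias._aliases[alias]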
Example #3
import gc

from weka.core.converters import Loader
from weka.classifiers import Classifier, Evaluation, PredictionOutput
from weka.core.classes import Random


def naive_bayes(directory, nameOfDataSet, flag):
    loader = Loader(classname="weka.core.converters.CSVLoader")
    data = loader.load_file(directory)
    data.class_is_last()
    cls = Classifier(classname='weka.classifiers.bayes.NaiveBayes')
    evl = Evaluation(data)
    pout = PredictionOutput(
        classname="weka.classifiers.evaluation.output.prediction.PlainText")
    if flag:
        evl.crossvalidate_model(cls, data, 10, Random(1), pout)
    else:
        evl.evaluate_train_test_split(cls, data, 80.0, Random(1), pout)
    print_and_save('Naive Bayes model', flag, nameOfDataSet, evl)
    gc.collect()
Example #4
from weka.classifiers import Evaluation
from weka.core.classes import Random


def evaluate(classifier, data):
    """
    Evaluates the classifier on the given data. Command-line arguments
    (parsed by evaluate_parser) choose which kind of evaluation to use.

    :param classifier: Classifier
    :param data: weka arff data
    :return: Evaluation
    """
    args = evaluate_parser()
    evaluation = Evaluation(data)
    if args['evaluation'] == 'train_test':
        evaluation.evaluate_train_test_split(classifier, data,
                                             int(args['train_size']),
                                             Random(1))
    elif args['evaluation'] == 'cross_validate':
        evaluation.crossvalidate_model(classifier, data, int(args['folds']),
                                       Random(42))
    else:
        # test_model assumes the classifier has already been trained
        evaluation.test_model(classifier, data)
    return evaluation
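evaluate_parser is not shown in this example; a plausible sketch, assuming it wraps argparse and returns a dict with the keys used above:

import argparse


def evaluate_parser():
    """Hypothetical parser matching the keys that evaluate() reads."""
    parser = argparse.ArgumentParser(description="Choose the evaluation strategy.")
    parser.add_argument("--evaluation", default="cross_validate",
                        choices=["train_test", "cross_validate", "test_set"])
    parser.add_argument("--train_size", type=int, default=66,
                        help="training percentage for the train/test split")
    parser.add_argument("--folds", type=int, default=10,
                        help="number of cross-validation folds")
    return vars(parser.parse_args())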
Example #5
from utilities import *
import weka.core.jvm as jvm

from weka.core.converters import Loader, Saver

from weka.classifiers import Classifier, Evaluation
from weka.core.classes import Random

jvm.start(max_heap_size="3072m")

loader = Loader(classname="weka.core.converters.ArffLoader")
data = loader.load_file("./Dataset/trainGrid.arff")
data.class_is_last()

#classifier = Classifier(classname="weka.classifiers.trees.J48", options=["-C", "0.25", "-M", "2"])
classifier = Classifier(classname="weka.classifiers.bayes.NaiveBayes")

evaluation = Evaluation(data)
#evaluation.crossvalidate_model(classifier, data, 10, Random(42))
evaluation.evaluate_train_test_split(classifier, data, 66, Random(42))
res = evaluation.summary()
res += "\n" + evaluation.matrix()
#f = open('./Dataset/resultsGrid.txt', 'w')
#f.write(res)

print(res)

jvm.stop()
Example #6
data = loader.load_file(fname)
data.class_is_last()

# determine baseline with ZeroR
zeror = Classifier(classname="weka.classifiers.rules.ZeroR")
zeror.build_classifier(data)
evl = Evaluation(data)
evl.test_model(zeror, data)
print("Baseline accuracy (ZeroR): %0.1f%%" % evl.percent_correct)

print("\nHoldout 10%...")
# use seeds 1-10 and perform a random split with 90% training data
perc = []
for i in range(1, 11):
    evl = Evaluation(data)
    evl.evaluate_train_test_split(
        Classifier(classname="weka.classifiers.trees.J48"), data, 90.0, Random(i))
    perc.append(round(evl.percent_correct, 1))
    print("Accuracy with seed %i: %0.1f%%" % (i, evl.percent_correct))

# calculate mean and standard deviation
nperc = numpy.array(perc)
print("mean=%0.2f stdev=%0.2f" % (numpy.mean(nperc), numpy.std(nperc)))

print("\n10-fold Cross-validation...")
# use seeds 1-10 and perform 10-fold CV
perc = []
for i in range(1, 11):
    evl = Evaluation(data)
    evl.crossvalidate_model(Classifier(classname="weka.classifiers.trees.J48"), data, 10, Random(i))
    perc.append(round(evl.percent_correct, 1))
    print("Accuracy with seed %i: %0.1f%%" % (i, evl.percent_correct))
Example #7
def main():
    """
    Just runs some example code.
    """

    # load a dataset
    iris_file = helper.get_data_dir() + os.sep + "iris.arff"
    helper.print_info("Loading dataset: " + iris_file)
    loader = Loader("weka.core.converters.ArffLoader")
    iris_data = loader.load_file(iris_file)
    iris_data.class_is_last()

    # classifier help
    helper.print_title("Creating help string")
    classifier = Classifier(classname="weka.classifiers.trees.J48")
    print(classifier.to_help())

    # partial classname
    helper.print_title("Creating classifier from partial classname")
    clsname = ".J48"
    classifier = Classifier(classname=clsname)
    print(clsname + " --> " + classifier.classname)

    # classifier from commandline
    helper.print_title("Creating SMO from command-line string")
    cmdline = 'weka.classifiers.functions.SMO -K "weka.classifiers.functions.supportVector.NormalizedPolyKernel -E 3.0"'
    classifier = from_commandline(cmdline,
                                  classname="weka.classifiers.Classifier")
    classifier.build_classifier(iris_data)
    print("input: " + cmdline)
    print("output: " + classifier.to_commandline())
    print("model:\n" + str(classifier))

    # kernel classifier
    helper.print_title("Creating SMO as KernelClassifier")
    kernel = Kernel(
        classname="weka.classifiers.functions.supportVector.RBFKernel",
        options=["-G", "0.001"])
    classifier = KernelClassifier(classname="weka.classifiers.functions.SMO",
                                  options=["-M"])
    classifier.kernel = kernel
    classifier.build_classifier(iris_data)
    print("classifier: " + classifier.to_commandline())
    print("model:\n" + str(classifier))

    # build a classifier and output model
    helper.print_title("Training J48 classifier on iris")
    classifier = Classifier(classname="weka.classifiers.trees.J48")
    # Instead of using 'options=["-C", "0.3"]' in the constructor, we can also set the "confidenceFactor"
    # property of the J48 classifier itself. However, being of type float rather than double, we need
    # to convert it to the correct type first using the double_to_float function:
    classifier.set_property("confidenceFactor", types.double_to_float(0.3))
    classifier.build_classifier(iris_data)
    print(classifier)
    print(classifier.graph)
    print(classifier.to_source("MyJ48"))
    plot_graph.plot_dot_graph(classifier.graph)

    # evaluate model on test set
    helper.print_title("Evaluating J48 classifier on iris")
    evaluation = Evaluation(iris_data)
    evl = evaluation.test_model(classifier, iris_data)
    print(evl)
    print(evaluation.summary())

    # evaluate model on train/test split
    helper.print_title("Evaluating J48 classifier on iris (random split 66%)")
    classifier = Classifier(classname="weka.classifiers.trees.J48",
                            options=["-C", "0.3"])
    evaluation = Evaluation(iris_data)
    evaluation.evaluate_train_test_split(classifier, iris_data, 66.0,
                                         Random(1))
    print(evaluation.summary())

    # load a dataset incrementally and build classifier incrementally
    helper.print_title("Build classifier incrementally on iris")
    helper.print_info("Loading dataset: " + iris_file)
    loader = Loader("weka.core.converters.ArffLoader")
    iris_inc = loader.load_file(iris_file, incremental=True)
    iris_inc.class_is_last()
    classifier = Classifier(
        classname="weka.classifiers.bayes.NaiveBayesUpdateable")
    classifier.build_classifier(iris_inc)
    for inst in loader:
        classifier.update_classifier(inst)
    print(classifier)

    # construct meta-classifiers
    helper.print_title("Meta classifiers")
    # generic FilteredClassifier instantiation
    print("generic FilteredClassifier instantiation")
    meta = SingleClassifierEnhancer(
        classname="weka.classifiers.meta.FilteredClassifier")
    meta.classifier = Classifier(
        classname="weka.classifiers.functions.LinearRegression")
    flter = Filter("weka.filters.unsupervised.attribute.Remove")
    flter.options = ["-R", "first"]
    meta.set_property("filter", flter.jobject)
    print(meta.to_commandline())
    # direct FilteredClassifier instantiation
    print("direct FilteredClassifier instantiation")
    meta = FilteredClassifier()
    meta.classifier = Classifier(
        classname="weka.classifiers.functions.LinearRegression")
    flter = Filter("weka.filters.unsupervised.attribute.Remove")
    flter.options = ["-R", "first"]
    meta.filter = flter
    print(meta.to_commandline())
    # generic Vote
    print("generic Vote instantiation")
    meta = MultipleClassifiersCombiner(classname="weka.classifiers.meta.Vote")
    classifiers = [
        Classifier(classname="weka.classifiers.functions.SMO"),
        Classifier(classname="weka.classifiers.trees.J48")
    ]
    meta.classifiers = classifiers
    print(meta.to_commandline())

    # cross-validate nominal classifier
    helper.print_title("Cross-validating NaiveBayes on diabetes")
    diabetes_file = helper.get_data_dir() + os.sep + "diabetes.arff"
    helper.print_info("Loading dataset: " + diabetes_file)
    loader = Loader("weka.core.converters.ArffLoader")
    diabetes_data = loader.load_file(diabetes_file)
    diabetes_data.class_is_last()
    classifier = Classifier(classname="weka.classifiers.bayes.NaiveBayes")
    pred_output = PredictionOutput(
        classname="weka.classifiers.evaluation.output.prediction.PlainText",
        options=["-distribution"])
    evaluation = Evaluation(diabetes_data)
    evaluation.crossvalidate_model(classifier,
                                   diabetes_data,
                                   10,
                                   Random(42),
                                   output=pred_output)
    print(evaluation.summary())
    print(evaluation.class_details())
    print(evaluation.matrix())
    print("areaUnderPRC/0: " + str(evaluation.area_under_prc(0)))
    print("weightedAreaUnderPRC: " + str(evaluation.weighted_area_under_prc))
    print("areaUnderROC/1: " + str(evaluation.area_under_roc(1)))
    print("weightedAreaUnderROC: " + str(evaluation.weighted_area_under_roc))
    print("avgCost: " + str(evaluation.avg_cost))
    print("totalCost: " + str(evaluation.total_cost))
    print("confusionMatrix: " + str(evaluation.confusion_matrix))
    print("correct: " + str(evaluation.correct))
    print("pctCorrect: " + str(evaluation.percent_correct))
    print("incorrect: " + str(evaluation.incorrect))
    print("pctIncorrect: " + str(evaluation.percent_incorrect))
    print("unclassified: " + str(evaluation.unclassified))
    print("pctUnclassified: " + str(evaluation.percent_unclassified))
    print("coverageOfTestCasesByPredictedRegions: " +
          str(evaluation.coverage_of_test_cases_by_predicted_regions))
    print("sizeOfPredictedRegions: " +
          str(evaluation.size_of_predicted_regions))
    print("falseNegativeRate: " + str(evaluation.false_negative_rate(1)))
    print("weightedFalseNegativeRate: " +
          str(evaluation.weighted_false_negative_rate))
    print("numFalseNegatives: " + str(evaluation.num_false_negatives(1)))
    print("trueNegativeRate: " + str(evaluation.true_negative_rate(1)))
    print("weightedTrueNegativeRate: " +
          str(evaluation.weighted_true_negative_rate))
    print("numTrueNegatives: " + str(evaluation.num_true_negatives(1)))
    print("falsePositiveRate: " + str(evaluation.false_positive_rate(1)))
    print("weightedFalsePositiveRate: " +
          str(evaluation.weighted_false_positive_rate))
    print("numFalsePositives: " + str(evaluation.num_false_positives(1)))
    print("truePositiveRate: " + str(evaluation.true_positive_rate(1)))
    print("weightedTruePositiveRate: " +
          str(evaluation.weighted_true_positive_rate))
    print("numTruePositives: " + str(evaluation.num_true_positives(1)))
    print("fMeasure: " + str(evaluation.f_measure(1)))
    print("weightedFMeasure: " + str(evaluation.weighted_f_measure))
    print("unweightedMacroFmeasure: " +
          str(evaluation.unweighted_macro_f_measure))
    print("unweightedMicroFmeasure: " +
          str(evaluation.unweighted_micro_f_measure))
    print("precision: " + str(evaluation.precision(1)))
    print("weightedPrecision: " + str(evaluation.weighted_precision))
    print("recall: " + str(evaluation.recall(1)))
    print("weightedRecall: " + str(evaluation.weighted_recall))
    print("kappa: " + str(evaluation.kappa))
    print("KBInformation: " + str(evaluation.kb_information))
    print("KBMeanInformation: " + str(evaluation.kb_mean_information))
    print("KBRelativeInformation: " + str(evaluation.kb_relative_information))
    print("SFEntropyGain: " + str(evaluation.sf_entropy_gain))
    print("SFMeanEntropyGain: " + str(evaluation.sf_mean_entropy_gain))
    print("SFMeanPriorEntropy: " + str(evaluation.sf_mean_prior_entropy))
    print("SFMeanSchemeEntropy: " + str(evaluation.sf_mean_scheme_entropy))
    print("matthewsCorrelationCoefficient: " +
          str(evaluation.matthews_correlation_coefficient(1)))
    print("weightedMatthewsCorrelation: " +
          str(evaluation.weighted_matthews_correlation))
    print("class priors: " + str(evaluation.class_priors))
    print("numInstances: " + str(evaluation.num_instances))
    print("meanAbsoluteError: " + str(evaluation.mean_absolute_error))
    print("meanPriorAbsoluteError: " +
          str(evaluation.mean_prior_absolute_error))
    print("relativeAbsoluteError: " + str(evaluation.relative_absolute_error))
    print("rootMeanSquaredError: " + str(evaluation.root_mean_squared_error))
    print("rootMeanPriorSquaredError: " +
          str(evaluation.root_mean_prior_squared_error))
    print("rootRelativeSquaredError: " +
          str(evaluation.root_relative_squared_error))
    print("prediction output:\n" + str(pred_output))
    plot_cls.plot_roc(evaluation,
                      title="ROC diabetes",
                      class_index=range(
                          0, diabetes_data.class_attribute.num_values),
                      wait=False)
    plot_cls.plot_prc(evaluation,
                      title="PRC diabetes",
                      class_index=range(
                          0, diabetes_data.class_attribute.num_values),
                      wait=False)

    # train 2nd classifier on diabetes dataset
    classifier2 = Classifier(classname="weka.classifiers.trees.RandomForest")
    evaluation2 = Evaluation(diabetes_data)
    evaluation2.crossvalidate_model(classifier2, diabetes_data, 10, Random(42))
    plot_cls.plot_rocs({
        "NB": evaluation,
        "RF": evaluation2
    },
                       title="ROC diabetes",
                       class_index=0,
                       wait=False)
    plot_cls.plot_prcs({
        "NB": evaluation,
        "RF": evaluation2
    },
                       title="PRC diabetes",
                       class_index=0,
                       wait=False)

    # load a numeric dataset
    bolts_file = helper.get_data_dir() + os.sep + "bolts.arff"
    helper.print_info("Loading dataset: " + bolts_file)
    loader = Loader("weka.core.converters.ArffLoader")
    bolts_data = loader.load_file(bolts_file)
    bolts_data.class_is_last()

    # build a classifier and output model
    helper.print_title("Training LinearRegression on bolts")
    classifier = Classifier(
        classname="weka.classifiers.functions.LinearRegression",
        options=["-S", "1", "-C"])
    classifier.build_classifier(bolts_data)
    print(classifier)

    # cross-validate numeric classifier
    helper.print_title("Cross-validating LinearRegression on bolts")
    classifier = Classifier(
        classname="weka.classifiers.functions.LinearRegression",
        options=["-S", "1", "-C"])
    evaluation = Evaluation(bolts_data)
    evaluation.crossvalidate_model(classifier, bolts_data, 10, Random(42))
    print(evaluation.summary())
    print("correlationCoefficient: " + str(evaluation.correlation_coefficient))
    print("errorRate: " + str(evaluation.error_rate))
    helper.print_title("Header - bolts")
    print(str(evaluation.header))
    helper.print_title("Predictions on bolts")
    for index, pred in enumerate(evaluation.predictions):
        print(
            str(index + 1) + ": " + str(pred) + " -> error=" + str(pred.error))
    plot_cls.plot_classifier_errors(evaluation.predictions, wait=False)

    # train 2nd classifier and show errors in same plot
    classifier2 = Classifier(classname="weka.classifiers.functions.SMOreg")
    evaluation2 = Evaluation(bolts_data)
    evaluation2.crossvalidate_model(classifier2, bolts_data, 10, Random(42))
    plot_cls.plot_classifier_errors(
        {
            "LR": evaluation.predictions,
            "SMOreg": evaluation2.predictions
        },
        wait=False)

    # learning curve
    cls = [
        Classifier(classname="weka.classifiers.trees.J48"),
        Classifier(classname="weka.classifiers.bayes.NaiveBayesUpdateable")
    ]
    plot_cls.plot_learning_curve(cls,
                                 diabetes_data,
                                 increments=0.05,
                                 label_template="[#] !",
                                 metric="percent_correct",
                                 wait=True)

    # access classifier's Java API
    labor_file = helper.get_data_dir() + os.sep + "labor.arff"
    helper.print_info("Loading dataset: " + labor_file)
    loader = Loader("weka.core.converters.ArffLoader")
    labor_data = loader.load_file(labor_file)
    labor_data.class_is_last()

    helper.print_title("Using JRip's Java API to access rules")
    jrip = Classifier(classname="weka.classifiers.rules.JRip")
    jrip.build_classifier(labor_data)
    rset = jrip.jwrapper.getRuleset()
    for i in range(rset.size()):
        r = rset.get(i)
        print(str(r.toString(labor_data.class_attribute.jobject)))
Example #8
import os

import numpy
import weka.core.jvm as jvm
from weka.core.classes import Random
from weka.core.converters import Loader
from weka.classifiers import Classifier, Evaluation

jvm.start()

# load diabetes
loader = Loader(classname="weka.core.converters.ArffLoader")
fname = data_dir + os.sep + "diabetes.arff"  # data_dir is assumed to point at the sample ARFF files
print("\nLoading dataset: " + fname + "\n")
data = loader.load_file(fname)
data.class_is_last()

for classifier in ["weka.classifiers.bayes.NaiveBayes", "weka.classifiers.rules.ZeroR", "weka.classifiers.trees.J48"]:
    # train/test split 90% using classifier
    cls = Classifier(classname=classifier)
    evl = Evaluation(data)
    evl.evaluate_train_test_split(cls, data, 90.0, Random(1))
    print("\n" + classifier + " train/test split (90%):\n" + evl.summary())
    cls.build_classifier(data)
    print(classifier + " model:\n\n" + str(cls))

# calculate mean/stdev over 10 cross-validations
for classifier in [
    "weka.classifiers.meta.ClassificationViaRegression", "weka.classifiers.bayes.NaiveBayes",
        "weka.classifiers.rules.ZeroR", "weka.classifiers.trees.J48", "weka.classifiers.functions.Logistic"]:
    accuracy = []
    for i in range(1, 11):
        cls = Classifier(classname=classifier)
        evl = Evaluation(data)
        evl.crossvalidate_model(cls, data, 10, Random(i))
        accuracy.append(evl.percent_correct)
    nacc = numpy.array(accuracy)
    print("%s: mean=%0.2f stdev=%0.2f" % (classifier, numpy.mean(nacc), numpy.std(nacc)))
Example #9
def main():
    """
    Just runs some example code.
    """

    # load a dataset
    iris_file = helper.get_data_dir() + os.sep + "iris.arff"
    helper.print_info("Loading dataset: " + iris_file)
    loader = Loader("weka.core.converters.ArffLoader")
    iris_data = loader.load_file(iris_file)
    iris_data.class_is_last()

    # classifier help
    helper.print_title("Creating help string")
    classifier = Classifier(classname="weka.classifiers.trees.J48")
    print(classifier.to_help())

    # partial classname
    helper.print_title("Creating classifier from partial classname")
    clsname = ".J48"
    classifier = Classifier(classname=clsname)
    print(clsname + " --> " + classifier.classname)

    # classifier from commandline
    helper.print_title("Creating SMO from command-line string")
    cmdline = 'weka.classifiers.functions.SMO -K "weka.classifiers.functions.supportVector.NormalizedPolyKernel -E 3.0"'
    classifier = from_commandline(cmdline, classname="weka.classifiers.Classifier")
    classifier.build_classifier(iris_data)
    print("input: " + cmdline)
    print("output: " + classifier.to_commandline())
    print("model:\n" + str(classifier))

    # kernel classifier
    helper.print_title("Creating SMO as KernelClassifier")
    kernel = Kernel(classname="weka.classifiers.functions.supportVector.RBFKernel", options=["-G", "0.001"])
    classifier = KernelClassifier(classname="weka.classifiers.functions.SMO", options=["-M"])
    classifier.kernel = kernel
    classifier.build_classifier(iris_data)
    print("classifier: " + classifier.to_commandline())
    print("model:\n" + str(classifier))

    # build a classifier and output model
    helper.print_title("Training J48 classifier on iris")
    classifier = Classifier(classname="weka.classifiers.trees.J48")
    # Instead of using 'options=["-C", "0.3"]' in the constructor, we can also set the "confidenceFactor"
    # property of the J48 classifier itself. However, being of type float rather than double, we need
    # to convert it to the correct type first using the double_to_float function:
    classifier.set_property("confidenceFactor", typeconv.double_to_float(0.3))
    classifier.build_classifier(iris_data)
    print(classifier)
    print(classifier.graph)
    print(classifier.to_source("MyJ48"))
    plot_graph.plot_dot_graph(classifier.graph)

    # evaluate model on test set
    helper.print_title("Evaluating J48 classifier on iris")
    evaluation = Evaluation(iris_data)
    evl = evaluation.test_model(classifier, iris_data)
    print(evl)
    print(evaluation.summary())

    # evaluate model on train/test split
    helper.print_title("Evaluating J48 classifier on iris (random split 66%)")
    classifier = Classifier(classname="weka.classifiers.trees.J48", options=["-C", "0.3"])
    evaluation = Evaluation(iris_data)
    evaluation.evaluate_train_test_split(classifier, iris_data, 66.0, Random(1))
    print(evaluation.summary())

    # load a dataset incrementally and build classifier incrementally
    helper.print_title("Build classifier incrementally on iris")
    helper.print_info("Loading dataset: " + iris_file)
    loader = Loader("weka.core.converters.ArffLoader")
    iris_inc = loader.load_file(iris_file, incremental=True)
    iris_inc.class_is_last()
    classifier = Classifier(classname="weka.classifiers.bayes.NaiveBayesUpdateable")
    classifier.build_classifier(iris_inc)
    for inst in loader:
        classifier.update_classifier(inst)
    print(classifier)

    # construct meta-classifiers
    helper.print_title("Meta classifiers")
    # generic FilteredClassifier instantiation
    print("generic FilteredClassifier instantiation")
    meta = SingleClassifierEnhancer(classname="weka.classifiers.meta.FilteredClassifier")
    meta.classifier = Classifier(classname="weka.classifiers.functions.LinearRegression")
    flter = Filter("weka.filters.unsupervised.attribute.Remove")
    flter.options = ["-R", "first"]
    meta.set_property("filter", flter.jobject)
    print(meta.to_commandline())
    # direct FilteredClassifier instantiation
    print("direct FilteredClassifier instantiation")
    meta = FilteredClassifier()
    meta.classifier = Classifier(classname="weka.classifiers.functions.LinearRegression")
    flter = Filter("weka.filters.unsupervised.attribute.Remove")
    flter.options = ["-R", "first"]
    meta.filter = flter
    print(meta.to_commandline())
    # generic Vote
    print("generic Vote instantiation")
    meta = MultipleClassifiersCombiner(classname="weka.classifiers.meta.Vote")
    classifiers = [
        Classifier(classname="weka.classifiers.functions.SMO"),
        Classifier(classname="weka.classifiers.trees.J48")
    ]
    meta.classifiers = classifiers
    print(meta.to_commandline())

    # cross-validate nominal classifier
    helper.print_title("Cross-validating NaiveBayes on diabetes")
    diabetes_file = helper.get_data_dir() + os.sep + "diabetes.arff"
    helper.print_info("Loading dataset: " + diabetes_file)
    loader = Loader("weka.core.converters.ArffLoader")
    diabetes_data = loader.load_file(diabetes_file)
    diabetes_data.class_is_last()
    classifier = Classifier(classname="weka.classifiers.bayes.NaiveBayes")
    pred_output = PredictionOutput(
        classname="weka.classifiers.evaluation.output.prediction.PlainText", options=["-distribution"])
    evaluation = Evaluation(diabetes_data)
    evaluation.crossvalidate_model(classifier, diabetes_data, 10, Random(42), output=pred_output)
    print(evaluation.summary())
    print(evaluation.class_details())
    print(evaluation.matrix())
    print("areaUnderPRC/0: " + str(evaluation.area_under_prc(0)))
    print("weightedAreaUnderPRC: " + str(evaluation.weighted_area_under_prc))
    print("areaUnderROC/1: " + str(evaluation.area_under_roc(1)))
    print("weightedAreaUnderROC: " + str(evaluation.weighted_area_under_roc))
    print("avgCost: " + str(evaluation.avg_cost))
    print("totalCost: " + str(evaluation.total_cost))
    print("confusionMatrix: " + str(evaluation.confusion_matrix))
    print("correct: " + str(evaluation.correct))
    print("pctCorrect: " + str(evaluation.percent_correct))
    print("incorrect: " + str(evaluation.incorrect))
    print("pctIncorrect: " + str(evaluation.percent_incorrect))
    print("unclassified: " + str(evaluation.unclassified))
    print("pctUnclassified: " + str(evaluation.percent_unclassified))
    print("coverageOfTestCasesByPredictedRegions: " + str(evaluation.coverage_of_test_cases_by_predicted_regions))
    print("sizeOfPredictedRegions: " + str(evaluation.size_of_predicted_regions))
    print("falseNegativeRate: " + str(evaluation.false_negative_rate(1)))
    print("weightedFalseNegativeRate: " + str(evaluation.weighted_false_negative_rate))
    print("numFalseNegatives: " + str(evaluation.num_false_negatives(1)))
    print("trueNegativeRate: " + str(evaluation.true_negative_rate(1)))
    print("weightedTrueNegativeRate: " + str(evaluation.weighted_true_negative_rate))
    print("numTrueNegatives: " + str(evaluation.num_true_negatives(1)))
    print("falsePositiveRate: " + str(evaluation.false_positive_rate(1)))
    print("weightedFalsePositiveRate: " + str(evaluation.weighted_false_positive_rate))
    print("numFalsePositives: " + str(evaluation.num_false_positives(1)))
    print("truePositiveRate: " + str(evaluation.true_positive_rate(1)))
    print("weightedTruePositiveRate: " + str(evaluation.weighted_true_positive_rate))
    print("numTruePositives: " + str(evaluation.num_true_positives(1)))
    print("fMeasure: " + str(evaluation.f_measure(1)))
    print("weightedFMeasure: " + str(evaluation.weighted_f_measure))
    print("unweightedMacroFmeasure: " + str(evaluation.unweighted_macro_f_measure))
    print("unweightedMicroFmeasure: " + str(evaluation.unweighted_micro_f_measure))
    print("precision: " + str(evaluation.precision(1)))
    print("weightedPrecision: " + str(evaluation.weighted_precision))
    print("recall: " + str(evaluation.recall(1)))
    print("weightedRecall: " + str(evaluation.weighted_recall))
    print("kappa: " + str(evaluation.kappa))
    print("KBInformation: " + str(evaluation.kb_information))
    print("KBMeanInformation: " + str(evaluation.kb_mean_information))
    print("KBRelativeInformation: " + str(evaluation.kb_relative_information))
    print("SFEntropyGain: " + str(evaluation.sf_entropy_gain))
    print("SFMeanEntropyGain: " + str(evaluation.sf_mean_entropy_gain))
    print("SFMeanPriorEntropy: " + str(evaluation.sf_mean_prior_entropy))
    print("SFMeanSchemeEntropy: " + str(evaluation.sf_mean_scheme_entropy))
    print("matthewsCorrelationCoefficient: " + str(evaluation.matthews_correlation_coefficient(1)))
    print("weightedMatthewsCorrelation: " + str(evaluation.weighted_matthews_correlation))
    print("class priors: " + str(evaluation.class_priors))
    print("numInstances: " + str(evaluation.num_instances))
    print("meanAbsoluteError: " + str(evaluation.mean_absolute_error))
    print("meanPriorAbsoluteError: " + str(evaluation.mean_prior_absolute_error))
    print("relativeAbsoluteError: " + str(evaluation.relative_absolute_error))
    print("rootMeanSquaredError: " + str(evaluation.root_mean_squared_error))
    print("rootMeanPriorSquaredError: " + str(evaluation.root_mean_prior_squared_error))
    print("rootRelativeSquaredError: " + str(evaluation.root_relative_squared_error))
    print("prediction output:\n" + str(pred_output))
    plot_cls.plot_roc(
        evaluation, title="ROC diabetes",
        class_index=range(0, diabetes_data.class_attribute.num_values), wait=False)
    plot_cls.plot_prc(
        evaluation, title="PRC diabetes",
        class_index=range(0, diabetes_data.class_attribute.num_values), wait=False)

    # load a numeric dataset
    bolts_file = helper.get_data_dir() + os.sep + "bolts.arff"
    helper.print_info("Loading dataset: " + bolts_file)
    loader = Loader("weka.core.converters.ArffLoader")
    bolts_data = loader.load_file(bolts_file)
    bolts_data.class_is_last()

    # build a classifier and output model
    helper.print_title("Training LinearRegression on bolts")
    classifier = Classifier(classname="weka.classifiers.functions.LinearRegression", options=["-S", "1", "-C"])
    classifier.build_classifier(bolts_data)
    print(classifier)

    # cross-validate numeric classifier
    helper.print_title("Cross-validating LinearRegression on bolts")
    classifier = Classifier(classname="weka.classifiers.functions.LinearRegression", options=["-S", "1", "-C"])
    evaluation = Evaluation(bolts_data)
    evaluation.crossvalidate_model(classifier, bolts_data, 10, Random(42))
    print(evaluation.summary())
    print("correlationCoefficient: " + str(evaluation.correlation_coefficient))
    print("errorRate: " + str(evaluation.error_rate))
    helper.print_title("Header - bolts")
    print(str(evaluation.header))
    helper.print_title("Predictions on bolts")
    for index, pred in enumerate(evaluation.predictions):
        print(str(index+1) + ": " + str(pred) + " -> error=" + str(pred.error))
    plot_cls.plot_classifier_errors(evaluation.predictions, wait=False)

    # learning curve
    cls = [
        Classifier(classname="weka.classifiers.trees.J48"),
        Classifier(classname="weka.classifiers.bayes.NaiveBayesUpdateable")]
    plot_cls.plot_learning_curve(
        cls, diabetes_data, increments=0.05, label_template="[#] !", metric="percent_correct", wait=True)

    # access classifier's Java API
    labor_file = helper.get_data_dir() + os.sep + "labor.arff"
    helper.print_info("Loading dataset: " + labor_file)
    loader = Loader("weka.core.converters.ArffLoader")
    labor_data = loader.load_file(labor_file)
    labor_data.class_is_last()

    helper.print_title("Using JRip's Java API to access rules")
    jrip = Classifier(classname="weka.classifiers.rules.JRip")
    jrip.build_classifier(labor_data)
    rset = jrip.jwrapper.getRuleset()
    for i in range(rset.size()):
        r = rset.get(i)
        print(str(r.toString(labor_data.class_attribute.jobject)))
Example #10
import logging
import time

import weka.core.jvm as jvm
from weka.core.converters import Loader
from weka.filters import Filter
from weka.classifiers import Classifier, Evaluation


def testing():
    logging.disable(logging.CRITICAL)  # silence library log output

    print("PROSES KLASIFIKASI\n------------------")

    jvm.start()

    train_percentages = [40, 50, 60, 70]

    for pruning in range(2):
        for persen_train in range(4):
            for fitur_hapus in range(15, -1, -1):

                list_akurasi = []
                list_recall = []
                list_presisi = []
                list_fmeasure = []
                list_roc = []

                persen = train_percentages[persen_train]
                prefix = "unpruning" if pruning == 0 else "pruning"

                # output file, e.g. hasilTest/pruning40removeF15.txt
                nama = "hasilTest/" + prefix + str(persen)
                if fitur_hapus > 0:
                    nama += "removeF" + str(fitur_hapus) + ".txt"
                else:
                    nama += "normal.txt"

                f = open(nama, "w")

                if pruning == 0:
                    print("Tanpa Pruning")
                    f.write("Hasil Decision Tree C4.5 tanpa Pruning (unpruning)\n")
                else:
                    print("Dengan Pruning")
                    f.write("Hasil Decision Tree C4.5 Pruning\n")
                f.write("Dengan Training Set sebesar %d%%\n" % persen)

                if fitur_hapus > 0:
                    f.write("Menggunakan remove pada fitur " + str(fitur_hapus) + "\n\n")
                else:
                    f.write("\n")

                f.write("No. Akurasi Recall Presisi F-Measure ROC\n")

                print("%d%% Data Training" % persen)
                print("Fitur yang dihapus:", fitur_hapus)
                print("\nNo.\tAkurasi\tRecall\tPresisi\tF-Measure\tROC")

                for count in range(100):
                    loader = Loader(classname="weka.core.converters.ArffLoader")
                    data = loader.load_file("hasil.arff")
                    data.class_is_last()

                    if fitur_hapus > 0:
                        remove = Filter(classname="weka.filters.unsupervised.attribute.Remove",
                                        options=["-R", str(fitur_hapus)])
                        remove.inputformat(data)
                        data_baru = remove.filter(data)
                        data_baru.class_is_last()
                    else:
                        data_baru = loader.load_file("hasil.arff")
                        data_baru.class_is_last()

                    # shuffle with a time-based seed; the sleep at the end of the
                    # loop ensures consecutive runs get different seeds
                    randomize = Filter(classname="weka.filters.unsupervised.instance.Randomize",
                                       options=["-S", str(int(time.time()))])
                    randomize.inputformat(data_baru)
                    data_random = randomize.filter(data_baru)
                    data_random.class_is_last()

                    if pruning == 0:
                        classifier = Classifier(classname="weka.classifiers.trees.J48", options=["-U"])
                    else:
                        classifier = Classifier(classname="weka.classifiers.trees.J48", options=["-C", "0.25"])

                    evaluation = Evaluation(data_random)
                    evaluation.evaluate_train_test_split(classifier, data_random, percentage=persen)

                    f.write("%d. %s %s %s %s %s\n" % (
                        count + 1, evaluation.weighted_true_positive_rate,
                        evaluation.weighted_recall, evaluation.weighted_precision,
                        evaluation.weighted_f_measure, evaluation.weighted_area_under_roc))
                    print(count + 1, evaluation.weighted_true_positive_rate,
                          evaluation.weighted_recall, evaluation.weighted_precision,
                          evaluation.weighted_f_measure, evaluation.weighted_area_under_roc)

                    list_akurasi.append(evaluation.weighted_true_positive_rate)
                    list_recall.append(evaluation.weighted_recall)
                    list_presisi.append(evaluation.weighted_precision)
                    list_fmeasure.append(evaluation.weighted_f_measure)
                    list_roc.append(evaluation.weighted_area_under_roc)

                    time.sleep(1)

                list_akurasi.sort()
                list_recall.sort()
                list_presisi.sort()
                list_fmeasure.sort()
                list_roc.sort()

                # report average, max, and min for each metric
                metrics = [("Akurasi", list_akurasi), ("Recall", list_recall),
                           ("Presisi", list_presisi), ("F-Measure", list_fmeasure),
                           ("ROC", list_roc)]
                for title, pick in [("Rata-Rata", lambda values: sum(values) / 100.0),
                                    ("Max", lambda values: values[-1]),
                                    ("Min", lambda values: values[0])]:
                    f.write("\n%s\n" % title)
                    print("\n" + title)
                    for name, values in metrics:
                        f.write("%s:%s\n" % (name, pick(values)))
                        print("%s:" % name, pick(values))
                f.write("\n")
                print("")

                f.close()

    jvm.stop()
Example #11
import os

import numpy
import weka.core.jvm as jvm
from weka.core.classes import Random
from weka.core.converters import Loader
from weka.classifiers import Classifier, Evaluation

jvm.start()

# load diabetes
loader = Loader(classname="weka.core.converters.ArffLoader")
fname = data_dir + os.sep + "diabetes.arff"  # data_dir is assumed to point at the sample ARFF files
print("\nLoading dataset: " + fname + "\n")
data = loader.load_file(fname)
data.class_is_last()

for classifier in ["weka.classifiers.bayes.NaiveBayes", "weka.classifiers.rules.ZeroR", "weka.classifiers.trees.J48"]:
    # train/test split 90% using classifier
    cls = Classifier(classname=classifier)
    evl = Evaluation(data)
    evl.evaluate_train_test_split(cls, data, 90.0, Random(1))
    print("\n" + classifier + " train/test split (90%):\n" + evl.summary())
    cls.build_classifier(data)
    print(classifier + " model:\n\n" + str(cls))

# calculate mean/stdev over 10 cross-validations
for classifier in [
    "weka.classifiers.meta.ClassificationViaRegression", "weka.classifiers.bayes.NaiveBayes",
        "weka.classifiers.rules.ZeroR", "weka.classifiers.trees.J48", "weka.classifiers.functions.Logistic"]:
    accuracy = []
    for i in range(1, 11):
        cls = Classifier(classname=classifier)
        evl = Evaluation(data)
        evl.crossvalidate_model(cls, data, 10, Random(i))
        accuracy.append(evl.percent_correct)
    nacc = numpy.array(accuracy)
    print("%s: mean=%0.2f stdev=%0.2f" % (classifier, numpy.mean(nacc), numpy.std(nacc)))
Example #12
import os

import weka.core.jvm as jvm
from weka.core.classes import Random
from weka.core.converters import Loader
from weka.classifiers import Classifier, Evaluation

jvm.start()

loader = Loader(classname="weka.core.converters.ArffLoader")

fname = data_dir + os.sep + "segment-challenge.arff"  # data_dir is assumed to point at the sample ARFF files
print("\nLoading dataset: " + fname + "\n")
train = loader.load_file(fname)
train.class_is_last()

fname = data_dir + os.sep + "segment-test.arff"
print("\nLoading dataset: " + fname + "\n")
test = loader.load_file(fname)
test.class_is_last()

# build J48
cls = Classifier(classname="weka.classifiers.trees.J48")
cls.build_classifier(train)

# evaluate on test
evl = Evaluation(train)
evl.test_model(cls, test)
print("Test set accuracy: %0.0f%%" % evl.percent_correct)

# evaluate on train
evl = Evaluation(train)
evl.test_model(cls, train)
print("Train set accuracy: %0.0f%%" % evl.percent_correct)

# evaluate on random split
evl = Evaluation(train)
evl.evaluate_train_test_split(cls, train, 66.0, Random(1))
print("Random split accuracy: %0.0f%%" % evl.percent_correct)

jvm.stop()
Example #13
data.class_is_last()

# determine baseline with ZeroR
zeror = Classifier(classname="weka.classifiers.rules.ZeroR")
zeror.build_classifier(data)
evl = Evaluation(data)
evl.test_model(zeror, data)
print("Baseline accuracy (ZeroR): %0.1f%%" % evl.percent_correct)

print("\nHoldout 10%...")
# use seed 1-10 and perform random split with 90%
perc = []
for i in range(1, 11):
    evl = Evaluation(data)
    evl.evaluate_train_test_split(
        Classifier(classname="weka.classifiers.trees.J48"), data, 90.0,
        Random(i))
    perc.append(round(evl.percent_correct, 1))
    print("Accuracy with seed %i: %0.1f%%" % (i, evl.percent_correct))

# calculate mean and standard deviation
nperc = numpy.array(perc)
print("mean=%0.2f stdev=%0.2f" % (numpy.mean(nperc), numpy.std(nperc)))

print("\n10-fold Cross-validation...")
# use seed 1-10 and perform 10-fold CV
perc = []
for i in range(1, 11):
    evl = Evaluation(data)
    evl.crossvalidate_model(Classifier(classname="weka.classifiers.trees.J48"),
                            data, 10, Random(i))
    perc.append(round(evl.percent_correct, 1))
    print("Accuracy with seed %i: %0.1f%%" % (i, evl.percent_correct))
Example #14
import os

import weka.core.jvm as jvm
from weka.core.classes import Random
from weka.core.converters import Loader
from weka.classifiers import Classifier, Evaluation

jvm.start()

loader = Loader(classname="weka.core.converters.ArffLoader")

fname = data_dir + os.sep + "segment-challenge.arff"  # data_dir is assumed to point at the sample ARFF files
print("\nLoading dataset: " + fname + "\n")
train = loader.load_file(fname)
train.class_is_last()

fname = data_dir + os.sep + "segment-test.arff"
print("\nLoading dataset: " + fname + "\n")
test = loader.load_file(fname)
test.class_is_last()

# build J48
cls = Classifier(classname="weka.classifiers.trees.J48")
cls.build_classifier(train)

# evaluate on test
evl = Evaluation(train)
evl.test_model(cls, test)
print("Test set accuracy: %0.0f%%" % evl.percent_correct)

# evaluate on train
evl = Evaluation(train)
evl.test_model(cls, train)
print("Train set accuracy: %0.0f%%" % evl.percent_correct)

# evaluate on random split
evl = Evaluation(train)
evl.evaluate_train_test_split(cls, train, 66.0, Random(1))
print("Random split accuracy: %0.0f%%" % evl.percent_correct)

jvm.stop()