Ejemplo n.º 1
0
# Load the test set and mark its last attribute as the class attribute.
fname = data_dir + os.sep + "ReutersGrain-test.arff"
print("\nLoading dataset: " + fname + "\n")
loader = Loader(classname="weka.core.converters.ArffLoader")
test = loader.load_file(fname)
test.set_class_index(test.num_attributes() - 1)

# Each entry pairs a base classifier class name with the option list that is
# handed to the StringToWordVector filter for that run.
setups = (
    ("weka.classifiers.trees.J48", []),
    ("weka.classifiers.bayes.NaiveBayes", []),
    ("weka.classifiers.bayes.NaiveBayesMultinomial", []),
    ("weka.classifiers.bayes.NaiveBayesMultinomial", ["-C"]),
    ("weka.classifiers.bayes.NaiveBayesMultinomial", ["-C", "-L", "-S"]),
)

# For every setup: build the filtered classifier on `data` and evaluate it on
# the test set (no cross-validation happens here).
# NOTE(review): `data` is defined outside this snippet - presumably the
# matching training set; confirm against the preceding code.
for classifier, opt in setups:
    print("\n--> %s (filter options: %s)\n" % (classifier, " ".join(opt)))

    base = Classifier(classname=classifier)
    word_vector = Filter(
        classname="weka.filters.unsupervised.attribute.StringToWordVector",
        options=opt)
    filtered = FilteredClassifier()
    filtered.set_classifier(base)
    filtered.set_filter(word_vector)
    filtered.build_classifier(data)

    evaluation = Evaluation(test)
    evaluation.test_model(filtered, test)
    print("Accuracy: %0.0f%%" % evaluation.percent_correct())

    # Threshold-curve data is generated for class label index 0.
    curve_data = plc.generate_thresholdcurve_data(evaluation, 0)
    print("AUC: %0.3f" % plc.get_auc(curve_data))
    print(evaluation.to_matrix("Matrix:"))

jvm.stop()
Ejemplo n.º 2
0
    sys.stdout.write("Repetitions=" + str(repetition))
    # Start this repetition count's curve with a zero accumulator for every
    # sample percentage and register it under the repetition count.
    curve = dict.fromkeys(percentages, 0)
    curves[repetition] = curve
    # Run one 10-fold cross-validation per seed (seeds 1..repetition) and
    # accumulate the mean percentage correct for each sample percentage.
    # `range` is used instead of `xrange` so the loop also runs on Python 3,
    # where `xrange` no longer exists.
    for seed in range(1, repetition + 1):
        sys.stdout.write(".")
        for percentage in percentages:
            # Fresh classifier and filter objects for every evaluation.
            cls = Classifier(classname="weka.classifiers.trees.J48")
            flt = Filter(classname="weka.filters.unsupervised.instance.Resample",
                         options=["-Z", str(percentage), "-no-replacement"])
            fc = FilteredClassifier()
            fc.set_classifier(cls)
            fc.set_filter(flt)
            evl = Evaluation(data)
            evl.crossvalidate_model(fc, data, 10, Random(seed))
            # Dividing each run's score by the number of runs makes the sum
            # over all seeds the average percentage correct.
            curve[percentage] += (evl.percent_correct() / repetition)
    # progress info
    sys.stdout.write("\n")

# Without matplotlib, fall back to printing the baseline and one list of
# averaged accuracies (ordered like `percentages`) per repetition count.
if not plot.matplotlib_available:
    print("ZeroR: " + str(baseline))
    for repetition in repetitions:
        repetition_curve = curves[repetition]
        y = [repetition_curve[percentage] for percentage in percentages]
        print("Repetitions = " + str(repetition) + ":\n" + str(y))