def J48(data, rnm):
    """Cross-validate and train a J48 tree, writing text reports and a ROC plot.

    The classifier is wrapped in a FilteredClassifier whose Remove filter is
    configured with ``-R first``. The model is evaluated with 10-fold
    cross-validation (seeded with ``Random(1)``) and then rebuilt on the full
    dataset so that the final tree can be written out.

    Files written (all prefixed with ``rnm``):
        * ``<rnm>_J48_Tree.txt``       - string form of the built classifier
        * ``<rnm>_J48_Prediction.txt`` - buffered per-instance prediction output
        * ``<rnm>_J48_Evaluation.txt`` - evaluation summary and class details
        * ``<rnm>_J48_ROC.png``        - ROC plot for class indices 0 and 1

    Args:
        data: dataset object; its last attribute is marked as the class
            (presumably a python-weka-wrapper ``Instances`` - confirm with caller).
        rnm: string used as the output file prefix and as the ROC plot title.

    Returns:
        ``str(evl.percent_correct)`` from the cross-validation evaluation.
    """
    data.class_is_last()

    fc = FilteredClassifier()
    fc.classifier = Classifier(classname="weka.classifiers.trees.J48", options=["-C", "0.25", "-M", "2"])
    fc.filter = Filter(classname="weka.filters.unsupervised.attribute.Remove", options=["-R", "first"])
    pred_output = PredictionOutput(classname="weka.classifiers.evaluation.output.prediction.CSV", options=["-p", "1"])

    folds = 10
    evl = Evaluation(data)
    evl.crossvalidate_model(fc, data, folds, Random(1), pred_output)

    # Cross-validation works on copies of the classifier, so build it on the
    # full dataset before printing the tree.
    fc.build_classifier(data)

    # `with` guarantees each file is closed even if a write or a Weka call
    # raises. The explicit write() calls reproduce the exact bytes the former
    # Python 2 `print >> f` statements emitted (including the double space
    # after "Filename:" in the tree file) and also work on Python 3.
    with open(rnm + '_J48_Tree.txt', 'w') as tree_file:
        tree_file.write("Filename:  %s\n" % (rnm,))
        tree_file.write('\n\n' + '\n')
        tree_file.write(str(fc) + '\n')

    with open(rnm + '_J48_Prediction.txt', 'w') as prediction_file:
        prediction_file.write('Filename: %s\n' % (rnm,))
        prediction_file.write('Prediction Summary: %s\n' % (pred_output.buffer_content(),))

    with open(rnm + '_J48_Evaluation.txt', 'w') as evaluation_file:
        evaluation_file.write('Filename: %s\n' % (rnm,))
        evaluation_file.write('Evaluation Summary: %s\n' % (evl.summary(),))
        evaluation_file.write('\n\n\n' + '\n')
        evaluation_file.write('%s\n' % (evl.class_details(),))

    plot_roc(evl, class_index=[0, 1], title=rnm, key_loc='best', outfile=rnm + '_J48_ROC.png', wait=False)

    value_J48 = str(evl.percent_correct)
    return value_J48
Ejemplo n.º 2
0
    # NOTE(review): this is the indented body of a loop whose header is outside
    # this excerpt; `group`, `data_dir`, `os`, `Loader`, `jvm` and the Weka
    # wrapper classes are presumably defined/imported above - confirm.
    print(group)
    # Build the three ARFF paths for this group from its name and a suffix.
    train = data_dir + os.sep + group + "_Cal.arff"
    test = data_dir + os.sep + group + "_Test.arff"
    pred = data_dir + os.sep + group + "_Val.arff"

    # Load each file and use the attribute named "reference value" as the class.
    loader = Loader(classname="weka.core.converters.ArffLoader")
    print(train)
    train_data = loader.load_file(train)
    train_data.class_index = train_data.attribute_by_name(
        "reference value").index
    print(test)
    test_data = loader.load_file(test)
    test_data.class_index = test_data.attribute_by_name(
        "reference value").index
    print(pred)
    pred_data = loader.load_file(pred)
    # NOTE(review): pred_data is loaded but not used in the visible code.
    pred_data.class_index = pred_data.attribute_by_name(
        "reference value").index

    # LinearRegression wrapped in a FilteredClassifier whose Remove filter is
    # given "-R first"; trained on train_data and evaluated on test_data.
    cls = FilteredClassifier()
    cls.classifier = Classifier(
        classname="weka.classifiers.functions.LinearRegression",
        options=["-S", "1", "-C"])
    cls.filter = Filter(classname="weka.filters.unsupervised.attribute.Remove",
                        options=["-R", "first"])
    cls.build_classifier(train_data)
    evl = Evaluation(train_data)
    evl.test_model(cls, test_data)
    print(evl.summary())

# Shut the JVM down once the loop has finished.
jvm.stop()
Ejemplo n.º 3
0
# Load the ReutersGrain test split and mark its last attribute as the class.
fname = os.sep.join([data_dir, "ReutersGrain-test.arff"])
print("\nLoading dataset: " + fname + "\n")
loader = Loader(classname="weka.core.converters.ArffLoader")
test = loader.load_file(fname)
last_attribute = test.num_attributes() - 1
test.set_class_index(last_attribute)

# Each entry pairs a classifier class name with the option list that is
# handed to the StringToWordVector filter wrapped around it.
setups = (
    ("weka.classifiers.trees.J48", []),
    ("weka.classifiers.bayes.NaiveBayes", []),
    ("weka.classifiers.bayes.NaiveBayesMultinomial", []),
    ("weka.classifiers.bayes.NaiveBayesMultinomial", ["-C"]),
    ("weka.classifiers.bayes.NaiveBayesMultinomial", ["-C", "-L", "-S"])
)

# Train every setup on `data` (defined outside this excerpt) and evaluate it
# on the test split: accuracy, AUC for class label index 0, confusion matrix.
for classifier, opt in setups:
    banner = "\n--> %s (filter options: %s)\n" % (classifier, " ".join(opt))
    print(banner)

    cls = FilteredClassifier()
    base_classifier = Classifier(classname=classifier)
    cls.set_classifier(base_classifier)
    word_vector_filter = Filter(
        classname="weka.filters.unsupervised.attribute.StringToWordVector",
        options=opt)
    cls.set_filter(word_vector_filter)
    cls.build_classifier(data)

    evl = Evaluation(test)
    evl.test_model(cls, test)

    accuracy = evl.percent_correct()
    print("Accuracy: %0.0f%%" % accuracy)
    tcdata = plc.generate_thresholdcurve_data(evl, 0)
    auc = plc.get_auc(tcdata)
    print("AUC: %0.3f" % auc)
    print(evl.to_matrix("Matrix:"))

jvm.stop()
Ejemplo n.º 4
0
def _load_with_class(arff_loader, path):
    """Print ``path``, load it, and use "reference value" as the class attribute."""
    print(path)
    dataset = arff_loader.load_file(path)
    dataset.class_index = dataset.attribute_by_name("reference value").index
    return dataset


print("Train/test/predict...")

# Trim this list to a single entry to process one dataset only.
groups = ["DataSet1", "DataSet2"]

for group in groups:
    print(group)
    prefix = data_dir + os.sep + group
    train = prefix + "_Cal.arff"
    test = prefix + "_Test.arff"
    pred = prefix + "_Val.arff"

    # Calibration, test and validation splits are loaded in that order.
    loader = Loader(classname="weka.core.converters.ArffLoader")
    train_data = _load_with_class(loader, train)
    test_data = _load_with_class(loader, test)
    pred_data = _load_with_class(loader, pred)

    # Linear regression behind a Remove filter configured with "-R first".
    cls = FilteredClassifier()
    regression = Classifier(
        classname="weka.classifiers.functions.LinearRegression",
        options=["-S", "1", "-C"])
    cls.classifier = regression
    remove_filter = Filter(
        classname="weka.filters.unsupervised.attribute.Remove",
        options=["-R", "first"])
    cls.filter = remove_filter

    # Fit on the calibration split, then report performance on the test split.
    cls.build_classifier(train_data)
    evl = Evaluation(train_data)
    evl.test_model(cls, test_data)
    summary = evl.summary()
    print(summary)

jvm.stop()
Ejemplo n.º 5
0
# Start the JVM with the system classpath and Weka packages enabled.
jvm.start(system_cp=True, packages=True, max_heap_size="512m")

# ---------------------------------------------------------------------------
# Training: NaiveBayesMultinomial behind a StringToWordVector filter.
# ---------------------------------------------------------------------------

loader = Loader(
    classname="weka.core.converters.ArffLoader",
    options=["-charset", "UTF-8"],
)
# The training file is resolved relative to the directory of this script.
script_dir = os.path.dirname(os.path.realpath(__file__))
train_data = loader.load_file(script_dir + "/datasets/train.arff")
train_data.class_is_last()

string_to_word_vector_filter = Filter(
    classname="weka.filters.unsupervised.attribute.StringToWordVector",
)
cls = Classifier(
    classname="weka.classifiers.bayes.NaiveBayesMultinomial",
)

fc = FilteredClassifier()
fc.filter = string_to_word_vector_filter
fc.classifier = cls
fc.build_classifier(train_data)

# ---------------------------------------------------------------------------
# Test data: a one-row dataset with a string "title" and a nominal "class".
# ---------------------------------------------------------------------------

class_att = Attribute.create_nominal("class", ["good", "neutral", "bad"])
str_att = Attribute.create_string("title")

test_dataset = Instances.create_instances(
    name="test_news_set", atts=[str_att, class_att], capacity=1)

# Both values start out missing; the title string is filled in afterwards.
placeholder_values = [Instance.missing_value() for _ in range(2)]
inst = Instance.create_instance(placeholder_values)
test_dataset.add_instance(inst)

first_row = test_dataset.get_instance(0)
first_row.set_string_value(0, article['processed']['title'])
Ejemplo n.º 6
0
# Load the ReutersGrain test split; the class is the last attribute.
fname = os.sep.join([data_dir, "ReutersGrain-test.arff"])
print("\nLoading dataset: " + fname + "\n")
loader = Loader(classname="weka.core.converters.ArffLoader")
test = loader.load_file(fname)
test.class_is_last()

# (classifier class name, StringToWordVector filter options) pairs.
setups = (
    ("weka.classifiers.trees.J48", []),
    ("weka.classifiers.bayes.NaiveBayes", []),
    ("weka.classifiers.bayes.NaiveBayesMultinomial", []),
    ("weka.classifiers.bayes.NaiveBayesMultinomial", ["-C"]),
    ("weka.classifiers.bayes.NaiveBayesMultinomial", ["-C", "-L", "-stopwords-handler", "weka.core.stopwords.Rainbow"])
)

# Build each filtered classifier on `data` (defined outside this excerpt) and
# score it against the test split.
for classifier, opt in setups:
    joined_options = " ".join(opt)
    print("\n--> %s (filter options: %s)\n" % (classifier, joined_options))

    cls = FilteredClassifier()
    cls.classifier = Classifier(classname=classifier)
    cls.filter = Filter(
        classname="weka.filters.unsupervised.attribute.StringToWordVector",
        options=opt)
    cls.build_classifier(data)

    evl = Evaluation(test)
    evl.test_model(cls, test)

    # Report accuracy, AUC for class label index 0, and the confusion matrix.
    print("Accuracy: %0.0f%%" % evl.percent_correct)
    tcdata = plc.generate_thresholdcurve_data(evl, 0)
    area_under_curve = plc.get_auc(tcdata)
    print("AUC: %0.3f" % area_under_curve)
    confusion = evl.matrix("Matrix:")
    print(confusion)

jvm.stop()
Ejemplo n.º 7
0
def _report(evaluation, error_template):
    """Print error rate, confusion matrix, ROC plot, summary and class details.

    ``error_template`` is a %-format string with a single float placeholder
    that receives ``evaluation.percent_incorrect``.
    """
    print(error_template % evaluation.percent_incorrect)
    print(evaluation.matrix())  # confusion matrix
    # wait=True is passed through unchanged from the original calls.
    plcls.plot_roc(evaluation, class_index=[0, 1], wait=True)
    print(evaluation.summary())
    print(evaluation.class_details())


# 5-fold cross-validation on the training data, seeded with Random(1).
evl.crossvalidate_model(fc, train, 5, Random(1))
_report(evl, "LMT (SMOTE balanced classes) CV = 5 Error: %.2f%%")

# Build the classifier on the training data, then evaluate it on the test set.
fc.build_classifier(train)
tevl = Evaluation(test)
tevl.test_model(fc, test)
_report(tevl, "LMT (SMOTE balanced classes) Test Error: %.2f%%")