Example #1
0
    def test_model(self, test_data, empty_solution, evaluate = False):
        model_weka = None
        if os.path.isfile(self.prediction_file):
            print 'Model ' + self.name + ' already tested.'
        elif not os.path.isfile(self.model_file):
            print 'Impossible testing this model. It should be trained first.'
            return
        else: 
            print 'Starting to test_model model ' + self.name + '.'
            model_weka = Classifier(jobject = serialization.read(self.model_file)) 
            evaluation = Evaluation(data = test_data)
            evaluation.test_model(classifier = model_weka, data = test_data)
            
            predictions = evaluation.predictions()
            rows        = read_sheet(file_name = empty_solution)
            solutions   = []

            for row in rows:
                solution = [row['userid'], row['tweetid'], predictions.pop(0).predicted()]
                solutions.append(solution)
            write_the_solution_file(solutions, self.prediction_file)
            print 'Model ' + self.name + ' tested.'
        
        if evaluate == True:
            if os.path.isfile(self.evaluation_file):
                print 'Model ' + self.name + ' already evaluated.'
                return
            elif model_weka == None:
                model_weka = Classifier(jobject = serialization.read(self.model_file)) 
                evaluation = Evaluation(data = test_data)
                evaluation.test_model(classifier = model_weka, data = test_data)
            save_file(file_name = self.evaluation_file, content = evaluation.to_summary())
            print 'Model ' + self.name + ' evaluated.'
Example #2
0
# NOTE(review): this chunk starts mid-script -- `data_dir` is assigned
# earlier in the original file (from an environment variable); only the
# "./data" fallback is visible here.
if data_dir is None:
  data_dir = "." + os.sep + "data"

import os
import weka.core.jvm as jvm
from weka.core.converters import Loader
from weka.core.classes import Random
from weka.classifiers import Classifier, Evaluation
from weka.filters import Filter

# Start the JVM that backs the python-weka-wrapper bridge.
jvm.start()

# load weather.nominal
loader = Loader(classname="weka.core.converters.ArffLoader")
fname = data_dir + os.sep + "weather.nominal.arff"
print("\nLoading dataset: " + fname + "\n")
data = loader.load_file(fname)
# Use the last attribute as the class attribute.
data.set_class_index(data.num_attributes() - 1)

# perform 10-fold cross-validation
cls = Classifier(classname="weka.classifiers.rules.OneR")
evl = Evaluation(data)
evl.crossvalidate_model(cls, data, 10, Random(1))
print("10-fold cross-validation:\n" + evl.to_summary())

# build model on full dataset and output it
cls.build_classifier(data)
print("Model:\n\n" + str(cls))

# Shut down the JVM once all weka work is done.
jvm.stop()
Example #3
0
# Use the WEKAMOOC_DATA environment variable to set the location
# for the datasets
import os
import weka.core.jvm as jvm
from weka.core.converters import Loader
from weka.core.classes import Random
from weka.classifiers import Classifier, Evaluation

# Default to ./data when the environment variable is unset.
# (The duplicate `import os` that followed this block has been removed.)
data_dir = os.environ.get("WEKAMOOC_DATA")
if data_dir is None:
  data_dir = "." + os.sep + "data"

# Start the JVM that backs the python-weka-wrapper bridge.
jvm.start()

# load glass
loader = Loader(classname="weka.core.converters.ArffLoader")
fname = data_dir + os.sep + "glass.arff"
print("\nLoading dataset: " + fname + "\n")
data = loader.load_file(fname)
# Use the last attribute as the class attribute.
data.set_class_index(data.num_attributes() - 1)

# Cross-validate IBk for several neighbourhood sizes.
for k in [1, 5, 20]:
    # cross-validate IBk, display model
    cls = Classifier(classname="weka.classifiers.lazy.IBk", options=["-K", str(k)])
    evl = Evaluation(data)
    evl.crossvalidate_model(cls, data, 10, Random(1))
    print("10-fold cross-validation (k=" + str(k) + "):\n" + evl.to_summary())

jvm.stop()
Example #4
0
# NOTE(review): this chunk starts mid-script -- `data`, `data_dir`, `plg`,
# `FilteredClassifier`, `PredictionOutput`, `os` and the weka imports are
# defined in an earlier, not-shown part of the original file.
data.set_class_index(data.num_attributes() - 1)

# 1a filter data
print("Filtering data...")
# NOTE(review): classname is passed positionally here, unlike the
# keyword form (classname=...) used everywhere else -- verify the
# wrapper accepts the positional form.
fltr = Filter("weka.filters.unsupervised.attribute.StringToWordVector")
fltr.set_inputformat(data)
filtered = fltr.filter(data)
filtered.set_class_index(0)

# 1b build classifier
print("Building/evaluating classifier...")
cls = Classifier(classname="weka.classifiers.trees.J48")
cls.build_classifier(filtered)
# Evaluate on the training data itself (no hold-out here).
evl = Evaluation(filtered)
evl.test_model(cls, filtered)
print(evl.to_summary())
print(str(cls))
# Render the tree; `plg` is presumably a plot/graph helper -- confirm.
plg.plot_dot_graph(cls.graph())

# 2. filtered classifier
fname = data_dir + os.sep + "simpletext-test.arff"
print("\nLoading dataset: " + fname + "\n")
loader = Loader(classname="weka.core.converters.ArffLoader")
test = loader.load_file(fname)
test.set_class_index(test.num_attributes() - 1)
print("Building/evaluating filtered classifier...")
cls = FilteredClassifier()
cls.set_classifier(Classifier(classname="weka.classifiers.trees.J48"))
cls.set_filter(Filter(classname="weka.filters.unsupervised.attribute.StringToWordVector"))
# The FilteredClassifier carries its own filter, so it is trained on the
# raw (unfiltered) data here.
cls.build_classifier(data)
# NOTE(review): `pout` is unused within this chunk -- presumably consumed
# by code past the end of this view.
pout = PredictionOutput(classname="weka.classifiers.evaluation.output.prediction.PlainText")
Example #5
0
# NOTE(review): chunk starts mid-script -- `data_dir`, `numpy`, the weka
# imports, and the Python 2 context (`xrange` below) come from the
# not-shown head of the original file.
jvm.start()

# load diabetes
loader = Loader(classname="weka.core.converters.ArffLoader")
fname = data_dir + os.sep + "diabetes.arff"
print("\nLoading dataset: " + fname + "\n")
data = loader.load_file(fname)
# Use the last attribute as the class attribute.
data.set_class_index(data.num_attributes() - 1)

# Compare three classifiers on a single 90/10 train/test split.
for classifier in ["weka.classifiers.bayes.NaiveBayes", "weka.classifiers.rules.ZeroR", "weka.classifiers.trees.J48"]:
    # train/test split 90% using classifier
    cls = Classifier(classname=classifier)
    evl = Evaluation(data)
    evl.evaluate_train_test_split(cls, data, 90.0, Random(1))
    print("\n" + classifier + " train/test split (90%):\n" + evl.to_summary())
    cls.build_classifier(data)
    print(classifier + " model:\n\n" + str(cls))

# calculate mean/stdev over 10 cross-validations
for classifier in [
    "weka.classifiers.meta.ClassificationViaRegression", "weka.classifiers.bayes.NaiveBayes",
        "weka.classifiers.rules.ZeroR", "weka.classifiers.trees.J48", "weka.classifiers.functions.Logistic"]:
    accuracy = []
    # Repeat 10-fold CV with seeds 1..10 to estimate accuracy variance.
    for i in xrange(1,11):
        cls = Classifier(classname=classifier)
        evl = Evaluation(data)
        evl.crossvalidate_model(cls, data, 10, Random(i))
        accuracy.append(evl.percent_correct())
    nacc = numpy.array(accuracy)
    print("%s: %0.2f +/-%0.2f" % (classifier, numpy.mean(nacc), numpy.std(nacc)))
Example #6
0
# NOTE(review): chunk starts mid-imports -- `os`, `jvm`, `Loader` and
# `data_dir` are brought into scope by the not-shown head of the file.
from weka.core.classes import Random
from weka.classifiers import Classifier, Evaluation

jvm.start()

# Compare default (pruned) vs. unpruned J48 on two datasets.
for dataset in ["diabetes.arff", "breast-cancer.arff"]:
    # load dataset
    loader = Loader(classname="weka.core.converters.ArffLoader")
    fname = data_dir + os.sep + dataset
    print("\nLoading dataset: " + fname + "\n")
    data = loader.load_file(fname)
    # Use the last attribute as the class attribute.
    data.set_class_index(data.num_attributes() - 1)

    # cross-validate default J48, display model
    cls = Classifier(classname="weka.classifiers.trees.J48")
    evl = Evaluation(data)
    evl.crossvalidate_model(cls, data, 10, Random(1))
    print("10-fold cross-validation (default):\n" + evl.to_summary())
    cls.build_classifier(data)
    print("Model (default):\n\n" + str(cls))

    # cross-validate unpruned J48 ("-U" switch), display model
    cls = Classifier(classname="weka.classifiers.trees.J48", options=["-U"])
    evl = Evaluation(data)
    evl.crossvalidate_model(cls, data, 10, Random(1))
    print("10-fold cross-validation (unpruned):\n" + evl.to_summary())
    cls.build_classifier(data)
    print("Model (unpruned):\n\n" + str(cls))

jvm.stop()
Example #7
0
# NOTE(review): chunk starts mid-script -- `data`, `Filter`, `Classifier`,
# `Evaluation`, `Random` and `jvm` come from the not-shown head of the file.
# Convert the last attribute from string to nominal ("-R", "last").
wfilter = Filter(classname="weka.filters.unsupervised.attribute.StringToNominal", options=["-R", "last"])
wfilter.set_inputformat(data)
data = wfilter.filter(data)

# convert content to string
# ("-C", "first" targets the first attribute)
wfilter = Filter(classname="weka.filters.unsupervised.attribute.NominalToString", options=["-C", "first"])
wfilter.set_inputformat(data)
data = wfilter.filter(data)

# set class attribute
data.set_class_index(data.num_attributes() - 1)

# generate baseline
# ZeroR serves as the no-information baseline to beat below.
zeror = Classifier(classname="weka.classifiers.rules.ZeroR")
evaluation = Evaluation(data)
evaluation.crossvalidate_model(zeror, data, 10, Random(1))
print("\nBaseline:\n" + evaluation.to_summary())

# perform text mining
j48 = Classifier(classname="weka.classifiers.trees.J48")
# Turn attribute 1 into word-vector attributes ("-P att-" presumably
# sets the generated attributes' name prefix -- confirm).
stwv = Filter(
    classname="weka.filters.unsupervised.attribute.StringToWordVector",
    options=["-R", "1", "-P", "att-"])
stwv.set_inputformat(data)
data = stwv.filter(data)
evaluation = Evaluation(data)
evaluation.crossvalidate_model(j48, data, 10, Random(1))
print("\nJ48:\n" + evaluation.to_summary())

# stop JVM
jvm.stop()
Example #8
0
# NOTE(review): chunk relies on `Loader`, `Classifier`, `Evaluation`,
# `Random` and `jvm` imported in the not-shown head of the file.
# load a dataset
iris_file = "HairEyeColor.csv"
print("Loading dataset: " + iris_file)
loader = Loader(classname="weka.core.converters.CSVLoader")
iris_data = loader.load_file(iris_file)
# BUG FIX: num_attributes is a method (called with parentheses on the next
# line); the original printed the bound-method object, not the count.
print(iris_data.num_attributes())
iris_data.set_class_index(iris_data.num_attributes() - 1)

# build a classifier and output model
print("Training J48 classifier on iris")
# NOTE(review): "weka.test.Regression" looks like a regression-test stub;
# the commented-out line below suggests J48 was the intended classifier.
classifier = Classifier(classname="weka.test.Regression")
#classifier = Classifier(classname="weka.classifiers.trees.J48", options=["-C", "0.5"])
# Instead of using 'options=["-C", "0.3"]' in the constructor, we can also set the "confidenceFactor"
# property of the J48 classifier itself. However, being of type float rather than double, we need
# to convert it to the correct type first using the double_to_float function:
#classifier.set_property("confidenceFactor", types.double_to_float(0.3))
classifier.build_classifier(iris_data)
print(classifier)
print(classifier.graph())
#plot_graph.plot_dot_graph(classifier.graph())

evaluation = Evaluation(iris_data)                     # initialize with priors
evaluation.crossvalidate_model(classifier, iris_data, 10, Random(42))  # 10-fold CV
print(evaluation.to_summary())

print("pctCorrect: " + str(evaluation.percent_correct()))
print("incorrect: " + str(evaluation.incorrect()))
jvm.stop()
Example #9
0
# NOTE(review): chunk starts mid-script -- `wfilter`, `data`, `Filter`,
# `Classifier`, `Evaluation`, `Random` and `jvm` are defined in the
# not-shown earlier part of the file.
data = wfilter.filter(data)

# convert content to string
# ("-C", "first" targets the first attribute)
wfilter = Filter(
    classname="weka.filters.unsupervised.attribute.NominalToString",
    options=["-C", "first"])
wfilter.set_inputformat(data)
data = wfilter.filter(data)

# set class attribute
data.set_class_index(data.num_attributes() - 1)

# generate baseline
# ZeroR serves as the no-information baseline to beat below.
zeror = Classifier(classname="weka.classifiers.rules.ZeroR")
evaluation = Evaluation(data)
evaluation.crossvalidate_model(zeror, data, 10, Random(1))
print("\nBaseline:\n" + evaluation.to_summary())

# perform text mining
j48 = Classifier(classname="weka.classifiers.trees.J48")
# Turn attribute 1 into word-vector attributes ("-P att-" presumably
# sets the generated attributes' name prefix -- confirm).
stwv = Filter(
    classname="weka.filters.unsupervised.attribute.StringToWordVector",
    options=["-R", "1", "-P", "att-"])
stwv.set_inputformat(data)
data = stwv.filter(data)
evaluation = Evaluation(data)
evaluation.crossvalidate_model(j48, data, 10, Random(1))
print("\nJ48:\n" + evaluation.to_summary())

# stop JVM
jvm.stop()
Example #10
0
# NOTE(review): chunk starts mid-imports -- `os`, `jvm`, `Loader`, `Random`
# and `data_dir` come from the not-shown head of the file; the matching
# jvm.stop() also falls outside this view.
from weka.classifiers import Classifier, Evaluation

jvm.start()

# load weather.nominal
loader = Loader(classname="weka.core.converters.ArffLoader")
fname = data_dir + os.sep + "weather.nominal.arff"
print("\nLoading dataset: " + fname + "\n")
data = loader.load_file(fname)
# Use the last attribute as the class attribute.
data.set_class_index(data.num_attributes() - 1)

# perform 10-fold cross-validation
cls = Classifier(classname="weka.classifiers.rules.OneR")
evl = Evaluation(data)
evl.crossvalidate_model(cls, data, 10, Random(1))
print("10-fold cross-validation (full):\n" + evl.to_summary())
cls.build_classifier(data)
print("Model:\n\n" + str(cls))

# remove attribute "outlook"
print("Removing attribute 'outlook'")
# Deletion is by index, looked up from the attribute name.
data.delete_attribute(data.get_attribute_by_name("outlook").get_index())

# perform 10-fold cross-validation (reduced dataset)
# Re-run OneR to show the effect of dropping the attribute.
cls = Classifier(classname="weka.classifiers.rules.OneR")
evl = Evaluation(data)
evl.crossvalidate_model(cls, data, 10, Random(1))
print("10-fold cross-validation (without 'outlook'):\n" + evl.to_summary())
cls.build_classifier(data)
print("Model:\n\n" + str(cls))