def Bag_J48(data, rnm):
    """Evaluate bagged J48 on `data` via 10-fold CV and write report files.

    Builds a Bagging meta-classifier wrapping a FilteredClassifier
    (Remove-first-attribute + J48), cross-validates it, writes the model
    text, the per-instance predictions and the evaluation summary to files
    prefixed with `rnm`, and saves a ROC plot.

    :param data: weka Instances; the last attribute is used as the class
    :param rnm: filename prefix for every generated report file
    :return: percentage of correctly classified instances, as a string
    """
    data.class_is_last()
    # J48 behind a filter that drops the first attribute (e.g. an ID column).
    fc1 = FilteredClassifier()
    fc1.classifier = Classifier(classname="weka.classifiers.trees.J48", options=["-C", "0.25", "-M", "2"])
    fc1.filter = Filter(classname="weka.filters.unsupervised.attribute.Remove", options=["-R", "first"])
    # Bagging (10 iterations, 100% bag size, seed 1) on top of the filtered J48.
    fc2 = SingleClassifierEnhancer(classname="weka.classifiers.meta.Bagging", options=["-P", "100", "-S", "1", "-I", "10"])
    fc2.classifier = fc1
    pred_output = PredictionOutput(classname="weka.classifiers.evaluation.output.prediction.CSV", options=["-p", "1"])
    folds = 10
    fc2.build_classifier(data)
    evaluation = Evaluation(data)
    evaluation.crossvalidate_model(fc2, data, folds, Random(1), pred_output)
    # Model description.  (Python-3 print(file=...) replaces the former
    # Python-2 "print >> f" statements; `with` guarantees the files close
    # even if a weka call raises.)
    with open(rnm + '_Bag_J48_Tree.txt', 'w') as f0:
        print("Filename: ", rnm, file=f0)
        print('\n\n', file=f0)
        print(str(fc2), file=f0)
    # Per-instance predictions collected during cross-validation.
    with open(rnm + '_Bag_J48_Prediction.txt', 'w') as f1:
        print('Filename:', rnm, file=f1)
        print('Prediction Summary:', pred_output.buffer_content(), file=f1)
    # Evaluation summary and per-class details.  (Filename casing fixed from
    # '_Bag_j48_' to '_Bag_J48_' to match the other output files.)
    with open(rnm + '_Bag_J48_Evaluation.txt', 'w') as f2:
        print('Filename:', rnm, file=f2)
        print('Evaluation Summary:', evaluation.summary(), file=f2)
        print('\n\n\n', file=f2)
        print(evaluation.class_details(), file=f2)
    plot_roc(evaluation, class_index=[0, 1], title=rnm, key_loc='best', outfile=rnm + '_Bag_J48_ROC.png', wait=False)
    return str(evaluation.percent_correct)
def RandomForest(data, rnm):
    """Evaluate a random forest on `data` via 10-fold CV and write report files.

    Builds a FilteredClassifier (Remove-first-attribute + RandomForest with
    100 trees), cross-validates it, writes the model text, the per-instance
    predictions and the evaluation summary to files prefixed with `rnm`, and
    saves a ROC plot.

    :param data: weka Instances; the last attribute is used as the class
    :param rnm: filename prefix for every generated report file
    :return: percentage of correctly classified instances, as a string
    """
    data.class_is_last()
    fc = FilteredClassifier()
    fc.classifier = Classifier(classname="weka.classifiers.trees.RandomForest", options=["-I", "100", "-K", "0", "-S", "1"])
    fc.filter = Filter(classname="weka.filters.unsupervised.attribute.Remove", options=["-R", "first"])
    pred_output = PredictionOutput(classname="weka.classifiers.evaluation.output.prediction.CSV", options=["-p", "1"])
    folds = 10
    evl = Evaluation(data)
    evl.crossvalidate_model(fc, data, folds, Random(1), pred_output)
    fc.build_classifier(data)
    # Model description.  (Python-3 print(file=...) replaces the former
    # Python-2 "print >> f" statements; `with` guarantees the files close
    # even if a weka call raises.)
    with open(rnm + '_RF_Tree.txt', 'w') as f0:
        print("Filename: ", rnm, file=f0)
        print('\n\n', file=f0)
        print(str(fc), file=f0)
    # Per-instance predictions collected during cross-validation.
    with open(rnm + '_RF_Prediction.txt', 'w') as f1:
        print('Filename:', rnm, file=f1)
        print('Prediction Summary:', pred_output.buffer_content(), file=f1)
    # Evaluation summary and per-class details.
    with open(rnm + '_RF_Evaluation.txt', 'w') as f2:
        print('Filename:', rnm, file=f2)
        print('Evaluation Summary:', evl.summary(), file=f2)
        print('\n\n\n', file=f2)
        print(evl.class_details(), file=f2)
    plot_roc(evl, class_index=[0, 1], title=rnm, key_loc='best', outfile=rnm + '_RF_ROC.png', wait=False)
    return str(evl.percent_correct)
Ejemplo n.º 3
0
def fitness(toeval : Individual):
    """Score an individual's MLP settings by 10-fold CV accuracy.

    Uses the module-level `remove` filter and `data` instances; the
    individual's `settings()` supply the MultilayerPerceptron options.

    :param toeval: the Individual whose hyper-parameters are evaluated
    :return: percentage of correctly classified instances
    """
    pipeline = FilteredClassifier()
    pipeline.filter = remove
    pipeline.classifier = Classifier(
        classname="weka.classifiers.functions.MultilayerPerceptron",
        options=toeval.settings())
    scorer = Evaluation(data)
    scorer.crossvalidate_model(pipeline, data, 10, Random(1))
    return scorer.percent_correct
Ejemplo n.º 4
0
    # NOTE(review): fragment of a larger loop body — `group`, `data_dir`,
    # `os`, `Loader`, `FilteredClassifier`, `Classifier`, `Filter`,
    # `Evaluation` and `jvm` are defined in earlier, unseen lines.
    print(group)
    # Calibration (train), test and validation ARFF paths for this group.
    train = data_dir + os.sep + group + "_Cal.arff"
    test = data_dir + os.sep + group + "_Test.arff"
    pred = data_dir + os.sep + group + "_Val.arff"

    loader = Loader(classname="weka.core.converters.ArffLoader")
    print(train)
    train_data = loader.load_file(train)
    # The class attribute is the column named "reference value" in each set.
    train_data.class_index = train_data.attribute_by_name(
        "reference value").index
    print(test)
    test_data = loader.load_file(test)
    test_data.class_index = test_data.attribute_by_name(
        "reference value").index
    print(pred)
    pred_data = loader.load_file(pred)
    # NOTE(review): pred_data is prepared but never used in the visible
    # lines — presumably consumed later in the original example; verify.
    pred_data.class_index = pred_data.attribute_by_name(
        "reference value").index

    # LinearRegression behind a filter that drops the first attribute.
    cls = FilteredClassifier()
    cls.classifier = Classifier(
        classname="weka.classifiers.functions.LinearRegression",
        options=["-S", "1", "-C"])
    cls.filter = Filter(classname="weka.filters.unsupervised.attribute.Remove",
                        options=["-R", "first"])
    cls.build_classifier(train_data)
    # Evaluate the model trained on train_data against the held-out test set.
    evl = Evaluation(train_data)
    evl.test_model(cls, test_data)
    print(evl.summary())

jvm.stop()
Ejemplo n.º 5
0
def main():
    """
    Just runs some example code.

    For every selected feature-vector ARFF file, cross-validates a battery
    of WEKA classifiers (10-fold) and writes each evaluation summary to a
    plain-text file and to an .xlsx spreadsheet.
    """
    #case=["selected_chi100","selected_chi150","selected_chi200","selected_chi250","selected_chi300","selected_chi350","selected_fe100","selected_fe150","selected_fe200","selected_fe250","selected_fe300","selected_fe350","selected_sfm100","selected_sfm150","selected_sfm200","selected_sfm250","selected_sfm300","selected_sfm350","selected_sfmt100","selected_sfmt150","selected_sfmt200","selected_sfmt250","selected_sfmt300","selected_sfmt350","selected_cs100","selected_cs150","selected_cs200","selected_cs250","selected_cs300","selected_cs350"]
    #case=["feature_vector100","feature_vector110","feature_vector120","feature_vector130","feature_vector140","feature_vector150","feature_vector90","feature_vector80","feature_vector70","feature_vector60","feature_vector50","feature_vector40","feature_vector30","feature_vector20","feature_vector10","feature_vector5"]
    case = [
        "selected_chi100", "selected_chi150", "selected_chi200",
        "selected_chi250", "selected_chi300", "selected_chi350",
        "selected_fe100", "selected_fe150", "selected_fe200", "selected_fe250",
        "selected_fe300", "selected_fe350", "selected_sfm100",
        "selected_sfm150", "selected_sfm200", "selected_sfm250",
        "selected_sfm300", "selected_sfm350", "selected_sfmt100",
        "selected_sfmt150", "selected_sfmt200", "selected_sfmt250",
        "selected_sfmt300", "selected_sfmt350"
    ]
    #case=["selected_chi100","selected_chi150","selected_chi200","selected_fe100","selected_fe150","selected_fe200","selected_sfm100","selected_sfm150","selected_sfm200","selected_sfmt100","selected_sfmt150","selected_sfmt200","selected_cs100","selected_cs150","selected_cs200",]

    for nomefile in case:

        print(nomefile)
        feature_vector = "./selected_vectors30/" + nomefile + ".arff"  # ARFF input file

        loader = Loader("weka.core.converters.ArffLoader")
        data = loader.load_file(feature_vector)
        data.class_is_last()

        #print(data)

        # Spreadsheet header: one column per summary statistic.
        intest = [
            "Correlation coefficient", "Mean absolute error",
            "Root mean squared error", "Relative absolute error",
            "Root relative squared error", "Total Number of Instances"
        ]
        workbook = xlsxwriter.Workbook("./selected_vectors30/risultati/" +
                                       nomefile + ".xlsx")  # Excel results file
        worksheet = workbook.add_worksheet()

        for col_num, dati in enumerate(intest):
            worksheet.write(0, col_num + 1, dati)
        riga = 1

        # Classifiers to run (one spreadsheet row each).
        #alg=["meta.Bagging","meta.RandomSubSpace","rules.M5Rules","trees.M5P","trees.RandomForest"]
        alg = [
            "bayes.NaiveBayes", "bayes.NaiveBayesUpdateable",
            "functions.Logistic", "functions.SGD", "functions.SimpleLogistic",
            "functions.SMO", "functions.VotedPerceptron", "meta.AdaBoostM1",
            "meta.AttributeSelectedClassifier", "meta.Bagging",
            "meta.ClassificationViaRegression",
            "meta.IterativeClassifierOptimizer", "meta.LogitBoost",
            "meta.RandomCommittee", "meta.RandomSubSpace",
            "rules.DecisionTable", "rules.JRip", "rules.OneR",
            "trees.DecisionStump", "trees.J48", "trees.RandomForest",
            "trees.REPTree"
        ]

        for row_num, dati in enumerate(alg):
            worksheet.write(row_num + 1, 0, dati)

        # Plain-text results file.  "w" (was "w+") because it is only ever
        # written; the context manager closes it even if evaluation raises.
        with open("./selected_vectors30/risultati/" + nomefile + ".txt",
                  "w") as f:
            for i in alg:
                # Fresh (option-less, i.e. no-op) Remove filter and a fresh
                # classifier for every run.
                remove = Filter(
                    classname="weka.filters.unsupervised.attribute.Remove")
                cls = Classifier(classname="weka.classifiers." + i)
                fc = FilteredClassifier()
                fc.filter = remove
                fc.classifier = cls

                evl = Evaluation(data)
                evl.crossvalidate_model(fc, data, 10,
                                        Random(1))  # 10-fold cross-validation
                #evl.evaluate_train_test_split(fc,data,50,None,None) # 50% split cross validation

                k = evl.summary()

                # Full summary to the text file ...
                f.write(i + "\n")
                f.write(k + "\n")
                # ... and the last 10 characters of each summary line (the
                # numeric value column) to this classifier's spreadsheet row.
                my_list = k.split('\n')
                for col_num, dati in enumerate(my_list):
                    worksheet.write(riga, col_num, dati[-10:])
                print(i)
                riga += 1
        workbook.close()
Ejemplo n.º 6
0
# NOTE(review): top-level script chunk — `data_dir`, `os`, `Loader`,
# `FilteredClassifier`, `Classifier`, `Filter`, `Evaluation` and `jvm`
# come from earlier, unseen lines of this file.
print("Train/test/predict...")

groups = ["DataSet1", "DataSet2"]
# groups = ["DataSet2"]

for group in groups:
    print(group)
    # Calibration (train), test and validation ARFF paths for this group.
    train = data_dir + os.sep + group + "_Cal.arff"
    test = data_dir + os.sep + group + "_Test.arff"
    pred = data_dir + os.sep + group + "_Val.arff"

    loader = Loader(classname="weka.core.converters.ArffLoader")
    print(train)
    train_data = loader.load_file(train)
    # The class attribute is the column named "reference value" in each set.
    train_data.class_index = train_data.attribute_by_name("reference value").index
    print(test)
    test_data = loader.load_file(test)
    test_data.class_index = test_data.attribute_by_name("reference value").index
    print(pred)
    pred_data = loader.load_file(pred)
    # NOTE(review): pred_data is prepared but never used in the visible
    # lines — presumably consumed later in the original example; verify.
    pred_data.class_index = pred_data.attribute_by_name("reference value").index

    # LinearRegression behind a filter that drops the first attribute.
    cls = FilteredClassifier()
    cls.classifier = Classifier(classname="weka.classifiers.functions.LinearRegression", options=["-S", "1", "-C"])
    cls.filter = Filter(classname="weka.filters.unsupervised.attribute.Remove", options=["-R", "first"])
    cls.build_classifier(train_data)
    # Evaluate the model trained on train_data against the held-out test set.
    evl = Evaluation(train_data)
    evl.test_model(cls, test_data)
    print(evl.summary())

jvm.stop()
Ejemplo n.º 7
0
    curve = {}
    for percentage in percentages:
        curve[percentage] = 0
    curves[repetition] = curve
    # run and add up percentage correct from repetition
    for seed in xrange(repetition):
        seed += 1
        sys.stdout.write(".")
        for percentage in percentages:
            cls = Classifier(classname="weka.classifiers.trees.J48")
            flt = Filter(
                classname="weka.filters.unsupervised.instance.Resample",
                options=["-Z", str(percentage), "-no-replacement"])
            fc = FilteredClassifier()
            fc.classifier = cls
            fc.filter = flt
            evl = Evaluation(data)
            evl.crossvalidate_model(fc, data, 10, Random(seed))
            curve[percentage] += (evl.percent_correct / repetition)
    # progress info
    sys.stdout.write("\n")

# output the results
if not plot.matplotlib_available:
    print("ZeroR: " + str(baseline))
    for repetition in repetitions:
        y = []
        for percentage in percentages:
            y.append(curves[repetition][percentage])
        print("Repetitions = " + str(repetition) + ":\n" + str(y))
else:
Ejemplo n.º 8
0
# NOTE(review): script chunk — `db` and `article_id` come from earlier,
# unseen lines (apparently a MongoDB handle and a target document id).
articles_collection = db.articles
article = articles_collection.find_one({"_id": article_id})

# Start the JVM for the weka wrapper (system classpath + packages, 512 MB heap).
jvm.start(system_cp=True, packages=True, max_heap_size="512m")

# Train classifier

loader = Loader(classname="weka.core.converters.ArffLoader", options=["-charset", "UTF-8"])
train_data = loader.load_file(os.path.dirname(os.path.realpath(__file__)) + "/datasets/train.arff")
train_data.class_is_last()

# Text pipeline: StringToWordVector feeding multinomial naive Bayes.
string_to_word_vector_filter = Filter(classname="weka.filters.unsupervised.attribute.StringToWordVector")
cls = Classifier(classname="weka.classifiers.bayes.NaiveBayesMultinomial")

fc = FilteredClassifier()
fc.filter = string_to_word_vector_filter
fc.classifier = cls

fc.build_classifier(train_data)

# Create test data

# Single-instance dataset with a string "title" attribute and a nominal
# good/neutral/bad class, mirroring the training schema.
class_att = Attribute.create_nominal("class", ["good", "neutral", "bad"])
str_att = Attribute.create_string("title")

test_dataset = Instances.create_instances(
    name="test_news_set",
    atts=[str_att, class_att],
    capacity=1
)
Ejemplo n.º 9
0
# NOTE(review): script chunk — `data_dir`, `os`, `data` and `plc` come from
# earlier, unseen lines of this file.
fname = data_dir + os.sep + "ReutersGrain-test.arff"
print("\nLoading dataset: " + fname + "\n")
loader = Loader(classname="weka.core.converters.ArffLoader")
test = loader.load_file(fname)
test.class_is_last()

# (classifier classname, StringToWordVector filter options) pairs to compare.
setups = (
    ("weka.classifiers.trees.J48", []),
    ("weka.classifiers.bayes.NaiveBayes", []),
    ("weka.classifiers.bayes.NaiveBayesMultinomial", []),
    ("weka.classifiers.bayes.NaiveBayesMultinomial", ["-C"]),
    ("weka.classifiers.bayes.NaiveBayesMultinomial", ["-C", "-L", "-stopwords-handler", "weka.core.stopwords.Rainbow"])
)

# build each setup on the training data and evaluate it on the test set
for setup in setups:
    classifier, opt = setup
    print("\n--> %s (filter options: %s)\n" % (classifier, " ".join(opt)))
    cls = FilteredClassifier()
    cls.classifier = Classifier(classname=classifier)
    cls.filter = Filter(classname="weka.filters.unsupervised.attribute.StringToWordVector", options=opt)
    # NOTE(review): trains on `data` — presumably the training Instances
    # loaded in unseen earlier lines; confirm it is not meant to be `test`.
    cls.build_classifier(data)
    evl = Evaluation(test)
    evl.test_model(cls, test)
    print("Accuracy: %0.0f%%" % evl.percent_correct)
    tcdata = plc.generate_thresholdcurve_data(evl, 0)
    print("AUC: %0.3f" % plc.get_auc(tcdata))
    print(evl.matrix("Matrix:"))

jvm.stop()
Ejemplo n.º 10
0
# NOTE(review): script chunk — `train`, `converters`, `Filter`, `Classifier`,
# `FilteredClassifier`, `Evaluation`, `Random` and `plcls` come from earlier,
# unseen lines of this file.
test = converters.load_any_file("imbalanced_test.arff")

train.class_is_last()
test.class_is_last()
# NOTE(review): `test` is loaded and its class set, but it is never
# evaluated in the visible lines — presumably used further down.

# Minority Class is getting Sampled 5x (-P 500 = +500% synthetic minority instances)
smote = Filter(classname="weka.filters.supervised.instance.SMOTE",
               options=["-P", "500.0"])

# Base Classifier
cls = Classifier(classname="weka.classifiers.trees.LMT",
                 options=["-B", "-I", "10"])

# Filtered Classifier: wrapping SMOTE in a FilteredClassifier means the
# filter is fitted on each training fold, not on the evaluation folds.
fc = FilteredClassifier()
fc.filter = smote
fc.classifier = cls

# 5 Fold K cross validation
evl = Evaluation(train)
evl.crossvalidate_model(fc, train, 5, Random(1))

# Prints Out Confusion Matrix along with other summary statistics
print("LMT (SMOTE balanced classes) CV = 5 Error: %.2f%%" %
      (evl.percent_incorrect))
print(evl.matrix())  #Confusion Matrix

# Plots ROC
plcls.plot_roc(evl, class_index=[0, 1], wait=True)

# Extra Summary