def Boost_J48(data, rnm):
    """Cross-validate an AdaBoost-boosted J48 tree and write its reports.

    Writes three text files (tree, CSV predictions, evaluation summary) and a
    ROC plot, all prefixed with *rnm*.

    Parameters:
        data: weka Instances; the LAST attribute is used as the class.
        rnm: basename used as prefix for every output file.

    Returns:
        Percent of correctly classified instances, as a string.
    """
    data.class_is_last()
    # J48 behind a Remove filter that drops the first attribute
    fc1 = FilteredClassifier()
    fc1.classifier = Classifier(classname="weka.classifiers.trees.J48",
                                options=["-C", "0.25", "-M", "2"])
    fc1.filter = Filter(classname="weka.filters.unsupervised.attribute.Remove",
                        options=["-R", "first"])
    # AdaBoostM1 boosting the filtered J48 (10 iterations)
    fc2 = SingleClassifierEnhancer(
        classname="weka.classifiers.meta.AdaBoostM1",
        options=["-P", "100", "-S", "1", "-I", "10"])
    fc2.classifier = fc1
    pred_output = PredictionOutput(
        classname="weka.classifiers.evaluation.output.prediction.CSV",
        options=["-p", "1"])
    folds = 10
    fc2.build_classifier(data)  # final model on all data, for the report below
    evaluation = Evaluation(data)
    evaluation.crossvalidate_model(fc2, data, folds, Random(1), pred_output)
    # FIX: the original used Python 2 "print >> f" statements, a SyntaxError
    # under Python 3 (the rest of this file uses print()); files are now
    # managed with `with` so handles close even on error.
    with open(rnm + '_Boost_J48_Tree.txt', 'w') as f0:
        print("Filename: ", rnm, file=f0)
        print('\n\n', file=f0)
        print(str(fc2), file=f0)
    with open(rnm + '_Boost_J48_Prediction.txt', 'w') as f1:
        print('Filename:', rnm, file=f1)
        print('Prediction Summary:', pred_output.buffer_content(), file=f1)
    # NOTE(review): lowercase 'j48' in this filename is kept byte-identical for
    # backward compatibility with existing consumers of the output files.
    with open(rnm + '_Boost_j48_Evaluation.txt', 'w') as f2:
        print('Filename:', rnm, file=f2)
        print('Evaluation Summary:', evaluation.summary(), file=f2)
        print('\n\n\n', file=f2)
        print(evaluation.class_details(), file=f2)
    plot_roc(evaluation, class_index=[0, 1], title=rnm, key_loc='best',
             outfile=rnm + '_Boost_J48_ROC.png', wait=False)
    return str(evaluation.percent_correct)
def RandomTree(data, rnm):
    """Cross-validate a RandomTree classifier and write its reports.

    Writes three text files (tree, CSV predictions, evaluation summary) and a
    ROC plot, all prefixed with *rnm*.

    Parameters:
        data: weka Instances; the LAST attribute is used as the class.
        rnm: basename used as prefix for every output file.

    Returns:
        Percent of correctly classified instances, as a string.
    """
    data.class_is_last()
    # RandomTree behind a Remove filter that drops the first attribute
    fc = FilteredClassifier()
    fc.classifier = Classifier(
        classname="weka.classifiers.trees.RandomTree",
        options=["-K", "0", "-M", "1.0", "-V", "0.001", "-S", "1"])
    fc.filter = Filter(classname="weka.filters.unsupervised.attribute.Remove",
                       options=["-R", "first"])
    pred_output = PredictionOutput(
        classname="weka.classifiers.evaluation.output.prediction.CSV",
        options=["-p", "1"])
    folds = 10
    evl = Evaluation(data)
    evl.crossvalidate_model(fc, data, folds, Random(1), pred_output)
    fc.build_classifier(data)  # final model on all data, for the report below
    # FIX: the original used Python 2 "print >> f" statements, a SyntaxError
    # under Python 3 (the rest of this file uses print()); files are now
    # managed with `with` so handles close even on error.
    with open(rnm + '_RT_Tree.txt', 'w') as f0:
        print("Filename: ", rnm, file=f0)
        print('\n\n', file=f0)
        print(str(fc), file=f0)
    with open(rnm + '_RT_Prediction.txt', 'w') as f1:
        print('Filename:', rnm, file=f1)
        print('Prediction Summary:', pred_output.buffer_content(), file=f1)
    with open(rnm + '_RT_Evaluation.txt', 'w') as f2:
        print('Filename:', rnm, file=f2)
        print('Evaluation Summary:', evl.summary(), file=f2)
        print('\n\n\n', file=f2)
        print(evl.class_details(), file=f2)
    plot_roc(evl, class_index=[0, 1], title=rnm, key_loc='best',
             outfile=rnm + '_RT_ROC.png', wait=False)
    return str(evl.percent_correct)
# Esempio n. 3
# 0
def fitness(toeval : Individual):
    """Score an individual's MultilayerPerceptron settings.

    Builds an MLP from the individual's option list, wraps it behind the
    module-level `remove` filter, and returns the 10-fold cross-validation
    accuracy on the module-level `data` set.
    """
    perceptron = Classifier(
        classname="weka.classifiers.functions.MultilayerPerceptron",
        options=toeval.settings())
    wrapped = FilteredClassifier()
    wrapped.classifier = perceptron
    wrapped.filter = remove
    evaluation = Evaluation(data)
    evaluation.crossvalidate_model(wrapped, data, 10, Random(1))
    return evaluation.percent_correct
def TrainingModel(arff, modelOutput, clsfier):
    """Train a weka classifier on an ARFF file, report 10-fold CV metrics and
    save the serialized model.

    Parameters:
        arff: path to the training ARFF file (class attribute is the FIRST one).
        modelOutput: path the serialized model (plus dataset header) is written to.
        clsfier: classifier class name relative to "weka.classifiers.",
                 e.g. "trees.RandomForest".
    """
    # Start the Java virtual machine
    jvm.start()
    try:
        # Load the training set
        loader = Loader(classname="weka.core.converters.ArffLoader")
        train = loader.load_file(arff)
        train.class_is_first()
        # RandomForest gave the best TPR/TNR of the algorithms tried in the
        # weka GUI, hence it is the usual choice for `clsfier`.
        cls_name = "weka.classifiers." + clsfier
        clsf = Classifier(classname=cls_name)
        clsf.build_classifier(train)
        print(clsf)
        # Evaluate with 10-fold cross-validation (the FilteredClassifier has
        # no filter set, so it is a pass-through wrapper here)
        fc = FilteredClassifier()
        fc.classifier = clsf
        evl = Evaluation(train)
        evl.crossvalidate_model(fc, train, 10, Random(1))
        print(evl.percent_correct)
        print(evl.summary())
        print(evl.class_details())
        print(evl.matrix())
        # Derive binary statistics from the confusion matrix.
        # NOTE(review): assumes exactly two classes ordered (negative, positive);
        # a zero denominator below raises ZeroDivisionError — confirm with data.
        matrixResults = evl.confusion_matrix
        TN = float(matrixResults[0][0])
        FP = float(matrixResults[0][1])
        FN = float(matrixResults[1][0])
        TP = float(matrixResults[1][1])
        TPR = TP / (TP + FN)
        TNR = TN / (FP + TN)
        PPV = TP / (TP + FP)
        NPV = TN / (TN + FN)
        print("算法: " + clsfier)
        print("敏感度 TPR: " + str(TPR))
        print("特异度 TNR: " + str(TNR))
        print("PPV: " + str(PPV))
        print("NPV: " + str(NPV))
        # Save the model together with the dataset header
        clsf.serialize(modelOutput, header=train)
    finally:
        # FIX: shut the JVM down even when training/evaluation raises
        jvm.stop()
    print("分析模型建立完成")
# Esempio n. 5
# 0
    # NOTE(review): these indented lines are the body of a per-group loop whose
    # "for" header is outside this chunk (a complete version of the same loop
    # appears later in the file) — this fragment does not run standalone.
    print(group)
    # ARFF paths: calibration (training), test and validation sets for the group
    train = data_dir + os.sep + group + "_Cal.arff"
    test = data_dir + os.sep + group + "_Test.arff"
    pred = data_dir + os.sep + group + "_Val.arff"

    loader = Loader(classname="weka.core.converters.ArffLoader")
    print(train)
    train_data = loader.load_file(train)
    # the regression target is the attribute named "reference value"
    train_data.class_index = train_data.attribute_by_name(
        "reference value").index
    print(test)
    test_data = loader.load_file(test)
    test_data.class_index = test_data.attribute_by_name(
        "reference value").index
    print(pred)
    pred_data = loader.load_file(pred)
    pred_data.class_index = pred_data.attribute_by_name(
        "reference value").index

    # LinearRegression behind a Remove filter that drops the first attribute
    cls = FilteredClassifier()
    cls.classifier = Classifier(
        classname="weka.classifiers.functions.LinearRegression",
        options=["-S", "1", "-C"])
    cls.filter = Filter(classname="weka.filters.unsupervised.attribute.Remove",
                        options=["-R", "first"])
    cls.build_classifier(train_data)
    # evaluate the calibration-trained model on the held-out test set
    evl = Evaluation(train_data)
    evl.test_model(cls, test_data)
    print(evl.summary())

jvm.stop()
# Esempio n. 6
# 0
jvm.start()

# Load the APS failure training and test sets from CSV; the class is the
# last column in both.
loader = Loader(classname="weka.core.converters.CSVLoader")
data = loader.load_file("C:/Arpit/aps.failure_training_set.csv")
data_test = loader.load_file("C:/Arpit/aps.failure_test_set.csv")
data.class_is_last()
data_test.class_is_last()

# LMT (logistic model tree) behind a FilteredClassifier; no filter is set,
# so the wrapper is currently a pass-through.
cls = Classifier(classname="weka.classifiers.trees.LMT")

fc = FilteredClassifier()
fc.classifier = cls

evl = Evaluation(data)

# 10-fold cross-validation on the training data
evl.crossvalidate_model(fc, data, 10, Random(1))
# FIX: the original called evl.test_model(cls, data_test) without ever training
# cls — crossvalidate_model only builds internal copies. Build the final model
# on the full training set first, then score it on the held-out test set.
fc.build_classifier(data)
preds = evl.test_model(fc, data_test)

conf = evl.confusion_matrix
print(evl.percent_incorrect)
# FIX: confusion_matrix holds floats, so annot fmt='d' raises in seaborn;
# '.0f' renders the same integer counts.
sns.heatmap(conf, cmap="YlGnBu", annot=True, linewidths=.5, fmt='.0f')

# NOTE(review): area_under_prc is the precision/recall AUC, not ROC AUC —
# the "AUC" label is kept as-is for output compatibility; confirm intent.
print("AUC", evl.area_under_prc)
import weka.plot.classifiers as plcls  # NB: matplotlib is required
plcls.plot_roc(evl, class_index=[0, 1], wait=True)
# Esempio n. 7
# 0
def main():
    """
    Just runs some example code.

    For each pre-selected feature-vector ARFF file, cross-validates a battery
    of weka classifiers and writes every evaluation summary both to a text
    file and to an Excel workbook (one classifier per row).
    """
    # alternative configurations kept for reference:
    #case=["selected_chi100","selected_chi150","selected_chi200","selected_chi250","selected_chi300","selected_chi350","selected_fe100","selected_fe150","selected_fe200","selected_fe250","selected_fe300","selected_fe350","selected_sfm100","selected_sfm150","selected_sfm200","selected_sfm250","selected_sfm300","selected_sfm350","selected_sfmt100","selected_sfmt150","selected_sfmt200","selected_sfmt250","selected_sfmt300","selected_sfmt350","selected_cs100","selected_cs150","selected_cs200","selected_cs250","selected_cs300","selected_cs350"]
    #case=["feature_vector100","feature_vector110","feature_vector120","feature_vector130","feature_vector140","feature_vector150","feature_vector90","feature_vector80","feature_vector70","feature_vector60","feature_vector50","feature_vector40","feature_vector30","feature_vector20","feature_vector10","feature_vector5"]
    case = [
        "selected_chi100", "selected_chi150", "selected_chi200",
        "selected_chi250", "selected_chi300", "selected_chi350",
        "selected_fe100", "selected_fe150", "selected_fe200", "selected_fe250",
        "selected_fe300", "selected_fe350", "selected_sfm100",
        "selected_sfm150", "selected_sfm200", "selected_sfm250",
        "selected_sfm300", "selected_sfm350", "selected_sfmt100",
        "selected_sfmt150", "selected_sfmt200", "selected_sfmt250",
        "selected_sfmt300", "selected_sfmt350"
    ]
    #case=["selected_chi100","selected_chi150","selected_chi200","selected_fe100","selected_fe150","selected_fe200","selected_sfm100","selected_sfm150","selected_sfm200","selected_sfmt100","selected_sfmt150","selected_sfmt200","selected_cs100","selected_cs150","selected_cs200",]

    for nomefile in case:

        print(nomefile)
        feature_vector = "./selected_vectors30/" + nomefile + ".arff"  # ARFF input file

        loader = Loader("weka.core.converters.ArffLoader")
        data = loader.load_file(feature_vector)
        data.class_is_last()

        # Excel header row.
        # NOTE(review): these are regression-metric labels, while the
        # algorithms below are classifiers — the column labels may not line
        # up with the actual summary() rows; confirm against the workbook.
        intest = [
            "Correlation coefficient", "Mean absolute error",
            "Root mean squared error", "Relative absolute error",
            "Root relative squared error", "Total Number of Instances"
        ]
        workbook = xlsxwriter.Workbook("./selected_vectors30/risultati/" +
                                       nomefile + ".xlsx")  # Excel results file
        worksheet = workbook.add_worksheet()

        for col_num, dati in enumerate(intest):
            worksheet.write(0, col_num + 1, dati)
        riga = 1

        # list of algorithms to run
        #alg=["meta.Bagging","meta.RandomSubSpace","rules.M5Rules","trees.M5P","trees.RandomForest"]
        alg = [
            "bayes.NaiveBayes", "bayes.NaiveBayesUpdateable",
            "functions.Logistic", "functions.SGD", "functions.SimpleLogistic",
            "functions.SMO", "functions.VotedPerceptron", "meta.AdaBoostM1",
            "meta.AttributeSelectedClassifier", "meta.Bagging",
            "meta.ClassificationViaRegression",
            "meta.IterativeClassifierOptimizer", "meta.LogitBoost",
            "meta.RandomCommittee", "meta.RandomSubSpace",
            "rules.DecisionTable", "rules.JRip", "rules.OneR",
            "trees.DecisionStump", "trees.J48", "trees.RandomForest",
            "trees.REPTree"
        ]

        for row_num, dati in enumerate(alg):
            worksheet.write(row_num + 1, 0, dati)

        # FIX: the text results file is now opened via `with`, so the handle
        # is closed even if an evaluation raises.
        with open("./selected_vectors30/risultati/" + nomefile + ".txt",
                  "w+") as f:
            for i in alg:
                remove = Filter(
                    classname="weka.filters.unsupervised.attribute.Remove")
                cls = Classifier(classname="weka.classifiers." + i)
                fc = FilteredClassifier()
                fc.filter = remove
                fc.classifier = cls

                evl = Evaluation(data)
                evl.crossvalidate_model(fc, data, 10,
                                        Random(1))  # 10 fold cross validation
                #evl.evaluate_train_test_split(fc,data,50,None,None) # 50% split cross validation

                k = evl.summary()

                # write results: full summary to the text file, and the last
                # 10 characters of each summary line (the numeric column)
                # into one worksheet row per algorithm
                f.write(i + "\n")
                f.write(k + "\n")
                my_list = k.split('\n')
                for col_num, dati in enumerate(my_list):
                    worksheet.write(riga, col_num, dati[-10:])
                print(i)
                riga += 1
        workbook.close()
# Esempio n. 8
# 0
print("Train/test/predict...")

groups = ["DataSet1", "DataSet2"]
# groups = ["DataSet2"]

for group in groups:
    # Build a LinearRegression model on the calibration set of each group
    # and report its performance on the matching test set.
    print(group)
    loader = Loader(classname="weka.core.converters.ArffLoader")
    datasets = {}
    for suffix in ("_Cal", "_Test", "_Val"):
        path = data_dir + os.sep + group + suffix + ".arff"
        print(path)
        instances = loader.load_file(path)
        # the regression target is the attribute named "reference value"
        instances.class_index = instances.attribute_by_name("reference value").index
        datasets[suffix] = instances

    # LinearRegression behind a Remove filter that drops the first attribute
    cls = FilteredClassifier()
    cls.classifier = Classifier(
        classname="weka.classifiers.functions.LinearRegression",
        options=["-S", "1", "-C"])
    cls.filter = Filter(
        classname="weka.filters.unsupervised.attribute.Remove",
        options=["-R", "first"])
    cls.build_classifier(datasets["_Cal"])
    evl = Evaluation(datasets["_Cal"])
    evl.test_model(cls, datasets["_Test"])
    print(evl.summary())

jvm.stop()
# Esempio n. 9
# 0
fname = data_dir + os.sep + "ReutersGrain-test.arff"
print("\nLoading dataset: " + fname + "\n")
loader = Loader(classname="weka.core.converters.ArffLoader")
test = loader.load_file(fname)
test.class_is_last()

# (classifier classname, StringToWordVector filter options) pairs to compare
setups = (
    ("weka.classifiers.trees.J48", []),
    ("weka.classifiers.bayes.NaiveBayes", []),
    ("weka.classifiers.bayes.NaiveBayesMultinomial", []),
    ("weka.classifiers.bayes.NaiveBayesMultinomial", ["-C"]),
    ("weka.classifiers.bayes.NaiveBayesMultinomial", ["-C", "-L", "-stopwords-handler", "weka.core.stopwords.Rainbow"])
)

# cross-validate classifiers
for classifier, opt in setups:
    print("\n--> %s (filter options: %s)\n" % (classifier, " ".join(opt)))
    cls = FilteredClassifier()
    cls.filter = Filter(
        classname="weka.filters.unsupervised.attribute.StringToWordVector",
        options=opt)
    cls.classifier = Classifier(classname=classifier)
    # `data` is the training set loaded earlier (outside this chunk)
    cls.build_classifier(data)
    evl = Evaluation(test)
    evl.test_model(cls, test)
    print("Accuracy: %0.0f%%" % evl.percent_correct)
    tcdata = plc.generate_thresholdcurve_data(evl, 0)
    print("AUC: %0.3f" % plc.get_auc(tcdata))
    print(evl.matrix("Matrix:"))

jvm.stop()
# Esempio n. 10
# 0
# NOTE(review): fragment — tevlmt, Wtrain, pd, plt, sns, plcls and packages
# are all defined outside this chunk, and the sequence continues past its end.
print("Error is",tevlmt.error_rate)
# confusion matrix of the earlier evaluation; assumes a binary problem with
# classes ordered (neg, pos) — TODO confirm against the dataset header
tcm2e = tevlmt.confusion_matrix
tcm2E = pd.DataFrame(tcm2e, index = ["neg","pos"],columns = ["neg","pos"])
plt.figure(figsize = (7,7))
axis = sns.heatmap(tcm2E, annot=True, cbar=False, cmap="Reds")
plcls.plot_roc(tevlmt,class_index=[1])


# install weka's SMOTE oversampling package (one-off; needs network access)
packages.install_package("SMOTE")


# LMT behind a SMOTE filter (-P 4800: percentage of synthetic minority
# instances to create); FilteredClassifier applies the filter at training time
smote = Filter(classname="weka.filters.supervised.instance.SMOTE",options=["-P", "4800"])
smt = Classifier(classname="weka.classifiers.trees.LMT")
fc = FilteredClassifier()
fc.filter = smote
fc.classifier = smt
fc.build_classifier(Wtrain)


# 5-fold cross-validation of the SMOTE+LMT pipeline on the training data
evsmt = Evaluation(Wtrain)
evsmt.crossvalidate_model(fc, Wtrain, 5, Random(1))

print("Error is",evsmt.error_rate)
cm2f = evsmt.confusion_matrix
cm2F = pd.DataFrame(cm2f, index = ["neg","pos"],columns = ["neg","pos"])
plt.figure(figsize = (7,7))
axis = sns.heatmap(cm2F, annot=True, cbar=False, cmap="Reds")
plcls.plot_roc(evsmt,class_index=[1])


# evaluation object for a subsequent test-set run (continues past this chunk)
tevsmt = Evaluation(Wtrain)