def Boost_J48(data, rnm):
    data.class_is_last()
    fc1 = FilteredClassifier()
    fc1.classifier = Classifier(classname="weka.classifiers.trees.J48", options=["-C", "0.25", "-M", "2"])
    fc1.filter = Filter(classname="weka.filters.unsupervised.attribute.Remove", options=["-R", "first"])
    fc2 = SingleClassifierEnhancer(classname="weka.classifiers.meta.AdaBoostM1", options=["-P", "100", "-S", "1", "-I", "10"])
    fc2.classifier = fc1
    pred_output = PredictionOutput(classname="weka.classifiers.evaluation.output.prediction.CSV", options=["-p", "1"])
    folds = 10
    fc2.build_classifier(data)
    evaluation = Evaluation(data)
    evaluation.crossvalidate_model(fc2, data, folds, Random(1), pred_output)
    f0 = open(rnm + '_Boost_J48_Tree.txt', 'w')
    print >> f0, "Filename: ", rnm
    print >> f0, '\n\n'
    print >> f0, str(fc2)
    f0.close()
    f1 = open(rnm + '_Boost_J48_Prediction.txt', 'w')
    print >> f1, 'Filename:', rnm
    print >> f1, 'Prediction Summary:', (pred_output.buffer_content())
    f1.close()
    f2 = open(rnm + '_Boost_j48_Evaluation.txt', 'w')
    print >> f2, 'Filename:', rnm
    print >> f2, 'Evaluation Summary:', (evaluation.summary())
    print >> f2, '\n\n\n'
    print >> f2, (evaluation.class_details())
    f2.close()
    plot_roc(evaluation, class_index=[0,1], title=rnm, key_loc='best', outfile=rnm + '_Boost_J48_ROC.png', wait=False)
    value_Boost_J48 = str(evaluation.percent_correct)
    return value_Boost_J48
Example #2
0
def fitness(toeval : Individual):
    cls = Classifier(classname="weka.classifiers.functions.MultilayerPerceptron", options=toeval.settings())
    fc = FilteredClassifier()
    fc.filter = remove
    fc.classifier = cls
    evl = Evaluation(data)
    evl.crossvalidate_model(fc, data, 10, Random(1))
    return evl.percent_correct
def TrainingModel(arff, modelOutput, clsfier):
    # 启动java虚拟机
    jvm.start()
    # 导入训练集
    loader = Loader(classname="weka.core.converters.ArffLoader")
    train = loader.load_file(arff)
    train.class_is_first()
    # 使用RandomForest算法进行训练,因为在GUI版本weka中使用多种方式训练后发现此方式TPR与TNR较高
    cls_name = "weka.classifiers." + clsfier
    clsf = Classifier(classname=cls_name)
    clsf.build_classifier(train)
    print(clsf)
    # 建立模型
    fc = FilteredClassifier()
    fc.classifier = clsf
    evl = Evaluation(train)
    evl.crossvalidate_model(fc, train, 10, Random(1))
    print(evl.percent_correct)
    print(evl.summary())
    print(evl.class_details())
    print(evl.matrix())
    # 结果统计
    matrixResults = evl.confusion_matrix
    TN = float(matrixResults[0][0])
    FP = float(matrixResults[0][1])
    FN = float(matrixResults[1][0])
    TP = float(matrixResults[1][1])
    TPR = TP / (TP + FN)
    TNR = TN / (FP + TN)
    PPV = TP / (TP + FP)
    NPV = TN / (TN + FN)
    print("算法: " + clsfier)
    print("敏感度 TPR: " + str(TPR))
    print("特异度 TNR: " + str(TNR))
    print("PPV: " + str(PPV))
    print("NPV: " + str(NPV))
    # 保存模型
    clsf.serialize(modelOutput, header=train)
    # 退出虚拟机
    jvm.stop()
    print("分析模型建立完成")
def RandomTree(data, rnm):
    data.class_is_last()
    fc = FilteredClassifier()
    fc.classifier = Classifier(classname="weka.classifiers.trees.RandomTree", options=["-K", "0", "-M", "1.0", "-V", "0.001", "-S", "1"])
    fc.filter = Filter(classname="weka.filters.unsupervised.attribute.Remove", options=["-R", "first"])
    pred_output = PredictionOutput(classname="weka.classifiers.evaluation.output.prediction.CSV", options=["-p", "1"])
    folds = 10
    evl = Evaluation(data)
    evl.crossvalidate_model(fc, data, folds, Random(1), pred_output)
    fc.build_classifier(data)
    f0 = open(rnm + '_RT_Tree.txt', 'w')
    print >> f0, "Filename: ", rnm
    print >> f0, '\n\n'
    print >> f0, str(fc)
    f0.close()
    f1 = open(rnm + '_RT_Prediction.txt', 'w')
    print >> f1, 'Filename:', rnm
    print >> f1, 'Prediction Summary:', (pred_output.buffer_content())
    f1.close()
    f2 = open(rnm + '_RT_Evaluation.txt', 'w')
    print >> f2, 'Filename:', rnm
    print >> f2, 'Evaluation Summary:', (evl.summary())
    print >> f2, '\n\n\n'
    print >> f2, (evl.class_details())
    f2.close()
    plot_roc(evl, class_index=[0,1], title=rnm, key_loc='best', outfile=rnm+'_RT_ROC.png', wait=False)
    value_RT = str(evl.percent_correct)
    return value_RT
Example #5
0
fname = data_dir + os.sep + "ReutersGrain-test.arff"
print("\nLoading dataset: " + fname + "\n")
loader = Loader(classname="weka.core.converters.ArffLoader")
test = loader.load_file(fname)
test.set_class_index(test.num_attributes() - 1)

setups = (
    ("weka.classifiers.trees.J48", []),
    ("weka.classifiers.bayes.NaiveBayes", []),
    ("weka.classifiers.bayes.NaiveBayesMultinomial", []),
    ("weka.classifiers.bayes.NaiveBayesMultinomial", ["-C"]),
    ("weka.classifiers.bayes.NaiveBayesMultinomial", ["-C", "-L", "-S"])
)

# cross-validate classifiers
for setup in setups:
    classifier, opt = setup
    print("\n--> %s (filter options: %s)\n" % (classifier, " ".join(opt)))
    cls = FilteredClassifier()
    cls.set_classifier(Classifier(classname=classifier))
    cls.set_filter(Filter(classname="weka.filters.unsupervised.attribute.StringToWordVector", options=opt))
    cls.build_classifier(data)
    evl = Evaluation(test)
    evl.test_model(cls, test)
    print("Accuracy: %0.0f%%" % evl.percent_correct())
    tcdata = plc.generate_thresholdcurve_data(evl, 0)
    print("AUC: %0.3f" % plc.get_auc(tcdata))
    print(evl.to_matrix("Matrix:"))

jvm.stop()
Example #6
0
data.class_is_last()

# 1. cheating with default filter
fltr = Filter(classname="weka.filters.supervised.attribute.Discretize",
              options=[])
fltr.inputformat(data)
filtered = fltr.filter(data)
cls = Classifier(classname="weka.classifiers.trees.J48")
evl = Evaluation(filtered)
evl.crossvalidate_model(cls, filtered, 10, Random(1))
cls.build_classifier(filtered)
print("cheating (default): accuracy=%0.1f nodes=%s" %
      (evl.percent_correct, get_nodes(str(cls))))

# 2. using FilteredClassifier with default filter
cls = FilteredClassifier()
cls.classifier = Classifier(classname="weka.classifiers.trees.J48")
cls.filter = Filter(classname="weka.filters.supervised.attribute.Discretize",
                    options=[])
evl = Evaluation(data)
evl.crossvalidate_model(cls, data, 10, Random(1))
cls.build_classifier(data)
print("FilteredClassifier (default): accuracy=%0.1f nodes=%s" %
      (evl.percent_correct, get_nodes(str(cls))))

# 3. using FilteredClassifier (make binary)
cls = FilteredClassifier()
cls.classifier = Classifier(classname="weka.classifiers.trees.J48")
cls.filter = Filter(classname="weka.filters.supervised.attribute.Discretize",
                    options=["-D"])
evl = Evaluation(data)
def main():
    """
    Just runs some example code.
    """

    # load a dataset
    iris_file = helper.get_data_dir() + os.sep + "iris.arff"
    helper.print_info("Loading dataset: " + iris_file)
    loader = Loader("weka.core.converters.ArffLoader")
    iris_data = loader.load_file(iris_file)
    iris_data.class_is_last()

    # classifier help
    helper.print_title("Creating help string")
    classifier = Classifier(classname="weka.classifiers.trees.J48")
    print(classifier.to_help())

    # partial classname
    helper.print_title("Creating classifier from partial classname")
    clsname = ".J48"
    classifier = Classifier(classname=clsname)
    print(clsname + " --> " + classifier.classname)

    # classifier from commandline
    helper.print_title("Creating SMO from command-line string")
    cmdline = 'weka.classifiers.functions.SMO -K "weka.classifiers.functions.supportVector.NormalizedPolyKernel -E 3.0"'
    classifier = from_commandline(cmdline,
                                  classname="weka.classifiers.Classifier")
    classifier.build_classifier(iris_data)
    print("input: " + cmdline)
    print("output: " + classifier.to_commandline())
    print("model:\n" + str(classifier))

    # kernel classifier
    helper.print_title("Creating SMO as KernelClassifier")
    kernel = Kernel(
        classname="weka.classifiers.functions.supportVector.RBFKernel",
        options=["-G", "0.001"])
    classifier = KernelClassifier(classname="weka.classifiers.functions.SMO",
                                  options=["-M"])
    classifier.kernel = kernel
    classifier.build_classifier(iris_data)
    print("classifier: " + classifier.to_commandline())
    print("model:\n" + str(classifier))

    # build a classifier and output model
    helper.print_title("Training J48 classifier on iris")
    classifier = Classifier(classname="weka.classifiers.trees.J48")
    # Instead of using 'options=["-C", "0.3"]' in the constructor, we can also set the "confidenceFactor"
    # property of the J48 classifier itself. However, being of type float rather than double, we need
    # to convert it to the correct type first using the double_to_float function:
    classifier.set_property("confidenceFactor", types.double_to_float(0.3))
    classifier.build_classifier(iris_data)
    print(classifier)
    print(classifier.graph)
    print(classifier.to_source("MyJ48"))
    plot_graph.plot_dot_graph(classifier.graph)

    # evaluate model on test set
    helper.print_title("Evaluating J48 classifier on iris")
    evaluation = Evaluation(iris_data)
    evl = evaluation.test_model(classifier, iris_data)
    print(evl)
    print(evaluation.summary())

    # evaluate model on train/test split
    helper.print_title("Evaluating J48 classifier on iris (random split 66%)")
    classifier = Classifier(classname="weka.classifiers.trees.J48",
                            options=["-C", "0.3"])
    evaluation = Evaluation(iris_data)
    evaluation.evaluate_train_test_split(classifier, iris_data, 66.0,
                                         Random(1))
    print(evaluation.summary())

    # load a dataset incrementally and build classifier incrementally
    helper.print_title("Build classifier incrementally on iris")
    helper.print_info("Loading dataset: " + iris_file)
    loader = Loader("weka.core.converters.ArffLoader")
    iris_inc = loader.load_file(iris_file, incremental=True)
    iris_inc.class_is_last()
    classifier = Classifier(
        classname="weka.classifiers.bayes.NaiveBayesUpdateable")
    classifier.build_classifier(iris_inc)
    for inst in loader:
        classifier.update_classifier(inst)
    print(classifier)

    # construct meta-classifiers
    helper.print_title("Meta classifiers")
    # generic FilteredClassifier instantiation
    print("generic FilteredClassifier instantiation")
    meta = SingleClassifierEnhancer(
        classname="weka.classifiers.meta.FilteredClassifier")
    meta.classifier = Classifier(
        classname="weka.classifiers.functions.LinearRegression")
    flter = Filter("weka.filters.unsupervised.attribute.Remove")
    flter.options = ["-R", "first"]
    meta.set_property("filter", flter.jobject)
    print(meta.to_commandline())
    # direct FilteredClassifier instantiation
    print("direct FilteredClassifier instantiation")
    meta = FilteredClassifier()
    meta.classifier = Classifier(
        classname="weka.classifiers.functions.LinearRegression")
    flter = Filter("weka.filters.unsupervised.attribute.Remove")
    flter.options = ["-R", "first"]
    meta.filter = flter
    print(meta.to_commandline())
    # generic Vote
    print("generic Vote instantiation")
    meta = MultipleClassifiersCombiner(classname="weka.classifiers.meta.Vote")
    classifiers = [
        Classifier(classname="weka.classifiers.functions.SMO"),
        Classifier(classname="weka.classifiers.trees.J48")
    ]
    meta.classifiers = classifiers
    print(meta.to_commandline())

    # cross-validate nominal classifier
    helper.print_title("Cross-validating NaiveBayes on diabetes")
    diabetes_file = helper.get_data_dir() + os.sep + "diabetes.arff"
    helper.print_info("Loading dataset: " + diabetes_file)
    loader = Loader("weka.core.converters.ArffLoader")
    diabetes_data = loader.load_file(diabetes_file)
    diabetes_data.class_is_last()
    classifier = Classifier(classname="weka.classifiers.bayes.NaiveBayes")
    pred_output = PredictionOutput(
        classname="weka.classifiers.evaluation.output.prediction.PlainText",
        options=["-distribution"])
    evaluation = Evaluation(diabetes_data)
    evaluation.crossvalidate_model(classifier,
                                   diabetes_data,
                                   10,
                                   Random(42),
                                   output=pred_output)
    print(evaluation.summary())
    print(evaluation.class_details())
    print(evaluation.matrix())
    print("areaUnderPRC/0: " + str(evaluation.area_under_prc(0)))
    print("weightedAreaUnderPRC: " + str(evaluation.weighted_area_under_prc))
    print("areaUnderROC/1: " + str(evaluation.area_under_roc(1)))
    print("weightedAreaUnderROC: " + str(evaluation.weighted_area_under_roc))
    print("avgCost: " + str(evaluation.avg_cost))
    print("totalCost: " + str(evaluation.total_cost))
    print("confusionMatrix: " + str(evaluation.confusion_matrix))
    print("correct: " + str(evaluation.correct))
    print("pctCorrect: " + str(evaluation.percent_correct))
    print("incorrect: " + str(evaluation.incorrect))
    print("pctIncorrect: " + str(evaluation.percent_incorrect))
    print("unclassified: " + str(evaluation.unclassified))
    print("pctUnclassified: " + str(evaluation.percent_unclassified))
    print("coverageOfTestCasesByPredictedRegions: " +
          str(evaluation.coverage_of_test_cases_by_predicted_regions))
    print("sizeOfPredictedRegions: " +
          str(evaluation.size_of_predicted_regions))
    print("falseNegativeRate: " + str(evaluation.false_negative_rate(1)))
    print("weightedFalseNegativeRate: " +
          str(evaluation.weighted_false_negative_rate))
    print("numFalseNegatives: " + str(evaluation.num_false_negatives(1)))
    print("trueNegativeRate: " + str(evaluation.true_negative_rate(1)))
    print("weightedTrueNegativeRate: " +
          str(evaluation.weighted_true_negative_rate))
    print("numTrueNegatives: " + str(evaluation.num_true_negatives(1)))
    print("falsePositiveRate: " + str(evaluation.false_positive_rate(1)))
    print("weightedFalsePositiveRate: " +
          str(evaluation.weighted_false_positive_rate))
    print("numFalsePositives: " + str(evaluation.num_false_positives(1)))
    print("truePositiveRate: " + str(evaluation.true_positive_rate(1)))
    print("weightedTruePositiveRate: " +
          str(evaluation.weighted_true_positive_rate))
    print("numTruePositives: " + str(evaluation.num_true_positives(1)))
    print("fMeasure: " + str(evaluation.f_measure(1)))
    print("weightedFMeasure: " + str(evaluation.weighted_f_measure))
    print("unweightedMacroFmeasure: " +
          str(evaluation.unweighted_macro_f_measure))
    print("unweightedMicroFmeasure: " +
          str(evaluation.unweighted_micro_f_measure))
    print("precision: " + str(evaluation.precision(1)))
    print("weightedPrecision: " + str(evaluation.weighted_precision))
    print("recall: " + str(evaluation.recall(1)))
    print("weightedRecall: " + str(evaluation.weighted_recall))
    print("kappa: " + str(evaluation.kappa))
    print("KBInformation: " + str(evaluation.kb_information))
    print("KBMeanInformation: " + str(evaluation.kb_mean_information))
    print("KBRelativeInformation: " + str(evaluation.kb_relative_information))
    print("SFEntropyGain: " + str(evaluation.sf_entropy_gain))
    print("SFMeanEntropyGain: " + str(evaluation.sf_mean_entropy_gain))
    print("SFMeanPriorEntropy: " + str(evaluation.sf_mean_prior_entropy))
    print("SFMeanSchemeEntropy: " + str(evaluation.sf_mean_scheme_entropy))
    print("matthewsCorrelationCoefficient: " +
          str(evaluation.matthews_correlation_coefficient(1)))
    print("weightedMatthewsCorrelation: " +
          str(evaluation.weighted_matthews_correlation))
    print("class priors: " + str(evaluation.class_priors))
    print("numInstances: " + str(evaluation.num_instances))
    print("meanAbsoluteError: " + str(evaluation.mean_absolute_error))
    print("meanPriorAbsoluteError: " +
          str(evaluation.mean_prior_absolute_error))
    print("relativeAbsoluteError: " + str(evaluation.relative_absolute_error))
    print("rootMeanSquaredError: " + str(evaluation.root_mean_squared_error))
    print("rootMeanPriorSquaredError: " +
          str(evaluation.root_mean_prior_squared_error))
    print("rootRelativeSquaredError: " +
          str(evaluation.root_relative_squared_error))
    print("prediction output:\n" + str(pred_output))
    plot_cls.plot_roc(evaluation,
                      title="ROC diabetes",
                      class_index=range(
                          0, diabetes_data.class_attribute.num_values),
                      wait=False)
    plot_cls.plot_prc(evaluation,
                      title="PRC diabetes",
                      class_index=range(
                          0, diabetes_data.class_attribute.num_values),
                      wait=False)

    # train 2nd classifier on diabetes dataset
    classifier2 = Classifier(classname="weka.classifiers.trees.RandomForest")
    evaluation2 = Evaluation(diabetes_data)
    evaluation2.crossvalidate_model(classifier2, diabetes_data, 10, Random(42))
    plot_cls.plot_rocs({
        "NB": evaluation,
        "RF": evaluation2
    },
                       title="ROC diabetes",
                       class_index=0,
                       wait=False)
    plot_cls.plot_prcs({
        "NB": evaluation,
        "RF": evaluation2
    },
                       title="PRC diabetes",
                       class_index=0,
                       wait=False)

    # load a numeric dataset
    bolts_file = helper.get_data_dir() + os.sep + "bolts.arff"
    helper.print_info("Loading dataset: " + bolts_file)
    loader = Loader("weka.core.converters.ArffLoader")
    bolts_data = loader.load_file(bolts_file)
    bolts_data.class_is_last()

    # build a classifier and output model
    helper.print_title("Training LinearRegression on bolts")
    classifier = Classifier(
        classname="weka.classifiers.functions.LinearRegression",
        options=["-S", "1", "-C"])
    classifier.build_classifier(bolts_data)
    print(classifier)

    # cross-validate numeric classifier
    helper.print_title("Cross-validating LinearRegression on bolts")
    classifier = Classifier(
        classname="weka.classifiers.functions.LinearRegression",
        options=["-S", "1", "-C"])
    evaluation = Evaluation(bolts_data)
    evaluation.crossvalidate_model(classifier, bolts_data, 10, Random(42))
    print(evaluation.summary())
    print("correlationCoefficient: " + str(evaluation.correlation_coefficient))
    print("errorRate: " + str(evaluation.error_rate))
    helper.print_title("Header - bolts")
    print(str(evaluation.header))
    helper.print_title("Predictions on bolts")
    for index, pred in enumerate(evaluation.predictions):
        print(
            str(index + 1) + ": " + str(pred) + " -> error=" + str(pred.error))
    plot_cls.plot_classifier_errors(evaluation.predictions, wait=False)

    # train 2nd classifier and show errors in same plot
    classifier2 = Classifier(classname="weka.classifiers.functions.SMOreg")
    evaluation2 = Evaluation(bolts_data)
    evaluation2.crossvalidate_model(classifier2, bolts_data, 10, Random(42))
    plot_cls.plot_classifier_errors(
        {
            "LR": evaluation.predictions,
            "SMOreg": evaluation2.predictions
        },
        wait=False)

    # learning curve
    cls = [
        Classifier(classname="weka.classifiers.trees.J48"),
        Classifier(classname="weka.classifiers.bayes.NaiveBayesUpdateable")
    ]
    plot_cls.plot_learning_curve(cls,
                                 diabetes_data,
                                 increments=0.05,
                                 label_template="[#] !",
                                 metric="percent_correct",
                                 wait=True)

    # access classifier's Java API
    labor_file = helper.get_data_dir() + os.sep + "labor.arff"
    helper.print_info("Loading dataset: " + labor_file)
    loader = Loader("weka.core.converters.ArffLoader")
    labor_data = loader.load_file(labor_file)
    labor_data.class_is_last()

    helper.print_title("Using JRip's Java API to access rules")
    jrip = Classifier(classname="weka.classifiers.rules.JRip")
    jrip.build_classifier(labor_data)
    rset = jrip.jwrapper.getRuleset()
    for i in xrange(rset.size()):
        r = rset.get(i)
        print(str(r.toString(labor_data.class_attribute.jobject)))
Example #8
0
    # progress info
    sys.stdout.write("Repetitions=" + str(repetition))
    # initialize curve
    curve = {}
    for percentage in percentages:
        curve[percentage] = 0
    curves[repetition] = curve
    # run and add up percentage correct from repetition
    for seed in xrange(repetition):
        seed += 1
        sys.stdout.write(".")
        for percentage in percentages:
            cls = Classifier(classname="weka.classifiers.trees.J48")
            flt = Filter(classname="weka.filters.unsupervised.instance.Resample",
                         options=["-Z", str(percentage), "-no-replacement"])
            fc = FilteredClassifier()
            fc.set_classifier(cls)
            fc.set_filter(flt)
            evl = Evaluation(data)
            evl.crossvalidate_model(fc, data, 10, Random(seed))
            curve[percentage] += (evl.percent_correct() / repetition)
    # progress info
    sys.stdout.write("\n")

# output the results
if not plot.matplotlib_available:
    print("ZeroR: " + str(baseline))
    for repetition in repetitions:
        y = []
        for percentage in percentages:
            y.append(curves[repetition][percentage])
Example #9
0
def main():
    """
    Just runs some example code.
    """
    #case=["selected_chi100","selected_chi150","selected_chi200","selected_chi250","selected_chi300","selected_chi350","selected_fe100","selected_fe150","selected_fe200","selected_fe250","selected_fe300","selected_fe350","selected_sfm100","selected_sfm150","selected_sfm200","selected_sfm250","selected_sfm300","selected_sfm350","selected_sfmt100","selected_sfmt150","selected_sfmt200","selected_sfmt250","selected_sfmt300","selected_sfmt350","selected_cs100","selected_cs150","selected_cs200","selected_cs250","selected_cs300","selected_cs350"]
    #case=["feature_vector100","feature_vector110","feature_vector120","feature_vector130","feature_vector140","feature_vector150","feature_vector90","feature_vector80","feature_vector70","feature_vector60","feature_vector50","feature_vector40","feature_vector30","feature_vector20","feature_vector10","feature_vector5"]
    case = [
        "selected_chi100", "selected_chi150", "selected_chi200",
        "selected_chi250", "selected_chi300", "selected_chi350",
        "selected_fe100", "selected_fe150", "selected_fe200", "selected_fe250",
        "selected_fe300", "selected_fe350", "selected_sfm100",
        "selected_sfm150", "selected_sfm200", "selected_sfm250",
        "selected_sfm300", "selected_sfm350", "selected_sfmt100",
        "selected_sfmt150", "selected_sfmt200", "selected_sfmt250",
        "selected_sfmt300", "selected_sfmt350"
    ]
    #case=["selected_chi100","selected_chi150","selected_chi200","selected_fe100","selected_fe150","selected_fe200","selected_sfm100","selected_sfm150","selected_sfm200","selected_sfmt100","selected_sfmt150","selected_sfmt200","selected_cs100","selected_cs150","selected_cs200",]

    for nomefile in case:

        print(nomefile)
        feature_vector = "./selected_vectors30/" + nomefile + ".arff"  # legge il file in formato arff

        loader = Loader("weka.core.converters.ArffLoader")
        data = loader.load_file(feature_vector)
        data.class_is_last()

        #print(data)

        f = open("./selected_vectors30/risultati/" + nomefile + ".txt",
                 "w+")  # file risultati in txt

        #intestazione excel
        intest = [
            "Correlation coefficient", "Mean absolute error",
            "Root mean squared error", "Relative absolute error",
            "Root relative squared error", "Total Number of Instances"
        ]
        workbook = xlsxwriter.Workbook("./selected_vectors30/risultati/" +
                                       nomefile + ".xlsx")  # file excel
        worksheet = workbook.add_worksheet()

        for col_num, dati in enumerate(intest):
            worksheet.write(0, col_num + 1, dati)
        riga = 1

        #lista degli algoritmi da eseguire
        #alg=["meta.Bagging","meta.RandomSubSpace","rules.M5Rules","trees.M5P","trees.RandomForest"]
        alg = [
            "bayes.NaiveBayes", "bayes.NaiveBayesUpdateable",
            "functions.Logistic", "functions.SGD", "functions.SimpleLogistic",
            "functions.SMO", "functions.VotedPerceptron", "meta.AdaBoostM1",
            "meta.AttributeSelectedClassifier", "meta.Bagging",
            "meta.ClassificationViaRegression",
            "meta.IterativeClassifierOptimizer", "meta.LogitBoost",
            "meta.RandomCommittee", "meta.RandomSubSpace",
            "rules.DecisionTable", "rules.JRip", "rules.OneR",
            "trees.DecisionStump", "trees.J48", "trees.RandomForest",
            "trees.REPTree"
        ]

        for row_num, dati in enumerate(alg):
            worksheet.write(row_num + 1, 0, dati)

        for i in alg:
            remove = Filter(
                classname="weka.filters.unsupervised.attribute.Remove")
            cls = Classifier(classname="weka.classifiers." + i)
            fc = FilteredClassifier()
            fc.filter = remove
            fc.classifier = cls

            evl = Evaluation(data)
            evl.crossvalidate_model(fc, data, 10,
                                    Random(1))  # 10 fold cross validation
            #evl.evaluate_train_test_split(fc,data,50,None,None) # 50% split cross validation

            k = evl.summary()

            #scrittura sui file
            f.write(i + "\n")
            f.write(k + "\n")
            my_list = k.split('\n')
            for col_num, dati in enumerate(my_list):
                worksheet.write(riga, col_num, dati[-10:])
            print(i)
            riga += 1
        f.close()
        workbook.close()
Example #10
0
train = converters.load_any_file("imbalanced_train.arff")
test = converters.load_any_file("imbalanced_test.arff")

train.class_is_last()
test.class_is_last()

# Minority Class is getting Sampled 5x
smote = Filter(classname="weka.filters.supervised.instance.SMOTE",
               options=["-P", "500.0"])

# Base Classifier
cls = Classifier(classname="weka.classifiers.trees.LMT",
                 options=["-B", "-I", "10"])

# Filtered Classifier
fc = FilteredClassifier()
fc.filter = smote
fc.classifier = cls

# 5 Fold K cross validation
evl = Evaluation(train)
evl.crossvalidate_model(fc, train, 5, Random(1))

# Prints Out Confusion Matrix along with other summary statistics
print("LMT (SMOTE balanced classes) CV = 5 Error: %.2f%%" %
      (evl.percent_incorrect))
print(evl.matrix())  #Confusion Matrix

# Plots ROC
plcls.plot_roc(evl, class_index=[0, 1], wait=True)
Example #11
0
    sys.stdout.write("Repetitions=" + str(repetition))
    # initialize curve
    curve = {}
    for percentage in percentages:
        curve[percentage] = 0
    curves[repetition] = curve
    # run and add up percentage correct from repetition
    for seed in xrange(repetition):
        seed += 1
        sys.stdout.write(".")
        for percentage in percentages:
            cls = Classifier(classname="weka.classifiers.trees.J48")
            flt = Filter(
                classname="weka.filters.unsupervised.instance.Resample",
                options=["-Z", str(percentage), "-no-replacement"])
            fc = FilteredClassifier()
            fc.classifier = cls
            fc.filter = flt
            evl = Evaluation(data)
            evl.crossvalidate_model(fc, data, 10, Random(seed))
            curve[percentage] += (evl.percent_correct / repetition)
    # progress info
    sys.stdout.write("\n")

# output the results
if not plot.matplotlib_available:
    print("ZeroR: " + str(baseline))
    for repetition in repetitions:
        y = []
        for percentage in percentages:
            y.append(curves[repetition][percentage])
db = client["news-scraper"]
articles_collection = db.articles
article = articles_collection.find_one({"_id": article_id})

jvm.start(system_cp=True, packages=True, max_heap_size="512m")

# Train classifier

loader = Loader(classname="weka.core.converters.ArffLoader", options=["-charset", "UTF-8"])
train_data = loader.load_file(os.path.dirname(os.path.realpath(__file__)) + "/datasets/train.arff")
train_data.class_is_last()

string_to_word_vector_filter = Filter(classname="weka.filters.unsupervised.attribute.StringToWordVector")
cls = Classifier(classname="weka.classifiers.bayes.NaiveBayesMultinomial")

fc = FilteredClassifier()
fc.filter = string_to_word_vector_filter
fc.classifier = cls

fc.build_classifier(train_data)

# Create test data

class_att = Attribute.create_nominal("class", ["good", "neutral", "bad"])
str_att = Attribute.create_string("title")

test_dataset = Instances.create_instances(
    name="test_news_set",
    atts=[str_att, class_att],
    capacity=1
)
Example #13
0
fname = data_dir + os.sep + "ReutersGrain-test.arff"
print("\nLoading dataset: " + fname + "\n")
loader = Loader(classname="weka.core.converters.ArffLoader")
test = loader.load_file(fname)
test.class_is_last()

setups = (
    ("weka.classifiers.trees.J48", []),
    ("weka.classifiers.bayes.NaiveBayes", []),
    ("weka.classifiers.bayes.NaiveBayesMultinomial", []),
    ("weka.classifiers.bayes.NaiveBayesMultinomial", ["-C"]),
    ("weka.classifiers.bayes.NaiveBayesMultinomial", ["-C", "-L", "-stopwords-handler", "weka.core.stopwords.Rainbow"])
)

# cross-validate classifiers
for setup in setups:
    classifier, opt = setup
    print("\n--> %s (filter options: %s)\n" % (classifier, " ".join(opt)))
    cls = FilteredClassifier()
    cls.classifier = Classifier(classname=classifier)
    cls.filter = Filter(classname="weka.filters.unsupervised.attribute.StringToWordVector", options=opt)
    cls.build_classifier(data)
    evl = Evaluation(test)
    evl.test_model(cls, test)
    print("Accuracy: %0.0f%%" % evl.percent_correct)
    tcdata = plc.generate_thresholdcurve_data(evl, 0)
    print("AUC: %0.3f" % plc.get_auc(tcdata))
    print(evl.matrix("Matrix:"))

jvm.stop()
Example #14
0
import matplotlib as plt

jvm.start()

loader = Loader(classname="weka.core.converters.CSVLoader")
data = loader.load_file("C:/Arpit/aps.failure_training_set.csv")
data_test = loader.load_file("C:/Arpit/aps.failure_test_set.csv")
# print(str(data))data = loader.load_file( + "aps.failure_training_set.csv")
data.class_is_last()
data_test.class_is_last()

# remove = Filter(classname="weka.filters.unsupervised.attribute.Remove", options=["-R", "1-3"])

cls = Classifier(classname="weka.classifiers.trees.LMT")

fc = FilteredClassifier()
# fc.filter = remove
fc.classifier = cls

evl = Evaluation(data)

evl.crossvalidate_model(fc, data, 10, Random(1))
preds = evl.test_model(cls, data_test)

conf = evl.confusion_matrix
print(evl.percent_incorrect)
# print(evl.summary())
# print(evl.class_details())
sns.heatmap(conf, cmap="YlGnBu", annot=True, linewidths=.5, fmt='d')

print("AUC", evl.area_under_prc)
Example #15
0
print("Train/test/predict...")

groups = ["DataSet1", "DataSet2"]
# groups = ["DataSet2"]

for group in groups:
    print(group)
    train = data_dir + os.sep + group + "_Cal.arff"
    test = data_dir + os.sep + group + "_Test.arff"
    pred = data_dir + os.sep + group + "_Val.arff"

    loader = Loader(classname="weka.core.converters.ArffLoader")
    print(train)
    train_data = loader.load_file(train)
    train_data.class_index = train_data.attribute_by_name("reference value").index
    print(test)
    test_data = loader.load_file(test)
    test_data.class_index = test_data.attribute_by_name("reference value").index
    print(pred)
    pred_data = loader.load_file(pred)
    pred_data.class_index = pred_data.attribute_by_name("reference value").index

    cls = FilteredClassifier()
    cls.classifier = Classifier(classname="weka.classifiers.functions.LinearRegression", options=["-S", "1", "-C"])
    cls.filter = Filter(classname="weka.filters.unsupervised.attribute.Remove", options=["-R", "first"])
    cls.build_classifier(train_data)
    evl = Evaluation(train_data)
    evl.test_model(cls, test_data)
    print(evl.summary())

jvm.stop()
Example #16
0
    print(group)
    train = data_dir + os.sep + group + "_Cal.arff"
    test = data_dir + os.sep + group + "_Test.arff"
    pred = data_dir + os.sep + group + "_Val.arff"

    loader = Loader(classname="weka.core.converters.ArffLoader")
    print(train)
    train_data = loader.load_file(train)
    train_data.class_index = train_data.attribute_by_name(
        "reference value").index
    print(test)
    test_data = loader.load_file(test)
    test_data.class_index = test_data.attribute_by_name(
        "reference value").index
    print(pred)
    pred_data = loader.load_file(pred)
    pred_data.class_index = pred_data.attribute_by_name(
        "reference value").index

    cls = FilteredClassifier()
    cls.classifier = Classifier(
        classname="weka.classifiers.functions.LinearRegression",
        options=["-S", "1", "-C"])
    cls.filter = Filter(classname="weka.filters.unsupervised.attribute.Remove",
                        options=["-R", "first"])
    cls.build_classifier(train_data)
    evl = Evaluation(train_data)
    evl.test_model(cls, test_data)
    print(evl.summary())

jvm.stop()

print("Error is",tevlmt.error_rate)
tcm2e = tevlmt.confusion_matrix
tcm2E = pd.DataFrame(tcm2e, index = ["neg","pos"],columns = ["neg","pos"])
plt.figure(figsize = (7,7))
axis = sns.heatmap(tcm2E, annot=True, cbar=False, cmap="Reds")
plcls.plot_roc(tevlmt,class_index=[1])


packages.install_package("SMOTE")


smote = Filter(classname="weka.filters.supervised.instance.SMOTE",options=["-P", "4800"])
smt = Classifier(classname="weka.classifiers.trees.LMT")
fc = FilteredClassifier()
fc.filter = smote
fc.classifier = smt
fc.build_classifier(Wtrain)


evsmt = Evaluation(Wtrain)
evsmt.crossvalidate_model(fc, Wtrain, 5, Random(1))

print("Error is",evsmt.error_rate)
cm2f = evsmt.confusion_matrix
cm2F = pd.DataFrame(cm2f, index = ["neg","pos"],columns = ["neg","pos"])
plt.figure(figsize = (7,7))
axis = sns.heatmap(cm2F, annot=True, cbar=False, cmap="Reds")
plcls.plot_roc(evsmt,class_index=[1])