def testNB(training_data, testing_data):

    train_data = Instances.copy_instances(training_data)
    test_data = Instances.copy_instances(testing_data)

    evaluation = Evaluation(train_data)
    classifier = Classifier(classname="weka.classifiers.bayes.NaiveBayes")
    classifier.build_classifier(
        train_data)  # build classifier on the training data
    evaluation.test_model(classifier,
                          test_data)  # test and evaluate model on the test set
    print("")
    print("")
    print(
        evaluation.summary(
            "--------------Naive Bayes Evaluation--------------"))
    print("Accuracy: " + str(evaluation.percent_correct))
    print("")
    print("Label\tPrecision\t\tRecall\t\t\tF-Measure")
    print("<=50K\t" + str(evaluation.precision(0)) + "\t" +
          str(evaluation.recall(0)) + "\t" + str(evaluation.f_measure(0)))
    print(">50K\t" + str(evaluation.precision(1)) + "\t" +
          str(evaluation.recall(1)) + "\t" + str(evaluation.f_measure(1)))
    print("Mean\t" + str(((evaluation.precision(1)) +
                          (evaluation.precision(0))) / 2) + "\t" +
          str(((evaluation.recall(1)) + (evaluation.recall(0))) / 2) + "\t" +
          str(((evaluation.f_measure(1)) + (evaluation.f_measure(0))) / 2))
Exemple #2
0
def naiveBayes(data):
	
	classifier = Classifier(classname="weka.classifiers.bayes.NaiveBayes", options=["-D"])
	nfolds=13
	rnd = Random(0)
	evaluation = Evaluation(data)
	evaluation.crossvalidate_model(classifier, data,
	nfolds, rnd)
	print(" Naive Bayes Cross-validation information")
	print(evaluation.summary())
	print("precision: " + str(evaluation.precision(1)))
	print("recall: " + str(evaluation.recall(1)))
	print("F-measure: " + str(evaluation.f_measure(1)))
	print("==confusion matrix==")
	print("     a     b")
	print(evaluation.confusion_matrix)
	print
	#write to file
	f = open("naiveeval.txt", "w")
	f.write(evaluation.summary()) 
	f.write("\n")
	f.write("==confusion matrix==\n")
	f.write("     a       b\n")
	for item in evaluation.confusion_matrix:
		f.write("%s\n" % item)
	f.close() 
	#plot roc graph
	plcls.plot_roc(evaluation, title="Naive Bayes ROC", outfile="NBROC", wait=True)
	
	return evaluation.percent_correct
Exemple #3
0
def experiment_file_random(path_features, path_folder_save_results, options,
                           classifier, fold, random, name):
    print(name + "  Start: " + str(datetime.datetime.now()))
    time = datetime.datetime.now()
    cls = Classifier(classname=classifier,
                     options=weka.core.classes.split_options(options))
    d_results = {
        'percent_correct': [],
        'percent_incorrect': [],
        'precision': [],
        'recall': [],
        'f-score': [],
        'confusion_matrix': []
    }
    data = converters.load_any_file(path_features)
    data.class_is_last()
    pout = PredictionOutput(
        classname="weka.classifiers.evaluation.output.prediction.CSV")
    evl = Evaluation(data)
    evl.crossvalidate_model(cls, data, fold, Random(random), pout)
    d_results['percent_correct'].append(evl.percent_correct)
    d_results['percent_incorrect'].append(evl.percent_incorrect)
    d_results['precision'].append(evl.precision(1))
    d_results['recall'].append(evl.recall(1))
    d_results['f-score'].append(evl.f_measure(1))
    d_results['confusion_matrix'].append(
        evl.matrix())  # Generates the confusion matrix.

    d_results = pd.DataFrame(data=d_results)

    d_results.to_csv(path_folder_save_results + '/' + str(name) + '.csv',
                     index=False)

    save = pout.buffer_content()

    check_folder_or_create(path_folder_save_results + '/' + 'prediction')

    with open(
            path_folder_save_results + '/' + 'prediction/' + str(name) +
            '.csv', 'w') as f:
        f.write(save)
    print(name + "  End: " + str(datetime.datetime.now() - time))
def main():
    """
    Just runs some example code.
    """

    # load a dataset
    iris_file = helper.get_data_dir() + os.sep + "iris.arff"
    helper.print_info("Loading dataset: " + iris_file)
    loader = Loader("weka.core.converters.ArffLoader")
    iris_data = loader.load_file(iris_file)
    iris_data.class_is_last()

    # classifier help
    helper.print_title("Creating help string")
    classifier = Classifier(classname="weka.classifiers.trees.J48")
    print(classifier.to_help())

    # partial classname
    helper.print_title("Creating classifier from partial classname")
    clsname = ".J48"
    classifier = Classifier(classname=clsname)
    print(clsname + " --> " + classifier.classname)

    # classifier from commandline
    helper.print_title("Creating SMO from command-line string")
    cmdline = 'weka.classifiers.functions.SMO -K "weka.classifiers.functions.supportVector.NormalizedPolyKernel -E 3.0"'
    classifier = from_commandline(cmdline,
                                  classname="weka.classifiers.Classifier")
    classifier.build_classifier(iris_data)
    print("input: " + cmdline)
    print("output: " + classifier.to_commandline())
    print("model:\n" + str(classifier))

    # kernel classifier
    helper.print_title("Creating SMO as KernelClassifier")
    kernel = Kernel(
        classname="weka.classifiers.functions.supportVector.RBFKernel",
        options=["-G", "0.001"])
    classifier = KernelClassifier(classname="weka.classifiers.functions.SMO",
                                  options=["-M"])
    classifier.kernel = kernel
    classifier.build_classifier(iris_data)
    print("classifier: " + classifier.to_commandline())
    print("model:\n" + str(classifier))

    # build a classifier and output model
    helper.print_title("Training J48 classifier on iris")
    classifier = Classifier(classname="weka.classifiers.trees.J48")
    # Instead of using 'options=["-C", "0.3"]' in the constructor, we can also set the "confidenceFactor"
    # property of the J48 classifier itself. However, being of type float rather than double, we need
    # to convert it to the correct type first using the double_to_float function:
    classifier.set_property("confidenceFactor", types.double_to_float(0.3))
    classifier.build_classifier(iris_data)
    print(classifier)
    print(classifier.graph)
    print(classifier.to_source("MyJ48"))
    plot_graph.plot_dot_graph(classifier.graph)

    # evaluate model on test set
    helper.print_title("Evaluating J48 classifier on iris")
    evaluation = Evaluation(iris_data)
    evl = evaluation.test_model(classifier, iris_data)
    print(evl)
    print(evaluation.summary())

    # evaluate model on train/test split
    helper.print_title("Evaluating J48 classifier on iris (random split 66%)")
    classifier = Classifier(classname="weka.classifiers.trees.J48",
                            options=["-C", "0.3"])
    evaluation = Evaluation(iris_data)
    evaluation.evaluate_train_test_split(classifier, iris_data, 66.0,
                                         Random(1))
    print(evaluation.summary())

    # load a dataset incrementally and build classifier incrementally
    helper.print_title("Build classifier incrementally on iris")
    helper.print_info("Loading dataset: " + iris_file)
    loader = Loader("weka.core.converters.ArffLoader")
    iris_inc = loader.load_file(iris_file, incremental=True)
    iris_inc.class_is_last()
    classifier = Classifier(
        classname="weka.classifiers.bayes.NaiveBayesUpdateable")
    classifier.build_classifier(iris_inc)
    for inst in loader:
        classifier.update_classifier(inst)
    print(classifier)

    # construct meta-classifiers
    helper.print_title("Meta classifiers")
    # generic FilteredClassifier instantiation
    print("generic FilteredClassifier instantiation")
    meta = SingleClassifierEnhancer(
        classname="weka.classifiers.meta.FilteredClassifier")
    meta.classifier = Classifier(
        classname="weka.classifiers.functions.LinearRegression")
    flter = Filter("weka.filters.unsupervised.attribute.Remove")
    flter.options = ["-R", "first"]
    meta.set_property("filter", flter.jobject)
    print(meta.to_commandline())
    # direct FilteredClassifier instantiation
    print("direct FilteredClassifier instantiation")
    meta = FilteredClassifier()
    meta.classifier = Classifier(
        classname="weka.classifiers.functions.LinearRegression")
    flter = Filter("weka.filters.unsupervised.attribute.Remove")
    flter.options = ["-R", "first"]
    meta.filter = flter
    print(meta.to_commandline())
    # generic Vote
    print("generic Vote instantiation")
    meta = MultipleClassifiersCombiner(classname="weka.classifiers.meta.Vote")
    classifiers = [
        Classifier(classname="weka.classifiers.functions.SMO"),
        Classifier(classname="weka.classifiers.trees.J48")
    ]
    meta.classifiers = classifiers
    print(meta.to_commandline())

    # cross-validate nominal classifier
    helper.print_title("Cross-validating NaiveBayes on diabetes")
    diabetes_file = helper.get_data_dir() + os.sep + "diabetes.arff"
    helper.print_info("Loading dataset: " + diabetes_file)
    loader = Loader("weka.core.converters.ArffLoader")
    diabetes_data = loader.load_file(diabetes_file)
    diabetes_data.class_is_last()
    classifier = Classifier(classname="weka.classifiers.bayes.NaiveBayes")
    pred_output = PredictionOutput(
        classname="weka.classifiers.evaluation.output.prediction.PlainText",
        options=["-distribution"])
    evaluation = Evaluation(diabetes_data)
    evaluation.crossvalidate_model(classifier,
                                   diabetes_data,
                                   10,
                                   Random(42),
                                   output=pred_output)
    print(evaluation.summary())
    print(evaluation.class_details())
    print(evaluation.matrix())
    print("areaUnderPRC/0: " + str(evaluation.area_under_prc(0)))
    print("weightedAreaUnderPRC: " + str(evaluation.weighted_area_under_prc))
    print("areaUnderROC/1: " + str(evaluation.area_under_roc(1)))
    print("weightedAreaUnderROC: " + str(evaluation.weighted_area_under_roc))
    print("avgCost: " + str(evaluation.avg_cost))
    print("totalCost: " + str(evaluation.total_cost))
    print("confusionMatrix: " + str(evaluation.confusion_matrix))
    print("correct: " + str(evaluation.correct))
    print("pctCorrect: " + str(evaluation.percent_correct))
    print("incorrect: " + str(evaluation.incorrect))
    print("pctIncorrect: " + str(evaluation.percent_incorrect))
    print("unclassified: " + str(evaluation.unclassified))
    print("pctUnclassified: " + str(evaluation.percent_unclassified))
    print("coverageOfTestCasesByPredictedRegions: " +
          str(evaluation.coverage_of_test_cases_by_predicted_regions))
    print("sizeOfPredictedRegions: " +
          str(evaluation.size_of_predicted_regions))
    print("falseNegativeRate: " + str(evaluation.false_negative_rate(1)))
    print("weightedFalseNegativeRate: " +
          str(evaluation.weighted_false_negative_rate))
    print("numFalseNegatives: " + str(evaluation.num_false_negatives(1)))
    print("trueNegativeRate: " + str(evaluation.true_negative_rate(1)))
    print("weightedTrueNegativeRate: " +
          str(evaluation.weighted_true_negative_rate))
    print("numTrueNegatives: " + str(evaluation.num_true_negatives(1)))
    print("falsePositiveRate: " + str(evaluation.false_positive_rate(1)))
    print("weightedFalsePositiveRate: " +
          str(evaluation.weighted_false_positive_rate))
    print("numFalsePositives: " + str(evaluation.num_false_positives(1)))
    print("truePositiveRate: " + str(evaluation.true_positive_rate(1)))
    print("weightedTruePositiveRate: " +
          str(evaluation.weighted_true_positive_rate))
    print("numTruePositives: " + str(evaluation.num_true_positives(1)))
    print("fMeasure: " + str(evaluation.f_measure(1)))
    print("weightedFMeasure: " + str(evaluation.weighted_f_measure))
    print("unweightedMacroFmeasure: " +
          str(evaluation.unweighted_macro_f_measure))
    print("unweightedMicroFmeasure: " +
          str(evaluation.unweighted_micro_f_measure))
    print("precision: " + str(evaluation.precision(1)))
    print("weightedPrecision: " + str(evaluation.weighted_precision))
    print("recall: " + str(evaluation.recall(1)))
    print("weightedRecall: " + str(evaluation.weighted_recall))
    print("kappa: " + str(evaluation.kappa))
    print("KBInformation: " + str(evaluation.kb_information))
    print("KBMeanInformation: " + str(evaluation.kb_mean_information))
    print("KBRelativeInformation: " + str(evaluation.kb_relative_information))
    print("SFEntropyGain: " + str(evaluation.sf_entropy_gain))
    print("SFMeanEntropyGain: " + str(evaluation.sf_mean_entropy_gain))
    print("SFMeanPriorEntropy: " + str(evaluation.sf_mean_prior_entropy))
    print("SFMeanSchemeEntropy: " + str(evaluation.sf_mean_scheme_entropy))
    print("matthewsCorrelationCoefficient: " +
          str(evaluation.matthews_correlation_coefficient(1)))
    print("weightedMatthewsCorrelation: " +
          str(evaluation.weighted_matthews_correlation))
    print("class priors: " + str(evaluation.class_priors))
    print("numInstances: " + str(evaluation.num_instances))
    print("meanAbsoluteError: " + str(evaluation.mean_absolute_error))
    print("meanPriorAbsoluteError: " +
          str(evaluation.mean_prior_absolute_error))
    print("relativeAbsoluteError: " + str(evaluation.relative_absolute_error))
    print("rootMeanSquaredError: " + str(evaluation.root_mean_squared_error))
    print("rootMeanPriorSquaredError: " +
          str(evaluation.root_mean_prior_squared_error))
    print("rootRelativeSquaredError: " +
          str(evaluation.root_relative_squared_error))
    print("prediction output:\n" + str(pred_output))
    plot_cls.plot_roc(evaluation,
                      title="ROC diabetes",
                      class_index=range(
                          0, diabetes_data.class_attribute.num_values),
                      wait=False)
    plot_cls.plot_prc(evaluation,
                      title="PRC diabetes",
                      class_index=range(
                          0, diabetes_data.class_attribute.num_values),
                      wait=False)

    # train 2nd classifier on diabetes dataset
    classifier2 = Classifier(classname="weka.classifiers.trees.RandomForest")
    evaluation2 = Evaluation(diabetes_data)
    evaluation2.crossvalidate_model(classifier2, diabetes_data, 10, Random(42))
    plot_cls.plot_rocs({
        "NB": evaluation,
        "RF": evaluation2
    },
                       title="ROC diabetes",
                       class_index=0,
                       wait=False)
    plot_cls.plot_prcs({
        "NB": evaluation,
        "RF": evaluation2
    },
                       title="PRC diabetes",
                       class_index=0,
                       wait=False)

    # load a numeric dataset
    bolts_file = helper.get_data_dir() + os.sep + "bolts.arff"
    helper.print_info("Loading dataset: " + bolts_file)
    loader = Loader("weka.core.converters.ArffLoader")
    bolts_data = loader.load_file(bolts_file)
    bolts_data.class_is_last()

    # build a classifier and output model
    helper.print_title("Training LinearRegression on bolts")
    classifier = Classifier(
        classname="weka.classifiers.functions.LinearRegression",
        options=["-S", "1", "-C"])
    classifier.build_classifier(bolts_data)
    print(classifier)

    # cross-validate numeric classifier
    helper.print_title("Cross-validating LinearRegression on bolts")
    classifier = Classifier(
        classname="weka.classifiers.functions.LinearRegression",
        options=["-S", "1", "-C"])
    evaluation = Evaluation(bolts_data)
    evaluation.crossvalidate_model(classifier, bolts_data, 10, Random(42))
    print(evaluation.summary())
    print("correlationCoefficient: " + str(evaluation.correlation_coefficient))
    print("errorRate: " + str(evaluation.error_rate))
    helper.print_title("Header - bolts")
    print(str(evaluation.header))
    helper.print_title("Predictions on bolts")
    for index, pred in enumerate(evaluation.predictions):
        print(
            str(index + 1) + ": " + str(pred) + " -> error=" + str(pred.error))
    plot_cls.plot_classifier_errors(evaluation.predictions, wait=False)

    # train 2nd classifier and show errors in same plot
    classifier2 = Classifier(classname="weka.classifiers.functions.SMOreg")
    evaluation2 = Evaluation(bolts_data)
    evaluation2.crossvalidate_model(classifier2, bolts_data, 10, Random(42))
    plot_cls.plot_classifier_errors(
        {
            "LR": evaluation.predictions,
            "SMOreg": evaluation2.predictions
        },
        wait=False)

    # learning curve
    cls = [
        Classifier(classname="weka.classifiers.trees.J48"),
        Classifier(classname="weka.classifiers.bayes.NaiveBayesUpdateable")
    ]
    plot_cls.plot_learning_curve(cls,
                                 diabetes_data,
                                 increments=0.05,
                                 label_template="[#] !",
                                 metric="percent_correct",
                                 wait=True)

    # access classifier's Java API
    labor_file = helper.get_data_dir() + os.sep + "labor.arff"
    helper.print_info("Loading dataset: " + labor_file)
    loader = Loader("weka.core.converters.ArffLoader")
    labor_data = loader.load_file(labor_file)
    labor_data.class_is_last()

    helper.print_title("Using JRip's Java API to access rules")
    jrip = Classifier(classname="weka.classifiers.rules.JRip")
    jrip.build_classifier(labor_data)
    rset = jrip.jwrapper.getRuleset()
    for i in xrange(rset.size()):
        r = rset.get(i)
        print(str(r.toString(labor_data.class_attribute.jobject)))
classifier = Classifier(classname="weka.classifiers.bayes.NaiveBayes")
pred_output = PredictionOutput(classname="weka.classifiers.evaluation.output.prediction.PlainText", options=["-distribution"])
evaluation = Evaluation(data)
evaluation.crossvalidate_model(classifier, data, 10, Random(42), output=pred_output)
plot_cls.plot_roc(evaluation, title="ROC bugs",class_index=range(0, data.class_attribute.num_values), wait=False)
plot_cls.plot_prc(evaluation, title="PRC bugs - NaiveBayes",class_index=range(0, data.class_attribute.num_values), wait=False)

"""Performance Metrics - Naive Bayes Classifier"""

print(evaluation.summary())
print(evaluation.class_details())
print(evaluation.matrix())

print("confusionMatrix: " + str(evaluation.confusion_matrix))
print("fMeasure: " + str(evaluation.f_measure(1)))
print("precision: " + str(evaluation.precision(1)))
print("recall: " + str(evaluation.recall(1)))

"""Random Forest Classifier"""

classifier2 = Classifier(classname="weka.classifiers.trees.RandomForest")
evaluation2 = Evaluation(data)
evaluation2.crossvalidate_model(classifier2, data, 10, Random(42))
plot_cls.plot_roc(evaluation2, title="ROC bugs",class_index=range(0, data.class_attribute.num_values), wait=False)
plot_cls.plot_prc(evaluation2, title="PRC bugs - RandomForest",class_index=range(0, data.class_attribute.num_values), wait=False)

"""Performance Evaluation Metrics - Random Forest"""

print(evaluation2.summary())
print(evaluation2.class_details())
def main():
    """
    Just runs some example code.
    """

    # load a dataset
    iris_file = helper.get_data_dir() + os.sep + "iris.arff"
    helper.print_info("Loading dataset: " + iris_file)
    loader = Loader("weka.core.converters.ArffLoader")
    iris_data = loader.load_file(iris_file)
    iris_data.class_is_last()

    # classifier help
    helper.print_title("Creating help string")
    classifier = Classifier(classname="weka.classifiers.trees.J48")
    print(classifier.to_help())

    # partial classname
    helper.print_title("Creating classifier from partial classname")
    clsname = ".J48"
    classifier = Classifier(classname=clsname)
    print(clsname + " --> " + classifier.classname)

    # classifier from commandline
    helper.print_title("Creating SMO from command-line string")
    cmdline = 'weka.classifiers.functions.SMO -K "weka.classifiers.functions.supportVector.NormalizedPolyKernel -E 3.0"'
    classifier = from_commandline(cmdline, classname="weka.classifiers.Classifier")
    classifier.build_classifier(iris_data)
    print("input: " + cmdline)
    print("output: " + classifier.to_commandline())
    print("model:\n" + str(classifier))

    # kernel classifier
    helper.print_title("Creating SMO as KernelClassifier")
    kernel = Kernel(classname="weka.classifiers.functions.supportVector.RBFKernel", options=["-G", "0.001"])
    classifier = KernelClassifier(classname="weka.classifiers.functions.SMO", options=["-M"])
    classifier.kernel = kernel
    classifier.build_classifier(iris_data)
    print("classifier: " + classifier.to_commandline())
    print("model:\n" + str(classifier))

    # build a classifier and output model
    helper.print_title("Training J48 classifier on iris")
    classifier = Classifier(classname="weka.classifiers.trees.J48")
    # Instead of using 'options=["-C", "0.3"]' in the constructor, we can also set the "confidenceFactor"
    # property of the J48 classifier itself. However, being of type float rather than double, we need
    # to convert it to the correct type first using the double_to_float function:
    classifier.set_property("confidenceFactor", typeconv.double_to_float(0.3))
    classifier.build_classifier(iris_data)
    print(classifier)
    print(classifier.graph)
    print(classifier.to_source("MyJ48"))
    plot_graph.plot_dot_graph(classifier.graph)

    # evaluate model on test set
    helper.print_title("Evaluating J48 classifier on iris")
    evaluation = Evaluation(iris_data)
    evl = evaluation.test_model(classifier, iris_data)
    print(evl)
    print(evaluation.summary())

    # evaluate model on train/test split
    helper.print_title("Evaluating J48 classifier on iris (random split 66%)")
    classifier = Classifier(classname="weka.classifiers.trees.J48", options=["-C", "0.3"])
    evaluation = Evaluation(iris_data)
    evaluation.evaluate_train_test_split(classifier, iris_data, 66.0, Random(1))
    print(evaluation.summary())

    # load a dataset incrementally and build classifier incrementally
    helper.print_title("Build classifier incrementally on iris")
    helper.print_info("Loading dataset: " + iris_file)
    loader = Loader("weka.core.converters.ArffLoader")
    iris_inc = loader.load_file(iris_file, incremental=True)
    iris_inc.class_is_last()
    classifier = Classifier(classname="weka.classifiers.bayes.NaiveBayesUpdateable")
    classifier.build_classifier(iris_inc)
    for inst in loader:
        classifier.update_classifier(inst)
    print(classifier)

    # construct meta-classifiers
    helper.print_title("Meta classifiers")
    # generic FilteredClassifier instantiation
    print("generic FilteredClassifier instantiation")
    meta = SingleClassifierEnhancer(classname="weka.classifiers.meta.FilteredClassifier")
    meta.classifier = Classifier(classname="weka.classifiers.functions.LinearRegression")
    flter = Filter("weka.filters.unsupervised.attribute.Remove")
    flter.options = ["-R", "first"]
    meta.set_property("filter", flter.jobject)
    print(meta.to_commandline())
    # direct FilteredClassifier instantiation
    print("direct FilteredClassifier instantiation")
    meta = FilteredClassifier()
    meta.classifier = Classifier(classname="weka.classifiers.functions.LinearRegression")
    flter = Filter("weka.filters.unsupervised.attribute.Remove")
    flter.options = ["-R", "first"]
    meta.filter = flter
    print(meta.to_commandline())
    # generic Vote
    print("generic Vote instantiation")
    meta = MultipleClassifiersCombiner(classname="weka.classifiers.meta.Vote")
    classifiers = [
        Classifier(classname="weka.classifiers.functions.SMO"),
        Classifier(classname="weka.classifiers.trees.J48")
    ]
    meta.classifiers = classifiers
    print(meta.to_commandline())

    # cross-validate nominal classifier
    helper.print_title("Cross-validating NaiveBayes on diabetes")
    diabetes_file = helper.get_data_dir() + os.sep + "diabetes.arff"
    helper.print_info("Loading dataset: " + diabetes_file)
    loader = Loader("weka.core.converters.ArffLoader")
    diabetes_data = loader.load_file(diabetes_file)
    diabetes_data.class_is_last()
    classifier = Classifier(classname="weka.classifiers.bayes.NaiveBayes")
    pred_output = PredictionOutput(
        classname="weka.classifiers.evaluation.output.prediction.PlainText", options=["-distribution"])
    evaluation = Evaluation(diabetes_data)
    evaluation.crossvalidate_model(classifier, diabetes_data, 10, Random(42), output=pred_output)
    print(evaluation.summary())
    print(evaluation.class_details())
    print(evaluation.matrix())
    print("areaUnderPRC/0: " + str(evaluation.area_under_prc(0)))
    print("weightedAreaUnderPRC: " + str(evaluation.weighted_area_under_prc))
    print("areaUnderROC/1: " + str(evaluation.area_under_roc(1)))
    print("weightedAreaUnderROC: " + str(evaluation.weighted_area_under_roc))
    print("avgCost: " + str(evaluation.avg_cost))
    print("totalCost: " + str(evaluation.total_cost))
    print("confusionMatrix: " + str(evaluation.confusion_matrix))
    print("correct: " + str(evaluation.correct))
    print("pctCorrect: " + str(evaluation.percent_correct))
    print("incorrect: " + str(evaluation.incorrect))
    print("pctIncorrect: " + str(evaluation.percent_incorrect))
    print("unclassified: " + str(evaluation.unclassified))
    print("pctUnclassified: " + str(evaluation.percent_unclassified))
    print("coverageOfTestCasesByPredictedRegions: " + str(evaluation.coverage_of_test_cases_by_predicted_regions))
    print("sizeOfPredictedRegions: " + str(evaluation.size_of_predicted_regions))
    print("falseNegativeRate: " + str(evaluation.false_negative_rate(1)))
    print("weightedFalseNegativeRate: " + str(evaluation.weighted_false_negative_rate))
    print("numFalseNegatives: " + str(evaluation.num_false_negatives(1)))
    print("trueNegativeRate: " + str(evaluation.true_negative_rate(1)))
    print("weightedTrueNegativeRate: " + str(evaluation.weighted_true_negative_rate))
    print("numTrueNegatives: " + str(evaluation.num_true_negatives(1)))
    print("falsePositiveRate: " + str(evaluation.false_positive_rate(1)))
    print("weightedFalsePositiveRate: " + str(evaluation.weighted_false_positive_rate))
    print("numFalsePositives: " + str(evaluation.num_false_positives(1)))
    print("truePositiveRate: " + str(evaluation.true_positive_rate(1)))
    print("weightedTruePositiveRate: " + str(evaluation.weighted_true_positive_rate))
    print("numTruePositives: " + str(evaluation.num_true_positives(1)))
    print("fMeasure: " + str(evaluation.f_measure(1)))
    print("weightedFMeasure: " + str(evaluation.weighted_f_measure))
    print("unweightedMacroFmeasure: " + str(evaluation.unweighted_macro_f_measure))
    print("unweightedMicroFmeasure: " + str(evaluation.unweighted_micro_f_measure))
    print("precision: " + str(evaluation.precision(1)))
    print("weightedPrecision: " + str(evaluation.weighted_precision))
    print("recall: " + str(evaluation.recall(1)))
    print("weightedRecall: " + str(evaluation.weighted_recall))
    print("kappa: " + str(evaluation.kappa))
    print("KBInformation: " + str(evaluation.kb_information))
    print("KBMeanInformation: " + str(evaluation.kb_mean_information))
    print("KBRelativeInformation: " + str(evaluation.kb_relative_information))
    print("SFEntropyGain: " + str(evaluation.sf_entropy_gain))
    print("SFMeanEntropyGain: " + str(evaluation.sf_mean_entropy_gain))
    print("SFMeanPriorEntropy: " + str(evaluation.sf_mean_prior_entropy))
    print("SFMeanSchemeEntropy: " + str(evaluation.sf_mean_scheme_entropy))
    print("matthewsCorrelationCoefficient: " + str(evaluation.matthews_correlation_coefficient(1)))
    print("weightedMatthewsCorrelation: " + str(evaluation.weighted_matthews_correlation))
    print("class priors: " + str(evaluation.class_priors))
    print("numInstances: " + str(evaluation.num_instances))
    print("meanAbsoluteError: " + str(evaluation.mean_absolute_error))
    print("meanPriorAbsoluteError: " + str(evaluation.mean_prior_absolute_error))
    print("relativeAbsoluteError: " + str(evaluation.relative_absolute_error))
    print("rootMeanSquaredError: " + str(evaluation.root_mean_squared_error))
    print("rootMeanPriorSquaredError: " + str(evaluation.root_mean_prior_squared_error))
    print("rootRelativeSquaredError: " + str(evaluation.root_relative_squared_error))
    print("prediction output:\n" + str(pred_output))
    plot_cls.plot_roc(
        evaluation, title="ROC diabetes",
        class_index=range(0, diabetes_data.class_attribute.num_values), wait=False)
    plot_cls.plot_prc(
        evaluation, title="PRC diabetes",
        class_index=range(0, diabetes_data.class_attribute.num_values), wait=False)

    # load a numeric dataset
    bolts_file = helper.get_data_dir() + os.sep + "bolts.arff"
    helper.print_info("Loading dataset: " + bolts_file)
    loader = Loader("weka.core.converters.ArffLoader")
    bolts_data = loader.load_file(bolts_file)
    bolts_data.class_is_last()

    # build a classifier and output model
    helper.print_title("Training LinearRegression on bolts")
    classifier = Classifier(classname="weka.classifiers.functions.LinearRegression", options=["-S", "1", "-C"])
    classifier.build_classifier(bolts_data)
    print(classifier)

    # cross-validate numeric classifier
    helper.print_title("Cross-validating LinearRegression on bolts")
    classifier = Classifier(classname="weka.classifiers.functions.LinearRegression", options=["-S", "1", "-C"])
    evaluation = Evaluation(bolts_data)
    evaluation.crossvalidate_model(classifier, bolts_data, 10, Random(42))
    print(evaluation.summary())
    print("correlationCoefficient: " + str(evaluation.correlation_coefficient))
    print("errorRate: " + str(evaluation.error_rate))
    helper.print_title("Header - bolts")
    print(str(evaluation.header))
    helper.print_title("Predictions on bolts")
    for index, pred in enumerate(evaluation.predictions):
        print(str(index+1) + ": " + str(pred) + " -> error=" + str(pred.error))
    plot_cls.plot_classifier_errors(evaluation.predictions, wait=False)

    # learning curve
    cls = [
        Classifier(classname="weka.classifiers.trees.J48"),
        Classifier(classname="weka.classifiers.bayes.NaiveBayesUpdateable")]
    plot_cls.plot_learning_curve(
        cls, diabetes_data, increments=0.05, label_template="[#] !", metric="percent_correct", wait=True)

    # access classifier's Java API
    labor_file = helper.get_data_dir() + os.sep + "labor.arff"
    helper.print_info("Loading dataset: " + labor_file)
    loader = Loader("weka.core.converters.ArffLoader")
    labor_data = loader.load_file(labor_file)
    labor_data.class_is_last()

    helper.print_title("Using JRip's Java API to access rules")
    jrip = Classifier(classname="weka.classifiers.rules.JRip")
    jrip.build_classifier(labor_data)
    rset = jrip.jwrapper.getRuleset()
    for i in range(rset.size()):
        r = rset.get(i)
        print(str(r.toString(labor_data.class_attribute.jobject)))
def classify_and_save(classifier, name, outfile):
    random.seed("ML349")

    csv_header = [
                    "Game Name",
                    "SteamID",
                    "Algorithm",
                    "Number Players",
                    "%Players of Training Set",
                    "Accuracy",
                    "Precision (0)",
                    "Recall (0)",
                    "F1 (0)",
                    "Precision (1)",
                    "Recall (1)",
                    "F1 (1)"
    ]
    game_results = []

    with open("data/games_by_username_all.csv", "r") as f:
        game_list = f.next().rstrip().split(",")

    loader = Loader(classname="weka.core.converters.ArffLoader")
    train = loader.load_file("data/final_train.arff")
    test = loader.load_file("data/final_test.arff")

    count = 0
    for i in itertools.chain(xrange(0, 50), random.sample(xrange(50, len(game_list)), 450)):
        train.class_index = i
        test.class_index = i
        count += 1

        classifier.build_classifier(train)

        evaluation = Evaluation(train)
        evaluation.test_model(classifier, test)

        confusion = evaluation.confusion_matrix
        num_players = sum(confusion[1])
        steam_id = repr(train.class_attribute).split(" ")[1]
        result = [
                    game_list[i],
                    steam_id,
                    name,
                    int(num_players),
                    num_players/1955,
                    evaluation.percent_correct,
                    evaluation.precision(0),
                    evaluation.recall(0),
                    evaluation.f_measure(0),
                    evaluation.precision(1),
                    evaluation.recall(1),
                    evaluation.f_measure(1)
        ]

        game_results.append(result)
        print "\nResult #{2}/500 for {0} (SteamID {1}):".format(game_list[i], steam_id, count),
        print evaluation.summary()

    with open(outfile, "wb") as f:
        csv_writer = csv.writer(f, delimiter=",")
        csv_writer.writerow(csv_header)
        for r in game_results:
            csv_writer.writerow(r)
Exemple #8
0
def experiment_sequential_file(path_indices, path_features,
                               path_folder_save_results, options, classifier,
                               name, indicator_col, images):
    ind_f = load(path_indices)
    lst = ind_f.files

    for item in lst:
        ind = ind_f[item] + 1

    cls = Classifier(classname=classifier,
                     options=weka.core.classes.split_options(options))

    data = converters.load_any_file(path_features)

    ind = np.append(ind, len(data))

    data.class_is_last()

    pout = PredictionOutput(
        classname="weka.classifiers.evaluation.output.prediction.CSV")

    d_results = {
        'index': [],
        'percent_correct': [],
        'percent_incorrect': [],
        'precision': [],
        'recall': [],
        'f-score': [],
        'confusion_matrix': []
    }

    for j in range(len(ind) - 1):
        first = ind[j]

        if j == len(ind) - 2:
            last = ind[j + 1]
        else:
            last = ind[j + 1] - 1

        d_test = data.subset(row_range=str(first) + '-' + str(last))

        if j == 0:  # first
            d_train = data.subset(row_range=str(last + 1) + '-' +
                                  str(ind[-1]))  # last element
            print(str(last + 1) + '-' + str(ind[-1]))
        elif j == len(ind) - 2:  # last
            d_train = data.subset(row_range='1-' +
                                  str(first - 1))  # last element
            print('1-' + str(first - 1))
        else:  # central
            s = '1-' + str(first - 1) + ',' + str(last + 1) + '-' + str(
                ind[-1])
            print(s)
            d_train = data.subset(row_range=s)

        cls.build_classifier(d_train)

        evl = Evaluation(data)
        evl.test_model(cls, d_test, pout)

        # print(type(d_train))
        # print(type(d_test))

        d_results['index'].append(str(ind[j]))
        d_results['percent_correct'].append(evl.percent_correct)
        d_results['percent_incorrect'].append(evl.percent_incorrect)
        d_results['precision'].append(evl.precision(1))
        d_results['recall'].append(evl.recall(1))
        d_results['f-score'].append(evl.f_measure(1))
        d_results['confusion_matrix'].append(
            evl.matrix())  # Generates the confusion matrix.

    save = pout.buffer_content()

    check_folder_or_create(path_folder_save_results + '/' + 'prediction')

    with open(
            path_folder_save_results + '/' + 'prediction/' + name +
            'pred_data.csv', 'w') as f:
        f.write(save)

    buffer_save = pd.read_csv(path_folder_save_results + '/' + 'prediction/' +
                              name + 'pred_data.csv',
                              index_col=False,
                              header=None)

    col_label = buffer_save[1]
    col_prediction = buffer_save[2]
    col_different = buffer_save[3]

    create_prediction(col_label, col_prediction, col_different, indicator_col,
                      images, name, path_folder_save_results + '/prediction/')

    d_results = pd.DataFrame(data=d_results)

    d_results.to_csv(path_folder_save_results + '/' + name + 'results.csv',
                     index=False)
Exemple #9
0
def experiment_more_file(path_files,
                         path_folder_save_results,
                         fold,
                         options,
                         classifier,
                         random,
                         name,
                         voting=False):
    cls = Classifier(classname=classifier,
                     options=weka.core.classes.split_options(options))

    file_list = os.listdir(path_files)

    for file in file_list:
        if ".csv" not in file:
            file_list.remove(file)

    d_results = {
        'name_file': [],
        'percent_correct': [],
        'percent_incorrect': [],
        'precision': [],
        'recall': [],
        'f-score': [],
        'confusion_matrix': []
    }

    for file in file_list:
        indicator_table = pd.read_csv(path_files + '/indicator/' + file[0] +
                                      '_indicator.csv')
        indicator = list(indicator_table['indicator'])
        images = list(indicator_table['image'])

        data = converters.load_any_file(path_files + "/" + file)

        data.class_is_last()

        pout = PredictionOutput(
            classname="weka.classifiers.evaluation.output.prediction.CSV")

        evl = Evaluation(data)

        evl.crossvalidate_model(cls, data, fold, Random(random), pout)

        d_results['name_file'].append(str(file))
        d_results['percent_correct'].append(evl.percent_correct)
        d_results['percent_incorrect'].append(evl.percent_incorrect)
        d_results['precision'].append(evl.precision(1))
        d_results['recall'].append(evl.recall(1))
        d_results['f-score'].append(evl.f_measure(1))
        d_results['confusion_matrix'].append(
            evl.matrix())  # Generates the confusion matrix.

        save = pout.buffer_content()

        check_folder_or_create(path_folder_save_results + '/' + name + '/' +
                               'prediction')

        with open(
                path_folder_save_results + '/' + name + '/' +
                'prediction/pred_data.csv', 'w') as f:
            f.write(save)

        buffer_save = pd.read_csv(path_folder_save_results + '/' + name + '/' +
                                  'prediction/pred_data.csv',
                                  index_col=False)

        col_label = buffer_save['actual']
        col_prediction = buffer_save['predicted']
        col_different = buffer_save['error']

        create_prediction(
            col_label, col_prediction, col_different, indicator, images,
            file[:-4], path_folder_save_results + '/' + name + '/prediction/')

    d_results = pd.DataFrame(data=d_results)

    d_results.to_csv(path_folder_save_results + '/' + str(name) + ".csv")