def main():
    """
    Shows how to use the CostSensitiveClassifier.
    """

    # load a dataset
    data_file = helper.get_data_dir() + os.sep + "diabetes.arff"
    helper.print_info("Loading dataset: " + data_file)
    loader = Loader("weka.core.converters.ArffLoader")
    data = loader.load_file(data_file)
    data.class_is_last()

    # classifier
    classifier = SingleClassifierEnhancer(
        classname="weka.classifiers.meta.CostSensitiveClassifier",
        options=["-cost-matrix", "[0 1; 2 0]", "-S", "2"])
    base = Classifier(classname="weka.classifiers.trees.J48", options=["-C", "0.3"])
    classifier.classifier = base

    folds = 10
    evaluation = Evaluation(data)
    evaluation.crossvalidate_model(classifier, data, folds, Random(1))


    print("")
    print("=== Setup ===")
    print("Classifier: " + classifier.to_commandline())
    print("Dataset: " + data.relationname)
    print("")
    print(evaluation.summary("=== " + str(folds) + "-fold Cross-Validation ==="))
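The snippets in this listing generally omit their imports and JVM handling; a minimal sketch of the shared setup they assume (python-weka-wrapper3, whose JVM must be started once per process; the data file path is illustrative):

# Minimal setup assumed by the examples below.
import weka.core.jvm as jvm
from weka.core.converters import Loader
from weka.core.classes import Random
from weka.classifiers import Classifier, Evaluation, SingleClassifierEnhancer

jvm.start()
try:
    loader = Loader(classname="weka.core.converters.ArffLoader")
    data = loader.load_file("diabetes.arff")
    data.class_is_last()
    # ... run one of the examples ...
finally:
    jvm.stop()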
Example #2
def naiveBayes(data):
	
	classifier = Classifier(classname="weka.classifiers.bayes.NaiveBayes", options=["-D"])
	nfolds=13
	rnd = Random(0)
	evaluation = Evaluation(data)
	evaluation.crossvalidate_model(classifier, data, nfolds, rnd)
	print(" Naive Bayes Cross-validation information")
	print(evaluation.summary())
	print("precision: " + str(evaluation.precision(1)))
	print("recall: " + str(evaluation.recall(1)))
	print("F-measure: " + str(evaluation.f_measure(1)))
	print("==confusion matrix==")
	print("     a     b")
	print(evaluation.confusion_matrix)
	print()
	#write to file
	f = open("naiveeval.txt", "w")
	f.write(evaluation.summary()) 
	f.write("\n")
	f.write("==confusion matrix==\n")
	f.write("     a       b\n")
	for item in evaluation.confusion_matrix:
		f.write("%s\n" % item)
	f.close() 
	#plot roc graph
	plcls.plot_roc(evaluation, title="Naive Bayes ROC", outfile="NBROC", wait=True)
	
	return evaluation.percent_correct
Example #3
def vote_classifier_train(directory, nameOfDataSet, flag):
    loader = Loader(classname="weka.core.converters.CSVLoader")
    data = loader.load_file(directory)
    data.class_is_last()
    meta = MultipleClassifiersCombiner(
        classname="weka.classifiers.meta.Vote",
        options=[
            '-S', '1', '-B', 'weka.classifiers.trees.J48 -C 0.25 -M 2', '-B',
            'weka.classifiers.trees.RandomTree -K 6 -M 1.0 -V 0.001 -S 1',
            '-B',
            'weka.classifiers.meta.Bagging -P 100 -S 1 -num-slots 1 -I 10 -W weka.classifiers.trees.REPTree -- '
            '-M 2 -V 0.001 -N 3 -S 1 -L -1 -I 0.0', '-B',
            'weka.classifiers.meta.AdaBoostM1 -P 100 -S 1 -I 10 -W weka.classifiers.trees.DecisionStump',
            '-B',
            'weka.classifiers.meta.Bagging -P 100 -S 1 -num-slots 1 -I 10 -W weka.classifiers.trees.REPTree -- '
            '-M 2 -V 0.001 -N 3 -S 1 -L -1 -I 0.0', '-B',
            'weka.classifiers.bayes.NaiveBayes ', '-R', 'AVG'
        ])
    eval = Evaluation(data)
    pout = PredictionOutput(
        classname="weka.classifiers.evaluation.output.prediction.PlainText")
    if flag:
        eval.crossvalidate_model(meta, data, 10, Random(1), pout)
    else:
        eval.evaluate_train_test_split(meta, data, 80.0, Random(1), pout)
    gc.collect()
    print_and_save('Proposed model', flag, nameOfDataSet, eval)
def experiment_file_random(path_features, path_folder_save_results, options,
                           classifier, fold, random, name):
    print("start weka")
    cls = Classifier(classname=classifier,
                     options=weka.core.classes.split_options(options))
    d_results = {
        'percent_correct': [],
        'percent_incorrect': [],
        'confusion_matrix': []
    }
    data = converters.load_any_file(path_features)
    data.class_is_last()
    pout = PredictionOutput(
        classname="weka.classifiers.evaluation.output.prediction.CSV")
    evl = Evaluation(data)
    evl.crossvalidate_model(cls, data, fold, Random(random), pout)
    d_results['percent_correct'].append(evl.percent_correct)
    d_results['percent_incorrect'].append(evl.percent_incorrect)
    d_results['confusion_matrix'].append(
        evl.matrix())  # Generates the confusion matrix.

    d_results = pd.DataFrame(data=d_results)

    d_results.to_csv(path_folder_save_results + '/' + str(name) + '.csv',
                     index=False)

    save = pout.buffer_content()

    with open(
            path_folder_save_results + '/' + 'prediction/' + str(name) +
            '.csv', 'w') as f:
        f.write(save)
Example #7
    def evaluation(self, classifier, trainingData, testingData=None):
        trainingData.set_class_index(trainingData.num_attributes() - 1)
        if testingData == None:
            evaluation = Evaluation(trainingData)
            # initialize with priors
            evaluation.crossvalidate_model(classifier, trainingData, 10,
                                           Random(42))  # 10-fold CV
            return evaluation
        else:
            print("testing data exists")
            if testingData.num_attributes() == trainingData.num_attributes():
                testingData.set_class_index(testingData.num_attributes() - 1)
                evaluation = Evaluation(trainingData)

                classifier.build_classifier(trainingData)
                evaluation.test_model(classifier, testingData)

                #for attribute in trainingData.attributes():
                #    print "train:" + str(attribute)
                #for attribute in testingData.attributes():
                #    print "test:" + str(attribute)

                return evaluation
            else:
                print("testing data doesn't have the same attributes as the training data")
                for attribute in trainingData.attributes():
                    print("train:" + str(attribute))
                for attribute in testingData.attributes():
                    print("test:" + str(attribute))
def naivebay_classifier_weka(data):
    classifier = Classifier("weka.classifiers.bayes.NaiveBayes")
    evaluation = Evaluation(data)
    evaluation.crossvalidate_model(classifier, data, 10, Random(42))
    print(evaluation.summary())
    print(evaluation.confusion_matrix)
    return classifier
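Note that crossvalidate_model evaluates fold-wise copies of the classifier, so the classifier returned above is itself still untrained; if a usable model is also needed, build it explicitly, e.g.:

# Sketch: cross-validate for the report, then train the model on the full dataset.
classifier = naivebay_classifier_weka(data)
classifier.build_classifier(data)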
def Boost_J48(data, rnm):
    data.class_is_last()
    fc1 = FilteredClassifier()
    fc1.classifier = Classifier(classname="weka.classifiers.trees.J48", options=["-C", "0.25", "-M", "2"])
    fc1.filter = Filter(classname="weka.filters.unsupervised.attribute.Remove", options=["-R", "first"])
    fc2 = SingleClassifierEnhancer(classname="weka.classifiers.meta.AdaBoostM1", options=["-P", "100", "-S", "1", "-I", "10"])
    fc2.classifier = fc1
    pred_output = PredictionOutput(classname="weka.classifiers.evaluation.output.prediction.CSV", options=["-p", "1"])
    folds = 10
    fc2.build_classifier(data)
    evaluation = Evaluation(data)
    evaluation.crossvalidate_model(fc2, data, folds, Random(1), pred_output)
    f0 = open(rnm + '_Boost_J48_Tree.txt', 'w')
    print("Filename: ", rnm, file=f0)
    print('\n\n', file=f0)
    print(str(fc2), file=f0)
    f0.close()
    f1 = open(rnm + '_Boost_J48_Prediction.txt', 'w')
    print('Filename:', rnm, file=f1)
    print('Prediction Summary:', pred_output.buffer_content(), file=f1)
    f1.close()
    f2 = open(rnm + '_Boost_j48_Evaluation.txt', 'w')
    print('Filename:', rnm, file=f2)
    print('Evaluation Summary:', evaluation.summary(), file=f2)
    print('\n\n\n', file=f2)
    print(evaluation.class_details(), file=f2)
    f2.close()
    plot_roc(evaluation, class_index=[0,1], title=rnm, key_loc='best', outfile=rnm + '_Boost_J48_ROC.png', wait=False)
    value_Boost_J48 = str(evaluation.percent_correct)
    return value_Boost_J48
Example #10
def run():
    jvm.start()
    load_csv = Loader("weka.core.converters.CSVLoader")
    data_csv = load_csv.load_file(
        "/Users/imeiliasantoso/web_graduate_project4/predict_page/predict_data.csv"
    )

    saver = Saver("weka.core.converters.ArffSaver")
    saver.save_file(
        data_csv,
        "/Users/imeiliasantoso/web_graduate_project4/predict_page/predict_data.arff"
    )

    load_arff = Loader("weka.core.converters.ArffLoader")
    data_arff = load_arff.load_file(
        "/Users/imeiliasantoso/web_graduate_project4/predict_page/predict_data.arff"
    )
    data_arff.class_is_last()

    global j48
    J48_class = Classifier(classname="weka.classifiers.trees.J48",
                           options=["-C", "0.25", "-M", "2"])
    J48_class.build_classifier(data_arff)
    evaluationj48 = Evaluation(data_arff)
    evaluationj48.crossvalidate_model(J48_class, data_arff, 10, Random(100))
    j48 = str(evaluationj48.percent_correct)
    jvm.stop()
    return j48
Example #11
def test_classifier(dataset: Instances, classifier: Classifier, params: dict):
    vars = params.keys()
    vals = params.values()

    results = defaultdict(list)

    for val_combo in itertools.product(*vals):
        results["numInstances"].append(dataset.num_instances)
        results["numAttributes"].append(dataset.num_attributes)
        opts = dict(zip(vars, val_combo))

        for opt in opts:
            results[opt].append(opts[opt])
            classifier.set_property(
                opt, opts[opt] if not isinstance(opts[opt], float) else
                typeconv.double_to_float(opts[opt]))

        evl = Evaluation(dataset)
        classifier.build_classifier(dataset)
        evl.test_model(classifier, dataset)
        results["Training_Accuracy"].append(evl.percent_correct)
        results["size"].append(
            int(javabridge.call(classifier.jobject, "measureTreeSize", "()D")))
        evl.crossvalidate_model(classifier, dataset, 10, Random(1))
        results["CV_Accuracy"].append(evl.percent_correct)

    return results
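A hypothetical call to the grid search above, assuming a loaded dataset and a J48 classifier (confidenceFactor and minNumObj are J48 bean properties; other classifiers need their own property names and value types):

# Illustrative parameter grid; float values are routed through typeconv.double_to_float above.
j48 = Classifier(classname="weka.classifiers.trees.J48")
grid = {"confidenceFactor": [0.1, 0.25, 0.5], "minNumObj": [2, 5, 10]}
results = test_classifier(data, j48, grid)
print(results["CV_Accuracy"])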
def RandomTree(data, rnm):
    data.class_is_last()
    fc = FilteredClassifier()
    fc.classifier = Classifier(classname="weka.classifiers.trees.RandomTree", options=["-K", "0", "-M", "1.0", "-V", "0.001", "-S", "1"])
    fc.filter = Filter(classname="weka.filters.unsupervised.attribute.Remove", options=["-R", "first"])
    pred_output = PredictionOutput(classname="weka.classifiers.evaluation.output.prediction.CSV", options=["-p", "1"])
    folds = 10
    evl = Evaluation(data)
    evl.crossvalidate_model(fc, data, folds, Random(1), pred_output)
    fc.build_classifier(data)
    f0 = open(rnm + '_RT_Tree.txt', 'w')
    print("Filename: ", rnm, file=f0)
    print('\n\n', file=f0)
    print(str(fc), file=f0)
    f0.close()
    f1 = open(rnm + '_RT_Prediction.txt', 'w')
    print('Filename:', rnm, file=f1)
    print('Prediction Summary:', pred_output.buffer_content(), file=f1)
    f1.close()
    f2 = open(rnm + '_RT_Evaluation.txt', 'w')
    print('Filename:', rnm, file=f2)
    print('Evaluation Summary:', evl.summary(), file=f2)
    print('\n\n\n', file=f2)
    print(evl.class_details(), file=f2)
    f2.close()
    plot_roc(evl, class_index=[0,1], title=rnm, key_loc='best', outfile=rnm+'_RT_ROC.png', wait=False)
    value_RT = str(evl.percent_correct)
    return value_RT
Example #13
def runSMO(file, bound):
    loader = Loader(classname="weka.core.converters.CSVLoader")
    data = loader.load_file(file)
    data.class_is_first()

    remove = Filter(classname="weka.filters.unsupervised.attribute.Remove",
                    options=["-R", bound])

    cls = KernelClassifier(
        classname="weka.classifiers.functions.SMO",
        options=["-C", "1.0", "-L", "0.001", "-P", "1.0E-12", "-N", "0"])
    kernel = Kernel(
        classname="weka.classifiers.functions.supportVector.PolyKernel",
        options=["-C", "250007", "-E", "1.0"])
    cls.kernel = kernel
    pout = PredictionOutput(
        classname="weka.classifiers.evaluation.output.prediction.PlainText")

    remove.inputformat(data)
    filtered = remove.filter(data)

    evl = Evaluation(filtered)
    evl.crossvalidate_model(cls, filtered, 10, Random(1), pout)

    #print(pout.buffer_content())

    print(evl.percent_correct)
    #print(evl.summary())

    result = evl.class_details()
    print(result)
    return result
Example #14
def create_model(input_file, output_file):
    # Load data
    data = converters.load_any_file(input_file)
    data.class_is_last()  # set class attribute

    # filter data
    print_title("Filtering Data")
    discretize = Filter(
        classname="weka.filters.unsupervised.attribute.Discretize",
        options=["-B", "10", "-M", "-1.0", "-R", "first-last"])
    discretize.inputformat(
        data)  # let the filter know about the type of data to filter
    filtered_data = discretize.filter(data)
    print("Done! (believe it or not)")

    print_title("Build Classifier")
    classifier = Classifier(classname="weka.classifiers.trees.RandomForest",
                            options=["-I", "100", "-K", "0", "-S", "1"])
    classifier.build_classifier(filtered_data)
    print("Done! (believe it or not)")
    serialization.write_all(output_file, [classifier, discretize])
    print("Model and filter saved to ", output_file)

    evaluation = Evaluation(data)  # initialize with priors
    evaluation.crossvalidate_model(classifier, filtered_data, 10,
                                   Random(42))  # 10-fold CV
    print(evaluation.summary())
    print("pctCorrect: " + str(evaluation.percent_correct))
    print("incorrect: " + str(evaluation.incorrect))
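Restoring the serialized objects mirrors the write_all call above; a short sketch, where the file name and new_data are placeholders:

# Sketch: reload the classifier and filter saved by create_model and score new instances.
import weka.core.serialization as serialization
from weka.classifiers import Classifier
from weka.filters import Filter

objects = serialization.read_all("model.ser")
classifier = Classifier(jobject=objects[0])
discretize = Filter(jobject=objects[1])
discretize.inputformat(new_data)  # new_data: Instances with the same structure as the training data
for inst in discretize.filter(new_data):
    print(classifier.classify_instance(inst))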
Example #15
    def run_naive_bayes_crossval(self, output_directory):
        # build classifier
        print("\nBuilding Classifier on training data.")
        buildTimeStart = time.time()
        cls = Classifier(classname="weka.classifiers.bayes.NaiveBayes")
        cls.build_classifier(self.training_data)

        resultsString = ""
        resultsString = self.print_both(str(cls), resultsString)

        buildTimeString = "NB Cross Eval Classifier Built in " + str(
            time.time() - buildTimeStart) + " secs.\n"
        resultsString = self.print_both(buildTimeString, resultsString)

        #Evaluate Classifier
        resultsString = self.print_both("\nCross-validating on training data.",
                                        resultsString)

        buildTimeStart = time.time()
        evl = Evaluation(self.training_data)
        evl.crossvalidate_model(cls, self.training_data, 10, Random(1))

        resultsString = self.print_both(str(evl.summary()), resultsString)
        resultsString = self.print_both(str(evl.class_details()),
                                        resultsString)
        resultsString = self.print_both(str(evl.confusion_matrix),
                                        resultsString)
        buildTimeString = "\nNB Cross Eval Classifier Evaluated in " + str(
            time.time() - buildTimeStart) + " secs.\n"
        resultsString = self.print_both(buildTimeString, resultsString)

        #Save Results and Cleanup
        self.save_results("Naive_Bayes_Crossval", resultsString,
                          output_directory)
def proses():  # outside the def: index = 0
    import math
    from weka.classifiers import Kernel, KernelClassifier
    from weka.classifiers import PredictionOutput
    import numpy as np
    klasifi = KernelClassifier(classname="weka.classifiers.functions.SMOreg",
                               options=["-N", "0"])
    vm = Kernel(classname="weka.classifiers.functions.supportVector.RBFKernel",
                options=["-G", "0.1"])
    klasifi.kernel = vm
    output_x = PredictionOutput(
        classname="weka.classifiers.evaluation.output.prediction.PlainText")
    kelola = Evaluation(anomali)
    kelola.crossvalidate_model(klasifi,
                               anomali,
                               10,
                               Random(0),
                               output=output_x)
    process = 0
    for x in anomali.values(anomali.class_index):
        data_inst.append(x)
    for x in kelola.predictions:
        i = str(x)
        index = i.split()
        data_pred.append(float(index[2]))
    data_std.insert(idx, math.ceil(np.std(data_inst)) * 0.1)
    print('\n DONE PROCESSING DATASET ATTRIBUTE ',
          anomali.attribute(anomali.class_index), '...')
Example #17
def run(dataset_path):
    start = time.time()

    ### load a dataset ###
    train_data = model.load_dataset_weka(dataset_path)  #
    to_nomial_class_filter = Filter(
        classname="weka.filters.unsupervised.attribute.NumericToNominal",
        options=["-R", "last"])
    to_nomial_class_filter.inputformat(train_data)

    ###  Naive Bayes ### Choose what you want
    classifier = Classifier("weka.classifiers.bayes.NaiveBayesMultinomial")
    # classifier = Classifier("weka.classifiers.bayes.NaiveBayes")
    # classifier.build_classifer(train_data)
    evaluation = Evaluation(to_nomial_class_filter.filter(train_data))
    evaluation.crossvalidate_model(classifier,
                                   to_nomial_class_filter.filter(train_data),
                                   10, Random(42))
    # print(evaluation.summary())
    # print(evaluation.class_details())
    # print(evaluation.matrix())

    # ###  Naive Bayes ###
    # mlp = Classifier("weka.classifiers.bayes.Naive Bayes")
    # mlp.build_classifer(train_file_5EMO)

    print(time.time() - start)
def use_classifier(data, cli, args):
    cli = cli.format(cli, **args)
    cls = from_commandline(cli, classname="weka.classifiers.Classifier")
    cls.build_classifier(data)
    evaluation = Evaluation(data)
    evaluation.crossvalidate_model(cls, data, 10, Random(1))
    return cls, evaluation
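A hypothetical invocation of the helper above, given a loaded dataset data; the command-line template and placeholder names are illustrative:

# The named placeholders are filled in by str.format before from_commandline parses the string.
cmdline = "weka.classifiers.trees.J48 -C {confidence} -M {min_obj}"
cls, evl = use_classifier(data, cmdline, {"confidence": 0.25, "min_obj": 2})
print(evl.summary())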
Example #19
    def runCV(this, arffFile, classifier, folds):

        loader = Loader(classname="weka.core.converters.ArffLoader")
        data = loader.load_file(arffFile)
        data.class_is_last()

        classes = [str(code) for code in data.class_attribute.values]
        header = ["Accuracy"]
        for name in classes:
            header += [name + " TP", name + " FP", name + " AUC ROC"]
        values = []

        cls = Classifier(classname=classifier)

        evl = Evaluation(data)
        evl.crossvalidate_model(cls, data, folds, Random(1))

        values.append(evl.percent_correct)
        for name in classes:
            index = classes.index(name)
            values += [
                evl.true_positive_rate(index) * 100,
                evl.false_positive_rate(index) * 100,
                evl.area_under_roc(index)
            ]

        this.values = values
        this.header = header
Example #20
def fitness(toeval : Individual):
    cls = Classifier(classname="weka.classifiers.functions.MultilayerPerceptron", options=toeval.settings())
    fc = FilteredClassifier()
    fc.filter = remove
    fc.classifier = cls
    evl = Evaluation(data)
    evl.crossvalidate_model(fc, data, 10, Random(1))
    return evl.percent_correct
def f_smote():
    jvm.start()

    train_data, test_data = b_i_impute_data()

    train_data = train_data[:10000]
    y_train = train_data["class"]
    x_train = train_data.drop("class", axis=1)

    sm = SMOTE(ratio="minority")
    x_train_sm, y_train_sm = sm.fit_sample(x_train, y_train)

    x_train_sm_df = pd.DataFrame(x_train_sm, columns=x_train.columns)
    y_train_sm_df = pd.DataFrame(y_train_sm, columns=["class"])
    train_data_sm_df = pd.concat([y_train_sm_df, x_train_sm_df], axis=1)
    print_f("smote train data shape", train_data_sm_df.shape)
    train_data_sm_df.to_csv("./train_data_sm.csv", index=False)

    train_data_sm = converters.load_any_file("train_data_sm.csv")
    train_data_sm.class_is_first()

    test_data = converters.load_any_file("test_data.csv")
    test_data.class_is_first()

    print_f("1")
    cls = Classifier(classname="weka.classifiers.trees.LMT")
    print_f("bulding classifier")
    cls.build_classifier(train_data_sm)
    print_f("Evaluating")
    evl = Evaluation(train_data_sm)

    evl.crossvalidate_model(cls, train_data_sm, 5, Random(1))
    print_f("Train Accuracy:", evl.percent_correct)
    print_f("Train summary")
    print_f(evl.summary())
    print_f("Train class details")
    print_f(evl.class_details())
    print_f("Train confusion matrix")
    print_f(evl.confusion_matrix)
    plcls.plot_roc(evl,
                   class_index=[0, 1],
                   wait=True,
                   outfile="./plots/2_f_smote_10k.png")
    plt.suptitle("Train ROC Curve", fontsize=20, y=0.95)

    evl = Evaluation(test_data)
    print_f("testing model")
    evl.test_model(cls, test_data)
    print_f("Test Accuracy:", evl.percent_correct)
    print_f("Test summary")
    print_f(evl.summary())
    print_f(" Testclass details")
    print_f(evl.class_details())
    print_f("Testconfusion matrix")
    print_f(evl.confusion_matrix)
    plcls.plot_roc(evl, class_index=[0, 1], wait=True)
    plt.suptitle("Test ROC Curve", fontsize=20, y=0.95)
    savefig("./plots/f_test_roc_curve.png")
def use_classifier(data_filename, cli):
    loader = Loader(classname="weka.core.converters.ArffLoader")
    data = loader.load_file(data_filename)
    data.class_is_last()
    cls = from_commandline(cli, classname="weka.classifiers.Classifier")
    cls.build_classifier(data)
    evaluation = Evaluation(data)
    evaluation.crossvalidate_model(cls, data, 10, Random(1))
    return cls, evaluation
Example #23
def run_bayesNet(file):
    # Get filename from Pathlib object
    filename = file.parts[-1]
    dir = file.parents[0]

    print("Running BayesNet on %s" % filename)

    if not filename.endswith(".arff"):
        print("%s not ARFF file." % filename)
        return

    # Removes '.arff' from filename
    filename_base = filename[:-5]

    # Load data with class as first attr
    data = load_Arff_file(file)
    data.class_is_first()

    # Use BayesNet and set options
    cls = Classifier(classname="weka.classifiers.bayes.BayesNet",
                     options=[
                         "-D", "-Q",
                         "weka.classifiers.bayes.net.search.local.TAN", "--",
                         "-P", "1", "-S", "BAYES", "-E",
                         "weka.classifiers.bayes.net.estimate.SimpleEstimator",
                         "--", "-A", "0.5"
                     ])

    # Predictions stored in pout
    pout = PredictionOutput(
        classname="weka.classifiers.evaluation.output.prediction.PlainText")

    # Evaluate data
    evaluation = Evaluation(data)
    evaluation.crossvalidate_model(cls, data, 10, Random(1), output=pout)

    print(evaluation.summary())
    print(evaluation.class_details())
    print(evaluation.confusion_matrix)

    # Generate grid for ROC
    # plcls.plot_roc(evaluation, class_index=[0,1], wait=True)

    # mk dirs for output
    dir = dir / "bayesNet_results"
    dir.mkdir(parents=True, exist_ok=True)

    # Save summary, class details and confusion matrix to file
    result_output = filename_base + "_bayesNet_eval_results_TAN.txt"
    output_eval(evaluation, dir / result_output)

    # Save the predicited results to file
    prediction_output = filename_base + "_bayesNet_pred_results_TAN.txt"
    output_pred(pout, dir / prediction_output)

    print("BayesNet complete")
def experiment_more_file(path_files, path_folder_save_results, fold, options,
                         classifier, random, name):
    cls = Classifier(classname=classifier,
                     options=weka.core.classes.split_options(options))

    # keep only the CSV files; filter into a new list instead of removing while iterating
    file_list = [file for file in os.listdir(path_files) if ".csv" in file]

    d_results = {
        'name_file': [],
        'percent_correct': [],
        'percent_incorrect': [],
        'confusion_matrix': []
    }

    print(file_list)

    for file in file_list:
        print(str(file))
        data = converters.load_any_file(path_files + "/" + file)

        data.class_is_last()

        pout = PredictionOutput(
            classname="weka.classifiers.evaluation.output.prediction.CSV")

        evl = Evaluation(data)

        evl.crossvalidate_model(cls, data, fold, Random(random), pout)

        d_results['name_file'].append(str(file))
        d_results['percent_correct'].append(evl.percent_correct)
        d_results['percent_incorrect'].append(evl.percent_incorrect)
        d_results['confusion_matrix'].append(
            evl.matrix())  # Generates the confusion matrix.

        save = pout.buffer_content()

        with open(
                path_folder_save_results + '/' + 'prediction/' + str(name) +
                str(file)[:-4] + 'pred_data.csv', 'w') as f:
            f.write(save)

    d_results = pd.DataFrame(data=d_results)

    d_results.to_csv(path_folder_save_results + '/' + str(name) + ".csv",
                     index=False)
Example #25
def naive_bayse(directory, nameOfDataSet, flag):
    loader = Loader(classname="weka.core.converters.CSVLoader")
    data = loader.load_file(directory)
    data.class_is_last()
    cls = Classifier(classname='weka.classifiers.bayes.NaiveBayes')
    eval = Evaluation(data)
    pout = PredictionOutput(
        classname="weka.classifiers.evaluation.output.prediction.PlainText")
    if flag:
        eval.crossvalidate_model(cls, data, 10, Random(1), pout)
    else:
        eval.evaluate_train_test_split(cls, data, 80.0, Random(1), pout)
    print_and_save('Naive Bayes model', flag, nameOfDataSet, eval)
    gc.collect()
Example #26
def CV5x2(dataset,  algo, num_datasets):

	loader = Loader(classname="weka.core.converters.ArffLoader")
	data = loader.load_file(dataset)
	data.class_is_last()

	cls = Classifier(classname=algo)

	evl = Evaluation(data)
	evl.crossvalidate_model(cls, data, 2, Random(5))

	print(evl.summary("=== " + str(algo) + " on " + str(dataset) + " ===", False))
	print(evl.matrix("=== on click prediction (confusion matrix) ==="))
	print("For algo " + str(algo) + ", areaUnderROC/1 for CV5x2: " + str(evl.area_under_roc(1)))

	return evl.area_under_roc(1)
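As written, the function performs a single 2-fold run with seed 5; a full 5x2 cross-validation would repeat 2-fold CV over five seeds and average, e.g. (a sketch reusing the cls and data loaded above):

# Sketch of 5x2 CV: five 2-fold runs with different seeds, averaging the AUC for class index 1.
aucs = []
for seed in range(1, 6):
    evl = Evaluation(data)
    evl.crossvalidate_model(cls, data, 2, Random(seed))
    aucs.append(evl.area_under_roc(1))
print(sum(aucs) / len(aucs))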
Example #27
def evaluate_classifier(cls, data, crossvalidate=False, n_folds=10):
    """
    Evaluation
    :param cls: trained classifier
    :param data: data to test the model on
    :param crossvalidate: True to use crossvalidation
    :param n_folds: number of folds to cross validate for
    :return: evaluation object
    """
    evl = Evaluation(data)
    if crossvalidate:
        evl.crossvalidate_model(cls, data, n_folds, Random(5))
    else:
        evl.test_model(cls, data)

    return evl
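For example (assuming cls has already been built on a training set train and test is a held-out Instances object):

evl_cv = evaluate_classifier(cls, train, crossvalidate=True, n_folds=5)  # 5-fold CV on the training data
evl_holdout = evaluate_classifier(cls, test)                             # evaluate the trained model on the test set
print(evl_cv.percent_correct, evl_holdout.percent_correct)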
Example #28
def obtainSVM(file):
    data = converters.load_any_file(folderPathOfArffFiles + file + ".arff")
    remove = Filter(classname="weka.filters.unsupervised.attribute.Remove",
                    options=["-R", "1-2"])
    remove.inputformat(data)
    data = remove.filter(data)
    data.class_is_last()

    classifier = Classifier(classname="weka.classifiers.functions.LibSVM")
    evaluation = Evaluation(data)
    evaluation.crossvalidate_model(classifier, data, kFold, Random(42))

    info = evaluation.class_details()
    roc_area = float(info[406:411])

    return roc_area
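Slicing class_details() at fixed character offsets is brittle; the ROC area is also exposed numerically on the Evaluation object, e.g. (a sketch, assuming the weighted average is what is wanted):

roc_area = evaluation.weighted_area_under_roc  # weighted over all classes
# roc_area = evaluation.area_under_roc(1)      # or for a single class index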
    def execute(self,featureInclusion, kFold, classIndex):
        deletedFeatures = 0
        for i in range(0,len(featureInclusion)):
            if featureInclusion[i] == False:
                self.instances.deleteAttributeAt( i - deletedFeatures)
                deletedFeatures += 1

        self.instances.setClassIndex(classIndex)

        cvParameterSelection = javabridge.make_instance("Lweka/classifiers/meta/CVParameterSelection","()V")
        javabridge.call(cvParameterSelection, "setNumFolds", "(I)V", kFold)
        javabridge.call(cvParameterSelection, "buildClassifier", "(Lweka/core/Instances;)V", self.instances)

        eval = Evaluation(self.instances)
        eval.crossvalidate_model(cvParameterSelection, self.instances, kFold, Random(1))

        return eval.percent_correct()
    def crossValidate(self, arrfFile = None, classname="weka.classifiers.trees.J48", options=["-C", "0.3"]):
        
        if arrfFile is not None:
            self.initData( arrfFile )
            
        if self.data is None:
            return 

        print('Classifier ' + str(classname) + ' ' + ' '.join(options))
        cls = Classifier(classname=classname, options=options)
        
        evl = Evaluation(self.data)
        evl.crossvalidate_model(cls, self.data, 10, Random(1))

        print(evl.percent_correct)
        print(evl.summary())
        print(evl.class_details())
Example #31
def SimpleLogistic():
    # load a dataset
    loader = Loader(classname="weka.core.converters.ArffLoader")
    data = loader.load_file("First_trial_classification.arff")
    data.class_is_last()  # set class attribute

    cls = Classifier(classname="weka.classifiers.functions.SimpleLogistic")
    pout = PredictionOutput(
        classname="weka.classifiers.evaluation.output.prediction.PlainText")
    evl = Evaluation(data)
    evl.crossvalidate_model(cls, data, 10, Random(486), pout)

    print(evl.summary())
    print(pout.buffer_content())

    # save model
    serialization.write_all("SimpleLogistic2.model", cls)
    def cross_validate(self, detail=True):
        """Perform cross-validation on the training data.
        
        Parameters
        ----------
        detail : boolean, optional, default = True
            If true return a detailed information of cross validation.
            
        Returns
        -------
        info : string
            Info with results of cross validation.
        """
        
        #print 'cross_validation'
        
        start_time = TimeUtils.get_time()
        
        info =  "Scheme:\t%s %s\n" % (str(self.classifier.classname) , " ".join([str(option) for option in self.classifier.options]))
        
        if detail == True:
            info += "Relation:\t%s\n" % (self.data.relationname)
            info += "Instances:\t%d\n" % (self.data.num_instances)
            info += "Attributes:\t%d\n\n" % (self.data.num_attributes)
        
        evl = WEvaluation(self.data)
        evl.crossvalidate_model(self.classifier, self.data, 10, WRandom(1))
        
        if detail == False:
            info += "Correctly Classified Instances: %0.4f%%\n" % (evl.percent_correct)

        info += "Time taken to build model: %0.5f seconds\n\n" % (TimeUtils.get_time() - start_time)
        #info += str(evl.percent_correct) + "\n\n"
        
        if detail == True:
            info += "=== Stratified cross-validation ===\n"
            info += evl.summary() + "\n\n"
            
            info += str(evl.class_details()) + "\n\n"
            
            classes = [str(self.data.class_attribute.value(i)) for i in range(0, self.data.class_attribute.num_values)]
            cm = evl.confusion_matrix
            info += Classifier.confusion_matrix(classes, cm)

        return info
Example #33
    def run_crossval(self, output_directory, classifier_name,
                     classifier_weka_spec, options_list):
        # build classifier
        print("\nBuilding " + classifier_name +
              " Classifier on training data.")
        buildTimeStart = time.time()
        cls = Classifier(classname=classifier_weka_spec, options=options_list)
        cls.build_classifier(self.training_data)

        resultsString = ""
        resultsString = self.print_both(str(cls), resultsString)

        buildTimeString = classifier_name + " Cross Eval Classifier Built in " + str(
            time.time() - buildTimeStart) + " secs.\n"
        resultsString = self.print_both(buildTimeString, resultsString)

        #Evaluate Classifier
        resultsString = self.print_both("\nCross-validating on training data.",
                                        resultsString)

        buildTimeStart = time.time()
        evl = Evaluation(self.training_data)
        evl.crossvalidate_model(cls, self.training_data, 10, Random(1))

        resultsString = self.print_both(str(evl.summary()), resultsString)
        resultsString += "\n"
        resultsString = self.print_both(str(evl.class_details()),
                                        resultsString)
        resultsString += "\n"
        resultsString = self.print_both(str(evl.confusion_matrix),
                                        resultsString)
        buildTimeString = "\n\n" + classifier_name + " Cross Eval Classifier Evaluated in " + str(
            time.time() - buildTimeStart) + " secs.\n"
        resultsString = self.print_both(buildTimeString, resultsString)

        options_string = ""
        for option in options_list:
            options_string = options_string + str(option)

        options_string = options_string.replace(".", "-")
        options_string = options_string.replace("-", "_")
        #Save Results and Cleanup
        self.save_results(classifier_name + options_string + "_Crossval",
                          resultsString, output_directory)
def use_classifier(data):
    """
    Uses the meta-classifier AttributeSelectedClassifier for attribute selection.
    :param data: the dataset to use
    :type data: Instances
    """
    print("\n1. Meta-classifier")
    classifier = Classifier(classname="weka.classifiers.meta.AttributeSelectedClassifier")
    aseval = ASEvaluation(classname="weka.attributeSelection.CfsSubsetEval")
    assearch = ASSearch(classname="weka.attributeSelection.GreedyStepwise", options=["-B"])
    base = Classifier(classname="weka.classifiers.trees.J48")
    # setting nested options is always a bit tricky, getting all the escaped double quotes right
    # simply using the bean property for setting Java objects is often easier and less error prone
    classifier.set_property("classifier", base.jobject)
    classifier.set_property("evaluator", aseval.jobject)
    classifier.set_property("search", assearch.jobject)
    evaluation = Evaluation(data)
    evaluation.crossvalidate_model(classifier, data, 10, Random(1))
    print(evaluation.summary())
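For comparison, the command-line route the comment warns about would configure the same meta-classifier through its -E/-S/-W options, with the nested search options kept inside one quoted string (a sketch):

classifier = Classifier(
    classname="weka.classifiers.meta.AttributeSelectedClassifier",
    options=["-E", "weka.attributeSelection.CfsSubsetEval",
             "-S", "weka.attributeSelection.GreedyStepwise -B",
             "-W", "weka.classifiers.trees.J48"])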
Example #36
def run():
    jvm.start()
    load_csv = Loader("weka.core.converters.CSVLoader")
    data_csv = load_csv.load_file(
        "/Users/imeiliasantoso/web_graduate_project5/register_page/bank-full_input.csv"
    )

    saver = Saver("weka.core.converters.ArffSaver")
    saver.save_file(
        data_csv,
        "/Users/imeiliasantoso/web_graduate_project5/register_page/bank-full_input.arff"
    )

    load_arff = Loader("weka.core.converters.ArffLoader")
    data_arff = load_arff.load_file(
        "/Users/imeiliasantoso/web_graduate_project5/register_page/bank-full_input.arff"
    )
    data_arff.class_is_last()

    cls = Classifier(classname="weka.classifiers.trees.J48",
                     options=["-C", "0.5"])
    cls.build_classifier(data_arff)
    for index, inst in enumerate(data_arff):
        pred = cls.classify_instance(inst)
        dist = cls.distribution_for_instance(inst)
        # save tree prune in txt file

    saveFile = open(
        "/Users/imeiliasantoso/web_graduate_project5/register_page/bank-full_input.txt",
        "w")
    saveFile.write(str(cls))
    # print(cls)
    saveFile.close()

    global j48
    J48_class = Classifier(classname="weka.classifiers.trees.J48",
                           options=["-C", "0.25", "-M", "2"])
    J48_class.build_classifier(data_arff)
    evaluationj48 = Evaluation(data_arff)
    evaluationj48.crossvalidate_model(J48_class, data_arff, 10, Random(100))
    j48 = str(evaluationj48.percent_correct)
    jvm.stop()
    return j48
Example #37
print("Baseline accuracy (ZeroR): %0.1f%%" % evl.percent_correct())

print("\nHoldout 10%...")
# use seed 1-10 and perform random split with 90%
perc = []
for i in range(1, 11):
    evl = Evaluation(data)
    evl.evaluate_train_test_split(
        Classifier(classname="weka.classifiers.trees.J48"), data, 90.0, Random(i))
    perc.append(round(evl.percent_correct(), 1))
    print("Accuracy with seed %i: %0.1f%%" % (i, evl.percent_correct()))

# calculate mean and standard deviation
nperc = numpy.array(perc)
print("mean=%0.2f stdev=%0.2f" % (numpy.mean(nperc), numpy.std(nperc)))

print("\n10-fold Cross-validation...")
# use seed 1-10 and perform 10-fold CV
perc = []
for i in range(1, 11):
    evl = Evaluation(data)
    evl.crossvalidate_model(Classifier(classname="weka.classifiers.trees.J48"), data, 10, Random(i))
    perc.append(round(evl.percent_correct(), 1))
    print("Accuracy with seed %i: %0.1f%%" % (i, evl.percent_correct()))

# calculate mean and standard deviation
nperc = numpy.array(perc)
print("mean=%0.2f stdev=%0.2f" % (numpy.mean(nperc), numpy.std(nperc)))

jvm.stop()
Example #38
# load weather.nominal
loader = Loader(classname="weka.core.converters.ArffLoader")
fname = data_dir + os.sep + "weather.nominal.arff"
print("\nLoading dataset: " + fname + "\n")
data = loader.load_file(fname)
data.class_is_last()

# define classifiers
classifiers = ["weka.classifiers.rules.OneR", "weka.classifiers.trees.J48"]

# cross-validate original dataset
for classifier in classifiers:
    cls = Classifier(classname=classifier)
    evl = Evaluation(data)
    evl.crossvalidate_model(cls, data, 10, Random(1))
    print("%s (original): %0.0f%%" % (classifier, evl.percent_correct))

# replace 'outlook' in first 4 'no' instances with 'missing'
modified = Instances.copy_instances(data)
count = 0
for i in range(modified.num_instances):
    if modified.get_instance(i).get_string_value(modified.class_index) == "no":
        count += 1
        modified.get_instance(i).set_missing(0)
        if count == 4:
            break

# cross-validate modified dataset
for classifier in classifiers:
    cls = Classifier(classname=classifier)
Example #39
	def test(self, folds = 10):
		evaluation = Evaluation(self.data)                     # initialize with priors
		evaluation.crossvalidate_model(self.classifier, self.data, folds, Random(42))  # 10-fold CV
		print('Total number of instances: '+str(evaluation.num_instances)+'.')
		print(str(round(evaluation.percent_correct,2))+'% / '+str(round(evaluation.correct, 2))+' correct.')
		print(str(round(evaluation.percent_incorrect,2))+'% / '+str(round(evaluation.incorrect, 2))+' incorrect.')
Example #40
wfilter = Filter(classname="weka.filters.unsupervised.attribute.StringToNominal", options=["-R", "last"])
wfilter.set_inputformat(data)
data = wfilter.filter(data)

# convert content to string
wfilter = Filter(classname="weka.filters.unsupervised.attribute.NominalToString", options=["-C", "first"])
wfilter.set_inputformat(data)
data = wfilter.filter(data)

# set class attribute
data.set_class_index(data.num_attributes() - 1)

# generate baseline
zeror = Classifier(classname="weka.classifiers.rules.ZeroR")
evaluation = Evaluation(data)
evaluation.crossvalidate_model(zeror, data, 10, Random(1))
print("\nBaseline:\n" + evaluation.to_summary())

# perform text mining
j48 = Classifier(classname="weka.classifiers.trees.J48")
stwv = Filter(
    classname="weka.filters.unsupervised.attribute.StringToWordVector",
    options=["-R", "1", "-P", "att-"])
stwv.set_inputformat(data)
data = stwv.filter(data)
evaluation = Evaluation(data)
evaluation.crossvalidate_model(j48, data, 10, Random(1))
print("\nJ48:\n" + evaluation.to_summary())

# stop JVM
jvm.stop()
Example #41
def process_classifier(runType, cls, occ, devList, fewCats, label, subtract):
	global devCount
	global save_orig
	global save_subtract
	conf_matrix = {}

	if occ:
		table = 'temp_dat_occ_vector_occ'
	else:
		table = 'temp_dat_occ_vector_2'

	writeStr = '=========================================================================================\n' + \
		'Running ' + runType + ' classifier for \'' + label + '\''
	sys.stdout.write(writeStr + '\r')
	total_conf.write(writeStr + '\n')
	sys.stdout.flush()

	if runType == 'unseen':
		i = 0
		indiv_results = {}
		for dev in devList:
			devCount += 1
			remaining = chop_microseconds(((datetime.utcnow() - item_start)*totalDevs/devCount)-(datetime.utcnow() - item_start))
			sys.stdout.write('Running ' + runType + ' classifier for \'' + label + '\' - ' + \
				str(round(100*float(devCount)/totalDevs,2)) + ' pct complete (' + str(remaining) + ' remaining)                 \r')
			sys.stdout.flush()

			if fewCats:
				aws_c.execute('select * from ' + table + ' ' \
					'where duty!=0 and deviceMAC not in (select * from vector_reject) ' \
					'and deviceMAC in (select * from id_fewcats_mac) '
					'and deviceMAC!=\'' + dev + '\';')
			else:
				aws_c.execute('select * from ' + table + ' ' \
					'where duty!=0 and deviceMAC not in (select * from vector_reject) ' \
					'and deviceMAC!=\'' + dev + '\';')
			results = aws_c.fetchall()

			# Generate type list
			total_types = ['{']
			for data in results:
				if(data[-1] not in total_types):
					total_types.append('\"')
					total_types.append(data[-1])
					total_types.append('\"')
					total_types.append(',')
			total_types[-1] = '}'
			typeStr = ''.join(total_types)

			arff_train = label + '_' + dev + '_train'
			arff_test = label + '_' + dev + '_test'

			gen_arff(arff_train, typeStr, results, occ, arff_idcol)

			if fewCats:
				aws_c.execute('select * from ' + table + ' ' \
					'where duty!=0 and deviceMAC not in (select * from vector_reject) ' \
					'and deviceMAC in (select * from id_fewcats_mac) '
					'and deviceMAC=\'' + dev + '\';')
			else:
				aws_c.execute('select * from ' + table + ' ' \
					'where duty!=0 and deviceMAC not in (select * from vector_reject) ' \
					'and deviceMAC=\'' + dev + '\';')
			gen_arff(arff_test, typeStr, aws_c.fetchall(), occ, arff_idcol)

			train = loader.load_file(arff_train + '.arff')
			train.class_is_last()
			mv(arff_train + '.arff', master_saveDir)
			test = loader.load_file(arff_test + '.arff')
			test.class_is_last()
			mv(arff_test + '.arff', master_saveDir)

			cls.build_classifier(train)

			# output predictions
			testName = ''
			predictions = []
			for index, inst in enumerate(test):
				if testName != '':
					if testName != inst.get_string_value(inst.class_index):
						print(str(testName) + ' ' + str(inst.get_string_value(inst.class_index)))
						exit()
					else:
						testName = inst.get_string_value(inst.class_index)	
				else:
					testName = inst.get_string_value(inst.class_index)

				if testName not in conf_matrix:
					conf_matrix[testName] = {}

				pred = cls.classify_instance(inst)
				# dist = cls.distribution_for_instance(inst)
				# if(pred == inst.get_value(inst.class_index)):
				predName = inst.class_attribute.value(int(pred))
				if predName not in conf_matrix[testName]:
					conf_matrix[testName][predName] = 0
				conf_matrix[testName][predName] += 1
				predictions.append(predName)

			total = 0
			if testName != '':
				for predName in conf_matrix[testName]:
					if predName == testName:
						correct = conf_matrix[testName][predName]
						total += correct
					else:
						total += conf_matrix[testName][predName]


			# while (len(predictions) * 2) <= 100:
			# 	predictions += pyrandom.sample(predictions, len(predictions))
			# if len(predictions) < 100:
			# 	predictions += pyrandom.sample(predictions, 100 - len(predictions))

			lots_predictions = []
			while len(lots_predictions) < 10000:
				lots_predictions += pyrandom.sample(predictions, 1)

			#indiv_results[dev] = [testName, pyrandom.sample(predictions, 100)]

			indiv_results[dev] = [testName, lots_predictions]

			# while len(predictions) < 100:
			# 	predictions += pyrandom.sample(predictions, 1)

			# indiv_results[dev] = [testName, predictions]

			# indiv_results[dev] = [testName, predictions]

			# Prep to print the how-many-days graph
			# days_output.write('\n\n\"' + dev + '\"\n')


			
			#print(str(testName) + ' ' + str(correct) + ' ' + str(total) + ' ' + str(float(correct)/total))

			# i += 1
			# if i == 10:
			# 	break


		correct, total = print_conf_matrix(conf_matrix, sys.stdout, False, False, False)
		correct, total = print_conf_matrix(conf_matrix, total_conf, False, False, False)

		if subtract == 'orig':
			save_orig = copy.deepcopy(conf_matrix)
		elif subtract == 'subtract':
			save_subtract = copy.deepcopy(conf_matrix)

		final_result = round(100*float(correct)/total,2)

		writeStr = '\nCorrectly Classified Instances\t\t' + str(correct) + '\t\t' + str(final_result) + '\n' + \
			'Incorrectly Classified Instances\t' + str(total-correct) + '\t\t' + str(round(100*float(total-correct)/total,2)) + '\n' + \
			'Total Number of Instances\t\t' + str(total) + '\n'
		print(writeStr)
		total_conf.write(writeStr + '\n')

		conf_interval = 10
		total_instances = float(sum([sum([conf_matrix[test][pred] for pred in conf_matrix[test]]) for test in conf_matrix]))

		p_d = {}
		p_e = {}
		p_e_given_d = {}
		for testName in conf_matrix:
			count_d = float(sum([conf_matrix[testName][label] for label in conf_matrix[testName]]))
			p_d[testName] = count_d / total_instances
			p_e[testName] = float(sum([conf_matrix[label][testName] for label in conf_matrix if testName in conf_matrix[label]]) / total_instances)
			p_e_given_d[testName] = {}

			for predName in conf_matrix:
				if predName in conf_matrix[testName]:
					p_e_given_d[testName][predName] = conf_matrix[testName][predName] / count_d
				else:
					p_e_given_d[testName][predName] = 0

		confidence = open('confidence.dat', 'w')
		for testName in conf_matrix:
			confidence.write('\n\n\"' + testName + '\"\n')
			print(testName)

			for classEvents in range(1, (conf_interval+1)):
				numerator = math.pow(p_e_given_d[testName][testName], classEvents) * p_d[testName]
				demoninator = 0
				for otherName in conf_matrix:
					demoninator += math.pow(p_e_given_d[otherName][testName], classEvents) * p_d[otherName]
				confidence.write(str(classEvents) + '\t' + str(numerator/demoninator) + '\n')
				print(str(classEvents) + '\t' + str(numerator/demoninator)) 
			print('')

		for predName in p_e_given_d['Router/Modem']:
			print('P( ' + predName + ' | Router/Modem ):\t' + str(p_e_given_d['Router/Modem'][predName]))

		for predName in p_e_given_d['Cable Box']:
			print('P( ' + predName + ' | Cable Box ):\t' + str(p_e_given_d['Cable Box'][predName]))

		#router = open('router', 'w')
		print('Router Stuff:')
		routerDev = 'Router/Modem'
		lampDev = 'Lamp'
		cableDev = 'Cable Box'
		origClassList = ['Router/Modem', 'Cable Box', 'Lamp', 'Router/Modem', 'Router/Modem', 'Cable Box', 'Router/Modem', 'Lamp', 'Router/Modem']

		classListList =  [['Router/Modem'] + list(listItem) for listItem in set(itertools.permutations(origClassList))]

		classListList = [
			['Router/Modem', 'Router/Modem', 'Lamp', 'Lamp', 'Router/Modem', 'Router/Modem', 'Router/Modem', 'Router/Modem', 'Cable Box', 'Router/Modem'],
			['Router/Modem', 'Cable Box', 'Lamp', 'Lamp', 'Router/Modem', 'Router/Modem', 'Router/Modem', 'Router/Modem', 'Cable Box', 'Router/Modem'],
			['Router/Modem', 'Router/Modem', 'Lamp', 'Lamp', 'Cable Box', 'Router/Modem', 'Router/Modem', 'Router/Modem', 'Cable Box', 'Router/Modem'],
			['Router/Modem', 'Cable Box', 'Lamp', 'Lamp', 'Cable Box', 'Router/Modem', 'Router/Modem', 'Router/Modem', 'Cable Box', 'Router/Modem'],
			['Router/Modem', 'Cable Box', 'Router/Modem', 'Lamp', 'Cable Box', 'Router/Modem', 'Router/Modem', 'Lamp', 'Router/Modem', 'Router/Modem'],
			['Router/Modem', 'Router/Modem', 'Router/Modem', 'Lamp', 'Router/Modem', 'Router/Modem', 'Router/Modem', 'Lamp', 'Router/Modem', 'Router/Modem'],
			['Router/Modem', 'Router/Modem', 'Router/Modem', 'Router/Modem', 'Router/Modem', 'Router/Modem', 'Router/Modem', 'Router/Modem', 'Router/Modem', 'Lamp']
		]

		for idClass, classList in enumerate(classListList):
			print(idClass)
			for classEvents in range(1, (conf_interval+1)):
				numerator_router = p_d[routerDev]
				numerator_lamp = p_d[lampDev]
				numerator_cable = p_d[cableDev]
				for idx, classInst in enumerate(classList):
					if idx < classEvents:
						numerator_router *= p_e_given_d[routerDev][classInst]
						numerator_lamp *= p_e_given_d[lampDev][classInst]
						numerator_cable *= p_e_given_d[cableDev][classInst]
				demoninator = 0
				for otherName in conf_matrix:
					obsValue = p_d[otherName]
					for idx, classInst in enumerate(classList):
						if idx < classEvents:
							obsValue *= p_e_given_d[otherName][classInst]
					demoninator += obsValue
				print(str(classEvents) + '\t' + str(numerator_router/demoninator) + '\t' + str(numerator_lamp/demoninator) + '\t' + str(numerator_cable/demoninator) + '\t\"' + classList[classEvents-1] + '\"')
			print('')

		numberDevList(indiv_results)

		eachDev = open('indiv_results.dat', 'w')
		newIDStream = open('new_id.dat', 'w')
		for devItem in indiv_results:
			print_obsResults(conf_matrix, conf_interval, p_d, p_e, p_e_given_d, indiv_results[devItem], eachDev, devItem, newIDStream)
		print('')
		print('total devices: ' + str(len(indiv_results)))
		# print('total devices: ' + str(total_devices))
		# print('total correct: ' + str(total_correct))
		# print('  pct correct: ' + str(round(100*float(total_correct)/total_devices,2)) + '\n')

		print('initial confidence: ' + str(round(100*float(sum(initial_confidence))/len(initial_confidence),2)))
		print('initial accuracy: ' + str(round(100*float(sum(initial_accuracy))/len(initial_accuracy),2)) + '\n')

		# print('final confidence (correct): ' + str(round(100*float(sum(final_confidence_correct))/len(final_confidence_correct),2)))
		# print('final confidence (correct): ' + str(round(100*float(sum(final_confidence_incorrect))/len(final_confidence_incorrect),2)))
		# print('final accuracy: ' + str(round(100*float(total_correct)/total_devices,2)))

		for devType in final_accuracy:
			print('final accuracy ' + devType + ' : ' + str(round(float(sum(final_accuracy[devType]))/len(final_accuracy[devType]),6)))
			print('final confidence (correct) ' + devType + ' : ' + str(round(float(sum(final_confidence_correct[devType]))/len(final_confidence_correct[devType]),6)))
			if len(final_confidence_incorrect[devType]) > 0:
				print('final confidence (incorrect) ' + devType + ' : ' + str(round(float(sum(final_confidence_incorrect[devType]))/len(final_confidence_incorrect[devType]),6)))
			else:
				print('final confidence (incorrect) ' + devType + ' : ' + str(0))
			print('final confidence ' + devType + ' : ' + str(round(float(sum(final_confidence_correct[devType])+sum(final_confidence_incorrect[devType]))/(len(final_confidence_correct[devType])+len(final_confidence_incorrect[devType])),2)))

		print_conf_matrix(new_conf_matrix, sys.stdout, False, False, False)

		for topType in actual_confidence_matrix:
			for botType in actual_confidence_matrix[topType]:
				storeArray = actual_confidence_matrix[topType][botType]
				if len(storeArray) > 0:
					actual_confidence_matrix[topType][botType] = round(sum(storeArray)/len(storeArray),2)
				else:
					actual_confidence_matrix[topType][botType] = 0

		print_conf_matrix(conf_matrix, sys.stdout, False, False, False)
		print_conf_matrix(actual_confidence_matrix, sys.stdout, False, False, False)
		print_conf_matrix(actual_confidence_matrix, sys.stdout, True, False, True)

		for devType in acc_over_time_dev:
			printOverTime(devType, acc_over_time_dev[devType], conf_over_time_dev[devType])
		printOverTime('total', acc_over_time, conf_over_time)

	elif runType == 'seen':
		if fewCats:
			aws_c.execute('select * from ' + table + ' ' \
				'where duty!=0 and deviceMAC not in (select * from vector_reject) ' \
				'and deviceMAC in (select * from id_fewcats_mac);')
		else:
			aws_c.execute('select * from ' + table + ' ' \
				'where duty!=0 and deviceMAC not in (select * from vector_reject);')
		results = aws_c.fetchall()

		devCount += 1
		remaining = chop_microseconds(((datetime.utcnow() - item_start)*totalDevs/devCount)-(datetime.utcnow() - item_start))
		sys.stdout.write('Running ' + runType + ' classifier for \'' + label + '\' - ' + \
			str(round(100*float(devCount)/totalDevs,2)) + ' pct complete (' + str(remaining) + ' remaining)                 \r')
		sys.stdout.flush()

		# Generate type list
		total_types = ['{']
		for data in results:
			if(data[-1] not in total_types):
				total_types.append('\"')
				total_types.append(data[-1])
				total_types.append('\"')
				total_types.append(',')
		total_types[-1] = '}'
		typeStr = ''.join(total_types)

		arff_file = label + '_train'

		gen_arff(arff_file, typeStr, results, occ, arff_idcol)

		train = loader.load_file(arff_file + '.arff')
		train.class_is_last()
		mv(arff_file + '.arff', master_saveDir)

		cls.build_classifier(train)

		evl = Evaluation(train)
		evl.crossvalidate_model(cls, train, 10, Random(1))

		print('\n')
		#print(evl.percent_correct)
		#print(evl.class_details())
		print(evl.matrix())
		total_conf.write('\n' + evl.matrix())
		print(evl.summary())
		total_conf.write(evl.summary() + '\n')

		final_result = round(evl.percent_correct, 2)

	else:
		success = []
		for startDev in devList:
			for changeToDev in devList:
				if startDev != changeToDev:
					devCount += 1
					remaining = chop_microseconds(((datetime.utcnow() - item_start)*totalDevs/devCount)-(datetime.utcnow() - item_start))
					sys.stdout.write('Running ' + runType + ' classifier for \'' + label + '\' - ' + \
						str(round(100*float(devCount)/totalDevs,2)) + ' pct complete (' + str(remaining) + ' remaining)                 \r')
					sys.stdout.flush()
					
					aws_c.execute('select * from temp_dat_occ_vector_2 ' \
						'where duty!=0 and deviceMAC in (\'' + startDev + '\',\'' + changeToDev + '\');')
					results = [x[:-1] + (x[1],) for x in aws_c.fetchall()]	# Class label is just the deviceMAC

					if len(results) > 10:

						# Generate type list
						typeStr = '{' + startDev + ',' + changeToDev + '}'

						arff_file = label + '_' + startDev + '_' + changeToDev + '_train'

						gen_arff(arff_file, typeStr, results, occ, arff_idcol)

						train = loader.load_file(arff_file + '.arff')
						train.class_is_last()
						mv(arff_file + '.arff', master_saveDir)

						cls.build_classifier(train)

						evl = Evaluation(train)
						evl.crossvalidate_model(cls, train, 10, Random(1))

						print('\n')
						#print(evl.percent_correct)
						#print(evl.class_details())
						print(evl.matrix())
						total_conf.write('\n' + evl.matrix())
						print(evl.summary())
						total_conf.write(evl.summary() + '\n')

						success.append(evl.percent_correct)

		if len(success) > 0:
			final_result = [sum(success)/len(success), percentile(success, 5), percentile(success, 10), percentile(success, 95)]
		else:
			final_result = False

	if label in total_results:
		print('Warning: label ' + label + ' exists twice, overwriting...')
	if final_result is not False:
		total_results[label] = final_result
Exemple #42
print("\nLoading dataset: " + fname + "\n")
loader = Loader(classname="weka.core.converters.ArffLoader")
data = loader.load_file(fname)
data.set_class_index(data.num_attributes() - 1)

for equal in ["", "-F"]:
    print("\nEqual frequency binning? " + str(equal == "-F") + "\n")
    for bins in [0, 40, 10, 5, 2]:
        if bins > 0:
            fltr = Filter(classname="weka.filters.unsupervised.attribute.Discretize", options=["-B", str(bins), equal])
            fltr.set_inputformat(data)
            filtered = fltr.filter(data)
        else:
            filtered = data
        cls = Classifier(classname="weka.classifiers.trees.J48")
        # cross-validate
        evl = Evaluation(filtered)
        evl.crossvalidate_model(cls, filtered, 10, Random(1))
        # build classifier on full dataset
        cls.build_classifier(filtered)
        # get size of tree from model strings
        lines = str(cls).split("\n")
        nodes = "N/A"
        for line in lines:
            if line.find("Size of the tree :") > -1:
                nodes = line.replace("Size of the tree :", "").strip()
        # output stats
        print("bins=%i accuracy=%0.1f nodes=%s" % (bins, evl.percent_correct(), nodes))

jvm.stop()
Exemple #43
import weka.plot as plot
if plot.matplotlib_available:
    import matplotlib.pyplot as plt

jvm.start()

# load glass
fname = data_dir + os.sep + "glass.arff"
print("\nLoading dataset: " + fname + "\n")
loader = Loader(classname="weka.core.converters.ArffLoader")
data = loader.load_file(fname)
data.set_class_index(data.num_attributes() - 1)

# compute baseline
evl = Evaluation(data)
evl.crossvalidate_model(Classifier("weka.classifiers.rules.ZeroR"), data, 10, Random(1))
baseline = evl.percent_correct()

# generate learning curves
percentages = [1, 5, 10, 20, 30, 40, 50, 60, 70, 80, 90, 100]
repetitions = [1, 10, 100]
curves = {}
for repetition in repetitions:
    # progress info
    sys.stdout.write("Repetitions=" + str(repetition))
    # initialize curve
    curve = {}
    for percentage in percentages:
        curve[percentage] = 0
    curves[repetition] = curve
    # run and add up percentage correct from repetition
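    # the original example is truncated here; the lines below are a minimal
    # sketch of a plausible completion (an assumption, not part of the source):
    # draw a random subsample of the given size (Filter import assumed), build
    # J48 on it, test on the full dataset and average the accuracy over the
    # repetitions, then plot the curves against the ZeroR baseline
    for seed in range(repetition):
        for percentage in percentages:
            resample = Filter(
                classname="weka.filters.unsupervised.instance.Resample",
                options=["-S", str(seed + 1), "-Z", str(percentage), "-no-replacement"])
            resample.inputformat(data)
            sample = resample.filter(data)
            cls = Classifier(classname="weka.classifiers.trees.J48")
            cls.build_classifier(sample)
            evl = Evaluation(data)
            evl.test_model(cls, data)
            curve[percentage] += evl.percent_correct() / repetition
    sys.stdout.write(" - done\n")

# plot the learning curves (sketch, assuming matplotlib is available)
if plot.matplotlib_available:
    plt.axhline(y=baseline, color="gray", linestyle="--", label="ZeroR baseline")
    for repetition in repetitions:
        plt.plot(percentages, [curves[repetition][p] for p in percentages],
                 label=str(repetition) + " repetition(s)")
    plt.xlabel("% of dataset used for training")
    plt.ylabel("accuracy on full dataset (%)")
    plt.legend()
    plt.show()

jvm.stop()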
Exemple #44
# load diabetes
loader = Loader(classname="weka.core.converters.ArffLoader")
fname = data_dir + os.sep + "diabetes.arff"
print("\nLoading dataset: " + fname + "\n")
data = loader.load_file(fname)
data.set_class_index(data.num_attributes() - 1)

for classifier in ["weka.classifiers.bayes.NaiveBayes", "weka.classifiers.rules.ZeroR", "weka.classifiers.trees.J48"]:
    # train/test split 90% using classifier
    cls = Classifier(classname=classifier)
    evl = Evaluation(data)
    evl.evaluate_train_test_split(cls, data, 90.0, Random(1))
    print("\n" + classifier + " train/test split (90%):\n" + evl.to_summary())
    cls.build_classifier(data)
    print(classifier + " model:\n\n" + str(cls))

# calculate mean/stdev over 10 cross-validations
for classifier in [
    "weka.classifiers.meta.ClassificationViaRegression", "weka.classifiers.bayes.NaiveBayes",
        "weka.classifiers.rules.ZeroR", "weka.classifiers.trees.J48", "weka.classifiers.functions.Logistic"]:
    accuracy = []
    for i in xrange(1,11):
        cls = Classifier(classname=classifier)
        evl = Evaluation(data)
        evl.crossvalidate_model(cls, data, 10, Random(i))
        accuracy.append(evl.percent_correct())
    nacc = numpy.array(accuracy)
    print("%s: %0.2f +/-%0.2f" % (classifier, numpy.mean(nacc), numpy.std(nacc)))

jvm.stop()
def main():
    """
    Just runs some example code.
    """

    # load a dataset
    iris_file = helper.get_data_dir() + os.sep + "iris.arff"
    helper.print_info("Loading dataset: " + iris_file)
    loader = Loader("weka.core.converters.ArffLoader")
    iris_data = loader.load_file(iris_file)
    iris_data.class_is_last()

    # classifier help
    helper.print_title("Creating help string")
    classifier = Classifier(classname="weka.classifiers.trees.J48")
    print(classifier.to_help())

    # partial classname
    helper.print_title("Creating classifier from partial classname")
    clsname = ".J48"
    classifier = Classifier(classname=clsname)
    print(clsname + " --> " + classifier.classname)

    # classifier from commandline
    helper.print_title("Creating SMO from command-line string")
    cmdline = 'weka.classifiers.functions.SMO -K "weka.classifiers.functions.supportVector.NormalizedPolyKernel -E 3.0"'
    classifier = from_commandline(cmdline, classname="weka.classifiers.Classifier")
    classifier.build_classifier(iris_data)
    print("input: " + cmdline)
    print("output: " + classifier.to_commandline())
    print("model:\n" + str(classifier))

    # kernel classifier
    helper.print_title("Creating SMO as KernelClassifier")
    kernel = Kernel(classname="weka.classifiers.functions.supportVector.RBFKernel", options=["-G", "0.001"])
    classifier = KernelClassifier(classname="weka.classifiers.functions.SMO", options=["-M"])
    classifier.kernel = kernel
    classifier.build_classifier(iris_data)
    print("classifier: " + classifier.to_commandline())
    print("model:\n" + str(classifier))

    # build a classifier and output model
    helper.print_title("Training J48 classifier on iris")
    classifier = Classifier(classname="weka.classifiers.trees.J48")
    # Instead of using 'options=["-C", "0.3"]' in the constructor, we can also set the "confidenceFactor"
    # property of the J48 classifier itself. However, being of type float rather than double, we need
    # to convert it to the correct type first using the double_to_float function:
    classifier.set_property("confidenceFactor", typeconv.double_to_float(0.3))
    classifier.build_classifier(iris_data)
    print(classifier)
    print(classifier.graph)
    print(classifier.to_source("MyJ48"))
    plot_graph.plot_dot_graph(classifier.graph)

    # evaluate model on test set
    helper.print_title("Evaluating J48 classifier on iris")
    evaluation = Evaluation(iris_data)
    evl = evaluation.test_model(classifier, iris_data)
    print(evl)
    print(evaluation.summary())

    # evaluate model on train/test split
    helper.print_title("Evaluating J48 classifier on iris (random split 66%)")
    classifier = Classifier(classname="weka.classifiers.trees.J48", options=["-C", "0.3"])
    evaluation = Evaluation(iris_data)
    evaluation.evaluate_train_test_split(classifier, iris_data, 66.0, Random(1))
    print(evaluation.summary())

    # load a dataset incrementally and build classifier incrementally
    helper.print_title("Build classifier incrementally on iris")
    helper.print_info("Loading dataset: " + iris_file)
    loader = Loader("weka.core.converters.ArffLoader")
    iris_inc = loader.load_file(iris_file, incremental=True)
    iris_inc.class_is_last()
    classifier = Classifier(classname="weka.classifiers.bayes.NaiveBayesUpdateable")
    classifier.build_classifier(iris_inc)
    for inst in loader:
        classifier.update_classifier(inst)
    print(classifier)

    # construct meta-classifiers
    helper.print_title("Meta classifiers")
    # generic FilteredClassifier instantiation
    print("generic FilteredClassifier instantiation")
    meta = SingleClassifierEnhancer(classname="weka.classifiers.meta.FilteredClassifier")
    meta.classifier = Classifier(classname="weka.classifiers.functions.LinearRegression")
    flter = Filter("weka.filters.unsupervised.attribute.Remove")
    flter.options = ["-R", "first"]
    meta.set_property("filter", flter.jobject)
    print(meta.to_commandline())
    # direct FilteredClassifier instantiation
    print("direct FilteredClassifier instantiation")
    meta = FilteredClassifier()
    meta.classifier = Classifier(classname="weka.classifiers.functions.LinearRegression")
    flter = Filter("weka.filters.unsupervised.attribute.Remove")
    flter.options = ["-R", "first"]
    meta.filter = flter
    print(meta.to_commandline())
    # generic Vote
    print("generic Vote instantiation")
    meta = MultipleClassifiersCombiner(classname="weka.classifiers.meta.Vote")
    classifiers = [
        Classifier(classname="weka.classifiers.functions.SMO"),
        Classifier(classname="weka.classifiers.trees.J48")
    ]
    meta.classifiers = classifiers
    print(meta.to_commandline())

    # cross-validate nominal classifier
    helper.print_title("Cross-validating NaiveBayes on diabetes")
    diabetes_file = helper.get_data_dir() + os.sep + "diabetes.arff"
    helper.print_info("Loading dataset: " + diabetes_file)
    loader = Loader("weka.core.converters.ArffLoader")
    diabetes_data = loader.load_file(diabetes_file)
    diabetes_data.class_is_last()
    classifier = Classifier(classname="weka.classifiers.bayes.NaiveBayes")
    pred_output = PredictionOutput(
        classname="weka.classifiers.evaluation.output.prediction.PlainText", options=["-distribution"])
    evaluation = Evaluation(diabetes_data)
    evaluation.crossvalidate_model(classifier, diabetes_data, 10, Random(42), output=pred_output)
    print(evaluation.summary())
    print(evaluation.class_details())
    print(evaluation.matrix())
    print("areaUnderPRC/0: " + str(evaluation.area_under_prc(0)))
    print("weightedAreaUnderPRC: " + str(evaluation.weighted_area_under_prc))
    print("areaUnderROC/1: " + str(evaluation.area_under_roc(1)))
    print("weightedAreaUnderROC: " + str(evaluation.weighted_area_under_roc))
    print("avgCost: " + str(evaluation.avg_cost))
    print("totalCost: " + str(evaluation.total_cost))
    print("confusionMatrix: " + str(evaluation.confusion_matrix))
    print("correct: " + str(evaluation.correct))
    print("pctCorrect: " + str(evaluation.percent_correct))
    print("incorrect: " + str(evaluation.incorrect))
    print("pctIncorrect: " + str(evaluation.percent_incorrect))
    print("unclassified: " + str(evaluation.unclassified))
    print("pctUnclassified: " + str(evaluation.percent_unclassified))
    print("coverageOfTestCasesByPredictedRegions: " + str(evaluation.coverage_of_test_cases_by_predicted_regions))
    print("sizeOfPredictedRegions: " + str(evaluation.size_of_predicted_regions))
    print("falseNegativeRate: " + str(evaluation.false_negative_rate(1)))
    print("weightedFalseNegativeRate: " + str(evaluation.weighted_false_negative_rate))
    print("numFalseNegatives: " + str(evaluation.num_false_negatives(1)))
    print("trueNegativeRate: " + str(evaluation.true_negative_rate(1)))
    print("weightedTrueNegativeRate: " + str(evaluation.weighted_true_negative_rate))
    print("numTrueNegatives: " + str(evaluation.num_true_negatives(1)))
    print("falsePositiveRate: " + str(evaluation.false_positive_rate(1)))
    print("weightedFalsePositiveRate: " + str(evaluation.weighted_false_positive_rate))
    print("numFalsePositives: " + str(evaluation.num_false_positives(1)))
    print("truePositiveRate: " + str(evaluation.true_positive_rate(1)))
    print("weightedTruePositiveRate: " + str(evaluation.weighted_true_positive_rate))
    print("numTruePositives: " + str(evaluation.num_true_positives(1)))
    print("fMeasure: " + str(evaluation.f_measure(1)))
    print("weightedFMeasure: " + str(evaluation.weighted_f_measure))
    print("unweightedMacroFmeasure: " + str(evaluation.unweighted_macro_f_measure))
    print("unweightedMicroFmeasure: " + str(evaluation.unweighted_micro_f_measure))
    print("precision: " + str(evaluation.precision(1)))
    print("weightedPrecision: " + str(evaluation.weighted_precision))
    print("recall: " + str(evaluation.recall(1)))
    print("weightedRecall: " + str(evaluation.weighted_recall))
    print("kappa: " + str(evaluation.kappa))
    print("KBInformation: " + str(evaluation.kb_information))
    print("KBMeanInformation: " + str(evaluation.kb_mean_information))
    print("KBRelativeInformation: " + str(evaluation.kb_relative_information))
    print("SFEntropyGain: " + str(evaluation.sf_entropy_gain))
    print("SFMeanEntropyGain: " + str(evaluation.sf_mean_entropy_gain))
    print("SFMeanPriorEntropy: " + str(evaluation.sf_mean_prior_entropy))
    print("SFMeanSchemeEntropy: " + str(evaluation.sf_mean_scheme_entropy))
    print("matthewsCorrelationCoefficient: " + str(evaluation.matthews_correlation_coefficient(1)))
    print("weightedMatthewsCorrelation: " + str(evaluation.weighted_matthews_correlation))
    print("class priors: " + str(evaluation.class_priors))
    print("numInstances: " + str(evaluation.num_instances))
    print("meanAbsoluteError: " + str(evaluation.mean_absolute_error))
    print("meanPriorAbsoluteError: " + str(evaluation.mean_prior_absolute_error))
    print("relativeAbsoluteError: " + str(evaluation.relative_absolute_error))
    print("rootMeanSquaredError: " + str(evaluation.root_mean_squared_error))
    print("rootMeanPriorSquaredError: " + str(evaluation.root_mean_prior_squared_error))
    print("rootRelativeSquaredError: " + str(evaluation.root_relative_squared_error))
    print("prediction output:\n" + str(pred_output))
    plot_cls.plot_roc(
        evaluation, title="ROC diabetes",
        class_index=range(0, diabetes_data.class_attribute.num_values), wait=False)
    plot_cls.plot_prc(
        evaluation, title="PRC diabetes",
        class_index=range(0, diabetes_data.class_attribute.num_values), wait=False)

    # load a numeric dataset
    bolts_file = helper.get_data_dir() + os.sep + "bolts.arff"
    helper.print_info("Loading dataset: " + bolts_file)
    loader = Loader("weka.core.converters.ArffLoader")
    bolts_data = loader.load_file(bolts_file)
    bolts_data.class_is_last()

    # build a classifier and output model
    helper.print_title("Training LinearRegression on bolts")
    classifier = Classifier(classname="weka.classifiers.functions.LinearRegression", options=["-S", "1", "-C"])
    classifier.build_classifier(bolts_data)
    print(classifier)

    # cross-validate numeric classifier
    helper.print_title("Cross-validating LinearRegression on bolts")
    classifier = Classifier(classname="weka.classifiers.functions.LinearRegression", options=["-S", "1", "-C"])
    evaluation = Evaluation(bolts_data)
    evaluation.crossvalidate_model(classifier, bolts_data, 10, Random(42))
    print(evaluation.summary())
    print("correlationCoefficient: " + str(evaluation.correlation_coefficient))
    print("errorRate: " + str(evaluation.error_rate))
    helper.print_title("Header - bolts")
    print(str(evaluation.header))
    helper.print_title("Predictions on bolts")
    for index, pred in enumerate(evaluation.predictions):
        print(str(index+1) + ": " + str(pred) + " -> error=" + str(pred.error))
    plot_cls.plot_classifier_errors(evaluation.predictions, wait=False)

    # learning curve
    cls = [
        Classifier(classname="weka.classifiers.trees.J48"),
        Classifier(classname="weka.classifiers.bayes.NaiveBayesUpdateable")]
    plot_cls.plot_learning_curve(
        cls, diabetes_data, increments=0.05, label_template="[#] !", metric="percent_correct", wait=True)

    # access classifier's Java API
    labor_file = helper.get_data_dir() + os.sep + "labor.arff"
    helper.print_info("Loading dataset: " + labor_file)
    loader = Loader("weka.core.converters.ArffLoader")
    labor_data = loader.load_file(labor_file)
    labor_data.class_is_last()

    helper.print_title("Using JRip's Java API to access rules")
    jrip = Classifier(classname="weka.classifiers.rules.JRip")
    jrip.build_classifier(labor_data)
    rset = jrip.jwrapper.getRuleset()
    for i in range(rset.size()):
        r = rset.get(i)
        print(str(r.toString(labor_data.class_attribute.jobject)))
Exemple #46
data_dir = os.environ.get("WEKAMOOC_DATA")
if data_dir is None:
    data_dir = "." + os.sep + "data"

import weka.core.jvm as jvm
from weka.core.converters import Loader
from weka.classifiers import Classifier, Evaluation, PredictionOutput
from weka.core.classes import Random
import weka.plot.classifiers as plc

jvm.start()

# load weather.nominal
fname = data_dir + os.sep + "weather.nominal.arff"
print("\nLoading dataset: " + fname + "\n")
loader = Loader(classname="weka.core.converters.ArffLoader")
data = loader.load_file(fname)
data.class_is_last()

# cross-validate NaiveBayes
cls = Classifier(classname="weka.classifiers.bayes.NaiveBayes")
pout = PredictionOutput(classname="weka.classifiers.evaluation.output.prediction.PlainText", options=["-distribution"])
evl = Evaluation(data)
evl.crossvalidate_model(cls, data, 10, Random(1), pout)
print(evl.summary())
print(evl.matrix())
print(pout)
plc.plot_roc(evl, wait=True)

jvm.stop()
Exemple #47
fname = data_dir + os.sep + "diabetes.arff"
print("\nLoading dataset: " + fname + "\n")
data = loader.load_file(fname)
# we'll set the class attribute after filtering

# apply NominalToBinary filter and set class attribute
fltr = Filter("weka.filters.unsupervised.attribute.NominalToBinary")
fltr.inputformat(data)
filtered = fltr.filter(data)
filtered.class_is_last()

# cross-validate LinearRegression on filtered data, display model
cls = Classifier(classname="weka.classifiers.functions.LinearRegression")
pout = PredictionOutput(classname="weka.classifiers.evaluation.output.prediction.PlainText")
evl = Evaluation(filtered)
evl.crossvalidate_model(cls, filtered, 10, Random(1), pout)
print("10-fold cross-validation:\n" + evl.summary())
print("Predictions:\n\n" + str(pout))
cls.build_classifier(filtered)
print("Model:\n\n" + str(cls))

# use AddClassification filter with LinearRegression on filtered data
print("Applying AddClassification to filtered data:\n")
fltr = Filter(
    classname="weka.filters.supervised.attribute.AddClassification",
    options=["-W", "weka.classifiers.functions.LinearRegression", "-classification"])
fltr.inputformat(filtered)
classified = fltr.filter(filtered)
print(classified)

# convert class back to nominal
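# (sketch) the lines below are an assumption, not part of the original example:
# the usual way to turn the numeric class produced above back into a nominal
# attribute is the NumericToNominal filter applied to the last (class) column
fltr = Filter(
    classname="weka.filters.unsupervised.attribute.NumericToNominal",
    options=["-R", "last"])
fltr.inputformat(filtered)
nominal = fltr.filter(filtered)
nominal.class_is_last()
print(nominal)
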
from weka.classifiers import Classifier
cls = Classifier(classname="weka.classifiers.bayes.NaiveBayes")


# No options of interest to adjust
# Build classifier on training data
cls.build_classifier(train)
#       print(cls)

#import weka.plot.graph as graph  
#graph.plot_dot_graph(cls.graph)

from weka.classifiers import Evaluation
from weka.core.classes import Random
evl = Evaluation(train)
evl.crossvalidate_model(cls, train, 10, Random(1))

print ("Kappa Score")
print (evl.kappa) # 0.50 - Not bad
print ("Evaluation Summary")
print (evl.summary()) # Accuracy: 83%

##  Test model on new data ##

from weka.classifiers import PredictionOutput
pred_output = PredictionOutput(
    classname="weka.classifiers.evaluation.output.prediction.PlainText", options=["-distribution"])

# evaluate the classifier built on the training data against the held-out test set
# (cross-validating on the test set would re-train the model and not test the one built above)
evl = Evaluation(test)
evl.test_model(cls, test, output=pred_output)
Exemple #49
from weka.filters import Filter
# convert csv into arff format (Weka-compatible)
# use convertcsvtoarff.py file
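# a minimal sketch of what convertcsvtoarff.py presumably does (the CSV file
# name below is an assumption): load the CSV with CSVLoader and write it back
# out with ArffSaver
from weka.core.converters import Loader, Saver
csv_loader = Loader(classname="weka.core.converters.CSVLoader")
csv_data = csv_loader.load_file("reviewsinformation_task2.csv")
arff_saver = Saver(classname="weka.core.converters.ArffSaver")
arff_saver.save_file(csv_data, "reviewsinformation_task2.arff")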

# load arff file

loader = Loader("weka.core.converters.ArffLoader")
iris_data = loader.load_file("reviewsinformation_task2.arff")
iris_data.class_is_last()

# kernel classifier
helper.print_title("Creating SMO as KernelClassifier")
kernel = Kernel(classname="weka.classifiers.functions.supportVector.RBFKernel", options=["-G", "0.001"])
classifier = KernelClassifier(classname="weka.classifiers.functions.SMO", options=["-M"])
classifier.kernel = kernel
classifier.build_classifier(iris_data)
print("classifier: " + classifier.to_commandline())
print("model:\n" + str(classifier))

#print("model:\n" + str(classifier))


# Evaluation expects a dataset (Instances), not a file name, so load the test
# data first and cross-validate the classifier on it
test_data = loader.load_file('test_data.arff')
test_data.class_is_last()
evaluation = Evaluation(test_data)
evaluation.crossvalidate_model(classifier, test_data, 10, Random(42))
print(evaluation.summary())
print(evaluation.class_details())
print(evaluation.matrix())
Exemple #50
loader = Loader(classname="weka.core.converters.ArffLoader")
data = loader.load_file(fname)
data.class_is_last()

classifiers = [
    "weka.classifiers.bayes.NaiveBayes",
    "weka.classifiers.lazy.IBk",
    "weka.classifiers.trees.J48"
]

# cross-validate classifiers
for classifier in classifiers:
    # classifier itself
    cls = Classifier(classname=classifier)
    evl = Evaluation(data)
    evl.crossvalidate_model(cls, data, 10, Random(1))
    print("%s: %0.0f%%" % (classifier, evl.percent_correct))
    # meta with cfssubseteval
    meta = SingleClassifierEnhancer(classname="weka.classifiers.meta.AttributeSelectedClassifier")
    meta.options = \
        ["-E", "weka.attributeSelection.CfsSubsetEval",
         "-S", "weka.attributeSelection.BestFirst",
         "-W", classifier]
    evl = Evaluation(data)
    evl.crossvalidate_model(meta, data, 10, Random(1))
    print("%s (cfs): %0.0f%%" % (classifier, evl.percent_correct))
    # meta with wrapper
    meta = SingleClassifierEnhancer(classname="weka.classifiers.meta.AttributeSelectedClassifier")
    meta.options = \
        ["-E", "weka.attributeSelection.WrapperSubsetEval -B " + classifier,
         "-S", "weka.attributeSelection.BestFirst",
Exemple #51
# load a dataset
iris_file = "HairEyeColor.csv"
print("Loading dataset: " + iris_file)
loader = Loader(classname="weka.core.converters.CSVLoader")
iris_data = loader.load_file(iris_file)
print(iris_data.num_attributes())
iris_data.set_class_index(iris_data.num_attributes() - 1)
                                            
# build a classifier and output model
print ("Training J48 classifier on iris")
classifier = Classifier(classname="weka.test.Regression")
#classifier = Classifier(classname="weka.classifiers.trees.J48", options=["-C", "0.5"])
# Instead of using 'options=["-C", "0.3"]' in the constructor, we can also set the "confidenceFactor"
# property of the J48 classifier itself. However, being of type float rather than double, we need
# to convert it to the correct type first using the double_to_float function:
#classifier.set_property("confidenceFactor", types.double_to_float(0.3))
classifier.build_classifier(iris_data)
print(classifier)
print(classifier.graph())
#plot_graph.plot_dot_graph(classifier.graph())
    

evaluation = Evaluation(iris_data)                     # initialize with priors
evaluation.crossvalidate_model(classifier, iris_data, 10, Random(42))  # 10-fold CV
print(evaluation.to_summary())

print("pctCorrect: " + str(evaluation.percent_correct()))
print("incorrect: " + str(evaluation.incorrect()))
jvm.stop()