Example #1
def main():
    """
    Calls every function to implement Random Forests
    """

    opts = util.parse_args()
    train_partition = util.read_arff(opts.train_filename)
    test_partition = util.read_arff(opts.test_filename)
    #constructs our ensemble of decision stumps
    ds_ensemble = construct_ensemble(opts.T, train_partition)
    #for each classifier, constructs a list of predicted labels for all test examples
    ds_list = testing(test_partition, ds_ensemble, opts.threshold)
    #gets the final predicted labels for test data by majority vote
    finalpred_lst = finaloutput(ds_list, test_partition)
    #constructs the confusion matrix
    confusion_matrix = util.construct_cm(finalpred_lst, test_partition)
    #computes the true positive and false positive rates for the confusion matrix
    (true_pos, false_pos) = util.rates(confusion_matrix)
    #print statements
    print("T:", opts.T, ", thresh: ", opts.threshold)
    print("        prediction    ")
    print("       -1        1")
    print("-1", "|  ", confusion_matrix[0, 0], "  ", confusion_matrix[0, 1])
    print(" 1", "|  ", confusion_matrix[1, 0], "   ", confusion_matrix[1, 1])
    print(" ")
    print("false positive: ", false_pos)
    print("true positive: ", true_pos)
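The helper functions called above are not shown in these examples. A minimal sketch of what construct_ensemble might look like, assuming a hypothetical Partition(data, F) constructor and the DecisionStump class used in Example #6 (both signatures are guesses, not the actual helpers):

import random

def construct_ensemble(T, train_partition):
    """Sketch: train T decision stumps, each on a bootstrap sample."""
    ensemble = []
    for _ in range(T):
        # draw n examples with replacement
        sample = [random.choice(train_partition.data)
                  for _ in range(train_partition.n)]
        boot = Partition(sample, train_partition.F)  # hypothetical constructor
        ensemble.append(DecisionStump(boot))
    return ensemble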
Example #2
def main():

    # read in data (y in {-1,1})
    opts = util.parse_args('Random forests')
    train_partition = util.read_arff(opts.train_filename)
    test_partition = util.read_arff(opts.test_filename)
    T = opts.classifier_nums
    threshold = opts.thresh

    # train the random forest on the training data
    ensemble = random_forest_train_data(train_partition, T)

    # evaluate the ensemble on the test data
    confusion_matrix, FPR, TPR = random_forest_test_data(
        test_partition, ensemble, threshold)

    print('T: ' + str(T) + ' , thresh ' + str(threshold))
    print('\n')
    print(' prediction')
    print('   -1  1')
    print('   -----')
    print('-1| ' + str(int(confusion_matrix[0][0])) + '  ' +
          str(int(confusion_matrix[0][1])))
    print(' 1| ' + str(int(confusion_matrix[1][0])) + '  ' +
          str(int(confusion_matrix[1][1])))
    print('\n')

    # print the false positive and true positive rates
    print('false positive: ' + str(FPR))
    print('true positive: ' + str(TPR))
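random_forest_test_data is not shown; a minimal sketch under the assumption that each classifier's classify returns a label in {-1, 1} and that the threshold is compared against the fraction of +1 votes:

import numpy as np

def random_forest_test_data(test_partition, ensemble, threshold):
    """Sketch: threshold the fraction of +1 votes, then tally a 2x2 matrix."""
    cm = np.zeros((2, 2))
    for example in test_partition.data:
        votes = [clf.classify(example.features) for clf in ensemble]
        prob_pos = votes.count(1) / len(votes)
        y_hat = 1 if prob_pos >= threshold else -1
        cm[(example.label + 1) // 2][(y_hat + 1) // 2] += 1  # map {-1,1} to {0,1}
    FPR = cm[0][1] / (cm[0][0] + cm[0][1])  # false positives / actual negatives
    TPR = cm[1][1] / (cm[1][0] + cm[1][1])  # true positives / actual positives
    return cm, FPR, TPR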
Example #3
def main():

    opts = util.parse_args()
    train_partition = util.read_arff(opts.train_filename)
    test_partition = util.read_arff(opts.test_filename)
    #training: builds the ensemble list and the corresponding score list
    (ds_ensemble, scorelist) = construct_ensemble(opts, train_partition)
    #testing: gets a list of predicted labels from each classifier
    tested_list = testing(test_partition, ds_ensemble, opts.threshold)
    #gets the final predicted labels for test data
    finalpred_lst = finaloutput(tested_list, test_partition, scorelist,
                                opts.threshold)
    #constructs the confusion matrix
    confusion_matrix = util.construct_cm(finalpred_lst, test_partition)
    #computes the true positive and false positive rates for the confusion matrix
    (true_pos, false_pos) = util.rates(confusion_matrix)
    #print statements
    print("T:", opts.T, ", thresh: ", opts.threshold)
    print("        prediction    ")
    print("       -1        1")
    print("-1", "|  ", confusion_matrix[0, 0], "  ", confusion_matrix[0, 1])
    print(" 1", "|  ", confusion_matrix[1, 0], "   ", confusion_matrix[1, 1])
    print(" ")
    print("false positive: ", false_pos)
    print("true positive: ", true_pos)
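Unlike Example #1, this variant weights each classifier's vote by its score (as in AdaBoost). A sketch of finaloutput under that assumption, with the weighted vote rescaled to [0, 1] before thresholding:

def finaloutput(tested_list, test_partition, scorelist, threshold):
    """Sketch: combine the classifiers' predictions by score-weighted vote."""
    final = []
    for i in range(test_partition.n):
        weighted = sum(score * preds[i]
                       for preds, score in zip(tested_list, scorelist))
        # rescale the vote from [-total, total] to [0, 1] before thresholding
        prob_pos = (weighted / sum(scorelist) + 1) / 2
        final.append(1 if prob_pos >= threshold else -1)
    return final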
Example #4
def main():
    opts = util.parse_args()
    train_partition = util.read_arff(opts.train_filename)
    test_partition = util.read_arff(opts.test_filename)

    #training random forest first
    print(opts.T)
    rf_ensemble = random_forest.construct_ensemble(opts.T, train_partition)
    #training AdaBoost next
    (ad_ensemble,
     scorelist) = ada_boost.construct_ensemble(opts, train_partition)
    #initializing threshold that will be changed in the loop
    thresh = -0.1
    #initializes two lists of size 20 to hold true and false positive rates for
    #both the ensemble methods for each threshold value
    rm_forest = [None] * 20
    adaboost = [None] * 20
    #loops to increment threshold
    for i in range(20):
        rm_forest[i] = test_random(test_partition, rf_ensemble, thresh)
        adaboost[i] = test_adaboost(test_partition, ad_ensemble, scorelist,
                                    thresh)
        thresh += 0.06
        print(rm_forest[i], adaboost[i], thresh)

    #plots the roc curves
    plot_data(adaboost, rm_forest, opts.T)
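plot_data is not shown. Assuming test_random and test_adaboost each return an (FPR, TPR) pair, a minimal matplotlib sketch:

import matplotlib.pyplot as plt

def plot_data(adaboost, rm_forest, T):
    """Sketch: plot one ROC curve per ensemble method."""
    plt.plot([fpr for (fpr, tpr) in rm_forest],
             [tpr for (fpr, tpr) in rm_forest], label='Random Forest')
    plt.plot([fpr for (fpr, tpr) in adaboost],
             [tpr for (fpr, tpr) in adaboost], label='AdaBoost')
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('ROC Curves (T = ' + str(T) + ')')
    plt.legend()
    plt.show()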
Example #5
def main():

    opts = util.parse_args()
    train_partition = util.read_arff(opts.train_filename, True)
    test_partition = util.read_arff(opts.test_filename, False)

    # create an instance of the DecisionTree class from the train_partition
    tree = DecisionTree(train_partition, opts.depth)
    rootnode = tree.constructsubtree(train_partition, opts.depth, 0)

    #print text representation of the DecisionTree
    tree.printtree(rootnode)
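printtree is not shown. A sketch of a recursive text printer, assuming each node stores a splitting feature, a children dict keyed by feature value, and is_leaf / label fields (all hypothetical attributes):

def printtree(self, node, depth=0):
    """Sketch: print one 'feature=value' branch per line, indenting by depth."""
    for value, child in node.children.items():
        line = '|   ' * depth + node.feature + '=' + str(value)
        if child.is_leaf:
            print(line + ':', child.label)
        else:
            print(line)
            self.printtree(child, depth + 1)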
Example #6
def main():
    opts = util.parse_args('')
    train_partition = util.read_arff(opts.train_filename)
    test_partition = util.read_arff(opts.test_filename)

    for i in range(train_partition.n):
        example = train_partition.data[i]
        if i == 0 or i == 8:
            example.set_weight(0.25)
        else:
            example.set_weight(0.5 / (train_partition.n - 2))

    for x in train_partition.F:
        print(train_partition.gain(x))

    d = DecisionStump(train_partition)
    print(d)

    for x in test_partition.data:
        print(x.label, d.classify(x.features))
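Since this example reweights the training data before calling gain, the Partition's gain is presumably weight-aware. A sketch of weighted entropy and gain for binary labels, assuming each example exposes weight, label, and a features dict (hypothetical attributes):

from math import log2

def weighted_entropy(examples):
    """Sketch: entropy with counts replaced by example weights."""
    total = sum(e.weight for e in examples)
    ent = 0.0
    for label in (-1, 1):
        p = sum(e.weight for e in examples if e.label == label) / total
        if p > 0:
            ent -= p * log2(p)
    return ent

def gain(examples, feature):
    """Sketch: entropy reduction from splitting on one feature."""
    total = sum(e.weight for e in examples)
    g = weighted_entropy(examples)
    for v in set(e.features[feature] for e in examples):
        subset = [e for e in examples if e.features[feature] == v]
        g -= sum(e.weight for e in subset) / total * weighted_entropy(subset)
    return g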
Example #7
def main():

    opts = util.parse_args()
    train_partition = util.read_arff(opts.train_filename)
    test_partition = util.read_arff(opts.test_filename)

    #Creating Naive Bayes Model
    nb_model = NaiveBayes(train_partition)
    m = len(test_partition.labels)
    confusion_matrix = np.zeros((m, m))  #initializing the confusion matrix
    accuracy = 0
    for x in test_partition.data:
        y_hat = nb_model.classify(x.features)
        y = x.label
        confusion_matrix[y][y_hat] += 1
        if y == y_hat:
            accuracy += 1

    print('Accuracy: ' + str(round(accuracy / test_partition.n, 6)) + ' (' +
          str(accuracy) + ' out of ' + str(test_partition.n) + ' correct)')
    print(confusion_matrix)
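The NaiveBayes class itself is not shown. A sketch of a discrete model with add-one (Laplace) smoothing, assuming labels are 0..K-1, partition.F maps each feature name to its possible values, and example.features is a dict (all assumptions, not the actual class):

from math import log

class NaiveBayes:
    """Sketch: discrete Naive Bayes with add-one (Laplace) smoothing."""

    def __init__(self, partition):
        self.K = partition.K
        self.F = partition.F
        by_class = {y: [e for e in partition.data if e.label == y]
                    for y in range(self.K)}
        # smoothed log priors: (count(y) + 1) / (n + K)
        self.log_prior = {y: log((len(by_class[y]) + 1) / (partition.n + self.K))
                          for y in range(self.K)}
        # smoothed log likelihoods for every (class, feature, value) triple
        self.log_like = {}
        for y, examples in by_class.items():
            for f, values in self.F.items():
                for v in values:
                    count = sum(1 for e in examples if e.features[f] == v)
                    self.log_like[(y, f, v)] = log(
                        (count + 1) / (len(examples) + len(values)))

    def classify(self, features):
        # pick the class with the highest log posterior
        scores = {y: self.log_prior[y] +
                     sum(self.log_like[(y, f, v)] for f, v in features.items())
                  for y in range(self.K)}
        return max(scores, key=scores.get)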
Example #8
def main():
    # Process the data
    opts = util.parse_args()
    train_partition = util.read_arff(opts.train_filename)
    test_partition  = util.read_arff(opts.test_filename)

    # sanity check
    print("num train =", train_partition.n, ", num classes =", train_partition.K)
    print("num test  =", test_partition.n, ", num classes =", test_partition.K)

    nb_model = NaiveBayes(train_partition)

    y_real = [] #list of real y's
    y_h = [] #list of predicted y's
    for example in test_partition.data: #loops through test example list
        y_hat = nb_model.classify(example.features) #calls classify on each example's feature
        y_real.append(int(example.label)) #appends the test data's label to y_real
        y_h.append(y_hat) #appends the predicted label to y_h

    ln = len(nb_model.classes)
    l = len(test_partition.data)
    confusion_matrix = np.zeros((ln,ln)) #makes a confusion matrix of zeroes of the right size first
    for i in range(l):
        y_r = y_real[i]
        pred_y = y_h[i]
        confusion_matrix[y_r][pred_y] += 1  #increments the (actual, predicted) cell

    n = 0 #keeps track of number of accurate data points
    for i in range(ln):
        n += confusion_matrix[i][i] #sums the diagonal


    accuracy = n / l  #computes accuracy

    #printing here
    print("Accuracy", round(accuracy, 7), "(", int(n), " out of ", l , " correct)")
    print("Confusion Matrix:")
    print(confusion_matrix)
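As an aside, the diagonal sum above can be written more compactly with numpy:

n = np.trace(confusion_matrix)           # sum of the diagonal entries
accuracy = n / confusion_matrix.sum()    # every test example fills one cell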
Example #9
def main():
    opts = util.parse_args()
    train_partition = util.read_arff(opts.train_filename, True)
    test_partition = util.read_arff(opts.test_filename, False)

    # create a DecisionTree instance from training data
    if opts.depth:
        DecisionTree.max_depth = opts.depth

    train_dtree = DecisionTree(train_partition, 0)

    # print text representation of the decision tree
    print(train_dtree)

    # evaluate the decision tree on test data
    correct = 0
    for e in test_partition.data:
        if train_dtree.predict(e) == e.label:
            correct += 1

    print(f'{correct} out of {test_partition.n} correct')
    accuracy = Decimal(f'{correct / test_partition.n}').quantize(
        Decimal('1.0000'))
    print(f'accuracy: {accuracy}')
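DecisionTree.predict is not shown; a sketch of the usual descent, assuming hypothetical is_leaf, children, feature, and label attributes on each node:

def predict(self, example):
    """Sketch: follow the branch matching the example's value at each node."""
    node = self
    while not node.is_leaf:
        node = node.children[example.features[node.feature]]
    return node.label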
Example #10
def main():
    """
    Loads data into partitions, creates a Naive Bayes model based on the train
    data, runs the model on the test data, and evaluates its accuracy.
    """
    opts = util.parse_args()
    train_partition, test_partition = util.read_arff(opts.filename)

    nb_model = NaiveBayes(train_partition)

    examples = test_partition.data
    total = len(examples)
    total_correct = 0

    K = test_partition.K
    confusion_matrix = np.zeros((K, K), int)
    for example in examples:
        y_hat = nb_model.classify(example.features)
        y = example.label
        confusion_matrix[y][y_hat] += 1

        if y_hat == y:
            total_correct += 1

    accuracy = round(total_correct / total, 6)
    accuracy_str = "Accuracy: " + str(accuracy) + " ("
    correct_str = str(total_correct) + " out of " + str(total) + " correct)"
    print(accuracy_str + correct_str)
    stretch = 8
    prediction_labels = "   "
    top_row = "   "
    table = ""
    for y_hat in range(K):
        prediction_labels += " " * (stretch -
                                    len(str(y_hat + 1))) + str(y_hat + 1)
        top_row += "-" * stretch
    for y in range(K):
        table += " " + str(y + 1) + "|"
        for y_hat in range(K):
            entry = str(confusion_matrix[y][y_hat])
            table += " " * (stretch - len(entry)) + entry
        table += "\n"
    print("\n\n        prediction")
    print(prediction_labels)
    print(top_row)
    print(table)
Example #11
"""
Run ensemble methods to create ROC curves.
Author: Lamiaa Dakir
Date: 10/28/2019
"""
import util
from random_forest import *
from ada_boost import *
import optparse
import numpy as np
import matplotlib.pyplot as plt
from math import sqrt

# read in data (y in {-1,1})
train_partition = util.read_arff('data/mushroom_train.arff')
test_partition = util.read_arff('data/mushroom_test.arff')

parser = optparse.OptionParser()
parser.add_option('-T',
                  '--classifier_nums',
                  type='int',
                  help='Number of classifiers')
(opts, args) = parser.parse_args()
T = opts.classifier_nums

random_forest_FPRs = []
random_forest_TPRs = []

ada_boost_FPRs = []
ada_boost_TPRs = []
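The script stops before the threshold sweep that would fill these lists. A plausible continuation, assuming helpers shaped like those in Example #2 (random_forest_train_data and random_forest_test_data), plus ada_boost_train_data and ada_boost_test_data as hypothetical analogues:

ensemble = random_forest_train_data(train_partition, T)
classifiers, scores = ada_boost_train_data(train_partition, T)

for threshold in np.linspace(0, 1, 20):
    _, fpr, tpr = random_forest_test_data(test_partition, ensemble, threshold)
    random_forest_FPRs.append(fpr)
    random_forest_TPRs.append(tpr)
    _, fpr, tpr = ada_boost_test_data(test_partition, classifiers, scores,
                                      threshold)
    ada_boost_FPRs.append(fpr)
    ada_boost_TPRs.append(tpr)

The two (FPR, TPR) sequences can then be plotted as in Example #4's plot_data.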
Example #12
def main():

    opts = util.parse_args()
    train_partition = util.read_arff(opts.train_filename, True)
    test_partition = util.read_arff(opts.test_filename, False)