def crossValidateNaiveBayes():
    """Pick the naive Bayes smoothing term with 5-fold cross-validation.

    Reads the five fold files under ``data/data_madelon/folds``, discretizes
    each fold's inputs in place with ``discreteizeData`` and, for every
    candidate smoothing term, trains on four folds and evaluates on the
    held-out fold. The value returned by ``evaluate`` is used as the fold
    accuracy. The term with the highest mean accuracy is printed together
    with the mean over every individual run.

    Relies on the module-level ``numFeatures``.

    Returns:
        tuple: ``(bestSmoothingTerm, bestAccuracy)`` so the selected
        hyperparameter can be reused by the caller. The function used to
        return ``None``; the printed report is unchanged.
    """
    foldInputs = []
    foldLabels = []
    for foldNumber in range(1, 6):
        inputs, labels, _ = read_libsvm_default(
            'data/data_madelon/folds/fold' + str(foldNumber))
        denseInputs = inputs.toarray()
        # NOTE(review): each fold is thresholded against its own column
        # means, so the held-out fold uses different cut-offs than the
        # training folds -- confirm this is intended.
        discreteizeData(denseInputs)
        foldInputs.append(denseInputs)
        foldLabels.append(labels)

    smoothingTerms = [2, 1.5, 1.0, 0.5]

    bestSmoothingTerm = None
    bestAccuracy = 0
    everyAccuracy = []

    for smoothing in smoothingTerms:
        allAccuracies = []
        for heldOut, (heldOutData, heldOutLabels) in enumerate(
                zip(foldInputs, foldLabels)):
            # Training set = every fold except the held-out one, flattened
            # into plain lists of rows / labels.
            allTrainData = []
            allTrainLabels = []
            for j, (inputs, labels) in enumerate(zip(foldInputs, foldLabels)):
                if j != heldOut:
                    allTrainData.extend(inputs)
                    allTrainLabels.extend(labels)

            print("Hyperparameters: smoothing term: " + str(smoothing))

            tempbayes = naiveBayes(numFeatures, smoothing)
            tempbayes.train(allTrainData, allTrainLabels)
            accuracy = tempbayes.evaluate(heldOutData, heldOutLabels)
            allAccuracies.append(accuracy)
            everyAccuracy.append(accuracy)

        meanAccuracy = statistics.mean(allAccuracies)
        # The `is None` guard always accepts the first candidate, so a term
        # is selected even if every mean accuracy turns out to be 0.
        if bestSmoothingTerm is None or meanAccuracy > bestAccuracy:
            bestAccuracy = meanAccuracy
            bestSmoothingTerm = smoothing

    avgAccuracy = statistics.mean(everyAccuracy)
    print("Best smoothing term: " + str(bestSmoothingTerm))
    print("Best accuracy: " + str(bestAccuracy))
    print("Average accuracy: " + str(avgAccuracy))
    return bestSmoothingTerm, bestAccuracy
def crossValidateRandomForest():
    """Pick the random forest size with 5-fold cross-validation.

    Reads the five fold files under ``data/data_semeion/folds`` and, for
    every candidate forest size, trains on four folds and evaluates on the
    held-out fold. The value returned by ``evaluate`` is used as the fold
    accuracy. The size with the highest mean accuracy is printed together
    with the mean over every individual run.

    Relies on the module-level ``numFeatures``.

    Returns:
        tuple: ``(bestForestSize, bestAccuracy)`` so the selected
        hyperparameter can be reused by the caller. The function used to
        return ``None``; the printed report is unchanged.
    """
    foldInputs = []
    foldLabels = []
    for foldNumber in range(1, 6):
        inputs, labels, _ = read_libsvm_default(
            'data/data_semeion/folds/fold' + str(foldNumber))
        foldInputs.append(inputs.toarray())
        foldLabels.append(labels)

    forestSizes = [10, 50, 100]

    bestForestSize = None
    bestAccuracy = 0
    everyAccuracy = []

    for forestSize in forestSizes:
        allAccuracies = []
        for heldOut, (heldOutData, heldOutLabels) in enumerate(
                zip(foldInputs, foldLabels)):
            # Training set = every fold except the held-out one, flattened
            # into plain lists of rows / labels.
            allTrainData = []
            allTrainLabels = []
            for j, (inputs, labels) in enumerate(zip(foldInputs, foldLabels)):
                if j != heldOut:
                    allTrainData.extend(inputs)
                    allTrainLabels.extend(labels)

            print("Hyperparameters: forest size: " + str(forestSize))

            tempforest = randomForest(numFeatures, forestSize)
            tempforest.train(allTrainData, allTrainLabels)
            accuracy = tempforest.evaluate(heldOutData, heldOutLabels)
            allAccuracies.append(accuracy)
            everyAccuracy.append(accuracy)

        meanAccuracy = statistics.mean(allAccuracies)
        # The `is None` guard always accepts the first candidate, so a size
        # is selected even if every mean accuracy turns out to be 0.
        if bestForestSize is None or meanAccuracy > bestAccuracy:
            bestAccuracy = meanAccuracy
            bestForestSize = forestSize

    avgAccuracy = statistics.mean(everyAccuracy)
    print("Best forest size: " + str(bestForestSize))
    print("Best accuracy: " + str(bestAccuracy))
    print("Average accuracy: " + str(avgAccuracy))
    return bestForestSize, bestAccuracy
from libsvm import read_libsvm_default
from naive_bayes import *
import statistics

## Setup Data:
# Load the training split; the third value it returns (numFeatures) is passed
# on to the test-split load so both calls share it.
trainingInputs, trainingLabels, numFeatures = read_libsvm_default('data/data_madelon/madelon_data_train')
testInputs, testLabels, _ = read_libsvm_default('data/data_madelon/madelon_data_test', numFeatures)
# Materialize the inputs through their .toarray() method (presumably sparse
# matrices -- confirm against read_libsvm_default).
# NOTE(review): `np` is not imported explicitly in this section; presumably it
# arrives via `from naive_bayes import *` -- confirm.
trainingInputsArr = np.array(trainingInputs.toarray())
testInputsArr = testInputs.toarray()

## Discretize data:
def discreteizeData(nonDiscreteArr):
    """Binarize a 2-D table in place against its per-column means.

    Every value that is less than or equal to the mean of its column becomes
    0; every other value becomes 1. The column means are computed once, from
    the original values, before anything is overwritten.

    Args:
        nonDiscreteArr: a 2-D ``numpy.ndarray`` or a nested sequence of
            mutable rows (e.g. a list of lists). It is modified in place.

    Returns:
        The same object that was passed in, now holding only 0s and 1s.
    """
    means = np.mean(nonDiscreteArr, axis=0)

    if isinstance(nonDiscreteArr, np.ndarray) and nonDiscreteArr.ndim == 2:
        # Broadcast the column means across all rows in a single pass; the
        # slice assignment keeps the caller's array object (and its dtype).
        nonDiscreteArr[...] = np.where(nonDiscreteArr <= means, 0, 1)
        return nonDiscreteArr

    # Fallback for nested sequences: same rule, element by element.
    for row in nonDiscreteArr:
        for j, value in enumerate(row):
            row[j] = 0 if value <= means[j] else 1
    return nonDiscreteArr


# Binarize the training inputs; the return value is ignored because
# discreteizeData overwrites its argument in place.
# NOTE(review): testInputsArr is not discretized in the code visible here --
# confirm that happens before it is used for evaluation.
discreteizeData(trainingInputsArr)


def crossValidateNaiveBayes():
    """Read the five cross-validation fold files under data/data_madelon/folds.

    NOTE(review): the visible body only loads the folds and binds the results
    to locals that are never used, so the function implicitly returns None.
    The rest of the routine appears to be missing here -- confirm.
    """
    # Each load yields three values; the third one is discarded.
    f1Inputs, f1Labels, _ = read_libsvm_default('data/data_madelon/folds/fold1')
    f2Inputs, f2Labels, _ = read_libsvm_default('data/data_madelon/folds/fold2')
    f3Inputs, f3Labels, _ = read_libsvm_default('data/data_madelon/folds/fold3')
    f4Inputs, f4Labels, _ = read_libsvm_default('data/data_madelon/folds/fold4')
    f5Inputs, f5Labels, _ = read_libsvm_default('data/data_madelon/folds/fold5')
from libsvm import read_libsvm_default
import csv
import numpy as np
from data import Data
from random_forest import randomForest
import statistics

# Seed NumPy's global random number generator with a fixed value.
np.random.seed(4)

## Setup Data:
# Load the training split; the third value it returns (numFeatures) is passed
# on to the test-split load so both calls share it.
trainingInputs, trainingLabels, numFeatures = read_libsvm_default(
    'data/data-splits/data.train')
testInputs, testLabels, _ = read_libsvm_default('data/data-splits/data.test',
                                                numFeatures)
# Materialize the inputs through their .toarray() method (presumably sparse
# matrices -- confirm against read_libsvm_default).
trainingInputsArr = trainingInputs.toarray()
testInputsArr = testInputs.toarray()


## Discretize data:
def discreteizeData(nonDiscreteArr):
    """Binarize a 2-D table in place against its per-column means.

    Every value that is less than or equal to the mean of its column becomes
    0; every other value becomes 1. The column means are computed once, from
    the original values, before anything is overwritten.

    Args:
        nonDiscreteArr: a 2-D ``numpy.ndarray`` or a nested sequence of
            mutable rows (e.g. a list of lists). It is modified in place.

    Returns:
        The same object that was passed in, now holding only 0s and 1s.
    """
    means = np.mean(nonDiscreteArr, axis=0)

    if isinstance(nonDiscreteArr, np.ndarray) and nonDiscreteArr.ndim == 2:
        # Broadcast the column means across all rows in a single pass; the
        # slice assignment keeps the caller's array object (and its dtype).
        nonDiscreteArr[...] = np.where(nonDiscreteArr <= means, 0, 1)
        return nonDiscreteArr

    # Fallback for nested sequences: same rule, element by element.
    for row in nonDiscreteArr:
        for j, value in enumerate(row):
            row[j] = 0 if value <= means[j] else 1
    return nonDiscreteArr


# Binarize the training inputs; the return value is ignored because
# discreteizeData overwrites its argument in place.
# NOTE(review): testInputsArr is not discretized in the code visible here --
# confirm that happens before it is used for evaluation.
discreteizeData(trainingInputsArr)
# Example no. 5
# 0
from libsvm import read_libsvm_default
from naive_bayes import *
import statistics

## Setup Data:
# Load the training split; the third value it returns (numFeatures) is passed
# on to the test-split load so both calls share it.
trainingInputs, trainingLabels, numFeatures = read_libsvm_default('data/data_semeion/hand_data_train')
testInputs, testLabels, _ = read_libsvm_default('data/data_semeion/hand_data_test', numFeatures)
# Materialize the inputs through their .toarray() method (presumably sparse
# matrices -- confirm against read_libsvm_default).
trainingInputsArr = trainingInputs.toarray()
testInputsArr = testInputs.toarray()

def crossValidateNaiveBayes():
    f1Inputs, f1Labels, _ = read_libsvm_default('data/data_semeion/folds/fold1')
    f2Inputs, f2Labels, _ = read_libsvm_default('data/data_semeion/folds/fold2')
    f3Inputs, f3Labels, _ = read_libsvm_default('data/data_semeion/folds/fold3')
    f4Inputs, f4Labels, _ = read_libsvm_default('data/data_semeion/folds/fold4')
    f5Inputs, f5Labels, _ = read_libsvm_default('data/data_semeion/folds/fold5')
    allFoldInputArrays = [f1Inputs.toarray(), f2Inputs.toarray(),
                          f3Inputs.toarray(), f4Inputs.toarray(), f5Inputs.toarray()]
    allFoldLabelArrays = [f1Labels, f2Labels, f3Labels, f4Labels, f5Labels]

    smoothingTerms = [2.0, 1.5, 1.0, 0.5]

    bestSmoothingTerm = None
    bestAccuracy = 0

    counter = 1

    everyAccuracy = []

    for smoothing in smoothingTerms:
        allAccuracies = []