Ejemplo n.º 1
0
# -*- coding: utf-8 -*-
"""
Created on Tue Nov  6 17:39:00 2018

@author: Erik
"""

from sklearn.naive_bayes import MultinomialNB
from get_data import get_data_tfidf, get_data_custom
from Score import Score, average_scores
from sklearn.model_selection import KFold

# Load custom n-gram features: (file name, max gram length, min occurrences of a gram).
X, y = get_data_custom("data-2_train.csv", 3, 2)

# Evaluate Multinomial Naive Bayes with 10-fold cross-validation,
# recording a Score for both the held-out fold and the training fold.
kf = KFold(n_splits=10)
kf.get_n_splits(X)
test_scores = []
train_scores = []

for fold_train_idx, fold_test_idx in kf.split(X):
    # Slice this fold's train/test partitions.
    fold_X_train = X[fold_train_idx]
    fold_X_test = X[fold_test_idx]
    fold_y_train = y[fold_train_idx]
    fold_y_test = y[fold_test_idx]

    # Fit a fresh model per fold so folds don't leak into each other.
    model = MultinomialNB()
    model.fit(fold_X_train, fold_y_train)

    # Held-out performance first, then performance on the training data itself
    # (useful for spotting over/under-fitting).
    test_scores.append(Score(fold_y_test, model.predict(fold_X_test)))
    train_scores.append(Score(fold_y_train, model.predict(fold_X_train)))
Ejemplo n.º 2
0
Created on Sat Oct 27 10:04:55 2018

@author: Erik
"""

from keras.models import Sequential
from keras.layers import Dense
from keras.optimizers import SGD
from get_data import get_data_custom, one_hot_encode
#used to split data
from sklearn.model_selection import train_test_split

# get_data_custom args: file name, max gram length, min occurrences of a gram.
# Author's observations: get_data('data-1_train.csv', 1, 0) reaches ~71-73% test
# accuracy; get_data_custom('data-1_train.csv', 3, 3) yields ~72% but is much slower.
X, y = get_data_custom('data-1_train.csv', 2, 0, False)
# Labels are one-hot encoded to match the softmax output layer below.
y = one_hot_encode(y)

# Hold out 10% as the final test set, then carve 20% of the remainder
# off as a validation set for monitoring training.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1)
X_train, X_validation, y_train, y_validation = train_test_split(X_train,
                                                                y_train,
                                                                test_size=0.2)

# Feed-forward network: input width is the feature-vector length.
ffnn = Sequential()
ffnn.add(Dense(1024, input_dim=len(X_train[0]), activation='relu'))
# Second hidden layer — conventionally narrower than the first; oversized for this data.
ffnn.add(Dense(512, activation='relu'))
# Softmax output sized to the number of label classes. NOTE(review): the original
# comment claimed categories 0-3 (four classes), but Dense(3) emits only three
# outputs — confirm the label set produced by one_hot_encode has exactly 3 classes.
ffnn.add(Dense(3, activation='softmax'))
ffnn.compile(optimizer=SGD(lr=0.1),
Ejemplo n.º 3
0
from keras.regularizers import l1, l2, l1_l2

# Experiment grid: every (data set, algorithm, pre-processing) combination
# is evaluated with 10-fold cross-validation.
DATA_SETS = ['data-1_train.csv', 'data-2_train.csv']
ALGOS = ['nn', 'nb', 'dt', 'rf']
PRE_PROCS = ['tfidf', 'cust']

# NOTE(review): this handle is never closed in the visible code, and 'file'
# shadows the Python 2 builtin of the same name — prefer a 'with' block.
file = open('test.csv', 'w')
file.write("test")

for ds in DATA_SETS:
    for alg in ALGOS:
        for proc in PRE_PROCS:
            # Build the feature matrix with the selected pre-processing scheme.
            if proc == 'tfidf':
                X, y = get_data_tfidf(ds)
            else:
                X, y = get_data_custom(ds, 2, 0, False)

            # One-hot labels (used for the neural-network path).
            y_encode = one_hot_encode(y)

            kf = KFold(n_splits=10)
            kf.get_n_splits(X)
            scores = []

            print("Working on: " + ds + " " + alg + " " + proc)
            i = 1  # 1-based fold counter, printed for progress only
            for train_index, test_index in kf.split(X):
                print(i)
                i += 1
                X_train, X_test = X[train_index], X[test_index]
                # NOTE(review): y_train comes from the one-hot y_encode while
                # y_test comes from the raw y — mixing encodings looks like a
                # bug unless the scoring below (out of view) expects it; verify.
                y_train, y_test = y_encode[train_index], y[test_index]
Ejemplo n.º 4
0
# -*- coding: utf-8 -*-
"""
THIS ALWAYS OUTPUTS THE MAJORITY CLASS!!!  Something is broken about the C and gamma values most likely but I can't find a good
combination so far.
"""

from get_data import get_data_custom
from sklearn.model_selection import train_test_split
from sklearn import svm

print("SVM:\n")

# get_data_custom args: file name, max gram length, min occurrences of a gram.
X, y = get_data_custom('data-1_train.csv', 2, 2)

# Hold out 10% of the data as the test set.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1)
# Commented-out manual split kept from earlier debugging (tiny fixed slices).
#X_train = X[0:4]
#X_test = X[4: 6]
#y_train = y[0: 4]
#y_test = y[4:6]
'''This is what really needs work.  Right now it just outputs the majority class.  We need to tune the hyper
parameters C and gamma.  Please refer to the documentation for their definitions.

The problem is that currently the modl just outputs the majority label ~50% accuracy.  Your task is to find a way to improve it.

A crude attempt at hyper parameter tuning is below
'''
# Grid of candidate hyper-parameters for the SVM; the search loop that
# consumes Cs and gammas continues beyond this excerpt.
Cs = [0.1, 1, 10, 100, 1000]  # candidate regularization strengths (C)
gammas = [1e-20, 1e-10, 1e-5,
          1e-2]  # candidate RBF kernel coefficients (gamma)