def main():
    from import_data import import_csv
    from sklearn.decomposition import PCA
    import numpy as np
    import time
    data_files_path = 'data_and_scripts/'
    
    TRAIN_INPUTS_PATH = data_files_path+'train_inputs.csv'
    TEST_INPUTS_PATH = data_files_path+'test_inputs.csv'

    TRAIN_INPUTS_SUBSET_PATH = data_files_path+'train_inputs_subset.csv'
    
    #get the original inputs
    starttime = time.clock()
    train_inputs = import_csv(TRAIN_INPUTS_PATH)
    test_inputs = import_csv(TEST_INPUTS_PATH)
    print 'Time to import: %0.1f'%(time.clock() - starttime)    
    
    N,K = np.shape(train_inputs)
    T,Ki = np.shape(test_inputs)
    print N , K
    print T , Ki
    #concatenate train and test image
    concat = np.concatenate((train_inputs, test_inputs), axis=0)    
    print 'concatenated'
    print np.shape(concat)
    
    #apply transformation
    starttime = time.clock()
    desired = 500  # target feature count; also used for the CSV header below
    transformed_concat = transform_features(concat, desired)
    print 'Time to transform: %0.1f'%(time.clock() - starttime)
    
    #apply PCA
    # starttime = time.clock()
    # desired=500
    # print 'Reducing feature set size from %d to %d...'%(K,desired)
    # features = PCA(n_components=desired).fit_transform(transformed_concat)
    # print 'Time to transform: %0.1f'%(time.clock() - starttime)
    features = transformed_concat
    
    #split
    transform_train_inputs, transform_test_inputs = features[:N], features[N:]
    
    #save to csv file    
    starttime = time.clock()
    print 'saving to csv file'
    header = makeHeader(desired)
    #Id column
    transform_train_inputs = np.concatenate((np.arange(N).reshape(N,1),transform_train_inputs), axis=1)
    transform_test_inputs = np.concatenate((np.arange(T).reshape(T,1),transform_test_inputs),axis=1)
    np.savetxt(data_files_path+'transformed_train_inputs.csv',transform_train_inputs,fmt='%f', delimiter=',', newline='\n', header=header,comments='')
    np.savetxt(data_files_path+ 'transformed_test_inputs.csv',transform_test_inputs,fmt='%f', delimiter=',', newline='\n', header=header,comments='')
    print 'Time to save: %0.1f'%(time.clock() - starttime)
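
# A minimal, self-contained sketch of the joint PCA reduction that the
# commented-out block in main() performs: train and test rows are stacked so
# that both splits are projected onto the same components. The random arrays
# below stand in for the real CSV inputs; 'desired' is the target dimension.
import numpy as np
from sklearn.decomposition import PCA

def pca_reduce_jointly(train, test, desired):
    concat = np.concatenate((train, test), axis=0)
    reduced = PCA(n_components=desired).fit_transform(concat)
    # split back into the original train/test rows, as main() does above
    return reduced[:len(train)], reduced[len(train):]

_tr, _te = pca_reduce_jointly(np.random.rand(40, 30), np.random.rand(10, 30), 5)
print np.shape(_tr), np.shape(_te)   # (40, 5) (10, 5)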
Example #2
def command_line_run(args):
    # Group numeric arguments under the flag that precedes them, e.g.
    # ['run.py', '-hn', '100', '50'] -> {'-hn': [100.0, 50.0]}; the temporary
    # -1 key always points at the current flag's value list.
    args_dict = {}
    for i in range(1, len(args)):
        if '-' in args[i]:
            args_dict[args[i]] = []
            args_dict[-1] = args_dict[args[i]]
        else:
            args_dict[-1].append(float(args[i]))
    del args_dict[-1]

    num_classes = 10
    random.seed(1917)

    if '-debug' in args_dict:
        train_outputs = import_csv(TRAIN_OUTPUTS_SUBSET_PATH).astype(int)
        train_inputs = import_csv(TRAIN_INPUTS_SUBSET_PATH)
    else:
        train_outputs = import_csv(TRAIN_OUTPUTS_PATH).astype(int)
        train_inputs = import_csv(TRAIN_INPUTS_PATH)

    if '-t' in args_dict:
        print len(train_inputs)
        train_inputs = np.array(transform_features(train_inputs))
        print len(train_inputs)

    # Default values.
    hnh = []
    num_features = 300
    dropout = None
    lr = 1.0
    epochs = 50

    if '-f' in args_dict:
        num_features = map(int, args_dict['-f'])[0]

    if '-test' in args_dict:
        test_inputs = import_csv(TEST_INPUTS_PATH)

        if '-t' in args_dict:
            test_inputs = transform_features(test_inputs)

        if not num_features == len(train_inputs[0]):
            alll = feature_reduce(
                np.array(list(train_inputs) + list(test_inputs)), num_features)
            train_inputs = alll[:len(train_inputs)]
            test_inputs = alll[len(train_inputs):]

    if '-validate' in args_dict:
        validation_size = (4 * len(train_inputs)) / 5

        # Randomize the train and validation set.
        rand_idxs = random.sample(range(0, len(train_inputs)),
                                  len(train_inputs))

        test_inputs = train_inputs[rand_idxs[validation_size:]]
        test_outputs = train_outputs[rand_idxs[validation_size:]]
        train_inputs = train_inputs[rand_idxs[0:validation_size]]
        train_outputs = train_outputs[rand_idxs[0:validation_size]]

        # Reduce the train and validation features together: PCA is unsupervised,
        # and fitting it on the combined matrix guarantees the same components
        # are applied to both sets.
        if not num_features == len(train_inputs[0]):
            alll = feature_reduce(
                np.array(list(train_inputs) + list(test_inputs)), num_features)
            train_inputs = alll[:len(train_inputs)]
            test_inputs = alll[len(train_inputs):]

    if '-hn' in args_dict:
        hnh = map(int, args_dict['-hn'])

    if '-d' in args_dict:
        if not (0.0 <= args_dict['-d'][0] <= 1.0):
            print 'Please input a dropout rate between 0 and 1!'
            exit(0)
        dropout = args_dict['-d'][0]

    if '-lr' in args_dict:
        lr = args_dict['-lr'][0]

    if '-e' in args_dict:
        epochs = int(args_dict['-e'][0])

    nn = NeuralNetwork(len(train_inputs[0]),
                       hnh,
                       num_classes,
                       learning_rate=lr,
                       dropout=dropout)
    nn.fit(train_inputs, train_outputs, training_horizon=epochs, verbose=True)
    p = nn.predict(test_inputs)

    fname = data_files_path + 'predictions_with_%depochs_%dfeatures_%0.2flf' % (
        epochs, num_features, lr)
    if '-test' in args_dict:
        with open(fname + '.csv', 'w') as f:
            f.write('Id,Prediction\n')
            for i in range(len(p)):
                f.write('%d,%d\n' % (i + 1, p[i]))
    else:
        print accuracy(p, test_outputs)
        if '-record' in args_dict:
            heatmap(p, test_outputs, fname)
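
# A hedged sketch of how command_line_run would be invoked: it skips args[0]
# and groups numeric values under the most recent flag, which matches passing
# sys.argv straight through. Every flag value in the example command line is
# illustrative, not taken from the original project.
import sys

if __name__ == '__main__':
    # e.g.: python run_nn.py -validate -t -f 100 -hn 200 100 -d 0.5 -lr 0.1 -e 30
    command_line_run(sys.argv)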
Example #3
from import_data import import_csv
import scipy
from matplotlib import pyplot as plt

x,y = import_csv('../resampled-to-8bit.csv')

im = x[0]
plt.imshow(im)
plt.show()

"""
scipy.misc.imshow(im)
scipy.misc.imshow(misc.imrotate(im, 45))
scipy.misc.imshow(misc.imrotate(im, 15))
scipy.misc.imshow(misc.imrotate(im, 90))
scipy.misc.imshow(misc.imrotate(im, 100))
"""

import numpy as np
from import_data import import_csv
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split, GridSearchCV  # sklearn >= 0.18
from sklearn.linear_model import LogisticRegression
from graphic import heatmap
from classification import accuracy

if __name__ == '__main__':

    data_files_path = 'data_and_scripts/'

    TRAIN_INPUTS_PATH = data_files_path + 'train_inputs324.csv'
    TRAIN_OUTPUTS_PATH = data_files_path + 'train_outputs.csv'
    TEST_INPUTS_PATH = data_files_path + 'test_inputs324.csv'

    TRAIN_INPUTS_SUBSET_PATH = data_files_path + 'train_inputs_subset.csv'
    TRAIN_OUTPUTS_SUBSET_PATH = data_files_path + 'train_outputs_subset.csv'

    train_outputs = import_csv(TRAIN_OUTPUTS_PATH).astype(int)
    train_inputs = import_csv(TRAIN_INPUTS_PATH)

    print np.shape(train_outputs)
    print np.shape(train_inputs)

    #randomly split the data into a train set and a validation set
    train_x, test_x, train_y, test_y = train_test_split(train_inputs,
                                                        train_outputs,
                                                        test_size=0.2,
                                                        random_state=17)

    #use the training set to find the best regularization parameter C
    param_grid = {'C': [0.001, 0.01, 0.1, 1, 10, 100, 1000]}
    clf = GridSearchCV(LogisticRegression(penalty='l2'), param_grid, n_jobs=-1)
    clf.fit(train_x, train_y)
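
    # A hedged follow-up to the grid search above: report the best C found and
    # its mean cross-validation accuracy. best_params_ and best_score_ are
    # standard GridSearchCV attributes available after fit.
    print 'Best parameters set found on development set:'
    print clf.best_params_
    print 'Best cross-validation accuracy: %0.4f' % clf.best_score_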
import numpy as np
from import_data import import_csv
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split, GridSearchCV  # sklearn >= 0.18
from sklearn.svm import LinearSVC
from graphic import heatmap
from classification import accuracy

if __name__ == "__main__":

    data_files_path = "data_and_scripts/"

    TRAIN_INPUTS_PATH = data_files_path + "train_inputs324.csv"
    TRAIN_OUTPUTS_PATH = data_files_path + "train_outputs.csv"
    TEST_INPUTS_PATH = data_files_path + "test_inputs324.csv"

    TRAIN_INPUTS_SUBSET_PATH = data_files_path + "train_inputs_subset.csv"
    TRAIN_OUTPUTS_SUBSET_PATH = data_files_path + "train_outputs_subset.csv"

    train_outputs = import_csv(TRAIN_OUTPUTS_PATH).astype(int)
    train_inputs = import_csv(TRAIN_INPUTS_PATH)

    print np.shape(train_outputs)
    print np.shape(train_inputs)

    # randomly split the data into a train set and a validation set
    train_x, test_x, train_y, test_y = train_test_split(train_inputs, train_outputs, test_size=0.2, random_state=17)

    # use the training set to find the best regularization parameter C
    param_grid = {"C": [0.001, 0.01, 0.1, 1, 10, 100, 1000]}
    clf = GridSearchCV(LinearSVC(penalty="l2"), param_grid, n_jobs=-1)
    clf.fit(train_x, train_y)
    # best parameter
    print "Best parameters set found on development set:"
    print clf.best_params_
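
    # A hedged continuation: confusion_matrix and the project's heatmap helper
    # are imported above but unused in this snippet, so a plausible follow-up is
    # to inspect where the tuned model errs on the held-out split. The heatmap
    # and accuracy call signatures mirror their use in command_line_run and are
    # assumptions about the project's own helpers.
    predictions = clf.predict(test_x)
    print confusion_matrix(np.ravel(test_y), predictions)
    print 'Validation accuracy:', accuracy(predictions, np.ravel(test_y))
    heatmap(predictions, np.ravel(test_y), data_files_path + 'svc_validation_heatmap')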
Example #7
import numpy as np
import cPickle as pickle
#from scipy import sparse
#from sklearn.feature_extraction.text import TfidfVectorizer
#import en
#import peach
#import sklearn.feature_extraction.text as skltext
import fea_extract
import import_data as ip
import evaluation as eval
import classification as clf

# loading data from files
train_data = ip.import_csv('../Data/train.csv')
test_data = ip.import_csv('../Data/test.csv')


# loading the pre computed data
classes = ['s', 'w', 'c']
saved_keywords_list = []
saved_train_fea = []
saved_test_fea = []
output_folder = 'outputs/'
for c in classes:
    try:
        with open(output_folder + 'keywords_list_'+c+'.p', 'rb') as fp:
            saved_keywords_list.append(pickle.load(fp))
        #saved_keywords_list.append(np.load('keywords_list_'+c+'.npy'))
        saved_train_fea.append(np.load(output_folder + 'train_fea_'+c+'.npy'))
        saved_test_fea.append(np.load(output_folder + 'test_fea_'+c+'.npy'))
    except IOError:
        # assumed fallback: skip any class whose precomputed files are missing
        pass
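
# A hedged sketch of the save side of this cache, mirroring the filenames that
# the loop above reads back; how keywords_list and the feature arrays are
# actually computed is project-specific (fea_extract), so they are simply
# parameters here.
def save_precomputed(c, keywords_list, train_fea, test_fea):
    with open(output_folder + 'keywords_list_' + c + '.p', 'wb') as fp:
        pickle.dump(keywords_list, fp)
    np.save(output_folder + 'train_fea_' + c + '.npy', train_fea)
    np.save(output_folder + 'test_fea_' + c + '.npy', test_fea)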
Example #9
# Author: Kian Kenyon-Dean
# The purpose of this program is to make 12 new images for each image
# after a random rotation of d degrees for each one.

from import_data import import_csv,ORIGINAL_DIRECTORY,images_to_csv
from scipy import misc
import numpy as np

if __name__ == '__main__':
    destination = '../rotated_images/'
    degree_rotations = []
    x,y = import_csv(ORIGINAL_DIRECTORY+'.csv')

    np.random.seed(1917)
    for i in range(len(x)):
        image_matrix = x[i]
        image_class = y[i]
     
        randomized_degrees = np.random.uniform(0,360,12) 
        degree_rotations.append(randomized_degrees)  
        for d in randomized_degrees:
            new_image = misc.imrotate(image_matrix, d)

            name = 'class-%d-sample-%d-degree-%d-%s.png'\
                    %(int(image_class), (i+1)%101, int(d), str(d).split('.')[1][:5])

            misc.imsave(destination + name, new_image)

    with open('../random_degree_rotations.csv','w') as f:
        f.write(','*12)
        for arr in degree_rotations:
            # assumed row format: the 12 rotation angles used for one image
            f.write('\n' + ','.join('%f' % d for d in arr))
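
    # A hedged follow-up sketch: spot-check one of the generated files by
    # loading it back and displaying it. glob / misc.imread / matplotlib are
    # standard; the filename pattern simply matches what imsave wrote above.
    import glob
    from matplotlib import pyplot as plt

    saved = sorted(glob.glob(destination + 'class-*-sample-*-degree-*.png'))
    if saved:
        check = misc.imread(saved[0])
        plt.imshow(check, cmap='gray')
        plt.title(saved[0])
        plt.show()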