def command_line_run(args):
    # Ad-hoc flag parser: each '-flag' maps to the list of numbers that
    # follow it; the temporary key -1 points at the most recent flag's list.
    # (Note: the '-' test means a negative numeric argument would be
    # misread as a flag.)
    args_dict = {}
    for i in range(1, len(args)):
        if '-' in args[i]:
            args_dict[args[i]] = []
            args_dict[-1] = args_dict[args[i]]
        else:
            args_dict[-1].append(float(args[i]))
    if -1 in args_dict:
        del args_dict[-1]

    num_classes = 10
    random.seed(1917)

    if '-debug' in args_dict:
        train_outputs = import_csv(TRAIN_OUTPUTS_SUBSET_PATH).astype(int)
        train_inputs = import_csv(TRAIN_INPUTS_SUBSET_PATH)
    else:
        train_outputs = import_csv(TRAIN_OUTPUTS_PATH).astype(int)
        train_inputs = import_csv(TRAIN_INPUTS_PATH)

    if '-t' in args_dict:
        print len(train_inputs)
        train_inputs = np.array(transform_features(train_inputs))
        print len(train_inputs)

    # Default values.
    hnh = []
    num_features = 300
    dropout = None
    lr = 1.0
    epochs = 50

    if '-f' in args_dict:
        num_features = map(int, args_dict['-f'])[0]

    if '-test' in args_dict:
        test_inputs = import_csv(TEST_INPUTS_PATH)
        if '-t' in args_dict:
            test_inputs = transform_features(test_inputs)
        if not num_features == len(train_inputs[0]):
            alll = feature_reduce(
                np.array(list(train_inputs) + list(test_inputs)), num_features)
            train_inputs = alll[:len(train_inputs)]
            test_inputs = alll[len(train_inputs):]

    if '-validate' in args_dict:
        validation_size = (4 * len(train_inputs)) // 5
        # Randomize the train and validation set.
        rand_idxs = random.sample(range(0, len(train_inputs)), len(train_inputs))
        test_inputs = train_inputs[rand_idxs[validation_size:]]
        test_outputs = train_outputs[rand_idxs[validation_size:]]
        train_inputs = train_inputs[rand_idxs[0:validation_size]]
        train_outputs = train_outputs[rand_idxs[0:validation_size]]
        # We have to reduce the features all at the same time because PCA is
        # unsupervised and we want it to pick the same features for both the
        # train and test sets.
        if not num_features == len(train_inputs[0]):
            alll = feature_reduce(
                np.array(list(train_inputs) + list(test_inputs)), num_features)
            train_inputs = alll[:len(train_inputs)]
            test_inputs = alll[len(train_inputs):]

    if '-hn' in args_dict:
        hnh = map(int, args_dict['-hn'])
    if '-d' in args_dict:
        if not (0.0 <= args_dict['-d'][0] <= 1.0):
            print 'Please input a dropout rate between 0 and 1!'
            exit(0)
        dropout = args_dict['-d'][0]
    if '-lr' in args_dict:
        lr = args_dict['-lr'][0]
    if '-e' in args_dict:
        epochs = int(args_dict['-e'][0])

    nn = NeuralNetwork(len(train_inputs[0]), hnh, num_classes,
                       learning_rate=lr, dropout=dropout)
    nn.fit(train_inputs, train_outputs, training_horizon=epochs, verbose=True)
    # Note: one of -test / -validate must be given, otherwise test_inputs
    # is never defined before this point.
    p = nn.predict(test_inputs)

    fname = data_files_path + 'predictions_with_%depochs_%dfeatures_%0.2flr' % (
        epochs, num_features, lr)
    if '-test' in args_dict:
        with open(fname + '.csv', 'w') as f:
            f.write('Id,Prediction\n')
            for i in range(len(p)):
                f.write('%d,%d\n' % (i + 1, p[i]))
    else:
        print accuracy(p, test_outputs)
        if '-record' in args_dict:
            heatmap(p, test_outputs, fname)
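# Example invocation (hypothetical script name; the parser above maps each
# '-flag' to the list of numbers that follow it):
#
#   python neural_net_run.py -f 300 -hn 100 50 -d 0.5 -lr 0.1 -e 20 -validate
#
# yields args_dict == {'-f': [300.0], '-hn': [100.0, 50.0], '-d': [0.5],
# '-lr': [0.1], '-e': [20.0], '-validate': []} before the defaults are applied.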
from import_data import import_csv
from scipy import misc
from matplotlib import pyplot as plt

x, y = import_csv('../resampled-to-8bit.csv')

im = x[0]
plt.imshow(im)
plt.show()  # imshow alone does not open a window outside interactive mode

"""
scipy.misc.imshow(im)
scipy.misc.imshow(misc.imrotate(im, 45))
scipy.misc.imshow(misc.imrotate(im, 15))
scipy.misc.imshow(misc.imrotate(im, 90))
scipy.misc.imshow(misc.imrotate(im, 100))
"""
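# A minimal runnable sketch of the same check the commented block above was
# doing, using matplotlib instead of the interactive scipy.misc.imshow viewer
# (angles taken from the commented calls):
for k, angle in enumerate([15, 45, 90, 100]):
    plt.subplot(1, 4, k + 1)
    plt.imshow(misc.imrotate(im, angle))
    plt.title('%d deg' % angle)
plt.show()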
import numpy as np
from import_data import import_csv
# On sklearn < 0.18 these two live in sklearn.cross_validation and
# sklearn.grid_search instead.
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix
from graphic import heatmap
from classification import accuracy

if __name__ == '__main__':
    data_files_path = 'data_and_scripts/'
    TRAIN_INPUTS_PATH = data_files_path + 'train_inputs324.csv'
    TRAIN_OUTPUTS_PATH = data_files_path + 'train_outputs.csv'
    TEST_INPUTS_PATH = data_files_path + 'test_inputs324.csv'
    TRAIN_INPUTS_SUBSET_PATH = data_files_path + 'train_inputs_subset.csv'
    TRAIN_OUTPUTS_SUBSET_PATH = data_files_path + 'train_outputs_subset.csv'

    train_outputs = import_csv(TRAIN_OUTPUTS_PATH).astype(int)
    train_inputs = import_csv(TRAIN_INPUTS_PATH)
    print np.shape(train_outputs)
    print np.shape(train_inputs)

    # randomly split the data into a train set and a validation set
    train_x, test_x, train_y, test_y = train_test_split(
        train_inputs, train_outputs, test_size=0.2, random_state=17)

    # use the training set to find the best regularization strength C
    param_grid = {'C': [0.001, 0.01, 0.1, 1, 10, 100, 1000]}
    clf = GridSearchCV(LogisticRegression(penalty='l2'), param_grid, n_jobs=-1)
    clf.fit(train_x, train_y)
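    # Hedged follow-up sketch (not in the original script): report the grid
    # search winner and its accuracy on the held-out validation split above.
    print 'Best C: %s' % clf.best_params_['C']
    print 'Validation accuracy: %f' % clf.score(test_x, test_y)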
import numpy as np
from import_data import import_csv
# On sklearn < 0.18 these two live in sklearn.cross_validation and
# sklearn.grid_search instead.
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.svm import LinearSVC
from sklearn.metrics import confusion_matrix
from graphic import heatmap
from classification import accuracy

if __name__ == "__main__":
    data_files_path = "data_and_scripts/"
    TRAIN_INPUTS_PATH = data_files_path + "train_inputs324.csv"
    TRAIN_OUTPUTS_PATH = data_files_path + "train_outputs.csv"
    TEST_INPUTS_PATH = data_files_path + "test_inputs324.csv"
    TRAIN_INPUTS_SUBSET_PATH = data_files_path + "train_inputs_subset.csv"
    TRAIN_OUTPUTS_SUBSET_PATH = data_files_path + "train_outputs_subset.csv"

    train_outputs = import_csv(TRAIN_OUTPUTS_PATH).astype(int)
    train_inputs = import_csv(TRAIN_INPUTS_PATH)
    print np.shape(train_outputs)
    print np.shape(train_inputs)

    # randomly split the data into a train set and a validation set
    train_x, test_x, train_y, test_y = train_test_split(
        train_inputs, train_outputs, test_size=0.2, random_state=17)

    # use the training set to find the best regularization strength C
    param_grid = {"C": [0.001, 0.01, 0.1, 1, 10, 100, 1000]}
    clf = GridSearchCV(LinearSVC(penalty="l2"), param_grid, n_jobs=-1)
    clf.fit(train_x, train_y)

    # best parameter (the original was cut off after the announcement line;
    # printing best_params_ is the natural completion)
    print "Best parameters set found on development set:"
    print clf.best_params_
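    # Hedged follow-up sketch: the confusion_matrix/accuracy imports above are
    # otherwise unused here, suggesting the validation predictions were checked
    # like this (accuracy() takes predictions first, as elsewhere in this repo):
    pred_y = clf.predict(test_x)
    print confusion_matrix(test_y, pred_y)
    print accuracy(pred_y, test_y)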
import numpy as np
import cPickle as pickle
#from scipy import sparse
#from sklearn.feature_extraction.text import TfidfVectorizer
#import en
#import peach
#import sklearn.feature_extraction.text as skltext

import fea_extract
import import_data as ip
import evaluation as eval
import classification as clf

# loading data from files
train_data = ip.import_csv('../Data/train.csv')
test_data = ip.import_csv('../Data/test.csv')

# loading the pre-computed data
classes = ['s', 'w', 'c']
saved_keywords_list = []
saved_train_fea = []
saved_test_fea = []
output_folder = 'outputs/'
for c in classes:
    try:
        with open(output_folder + 'keywords_list_' + c + '.p', 'rb') as fp:
            saved_keywords_list.append(pickle.load(fp))
        #saved_keywords_list.append(np.load('keywords_list_'+c+'.npy'))
        saved_train_fea.append(np.load(output_folder + 'train_fea_' + c + '.npy'))
        saved_test_fea.append(np.load(output_folder + 'test_fea_' + c + '.npy'))
    except IOError:
        # The original except clause is truncated here; minimally, skip
        # classes whose cached features are missing.
        pass
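# For reference, a hedged sketch of how the cached files read above would have
# been produced (variable names hypothetical; fea_extract internals assumed):
#
#   with open(output_folder + 'keywords_list_' + c + '.p', 'wb') as fp:
#       pickle.dump(keywords_list, fp)
#   np.save(output_folder + 'train_fea_' + c + '.npy', train_fea)
#   np.save(output_folder + 'test_fea_' + c + '.npy', test_fea)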
def main():
    import numpy as np
    from import_data import import_csv
    from sklearn.decomposition import PCA
    import time

    data_files_path = 'data_and_scripts/'
    TRAIN_INPUTS_PATH = data_files_path + 'train_inputs.csv'
    TEST_INPUTS_PATH = data_files_path + 'test_inputs.csv'
    TRAIN_INPUTS_SUBSET_PATH = data_files_path + 'train_inputs_subset.csv'

    # get the original inputs
    starttime = time.clock()
    train_inputs = import_csv(TRAIN_INPUTS_PATH)
    test_inputs = import_csv(TEST_INPUTS_PATH)
    print 'Time to import: %0.1f' % (time.clock() - starttime)

    N, K = np.shape(train_inputs)
    T, Ki = np.shape(test_inputs)
    print N, K
    print T, Ki

    # concatenate train and test images so both get the same transformation
    concat = np.concatenate((train_inputs, test_inputs), axis=0)
    print 'concatenated'
    print np.shape(concat)

    # apply transformation
    starttime = time.clock()
    desired = 500  # target feature count; defined here (not only in the
                   # commented PCA block) because makeHeader() below needs it
    transformed_concat = transform_features(concat, desired)
    print 'Time to transform: %0.1f' % (time.clock() - starttime)

    # apply PCA (disabled)
    # starttime = time.clock()
    # print 'Reducing feature set size from %d to %d...' % (K, desired)
    # features = PCA(n_components=desired).fit_transform(transformed_concat)
    # print 'Time to transform: %0.1f' % (time.clock() - starttime)
    features = transformed_concat

    # split back into the train and test halves
    transform_train_inputs, transform_test_inputs = features[:N], features[N:]

    # save to csv file
    starttime = time.clock()
    print 'saving to csv file'
    header = makeHeader(desired)
    # prepend an Id column to each half
    transform_train_inputs = np.concatenate(
        (np.arange(N).reshape(N, 1), transform_train_inputs), axis=1)
    transform_test_inputs = np.concatenate(
        (np.arange(T).reshape(T, 1), transform_test_inputs), axis=1)
    np.savetxt(data_files_path + 'transformed_train_inputs.csv',
               transform_train_inputs, fmt='%f', delimiter=',',
               newline='\n', header=header, comments='')
    np.savetxt(data_files_path + 'transformed_test_inputs.csv',
               transform_test_inputs, fmt='%f', delimiter=',',
               newline='\n', header=header, comments='')
    print 'Time to save: %0.1f' % (time.clock() - starttime)
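# Entry point (a small addition so the script can be run directly):
if __name__ == '__main__':
    main()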
# Author: Kian Kenyon-Dean
# The purpose of this program is to make 12 new images for each image
# after a random rotation of d degrees for each one.

from import_data import import_csv, ORIGINAL_DIRECTORY, images_to_csv
from scipy import misc
import numpy as np

if __name__ == '__main__':
    destination = '../rotated_images/'
    degree_rotations = []

    x, y = import_csv(ORIGINAL_DIRECTORY + '.csv')
    np.random.seed(1917)

    for i in range(len(x)):
        image_matrix = x[i]
        image_class = y[i]

        randomized_degrees = np.random.uniform(0, 360, 12)
        degree_rotations.append(randomized_degrees)

        for d in randomized_degrees:
            new_image = misc.imrotate(image_matrix, d)
            name = 'class-%d-sample-%d-degree-%d-%s.png'\
                % (int(image_class), (i + 1) % 101, int(d), str(d).split('.')[1][:5])
            misc.imsave(destination + name, new_image)

    with open('../random_degree_rotations.csv', 'w') as f:
        f.write(',' * 12)
        # one row of 12 angles per original image (the loop body was
        # truncated in the original; this completion writes each row)
        for arr in degree_rotations:
            f.write('\n' + ','.join(str(d) for d in arr))
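    # Hedged sketch: reading the saved angles back later; row i holds the 12
    # angles applied to image i, and the placeholder header row is skipped.
    # angles = np.genfromtxt('../random_degree_rotations.csv', delimiter=',',
    #                        skip_header=1)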