def load_data(num_records=1750):
    """Load the dataset and return it as Theano train/valid/test splits.

    The records are fetched through ``prep.get_data``, every label is
    shifted up by one, and the data is split twice with fixed seeds:
    first 70/30 into (train + validation) / test, then the first part
    70/30 again into train / validation.

    :param num_records: number of records requested from
        ``prep.get_data``.
    :return: list ``[(train_x, train_y), (valid_x, valid_y),
        (test_x, test_y)]`` where each ``x`` is a Theano shared variable
        of dtype ``floatX`` and each ``y`` is an ``int32`` cast of a
        shared variable.
    """
    print('... loading data ...')

    data, labels = prep.get_data(num_records)
    # In-place shift of every label by one; assumes ``labels`` supports
    # ``+=`` with a scalar (e.g. a NumPy array) -- TODO confirm.
    labels += 1

    # First split: hold out 30% of all records as the test set.
    rest_x, test_x, rest_y, test_y = train_test_split(
        data, labels, test_size=0.3, random_state=28022016
    )
    # Second split: hold out 30% of the remainder as the validation set.
    train_x, valid_x, train_y, valid_y = train_test_split(
        rest_x, rest_y, test_size=0.3, random_state=10032016
    )

    def _as_shared(features, targets, borrow=True):
        """Wrap one (features, targets) pair in Theano shared variables.

        Shared variables let Theano keep the whole dataset in GPU memory
        when running on a GPU, instead of copying each minibatch over
        separately, which would be considerably slower.
        """
        float_type = theano.config.floatX
        shared_features = theano.shared(
            np.asarray(features, dtype=float_type), borrow=borrow
        )
        shared_targets = theano.shared(
            np.asarray(targets, dtype=float_type), borrow=borrow
        )
        # Data stored on the GPU has to be floating point, so the labels
        # are stored as ``floatX`` too. They are used as indices during
        # computation, though, so an ``int32`` view of them is handed
        # back rather than the float shared variable itself.
        return shared_features, T.cast(shared_targets, 'int32')

    test_set = _as_shared(test_x, test_y)
    valid_set = _as_shared(valid_x, valid_y)
    train_set = _as_shared(train_x, train_y)

    return [train_set, valid_set, test_set]
2 tests - against twitter data and against polarity list """
# NOTE(review): the docstring tail above talks about twitter data and a
# polarity list, while the code below loads data through ``img_preprocess``
# and prepares an SVM -- looks like a leftover from another script; confirm
# and update the module docstring.
#
# NOTE(review): ``sklearn.cross_validation``, ``sklearn.grid_search`` and
# ``RandomizedPCA`` were removed in scikit-learn 0.20 (replaced by
# ``sklearn.model_selection`` and ``PCA(svd_solver='randomized')``). Confirm
# the pinned scikit-learn version before running or migrating this script.
from Python_code.classifiers.preprocessing import img_preprocess as prep
from sklearn.decomposition import RandomizedPCA
from sklearn.metrics import accuracy_score, classification_report
from sklearn.cross_validation import train_test_split
import pandas as pd
import time
from sklearn.grid_search import GridSearchCV
from sklearn.svm import SVC

# Switch for the optional PCA step further down.
APPLY_PCA = False

# Override the preprocessing module's SIZE attribute before loading
# (presumably the target image size used by get_data -- TODO confirm).
prep.SIZE = (250, 250)
# presumably the argument is the number of records to load -- verify
# against prep.get_data
data, labels = prep.get_data(1000)

# split test & train sets
# 30% of the samples are held out for testing; the fixed random_state keeps
# the split reproducible between runs.
train_x, test_x, train_y, test_y = train_test_split(
    data, labels, test_size=0.3, random_state=28022016)

if APPLY_PCA:
    # Fit the projection on the training set only and reuse it for the test
    # set, so no test-set information is used when fitting the PCA.
    pca = RandomizedPCA(n_components=100, whiten=False)
    train_x = pca.fit_transform(train_x)
    test_x = pca.transform(test_x)

print('starting svm')
# Wall-clock start; presumably used to time the SVM run in the part of the
# script that follows -- not visible here.
start_time = time.time()
# Hyper-parameter grid: two kernels x two values of C. Presumably handed to
# GridSearchCV together with an SVC (both imported above) -- TODO confirm.
parameters = {'kernel':('linear', 'rbf'), 'C':[1, 10]}