def main():
    # xtrain, ytrain, xvalid, yvalid, xtest = readData()
    # xtrain, ytrain, xvalid, yvalid, xtest = cleanData(xtrain, ytrain, xvalid, yvalid, xtest)
    xtrain, ytrain, xvalid, yvalid, xtest = rmp.readCleanData()
    # xtrain = xtrain[0:30000]
    # ytrain = ytrain[0:30000]
    xtrain, xvalid, xtest, countVect = rmp.vectorizeWords(xtrain, xvalid, xtest)
    ytest, w, b = resumeLogisticRegression(xtrain, ytrain, xvalid, yvalid, xtest, 1)  # 1000 epochs = 30 min
    ytest = pd.Series(ytest, index=xtest.iloc[:, 1])
    ytest.to_csv("result.csv")
    np.savetxt("w.csv", w)
    np.savetxt("b.csv", b)
    return xtrain, ytrain, xvalid, yvalid, xtest, ytest
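# A minimal sketch (not part of the original pipeline) of how the parameters
# saved above could be reused without retraining. It assumes w.csv holds the
# weight matrix with one column per class and b.csv the bias vector, i.e. the
# usual softmax-regression parameterization p(y|x) = softmax(x.dot(W) + b);
# the function name is illustrative.
def predictFromSavedParams(features):
    import numpy as np
    W = np.loadtxt("w.csv")
    b = np.loadtxt("b.csv")
    scores = np.dot(features, W) + b             # class scores for each row
    scores -= scores.max(axis=1, keepdims=True)  # stabilize the softmax
    probs = np.exp(scores)
    probs /= probs.sum(axis=1, keepdims=True)
    return probs.argmax(axis=1)                  # most likely class index per row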
def loadRMPData(useCleaned=False):
    if useCleaned:
        xtrain, ytrain, xvalid, yvalid, xtest = rmp.readCleanData()
    else:
        xtrain, ytrain, xvalid, yvalid, xtest = rmp.readData()
        xtrain, ytrain, xvalid, yvalid, xtest = rmp.cleanData(xtrain, ytrain, xvalid, yvalid, xtest)
    xtrain, xvalid, xtest, countVect = rmp.vectorizeWords(xtrain, xvalid, xtest)

    # drop identifier columns that are not usable as features
    xtrain = xtrain.drop(["tid", "date", "id", xtrain.columns[0]], axis=1)
    xvalid = xvalid.drop(["tid", "date", "id", xvalid.columns[0]], axis=1)
    xtest = xtest.drop(["tid", "date", "id", xtest.columns[0]], axis=1)

    # test labels are unknown, so use a placeholder vector of zeros
    ytest = np.zeros((len(xtest),), dtype=theano.config.floatX)

    def shareData(dataX, dataY, borrow=True):
        # store the data in Theano shared variables so it can live on the GPU
        sharedX = theano.shared(np.asarray(dataX, dtype=theano.config.floatX), borrow=borrow)
        sharedY = theano.shared(np.asarray(dataY, dtype=theano.config.floatX), borrow=borrow)
        # labels are used as indices, so cast the shared float array to int
        return sharedX, T.cast(sharedY, "int32")

    xtrain, ytrain = shareData(xtrain, ytrain)
    xvalid, yvalid = shareData(xvalid, yvalid)
    xtest, ytest = shareData(xtest, ytest)
    return [(xtrain, ytrain), (xvalid, yvalid), (xtest, ytest)]
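# A small usage sketch of loadRMPData: the returned shared variables are meant
# to be sliced into minibatches inside Theano `givens`, but their current
# values can also be pulled back as numpy arrays. The batch size and function
# name here are illustrative.
def exampleMinibatchSlice(batch_size=600):
    datasets = loadRMPData(useCleaned=True)
    xtrain, ytrain = datasets[0]
    xvalid, yvalid = datasets[1]
    xtest, ytest = datasets[2]
    # copy the first minibatch of training features back onto the CPU
    first_batch = xtrain.get_value(borrow=True)[0:batch_size]
    n_train_batches = xtrain.get_value(borrow=True).shape[0] // batch_size
    return first_batch, n_train_batches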
def main():
    # TODO: normalize columns by z-score!
    # xtrain, ytrain, xvalid, yvalid, xtest = readData()
    # xtrain, ytrain, xvalid, yvalid, xtest = cleanData(xtrain, ytrain, xvalid, yvalid, xtest)
    xtrain, ytrain, xvalid, yvalid, xtest = rmp.readCleanData(keepQuality=True)
    # xtrain = xtrain[0:30000]
    # ytrain = ytrain[0:30000]
    xtrain, xvalid, xtest, countVect = rmp.vectorizeWords(xtrain, xvalid, xtest)

    # pool the training and validation sets and drop the non-feature columns
    xtrain = xtrain.append(xvalid, ignore_index=True)
    xtrain = xtrain.drop(['tid', 'date', 'id', xtrain.columns[0]], axis=1)

    # z-score every column so the correlations below are on a common scale
    xtrain = xtrain.apply(lambda x: (x - np.mean(x)) / np.std(x), axis=0, raw=True)

    helpfulness = xtrain['helpfulness'].get_values()
    clarity = xtrain['clarity'].get_values()[:, 0]
    easiness = xtrain['easiness'].get_values()
    quality = xtrain['quality'].get_values()[:, 0]
    xtrain = xtrain.drop(['helpfulness', 'clarity', 'easiness', 'quality'], axis=1)

    # np.correlate on two equal-length 1D vectors is their dot product; with
    # z-scored columns this is n times the Pearson correlation coefficient
    n = len(xtrain.iloc[0, :])
    helpfulnessCorr = []
    for i in range(n):
        helpfulnessCorr.append(np.correlate(xtrain.iloc[:, i].get_values(), helpfulness)[0])
    clarityCorr = []
    for i in range(n):
        clarityCorr.append(np.correlate(xtrain.iloc[:, i].get_values(), clarity)[0])
    easinessCorr = []
    for i in range(n):
        easinessCorr.append(np.correlate(xtrain.iloc[:, i].get_values(), easiness)[0])
    qualityCorr = []
    for i in range(n):
        qualityCorr.append(np.correlate(xtrain.iloc[:, i].get_values(), quality)[0])

    correlationData = pd.DataFrame({'column': xtrain.columns,
                                    'helpfulnessCorr': helpfulnessCorr,
                                    'clarityCorr': clarityCorr,
                                    'easinessCorr': easinessCorr,
                                    'qualityCorr': qualityCorr},
                                   columns=['column', 'helpfulnessCorr', 'clarityCorr',
                                            'easinessCorr', 'qualityCorr'])
    correlationData.to_csv('correlationData.csv')
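# A short self-contained check of what the correlation loops above compute:
# for two z-scored vectors, np.correlate returns their dot product, which is
# n times the Pearson correlation coefficient. The data below is made up.
def checkCorrelateVsPearson():
    import numpy as np
    rng = np.random.RandomState(0)
    a = rng.rand(1000)
    b = 0.5 * a + rng.rand(1000)
    za = (a - np.mean(a)) / np.std(a)
    zb = (b - np.mean(b)) / np.std(b)
    dot = np.correlate(za, zb)[0]        # what the loops above compute
    pearson = np.corrcoef(a, b)[0, 1]    # reference Pearson correlation
    assert np.allclose(dot / len(a), pearson)
    return dot, pearson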
def sgd_optimization_rmp(learning_rate=0.13, n_epochs=1000, batch_size=600):
    """
    Stochastic gradient descent optimization of a log-linear model, applied
    to the RateMyProfessors data (adapted from the Theano MNIST logistic
    regression tutorial).

    :type learning_rate: float
    :param learning_rate: learning rate used (factor for the stochastic
                          gradient)

    :type n_epochs: int
    :param n_epochs: maximal number of epochs to run the optimizer
    """
    xtrain, ytrain, xtest = rmp.readCleanData()
    # xtrain, ytrain, xtest = rmp.cleanData(xtrain, ytrain, xtest)
    xtrain, xtest, cv = rmp.vectorizeWords(xtrain, xtest)
    xtrain = xtrain.drop(['date', 'id', xtrain.columns[0]], axis=1)
    xtest = xtest.drop(['date', 'id', xtest.columns[0]], axis=1)

    # shuffle the training set before carving out a validation split
    index = list(xtrain.index)
    random.shuffle(index)
    xtrain = xtrain.ix[index]
    ytrain = ytrain.ix[index]
    xtrain = xtrain.reset_index(drop=True)
    ytrain = ytrain.reset_index(drop=True)

    # hold out roughly 1/90th of the training data for validation
    nvalid = int(len(xtrain) / 90)
    xvalid = theano.shared(np.array(xtrain[0:nvalid].values, dtype=np.float32), borrow=True)
    xtrain = theano.shared(np.array(xtrain[nvalid:].values, dtype=np.float32), borrow=True)
    yvalid = theano.shared(np.array(ytrain[0:nvalid].values, dtype=np.int32), borrow=True)
    ytrain = theano.shared(np.array(ytrain[nvalid:].values, dtype=np.int32), borrow=True)
    # test labels are unknown; use a placeholder vector of zeros
    ytest = theano.shared(np.zeros((len(xtest),), dtype=np.int32), borrow=True)
    xtest = theano.shared(np.array(xtest, dtype=np.float32), borrow=True)

    # compute number of minibatches for training, validation and testing
    n_train_batches = xtrain.get_value().shape[0] / batch_size
    n_valid_batches = xvalid.get_value().shape[0] / batch_size
    n_test_batches = xtest.get_value().shape[0] / batch_size
    print n_train_batches
    print n_valid_batches
    print n_test_batches

    ######################
    # BUILD ACTUAL MODEL #
    ######################
    print '... building the model'

    # allocate symbolic variables for the data
    index = T.lscalar()  # index to a [mini]batch

    # generate symbolic variables for input (x and y represent a minibatch)
    x = T.matrix('x')   # feature vectors
    y = T.ivector('y')  # labels, presented as 1D vector of [int] labels

    # construct the logistic regression class; the input size is the number
    # of feature columns and the output size is the number of label classes
    classifier = LogisticRegression(input=x,
                                    n_in=xtrain.get_value(borrow=True).shape[1],
                                    n_out=10)

    # the cost we minimize during training is the negative log likelihood of
    # the model in symbolic format
    cost = classifier.negative_log_likelihood(y)

    # compiling a Theano function that computes the mistakes that are made by
    # the model on a minibatch
    test_model = theano.function(
        inputs=[index],
        outputs=classifier.errors(y),
        givens={
            x: xtest[index * batch_size: (index + 1) * batch_size],
            y: ytest[index * batch_size: (index + 1) * batch_size]
        }
    )

    validate_model = theano.function(
        inputs=[index],
        outputs=classifier.errors(y),
        givens={
            x: xvalid[index * batch_size: (index + 1) * batch_size],
            y: yvalid[index * batch_size: (index + 1) * batch_size]
        }
    )

    # compute the gradient of cost with respect to theta = (W, b)
    g_W = T.grad(cost=cost, wrt=classifier.W)
    g_b = T.grad(cost=cost, wrt=classifier.b)

    # specify how to update the parameters of the model as a list of
    # (variable, update expression) pairs
    updates = [(classifier.W, classifier.W - learning_rate * g_W),
               (classifier.b, classifier.b - learning_rate * g_b)]

    # compiling a Theano function `train_model` that returns the cost and, at
    # the same time, updates the parameters of the model based on the rules
    # defined in `updates`
    train_model = theano.function(
        inputs=[index],
        outputs=cost,
        updates=updates,
        givens={
            x: xtrain[index * batch_size: (index + 1) * batch_size],
            y: ytrain[index * batch_size: (index + 1) * batch_size]
        }
    )

    ###############
    # TRAIN MODEL #
    ###############
    print '... training the model'
    # early-stopping parameters
    patience = 5000  # look at this many examples regardless
    patience_increase = 2  # wait this much longer when a new best is found
    improvement_threshold = 0.995  # a relative improvement of this much is
                                   # considered significant
    validation_frequency = min(n_train_batches, patience / 2)
                                   # go through this many minibatches before
                                   # checking the network on the validation
                                   # set; in this case we check every epoch

    best_validation_loss = np.inf
    test_score = 0.
    start_time = timeit.default_timer()

    done_looping = False
    epoch = 0
    while (epoch < n_epochs) and (not done_looping):
        epoch = epoch + 1
        for minibatch_index in xrange(n_train_batches):

            minibatch_avg_cost = train_model(minibatch_index)
            # iteration number
            iter = (epoch - 1) * n_train_batches + minibatch_index

            if (iter + 1) % validation_frequency == 0:
                # compute zero-one loss on validation set
                validation_losses = [validate_model(i)
                                     for i in xrange(n_valid_batches)]
                this_validation_loss = np.mean(validation_losses)

                print(
                    'epoch %i, minibatch %i/%i, validation error %f %%' %
                    (
                        epoch,
                        minibatch_index + 1,
                        n_train_batches,
                        this_validation_loss * 100.
                    )
                )

                # if we got the best validation score until now
                if this_validation_loss < best_validation_loss:
                    # improve patience if loss improvement is good enough
                    if this_validation_loss < best_validation_loss * \
                            improvement_threshold:
                        patience = max(patience, iter * patience_increase)

                    best_validation_loss = this_validation_loss

                    # test it on the test set
                    test_losses = [test_model(i)
                                   for i in xrange(n_test_batches)]
                    test_score = np.mean(test_losses)

                    print(
                        (
                            '     epoch %i, minibatch %i/%i, test error of'
                            ' best model %f %%'
                        ) %
                        (
                            epoch,
                            minibatch_index + 1,
                            n_train_batches,
                            test_score * 100.
                        )
                    )

                    # save the best model
                    with open('best_model.pkl', 'wb') as f:
                        cPickle.dump(classifier, f)

            if patience <= iter:
                done_looping = True
                break

    end_time = timeit.default_timer()
    print(
        (
            'Optimization complete with best validation score of %f %%,'
            ' with test performance %f %%'
        )
        % (best_validation_loss * 100., test_score * 100.)
    )
    print 'The code ran for %d epochs, with %f epochs/sec' % (
        epoch, 1. * epoch / (end_time - start_time))
    print >> sys.stderr, ('The code for file ' +
                          os.path.split(__file__)[1] +
                          ' ran for %.1fs' % ((end_time - start_time)))
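# A minimal sketch of how the pickled best model could be used to label the
# test set after training. It assumes classifier.W and classifier.b are Theano
# shared variables (they are used that way above) and that best_model.pkl is
# the file written by the training loop; the function name is illustrative.
def predict_from_best_model(xtest_values):
    classifier = cPickle.load(open('best_model.pkl', 'rb'))
    x = T.matrix('x')
    # p(y|x) = softmax(xW + b); the prediction is the most probable class
    p_y_given_x = T.nnet.softmax(T.dot(x, classifier.W) + classifier.b)
    y_pred = T.argmax(p_y_given_x, axis=1)
    predict = theano.function([x], y_pred)
    return predict(np.asarray(xtest_values, dtype=np.float32))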
import basicLogisticRegression
import rmp
import numpy as np
import pandas as pd

# xtrain, ytrain, xvalid, yvalid, xtest = readData()
xtrain, ytrain, xvalid, yvalid, xtest = rmp.readData()
xtrain, ytrain, xvalid, yvalid, xtest = rmp.cleanData(xtrain, ytrain, xvalid, yvalid, xtest,
                                                      keepQuality=True)
# xtrain = xtrain[0:30000]
# ytrain = ytrain[0:30000]
xtrain, xvalid, xtest, countVect = rmp.vectorizeWords(xtrain, xvalid, xtest)

yvalid, ytest, w, b = basicLogisticRegression.resumeLogisticRegression(
    xtrain, ytrain, xvalid, yvalid, xtest, 1)  # 1000 epochs = 30 min
ytest = pd.Series(ytest, index=xtest.iloc[:, 0])
yvalid = pd.Series(yvalid, index=xvalid.iloc[:, 0])

# signed prediction error on the validation set
error = yvalid.get_values() - xvalid.quality.get_values()

# accumulate the total error and review count for each
# (helpfulness, clarity) rating pair (ratings run from 1 to 5)
errorWRTHC = np.zeros((5, 5))
countWRTHC = np.zeros((5, 5))
for i in range(len(xvalid)):
    errorWRTHC[int(xvalid.iloc[i].helpfulness) - 1, int(xvalid.iloc[i].clarity) - 1] += error[i]
    countWRTHC[int(xvalid.iloc[i].helpfulness) - 1, int(xvalid.iloc[i].clarity) - 1] += 1
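# A small follow-up sketch: turn the accumulated totals into the average
# validation error for each (helpfulness, clarity) cell, leaving NaN where a
# rating pair never occurs, and save it for inspection. The output file name
# is illustrative.
with np.errstate(invalid='ignore', divide='ignore'):
    meanErrorWRTHC = errorWRTHC / countWRTHC
pd.DataFrame(meanErrorWRTHC,
             index=['helpfulness=%d' % h for h in range(1, 6)],
             columns=['clarity=%d' % c for c in range(1, 6)]).to_csv('meanErrorWRTHC.csv')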