def check_accuracy(self):
    '''Fit thetas on the training set, save them, and print the accuracy
    obtained on the evaluation set.

    The fitted thetas are written to ./data/kaggle/optimized_thetas.csv.
    Nothing is returned; the accuracy is only printed.
    '''
    # Small random starting point handed to the optimizer.
    initial_thetas = 0.005 * np.random.rand(self.LABS, self.N)
    fitted = soft.optimizeThetas(initial_thetas, self.xt, self.gt,
                                 self.LABS, self.L)
    fitted = fitted.reshape(self.LABS, -1)
    np.savetxt('./data/kaggle/optimized_thetas.csv', fitted, delimiter=',')

    # The predicted label of a row is the column holding its largest score.
    predictions = soft.h(fitted, self.xe).argmax(axis=1)

    # A zero difference means the prediction equals the true label, so the
    # non-zero entries are exactly the misclassified examples.
    differences = np.subtract(self.ye.T, predictions)
    misses = float(np.count_nonzero(differences))
    acc = 1 - misses / len(predictions)
    print('%s %s' % ('accuracy:', acc))
def test_model_submit(self):
    '''Train on the training and evaluation sets together and write the
    predictions for the test set to disk.

    Side effects:
      * ./data/kaggle/submit_optimized_thetas.csv receives the fitted
        thetas, reshaped to self.LABS rows.
      * ./data/kaggle/predictions.csv receives one "ImageId,Label" row per
        test image (ImageId is 1-based). The file has no header line; the
        printed reminder tells the user to add it.
      * The first ten prediction rows are printed.

    Nothing is returned.
    '''
    tinit = 0.005 * np.random.rand(self.LABS, self.N)

    # Use every labelled example available: training + evaluation rows.
    x = np.vstack([self.xt, self.xe])
    g = np.vstack([self.gt, self.ge])

    # Find thetas and save them.
    thetas = soft.optimizeThetas(tinit, x, g, self.LABS, self.L)
    thetas = thetas.reshape(self.LABS, -1)
    np.savetxt('./data/kaggle/submit_optimized_thetas.csv', thetas,
               delimiter=',')

    # Compute predictions: column 0 is the 1-based image id, column 1 the
    # label whose score is the largest in that row of h.
    m = self.x_test.shape[0]
    h = np.asarray(soft.h(thetas, self.x_test))
    predictions = np.zeros((m, 2))  # float array, as before, so the
                                    # printed preview keeps its format
    predictions[:, 0] = np.arange(1, m + 1)
    predictions[:, 1] = h.argmax(axis=1)

    print('To submitt add header: ImageId,Label')
    print(predictions[0:10, :])
    np.savetxt('./data/kaggle/predictions.csv', predictions, fmt='%i,%i')
def choose_lambda(self):
    '''Train with different regularization parameters and choose the one
    that minimizes the cost on the evaluation set.

    For every candidate lambda the model is fitted on the training set and
    the cost is computed on both the training and the evaluation set.
    Progress and the best lambda are printed, then both cost curves are
    plotted against lambda. Nothing is returned and self.L is not changed.
    '''
    tinit = 0.005 * np.random.rand(self.LABS, self.N)

    # Candidate lambdas on a linear scale. For a coarse search over several
    # orders of magnitude, use a log-spaced range and plot against
    # np.log10(rango) instead.
    rango = np.array([2, 3, 4, 5, 6, 7, 8, 9, 10])
    Jt, Je = np.array([]), np.array([])
    bestC, bestL = 1e+10, 0.0

    # Cycle through lambdas and keep the one with the lowest evaluation cost.
    for chosen_lambda in rango:
        t = soft.optimizeThetas(tinit, self.xt, self.gt,
                                numLabels=self.LABS, l=chosen_lambda,
                                visual=True)
        cost_t = soft.j(t, self.xt, self.gt, self.LABS, chosen_lambda)
        # NOTE(review): the evaluation cost is computed with chosen_lambda
        # too; if soft.j adds a regularization penalty, costs obtained for
        # different lambdas are not directly comparable -- confirm.
        cost_e = soft.j(t, self.xe, self.ge, self.LABS, chosen_lambda)
        Jt = np.append(Jt, cost_t)
        Je = np.append(Je, cost_e)
        print('chosen_lambda: %s' % (chosen_lambda,))
        if cost_e < bestC:
            bestC = cost_e
            bestL = chosen_lambda
            print('_______________new best is %s with cost_e %s'
                  % (bestL, cost_e))
    print("\n\nthe best lambda is %s" % (bestL,))

    # Plot both cost curves; the legend tells them apart.
    line1 = plt.plot(rango, Jt)
    line2 = plt.plot(rango, Je)
    plt.setp(line1, linewidth=2.0, label='training', color='b',
             solid_joinstyle='round')
    plt.setp(line2, linewidth=2.0, label='evaluation', color='r',
             solid_joinstyle='round')
    plt.xlabel('Lambda')
    plt.ylabel('J')
    plt.legend()
    plt.show()
def __init__(self):
    '''Set the model parameters and load the Kaggle CSV files.

    Reads ./data/kaggle/train.csv (first column: label, remaining columns:
    features) and ./data/kaggle/test.csv (features only). The first 30000
    training rows become the training set (xt, yt, gt); the remaining rows
    become the evaluation set (xe, ye, ge).
    '''
    # General parameters.
    self.L = 7       # lambda, regularization parameter
    self.LABS = 10   # N. of possible values of Y (labels)
    self.N = 784     # N. of features per image (28x28)

    # Load data files.
    training_data = pd.read_csv('./data/kaggle/train.csv', header=0)
    testing_data = pd.read_csv('./data/kaggle/test.csv', header=0)

    # Positional selection with .iloc; the former .ix indexer is deprecated
    # and no longer exists in current pandas releases.
    x = np.array(training_data.iloc[:, 1:]).astype('float64')
    y = np.atleast_2d(training_data.iloc[:, 0]).T  # labels as a column

    n_train = 30000  # rows used for training; the rest is for evaluation

    # Training set.
    self.xt = x[0:n_train, :]
    self.yt = y[0:n_train, :]
    self.gt = soft.groundTruth(self.yt, self.LABS)

    # Evaluation set.
    self.xe = x[n_train:, :]
    self.ye = y[n_train:, :]
    self.ge = soft.groundTruth(self.ye, self.LABS)

    # Testing set (no label column).
    self.x_test = np.array(testing_data.iloc[:, :]).astype('float64')
def learning_curves(self):
    '''Plot training and evaluation cost against the training-set size.

    For each size in a fixed list (3000 .. 30000 in steps of 3000) the
    model is fitted on the first `size` training rows with lambda self.L.
    The cost is then computed on those same rows and on the whole
    evaluation set. Nothing is returned; the curves are shown with
    matplotlib.
    '''
    tinit = 0.005 * np.random.rand(self.LABS, self.N)
    sample = np.array([3, 6, 9, 12, 15, 18, 21, 24, 27, 30]) * 1000
    Jt, Je = np.array([]), np.array([])

    for size in sample:
        # Slices past the end of the arrays are clamped, so sizes larger
        # than the training set silently reuse the full set.
        x_part = self.xt[0:size, :]
        g_part = self.gt[0:size, :]
        my_t = soft.optimizeThetas(tinit, x_part, g_part,
                                   numLabels=self.LABS, l=self.L,
                                   visual=False)
        Jt = np.append(Jt, soft.j(my_t, x_part, g_part, self.LABS, self.L))
        Je = np.append(Je, soft.j(my_t, self.xe, self.ge, self.LABS, self.L))

    # Plot (m, Jtr) and (m, Jcv); the legend tells the curves apart.
    line1 = plt.plot(sample, Jt)
    line2 = plt.plot(sample, Je)
    plt.setp(line1, linewidth=2.0, label='training', color='b',
             solid_joinstyle='round')
    plt.setp(line2, linewidth=2.0, label='evaluation', color='r',
             solid_joinstyle='round')
    plt.xlabel('Number of Examples')
    plt.ylabel('Cost / Error')
    plt.legend()
    plt.show()
def check_accuracy(self):
    '''Print the accuracy of the two-step classifier on the evaluation set.

    Step one scores every evaluation row with the thetas stored in
    ./data/kaggle/optimized_thetas.csv. When the top score is below 0.99
    and the second score is above 0.01, step two asks a pairwise logistic
    model (obtained from self.optimize_logit_for and cached per label
    pair) to decide between the two best labels. Otherwise the top label
    is used directly. Nothing is returned.
    '''
    pair_models = {}  # cache: label-pair key -> logistic parameters
    stored = pd.read_csv('./data/kaggle/optimized_thetas.csv', header=None)
    soft_thetas = np.array(stored)
    soft_thetas = soft_thetas.reshape(self.LABS, -1)
    h = soft.h(soft_thetas, self.xe)
    m = h.shape[0]

    misses = 0.00
    count = 0.0
    for i in range(m):
        scores = h[i, :]
        # argsort is ascending, so the last two entries are the runner-up
        # and the best label, in that order.
        runner_up, best = scores.argsort()[-2:]
        p1, p2 = scores[best], scores[runner_up]

        # The pair key always lists the smaller label first.
        if best > runner_up:
            low, high = runner_up, best
        else:
            low, high = best, runner_up
        # repr() is the function form of the backtick syntax.
        # NOTE(review): on NumPy >= 2 the repr of a NumPy scalar includes
        # its type name -- confirm the key format optimize_logit_for
        # expects before porting.
        s = repr(low) + repr(high)

        if p1 < 0.99 and p2 > 0.01:
            if s not in pair_models:
                count += 1
                pair_models[s] = self.optimize_logit_for(s)
            logix = np.hstack([1, self.xe[i, :]])  # prepend the bias term
            p = logit.h(pair_models[s], logix)
            # p above 0.5 selects the smaller label of the pair.
            prediction = low if p > 0.5 else high
        else:
            prediction = best

        if prediction != self.ye[i, 0]:
            misses += 1.0

    print('%s %s' % ('misses', misses))
    print('%s %s' % ('logit thetas searched', count))
    acc = 1 - misses / m
    print('%s %s' % ('accuracy:', acc))
def test_model_submit(self):
    '''Write two-step predictions for the test set to disk.

    Step one scores every test row with the thetas stored in
    ./data/kaggle/submit_optimized_thetas.csv. When the top score is below
    0.99 and the second score is above 0.01, step two asks a pairwise
    logistic model (obtained from self.optimize_logit_for and cached per
    label pair) to decide between the two best labels. Otherwise the top
    label is used directly.

    The result is saved to ./data/kaggle/predictions_2steps.csv as
    "ImageId,Label" rows (1-based ids, no header line) and the first ten
    rows are printed. Nothing is returned.
    '''
    pair_models = {}  # cache: label-pair key -> logistic parameters
    stored = pd.read_csv('./data/kaggle/submit_optimized_thetas.csv',
                         header=None)
    soft_thetas = np.array(stored)
    soft_thetas = soft_thetas.reshape(self.LABS, -1)

    m, _ = self.x_test.shape
    h = soft.h(soft_thetas, self.x_test)
    predictions = np.zeros((m, 2))
    for i in range(m):
        scores = h[i, :]
        # argsort is ascending, so the last two entries are the runner-up
        # and the best label, in that order.
        runner_up, best = scores.argsort()[-2:]
        p1, p2 = scores[best], scores[runner_up]

        # The pair key always lists the smaller label first.
        if best > runner_up:
            low, high = runner_up, best
        else:
            low, high = best, runner_up
        # repr() is the function form of the backtick syntax.
        # NOTE(review): on NumPy >= 2 the repr of a NumPy scalar includes
        # its type name -- confirm the key format optimize_logit_for
        # expects before porting.
        s = repr(low) + repr(high)

        if p1 < 0.99 and p2 > 0.01:
            if s not in pair_models:
                pair_models[s] = self.optimize_logit_for(s)
            logix = np.hstack([1, self.x_test[i, :]])  # prepend the bias term
            p = logit.h(pair_models[s], logix)
            # p above 0.5 selects the smaller label of the pair.
            label = low if p > 0.5 else high
        else:
            label = best
        predictions[i, :] = [i + 1, label]

    print('To submitt add header: ImageId,Label')
    print(predictions[0:10, :])
    np.savetxt('./data/kaggle/predictions_2steps.csv', predictions,
               fmt='%i,%i')