def fit(self, X, Y, learning_rate=10e-8, reg=10e-12, epochs=10000, show_fig=False):
    X, Y = shuffle(X, Y)
    Xvalid, Yvalid = X[-1000:], Y[-1000:]
    Tvalid = y2indicator(Yvalid)
    X, Y = X[:-1000], Y[:-1000]

    N, D = X.shape
    K = len(set(Y))
    T = y2indicator(Y)
    self.W = np.random.randn(D, K) / np.sqrt(D + K)
    self.b = np.zeros(K)

    costs = []
    best_validation_error = 1
    for i in range(epochs):
        pY = self.forward(X)

        # gradient descent
        self.W -= learning_rate * (X.T.dot(pY - T) + reg*self.W)
        self.b -= learning_rate * ((pY - T).sum(axis=0) + reg*self.b)

        if i % 10 == 0:
            pYvalid = self.forward(Xvalid)
            c = cost(Tvalid, pYvalid)
            costs.append(c)
            e = error_rate(Yvalid, np.argmax(pYvalid, axis=1))
            print(f"i: {i}, cost: {c}, error: {e}")
            if e < best_validation_error:
                best_validation_error = e
    print(best_validation_error)

    if show_fig:
        plt.plot(costs)
        plt.show()
def fit(self, X, Y, learning_rate=1e-7, reg=0., epochs=10000, show_fig=False):
    X, Y = shuffle(X, Y)
    Xvalid, Yvalid = X[-1000:], Y[-1000:]
    Tvalid = y2indicator(Yvalid)
    X, Y = X[:-1000], Y[:-1000]

    N, D = X.shape
    K = len(set(Y))
    T = y2indicator(Y)
    self.W = np.random.randn(D, K) / np.sqrt(D)
    self.b = np.zeros(K)

    costs = []
    best_validation_error = 1
    for i in range(epochs):
        # forward propagation and cost calculation
        pY = self.forward(X)

        # gradient descent step
        self.W -= learning_rate*(X.T.dot(pY - T) + reg*self.W)
        self.b -= learning_rate*((pY - T).sum(axis=0) + reg*self.b)

        if i % 10 == 0:
            pYvalid = self.forward(Xvalid)
            c = cost(Tvalid, pYvalid)
            costs.append(c)
            e = error_rate(Yvalid, np.argmax(pYvalid, axis=1))
            print("i:", i, "cost:", c, "error:", e)
            if e < best_validation_error:
                best_validation_error = e
    print("best_validation_error:", best_validation_error)

    if show_fig:
        plt.plot(costs)
        plt.show()
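# The fit() methods in this section lean on a handful of helpers (y2indicator, cost,
# cost2, error_rate) from a shared util module that is not reproduced here. Below is a
# minimal sketch of what those helpers are assumed to look like, inferred from the call
# sites -- the actual util file may differ, and some scripts use slightly different
# argument orders (noted where they come up).
import numpy as np

def y2indicator(y):
    # turn integer labels of shape (N,) into a one-hot indicator matrix of shape (N, K)
    N = len(y)
    K = len(set(y))
    ind = np.zeros((N, K))
    ind[np.arange(N), y] = 1
    return ind

def cost(T, pY):
    # total cross-entropy between indicator targets T and predicted probabilities pY
    return -(T * np.log(pY)).sum()

def cost2(Y, pY):
    # same cross-entropy, but indexed by integer labels so no indicator matrix is needed
    N = len(Y)
    return -np.log(pY[np.arange(N), Y]).sum()

def error_rate(targets, predictions):
    # fraction of misclassified samples, given integer labels and integer predictions
    return np.mean(targets != predictions)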
def main():
    X, Y = get_normalized_data()

    max_iter = 20
    print_period = 10
    lr = 0.00004
    reg = 0.01

    Xtrain = X[:-1000,]
    Ytrain = Y[:-1000]
    Xtest = X[-1000:,]
    Ytest = Y[-1000:]
    Ytrain_ind = y2indicator(Ytrain)
    Ytest_ind = y2indicator(Ytest)

    N, D = Xtrain.shape
    batch_sz = 500
    n_batches = N // batch_sz  # integer division so it can be used as a loop bound

    M1 = 300
    M2 = 100
    K = 10
    W1_init = np.random.randn(D, M1) / 28
    b1_init = np.zeros(M1)  # first hidden layer has M1 units (M was undefined here)
    W2_init = np.random.randn(M1, M2) / np.sqrt(M1)
    b2_init = np.zeros(M2)  # second hidden layer has M2 units, not K
    W3_init = np.random.randn(M2, K) / np.sqrt(M2)  # K, not the undefined lowercase k
    b3_init = np.zeros(K)
def fit(self, X, Y, learning_rate=10e-7, reg=10e-7, epoch=10000, show_fig=False):
    # divide into train and test data
    Xtest, Ytest, Xtrain, Ytrain = self.prepare_data(X, Y, multi=True)
    Ttrain = y2indicator(Ytrain)
    Ttest = y2indicator(Ytest)

    costs = []
    best_validation_error = 1
    for i in range(epoch):
        pY, Z = self.forward_multi(Xtrain)  # forward prop

        # back prop
        pY_Y = pY - Ttrain
        self.W2 -= learning_rate * (Z.T.dot(pY_Y) + reg * self.W2)
        self.b2 -= learning_rate * (pY_Y.sum(axis=0) + reg * self.b2)
        # dZ = np.outer(pY_Y, self.W2) * (Z > 0)  # Z > 0 is the derivative of ReLU
        # print(pY_Y.shape, self.W2.shape, Z.shape)
        dZ = pY_Y.dot(self.W2.T) * (1 - Z * Z)  # tanh derivative
        self.W1 -= learning_rate * (Xtrain.T.dot(dZ) + reg * self.W1)
        self.b1 -= learning_rate * (np.sum(dZ, axis=0) + reg * self.b1)

        if i % 10 == 0:
            pYtest, _ = self.forward_multi(Xtest)
            c = cost(Ttest, pYtest)
            costs.append(c)
            e = error_rate(Ytest, np.argmax(pYtest, axis=1))
            print("i:", i, "cost:", c, "error:", e)
            if e < best_validation_error:
                best_validation_error = e
    print("best validation error:", best_validation_error)
    self.show_fig_cost(costs, show_fig)
def __init__(self, Xtrain, Ytrain, Xtest, Ytest):
    print("Initialising NN...")
    self.Xtrain = Xtrain
    self.Ytrain = Ytrain
    self.Xtest = Xtest
    self.Ytest = Ytest
    self.Ytest_ind = y2indicator(self.Ytest)
    self.Ytrain_ind = y2indicator(self.Ytrain)
def main():
    Xtrain, Xtest, Ytrain, Ytest = get_normalized_data()
    Ytrain_ind = y2indicator(Ytrain)
    Ytest_ind = y2indicator(Ytest)

    lr = 0.00004
    reg = 0.01
    N, D = Xtrain.shape
    K = 10
    max_iter = 1000
    batch_sz = 500
    n_batches = N // batch_sz
    print_period = 10

    W_init = np.random.randn(D, K)
    b_init = np.random.randn(K)

    X = tf.placeholder(tf.float32, shape=(None, D), name='X')
    T = tf.placeholder(tf.float32, shape=(None, K), name='T')
    W = tf.Variable(W_init.astype(np.float32))
    b = tf.Variable(b_init.astype(np.float32))

    Yish = tf.matmul(X, W) + b
    cost = tf.reduce_sum(tf.nn.softmax_cross_entropy_with_logits(logits=Yish, labels=T))

    # train_op = tf.train.GradientDescentOptimizer(lr).minimize(cost)
    train_op = tf.train.RMSPropOptimizer(lr, decay=0.99, momentum=0.9).minimize(cost)
    predict_op = tf.argmax(Yish, 1)

    LL = []
    init = tf.global_variables_initializer()  # initialize_all_variables() is gone from later TF 1.x
    with tf.Session() as session:
        session.run(init)

        for i in range(max_iter):
            for j in range(n_batches):
                Xbatch = Xtrain[j*batch_sz:(j*batch_sz + batch_sz),]
                Ybatch = Ytrain_ind[j*batch_sz:(j*batch_sz + batch_sz),]

                session.run(train_op, feed_dict={X: Xbatch, T: Ybatch})
                if j % print_period == 0:
                    test_cost = session.run(cost, feed_dict={X: Xtest, T: Ytest_ind})
                    prediction = session.run(predict_op, feed_dict={X: Xtest})
                    err = error_rate(prediction, Ytest)
                    print("Cost / err at iteration i=%d, j=%d: %.3f / %.3f" % (i, j, test_cost, err))
                    LL.append(test_cost)

    plt.plot(LL)
    plt.show()
def benchmark_full():
    Xtrain, Xtest, Ytrain, Ytest = get_normalized_data()

    print("Performing logistic regression...")
    # lr = LogisticRegression(solver='lbfgs')

    # convert Ytrain and Ytest to (N x K) matrices of indicator variables
    N, D = Xtrain.shape
    Ytrain_ind = y2indicator(Ytrain)
    Ytest_ind = y2indicator(Ytest)

    W = np.random.randn(D, 10) / np.sqrt(D)
    b = np.zeros(10)
    LL = []
    LLtest = []
    CRtest = []

    # reg = 1
    # learning rate 0.0001 is too high, 0.00005 is also too high
    # 0.00003 / 2000 iterations => 0.363 error, -7630 cost
    # 0.00004 / 1000 iterations => 0.295 error, -7902 cost
    # 0.00004 / 2000 iterations => 0.321 error, -7528 cost
    # reg = 0.1, still around 0.31 error
    # reg = 0.01, still around 0.31 error
    lr = 0.00004
    reg = 0.01
    for i in range(500):
        p_y = forward(Xtrain, W, b)
        # print("p_y:", p_y)
        ll = cost(p_y, Ytrain_ind)
        LL.append(ll)

        p_y_test = forward(Xtest, W, b)
        lltest = cost(p_y_test, Ytest_ind)
        LLtest.append(lltest)

        err = error_rate(p_y_test, Ytest)
        CRtest.append(err)

        W += lr * (gradW(Ytrain_ind, p_y, Xtrain) - reg * W)
        b += lr * (gradb(Ytrain_ind, p_y) - reg * b)
        if i % 10 == 0:
            print("Cost at iteration %d: %.6f" % (i, ll))
            print("Error rate:", err)

    p_y = forward(Xtest, W, b)
    print("Final error rate:", error_rate(p_y, Ytest))
    iters = range(len(LL))
    plt.plot(iters, LL, iters, LLtest)
    plt.show()
    plt.plot(CRtest)
    plt.show()
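# benchmark_full() relies on module-level helpers (forward, gradW, gradb, plus this
# script's own cost and error_rate) defined elsewhere in the same file. A minimal
# sketch of the first three, inferred from the call sites above -- names and argument
# order are assumptions, not copied from the original file.
def forward(X, W, b):
    # softmax output of a linear model
    a = X.dot(W) + b
    expa = np.exp(a - a.max(axis=1, keepdims=True))
    return expa / expa.sum(axis=1, keepdims=True)

def gradW(T, Y, X):
    # gradient of the log-likelihood w.r.t. W (ascent direction, hence W += lr * ... above)
    return X.T.dot(T - Y)

def gradb(T, Y):
    return (T - Y).sum(axis=0)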
def sgd_batch():
    """ use util functions to run the logistic classification with bp """
    X_train, Y_train, X_test, Y_test = get_transformed_digit()

    N, D = X_train.shape
    yindi_train = y2indicator(Y_train)
    yindi_test = y2indicator(Y_test)
    M = yindi_test.shape[1]

    W = np.random.rand(D, M)
    b = np.random.rand(M)
    cost_train = []
    cost_test = []
    error_test = []
    eta = 1e-4
    penalty = 1e-2
    batch_size = 500
    batch_num = N // batch_size

    # batch
    for i in range(500):
        X_shuffle, Y_train_shuffle = shuffle(X_train, yindi_train)
        for ii in range(int(batch_num)):
            # x_tem = X_shuffle[ii].reshape(1, D)
            # y_tem = Y_train_shuffle[ii].reshape(1, 10)
            # slice by the batch index ii; using the epoch index i here would walk off the data
            x_tem = X_shuffle[int(ii*batch_size):int((ii+1)*batch_size)]
            y_tem = Y_train_shuffle[int(ii*batch_size):int((ii+1)*batch_size)]

            y_fit = forward(x=x_tem, w=W, b=b)
            W += eta*(deri_w(t_matrix=y_tem, y_matrix=y_fit, x=x_tem) - penalty*W)
            b += eta*(deri_b(t_matrix=y_tem, y_matrix=y_fit) - penalty*b)

            p_y_test = forward(x=X_test, w=W, b=b)
            cost_test_tem = cost(y_matrix=p_y_test, t_matrix=yindi_test)
            cost_test.append(cost_test_tem)

            if ii % 100 == 0:
                error_tem = error_rate(y_matrix=p_y_test, target=Y_test)
                print("the error rate in "+str(ii)+" iteration is :"+str(error_tem))

    p_y_final = forward(x=X_test, w=W, b=b)
    error_final = error_rate(y_matrix=p_y_final, target=Y_test)
    print("the final error rate is "+str(error_final))
def get_data():
    print('loading data ...')
    data_train = []
    targets_train = []
    data_test = []
    targets_test = []

    with open('../large_files/r8-train-all-terms.txt', encoding='utf-8') as f1:
        for line in f1:
            values = line.split('\t')
            data_train.append(values[1])
            targets_train.append(values[0])

    with open('../large_files/r8-test-all-terms.txt', encoding='utf-8') as f2:
        for line in f2:
            values = line.split('\t')
            data_test.append(values[1])
            targets_test.append(values[0])

    # one-hot encode targets
    # how do i know it's assigning the same labels to each?
    Ytrain_labels = LabelEncoder().fit_transform(targets_train)
    Ytest_labels = LabelEncoder().fit_transform(targets_test)
    Ytrain = y2indicator(Ytrain_labels)
    Ytest = y2indicator(Ytest_labels)
    print('Ytrain: ', Ytrain.shape)
    print('Ytest: ', Ytest.shape)

    # possible shape problem if K test != K train
    if Ytrain.shape[1] != Ytest.shape[1]:
        raise ValueError('A very specific bad thing happened.')

    # get an average word vector for the data
    def avgwords(data):
        tot = []
        for article in data:
            totalwordvecs = []
            for word in article.split():
                if word in word2vec:
                    wvec = word2vec[word]
                    totalwordvecs.append(wvec)
                else:
                    # if word not vectorized, return all zeros
                    totalwordvecs.append(np.zeros(int(d)))
            totalwordvecs = np.array(totalwordvecs)
            avgword = np.mean(totalwordvecs, axis=0)
            tot.append(avgword.tolist())
        return np.array(tot)

    Xtrain = avgwords(data_train)
    Xtest = avgwords(data_test)
    print('Xtrain: ', Xtrain.shape)
    print('Xtest: ', Xtest.shape)
    return Xtrain, Xtest, Ytrain, Ytest, Ytrain_labels, Ytest_labels
def fit(self, X, Y, Xvalid, Yvalid, learning_rate=1e-6, reg=1e-6, epochs=10000, show_fig=False):
    Tvalid = y2indicator(Yvalid)

    N, D = X.shape
    K = len(set(Y) | set(Yvalid))
    T = y2indicator(Y)
    self.W1 = np.random.randn(D, self.M) / np.sqrt(D)
    self.b1 = np.zeros(self.M)
    self.W2 = np.random.randn(self.M, K) / np.sqrt(self.M)
    self.b2 = np.zeros(K)

    costs = []
    best_validation_error = 1
    for i in range(epochs):
        # forward propagation and cost calculation
        pY, Z = self.forward(X)

        # Gradient Descent step
        # This dataset is where L2 regularization is introduced for the first time.
        # Note that the L2 term lives inside the loss function itself; after
        # differentiating the whole loss, ||W|| shows up as a first-order (linear)
        # term, as in the update expressions below.
        pY_T = pY - T  # cache this in a variable so the later computations are faster
        self.W2 -= learning_rate * (Z.T.dot(pY_T) + reg * self.W2)
        self.b2 -= learning_rate * (pY_T.sum(axis=0) + reg * self.b2)
        # dZ = pY_T.dot(self.W2.T) * (Z > 0)  # relu
        dZ = pY_T.dot(self.W2.T) * (1 - Z * Z)  # tanh
        self.W1 -= learning_rate * (X.T.dot(dZ) + reg * self.W1)
        self.b1 -= learning_rate * (dZ.sum(axis=0) + reg * self.b1)

        if i % 20 == 0:
            pYvalid, _ = self.forward(Xvalid)
            c = cost2(Yvalid, pYvalid)
            # c = cost(Tvalid, pYvalid)
            costs.append(c)
            e = error_rate(Yvalid, np.argmax(pYvalid, axis=1))
            print("i:", i, "cost:", c, "error:", e)
            if e < best_validation_error:
                best_validation_error = e
    print('best_validation_error:', best_validation_error)

    if show_fig:
        plt.plot(costs)
        plt.show()
def fit(self, X, Y, learning_rate=10e-06, reg=10e-1, epochs=10000, show_fig=False):
    X, Y = shuffle(X, Y)
    Xvalid, Yvalid = X[-1000:], Y[-1000:]
    Tvalid = y2indicator(Yvalid)
    X, Y = X[:-1000], Y[:-1000]

    N, D = X.shape
    K = len(set(Y))
    T = y2indicator(Y)
    self.W1 = np.random.randn(D, self.M) / np.sqrt(D + self.M)
    self.b1 = np.zeros(self.M)
    self.W2 = np.random.randn(self.M, K) / np.sqrt(self.M + K)
    self.b2 = np.zeros(K)

    costs = []
    best_validation_error = 1
    for i in range(epochs):
        # forward propagation
        pY, Z = self.forward(X)

        # gradient descent step
        pY_T = pY - T
        self.W2 -= learning_rate * (Z.T.dot(pY_T) + reg * self.W2)
        # sum over the sample axis so the bias gradient has shape (K,)
        self.b2 -= learning_rate * (pY_T.sum(axis=0) + reg * self.b2)
        # dZ = pY_T.dot(self.W2.T) * (Z > 0)  # relu
        dZ = pY_T.dot(self.W2.T) * (1 - Z * Z)  # tanh
        self.W1 -= learning_rate * (X.T.dot(dZ) + reg * self.W1)
        self.b1 -= learning_rate * (np.sum(dZ, axis=0) + reg * self.b1)

        if i % 10 == 0:
            pYvalid, _ = self.forward(Xvalid)
            c = cost(Tvalid, pYvalid)
            costs.append(c)
            e = error_rate(Yvalid, np.argmax(pYvalid, axis=1))
            print("i: ", i, "cost: ", c, "error: ", e)
            if e < best_validation_error:
                best_validation_error = e
    print("best validation error = ", best_validation_error)

    if show_fig:
        plt.plot(costs)
        plt.show()
def fit(self, X, T, learning_rate=10e-8, reg=10e-12, epochs=10000, show_fig=False):
    X, T = shuffle(X, T)
    X_train, T_train = X[:-1000], T[:-1000]
    X_valid, T_valid = X[-1000:], T[-1000:]

    N, D = X_train.shape
    K = len(set(T_train))
    T_train_ind = y2indicator(T_train)

    # initialize parameters: W's scale should not depend on the number of parameters
    self.W = np.random.randn(D, K) / np.sqrt(D + K)
    self.b = np.zeros(K)

    costs = []
    best_validation_error = 1
    for n in range(epochs):
        # forward propagation
        Y = self.forwardprop(X_train)

        # gradient descent
        Y_T = Y - T_train_ind
        self.W -= learning_rate * (X_train.T.dot(Y_T) + reg * self.W)
        self.b -= learning_rate * (Y_T.sum(axis=0) + reg * self.b)

        # presentation
        if n % 10 == 0:
            Y_valid = self.forwardprop(X_valid)
            T_valid_ind = y2indicator(T_valid)
            c = cost(T_valid_ind, Y_valid)
            costs.append(c)
            er = error_rate(T_valid, self.predict(X_valid))
            print(n, 'cost', c, 'error', er)
            if er < best_validation_error:
                best_validation_error = er
    print('Best validation error', best_validation_error)

    if show_fig:
        plt.plot(costs)
        plt.title('cross entropy loss')
        plt.show()
def main():
    max_iter = 20  # make it 30 for sigmoid
    print_period = 10

    X, Y = get_normalized_data()
    lr = 0.00004
    reg = 0.01

    Xtrain = X[:-1000,]
    Ytrain = Y[:-1000]
    Xtest = X[-1000:,]
    Ytest = Y[-1000:]
    Ytrain_ind = y2indicator(Ytrain)
    Ytest_ind = y2indicator(Ytest)

    N, D = Xtrain.shape
    batch_sz = 500
def fit(self, X, Y, learning_rate=10e-6, regularisation=10e-1, epochs=10000, show_fig=False):
    X, Y = shuffle(X, Y)
    # print("X.shape" + str(X.shape))
    # print("Y.shape" + str(Y.shape))
    Xvalid, Yvalid = X[-1000:], Y[-1000:]
    # Tvalid = y2indicator(Yvalid)  # we don't need Tvalid because we are using cost2
    X, Y = X[:-1000], Y[:-1000]
    # print("X.shape" + str(X.shape))
    # print("Y.shape" + str(Y.shape))

    N, D = X.shape
    K = len(set(Y))
    T = y2indicator(Y)  # need this for gradient descent
    self.W1, self.b1 = init_weight_and_bias(D, self.M)
    self.W2, self.b2 = init_weight_and_bias(self.M, K)

    costs = []
    best_validation_error = 1
    for i in range(epochs):
        # forward propagation
        pY, Z = self.forward(X)

        # gradient descent
        pY_T = pY - T
        self.W2 -= learning_rate * (Z.T.dot(pY_T) + regularisation * self.W2)
        self.b2 -= learning_rate * (pY_T.sum(axis=0) + regularisation * self.b2)
        # dZ = pY_T.dot(self.W2.T) * (Z > 0)  # relu
        dZ = pY_T.dot(self.W2.T) * (1 - Z * Z)  # tanh
        self.W1 -= learning_rate * (X.T.dot(dZ) + regularisation * self.W1)
        self.b1 -= learning_rate * (dZ.sum(axis=0) + regularisation * self.b1)

        if i % 10 == 0:
            pYvalid, _ = self.forward(Xvalid)
            c = cost2(Yvalid, pYvalid)
            costs.append(c)
            e = error_rate(Yvalid, np.argmax(pYvalid, axis=1))
            print("i : " + str(i) + "; Cost : " + str(c) + "; Error : " + str(e))
            if e < best_validation_error:
                best_validation_error = e
    print("Best Validation error : " + str(best_validation_error))

    if show_fig:
        plt.plot(costs)
        plt.show()
def main():
    # step 1: get the data and define all the usual variables
    X, Y = get_normalized_data()

    max_iter = 15
    print_period = 10
    lr = 0.00004
    reg = 0.01

    Xtrain = X[:-1000,]
    Ytrain = Y[:-1000]
    Xtest = X[-1000:,]
    Ytest = Y[-1000:]
    Ytrain_ind = y2indicator(Ytrain)
    Ytest_ind = y2indicator(Ytest)

    N, D = Xtrain.shape
    batch_sz = 500
def fit(self, X, Y, learning_rate=10e-8, regularisation=10e-12, epochs=10000, show_fig=False):
    X, Y = shuffle(X, Y)
    # print("X.shape" + str(X.shape))
    # print("Y.shape" + str(Y.shape))
    Xvalid, Yvalid = X[-1000:], Y[-1000:]
    Tvalid = y2indicator(Yvalid)
    X, Y = X[:-1000], Y[:-1000]
    # print("X.shape" + str(X.shape))
    # print("Y.shape" + str(Y.shape))

    N, D = X.shape
    K = len(set(Y))
    T = y2indicator(Y)
    self.W, self.b = init_weight_and_bias(D, K)

    costs = []
    best_validation_error = 1
    for i in range(epochs):
        # forward propagation
        pY = self.forward(X)

        # gradient descent
        self.W -= learning_rate*(X.T.dot(pY - T) + regularisation*self.W)
        self.b -= learning_rate*((pY - T).sum(axis=0) + regularisation*self.b)

        if i % 10 == 0:
            pYvalid = self.forward(Xvalid)
            c = cost(Tvalid, pYvalid)
            costs.append(c)
            e = error_rate(Yvalid, np.argmax(pYvalid, axis=1))
            print("i : " + str(i) + "; Cost : " + str(c) + "; Error : " + str(e))
            if e < best_validation_error:
                best_validation_error = e
    print("Best Validation error : " + str(best_validation_error))

    if show_fig:
        plt.plot(costs)
        plt.show()
def fit(X, Y, show_fig=False):
    K = len(set(Y))

    # make a validation set
    X, Y = shuffle(X, Y)
    X = X.astype(np.float32)
    Y = y2indicator(Y).astype(np.float32)
    Xvalid, Yvalid = X[-1000:], Y[-1000:]
    Yvalid_flat = np.argmax(Yvalid, axis=1)  # for calculating error rate
    X, Y = X[:-1000], Y[:-1000]

    N, D = X.shape
    tfX = tf.placeholder(tf.float32, shape=(None, D), name='X')
    tfT = tf.placeholder(tf.float32, shape=(None, K), name='T')

    prediction = neural_network(D, K, tfX)
    cost = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logits=prediction, labels=tfT))
    train_op = tf.train.RMSPropOptimizer(learning_rate, decay=decay, momentum=mu).minimize(cost)

    n_batches = N // batch_sz
    epoch_loss = 0
    costs = []
    init = tf.global_variables_initializer()
    with tf.Session() as session:
        session.run(init)

        for i in range(epochs):
            X, Y = shuffle(X, Y)
            for j in range(n_batches):
                Xbatch = X[j * batch_sz:(j * batch_sz + batch_sz)]
                Ybatch = Y[j * batch_sz:(j * batch_sz + batch_sz)]

                session.run(train_op, feed_dict={tfX: Xbatch, tfT: Ybatch})
                if j % 20 == 0:
                    # evaluate the cost only; re-running train_op here would take an extra
                    # gradient step and would append a [None, cost] pair to costs
                    c = session.run(cost, feed_dict={tfX: Xbatch, tfT: Ybatch})
                    costs.append(c)
                    p = session.run(tf.argmax(prediction, 1), feed_dict={tfX: Xvalid, tfT: Yvalid})
                    e = error_rate(Yvalid_flat, p)
                    print("i:", i, "j:", j, "nb:", n_batches, "cost:", c, "error rate:", e)

    if show_fig:
        plt.plot(costs)
        plt.show()
def main():
    # compare 3 scenarios:
    # 1. batch SGD
    # 2. batch SGD with momentum
    # 3. batch SGD with Nesterov momentum
    max_iter = 20  # make it 30 for sigmoid
    print_period = 10

    X, Y = get_normalized_data()
    lr = 0.00004
    reg = 0.01

    Xtrain = X[:-1000,]
    Ytrain = Y[:-1000]
    Xtest = X[-1000:,]
    Ytest = Y[-1000:]
    Ytrain_ind = y2indicator(Ytrain)
    Ytest_ind = y2indicator(Ytest)

    N, D = Xtrain.shape
    batch_sz = 500
def fit(self, X, T, learning_rate=10e-7, reg=10e-7, epochs=10000, show_fig=False):
    X, T = shuffle(X, T)
    X_train, T_train = X[:-1000], T[:-1000]
    X_valid, T_valid = X[-1000:], T[-1000:]

    N, D = X_train.shape
    K = len(set(T_train))

    # initialize parameters
    self.W1 = np.random.randn(D, self.M) / np.sqrt(D)
    self.b1 = np.zeros(self.M)
    self.W2 = np.random.randn(self.M, K) / np.sqrt(self.M)
    self.b2 = np.zeros(K)

    costs = []
    best_validation_error = 1
    for n in range(epochs):
        # forward propagation
        Y, Z = self.forwardprop(X_train)

        # gradient descent
        T_train_ind = y2indicator(T_train)
        Y_T = Y - T_train_ind
        self.W2 -= learning_rate * (Z.T.dot(Y_T) + reg * self.W2)
        self.b2 -= learning_rate * (Y_T.sum(axis=0) + reg * self.b2)
        dZ = Y_T.dot(self.W2.T) * (1 - Z * Z)
        # these must be decremented (-=); plain assignment would overwrite W1/b1 with the gradient
        self.W1 -= learning_rate * (X_train.T.dot(dZ) + reg * self.W1)
        self.b1 -= learning_rate * (dZ.sum(axis=0) + reg * self.b1)

        # report validation cost and error rate
        if n % 10 == 0:
            Y_valid, _ = self.forwardprop(X_valid)
            c = cost2(T_valid, Y_valid)
            costs.append(c)
            er = error_rate(T_valid, np.argmax(Y_valid, axis=1))
            print(n, 'cost:', c, 'error', er)
            if er < best_validation_error:
                best_validation_error = er
    print('Best validation error:', best_validation_error)

    if show_fig:
        plt.plot(costs)
        plt.title('cross entropy loss')
        plt.show()
def fit(self, X, Y, learning_rate=1e-6, reg=1e-6, epochs=10000, show_fig=False):
    X, Y = shuffle(X, Y)
    Xvalid, Yvalid = X[-1000:], Y[-1000:]
    X, Y = X[:-1000], Y[:-1000]  # keep everything except the last 1000 for training

    N, D = X.shape
    K = max(Y) + 1
    T = y2indicator(Y)
    self.W1 = np.random.randn(D, self.M) / np.sqrt(D)
    self.b1 = np.zeros(self.M)
    self.W2 = np.random.randn(self.M, K) / np.sqrt(self.M)  # K, not the undefined lowercase k
    self.b2 = np.zeros(K)

    costs = []
    best_validation_error = 1
    for i in range(epochs):
        # forward propagation and cost calculation
        pY, Z = self.forward(X)

        # gradient descent step
        pY_T = pY - T
        self.W2 -= learning_rate * (Z.T.dot(pY_T) + reg * self.W2)
        self.b2 -= learning_rate * (pY_T.sum(axis=0) + reg * self.b2)
        # dZ = pY_T.dot(self.W2.T) * (Z > 0)  # relu
        dZ = pY_T.dot(self.W2.T) * (1 - Z * Z)  # tanh
        self.W1 -= learning_rate * (X.T.dot(dZ) + reg * self.W1)  # backprop through dZ, not Z
        self.b1 -= learning_rate * (dZ.sum(axis=0) + reg * self.b1)

        if i % 10 == 0:
            pYvalid, _ = self.forward(Xvalid)
            c = cost2(Yvalid, pYvalid)
            costs.append(c)
            e = error_rate(Yvalid, np.argmax(pYvalid, axis=1))
            print('i: ', i, 'cost:', c, 'error: ', e)
            if e < best_validation_error:
                best_validation_error = e
    print('best_validation_error: ', best_validation_error)

    if show_fig:
        plt.plot(costs)
        plt.show()
def fit(self, X_train, labels_train, X_val, labels_val, learning_rate=5e-7, lambda_=1e0, epochs=5000, show_fig=False):
    N, D = X_train.shape
    K = len(set(labels_train))
    Y_train = y2indicator(labels_train)

    self.W1 = np.random.randn(D, self.M) * np.sqrt(2 / (D + self.M))
    self.b1 = np.zeros(self.M)
    self.W2 = np.random.randn(self.M, K) * np.sqrt(2 / (self.M + K))
    self.b2 = np.zeros(K)

    costs = []
    best_val_error = 1
    for i in range(epochs):
        # Forward Propagation
        Y_train_pred, Z = self.forward(X_train)

        # Gradient Descent step
        delta2 = Y_train_pred - Y_train
        self.W2 -= learning_rate * (Z.T.dot(delta2) + lambda_ * self.W2)
        self.b2 -= learning_rate * (delta2.sum(axis=0) + lambda_ * self.b2)
        # delta1 = np.outer(delta2, self.W2) * (Z > 0)
        delta1 = delta2.dot(self.W2.T) * (1 - Z * Z)
        self.W1 -= learning_rate * (X_train.T.dot(delta1) + lambda_ * self.W1)
        self.b1 -= learning_rate * (delta1.sum(axis=0) + lambda_ * self.b1)

        if i % 50 == 0:
            Y_val_pred, _ = self.forward(X_val)
            c = softmax_cost2(labels_val, Y_val_pred)
            costs.append(c)
            e = error_rate(labels_val, np.argmax(Y_val_pred, axis=1))
            print("i:", i, "cost:", c, "error:", e)
            if e < best_val_error:
                best_val_error = e
    print("best_val_error:", best_val_error)

    if show_fig:
        plt.plot(costs)
        plt.show()
def fit(self, X, Y, learning_rate=10e-7, reg=10e-7, epochs=10000, show_fig=False):
    # Tvalid = y2indicator(Yvalid)
    Xvalid, Yvalid, X, Y = splitTrainTestFromLast(X, Y, 1000)

    N, D = X.shape
    K = len(set(Y))
    T = y2indicator(Y)
    self.W1 = np.random.randn(D, self.M) / np.sqrt(D + self.M)
    self.b1 = np.zeros(self.M)
    self.W2 = np.random.randn(self.M, K) / np.sqrt(self.M + K)
    self.b2 = np.zeros(K)

    costs = []
    best_validation_error = 1
    for i in range(epochs):
        # forward propagation and cost calculation
        pY, Z = self.forward(X)

        # gradient descent step
        pY_T = pY - T
        self.W2 -= learning_rate * (Z.T.dot(pY_T) + reg * self.W2)
        self.b2 -= learning_rate * (pY_T.sum(axis=0) + reg * self.b2)
        # dZ = pY_T.dot(self.W2.T) * (Z > 0)  # relu
        dZ = pY_T.dot(self.W2.T) * (1 - Z * Z)  # tanh
        self.W1 -= learning_rate * (X.T.dot(dZ) + reg * self.W1)
        self.b1 -= learning_rate * (dZ.sum(axis=0) + reg * self.b1)

        if i % 10 == 0:
            pYvalid, _ = self.forward(Xvalid)
            c = cost2(Yvalid, pYvalid)
            costs.append(c)
            e = error_rate(Yvalid, np.argmax(pYvalid, axis=1))
            print("i:", i, "cost:", c, "error:", e)
            if e < best_validation_error:
                best_validation_error = e
    print("best_validation_error:", best_validation_error)

    if show_fig:
        plt.plot(costs)
        plt.show()
def fit(self, X, Y, learning_rate=10e-1, activation=tf.nn.sigmoid, epochs=20):
    N, T, D = X.shape  # the parameter is X; Xtrain is not defined in this scope
    Y_flat = np.copy(Y)
    Y = y2indicator(Y)

    self.f = activation
    batch_count = N // self.batch_size
    costs = []
    for i in range(epochs):
        batch_grp = np.arange(0, self.batch_size)
        for j in range(batch_count):
            Xbatch, Ybatch = X[batch_grp], Y[batch_grp]
            Xbatch = Xbatch.reshape((self.batch_size, self.chunk_size, self.input_size))
            batch_grp += self.batch_size

            # run the ops through the session stored on the instance
            self.session.run(
                [self.train_op, self.cost_op, self.predict_op],
                feed_dict={self.Xin: Xbatch, self.labels: Ybatch})

            if j % 20 == 0:
                testbatch_grp = np.random.choice(N, self.batch_size, replace=True)
                c, p = self.session.run(
                    [self.cost_op, self.predict_op],
                    feed_dict={self.Xin: X[testbatch_grp], self.labels: Y[testbatch_grp]})
                a = accuracy(Y_flat[testbatch_grp], p)
                print("i:", i, "j:", j, "nb:", batch_count, "cost:", c, "accuracy:", a)
def fit(self, X, Y, learning_rate=1e-3, epochs=2, batch_size=100, test_size=1000):
    N, *D = X.shape
    Y_flat = np.copy(Y)
    Y = y2indicator(Y)

    batch_count = N // batch_size
    costs = []
    for i in range(epochs):
        batch_grp = np.arange(0, batch_size)
        for j in range(batch_count):
            Xbatch, Ybatch = X[batch_grp], Y[batch_grp]
            batch_grp += batch_size

            self.session.run(
                [self.train_op, self.cost],
                feed_dict={self.Xin: Xbatch, self.labels: Ybatch})

            if j % 20 == 0:
                testbatch_grp = np.random.choice(N, test_size, replace=True)
                c, p = self.session.run(
                    [self.cost, self.predictions],
                    feed_dict={self.Xin: X[testbatch_grp], self.labels: Y[testbatch_grp]})
                costs.append(c)
                a = accuracy(Y_flat[testbatch_grp], p)
                print("i:", i, "j:", j, "nb:", batch_count, "cost:", c, "accuracy:", a)
def fit(self, X, Y, learning_rate=1e-6, reg=1e-6, epochs=10000, show_fig=False):
    X, Y = shuffle(X, Y)
    Xvalid, Yvalid = X[-1000:], Y[-1000:]
    # Tvalid = y2indicator(Yvalid)
    X, Y = X[:-1000], Y[:-1000]

    N, D = X.shape
    K = len(set(Y))
    T = y2indicator(Y)
    self.W1 = np.random.randn(D, self.M) / np.sqrt(D)
    self.b1 = np.zeros(self.M)
    self.W2 = np.random.randn(self.M, K) / np.sqrt(self.M)
    self.b2 = np.zeros(K)

    costs = []
    best_validation_error = 1
    for i in range(epochs):
        # forward propagation and cost calculation
        pY, Z = self.forward(X)

        # gradient descent step
        pY_T = pY - T
        self.W2 -= learning_rate*(Z.T.dot(pY_T) + reg*self.W2)
        self.b2 -= learning_rate*(pY_T.sum(axis=0) + reg*self.b2)
        # dZ = pY_T.dot(self.W2.T) * (Z > 0)  # relu
        dZ = pY_T.dot(self.W2.T) * (1 - Z*Z)  # tanh
        self.W1 -= learning_rate*(X.T.dot(dZ) + reg*self.W1)
        self.b1 -= learning_rate*(dZ.sum(axis=0) + reg*self.b1)

        if i % 10 == 0:
            pYvalid, _ = self.forward(Xvalid)
            c = cost2(Yvalid, pYvalid)
            costs.append(c)
            e = error_rate(Yvalid, np.argmax(pYvalid, axis=1))
            print("i:", i, "cost:", c, "error:", e)
            if e < best_validation_error:
                best_validation_error = e
    print("best_validation_error:", best_validation_error)

    if show_fig:
        plt.plot(costs)
        plt.show()
def fit(self, X_train, labels_train, X_val, labels_val, learning_rate=5e-7, lambda_=1e0, epochs=5000, show_fig=False):
    N, D = X_train.shape
    K = len(set(labels_train))
    Y_train = y2indicator(labels_train)

    self.W = np.random.randn(D, K) * np.sqrt(1 / D)
    self.b = np.zeros(K)

    costs = []
    best_val_error = 1
    for i in range(epochs):
        # Forward propagation
        Y_train_pred = self.forward(X_train)

        # Gradient descent
        self.W -= learning_rate * (X_train.T.dot(Y_train_pred - Y_train) + lambda_ * self.W)
        self.b -= learning_rate * ((Y_train_pred - Y_train).sum(axis=0) + lambda_ * self.b)

        if i % 50 == 0:
            Y_val_pred = self.forward(X_val)
            c = softmax_cost2(labels_val, Y_val_pred)
            costs.append(c)
            e = error_rate(labels_val, np.argmax(Y_val_pred, axis=1))
            print("Epoch:", i, "Cost:", c, "Error rate", e)
            if e < best_val_error:
                best_val_error = e
    print("Best validation error", best_val_error)

    if show_fig:
        plt.plot(costs)
        plt.show()
def main(): #some load data K = 10 (Xtrain, Ytrain), (Xtest, Ytest) = cifar10_test() Xtrain = (Xtrain / 255).astype(np.float32) Xtest = (Xtest / 255).astype(np.float32) Ytrain_ind = y2indicator(Ytrain, K).astype(np.int32) Ytest_ind = y2indicator(Ytest, K).astype(np.int32) print(Xtrain.shape) print(Ytrain_ind.shape) epoch = 200 print_period = 10 N = Xtrain.shape[0] batch_sz = 250 n_batches = N // batch_sz n_batches_test = Xtrain.shape[0] // batch_sz M = 512 M1 = 512 #K = 10 ABOVE poolsz = (2, 2) W1_shape = (5, 5, 3, 16) #32 / 2 = 16 W1_init = init_filter(W1_shape, poolsz) b1_init = np.zeros(W1_shape[-1], dtype=np.float32) W2_shape = (5, 5, 16, 40) # 16 / 2= 8 W2_init = init_filter(W2_shape, poolsz) b2_init = np.zeros(W2_shape[-1], dtype=np.float32) W3_shape = (5, 5, 40, 100) # 8 / 2 =4 W3_init = init_filter(W3_shape, poolsz) b3_init = np.zeros(W3_shape[-1], dtype=np.float32) W4_shape = (5, 5, 100, 196) #4 / 2 = 2 W4_init = init_filter(W4_shape, poolsz) b4_init = np.zeros(W4_shape[-1], dtype=np.float32) W5_init = np.random.randn(W4_shape[-1] * 2 * 2, M) / np.sqrt(W4_shape[-1] * 2 * 2 + M) b5_init = np.zeros(M, dtype=np.float32) W6_init = np.random.randn(M, M1) / np.sqrt(M + M1) b6_init = np.zeros(M1, dtype=np.float32) W7_init = np.random.randn(M1, K) / np.sqrt(M1 + K) b7_init = np.zeros(K, dtype=np.float32) X = tf.placeholder(tf.float32, shape=(batch_sz, 32, 32, 3), name='X') T = tf.placeholder(tf.float32, shape=(batch_sz, K), name='T') W1 = tf.Variable(W1_init.astype(np.float32)) b1 = tf.Variable(b1_init.astype(np.float32)) W2 = tf.Variable(W2_init.astype(np.float32)) b2 = tf.Variable(b2_init.astype(np.float32)) W3 = tf.Variable(W3_init.astype(np.float32)) b3 = tf.Variable(b3_init.astype(np.float32)) W4 = tf.Variable(W4_init.astype(np.float32)) b4 = tf.Variable(b4_init.astype(np.float32)) W5 = tf.Variable(W5_init.astype(np.float32)) b5 = tf.Variable(b5_init.astype(np.float32)) W6 = tf.Variable(W6_init.astype(np.float32)) b6 = tf.Variable(b6_init.astype(np.float32)) W7 = tf.Variable(W7_init.astype(np.float32)) b7 = tf.Variable(b7_init.astype(np.float32)) Z1 = convpool(X, W1, b1) Z2 = convpool(Z1, W2, b2) Z3 = convpool(Z2, W3, b3) Z4 = convpool(Z3, W4, b4) Z4_shape = Z4.get_shape().as_list() Z4r = tf.reshape(Z4, [-1, np.prod(Z4_shape[1:])]) Z5 = tf.nn.relu(tf.matmul(Z4r, W5) + b5) Z6 = tf.nn.relu(tf.matmul(Z5, W6) + b6) Yish = tf.matmul(Z6, W7) + b7 cost = tf.reduce_sum( tf.nn.softmax_cross_entropy_with_logits_v2(logits=Yish, labels=T)) cost_test = tf.reduce_mean( tf.nn.softmax_cross_entropy_with_logits_v2(logits=Yish, labels=T)) train_op = tf.train.AdamOptimizer(0.001).minimize(cost) predict_op = tf.argmax(tf.nn.softmax(Yish), axis=1) t0 = time.time() LL = [] init = tf.global_variables_initializer() with tf.Session() as session: session.run(init) for i in range(epoch): for j in range(n_batches): Xbatch = Xtrain[j * batch_sz:batch_sz * (j + 1)] Ybatch = Ytrain_ind[j * batch_sz:batch_sz * (j + 1)] if len(Xbatch) == batch_sz: session.run(train_op, feed_dict={X: Xbatch, T: Ybatch}) if j % print_period == 0: # due to RAM limitations we need to have a fixed size input # so as a result, we have this ugly total cost and prediction computation test_cost = 0 prediction = np.zeros(len(Xtest)) for k in range(Xtest.shape[0] // batch_sz): Xtestbatch = Xtest[k * batch_sz:batch_sz * (k + 1)] Ytestbatch = Ytest_ind[k * batch_sz:batch_sz * (k + 1), ] test_cost += session.run(cost_test, feed_dict={ X: Xtestbatch, T: Ytestbatch }) prediction[k * batch_sz:batch_sz * (k + 1)] = session.run( predict_op, 
feed_dict={X: Xtestbatch}) accur = error_rate(prediction, np.argmax(Ytest_ind, axis=1)) print( f'epoch is {i} accuracy is {round(accur,5)} and cost is {round(test_cost,5)}' ) LL.append(test_cost) plt.plot(LL) plt.show()
def main(): max_iter = 20 # make it 30 for sigmoid print_period = 10 X, Y = get_normalized_data() lr = 0.00004 reg = 0.01 Xtrain = X[:-1000,] Ytrain = Y[:-1000] Xtest = X[-1000:,] Ytest = Y[-1000:] Ytrain_ind = y2indicator(Ytrain) Ytest_ind = y2indicator(Ytest) N, D = Xtrain.shape batch_sz = 500 n_batches = N / batch_sz M = 300 K = 10 W1 = np.random.randn(D, M) / 28 b1 = np.zeros(M) W2 = np.random.randn(M, K) / np.sqrt(M) b2 = np.zeros(K) # 1. const # cost = -16 LL_batch = [] CR_batch = [] for i in xrange(max_iter): for j in xrange(n_batches): Xbatch = Xtrain[j*batch_sz:(j*batch_sz + batch_sz),] Ybatch = Ytrain_ind[j*batch_sz:(j*batch_sz + batch_sz),] pYbatch, Z = forward(Xbatch, W1, b1, W2, b2) # print "first batch cost:", cost(pYbatch, Ybatch) # updates W2 -= lr*(derivative_w2(Z, Ybatch, pYbatch) + reg*W2) b2 -= lr*(derivative_b2(Ybatch, pYbatch) + reg*b2) W1 -= lr*(derivative_w1(Xbatch, Z, Ybatch, pYbatch, W2) + reg*W1) b1 -= lr*(derivative_b1(Z, Ybatch, pYbatch, W2) + reg*b1) if j % print_period == 0: # calculate just for LL pY, _ = forward(Xtest, W1, b1, W2, b2) # print "pY:", pY ll = cost(pY, Ytest_ind) LL_batch.append(ll) print "Cost at iteration i=%d, j=%d: %.6f" % (i, j, ll) err = error_rate(pY, Ytest) CR_batch.append(err) print "Error rate:", err pY, _ = forward(Xtest, W1, b1, W2, b2) print "Final error rate:", error_rate(pY, Ytest) # 2. RMSprop W1 = np.random.randn(D, M) / 28 b1 = np.zeros(M) W2 = np.random.randn(M, K) / np.sqrt(M) b2 = np.zeros(K) LL_rms = [] CR_rms = [] lr0 = 0.001 # if you set this too high you'll get NaN! cache_W2 = 0 cache_b2 = 0 cache_W1 = 0 cache_b1 = 0 decay_rate = 0.999 eps = 0.0000000001 for i in xrange(max_iter): for j in xrange(n_batches): Xbatch = Xtrain[j*batch_sz:(j*batch_sz + batch_sz),] Ybatch = Ytrain_ind[j*batch_sz:(j*batch_sz + batch_sz),] pYbatch, Z = forward(Xbatch, W1, b1, W2, b2) # print "first batch cost:", cost(pYbatch, Ybatch) # updates gW2 = derivative_w2(Z, Ybatch, pYbatch) + reg*W2 cache_W2 = decay_rate*cache_W2 + (1 - decay_rate)*gW2*gW2 W2 -= lr0 * gW2 / (np.sqrt(cache_W2) + eps) gb2 = derivative_b2(Ybatch, pYbatch) + reg*b2 cache_b2 = decay_rate*cache_b2 + (1 - decay_rate)*gb2*gb2 b2 -= lr0 * gb2 / (np.sqrt(cache_b2) + eps) gW1 = derivative_w1(Xbatch, Z, Ybatch, pYbatch, W2) + reg*W1 cache_W1 = decay_rate*cache_W1 + (1 - decay_rate)*gW1*gW1 W1 -= lr0 * gW1 / (np.sqrt(cache_W1) + eps) gb1 = derivative_b1(Z, Ybatch, pYbatch, W2) + reg*b1 cache_b1 = decay_rate*cache_b1 + (1 - decay_rate)*gb1*gb1 b1 -= lr0 * gb1 / (np.sqrt(cache_b1) + eps) if j % print_period == 0: # calculate just for LL pY, _ = forward(Xtest, W1, b1, W2, b2) # print "pY:", pY ll = cost(pY, Ytest_ind) LL_rms.append(ll) print "Cost at iteration i=%d, j=%d: %.6f" % (i, j, ll) err = error_rate(pY, Ytest) CR_rms.append(err) print "Error rate:", err pY, _ = forward(Xtest, W1, b1, W2, b2) print "Final error rate:", error_rate(pY, Ytest) plt.plot(LL_batch, label='const') plt.plot(LL_rms, label='rms') plt.legend() plt.show()
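# The optimizer-comparison scripts (constant learning rate, momentum, Nesterov, RMSprop,
# Adam) all share module-level helpers for a one-hidden-layer network: forward,
# derivative_w2, derivative_b2, derivative_w1, derivative_b1. A minimal sketch consistent
# with how they are called above and below -- the actual helper file may differ, and the
# tanh activation here is an assumption (some versions use relu or sigmoid).
import numpy as np

def forward(X, W1, b1, W2, b2):
    Z = np.tanh(X.dot(W1) + b1)             # hidden layer
    A = Z.dot(W2) + b2
    expA = np.exp(A - A.max(axis=1, keepdims=True))
    Y = expA / expA.sum(axis=1, keepdims=True)  # softmax output
    return Y, Z

def derivative_w2(Z, T, Y):
    # gradient of the cross-entropy cost w.r.t. W2 (to be subtracted in the update)
    return Z.T.dot(Y - T)

def derivative_b2(T, Y):
    return (Y - T).sum(axis=0)

def derivative_w1(X, Z, T, Y, W2):
    dZ = (Y - T).dot(W2.T) * (1 - Z * Z)    # tanh derivative
    return X.T.dot(dZ)

def derivative_b1(Z, T, Y, W2):
    return ((Y - T).dot(W2.T) * (1 - Z * Z)).sum(axis=0)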
# installation is easy! just the usual "sudo pip(3) install keras"

# get the data, same as Theano + Tensorflow examples
# no need to split now, the fit() function will do it
X, Y = get_normalized_data()

# get shapes
N, D = X.shape
K = len(set(Y))

# by default Keras wants one-hot encoded labels
# there's another cost function we can use
# where we can just pass in the integer labels directly
# just like Tensorflow / Theano
Y = y2indicator(Y)

# the model will be a sequence of layers
model = Sequential()

# ANN with layers [784] -> [500] -> [300] -> [10]
model.add(Dense(units=500, input_dim=D))
model.add(Activation('relu'))
model.add(Dense(units=300))  # don't need to specify input_dim
model.add(Activation('relu'))
model.add(Dense(units=K))
model.add(Activation('softmax'))
# installation is easy! just the usual "sudo pip(3) install keras"

# get the data, same as Theano + Tensorflow examples
# no need to split now, the fit() function will do it
Xtrain, Xtest, Ytrain, Ytest = get_normalized_data()

# get shapes
N, D = Xtrain.shape
K = len(set(Ytrain))

# by default Keras wants one-hot encoded labels
# there's another cost function we can use
# where we can just pass in the integer labels directly
# just like Tensorflow / Theano
Ytrain = y2indicator(Ytrain)
Ytest = y2indicator(Ytest)

# the model will be a sequence of layers
model = Sequential()

# ANN with layers [784] -> [500] -> [300] -> [10]
model.add(Dense(units=500, input_dim=D))
model.add(Activation('relu'))
model.add(Dense(units=300))  # don't need to specify input_dim
model.add(Activation('relu'))
model.add(Dense(units=K))
model.add(Activation('softmax'))
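# The Keras snippet above stops after the softmax layer. A minimal sketch of how the
# model might be compiled and trained on the one-hot targets prepared above -- the
# optimizer, batch size and epoch count here are illustrative choices, not taken from
# the original script.
model.compile(
    loss='categorical_crossentropy',   # matches the one-hot encoded Ytrain / Ytest
    optimizer='adam',
    metrics=['accuracy']
)
r = model.fit(Xtrain, Ytrain, validation_data=(Xtest, Ytest), epochs=15, batch_size=32)
print("Returned:", r.history.keys())   # loss / accuracy curves live in r.history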
def main(): # 1.batch SGD # 2. batch SGD with momentum # 3. batch SGD with Nesterov max_iter = 20 print_period = 10 X, Y = get_normalized_data() lr = 0.00004 reg = 0.01 Xtrain, Ytrain = X[:-1000], Y[:-1000] Xtest, Ytest = X[-1000:], Y[-1000:] Ytrain_ind = y2indicator(Ytrain) Ytest_ind = y2indicator(Ytest) N, D = X.shape batch_sz = 500 n_batches = N / batch_sz M = 300 # number of hidden neurons K = 10 # number of output classes W1 = np.random.randn(D, M) / np.sqrt(D + M) b1 = np.zeros(M) W2 = np.random.randn(M, K) / np.sqrt(M) b2 = np.zeros(K) #1. batch SGD LL_batch = [] CR_batch = [] for i in xrange(max_iter): for j in xrange(n_batches): Xbatch = Xtrain[j * batch_sz:(j + 1) * batch_sz, ] Ybatch = Ytrain_ind[j * batch_sz:(j + 1) * batch_sz, ] pYbatch, Z = forward(Xbatch, W1, b1, W2, b2) W2 -= lr * (derivative_w2(Z, Ybatch, pYbatch) + reg * W2) b2 -= lr * (derivative_b2(Ybatch, pYbatch) + reg * b2) W1 -= lr * (derivative_w1(Xbatch, Z, Ybatch, pYbatch, W2) + reg * W1) b1 -= lr * (derivative_b1(Z, Ybatch, pYbatch, W2) + reg * b1) if j % print_period == 0: pY, _ = forward(Xtest, W1, b1, W2, b2) ll = cost(pY, Ytest_ind) LL_batch.append(ll) print "Cost at iteration i=%d, j=%d: %.6f" % (i, j, ll) err = error_rate(pY, Ytest) CR_batch.append(err) print "Error rate:", err pY, _ = forward(Xtest, W1, b1, W2, b2) print "Final error rate:", error_rate(pY, Ytest) #2. batch with momentum W1 = np.random.randn(D, M) / np.sqrt(D + M) b1 = np.zeros(M) W2 = np.random.randn(M, K) / np.sqrt(M) b2 = np.zeros(K) LL_momentum = [] CR_momentum = [] mu = 0.9 dW2 = 0 db2 = 0 dW1 = 0 db1 = 0 for i in xrange(max_iter): for j in xrange(n_batches): Xbatch = Xtrain[j * batch_sz:(j + 1) * batch_sz, ] Ybatch = Ytrain_ind[j * batch_sz:(j + 1) * batch_sz, ] pYbatch, Z = forward(Xbatch, W1, b1, W2, b2) dW2 = mu * dW2 - lr * (derivative_w2(Z, Ybatch, pYbatch) + reg * W2) W2 += dW2 db2 = mu * db2 - lr * (derivative_b2(Ybatch, pYbatch) + reg * b2) b2 += db2 dW1 = mu * dW1 - lr * ( derivative_w1(Xbatch, Z, Ybatch, pYbatch, W2) + reg * W1) W1 += dW1 db1 = mu * db1 - lr * (derivative_b1(Z, Ybatch, pYbatch, W2) + reg * b1) b1 += db1 if j % print_period == 0: pY, _ = forward(Xtest, W1, b1, W2, b2) ll = cost(pY, Ytest_ind) LL_momentum.append(ll) print "Cost at iteration i=%d, j=%d: %.6f" % (i, j, ll) err = error_rate(pY, Ytest) CR_momentum.append(err) print "Error rate:", err pY, _ = forward(Xtest, W1, b1, W2, b2) print "Final error rate:", error_rate(pY, Ytest) #3. 
batch with Nesterov momentum W1 = np.random.randn(D, M) / np.sqrt(D + M) b1 = np.zeros(M) W2 = np.random.randn(M, K) / np.sqrt(M) b2 = np.zeros(K) LL_nesterov = [] CR_nesterov = [] mu = 0.9 dW2 = 0 db2 = 0 dW1 = 0 db1 = 0 for i in xrange(max_iter): for j in xrange(n_batches): Xbatch = Xtrain[j * batch_sz:(j + 1) * batch_sz, ] Ybatch = Ytrain_ind[j * batch_sz:(j + 1) * batch_sz, ] pYbatch, Z = forward(Xbatch, W1, b1, W2, b2) dW2 = mu * mu * dW2 - (1 + mu) * lr * ( derivative_w2(Z, Ybatch, pYbatch) + reg * W2) W2 += dW2 db2 = mu * mu * db2 - (1 + mu) * lr * ( derivative_b2(Ybatch, pYbatch) + reg * b2) b2 += db2 dW1 = mu * mu * dW1 - (1 + mu) * lr * ( derivative_w1(Xbatch, Z, Ybatch, pYbatch, W2) + reg * W1) W1 += dW1 db1 = mu * mu * db1 - (1 + mu) * lr * ( derivative_b1(Z, Ybatch, pYbatch, W2) + reg * b1) b1 += db1 if j % print_period == 0: pY, _ = forward(Xtest, W1, b1, W2, b2) ll = cost(pY, Ytest_ind) LL_nesterov.append(ll) print "Cost at iteration i=%d, j=%d: %.6f" % (i, j, ll) err = error_rate(pY, Ytest) CR_nesterov.append(err) print "Error rate:", err pY, _ = forward(Xtest, W1, b1, W2, b2) print "Final error rate:", error_rate(pY, Ytest) plt.plot(LL_batch, label="batch") plt.plot(LL_momentum, label='momentum') plt.plot(LL_nesterov, label='nesterov') plt.legend() plt.show()
def main(): # step 1: get the data and define all the usual variables X, Y = get_normalized_data() max_iter = 15 print_period = 10 lr = 0.00004 mu = 0.9 Xtrain = X[:-1000,] Ytrain = Y[:-1000] Xtest = X[-1000:,] Ytest = Y[-1000:] Ytrain_ind = y2indicator(Ytrain) Ytest_ind = y2indicator(Ytest) N, D = Xtrain.shape batch_sz = 500 n_batches = N / batch_sz # add an extra layer just for fun M1 = 300 M2 = 100 K = 10 W1_init = np.random.randn(D, M1) / 28 b1_init = np.zeros(M1) W2_init = np.random.randn(M1, M2) / np.sqrt(M1) b2_init = np.zeros(M2) W3_init = np.random.randn(M2, K) / np.sqrt(M2) b3_init = np.zeros(K) # define variables and expressions X = tf.placeholder(tf.float32, shape=(None, D), name='X') T = tf.placeholder(tf.float32, shape=(None, K), name='T') W1 = tf.Variable(W1_init.astype(np.float32)) b1 = tf.Variable(b1_init.astype(np.float32)) W2 = tf.Variable(W2_init.astype(np.float32)) b2 = tf.Variable(b2_init.astype(np.float32)) W3 = tf.Variable(W3_init.astype(np.float32)) b3 = tf.Variable(b3_init.astype(np.float32)) # define the model Z1 = tf.nn.relu( tf.matmul(X, W1) + b1 ) Z2 = tf.nn.relu( tf.matmul(Z1, W2) + b2 ) Yish = tf.matmul(Z2, W3) + b3 # remember, the cost function does the softmaxing! weird, right? # softmax_cross_entropy_with_logits take in the "logits" # if you wanted to know the actual output of the neural net, # you could pass "Yish" into tf.nn.softmax(logits) cost = tf.reduce_sum(tf.nn.softmax_cross_entropy_with_logits(Yish, T)) # we choose the optimizer but don't implement the algorithm ourselves # let's go with RMSprop, since we just learned about it. # it includes momentum! train_op = tf.train.RMSPropOptimizer(lr, decay=0.99, momentum=0.9).minimize(cost) # we'll use this to calculate the error rate predict_op = tf.argmax(Yish, 1) LL = [] init = tf.initialize_all_variables() with tf.Session() as session: session.run(init) for i in xrange(max_iter): for j in xrange(n_batches): Xbatch = Xtrain[j*batch_sz:(j*batch_sz + batch_sz),] Ybatch = Ytrain_ind[j*batch_sz:(j*batch_sz + batch_sz),] session.run(train_op, feed_dict={X: Xbatch, T: Ybatch}) if j % print_period == 0: test_cost = session.run(cost, feed_dict={X: Xtest, T: Ytest_ind}) prediction = session.run(predict_op, feed_dict={X: Xtest}) err = error_rate(prediction, Ytest) print "Cost / err at iteration i=%d, j=%d: %.6f / %.3f" % (i, j, test_cost, err) LL.append(test_cost) plt.plot(LL) plt.show()
def main(): X, Y = get_normalized_data() max_iter = 20 print_period = 10 lr = 0.00004 reg = 0.01 Xtrain = X[:-1000,] Ytrain = Y[:-1000] Xtest = X[-1000:,] Ytest = Y[-1000:] Ytrain_ind = y2indicator(Ytrain) Ytest_ind = y2indicator(Ytest) N, D = Xtrain.shape batch_sz = 500 n_batches = N / batch_sz M = 300 K = 10 W1_init = np.random.randn(D, M) / 28 b1_init = np.zeros(M) W2_init = np.random.randn(M, K) / np.sqrt(M) b2_init = np.zeros(K) thX = T.matrix('X') thT = T.matrix('T') W1 = theano.shared(W1_init, 'W1') b1 = theano.shared(b1_init, 'b1') W2 = theano.shared(W2_init, 'W2') b2 = theano.shared(b2_init, 'b2') thZ = relu( thX.dot(W1) + b1 ) thY = T.nnet.softmax( thZ.dot(W2) + b2 ) cost = -(thT * T.log(thY)).sum() + reg*((W1*W1).sum() + (b1*b1).sum() + (W2*W2).sum() + (b2*b2).sum()) prediction = T.argmax(thY, axis=1) update_W1 = W1 - lr*T.grad(cost, W1) update_b1 = b1 - lr*T.grad(cost, b1) update_W2 = W2 - lr*T.grad(cost, W2) update_b2 = b2 - lr*T.grad(cost, b2) train = theano.function( inputs=[thX, thT], updates=[(W1, update_W1), (b1, update_b1), (W2, update_W2), (b2, update_b2)], ) get_prediction = theano.function( inputs=[thX, thT], outputs =[cost, prediction], ) LL = [] for i in range(max_iter): for j in range(int(n_batches)): Xbatch = Xtrain[j*batch_sz:(j*batch_sz + batch_sz),] Ybatch = Ytrain_ind[j*batch_sz:(j*batch_sz + batch_sz),] train(Xbatch, Ybatch) if j % print_period == 0: cost_val, prediction_val = get_prediction(Xtest, Ytest_ind) err = error_rate(prediction_val, Ytest) print("Cost / err at iteration i=%d, j=%d: %.3f / %.3f" % (i, j, cost_val, err)) LL.append(cost_val) plt.plot(LL) plt.show()
def fit(self, X, Y, lr=10e-4, mu=0.99, reg=10e-4, decay=0.99999, eps=10e-3, batch_sz=30, epochs=3, show_fig=True): lr = np.float32(lr) mu = np.float32(mu) reg = np.float32(reg) decay = np.float32(decay) eps = np.float32(eps) K = len(set(Y)) # make a validation set X, Y = shuffle(X, Y) X = X.astype(np.float32) Y = y2indicator(Y).astype(np.float32) Xvalid, Yvalid = X[-1000:], Y[-1000:] X, Y = X[:-1000], Y[:-1000] Yvalid_flat = np.argmax(Yvalid, axis=1) # for calculating error rate # initialize convpool layers N, d, d, c = X.shape mi = c outw = d outh = d self.convpool_layers = [] for mo, fw, fh in self.convpool_layer_sizes: layer = ConvPoolLayer(mi, mo, fw, fh) self.convpool_layers.append(layer) outw = outw / 2 outh = outh / 2 mi = mo # initialize mlp layers self.hidden_layers = [] M1 = self.convpool_layer_sizes[-1][0]*outw*outh # size must be same as output of last convpool layer count = 0 for M2 in self.hidden_layer_sizes: h = HiddenLayer(M1, M2, count) self.hidden_layers.append(h) M1 = M2 count += 1 # logistic regression layer W, b = init_weight_and_bias(M1, K) self.W = tf.Variable(W, 'W_logreg') self.b = tf.Variable(b, 'b_logreg') # collect params for later use self.params = [self.W, self.b] for h in self.convpool_layers: self.params += h.params for h in self.hidden_layers: self.params += h.params # set up tensorflow functions and variables tfX = tf.placeholder(tf.float32, shape=(None, d, d, c), name='X') tfY = tf.placeholder(tf.float32, shape=(None, K), name='Y') act = self.forward(tfX) rcost = reg*sum([tf.nn.l2_loss(p) for p in self.params]) cost = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(act, tfY)) + rcost prediction = self.predict(tfX) train_op = tf.train.RMSPropOptimizer(lr, decay=decay, momentum=mu).minimize(cost) n_batches = N / batch_sz costs = [] init = tf.initialize_all_variables() with tf.Session() as session: session.run(init) for i in xrange(epochs): X, Y = shuffle(X, Y) for j in xrange(n_batches): Xbatch = X[j*batch_sz:(j*batch_sz+batch_sz)] Ybatch = Y[j*batch_sz:(j*batch_sz+batch_sz)] session.run(train_op, feed_dict={tfX: Xbatch, tfY: Ybatch}) if j % 20 == 0: c = session.run(cost, feed_dict={tfX: Xvalid, tfY: Yvalid}) costs.append(c) p = session.run(prediction, feed_dict={tfX: Xvalid, tfY: Yvalid}) e = error_rate(Yvalid_flat, p) print "i:", i, "j:", j, "nb:", n_batches, "cost:", c, "error rate:", e if show_fig: plt.plot(costs) plt.show()
def main(): max_iter = 10 print_period = 10 X, Y = get_normalized_data() reg = 0.01 Xtrain = X[:-1000,] Ytrain = Y[:-1000] Xtest = X[-1000:,] Ytest = Y[-1000:] Ytrain_ind = y2indicator(Ytrain) Ytest_ind = y2indicator(Ytest) N, D = Xtrain.shape batch_sz = 500 n_batches = N // batch_sz M = 300 K = 10 W1_0 = np.random.randn(D, M) / np.sqrt(D) b1_0 = np.zeros(M) W2_0 = np.random.randn(M, K) / np.sqrt(M) b2_0 = np.zeros(K) W1 = W1_0.copy() b1 = b1_0.copy() W2 = W2_0.copy() b2 = b2_0.copy() # 1st moment mW1 = 0 mb1 = 0 mW2 = 0 mb2 = 0 # 2nd moment vW1 = 0 vb1 = 0 vW2 = 0 vb2 = 0 # hyperparams lr0 = 0.001 beta1 = 0.9 beta2 = 0.999 eps = 1e-8 # 1. Adam loss_adam = [] err_adam = [] t = 1 for i in range(max_iter): for j in range(n_batches): Xbatch = Xtrain[j*batch_sz:(j*batch_sz + batch_sz),] Ybatch = Ytrain_ind[j*batch_sz:(j*batch_sz + batch_sz),] pYbatch, Z = forward(Xbatch, W1, b1, W2, b2) # updates # gradients gW2 = derivative_w2(Z, Ybatch, pYbatch) + reg*W2 gb2 = derivative_b2(Ybatch, pYbatch) + reg*b2 gW1 = derivative_w1(Xbatch, Z, Ybatch, pYbatch, W2) + reg*W1 gb1 = derivative_b1(Z, Ybatch, pYbatch, W2) + reg*b1 # new m mW1 = beta1 * mW1 + (1 - beta1) * gW1 mb1 = beta1 * mb1 + (1 - beta1) * gb1 mW2 = beta1 * mW2 + (1 - beta1) * gW2 mb2 = beta1 * mb2 + (1 - beta1) * gb2 # new v vW1 = beta2 * vW1 + (1 - beta2) * gW1 * gW1 vb1 = beta2 * vb1 + (1 - beta2) * gb1 * gb1 vW2 = beta2 * vW2 + (1 - beta2) * gW2 * gW2 vb2 = beta2 * vb2 + (1 - beta2) * gb2 * gb2 # bias correction correction1 = 1 - beta1 ** t hat_mW1 = mW1 / correction1 hat_mb1 = mb1 / correction1 hat_mW2 = mW2 / correction1 hat_mb2 = mb2 / correction1 correction2 = 1 - beta2 ** t hat_vW1 = vW1 / correction2 hat_vb1 = vb1 / correction2 hat_vW2 = vW2 / correction2 hat_vb2 = vb2 / correction2 # update t t += 1 # apply updates to the params W1 = W1 - lr0 * hat_mW1 / np.sqrt(hat_vW1 + eps) b1 = b1 - lr0 * hat_mb1 / np.sqrt(hat_vb1 + eps) W2 = W2 - lr0 * hat_mW2 / np.sqrt(hat_vW2 + eps) b2 = b2 - lr0 * hat_mb2 / np.sqrt(hat_vb2 + eps) if j % print_period == 0: pY, _ = forward(Xtest, W1, b1, W2, b2) l = cost(pY, Ytest_ind) loss_adam.append(l) print("Cost at iteration i=%d, j=%d: %.6f" % (i, j, l)) err = error_rate(pY, Ytest) err_adam.append(err) print("Error rate:", err) pY, _ = forward(Xtest, W1, b1, W2, b2) print("Final error rate:", error_rate(pY, Ytest)) # 2. 
RMSprop with momentum W1 = W1_0.copy() b1 = b1_0.copy() W2 = W2_0.copy() b2 = b2_0.copy() loss_rms = [] err_rms = [] # comparable hyperparameters for fair comparison lr0 = 0.001 mu = 0.9 decay_rate = 0.999 eps = 1e-8 # rmsprop cache cache_W2 = 1 cache_b2 = 1 cache_W1 = 1 cache_b1 = 1 # momentum dW1 = 0 db1 = 0 dW2 = 0 db2 = 0 for i in range(max_iter): for j in range(n_batches): Xbatch = Xtrain[j*batch_sz:(j*batch_sz + batch_sz),] Ybatch = Ytrain_ind[j*batch_sz:(j*batch_sz + batch_sz),] pYbatch, Z = forward(Xbatch, W1, b1, W2, b2) # updates gW2 = derivative_w2(Z, Ybatch, pYbatch) + reg*W2 cache_W2 = decay_rate*cache_W2 + (1 - decay_rate)*gW2*gW2 dW2 = mu * dW2 + (1 - mu) * lr0 * gW2 / (np.sqrt(cache_W2) + eps) W2 -= dW2 gb2 = derivative_b2(Ybatch, pYbatch) + reg*b2 cache_b2 = decay_rate*cache_b2 + (1 - decay_rate)*gb2*gb2 db2 = mu * db2 + (1 - mu) * lr0 * gb2 / (np.sqrt(cache_b2) + eps) b2 -= db2 gW1 = derivative_w1(Xbatch, Z, Ybatch, pYbatch, W2) + reg*W1 cache_W1 = decay_rate*cache_W1 + (1 - decay_rate)*gW1*gW1 dW1 = mu * dW1 + (1 - mu) * lr0 * gW1 / (np.sqrt(cache_W1) + eps) W1 -= dW1 gb1 = derivative_b1(Z, Ybatch, pYbatch, W2) + reg*b1 cache_b1 = decay_rate*cache_b1 + (1 - decay_rate)*gb1*gb1 db1 = mu * db1 + (1 - mu) * lr0 * gb1 / (np.sqrt(cache_b1) + eps) b1 -= db1 if j % print_period == 0: pY, _ = forward(Xtest, W1, b1, W2, b2) l = cost(pY, Ytest_ind) loss_rms.append(l) print("Cost at iteration i=%d, j=%d: %.6f" % (i, j, l)) err = error_rate(pY, Ytest) err_rms.append(err) print("Error rate:", err) pY, _ = forward(Xtest, W1, b1, W2, b2) print("Final error rate:", error_rate(pY, Ytest)) plt.plot(loss_adam, label='adam') plt.plot(loss_rms, label='rmsprop') plt.legend() plt.show()
def main(): Xtrain, Xtest, Ytrain, Ytest = get_transformed_data() print("Performing logistic regression...") N, D = Xtrain.shape Ytrain_ind = y2indicator(Ytrain) Ytest_ind = y2indicator(Ytest) # 1. full W = np.random.randn(D, 10) / np.sqrt(D) b = np.zeros(10) LL = [] lr = 0.0001 reg = 0.01 t0 = datetime.now() for i in range(50): p_y = forward(Xtrain, W, b) W += lr*(gradW(Ytrain_ind, p_y, Xtrain) - reg*W) b += lr*(gradb(Ytrain_ind, p_y) - reg*b) p_y_test = forward(Xtest, W, b) ll = cost(p_y_test, Ytest_ind) LL.append(ll) if i % 1 == 0: err = error_rate(p_y_test, Ytest) if i % 10 == 0: print("Cost at iteration %d: %.6f" % (i, ll)) print("Error rate:", err) p_y = forward(Xtest, W, b) print("Final error rate:", error_rate(p_y, Ytest)) print("Elapsted time for full GD:", datetime.now() - t0) # 2. stochastic W = np.random.randn(D, 10) / np.sqrt(D) b = np.zeros(10) LL_stochastic = [] lr = 0.0001 reg = 0.01 t0 = datetime.now() for i in range(50): # takes very long since we're computing cost for 41k samples tmpX, tmpY = shuffle(Xtrain, Ytrain_ind) for n in range(min(N, 500)): # shortcut so it won't take so long... x = tmpX[n,:].reshape(1,D) y = tmpY[n,:].reshape(1,10) p_y = forward(x, W, b) W += lr*(gradW(y, p_y, x) - reg*W) b += lr*(gradb(y, p_y) - reg*b) p_y_test = forward(Xtest, W, b) ll = cost(p_y_test, Ytest_ind) LL_stochastic.append(ll) if i % 1 == 0: err = error_rate(p_y_test, Ytest) if i % 10 == 0: print("Cost at iteration %d: %.6f" % (i, ll)) print("Error rate:", err) p_y = forward(Xtest, W, b) print("Final error rate:", error_rate(p_y, Ytest)) print("Elapsted time for SGD:", datetime.now() - t0) # 3. batch W = np.random.randn(D, 10) / np.sqrt(D) b = np.zeros(10) LL_batch = [] lr = 0.0001 reg = 0.01 batch_sz = 500 n_batches = N // batch_sz t0 = datetime.now() for i in range(50): tmpX, tmpY = shuffle(Xtrain, Ytrain_ind) for j in range(n_batches): x = tmpX[j*batch_sz:(j*batch_sz + batch_sz),:] y = tmpY[j*batch_sz:(j*batch_sz + batch_sz),:] p_y = forward(x, W, b) W += lr*(gradW(y, p_y, x) - reg*W) b += lr*(gradb(y, p_y) - reg*b) p_y_test = forward(Xtest, W, b) ll = cost(p_y_test, Ytest_ind) LL_batch.append(ll) if i % 1 == 0: err = error_rate(p_y_test, Ytest) if i % 10 == 0: print("Cost at iteration %d: %.6f" % (i, ll)) print("Error rate:", err) p_y = forward(Xtest, W, b) print("Final error rate:", error_rate(p_y, Ytest)) print("Elapsted time for batch GD:", datetime.now() - t0) x1 = np.linspace(0, 1, len(LL)) plt.plot(x1, LL, label="full") x2 = np.linspace(0, 1, len(LL_stochastic)) plt.plot(x2, LL_stochastic, label="stochastic") x3 = np.linspace(0, 1, len(LL_batch)) plt.plot(x3, LL_batch, label="batch") plt.legend() plt.show()
def main(): # compare 3 scenarios: # 1. batch SGD # 2. batch SGD with momentum # 3. batch SGD with Nesterov momentum max_iter = 20 # make it 30 for sigmoid print_period = 50 Xtrain, Xtest, Ytrain, Ytest = get_normalized_data() lr = 0.00004 reg = 0.01 Ytrain_ind = y2indicator(Ytrain) Ytest_ind = y2indicator(Ytest) N, D = Xtrain.shape batch_sz = 500 n_batches = N // batch_sz M = 300 K = 10 W1 = np.random.randn(D, M) / np.sqrt(D) b1 = np.zeros(M) W2 = np.random.randn(M, K) / np.sqrt(M) b2 = np.zeros(K) # save initial weights W1_0 = W1.copy() b1_0 = b1.copy() W2_0 = W2.copy() b2_0 = b2.copy() # 1. batch losses_batch = [] errors_batch = [] for i in range(max_iter): for j in range(n_batches): Xbatch = Xtrain[j*batch_sz:(j*batch_sz + batch_sz),] Ybatch = Ytrain_ind[j*batch_sz:(j*batch_sz + batch_sz),] pYbatch, Z = forward(Xbatch, W1, b1, W2, b2) # print "first batch cost:", cost(pYbatch, Ybatch) # updates W2 -= lr*(derivative_w2(Z, Ybatch, pYbatch) + reg*W2) b2 -= lr*(derivative_b2(Ybatch, pYbatch) + reg*b2) W1 -= lr*(derivative_w1(Xbatch, Z, Ybatch, pYbatch, W2) + reg*W1) b1 -= lr*(derivative_b1(Z, Ybatch, pYbatch, W2) + reg*b1) if j % print_period == 0: pY, _ = forward(Xtest, W1, b1, W2, b2) l = cost(pY, Ytest_ind) losses_batch.append(l) print("Cost at iteration i=%d, j=%d: %.6f" % (i, j, l)) e = error_rate(pY, Ytest) errors_batch.append(e) print("Error rate:", e) pY, _ = forward(Xtest, W1, b1, W2, b2) print("Final error rate:", error_rate(pY, Ytest)) # 2. batch with momentum W1 = W1_0.copy() b1 = b1_0.copy() W2 = W2_0.copy() b2 = b2_0.copy() losses_momentum = [] errors_momentum = [] mu = 0.9 dW2 = 0 db2 = 0 dW1 = 0 db1 = 0 for i in range(max_iter): for j in range(n_batches): Xbatch = Xtrain[j*batch_sz:(j*batch_sz + batch_sz),] Ybatch = Ytrain_ind[j*batch_sz:(j*batch_sz + batch_sz),] pYbatch, Z = forward(Xbatch, W1, b1, W2, b2) # gradients gW2 = derivative_w2(Z, Ybatch, pYbatch) + reg*W2 gb2 = derivative_b2(Ybatch, pYbatch) + reg*b2 gW1 = derivative_w1(Xbatch, Z, Ybatch, pYbatch, W2) + reg*W1 gb1 = derivative_b1(Z, Ybatch, pYbatch, W2) + reg*b1 # update velocities dW2 = mu*dW2 - lr*gW2 db2 = mu*db2 - lr*gb2 dW1 = mu*dW1 - lr*gW1 db1 = mu*db1 - lr*gb1 # updates W2 += dW2 b2 += db2 W1 += dW1 b1 += db1 if j % print_period == 0: pY, _ = forward(Xtest, W1, b1, W2, b2) l = cost(pY, Ytest_ind) losses_momentum.append(l) print("Cost at iteration i=%d, j=%d: %.6f" % (i, j, l)) e = error_rate(pY, Ytest) errors_momentum.append(e) print("Error rate:", e) pY, _ = forward(Xtest, W1, b1, W2, b2) print("Final error rate:", error_rate(pY, Ytest)) # 3. 
batch with Nesterov momentum W1 = W1_0.copy() b1 = b1_0.copy() W2 = W2_0.copy() b2 = b2_0.copy() losses_nesterov = [] errors_nesterov = [] mu = 0.9 vW2 = 0 vb2 = 0 vW1 = 0 vb1 = 0 for i in range(max_iter): for j in range(n_batches): Xbatch = Xtrain[j*batch_sz:(j*batch_sz + batch_sz),] Ybatch = Ytrain_ind[j*batch_sz:(j*batch_sz + batch_sz),] pYbatch, Z = forward(Xbatch, W1, b1, W2, b2) # updates gW2 = derivative_w2(Z, Ybatch, pYbatch) + reg*W2 gb2 = derivative_b2(Ybatch, pYbatch) + reg*b2 gW1 = derivative_w1(Xbatch, Z, Ybatch, pYbatch, W2) + reg*W1 gb1 = derivative_b1(Z, Ybatch, pYbatch, W2) + reg*b1 # v update vW2 = mu*vW2 - lr*gW2 vb2 = mu*vb2 - lr*gb2 vW1 = mu*vW1 - lr*gW1 vb1 = mu*vb1 - lr*gb1 # param update W2 += mu*vW2 - lr*gW2 b2 += mu*vb2 - lr*gb2 W1 += mu*vW1 - lr*gW1 b1 += mu*vb1 - lr*gb1 if j % print_period == 0: pY, _ = forward(Xtest, W1, b1, W2, b2) l = cost(pY, Ytest_ind) losses_nesterov.append(l) print("Cost at iteration i=%d, j=%d: %.6f" % (i, j, l)) e = error_rate(pY, Ytest) errors_nesterov.append(e) print("Error rate:", e) pY, _ = forward(Xtest, W1, b1, W2, b2) print("Final error rate:", error_rate(pY, Ytest)) plt.plot(losses_batch, label="batch") plt.plot(losses_momentum, label="momentum") plt.plot(losses_nesterov, label="nesterov") plt.legend() plt.show()
def fit(self, X, Y, Xvalid, Yvalid, learning_rate=1e-2, mu=0.99, decay=0.999, reg=1e-3, epochs=10, batch_sz=100, show_fig=False): K = len(set(Y)) #make a validation set X, Y = shuffle(X, Y) X = X.astype(np.float32) Y = y2indicator(Y).astype(np.float32) #for cauculating error rate Yvalid_flat = Yvalid Yvalid = y2indicator(Yvalid).astype(np.float32) #initialize hidden layers N, D = X.shape self.hidden_layers = [] M1 = D count = 0 for M2 in self.hidden_layer_sizes: h = HiddenLayer(M1, M2, count) self.hidden_layers.append(h) M1 = M2 count += 1 W, b = init_weight_and_bias(M1, K) self.W = tf.Variable(W.astype(np.float32)) self.b = tf.Variable(b.astype(np.float32)) #collect param for later use self.params = [self.W, self.b] for h in self.hidden_layers: self.params += h.params #set up function and variable tfX = tf.placeholder(tf.float32, shape=(None, D), name='X') tfT = tf.placeholder(tf.float32, shape=(None, K), name='T') act = self.forward(tfX) rcost = reg * sum([tf.nn.l2_loss(p) for p in self.params]) cost = tf.reduce_sum( tf.nn.softmax_cross_entropy_with_logits(logits=act, labels=tfT)) + rcost prediction = self.predict(tfX) train_op = tf.train.RMSPropOptimizer(learning_rate, decay=decay, momentum=mu).minimize(cost) n_batches = N // batch_sz costs = [] init = tf.global_variables_initializer() with tf.Session() as session: session.run(init) for i in range(epochs): X, Y = shuffle(X, Y) for j in range(n_batches): Xbatch = X[j * batch_sz:(j * batch_sz + batch_sz)] Ybatch = Y[j * batch_sz:(j * batch_sz + batch_sz)] session.run(train_op, feed_dict={tfX: Xbatch, tfT: Ybatch}) if j % 20 == 0: c = session.run(cost, feed_dict={ tfX: Xvalid, tfT: Yvalid }) costs.append(c) p = session.run(prediction, feed_dict={ tfX: Xvalid, tfT: Yvalid }) e = error_rate(Yvalid_flat, p) print("i:", i, "j:", j, "nb:", n_batches, "cost:", c, "error rate:", e) if show_fig: plt.plot(costs) plt.show()
def fit(self, X, Y, learning_rate=1e-2, mu=0.99, decay=0.999, reg=1e-3, epochs=10, batch_sz=100, show_fig=False):
    K = len(set(Y))  # won't work later b/c we turn Y into an indicator matrix

    # make a validation set
    X, Y = shuffle(X, Y)
    X = X.astype(np.float32)
    Y = y2indicator(Y).astype(np.float32)
    # Y = Y.astype(np.int32)
    Xvalid, Yvalid = X[-1000:], Y[-1000:]
    Yvalid_flat = np.argmax(Yvalid, axis=1)  # for calculating error rate
    X, Y = X[:-1000], Y[:-1000]

    # initialize hidden layers
    N, D = X.shape
    self.hidden_layers = []
    M1 = D
    count = 0
    for M2 in self.hidden_layer_sizes:
        h = HiddenLayer(M1, M2, count)
        self.hidden_layers.append(h)
        M1 = M2
        count += 1
    W, b = init_weight_and_bias(M1, K)
    self.W = tf.Variable(W.astype(np.float32))
    self.b = tf.Variable(b.astype(np.float32))

    # collect params for later use
    self.params = [self.W, self.b]
    for h in self.hidden_layers:
        self.params += h.params

    # set up tensorflow functions and variables
    tfX = tf.placeholder(tf.float32, shape=(None, D), name='X')
    tfT = tf.placeholder(tf.float32, shape=(None, K), name='T')
    act = self.forward(tfX)

    rcost = reg*sum([tf.nn.l2_loss(p) for p in self.params])
    cost = tf.reduce_mean(
        tf.nn.softmax_cross_entropy_with_logits(
            logits=act,
            labels=tfT
        )
    ) + rcost
    prediction = self.predict(tfX)
    train_op = tf.train.RMSPropOptimizer(learning_rate, decay=decay, momentum=mu).minimize(cost)

    n_batches = N // batch_sz
    costs = []
    init = tf.global_variables_initializer()
    with tf.Session() as session:
        session.run(init)
        for i in range(epochs):
            X, Y = shuffle(X, Y)
            for j in range(n_batches):
                Xbatch = X[j*batch_sz:(j*batch_sz + batch_sz)]
                Ybatch = Y[j*batch_sz:(j*batch_sz + batch_sz)]

                session.run(train_op, feed_dict={tfX: Xbatch, tfT: Ybatch})

                if j % 20 == 0:
                    c = session.run(cost, feed_dict={tfX: Xvalid, tfT: Yvalid})
                    costs.append(c)

                    p = session.run(prediction, feed_dict={tfX: Xvalid, tfT: Yvalid})
                    e = error_rate(Yvalid_flat, p)
                    print("i:", i, "j:", j, "nb:", n_batches, "cost:", c, "error rate:", e)

    if show_fig:
        plt.plot(costs)
        plt.show()
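Every script here converts integer class labels to a one-hot target matrix with y2indicator before training. The real helper lives in the project's util module; the following is only a hypothetical sketch of what it presumably does, included so the snippets above can be read on their own.

# Hypothetical sketch of the y2indicator helper used throughout (assumption:
# the project's util module implements something equivalent).
import numpy as np

def y2indicator(y):
    # one-hot encode integer labels: shape (N,) -> (N, K)
    N = len(y)
    K = len(set(y))
    ind = np.zeros((N, K))
    for i in range(N):
        ind[i, int(y[i])] = 1
    return ind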
def main():
    X, Y, _, _ = get_transformed_data()
    X = X[:, :300]

    # normalize X first
    mu = X.mean(axis=0)
    std = X.std(axis=0)
    X = (X - mu) / std

    print("Performing logistic regression...")
    Xtrain = X[:-1000,]
    Ytrain = Y[:-1000]
    Xtest = X[-1000:,]
    Ytest = Y[-1000:]

    N, D = Xtrain.shape
    Ytrain_ind = y2indicator(Ytrain)
    Ytest_ind = y2indicator(Ytest)

    # 1. full gradient descent
    W = np.random.randn(D, 10) / 28
    b = np.zeros(10)
    LL = []
    lr = 0.0001
    reg = 0.01
    t0 = datetime.now()
    for i in range(200):
        p_y = forward(Xtrain, W, b)

        W += lr*(gradW(Ytrain_ind, p_y, Xtrain) - reg*W)
        b += lr*(gradb(Ytrain_ind, p_y) - reg*b)

        p_y_test = forward(Xtest, W, b)
        ll = cost(p_y_test, Ytest_ind)
        LL.append(ll)
        if i % 10 == 0:
            err = error_rate(p_y_test, Ytest)
            print("Cost at iteration %d: %.6f" % (i, ll))
            print("Error rate:", err)
    p_y = forward(Xtest, W, b)
    print("Final error rate:", error_rate(p_y, Ytest))
    print("Elapsed time for full GD:", datetime.now() - t0)

    # 2. stochastic gradient descent
    W = np.random.randn(D, 10) / 28
    b = np.zeros(10)
    LL_stochastic = []
    lr = 0.0001
    reg = 0.01

    t0 = datetime.now()
    for i in range(1):  # one pass only, since we compute the test cost for every sample
        tmpX, tmpY = shuffle(Xtrain, Ytrain_ind)
        for n in range(min(N, 500)):  # shortcut so it won't take so long...
            x = tmpX[n,:].reshape(1, D)
            y = tmpY[n,:].reshape(1, 10)
            p_y = forward(x, W, b)

            W += lr*(gradW(y, p_y, x) - reg*W)
            b += lr*(gradb(y, p_y) - reg*b)

            p_y_test = forward(Xtest, W, b)
            ll = cost(p_y_test, Ytest_ind)
            LL_stochastic.append(ll)

            if n % (N // 2) == 0:
                err = error_rate(p_y_test, Ytest)
                print("Cost at iteration %d: %.6f" % (i, ll))
                print("Error rate:", err)
    p_y = forward(Xtest, W, b)
    print("Final error rate:", error_rate(p_y, Ytest))
    print("Elapsed time for SGD:", datetime.now() - t0)

    # 3. batch gradient descent
    W = np.random.randn(D, 10) / 28
    b = np.zeros(10)
    LL_batch = []
    lr = 0.0001
    reg = 0.01
    batch_sz = 500
    n_batches = N // batch_sz

    t0 = datetime.now()
    for i in range(50):
        tmpX, tmpY = shuffle(Xtrain, Ytrain_ind)
        for j in range(n_batches):
            x = tmpX[j*batch_sz:(j*batch_sz + batch_sz),:]
            y = tmpY[j*batch_sz:(j*batch_sz + batch_sz),:]
            p_y = forward(x, W, b)

            W += lr*(gradW(y, p_y, x) - reg*W)
            b += lr*(gradb(y, p_y) - reg*b)

            p_y_test = forward(Xtest, W, b)
            ll = cost(p_y_test, Ytest_ind)
            LL_batch.append(ll)
            if j % (n_batches // 2) == 0:
                err = error_rate(p_y_test, Ytest)
                print("Cost at iteration %d: %.6f" % (i, ll))
                print("Error rate:", err)
    p_y = forward(Xtest, W, b)
    print("Final error rate:", error_rate(p_y, Ytest))
    print("Elapsed time for batch GD:", datetime.now() - t0)

    x1 = np.linspace(0, 1, len(LL))
    plt.plot(x1, LL, label="full")
    x2 = np.linspace(0, 1, len(LL_stochastic))
    plt.plot(x2, LL_stochastic, label="stochastic")
    x3 = np.linspace(0, 1, len(LL_batch))
    plt.plot(x3, LL_batch, label="batch")
    plt.legend()
    plt.show()
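This logistic-regression benchmark relies on single-layer helpers (forward, gradW, gradb) that are defined elsewhere in the project; they differ from the two-layer forward(X, W1, b1, W2, b2) used in the momentum scripts. A minimal sketch of what they presumably look like, consistent with the sign convention above (the script does W += lr*(gradW(...) - reg*W), so gradW points uphill on the log-likelihood):

# Assumed single-layer softmax helpers (sketch only; the project's util module
# is the authoritative version).
def forward(X, W, b):
    a = X.dot(W) + b
    expa = np.exp(a - a.max(axis=1, keepdims=True))  # numerically stable softmax
    return expa / expa.sum(axis=1, keepdims=True)

def gradW(T, pY, X):
    # gradient of the log-likelihood w.r.t. W (ascent direction)
    return X.T.dot(T - pY)

def gradb(T, pY):
    return (T - pY).sum(axis=0)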
def main():
    # step 1: get the data and define all the usual variables
    X, Y = get_normalized_data()

    max_iter = 15
    print_period = 10

    lr = 0.00004
    reg = 0.01

    Xtrain = X[:-1000,]
    Ytrain = Y[:-1000]
    Xtest = X[-1000:,]
    Ytest = Y[-1000:]
    Ytrain_ind = y2indicator(Ytrain)
    Ytest_ind = y2indicator(Ytest)

    N, D = Xtrain.shape
    batch_sz = 500
    n_batches = N // batch_sz

    # add an extra layer just for fun
    M1 = 300
    M2 = 100
    K = 10
    W1_init = np.random.randn(D, M1) / 28
    b1_init = np.zeros(M1)
    W2_init = np.random.randn(M1, M2) / np.sqrt(M1)
    b2_init = np.zeros(M2)
    W3_init = np.random.randn(M2, K) / np.sqrt(M2)
    b3_init = np.zeros(K)

    # initialize variables and expressions
    X = tf.placeholder(tf.float32, shape=(None, D), name='X')
    T = tf.placeholder(tf.float32, shape=(None, K), name='Y')
    W1 = tf.Variable(W1_init.astype(np.float32))
    b1 = tf.Variable(b1_init.astype(np.float32))
    W2 = tf.Variable(W2_init.astype(np.float32))
    b2 = tf.Variable(b2_init.astype(np.float32))
    W3 = tf.Variable(W3_init.astype(np.float32))
    b3 = tf.Variable(b3_init.astype(np.float32))

    # define the model
    Z1 = tf.nn.relu(tf.matmul(X, W1) + b1)
    Z2 = tf.nn.relu(tf.matmul(Z1, W2) + b2)
    Yish = tf.matmul(Z2, W3) + b3  # the cost function does the softmaxing, so no softmax here

    cost = tf.reduce_sum(tf.nn.softmax_cross_entropy_with_logits(logits=Yish, labels=T))

    train_op = tf.train.RMSPropOptimizer(lr, decay=0.99, momentum=0.9).minimize(cost)

    predict_op = tf.argmax(Yish, 1)

    costs = []
    init = tf.global_variables_initializer()
    with tf.Session() as session:
        session.run(init)

        for i in range(max_iter):
            for j in range(n_batches):
                Xbatch = Xtrain[j*batch_sz:(j*batch_sz + batch_sz),]
                Ybatch = Ytrain_ind[j*batch_sz:(j*batch_sz + batch_sz),]

                session.run(train_op, feed_dict={X: Xbatch, T: Ybatch})
                if j % print_period == 0:
                    test_cost = session.run(cost, feed_dict={X: Xtest, T: Ytest_ind})
                    prediction = session.run(predict_op, feed_dict={X: Xtest})
                    err = error_rate(prediction, Ytest)
                    print("Cost / err at iteration i=%d, j=%d: %.3f / %.3f" % (i, j, test_cost, err))
                    costs.append(test_cost)

    plt.plot(costs)
    plt.show()
def main():
    # compare 3 scenarios:
    # 1. batch SGD
    # 2. batch SGD with momentum
    # 3. batch SGD with Nesterov momentum

    max_iter = 20  # make it 30 for sigmoid
    print_period = 10

    X, Y = get_normalized_data()
    lr = 0.00004
    reg = 0.01

    Xtrain = X[:-1000,]
    Ytrain = Y[:-1000]
    Xtest = X[-1000:,]
    Ytest = Y[-1000:]
    Ytrain_ind = y2indicator(Ytrain)
    Ytest_ind = y2indicator(Ytest)

    N, D = Xtrain.shape
    batch_sz = 500
    n_batches = N // batch_sz

    M = 300
    K = 10
    W1 = np.random.randn(D, M) / 28
    b1 = np.zeros(M)
    W2 = np.random.randn(M, K) / np.sqrt(M)
    b2 = np.zeros(K)

    # 1. batch SGD
    LL_batch = []
    CR_batch = []
    for i in range(max_iter):
        for j in range(n_batches):
            Xbatch = Xtrain[j*batch_sz:(j*batch_sz + batch_sz),]
            Ybatch = Ytrain_ind[j*batch_sz:(j*batch_sz + batch_sz),]
            pYbatch, Z = forward(Xbatch, W1, b1, W2, b2)
            # print("first batch cost:", cost(pYbatch, Ybatch))

            # updates
            W2 -= lr*(derivative_w2(Z, Ybatch, pYbatch) + reg*W2)
            b2 -= lr*(derivative_b2(Ybatch, pYbatch) + reg*b2)
            W1 -= lr*(derivative_w1(Xbatch, Z, Ybatch, pYbatch, W2) + reg*W1)
            b1 -= lr*(derivative_b1(Z, Ybatch, pYbatch, W2) + reg*b1)

            if j % print_period == 0:
                # calculate just for LL
                pY, _ = forward(Xtest, W1, b1, W2, b2)
                ll = cost(pY, Ytest_ind)
                LL_batch.append(ll)
                print("Cost at iteration i=%d, j=%d: %.6f" % (i, j, ll))

                err = error_rate(pY, Ytest)
                CR_batch.append(err)
                print("Error rate:", err)
    pY, _ = forward(Xtest, W1, b1, W2, b2)
    print("Final error rate:", error_rate(pY, Ytest))

    # 2. batch SGD with momentum
    # note: a fresh random draw, not the same starting weights as scenario 1
    W1 = np.random.randn(D, M) / 28
    b1 = np.zeros(M)
    W2 = np.random.randn(M, K) / np.sqrt(M)
    b2 = np.zeros(K)
    LL_momentum = []
    CR_momentum = []
    mu = 0.9
    dW2 = 0
    db2 = 0
    dW1 = 0
    db1 = 0
    for i in range(max_iter):
        for j in range(n_batches):
            Xbatch = Xtrain[j*batch_sz:(j*batch_sz + batch_sz),]
            Ybatch = Ytrain_ind[j*batch_sz:(j*batch_sz + batch_sz),]
            pYbatch, Z = forward(Xbatch, W1, b1, W2, b2)

            # updates
            dW2 = mu*dW2 - lr*(derivative_w2(Z, Ybatch, pYbatch) + reg*W2)
            W2 += dW2
            db2 = mu*db2 - lr*(derivative_b2(Ybatch, pYbatch) + reg*b2)
            b2 += db2
            dW1 = mu*dW1 - lr*(derivative_w1(Xbatch, Z, Ybatch, pYbatch, W2) + reg*W1)
            W1 += dW1
            db1 = mu*db1 - lr*(derivative_b1(Z, Ybatch, pYbatch, W2) + reg*b1)
            b1 += db1

            if j % print_period == 0:
                # calculate just for LL
                pY, _ = forward(Xtest, W1, b1, W2, b2)
                ll = cost(pY, Ytest_ind)
                LL_momentum.append(ll)
                print("Cost at iteration i=%d, j=%d: %.6f" % (i, j, ll))

                err = error_rate(pY, Ytest)
                CR_momentum.append(err)
                print("Error rate:", err)
    pY, _ = forward(Xtest, W1, b1, W2, b2)
    print("Final error rate:", error_rate(pY, Ytest))

    # 3. batch SGD with Nesterov momentum
    W1 = np.random.randn(D, M) / 28
    b1 = np.zeros(M)
    W2 = np.random.randn(M, K) / np.sqrt(M)
    b2 = np.zeros(K)
    LL_nest = []
    CR_nest = []
    mu = 0.9
    dW2 = 0
    db2 = 0
    dW1 = 0
    db1 = 0
    for i in range(max_iter):
        for j in range(n_batches):
            Xbatch = Xtrain[j*batch_sz:(j*batch_sz + batch_sz),]
            Ybatch = Ytrain_ind[j*batch_sz:(j*batch_sz + batch_sz),]
            pYbatch, Z = forward(Xbatch, W1, b1, W2, b2)

            # updates (Nesterov momentum written directly in terms of the previous step)
            dW2 = mu*mu*dW2 - (1 + mu)*lr*(derivative_w2(Z, Ybatch, pYbatch) + reg*W2)
            W2 += dW2
            db2 = mu*mu*db2 - (1 + mu)*lr*(derivative_b2(Ybatch, pYbatch) + reg*b2)
            b2 += db2
            dW1 = mu*mu*dW1 - (1 + mu)*lr*(derivative_w1(Xbatch, Z, Ybatch, pYbatch, W2) + reg*W1)
            W1 += dW1
            db1 = mu*mu*db1 - (1 + mu)*lr*(derivative_b1(Z, Ybatch, pYbatch, W2) + reg*b1)
            b1 += db1

            if j % print_period == 0:
                # calculate just for LL
                pY, _ = forward(Xtest, W1, b1, W2, b2)
                ll = cost(pY, Ytest_ind)
                LL_nest.append(ll)
                print("Cost at iteration i=%d, j=%d: %.6f" % (i, j, ll))

                err = error_rate(pY, Ytest)
                CR_nest.append(err)
                print("Error rate:", err)
    pY, _ = forward(Xtest, W1, b1, W2, b2)
    print("Final error rate:", error_rate(pY, Ytest))

    plt.plot(LL_batch, label="batch")
    plt.plot(LL_momentum, label="momentum")
    plt.plot(LL_nest, label="nesterov")
    plt.legend()
    plt.show()
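The mu*mu / (1 + mu) expressions in scenario 3 are the expanded form of the lookahead update used in the newer momentum script earlier in this section. With velocity v, gradient g, learning rate lr, and momentum mu, the algebra is:

    v_new   = mu*v - lr*g
    delta_w = mu*v_new - lr*g
            = mu*(mu*v - lr*g) - lr*g
            = mu*mu*v - (1 + mu)*lr*g

So W += mu*mu*dW - (1 + mu)*lr*g is the same step written directly in terms of the previous velocity. Note that this script keeps a single running term dW (the previous parameter increment) rather than tracking the velocity separately, so it is a close approximation to the exact two-variable form rather than a literal transcription of it.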
def fit(self, X, Y, Xvalid, Yvalid, lr=1e-2, mu=0.9, reg=1e-3, decay=0.99999, eps=1e-10, batch_sz=30, epochs=5, show_fig=True):
    lr = np.float32(lr)
    mu = np.float32(mu)
    reg = np.float32(reg)
    decay = np.float32(decay)
    eps = np.float32(eps)
    K = len(set(Y))

    # prepare the training data; the validation set is passed in
    X, Y = shuffle(X, Y)
    X = X.astype(np.float32)
    Y = y2indicator(Y).astype(np.float32)
    Yvalid = y2indicator(Yvalid).astype(np.float32)
    Yvalid_flat = np.argmax(Yvalid, axis=1)  # for calculating error rate

    # initialize convpool layers
    N, width, height, c = X.shape
    mi = c
    outw = width
    outh = height
    self.convpool_layers = []
    for mo, fw, fh in self.convpool_layer_sizes:
        layer = ConvPoolLayer(mi, mo, fw, fh)
        self.convpool_layers.append(layer)
        outw = outw // 2
        outh = outh // 2
        mi = mo

    # initialize mlp layers
    self.hidden_layers = []
    M1 = self.convpool_layer_sizes[-1][0]*outw*outh  # size must match the flattened output of the last convpool layer
    count = 0
    for M2 in self.hidden_layer_sizes:
        h = HiddenLayer(M1, M2, count)
        self.hidden_layers.append(h)
        M1 = M2
        count += 1

    # logistic regression layer
    W, b = init_weight_and_bias(M1, K)
    self.W = tf.Variable(W.astype(np.float32), name='W_logreg')
    self.b = tf.Variable(b.astype(np.float32), name='b_logreg')

    # collect params for later use
    self.params = [self.W, self.b]
    for h in self.convpool_layers:
        self.params += h.params
    for h in self.hidden_layers:
        self.params += h.params

    # set up tensorflow functions and variables
    tfX = tf.placeholder(tf.float32, shape=(None, width, height, c), name='X')
    tfY = tf.placeholder(tf.float32, shape=(None, K), name='Y')
    act = self.forward(tfX)

    rcost = reg*sum([tf.nn.l2_loss(p) for p in self.params])
    cost = tf.reduce_mean(
        tf.nn.softmax_cross_entropy_with_logits(
            logits=act,
            labels=tfY
        )
    ) + rcost
    prediction = self.predict(tfX)

    train_op = tf.train.RMSPropOptimizer(lr, decay=decay, momentum=mu).minimize(cost)

    n_batches = N // batch_sz
    costs = []
    init = tf.global_variables_initializer()
    with tf.Session() as session:
        session.run(init)
        for i in range(epochs):
            X, Y = shuffle(X, Y)
            for j in range(n_batches):
                Xbatch = X[j*batch_sz:(j*batch_sz + batch_sz)]
                Ybatch = Y[j*batch_sz:(j*batch_sz + batch_sz)]

                session.run(train_op, feed_dict={tfX: Xbatch, tfY: Ybatch})

                if j % 20 == 0:
                    c = session.run(cost, feed_dict={tfX: Xvalid, tfY: Yvalid})
                    costs.append(c)

                    p = session.run(prediction, feed_dict={tfX: Xvalid, tfY: Yvalid})
                    e = error_rate(Yvalid_flat, p)
                    print("i:", i, "j:", j, "nb:", n_batches, "cost:", c, "error rate:", e)

    if show_fig:
        plt.plot(costs)
        plt.show()
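Each ConvPoolLayer evidently halves the spatial dimensions (hence outw // 2 and outh // 2 per layer), and the first dense layer must accept the flattened feature maps of the last convpool layer. A purely illustrative calculation with made-up sizes (48x48 inputs and two layers of 20 feature maps are assumptions, not values from this script):

# Illustrative only: how M1 for the first hidden layer is derived.
width = height = 48
convpool_layer_sizes = [(20, 5, 5), (20, 5, 5)]  # (feature maps, filter w, filter h)
outw, outh = width, height
for mo, fw, fh in convpool_layer_sizes:
    outw //= 2  # each convpool layer halves the spatial dims
    outh //= 2
M1 = convpool_layer_sizes[-1][0] * outw * outh
print(M1)  # 20 * 12 * 12 = 2880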
def main():
    # step 1: get the data and define all the usual variables
    Xtrain, Xtest, Ytrain, Ytest = get_normalized_data()

    max_iter = 20
    print_period = 10

    lr = 0.0004
    reg = 0.01

    Xtrain = Xtrain.astype(np.float32)
    Ytrain = Ytrain.astype(np.float32)
    Xtest = Xtest.astype(np.float32)
    Ytest = Ytest.astype(np.float32)
    Ytrain_ind = y2indicator(Ytrain).astype(np.float32)
    Ytest_ind = y2indicator(Ytest).astype(np.float32)

    N, D = Xtrain.shape
    batch_sz = 500
    n_batches = N // batch_sz

    M = 300
    K = 10
    W1_init = np.random.randn(D, M) / 28
    b1_init = np.zeros(M)
    W2_init = np.random.randn(M, K) / np.sqrt(M)
    b2_init = np.zeros(K)

    # step 2: define theano variables and expressions
    thX = T.matrix('X')
    thT = T.matrix('T')
    W1 = theano.shared(W1_init, 'W1')
    b1 = theano.shared(b1_init, 'b1')
    W2 = theano.shared(W2_init, 'W2')
    b2 = theano.shared(b2_init, 'b2')

    # we can use the built-in theano functions to do relu and softmax
    thZ = relu(thX.dot(W1) + b1)  # relu is new in version 0.7.1; see the fallback definition below if yours doesn't have it
    thY = T.nnet.softmax(thZ.dot(W2) + b2)

    # define the cost function and prediction
    cost = -(thT * T.log(thY)).sum() + reg*((W1*W1).sum() + (b1*b1).sum() + (W2*W2).sum() + (b2*b2).sum())
    prediction = T.argmax(thY, axis=1)

    # step 3: training expressions and functions
    # we can just include regularization as part of the cost because it is also automatically differentiated!
    update_W1 = W1 - lr*T.grad(cost, W1)
    update_b1 = b1 - lr*T.grad(cost, b1)
    update_W2 = W2 - lr*T.grad(cost, W2)
    update_b2 = b2 - lr*T.grad(cost, b2)

    train = theano.function(
        inputs=[thX, thT],
        updates=[(W1, update_W1), (b1, update_b1), (W2, update_W2), (b2, update_b2)],
    )

    # create another function for this because we want it over the whole dataset
    get_prediction = theano.function(
        inputs=[thX, thT],
        outputs=[cost, prediction],
    )

    costs = []
    for i in range(max_iter):
        for j in range(n_batches):
            Xbatch = Xtrain[j*batch_sz:(j*batch_sz + batch_sz),]
            Ybatch = Ytrain_ind[j*batch_sz:(j*batch_sz + batch_sz),]

            train(Xbatch, Ybatch)
            if j % print_period == 0:
                cost_val, prediction_val = get_prediction(Xtest, Ytest_ind)
                err = error_rate(prediction_val, Ytest)
                print("Cost / err at iteration i=%d, j=%d: %.3f / %.3f" % (i, j, cost_val, err))
                costs.append(cost_val)

    plt.plot(costs)
    plt.show()
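The script assumes a relu helper is in scope. On Theano versions older than 0.7.1, where a built-in relu is not available, the usual workaround is a one-line elementwise definition like the following (a sketch; the project's own helper may differ):

# Fallback relu, assuming the built-in Theano relu is unavailable.
# Elementwise: returns a where a > 0, and 0 otherwise.
def relu(a):
    return a * (a > 0)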