def fit(self, X, Y, learning_rate=1e-7, reg=0., epochs=10000, show_fig=False):
    X, Y = shuffle(X, Y)
    Xvalid, Yvalid = X[-1000:], Y[-1000:]
    Tvalid = y2indicator(Yvalid)
    X, Y = X[:-1000], Y[:-1000]

    N, D = X.shape
    K = len(set(Y))
    T = y2indicator(Y)
    self.W = np.random.randn(D, K) / np.sqrt(D)
    self.b = np.zeros(K)

    costs = []
    best_validation_error = 1
    for i in range(epochs):
        # forward propagation and cost calculation
        pY = self.forward(X)

        # gradient descent step
        self.W -= learning_rate*(X.T.dot(pY - T) + reg*self.W)
        self.b -= learning_rate*((pY - T).sum(axis=0) + reg*self.b)

        if i % 10 == 0:
            pYvalid = self.forward(Xvalid)
            c = cost(Tvalid, pYvalid)
            costs.append(c)
            e = error_rate(Yvalid, np.argmax(pYvalid, axis=1))
            print("i:", i, "cost:", c, "error:", e)
            if e < best_validation_error:
                best_validation_error = e
    print("best_validation_error:", best_validation_error)

    if show_fig:
        plt.plot(costs)
        plt.show()
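The fit() method above relies on helpers (y2indicator, cost, error_rate) defined elsewhere in the repo. A minimal sketch of what they are assumed to compute, matching the call sites above (illustrative implementations, not the originals):

import numpy as np

def y2indicator(y):
    # one-hot encode integer class labels: (N,) -> (N, K)
    N, K = len(y), len(set(y))
    ind = np.zeros((N, K))
    ind[np.arange(N), y.astype(np.int32)] = 1
    return ind

def cost(T, pY):
    # total cross-entropy between one-hot targets T and softmax outputs pY
    return -(T * np.log(pY)).sum()

def error_rate(targets, predictions):
    # fraction of misclassified samples
    return np.mean(targets != predictions)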
forward_set = np.empty((18, 1))
data_sets = []
count = 0  # keep track of how many models have been trained
for k in range(0, X.shape[1]):
    best_mse = -1  # starting value for best mean squared error
    for p in range(k, X.shape[1]):  # start at k: columns before k are already in the forward set
        count += 1  # one more model trained
        temp = forward_set.copy()  # copy the forward model so the optimized set is not modified
        X_train = np.c_[temp, X[:, p]]  # concatenate the previous forward model with the new feature
        beta = util.calcBeta(X_train, y)  # calculate beta
        mse = util.cost(X_train, y, beta)  # calculate cost
        print(f"Training MSE: {mse}, Column: {p}")
        # save the best feature found so far
        if mse < best_mse or best_mse < 0:
            best_mse = mse          # assign new best mse
            best_index = p          # index of best column
            keep_feature = X[:, p]  # actual feature
    # add the winning column to the M+1 forward set
    forward_set = np.c_[forward_set, keep_feature]
    # move the chosen feature to index k to avoid a singular matrix when dotting and inverting
    X[:, [k, best_index]] = X[:, [best_index, k]]
    print(f"Lowest MSE for {k + 1} features: {best_mse}")
    data_sets.append(forward_set)  # save the best 18 x n sets to a list
def main():
    # mini-batch SGD with the Adam optimizer (bias-corrected first and second
    # moment estimates), with L2 regularization
    X, Y = get_normalized_data()

    Xtrain = X[:-1000, ]
    Ytrain = Y[:-1000]
    Xtest = X[-1000:, ]
    Ytest = Y[-1000:]
    N, D = Xtrain.shape

    Ytrain_ind = y2indicator(Ytrain)
    Ytest_ind = y2indicator(Ytest)

    M = 300
    K = 10

    ###################### IMPORTANT PARAMETER ####################
    t = 1  # Adam time step, used for bias correction; must start at 1
    ################################################################

    epochs = 20
    print_period = 10
    lr0 = 0.001
    reg = 0.01
    epsilon = 1e-8  # note: 1e-8 == 10**-8, which is NOT the same as 10e-8 (= 1e-7)
    beta1 = 0.9     # analogous to the momentum coefficient mu = 0.9
    beta2 = 0.999   # analogous to the RMSprop decay rate 0.999
    batch_size = 500
    number_batches = int(N // batch_size)

    W1 = np.random.randn(D, M) / 28
    b1 = np.zeros(M)
    W2 = np.random.randn(M, K) / np.sqrt(M)
    b2 = np.zeros(K)

    tr_costs_momentum = []
    errors_batch_momentum = []
    losses_test_momentum = []

    # first moment (m) and second moment (v) accumulators
    mW2 = 0
    mW1 = 0
    mb2 = 0
    mb1 = 0
    vW1 = 0
    vW2 = 0
    vb1 = 0
    vb2 = 0
    mW2_hat = 0
    mW1_hat = 0
    mb2_hat = 0
    mb1_hat = 0
    vW1_hat = 0
    vW2_hat = 0
    vb1_hat = 0
    vb2_hat = 0

    for epoch in range(epochs):
        for j in range(number_batches):
            xtr = Xtrain[j * batch_size:(j * batch_size + batch_size), :]
            ytr = Ytrain_ind[j * batch_size:(j * batch_size + batch_size), :]
            ytr_pred, z_tr = forward(xtr, W1, b1, W2, b2)

            # gradients
            gW2 = derivative_w2(z_tr, ytr, ytr_pred) + reg * W2
            gb2 = derivative_b2(ytr, ytr_pred) + reg * b2
            gW1 = derivative_w1(xtr, z_tr, ytr, ytr_pred, W2) + reg * W1
            gb1 = derivative_b1(z_tr, ytr, ytr_pred, W2) + reg * b1

            # update first moments (running mean of gradients)
            mW2 = beta1 * mW2 + (1 - beta1) * gW2
            mW1 = beta1 * mW1 + (1 - beta1) * gW1
            mb2 = beta1 * mb2 + (1 - beta1) * gb2
            mb1 = beta1 * mb1 + (1 - beta1) * gb1

            # update second moments (running mean of squared gradients)
            vW2 = beta2 * vW2 + (1 - beta2) * gW2 * gW2
            vb2 = beta2 * vb2 + (1 - beta2) * gb2 * gb2
            vW1 = beta2 * vW1 + (1 - beta2) * gW1 * gW1
            vb1 = beta2 * vb1 + (1 - beta2) * gb1 * gb1

            # bias correction
            correction1 = (1 - beta1**t)
            mW2_hat = mW2 / correction1
            mW1_hat = mW1 / correction1
            mb2_hat = mb2 / correction1
            mb1_hat = mb1 / correction1
            correction2 = (1 - beta2**t)
            vW2_hat = vW2 / correction2
            vW1_hat = vW1 / correction2
            vb2_hat = vb2 / correction2
            vb1_hat = vb1 / correction2

            # update the time step
            t += 1

            # apply the updates
            W2 -= lr0 * (mW2_hat / np.sqrt(vW2_hat + epsilon))
            W1 -= lr0 * (mW1_hat / np.sqrt(vW1_hat + epsilon))
            b2 -= lr0 * (mb2_hat / np.sqrt(vb2_hat + epsilon))
            b1 -= lr0 * (mb1_hat / np.sqrt(vb1_hat + epsilon))

            if j % print_period == 0:
                yte_pred, _ = forward(Xtest, W1, b1, W2, b2)
                l = cost(yte_pred, Ytest_ind)
                losses_test_momentum.append(l)
                print("test set cost at epoch=%d, j=%d: %.6f" % (epoch, j, l))
                e = error_rate(yte_pred, Ytest)
                errors_batch_momentum.append(e)
                print("Error rate:", e)
                ctr = cost(ytr_pred, ytr)
                print("training set cost:", ctr)
                tr_costs_momentum.append(ctr)

    pY, _ = forward(Xtest, W1, b1, W2, b2)
    # plt.plot(tr_costs_momentum, label='training cost (Adam)')
    plt.plot(losses_test_momentum, label='test cost (Adam)')
    print("Final error rate:", error_rate(pY, Ytest))
    plt.legend()
    plt.show()
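For reference, the Adam step applied to each of W1, b1, W2 and b2 above follows one rule per parameter; a compact sketch of that rule (adam_update is an illustrative helper name, not part of the original file):

import numpy as np

def adam_update(param, grad, m, v, t, lr=0.001, beta1=0.9, beta2=0.999, eps=1e-8):
    m = beta1 * m + (1 - beta1) * grad           # first moment estimate
    v = beta2 * v + (1 - beta2) * grad * grad    # second moment estimate
    m_hat = m / (1 - beta1 ** t)                 # bias correction
    v_hat = v / (1 - beta2 ** t)
    # eps is added inside the sqrt here, mirroring the loop above
    param = param - lr * m_hat / np.sqrt(v_hat + eps)
    return param, m, v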
def main(): max_iter = 20 print_period = 20 Xtrain, Xtest, Ytrain, Ytest = get_normalized_data() Ytrain_ind = y2indicator(Ytrain) # Target of train data Ytest_ind = y2indicator(Ytest) # Target of test data lr = 0.00004 reg = 0.01 N, D = Xtrain.shape M = 300 K = 10 np.random.seed(123) W1 = np.random.randn(D, M) / np.sqrt(D) b1 = np.zeros(M) W2 = np.random.randn(M, K) / np.sqrt(M) b2 = np.zeros(K) batch_sz = 500 n_batches = N // batch_sz # 82 # save initial weights W1_0 = W1.copy() b1_0 = b1.copy() W2_0 = W2.copy() b2_0 = b2.copy() # 1. learning rate = constant losses_batch = [] errors_batch = [] for i in range(max_iter): # Xtrain, Ytrain_ind = shuffle(Xtrain, Ytrain_ind) for j in range(n_batches): Xbatch = Xtrain[j*batch_sz:(j*batch_sz + batch_sz),] Ybatch = Ytrain_ind[j*batch_sz:(j*batch_sz + batch_sz),] # Target of each batch pYbatch, Z = forward(Xbatch, W1, b1, W2, b2) # print "first batch cost:", cost(pYbatch, Ybatch) # updates W2 -= lr*(derivative_w2(Z, Ybatch, pYbatch) + reg*W2) b2 -= lr*(derivative_b2(Ybatch, pYbatch) + reg*b2) W1 -= lr*(derivative_w1(Xbatch, Z, Ybatch, pYbatch, W2) + reg*W1) b1 -= lr*(derivative_b1(Z, Ybatch, pYbatch, W2) + reg*b1) if j % print_period == 0: pY, _ = forward(Xtest, W1, b1, W2, b2) l = cost(pY, Ytest_ind) losses_batch.append(l) print("Cost at iteration i=%d, j=%d: %.6f" % (i, j, l)) e = error_rate(pY, Ytest) errors_batch.append(e) print("Error rate:", e) pY, _ = forward(Xtest, W1, b1, W2, b2) print("Final error rate:", error_rate(pY, Ytest)) # 2. RMSprop W1 = W1_0.copy() b1 = b1_0.copy() W2 = W2_0.copy() b2 = b2_0.copy() losses_rms = [] errors_rms = [] ''' in RMSprop you can use a bigger lr, but if you set this too high you'll get NaN! if you use the same learning rate within RMSprop and General method, there is only slight difference between them. 
    '''
    lr0 = 0.001
    cache_W2 = 1
    cache_b2 = 1
    cache_W1 = 1
    cache_b1 = 1
    decay_rate = 0.999
    eps = 1e-10
    for i in range(max_iter):
        # Xtrain, Ytrain_ind = shuffle(Xtrain, Ytrain_ind)
        for j in range(n_batches):
            Xbatch = Xtrain[j*batch_sz:(j*batch_sz + batch_sz),]
            Ybatch = Ytrain_ind[j*batch_sz:(j*batch_sz + batch_sz),]  # Target of each batch
            pYbatch, Z = forward(Xbatch, W1, b1, W2, b2)
            # print("first batch cost:", cost(pYbatch, Ybatch))

            # # update
            # cache_W2 = decay_rate*cache_W2 + (1 - decay_rate)*np.square(derivative_w2(Z, Ybatch, pYbatch) + reg*W2)
            # W2 -= lr0 / (np.sqrt(cache_W2) + eps) * (derivative_w2(Z, Ybatch, pYbatch) + reg*W2)
            # cache_b2 = decay_rate*cache_b2 + (1 - decay_rate)*np.square(derivative_b2(Ybatch, pYbatch) + reg*b2)
            # b2 -= lr0 / (np.sqrt(cache_b2) + eps) * (derivative_b2(Ybatch, pYbatch) + reg*b2)
            # cache_W1 = decay_rate*cache_W1 + (1 - decay_rate)*np.square(derivative_w1(Xbatch, Z, Ybatch, pYbatch, W2))
            # W1 -= lr0 / (np.sqrt(cache_W1) + eps) * (derivative_w1(Xbatch, Z, Ybatch, pYbatch, W2) + reg*W1)
            # cache_b1 = decay_rate*cache_b1 + (1 - decay_rate)*np.square(derivative_b1(Z, Ybatch, pYbatch, W2) + reg*b1)
            # b1 -= lr0 / (np.sqrt(cache_b1) + eps) * (derivative_b1(Z, Ybatch, pYbatch, W2) + reg*b1)

            # updates
            # A smarter way is to factor out the sub-expressions above that get computed repeatedly,
            # assign each to a variable, and compute it only once -- this speeds things up.
            gW2 = derivative_w2(Z, Ybatch, pYbatch) + reg*W2
            cache_W2 = decay_rate*cache_W2 + (1 - decay_rate)*gW2*gW2
            W2 -= lr0 * gW2 / (np.sqrt(cache_W2) + eps)

            gb2 = derivative_b2(Ybatch, pYbatch) + reg*b2
            cache_b2 = decay_rate*cache_b2 + (1 - decay_rate)*gb2*gb2
            b2 -= lr0 * gb2 / (np.sqrt(cache_b2) + eps)

            gW1 = derivative_w1(Xbatch, Z, Ybatch, pYbatch, W2) + reg*W1
            cache_W1 = decay_rate*cache_W1 + (1 - decay_rate)*gW1*gW1
            W1 -= lr0 * gW1 / (np.sqrt(cache_W1) + eps)

            gb1 = derivative_b1(Z, Ybatch, pYbatch, W2) + reg*b1
            cache_b1 = decay_rate*cache_b1 + (1 - decay_rate)*gb1*gb1
            b1 -= lr0 * gb1 / (np.sqrt(cache_b1) + eps)

            if j % print_period == 0:
                pY, _ = forward(Xtest, W1, b1, W2, b2)
                l = cost(pY, Ytest_ind)
                losses_rms.append(l)
                print("Cost at iteration i=%d, j=%d: %.6f" % (i, j, l))
                e = error_rate(pY, Ytest)
                errors_rms.append(e)
                print("Error rate:", e)

    pY, _ = forward(Xtest, W1, b1, W2, b2)
    print("Final error rate:", error_rate(pY, Ytest))

    plt.plot(losses_batch, label='constant')
    plt.plot(losses_rms, label='RMSprop')
    plt.legend()
    plt.show()
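Each of the four RMSprop updates above repeats the same cache-and-scale pattern; a minimal sketch of that pattern for a single parameter (rmsprop_update is an illustrative name):

import numpy as np

def rmsprop_update(param, grad, cache, lr=0.001, decay_rate=0.999, eps=1e-10):
    # exponentially decaying average of the squared gradient
    cache = decay_rate * cache + (1 - decay_rate) * grad * grad
    # scale the step element-wise by the root of the cache
    param = param - lr * grad / (np.sqrt(cache) + eps)
    return param, cache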
def main(): X, Y, _, _ = get_transformed_data() X = X[:, :300] # normalize X first mu = X.mean(axis=0) std = X.std(axis=0) X = (X - mu) / std print "Performing logistic regression..." Xtrain = X[:-1000,] Ytrain = Y[:-1000] Xtest = X[-1000:,] Ytest = Y[-1000:] N, D = Xtrain.shape Ytrain_ind = y2indicator(Ytrain) Ytest_ind = y2indicator(Ytest) # 1. full W = np.random.randn(D, 10) / 28 b = np.zeros(10) LL = [] lr = 0.0001 reg = 0.01 t0 = datetime.now() for i in xrange(200): p_y = forward(Xtrain, W, b) W += lr*(gradW(Ytrain_ind, p_y, Xtrain) - reg*W) b += lr*(gradb(Ytrain_ind, p_y) - reg*b) p_y_test = forward(Xtest, W, b) ll = cost(p_y_test, Ytest_ind) LL.append(ll) if i % 10 == 0: err = error_rate(p_y_test, Ytest) print "Cost at iteration %d: %.6f" % (i, ll) print "Error rate:", err p_y = forward(Xtest, W, b) print "Final error rate:", error_rate(p_y, Ytest) print "Elapsted time for full GD:", datetime.now() - t0 # 2. stochastic W = np.random.randn(D, 10) / 28 b = np.zeros(10) LL_stochastic = [] lr = 0.0001 reg = 0.01 t0 = datetime.now() for i in xrange(1): # takes very long since we're computing cost for 41k samples tmpX, tmpY = shuffle(Xtrain, Ytrain_ind) for n in xrange(min(N, 500)): # shortcut so it won't take so long... x = tmpX[n,:].reshape(1,D) y = tmpY[n,:].reshape(1,10) p_y = forward(x, W, b) W += lr*(gradW(y, p_y, x) - reg*W) b += lr*(gradb(y, p_y) - reg*b) p_y_test = forward(Xtest, W, b) ll = cost(p_y_test, Ytest_ind) LL_stochastic.append(ll) if n % (N/2) == 0: err = error_rate(p_y_test, Ytest) print "Cost at iteration %d: %.6f" % (i, ll) print "Error rate:", err p_y = forward(Xtest, W, b) print "Final error rate:", error_rate(p_y, Ytest) print "Elapsted time for SGD:", datetime.now() - t0 # 3. batch W = np.random.randn(D, 10) / 28 b = np.zeros(10) LL_batch = [] lr = 0.0001 reg = 0.01 batch_sz = 500 n_batches = N / batch_sz t0 = datetime.now() for i in xrange(50): tmpX, tmpY = shuffle(Xtrain, Ytrain_ind) for j in xrange(n_batches): x = tmpX[j*batch_sz:(j*batch_sz + batch_sz),:] y = tmpY[j*batch_sz:(j*batch_sz + batch_sz),:] p_y = forward(x, W, b) W += lr*(gradW(y, p_y, x) - reg*W) b += lr*(gradb(y, p_y) - reg*b) p_y_test = forward(Xtest, W, b) ll = cost(p_y_test, Ytest_ind) LL_batch.append(ll) if j % (n_batches/2) == 0: err = error_rate(p_y_test, Ytest) print "Cost at iteration %d: %.6f" % (i, ll) print "Error rate:", err p_y = forward(Xtest, W, b) print "Final error rate:", error_rate(p_y, Ytest) print "Elapsted time for batch GD:", datetime.now() - t0 x1 = np.linspace(0, 1, len(LL)) plt.plot(x1, LL, label="full") x2 = np.linspace(0, 1, len(LL_stochastic)) plt.plot(x2, LL_stochastic, label="stochastic") x3 = np.linspace(0, 1, len(LL_batch)) plt.plot(x3, LL_batch, label="batch") plt.legend() plt.show()
def main(): # compare 3 scenarios: # 1. batch SGD # 2. batch SGD with momentum # 3. batch SGD with Nesterov momentum max_iter = 20 # make it 30 for sigmoid print_period = 50 Xtrain, Xtest, Ytrain, Ytest = get_normalized_data() lr = 0.00004 reg = 0.01 Ytrain_ind = y2indicator(Ytrain) Ytest_ind = y2indicator(Ytest) N, D = Xtrain.shape batch_sz = 500 n_batches = N // batch_sz M = 300 K = 10 W1 = np.random.randn(D, M) / np.sqrt(D) b1 = np.zeros(M) W2 = np.random.randn(M, K) / np.sqrt(M) b2 = np.zeros(K) # save initial weights W1_0 = W1.copy() b1_0 = b1.copy() W2_0 = W2.copy() b2_0 = b2.copy() # 1. batch losses_batch = [] errors_batch = [] for i in range(max_iter): for j in range(n_batches): Xbatch = Xtrain[j*batch_sz:(j*batch_sz + batch_sz),] Ybatch = Ytrain_ind[j*batch_sz:(j*batch_sz + batch_sz),] pYbatch, Z = forward(Xbatch, W1, b1, W2, b2) # print "first batch cost:", cost(pYbatch, Ybatch) # gradients gW2 = derivative_w2(Z, Ybatch, pYbatch) + reg*W2 gb2 = derivative_b2(Ybatch, pYbatch) + reg*b2 gW1 = derivative_w1(Xbatch, Z, Ybatch, pYbatch, W2) + reg*W1 gb1 = derivative_b1(Z, Ybatch, pYbatch, W2) + reg*b1 # updates W2 -= lr*gW2 b2 -= lr*gb2 W1 -= lr*gW1 b1 -= lr*gb1 if j % print_period == 0: pY, _ = forward(Xtest, W1, b1, W2, b2) l = cost(pY, Ytest_ind) losses_batch.append(l) print("Cost at iteration i=%d, j=%d: %.6f" % (i, j, l)) e = error_rate(pY, Ytest) errors_batch.append(e) print("Error rate:", e) pY, _ = forward(Xtest, W1, b1, W2, b2) print("Final error rate:", error_rate(pY, Ytest)) # 2. batch with momentum W1 = W1_0.copy() b1 = b1_0.copy() W2 = W2_0.copy() b2 = b2_0.copy() losses_momentum = [] errors_momentum = [] mu = 0.9 dW2 = 0 db2 = 0 dW1 = 0 db1 = 0 for i in range(max_iter): for j in range(n_batches): Xbatch = Xtrain[j*batch_sz:(j*batch_sz + batch_sz),] Ybatch = Ytrain_ind[j*batch_sz:(j*batch_sz + batch_sz),] pYbatch, Z = forward(Xbatch, W1, b1, W2, b2) # gradients gW2 = derivative_w2(Z, Ybatch, pYbatch) + reg*W2 gb2 = derivative_b2(Ybatch, pYbatch) + reg*b2 gW1 = derivative_w1(Xbatch, Z, Ybatch, pYbatch, W2) + reg*W1 gb1 = derivative_b1(Z, Ybatch, pYbatch, W2) + reg*b1 # update velocities dW2 = mu*dW2 - lr*gW2 db2 = mu*db2 - lr*gb2 dW1 = mu*dW1 - lr*gW1 db1 = mu*db1 - lr*gb1 # updates W2 += dW2 b2 += db2 W1 += dW1 b1 += db1 if j % print_period == 0: pY, _ = forward(Xtest, W1, b1, W2, b2) l = cost(pY, Ytest_ind) losses_momentum.append(l) print("Cost at iteration i=%d, j=%d: %.6f" % (i, j, l)) e = error_rate(pY, Ytest) errors_momentum.append(e) print("Error rate:", e) pY, _ = forward(Xtest, W1, b1, W2, b2) print("Final error rate:", error_rate(pY, Ytest)) # 3. 
batch with Nesterov momentum W1 = W1_0.copy() b1 = b1_0.copy() W2 = W2_0.copy() b2 = b2_0.copy() losses_nesterov = [] errors_nesterov = [] mu = 0.9 vW2 = 0 vb2 = 0 vW1 = 0 vb1 = 0 for i in range(max_iter): for j in range(n_batches): Xbatch = Xtrain[j*batch_sz:(j*batch_sz + batch_sz),] Ybatch = Ytrain_ind[j*batch_sz:(j*batch_sz + batch_sz),] pYbatch, Z = forward(Xbatch, W1, b1, W2, b2) # updates gW2 = derivative_w2(Z, Ybatch, pYbatch) + reg*W2 gb2 = derivative_b2(Ybatch, pYbatch) + reg*b2 gW1 = derivative_w1(Xbatch, Z, Ybatch, pYbatch, W2) + reg*W1 gb1 = derivative_b1(Z, Ybatch, pYbatch, W2) + reg*b1 # v update vW2 = mu*vW2 - lr*gW2 vb2 = mu*vb2 - lr*gb2 vW1 = mu*vW1 - lr*gW1 vb1 = mu*vb1 - lr*gb1 # param update W2 += mu*vW2 - lr*gW2 b2 += mu*vb2 - lr*gb2 W1 += mu*vW1 - lr*gW1 b1 += mu*vb1 - lr*gb1 if j % print_period == 0: pY, _ = forward(Xtest, W1, b1, W2, b2) l = cost(pY, Ytest_ind) losses_nesterov.append(l) print("Cost at iteration i=%d, j=%d: %.6f" % (i, j, l)) e = error_rate(pY, Ytest) errors_nesterov.append(e) print("Error rate:", e) pY, _ = forward(Xtest, W1, b1, W2, b2) print("Final error rate:", error_rate(pY, Ytest)) plt.plot(losses_batch, label="batch") plt.plot(losses_momentum, label="momentum") plt.plot(losses_nesterov, label="nesterov") plt.legend() plt.show()
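The only difference between scenarios 2 and 3 above is how the velocity enters the parameter update; a side-by-side sketch for a single parameter (helper names are illustrative, not part of the original file):

def momentum_step(param, grad, v, lr=0.00004, mu=0.9):
    # classical momentum: accumulate velocity, then step by it
    v = mu * v - lr * grad
    return param + v, v

def nesterov_step(param, grad, v, lr=0.00004, mu=0.9):
    # Nesterov momentum (approximation used above): update v, then step by mu*v - lr*grad
    v = mu * v - lr * grad
    return param + mu * v - lr * grad, v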
def main(): max_iter = 10 print_period = 10 X_train, X_test, Y_train, Y_test = get_normalized_data() reg = 0.01 Y_train_ind = y2indicator(Y_train) Y_test_ind = y2indicator(Y_test) N, D = X_train.shape batch_sz = 500 n_batches = N // batch_sz M = 300 K = 10 W1_0 = np.random.randn(D, M) / np.sqrt(D) b1_0 = np.zeros(M) W2_0 = np.random.randn(M, K) / np.sqrt(K) b2_0 = np.zeros(K) # .1 Adam W1 = W1_0.copy() W2 = W2_0.copy() b1 = b1_0.copy() b2 = b2_0.copy() losses_adam = [] errors_adam = [] # 1st moment mW1 = 0 mW2 = 0 mb1 = 0 mb2 = 0 # 2nd moment vW1 = 0 vW2 = 0 vb1 = 0 vb2 = 0 # Hyperparams eps = 1e-8 lr = 0.001 beta1 = 0.9 beta2 = 0.999 t = 1 for i in range(max_iter): for j in range(n_batches): Xbatch = X_train[j * batch_sz:(j * batch_sz + batch_sz), ] Ybatch = Y_train_ind[j * batch_sz:(j * batch_sz + batch_sz), ] pYbatch, Z = forward(Xbatch, W1, b1, W2, b2) # updates # gradients gW2 = derivative_w2(Z, Ybatch, pYbatch) + reg * W2 gb2 = derivative_b2(Ybatch, pYbatch) + reg * b2 gW1 = derivative_w1(Xbatch, Z, Ybatch, pYbatch, W2) + reg * W1 gb1 = derivative_b1(Z, Ybatch, pYbatch, W2) + reg * b1 # new m mW1 = beta1 * mW1 + (1 - beta1) * gW1 mb1 = beta1 * mb1 + (1 - beta1) * gb1 mW2 = beta1 * mW2 + (1 - beta1) * gW2 mb2 = beta1 * mb2 + (1 - beta1) * gb2 # new v vW2 = beta2 * vW2 + (1 - beta2) * gW2 * gW2 vb2 = beta2 * vb2 + (1 - beta2) * gb2 * gb2 vW1 = beta2 * vW1 + (1 - beta2) * gW1 * gW1 vb1 = beta2 * vb1 + (1 - beta2) * gb1 * gb1 # bias correction correction1 = 1 - beta1 ** t mW1_hat = mW1 / correction1 mb1_hat = mb1 / correction1 mW2_hat = mW2 / correction1 mb2_hat = mb2 / correction1 # correction2 = 1 - beta2 ** t vb2_hat = vb2 / correction2 vb1_hat = vb1 / correction2 vW2_hat = vW2 / correction2 vW1_hat = vW1 / correction2 t += 1 # weights W1 = W1 - lr * mW1_hat / np.sqrt(vW1_hat + eps) b1 = b1 - lr * mb1_hat / np.sqrt(vb1_hat + eps) W2 = W2 - lr * mW2_hat / np.sqrt(vW2_hat + eps) b2 = b2 - lr * mb2_hat / np.sqrt(vb2_hat + eps) if j % print_period == 0: pY, _ = forward(X_test, W1, b1, W2, b2) l = cost(pY, Y_test_ind) losses_adam.append(l) print(f'Adam Cost at iteration i={i}, j={j} : {l}') e = error_rate(pY, Y_test) errors_adam.append(e) print("error_rate", e) pY, _ = forward(X_test, W1, b1, W2, b2) adam_error = error_rate(pY, Y_test) # 3. 
RMSProp with momentum W1 = W1_0.copy() b1 = b1_0.copy() W2 = W2_0.copy() b2 = b2_0.copy() losses_rms = [] errors_rms = [] # comparable hyper parameters for fair lr0 = 0.001 mu = 0.9 decay_rate = 0.999 eps = 1e-8 # rmsprop cache cache_W2 = 1 cache_b2 = 1 cache_W1 = 1 cache_b1 = 1 # momentum dW1 = 0 db1 = 0 dW2 = 0 db2 = 0 for i in range(max_iter): for j in range(n_batches): Xbatch = X_train[j * batch_sz:(j * batch_sz + batch_sz), ] Ybatch = Y_train_ind[j * batch_sz:(j * batch_sz + batch_sz), ] pYbatch, Z = forward(Xbatch, W1, b1, W2, b2) # updates gW2 = derivative_w2(Z, Ybatch, pYbatch) + reg * W2 cache_W2 = decay_rate * cache_W2 + (1 - decay_rate) * gW2 * gW2 dW2 = mu * dW2 + (1 - mu) * lr0 * gW2 / (np.sqrt(cache_W2) + eps) W2 -= dW2 gb2 = derivative_b2(Ybatch, pYbatch) + reg * b2 cache_b2 = decay_rate * cache_b2 + (1 - decay_rate) * gb2 * gb2 db2 = mu * db2 + (1 - mu) * lr0 * gb2 / (np.sqrt(cache_b2) + eps) b2 -= db2 gW1 = derivative_w1(Xbatch, Z, Ybatch, pYbatch, W2) + reg * W1 cache_W1 = decay_rate * cache_W1 + (1 - decay_rate) * gW1 * gW1 dW1 = mu * dW1 + (1 - mu) * lr0 * gW1 / (np.sqrt(cache_W1) + eps) W1 -= dW1 gb1 = derivative_b1(Z, Ybatch, pYbatch, W2) + reg * b1 cache_b1 = decay_rate * cache_b1 + (1 - decay_rate) * gb1 * gb1 db1 = mu * db1 + (1 - mu) * lr0 * gb1 / (np.sqrt(cache_b1) + eps) b1 -= db1 if j % print_period == 0: pY, _ = forward(X_test, W1, b1, W2, b2) l = cost(pY, Y_test_ind) losses_rms.append(l) print(f'Cost at iteration i={i}, j={j} : {l}') err = error_rate(pY, Y_test) errors_rms.append(err) print("Error rate:", err) pY, _ = forward(X_test, W1, b1, W2, b2) rms_error = error_rate(pY, Y_test) print(f"Final RMSProp error rate: {rms_error}") print(f"Final Adam error rate: {adam_error}") plt.plot(losses_adam, label='batch cost') plt.plot(losses_rms, label='RMSProp cost') plt.legend() plt.show()
def main(): # compare 3 scenarios: # 1. batch SGD # 2. batch SGD with momentum # 3. batch SGD with Nesterov momentum max_iter = 20 # make it 30 for sigmoid print_period = 10 X, Y = get_normalized_data() lr = 0.00004 reg = 0.01 Xtrain = X[:-1000,] Ytrain = Y[:-1000] Xtest = X[-1000:,] Ytest = Y[-1000:] Ytrain_ind = y2indicator(Ytrain) Ytest_ind = y2indicator(Ytest) N, D = Xtrain.shape batch_sz = 500 n_batches = N / batch_sz M = 300 K = 10 W1 = np.random.randn(D, M) / 28 b1 = np.zeros(M) W2 = np.random.randn(M, K) / np.sqrt(M) b2 = np.zeros(K) # 1. batch # cost = -16 LL_batch = [] CR_batch = [] for i in xrange(max_iter): for j in xrange(n_batches): Xbatch = Xtrain[j*batch_sz:(j*batch_sz + batch_sz),] Ybatch = Ytrain_ind[j*batch_sz:(j*batch_sz + batch_sz),] pYbatch, Z = forward(Xbatch, W1, b1, W2, b2) # print "first batch cost:", cost(pYbatch, Ybatch) # updates W2 -= lr*(derivative_w2(Z, Ybatch, pYbatch) + reg*W2) b2 -= lr*(derivative_b2(Ybatch, pYbatch) + reg*b2) W1 -= lr*(derivative_w1(Xbatch, Z, Ybatch, pYbatch, W2) + reg*W1) b1 -= lr*(derivative_b1(Z, Ybatch, pYbatch, W2) + reg*b1) if j % print_period == 0: # calculate just for LL pY, _ = forward(Xtest, W1, b1, W2, b2) # print "pY:", pY ll = cost(pY, Ytest_ind) LL_batch.append(ll) print "Cost at iteration i=%d, j=%d: %.6f" % (i, j, ll) err = error_rate(pY, Ytest) CR_batch.append(err) print "Error rate:", err pY, _ = forward(Xtest, W1, b1, W2, b2) print "Final error rate:", error_rate(pY, Ytest) # 2. batch with momentum W1 = np.random.randn(D, M) / 28 b1 = np.zeros(M) W2 = np.random.randn(M, K) / np.sqrt(M) b2 = np.zeros(K) LL_momentum = [] CR_momentum = [] mu = 0.9 dW2 = 0 db2 = 0 dW1 = 0 db1 = 0 for i in xrange(max_iter): for j in xrange(n_batches): Xbatch = Xtrain[j*batch_sz:(j*batch_sz + batch_sz),] Ybatch = Ytrain_ind[j*batch_sz:(j*batch_sz + batch_sz),] pYbatch, Z = forward(Xbatch, W1, b1, W2, b2) # updates dW2 = mu*dW2 - lr*(derivative_w2(Z, Ybatch, pYbatch) + reg*W2) W2 += dW2 db2 = mu*db2 - lr*(derivative_b2(Ybatch, pYbatch) + reg*b2) b2 += db2 dW1 = mu*dW1 - lr*(derivative_w1(Xbatch, Z, Ybatch, pYbatch, W2) + reg*W1) W1 += dW1 db1 = mu*db1 - lr*(derivative_b1(Z, Ybatch, pYbatch, W2) + reg*b1) b1 += db1 if j % print_period == 0: # calculate just for LL pY, _ = forward(Xtest, W1, b1, W2, b2) # print "pY:", pY ll = cost(pY, Ytest_ind) LL_momentum.append(ll) print "Cost at iteration i=%d, j=%d: %.6f" % (i, j, ll) err = error_rate(pY, Ytest) CR_momentum.append(err) print "Error rate:", err pY, _ = forward(Xtest, W1, b1, W2, b2) print "Final error rate:", error_rate(pY, Ytest) # 3. 
batch with Nesterov momentum W1 = np.random.randn(D, M) / 28 b1 = np.zeros(M) W2 = np.random.randn(M, K) / np.sqrt(M) b2 = np.zeros(K) LL_nest = [] CR_nest = [] mu = 0.9 dW2 = 0 db2 = 0 dW1 = 0 db1 = 0 for i in xrange(max_iter): for j in xrange(n_batches): Xbatch = Xtrain[j*batch_sz:(j*batch_sz + batch_sz),] Ybatch = Ytrain_ind[j*batch_sz:(j*batch_sz + batch_sz),] pYbatch, Z = forward(Xbatch, W1, b1, W2, b2) # updates dW2 = mu*mu*dW2 - (1 + mu)*lr*(derivative_w2(Z, Ybatch, pYbatch) + reg*W2) W2 += dW2 db2 = mu*mu*db2 - (1 + mu)*lr*(derivative_b2(Ybatch, pYbatch) + reg*b2) b2 += db2 dW1 = mu*mu*dW1 - (1 + mu)*lr*(derivative_w1(Xbatch, Z, Ybatch, pYbatch, W2) + reg*W1) W1 += dW1 db1 = mu*mu*db1 - (1 + mu)*lr*(derivative_b1(Z, Ybatch, pYbatch, W2) + reg*b1) b1 += db1 if j % print_period == 0: # calculate just for LL pY, _ = forward(Xtest, W1, b1, W2, b2) # print "pY:", pY ll = cost(pY, Ytest_ind) LL_nest.append(ll) print "Cost at iteration i=%d, j=%d: %.6f" % (i, j, ll) err = error_rate(pY, Ytest) CR_nest.append(err) print "Error rate:", err pY, _ = forward(Xtest, W1, b1, W2, b2) print "Final error rate:", error_rate(pY, Ytest) plt.plot(LL_batch, label="batch") plt.plot(LL_momentum, label="momentum") plt.plot(LL_nest, label="nesterov") plt.legend() plt.show()
def main(): # Compare 3 scenarios: # 1. batch SGD # 2. batch SGD with momentum # 3. batch SGD with Nestrov momentum max_iter = 30 print_period = 10 X_train, X_test, Y_train, Y_test = get_normalized_data() lr = 0.00004 reg = 0.01 Y_train_ind = y2indicator(Y_train) Y_test_ind = y2indicator(Y_test) N, D = X_train.shape batch_sz = 500 n_batches = N // batch_sz M = 300 K = 10 W1 = np.random.randn(D, M) / 28 b1 = np.zeros(M) W2 = np.random.randn(M, K) / np.sqrt(M) b2 = np.zeros(K) # save initial weights W1_0 = W1.copy() b1_0 = b1.copy() W2_0 = W2.copy() b2_0 = b2.copy() # 1. batch # cost = -16 losses_batch = [] errors_batch = [] for i in range(max_iter): for j in range(n_batches): Xbatch = X_train[j * batch_sz:(j * batch_sz + batch_sz), ] Ybatch = Y_train_ind[j * batch_sz:(j * batch_sz + batch_sz), ] pYbatch, Z = forward(Xbatch, W1, b1, W2, b2) # updates W2 -= lr * (derivative_w2(Z, Ybatch, pYbatch) + reg * W2) b2 -= lr * (derivative_b2(Ybatch, pYbatch) + reg * b2) W1 -= lr * (derivative_w1(Xbatch, Z, Ybatch, pYbatch, W2) + reg * W1) b1 -= lr * (derivative_b1(Z, Ybatch, pYbatch, W2) + reg * b1) if j % print_period == 0: pY, _ = forward(X_test, W1, b1, W2, b2) l = cost(pY, Y_test_ind) losses_batch.append(l) print(f'Cost at iteration i={i}, j={j} : {l}') e = error_rate(pY, Y_test) errors_batch.append(e) print("error_rate", e) pY, _ = forward(X_test, W1, b1, W2, b2) print(f"Final error rate: {error_rate(pY, Y_test)}") # 2. batch with momentum W1 = W1_0.copy() b1 = b1_0.copy() W2 = W2_0.copy() b2 = b2_0.copy() losses_momentum = [] errors_momentum = [] mu = 0.9 dW2 = 0 db2 = 0 dW1 = 0 db1 = 0 for i in range(max_iter): for j in range(n_batches): Xbatch = X_train[j * batch_sz:(j * batch_sz + batch_sz), ] Ybatch = Y_train_ind[j * batch_sz:(j * batch_sz + batch_sz), ] pYbatch, Z = forward(Xbatch, W1, b1, W2, b2) # gradients gW2 = derivative_w2(Z, Ybatch, pYbatch) + reg * W2 gb2 = derivative_b2(Ybatch, pYbatch) + reg * b2 gW1 = derivative_w1(Xbatch, Z, Ybatch, pYbatch, W2) + reg * W1 gb1 = derivative_b1(Z, Ybatch, pYbatch, W2) + reg * b1 # Update velocities dW2 = mu * dW2 - lr * gW2 db2 = mu * db2 - lr * gb2 dW1 = mu * dW1 - lr * gW1 db1 = mu * db1 - lr * gb1 # updates W2 += dW2 b2 += db2 W1 += dW1 b1 += db1 if j % print_period == 0: pY, _ = forward(X_test, W1, b1, W2, b2) l = cost(pY, Y_test_ind) losses_momentum.append(l) print(f'Cost at iteration i={i}, j={j} : {l}') e = error_rate(pY, Y_test) errors_momentum.append(e) print("error_rate", e) pY, _ = forward(X_test, W1, b1, W2, b2) print(f"Final error rate: {error_rate(pY, Y_test)}") # 3. 
batch with Nesterov momentum W1 = W1_0.copy() b1 = b1_0.copy() W2 = W2_0.copy() b2 = b2_0.copy() losses_nesterov = [] errors_nesterov = [] mu = 0.9 vW2 = 0 vb2 = 0 vW1 = 0 vb1 = 0 for i in range(max_iter): for j in range(n_batches): Xbatch = X_train[j * batch_sz:(j * batch_sz + batch_sz), ] Ybatch = Y_train_ind[j * batch_sz:(j * batch_sz + batch_sz), ] pYbatch, Z = forward(Xbatch, W1, b1, W2, b2) # gradients gW2 = derivative_w2(Z, Ybatch, pYbatch) + reg * W2 gb2 = derivative_b2(Ybatch, pYbatch) + reg * b2 gW1 = derivative_w1(Xbatch, Z, Ybatch, pYbatch, W2) + reg * W1 gb1 = derivative_b1(Z, Ybatch, pYbatch, W2) + reg * b1 # v update vW2 = mu * vW2 - lr * gW2 vb2 = mu * vb2 - lr * gb2 vW1 = mu * vW1 - lr * gW1 vb1 = mu * vb1 - lr * gb1 # param update W2 += mu * vW2 - lr * gW2 b2 += mu * vb2 - lr * gb2 W1 += mu * vW1 - lr * gW1 b1 += mu * vb1 - lr * gb1 if j % print_period == 0: pY, _ = forward(X_test, W1, b1, W2, b2) l = cost(pY, Y_test_ind) losses_nesterov.append(l) print(f'Cost at iteration i={i}, j={j} : {l}') e = error_rate(pY, Y_test) errors_nesterov.append(e) print("error_rate", e) pY, _ = forward(X_test, W1, b1, W2, b2) print(f"Final error rate: {error_rate(pY, Y_test)}") plt.plot(losses_batch, label='batch') plt.plot(losses_momentum, label='momentum') plt.plot(losses_nesterov, label='nesterov') plt.show()
def main(): X, Y, _, _ = get_transformed_data() X = X[:, :300] # normalize the data: mu = X.mean(axis=0) std = X.std(axis=0) X = (X - mu) / std print('Performing logistic regression...') Xtrain, Ytrain = X[:-1000, :], Y[:-1000] Xtest, Ytest = X[-1000:, :], Y[-1000:] N, D = Xtrain.shape Ytrain_ind = y2indicator(Ytrain) Ytest_ind = y2indicator(Ytest) K = len(set(Y)) np.random.seed() # 1. Full Gradient Descend: W = np.random.randn(D, K) / np.sqrt(D) b = np.zeros(K) LL = [] # a storage for costs lr = 0.0001 # learning rate reg = 0.01 # L2-regularization term t0 = datetime.now() print('utilizing full GD...') for i in range(200): p_y = forward(Xtrain, W, b) W += lr * (grad_W(Ytrain_ind, p_y, Xtrain) - reg * W) b += lr * (grad_b(Ytrain_ind, p_y).sum(axis=0) - reg * b) p_y_test = forward(Xtest, W, b) ll = cost(p_y_test, Ytest_ind) LL.append(ll) if i % 10 == 0: error = error_rate(p_y_test, Ytest) print('i: %d, cost: %.6f, error: %.6f' % (i, ll, error)) dt1 = datetime.now() - t0 p_y_test = forward(Xtest, W, b) plt.plot(LL) plt.title('Cost for full GD') plt.show() plt.savefig('Cost_full_GD.png') print('Final error rate:', error_rate(p_y_test, Ytest)) print('Elapsed time for full GD:', dt1) # 2. Stochastic Gradien Descent W = np.random.randn(D, K) / np.sqrt(D) b = np.zeros(K) LLstochastic = [] # a storage for costs lr = 0.0001 # learning rate reg = 0.01 # L2-regularization term t0 = datetime.now() print('utilizing stochastic GD...') for i in range(25): tmpX, tmpY = shuffle(Xtrain, Ytrain_ind) # we consider just 500 samples, not all the dataset for n in range(N): x = tmpX[n, :].reshape(1, D) y = tmpY[n, :].reshape(1, K) p_y = forward(x, W, b) W += lr * (grad_W(y, p_y, x) - reg * W) b += lr * (grad_b(y, p_y).sum(axis=0) - reg * b) p_y_test = forward(Xtest, W, b) ll = cost(p_y_test, Ytest_ind) LLstochastic.append(ll) if n % (N // 2) == 0: error = error_rate(p_y_test, Ytest) print('i: %d, cost: %.6f, error: %.6f' % (i, ll, error)) dt2 = datetime.now() - t0 p_y_test = forward(Xtest, W, b) plt.plot(LLstochastic) plt.title('Cost for stochastic GD') plt.show() plt.savefig('Cost_stochastic_GD.png') print('Final error rate:', error_rate(p_y_test, Ytest)) print('Elapsed time for stochastic GD:', dt2) # 3. Batch Gradient Descent: W = np.random.randn(D, K) / np.sqrt(D) b = np.zeros(K) LLbatch = [] lr = 0.0001 # learning rate reg = 0.01 # L2-regularization term batch_size = 500 n_batches = N // batch_size t0 = datetime.now() print('utilizing batch GD...') for i in range(50): tmpX, tmpY = shuffle(Xtrain, Ytrain_ind) for j in range(n_batches): x = tmpX[j * batch_size:batch_size * (j + 1), :] y = tmpY[j * batch_size:batch_size * (j + 1), :] p_y = forward(x, W, b) W += lr * (grad_W(y, p_y, x) - reg * W) b += lr * (grad_b(y, p_y).sum(axis=0) - reg * b) p_y_test = forward(Xtest, W, b) ll = cost(p_y_test, Ytest_ind) LLbatch.append(ll) if j % (n_batches // 2) == 0: error = error_rate(p_y_test, Ytest) print('i: %d, cost: %.6f, error: %.6f' % (i, ll, error)) dt3 = datetime.now() - t0 p_y_test = forward(Xtest, W, b) plt.plot(LLbatch) plt.title('Cost for batch GD') plt.show() plt.savefig('Cost_batch_GD.png') print('Final error rate:', error_rate(p_y_test, Ytest)) print('Elapsed time for batch GD', dt3) # plot all costs together: x1 = np.linspace(0, 1, len(LL)) plt.plot(x1, LL, label='full') x2 = np.linspace(0, 1, len(LLstochastic)) plt.plot(x2, LLstochastic, label='stochastic') x3 = np.linspace(0, 1, len(LLbatch)) plt.plot(x3, LLbatch, label='batch') plt.legend() plt.show() plt.savefig('Costs_together.png')
def main(): max_iter = 20 print_period = 10 Xtrain, Xtest, Ytrain, Ytest = get_normalized_data() lr = 0.00004 reg = 0.01 Ytrain_ind = y2indicator(Ytrain) Ytest_ind = y2indicator(Ytest) N, D = Xtrain.shape batch_sz = 500 n_batches = N // batch_sz M = 300 K = 10 W1 = np.random.randn(D, M) / np.sqrt(D) b1 = np.zeros(M) W2 = np.random.randn(M, K) / np.sqrt(M) b2 = np.zeros(K) #const LL_batch = [] CR_batch = [] for i in range(max_iter): for j in range(n_batches): Xbatch = Xtrain[j*batch_sz: (j*batch_sz + batch_sz),] Ybatch = Ytrain_ind[j*batch_sz: (j*batch_sz + batch_sz), ] pYbatch, Z = forward(Xbatch, W1, b1, W2, b2) #gradient gW2 = derivative_w2(Z, Ybatch, pYbatch) + reg*W2 gb2 = derivative_b2(Ybatch, pYbatch) + reg*b2 gW1 = derivative_w1(Xbatch, Z, Ybatch, pYbatch, W2) + reg*W1 gb1 = derivative_b1(Z, Ybatch, pYbatch, W2) + reg*b1 #updates W2 -= lr*gW2 b2 -= lr*gb2 W1 -= lr*gW1 b1 -= lr*gb1 if j%print_period == 0: pY, _ = forward(Xtest, W1, b1, W2, b2) ll = cost(pY, Ytest_ind) LL_batch.append(ll) print("Cost at iteration i=%d, j=%d: %.6f" % (i, j, ll)) err = error_rate(pY, Ytest) CR_batch.append(err) print("Error rate:", err) pY, _ = forward(Xtest, W1, b1, W2, b2) print("Final error rate:", error_rate(pY, Ytest)) #RMSprop W1 = np.random.randn(D, M) / np.sqrt(D) b1 = np.zeros(M) W2 = np.random.randn(M, K) / np.sqrt(M) b2 = np.zeros(K) #const LL_rms = [] CR_rms = [] lr0 = 0.001 cache_W2 = 1 cache_b2 = 1 cache_W1 = 1 cache_b1 = 1 decay_rate = 0.999 eps = 1e-10 for i in range(max_iter): for j in range(n_batches): Xbatch = Xtrain[j*batch_sz: (j*batch_sz + batch_sz),] Ybatch = Ytrain_ind[j*batch_sz: (j*batch_sz + batch_sz), ] pYbatch, Z = forward(Xbatch, W1, b1, W2, b2) #updates gW2 = derivative_w2(Z, Ybatch, pYbatch) + reg*W2 cache_W2 = decay_rate*cache_W2 + (1 - decay_rate)*gW2*gW2 W2 -= lr0*gW2 / (np.sqrt(cache_W2) + eps) gb2 = derivative_b2(Ybatch, pYbatch) + reg*b2 cache_b2 = decay_rate*cache_b2 + (1 - decay_rate)*gb2*gb2 b2 -= lr0 * gb2 / (np.sqrt(cache_b2) + eps) gW1 = derivative_w1(Xbatch, Z, Ybatch, pYbatch, W2) + reg*W1 cache_W1 = decay_rate*cache_W1 + (1 - decay_rate)*gW1*gW1 W1 -= lr0 * gW1 / (np.sqrt(cache_W1) + eps) gb1 = derivative_b1(Z, Ybatch, pYbatch, W2) + reg*b1 cache_b1 = decay_rate*cache_b1 + (1 - decay_rate)*gb1*gb1 b1 -= lr0 * gb1 / (np.sqrt(cache_b1) + eps) if j%print_period == 0: pY, _ = forward(Xtest, W1, b1, W2, b2) ll = cost(pY, Ytest_ind) LL_rms.append(ll) print("Cost at iteration i=%d, j=%d: %.6f" % (i, j, ll)) err = error_rate(pY, Ytest) CR_rms.append(err) print("Error rate:", err) pY, _ = forward(Xtest, W1, b1, W2, b2) print("Final error rate:", error_rate(pY, Ytest)) plt.plot(LL_batch, label = 'const') plt.plot(LL_rms, label = 'rms') plt.legend() plt.show()
def main(): # 3 scenarios # 1. batch SGD # 2. batch SGD with momentum # 3. batch SGD with Nesterov momentum max_iter = 15 print_period = 10 Xtrain, Xtest, Ytrain, Ytest = get_normalized_data() lr = 0.0001 reg = 0.001 # Xtrain = X[:-1000, ] # Ytrain = Y[:-1000] # Xtest = X[-1000:, ] # Ytest = Y[-1000:, ] Ytrain_ind = y2indicator(Ytrain) Ytest_ind = y2indicator(Ytest) N, D = Xtrain.shape batch_sz = 500 n_batches = int(N / batch_sz) M = 300 K = 10 W1 = np.random.randn(D, M) / 28 b1 = np.zeros(M) W2 = np.random.randn(M, K) / np.sqrt(M) b2 = np.zeros(K) W1_0 = W1.copy() b1_0 = b1.copy() W2_0 = W2.copy() b2_0 = b2.copy() # Batch losses_batch = [] error_batch = [] for i in range(max_iter): for j in range(n_batches): Xbatch = Xtrain[j * batch_sz:(j * batch_sz + batch_sz), ] Ybatch = Ytrain_ind[j * batch_sz:(j * batch_sz + batch_sz), ] pYbatch, Z = forward(Xbatch, W1, b1, W2, b2) # updates W2 -= lr * (derivative_w2(Z, Ybatch, pYbatch) + reg * W2) b2 -= lr * (derivative_b2(Ybatch, pYbatch) + reg * b2) W1 -= lr * (derivative_w1(Xbatch, Z, Ybatch, pYbatch, W2) + reg * W1) b1 -= lr * (derivative_b1(Z, Ybatch, pYbatch, W2) + reg * b1) # A = ' ' # A = u"\n| |\n|----------------------| \n(\\__/) || \n(• v •) || \n / D" if j % print_period == 0: pY, _ = forward(Xtest, W1, b1, W2, b2) l = cost(pY, Ytest_ind) losses_batch.append(l) print("Cost at iteration i=%d, j=%d: %.6f" % (i, j, l)) # print( # u"|----------------------|\n| | \n Costo # en i=%d, j=%d: \n %.6f" % (i, j, l) + A) e = error_rate(pY, Ytest) error_batch.append(e) print("Ratio de error:", e) pY, _ = forward(Xtest, W1, b1, W2, b2) print("Final error rate: ", error_rate(pY, Ytest)) # Momentum W1 = W1_0.copy() b1 = b1.copy() W2 = W2.copy() b2 = b2.copy() losses_momentum = [] errors_momentum = [] mu = 0.9 dW2 = 0 db2 = 0 dW1 = 0 db1 = 0 for i in range(max_iter): for j in range(n_batches): Xbatch = Xtrain[j * batch_sz:(j * batch_sz + batch_sz), ] Ybatch = Ytrain_ind[j * batch_sz:(j * batch_sz + batch_sz), ] pYbatch, Z = forward(Xbatch, W1, b1, W2, b2) # gradients gW2 = (derivative_w2(Z, Ybatch, pYbatch) + reg * W2) gb2 = (derivative_b2(Ybatch, pYbatch) + reg * b2) gW1 = (derivative_w1(Xbatch, Z, Ybatch, pYbatch, W2) + reg * W1) gb1 = (derivative_b1(Z, Ybatch, pYbatch, W2) + reg * b1) # update velocities dW2 = mu * dW2 - lr * gW2 db2 = mu * db2 - lr * gb2 dW1 = mu * dW1 - lr * gW1 db1 = mu * db1 - lr * gb1 W2 += dW2 b2 += db2 W1 += dW1 b1 += db1 if j % print_period == 0: pY, _ = forward(Xtest, W1, b1, W2, b2) l = cost(pY, Ytest_ind) losses_momentum.append(l) print("Cost at iteration i=%d, j=%d: %.6f" % (i, j, l)) e = error_rate(pY, Ytest) errors_momentum.append(e) print("Error rate:", e) pY, _ = forward(Xtest, W1, b1, W2, b2) print("Final error rate: ", error_rate(pY, Ytest)) # Nesterov momentum W1 = W1_0.copy() b1 = b1.copy() W2 = W2.copy() b2 = b2.copy() losses_nesterov = [] errors_nesterov = [] mu = 0.9 vW2 = 0 vb2 = 0 vW1 = 0 vb1 = 0 for i in range(max_iter): for j in range(n_batches): Xbatch = Xtrain[j * batch_sz:(j * batch_sz + batch_sz), ] Ybatch = Ytrain_ind[j * batch_sz:(j * batch_sz + batch_sz), ] pYbatch, Z = forward(Xbatch, W1, b1, W2, b2) # gradients gW2 = (derivative_w2(Z, Ybatch, pYbatch) + reg * W2) gb2 = (derivative_b2(Ybatch, pYbatch) + reg * b2) gW1 = (derivative_w1(Xbatch, Z, Ybatch, pYbatch, W2) + reg * W1) gb1 = (derivative_b1(Z, Ybatch, pYbatch, W2) + reg * b1) # v update vW2 = mu * vW2 - lr * gW2 vb2 = mu * vb2 - lr * gb2 vW1 = mu * vW1 - lr * gW1 vb1 = mu * vb1 - lr * gb1 # param update W2 += mu * vW2 - lr * gW2 b2 += mu * vb2 
- lr * gb2 W1 += mu * vW1 - lr * gW1 b1 += mu * vb1 - lr * gb1 if j % print_period == 0: pY, _ = forward(Xtest, W1, b1, W2, b2) l = cost(pY, Ytest_ind) losses_nesterov.append(l) print("Cost at iteration i=%d, j=%d: %.6f" % (i, j, l)) e = error_rate(pY, Ytest) errors_nesterov.append(e) print("Error rate:", e) pY, _ = forward(Xtest, W1, b1, W2, b2) print("Final error rate: ", error_rate(pY, Ytest)) plt.plot(losses_batch, label='batch') plt.plot(losses_momentum, label='momentum') plt.plot(losses_nesterov, label='Nesterov') plt.legend() plt.show()
def main(): max_iter = 20 print_period = 10 X, Y = get_normalized_data() lr = 0.00004 reg = 0.01 Xtrain, Ytrain = X[:-1000,], Y[:-1000] Xtest, Ytest = X[-1000:,], Y[-1000:] Ytrain_ind = y2indicator(Ytrain) Ytest_ind = y2indicator(Ytest) N, D = X.shape batch_sz = 500 n_batches = N / batch_sz M = 300 K = 10 W1 = np.random.randn(D, M) / np.sqrt(D+M) b1 = np.zeros(M) W2 = np.random.randn(M, K) / np.sqrt(M) b2 = np.zeros(K) #1. batch SGD LL_batch = [] CR_batch = [] for i in xrange(max_iter): for j in xrange(n_batches): Xbatch = Xtrain[j*batch_sz:(j+1)*batch_sz,] Ybatch = Ytrain_ind[j*batch_sz:(j+1)*batch_sz,] pYbatch, Z = forward(Xbatch, W1, b1, W2, b2) W2 -= lr * (derivative_w2(Z, Ybatch, pYbatch) + reg * W2) b2 -= lr * (derivative_b2(Ybatch, pYbatch) + reg * b2) W1 -= lr * (derivative_w1(Xbatch, Z, Ybatch, pYbatch, W2) + reg * W1) b1 -= lr * (derivative_b1(Z, Ybatch, pYbatch, W2) + reg * b1) if j % print_period == 0: pY, _ = forward(Xtest, W1, b1, W2, b2) ll = cost(pY, Ytest_ind) LL_batch.append(ll) print "Cost at iteration i=%d, j=%d: %.6f" % (i, j, ll) err = error_rate(pY, Ytest) CR_batch.append(err) print "Error rate:", err pY, _ = forward(Xtest, W1, b1, W2, b2) print "Final error rate:", error_rate(pY, Ytest) #RMSProp W1 = np.random.randn(D, M) / np.sqrt(D+M) b1 = np.zeros(M) W2 = np.random.randn(M, K) / np.sqrt(M) b2 = np.zeros(K) LL_rms = [] CR_rms = [] lr_rms = 0.001 cache_W2, cache_b2, cache_W1, cache_b1 = 0, 0, 0, 0 decay_rate = 0.999 eps = 1e-6 for i in xrange(max_iter): for j in xrange(n_batches): Xbatch = Xtrain[j*batch_sz:(j+1)*batch_sz,] Ybatch = Ytrain_ind[j*batch_sz:(j+1)*batch_sz,] pYbatch, Z = forward(Xbatch, W1, b1, W2, b2) #update gW2 = derivative_w2(Z, Ybatch, pYbatch) + reg * W2 cache_W2 = decay_rate * cache_W2 + (1 - decay_rate) * gW2**2 W2 -= lr_rms * gW2 / (np.sqrt(cache_W2) + eps) gb2 = derivative_b2(Ybatch, pYbatch) + reg * b2 cache_b2 = decay_rate * cache_b2 + (1 - decay_rate) * gb2**2 b2 -= lr_rms * gb2 / (np.sqrt(cache_b2) + eps) gW1 = derivative_w1(Xbatch, Z, Ybatch, pYbatch, W2) + reg * W1 cache_W1 = decay_rate * cache_W1 + (1 - decay_rate) * gW1**2 W1 -= lr_rms * gW1 / (np.sqrt(cache_W1) + eps) gb1 = derivative_b1(Z, Ybatch, pYbatch, W2) + reg * b1 cache_b1 = decay_rate * cache_b1 + (1 - decay_rate) * gb1**2 b1 -= lr_rms * gb1 / (np.sqrt(cache_b1) + eps) if j % print_period == 0: pY, _ = forward(Xtest, W1, b1, W2, b2) ll = cost(pY, Ytest_ind) LL_rms.append(ll) print "Cost at iteration i=%d, j=%d: %.6f" % (i, j, ll) err = error_rate(pY, Ytest) CR_rms.append(err) print "Error rate:", err pY, Z = forward(X, W1, b1, W2, b2) print "Final error rate:", error_rate(pY, Ytest) plt.plot(LL_batch, label='const') plt.plot(LL_rms, label='rms') plt.legend() plt.show()
def main(): Xtrain, Xtest, Ytrain, Ytest = get_normalized_data() print("Performing logistic regression...") N, D = Xtrain.shape Ytrain_ind = y2indicator(Ytrain) Ytest_ind = y2indicator(Ytest) # 1. full W = np.random.randn(D, 10) / np.sqrt(D) b = np.zeros(10) test_losses_full = [] lr = 0.9 # lr0 = lr # save for later reg = 0. t0 = datetime.now() last_dt = 0 intervals = [] for i in range(50): p_y = forward(Xtrain, W, b) gW = gradW(Ytrain_ind, p_y, Xtrain) / N gb = gradb(Ytrain_ind, p_y) / N W += lr*(gW - reg*W) b += lr*(gb - reg*b) p_y_test = forward(Xtest, W, b) test_loss = cost(p_y_test, Ytest_ind) dt = (datetime.now() - t0).total_seconds() # save these dt2 = dt - last_dt last_dt = dt intervals.append(dt2) test_losses_full.append([dt, test_loss]) if (i + 1) % 10 == 0: print("Cost at iteration %d: %.6f" % (i + 1, test_loss)) p_y = forward(Xtest, W, b) print("Final error rate:", error_rate(p_y, Ytest)) print("Elapsted time for full GD:", datetime.now() - t0) # save the max time so we don't surpass it in subsequent iterations max_dt = dt avg_interval_dt = np.mean(intervals) # 2. stochastic W = np.random.randn(D, 10) / np.sqrt(D) b = np.zeros(10) test_losses_sgd = [] lr = 0.001 reg = 0. t0 = datetime.now() last_dt_calculated_loss = 0 done = False for i in range(50): # takes very long since we're computing cost for 41k samples tmpX, tmpY = shuffle(Xtrain, Ytrain_ind) for n in range(N): x = tmpX[n,:].reshape(1,D) y = tmpY[n,:].reshape(1,10) p_y = forward(x, W, b) gW = gradW(y, p_y, x) gb = gradb(y, p_y) W += lr*(gW - reg*W) b += lr*(gb - reg*b) dt = (datetime.now() - t0).total_seconds() dt2 = dt - last_dt_calculated_loss if dt2 > avg_interval_dt: p_y_test = forward(Xtest, W, b) test_loss = cost(p_y_test, Ytest_ind) test_losses_sgd.append([dt, test_loss]) # time to quit if dt > max_dt: done = True break if done: break if (i + 1) % 10 == 0: print("Cost at iteration %d: %.6f" % (i + 1, test_loss)) p_y = forward(Xtest, W, b) print("Final error rate:", error_rate(p_y, Ytest)) print("Elapsted time for SGD:", datetime.now() - t0) # 3. mini-batch W = np.random.randn(D, 10) / np.sqrt(D) b = np.zeros(10) test_losses_batch = [] batch_sz = 500 lr = 0.08 reg = 0. n_batches = N // batch_sz t0 = datetime.now() last_dt_calculated_loss = 0 done = False for i in range(50): tmpX, tmpY = shuffle(Xtrain, Ytrain_ind) for j in range(n_batches): x = tmpX[j*batch_sz:(j*batch_sz + batch_sz),:] y = tmpY[j*batch_sz:(j*batch_sz + batch_sz),:] p_y = forward(x, W, b) gW = gradW(y, p_y, x) / batch_sz gb = gradb(y, p_y) / batch_sz W += lr*(gW - reg*W) b += lr*(gb - reg*b) dt = (datetime.now() - t0).total_seconds() dt2 = dt - last_dt_calculated_loss if dt2 > avg_interval_dt: p_y_test = forward(Xtest, W, b) test_loss = cost(p_y_test, Ytest_ind) test_losses_batch.append([dt, test_loss]) # time to quit if dt > max_dt: done = True break if done: break if (i + 1) % 10 == 0: print("Cost at iteration %d: %.6f" % (i + 1, test_loss)) p_y = forward(Xtest, W, b) print("Final error rate:", error_rate(p_y, Ytest)) print("Elapsted time for mini-batch GD:", datetime.now() - t0) # convert to numpy arrays test_losses_full = np.array(test_losses_full) test_losses_sgd = np.array(test_losses_sgd) test_losses_batch = np.array(test_losses_batch) plt.plot(test_losses_full[:,0], test_losses_full[:,1], label="full") plt.plot(test_losses_sgd[:,0], test_losses_sgd[:,1], label="sgd") plt.plot(test_losses_batch[:,0], test_losses_batch[:,1], label="mini-batch") plt.legend() plt.show()
def main(): max_iter = 20 # make it 30 for sigmoid print_period = 10 X, Y = get_normalized_data() lr = 0.00004 reg = 0.01 Xtrain = X[:-1000, ] Ytrain = Y[:-1000] Xtest = X[-1000:, ] Ytest = Y[-1000:] Ytrain_ind = y2indicator(Ytrain) Ytest_ind = y2indicator(Ytest) N, D = Xtrain.shape batch_sz = 500 n_batches = N / batch_sz M = 300 K = 10 W1 = np.random.randn(D, M) / 28 b1 = np.zeros(M) W2 = np.random.randn(M, K) / np.sqrt(M) b2 = np.zeros(K) # 1. const # cost = -16 LL_batch = [] CR_batch = [] for i in xrange(max_iter): for j in xrange(n_batches): Xbatch = Xtrain[j * batch_sz:(j * batch_sz + batch_sz), ] Ybatch = Ytrain_ind[j * batch_sz:(j * batch_sz + batch_sz), ] pYbatch, Z = forward(Xbatch, W1, b1, W2, b2) # print "first batch cost:", cost(pYbatch, Ybatch) # updates W2 -= lr * (derivative_w2(Z, Ybatch, pYbatch) + reg * W2) b2 -= lr * (derivative_b2(Ybatch, pYbatch) + reg * b2) W1 -= lr * (derivative_w1(Xbatch, Z, Ybatch, pYbatch, W2) + reg * W1) b1 -= lr * (derivative_b1(Z, Ybatch, pYbatch, W2) + reg * b1) if j % print_period == 0: # calculate just for LL pY, _ = forward(Xtest, W1, b1, W2, b2) # print "pY:", pY ll = cost(pY, Ytest_ind) LL_batch.append(ll) print "Cost at iteration i=%d, j=%d: %.6f" % (i, j, ll) err = error_rate(pY, Ytest) CR_batch.append(err) print "Error rate:", err pY, _ = forward(Xtest, W1, b1, W2, b2) print "Final error rate:", error_rate(pY, Ytest) # 2. RMSprop W1 = np.random.randn(D, M) / 28 b1 = np.zeros(M) W2 = np.random.randn(M, K) / np.sqrt(M) b2 = np.zeros(K) LL_rms = [] CR_rms = [] lr0 = 0.001 # if you set this too high you'll get NaN! cache_W2 = 1 cache_b2 = 1 cache_W1 = 1 cache_b1 = 1 decay_rate = 0.999 eps = 0.0000000001 for i in xrange(max_iter): for j in xrange(n_batches): Xbatch = Xtrain[j * batch_sz:(j * batch_sz + batch_sz), ] Ybatch = Ytrain_ind[j * batch_sz:(j * batch_sz + batch_sz), ] pYbatch, Z = forward(Xbatch, W1, b1, W2, b2) # print "first batch cost:", cost(pYbatch, Ybatch) # updates gW2 = derivative_w2(Z, Ybatch, pYbatch) + reg * W2 cache_W2 = decay_rate * cache_W2 + (1 - decay_rate) * gW2 * gW2 W2 -= lr0 * gW2 / (np.sqrt(cache_W2) + eps) gb2 = derivative_b2(Ybatch, pYbatch) + reg * b2 cache_b2 = decay_rate * cache_b2 + (1 - decay_rate) * gb2 * gb2 b2 -= lr0 * gb2 / (np.sqrt(cache_b2) + eps) gW1 = derivative_w1(Xbatch, Z, Ybatch, pYbatch, W2) + reg * W1 cache_W1 = decay_rate * cache_W1 + (1 - decay_rate) * gW1 * gW1 W1 -= lr0 * gW1 / (np.sqrt(cache_W1) + eps) gb1 = derivative_b1(Z, Ybatch, pYbatch, W2) + reg * b1 cache_b1 = decay_rate * cache_b1 + (1 - decay_rate) * gb1 * gb1 b1 -= lr0 * gb1 / (np.sqrt(cache_b1) + eps) if j % print_period == 0: # calculate just for LL pY, _ = forward(Xtest, W1, b1, W2, b2) # print "pY:", pY ll = cost(pY, Ytest_ind) LL_rms.append(ll) print "Cost at iteration i=%d, j=%d: %.6f" % (i, j, ll) err = error_rate(pY, Ytest) CR_rms.append(err) print "Error rate:", err pY, _ = forward(Xtest, W1, b1, W2, b2) print "Final error rate:", error_rate(pY, Ytest) plt.plot(LL_batch, label='const') plt.plot(LL_rms, label='rms') plt.legend() plt.show()
def main(): max_iter = 10 print_period = 10 Xtrain, Xtest, Ytrain, Ytest = get_normalized_data() reg = 0.01 Ytrain_ind = y2indicator(Ytrain) Ytest_ind = y2indicator(Ytest) N, D = Xtrain.shape batch_sz = 500 n_batches = N // batch_sz M = 300 K = 10 W1_0 = np.random.randn(D, M) / np.sqrt(D) b1_0 = np.zeros(M) W2_0 = np.random.randn(M, K) / np.sqrt(M) b2_0 = np.zeros(K) W1 = W1_0.copy() b1 = b1_0.copy() W2 = W2_0.copy() b2 = b2_0.copy() #1st moment mW1 = 0 mb1 = 0 mW2 = 0 mb2 = 0 #2nd moment vW1 = 0 vb1 = 0 vW2 = 0 vb2 = 0 #hyperparameters lr0 = 0.001 beta1 = 0.9 beta2 = 0.999 eps = 1e-8 #Adam loss_adam = [] err_adam = [] t = 1 for i in range(max_iter): for j in range(n_batches): Xbatch = Xtrain[j * batch_sz:(j * batch_sz + batch_sz), ] Ybatch = Ytrain_ind[j * batch_sz:(j * batch_sz + batch_sz), ] pYbatch, Z = forward(Xbatch, W1, b1, W2, b2) #gradient gW2 = derivative_w2(Z, Ybatch, pYbatch) + reg * W2 gb2 = derivative_b2(Ybatch, pYbatch) + reg * b2 gW1 = derivative_w1(Xbatch, Z, Ybatch, pYbatch, W2) + reg * W1 gb1 = derivative_b1(Z, Ybatch, pYbatch, W2) + reg * b1 #new m mW1 = beta1 * mW1 + (1 - beta1) * gW1 mb1 = beta1 * mb1 + (1 - beta1) * gb1 mW2 = beta1 * mW2 + (1 - beta1) * gW2 mb2 = beta1 * mb2 + (1 - beta1) * gb2 #new vW1 = beta2 * vW1 + (1 - beta2) * gW1 * gW1 vb1 = beta2 * vb1 + (1 - beta2) * gb1 * gb1 vW2 = beta2 * vW2 + (1 - beta2) * gW2 * gW2 vb2 = beta2 * vb2 + (1 - beta2) * gb2 * gb2 #bias correction correction1 = 1 - beta1**t hat_mW1 = mW1 / correction1 hat_mb1 = mb1 / correction1 hat_mW2 = mW2 / correction1 hat_mb2 = mb2 / correction1 correction2 = 1 - beta2**t hat_vW1 = vW1 / correction2 hat_vb1 = vb1 / correction2 hat_vW2 = vW2 / correction2 hat_vb2 = vb2 / correction2 #update t t += 1 W1 = W1 - lr0 * hat_mW1 / np.sqrt(hat_vW1 + eps) b1 = b1 - lr0 * hat_mb1 / np.sqrt(hat_vb1 + eps) W2 = W2 - lr0 * hat_mW2 / np.sqrt(hat_vW2 + eps) b2 = b2 - lr0 * hat_mb2 / np.sqrt(hat_vb2 + eps) if j % print_period == 0: pY, _ = forward(Xtest, W1, b1, W2, b2) l = cost(pY, Ytest_ind) loss_adam.append(l) print("Cost at iteration i=%d, j=%d: %.6f" % (i, j, l)) err = error_rate(pY, Ytest) err_adam.append(err) print("Error rate:", err) pY, _ = forward(Xtest, W1, b1, W2, b2) print("Final error rate:", error_rate(pY, Ytest)) #RMSprop with momentum W1 = W1_0.copy() b1 = b1_0.copy() W2 = W2_0.copy() b2 = b2_0.copy() loss_rms = [] err_rms = [] lr0 = 0.001 mu = 0.9 decay_rate = 0.999 eps = 1e-8 cache_W2 = 1 cache_b2 = 1 cache_W1 = 1 cache_b1 = 1 dW1 = 0 db1 = 0 dW2 = 0 db2 = 0 for i in range(max_iter): for j in range(n_batches): Xbatch = Xtrain[j * batch_sz:(j * batch_sz + batch_sz), ] Ybatch = Ytrain_ind[j * batch_sz:(j * batch_sz + batch_sz), ] pYbatch, Z = forward(Xbatch, W1, b1, W2, b2) #updates gW2 = derivative_w2(Z, Ybatch, pYbatch) + reg * W2 cache_W2 = decay_rate * cache_W2 + (1 - decay_rate) * gW2 * gW2 dW2 = mu * dW2 + (1 - mu) * lr0 * gW2 / (np.sqrt(cache_W2) + eps) W2 -= dW2 gb2 = derivative_b2(Ybatch, pYbatch) + reg * b2 cache_b2 = decay_rate * cache_b2 + (1 - decay_rate) * gb2 * gb2 db2 = mu * db2 + (1 - mu) * lr0 * gb2 / (np.sqrt(cache_b2) + eps) b2 -= db2 gW1 = derivative_w1(Xbatch, Z, Ybatch, pYbatch, W2) + reg * W1 cache_W1 = decay_rate * cache_W1 + (1 - decay_rate) * gW1 * gW1 dW1 = mu * dW1 + (1 - mu) * lr0 * gW1 / (np.sqrt(cache_W1) + eps) W1 -= dW1 gb1 = derivative_b1(Z, Ybatch, pYbatch, W2) + reg * b1 cache_b1 = decay_rate * cache_b1 + (1 - decay_rate) * gb1 * gb1 db1 = mu * db1 + (1 - mu) * lr0 * gb1 / (np.sqrt(cache_b1) + eps) b1 -= db1 if j % print_period == 0: pY, _ = 
forward(Xtest, W1, b1, W2, b2) l = cost(pY, Ytest_ind) loss_rms.append(l) print("Cost at iteration i=%d, j=%d: %.6f" % (i, j, l)) err = error_rate(pY, Ytest) err_rms.append(err) print("Error rate:", err) pY, _ = forward(Xtest, W1, b1, W2, b2) print("Final error rate:", error_rate(pY, Ytest)) plt.plot(loss_adam, label='adam') plt.plot(loss_rms, label='rmsprop') plt.legend() plt.show()
def main(): max_iter = 20 # make it 30 for sigmoid print_period = 50 Xtrain, Xtest, Ytrain, Ytest = get_normalized_data() reg = 0.01 Ytrain_ind = y2indicator(Ytrain) Ytest_ind = y2indicator(Ytest) N, D = Xtrain.shape batch_sz = 500 n_batches = N // batch_sz M = 300 K = 10 W1 = np.random.randn(D, M) / np.sqrt(D) b1 = np.zeros(M) W2 = np.random.randn(M, K) / np.sqrt(M) b2 = np.zeros(K) # save initial weights W1_0 = W1.copy() b1_0 = b1.copy() W2_0 = W2.copy() b2_0 = b2.copy() # 1st moment mW1 = 0 mb1 = 0 mW2 = 0 mb2 = 0 # 2nd moment vW1 = 0 vb1 = 0 vW2 = 0 vb2 = 0 # hyperparameters lr0 = 0.001 beta1 = 0.9 beta2 = 0.999 eps = 1e-8 losses_adam = [] errors_adam = [] t = 1 for i in range(max_iter): for j in range(n_batches): Xbatch = Xtrain[j * batch_sz:(j * batch_sz + batch_sz), ] Ybatch = Ytrain_ind[j * batch_sz:(j * batch_sz + batch_sz), ] pYbatch, Z = forward(Xbatch, W1, b1, W2, b2) gW2 = derivative_w2(Z, Ybatch, pYbatch) + reg * W2 gb2 = derivative_b2(Ybatch, pYbatch) + reg * b2 gW1 = derivative_w1(Xbatch, Z, Ybatch, pYbatch, W2) + reg * W1 gb1 = derivative_b1(Z, Ybatch, pYbatch, W2) + reg * b1 # new m mW1 = beta1 * mW1 + (1 - beta1) * gW1 mb1 = beta1 * mb1 + (1 - beta1) * gb1 mW2 = beta1 * mW2 + (1 - beta1) * gW2 mb2 = beta1 * mb2 + (1 - beta1) * gb2 # new v vW1 = beta2 * vW1 + (1 - beta2) * gW1 * gW1 vb1 = beta2 * vb1 + (1 - beta2) * gb1 * gb1 vW2 = beta2 * vW2 + (1 - beta2) * gW2 * gW2 vb2 = beta2 * vb2 + (1 - beta2) * gb2 * gb2 # bias correction correction1 = 1 - beta1**t hat_mW1 = mW1 / correction1 hat_mb1 = mb1 / correction1 hat_mW2 = mW2 / correction1 hat_mb2 = mb2 / correction1 correction2 = 1 - beta2**t hat_vW1 = vW1 / correction2 hat_vb1 = vb1 / correction2 hat_vW2 = vW2 / correction2 hat_vb2 = vb2 / correction2 # update t t += 1 # apply updates to the params W1 = W1 - lr0 * hat_mW1 / np.sqrt(hat_vW1 + eps) b1 = b1 - lr0 * hat_mb1 / np.sqrt(hat_vb1 + eps) W2 = W2 - lr0 * hat_mW2 / np.sqrt(hat_vW2 + eps) b2 = b2 - lr0 * hat_mb2 / np.sqrt(hat_vb2 + eps) if j % print_period == 0: pY, _ = forward(Xtest, W1, b1, W2, b2) l = cost(pY, Ytest_ind) losses_adam.append(l) print("cost at iter i %d, j %d: %.6f" % (i, j, l)) e = error_rate(pY, Ytest) errors_adam.append(e) pY, _ = forward(Xtest, W1, b1, W2, b2) print("final error rate: ", error_rate(pY, Ytest)) print() plt.plot(losses_adam, label='adam') pY, _ = forward(Xtest, W1, b1, W2, b2) plt.legend() plt.show()
def main():
    max_iter = 20  # make it 30 for sigmoid
    print_period = 10

    X, Y = get_normalized_data()
    lr = 0.00004
    reg = 0.01

    Xtrain = X[:-1000,]
    Ytrain = Y[:-1000]
    Xtest = X[-1000:,]
    Ytest = Y[-1000:]
    N, D = Xtrain.shape
    Ytrain_ind = y2indicator(Ytrain)
    Ytest_ind = y2indicator(Ytest)

    batch_sz = 500
    n_batches = N // batch_sz  # integer division so range() receives an int

    M = 300
    K = 10
    W1 = np.random.randn(D, M) / 28
    b1 = np.zeros(M)
    W2 = np.random.randn(M, K) / np.sqrt(M)
    b2 = np.zeros(K)

    # 1. batch GD
    LL_batch = []
    CR_batch = []
    for i in range(max_iter):
        for j in range(n_batches):
            Xbatch = Xtrain[j*batch_sz:(j*batch_sz + batch_sz),:]
            Ybatch = Ytrain_ind[j*batch_sz:(j*batch_sz + batch_sz),:]
            pYbatch, Z = forward(Xbatch, W1, b1, W2, b2)

            # updates
            W2 -= lr*(derivative_w2(Z, Ybatch, pYbatch) + reg*W2)
            b2 -= lr*(derivative_b2(Ybatch, pYbatch) + reg*b2)
            W1 -= lr*(derivative_w1(Xbatch, Z, Ybatch, pYbatch, W2) + reg*W1)
            b1 -= lr*(derivative_b1(Z, Ybatch, pYbatch, W2) + reg*b1)

            if j % print_period == 0:
                # calculate just for LL
                pY, _ = forward(Xtest, W1, b1, W2, b2)
                ll = cost(pY, Ytest_ind)
                LL_batch.append(ll)
                print("Cost at iteration i=%d, j=%d: %.6f" % (i, j, ll))
                err = error_rate(pY, Ytest)
                CR_batch.append(err)
                print("Error rate:", err)
    pY, _ = forward(Xtest, W1, b1, W2, b2)
    print("Final error rate:", error_rate(pY, Ytest))

    # 2. batch GD w/ momentum
    W1 = np.random.randn(D, M) / 28
    b1 = np.zeros(M)
    W2 = np.random.randn(M, K) / np.sqrt(M)
    b2 = np.zeros(K)
    LL_momentum = []
    CR_momentum = []
    mu = 0.9
    dW2 = 0
    db2 = 0
    dW1 = 0
    db1 = 0
    for i in range(max_iter):
        for j in range(n_batches):
            Xbatch = Xtrain[j*batch_sz:(j*batch_sz + batch_sz),:]
            Ybatch = Ytrain_ind[j*batch_sz:(j*batch_sz + batch_sz),:]
            pYbatch, Z = forward(Xbatch, W1, b1, W2, b2)

            # updates
            dW2 = mu*dW2 - lr*(derivative_w2(Z, Ybatch, pYbatch) + reg*W2)
            W2 += dW2
            db2 = mu*db2 - lr*(derivative_b2(Ybatch, pYbatch) + reg*b2)
            b2 += db2
            dW1 = mu*dW1 - lr*(derivative_w1(Xbatch, Z, Ybatch, pYbatch, W2) + reg*W1)
            W1 += dW1
            db1 = mu*db1 - lr*(derivative_b1(Z, Ybatch, pYbatch, W2) + reg*b1)
            b1 += db1

            if j % print_period == 0:
                # calculate just for LL
                pY, _ = forward(Xtest, W1, b1, W2, b2)
                ll = cost(pY, Ytest_ind)
                LL_momentum.append(ll)
                print("Cost at iteration i=%d, j=%d: %.6f" % (i, j, ll))
                err = error_rate(pY, Ytest)
                CR_momentum.append(err)
                print("Error rate:", err)
    pY, _ = forward(Xtest, W1, b1, W2, b2)
    print("Final error rate:", error_rate(pY, Ytest))

    # 3. batch GD w/ Nesterov momentum
    W1 = np.random.randn(D, M) / 28
    b1 = np.zeros(M)
    W2 = np.random.randn(M, K) / np.sqrt(M)
    b2 = np.zeros(K)
    LL_nest = []
    CR_nest = []
    mu = 0.9
    vW2 = 0
    vb2 = 0
    vW1 = 0
    vb1 = 0
    for i in range(max_iter):
        for j in range(n_batches):
            # because we want g(t) = grad(f(W(t-1) - lr*mu*dW(t-1)))
            # dW(t) = mu*dW(t-1) + g(t)
            # W(t) = W(t-1) - lr*dW(t)
            W1_tmp = W1 - lr*mu*vW1
            b1_tmp = b1 - lr*mu*vb1
            W2_tmp = W2 - lr*mu*vW2
            b2_tmp = b2 - lr*mu*vb2

            Xbatch = Xtrain[j*batch_sz:(j*batch_sz + batch_sz),:]
            Ybatch = Ytrain_ind[j*batch_sz:(j*batch_sz + batch_sz),:]
            pYbatch, Z = forward(Xbatch, W1_tmp, b1_tmp, W2_tmp, b2_tmp)

            # updates
            vW2 = mu*vW2 + derivative_w2(Z, Ybatch, pYbatch) + reg*W2_tmp
            W2 -= lr*vW2
            vb2 = mu*vb2 + derivative_b2(Ybatch, pYbatch) + reg*b2_tmp
            b2 -= lr*vb2
            vW1 = mu*vW1 + derivative_w1(Xbatch, Z, Ybatch, pYbatch, W2_tmp) + reg*W1_tmp
            W1 -= lr*vW1
            vb1 = mu*vb1 + derivative_b1(Z, Ybatch, pYbatch, W2_tmp) + reg*b1_tmp
            b1 -= lr*vb1

            if j % print_period == 0:
                # calculate just for LL
                pY, _ = forward(Xtest, W1, b1, W2, b2)
                ll = cost(pY, Ytest_ind)
                LL_nest.append(ll)
                print("Cost at iteration i=%d, j=%d: %.6f" % (i, j, ll))
                err = error_rate(pY, Ytest)
                CR_nest.append(err)
                print("Error rate:", err)
    pY, _ = forward(Xtest, W1, b1, W2, b2)
    print("Final error rate:", error_rate(pY, Ytest))

    plt.plot(LL_batch, label="batch")
    plt.plot(LL_momentum, label="momentum")
    plt.plot(LL_nest, label="nesterov")
    plt.legend()
    plt.show()
def main(): dobatch = False dobatchwithmomentum = True dobatchwithnesterovmomentum = True max_iter = 20 # make it 30 for sigmoid print_period = 50 Xtrain, Xtest, Ytrain, Ytest = get_normalized_data() lr = 0.00004 reg = 0.01 Ytrain_ind = y2indicator(Ytrain) Ytest_ind = y2indicator(Ytest) N, D = Xtrain.shape batch_sz = 500 n_batches = N // batch_sz M = 300 K = 10 W1 = np.random.randn(D, M) / np.sqrt(D) b1 = np.zeros(M) W2 = np.random.randn(M, K) / np.sqrt(M) b2 = np.zeros(K) # save initial weights W1_0 = W1.copy() b1_0 = b1.copy() W2_0 = W2.copy() b2_0 = b2.copy() # batch #cost = -16 if dobatch: losses_batch = [] errors_batch = [] for i in range(max_iter): for j in range(n_batches): Xbatch = Xtrain[j * batch_sz:(j * batch_sz + batch_sz), ] Ybatch = Ytrain_ind[j * batch_sz:(j * batch_sz + batch_sz), ] pYbatch, Z = forward(Xbatch, W1, b1, W2, b2) W2 -= lr * (derivative_w2(Z, Ybatch, pYbatch) + reg * W2) b2 -= lr * (derivative_b2(Ybatch, pYbatch) + reg * b2) W1 -= lr * (derivative_w1(Xbatch, Z, Ybatch, pYbatch, W2) + reg * W1) b1 -= lr * (derivative_b1(Z, Ybatch, pYbatch, W2) + reg * b1) if j % print_period == 0: pY, _ = forward(Xtest, W1, b1, W2, b2) l = cost(pY, Ytest_ind) losses_batch.append(l) print("cost at iter i %d, j %d: %.6f" % (i, j, l)) e = error_rate(pY, Ytest) errors_batch.append(e) pY, _ = forward(Xtest, W1, b1, W2, b2) print("final error rate: ", error_rate(pY, Ytest)) print() # batch with momentum if dobatchwithmomentum: print("momentum") W1 = W1_0.copy b1 = b1_0.copy W2 = W2_0.copy b2 = b2_0.copy losses_momentum = [] errors_momentum = [] mu = 0.9 dW2 = 0 db2 = 0 dW1 = 0 db1 = 0 for i in range(max_iter): for j in range(n_batches): Xbatch = Xtrain[j * batch_sz:(j * batch_sz + batch_sz), ] Ybatch = Ytrain_ind[j * batch_sz:(j * batch_sz + batch_sz), ] pYbatch, Z = forward(Xbatch, W1, b1, W2, b2) gW2 = derivative_w2(Z, Ybatch, pYbatch) + reg * W2 gb2 = derivative_b2(Ybatch, pYbatch) + reg * b2 gW1 = derivative_w1(Xbatch, Z, Ybatch, pYbatch, W2) + reg * W1 gb1 = derivative_b1(Z, Ybatch, pYbatch, W2) + reg * b1 dW2 = mu * dW2 - lr * gW2 db2 = mu * db2 - lr * gb2 dW1 = mu * dW1 - lr * gW1 db1 = mu * db1 - lr * gb1 W2 += dW2 b2 += db2 W1 += dW1 b1 += db1 if j % print_period == 0: pY, _ = forward(Xtest, W1, b1, W2, b2) l = cost(pY, Ytest_ind) losses_momentum.append(l) print("cost at iter i %d, j %d: %.6f" % (i, j, l)) e = error_rate(pY, Ytest) errors_momentum.append(e) pY, _ = forward(Xtest, W1, b1, W2, b2) print("final error rate: ", error_rate(pY, Ytest)) print() # Nesterov momentum if dobatchwithnesterovmomentum: W1 = W1_0.copy b1 = b1_0.copy W2 = W2_0.copy b2 = b2_0.copy losses_nesterov = [] errors_nesterov = [] mu = 0.9 vW2 = 0 vb2 = 0 vW1 = 0 vb1 = 0 for i in range(max_iter): for j in range(n_batches): Xbatch = Xtrain[j * batch_sz:(j * batch_sz + batch_sz), ] Ybatch = Ytrain_ind[j * batch_sz:(j * batch_sz + batch_sz), ] pYbatch, Z = forward(Xbatch, W1, b1, W2, b2) gW2 = derivative_w2(Z, Ybatch, pYbatch) + reg * W2 gb2 = derivative_b2(Ybatch, pYbatch) + reg * b2 gW1 = derivative_w1(Xbatch, Z, Ybatch, pYbatch, W2) + reg * W1 gb1 = derivative_b1(Z, Ybatch, pYbatch, W2) + reg * b1 vW2 = mu * vW2 - lr * gW2 vb2 = mu * vb2 - lr * gb2 vW1 = mu * vW1 - lr * gW1 vb1 = mu * vb1 - lr * gb1 W2 += mu * vW2 - lr * gW2 b2 += mu * vb2 - lr * gb2 W1 += mu * vW1 - lr * gW1 b1 += mu * vb1 - lr * gb1 W2 += dW2 b2 += db2 W1 += dW1 b1 += db1 if j % print_period == 0: pY, _ = forward(Xtest, W1, b1, W2, b2) l = cost(pY, Ytest_ind) losses_nesterov.append(l) print("cost at iter i %d, j %d: %.6f" % (i, j, 
l)) e = error_rate(pY, Ytest) errors_nesterov.append(e) pY, _ = forward(Xtest, W1, b1, W2, b2) print("final error rate: ", error_rate(pY, Ytest)) print() plt.plot(losses_batch, label="batch") plt.plot(losses_momentum, label="momentum") plt.plot(losses_nesterov, label="nesterov") plt.legend() plt.show()
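# A minimal sketch (added) of the Nesterov-momentum approximation the comparison above
# appears to intend, for a single parameter W with gradient g evaluated at the current
# weights; lr and mu match the hyperparameters used in the scripts, and the helper name
# is illustrative. Note the parameter is moved exactly once per step, by mu*v - lr*g.
def nesterov_step(W, v, g, lr=1e-4, mu=0.9):
    # hypothetical helper, not part of the original code
    v = mu * v - lr * g           # update velocity with the current gradient
    W = W + mu * v - lr * g       # lookahead update: applied once per step
    return W, v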
def main(): Xtrain, Xtest, Ytrain, Ytest = get_transformed_data() print("Performing logistic regression...") N, D = Xtrain.shape Ytrain_ind = y2indicator(Ytrain) Ytest_ind = y2indicator(Ytest) # 1. full W = np.random.randn(D, 10) / np.sqrt(D) b = np.zeros(10) LL = [] lr = 0.0001 reg = 0.01 t0 = datetime.now() for i in range(50): p_y = forward(Xtrain, W, b) W += lr * (gradW(Ytrain_ind, p_y, Xtrain) - reg * W) b += lr * (gradb(Ytrain_ind, p_y) - reg * b) p_y_test = forward(Xtest, W, b) ll = cost(p_y_test, Ytest_ind) LL.append(ll) if i % 1 == 0: err = error_rate(p_y_test, Ytest) if i % 10 == 0: print("Cost at iteration %d: %.6f" % (i, ll)) print("Error rate:", err) p_y = forward(Xtest, W, b) print("Final error rate:", error_rate(p_y, Ytest)) print("Elapsted time for full GD:", datetime.now() - t0) # 2. stochastic W = np.random.randn(D, 10) / np.sqrt(D) b = np.zeros(10) LL_stochastic = [] lr = 0.0001 reg = 0.01 t0 = datetime.now() for i in range( 50): # takes very long since we're computing cost for 41k samples tmpX, tmpY = shuffle(Xtrain, Ytrain_ind) for n in range(min(N, 500)): # shortcut so it won't take so long... x = tmpX[n, :].reshape( 1, D ) # in this case, x is a vector, shape=(D,) ,為了讓 feature and target 可以計算 forward and Weights 要做reshape y = tmpY[n, :].reshape( 1, 10 ) # y is a vector, need to convert into metrix for y2indicator calculation p_y = forward(x, W, b) W += lr * (gradW(y, p_y, x) - reg * W) b += lr * (gradb(y, p_y) - reg * b) p_y_test = forward(Xtest, W, b) ll = cost(p_y_test, Ytest_ind) LL_stochastic.append(ll) if i % 1 == 0: err = error_rate(p_y_test, Ytest) if i % 10 == 0: print("Cost at iteration %d: %.6f" % (i, ll)) print("Error rate:", err) p_y = forward(Xtest, W, b) print("Final error rate:", error_rate(p_y, Ytest)) print("Elapsted time for SGD:", datetime.now() - t0) # 3. batch W = np.random.randn(D, 10) / np.sqrt(D) b = np.zeros(10) LL_batch = [] lr = 0.0001 reg = 0.01 batch_sz = 500 n_batches = N // batch_sz t0 = datetime.now() for i in range(50): tmpX, tmpY = shuffle(Xtrain, Ytrain_ind) for j in range(n_batches): x = tmpX[j * batch_sz:(j * batch_sz + batch_sz), :] y = tmpY[j * batch_sz:(j * batch_sz + batch_sz), :] p_y = forward(x, W, b) W += lr * (gradW(y, p_y, x) - reg * W) b += lr * (gradb(y, p_y) - reg * b) p_y_test = forward(Xtest, W, b) ll = cost(p_y_test, Ytest_ind) LL_batch.append(ll) if i % 1 == 0: err = error_rate(p_y_test, Ytest) if i % 10 == 0: print("Cost at iteration %d: %.6f" % (i, ll)) print("Error rate:", err) p_y = forward(Xtest, W, b) print("Final error rate:", error_rate(p_y, Ytest)) print("Elapsted time for batch GD:", datetime.now() - t0) x1 = np.linspace(0, 1, len(LL)) plt.plot(x1, LL, label="full") x2 = np.linspace(0, 1, len(LL_stochastic)) plt.plot(x2, LL_stochastic, label="stochastic") x3 = np.linspace(0, 1, len(LL_batch)) plt.plot(x3, LL_batch, label="batch") plt.legend() plt.show()
def main(): # compare 3: # 1. batch SGD # 2. batch SGD with momentum # 3. batch SGD with Nesterov momentum # all with L2 regularization X, Y = get_normalized_data() Xtrain = X[:-1000, ] Ytrain = Y[:-1000] Xtest = X[-1000:, ] Ytest = Y[-1000:] N, D = Xtrain.shape Ytrain_ind = y2indicator(Ytrain) Ytest_ind = y2indicator(Ytest) M = 300 K = 10 max_iter = 20 epochs = 20 print_period = 10 lr0 = 0.0004 reg = 0.01 epsilon = 10e-10 decay = 0.999 batch_size = 500 number_batches = int(N // batch_size) W1 = np.random.randn(D, M) / 28 b1 = np.zeros(M) W2 = np.random.randn(M, K) / np.sqrt(M) b2 = np.zeros(K) W1_0 = W1.copy() b1_0 = b1.copy() W2_0 = W2.copy() b2_0 = b2.copy() cache_W2 = 1 cache_W1 = 1 cache_b2 = 1 cache_b1 = 1 tr_costs = [] errors_batch = [] losses_test = [] # 1. Just grad & RMSprop # 1. for epoch in range(epochs): for j in range(number_batches): xtr = Xtrain[j * batch_size:(j * batch_size + batch_size), :] ytr = Ytrain_ind[j * batch_size:(j * batch_size + batch_size), :] ytr_pred, z_tr = forward(xtr, W1, b1, W2, b2) # gradients gW2 = derivative_w2(z_tr, ytr, ytr_pred) + reg * W2 gb2 = derivative_b2(ytr, ytr_pred) + reg * b2 gW1 = derivative_w1(xtr, z_tr, ytr, ytr_pred, W2) + reg * W1 gb1 = derivative_b1(z_tr, ytr, ytr_pred, W2) + reg * b1 # # AdaGrad # cache_W2 += derivative_w2(z_tr, ytr, ytr_pred) * derivative_w2(z_tr, ytr, ytr_pred) # cache_W1 += derivative_w1(xtr, z_tr, ytr, ytr_pred, W2) * derivative_w1(xtr, z_tr, ytr, ytr_pred, W2) # cache_b2 += derivative_b2(ytr, ytr_pred) * derivative_b2(ytr, ytr_pred) # cache_b1 += derivative_b1(z_tr, ytr, ytr_pred, W2) * derivative_b1(z_tr, ytr, ytr_pred, W2) # RMSProp cache_W2 += decay * cache_W2 + (1 - decay) * gW2 * gW2 cache_W1 += decay * cache_W1 + (1 - decay) * gW1 * gW1 cache_b2 += decay * cache_b2 + (1 - decay) * gb2 * gb2 cache_b1 += decay * cache_b1 + (1 - decay) * gb1 * gb1 W2 -= lr0 * (gW2 // (cache_W2 + epsilon) + reg * W2) b2 -= lr0 * (gb2 // (cache_b2 + epsilon) + reg * b2) W1 -= lr0 * (gW1 // (cache_W1 + epsilon) + reg * W1) b1 -= lr0 * (gb1 // (cache_b1 + epsilon) + reg * b1) if j % print_period == 0: yte_pred, _ = forward(Xtest, W1, b1, W2, b2) l = cost(yte_pred, Ytest_ind) losses_test.append(l) print("test set Cost at iteration epoch=%d, j=%d: %.6f" % (epoch, j, l)) e = error_rate(yte_pred, Ytest) errors_batch.append(e) print("Error rate:", e) ctr = cost(ytr_pred, ytr) print("traning set cost", ctr) tr_costs.append(ctr) pY, _ = forward(Xtest, W1, b1, W2, b2) #plt.plot(tr_costs, label='tr_costs') plt.plot(losses_test, label='losses_test RMS') #plt.plot(errors_batch, label='errors_batch') # plt.show() # print("tr_costs", tr_costs) print("Final error rate:", error_rate(pY, Ytest)) # 2. batch grad with momentum & RMSprop # # # 2. 
    W1 = W1_0.copy()
    b1 = b1_0.copy()
    W2 = W2_0.copy()
    b2 = b2_0.copy()

    # batch gradient descent with momentum + RMSProp
    tr_costs_momentum = []
    errors_batch_momentum = []
    losses_test_momentum = []

    # momentum coefficient
    mu = 0.8
    cache_W2 = 1
    cache_W1 = 1
    cache_b2 = 1
    cache_b1 = 1
    dW1 = 0
    dW2 = 0
    db1 = 0
    db2 = 0
    cW1 = 0
    cW2 = 0
    cb1 = 0
    cb2 = 0
    for epoch in range(epochs):
        for j in range(number_batches):
            xtr = Xtrain[j * batch_size:(j * batch_size + batch_size), :]
            ytr = Ytrain_ind[j * batch_size:(j * batch_size + batch_size), :]
            ytr_pred, z_tr = forward(xtr, W1, b1, W2, b2)

            # gradients
            gW2 = derivative_w2(z_tr, ytr, ytr_pred) + reg * W2
            gb2 = derivative_b2(ytr, ytr_pred) + reg * b2
            gW1 = derivative_w1(xtr, z_tr, ytr, ytr_pred, W2) + reg * W1
            gb1 = derivative_b1(z_tr, ytr, ytr_pred, W2) + reg * b1

            # careful: it is easy to mix up the momentum and velocity terms here
            # RMSProp cache
            cache_W2 = decay * cache_W2 + (1 - decay) * gW2 * gW2
            cache_W1 = decay * cache_W1 + (1 - decay) * gW1 * gW1
            cache_b2 = decay * cache_b2 + (1 - decay) * gb2 * gb2
            cache_b1 = decay * cache_b1 + (1 - decay) * gb1 * gb1
            cW2 = gW2 / (np.sqrt(cache_W2) + epsilon)
            cb2 = gb2 / (np.sqrt(cache_b2) + epsilon)
            cW1 = gW1 / (np.sqrt(cache_W1) + epsilon)
            cb1 = gb1 / (np.sqrt(cache_b1) + epsilon)

            # update velocity
            dW2 = mu * dW2 + (1 - mu) * lr0 * cW2
            db2 = mu * db2 + (1 - mu) * lr0 * cb2
            dW1 = mu * dW1 + (1 - mu) * lr0 * cW1
            db1 = mu * db1 + (1 - mu) * lr0 * cb1

            # update parameters
            W2 -= dW2
            W1 -= dW1
            b2 -= db2
            b1 -= db1

            if j % print_period == 0:
                yte_pred, _ = forward(Xtest, W1, b1, W2, b2)
                l = cost(yte_pred, Ytest_ind)
                losses_test_momentum.append(l)
                print("test set cost at epoch=%d, j=%d: %.6f" % (epoch, j, l))
                e = error_rate(yte_pred, Ytest)
                errors_batch_momentum.append(e)
                print("Error rate:", e)
                ctr = cost(ytr_pred, ytr)
                print("training set cost:", ctr)
                tr_costs_momentum.append(ctr)

    pY, _ = forward(Xtest, W1, b1, W2, b2)
    #plt.plot(tr_costs_momentum, label='tr_costs momentum')
    plt.plot(losses_test_momentum, label='losses_test momentum RMS')
    #plt.plot(errors_batch, label='errors_batch')
    print("Final error rate:", error_rate(pY, Ytest))
    plt.legend()
    plt.show()
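# Editor's note (added): the plain-RMSProp block earlier in this script scales gradients
# by the raw cache (and uses integer division //); the standard form divides by the square
# root of the running cache. A minimal per-parameter sketch, assuming numpy and the
# decay/eps values used above; the helper name rmsprop_step is illustrative only:
import numpy as np

def rmsprop_step(W, cache, g, lr=1e-3, decay=0.999, eps=1e-8):
    # keep an exponentially weighted average of squared gradients,
    # then scale the step by its square root
    cache = decay * cache + (1 - decay) * g * g
    W = W - lr * g / (np.sqrt(cache) + eps)
    return W, cache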
            hat_vb1 = vb1 / correction2
            hat_vW2 = vW2 / correction2
            hat_vb2 = vb2 / correction2

            # update t
            t += 1

            # apply updates to the parameters
            W1 = W1 - lr0 * hat_mW1 / np.sqrt(hat_vW1 + eps)
            b1 = b1 - lr0 * hat_mb1 / np.sqrt(hat_vb1 + eps)
            W2 = W2 - lr0 * hat_mW2 / np.sqrt(hat_vW2 + eps)
            b2 = b2 - lr0 * hat_mb2 / np.sqrt(hat_vb2 + eps)  # update b2, not b1

            if j % print_period == 0:
                pY, _ = forward(X_test, W1, b1, W2, b2)
                l = cost(pY, Y_test_ind)
                print("Cost at iteration i=%d, j=%d: %.6f" % (i, j, l))
                err = error_rate(pY, Y_test)
                err_adam.append(err)
                print("Error rate:", err)

    pY, _ = forward(X_test, W1, b1, W2, b2)
    print("Final error rate:", error_rate(pY, Y_test))

    # 2. RMSprop with momentum
    W1 = W1_0.copy()
    b1 = b1_0.copy()
    W2 = W2_0.copy()
    b2 = b2_0.copy()
def main(): max_iter = 10 print_period = 10 X, Y = get_normalized_data() reg = 0.01 Xtrain = X[:-1000,] Ytrain = Y[:-1000] Xtest = X[-1000:,] Ytest = Y[-1000:] Ytrain_ind = y2indicator(Ytrain) Ytest_ind = y2indicator(Ytest) N, D = Xtrain.shape batch_sz = 500 n_batches = N // batch_sz M = 300 K = 10 W1_0 = np.random.randn(D, M) / np.sqrt(D) b1_0 = np.zeros(M) W2_0 = np.random.randn(M, K) / np.sqrt(M) b2_0 = np.zeros(K) W1 = W1_0.copy() b1 = b1_0.copy() W2 = W2_0.copy() b2 = b2_0.copy() # 1st moment mW1 = 0 mb1 = 0 mW2 = 0 mb2 = 0 # 2nd moment vW1 = 0 vb1 = 0 vW2 = 0 vb2 = 0 # hyperparams lr0 = 0.001 beta1 = 0.9 beta2 = 0.999 eps = 1e-8 # 1. Adam loss_adam = [] err_adam = [] t = 1 for i in range(max_iter): for j in range(n_batches): Xbatch = Xtrain[j*batch_sz:(j*batch_sz + batch_sz),] Ybatch = Ytrain_ind[j*batch_sz:(j*batch_sz + batch_sz),] pYbatch, Z = forward(Xbatch, W1, b1, W2, b2) # updates # gradients gW2 = derivative_w2(Z, Ybatch, pYbatch) + reg*W2 gb2 = derivative_b2(Ybatch, pYbatch) + reg*b2 gW1 = derivative_w1(Xbatch, Z, Ybatch, pYbatch, W2) + reg*W1 gb1 = derivative_b1(Z, Ybatch, pYbatch, W2) + reg*b1 # new m mW1 = beta1 * mW1 + (1 - beta1) * gW1 mb1 = beta1 * mb1 + (1 - beta1) * gb1 mW2 = beta1 * mW2 + (1 - beta1) * gW2 mb2 = beta1 * mb2 + (1 - beta1) * gb2 # new v vW1 = beta2 * vW1 + (1 - beta2) * gW1 * gW1 vb1 = beta2 * vb1 + (1 - beta2) * gb1 * gb1 vW2 = beta2 * vW2 + (1 - beta2) * gW2 * gW2 vb2 = beta2 * vb2 + (1 - beta2) * gb2 * gb2 # bias correction correction1 = 1 - beta1 ** t hat_mW1 = mW1 / correction1 hat_mb1 = mb1 / correction1 hat_mW2 = mW2 / correction1 hat_mb2 = mb2 / correction1 correction2 = 1 - beta2 ** t hat_vW1 = vW1 / correction2 hat_vb1 = vb1 / correction2 hat_vW2 = vW2 / correction2 hat_vb2 = vb2 / correction2 # update t t += 1 # apply updates to the params W1 = W1 - lr0 * hat_mW1 / np.sqrt(hat_vW1 + eps) b1 = b1 - lr0 * hat_mb1 / np.sqrt(hat_vb1 + eps) W2 = W2 - lr0 * hat_mW2 / np.sqrt(hat_vW2 + eps) b2 = b2 - lr0 * hat_mb2 / np.sqrt(hat_vb2 + eps) if j % print_period == 0: pY, _ = forward(Xtest, W1, b1, W2, b2) l = cost(pY, Ytest_ind) loss_adam.append(l) print("Cost at iteration i=%d, j=%d: %.6f" % (i, j, l)) err = error_rate(pY, Ytest) err_adam.append(err) print("Error rate:", err) pY, _ = forward(Xtest, W1, b1, W2, b2) print("Final error rate:", error_rate(pY, Ytest)) # 2. 
RMSprop with momentum W1 = W1_0.copy() b1 = b1_0.copy() W2 = W2_0.copy() b2 = b2_0.copy() loss_rms = [] err_rms = [] # comparable hyperparameters for fair comparison lr0 = 0.001 mu = 0.9 decay_rate = 0.999 eps = 1e-8 # rmsprop cache cache_W2 = 1 cache_b2 = 1 cache_W1 = 1 cache_b1 = 1 # momentum dW1 = 0 db1 = 0 dW2 = 0 db2 = 0 for i in range(max_iter): for j in range(n_batches): Xbatch = Xtrain[j*batch_sz:(j*batch_sz + batch_sz),] Ybatch = Ytrain_ind[j*batch_sz:(j*batch_sz + batch_sz),] pYbatch, Z = forward(Xbatch, W1, b1, W2, b2) # updates gW2 = derivative_w2(Z, Ybatch, pYbatch) + reg*W2 cache_W2 = decay_rate*cache_W2 + (1 - decay_rate)*gW2*gW2 dW2 = mu * dW2 + (1 - mu) * lr0 * gW2 / (np.sqrt(cache_W2) + eps) W2 -= dW2 gb2 = derivative_b2(Ybatch, pYbatch) + reg*b2 cache_b2 = decay_rate*cache_b2 + (1 - decay_rate)*gb2*gb2 db2 = mu * db2 + (1 - mu) * lr0 * gb2 / (np.sqrt(cache_b2) + eps) b2 -= db2 gW1 = derivative_w1(Xbatch, Z, Ybatch, pYbatch, W2) + reg*W1 cache_W1 = decay_rate*cache_W1 + (1 - decay_rate)*gW1*gW1 dW1 = mu * dW1 + (1 - mu) * lr0 * gW1 / (np.sqrt(cache_W1) + eps) W1 -= dW1 gb1 = derivative_b1(Z, Ybatch, pYbatch, W2) + reg*b1 cache_b1 = decay_rate*cache_b1 + (1 - decay_rate)*gb1*gb1 db1 = mu * db1 + (1 - mu) * lr0 * gb1 / (np.sqrt(cache_b1) + eps) b1 -= db1 if j % print_period == 0: pY, _ = forward(Xtest, W1, b1, W2, b2) l = cost(pY, Ytest_ind) loss_rms.append(l) print("Cost at iteration i=%d, j=%d: %.6f" % (i, j, l)) err = error_rate(pY, Ytest) err_rms.append(err) print("Error rate:", err) pY, _ = forward(Xtest, W1, b1, W2, b2) print("Final error rate:", error_rate(pY, Ytest)) plt.plot(loss_adam, label='adam') plt.plot(loss_rms, label='rmsprop') plt.legend() plt.show()
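# A compact sketch (added) of the bias-corrected Adam step that the comparison above uses,
# written for a single parameter; the helper name adam_step and the packaging into one
# function are illustrative, not part of the original script. Note the scripts place eps
# either inside or outside the square root; both placements are common in practice.
import numpy as np

def adam_step(W, m, v, g, t, lr=1e-3, beta1=0.9, beta2=0.999, eps=1e-8):
    m = beta1 * m + (1 - beta1) * g         # 1st moment: running mean of gradients
    v = beta2 * v + (1 - beta2) * g * g     # 2nd moment: running mean of squared gradients
    m_hat = m / (1 - beta1 ** t)            # bias correction (t starts at 1)
    v_hat = v / (1 - beta2 ** t)
    W = W - lr * m_hat / (np.sqrt(v_hat) + eps)
    return W, m, v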
def main(): # compare 5 scenarios: # 1. batch SGD with constant learning rate # 2. batch SGD with RMSProp # 3. batch SGD with AdaGrad # 4. batch SGD with exponential decay np.random.seed(2) max_iter = 20 print_period = 10 X, Y = get_normalized_data() lr = 0.00004 reg = 0.01 Xtrain, Ytrain = X[:-1000, :], Y[:-1000] Xtest, Ytest = X[-1000:, :], Y[-1000:] Ytrain_ind = y2indicator(Ytrain) Ytest_ind = y2indicator(Ytest) N, D = Xtrain.shape M = 300 # number of hidden layer units K = len(set(Ytrain)) batch_size = 500 n_batches = N // batch_size # randomly initialize weights: W1 = np.random.randn(D, M) / np.sqrt(D) b1 = np.zeros(M) W2 = np.random.randn(M, K) / np.sqrt(M) b2 = np.zeros(K) # save initial weights: W1_0 = W1.copy() b1_0 = b1.copy() W2_0 = W2.copy() b2_0 = b2.copy() # 1. batch SGD with constant learning rate: LL_batch = [] CR_batch = [] t0 = datetime.now() print('\nperforming batch SGD with constant learning rate...') for i in range(max_iter): for j in range(n_batches): Xbatch = Xtrain[j * batch_size:(j + 1) * batch_size, :] Ybatch = Ytrain_ind[j * batch_size:(j + 1) * batch_size, :] p_Ybatch, Z = forward(Xbatch, W1, b1, W2, b2) #print(Z.shape, p_Ybatch.shape, Ybatch.shape) #print('First batch cost:', cost(p_Ybatch, Ybatch)) # updates: W2 -= lr * (derivative_W2(Z, Ybatch, p_Ybatch) + reg * W2) b2 -= lr * (derivative_b2(Ybatch, p_Ybatch) + reg * b2) W1 -= lr * (derivative_W1(Xbatch, Z, Ybatch, p_Ybatch, W2) + reg * W1) b1 -= lr * (derivative_b1(Z, Ybatch, p_Ybatch, W2) + reg * b1) if j % print_period == 0: pY, _ = forward(Xtest, W1, b1, W2, b2) #print('pY:', pY) ll = cost(pY, Ytest_ind) LL_batch.append(ll) print('\ni: %d, j: %d, cost: %.6f' % (i, j, ll)) error = error_rate(pY, Ytest) CR_batch.append(error) print('error rate:', error) dt1 = datetime.now() - t0 pY, _ = forward(Xtest, W1, b1, W2, b2) final_err1 = error_rate(pY, Ytest) # plot the cost #plt.plot(LL_batch) #plt.title('Cost for batch GD with const lr') #plt.show() # 2. 
batch GD with RMSProp: W1 = W1_0.copy() b1 = b1_0.copy() W2 = W2_0.copy() b2 = b2_0.copy() LL_RMSProp = [] CR_RMSProp = [] lr0 = 0.001 # initial learning rate cache_W2 = 1 cache_b2 = 1 cache_W1 = 1 cache_b1 = 1 decay = 0.999 eps = 10e-10 t0 = datetime.now() print('\nperforming batch SGD with RMSProp...') for i in range(max_iter): for j in range(n_batches): Xbatch = Xtrain[j * batch_size:(j + 1) * batch_size, :] Ybatch = Ytrain_ind[j * batch_size:(j + 1) * batch_size, :] p_Ybatch, Z = forward(Xbatch, W1, b1, W2, b2) #print(Z.shape, p_Ybatch.shape, Ybatch.shape) #print('First batch cost:', cost(p_Ybatch, Ybatch)) # updates: gW2 = (derivative_W2(Z, Ybatch, p_Ybatch) + reg * W2) cache_W2 = decay * cache_W2 + (1 - decay) * gW2 * gW2 W2 -= lr0 * gW2 / np.sqrt(cache_W2 + eps) gb2 = (derivative_b2(Ybatch, p_Ybatch) + reg * b2) cache_b2 = decay * cache_b2 + (1 - decay) * gb2 * gb2 b2 -= lr0 * gb2 / np.sqrt(cache_b2 + eps) gW1 = (derivative_W1(Xbatch, Z, Ybatch, p_Ybatch, W2) + reg * W1) cache_W1 = decay * cache_W1 + (1 - decay) * gW1 * gW1 W1 -= lr0 * gW1 / np.sqrt(cache_W1 + eps) gb1 = (derivative_b1(Z, Ybatch, p_Ybatch, W2) + reg * b1) cache_b1 = decay * cache_b1 + (1 - decay) * gb1 * gb1 b1 -= lr0 * gb1 / np.sqrt(cache_b1 + eps) if j % print_period == 0: pY, _ = forward(Xtest, W1, b1, W2, b2) #print('pY:', pY) ll = cost(pY, Ytest_ind) LL_RMSProp.append(ll) print('\ni: %d, j: %d, cost: %.6f' % (i, j, ll)) error = error_rate(pY, Ytest) CR_RMSProp.append(error) print('error rate:', error) dt2 = datetime.now() - t0 pY, _ = forward(Xtest, W1, b1, W2, b2) final_err2 = error_rate(pY, Ytest) # plot the cost #plt.plot(LL_RMSProp) #plt.title('Cost for batch SGD with RMSProp') #plt.show() # 3. batch SGD with AdaGrad: W1 = W1_0.copy() b1 = b1_0.copy() W2 = W2_0.copy() b2 = b2_0.copy() LL_AdaGrad = [] CR_AdaGrad = [] lr0 = 0.01 # initial learning rate cache_W2 = 1 cache_b2 = 1 cache_W1 = 1 cache_b1 = 1 eps = 10e-10 t0 = datetime.now() print('\nperforming batch SGD with AdaGrad...') for i in range(max_iter): for j in range(n_batches): Xbatch = Xtrain[j * batch_size:(j + 1) * batch_size, :] Ybatch = Ytrain_ind[j * batch_size:(j + 1) * batch_size, :] p_Ybatch, Z = forward(Xbatch, W1, b1, W2, b2) #print(Z.shape, p_Ybatch.shape, Ybatch.shape) #print('First batch cost:', cost(p_Ybatch, Ybatch)) # updates: gW2 = (derivative_W2(Z, Ybatch, p_Ybatch) + reg * W2) cache_W2 = cache_W2 + gW2 * gW2 W2 -= lr0 * gW2 / np.sqrt(cache_W2 + eps) gb2 = (derivative_b2(Ybatch, p_Ybatch) + reg * b2) cache_b2 = cache_b2 + gb2 * gb2 b2 -= lr0 * gb2 / np.sqrt(cache_b2 + eps) gW1 = (derivative_W1(Xbatch, Z, Ybatch, p_Ybatch, W2) + reg * W1) cache_W1 = cache_W1 + gW1 * gW1 W1 -= lr0 * gW1 / np.sqrt(cache_W1 + eps) gb1 = (derivative_b1(Z, Ybatch, p_Ybatch, W2) + reg * b1) cache_b1 = cache_b1 + gb1 * gb1 b1 -= lr0 * gb1 / np.sqrt(cache_b1 + eps) if j % print_period == 0: pY, _ = forward(Xtest, W1, b1, W2, b2) #print('pY:', pY) ll = cost(pY, Ytest_ind) LL_AdaGrad.append(ll) print('\ni: %d, j: %d, cost: %.6f' % (i, j, ll)) error = error_rate(pY, Ytest) CR_AdaGrad.append(error) print('error rate:', error) dt3 = datetime.now() - t0 pY, _ = forward(Xtest, W1, b1, W2, b2) final_err3 = error_rate(pY, Ytest) # plot the cost #plt.plot(LL_AdaGrad) #plt.title('Cost for batch SGD with AdaGrad') #plt.show() ''' # 4. 
batch SGD with exponential decay: W1 = W1_0.copy() b1 = b1_0.copy() W2 = W2_0.copy() b2 = b2_0.copy() LL_exp = [] CR_exp = [] lr0 = 0.0004 # initial learning rate k = 1e-7 t = 0 # initial log lr = lr0 t0 = datetime.now() print('\nperforming batch SGD with lr exponential decay...') for i in range(max_iter): for j in range(n_batches): Xbatch = Xtrain[j*batch_size:(j+1)*batch_size, :] Ybatch = Ytrain_ind[j*batch_size:(j+1)*batch_size, :] p_Ybatch, Z = forward(Xbatch, W1, b1, W2, b2) #print(Z.shape, p_Ybatch.shape, Ybatch.shape) #print('First batch cost:', cost(p_Ybatch, Ybatch)) # updates: gW2 = (derivative_W2(Z, Ybatch, p_Ybatch) + reg*W2) W2 -= lr*gW2 gb2 = (derivative_b2(Ybatch, p_Ybatch) + reg*b2) b2 -= lr*gb2 gW1 = (derivative_W1(Xbatch, Z, Ybatch, p_Ybatch, W2) + reg*W1) W1 -= lr*gW1 gb1 = (derivative_b1(Z, Ybatch, p_Ybatch, W2) + reg*b1) b1 -= lr*gb1 # decrease the learning rate lr = lr0 * np.exp(-k*t) t += 1 if j % print_period == 0: print('current learning rate:', lr) pY, _ = forward(Xtest, W1, b1, W2, b2) #print('pY:', pY) ll = cost(pY, Ytest_ind) LL_exp.append(ll) print('\ni: %d, j: %d, cost: %.6f' % (i, j, ll)) error = error_rate(pY, Ytest) CR_exp.append(error) print('error rate:', error) dt4 = datetime.now() - t0 pY, _ = forward(Xtest, W1, b1, W2, b2) print('\nFinal error rate:', error_rate(pY, Ytest)) print('Elapsed time for batch SGD with lr exponential decay:', dt4) # plot the cost #plt.plot(LL_exp) #plt.title('Cost for batch SGD with lr exponential decay') #plt.show() ''' print('\nBatch SGD with constant learning rate:') print('final error rate:', final_err1) print('elapsed time:', dt1) print('\nBatch SGD with RMSProp:') print('final error rate:', final_err2) print('elapsed time:', dt2) print('\nBatch SGD with AdaGrad:') print('final error rate:', final_err3) print('elapsed time:', dt3) # plot the costs together: plt.plot(LL_batch, label='const_lr') plt.plot(LL_RMSProp, label='RMSProp') plt.plot(LL_AdaGrad, label='AdaGrad') #plt.plot(LL_exp, label='lr_exp_decay') plt.legend() plt.show()
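# The commented-out fourth experiment above drives the step size with exponential decay.
# A minimal sketch (added) of that schedule and a common 1/t alternative, assuming the lr0
# and k values defined there; the helper names are illustrative only:
import numpy as np

def exp_decay(lr0, k, t):
    # lr(t) = lr0 * exp(-k*t): smooth exponential decay per update step
    return lr0 * np.exp(-k * t)

def inverse_time_decay(lr0, k, t):
    # lr(t) = lr0 / (1 + k*t): decays more slowly than the exponential schedule
    return lr0 / (1.0 + k * t)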
def main(): # 3 scenarios # 1. batch SGD # 2. batch SGD with momentum # 3. batch SGD with Nesterov momentum max_iter = 15 print_period = 10 Xtrain, Xtest, Ytrain, Ytest = get_normalized_data() lr = 0.0001 reg = 0.001 # Xtrain = X[:-1000, ] # Ytrain = Y[:-1000] # Xtest = X[-1000:, ] # Ytest = Y[-1000:, ] Ytrain_ind = y2indicator(Ytrain) Ytest_ind = y2indicator(Ytest) N, D = Xtrain.shape batch_sz = 500 n_batches = int(N / batch_sz) M = 300 K = 10 W1 = np.random.randn(D, M) / 28 b1 = np.zeros(M) W2 = np.random.randn(M, K) / np.sqrt(M) b2 = np.zeros(K) W1_0 = W1.copy() b1_0 = b1.copy() W2_0 = W2.copy() b2_0 = b2.copy() # Batch losses_batch = [] error_batch = [] for i in range(max_iter): for j in range(n_batches): Xbatch = Xtrain[j * batch_sz:(j * batch_sz + batch_sz), ] Ybatch = Ytrain_ind[j * batch_sz:(j * batch_sz + batch_sz), ] pYbatch, Z = forward(Xbatch, W1, b1, W2, b2) # updates W2 -= lr * (derivative_w2(Z, Ybatch, pYbatch) + reg * W2) b2 -= lr * (derivative_b2(Ybatch, pYbatch) + reg * b2) W1 -= lr * (derivative_w1(Xbatch, Z, Ybatch, pYbatch, W2) + reg * W1) b1 -= lr * (derivative_b1(Z, Ybatch, pYbatch, W2) + reg * b1) # A = ' ' # A = u"\n| |\n|----------------------| \n(\\__/) || \n(• v •) || \n / D" if j % print_period == 0: pY, _ = forward(Xtest, W1, b1, W2, b2) l = cost(pY, Ytest_ind) losses_batch.append(l) print("Cost at iteration i=%d, j=%d: %.6f" % (i, j, l)) # print( # u"|----------------------|\n| | \n Costo # en i=%d, j=%d: \n %.6f" % (i, j, l) + A) e = error_rate(pY, Ytest) error_batch.append(e) print("Ratio de error:", e) pY, _ = forward(Xtest, W1, b1, W2, b2) print("Final error rate: ", error_rate(pY, Ytest)) # Momentum W1 = W1_0.copy() b1 = b1.copy() W2 = W2.copy() b2 = b2.copy() losses_rms = [] errors_rms = [] lr0 = 0.001 cacheW2 = 0 cacheb2 = 0 cacheW1 = 0 cacheb1 = 0 decay_rate = 0.99 eps = 0.000001 for i in range(max_iter): for j in range(n_batches): Xbatch = Xtrain[j * batch_sz:(j * batch_sz + batch_sz), ] Ybatch = Ytrain_ind[j * batch_sz:(j * batch_sz + batch_sz), ] pYbatch, Z = forward(Xbatch, W1, b1, W2, b2) # gradients gW2 = (derivative_w2(Z, Ybatch, pYbatch) + reg * W2) gb2 = (derivative_b2(Ybatch, pYbatch) + reg * b2) gW1 = (derivative_w1(Xbatch, Z, Ybatch, pYbatch, W2) + reg * W1) gb1 = (derivative_b1(Z, Ybatch, pYbatch, W2) + reg * b1) # caches cacheW2 = decay_rate * cacheW2 + (1 - decay_rate) * gW2 * gW2 cacheb2 = decay_rate * cacheb2 + (1 - decay_rate) * gb2 * gb2 cacheW1 = decay_rate * cacheW1 + (1 - decay_rate) * gW1 * gW1 cacheb1 = decay_rate * cacheb1 + (1 - decay_rate) * gb1 * gb1 W2 -= lr0 * gW2 / (np.sqrt(cacheW2) + eps) b2 -= lr0 * gb2 / (np.sqrt(cacheb2) + eps) W1 -= lr0 * gW1 / (np.sqrt(cacheW1) + eps) b1 -= lr0 * gb1 / (np.sqrt(cacheb1) + eps) if j % print_period == 0: pY, _ = forward(Xtest, W1, b1, W2, b2) l = cost(pY, Ytest_ind) losses_rms.append(l) print("Cost at iteration i=%d, j=%d: %.6f" % (i, j, l)) e = error_rate(pY, Ytest) errors_rms.append(e) print("Error rate:", e) pY, _ = forward(Xtest, W1, b1, W2, b2) print("Final error rate: ", error_rate(pY, Ytest)) plt.plot(losses_batch, label='batch') plt.plot(losses_rms, label='rmsprop') plt.legend() plt.show()
def batch_grad(): #get data and for test and train sets X, Y = get_normalized_data() #XTrain = X[:-1000, :] #YTrain = Y[:-1000] #YTrain_ind = y2indicator(YTrain) #XTest = X[-1000:, :] #YTest = Y[-1000:] # = y2indicator(YTest) Y_ind = y2indicator(Y) batchSz = 500 #Initialize random weights N, D = X.shape K = len(set(Y)) M = 300 W1 = np.random.randn(D, M) b1 = np.random.randn(M) W2 = np.random.randn(M, K) b2 = np.random.randn(K) learning_rate = 10e-5 reg = 0.01 no_batches = int(N / batchSz) print("No of bathces: ", no_batches) for i in range(300): for n in range(no_batches): #get current batch XBatch = X[n * batchSz:(n * batchSz + batchSz), :] YBatch_ind = Y_ind[n * batchSz:(n * batchSz + batchSz), :] #Forward prop pY, Z = forward_relu(XBatch, W1, b1, W2, b2) #Backprop W2 += learning_rate * (derivative_w2(pY, YBatch_ind, Z) + reg * W2) b2 += learning_rate * (derivative_b2(pY, YBatch_ind) + reg * b2) W1 += learning_rate * ( derivative_w1(pY, YBatch_ind, W2, Z, XBatch) + reg * W1) b1 += learning_rate * (derivative_b1(pY, YBatch_ind, W2, Z) + reg * b1) if n % 100 == 0: #Forward prop #pY, Z = forward_relu(XBatch, W1, b1, W2, b2) YBatch = Y[n * batchSz:n * batchSz + batchSz] P = np.argmax(pY, axis=1) er = error_rate(P, YBatch) c = cost(YBatch_ind, pY) print("Loop: ", i, n, "Error rate: ", er, "Cost: ", c) pY, Z = forward_relu(X, W1, b1, W2, b2) p = np.argmax(pY, axis=1) print("Final training error rate: ", error_rate(p, Y)) XTest = get_test_data() pY, ZTest = forward_relu(XTest, W1, b1, W2, b2) YTest = np.argmax(pY, axis=1) f = open("test_result.csv", "w") f.write("ImageId,Label\n") n = YTest.shape[0] for i in range(n): f.write(str(i + 1) + "," + str(YTest[i]) + "\n") f.close()
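# The script above writes test_result.csv by hand with a Python loop. An equivalent
# sketch (added) using numpy.savetxt, assuming YTest already holds the predicted integer
# labels; the helper name write_submission is illustrative only:
import numpy as np

def write_submission(YTest, filename="test_result.csv"):
    ids = np.arange(1, len(YTest) + 1)
    rows = np.column_stack([ids, YTest])
    np.savetxt(filename, rows, fmt="%d", delimiter=",",
               header="ImageId,Label", comments="")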
def batch_grad(): #get data and for test and train sets X, Y = get_normalized_data() #XTrain = X[:-1000, :] #YTrain = Y[:-1000] #YTrain_ind = y2indicator(YTrain) #XTest = X[-1000:, :] #YTest = Y[-1000:] # = y2indicator(YTest) Y_ind = y2indicator(Y) batchSz = 500 #Initialize random weights N, D = X.shape K = len(set(Y)) M = 300 W1 = np.random.randn(D, M) b1 = np.random.randn(M) W2 = np.random.randn(M, K) b2 = np.random.randn(K) learning_rate = 10e-5 no_batches = int(N / batchSz) print("No of bathces: ", no_batches) for i in range(300): for n in range(no_batches): #get current batch XBatch = X[n * batchSz:(n * batchSz + batchSz), :] #YBatch = Y[n*batchSz:n*batchSz + batchSz] YBatch_ind = Y_ind[n * batchSz:(n * batchSz + batchSz), :] #Forward prop pY, Z = forward_relu(XBatch, W1, b1, W2, b2) #Backprop W2 += learning_rate * derivative_w2(pY, YBatch_ind, Z) b2 += learning_rate * derivative_b2(pY, YBatch_ind) W1 += learning_rate * derivative_w1(pY, YBatch_ind, W2, Z, XBatch) b1 += learning_rate * derivative_b1(pY, YBatch_ind, W2, Z) if n % 100 == 0: #Forward prop #pY, Z = forward_relu(XBatch, W1, b1, W2, b2) YBatch = Y[n * batchSz:n * batchSz + batchSz] P = np.argmax(pY, axis=1) er = error_rate(P, YBatch) c = cost(YBatch_ind, pY) print("Loop: ", i, n, "Error rate: ", er, "Cost: ", c) # pY, Z = forward_prop(XTrain, W1, b1, W2, b2) # P = np.argmax(pY, axis=1) # print("Final training error rate: ", error_rate(P, YTrain)) # # pY, Z = forward_prop(XTest, W1, b1, W2, b2) # P = np.argmax(pY, axis=1) # print("Final testing error rate: ", error_rate(P, YTest)) pY, Z = forward_relu(X, W1, b1, W2, b2) p = np.argmax(pY, axis=1) print("Final Final training error rate: ", error_rate(p, Y))
def main(): X, Y, _, _ = get_transformed_data() X = X[:, :300] # normalize X first mu = X.mean(axis=0) std = X.std(axis=0) X = (X - mu) / std print "Performing logistic regression..." Xtrain = X[:-1000, ] Ytrain = Y[:-1000] Xtest = X[-1000:, ] Ytest = Y[-1000:] N, D = Xtrain.shape Ytrain_ind = y2indicator(Ytrain) Ytest_ind = y2indicator(Ytest) # 1. full W = np.random.randn(D, 10) / 28 b = np.zeros(10) LL = [] lr = 0.0001 reg = 0.01 t0 = datetime.now() for i in xrange(200): p_y = forward(Xtrain, W, b) W += lr * (gradW(Ytrain_ind, p_y, Xtrain) - reg * W) b += lr * (gradb(Ytrain_ind, p_y) - reg * b) p_y_test = forward(Xtest, W, b) ll = cost(p_y_test, Ytest_ind) LL.append(ll) if i % 10 == 0: err = error_rate(p_y_test, Ytest) print "Cost at iteration %d: %.6f" % (i, ll) print "Error rate:", err p_y = forward(Xtest, W, b) print "Final error rate:", error_rate(p_y, Ytest) print "Elapsted time for full GD:", datetime.now() - t0 # 2. stochastic W = np.random.randn(D, 10) / 28 b = np.zeros(10) LL_stochastic = [] lr = 0.0001 reg = 0.01 t0 = datetime.now() for i in xrange( 1): # takes very long since we're computing cost for 41k samples tmpX, tmpY = shuffle(Xtrain, Ytrain_ind) for n in xrange(min(N, 500)): # shortcut so it won't take so long... x = tmpX[n, :].reshape(1, D) y = tmpY[n, :].reshape(1, 10) p_y = forward(x, W, b) W += lr * (gradW(y, p_y, x) - reg * W) b += lr * (gradb(y, p_y) - reg * b) p_y_test = forward(Xtest, W, b) ll = cost(p_y_test, Ytest_ind) LL_stochastic.append(ll) if n % (N / 2) == 0: err = error_rate(p_y_test, Ytest) print "Cost at iteration %d: %.6f" % (i, ll) print "Error rate:", err p_y = forward(Xtest, W, b) print "Final error rate:", error_rate(p_y, Ytest) print "Elapsted time for SGD:", datetime.now() - t0 # 3. batch W = np.random.randn(D, 10) / 28 b = np.zeros(10) LL_batch = [] lr = 0.0001 reg = 0.01 batch_sz = 500 n_batches = N / batch_sz t0 = datetime.now() for i in xrange(50): tmpX, tmpY = shuffle(Xtrain, Ytrain_ind) for j in xrange(n_batches): x = tmpX[j * batch_sz:(j * batch_sz + batch_sz), :] y = tmpY[j * batch_sz:(j * batch_sz + batch_sz), :] p_y = forward(x, W, b) W += lr * (gradW(y, p_y, x) - reg * W) b += lr * (gradb(y, p_y) - reg * b) p_y_test = forward(Xtest, W, b) ll = cost(p_y_test, Ytest_ind) LL_batch.append(ll) if j % (n_batches / 2) == 0: err = error_rate(p_y_test, Ytest) print "Cost at iteration %d: %.6f" % (i, ll) print "Error rate:", err p_y = forward(Xtest, W, b) print "Final error rate:", error_rate(p_y, Ytest) print "Elapsted time for batch GD:", datetime.now() - t0 x1 = np.linspace(0, 1, len(LL)) plt.plot(x1, LL, label="full") x2 = np.linspace(0, 1, len(LL_stochastic)) plt.plot(x2, LL_stochastic, label="stochastic") x3 = np.linspace(0, 1, len(LL_batch)) plt.plot(x3, LL_batch, label="batch") plt.legend() plt.show()
def main(): Xtrain, Xtest, Ytrain, Ytest = get_transformed_data() print("Performing logistic regression...") N, D = Xtrain.shape Ytrain_ind = y2indicator(Ytrain) Ytest_ind = y2indicator(Ytest) # 1. full W = np.random.randn(D, 10) / np.sqrt(D) b = np.zeros(10) LL = [] lr = 0.0001 reg = 0.01 t0 = datetime.now() for i in range(50): p_y = forward(Xtrain, W, b) W += lr*(gradW(Ytrain_ind, p_y, Xtrain) - reg*W) b += lr*(gradb(Ytrain_ind, p_y) - reg*b) p_y_test = forward(Xtest, W, b) ll = cost(p_y_test, Ytest_ind) LL.append(ll) if i % 1 == 0: err = error_rate(p_y_test, Ytest) if i % 10 == 0: print("Cost at iteration %d: %.6f" % (i, ll)) print("Error rate:", err) p_y = forward(Xtest, W, b) print("Final error rate:", error_rate(p_y, Ytest)) print("Elapsted time for full GD:", datetime.now() - t0) # 2. stochastic W = np.random.randn(D, 10) / np.sqrt(D) b = np.zeros(10) LL_stochastic = [] lr = 0.0001 reg = 0.01 t0 = datetime.now() for i in range(50): # takes very long since we're computing cost for 41k samples tmpX, tmpY = shuffle(Xtrain, Ytrain_ind) for n in range(min(N, 500)): # shortcut so it won't take so long... x = tmpX[n,:].reshape(1,D) y = tmpY[n,:].reshape(1,10) p_y = forward(x, W, b) W += lr*(gradW(y, p_y, x) - reg*W) b += lr*(gradb(y, p_y) - reg*b) p_y_test = forward(Xtest, W, b) ll = cost(p_y_test, Ytest_ind) LL_stochastic.append(ll) if i % 1 == 0: err = error_rate(p_y_test, Ytest) if i % 10 == 0: print("Cost at iteration %d: %.6f" % (i, ll)) print("Error rate:", err) p_y = forward(Xtest, W, b) print("Final error rate:", error_rate(p_y, Ytest)) print("Elapsted time for SGD:", datetime.now() - t0) # 3. batch W = np.random.randn(D, 10) / np.sqrt(D) b = np.zeros(10) LL_batch = [] lr = 0.0001 reg = 0.01 batch_sz = 500 n_batches = N // batch_sz t0 = datetime.now() for i in range(50): tmpX, tmpY = shuffle(Xtrain, Ytrain_ind) for j in range(n_batches): x = tmpX[j*batch_sz:(j*batch_sz + batch_sz),:] y = tmpY[j*batch_sz:(j*batch_sz + batch_sz),:] p_y = forward(x, W, b) W += lr*(gradW(y, p_y, x) - reg*W) b += lr*(gradb(y, p_y) - reg*b) p_y_test = forward(Xtest, W, b) ll = cost(p_y_test, Ytest_ind) LL_batch.append(ll) if i % 1 == 0: err = error_rate(p_y_test, Ytest) if i % 10 == 0: print("Cost at iteration %d: %.6f" % (i, ll)) print("Error rate:", err) p_y = forward(Xtest, W, b) print("Final error rate:", error_rate(p_y, Ytest)) print("Elapsted time for batch GD:", datetime.now() - t0) x1 = np.linspace(0, 1, len(LL)) plt.plot(x1, LL, label="full") x2 = np.linspace(0, 1, len(LL_stochastic)) plt.plot(x2, LL_stochastic, label="stochastic") x3 = np.linspace(0, 1, len(LL_batch)) plt.plot(x3, LL_batch, label="batch") plt.legend() plt.show()
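# The full / stochastic / batch comparison above slices minibatches by hand inside each
# loop. A small generator sketch (added) that yields shuffled minibatches instead; it
# assumes sklearn's shuffle (already used by these scripts) and the name
# iterate_minibatches is illustrative only:
import numpy as np
from sklearn.utils import shuffle

def iterate_minibatches(X, T, batch_sz=500):
    # reshuffle once per pass, then yield contiguous slices
    X, T = shuffle(X, T)
    n_batches = len(X) // batch_sz
    for j in range(n_batches):
        yield X[j*batch_sz:(j+1)*batch_sz], T[j*batch_sz:(j+1)*batch_sz]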
def main(): max_iter = 20 print_period = 10 X_train, X_test, Y_train, Y_test = get_normalized_data() lr = 0.00004 reg = 0.01 Y_train_ind = y2indicator(Y_train) Y_test_ind = y2indicator(Y_test) N, D = X_train.shape batch_sz = 500 n_batches = N // batch_sz M = 300 K = 10 W1 = np.random.randn(D, M) / 28 b1 = np.zeros(M) W2 = np.random.randn(M, K) / np.sqrt(K) b2 = np.zeros(K) # copy weights W1_0 = W1.copy() b1_0 = b1.copy() W2_0 = W2.copy() b2_0 = b2.copy() # 1. Constant Learning rate losses_batch = [] errors_batch = [] for i in range(max_iter): for j in range(n_batches): Xbatch = X_train[j * batch_sz:(j * batch_sz + batch_sz), ] Ybatch = Y_train_ind[j * batch_sz:(j * batch_sz + batch_sz), ] pYbatch, Z = forward(Xbatch, W1, b1, W2, b2) # updates W2 -= lr * (derivative_w2(Z, Ybatch, pYbatch) + reg * W2) b2 -= lr * (derivative_b2(Ybatch, pYbatch) + reg * b2) W1 -= lr * (derivative_w1(Xbatch, Z, Ybatch, pYbatch, W2) + reg * W1) b1 -= lr * (derivative_b1(Z, Ybatch, pYbatch, W2) + reg * b1) if j % print_period == 0: pY, _ = forward(X_test, W1, b1, W2, b2) l = cost(pY, Y_test_ind) losses_batch.append(l) print(f'Cost at iteration i={i}, j={j} : {l}') e = error_rate(pY, Y_test) errors_batch.append(e) print("error_rate", e) pY, _ = forward(X_test, W1, b1, W2, b2) batch_error = error_rate(pY, Y_test) print(f"Final batch error rate: {batch_error}") # 2. RMSProp W1 = W1_0.copy() b1 = b1_0.copy() W2 = W2_0.copy() b2 = b2_0.copy() losses_RMSP = [] errors_RMSP = [] lr0 = 0.001 cache_W1 = 1 cache_b1 = 1 cache_W2 = 1 cache_b2 = 1 decay = 0.999 epsilon = 1e-10 for i in range(max_iter): for j in range(n_batches): Xbatch = X_train[j * batch_sz:(j * batch_sz + batch_sz), ] Ybatch = Y_train_ind[j * batch_sz:(j * batch_sz + batch_sz), ] pYbatch, Z = forward(Xbatch, W1, b1, W2, b2) gW2 = derivative_w2(Z, Ybatch, pYbatch) + reg * W2 cache_W2 = decay * cache_W2 + (1 - decay) * gW2 * gW2 W2 -= lr0 * gW2 / (np.sqrt(cache_W2) + epsilon) gb2 = derivative_b2(Ybatch, pYbatch) + reg * b2 cache_b2 = decay * cache_b2 + (1 - decay) * gb2 * gb2 b2 -= lr0 * gb2 / (np.sqrt(cache_b2) + epsilon) gW1 = derivative_w1(Xbatch, Z, Ybatch, pYbatch, W2) + reg * W1 cache_W1 = decay * cache_W1 + (1 - decay) * gW1 * gW1 W1 -= lr0 * gW1 / (np.sqrt(cache_W1) + epsilon) gb1 = derivative_b1(Z, Ybatch, pYbatch, W2) + reg * b1 cache_b1 = decay * cache_b1 + (1 - decay) * gb1 * gb1 b1 -= lr0 * gb1 / (np.sqrt(cache_b1) + epsilon) if j % print_period == 0: pY, _ = forward(X_test, W1, b1, W2, b2) l = cost(pY, Y_test_ind) losses_RMSP.append(l) print(f'Cost at iteration i={i}, j={j} : {l}') e = error_rate(pY, Y_test) errors_RMSP.append(e) print("error_rate", e) pY, _ = forward(X_test, W1, b1, W2, b2) print(f"Final RMSProp error rate: {error_rate(pY, Y_test)}") print(f"Final batch error rate: {batch_error}") plt.plot(losses_batch, label='batch cost') plt.plot(losses_RMSP, label='RMSProp cost') plt.legend() plt.show()
def main(): # compare 2 scenarios: # 1. batch GD with RMSProp and momentum # 2. Adam GD max_iter = 20 print_period = 10 X, Y = get_normalized_data() reg = 0.01 Xtrain, Ytrain = X[:-1000, :], Y[:-1000] Xtest, Ytest = X[-1000:, :], Y[-1000:] Ytrain_ind = y2indicator(Ytrain) Ytest_ind = y2indicator(Ytest) N, D = Xtrain.shape M = 300 # number of hidden layer units K = len(set(Ytrain)) batch_size = 500 n_batches = N // batch_size # randomly initialize weights: W1_0 = np.random.randn(D, M) / np.sqrt(D) b1_0 = np.zeros(M) W2_0 = np.random.randn(M, K) / np.sqrt(M) b2_0 = np.zeros(K) # 1. batch GD with RMSProp and momentum: print('\nperforming batch GD with RMSProp and momentum...') W1 = W1_0.copy() b1 = b1_0.copy() W2 = W2_0.copy() b2 = b2_0.copy() LL_rm = [] CR_rm = [] # hyperparams: lr0 = 0.001 #lr0 = 0.0001 mu = 0.9 decay = 0.999 eps = 10e-9 # momentum (velocity terms): dW1 = 0 db1 = 0 dW2 = 0 db2 = 0 # rms-prop cache (with no bias correction): cache_W2 = 1 cache_b2 = 1 cache_W1 = 1 cache_b1 = 1 t0 = datetime.now() for i in range(max_iter): for j in range(n_batches): Xbatch = Xtrain[j * batch_size:(j + 1) * batch_size, :] Ybatch = Ytrain_ind[j * batch_size:(j + 1) * batch_size, :] p_Ybatch, Z = forward(Xbatch, W1, b1, W2, b2) #print(Z.shape, p_Ybatch.shape, Ybatch.shape) #print('First batch cost:', cost(p_Ybatch, Ybatch)) # updates: # (note: we utilize a bit different version of momentum) gW2 = (derivative_W2(Z, Ybatch, p_Ybatch) + reg * W2) cache_W2 = decay * cache_W2 + (1 - decay) * gW2 * gW2 dW2 = mu * dW2 + (1 - mu) * lr0 * gW2 / (np.sqrt(cache_W2 + eps)) W2 -= dW2 #dW2 = mu*dW2 - lr0*gW2 / (np.sqrt(cache_W2) + eps) #W2 += dW2 gb2 = (derivative_b2(Ybatch, p_Ybatch) + reg * b2) cache_b2 = decay * cache_b2 + (1 - decay) * gb2 * gb2 db2 = mu * db2 + (1 - mu) * lr0 * gb2 / (np.sqrt(cache_b2 + eps)) b2 -= db2 #db2 = mu*db2 - lr0*gb2 / (np.sqrt(cache_b2) + eps) #b2 += db2 gW1 = (derivative_W1(Xbatch, Z, Ybatch, p_Ybatch, W2) + reg * W1) cache_W1 = decay * cache_W1 + (1 - decay) * gW1 * gW1 dW1 = mu * dW1 + (1 - mu) * lr0 * gW1 / (np.sqrt(cache_W1 + eps)) W1 -= dW1 #dW1 = mu*dW1 - lr0*gW1 / (np.sqrt(cache_W1) + eps) #W1 += dW1 gb1 = (derivative_b1(Z, Ybatch, p_Ybatch, W2) + reg * b1) cache_b1 = decay * cache_b1 + (1 - decay) * gb1 * gb1 db1 = mu * db1 + (1 - mu) * lr0 * gb1 / (np.sqrt(cache_b1 + eps)) b1 -= db1 #db1 = mu*db1 - lr0*gb1 / (np.sqrt(cache_b1) + eps) #b1 += db1 if j % print_period == 0: pY, _ = forward(Xtest, W1, b1, W2, b2) #print('pY:', pY) ll = cost(pY, Ytest_ind) LL_rm.append(ll) print('\ni: %d, j: %d, cost: %.6f' % (i, j, ll)) error = error_rate(pY, Ytest) CR_rm.append(error) print('error rate:', error) dt1 = datetime.now() - t0 pY, _ = forward(Xtest, W1, b1, W2, b2) print('\nFinal error rate:', error_rate(pY, Ytest)) print('Elapsed time for batch GD with RMSProp and momentum:', dt1) # plot the cost plt.plot(LL_rm) plt.title('Cost for batch GD with RMSProp and momentum') plt.show() # 2. 
Adam optimizer print('\nperforming Adam optimizer...') W1 = W1_0.copy() b1 = b1_0.copy() W2 = W2_0.copy() b2 = b2_0.copy() # hyperparams: lr = 0.001 beta1 = 0.9 beta2 = 0.999 eps = 10e-9 # 1st moment: mW1 = 0 mb1 = 0 mW2 = 0 mb2 = 0 # 2nd moment: vW1 = 0 vb1 = 0 vW2 = 0 vb2 = 0 LL_adam = [] CR_adam = [] t0 = datetime.now() t = 1 # index; used instead of j, because j starts with 0 for i in range(max_iter): for j in range(n_batches): Xbatch = Xtrain[j * batch_size:(j + 1) * batch_size, :] Ybatch = Ytrain_ind[j * batch_size:(j + 1) * batch_size, :] p_Ybatch, Z = forward(Xbatch, W1, b1, W2, b2) # updates: # gradients: gW2 = derivative_W2(Z, Ybatch, p_Ybatch) + reg * W2 gb2 = derivative_b2(Ybatch, p_Ybatch) + reg * b2 gW1 = derivative_W1(Xbatch, Z, Ybatch, p_Ybatch, W2) + reg * W1 gb1 = derivative_b1(Z, Ybatch, p_Ybatch, W2) + reg * b1 # 1st moment: mW2 = beta1 * mW2 + (1 - beta1) * gW2 mb2 = beta1 * mb2 + (1 - beta1) * gb2 mW1 = beta1 * mW1 + (1 - beta1) * gW1 mb1 = beta1 * mb1 + (1 - beta1) * gb1 # 2nd moment: vW2 = beta2 * vW2 + (1 - beta2) * gW2 * gW2 vb2 = beta2 * vb2 + (1 - beta2) * gb2 * gb2 vW1 = beta2 * vW1 + (1 - beta2) * gW1 * gW1 vb1 = beta2 * vb1 + (1 - beta2) * gb1 * gb1 # bias correction: mW2_bc = mW2 / (1 - beta1**t) mb2_bc = mb2 / (1 - beta1**t) mW1_bc = mW1 / (1 - beta1**t) mb1_bc = mb1 / (1 - beta1**t) vW2_bc = vW2 / (1 - beta2**t) vb2_bc = vb2 / (1 - beta2**t) vW1_bc = vW1 / (1 - beta2**t) vb1_bc = vb1 / (1 - beta2**t) # weights and biases (parameters): W2 = W2 - lr * mW2_bc / np.sqrt(vW2_bc + eps) b2 = b2 - lr * mb2_bc / np.sqrt(vb2_bc + eps) W1 = W1 - lr * mW1_bc / np.sqrt(vW1_bc + eps) b1 = b1 - lr * mb1_bc / np.sqrt(vb1_bc + eps) t += 1 if j % print_period == 0: pY, _ = forward(Xtest, W1, b1, W2, b2) ll = cost(pY, Ytest_ind) LL_adam.append(ll) print('\ni: %d, j: %d, cost: %.6f' % (i, j, ll)) error = error_rate(pY, Ytest) CR_adam.append(error) print('error rate:', error) dt2 = datetime.now() - t0 pY, _ = forward(Xtest, W1, b1, W2, b2) print('\nFinal error rate:', error_rate(pY, Ytest)) print('Elapsed time for Adam optimizer:', dt2) # plot the cost plt.plot(LL_adam) plt.title('Cost for Adam optimizer') plt.show() # plot costs from the two experiments together: plt.plot(LL_rm, label='RMSProp with momentum') plt.plot(LL_adam, label='Adam optimizer') plt.title('Cost') plt.legend() plt.show()
def main(): # compare 3 scenarios: # 1. batch SGD # 2. batch SGD with momentum # 3. batch SGD with Nesterov momentum max_iter = 20 # make it 30 for sigmoid print_period = 10 X, Y = get_normalized_data() lr = 0.00004 reg = 0.01 Xtrain = X[:-1000, ] Ytrain = Y[:-1000] Xtest = X[-1000:, ] Ytest = Y[-1000:] Ytrain_ind = y2indicator(Ytrain) Ytest_ind = y2indicator(Ytest) N, D = Xtrain.shape batch_sz = 500 n_batches = N / batch_sz M = 300 K = 10 W1 = np.random.randn(D, M) / np.sqrt(D) b1 = np.zeros(M) W2 = np.random.randn(M, K) / np.sqrt(M) b2 = np.zeros(K) # 1. batch # cost = -16 LL_batch = [] CR_batch = [] for i in xrange(max_iter): for j in xrange(n_batches): Xbatch = Xtrain[j * batch_sz:(j * batch_sz + batch_sz), ] Ybatch = Ytrain_ind[j * batch_sz:(j * batch_sz + batch_sz), ] pYbatch, Z = forward(Xbatch, W1, b1, W2, b2) # print "first batch cost:", cost(pYbatch, Ybatch) # updates W2 -= lr * (derivative_w2(Z, Ybatch, pYbatch) + reg * W2) b2 -= lr * (derivative_b2(Ybatch, pYbatch) + reg * b2) W1 -= lr * (derivative_w1(Xbatch, Z, Ybatch, pYbatch, W2) + reg * W1) b1 -= lr * (derivative_b1(Z, Ybatch, pYbatch, W2) + reg * b1) if j % print_period == 0: # calculate just for LL pY, _ = forward(Xtest, W1, b1, W2, b2) # print "pY:", pY ll = cost(pY, Ytest_ind) LL_batch.append(ll) print "Cost at iteration i=%d, j=%d: %.6f" % (i, j, ll) err = error_rate(pY, Ytest) CR_batch.append(err) print "Error rate:", err pY, _ = forward(Xtest, W1, b1, W2, b2) print "Final error rate:", error_rate(pY, Ytest) # 2. batch with momentum W1 = np.random.randn(D, M) / np.sqrt(D) b1 = np.zeros(M) W2 = np.random.randn(M, K) / np.sqrt(M) b2 = np.zeros(K) LL_momentum = [] CR_momentum = [] mu = 0.9 dW2 = 0 db2 = 0 dW1 = 0 db1 = 0 for i in xrange(max_iter): for j in xrange(n_batches): Xbatch = Xtrain[j * batch_sz:(j * batch_sz + batch_sz), ] Ybatch = Ytrain_ind[j * batch_sz:(j * batch_sz + batch_sz), ] pYbatch, Z = forward(Xbatch, W1, b1, W2, b2) # updates dW2 = mu * dW2 - lr * (derivative_w2(Z, Ybatch, pYbatch) + reg * W2) W2 += dW2 db2 = mu * db2 - lr * (derivative_b2(Ybatch, pYbatch) + reg * b2) b2 += db2 dW1 = mu * dW1 - lr * ( derivative_w1(Xbatch, Z, Ybatch, pYbatch, W2) + reg * W1) W1 += dW1 db1 = mu * db1 - lr * (derivative_b1(Z, Ybatch, pYbatch, W2) + reg * b1) b1 += db1 if j % print_period == 0: # calculate just for LL pY, _ = forward(Xtest, W1, b1, W2, b2) # print "pY:", pY ll = cost(pY, Ytest_ind) LL_momentum.append(ll) print "Cost at iteration i=%d, j=%d: %.6f" % (i, j, ll) err = error_rate(pY, Ytest) CR_momentum.append(err) print "Error rate:", err pY, _ = forward(Xtest, W1, b1, W2, b2) print "Final error rate:", error_rate(pY, Ytest) # 3. 
batch with Nesterov momentum W1 = np.random.randn(D, M) / np.sqrt(D) b1 = np.zeros(M) W2 = np.random.randn(M, K) / np.sqrt(M) b2 = np.zeros(K) LL_nest = [] CR_nest = [] mu = 0.9 # alternate version uses dW # dW2 = 0 # db2 = 0 # dW1 = 0 # db1 = 0 vW2 = 0 vb2 = 0 vW1 = 0 vb1 = 0 for i in xrange(max_iter): for j in xrange(n_batches): # because we want g(t) = grad(f(W(t-1) - lr*mu*dW(t-1))) # dW(t) = mu*dW(t-1) + g(t) # W(t) = W(t-1) - mu*dW(t) W1_tmp = W1 - lr * mu * vW1 b1_tmp = b1 - lr * mu * vb1 W2_tmp = W2 - lr * mu * vW2 b2_tmp = b2 - lr * mu * vb2 Xbatch = Xtrain[j * batch_sz:(j * batch_sz + batch_sz), ] Ybatch = Ytrain_ind[j * batch_sz:(j * batch_sz + batch_sz), ] # pYbatch, Z = forward(Xbatch, W1, b1, W2, b2) pYbatch, Z = forward(Xbatch, W1_tmp, b1_tmp, W2_tmp, b2_tmp) # updates # dW2 = mu*mu*dW2 - (1 + mu)*lr*(derivative_w2(Z, Ybatch, pYbatch) + reg*W2) # W2 += dW2 # db2 = mu*mu*db2 - (1 + mu)*lr*(derivative_b2(Ybatch, pYbatch) + reg*b2) # b2 += db2 # dW1 = mu*mu*dW1 - (1 + mu)*lr*(derivative_w1(Xbatch, Z, Ybatch, pYbatch, W2) + reg*W1) # W1 += dW1 # db1 = mu*mu*db1 - (1 + mu)*lr*(derivative_b1(Z, Ybatch, pYbatch, W2) + reg*b1) # b1 += db1 vW2 = mu * vW2 + derivative_w2(Z, Ybatch, pYbatch) + reg * W2_tmp W2 -= lr * vW2 vb2 = mu * vb2 + derivative_b2(Ybatch, pYbatch) + reg * b2_tmp b2 -= lr * vb2 vW1 = mu * vW1 + derivative_w1(Xbatch, Z, Ybatch, pYbatch, W2_tmp) + reg * W1_tmp W1 -= lr * vW1 vb1 = mu * vb1 + derivative_b1(Z, Ybatch, pYbatch, W2_tmp) + reg * b1_tmp b1 -= lr * vb1 if j % print_period == 0: # calculate just for LL pY, _ = forward(Xtest, W1, b1, W2, b2) # print "pY:", pY ll = cost(pY, Ytest_ind) LL_nest.append(ll) print "Cost at iteration i=%d, j=%d: %.6f" % (i, j, ll) err = error_rate(pY, Ytest) CR_nest.append(err) print "Error rate:", err pY, _ = forward(Xtest, W1, b1, W2, b2) print "Final error rate:", error_rate(pY, Ytest) plt.plot(LL_batch, label="batch") plt.plot(LL_momentum, label="momentum") plt.plot(LL_nest, label="nesterov") plt.legend() plt.show()
def main(): ''' RMSprop is a form adaptative learning rate which decreases over time ''' max_iter = 20 #for RelU #max_iter = 30 #for sigmoid print_period = 10 X, Y = get_normalized_data() lr = 0.0004 reg = 0.01 Xtrain = X[:-1000,] Ytrain = Y[:-1000] Xtest = X[-1000:,] Ytest = Y[-1000:] Ytrain_ind = y2indicator(Ytrain) Ytest_ind = y2indicator(Ytest) N, D = Xtrain.shape batch_sz = 500 n_batches = N / batch_sz M =300 K=10 #1. batch SGD W1 = np.random.randn(D, M) / 28 b1 = np.zeros(M) W2 = np.random.randn(M, K) / np.sqrt(M) b2 = np.zeros(K) LL_batch = [] CR_batch = [] for i in xrange(max_iter): for j in xrange(n_batches): Xbatch = Xtrain[j*batch_sz:((j+1)*batch_sz), :] Ybatch = Ytrain_ind[j*batch_sz:((j+1)*batch_sz), :] pYbatch, Z = forward(Xbatch, W1, b1, W2, b2) W2 -= lr*(derivative_w2(Z, Ybatch, pYbatch) + reg*W2) b2 -= lr*(derivative_b2(Ybatch,pYbatch) + reg*b2) W1 -= lr*(derivative_w1(Xbatch, Z, Ybatch, pYbatch, W2) + reg*W1) b1 -= lr*(derivative_b1(Z, Ybatch, pYbatch, W2) + reg*b1) if j % print_period ==0: pY, _ = forward(Xtest, W1, b1, W2, b2) ll = cost(pY, Ytest_ind) LL_batch.append(ll) print "Cost at iteration i=%d, j=%d: %.6f" % (i, j, ll) err = error_rate(pY, Ytest) CR_batch.append(err) print "Error rate:", err pY, _ = forward(Xtest, W1, b1, W2, b2) print "Final error rate:", error_rate(pY, Ytest) #2. RMSProp W1 = np.random.randn(D, M) / 28 b1 = np.zeros(M) W2 = np.random.randn(M, K) / np.sqrt(M) b2 = np.zeros(K) LL_rms = [] CR_rms = [] lr0 = 0.001 cache_W2 = 0 cache_b2 = 0 cache_W1 = 0 cache_b1 = 0 decay_rate = 1 - 1e-5 eps = 1e-10 for i in xrange(max_iter): for j in xrange(n_batches): Xbatch = Xtrain[j*batch_sz:((j+1)*batch_sz), :] Ybatch = Ytrain_ind[j*batch_sz:((j+1)*batch_sz), :] pYbatch, Z = forward(Xbatch, W1, b1, W2, b2) #updates gW2 = derivative_w2(Z, Ybatch, pYbatch) + reg*W2 cache_W2 = decay_rate*cache_W2 + (1-decay_rate)*gW2*gW2 W2 -= lr0*gW2 /(np.sqrt(cache_W2) + eps) gb2 = derivative_b2(Ybatch,pYbatch) + reg*b2 cache_b2 = decay_rate*cache_b2 + (1-decay_rate)*gb2*gb2 b2 -= lr0*gb2 /(np.sqrt(cache_b2) + eps) gW1 = derivative_w1(Xbatch, Z, Ybatch, pYbatch, W2) + reg*W1 cache_W1 = decay_rate*cache_W1 + (1-decay_rate)*gW1*gW1 W1 -= lr0*gW1 /(np.sqrt(cache_W1) + eps) gb1 = derivative_b1(Z, Ybatch, pYbatch, W2) + reg*b1 cache_b1 = decay_rate*cache_b1 + (1-decay_rate)*gb1*gb1 b1 -= lr0*gb1 /(np.sqrt(cache_b1) + eps) if j % print_period ==0: pY, _ = forward(Xtest, W1, b1, W2, b2) ll = cost(pY, Ytest_ind) LL_rms.append(ll) print "RMS Cost at iteration i=%d, j=%d: %.6f" % (i, j, ll) err = error_rate(pY, Ytest) CR_rms.append(err) print "RMS Error rate:", err pY, _ = forward(Xtest, W1, b1, W2, b2) print "RMS Final error rate:", error_rate(pY, Ytest) plt.plot(LL_batch, label='batch') plt.plot(LL_rms, label='rms') plt.legend() plt.show()
def main(): # compare 3 scenarios: # 1. batch SGD # 2. batch SGD with momentum # 3. batch SGD with Nesterov momentum max_iter = 20 # make it 30 for sigmoid print_period = 50 Xtrain, Xtest, Ytrain, Ytest = get_normalized_data() lr = 0.00004 reg = 0.01 Ytrain_ind = y2indicator(Ytrain) Ytest_ind = y2indicator(Ytest) N, D = Xtrain.shape batch_sz = 500 n_batches = N // batch_sz M = 300 K = 10 W1 = np.random.randn(D, M) / np.sqrt(D) b1 = np.zeros(M) W2 = np.random.randn(M, K) / np.sqrt(M) b2 = np.zeros(K) # save initial weights W1_0 = W1.copy() b1_0 = b1.copy() W2_0 = W2.copy() b2_0 = b2.copy() # 1. batch losses_batch = [] errors_batch = [] for i in range(max_iter): for j in range(n_batches): Xbatch = Xtrain[j*batch_sz:(j*batch_sz + batch_sz),] Ybatch = Ytrain_ind[j*batch_sz:(j*batch_sz + batch_sz),] pYbatch, Z = forward(Xbatch, W1, b1, W2, b2) # print "first batch cost:", cost(pYbatch, Ybatch) # updates W2 -= lr*(derivative_w2(Z, Ybatch, pYbatch) + reg*W2) b2 -= lr*(derivative_b2(Ybatch, pYbatch) + reg*b2) W1 -= lr*(derivative_w1(Xbatch, Z, Ybatch, pYbatch, W2) + reg*W1) b1 -= lr*(derivative_b1(Z, Ybatch, pYbatch, W2) + reg*b1) if j % print_period == 0: pY, _ = forward(Xtest, W1, b1, W2, b2) l = cost(pY, Ytest_ind) losses_batch.append(l) print("Cost at iteration i=%d, j=%d: %.6f" % (i, j, l)) e = error_rate(pY, Ytest) errors_batch.append(e) print("Error rate:", e) pY, _ = forward(Xtest, W1, b1, W2, b2) print("Final error rate:", error_rate(pY, Ytest)) # 2. batch with momentum W1 = W1_0.copy() b1 = b1_0.copy() W2 = W2_0.copy() b2 = b2_0.copy() losses_momentum = [] errors_momentum = [] mu = 0.9 dW2 = 0 db2 = 0 dW1 = 0 db1 = 0 for i in range(max_iter): for j in range(n_batches): Xbatch = Xtrain[j*batch_sz:(j*batch_sz + batch_sz),] Ybatch = Ytrain_ind[j*batch_sz:(j*batch_sz + batch_sz),] pYbatch, Z = forward(Xbatch, W1, b1, W2, b2) # gradients gW2 = derivative_w2(Z, Ybatch, pYbatch) + reg*W2 gb2 = derivative_b2(Ybatch, pYbatch) + reg*b2 gW1 = derivative_w1(Xbatch, Z, Ybatch, pYbatch, W2) + reg*W1 gb1 = derivative_b1(Z, Ybatch, pYbatch, W2) + reg*b1 # update velocities dW2 = mu*dW2 - lr*gW2 db2 = mu*db2 - lr*gb2 dW1 = mu*dW1 - lr*gW1 db1 = mu*db1 - lr*gb1 # updates W2 += dW2 b2 += db2 W1 += dW1 b1 += db1 if j % print_period == 0: pY, _ = forward(Xtest, W1, b1, W2, b2) l = cost(pY, Ytest_ind) losses_momentum.append(l) print("Cost at iteration i=%d, j=%d: %.6f" % (i, j, l)) e = error_rate(pY, Ytest) errors_momentum.append(e) print("Error rate:", e) pY, _ = forward(Xtest, W1, b1, W2, b2) print("Final error rate:", error_rate(pY, Ytest)) # 3. 
batch with Nesterov momentum W1 = W1_0.copy() b1 = b1_0.copy() W2 = W2_0.copy() b2 = b2_0.copy() losses_nesterov = [] errors_nesterov = [] mu = 0.9 vW2 = 0 vb2 = 0 vW1 = 0 vb1 = 0 for i in range(max_iter): for j in range(n_batches): Xbatch = Xtrain[j*batch_sz:(j*batch_sz + batch_sz),] Ybatch = Ytrain_ind[j*batch_sz:(j*batch_sz + batch_sz),] pYbatch, Z = forward(Xbatch, W1, b1, W2, b2) # updates gW2 = derivative_w2(Z, Ybatch, pYbatch) + reg*W2 gb2 = derivative_b2(Ybatch, pYbatch) + reg*b2 gW1 = derivative_w1(Xbatch, Z, Ybatch, pYbatch, W2) + reg*W1 gb1 = derivative_b1(Z, Ybatch, pYbatch, W2) + reg*b1 # v update vW2 = mu*vW2 - lr*gW2 vb2 = mu*vb2 - lr*gb2 vW1 = mu*vW1 - lr*gW1 vb1 = mu*vb1 - lr*gb1 # param update W2 += mu*vW2 - lr*gW2 b2 += mu*vb2 - lr*gb2 W1 += mu*vW1 - lr*gW1 b1 += mu*vb1 - lr*gb1 if j % print_period == 0: pY, _ = forward(Xtest, W1, b1, W2, b2) l = cost(pY, Ytest_ind) losses_nesterov.append(l) print("Cost at iteration i=%d, j=%d: %.6f" % (i, j, l)) e = error_rate(pY, Ytest) errors_nesterov.append(e) print("Error rate:", e) pY, _ = forward(Xtest, W1, b1, W2, b2) print("Final error rate:", error_rate(pY, Ytest)) plt.plot(losses_batch, label="batch") plt.plot(losses_momentum, label="momentum") plt.plot(losses_nesterov, label="nesterov") plt.legend() plt.show()
            correction2 = 1 - beta2**t
            hat_vW1 = vW1 / correction2
            hat_vb1 = vb1 / correction2
            hat_vW2 = vW2 / correction2
            hat_vb2 = vb2 / correction2

            # update t
            t += 1

            # apply updates to the params
            W1 = W1 - lr0 * hat_mW1 / np.sqrt(hat_vW1 + eps)
            b1 = b1 - lr0 * hat_mb1 / np.sqrt(hat_vb1 + eps)
            W2 = W2 - lr0 * hat_mW2 / np.sqrt(hat_vW2 + eps)
            b2 = b2 - lr0 * hat_mb2 / np.sqrt(hat_vb2 + eps)

            pY, _ = forward(Xtest, W1, b1, W2, b2)
            l = cost(pY, Ytest_ind)
            loss_adam.append(l)
            if j % print_period == 0:
                print("Cost at iteration i=%d, j=%d: %.6f" % (i, j, l))
                err = error_rate(pY, Ytest)
                err_adam.append(err)
                print("Error rate:", err)

    pY, _ = forward(Xtest, W1, b1, W2, b2)
    print("Final error rate:", error_rate(pY, Ytest))

    plt.plot(loss_adam, label='adam')
    plt.legend()
    plt.show()
def main(): max_iter = 20 # make it 30 for sigmoid print_period = 10 X, Y = get_normalized_data() lr = 0.00004 reg = 0.01 Xtrain = X[:-1000,] Ytrain = Y[:-1000] Xtest = X[-1000:,] Ytest = Y[-1000:] Ytrain_ind = y2indicator(Ytrain) Ytest_ind = y2indicator(Ytest) N, D = Xtrain.shape batch_sz = 500 n_batches = N / batch_sz M = 300 K = 10 W1 = np.random.randn(D, M) / 28 b1 = np.zeros(M) W2 = np.random.randn(M, K) / np.sqrt(M) b2 = np.zeros(K) # 1. const # cost = -16 LL_batch = [] CR_batch = [] for i in xrange(max_iter): for j in xrange(n_batches): Xbatch = Xtrain[j*batch_sz:(j*batch_sz + batch_sz),] Ybatch = Ytrain_ind[j*batch_sz:(j*batch_sz + batch_sz),] pYbatch, Z = forward(Xbatch, W1, b1, W2, b2) # print "first batch cost:", cost(pYbatch, Ybatch) # updates W2 -= lr*(derivative_w2(Z, Ybatch, pYbatch) + reg*W2) b2 -= lr*(derivative_b2(Ybatch, pYbatch) + reg*b2) W1 -= lr*(derivative_w1(Xbatch, Z, Ybatch, pYbatch, W2) + reg*W1) b1 -= lr*(derivative_b1(Z, Ybatch, pYbatch, W2) + reg*b1) if j % print_period == 0: # calculate just for LL pY, _ = forward(Xtest, W1, b1, W2, b2) # print "pY:", pY ll = cost(pY, Ytest_ind) LL_batch.append(ll) print "Cost at iteration i=%d, j=%d: %.6f" % (i, j, ll) err = error_rate(pY, Ytest) CR_batch.append(err) print "Error rate:", err pY, _ = forward(Xtest, W1, b1, W2, b2) print "Final error rate:", error_rate(pY, Ytest) # 2. RMSprop W1 = np.random.randn(D, M) / 28 b1 = np.zeros(M) W2 = np.random.randn(M, K) / np.sqrt(M) b2 = np.zeros(K) LL_rms = [] CR_rms = [] lr0 = 0.001 # if you set this too high you'll get NaN! cache_W2 = 0 cache_b2 = 0 cache_W1 = 0 cache_b1 = 0 decay_rate = 0.999 eps = 0.0000000001 for i in xrange(max_iter): for j in xrange(n_batches): Xbatch = Xtrain[j*batch_sz:(j*batch_sz + batch_sz),] Ybatch = Ytrain_ind[j*batch_sz:(j*batch_sz + batch_sz),] pYbatch, Z = forward(Xbatch, W1, b1, W2, b2) # print "first batch cost:", cost(pYbatch, Ybatch) # updates gW2 = derivative_w2(Z, Ybatch, pYbatch) + reg*W2 cache_W2 = decay_rate*cache_W2 + (1 - decay_rate)*gW2*gW2 W2 -= lr0 * gW2 / (np.sqrt(cache_W2) + eps) gb2 = derivative_b2(Ybatch, pYbatch) + reg*b2 cache_b2 = decay_rate*cache_b2 + (1 - decay_rate)*gb2*gb2 b2 -= lr0 * gb2 / (np.sqrt(cache_b2) + eps) gW1 = derivative_w1(Xbatch, Z, Ybatch, pYbatch, W2) + reg*W1 cache_W1 = decay_rate*cache_W1 + (1 - decay_rate)*gW1*gW1 W1 -= lr0 * gW1 / (np.sqrt(cache_W1) + eps) gb1 = derivative_b1(Z, Ybatch, pYbatch, W2) + reg*b1 cache_b1 = decay_rate*cache_b1 + (1 - decay_rate)*gb1*gb1 b1 -= lr0 * gb1 / (np.sqrt(cache_b1) + eps) if j % print_period == 0: # calculate just for LL pY, _ = forward(Xtest, W1, b1, W2, b2) # print "pY:", pY ll = cost(pY, Ytest_ind) LL_rms.append(ll) print "Cost at iteration i=%d, j=%d: %.6f" % (i, j, ll) err = error_rate(pY, Ytest) CR_rms.append(err) print "Error rate:", err pY, _ = forward(Xtest, W1, b1, W2, b2) print "Final error rate:", error_rate(pY, Ytest) plt.plot(LL_batch, label='const') plt.plot(LL_rms, label='rms') plt.legend() plt.show()
def main(): Xtrain, Xtest, Ytrain, Ytest = get_transformed_data() print('logistic regression') # randomly assign weights N, D = Xtrain.shape Ytrain_ind = y2indicator(Ytrain) Ytest_ind = y2indicator(Ytest) M = 10 scale = 28 # full grad descent W, b = initwb(D, M, scale) LL = [] lr = 0.0001 reg = 0.01 t0 = datetime.now() for i in range(200): P_Y = forward(Xtrain, W, b) W += lr * (gradW(Ytrain_ind, P_Y, Xtrain) - reg * W) b += lr * (gradb(Ytrain_ind, P_Y) - reg * b) P_Y_test = forward(Xtest, W, b) ll = cost(P_Y_test, Ytest_ind) LL.append(ll) if i % 10 == 0: err = error_rate(P_Y_test, Ytest) print("cost at iter: %d: %.6f" % (i, ll)) print("error rate: ", err, "\n") P_Y = forward(Xtest, W, b) print("final error: ", error_rate(P_Y, Ytest)) print("elapsed time for full GD: ", datetime.now() - t0) # 2. Stochastic W, b = initwb(D, M, scale) LL_stochastic = [] lr = 0.0001 reg = 0.01 t0 = datetime.now() for i in range(1): tmpX, tmpY = shuffle(Xtrain, Ytrain_ind) for n in range(min(N, 500)): x = tmpX[n, :].reshape(1, D) y = tmpY[n, :].reshape(1, 10) P_Y = forward(x, W, b) W += lr * (gradW(y, P_Y, x) - reg * W) b += lr * (gradb(y, P_Y) - reg * b) P_Y_test = forward(Xtest, W, b) ll = cost(P_Y_test, Ytest_ind) LL_stochastic.append(ll) if n % (N / 2) == 0: err = error_rate(P_Y_test, Ytest) print("Cost at iteration %d: %6.f" % (i, ll)) print("error rate: ", err) P_Y = forward(Xtest, W, b) print("error rate: ", error_rate(P_Y, Ytest)) print("elapsed time for SGD: ", datetime.now() - t0) # batch W, b = initwb(D, M, scale) LL_batch = [] lr = 0.001 reg = 0.01 batch_sz = 500 n_batches = N // batch_sz t0 = datetime.now() for i in range(50): tmpX, tmpY = shuffle(Xtrain, Ytrain_ind) for j in range(n_batches): x = tmpX[j * batch_sz:(j * batch_sz + batch_sz), :] y = tmpY[j * batch_sz:(j * batch_sz + batch_sz), :] P_Y = forward(x, W, b) W += lr * (gradW(y, P_Y, x) - reg * W) b += lr * (gradb(y, P_Y) - reg * b) P_Y_test = forward(Xtest, W, b) ll = cost(P_Y_test, Ytest_ind) LL_batch.append(ll) if j % (n_batches / 2) == 0: err = error_rate(P_Y_test, Ytest) print("Cost at iteration %d: %6.f" % (i, ll)) print("error rate: ", err) P_Y = forward(Xtest, W, b) print("error rate: ", error_rate(P_Y, Ytest)) print("elapsed time for SGD: ", datetime.now() - t0) x1 = np.linspace(0, 1, len(LL)) plt.plot(x1, LL, label="full") x2 = np.linspace(0, 1, len(LL_stochastic)) plt.plot(x2, LL_stochastic, label="stochastic") x3 = np.linspace(0, 1, len(LL_batch)) plt.plot(x3, LL_batch, label="batch") plt.legend() plt.show()