def main(): max_iter = 20 # make it 30 for sigmoid print_period = 10 X, Y = get_normalized_data() lr = 0.00004 reg = 0.01 Xtrain = X[:-1000, ] Ytrain = Y[:-1000] Xtest = X[-1000:, ] Ytest = Y[-1000:] Ytrain_ind = y2indicator(Ytrain) Ytest_ind = y2indicator(Ytest) N, D = Xtrain.shape batch_sz = 500 n_batches = int(N / batch_sz) M = 300 K = 10 W1 = np.random.randn(D, M) / 28 b1 = np.zeros(M) W2 = np.random.randn(M, K) / np.sqrt(M) b2 = np.zeros(K) # 1. const # cost = -16 LL_batch = [] CR_batch = [] for i in range(max_iter): for j in range(n_batches): Xbatch = Xtrain[j * batch_sz:(j * batch_sz + batch_sz), ] Ybatch = Ytrain_ind[j * batch_sz:(j * batch_sz + batch_sz), ] pYbatch, Z = forward(Xbatch, W1, b1, W2, b2) # print "first batch cost:", cost(pYbatch, Ybatch) # updates W2 -= lr * (derivative_w2(Z, Ybatch, pYbatch) + reg * W2) b2 -= lr * (derivative_b2(Ybatch, pYbatch) + reg * b2) W1 -= lr * (derivative_w1(Xbatch, Z, Ybatch, pYbatch, W2) + reg * W1) b1 -= lr * (derivative_b1(Z, Ybatch, pYbatch, W2) + reg * b1) if j % print_period == 0: # calculate just for LL pY, _ = forward(Xtest, W1, b1, W2, b2) # print "pY:", pY ll = cost(pY, Ytest_ind) LL_batch.append(ll) print("Cost at iteration i=%d, j=%d: %.6f" % (i, j, ll)) err = error_rate(pY, Ytest) CR_batch.append(err) print("Error rate:", err) pY, _ = forward(Xtest, W1, b1, W2, b2) print("Final error rate:", error_rate(pY, Ytest)) # 2. RMSprop W1 = np.random.randn(D, M) / 28 b1 = np.zeros(M) W2 = np.random.randn(M, K) / np.sqrt(M) b2 = np.zeros(K) LL_rms = [] CR_rms = [] lr0 = 0.001 # if you set this too high you'll get NaN! cache_W2 = 0 cache_b2 = 0 cache_W1 = 0 cache_b1 = 0 decay_rate = 0.999 eps = 0.0000000001 for i in range(max_iter): for j in range(n_batches): Xbatch = Xtrain[j * batch_sz:(j * batch_sz + batch_sz), ] Ybatch = Ytrain_ind[j * batch_sz:(j * batch_sz + batch_sz), ] pYbatch, Z = forward(Xbatch, W1, b1, W2, b2) # print "first batch cost:", cost(pYbatch, Ybatch) # updates gW2 = derivative_w2(Z, Ybatch, pYbatch) + reg * W2 cache_W2 = decay_rate * cache_W2 + (1 - decay_rate) * gW2 * gW2 W2 -= lr0 * gW2 / (np.sqrt(cache_W2) + eps) gb2 = derivative_b2(Ybatch, pYbatch) + reg * b2 cache_b2 = decay_rate * cache_b2 + (1 - decay_rate) * gb2 * gb2 b2 -= lr0 * gb2 / (np.sqrt(cache_b2) + eps) gW1 = derivative_w1(Xbatch, Z, Ybatch, pYbatch, W2) + reg * W1 cache_W1 = decay_rate * cache_W1 + (1 - decay_rate) * gW1 * gW1 W1 -= lr0 * gW1 / (np.sqrt(cache_W1) + eps) gb1 = derivative_b1(Z, Ybatch, pYbatch, W2) + reg * b1 cache_b1 = decay_rate * cache_b1 + (1 - decay_rate) * gb1 * gb1 b1 -= lr0 * gb1 / (np.sqrt(cache_b1) + eps) if j % print_period == 0: # calculate just for LL pY, _ = forward(Xtest, W1, b1, W2, b2) # print "pY:", pY ll = cost(pY, Ytest_ind) LL_rms.append(ll) print("Cost at iteration i=%d, j=%d: %.6f" % (i, j, ll)) err = error_rate(pY, Ytest) CR_rms.append(err) print("Error rate:", err) pY, _ = forward(Xtest, W1, b1, W2, b2) print("Final error rate:", error_rate(pY, Ytest)) plt.plot(LL_batch, label='const') plt.plot(LL_rms, label='rms') plt.legend() plt.show()
def main(): max_iter = 20 print_period = 20 Xtrain, Xtest, Ytrain, Ytest = get_normalized_data() Ytrain_ind = y2indicator(Ytrain) # Target of train data Ytest_ind = y2indicator(Ytest) # Target of test data lr = 0.00004 reg = 0.01 N, D = Xtrain.shape M = 300 K = 10 np.random.seed(123) W1 = np.random.randn(D, M) / np.sqrt(D) b1 = np.zeros(M) W2 = np.random.randn(M, K) / np.sqrt(M) b2 = np.zeros(K) batch_sz = 500 n_batches = N // batch_sz # 82 # save initial weights W1_0 = W1.copy() b1_0 = b1.copy() W2_0 = W2.copy() b2_0 = b2.copy() # 1. learning rate = constant losses_batch = [] errors_batch = [] for i in range(max_iter): # Xtrain, Ytrain_ind = shuffle(Xtrain, Ytrain_ind) for j in range(n_batches): Xbatch = Xtrain[j*batch_sz:(j*batch_sz + batch_sz),] Ybatch = Ytrain_ind[j*batch_sz:(j*batch_sz + batch_sz),] # Target of each batch pYbatch, Z = forward(Xbatch, W1, b1, W2, b2) # print "first batch cost:", cost(pYbatch, Ybatch) # updates W2 -= lr*(derivative_w2(Z, Ybatch, pYbatch) + reg*W2) b2 -= lr*(derivative_b2(Ybatch, pYbatch) + reg*b2) W1 -= lr*(derivative_w1(Xbatch, Z, Ybatch, pYbatch, W2) + reg*W1) b1 -= lr*(derivative_b1(Z, Ybatch, pYbatch, W2) + reg*b1) if j % print_period == 0: pY, _ = forward(Xtest, W1, b1, W2, b2) l = cost(pY, Ytest_ind) losses_batch.append(l) print("Cost at iteration i=%d, j=%d: %.6f" % (i, j, l)) e = error_rate(pY, Ytest) errors_batch.append(e) print("Error rate:", e) pY, _ = forward(Xtest, W1, b1, W2, b2) print("Final error rate:", error_rate(pY, Ytest)) # 2. RMSprop W1 = W1_0.copy() b1 = b1_0.copy() W2 = W2_0.copy() b2 = b2_0.copy() losses_rms = [] errors_rms = [] ''' in RMSprop you can use a bigger lr, but if you set this too high you'll get NaN! if you use the same learning rate within RMSprop and General method, there is only slight difference between them. 
''' lr0 = 0.001 cache_W2 = 1 cache_b2 = 1 cache_W1 = 1 cache_b1 = 1 decay_rate = 0.999 eps = 1e-10 for i in range(max_iter): # Xtrain, Ytrain_ind = shuffle(Xtrain, Ytrain_ind) for j in range(n_batches): Xbatch = Xtrain[j*batch_sz:(j*batch_sz + batch_sz),] Ybatch = Ytrain_ind[j*batch_sz:(j*batch_sz + batch_sz),] # Target of each batch pYbatch, Z = forward(Xbatch, W1, b1, W2, b2) # print "first batch cost:", cost(pYbatch, Ybatch) # # update # cache_W2 = decay_rate*cache_W2 + (1 - decay_rate)*np.square(derivative_w2(Z, Ybatch, pYbatch) + reg*W2) # W2 -= lr0 / (np.sqrt(cache_W2) + eps) *(derivative_w2(Z, Ybatch, pYbatch) + reg*W2) # cache_b2 = decay_rate*cache_b2 + (1 - decay_rate)*np.square(derivative_b2(Ybatch, pYbatch) + reg*b2) # b2 -= lr0 / (np.sqrt(cache_b2) + eps) *(derivative_b2(Ybatch, pYbatch) + reg*b2) # cache_W1 = decay_rate*cache_W1 + (1 - decay_rate)*np.square(derivative_w1(Xbatch, Z, Ybatch, pYbatch, W2)) # W1 -= lr0 / (np.sqrt(cache_W1) + eps) *(derivative_w1(Xbatch, Z, Ybatch, pYbatch, W2) + reg*W1) # cache_b1 = decay_rate*cache_b1 + (1 - decay_rate)*np.square(derivative_b1(Z, Ybatch, pYbatch, W2) + reg*b1) # b1 -= lr0 / (np.sqrt(cache_b1) + eps) *(derivative_b1(Z, Ybatch, pYbatch, W2) + reg*b1) # updates # a smarter way to write the block above: pull the repeated sub-expressions out into variables so each gradient is computed only once, which speeds things up gW2 = derivative_w2(Z, Ybatch, pYbatch) + reg*W2 cache_W2 = decay_rate*cache_W2 + (1 - decay_rate)*gW2*gW2 W2 -= lr0 * gW2 / (np.sqrt(cache_W2) + eps) gb2 = derivative_b2(Ybatch, pYbatch) + reg*b2 cache_b2 = decay_rate*cache_b2 + (1 - decay_rate)*gb2*gb2 b2 -= lr0 * gb2 / (np.sqrt(cache_b2) + eps) gW1 = derivative_w1(Xbatch, Z, Ybatch, pYbatch, W2) + reg*W1 cache_W1 = decay_rate*cache_W1 + (1 - decay_rate)*gW1*gW1 W1 -= lr0 * gW1 / (np.sqrt(cache_W1) + eps) gb1 = derivative_b1(Z, Ybatch, pYbatch, W2) + reg*b1 cache_b1 = decay_rate*cache_b1 + (1 - decay_rate)*gb1*gb1 b1 -= lr0 * gb1 / (np.sqrt(cache_b1) + eps) if j % print_period == 0: pY, _ = forward(Xtest, W1, b1, W2, b2) l = cost(pY, Ytest_ind) losses_rms.append(l) print("Cost at iteration i=%d, j=%d: %.6f" % (i, j, l)) e = error_rate(pY, Ytest) errors_rms.append(e) print("Error rate:", e) pY, _ = forward(Xtest, W1, b1, W2, b2) print("Final error rate:", error_rate(pY, Ytest)) plt.plot(losses_batch, label='constant') plt.plot(losses_rms, label='RMSprop') plt.legend() plt.show()
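The refactored loop above still repeats the same three-line pattern (gradient, cache, update) once per parameter. An optional further step is to wrap the per-parameter RMSprop rule in a small helper; this is only a sketch, and rmsprop_step is a hypothetical name, not part of the course's util module:

import numpy as np

def rmsprop_step(param, grad, cache, lr0=0.001, decay_rate=0.999, eps=1e-10):
    # one RMSprop update for a single parameter array; mutates param in place
    cache = decay_rate * cache + (1 - decay_rate) * grad * grad
    param -= lr0 * grad / (np.sqrt(cache) + eps)
    return cache

# usage inside the batch loop, mirroring the names in the script above:
# cache_W2 = rmsprop_step(W2, derivative_w2(Z, Ybatch, pYbatch) + reg*W2, cache_W2)
# cache_b2 = rmsprop_step(b2, derivative_b2(Ybatch, pYbatch) + reg*b2, cache_b2)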
def main(): # compare 3 scenarios: # 1. batch SGD # 2. batch SGD with momentum # 3. batch SGD with Nesterov momentum max_iter = 20 # make it 30 for sigmoid print_period = 10 X, Y = get_normalized_data() lr = 0.00004 reg = 0.01 Xtrain = X[:-1000, ] Ytrain = Y[:-1000] Xtest = X[-1000:, ] Ytest = Y[-1000:] Ytrain_ind = y2indicator(Ytrain) Ytest_ind = y2indicator(Ytest) N, D = Xtrain.shape batch_sz = 500 n_batches = N / batch_sz M = 300 K = 10 W1 = np.random.randn(D, M) / np.sqrt(D) b1 = np.zeros(M) W2 = np.random.randn(M, K) / np.sqrt(M) b2 = np.zeros(K) # 1. batch # cost = -16 LL_batch = [] CR_batch = [] for i in xrange(max_iter): for j in xrange(n_batches): Xbatch = Xtrain[j * batch_sz:(j * batch_sz + batch_sz), ] Ybatch = Ytrain_ind[j * batch_sz:(j * batch_sz + batch_sz), ] pYbatch, Z = forward(Xbatch, W1, b1, W2, b2) # print "first batch cost:", cost(pYbatch, Ybatch) # updates W2 -= lr * (derivative_w2(Z, Ybatch, pYbatch) + reg * W2) b2 -= lr * (derivative_b2(Ybatch, pYbatch) + reg * b2) W1 -= lr * (derivative_w1(Xbatch, Z, Ybatch, pYbatch, W2) + reg * W1) b1 -= lr * (derivative_b1(Z, Ybatch, pYbatch, W2) + reg * b1) if j % print_period == 0: # calculate just for LL pY, _ = forward(Xtest, W1, b1, W2, b2) # print "pY:", pY ll = cost(pY, Ytest_ind) LL_batch.append(ll) print "Cost at iteration i=%d, j=%d: %.6f" % (i, j, ll) err = error_rate(pY, Ytest) CR_batch.append(err) print "Error rate:", err pY, _ = forward(Xtest, W1, b1, W2, b2) print "Final error rate:", error_rate(pY, Ytest) # 2. batch with momentum W1 = np.random.randn(D, M) / np.sqrt(D) b1 = np.zeros(M) W2 = np.random.randn(M, K) / np.sqrt(M) b2 = np.zeros(K) LL_momentum = [] CR_momentum = [] mu = 0.9 dW2 = 0 db2 = 0 dW1 = 0 db1 = 0 for i in xrange(max_iter): for j in xrange(n_batches): Xbatch = Xtrain[j * batch_sz:(j * batch_sz + batch_sz), ] Ybatch = Ytrain_ind[j * batch_sz:(j * batch_sz + batch_sz), ] pYbatch, Z = forward(Xbatch, W1, b1, W2, b2) # updates dW2 = mu * dW2 - lr * (derivative_w2(Z, Ybatch, pYbatch) + reg * W2) W2 += dW2 db2 = mu * db2 - lr * (derivative_b2(Ybatch, pYbatch) + reg * b2) b2 += db2 dW1 = mu * dW1 - lr * ( derivative_w1(Xbatch, Z, Ybatch, pYbatch, W2) + reg * W1) W1 += dW1 db1 = mu * db1 - lr * (derivative_b1(Z, Ybatch, pYbatch, W2) + reg * b1) b1 += db1 if j % print_period == 0: # calculate just for LL pY, _ = forward(Xtest, W1, b1, W2, b2) # print "pY:", pY ll = cost(pY, Ytest_ind) LL_momentum.append(ll) print "Cost at iteration i=%d, j=%d: %.6f" % (i, j, ll) err = error_rate(pY, Ytest) CR_momentum.append(err) print "Error rate:", err pY, _ = forward(Xtest, W1, b1, W2, b2) print "Final error rate:", error_rate(pY, Ytest) # 3. 
batch with Nesterov momentum W1 = np.random.randn(D, M) / np.sqrt(D) b1 = np.zeros(M) W2 = np.random.randn(M, K) / np.sqrt(M) b2 = np.zeros(K) LL_nest = [] CR_nest = [] mu = 0.9 # alternate version uses dW # dW2 = 0 # db2 = 0 # dW1 = 0 # db1 = 0 vW2 = 0 vb2 = 0 vW1 = 0 vb1 = 0 for i in xrange(max_iter): for j in xrange(n_batches): # because we want g(t) = grad(f(W(t-1) - lr*mu*dW(t-1))) # dW(t) = mu*dW(t-1) + g(t) # W(t) = W(t-1) - mu*dW(t) W1_tmp = W1 - lr * mu * vW1 b1_tmp = b1 - lr * mu * vb1 W2_tmp = W2 - lr * mu * vW2 b2_tmp = b2 - lr * mu * vb2 Xbatch = Xtrain[j * batch_sz:(j * batch_sz + batch_sz), ] Ybatch = Ytrain_ind[j * batch_sz:(j * batch_sz + batch_sz), ] # pYbatch, Z = forward(Xbatch, W1, b1, W2, b2) pYbatch, Z = forward(Xbatch, W1_tmp, b1_tmp, W2_tmp, b2_tmp) # updates # dW2 = mu*mu*dW2 - (1 + mu)*lr*(derivative_w2(Z, Ybatch, pYbatch) + reg*W2) # W2 += dW2 # db2 = mu*mu*db2 - (1 + mu)*lr*(derivative_b2(Ybatch, pYbatch) + reg*b2) # b2 += db2 # dW1 = mu*mu*dW1 - (1 + mu)*lr*(derivative_w1(Xbatch, Z, Ybatch, pYbatch, W2) + reg*W1) # W1 += dW1 # db1 = mu*mu*db1 - (1 + mu)*lr*(derivative_b1(Z, Ybatch, pYbatch, W2) + reg*b1) # b1 += db1 vW2 = mu * vW2 + derivative_w2(Z, Ybatch, pYbatch) + reg * W2_tmp W2 -= lr * vW2 vb2 = mu * vb2 + derivative_b2(Ybatch, pYbatch) + reg * b2_tmp b2 -= lr * vb2 vW1 = mu * vW1 + derivative_w1(Xbatch, Z, Ybatch, pYbatch, W2_tmp) + reg * W1_tmp W1 -= lr * vW1 vb1 = mu * vb1 + derivative_b1(Z, Ybatch, pYbatch, W2_tmp) + reg * b1_tmp b1 -= lr * vb1 if j % print_period == 0: # calculate just for LL pY, _ = forward(Xtest, W1, b1, W2, b2) # print "pY:", pY ll = cost(pY, Ytest_ind) LL_nest.append(ll) print "Cost at iteration i=%d, j=%d: %.6f" % (i, j, ll) err = error_rate(pY, Ytest) CR_nest.append(err) print "Error rate:", err pY, _ = forward(Xtest, W1, b1, W2, b2) print "Final error rate:", error_rate(pY, Ytest) plt.plot(LL_batch, label="batch") plt.plot(LL_momentum, label="momentum") plt.plot(LL_nest, label="nesterov") plt.legend() plt.show()
def main(): max_iter = 10 print_period = 50 train_X, test_X, train_Y, test_Y = get_normalized_data() reg = 0.01 train_Y_ind = indicator(train_Y) test_Y_ind = indicator(test_Y) N, D = train_X.shape batch_size = 500 batch_num = N // batch_size M = 300 K = 10 W1 = np.random.randn(D, M) / np.sqrt(D) b1 = np.zeros(M) W2 = np.random.randn(M, K) / np.sqrt(M) b2 = np.zeros(K) #SAVE INITIAL WEIGHT AND BIAS W1_copy = W1.copy() b1_copy = b1.copy() W2_copy = W2.copy() b2_copy = b2.copy() #1st moment mW1 = 0 mW2 = 0 mb1 = 0 mb2 = 0 #2nd moment vW1 = 0 vW2 = 0 vb1 = 0 vb2 = 0 #hyperparams learning_rate = 0.001 beta1 = 0.99 beta2 = 0.999 eps = 1e-8 #adam lose_adam = [] error_adam = [] t = 1 for i in range(max_iter): shuffle_X, shuffle_Y = shuffle(train_X, train_Y_ind) for j in range(batch_num): x = shuffle_X[j * batch_size:(j * batch_size + batch_size), :] y = shuffle_Y[j * batch_size:(j * batch_size + batch_size), :] pY, Z = forward(x, W1, W2, b1, b2) #update gradient gW2 = derivative_w2(Z, y, pY) + reg * W2 gb2 = derivative_b2(y, pY) + reg * b2 gW1 = derivative_w1(x, Z, y, pY, W2) + reg * W1 gb1 = derivative_b1(Z, y, pY, W2) + reg * b1 #update 1st moment mW1 = beta1 * mW1 + (1 - beta1) * gW1 mW2 = beta1 * mW2 + (1 - beta1) * gW2 mb1 = beta1 * mb1 + (1 - beta1) * gb1 mb2 = beta1 * mb2 + (1 - beta1) * gb2 #update 2nd moment vW1 = beta2 * vW1 + (1 - beta2) * gW1 * gW1 vW2 = beta2 * vW2 + (1 - beta2) * gW2 * gW2 vb1 = beta2 * vb1 + (1 - beta2) * gb1 * gb1 vb2 = beta2 * vb2 + (1 - beta2) * gb2 * gb2 #bias correction correction_1 = 1 - beta1**t correction_2 = 1 - beta2**t mW1_hat = mW1 / correction_1 mW2_hat = mW2 / correction_1 mb1_hat = mb1 / correction_1 mb2_hat = mb2 / correction_1 vW1_hat = vW1 / correction_2 vW2_hat = vW2 / correction_2 vb1_hat = vb1 / correction_2 vb2_hat = vb2 / correction_2 #update t t += 1 #update weight W2 -= learning_rate * mW2_hat / np.sqrt(vW2_hat + eps) b2 -= learning_rate * mb2_hat / np.sqrt(vb2_hat + eps) b1 -= learning_rate * mb1_hat / np.sqrt(vb1_hat + eps) W1 -= learning_rate * mW1_hat / np.sqrt(vW1_hat + eps) if j % print_period == 0: p_test, Z_test = forward(test_X, W1, W2, b1, b2) l = cost(p_test, test_Y_ind) e = error_rate(p_test, test_Y) lose_adam.append(l) error_adam.append(e) print("cost at itertion i=%d, j=%d: %.6f" % (i, j, l)) print("error_rate: ", e) p_final, z_final = forward(test_X, W1, W2, b1, b2) print("final error_rate:", error_rate(p_final, test_Y)) #RMSprop with momentum W1 = W1_copy.copy() b1 = b1_copy.copy() W2 = W2_copy.copy() b2 = b2_copy.copy() #hyperparams learning_rate = 0.001 decay_rate = 0.999 mu = 0.9 eps = 1e-8 #rmsprop cache cache_W1 = 1 cache_W2 = 1 cache_b1 = 1 cache_b2 = 1 #momentum dW1 = 0 dW2 = 0 db1 = 0 db2 = 0 lose_rmsprop_m = [] error_rmsprop_m = [] t = 1 for i in range(max_iter): shuffle_X, shuffle_Y = shuffle(train_X, train_Y_ind) for j in range(batch_num): x = shuffle_X[j * batch_size:(j * batch_size + batch_size), :] y = shuffle_Y[j * batch_size:(j * batch_size + batch_size), :] pY, Z = forward(x, W1, W2, b1, b2) #udpate gW2 = derivative_w2(Z, y, pY) + reg * W2 cache_W2 = decay_rate * cache_W2 + (1 - decay_rate) * gW2 * gW2 dW2 = mu * dW2 - ( 1 - mu) * learning_rate * gW2 / np.sqrt(cache_W2 + eps) W2 += dW2 gb2 = derivative_b2(y, pY) + reg * b2 cache_b2 = decay_rate * cache_b2 + (1 - decay_rate) * gb2 * gb2 db2 = mu * db2 - ( 1 - mu) * learning_rate * gb2 / np.sqrt(cache_b2 + eps) b2 += db2 gW1 = derivative_w1(x, Z, y, pY, W2) + reg * W1 cache_W1 = decay_rate * cache_W1 + (1 - decay_rate) * gW1 * gW1 dW1 = mu * dW1 - ( 1 - mu) * 
learning_rate * gW1 / np.sqrt(cache_W1 + eps) W1 += dW1 gb1 = derivative_b1(Z, y, pY, W2) + reg * b1 cache_b1 = decay_rate * cache_b1 + (1 - decay_rate) * gb1 * gb1 db1 = mu * db1 - ( 1 - mu) * learning_rate * gb1 / np.sqrt(cache_b1 + eps) b1 += db1 # #update cache # cache_W1 = decay_rate * cache_W1 + (1-decay_rate)*gW1*gW1 # cache_W2 = decay_rate * cache_W2 + (1-decay_rate)*gW2*gW2 # cache_b1 = decay_rate * cache_b1 + (1-decay_rate)*gb1*gb1 # cache_b2 = decay_rate * cache_b2 + (1-decay_rate)*gb2*gb2 # #update momentum # dW2 = mu*dW2 + (1-mu) * learning_rate * gW2 / (np.sqrt(cache_W2) + eps) # db2 = mu*db2 + (1-mu) * learning_rate * gb2 / (np.sqrt(cache_b2) + eps) # dW1 = mu*dW1 + (1-mu) * learning_rate * dW1 / (np.sqrt(cache_W1) + eps) # db1 = mu*db1 + (1-mu) * learning_rate * db1 / (np.sqrt(cache_b1) + eps) # #update weights # W2 -= dW2 # b2 -= db2 # W1 -= dW1 # b1 -= db1 if j % print_period == 0: p_test, Z_test = forward(test_X, W1, W2, b1, b2) l = cost(p_test, test_Y_ind) e = error_rate(p_test, test_Y) lose_rmsprop_m.append(l) error_rmsprop_m.append(e) print("cost at itertion i=%d, j=%d: %.6f" % (i, j, l)) print("error_rate: ", e) p_final, z_final = forward(test_X, W1, W2, b1, b2) print("final error_rate:", error_rate(p_final, test_Y)) plt.plot(lose_adam, label="adam") plt.plot(lose_rmsprop_m, label="rmsprop with momentum") plt.legend() plt.show()
# Hyperparams lr0 = 0.001 beta1 = 0.9 beta2 = 0.999 eps = 1e-8 # 1. Adam optimizer loss_adam = [] err_adam = [] t = 1 for i in range(max_iter): for j in range(n_batch): X_batch = X_train[j * batch_sz:(j * batch_sz + batch_sz), ] Y_batch = Y_train_ind[j * batch_sz:(j * batch_sz + batch_sz), ] pY_batch, Z = forward(X_batch, W1, b1, W2, b2) # Update the gradiant gW2 = derivative_w2(Z, Y_batch, pY_batch) + reg * W2 gb2 = derivative_b2(Y_batch, pY_batch) + reg * b2 gW1 = derivative_w1(X_batch, Z, Y_batch, pY_batch, W2) + reg * W1 gb1 = derivative_b1(Z, Y_batch, pY_batch, W2) + reg * b1 # Update new Moments mW1 = beta1 * mW1 + (1 - beta1) * gW1 mb1 = beta1 * mb1 + (1 - beta1) * gb1 mW2 = beta1 * mW2 + (1 - beta1) * gW2 mb2 = beta1 * mb2 + (1 - beta1) * gb2 # Update new Velocity vW1 = beta2 * vW1 + (1 - beta2) * gW1 * gW1
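The fragment above breaks off in the middle of the second-moment updates. For reference, here is a minimal sketch of the remaining Adam steps for a single parameter array, assuming the moments were initialized to 0 and t starts at 1; it mirrors the complete implementations elsewhere in this section, not the truncated file itself:

import numpy as np

def adam_step(param, grad, m, v, t, lr0=0.001, beta1=0.9, beta2=0.999, eps=1e-8):
    # one Adam update: 1st/2nd moments, bias correction, parameter step
    m = beta1 * m + (1 - beta1) * grad
    v = beta2 * v + (1 - beta2) * grad * grad
    m_hat = m / (1 - beta1 ** t)
    v_hat = v / (1 - beta2 ** t)
    param -= lr0 * m_hat / (np.sqrt(v_hat) + eps)
    return m, v

# inside the batch loop, with t incremented once per batch:
# mW2, vW2 = adam_step(W2, gW2, mW2, vW2, t)
# mb2, vb2 = adam_step(b2, gb2, mb2, vb2, t)
# t += 1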
def main(): # compare 2 scenarios: # 1. batch GD with RMSProp and momentum # 2. Adam GD max_iter = 20 print_period = 10 X, Y = get_normalized_data() reg = 0.01 Xtrain, Ytrain = X[:-1000, :], Y[:-1000] Xtest, Ytest = X[-1000:, :], Y[-1000:] Ytrain_ind = y2indicator(Ytrain) Ytest_ind = y2indicator(Ytest) N, D = Xtrain.shape M = 300 # number of hidden layer units K = len(set(Ytrain)) batch_size = 500 n_batches = N // batch_size # randomly initialize weights: W1_0 = np.random.randn(D, M) / np.sqrt(D) b1_0 = np.zeros(M) W2_0 = np.random.randn(M, K) / np.sqrt(M) b2_0 = np.zeros(K) # 1. batch GD with RMSProp and momentum: print('\nperforming batch GD with RMSProp and momentum...') W1 = W1_0.copy() b1 = b1_0.copy() W2 = W2_0.copy() b2 = b2_0.copy() LL_rm = [] CR_rm = [] # hyperparams: lr0 = 0.001 #lr0 = 0.0001 mu = 0.9 decay = 0.999 eps = 10e-9 # momentum (velocity terms): dW1 = 0 db1 = 0 dW2 = 0 db2 = 0 # rms-prop cache (with no bias correction): cache_W2 = 1 cache_b2 = 1 cache_W1 = 1 cache_b1 = 1 t0 = datetime.now() for i in range(max_iter): for j in range(n_batches): Xbatch = Xtrain[j * batch_size:(j + 1) * batch_size, :] Ybatch = Ytrain_ind[j * batch_size:(j + 1) * batch_size, :] p_Ybatch, Z = forward(Xbatch, W1, b1, W2, b2) #print(Z.shape, p_Ybatch.shape, Ybatch.shape) #print('First batch cost:', cost(p_Ybatch, Ybatch)) # updates: # (note: we utilize a bit different version of momentum) gW2 = (derivative_W2(Z, Ybatch, p_Ybatch) + reg * W2) cache_W2 = decay * cache_W2 + (1 - decay) * gW2 * gW2 dW2 = mu * dW2 + (1 - mu) * lr0 * gW2 / (np.sqrt(cache_W2 + eps)) W2 -= dW2 #dW2 = mu*dW2 - lr0*gW2 / (np.sqrt(cache_W2) + eps) #W2 += dW2 gb2 = (derivative_b2(Ybatch, p_Ybatch) + reg * b2) cache_b2 = decay * cache_b2 + (1 - decay) * gb2 * gb2 db2 = mu * db2 + (1 - mu) * lr0 * gb2 / (np.sqrt(cache_b2 + eps)) b2 -= db2 #db2 = mu*db2 - lr0*gb2 / (np.sqrt(cache_b2) + eps) #b2 += db2 gW1 = (derivative_W1(Xbatch, Z, Ybatch, p_Ybatch, W2) + reg * W1) cache_W1 = decay * cache_W1 + (1 - decay) * gW1 * gW1 dW1 = mu * dW1 + (1 - mu) * lr0 * gW1 / (np.sqrt(cache_W1 + eps)) W1 -= dW1 #dW1 = mu*dW1 - lr0*gW1 / (np.sqrt(cache_W1) + eps) #W1 += dW1 gb1 = (derivative_b1(Z, Ybatch, p_Ybatch, W2) + reg * b1) cache_b1 = decay * cache_b1 + (1 - decay) * gb1 * gb1 db1 = mu * db1 + (1 - mu) * lr0 * gb1 / (np.sqrt(cache_b1 + eps)) b1 -= db1 #db1 = mu*db1 - lr0*gb1 / (np.sqrt(cache_b1) + eps) #b1 += db1 if j % print_period == 0: pY, _ = forward(Xtest, W1, b1, W2, b2) #print('pY:', pY) ll = cost(pY, Ytest_ind) LL_rm.append(ll) print('\ni: %d, j: %d, cost: %.6f' % (i, j, ll)) error = error_rate(pY, Ytest) CR_rm.append(error) print('error rate:', error) dt1 = datetime.now() - t0 pY, _ = forward(Xtest, W1, b1, W2, b2) print('\nFinal error rate:', error_rate(pY, Ytest)) print('Elapsed time for batch GD with RMSProp and momentum:', dt1) # plot the cost plt.plot(LL_rm) plt.title('Cost for batch GD with RMSProp and momentum') plt.show() # 2. 
Adam optimizer print('\nperforming Adam optimizer...') W1 = W1_0.copy() b1 = b1_0.copy() W2 = W2_0.copy() b2 = b2_0.copy() # hyperparams: lr = 0.001 beta1 = 0.9 beta2 = 0.999 eps = 10e-9 # 1st moment: mW1 = 0 mb1 = 0 mW2 = 0 mb2 = 0 # 2nd moment: vW1 = 0 vb1 = 0 vW2 = 0 vb2 = 0 LL_adam = [] CR_adam = [] t0 = datetime.now() t = 1 # index; used instead of j, because j starts with 0 for i in range(max_iter): for j in range(n_batches): Xbatch = Xtrain[j * batch_size:(j + 1) * batch_size, :] Ybatch = Ytrain_ind[j * batch_size:(j + 1) * batch_size, :] p_Ybatch, Z = forward(Xbatch, W1, b1, W2, b2) # updates: # gradients: gW2 = derivative_W2(Z, Ybatch, p_Ybatch) + reg * W2 gb2 = derivative_b2(Ybatch, p_Ybatch) + reg * b2 gW1 = derivative_W1(Xbatch, Z, Ybatch, p_Ybatch, W2) + reg * W1 gb1 = derivative_b1(Z, Ybatch, p_Ybatch, W2) + reg * b1 # 1st moment: mW2 = beta1 * mW2 + (1 - beta1) * gW2 mb2 = beta1 * mb2 + (1 - beta1) * gb2 mW1 = beta1 * mW1 + (1 - beta1) * gW1 mb1 = beta1 * mb1 + (1 - beta1) * gb1 # 2nd moment: vW2 = beta2 * vW2 + (1 - beta2) * gW2 * gW2 vb2 = beta2 * vb2 + (1 - beta2) * gb2 * gb2 vW1 = beta2 * vW1 + (1 - beta2) * gW1 * gW1 vb1 = beta2 * vb1 + (1 - beta2) * gb1 * gb1 # bias correction: mW2_bc = mW2 / (1 - beta1**t) mb2_bc = mb2 / (1 - beta1**t) mW1_bc = mW1 / (1 - beta1**t) mb1_bc = mb1 / (1 - beta1**t) vW2_bc = vW2 / (1 - beta2**t) vb2_bc = vb2 / (1 - beta2**t) vW1_bc = vW1 / (1 - beta2**t) vb1_bc = vb1 / (1 - beta2**t) # weights and biases (parameters): W2 = W2 - lr * mW2_bc / np.sqrt(vW2_bc + eps) b2 = b2 - lr * mb2_bc / np.sqrt(vb2_bc + eps) W1 = W1 - lr * mW1_bc / np.sqrt(vW1_bc + eps) b1 = b1 - lr * mb1_bc / np.sqrt(vb1_bc + eps) t += 1 if j % print_period == 0: pY, _ = forward(Xtest, W1, b1, W2, b2) ll = cost(pY, Ytest_ind) LL_adam.append(ll) print('\ni: %d, j: %d, cost: %.6f' % (i, j, ll)) error = error_rate(pY, Ytest) CR_adam.append(error) print('error rate:', error) dt2 = datetime.now() - t0 pY, _ = forward(Xtest, W1, b1, W2, b2) print('\nFinal error rate:', error_rate(pY, Ytest)) print('Elapsed time for Adam optimizer:', dt2) # plot the cost plt.plot(LL_adam) plt.title('Cost for Adam optimizer') plt.show() # plot costs from the two experiments together: plt.plot(LL_rm, label='RMSProp with momentum') plt.plot(LL_adam, label='Adam optimizer') plt.title('Cost') plt.legend() plt.show()
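One detail that separates Adam from RMSProp-with-momentum in the comparison above is the bias correction. Because the moments start at 0, the first uncorrected estimates are shrunk heavily toward 0, and dividing by (1 - beta**t) undoes that. A tiny check with made-up numbers:

beta1, g = 0.9, 1.0
m = (1 - beta1) * g            # first moment after one update from m = 0
print(m, m / (1 - beta1 ** 1)) # 0.1 uncorrected vs 1.0 after bias correction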
def main(): max_iter = 20 print_period = 50 train_X, test_X, train_Y, test_Y = get_normalized_data() learning_rate = 0.00004 reg = 0.01 train_Y_ind = indicator(train_Y) test_Y_ind = indicator(test_Y) N, D = train_X.shape batch_size = 500 batch_num = N // batch_size M = 300 K = 10 W1 = np.random.randn(D, M) / np.sqrt(D) b1 = np.zeros(M) W2 = np.random.randn(M, K) / np.sqrt(M) b2 = np.zeros(K) #SAVE INITIAL WEIGHT AND BIAS W1_copy = W1.copy() b1_copy = b1.copy() W2_copy = W2.copy() b2_copy = b2.copy() #constant learning_rate lose_constant = [] error_constant = [] for i in range(max_iter): shuffle_X, shuffle_Y = shuffle(train_X, train_Y_ind) for j in range(batch_num): x = shuffle_X[j * batch_size:(j * batch_size + batch_size), :] y = shuffle_Y[j * batch_size:(j * batch_size + batch_size), :] pY, Z = forward(x, W1, W2, b1, b2) W2 -= learning_rate * (derivative_w2(Z, y, pY) + reg * W2) b2 -= learning_rate * (derivative_b2(y, pY) + reg * b2) W1 -= learning_rate * (derivative_w1(x, Z, y, pY, W2) + reg * W1) b1 -= learning_rate * (derivative_b1(Z, y, pY, W2) + reg * b1) if j % print_period == 0: p_test, Z_test = forward(test_X, W1, W2, b1, b2) l = cost(p_test, test_Y_ind) e = error_rate(p_test, test_Y) lose_constant.append(l) error_constant.append(e) print("cost at itertion i=%d, j=%d: %.6f" % (i, j, l)) print("error_rate: ", e) p_final, z_final = forward(test_X, W1, W2, b1, b2) print("final error_rate:", error_rate(p_final, test_Y)) #RMSprop W1 = W1_copy.copy() b1 = b1_copy.copy() W2 = W2_copy.copy() b2 = b2_copy.copy() learning_rate_0 = 0.001 lose_non_costant = [] error_non_constant = [] cache_W1 = 1 cache_W2 = 1 cache_b1 = 1 cache_b2 = 1 decay_rate = 0.999 eps = 1e-10 for i in range(max_iter): shuffle_X, shuffle_Y = shuffle(train_X, train_Y_ind) for j in range(batch_num): x = shuffle_X[j * batch_size:(j * batch_size + batch_size), :] y = shuffle_Y[j * batch_size:(j * batch_size + batch_size), :] pY, Z = forward(x, W1, W2, b1, b2) gW2 = derivative_w2(Z, y, pY) + reg * W2 cache_W2 = decay_rate * cache_W2 + (1 - decay_rate) * gW2 * gW2 W2 -= learning_rate_0 * gW2 / (np.sqrt(cache_W2) + eps) gb2 = derivative_b2(y, pY) + reg * b2 cache_b2 = decay_rate * cache_b2 + (1 - decay_rate) * gb2 * gb2 b2 -= learning_rate_0 * gb2 / (np.sqrt(cache_b2) + eps) gW1 = derivative_w1(x, Z, y, pY, W2) + reg * W1 cache_W1 = decay_rate * cache_W1 + (1 - decay_rate) * gW1 * gW1 W1 -= learning_rate_0 * gW1 / (np.sqrt(cache_W1) + eps) gb1 = derivative_b1(Z, y, pY, W2) + reg * b1 cache_b1 = decay_rate * cache_b1 + (1 - decay_rate) * gb1 * gb1 b1 -= learning_rate_0 * gb1 / (np.sqrt(cache_b1) + eps) if j % print_period == 0: p_test, Z_test = forward(test_X, W1, W2, b1, b2) l = cost(p_test, test_Y_ind) e = error_rate(p_test, test_Y) lose_non_costant.append(l) error_non_constant.append(e) print("cost at itertion i=%d, j=%d: %.6f" % (i, j, l)) print("error_rate: ", e) p_final, z_final = forward(test_X, W1, W2, b1, b2) print("final error_rate:", error_rate(p_final, test_Y)) plt.plot(lose_constant, label="batch") plt.plot(lose_non_costant, label="non_constant") plt.legend() plt.show()
def main(): Xtrain, Xtest, Ytrain, Ytest = get_transformed_data() Ytrain_ind = y2indicator(Ytrain) Ytest_ind = y2indicator(Ytest) N, D = Xtrain.shape M = 300 K = len(set(Ytrain)) W1 = np.random.randn(D, M) / np.sqrt(D) b1 = np.zeros(M) W2 = np.random.randn(M, K) / np.sqrt(M) b2 = np.zeros(K) # save initial weights W1_0 = W1.copy() b1_0 = b1.copy() W2_0 = W2.copy() b2_0 = b2.copy() lr = 0.00004 reg = 0.01 batch_sz = 500 n_batches = N // batch_sz epochs = 20 # 1. batch costs_batch = [] for t in range(epochs): tmpX, tmpY = shuffle(Xtrain, Ytrain_ind) for j in range(n_batches): x = tmpX[j * batch_sz:(j * batch_sz + batch_sz), :] y = tmpY[j * batch_sz:(j * batch_sz + batch_sz), :] pY, Z = forward(x, W1, b1, W2, b2) W2 -= lr * (derivative_W2(Z, pY, y) + reg * W2) b2 -= lr * (derivative_b2(pY, y) + reg * b2) W1 -= lr * (derivative_W1(x, W2, Z, pY, y) + reg * W1) b1 -= lr * (derivative_b1(W2, Z, pY, y) + reg * b1) if j % 10 == 0: pY_test, _ = forward(Xtest, W1, b1, W2, b2) c = cost(pY_test, Ytest_ind) costs_batch.append(c) print("Cost at iteration t=%d, j=%d: %.6f" % (t, j, c)) e = error_rate(pY_test, Ytest) print("Error rate:", e) print("\n") # 2. RMSprop W1 = W1_0.copy() b1 = b1_0.copy() W2 = W2_0.copy() b2 = b2_0.copy() cache_W2 = 1 cache_b2 = 1 cache_W1 = 1 cache_b1 = 1 decay_rate = 0.999 eps = 1e-10 lr0 = 0.001 costs_RMS = [] for t in range(epochs): tmpX, tmpY = shuffle(Xtrain, Ytrain_ind) for j in range(n_batches): x = tmpX[j * batch_sz:(j * batch_sz + batch_sz), :] y = tmpY[j * batch_sz:(j * batch_sz + batch_sz), :] pY, Z = forward(x, W1, b1, W2, b2) gW2 = (derivative_W2(Z, pY, y) + reg * W2) cache_W2 = decay_rate * cache_W2 + (1 - decay_rate) * gW2 * gW2 W2 -= lr0 * gW2 / (np.sqrt(cache_W2) + eps) gb2 = (derivative_b2(pY, y) + reg * b2) cache_b2 = decay_rate * cache_b2 + (1 - decay_rate) * gb2 * gb2 b2 -= lr0 * gb2 / (np.sqrt(cache_b2) + eps) gW1 = (derivative_W1(x, W2, Z, pY, y) + reg * W1) cache_W1 = decay_rate * cache_W1 + (1 - decay_rate) * gW1 * gW1 W1 -= lr0 * gW1 / (np.sqrt(cache_W1) + eps) gb1 = (derivative_b1(W2, Z, pY, y) + reg * b1) cache_b1 = decay_rate * cache_b1 + (1 - decay_rate) * gb1 * gb1 b1 -= lr0 * gb1 / (np.sqrt(cache_b1) + eps) if j % 10 == 0: pY_test, _ = forward(Xtest, W1, b1, W2, b2) c = cost(pY_test, Ytest_ind) costs_RMS.append(c) print("Cost at iteration t=%d, j=%d: %.6f" % (t, j, c)) e = error_rate(pY_test, Ytest) print("Error rate:", e) plt.plot(costs_batch, label="batch") plt.plot(costs_RMS, label="rms") plt.legend() plt.show()
b2 = np.zeros(K) # 1. const # cost = -16 LL_batch = [] CR_batch = [] for i in range(max_iter): for j in range(n_batches): Xbatch = Xtrain[j*batch_sz:(j*batch_sz + batch_sz),] Ybatch = Ytrain_ind[j*batch_sz:(j*batch_sz + batch_sz),] pYbatch, Z = forward(Xbatch, W1, b1, W2, b2) # print "first batch cost:", cost(pYbatch, Ybatch) # updates W2 -= lr*(derivative_w2(Z, Ybatch, pYbatch) + reg*W2) b2 -= lr*(derivative_b2(Ybatch, pYbatch) + reg*b2) W1 -= lr*(derivative_w1(Xbatch, Z, Ybatch, pYbatch, W2) + reg*W1) b1 -= lr*(derivative_b1(Z, Ybatch, pYbatch, W2) + reg*b1) if j % print_period == 0: # calculate just for LL pY, _ = forward(Xtest, W1, b1, W2, b2) # print "pY:", pY ll = cost(pY, Ytest_ind) LL_batch.append(ll)
def main(): # load the data: (Xtrain, Ytrain), (Xtest, Ytest) = mnist.load_data() # print(Xtrain.shape) N, d, _ = Xtrain.shape D = d*d Ntest = len(Xtest) # normalize the data: Xtrain = Xtrain / 255.0 Xtest = Xtest / 255.0 # display: # n = np.random.choice(N) # plt.imshow(Xtrain[n], cmap='gray') # plt.title(str(Ytrain[n])) # plt.show() # reshape the data: Xtrain = Xtrain.reshape(N, D) Xtest = Xtest.reshape(Ntest, D) # print('Xtrain.max():', Xtrain.max()) # print('Xtrain.shape:', Xtrain.shape) Ytrain_ind = y2indicator(Ytrain) Ytest_ind = y2indicator(Ytest) # define hyperparameters: epochs = 30 print_period = 10 lr = 0.00004 reg = 0.01 batch_sz = 500 n_batches = N // batch_sz M = 300 # the hidden layer size K = len(set(Ytrain)) # randomly initialize the weights: W1_init = np.random.randn(D, M) / np.sqrt(D) b1_init = np.zeros(M) W2_init = np.random.randn(M, K) / np.sqrt(M) b2_init = np.zeros(K) # 1. mini-batch SGD: losses_batch = [] errors_batch = [] W1 = W1_init.copy() b1 = b1_init.copy() W2 = W2_init.copy() b2 = b2_init.copy() print('\nmini-batch SGD') t0 = datetime.now() for i in range(epochs): Xtrain, Ytrain_ind = shuffle(Xtrain, Ytrain_ind) for j in range(n_batches): Xbatch = Xtrain[j*batch_sz:(j+1)*batch_sz] Ybatch = Ytrain_ind[j*batch_sz:(j+1)*batch_sz] pYbatch, Z = forward(Xbatch, W1, b1, W2, b2) # update the params: W2 -= lr*(derivative_W2(Z, Ybatch, pYbatch) + reg*W2) b2 -= lr*(derivative_b2(Ybatch, pYbatch) + reg*b2) W1 -= lr*(derivative_W1(Xbatch, Z, Ybatch, pYbatch, W2) + reg*W1) b1 -= lr*(derivative_b1(Z, Ybatch, pYbatch, W2) + reg*b1) if j % print_period == 0: pY, _ = forward(Xtest, W1, b1, W2, b2) l = cross_entropy(pY, Ytest) losses_batch.append(l) e = error_rate(pY, Ytest) errors_batch.append(e) sys.stdout.write('epoch: %d, batch: %d, cost: %.6f, error_rate: %.4f\r' % (i, j, l, e)) # print('\nepoch: %d, batch: %d, cost: %6f' % (i, j, l)) # print('error_rate:', e) sys.stdout.flush() pY, _ = forward(Xtest, W1, b1, W2, b2) print('ETA:', datetime.now() - t0, 'final error rate:', error_rate(pY, Ytest), ' '*20) # 2. mini-batch SGD with momentum - version 1: losses_momentum1 = [] errors_momentum1 = [] W1 = W1_init.copy() b1 = b1_init.copy() W2 = W2_init.copy() b2 = b2_init.copy() mu = 0.9 # momentum term # initial values for the 'velocities': dW2 = 0 db2 = 0 dW1 = 0 db1 = 0 print('\nmini-batch SGD with momentum - version 1') t0 = datetime.now() for i in range(epochs): Xtrain, Ytrain_ind = shuffle(Xtrain, Ytrain_ind) for j in range(n_batches): Xbatch = Xtrain[j*batch_sz:(j+1)*batch_sz] Ybatch = Ytrain_ind[j*batch_sz:(j+1)*batch_sz] pYbatch, Z = forward(Xbatch, W1, b1, W2, b2) # calculate the gradients: gW2 = derivative_W2(Z, Ybatch, pYbatch) + reg*W2 gb2 = derivative_b2(Ybatch, pYbatch) + reg*b2 gW1 = derivative_W1(Xbatch, Z, Ybatch, pYbatch, W2) + reg*W1 gb1 = derivative_b1(Z, Ybatch, pYbatch, W2) + reg*b1 # update the 'velocities': dW2 = mu*dW2 - lr*gW2 db2 = mu*db2 - lr*gb2 dW1 = mu*dW1 - lr*gW1 db1 = mu*db1 - lr*gb1 # update the params: W2 += dW2 b2 += db2 W1 += dW1 b1 += db1 if j % print_period == 0: pY, _ = forward(Xtest, W1, b1, W2, b2) l = cross_entropy(pY, Ytest) losses_momentum1.append(l) e = error_rate(pY, Ytest) errors_momentum1.append(e) sys.stdout.write('epoch: %d, batch: %d, cost: %.6f, error_rate: %.4f\r' % (i, j, l, e)) # print('\nepoch: %d, batch: %d, cost: %6f' % (i, j, l)) # print('error_rate:', e) sys.stdout.flush() pY, _ = forward(Xtest, W1, b1, W2, b2) print('ETA:', datetime.now() - t0, 'final error rate:', error_rate(pY, Ytest), ' '*20) ''' # 3. 
mini-batch SGD with momentum - version 2: losses_momentum2 = [] errors_momentum2 = [] W1 = W1_init.copy() b1 = b1_init.copy() W2 = W2_init.copy() b2 = b2_init.copy() mu = 0.9 # momentum term # initial values for the 'velocities': dW2 = 0 db2 = 0 dW1 = 0 db1 = 0 # lr = 0.0004 print('\nmini-batch SGD with momentum - version 2') t0 = datetime.now() for i in range(epochs): Xtrain, Ytrain_ind = shuffle(Xtrain, Ytrain_ind) for j in range(n_batches): Xbatch = Xtrain[j*batch_sz:(j+1)*batch_sz] Ybatch = Ytrain_ind[j*batch_sz:(j+1)*batch_sz] pYbatch, Z = forward(Xbatch, W1, b1, W2, b2) # calculate the gradients: gW2 = derivative_W2(Z, Ybatch, pYbatch) + reg*W2 gb2 = derivative_b2(Ybatch, pYbatch) + reg*b2 gW1 = derivative_W1(Xbatch, Z, Ybatch, pYbatch, W2) + reg*W1 gb1 = derivative_b1(Z, Ybatch, pYbatch, W2) + reg*b1 # # update the 'velocities': dW2 = mu*dW2 + (1-mu)*gW2 db2 = mu*db2 + (1-mu)*gb2 dW1 = mu*dW1 + (1-mu)*gW1 db1 = mu*db1 + (1-mu)*gb1 # update the 'velocities': # dW2 = mu*dW2 + gW2 # db2 = mu*db2 + gb2 # dW1 = mu*dW1 + gW1 # db1 = mu*db1 + gb1 # update the params: W2 -= lr*dW2 b2 -= lr*db2 W1 -= lr*dW1 b1 -= lr*db1 if j % print_period == 0: pY, _ = forward(Xtest, W1, b1, W2, b2) l = cross_entropy(pY, Ytest) losses_momentum2.append(l) e = error_rate(pY, Ytest) errors_momentum2.append(e) sys.stdout.write('epoch: %d, batch: %d, cost: %.6f, error_rate: %.4f\r' % (i, j, l, e)) sys.stdout.flush() pY, _ = forward(Xtest, W1, b1, W2, b2) print('ETA:', datetime.now() - t0, 'final error rate:', error_rate(pY, Ytest), ' '*20) # best result: epochs = 25, final_error = 0.0179 ''' # 4. mini-batch SGD with Nesterov momentum: losses_nesterov_momentum = [] errors_nesterov_momentum = [] W1 = W1_init.copy() b1 = b1_init.copy() W2 = W2_init.copy() b2 = b2_init.copy() mu = 0.9 # momentum term # initial values for the 'velocities': dW2 = 0 db2 = 0 dW1 = 0 db1 = 0 print('\nmini-batch SGD with Nesterov momentum') t0 = datetime.now() for i in range(epochs): Xtrain, Ytrain_ind = shuffle(Xtrain, Ytrain_ind) for j in range(n_batches): Xbatch = Xtrain[j*batch_sz:(j+1)*batch_sz] Ybatch = Ytrain_ind[j*batch_sz:(j+1)*batch_sz] pYbatch, Z = forward(Xbatch, W1, b1, W2, b2) # calculate the gradients: gW2 = derivative_W2(Z, Ybatch, pYbatch) + reg*W2 gb2 = derivative_b2(Ybatch, pYbatch) + reg*b2 gW1 = derivative_W1(Xbatch, Z, Ybatch, pYbatch, W2) + reg*W1 gb1 = derivative_b1(Z, Ybatch, pYbatch, W2) + reg*b1 # update the 'velocities': dW2 = mu*dW2 - lr*gW2 db2 = mu*db2 - lr*gb2 dW1 = mu*dW1 - lr*gW1 db1 = mu*db1 - lr*gb1 # update the params: W2 += mu*dW2 - lr*gW2 b2 += mu*db2 - lr*gb2 W1 += mu*dW1 - lr*gW1 b1 += mu*db1 - lr*gb1 if j % print_period == 0: pY, _ = forward(Xtest, W1, b1, W2, b2) l = cross_entropy(pY, Ytest) losses_nesterov_momentum.append(l) e = error_rate(pY, Ytest) errors_nesterov_momentum.append(e) sys.stdout.write('epoch: %d, batch: %d, cost: %.6f, error_rate: %.4f\r' % (i, j, l, e)) sys.stdout.flush() # print('\nepoch: %d, batch: %d, cost: %6f' % (i, j, l)) # print('error_rate:', e) pY, _ = forward(Xtest, W1, b1, W2, b2) print('ETA:', datetime.now() - t0, 'final error rate:', error_rate(pY, Ytest), ' '*20) # plot the losses: plt.plot(losses_batch, label='mini-batch SGD') plt.plot(losses_momentum1, label='+ momentum') plt.plot(losses_nesterov_momentum, label='+ Nesterov momentum') plt.xlabel('iterations') plt.ylabel('loss') plt.legend() plt.show()
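Version 1 and the disabled version 2 above differ only in how the velocity is scaled, but that changes the effective step size: for a constant gradient g, version 1's step settles near lr*g/(1-mu) while version 2's settles near lr*g, a factor of 1/(1-mu) = 10 apart at mu = 0.9, which is why the commented-out lr = 0.0004 compensates version 2. A small standalone check with illustrative values:

# toy check of the steady-state step size of the two momentum variants
g, lr, mu = 1.0, 0.00004, 0.9
v1 = v2 = 0.0
for _ in range(200):
    v1 = mu * v1 - lr * g        # version 1: velocity absorbs the learning rate
    v2 = mu * v2 + (1 - mu) * g  # version 2: velocity is an EMA of the gradient
print(abs(v1), lr * v2)          # ~ lr*g/(1-mu)  vs  ~ lr*g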
def main(): max_iter = 20 print_period = 10 X, Y = get_normalized_data() lr = 0.00004 reg = 0.01 Xtrain, Ytrain = X[:-1000,], Y[:-1000] Xtest, Ytest = X[-1000:,], Y[-1000:] Ytrain_ind = y2indicator(Ytrain) Ytest_ind = y2indicator(Ytest) N, D = X.shape batch_sz = 500 n_batches = N / batch_sz M = 300 K = 10 W1 = np.random.randn(D, M) / np.sqrt(D+M) b1 = np.zeros(M) W2 = np.random.randn(M, K) / np.sqrt(M) b2 = np.zeros(K) #1. batch SGD LL_batch = [] CR_batch = [] for i in xrange(max_iter): for j in xrange(n_batches): Xbatch = Xtrain[j*batch_sz:(j+1)*batch_sz,] Ybatch = Ytrain_ind[j*batch_sz:(j+1)*batch_sz,] pYbatch, Z = forward(Xbatch, W1, b1, W2, b2) W2 -= lr * (derivative_w2(Z, Ybatch, pYbatch) + reg * W2) b2 -= lr * (derivative_b2(Ybatch, pYbatch) + reg * b2) W1 -= lr * (derivative_w1(Xbatch, Z, Ybatch, pYbatch, W2) + reg * W1) b1 -= lr * (derivative_b1(Z, Ybatch, pYbatch, W2) + reg * b1) if j % print_period == 0: pY, _ = forward(Xtest, W1, b1, W2, b2) ll = cost(pY, Ytest_ind) LL_batch.append(ll) print "Cost at iteration i=%d, j=%d: %.6f" % (i, j, ll) err = error_rate(pY, Ytest) CR_batch.append(err) print "Error rate:", err pY, _ = forward(Xtest, W1, b1, W2, b2) print "Final error rate:", error_rate(pY, Ytest) #RMSProp W1 = np.random.randn(D, M) / np.sqrt(D+M) b1 = np.zeros(M) W2 = np.random.randn(M, K) / np.sqrt(M) b2 = np.zeros(K) LL_rms = [] CR_rms = [] lr_rms = 0.001 cache_W2, cache_b2, cache_W1, cache_b1 = 0, 0, 0, 0 decay_rate = 0.999 eps = 1e-6 for i in xrange(max_iter): for j in xrange(n_batches): Xbatch = Xtrain[j*batch_sz:(j+1)*batch_sz,] Ybatch = Ytrain_ind[j*batch_sz:(j+1)*batch_sz,] pYbatch, Z = forward(Xbatch, W1, b1, W2, b2) #update gW2 = derivative_w2(Z, Ybatch, pYbatch) + reg * W2 cache_W2 = decay_rate * cache_W2 + (1 - decay_rate) * gW2**2 W2 -= lr_rms * gW2 / (np.sqrt(cache_W2) + eps) gb2 = derivative_b2(Ybatch, pYbatch) + reg * b2 cache_b2 = decay_rate * cache_b2 + (1 - decay_rate) * gb2**2 b2 -= lr_rms * gb2 / (np.sqrt(cache_b2) + eps) gW1 = derivative_w1(Xbatch, Z, Ybatch, pYbatch, W2) + reg * W1 cache_W1 = decay_rate * cache_W1 + (1 - decay_rate) * gW1**2 W1 -= lr_rms * gW1 / (np.sqrt(cache_W1) + eps) gb1 = derivative_b1(Z, Ybatch, pYbatch, W2) + reg * b1 cache_b1 = decay_rate * cache_b1 + (1 - decay_rate) * gb1**2 b1 -= lr_rms * gb1 / (np.sqrt(cache_b1) + eps) if j % print_period == 0: pY, _ = forward(Xtest, W1, b1, W2, b2) ll = cost(pY, Ytest_ind) LL_rms.append(ll) print "Cost at iteration i=%d, j=%d: %.6f" % (i, j, ll) err = error_rate(pY, Ytest) CR_rms.append(err) print "Error rate:", err pY, Z = forward(X, W1, b1, W2, b2) print "Final error rate:", error_rate(pY, Ytest) plt.plot(LL_batch, label='const') plt.plot(LL_rms, label='rms') plt.legend() plt.show()
def main(): max_iter = 20 # make 30 for sigmoid print_period = 10 X, Y = get_normalized_data() lr = 0.00004 reg = 0.01 Xtrain = X[:-1000,] Ytrain = Y[:-1000] Xtest = X[-1000:,] Ytest = Y[-1000:] N, D = Xtrain.shape Ytrain_ind = y2indicator(Ytrain) Ytest_ind = y2indicator(Ytest) batch_sz = 500 n_batches = N // batch_sz M = 300 K = 10 W1 = np.random.randn(D, M) / 28 b1 = np.zeros(M) W2 = np.random.randn(M, K) / np.sqrt(M) b2 = np.zeros(K) # 1. batch GD LL_batch = [] CR_batch = [] for i in range(max_iter): for j in range(n_batches): Xbatch = Xtrain[j*batch_sz:(j*batch_sz + batch_sz),:] Ybatch = Ytrain_ind[j*batch_sz:(j*batch_sz + batch_sz),:] pYbatch, Z = forward(Xbatch, W1, b1, W2, b2) W2 -= lr*(derivative_w2(Z, Ybatch, pYbatch) + reg*W2) b2 -= lr*(derivative_b2(Ybatch, pYbatch) + reg*b2) W1 -= lr*(derivative_w1(Xbatch, Z, Ybatch, pYbatch, W2) + reg*W1) b1 -= lr*(derivative_b1(Z, Ybatch, pYbatch, W2) + reg*b1) if j % print_period == 0: # calculate just for LL pY, _ = forward(Xtest, W1, b1, W2, b2) # print "pY:", pY ll = cost(pY, Ytest_ind) LL_batch.append(ll) print("Cost at iteration i=%d, j=%d: %.6f" % (i, j, ll)) err = error_rate(pY, Ytest) CR_batch.append(err) print("Error rate:", err) pY, _ = forward(Xtest, W1, b1, W2, b2) print("Final error rate:", error_rate(pY, Ytest)) # 2. batch GD w/ momentum W1 = np.random.randn(D, M) / 28 b1 = np.zeros(M) W2 = np.random.randn(M, K) / np.sqrt(M) b2 = np.zeros(K) LL_momentum = [] CR_momentum = [] mu = 0.9 dW2 = 0 db2 = 0 dW1 = 0 db1 = 0 for i in range(max_iter): for j in range(n_batches): Xbatch = Xtrain[j*batch_sz:(j*batch_sz + batch_sz),:] Ybatch = Ytrain_ind[j*batch_sz:(j*batch_sz + batch_sz),:] pYbatch, Z = forward(Xbatch, W1, b1, W2, b2) # updates dW2 = mu*dW2 - lr*(derivative_w2(Z, Ybatch, pYbatch) + reg*W2) W2 += dW2 db2 = mu*db2 - lr*(derivative_b2(Ybatch, pYbatch) + reg*b2) b2 += db2 dW1 = mu*dW1 - lr*(derivative_w1(Xbatch, Z, Ybatch, pYbatch, W2) + reg*W1) W1 += dW1 db1 = mu*db1 - lr*(derivative_b1(Z, Ybatch, pYbatch, W2) + reg*b1) b1 += db1 if j % print_period == 0: # calculate just for LL pY, _ = forward(Xtest, W1, b1, W2, b2) # print "pY:", pY ll = cost(pY, Ytest_ind) LL_momentum.append(ll) print("Cost at iteration i=%d, j=%d: %.6f" % (i, j, ll)) err = error_rate(pY, Ytest) CR_momentum.append(err) print("Error rate:", err) pY, _ = forward(Xtest, W1, b1, W2, b2) print("Final error rate:", error_rate(pY, Ytest)) # 3. 
batch GD w/ Nesterov momentum W1 = np.random.randn(D, M) / 28 b1 = np.zeros(M) W2 = np.random.randn(M, K) / np.sqrt(M) b2 = np.zeros(K) LL_nest = [] CR_nest = [] mu = 0.9 # dW2 = 0 # db2 = 0 # dW1 = 0 # db1 = 0 vW2 = 0 vb2 = 0 vW1 = 0 vb1 = 0 for i in range(max_iter): for j in range(n_batches): # because we want g(t) = grad(f(W(t-1) - lr*mu*dW(t-1))) # dW(t) = mu*dW(t-1) + g(t) # W(t) = W(t-1) - mu*dW(t) W1_tmp = W1 - lr*mu*vW1 b1_tmp = b1 - lr*mu*vb1 W2_tmp = W2 - lr*mu*vW2 b2_tmp = b2 - lr*mu*vb2 Xbatch = Xtrain[j*batch_sz:(j*batch_sz + batch_sz),:] Ybatch = Ytrain_ind[j*batch_sz:(j*batch_sz + batch_sz),:] pYbatch, Z = forward(Xbatch, W1_tmp, b1_tmp, W2_tmp, b2_tmp) # updates vW2 = mu*vW2 + derivative_w2(Z, Ybatch, pYbatch) + reg*W2_tmp W2 -= lr*vW2 vb2 = mu*vb2 + derivative_b2(Ybatch, pYbatch) + reg*b2_tmp b2 -= lr*vb2 vW1 = mu*vW1 + derivative_w1(Xbatch, Z, Ybatch, pYbatch, W2_tmp) + reg*W1_tmp W1 -= lr*vW1 vb1 = mu*vb1 + derivative_b1(Z, Ybatch, pYbatch, W2_tmp) + reg*b1_tmp b1 -= lr*vb1 if j % print_period == 0: # calculate just for LL pY, _ = forward(Xtest, W1, b1, W2, b2) # print "pY:", pY ll = cost(pY, Ytest_ind) LL_nest.append(ll) print("Cost at iteration i=%d, j=%d: %.6f" % (i, j, ll)) err = error_rate(pY, Ytest) CR_nest.append(err) print("Error rate:", err) pY, _ = forward(Xtest, W1, b1, W2, b2) print("Final error rate:", error_rate(pY, Ytest)) plt.plot(LL_batch, label="batch") plt.plot(LL_momentum, label="momentum") plt.plot(LL_nest, label="nesterov") plt.legend() plt.show()
def main(): max_iter = 20 print_period = 10 X_train, X_test, t_train, t_test = get_normalized_data() T_train = T_indicator(t_train) T_test = T_indicator(t_test) lr = 0.00004 reg = 0.01 N, D = X_train.shape batch_sz = 500 nb_batches = N // batch_sz M = 300 K = 10 print( 'N_train = {}\t N_test = 1000\t D = {}\t M = {}\t K = {}\t batch_size = {}\t nb_batches = {}\t lr_cst = {}\n' .format(N, D, M, K, batch_sz, nb_batches, lr)) # np.sqrt(D) ~ 28 W0 = np.random.randn(D, M) / 28 b0 = np.zeros(M) W1 = np.random.randn(M, K) / np.sqrt(M) b1 = np.zeros(K) # 1. CONSTANT LEARNING RATE print('CONSTANT LEARNING RATE') #t0 = datetime.now() J_constant_lr = [] # measured on test data every 10 batches accuracy_constant_lr = [] # measured on test data every 10 batches for epoch in range(max_iter): for batch_index in range(nb_batches): X_batch = X_train[batch_index * batch_sz:(batch_index + 1) * batch_sz, ] T_batch = T_train[batch_index * batch_sz:(batch_index + 1) * batch_sz, ] A_batch, Y_batch = forward(X_batch, W0, b0, W1, b1) # Updates W1 -= lr * J_derivative_W1(T_batch, Y_batch, A_batch) b1 -= lr * J_derivative_b1(T_batch, Y_batch) W0 -= lr * J_derivative_W0(T_batch, Y_batch, W1, A_batch, X_batch) b0 -= lr * J_derivative_b0(T_batch, Y_batch, W1, A_batch) if (batch_index % print_period) == 0: _, Y_test = forward(X_test, W0, b0, W1, b1) j_test = J(T_test, Y_test) J_constant_lr.append(j_test) acc = accuracy(predict(Y_test), t_test) accuracy_constant_lr.append(acc) print( 'Epoch n° {} batch n° {}:\t TEST COST {}\t TEST ACCURACY RATE: {}' .format(epoch, batch_index, j_test, acc)) _, Y_test_final = forward(X_test, W0, b0, W1, b1) print('Final ACCURACY RATE on TEST data: {}\n'.format( accuracy(predict(Y_test_final), t_test))) #print('Constant lr execution time: {}\n'.format(datetime.now() - t0)) # 2. 
RMSProp print('RMSProp') #t0 = datetime.now() W0 = np.random.randn(D, M) / 28 b0 = np.zeros(M) W1 = np.random.randn(M, K) / np.sqrt(M) b1 = np.zeros(K) J_RMSProp = [] accuracy_RMSProp = [] lr0 = 0.001 #if you set the initial lr too high you'll get NaN cache_W1 = 0 cache_b1 = 0 cache_W0 = 0 cache_b0 = 0 decay = 0.999 eps = 0.000001 for epoch in range(max_iter): for b_index in range(nb_batches): X_batch = X_train[b_index * batch_sz:(b_index + 1) * batch_sz, ] T_batch = T_train[b_index * batch_sz:(b_index + 1) * batch_sz, ] A_batch, Y_batch = forward(X_batch, W0, b0, W1, b1) # Updates gW1 = J_derivative_W1(T_batch, Y_batch, A_batch) + reg * W1 cache_W1 = decay * cache_W1 + (1 - decay) * gW1 * gW1 W1 -= lr0 / (np.sqrt(cache_W1) + eps) * gW1 gb1 = J_derivative_b1(T_batch, Y_batch) + reg * b1 cache_b1 = decay * cache_b1 + (1 - decay) * gb1 * gb1 b1 -= lr0 / (np.sqrt(cache_b1) + eps) * gb1 gW0 = J_derivative_W0(T_batch, Y_batch, W1, A_batch, X_batch) + reg * W0 cache_W0 = decay * cache_W0 + (1 - decay) * gW0 * gW0 W0 -= lr0 / (np.sqrt(cache_W0) + eps) * gW0 gb0 = J_derivative_b0(T_batch, Y_batch, W1, A_batch) + reg * b0 cache_b0 = decay * cache_b0 + (1 - decay) * gb0 * gb0 b0 -= lr0 / (np.sqrt(cache_b0) + eps) * gb0 if (b_index % 10) == 0: _, Y_test = forward(X_test, W0, b0, W1, b1) j_test = J(T_test, Y_test) J_RMSProp.append(j_test) acc = accuracy(predict(Y_test), t_test) accuracy_RMSProp.append(acc) print( 'Epoch n° {} Batch n°{}:\t TEST COST: {}\t TEST ACCURACY RATE: {}' .format(epoch, b_index, j_test, acc)) _, Y_test_final = forward(X_test, W0, b0, W1, b1) print('Final accuracy rate on test data: {}'.format( accuracy(predict(Y_test_final), t_test))) #print('Constant lr execution time: {}'.format(datetime.now() - t0)) plt.plot(J_constant_lr, label='constant lr') plt.plot(J_RMSProp, label='RMSProp') plt.legend() plt.savefig('RMSProp.png')
def momentum_batch(): """ use util functions to run the logistic classification with bp """ X_train, Y_train, X_test, Y_test = get_transformed_digit() N,D = X_train.shape yindi_train = y2indicator(Y_train) yindi_test = y2indicator(Y_test) M = 300 K = 10 # W = np.random.rand(D,M) # b = np.random.rand(M) W1 = np.random.rand(D,M)/np.sqrt(D) b1 = np.zeros(M) W2 = np.random.rand(M,K)/np.sqrt(M) b2 = np.zeros(K) cost_test = [] error_test = [] eta = 0.00004 penalty = 0.001 batch_size = 500 batch_num = N // batch_size mu = 0.9 vw2 = 0 vb2 = 0 vw1 = 0 vb1 = 0 t1 = time.time() #batch for i in range(100): X_shuffle,Y_train_shuffle = shuffle(X_train,yindi_train) for ii in range(int(batch_num)): # x_tem = X_shuffle[ii].reshape(1,D) # y_tem = Y_train_shuffle[ii].reshape(1,10) x_tem = X_shuffle[int(i*batch_size):int((i+1)*batch_size)] y_tem = Y_train_shuffle[int(i*batch_size):int((i+1)*batch_size)] # y_fit = forward(x = x_tem,w=W,b=b) y_fit, z = forward(x = x_tem, w1 = W1, b1 = b1, w2 = W2, b2 = b2, method = 'relu') #the only change to benchmark batch is the update rule: gw2 = deri_w2(z = z, y= y_fit,t = y_tem) + penalty * W2 gb2 = deri_b2(y = y_fit, t = y_tem) + penalty*b2 gw1 = deri_w1(X = x_tem,Z = z,T = y_tem, Y = y_fit, W2 = W2) + penalty*W1 gb1 = eta*(deri_b1(Z = z,T = y_tem, Y = y_fit,W2= W2) + penalty*b1) vw2 = mu*vw2 - eta * gw2 vb2 = mu*vb2 - eta * gb2 vw1 = mu*vw1 - eta * gw1 vb1 = mu*vb1 - eta * gb1 W2 += vw2 b2 += vb2 W1 += vw1 b1 += vb1 p_y_test,_ = forward(x = X_test,w1 = W1, b1=b1,w2= W2, b2 = b2,method = 'relu') cost_test_tem = cost(y_matrix = p_y_test,t_matrix = yindi_test) cost_test.append(cost_test_tem) error_tem = error_rate(y_matrix = p_y_test, target = Y_test) print("the error rate in "+str(i)+" is :"+str(error_tem)) t2 = time.time() print("the whole process takes "+str(t2-t1)+" seconds") p_y_final,_ = forward(x = X_test,w1 = W1, b1=b1,w2= W2, b2 = b2,method = 'relu') error_final = error_rate(y_matrix = p_y_final, target = Y_test) print("the final error rate is "+str(error_final))
def main(): # compare 3 scenarios: # 1. batch SGD # 2. batch SGD with momentum # 3. batch SGD with Nesterov momentum max_iter = 20 # make it 30 for sigmoid print_period = 10 X, Y = get_normalized_data() lr = 0.00004 reg = 0.01 Xtrain = X[:-1000,] Ytrain = Y[:-1000] Xtest = X[-1000:,] Ytest = Y[-1000:] Ytrain_ind = y2indicator(Ytrain) Ytest_ind = y2indicator(Ytest) N, D = Xtrain.shape batch_sz = 500 n_batches = N / batch_sz M = 300 K = 10 W1 = np.random.randn(D, M) / 28 b1 = np.zeros(M) W2 = np.random.randn(M, K) / np.sqrt(M) b2 = np.zeros(K) # 1. batch # cost = -16 LL_batch = [] CR_batch = [] for i in xrange(max_iter): for j in xrange(n_batches): Xbatch = Xtrain[j*batch_sz:(j*batch_sz + batch_sz),] Ybatch = Ytrain_ind[j*batch_sz:(j*batch_sz + batch_sz),] pYbatch, Z = forward(Xbatch, W1, b1, W2, b2) # print "first batch cost:", cost(pYbatch, Ybatch) # updates W2 -= lr*(derivative_w2(Z, Ybatch, pYbatch) + reg*W2) b2 -= lr*(derivative_b2(Ybatch, pYbatch) + reg*b2) W1 -= lr*(derivative_w1(Xbatch, Z, Ybatch, pYbatch, W2) + reg*W1) b1 -= lr*(derivative_b1(Z, Ybatch, pYbatch, W2) + reg*b1) if j % print_period == 0: # calculate just for LL pY, _ = forward(Xtest, W1, b1, W2, b2) # print "pY:", pY ll = cost(pY, Ytest_ind) LL_batch.append(ll) print "Cost at iteration i=%d, j=%d: %.6f" % (i, j, ll) err = error_rate(pY, Ytest) CR_batch.append(err) print "Error rate:", err pY, _ = forward(Xtest, W1, b1, W2, b2) print "Final error rate:", error_rate(pY, Ytest) # 2. batch with momentum W1 = np.random.randn(D, M) / 28 b1 = np.zeros(M) W2 = np.random.randn(M, K) / np.sqrt(M) b2 = np.zeros(K) LL_momentum = [] CR_momentum = [] mu = 0.9 dW2 = 0 db2 = 0 dW1 = 0 db1 = 0 for i in xrange(max_iter): for j in xrange(n_batches): Xbatch = Xtrain[j*batch_sz:(j*batch_sz + batch_sz),] Ybatch = Ytrain_ind[j*batch_sz:(j*batch_sz + batch_sz),] pYbatch, Z = forward(Xbatch, W1, b1, W2, b2) # updates dW2 = mu*dW2 - lr*(derivative_w2(Z, Ybatch, pYbatch) + reg*W2) W2 += dW2 db2 = mu*db2 - lr*(derivative_b2(Ybatch, pYbatch) + reg*b2) b2 += db2 dW1 = mu*dW1 - lr*(derivative_w1(Xbatch, Z, Ybatch, pYbatch, W2) + reg*W1) W1 += dW1 db1 = mu*db1 - lr*(derivative_b1(Z, Ybatch, pYbatch, W2) + reg*b1) b1 += db1 if j % print_period == 0: # calculate just for LL pY, _ = forward(Xtest, W1, b1, W2, b2) # print "pY:", pY ll = cost(pY, Ytest_ind) LL_momentum.append(ll) print "Cost at iteration i=%d, j=%d: %.6f" % (i, j, ll) err = error_rate(pY, Ytest) CR_momentum.append(err) print "Error rate:", err pY, _ = forward(Xtest, W1, b1, W2, b2) print "Final error rate:", error_rate(pY, Ytest) # 3. 
batch with Nesterov momentum W1 = np.random.randn(D, M) / 28 b1 = np.zeros(M) W2 = np.random.randn(M, K) / np.sqrt(M) b2 = np.zeros(K) LL_nest = [] CR_nest = [] mu = 0.9 dW2 = 0 db2 = 0 dW1 = 0 db1 = 0 for i in xrange(max_iter): for j in xrange(n_batches): Xbatch = Xtrain[j*batch_sz:(j*batch_sz + batch_sz),] Ybatch = Ytrain_ind[j*batch_sz:(j*batch_sz + batch_sz),] pYbatch, Z = forward(Xbatch, W1, b1, W2, b2) # updates dW2 = mu*mu*dW2 - (1 + mu)*lr*(derivative_w2(Z, Ybatch, pYbatch) + reg*W2) W2 += dW2 db2 = mu*mu*db2 - (1 + mu)*lr*(derivative_b2(Ybatch, pYbatch) + reg*b2) b2 += db2 dW1 = mu*mu*dW1 - (1 + mu)*lr*(derivative_w1(Xbatch, Z, Ybatch, pYbatch, W2) + reg*W1) W1 += dW1 db1 = mu*mu*db1 - (1 + mu)*lr*(derivative_b1(Z, Ybatch, pYbatch, W2) + reg*b1) b1 += db1 if j % print_period == 0: # calculate just for LL pY, _ = forward(Xtest, W1, b1, W2, b2) # print "pY:", pY ll = cost(pY, Ytest_ind) LL_nest.append(ll) print "Cost at iteration i=%d, j=%d: %.6f" % (i, j, ll) err = error_rate(pY, Ytest) CR_nest.append(err) print "Error rate:", err pY, _ = forward(Xtest, W1, b1, W2, b2) print "Final error rate:", error_rate(pY, Ytest) plt.plot(LL_batch, label="batch") plt.plot(LL_momentum, label="momentum") plt.plot(LL_nest, label="nesterov") plt.legend() plt.show()
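The mu*mu and (1 + mu) coefficients in the Nesterov section above come from rewriting the look-ahead update so that it is applied directly to the stored parameters. A minimal sketch of that algebra for one parameter x (variable names here are illustrative):

# classic Nesterov: the gradient g is taken at the look-ahead point x + mu*v
#   v_new = mu*v - lr*g
#   x_new = x + v_new
# tracking x_ahead = x + mu*v instead of x, and renaming it back to x, gives
#   x_ahead_new = x_ahead + mu*mu*v - (1 + mu)*lr*g   # the coefficients used above
def nesterov_step(x, v, g, lr=0.00004, mu=0.9):
    v_new = mu * v - lr * g
    x_new = x + mu * mu * v - (1 + mu) * lr * g
    return x_new, v_new

Note that the script stores the whole parameter increment back into dW2/dW1 and reuses it as the next velocity, whereas in the rewrite above the velocity v_new = mu*v - lr*g is tracked separately; the two recursions are close but not identical.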
#Values for grid search nh = np.array([2, 4, 8, 10]) #Number of nodes in hidden layer et = np.array([0.001, 0.01, 0.1]) #Learning rate train_accuracy = np.zeros((len(nh), len(et)), dtype=np.float64) test_accuracy = np.zeros((len(nh), len(et)), dtype=np.float64) for i, n in enumerate(nh): for j, e in enumerate(et): net = mlp.mlp(x_train, y_train, nhidden=n, eta=e, linear=True) #use a separate name so the mlp module is not shadowed net.earlystopping(x_train, y_train, x_test, y_test) preds_train = [] preds_test = [] for k in x_train: pred = net.forward(k) preds_train.append(pred) for k in x_test: pred = net.forward(k) preds_test.append(pred) train_accuracy[i, j] = r2_score(y_train, preds_train) test_accuracy[i, j] = r2_score(y_test, preds_test) plot_data(et, nh, train_accuracy) plot_data(et, nh, test_accuracy)
def benchmark_batch(): """ use util functions to run the logistic classification with bp """ X_train, Y_train, X_test, Y_test = get_transformed_digit() N,D = X_train.shape yindi_train = y2indicator(Y_train) yindi_test = y2indicator(Y_test) M = 300 K = 10 # W = np.random.rand(D,M) # b = np.random.rand(M) W1 = np.random.rand(D,M)/np.sqrt(D) b1 = np.zeros(M) W2 = np.random.rand(M,K)/np.sqrt(M) b2 = np.zeros(K) cost_test = [] error_test = [] eta = 0.00004 penalty = 0.001 batch_size = 500 batch_num = N // batch_size t1 = time.time() #batch for i in range(100): X_shuffle,Y_train_shuffle = shuffle(X_train,yindi_train) for ii in range(int(batch_num)): # x_tem = X_shuffle[ii].reshape(1,D) # y_tem = Y_train_shuffle[ii].reshape(1,10) x_tem = X_shuffle[int(i*batch_size):int((i+1)*batch_size)] y_tem = Y_train_shuffle[int(i*batch_size):int((i+1)*batch_size)] # y_fit = forward(x = x_tem,w=W,b=b) y_fit, z = forward(x = x_tem, w1 = W1, b1 = b1, w2 = W2, b2 = b2, method = 'relu') W2 -= eta*(deri_w2(z = z, y= y_fit,t = y_tem) + penalty * W2) b2 -= eta*(deri_b2(y = y_fit, t = y_tem) + penalty*b2) W1 -= eta*(deri_w1(X = x_tem,Z = z,T = y_tem, Y = y_fit, W2 = W2) + penalty*W1 ) b1 -= eta*(deri_b1(Z = z,T = y_tem, Y = y_fit,W2= W2) + penalty*b1) # W2 -= eta*(deri_w2(z = z, y= y_fit,t = y_tem) ) # b2 -= eta*(deri_b2(y = y_fit, t = y_tem) ) # W1 -= eta*(deri_w1(X = x_tem,Z = z,T = y_tem, Y = y_fit, W2 = W2) ) # b1 -= eta*(deri_b1(Z = z,T = y_tem, Y = y_fit,W2= W2)) # W += eta*(deri_w(t_matrix = y_tem, y_matrix = y_fit,x = x_tem)-penalty*W) # b += eta*(deri_b(t_matrix = y_tem, y_matrix = y_fit)-penalty*b) p_y_test,_ = forward(x = X_test,w1 = W1, b1=b1,w2= W2, b2 = b2,method = 'relu') cost_test_tem = cost(y_matrix = p_y_test,t_matrix = yindi_test) cost_test.append(cost_test_tem) error_tem = error_rate(y_matrix = p_y_test, target = Y_test) print("the error rate in "+str(i)+" is :"+str(error_tem)) t2 = time.time() print("the whole process takes "+str(t2-t1)+" seconds") p_y_final,_ = forward(x = X_test,w1 = W1, b1=b1,w2= W2, b2 = b2,method = 'relu') error_final = error_rate(y_matrix = p_y_final, target = Y_test) print("the final error rate is "+str(error_final))
def main(): # 3 scenarios # 1. batch SGD # 2. batch SGD with momentum # 3. batch SGD with Nesterov momentum max_iter = 15 print_period = 10 Xtrain, Xtest, Ytrain, Ytest = get_normalized_data() lr = 0.0001 reg = 0.001 # Xtrain = X[:-1000, ] # Ytrain = Y[:-1000] # Xtest = X[-1000:, ] # Ytest = Y[-1000:, ] Ytrain_ind = y2indicator(Ytrain) Ytest_ind = y2indicator(Ytest) N, D = Xtrain.shape batch_sz = 500 n_batches = int(N / batch_sz) M = 300 K = 10 W1 = np.random.randn(D, M) / 28 b1 = np.zeros(M) W2 = np.random.randn(M, K) / np.sqrt(M) b2 = np.zeros(K) W1_0 = W1.copy() b1_0 = b1.copy() W2_0 = W2.copy() b2_0 = b2.copy() # Batch losses_batch = [] error_batch = [] for i in range(max_iter): for j in range(n_batches): Xbatch = Xtrain[j * batch_sz:(j * batch_sz + batch_sz), ] Ybatch = Ytrain_ind[j * batch_sz:(j * batch_sz + batch_sz), ] pYbatch, Z = forward(Xbatch, W1, b1, W2, b2) # updates W2 -= lr * (derivative_w2(Z, Ybatch, pYbatch) + reg * W2) b2 -= lr * (derivative_b2(Ybatch, pYbatch) + reg * b2) W1 -= lr * (derivative_w1(Xbatch, Z, Ybatch, pYbatch, W2) + reg * W1) b1 -= lr * (derivative_b1(Z, Ybatch, pYbatch, W2) + reg * b1) # A = ' ' # A = u"\n| |\n|----------------------| \n(\\__/) || \n(• v •) || \n / D" if j % print_period == 0: pY, _ = forward(Xtest, W1, b1, W2, b2) l = cost(pY, Ytest_ind) losses_batch.append(l) print("Cost at iteration i=%d, j=%d: %.6f" % (i, j, l)) # print( # u"|----------------------|\n| | \n Costo # en i=%d, j=%d: \n %.6f" % (i, j, l) + A) e = error_rate(pY, Ytest) error_batch.append(e) print("Error rate:", e) pY, _ = forward(Xtest, W1, b1, W2, b2) print("Final error rate: ", error_rate(pY, Ytest)) # Momentum W1 = W1_0.copy() b1 = b1_0.copy() W2 = W2_0.copy() b2 = b2_0.copy() losses_momentum = [] errors_momentum = [] mu = 0.9 dW2 = 0 db2 = 0 dW1 = 0 db1 = 0 for i in range(max_iter): for j in range(n_batches): Xbatch = Xtrain[j * batch_sz:(j * batch_sz + batch_sz), ] Ybatch = Ytrain_ind[j * batch_sz:(j * batch_sz + batch_sz), ] pYbatch, Z = forward(Xbatch, W1, b1, W2, b2) # gradients gW2 = (derivative_w2(Z, Ybatch, pYbatch) + reg * W2) gb2 = (derivative_b2(Ybatch, pYbatch) + reg * b2) gW1 = (derivative_w1(Xbatch, Z, Ybatch, pYbatch, W2) + reg * W1) gb1 = (derivative_b1(Z, Ybatch, pYbatch, W2) + reg * b1) # update velocities dW2 = mu * dW2 - lr * gW2 db2 = mu * db2 - lr * gb2 dW1 = mu * dW1 - lr * gW1 db1 = mu * db1 - lr * gb1 W2 += dW2 b2 += db2 W1 += dW1 b1 += db1 if j % print_period == 0: pY, _ = forward(Xtest, W1, b1, W2, b2) l = cost(pY, Ytest_ind) losses_momentum.append(l) print("Cost at iteration i=%d, j=%d: %.6f" % (i, j, l)) e = error_rate(pY, Ytest) errors_momentum.append(e) print("Error rate:", e) pY, _ = forward(Xtest, W1, b1, W2, b2) print("Final error rate: ", error_rate(pY, Ytest)) # Nesterov momentum W1 = W1_0.copy() b1 = b1_0.copy() W2 = W2_0.copy() b2 = b2_0.copy() losses_nesterov = [] errors_nesterov = [] mu = 0.9 vW2 = 0 vb2 = 0 vW1 = 0 vb1 = 0 for i in range(max_iter): for j in range(n_batches): Xbatch = Xtrain[j * batch_sz:(j * batch_sz + batch_sz), ] Ybatch = Ytrain_ind[j * batch_sz:(j * batch_sz + batch_sz), ] pYbatch, Z = forward(Xbatch, W1, b1, W2, b2) # gradients gW2 = (derivative_w2(Z, Ybatch, pYbatch) + reg * W2) gb2 = (derivative_b2(Ybatch, pYbatch) + reg * b2) gW1 = (derivative_w1(Xbatch, Z, Ybatch, pYbatch, W2) + reg * W1) gb1 = (derivative_b1(Z, Ybatch, pYbatch, W2) + reg * b1) # v update vW2 = mu * vW2 - lr * gW2 vb2 = mu * vb2 - lr * gb2 vW1 = mu * vW1 - lr * gW1 vb1 = mu * vb1 - lr * gb1 # param update W2 += mu * vW2 - lr * gW2 b2 += mu * vb2
- lr * gb2 W1 += mu * vW1 - lr * gW1 b1 += mu * vb1 - lr * gb1 if j % print_period == 0: pY, _ = forward(Xtest, W1, b1, W2, b2) l = cost(pY, Ytest_ind) losses_nesterov.append(l) print("Cost at iteration i=%d, j=%d: %.6f" % (i, j, l)) e = error_rate(pY, Ytest) errors_nesterov.append(e) print("Error rate:", e) pY, _ = forward(Xtest, W1, b1, W2, b2) print("Final error rate: ", error_rate(pY, Ytest)) plt.plot(losses_batch, label='batch') plt.plot(losses_momentum, label='momentum') plt.plot(losses_nesterov, label='Nesterov') plt.legend() plt.show()
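The two update rules compared above are easier to see in isolation. Below is a minimal sketch, NumPy only, of the classical-momentum velocity update and the Nesterov-style parameter step (param += mu*v - lr*g) used in these scripts; the quadratic gradient function, learning rate, and starting point are illustrative assumptions, not part of the original code.

import numpy as np

def grad(w):
    # illustrative gradient of a simple quadratic bowl 0.5 * w^T A w
    A = np.array([[3.0, 0.2], [0.2, 1.0]])
    return A @ w

w_m = np.array([5.0, -3.0])   # classical momentum
w_n = w_m.copy()              # Nesterov-style step
v_m = np.zeros(2)
v_n = np.zeros(2)
lr, mu = 0.1, 0.9

for _ in range(200):
    # classical momentum: update the velocity, then move by the velocity
    v_m = mu * v_m - lr * grad(w_m)
    w_m += v_m
    # Nesterov form used above: same velocity update, but the parameter
    # moves by mu*v - lr*g (a lookahead step)
    g = grad(w_n)
    v_n = mu * v_n - lr * g
    w_n += mu * v_n - lr * g

print(w_m, w_n)  # both should approach the minimum at the origin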
def main(): ''' RMSprop is a form of adaptive learning rate which decreases over time ''' max_iter = 20 # for ReLU # max_iter = 30 # for sigmoid print_period = 10 X, Y = get_normalized_data() lr = 0.0004 reg = 0.01 Xtrain = X[:-1000,] Ytrain = Y[:-1000] Xtest = X[-1000:,] Ytest = Y[-1000:] Ytrain_ind = y2indicator(Ytrain) Ytest_ind = y2indicator(Ytest) N, D = Xtrain.shape batch_sz = 500 n_batches = N // batch_sz M = 300 K = 10 # 1. batch SGD W1 = np.random.randn(D, M) / 28 b1 = np.zeros(M) W2 = np.random.randn(M, K) / np.sqrt(M) b2 = np.zeros(K) LL_batch = [] CR_batch = [] for i in range(max_iter): for j in range(n_batches): Xbatch = Xtrain[j*batch_sz:((j+1)*batch_sz), :] Ybatch = Ytrain_ind[j*batch_sz:((j+1)*batch_sz), :] pYbatch, Z = forward(Xbatch, W1, b1, W2, b2) W2 -= lr*(derivative_w2(Z, Ybatch, pYbatch) + reg*W2) b2 -= lr*(derivative_b2(Ybatch, pYbatch) + reg*b2) W1 -= lr*(derivative_w1(Xbatch, Z, Ybatch, pYbatch, W2) + reg*W1) b1 -= lr*(derivative_b1(Z, Ybatch, pYbatch, W2) + reg*b1) if j % print_period == 0: pY, _ = forward(Xtest, W1, b1, W2, b2) ll = cost(pY, Ytest_ind) LL_batch.append(ll) print("Cost at iteration i=%d, j=%d: %.6f" % (i, j, ll)) err = error_rate(pY, Ytest) CR_batch.append(err) print("Error rate:", err) pY, _ = forward(Xtest, W1, b1, W2, b2) print("Final error rate:", error_rate(pY, Ytest)) # 2. RMSProp W1 = np.random.randn(D, M) / 28 b1 = np.zeros(M) W2 = np.random.randn(M, K) / np.sqrt(M) b2 = np.zeros(K) LL_rms = [] CR_rms = [] lr0 = 0.001 cache_W2 = 0 cache_b2 = 0 cache_W1 = 0 cache_b1 = 0 decay_rate = 1 - 1e-5 eps = 1e-10 for i in range(max_iter): for j in range(n_batches): Xbatch = Xtrain[j*batch_sz:((j+1)*batch_sz), :] Ybatch = Ytrain_ind[j*batch_sz:((j+1)*batch_sz), :] pYbatch, Z = forward(Xbatch, W1, b1, W2, b2) # updates gW2 = derivative_w2(Z, Ybatch, pYbatch) + reg*W2 cache_W2 = decay_rate*cache_W2 + (1-decay_rate)*gW2*gW2 W2 -= lr0*gW2 / (np.sqrt(cache_W2) + eps) gb2 = derivative_b2(Ybatch, pYbatch) + reg*b2 cache_b2 = decay_rate*cache_b2 + (1-decay_rate)*gb2*gb2 b2 -= lr0*gb2 / (np.sqrt(cache_b2) + eps) gW1 = derivative_w1(Xbatch, Z, Ybatch, pYbatch, W2) + reg*W1 cache_W1 = decay_rate*cache_W1 + (1-decay_rate)*gW1*gW1 W1 -= lr0*gW1 / (np.sqrt(cache_W1) + eps) gb1 = derivative_b1(Z, Ybatch, pYbatch, W2) + reg*b1 cache_b1 = decay_rate*cache_b1 + (1-decay_rate)*gb1*gb1 b1 -= lr0*gb1 / (np.sqrt(cache_b1) + eps) if j % print_period == 0: pY, _ = forward(Xtest, W1, b1, W2, b2) ll = cost(pY, Ytest_ind) LL_rms.append(ll) print("RMS Cost at iteration i=%d, j=%d: %.6f" % (i, j, ll)) err = error_rate(pY, Ytest) CR_rms.append(err) print("RMS Error rate:", err) pY, _ = forward(Xtest, W1, b1, W2, b2) print("RMS Final error rate:", error_rate(pY, Ytest)) plt.plot(LL_batch, label='batch') plt.plot(LL_rms, label='rms') plt.legend() plt.show()
def main(): max_iter = 20 print_period = 10 Xtrain, Xtest, Ytrain, Ytest = get_normalized_data() lr = 0.00004 reg = 0.01 Ytrain_ind = y2indicator(Ytrain) Ytest_ind = y2indicator(Ytest) N, D = Xtrain.shape batch_sz = 500 n_batches = N // batch_sz M = 300 K = 10 W1 = np.random.randn(D, M) / np.sqrt(D) b1 = np.zeros(M) W2 = np.random.randn(M, K) / np.sqrt(M) b2 = np.zeros(K) #const LL_batch = [] CR_batch = [] for i in range(max_iter): for j in range(n_batches): Xbatch = Xtrain[j*batch_sz: (j*batch_sz + batch_sz),] Ybatch = Ytrain_ind[j*batch_sz: (j*batch_sz + batch_sz), ] pYbatch, Z = forward(Xbatch, W1, b1, W2, b2) #gradient gW2 = derivative_w2(Z, Ybatch, pYbatch) + reg*W2 gb2 = derivative_b2(Ybatch, pYbatch) + reg*b2 gW1 = derivative_w1(Xbatch, Z, Ybatch, pYbatch, W2) + reg*W1 gb1 = derivative_b1(Z, Ybatch, pYbatch, W2) + reg*b1 #updates W2 -= lr*gW2 b2 -= lr*gb2 W1 -= lr*gW1 b1 -= lr*gb1 if j%print_period == 0: pY, _ = forward(Xtest, W1, b1, W2, b2) ll = cost(pY, Ytest_ind) LL_batch.append(ll) print("Cost at iteration i=%d, j=%d: %.6f" % (i, j, ll)) err = error_rate(pY, Ytest) CR_batch.append(err) print("Error rate:", err) pY, _ = forward(Xtest, W1, b1, W2, b2) print("Final error rate:", error_rate(pY, Ytest)) #RMSprop W1 = np.random.randn(D, M) / np.sqrt(D) b1 = np.zeros(M) W2 = np.random.randn(M, K) / np.sqrt(M) b2 = np.zeros(K) #const LL_rms = [] CR_rms = [] lr0 = 0.001 cache_W2 = 1 cache_b2 = 1 cache_W1 = 1 cache_b1 = 1 decay_rate = 0.999 eps = 1e-10 for i in range(max_iter): for j in range(n_batches): Xbatch = Xtrain[j*batch_sz: (j*batch_sz + batch_sz),] Ybatch = Ytrain_ind[j*batch_sz: (j*batch_sz + batch_sz), ] pYbatch, Z = forward(Xbatch, W1, b1, W2, b2) #updates gW2 = derivative_w2(Z, Ybatch, pYbatch) + reg*W2 cache_W2 = decay_rate*cache_W2 + (1 - decay_rate)*gW2*gW2 W2 -= lr0*gW2 / (np.sqrt(cache_W2) + eps) gb2 = derivative_b2(Ybatch, pYbatch) + reg*b2 cache_b2 = decay_rate*cache_b2 + (1 - decay_rate)*gb2*gb2 b2 -= lr0 * gb2 / (np.sqrt(cache_b2) + eps) gW1 = derivative_w1(Xbatch, Z, Ybatch, pYbatch, W2) + reg*W1 cache_W1 = decay_rate*cache_W1 + (1 - decay_rate)*gW1*gW1 W1 -= lr0 * gW1 / (np.sqrt(cache_W1) + eps) gb1 = derivative_b1(Z, Ybatch, pYbatch, W2) + reg*b1 cache_b1 = decay_rate*cache_b1 + (1 - decay_rate)*gb1*gb1 b1 -= lr0 * gb1 / (np.sqrt(cache_b1) + eps) if j%print_period == 0: pY, _ = forward(Xtest, W1, b1, W2, b2) ll = cost(pY, Ytest_ind) LL_rms.append(ll) print("Cost at iteration i=%d, j=%d: %.6f" % (i, j, ll)) err = error_rate(pY, Ytest) CR_rms.append(err) print("Error rate:", err) pY, _ = forward(Xtest, W1, b1, W2, b2) print("Final error rate:", error_rate(pY, Ytest)) plt.plot(LL_batch, label = 'const') plt.plot(LL_rms, label = 'rms') plt.legend() plt.show()
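For reference, the per-parameter RMSprop step repeated four times in the loops above can be written as one small helper. This is a sketch of the same cache/decay logic with placeholder names (param, g, cache), not a drop-in replacement for the scripts.

import numpy as np

def rmsprop_step(param, g, cache, lr0=0.001, decay_rate=0.999, eps=1e-10):
    # cache is a running (leaky) average of the squared gradient
    cache = decay_rate * cache + (1 - decay_rate) * g * g
    # each element gets an effective learning rate of roughly lr0 / sqrt(cache)
    param = param - lr0 * g / (np.sqrt(cache) + eps)
    return param, cache

# usage sketch, one step for W2:
# W2, cache_W2 = rmsprop_step(W2, gW2, cache_W2)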
def main(): max_iter = 10 print_period = 10 X_train, X_test, Y_train, Y_test = get_normalized_data() reg = 0.01 Y_train_ind = y2indicator(Y_train) Y_test_ind = y2indicator(Y_test) N, D = X_train.shape batch_sz = 500 n_batches = N // batch_sz M = 300 K = 10 W1_0 = np.random.randn(D, M) / np.sqrt(D) b1_0 = np.zeros(M) W2_0 = np.random.randn(M, K) / np.sqrt(K) b2_0 = np.zeros(K) # .1 Adam W1 = W1_0.copy() W2 = W2_0.copy() b1 = b1_0.copy() b2 = b2_0.copy() losses_adam = [] errors_adam = [] # 1st moment mW1 = 0 mW2 = 0 mb1 = 0 mb2 = 0 # 2nd moment vW1 = 0 vW2 = 0 vb1 = 0 vb2 = 0 # Hyperparams eps = 1e-8 lr = 0.001 beta1 = 0.9 beta2 = 0.999 t = 1 for i in range(max_iter): for j in range(n_batches): Xbatch = X_train[j * batch_sz:(j * batch_sz + batch_sz), ] Ybatch = Y_train_ind[j * batch_sz:(j * batch_sz + batch_sz), ] pYbatch, Z = forward(Xbatch, W1, b1, W2, b2) # updates # gradients gW2 = derivative_w2(Z, Ybatch, pYbatch) + reg * W2 gb2 = derivative_b2(Ybatch, pYbatch) + reg * b2 gW1 = derivative_w1(Xbatch, Z, Ybatch, pYbatch, W2) + reg * W1 gb1 = derivative_b1(Z, Ybatch, pYbatch, W2) + reg * b1 # new m mW1 = beta1 * mW1 + (1 - beta1) * gW1 mb1 = beta1 * mb1 + (1 - beta1) * gb1 mW2 = beta1 * mW2 + (1 - beta1) * gW2 mb2 = beta1 * mb2 + (1 - beta1) * gb2 # new v vW2 = beta2 * vW2 + (1 - beta2) * gW2 * gW2 vb2 = beta2 * vb2 + (1 - beta2) * gb2 * gb2 vW1 = beta2 * vW1 + (1 - beta2) * gW1 * gW1 vb1 = beta2 * vb1 + (1 - beta2) * gb1 * gb1 # bias correction correction1 = 1 - beta1 ** t mW1_hat = mW1 / correction1 mb1_hat = mb1 / correction1 mW2_hat = mW2 / correction1 mb2_hat = mb2 / correction1 # correction2 = 1 - beta2 ** t vb2_hat = vb2 / correction2 vb1_hat = vb1 / correction2 vW2_hat = vW2 / correction2 vW1_hat = vW1 / correction2 t += 1 # weights W1 = W1 - lr * mW1_hat / np.sqrt(vW1_hat + eps) b1 = b1 - lr * mb1_hat / np.sqrt(vb1_hat + eps) W2 = W2 - lr * mW2_hat / np.sqrt(vW2_hat + eps) b2 = b2 - lr * mb2_hat / np.sqrt(vb2_hat + eps) if j % print_period == 0: pY, _ = forward(X_test, W1, b1, W2, b2) l = cost(pY, Y_test_ind) losses_adam.append(l) print(f'Adam Cost at iteration i={i}, j={j} : {l}') e = error_rate(pY, Y_test) errors_adam.append(e) print("error_rate", e) pY, _ = forward(X_test, W1, b1, W2, b2) adam_error = error_rate(pY, Y_test) # 3. 
RMSProp with momentum W1 = W1_0.copy() b1 = b1_0.copy() W2 = W2_0.copy() b2 = b2_0.copy() losses_rms = [] errors_rms = [] # comparable hyper parameters for fair lr0 = 0.001 mu = 0.9 decay_rate = 0.999 eps = 1e-8 # rmsprop cache cache_W2 = 1 cache_b2 = 1 cache_W1 = 1 cache_b1 = 1 # momentum dW1 = 0 db1 = 0 dW2 = 0 db2 = 0 for i in range(max_iter): for j in range(n_batches): Xbatch = X_train[j * batch_sz:(j * batch_sz + batch_sz), ] Ybatch = Y_train_ind[j * batch_sz:(j * batch_sz + batch_sz), ] pYbatch, Z = forward(Xbatch, W1, b1, W2, b2) # updates gW2 = derivative_w2(Z, Ybatch, pYbatch) + reg * W2 cache_W2 = decay_rate * cache_W2 + (1 - decay_rate) * gW2 * gW2 dW2 = mu * dW2 + (1 - mu) * lr0 * gW2 / (np.sqrt(cache_W2) + eps) W2 -= dW2 gb2 = derivative_b2(Ybatch, pYbatch) + reg * b2 cache_b2 = decay_rate * cache_b2 + (1 - decay_rate) * gb2 * gb2 db2 = mu * db2 + (1 - mu) * lr0 * gb2 / (np.sqrt(cache_b2) + eps) b2 -= db2 gW1 = derivative_w1(Xbatch, Z, Ybatch, pYbatch, W2) + reg * W1 cache_W1 = decay_rate * cache_W1 + (1 - decay_rate) * gW1 * gW1 dW1 = mu * dW1 + (1 - mu) * lr0 * gW1 / (np.sqrt(cache_W1) + eps) W1 -= dW1 gb1 = derivative_b1(Z, Ybatch, pYbatch, W2) + reg * b1 cache_b1 = decay_rate * cache_b1 + (1 - decay_rate) * gb1 * gb1 db1 = mu * db1 + (1 - mu) * lr0 * gb1 / (np.sqrt(cache_b1) + eps) b1 -= db1 if j % print_period == 0: pY, _ = forward(X_test, W1, b1, W2, b2) l = cost(pY, Y_test_ind) losses_rms.append(l) print(f'Cost at iteration i={i}, j={j} : {l}') err = error_rate(pY, Y_test) errors_rms.append(err) print("Error rate:", err) pY, _ = forward(X_test, W1, b1, W2, b2) rms_error = error_rate(pY, Y_test) print(f"Final RMSProp error rate: {rms_error}") print(f"Final Adam error rate: {adam_error}") plt.plot(losses_adam, label='batch cost') plt.plot(losses_rms, label='RMSProp cost') plt.legend() plt.show()
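The RMSprop-with-momentum variant above is close to Adam without bias correction: it keeps a velocity on the RMSprop-preconditioned gradient instead of bias-corrected first and second moments. A minimal sketch of one such step, with placeholder names and the same hyperparameters used above:

import numpy as np

def rmsprop_momentum_step(param, g, cache, velocity,
                          lr0=0.001, mu=0.9, decay_rate=0.999, eps=1e-8):
    # leaky average of squared gradients (RMSprop cache)
    cache = decay_rate * cache + (1 - decay_rate) * g * g
    # velocity on the preconditioned gradient
    velocity = mu * velocity + (1 - mu) * lr0 * g / (np.sqrt(cache) + eps)
    return param - velocity, cache, velocity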
def main(): # Compare 3 scenarios: # 1. batch SGD # 2. batch SGD with momentum # 3. batch SGD with Nestrov momentum max_iter = 30 print_period = 10 X_train, X_test, Y_train, Y_test = get_normalized_data() lr = 0.00004 reg = 0.01 Y_train_ind = y2indicator(Y_train) Y_test_ind = y2indicator(Y_test) N, D = X_train.shape batch_sz = 500 n_batches = N // batch_sz M = 300 K = 10 W1 = np.random.randn(D, M) / 28 b1 = np.zeros(M) W2 = np.random.randn(M, K) / np.sqrt(M) b2 = np.zeros(K) # save initial weights W1_0 = W1.copy() b1_0 = b1.copy() W2_0 = W2.copy() b2_0 = b2.copy() # 1. batch # cost = -16 losses_batch = [] errors_batch = [] for i in range(max_iter): for j in range(n_batches): Xbatch = X_train[j * batch_sz:(j * batch_sz + batch_sz), ] Ybatch = Y_train_ind[j * batch_sz:(j * batch_sz + batch_sz), ] pYbatch, Z = forward(Xbatch, W1, b1, W2, b2) # updates W2 -= lr * (derivative_w2(Z, Ybatch, pYbatch) + reg * W2) b2 -= lr * (derivative_b2(Ybatch, pYbatch) + reg * b2) W1 -= lr * (derivative_w1(Xbatch, Z, Ybatch, pYbatch, W2) + reg * W1) b1 -= lr * (derivative_b1(Z, Ybatch, pYbatch, W2) + reg * b1) if j % print_period == 0: pY, _ = forward(X_test, W1, b1, W2, b2) l = cost(pY, Y_test_ind) losses_batch.append(l) print(f'Cost at iteration i={i}, j={j} : {l}') e = error_rate(pY, Y_test) errors_batch.append(e) print("error_rate", e) pY, _ = forward(X_test, W1, b1, W2, b2) print(f"Final error rate: {error_rate(pY, Y_test)}") # 2. batch with momentum W1 = W1_0.copy() b1 = b1_0.copy() W2 = W2_0.copy() b2 = b2_0.copy() losses_momentum = [] errors_momentum = [] mu = 0.9 dW2 = 0 db2 = 0 dW1 = 0 db1 = 0 for i in range(max_iter): for j in range(n_batches): Xbatch = X_train[j * batch_sz:(j * batch_sz + batch_sz), ] Ybatch = Y_train_ind[j * batch_sz:(j * batch_sz + batch_sz), ] pYbatch, Z = forward(Xbatch, W1, b1, W2, b2) # gradients gW2 = derivative_w2(Z, Ybatch, pYbatch) + reg * W2 gb2 = derivative_b2(Ybatch, pYbatch) + reg * b2 gW1 = derivative_w1(Xbatch, Z, Ybatch, pYbatch, W2) + reg * W1 gb1 = derivative_b1(Z, Ybatch, pYbatch, W2) + reg * b1 # Update velocities dW2 = mu * dW2 - lr * gW2 db2 = mu * db2 - lr * gb2 dW1 = mu * dW1 - lr * gW1 db1 = mu * db1 - lr * gb1 # updates W2 += dW2 b2 += db2 W1 += dW1 b1 += db1 if j % print_period == 0: pY, _ = forward(X_test, W1, b1, W2, b2) l = cost(pY, Y_test_ind) losses_momentum.append(l) print(f'Cost at iteration i={i}, j={j} : {l}') e = error_rate(pY, Y_test) errors_momentum.append(e) print("error_rate", e) pY, _ = forward(X_test, W1, b1, W2, b2) print(f"Final error rate: {error_rate(pY, Y_test)}") # 3. 
batch with Nesterov momentum W1 = W1_0.copy() b1 = b1_0.copy() W2 = W2_0.copy() b2 = b2_0.copy() losses_nesterov = [] errors_nesterov = [] mu = 0.9 vW2 = 0 vb2 = 0 vW1 = 0 vb1 = 0 for i in range(max_iter): for j in range(n_batches): Xbatch = X_train[j * batch_sz:(j * batch_sz + batch_sz), ] Ybatch = Y_train_ind[j * batch_sz:(j * batch_sz + batch_sz), ] pYbatch, Z = forward(Xbatch, W1, b1, W2, b2) # gradients gW2 = derivative_w2(Z, Ybatch, pYbatch) + reg * W2 gb2 = derivative_b2(Ybatch, pYbatch) + reg * b2 gW1 = derivative_w1(Xbatch, Z, Ybatch, pYbatch, W2) + reg * W1 gb1 = derivative_b1(Z, Ybatch, pYbatch, W2) + reg * b1 # v update vW2 = mu * vW2 - lr * gW2 vb2 = mu * vb2 - lr * gb2 vW1 = mu * vW1 - lr * gW1 vb1 = mu * vb1 - lr * gb1 # param update W2 += mu * vW2 - lr * gW2 b2 += mu * vb2 - lr * gb2 W1 += mu * vW1 - lr * gW1 b1 += mu * vb1 - lr * gb1 if j % print_period == 0: pY, _ = forward(X_test, W1, b1, W2, b2) l = cost(pY, Y_test_ind) losses_nesterov.append(l) print(f'Cost at iteration i={i}, j={j} : {l}') e = error_rate(pY, Y_test) errors_nesterov.append(e) print("error_rate", e) pY, _ = forward(X_test, W1, b1, W2, b2) print(f"Final error rate: {error_rate(pY, Y_test)}") plt.plot(losses_batch, label='batch') plt.plot(losses_momentum, label='momentum') plt.plot(losses_nesterov, label='nesterov') plt.show()
def main(): # compare 3 scenarios: # 1. batch SGD # 2. batch SGD with momentum # 3. batch SGD with Nesterov momentum max_iter = 20 # make it 30 for sigmoid print_period = 50 Xtrain, Xtest, Ytrain, Ytest = get_normalized_data() lr = 0.00004 reg = 0.01 Ytrain_ind = y2indicator(Ytrain) Ytest_ind = y2indicator(Ytest) N, D = Xtrain.shape batch_sz = 500 n_batches = N // batch_sz M = 300 K = 10 W1 = np.random.randn(D, M) / np.sqrt(D) b1 = np.zeros(M) W2 = np.random.randn(M, K) / np.sqrt(M) b2 = np.zeros(K) # save initial weights W1_0 = W1.copy() b1_0 = b1.copy() W2_0 = W2.copy() b2_0 = b2.copy() # 1. batch losses_batch = [] errors_batch = [] for i in range(max_iter): for j in range(n_batches): Xbatch = Xtrain[j*batch_sz:(j*batch_sz + batch_sz),] Ybatch = Ytrain_ind[j*batch_sz:(j*batch_sz + batch_sz),] pYbatch, Z = forward(Xbatch, W1, b1, W2, b2) # print "first batch cost:", cost(pYbatch, Ybatch) # gradients gW2 = derivative_w2(Z, Ybatch, pYbatch) + reg*W2 gb2 = derivative_b2(Ybatch, pYbatch) + reg*b2 gW1 = derivative_w1(Xbatch, Z, Ybatch, pYbatch, W2) + reg*W1 gb1 = derivative_b1(Z, Ybatch, pYbatch, W2) + reg*b1 # updates W2 -= lr*gW2 b2 -= lr*gb2 W1 -= lr*gW1 b1 -= lr*gb1 if j % print_period == 0: pY, _ = forward(Xtest, W1, b1, W2, b2) l = cost(pY, Ytest_ind) losses_batch.append(l) print("Cost at iteration i=%d, j=%d: %.6f" % (i, j, l)) e = error_rate(pY, Ytest) errors_batch.append(e) print("Error rate:", e) pY, _ = forward(Xtest, W1, b1, W2, b2) print("Final error rate:", error_rate(pY, Ytest)) # 2. batch with momentum W1 = W1_0.copy() b1 = b1_0.copy() W2 = W2_0.copy() b2 = b2_0.copy() losses_momentum = [] errors_momentum = [] mu = 0.9 dW2 = 0 db2 = 0 dW1 = 0 db1 = 0 for i in range(max_iter): for j in range(n_batches): Xbatch = Xtrain[j*batch_sz:(j*batch_sz + batch_sz),] Ybatch = Ytrain_ind[j*batch_sz:(j*batch_sz + batch_sz),] pYbatch, Z = forward(Xbatch, W1, b1, W2, b2) # gradients gW2 = derivative_w2(Z, Ybatch, pYbatch) + reg*W2 gb2 = derivative_b2(Ybatch, pYbatch) + reg*b2 gW1 = derivative_w1(Xbatch, Z, Ybatch, pYbatch, W2) + reg*W1 gb1 = derivative_b1(Z, Ybatch, pYbatch, W2) + reg*b1 # update velocities dW2 = mu*dW2 - lr*gW2 db2 = mu*db2 - lr*gb2 dW1 = mu*dW1 - lr*gW1 db1 = mu*db1 - lr*gb1 # updates W2 += dW2 b2 += db2 W1 += dW1 b1 += db1 if j % print_period == 0: pY, _ = forward(Xtest, W1, b1, W2, b2) l = cost(pY, Ytest_ind) losses_momentum.append(l) print("Cost at iteration i=%d, j=%d: %.6f" % (i, j, l)) e = error_rate(pY, Ytest) errors_momentum.append(e) print("Error rate:", e) pY, _ = forward(Xtest, W1, b1, W2, b2) print("Final error rate:", error_rate(pY, Ytest)) # 3. 
batch with Nesterov momentum W1 = W1_0.copy() b1 = b1_0.copy() W2 = W2_0.copy() b2 = b2_0.copy() losses_nesterov = [] errors_nesterov = [] mu = 0.9 vW2 = 0 vb2 = 0 vW1 = 0 vb1 = 0 for i in range(max_iter): for j in range(n_batches): Xbatch = Xtrain[j*batch_sz:(j*batch_sz + batch_sz),] Ybatch = Ytrain_ind[j*batch_sz:(j*batch_sz + batch_sz),] pYbatch, Z = forward(Xbatch, W1, b1, W2, b2) # updates gW2 = derivative_w2(Z, Ybatch, pYbatch) + reg*W2 gb2 = derivative_b2(Ybatch, pYbatch) + reg*b2 gW1 = derivative_w1(Xbatch, Z, Ybatch, pYbatch, W2) + reg*W1 gb1 = derivative_b1(Z, Ybatch, pYbatch, W2) + reg*b1 # v update vW2 = mu*vW2 - lr*gW2 vb2 = mu*vb2 - lr*gb2 vW1 = mu*vW1 - lr*gW1 vb1 = mu*vb1 - lr*gb1 # param update W2 += mu*vW2 - lr*gW2 b2 += mu*vb2 - lr*gb2 W1 += mu*vW1 - lr*gW1 b1 += mu*vb1 - lr*gb1 if j % print_period == 0: pY, _ = forward(Xtest, W1, b1, W2, b2) l = cost(pY, Ytest_ind) losses_nesterov.append(l) print("Cost at iteration i=%d, j=%d: %.6f" % (i, j, l)) e = error_rate(pY, Ytest) errors_nesterov.append(e) print("Error rate:", e) pY, _ = forward(Xtest, W1, b1, W2, b2) print("Final error rate:", error_rate(pY, Ytest)) plt.plot(losses_batch, label="batch") plt.plot(losses_momentum, label="momentum") plt.plot(losses_nesterov, label="nesterov") plt.legend() plt.show()
def main(): max_iter = 20 # make it 30 for sigmoid print_period = 50 Xtrain, Xtest, Ytrain, Ytest = get_normalized_data() reg = 0.01 Ytrain_ind = y2indicator(Ytrain) Ytest_ind = y2indicator(Ytest) N, D = Xtrain.shape batch_sz = 500 n_batches = N // batch_sz M = 300 K = 10 W1 = np.random.randn(D, M) / np.sqrt(D) b1 = np.zeros(M) W2 = np.random.randn(M, K) / np.sqrt(M) b2 = np.zeros(K) # save initial weights W1_0 = W1.copy() b1_0 = b1.copy() W2_0 = W2.copy() b2_0 = b2.copy() # 1st moment mW1 = 0 mb1 = 0 mW2 = 0 mb2 = 0 # 2nd moment vW1 = 0 vb1 = 0 vW2 = 0 vb2 = 0 # hyperparameters lr0 = 0.001 beta1 = 0.9 beta2 = 0.999 eps = 1e-8 losses_adam = [] errors_adam = [] t = 1 for i in range(max_iter): for j in range(n_batches): Xbatch = Xtrain[j * batch_sz:(j * batch_sz + batch_sz), ] Ybatch = Ytrain_ind[j * batch_sz:(j * batch_sz + batch_sz), ] pYbatch, Z = forward(Xbatch, W1, b1, W2, b2) gW2 = derivative_w2(Z, Ybatch, pYbatch) + reg * W2 gb2 = derivative_b2(Ybatch, pYbatch) + reg * b2 gW1 = derivative_w1(Xbatch, Z, Ybatch, pYbatch, W2) + reg * W1 gb1 = derivative_b1(Z, Ybatch, pYbatch, W2) + reg * b1 # new m mW1 = beta1 * mW1 + (1 - beta1) * gW1 mb1 = beta1 * mb1 + (1 - beta1) * gb1 mW2 = beta1 * mW2 + (1 - beta1) * gW2 mb2 = beta1 * mb2 + (1 - beta1) * gb2 # new v vW1 = beta2 * vW1 + (1 - beta2) * gW1 * gW1 vb1 = beta2 * vb1 + (1 - beta2) * gb1 * gb1 vW2 = beta2 * vW2 + (1 - beta2) * gW2 * gW2 vb2 = beta2 * vb2 + (1 - beta2) * gb2 * gb2 # bias correction correction1 = 1 - beta1**t hat_mW1 = mW1 / correction1 hat_mb1 = mb1 / correction1 hat_mW2 = mW2 / correction1 hat_mb2 = mb2 / correction1 correction2 = 1 - beta2**t hat_vW1 = vW1 / correction2 hat_vb1 = vb1 / correction2 hat_vW2 = vW2 / correction2 hat_vb2 = vb2 / correction2 # update t t += 1 # apply updates to the params W1 = W1 - lr0 * hat_mW1 / np.sqrt(hat_vW1 + eps) b1 = b1 - lr0 * hat_mb1 / np.sqrt(hat_vb1 + eps) W2 = W2 - lr0 * hat_mW2 / np.sqrt(hat_vW2 + eps) b2 = b2 - lr0 * hat_mb2 / np.sqrt(hat_vb2 + eps) if j % print_period == 0: pY, _ = forward(Xtest, W1, b1, W2, b2) l = cost(pY, Ytest_ind) losses_adam.append(l) print("cost at iter i %d, j %d: %.6f" % (i, j, l)) e = error_rate(pY, Ytest) errors_adam.append(e) pY, _ = forward(Xtest, W1, b1, W2, b2) print("final error rate: ", error_rate(pY, Ytest)) print() plt.plot(losses_adam, label='adam') pY, _ = forward(Xtest, W1, b1, W2, b2) plt.legend() plt.show()
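The same Adam update, factored into a standalone helper for clarity; a sketch with placeholder names, where t must start at 1 and be shared across all parameters, exactly as in the loop above.

import numpy as np

def adam_step(param, g, m, v, t, lr0=0.001, beta1=0.9, beta2=0.999, eps=1e-8):
    # exponentially decayed first and second moments of the gradient
    m = beta1 * m + (1 - beta1) * g
    v = beta2 * v + (1 - beta2) * g * g
    # bias correction compensates for initializing m and v at zero
    m_hat = m / (1 - beta1 ** t)
    v_hat = v / (1 - beta2 ** t)
    param = param - lr0 * m_hat / np.sqrt(v_hat + eps)
    return param, m, v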
def main(): # compare 5 scenarios: # 1. batch SGD with constant learning rate # 2. batch SGD with RMSProp # 3. batch SGD with AdaGrad # 4. batch SGD with exponential decay np.random.seed(2) max_iter = 20 print_period = 10 X, Y = get_normalized_data() lr = 0.00004 reg = 0.01 Xtrain, Ytrain = X[:-1000, :], Y[:-1000] Xtest, Ytest = X[-1000:, :], Y[-1000:] Ytrain_ind = y2indicator(Ytrain) Ytest_ind = y2indicator(Ytest) N, D = Xtrain.shape M = 300 # number of hidden layer units K = len(set(Ytrain)) batch_size = 500 n_batches = N // batch_size # randomly initialize weights: W1 = np.random.randn(D, M) / np.sqrt(D) b1 = np.zeros(M) W2 = np.random.randn(M, K) / np.sqrt(M) b2 = np.zeros(K) # save initial weights: W1_0 = W1.copy() b1_0 = b1.copy() W2_0 = W2.copy() b2_0 = b2.copy() # 1. batch SGD with constant learning rate: LL_batch = [] CR_batch = [] t0 = datetime.now() print('\nperforming batch SGD with constant learning rate...') for i in range(max_iter): for j in range(n_batches): Xbatch = Xtrain[j * batch_size:(j + 1) * batch_size, :] Ybatch = Ytrain_ind[j * batch_size:(j + 1) * batch_size, :] p_Ybatch, Z = forward(Xbatch, W1, b1, W2, b2) #print(Z.shape, p_Ybatch.shape, Ybatch.shape) #print('First batch cost:', cost(p_Ybatch, Ybatch)) # updates: W2 -= lr * (derivative_W2(Z, Ybatch, p_Ybatch) + reg * W2) b2 -= lr * (derivative_b2(Ybatch, p_Ybatch) + reg * b2) W1 -= lr * (derivative_W1(Xbatch, Z, Ybatch, p_Ybatch, W2) + reg * W1) b1 -= lr * (derivative_b1(Z, Ybatch, p_Ybatch, W2) + reg * b1) if j % print_period == 0: pY, _ = forward(Xtest, W1, b1, W2, b2) #print('pY:', pY) ll = cost(pY, Ytest_ind) LL_batch.append(ll) print('\ni: %d, j: %d, cost: %.6f' % (i, j, ll)) error = error_rate(pY, Ytest) CR_batch.append(error) print('error rate:', error) dt1 = datetime.now() - t0 pY, _ = forward(Xtest, W1, b1, W2, b2) final_err1 = error_rate(pY, Ytest) # plot the cost #plt.plot(LL_batch) #plt.title('Cost for batch GD with const lr') #plt.show() # 2. 
batch GD with RMSProp: W1 = W1_0.copy() b1 = b1_0.copy() W2 = W2_0.copy() b2 = b2_0.copy() LL_RMSProp = [] CR_RMSProp = [] lr0 = 0.001 # initial learning rate cache_W2 = 1 cache_b2 = 1 cache_W1 = 1 cache_b1 = 1 decay = 0.999 eps = 10e-10 t0 = datetime.now() print('\nperforming batch SGD with RMSProp...') for i in range(max_iter): for j in range(n_batches): Xbatch = Xtrain[j * batch_size:(j + 1) * batch_size, :] Ybatch = Ytrain_ind[j * batch_size:(j + 1) * batch_size, :] p_Ybatch, Z = forward(Xbatch, W1, b1, W2, b2) #print(Z.shape, p_Ybatch.shape, Ybatch.shape) #print('First batch cost:', cost(p_Ybatch, Ybatch)) # updates: gW2 = (derivative_W2(Z, Ybatch, p_Ybatch) + reg * W2) cache_W2 = decay * cache_W2 + (1 - decay) * gW2 * gW2 W2 -= lr0 * gW2 / np.sqrt(cache_W2 + eps) gb2 = (derivative_b2(Ybatch, p_Ybatch) + reg * b2) cache_b2 = decay * cache_b2 + (1 - decay) * gb2 * gb2 b2 -= lr0 * gb2 / np.sqrt(cache_b2 + eps) gW1 = (derivative_W1(Xbatch, Z, Ybatch, p_Ybatch, W2) + reg * W1) cache_W1 = decay * cache_W1 + (1 - decay) * gW1 * gW1 W1 -= lr0 * gW1 / np.sqrt(cache_W1 + eps) gb1 = (derivative_b1(Z, Ybatch, p_Ybatch, W2) + reg * b1) cache_b1 = decay * cache_b1 + (1 - decay) * gb1 * gb1 b1 -= lr0 * gb1 / np.sqrt(cache_b1 + eps) if j % print_period == 0: pY, _ = forward(Xtest, W1, b1, W2, b2) #print('pY:', pY) ll = cost(pY, Ytest_ind) LL_RMSProp.append(ll) print('\ni: %d, j: %d, cost: %.6f' % (i, j, ll)) error = error_rate(pY, Ytest) CR_RMSProp.append(error) print('error rate:', error) dt2 = datetime.now() - t0 pY, _ = forward(Xtest, W1, b1, W2, b2) final_err2 = error_rate(pY, Ytest) # plot the cost #plt.plot(LL_RMSProp) #plt.title('Cost for batch SGD with RMSProp') #plt.show() # 3. batch SGD with AdaGrad: W1 = W1_0.copy() b1 = b1_0.copy() W2 = W2_0.copy() b2 = b2_0.copy() LL_AdaGrad = [] CR_AdaGrad = [] lr0 = 0.01 # initial learning rate cache_W2 = 1 cache_b2 = 1 cache_W1 = 1 cache_b1 = 1 eps = 10e-10 t0 = datetime.now() print('\nperforming batch SGD with AdaGrad...') for i in range(max_iter): for j in range(n_batches): Xbatch = Xtrain[j * batch_size:(j + 1) * batch_size, :] Ybatch = Ytrain_ind[j * batch_size:(j + 1) * batch_size, :] p_Ybatch, Z = forward(Xbatch, W1, b1, W2, b2) #print(Z.shape, p_Ybatch.shape, Ybatch.shape) #print('First batch cost:', cost(p_Ybatch, Ybatch)) # updates: gW2 = (derivative_W2(Z, Ybatch, p_Ybatch) + reg * W2) cache_W2 = cache_W2 + gW2 * gW2 W2 -= lr0 * gW2 / np.sqrt(cache_W2 + eps) gb2 = (derivative_b2(Ybatch, p_Ybatch) + reg * b2) cache_b2 = cache_b2 + gb2 * gb2 b2 -= lr0 * gb2 / np.sqrt(cache_b2 + eps) gW1 = (derivative_W1(Xbatch, Z, Ybatch, p_Ybatch, W2) + reg * W1) cache_W1 = cache_W1 + gW1 * gW1 W1 -= lr0 * gW1 / np.sqrt(cache_W1 + eps) gb1 = (derivative_b1(Z, Ybatch, p_Ybatch, W2) + reg * b1) cache_b1 = cache_b1 + gb1 * gb1 b1 -= lr0 * gb1 / np.sqrt(cache_b1 + eps) if j % print_period == 0: pY, _ = forward(Xtest, W1, b1, W2, b2) #print('pY:', pY) ll = cost(pY, Ytest_ind) LL_AdaGrad.append(ll) print('\ni: %d, j: %d, cost: %.6f' % (i, j, ll)) error = error_rate(pY, Ytest) CR_AdaGrad.append(error) print('error rate:', error) dt3 = datetime.now() - t0 pY, _ = forward(Xtest, W1, b1, W2, b2) final_err3 = error_rate(pY, Ytest) # plot the cost #plt.plot(LL_AdaGrad) #plt.title('Cost for batch SGD with AdaGrad') #plt.show() ''' # 4. 
batch SGD with exponential decay: W1 = W1_0.copy() b1 = b1_0.copy() W2 = W2_0.copy() b2 = b2_0.copy() LL_exp = [] CR_exp = [] lr0 = 0.0004 # initial learning rate k = 1e-7 t = 0 # initial log lr = lr0 t0 = datetime.now() print('\nperforming batch SGD with lr exponential decay...') for i in range(max_iter): for j in range(n_batches): Xbatch = Xtrain[j*batch_size:(j+1)*batch_size, :] Ybatch = Ytrain_ind[j*batch_size:(j+1)*batch_size, :] p_Ybatch, Z = forward(Xbatch, W1, b1, W2, b2) #print(Z.shape, p_Ybatch.shape, Ybatch.shape) #print('First batch cost:', cost(p_Ybatch, Ybatch)) # updates: gW2 = (derivative_W2(Z, Ybatch, p_Ybatch) + reg*W2) W2 -= lr*gW2 gb2 = (derivative_b2(Ybatch, p_Ybatch) + reg*b2) b2 -= lr*gb2 gW1 = (derivative_W1(Xbatch, Z, Ybatch, p_Ybatch, W2) + reg*W1) W1 -= lr*gW1 gb1 = (derivative_b1(Z, Ybatch, p_Ybatch, W2) + reg*b1) b1 -= lr*gb1 # decrease the learning rate lr = lr0 * np.exp(-k*t) t += 1 if j % print_period == 0: print('current learning rate:', lr) pY, _ = forward(Xtest, W1, b1, W2, b2) #print('pY:', pY) ll = cost(pY, Ytest_ind) LL_exp.append(ll) print('\ni: %d, j: %d, cost: %.6f' % (i, j, ll)) error = error_rate(pY, Ytest) CR_exp.append(error) print('error rate:', error) dt4 = datetime.now() - t0 pY, _ = forward(Xtest, W1, b1, W2, b2) print('\nFinal error rate:', error_rate(pY, Ytest)) print('Elapsed time for batch SGD with lr exponential decay:', dt4) # plot the cost #plt.plot(LL_exp) #plt.title('Cost for batch SGD with lr exponential decay') #plt.show() ''' print('\nBatch SGD with constant learning rate:') print('final error rate:', final_err1) print('elapsed time:', dt1) print('\nBatch SGD with RMSProp:') print('final error rate:', final_err2) print('elapsed time:', dt2) print('\nBatch SGD with AdaGrad:') print('final error rate:', final_err3) print('elapsed time:', dt3) # plot the costs together: plt.plot(LL_batch, label='const_lr') plt.plot(LL_RMSProp, label='RMSProp') plt.plot(LL_AdaGrad, label='AdaGrad') #plt.plot(LL_exp, label='lr_exp_decay') plt.legend() plt.show()
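The only difference between the AdaGrad and RMSProp branches above is how the cache accumulates: AdaGrad sums squared gradients forever, so its effective learning rate only shrinks, while RMSProp keeps a decaying average that forgets old gradients. A sketch of both caches side by side, plus the exponential learning-rate schedule from the commented-out fourth scenario (placeholder names):

import numpy as np

def adagrad_cache(cache, g):
    return cache + g * g                         # monotonically growing sum

def rmsprop_cache(cache, g, decay=0.999):
    return decay * cache + (1 - decay) * g * g   # leaky average, forgets old gradients

def exp_decay_lr(lr0, k, t):
    return lr0 * np.exp(-k * t)                  # lr shrinks smoothly with step count t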
def main(): dobatch = False dobatchwithmomentum = True dobatchwithnesterovmomentum = True max_iter = 20 # make it 30 for sigmoid print_period = 50 Xtrain, Xtest, Ytrain, Ytest = get_normalized_data() lr = 0.00004 reg = 0.01 Ytrain_ind = y2indicator(Ytrain) Ytest_ind = y2indicator(Ytest) N, D = Xtrain.shape batch_sz = 500 n_batches = N // batch_sz M = 300 K = 10 W1 = np.random.randn(D, M) / np.sqrt(D) b1 = np.zeros(M) W2 = np.random.randn(M, K) / np.sqrt(M) b2 = np.zeros(K) # save initial weights W1_0 = W1.copy() b1_0 = b1.copy() W2_0 = W2.copy() b2_0 = b2.copy() # batch #cost = -16 if dobatch: losses_batch = [] errors_batch = [] for i in range(max_iter): for j in range(n_batches): Xbatch = Xtrain[j * batch_sz:(j * batch_sz + batch_sz), ] Ybatch = Ytrain_ind[j * batch_sz:(j * batch_sz + batch_sz), ] pYbatch, Z = forward(Xbatch, W1, b1, W2, b2) W2 -= lr * (derivative_w2(Z, Ybatch, pYbatch) + reg * W2) b2 -= lr * (derivative_b2(Ybatch, pYbatch) + reg * b2) W1 -= lr * (derivative_w1(Xbatch, Z, Ybatch, pYbatch, W2) + reg * W1) b1 -= lr * (derivative_b1(Z, Ybatch, pYbatch, W2) + reg * b1) if j % print_period == 0: pY, _ = forward(Xtest, W1, b1, W2, b2) l = cost(pY, Ytest_ind) losses_batch.append(l) print("cost at iter i %d, j %d: %.6f" % (i, j, l)) e = error_rate(pY, Ytest) errors_batch.append(e) pY, _ = forward(Xtest, W1, b1, W2, b2) print("final error rate: ", error_rate(pY, Ytest)) print() # batch with momentum if dobatchwithmomentum: print("momentum") W1 = W1_0.copy b1 = b1_0.copy W2 = W2_0.copy b2 = b2_0.copy losses_momentum = [] errors_momentum = [] mu = 0.9 dW2 = 0 db2 = 0 dW1 = 0 db1 = 0 for i in range(max_iter): for j in range(n_batches): Xbatch = Xtrain[j * batch_sz:(j * batch_sz + batch_sz), ] Ybatch = Ytrain_ind[j * batch_sz:(j * batch_sz + batch_sz), ] pYbatch, Z = forward(Xbatch, W1, b1, W2, b2) gW2 = derivative_w2(Z, Ybatch, pYbatch) + reg * W2 gb2 = derivative_b2(Ybatch, pYbatch) + reg * b2 gW1 = derivative_w1(Xbatch, Z, Ybatch, pYbatch, W2) + reg * W1 gb1 = derivative_b1(Z, Ybatch, pYbatch, W2) + reg * b1 dW2 = mu * dW2 - lr * gW2 db2 = mu * db2 - lr * gb2 dW1 = mu * dW1 - lr * gW1 db1 = mu * db1 - lr * gb1 W2 += dW2 b2 += db2 W1 += dW1 b1 += db1 if j % print_period == 0: pY, _ = forward(Xtest, W1, b1, W2, b2) l = cost(pY, Ytest_ind) losses_momentum.append(l) print("cost at iter i %d, j %d: %.6f" % (i, j, l)) e = error_rate(pY, Ytest) errors_momentum.append(e) pY, _ = forward(Xtest, W1, b1, W2, b2) print("final error rate: ", error_rate(pY, Ytest)) print() # Nesterov momentum if dobatchwithnesterovmomentum: W1 = W1_0.copy b1 = b1_0.copy W2 = W2_0.copy b2 = b2_0.copy losses_nesterov = [] errors_nesterov = [] mu = 0.9 vW2 = 0 vb2 = 0 vW1 = 0 vb1 = 0 for i in range(max_iter): for j in range(n_batches): Xbatch = Xtrain[j * batch_sz:(j * batch_sz + batch_sz), ] Ybatch = Ytrain_ind[j * batch_sz:(j * batch_sz + batch_sz), ] pYbatch, Z = forward(Xbatch, W1, b1, W2, b2) gW2 = derivative_w2(Z, Ybatch, pYbatch) + reg * W2 gb2 = derivative_b2(Ybatch, pYbatch) + reg * b2 gW1 = derivative_w1(Xbatch, Z, Ybatch, pYbatch, W2) + reg * W1 gb1 = derivative_b1(Z, Ybatch, pYbatch, W2) + reg * b1 vW2 = mu * vW2 - lr * gW2 vb2 = mu * vb2 - lr * gb2 vW1 = mu * vW1 - lr * gW1 vb1 = mu * vb1 - lr * gb1 W2 += mu * vW2 - lr * gW2 b2 += mu * vb2 - lr * gb2 W1 += mu * vW1 - lr * gW1 b1 += mu * vb1 - lr * gb1 W2 += dW2 b2 += db2 W1 += dW1 b1 += db1 if j % print_period == 0: pY, _ = forward(Xtest, W1, b1, W2, b2) l = cost(pY, Ytest_ind) losses_nesterov.append(l) print("cost at iter i %d, j %d: %.6f" % (i, j, 
l)) e = error_rate(pY, Ytest) errors_nesterov.append(e) pY, _ = forward(Xtest, W1, b1, W2, b2) print("final error rate: ", error_rate(pY, Ytest)) print() plt.plot(losses_batch, label="batch") plt.plot(losses_momentum, label="momentum") plt.plot(losses_nesterov, label="nesterov") plt.legend() plt.show()
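Since the script above only runs the scenarios whose flags are set to True, here is a sketch of collecting the curves in a dict and plotting only what was actually produced, so a disabled scenario (e.g. dobatch = False) does not leave an undefined loss list at plot time; the dict name curves is an assumption for illustration.

import matplotlib.pyplot as plt

curves = {}
# inside each enabled scenario, store its loss history, e.g.:
# curves['batch'] = losses_batch
# curves['momentum'] = losses_momentum
# curves['nesterov'] = losses_nesterov

if curves:
    for name, losses in curves.items():
        plt.plot(losses, label=name)
    plt.legend()
    plt.show()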
def main(): max_iter = 20 print_period = 10 X_train, X_test, Y_train, Y_test = get_normalized_data() lr = 0.00004 reg = 0.01 Y_train_ind = y2indicator(Y_train) Y_test_ind = y2indicator(Y_test) N, D = X_train.shape batch_sz = 500 n_batches = N // batch_sz M = 300 K = 10 W1 = np.random.randn(D, M) / 28 b1 = np.zeros(M) W2 = np.random.randn(M, K) / np.sqrt(K) b2 = np.zeros(K) # copy weights W1_0 = W1.copy() b1_0 = b1.copy() W2_0 = W2.copy() b2_0 = b2.copy() # 1. Constant Learning rate losses_batch = [] errors_batch = [] for i in range(max_iter): for j in range(n_batches): Xbatch = X_train[j * batch_sz:(j * batch_sz + batch_sz), ] Ybatch = Y_train_ind[j * batch_sz:(j * batch_sz + batch_sz), ] pYbatch, Z = forward(Xbatch, W1, b1, W2, b2) # updates W2 -= lr * (derivative_w2(Z, Ybatch, pYbatch) + reg * W2) b2 -= lr * (derivative_b2(Ybatch, pYbatch) + reg * b2) W1 -= lr * (derivative_w1(Xbatch, Z, Ybatch, pYbatch, W2) + reg * W1) b1 -= lr * (derivative_b1(Z, Ybatch, pYbatch, W2) + reg * b1) if j % print_period == 0: pY, _ = forward(X_test, W1, b1, W2, b2) l = cost(pY, Y_test_ind) losses_batch.append(l) print(f'Cost at iteration i={i}, j={j} : {l}') e = error_rate(pY, Y_test) errors_batch.append(e) print("error_rate", e) pY, _ = forward(X_test, W1, b1, W2, b2) batch_error = error_rate(pY, Y_test) print(f"Final batch error rate: {batch_error}") # 2. RMSProp W1 = W1_0.copy() b1 = b1_0.copy() W2 = W2_0.copy() b2 = b2_0.copy() losses_RMSP = [] errors_RMSP = [] lr0 = 0.001 cache_W1 = 1 cache_b1 = 1 cache_W2 = 1 cache_b2 = 1 decay = 0.999 epsilon = 1e-10 for i in range(max_iter): for j in range(n_batches): Xbatch = X_train[j * batch_sz:(j * batch_sz + batch_sz), ] Ybatch = Y_train_ind[j * batch_sz:(j * batch_sz + batch_sz), ] pYbatch, Z = forward(Xbatch, W1, b1, W2, b2) gW2 = derivative_w2(Z, Ybatch, pYbatch) + reg * W2 cache_W2 = decay * cache_W2 + (1 - decay) * gW2 * gW2 W2 -= lr0 * gW2 / (np.sqrt(cache_W2) + epsilon) gb2 = derivative_b2(Ybatch, pYbatch) + reg * b2 cache_b2 = decay * cache_b2 + (1 - decay) * gb2 * gb2 b2 -= lr0 * gb2 / (np.sqrt(cache_b2) + epsilon) gW1 = derivative_w1(Xbatch, Z, Ybatch, pYbatch, W2) + reg * W1 cache_W1 = decay * cache_W1 + (1 - decay) * gW1 * gW1 W1 -= lr0 * gW1 / (np.sqrt(cache_W1) + epsilon) gb1 = derivative_b1(Z, Ybatch, pYbatch, W2) + reg * b1 cache_b1 = decay * cache_b1 + (1 - decay) * gb1 * gb1 b1 -= lr0 * gb1 / (np.sqrt(cache_b1) + epsilon) if j % print_period == 0: pY, _ = forward(X_test, W1, b1, W2, b2) l = cost(pY, Y_test_ind) losses_RMSP.append(l) print(f'Cost at iteration i={i}, j={j} : {l}') e = error_rate(pY, Y_test) errors_RMSP.append(e) print("error_rate", e) pY, _ = forward(X_test, W1, b1, W2, b2) print(f"Final RMSProp error rate: {error_rate(pY, Y_test)}") print(f"Final batch error rate: {batch_error}") plt.plot(losses_batch, label='batch cost') plt.plot(losses_RMSP, label='RMSProp cost') plt.legend() plt.show()
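A small point of divergence between the scripts: some divide by np.sqrt(cache) + eps while others divide by np.sqrt(cache + eps). For the gradient magnitudes involved here the two are nearly identical; a quick check with illustrative values:

import numpy as np

cache = np.array([1e-4, 1.0, 25.0])
eps = 1e-10
print(1.0 / (np.sqrt(cache) + eps))  # eps added outside the square root
print(1.0 / np.sqrt(cache + eps))    # eps added inside the square root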
def main(): max_iter = 10 print_period = 10 X, Y = get_normalized_data() reg = 0.01 Xtrain = X[:-1000,] Ytrain = Y[:-1000] Xtest = X[-1000:,] Ytest = Y[-1000:] Ytrain_ind = y2indicator(Ytrain) Ytest_ind = y2indicator(Ytest) N, D = Xtrain.shape batch_sz = 500 n_batches = N // batch_sz M = 300 K = 10 W1_0 = np.random.randn(D, M) / np.sqrt(D) b1_0 = np.zeros(M) W2_0 = np.random.randn(M, K) / np.sqrt(M) b2_0 = np.zeros(K) W1 = W1_0.copy() b1 = b1_0.copy() W2 = W2_0.copy() b2 = b2_0.copy() # 1st moment mW1 = 0 mb1 = 0 mW2 = 0 mb2 = 0 # 2nd moment vW1 = 0 vb1 = 0 vW2 = 0 vb2 = 0 # hyperparams lr0 = 0.001 beta1 = 0.9 beta2 = 0.999 eps = 1e-8 # 1. Adam loss_adam = [] err_adam = [] t = 1 for i in range(max_iter): for j in range(n_batches): Xbatch = Xtrain[j*batch_sz:(j*batch_sz + batch_sz),] Ybatch = Ytrain_ind[j*batch_sz:(j*batch_sz + batch_sz),] pYbatch, Z = forward(Xbatch, W1, b1, W2, b2) # updates # gradients gW2 = derivative_w2(Z, Ybatch, pYbatch) + reg*W2 gb2 = derivative_b2(Ybatch, pYbatch) + reg*b2 gW1 = derivative_w1(Xbatch, Z, Ybatch, pYbatch, W2) + reg*W1 gb1 = derivative_b1(Z, Ybatch, pYbatch, W2) + reg*b1 # new m mW1 = beta1 * mW1 + (1 - beta1) * gW1 mb1 = beta1 * mb1 + (1 - beta1) * gb1 mW2 = beta1 * mW2 + (1 - beta1) * gW2 mb2 = beta1 * mb2 + (1 - beta1) * gb2 # new v vW1 = beta2 * vW1 + (1 - beta2) * gW1 * gW1 vb1 = beta2 * vb1 + (1 - beta2) * gb1 * gb1 vW2 = beta2 * vW2 + (1 - beta2) * gW2 * gW2 vb2 = beta2 * vb2 + (1 - beta2) * gb2 * gb2 # bias correction correction1 = 1 - beta1 ** t hat_mW1 = mW1 / correction1 hat_mb1 = mb1 / correction1 hat_mW2 = mW2 / correction1 hat_mb2 = mb2 / correction1 correction2 = 1 - beta2 ** t hat_vW1 = vW1 / correction2 hat_vb1 = vb1 / correction2 hat_vW2 = vW2 / correction2 hat_vb2 = vb2 / correction2 # update t t += 1 # apply updates to the params W1 = W1 - lr0 * hat_mW1 / np.sqrt(hat_vW1 + eps) b1 = b1 - lr0 * hat_mb1 / np.sqrt(hat_vb1 + eps) W2 = W2 - lr0 * hat_mW2 / np.sqrt(hat_vW2 + eps) b2 = b2 - lr0 * hat_mb2 / np.sqrt(hat_vb2 + eps) if j % print_period == 0: pY, _ = forward(Xtest, W1, b1, W2, b2) l = cost(pY, Ytest_ind) loss_adam.append(l) print("Cost at iteration i=%d, j=%d: %.6f" % (i, j, l)) err = error_rate(pY, Ytest) err_adam.append(err) print("Error rate:", err) pY, _ = forward(Xtest, W1, b1, W2, b2) print("Final error rate:", error_rate(pY, Ytest)) # 2. 
RMSprop with momentum W1 = W1_0.copy() b1 = b1_0.copy() W2 = W2_0.copy() b2 = b2_0.copy() loss_rms = [] err_rms = [] # comparable hyperparameters for fair comparison lr0 = 0.001 mu = 0.9 decay_rate = 0.999 eps = 1e-8 # rmsprop cache cache_W2 = 1 cache_b2 = 1 cache_W1 = 1 cache_b1 = 1 # momentum dW1 = 0 db1 = 0 dW2 = 0 db2 = 0 for i in range(max_iter): for j in range(n_batches): Xbatch = Xtrain[j*batch_sz:(j*batch_sz + batch_sz),] Ybatch = Ytrain_ind[j*batch_sz:(j*batch_sz + batch_sz),] pYbatch, Z = forward(Xbatch, W1, b1, W2, b2) # updates gW2 = derivative_w2(Z, Ybatch, pYbatch) + reg*W2 cache_W2 = decay_rate*cache_W2 + (1 - decay_rate)*gW2*gW2 dW2 = mu * dW2 + (1 - mu) * lr0 * gW2 / (np.sqrt(cache_W2) + eps) W2 -= dW2 gb2 = derivative_b2(Ybatch, pYbatch) + reg*b2 cache_b2 = decay_rate*cache_b2 + (1 - decay_rate)*gb2*gb2 db2 = mu * db2 + (1 - mu) * lr0 * gb2 / (np.sqrt(cache_b2) + eps) b2 -= db2 gW1 = derivative_w1(Xbatch, Z, Ybatch, pYbatch, W2) + reg*W1 cache_W1 = decay_rate*cache_W1 + (1 - decay_rate)*gW1*gW1 dW1 = mu * dW1 + (1 - mu) * lr0 * gW1 / (np.sqrt(cache_W1) + eps) W1 -= dW1 gb1 = derivative_b1(Z, Ybatch, pYbatch, W2) + reg*b1 cache_b1 = decay_rate*cache_b1 + (1 - decay_rate)*gb1*gb1 db1 = mu * db1 + (1 - mu) * lr0 * gb1 / (np.sqrt(cache_b1) + eps) b1 -= db1 if j % print_period == 0: pY, _ = forward(Xtest, W1, b1, W2, b2) l = cost(pY, Ytest_ind) loss_rms.append(l) print("Cost at iteration i=%d, j=%d: %.6f" % (i, j, l)) err = error_rate(pY, Ytest) err_rms.append(err) print("Error rate:", err) pY, _ = forward(Xtest, W1, b1, W2, b2) print("Final error rate:", error_rate(pY, Ytest)) plt.plot(loss_adam, label='adam') plt.plot(loss_rms, label='rmsprop') plt.legend() plt.show()
def main(): max_iter = 10 print_period = 10 Xtrain, Xtest, Ytrain, Ytest = get_normalized_data() reg = 0.01 Ytrain_ind = y2indicator(Ytrain) Ytest_ind = y2indicator(Ytest) N, D = Xtrain.shape batch_sz = 500 n_batches = N // batch_sz M = 300 K = 10 W1_0 = np.random.randn(D, M) / np.sqrt(D) b1_0 = np.zeros(M) W2_0 = np.random.randn(M, K) / np.sqrt(M) b2_0 = np.zeros(K) W1 = W1_0.copy() b1 = b1_0.copy() W2 = W2_0.copy() b2 = b2_0.copy() #1st moment mW1 = 0 mb1 = 0 mW2 = 0 mb2 = 0 #2nd moment vW1 = 0 vb1 = 0 vW2 = 0 vb2 = 0 #hyperparameters lr0 = 0.001 beta1 = 0.9 beta2 = 0.999 eps = 1e-8 #Adam loss_adam = [] err_adam = [] t = 1 for i in range(max_iter): for j in range(n_batches): Xbatch = Xtrain[j * batch_sz:(j * batch_sz + batch_sz), ] Ybatch = Ytrain_ind[j * batch_sz:(j * batch_sz + batch_sz), ] pYbatch, Z = forward(Xbatch, W1, b1, W2, b2) #gradient gW2 = derivative_w2(Z, Ybatch, pYbatch) + reg * W2 gb2 = derivative_b2(Ybatch, pYbatch) + reg * b2 gW1 = derivative_w1(Xbatch, Z, Ybatch, pYbatch, W2) + reg * W1 gb1 = derivative_b1(Z, Ybatch, pYbatch, W2) + reg * b1 #new m mW1 = beta1 * mW1 + (1 - beta1) * gW1 mb1 = beta1 * mb1 + (1 - beta1) * gb1 mW2 = beta1 * mW2 + (1 - beta1) * gW2 mb2 = beta1 * mb2 + (1 - beta1) * gb2 #new vW1 = beta2 * vW1 + (1 - beta2) * gW1 * gW1 vb1 = beta2 * vb1 + (1 - beta2) * gb1 * gb1 vW2 = beta2 * vW2 + (1 - beta2) * gW2 * gW2 vb2 = beta2 * vb2 + (1 - beta2) * gb2 * gb2 #bias correction correction1 = 1 - beta1**t hat_mW1 = mW1 / correction1 hat_mb1 = mb1 / correction1 hat_mW2 = mW2 / correction1 hat_mb2 = mb2 / correction1 correction2 = 1 - beta2**t hat_vW1 = vW1 / correction2 hat_vb1 = vb1 / correction2 hat_vW2 = vW2 / correction2 hat_vb2 = vb2 / correction2 #update t t += 1 W1 = W1 - lr0 * hat_mW1 / np.sqrt(hat_vW1 + eps) b1 = b1 - lr0 * hat_mb1 / np.sqrt(hat_vb1 + eps) W2 = W2 - lr0 * hat_mW2 / np.sqrt(hat_vW2 + eps) b2 = b2 - lr0 * hat_mb2 / np.sqrt(hat_vb2 + eps) if j % print_period == 0: pY, _ = forward(Xtest, W1, b1, W2, b2) l = cost(pY, Ytest_ind) loss_adam.append(l) print("Cost at iteration i=%d, j=%d: %.6f" % (i, j, l)) err = error_rate(pY, Ytest) err_adam.append(err) print("Error rate:", err) pY, _ = forward(Xtest, W1, b1, W2, b2) print("Final error rate:", error_rate(pY, Ytest)) #RMSprop with momentum W1 = W1_0.copy() b1 = b1_0.copy() W2 = W2_0.copy() b2 = b2_0.copy() loss_rms = [] err_rms = [] lr0 = 0.001 mu = 0.9 decay_rate = 0.999 eps = 1e-8 cache_W2 = 1 cache_b2 = 1 cache_W1 = 1 cache_b1 = 1 dW1 = 0 db1 = 0 dW2 = 0 db2 = 0 for i in range(max_iter): for j in range(n_batches): Xbatch = Xtrain[j * batch_sz:(j * batch_sz + batch_sz), ] Ybatch = Ytrain_ind[j * batch_sz:(j * batch_sz + batch_sz), ] pYbatch, Z = forward(Xbatch, W1, b1, W2, b2) #updates gW2 = derivative_w2(Z, Ybatch, pYbatch) + reg * W2 cache_W2 = decay_rate * cache_W2 + (1 - decay_rate) * gW2 * gW2 dW2 = mu * dW2 + (1 - mu) * lr0 * gW2 / (np.sqrt(cache_W2) + eps) W2 -= dW2 gb2 = derivative_b2(Ybatch, pYbatch) + reg * b2 cache_b2 = decay_rate * cache_b2 + (1 - decay_rate) * gb2 * gb2 db2 = mu * db2 + (1 - mu) * lr0 * gb2 / (np.sqrt(cache_b2) + eps) b2 -= db2 gW1 = derivative_w1(Xbatch, Z, Ybatch, pYbatch, W2) + reg * W1 cache_W1 = decay_rate * cache_W1 + (1 - decay_rate) * gW1 * gW1 dW1 = mu * dW1 + (1 - mu) * lr0 * gW1 / (np.sqrt(cache_W1) + eps) W1 -= dW1 gb1 = derivative_b1(Z, Ybatch, pYbatch, W2) + reg * b1 cache_b1 = decay_rate * cache_b1 + (1 - decay_rate) * gb1 * gb1 db1 = mu * db1 + (1 - mu) * lr0 * gb1 / (np.sqrt(cache_b1) + eps) b1 -= db1 if j % print_period == 0: pY, _ = 
forward(Xtest, W1, b1, W2, b2) l = cost(pY, Ytest_ind) loss_rms.append(l) print("Cost at iteration i=%d, j=%d: %.6f" % (i, j, l)) err = error_rate(pY, Ytest) err_rms.append(err) print("Error rate:", err) pY, _ = forward(Xtest, W1, b1, W2, b2) print("Final error rate:", error_rate(pY, Ytest)) plt.plot(loss_adam, label='adam') plt.plot(loss_rms, label='rmsprop') plt.legend() plt.show()
def main(): # 2 scenarios # 1. batch SGD with constant learning rate # 2. batch SGD with RMSprop max_iter = 15 print_period = 10 Xtrain, Xtest, Ytrain, Ytest = get_normalized_data() lr = 0.0001 reg = 0.001 # Xtrain = X[:-1000, ] # Ytrain = Y[:-1000] # Xtest = X[-1000:, ] # Ytest = Y[-1000:, ] Ytrain_ind = y2indicator(Ytrain) Ytest_ind = y2indicator(Ytest) N, D = Xtrain.shape batch_sz = 500 n_batches = N // batch_sz M = 300 K = 10 W1 = np.random.randn(D, M) / 28 b1 = np.zeros(M) W2 = np.random.randn(M, K) / np.sqrt(M) b2 = np.zeros(K) W1_0 = W1.copy() b1_0 = b1.copy() W2_0 = W2.copy() b2_0 = b2.copy() # Batch losses_batch = [] error_batch = [] for i in range(max_iter): for j in range(n_batches): Xbatch = Xtrain[j * batch_sz:(j * batch_sz + batch_sz), ] Ybatch = Ytrain_ind[j * batch_sz:(j * batch_sz + batch_sz), ] pYbatch, Z = forward(Xbatch, W1, b1, W2, b2) # updates W2 -= lr * (derivative_w2(Z, Ybatch, pYbatch) + reg * W2) b2 -= lr * (derivative_b2(Ybatch, pYbatch) + reg * b2) W1 -= lr * (derivative_w1(Xbatch, Z, Ybatch, pYbatch, W2) + reg * W1) b1 -= lr * (derivative_b1(Z, Ybatch, pYbatch, W2) + reg * b1) if j % print_period == 0: pY, _ = forward(Xtest, W1, b1, W2, b2) l = cost(pY, Ytest_ind) losses_batch.append(l) print("Cost at iteration i=%d, j=%d: %.6f" % (i, j, l)) e = error_rate(pY, Ytest) error_batch.append(e) print("Error rate:", e) pY, _ = forward(Xtest, W1, b1, W2, b2) print("Final error rate: ", error_rate(pY, Ytest)) # RMSprop (reset to the saved initial weights so the comparison is fair) W1 = W1_0.copy() b1 = b1_0.copy() W2 = W2_0.copy() b2 = b2_0.copy() losses_rms = [] errors_rms = [] lr0 = 0.001 cacheW2 = 0 cacheb2 = 0 cacheW1 = 0 cacheb1 = 0 decay_rate = 0.99 eps = 0.000001 for i in range(max_iter): for j in range(n_batches): Xbatch = Xtrain[j * batch_sz:(j * batch_sz + batch_sz), ] Ybatch = Ytrain_ind[j * batch_sz:(j * batch_sz + batch_sz), ] pYbatch, Z = forward(Xbatch, W1, b1, W2, b2) # gradients gW2 = (derivative_w2(Z, Ybatch, pYbatch) + reg * W2) gb2 = (derivative_b2(Ybatch, pYbatch) + reg * b2) gW1 = (derivative_w1(Xbatch, Z, Ybatch, pYbatch, W2) + reg * W1) gb1 = (derivative_b1(Z, Ybatch, pYbatch, W2) + reg * b1) # caches cacheW2 = decay_rate * cacheW2 + (1 - decay_rate) * gW2 * gW2 cacheb2 = decay_rate * cacheb2 + (1 - decay_rate) * gb2 * gb2 cacheW1 = decay_rate * cacheW1 + (1 - decay_rate) * gW1 * gW1 cacheb1 = decay_rate * cacheb1 + (1 - decay_rate) * gb1 * gb1 W2 -= lr0 * gW2 / (np.sqrt(cacheW2) + eps) b2 -= lr0 * gb2 / (np.sqrt(cacheb2) + eps) W1 -= lr0 * gW1 / (np.sqrt(cacheW1) + eps) b1 -= lr0 * gb1 / (np.sqrt(cacheb1) + eps) if j % print_period == 0: pY, _ = forward(Xtest, W1, b1, W2, b2) l = cost(pY, Ytest_ind) losses_rms.append(l) print("Cost at iteration i=%d, j=%d: %.6f" % (i, j, l)) e = error_rate(pY, Ytest) errors_rms.append(e) print("Error rate:", e) pY, _ = forward(Xtest, W1, b1, W2, b2) print("Final error rate: ", error_rate(pY, Ytest)) plt.plot(losses_batch, label='batch') plt.plot(losses_rms, label='rmsprop') plt.legend() plt.show()
def main(): # compare 3 scenarios: # 1. batch SGD # 2. batch SGD with momentum # 3. batch SGD with Nesterov momentum max_iter = 20 # make it 30 for sigmoid print_period = 50 Xtrain, Xtest, Ytrain, Ytest = get_normalized_data() lr = 0.00004 reg = 0.01 Ytrain_ind = y2indicator(Ytrain) Ytest_ind = y2indicator(Ytest) N, D = Xtrain.shape batch_sz = 500 n_batches = N // batch_sz M = 300 K = 10 W1 = np.random.randn(D, M) / np.sqrt(D) b1 = np.zeros(M) W2 = np.random.randn(M, K) / np.sqrt(M) b2 = np.zeros(K) # save initial weights W1_0 = W1.copy() b1_0 = b1.copy() W2_0 = W2.copy() b2_0 = b2.copy() # 1. batch losses_batch = [] errors_batch = [] for i in range(max_iter): for j in range(n_batches): Xbatch = Xtrain[j*batch_sz:(j*batch_sz + batch_sz),] Ybatch = Ytrain_ind[j*batch_sz:(j*batch_sz + batch_sz),] pYbatch, Z = forward(Xbatch, W1, b1, W2, b2) # print "first batch cost:", cost(pYbatch, Ybatch) # updates W2 -= lr*(derivative_w2(Z, Ybatch, pYbatch) + reg*W2) b2 -= lr*(derivative_b2(Ybatch, pYbatch) + reg*b2) W1 -= lr*(derivative_w1(Xbatch, Z, Ybatch, pYbatch, W2) + reg*W1) b1 -= lr*(derivative_b1(Z, Ybatch, pYbatch, W2) + reg*b1) if j % print_period == 0: pY, _ = forward(Xtest, W1, b1, W2, b2) l = cost(pY, Ytest_ind) losses_batch.append(l) print("Cost at iteration i=%d, j=%d: %.6f" % (i, j, l)) e = error_rate(pY, Ytest) errors_batch.append(e) print("Error rate:", e) pY, _ = forward(Xtest, W1, b1, W2, b2) print("Final error rate:", error_rate(pY, Ytest)) # 2. batch with momentum W1 = W1_0.copy() b1 = b1_0.copy() W2 = W2_0.copy() b2 = b2_0.copy() losses_momentum = [] errors_momentum = [] mu = 0.9 dW2 = 0 db2 = 0 dW1 = 0 db1 = 0 for i in range(max_iter): for j in range(n_batches): Xbatch = Xtrain[j*batch_sz:(j*batch_sz + batch_sz),] Ybatch = Ytrain_ind[j*batch_sz:(j*batch_sz + batch_sz),] pYbatch, Z = forward(Xbatch, W1, b1, W2, b2) # gradients gW2 = derivative_w2(Z, Ybatch, pYbatch) + reg*W2 gb2 = derivative_b2(Ybatch, pYbatch) + reg*b2 gW1 = derivative_w1(Xbatch, Z, Ybatch, pYbatch, W2) + reg*W1 gb1 = derivative_b1(Z, Ybatch, pYbatch, W2) + reg*b1 # update velocities dW2 = mu*dW2 - lr*gW2 db2 = mu*db2 - lr*gb2 dW1 = mu*dW1 - lr*gW1 db1 = mu*db1 - lr*gb1 # updates W2 += dW2 b2 += db2 W1 += dW1 b1 += db1 if j % print_period == 0: pY, _ = forward(Xtest, W1, b1, W2, b2) l = cost(pY, Ytest_ind) losses_momentum.append(l) print("Cost at iteration i=%d, j=%d: %.6f" % (i, j, l)) e = error_rate(pY, Ytest) errors_momentum.append(e) print("Error rate:", e) pY, _ = forward(Xtest, W1, b1, W2, b2) print("Final error rate:", error_rate(pY, Ytest)) # 3. 
batch with Nesterov momentum W1 = W1_0.copy() b1 = b1_0.copy() W2 = W2_0.copy() b2 = b2_0.copy() losses_nesterov = [] errors_nesterov = [] mu = 0.9 vW2 = 0 vb2 = 0 vW1 = 0 vb1 = 0 for i in range(max_iter): for j in range(n_batches): Xbatch = Xtrain[j*batch_sz:(j*batch_sz + batch_sz),] Ybatch = Ytrain_ind[j*batch_sz:(j*batch_sz + batch_sz),] pYbatch, Z = forward(Xbatch, W1, b1, W2, b2) # updates gW2 = derivative_w2(Z, Ybatch, pYbatch) + reg*W2 gb2 = derivative_b2(Ybatch, pYbatch) + reg*b2 gW1 = derivative_w1(Xbatch, Z, Ybatch, pYbatch, W2) + reg*W1 gb1 = derivative_b1(Z, Ybatch, pYbatch, W2) + reg*b1 # v update vW2 = mu*vW2 - lr*gW2 vb2 = mu*vb2 - lr*gb2 vW1 = mu*vW1 - lr*gW1 vb1 = mu*vb1 - lr*gb1 # param update W2 += mu*vW2 - lr*gW2 b2 += mu*vb2 - lr*gb2 W1 += mu*vW1 - lr*gW1 b1 += mu*vb1 - lr*gb1 if j % print_period == 0: pY, _ = forward(Xtest, W1, b1, W2, b2) l = cost(pY, Ytest_ind) losses_nesterov.append(l) print("Cost at iteration i=%d, j=%d: %.6f" % (i, j, l)) e = error_rate(pY, Ytest) errors_nesterov.append(e) print("Error rate:", e) pY, _ = forward(Xtest, W1, b1, W2, b2) print("Final error rate:", error_rate(pY, Ytest)) plt.plot(losses_batch, label="batch") plt.plot(losses_momentum, label="momentum") plt.plot(losses_nesterov, label="nesterov") plt.legend() plt.show()
def main(): max_iter = 20 # make it 30 for sigmoid print_period = 10 X, Y = get_normalized_data() lr = 0.00004 reg = 0.01 Xtrain = X[:-1000,] Ytrain = Y[:-1000] Xtest = X[-1000:,] Ytest = Y[-1000:] Ytrain_ind = y2indicator(Ytrain) Ytest_ind = y2indicator(Ytest) N, D = Xtrain.shape batch_sz = 500 n_batches = N / batch_sz M = 300 K = 10 W1 = np.random.randn(D, M) / 28 b1 = np.zeros(M) W2 = np.random.randn(M, K) / np.sqrt(M) b2 = np.zeros(K) # 1. const # cost = -16 LL_batch = [] CR_batch = [] for i in xrange(max_iter): for j in xrange(n_batches): Xbatch = Xtrain[j*batch_sz:(j*batch_sz + batch_sz),] Ybatch = Ytrain_ind[j*batch_sz:(j*batch_sz + batch_sz),] pYbatch, Z = forward(Xbatch, W1, b1, W2, b2) # print "first batch cost:", cost(pYbatch, Ybatch) # updates W2 -= lr*(derivative_w2(Z, Ybatch, pYbatch) + reg*W2) b2 -= lr*(derivative_b2(Ybatch, pYbatch) + reg*b2) W1 -= lr*(derivative_w1(Xbatch, Z, Ybatch, pYbatch, W2) + reg*W1) b1 -= lr*(derivative_b1(Z, Ybatch, pYbatch, W2) + reg*b1) if j % print_period == 0: # calculate just for LL pY, _ = forward(Xtest, W1, b1, W2, b2) # print "pY:", pY ll = cost(pY, Ytest_ind) LL_batch.append(ll) print "Cost at iteration i=%d, j=%d: %.6f" % (i, j, ll) err = error_rate(pY, Ytest) CR_batch.append(err) print "Error rate:", err pY, _ = forward(Xtest, W1, b1, W2, b2) print "Final error rate:", error_rate(pY, Ytest) # 2. RMSprop W1 = np.random.randn(D, M) / 28 b1 = np.zeros(M) W2 = np.random.randn(M, K) / np.sqrt(M) b2 = np.zeros(K) LL_rms = [] CR_rms = [] lr0 = 0.001 # if you set this too high you'll get NaN! cache_W2 = 0 cache_b2 = 0 cache_W1 = 0 cache_b1 = 0 decay_rate = 0.999 eps = 0.0000000001 for i in xrange(max_iter): for j in xrange(n_batches): Xbatch = Xtrain[j*batch_sz:(j*batch_sz + batch_sz),] Ybatch = Ytrain_ind[j*batch_sz:(j*batch_sz + batch_sz),] pYbatch, Z = forward(Xbatch, W1, b1, W2, b2) # print "first batch cost:", cost(pYbatch, Ybatch) # updates gW2 = derivative_w2(Z, Ybatch, pYbatch) + reg*W2 cache_W2 = decay_rate*cache_W2 + (1 - decay_rate)*gW2*gW2 W2 -= lr0 * gW2 / (np.sqrt(cache_W2) + eps) gb2 = derivative_b2(Ybatch, pYbatch) + reg*b2 cache_b2 = decay_rate*cache_b2 + (1 - decay_rate)*gb2*gb2 b2 -= lr0 * gb2 / (np.sqrt(cache_b2) + eps) gW1 = derivative_w1(Xbatch, Z, Ybatch, pYbatch, W2) + reg*W1 cache_W1 = decay_rate*cache_W1 + (1 - decay_rate)*gW1*gW1 W1 -= lr0 * gW1 / (np.sqrt(cache_W1) + eps) gb1 = derivative_b1(Z, Ybatch, pYbatch, W2) + reg*b1 cache_b1 = decay_rate*cache_b1 + (1 - decay_rate)*gb1*gb1 b1 -= lr0 * gb1 / (np.sqrt(cache_b1) + eps) if j % print_period == 0: # calculate just for LL pY, _ = forward(Xtest, W1, b1, W2, b2) # print "pY:", pY ll = cost(pY, Ytest_ind) LL_rms.append(ll) print "Cost at iteration i=%d, j=%d: %.6f" % (i, j, ll) err = error_rate(pY, Ytest) CR_rms.append(err) print "Error rate:", err pY, _ = forward(Xtest, W1, b1, W2, b2) print "Final error rate:", error_rate(pY, Ytest) plt.plot(LL_batch, label='const') plt.plot(LL_rms, label='rms') plt.legend() plt.show()
def main(): max_iter = 2 # 30 for sigmoid print_period = 10 X, Y = get_normalized_data() lr = 0.00004 reg = 0.01 Xtrain = X[:-1000, ] Ytrain = Y[:-1000] Xtest = X[-1000:, ] Ytest = Y[-1000:] Ytrain_ind = y2indicator(Ytrain) Ytest_ind = y2indicator(Ytest) N, D = Xtrain.shape batch_sz = 500 n_batches = N / batch_sz M = 300 K = 10 W1 = np.random.randn(D, M) / 28 b1 = np.zeros(M) W2 = np.random.randn(M, K) / np.sqrt(M) b2 = np.zeros(K) # 1. batch SGD LL_batch = [] CR_batch = [] for i in range(max_iter): for j in range(int(n_batches)): Xbatch = Xtrain[j * batch_sz:(j * batch_sz + batch_sz), ] Ybatch = Ytrain_ind[j * batch_sz:(j * batch_sz + batch_sz), ] pYbatch, Z = forward(Xbatch, W1, b1, W2, b2) # updates W2 -= lr * (derivative_w2(Z, Ybatch, pYbatch) + reg * W2) b2 -= lr * (derivative_b2(Ybatch, pYbatch) + reg * b2) W1 -= lr * (derivative_w1(Xbatch, Z, Ybatch, pYbatch, W2) + reg * W1) b1 -= lr * (derivative_b1(Z, Ybatch, pYbatch, W2) + reg * b1) if j % print_period == 0: pY, _ = forward(Xtest, W1, b1, W2, b2) ll = cost(pY, Ytest_ind) LL_batch.append(ll) print("Cost at iteration i=%d, j=%d: %.6f" % (i, j, ll)) err = error_rate(pY, Ytest) CR_batch.append(err) print("Error rate:", err) pY, _ = forward(Xtest, W1, b1, W2, b2) print("Final error rate:", error_rate(pY, Ytest)) # 2. batch with momentum W1 = np.random.randn(D, M) / 28 b1 = np.zeros(M) W2 = np.random.randn(M, K) / np.sqrt(M) b2 = np.zeros(K) LL_momentum = [] CR_momentum = [] mu = 0.9 dW2 = 0 db2 = 0 dW1 = 0 db1 = 0 for i in range(max_iter): for j in range(int(n_batches)): Xbatch = Xtrain[j * batch_sz:(j * batch_sz + batch_sz), ] Ybatch = Ytrain_ind[j * batch_sz:(j * batch_sz + batch_sz), ] pYbatch, Z = forward(Xbatch, W1, b1, W2, b2) # updates dW2 = mu * dW2 - lr * (derivative_w2(Z, Ybatch, pYbatch) + reg * W2) W2 += dW2 db2 = mu * db2 - lr * (derivative_b2(Ybatch, pYbatch) + reg * b2) b2 += db2 dW1 = mu * dW1 - lr * ( derivative_w1(Xbatch, Z, Ybatch, pYbatch, W2) + reg * W1) W1 += dW1 db1 = mu * db1 - lr * (derivative_b1(Z, Ybatch, pYbatch, W2) + reg * b1) b1 += db1 if j % print_period == 0: pY, _ = forward(Xtest, W1, b1, W2, b2) ll = cost(pY, Ytest_ind) LL_momentum.append(ll) print("Cost at iteration i=%d, j=%d: %.6f" % (i, j, ll)) err = error_rate(pY, Ytest) CR_momentum.append(err) print("Error rate:", err) pY, _ = forward(Xtest, W1, b1, W2, b2) print("Final error rate:", error_rate(pY, Ytest)) # 3. 
batch with Nesterov momentum W1 = np.random.randn(D, M) / 28 b1 = np.zeros(M) W2 = np.random.randn(M, K) / np.sqrt(M) b2 = np.zeros(K) LL_nest = [] CR_nest = [] mu = 0.9 dW2 = 0 db2 = 0 dW1 = 0 db1 = 0 for i in range(max_iter): for j in range(int(n_batches)): Xbatch = Xtrain[j * batch_sz:(j * batch_sz + batch_sz), ] Ybatch = Ytrain_ind[j * batch_sz:(j * batch_sz + batch_sz), ] pYbatch, Z = forward(Xbatch, W1, b1, W2, b2) # updates dW2 = mu * mu * dW2 - (1 + mu) * lr * ( derivative_w2(Z, Ybatch, pYbatch) + reg * W2) W2 += dW2 db2 = mu * mu * db2 - (1 + mu) * lr * ( derivative_b2(Ybatch, pYbatch) + reg * b2) b2 += db2 dW1 = mu * mu * dW1 - (1 + mu) * lr * ( derivative_w1(Xbatch, Z, Ybatch, pYbatch, W2) + reg * W1) W1 += dW1 db1 = mu * mu * db1 - (1 + mu) * lr * ( derivative_b1(Z, Ybatch, pYbatch, W2) + reg * b1) b1 += db1 if j % print_period == 0: pY, _ = forward(Xtest, W1, b1, W2, b2) ll = cost(pY, Ytest_ind) LL_nest.append(ll) print("Cost at iteration i=%d, j=%d: %.6f" % (i, j, ll)) err = error_rate(pY, Ytest) CR_nest.append(err) print("Error rate:", err) pY, _ = forward(Xtest, W1, b1, W2, b2) print("Final error rate:", error_rate(pY, Ytest)) plt.plot(LL_batch, label="batch") plt.plot(LL_momentum, label="momentum") plt.plot(LL_nest, label="nesterov") plt.legend() plt.show()
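The last script writes the Nesterov update as a recursion on the previous parameter increment, d = mu*mu*d - (1 + mu)*lr*g; w += d, whereas the other scripts keep an explicit velocity and take a lookahead step, v = mu*v - lr*g; w += mu*v - lr*g. A small numerical sketch comparing the two on a toy quadratic; the gradient, learning rate, and starting point are illustrative assumptions. The two forms behave very similarly here, though they are not term-for-term identical.

def grad(w):
    # illustrative gradient of the quadratic 1.5 * w**2
    return 3.0 * w

lr, mu = 0.01, 0.9
w_a = 5.0   # velocity + lookahead form used in the earlier scripts
w_b = 5.0   # folded-in increment form used in the script above
v = 0.0
d = 0.0

for t in range(100):
    g = grad(w_a)
    v = mu * v - lr * g
    w_a = w_a + mu * v - lr * g          # velocity update, then lookahead step

    g = grad(w_b)
    d = mu * mu * d - (1 + mu) * lr * g  # recursion on the previous increment
    w_b = w_b + d

print(w_a, w_b)  # both decay toward 0 at a similar rate on this toy problem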