# Imports shared by all of the scripts below. The data/model helpers
# (get_normalized_data, get_transformed_data, y2indicator, forward,
# derivative_W1/b1/W2/b2, cost, cross_entropy, error_rate) are the course's
# own utilities; the `util` module name is an assumption -- adjust the import
# to match your repo layout. Note that the scripts below assume two slightly
# different argument orders for the derivative_* helpers.
import sys
from datetime import datetime

import numpy as np
import matplotlib.pyplot as plt
from sklearn.utils import shuffle
from keras.datasets import mnist  # used by the MNIST momentum script below

from util import (get_normalized_data, get_transformed_data, y2indicator,
                  forward, derivative_W1, derivative_b1, derivative_W2,
                  derivative_b2, cost, cross_entropy, error_rate)


def main():
    # compare 4 scenarios:
    # 1. batch SGD with constant learning rate
    # 2. batch SGD with RMSProp
    # 3. batch SGD with AdaGrad
    # 4. batch SGD with exponential decay (commented out below)
    np.random.seed(2)

    max_iter = 20
    print_period = 10

    X, Y = get_normalized_data()
    lr = 0.00004
    reg = 0.01

    Xtrain, Ytrain = X[:-1000, :], Y[:-1000]
    Xtest, Ytest = X[-1000:, :], Y[-1000:]
    Ytrain_ind = y2indicator(Ytrain)
    Ytest_ind = y2indicator(Ytest)

    N, D = Xtrain.shape
    M = 300  # number of hidden layer units
    K = len(set(Ytrain))
    batch_size = 500
    n_batches = N // batch_size

    # randomly initialize weights:
    W1 = np.random.randn(D, M) / np.sqrt(D)
    b1 = np.zeros(M)
    W2 = np.random.randn(M, K) / np.sqrt(M)
    b2 = np.zeros(K)

    # save initial weights:
    W1_0 = W1.copy()
    b1_0 = b1.copy()
    W2_0 = W2.copy()
    b2_0 = b2.copy()

    # 1. batch SGD with constant learning rate:
    LL_batch = []
    CR_batch = []
    t0 = datetime.now()
    print('\nperforming batch SGD with constant learning rate...')
    for i in range(max_iter):
        for j in range(n_batches):
            Xbatch = Xtrain[j * batch_size:(j + 1) * batch_size, :]
            Ybatch = Ytrain_ind[j * batch_size:(j + 1) * batch_size, :]
            p_Ybatch, Z = forward(Xbatch, W1, b1, W2, b2)
            # print(Z.shape, p_Ybatch.shape, Ybatch.shape)
            # print('First batch cost:', cost(p_Ybatch, Ybatch))

            # updates:
            W2 -= lr * (derivative_W2(Z, Ybatch, p_Ybatch) + reg * W2)
            b2 -= lr * (derivative_b2(Ybatch, p_Ybatch) + reg * b2)
            W1 -= lr * (derivative_W1(Xbatch, Z, Ybatch, p_Ybatch, W2) + reg * W1)
            b1 -= lr * (derivative_b1(Z, Ybatch, p_Ybatch, W2) + reg * b1)

            if j % print_period == 0:
                pY, _ = forward(Xtest, W1, b1, W2, b2)
                # print('pY:', pY)
                ll = cost(pY, Ytest_ind)
                LL_batch.append(ll)
                print('\ni: %d, j: %d, cost: %.6f' % (i, j, ll))
                error = error_rate(pY, Ytest)
                CR_batch.append(error)
                print('error rate:', error)

    dt1 = datetime.now() - t0
    pY, _ = forward(Xtest, W1, b1, W2, b2)
    final_err1 = error_rate(pY, Ytest)

    # plot the cost:
    # plt.plot(LL_batch)
    # plt.title('Cost for batch SGD with const lr')
    # plt.show()

    # 2. batch SGD with RMSProp:
    W1 = W1_0.copy()
    b1 = b1_0.copy()
    W2 = W2_0.copy()
    b2 = b2_0.copy()
    LL_RMSProp = []
    CR_RMSProp = []
    lr0 = 0.001  # initial learning rate
    cache_W2 = 1
    cache_b2 = 1
    cache_W1 = 1
    cache_b1 = 1
    decay = 0.999
    eps = 1e-9
    t0 = datetime.now()
    print('\nperforming batch SGD with RMSProp...')
    for i in range(max_iter):
        for j in range(n_batches):
            Xbatch = Xtrain[j * batch_size:(j + 1) * batch_size, :]
            Ybatch = Ytrain_ind[j * batch_size:(j + 1) * batch_size, :]
            p_Ybatch, Z = forward(Xbatch, W1, b1, W2, b2)

            # updates:
            gW2 = derivative_W2(Z, Ybatch, p_Ybatch) + reg * W2
            cache_W2 = decay * cache_W2 + (1 - decay) * gW2 * gW2
            W2 -= lr0 * gW2 / np.sqrt(cache_W2 + eps)

            gb2 = derivative_b2(Ybatch, p_Ybatch) + reg * b2
            cache_b2 = decay * cache_b2 + (1 - decay) * gb2 * gb2
            b2 -= lr0 * gb2 / np.sqrt(cache_b2 + eps)

            gW1 = derivative_W1(Xbatch, Z, Ybatch, p_Ybatch, W2) + reg * W1
            cache_W1 = decay * cache_W1 + (1 - decay) * gW1 * gW1
            W1 -= lr0 * gW1 / np.sqrt(cache_W1 + eps)

            gb1 = derivative_b1(Z, Ybatch, p_Ybatch, W2) + reg * b1
            cache_b1 = decay * cache_b1 + (1 - decay) * gb1 * gb1
            b1 -= lr0 * gb1 / np.sqrt(cache_b1 + eps)

            if j % print_period == 0:
                pY, _ = forward(Xtest, W1, b1, W2, b2)
                ll = cost(pY, Ytest_ind)
                LL_RMSProp.append(ll)
                print('\ni: %d, j: %d, cost: %.6f' % (i, j, ll))
                error = error_rate(pY, Ytest)
                CR_RMSProp.append(error)
                print('error rate:', error)

    dt2 = datetime.now() - t0
    pY, _ = forward(Xtest, W1, b1, W2, b2)
    final_err2 = error_rate(pY, Ytest)

    # plot the cost:
    # plt.plot(LL_RMSProp)
    # plt.title('Cost for batch SGD with RMSProp')
    # plt.show()

    # 3. batch SGD with AdaGrad:
    W1 = W1_0.copy()
    b1 = b1_0.copy()
    W2 = W2_0.copy()
    b2 = b2_0.copy()
    LL_AdaGrad = []
    CR_AdaGrad = []
    lr0 = 0.01  # initial learning rate
    cache_W2 = 1
    cache_b2 = 1
    cache_W1 = 1
    cache_b1 = 1
    eps = 1e-9
    t0 = datetime.now()
    print('\nperforming batch SGD with AdaGrad...')
    for i in range(max_iter):
        for j in range(n_batches):
            Xbatch = Xtrain[j * batch_size:(j + 1) * batch_size, :]
            Ybatch = Ytrain_ind[j * batch_size:(j + 1) * batch_size, :]
            p_Ybatch, Z = forward(Xbatch, W1, b1, W2, b2)

            # updates:
            gW2 = derivative_W2(Z, Ybatch, p_Ybatch) + reg * W2
            cache_W2 = cache_W2 + gW2 * gW2
            W2 -= lr0 * gW2 / np.sqrt(cache_W2 + eps)

            gb2 = derivative_b2(Ybatch, p_Ybatch) + reg * b2
            cache_b2 = cache_b2 + gb2 * gb2
            b2 -= lr0 * gb2 / np.sqrt(cache_b2 + eps)

            gW1 = derivative_W1(Xbatch, Z, Ybatch, p_Ybatch, W2) + reg * W1
            cache_W1 = cache_W1 + gW1 * gW1
            W1 -= lr0 * gW1 / np.sqrt(cache_W1 + eps)

            gb1 = derivative_b1(Z, Ybatch, p_Ybatch, W2) + reg * b1
            cache_b1 = cache_b1 + gb1 * gb1
            b1 -= lr0 * gb1 / np.sqrt(cache_b1 + eps)

            if j % print_period == 0:
                pY, _ = forward(Xtest, W1, b1, W2, b2)
                ll = cost(pY, Ytest_ind)
                LL_AdaGrad.append(ll)
                print('\ni: %d, j: %d, cost: %.6f' % (i, j, ll))
                error = error_rate(pY, Ytest)
                CR_AdaGrad.append(error)
                print('error rate:', error)

    dt3 = datetime.now() - t0
    pY, _ = forward(Xtest, W1, b1, W2, b2)
    final_err3 = error_rate(pY, Ytest)

    # plot the cost:
    # plt.plot(LL_AdaGrad)
    # plt.title('Cost for batch SGD with AdaGrad')
    # plt.show()

    '''
    # 4. batch SGD with exponential decay:
    W1 = W1_0.copy()
    b1 = b1_0.copy()
    W2 = W2_0.copy()
    b2 = b2_0.copy()
    LL_exp = []
    CR_exp = []
    lr0 = 0.0004  # initial learning rate
    k = 1e-7
    t = 0  # update counter
    lr = lr0
    t0 = datetime.now()
    print('\nperforming batch SGD with lr exponential decay...')
    for i in range(max_iter):
        for j in range(n_batches):
            Xbatch = Xtrain[j * batch_size:(j + 1) * batch_size, :]
            Ybatch = Ytrain_ind[j * batch_size:(j + 1) * batch_size, :]
            p_Ybatch, Z = forward(Xbatch, W1, b1, W2, b2)

            # updates:
            gW2 = derivative_W2(Z, Ybatch, p_Ybatch) + reg * W2
            W2 -= lr * gW2
            gb2 = derivative_b2(Ybatch, p_Ybatch) + reg * b2
            b2 -= lr * gb2
            gW1 = derivative_W1(Xbatch, Z, Ybatch, p_Ybatch, W2) + reg * W1
            W1 -= lr * gW1
            gb1 = derivative_b1(Z, Ybatch, p_Ybatch, W2) + reg * b1
            b1 -= lr * gb1

            # decrease the learning rate:
            lr = lr0 * np.exp(-k * t)
            t += 1

            if j % print_period == 0:
                print('current learning rate:', lr)
                pY, _ = forward(Xtest, W1, b1, W2, b2)
                ll = cost(pY, Ytest_ind)
                LL_exp.append(ll)
                print('\ni: %d, j: %d, cost: %.6f' % (i, j, ll))
                error = error_rate(pY, Ytest)
                CR_exp.append(error)
                print('error rate:', error)

    dt4 = datetime.now() - t0
    pY, _ = forward(Xtest, W1, b1, W2, b2)
    print('\nFinal error rate:', error_rate(pY, Ytest))
    print('Elapsed time for batch SGD with lr exponential decay:', dt4)

    # plot the cost:
    # plt.plot(LL_exp)
    # plt.title('Cost for batch SGD with lr exponential decay')
    # plt.show()
    '''

    print('\nBatch SGD with constant learning rate:')
    print('final error rate:', final_err1)
    print('elapsed time:', dt1)

    print('\nBatch SGD with RMSProp:')
    print('final error rate:', final_err2)
    print('elapsed time:', dt2)

    print('\nBatch SGD with AdaGrad:')
    print('final error rate:', final_err3)
    print('elapsed time:', dt3)

    # plot the costs together:
    plt.plot(LL_batch, label='const_lr')
    plt.plot(LL_RMSProp, label='RMSProp')
    plt.plot(LL_AdaGrad, label='AdaGrad')
    # plt.plot(LL_exp, label='lr_exp_decay')
    plt.legend()
    plt.show()
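
# --- aside: AdaGrad vs RMSProp in isolation ----------------------------------
# A minimal sketch of the two update rules used above, reduced to a single
# parameter. The helper names `adagrad_step` and `rmsprop_step` are
# illustrative only (they are not part of these scripts); the point is that
# the sole difference is whether the squared-gradient cache accumulates
# forever (AdaGrad) or is a leaky moving average (RMSProp).

def adagrad_step(w, g, cache, lr=0.01, eps=1e-9):
    cache = cache + g * g                         # monotonically growing cache
    return w - lr * g / np.sqrt(cache + eps), cache

def rmsprop_step(w, g, cache, lr=0.001, decay=0.999, eps=1e-9):
    cache = decay * cache + (1 - decay) * g * g   # leaky average; can shrink
    return w - lr * g / np.sqrt(cache + eps), cache
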
def main():
    # compare 2 scenarios:
    # 1. batch GD with RMSProp and momentum
    # 2. Adam
    max_iter = 20
    print_period = 10

    X, Y = get_normalized_data()
    reg = 0.01

    Xtrain, Ytrain = X[:-1000, :], Y[:-1000]
    Xtest, Ytest = X[-1000:, :], Y[-1000:]
    Ytrain_ind = y2indicator(Ytrain)
    Ytest_ind = y2indicator(Ytest)

    N, D = Xtrain.shape
    M = 300  # number of hidden layer units
    K = len(set(Ytrain))
    batch_size = 500
    n_batches = N // batch_size

    # randomly initialize weights:
    W1_0 = np.random.randn(D, M) / np.sqrt(D)
    b1_0 = np.zeros(M)
    W2_0 = np.random.randn(M, K) / np.sqrt(M)
    b2_0 = np.zeros(K)

    # 1. batch GD with RMSProp and momentum:
    print('\nperforming batch GD with RMSProp and momentum...')
    W1 = W1_0.copy()
    b1 = b1_0.copy()
    W2 = W2_0.copy()
    b2 = b2_0.copy()
    LL_rm = []
    CR_rm = []

    # hyperparameters:
    lr0 = 0.001
    # lr0 = 0.0001
    mu = 0.9
    decay = 0.999
    eps = 1e-8

    # momentum (velocity terms):
    dW1 = 0
    db1 = 0
    dW2 = 0
    db2 = 0

    # RMSProp cache (with no bias correction):
    cache_W2 = 1
    cache_b2 = 1
    cache_W1 = 1
    cache_b1 = 1

    t0 = datetime.now()
    for i in range(max_iter):
        for j in range(n_batches):
            Xbatch = Xtrain[j * batch_size:(j + 1) * batch_size, :]
            Ybatch = Ytrain_ind[j * batch_size:(j + 1) * batch_size, :]
            p_Ybatch, Z = forward(Xbatch, W1, b1, W2, b2)

            # updates
            # (note: the velocity here is an exponentially weighted average of
            # the adaptive steps, a slightly different momentum formulation
            # than the classical one shown in the commented lines):
            gW2 = derivative_W2(Z, Ybatch, p_Ybatch) + reg * W2
            cache_W2 = decay * cache_W2 + (1 - decay) * gW2 * gW2
            dW2 = mu * dW2 + (1 - mu) * lr0 * gW2 / np.sqrt(cache_W2 + eps)
            W2 -= dW2
            # dW2 = mu*dW2 - lr0*gW2 / (np.sqrt(cache_W2) + eps)
            # W2 += dW2

            gb2 = derivative_b2(Ybatch, p_Ybatch) + reg * b2
            cache_b2 = decay * cache_b2 + (1 - decay) * gb2 * gb2
            db2 = mu * db2 + (1 - mu) * lr0 * gb2 / np.sqrt(cache_b2 + eps)
            b2 -= db2
            # db2 = mu*db2 - lr0*gb2 / (np.sqrt(cache_b2) + eps)
            # b2 += db2

            gW1 = derivative_W1(Xbatch, Z, Ybatch, p_Ybatch, W2) + reg * W1
            cache_W1 = decay * cache_W1 + (1 - decay) * gW1 * gW1
            dW1 = mu * dW1 + (1 - mu) * lr0 * gW1 / np.sqrt(cache_W1 + eps)
            W1 -= dW1
            # dW1 = mu*dW1 - lr0*gW1 / (np.sqrt(cache_W1) + eps)
            # W1 += dW1

            gb1 = derivative_b1(Z, Ybatch, p_Ybatch, W2) + reg * b1
            cache_b1 = decay * cache_b1 + (1 - decay) * gb1 * gb1
            db1 = mu * db1 + (1 - mu) * lr0 * gb1 / np.sqrt(cache_b1 + eps)
            b1 -= db1
            # db1 = mu*db1 - lr0*gb1 / (np.sqrt(cache_b1) + eps)
            # b1 += db1

            if j % print_period == 0:
                pY, _ = forward(Xtest, W1, b1, W2, b2)
                ll = cost(pY, Ytest_ind)
                LL_rm.append(ll)
                print('\ni: %d, j: %d, cost: %.6f' % (i, j, ll))
                error = error_rate(pY, Ytest)
                CR_rm.append(error)
                print('error rate:', error)

    dt1 = datetime.now() - t0
    pY, _ = forward(Xtest, W1, b1, W2, b2)
    print('\nFinal error rate:', error_rate(pY, Ytest))
    print('Elapsed time for batch GD with RMSProp and momentum:', dt1)

    # plot the cost:
    plt.plot(LL_rm)
    plt.title('Cost for batch GD with RMSProp and momentum')
    plt.show()

    # 2. Adam:
    print('\nperforming Adam...')
    W1 = W1_0.copy()
    b1 = b1_0.copy()
    W2 = W2_0.copy()
    b2 = b2_0.copy()

    # hyperparameters:
    lr = 0.001
    beta1 = 0.9
    beta2 = 0.999
    eps = 1e-8

    # 1st moment:
    mW1 = 0
    mb1 = 0
    mW2 = 0
    mb2 = 0

    # 2nd moment:
    vW1 = 0
    vb1 = 0
    vW2 = 0
    vb2 = 0

    LL_adam = []
    CR_adam = []
    t0 = datetime.now()
    t = 1  # update counter; starts at 1 because the bias correction divides by (1 - beta**t)
    for i in range(max_iter):
        for j in range(n_batches):
            Xbatch = Xtrain[j * batch_size:(j + 1) * batch_size, :]
            Ybatch = Ytrain_ind[j * batch_size:(j + 1) * batch_size, :]
            p_Ybatch, Z = forward(Xbatch, W1, b1, W2, b2)

            # gradients:
            gW2 = derivative_W2(Z, Ybatch, p_Ybatch) + reg * W2
            gb2 = derivative_b2(Ybatch, p_Ybatch) + reg * b2
            gW1 = derivative_W1(Xbatch, Z, Ybatch, p_Ybatch, W2) + reg * W1
            gb1 = derivative_b1(Z, Ybatch, p_Ybatch, W2) + reg * b1

            # 1st moment:
            mW2 = beta1 * mW2 + (1 - beta1) * gW2
            mb2 = beta1 * mb2 + (1 - beta1) * gb2
            mW1 = beta1 * mW1 + (1 - beta1) * gW1
            mb1 = beta1 * mb1 + (1 - beta1) * gb1

            # 2nd moment:
            vW2 = beta2 * vW2 + (1 - beta2) * gW2 * gW2
            vb2 = beta2 * vb2 + (1 - beta2) * gb2 * gb2
            vW1 = beta2 * vW1 + (1 - beta2) * gW1 * gW1
            vb1 = beta2 * vb1 + (1 - beta2) * gb1 * gb1

            # bias correction:
            mW2_bc = mW2 / (1 - beta1**t)
            mb2_bc = mb2 / (1 - beta1**t)
            mW1_bc = mW1 / (1 - beta1**t)
            mb1_bc = mb1 / (1 - beta1**t)
            vW2_bc = vW2 / (1 - beta2**t)
            vb2_bc = vb2 / (1 - beta2**t)
            vW1_bc = vW1 / (1 - beta2**t)
            vb1_bc = vb1 / (1 - beta2**t)

            # update the weights and biases (parameters):
            W2 = W2 - lr * mW2_bc / np.sqrt(vW2_bc + eps)
            b2 = b2 - lr * mb2_bc / np.sqrt(vb2_bc + eps)
            W1 = W1 - lr * mW1_bc / np.sqrt(vW1_bc + eps)
            b1 = b1 - lr * mb1_bc / np.sqrt(vb1_bc + eps)

            t += 1

            if j % print_period == 0:
                pY, _ = forward(Xtest, W1, b1, W2, b2)
                ll = cost(pY, Ytest_ind)
                LL_adam.append(ll)
                print('\ni: %d, j: %d, cost: %.6f' % (i, j, ll))
                error = error_rate(pY, Ytest)
                CR_adam.append(error)
                print('error rate:', error)

    dt2 = datetime.now() - t0
    pY, _ = forward(Xtest, W1, b1, W2, b2)
    print('\nFinal error rate:', error_rate(pY, Ytest))
    print('Elapsed time for Adam:', dt2)

    # plot the cost:
    plt.plot(LL_adam)
    plt.title('Cost for Adam')
    plt.show()

    # plot the costs from the two experiments together:
    plt.plot(LL_rm, label='RMSProp with momentum')
    plt.plot(LL_adam, label='Adam')
    plt.title('Cost')
    plt.legend()
    plt.show()
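
# --- aside: why Adam's bias correction matters --------------------------------
# With m and v initialized to 0, the raw moving averages underestimate the true
# moments for small t: after one step, m = (1 - beta1)*g = 0.1*g, while the
# corrected m / (1 - beta1**1) = g exactly. A minimal single-parameter sketch
# (the name `adam_step` is illustrative, not from the script above):

def adam_step(w, g, m, v, t, lr=0.001, beta1=0.9, beta2=0.999, eps=1e-8):
    m = beta1 * m + (1 - beta1) * g        # 1st moment (mean of gradients)
    v = beta2 * v + (1 - beta2) * g * g    # 2nd moment (mean of squared grads)
    m_hat = m / (1 - beta1**t)             # undo the zero-initialization bias
    v_hat = v / (1 - beta2**t)
    return w - lr * m_hat / np.sqrt(v_hat + eps), m, v
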
def main():
    Xtrain, Xtest, Ytrain, Ytest = get_transformed_data()
    Ytrain_ind = y2indicator(Ytrain)
    Ytest_ind = y2indicator(Ytest)

    N, D = Xtrain.shape
    M = 300
    K = len(set(Ytrain))

    W1 = np.random.randn(D, M) / np.sqrt(D)
    b1 = np.zeros(M)
    W2 = np.random.randn(M, K) / np.sqrt(M)
    b2 = np.zeros(K)

    # save initial weights:
    W1_0 = W1.copy()
    b1_0 = b1.copy()
    W2_0 = W2.copy()
    b2_0 = b2.copy()

    lr = 0.00004
    reg = 0.01
    batch_sz = 500
    n_batches = N // batch_sz
    epochs = 20

    # 1. batch SGD:
    costs_batch = []
    for t in range(epochs):
        tmpX, tmpY = shuffle(Xtrain, Ytrain_ind)
        for j in range(n_batches):
            x = tmpX[j * batch_sz:(j * batch_sz + batch_sz), :]
            y = tmpY[j * batch_sz:(j * batch_sz + batch_sz), :]
            pY, Z = forward(x, W1, b1, W2, b2)

            W2 -= lr * (derivative_W2(Z, pY, y) + reg * W2)
            b2 -= lr * (derivative_b2(pY, y) + reg * b2)
            W1 -= lr * (derivative_W1(x, W2, Z, pY, y) + reg * W1)
            b1 -= lr * (derivative_b1(W2, Z, pY, y) + reg * b1)

            if j % 10 == 0:
                pY_test, _ = forward(Xtest, W1, b1, W2, b2)
                c = cost(pY_test, Ytest_ind)
                costs_batch.append(c)
                print("Cost at iteration t=%d, j=%d: %.6f" % (t, j, c))
                e = error_rate(pY_test, Ytest)
                print("Error rate:", e)
    print("\n")

    # 2. RMSProp:
    W1 = W1_0.copy()
    b1 = b1_0.copy()
    W2 = W2_0.copy()
    b2 = b2_0.copy()
    cache_W2 = 1
    cache_b2 = 1
    cache_W1 = 1
    cache_b1 = 1
    decay_rate = 0.999
    eps = 1e-10
    lr0 = 0.001

    costs_RMS = []
    for t in range(epochs):
        tmpX, tmpY = shuffle(Xtrain, Ytrain_ind)
        for j in range(n_batches):
            x = tmpX[j * batch_sz:(j * batch_sz + batch_sz), :]
            y = tmpY[j * batch_sz:(j * batch_sz + batch_sz), :]
            pY, Z = forward(x, W1, b1, W2, b2)

            gW2 = derivative_W2(Z, pY, y) + reg * W2
            cache_W2 = decay_rate * cache_W2 + (1 - decay_rate) * gW2 * gW2
            W2 -= lr0 * gW2 / (np.sqrt(cache_W2) + eps)

            gb2 = derivative_b2(pY, y) + reg * b2
            cache_b2 = decay_rate * cache_b2 + (1 - decay_rate) * gb2 * gb2
            b2 -= lr0 * gb2 / (np.sqrt(cache_b2) + eps)

            gW1 = derivative_W1(x, W2, Z, pY, y) + reg * W1
            cache_W1 = decay_rate * cache_W1 + (1 - decay_rate) * gW1 * gW1
            W1 -= lr0 * gW1 / (np.sqrt(cache_W1) + eps)

            gb1 = derivative_b1(W2, Z, pY, y) + reg * b1
            cache_b1 = decay_rate * cache_b1 + (1 - decay_rate) * gb1 * gb1
            b1 -= lr0 * gb1 / (np.sqrt(cache_b1) + eps)

            if j % 10 == 0:
                pY_test, _ = forward(Xtest, W1, b1, W2, b2)
                c = cost(pY_test, Ytest_ind)
                costs_RMS.append(c)
                print("Cost at iteration t=%d, j=%d: %.6f" % (t, j, c))
                e = error_rate(pY_test, Ytest)
                print("Error rate:", e)

    plt.plot(costs_batch, label="batch")
    plt.plot(costs_RMS, label="rms")
    plt.legend()
    plt.show()
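
# Note on eps placement: the first script above computes np.sqrt(cache + eps)
# while this one computes np.sqrt(cache) + eps. Both conventions appear in
# practice and behave nearly identically for tiny eps; all that matters is
# keeping the denominator strictly positive.
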
def main():
    # load the data (Keras MNIST loader):
    (Xtrain, Ytrain), (Xtest, Ytest) = mnist.load_data()
    # print(Xtrain.shape)
    N, d, _ = Xtrain.shape
    D = d * d
    Ntest = len(Xtest)

    # normalize the data:
    Xtrain = Xtrain / 255.0
    Xtest = Xtest / 255.0

    # display a random digit:
    # n = np.random.choice(N)
    # plt.imshow(Xtrain[n], cmap='gray')
    # plt.title(str(Ytrain[n]))
    # plt.show()

    # reshape the data into N x D matrices:
    Xtrain = Xtrain.reshape(N, D)
    Xtest = Xtest.reshape(Ntest, D)
    # print('Xtrain.max():', Xtrain.max())
    # print('Xtrain.shape:', Xtrain.shape)

    Ytrain_ind = y2indicator(Ytrain)
    Ytest_ind = y2indicator(Ytest)

    # define hyperparameters:
    epochs = 30
    print_period = 10
    lr = 0.00004
    reg = 0.01
    batch_sz = 500
    n_batches = N // batch_sz
    M = 300  # the hidden layer size
    K = len(set(Ytrain))

    # randomly initialize the weights:
    W1_init = np.random.randn(D, M) / np.sqrt(D)
    b1_init = np.zeros(M)
    W2_init = np.random.randn(M, K) / np.sqrt(M)
    b2_init = np.zeros(K)

    # 1. mini-batch SGD:
    losses_batch = []
    errors_batch = []
    W1 = W1_init.copy()
    b1 = b1_init.copy()
    W2 = W2_init.copy()
    b2 = b2_init.copy()

    print('\nmini-batch SGD')
    t0 = datetime.now()
    for i in range(epochs):
        Xtrain, Ytrain_ind = shuffle(Xtrain, Ytrain_ind)
        for j in range(n_batches):
            Xbatch = Xtrain[j * batch_sz:(j + 1) * batch_sz]
            Ybatch = Ytrain_ind[j * batch_sz:(j + 1) * batch_sz]
            pYbatch, Z = forward(Xbatch, W1, b1, W2, b2)

            # update the params:
            W2 -= lr * (derivative_W2(Z, Ybatch, pYbatch) + reg * W2)
            b2 -= lr * (derivative_b2(Ybatch, pYbatch) + reg * b2)
            W1 -= lr * (derivative_W1(Xbatch, Z, Ybatch, pYbatch, W2) + reg * W1)
            b1 -= lr * (derivative_b1(Z, Ybatch, pYbatch, W2) + reg * b1)

            if j % print_period == 0:
                pY, _ = forward(Xtest, W1, b1, W2, b2)
                l = cross_entropy(pY, Ytest)
                losses_batch.append(l)
                e = error_rate(pY, Ytest)
                errors_batch.append(e)
                sys.stdout.write('epoch: %d, batch: %d, cost: %.6f, error_rate: %.4f\r' % (i, j, l, e))
                # print('\nepoch: %d, batch: %d, cost: %6f' % (i, j, l))
                # print('error_rate:', e)
                sys.stdout.flush()

    pY, _ = forward(Xtest, W1, b1, W2, b2)
    print('elapsed time:', datetime.now() - t0, 'final error rate:', error_rate(pY, Ytest), ' ' * 20)

    # 2. mini-batch SGD with momentum - version 1:
    losses_momentum1 = []
    errors_momentum1 = []
    W1 = W1_init.copy()
    b1 = b1_init.copy()
    W2 = W2_init.copy()
    b2 = b2_init.copy()
    mu = 0.9  # momentum term

    # initial values for the 'velocities':
    dW2 = 0
    db2 = 0
    dW1 = 0
    db1 = 0

    print('\nmini-batch SGD with momentum - version 1')
    t0 = datetime.now()
    for i in range(epochs):
        Xtrain, Ytrain_ind = shuffle(Xtrain, Ytrain_ind)
        for j in range(n_batches):
            Xbatch = Xtrain[j * batch_sz:(j + 1) * batch_sz]
            Ybatch = Ytrain_ind[j * batch_sz:(j + 1) * batch_sz]
            pYbatch, Z = forward(Xbatch, W1, b1, W2, b2)

            # calculate the gradients:
            gW2 = derivative_W2(Z, Ybatch, pYbatch) + reg * W2
            gb2 = derivative_b2(Ybatch, pYbatch) + reg * b2
            gW1 = derivative_W1(Xbatch, Z, Ybatch, pYbatch, W2) + reg * W1
            gb1 = derivative_b1(Z, Ybatch, pYbatch, W2) + reg * b1

            # update the 'velocities':
            dW2 = mu * dW2 - lr * gW2
            db2 = mu * db2 - lr * gb2
            dW1 = mu * dW1 - lr * gW1
            db1 = mu * db1 - lr * gb1

            # update the params:
            W2 += dW2
            b2 += db2
            W1 += dW1
            b1 += db1

            if j % print_period == 0:
                pY, _ = forward(Xtest, W1, b1, W2, b2)
                l = cross_entropy(pY, Ytest)
                losses_momentum1.append(l)
                e = error_rate(pY, Ytest)
                errors_momentum1.append(e)
                sys.stdout.write('epoch: %d, batch: %d, cost: %.6f, error_rate: %.4f\r' % (i, j, l, e))
                sys.stdout.flush()

    pY, _ = forward(Xtest, W1, b1, W2, b2)
    print('elapsed time:', datetime.now() - t0, 'final error rate:', error_rate(pY, Ytest), ' ' * 20)

    '''
    # 3. mini-batch SGD with momentum - version 2:
    losses_momentum2 = []
    errors_momentum2 = []
    W1 = W1_init.copy()
    b1 = b1_init.copy()
    W2 = W2_init.copy()
    b2 = b2_init.copy()
    mu = 0.9  # momentum term

    # initial values for the 'velocities':
    dW2 = 0
    db2 = 0
    dW1 = 0
    db1 = 0
    # lr = 0.0004

    print('\nmini-batch SGD with momentum - version 2')
    t0 = datetime.now()
    for i in range(epochs):
        Xtrain, Ytrain_ind = shuffle(Xtrain, Ytrain_ind)
        for j in range(n_batches):
            Xbatch = Xtrain[j * batch_sz:(j + 1) * batch_sz]
            Ybatch = Ytrain_ind[j * batch_sz:(j + 1) * batch_sz]
            pYbatch, Z = forward(Xbatch, W1, b1, W2, b2)

            # calculate the gradients:
            gW2 = derivative_W2(Z, Ybatch, pYbatch) + reg * W2
            gb2 = derivative_b2(Ybatch, pYbatch) + reg * b2
            gW1 = derivative_W1(Xbatch, Z, Ybatch, pYbatch, W2) + reg * W1
            gb1 = derivative_b1(Z, Ybatch, pYbatch, W2) + reg * b1

            # update the 'velocities':
            dW2 = mu * dW2 + (1 - mu) * gW2
            db2 = mu * db2 + (1 - mu) * gb2
            dW1 = mu * dW1 + (1 - mu) * gW1
            db1 = mu * db1 + (1 - mu) * gb1
            # alternative: update the 'velocities' without the (1 - mu) scaling:
            # dW2 = mu*dW2 + gW2
            # db2 = mu*db2 + gb2
            # dW1 = mu*dW1 + gW1
            # db1 = mu*db1 + gb1

            # update the params:
            W2 -= lr * dW2
            b2 -= lr * db2
            W1 -= lr * dW1
            b1 -= lr * db1

            if j % print_period == 0:
                pY, _ = forward(Xtest, W1, b1, W2, b2)
                l = cross_entropy(pY, Ytest)
                losses_momentum2.append(l)
                e = error_rate(pY, Ytest)
                errors_momentum2.append(e)
                sys.stdout.write('epoch: %d, batch: %d, cost: %.6f, error_rate: %.4f\r' % (i, j, l, e))
                sys.stdout.flush()

    pY, _ = forward(Xtest, W1, b1, W2, b2)
    print('elapsed time:', datetime.now() - t0, 'final error rate:', error_rate(pY, Ytest), ' ' * 20)
    # best result: epochs = 25, final_error = 0.0179
    '''

    # 4. mini-batch SGD with Nesterov momentum:
    losses_nesterov_momentum = []
    errors_nesterov_momentum = []
    W1 = W1_init.copy()
    b1 = b1_init.copy()
    W2 = W2_init.copy()
    b2 = b2_init.copy()
    mu = 0.9  # momentum term

    # initial values for the 'velocities':
    dW2 = 0
    db2 = 0
    dW1 = 0
    db1 = 0

    print('\nmini-batch SGD with Nesterov momentum')
    t0 = datetime.now()
    for i in range(epochs):
        Xtrain, Ytrain_ind = shuffle(Xtrain, Ytrain_ind)
        for j in range(n_batches):
            Xbatch = Xtrain[j * batch_sz:(j + 1) * batch_sz]
            Ybatch = Ytrain_ind[j * batch_sz:(j + 1) * batch_sz]
            pYbatch, Z = forward(Xbatch, W1, b1, W2, b2)

            # calculate the gradients:
            gW2 = derivative_W2(Z, Ybatch, pYbatch) + reg * W2
            gb2 = derivative_b2(Ybatch, pYbatch) + reg * b2
            gW1 = derivative_W1(Xbatch, Z, Ybatch, pYbatch, W2) + reg * W1
            gb1 = derivative_b1(Z, Ybatch, pYbatch, W2) + reg * b1

            # update the 'velocities':
            dW2 = mu * dW2 - lr * gW2
            db2 = mu * db2 - lr * gb2
            dW1 = mu * dW1 - lr * gW1
            db1 = mu * db1 - lr * gb1

            # update the params (the Nesterov lookahead is folded into the
            # parameter step; see the aside after this script):
            W2 += mu * dW2 - lr * gW2
            b2 += mu * db2 - lr * gb2
            W1 += mu * dW1 - lr * gW1
            b1 += mu * db1 - lr * gb1

            if j % print_period == 0:
                pY, _ = forward(Xtest, W1, b1, W2, b2)
                l = cross_entropy(pY, Ytest)
                losses_nesterov_momentum.append(l)
                e = error_rate(pY, Ytest)
                errors_nesterov_momentum.append(e)
                sys.stdout.write('epoch: %d, batch: %d, cost: %.6f, error_rate: %.4f\r' % (i, j, l, e))
                sys.stdout.flush()

    pY, _ = forward(Xtest, W1, b1, W2, b2)
    print('elapsed time:', datetime.now() - t0, 'final error rate:', error_rate(pY, Ytest), ' ' * 20)

    # plot the losses:
    plt.plot(losses_batch, label='mini-batch SGD')
    plt.plot(losses_momentum1, label='+ momentum')
    plt.plot(losses_nesterov_momentum, label='+ Nesterov momentum')
    plt.xlabel('iterations')
    plt.ylabel('loss')
    plt.legend()
    plt.show()
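
# --- aside: the classical Nesterov formulation --------------------------------
# The Nesterov update above reuses the gradient at the current position and
# folds the lookahead into the parameter step (v = mu*v - lr*g, then
# p += mu*v - lr*g). The textbook form instead evaluates the gradient at the
# lookahead point; a sketch assuming a `grad(p)` callable (illustrative only):

def nesterov_step(p, v, grad, lr, mu=0.9):
    g_ahead = grad(p + mu * v)   # gradient at the lookahead position
    v = mu * v - lr * g_ahead
    return p + v, v
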
def main():
    Xtrain, Xtest, Ytrain, Ytest = get_transformed_data()
    Ytrain_ind = y2indicator(Ytrain)
    Ytest_ind = y2indicator(Ytest)

    N, D = Xtrain.shape
    M = 300
    K = len(set(Ytrain))

    W1 = np.random.randn(D, M) / np.sqrt(D)
    b1 = np.zeros(M)
    W2 = np.random.randn(M, K) / np.sqrt(M)
    b2 = np.zeros(K)

    # save initial weights:
    W1_0 = W1.copy()
    b1_0 = b1.copy()
    W2_0 = W2.copy()
    b2_0 = b2.copy()

    lr = 0.001
    reg = 0.01
    batch_sz = 500
    n_batches = N // batch_sz
    epochs = 10

    beta1 = 0.9
    beta2 = 0.999
    eps = 1e-8

    # 1st moment:
    mW2 = 0
    mb2 = 0
    mW1 = 0
    mb1 = 0

    # 2nd moment:
    vW1 = 0
    vb1 = 0
    vW2 = 0
    vb2 = 0

    # 1. Adam:
    costs_adam = []
    t = 1
    for i in range(epochs):
        tmpX, tmpY = shuffle(Xtrain, Ytrain_ind)
        for j in range(n_batches):
            x = tmpX[j * batch_sz:(j * batch_sz + batch_sz), :]
            y = tmpY[j * batch_sz:(j * batch_sz + batch_sz), :]
            pY, Z = forward(x, W1, b1, W2, b2)

            # gradients:
            gW2 = derivative_W2(Z, pY, y) + reg * W2
            gb2 = derivative_b2(pY, y) + reg * b2
            gW1 = derivative_W1(x, W2, Z, pY, y) + reg * W1
            gb1 = derivative_b1(W2, Z, pY, y) + reg * b1

            # new m:
            mW2 = beta1 * mW2 + (1 - beta1) * gW2
            mb2 = beta1 * mb2 + (1 - beta1) * gb2
            mW1 = beta1 * mW1 + (1 - beta1) * gW1
            mb1 = beta1 * mb1 + (1 - beta1) * gb1

            # new v:
            vW2 = beta2 * vW2 + (1 - beta2) * gW2 * gW2
            vb2 = beta2 * vb2 + (1 - beta2) * gb2 * gb2
            vW1 = beta2 * vW1 + (1 - beta2) * gW1 * gW1
            vb1 = beta2 * vb1 + (1 - beta2) * gb1 * gb1

            # bias correction:
            correction1 = 1 - beta1**t
            hat_mW2 = mW2 / correction1
            hat_mb2 = mb2 / correction1
            hat_mW1 = mW1 / correction1
            hat_mb1 = mb1 / correction1

            correction2 = 1 - beta2**t
            hat_vW2 = vW2 / correction2
            hat_vb2 = vb2 / correction2
            hat_vW1 = vW1 / correction2
            hat_vb1 = vb1 / correction2

            # update t:
            t += 1

            # apply updates to the params:
            W2 -= lr * hat_mW2 / np.sqrt(hat_vW2 + eps)
            b2 -= lr * hat_mb2 / np.sqrt(hat_vb2 + eps)
            W1 -= lr * hat_mW1 / np.sqrt(hat_vW1 + eps)
            b1 -= lr * hat_mb1 / np.sqrt(hat_vb1 + eps)

            if j % 10 == 0:
                pY_test, _ = forward(Xtest, W1, b1, W2, b2)
                c = cost(pY_test, Ytest_ind)
                costs_adam.append(c)
                print("Cost at iteration i=%d, j=%d: %.6f" % (i, j, c))
                e = error_rate(pY_test, Ytest)
                print("Error rate:", e)
    print("\n")

    # 2. RMSProp with momentum:
    W1 = W1_0.copy()
    b1 = b1_0.copy()
    W2 = W2_0.copy()
    b2 = b2_0.copy()

    # RMSProp cache:
    cache_W2 = 1
    cache_b2 = 1
    cache_W1 = 1
    cache_b1 = 1
    decay_rate = 0.999

    # momentum:
    mu = 0.9
    dW2 = 0
    db2 = 0
    dW1 = 0
    db1 = 0

    costs_RMS = []
    for i in range(epochs):
        tmpX, tmpY = shuffle(Xtrain, Ytrain_ind)
        for j in range(n_batches):
            x = tmpX[j * batch_sz:(j * batch_sz + batch_sz), :]
            y = tmpY[j * batch_sz:(j * batch_sz + batch_sz), :]
            pY, Z = forward(x, W1, b1, W2, b2)

            # updates:
            gW2 = derivative_W2(Z, pY, y) + reg * W2
            cache_W2 = decay_rate * cache_W2 + (1 - decay_rate) * gW2 * gW2
            dW2 = mu * dW2 + (1 - mu) * lr * gW2 / (np.sqrt(cache_W2) + eps)
            W2 -= dW2

            gb2 = derivative_b2(pY, y) + reg * b2
            cache_b2 = decay_rate * cache_b2 + (1 - decay_rate) * gb2 * gb2
            db2 = mu * db2 + (1 - mu) * lr * gb2 / (np.sqrt(cache_b2) + eps)
            b2 -= db2

            gW1 = derivative_W1(x, W2, Z, pY, y) + reg * W1
            cache_W1 = decay_rate * cache_W1 + (1 - decay_rate) * gW1 * gW1
            dW1 = mu * dW1 + (1 - mu) * lr * gW1 / (np.sqrt(cache_W1) + eps)
            W1 -= dW1

            gb1 = derivative_b1(W2, Z, pY, y) + reg * b1
            cache_b1 = decay_rate * cache_b1 + (1 - decay_rate) * gb1 * gb1
            db1 = mu * db1 + (1 - mu) * lr * gb1 / (np.sqrt(cache_b1) + eps)
            b1 -= db1

            if j % 10 == 0:
                pY_test, _ = forward(Xtest, W1, b1, W2, b2)
                c = cost(pY_test, Ytest_ind)
                costs_RMS.append(c)
                print("Cost at iteration i=%d, j=%d: %.6f" % (i, j, c))
                e = error_rate(pY_test, Ytest)
                print("Error rate:", e)

    plt.plot(costs_adam, label='adam')
    plt.plot(costs_RMS, label='rmsprop')
    plt.legend()
    plt.show()
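
# Note on the momentum convention used here: d = mu*d + (1 - mu)*step is an
# exponentially weighted average of the adaptive steps, so compared with the
# classical d = mu*d + step it scales the effective learning rate by (1 - mu),
# i.e. by a factor of 10 for mu = 0.9. Compensate with a larger lr if you
# switch between the two forms.
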
def main():
    # compare 3 scenarios:
    # 1. batch SGD
    # 2. batch SGD with momentum
    # 3. batch SGD with Nesterov momentum
    Xtrain, Xtest, Ytrain, Ytest = get_transformed_data()
    Ytrain_ind = y2indicator(Ytrain)
    Ytest_ind = y2indicator(Ytest)

    N, D = Xtrain.shape
    M = 300
    K = len(set(Ytrain))

    W1 = np.random.randn(D, M) / np.sqrt(D)
    b1 = np.zeros(M)
    W2 = np.random.randn(M, K) / np.sqrt(M)
    b2 = np.zeros(K)

    # save initial weights:
    W1_0 = W1.copy()
    b1_0 = b1.copy()
    W2_0 = W2.copy()
    b2_0 = b2.copy()

    lr = 0.00004
    reg = 0.01
    batch_sz = 500
    n_batches = N // batch_sz
    epochs = 20

    # 1. batch SGD:
    costs_batch = []
    for t in range(epochs):
        tmpX, tmpY = shuffle(Xtrain, Ytrain_ind)
        for j in range(n_batches):
            x = tmpX[j * batch_sz:(j * batch_sz + batch_sz), :]
            y = tmpY[j * batch_sz:(j * batch_sz + batch_sz), :]
            pY, Z = forward(x, W1, b1, W2, b2)

            W2 -= lr * (derivative_W2(Z, pY, y) + reg * W2)
            b2 -= lr * (derivative_b2(pY, y) + reg * b2)
            W1 -= lr * (derivative_W1(x, W2, Z, pY, y) + reg * W1)
            b1 -= lr * (derivative_b1(W2, Z, pY, y) + reg * b1)

            if j % 50 == 0:
                pY_test, _ = forward(Xtest, W1, b1, W2, b2)
                c = cost(pY_test, Ytest_ind)
                costs_batch.append(c)
                print("Cost at iteration t=%d, j=%d: %.6f" % (t, j, c))
                e = error_rate(pY_test, Ytest)
                print("Error rate:", e)
    print("\n")

    # 2. batch SGD with momentum:
    W1 = W1_0.copy()
    b1 = b1_0.copy()
    W2 = W2_0.copy()
    b2 = b2_0.copy()
    mu = 0.9
    dW2 = 0
    db2 = 0
    dW1 = 0
    db1 = 0

    costs_batch_momentum = []
    for t in range(epochs):
        tmpX, tmpY = shuffle(Xtrain, Ytrain_ind)
        for j in range(n_batches):
            x = tmpX[j * batch_sz:(j * batch_sz + batch_sz), :]
            y = tmpY[j * batch_sz:(j * batch_sz + batch_sz), :]
            pY, Z = forward(x, W1, b1, W2, b2)

            # gradients:
            gW2 = derivative_W2(Z, pY, y) + reg * W2
            gb2 = derivative_b2(pY, y) + reg * b2
            gW1 = derivative_W1(x, W2, Z, pY, y) + reg * W1
            gb1 = derivative_b1(W2, Z, pY, y) + reg * b1

            # update velocities:
            dW2 = mu * dW2 - lr * gW2
            db2 = mu * db2 - lr * gb2
            dW1 = mu * dW1 - lr * gW1
            db1 = mu * db1 - lr * gb1

            # updates:
            W2 += dW2
            b2 += db2
            W1 += dW1
            b1 += db1

            if j % 50 == 0:
                pY_test, _ = forward(Xtest, W1, b1, W2, b2)
                c = cost(pY_test, Ytest_ind)
                costs_batch_momentum.append(c)
                print("Cost at iteration t=%d, j=%d: %.6f" % (t, j, c))
                e = error_rate(pY_test, Ytest)
                print("Error rate:", e)
    print("\n")

    # 3. batch SGD with Nesterov momentum:
    W1 = W1_0.copy()
    b1 = b1_0.copy()
    W2 = W2_0.copy()
    b2 = b2_0.copy()
    mu = 0.9
    vW2 = 0
    vb2 = 0
    vW1 = 0
    vb1 = 0

    costs_batch_momentum_nesterov = []
    for t in range(epochs):
        tmpX, tmpY = shuffle(Xtrain, Ytrain_ind)
        for j in range(n_batches):
            x = tmpX[j * batch_sz:(j * batch_sz + batch_sz), :]
            y = tmpY[j * batch_sz:(j * batch_sz + batch_sz), :]
            pY, Z = forward(x, W1, b1, W2, b2)

            # gradients:
            gW2 = derivative_W2(Z, pY, y) + reg * W2
            gb2 = derivative_b2(pY, y) + reg * b2
            gW1 = derivative_W1(x, W2, Z, pY, y) + reg * W1
            gb1 = derivative_b1(W2, Z, pY, y) + reg * b1

            # velocity update:
            vW2 = mu * vW2 - lr * gW2
            vb2 = mu * vb2 - lr * gb2
            vW1 = mu * vW1 - lr * gW1
            vb1 = mu * vb1 - lr * gb1

            # param update (lookahead folded into the step):
            W2 += mu * vW2 - lr * gW2
            b2 += mu * vb2 - lr * gb2
            W1 += mu * vW1 - lr * gW1
            b1 += mu * vb1 - lr * gb1

            if j % 50 == 0:
                pY_test, _ = forward(Xtest, W1, b1, W2, b2)
                c = cost(pY_test, Ytest_ind)
                costs_batch_momentum_nesterov.append(c)
                print("Cost at iteration t=%d, j=%d: %.6f" % (t, j, c))
                e = error_rate(pY_test, Ytest)
                print("Error rate:", e)

    plt.plot(costs_batch, label="batch")
    plt.plot(costs_batch_momentum, label="momentum")
    plt.plot(costs_batch_momentum_nesterov, label="nesterov")
    plt.legend()
    plt.show()
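
# The main() functions above appear to come from separate scripts collected
# together; each original file would end with the standard entry point so it
# can be run directly:
if __name__ == '__main__':
    main()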