def main(): # compare 3 scenarios: # 1. batch SGD # 2. batch SGD with momentum # 3. batch SGD with Nesterov momentum max_iter = 20 # make it 30 for sigmoid print_period = 50 Xtrain, Xtest, Ytrain, Ytest = get_normalized_data() lr = 0.00004 reg = 0.01 Ytrain_ind = y2indicator(Ytrain) Ytest_ind = y2indicator(Ytest) N, D = Xtrain.shape batch_sz = 500 n_batches = N // batch_sz M = 300 K = 10 W1 = np.random.randn(D, M) / np.sqrt(D) b1 = np.zeros(M) W2 = np.random.randn(M, K) / np.sqrt(M) b2 = np.zeros(K) # save initial weights W1_0 = W1.copy() b1_0 = b1.copy() W2_0 = W2.copy() b2_0 = b2.copy() # 1. batch losses_batch = [] errors_batch = [] for i in range(max_iter): for j in range(n_batches): Xbatch = Xtrain[j * batch_sz:(j * batch_sz + batch_sz), ] Ybatch = Ytrain_ind[j * batch_sz:(j * batch_sz + batch_sz), ] pYbatch, Z = forward(Xbatch, W1, b1, W2, b2) # print "first batch cost:", cost(pYbatch, Ybatch) # updates W2 -= lr * (derivative_w2(Z, Ybatch, pYbatch) + reg * W2) b2 -= lr * (derivative_b2(Ybatch, pYbatch) + reg * b2) W1 -= lr * (derivative_w1(Xbatch, Z, Ybatch, pYbatch, W2) + reg * W1) b1 -= lr * (derivative_b1(Z, Ybatch, pYbatch, W2) + reg * b1) if j % print_period == 0: pY, _ = forward(Xtest, W1, b1, W2, b2) l = cost(pY, Ytest_ind) losses_batch.append(l) print("Cost at iteration i=%d, j=%d: %.6f" % (i, j, l)) e = error_rate(pY, Ytest) errors_batch.append(e) print("Error rate:", e) pY, _ = forward(Xtest, W1, b1, W2, b2) print("Final error rate:", error_rate(pY, Ytest)) # 2. batch with momentum W1 = W1_0.copy() b1 = b1_0.copy() W2 = W2_0.copy() b2 = b2_0.copy() losses_momentum = [] errors_momentum = [] mu = 0.9 dW2 = 0 db2 = 0 dW1 = 0 db1 = 0 for i in range(max_iter): for j in range(n_batches): Xbatch = Xtrain[j * batch_sz:(j * batch_sz + batch_sz), ] Ybatch = Ytrain_ind[j * batch_sz:(j * batch_sz + batch_sz), ] pYbatch, Z = forward(Xbatch, W1, b1, W2, b2) # gradients gW2 = derivative_w2(Z, Ybatch, pYbatch) + reg * W2 gb2 = derivative_b2(Ybatch, pYbatch) + reg * b2 gW1 = derivative_w1(Xbatch, Z, Ybatch, pYbatch, W2) + reg * W1 gb1 = derivative_b1(Z, Ybatch, pYbatch, W2) + reg * b1 # update velocities dW2 = mu * dW2 - lr * gW2 db2 = mu * db2 - lr * gb2 dW1 = mu * dW1 - lr * gW1 db1 = mu * db1 - lr * gb1 # updates W2 += dW2 b2 += db2 W1 += dW1 b1 += db1 if j % print_period == 0: pY, _ = forward(Xtest, W1, b1, W2, b2) l = cost(pY, Ytest_ind) losses_momentum.append(l) print("Cost at iteration i=%d, j=%d: %.6f" % (i, j, l)) e = error_rate(pY, Ytest) errors_momentum.append(e) print("Error rate:", e) pY, _ = forward(Xtest, W1, b1, W2, b2) print("Final error rate:", error_rate(pY, Ytest)) # 3. 
batch with Nesterov momentum W1 = W1_0.copy() b1 = b1_0.copy() W2 = W2_0.copy() b2 = b2_0.copy() losses_nesterov = [] errors_nesterov = [] mu = 0.9 vW2 = 0 vb2 = 0 vW1 = 0 vb1 = 0 for i in range(max_iter): for j in range(n_batches): Xbatch = Xtrain[j * batch_sz:(j * batch_sz + batch_sz), ] Ybatch = Ytrain_ind[j * batch_sz:(j * batch_sz + batch_sz), ] pYbatch, Z = forward(Xbatch, W1, b1, W2, b2) # updates gW2 = derivative_w2(Z, Ybatch, pYbatch) + reg * W2 gb2 = derivative_b2(Ybatch, pYbatch) + reg * b2 gW1 = derivative_w1(Xbatch, Z, Ybatch, pYbatch, W2) + reg * W1 gb1 = derivative_b1(Z, Ybatch, pYbatch, W2) + reg * b1 # v update vW2 = mu * vW2 - lr * gW2 vb2 = mu * vb2 - lr * gb2 vW1 = mu * vW1 - lr * gW1 vb1 = mu * vb1 - lr * gb1 # param update W2 += mu * vW2 - lr * gW2 b2 += mu * vb2 - lr * gb2 W1 += mu * vW1 - lr * gW1 b1 += mu * vb1 - lr * gb1 if j % print_period == 0: pY, _ = forward(Xtest, W1, b1, W2, b2) l = cost(pY, Ytest_ind) losses_nesterov.append(l) print("Cost at iteration i=%d, j=%d: %.6f" % (i, j, l)) e = error_rate(pY, Ytest) errors_nesterov.append(e) print("Error rate:", e) pY, _ = forward(Xtest, W1, b1, W2, b2) print("Final error rate:", error_rate(pY, Ytest)) plt.plot(losses_batch, label="batch") plt.plot(losses_momentum, label="momentum") plt.plot(losses_nesterov, label="nesterov") plt.legend() plt.show()
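# Side note: the Nesterov blocks in this collection write the update in slightly different
# ways. Below is a minimal, self-contained sketch of two common forms, on a toy 1-D quadratic;
# the name nesterov_demo, the toy objective, and the hyperparameter values are illustrative
# assumptions only, not taken from the scripts above.
import numpy as np

def nesterov_demo(lr=0.1, mu=0.9, steps=50):
    grad = lambda w: w  # gradient of f(w) = 0.5 * w**2

    # form 1: velocity update, then a lookahead-style parameter step (as in the script above)
    w1, v1 = 5.0, 0.0
    for _ in range(steps):
        g = grad(w1)
        v1 = mu * v1 - lr * g
        w1 += mu * v1 - lr * g

    # form 2: evaluate the gradient at the lookahead point w + mu*v, then step by the velocity
    w2, v2 = 5.0, 0.0
    for _ in range(steps):
        g = grad(w2 + mu * v2)
        v2 = mu * v2 - lr * g
        w2 += v2

    return w1, w2

print(nesterov_demo())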
def main():
    max_iter = 10
    print_period = 40

    Xtrain, Xtest, Ytrain, Ytest = get_normalized_data()
    Ytrain_ind = y2indicator(Ytrain)  # targets of train data
    Ytest_ind = y2indicator(Ytest)  # targets of test data

    N, D = Xtrain.shape
    M = 300
    K = 10
    np.random.seed(123)
    W1 = np.random.randn(D, M) / np.sqrt(D)
    b1 = np.zeros(M)
    W2 = np.random.randn(M, K) / np.sqrt(M)
    b2 = np.zeros(K)

    batch_sz = 500
    n_batches = N // batch_sz  # 82

    # save initial weights
    W1_0 = W1.copy()
    b1_0 = b1.copy()
    W2_0 = W2.copy()
    b2_0 = b2.copy()

    lr = 0.001
    reg = 0.01

    # 1. RMSProp + momentum
    decay_rate = 0.999
    eps = 1e-8
    mu = 0.9

    # RMSProp cache
    cache_W2 = 1
    cache_b2 = 1
    cache_W1 = 1
    cache_b1 = 1

    # momentum
    dW2 = 0
    db2 = 0
    dW1 = 0
    db1 = 0

    losses_rms = []
    errors_rms = []
    for i in range(max_iter):
        # Xtrain, Ytrain_ind = shuffle(Xtrain, Ytrain_ind)
        for j in range(n_batches):
            Xbatch = Xtrain[j*batch_sz:(j*batch_sz + batch_sz),]
            Ybatch = Ytrain_ind[j*batch_sz:(j*batch_sz + batch_sz),]  # targets of each batch
            pYbatch, Z = forward(Xbatch, W1, b1, W2, b2)

            # updates
            # Note: the momentum term here is not the classic formulation; it mimics Adam's
            # first moment, so this is essentially Adam without bias correction.
            gW2 = derivative_w2(Z, Ybatch, pYbatch) + reg*W2
            cache_W2 = decay_rate*cache_W2 + (1 - decay_rate)*gW2*gW2
            dW2 = mu*dW2 + (1 - mu) * lr * gW2 / (np.sqrt(cache_W2) + eps)
            W2 -= dW2

            gb2 = derivative_b2(Ybatch, pYbatch) + reg*b2
            cache_b2 = decay_rate*cache_b2 + (1 - decay_rate)*gb2*gb2
            db2 = mu*db2 + (1 - mu) * lr * gb2 / (np.sqrt(cache_b2) + eps)
            b2 -= db2

            gW1 = derivative_w1(Xbatch, Z, Ybatch, pYbatch, W2) + reg*W1
            cache_W1 = decay_rate*cache_W1 + (1 - decay_rate)*gW1*gW1
            dW1 = mu*dW1 + (1 - mu) * lr * gW1 / (np.sqrt(cache_W1) + eps)
            W1 -= dW1

            gb1 = derivative_b1(Z, Ybatch, pYbatch, W2) + reg*b1
            cache_b1 = decay_rate*cache_b1 + (1 - decay_rate)*gb1*gb1
            db1 = mu*db1 + (1 - mu) * lr * gb1 / (np.sqrt(cache_b1) + eps)
            b1 -= db1

            if j % print_period == 0:
                pY, _ = forward(Xtest, W1, b1, W2, b2)
                l = cost(pY, Ytest_ind)
                losses_rms.append(l)
                print("Cost at iteration i=%d, j=%d: %.6f" % (i, j, l))

                e = error_rate(pY, Ytest)
                errors_rms.append(e)
                print("Error rate:", e)

    pY, _ = forward(Xtest, W1, b1, W2, b2)
    print("Final error rate:", error_rate(pY, Ytest))

    # 2. Adam
    W1 = W1_0.copy()
    b1 = b1_0.copy()
    W2 = W2_0.copy()
    b2 = b2_0.copy()

    beta1 = 0.9    # try 0.999: tweaking these two values changes the results dramatically,
    beta2 = 0.999  # try 0.99:  in other words, hyperparameter tuning really matters
    eps = 1e-8
    t = 1

    m_W2 = 0
    m_b2 = 0
    m_W1 = 0
    m_b1 = 0
    v_W2 = 0
    v_b2 = 0
    v_W1 = 0
    v_b1 = 0

    losses_adam = []
    errors_adam = []
    for i in range(max_iter):
        # Xtrain, Ytrain_ind = shuffle(Xtrain, Ytrain_ind)
        for j in range(n_batches):
            Xbatch = Xtrain[j*batch_sz:(j*batch_sz + batch_sz),]
            Ybatch = Ytrain_ind[j*batch_sz:(j*batch_sz + batch_sz),]  # targets of each batch
            pYbatch, Z = forward(Xbatch, W1, b1, W2, b2)

            # gradients
            gW2 = derivative_w2(Z, Ybatch, pYbatch) + reg*W2
            gb2 = derivative_b2(Ybatch, pYbatch) + reg*b2
            gW1 = derivative_w1(Xbatch, Z, Ybatch, pYbatch, W2) + reg*W1
            gb1 = derivative_b1(Z, Ybatch, pYbatch, W2) + reg*b1

            # new m
            m_W2 = beta1*m_W2 + (1 - beta1)*gW2
            m_b2 = beta1*m_b2 + (1 - beta1)*gb2
            m_W1 = beta1*m_W1 + (1 - beta1)*gW1
            m_b1 = beta1*m_b1 + (1 - beta1)*gb1

            # new v
            v_W2 = beta2*v_W2 + (1 - beta2)*gW2*gW2
            v_b2 = beta2*v_b2 + (1 - beta2)*gb2*gb2
            v_W1 = beta2*v_W1 + (1 - beta2)*gW1*gW1
            v_b1 = beta2*v_b1 + (1 - beta2)*gb1*gb1

            # m_hat from bias correction
            m_W2_hat = m_W2 / (1 - beta1**t)
            m_b2_hat = m_b2 / (1 - beta1**t)
            m_W1_hat = m_W1 / (1 - beta1**t)
            m_b1_hat = m_b1 / (1 - beta1**t)

            # v_hat from bias correction
            v_W2_hat = v_W2 / (1 - beta2**t)
            v_b2_hat = v_b2 / (1 - beta2**t)
            v_W1_hat = v_W1 / (1 - beta2**t)
            v_b1_hat = v_b1 / (1 - beta2**t)

            # update
            W2 -= lr * m_W2_hat / np.sqrt(v_W2_hat + eps)
            b2 -= lr * m_b2_hat / np.sqrt(v_b2_hat + eps)
            W1 -= lr * m_W1_hat / np.sqrt(v_W1_hat + eps)
            b1 -= lr * m_b1_hat / np.sqrt(v_b1_hat + eps)
            t += 1

            if j % print_period == 0:
                pY, _ = forward(Xtest, W1, b1, W2, b2)
                l = cost(pY, Ytest_ind)
                losses_adam.append(l)
                print("Cost at iteration i=%d, j=%d: %.6f" % (i, j, l))

                e = error_rate(pY, Ytest)
                errors_adam.append(e)
                print("Error rate:", e)

    pY, _ = forward(Xtest, W1, b1, W2, b2)
    print("Final error rate:", error_rate(pY, Ytest))

    plt.plot(losses_rms, label='RMSprop + Momentum')
    plt.plot(losses_adam, label='Adam')
    plt.legend()
    plt.show()
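# Side note: the comment in the RMSProp + momentum loop above describes that update as
# essentially Adam without bias correction. A minimal per-parameter sketch of that comparison;
# the function names and the dict-based state are illustrative assumptions, not part of the
# script above.
import numpy as np

def rmsprop_momentum_step(w, g, state, lr=0.001, mu=0.9, decay=0.999, eps=1e-8):
    # cache: exponentially weighted average of g^2; the step itself is smoothed by momentum
    state['cache'] = decay * state['cache'] + (1 - decay) * g * g
    state['d'] = mu * state['d'] + (1 - mu) * lr * g / (np.sqrt(state['cache']) + eps)
    return w - state['d']

def adam_step_no_bias_correction(w, g, state, lr=0.001, beta1=0.9, beta2=0.999, eps=1e-8):
    # Adam with the 1/(1 - beta**t) corrections dropped: same structure, different smoothing
    state['m'] = beta1 * state['m'] + (1 - beta1) * g
    state['v'] = beta2 * state['v'] + (1 - beta2) * g * g
    return w - lr * state['m'] / (np.sqrt(state['v']) + eps)

w = np.array([1.0, -2.0])
g = np.array([0.3, -0.1])
print(rmsprop_momentum_step(w, g, {'cache': 1.0, 'd': 0.0}))
print(adam_step_no_bias_correction(w, g, {'m': 0.0, 'v': 0.0}))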
def main(): # compare 3 scenarios: # 1. batch SGD # 2. batch SGD with momentum # 3. batch SGD with Nesterov momentum max_iter = 20 # make it 30 for sigmoid print_period = 10 X, Y = get_normalized_data() lr = 0.00004 reg = 0.01 Xtrain = X[:-1000,] Ytrain = Y[:-1000] Xtest = X[-1000:,] Ytest = Y[-1000:] Ytrain_ind = y2indicator(Ytrain) Ytest_ind = y2indicator(Ytest) N, D = Xtrain.shape batch_sz = 500 n_batches = N / batch_sz M = 300 K = 10 W1 = np.random.randn(D, M) / 28 b1 = np.zeros(M) W2 = np.random.randn(M, K) / np.sqrt(M) b2 = np.zeros(K) # 1. batch # cost = -16 LL_batch = [] CR_batch = [] for i in xrange(max_iter): for j in xrange(n_batches): Xbatch = Xtrain[j*batch_sz:(j*batch_sz + batch_sz),] Ybatch = Ytrain_ind[j*batch_sz:(j*batch_sz + batch_sz),] pYbatch, Z = forward(Xbatch, W1, b1, W2, b2) # print "first batch cost:", cost(pYbatch, Ybatch) # updates W2 -= lr*(derivative_w2(Z, Ybatch, pYbatch) + reg*W2) b2 -= lr*(derivative_b2(Ybatch, pYbatch) + reg*b2) W1 -= lr*(derivative_w1(Xbatch, Z, Ybatch, pYbatch, W2) + reg*W1) b1 -= lr*(derivative_b1(Z, Ybatch, pYbatch, W2) + reg*b1) if j % print_period == 0: # calculate just for LL pY, _ = forward(Xtest, W1, b1, W2, b2) # print "pY:", pY ll = cost(pY, Ytest_ind) LL_batch.append(ll) print "Cost at iteration i=%d, j=%d: %.6f" % (i, j, ll) err = error_rate(pY, Ytest) CR_batch.append(err) print "Error rate:", err pY, _ = forward(Xtest, W1, b1, W2, b2) print "Final error rate:", error_rate(pY, Ytest) # 2. batch with momentum W1 = np.random.randn(D, M) / 28 b1 = np.zeros(M) W2 = np.random.randn(M, K) / np.sqrt(M) b2 = np.zeros(K) LL_momentum = [] CR_momentum = [] mu = 0.9 dW2 = 0 db2 = 0 dW1 = 0 db1 = 0 for i in xrange(max_iter): for j in xrange(n_batches): Xbatch = Xtrain[j*batch_sz:(j*batch_sz + batch_sz),] Ybatch = Ytrain_ind[j*batch_sz:(j*batch_sz + batch_sz),] pYbatch, Z = forward(Xbatch, W1, b1, W2, b2) # updates dW2 = mu*dW2 - lr*(derivative_w2(Z, Ybatch, pYbatch) + reg*W2) W2 += dW2 db2 = mu*db2 - lr*(derivative_b2(Ybatch, pYbatch) + reg*b2) b2 += db2 dW1 = mu*dW1 - lr*(derivative_w1(Xbatch, Z, Ybatch, pYbatch, W2) + reg*W1) W1 += dW1 db1 = mu*db1 - lr*(derivative_b1(Z, Ybatch, pYbatch, W2) + reg*b1) b1 += db1 if j % print_period == 0: # calculate just for LL pY, _ = forward(Xtest, W1, b1, W2, b2) # print "pY:", pY ll = cost(pY, Ytest_ind) LL_momentum.append(ll) print "Cost at iteration i=%d, j=%d: %.6f" % (i, j, ll) err = error_rate(pY, Ytest) CR_momentum.append(err) print "Error rate:", err pY, _ = forward(Xtest, W1, b1, W2, b2) print "Final error rate:", error_rate(pY, Ytest) # 3. 
batch with Nesterov momentum W1 = np.random.randn(D, M) / 28 b1 = np.zeros(M) W2 = np.random.randn(M, K) / np.sqrt(M) b2 = np.zeros(K) LL_nest = [] CR_nest = [] mu = 0.9 dW2 = 0 db2 = 0 dW1 = 0 db1 = 0 for i in xrange(max_iter): for j in xrange(n_batches): Xbatch = Xtrain[j*batch_sz:(j*batch_sz + batch_sz),] Ybatch = Ytrain_ind[j*batch_sz:(j*batch_sz + batch_sz),] pYbatch, Z = forward(Xbatch, W1, b1, W2, b2) # updates dW2 = mu*mu*dW2 - (1 + mu)*lr*(derivative_w2(Z, Ybatch, pYbatch) + reg*W2) W2 += dW2 db2 = mu*mu*db2 - (1 + mu)*lr*(derivative_b2(Ybatch, pYbatch) + reg*b2) b2 += db2 dW1 = mu*mu*dW1 - (1 + mu)*lr*(derivative_w1(Xbatch, Z, Ybatch, pYbatch, W2) + reg*W1) W1 += dW1 db1 = mu*mu*db1 - (1 + mu)*lr*(derivative_b1(Z, Ybatch, pYbatch, W2) + reg*b1) b1 += db1 if j % print_period == 0: # calculate just for LL pY, _ = forward(Xtest, W1, b1, W2, b2) # print "pY:", pY ll = cost(pY, Ytest_ind) LL_nest.append(ll) print "Cost at iteration i=%d, j=%d: %.6f" % (i, j, ll) err = error_rate(pY, Ytest) CR_nest.append(err) print "Error rate:", err pY, _ = forward(Xtest, W1, b1, W2, b2) print "Final error rate:", error_rate(pY, Ytest) plt.plot(LL_batch, label="batch") plt.plot(LL_momentum, label="momentum") plt.plot(LL_nest, label="nesterov") plt.legend() plt.show()
def main(): max_iter = 20 # make it 30 for sigmoid print_period = 10 X, Y = get_normalized_data() lr = 0.00004 reg = 0.01 Xtrain = X[:-1000, ] Ytrain = Y[:-1000] Xtest = X[-1000:, ] Ytest = Y[-1000:] Ytrain_ind = y2indicator(Ytrain) Ytest_ind = y2indicator(Ytest) N, D = Xtrain.shape batch_sz = 500 n_batches = int(N / batch_sz) M = 300 K = 10 W1 = np.random.randn(D, M) / 28 b1 = np.zeros(M) W2 = np.random.randn(M, K) / np.sqrt(M) b2 = np.zeros(K) # 1. const # cost = -16 LL_batch = [] CR_batch = [] for i in range(max_iter): for j in range(n_batches): Xbatch = Xtrain[j * batch_sz:(j * batch_sz + batch_sz), ] Ybatch = Ytrain_ind[j * batch_sz:(j * batch_sz + batch_sz), ] pYbatch, Z = forward(Xbatch, W1, b1, W2, b2) # print "first batch cost:", cost(pYbatch, Ybatch) # updates W2 -= lr * (derivative_w2(Z, Ybatch, pYbatch) + reg * W2) b2 -= lr * (derivative_b2(Ybatch, pYbatch) + reg * b2) W1 -= lr * (derivative_w1(Xbatch, Z, Ybatch, pYbatch, W2) + reg * W1) b1 -= lr * (derivative_b1(Z, Ybatch, pYbatch, W2) + reg * b1) if j % print_period == 0: # calculate just for LL pY, _ = forward(Xtest, W1, b1, W2, b2) # print "pY:", pY ll = cost(pY, Ytest_ind) LL_batch.append(ll) print("Cost at iteration i=%d, j=%d: %.6f" % (i, j, ll)) err = error_rate(pY, Ytest) CR_batch.append(err) print("Error rate:", err) pY, _ = forward(Xtest, W1, b1, W2, b2) print("Final error rate:", error_rate(pY, Ytest)) # 2. RMSprop W1 = np.random.randn(D, M) / 28 b1 = np.zeros(M) W2 = np.random.randn(M, K) / np.sqrt(M) b2 = np.zeros(K) LL_rms = [] CR_rms = [] lr0 = 0.001 # if you set this too high you'll get NaN! cache_W2 = 0 cache_b2 = 0 cache_W1 = 0 cache_b1 = 0 decay_rate = 0.999 eps = 0.0000000001 for i in range(max_iter): for j in range(n_batches): Xbatch = Xtrain[j * batch_sz:(j * batch_sz + batch_sz), ] Ybatch = Ytrain_ind[j * batch_sz:(j * batch_sz + batch_sz), ] pYbatch, Z = forward(Xbatch, W1, b1, W2, b2) # print "first batch cost:", cost(pYbatch, Ybatch) # updates gW2 = derivative_w2(Z, Ybatch, pYbatch) + reg * W2 cache_W2 = decay_rate * cache_W2 + (1 - decay_rate) * gW2 * gW2 W2 -= lr0 * gW2 / (np.sqrt(cache_W2) + eps) gb2 = derivative_b2(Ybatch, pYbatch) + reg * b2 cache_b2 = decay_rate * cache_b2 + (1 - decay_rate) * gb2 * gb2 b2 -= lr0 * gb2 / (np.sqrt(cache_b2) + eps) gW1 = derivative_w1(Xbatch, Z, Ybatch, pYbatch, W2) + reg * W1 cache_W1 = decay_rate * cache_W1 + (1 - decay_rate) * gW1 * gW1 W1 -= lr0 * gW1 / (np.sqrt(cache_W1) + eps) gb1 = derivative_b1(Z, Ybatch, pYbatch, W2) + reg * b1 cache_b1 = decay_rate * cache_b1 + (1 - decay_rate) * gb1 * gb1 b1 -= lr0 * gb1 / (np.sqrt(cache_b1) + eps) if j % print_period == 0: # calculate just for LL pY, _ = forward(Xtest, W1, b1, W2, b2) # print "pY:", pY ll = cost(pY, Ytest_ind) LL_rms.append(ll) print("Cost at iteration i=%d, j=%d: %.6f" % (i, j, ll)) err = error_rate(pY, Ytest) CR_rms.append(err) print("Error rate:", err) pY, _ = forward(Xtest, W1, b1, W2, b2) print("Final error rate:", error_rate(pY, Ytest)) plt.plot(LL_batch, label='const') plt.plot(LL_rms, label='rms') plt.legend() plt.show()
def main():
    # compare 2 scenarios:
    # 1. batch SGD
    # 2. batch SGD with momentum
    '''
    # steps of training a model
    # 0. get data
    # 1. function --> forward --> OK
    # 2. loss --> cost --> ok
    # 3. solver --> gradient descent
    '''
    max_iter = 20  # make it 30 for sigmoid
    print_period = 50

    Xtrain, Xtest, Ytrain, Ytest = get_normalized_data()
    lr = 0.00004
    reg = 0.01

    Ytrain_ind = y2indicator(Ytrain)  # targets of train data
    Ytest_ind = y2indicator(Ytest)  # targets of test data

    N, D = Xtrain.shape
    batch_sz = 500
    n_batches = N // batch_sz

    M = 300
    K = 10
    np.random.seed(42)
    W1 = np.random.randn(D, M) / np.sqrt(D)
    b1 = np.zeros(M)
    W2 = np.random.randn(M, K) / np.sqrt(M)
    b2 = np.zeros(K)
    '''
    Dividing the initial weights by np.sqrt(D) helps the cost a lot.
    1. Without this division the initial weights are too large, which causes:
       1.1 In forward(), np.exp(-(X.dot(W1) + b1)) inside the sigmoid can overflow right away
           ("overflow encountered"); it only warns and the code keeps running, since the
           sigmoid output stays between 0 and 1.
       1.2 With relu the hidden output can be a huge value, and after expA = np.exp(A) it
           blows up to infinity, usually around the second batch of the first epoch.
       1.3 Conclusion: with activation = relu, the initial weights must be small enough.
    2. Dividing by a large value such as 500 (about 20x np.sqrt(D)): in the momentum
       experiment, the last epoch's cost = 106.49.
    3. Dividing by np.sqrt(D): in the momentum experiment, the last epoch's cost = 119.005.
    4. Dividing by a very large value such as 5000: in the momentum experiment,
       the last epoch's cost = 160.68.
    5. Some references say that in most cases simply dividing by 100 already gives good
       training results.
    '''

    # save initial weights
    W1_0 = W1.copy()
    b1_0 = b1.copy()
    W2_0 = W2.copy()
    b2_0 = b2.copy()

    # 1. batch SGD
    losses_batch = []
    errors_batch = []
    for i in range(max_iter):
        # Xtrain, Ytrain_ind = shuffle(Xtrain, Ytrain_ind)
        for j in range(n_batches):
            Xbatch = Xtrain[j * batch_sz:(j * batch_sz + batch_sz), ]
            Ybatch = Ytrain_ind[j * batch_sz:(j * batch_sz + batch_sz), ]  # targets of each batch
            pYbatch, Z = forward(Xbatch, W1, b1, W2, b2)
            # print "first batch cost:", cost(pYbatch, Ybatch)

            # updates
            W2 -= lr * (derivative_w2(Z, Ybatch, pYbatch) + reg * W2)
            b2 -= lr * (derivative_b2(Ybatch, pYbatch) + reg * b2)
            W1 -= lr * (derivative_w1(Xbatch, Z, Ybatch, pYbatch, W2) + reg * W1)
            b1 -= lr * (derivative_b1(Z, Ybatch, pYbatch, W2) + reg * b1)

            if j % print_period == 0:
                pY, _ = forward(Xtest, W1, b1, W2, b2)
                l = cost(pY, Ytest_ind)
                losses_batch.append(l)
                print("Cost at iteration i=%d, j=%d: %.6f" % (i, j, l))

                e = error_rate(pY, Ytest)
                errors_batch.append(e)
                print("Error rate:", e)

    pY, _ = forward(Xtest, W1, b1, W2, b2)
    print("Final error rate:", error_rate(pY, Ytest))

    # 2. batch SGD with momentum
    W1 = W1_0.copy()
    b1 = b1_0.copy()
    W2 = W2_0.copy()
    b2 = b2_0.copy()
    losses_momentum = []
    errors_momentum = []
    mu = 0.9  # momentum coefficient; 0.9 is usually good enough
    dW2 = 0
    db2 = 0
    dW1 = 0
    db1 = 0
    for i in range(max_iter):
        for j in range(n_batches):
            Xbatch = Xtrain[j * batch_sz:(j * batch_sz + batch_sz), ]
            Ybatch = Ytrain_ind[j * batch_sz:(j * batch_sz + batch_sz), ]
            pYbatch, Z = forward(Xbatch, W1, b1, W2, b2)

            # gradients
            gW2 = derivative_w2(Z, Ybatch, pYbatch) + reg * W2
            gb2 = derivative_b2(Ybatch, pYbatch) + reg * b2
            gW1 = derivative_w1(Xbatch, Z, Ybatch, pYbatch, W2) + reg * W1
            gb1 = derivative_b1(Z, Ybatch, pYbatch, W2) + reg * b1

            # update velocities
            dW2 = mu * dW2 - lr * gW2
            db2 = mu * db2 - lr * gb2
            dW1 = mu * dW1 - lr * gW1
            db1 = mu * db1 - lr * gb1

            # updates
            W2 += dW2
            b2 += db2
            W1 += dW1
            b1 += db1

            if j % print_period == 0:
                pY, _ = forward(Xtest, W1, b1, W2, b2)
                l = cost(pY, Ytest_ind)
                losses_momentum.append(l)
                print('Cost at iteration i=%d, j=%d : %.6f' % (i, j, l))

                e = error_rate(pY, Ytest)
                errors_momentum.append(e)
                print('Error rate:', e)

    pY, _ = forward(Xtest, W1, b1, W2, b2)
    print('Final error rate:', error_rate(pY, Ytest))

    plt.plot(losses_batch, label='batch')
    plt.plot(losses_momentum, label="momentum")
    plt.legend()
    plt.show()
def main(): # compare 5 scenarios: # 1. batch SGD with constant learning rate # 2. batch SGD with RMSProp # 3. batch SGD with AdaGrad # 4. batch SGD with exponential decay np.random.seed(2) max_iter = 20 print_period = 10 X, Y = get_normalized_data() lr = 0.00004 reg = 0.01 Xtrain, Ytrain = X[:-1000, :], Y[:-1000] Xtest, Ytest = X[-1000:, :], Y[-1000:] Ytrain_ind = y2indicator(Ytrain) Ytest_ind = y2indicator(Ytest) N, D = Xtrain.shape M = 300 # number of hidden layer units K = len(set(Ytrain)) batch_size = 500 n_batches = N // batch_size # randomly initialize weights: W1 = np.random.randn(D, M) / np.sqrt(D) b1 = np.zeros(M) W2 = np.random.randn(M, K) / np.sqrt(M) b2 = np.zeros(K) # save initial weights: W1_0 = W1.copy() b1_0 = b1.copy() W2_0 = W2.copy() b2_0 = b2.copy() # 1. batch SGD with constant learning rate: LL_batch = [] CR_batch = [] t0 = datetime.now() print('\nperforming batch SGD with constant learning rate...') for i in range(max_iter): for j in range(n_batches): Xbatch = Xtrain[j * batch_size:(j + 1) * batch_size, :] Ybatch = Ytrain_ind[j * batch_size:(j + 1) * batch_size, :] p_Ybatch, Z = forward(Xbatch, W1, b1, W2, b2) #print(Z.shape, p_Ybatch.shape, Ybatch.shape) #print('First batch cost:', cost(p_Ybatch, Ybatch)) # updates: W2 -= lr * (derivative_W2(Z, Ybatch, p_Ybatch) + reg * W2) b2 -= lr * (derivative_b2(Ybatch, p_Ybatch) + reg * b2) W1 -= lr * (derivative_W1(Xbatch, Z, Ybatch, p_Ybatch, W2) + reg * W1) b1 -= lr * (derivative_b1(Z, Ybatch, p_Ybatch, W2) + reg * b1) if j % print_period == 0: pY, _ = forward(Xtest, W1, b1, W2, b2) #print('pY:', pY) ll = cost(pY, Ytest_ind) LL_batch.append(ll) print('\ni: %d, j: %d, cost: %.6f' % (i, j, ll)) error = error_rate(pY, Ytest) CR_batch.append(error) print('error rate:', error) dt1 = datetime.now() - t0 pY, _ = forward(Xtest, W1, b1, W2, b2) final_err1 = error_rate(pY, Ytest) # plot the cost #plt.plot(LL_batch) #plt.title('Cost for batch GD with const lr') #plt.show() # 2. 
batch GD with RMSProp: W1 = W1_0.copy() b1 = b1_0.copy() W2 = W2_0.copy() b2 = b2_0.copy() LL_RMSProp = [] CR_RMSProp = [] lr0 = 0.001 # initial learning rate cache_W2 = 1 cache_b2 = 1 cache_W1 = 1 cache_b1 = 1 decay = 0.999 eps = 10e-10 t0 = datetime.now() print('\nperforming batch SGD with RMSProp...') for i in range(max_iter): for j in range(n_batches): Xbatch = Xtrain[j * batch_size:(j + 1) * batch_size, :] Ybatch = Ytrain_ind[j * batch_size:(j + 1) * batch_size, :] p_Ybatch, Z = forward(Xbatch, W1, b1, W2, b2) #print(Z.shape, p_Ybatch.shape, Ybatch.shape) #print('First batch cost:', cost(p_Ybatch, Ybatch)) # updates: gW2 = (derivative_W2(Z, Ybatch, p_Ybatch) + reg * W2) cache_W2 = decay * cache_W2 + (1 - decay) * gW2 * gW2 W2 -= lr0 * gW2 / np.sqrt(cache_W2 + eps) gb2 = (derivative_b2(Ybatch, p_Ybatch) + reg * b2) cache_b2 = decay * cache_b2 + (1 - decay) * gb2 * gb2 b2 -= lr0 * gb2 / np.sqrt(cache_b2 + eps) gW1 = (derivative_W1(Xbatch, Z, Ybatch, p_Ybatch, W2) + reg * W1) cache_W1 = decay * cache_W1 + (1 - decay) * gW1 * gW1 W1 -= lr0 * gW1 / np.sqrt(cache_W1 + eps) gb1 = (derivative_b1(Z, Ybatch, p_Ybatch, W2) + reg * b1) cache_b1 = decay * cache_b1 + (1 - decay) * gb1 * gb1 b1 -= lr0 * gb1 / np.sqrt(cache_b1 + eps) if j % print_period == 0: pY, _ = forward(Xtest, W1, b1, W2, b2) #print('pY:', pY) ll = cost(pY, Ytest_ind) LL_RMSProp.append(ll) print('\ni: %d, j: %d, cost: %.6f' % (i, j, ll)) error = error_rate(pY, Ytest) CR_RMSProp.append(error) print('error rate:', error) dt2 = datetime.now() - t0 pY, _ = forward(Xtest, W1, b1, W2, b2) final_err2 = error_rate(pY, Ytest) # plot the cost #plt.plot(LL_RMSProp) #plt.title('Cost for batch SGD with RMSProp') #plt.show() # 3. batch SGD with AdaGrad: W1 = W1_0.copy() b1 = b1_0.copy() W2 = W2_0.copy() b2 = b2_0.copy() LL_AdaGrad = [] CR_AdaGrad = [] lr0 = 0.01 # initial learning rate cache_W2 = 1 cache_b2 = 1 cache_W1 = 1 cache_b1 = 1 eps = 10e-10 t0 = datetime.now() print('\nperforming batch SGD with AdaGrad...') for i in range(max_iter): for j in range(n_batches): Xbatch = Xtrain[j * batch_size:(j + 1) * batch_size, :] Ybatch = Ytrain_ind[j * batch_size:(j + 1) * batch_size, :] p_Ybatch, Z = forward(Xbatch, W1, b1, W2, b2) #print(Z.shape, p_Ybatch.shape, Ybatch.shape) #print('First batch cost:', cost(p_Ybatch, Ybatch)) # updates: gW2 = (derivative_W2(Z, Ybatch, p_Ybatch) + reg * W2) cache_W2 = cache_W2 + gW2 * gW2 W2 -= lr0 * gW2 / np.sqrt(cache_W2 + eps) gb2 = (derivative_b2(Ybatch, p_Ybatch) + reg * b2) cache_b2 = cache_b2 + gb2 * gb2 b2 -= lr0 * gb2 / np.sqrt(cache_b2 + eps) gW1 = (derivative_W1(Xbatch, Z, Ybatch, p_Ybatch, W2) + reg * W1) cache_W1 = cache_W1 + gW1 * gW1 W1 -= lr0 * gW1 / np.sqrt(cache_W1 + eps) gb1 = (derivative_b1(Z, Ybatch, p_Ybatch, W2) + reg * b1) cache_b1 = cache_b1 + gb1 * gb1 b1 -= lr0 * gb1 / np.sqrt(cache_b1 + eps) if j % print_period == 0: pY, _ = forward(Xtest, W1, b1, W2, b2) #print('pY:', pY) ll = cost(pY, Ytest_ind) LL_AdaGrad.append(ll) print('\ni: %d, j: %d, cost: %.6f' % (i, j, ll)) error = error_rate(pY, Ytest) CR_AdaGrad.append(error) print('error rate:', error) dt3 = datetime.now() - t0 pY, _ = forward(Xtest, W1, b1, W2, b2) final_err3 = error_rate(pY, Ytest) # plot the cost #plt.plot(LL_AdaGrad) #plt.title('Cost for batch SGD with AdaGrad') #plt.show() ''' # 4. 
batch SGD with exponential decay: W1 = W1_0.copy() b1 = b1_0.copy() W2 = W2_0.copy() b2 = b2_0.copy() LL_exp = [] CR_exp = [] lr0 = 0.0004 # initial learning rate k = 1e-7 t = 0 # initial log lr = lr0 t0 = datetime.now() print('\nperforming batch SGD with lr exponential decay...') for i in range(max_iter): for j in range(n_batches): Xbatch = Xtrain[j*batch_size:(j+1)*batch_size, :] Ybatch = Ytrain_ind[j*batch_size:(j+1)*batch_size, :] p_Ybatch, Z = forward(Xbatch, W1, b1, W2, b2) #print(Z.shape, p_Ybatch.shape, Ybatch.shape) #print('First batch cost:', cost(p_Ybatch, Ybatch)) # updates: gW2 = (derivative_W2(Z, Ybatch, p_Ybatch) + reg*W2) W2 -= lr*gW2 gb2 = (derivative_b2(Ybatch, p_Ybatch) + reg*b2) b2 -= lr*gb2 gW1 = (derivative_W1(Xbatch, Z, Ybatch, p_Ybatch, W2) + reg*W1) W1 -= lr*gW1 gb1 = (derivative_b1(Z, Ybatch, p_Ybatch, W2) + reg*b1) b1 -= lr*gb1 # decrease the learning rate lr = lr0 * np.exp(-k*t) t += 1 if j % print_period == 0: print('current learning rate:', lr) pY, _ = forward(Xtest, W1, b1, W2, b2) #print('pY:', pY) ll = cost(pY, Ytest_ind) LL_exp.append(ll) print('\ni: %d, j: %d, cost: %.6f' % (i, j, ll)) error = error_rate(pY, Ytest) CR_exp.append(error) print('error rate:', error) dt4 = datetime.now() - t0 pY, _ = forward(Xtest, W1, b1, W2, b2) print('\nFinal error rate:', error_rate(pY, Ytest)) print('Elapsed time for batch SGD with lr exponential decay:', dt4) # plot the cost #plt.plot(LL_exp) #plt.title('Cost for batch SGD with lr exponential decay') #plt.show() ''' print('\nBatch SGD with constant learning rate:') print('final error rate:', final_err1) print('elapsed time:', dt1) print('\nBatch SGD with RMSProp:') print('final error rate:', final_err2) print('elapsed time:', dt2) print('\nBatch SGD with AdaGrad:') print('final error rate:', final_err3) print('elapsed time:', dt3) # plot the costs together: plt.plot(LL_batch, label='const_lr') plt.plot(LL_RMSProp, label='RMSProp') plt.plot(LL_AdaGrad, label='AdaGrad') #plt.plot(LL_exp, label='lr_exp_decay') plt.legend() plt.show()
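# Side note: the AdaGrad and RMSProp branches above differ only in how the squared gradients
# are accumulated. A minimal per-parameter sketch of that difference; the function names are
# illustrative assumptions, not part of the script above.
import numpy as np

def adagrad_cache(cache, g):
    # AdaGrad: the cache keeps growing, so the effective learning rate only ever shrinks
    return cache + g * g

def rmsprop_cache(cache, g, decay=0.999):
    # RMSProp: exponentially weighted average, so old gradients are gradually forgotten
    return decay * cache + (1 - decay) * g * g

def adaptive_step(w, g, cache, lr=0.001, eps=1e-10):
    # shared update form used by both branches in the script above
    return w - lr * g / np.sqrt(cache + eps)

w, g, cache = 1.0, 0.5, 1.0
print(adaptive_step(w, g, adagrad_cache(cache, g)))
print(adaptive_step(w, g, rmsprop_cache(cache, g)))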
def main(): # compare 3: # 1. batch SGD # 2. batch SGD with momentum # 3. batch SGD with Nesterov momentum # all with L2 regularization X, Y = get_normalized_data() Xtrain = X[:-1000, ] Ytrain = Y[:-1000] Xtest = X[-1000:, ] Ytest = Y[-1000:] N, D = Xtrain.shape Ytrain_ind = y2indicator(Ytrain) Ytest_ind = y2indicator(Ytest) M = 300 K = 10 ######################IMPORTANETE PARAMETERE #################### t = 1 # !!!!!!!!!!!!!!!! ############################################################### epochs = 20 print_period = 10 lr0 = 0.001 reg = 0.01 epsilon = 1e-8 # is it the same as 10e-8 beta1 = 0.9 # mu = 0.9 beta2 = 0.999 # decay = 0.999 batch_size = 500 number_batches = int(N // batch_size) W1 = np.random.randn(D, M) / 28 b1 = np.zeros(M) W2 = np.random.randn(M, K) / np.sqrt(M) b2 = np.zeros(K) tr_costs_momentum = [] errors_batch_momentum = [] losses_test_momentum = [] # momentum coeficient mW2 = 0 mW1 = 0 mb2 = 0 mb1 = 0 vW1 = 0 vW2 = 0 vb1 = 0 vb2 = 0 mW2_hat = 0 mW1_hat = 0 mb2_hat = 0 mb1_hat = 0 vW1_hat = 0 vW2_hat = 0 vb1_hat = 0 vb2_hat = 0 for epoch in range(epochs): for j in range(number_batches): xtr = Xtrain[j * batch_size:(j * batch_size + batch_size), :] ytr = Ytrain_ind[j * batch_size:(j * batch_size + batch_size), :] ytr_pred, z_tr = forward(xtr, W1, b1, W2, b2) # gradients gW2 = derivative_w2(z_tr, ytr, ytr_pred) + reg * W2 gb2 = derivative_b2(ytr, ytr_pred) + reg * b2 gW1 = derivative_w1(xtr, z_tr, ytr, ytr_pred, W2) + reg * W1 gb1 = derivative_b1(z_tr, ytr, ytr_pred, W2) + reg * b1 # update momentum mW2 = beta1 * mW2 + (1 - beta1) * gW2 mW1 = beta1 * mW1 + (1 - beta1) * gW1 mb2 = beta1 * mb2 + (1 - beta1) * gb2 mb1 = beta1 * mb1 + (1 - beta1) * gb1 # update velocity vW2 = beta2 * vW2 + (1 - beta2) * gW2 * gW2 vb2 = beta2 * vb2 + (1 - beta2) * gb2 * gb2 vW1 = beta2 * vW1 + (1 - beta2) * gW1 * gW1 vb1 = beta2 * vb1 + (1 - beta2) * gb1 * gb1 # bias correction correction1 = (1 - beta1**t) mW2_hat = mW2 / correction1 mW1_hat = mW1 / correction1 mb2_hat = mb2 / correction1 mb1_hat = mb1 / correction1 correction2 = (1 - beta2**t) vW2_hat = vW2 / correction2 vW1_hat = vW1 / correction2 vb2_hat = vb2 / correction2 vb1_hat = vb1 / correction2 # update t !!!!!!! t += 1 # update W2 -= lr0 * (mW2_hat / np.sqrt(vW2_hat + epsilon)) W1 -= lr0 * (mW1_hat / np.sqrt(vW1_hat + epsilon)) b2 -= lr0 * (mb2_hat / np.sqrt(vb2_hat + epsilon)) b1 -= lr0 * (mb1_hat / np.sqrt(vb1_hat + epsilon)) if j % print_period == 0: yte_pred, _ = forward(Xtest, W1, b1, W2, b2) l = cost(yte_pred, Ytest_ind) losses_test_momentum.append(l) print("test set Cost at iteration epoch=%d, j=%d: %.6f" % (epoch, j, l)) e = error_rate(yte_pred, Ytest) errors_batch_momentum.append(e) print("Error rate:", e) ctr = cost(ytr_pred, ytr) print("traning set cost", ctr) tr_costs_momentum.append(ctr) pY, _ = forward(Xtest, W1, b1, W2, b2) #plt.plot(tr_costs_momentum, label='tr_costs momentum') plt.plot(losses_test_momentum, label='losses_test momentum RMS') #plt.plot(errors_batch, label='errors_batch') # plt.show() # print("tr_costs", errors_batch_momentum) print("Final error rate:", error_rate(pY, Ytest)) plt.legend() plt.show()
def main():
    dobatch = False
    dobatchwithmomentum = True
    dobatchwithnesterovmomentum = True

    max_iter = 20  # make it 30 for sigmoid
    print_period = 50

    Xtrain, Xtest, Ytrain, Ytest = get_normalized_data()
    lr = 0.00004
    reg = 0.01

    Ytrain_ind = y2indicator(Ytrain)
    Ytest_ind = y2indicator(Ytest)

    N, D = Xtrain.shape
    batch_sz = 500
    n_batches = N // batch_sz

    M = 300
    K = 10
    W1 = np.random.randn(D, M) / np.sqrt(D)
    b1 = np.zeros(M)
    W2 = np.random.randn(M, K) / np.sqrt(M)
    b2 = np.zeros(K)

    # save initial weights
    W1_0 = W1.copy()
    b1_0 = b1.copy()
    W2_0 = W2.copy()
    b2_0 = b2.copy()

    # initialize histories up front so the final plot works even when a scenario is skipped
    losses_batch = []
    errors_batch = []
    losses_momentum = []
    errors_momentum = []
    losses_nesterov = []
    errors_nesterov = []

    # batch
    # cost = -16
    if dobatch:
        for i in range(max_iter):
            for j in range(n_batches):
                Xbatch = Xtrain[j * batch_sz:(j * batch_sz + batch_sz), ]
                Ybatch = Ytrain_ind[j * batch_sz:(j * batch_sz + batch_sz), ]
                pYbatch, Z = forward(Xbatch, W1, b1, W2, b2)

                W2 -= lr * (derivative_w2(Z, Ybatch, pYbatch) + reg * W2)
                b2 -= lr * (derivative_b2(Ybatch, pYbatch) + reg * b2)
                W1 -= lr * (derivative_w1(Xbatch, Z, Ybatch, pYbatch, W2) + reg * W1)
                b1 -= lr * (derivative_b1(Z, Ybatch, pYbatch, W2) + reg * b1)

                if j % print_period == 0:
                    pY, _ = forward(Xtest, W1, b1, W2, b2)
                    l = cost(pY, Ytest_ind)
                    losses_batch.append(l)
                    print("cost at iter i %d, j %d: %.6f" % (i, j, l))
                    e = error_rate(pY, Ytest)
                    errors_batch.append(e)

        pY, _ = forward(Xtest, W1, b1, W2, b2)
        print("final error rate: ", error_rate(pY, Ytest))
        print()

    # batch with momentum
    if dobatchwithmomentum:
        print("momentum")
        W1 = W1_0.copy()
        b1 = b1_0.copy()
        W2 = W2_0.copy()
        b2 = b2_0.copy()
        mu = 0.9
        dW2 = 0
        db2 = 0
        dW1 = 0
        db1 = 0
        for i in range(max_iter):
            for j in range(n_batches):
                Xbatch = Xtrain[j * batch_sz:(j * batch_sz + batch_sz), ]
                Ybatch = Ytrain_ind[j * batch_sz:(j * batch_sz + batch_sz), ]
                pYbatch, Z = forward(Xbatch, W1, b1, W2, b2)

                gW2 = derivative_w2(Z, Ybatch, pYbatch) + reg * W2
                gb2 = derivative_b2(Ybatch, pYbatch) + reg * b2
                gW1 = derivative_w1(Xbatch, Z, Ybatch, pYbatch, W2) + reg * W1
                gb1 = derivative_b1(Z, Ybatch, pYbatch, W2) + reg * b1

                # update velocities
                dW2 = mu * dW2 - lr * gW2
                db2 = mu * db2 - lr * gb2
                dW1 = mu * dW1 - lr * gW1
                db1 = mu * db1 - lr * gb1

                # apply updates
                W2 += dW2
                b2 += db2
                W1 += dW1
                b1 += db1

                if j % print_period == 0:
                    pY, _ = forward(Xtest, W1, b1, W2, b2)
                    l = cost(pY, Ytest_ind)
                    losses_momentum.append(l)
                    print("cost at iter i %d, j %d: %.6f" % (i, j, l))
                    e = error_rate(pY, Ytest)
                    errors_momentum.append(e)

        pY, _ = forward(Xtest, W1, b1, W2, b2)
        print("final error rate: ", error_rate(pY, Ytest))
        print()

    # Nesterov momentum
    if dobatchwithnesterovmomentum:
        W1 = W1_0.copy()
        b1 = b1_0.copy()
        W2 = W2_0.copy()
        b2 = b2_0.copy()
        mu = 0.9
        vW2 = 0
        vb2 = 0
        vW1 = 0
        vb1 = 0
        for i in range(max_iter):
            for j in range(n_batches):
                Xbatch = Xtrain[j * batch_sz:(j * batch_sz + batch_sz), ]
                Ybatch = Ytrain_ind[j * batch_sz:(j * batch_sz + batch_sz), ]
                pYbatch, Z = forward(Xbatch, W1, b1, W2, b2)

                gW2 = derivative_w2(Z, Ybatch, pYbatch) + reg * W2
                gb2 = derivative_b2(Ybatch, pYbatch) + reg * b2
                gW1 = derivative_w1(Xbatch, Z, Ybatch, pYbatch, W2) + reg * W1
                gb1 = derivative_b1(Z, Ybatch, pYbatch, W2) + reg * b1

                # velocity update
                vW2 = mu * vW2 - lr * gW2
                vb2 = mu * vb2 - lr * gb2
                vW1 = mu * vW1 - lr * gW1
                vb1 = mu * vb1 - lr * gb1

                # parameter update
                W2 += mu * vW2 - lr * gW2
                b2 += mu * vb2 - lr * gb2
                W1 += mu * vW1 - lr * gW1
                b1 += mu * vb1 - lr * gb1

                if j % print_period == 0:
                    pY, _ = forward(Xtest, W1, b1, W2, b2)
                    l = cost(pY, Ytest_ind)
                    losses_nesterov.append(l)
                    print("cost at iter i %d, j %d: %.6f" % (i, j, l))
                    e = error_rate(pY, Ytest)
                    errors_nesterov.append(e)

        pY, _ = forward(Xtest, W1, b1, W2, b2)
        print("final error rate: ", error_rate(pY, Ytest))
        print()

    plt.plot(losses_batch, label="batch")
    plt.plot(losses_momentum, label="momentum")
    plt.plot(losses_nesterov, label="nesterov")
    plt.legend()
    plt.show()
def main():
    # compare 2 scenarios:
    # 1. batch SGD with RMSProp
    # 2. batch SGD with momentum + RMSProp
    # both with L2 regularization
    X, Y = get_normalized_data()
    Xtrain = X[:-1000, ]
    Ytrain = Y[:-1000]
    Xtest = X[-1000:, ]
    Ytest = Y[-1000:]
    N, D = Xtrain.shape

    Ytrain_ind = y2indicator(Ytrain)
    Ytest_ind = y2indicator(Ytest)

    M = 300
    K = 10
    max_iter = 20
    epochs = 20
    print_period = 10
    lr0 = 0.0004
    reg = 0.01
    epsilon = 10e-10
    decay = 0.999
    batch_size = 500
    number_batches = int(N // batch_size)

    W1 = np.random.randn(D, M) / 28
    b1 = np.zeros(M)
    W2 = np.random.randn(M, K) / np.sqrt(M)
    b2 = np.zeros(K)
    W1_0 = W1.copy()
    b1_0 = b1.copy()
    W2_0 = W2.copy()
    b2_0 = b2.copy()

    cache_W2 = 1
    cache_W1 = 1
    cache_b2 = 1
    cache_b1 = 1

    tr_costs = []
    errors_batch = []
    losses_test = []

    # 1. plain gradient descent with RMSProp
    for epoch in range(epochs):
        for j in range(number_batches):
            xtr = Xtrain[j * batch_size:(j * batch_size + batch_size), :]
            ytr = Ytrain_ind[j * batch_size:(j * batch_size + batch_size), :]
            ytr_pred, z_tr = forward(xtr, W1, b1, W2, b2)

            # gradients
            gW2 = derivative_w2(z_tr, ytr, ytr_pred) + reg * W2
            gb2 = derivative_b2(ytr, ytr_pred) + reg * b2
            gW1 = derivative_w1(xtr, z_tr, ytr, ytr_pred, W2) + reg * W1
            gb1 = derivative_b1(z_tr, ytr, ytr_pred, W2) + reg * b1

            # # AdaGrad
            # cache_W2 += derivative_w2(z_tr, ytr, ytr_pred) * derivative_w2(z_tr, ytr, ytr_pred)
            # cache_W1 += derivative_w1(xtr, z_tr, ytr, ytr_pred, W2) * derivative_w1(xtr, z_tr, ytr, ytr_pred, W2)
            # cache_b2 += derivative_b2(ytr, ytr_pred) * derivative_b2(ytr, ytr_pred)
            # cache_b1 += derivative_b1(z_tr, ytr, ytr_pred, W2) * derivative_b1(z_tr, ytr, ytr_pred, W2)

            # RMSProp
            cache_W2 = decay * cache_W2 + (1 - decay) * gW2 * gW2
            cache_W1 = decay * cache_W1 + (1 - decay) * gW1 * gW1
            cache_b2 = decay * cache_b2 + (1 - decay) * gb2 * gb2
            cache_b1 = decay * cache_b1 + (1 - decay) * gb1 * gb1
            W2 -= lr0 * gW2 / (np.sqrt(cache_W2) + epsilon)
            b2 -= lr0 * gb2 / (np.sqrt(cache_b2) + epsilon)
            W1 -= lr0 * gW1 / (np.sqrt(cache_W1) + epsilon)
            b1 -= lr0 * gb1 / (np.sqrt(cache_b1) + epsilon)

            if j % print_period == 0:
                yte_pred, _ = forward(Xtest, W1, b1, W2, b2)
                l = cost(yte_pred, Ytest_ind)
                losses_test.append(l)
                print("test set Cost at iteration epoch=%d, j=%d: %.6f" % (epoch, j, l))

                e = error_rate(yte_pred, Ytest)
                errors_batch.append(e)
                print("Error rate:", e)

                ctr = cost(ytr_pred, ytr)
                print("training set cost", ctr)
                tr_costs.append(ctr)

    pY, _ = forward(Xtest, W1, b1, W2, b2)
    # plt.plot(tr_costs, label='tr_costs')
    plt.plot(losses_test, label='losses_test RMS')
    # plt.plot(errors_batch, label='errors_batch')
    # plt.show()
    # print("tr_costs", tr_costs)
    print("Final error rate:", error_rate(pY, Ytest))

    # 2. batch gradient descent with momentum + RMSProp
    W1 = W1_0.copy()
    b1 = b1_0.copy()
    W2 = W2_0.copy()
    b2 = b2_0.copy()

    tr_costs_momentum = []
    errors_batch_momentum = []
    losses_test_momentum = []

    # momentum coefficient
    mu = 0.8
    cache_W2 = 1
    cache_W1 = 1
    cache_b2 = 1
    cache_b1 = 1
    dW1 = 0
    dW2 = 0
    db1 = 0
    db2 = 0
    cW1 = 0
    cW2 = 0
    cb1 = 0
    cb2 = 0
    for epoch in range(epochs):
        for j in range(number_batches):
            xtr = Xtrain[j * batch_size:(j * batch_size + batch_size), :]
            ytr = Ytrain_ind[j * batch_size:(j * batch_size + batch_size), :]
            ytr_pred, z_tr = forward(xtr, W1, b1, W2, b2)

            # gradients
            gW2 = derivative_w2(z_tr, ytr, ytr_pred) + reg * W2
            gb2 = derivative_b2(ytr, ytr_pred) + reg * b2
            gW1 = derivative_w1(xtr, z_tr, ytr, ytr_pred, W2) + reg * W1
            gb1 = derivative_b1(z_tr, ytr, ytr_pred, W2) + reg * b1

            # note: the momentum / velocity naming here may be swapped
            # RMSProp
            cache_W2 = decay * cache_W2 + (1 - decay) * gW2 * gW2
            cache_W1 = decay * cache_W1 + (1 - decay) * gW1 * gW1
            cache_b2 = decay * cache_b2 + (1 - decay) * gb2 * gb2
            cache_b1 = decay * cache_b1 + (1 - decay) * gb1 * gb1
            cW2 = gW2 / (np.sqrt(cache_W2) + epsilon)
            cb2 = gb2 / (np.sqrt(cache_b2) + epsilon)
            cW1 = gW1 / (np.sqrt(cache_W1) + epsilon)
            cb1 = gb1 / (np.sqrt(cache_b1) + epsilon)

            # update velocity
            dW2 = mu * dW2 + (1 - mu) * lr0 * cW2
            db2 = mu * db2 + (1 - mu) * lr0 * cb2
            dW1 = mu * dW1 + (1 - mu) * lr0 * cW1
            db1 = mu * db1 + (1 - mu) * lr0 * cb1

            # update
            W2 -= dW2
            W1 -= dW1
            b2 -= db2
            b1 -= db1

            if j % print_period == 0:
                yte_pred, _ = forward(Xtest, W1, b1, W2, b2)
                l = cost(yte_pred, Ytest_ind)
                losses_test_momentum.append(l)
                print("test set Cost at iteration epoch=%d, j=%d: %.6f" % (epoch, j, l))

                e = error_rate(yte_pred, Ytest)
                errors_batch_momentum.append(e)
                print("Error rate:", e)

                ctr = cost(ytr_pred, ytr)
                print("training set cost", ctr)
                tr_costs_momentum.append(ctr)

    pY, _ = forward(Xtest, W1, b1, W2, b2)
    # plt.plot(tr_costs_momentum, label='tr_costs momentum')
    plt.plot(losses_test_momentum, label='losses_test momentum RMS')
    # plt.plot(errors_batch, label='errors_batch')
    # print("tr_costs", errors_batch_momentum)
    print("Final error rate:", error_rate(pY, Ytest))
    plt.legend()
    plt.show()
def main(): max_iter = 20 # make it 30 for sigmoid print_period = 50 Xtrain, Xtest, Ytrain, Ytest = get_normalized_data() reg = 0.01 Ytrain_ind = y2indicator(Ytrain) Ytest_ind = y2indicator(Ytest) N, D = Xtrain.shape batch_sz = 500 n_batches = N // batch_sz M = 300 K = 10 W1 = np.random.randn(D, M) / np.sqrt(D) b1 = np.zeros(M) W2 = np.random.randn(M, K) / np.sqrt(M) b2 = np.zeros(K) # save initial weights W1_0 = W1.copy() b1_0 = b1.copy() W2_0 = W2.copy() b2_0 = b2.copy() # 1st moment mW1 = 0 mb1 = 0 mW2 = 0 mb2 = 0 # 2nd moment vW1 = 0 vb1 = 0 vW2 = 0 vb2 = 0 # hyperparameters lr0 = 0.001 beta1 = 0.9 beta2 = 0.999 eps = 1e-8 losses_adam = [] errors_adam = [] t = 1 for i in range(max_iter): for j in range(n_batches): Xbatch = Xtrain[j * batch_sz:(j * batch_sz + batch_sz), ] Ybatch = Ytrain_ind[j * batch_sz:(j * batch_sz + batch_sz), ] pYbatch, Z = forward(Xbatch, W1, b1, W2, b2) gW2 = derivative_w2(Z, Ybatch, pYbatch) + reg * W2 gb2 = derivative_b2(Ybatch, pYbatch) + reg * b2 gW1 = derivative_w1(Xbatch, Z, Ybatch, pYbatch, W2) + reg * W1 gb1 = derivative_b1(Z, Ybatch, pYbatch, W2) + reg * b1 # new m mW1 = beta1 * mW1 + (1 - beta1) * gW1 mb1 = beta1 * mb1 + (1 - beta1) * gb1 mW2 = beta1 * mW2 + (1 - beta1) * gW2 mb2 = beta1 * mb2 + (1 - beta1) * gb2 # new v vW1 = beta2 * vW1 + (1 - beta2) * gW1 * gW1 vb1 = beta2 * vb1 + (1 - beta2) * gb1 * gb1 vW2 = beta2 * vW2 + (1 - beta2) * gW2 * gW2 vb2 = beta2 * vb2 + (1 - beta2) * gb2 * gb2 # bias correction correction1 = 1 - beta1**t hat_mW1 = mW1 / correction1 hat_mb1 = mb1 / correction1 hat_mW2 = mW2 / correction1 hat_mb2 = mb2 / correction1 correction2 = 1 - beta2**t hat_vW1 = vW1 / correction2 hat_vb1 = vb1 / correction2 hat_vW2 = vW2 / correction2 hat_vb2 = vb2 / correction2 # update t t += 1 # apply updates to the params W1 = W1 - lr0 * hat_mW1 / np.sqrt(hat_vW1 + eps) b1 = b1 - lr0 * hat_mb1 / np.sqrt(hat_vb1 + eps) W2 = W2 - lr0 * hat_mW2 / np.sqrt(hat_vW2 + eps) b2 = b2 - lr0 * hat_mb2 / np.sqrt(hat_vb2 + eps) if j % print_period == 0: pY, _ = forward(Xtest, W1, b1, W2, b2) l = cost(pY, Ytest_ind) losses_adam.append(l) print("cost at iter i %d, j %d: %.6f" % (i, j, l)) e = error_rate(pY, Ytest) errors_adam.append(e) pY, _ = forward(Xtest, W1, b1, W2, b2) print("final error rate: ", error_rate(pY, Ytest)) print() plt.plot(losses_adam, label='adam') pY, _ = forward(Xtest, W1, b1, W2, b2) plt.legend() plt.show()
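# Side note: the Adam loop above repeats the same few lines of bookkeeping for every parameter
# array. A minimal sketch of how that could be factored into a helper; the name adam_update and
# the calling convention are illustrative assumptions, not part of the script above.
import numpy as np

def adam_update(param, grad, m, v, t, lr=0.001, beta1=0.9, beta2=0.999, eps=1e-8):
    # one Adam step for a single parameter array; returns the new parameter and moments
    m = beta1 * m + (1 - beta1) * grad
    v = beta2 * v + (1 - beta2) * grad * grad
    m_hat = m / (1 - beta1 ** t)
    v_hat = v / (1 - beta2 ** t)
    param = param - lr * m_hat / np.sqrt(v_hat + eps)
    return param, m, v

# usage inside the batch loop would look like: W1, mW1, vW1 = adam_update(W1, gW1, mW1, vW1, t)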
def main(): # compare 3 scenarios: # 1. batch SGD # 2. batch SGD with momentum # 3. batch SGD with Nesterov momentum max_iter = 20 # make it 30 for sigmoid print_period = 10 X, Y = get_normalized_data() lr = 0.00004 reg = 0.01 Xtrain = X[:-1000, ] Ytrain = Y[:-1000] Xtest = X[-1000:, ] Ytest = Y[-1000:] Ytrain_ind = y2indicator(Ytrain) Ytest_ind = y2indicator(Ytest) N, D = Xtrain.shape batch_sz = 500 n_batches = N / batch_sz M = 300 K = 10 W1 = np.random.randn(D, M) / 28 b1 = np.zeros(M) W2 = np.random.randn(M, K) / np.sqrt(M) b2 = np.zeros(K) # # 1. batch # cost = -16 LL_batch = [] CR_batch = [] for i in xrange(max_iter): for j in xrange(n_batches): Xbatch = Xtrain[j * batch_sz:(j * batch_sz + batch_sz), ] Ybatch = Ytrain_ind[j * batch_sz:(j * batch_sz + batch_sz), ] pYbatch, Z = forward(Xbatch, W1, b1, W2, b2) # print "first batch cost:", cost(pYbatch, Ybatch) # updates W2 -= lr * (derivative_w2(Z, Ybatch, pYbatch) + reg * W2) b2 -= lr * (derivative_b2(Ybatch, pYbatch) + reg * b2) W1 -= lr * (derivative_w1(Xbatch, Z, Ybatch, pYbatch, W2) + reg * W1) b1 -= lr * (derivative_b1(Z, Ybatch, pYbatch, W2) + reg * b1) if j % print_period == 0: # calculate just for LL pY, _ = forward(Xtest, W1, b1, W2, b2) # print "pY:", pY ll = cost(pY, Ytest_ind) LL_batch.append(ll) print "Cost at iteration i=%d, j=%d: %.6f" % (i, j, ll) err = error_rate(pY, Ytest) CR_batch.append(err) print "Error rate:", err pY, _ = forward(Xtest, W1, b1, W2, b2) print "Final error rate:", error_rate(pY, Ytest) # 2. batch with momentum W1 = np.random.randn(D, M) / 28 b1 = np.zeros(M) W2 = np.random.randn(M, K) / np.sqrt(M) b2 = np.zeros(K) LL_momentum = [] CR_momentum = [] mu = 0.9 dW2 = 0 db2 = 0 dW1 = 0 db1 = 0 for i in xrange(max_iter): for j in xrange(n_batches): Xbatch = Xtrain[j * batch_sz:(j * batch_sz + batch_sz), ] Ybatch = Ytrain_ind[j * batch_sz:(j * batch_sz + batch_sz), ] pYbatch, Z = forward(Xbatch, W1, b1, W2, b2) # # updates dW2 = mu * dW2 - lr * (derivative_w2(Z, Ybatch, pYbatch) + reg * W2) W2 += dW2 db2 = mu * db2 - lr * (derivative_b2(Ybatch, pYbatch) + reg * b2) b2 += db2 dW1 = mu * dW1 - lr * ( derivative_w1(Xbatch, Z, Ybatch, pYbatch, W2) + reg * W1) W1 += dW1 db1 = mu * db1 - lr * (derivative_b1(Z, Ybatch, pYbatch, W2) + reg * b1) b1 += db1 if j % print_period == 0: # # calculate just for LL pY, _ = forward(Xtest, W1, b1, W2, b2) print "pY:", pY ll = cost(pY, Ytest_ind) LL_momentum.append(ll) print "Cost at iteration i=%d, j=%d: %.6f" % (i, j, ll) err = error_rate(pY, Ytest) CR_momentum.append(err) print "Error rate:", err pY, _ = forward(Xtest, W1, b1, W2, b2) print "Final error rate:", error_rate(pY, Ytest) # # 3. 
batch with Nesterov momentum W1 = np.random.randn(D, M) / 28 b1 = np.zeros(M) W2 = np.random.randn(M, K) / np.sqrt(M) b2 = np.zeros(K) LL_nest = [] CR_nest = [] mu = 0.9 dW2 = 0 db2 = 0 dW1 = 0 db1 = 0 for i in xrange(max_iter): for j in xrange(n_batches): Xbatch = Xtrain[j * batch_sz:(j * batch_sz + batch_sz), ] Ybatch = Ytrain_ind[j * batch_sz:(j * batch_sz + batch_sz), ] pYbatch, Z = forward(Xbatch, W1, b1, W2, b2) # updates dW2 = mu * mu * dW2 - (1 + mu) * lr * ( derivative_w2(Z, Ybatch, pYbatch) + reg * W2) W2 += dW2 db2 = mu * mu * db2 - (1 + mu) * lr * ( derivative_b2(Ybatch, pYbatch) + reg * b2) b2 += db2 dW1 = mu * mu * dW1 - (1 + mu) * lr * ( derivative_w1(Xbatch, Z, Ybatch, pYbatch, W2) + reg * W1) W1 += dW1 db1 = mu * mu * db1 - (1 + mu) * lr * ( derivative_b1(Z, Ybatch, pYbatch, W2) + reg * b1) b1 += db1 if j % print_period == 0: # calculate just for LL pY, _ = forward(Xtest, W1, b1, W2, b2) # print "pY:", pY ll = cost(pY, Ytest_ind) LL_nest.append(ll) print "Cost at iteration i=%d, j=%d: %.6f" % (i, j, ll) err = error_rate(pY, Ytest) CR_nest.append(err) print "Error rate:", err pY, _ = forward(Xtest, W1, b1, W2, b2) print "Final error rate:", error_rate(pY, Ytest) plt.plot(LL_batch, label="batch") plt.plot(LL_momentum, label="momentum") plt.plot(LL_nest, label="nesterov") plt.legend() plt.show()
def main(): # Compare 3 scenarios: # 1. batch SGD # 2. batch SGD with momentum # 3. batch SGD with Nestrov momentum max_iter = 30 print_period = 10 X_train, X_test, Y_train, Y_test = get_normalized_data() lr = 0.00004 reg = 0.01 Y_train_ind = y2indicator(Y_train) Y_test_ind = y2indicator(Y_test) N, D = X_train.shape batch_sz = 500 n_batches = N // batch_sz M = 300 K = 10 W1 = np.random.randn(D, M) / 28 b1 = np.zeros(M) W2 = np.random.randn(M, K) / np.sqrt(M) b2 = np.zeros(K) # save initial weights W1_0 = W1.copy() b1_0 = b1.copy() W2_0 = W2.copy() b2_0 = b2.copy() # 1. batch # cost = -16 losses_batch = [] errors_batch = [] for i in range(max_iter): for j in range(n_batches): Xbatch = X_train[j * batch_sz:(j * batch_sz + batch_sz), ] Ybatch = Y_train_ind[j * batch_sz:(j * batch_sz + batch_sz), ] pYbatch, Z = forward(Xbatch, W1, b1, W2, b2) # updates W2 -= lr * (derivative_w2(Z, Ybatch, pYbatch) + reg * W2) b2 -= lr * (derivative_b2(Ybatch, pYbatch) + reg * b2) W1 -= lr * (derivative_w1(Xbatch, Z, Ybatch, pYbatch, W2) + reg * W1) b1 -= lr * (derivative_b1(Z, Ybatch, pYbatch, W2) + reg * b1) if j % print_period == 0: pY, _ = forward(X_test, W1, b1, W2, b2) l = cost(pY, Y_test_ind) losses_batch.append(l) print(f'Cost at iteration i={i}, j={j} : {l}') e = error_rate(pY, Y_test) errors_batch.append(e) print("error_rate", e) pY, _ = forward(X_test, W1, b1, W2, b2) print(f"Final error rate: {error_rate(pY, Y_test)}") # 2. batch with momentum W1 = W1_0.copy() b1 = b1_0.copy() W2 = W2_0.copy() b2 = b2_0.copy() losses_momentum = [] errors_momentum = [] mu = 0.9 dW2 = 0 db2 = 0 dW1 = 0 db1 = 0 for i in range(max_iter): for j in range(n_batches): Xbatch = X_train[j * batch_sz:(j * batch_sz + batch_sz), ] Ybatch = Y_train_ind[j * batch_sz:(j * batch_sz + batch_sz), ] pYbatch, Z = forward(Xbatch, W1, b1, W2, b2) # gradients gW2 = derivative_w2(Z, Ybatch, pYbatch) + reg * W2 gb2 = derivative_b2(Ybatch, pYbatch) + reg * b2 gW1 = derivative_w1(Xbatch, Z, Ybatch, pYbatch, W2) + reg * W1 gb1 = derivative_b1(Z, Ybatch, pYbatch, W2) + reg * b1 # Update velocities dW2 = mu * dW2 - lr * gW2 db2 = mu * db2 - lr * gb2 dW1 = mu * dW1 - lr * gW1 db1 = mu * db1 - lr * gb1 # updates W2 += dW2 b2 += db2 W1 += dW1 b1 += db1 if j % print_period == 0: pY, _ = forward(X_test, W1, b1, W2, b2) l = cost(pY, Y_test_ind) losses_momentum.append(l) print(f'Cost at iteration i={i}, j={j} : {l}') e = error_rate(pY, Y_test) errors_momentum.append(e) print("error_rate", e) pY, _ = forward(X_test, W1, b1, W2, b2) print(f"Final error rate: {error_rate(pY, Y_test)}") # 3. 
batch with Nesterov momentum W1 = W1_0.copy() b1 = b1_0.copy() W2 = W2_0.copy() b2 = b2_0.copy() losses_nesterov = [] errors_nesterov = [] mu = 0.9 vW2 = 0 vb2 = 0 vW1 = 0 vb1 = 0 for i in range(max_iter): for j in range(n_batches): Xbatch = X_train[j * batch_sz:(j * batch_sz + batch_sz), ] Ybatch = Y_train_ind[j * batch_sz:(j * batch_sz + batch_sz), ] pYbatch, Z = forward(Xbatch, W1, b1, W2, b2) # gradients gW2 = derivative_w2(Z, Ybatch, pYbatch) + reg * W2 gb2 = derivative_b2(Ybatch, pYbatch) + reg * b2 gW1 = derivative_w1(Xbatch, Z, Ybatch, pYbatch, W2) + reg * W1 gb1 = derivative_b1(Z, Ybatch, pYbatch, W2) + reg * b1 # v update vW2 = mu * vW2 - lr * gW2 vb2 = mu * vb2 - lr * gb2 vW1 = mu * vW1 - lr * gW1 vb1 = mu * vb1 - lr * gb1 # param update W2 += mu * vW2 - lr * gW2 b2 += mu * vb2 - lr * gb2 W1 += mu * vW1 - lr * gW1 b1 += mu * vb1 - lr * gb1 if j % print_period == 0: pY, _ = forward(X_test, W1, b1, W2, b2) l = cost(pY, Y_test_ind) losses_nesterov.append(l) print(f'Cost at iteration i={i}, j={j} : {l}') e = error_rate(pY, Y_test) errors_nesterov.append(e) print("error_rate", e) pY, _ = forward(X_test, W1, b1, W2, b2) print(f"Final error rate: {error_rate(pY, Y_test)}") plt.plot(losses_batch, label='batch') plt.plot(losses_momentum, label='momentum') plt.plot(losses_nesterov, label='nesterov') plt.show()
def main(): max_iter = 20 print_period = 10 Xtrain, Xtest, Ytrain, Ytest = get_normalized_data() lr = 0.00004 reg = 0.01 Ytrain_ind = y2indicator(Ytrain) Ytest_ind = y2indicator(Ytest) N, D = Xtrain.shape batch_sz = 500 n_batches = N // batch_sz M = 300 K = 10 W1 = np.random.randn(D, M) / np.sqrt(D) b1 = np.zeros(M) W2 = np.random.randn(M, K) / np.sqrt(M) b2 = np.zeros(K) #const LL_batch = [] CR_batch = [] for i in range(max_iter): for j in range(n_batches): Xbatch = Xtrain[j*batch_sz: (j*batch_sz + batch_sz),] Ybatch = Ytrain_ind[j*batch_sz: (j*batch_sz + batch_sz), ] pYbatch, Z = forward(Xbatch, W1, b1, W2, b2) #gradient gW2 = derivative_w2(Z, Ybatch, pYbatch) + reg*W2 gb2 = derivative_b2(Ybatch, pYbatch) + reg*b2 gW1 = derivative_w1(Xbatch, Z, Ybatch, pYbatch, W2) + reg*W1 gb1 = derivative_b1(Z, Ybatch, pYbatch, W2) + reg*b1 #updates W2 -= lr*gW2 b2 -= lr*gb2 W1 -= lr*gW1 b1 -= lr*gb1 if j%print_period == 0: pY, _ = forward(Xtest, W1, b1, W2, b2) ll = cost(pY, Ytest_ind) LL_batch.append(ll) print("Cost at iteration i=%d, j=%d: %.6f" % (i, j, ll)) err = error_rate(pY, Ytest) CR_batch.append(err) print("Error rate:", err) pY, _ = forward(Xtest, W1, b1, W2, b2) print("Final error rate:", error_rate(pY, Ytest)) #RMSprop W1 = np.random.randn(D, M) / np.sqrt(D) b1 = np.zeros(M) W2 = np.random.randn(M, K) / np.sqrt(M) b2 = np.zeros(K) #const LL_rms = [] CR_rms = [] lr0 = 0.001 cache_W2 = 1 cache_b2 = 1 cache_W1 = 1 cache_b1 = 1 decay_rate = 0.999 eps = 1e-10 for i in range(max_iter): for j in range(n_batches): Xbatch = Xtrain[j*batch_sz: (j*batch_sz + batch_sz),] Ybatch = Ytrain_ind[j*batch_sz: (j*batch_sz + batch_sz), ] pYbatch, Z = forward(Xbatch, W1, b1, W2, b2) #updates gW2 = derivative_w2(Z, Ybatch, pYbatch) + reg*W2 cache_W2 = decay_rate*cache_W2 + (1 - decay_rate)*gW2*gW2 W2 -= lr0*gW2 / (np.sqrt(cache_W2) + eps) gb2 = derivative_b2(Ybatch, pYbatch) + reg*b2 cache_b2 = decay_rate*cache_b2 + (1 - decay_rate)*gb2*gb2 b2 -= lr0 * gb2 / (np.sqrt(cache_b2) + eps) gW1 = derivative_w1(Xbatch, Z, Ybatch, pYbatch, W2) + reg*W1 cache_W1 = decay_rate*cache_W1 + (1 - decay_rate)*gW1*gW1 W1 -= lr0 * gW1 / (np.sqrt(cache_W1) + eps) gb1 = derivative_b1(Z, Ybatch, pYbatch, W2) + reg*b1 cache_b1 = decay_rate*cache_b1 + (1 - decay_rate)*gb1*gb1 b1 -= lr0 * gb1 / (np.sqrt(cache_b1) + eps) if j%print_period == 0: pY, _ = forward(Xtest, W1, b1, W2, b2) ll = cost(pY, Ytest_ind) LL_rms.append(ll) print("Cost at iteration i=%d, j=%d: %.6f" % (i, j, ll)) err = error_rate(pY, Ytest) CR_rms.append(err) print("Error rate:", err) pY, _ = forward(Xtest, W1, b1, W2, b2) print("Final error rate:", error_rate(pY, Ytest)) plt.plot(LL_batch, label = 'const') plt.plot(LL_rms, label = 'rms') plt.legend() plt.show()
def main():
    # 3 scenarios
    # 1. batch SGD
    # 2. batch SGD with momentum
    # 3. batch SGD with Nesterov momentum
    max_iter = 15
    print_period = 10

    Xtrain, Xtest, Ytrain, Ytest = get_normalized_data()
    lr = 0.0001
    reg = 0.001

    # Xtrain = X[:-1000, ]
    # Ytrain = Y[:-1000]
    # Xtest = X[-1000:, ]
    # Ytest = Y[-1000:, ]

    Ytrain_ind = y2indicator(Ytrain)
    Ytest_ind = y2indicator(Ytest)

    N, D = Xtrain.shape
    batch_sz = 500
    n_batches = int(N / batch_sz)

    M = 300
    K = 10
    W1 = np.random.randn(D, M) / 28
    b1 = np.zeros(M)
    W2 = np.random.randn(M, K) / np.sqrt(M)
    b2 = np.zeros(K)

    W1_0 = W1.copy()
    b1_0 = b1.copy()
    W2_0 = W2.copy()
    b2_0 = b2.copy()

    # Batch
    losses_batch = []
    error_batch = []
    for i in range(max_iter):
        for j in range(n_batches):
            Xbatch = Xtrain[j * batch_sz:(j * batch_sz + batch_sz), ]
            Ybatch = Ytrain_ind[j * batch_sz:(j * batch_sz + batch_sz), ]
            pYbatch, Z = forward(Xbatch, W1, b1, W2, b2)

            # updates
            W2 -= lr * (derivative_w2(Z, Ybatch, pYbatch) + reg * W2)
            b2 -= lr * (derivative_b2(Ybatch, pYbatch) + reg * b2)
            W1 -= lr * (derivative_w1(Xbatch, Z, Ybatch, pYbatch, W2) + reg * W1)
            b1 -= lr * (derivative_b1(Z, Ybatch, pYbatch, W2) + reg * b1)

            if j % print_period == 0:
                pY, _ = forward(Xtest, W1, b1, W2, b2)
                l = cost(pY, Ytest_ind)
                losses_batch.append(l)
                print("Cost at iteration i=%d, j=%d: %.6f" % (i, j, l))

                e = error_rate(pY, Ytest)
                error_batch.append(e)
                print("Error rate:", e)

    pY, _ = forward(Xtest, W1, b1, W2, b2)
    print("Final error rate: ", error_rate(pY, Ytest))

    # Momentum
    W1 = W1_0.copy()
    b1 = b1_0.copy()
    W2 = W2_0.copy()
    b2 = b2_0.copy()
    losses_momentum = []
    errors_momentum = []
    mu = 0.9
    dW2 = 0
    db2 = 0
    dW1 = 0
    db1 = 0
    for i in range(max_iter):
        for j in range(n_batches):
            Xbatch = Xtrain[j * batch_sz:(j * batch_sz + batch_sz), ]
            Ybatch = Ytrain_ind[j * batch_sz:(j * batch_sz + batch_sz), ]
            pYbatch, Z = forward(Xbatch, W1, b1, W2, b2)

            # gradients
            gW2 = (derivative_w2(Z, Ybatch, pYbatch) + reg * W2)
            gb2 = (derivative_b2(Ybatch, pYbatch) + reg * b2)
            gW1 = (derivative_w1(Xbatch, Z, Ybatch, pYbatch, W2) + reg * W1)
            gb1 = (derivative_b1(Z, Ybatch, pYbatch, W2) + reg * b1)

            # update velocities
            dW2 = mu * dW2 - lr * gW2
            db2 = mu * db2 - lr * gb2
            dW1 = mu * dW1 - lr * gW1
            db1 = mu * db1 - lr * gb1

            W2 += dW2
            b2 += db2
            W1 += dW1
            b1 += db1

            if j % print_period == 0:
                pY, _ = forward(Xtest, W1, b1, W2, b2)
                l = cost(pY, Ytest_ind)
                losses_momentum.append(l)
                print("Cost at iteration i=%d, j=%d: %.6f" % (i, j, l))

                e = error_rate(pY, Ytest)
                errors_momentum.append(e)
                print("Error rate:", e)

    pY, _ = forward(Xtest, W1, b1, W2, b2)
    print("Final error rate: ", error_rate(pY, Ytest))

    # Nesterov momentum
    W1 = W1_0.copy()
    b1 = b1_0.copy()
    W2 = W2_0.copy()
    b2 = b2_0.copy()
    losses_nesterov = []
    errors_nesterov = []
    mu = 0.9
    vW2 = 0
    vb2 = 0
    vW1 = 0
    vb1 = 0
    for i in range(max_iter):
        for j in range(n_batches):
            Xbatch = Xtrain[j * batch_sz:(j * batch_sz + batch_sz), ]
            Ybatch = Ytrain_ind[j * batch_sz:(j * batch_sz + batch_sz), ]
            pYbatch, Z = forward(Xbatch, W1, b1, W2, b2)

            # gradients
            gW2 = (derivative_w2(Z, Ybatch, pYbatch) + reg * W2)
            gb2 = (derivative_b2(Ybatch, pYbatch) + reg * b2)
            gW1 = (derivative_w1(Xbatch, Z, Ybatch, pYbatch, W2) + reg * W1)
            gb1 = (derivative_b1(Z, Ybatch, pYbatch, W2) + reg * b1)

            # v update
            vW2 = mu * vW2 - lr * gW2
            vb2 = mu * vb2 - lr * gb2
            vW1 = mu * vW1 - lr * gW1
            vb1 = mu * vb1 - lr * gb1

            # param update
            W2 += mu * vW2 - lr * gW2
            b2 += mu * vb2 - lr * gb2
            W1 += mu * vW1 - lr * gW1
            b1 += mu * vb1 - lr * gb1

            if j % print_period == 0:
                pY, _ = forward(Xtest, W1, b1, W2, b2)
                l = cost(pY, Ytest_ind)
                losses_nesterov.append(l)
                print("Cost at iteration i=%d, j=%d: %.6f" % (i, j, l))

                e = error_rate(pY, Ytest)
                errors_nesterov.append(e)
                print("Error rate:", e)

    pY, _ = forward(Xtest, W1, b1, W2, b2)
    print("Final error rate: ", error_rate(pY, Ytest))

    plt.plot(losses_batch, label='batch')
    plt.plot(losses_momentum, label='momentum')
    plt.plot(losses_nesterov, label='Nesterov')
    plt.legend()
    plt.show()
def main(): # compare 3 scenarios: # 1. batch SGD # 2. batch SGD with momentum # 3. batch SGD with Nesterov momentum max_iter = 20 # make it 30 for sigmoid print_period = 10 X, Y = get_normalized_data() lr = 0.00004 reg = 0.01 Xtrain = X[:-1000, ] Ytrain = Y[:-1000] Xtest = X[-1000:, ] Ytest = Y[-1000:] Ytrain_ind = y2indicator(Ytrain) Ytest_ind = y2indicator(Ytest) N, D = Xtrain.shape batch_sz = 500 n_batches = N // batch_sz M = 300 K = 10 W1 = np.random.randn(D, M) / np.sqrt(D) b1 = np.zeros(M) W2 = np.random.randn(M, K) / np.sqrt(M) b2 = np.zeros(K) # 1. batch # cost = -16 LL_batch = [] CR_batch = [] for i in range(max_iter): for j in range(n_batches): Xbatch = Xtrain[j * batch_sz:(j * batch_sz + batch_sz), ] Ybatch = Ytrain_ind[j * batch_sz:(j * batch_sz + batch_sz), ] pYbatch, Z = forward(Xbatch, W1, b1, W2, b2) # print "first batch cost:", cost(pYbatch, Ybatch) # updates W2 -= lr * (derivative_w2(Z, Ybatch, pYbatch) + reg * W2) b2 -= lr * (derivative_b2(Ybatch, pYbatch) + reg * b2) W1 -= lr * (derivative_w1(Xbatch, Z, Ybatch, pYbatch, W2) + reg * W1) b1 -= lr * (derivative_b1(Z, Ybatch, pYbatch, W2) + reg * b1) if j % print_period == 0: # calculate just for LL pY, _ = forward(Xtest, W1, b1, W2, b2) ll = cost(pY, Ytest_ind) LL_batch.append(ll) print("Cost at iteration i=%d, j=%d: %.6f" % (i, j, ll)) err = error_rate(pY, Ytest) CR_batch.append(err) print("Error rate:", err) pY, _ = forward(Xtest, W1, b1, W2, b2) print("Final error rate:", error_rate(pY, Ytest)) # 2. batch with momentum W1 = np.random.randn(D, M) / np.sqrt(D) b1 = np.zeros(M) W2 = np.random.randn(M, K) / np.sqrt(M) b2 = np.zeros(K) LL_momentum = [] CR_momentum = [] mu = 0.9 dW2 = 0 db2 = 0 dW1 = 0 db1 = 0 for i in range(max_iter): for j in range(n_batches): Xbatch = Xtrain[j * batch_sz:(j * batch_sz + batch_sz), ] Ybatch = Ytrain_ind[j * batch_sz:(j * batch_sz + batch_sz), ] pYbatch, Z = forward(Xbatch, W1, b1, W2, b2) # updates dW2 = mu * dW2 - lr * (derivative_w2(Z, Ybatch, pYbatch) + reg * W2) W2 += dW2 db2 = mu * db2 - lr * (derivative_b2(Ybatch, pYbatch) + reg * b2) b2 += db2 dW1 = mu * dW1 - lr * ( derivative_w1(Xbatch, Z, Ybatch, pYbatch, W2) + reg * W1) W1 += dW1 db1 = mu * db1 - lr * (derivative_b1(Z, Ybatch, pYbatch, W2) + reg * b1) b1 += db1 if j % print_period == 0: # calculate just for LL pY, _ = forward(Xtest, W1, b1, W2, b2) # print "pY:", pY ll = cost(pY, Ytest_ind) LL_momentum.append(ll) print("Cost at iteration i=%d, j=%d: %.6f" % (i, j, ll)) err = error_rate(pY, Ytest) CR_momentum.append(err) print("Error rate:", err) pY, _ = forward(Xtest, W1, b1, W2, b2) print("Final error rate:", error_rate(pY, Ytest)) # 3. 
batch with Nesterov momentum W1 = np.random.randn(D, M) / np.sqrt(D) b1 = np.zeros(M) W2 = np.random.randn(M, K) / np.sqrt(M) b2 = np.zeros(K) LL_nest = [] CR_nest = [] mu = 0.9 # alternate version uses dW # dW2 = 0 # db2 = 0 # dW1 = 0 # db1 = 0 vW2 = 0 vb2 = 0 vW1 = 0 vb1 = 0 for i in range(max_iter): for j in range(n_batches): # because we want g(t) = grad(f(W(t-1) - lr*mu*dW(t-1))) # dW(t) = mu*dW(t-1) + g(t) # W(t) = W(t-1) - mu*dW(t) W1_tmp = W1 - lr * mu * vW1 b1_tmp = b1 - lr * mu * vb1 W2_tmp = W2 - lr * mu * vW2 b2_tmp = b2 - lr * mu * vb2 Xbatch = Xtrain[j * batch_sz:(j * batch_sz + batch_sz), ] Ybatch = Ytrain_ind[j * batch_sz:(j * batch_sz + batch_sz), ] # pYbatch, Z = forward(Xbatch, W1, b1, W2, b2) pYbatch, Z = forward(Xbatch, W1_tmp, b1_tmp, W2_tmp, b2_tmp) # updates # dW2 = mu*mu*dW2 - (1 + mu)*lr*(derivative_w2(Z, Ybatch, pYbatch) + reg*W2) # W2 += dW2 # db2 = mu*mu*db2 - (1 + mu)*lr*(derivative_b2(Ybatch, pYbatch) + reg*b2) # b2 += db2 # dW1 = mu*mu*dW1 - (1 + mu)*lr*(derivative_w1(Xbatch, Z, Ybatch, pYbatch, W2) + reg*W1) # W1 += dW1 # db1 = mu*mu*db1 - (1 + mu)*lr*(derivative_b1(Z, Ybatch, pYbatch, W2) + reg*b1) # b1 += db1 vW2 = mu * vW2 + derivative_w2(Z, Ybatch, pYbatch) + reg * W2_tmp W2 -= lr * vW2 vb2 = mu * vb2 + derivative_b2(Ybatch, pYbatch) + reg * b2_tmp b2 -= lr * vb2 vW1 = mu * vW1 + derivative_w1(Xbatch, Z, Ybatch, pYbatch, W2_tmp) + reg * W1_tmp W1 -= lr * vW1 vb1 = mu * vb1 + derivative_b1(Z, Ybatch, pYbatch, W2_tmp) + reg * b1_tmp b1 -= lr * vb1 if j % print_period == 0: # calculate just for LL pY, _ = forward(Xtest, W1, b1, W2, b2) # print "pY:", pY ll = cost(pY, Ytest_ind) LL_nest.append(ll) print("Cost at iteration i=%d, j=%d: %.6f" % (i, j, ll)) err = error_rate(pY, Ytest) CR_nest.append(err) print("Error rate:", err) pY, _ = forward(Xtest, W1, b1, W2, b2) print("Final error rate:", error_rate(pY, Ytest)) plt.plot(LL_batch, label="batch") plt.plot(LL_momentum, label="momentum") plt.plot(LL_nest, label="nesterov") plt.legend() plt.show()
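The version above uses the lookahead formulation: the gradient is evaluated at the shifted weights W - lr*mu*v (the W1_tmp/W2_tmp variables), the velocity accumulates the raw gradient, and the parameters step by lr*v. A minimal sketch of that pattern on a toy quadratic follows; nesterov_lookahead_step and grad_fn are illustrative names, not taken from the script.

import numpy as np

def nesterov_lookahead_step(w, v, grad_fn, lr=1e-4, mu=0.9):
    # evaluate the gradient at the lookahead point w - lr*mu*v
    # (the W1_tmp/W2_tmp trick above), then accumulate it into the
    # velocity and step by lr*v
    w_tmp = w - lr * mu * v
    g = grad_fn(w_tmp)
    v = mu * v + g
    return w - lr * v, v

# toy usage on f(w) = 0.5*||w||^2, so grad f(w) = w
w = np.array([3.0, -2.0])
v = np.zeros_like(w)
for _ in range(200):
    w, v = nesterov_lookahead_step(w, v, grad_fn=lambda x: x, lr=0.1, mu=0.9)
print(np.round(w, 4))  # close to [0, 0]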
def main(): max_iter = 10 print_period = 10 X, Y = get_normalized_data() reg = 0.01 Xtrain = X[:-1000,] Ytrain = Y[:-1000] Xtest = X[-1000:,] Ytest = Y[-1000:] Ytrain_ind = y2indicator(Ytrain) Ytest_ind = y2indicator(Ytest) N, D = Xtrain.shape batch_sz = 500 n_batches = N // batch_sz M = 300 K = 10 W1_0 = np.random.randn(D, M) / np.sqrt(D) b1_0 = np.zeros(M) W2_0 = np.random.randn(M, K) / np.sqrt(M) b2_0 = np.zeros(K) W1 = W1_0.copy() b1 = b1_0.copy() W2 = W2_0.copy() b2 = b2_0.copy() # 1st moment mW1 = 0 mb1 = 0 mW2 = 0 mb2 = 0 # 2nd moment vW1 = 0 vb1 = 0 vW2 = 0 vb2 = 0 # hyperparams lr0 = 0.001 beta1 = 0.9 beta2 = 0.999 eps = 1e-8 # 1. Adam loss_adam = [] err_adam = [] t = 1 for i in range(max_iter): for j in range(n_batches): Xbatch = Xtrain[j*batch_sz:(j*batch_sz + batch_sz),] Ybatch = Ytrain_ind[j*batch_sz:(j*batch_sz + batch_sz),] pYbatch, Z = forward(Xbatch, W1, b1, W2, b2) # updates # gradients gW2 = derivative_w2(Z, Ybatch, pYbatch) + reg*W2 gb2 = derivative_b2(Ybatch, pYbatch) + reg*b2 gW1 = derivative_w1(Xbatch, Z, Ybatch, pYbatch, W2) + reg*W1 gb1 = derivative_b1(Z, Ybatch, pYbatch, W2) + reg*b1 # new m mW1 = beta1 * mW1 + (1 - beta1) * gW1 mb1 = beta1 * mb1 + (1 - beta1) * gb1 mW2 = beta1 * mW2 + (1 - beta1) * gW2 mb2 = beta1 * mb2 + (1 - beta1) * gb2 # new v vW1 = beta2 * vW1 + (1 - beta2) * gW1 * gW1 vb1 = beta2 * vb1 + (1 - beta2) * gb1 * gb1 vW2 = beta2 * vW2 + (1 - beta2) * gW2 * gW2 vb2 = beta2 * vb2 + (1 - beta2) * gb2 * gb2 # bias correction correction1 = 1 - beta1 ** t hat_mW1 = mW1 / correction1 hat_mb1 = mb1 / correction1 hat_mW2 = mW2 / correction1 hat_mb2 = mb2 / correction1 correction2 = 1 - beta2 ** t hat_vW1 = vW1 / correction2 hat_vb1 = vb1 / correction2 hat_vW2 = vW2 / correction2 hat_vb2 = vb2 / correction2 # update t t += 1 # apply updates to the params W1 = W1 - lr0 * hat_mW1 / np.sqrt(hat_vW1 + eps) b1 = b1 - lr0 * hat_mb1 / np.sqrt(hat_vb1 + eps) W2 = W2 - lr0 * hat_mW2 / np.sqrt(hat_vW2 + eps) b2 = b2 - lr0 * hat_mb2 / np.sqrt(hat_vb2 + eps) if j % print_period == 0: pY, _ = forward(Xtest, W1, b1, W2, b2) l = cost(pY, Ytest_ind) loss_adam.append(l) print("Cost at iteration i=%d, j=%d: %.6f" % (i, j, l)) err = error_rate(pY, Ytest) err_adam.append(err) print("Error rate:", err) pY, _ = forward(Xtest, W1, b1, W2, b2) print("Final error rate:", error_rate(pY, Ytest)) # 2. 
RMSprop with momentum W1 = W1_0.copy() b1 = b1_0.copy() W2 = W2_0.copy() b2 = b2_0.copy() loss_rms = [] err_rms = [] # comparable hyperparameters for fair comparison lr0 = 0.001 mu = 0.9 decay_rate = 0.999 eps = 1e-8 # rmsprop cache cache_W2 = 1 cache_b2 = 1 cache_W1 = 1 cache_b1 = 1 # momentum dW1 = 0 db1 = 0 dW2 = 0 db2 = 0 for i in range(max_iter): for j in range(n_batches): Xbatch = Xtrain[j*batch_sz:(j*batch_sz + batch_sz),] Ybatch = Ytrain_ind[j*batch_sz:(j*batch_sz + batch_sz),] pYbatch, Z = forward(Xbatch, W1, b1, W2, b2) # updates gW2 = derivative_w2(Z, Ybatch, pYbatch) + reg*W2 cache_W2 = decay_rate*cache_W2 + (1 - decay_rate)*gW2*gW2 dW2 = mu * dW2 + (1 - mu) * lr0 * gW2 / (np.sqrt(cache_W2) + eps) W2 -= dW2 gb2 = derivative_b2(Ybatch, pYbatch) + reg*b2 cache_b2 = decay_rate*cache_b2 + (1 - decay_rate)*gb2*gb2 db2 = mu * db2 + (1 - mu) * lr0 * gb2 / (np.sqrt(cache_b2) + eps) b2 -= db2 gW1 = derivative_w1(Xbatch, Z, Ybatch, pYbatch, W2) + reg*W1 cache_W1 = decay_rate*cache_W1 + (1 - decay_rate)*gW1*gW1 dW1 = mu * dW1 + (1 - mu) * lr0 * gW1 / (np.sqrt(cache_W1) + eps) W1 -= dW1 gb1 = derivative_b1(Z, Ybatch, pYbatch, W2) + reg*b1 cache_b1 = decay_rate*cache_b1 + (1 - decay_rate)*gb1*gb1 db1 = mu * db1 + (1 - mu) * lr0 * gb1 / (np.sqrt(cache_b1) + eps) b1 -= db1 if j % print_period == 0: pY, _ = forward(Xtest, W1, b1, W2, b2) l = cost(pY, Ytest_ind) loss_rms.append(l) print("Cost at iteration i=%d, j=%d: %.6f" % (i, j, l)) err = error_rate(pY, Ytest) err_rms.append(err) print("Error rate:", err) pY, _ = forward(Xtest, W1, b1, W2, b2) print("Final error rate:", error_rate(pY, Ytest)) plt.plot(loss_adam, label='adam') plt.plot(loss_rms, label='rmsprop') plt.legend() plt.show()
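The correction1/correction2 terms in the Adam script above exist because m and v start at zero, so the raw exponential averages underestimate the gradient statistics for small t. A quick scalar check, purely illustrative, shows the effect with a constant gradient:

# Minimal scalar illustration of Adam's bias correction. With m and v
# initialized to 0 the moving averages are biased toward 0 early on;
# dividing by (1 - beta**t) rescales them.
beta1, beta2 = 0.9, 0.999
m = v = 0.0
g = 1.0  # pretend the gradient is constantly 1
for t in range(1, 6):
    m = beta1 * m + (1 - beta1) * g
    v = beta2 * v + (1 - beta2) * g * g
    m_hat = m / (1 - beta1 ** t)
    v_hat = v / (1 - beta2 ** t)
    print(t, round(m, 4), round(m_hat, 4), round(v_hat, 4))
# m starts at 0.1 and only slowly approaches 1, while m_hat and v_hat
# are exactly 1 at every step for a constant gradient.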
def main(): # compare 2 scenarios # 1. batch SGD # 2. batch SGD with RMSprop max_iter = 15 print_period = 10 Xtrain, Xtest, Ytrain, Ytest = get_normalized_data() lr = 0.0001 reg = 0.001 # Xtrain = X[:-1000, ] # Ytrain = Y[:-1000] # Xtest = X[-1000:, ] # Ytest = Y[-1000:, ] Ytrain_ind = y2indicator(Ytrain) Ytest_ind = y2indicator(Ytest) N, D = Xtrain.shape batch_sz = 500 n_batches = int(N / batch_sz) M = 300 K = 10 W1 = np.random.randn(D, M) / 28 b1 = np.zeros(M) W2 = np.random.randn(M, K) / np.sqrt(M) b2 = np.zeros(K) W1_0 = W1.copy() b1_0 = b1.copy() W2_0 = W2.copy() b2_0 = b2.copy() # Batch losses_batch = [] error_batch = [] for i in range(max_iter): for j in range(n_batches): Xbatch = Xtrain[j * batch_sz:(j * batch_sz + batch_sz), ] Ybatch = Ytrain_ind[j * batch_sz:(j * batch_sz + batch_sz), ] pYbatch, Z = forward(Xbatch, W1, b1, W2, b2) # updates W2 -= lr * (derivative_w2(Z, Ybatch, pYbatch) + reg * W2) b2 -= lr * (derivative_b2(Ybatch, pYbatch) + reg * b2) W1 -= lr * (derivative_w1(Xbatch, Z, Ybatch, pYbatch, W2) + reg * W1) b1 -= lr * (derivative_b1(Z, Ybatch, pYbatch, W2) + reg * b1) if j % print_period == 0: pY, _ = forward(Xtest, W1, b1, W2, b2) l = cost(pY, Ytest_ind) losses_batch.append(l) print("Cost at iteration i=%d, j=%d: %.6f" % (i, j, l)) e = error_rate(pY, Ytest) error_batch.append(e) print("Error rate:", e) pY, _ = forward(Xtest, W1, b1, W2, b2) print("Final error rate: ", error_rate(pY, Ytest)) # RMSprop W1 = W1_0.copy() b1 = b1_0.copy() W2 = W2_0.copy() b2 = b2_0.copy() losses_rms = [] errors_rms = [] lr0 = 0.001 cacheW2 = 0 cacheb2 = 0 cacheW1 = 0 cacheb1 = 0 decay_rate = 0.99 eps = 0.000001 for i in range(max_iter): for j in range(n_batches): Xbatch = Xtrain[j * batch_sz:(j * batch_sz + batch_sz), ] Ybatch = Ytrain_ind[j * batch_sz:(j * batch_sz + batch_sz), ] pYbatch, Z = forward(Xbatch, W1, b1, W2, b2) # gradients gW2 = (derivative_w2(Z, Ybatch, pYbatch) + reg * W2) gb2 = (derivative_b2(Ybatch, pYbatch) + reg * b2) gW1 = (derivative_w1(Xbatch, Z, Ybatch, pYbatch, W2) + reg * W1) gb1 = (derivative_b1(Z, Ybatch, pYbatch, W2) + reg * b1) # caches cacheW2 = decay_rate * cacheW2 + (1 - decay_rate) * gW2 * gW2 cacheb2 = decay_rate * cacheb2 + (1 - decay_rate) * gb2 * gb2 cacheW1 = decay_rate * cacheW1 + (1 - decay_rate) * gW1 * gW1 cacheb1 = decay_rate * cacheb1 + (1 - decay_rate) * gb1 * gb1 W2 -= lr0 * gW2 / (np.sqrt(cacheW2) + eps) b2 -= lr0 * gb2 / (np.sqrt(cacheb2) + eps) W1 -= lr0 * gW1 / (np.sqrt(cacheW1) + eps) b1 -= lr0 * gb1 / (np.sqrt(cacheb1) + eps) if j % print_period == 0: pY, _ = forward(Xtest, W1, b1, W2, b2) l = cost(pY, Ytest_ind) losses_rms.append(l) print("Cost at iteration i=%d, j=%d: %.6f" % (i, j, l)) e = error_rate(pY, Ytest) errors_rms.append(e) print("Error rate:", e) pY, _ = forward(Xtest, W1, b1, W2, b2) print("Final error rate: ", error_rate(pY, Ytest)) plt.plot(losses_batch, label='batch') plt.plot(losses_rms, label='rmsprop') plt.legend() plt.show()
def main(): max_iter = 20 print_period = 20 Xtrain, Xtest, Ytrain, Ytest = get_normalized_data() Ytrain_ind = y2indicator(Ytrain) # Target of train data Ytest_ind = y2indicator(Ytest) # Target of test data lr = 0.00004 reg = 0.01 N, D = Xtrain.shape M = 300 K = 10 np.random.seed(123) W1 = np.random.randn(D, M) / np.sqrt(D) b1 = np.zeros(M) W2 = np.random.randn(M, K) / np.sqrt(M) b2 = np.zeros(K) batch_sz = 500 n_batches = N // batch_sz # 82 # save initial weights W1_0 = W1.copy() b1_0 = b1.copy() W2_0 = W2.copy() b2_0 = b2.copy() # 1. learning rate = constant losses_batch = [] errors_batch = [] for i in range(max_iter): # Xtrain, Ytrain_ind = shuffle(Xtrain, Ytrain_ind) for j in range(n_batches): Xbatch = Xtrain[j*batch_sz:(j*batch_sz + batch_sz),] Ybatch = Ytrain_ind[j*batch_sz:(j*batch_sz + batch_sz),] # Target of each batch pYbatch, Z = forward(Xbatch, W1, b1, W2, b2) # print "first batch cost:", cost(pYbatch, Ybatch) # updates W2 -= lr*(derivative_w2(Z, Ybatch, pYbatch) + reg*W2) b2 -= lr*(derivative_b2(Ybatch, pYbatch) + reg*b2) W1 -= lr*(derivative_w1(Xbatch, Z, Ybatch, pYbatch, W2) + reg*W1) b1 -= lr*(derivative_b1(Z, Ybatch, pYbatch, W2) + reg*b1) if j % print_period == 0: pY, _ = forward(Xtest, W1, b1, W2, b2) l = cost(pY, Ytest_ind) losses_batch.append(l) print("Cost at iteration i=%d, j=%d: %.6f" % (i, j, l)) e = error_rate(pY, Ytest) errors_batch.append(e) print("Error rate:", e) pY, _ = forward(Xtest, W1, b1, W2, b2) print("Final error rate:", error_rate(pY, Ytest)) # 2. RMSprop W1 = W1_0.copy() b1 = b1_0.copy() W2 = W2_0.copy() b2 = b2_0.copy() losses_rms = [] errors_rms = [] ''' in RMSprop you can use a bigger lr, but if you set this too high you'll get NaN! if you use the same learning rate within RMSprop and General method, there is only slight difference between them. 
''' lr0 = 0.001 cache_W2 = 1 cache_b2 = 1 cache_W1 = 1 cache_b1 = 1 decay_rate = 0.999 eps = 1e-10 for i in range(max_iter): # Xtrain, Ytrain_ind = shuffle(Xtrain, Ytrain_ind) for j in range(n_batches): Xbatch = Xtrain[j*batch_sz:(j*batch_sz + batch_sz),] Ybatch = Ytrain_ind[j*batch_sz:(j*batch_sz + batch_sz),] # Target of each batch pYbatch, Z = forward(Xbatch, W1, b1, W2, b2) # print "first batch cost:", cost(pYbatch, Ybatch) # # update # cache_W2 = decay_rate*cache_W2 + (1 - decay_rate)*np.square(derivative_w2(Z, Ybatch, pYbatch) + reg*W2) # W2 -= lr0 / (np.sqrt(cache_W2) + eps) *(derivative_w2(Z, Ybatch, pYbatch) + reg*W2) # cache_b2 = decay_rate*cache_b2 + (1 - decay_rate)*np.square(derivative_b2(Ybatch, pYbatch) + reg*b2) # b2 -= lr0 / (np.sqrt(cache_b2) + eps) *(derivative_b2(Ybatch, pYbatch) + reg*b2) # cache_W1 = decay_rate*cache_W1 + (1 - decay_rate)*np.square(derivative_w1(Xbatch, Z, Ybatch, pYbatch, W2)) # W1 -= lr0 / (np.sqrt(cache_W1) + eps) *(derivative_w1(Xbatch, Z, Ybatch, pYbatch, W2) + reg*W1) # cache_b1 = decay_rate*cache_b1 + (1 - decay_rate)*np.square(derivative_b1(Z, Ybatch, pYbatch, W2) + reg*b1) # b1 -= lr0 / (np.sqrt(cache_b1) + eps) *(derivative_b1(Z, Ybatch, pYbatch, W2) + reg*b1) # updates # a smarter way to write the commented block above is to pull out the repeated sub-expressions, compute each once, and assign it to a variable; this speeds things up gW2 = derivative_w2(Z, Ybatch, pYbatch) + reg*W2 cache_W2 = decay_rate*cache_W2 + (1 - decay_rate)*gW2*gW2 W2 -= lr0 * gW2 / (np.sqrt(cache_W2) + eps) gb2 = derivative_b2(Ybatch, pYbatch) + reg*b2 cache_b2 = decay_rate*cache_b2 + (1 - decay_rate)*gb2*gb2 b2 -= lr0 * gb2 / (np.sqrt(cache_b2) + eps) gW1 = derivative_w1(Xbatch, Z, Ybatch, pYbatch, W2) + reg*W1 cache_W1 = decay_rate*cache_W1 + (1 - decay_rate)*gW1*gW1 W1 -= lr0 * gW1 / (np.sqrt(cache_W1) + eps) gb1 = derivative_b1(Z, Ybatch, pYbatch, W2) + reg*b1 cache_b1 = decay_rate*cache_b1 + (1 - decay_rate)*gb1*gb1 b1 -= lr0 * gb1 / (np.sqrt(cache_b1) + eps) if j % print_period == 0: pY, _ = forward(Xtest, W1, b1, W2, b2) l = cost(pY, Ytest_ind) losses_rms.append(l) print("Cost at iteration i=%d, j=%d: %.6f" % (i, j, l)) e = error_rate(pY, Ytest) errors_rms.append(e) print("Error rate:", e) pY, _ = forward(Xtest, W1, b1, W2, b2) print("Final error rate:", error_rate(pY, Ytest)) plt.plot(losses_batch, label='constant') plt.plot(losses_rms, label='RMSprop') plt.legend() plt.show()
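The comment above notes that repeated sub-expressions should be computed once. The same idea applies one level up: the identical RMSprop rule is written out four times for W2, b2, W1 and b1, so it can be expressed once and looped over the parameters. A minimal sketch, with illustrative names (rmsprop_update, params, caches, grads) that are not part of the script:

import numpy as np

def rmsprop_update(params, caches, grads, lr0=0.001, decay_rate=0.999, eps=1e-10):
    # one RMSprop step applied uniformly to every parameter, so the
    # W2/b2/W1/b1 blocks don't have to be written out separately
    for k in params:
        caches[k] = decay_rate * caches[k] + (1 - decay_rate) * grads[k] * grads[k]
        params[k] -= lr0 * grads[k] / (np.sqrt(caches[k]) + eps)
    return params, caches

# toy usage with two fake parameters
params = {"W": np.ones((3, 3)), "b": np.zeros(3)}
caches = {k: np.ones_like(v) for k, v in params.items()}
grads = {"W": 0.1 * np.ones((3, 3)), "b": 0.2 * np.ones(3)}
params, caches = rmsprop_update(params, caches, grads)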
def main(): max_iter = 20 print_period = 10 X_train, X_test, Y_train, Y_test = get_normalized_data() lr = 0.00004 reg = 0.01 Y_train_ind = y2indicator(Y_train) Y_test_ind = y2indicator(Y_test) N, D = X_train.shape batch_sz = 500 n_batches = N // batch_sz M = 300 K = 10 W1 = np.random.randn(D, M) / 28 b1 = np.zeros(M) W2 = np.random.randn(M, K) / np.sqrt(K) b2 = np.zeros(K) # copy weights W1_0 = W1.copy() b1_0 = b1.copy() W2_0 = W2.copy() b2_0 = b2.copy() # 1. Constant Learning rate losses_batch = [] errors_batch = [] for i in range(max_iter): for j in range(n_batches): Xbatch = X_train[j * batch_sz:(j * batch_sz + batch_sz), ] Ybatch = Y_train_ind[j * batch_sz:(j * batch_sz + batch_sz), ] pYbatch, Z = forward(Xbatch, W1, b1, W2, b2) # updates W2 -= lr * (derivative_w2(Z, Ybatch, pYbatch) + reg * W2) b2 -= lr * (derivative_b2(Ybatch, pYbatch) + reg * b2) W1 -= lr * (derivative_w1(Xbatch, Z, Ybatch, pYbatch, W2) + reg * W1) b1 -= lr * (derivative_b1(Z, Ybatch, pYbatch, W2) + reg * b1) if j % print_period == 0: pY, _ = forward(X_test, W1, b1, W2, b2) l = cost(pY, Y_test_ind) losses_batch.append(l) print(f'Cost at iteration i={i}, j={j} : {l}') e = error_rate(pY, Y_test) errors_batch.append(e) print("error_rate", e) pY, _ = forward(X_test, W1, b1, W2, b2) batch_error = error_rate(pY, Y_test) print(f"Final batch error rate: {batch_error}") # 2. RMSProp W1 = W1_0.copy() b1 = b1_0.copy() W2 = W2_0.copy() b2 = b2_0.copy() losses_RMSP = [] errors_RMSP = [] lr0 = 0.001 cache_W1 = 1 cache_b1 = 1 cache_W2 = 1 cache_b2 = 1 decay = 0.999 epsilon = 1e-10 for i in range(max_iter): for j in range(n_batches): Xbatch = X_train[j * batch_sz:(j * batch_sz + batch_sz), ] Ybatch = Y_train_ind[j * batch_sz:(j * batch_sz + batch_sz), ] pYbatch, Z = forward(Xbatch, W1, b1, W2, b2) gW2 = derivative_w2(Z, Ybatch, pYbatch) + reg * W2 cache_W2 = decay * cache_W2 + (1 - decay) * gW2 * gW2 W2 -= lr0 * gW2 / (np.sqrt(cache_W2) + epsilon) gb2 = derivative_b2(Ybatch, pYbatch) + reg * b2 cache_b2 = decay * cache_b2 + (1 - decay) * gb2 * gb2 b2 -= lr0 * gb2 / (np.sqrt(cache_b2) + epsilon) gW1 = derivative_w1(Xbatch, Z, Ybatch, pYbatch, W2) + reg * W1 cache_W1 = decay * cache_W1 + (1 - decay) * gW1 * gW1 W1 -= lr0 * gW1 / (np.sqrt(cache_W1) + epsilon) gb1 = derivative_b1(Z, Ybatch, pYbatch, W2) + reg * b1 cache_b1 = decay * cache_b1 + (1 - decay) * gb1 * gb1 b1 -= lr0 * gb1 / (np.sqrt(cache_b1) + epsilon) if j % print_period == 0: pY, _ = forward(X_test, W1, b1, W2, b2) l = cost(pY, Y_test_ind) losses_RMSP.append(l) print(f'Cost at iteration i={i}, j={j} : {l}') e = error_rate(pY, Y_test) errors_RMSP.append(e) print("error_rate", e) pY, _ = forward(X_test, W1, b1, W2, b2) print(f"Final RMSProp error rate: {error_rate(pY, Y_test)}") print(f"Final batch error rate: {batch_error}") plt.plot(losses_batch, label='batch cost') plt.plot(losses_RMSP, label='RMSProp cost') plt.legend() plt.show()
def main(): # compare 2 scenarios: # 1. batch GD with RMSProp and momentum # 2. Adam GD max_iter = 20 print_period = 10 X, Y = get_normalized_data() reg = 0.01 Xtrain, Ytrain = X[:-1000, :], Y[:-1000] Xtest, Ytest = X[-1000:, :], Y[-1000:] Ytrain_ind = y2indicator(Ytrain) Ytest_ind = y2indicator(Ytest) N, D = Xtrain.shape M = 300 # number of hidden layer units K = len(set(Ytrain)) batch_size = 500 n_batches = N // batch_size # randomly initialize weights: W1_0 = np.random.randn(D, M) / np.sqrt(D) b1_0 = np.zeros(M) W2_0 = np.random.randn(M, K) / np.sqrt(M) b2_0 = np.zeros(K) # 1. batch GD with RMSProp and momentum: print('\nperforming batch GD with RMSProp and momentum...') W1 = W1_0.copy() b1 = b1_0.copy() W2 = W2_0.copy() b2 = b2_0.copy() LL_rm = [] CR_rm = [] # hyperparams: lr0 = 0.001 #lr0 = 0.0001 mu = 0.9 decay = 0.999 eps = 10e-9 # momentum (velocity terms): dW1 = 0 db1 = 0 dW2 = 0 db2 = 0 # rms-prop cache (with no bias correction): cache_W2 = 1 cache_b2 = 1 cache_W1 = 1 cache_b1 = 1 t0 = datetime.now() for i in range(max_iter): for j in range(n_batches): Xbatch = Xtrain[j * batch_size:(j + 1) * batch_size, :] Ybatch = Ytrain_ind[j * batch_size:(j + 1) * batch_size, :] p_Ybatch, Z = forward(Xbatch, W1, b1, W2, b2) #print(Z.shape, p_Ybatch.shape, Ybatch.shape) #print('First batch cost:', cost(p_Ybatch, Ybatch)) # updates: # (note: we utilize a bit different version of momentum) gW2 = (derivative_W2(Z, Ybatch, p_Ybatch) + reg * W2) cache_W2 = decay * cache_W2 + (1 - decay) * gW2 * gW2 dW2 = mu * dW2 + (1 - mu) * lr0 * gW2 / (np.sqrt(cache_W2 + eps)) W2 -= dW2 #dW2 = mu*dW2 - lr0*gW2 / (np.sqrt(cache_W2) + eps) #W2 += dW2 gb2 = (derivative_b2(Ybatch, p_Ybatch) + reg * b2) cache_b2 = decay * cache_b2 + (1 - decay) * gb2 * gb2 db2 = mu * db2 + (1 - mu) * lr0 * gb2 / (np.sqrt(cache_b2 + eps)) b2 -= db2 #db2 = mu*db2 - lr0*gb2 / (np.sqrt(cache_b2) + eps) #b2 += db2 gW1 = (derivative_W1(Xbatch, Z, Ybatch, p_Ybatch, W2) + reg * W1) cache_W1 = decay * cache_W1 + (1 - decay) * gW1 * gW1 dW1 = mu * dW1 + (1 - mu) * lr0 * gW1 / (np.sqrt(cache_W1 + eps)) W1 -= dW1 #dW1 = mu*dW1 - lr0*gW1 / (np.sqrt(cache_W1) + eps) #W1 += dW1 gb1 = (derivative_b1(Z, Ybatch, p_Ybatch, W2) + reg * b1) cache_b1 = decay * cache_b1 + (1 - decay) * gb1 * gb1 db1 = mu * db1 + (1 - mu) * lr0 * gb1 / (np.sqrt(cache_b1 + eps)) b1 -= db1 #db1 = mu*db1 - lr0*gb1 / (np.sqrt(cache_b1) + eps) #b1 += db1 if j % print_period == 0: pY, _ = forward(Xtest, W1, b1, W2, b2) #print('pY:', pY) ll = cost(pY, Ytest_ind) LL_rm.append(ll) print('\ni: %d, j: %d, cost: %.6f' % (i, j, ll)) error = error_rate(pY, Ytest) CR_rm.append(error) print('error rate:', error) dt1 = datetime.now() - t0 pY, _ = forward(Xtest, W1, b1, W2, b2) print('\nFinal error rate:', error_rate(pY, Ytest)) print('Elapsed time for batch GD with RMSProp and momentum:', dt1) # plot the cost plt.plot(LL_rm) plt.title('Cost for batch GD with RMSProp and momentum') plt.show() # 2. 
Adam optimizer print('\nperforming Adam optimizer...') W1 = W1_0.copy() b1 = b1_0.copy() W2 = W2_0.copy() b2 = b2_0.copy() # hyperparams: lr = 0.001 beta1 = 0.9 beta2 = 0.999 eps = 10e-9 # 1st moment: mW1 = 0 mb1 = 0 mW2 = 0 mb2 = 0 # 2nd moment: vW1 = 0 vb1 = 0 vW2 = 0 vb2 = 0 LL_adam = [] CR_adam = [] t0 = datetime.now() t = 1 # index; used instead of j, because j starts with 0 for i in range(max_iter): for j in range(n_batches): Xbatch = Xtrain[j * batch_size:(j + 1) * batch_size, :] Ybatch = Ytrain_ind[j * batch_size:(j + 1) * batch_size, :] p_Ybatch, Z = forward(Xbatch, W1, b1, W2, b2) # updates: # gradients: gW2 = derivative_W2(Z, Ybatch, p_Ybatch) + reg * W2 gb2 = derivative_b2(Ybatch, p_Ybatch) + reg * b2 gW1 = derivative_W1(Xbatch, Z, Ybatch, p_Ybatch, W2) + reg * W1 gb1 = derivative_b1(Z, Ybatch, p_Ybatch, W2) + reg * b1 # 1st moment: mW2 = beta1 * mW2 + (1 - beta1) * gW2 mb2 = beta1 * mb2 + (1 - beta1) * gb2 mW1 = beta1 * mW1 + (1 - beta1) * gW1 mb1 = beta1 * mb1 + (1 - beta1) * gb1 # 2nd moment: vW2 = beta2 * vW2 + (1 - beta2) * gW2 * gW2 vb2 = beta2 * vb2 + (1 - beta2) * gb2 * gb2 vW1 = beta2 * vW1 + (1 - beta2) * gW1 * gW1 vb1 = beta2 * vb1 + (1 - beta2) * gb1 * gb1 # bias correction: mW2_bc = mW2 / (1 - beta1**t) mb2_bc = mb2 / (1 - beta1**t) mW1_bc = mW1 / (1 - beta1**t) mb1_bc = mb1 / (1 - beta1**t) vW2_bc = vW2 / (1 - beta2**t) vb2_bc = vb2 / (1 - beta2**t) vW1_bc = vW1 / (1 - beta2**t) vb1_bc = vb1 / (1 - beta2**t) # weights and biases (parameters): W2 = W2 - lr * mW2_bc / np.sqrt(vW2_bc + eps) b2 = b2 - lr * mb2_bc / np.sqrt(vb2_bc + eps) W1 = W1 - lr * mW1_bc / np.sqrt(vW1_bc + eps) b1 = b1 - lr * mb1_bc / np.sqrt(vb1_bc + eps) t += 1 if j % print_period == 0: pY, _ = forward(Xtest, W1, b1, W2, b2) ll = cost(pY, Ytest_ind) LL_adam.append(ll) print('\ni: %d, j: %d, cost: %.6f' % (i, j, ll)) error = error_rate(pY, Ytest) CR_adam.append(error) print('error rate:', error) dt2 = datetime.now() - t0 pY, _ = forward(Xtest, W1, b1, W2, b2) print('\nFinal error rate:', error_rate(pY, Ytest)) print('Elapsed time for Adam optimizer:', dt2) # plot the cost plt.plot(LL_adam) plt.title('Cost for Adam optimizer') plt.show() # plot costs from the two experiments together: plt.plot(LL_rm, label='RMSProp with momentum') plt.plot(LL_adam, label='Adam optimizer') plt.title('Cost') plt.legend() plt.show()
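One detail worth noting when comparing these scripts: the RMSProp-with-momentum block above adds eps inside the square root (np.sqrt(cache + eps)), while most of the other versions add it outside (np.sqrt(cache) + eps). Both prevent division by zero, but they bound the worst-case step differently. A small numeric illustration; the values are only for demonstration.

import numpy as np

cache = np.array([0.0, 1e-12, 1.0])
g = np.ones(3)
lr0, eps = 0.001, 1e-8

# eps added inside the square root (as in the RMSProp-with-momentum block above)
step_inside = lr0 * g / np.sqrt(cache + eps)
# eps added outside the square root (as in most of the other scripts)
step_outside = lr0 * g / (np.sqrt(cache) + eps)

print(step_inside)   # bounded by lr0/sqrt(eps) = 0.001/1e-4 = 10
print(step_outside)  # reaches lr0/eps = 1e5 when the cache is exactly 0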
def main(): # compare 3: # 1. batch SGD # 2. batch SGD with momentum # 3. batch SGD with Nesterov momentum # all with L2 regularization print_period = 10 X, Y = get_normalized_data() lr = 0.00004 reg = 0.01 Xtrain = X[:-1000, ] Ytrain = Y[:-1000] Xtest = X[-1000:, ] Ytest = Y[-1000:] N, D = Xtrain.shape Ytrain_ind = y2indicator(Ytrain) Ytest_ind = y2indicator(Ytest) M = 300 K = 10 W1 = np.random.randn(D, M) / 28 b1 = np.zeros(M) W2 = np.random.randn(M, K) / np.sqrt(M) b2 = np.zeros(K) W1_0 = W1.copy() b1_0 = b1.copy() W2_0 = W2.copy() b2_0 = b2.copy() # regular batch gradient descend epochs = 30 tr_costs = [] errors_batch = [] losses_test = [] batch_size = 500 number_batches = int(N // batch_size) #max_iter = 30 # 1. for epoch in range(epochs): for j in range(number_batches): xtr = Xtrain[j * batch_size:(j * batch_size + batch_size), :] ytr = Ytrain_ind[j * batch_size:(j * batch_size + batch_size), :] ytr_pred, z_tr = forward(xtr, W1, b1, W2, b2) W2 -= lr * (derivative_w2(z_tr, ytr, ytr_pred) + reg * W2) b2 -= lr * (derivative_b2(ytr, ytr_pred) + reg * b2) W1 -= lr * (derivative_w1(xtr, z_tr, ytr, ytr_pred, W2) + reg * W1) b1 -= lr * (derivative_b1(z_tr, ytr, ytr_pred, W2) + reg * b1) if j % print_period == 0: yte_pred, _ = forward(Xtest, W1, b1, W2, b2) l = cost(yte_pred, Ytest_ind) losses_test.append(l) print("test set Cost at iteration epoch=%d, j=%d: %.6f" % (epoch, j, l)) e = error_rate(yte_pred, Ytest) errors_batch.append(e) print("Error rate:", e) ctr = cost(ytr_pred, ytr) print("traning set cost", ctr) tr_costs.append(ctr) pY, _ = forward(Xtest, W1, b1, W2, b2) #plt.plot(tr_costs, label='tr_costs') plt.plot(losses_test, label='losses_test') #plt.plot(errors_batch, label='errors_batch') # plt.show() # print("tr_costs", tr_costs) print("Final error rate:", error_rate(pY, Ytest)) # 2. W1 = W1_0.copy() b1 = b1_0.copy() W2 = W2_0.copy() b2 = b2_0.copy() # regular batch gradient descend tr_costs_momentum = [] errors_batch_momentum = [] losses_test_momentum = [] # momentum coeficient mu = 0.9 dW1 = 0 dW2 = 0 db1 = 0 db2 = 0 for epoch in range(epochs): for j in range(number_batches): xtr = Xtrain[j * batch_size:(j * batch_size + batch_size), :] ytr = Ytrain_ind[j * batch_size:(j * batch_size + batch_size), :] ytr_pred, z_tr = forward(xtr, W1, b1, W2, b2) # gradients gW2 = derivative_w2(z_tr, ytr, ytr_pred) + reg * W2 gb2 = derivative_b2(ytr, ytr_pred) + reg * b2 gW1 = derivative_w1(xtr, z_tr, ytr, ytr_pred, W2) + reg * W1 gb1 = derivative_b1(z_tr, ytr, ytr_pred, W2) + reg * b1 # update velocity dW2 = mu * dW2 - lr * gW2 db2 = mu * db2 - lr * gb2 dW1 = mu * dW1 - lr * gW1 db1 = mu * db1 - lr * gb1 # update W2 += dW2 W1 += dW1 b2 += db2 b1 += db1 if j % print_period == 0: yte_pred, _ = forward(Xtest, W1, b1, W2, b2) l = cost(yte_pred, Ytest_ind) losses_test_momentum.append(l) print("test set Cost at iteration epoch=%d, j=%d: %.6f" % (epoch, j, l)) e = error_rate(yte_pred, Ytest) errors_batch_momentum.append(e) print("Error rate:", e) ctr = cost(ytr_pred, ytr) print("traning set cost", ctr) tr_costs_momentum.append(ctr) pY, _ = forward(Xtest, W1, b1, W2, b2) #plt.plot(tr_costs_momentum, label='tr_costs momentum') plt.plot(losses_test_momentum, label='losses_test momentum') #plt.plot(errors_batch, label='errors_batch') # plt.show() # print("tr_costs", errors_batch_momentum) print("Final error rate:", error_rate(pY, Ytest)) # 3. 
W1 = W1_0.copy() b1 = b1_0.copy() W2 = W2_0.copy() b2 = b2_0.copy() # regular batch gradient descend tr_costs_nesterov = [] errors_batch_nesterov = [] losses_test_nesterov = [] # momentum coeficient mu = 0.9 vW1 = 0 vW2 = 0 vb1 = 0 vb2 = 0 for epoch in range(epochs): for j in range(number_batches): xtr = Xtrain[j * batch_size:(j * batch_size + batch_size), :] ytr = Ytrain_ind[j * batch_size:(j * batch_size + batch_size), :] ytr_pred, z_tr = forward(xtr, W1, b1, W2, b2) # gradients gW2 = derivative_w2(z_tr, ytr, ytr_pred) + reg * W2 gb2 = derivative_b2(ytr, ytr_pred) + reg * b2 gW1 = derivative_w1(xtr, z_tr, ytr, ytr_pred, W2) + reg * W1 gb1 = derivative_b1(z_tr, ytr, ytr_pred, W2) + reg * b1 # update velocity vW2 = mu * vW2 - lr * gW2 vb2 = mu * vb2 - lr * gb2 vW1 = mu * vW1 - lr * gW1 vb1 = mu * vb1 - lr * gb1 # update W2 += mu * vW2 - lr * gW2 W1 += mu * vW1 - lr * gW1 b2 += mu * vb2 - lr * gb2 b1 += mu * vb1 - lr * gb1 if j % print_period == 0: yte_pred, _ = forward(Xtest, W1, b1, W2, b2) l = cost(yte_pred, Ytest_ind) losses_test_nesterov.append(l) print("test set Cost at iteration epoch=%d, j=%d: %.6f" % (epoch, j, l)) e = error_rate(yte_pred, Ytest) errors_batch_nesterov.append(e) print("Error rate:", e) ctr = cost(ytr_pred, ytr) print("traning set cost", ctr) tr_costs_nesterov.append(ctr) pY, _ = forward(Xtest, W1, b1, W2, b2) #plt.plot(tr_costs_nesterov, label='tr_costs_nesterov') plt.plot(losses_test_nesterov, label='losses_test_nesterov') #plt.plot(errors_batch_nesterov, label='errors_batch') plt.legend() plt.show() # print("tr_costs_nesterov", errors_batch_momentum) print("Final error rate nesterov:", error_rate(pY, Ytest))
def main(): ''' RMSprop is a form adaptative learning rate which decreases over time ''' max_iter = 20 #for RelU #max_iter = 30 #for sigmoid print_period = 10 X, Y = get_normalized_data() lr = 0.0004 reg = 0.01 Xtrain = X[:-1000,] Ytrain = Y[:-1000] Xtest = X[-1000:,] Ytest = Y[-1000:] Ytrain_ind = y2indicator(Ytrain) Ytest_ind = y2indicator(Ytest) N, D = Xtrain.shape batch_sz = 500 n_batches = N / batch_sz M =300 K=10 #1. batch SGD W1 = np.random.randn(D, M) / 28 b1 = np.zeros(M) W2 = np.random.randn(M, K) / np.sqrt(M) b2 = np.zeros(K) LL_batch = [] CR_batch = [] for i in xrange(max_iter): for j in xrange(n_batches): Xbatch = Xtrain[j*batch_sz:((j+1)*batch_sz), :] Ybatch = Ytrain_ind[j*batch_sz:((j+1)*batch_sz), :] pYbatch, Z = forward(Xbatch, W1, b1, W2, b2) W2 -= lr*(derivative_w2(Z, Ybatch, pYbatch) + reg*W2) b2 -= lr*(derivative_b2(Ybatch,pYbatch) + reg*b2) W1 -= lr*(derivative_w1(Xbatch, Z, Ybatch, pYbatch, W2) + reg*W1) b1 -= lr*(derivative_b1(Z, Ybatch, pYbatch, W2) + reg*b1) if j % print_period ==0: pY, _ = forward(Xtest, W1, b1, W2, b2) ll = cost(pY, Ytest_ind) LL_batch.append(ll) print "Cost at iteration i=%d, j=%d: %.6f" % (i, j, ll) err = error_rate(pY, Ytest) CR_batch.append(err) print "Error rate:", err pY, _ = forward(Xtest, W1, b1, W2, b2) print "Final error rate:", error_rate(pY, Ytest) #2. RMSProp W1 = np.random.randn(D, M) / 28 b1 = np.zeros(M) W2 = np.random.randn(M, K) / np.sqrt(M) b2 = np.zeros(K) LL_rms = [] CR_rms = [] lr0 = 0.001 cache_W2 = 0 cache_b2 = 0 cache_W1 = 0 cache_b1 = 0 decay_rate = 1 - 1e-5 eps = 1e-10 for i in xrange(max_iter): for j in xrange(n_batches): Xbatch = Xtrain[j*batch_sz:((j+1)*batch_sz), :] Ybatch = Ytrain_ind[j*batch_sz:((j+1)*batch_sz), :] pYbatch, Z = forward(Xbatch, W1, b1, W2, b2) #updates gW2 = derivative_w2(Z, Ybatch, pYbatch) + reg*W2 cache_W2 = decay_rate*cache_W2 + (1-decay_rate)*gW2*gW2 W2 -= lr0*gW2 /(np.sqrt(cache_W2) + eps) gb2 = derivative_b2(Ybatch,pYbatch) + reg*b2 cache_b2 = decay_rate*cache_b2 + (1-decay_rate)*gb2*gb2 b2 -= lr0*gb2 /(np.sqrt(cache_b2) + eps) gW1 = derivative_w1(Xbatch, Z, Ybatch, pYbatch, W2) + reg*W1 cache_W1 = decay_rate*cache_W1 + (1-decay_rate)*gW1*gW1 W1 -= lr0*gW1 /(np.sqrt(cache_W1) + eps) gb1 = derivative_b1(Z, Ybatch, pYbatch, W2) + reg*b1 cache_b1 = decay_rate*cache_b1 + (1-decay_rate)*gb1*gb1 b1 -= lr0*gb1 /(np.sqrt(cache_b1) + eps) if j % print_period ==0: pY, _ = forward(Xtest, W1, b1, W2, b2) ll = cost(pY, Ytest_ind) LL_rms.append(ll) print "RMS Cost at iteration i=%d, j=%d: %.6f" % (i, j, ll) err = error_rate(pY, Ytest) CR_rms.append(err) print "RMS Error rate:", err pY, _ = forward(Xtest, W1, b1, W2, b2) print "RMS Final error rate:", error_rate(pY, Ytest) plt.plot(LL_batch, label='batch') plt.plot(LL_rms, label='rms') plt.legend() plt.show()
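This version uses decay_rate = 1 - 1e-5, which is much closer to 1 than the 0.999 used in the other RMSprop scripts, so the cache adapts very slowly and, starting from 0, stays far below the squared-gradient scale for a long time. A rough closed-form check, purely illustrative:

import numpy as np

g2 = 1.0  # pretend the squared gradient is constantly 1
for decay_rate in (0.999, 1 - 1e-5):
    for t in (1, 10, 100, 1000):
        # closed form of the running average after t identical updates,
        # starting from a cache of 0
        cache = (1 - decay_rate ** t) * g2
        print(decay_rate, t, round(float(np.sqrt(cache)), 4))
# with decay_rate = 1 - 1e-5 the cache is still tiny after 1000 updates,
# so the effective step size stays much larger than it would with 0.999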
def main(): # compare 3 scenarios: # 1. batch SGD # 2. batch SGD with momentum # 3. batch SGD with Nesterov momentum max_iter = 20 # make it 30 for sigmoid print_period = 50 Xtrain, Xtest, Ytrain, Ytest = get_normalized_data() lr = 0.00004 reg = 0.01 Ytrain_ind = y2indicator(Ytrain) Ytest_ind = y2indicator(Ytest) N, D = Xtrain.shape batch_sz = 500 n_batches = N // batch_sz M = 300 K = 10 W1 = np.random.randn(D, M) / np.sqrt(D) b1 = np.zeros(M) W2 = np.random.randn(M, K) / np.sqrt(M) b2 = np.zeros(K) # save initial weights W1_0 = W1.copy() b1_0 = b1.copy() W2_0 = W2.copy() b2_0 = b2.copy() # 1. batch losses_batch = [] errors_batch = [] for i in range(max_iter): for j in range(n_batches): Xbatch = Xtrain[j*batch_sz:(j*batch_sz + batch_sz),] Ybatch = Ytrain_ind[j*batch_sz:(j*batch_sz + batch_sz),] pYbatch, Z = forward(Xbatch, W1, b1, W2, b2) # print "first batch cost:", cost(pYbatch, Ybatch) # updates W2 -= lr*(derivative_w2(Z, Ybatch, pYbatch) + reg*W2) b2 -= lr*(derivative_b2(Ybatch, pYbatch) + reg*b2) W1 -= lr*(derivative_w1(Xbatch, Z, Ybatch, pYbatch, W2) + reg*W1) b1 -= lr*(derivative_b1(Z, Ybatch, pYbatch, W2) + reg*b1) if j % print_period == 0: pY, _ = forward(Xtest, W1, b1, W2, b2) l = cost(pY, Ytest_ind) losses_batch.append(l) print("Cost at iteration i=%d, j=%d: %.6f" % (i, j, l)) e = error_rate(pY, Ytest) errors_batch.append(e) print("Error rate:", e) pY, _ = forward(Xtest, W1, b1, W2, b2) print("Final error rate:", error_rate(pY, Ytest)) # 2. batch with momentum W1 = W1_0.copy() b1 = b1_0.copy() W2 = W2_0.copy() b2 = b2_0.copy() losses_momentum = [] errors_momentum = [] mu = 0.9 dW2 = 0 db2 = 0 dW1 = 0 db1 = 0 for i in range(max_iter): for j in range(n_batches): Xbatch = Xtrain[j*batch_sz:(j*batch_sz + batch_sz),] Ybatch = Ytrain_ind[j*batch_sz:(j*batch_sz + batch_sz),] pYbatch, Z = forward(Xbatch, W1, b1, W2, b2) # gradients gW2 = derivative_w2(Z, Ybatch, pYbatch) + reg*W2 gb2 = derivative_b2(Ybatch, pYbatch) + reg*b2 gW1 = derivative_w1(Xbatch, Z, Ybatch, pYbatch, W2) + reg*W1 gb1 = derivative_b1(Z, Ybatch, pYbatch, W2) + reg*b1 # update velocities dW2 = mu*dW2 - lr*gW2 db2 = mu*db2 - lr*gb2 dW1 = mu*dW1 - lr*gW1 db1 = mu*db1 - lr*gb1 # updates W2 += dW2 b2 += db2 W1 += dW1 b1 += db1 if j % print_period == 0: pY, _ = forward(Xtest, W1, b1, W2, b2) l = cost(pY, Ytest_ind) losses_momentum.append(l) print("Cost at iteration i=%d, j=%d: %.6f" % (i, j, l)) e = error_rate(pY, Ytest) errors_momentum.append(e) print("Error rate:", e) pY, _ = forward(Xtest, W1, b1, W2, b2) print("Final error rate:", error_rate(pY, Ytest)) # 3. 
batch with Nesterov momentum W1 = W1_0.copy() b1 = b1_0.copy() W2 = W2_0.copy() b2 = b2_0.copy() losses_nesterov = [] errors_nesterov = [] mu = 0.9 vW2 = 0 vb2 = 0 vW1 = 0 vb1 = 0 for i in range(max_iter): for j in range(n_batches): Xbatch = Xtrain[j*batch_sz:(j*batch_sz + batch_sz),] Ybatch = Ytrain_ind[j*batch_sz:(j*batch_sz + batch_sz),] pYbatch, Z = forward(Xbatch, W1, b1, W2, b2) # updates gW2 = derivative_w2(Z, Ybatch, pYbatch) + reg*W2 gb2 = derivative_b2(Ybatch, pYbatch) + reg*b2 gW1 = derivative_w1(Xbatch, Z, Ybatch, pYbatch, W2) + reg*W1 gb1 = derivative_b1(Z, Ybatch, pYbatch, W2) + reg*b1 # v update vW2 = mu*vW2 - lr*gW2 vb2 = mu*vb2 - lr*gb2 vW1 = mu*vW1 - lr*gW1 vb1 = mu*vb1 - lr*gb1 # param update W2 += mu*vW2 - lr*gW2 b2 += mu*vb2 - lr*gb2 W1 += mu*vW1 - lr*gW1 b1 += mu*vb1 - lr*gb1 if j % print_period == 0: pY, _ = forward(Xtest, W1, b1, W2, b2) l = cost(pY, Ytest_ind) losses_nesterov.append(l) print("Cost at iteration i=%d, j=%d: %.6f" % (i, j, l)) e = error_rate(pY, Ytest) errors_nesterov.append(e) print("Error rate:", e) pY, _ = forward(Xtest, W1, b1, W2, b2) print("Final error rate:", error_rate(pY, Ytest)) plt.plot(losses_batch, label="batch") plt.plot(losses_momentum, label="momentum") plt.plot(losses_nesterov, label="nesterov") plt.legend() plt.show()
def main(): max_iter = 20 print_period = 50 train_X, test_X, train_Y, test_Y = get_normalized_data() learning_rate = 0.00004 reg = 0.01 train_Y_ind = indicator(train_Y) test_Y_ind = indicator(test_Y) N, D = train_X.shape batch_size = 500 batch_num = N // batch_size M = 300 K = 10 W1 = np.random.randn(D, M) / np.sqrt(D) b1 = np.zeros(M) W2 = np.random.randn(M, K) / np.sqrt(M) b2 = np.zeros(K) #SAVE INITIAL WEIGHT AND BIAS W1_copy = W1.copy() b1_copy = b1.copy() W2_copy = W2.copy() b2_copy = b2.copy() #constant learning_rate lose_constant = [] error_constant = [] for i in range(max_iter): shuffle_X, shuffle_Y = shuffle(train_X, train_Y_ind) for j in range(batch_num): x = shuffle_X[j * batch_size:(j * batch_size + batch_size), :] y = shuffle_Y[j * batch_size:(j * batch_size + batch_size), :] pY, Z = forward(x, W1, W2, b1, b2) W2 -= learning_rate * (derivative_w2(Z, y, pY) + reg * W2) b2 -= learning_rate * (derivative_b2(y, pY) + reg * b2) W1 -= learning_rate * (derivative_w1(x, Z, y, pY, W2) + reg * W1) b1 -= learning_rate * (derivative_b1(Z, y, pY, W2) + reg * b1) if j % print_period == 0: p_test, Z_test = forward(test_X, W1, W2, b1, b2) l = cost(p_test, test_Y_ind) e = error_rate(p_test, test_Y) lose_constant.append(l) error_constant.append(e) print("cost at itertion i=%d, j=%d: %.6f" % (i, j, l)) print("error_rate: ", e) p_final, z_final = forward(test_X, W1, W2, b1, b2) print("final error_rate:", error_rate(p_final, test_Y)) #RMSprop W1 = W1_copy.copy() b1 = b1_copy.copy() W2 = W2_copy.copy() b2 = b2_copy.copy() learning_rate_0 = 0.001 lose_non_costant = [] error_non_constant = [] cache_W1 = 1 cache_W2 = 1 cache_b1 = 1 cache_b2 = 1 decay_rate = 0.999 eps = 1e-10 for i in range(max_iter): shuffle_X, shuffle_Y = shuffle(train_X, train_Y_ind) for j in range(batch_num): x = shuffle_X[j * batch_size:(j * batch_size + batch_size), :] y = shuffle_Y[j * batch_size:(j * batch_size + batch_size), :] pY, Z = forward(x, W1, W2, b1, b2) gW2 = derivative_w2(Z, y, pY) + reg * W2 cache_W2 = decay_rate * cache_W2 + (1 - decay_rate) * gW2 * gW2 W2 -= learning_rate_0 * gW2 / (np.sqrt(cache_W2) + eps) gb2 = derivative_b2(y, pY) + reg * b2 cache_b2 = decay_rate * cache_b2 + (1 - decay_rate) * gb2 * gb2 b2 -= learning_rate_0 * gb2 / (np.sqrt(cache_b2) + eps) gW1 = derivative_w1(x, Z, y, pY, W2) + reg * W1 cache_W1 = decay_rate * cache_W1 + (1 - decay_rate) * gW1 * gW1 W1 -= learning_rate_0 * gW1 / (np.sqrt(cache_W1) + eps) gb1 = derivative_b1(Z, y, pY, W2) + reg * b1 cache_b1 = decay_rate * cache_b1 + (1 - decay_rate) * gb1 * gb1 b1 -= learning_rate_0 * gb1 / (np.sqrt(cache_b1) + eps) if j % print_period == 0: p_test, Z_test = forward(test_X, W1, W2, b1, b2) l = cost(p_test, test_Y_ind) e = error_rate(p_test, test_Y) lose_non_costant.append(l) error_non_constant.append(e) print("cost at itertion i=%d, j=%d: %.6f" % (i, j, l)) print("error_rate: ", e) p_final, z_final = forward(test_X, W1, W2, b1, b2) print("final error_rate:", error_rate(p_final, test_Y)) plt.plot(lose_constant, label="batch") plt.plot(lose_non_costant, label="non_constant") plt.legend() plt.show()
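Unlike most of the other versions, this one reshuffles the training data at the top of every epoch with shuffle(train_X, train_Y_ind). The imports are not shown here, so assuming the shuffle used is sklearn.utils.shuffle, the pattern looks like this (toy arrays, illustrative only):

import numpy as np
from sklearn.utils import shuffle  # assumption: this is the shuffle used above

X = np.arange(10).reshape(5, 2)
Y = np.arange(5)

# shuffle rows of X and Y together so inputs stay aligned with their targets;
# doing this once per epoch changes which samples land in each mini-batch
Xs, Ys = shuffle(X, Y)
print(Xs)
print(Ys)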
def main(): max_iter = 10 print_period = 10 X, Y = get_normalized_data() reg = 0.01 Xtrain = X[:-1000, ] Ytrain = Y[:-1000] Xtest = X[-1000:, ] Ytest = Y[-1000:] Ytrain_ind = y2indicator(Ytrain) Ytest_ind = y2indicator(Ytest) N, D = Xtrain.shape batch_sz = 500 n_batches = N // batch_sz M = 300 K = 10 W1_0 = np.random.randn(D, M) / np.sqrt(D) b1_0 = np.zeros(M) W2_0 = np.random.randn(M, K) / np.sqrt(M) b2_0 = np.zeros(K) W1 = W1_0.copy() b1 = b1_0.copy() W2 = W2_0.copy() b2 = b2_0.copy() # 1st moment mW1 = 0 mb1 = 0 mW2 = 0 mb2 = 0 # 2nd moment vW1 = 0 vb1 = 0 vW2 = 0 vb2 = 0 # hyperparams lr0 = 0.001 beta1 = 0.9 beta2 = 0.999 eps = 1e-8 # 1. Adam loss_adam = [] err_adam = [] t = 1 for i in range(max_iter): for j in range(n_batches): Xbatch = Xtrain[j * batch_sz:(j * batch_sz + batch_sz), ] Ybatch = Ytrain_ind[j * batch_sz:(j * batch_sz + batch_sz), ] pYbatch, Z = forward(Xbatch, W1, b1, W2, b2) # updates # gradients gW2 = derivative_w2(Z, Ybatch, pYbatch) + reg * W2 gb2 = derivative_b2(Ybatch, pYbatch) + reg * b2 gW1 = derivative_w1(Xbatch, Z, Ybatch, pYbatch, W2) + reg * W1 gb1 = derivative_b1(Z, Ybatch, pYbatch, W2) + reg * b1 # new m mW1 = beta1 * mW1 + (1 - beta1) * gW1 mb1 = beta1 * mb1 + (1 - beta1) * gb1 mW2 = beta1 * mW2 + (1 - beta1) * gW2 mb2 = beta1 * mb2 + (1 - beta1) * gb2 # new v vW1 = beta2 * vW1 + (1 - beta2) * gW1 * gW1 vb1 = beta2 * vb1 + (1 - beta2) * gb1 * gb1 vW2 = beta2 * vW2 + (1 - beta2) * gW2 * gW2 vb2 = beta2 * vb2 + (1 - beta2) * gb2 * gb2 # bias correction correction1 = 1 - beta1**t hat_mW1 = mW1 / correction1 hat_mb1 = mb1 / correction1 hat_mW2 = mW2 / correction1 hat_mb2 = mb2 / correction1 correction2 = 1 - beta2**t hat_vW1 = vW1 / correction2 hat_vb1 = vb1 / correction2 hat_vW2 = vW2 / correction2 hat_vb2 = vb2 / correction2 # update t t += 1 # apply updates to the params W1 = W1 - lr0 * hat_mW1 / np.sqrt(hat_vW1 + eps) b1 = b1 - lr0 * hat_mb1 / np.sqrt(hat_vb1 + eps) W2 = W2 - lr0 * hat_mW2 / np.sqrt(hat_vW2 + eps) b2 = b2 - lr0 * hat_mb2 / np.sqrt(hat_vb2 + eps) if j % print_period == 0: pY, _ = forward(Xtest, W1, b1, W2, b2) l = cost(pY, Ytest_ind) loss_adam.append(l) print("Cost at iteration i=%d, j=%d: %.6f" % (i, j, l)) err = error_rate(pY, Ytest) err_adam.append(err) print("Error rate:", err) pY, _ = forward(Xtest, W1, b1, W2, b2) print("Final error rate:", error_rate(pY, Ytest)) # 2. 
RMSprop with momentum W1 = W1_0.copy() b1 = b1_0.copy() W2 = W2_0.copy() b2 = b2_0.copy() loss_rms = [] err_rms = [] # comparable hyperparameters for fair comparison lr0 = 0.001 mu = 0.9 decay_rate = 0.999 eps = 1e-8 # rmsprop cache cache_W2 = 1 cache_b2 = 1 cache_W1 = 1 cache_b1 = 1 # momentum dW1 = 0 db1 = 0 dW2 = 0 db2 = 0 for i in range(max_iter): for j in range(n_batches): Xbatch = Xtrain[j * batch_sz:(j * batch_sz + batch_sz), ] Ybatch = Ytrain_ind[j * batch_sz:(j * batch_sz + batch_sz), ] pYbatch, Z = forward(Xbatch, W1, b1, W2, b2) # updates gW2 = derivative_w2(Z, Ybatch, pYbatch) + reg * W2 cache_W2 = decay_rate * cache_W2 + (1 - decay_rate) * gW2 * gW2 dW2 = mu * dW2 + (1 - mu) * lr0 * gW2 / (np.sqrt(cache_W2) + eps) W2 -= dW2 gb2 = derivative_b2(Ybatch, pYbatch) + reg * b2 cache_b2 = decay_rate * cache_b2 + (1 - decay_rate) * gb2 * gb2 db2 = mu * db2 + (1 - mu) * lr0 * gb2 / (np.sqrt(cache_b2) + eps) b2 -= db2 gW1 = derivative_w1(Xbatch, Z, Ybatch, pYbatch, W2) + reg * W1 cache_W1 = decay_rate * cache_W1 + (1 - decay_rate) * gW1 * gW1 dW1 = mu * dW1 + (1 - mu) * lr0 * gW1 / (np.sqrt(cache_W1) + eps) W1 -= dW1 gb1 = derivative_b1(Z, Ybatch, pYbatch, W2) + reg * b1 cache_b1 = decay_rate * cache_b1 + (1 - decay_rate) * gb1 * gb1 db1 = mu * db1 + (1 - mu) * lr0 * gb1 / (np.sqrt(cache_b1) + eps) b1 -= db1 if j % print_period == 0: pY, _ = forward(Xtest, W1, b1, W2, b2) l = cost(pY, Ytest_ind) loss_rms.append(l) print("Cost at iteration i=%d, j=%d: %.6f" % (i, j, l)) err = error_rate(pY, Ytest) err_rms.append(err) print("Error rate:", err) pY, _ = forward(Xtest, W1, b1, W2, b2) print("Final error rate:", error_rate(pY, Ytest)) plt.plot(loss_adam, label='adam') plt.plot(loss_rms, label='rmsprop') plt.legend() plt.show()
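In the RMSprop-with-momentum variant above, the (1 - mu) factor makes dW2/db2 an exponential moving average of the RMS-scaled step rather than an accumulated sum, which is what makes it directly comparable to Adam's first moment. Isolated on a single weight below; the function name and toy values are illustrative, not part of the script.

import numpy as np

def rmsprop_momentum_step(w, d, cache, grad, lr0=0.001, mu=0.9,
                          decay_rate=0.999, eps=1e-8):
    # the variant used above: d is an exponential moving average
    # (note the (1 - mu) factor) of the RMS-scaled step, and is subtracted
    cache = decay_rate * cache + (1 - decay_rate) * grad * grad
    d = mu * d + (1 - mu) * lr0 * grad / (np.sqrt(cache) + eps)
    return w - d, d, cache

# toy usage on a single weight with a constant gradient of 1
w, d, cache = 1.0, 0.0, 1.0
for _ in range(10):
    w, d, cache = rmsprop_momentum_step(w, d, cache, grad=1.0)
print(round(w, 6))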
def main(): max_iter = 10 print_period = 10 X_train, X_test, Y_train, Y_test = get_normalized_data() reg = 0.01 Y_train_ind = y2indicator(Y_train) Y_test_ind = y2indicator(Y_test) N, D = X_train.shape batch_sz = 500 n_batches = N // batch_sz M = 300 K = 10 W1_0 = np.random.randn(D, M) / np.sqrt(D) b1_0 = np.zeros(M) W2_0 = np.random.randn(M, K) / np.sqrt(K) b2_0 = np.zeros(K) # .1 Adam W1 = W1_0.copy() W2 = W2_0.copy() b1 = b1_0.copy() b2 = b2_0.copy() losses_adam = [] errors_adam = [] # 1st moment mW1 = 0 mW2 = 0 mb1 = 0 mb2 = 0 # 2nd moment vW1 = 0 vW2 = 0 vb1 = 0 vb2 = 0 # Hyperparams eps = 1e-8 lr = 0.001 beta1 = 0.9 beta2 = 0.999 t = 1 for i in range(max_iter): for j in range(n_batches): Xbatch = X_train[j * batch_sz:(j * batch_sz + batch_sz), ] Ybatch = Y_train_ind[j * batch_sz:(j * batch_sz + batch_sz), ] pYbatch, Z = forward(Xbatch, W1, b1, W2, b2) # updates # gradients gW2 = derivative_w2(Z, Ybatch, pYbatch) + reg * W2 gb2 = derivative_b2(Ybatch, pYbatch) + reg * b2 gW1 = derivative_w1(Xbatch, Z, Ybatch, pYbatch, W2) + reg * W1 gb1 = derivative_b1(Z, Ybatch, pYbatch, W2) + reg * b1 # new m mW1 = beta1 * mW1 + (1 - beta1) * gW1 mb1 = beta1 * mb1 + (1 - beta1) * gb1 mW2 = beta1 * mW2 + (1 - beta1) * gW2 mb2 = beta1 * mb2 + (1 - beta1) * gb2 # new v vW2 = beta2 * vW2 + (1 - beta2) * gW2 * gW2 vb2 = beta2 * vb2 + (1 - beta2) * gb2 * gb2 vW1 = beta2 * vW1 + (1 - beta2) * gW1 * gW1 vb1 = beta2 * vb1 + (1 - beta2) * gb1 * gb1 # bias correction correction1 = 1 - beta1 ** t mW1_hat = mW1 / correction1 mb1_hat = mb1 / correction1 mW2_hat = mW2 / correction1 mb2_hat = mb2 / correction1 # correction2 = 1 - beta2 ** t vb2_hat = vb2 / correction2 vb1_hat = vb1 / correction2 vW2_hat = vW2 / correction2 vW1_hat = vW1 / correction2 t += 1 # weights W1 = W1 - lr * mW1_hat / np.sqrt(vW1_hat + eps) b1 = b1 - lr * mb1_hat / np.sqrt(vb1_hat + eps) W2 = W2 - lr * mW2_hat / np.sqrt(vW2_hat + eps) b2 = b2 - lr * mb2_hat / np.sqrt(vb2_hat + eps) if j % print_period == 0: pY, _ = forward(X_test, W1, b1, W2, b2) l = cost(pY, Y_test_ind) losses_adam.append(l) print(f'Adam Cost at iteration i={i}, j={j} : {l}') e = error_rate(pY, Y_test) errors_adam.append(e) print("error_rate", e) pY, _ = forward(X_test, W1, b1, W2, b2) adam_error = error_rate(pY, Y_test) # 3. 
RMSProp with momentum W1 = W1_0.copy() b1 = b1_0.copy() W2 = W2_0.copy() b2 = b2_0.copy() losses_rms = [] errors_rms = [] # comparable hyper parameters for fair lr0 = 0.001 mu = 0.9 decay_rate = 0.999 eps = 1e-8 # rmsprop cache cache_W2 = 1 cache_b2 = 1 cache_W1 = 1 cache_b1 = 1 # momentum dW1 = 0 db1 = 0 dW2 = 0 db2 = 0 for i in range(max_iter): for j in range(n_batches): Xbatch = X_train[j * batch_sz:(j * batch_sz + batch_sz), ] Ybatch = Y_train_ind[j * batch_sz:(j * batch_sz + batch_sz), ] pYbatch, Z = forward(Xbatch, W1, b1, W2, b2) # updates gW2 = derivative_w2(Z, Ybatch, pYbatch) + reg * W2 cache_W2 = decay_rate * cache_W2 + (1 - decay_rate) * gW2 * gW2 dW2 = mu * dW2 + (1 - mu) * lr0 * gW2 / (np.sqrt(cache_W2) + eps) W2 -= dW2 gb2 = derivative_b2(Ybatch, pYbatch) + reg * b2 cache_b2 = decay_rate * cache_b2 + (1 - decay_rate) * gb2 * gb2 db2 = mu * db2 + (1 - mu) * lr0 * gb2 / (np.sqrt(cache_b2) + eps) b2 -= db2 gW1 = derivative_w1(Xbatch, Z, Ybatch, pYbatch, W2) + reg * W1 cache_W1 = decay_rate * cache_W1 + (1 - decay_rate) * gW1 * gW1 dW1 = mu * dW1 + (1 - mu) * lr0 * gW1 / (np.sqrt(cache_W1) + eps) W1 -= dW1 gb1 = derivative_b1(Z, Ybatch, pYbatch, W2) + reg * b1 cache_b1 = decay_rate * cache_b1 + (1 - decay_rate) * gb1 * gb1 db1 = mu * db1 + (1 - mu) * lr0 * gb1 / (np.sqrt(cache_b1) + eps) b1 -= db1 if j % print_period == 0: pY, _ = forward(X_test, W1, b1, W2, b2) l = cost(pY, Y_test_ind) losses_rms.append(l) print(f'Cost at iteration i={i}, j={j} : {l}') err = error_rate(pY, Y_test) errors_rms.append(err) print("Error rate:", err) pY, _ = forward(X_test, W1, b1, W2, b2) rms_error = error_rate(pY, Y_test) print(f"Final RMSProp error rate: {rms_error}") print(f"Final Adam error rate: {adam_error}") plt.plot(losses_adam, label='batch cost') plt.plot(losses_rms, label='RMSProp cost') plt.legend() plt.show()
def main(): max_iter = 20 # make 30 for sigmoid print_period = 10 X, Y = get_normalized_data() lr = 0.00004 reg = 0.01 Xtrain = X[:-1000,] Ytrain = Y[:-1000] Xtest = X[-1000:,] Ytest = Y[-1000:] N, D = Xtrain.shape Ytrain_ind = y2indicator(Ytrain) Ytest_ind = y2indicator(Ytest) batch_sz = 500 n_batches = N // batch_sz M = 300 K = 10 W1 = np.random.randn(D, M) / 28 b1 = np.zeros(M) W2 = np.random.randn(M, K) / np.sqrt(M) b2 = np.zeros(K) # 1. constant learning rate LL_batch = [] CR_batch = [] for i in range(max_iter): for j in range(n_batches): Xbatch = Xtrain[j*batch_sz:(j*batch_sz + batch_sz),:] Ybatch = Ytrain_ind[j*batch_sz:(j*batch_sz + batch_sz),:] pYbatch, Z = forward(Xbatch, W1, b1, W2, b2) W2 -= lr*(derivative_w2(Z, Ybatch, pYbatch) + reg*W2) b2 -= lr*(derivative_b2(Ybatch, pYbatch) + reg*b2) W1 -= lr*(derivative_w1(Xbatch, Z, Ybatch, pYbatch, W2) + reg*W1) b1 -= lr*(derivative_b1(Z, Ybatch, pYbatch, W2) + reg*b1) if j % print_period == 0: # calculate just for LL pY, _ = forward(Xtest, W1, b1, W2, b2) # print "pY:", pY ll = cost(pY, Ytest_ind) LL_batch.append(ll) print("Cost at iteration i=%d, j=%d: %.6f" % (i, j, ll)) err = error_rate(pY, Ytest) CR_batch.append(err) print("Error rate:", err) pY, _ = forward(Xtest, W1, b1, W2, b2) print("Final error rate:", error_rate(pY, Ytest)) # 2. RMSprop W1 = np.random.randn(D, M) / 28 b1 = np.zeros(M) W2 = np.random.randn(M, K) / np.sqrt(M) b2 = np.zeros(K) LL_rms = [] CR_rms = [] lr0 = 0.001 # if set too high, this will result in NaN cache_W2 = 0 cache_b2 = 0 cache_W1 = 0 cache_b1 = 0 decay_rate = 0.999 eps = 0.0000000001 for i in range(max_iter): for j in range(n_batches): Xbatch = Xtrain[j*batch_sz:(j*batch_sz + batch_sz),:] Ybatch = Ytrain_ind[j*batch_sz:(j*batch_sz + batch_sz),:] pYbatch, Z = forward(Xbatch, W1, b1, W2, b2) # updates gW2 = derivative_w2(Z, Ybatch, pYbatch) + reg*W2 cache_W2 = decay_rate*cache_W2 + (1 - decay_rate)*gW2*gW2 W2 -= lr0 * gW2 / (np.sqrt(cache_W2) + eps) gb2 = derivative_b2(Ybatch, pYbatch) + reg*b2 cache_b2 = decay_rate*cache_b2 + (1 - decay_rate)*gb2*gb2 b2 -= lr0 * gb2 / (np.sqrt(cache_b2) + eps) gW1 = derivative_w1(Xbatch, Z, Ybatch, pYbatch, W2) + reg*W1 cache_W1 = decay_rate*cache_W1 + (1 - decay_rate)*gW1*gW1 W1 -= lr0 * gW1 / (np.sqrt(cache_W1) + eps) gb1 = derivative_b1(Z, Ybatch, pYbatch, W2) + reg*b1 cache_b1 = decay_rate*cache_b1 + (1 - decay_rate)*gb1*gb1 b1 -= lr0 * gb1 / (np.sqrt(cache_b1) + eps) if j % print_period == 0: # calculate just for LL pY, _ = forward(Xtest, W1, b1, W2, b2) # print "pY:", pY ll = cost(pY, Ytest_ind) LL_rms.append(ll) print("Cost at iteration i=%d, j=%d: %.6f" % (i, j, ll)) err = error_rate(pY, Ytest) CR_rms.append(err) print("Error rate:", err) pY, _ = forward(Xtest, W1, b1, W2, b2) print("Final error rate:", error_rate(pY, Ytest)) # 3. batch GD w/ Nesterov momentum W1 = np.random.randn(D, M) / 28 b1 = np.zeros(M) W2 = np.random.randn(M, K) / np.sqrt(M) b2 = np.zeros(K) LL_nest = [] CR_nest = [] mu = 0.9 # dW2 = 0 # db2 = 0 # dW1 = 0 # db1 = 0 vW2 = 0 vb2 = 0 vW1 = 0 vb1 = 0 for i in range(max_iter): for j in range(n_batches): # because we want g(t) = grad(f(W(t-1) - lr*mu*dW(t-1))) # dW(t) = mu*dW(t-1) + g(t) # W(t) = W(t-1) - mu*dW(t) W1_tmp = W1 - lr*mu*vW1 b1_tmp = b1 - lr*mu*vb1 W2_tmp = W2 - lr*mu*vW2 b2_tmp = b2 - lr*mu*vb2 Xbatch = Xtrain[j*batch_sz:(j*batch_sz + batch_sz),:] Ybatch = Ytrain_ind[j*batch_sz:(j*batch_sz + batch_sz),:] pYbatch, Z = forward(Xbatch, W1_tmp, b1_tmp, W2_tmp, b2_tmp) # updates vW2 = mu*vW2 + derivative_w2(Z, Ybatch, pYbatch) + reg*W2_tmp W2 -= lr*vW2 vb2 = mu*vb2 + derivative_b2(Ybatch, pYbatch) + reg*b2_tmp b2 -= lr*vb2 vW1 = mu*vW1 + derivative_w1(Xbatch, Z, Ybatch, pYbatch, W2_tmp) + reg*W1_tmp W1 -= lr*vW1 vb1 = mu*vb1 + derivative_b1(Z, Ybatch, pYbatch, W2_tmp) + reg*b1_tmp b1 -= lr*vb1 if j % print_period == 0: # calculate just for LL pY, _ = forward(Xtest, W1, b1, W2, b2) # print "pY:", pY ll = cost(pY, Ytest_ind) LL_nest.append(ll) print("Cost at iteration i=%d, j=%d: %.6f" % (i, j, ll)) err = error_rate(pY, Ytest) CR_nest.append(err) print("Error rate:", err) pY, _ = forward(Xtest, W1, b1, W2, b2) print("Final error rate:", error_rate(pY, Ytest))
def main(): max_iter = 10 print_period = 50 train_X, test_X, train_Y, test_Y = get_normalized_data() reg = 0.01 train_Y_ind = indicator(train_Y) test_Y_ind = indicator(test_Y) N, D = train_X.shape batch_size = 500 batch_num = N // batch_size M = 300 K = 10 W1 = np.random.randn(D, M) / np.sqrt(D) b1 = np.zeros(M) W2 = np.random.randn(M, K) / np.sqrt(M) b2 = np.zeros(K) #SAVE INITIAL WEIGHT AND BIAS W1_copy = W1.copy() b1_copy = b1.copy() W2_copy = W2.copy() b2_copy = b2.copy() #1st moment mW1 = 0 mW2 = 0 mb1 = 0 mb2 = 0 #2nd moment vW1 = 0 vW2 = 0 vb1 = 0 vb2 = 0 #hyperparams learning_rate = 0.001 beta1 = 0.99 beta2 = 0.999 eps = 1e-8 #adam lose_adam = [] error_adam = [] t = 1 for i in range(max_iter): shuffle_X, shuffle_Y = shuffle(train_X, train_Y_ind) for j in range(batch_num): x = shuffle_X[j * batch_size:(j * batch_size + batch_size), :] y = shuffle_Y[j * batch_size:(j * batch_size + batch_size), :] pY, Z = forward(x, W1, W2, b1, b2) #update gradient gW2 = derivative_w2(Z, y, pY) + reg * W2 gb2 = derivative_b2(y, pY) + reg * b2 gW1 = derivative_w1(x, Z, y, pY, W2) + reg * W1 gb1 = derivative_b1(Z, y, pY, W2) + reg * b1 #update 1st moment mW1 = beta1 * mW1 + (1 - beta1) * gW1 mW2 = beta1 * mW2 + (1 - beta1) * gW2 mb1 = beta1 * mb1 + (1 - beta1) * gb1 mb2 = beta1 * mb2 + (1 - beta1) * gb2 #update 2nd moment vW1 = beta2 * vW1 + (1 - beta2) * gW1 * gW1 vW2 = beta2 * vW2 + (1 - beta2) * gW2 * gW2 vb1 = beta2 * vb1 + (1 - beta2) * gb1 * gb1 vb2 = beta2 * vb2 + (1 - beta2) * gb2 * gb2 #bias correction correction_1 = 1 - beta1**t correction_2 = 1 - beta2**t mW1_hat = mW1 / correction_1 mW2_hat = mW2 / correction_1 mb1_hat = mb1 / correction_1 mb2_hat = mb2 / correction_1 vW1_hat = vW1 / correction_2 vW2_hat = vW2 / correction_2 vb1_hat = vb1 / correction_2 vb2_hat = vb2 / correction_2 #update t t += 1 #update weight W2 -= learning_rate * mW2_hat / np.sqrt(vW2_hat + eps) b2 -= learning_rate * mb2_hat / np.sqrt(vb2_hat + eps) b1 -= learning_rate * mb1_hat / np.sqrt(vb1_hat + eps) W1 -= learning_rate * mW1_hat / np.sqrt(vW1_hat + eps) if j % print_period == 0: p_test, Z_test = forward(test_X, W1, W2, b1, b2) l = cost(p_test, test_Y_ind) e = error_rate(p_test, test_Y) lose_adam.append(l) error_adam.append(e) print("cost at itertion i=%d, j=%d: %.6f" % (i, j, l)) print("error_rate: ", e) p_final, z_final = forward(test_X, W1, W2, b1, b2) print("final error_rate:", error_rate(p_final, test_Y)) #RMSprop with momentum W1 = W1_copy.copy() b1 = b1_copy.copy() W2 = W2_copy.copy() b2 = b2_copy.copy() #hyperparams learning_rate = 0.001 decay_rate = 0.999 mu = 0.9 eps = 1e-8 #rmsprop cache cache_W1 = 1 cache_W2 = 1 cache_b1 = 1 cache_b2 = 1 #momentum dW1 = 0 dW2 = 0 db1 = 0 db2 = 0 lose_rmsprop_m = [] error_rmsprop_m = [] t = 1 for i in range(max_iter): shuffle_X, shuffle_Y = shuffle(train_X, train_Y_ind) for j in range(batch_num): x = shuffle_X[j * batch_size:(j * batch_size + batch_size), :] y = shuffle_Y[j * batch_size:(j * batch_size + batch_size), :] pY, Z = forward(x, W1, W2, b1, b2) #udpate gW2 = derivative_w2(Z, y, pY) + reg * W2 cache_W2 = decay_rate * cache_W2 + (1 - decay_rate) * gW2 * gW2 dW2 = mu * dW2 - ( 1 - mu) * learning_rate * gW2 / np.sqrt(cache_W2 + eps) W2 += dW2 gb2 = derivative_b2(y, pY) + reg * b2 cache_b2 = decay_rate * cache_b2 + (1 - decay_rate) * gb2 * gb2 db2 = mu * db2 - ( 1 - mu) * learning_rate * gb2 / np.sqrt(cache_b2 + eps) b2 += db2 gW1 = derivative_w1(x, Z, y, pY, W2) + reg * W1 cache_W1 = decay_rate * cache_W1 + (1 - decay_rate) * gW1 * gW1 dW1 = mu * dW1 - ( 1 - mu) * 
learning_rate * gW1 / np.sqrt(cache_W1 + eps) W1 += dW1 gb1 = derivative_b1(Z, y, pY, W2) + reg * b1 cache_b1 = decay_rate * cache_b1 + (1 - decay_rate) * gb1 * gb1 db1 = mu * db1 - ( 1 - mu) * learning_rate * gb1 / np.sqrt(cache_b1 + eps) b1 += db1 # #update cache # cache_W1 = decay_rate * cache_W1 + (1-decay_rate)*gW1*gW1 # cache_W2 = decay_rate * cache_W2 + (1-decay_rate)*gW2*gW2 # cache_b1 = decay_rate * cache_b1 + (1-decay_rate)*gb1*gb1 # cache_b2 = decay_rate * cache_b2 + (1-decay_rate)*gb2*gb2 # #update momentum # dW2 = mu*dW2 + (1-mu) * learning_rate * gW2 / (np.sqrt(cache_W2) + eps) # db2 = mu*db2 + (1-mu) * learning_rate * gb2 / (np.sqrt(cache_b2) + eps) # dW1 = mu*dW1 + (1-mu) * learning_rate * dW1 / (np.sqrt(cache_W1) + eps) # db1 = mu*db1 + (1-mu) * learning_rate * db1 / (np.sqrt(cache_b1) + eps) # #update weights # W2 -= dW2 # b2 -= db2 # W1 -= dW1 # b1 -= db1 if j % print_period == 0: p_test, Z_test = forward(test_X, W1, W2, b1, b2) l = cost(p_test, test_Y_ind) e = error_rate(p_test, test_Y) lose_rmsprop_m.append(l) error_rmsprop_m.append(e) print("cost at itertion i=%d, j=%d: %.6f" % (i, j, l)) print("error_rate: ", e) p_final, z_final = forward(test_X, W1, W2, b1, b2) print("final error_rate:", error_rate(p_final, test_Y)) plt.plot(lose_adam, label="adam") plt.plot(lose_rmsprop_m, label="rmsprop with momentum") plt.legend() plt.show()
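Note that this script, like a few others above, initializes the RMSprop cache at 1, while some versions start it at 0. With a zero cache the very first step is roughly lr0 / sqrt(1 - decay_rate) regardless of how small the gradient is, which can be far larger than lr0; starting at 1 keeps early steps near lr0 * g until the cache adapts. A quick check of the first update, with illustrative numbers:

import numpy as np

lr0, decay_rate, eps = 0.001, 0.999, 1e-8
g = 0.01  # a smallish gradient

for cache in (0.0, 1.0):  # the two initializations used in the scripts above
    cache = decay_rate * cache + (1 - decay_rate) * g * g
    first_step = lr0 * g / (np.sqrt(cache) + eps)
    print(cache, first_step)
# starting from 0 the first step is about lr0/sqrt(1 - decay_rate) ~ 0.03,
# much larger than lr0; starting from 1 it is about lr0*g ~ 1e-5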
def main(): max_iter = 20 # make it 30 for sigmoid print_period = 10 X, Y = get_normalized_data() lr = 0.00004 reg = 0.01 Xtrain = X[:-1000,] Ytrain = Y[:-1000] Xtest = X[-1000:,] Ytest = Y[-1000:] Ytrain_ind = y2indicator(Ytrain) Ytest_ind = y2indicator(Ytest) N, D = Xtrain.shape batch_sz = 500 n_batches = N / batch_sz M = 300 K = 10 W1 = np.random.randn(D, M) / 28 b1 = np.zeros(M) W2 = np.random.randn(M, K) / np.sqrt(M) b2 = np.zeros(K) # 1. const # cost = -16 LL_batch = [] CR_batch = [] for i in xrange(max_iter): for j in xrange(n_batches): Xbatch = Xtrain[j*batch_sz:(j*batch_sz + batch_sz),] Ybatch = Ytrain_ind[j*batch_sz:(j*batch_sz + batch_sz),] pYbatch, Z = forward(Xbatch, W1, b1, W2, b2) # print "first batch cost:", cost(pYbatch, Ybatch) # updates W2 -= lr*(derivative_w2(Z, Ybatch, pYbatch) + reg*W2) b2 -= lr*(derivative_b2(Ybatch, pYbatch) + reg*b2) W1 -= lr*(derivative_w1(Xbatch, Z, Ybatch, pYbatch, W2) + reg*W1) b1 -= lr*(derivative_b1(Z, Ybatch, pYbatch, W2) + reg*b1) if j % print_period == 0: # calculate just for LL pY, _ = forward(Xtest, W1, b1, W2, b2) # print "pY:", pY ll = cost(pY, Ytest_ind) LL_batch.append(ll) print "Cost at iteration i=%d, j=%d: %.6f" % (i, j, ll) err = error_rate(pY, Ytest) CR_batch.append(err) print "Error rate:", err pY, _ = forward(Xtest, W1, b1, W2, b2) print "Final error rate:", error_rate(pY, Ytest) # 2. RMSprop W1 = np.random.randn(D, M) / 28 b1 = np.zeros(M) W2 = np.random.randn(M, K) / np.sqrt(M) b2 = np.zeros(K) LL_rms = [] CR_rms = [] lr0 = 0.001 # if you set this too high you'll get NaN! cache_W2 = 0 cache_b2 = 0 cache_W1 = 0 cache_b1 = 0 decay_rate = 0.999 eps = 0.0000000001 for i in xrange(max_iter): for j in xrange(n_batches): Xbatch = Xtrain[j*batch_sz:(j*batch_sz + batch_sz),] Ybatch = Ytrain_ind[j*batch_sz:(j*batch_sz + batch_sz),] pYbatch, Z = forward(Xbatch, W1, b1, W2, b2) # print "first batch cost:", cost(pYbatch, Ybatch) # updates gW2 = derivative_w2(Z, Ybatch, pYbatch) + reg*W2 cache_W2 = decay_rate*cache_W2 + (1 - decay_rate)*gW2*gW2 W2 -= lr0 * gW2 / (np.sqrt(cache_W2) + eps) gb2 = derivative_b2(Ybatch, pYbatch) + reg*b2 cache_b2 = decay_rate*cache_b2 + (1 - decay_rate)*gb2*gb2 b2 -= lr0 * gb2 / (np.sqrt(cache_b2) + eps) gW1 = derivative_w1(Xbatch, Z, Ybatch, pYbatch, W2) + reg*W1 cache_W1 = decay_rate*cache_W1 + (1 - decay_rate)*gW1*gW1 W1 -= lr0 * gW1 / (np.sqrt(cache_W1) + eps) gb1 = derivative_b1(Z, Ybatch, pYbatch, W2) + reg*b1 cache_b1 = decay_rate*cache_b1 + (1 - decay_rate)*gb1*gb1 b1 -= lr0 * gb1 / (np.sqrt(cache_b1) + eps) if j % print_period == 0: # calculate just for LL pY, _ = forward(Xtest, W1, b1, W2, b2) # print "pY:", pY ll = cost(pY, Ytest_ind) LL_rms.append(ll) print "Cost at iteration i=%d, j=%d: %.6f" % (i, j, ll) err = error_rate(pY, Ytest) CR_rms.append(err) print "Error rate:", err pY, _ = forward(Xtest, W1, b1, W2, b2) print "Final error rate:", error_rate(pY, Ytest) plt.plot(LL_batch, label='const') plt.plot(LL_rms, label='rms') plt.legend() plt.show()
    # 1. Adam optimizer
    loss_adam = []
    err_adam = []
    t = 1
    for i in range(max_iter):
        for j in range(n_batch):
            X_batch = X_train[j * batch_sz:(j * batch_sz + batch_sz), ]
            Y_batch = Y_train_ind[j * batch_sz:(j * batch_sz + batch_sz), ]
            pY_batch, Z = forward(X_batch, W1, b1, W2, b2)

            # compute the gradients
            gW2 = derivative_w2(Z, Y_batch, pY_batch) + reg * W2
            gb2 = derivative_b2(Y_batch, pY_batch) + reg * b2
            gW1 = derivative_w1(X_batch, Z, Y_batch, pY_batch, W2) + reg * W1
            gb1 = derivative_b1(Z, Y_batch, pY_batch, W2) + reg * b1

            # update the first moments (exponential average of gradients)
            mW1 = beta1 * mW1 + (1 - beta1) * gW1
            mb1 = beta1 * mb1 + (1 - beta1) * gb1
            mW2 = beta1 * mW2 + (1 - beta1) * gW2
            mb2 = beta1 * mb2 + (1 - beta1) * gb2

            # update the second moments (exponential average of squared gradients)
            vW1 = beta2 * vW1 + (1 - beta2) * gW1 * gW1
            vb1 = beta2 * vb1 + (1 - beta2) * gb1 * gb1
            vW2 = beta2 * vW2 + (1 - beta2) * gW2 * gW2
            vb2 = beta2 * vb2 + (1 - beta2) * gb2 * gb2

            # bias correction
            correction1 = 1 - beta1**t
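            # NOTE: the original excerpt breaks off at this point. Below is a
            # minimal sketch of how the Adam step is typically completed,
            # assuming the learning rate and epsilon are named lr0 and eps
            # (those names are not shown in the excerpt above).
            correction2 = 1 - beta2**t

            # bias-corrected moment estimates (the hat_* names are illustrative)
            hat_mW1 = mW1 / correction1
            hat_mb1 = mb1 / correction1
            hat_mW2 = mW2 / correction1
            hat_mb2 = mb2 / correction1
            hat_vW1 = vW1 / correction2
            hat_vb1 = vb1 / correction2
            hat_vW2 = vW2 / correction2
            hat_vb2 = vb2 / correction2

            # parameter updates
            W1 -= lr0 * hat_mW1 / (np.sqrt(hat_vW1) + eps)
            b1 -= lr0 * hat_mb1 / (np.sqrt(hat_vb1) + eps)
            W2 -= lr0 * hat_mW2 / (np.sqrt(hat_vW2) + eps)
            b2 -= lr0 * hat_mb2 / (np.sqrt(hat_vb2) + eps)

            t += 1
            # the print_period diagnostics (cost and error rate on the test set,
            # appended to loss_adam / err_adam) would follow here, as in the
            # other scripts in this file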
def main():
    # 1. batch SGD
    # 2. batch SGD with momentum
    # 3. batch SGD with Nesterov
    max_iter = 20
    print_period = 10

    X, Y = get_normalized_data()
    lr = 0.00004
    reg = 0.01

    Xtrain, Ytrain = X[:-1000], Y[:-1000]
    Xtest, Ytest = X[-1000:], Y[-1000:]
    Ytrain_ind = y2indicator(Ytrain)
    Ytest_ind = y2indicator(Ytest)

    N, D = Xtrain.shape  # use the training-set size, not the full data set
    batch_sz = 500
    n_batches = N // batch_sz

    M = 300  # number of hidden neurons
    K = 10   # number of output classes
    W1 = np.random.randn(D, M) / np.sqrt(D + M)
    b1 = np.zeros(M)
    W2 = np.random.randn(M, K) / np.sqrt(M)
    b2 = np.zeros(K)

    # 1. batch SGD
    LL_batch = []
    CR_batch = []
    for i in range(max_iter):
        for j in range(n_batches):
            Xbatch = Xtrain[j * batch_sz:(j + 1) * batch_sz, ]
            Ybatch = Ytrain_ind[j * batch_sz:(j + 1) * batch_sz, ]
            pYbatch, Z = forward(Xbatch, W1, b1, W2, b2)

            W2 -= lr * (derivative_w2(Z, Ybatch, pYbatch) + reg * W2)
            b2 -= lr * (derivative_b2(Ybatch, pYbatch) + reg * b2)
            W1 -= lr * (derivative_w1(Xbatch, Z, Ybatch, pYbatch, W2) + reg * W1)
            b1 -= lr * (derivative_b1(Z, Ybatch, pYbatch, W2) + reg * b1)

            if j % print_period == 0:
                pY, _ = forward(Xtest, W1, b1, W2, b2)
                ll = cost(pY, Ytest_ind)
                LL_batch.append(ll)
                print("Cost at iteration i=%d, j=%d: %.6f" % (i, j, ll))
                err = error_rate(pY, Ytest)
                CR_batch.append(err)
                print("Error rate:", err)

    pY, _ = forward(Xtest, W1, b1, W2, b2)
    print("Final error rate:", error_rate(pY, Ytest))

    # 2. batch SGD with momentum
    W1 = np.random.randn(D, M) / np.sqrt(D + M)
    b1 = np.zeros(M)
    W2 = np.random.randn(M, K) / np.sqrt(M)
    b2 = np.zeros(K)
    LL_momentum = []
    CR_momentum = []
    mu = 0.9
    dW2 = 0
    db2 = 0
    dW1 = 0
    db1 = 0
    for i in range(max_iter):
        for j in range(n_batches):
            Xbatch = Xtrain[j * batch_sz:(j + 1) * batch_sz, ]
            Ybatch = Ytrain_ind[j * batch_sz:(j + 1) * batch_sz, ]
            pYbatch, Z = forward(Xbatch, W1, b1, W2, b2)

            dW2 = mu * dW2 - lr * (derivative_w2(Z, Ybatch, pYbatch) + reg * W2)
            W2 += dW2
            db2 = mu * db2 - lr * (derivative_b2(Ybatch, pYbatch) + reg * b2)
            b2 += db2
            dW1 = mu * dW1 - lr * (derivative_w1(Xbatch, Z, Ybatch, pYbatch, W2) + reg * W1)
            W1 += dW1
            db1 = mu * db1 - lr * (derivative_b1(Z, Ybatch, pYbatch, W2) + reg * b1)
            b1 += db1

            if j % print_period == 0:
                pY, _ = forward(Xtest, W1, b1, W2, b2)
                ll = cost(pY, Ytest_ind)
                LL_momentum.append(ll)
                print("Cost at iteration i=%d, j=%d: %.6f" % (i, j, ll))
                err = error_rate(pY, Ytest)
                CR_momentum.append(err)
                print("Error rate:", err)

    pY, _ = forward(Xtest, W1, b1, W2, b2)
    print("Final error rate:", error_rate(pY, Ytest))
    # 3. batch SGD with Nesterov momentum
    W1 = np.random.randn(D, M) / np.sqrt(D + M)
    b1 = np.zeros(M)
    W2 = np.random.randn(M, K) / np.sqrt(M)
    b2 = np.zeros(K)
    LL_nesterov = []
    CR_nesterov = []
    mu = 0.9
    dW2 = 0
    db2 = 0
    dW1 = 0
    db1 = 0
    for i in range(max_iter):
        for j in range(n_batches):
            Xbatch = Xtrain[j * batch_sz:(j + 1) * batch_sz, ]
            Ybatch = Ytrain_ind[j * batch_sz:(j + 1) * batch_sz, ]
            pYbatch, Z = forward(Xbatch, W1, b1, W2, b2)

            # Nesterov-style update: the lookahead is folded into a single
            # parameter change of the form mu*mu*d - (1 + mu)*lr*g
            dW2 = mu * mu * dW2 - (1 + mu) * lr * (derivative_w2(Z, Ybatch, pYbatch) + reg * W2)
            W2 += dW2
            db2 = mu * mu * db2 - (1 + mu) * lr * (derivative_b2(Ybatch, pYbatch) + reg * b2)
            b2 += db2
            dW1 = mu * mu * dW1 - (1 + mu) * lr * (derivative_w1(Xbatch, Z, Ybatch, pYbatch, W2) + reg * W1)
            W1 += dW1
            db1 = mu * mu * db1 - (1 + mu) * lr * (derivative_b1(Z, Ybatch, pYbatch, W2) + reg * b1)
            b1 += db1

            if j % print_period == 0:
                pY, _ = forward(Xtest, W1, b1, W2, b2)
                ll = cost(pY, Ytest_ind)
                LL_nesterov.append(ll)
                print("Cost at iteration i=%d, j=%d: %.6f" % (i, j, ll))
                err = error_rate(pY, Ytest)
                CR_nesterov.append(err)
                print("Error rate:", err)

    pY, _ = forward(Xtest, W1, b1, W2, b2)
    print("Final error rate:", error_rate(pY, Ytest))

    plt.plot(LL_batch, label="batch")
    plt.plot(LL_momentum, label="momentum")
    plt.plot(LL_nesterov, label="nesterov")
    plt.legend()
    plt.show()
def main():
    Xtrain, Xtest, Ytrain, Ytest = get_transformed_data()
    Ytrain_ind = y2indicator(Ytrain)
    Ytest_ind = y2indicator(Ytest)

    N, D = Xtrain.shape
    M = 300
    K = len(set(Ytrain))

    W1 = np.random.randn(D, M) / np.sqrt(D)
    b1 = np.zeros(M)
    W2 = np.random.randn(M, K) / np.sqrt(M)
    b2 = np.zeros(K)

    # save initial weights
    W1_0 = W1.copy()
    b1_0 = b1.copy()
    W2_0 = W2.copy()
    b2_0 = b2.copy()

    lr = 0.00004
    reg = 0.01
    batch_sz = 500
    n_batches = N // batch_sz
    epochs = 20

    # 1. batch
    costs_batch = []
    for t in range(epochs):
        tmpX, tmpY = shuffle(Xtrain, Ytrain_ind)
        for j in range(n_batches):
            x = tmpX[j * batch_sz:(j * batch_sz + batch_sz), :]
            y = tmpY[j * batch_sz:(j * batch_sz + batch_sz), :]
            pY, Z = forward(x, W1, b1, W2, b2)

            W2 -= lr * (derivative_W2(Z, pY, y) + reg * W2)
            b2 -= lr * (derivative_b2(pY, y) + reg * b2)
            W1 -= lr * (derivative_W1(x, W2, Z, pY, y) + reg * W1)
            b1 -= lr * (derivative_b1(W2, Z, pY, y) + reg * b1)

            if j % 10 == 0:
                pY_test, _ = forward(Xtest, W1, b1, W2, b2)
                c = cost(pY_test, Ytest_ind)
                costs_batch.append(c)
                print("Cost at iteration t=%d, j=%d: %.6f" % (t, j, c))
                e = error_rate(pY_test, Ytest)
                print("Error rate:", e)
    print("\n")

    # 2. RMSprop
    W1 = W1_0.copy()
    b1 = b1_0.copy()
    W2 = W2_0.copy()
    b2 = b2_0.copy()
    cache_W2 = 1
    cache_b2 = 1
    cache_W1 = 1
    cache_b1 = 1
    decay_rate = 0.999
    eps = 1e-10
    lr0 = 0.001
    costs_RMS = []
    for t in range(epochs):
        tmpX, tmpY = shuffle(Xtrain, Ytrain_ind)
        for j in range(n_batches):
            x = tmpX[j * batch_sz:(j * batch_sz + batch_sz), :]
            y = tmpY[j * batch_sz:(j * batch_sz + batch_sz), :]
            pY, Z = forward(x, W1, b1, W2, b2)

            gW2 = derivative_W2(Z, pY, y) + reg * W2
            cache_W2 = decay_rate * cache_W2 + (1 - decay_rate) * gW2 * gW2
            W2 -= lr0 * gW2 / (np.sqrt(cache_W2) + eps)

            gb2 = derivative_b2(pY, y) + reg * b2
            cache_b2 = decay_rate * cache_b2 + (1 - decay_rate) * gb2 * gb2
            b2 -= lr0 * gb2 / (np.sqrt(cache_b2) + eps)

            gW1 = derivative_W1(x, W2, Z, pY, y) + reg * W1
            cache_W1 = decay_rate * cache_W1 + (1 - decay_rate) * gW1 * gW1
            W1 -= lr0 * gW1 / (np.sqrt(cache_W1) + eps)

            gb1 = derivative_b1(W2, Z, pY, y) + reg * b1
            cache_b1 = decay_rate * cache_b1 + (1 - decay_rate) * gb1 * gb1
            b1 -= lr0 * gb1 / (np.sqrt(cache_b1) + eps)

            if j % 10 == 0:
                pY_test, _ = forward(Xtest, W1, b1, W2, b2)
                c = cost(pY_test, Ytest_ind)
                costs_RMS.append(c)
                print("Cost at iteration t=%d, j=%d: %.6f" % (t, j, c))
                e = error_rate(pY_test, Ytest)
                print("Error rate:", e)

    plt.plot(costs_batch, label="batch")
    plt.plot(costs_RMS, label="rms")
    plt.legend()
    plt.show()