def main():
    train_X, test_X, train_Y, test_Y = get_normalized_data()

    model = ANN([500, 300])

    session = tf.InteractiveSession()
    model.set_session(session)

    model.fit(train_X, train_Y, test_X, test_Y, show_fig=True)

    print("Train accuracy:", model.score(train_X, train_Y))
    print("Test accuracy:", model.score(test_X, test_Y))
def main():
    max_iter = 20
    print_period = 50

    train_X, test_X, train_Y, test_Y = get_normalized_data()
    learning_rate = 0.00004
    reg = 0.01

    train_Y_ind = indicator(train_Y)
    test_Y_ind = indicator(test_Y)

    N, D = train_X.shape
    batch_size = 500
    batch_num = N // batch_size

    M = 300
    K = 10
    W1_init = np.random.randn(D, M) / np.sqrt(D)
    b1_init = np.zeros(M)
    W2_init = np.random.randn(M, K) / np.sqrt(M)
    b2_init = np.zeros(K)

    # initialize theano variables
    thX = T.matrix('X')
    thT = T.matrix('T')
    W1 = theano.shared(W1_init, 'W1')
    W2 = theano.shared(W2_init, 'W2')
    b1 = theano.shared(b1_init, 'b1')
    b2 = theano.shared(b2_init, 'b2')

    # activation function and softmax output
    tZ = relu(thX.dot(W1) + b1)
    t_pY = T.nnet.softmax(tZ.dot(W2) + b2)

    # cost function and prediction function
    cost = (
        -(thT * T.log(t_pY)).sum()
        + reg * (W1 * W1).sum()
        + reg * (W2 * W2).sum()
        + reg * (b1 * b1).sum()
        + reg * (b2 * b2).sum()
    )
    prediction = T.argmax(t_pY, axis=1)

    # training: plain gradient descent updates
    update_b2 = b2 - learning_rate * T.grad(cost, b2)
    update_W2 = W2 - learning_rate * T.grad(cost, W2)
    update_b1 = b1 - learning_rate * T.grad(cost, b1)
    update_W1 = W1 - learning_rate * T.grad(cost, W1)

    train = theano.function(
        inputs=[thX, thT],
        updates=[(W1, update_W1), (W2, update_W2), (b1, update_b1), (b2, update_b2)],
    )
    get_prediction = theano.function(inputs=[thX, thT], outputs=[cost, prediction])

    costs = []
    for i in range(max_iter):
        shuffle_X, shuffle_Y = shuffle(train_X, train_Y_ind)
        for j in range(batch_num):
            x = shuffle_X[j * batch_size:(j * batch_size + batch_size), :]
            y = shuffle_Y[j * batch_size:(j * batch_size + batch_size), :]
            train(x, y)

            if j % print_period == 0:
                cost_val, test_pY = get_prediction(test_X, test_Y_ind)
                error = error_rate(test_pY, test_Y)
                print("Cost / err at iteration i=%d, j=%d: %.3f / %.3f" % (i, j, cost_val, error))
                costs.append(cost_val)

    plt.plot(costs)
    plt.show()
def main():
    max_iter = 10
    print_period = 50

    train_X, test_X, train_Y, test_Y = get_normalized_data()
    reg = 0.01

    train_Y_ind = indicator(train_Y)
    test_Y_ind = indicator(test_Y)

    N, D = train_X.shape
    batch_size = 500
    batch_num = N // batch_size

    M = 300
    K = 10
    W1 = np.random.randn(D, M) / np.sqrt(D)
    b1 = np.zeros(M)
    W2 = np.random.randn(M, K) / np.sqrt(M)
    b2 = np.zeros(K)

    # save initial weights and biases so both optimizers start from the same point
    W1_copy = W1.copy()
    b1_copy = b1.copy()
    W2_copy = W2.copy()
    b2_copy = b2.copy()

    # 1st moment
    mW1 = 0
    mW2 = 0
    mb1 = 0
    mb2 = 0

    # 2nd moment
    vW1 = 0
    vW2 = 0
    vb1 = 0
    vb2 = 0

    # hyperparams
    learning_rate = 0.001
    beta1 = 0.99
    beta2 = 0.999
    eps = 1e-8

    # Adam
    loss_adam = []
    error_adam = []
    t = 1
    for i in range(max_iter):
        shuffle_X, shuffle_Y = shuffle(train_X, train_Y_ind)
        for j in range(batch_num):
            x = shuffle_X[j * batch_size:(j * batch_size + batch_size), :]
            y = shuffle_Y[j * batch_size:(j * batch_size + batch_size), :]
            pY, Z = forward(x, W1, W2, b1, b2)

            # gradients
            gW2 = derivative_w2(Z, y, pY) + reg * W2
            gb2 = derivative_b2(y, pY) + reg * b2
            gW1 = derivative_w1(x, Z, y, pY, W2) + reg * W1
            gb1 = derivative_b1(Z, y, pY, W2) + reg * b1

            # update 1st moment
            mW1 = beta1 * mW1 + (1 - beta1) * gW1
            mW2 = beta1 * mW2 + (1 - beta1) * gW2
            mb1 = beta1 * mb1 + (1 - beta1) * gb1
            mb2 = beta1 * mb2 + (1 - beta1) * gb2

            # update 2nd moment
            vW1 = beta2 * vW1 + (1 - beta2) * gW1 * gW1
            vW2 = beta2 * vW2 + (1 - beta2) * gW2 * gW2
            vb1 = beta2 * vb1 + (1 - beta2) * gb1 * gb1
            vb2 = beta2 * vb2 + (1 - beta2) * gb2 * gb2

            # bias correction
            correction_1 = 1 - beta1**t
            correction_2 = 1 - beta2**t
            mW1_hat = mW1 / correction_1
            mW2_hat = mW2 / correction_1
            mb1_hat = mb1 / correction_1
            mb2_hat = mb2 / correction_1
            vW1_hat = vW1 / correction_2
            vW2_hat = vW2 / correction_2
            vb1_hat = vb1 / correction_2
            vb2_hat = vb2 / correction_2

            # update t
            t += 1

            # update weights
            W2 -= learning_rate * mW2_hat / np.sqrt(vW2_hat + eps)
            b2 -= learning_rate * mb2_hat / np.sqrt(vb2_hat + eps)
            b1 -= learning_rate * mb1_hat / np.sqrt(vb1_hat + eps)
            W1 -= learning_rate * mW1_hat / np.sqrt(vW1_hat + eps)

            if j % print_period == 0:
                p_test, Z_test = forward(test_X, W1, W2, b1, b2)
                l = cost(p_test, test_Y_ind)
                e = error_rate(p_test, test_Y)
                loss_adam.append(l)
                error_adam.append(e)
                print("cost at iteration i=%d, j=%d: %.6f" % (i, j, l))
                print("error_rate: ", e)

    p_final, z_final = forward(test_X, W1, W2, b1, b2)
    print("final error_rate:", error_rate(p_final, test_Y))

    # RMSprop with momentum
    W1 = W1_copy.copy()
    b1 = b1_copy.copy()
    W2 = W2_copy.copy()
    b2 = b2_copy.copy()

    # hyperparams
    learning_rate = 0.001
    decay_rate = 0.999
    mu = 0.9
    eps = 1e-8

    # rmsprop cache (initialized to 1 so the first steps are not too large)
    cache_W1 = 1
    cache_W2 = 1
    cache_b1 = 1
    cache_b2 = 1

    # momentum
    dW1 = 0
    dW2 = 0
    db1 = 0
    db2 = 0

    loss_rmsprop_m = []
    error_rmsprop_m = []
    t = 1
    for i in range(max_iter):
        shuffle_X, shuffle_Y = shuffle(train_X, train_Y_ind)
        for j in range(batch_num):
            x = shuffle_X[j * batch_size:(j * batch_size + batch_size), :]
            y = shuffle_Y[j * batch_size:(j * batch_size + batch_size), :]
            pY, Z = forward(x, W1, W2, b1, b2)

            # update cache, velocity, and weights for each parameter
            gW2 = derivative_w2(Z, y, pY) + reg * W2
            cache_W2 = decay_rate * cache_W2 + (1 - decay_rate) * gW2 * gW2
            dW2 = mu * dW2 - (1 - mu) * learning_rate * gW2 / np.sqrt(cache_W2 + eps)
            W2 += dW2

            gb2 = derivative_b2(y, pY) + reg * b2
            cache_b2 = decay_rate * cache_b2 + (1 - decay_rate) * gb2 * gb2
            db2 = mu * db2 - (1 - mu) * learning_rate * gb2 / np.sqrt(cache_b2 + eps)
            b2 += db2

            gW1 = derivative_w1(x, Z, y, pY, W2) + reg * W1
            cache_W1 = decay_rate * cache_W1 + (1 - decay_rate) * gW1 * gW1
            dW1 = mu * dW1 - (1 - mu) * learning_rate * gW1 / np.sqrt(cache_W1 + eps)
            W1 += dW1

            gb1 = derivative_b1(Z, y, pY, W2) + reg * b1
            cache_b1 = decay_rate * cache_b1 + (1 - decay_rate) * gb1 * gb1
            db1 = mu * db1 - (1 - mu) * learning_rate * gb1 / np.sqrt(cache_b1 + eps)
            b1 += db1

            # # alternative: update all caches, then velocities, then weights
            # cache_W1 = decay_rate * cache_W1 + (1 - decay_rate) * gW1 * gW1
            # cache_W2 = decay_rate * cache_W2 + (1 - decay_rate) * gW2 * gW2
            # cache_b1 = decay_rate * cache_b1 + (1 - decay_rate) * gb1 * gb1
            # cache_b2 = decay_rate * cache_b2 + (1 - decay_rate) * gb2 * gb2
            # dW2 = mu * dW2 + (1 - mu) * learning_rate * gW2 / (np.sqrt(cache_W2) + eps)
            # db2 = mu * db2 + (1 - mu) * learning_rate * gb2 / (np.sqrt(cache_b2) + eps)
            # dW1 = mu * dW1 + (1 - mu) * learning_rate * gW1 / (np.sqrt(cache_W1) + eps)
            # db1 = mu * db1 + (1 - mu) * learning_rate * gb1 / (np.sqrt(cache_b1) + eps)
            # W2 -= dW2
            # b2 -= db2
            # W1 -= dW1
            # b1 -= db1

            if j % print_period == 0:
                p_test, Z_test = forward(test_X, W1, W2, b1, b2)
                l = cost(p_test, test_Y_ind)
                e = error_rate(p_test, test_Y)
                loss_rmsprop_m.append(l)
                error_rmsprop_m.append(e)
                print("cost at iteration i=%d, j=%d: %.6f" % (i, j, l))
                print("error_rate: ", e)

    p_final, z_final = forward(test_X, W1, W2, b1, b2)
    print("final error_rate:", error_rate(p_final, test_Y))

    plt.plot(loss_adam, label="adam")
    plt.plot(loss_rmsprop_m, label="rmsprop with momentum")
    plt.legend()
    plt.show()
def main():
    max_iter = 20
    print_period = 50

    train_X, test_X, train_Y, test_Y = get_normalized_data()
    learning_rate = 0.00004
    reg = 0.01

    train_Y_ind = indicator(train_Y)
    test_Y_ind = indicator(test_Y)

    N, D = train_X.shape
    batch_size = 500
    batch_num = N // batch_size

    M = 300
    K = 10
    W1 = np.random.randn(D, M) / np.sqrt(D)
    b1 = np.zeros(M)
    W2 = np.random.randn(M, K) / np.sqrt(M)
    b2 = np.zeros(K)

    # save initial weights and biases so both runs start from the same point
    W1_copy = W1.copy()
    b1_copy = b1.copy()
    W2_copy = W2.copy()
    b2_copy = b2.copy()

    # constant learning rate
    loss_constant = []
    error_constant = []
    for i in range(max_iter):
        shuffle_X, shuffle_Y = shuffle(train_X, train_Y_ind)
        for j in range(batch_num):
            x = shuffle_X[j * batch_size:(j * batch_size + batch_size), :]
            y = shuffle_Y[j * batch_size:(j * batch_size + batch_size), :]
            pY, Z = forward(x, W1, W2, b1, b2)

            W2 -= learning_rate * (derivative_w2(Z, y, pY) + reg * W2)
            b2 -= learning_rate * (derivative_b2(y, pY) + reg * b2)
            W1 -= learning_rate * (derivative_w1(x, Z, y, pY, W2) + reg * W1)
            b1 -= learning_rate * (derivative_b1(Z, y, pY, W2) + reg * b1)

            if j % print_period == 0:
                p_test, Z_test = forward(test_X, W1, W2, b1, b2)
                l = cost(p_test, test_Y_ind)
                e = error_rate(p_test, test_Y)
                loss_constant.append(l)
                error_constant.append(e)
                print("cost at iteration i=%d, j=%d: %.6f" % (i, j, l))
                print("error_rate: ", e)

    p_final, z_final = forward(test_X, W1, W2, b1, b2)
    print("final error_rate:", error_rate(p_final, test_Y))

    # RMSprop (adaptive, non-constant effective learning rate)
    W1 = W1_copy.copy()
    b1 = b1_copy.copy()
    W2 = W2_copy.copy()
    b2 = b2_copy.copy()

    learning_rate_0 = 0.001
    loss_non_constant = []
    error_non_constant = []
    cache_W1 = 1
    cache_W2 = 1
    cache_b1 = 1
    cache_b2 = 1
    decay_rate = 0.999
    eps = 1e-10
    for i in range(max_iter):
        shuffle_X, shuffle_Y = shuffle(train_X, train_Y_ind)
        for j in range(batch_num):
            x = shuffle_X[j * batch_size:(j * batch_size + batch_size), :]
            y = shuffle_Y[j * batch_size:(j * batch_size + batch_size), :]
            pY, Z = forward(x, W1, W2, b1, b2)

            gW2 = derivative_w2(Z, y, pY) + reg * W2
            cache_W2 = decay_rate * cache_W2 + (1 - decay_rate) * gW2 * gW2
            W2 -= learning_rate_0 * gW2 / (np.sqrt(cache_W2) + eps)

            gb2 = derivative_b2(y, pY) + reg * b2
            cache_b2 = decay_rate * cache_b2 + (1 - decay_rate) * gb2 * gb2
            b2 -= learning_rate_0 * gb2 / (np.sqrt(cache_b2) + eps)

            gW1 = derivative_w1(x, Z, y, pY, W2) + reg * W1
            cache_W1 = decay_rate * cache_W1 + (1 - decay_rate) * gW1 * gW1
            W1 -= learning_rate_0 * gW1 / (np.sqrt(cache_W1) + eps)

            gb1 = derivative_b1(Z, y, pY, W2) + reg * b1
            cache_b1 = decay_rate * cache_b1 + (1 - decay_rate) * gb1 * gb1
            b1 -= learning_rate_0 * gb1 / (np.sqrt(cache_b1) + eps)

            if j % print_period == 0:
                p_test, Z_test = forward(test_X, W1, W2, b1, b2)
                l = cost(p_test, test_Y_ind)
                e = error_rate(p_test, test_Y)
                loss_non_constant.append(l)
                error_non_constant.append(e)
                print("cost at iteration i=%d, j=%d: %.6f" % (i, j, l))
                print("error_rate: ", e)

    p_final, z_final = forward(test_X, W1, W2, b1, b2)
    print("final error_rate:", error_rate(p_final, test_Y))

    plt.plot(loss_constant, label="constant learning rate")
    plt.plot(loss_non_constant, label="RMSprop")
    plt.legend()
    plt.show()
def main():
    max_iter = 20
    print_period = 50

    train_X, test_X, train_Y, test_Y = get_normalized_data()
    # learning_rate = 0.00004
    reg = 0.01

    train_Y_ind = indicator(train_Y)
    test_Y_ind = indicator(test_Y)

    N, D = train_X.shape
    batch_size = 500
    batch_num = N // batch_size

    M = 300
    K = 10
    W1_init = np.random.randn(D, M) / np.sqrt(D)
    b1_init = np.zeros(M)
    W2_init = np.random.randn(M, K) / np.sqrt(M)
    b2_init = np.zeros(K)

    # 1st moment (same shapes as the corresponding parameters)
    mW1_init = np.zeros((D, M))
    mW2_init = np.zeros((M, K))
    mb1_init = np.zeros(M)
    mb2_init = np.zeros(K)

    # 2nd moment (same shapes as the corresponding parameters)
    vW1_init = np.zeros((D, M))
    vW2_init = np.zeros((M, K))
    vb1_init = np.zeros(M)
    vb2_init = np.zeros(K)

    # hyperparams
    learning_rate = 0.001
    beta1 = 0.99
    beta2 = 0.999
    eps = 1e-8

    # other parameters
    t_init = 1

    # initialize theano variables
    thX = T.matrix('X')
    thT = T.matrix('T')
    W1 = theano.shared(W1_init, 'W1')
    W2 = theano.shared(W2_init, 'W2')
    b1 = theano.shared(b1_init, 'b1')
    b2 = theano.shared(b2_init, 'b2')
    mW1 = theano.shared(mW1_init, 'mW1')
    mW2 = theano.shared(mW2_init, 'mW2')
    mb1 = theano.shared(mb1_init, 'mb1')
    mb2 = theano.shared(mb2_init, 'mb2')
    vW1 = theano.shared(vW1_init, 'vW1')
    vW2 = theano.shared(vW2_init, 'vW2')
    vb1 = theano.shared(vb1_init, 'vb1')
    vb2 = theano.shared(vb2_init, 'vb2')
    t = theano.shared(t_init, 't')

    # activation function and softmax output
    tZ = relu(thX.dot(W1) + b1)
    t_pY = T.nnet.softmax(tZ.dot(W2) + b2)

    # cost and prediction function
    cost = -(thT * T.log(t_pY)).sum() + reg * ((W1 * W1).sum() + (W2 * W2).sum() + (b1 * b1).sum() + (b2 * b2).sum())
    prediction = T.argmax(t_pY, axis=1)

    # training
    # gradients
    gW2 = T.grad(cost, W2)
    gb2 = T.grad(cost, b2)
    gW1 = T.grad(cost, W1)
    gb1 = T.grad(cost, b1)

    # update 1st moment
    update_mW1 = beta1 * mW1 + (1 - beta1) * gW1
    update_mW2 = beta1 * mW2 + (1 - beta1) * gW2
    update_mb1 = beta1 * mb1 + (1 - beta1) * gb1
    update_mb2 = beta1 * mb2 + (1 - beta1) * gb2

    # update 2nd moment
    update_vW1 = beta2 * vW1 + (1 - beta2) * gW1 * gW1
    update_vW2 = beta2 * vW2 + (1 - beta2) * gW2 * gW2
    update_vb1 = beta2 * vb1 + (1 - beta2) * gb1 * gb1
    update_vb2 = beta2 * vb2 + (1 - beta2) * gb2 * gb2

    # bias correction (use the new moment estimates, since theano applies all updates simultaneously)
    correction_1 = 1 - beta1**t
    correction_2 = 1 - beta2**t
    mW1_hat = update_mW1 / correction_1
    mW2_hat = update_mW2 / correction_1
    mb1_hat = update_mb1 / correction_1
    mb2_hat = update_mb2 / correction_1
    vW1_hat = update_vW1 / correction_2
    vW2_hat = update_vW2 / correction_2
    vb1_hat = update_vb1 / correction_2
    vb2_hat = update_vb2 / correction_2

    # parameter updates
    update_t = t + 1
    update_W2 = W2 - learning_rate * mW2_hat / T.sqrt(vW2_hat + eps)
    update_b2 = b2 - learning_rate * mb2_hat / T.sqrt(vb2_hat + eps)
    update_b1 = b1 - learning_rate * mb1_hat / T.sqrt(vb1_hat + eps)
    update_W1 = W1 - learning_rate * mW1_hat / T.sqrt(vW1_hat + eps)

    train = theano.function(
        inputs=[thX, thT],
        updates=[(W1, update_W1), (W2, update_W2), (b1, update_b1), (b2, update_b2),
                 (mW1, update_mW1), (mW2, update_mW2), (mb1, update_mb1), (mb2, update_mb2),
                 (vW1, update_vW1), (vW2, update_vW2), (vb1, update_vb1), (vb2, update_vb2),
                 (t, update_t)],
    )
    get_prediction = theano.function(inputs=[thX, thT], outputs=[cost, prediction])

    costs = []
    for i in range(max_iter):
        shuffle_X, shuffle_Y = shuffle(train_X, train_Y_ind)
        for j in range(batch_num):
            x = shuffle_X[j * batch_size:(j * batch_size + batch_size), :]
            y = shuffle_Y[j * batch_size:(j * batch_size + batch_size), :]
            train(x, y)

            if j % print_period == 0:
                cost_val, test_pY = get_prediction(test_X, test_Y_ind)
                error = error_rate(test_pY, test_Y)
                print("Cost / err at iteration i=%d, j=%d: %.3f / %.3f" % (i, j, cost_val, error))
                costs.append(cost_val)

    plt.plot(costs)
    plt.show()
def main():
    max_iter = 20
    print_period = 50

    train_X, test_X, train_Y, test_Y = get_normalized_data()
    learning_rate = 0.00004
    reg = 0.01

    train_Y_ind = indicator(train_Y)
    test_Y_ind = indicator(test_Y)

    N, D = train_X.shape
    batch_size = 500
    batch_num = N // batch_size

    M1 = 300
    M2 = 100
    K = 10
    W1_init = np.random.randn(D, M1) / np.sqrt(D)
    b1_init = np.zeros(M1)
    W2_init = np.random.randn(M1, M2) / np.sqrt(M1)
    b2_init = np.zeros(M2)
    W3_init = np.random.randn(M2, K) / np.sqrt(M2)
    b3_init = np.zeros(K)

    # initialize tensorflow variables
    X = tf.placeholder(tf.float32, shape=(None, D), name='X')
    T = tf.placeholder(tf.float32, shape=(None, K), name='T')
    W1 = tf.Variable(W1_init.astype(np.float32))
    b1 = tf.Variable(b1_init.astype(np.float32))
    W2 = tf.Variable(W2_init.astype(np.float32))
    b2 = tf.Variable(b2_init.astype(np.float32))
    W3 = tf.Variable(W3_init.astype(np.float32))
    b3 = tf.Variable(b3_init.astype(np.float32))

    # define the model
    Z1 = tf.nn.relu(tf.matmul(X, W1) + b1)
    Z2 = tf.nn.relu(tf.matmul(Z1, W2) + b2)
    Y_temp = tf.matmul(Z2, W3) + b3

    cost = tf.reduce_sum(
        tf.nn.softmax_cross_entropy_with_logits_v2(logits=Y_temp, labels=T))
    train_op = tf.train.RMSPropOptimizer(learning_rate, decay=0.99, momentum=0.9).minimize(cost)
    prediction_op = tf.argmax(Y_temp, axis=1)

    costs = []
    init = tf.global_variables_initializer()
    with tf.Session() as session:
        session.run(init)
        for i in range(max_iter):
            shuffle_X, shuffle_Y = shuffle(train_X, train_Y_ind)
            for j in range(batch_num):
                x = shuffle_X[j * batch_size:(j * batch_size + batch_size), :]
                y = shuffle_Y[j * batch_size:(j * batch_size + batch_size), :]
                session.run(train_op, feed_dict={X: x, T: y})

                if j % print_period == 0:
                    test_cost = session.run(cost, feed_dict={X: test_X, T: test_Y_ind})
                    prediction = session.run(prediction_op, feed_dict={X: test_X})
                    error = error_rate(prediction, test_Y)
                    print("Cost / err at iteration i=%d, j=%d: %.3f / %.3f" % (i, j, test_cost, error))
                    costs.append(test_cost)

    plt.plot(costs)
    plt.show()
def main():
    train_X, test_X, train_Y, test_Y = get_normalized_data()
    model = ANN([500, 300], [0.8, 0.5, 0.5])
    model.fit(train_X, train_Y, test_X, test_Y, show_fig=True)
def main():
    max_iter = 20
    print_period = 50

    train_X, test_X, train_Y, test_Y = get_normalized_data()
    learning_rate = 0.00004
    reg = 0.01

    train_Y_ind = indicator(train_Y)
    test_Y_ind = indicator(test_Y)

    N, D = train_X.shape
    batch_size = 500
    batch_num = N // batch_size

    M = 300
    K = 10
    W1 = np.random.randn(D, M) / np.sqrt(D)
    b1 = np.zeros(M)
    W2 = np.random.randn(M, K) / np.sqrt(M)
    b2 = np.zeros(K)

    # save initial weights and biases so all three runs start from the same point
    W1_copy = W1.copy()
    b1_copy = b1.copy()
    W2_copy = W2.copy()
    b2_copy = b2.copy()

    # plain mini-batch gradient descent
    loss_batch = []
    error_batch = []
    for i in range(max_iter):
        shuffle_X, shuffle_Y = shuffle(train_X, train_Y_ind)
        for j in range(batch_num):
            x = shuffle_X[j * batch_size:(j * batch_size + batch_size), :]
            y = shuffle_Y[j * batch_size:(j * batch_size + batch_size), :]
            pY, Z = forward(x, W1, W2, b1, b2)

            W2 -= learning_rate * (derivative_w2(Z, y, pY) + reg * W2)
            b2 -= learning_rate * (derivative_b2(y, pY) + reg * b2)
            W1 -= learning_rate * (derivative_w1(x, Z, y, pY, W2) + reg * W1)
            b1 -= learning_rate * (derivative_b1(Z, y, pY, W2) + reg * b1)

            if j % print_period == 0:
                p_test, Z_test = forward(test_X, W1, W2, b1, b2)
                l = cost(p_test, test_Y_ind)
                e = error_rate(p_test, test_Y)
                loss_batch.append(l)
                error_batch.append(e)
                print("cost at iteration i=%d, j=%d: %.6f" % (i, j, l))
                print("error_rate: ", e)

    p_final, z_final = forward(test_X, W1, W2, b1, b2)
    print("final error_rate:", error_rate(p_final, test_Y))

    # momentum
    W1 = W1_copy.copy()
    b1 = b1_copy.copy()
    W2 = W2_copy.copy()
    b2 = b2_copy.copy()

    loss_momentum = []
    error_momentum = []
    mu = 0.9
    dW1 = 0
    dW2 = 0
    db1 = 0
    db2 = 0
    for i in range(max_iter):
        shuffle_X, shuffle_Y = shuffle(train_X, train_Y_ind)
        for j in range(batch_num):
            x = shuffle_X[j * batch_size:(j * batch_size + batch_size), :]
            y = shuffle_Y[j * batch_size:(j * batch_size + batch_size), :]
            pY, Z = forward(x, W1, W2, b1, b2)

            gW2 = derivative_w2(Z, y, pY) + reg * W2
            gb2 = derivative_b2(y, pY) + reg * b2
            gW1 = derivative_w1(x, Z, y, pY, W2) + reg * W1
            gb1 = derivative_b1(Z, y, pY, W2) + reg * b1

            # update velocities
            dW2 = mu * dW2 - learning_rate * gW2
            db2 = mu * db2 - learning_rate * gb2
            dW1 = mu * dW1 - learning_rate * gW1
            db1 = mu * db1 - learning_rate * gb1

            # update weights
            W2 += dW2
            b2 += db2
            W1 += dW1
            b1 += db1

            if j % print_period == 0:
                p_test, Z_test = forward(test_X, W1, W2, b1, b2)
                l = cost(p_test, test_Y_ind)
                e = error_rate(p_test, test_Y)
                loss_momentum.append(l)
                error_momentum.append(e)
                print("cost at iteration i=%d, j=%d: %.6f" % (i, j, l))
                print("error_rate: ", e)

    p_final, z_final = forward(test_X, W1, W2, b1, b2)
    print("final error_rate:", error_rate(p_final, test_Y))

    # Nesterov momentum
    W1 = W1_copy.copy()
    b1 = b1_copy.copy()
    W2 = W2_copy.copy()
    b2 = b2_copy.copy()

    loss_nesterov = []
    error_nesterov = []
    mu = 0.9
    dW1 = 0
    db1 = 0
    dW2 = 0
    db2 = 0
    for i in range(max_iter):
        shuffle_X, shuffle_Y = shuffle(train_X, train_Y_ind)
        for j in range(batch_num):
            x = shuffle_X[j * batch_size:(j * batch_size + batch_size), :]
            y = shuffle_Y[j * batch_size:(j * batch_size + batch_size), :]
            pY, Z = forward(x, W1, W2, b1, b2)

            gW2 = derivative_w2(Z, y, pY) + reg * W2
            gb2 = derivative_b2(y, pY) + reg * b2
            gW1 = derivative_w1(x, Z, y, pY, W2) + reg * W1
            gb1 = derivative_b1(Z, y, pY, W2) + reg * b1

            # update velocities
            dW2 = mu * dW2 - learning_rate * gW2
            db2 = mu * db2 - learning_rate * gb2
            dW1 = mu * dW1 - learning_rate * gW1
            db1 = mu * db1 - learning_rate * gb1

            # update weights (Nesterov lookahead step)
            W2 += mu * dW2 - learning_rate * gW2
            b2 += mu * db2 - learning_rate * gb2
            W1 += mu * dW1 - learning_rate * gW1
            b1 += mu * db1 - learning_rate * gb1

            if j % print_period == 0:
                p_test, Z_test = forward(test_X, W1, W2, b1, b2)
                l = cost(p_test, test_Y_ind)
                e = error_rate(p_test, test_Y)
                loss_nesterov.append(l)
                error_nesterov.append(e)
                print("cost at iteration i=%d, j=%d: %.6f" % (i, j, l))
                print("error_rate: ", e)

    p_final, z_final = forward(test_X, W1, W2, b1, b2)
    print("final error_rate:", error_rate(p_final, test_Y))

    plt.plot(loss_batch, label="batch")
    plt.plot(loss_momentum, label="momentum")
    plt.plot(loss_nesterov, label="Nesterov")
    plt.legend()
    plt.show()