import sys
from datetime import datetime

import numpy as np
import matplotlib.pyplot as plt
from sklearn.utils import shuffle

# Relies on get_pca_normalized_data, T_indicator, forward, gradW, gradb, J,
# predict and accuracy defined elsewhere in the project (a hypothetical sketch
# of these helpers follows the script).


def main():
    X_train, X_test, t_train, t_test = get_pca_normalized_data()
    print("Performing multi-class logistic regression...\n")

    N, D = X_train.shape
    K = 10
    T_train = T_indicator(t_train)
    T_test = T_indicator(t_test)

    lr = float(sys.argv[1])
    reg = float(sys.argv[2])
    batch_size = int(sys.argv[3])

    ######## 1. FULL GRADIENT DESCENT ########
    print('Full Gradient Descent')
    W = np.random.randn(D, K) / np.sqrt(D)
    b = np.zeros(K)
    J_test_full = []
    t0 = datetime.now()
    for epoch in range(50):
        Y_train = forward(X_train, W, b)
        # L2 penalty adds reg * param to the gradient of the cost
        W -= lr * (gradW(T_train, Y_train, X_train) + reg * W)
        b -= lr * (gradb(T_train, Y_train) + reg * b)

        Y_test = forward(X_test, W, b)
        j_test = J(T_test, Y_test)
        J_test_full.append(j_test)

        if epoch % 1 == 0:
            err = accuracy(predict(Y_test), t_test)
            if epoch % 10 == 0:
                print("Epoch {}:\tcost: {}\taccuracy: {}".format(
                    epoch, round(j_test, 4), err))

    Y_test = forward(X_test, W, b)
    print("Final accuracy:", accuracy(predict(Y_test), t_test))
    print("Elapsed time for full GD: {}\n".format(datetime.now() - t0))

    ######## 2. STOCHASTIC GRADIENT DESCENT ########
    print('Stochastic Gradient Descent')
    W = np.random.randn(D, K) / np.sqrt(D)
    b = np.zeros(K)
    J_test_stochastic = []
    t0 = datetime.now()
    for epoch in range(50):  # takes very long since we're computing the cost for 41k samples
        tmpX, tmpT = shuffle(X_train, T_train)
        for n in range(min(N, 500)):  # shortcut so it won't take so long...
            x = tmpX[n, :].reshape(1, D)
            t = tmpT[n, :].reshape(1, 10)
            Y_train = forward(x, W, b)
            W -= lr * (gradW(t, Y_train, x) + reg * W)
            b -= lr * (gradb(t, Y_train) + reg * b)

        Y_test = forward(X_test, W, b)
        j_test = J(T_test, Y_test)
        J_test_stochastic.append(j_test)

        if epoch % 1 == 0:
            err = accuracy(predict(Y_test), t_test)
            if epoch % 10 == 0:
                print("Epoch {}:\tcost: {}\taccuracy: {}".format(
                    epoch, round(j_test, 4), err))

    Y_test_final = forward(X_test, W, b)
    print("Final accuracy:", accuracy(predict(Y_test_final), t_test))
    print("Elapsed time for SGD: {}\n".format(datetime.now() - t0))

    ######## 3. BATCH GRADIENT DESCENT ########
    print('Batch Gradient Descent')
    W = np.random.randn(D, K) / np.sqrt(D)
    b = np.zeros(K)
    J_test_batch = []
    nb_batches = N // batch_size
    t0 = datetime.now()
    for epoch in range(50):
        tmpX, tmpT = shuffle(X_train, T_train)
        for batch_index in range(nb_batches):
            x = tmpX[batch_index * batch_size:(batch_index + 1) * batch_size, :]
            t = tmpT[batch_index * batch_size:(batch_index + 1) * batch_size, :]
            Y_train = forward(x, W, b)
            W -= lr * (gradW(t, Y_train, x) + reg * W)
            b -= lr * (gradb(t, Y_train) + reg * b)

        Y_test = forward(X_test, W, b)
        j_test = J(T_test, Y_test)
        J_test_batch.append(j_test)

        if epoch % 1 == 0:
            err = accuracy(predict(Y_test), t_test)
            if epoch % 10 == 0:
                print("Epoch {}\tcost: {}\taccuracy: {}".format(
                    epoch, round(j_test, 4), err))

    Y_test_final = forward(X_test, W, b)
    print("Final accuracy:", accuracy(predict(Y_test_final), t_test))
    print("Elapsed time for batch GD:", datetime.now() - t0)

    ######## PLOTS ########
    x1 = np.linspace(0, 1, len(J_test_full))
    plt.plot(x1, J_test_full, label="full")
    x2 = np.linspace(0, 1, len(J_test_stochastic))
    plt.plot(x2, J_test_stochastic, label="stochastic")
    x3 = np.linspace(0, 1, len(J_test_batch))
    plt.plot(x3, J_test_batch, label="batch")
    plt.legend()
    # plt.savefig('full_vs_stoch_vs_batch_lr={}_reg={}_batch_size={}.png'.format(lr, reg, batch_size))
    plt.show()
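# --- Hypothetical helper sketch (not part of the original file) -----------------
# The script above calls forward, gradW, gradb, J, predict, accuracy and
# T_indicator without defining them. The definitions below are a minimal sketch,
# assuming they implement plain softmax regression in the way main() uses them;
# the real project versions may differ.
import numpy as np


def forward(X, W, b):
    # softmax over the linear scores
    A = X.dot(W) + b
    expA = np.exp(A - A.max(axis=1, keepdims=True))
    return expA / expA.sum(axis=1, keepdims=True)


def gradW(T, Y, X):
    # gradient of the (unregularized) cross-entropy cost w.r.t. W
    return X.T.dot(Y - T)


def gradb(T, Y):
    return (Y - T).sum(axis=0)


def J(T, Y):
    # total cross-entropy cost
    return -(T * np.log(Y)).sum()


def predict(Y):
    return np.argmax(Y, axis=1)


def accuracy(prediction, t):
    return np.mean(prediction == t)


def T_indicator(t):
    # one-hot encode integer targets
    t = t.astype(np.int32)
    T = np.zeros((len(t), t.max() + 1))
    T[np.arange(len(t)), t] = 1
    return T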
from keras.models import Model
from keras.layers import Dense, Input

import matplotlib.pyplot as plt

from util import get_normalized_data, T_indicator

X_train, X_test, t_train, t_test = get_normalized_data()

# get shapes
N, D = X_train.shape
K = len(set(t_train))
T_train = T_indicator(t_train)
T_test = T_indicator(t_test)

# ANN with layers [784] -> [500] -> [300] -> [10]
x = Input(shape=(D,))
a = Dense(500, activation='relu')(x)
a = Dense(300, activation='relu')(a)
a = Dense(K, activation='softmax')(a)

model = Model(inputs=x, outputs=a)

model.compile(
    loss='categorical_crossentropy',
    optimizer='adam',
    metrics=['accuracy']
)

# note: there are multiple ways to choose a backend
# (theano, tensorflow, or cntk)
# https://keras.io/backend/
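# A minimal training/evaluation sketch (not part of the original snippet): the
# model above is only compiled, so something like the following would fit it and
# plot the learning curves. The epochs and batch_size values are illustrative
# assumptions, not tuned settings.
r = model.fit(
    X_train, T_train,
    validation_data=(X_test, T_test),
    epochs=15,
    batch_size=32
)

# fit() returns a History object whose .history dict holds per-epoch metrics
plt.plot(r.history['loss'], label='loss')
plt.plot(r.history['val_loss'], label='val_loss')
plt.legend()
plt.show()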
import numpy as np
import matplotlib.pyplot as plt

import theano
import theano.tensor as Tens
from theano.tensor.nnet import relu

# get_normalized_data, T_indicator and accuracy are assumed to come from the
# project's util module, as in the other scripts.
from util import get_normalized_data, T_indicator, accuracy


def main():
    # STEP 1: get the data and define all the usual variables
    X_train, X_test, t_train, t_test = get_normalized_data()
    X_train = X_train.astype(np.float32)
    X_test = X_test.astype(np.float32)
    t_train = t_train.astype(np.float32)
    t_test = t_test.astype(np.float32)
    T_train = T_indicator(t_train).astype(np.float32)
    T_test = T_indicator(t_test).astype(np.float32)

    # Dimensionality
    N, D = X_train.shape
    M = 300
    K = 10
    max_iter = 20
    print_period = 10
    batch_size = 500
    nb_batches = N // batch_size

    # Hyperparameters
    lr = 0.0004
    reg = 0.01

    # Initialize weights
    W0_init = np.random.randn(D, M) / np.sqrt(D)
    b0_init = np.zeros(M)
    W1_init = np.random.randn(M, K) / np.sqrt(M)
    b1_init = np.zeros(K)

    # STEP 2: define Theano variables and expressions
    # data and labels to go into the inputs
    thX = Tens.matrix('X')
    thT = Tens.matrix('T')

    # weights to be updated
    W0 = theano.shared(W0_init, 'W0')
    b0 = theano.shared(b0_init, 'b0')
    W1 = theano.shared(W1_init, 'W1')
    b1 = theano.shared(b1_init, 'b1')

    # use the built-in Theano functions for the forward pass
    thA = relu(thX.dot(W0) + b0)
    thY = Tens.nnet.softmax(thA.dot(W1) + b1)

    # define the cost function
    J = -(thT * Tens.log(thY)).sum() + reg * ((W1 * W1).sum() + (b1 * b1).sum() +
                                              (W0 * W0).sum() + (b0 * b0).sum())
    prediction = Tens.argmax(thY, axis=1)

    # STEP 3: training expressions and functions
    update_W1 = W1 - lr * Tens.grad(J, W1)
    update_b1 = b1 - lr * Tens.grad(J, b1)
    update_W0 = W0 - lr * Tens.grad(J, W0)
    update_b0 = b0 - lr * Tens.grad(J, b0)

    train = theano.function(
        inputs=[thX, thT],
        updates=[(W1, update_W1), (b1, update_b1), (W0, update_W0), (b0, update_b0)],
    )

    # cost and prediction over the whole dataset
    prediction_state = theano.function(inputs=[thX, thT], outputs=[J, prediction])

    Js = []
    for epoch in range(max_iter):
        for batch_index in range(nb_batches):
            X_batch = X_train[batch_index * batch_size:(batch_index + 1) * batch_size, ]
            T_batch = T_train[batch_index * batch_size:(batch_index + 1) * batch_size, ]

            train(X_batch, T_batch)

            if batch_index % print_period == 0:
                J_test, prediction_test = prediction_state(X_test, T_test)
                Js.append(J_test)
                print('Epoch {}\t batch_index {}:\t J {}\t accuracy {}'.format(
                    epoch, batch_index, J_test, accuracy(prediction_test, t_test)))

    plt.plot(Js)
    plt.savefig('cost_theano_nnet.png')
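# --- Illustration (not from the original file) ----------------------------------
# A minimal, self-contained sketch of the theano.shared / updates mechanism that
# the train() function above relies on: each call to the compiled function
# evaluates the update expression and writes the result back into the shared
# variable. The toy cost and step size here are arbitrary assumptions.
import numpy as np
import theano
import theano.tensor as Tens

w = theano.shared(np.float64(5.0), 'w')  # parameter stored on the Theano side
x = Tens.scalar('x')
cost = (w - x) ** 2                      # toy quadratic cost

step = theano.function(
    inputs=[x],
    outputs=cost,
    updates=[(w, w - 0.1 * Tens.grad(cost, w))],  # gradient step applied in place
)

for _ in range(100):
    step(3.0)
print(w.get_value())  # approaches 3.0, the minimizer of the toy cost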
import numpy as np
import matplotlib.pyplot as plt
import tensorflow as tf  # written against the TensorFlow 1.x API

# get_normalized_data, T_indicator and accuracy are assumed to come from the
# project's util module, as in the other scripts.
from util import get_normalized_data, T_indicator, accuracy


def main():
    ## STEP 1: get the data and define all the usual variables
    X_train, X_test, t_train, t_test = get_normalized_data()
    T_train = T_indicator(t_train)
    T_test = T_indicator(t_test)

    # Dimensionality
    max_iter = 15
    print_period = 50
    N, D = X_train.shape
    M1 = 300
    M2 = 100
    K = 10
    batch_size = 500
    nb_batches = N // batch_size
    print('Dim: N:{}\t D:{}\t M1:{}\t M2:{}\t K:{}\t batch_size:{}\t nb_batches={}'
          .format(N, D, M1, M2, K, batch_size, nb_batches))

    # Hyperparameters
    lr = 0.0004
    reg = 0.01
    print('HP: lr:{}\t reg:{}'.format(lr, reg))

    # Weights initialization (np.sqrt(D) ~ 28 for D = 784)
    W0_init = np.random.randn(D, M1) / 28
    b0_init = np.zeros(M1)
    W1_init = np.random.randn(M1, M2) / np.sqrt(M1)
    b1_init = np.zeros(M2)
    W2_init = np.random.randn(M2, K) / np.sqrt(M2)
    b2_init = np.zeros(K)

    ## STEP 2: define TensorFlow variables and expressions
    X = tf.placeholder(tf.float32, shape=(None, D), name='X')
    T = tf.placeholder(tf.float32, shape=(None, K), name='T')
    W0 = tf.Variable(W0_init.astype(np.float32))
    b0 = tf.Variable(b0_init.astype(np.float32))
    W1 = tf.Variable(W1_init.astype(np.float32))
    b1 = tf.Variable(b1_init.astype(np.float32))
    W2 = tf.Variable(W2_init.astype(np.float32))
    b2 = tf.Variable(b2_init.astype(np.float32))

    A1 = tf.nn.relu(tf.matmul(X, W0) + b0)
    A2 = tf.nn.relu(tf.matmul(A1, W1) + b1)
    # U and not Y because the softmax is taken care of while calculating the cost
    U = tf.matmul(A2, W2) + b2

    J = tf.reduce_sum(
        tf.nn.softmax_cross_entropy_with_logits_v2(logits=U, labels=T))

    ## STEP 3: GD update expressions, training and prediction ops
    # The optimizers are already implemented;
    # let's go with RMSProp, which includes momentum.
    train_op = tf.train.RMSPropOptimizer(lr, decay=0.99, momentum=0.9).minimize(J)
    predict_op = tf.argmax(U, 1)

    ## STEP 4: TRAINING
    costs = []
    init = tf.global_variables_initializer()
    with tf.Session() as session:
        session.run(init)

        for epoch in range(max_iter):
            for batch_index in range(nb_batches):
                X_batch = X_train[batch_index * batch_size:(batch_index + 1) * batch_size, ]
                T_batch = T_train[batch_index * batch_size:(batch_index + 1) * batch_size, ]

                session.run(train_op, feed_dict={X: X_batch, T: T_batch})

                if batch_index % print_period == 0:
                    j = session.run(J, feed_dict={X: X_test, T: T_test})
                    costs.append(j)
                    prediction = session.run(predict_op, feed_dict={X: X_test})
                    acc = accuracy(prediction, t_test)
                    print('Epoch: {}\t cost:{}\t accuracy:{}'.format(
                        epoch, round(j, 3), round(acc, 3)))

    plt.plot(costs)
    plt.savefig(
        './experiments/tf_NN_RMSProp_N={}_D={}_M1={}_M2={}_K={}_batch_size={}_nb_batches={}_lr={}_reg={}.png'
        .format(N, D, M1, M2, K, batch_size, nb_batches, lr, reg))
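# --- Illustration (not from the original file) ----------------------------------
# Self-contained sketch showing how the prediction accuracy could also be computed
# inside the TF 1.x graph with tf.equal / tf.reduce_mean, instead of pulling the
# argmax back into numpy and calling the accuracy() helper. The placeholder names
# and the random data are assumptions for the demo only.
import numpy as np
import tensorflow as tf

logits_ph = tf.placeholder(tf.float32, shape=(None, 10), name='logits')
targets_ph = tf.placeholder(tf.float32, shape=(None, 10), name='targets')

correct = tf.equal(tf.argmax(logits_ph, 1), tf.argmax(targets_ph, 1))
accuracy_op = tf.reduce_mean(tf.cast(correct, tf.float32))

with tf.Session() as session:
    fake_logits = np.random.randn(8, 10).astype(np.float32)
    fake_targets = np.eye(10)[np.random.randint(0, 10, size=8)].astype(np.float32)
    print(session.run(accuracy_op,
                      feed_dict={logits_ph: fake_logits, targets_ph: fake_targets}))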
from datetime import datetime

import numpy as np
import matplotlib.pyplot as plt

# Relies on get_normalized_data, T_indicator and the two-layer-network helpers
# (forward, J, J_derivative_W1/b1/W0/b0, predict, accuracy) defined elsewhere in
# the project; a hypothetical sketch of those helpers follows this script.


def main():
    max_iter = 20
    print_period = 10

    X_train, X_test, t_train, t_test = get_normalized_data()
    T_train = T_indicator(t_train)
    T_test = T_indicator(t_test)

    lr = 0.00004
    reg = 0.01

    N, D = X_train.shape
    batch_sz = 500
    nb_batches = N // batch_sz
    M = 300
    K = 10
    print('N_train = {}\t N_test = 1000\t D = {}\t M = {}\t K = {}\t batch_size = {}\t nb_batches = {}\t lr_cst = {}\n'
          .format(N, D, M, K, batch_sz, nb_batches, lr))

    # np.sqrt(D) ~ 28
    W0 = np.random.randn(D, M) / 28
    b0 = np.zeros(M)
    W1 = np.random.randn(M, K) / np.sqrt(M)
    b1 = np.zeros(K)

    # 1. CONSTANT LEARNING RATE
    print('CONSTANT LEARNING RATE')
    # t0 = datetime.now()
    J_constant_lr = []         # measured on test data every 10 batches
    accuracy_constant_lr = []  # measured on test data every 10 batches
    for epoch in range(max_iter):
        for batch_index in range(nb_batches):
            X_batch = X_train[batch_index * batch_sz:(batch_index + 1) * batch_sz, ]
            T_batch = T_train[batch_index * batch_sz:(batch_index + 1) * batch_sz, ]
            A_batch, Y_batch = forward(X_batch, W0, b0, W1, b1)

            # Updates
            W1 -= lr * J_derivative_W1(T_batch, Y_batch, A_batch)
            b1 -= lr * J_derivative_b1(T_batch, Y_batch)
            W0 -= lr * J_derivative_W0(T_batch, Y_batch, W1, A_batch, X_batch)
            b0 -= lr * J_derivative_b0(T_batch, Y_batch, W1, A_batch)

            if (batch_index % print_period) == 0:
                _, Y_test = forward(X_test, W0, b0, W1, b1)
                j_test = J(T_test, Y_test)
                J_constant_lr.append(j_test)
                acc = accuracy(predict(Y_test), t_test)
                accuracy_constant_lr.append(acc)
                print('Epoch n° {} batch n° {}:\t TEST COST {}\t TEST ACCURACY RATE: {}'
                      .format(epoch, batch_index, j_test, acc))

    _, Y_test_final = forward(X_test, W0, b0, W1, b1)
    print('Final ACCURACY RATE on TEST data: {}\n'.format(
        accuracy(predict(Y_test_final), t_test)))
    # print('Constant lr execution time: {}\n'.format(datetime.now() - t0))

    # 2. RMSProp
    print('RMSProp')
    # t0 = datetime.now()
    W0 = np.random.randn(D, M) / 28
    b0 = np.zeros(M)
    W1 = np.random.randn(M, K) / np.sqrt(M)
    b1 = np.zeros(K)

    J_RMSProp = []
    accuracy_RMSProp = []

    lr0 = 0.001  # if you set the initial lr too high you'll get NaN
    cache_W1 = 0
    cache_b1 = 0
    cache_W0 = 0
    cache_b0 = 0
    decay = 0.999
    eps = 1e-6
    for epoch in range(max_iter):
        for b_index in range(nb_batches):
            X_batch = X_train[b_index * batch_sz:(b_index + 1) * batch_sz, ]
            T_batch = T_train[b_index * batch_sz:(b_index + 1) * batch_sz, ]
            A_batch, Y_batch = forward(X_batch, W0, b0, W1, b1)

            # Updates
            gW1 = J_derivative_W1(T_batch, Y_batch, A_batch) + reg * W1
            cache_W1 = decay * cache_W1 + (1 - decay) * gW1 * gW1
            W1 -= lr0 / (np.sqrt(cache_W1) + eps) * gW1

            gb1 = J_derivative_b1(T_batch, Y_batch) + reg * b1
            cache_b1 = decay * cache_b1 + (1 - decay) * gb1 * gb1
            b1 -= lr0 / (np.sqrt(cache_b1) + eps) * gb1

            gW0 = J_derivative_W0(T_batch, Y_batch, W1, A_batch, X_batch) + reg * W0
            cache_W0 = decay * cache_W0 + (1 - decay) * gW0 * gW0
            W0 -= lr0 / (np.sqrt(cache_W0) + eps) * gW0

            gb0 = J_derivative_b0(T_batch, Y_batch, W1, A_batch) + reg * b0
            cache_b0 = decay * cache_b0 + (1 - decay) * gb0 * gb0
            b0 -= lr0 / (np.sqrt(cache_b0) + eps) * gb0

            if (b_index % print_period) == 0:
                _, Y_test = forward(X_test, W0, b0, W1, b1)
                j_test = J(T_test, Y_test)
                J_RMSProp.append(j_test)
                acc = accuracy(predict(Y_test), t_test)
                accuracy_RMSProp.append(acc)
                print('Epoch n° {} Batch n° {}:\t TEST COST: {}\t TEST ACCURACY RATE: {}'
                      .format(epoch, b_index, j_test, acc))

    _, Y_test_final = forward(X_test, W0, b0, W1, b1)
    print('Final accuracy rate on test data: {}'.format(
        accuracy(predict(Y_test_final), t_test)))
    # print('RMSProp execution time: {}'.format(datetime.now() - t0))

    plt.plot(J_constant_lr, label='constant lr')
    plt.plot(J_RMSProp, label='RMSProp')
    plt.legend()
    plt.savefig('RMSProp.png')
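# --- Hypothetical helper sketch (not part of the original file) -----------------
# The scripts in this section call forward, J, J_derivative_W1/b1/W0/b0, predict
# and accuracy without defining them. The versions below are a minimal sketch
# consistent with the call signatures, assuming ReLU hidden units and a softmax
# output; the real project helpers may differ (e.g. use a different activation).
import numpy as np


def forward(X, W0, b0, W1, b1):
    A = np.maximum(X.dot(W0) + b0, 0)           # ReLU hidden layer
    Z = A.dot(W1) + b1
    expZ = np.exp(Z - Z.max(axis=1, keepdims=True))
    Y = expZ / expZ.sum(axis=1, keepdims=True)  # softmax output
    return A, Y


def J(T, Y):
    # total cross-entropy cost
    return -(T * np.log(Y)).sum()


def J_derivative_W1(T, Y, A):
    return A.T.dot(Y - T)


def J_derivative_b1(T, Y):
    return (Y - T).sum(axis=0)


def J_derivative_W0(T, Y, W1, A, X):
    dA = (Y - T).dot(W1.T) * (A > 0)            # ReLU derivative
    return X.T.dot(dA)


def J_derivative_b0(T, Y, W1, A):
    dA = (Y - T).dot(W1.T) * (A > 0)
    return dA.sum(axis=0)


def predict(Y):
    return np.argmax(Y, axis=1)


def accuracy(prediction, t):
    return np.mean(prediction == t)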
from datetime import datetime

import numpy as np
import matplotlib.pyplot as plt

# Relies on get_normalized_data, T_indicator and the same two-layer-network
# helpers (forward, J, J_derivative_*, predict, accuracy) used by the other
# scripts in this project.


def main():
    # compare 3 scenarios:
    #   1. batch SGD
    #   2. batch SGD with momentum
    #   3. batch SGD with Nesterov momentum

    # Inputs and targets
    X_train, X_test, t_train, t_test = get_normalized_data()
    T_train = T_indicator(t_train)
    T_test = T_indicator(t_test)

    # Dimensionality and hyperparameters
    N, D = X_train.shape
    M = 300
    K = 10
    print('Dimensionality: N = {}\t D = {}\t M = {}\t K = {}'.format(N, D, M, K))

    batch_sz = 500
    n_batches = N // batch_sz
    lr = 0.00004
    reg = 0.01
    max_iter = 30  # make it 20 for relu
    mu = 0.9
    print('Hyperparameters: lr = {}\t reg = {}\t velocity = {}\t nb_batches = {}\t batch_size={}\t nb_epochs={}'
          .format(lr, reg, mu, n_batches, batch_sz, max_iter))
    print_period = 50

    # Weights
    W0 = np.random.randn(D, M) / np.sqrt(D)
    b0 = np.zeros(M)
    W1 = np.random.randn(M, K) / np.sqrt(M)
    b1 = np.zeros(K)

    # save initial weights
    W0_0 = W0.copy()
    b0_0 = b0.copy()
    W1_0 = W1.copy()
    b1_0 = b1.copy()

    # 1. Batch
    print('BATCH GRADIENT DESCENT')
    t0 = datetime.now()
    losses_batch = []
    errors_batch = []
    for epoch in range(max_iter):
        for batch_index in range(n_batches):
            X_train_batch = X_train[batch_index * batch_sz:(batch_index + 1) * batch_sz, ]
            T_train_batch = T_train[batch_index * batch_sz:(batch_index + 1) * batch_sz, ]
            A1, Y_train_batch = forward(X_train_batch, W0, b0, W1, b1)

            W1 -= lr * (J_derivative_W1(T_train_batch, Y_train_batch, A1) + reg * W1)
            b1 -= lr * (J_derivative_b1(T_train_batch, Y_train_batch) + reg * b1)
            W0 -= lr * (J_derivative_W0(T_train_batch, Y_train_batch, W1, A1, X_train_batch) + reg * W0)
            b0 -= lr * (J_derivative_b0(T_train_batch, Y_train_batch, W1, A1) + reg * b0)

            if batch_index % print_period == 0:
                _, Y_test = forward(X_test, W0, b0, W1, b1)
                j_test = J(T_test, Y_test)
                losses_batch.append(j_test)
                e = accuracy(predict(Y_test), t_test)
                errors_batch.append(e)
                print("Cost at iteration epoch={}, batch_index={}: {}\t Accuracy = {}"
                      .format(epoch, batch_index, round(j_test, 6), e))

    _, Y_test = forward(X_test, W0, b0, W1, b1)
    print("Final accuracy: {}\n".format(accuracy(predict(Y_test), t_test)))
    print("Elapsed time for batch GD: {}\n".format(datetime.now() - t0))

    # 2. Batch with momentum
    print('BATCH GRADIENT DESCENT WITH MOMENTUM')
    t0 = datetime.now()
    W0 = W0_0.copy()
    b0 = b0_0.copy()
    W1 = W1_0.copy()
    b1 = b1_0.copy()
    losses_momentum = []
    errors_momentum = []
    dW1 = 0
    db1 = 0
    dW0 = 0
    db0 = 0
    for epoch in range(max_iter):
        for batch_index in range(n_batches):
            X_train_batch = X_train[batch_index * batch_sz:(batch_index + 1) * batch_sz, ]
            T_train_batch = T_train[batch_index * batch_sz:(batch_index + 1) * batch_sz, ]
            A1, Y_train_batch = forward(X_train_batch, W0, b0, W1, b1)

            # gradients
            gW1 = J_derivative_W1(T_train_batch, Y_train_batch, A1) + reg * W1
            gb1 = J_derivative_b1(T_train_batch, Y_train_batch) + reg * b1
            gW0 = J_derivative_W0(T_train_batch, Y_train_batch, W1, A1, X_train_batch) + reg * W0
            gb0 = J_derivative_b0(T_train_batch, Y_train_batch, W1, A1) + reg * b0

            # update velocities
            dW1 = mu * dW1 - lr * gW1
            db1 = mu * db1 - lr * gb1
            dW0 = mu * dW0 - lr * gW0
            db0 = mu * db0 - lr * gb0

            # updates
            W1 += dW1
            b1 += db1
            W0 += dW0
            b0 += db0

            if batch_index % print_period == 0:
                _, Y_test = forward(X_test, W0, b0, W1, b1)
                j_test = J(T_test, Y_test)
                losses_momentum.append(j_test)
                e = accuracy(predict(Y_test), t_test)
                errors_momentum.append(e)
                print("Cost at iteration epoch={}, batch_index={}: {}\tAccuracy: {}"
                      .format(epoch, batch_index, round(j_test, 6), e))

    _, Y_test_final = forward(X_test, W0, b0, W1, b1)
    print("Final accuracy:", accuracy(predict(Y_test_final), t_test))
    print("Elapsed time for batch GD with Momentum: {}\n".format(datetime.now() - t0))

    # 3. Batch with Nesterov momentum
    print('BATCH GRADIENT DESCENT WITH NESTEROV MOMENTUM')
    t0 = datetime.now()
    W0 = W0_0.copy()
    b0 = b0_0.copy()
    W1 = W1_0.copy()
    b1 = b1_0.copy()
    losses_nesterov = []
    errors_nesterov = []
    vW1 = 0
    vb1 = 0
    vW0 = 0
    vb0 = 0
    for epoch in range(max_iter):
        for batch_index in range(n_batches):
            X_train_batch = X_train[batch_index * batch_sz:(batch_index + 1) * batch_sz, ]
            T_train_batch = T_train[batch_index * batch_sz:(batch_index + 1) * batch_sz, ]
            A1, Y_train_batch = forward(X_train_batch, W0, b0, W1, b1)

            # gradients
            gW1 = J_derivative_W1(T_train_batch, Y_train_batch, A1) + reg * W1
            gb1 = J_derivative_b1(T_train_batch, Y_train_batch) + reg * b1
            gW0 = J_derivative_W0(T_train_batch, Y_train_batch, W1, A1, X_train_batch) + reg * W0
            gb0 = J_derivative_b0(T_train_batch, Y_train_batch, W1, A1) + reg * b0

            # velocity update
            vW1 = mu * vW1 - lr * gW1
            vb1 = mu * vb1 - lr * gb1
            vW0 = mu * vW0 - lr * gW0
            vb0 = mu * vb0 - lr * gb0

            # parameter update (Nesterov look-ahead form)
            W1 += mu * vW1 - lr * gW1
            b1 += mu * vb1 - lr * gb1
            W0 += mu * vW0 - lr * gW0
            b0 += mu * vb0 - lr * gb0

            if batch_index % print_period == 0:
                _, Y_test = forward(X_test, W0, b0, W1, b1)
                j_test = J(T_test, Y_test)
                losses_nesterov.append(j_test)
                e = accuracy(predict(Y_test), t_test)
                errors_nesterov.append(e)
                print("Cost at iteration epoch={}, batch_index={}: {}\tAccuracy: {}"
                      .format(epoch, batch_index, round(j_test, 6), e))

    _, Y_test_final = forward(X_test, W0, b0, W1, b1)
    print("Final accuracy:", accuracy(predict(Y_test_final), t_test))
    print("Elapsed time for batch GD with Nesterov Momentum: {}\n".format(datetime.now() - t0))

    plt.plot(losses_batch, label="batch")
    plt.plot(losses_momentum, label="momentum")
    plt.plot(losses_nesterov, label="nesterov")
    plt.legend()
    plt.savefig('momentums_relu_activation.png')
    plt.show()
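# --- Illustration (not from the original file) ----------------------------------
# Compact, self-contained comparison of the three update rules used above, reduced
# to a single scalar parameter so the difference between the formulas is explicit.
# The toy cost (w - 3)^2, the step size and the iteration count are assumptions
# for the demo only.
import numpy as np


def grad(w):
    return 2 * (w - 3.0)  # gradient of (w - 3)^2


lr, mu = 0.1, 0.9

# plain gradient descent
w = 0.0
for _ in range(200):
    w -= lr * grad(w)

# standard momentum: accumulate a velocity, then step by it
w_m, v = 0.0, 0.0
for _ in range(200):
    v = mu * v - lr * grad(w_m)
    w_m += v

# Nesterov momentum (look-ahead form used above): step by mu*v - lr*g
w_n, v = 0.0, 0.0
for _ in range(200):
    g = grad(w_n)
    v = mu * v - lr * g
    w_n += mu * v - lr * g

print(round(w, 3), round(w_m, 3), round(w_n, 3))  # all three converge to 3.0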
from datetime import datetime

import numpy as np
import matplotlib.pyplot as plt

# Relies on get_normalized_data, T_indicator and the same two-layer-network
# helpers (forward, J, J_derivative_*, predict, accuracy) used by the other
# scripts in this project.


def main():
    max_iter = 10
    print_period = 10

    X_train, X_test, t_train, t_test = get_normalized_data()
    T_train = T_indicator(t_train)
    T_test = T_indicator(t_test)

    # Dimensionality
    N, D = X_train.shape
    M = 300
    K = 10
    batch_size = 500
    nb_batches = N // batch_size
    print('N:{}\t batch_size: {}\t nb_batches: {}\t D:{}\t M:{}\t K:{}'.format(
        N, batch_size, nb_batches, D, M, K))

    # Hyperparameters
    reg = 0.01
    lr0 = 0.001
    beta1 = 0.9
    beta2 = 0.999
    eps = 1e-8
    print('Hyperparameters: reg:{}\t lr0:{}\t beta1:{}\t beta2:{}\t eps:{}\n'.format(
        reg, lr0, beta1, beta2, eps))

    # Weights initialization
    W0_0 = np.random.randn(D, M) / np.sqrt(D)
    b0_0 = np.zeros(M)
    W1_0 = np.random.randn(M, K) / np.sqrt(M)
    b1_0 = np.zeros(K)

    W0 = W0_0.copy()
    b0 = b0_0.copy()
    W1 = W1_0.copy()
    b1 = b1_0.copy()

    # 1st moment
    mW0 = 0
    mb0 = 0
    mW1 = 0
    mb1 = 0

    # 2nd moment
    vW0 = 0
    vb0 = 0
    vW1 = 0
    vb1 = 0

    # 1. Adam
    t0 = datetime.now()
    J_adam = []
    accuracy_adam = []
    t = 1
    for epoch in range(max_iter):
        for batch_index in range(nb_batches):
            X_batch = X_train[batch_index * batch_size:(batch_index + 1) * batch_size, ]
            T_batch = T_train[batch_index * batch_size:(batch_index + 1) * batch_size, ]
            A_batch, Y_batch = forward(X_batch, W0, b0, W1, b1)

            # gradient updates
            gW1 = J_derivative_W1(T_batch, Y_batch, A_batch) + reg * W1
            gb1 = J_derivative_b1(T_batch, Y_batch) + reg * b1
            gW0 = J_derivative_W0(T_batch, Y_batch, W1, A_batch, X_batch) + reg * W0
            gb0 = J_derivative_b0(T_batch, Y_batch, W1, A_batch) + reg * b0

            # 1st moment updates
            mW1 = beta1 * mW1 + (1 - beta1) * gW1
            mb1 = beta1 * mb1 + (1 - beta1) * gb1
            mW0 = beta1 * mW0 + (1 - beta1) * gW0
            mb0 = beta1 * mb0 + (1 - beta1) * gb0

            # 2nd moment updates
            vW1 = beta2 * vW1 + (1 - beta2) * gW1 * gW1
            vb1 = beta2 * vb1 + (1 - beta2) * gb1 * gb1
            vW0 = beta2 * vW0 + (1 - beta2) * gW0 * gW0
            vb0 = beta2 * vb0 + (1 - beta2) * gb0 * gb0

            # bias corrections
            corr1 = 1 - beta1 ** t
            corr2 = 1 - beta2 ** t
            mW1_c = mW1 / corr1
            mb1_c = mb1 / corr1
            mW0_c = mW0 / corr1
            mb0_c = mb0 / corr1
            vW1_c = vW1 / corr2
            vb1_c = vb1 / corr2
            vW0_c = vW0 / corr2
            vb0_c = vb0 / corr2

            # t update
            t += 1

            # gradient descent
            W1 -= lr0 * mW1_c / np.sqrt(vW1_c + eps)
            b1 -= lr0 * mb1_c / np.sqrt(vb1_c + eps)
            W0 -= lr0 * mW0_c / np.sqrt(vW0_c + eps)
            b0 -= lr0 * mb0_c / np.sqrt(vb0_c + eps)

            if (batch_index % print_period) == 0:
                _, Y_validation = forward(X_test, W0, b0, W1, b1)
                j = J(T_test, Y_validation)
                J_adam.append(j)
                acc = accuracy(predict(Y_validation), t_test)
                accuracy_adam.append(acc)
                print('Epoch {}\t batch_index {}\t iteration {}: cost {}\t accuracy: {}'.format(
                    epoch, batch_index, epoch * nb_batches + batch_index, j, acc))

    _, Y_final_test = forward(X_test, W0, b0, W1, b1)
    print('Final accuracy with Adam: {}'.format(accuracy(predict(Y_final_test), t_test)))
    print('Execution time with Adam: {}\n'.format(datetime.now() - t0))

    # 2. RMSProp with momentum
    W0 = W0_0.copy()
    b0 = b0_0.copy()
    W1 = W1_0.copy()
    b1 = b1_0.copy()

    decay_rate = 0.999
    mu = 0.9
    cW1 = 1
    cb1 = 1
    cW0 = 1
    cb0 = 1
    vW1 = 0
    vb1 = 0
    vW0 = 0
    vb0 = 0

    t0 = datetime.now()
    J_rmsprop_momentum = []
    accuracy_rmsprop_momentum = []
    for epoch in range(max_iter):
        for batch_index in range(nb_batches):
            X_batch = X_train[batch_index * batch_size:(batch_index + 1) * batch_size, ]
            T_batch = T_train[batch_index * batch_size:(batch_index + 1) * batch_size, ]
            A_batch, Y_batch = forward(X_batch, W0, b0, W1, b1)

            # gradient updates
            gW1 = J_derivative_W1(T_batch, Y_batch, A_batch) + reg * W1
            gb1 = J_derivative_b1(T_batch, Y_batch) + reg * b1
            gW0 = J_derivative_W0(T_batch, Y_batch, W1, A_batch, X_batch) + reg * W0
            gb0 = J_derivative_b0(T_batch, Y_batch, W1, A_batch) + reg * b0

            # cache updates
            cW1 = decay_rate * cW1 + (1 - decay_rate) * gW1 * gW1
            cb1 = decay_rate * cb1 + (1 - decay_rate) * gb1 * gb1
            cW0 = decay_rate * cW0 + (1 - decay_rate) * gW0 * gW0
            cb0 = decay_rate * cb0 + (1 - decay_rate) * gb0 * gb0

            # momentum updates
            vW1 = mu * vW1 + (1 - mu) * lr0 * gW1 / (np.sqrt(cW1) + eps)
            vb1 = mu * vb1 + (1 - mu) * lr0 * gb1 / (np.sqrt(cb1) + eps)
            vW0 = mu * vW0 + (1 - mu) * lr0 * gW0 / (np.sqrt(cW0) + eps)
            vb0 = mu * vb0 + (1 - mu) * lr0 * gb0 / (np.sqrt(cb0) + eps)

            # gradient descent
            W1 -= vW1
            b1 -= vb1
            W0 -= vW0
            b0 -= vb0

            if (batch_index % print_period) == 0:
                _, Y_validation = forward(X_test, W0, b0, W1, b1)
                j = J(T_test, Y_validation)
                J_rmsprop_momentum.append(j)
                acc = accuracy(predict(Y_validation), t_test)
                accuracy_rmsprop_momentum.append(acc)
                print('Epoch {}\t batch_index {}\t iteration {}: cost {}\t accuracy: {}'.format(
                    epoch, batch_index, epoch * nb_batches + batch_index, j, acc))

    _, Y_final_test = forward(X_test, W0, b0, W1, b1)
    print('Final accuracy with RMSProp with momentum: {}'.format(
        accuracy(predict(Y_final_test), t_test)))
    print('Execution time with RMSProp with momentum: {}\n'.format(datetime.now() - t0))

    plt.plot(J_adam, label='adam')
    plt.plot(J_rmsprop_momentum, label='rmsprop with momentum')
    plt.legend()
    plt.show()
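# --- Illustration (not from the original file) ----------------------------------
# Stand-alone sketch of the Adam update with bias correction used above, reduced
# to a single scalar parameter. The toy cost (w - 3)^2 and the iteration count are
# assumptions for the demo only; the hyperparameters mirror the script above.
import numpy as np


def grad(w):
    return 2 * (w - 3.0)  # gradient of (w - 3)^2


lr0, beta1, beta2, eps = 0.1, 0.9, 0.999, 1e-8
w, m, v = 0.0, 0.0, 0.0
for t in range(1, 1001):
    g = grad(w)
    m = beta1 * m + (1 - beta1) * g        # 1st moment: running mean of gradients
    v = beta2 * v + (1 - beta2) * g * g    # 2nd moment: running mean of squared gradients
    m_hat = m / (1 - beta1 ** t)           # bias corrections
    v_hat = v / (1 - beta2 ** t)
    w -= lr0 * m_hat / np.sqrt(v_hat + eps)

print(round(w, 3))  # approaches the minimizer 3.0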