import numpy as np
import matplotlib.pyplot as plt
import theano
import theano.tensor as T
from LP_util import get_normalized_data, error_rate, y2indicator


def main():
    Xtrain, Xtest, Ttrain, Ttest = get_normalized_data()
    Ttrain, Ttest = y2indicator(Ttrain), y2indicator(Ttest)

    lr = .00004
    reg = .01
    max_iter = 20
    print_period = 10

    N, D = Xtrain.shape
    batch_sz = 500
    n_batches = N // batch_sz

    M = 300  # hidden units
    K = 10   # output classes

    W1_init = np.random.randn(D, M) / np.sqrt(D)
    b1_init = np.zeros(M)
    W2_init = np.random.randn(M, K) / np.sqrt(M)
    b2_init = np.zeros(K)

    thX = T.matrix('X')  # placeholders
    thT = T.matrix('T')
    W1 = theano.shared(W1_init, 'W1')  # updateable
    b1 = theano.shared(b1_init, 'b1')
    W2 = theano.shared(W2_init, 'W2')
    b2 = theano.shared(b2_init, 'b2')

    thZ = T.nnet.relu(thX.dot(W1) + b1)  # expressions for the forward pass
    thY = T.nnet.softmax(thZ.dot(W2) + b2)

    # cost function that will be differentiated
    cost = -(thT*T.log(thY)).sum() + reg*((W1**2).sum() + (b1**2).sum() +
                                          (W2**2).sum() + (b2**2).sum())
    prediction = T.argmax(thY, axis=1)

    update_W1 = W1 - lr*T.grad(cost, W1)  # derivative of cost w.r.t. W1
    update_b1 = b1 - lr*T.grad(cost, b1)
    update_W2 = W2 - lr*T.grad(cost, W2)
    update_b2 = b2 - lr*T.grad(cost, b2)

    train = theano.function(
        inputs=[thX, thT],
        updates=[[W1, update_W1], [b1, update_b1],
                 [W2, update_W2], [b2, update_b2]]
    )
    get_prediction = theano.function(
        inputs=[thX, thT],
        outputs=[cost, prediction]
    )

    LL = []
    for i in range(max_iter):
        for j in range(n_batches):
            Xbatch = Xtrain[j*batch_sz:(j*batch_sz + batch_sz)]
            Tbatch = Ttrain[j*batch_sz:(j*batch_sz + batch_sz)]

            train(Xbatch, Tbatch)
            if j % print_period == 0:
                cost_val, Ptest = get_prediction(Xtest, Ttest)
                err = error_rate(Ptest, np.argmax(Ttest, axis=1))
                print("cost / err at iteration i=%d, j=%d: %.3f / %.3f" %
                      (i, j, cost_val, err))
                LL.append(cost_val)

    cost_val, Ptest = get_prediction(Xtest, Ttest)
    print("final error rate:", error_rate(Ptest, np.argmax(Ttest, axis=1)))
    plt.plot(LL)
    plt.show()


if __name__ == '__main__':
    main()
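Since every parameter above gets its own hand-written update expression, the same SGD updates can be built more compactly by looping over a parameter list. The sketch below shows that idiom only; the names params, grads and updates are illustrative, and cost, lr, thX, thT and the shared variables are assumed to be defined exactly as in main() above.

# a minimal sketch of the loop-over-parameters idiom, assuming the
# definitions from main() are in scope
params = [W1, b1, W2, b2]
grads = T.grad(cost, params)                              # one gradient per shared variable
updates = [(p, p - lr*g) for p, g in zip(params, grads)]  # (variable, new value) pairs

train = theano.function(inputs=[thX, thT], updates=updates)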
import numpy as np
import matplotlib.pyplot as plt
from LP_mlp import forward, derivative_w1, derivative_w2, derivative_b1, derivative_b2
from LP_util import get_normalized_data, error_rate, cost, y2indicator
from sklearn.utils import shuffle

max_iter = 20
print_period = 10

Xtrain, Xtest, Ttrain, Ttest = get_normalized_data()
lr = .00004
reg = .01

Ttrain = y2indicator(Ttrain)
Ttest = y2indicator(Ttest)

N, D = Xtrain.shape
batch_sz = 500
n_batches = N // batch_sz

M = 300  # hidden units, around the number of components according to PCA
K = Ttrain.shape[1]

W1_0 = np.random.randn(D, M) / np.sqrt(D)
b1_0 = np.zeros(M)
W2_0 = np.random.randn(M, K) / np.sqrt(M)
b2_0 = np.random.randn(K)

# batch gradient descent (no momentum)
W1 = W1_0.copy()
b1 = b1_0.copy()
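The listing above breaks off after copying the first layer's initial weights. What follows is a minimal sketch of the plain mini-batch loop (no momentum) it sets up, mirroring the constant-learning-rate run that appears later in this section; it assumes the variables and LP_mlp/LP_util helpers defined above, and LL_batch is just an illustrative name.

# sketch of the remaining plain batch gradient descent loop
W2 = W2_0.copy()
b2 = b2_0.copy()

LL_batch = []
for i in range(max_iter):
    for j in range(n_batches):
        Xbatch = Xtrain[j*batch_sz:(j*batch_sz + batch_sz)]
        Tbatch = Ttrain[j*batch_sz:(j*batch_sz + batch_sz)]
        Ybatch, Z = forward(Xbatch, W1, b1, W2, b2)

        # vanilla updates: learning rate times (gradient + L2 penalty)
        W2 -= lr*(derivative_w2(Z, Tbatch, Ybatch) + reg*W2)
        b2 -= lr*(derivative_b2(Tbatch, Ybatch) + reg*b2)
        W1 -= lr*(derivative_w1(Xbatch, Z, Tbatch, Ybatch, W2) + reg*W1)
        b1 -= lr*(derivative_b1(Z, Tbatch, Ybatch, W2) + reg*b1)

        if j % print_period == 0:
            Y, _ = forward(Xtest, W1, b1, W2, b2)
            ll = cost(Y, Ttest)
            LL_batch.append(ll)
            print('cost at iteration i=%d, j=%d %.6f' % (i, j, ll))

Y, _ = forward(Xtest, W1, b1, W2, b2)
print('final error rate (batch):', error_rate(Y, np.argmax(Ttest, axis=1)))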
import matplotlib.pyplot as plt
import tensorflow as tf
from LP_util import get_normalized_data, y2indicator


def main():
    Xtrain, Xtest, Ttrain_label, Ttest_label = get_normalized_data()
    Ttrain, Ttest = y2indicator(Ttrain_label), y2indicator(Ttest_label)

    lr = .0004
    # reg = .01
    max_iter = 20
    print_period = 10

    N, D = Xtrain.shape
    batch_sz = 500
    n_batches = N // batch_sz

    M1 = 300  # hidden units
    M2 = 100
    K = 10    # output classes

    def init_weights(shape):
        return tf.Variable(tf.random_normal(shape, stddev=0.1))

    def forward(X, W1, b1, W2, b2, W3, b3):
        Z1 = tf.nn.relu(tf.matmul(X, W1) + b1)
        Z2 = tf.nn.relu(tf.matmul(Z1, W2) + b2)
        return tf.matmul(Z2, W3) + b3  # return logits; softmax is applied inside the cost

    def error_rate(P, T):
        return (P != T).mean()

    tfX = tf.placeholder(tf.float32, [None, D])
    tfT = tf.placeholder(tf.float32, [None, K])

    W1 = init_weights([D, M1])
    b1 = init_weights([M1])
    W2 = init_weights([M1, M2])
    b2 = init_weights([M2])
    W3 = init_weights([M2, K])
    b3 = init_weights([K])

    tfY = forward(tfX, W1, b1, W2, b2, W3, b3)

    cost = tf.reduce_mean(
        tf.nn.softmax_cross_entropy_with_logits_v2(labels=tfT, logits=tfY))
    predict_op = tf.argmax(tfY, axis=1)

    # train_op = tf.train.GradientDescentOptimizer(lr).minimize(cost)
    train_op = tf.train.RMSPropOptimizer(lr, decay=.99, momentum=.9).minimize(cost)

    init = tf.global_variables_initializer()
    sess = tf.Session()
    sess.run(init)

    LL = []
    for i in range(max_iter):
        for j in range(n_batches):
            Xbatch = Xtrain[j*batch_sz:(j*batch_sz + batch_sz)]
            Tbatch = Ttrain[j*batch_sz:(j*batch_sz + batch_sz)]

            sess.run(train_op, feed_dict={tfX: Xbatch, tfT: Tbatch})
            if j % print_period == 0:
                Ptest = sess.run(predict_op, feed_dict={tfX: Xtest})
                err = error_rate(Ptest, Ttest_label)
                c = sess.run(cost, feed_dict={tfX: Xtest, tfT: Ttest})
                print("cost / err at iteration i=%d, j=%d: %.3f / %.3f" %
                      (i, j, c, err))
                LL.append(c)

    Ptest = sess.run(predict_op, feed_dict={tfX: Xtest})
    print("final error rate:", error_rate(Ptest, Ttest_label))
    plt.plot(LL)
    plt.show()


if __name__ == '__main__':
    main()
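This listing uses the TensorFlow 1.x graph API (tf.placeholder, tf.Session, tf.train.RMSPropOptimizer). If only TensorFlow 2.x is installed, one way to run it unchanged is to swap the import for the v1 compatibility module; this is a sketch under that assumption, not part of the original script.

# assumes a TensorFlow 2.x install: route the 1.x-style graph code above
# through the compatibility layer
import tensorflow.compat.v1 as tf
tf.disable_eager_execution()  # restore graph mode so Session/placeholder work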
import numpy as np
import matplotlib.pyplot as plt
from LP_mlp import forward, derivative_w1, derivative_w2, derivative_b1, derivative_b2
from LP_util import get_normalized_data, error_rate, cost, y2indicator


def main():
    max_iter = 20
    print_period = 10

    Xtrain, Xtest, Ttrain, Ttest = get_normalized_data()
    reg = .01
    Ttrain, Ttest = y2indicator(Ttrain), y2indicator(Ttest)

    N, D = Xtrain.shape
    batch_sz = 500
    n_batches = N // batch_sz

    M = 300  # hidden units
    K = 10   # output classes

    # use the same initial weights for Adam and RMSprop+momentum
    W1_0 = np.random.randn(D, M) / np.sqrt(D)
    b1_0 = np.zeros(M)
    W2_0 = np.random.randn(M, K) / np.sqrt(M)
    b2_0 = np.zeros(K)

    # initial learning rate and decay constants
    lr0 = .001    # don't go too high, or NaNs will result
    beta1 = .9    # decay constants for the first and second moment
    beta2 = .999
    eps = 1e-8

    # 1. Adam
    W1 = W1_0.copy()
    b1 = b1_0.copy()
    W2 = W2_0.copy()
    b2 = b2_0.copy()

    # 1st moment
    mW1 = 0
    mb1 = 0
    mW2 = 0
    mb2 = 0

    # 2nd moment
    vW1 = 0
    vb1 = 0
    vW2 = 0
    vb2 = 0

    loss_adam = []
    err_adam = []
    t = 1  # start at 1 by convention; otherwise the first correction is 0 (division by zero)
    for i in range(max_iter):
        for j in range(n_batches):
            Xbatch = Xtrain[j*batch_sz:(j*batch_sz + batch_sz)]
            Tbatch = Ttrain[j*batch_sz:(j*batch_sz + batch_sz)]
            Ybatch, Z = forward(Xbatch, W1, b1, W2, b2)

            # gradients
            gW2 = derivative_w2(Z, Tbatch, Ybatch) + reg*W2
            gb2 = derivative_b2(Tbatch, Ybatch) + reg*b2
            gW1 = derivative_w1(Xbatch, Z, Tbatch, Ybatch, W2) + reg*W1
            gb1 = derivative_b1(Z, Tbatch, Ybatch, W2) + reg*b1

            # new first moment (m):
            # exponentially smoothed average of the gradients
            mW1 = beta1*mW1 + (1 - beta1)*gW1  # add a fraction of the gradient
            mb1 = beta1*mb1 + (1 - beta1)*gb1
            mW2 = beta1*mW2 + (1 - beta1)*gW2
            mb2 = beta1*mb2 + (1 - beta1)*gb2

            # new second moment (v):
            # exponentially smoothed average of the squared gradients
            vW1 = beta2*vW1 + (1 - beta2)*gW1**2  # add a fraction of the squared gradient
            vb1 = beta2*vb1 + (1 - beta2)*gb1**2
            vW2 = beta2*vW2 + (1 - beta2)*gW2**2
            vb2 = beta2*vb2 + (1 - beta2)*gb2**2

            # bias correction
            correction1 = 1 - beta1**t
            hat_mW1 = mW1/correction1  # boost the first updates (against the bias toward 0)
            hat_mb1 = mb1/correction1
            hat_mW2 = mW2/correction1
            hat_mb2 = mb2/correction1

            correction2 = 1 - beta2**t
            hat_vW1 = vW1/correction2
            hat_vb1 = vb1/correction2
            hat_vW2 = vW2/correction2
            hat_vb2 = vb2/correction2

            t += 1  # update t (once per learning step)

            # update weights
            W1 -= lr0 * hat_mW1 / (np.sqrt(hat_vW1) + eps)
            b1 -= lr0 * hat_mb1 / (np.sqrt(hat_vb1) + eps)
            W2 -= lr0 * hat_mW2 / (np.sqrt(hat_vW2) + eps)
            b2 -= lr0 * hat_mb2 / (np.sqrt(hat_vb2) + eps)

            if j % print_period == 0:
                # calculate the log-likelihood cost on the test set
                Y, _ = forward(Xtest, W1, b1, W2, b2)
                ll = cost(Y, Ttest)
                loss_adam.append(ll)
                print('cost at iteration i=%d, j=%d %.6f' % (i, j, ll))

                err = error_rate(Y, np.argmax(Ttest, axis=1))
                err_adam.append(err)
                print('error rate:', err)

    Y, _ = forward(Xtest, W1, b1, W2, b2)
    print('final error rate (adam):', error_rate(Y, np.argmax(Ttest, axis=1)))
    # 2. RMSprop + momentum
    W1 = W1_0.copy()
    b1 = b1_0.copy()
    W2 = W2_0.copy()
    b2 = b2_0.copy()

    cache_W1 = 1  # start the cache at 1 so the updates are more comparable with Adam's
    cache_b1 = 1
    cache_W2 = 1
    cache_b2 = 1

    dW1 = 0
    db1 = 0
    dW2 = 0
    db2 = 0

    loss_rmsmom = []
    err_rmsmom = []
    for i in range(max_iter):
        for j in range(n_batches):
            Xbatch = Xtrain[j*batch_sz:(j*batch_sz + batch_sz)]
            Tbatch = Ttrain[j*batch_sz:(j*batch_sz + batch_sz)]
            Ybatch, Z = forward(Xbatch, W1, b1, W2, b2)

            # gradients
            gW2 = derivative_w2(Z, Tbatch, Ybatch) + reg*W2
            gb2 = derivative_b2(Tbatch, Ybatch) + reg*b2
            gW1 = derivative_w1(Xbatch, Z, Tbatch, Ybatch, W2) + reg*W1
            gb1 = derivative_b1(Z, Tbatch, Ybatch, W2) + reg*b1

            # RMSprop cache (less learning as more learning occurs)
            cache_W1 = beta2*cache_W1 + (1 - beta2)*gW1*gW1
            cache_b1 = beta2*cache_b1 + (1 - beta2)*gb1*gb1
            cache_W2 = beta2*cache_W2 + (1 - beta2)*gW2*gW2
            cache_b2 = beta2*cache_b2 + (1 - beta2)*gb2*gb2

            # velocity term (momentum)
            # dW1 = dW1*beta1 + lr0*gW1  # this is the plain velocity update
            # db1 = db1*beta1 + lr0*gb1
            # dW2 = dW2*beta1 + lr0*gW2
            # db2 = db2*beta1 + lr0*gb2
            # The (1 - beta1) factor below (discussed in the Adam lecture) makes
            # the comparison with Adam fairer: the velocity is only topped up
            # with a fraction of the new step.
            # Add a fraction of the gradient, divided by the learning-history cache:
            dW1 = dW1*beta1 + (1 - beta1)*lr0*gW1/(np.sqrt(cache_W1) + eps)
            db1 = db1*beta1 + (1 - beta1)*lr0*gb1/(np.sqrt(cache_b1) + eps)
            dW2 = dW2*beta1 + (1 - beta1)*lr0*gW2/(np.sqrt(cache_W2) + eps)
            db2 = db2*beta1 + (1 - beta1)*lr0*gb2/(np.sqrt(cache_b2) + eps)

            # subtract the accumulated velocity
            W2 -= dW2
            b2 -= db2
            W1 -= dW1
            b1 -= db1

            if j % print_period == 0:
                # calculate the log-likelihood cost on the test set
                Y, _ = forward(Xtest, W1, b1, W2, b2)
                ll = cost(Y, Ttest)
                loss_rmsmom.append(ll)
                print('cost at iteration i=%d, j=%d %.6f' % (i, j, ll))

                err = error_rate(Y, np.argmax(Ttest, axis=1))
                err_rmsmom.append(err)
                print('error rate:', err)

    Y, _ = forward(Xtest, W1, b1, W2, b2)
    print('final error rate (RMSprop+mom):', error_rate(Y, np.argmax(Ttest, axis=1)))

    plt.plot(loss_adam, label='adam')
    plt.plot(loss_rmsmom, label='RMSprop+momentum')
    plt.legend()
    plt.show()


if __name__ == '__main__':
    main()
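The Adam section above repeats the same moment bookkeeping for every parameter. As a compact reference, here is a minimal, self-contained sketch of that update factored into a helper; the names adam_step and state are illustrative and not part of the original script.

import numpy as np

def adam_step(param, grad, state, lr=1e-3, beta1=0.9, beta2=0.999, eps=1e-8):
    """Apply one Adam update to a numpy parameter in place; return new state."""
    m = beta1*state['m'] + (1 - beta1)*grad       # first moment (smoothed gradient)
    v = beta2*state['v'] + (1 - beta2)*grad*grad  # second moment (smoothed squared gradient)
    t = state['t'] + 1
    m_hat = m / (1 - beta1**t)                    # bias correction
    v_hat = v / (1 - beta2**t)
    param -= lr * m_hat / (np.sqrt(v_hat) + eps)
    return {'m': m, 'v': v, 't': t}

# usage sketch: state_W1 = {'m': 0, 'v': 0, 't': 0}
#               state_W1 = adam_step(W1, gW1, state_W1, lr=lr0)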
import numpy as np
import matplotlib.pyplot as plt
from LP_mlp import forward, derivative_w1, derivative_w2, derivative_b1, derivative_b2
from LP_util import get_normalized_data, error_rate, cost, y2indicator


def main():
    max_iter = 20
    print_period = 10

    Xtrain, Xtest, Ttrain, Ttest = get_normalized_data()
    reg = .01
    lr = .0001
    Ttrain, Ttest = y2indicator(Ttrain), y2indicator(Ttest)

    N, D = Xtrain.shape
    batch_sz = 500
    n_batches = N // batch_sz

    M = 300  # hidden units
    K = 10   # output classes

    # 1. constant learning rate
    W1 = np.random.randn(D, M) / np.sqrt(D)
    b1 = np.zeros(M)
    W2 = np.random.randn(M, K) / np.sqrt(M)
    b2 = np.zeros(K)

    LL_batch = []
    CR_batch = []
    for i in range(max_iter):
        for j in range(n_batches):
            Xbatch = Xtrain[j*batch_sz:(j*batch_sz + batch_sz)]
            Tbatch = Ttrain[j*batch_sz:(j*batch_sz + batch_sz)]
            Ybatch, Z = forward(Xbatch, W1, b1, W2, b2)

            W2 -= lr*(derivative_w2(Z, Tbatch, Ybatch) + reg*W2)
            b2 -= lr*(derivative_b2(Tbatch, Ybatch) + reg*b2)
            W1 -= lr*(derivative_w1(Xbatch, Z, Tbatch, Ybatch, W2) + reg*W1)
            b1 -= lr*(derivative_b1(Z, Tbatch, Ybatch, W2) + reg*b1)

            if j % print_period == 0:
                # calculate the log-likelihood cost on the test set
                Y, _ = forward(Xtest, W1, b1, W2, b2)
                ll = cost(Y, Ttest)
                LL_batch.append(ll)
                print('cost at iteration i=%d, j=%d %.6f' % (i, j, ll))

                err = error_rate(Y, np.argmax(Ttest, axis=1))
                CR_batch.append(err)
                print('error rate:', err)

    Y, _ = forward(Xtest, W1, b1, W2, b2)
    print('final error rate (batch):', error_rate(Y, np.argmax(Ttest, axis=1)))

    # 2. RMSprop
    W1 = np.random.randn(D, M) / np.sqrt(D)
    b1 = np.zeros(M)
    W2 = np.random.randn(M, K) / np.sqrt(M)
    b2 = np.zeros(K)

    # initial learning rate and decay
    lr0 = .0005  # don't go too high, or NaNs will result
    decay_rate = .999
    eps = 1e-6  # epsilon (small constant, see notes)
    cache_W1 = 0
    cache_b1 = 0
    cache_W2 = 0
    cache_b2 = 0

    LL_rms = []
    CR_rms = []
    for i in range(max_iter):
        for j in range(n_batches):
            Xbatch = Xtrain[j*batch_sz:(j*batch_sz + batch_sz)]
            Tbatch = Ttrain[j*batch_sz:(j*batch_sz + batch_sz)]
            Ybatch, Z = forward(Xbatch, W1, b1, W2, b2)

            # updates
            gW2 = derivative_w2(Z, Tbatch, Ybatch) + reg*W2  # gradient
            # replace a fraction of the cache with the new squared gradient
            cache_W2 = decay_rate*cache_W2 + (1 - decay_rate)*gW2*gW2
            # divide the gradient by the square root of the cache
            # (greater learning history -> less new learning)
            W2 -= lr0*gW2 / (np.sqrt(cache_W2) + eps)

            gb2 = derivative_b2(Tbatch, Ybatch) + reg*b2
            cache_b2 = decay_rate*cache_b2 + (1 - decay_rate)*gb2*gb2
            b2 -= lr0*gb2 / (np.sqrt(cache_b2) + eps)

            gW1 = derivative_w1(Xbatch, Z, Tbatch, Ybatch, W2) + reg*W1
            cache_W1 = decay_rate*cache_W1 + (1 - decay_rate)*gW1*gW1
            W1 -= lr0*gW1 / (np.sqrt(cache_W1) + eps)

            gb1 = derivative_b1(Z, Tbatch, Ybatch, W2) + reg*b1
            cache_b1 = decay_rate*cache_b1 + (1 - decay_rate)*gb1*gb1
            b1 -= lr0*gb1 / (np.sqrt(cache_b1) + eps)

            if j % print_period == 0:
                # calculate the log-likelihood cost on the test set
                Y, _ = forward(Xtest, W1, b1, W2, b2)
                ll = cost(Y, Ttest)
                LL_rms.append(ll)
                print('cost at iteration i=%d, j=%d %.6f' % (i, j, ll))

                err = error_rate(Y, np.argmax(Ttest, axis=1))
                CR_rms.append(err)
                print('error rate:', err)

    Y, _ = forward(Xtest, W1, b1, W2, b2)
    print('final error rate (RMSprop):', error_rate(Y, np.argmax(Ttest, axis=1)))

    plt.plot(LL_batch, label='batch')
    plt.plot(LL_rms, label='RMSprop')
    plt.legend()
    plt.show()


if __name__ == '__main__':
    main()
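Both runs above also record the test error rates (CR_batch and CR_rms), but only the cost curves are plotted. An optional follow-up, to be placed inside main() right after the existing cost plot, compares the error-rate curves as well:

    # compare test error rates (goes inside main(), after the cost plot)
    plt.plot(CR_batch, label='batch')
    plt.plot(CR_rms, label='RMSprop')
    plt.legend()
    plt.title('test error rate')
    plt.show()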