# shared imports assumed by the listings in this section; each main() below is
# otherwise a standalone script. helper functions (get_transformed_data,
# y2indicator, forward, cost, error_rate, and the gradient functions) are
# assumed to come from an accompanying utility module -- sketches of the
# inferable ones follow selected listings below.
from datetime import datetime

import numpy as np
import matplotlib.pyplot as plt
from sklearn.utils import shuffle

import theano
import theano.tensor as T


def main():
    # step 1: get the data and define all the usual variables
    Xtrain, Xtest, Ytrain, Ytest = get_transformed_data()
    Ytrain_ind = y2indicator(Ytrain)
    Ytest_ind = y2indicator(Ytest)

    N, D = Xtrain.shape
    M = 300
    K = len(set(Ytrain))
    W1_init = np.random.randn(D, M) / np.sqrt(D)
    b1_init = np.zeros(M)
    W2_init = np.random.randn(M, K) / np.sqrt(M)
    b2_init = np.zeros(K)
    lr = 0.00004
    reg = 0.01
    batch_sz = 500
    n_batches = N // batch_sz
    epochs = 20

    # step 2: define theano variables and expressions
    thX = T.matrix('X')
    thT = T.matrix('T')
    W1 = theano.shared(W1_init, 'W1')
    b1 = theano.shared(b1_init, 'b1')
    W2 = theano.shared(W2_init, 'W2')
    b2 = theano.shared(b2_init, 'b2')

    # we can use the built-in theano functions to do relu and softmax
    thZ = T.nnet.relu(thX.dot(W1) + b1)
    thpY = T.nnet.softmax(thZ.dot(W2) + b2)

    # define the cost function and prediction
    cost = -(thT * T.log(thpY)).sum() + reg * ((W1 * W1).sum() + (b1 * b1).sum() + (W2 * W2).sum() + (b2 * b2).sum())
    prediction = T.argmax(thpY, axis=1)

    # step 3: training expressions and functions
    update_W1 = W1 - lr * T.grad(cost, W1)
    update_b1 = b1 - lr * T.grad(cost, b1)
    update_W2 = W2 - lr * T.grad(cost, W2)
    update_b2 = b2 - lr * T.grad(cost, b2)

    train = theano.function(
        inputs=[thX, thT],
        updates=[(W1, update_W1), (b1, update_b1), (W2, update_W2), (b2, update_b2)],
    )

    # create another function for this because we want the cost and
    # prediction over the whole test set
    get_prediction = theano.function(
        inputs=[thX, thT],
        outputs=[cost, prediction],
    )

    costs_batch = []
    for i in range(epochs):
        tmpX, tmpY = shuffle(Xtrain, Ytrain_ind)
        for j in range(n_batches):
            x = tmpX[j * batch_sz:(j * batch_sz + batch_sz), :]
            y = tmpY[j * batch_sz:(j * batch_sz + batch_sz), :]

            train(x, y)
            if j % 10 == 0:
                cost_val, prediction_val = get_prediction(Xtest, Ytest_ind)
                e = error_rate(prediction_val, Ytest)
                print("Cost / err at iteration i=%d, j=%d: %.3f / %.3f" % (i, j, cost_val, e))
                costs_batch.append(cost_val)

    plt.plot(costs_batch)
    plt.show()
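All of these listings call a few helpers that are defined elsewhere. A minimal sketch of the two whose behavior is fully determined by how they are called here; get_transformed_data is assumed to return normalized train/test splits and is not reconstructed:

def y2indicator(y):
    # turn an (N,) vector of integer class labels into an (N, K) one-hot matrix
    N = len(y)
    K = len(set(y))
    ind = np.zeros((N, K))
    ind[np.arange(N), y.astype(np.int32)] = 1
    return ind

def error_rate(p, t):
    # fraction of predictions that disagree with the targets.
    # note: some listings below pass the softmax output matrix instead of
    # hard predictions, which assumes a variant that argmaxes first:
    #     np.mean(np.argmax(p, axis=1) != t)
    return np.mean(p != t)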
def main():
    Xtrain, Xtest, Ytrain, Ytest = get_transformed_data()
    print("Performing logistic regression...")

    N, D = Xtrain.shape
    K = len(set(Ytrain))
    Ytrain_ind = y2indicator(Ytrain)
    Ytest_ind = y2indicator(Ytest)

    # 1. full gradient descent
    W = np.random.randn(D, K) / np.sqrt(D + K)
    b = np.zeros(K)
    costs = []
    lr = 0.0001
    reg = 0.01
    epochs = 50
    t0 = datetime.now()
    for t in range(epochs):
        pY = forward(Xtrain, W, b)

        W -= lr * (gradW(Xtrain, pY, Ytrain_ind) + reg * W)
        b -= lr * (gradb(pY, Ytrain_ind) + reg * b)

        pY_test = forward(Xtest, W, b)
        c = cost(pY_test, Ytest_ind)
        costs.append(c)
        e = error_rate(pY_test, Ytest)
        if t % 10 == 0:
            print("Cost at iteration %d: %.6f" % (t, c))
            print("Error rate:", e)
    print("Elapsed time for full GD:", datetime.now() - t0)
    print("\n")

    # 2. stochastic gradient descent
    W = np.random.randn(D, K) / np.sqrt(D + K)
    b = np.zeros(K)
    costs_stochastic = []
    lr = 0.0001
    reg = 0.01
    epochs = 50
    t0 = datetime.now()
    for t in range(epochs):
        tmpX, tmpY = shuffle(Xtrain, Ytrain_ind)
        for n in range(min(N, 500)):  # shortcut so it won't take so long...
            x = tmpX[n, :].reshape(1, D)
            y = tmpY[n, :].reshape(1, K)
            pY = forward(x, W, b)

            W -= lr * (gradW(x, pY, y) + reg * W)
            b -= lr * (gradb(pY, y) + reg * b)

        pY_test = forward(Xtest, W, b)
        c = cost(pY_test, Ytest_ind)
        costs_stochastic.append(c)
        e = error_rate(pY_test, Ytest)
        if t % 10 == 0:
            print("Cost at iteration %d: %.6f" % (t, c))
            print("Error rate:", e)
    print("Elapsed time for SGD:", datetime.now() - t0)
    print("\n")

    # 3. mini-batch gradient descent
    W = np.random.randn(D, K) / np.sqrt(D + K)
    b = np.zeros(K)
    costs_batch = []
    lr = 0.0001
    reg = 0.01
    batch_sz = 500
    n_batches = N // batch_sz
    epochs = 50
    t0 = datetime.now()
    for t in range(epochs):
        tmpX, tmpY = shuffle(Xtrain, Ytrain_ind)
        for j in range(n_batches):
            x = tmpX[j * batch_sz:(j * batch_sz + batch_sz), :]
            y = tmpY[j * batch_sz:(j * batch_sz + batch_sz), :]
            pY = forward(x, W, b)

            W -= lr * (gradW(x, pY, y) + reg * W)
            b -= lr * (gradb(pY, y) + reg * b)

        pY_test = forward(Xtest, W, b)
        c = cost(pY_test, Ytest_ind)
        costs_batch.append(c)
        e = error_rate(pY_test, Ytest)
        if t % 10 == 0:
            print("Cost at iteration %d: %.6f" % (t, c))
            print("Error rate:", e)
    print("Elapsed time for batch GD:", datetime.now() - t0)

    # plot all three cost curves on a common [0, 1] x-axis so the different
    # numbers of recorded points line up
    x1 = np.linspace(0, 1, len(costs))
    plt.plot(x1, costs, label="full")
    x2 = np.linspace(0, 1, len(costs_stochastic))
    plt.plot(x2, costs_stochastic, label="stochastic")
    x3 = np.linspace(0, 1, len(costs_batch))
    plt.plot(x3, costs_batch, label="batch")
    plt.legend()
    plt.show()
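The full/stochastic/batch comparison assumes softmax-regression helpers. A minimal sketch consistent with the calls above (these are the standard cross-entropy gradients, so subtracting lr times them descends the cost; the actual course definitions may differ in sign convention):

def forward(X, W, b):
    # linear model followed by softmax
    A = X.dot(W) + b
    expA = np.exp(A)
    return expA / expA.sum(axis=1, keepdims=True)

def gradW(X, pY, T):
    # gradient of the total cross-entropy w.r.t. W
    return X.T.dot(pY - T)

def gradb(pY, T):
    return (pY - T).sum(axis=0)

def cost(pY, T):
    # total cross-entropy between predictions and the indicator targets
    return -(T * np.log(pY)).sum()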
def main():
    Xtrain, Xtest, Ytrain, Ytest = get_transformed_data()
    Ytrain_ind = y2indicator(Ytrain)
    Ytest_ind = y2indicator(Ytest)

    N, D = Xtrain.shape
    M = 300
    K = len(set(Ytrain))
    W1 = np.random.randn(D, M) / np.sqrt(D)
    b1 = np.zeros(M)
    W2 = np.random.randn(M, K) / np.sqrt(M)
    b2 = np.zeros(K)

    # save initial weights
    W1_0 = W1.copy()
    b1_0 = b1.copy()
    W2_0 = W2.copy()
    b2_0 = b2.copy()

    lr = 0.00004
    reg = 0.01
    batch_sz = 500
    n_batches = N // batch_sz
    epochs = 20

    # 1. batch
    costs_batch = []
    for t in range(epochs):
        tmpX, tmpY = shuffle(Xtrain, Ytrain_ind)
        for j in range(n_batches):
            x = tmpX[j * batch_sz:(j * batch_sz + batch_sz), :]
            y = tmpY[j * batch_sz:(j * batch_sz + batch_sz), :]
            pY, Z = forward(x, W1, b1, W2, b2)

            W2 -= lr * (derivative_W2(Z, pY, y) + reg * W2)
            b2 -= lr * (derivative_b2(pY, y) + reg * b2)
            W1 -= lr * (derivative_W1(x, W2, Z, pY, y) + reg * W1)
            b1 -= lr * (derivative_b1(W2, Z, pY, y) + reg * b1)

            if j % 10 == 0:
                pY_test, _ = forward(Xtest, W1, b1, W2, b2)
                c = cost(pY_test, Ytest_ind)
                costs_batch.append(c)
                print("Cost at iteration t=%d, j=%d: %.6f" % (t, j, c))
                e = error_rate(pY_test, Ytest)
                print("Error rate:", e)
    print("\n")

    # 2. RMSprop
    W1 = W1_0.copy()
    b1 = b1_0.copy()
    W2 = W2_0.copy()
    b2 = b2_0.copy()

    cache_W2 = 1
    cache_b2 = 1
    cache_W1 = 1
    cache_b1 = 1
    decay_rate = 0.999
    eps = 1e-10
    lr0 = 0.001
    costs_RMS = []
    for t in range(epochs):
        tmpX, tmpY = shuffle(Xtrain, Ytrain_ind)
        for j in range(n_batches):
            x = tmpX[j * batch_sz:(j * batch_sz + batch_sz), :]
            y = tmpY[j * batch_sz:(j * batch_sz + batch_sz), :]
            pY, Z = forward(x, W1, b1, W2, b2)

            gW2 = derivative_W2(Z, pY, y) + reg * W2
            cache_W2 = decay_rate * cache_W2 + (1 - decay_rate) * gW2 * gW2
            W2 -= lr0 * gW2 / (np.sqrt(cache_W2) + eps)

            gb2 = derivative_b2(pY, y) + reg * b2
            cache_b2 = decay_rate * cache_b2 + (1 - decay_rate) * gb2 * gb2
            b2 -= lr0 * gb2 / (np.sqrt(cache_b2) + eps)

            gW1 = derivative_W1(x, W2, Z, pY, y) + reg * W1
            cache_W1 = decay_rate * cache_W1 + (1 - decay_rate) * gW1 * gW1
            W1 -= lr0 * gW1 / (np.sqrt(cache_W1) + eps)

            gb1 = derivative_b1(W2, Z, pY, y) + reg * b1
            cache_b1 = decay_rate * cache_b1 + (1 - decay_rate) * gb1 * gb1
            b1 -= lr0 * gb1 / (np.sqrt(cache_b1) + eps)

            if j % 10 == 0:
                pY_test, _ = forward(Xtest, W1, b1, W2, b2)
                c = cost(pY_test, Ytest_ind)
                costs_RMS.append(c)
                print("Cost at iteration t=%d, j=%d: %.6f" % (t, j, c))
                e = error_rate(pY_test, Ytest)
                print("Error rate:", e)

    plt.plot(costs_batch, label="batch")
    plt.plot(costs_RMS, label="rms")
    plt.legend()
    plt.show()
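The two-layer listings additionally assume an ANN forward pass and backprop gradients. A sketch that matches the ReLU hidden layer used in the Theano listing above (an assumption; a sigmoid or tanh hidden layer would change the (Z > 0) factor):

def forward(X, W1, b1, W2, b2):
    Z = np.maximum(X.dot(W1) + b1, 0)            # ReLU hidden layer
    A = Z.dot(W2) + b2
    expA = np.exp(A)
    pY = expA / expA.sum(axis=1, keepdims=True)  # softmax output
    return pY, Z

def derivative_W2(Z, pY, T):
    return Z.T.dot(pY - T)

def derivative_b2(pY, T):
    return (pY - T).sum(axis=0)

def derivative_W1(X, W2, Z, pY, T):
    dZ = (pY - T).dot(W2.T) * (Z > 0)            # backprop through the ReLU
    return X.T.dot(dZ)

def derivative_b1(W2, Z, pY, T):
    return ((pY - T).dot(W2.T) * (Z > 0)).sum(axis=0)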
import tensorflow as tf  # TF 1.x graph API (see the compatibility note below)


def main():
    # step 1: get the data and define all the usual variables
    Xtrain, Xtest, Ytrain, Ytest = get_transformed_data()
    Ytrain_ind = y2indicator(Ytrain)
    Ytest_ind = y2indicator(Ytest)

    N, D = Xtrain.shape
    # add an extra layer just for fun
    M1 = 300
    M2 = 100
    K = len(set(Ytrain))
    W1_init = np.random.randn(D, M1) / np.sqrt(D)
    b1_init = np.zeros(M1)
    W2_init = np.random.randn(M1, M2) / np.sqrt(M1)
    b2_init = np.zeros(M2)
    W3_init = np.random.randn(M2, K) / np.sqrt(M2)
    b3_init = np.zeros(K)

    lr = 0.00004
    reg = 0.01
    batch_sz = 500
    n_batches = N // batch_sz
    epochs = 15

    # define variables and expressions
    X = tf.placeholder(tf.float32, shape=(None, D), name='X')
    T = tf.placeholder(tf.float32, shape=(None, K), name='T')
    W1 = tf.Variable(W1_init.astype(np.float32))
    b1 = tf.Variable(b1_init.astype(np.float32))
    W2 = tf.Variable(W2_init.astype(np.float32))
    b2 = tf.Variable(b2_init.astype(np.float32))
    W3 = tf.Variable(W3_init.astype(np.float32))
    b3 = tf.Variable(b3_init.astype(np.float32))

    # define the model
    Z1 = tf.nn.relu(tf.matmul(X, W1) + b1)
    Z2 = tf.nn.relu(tf.matmul(Z1, W2) + b2)
    pY = tf.matmul(Z2, W3) + b3  # remember, the cost function does the softmaxing!

    # define the cost function
    cost = tf.reduce_sum(tf.nn.softmax_cross_entropy_with_logits(labels=T, logits=pY))

    # we choose the optimizer but don't implement the algorithm ourselves.
    # let's go with RMSprop, since we just learned about it. it includes momentum!
    train_op = tf.train.RMSPropOptimizer(lr, decay=0.99, momentum=0.9).minimize(cost)

    prediction = tf.argmax(pY, axis=1)

    costs_batch = []
    init = tf.global_variables_initializer()
    with tf.Session() as session:
        session.run(init)

        for i in range(epochs):
            tmpX, tmpY = shuffle(Xtrain, Ytrain_ind)
            for j in range(n_batches):
                x = tmpX[j * batch_sz:(j * batch_sz + batch_sz), :]
                y = tmpY[j * batch_sz:(j * batch_sz + batch_sz), :]

                session.run(train_op, feed_dict={X: x, T: y})
                if j % 50 == 0:
                    cost_val = session.run(cost, feed_dict={X: Xtest, T: Ytest_ind})
                    prediction_val = session.run(prediction, feed_dict={X: Xtest})
                    e = error_rate(prediction_val, Ytest)
                    print("Cost / err at iteration i=%d, j=%d: %.3f / %.3f" % (i, j, cost_val, e))
                    costs_batch.append(cost_val)

    plt.plot(costs_batch)
    plt.show()
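This listing uses the TensorFlow 1.x graph API (tf.placeholder, tf.Session, tf.train.RMSPropOptimizer), all of which were removed from the default namespace in TensorFlow 2.x. One way to run it unchanged on TF 2.x is the compatibility shim; a sketch, assuming a TF 2.x install:

import tensorflow.compat.v1 as tf
tf.disable_v2_behavior()  # restores placeholders, sessions, and tf.train optimizers

Note also that later 1.x releases deprecated tf.nn.softmax_cross_entropy_with_logits in favor of softmax_cross_entropy_with_logits_v2, which takes the same labels=/logits= arguments used here.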
def main():
    Xtrain, Xtest, Ytrain, Ytest = get_transformed_data()
    Ytrain_ind = y2indicator(Ytrain)
    Ytest_ind = y2indicator(Ytest)

    N, D = Xtrain.shape
    M = 300
    K = len(set(Ytrain))
    W1 = np.random.randn(D, M) / np.sqrt(D)
    b1 = np.zeros(M)
    W2 = np.random.randn(M, K) / np.sqrt(M)
    b2 = np.zeros(K)

    # save initial weights
    W1_0 = W1.copy()
    b1_0 = b1.copy()
    W2_0 = W2.copy()
    b2_0 = b2.copy()

    lr = 0.001
    reg = 0.01
    batch_sz = 500
    n_batches = N // batch_sz
    epochs = 10

    beta1 = 0.9
    beta2 = 0.999
    eps = 1e-8

    # 1st moment
    mW2 = 0
    mb2 = 0
    mW1 = 0
    mb1 = 0

    # 2nd moment
    vW1 = 0
    vb1 = 0
    vW2 = 0
    vb2 = 0

    # 1. Adam
    costs_adam = []
    t = 1
    for i in range(epochs):
        tmpX, tmpY = shuffle(Xtrain, Ytrain_ind)
        for j in range(n_batches):
            x = tmpX[j * batch_sz:(j * batch_sz + batch_sz), :]
            y = tmpY[j * batch_sz:(j * batch_sz + batch_sz), :]
            pY, Z = forward(x, W1, b1, W2, b2)

            # gradients
            gW2 = derivative_W2(Z, pY, y) + reg * W2
            gb2 = derivative_b2(pY, y) + reg * b2
            gW1 = derivative_W1(x, W2, Z, pY, y) + reg * W1
            gb1 = derivative_b1(W2, Z, pY, y) + reg * b1

            # new m
            mW2 = beta1 * mW2 + (1 - beta1) * gW2
            mb2 = beta1 * mb2 + (1 - beta1) * gb2
            mW1 = beta1 * mW1 + (1 - beta1) * gW1
            mb1 = beta1 * mb1 + (1 - beta1) * gb1

            # new v
            vW2 = beta2 * vW2 + (1 - beta2) * gW2 * gW2
            vb2 = beta2 * vb2 + (1 - beta2) * gb2 * gb2
            vW1 = beta2 * vW1 + (1 - beta2) * gW1 * gW1
            vb1 = beta2 * vb1 + (1 - beta2) * gb1 * gb1

            # bias correction
            correction1 = 1 - beta1**t
            hat_mW2 = mW2 / correction1
            hat_mb2 = mb2 / correction1
            hat_mW1 = mW1 / correction1
            hat_mb1 = mb1 / correction1

            correction2 = 1 - beta2**t
            hat_vW2 = vW2 / correction2
            hat_vb2 = vb2 / correction2
            hat_vW1 = vW1 / correction2
            hat_vb1 = vb1 / correction2

            # update t
            t += 1

            # apply updates to the params
            W2 -= lr * hat_mW2 / np.sqrt(hat_vW2 + eps)
            b2 -= lr * hat_mb2 / np.sqrt(hat_vb2 + eps)
            W1 -= lr * hat_mW1 / np.sqrt(hat_vW1 + eps)
            b1 -= lr * hat_mb1 / np.sqrt(hat_vb1 + eps)

            if j % 10 == 0:
                pY_test, _ = forward(Xtest, W1, b1, W2, b2)
                c = cost(pY_test, Ytest_ind)
                costs_adam.append(c)
                print("Cost at iteration i=%d, j=%d: %.6f" % (i, j, c))
                e = error_rate(pY_test, Ytest)
                print("Error rate:", e)
    print("\n")

    # 2. RMSprop with momentum
    W1 = W1_0.copy()
    b1 = b1_0.copy()
    W2 = W2_0.copy()
    b2 = b2_0.copy()

    # rmsprop cache
    cache_W2 = 1
    cache_b2 = 1
    cache_W1 = 1
    cache_b1 = 1
    decay_rate = 0.999

    # momentum
    mu = 0.9
    dW2 = 0
    db2 = 0
    dW1 = 0
    db1 = 0

    costs_RMS = []
    for i in range(epochs):
        tmpX, tmpY = shuffle(Xtrain, Ytrain_ind)
        for j in range(n_batches):
            x = tmpX[j * batch_sz:(j * batch_sz + batch_sz), :]
            y = tmpY[j * batch_sz:(j * batch_sz + batch_sz), :]
            pY, Z = forward(x, W1, b1, W2, b2)

            # updates
            gW2 = derivative_W2(Z, pY, y) + reg * W2
            cache_W2 = decay_rate * cache_W2 + (1 - decay_rate) * gW2 * gW2
            dW2 = mu * dW2 + (1 - mu) * lr * gW2 / (np.sqrt(cache_W2) + eps)
            W2 -= dW2

            gb2 = derivative_b2(pY, y) + reg * b2
            cache_b2 = decay_rate * cache_b2 + (1 - decay_rate) * gb2 * gb2
            db2 = mu * db2 + (1 - mu) * lr * gb2 / (np.sqrt(cache_b2) + eps)
            b2 -= db2

            gW1 = derivative_W1(x, W2, Z, pY, y) + reg * W1
            cache_W1 = decay_rate * cache_W1 + (1 - decay_rate) * gW1 * gW1
            dW1 = mu * dW1 + (1 - mu) * lr * gW1 / (np.sqrt(cache_W1) + eps)
            W1 -= dW1

            gb1 = derivative_b1(W2, Z, pY, y) + reg * b1
            cache_b1 = decay_rate * cache_b1 + (1 - decay_rate) * gb1 * gb1
            db1 = mu * db1 + (1 - mu) * lr * gb1 / (np.sqrt(cache_b1) + eps)
            b1 -= db1

            if j % 10 == 0:
                pY_test, _ = forward(Xtest, W1, b1, W2, b2)
                c = cost(pY_test, Ytest_ind)
                costs_RMS.append(c)
                print("Cost at iteration i=%d, j=%d: %.6f" % (i, j, c))
                e = error_rate(pY_test, Ytest)
                print("Error rate:", e)

    plt.plot(costs_adam, label='adam')
    plt.plot(costs_RMS, label='rmsprop')
    plt.legend()
    plt.show()
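The Adam section repeats the same moment, bias-correction, and update arithmetic four times, once per parameter. A hypothetical refactor (not from the source) that collapses one update into a helper and makes the algorithm easier to read:

def adam_step(param, g, m, v, t, lr=0.001, beta1=0.9, beta2=0.999, eps=1e-8):
    # one Adam update: decay the moment estimates, bias-correct, then step
    m = beta1 * m + (1 - beta1) * g
    v = beta2 * v + (1 - beta2) * g * g
    m_hat = m / (1 - beta1**t)
    v_hat = v / (1 - beta2**t)
    # eps inside the sqrt, matching the listing above (the Adam paper puts it outside)
    param -= lr * m_hat / np.sqrt(v_hat + eps)
    return param, m, v

With this, each inner-loop update becomes, e.g., W2, mW2, vW2 = adam_step(W2, gW2, mW2, vW2, t).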
def main():
    # compare 3 scenarios:
    # 1. batch SGD
    # 2. batch SGD with momentum
    # 3. batch SGD with Nesterov momentum

    Xtrain, Xtest, Ytrain, Ytest = get_transformed_data()
    Ytrain_ind = y2indicator(Ytrain)
    Ytest_ind = y2indicator(Ytest)

    N, D = Xtrain.shape
    M = 300
    K = len(set(Ytrain))
    W1 = np.random.randn(D, M) / np.sqrt(D)
    b1 = np.zeros(M)
    W2 = np.random.randn(M, K) / np.sqrt(M)
    b2 = np.zeros(K)

    # save initial weights
    W1_0 = W1.copy()
    b1_0 = b1.copy()
    W2_0 = W2.copy()
    b2_0 = b2.copy()

    lr = 0.00004
    reg = 0.01
    batch_sz = 500
    n_batches = N // batch_sz
    epochs = 20

    # 1. batch
    costs_batch = []
    for t in range(epochs):
        tmpX, tmpY = shuffle(Xtrain, Ytrain_ind)
        for j in range(n_batches):
            x = tmpX[j * batch_sz:(j * batch_sz + batch_sz), :]
            y = tmpY[j * batch_sz:(j * batch_sz + batch_sz), :]
            pY, Z = forward(x, W1, b1, W2, b2)

            W2 -= lr * (derivative_W2(Z, pY, y) + reg * W2)
            b2 -= lr * (derivative_b2(pY, y) + reg * b2)
            W1 -= lr * (derivative_W1(x, W2, Z, pY, y) + reg * W1)
            b1 -= lr * (derivative_b1(W2, Z, pY, y) + reg * b1)

            if j % 50 == 0:
                pY_test, _ = forward(Xtest, W1, b1, W2, b2)
                c = cost(pY_test, Ytest_ind)
                costs_batch.append(c)
                print("Cost at iteration t=%d, j=%d: %.6f" % (t, j, c))
                e = error_rate(pY_test, Ytest)
                print("Error rate:", e)
    print("\n")

    # 2. batch with momentum
    W1 = W1_0.copy()
    b1 = b1_0.copy()
    W2 = W2_0.copy()
    b2 = b2_0.copy()

    mu = 0.9
    dW2 = 0
    db2 = 0
    dW1 = 0
    db1 = 0
    costs_batch_momentum = []
    for t in range(epochs):
        tmpX, tmpY = shuffle(Xtrain, Ytrain_ind)
        for j in range(n_batches):
            x = tmpX[j * batch_sz:(j * batch_sz + batch_sz), :]
            y = tmpY[j * batch_sz:(j * batch_sz + batch_sz), :]
            pY, Z = forward(x, W1, b1, W2, b2)

            # gradients
            gW2 = derivative_W2(Z, pY, y) + reg * W2
            gb2 = derivative_b2(pY, y) + reg * b2
            gW1 = derivative_W1(x, W2, Z, pY, y) + reg * W1
            gb1 = derivative_b1(W2, Z, pY, y) + reg * b1

            # update velocities
            dW2 = mu * dW2 - lr * gW2
            db2 = mu * db2 - lr * gb2
            dW1 = mu * dW1 - lr * gW1
            db1 = mu * db1 - lr * gb1

            # updates
            W2 += dW2
            b2 += db2
            W1 += dW1
            b1 += db1

            if j % 50 == 0:
                pY_test, _ = forward(Xtest, W1, b1, W2, b2)
                c = cost(pY_test, Ytest_ind)
                costs_batch_momentum.append(c)
                print("Cost at iteration t=%d, j=%d: %.6f" % (t, j, c))
                e = error_rate(pY_test, Ytest)
                print("Error rate:", e)
    print("\n")

    # 3. batch with Nesterov momentum
    W1 = W1_0.copy()
    b1 = b1_0.copy()
    W2 = W2_0.copy()
    b2 = b2_0.copy()

    mu = 0.9
    vW2 = 0
    vb2 = 0
    vW1 = 0
    vb1 = 0
    costs_batch_momentum_nesterov = []
    for t in range(epochs):
        tmpX, tmpY = shuffle(Xtrain, Ytrain_ind)
        for j in range(n_batches):
            x = tmpX[j * batch_sz:(j * batch_sz + batch_sz), :]
            y = tmpY[j * batch_sz:(j * batch_sz + batch_sz), :]
            pY, Z = forward(x, W1, b1, W2, b2)

            # gradients
            gW2 = derivative_W2(Z, pY, y) + reg * W2
            gb2 = derivative_b2(pY, y) + reg * b2
            gW1 = derivative_W1(x, W2, Z, pY, y) + reg * W1
            gb1 = derivative_b1(W2, Z, pY, y) + reg * b1

            # v update
            vW2 = mu * vW2 - lr * gW2
            vb2 = mu * vb2 - lr * gb2
            vW1 = mu * vW1 - lr * gW1
            vb1 = mu * vb1 - lr * gb1

            # param update using the new velocity
            W2 += mu * vW2 - lr * gW2
            b2 += mu * vb2 - lr * gb2
            W1 += mu * vW1 - lr * gW1
            b1 += mu * vb1 - lr * gb1

            if j % 50 == 0:
                pY_test, _ = forward(Xtest, W1, b1, W2, b2)
                c = cost(pY_test, Ytest_ind)
                costs_batch_momentum_nesterov.append(c)
                print("Cost at iteration t=%d, j=%d: %.6f" % (t, j, c))
                e = error_rate(pY_test, Ytest)
                print("Error rate:", e)

    plt.plot(costs_batch, label="batch")
    plt.plot(costs_batch_momentum, label="momentum")
    plt.plot(costs_batch_momentum_nesterov, label="nesterov")
    plt.legend()
    plt.show()
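The Nesterov branch above applies mu*v - lr*g twice, once to the velocity and once more to the parameters, which is one practical approximation of the lookahead step. For comparison, the formulation popularized by the cs231n notes tracks the previous velocity explicitly; a sketch in the same style, shown for W2 only:

vW2_prev = vW2
vW2 = mu * vW2 - lr * gW2
W2 += -mu * vW2_prev + (1 + mu) * vW2

This change of variables lets the code keep using the gradient at the stored parameters instead of maintaining a separate lookahead copy of every weight.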