def main():
    X, Y = get_normalized_data()

    max_iter = 20
    print_period = 10
    lr = 0.00004
    reg = 0.01

    Xtrain = X[:-1000,]
    Ytrain = Y[:-1000]
    Xtest = X[-1000:,]
    Ytest = Y[-1000:]
    Ytrain_ind = y2indicator(Ytrain)
    Ytest_ind = y2indicator(Ytest)

    N, D = Xtrain.shape
    batch_sz = 500
    n_batches = N // batch_sz

    M1 = 300
    M2 = 100
    K = 10
    W1_init = np.random.randn(D, M1) / 28
    b1_init = np.zeros(M1)
    W2_init = np.random.randn(M1, M2) / np.sqrt(M1)
    b2_init = np.zeros(M2)
    W3_init = np.random.randn(M2, K) / np.sqrt(M2)
    b3_init = np.zeros(K)
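Nearly every snippet here assumes helpers from the course's util module (get_normalized_data, y2indicator, error_rate). Minimal sketches of the two label helpers, assuming integer class labels 0..K-1:

import numpy as np

def y2indicator(y):
    # convert an N-vector of integer class labels into an N x K one-hot matrix
    N = len(y)
    K = len(set(y))
    ind = np.zeros((N, K))
    ind[np.arange(N), y.astype(np.int32)] = 1
    return ind

def error_rate(p, t):
    # fraction of predictions p that disagree with targets t
    return np.mean(p != t)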
def main():
    X_train, X_test, t_train, t_test = get_normalized_data()

    ann = ANN([500, 300])

    session = tf.InteractiveSession()
    ann.set_session(session)

    ann.fit(X_train, X_test, t_train, t_test, show_fig=True)

    # write the graph out for TensorBoard
    writer = tf.summary.FileWriter('./tensorboard_logs/demo1')
    writer.add_graph(session.graph)
def main():
    # step 1: get the data and define all the usual variables
    Xtrain, Xtest, Ytrain, Ytest = get_normalized_data()

    # hidden layer sizes and dropout rates (keep rates)
    ann = ANN([500, 300], [0.8, 0.5, 0.5])
    ann.fit(Xtrain, Ytrain, Xtest, Ytest, show_fig=True)
def main():
    Xtrain, Xtest, Ytrain, Ytest = get_normalized_data()

    ann = ANN([500, 300])

    session = tf.InteractiveSession()
    ann.set_session(session)

    ann.fit(Xtrain, Ytrain, Xtest, Ytest, show_fig=True)

    print("Train accuracy:", ann.score(Xtrain, Ytrain))
    print("Test accuracy:", ann.score(Xtest, Ytest))
def main():
    X, Y = get_normalized_data()  # normalized MNIST dataset

    t0 = datetime.now()
    model = ANN([2000, 1000], [0.8, 0.5, 0.5])
    model.fit(X, Y, display_cost=True)
    dt = datetime.now() - t0
    print('Elapsed time:', dt)
def main():
    X, Y = get_normalized_data()

    t0 = datetime.now()
    model = ANN([2000, 1000, 500])
    model.fit(X, Y, display_cost=True, save_params=True)
    dt = datetime.now() - t0
    print('Elapsed time:', dt)
def main():
    # step 1: get the data and define all the usual variables
    Xtrain, Xtest, Ytrain, Ytest = get_normalized_data()

    ann = ANN([500, 300])
    ann.fit(Xtrain, Ytrain, Xtest, Ytest, show_fig=True)

    print("Train accuracy:", ann.score(Xtrain, Ytrain))
    print("Test accuracy:", ann.score(Xtest, Ytest))
def main():
    X, Y = get_normalized_data()

    t0 = datetime.now()
    model = ANN([500, 300], [0.8, 0.8, 0.8])
    model.fit(X, Y, display_cost=True, save_params=False)
    dt = datetime.now() - t0
    print('Elapsed time:', dt)
def main():
    Xtrain, Xtest, Ytrain, Ytest = get_normalized_data()

    model = TFLogistic("./tf.model")
    model.fit(Xtrain, Ytrain, Xtest, Ytest)
    print("final train accuracy:", model.score(Xtrain, Ytrain))
    print("final test accuracy:", model.score(Xtest, Ytest))

    # save, then reload and score again
    model.save("my_trained.json")
    model = TFLogistic.load("my_trained.json")
    print("final train accuracy (after reload):", model.score(Xtrain, Ytrain))
    print("final test accuracy (after reload):", model.score(Xtest, Ytest))
def main():
    X, Y = get_normalized_data()
    Xtrain = X[:-1000,]
    Ytrain = Y[:-1000]
    Xtest = X[-1000:,]
    Ytest = Y[-1000:]

    model = TFLogistic("tf.model")
    model.fit(Xtrain, Ytrain, Xtest, Ytest)
def main():
    X, Y = get_normalized_data()  # normalized MNIST dataset
    Xtest, Ytest = X[-1000:, :], Y[-1000:]
    Xtrain, Ytrain = X[:-1000, :], Y[:-1000]

    model = ANN([1000, 500, 500])
    model.fit(Xtrain, Ytrain, display_cost=True)

    # joblib.dump(model, 'mymodel.pkl')
    # model = joblib.load('mymodel.pkl')

    print('Test set acc:', model.score(Xtest.astype(np.float32), Ytest.astype(np.float32)))
def main():
    Xtrain, Xtest, Ytrain, Ytest = get_normalized_data()
    Ytrain_ind = y2indicator(Ytrain)
    Ytest_ind = y2indicator(Ytest)

    lr = 0.00004
    reg = 0.01
    N, D = Xtrain.shape
    K = 10
    max_iter = 1000
    batch_sz = 500
    n_batches = N // batch_sz
    print_period = 10

    W_init = np.random.randn(D, K)
    b_init = np.random.randn(K)

    X = tf.placeholder(tf.float32, shape=(None, D), name='X')
    T = tf.placeholder(tf.float32, shape=(None, K), name='T')
    W = tf.Variable(W_init.astype(np.float32))
    b = tf.Variable(b_init.astype(np.float32))

    Yish = tf.matmul(X, W) + b
    cost = tf.reduce_sum(tf.nn.softmax_cross_entropy_with_logits(logits=Yish, labels=T))

    # train_op = tf.train.GradientDescentOptimizer(lr).minimize(cost)
    train_op = tf.train.RMSPropOptimizer(lr, decay=0.99, momentum=0.9).minimize(cost)

    predict_op = tf.argmax(Yish, 1)

    LL = []
    init = tf.global_variables_initializer()  # initialize_all_variables() is deprecated
    with tf.Session() as session:
        session.run(init)

        for i in range(max_iter):
            for j in range(n_batches):
                Xbatch = Xtrain[j*batch_sz:(j*batch_sz + batch_sz),]
                Ybatch = Ytrain_ind[j*batch_sz:(j*batch_sz + batch_sz),]

                session.run(train_op, feed_dict={X: Xbatch, T: Ybatch})
                if j % print_period == 0:
                    test_cost = session.run(cost, feed_dict={X: Xtest, T: Ytest_ind})
                    prediction = session.run(predict_op, feed_dict={X: Xtest})
                    err = error_rate(prediction, Ytest)
                    print("Cost / err at iteration i=%d, j=%d: %.3f / %.3f" % (i, j, test_cost, err))
                    LL.append(test_cost)

    plt.plot(LL)
    plt.show()
def benchmark_full():
    Xtrain, Xtest, Ytrain, Ytest = get_normalized_data()

    print("Performing logistic regression...")
    # lr = LogisticRegression(solver='lbfgs')

    # convert Ytrain and Ytest to (N x K) matrices of indicator variables
    N, D = Xtrain.shape
    Ytrain_ind = y2indicator(Ytrain)
    Ytest_ind = y2indicator(Ytest)

    W = np.random.randn(D, 10) / np.sqrt(D)
    b = np.zeros(10)
    LL = []
    LLtest = []
    CRtest = []

    # reg = 1
    # learning rate 0.0001 is too high, 0.00005 is also too high
    # 0.00003 / 2000 iterations => 0.363 error, -7630 cost
    # 0.00004 / 1000 iterations => 0.295 error, -7902 cost
    # 0.00004 / 2000 iterations => 0.321 error, -7528 cost
    # reg = 0.1, still around 0.31 error
    # reg = 0.01, still around 0.31 error
    lr = 0.00004
    reg = 0.01
    for i in range(500):
        p_y = forward(Xtrain, W, b)
        # print("p_y:", p_y)
        ll = cost(p_y, Ytrain_ind)
        LL.append(ll)

        p_y_test = forward(Xtest, W, b)
        lltest = cost(p_y_test, Ytest_ind)
        LLtest.append(lltest)

        err = error_rate(p_y_test, Ytest)
        CRtest.append(err)

        W += lr * (gradW(Ytrain_ind, p_y, Xtrain) - reg * W)
        b += lr * (gradb(Ytrain_ind, p_y) - reg * b)

        if i % 10 == 0:
            print("Cost at iteration %d: %.6f" % (i, ll))
            print("Error rate:", err)

    p_y = forward(Xtest, W, b)
    print("Final error rate:", error_rate(p_y, Ytest))

    iters = range(len(LL))
    plt.plot(iters, LL, iters, LLtest)
    plt.show()
    plt.plot(CRtest)
    plt.show()
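benchmark_full leans on softmax-regression helpers that aren't shown. A plausible sketch, with signs consistent with the ascent updates above (cost here is the log-likelihood, which explains the negative cost values in the comments):

def forward(X, W, b):
    # softmax of the linear model, numerically stabilized
    a = X.dot(W) + b
    expa = np.exp(a - a.max(axis=1, keepdims=True))
    return expa / expa.sum(axis=1, keepdims=True)

def cost(p_y, t):
    # log-likelihood (higher is better, hence W and b move along +gradient)
    return (t * np.log(p_y)).sum()

def gradW(t, y, X):
    return X.T.dot(t - y)

def gradb(t, y):
    return (t - y).sum(axis=0)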
def main():
    X, Y = get_normalized_data()
    X, Y = shuffle(X, Y)
    Xtrain, Ytrain = X[:-1000], Y[:-1000]
    Xtest, Ytest = X[-1000:], Y[-1000:]

    ann = ANN([500, 300])
    ann.fit(Xtrain, Ytrain, Xtest, Ytest, show_fig=True)

    print("Train accuracy:", ann.score(Xtrain, Ytrain))
    print("Test accuracy:", ann.score(Xtest, Ytest))
def main():
    # step 1: get the data and define all the usual variables
    X, Y = get_normalized_data()
    # Xtrain, Xtest, Ytrain, Ytest = train_test_split(X, Y, test_size=0.3)
    X, Y = shuffle(X, Y)
    Xtrain, Ytrain = X[:-1000], Y[:-1000]
    Xtest, Ytest = X[-1000:], Y[-1000:]

    ann = ANN([500, 300])
    ann.fit(Xtrain, Ytrain, Xtest, Ytest, show_fig=True)

    print("Train accuracy:", ann.score(Xtrain, Ytrain))
    print("Test accuracy:", ann.score(Xtest, Ytest))
def main():
    # step 1: get the data and define all the usual variables
    Xtrain, Xtest, Ytrain, Ytest = get_normalized_data()

    ann = ANN([500, 300])

    session = tf.compat.v1.InteractiveSession()
    ann.set_session(session)

    ann.fit(Xtrain, Ytrain, Xtest, Ytest, show_fig=True)

    print("Train accuracy:", ann.score(Xtrain, Ytrain))
    print("Test accuracy:", ann.score(Xtest, Ytest))
def main():
    X, Y = get_normalized_data()

    t0 = datetime.now()
    model = ANN([500, 300])

    session = tf.InteractiveSession()
    model.set_session(session)

    model.fit(X, Y, display_cost=True, save_params=True)
    dt = datetime.now() - t0
    print('Elapsed time:', dt)
def main():
    Xtrain, Xtest, Ytrain, Ytest = get_normalized_data()
    # Ytrain_ind = y2indicator(Ytrain)
    # Ytest_ind = y2indicator(Ytest)

    # find optimal hyperparameters
    Ms = [100, 200, 300]
    lrs = [0.00001, 0.000001, 0.0001]
    regs = [0.1, 0.01, 0.001]

    ann = ANN(Xtrain, Ytrain, Xtest, Ytest)
    lr, reg, M, LL = ann.grid_search(Ms, lrs, regs)
    print("Found optimal values: lr={}, reg={}, M={} at a cost of {}".format(lr, reg, M, LL))
def main():
    Xtrain, Xtest, Ytrain, Ytest = get_normalized_data()

    model = TFLogistic("./tf.model")
    model.fit(Xtrain, Ytrain, Xtest, Ytest)

    # test out restoring the model via the predict function
    print("final train accuracy:", model.score(Xtrain, Ytrain))
    print("final test accuracy:", model.score(Xtest, Ytest))

    # save the model
    model.save("my_trained_model.json")

    # load and score again
    model = TFLogistic.load("my_trained_model.json")
    print("final train accuracy (after reload):", model.score(Xtrain, Ytrain))
    print("final test accuracy (after reload):", model.score(Xtest, Ytest))
def main():
    max_iter = 20  # make it 30 for sigmoid
    print_period = 10

    X, Y = get_normalized_data()
    lr = 0.00004
    reg = 0.01

    Xtrain = X[:-1000,]
    Ytrain = Y[:-1000]
    Xtest = X[-1000:,]
    Ytest = Y[-1000:]
    Ytrain_ind = y2indicator(Ytrain)
    Ytest_ind = y2indicator(Ytest)

    N, D = Xtrain.shape
    batch_sz = 500
def main():
    # step 1: get the data and define all the usual variables
    X, Y = get_normalized_data()

    max_iter = 15
    print_period = 10

    lr = 0.00004
    reg = 0.01

    Xtrain = X[:-1000,]
    Ytrain = Y[:-1000]
    Xtest = X[-1000:,]
    Ytest = Y[-1000:]
    Ytrain_ind = y2indicator(Ytrain)
    Ytest_ind = y2indicator(Ytest)

    N, D = Xtrain.shape
    batch_sz = 500
def main():
    X, Y = get_normalized_data()
    Xtrain = X[:-1000,]
    Ytrain = Y[:-1000]
    Xtest = X[-1000:,]
    Ytest = Y[-1000:]

    model = TFLogistic("tf.model")
    model.fit(Xtrain, Ytrain, Xtest, Ytest)

    # test out restoring the model via the predict function
    print("final train accuracy:", model.score(Xtrain, Ytrain))
    print("final test accuracy:", model.score(Xtest, Ytest))

    # save the model
    model.save("my_trained_model.json")

    # load and score again
    model = TFLogistic.load("my_trained_model.json")
    print("final train accuracy (after reload):", model.score(Xtrain, Ytrain))
    print("final test accuracy (after reload):", model.score(Xtest, Ytest))
def main():
    # compare 3 scenarios:
    # 1. batch SGD
    # 2. batch SGD with momentum
    # 3. batch SGD with Nesterov momentum
    max_iter = 20  # make it 30 for sigmoid
    print_period = 10

    X, Y = get_normalized_data()
    lr = 0.00004
    reg = 0.01

    Xtrain = X[:-1000,]
    Ytrain = Y[:-1000]
    Xtest = X[-1000:,]
    Ytest = Y[-1000:]
    Ytrain_ind = y2indicator(Ytrain)
    Ytest_ind = y2indicator(Ytest)

    N, D = Xtrain.shape
    batch_sz = 500
def main():
    # step 1: get the data and define all the usual variables
    Xtrain, Xtest, Ytrain, Ytest = get_normalized_data()

    ann = ANN([500, 300], [0.8, 0.5, 0.5], './tf.model')

    session = tf.InteractiveSession()
    ann.set_session(session)

    ann.fit(Xtrain, Ytrain, Xtest, Ytest)
    print("final train accuracy:", ann.score(Xtrain, Ytrain))
    print("final test accuracy:", ann.score(Xtest, Ytest))

    ann.save("my_saved_model.json")
    session.close()

    sess = tf.InteractiveSession()
    model = ANN.load(sess, "my_saved_model.json")
    model.set_session(sess)
    model.restore_model()
    print("final train accuracy (after reload):", model.score(Xtrain, Ytrain))
    print("final test accuracy (after reload):", model.score(Xtest, Ytest))
def main():
    # compare 3 scenarios:
    # 1. batch SGD
    # 2. batch SGD with momentum
    # 3. batch SGD with Nesterov momentum
    max_iter = 20
    print_period = 10

    X, Y = get_normalized_data()
    lr = 0.00004
    reg = 0.01

    Xtrain, Ytrain = X[:-1000], Y[:-1000]
    Xtest, Ytest = X[-1000:], Y[-1000:]
    Ytrain_ind = y2indicator(Ytrain)
    Ytest_ind = y2indicator(Ytest)

    N, D = Xtrain.shape  # use the train set shape, not X.shape
    batch_sz = 500
    n_batches = N // batch_sz

    M = 300  # number of hidden neurons
    K = 10   # number of output classes

    W1 = np.random.randn(D, M) / np.sqrt(D + M)
    b1 = np.zeros(M)
    W2 = np.random.randn(M, K) / np.sqrt(M)
    b2 = np.zeros(K)

    # 1. batch SGD
    LL_batch = []
    CR_batch = []
    for i in range(max_iter):
        for j in range(n_batches):
            Xbatch = Xtrain[j*batch_sz:(j + 1)*batch_sz,]
            Ybatch = Ytrain_ind[j*batch_sz:(j + 1)*batch_sz,]
            pYbatch, Z = forward(Xbatch, W1, b1, W2, b2)

            W2 -= lr*(derivative_w2(Z, Ybatch, pYbatch) + reg*W2)
            b2 -= lr*(derivative_b2(Ybatch, pYbatch) + reg*b2)
            W1 -= lr*(derivative_w1(Xbatch, Z, Ybatch, pYbatch, W2) + reg*W1)
            b1 -= lr*(derivative_b1(Z, Ybatch, pYbatch, W2) + reg*b1)

            if j % print_period == 0:
                pY, _ = forward(Xtest, W1, b1, W2, b2)
                ll = cost(pY, Ytest_ind)
                LL_batch.append(ll)
                print("Cost at iteration i=%d, j=%d: %.6f" % (i, j, ll))
                err = error_rate(pY, Ytest)
                CR_batch.append(err)
                print("Error rate:", err)
    pY, _ = forward(Xtest, W1, b1, W2, b2)
    print("Final error rate:", error_rate(pY, Ytest))

    # 2. batch SGD with momentum
    W1 = np.random.randn(D, M) / np.sqrt(D + M)
    b1 = np.zeros(M)
    W2 = np.random.randn(M, K) / np.sqrt(M)
    b2 = np.zeros(K)
    LL_momentum = []
    CR_momentum = []
    mu = 0.9
    dW2 = 0
    db2 = 0
    dW1 = 0
    db1 = 0
    for i in range(max_iter):
        for j in range(n_batches):
            Xbatch = Xtrain[j*batch_sz:(j + 1)*batch_sz,]
            Ybatch = Ytrain_ind[j*batch_sz:(j + 1)*batch_sz,]
            pYbatch, Z = forward(Xbatch, W1, b1, W2, b2)

            dW2 = mu*dW2 - lr*(derivative_w2(Z, Ybatch, pYbatch) + reg*W2)
            W2 += dW2
            db2 = mu*db2 - lr*(derivative_b2(Ybatch, pYbatch) + reg*b2)
            b2 += db2
            dW1 = mu*dW1 - lr*(derivative_w1(Xbatch, Z, Ybatch, pYbatch, W2) + reg*W1)
            W1 += dW1
            db1 = mu*db1 - lr*(derivative_b1(Z, Ybatch, pYbatch, W2) + reg*b1)
            b1 += db1

            if j % print_period == 0:
                pY, _ = forward(Xtest, W1, b1, W2, b2)
                ll = cost(pY, Ytest_ind)
                LL_momentum.append(ll)
                print("Cost at iteration i=%d, j=%d: %.6f" % (i, j, ll))
                err = error_rate(pY, Ytest)
                CR_momentum.append(err)
                print("Error rate:", err)
    pY, _ = forward(Xtest, W1, b1, W2, b2)
    print("Final error rate:", error_rate(pY, Ytest))

    # 3. batch SGD with Nesterov momentum
    W1 = np.random.randn(D, M) / np.sqrt(D + M)
    b1 = np.zeros(M)
    W2 = np.random.randn(M, K) / np.sqrt(M)
    b2 = np.zeros(K)
    LL_nesterov = []
    CR_nesterov = []
    mu = 0.9
    dW2 = 0
    db2 = 0
    dW1 = 0
    db1 = 0
    for i in range(max_iter):
        for j in range(n_batches):
            Xbatch = Xtrain[j*batch_sz:(j + 1)*batch_sz,]
            Ybatch = Ytrain_ind[j*batch_sz:(j + 1)*batch_sz,]
            pYbatch, Z = forward(Xbatch, W1, b1, W2, b2)

            dW2 = mu*mu*dW2 - (1 + mu)*lr*(derivative_w2(Z, Ybatch, pYbatch) + reg*W2)
            W2 += dW2
            db2 = mu*mu*db2 - (1 + mu)*lr*(derivative_b2(Ybatch, pYbatch) + reg*b2)
            b2 += db2
            dW1 = mu*mu*dW1 - (1 + mu)*lr*(derivative_w1(Xbatch, Z, Ybatch, pYbatch, W2) + reg*W1)
            W1 += dW1
            db1 = mu*mu*db1 - (1 + mu)*lr*(derivative_b1(Z, Ybatch, pYbatch, W2) + reg*b1)
            b1 += db1

            if j % print_period == 0:
                pY, _ = forward(Xtest, W1, b1, W2, b2)
                ll = cost(pY, Ytest_ind)
                LL_nesterov.append(ll)
                print("Cost at iteration i=%d, j=%d: %.6f" % (i, j, ll))
                err = error_rate(pY, Ytest)
                CR_nesterov.append(err)
                print("Error rate:", err)
    pY, _ = forward(Xtest, W1, b1, W2, b2)
    print("Final error rate:", error_rate(pY, Ytest))

    plt.plot(LL_batch, label="batch")
    plt.plot(LL_momentum, label='momentum')
    plt.plot(LL_nesterov, label='nesterov')
    plt.legend()
    plt.show()
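These optimizer-comparison scripts all share a two-layer forward pass and its gradients, which aren't shown. A minimal sketch consistent with the descent updates above, assuming a relu hidden layer (the "make it 30 for sigmoid" comments refer to a sigmoid variant):

def forward(X, W1, b1, W2, b2):
    # relu hidden layer, softmax output; returns (output probs, hidden activations)
    Z = X.dot(W1) + b1
    Z[Z < 0] = 0  # relu
    A = Z.dot(W2) + b2
    expA = np.exp(A - A.max(axis=1, keepdims=True))
    pY = expA / expA.sum(axis=1, keepdims=True)
    return pY, Z

def derivative_w2(Z, T, pY):
    return Z.T.dot(pY - T)

def derivative_b2(T, pY):
    return (pY - T).sum(axis=0)

def derivative_w1(X, Z, T, pY, W2):
    dZ = (pY - T).dot(W2.T) * (Z > 0)  # relu gradient
    return X.T.dot(dZ)

def derivative_b1(Z, T, pY, W2):
    return ((pY - T).dot(W2.T) * (Z > 0)).sum(axis=0)

def cost(pY, T):
    # total cross-entropy over the set
    return -(T * np.log(pY)).sum()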
def main():
    # step 1: get the data and define all the usual variables
    X, Y = get_normalized_data()

    max_iter = 15
    print_period = 10

    lr = 0.00004
    reg = 0.01

    Xtrain = X[:-1000,]
    Ytrain = Y[:-1000]
    Xtest = X[-1000:,]
    Ytest = Y[-1000:]
    Ytrain_ind = y2indicator(Ytrain)
    Ytest_ind = y2indicator(Ytest)

    N, D = Xtrain.shape
    batch_sz = 500
    n_batches = N // batch_sz

    # add an extra layer just for fun
    M1 = 300
    M2 = 100
    K = 10
    W1_init = np.random.randn(D, M1) / 28
    b1_init = np.zeros(M1)
    W2_init = np.random.randn(M1, M2) / np.sqrt(M1)
    b2_init = np.zeros(M2)
    W3_init = np.random.randn(M2, K) / np.sqrt(M2)
    b3_init = np.zeros(K)

    # initialize variables and expressions
    X = tf.placeholder(tf.float32, shape=(None, D), name='X')
    T = tf.placeholder(tf.float32, shape=(None, K), name='T')
    W1 = tf.Variable(W1_init.astype(np.float32))
    b1 = tf.Variable(b1_init.astype(np.float32))
    W2 = tf.Variable(W2_init.astype(np.float32))
    b2 = tf.Variable(b2_init.astype(np.float32))
    W3 = tf.Variable(W3_init.astype(np.float32))
    b3 = tf.Variable(b3_init.astype(np.float32))

    # define the model
    Z1 = tf.nn.relu(tf.matmul(X, W1) + b1)
    Z2 = tf.nn.relu(tf.matmul(Z1, W2) + b2)
    # the cost function does the softmaxing! SO NO SOFTMAXING HERE
    Yish = tf.matmul(Z2, W3) + b3

    cost = tf.reduce_sum(tf.nn.softmax_cross_entropy_with_logits(logits=Yish, labels=T))
    train_op = tf.train.RMSPropOptimizer(lr, decay=0.99, momentum=0.9).minimize(cost)
    predict_op = tf.argmax(Yish, 1)

    costs = []
    init = tf.global_variables_initializer()
    with tf.Session() as session:
        session.run(init)

        for i in range(max_iter):
            for j in range(n_batches):
                Xbatch = Xtrain[j*batch_sz:(j*batch_sz + batch_sz),]
                Ybatch = Ytrain_ind[j*batch_sz:(j*batch_sz + batch_sz),]

                session.run(train_op, feed_dict={X: Xbatch, T: Ybatch})
                if j % print_period == 0:
                    test_cost = session.run(cost, feed_dict={X: Xtest, T: Ytest_ind})
                    prediction = session.run(predict_op, feed_dict={X: Xtest})
                    err = error_rate(prediction, Ytest)
                    print("Cost / err at iteration i=%d, j=%d: %.3f / %.3f" % (i, j, test_cost, err))
                    costs.append(test_cost)

    plt.plot(costs)
    plt.show()
# Note: it is helpful to look at keras_example.py first
import numpy as np
import matplotlib.pyplot as plt
from util import get_normalized_data

import torch
from torch.autograd import Variable
from torch import optim

# get the data, same as the Theano + Tensorflow examples
X, Y = get_normalized_data()

# get shapes
_, D = X.shape
K = len(set(Y))

# split the data
Xtrain = X[:-1000,]
Ytrain = Y[:-1000]
Xtest = X[-1000:,]
Ytest = Y[-1000:]

# Note: no need to convert Y to an indicator matrix

# the model will be a sequence of layers
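The file breaks off at the model definition. A minimal sketch of how the sequence of layers and one training step might look, assuming the 500/300 hidden sizes used elsewhere in these scripts; the optimizer choice and learning rate here are illustrative, not the original file's code:

model = torch.nn.Sequential(
    torch.nn.Linear(D, 500),
    torch.nn.ReLU(),
    torch.nn.Linear(500, 300),
    torch.nn.ReLU(),
    torch.nn.Linear(300, K),
)

# CrossEntropyLoss works on logits + integer labels,
# which is why no indicator matrix is needed
loss_fn = torch.nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=1e-4)

Xt = torch.from_numpy(Xtrain.astype(np.float32))
Yt = torch.from_numpy(Ytrain.astype(np.int64))

# one illustrative gradient step
optimizer.zero_grad()
loss = loss_fn(model(Xt), Yt)
loss.backward()
optimizer.step()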
def main():
    max_iter = 20  # make it 30 for sigmoid
    print_period = 10

    X, Y = get_normalized_data()
    lr = 0.00004
    reg = 0.01

    Xtrain = X[:-1000,]
    Ytrain = Y[:-1000]
    Xtest = X[-1000:,]
    Ytest = Y[-1000:]
    Ytrain_ind = y2indicator(Ytrain)
    Ytest_ind = y2indicator(Ytest)

    N, D = Xtrain.shape
    batch_sz = 500
    n_batches = N // batch_sz

    M = 300
    K = 10
    W1 = np.random.randn(D, M) / 28
    b1 = np.zeros(M)
    W2 = np.random.randn(M, K) / np.sqrt(M)
    b2 = np.zeros(K)

    # 1. constant learning rate
    # cost = -16
    LL_batch = []
    CR_batch = []
    for i in range(max_iter):
        for j in range(n_batches):
            Xbatch = Xtrain[j*batch_sz:(j*batch_sz + batch_sz),]
            Ybatch = Ytrain_ind[j*batch_sz:(j*batch_sz + batch_sz),]
            pYbatch, Z = forward(Xbatch, W1, b1, W2, b2)
            # print("first batch cost:", cost(pYbatch, Ybatch))

            # updates
            W2 -= lr*(derivative_w2(Z, Ybatch, pYbatch) + reg*W2)
            b2 -= lr*(derivative_b2(Ybatch, pYbatch) + reg*b2)
            W1 -= lr*(derivative_w1(Xbatch, Z, Ybatch, pYbatch, W2) + reg*W1)
            b1 -= lr*(derivative_b1(Z, Ybatch, pYbatch, W2) + reg*b1)

            if j % print_period == 0:
                # calculate just for LL
                pY, _ = forward(Xtest, W1, b1, W2, b2)
                ll = cost(pY, Ytest_ind)
                LL_batch.append(ll)
                print("Cost at iteration i=%d, j=%d: %.6f" % (i, j, ll))
                err = error_rate(pY, Ytest)
                CR_batch.append(err)
                print("Error rate:", err)
    pY, _ = forward(Xtest, W1, b1, W2, b2)
    print("Final error rate:", error_rate(pY, Ytest))

    # 2. RMSprop
    W1 = np.random.randn(D, M) / 28
    b1 = np.zeros(M)
    W2 = np.random.randn(M, K) / np.sqrt(M)
    b2 = np.zeros(K)
    LL_rms = []
    CR_rms = []
    lr0 = 0.001  # if you set this too high you'll get NaN!
    cache_W2 = 0
    cache_b2 = 0
    cache_W1 = 0
    cache_b1 = 0
    decay_rate = 0.999
    eps = 1e-10
    for i in range(max_iter):
        for j in range(n_batches):
            Xbatch = Xtrain[j*batch_sz:(j*batch_sz + batch_sz),]
            Ybatch = Ytrain_ind[j*batch_sz:(j*batch_sz + batch_sz),]
            pYbatch, Z = forward(Xbatch, W1, b1, W2, b2)

            # updates
            gW2 = derivative_w2(Z, Ybatch, pYbatch) + reg*W2
            cache_W2 = decay_rate*cache_W2 + (1 - decay_rate)*gW2*gW2
            W2 -= lr0 * gW2 / (np.sqrt(cache_W2) + eps)

            gb2 = derivative_b2(Ybatch, pYbatch) + reg*b2
            cache_b2 = decay_rate*cache_b2 + (1 - decay_rate)*gb2*gb2
            b2 -= lr0 * gb2 / (np.sqrt(cache_b2) + eps)

            gW1 = derivative_w1(Xbatch, Z, Ybatch, pYbatch, W2) + reg*W1
            cache_W1 = decay_rate*cache_W1 + (1 - decay_rate)*gW1*gW1
            W1 -= lr0 * gW1 / (np.sqrt(cache_W1) + eps)

            gb1 = derivative_b1(Z, Ybatch, pYbatch, W2) + reg*b1
            cache_b1 = decay_rate*cache_b1 + (1 - decay_rate)*gb1*gb1
            b1 -= lr0 * gb1 / (np.sqrt(cache_b1) + eps)

            if j % print_period == 0:
                # calculate just for LL
                pY, _ = forward(Xtest, W1, b1, W2, b2)
                ll = cost(pY, Ytest_ind)
                LL_rms.append(ll)
                print("Cost at iteration i=%d, j=%d: %.6f" % (i, j, ll))
                err = error_rate(pY, Ytest)
                CR_rms.append(err)
                print("Error rate:", err)
    pY, _ = forward(Xtest, W1, b1, W2, b2)
    print("Final error rate:", error_rate(pY, Ytest))

    plt.plot(LL_batch, label='const')
    plt.plot(LL_rms, label='rms')
    plt.legend()
    plt.show()
def main():
    # compare 3 scenarios:
    # 1. batch SGD
    # 2. batch SGD with momentum
    # 3. batch SGD with Nesterov momentum
    max_iter = 20  # make it 30 for sigmoid
    print_period = 10

    X, Y = get_normalized_data()
    lr = 0.00004
    reg = 0.01

    Xtrain = X[:-1000,]
    Ytrain = Y[:-1000]
    Xtest = X[-1000:,]
    Ytest = Y[-1000:]
    Ytrain_ind = y2indicator(Ytrain)
    Ytest_ind = y2indicator(Ytest)

    N, D = Xtrain.shape
    batch_sz = 500
    n_batches = N // batch_sz

    M = 300
    K = 10
    W1 = np.random.randn(D, M) / np.sqrt(D)
    b1 = np.zeros(M)
    W2 = np.random.randn(M, K) / np.sqrt(M)
    b2 = np.zeros(K)

    # 1. batch
    # cost = -16
    LL_batch = []
    CR_batch = []
    for i in range(max_iter):
        for j in range(n_batches):
            Xbatch = Xtrain[j*batch_sz:(j*batch_sz + batch_sz),]
            Ybatch = Ytrain_ind[j*batch_sz:(j*batch_sz + batch_sz),]
            pYbatch, Z = forward(Xbatch, W1, b1, W2, b2)
            # print("first batch cost:", cost(pYbatch, Ybatch))

            # updates
            W2 -= lr*(derivative_w2(Z, Ybatch, pYbatch) + reg*W2)
            b2 -= lr*(derivative_b2(Ybatch, pYbatch) + reg*b2)
            W1 -= lr*(derivative_w1(Xbatch, Z, Ybatch, pYbatch, W2) + reg*W1)
            b1 -= lr*(derivative_b1(Z, Ybatch, pYbatch, W2) + reg*b1)

            if j % print_period == 0:
                # calculate just for LL
                pY, _ = forward(Xtest, W1, b1, W2, b2)
                ll = cost(pY, Ytest_ind)
                LL_batch.append(ll)
                print("Cost at iteration i=%d, j=%d: %.6f" % (i, j, ll))
                err = error_rate(pY, Ytest)
                CR_batch.append(err)
                print("Error rate:", err)
    pY, _ = forward(Xtest, W1, b1, W2, b2)
    print("Final error rate:", error_rate(pY, Ytest))

    # 2. batch with momentum
    W1 = np.random.randn(D, M) / np.sqrt(D)
    b1 = np.zeros(M)
    W2 = np.random.randn(M, K) / np.sqrt(M)
    b2 = np.zeros(K)
    LL_momentum = []
    CR_momentum = []
    mu = 0.9
    dW2 = 0
    db2 = 0
    dW1 = 0
    db1 = 0
    for i in range(max_iter):
        for j in range(n_batches):
            Xbatch = Xtrain[j*batch_sz:(j*batch_sz + batch_sz),]
            Ybatch = Ytrain_ind[j*batch_sz:(j*batch_sz + batch_sz),]
            pYbatch, Z = forward(Xbatch, W1, b1, W2, b2)

            # updates
            dW2 = mu*dW2 - lr*(derivative_w2(Z, Ybatch, pYbatch) + reg*W2)
            W2 += dW2
            db2 = mu*db2 - lr*(derivative_b2(Ybatch, pYbatch) + reg*b2)
            b2 += db2
            dW1 = mu*dW1 - lr*(derivative_w1(Xbatch, Z, Ybatch, pYbatch, W2) + reg*W1)
            W1 += dW1
            db1 = mu*db1 - lr*(derivative_b1(Z, Ybatch, pYbatch, W2) + reg*b1)
            b1 += db1

            if j % print_period == 0:
                # calculate just for LL
                pY, _ = forward(Xtest, W1, b1, W2, b2)
                ll = cost(pY, Ytest_ind)
                LL_momentum.append(ll)
                print("Cost at iteration i=%d, j=%d: %.6f" % (i, j, ll))
                err = error_rate(pY, Ytest)
                CR_momentum.append(err)
                print("Error rate:", err)
    pY, _ = forward(Xtest, W1, b1, W2, b2)
    print("Final error rate:", error_rate(pY, Ytest))

    # 3. batch with Nesterov momentum
    W1 = np.random.randn(D, M) / np.sqrt(D)
    b1 = np.zeros(M)
    W2 = np.random.randn(M, K) / np.sqrt(M)
    b2 = np.zeros(K)
    LL_nest = []
    CR_nest = []
    mu = 0.9
    # (an alternate formulation uses mu*mu and (1 + mu) factors directly on dW;
    # see the previous script)
    vW2 = 0
    vb2 = 0
    vW1 = 0
    vb1 = 0
    for i in range(max_iter):
        for j in range(n_batches):
            # because we want g(t) = grad(f(W(t-1) - lr*mu*dW(t-1)))
            # dW(t) = mu*dW(t-1) + g(t)
            # W(t) = W(t-1) - lr*dW(t)
            W1_tmp = W1 - lr*mu*vW1
            b1_tmp = b1 - lr*mu*vb1
            W2_tmp = W2 - lr*mu*vW2
            b2_tmp = b2 - lr*mu*vb2

            Xbatch = Xtrain[j*batch_sz:(j*batch_sz + batch_sz),]
            Ybatch = Ytrain_ind[j*batch_sz:(j*batch_sz + batch_sz),]
            # evaluate the gradient at the lookahead weights
            pYbatch, Z = forward(Xbatch, W1_tmp, b1_tmp, W2_tmp, b2_tmp)

            # updates
            vW2 = mu*vW2 + derivative_w2(Z, Ybatch, pYbatch) + reg*W2_tmp
            W2 -= lr*vW2
            vb2 = mu*vb2 + derivative_b2(Ybatch, pYbatch) + reg*b2_tmp
            b2 -= lr*vb2
            vW1 = mu*vW1 + derivative_w1(Xbatch, Z, Ybatch, pYbatch, W2_tmp) + reg*W1_tmp
            W1 -= lr*vW1
            vb1 = mu*vb1 + derivative_b1(Z, Ybatch, pYbatch, W2_tmp) + reg*b1_tmp
            b1 -= lr*vb1

            if j % print_period == 0:
                # calculate just for LL
                pY, _ = forward(Xtest, W1, b1, W2, b2)
                ll = cost(pY, Ytest_ind)
                LL_nest.append(ll)
                print("Cost at iteration i=%d, j=%d: %.6f" % (i, j, ll))
                err = error_rate(pY, Ytest)
                CR_nest.append(err)
                print("Error rate:", err)
    pY, _ = forward(Xtest, W1, b1, W2, b2)
    print("Final error rate:", error_rate(pY, Ytest))

    plt.plot(LL_batch, label="batch")
    plt.plot(LL_momentum, label="momentum")
    plt.plot(LL_nest, label="nesterov")
    plt.legend()
    plt.show()
def main():
    max_iter = 10
    print_period = 10

    X, Y = get_normalized_data()
    reg = 0.01

    Xtrain = X[:-1000,]
    Ytrain = Y[:-1000]
    Xtest = X[-1000:,]
    Ytest = Y[-1000:]
    Ytrain_ind = y2indicator(Ytrain)
    Ytest_ind = y2indicator(Ytest)

    N, D = Xtrain.shape
    batch_sz = 500
    n_batches = N // batch_sz

    M = 300
    K = 10
    W1_0 = np.random.randn(D, M) / np.sqrt(D)
    b1_0 = np.zeros(M)
    W2_0 = np.random.randn(M, K) / np.sqrt(M)
    b2_0 = np.zeros(K)

    W1 = W1_0.copy()
    b1 = b1_0.copy()
    W2 = W2_0.copy()
    b2 = b2_0.copy()

    # 1st moment
    mW1 = 0
    mb1 = 0
    mW2 = 0
    mb2 = 0

    # 2nd moment
    vW1 = 0
    vb1 = 0
    vW2 = 0
    vb2 = 0

    # hyperparams
    lr0 = 0.001
    beta1 = 0.9
    beta2 = 0.999
    eps = 1e-8

    # 1. Adam
    loss_adam = []
    err_adam = []
    t = 1
    for i in range(max_iter):
        for j in range(n_batches):
            Xbatch = Xtrain[j*batch_sz:(j*batch_sz + batch_sz),]
            Ybatch = Ytrain_ind[j*batch_sz:(j*batch_sz + batch_sz),]
            pYbatch, Z = forward(Xbatch, W1, b1, W2, b2)

            # gradients
            gW2 = derivative_w2(Z, Ybatch, pYbatch) + reg*W2
            gb2 = derivative_b2(Ybatch, pYbatch) + reg*b2
            gW1 = derivative_w1(Xbatch, Z, Ybatch, pYbatch, W2) + reg*W1
            gb1 = derivative_b1(Z, Ybatch, pYbatch, W2) + reg*b1

            # new m
            mW1 = beta1 * mW1 + (1 - beta1) * gW1
            mb1 = beta1 * mb1 + (1 - beta1) * gb1
            mW2 = beta1 * mW2 + (1 - beta1) * gW2
            mb2 = beta1 * mb2 + (1 - beta1) * gb2

            # new v
            vW1 = beta2 * vW1 + (1 - beta2) * gW1 * gW1
            vb1 = beta2 * vb1 + (1 - beta2) * gb1 * gb1
            vW2 = beta2 * vW2 + (1 - beta2) * gW2 * gW2
            vb2 = beta2 * vb2 + (1 - beta2) * gb2 * gb2

            # bias correction
            correction1 = 1 - beta1 ** t
            hat_mW1 = mW1 / correction1
            hat_mb1 = mb1 / correction1
            hat_mW2 = mW2 / correction1
            hat_mb2 = mb2 / correction1

            correction2 = 1 - beta2 ** t
            hat_vW1 = vW1 / correction2
            hat_vb1 = vb1 / correction2
            hat_vW2 = vW2 / correction2
            hat_vb2 = vb2 / correction2

            # update t
            t += 1

            # apply updates to the params
            W1 = W1 - lr0 * hat_mW1 / np.sqrt(hat_vW1 + eps)
            b1 = b1 - lr0 * hat_mb1 / np.sqrt(hat_vb1 + eps)
            W2 = W2 - lr0 * hat_mW2 / np.sqrt(hat_vW2 + eps)
            b2 = b2 - lr0 * hat_mb2 / np.sqrt(hat_vb2 + eps)

            if j % print_period == 0:
                pY, _ = forward(Xtest, W1, b1, W2, b2)
                l = cost(pY, Ytest_ind)
                loss_adam.append(l)
                print("Cost at iteration i=%d, j=%d: %.6f" % (i, j, l))
                err = error_rate(pY, Ytest)
                err_adam.append(err)
                print("Error rate:", err)
    pY, _ = forward(Xtest, W1, b1, W2, b2)
    print("Final error rate:", error_rate(pY, Ytest))

    # 2. RMSprop with momentum
    W1 = W1_0.copy()
    b1 = b1_0.copy()
    W2 = W2_0.copy()
    b2 = b2_0.copy()
    loss_rms = []
    err_rms = []

    # comparable hyperparameters for a fair comparison
    lr0 = 0.001
    mu = 0.9
    decay_rate = 0.999
    eps = 1e-8

    # rmsprop cache
    cache_W2 = 1
    cache_b2 = 1
    cache_W1 = 1
    cache_b1 = 1

    # momentum
    dW1 = 0
    db1 = 0
    dW2 = 0
    db2 = 0
    for i in range(max_iter):
        for j in range(n_batches):
            Xbatch = Xtrain[j*batch_sz:(j*batch_sz + batch_sz),]
            Ybatch = Ytrain_ind[j*batch_sz:(j*batch_sz + batch_sz),]
            pYbatch, Z = forward(Xbatch, W1, b1, W2, b2)

            # updates
            gW2 = derivative_w2(Z, Ybatch, pYbatch) + reg*W2
            cache_W2 = decay_rate*cache_W2 + (1 - decay_rate)*gW2*gW2
            dW2 = mu * dW2 + (1 - mu) * lr0 * gW2 / (np.sqrt(cache_W2) + eps)
            W2 -= dW2

            gb2 = derivative_b2(Ybatch, pYbatch) + reg*b2
            cache_b2 = decay_rate*cache_b2 + (1 - decay_rate)*gb2*gb2
            db2 = mu * db2 + (1 - mu) * lr0 * gb2 / (np.sqrt(cache_b2) + eps)
            b2 -= db2

            gW1 = derivative_w1(Xbatch, Z, Ybatch, pYbatch, W2) + reg*W1
            cache_W1 = decay_rate*cache_W1 + (1 - decay_rate)*gW1*gW1
            dW1 = mu * dW1 + (1 - mu) * lr0 * gW1 / (np.sqrt(cache_W1) + eps)
            W1 -= dW1

            gb1 = derivative_b1(Z, Ybatch, pYbatch, W2) + reg*b1
            cache_b1 = decay_rate*cache_b1 + (1 - decay_rate)*gb1*gb1
            db1 = mu * db1 + (1 - mu) * lr0 * gb1 / (np.sqrt(cache_b1) + eps)
            b1 -= db1

            if j % print_period == 0:
                pY, _ = forward(Xtest, W1, b1, W2, b2)
                l = cost(pY, Ytest_ind)
                loss_rms.append(l)
                print("Cost at iteration i=%d, j=%d: %.6f" % (i, j, l))
                err = error_rate(pY, Ytest)
                err_rms.append(err)
                print("Error rate:", err)
    pY, _ = forward(Xtest, W1, b1, W2, b2)
    print("Final error rate:", error_rate(pY, Ytest))

    plt.plot(loss_adam, label='adam')
    plt.plot(loss_rms, label='rmsprop')
    plt.legend()
    plt.show()
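The Adam loop above repeats the same four-step update for every parameter. The update factors cleanly into a helper; a compact sketch (a hypothetical refactoring, not in the original file), matching the equations and the np.sqrt(hat_v + eps) placement used above:

def adam_step(param, g, m, v, t, lr0=0.001, beta1=0.9, beta2=0.999, eps=1e-8):
    # one bias-corrected Adam update; returns the new (param, m, v)
    m = beta1 * m + (1 - beta1) * g
    v = beta2 * v + (1 - beta2) * g * g
    hat_m = m / (1 - beta1 ** t)
    hat_v = v / (1 - beta2 ** t)
    param = param - lr0 * hat_m / np.sqrt(hat_v + eps)
    return param, m, v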
def main():
    # step 1: get the data and define all the usual variables
    X, Y = get_normalized_data()

    ann = ANN([500, 300], [0.8, 0.5, 0.5])
    ann.fit(X, Y)
from keras.models import Sequential
from keras.layers import Dense, Activation
from util import get_normalized_data, y2indicator
import matplotlib.pyplot as plt

Xtrain, Xtest, Ytrain, Ytest = get_normalized_data()

N, D = Xtrain.shape
K = len(set(Ytrain))

Ytrain = y2indicator(Ytrain)
Ytest = y2indicator(Ytest)

model = Sequential()
model.add(Dense(units=500, input_dim=D))
model.add(Activation('relu'))
model.add(Dense(units=300))
model.add(Activation('relu'))
model.add(Dense(units=K))
model.add(Activation('softmax'))

model.compile(
    loss='categorical_crossentropy',
    optimizer='adam',
    metrics=['accuracy']
)

# the original call was truncated after epochs=15; batch_size=32 is an assumption
r = model.fit(Xtrain, Ytrain, validation_data=(Xtest, Ytest), epochs=15, batch_size=32)
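matplotlib is imported above but never used in the truncated snippet; presumably the History object returned by fit() was plotted. A sketch using Keras's standard history keys:

# plot the training curves stored in the History object
plt.plot(r.history['loss'], label='loss')
plt.plot(r.history['val_loss'], label='val_loss')
plt.legend()
plt.show()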
def batch_grad():
    # get the data for test and train sets
    X, Y = get_normalized_data()
    # XTrain = X[:-1000, :]
    # YTrain = Y[:-1000]
    # YTrain_ind = y2indicator(YTrain)
    # XTest = X[-1000:, :]
    # YTest = Y[-1000:]
    # YTest_ind = y2indicator(YTest)
    Y_ind = y2indicator(Y)

    batchSz = 500

    # initialize random weights
    N, D = X.shape
    K = len(set(Y))
    M = 300
    W1 = np.random.randn(D, M)
    b1 = np.random.randn(M)
    W2 = np.random.randn(M, K)
    b2 = np.random.randn(K)

    learning_rate = 0.001
    reg = 0.01
    cache_w2 = 0
    cache_b2 = 0
    cache_w1 = 0
    cache_b1 = 0
    decay_rate = 0.999
    eps = 1e-9

    no_batches = N // batchSz
    print("No. of batches:", no_batches)

    for i in range(300):
        for n in range(no_batches):
            # get current batch
            XBatch = X[n*batchSz:(n*batchSz + batchSz), :]
            YBatch_ind = Y_ind[n*batchSz:(n*batchSz + batchSz), :]

            # forward prop
            pY, Z = forward_relu(XBatch, W1, b1, W2, b2)

            # backprop with RMSprop caches
            gW2 = derivative_w2(pY, YBatch_ind, Z) + reg * W2
            cache_w2 = decay_rate * cache_w2 + (1 - decay_rate) * gW2 * gW2
            W2 += learning_rate * gW2 / (np.sqrt(cache_w2) + eps)

            gb2 = derivative_b2(pY, YBatch_ind) + reg * b2
            cache_b2 = decay_rate * cache_b2 + (1 - decay_rate) * gb2 * gb2
            b2 += learning_rate * gb2 / (np.sqrt(cache_b2) + eps)

            gW1 = derivative_w1(pY, YBatch_ind, W2, Z, XBatch) + reg * W1
            cache_w1 = decay_rate * cache_w1 + (1 - decay_rate) * gW1 * gW1
            W1 += learning_rate * gW1 / (np.sqrt(cache_w1) + eps)

            gb1 = derivative_b1(pY, YBatch_ind, W2, Z) + reg * b1
            cache_b1 = decay_rate * cache_b1 + (1 - decay_rate) * gb1 * gb1
            b1 += learning_rate * gb1 / (np.sqrt(cache_b1) + eps)

            if n % 100 == 0:
                YBatch = Y[n*batchSz:n*batchSz + batchSz]
                P = np.argmax(pY, axis=1)
                er = error_rate(P, YBatch)
                c = cost(YBatch_ind, pY)
                print("Loop:", i, n, "Error rate:", er, "Cost:", c)

    pY, Z = forward_relu(X, W1, b1, W2, b2)
    p = np.argmax(pY, axis=1)
    print("Final training error rate:", error_rate(p, Y))

    # write Kaggle-style predictions for the test set
    XTest = get_test_data()
    pY, ZTest = forward_relu(XTest, W1, b1, W2, b2)
    YTest = np.argmax(pY, axis=1)

    f = open("test_rms.csv", "w")
    f.write("ImageId,Label\n")
    n = YTest.shape[0]
    for i in range(n):
        f.write(str(i + 1) + "," + str(YTest[i]) + "\n")
    f.close()
from keras.models import Sequential
from keras.layers import Dense, Activation
from util import get_normalized_data, y2indicator
import matplotlib.pyplot as plt

# NOTE: do NOT name your file keras.py because it will conflict
# with importing keras
# installation is easy! just the usual "sudo pip(3) install keras"

# get the data, same as Theano + Tensorflow examples
# no need to split now, the fit() function will do it
Xtrain, Xtest, Ytrain, Ytest = get_normalized_data()

# get shapes
N, D = Xtrain.shape
K = len(set(Ytrain))

# by default Keras wants one-hot encoded labels
# there's another cost function we can use
# where we can just pass in the integer labels directly
# just like Tensorflow / Theano
Ytrain = y2indicator(Ytrain)
Ytest = y2indicator(Ytest)

# the model will be a sequence of layers
model = Sequential()
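This file stops right after constructing the Sequential object. A sketch of how it plausibly continues, mirroring the layer stack and compile settings of the earlier Keras example (the epochs and batch_size values are assumptions):

# mirror the architecture used in the earlier Keras example
model.add(Dense(units=500, input_dim=D))
model.add(Activation('relu'))
model.add(Dense(units=300))
model.add(Activation('relu'))
model.add(Dense(units=K))
model.add(Activation('softmax'))

model.compile(
    loss='categorical_crossentropy',
    optimizer='adam',
    metrics=['accuracy']
)
r = model.fit(Xtrain, Ytrain, validation_data=(Xtest, Ytest), epochs=15, batch_size=32)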
def main():
    # step 1: get the data and define all the usual variables
    X, Y = get_normalized_data()

    max_iter = 15
    print_period = 10

    lr = 0.00004
    mu = 0.9

    Xtrain = X[:-1000,]
    Ytrain = Y[:-1000]
    Xtest = X[-1000:,]
    Ytest = Y[-1000:]
    Ytrain_ind = y2indicator(Ytrain)
    Ytest_ind = y2indicator(Ytest)

    N, D = Xtrain.shape
    batch_sz = 500
    n_batches = N // batch_sz

    # add an extra layer just for fun
    M1 = 300
    M2 = 100
    K = 10
    W1_init = np.random.randn(D, M1) / 28
    b1_init = np.zeros(M1)
    W2_init = np.random.randn(M1, M2) / np.sqrt(M1)
    b2_init = np.zeros(M2)
    W3_init = np.random.randn(M2, K) / np.sqrt(M2)
    b3_init = np.zeros(K)

    # define variables and expressions
    X = tf.placeholder(tf.float32, shape=(None, D), name='X')
    T = tf.placeholder(tf.float32, shape=(None, K), name='T')
    W1 = tf.Variable(W1_init.astype(np.float32))
    b1 = tf.Variable(b1_init.astype(np.float32))
    W2 = tf.Variable(W2_init.astype(np.float32))
    b2 = tf.Variable(b2_init.astype(np.float32))
    W3 = tf.Variable(W3_init.astype(np.float32))
    b3 = tf.Variable(b3_init.astype(np.float32))

    # define the model
    Z1 = tf.nn.relu(tf.matmul(X, W1) + b1)
    Z2 = tf.nn.relu(tf.matmul(Z1, W2) + b2)
    Yish = tf.matmul(Z2, W3) + b3

    # remember, the cost function does the softmaxing! weird, right?
    # softmax_cross_entropy_with_logits takes in the "logits";
    # if you wanted to know the actual output of the neural net,
    # you could pass "Yish" into tf.nn.softmax(logits)
    cost = tf.reduce_sum(tf.nn.softmax_cross_entropy_with_logits(logits=Yish, labels=T))

    # we choose the optimizer but don't implement the algorithm ourselves.
    # let's go with RMSprop, since we just learned about it.
    # it includes momentum!
    train_op = tf.train.RMSPropOptimizer(lr, decay=0.99, momentum=0.9).minimize(cost)

    # we'll use this to calculate the error rate
    predict_op = tf.argmax(Yish, 1)

    LL = []
    init = tf.global_variables_initializer()
    with tf.Session() as session:
        session.run(init)

        for i in range(max_iter):
            for j in range(n_batches):
                Xbatch = Xtrain[j*batch_sz:(j*batch_sz + batch_sz),]
                Ybatch = Ytrain_ind[j*batch_sz:(j*batch_sz + batch_sz),]

                session.run(train_op, feed_dict={X: Xbatch, T: Ybatch})
                if j % print_period == 0:
                    test_cost = session.run(cost, feed_dict={X: Xtest, T: Ytest_ind})
                    prediction = session.run(predict_op, feed_dict={X: Xtest})
                    err = error_rate(prediction, Ytest)
                    print("Cost / err at iteration i=%d, j=%d: %.6f / %.3f" % (i, j, test_cost, err))
                    LL.append(test_cost)

    plt.plot(LL)
    plt.show()
def main():
    # compare 3 scenarios:
    # 1. batch SGD
    # 2. batch SGD with momentum
    # 3. batch SGD with Nesterov momentum
    max_iter = 20  # make it 30 for sigmoid
    print_period = 10

    X, Y = get_normalized_data()
    lr = 0.00004
    reg = 0.01

    Xtrain = X[:-1000,]
    Ytrain = Y[:-1000]
    Xtest = X[-1000:,]
    Ytest = Y[-1000:]
    Ytrain_ind = y2indicator(Ytrain)
    Ytest_ind = y2indicator(Ytest)

    N, D = Xtrain.shape
    batch_sz = 500
    n_batches = N // batch_sz

    M = 300
    K = 10
    W1 = np.random.randn(D, M) / 28
    b1 = np.zeros(M)
    W2 = np.random.randn(M, K) / np.sqrt(M)
    b2 = np.zeros(K)

    # 1. batch
    # cost = -16
    LL_batch = []
    CR_batch = []
    for i in range(max_iter):
        for j in range(n_batches):
            Xbatch = Xtrain[j*batch_sz:(j*batch_sz + batch_sz),]
            Ybatch = Ytrain_ind[j*batch_sz:(j*batch_sz + batch_sz),]
            pYbatch, Z = forward(Xbatch, W1, b1, W2, b2)
            # print("first batch cost:", cost(pYbatch, Ybatch))

            # updates
            W2 -= lr*(derivative_w2(Z, Ybatch, pYbatch) + reg*W2)
            b2 -= lr*(derivative_b2(Ybatch, pYbatch) + reg*b2)
            W1 -= lr*(derivative_w1(Xbatch, Z, Ybatch, pYbatch, W2) + reg*W1)
            b1 -= lr*(derivative_b1(Z, Ybatch, pYbatch, W2) + reg*b1)

            if j % print_period == 0:
                # calculate just for LL
                pY, _ = forward(Xtest, W1, b1, W2, b2)
                ll = cost(pY, Ytest_ind)
                LL_batch.append(ll)
                print("Cost at iteration i=%d, j=%d: %.6f" % (i, j, ll))
                err = error_rate(pY, Ytest)
                CR_batch.append(err)
                print("Error rate:", err)
    pY, _ = forward(Xtest, W1, b1, W2, b2)
    print("Final error rate:", error_rate(pY, Ytest))

    # 2. batch with momentum
    W1 = np.random.randn(D, M) / 28
    b1 = np.zeros(M)
    W2 = np.random.randn(M, K) / np.sqrt(M)
    b2 = np.zeros(K)
    LL_momentum = []
    CR_momentum = []
    mu = 0.9
    dW2 = 0
    db2 = 0
    dW1 = 0
    db1 = 0
    for i in range(max_iter):
        for j in range(n_batches):
            Xbatch = Xtrain[j*batch_sz:(j*batch_sz + batch_sz),]
            Ybatch = Ytrain_ind[j*batch_sz:(j*batch_sz + batch_sz),]
            pYbatch, Z = forward(Xbatch, W1, b1, W2, b2)

            # updates
            dW2 = mu*dW2 - lr*(derivative_w2(Z, Ybatch, pYbatch) + reg*W2)
            W2 += dW2
            db2 = mu*db2 - lr*(derivative_b2(Ybatch, pYbatch) + reg*b2)
            b2 += db2
            dW1 = mu*dW1 - lr*(derivative_w1(Xbatch, Z, Ybatch, pYbatch, W2) + reg*W1)
            W1 += dW1
            db1 = mu*db1 - lr*(derivative_b1(Z, Ybatch, pYbatch, W2) + reg*b1)
            b1 += db1

            if j % print_period == 0:
                # calculate just for LL
                pY, _ = forward(Xtest, W1, b1, W2, b2)
                ll = cost(pY, Ytest_ind)
                LL_momentum.append(ll)
                print("Cost at iteration i=%d, j=%d: %.6f" % (i, j, ll))
                err = error_rate(pY, Ytest)
                CR_momentum.append(err)
                print("Error rate:", err)
    pY, _ = forward(Xtest, W1, b1, W2, b2)
    print("Final error rate:", error_rate(pY, Ytest))

    # 3. batch with Nesterov momentum
    W1 = np.random.randn(D, M) / 28
    b1 = np.zeros(M)
    W2 = np.random.randn(M, K) / np.sqrt(M)
    b2 = np.zeros(K)
    LL_nest = []
    CR_nest = []
    mu = 0.9
    dW2 = 0
    db2 = 0
    dW1 = 0
    db1 = 0
    for i in range(max_iter):
        for j in range(n_batches):
            Xbatch = Xtrain[j*batch_sz:(j*batch_sz + batch_sz),]
            Ybatch = Ytrain_ind[j*batch_sz:(j*batch_sz + batch_sz),]
            pYbatch, Z = forward(Xbatch, W1, b1, W2, b2)

            # updates
            dW2 = mu*mu*dW2 - (1 + mu)*lr*(derivative_w2(Z, Ybatch, pYbatch) + reg*W2)
            W2 += dW2
            db2 = mu*mu*db2 - (1 + mu)*lr*(derivative_b2(Ybatch, pYbatch) + reg*b2)
            b2 += db2
            dW1 = mu*mu*dW1 - (1 + mu)*lr*(derivative_w1(Xbatch, Z, Ybatch, pYbatch, W2) + reg*W1)
            W1 += dW1
            db1 = mu*mu*db1 - (1 + mu)*lr*(derivative_b1(Z, Ybatch, pYbatch, W2) + reg*b1)
            b1 += db1

            if j % print_period == 0:
                # calculate just for LL
                pY, _ = forward(Xtest, W1, b1, W2, b2)
                ll = cost(pY, Ytest_ind)
                LL_nest.append(ll)
                print("Cost at iteration i=%d, j=%d: %.6f" % (i, j, ll))
                err = error_rate(pY, Ytest)
                CR_nest.append(err)
                print("Error rate:", err)
    pY, _ = forward(Xtest, W1, b1, W2, b2)
    print("Final error rate:", error_rate(pY, Ytest))

    plt.plot(LL_batch, label="batch")
    plt.plot(LL_momentum, label="momentum")
    plt.plot(LL_nest, label="nesterov")
    plt.legend()
    plt.show()
def main():
    # compare 3 scenarios:
    # 1. batch SGD
    # 2. batch SGD with momentum
    # 3. batch SGD with Nesterov momentum
    max_iter = 20  # make it 30 for sigmoid
    print_period = 50

    Xtrain, Xtest, Ytrain, Ytest = get_normalized_data()
    lr = 0.00004
    reg = 0.01

    Ytrain_ind = y2indicator(Ytrain)
    Ytest_ind = y2indicator(Ytest)

    N, D = Xtrain.shape
    batch_sz = 500
    n_batches = N // batch_sz

    M = 300
    K = 10
    W1 = np.random.randn(D, M) / np.sqrt(D)
    b1 = np.zeros(M)
    W2 = np.random.randn(M, K) / np.sqrt(M)
    b2 = np.zeros(K)

    # save initial weights
    W1_0 = W1.copy()
    b1_0 = b1.copy()
    W2_0 = W2.copy()
    b2_0 = b2.copy()

    # 1. batch
    losses_batch = []
    errors_batch = []
    for i in range(max_iter):
        for j in range(n_batches):
            Xbatch = Xtrain[j*batch_sz:(j*batch_sz + batch_sz),]
            Ybatch = Ytrain_ind[j*batch_sz:(j*batch_sz + batch_sz),]
            pYbatch, Z = forward(Xbatch, W1, b1, W2, b2)
            # print("first batch cost:", cost(pYbatch, Ybatch))

            # updates
            W2 -= lr*(derivative_w2(Z, Ybatch, pYbatch) + reg*W2)
            b2 -= lr*(derivative_b2(Ybatch, pYbatch) + reg*b2)
            W1 -= lr*(derivative_w1(Xbatch, Z, Ybatch, pYbatch, W2) + reg*W1)
            b1 -= lr*(derivative_b1(Z, Ybatch, pYbatch, W2) + reg*b1)

            if j % print_period == 0:
                pY, _ = forward(Xtest, W1, b1, W2, b2)
                l = cost(pY, Ytest_ind)
                losses_batch.append(l)
                print("Cost at iteration i=%d, j=%d: %.6f" % (i, j, l))
                e = error_rate(pY, Ytest)
                errors_batch.append(e)
                print("Error rate:", e)
    pY, _ = forward(Xtest, W1, b1, W2, b2)
    print("Final error rate:", error_rate(pY, Ytest))

    # 2. batch with momentum
    W1 = W1_0.copy()
    b1 = b1_0.copy()
    W2 = W2_0.copy()
    b2 = b2_0.copy()
    losses_momentum = []
    errors_momentum = []
    mu = 0.9
    dW2 = 0
    db2 = 0
    dW1 = 0
    db1 = 0
    for i in range(max_iter):
        for j in range(n_batches):
            Xbatch = Xtrain[j*batch_sz:(j*batch_sz + batch_sz),]
            Ybatch = Ytrain_ind[j*batch_sz:(j*batch_sz + batch_sz),]
            pYbatch, Z = forward(Xbatch, W1, b1, W2, b2)

            # gradients
            gW2 = derivative_w2(Z, Ybatch, pYbatch) + reg*W2
            gb2 = derivative_b2(Ybatch, pYbatch) + reg*b2
            gW1 = derivative_w1(Xbatch, Z, Ybatch, pYbatch, W2) + reg*W1
            gb1 = derivative_b1(Z, Ybatch, pYbatch, W2) + reg*b1

            # update velocities
            dW2 = mu*dW2 - lr*gW2
            db2 = mu*db2 - lr*gb2
            dW1 = mu*dW1 - lr*gW1
            db1 = mu*db1 - lr*gb1

            # updates
            W2 += dW2
            b2 += db2
            W1 += dW1
            b1 += db1

            if j % print_period == 0:
                pY, _ = forward(Xtest, W1, b1, W2, b2)
                l = cost(pY, Ytest_ind)
                losses_momentum.append(l)
                print("Cost at iteration i=%d, j=%d: %.6f" % (i, j, l))
                e = error_rate(pY, Ytest)
                errors_momentum.append(e)
                print("Error rate:", e)
    pY, _ = forward(Xtest, W1, b1, W2, b2)
    print("Final error rate:", error_rate(pY, Ytest))

    # 3. batch with Nesterov momentum
    W1 = W1_0.copy()
    b1 = b1_0.copy()
    W2 = W2_0.copy()
    b2 = b2_0.copy()
    losses_nesterov = []
    errors_nesterov = []
    mu = 0.9
    vW2 = 0
    vb2 = 0
    vW1 = 0
    vb1 = 0
    for i in range(max_iter):
        for j in range(n_batches):
            Xbatch = Xtrain[j*batch_sz:(j*batch_sz + batch_sz),]
            Ybatch = Ytrain_ind[j*batch_sz:(j*batch_sz + batch_sz),]
            pYbatch, Z = forward(Xbatch, W1, b1, W2, b2)

            # gradients
            gW2 = derivative_w2(Z, Ybatch, pYbatch) + reg*W2
            gb2 = derivative_b2(Ybatch, pYbatch) + reg*b2
            gW1 = derivative_w1(Xbatch, Z, Ybatch, pYbatch, W2) + reg*W1
            gb1 = derivative_b1(Z, Ybatch, pYbatch, W2) + reg*b1

            # v update
            vW2 = mu*vW2 - lr*gW2
            vb2 = mu*vb2 - lr*gb2
            vW1 = mu*vW1 - lr*gW1
            vb1 = mu*vb1 - lr*gb1

            # param update (lookahead form of Nesterov)
            W2 += mu*vW2 - lr*gW2
            b2 += mu*vb2 - lr*gb2
            W1 += mu*vW1 - lr*gW1
            b1 += mu*vb1 - lr*gb1

            if j % print_period == 0:
                pY, _ = forward(Xtest, W1, b1, W2, b2)
                l = cost(pY, Ytest_ind)
                losses_nesterov.append(l)
                print("Cost at iteration i=%d, j=%d: %.6f" % (i, j, l))
                e = error_rate(pY, Ytest)
                errors_nesterov.append(e)
                print("Error rate:", e)
    pY, _ = forward(Xtest, W1, b1, W2, b2)
    print("Final error rate:", error_rate(pY, Ytest))

    plt.plot(losses_batch, label="batch")
    plt.plot(losses_momentum, label="momentum")
    plt.plot(losses_nesterov, label="nesterov")
    plt.legend()
    plt.show()
def main():
    X, Y = get_normalized_data()

    max_iter = 20
    print_period = 10
    lr = 0.00004
    reg = 0.01

    Xtrain = X[:-1000,]
    Ytrain = Y[:-1000]
    Xtest = X[-1000:,]
    Ytest = Y[-1000:]
    Ytrain_ind = y2indicator(Ytrain)
    Ytest_ind = y2indicator(Ytest)

    N, D = Xtrain.shape
    batch_sz = 500
    n_batches = N // batch_sz

    M = 300
    K = 10
    W1_init = np.random.randn(D, M) / 28
    b1_init = np.zeros(M)
    W2_init = np.random.randn(M, K) / np.sqrt(M)
    b2_init = np.zeros(K)

    thX = T.matrix('X')
    thT = T.matrix('T')
    W1 = theano.shared(W1_init, 'W1')
    b1 = theano.shared(b1_init, 'b1')
    W2 = theano.shared(W2_init, 'W2')
    b2 = theano.shared(b2_init, 'b2')

    thZ = relu(thX.dot(W1) + b1)
    thY = T.nnet.softmax(thZ.dot(W2) + b2)

    cost = -(thT * T.log(thY)).sum() + reg*((W1*W1).sum() + (b1*b1).sum() + (W2*W2).sum() + (b2*b2).sum())
    prediction = T.argmax(thY, axis=1)

    update_W1 = W1 - lr*T.grad(cost, W1)
    update_b1 = b1 - lr*T.grad(cost, b1)
    update_W2 = W2 - lr*T.grad(cost, W2)
    update_b2 = b2 - lr*T.grad(cost, b2)

    train = theano.function(
        inputs=[thX, thT],
        updates=[(W1, update_W1), (b1, update_b1), (W2, update_W2), (b2, update_b2)],
    )
    get_prediction = theano.function(
        inputs=[thX, thT],
        outputs=[cost, prediction],
    )

    LL = []
    for i in range(max_iter):
        for j in range(n_batches):
            Xbatch = Xtrain[j*batch_sz:(j*batch_sz + batch_sz),]
            Ybatch = Ytrain_ind[j*batch_sz:(j*batch_sz + batch_sz),]

            train(Xbatch, Ybatch)
            if j % print_period == 0:
                cost_val, prediction_val = get_prediction(Xtest, Ytest_ind)
                err = error_rate(prediction_val, Ytest)
                print("Cost / err at iteration i=%d, j=%d: %.3f / %.3f" % (i, j, cost_val, err))
                LL.append(cost_val)

    plt.plot(LL)
    plt.show()
def main():
    # step 1: get the data and define all the usual variables
    Xtrain, Xtest, Ytrain, Ytest = get_normalized_data()

    max_iter = 20
    print_period = 10

    lr = 0.0004
    reg = 0.01

    Xtrain = Xtrain.astype(np.float32)
    Ytrain = Ytrain.astype(np.float32)
    Ytrain_ind = y2indicator(Ytrain).astype(np.float32)
    Ytest_ind = y2indicator(Ytest).astype(np.float32)

    N, D = Xtrain.shape
    batch_sz = 500
    n_batches = N // batch_sz

    M = 300
    K = 10
    W1_init = np.random.randn(D, M) / 28
    b1_init = np.zeros(M)
    W2_init = np.random.randn(M, K) / np.sqrt(M)
    b2_init = np.zeros(K)

    # step 2: define theano variables and expressions
    thX = T.matrix('X')
    thT = T.matrix('T')
    W1 = theano.shared(W1_init, 'W1')
    b1 = theano.shared(b1_init, 'b1')
    W2 = theano.shared(W2_init, 'W2')
    b2 = theano.shared(b2_init, 'b2')

    # we can use the built-in theano functions to do relu and softmax
    thZ = relu(thX.dot(W1) + b1)  # relu is new in version 0.7.1, but just in case you don't have it
    thY = T.nnet.softmax(thZ.dot(W2) + b2)

    # define the cost function and prediction
    cost = -(thT * T.log(thY)).sum() + reg*((W1*W1).sum() + (b1*b1).sum() + (W2*W2).sum() + (b2*b2).sum())
    prediction = T.argmax(thY, axis=1)

    # step 3: training expressions and functions
    # we can just include regularization as part of the cost,
    # because it is also automatically differentiated!
    update_W1 = W1 - lr*T.grad(cost, W1)
    update_b1 = b1 - lr*T.grad(cost, b1)
    update_W2 = W2 - lr*T.grad(cost, W2)
    update_b2 = b2 - lr*T.grad(cost, b2)

    train = theano.function(
        inputs=[thX, thT],
        updates=[(W1, update_W1), (b1, update_b1), (W2, update_W2), (b2, update_b2)],
    )

    # create another function for this because we want it over the whole dataset
    get_prediction = theano.function(
        inputs=[thX, thT],
        outputs=[cost, prediction],
    )

    costs = []
    for i in range(max_iter):
        for j in range(n_batches):
            Xbatch = Xtrain[j*batch_sz:(j*batch_sz + batch_sz),]
            Ybatch = Ytrain_ind[j*batch_sz:(j*batch_sz + batch_sz),]

            train(Xbatch, Ybatch)
            if j % print_period == 0:
                cost_val, prediction_val = get_prediction(Xtest, Ytest_ind)
                err = error_rate(prediction_val, Ytest)
                print("Cost / err at iteration i=%d, j=%d: %.3f / %.3f" % (i, j, cost_val, err))
                costs.append(cost_val)

    plt.plot(costs)
    plt.show()
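Both Theano scripts above call relu(), and the comment notes it only shipped with Theano 0.7.1; the fallback it alludes to is a one-liner. A minimal sketch:

def relu(a):
    # elementwise max(a, 0), for Theano versions without T.nnet.relu
    return a * (a > 0)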