def main():
    """Load the data set and prepare shuffled training inputs and labels.

    Only the training split is processed here; nothing is returned.
    ``get_data``, ``rearrange`` and ``shuffle`` are expected to be defined
    elsewhere in this file or its imports.
    """
    # step 1: load the data, transform as needed
    train, test = get_data()

    # The labels are an N x 1 matrix with values 1..10 (MATLAB indexes by 1),
    # so flatten them and shift to 0..9.
    labels = train['y'].flatten() - 1

    # Pixel values must be scaled rather than left as 0..255; presumably
    # ``rearrange`` takes care of that -- confirm against its definition.
    images = rearrange(train['X'])

    # The raw dictionary is no longer needed once both arrays are extracted.
    del train

    images, labels = shuffle(images, labels)
def main():
    """Train a small convolutional network with TensorFlow and plot test cost.

    Loads the train/test dictionaries from ``get_data``, shifts the labels
    from 1..10 to 0..9, builds two ``convpool`` layers followed by a hidden
    ReLU layer and a linear output layer, and trains with RMSProp on
    fixed-size mini-batches using one-hot (indicator) targets.  Every
    ``print_period`` batches the summed test cross-entropy and the test error
    rate are printed; the recorded costs are plotted when training finishes.

    ``get_data``, ``rearrange``, ``shuffle``, ``y2indicator``,
    ``init_filter``, ``convpool`` and ``error_rate`` are expected to be
    defined elsewhere in this file or its imports.
    """
    train, test = get_data()

    # Y is an N x 1 matrix with values 1..10 (MATLAB indexes by 1), so
    # flatten it and make it 0..9.  The indicator matrices are needed for the
    # cost calculation.
    Xtrain = rearrange(train['X'])
    Ytrain = train['y'].flatten() - 1
    del train
    Xtrain, Ytrain = shuffle(Xtrain, Ytrain)
    Ytrain_ind = y2indicator(Ytrain)

    Xtest = rearrange(test['X'])
    Ytest = test['y'].flatten() - 1
    del test
    Ytest_ind = y2indicator(Ytest)

    # Gradient descent parameters
    max_iter = 6
    print_period = 10
    batch_sz = 500

    # The placeholders below have a fixed batch dimension, so every batch
    # must be full.  Drop the ragged tail of both splits; deriving the cut
    # from batch_sz (instead of hard-coding sample counts) keeps this correct
    # for any data set size or batch size.
    n_batches = Xtrain.shape[0] // batch_sz
    n_test_batches = Xtest.shape[0] // batch_sz
    n_train = n_batches * batch_sz
    n_test = n_test_batches * batch_sz
    Xtrain = Xtrain[:n_train]
    Ytrain = Ytrain[:n_train]
    Ytrain_ind = Ytrain_ind[:n_train]
    Xtest = Xtest[:n_test]
    Ytest = Ytest[:n_test]
    Ytest_ind = Ytest_ind[:n_test]

    # Initial weights
    M = 500
    K = 10
    poolsz = (2, 2)

    # W1_shape = (filter_width, filter_height,
    #             num_color_channels, num_feature_maps)
    W1_shape = (5, 5, 3, 20)
    W1_init = init_filter(W1_shape, poolsz)
    # One bias per output feature map
    b1_init = np.zeros(W1_shape[-1], dtype=np.float32)

    # W2_shape = (filter_width, filter_height,
    #             old_num_feature_maps, num_feature_maps)
    W2_shape = (5, 5, 20, 50)
    W2_init = init_filter(W2_shape, poolsz)
    b2_init = np.zeros(W2_shape[-1], dtype=np.float32)

    # Vanilla ANN weights.  The 8 * 8 factor must match the spatial size of
    # the second convpool output -- TODO confirm against convpool.
    flat_sz = W2_shape[-1] * 8 * 8
    W3_init = np.random.randn(flat_sz, M) / np.sqrt(flat_sz + M)
    b3_init = np.zeros(M, dtype=np.float32)
    W4_init = np.random.randn(M, K) / np.sqrt(M + K)
    b4_init = np.zeros(K, dtype=np.float32)

    # Define variables and expressions
    # Using None as the first shape element takes up too much RAM
    X = tf.placeholder(tf.float32, shape=(batch_sz, 32, 32, 3), name='X')
    T = tf.placeholder(tf.float32, shape=(batch_sz, K), name='T')
    W1 = tf.Variable(W1_init.astype(np.float32))
    b1 = tf.Variable(b1_init.astype(np.float32))
    W2 = tf.Variable(W2_init.astype(np.float32))
    b2 = tf.Variable(b2_init.astype(np.float32))
    W3 = tf.Variable(W3_init.astype(np.float32))
    b3 = tf.Variable(b3_init.astype(np.float32))
    W4 = tf.Variable(W4_init.astype(np.float32))
    b4 = tf.Variable(b4_init.astype(np.float32))

    Z1 = convpool(X, W1, b1)
    Z2 = convpool(Z1, W2, b2)
    # Flatten each example's feature maps into one row for the dense layer.
    Z2_shape = Z2.get_shape().as_list()
    Z2r = tf.reshape(Z2, [Z2_shape[0], np.prod(Z2_shape[1:])])
    Z3 = tf.nn.relu(tf.matmul(Z2r, W3) + b3)
    Yish = tf.matmul(Z3, W4) + b4  # logits

    cost = tf.reduce_sum(
        tf.nn.softmax_cross_entropy_with_logits(logits=Yish, labels=T))

    train_op = tf.train.RMSPropOptimizer(
        0.0001, decay=0.99, momentum=0.9).minimize(cost)

    # Use this to calculate the error rate
    predict_op = tf.argmax(Yish, 1)

    t0 = datetime.now()
    LL = []
    init = tf.global_variables_initializer()
    with tf.Session() as session:
        session.run(init)

        for i in range(max_iter):
            for j in range(n_batches):
                lo = j * batch_sz
                hi = lo + batch_sz
                session.run(
                    train_op,
                    feed_dict={X: Xtrain[lo:hi], T: Ytrain_ind[lo:hi]})

                if j % print_period == 0:
                    # Due to RAM limitations the input size is fixed, so the
                    # total test cost and predictions are accumulated batch
                    # by batch.
                    test_cost = 0
                    prediction = np.zeros(n_test)
                    for k in range(n_test_batches):
                        tlo = k * batch_sz
                        thi = tlo + batch_sz
                        # Fetch cost and prediction together so each test
                        # batch needs only one forward pass.
                        batch_cost, batch_pred = session.run(
                            [cost, predict_op],
                            feed_dict={X: Xtest[tlo:thi],
                                       T: Ytest_ind[tlo:thi]})
                        test_cost += batch_cost
                        prediction[tlo:thi] = batch_pred
                    err = error_rate(prediction, Ytest)
                    print("Cost / err at iteration i = %d, j = %d: %.3f / %.3f"
                          % (i, j, test_cost, err))
                    LL.append(test_cost)

    print("Elapsed time:", (datetime.now() - t0))
    plt.plot(LL)
    plt.show()
def main():
    """Train a small convolutional network with Theano and plot the results.

    Loads the train/test dictionaries from ``get_data``, shifts the labels
    from 1..10 to 0..9, builds two ``convpool`` layers followed by a hidden
    ReLU layer and a softmax output, and trains with momentum gradient
    descent on an L2-regularised cross-entropy cost using one-hot targets.
    Every ``print_period`` batches the cost and error rate on the whole test
    set are printed.  After training the cost history and the learned
    first- and second-layer filters are plotted.

    ``get_data``, ``rearrange``, ``shuffle``, ``y2indicator``,
    ``init_filter``, ``convpool``, ``relu`` and ``error_rate`` are expected
    to be defined elsewhere in this file or its imports.
    """
    # step 1: load the data, transform as needed
    train, test = get_data()

    # Y is an N x 1 matrix with values 1..10 (MATLAB indexes by 1), so
    # flatten it and make it 0..9.  The indicator matrices are needed for the
    # cost calculation.
    Xtrain = rearrange(train['X'])
    Ytrain = train['y'].flatten() - 1
    del train
    Xtrain, Ytrain = shuffle(Xtrain, Ytrain)
    Ytrain_ind = y2indicator(Ytrain)

    Xtest = rearrange(test['X'])
    Ytest = test['y'].flatten() - 1
    del test
    Ytest_ind = y2indicator(Ytest)

    max_iter = 8
    print_period = 10

    lr = np.float32(0.00001)
    reg = np.float32(0.01)
    mu = np.float32(0.99)

    N = Xtrain.shape[0]
    batch_sz = 500
    n_batches = N // batch_sz

    M = 500
    K = 10
    poolsz = (2, 2)

    # after conv will be of dimension 32 - 5 + 1 = 28
    # after downsample 28 / 2 = 14
    # (num_feature_maps, num_color_channels, filter_width, filter_height)
    W1_shape = (20, 3, 5, 5)
    W1_init = init_filter(W1_shape, poolsz)
    # one bias per output feature map
    b1_init = np.zeros(W1_shape[0], dtype=np.float32)

    # after conv will be of dimension 14 - 5 + 1 = 10
    # after downsample 10 / 2 = 5
    # (num_feature_maps, old_num_feature_maps, filter_width, filter_height)
    W2_shape = (50, 20, 5, 5)
    W2_init = init_filter(W2_shape, poolsz)
    b2_init = np.zeros(W2_shape[0], dtype=np.float32)

    # vanilla ANN weights
    flat_sz = W2_shape[0] * 5 * 5
    W3_init = np.random.randn(flat_sz, M) / np.sqrt(flat_sz + M)
    b3_init = np.zeros(M, dtype=np.float32)
    W4_init = np.random.randn(M, K) / np.sqrt(M + K)
    b4_init = np.zeros(K, dtype=np.float32)

    # step 2: define theano variables and expressions
    X = T.tensor4('X', dtype='float32')
    Y = T.matrix('T')
    W1 = theano.shared(W1_init, 'W1')
    b1 = theano.shared(b1_init, 'b1')
    W2 = theano.shared(W2_init, 'W2')
    b2 = theano.shared(b2_init, 'b2')
    W3 = theano.shared(W3_init.astype(np.float32), 'W3')
    b3 = theano.shared(b3_init, 'b3')
    W4 = theano.shared(W4_init.astype(np.float32), 'W4')
    b4 = theano.shared(b4_init, 'b4')
    params = [W1, b1, W2, b2, W3, b3, W4, b4]

    # momentum changes: one zero-initialised velocity per parameter, named
    # 'dW1', 'db1', ... after the parameter it belongs to
    dparams = [
        theano.shared(
            np.zeros(p.get_value().shape, dtype=np.float32), 'd' + p.name)
        for p in params
    ]

    # forward pass
    Z1 = convpool(X, W1, b1)
    Z2 = convpool(Z1, W2, b2)
    Z3 = relu(Z2.flatten(ndim=2).dot(W3) + b3)
    pY = T.nnet.softmax(Z3.dot(W4) + b4)

    # define the cost function and prediction
    # The built-in sum is used because np.sum over a generator is deprecated.
    reg_cost = reg * sum((param * param).sum() for param in params)
    cost = -(Y * T.log(pY)).sum() + reg_cost
    prediction = T.argmax(pY, axis=1)

    # step 3: training expressions and functions
    # Each gradient is built once and shared by the velocity update and the
    # parameter update.  Both right-hand sides refer to the values from
    # before the step.
    grads = T.grad(cost, params)
    updates = []
    for p, dp, g in zip(params, dparams, grads):
        dp_update = mu * dp - lr * g
        updates.append((p, p + dp_update))
        updates.append((dp, dp_update))

    train = theano.function(
        inputs=[X, Y],
        updates=updates,
    )

    # create another function for this because we want it over the whole
    # dataset
    get_prediction = theano.function(
        inputs=[X, Y],
        outputs=[cost, prediction],
    )

    t0 = datetime.now()
    LL = []
    for i in range(max_iter):
        for j in range(n_batches):
            lo = j * batch_sz
            hi = lo + batch_sz
            train(Xtrain[lo:hi], Ytrain_ind[lo:hi])

            if j % print_period == 0:
                cost_val, prediction_val = get_prediction(Xtest, Ytest_ind)
                err = error_rate(prediction_val, Ytest)
                print("Cost / err at iteration i=%d, j=%d: %.3f / %.3f"
                      % (i, j, cost_val, err))
                LL.append(cost_val)
    print("Elapsed time:", (datetime.now() - t0))
    plt.plot(LL)
    plt.show()

    # Visualize W1 (20, 3, 5, 5) on an 8 x 8 grid of 5 x 5 tiles and
    # W2 (50, 20, 5, 5) on a 32 x 32 grid.  Tiles fill one column of the
    # grid top to bottom before moving on to the next column.
    for title, filters, cells in (("W1", W1.get_value(), 8),
                                  ("W2", W2.get_value(), 32)):
        grid = np.zeros((cells * 5, cells * 5))
        m = 0
        n = 0
        for i in range(filters.shape[0]):
            for j in range(filters.shape[1]):
                grid[m * 5:(m + 1) * 5, n * 5:(n + 1) * 5] = filters[i, j]
                m += 1
                if m >= cells:
                    m = 0
                    n += 1
        plt.imshow(grid, cmap='gray')
        plt.title(title)
        plt.show()
def main():
    """Train a Theano conv net on integer class labels and plot the test cost.

    The network is two ``convpool`` layers, a hidden ReLU layer and a softmax
    output.  Training uses momentum gradient descent on the mean negative
    log-likelihood, reshuffling the training data at the start of every
    epoch.  Every ``report_every`` batches the cost and error rate on the
    whole test set are printed; the cost history is plotted at the end.

    ``get_data``, ``rearrange``, ``shuffle``, ``init_filter``, ``convpool``,
    ``relu`` and ``error_rate`` are expected to be defined elsewhere in this
    file or its imports.
    """
    # step 1: load the data, transform as needed
    train, test = get_data()

    # Labels come as an N x 1 matrix holding 1..10 (MATLAB indexes by 1);
    # flatten them and shift to 0..9.
    Xtrain = rearrange(train['X'])
    Ytrain = train['y'].flatten() - 1
    del train
    Xtrain, Ytrain = shuffle(Xtrain, Ytrain)

    Xtest = rearrange(test['X'])
    Ytest = test['y'].flatten() - 1
    del test

    # hyperparameters
    epochs = 6
    report_every = 10
    learning_rate = np.float32(1e-2)
    momentum = np.float32(0.99)
    batch_size = 500
    batches_per_epoch = Xtrain.shape[0] // batch_size
    hidden_units = 500
    num_classes = 10
    pool_shape = (2, 2)

    # Filter shapes are
    # (num_feature_maps, num_input_maps, filter_width, filter_height).
    # first conv:  32 - 5 + 1 = 28, downsampled to 28 / 2 = 14
    conv1_shape = (20, 3, 5, 5)
    conv1_W0 = init_filter(conv1_shape, pool_shape)
    conv1_b0 = np.zeros(conv1_shape[0], dtype=np.float32)  # one bias per map

    # second conv: 14 - 5 + 1 = 10, downsampled to 10 / 2 = 5
    conv2_shape = (50, 20, 5, 5)
    conv2_W0 = init_filter(conv2_shape, pool_shape)
    conv2_b0 = np.zeros(conv2_shape[0], dtype=np.float32)

    # vanilla ANN weights
    flat_dim = conv2_shape[0] * 5 * 5
    hidden_W0 = np.random.randn(flat_dim, hidden_units)
    hidden_W0 = hidden_W0 / np.sqrt(flat_dim + hidden_units)
    hidden_b0 = np.zeros(hidden_units, dtype=np.float32)
    out_W0 = np.random.randn(hidden_units, num_classes)
    out_W0 = out_W0 / np.sqrt(hidden_units + num_classes)
    out_b0 = np.zeros(num_classes, dtype=np.float32)

    # step 2: define theano variables and expressions
    X = T.tensor4('X', dtype='float32')
    Y = T.ivector('T')
    W1 = theano.shared(conv1_W0, 'W1')
    b1 = theano.shared(conv1_b0, 'b1')
    W2 = theano.shared(conv2_W0, 'W2')
    b2 = theano.shared(conv2_b0, 'b2')
    W3 = theano.shared(hidden_W0.astype(np.float32), 'W3')
    b3 = theano.shared(hidden_b0, 'b3')
    W4 = theano.shared(out_W0.astype(np.float32), 'W4')
    b4 = theano.shared(out_b0, 'b4')

    # forward pass
    pooled1 = convpool(X, W1, b1)
    pooled2 = convpool(pooled1, W2, b2)
    hidden = relu(pooled2.flatten(ndim=2).dot(W3) + b3)
    pY = T.nnet.softmax(hidden.dot(W4) + b4)

    # cost: mean negative log-probability assigned to the true class
    true_class_prob = pY[T.arange(Y.shape[0]), Y]
    cost = -(T.log(true_class_prob)).mean()
    prediction = T.argmax(pY, axis=1)

    # step 3: training expressions and functions
    params = [W1, b1, W2, b2, W3, b3, W4, b4]

    # momentum changes: one zero-initialised velocity per parameter
    velocities = []
    for p in params:
        zeros = np.zeros_like(p.get_value(), dtype=np.float32)
        velocities.append(theano.shared(zeros))

    updates = []
    gradients = T.grad(cost, params)
    for p, v, g in zip(params, velocities, gradients):
        new_v = momentum * v - learning_rate * g
        updates.append((v, new_v))
        updates.append((p, p + new_v))

    train_step = theano.function(
        inputs=[X, Y],
        updates=updates,
    )

    # a second function, because cost and prediction are wanted over the
    # whole dataset without applying any updates
    get_prediction = theano.function(
        inputs=[X, Y],
        outputs=[cost, prediction],
    )

    started = datetime.now()
    costs = []
    for epoch in range(epochs):
        Xtrain, Ytrain = shuffle(Xtrain, Ytrain)
        for batch in range(batches_per_epoch):
            start = batch * batch_size
            stop = start + batch_size
            train_step(Xtrain[start:stop], Ytrain[start:stop])

            if batch % report_every != 0:
                continue
            cost_val, prediction_val = get_prediction(Xtest, Ytest)
            err = error_rate(prediction_val, Ytest)
            print("Cost / err at iteration i=%d, j=%d: %.3f / %.3f"
                  % (epoch, batch, cost_val, err))
            costs.append(cost_val)
    print("Elapsed time:", (datetime.now() - started))
    plt.plot(costs)
    plt.show()
def main():
    """Train a TensorFlow conv net on integer labels and plot cost and filters.

    Loads the train/test dictionaries from ``get_data``, shifts the labels
    from 1..10 to 0..9, builds two ``convpool`` layers followed by a hidden
    ReLU layer and a linear output layer, and trains with RMSProp on
    fixed-size mini-batches using sparse (integer) targets.  Every
    ``print_period`` batches the summed test cross-entropy and the test error
    rate are printed.  After training the cost history and the learned
    first- and second-layer filters are plotted.

    ``get_data``, ``rearrange``, ``shuffle``, ``init_filter``, ``convpool``
    and ``error_rate`` are expected to be defined elsewhere in this file or
    its imports.
    """
    train, test = get_data()

    # Y is an N x 1 matrix with values 1..10 (MATLAB indexes by 1), so
    # flatten it and make it 0..9.
    Xtrain = rearrange(train['X'])
    Ytrain = train['y'].flatten() - 1
    del train
    Xtrain, Ytrain = shuffle(Xtrain, Ytrain)

    Xtest = rearrange(test['X'])
    Ytest = test['y'].flatten() - 1
    del test

    # gradient descent params
    max_iter = 6  # epochs
    print_period = 10
    batch_sz = 500

    # The placeholders below have a fixed batch dimension, so every batch
    # must be full.  Drop the ragged tail of both splits; deriving the cut
    # from batch_sz (instead of hard-coding sample counts) keeps this correct
    # for any data set size or batch size.
    n_batches = Xtrain.shape[0] // batch_sz
    n_test_batches = Xtest.shape[0] // batch_sz
    n_train = n_batches * batch_sz
    n_test = n_test_batches * batch_sz
    Xtrain = Xtrain[:n_train]
    Ytrain = Ytrain[:n_train]
    Xtest = Xtest[:n_test]
    Ytest = Ytest[:n_test]

    # initial weights
    M = 500  # hidden units of ANN
    K = 10  # number of classes
    poolsz = (2, 2)

    # (filter_width, filter_height, num_color_channels, num_feature_maps)
    W1_shape = (5, 5, 3, 20)
    W1_init = init_filter(W1_shape, poolsz)
    # one bias per output feature map -- 20 biases
    b1_init = np.zeros(W1_shape[-1], dtype=np.float32)

    # (filter_width, filter_height, num_feature_maps_in, num_feature_maps_out)
    W2_shape = (5, 5, 20, 50)
    W2_init = init_filter(W2_shape, poolsz)
    b2_init = np.zeros(W2_shape[-1], dtype=np.float32)  # 50 biases

    # vanilla ANN weights.  The 8 * 8 factor must match the spatial size of
    # the final feature maps -- TODO confirm against convpool.
    flat_sz = W2_shape[-1] * 8 * 8
    W3_init = np.random.randn(flat_sz, M) / np.sqrt(flat_sz + M)
    b3_init = np.zeros(M, dtype=np.float32)
    W4_init = np.random.randn(M, K) / np.sqrt(M + K)
    b4_init = np.zeros(K, dtype=np.float32)

    # define variables and expressions
    # using None as the first shape element takes up too much RAM
    X = tf.placeholder(tf.float32, shape=(batch_sz, 32, 32, 3), name='X')
    T = tf.placeholder(tf.int32, shape=(batch_sz, ), name='T')
    W1 = tf.Variable(W1_init.astype(np.float32))
    b1 = tf.Variable(b1_init.astype(np.float32))
    W2 = tf.Variable(W2_init.astype(np.float32))
    b2 = tf.Variable(b2_init.astype(np.float32))
    W3 = tf.Variable(W3_init.astype(np.float32))
    b3 = tf.Variable(b3_init.astype(np.float32))
    W4 = tf.Variable(W4_init.astype(np.float32))
    b4 = tf.Variable(b4_init.astype(np.float32))

    Z1 = convpool(X, W1, b1)
    Z2 = convpool(Z1, W2, b2)
    # Flatten each example's feature maps into one row; Z2_shape[0] is the
    # number of images in the batch.
    Z2_shape = Z2.get_shape().as_list()
    Z2r = tf.reshape(Z2, [Z2_shape[0], np.prod(Z2_shape[1:])])
    Z3 = tf.nn.relu(tf.matmul(Z2r, W3) + b3)
    Yish = tf.matmul(Z3, W4) + b4  # logits

    cost = tf.reduce_sum(
        tf.nn.sparse_softmax_cross_entropy_with_logits(
            logits=Yish, labels=T))

    train_op = tf.train.RMSPropOptimizer(
        0.0001, decay=0.99, momentum=0.9).minimize(cost)

    # we'll use this to calculate the error rate
    predict_op = tf.argmax(Yish, 1)

    t0 = datetime.now()
    LL = []
    init = tf.global_variables_initializer()
    with tf.Session() as session:
        session.run(init)

        for i in range(max_iter):
            for j in range(n_batches):
                lo = j * batch_sz
                hi = lo + batch_sz
                session.run(
                    train_op,
                    feed_dict={X: Xtrain[lo:hi], T: Ytrain[lo:hi]})

                if j % print_period == 0:
                    # Due to RAM limitations the input size is fixed, so the
                    # total test cost and predictions are accumulated batch
                    # by batch.
                    test_cost = 0
                    prediction = np.zeros(n_test)
                    for k in range(n_test_batches):
                        tlo = k * batch_sz
                        thi = tlo + batch_sz
                        # Fetch cost and prediction together so each test
                        # batch needs only one forward pass.
                        batch_cost, batch_pred = session.run(
                            [cost, predict_op],
                            feed_dict={X: Xtest[tlo:thi], T: Ytest[tlo:thi]})
                        test_cost += batch_cost
                        prediction[tlo:thi] = batch_pred
                    err = error_rate(prediction, Ytest)
                    print("Cost / err at iteration i=%d, j=%d: %.3f / %.3f"
                          % (i, j, test_cost, err))
                    LL.append(test_cost)

        # Read the trained filters while the session is still open.
        W1_val = W1.eval()
        W2_val = W2.eval()
    print("Elapsed time:", (datetime.now() - t0))
    plt.plot(LL)
    plt.show()

    # Move the feature-map axes to the front so filters[i, j] is one 5 x 5
    # filter: W1 becomes (20, 3, 5, 5) and W2 becomes (50, 20, 5, 5).
    W1_val = W1_val.transpose(3, 2, 0, 1)
    W2_val = W2_val.transpose(3, 2, 0, 1)

    # Tile W1 on an 8 x 8 grid of 5 x 5 cells and W2 on a 32 x 32 grid.
    # Cells fill one column of the grid top to bottom before moving on to
    # the next column.
    for title, filters, cells in (("W1", W1_val, 8), ("W2", W2_val, 32)):
        grid = np.zeros((cells * 5, cells * 5))
        m = 0
        n = 0
        for i in range(filters.shape[0]):
            for j in range(filters.shape[1]):
                grid[m * 5:(m + 1) * 5, n * 5:(n + 1) * 5] = filters[i, j]
                m += 1
                if m >= cells:
                    m = 0
                    n += 1
        plt.imshow(grid, cmap='gray')
        plt.title(title)
        plt.show()
def main():
    """Fit a Theano conv net with momentum SGD and plot the test-cost history.

    Architecture: two ``convpool`` layers, one hidden ReLU layer and a
    softmax output, trained on integer class labels with the mean negative
    log-likelihood as cost.  The training set is reshuffled at the start of
    each epoch.  Every ``print_period`` batches the cost and error rate over
    the whole test set are printed and the cost is recorded for the plot.

    ``get_data``, ``rearrange``, ``shuffle``, ``init_filter``, ``convpool``,
    ``relu`` and ``error_rate`` are expected to be defined elsewhere in this
    file or its imports.
    """
    # step 1: load the data, transform as needed
    train, test = get_data()

    # The label matrices are N x 1 with values 1..10 (MATLAB indexes by 1);
    # flattening and subtracting one gives a vector of 0..9.
    Xtrain = rearrange(train['X'])
    Ytrain = train['y'].flatten() - 1
    del train
    Xtrain, Ytrain = shuffle(Xtrain, Ytrain)

    Xtest = rearrange(test['X'])
    Ytest = test['y'].flatten() - 1
    del test

    max_iter = 6
    print_period = 10
    lr = np.float32(1e-2)
    mu = np.float32(0.99)
    batch_sz = 500
    n_batches = Xtrain.shape[0] // batch_sz
    M = 500
    K = 10
    poolsz = (2, 2)

    # Filter shapes are
    # (num_feature_maps, num_input_maps, filter_width, filter_height).
    # Layer 1: conv gives 32 - 5 + 1 = 28, downsample gives 28 / 2 = 14.
    # Layer 2: conv gives 14 - 5 + 1 = 10, downsample gives 10 / 2 = 5.
    W1_shape = (20, 3, 5, 5)
    W2_shape = (50, 20, 5, 5)
    n_flat = W2_shape[0] * 5 * 5

    # Initial values are created in the order W1, W2, W3, W4; the biases
    # (one per output unit / feature map) start at zero.
    W1_init = init_filter(W1_shape, poolsz)
    W2_init = init_filter(W2_shape, poolsz)
    W3_init = np.random.randn(n_flat, M) / np.sqrt(n_flat + M)
    W4_init = np.random.randn(M, K) / np.sqrt(M + K)
    initial_values = [
        (W1_init, 'W1'),
        (np.zeros(W1_shape[0], dtype=np.float32), 'b1'),
        (W2_init, 'W2'),
        (np.zeros(W2_shape[0], dtype=np.float32), 'b2'),
        (W3_init.astype(np.float32), 'W3'),
        (np.zeros(M, dtype=np.float32), 'b3'),
        (W4_init.astype(np.float32), 'W4'),
        (np.zeros(K, dtype=np.float32), 'b4'),
    ]

    # step 2: define theano variables and expressions
    X = T.tensor4('X', dtype='float32')
    Y = T.ivector('T')
    params = [theano.shared(value, name) for value, name in initial_values]
    W1, b1, W2, b2, W3, b3, W4, b4 = params

    # forward pass
    Z1 = convpool(X, W1, b1)
    Z2 = convpool(Z1, W2, b2)
    Z3 = relu(Z2.flatten(ndim=2).dot(W3) + b3)
    pY = T.nnet.softmax(Z3.dot(W4) + b4)

    # define the cost function and prediction
    log_likelihood = T.log(pY[T.arange(Y.shape[0]), Y])
    cost = -(log_likelihood).mean()
    prediction = T.argmax(pY, axis=1)

    # step 3: training expressions and functions
    # momentum changes: a zero-initialised velocity for every parameter
    dparams = [
        theano.shared(np.zeros_like(p.get_value(), dtype=np.float32))
        for p in params
    ]
    grads = T.grad(cost, params)
    steps = [mu * dp - lr * g for dp, g in zip(dparams, grads)]

    updates = []
    for p, dp, step in zip(params, dparams, steps):
        updates += [(dp, step), (p, p + step)]

    fit_batch = theano.function(
        inputs=[X, Y],
        updates=updates,
    )

    # separate function for evaluation over the whole dataset
    get_prediction = theano.function(
        inputs=[X, Y],
        outputs=[cost, prediction],
    )

    t0 = datetime.now()
    costs = []
    for i in range(max_iter):
        Xtrain, Ytrain = shuffle(Xtrain, Ytrain)
        for j in range(n_batches):
            batch = slice(j * batch_sz, j * batch_sz + batch_sz)
            fit_batch(Xtrain[batch], Ytrain[batch])

            if j % print_period == 0:
                cost_val, prediction_val = get_prediction(Xtest, Ytest)
                err = error_rate(prediction_val, Ytest)
                message = "Cost / err at iteration i=%d, j=%d: %.3f / %.3f"
                print(message % (i, j, cost_val, err))
                costs.append(cost_val)
    print("Elapsed time:", (datetime.now() - t0))
    plt.plot(costs)
    plt.show()
def rearrange(X):
    """Move the sample axis to the front and scale pixel values by 1/255.

    Args:
        X: 4-D image array laid out as (32, 32, 3, N).

    Returns:
        A float32 array laid out as (N, 32, 32, 3), i.e.
        ``out[i, :, :, j] == X[:, :, j, i] / 255``.
    """
    samples_first = np.transpose(X, (3, 0, 1, 2))
    scaled = samples_first / 255
    return scaled.astype(np.float32)


# In[3]:

train, test = get_data()

# Need to scale! Don't leave the pixels as 0..255 (rearrange divides by 255).
# Y is an N x 1 matrix with values 1..10 (MATLAB indexes by 1), so flatten
# it and make it 0..9.  An indicator matrix is also needed for the cost
# calculation.
Xtrain = rearrange(train['X'])
Ytrain = train['y'].flatten() - 1
del train
Xtrain, Ytrain = shuffle(Xtrain, Ytrain)
Ytrain_ind = y2indicator(Ytrain)

Xtest = rearrange(test['X'])
Ytest = test['y'].flatten() - 1
del test
def main():
    """Train a regularised Theano conv net and plot cost history and filters.

    Loads the train/test dictionaries from ``get_data``, shifts the labels
    from 1..10 to 0..9, builds two ``convpool`` layers followed by a hidden
    ReLU layer and a softmax output, and trains with momentum gradient
    descent on an L2-regularised cross-entropy cost using one-hot targets.
    Every ``print_period`` batches the cost and error rate on the whole test
    set are printed.  After training the cost history and the learned
    first- and second-layer filters are plotted.

    ``get_data``, ``rearrange``, ``shuffle``, ``y2indicator``,
    ``init_filter``, ``convpool``, ``relu`` and ``error_rate`` are expected
    to be defined elsewhere in this file or its imports.
    """
    # step 1: load the data, transform as needed
    train, test = get_data()

    # Y is an N x 1 matrix with values 1..10 (MATLAB indexes by 1), so
    # flatten it and make it 0..9.  The indicator matrices are needed for the
    # cost calculation.
    Xtrain = rearrange(train['X'])
    Ytrain = train['y'].flatten() - 1
    del train
    Xtrain, Ytrain = shuffle(Xtrain, Ytrain)
    Ytrain_ind = y2indicator(Ytrain)

    Xtest = rearrange(test['X'])
    Ytest = test['y'].flatten() - 1
    del test
    Ytest_ind = y2indicator(Ytest)

    max_iter = 8
    print_period = 10

    lr = np.float32(0.00001)
    reg = np.float32(0.01)
    mu = np.float32(0.99)

    N = Xtrain.shape[0]
    batch_sz = 500
    n_batches = N // batch_sz

    M = 500
    K = 10
    poolsz = (2, 2)

    # after conv will be of dimension 32 - 5 + 1 = 28
    # after downsample 28 / 2 = 14
    # (num_feature_maps, num_color_channels, filter_width, filter_height)
    W1_shape = (20, 3, 5, 5)
    W1_init = init_filter(W1_shape, poolsz)
    # one bias per output feature map
    b1_init = np.zeros(W1_shape[0], dtype=np.float32)

    # after conv will be of dimension 14 - 5 + 1 = 10
    # after downsample 10 / 2 = 5
    # (num_feature_maps, old_num_feature_maps, filter_width, filter_height)
    W2_shape = (50, 20, 5, 5)
    W2_init = init_filter(W2_shape, poolsz)
    b2_init = np.zeros(W2_shape[0], dtype=np.float32)

    # vanilla ANN weights
    flat_sz = W2_shape[0] * 5 * 5
    W3_init = np.random.randn(flat_sz, M) / np.sqrt(flat_sz + M)
    b3_init = np.zeros(M, dtype=np.float32)
    W4_init = np.random.randn(M, K) / np.sqrt(M + K)
    b4_init = np.zeros(K, dtype=np.float32)

    # step 2: define theano variables and expressions
    X = T.tensor4('X', dtype='float32')
    Y = T.matrix('T')
    W1 = theano.shared(W1_init, 'W1')
    b1 = theano.shared(b1_init, 'b1')
    W2 = theano.shared(W2_init, 'W2')
    b2 = theano.shared(b2_init, 'b2')
    W3 = theano.shared(W3_init.astype(np.float32), 'W3')
    b3 = theano.shared(b3_init, 'b3')
    W4 = theano.shared(W4_init.astype(np.float32), 'W4')
    b4 = theano.shared(b4_init, 'b4')
    params = [W1, b1, W2, b2, W3, b3, W4, b4]

    # momentum changes: one zero-initialised velocity per parameter, named
    # 'dW1', 'db1', ... after the parameter it belongs to
    dparams = [
        theano.shared(
            np.zeros(p.get_value().shape, dtype=np.float32), 'd' + p.name)
        for p in params
    ]

    # forward pass
    Z1 = convpool(X, W1, b1)
    Z2 = convpool(Z1, W2, b2)
    Z3 = relu(Z2.flatten(ndim=2).dot(W3) + b3)
    pY = T.nnet.softmax(Z3.dot(W4) + b4)

    # define the cost function and prediction
    reg_cost = reg * sum((param * param).sum() for param in params)
    cost = -(Y * T.log(pY)).sum() + reg_cost
    prediction = T.argmax(pY, axis=1)

    # step 3: training expressions and functions
    # Each gradient is built once and shared by the velocity update and the
    # parameter update.  Both right-hand sides refer to the values from
    # before the step.
    grads = T.grad(cost, params)
    updates = []
    for p, dp, g in zip(params, dparams, grads):
        dp_update = mu * dp - lr * g
        updates.append((p, p + dp_update))
        updates.append((dp, dp_update))

    train = theano.function(
        inputs=[X, Y],
        updates=updates,
    )

    # create another function for this because we want it over the whole
    # dataset
    get_prediction = theano.function(
        inputs=[X, Y],
        outputs=[cost, prediction],
    )

    t0 = datetime.now()
    LL = []
    for i in range(max_iter):
        for j in range(n_batches):
            lo = j * batch_sz
            hi = lo + batch_sz
            train(Xtrain[lo:hi], Ytrain_ind[lo:hi])

            if j % print_period == 0:
                cost_val, prediction_val = get_prediction(Xtest, Ytest_ind)
                err = error_rate(prediction_val, Ytest)
                print("Cost / err at iteration i=%d, j=%d: %.3f / %.3f"
                      % (i, j, cost_val, err))
                LL.append(cost_val)
    print("Elapsed time:", (datetime.now() - t0))
    plt.plot(LL)
    plt.show()

    # Visualize W1 (20, 3, 5, 5) on an 8 x 8 grid of 5 x 5 tiles and
    # W2 (50, 20, 5, 5) on a 32 x 32 grid.  Tiles fill one column of the
    # grid top to bottom before moving on to the next column.
    for title, filters, cells in (("W1", W1.get_value(), 8),
                                  ("W2", W2.get_value(), 32)):
        grid = np.zeros((cells * 5, cells * 5))
        m = 0
        n = 0
        for i in range(filters.shape[0]):
            for j in range(filters.shape[1]):
                grid[m * 5:(m + 1) * 5, n * 5:(n + 1) * 5] = filters[i, j]
                m += 1
                if m >= cells:
                    m = 0
                    n += 1
        plt.imshow(grid, cmap='gray')
        plt.title(title)
        plt.show()
def main():
    """Train a TensorFlow conv net on integer labels and plot cost and filters.

    Loads the train/test dictionaries from ``get_data``, shifts the labels
    from 1..10 to 0..9, builds two ``convpool`` layers followed by a hidden
    ReLU layer and a linear output layer, and trains with RMSProp on
    fixed-size mini-batches using sparse (integer) targets.  Every
    ``print_period`` batches the summed test cross-entropy and the test error
    rate are printed.  After training the cost history and the learned
    first- and second-layer filters are plotted.

    ``get_data``, ``rearrange``, ``shuffle``, ``init_filter``, ``convpool``
    and ``error_rate`` are expected to be defined elsewhere in this file or
    its imports.
    """
    train, test = get_data()

    # Y is an N x 1 matrix with values 1..10 (MATLAB indexes by 1), so
    # flatten it and make it 0..9.
    Xtrain = rearrange(train['X'])
    Ytrain = train['y'].flatten() - 1
    del train
    Xtrain, Ytrain = shuffle(Xtrain, Ytrain)

    Xtest = rearrange(test['X'])
    Ytest = test['y'].flatten() - 1
    del test

    # gradient descent params
    max_iter = 6
    print_period = 10
    batch_sz = 500

    # The placeholders below have a fixed batch dimension, so every batch
    # must be full.  Drop the ragged tail of both splits; deriving the cut
    # from batch_sz (instead of hard-coding sample counts) keeps this correct
    # for any data set size or batch size.
    n_batches = Xtrain.shape[0] // batch_sz
    n_test_batches = Xtest.shape[0] // batch_sz
    n_train = n_batches * batch_sz
    n_test = n_test_batches * batch_sz
    Xtrain = Xtrain[:n_train]
    Ytrain = Ytrain[:n_train]
    Xtest = Xtest[:n_test]
    Ytest = Ytest[:n_test]

    # initial weights
    M = 500
    K = 10
    poolsz = (2, 2)

    # (filter_width, filter_height, num_color_channels, num_feature_maps)
    W1_shape = (5, 5, 3, 20)
    W1_init = init_filter(W1_shape, poolsz)
    # one bias per output feature map
    b1_init = np.zeros(W1_shape[-1], dtype=np.float32)

    # (filter_width, filter_height, old_num_feature_maps, num_feature_maps)
    W2_shape = (5, 5, 20, 50)
    W2_init = init_filter(W2_shape, poolsz)
    b2_init = np.zeros(W2_shape[-1], dtype=np.float32)

    # vanilla ANN weights.  The 8 * 8 factor must match the spatial size of
    # the second convpool output -- TODO confirm against convpool.
    flat_sz = W2_shape[-1] * 8 * 8
    W3_init = np.random.randn(flat_sz, M) / np.sqrt(flat_sz + M)
    b3_init = np.zeros(M, dtype=np.float32)
    W4_init = np.random.randn(M, K) / np.sqrt(M + K)
    b4_init = np.zeros(K, dtype=np.float32)

    # define variables and expressions
    # using None as the first shape element takes up too much RAM
    X = tf.placeholder(tf.float32, shape=(batch_sz, 32, 32, 3), name='X')
    T = tf.placeholder(tf.int32, shape=(batch_sz,), name='T')
    W1 = tf.Variable(W1_init.astype(np.float32))
    b1 = tf.Variable(b1_init.astype(np.float32))
    W2 = tf.Variable(W2_init.astype(np.float32))
    b2 = tf.Variable(b2_init.astype(np.float32))
    W3 = tf.Variable(W3_init.astype(np.float32))
    b3 = tf.Variable(b3_init.astype(np.float32))
    W4 = tf.Variable(W4_init.astype(np.float32))
    b4 = tf.Variable(b4_init.astype(np.float32))

    Z1 = convpool(X, W1, b1)
    Z2 = convpool(Z1, W2, b2)
    # Flatten each example's feature maps into one row for the dense layer.
    Z2_shape = Z2.get_shape().as_list()
    Z2r = tf.reshape(Z2, [Z2_shape[0], np.prod(Z2_shape[1:])])
    Z3 = tf.nn.relu(tf.matmul(Z2r, W3) + b3)
    Yish = tf.matmul(Z3, W4) + b4  # logits

    cost = tf.reduce_sum(
        tf.nn.sparse_softmax_cross_entropy_with_logits(
            logits=Yish,
            labels=T
        )
    )

    train_op = tf.train.RMSPropOptimizer(
        0.0001, decay=0.99, momentum=0.9).minimize(cost)

    # we'll use this to calculate the error rate
    predict_op = tf.argmax(Yish, 1)

    def evaluate(session):
        """Return (summed cost, predictions) over the truncated test set."""
        total_cost = 0
        predicted = np.zeros(n_test)
        for k in range(n_test_batches):
            lo = k * batch_sz
            hi = lo + batch_sz
            # Fetch cost and prediction together so each test batch needs
            # only one forward pass.
            batch_cost, batch_pred = session.run(
                [cost, predict_op],
                feed_dict={X: Xtest[lo:hi], T: Ytest[lo:hi]})
            total_cost += batch_cost
            predicted[lo:hi] = batch_pred
        return total_cost, predicted

    def show_filters(filters, cells, title):
        """Tile the 5 x 5 filters column by column on a cells x cells grid."""
        grid = np.zeros((cells * 5, cells * 5))
        m = 0
        n = 0
        for i in range(filters.shape[0]):
            for j in range(filters.shape[1]):
                grid[m * 5:(m + 1) * 5, n * 5:(n + 1) * 5] = filters[i, j]
                m += 1
                if m >= cells:
                    m = 0
                    n += 1
        plt.imshow(grid, cmap='gray')
        plt.title(title)
        plt.show()

    t0 = datetime.now()
    LL = []
    init = tf.global_variables_initializer()
    with tf.Session() as session:
        session.run(init)

        for i in range(max_iter):
            for j in range(n_batches):
                lo = j * batch_sz
                hi = lo + batch_sz
                session.run(
                    train_op,
                    feed_dict={X: Xtrain[lo:hi], T: Ytrain[lo:hi]})

                if j % print_period == 0:
                    # Due to RAM limitations the input size is fixed, so the
                    # test set is evaluated batch by batch.
                    test_cost, prediction = evaluate(session)
                    err = error_rate(prediction, Ytest)
                    print("Cost / err at iteration i=%d, j=%d: %.3f / %.3f"
                          % (i, j, test_cost, err))
                    LL.append(test_cost)

        # Read the trained filters while the session is still open.
        W1_val = W1.eval()
        W2_val = W2.eval()
    print("Elapsed time:", (datetime.now() - t0))
    plt.plot(LL)
    plt.show()

    # Move the feature-map axes to the front so filters[i, j] is one 5 x 5
    # filter: W1 becomes (20, 3, 5, 5) and W2 becomes (50, 20, 5, 5).
    show_filters(W1_val.transpose(3, 2, 0, 1), 8, "W1")
    show_filters(W2_val.transpose(3, 2, 0, 1), 32, "W2")
def main():
    """Train a small CNN (two conv-pool layers + one dense layer) and plot test cost.

    Loads the data via get_data(), converts labels to indicator matrices,
    trains with RMSProp on fixed-size mini-batches and, every
    ``print_period`` batches, evaluates the summed cross-entropy and the
    error rate over the whole (batch-aligned) test set. Takes no arguments
    and returns nothing; results are printed and plotted.
    """
    train, test = get_data()

    Xtrain = rearrange(train['X'])
    # train['y'] has shape (N, 1) with values 1..10; we need shape (N,)
    # with values 0..9.
    Ytrain = train['y'].flatten() - 1
    del train
    Xtrain, Ytrain = shuffle(Xtrain, Ytrain)
    Ytrain_ind = y2indicator(Ytrain)

    Xtest = rearrange(test['X'])
    Ytest = test['y'].flatten() - 1
    del test
    Ytest_ind = y2indicator(Ytest)

    # Gradient descent parameters.
    max_iter = 20
    print_period = 10
    batch_sz = 500

    # The placeholders below have a fixed batch dimension, so keep only as
    # many samples as fill whole batches. Deriving the counts from batch_sz
    # (instead of hard-coding them) keeps every prediction slot filled for
    # any dataset size.
    n_batches = Xtrain.shape[0] // batch_sz
    n_train = n_batches * batch_sz
    Xtrain = Xtrain[:n_train]
    Ytrain_ind = Ytrain_ind[:n_train]

    n_test_batches = Xtest.shape[0] // batch_sz
    n_test = n_test_batches * batch_sz
    Xtest = Xtest[:n_test]
    Ytest = Ytest[:n_test]
    Ytest_ind = Ytest_ind[:n_test]

    # Initial weights.
    M = 500  # neurons in the dense hidden layer
    K = 10  # number of classes
    pool_sz = (2, 2)

    # Filter shape: (width, height, num_color_channels, num_feature_maps).
    W1_shape = (5, 5, 3, 20)
    W1_init = init_filter(W1_shape, pool_sz)  # pool_sz is used for normalization
    b1_init = np.zeros(W1_shape[-1], dtype=np.float32)

    W2_shape = (5, 5, 20, 50)
    W2_init = init_filter(W2_shape, pool_sz)
    b2_init = np.zeros(W2_shape[-1], dtype=np.float32)

    # Dense-layer weights. The 8 * 8 factor is the spatial size left after
    # the two conv-pool layers (32x32 --> 16x16 --> 8x8).
    flat_sz = W2_shape[-1] * 8 * 8
    W3_init = np.random.randn(flat_sz, M) / np.sqrt(flat_sz + M)
    b3_init = np.zeros(M, dtype=np.float32)
    W4_init = np.random.randn(M, K) / np.sqrt(M + K)
    b4_init = np.zeros(K, dtype=np.float32)

    X = tf.placeholder(tf.float32, shape=(batch_sz, 32, 32, 3), name='X')
    T = tf.placeholder(tf.float32, shape=(batch_sz, K), name='T')
    W1 = tf.Variable(W1_init.astype(np.float32))
    b1 = tf.Variable(b1_init.astype(np.float32))
    W2 = tf.Variable(W2_init.astype(np.float32))
    b2 = tf.Variable(b2_init.astype(np.float32))
    W3 = tf.Variable(W3_init.astype(np.float32))
    b3 = tf.Variable(b3_init.astype(np.float32))
    W4 = tf.Variable(W4_init.astype(np.float32))
    b4 = tf.Variable(b4_init.astype(np.float32))

    Z1 = convpool(X, W1, b1)
    Z2 = convpool(Z1, W2, b2)
    Z2_shape = Z2.get_shape().as_list()
    # Flatten the feature maps so they can feed the dense layer.
    Z2r = tf.reshape(Z2, [Z2_shape[0], np.prod(Z2_shape[1:])])
    Z3 = tf.nn.relu(tf.matmul(Z2r, W3) + b3)
    Yish = tf.matmul(Z3, W4) + b4

    # Summed (not averaged) cross-entropy over the batch.
    cost = tf.reduce_sum(
        tf.nn.softmax_cross_entropy_with_logits(logits=Yish, labels=T))

    train_op = tf.train.RMSPropOptimizer(
        0.0001, decay=0.99, momentum=0.9).minimize(cost)

    predict_op = tf.argmax(Yish, 1)

    t0 = datetime.now()
    LL = []
    init = tf.global_variables_initializer()
    with tf.Session() as sess:
        sess.run(init)

        for i in range(max_iter):
            for j in range(n_batches):
                lo = j * batch_sz
                hi = lo + batch_sz
                sess.run(train_op,
                         feed_dict={X: Xtrain[lo:hi], T: Ytrain_ind[lo:hi]})

                if j % print_period == 0:
                    # X expects exactly batch_sz rows, so the test set is
                    # evaluated batch by batch and the results accumulated.
                    test_cost = 0
                    prediction = np.zeros(n_test)
                    for k in range(n_test_batches):
                        lo = k * batch_sz
                        hi = lo + batch_sz
                        # One forward pass yields both the cost and the
                        # predicted classes for this batch.
                        batch_cost, batch_pred = sess.run(
                            [cost, predict_op],
                            feed_dict={X: Xtest[lo:hi], T: Ytest_ind[lo:hi]})
                        test_cost += batch_cost
                        prediction[lo:hi] = batch_pred
                    err = error_rate(prediction, Ytest)
                    print(
                        "Cost / err at iteration i=%d, j=%d: %.3f / %.3f" %
                        (i, j, test_cost, err))
                    LL.append(test_cost)

    print("Elapsed time:", (datetime.now() - t0))
    plt.plot(LL)
    plt.show()
def rearrange(X):
    """Move the sample axis to the front and scale the values by 1/255.

    The input is laid out as (32, 32, 3, N); the result is a float32 array
    laid out as (N, 32, 32, 3). This is the vectorised equivalent of
    copying X[:, :, j, i] into out[i, :, :, j] for every sample i and
    channel j, then dividing by 255.
    """
    samples_first = np.transpose(X, (3, 0, 1, 2))
    scaled = samples_first / 255
    return scaled.astype(np.float32)


train, test = get_data()

# Need to scale! Don't leave the inputs as 0..255.
# Y is an N x 1 matrix with values 1..10 (MATLAB indexes from 1),
# so flatten it and shift it to 0..9.
# An indicator matrix is also needed for the cost calculation.
Xtrain = rearrange(train['X'])
Ytrain = train['y'].flatten() - 1
del train
Xtrain, Ytrain = shuffle(Xtrain, Ytrain)

Xtest = rearrange(test['X'])
Ytest = test['y'].flatten() - 1
del test
def main(): train, test = get_data() X_train = rearrange(train['X']) t_train = train['y'].flatten() - 1 del train X_train, t_train = shuffle(X_train, t_train) X_test = rearrange(test['X']) t_test = test['y'].flatten() - 1 del test # Gradient-descent parameters epochs = 6 print_period = 10 N = X_train.shape[0] batch_size = 500 nb_batches = N // batch_size # Limit samples since input will always have to be same size # we could have done: N = N / batch_size * batch_size X_train = X_train[:73000, ] t_train = t_train[:73000] X_test = X_test[:26000, ] t_test = t_test[:26000] # Initial weights M = 500 K = 10 pool_size = (2, 2) # W*H*C1*features_map W0_shape = (5, 5, 3, 20) W0_init = init_filter(W0_shape, pool_size) b0_init = np.zeros(W0_shape[-1], dtype=np.float32) W1_shape = (5, 5, 20, 50) W1_init = init_filter(W1_shape, pool_size) b1_init = np.zeros(W1_shape[-1], dtype=np.float32) # ANN weights W2_init = np.random.randn(W1_shape[-1] * 8 * 8, M) / np.sqrt(W1_shape[-1] * 8 * 8 + M) b2_init = np.zeros(M) W3_init = np.random.randn(M, K) / np.sqrt(M + K) b3_init = np.zeros(K) # tf environment X_pl = tf.placeholder(tf.float32, shape=(batch_size, 32, 32, 3), name='X') t_pl = tf.placeholder(tf.int32, shape=(batch_size, ), name='t') W0 = tf.Variable(W0_init.astype(np.float32)) b0 = tf.Variable(b0_init.astype(np.float32)) W1 = tf.Variable(W1_init.astype(np.float32)) b1 = tf.Variable(b1_init.astype(np.float32)) W2 = tf.Variable(W2_init.astype(np.float32)) b2 = tf.Variable(b2_init.astype(np.float32)) W3 = tf.Variable(W3_init.astype(np.float32)) b3 = tf.Variable(b3_init.astype(np.float32)) # tf training environment A1 = convpool(X_pl, W0, b0) A2 = convpool(A1, W1, b1) A2_shape = A2.get_shape().as_list() A2r = tf.reshape(A2, [A2_shape[0], np.prod(A2.shape[1:])]) A3 = tf.nn.relu(tf.matmul(A2r, W2) + b2) Z4 = tf.matmul(A3, W3) + b3 J = tf.reduce_sum( tf.nn.sparse_softmax_cross_entropy_with_logits(labels=t_pl, logits=Z4)) train_op = tf.train.RMSPropOptimizer(0.0001, decay=0.99, 
momentum=0.9).minimize(J) # tf test environment y = tf.argmax(Z4, 1) # TRAIN & TEST t0 = datetime.now() tests_costs = [] init = tf.global_variables_initializer() with tf.Session() as sess: sess.run(init) for epoch in range(epochs): for batch_id in range(nb_batches): X_train_batch = X_train[batch_id * batch_size:(batch_id + 1) * batch_size, ] t_train_batch = t_train[batch_id * batch_size:(batch_id + 1) * batch_size, ] if len(X_train_batch) == batch_size: sess.run(train_op, feed_dict={ X_pl: X_train_batch, t_pl: t_train_batch }) if batch_id % print_period == 0: # due to RAM limitations we need to have a fixed input # We took the size of a batch for the placeholder # as a result we have this ugly total cost and prediction computation j_test = 0 y_test = np.zeros(len(X_test)) for batch_test_id in range(len(X_test) // batch_size): X_test_batch = X_test[batch_test_id * batch_size:(batch_test_id + 1) * batch_size, ] t_test_batch = t_test[batch_test_id * batch_size:(batch_test_id + 1) * batch_size] j_test += sess.run(J, feed_dict={ X_pl: X_test_batch, t_pl: t_test_batch }) y_test[batch_test_id * batch_size:(batch_test_id + 1) * batch_size, ] = sess.run( y, feed_dict={X_pl: X_test_batch}) tests_costs.append(j_test) acc = accuracy(y_test, t_test) print( 'Epoch {} batch_id {}: validation cost: {} - accuracy = {}%' .format(epoch, batch_id, j_test, acc * 100)) # W0_val = W0.eval() # W1_val = W1.eval() print('Elapsed time: {}'.format(datetime.now() - t0)) #plt.plot(tests_costs) #plt.show() '''
def main():
    """Train the conv-pool network on indicator-encoded labels and plot test cost.

    Data comes from get_data(); inputs are rescaled by rearrange() and the
    labels are shifted from 1..10 down to 0..9 before being turned into
    indicator matrices. Every ``print_period`` training batches the summed
    test cost and the error rate are printed, and the cost history is
    plotted at the end. Takes no arguments and returns nothing.
    """
    train, test = get_data()

    # Training split: scale inputs, make labels 0-based, shuffle, one-hot.
    Xtrain = rearrange(train['X'])
    Ytrain = train['y'].flatten() - 1
    del train
    Xtrain, Ytrain = shuffle(Xtrain, Ytrain)
    Ytrain_ind = y2indicator(Ytrain)

    # Test split: same treatment, without the shuffle.
    Xtest = rearrange(test['X'])
    Ytest = test['y'].flatten() - 1
    del test
    Ytest_ind = y2indicator(Ytest)

    # Gradient descent settings. The batch count is taken from the sample
    # count as it is before the truncation below.
    max_iter = 6
    print_period = 10
    batch_sz = 500
    N = Xtrain.shape[0]
    n_batches = N // batch_sz

    # Keep a multiple of batch_sz samples in each split.
    Xtrain, Ytrain = Xtrain[:73000], Ytrain[:73000]
    Xtest, Ytest = Xtest[:26000], Ytest[:26000]

    M = 500
    K = 10
    poolsz = (2, 2)

    # First filter bank:
    # (filter_width, filter_height, num_color_channels, num_feature_maps)
    W1_shape = (5, 5, 3, 20)
    W1_init = init_filter(W1_shape, poolsz)
    b1_init = np.zeros(W1_shape[-1], dtype=np.float32)

    # Second filter bank:
    # (filter_width, filter_height, old_num_feature_maps, num_feature_maps)
    W2_shape = (5, 5, 20, 50)
    W2_init = init_filter(W2_shape, poolsz)
    b2_init = np.zeros(W2_shape[-1], dtype=np.float32)

    # Feed-forward layers on top of the flattened feature maps.
    flat_features = W2_shape[-1] * 8 * 8
    W3_init = np.random.randn(flat_features, M) / np.sqrt(flat_features + M)
    b3_init = np.zeros(M, dtype=np.float32)
    W4_init = np.random.randn(M, K) / np.sqrt(M + K)
    b4_init = np.zeros(K, dtype=np.float32)

    # TensorFlow inputs. Using None as the first shape element takes up too
    # much RAM, so the batch dimension is pinned to batch_sz.
    X = tf.placeholder(tf.float32, shape=(batch_sz, ) + Xtrain.shape[1:4])
    T = tf.placeholder(tf.float32, shape=(batch_sz, K), name='T')

    # Trainable weights and biases, all stored as float32.
    W1 = tf.Variable(W1_init.astype(np.float32))
    b1 = tf.Variable(b1_init.astype(np.float32))
    W2 = tf.Variable(W2_init.astype(np.float32))
    b2 = tf.Variable(b2_init.astype(np.float32))
    W3 = tf.Variable(W3_init.astype(np.float32))
    b3 = tf.Variable(b3_init.astype(np.float32))
    W4 = tf.Variable(W4_init.astype(np.float32))
    b4 = tf.Variable(b4_init.astype(np.float32))

    # Forward pass: two conv-pool stages, flatten to [N, W*H*C], then the
    # ReLU hidden layer and the linear output layer.
    Z1 = convpool(X, W1, b1)
    Z2 = convpool(Z1, W2, b2)
    Z2_shape = Z2.get_shape().as_list()
    flattened = tf.reshape(Z2, [Z2_shape[0], np.prod(Z2_shape[1:])])
    hidden = tf.nn.relu(tf.matmul(flattened, W3) + b3)
    Yish = tf.matmul(hidden, W4) + b4

    per_sample_loss = tf.nn.softmax_cross_entropy_with_logits(logits=Yish,
                                                              labels=T)
    cost = tf.reduce_sum(per_sample_loss)

    optimizer = tf.train.RMSPropOptimizer(0.0001, decay=0.99, momentum=0.9)
    train_op = optimizer.minimize(cost)

    # Used to calculate the error rate.
    predict_op = tf.argmax(Yish, 1)

    t0 = datetime.now()
    LL = []
    init = tf.global_variables_initializer()
    with tf.Session() as session:
        session.run(init)

        for i in range(max_iter):
            for j in range(n_batches):
                rows = slice(j * batch_sz, j * batch_sz + batch_sz)
                Xbatch = Xtrain[rows]
                Ybatch = Ytrain_ind[rows]

                # Only full batches fit the fixed-size placeholders.
                if len(Xbatch) != batch_sz:
                    continue
                session.run(train_op, feed_dict={X: Xbatch, T: Ybatch})

                if j % print_period != 0:
                    continue

                # Due to RAM limitations the input size is fixed, so the
                # total cost and the predictions are built up per batch.
                test_cost = 0
                prediction = np.zeros(len(Xtest))
                for k in range(len(Xtest) // batch_sz):
                    test_rows = slice(k * batch_sz, k * batch_sz + batch_sz)
                    test_feed = {
                        X: Xtest[test_rows],
                        T: Ytest_ind[test_rows],
                    }
                    # Accumulate the test cost.
                    test_cost += session.run(cost, feed_dict=test_feed)
                    # Fill in this batch's part of the prediction vector.
                    prediction[test_rows] = session.run(predict_op,
                                                        feed_dict=test_feed)

                err = error_rate(prediction, Ytest)
                print("Cost / err at iteration i=%d, j=%d: %.3f / %.3f" %
                      (i, j, test_cost, err))
                LL.append(test_cost)

    print("Elapsed time:", (datetime.now() - t0))
    plt.plot(LL)
    plt.show()
def main():
    """Train the conv-pool network on integer labels, then visualise its filters.

    Data comes from get_data(). The training set is reshuffled at the start
    of every pass. Every ``print_period`` batches the summed test cost and
    the error rate are printed. After training, the cost history and both
    convolutional filter banks are shown as plots. Takes no arguments and
    returns nothing.
    """

    def show_filter_grid(filters, n_out, n_in, tiles_per_side, title):
        # Paste every 5x5 filter into a square canvas, filling one column
        # of tiles from top to bottom before moving to the next column.
        # Tiles left over at the end stay zero.
        canvas = np.zeros((tiles_per_side * 5, tiles_per_side * 5))
        for tile_idx in range(n_out * n_in):
            out_ch, in_ch = divmod(tile_idx, n_in)
            col, row = divmod(tile_idx, tiles_per_side)
            canvas[row * 5:(row + 1) * 5,
                   col * 5:(col + 1) * 5] = filters[out_ch, in_ch]
        plt.imshow(canvas, cmap='gray')
        plt.title(title)
        plt.show()

    train, test = get_data()

    # Labels are flattened and shifted down by one.
    train_X = rearrange(train['X'])
    train_Y = train['y'].flatten() - 1
    del train
    train_X, train_Y = shuffle(train_X, train_Y)

    test_X = rearrange(test['X'])
    test_Y = test['y'].flatten() - 1
    del test

    max_iter = 6
    print_period = 10
    batch_sz = 500
    # Batch count is based on the sample count before truncation.
    num_batch = train_X.shape[0] // batch_sz

    train_X, train_Y = train_X[:73000], train_Y[:73000]
    test_X, test_Y = test_X[:26000], test_Y[:26000]

    # Initial weights
    M = 500
    K = 10
    W1_shape = (5, 5, 3, 20)
    W1_init = init_filter(W1_shape)
    b1_init = np.zeros(W1_shape[-1], dtype=np.float32)

    W2_shape = (5, 5, 20, 50)
    W2_init = init_filter(W2_shape)
    b2_init = np.zeros(W2_shape[-1], dtype=np.float32)

    dense_in = W2_shape[-1] * 8 * 8
    W3_init = np.random.randn(dense_in, M) / np.sqrt(dense_in + M)
    b3_init = np.zeros(M, dtype=np.float32)

    W4_init = np.random.randn(M, K) / np.sqrt(M + K)
    b4_init = np.zeros(K, dtype=np.float32)

    # Placeholders with a fixed batch dimension
    inputs = tf.placeholder(tf.float32,
                            shape=[batch_sz, 32, 32, 3],
                            name='inputs')
    labels = tf.placeholder(tf.int32, shape=[batch_sz], name='labels')

    W1 = tf.Variable(W1_init.astype(np.float32))
    b1 = tf.Variable(b1_init.astype(np.float32))
    W2 = tf.Variable(W2_init.astype(np.float32))
    b2 = tf.Variable(b2_init.astype(np.float32))
    W3 = tf.Variable(W3_init.astype(np.float32))
    b3 = tf.Variable(b3_init.astype(np.float32))
    W4 = tf.Variable(W4_init.astype(np.float32))
    b4 = tf.Variable(b4_init.astype(np.float32))

    # Forward pass
    Z1 = convpool(inputs, W1, b1)
    Z2 = convpool(Z1, W2, b2)
    Z2_shape = Z2.get_shape().as_list()
    Z2_flat = tf.reshape(Z2, [Z2_shape[0], np.prod(Z2_shape[1:])])
    Z3 = tf.nn.relu(tf.matmul(Z2_flat, W3) + b3)
    logits = tf.matmul(Z3, W4) + b4

    # Cost, training step and prediction ops
    per_sample_loss = tf.nn.sparse_softmax_cross_entropy_with_logits(
        logits=logits, labels=labels)
    cost = tf.reduce_sum(per_sample_loss)
    train_op = tf.train.RMSPropOptimizer(0.0001, decay=0.99,
                                         momentum=0.9).minimize(cost)
    predict_op = tf.argmax(logits, axis=1)

    costs = []
    W1_value = None
    W2_value = None
    init = tf.global_variables_initializer()
    with tf.Session() as session:
        session.run(init)

        for i in range(max_iter):
            shuffle_X, shuffle_Y = shuffle(train_X, train_Y)
            for j in range(num_batch):
                rows = slice(j * batch_sz, j * batch_sz + batch_sz)
                x = shuffle_X[rows]
                y = shuffle_Y[rows]

                # Skip anything that is not a full batch.
                if len(x) != batch_sz:
                    continue
                session.run(train_op, feed_dict={inputs: x, labels: y})

                if j % print_period != 0:
                    continue

                # Evaluate the test set one fixed-size batch at a time.
                test_cost = 0
                prediction = np.zeros(len(test_X))
                for k in range(len(test_X) // batch_sz):
                    test_rows = slice(k * batch_sz, k * batch_sz + batch_sz)
                    Xtestbatch = test_X[test_rows]
                    Ytestbatch = test_Y[test_rows]
                    test_cost += session.run(cost,
                                             feed_dict={
                                                 inputs: Xtestbatch,
                                                 labels: Ytestbatch
                                             })
                    prediction[test_rows] = session.run(
                        predict_op, feed_dict={inputs: Xtestbatch})

                err = error_rate(prediction, test_Y)
                costs.append(test_cost)
                print("Cost / err at iteration i=%d, j=%d: %.3f / %.3f" %
                      (i, j, test_cost, err))

        # Pull the learned filters out while the session is still open.
        W1_value = W1.eval()
        W2_value = W2.eval()

    plt.plot(costs)
    plt.show()

    # Reorder the filters to (out_maps, in_maps, width, height).
    W1_value = W1_value.transpose(3, 2, 0, 1)
    W2_value = W2_value.transpose(3, 2, 0, 1)

    # 20 * 3 = 60 filters on an 8 x 8 grid; the last 4 tiles stay empty.
    show_filter_grid(W1_value, 20, 3, 8, 'W1')

    # 50 * 20 = 1000 filters on a 32 x 32 grid; the last 24 tiles stay empty.
    show_filter_grid(W2_value, 50, 20, 32, 'W2')
def main():
    """Train the Theano version of the CNN with hand-written RMSProp + momentum.

    Loads the data via get_data(), builds two conv-pool layers followed by a
    dense ReLU layer and a softmax output, and minimises the mean negative
    log-likelihood plus an L2 penalty on all parameters. Every
    ``print_period`` batches the cost and error rate on the full test set
    are printed; the cost history is plotted at the end. Takes no arguments
    and returns nothing.
    """
    train, test = get_data()

    train_X = rearrange(train['X'])
    # Flatten the labels and shift them down by one.
    train_Y = train['y'].flatten() - 1
    train_X, train_Y = shuffle(train_X, train_Y)
    test_X = rearrange(test['X'])
    test_Y = test['y'].flatten() - 1

    max_iter = 6
    print_period = 10

    # Hyperparameters are float32 so they do not upcast the float32 graph.
    lr = np.float32(0.0001)
    mu = np.float32(0.99)  # momentum coefficient
    decay = np.float32(0.9)  # decay of the squared-gradient cache
    eps = np.float32(1e-10)
    reg = np.float32(0.01)

    N = train_X.shape[0]
    batch_sz = 500
    # Integer division: a trailing partial batch is never visited.
    num_batch = N // batch_sz

    M = 500
    K = 10
    poolsz = (2, 2)

    # (num_feature_maps, num_color_channels, filter_width, filter_height)
    W1_shape = (20, 3, 5, 5)
    W1_init = init_filter(W1_shape, poolsz)
    b1_init = np.zeros(W1_shape[0], dtype=np.float32)

    # (num_feature_maps, old_num_feature_maps, filter_width, filter_height)
    W2_shape = (50, 20, 5, 5)
    W2_init = init_filter(W2_shape, poolsz)
    b2_init = np.zeros(W2_shape[0], dtype=np.float32)

    # ANN
    W3_init = np.random.randn(W2_shape[0] * 5 * 5,
                              M) / np.sqrt(W2_shape[0] * 5 * 5 + M)
    b3_init = np.zeros(M, dtype=np.float32)
    W4_init = np.random.randn(M, K) / np.sqrt(M + K)
    b4_init = np.zeros(K, dtype=np.float32)

    # Theano variables
    X = T.tensor4('X', dtype='float32')
    Y = T.ivector('T')
    W1 = theano.shared(W1_init, 'W1')
    b1 = theano.shared(b1_init, 'b1')
    W2 = theano.shared(W2_init, 'W2')
    b2 = theano.shared(b2_init, 'b2')
    W3 = theano.shared(W3_init.astype(np.float32), 'W3')
    b3 = theano.shared(b3_init, 'b3')
    W4 = theano.shared(W4_init.astype(np.float32), 'W4')
    b4 = theano.shared(b4_init, 'b4')

    # Forward pass
    Z1 = convpool(X, W1, b1)
    Z2 = convpool(Z1, W2, b2)
    Z3 = relu(Z2.flatten(ndim=2).dot(W3) + b3)
    pY = T.nnet.softmax(Z3.dot(W4) + b4)

    # Cost and prediction expressions
    params = [W1, b1, W2, b2, W3, b3, W4, b4]
    # Builtin sum, not np.sum: NumPy deprecates being handed a generator
    # and only handles it by falling back to the builtin anyway.
    rcost = reg * sum((p * p).sum() for p in params)
    cost = -(T.log(pY[T.arange(Y.shape[0]), Y])).mean() + rcost
    prediction = T.argmax(pY, axis=1)

    # Per-parameter state: velocities start at zero, caches start at one.
    momentum = [
        theano.shared(np.zeros_like(p.get_value(), dtype=np.float32))
        for p in params
    ]
    catchs = [
        theano.shared(np.ones_like(p.get_value(), dtype=np.float32))
        for p in params
    ]

    # RMSProp with momentum
    updates = []
    grads = T.grad(cost, params)
    for p, g, m, c in zip(params, grads, momentum, catchs):
        updates_c = decay * c + (np.float32(1.0) - decay) * g * g
        updates_m = mu * m - lr * g / T.sqrt(updates_c + eps)
        updates_p = p + updates_m

        updates.append([c, updates_c])
        updates.append([m, updates_m])
        updates.append([p, updates_p])

    # Compiled functions
    train_op = theano.function(inputs=[X, Y], updates=updates)
    prediction_op = theano.function(inputs=[X, Y],
                                    outputs=[cost, prediction])

    costs = []
    for i in range(max_iter):
        # Fresh sample order on every pass.
        shuffle_X, shuffle_Y = shuffle(train_X, train_Y)
        for j in range(num_batch):
            x = shuffle_X[j * batch_sz:(j * batch_sz + batch_sz), :]
            y = shuffle_Y[j * batch_sz:(j * batch_sz + batch_sz)]

            train_op(x, y)
            if j % print_period == 0:
                # The whole test set is evaluated in a single call.
                cost_val, p_val = prediction_op(test_X, test_Y)
                e = error_rate(p_val, test_Y)
                costs.append(cost_val)
                print("Cost / err at iteration i=%d, j=%d: %.3f / %.3f" %
                      (i, j, cost_val, e))

    plt.plot(costs)
    plt.show()