def main():
    # define data dims
    Nclass = 500
    D = 2  # input dimensionality
    M = 3  # hidden layer size
    K = 3  # number of classes

    # generate three gaussian clouds
    X1 = np.random.randn(Nclass, 2) + np.array([0, -2])
    X2 = np.random.randn(Nclass, 2) + np.array([2, 2])
    X3 = np.random.randn(Nclass, 2) + np.array([-2, 2])
    X = np.vstack([X1, X2, X3])

    Y = np.array([0] * Nclass + [1] * Nclass + [2] * Nclass)
    N = len(Y)

    # turn Y into an indicator matrix for training
    T = np.zeros((N, K))
    for i in range(N):
        T[i, Y[i]] = 1

    plt.scatter(X[:, 0], X[:, 1], c=Y, s=100, alpha=0.5)
    plt.show()

    W1, b1 = init_weight_and_biases(D, M)
    W2, b2 = init_weight_and_biases(M, K)

    # perform backpropagation
    learning_rate = 10e-7
    costs = []
    for epoch in range(100000):
        output, hidden = forward(X, W1, b1, W2, b2)
        if epoch % 100 == 0:
            c = cost(T, output)
            P = np.argmax(output, axis=1)
            r = classification_rate(Y, P)
            print("cost:", c, "classification rate:", r)
            costs.append(c)

        # gradient ascent (reverse of gradient descent)
        W2 += learning_rate * derivative_w2(hidden, T, output)
        b2 += learning_rate * derivative_b2(T, output)
        W1 += learning_rate * derivative_w1(X, hidden, T, output, W2)
        b1 += learning_rate * derivative_b1(T, output, W2, hidden)

    plt.plot(costs)
    plt.show()
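main() relies on helper functions defined elsewhere in the code (init_weight_and_biases, forward, cost, classification_rate, and the four derivative functions). A minimal sketch of what they could look like is below, assuming a sigmoid hidden layer, a softmax output, and a log-likelihood "cost" that the gradient-ascent updates in main() maximize; the 1/sqrt(M1) initialization scaling is also an assumption.

import numpy as np

def init_weight_and_biases(M1, M2):
    # small random weights, zero biases (scaling is an assumption)
    W = np.random.randn(M1, M2) / np.sqrt(M1)
    b = np.zeros(M2)
    return W, b

def forward(X, W1, b1, W2, b2):
    # sigmoid hidden layer, softmax output; returns (output probabilities, hidden activations)
    Z = 1 / (1 + np.exp(-(X.dot(W1) + b1)))
    A = Z.dot(W2) + b2
    expA = np.exp(A)
    Y = expA / expA.sum(axis=1, keepdims=True)
    return Y, Z

def classification_rate(Y, P):
    # fraction of correct predictions
    return np.mean(Y == P)

def cost(T, output):
    # total log-likelihood of the targets; main() maximizes this via gradient ascent
    return (T * np.log(output)).sum()

def derivative_w2(hidden, T, output):
    return hidden.T.dot(T - output)

def derivative_b2(T, output):
    return (T - output).sum(axis=0)

def derivative_w1(X, hidden, T, output, W2):
    dZ = (T - output).dot(W2.T) * hidden * (1 - hidden)  # sigmoid derivative
    return X.T.dot(dZ)

def derivative_b1(T, output, W2, hidden):
    return ((T - output).dot(W2.T) * hidden * (1 - hidden)).sum(axis=0)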
def __init__(self, M1, M2, an_id):
    self.id = an_id
    self.M1 = M1
    self.M2 = M2
    W, b = init_weight_and_biases(M1, M2)
    self.W = tf.Variable(W.astype(np.float32))
    self.b = tf.Variable(b.astype(np.float32))
    self.params = [self.W, self.b]
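This TensorFlow HiddenLayer only stores its parameters; the network's forward pass presumably calls a method roughly like the sketch below (the ReLU activation is an assumption).

def forward(self, X):
    # affine transform followed by a nonlinearity (ReLU assumed here)
    return tf.nn.relu(tf.matmul(X, self.W) + self.b)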
def __init__(self, M1, M2, an_id):
    self.id = an_id
    self.M1 = M1
    self.M2 = M2
    W, b = init_weight_and_biases(M1, M2)
    self.W = theano.shared(W, 'W_%s' % self.id)
    self.b = theano.shared(b, 'b_%s' % self.id)
    self.params = [self.W, self.b]
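Likewise, the Theano HiddenLayer is assumed to expose a forward method along these lines (ReLU assumed here as well).

def forward(self, X):
    # affine transform followed by a nonlinearity (ReLU assumed here)
    return T.nnet.relu(X.dot(self.W) + self.b)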
X, Y = shuffle(X, Y)
Y = Y.astype(np.int32)
M = 5
D = X.shape[1]
K = len(set(Y))

# create train and test sets
Xtrain = X[:-100]
Ytrain = Y[:-100]
Ytrain_ind = y2indicator(Ytrain)
Xtest = X[-100:]
Ytest = Y[-100:]
Ytest_ind = y2indicator(Ytest)

W1, b1 = init_weight_and_biases(D, M)
W2, b2 = init_weight_and_biases(M, K)

def forward(X, W1, b1, W2, b2):
    Z = np.tanh(X.dot(W1) + b1)
    return softmax(Z.dot(W2) + b2), Z

def cross_entropy(T, pY):
    return -np.mean(T * np.log(pY))

train_costs = []
test_costs = []
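The snippet stops before the training loop. A plain full-batch gradient-descent loop like the sketch below would typically follow, using the tanh derivative (1 - Z^2) for the hidden layer; the learning rate, iteration count, and print period are assumptions.

learning_rate = 0.001  # assumed value
for i in range(10000):
    pYtrain, Ztrain = forward(Xtrain, W1, b1, W2, b2)
    pYtest, Ztest = forward(Xtest, W1, b1, W2, b2)

    ctrain = cross_entropy(Ytrain_ind, pYtrain)
    ctest = cross_entropy(Ytest_ind, pYtest)
    train_costs.append(ctrain)
    test_costs.append(ctest)

    # gradients (computed before the updates so W2 is the pre-update value)
    delta2 = pYtrain - Ytrain_ind
    dZ = delta2.dot(W2.T) * (1 - Ztrain * Ztrain)  # tanh derivative

    # gradient descent on the cross-entropy
    W2 -= learning_rate * Ztrain.T.dot(delta2)
    b2 -= learning_rate * delta2.sum(axis=0)
    W1 -= learning_rate * Xtrain.T.dot(dZ)
    b1 -= learning_rate * dZ.sum(axis=0)

    if i % 1000 == 0:
        print(i, ctrain, ctest)

plt.plot(train_costs, label='train cost')
plt.plot(test_costs, label='test cost')
plt.legend()
plt.show()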
def main():
    Xtrain, Ytrain, Xtest, Ytest = MNISTData().loadFlatData()
    Xtrain, Ytrain = shuffle(Xtrain, Ytrain)
    Xtest, Ytest = shuffle(Xtest, Ytest)
    Ytrain_ind = y2indicator(Ytrain)
    Ytest_ind = y2indicator(Ytest)

    max_iter = 20
    print_period = 10

    N, D = Xtrain.shape
    batch_sz = 500
    n_batches = N // batch_sz

    M1 = 1000
    M2 = 500
    K = 10
    W1_init, b1_init = init_weight_and_biases(D, M1)
    W2_init, b2_init = init_weight_and_biases(M1, M2)
    W3_init, b3_init = init_weight_and_biases(M2, K)

    # define tensorflow vars and expressions
    X = tf.placeholder(tf.float32, shape=[None, D], name='X')
    T = tf.placeholder(tf.float32, shape=[None, K], name='T')
    W1 = tf.Variable(W1_init.astype(np.float32))
    b1 = tf.Variable(b1_init.astype(np.float32))
    W2 = tf.Variable(W2_init.astype(np.float32))
    b2 = tf.Variable(b2_init.astype(np.float32))
    W3 = tf.Variable(W3_init.astype(np.float32))
    b3 = tf.Variable(b3_init.astype(np.float32))

    Z1 = tf.nn.relu(tf.matmul(X, W1) + b1)
    Z2 = tf.nn.relu(tf.matmul(Z1, W2) + b2)
    Yish = tf.matmul(Z2, W3) + b3

    cost = tf.reduce_sum(tf.nn.softmax_cross_entropy_with_logits(logits=Yish, labels=T))

    train_op = tf.train.RMSPropOptimizer(0.0001, decay=0.99, momentum=0.9).minimize(cost)

    # used for error rate prediction
    predict_op = tf.argmax(Yish, 1)

    LL = []
    init = tf.global_variables_initializer()
    with tf.Session() as session:
        session.run(init)

        for i in range(max_iter):
            for j in range(n_batches):
                Xbatch = Xtrain[j * batch_sz:(j * batch_sz + batch_sz)]
                Ybatch = Ytrain_ind[j * batch_sz:(j * batch_sz + batch_sz)]

                session.run(train_op, feed_dict={X: Xbatch, T: Ybatch})
                if j % print_period == 0:
                    test_cost = session.run(cost, feed_dict={X: Xtest, T: Ytest_ind})
                    prediction = session.run(predict_op, feed_dict={X: Xtest, T: Ytest_ind})
                    err = error_rate(prediction, Ytest)
                    print("Cost / err at iteration i=%d, j=%d: %.3f / %.3f" % (i, j, test_cost, err))
                    LL.append(test_cost)

    plt.plot(LL)
    plt.show()
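This main() also uses y2indicator and error_rate, which are defined elsewhere. A sketch of what they presumably do, assuming integer class labels:

import numpy as np

def y2indicator(y):
    # one-hot encode integer labels: shape (N,) -> (N, K)
    N = len(y)
    K = len(set(y))
    ind = np.zeros((N, K), dtype=np.float32)
    for i in range(N):
        ind[i, int(y[i])] = 1
    return ind

def error_rate(p, t):
    # fraction of predictions that differ from the targets
    return np.mean(p != t)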
X, Y = get_data()
X, Y = shuffle(X, Y)
Y = Y.astype(np.int32)
D = X.shape[1]
K = len(set(Y))

# split into train and test sets
Xtrain = X[:-100]
Ytrain = Y[:-100]
Ytrain_ind = y2indicator(Ytrain)
Xtest = X[-100:]
Ytest = Y[-100:]
Ytest_ind = y2indicator(Ytest)

# initialize weights
W, b = init_weight_and_biases(D, K)

def forward(X, W, b):
    return softmax(X.dot(W) + b)

def cross_entropy(T, pY):
    return -np.mean(T * np.log(pY))

train_costs = []
test_costs = []
learning_rate = 0.001
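Again only the setup is shown; a full-batch gradient-descent loop like the sketch below would typically complete this logistic-regression script (the iteration count and print period are assumptions).

for i in range(10000):
    pYtrain = forward(Xtrain, W, b)
    pYtest = forward(Xtest, W, b)

    ctrain = cross_entropy(Ytrain_ind, pYtrain)
    ctest = cross_entropy(Ytest_ind, pYtest)
    train_costs.append(ctrain)
    test_costs.append(ctest)

    # gradient descent on the cross-entropy of the softmax output
    W -= learning_rate * Xtrain.T.dot(pYtrain - Ytrain_ind)
    b -= learning_rate * (pYtrain - Ytrain_ind).sum(axis=0)

    if i % 1000 == 0:
        print(i, ctrain, ctest)

print("Final train classification rate:", np.mean(np.argmax(pYtrain, axis=1) == Ytrain))
print("Final test classification rate:", np.mean(np.argmax(pYtest, axis=1) == Ytest))

plt.plot(train_costs, label='train cost')
plt.plot(test_costs, label='test cost')
plt.legend()
plt.show()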
def fit(self, X, Y, lr=10e-4, mu=0.99, reg=10e-4, decay=0.99999, eps=10e-3, batch_sz=30, epochs=3, show_fig=True):
    lr = np.float32(lr)
    mu = np.float32(mu)
    reg = np.float32(reg)
    decay = np.float32(decay)
    eps = np.float32(eps)
    K = len(set(Y))

    # make a validation set
    X, Y = shuffle(X, Y)
    X = X.astype(np.float32)
    Y = y2indicator(Y).astype(np.float32)
    Xvalid, Yvalid = X[-1000:], Y[-1000:]
    X, Y = X[:-1000], Y[:-1000]
    Yvalid_flat = np.argmax(Yvalid, axis=1)  # for calculating error rate

    # initialize convpool layers
    N, width, height, c = X.shape
    mi = c
    outw = width
    outh = height
    self.convpool_layers = []
    for mo, fw, fh in self.convpool_layer_sizes:
        layer = ConvPoolLayer(mi, mo, fw, fh)
        self.convpool_layers.append(layer)
        outw = outw // 2  # integer division so the flattened size below stays an int
        outh = outh // 2
        mi = mo

    # initialize mlp layers
    self.hidden_layers = []
    # size must be same as output of last convpool layer
    M1 = self.convpool_layer_sizes[-1][0] * outw * outh
    count = 0
    for M2 in self.hidden_layer_sizes:
        h = HiddenLayer(M1, M2, count)
        self.hidden_layers.append(h)
        M1 = M2
        count += 1

    # logistic regression layer
    W, b = init_weight_and_biases(M1, K)
    self.W = tf.Variable(W.astype(np.float32), name='W_logreg')
    self.b = tf.Variable(b.astype(np.float32), name='b_logreg')

    # collect params for later use
    self.params = [self.W, self.b]
    for h in self.convpool_layers:
        self.params += h.params
    for h in self.hidden_layers:
        self.params += h.params

    # set up tensorflow functions and variables
    tfX = tf.placeholder(tf.float32, shape=(None, width, height, c), name='X')
    tfY = tf.placeholder(tf.float32, shape=(None, K), name='Y')
    act = self.forward(tfX)

    rcost = reg * sum([tf.nn.l2_loss(p) for p in self.params])
    cost = tf.reduce_mean(
        tf.nn.softmax_cross_entropy_with_logits(
            logits=act,
            labels=tfY
        )
    ) + rcost
    prediction = self.predict(tfX)

    train_op = tf.train.RMSPropOptimizer(lr, decay=decay, momentum=mu).minimize(cost)

    n_batches = N // batch_sz
    costs = []
    init = tf.global_variables_initializer()
    with tf.Session() as session:
        session.run(init)
        for i in range(epochs):
            X, Y = shuffle(X, Y)
            for j in range(n_batches):
                Xbatch = X[j * batch_sz:(j * batch_sz + batch_sz)]
                Ybatch = Y[j * batch_sz:(j * batch_sz + batch_sz)]

                session.run(train_op, feed_dict={tfX: Xbatch, tfY: Ybatch})

                if j % 20 == 0:
                    c = session.run(cost, feed_dict={tfX: Xvalid, tfY: Yvalid})
                    costs.append(c)

                    p = session.run(prediction, feed_dict={tfX: Xvalid, tfY: Yvalid})
                    e = error_rate(Yvalid_flat, p)
                    print("i:", i, "j:", j, "nb:", n_batches, "cost:", c, "error rate:", e)

    if show_fig:
        plt.plot(costs)
        plt.show()
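fit() builds ConvPoolLayer objects whose definition is not shown here. A sketch consistent with the outw // 2 size arithmetic above (a 'SAME'-padded convolution followed by 2x2 max-pooling) might look like this; the ReLU activation and the initialization scaling are assumptions.

import numpy as np
import tensorflow as tf

class ConvPoolLayer(object):
    def __init__(self, mi, mo, fw=5, fh=5, poolsz=(2, 2)):
        # filter shape for tf.nn.conv2d is (height, width, in_channels, out_channels)
        sz = (fh, fw, mi, mo)
        W0 = np.random.randn(*sz) / np.sqrt(fh * fw * mi)  # scaling is an assumption
        self.W = tf.Variable(W0.astype(np.float32))
        b0 = np.zeros(mo, dtype=np.float32)
        self.b = tf.Variable(b0)
        self.poolsz = poolsz
        self.params = [self.W, self.b]

    def forward(self, X):
        conv_out = tf.nn.conv2d(X, self.W, strides=[1, 1, 1, 1], padding='SAME')
        conv_out = tf.nn.bias_add(conv_out, self.b)
        ph, pw = self.poolsz
        # 'SAME' conv + 2x2 max-pool halves each spatial dim, matching outw // 2 in fit()
        pool_out = tf.nn.max_pool(
            conv_out,
            ksize=[1, ph, pw, 1],
            strides=[1, ph, pw, 1],
            padding='SAME'
        )
        return tf.nn.relu(pool_out)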
def fit(self, X, Y, lr=10e-5, mu=0.99, reg=10e-7, decay=0.99999, eps=10e-3, batch_sz=30, epochs=100, show_fig=True):
    lr = np.float32(lr)
    mu = np.float32(mu)
    reg = np.float32(reg)
    decay = np.float32(decay)
    eps = np.float32(eps)

    # make a validation set
    X, Y = shuffle(X, Y)
    X = X.astype(np.float32)
    Y = Y.astype(np.int32)
    Xvalid, Yvalid = X[-1000:], Y[-1000:]
    X, Y = X[:-1000], Y[:-1000]

    # initialize convpool layers
    N, c, width, height = X.shape
    mi = c
    outw = width
    outh = height
    self.convpool_layers = []
    for mo, fw, fh in self.convpool_layer_sizes:
        layer = ConvPoolLayer(mi, mo, fw, fh)
        self.convpool_layers.append(layer)
        outw = (outw - fw + 1) // 2  # integer division so the flattened size below stays an int
        outh = (outh - fh + 1) // 2
        mi = mo

    # initialize mlp layers
    K = len(set(Y))
    self.hidden_layers = []
    # size must be same as output of last convpool layer
    M1 = self.convpool_layer_sizes[-1][0] * outw * outh
    count = 0
    for M2 in self.hidden_layer_sizes:
        h = HiddenLayer(M1, M2, count)
        self.hidden_layers.append(h)
        M1 = M2
        count += 1

    # logistic regression layer
    W, b = init_weight_and_biases(M1, K)
    self.W = theano.shared(W, 'W_logreg')
    self.b = theano.shared(b, 'b_logreg')

    # collect params for later use
    self.params = [self.W, self.b]
    for layer in self.convpool_layers:
        self.params += layer.params
    for h in self.hidden_layers:
        self.params += h.params

    # for momentum
    dparams = [theano.shared(np.zeros(p.get_value().shape, dtype=np.float32)) for p in self.params]

    # for rmsprop
    cache = [theano.shared(np.zeros(p.get_value().shape, dtype=np.float32)) for p in self.params]

    # set up theano functions and variables
    thX = T.tensor4('X', dtype='float32')
    thY = T.ivector('Y')
    pY = self.forward(thX)

    rcost = reg * T.sum([(p * p).sum() for p in self.params])
    cost = -T.mean(T.log(pY[T.arange(thY.shape[0]), thY])) + rcost
    prediction = self.th_predict(thX)

    cost_predict_op = theano.function(inputs=[thX, thY], outputs=[cost, prediction])

    # rmsprop + momentum updates (disabled):
    # updates = [
    #     (c, decay*c + (np.float32(1)-decay)*T.grad(cost, p)*T.grad(cost, p)) for p, c in zip(self.params, cache)
    # ] + [
    #     (p, p + mu*dp - lr*T.grad(cost, p)/T.sqrt(c + eps)) for p, c, dp in zip(self.params, cache, dparams)
    # ] + [
    #     (dp, mu*dp - lr*T.grad(cost, p)/T.sqrt(c + eps)) for p, c, dp in zip(self.params, cache, dparams)
    # ]

    # momentum only
    updates = [
        (p, p + mu*dp - lr*T.grad(cost, p)) for p, dp in zip(self.params, dparams)
    ] + [
        (dp, mu*dp - lr*T.grad(cost, p)) for p, dp in zip(self.params, dparams)
    ]
    train_op = theano.function(
        inputs=[thX, thY],
        updates=updates
    )

    n_batches = N // batch_sz
    costs = []
    for i in range(epochs):
        X, Y = shuffle(X, Y)
        for j in range(n_batches):
            Xbatch = X[j*batch_sz:(j*batch_sz+batch_sz)]
            Ybatch = Y[j*batch_sz:(j*batch_sz+batch_sz)]

            train_op(Xbatch, Ybatch)

            if j % 20 == 0:
                c, p = cost_predict_op(Xvalid, Yvalid)
                costs.append(c)
                e = error_rate(Yvalid, p)
                print("i:", i, "j:", j, "nb:", n_batches, "cost:", c, "error rate:", e)

    if show_fig:
        plt.plot(costs)
        plt.show()
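The Theano version's ConvPoolLayer is also defined elsewhere. Given the (outw - fw + 1) // 2 arithmetic in fit(), it presumably performs a 'valid' convolution followed by 2x2 max-pooling, roughly as sketched below; the ReLU activation and the initialization scaling are assumptions, and older Theano versions spell the pool-size argument ds= instead of ws=.

import numpy as np
import theano
import theano.tensor as T
from theano.tensor.nnet import conv2d
from theano.tensor.signal.pool import pool_2d

class ConvPoolLayer(object):
    def __init__(self, mi, mo, fw=5, fh=5, poolsz=(2, 2)):
        # filter shape for theano conv2d is (out_maps, in_maps, filter_height, filter_width)
        sz = (mo, mi, fh, fw)
        W0 = np.random.randn(*sz) / np.sqrt(mi * fh * fw)  # scaling is an assumption
        self.W = theano.shared(W0.astype(np.float32), 'W_conv')
        b0 = np.zeros(mo, dtype=np.float32)
        self.b = theano.shared(b0, 'b_conv')
        self.poolsz = poolsz
        self.params = [self.W, self.b]

    def forward(self, X):
        # 'valid' convolution followed by 2x2 max-pooling,
        # matching outw = (outw - fw + 1) // 2 in fit()
        conv_out = conv2d(input=X, filters=self.W)
        pool_out = pool_2d(
            input=conv_out,
            ws=self.poolsz,   # 'ds=' in older Theano versions
            ignore_border=True
        )
        return T.nnet.relu(pool_out + self.b.dimshuffle('x', 0, 'x', 'x'))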