def __init__(self, V, D, K, activation):
    self.D = D
    self.f = activation

    # word embedding
    We = init_weight(V, D)

    # linear terms
    W1 = init_weight(D, D)
    W2 = init_weight(D, D)

    # bias
    bh = np.zeros(D)

    # output layer
    Wo = init_weight(D, K)
    bo = np.zeros(K)

    # make them tensorflow variables
    self.We = tf.Variable(We.astype(np.float32))
    self.W1 = tf.Variable(W1.astype(np.float32))
    self.W2 = tf.Variable(W2.astype(np.float32))
    self.bh = tf.Variable(bh.astype(np.float32))
    self.Wo = tf.Variable(Wo.astype(np.float32))
    self.bo = tf.Variable(bo.astype(np.float32))
    self.params = [self.We, self.W1, self.W2, self.Wo]
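# These snippets rely on an init_weight helper that isn't shown in this
# section. A minimal sketch consistent with how it's called here (two
# dimension arguments, a 2-D array back, cast to float32 by the caller);
# the exact scaling scheme is an assumption. Note that the util.init_weight
# used in the test fixture below has a different signature (it takes an rng
# and returns a weight/bias pair).
import numpy as np

def init_weight(Mi, Mo):
    # Gaussian weights scaled by fan-in + fan-out to keep activations in range
    return np.random.randn(Mi, Mo) / np.sqrt(Mi + Mo)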
def setUp(self):
    rng = np.random.RandomState(0)
    init_w_e, init_b_e = util.init_weight(rng, self.n_in, self.n_hidden)
    init_w_d, init_b_d = util.init_weight(rng, self.n_hidden, self.n_in)
    self.w_e.set_value(init_w_e, borrow=True)
    self.b_e.set_value(init_b_e, borrow=True)
    self.w_d.set_value(init_w_d, borrow=True)
    self.b_d.set_value(init_b_d, borrow=True)
def fit(self, X, epochs=500, show_fig=False):
    N = len(X)
    D = self.D
    M = self.M
    V = self.V

    # initial weights
    We = init_weight(V, D).astype(np.float32)
    Wx = init_weight(D, M).astype(np.float32)
    Wh = init_weight(M, M).astype(np.float32)
    bh = np.zeros(M).astype(np.float32)
    h0 = np.zeros(M).astype(np.float32)
    Wo = init_weight(M, V).astype(np.float32)
    bo = np.zeros(V).astype(np.float32)

    # build tensorflow functions
    self.build(We, Wx, Wh, bh, h0, Wo, bo)

    # sentence input:
    # [START, w1, w2, ..., wn]
    # sentence target:
    # [w1, w2, w3, ..., END]

    costs = []
    n_total = sum((len(sentence) + 1) for sentence in X)
    for i in range(epochs):
        X = shuffle(X)
        n_correct = 0
        cost = 0
        for j in range(N):
            # problem! sequences ending in the END token are overrepresented,
            # so generated lines will be very short;
            # we will try to fix this in a later iteration
            # BAD! magic numbers 0 and 1...
            input_sequence = [0] + X[j]
            output_sequence = X[j] + [1]  # we set 0 to START and 1 to END

            _, c, p = self.session.run(
                (self.train_op, self.cost, self.predict_op),
                feed_dict={self.tfX: input_sequence, self.tfY: output_sequence}
            )
            cost += c
            for pj, xj in zip(p, output_sequence):
                if pj == xj:
                    n_correct += 1
        print("i:", i, "cost:", cost, "correct rate:", (float(n_correct)/n_total))
        costs.append(cost)

    if show_fig:
        plt.plot(costs)
        plt.show()
def __init__(self, M1, M2, an_id):
    self.id = an_id
    self.M1 = M1
    self.M2 = M2
    W = init_weight(M1, M2)
    b = np.zeros(M2)
    self.W = theano.shared(W, 'W_%s' % self.id)
    self.b = theano.shared(b, 'b_%s' % self.id)
    self.params = [self.W, self.b]
def get_param(name, n_in, n_out, params, rng):
    w_name = "w_" + name
    b_name = "b_" + name
    if params is not None and w_name in params:
        assert b_name in params
        init_w = params[w_name]
        init_b = params[b_name]
    else:
        init_w, init_b = util.init_weight(rng, n_in, n_out)
    w = theano.shared(name=w_name, borrow=True, value=init_w)
    b = theano.shared(name=b_name, borrow=True, value=init_b)
    return w, b
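# For context, a hypothetical call site: with params=None the weights come
# from a fresh util.init_weight draw, while a dict-like object (for example
# a loaded .npz archive, assumed here) restores saved values.
rng = np.random.RandomState(0)
w, b = get_param("encoder", 784, 300, params=None, rng=rng)                    # fresh init
w2, b2 = get_param("encoder", 784, 300, params=np.load("model.npz"), rng=rng)  # restore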
def __init__(self, Mi, Mo, activation):
    self.Mi = Mi
    self.Mo = Mo
    self.f = activation

    # numpy init
    Wxr = init_weight(Mi, Mo)
    Whr = init_weight(Mo, Mo)
    br = np.zeros(Mo)
    Wxz = init_weight(Mi, Mo)
    Whz = init_weight(Mo, Mo)
    bz = np.zeros(Mo)
    Wxh = init_weight(Mi, Mo)
    Whh = init_weight(Mo, Mo)
    bh = np.zeros(Mo)
    h0 = np.zeros(Mo)

    # theano vars
    self.Wxr = theano.shared(Wxr)
    self.Whr = theano.shared(Whr)
    self.br = theano.shared(br)
    self.Wxz = theano.shared(Wxz)
    self.Whz = theano.shared(Whz)
    self.bz = theano.shared(bz)
    self.Wxh = theano.shared(Wxh)
    self.Whh = theano.shared(Whh)
    self.bh = theano.shared(bh)
    self.h0 = theano.shared(h0)
    self.params = [
        self.Wxr, self.Whr, self.br,
        self.Wxz, self.Whz, self.bz,
        self.Wxh, self.Whh, self.bh,
        self.h0,
    ]
def __init__(self, Mi, Mo, activation):
    self.Mi = Mi
    self.Mo = Mo
    self.f = activation

    # x(t) to r(t) gate
    Wxr = init_weight(Mi, Mo)
    # h(t) to r(t) gate
    Whr = init_weight(Mo, Mo)
    # bias to r(t) gate
    br = np.zeros(Mo)
    # x(t) to z(t) gate
    Wxz = init_weight(Mi, Mo)
    # h(t) to z(t) gate
    Whz = init_weight(Mo, Mo)
    # bias to z(t) gate
    bz = np.zeros(Mo)
    # x(t) to h(t) gate
    Wxh = init_weight(Mi, Mo)
    # h(t-1) to h(t) gate
    Whh = init_weight(Mo, Mo)
    # bias to h(t) gate
    bh = np.zeros(Mo)
    # initial hidden state
    h0 = np.zeros(Mo)

    # create theano variables
    self.Wxr = theano.shared(Wxr)
    self.Whr = theano.shared(Whr)
    self.br = theano.shared(br)
    self.Wxz = theano.shared(Wxz)
    self.Whz = theano.shared(Whz)
    self.bz = theano.shared(bz)
    self.Wxh = theano.shared(Wxh)
    self.Whh = theano.shared(Whh)
    self.bh = theano.shared(bh)
    self.h0 = theano.shared(h0)
    self.params = [
        self.Wxr, self.Whr, self.br,
        self.Wxz, self.Whz, self.bz,
        self.Wxh, self.Whh, self.bh,
        self.h0,
    ]
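# The class above only stores the parameters; the forward pass is defined
# elsewhere. A sketch of what an output method could look like, assuming the
# standard GRU equations, a T x Mi input sequence, and the same theano/T
# imports used throughout this section.
def output(self, x):
    def recurrence(x_t, h_t1):
        r = T.nnet.sigmoid(x_t.dot(self.Wxr) + h_t1.dot(self.Whr) + self.br)   # reset gate
        z = T.nnet.sigmoid(x_t.dot(self.Wxz) + h_t1.dot(self.Whz) + self.bz)   # update gate
        hhat = self.f(x_t.dot(self.Wxh) + (r * h_t1).dot(self.Whh) + self.bh)  # candidate
        return (1 - z) * h_t1 + z * hhat

    h, _ = theano.scan(
        fn=recurrence,
        sequences=x,
        outputs_info=[self.h0],
        n_steps=x.shape[0],
    )
    return h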
def fit(self, X, Y, learning_rate=10e-3, mu=0.99, reg=10e-12, eps=10e-10,
        epochs=400, batch_sz=20, print_period=1, show_fig=False):
    Y = Y.astype(np.int32)

    # initialize hidden layers
    N, D = X.shape
    K = len(set(Y))
    self.hidden_layers = []
    M1 = D
    count = 0
    for M2 in self.hidden_layer_sizes:
        h = HiddenLayer(M1, M2, count)
        self.hidden_layers.append(h)
        M1 = M2
        count += 1
    W = init_weight(M1, K)
    b = np.zeros(K)
    self.W = theano.shared(W, 'W_logreg')
    self.b = theano.shared(b, 'b_logreg')

    # collect params for later use
    self.params = [self.W, self.b]
    for h in self.hidden_layers:
        self.params += h.params

    # for momentum
    dparams = [theano.shared(np.zeros(p.get_value().shape)) for p in self.params]
    # for rmsprop
    cache = [theano.shared(np.zeros(p.get_value().shape)) for p in self.params]

    # set up theano functions and variables
    thX = T.matrix('X')
    thY = T.ivector('Y')
    pY = self.forward(thX)

    rcost = reg * T.sum([(p * p).sum() for p in self.params])
    cost = -T.mean(T.log(pY[T.arange(thY.shape[0]), thY])) + rcost
    prediction = self.predict(thX)
    grads = T.grad(cost, self.params)

    # momentum only
    updates = [
        (p, p + mu * dp - learning_rate * g) for p, dp, g in zip(self.params, dparams, grads)
    ] + [
        (dp, mu * dp - learning_rate * g) for dp, g in zip(dparams, grads)
    ]

    train_op = theano.function(
        inputs=[thX, thY],
        outputs=[cost, prediction],
        updates=updates,
    )

    # integer division so n_batches works as a range bound in Python 3
    n_batches = N // batch_sz
    costs = []
    for i in range(epochs):
        X, Y = shuffle(X, Y)
        for j in range(n_batches):
            Xbatch = X[j * batch_sz:(j * batch_sz + batch_sz)]
            Ybatch = Y[j * batch_sz:(j * batch_sz + batch_sz)]

            c, p = train_op(Xbatch, Ybatch)
            if j % print_period == 0:
                costs.append(c)
                e = np.mean(Ybatch != p)
                print("i:", i, "j:", j, "nb:", n_batches, "cost:", c, "error rate:", e)

    if show_fig:
        plt.plot(costs)
        plt.show()
def fit(self, trees, test_trees, reg=1e-3, epochs=8, train_inner_nodes=False):
    D = self.D
    V = self.V
    K = self.K
    N = len(trees)

    We = init_weight(V, D)
    W11 = np.random.randn(D, D, D) / np.sqrt(3*D)
    W22 = np.random.randn(D, D, D) / np.sqrt(3*D)
    W12 = np.random.randn(D, D, D) / np.sqrt(3*D)
    W1 = init_weight(D, D)
    W2 = init_weight(D, D)
    bh = np.zeros(D)
    Wo = init_weight(D, K)
    bo = np.zeros(K)

    self.We = tf.Variable(We.astype(np.float32))
    self.W11 = tf.Variable(W11.astype(np.float32))
    self.W22 = tf.Variable(W22.astype(np.float32))
    self.W12 = tf.Variable(W12.astype(np.float32))
    self.W1 = tf.Variable(W1.astype(np.float32))
    self.W2 = tf.Variable(W2.astype(np.float32))
    self.bh = tf.Variable(bh.astype(np.float32))
    self.Wo = tf.Variable(Wo.astype(np.float32))
    self.bo = tf.Variable(bo.astype(np.float32))
    self.weights = [self.We, self.W11, self.W22, self.W12, self.W1, self.W2, self.Wo]

    words = tf.compat.v1.placeholder(tf.int32, shape=(None,), name='words')
    left_children = tf.compat.v1.placeholder(tf.int32, shape=(None,), name='left_children')
    right_children = tf.compat.v1.placeholder(tf.int32, shape=(None,), name='right_children')
    labels = tf.compat.v1.placeholder(tf.int32, shape=(None,), name='labels')

    # save for later
    self.words = words
    self.left = left_children
    self.right = right_children
    self.labels = labels

    def dot1(a, B):
        return tf.tensordot(a, B, axes=[[0], [1]])

    def dot2(B, a):
        return tf.tensordot(B, a, axes=[[1], [0]])

    def recursive_net_transform(hiddens, n):
        h_left = hiddens.read(left_children[n])
        h_right = hiddens.read(right_children[n])
        return self.f(
            dot1(h_left, dot2(self.W11, h_left)) +
            dot1(h_right, dot2(self.W22, h_right)) +
            dot1(h_left, dot2(self.W12, h_right)) +
            dot1(h_left, self.W1) +
            dot1(h_right, self.W2) +
            self.bh
        )

    def recurrence(hiddens, n):
        w = words[n]
        # any non-word will have index -1
        h_n = tf.cond(
            pred=w >= 0,
            true_fn=lambda: tf.nn.embedding_lookup(params=self.We, ids=w),
            false_fn=lambda: recursive_net_transform(hiddens, n)
        )
        hiddens = hiddens.write(n, h_n)
        n = tf.add(n, 1)
        return hiddens, n

    def condition(hiddens, n):
        # loop should continue while n < len(words)
        return tf.less(n, tf.shape(input=words)[0])

    hiddens = tf.TensorArray(
        tf.float32,
        size=0,
        dynamic_size=True,
        clear_after_read=False,
        infer_shape=False
    )

    hiddens, _ = tf.while_loop(
        cond=condition,
        body=recurrence,
        loop_vars=[hiddens, tf.constant(0)],
        parallel_iterations=1
    )
    h = hiddens.stack()
    logits = tf.matmul(h, self.Wo) + self.bo

    prediction_op = tf.argmax(input=logits, axis=1)
    self.prediction_op = prediction_op

    rcost = reg*sum(tf.nn.l2_loss(p) for p in self.weights)
    if train_inner_nodes:
        # filter out -1s
        labeled_indices = tf.compat.v1.where(labels >= 0)

        cost_op = tf.reduce_mean(
            input_tensor=tf.nn.sparse_softmax_cross_entropy_with_logits(
                logits=tf.gather(logits, labeled_indices),
                labels=tf.gather(labels, labeled_indices),
            )
        ) + rcost
    else:
        cost_op = tf.reduce_mean(
            input_tensor=tf.nn.sparse_softmax_cross_entropy_with_logits(
                logits=logits[-1],
                labels=labels[-1],
            )
        ) + rcost

    train_op = tf.compat.v1.train.AdagradOptimizer(learning_rate=8e-3).minimize(cost_op)
    # train_op = tf.train.MomentumOptimizer(learning_rate=8e-3, momentum=0.9).minimize(cost_op)

    # NOTE: If you're using GPU, InteractiveSession breaks
    # AdagradOptimizer and some other optimizers;
    # change to tf.Session() if so.
    self.session = tf.compat.v1.Session()
    init_op = tf.compat.v1.global_variables_initializer()
    self.session.run(init_op)

    costs = []
    sequence_indexes = list(range(N))
    for i in range(epochs):
        t0 = datetime.now()
        sequence_indexes = shuffle(sequence_indexes)
        n_correct = 0
        n_total = 0
        cost = 0
        it = 0
        for j in sequence_indexes:
            words_, left, right, lab = trees[j]
            c, p, _ = self.session.run(
                (cost_op, prediction_op, train_op),
                feed_dict={
                    words: words_,
                    left_children: left,
                    right_children: right,
                    labels: lab
                }
            )
            if np.isnan(c):
                print("Cost is nan! Let's stop here. "
                      "Why don't you try decreasing the learning rate?")
                for p in self.weights:
                    print(self.session.run(p).sum())
                exit()
            cost += c
            n_correct += (p[-1] == lab[-1])
            n_total += 1

            it += 1
            if it % 10 == 0:
                sys.stdout.write(
                    "j/N: %d/%d correct rate so far: %f, cost so far: %f\r" %
                    (it, N, float(n_correct)/n_total, cost)
                )
                sys.stdout.flush()

        # calculate the test score
        n_test_correct = 0
        n_test_total = 0
        for words_, left, right, lab in test_trees:
            p = self.session.run(prediction_op, feed_dict={
                words: words_,
                left_children: left,
                right_children: right,
                labels: lab
            })
            n_test_correct += (p[-1] == lab[-1])
            n_test_total += 1

        print(
            "i:", i, "cost:", cost,
            "train acc:", float(n_correct)/n_total,
            "test acc:", float(n_test_correct)/n_test_total,
            "time for epoch:", (datetime.now() - t0)
        )
        costs.append(cost)

    plt.plot(costs)
    plt.show()
def fit(self, X, learning_rate=0.5, mu=0.99, epochs=1, batch_sz=100, show_fig=False):
    # cast to float
    mu = np.float32(mu)
    learning_rate = np.float32(learning_rate)
    X = X.astype(np.float32)

    N, D = X.shape
    n_batches = N // batch_sz

    # define shared (all weights in NN)
    W0 = init_weight((D, self.M))
    self.W = theano.shared(W0, 'W_%s' % self.id)
    self.bh = theano.shared(np.zeros(self.M, dtype=np.float32), 'bh_%s' % self.id)
    # important note: to keep the type fixed at TensorType(float32, matrix),
    # remember to also cast the np.array's internal dtype to np.float32
    self.bo = theano.shared(np.zeros(D, dtype=np.float32), 'bo_%s' % self.id)
    self.params = [self.W, self.bh, self.bo]  # keep tracking all parameters
    self.forward_params = [self.W, self.bh]

    self.dW = theano.shared(np.zeros(W0.shape, dtype=np.float32), 'dW_%s' % self.id)
    self.dbh = theano.shared(np.zeros(self.M, dtype=np.float32), 'dbh_%s' % self.id)
    self.dbo = theano.shared(np.zeros(D, dtype=np.float32), 'dbo_%s' % self.id)
    self.dparams = [self.dW, self.dbh, self.dbo]
    self.forward_dparams = [self.dW, self.dbh]

    # define matrix (training data)
    X_in = T.matrix('X_%s' % self.id, dtype='float32')
    X_hat = self.forward_output(X_in)

    # attach it to the object so it can be used later;
    # must be sigmoid because the output is also a sigmoid
    H = T.nnet.sigmoid(X_in.dot(self.W) + self.bh)
    # expose the hidden layer values as a theano function (for plotting and
    # for DNN training); input is a numpy array, output is a numpy array
    self.hidden_op = theano.function(
        inputs=[X_in],
        outputs=H,
    )

    # save this for later so we can call it to
    # create reconstructions of input
    self.predict = theano.function(
        inputs=[X_in],
        outputs=X_hat,
    )

    # cost = ((X_in - X_hat) * (X_in - X_hat)).sum() / N  # squared error
    cost = -(X_in * T.log(X_hat) + (1 - X_in) * T.log(1 - X_hat)).sum() / N  # cross entropy
    cost_op = theano.function(inputs=[X_in], outputs=cost)

    # updates = [
    #     (p, p + mu*dp - learning_rate*T.grad(cost, p)) for p, dp in zip(self.params, self.dparams)
    # ] + [
    #     (dp, mu*dp - learning_rate*T.grad(cost, p)) for p, dp in zip(self.params, self.dparams)
    # ]
    updates = momentum_updates(cost, self.params, mu, learning_rate)
    train_op = theano.function(
        inputs=[X_in],
        updates=updates,
    )

    costs = []
    print('training autoencoder: %s' % self.id)
    print('epochs to do:', epochs)
    for i in range(epochs):
        print('epoch:', i)
        X = shuffle(X)
        for j in range(n_batches):
            batch = X[j * batch_sz:(j * batch_sz + batch_sz)]
            train_op(batch)
            the_cost = cost_op(batch)
            costs.append(the_cost)
            if j % 10 == 0:
                print('j / n_batches', j, '/', n_batches, 'cost:', the_cost)

    if show_fig:
        plt.plot(costs)
        plt.show()
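# momentum_updates is called above but not defined in this section. A minimal
# sketch that matches the commented-out update rule in the function body
# (one velocity per parameter); assumes the same theano/T imports used
# throughout this section.
def momentum_updates(cost, params, mu, learning_rate):
    dparams = [theano.shared(p.get_value() * 0) for p in params]  # velocities
    grads = T.grad(cost, params)
    return [
        (p, p + mu * dp - learning_rate * g) for p, dp, g in zip(params, dparams, grads)
    ] + [
        (dp, mu * dp - learning_rate * g) for dp, g in zip(dparams, grads)
    ]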
def fit(self, X, Y, learning_rate=10e-1, mu=.99, reg=1.0, activation=T.tanh,
        batch_sz=100, epochs=100, show_fig=False):
    D = X[0].shape[1]
    K = len(set(Y.flatten()))
    N = len(Y)
    M = self.M
    self.f = activation

    # initial weights
    Wx = init_weight(D, M)
    Wh = init_weight(M, M)
    bh = np.zeros(M)
    h0 = np.zeros(M)
    Wo = init_weight(M, K)
    bo = np.zeros(K)

    self.Wx = theano.shared(Wx)
    self.Wh = theano.shared(Wh)
    self.bh = theano.shared(bh)
    self.h0 = theano.shared(h0)
    self.Wo = theano.shared(Wo)
    self.bo = theano.shared(bo)
    self.params = [self.Wx, self.Wh, self.bh, self.Wo, self.bo, self.h0]

    thX = T.fmatrix('X')
    thY = T.ivector('Y')
    thStartPoints = T.ivector('startPoints')

    XW = thX.dot(self.Wx)

    def recurrence(xw_t, is_start, h_t1, h0):
        # returns h(t); if at a sequence boundary, the previous state is h0
        h_t = T.switch(
            T.eq(is_start, 1),
            self.f(xw_t + h0.dot(self.Wh) + self.bh),
            self.f(xw_t + h_t1.dot(self.Wh) + self.bh)
        )
        return h_t

    h, _ = theano.scan(
        fn=recurrence,
        outputs_info=[self.h0],
        sequences=[XW, thStartPoints],
        non_sequences=[self.h0],
        n_steps=XW.shape[0],
    )

    py_x = T.nnet.softmax(h.dot(self.Wo) + self.bo)
    prediction = T.argmax(py_x, axis=1)

    # Note: py_x[T.arange(thY.shape[0]), thY] is advanced indexing. E.g. with
    # thY  = [1, 1, 1, 0] and
    # py_x = [[0.21, 0.79], [0.54, 0.46], [0.54, 0.46], [0.49, 0.51]],
    # py_x[T.arange(thY.shape[0]), thY] ==> py_x[[0, 1, 2, 3], [1, 1, 1, 0]]
    #                                   ==> [0.79, 0.46, 0.46, 0.49],
    # i.e. the probability assigned to the correct class at each step.
    cost = -T.mean(T.log(py_x[T.arange(thY.shape[0]), thY]))
    grads = T.grad(cost, self.params)
    dparams = [theano.shared(p.get_value()*0) for p in self.params]

    updates = [
        (p, p + mu*dp - learning_rate*g) for p, dp, g in zip(self.params, dparams, grads)
    ] + [
        (dp, mu*dp - learning_rate*g) for dp, g in zip(dparams, grads)
    ]

    self.train_op = theano.function(
        inputs=[thX, thY, thStartPoints],
        outputs=[cost, prediction, py_x],
        updates=updates
    )

    costs = []
    n_batches = N // batch_sz
    sequenceLength = X.shape[1]
    startPoints = np.zeros(sequenceLength*batch_sz, dtype=np.int32)
    for b in range(batch_sz):
        startPoints[b*sequenceLength] = 1
    for i in range(epochs):
        X, Y = shuffle(X, Y)
        n_correct = 0
        cost = 0
        for j in range(n_batches):
            Xbatch = X[j*batch_sz:(j+1)*batch_sz].reshape(sequenceLength*batch_sz, D)
            Ybatch = Y[j*batch_sz:(j+1)*batch_sz].reshape(sequenceLength*batch_sz).astype(np.int32)
            c, p, rout = self.train_op(Xbatch, Ybatch, startPoints)
            cost += c
            for b in range(batch_sz):
                idx = sequenceLength*(b + 1) - 1
                if p[idx] == Ybatch[idx]:
                    n_correct += 1
        print("shape y:", rout.shape)
        print("i:", i, "cost:", cost, "classification rate:", (float(n_correct) / N))
        costs.append(cost)

    if show_fig:
        plt.plot(costs)
        plt.show()
def fit(self, X, learning_rate=10e-1, mu=0.99, reg=1.0, activation=T.tanh,
        epochs=500, show_fig=False):
    N = len(X)
    D = self.D
    M = self.M
    V = self.V

    # initial weights
    We = init_weight(V, D)
    Wx = init_weight(D, M)
    Wh = init_weight(M, M)
    bh = np.zeros(M)
    h0 = np.zeros(M)
    Wxz = init_weight(D, M)
    Whz = init_weight(M, M)
    bz = np.zeros(M)
    Wo = init_weight(M, V)
    bo = np.zeros(V)

    thX, thY, py_x, prediction = self.set(We, Wx, Wh, bh, h0, Wxz, Whz, bz, Wo, bo, activation)

    lr = T.scalar('lr')
    cost = -T.mean(T.log(py_x[T.arange(thY.shape[0]), thY]))
    grads = T.grad(cost, self.params)
    dparams = [theano.shared(p.get_value() * 0) for p in self.params]

    updates = [
        (p, p + mu * dp - lr * g) for p, dp, g in zip(self.params, dparams, grads)
    ] + [
        (dp, mu * dp - lr * g) for dp, g in zip(dparams, grads)
    ]

    self.predict_op = theano.function(inputs=[thX], outputs=prediction)
    self.train_op = theano.function(
        inputs=[thX, thY, lr],
        outputs=[cost, prediction],
        updates=updates
    )

    costs = []
    for i in range(epochs):
        X = shuffle(X)
        n_correct = 0
        n_total = 0
        cost = 0
        for j in range(N):
            # don't always add the END token
            if np.random.random() < 0.1:
                input_sequence = [0] + X[j]
                output_sequence = X[j] + [1]
            else:
                input_sequence = [0] + X[j][:-1]
                output_sequence = X[j]
            n_total += len(output_sequence)

            # we set 0 to START and 1 to END
            c, p = self.train_op(input_sequence, output_sequence, learning_rate)
            cost += c
            for pj, xj in zip(p, output_sequence):
                if pj == xj:
                    n_correct += 1
        print("i:", i, "cost:", cost, "correct rate:", (float(n_correct) / n_total))
        if (i + 1) % 500 == 0:
            learning_rate /= 2
        costs.append(cost)

    if show_fig:
        plt.plot(costs)
        plt.show()
def fit(self, X, learning_rate=1e-4, mu=0.99, epochs=10, batch_sz=100,
        show_fig=True, activation=T.nnet.relu, RecurrentUnit=LSTM):
    D = self.D
    V = self.V
    N = len(X)

    We = init_weight(V, D)
    self.hidden_layers = []
    Mi = D
    for Mo in self.hidden_layer_sizes:
        ru = RecurrentUnit(Mi, Mo, activation)
        self.hidden_layers.append(ru)
        Mi = Mo

    Wo = init_weight(Mi, V)
    bo = np.zeros(V)

    self.We = theano.shared(We)
    self.Wo = theano.shared(Wo)
    self.bo = theano.shared(bo)
    self.params = [self.We, self.Wo, self.bo]
    for ru in self.hidden_layers:
        self.params += ru.params

    thX = T.ivector('X')  # will represent multiple batches concatenated
    thY = T.ivector('Y')  # represents next word
    thStartPoints = T.ivector('start_points')

    Z = self.We[thX]
    for ru in self.hidden_layers:
        Z = ru.output(Z, thStartPoints)
    py_x = T.nnet.softmax(Z.dot(self.Wo) + self.bo)

    prediction = T.argmax(py_x, axis=1)

    cost = -T.mean(T.log(py_x[T.arange(thY.shape[0]), thY]))
    grads = T.grad(cost, self.params)
    dparams = [theano.shared(p.get_value() * 0) for p in self.params]

    updates = [
        (p, p + mu * dp - learning_rate * g) for p, dp, g in zip(self.params, dparams, grads)
    ] + [
        (dp, mu * dp - learning_rate * g) for dp, g in zip(dparams, grads)
    ]

    self.train_op = theano.function(
        inputs=[thX, thY, thStartPoints],
        outputs=[cost, prediction],
        updates=updates
    )

    costs = []
    n_batches = N // batch_sz
    for i in range(epochs):
        t0 = datetime.now()
        X = shuffle(X)
        n_correct = 0
        n_total = 0
        cost = 0
        for j in range(n_batches):
            # construct the input and output sequences as a concatenation of
            # multiple input and output sequences; input X should be a list of
            # 2-D arrays or one 3-D array of shape
            # N x T(n) x D - batch size x sequence length x num features,
            # where the sequence length can be variable
            sequenceLengths = []
            input_sequence = []
            output_sequence = []
            for k in range(j * batch_sz, (j + 1) * batch_sz):
                # don't always add the end token
                if np.random.random() < 0.01 or len(X[k]) <= 1:
                    input_sequence += [0] + X[k]
                    output_sequence += X[k] + [1]
                    sequenceLengths.append(len(X[k]) + 1)
                else:
                    input_sequence += [0] + X[k][:-1]
                    output_sequence += X[k]
                    sequenceLengths.append(len(X[k]))
            n_total += len(output_sequence)

            startPoints = np.zeros(len(output_sequence), dtype=np.int32)
            last = 0
            for length in sequenceLengths:
                startPoints[last] = 1
                last += length

            c, p = self.train_op(input_sequence, output_sequence, startPoints)
            cost += c
            for pj, xj in zip(p, output_sequence):
                if pj == xj:
                    n_correct += 1
            if j % 1 == 0:
                sys.stdout.write(
                    "j/n_batches: %d/%d correct rate so far: %f\r" %
                    (j, n_batches, float(n_correct) / n_total)
                )
                sys.stdout.flush()
        print("i:", i, "cost:", cost, "correct rate:", (float(n_correct) / n_total),
              "time for epoch:", (datetime.now() - t0))
        costs.append(cost)

    if show_fig:
        plt.plot(costs)
        plt.show()
def __init__(self, Mi, Mo, activation):
    self.Mi = Mi
    self.Mo = Mo
    self.f = activation

    # numpy init
    Wxi = init_weight(Mi, Mo)
    Whi = init_weight(Mo, Mo)
    Wci = init_weight(Mo, Mo)
    bi = np.zeros(Mo)
    Wxf = init_weight(Mi, Mo)
    Whf = init_weight(Mo, Mo)
    Wcf = init_weight(Mo, Mo)
    bf = np.zeros(Mo)
    Wxc = init_weight(Mi, Mo)
    Whc = init_weight(Mo, Mo)
    bc = np.zeros(Mo)
    Wxo = init_weight(Mi, Mo)
    Who = init_weight(Mo, Mo)
    Wco = init_weight(Mo, Mo)
    bo = np.zeros(Mo)
    c0 = np.zeros(Mo)
    h0 = np.zeros(Mo)

    # theano vars
    self.Wxi = theano.shared(Wxi)
    self.Whi = theano.shared(Whi)
    self.Wci = theano.shared(Wci)
    self.bi = theano.shared(bi)
    self.Wxf = theano.shared(Wxf)
    self.Whf = theano.shared(Whf)
    self.Wcf = theano.shared(Wcf)
    self.bf = theano.shared(bf)
    self.Wxc = theano.shared(Wxc)
    self.Whc = theano.shared(Whc)
    self.bc = theano.shared(bc)
    self.Wxo = theano.shared(Wxo)
    self.Who = theano.shared(Who)
    self.Wco = theano.shared(Wco)
    self.bo = theano.shared(bo)
    self.c0 = theano.shared(c0)
    self.h0 = theano.shared(h0)
    self.params = [
        self.Wxi, self.Whi, self.Wci, self.bi,
        self.Wxf, self.Whf, self.Wcf, self.bf,
        self.Wxc, self.Whc, self.bc,
        self.Wxo, self.Who, self.Wco, self.bo,
        self.c0, self.h0,
    ]
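# As with the GRU above, only the parameters are created here. A sketch of
# the matching forward pass, assuming peephole connections (which the
# Wci/Wcf/Wco matrices suggest), an assumed output method name, and the
# theano/T imports used throughout this section.
def output(self, x):
    def recurrence(x_t, h_t1, c_t1):
        i_t = T.nnet.sigmoid(x_t.dot(self.Wxi) + h_t1.dot(self.Whi) + c_t1.dot(self.Wci) + self.bi)
        f_t = T.nnet.sigmoid(x_t.dot(self.Wxf) + h_t1.dot(self.Whf) + c_t1.dot(self.Wcf) + self.bf)
        c_t = f_t * c_t1 + i_t * T.tanh(x_t.dot(self.Wxc) + h_t1.dot(self.Whc) + self.bc)
        o_t = T.nnet.sigmoid(x_t.dot(self.Wxo) + h_t1.dot(self.Who) + c_t.dot(self.Wco) + self.bo)
        h_t = o_t * T.tanh(c_t)
        return h_t, c_t

    [h, c], _ = theano.scan(
        fn=recurrence,
        sequences=x,
        outputs_info=[self.h0, self.c0],
        n_steps=x.shape[0],
    )
    return h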
def fit(self, X, Y, batch_sz=20, learning_rate=10e-1, mu=0.99,
        activation=tf.nn.sigmoid, epochs=100, show_fig=False):
    N, T, D = X.shape  # X is of size N x T(n) x D
    K = len(set(Y.flatten()))
    M = self.M
    self.f = activation

    # initial weights
    # note: Wx, Wh, bh are all part of the RNN unit and will be created
    # by BasicRNNCell
    Wo = init_weight(M, K).astype(np.float32)
    bo = np.zeros(K, dtype=np.float32)

    # make them tf variables
    self.Wo = tf.Variable(Wo)
    self.bo = tf.Variable(bo)

    # tf Graph input
    tfX = tf.placeholder(tf.float32, shape=(batch_sz, T, D), name='inputs')
    tfY = tf.placeholder(tf.int64, shape=(batch_sz, T), name='targets')

    # turn tfX into a sequence, e.g. T tensors all of size (batch_sz, D)
    sequenceX = x2sequence(tfX, T, D, batch_sz)

    # create the simple rnn unit
    rnn_unit = BasicRNNCell(num_units=self.M, activation=self.f)

    # get rnn cell output
    outputs, states = get_rnn_output(rnn_unit, sequenceX, dtype=tf.float32)

    # outputs are now of size (T, batch_sz, M)
    # so make it (batch_sz, T, M)
    outputs = tf.transpose(outputs, (1, 0, 2))
    outputs = tf.reshape(outputs, (T*batch_sz, M))

    # linear activation, using rnn inner loop last output
    logits = tf.matmul(outputs, self.Wo) + self.bo
    predict_op = tf.argmax(logits, 1)
    targets = tf.reshape(tfY, (T*batch_sz,))

    cost_op = tf.reduce_mean(
        tf.nn.sparse_softmax_cross_entropy_with_logits(logits=logits, labels=targets)
    )
    train_op = tf.train.MomentumOptimizer(learning_rate, momentum=mu).minimize(cost_op)

    costs = []
    n_batches = N // batch_sz

    init = tf.global_variables_initializer()
    with tf.Session() as session:
        session.run(init)
        for i in range(epochs):
            X, Y = shuffle(X, Y)
            n_correct = 0
            cost = 0
            for j in range(n_batches):
                Xbatch = X[j*batch_sz:(j+1)*batch_sz]
                Ybatch = Y[j*batch_sz:(j+1)*batch_sz]
                _, c, p = session.run(
                    [train_op, cost_op, predict_op],
                    feed_dict={tfX: Xbatch, tfY: Ybatch}
                )
                cost += c
                for b in range(batch_sz):
                    idx = (b + 1)*T - 1
                    n_correct += (p[idx] == Ybatch[b][-1])
            if i % 10 == 0:
                print("i:", i, "cost:", cost, "classification rate:", (float(n_correct)/N))
            if n_correct == N:
                print("i:", i, "cost:", cost, "classification rate:", (float(n_correct)/N))
                break
            costs.append(cost)

    if show_fig:
        plt.plot(costs)
        plt.show()
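# x2sequence is referenced above but not shown in this section. A minimal
# sketch of what it could do, turning a (batch_sz, T, D) tensor into the
# list of T tensors of shape (batch_sz, D) that the legacy static-RNN API
# expected; the exact implementation is an assumption.
def x2sequence(x, T, D, batch_sz):
    x = tf.transpose(x, (1, 0, 2))        # (batch_sz, T, D) -> (T, batch_sz, D)
    x = tf.reshape(x, (T * batch_sz, D))  # flatten the time and batch dims
    return tf.split(x, T)                 # T tensors of shape (batch_sz, D)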
def fit(self, trees, test_trees, reg=1e-3, epochs=8, train_inner_nodes=False):
    D = self.D
    V = self.V
    K = self.K
    N = len(trees)

    We = init_weight(V, D)
    W11 = np.random.randn(D, D, D) / np.sqrt(3*D)
    W22 = np.random.randn(D, D, D) / np.sqrt(3*D)
    W12 = np.random.randn(D, D, D) / np.sqrt(3*D)
    W1 = init_weight(D, D)
    W2 = init_weight(D, D)
    bh = np.zeros(D)
    Wo = init_weight(D, K)
    bo = np.zeros(K)

    self.We = tf.Variable(We.astype(np.float32))
    self.W11 = tf.Variable(W11.astype(np.float32))
    self.W22 = tf.Variable(W22.astype(np.float32))
    self.W12 = tf.Variable(W12.astype(np.float32))
    self.W1 = tf.Variable(W1.astype(np.float32))
    self.W2 = tf.Variable(W2.astype(np.float32))
    self.bh = tf.Variable(bh.astype(np.float32))
    self.Wo = tf.Variable(Wo.astype(np.float32))
    self.bo = tf.Variable(bo.astype(np.float32))
    self.weights = [self.We, self.W11, self.W22, self.W12, self.W1, self.W2, self.Wo]

    words = tf.placeholder(tf.int32, shape=(None,), name='words')
    left_children = tf.placeholder(tf.int32, shape=(None,), name='left_children')
    right_children = tf.placeholder(tf.int32, shape=(None,), name='right_children')
    labels = tf.placeholder(tf.int32, shape=(None,), name='labels')

    # save for later
    self.words = words
    self.left = left_children
    self.right = right_children
    self.labels = labels

    def dot1(a, B):
        return tf.tensordot(a, B, axes=[[0], [1]])

    def dot2(B, a):
        return tf.tensordot(B, a, axes=[[1], [0]])

    def recursive_net_transform(hiddens, n):
        h_left = hiddens.read(left_children[n])
        h_right = hiddens.read(right_children[n])
        return self.f(
            dot1(h_left, dot2(self.W11, h_left)) +
            dot1(h_right, dot2(self.W22, h_right)) +
            dot1(h_left, dot2(self.W12, h_right)) +
            dot1(h_left, self.W1) +
            dot1(h_right, self.W2) +
            self.bh
        )

    def recurrence(hiddens, n):
        w = words[n]
        # any non-word will have index -1
        h_n = tf.cond(
            w >= 0,
            lambda: tf.nn.embedding_lookup(self.We, w),
            lambda: recursive_net_transform(hiddens, n)
        )
        hiddens = hiddens.write(n, h_n)
        n = tf.add(n, 1)
        return hiddens, n

    def condition(hiddens, n):
        # loop should continue while n < len(words)
        return tf.less(n, tf.shape(words)[0])

    hiddens = tf.TensorArray(
        tf.float32,
        size=0,
        dynamic_size=True,
        clear_after_read=False,
        infer_shape=False
    )

    hiddens, _ = tf.while_loop(
        condition,
        recurrence,
        [hiddens, tf.constant(0)],
        parallel_iterations=1
    )
    h = hiddens.stack()
    logits = tf.matmul(h, self.Wo) + self.bo

    prediction_op = tf.argmax(logits, axis=1)
    self.prediction_op = prediction_op

    rcost = reg*sum(tf.nn.l2_loss(p) for p in self.weights)
    if train_inner_nodes:
        # filter out -1s
        labeled_indices = tf.where(labels >= 0)

        cost_op = tf.reduce_mean(
            tf.nn.sparse_softmax_cross_entropy_with_logits(
                logits=tf.gather(logits, labeled_indices),
                labels=tf.gather(labels, labeled_indices),
            )
        ) + rcost
    else:
        cost_op = tf.reduce_mean(
            tf.nn.sparse_softmax_cross_entropy_with_logits(
                logits=logits[-1],
                labels=labels[-1],
            )
        ) + rcost

    train_op = tf.train.AdagradOptimizer(learning_rate=8e-3).minimize(cost_op)
    # train_op = tf.train.MomentumOptimizer(learning_rate=8e-3, momentum=0.9).minimize(cost_op)

    # NOTE: If you're using GPU, InteractiveSession breaks
    # AdagradOptimizer and some other optimizers;
    # change to tf.Session() if so.
    self.session = tf.Session()
    init_op = tf.global_variables_initializer()
    self.session.run(init_op)

    costs = []
    sequence_indexes = list(range(N))
    for i in range(epochs):
        t0 = datetime.now()
        sequence_indexes = shuffle(sequence_indexes)
        n_correct = 0
        n_total = 0
        cost = 0
        it = 0
        for j in sequence_indexes:
            words_, left, right, lab = trees[j]
            c, p, _ = self.session.run(
                (cost_op, prediction_op, train_op),
                feed_dict={
                    words: words_,
                    left_children: left,
                    right_children: right,
                    labels: lab
                }
            )
            if np.isnan(c):
                print("Cost is nan! Let's stop here. "
                      "Why don't you try decreasing the learning rate?")
                for p in self.weights:
                    print(self.session.run(p).sum())
                exit()
            cost += c
            n_correct += (p[-1] == lab[-1])
            n_total += 1

            it += 1
            if it % 10 == 0:
                sys.stdout.write(
                    "j/N: %d/%d correct rate so far: %f, cost so far: %f\r" %
                    (it, N, float(n_correct)/n_total, cost)
                )
                sys.stdout.flush()

        # calculate the test score
        n_test_correct = 0
        n_test_total = 0
        for words_, left, right, lab in test_trees:
            p = self.session.run(prediction_op, feed_dict={
                words: words_,
                left_children: left,
                right_children: right,
                labels: lab
            })
            n_test_correct += (p[-1] == lab[-1])
            n_test_total += 1

        print(
            "i:", i, "cost:", cost,
            "train acc:", float(n_correct)/n_total,
            "test acc:", float(n_test_correct)/n_test_total,
            "time for epoch:", (datetime.now() - t0)
        )
        costs.append(cost)

    plt.plot(costs)
    plt.show()
def fit(self, X, learning_rate=10e-1, mu=0.99, reg=1.0, activation=T.tanh,
        epochs=500, show_fig=False):
    N = len(X)
    D = self.D
    M = self.M
    V = self.V
    self.f = activation

    # initial weights
    We = init_weight(V, D)
    Wx = init_weight(D, M)
    Wh = init_weight(M, M)
    bh = np.zeros(M)
    h0 = np.zeros(M)
    Wo = init_weight(M, V)
    bo = np.zeros(V)

    # make them theano shared
    self.We = theano.shared(We)
    self.Wx = theano.shared(Wx)
    self.Wh = theano.shared(Wh)
    self.bh = theano.shared(bh)
    self.h0 = theano.shared(h0)
    self.Wo = theano.shared(Wo)
    self.bo = theano.shared(bo)
    self.params = [self.We, self.Wx, self.Wh, self.bh, self.h0, self.Wo, self.bo]

    thX = T.ivector('X')
    Ei = self.We[thX]  # will be a TxD matrix
    thY = T.ivector('Y')

    # sentence input:
    # [START, w1, w2, ..., wn]
    # sentence target:
    # [w1, w2, w3, ..., END]

    def recurrence(x_t, h_t1):
        # returns h(t), y(t)
        h_t = self.f(x_t.dot(self.Wx) + h_t1.dot(self.Wh) + self.bh)
        y_t = T.nnet.softmax(h_t.dot(self.Wo) + self.bo)
        return h_t, y_t

    [h, y], _ = theano.scan(
        fn=recurrence,
        outputs_info=[self.h0, None],
        sequences=Ei,
        n_steps=Ei.shape[0],
    )

    py_x = y[:, 0, :]
    prediction = T.argmax(py_x, axis=1)

    cost = -T.mean(T.log(py_x[T.arange(thY.shape[0]), thY]))
    grads = T.grad(cost, self.params)
    dparams = [theano.shared(p.get_value()*0) for p in self.params]

    updates = [
        (p, p + mu*dp - learning_rate*g) for p, dp, g in zip(self.params, dparams, grads)
    ] + [
        (dp, mu*dp - learning_rate*g) for dp, g in zip(dparams, grads)
    ]

    self.predict_op = theano.function(inputs=[thX], outputs=prediction)
    self.train_op = theano.function(
        inputs=[thX, thY],
        outputs=[cost, prediction],
        updates=updates
    )

    costs = []
    n_total = sum((len(sentence)+1) for sentence in X)
    for i in range(epochs):
        X = shuffle(X)
        n_correct = 0
        cost = 0
        for j in range(N):
            # problem! sequences ending in the END token are overrepresented,
            # so generated lines will be very short;
            # we will try to fix this in a later iteration
            # BAD! magic numbers 0 and 1...
            input_sequence = [0] + X[j]
            output_sequence = X[j] + [1]  # we set 0 to START and 1 to END

            c, p = self.train_op(input_sequence, output_sequence)
            cost += c
            for pj, xj in zip(p, output_sequence):
                if pj == xj:
                    n_correct += 1
        print("i:", i, "cost:", cost, "correct rate:", (float(n_correct)/n_total))
        costs.append(cost)

    if show_fig:
        plt.plot(costs)
        plt.show()
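# The magic numbers 0 and 1 assume a word-to-index mapping that reserves
# those slots for the START and END tokens, built before fit() is called.
# A hypothetical sketch; raw_sentences is an assumed variable.
word2idx = {'START': 0, 'END': 1}
for sentence in raw_sentences:
    for word in sentence:
        if word not in word2idx:
            word2idx[word] = len(word2idx)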
def fit(self, X, learning_rate=10e-1, mu=0.99, reg=1.0, activation=T.tanh,
        epochs=500, show_fig=False):
    N = len(X)
    D = self.D
    M = self.M
    V = self.V

    # initial weights
    We = init_weight(V, D)
    Wx = init_weight(D, M)
    Wh = init_weight(M, M)
    bh = np.zeros(M)
    h0 = np.zeros(M)
    Wxz = init_weight(D, M)
    Whz = init_weight(M, M)
    bz = np.zeros(M)
    Wo = init_weight(M, V)
    bo = np.zeros(V)

    thX, thY, py_x, prediction = self.set(We, Wx, Wh, bh, h0, Wxz, Whz, bz, Wo, bo, activation)

    lr = T.scalar('lr')
    cost = -T.mean(T.log(py_x[T.arange(thY.shape[0]), thY]))
    grads = T.grad(cost, self.params)
    dparams = [theano.shared(p.get_value()*0) for p in self.params]

    updates = [
        (p, p + mu*dp - lr*g) for p, dp, g in zip(self.params, dparams, grads)
    ] + [
        (dp, mu*dp - lr*g) for dp, g in zip(dparams, grads)
    ]

    self.predict_op = theano.function(inputs=[thX], outputs=prediction)
    self.train_op = theano.function(
        inputs=[thX, thY, lr],
        outputs=[cost, prediction],
        updates=updates
    )

    costs = []
    for i in range(epochs):
        X = shuffle(X)
        n_correct = 0
        n_total = 0
        cost = 0
        for j in range(N):
            # don't always add the END token
            if np.random.random() < 0.1:
                input_sequence = [0] + X[j]
                output_sequence = X[j] + [1]
            else:
                input_sequence = [0] + X[j][:-1]
                output_sequence = X[j]
            n_total += len(output_sequence)

            # we set 0 to START and 1 to END
            c, p = self.train_op(input_sequence, output_sequence, learning_rate)
            cost += c
            for pj, xj in zip(p, output_sequence):
                if pj == xj:
                    n_correct += 1
        print("i:", i, "cost:", cost, "correct rate:", (float(n_correct)/n_total))
        if (i + 1) % 500 == 0:
            learning_rate /= 2
        costs.append(cost)

    if show_fig:
        plt.plot(costs)
        plt.show()
def fit(self, X, learning_rate=10e-1, mu=0.99, reg=1.0, activation=T.tanh,
        epochs=500, show_fig=False):
    N = len(X)
    D = self.D
    M = self.M
    V = self.V
    self.f = activation

    # initial weights
    We = init_weight(V, D)
    Wx = init_weight(D, M)
    Wh = init_weight(M, M)
    bh = np.zeros(M)
    h0 = np.zeros(M)
    Wo = init_weight(M, V)
    bo = np.zeros(V)

    self.We = theano.shared(We)
    self.Wx = theano.shared(Wx)
    self.Wh = theano.shared(Wh)
    self.bh = theano.shared(bh)
    self.h0 = theano.shared(h0)
    self.Wo = theano.shared(Wo)
    self.bo = theano.shared(bo)
    self.params = [self.We, self.Wx, self.Wh, self.bh, self.h0, self.Wo, self.bo]

    thX = T.ivector('X')
    Ei = self.We[thX]  # T x D
    thY = T.ivector('Y')

    def recurrence(x_t, h_t1):
        # returns h(t), y(t)
        h_t = self.f(x_t.dot(self.Wx) + h_t1.dot(self.Wh) + self.bh)
        y_t = T.nnet.softmax(h_t.dot(self.Wo) + self.bo)
        return h_t, y_t

    [h, y], _ = theano.scan(
        fn=recurrence,
        outputs_info=[self.h0, None],
        sequences=Ei,
        n_steps=Ei.shape[0],
    )

    py_x = y[:, 0, :]
    prediction = T.argmax(py_x, axis=1)

    cost = -T.mean(T.log(py_x[T.arange(thY.shape[0]), thY]))
    grads = T.grad(cost, self.params)
    dparams = [theano.shared(p.get_value()*0) for p in self.params]

    updates = [
        (p, p + mu*dp - learning_rate*g) for p, dp, g in zip(self.params, dparams, grads)
    ] + [
        (dp, mu*dp - learning_rate*g) for dp, g in zip(dparams, grads)
    ]

    self.predict_op = theano.function(inputs=[thX], outputs=prediction)
    self.train_op = theano.function(
        inputs=[thX, thY],
        outputs=[cost, prediction],
        updates=updates,
    )

    costs = []
    n_total = sum((len(sentence) + 1) for sentence in X)
    for i in range(epochs):
        X = shuffle(X)
        n_correct = 0
        cost = 0
        for j in range(N):
            input_sequence = [0] + X[j]
            output_sequence = X[j] + [1]

            c, p = self.train_op(input_sequence, output_sequence)
            cost += c
            for pj, xj in zip(p, output_sequence):
                if pj == xj:
                    n_correct += 1
        print("i:", i, "cost:", cost, "correct rate:", (float(n_correct) / n_total))
        costs.append(cost)

    if show_fig:
        plt.plot(costs)
        plt.show()
def fit(self, X, Y, learning_rate=1e-2, mu=0.99, reg=1e-12, epochs=400,
        batch_sz=20, print_period=1, show_fig=False):
    Y = Y.astype(np.int32)

    # initialize hidden layers
    N, D = X.shape
    K = len(set(Y))
    self.hidden_layers = []
    M1 = D
    count = 0
    for M2 in self.hidden_layer_sizes:
        h = HiddenLayer(M1, M2, count)
        self.hidden_layers.append(h)
        M1 = M2
        count += 1
    W = init_weight(M1, K)
    b = np.zeros(K)
    self.W = theano.shared(W, 'W_logreg')
    self.b = theano.shared(b, 'b_logreg')

    # collect params for later use
    self.params = [self.W, self.b]
    for h in self.hidden_layers:
        self.params += h.params

    # for momentum
    dparams = [theano.shared(np.zeros(p.get_value().shape)) for p in self.params]
    # for rmsprop
    cache = [theano.shared(np.zeros(p.get_value().shape)) for p in self.params]

    # set up theano functions and variables
    thX = T.matrix('X')
    thY = T.ivector('Y')
    pY = self.forward(thX)

    rcost = reg*T.sum([(p*p).sum() for p in self.params])
    cost = -T.mean(T.log(pY[T.arange(thY.shape[0]), thY])) + rcost
    prediction = self.predict(thX)
    grads = T.grad(cost, self.params)

    # momentum only
    updates = [
        (p, p + mu*dp - learning_rate*g) for p, dp, g in zip(self.params, dparams, grads)
    ] + [
        (dp, mu*dp - learning_rate*g) for dp, g in zip(dparams, grads)
    ]

    train_op = theano.function(
        inputs=[thX, thY],
        outputs=[cost, prediction],
        updates=updates,
    )

    n_batches = N // batch_sz
    costs = []
    for i in range(epochs):
        X, Y = shuffle(X, Y)
        for j in range(n_batches):
            Xbatch = X[j*batch_sz:(j*batch_sz+batch_sz)]
            Ybatch = Y[j*batch_sz:(j*batch_sz+batch_sz)]

            c, p = train_op(Xbatch, Ybatch)
            if j % print_period == 0:
                costs.append(c)
                e = np.mean(Ybatch != p)
                print("i:", i, "j:", j, "nb:", n_batches, "cost:", c, "error rate:", e)

    if show_fig:
        plt.plot(costs)
        plt.show()
def fit(self, trees, test_trees, reg=1e-3, epochs=8, train_inner_nodes=False):
    D = self.D
    V = self.V
    K = self.K
    N = len(trees)

    We = init_weight(V, D)
    W11 = np.random.randn(D, D, D) / np.sqrt(3*D)
    W22 = np.random.randn(D, D, D) / np.sqrt(3*D)
    W12 = np.random.randn(D, D, D) / np.sqrt(3*D)
    W1 = init_weight(D, D)
    W2 = init_weight(D, D)
    bh = np.zeros(D)
    Wo = init_weight(D, K)
    bo = np.zeros(K)

    self.We = theano.shared(We)
    self.W11 = theano.shared(W11)
    self.W22 = theano.shared(W22)
    self.W12 = theano.shared(W12)
    self.W1 = theano.shared(W1)
    self.W2 = theano.shared(W2)
    self.bh = theano.shared(bh)
    self.Wo = theano.shared(Wo)
    self.bo = theano.shared(bo)
    self.params = [self.We, self.W11, self.W22, self.W12, self.W1, self.W2, self.bh, self.Wo, self.bo]

    lr = T.scalar('learning_rate')
    words = T.ivector('words')
    left_children = T.ivector('left_children')
    right_children = T.ivector('right_children')
    labels = T.ivector('labels')

    def recurrence(n, hiddens, words, left, right):
        w = words[n]
        # any non-word will have index -1
        hiddens = T.switch(
            T.ge(w, 0),
            T.set_subtensor(hiddens[n], self.We[w]),
            T.set_subtensor(hiddens[n], self.f(
                hiddens[left[n]].dot(self.W11).dot(hiddens[left[n]]) +
                hiddens[right[n]].dot(self.W22).dot(hiddens[right[n]]) +
                hiddens[left[n]].dot(self.W12).dot(hiddens[right[n]]) +
                hiddens[left[n]].dot(self.W1) +
                hiddens[right[n]].dot(self.W2) +
                self.bh
            ))
        )
        return hiddens

    hiddens = T.zeros((words.shape[0], D))

    h, _ = theano.scan(
        fn=recurrence,
        outputs_info=[hiddens],
        n_steps=words.shape[0],
        sequences=T.arange(words.shape[0]),
        non_sequences=[words, left_children, right_children],
    )

    py_x = T.nnet.softmax(h[-1].dot(self.Wo) + self.bo)
    prediction = T.argmax(py_x, axis=1)

    rcost = reg*T.sum([(p*p).sum() for p in self.params])
    if train_inner_nodes:
        relevant_labels = labels[labels >= 0]
        cost = -T.mean(T.log(py_x[labels >= 0, relevant_labels])) + rcost
    else:
        cost = -T.mean(T.log(py_x[-1, labels[-1]])) + rcost

    updates = adagrad(cost, self.params, lr)

    self.cost_predict_op = theano.function(
        inputs=[words, left_children, right_children, labels],
        outputs=[cost, prediction],
        allow_input_downcast=True,
    )

    self.train_op = theano.function(
        inputs=[words, left_children, right_children, labels, lr],
        outputs=[cost, prediction],
        updates=updates
    )

    lr_ = 8e-3  # initial learning rate
    costs = []
    sequence_indexes = list(range(N))
    for i in range(epochs):
        t0 = datetime.now()
        sequence_indexes = shuffle(sequence_indexes)
        n_correct = 0
        n_total = 0
        cost = 0
        it = 0
        for j in sequence_indexes:
            words_, left, right, lab = trees[j]
            c, p = self.train_op(words_, left, right, lab, lr_)
            if np.isnan(c):
                print("Cost is nan! Let's stop here. "
                      "Why don't you try decreasing the learning rate?")
                for p in self.params:
                    print(p.get_value().sum())
                exit()
            cost += c
            n_correct += (p[-1] == lab[-1])
            n_total += 1

            it += 1
            if it % 10 == 0:
                sys.stdout.write(
                    "j/N: %d/%d correct rate so far: %f, cost so far: %f\r" %
                    (it, N, float(n_correct)/n_total, cost)
                )
                sys.stdout.flush()

        # calculate the test score
        n_test_correct = 0
        n_test_total = 0
        for words_, left, right, lab in test_trees:
            _, p = self.cost_predict_op(words_, left, right, lab)
            n_test_correct += (p[-1] == lab[-1])
            n_test_total += 1

        print(
            "i:", i, "cost:", cost,
            "train acc:", float(n_correct)/n_total,
            "test acc:", float(n_test_correct)/n_test_total,
            "time for epoch:", (datetime.now() - t0)
        )
        costs.append(cost)

    plt.plot(costs)
    plt.show()
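# adagrad is used above but not defined in this section. A minimal sketch of
# the standard update with a per-parameter squared-gradient cache; the eps
# default is an assumption, and the theano/T imports match the rest of this
# section.
def adagrad(cost, params, lr, eps=1e-10):
    grads = T.grad(cost, params)
    caches = [theano.shared(p.get_value() * 0) for p in params]
    new_caches = [c + g * g for c, g in zip(caches, grads)]
    return [
        (c, new_c) for c, new_c in zip(caches, new_caches)
    ] + [
        (p, p - lr * g / T.sqrt(new_c + eps))
        for p, new_c, g in zip(params, new_caches, grads)
    ]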
def fit(self, X, Y, learning_rate=0.01, mu=0.99, epochs=30, batch_sz=100):
    N, D = X.shape
    K = len(set(Y))

    self.hidden_layers = []
    mi = D
    for mo in self.hidden_layer_sizes:
        h = HiddenLayer(mi, mo)
        self.hidden_layers.append(h)
        mi = mo

    # initialize logistic regression layer
    W = init_weight(*(mo, K))
    b = np.zeros(K)
    self.W = theano.shared(W)
    self.b = theano.shared(b)

    self.params = [self.W, self.b]
    self.allWs = []
    for h in self.hidden_layers:
        self.params += h.params
        self.allWs.append(h.W)
    self.allWs.append(self.W)

    X_in = T.matrix('X_in')
    targets = T.ivector('Targets')
    pY = self.forward(X_in)

    cost = -T.mean(T.log(pY[T.arange(pY.shape[0]), targets]))
    prediction = self.predict(X_in)

    dparams = [theano.shared(p.get_value() * 0) for p in self.params]
    grads = T.grad(cost, self.params)

    updates = [
        (p, p + mu * dp - learning_rate * g) for p, dp, g in zip(self.params, dparams, grads)
    ] + [
        (dp, mu * dp - learning_rate * g) for dp, g in zip(dparams, grads)
    ]

    train_op = theano.function(
        inputs=[X_in, targets],
        outputs=[cost, prediction],
        updates=updates,
    )

    n_batches = N // batch_sz
    costs = []
    lastWs = [W.get_value() for W in self.allWs]
    W_changes = []
    print("supervised training...")
    for i in range(epochs):
        print("epoch:", i)
        X, Y = shuffle(X, Y)
        for j in range(n_batches):
            Xbatch = X[j * batch_sz:(j * batch_sz + batch_sz)]
            Ybatch = Y[j * batch_sz:(j * batch_sz + batch_sz)]
            c, p = train_op(Xbatch, Ybatch)
            if j % 100 == 0:
                print("j / n_batches:", j, "/", n_batches,
                      "cost:", c, "error:", error_rate(p, Ybatch))
            costs.append(c)

            # log changes in all Ws
            W_change = [
                np.abs(W.get_value() - lastW).mean()
                for W, lastW in zip(self.allWs, lastWs)
            ]
            W_changes.append(W_change)
            lastWs = [W.get_value() for W in self.allWs]

    W_changes = np.array(W_changes)
    plt.subplot(2, 1, 1)
    for i in range(W_changes.shape[1]):
        plt.plot(W_changes[:, i], label='layer %s' % i)
    plt.legend()
    plt.subplot(2, 1, 2)
    plt.plot(costs)
    plt.show()
def fit(self, X, Y, batch_sz=20, learning_rate=1.0, mu=0.99, reg=1.0,
        activation=T.tanh, epochs=100, show_fig=False):
    D = X[0].shape[1]  # X is of size N x T(n) x D
    K = len(set(Y.flatten()))
    N = len(Y)
    M = self.M
    self.f = activation

    # initial weights
    Wx = init_weight(D, M)
    Wh = init_weight(M, M)
    bh = np.zeros(M)
    h0 = np.zeros(M)
    Wo = init_weight(M, K)
    bo = np.zeros(K)

    # make them theano shared
    self.Wx = theano.shared(Wx)
    self.Wh = theano.shared(Wh)
    self.bh = theano.shared(bh)
    self.h0 = theano.shared(h0)
    self.Wo = theano.shared(Wo)
    self.bo = theano.shared(bo)
    self.params = [self.Wx, self.Wh, self.bh, self.h0, self.Wo, self.bo]

    thX = T.fmatrix('X')  # will represent multiple batches concatenated
    thY = T.ivector('Y')
    thStartPoints = T.ivector('start_points')

    XW = thX.dot(self.Wx)

    # startPoints will contain 1 where a sequence starts and 0 otherwise.
    # Ex. if I have 3 sequences: [[1,2,3], [4,5], [6,7,8]],
    # then I will concatenate these into one X: [1,2,3,4,5,6,7,8]
    # and startPoints will be [1,0,0,1,0,1,0,0].
    # One possible solution is to loop through indexes t and check
    # startPoints[t]; the solution used here instead loops through all
    # sequences simultaneously.
    def recurrence(xw_t, is_start, h_t1, h0):
        # if at a boundary, state should be h0
        h_t = T.switch(
            T.eq(is_start, 1),
            self.f(xw_t + h0.dot(self.Wh) + self.bh),
            self.f(xw_t + h_t1.dot(self.Wh) + self.bh)
        )
        return h_t

    h, _ = theano.scan(
        fn=recurrence,
        outputs_info=[self.h0],
        sequences=[XW, thStartPoints],
        non_sequences=[self.h0],
        n_steps=XW.shape[0],
    )

    # h is of shape (T*batch_sz, M)
    py_x = T.nnet.softmax(h.dot(self.Wo) + self.bo)
    prediction = T.argmax(py_x, axis=1)

    cost = -T.mean(T.log(py_x[T.arange(thY.shape[0]), thY]))
    grads = T.grad(cost, self.params)
    dparams = [theano.shared(p.get_value()*0) for p in self.params]

    updates = [
        (p, p + mu*dp - learning_rate*g) for p, dp, g in zip(self.params, dparams, grads)
    ] + [
        (dp, mu*dp - learning_rate*g) for dp, g in zip(dparams, grads)
    ]

    self.train_op = theano.function(
        inputs=[thX, thY, thStartPoints],
        outputs=[cost, prediction, py_x],
        updates=updates
    )

    costs = []
    n_batches = N // batch_sz
    sequenceLength = X.shape[1]

    # if each sequence was of variable length, we would need to
    # initialize this inside the loop for every new batch
    startPoints = np.zeros(sequenceLength*batch_sz, dtype=np.int32)
    for b in range(batch_sz):
        startPoints[b*sequenceLength] = 1

    for i in range(epochs):
        X, Y = shuffle(X, Y)
        n_correct = 0
        cost = 0
        for j in range(n_batches):
            Xbatch = X[j*batch_sz:(j+1)*batch_sz].reshape(sequenceLength*batch_sz, D)
            Ybatch = Y[j*batch_sz:(j+1)*batch_sz].reshape(sequenceLength*batch_sz).astype(np.int32)
            c, p, rout = self.train_op(Xbatch, Ybatch, startPoints)
            cost += c
            for b in range(batch_sz):
                idx = sequenceLength*(b + 1) - 1
                if p[idx] == Ybatch[idx]:
                    n_correct += 1
        if i % 10 == 0:
            print("shape y:", rout.shape)
            print("i:", i, "cost:", cost, "classification rate:", (float(n_correct)/N))
        if n_correct == N:
            print("i:", i, "cost:", cost, "classification rate:", (float(n_correct)/N))
            break
        costs.append(cost)

    if show_fig:
        plt.plot(costs)
        plt.show()
def fit(self, X, learning_rate=1e-5, mu=0.99, epochs=10, show_fig=True,
        activation=T.nnet.relu, RecurrentUnit=GRU, normalize=True):
    D = self.D
    V = self.V
    N = len(X)

    We = init_weight(V, D)
    self.hidden_layers = []
    Mi = D
    for Mo in self.hidden_layer_sizes:
        ru = RecurrentUnit(Mi, Mo, activation)
        self.hidden_layers.append(ru)
        Mi = Mo

    Wo = init_weight(Mi, V)
    bo = np.zeros(V)

    self.We = theano.shared(We)
    self.Wo = theano.shared(Wo)
    self.bo = theano.shared(bo)
    self.params = [self.Wo, self.bo]
    for ru in self.hidden_layers:
        self.params += ru.params

    thX = T.ivector('X')
    thY = T.ivector('Y')

    Z = self.We[thX]
    for ru in self.hidden_layers:
        Z = ru.output(Z)
    py_x = T.nnet.softmax(Z.dot(self.Wo) + self.bo)
    prediction = T.argmax(py_x, axis=1)

    # let's return py_x too so we can draw a sample instead
    self.predict_op = theano.function(
        inputs=[thX],
        outputs=[py_x, prediction],
        allow_input_downcast=True,
    )

    cost = -T.mean(T.log(py_x[T.arange(thY.shape[0]), thY]))
    grads = T.grad(cost, self.params)
    dparams = [theano.shared(p.get_value()*0) for p in self.params]

    # the embedding gets its own update so we can optionally normalize it
    dWe = theano.shared(self.We.get_value()*0)
    gWe = T.grad(cost, self.We)
    dWe_update = mu*dWe - learning_rate*gWe
    We_update = self.We + dWe_update
    if normalize:
        We_update /= We_update.norm(2)

    updates = [
        (p, p + mu*dp - learning_rate*g) for p, dp, g in zip(self.params, dparams, grads)
    ] + [
        (dp, mu*dp - learning_rate*g) for dp, g in zip(dparams, grads)
    ] + [
        (self.We, We_update), (dWe, dWe_update)
    ]

    self.train_op = theano.function(
        inputs=[thX, thY],
        outputs=[cost, prediction],
        updates=updates
    )

    costs = []
    for i in range(epochs):
        t0 = datetime.now()
        X = shuffle(X)
        n_correct = 0
        n_total = 0
        cost = 0
        for j in range(N):
            # don't always add the END token
            if np.random.random() < 0.01 or len(X[j]) <= 1:
                input_sequence = [0] + X[j]
                output_sequence = X[j] + [1]
            else:
                input_sequence = [0] + X[j][:-1]
                output_sequence = X[j]
            n_total += len(output_sequence)

            # we set 0 to START and 1 to END
            try:
                c, p = self.train_op(input_sequence, output_sequence)
            except Exception as e:
                PYX, pred = self.predict_op(input_sequence)
                print("input_sequence len:", len(input_sequence))
                print("PYX.shape:", PYX.shape)
                print("pred.shape:", pred.shape)
                raise e
            cost += c
            for pj, xj in zip(p, output_sequence):
                if pj == xj:
                    n_correct += 1
            if j % 200 == 0:
                sys.stdout.write("j/N: %d/%d correct rate so far: %f\r" %
                                 (j, N, float(n_correct)/n_total))
                sys.stdout.flush()
        print("i:", i, "cost:", cost, "correct rate:", (float(n_correct)/n_total),
              "time for epoch:", (datetime.now() - t0))
        costs.append(cost)

    if show_fig:
        plt.plot(costs)
        plt.show()
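# Since predict_op returns py_x as well, generation can sample from the
# distribution instead of taking the argmax. A hypothetical helper, not part
# of the class above:
def sample_next_word(model, input_sequence):
    py_x, _ = model.predict_op(input_sequence)
    p = py_x[-1]  # distribution over the V words at the last time step
    return np.random.choice(len(p), p=p)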
def fit(self, trees, learning_rate=10e-4, mu=0.5, reg=10e-3, eps=10e-3,
        epochs=20, activation=T.tanh, train_inner_nodes=False):
    D = self.D
    V = self.V
    K = self.K
    self.f = activation
    N = len(trees)

    We = init_weight(V, D)
    W11 = np.random.randn(D, D, D) / np.sqrt(3*D)
    W22 = np.random.randn(D, D, D) / np.sqrt(3*D)
    W12 = np.random.randn(D, D, D) / np.sqrt(3*D)
    W1 = init_weight(D, D)
    W2 = init_weight(D, D)
    bh = np.zeros(D)
    Wo = init_weight(D, K)
    bo = np.zeros(K)

    self.We = theano.shared(We)
    self.W11 = theano.shared(W11)
    self.W22 = theano.shared(W22)
    self.W12 = theano.shared(W12)
    self.W1 = theano.shared(W1)
    self.W2 = theano.shared(W2)
    self.bh = theano.shared(bh)
    self.Wo = theano.shared(Wo)
    self.bo = theano.shared(bo)
    self.params = [self.We, self.W11, self.W22, self.W12, self.W1, self.W2, self.bh, self.Wo, self.bo]

    words = T.ivector('words')
    left_children = T.ivector('left_children')
    right_children = T.ivector('right_children')
    labels = T.ivector('labels')

    def recurrence(n, hiddens, words, left, right):
        w = words[n]
        # any non-word will have index -1
        hiddens = T.switch(
            T.ge(w, 0),
            T.set_subtensor(hiddens[n], self.We[w]),
            T.set_subtensor(hiddens[n], self.f(
                hiddens[left[n]].dot(self.W11).dot(hiddens[left[n]]) +
                hiddens[right[n]].dot(self.W22).dot(hiddens[right[n]]) +
                hiddens[left[n]].dot(self.W12).dot(hiddens[right[n]]) +
                hiddens[left[n]].dot(self.W1) +
                hiddens[right[n]].dot(self.W2) +
                self.bh
            ))
        )
        return hiddens

    hiddens = T.zeros((words.shape[0], D))

    h, _ = theano.scan(
        fn=recurrence,
        outputs_info=[hiddens],
        n_steps=words.shape[0],
        sequences=T.arange(words.shape[0]),
        non_sequences=[words, left_children, right_children],
    )

    py_x = T.nnet.softmax(h[-1].dot(self.Wo) + self.bo)
    prediction = T.argmax(py_x, axis=1)

    rcost = reg*T.mean([(p*p).sum() for p in self.params])
    if train_inner_nodes:
        cost = -T.mean(T.log(py_x[T.arange(labels.shape[0]), labels])) + rcost
    else:
        cost = -T.mean(T.log(py_x[-1, labels[-1]])) + rcost
    grads = T.grad(cost, self.params)

    # adagrad: accumulate squared gradients in a cache
    cache = [theano.shared(p.get_value()*0) for p in self.params]
    updates = [
        (c, c + g*g) for c, g in zip(cache, grads)
    ] + [
        (p, p - learning_rate*g / T.sqrt(c + eps)) for p, c, g in zip(self.params, cache, grads)
    ]

    self.cost_predict_op = theano.function(
        inputs=[words, left_children, right_children, labels],
        outputs=[cost, prediction],
        allow_input_downcast=True,
    )

    self.train_op = theano.function(
        inputs=[words, left_children, right_children, labels],
        outputs=[cost, prediction],
        updates=updates
    )

    costs = []
    sequence_indexes = list(range(N))
    if train_inner_nodes:
        n_total = sum(len(words) for words, _, _, _ in trees)
    else:
        n_total = N
    for i in range(epochs):
        t0 = datetime.now()
        sequence_indexes = shuffle(sequence_indexes)
        n_correct = 0
        cost = 0
        it = 0
        for j in sequence_indexes:
            words_, left, right, lab = trees[j]
            c, p = self.train_op(words_, left, right, lab)
            if np.isnan(c):
                print("Cost is nan! Let's stop here. "
                      "Why don't you try decreasing the learning rate?")
                exit()
            cost += c
            if train_inner_nodes:
                n_correct += np.sum(p == lab)
            else:
                n_correct += (p[-1] == lab[-1])

            it += 1
            if it % 1 == 0:
                sys.stdout.write(
                    "j/N: %d/%d correct rate so far: %f, cost so far: %f\r" %
                    (it, N, float(n_correct)/n_total, cost)
                )
                sys.stdout.flush()
        print("i:", i, "cost:", cost, "correct rate:", (float(n_correct)/n_total),
              "time for epoch:", (datetime.now() - t0))
        costs.append(cost)

    plt.plot(costs)
    plt.show()
def __init__(self, Mi, Mo, activation):
    self.Mi = Mi
    self.Mo = Mo
    self.f = activation

    # numpy init (local arrays; the theano shared variables are created below)
    # input gate
    Wxi = init_weight(Mi, Mo)
    Whi = init_weight(Mo, Mo)
    Wci = init_weight(Mo, Mo)
    bi = np.zeros(Mo)  # was np.zeros(Mi); the gate output has size Mo
    # forget gate
    Wxf = init_weight(Mi, Mo)
    Whf = init_weight(Mo, Mo)
    Wcf = init_weight(Mo, Mo)
    bf = np.zeros(Mo)
    # candidate cell
    Wxc = init_weight(Mi, Mo)
    Whc = init_weight(Mo, Mo)
    bc = np.zeros(Mo)
    # output gate
    Wxo = init_weight(Mi, Mo)
    Who = init_weight(Mo, Mo)
    Wco = init_weight(Mo, Mo)
    bo = np.zeros(Mo)
    # initial state of h and c
    h0 = np.zeros(Mo)
    c0 = np.zeros(Mo)  # is this the right size?

    # initialize in theano
    # input gate
    self.Wxi = theano.shared(Wxi)
    self.Whi = theano.shared(Whi)
    self.Wci = theano.shared(Wci)
    self.bi = theano.shared(bi)
    # forget gate
    self.Wxf = theano.shared(Wxf)
    self.Whf = theano.shared(Whf)
    self.Wcf = theano.shared(Wcf)
    self.bf = theano.shared(bf)
    # candidate gate
    self.Wxc = theano.shared(Wxc)
    self.Whc = theano.shared(Whc)
    self.bc = theano.shared(bc)
    # output gate
    self.Wxo = theano.shared(Wxo)
    self.Who = theano.shared(Who)
    self.Wco = theano.shared(Wco)
    self.bo = theano.shared(bo)
    # initial states
    self.h0 = theano.shared(h0)
    self.c0 = theano.shared(c0)

    # list for grad update (stored on the object, like the other units here)
    self.params = [
        self.Wxi, self.Whi, self.Wci, self.bi,
        self.Wxf, self.Whf, self.Wcf, self.bf,
        self.Wxc, self.Whc, self.bc,
        self.Wxo, self.Who, self.Wco, self.bo,
        self.h0, self.c0,
    ]
def fit(self, trees, test_trees, reg=1e-3, epochs=8, train_inner_nodes=False):
    D = self.D
    V = self.V
    K = self.K
    N = len(trees)

    We = init_weight(V, D)
    W11 = np.random.randn(D, D, D) / np.sqrt(3*D)
    W22 = np.random.randn(D, D, D) / np.sqrt(3*D)
    W12 = np.random.randn(D, D, D) / np.sqrt(3*D)
    W1 = init_weight(D, D)
    W2 = init_weight(D, D)
    bh = np.zeros(D)
    Wo = init_weight(D, K)
    bo = np.zeros(K)

    self.We = theano.shared(We)
    self.W11 = theano.shared(W11)
    self.W22 = theano.shared(W22)
    self.W12 = theano.shared(W12)
    self.W1 = theano.shared(W1)
    self.W2 = theano.shared(W2)
    self.bh = theano.shared(bh)
    self.Wo = theano.shared(Wo)
    self.bo = theano.shared(bo)
    self.params = [self.We, self.W11, self.W22, self.W12, self.W1, self.W2, self.bh, self.Wo, self.bo]

    lr = T.scalar('learning_rate')
    words = T.ivector('words')
    left_children = T.ivector('left_children')
    right_children = T.ivector('right_children')
    labels = T.ivector('labels')

    def recurrence(n, hiddens, words, left, right):
        w = words[n]
        # any non-word will have index -1
        hiddens = T.switch(
            T.ge(w, 0),
            T.set_subtensor(hiddens[n], self.We[w]),
            T.set_subtensor(hiddens[n], self.f(
                hiddens[left[n]].dot(self.W11).dot(hiddens[left[n]]) +
                hiddens[right[n]].dot(self.W22).dot(hiddens[right[n]]) +
                hiddens[left[n]].dot(self.W12).dot(hiddens[right[n]]) +
                hiddens[left[n]].dot(self.W1) +
                hiddens[right[n]].dot(self.W2) +
                self.bh
            ))
        )
        return hiddens

    hiddens = T.zeros((words.shape[0], D))

    h, _ = theano.scan(
        fn=recurrence,
        outputs_info=[hiddens],
        n_steps=words.shape[0],
        sequences=T.arange(words.shape[0]),
        non_sequences=[words, left_children, right_children],
    )

    py_x = T.nnet.softmax(h[-1].dot(self.Wo) + self.bo)
    prediction = T.argmax(py_x, axis=1)

    rcost = reg*T.sum([(p*p).sum() for p in self.params])
    if train_inner_nodes:
        relevant_labels = labels[labels >= 0]
        cost = -T.mean(T.log(py_x[labels >= 0, relevant_labels])) + rcost
    else:
        cost = -T.mean(T.log(py_x[-1, labels[-1]])) + rcost

    updates = adagrad(cost, self.params, lr)

    self.cost_predict_op = theano.function(
        inputs=[words, left_children, right_children, labels],
        outputs=[cost, prediction],
        allow_input_downcast=True,
    )

    self.train_op = theano.function(
        inputs=[words, left_children, right_children, labels, lr],
        outputs=[cost, prediction],
        updates=updates
    )

    lr_ = 8e-3  # initial learning rate
    costs = []
    sequence_indexes = list(range(N))
    for i in range(epochs):
        t0 = datetime.now()
        sequence_indexes = shuffle(sequence_indexes)
        n_correct = 0
        n_total = 0
        cost = 0
        it = 0
        for j in sequence_indexes:
            words_, left, right, lab = trees[j]
            c, p = self.train_op(words_, left, right, lab, lr_)
            if np.isnan(c):
                print("Cost is nan! Let's stop here. "
                      "Why don't you try decreasing the learning rate?")
                for p in self.params:
                    print(p.get_value().sum())
                exit()
            cost += c
            n_correct += (p[-1] == lab[-1])
            n_total += 1

            it += 1
            if it % 10 == 0:
                sys.stdout.write(
                    "j/N: %d/%d correct rate so far: %f, cost so far: %f\r" %
                    (it, N, float(n_correct)/n_total, cost)
                )
                sys.stdout.flush()

        # calculate the test score
        n_test_correct = 0
        n_test_total = 0
        for words_, left, right, lab in test_trees:
            _, p = self.cost_predict_op(words_, left, right, lab)
            n_test_correct += (p[-1] == lab[-1])
            n_test_total += 1

        print(
            "i:", i, "cost:", cost,
            "train acc:", float(n_correct)/n_total,
            "test acc:", float(n_test_correct)/n_test_total,
            "time for epoch:", (datetime.now() - t0)
        )
        costs.append(cost)

    plt.plot(costs)
    plt.show()
def fit(self, X, Y, learning_rate=1.0, mu=0.99, reg=1.0, activation=tf.tanh, epochs=100, show_fig=False): N, T, D = X.shape K = len(set(Y.flatten())) M = self.M self.f = activation # initial weights, pay attention to the shape! Wx = init_weight(D, M).astype(np.float32) Wh = init_weight(M, M).astype(np.float32) bh = np.zeros(M, dtype=np.float32) h0 = np.zeros(M, dtype=np.float32) Wo = init_weight(M, K).astype(np.float32) bo = np.zeros(K, dtype=np.float32) self.Wx = tf.Variable(Wx) self.Wh = tf.Variable(Wh) self.bh = tf.Variable(bh) self.h0 = tf.Variable(h0) self.Wo = tf.Variable(Wo) self.bo = tf.Variable(bo) tfX = tf.placeholder(tf.float32, shape=(T, D), name='X') tfY = tf.placeholder(tf.int32, shape=(T, ), name='Y') XWx = tf.matmul(tfX, self.Wx) def recurrence(h_t1, xw_t): # matmul() only works with 2-D objects # we want to return a 1-D object of size M # so that the final result is T x M, not T x 1 x M! h_t = self.f(xw_t + tf.matmul(tf.reshape(h_t1, (1, M)), self.Wh) + self.bh) return tf.reshape(h_t, (M, )) h = tf.scan( fn=recurrence, elems=XWx, initializer=self.h0, ) logits = tf.matmul(h, self.Wo) + self.bo cost = tf.reduce_mean( tf.nn.sparse_softmax_cross_entropy_with_logits( labels=tfY, logits=logits, )) predict_op = tf.argmax(logits, 1) train_op = tf.train.AdamOptimizer(1e-2).minimize(cost) init = tf.global_variables_initializer() with tf.Session() as session: session.run(init) costs = [] for i in range(epochs): X, Y = shuffle(X, Y) n_correct = 0 batch_cost = 0 for j in range(N): _, c, p = session.run([train_op, cost, predict_op], feed_dict={ tfX: X[j].reshape(T, D), tfY: Y[j] }) batch_cost += c if p[-1] == Y[j, -1]: n_correct += 1 print("i:", i, "cost:", batch_cost, "classification rate:", (float(n_correct) / N)) costs.append(batch_cost) if n_correct == N: break if show_fig: plt.plot(costs) plt.show()
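The reshape dance inside recurrence is worth seeing in isolation: tf.scan stacks whatever shape the step function returns, so returning a (1, M) matrix would yield a (T, 1, M) result instead of (T, M). A minimal standalone sketch, assuming TF 1.x semantics and toy sizes:

import numpy as np
import tensorflow as tf  # 1.x API assumed

M = 4
Wh = tf.constant(np.eye(M, dtype=np.float32))
xw = tf.constant(np.random.randn(7, M).astype(np.float32))  # T x M

def step(h_prev, xw_t):
    h = tf.tanh(xw_t + tf.matmul(tf.reshape(h_prev, (1, M)), Wh))  # shape (1, M)
    return tf.reshape(h, (M,))  # rank 1, so scan stacks to (T, M)

h = tf.scan(fn=step, elems=xw, initializer=tf.zeros(M))
with tf.Session() as session:
    print(session.run(h).shape)  # (7, 4), not (7, 1, 4)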
def fit(self, X, Y, Xtest, Ytest, pretrain=True, learning_rate=0.1, mu=0.99, reg=0.0, epochs=1, batch_sz=100): # cast to float32 learning_rate = np.float32(learning_rate) mu = np.float32(mu) reg = np.float32(reg) # choose whether to pretrain the AutoEncoder objects pretrain_epochs = 2 if not pretrain: pretrain_epochs = 0 # with pretrain_epochs=0 we only initialize the weights, no training happens # train the AutoEncoder objects current_input = X for ae in self.hidden_layers: ae.fit(current_input, epochs=pretrain_epochs) current_input = ae.hidden_op(current_input) # initialize logistic regression layer (the final layer) N = len(Y) K = len(set(Y)) W0 = init_weight(self.hidden_layers[-1].M, K).astype(np.float32) # cast so it matches the float32 momentum shares below self.W = theano.shared(W0, 'W_logreg') self.b = theano.shared(np.zeros(K, dtype=np.float32), 'b_logreg') self.params = [self.W, self.b] for ae in self.hidden_layers: # self.params.append(ae.forward_params) self.params += ae.forward_params self.dW = theano.shared(np.zeros(W0.shape, dtype=np.float32), 'dW_logreg') self.db = theano.shared(np.zeros(K, dtype=np.float32), 'db_logreg') self.dparams = [self.dW, self.db] for ae in self.hidden_layers: # self.dparams.append(ae.forward_dparams) self.dparams += ae.forward_dparams X_in = T.matrix('X_in', dtype='float32') targets = T.ivector('Targets') # note: declare the dtype (ivector) when creating the vector; otherwise it defaults to float32 pY = self.forward(X_in) reg_cost = T.sum([(p * p).sum() for p in self.params]) cost = -T.mean(T.log(pY[T.arange(pY.shape[0]), targets])) + reg * reg_cost updates = [ (p, p + mu * dp - learning_rate * T.grad(cost, p)) for p, dp in zip(self.params, self.dparams) ] + [ (dp, mu * dp - learning_rate * T.grad(cost, p)) for p, dp in zip(self.params, self.dparams) ] # every layer gets trained here, including the ones already pretrained as autoencoders train_op = theano.function(inputs=[X_in, targets], updates=updates) prediction = self.predict(X_in) cost_predict_op = theano.function(inputs=[X_in, targets], outputs=[cost, prediction]) n_batches = N // batch_sz costs = [] print("supervised training...") for i in range(epochs): print("epoch:", i) X, Y = shuffle(X, Y) for j in range(n_batches): Xbatch = X[j * batch_sz:(j * batch_sz + batch_sz)] Ybatch = Y[j * batch_sz:(j * batch_sz + batch_sz)] train_op(Xbatch, Ybatch) the_cost, the_prediction = cost_predict_op(Xtest, Ytest) error = error_rate(the_prediction, Ytest) print("j / n_batches:", j, "/", n_batches, "cost:", the_cost, "error:", error) costs.append(the_cost) plt.plot(costs) plt.show()
def __init__(self, Mi, Mo, activation): self.Mi = Mi self.Mo = Mo self.f = activation # numpy init Wxi = init_weight(Mi, Mo) # input to input gate Whi = init_weight(Mo, Mo) Wci = init_weight(Mo, Mo) bi = np.zeros(Mo) Wxf = init_weight(Mi, Mo) # input to forget gate Whf = init_weight(Mo, Mo) Wcf = init_weight(Mo, Mo) bf = np.zeros(Mo) Wxc = init_weight(Mi, Mo) # input to cell Whc = init_weight(Mo, Mo) bc = np.zeros(Mo) Wxo = init_weight(Mi, Mo) # input to output gate Who = init_weight(Mo, Mo) Wco = init_weight(Mo, Mo) bo = np.zeros(Mo) c0 = np.zeros(Mo) h0 = np.zeros(Mo) # theano vars self.Wxi = theano.shared(Wxi) self.Whi = theano.shared(Whi) self.Wci = theano.shared(Wci) self.bi = theano.shared(bi) self.Wxf = theano.shared(Wxf) self.Whf = theano.shared(Whf) self.Wcf = theano.shared(Wcf) self.bf = theano.shared(bf) self.Wxc = theano.shared(Wxc) self.Whc = theano.shared(Whc) self.bc = theano.shared(bc) self.Wxo = theano.shared(Wxo) self.Who = theano.shared(Who) self.Wco = theano.shared(Wco) self.bo = theano.shared(bo) self.c0 = theano.shared(c0) self.h0 = theano.shared(h0) self.params = [ self.Wxi, self.Whi, self.Wci, self.bi, self.Wxf, self.Whf, self.Wcf, self.bf, self.Wxc, self.Whc, self.bc, self.Wxo, self.Who, self.Wco, self.bo, self.c0, self.h0, ]
def fit(self, trees, learning_rate=1e-3, mu=0.5, reg=1e-2, eps=1e-2, epochs=20, activation=T.tanh, train_inner_nodes=False): D = self.D V = self.V K = self.K self.f = activation N = len(trees) We = init_weight(V, D) W11 = np.random.randn(D, D, D) / np.sqrt(3*D) W22 = np.random.randn(D, D, D) / np.sqrt(3*D) W12 = np.random.randn(D, D, D) / np.sqrt(3*D) W1 = init_weight(D, D) W2 = init_weight(D, D) bh = np.zeros(D) Wo = init_weight(D, K) bo = np.zeros(K) self.We = theano.shared(We) self.W11 = theano.shared(W11) self.W22 = theano.shared(W22) self.W12 = theano.shared(W12) self.W1 = theano.shared(W1) self.W2 = theano.shared(W2) self.bh = theano.shared(bh) self.Wo = theano.shared(Wo) self.bo = theano.shared(bo) self.params = [self.We, self.W11, self.W22, self.W12, self.W1, self.W2, self.bh, self.Wo, self.bo] words = T.ivector('words') left_children = T.ivector('left_children') right_children = T.ivector('right_children') labels = T.ivector('labels') def recurrence(n, hiddens, words, left, right): w = words[n] # any non-word will have index -1 hiddens = T.switch( T.ge(w, 0), T.set_subtensor(hiddens[n], self.We[w]), T.set_subtensor(hiddens[n], self.f( hiddens[left[n]].dot(self.W11).dot(hiddens[left[n]]) + hiddens[right[n]].dot(self.W22).dot(hiddens[right[n]]) + hiddens[left[n]].dot(self.W12).dot(hiddens[right[n]]) + hiddens[left[n]].dot(self.W1) + hiddens[right[n]].dot(self.W2) + self.bh ) ) ) return hiddens hiddens = T.zeros((words.shape[0], D)) h, _ = theano.scan( fn=recurrence, outputs_info=[hiddens], n_steps=words.shape[0], sequences=T.arange(words.shape[0]), non_sequences=[words, left_children, right_children], ) py_x = T.nnet.softmax(h[-1].dot(self.Wo) + self.bo) prediction = T.argmax(py_x, axis=1) rcost = reg*T.mean([(p*p).sum() for p in self.params]) if train_inner_nodes: cost = -T.mean(T.log(py_x[T.arange(labels.shape[0]), labels])) + rcost else: cost = -T.mean(T.log(py_x[-1, labels[-1]])) + rcost grads = T.grad(cost, self.params) # dparams = [theano.shared(p.get_value()*0) for p in self.params] cache = [theano.shared(p.get_value()*0) for p in self.params] # momentum # updates = [ # (p, p + mu*dp - learning_rate*g) for p, dp, g in zip(self.params, dparams, grads) # ] + [ # (dp, mu*dp - learning_rate*g) for dp, g in zip(dparams, grads) # ] # adagrad updates = [ (c, c + g*g) for c, g in zip(cache, grads) ] + [ (p, p - learning_rate*g / T.sqrt(c + eps)) for p, c, g in zip(self.params, cache, grads) ] self.cost_predict_op = theano.function( inputs=[words, left_children, right_children, labels], outputs=[cost, prediction], allow_input_downcast=True, ) self.train_op = theano.function( inputs=[words, left_children, right_children, labels], outputs=[cost, prediction], updates=updates ) costs = [] sequence_indexes = list(range(N)) if train_inner_nodes: n_total = sum(len(words) for words, _, _, _ in trees) else: n_total = N for i in range(epochs): t0 = datetime.now() sequence_indexes = shuffle(sequence_indexes) n_correct = 0 cost = 0 it = 0 for j in sequence_indexes: words, left, right, lab = trees[j] c, p = self.train_op(words, left, right, lab) if np.isnan(c): print("Cost is nan! Let's stop here. Why don't you try decreasing the learning rate?") exit() cost += c if train_inner_nodes: n_correct += np.sum(p == lab) else: n_correct += (p[-1] == lab[-1]) it += 1 if it % 1 == 0: sys.stdout.write("j/N: %d/%d correct rate so far: %f, cost so far: %f\r" % (it, N, float(n_correct)/n_total, cost)) sys.stdout.flush() print("i:", i, "cost:", cost, "correct rate:", (float(n_correct)/n_total), "time for epoch:", (datetime.now() - t0)) costs.append(cost) plt.plot(costs) plt.show()
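The cache/update pair in the updates list above is plain adagrad: accumulate squared gradients, then scale each parameter's step by the inverse square root of its accumulator. A toy numpy version of the same two update rules (the gradient values are made up):

import numpy as np

lr, eps = 1e-3, 1e-2
w = np.zeros(3)
cache = np.zeros(3)
for _ in range(100):
    g = np.array([1.0, 0.1, 0.01])      # pretend gradient
    cache += g * g                       # mirrors (c, c + g*g)
    w -= lr * g / np.sqrt(cache + eps)   # mirrors (p, p - lr*g / sqrt(c + eps))
# dimensions with consistently large gradients get the smallest effective step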
def fit(self, X, Y, learning_rate=1.0, mu=0.99, reg=1.0, activation=T.tanh, epochs=100, show_fig=False): D = X[0].shape[1] K = len(set(Y.flatten())) N = len(Y) M = self.M self.f = activation # initialize weights Wx = init_weight(D, M) Wh = init_weight(M, M) bh = np.zeros(M) h0 = np.zeros(M) Wo = init_weight(M, K) bo = np.zeros(K) self.Wx = theano.shared(Wx) self.Wh = theano.shared(Wh) self.bh = theano.shared(bh) self.h0 = theano.shared(h0) self.Wo = theano.shared(Wo) self.bo = theano.shared(bo) self.params = [self.Wx, self.Wh, self.bh, self.h0, self.Wo, self.bo] thX = T.fmatrix('X') thY = T.ivector('Y') def recurrence(x_t, h_t1): # returns h(t), y(t) h_t = self.f(x_t.dot(self.Wx) + h_t1.dot(self.Wh) + self.bh) y_t = T.nnet.softmax(h_t.dot(self.Wo) + self.bo) return h_t, y_t [h, y], _ = theano.scan( fn=recurrence, outputs_info=[self.h0, None], sequences=thX, n_steps=thX.shape[0], ) py_x = y[:, 0, :] prediction = T.argmax(py_x, axis=1) cost = -T.mean(T.log(py_x[T.arange(thY.shape[0]), thY])) grads = T.grad(cost, self.params) dparams = [theano.shared(p.get_value()*0) for p in self.params] updates = [ (p, p + mu*dp - learning_rate*g) for p, dp, g in zip(self.params, dparams, grads) ] + [ (dp, mu*dp - learning_rate*g) for dp, g in zip(dparams, grads) ] self.predict_op = theano.function(inputs=[thX], outputs=prediction) self.train_op = theano.function( inputs=[thX, thY], outputs=[cost, prediction, y], updates=updates, ) costs = [] for i in range(epochs): X, Y = shuffle(X, Y) n_correct = 0 cost = 0 for j in range(N): c, p, rout = self.train_op(X[j], Y[j]) cost += c if p[-1] == Y[j,-1]: n_correct += 1 print("shape y:", rout.shape) print("i:", i, "cost:", cost, "classification rate:", (float(n_correct) / N)) costs.append(cost) if show_fig: plt.plot(costs) plt.show()
def fit(self, X, Y, learning_rate=1.0, mu=0.99, reg=1.0, activation=T.tanh, epochs=100, show_fig=False): D = X[0].shape[1] # X is of size N x T(n) x D K = len(set(Y.flatten())) N = len(Y) M = self.M self.f = activation # initial weights Wx = init_weight(D, M) Wh = init_weight(M, M) bh = np.zeros(M) h0 = np.zeros(M) Wo = init_weight(M, K) bo = np.zeros(K) # make them theano shared self.Wx = theano.shared(Wx) self.Wh = theano.shared(Wh) self.bh = theano.shared(bh) self.h0 = theano.shared(h0) self.Wo = theano.shared(Wo) self.bo = theano.shared(bo) self.params = [self.Wx, self.Wh, self.bh, self.h0, self.Wo, self.bo] thX = T.fmatrix('X') thY = T.ivector('Y') def recurrence(x_t, h_t1): # returns h(t), y(t) h_t = self.f(x_t.dot(self.Wx) + h_t1.dot(self.Wh) + self.bh) y_t = T.nnet.softmax(h_t.dot(self.Wo) + self.bo) return h_t, y_t [h, y], _ = theano.scan( fn=recurrence, outputs_info=[self.h0, None], sequences=thX, n_steps=thX.shape[0], ) py_x = y[:, 0, :] prediction = T.argmax(py_x, axis=1) cost = -T.mean(T.log(py_x[T.arange(thY.shape[0]), thY])) grads = T.grad(cost, self.params) dparams = [theano.shared(p.get_value() * 0) for p in self.params] updates = [(p, p + mu * dp - learning_rate * g) for p, dp, g in zip(self.params, dparams, grads) ] + [(dp, mu * dp - learning_rate * g) for dp, g in zip(dparams, grads)] self.predict_op = theano.function(inputs=[thX], outputs=prediction) self.train_op = theano.function(inputs=[thX, thY], outputs=[cost, prediction, y], updates=updates) costs = [] for i in range(epochs): X, Y = shuffle(X, Y) n_correct = 0 cost = 0 for j in range(N): c, p, rout = self.train_op(X[j], Y[j]) # print "p:", p cost += c if p[-1] == Y[j, -1]: n_correct += 1 print("shape y:", rout.shape) print("i:", i, "cost:", cost, "classification rate:", (float(n_correct) / N)) costs.append(cost) if show_fig: plt.plot(costs) plt.show()
def fit(self, X, Y, learning_rate=1e-4, mu=0.99, epochs=30, show_fig=True, activation=T.nnet.relu, RecurrentUnit=GRU, normalize=False): D = self.D V = self.V N = len(X) We = init_weight(V, D) self.hidden_layers = [] Mi = D for Mo in self.hidden_layer_sizes: ru = RecurrentUnit(Mi, Mo, activation) self.hidden_layers.append(ru) Mi = Mo Wo = init_weight(Mi, self.K) bo = np.zeros(self.K) self.We = theano.shared(We) self.Wo = theano.shared(Wo) self.bo = theano.shared(bo) self.params = [self.Wo, self.bo] for ru in self.hidden_layers: self.params += ru.params thX = T.ivector('X') thY = T.ivector('Y') Z = self.We[thX] for ru in self.hidden_layers: Z = ru.output(Z) py_x = T.nnet.softmax(Z.dot(self.Wo) + self.bo) testf = theano.function( inputs=[thX], outputs=py_x, ) testout = testf(X[0]) print("py_x.shape:", testout.shape) prediction = T.argmax(py_x, axis=1) cost = -T.mean(T.log(py_x[T.arange(thY.shape[0]), thY])) grads = T.grad(cost, self.params) dparams = [theano.shared(p.get_value()*0) for p in self.params] dWe = theano.shared(self.We.get_value()*0) gWe = T.grad(cost, self.We) dWe_update = mu*dWe - learning_rate*gWe We_update = self.We + dWe_update if normalize: We_update /= We_update.norm(2) updates = [ (p, p + mu*dp - learning_rate*g) for p, dp, g in zip(self.params, dparams, grads) ] + [ (dp, mu*dp - learning_rate*g) for dp, g in zip(dparams, grads) ] + [ (self.We, We_update), (dWe, dWe_update) ] self.cost_predict_op = theano.function( inputs=[thX, thY], outputs=[cost, prediction], allow_input_downcast=True, ) self.train_op = theano.function( inputs=[thX, thY], outputs=[cost, prediction], updates=updates ) costs = [] sequence_indexes = range(N) n_total = sum(len(y) for y in Y) for i in range(epochs): t0 = datetime.now() sequence_indexes = shuffle(sequence_indexes) n_correct = 0 cost = 0 it = 0 for j in sequence_indexes: c, p = self.train_op(X[j], Y[j]) cost += c n_correct += np.sum(p == Y[j]) it += 1 if it % 200 == 0: sys.stdout.write( "j/N: %d/%d correct rate so far: %f, cost so far: %f\r" % (it, N, float(n_correct)/n_total, cost) ) sys.stdout.flush() print( "i:", i, "cost:", cost, "correct rate:", (float(n_correct)/n_total), "time for epoch:", (datetime.now() - t0) ) costs.append(cost) if show_fig: plt.plot(costs) plt.show()
def fit(self, X, Y, learning_rate=1.0, mu=0.99, reg=1.0, activation=T.tanh, epochs=500, show_fig=False): M = self.M V = self.V K = len(set(Y)) print("V:", V) X, Y = shuffle(X, Y) Nvalid = 10 Xvalid, Yvalid = X[-Nvalid:], Y[-Nvalid:] X, Y = X[:-Nvalid], Y[:-Nvalid] N = len(X) # initial weights Wx = init_weight(V, M) Wh = init_weight(M, M) bh = np.zeros(M) h0 = np.zeros(M) Wo = init_weight(M, K) bo = np.zeros(K) thX, thY, py_x, prediction = self.set(Wx, Wh, bh, h0, Wo, bo, activation) cost = -T.mean(T.log(py_x[thY])) grads = T.grad(cost, self.params) dparams = [theano.shared(p.get_value() * 0) for p in self.params] lr = T.scalar('learning_rate') updates = [(p, p + mu * dp - lr * g) for p, dp, g in zip(self.params, dparams, grads)] + [ (dp, mu * dp - lr * g) for dp, g in zip(dparams, grads) ] self.train_op = theano.function( inputs=[thX, thY, lr], outputs=[cost, prediction], updates=updates, allow_input_downcast=True, ) costs = [] for i in range(epochs): X, Y = shuffle(X, Y) n_correct = 0 cost = 0 for j in range(N): # we set 0 to start and 1 to end # print "X[%d]:" % j, X[j], "len:", len(X[j]) c, p = self.train_op(X[j], Y[j], learning_rate) # print "p:", p, "y:", Y[j] cost += c if p == Y[j]: n_correct += 1 # update the learning rate learning_rate *= 0.9999 # calculate validation accuracy n_correct_valid = 0 for j in range(Nvalid): p = self.predict_op(Xvalid[j]) if p == Yvalid[j]: n_correct_valid += 1 print("i:", i, "cost:", cost, "correct rate:", (float(n_correct) / N), end=" ") print("validation correct rate:", (float(n_correct_valid) / Nvalid)) costs.append(cost) if show_fig: plt.plot(costs) plt.show()
def __init__(self, Mi, Mo, activation): self.Mi = Mi self.Mo = Mo self.f = activation Wxi = init_weight(Mi, Mo) Whi = init_weight(Mo, Mo) Wci = init_weight(Mo, Mo) bi = np.zeros(Mo) Wxf = init_weight(Mi, Mo) Whf = init_weight(Mo, Mo) Wcf = init_weight(Mo, Mo) bf = np.zeros(Mo) Wxc = init_weight(Mi, Mo) Whc = init_weight(Mo, Mo) bc = np.zeros(Mo) Wxo = init_weight(Mi, Mo) Who = init_weight(Mo, Mo) Wco = init_weight(Mo, Mo) bo = np.zeros(Mo) # initial hidden state c0 = np.zeros(Mo) h0 = np.zeros(Mo) self.Wxi = theano.shared(Wxi) self.Whi = theano.shared(Whi) self.Wci = theano.shared(Wci) self.bi = theano.shared(bi) self.Wxf = theano.shared(Wxf) self.Whf = theano.shared(Whf) self.Wcf = theano.shared(Wcf) self.bf = theano.shared(bf) self.Wxc = theano.shared(Wxc) self.Whc = theano.shared(Whc) self.bc = theano.shared(bc) self.Wxo = theano.shared(Wxo) self.Who = theano.shared(Who) self.Wco = theano.shared(Wco) self.bo = theano.shared(bo) self.c0 = theano.shared(c0) self.h0 = theano.shared(h0) self.params = [ self.Wxi, self.Whi, self.Wci, self.bi, self.Wxf, self.Whf, self.Wcf, self.bf, self.Wxc, self.Whc, self.bc, self.Wxo, self.Who, self.Wco, self.bo, self.c0, self.h0 ]
def fit(self, X, learning_rate=10e-1, mu=0.99, reg=1.0, activation=T.tanh, epochs=500, show_fig=False): N = len(X) D = self.D M = self.M V = self.V self.f = activation # initial weights We = init_weight(V, D) Wx = init_weight(D, M) Wh = init_weight(M, M) bh = np.zeros(M) h0 = np.zeros(M) Wo = init_weight(M, V) bo = np.zeros(V) # make them theano shared self.We = theano.shared(We) self.Wx = theano.shared(Wx) self.Wh = theano.shared(Wh) self.bh = theano.shared(bh) self.h0 = theano.shared(h0) self.Wo = theano.shared(Wo) self.bo = theano.shared(bo) self.params = [ self.We, self.Wx, self.Wh, self.bh, self.h0, self.Wo, self.bo ] thX = T.ivector('X') Ei = self.We[thX] # will be a TxD matrix thY = T.ivector('Y') # sentence input: # [START, w1, w2, ..., wn] # sentence target: # [w1, w2, w3, ..., END] def recurrence(x_t, h_t1): # returns h(t), y(t) h_t = self.f(x_t.dot(self.Wx) + h_t1.dot(self.Wh) + self.bh) y_t = T.nnet.softmax(h_t.dot(self.Wo) + self.bo) return h_t, y_t [h, y], _ = theano.scan( fn=recurrence, outputs_info=[self.h0, None], sequences=Ei, n_steps=Ei.shape[0], ) py_x = y[:, 0, :] prediction = T.argmax(py_x, axis=1) cost = -T.mean(T.log(py_x[T.arange(thY.shape[0]), thY])) grads = T.grad(cost, self.params) dparams = [theano.shared(p.get_value() * 0) for p in self.params] updates = [(p, p + mu * dp - learning_rate * g) for p, dp, g in zip(self.params, dparams, grads) ] + [(dp, mu * dp - learning_rate * g) for dp, g in zip(dparams, grads)] self.predict_op = theano.function(inputs=[thX], outputs=prediction) self.train_op = theano.function(inputs=[thX, thY], outputs=[cost, prediction], updates=updates) self.costs = [] self.correct_rates = [] n_total = sum((len(sentence) + 1) for sentence in X) for i in range(epochs): X = shuffle(X) n_correct = 0 cost = 0 for j in range(N): # problem! many words --> END token are overrepresented # result: generated lines will be very short # we will try to fix in a later iteration input_sequence = [SimpleRNN.SENTENCE_START] + X[j] output_sequence = X[j] + [SimpleRNN.SENTENCE_END] # we set 0 to start and 1 to end c, p = self.train_op(input_sequence, output_sequence) # print "p:", p cost += c # print "j:", j, "c:", c/len(X[j]+1) for pj, xj in zip(p, output_sequence): if pj == xj: n_correct += 1 correct_rate = n_correct / n_total if (i + 1) % 10 == 0: print("i:", i + 1, "cost:", cost, "correct rate:", correct_rate) self.costs.append(cost) self.correct_rates.append(correct_rate) if show_fig: plt.plot(self.costs) plt.show()
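The input/target pairing used throughout these language-model fits is the standard shift-by-one scheme sketched in the comments. A tiny concrete example with hypothetical word indices, taking START = 0 and END = 1 as the comments state:

# hypothetical indices for a 3-word sentence
sentence = [5, 9, 23]
START, END = 0, 1
input_sequence = [START] + sentence   # [0, 5, 9, 23]
output_sequence = sentence + [END]    # [5, 9, 23, 1]
# at each position t the RNN reads input_sequence[t] and is trained to
# predict output_sequence[t], i.e. the next word (or END at the last step)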
def fit(self, X, Y, learning_rate=1.0, mu=0.99, reg=1.0, activation=tf.tanh, epochs=100, show_fig=False): N, T, D = X.shape K = len(set(Y.flatten())) M = self.M self.f = activation # initial weights Wx = init_weight(D, M).astype(np.float32) Wh = init_weight(M, M).astype(np.float32) bh = np.zeros(M, dtype=np.float32) h0 = np.zeros(M, dtype=np.float32) Wo = init_weight(M, K).astype(np.float32) bo = np.zeros(K, dtype=np.float32) # make them tensorflow variables self.Wx = tf.Variable(Wx) self.Wh = tf.Variable(Wh) self.bh = tf.Variable(bh) self.h0 = tf.Variable(h0) self.Wo = tf.Variable(Wo) self.bo = tf.Variable(bo) tfX = tf.placeholder(tf.float32, shape=(T, D), name='X') tfY = tf.placeholder(tf.int32, shape=(T,), name='Y') XWx = tf.matmul(tfX, self.Wx) def recurrence(h_t1, xw_t): # matmul() only works with 2-D objects # we want to return a 1-D object of size M # so that the final result is T x M # not T x 1 x M h_t = self.f(xw_t + tf.matmul(tf.reshape(h_t1, (1, M)), self.Wh) + self.bh) return tf.reshape(h_t, (M,)) h = tf.scan( fn=recurrence, elems=XWx, initializer=self.h0, ) logits = tf.matmul(h, self.Wo) + self.bo cost = tf.reduce_mean( tf.nn.sparse_softmax_cross_entropy_with_logits( labels=tfY, logits=logits, ) ) predict_op = tf.argmax(logits, 1) train_op = tf.train.AdamOptimizer(1e-2).minimize(cost) init = tf.global_variables_initializer() with tf.Session() as session: session.run(init) costs = [] for i in range(epochs): X, Y = shuffle(X, Y) n_correct = 0 batch_cost = 0 for j in range(N): _, c, p = session.run([train_op, cost, predict_op], feed_dict={tfX: X[j].reshape(T, D), tfY: Y[j]}) batch_cost += c if p[-1] == Y[j,-1]: n_correct += 1 print("i:", i, "cost:", batch_cost, "classification rate:", (float(n_correct)/N)) costs.append(batch_cost) if n_correct == N: break if show_fig: plt.plot(costs) plt.show()
def fit(self, trees, learning_rate=3e-3, mu=0.99, reg=1e-4, epochs=15, activation=T.nnet.relu, train_inner_nodes=False): D = self.D V = self.V K = self.K self.f = activation N = len(trees) We = init_weight(V, D) Wh = np.random.randn(2, D, D) / np.sqrt(2 + D + D) bh = np.zeros(D) Wo = init_weight(D, K) bo = np.zeros(K) self.We = theano.shared(We) self.Wh = theano.shared(Wh) self.bh = theano.shared(bh) self.Wo = theano.shared(Wo) self.bo = theano.shared(bo) self.params = [self.We, self.Wh, self.bh, self.Wo, self.bo] words = T.ivector('words') parents = T.ivector('parents') relations = T.ivector('relations') labels = T.ivector('labels') def recurrence(n, hiddens, words, parents, relations): w = words[n] # any non-word will have index -1 hiddens = T.switch( T.ge(w, 0), T.set_subtensor(hiddens[n], self.We[w]), T.set_subtensor(hiddens[n], self.f(hiddens[n] + self.bh)) ) r = relations[n] # 0 = is_left, 1 = is_right p = parents[n] # parent idx; root will have parent -1 hiddens = T.switch( T.ge(p, 0), T.set_subtensor(hiddens[p], hiddens[p] + hiddens[n].dot(self.Wh[r])), hiddens ) return hiddens hiddens = T.zeros((words.shape[0], D)) h, _ = theano.scan( fn=recurrence, outputs_info=[hiddens], n_steps=words.shape[0], sequences=T.arange(words.shape[0]), non_sequences=[words, parents, relations], ) # shape of h that is returned by scan is TxTxD # because hiddens is TxD, and it does the recurrence T times # technically this stores T times too much data py_x = T.nnet.softmax(h[-1].dot(self.Wo) + self.bo) prediction = T.argmax(py_x, axis=1) rcost = reg*T.mean([(p*p).sum() for p in self.params]) if train_inner_nodes: # won't work for binary classification cost = -T.mean(T.log(py_x[T.arange(labels.shape[0]), labels])) + rcost else: cost = -T.mean(T.log(py_x[-1, labels[-1]])) + rcost grads = T.grad(cost, self.params) dparams = [theano.shared(p.get_value()*0) for p in self.params] updates = [ (p, p + mu*dp - learning_rate*g) for p, dp, g in zip(self.params, dparams, grads) ] + [ (dp, mu*dp - learning_rate*g) for dp, g in zip(dparams, grads) ] self.cost_predict_op = theano.function( inputs=[words, parents, relations, labels], outputs=[cost, prediction], allow_input_downcast=True, ) self.train_op = theano.function( inputs=[words, parents, relations, labels], outputs=[h, cost, prediction], updates=updates ) costs = [] sequence_indexes = list(range(N)) if train_inner_nodes: n_total = sum(len(words) for words, _, _, _ in trees) else: n_total = N for i in range(epochs): t0 = datetime.now() sequence_indexes = shuffle(sequence_indexes) n_correct = 0 cost = 0 it = 0 for j in sequence_indexes: words, par, rel, lab = trees[j] _, c, p = self.train_op(words, par, rel, lab) if np.isnan(c): print("Cost is nan! Let's stop here. Why don't you try decreasing the learning rate?") exit() cost += c if train_inner_nodes: n_correct += np.sum(p == lab) else: n_correct += (p[-1] == lab[-1]) it += 1 if it % 1 == 0: sys.stdout.write("j/N: %d/%d correct rate so far: %f, cost so far: %f\r" % (it, N, float(n_correct)/n_total, cost)) sys.stdout.flush() print("i:", i, "cost:", cost, "correct rate:", (float(n_correct)/n_total), "time for epoch:", (datetime.now() - t0)) costs.append(cost) plt.plot(costs) plt.show()
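This tree format differs from the (left_children, right_children) one above: each node stores its parent index and whether it is a left or right child. The same toy two-leaf tree in this encoding (indices again hypothetical):

import numpy as np

# node 2 is the root; nodes 0 and 1 are its left and right word children
words = np.array([17, 42, -1], dtype=np.int32)       # -1 => internal node
parents = np.array([2, 2, -1], dtype=np.int32)       # root's parent is -1
relations = np.array([0, 1, -1], dtype=np.int32)     # 0 = is_left, 1 = is_right (root's entry is unused)
labels = np.array([-1, -1, 3], dtype=np.int32)
# each node adds its hidden state into its parent through Wh[relation],
# so all children are summed into the root by the end of the scan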
def fit(self, X, Y, batch_sz=20, learning_rate=0.1, mu=0.9, activation=tf.nn.sigmoid, epochs=100, show_fig=False): N, T, D = X.shape # X is of size N x T(n) x D K = len(set(Y.flatten())) M = self.M self.f = activation # initial weights # note: Wx, Wh, bh are all part of the RNN unit and will be created # by BasicRNNCell Wo = init_weight(M, K).astype(np.float32) bo = np.zeros(K, dtype=np.float32) # make them tf variables self.Wo = tf.Variable(Wo) self.bo = tf.Variable(bo) # tf Graph input tfX = tf.compat.v1.placeholder(tf.float32, shape=(batch_sz, T, D), name='inputs') tfY = tf.compat.v1.placeholder(tf.int64, shape=(batch_sz, T), name='targets') # turn tfX into a sequence, e.g. T tensors all of size (batch_sz, D) sequenceX = x2sequence(tfX, T, D, batch_sz) # create the simple rnn unit rnn_unit = BasicRNNCell(num_units=self.M, activation=self.f) # Get rnn cell output # outputs, states = rnn_module.rnn(rnn_unit, sequenceX, dtype=tf.float32) outputs, states = get_rnn_output(rnn_unit, sequenceX, dtype=tf.float32) # outputs are now of size (T, batch_sz, M) # so make it (batch_sz, T, M) outputs = tf.transpose(a=outputs, perm=(1, 0, 2)) outputs = tf.reshape(outputs, (T*batch_sz, M)) # Linear activation, using rnn inner loop last output logits = tf.matmul(outputs, self.Wo) + self.bo predict_op = tf.argmax(input=logits, axis=1) targets = tf.reshape(tfY, (T*batch_sz,)) cost_op = tf.reduce_mean( input_tensor=tf.nn.sparse_softmax_cross_entropy_with_logits( logits=logits, labels=targets ) ) train_op = tf.compat.v1.train.MomentumOptimizer(learning_rate, momentum=mu).minimize(cost_op) costs = [] n_batches = N // batch_sz init = tf.compat.v1.global_variables_initializer() with tf.compat.v1.Session() as session: session.run(init) for i in range(epochs): X, Y = shuffle(X, Y) n_correct = 0 cost = 0 for j in range(n_batches): Xbatch = X[j*batch_sz:(j+1)*batch_sz] Ybatch = Y[j*batch_sz:(j+1)*batch_sz] _, c, p = session.run([train_op, cost_op, predict_op], feed_dict={tfX: Xbatch, tfY: Ybatch}) cost += c for b in range(batch_sz): idx = (b + 1)*T - 1 n_correct += (p[idx] == Ybatch[b][-1]) if i % 10 == 0: print("i:", i, "cost:", cost, "classification rate:", (float(n_correct)/N)) if n_correct == N: print("i:", i, "cost:", cost, "classification rate:", (float(n_correct)/N)) break costs.append(cost) if show_fig: plt.plot(costs) plt.show()
def fit(self, X, Y, learning_rate=1.0, mu=0.99, reg=1.0, activation=T.tanh, epochs=500, show_fig=False): M = self.M V = self.V K = len(set(Y)) print("V:", V) X, Y = shuffle(X, Y) Nvalid = 10 Xvalid, Yvalid = X[-Nvalid:], Y[-Nvalid:] X, Y = X[:-Nvalid], Y[:-Nvalid] N = len(X) # initial weights Wx = init_weight(V, M) Wh = init_weight(M, M) bh = np.zeros(M) h0 = np.zeros(M) Wo = init_weight(M, K) bo = np.zeros(K) thX, thY, py_x, prediction = self.set(Wx, Wh, bh, h0, Wo, bo, activation) cost = -T.mean(T.log(py_x[thY])) grads = T.grad(cost, self.params) dparams = [theano.shared(p.get_value()*0) for p in self.params] lr = T.scalar('learning_rate') updates = [ (p, p + mu*dp - lr*g) for p, dp, g in zip(self.params, dparams, grads) ] + [ (dp, mu*dp - lr*g) for dp, g in zip(dparams, grads) ] self.train_op = theano.function( inputs=[thX, thY, lr], outputs=[cost, prediction], updates=updates, allow_input_downcast=True, ) costs = [] for i in range(epochs): X, Y = shuffle(X, Y) n_correct = 0 cost = 0 for j in range(N): c, p = self.train_op(X[j], Y[j], learning_rate) cost += c if p == Y[j]: n_correct += 1 learning_rate *= 0.9999 n_correct_valid = 0 for j in range(Nvalid): p = self.predict_op(Xvalid[j]) if p == Yvalid[j]: n_correct_valid += 1 print("i:", i, "cost:", cost, "correct rate:", (float(n_correct)/N), end=" ") print("validation correct rate:", (float(n_correct_valid)/Nvalid)) costs.append(cost) if show_fig: plt.plot(costs) plt.show()
def __init__(self, Mi, Mo): W = init_weight(Mi, Mo) b = np.zeros(Mo) self.W = theano.shared(W) self.b = theano.shared(b) self.params = [self.W, self.b]
class RNN: def __init__(self, D, hidden_layer_sizes, V): self.hidden_layer_sizes = hidden_layer_sizes self.D = D self.V = V def fit(self, X, learning_rate=1e-4, mu=0.99, epochs=10, batch_sz=100, show_fig=True, activation=T.nnet.relu, RecurrentUnit=LSTM): D = self.D V = self.V N = len(X) We = init_weight(V, D) self.hidden_layers = [] Mi = D for Mo in self.hidden_layer_sizes: ru = RecurrentUnit(Mi, Mo, activation) self.hidden_layers.append(ru) Mi = Mo Wo = init_weight(Mi, V) bo = np.zeros(V) self.We = theano.shared(We) self.Wo = theano.shared(Wo) self.bo = theano.shared(bo) self.params = [self.We, self.Wo, self.bo] for ru in self.hidden_layers:
Ytest = tf.keras.preprocessing.sequence.pad_sequences(Ytest, maxlen=sequence_length) print("Xtrain.shape:", Xtrain.shape) print("Ytrain.shape:", Ytrain.shape) # inputs inputs = tf.placeholder(tf.int32, shape=(None, sequence_length)) targets = tf.placeholder(tf.int32, shape=(None, sequence_length)) num_samples = tf.shape(inputs)[0] # useful for later # embedding We = np.random.randn(V, embedding_dim).astype(np.float32) # output layer Wo = init_weight(hidden_layer_size, K).astype(np.float32) bo = np.zeros(K).astype(np.float32) # make them tensorflow variables tfWe = tf.Variable(We) tfWo = tf.Variable(Wo) tfbo = tf.Variable(bo) # make the rnn unit rnn_unit = GRUCell(num_units=hidden_layer_size, activation=tf.nn.relu) # get the output x = tf.nn.embedding_lookup(tfWe, inputs) # converts x from a tensor of shape N x T x M
def fit(self, X, learning_rate=1e-5, mu=0.99, epochs=10, show_fig=True, activation=T.nnet.relu, RecurrentUnit=GRU, normalize=True): D = self.D V = self.V N = len(X) We = init_weight(V, D) self.hidden_layers = [] Mi = D for Mo in self.hidden_layer_sizes: ru = RecurrentUnit(Mi, Mo, activation) self.hidden_layers.append(ru) Mi = Mo Wo = init_weight(Mi, V) bo = np.zeros(V) self.We = theano.shared(We) self.Wo = theano.shared(Wo) self.bo = theano.shared(bo) self.params = [self.Wo, self.bo] for ru in self.hidden_layers: self.params += ru.params thX = T.ivector('X') thY = T.ivector('Y') Z = self.We[thX] for ru in self.hidden_layers: Z = ru.output(Z) py_x = T.nnet.softmax(Z.dot(self.Wo) + self.bo) prediction = T.argmax(py_x, axis=1) # let's return py_x too so we can draw a sample instead self.predict_op = theano.function( inputs=[thX], outputs=[py_x, prediction], allow_input_downcast=True, ) cost = -T.mean(T.log(py_x[T.arange(thY.shape[0]), thY])) grads = T.grad(cost, self.params) dparams = [theano.shared(p.get_value()*0) for p in self.params] dWe = theano.shared(self.We.get_value()*0) gWe = T.grad(cost, self.We) dWe_update = mu*dWe - learning_rate*gWe We_update = self.We + dWe_update if normalize: We_update /= We_update.norm(2) updates = [ (p, p + mu*dp - learning_rate*g) for p, dp, g in zip(self.params, dparams, grads) ] + [ (dp, mu*dp - learning_rate*g) for dp, g in zip(dparams, grads) ] + [ (self.We, We_update), (dWe, dWe_update) ] self.train_op = theano.function( inputs=[thX, thY], outputs=[cost, prediction], updates=updates ) costs = []
def fit(self, X, learning_rate=1e-4, mu=0.99, epochs=10, show_fig=True, activation=T.nnet.relu, RecurrentUnit=GRU, normalize=True): D = self.D V = self.V N = len(X) We = init_weight(V, D) self.hidden_layers = [] Mi = D for Mo in self.hidden_layer_sizes: ru = RecurrentUnit(Mi, Mo, activation) self.hidden_layers.append(ru) Mi = Mo Wo = init_weight(Mi, V) bo = np.zeros(V) self.We = theano.shared(We) self.Wo = theano.shared(Wo) self.bo = theano.shared(bo) self.params = [self.Wo, self.bo] for ru in self.hidden_layers: self.params += ru.params thX = T.ivector('X') thY = T.ivector('Y') Z = self.We[thX] for ru in self.hidden_layers: Z = ru.output(Z) py_x = T.nnet.softmax(Z.dot(self.Wo) + self.bo) prediction = T.argmax(py_x, axis=1) # let's return py_x too so we can draw a sample instead self.predict_op = theano.function( inputs=[thX], outputs=[py_x, prediction], allow_input_downcast=True, ) cost = -T.mean(T.log(py_x[T.arange(thY.shape[0]), thY])) grads = T.grad(cost, self.params) dparams = [theano.shared(p.get_value() * 0) for p in self.params] dWe = theano.shared(self.We.get_value() * 0) gWe = T.grad(cost, self.We) dWe_update = mu * dWe - learning_rate * gWe We_update = self.We + dWe_update if normalize: We_update /= We_update.norm(2) updates = [(p, p + mu * dp - learning_rate * g) for p, dp, g in zip(self.params, dparams, grads)] + [ (dp, mu * dp - learning_rate * g) for dp, g in zip(dparams, grads) ] + [(self.We, We_update), (dWe, dWe_update)] self.train_op = theano.function(inputs=[thX, thY], outputs=[cost, prediction], updates=updates) costs = [] for i in range(epochs): t0 = datetime.now() X = shuffle(X) n_correct = 0 n_total = 0 cost = 0 for j in range(N): if np.random.random() < 0.01 or len(X[j]) <= 1: input_sequence = [0] + X[j] output_sequence = X[j] + [1] else: input_sequence = [0] + X[j][:-1] output_sequence = X[j] n_total += len(output_sequence) # test: try: # we set 0 to start and 1 to end c, p = self.train_op(input_sequence, output_sequence) except Exception as e: PYX, pred = self.predict_op(input_sequence) print("input_sequence len:", len(input_sequence)) print("PYX.shape:", PYX.shape) print("pred.shape:", pred.shape) raise e # print "p:", p cost += c # print "j:", j, "c:", c/len(X[j]+1) for pj, xj in zip(p, output_sequence): if pj == xj: n_correct += 1 if j % 200 == 0: sys.stdout.write("j/N: %d/%d correct rate so far: %f\r" % (j, N, float(n_correct) / n_total)) sys.stdout.flush() print("i:", i, "cost:", cost, "correct rate:", (float(n_correct) / n_total), "time for epoch:", (datetime.now() - t0)) costs.append(cost) if show_fig: plt.plot(costs) plt.show()
def fit(self, X, Y, batch_sz=20, learning_rate=1.0, mu=0.99, reg=1.0, activation=T.tanh, epochs=100, show_fig=False): D = X[0].shape[1] # X is of size N x T(n) x D K = len(set(Y.flatten())) N = len(Y) M = self.M self.f = activation # initial weights Wx = init_weight(D, M) Wh = init_weight(M, M) bh = np.zeros(M) h0 = np.zeros(M) Wo = init_weight(M, K) bo = np.zeros(K) # make them theano shared self.Wx = theano.shared(Wx) self.Wh = theano.shared(Wh) self.bh = theano.shared(bh) self.h0 = theano.shared(h0) self.Wo = theano.shared(Wo) self.bo = theano.shared(bo) self.params = [self.Wx, self.Wh, self.bh, self.h0, self.Wo, self.bo] thX = T.fmatrix('X') # will represent multiple batches concatenated thY = T.ivector('Y') thStartPoints = T.ivector('start_points') XW = thX.dot(self.Wx) # startPoints will contain 1 where a sequence starts and 0 otherwise # Ex. if I have 3 sequences: [[1,2,3], [4,5], [6,7,8]] # Then I will concatenate these into one X: [1,2,3,4,5,6,7,8] # And startPoints will be [1,0,0,1,0,1,0,0] # One possible solution: loop through index # def recurrence(t, h_t1, XW, h0, startPoints): # # returns h(t) # # if at a boundary, state should be h0 # h_t = T.switch( # T.eq(startPoints[t], 1), # self.f(XW[t] + h0.dot(self.Wh) + self.bh), # self.f(XW[t] + h_t1.dot(self.Wh) + self.bh) # ) # return h_t # h, _ = theano.scan( # fn=recurrence, # outputs_info=[self.h0], # sequences=T.arange(XW.shape[0]), # non_sequences=[XW, self.h0, thStartPoints], # n_steps=XW.shape[0], # ) # other solution - loop through all sequences simultaneously def recurrence(xw_t, is_start, h_t1, h0): # if at a boundary, state should be h0 h_t = T.switch(T.eq(is_start, 1), self.f(xw_t + h0.dot(self.Wh) + self.bh), self.f(xw_t + h_t1.dot(self.Wh) + self.bh)) return h_t h, _ = theano.scan( fn=recurrence, outputs_info=[self.h0], sequences=[XW, thStartPoints], non_sequences=[self.h0], n_steps=XW.shape[0], ) # h is of shape (T*batch_sz, M) py_x = T.nnet.softmax(h.dot(self.Wo) + self.bo) prediction = T.argmax(py_x, axis=1) cost = -T.mean(T.log(py_x[T.arange(thY.shape[0]), thY])) grads = T.grad(cost, self.params) dparams = [theano.shared(p.get_value() * 0) for p in self.params] updates = [(p, p + mu * dp - learning_rate * g) for p, dp, g in zip(self.params, dparams, grads) ] + [(dp, mu * dp - learning_rate * g) for dp, g in zip(dparams, grads)] # self.predict_op = theano.function(inputs=[thX, thStartPoints], outputs=prediction) self.train_op = theano.function(inputs=[thX, thY, thStartPoints], outputs=[cost, prediction, py_x], updates=updates) costs = [] n_batches = N // batch_sz sequenceLength = X.shape[1] # if each sequence was of variable length, we would need to # initialize this inside the loop for every new batch startPoints = np.zeros(sequenceLength * batch_sz, dtype=np.int32) for b in range(batch_sz): startPoints[b * sequenceLength] = 1 for i in range(epochs): X, Y = shuffle(X, Y) n_correct = 0 cost = 0 for j in range(n_batches): Xbatch = X[j * batch_sz:(j + 1) * batch_sz].reshape( sequenceLength * batch_sz, D) Ybatch = Y[j * batch_sz:(j + 1) * batch_sz].reshape( sequenceLength * batch_sz).astype(np.int32) c, p, rout = self.train_op(Xbatch, Ybatch, startPoints) # print "p:", p cost += c # P = p.reshape(batch_sz, sequenceLength) for b in range(batch_sz): idx = sequenceLength * (b + 1) - 1 if p[idx] == Ybatch[idx]: n_correct += 1 # else: # print "pred:", p[idx], "actual:", Ybatch[idx] if i % 10 == 0: print("shape y:", rout.shape) print("i:", i, "cost:", cost, "classification rate:", (float(n_correct) / N)) if n_correct == N: 
print("i:", i, "cost:", cost, "classification rate:", (float(n_correct) / N)) break costs.append(cost) if show_fig: plt.plot(costs) plt.show()
def fit(self, X, Y, learning_rate=10e-1, mu=0.99, reg=1.0, activation=T.tanh, epochs=100, show_fig=False): # define all the sizes D = X[0].shape[1] K = len(set(Y.flatten())) N = len(Y) M = self.M self.f = activation # initialize weights Wx = init_weight(D, M) Wh = init_weight(M, M) bh = np.zeros(M) h0 = np.zeros(M) Wo = init_weight(M, K) bo = np.zeros(K) # turn to theano shared variables self.Wx = theano.shared(Wx) self.Wh = theano.shared(Wh) self.bh = theano.shared(bh) self.h0 = theano.shared(h0) self.Wo = theano.shared(Wo) self.bo = theano.shared(bo) self.params = [self.Wx, self.Wh, self.bh, self.h0, self.Wo, self.bo] # define theano inputs outputs thX = T.fmatrix("X") thY = T.ivector("Y") # define recurrence def recurrence(x_t, h_t1): # x_t: current x # h_t1: previous h # returns h(t) and y(t) h_t = self.f(x_t.dot(self.Wx) + h_t1.dot(self.Wh) + self.bh) y_t = T.nnet.softmax(h_t.dot(self.Wo) + self.bo) return h_t, y_t # theano scan function [h, y], _ = theano.scan(fn=recurrence, outputs_info=[self.h0, None], sequences=thX, n_steps=thX.shape[0]) # define output py_x = y[:, 0, :] prediction = T.argmax(py_x, axis=1) cost = -T.mean(T.log(py_x[T.arange(thY.shape[0]), thY])) grads = T.grad(cost, self.params) dparams = [theano.shared(p.get_value() * 0) for p in self.params] # updates updates = [(p, p + mu * dp - learning_rate * g) for p, dp, g in zip(self.params, dparams, grads) ] + [(dp, mu * dp - learning_rate * g) for dp, g in zip(dparams, grads)] # theano function predict_op = theano.function( inputs=[thX], outputs=prediction, ) train_op = theano.function(inputs=[thX, thY], outputs=[cost, prediction, y], updates=updates) # main training loop costs = [] for i in range(epochs): print("epoch:", i) X, Y = shuffle(X, Y) n_correct = 0 cost = 0 for j in range(N): c, p, rout = train_op(X[j], Y[j]) cost += c if p[-1] == Y[j, -1]: n_correct += 1 print("shape y:", Y.shape) print("i:", i, "cost:", cost, "classification rate:", (float(n_correct)) / N) costs.append(cost) if show_fig: plt.plot(costs) plt.show()
class RecursiveNN: def __init__(self, V, D, K, activation=T.tanh): self.V = V self.D = D self.K = K self.f = activation def fit(self, trees, reg=1e-3, epochs=8, train_inner_nodes=False): D = self.D V = self.V K = self.K N = len(trees) We = init_weight(V, D) W11 = np.random.randn(D, D, D) / np.sqrt(3*D) W22 = np.random.randn(D, D, D) / np.sqrt(3*D) W12 = np.random.randn(D, D, D) / np.sqrt(3*D) W1 = init_weight(D, D) W2 = init_weight(D, D) bh = np.zeros(D) Wo = init_weight(D, K) bo = np.zeros(K) self.We = theano.shared(We) self.W11 = theano.shared(W11) self.W22 = theano.shared(W22) self.W12 = theano.shared(W12) self.W1 = theano.shared(W1) self.W2 = theano.shared(W2)
def fit(self, X, learning_rate=1e-4, mu=0.99, epochs=10, batch_sz=100, show_fig=True, activation=T.nnet.relu, RecurrentUnit=GRU): D = self.D V = self.V N = len(X) We = init_weight(V, D) self.hidden_layers = [] Mi = D for Mo in self.hidden_layer_sizes: ru = RecurrentUnit(Mi, Mo, activation) self.hidden_layers.append(ru) Mi = Mo Wo = init_weight(Mi, V) bo = np.zeros(V) self.We = theano.shared(We) self.Wo = theano.shared(Wo) self.bo = theano.shared(bo) self.params = [self.We, self.Wo, self.bo] for ru in self.hidden_layers: self.params += ru.params thX = T.ivector('X') # will represent multiple batches concatenated thY = T.ivector('Y') # represents next word thStartPoints = T.ivector('start_points') Z = self.We[thX] for ru in self.hidden_layers: Z = ru.output(Z, thStartPoints) py_x = T.nnet.softmax(Z.dot(self.Wo) + self.bo) prediction = T.argmax(py_x, axis=1) cost = -T.mean(T.log(py_x[T.arange(thY.shape[0]), thY])) grads = T.grad(cost, self.params) dparams = [theano.shared(p.get_value()*0) for p in self.params] updates = [ (p, p + mu*dp - learning_rate*g) for p, dp, g in zip(self.params, dparams, grads) ] + [ (dp, mu*dp - learning_rate*g) for dp, g in zip(dparams, grads) ] # self.predict_op = theano.function(inputs=[thX, thStartPoints], outputs=prediction) self.train_op = theano.function( inputs=[thX, thY, thStartPoints], outputs=[cost, prediction], updates=updates ) costs = [] n_batches = N // batch_sz for i in range(epochs): t0 = datetime.now() X = shuffle(X) n_correct = 0 n_total = 0 cost = 0 for j in range(n_batches): # construct input sequence and output sequence as # concatenation of multiple input sequences and output sequences # input X should be a list of 2-D arrays or one 3-D array # N x T(n) x D - batch size x sequence length x num features # sequence length can be variable sequenceLengths = [] input_sequence = [] output_sequence = [] for k in range(j*batch_sz, (j+1)*batch_sz): # don't always add the end token if np.random.random() < 0.01 or len(X[k]) <= 1: input_sequence += [0] + X[k] output_sequence += X[k] + [1] sequenceLengths.append(len(X[k]) + 1) else: input_sequence += [0] + X[k][:-1] output_sequence += X[k] sequenceLengths.append(len(X[k])) n_total += len(output_sequence) startPoints = np.zeros(len(output_sequence), dtype=np.int32) last = 0 for length in sequenceLengths: startPoints[last] = 1 last += length c, p = self.train_op(input_sequence, output_sequence, startPoints) cost += c for pj, xj in zip(p, output_sequence): if pj == xj: n_correct += 1 if j % 1 == 0: sys.stdout.write("j/n_batches: %d/%d correct rate so far: %f\r" % (j, n_batches, float(n_correct)/n_total)) sys.stdout.flush() print("i:", i, "cost:", cost, "correct rate:", (float(n_correct)/n_total), "time for epoch:", (datetime.now() - t0)) costs.append(cost) if show_fig: plt.plot(costs) plt.show()
def fit(self, X, Y, learning_rate=0.01, mu=0.99, epochs=30, batch_sz=100): N, D = X.shape K = len(set(Y)) self.hidden_layers = [] Mi = D for Mo in self.hidden_layer_sizes: h = HiddenLayer(Mi, Mo) self.hidden_layers.append(h) Mi = Mo W = init_weight(Mi, K) b = np.zeros(K) self.W = theano.shared(W) self.b = theano.shared(b) self.params = [self.W, self.b] self.allWs = [] for h in self.hidden_layers: self.params += h.params self.allWs.append(h.W) self.allWs.append(self.W) X_in = T.matrix('X_in') targets = T.ivector('Targets') pY = self.forward(X_in) cost = -T.mean(T.log(pY[T.arange(pY.shape[0]), targets])) prediction = self.predict(X_in) dparams = [theano.shared(p.get_value() * 0) for p in self.params] grads = T.grad(cost, self.params) updates = [(p, p + mu * dp - learning_rate * g) for p, dp, g in zip(self.params, dparams, grads) ] + [(dp, mu * dp - learning_rate * g) for dp, g in zip(dparams, grads)] train_op = theano.function( inputs=[X_in, targets], outputs=[cost, prediction], updates=updates, ) n_batches = N // batch_sz costs = [] lastWs = [W.get_value() for W in self.allWs] W_changes = [] for i in range(epochs): print("epoch", i) X, Y = shuffle(X, Y) for j in range(n_batches): Xbatch = X[j * batch_sz:(j + 1) * batch_sz, :] Ybatch = Y[j * batch_sz:(j + 1) * batch_sz] c, Yhat = train_op(Xbatch, Ybatch) if j % 100 == 0: error = error_rate(Ybatch, Yhat) print("i:%d\tj:%d\tnb:%d\tcost:%.6f\terror:%.3f\t" % (i, j, n_batches, c, error)) costs.append(c) W_change = [ np.abs(W.get_value() - lastW).mean() for W, lastW in zip(self.allWs, lastWs) ] W_changes.append(W_change) lastWs = [W.get_value() for W in self.allWs] W_changes = np.array(W_changes) plt.subplot(2, 1, 1) for i in range(W_changes.shape[1]): plt.plot(W_changes[:, i], label='layer %d' % i) plt.legend() plt.subplot(2, 1, 2) plt.plot(costs) plt.show()
def fit(self, X, learning_rate=1.0, mu=0.99, reg=1.0, activation=T.tanh, epochs=500, show_fig=False): N = len(X) # Number of training samples. D = self.D M = self.M V = self.V self.f = activation # initial weights We = init_weight(V, D) Wx = init_weight(D, M) Wh = init_weight(M, M) bh = np.zeros(M) h0 = np.zeros(M) Wo = init_weight(M, V) bo = np.zeros(V) self.We = theano.shared(We) self.Wx = theano.shared(Wx) self.Wh = theano.shared(Wh) self.bh = theano.shared(bh) self.h0 = theano.shared(h0) self.Wo = theano.shared(Wo) self.bo = theano.shared(bo) self.params = [ self.We, self.Wx, self.Wh, self.bh, self.h0, self.Wo, self.bo ] thX = T.ivector('X') # sequence of indexes. Ei = self.We[thX] # returns a TxD matrix thY = T.ivector('Y') def recurrence(x_t, h_t1): # returns h(t), y(t) h_t = self.f(x_t.dot(self.Wx) + h_t1.dot(self.Wh) + self.bh) y_t = T.nnet.softmax(h_t.dot(self.Wo) + self.bo) return h_t, y_t [h, y], _ = theano.scan(fn=recurrence, outputs_info=[self.h0, None], sequences=Ei, n_steps=Ei.shape[0]) # y is still symbolic here, so its shape is only known at runtime py_x = y[:, 0, :] prediction = T.argmax(py_x, axis=1) cost = -T.mean(T.log(py_x[T.arange(thY.shape[0]), thY])) grads = T.grad(cost, self.params) # returns gradient of cost with all params dparams = [theano.shared(p.get_value() * 0) for p in self.params] updates = [(p, p + mu * dp - learning_rate * g) for p, dp, g in zip(self.params, dparams, grads) ] + [(dp, mu * dp - learning_rate * g) for dp, g in zip(dparams, grads)] self.predict_op = theano.function(inputs=[thX], outputs=prediction) self.train_op = theano.function(inputs=[thX, thY], outputs=[cost, prediction], updates=updates) costs = [] n_total = sum((len(sentence) + 1) for sentence in X) for i in range(epochs): X = shuffle(X) n_correct = 0 cost = 0 for j in range(N): input_sequence = [0] + X[j] output_sequence = X[j] + [1] c, p = self.train_op(input_sequence, output_sequence) cost += c for pj, xj in zip(p, output_sequence): if pj == xj: n_correct += 1 print( f'epoch: {i}, cost: {cost}, correct_rate: {float(n_correct) / n_total}' ) costs.append(cost) if show_fig: plt.plot(costs) plt.show()
def fit(self, trees, learning_rate=3e-3, mu=0.99, reg=1e-4, epochs=15, activation=T.nnet.relu, train_inner_nodes=False): D = self.D V = self.V K = self.K self.f = activation N = len(trees) We = init_weight(V, D) Wh = np.random.randn(2, D, D) / np.sqrt(2 + D + D) bh = np.zeros(D) Wo = init_weight(D, K) bo = np.zeros(K) self.We = theano.shared(We) self.Wh = theano.shared(Wh) self.bh = theano.shared(bh) self.Wo = theano.shared(Wo) self.bo = theano.shared(bo) self.params = [self.We, self.Wh, self.bh, self.Wo, self.bo] words = T.ivector('words') parents = T.ivector('parents') relations = T.ivector('relations') labels = T.ivector('labels') def recurrence(n, hiddens, words, parents, relations): w = words[n] # any non-word will have index -1 hiddens = T.switch( T.ge(w, 0), T.set_subtensor(hiddens[n], self.We[w]), T.set_subtensor(hiddens[n], self.f(hiddens[n] + self.bh))) r = relations[n] # 0 = is_left, 1 = is_right p = parents[n] # parent idx # root will have parent -1, so only update the parent when p >= 0; # a plain python "if" cannot branch on a symbolic tensor here hiddens = T.switch( T.ge(p, 0), T.set_subtensor(hiddens[p], hiddens[p] + hiddens[n].dot(self.Wh[r])), hiddens) return hiddens hiddens = T.zeros((words.shape[0], D)) h, _ = theano.scan( fn=recurrence, outputs_info=[hiddens], n_steps=words.shape[0], sequences=T.arange(words.shape[0]), non_sequences=[words, parents, relations], ) # h is T x T x D; h[-1] is the final, fully-filled T x D hidden matrix py_x = T.nnet.softmax(h[-1].dot(self.Wo) + self.bo) prediction = T.argmax(py_x, axis=1) rcost = reg * T.mean([(p * p).sum() for p in self.params]) if train_inner_nodes: cost = -T.mean(T.log(py_x[T.arange(labels.shape[0]), labels])) + rcost else: cost = -T.mean(T.log(py_x[-1, labels[-1]])) + rcost grads = T.grad(cost, self.params) dparams = [theano.shared(p.get_value() * 0) for p in self.params] updates = [(p, p + mu * dp - learning_rate * g) for p, dp, g in zip(self.params, dparams, grads) ] + [(dp, mu * dp - learning_rate * g) for dp, g in zip(dparams, grads)] self.cost_predict_op = theano.function( inputs=[words, parents, relations, labels], outputs=[cost, prediction], allow_input_downcast=True, ) self.train_op = theano.function( inputs=[words, parents, relations, labels], outputs=[cost, prediction], updates=updates) costs = [] sequence_indexes = list(range(N)) if train_inner_nodes: n_total = sum(len(words) for words, _, _, _ in trees) else: n_total = N for i in range(epochs): t0 = datetime.now() sequence_indexes = shuffle(sequence_indexes) n_correct = 0 cost = 0 it = 0 for j in sequence_indexes: words, par, rel, lab = trees[j] c, p = self.train_op(words, par, rel, lab) if np.isnan(c): print("Cost is nan! Let's stop here. Why don't you try decreasing the learning rate?") exit() cost += c if train_inner_nodes: n_correct += np.sum(p == lab) else: n_correct += (p[-1] == lab[-1]) it += 1 if it % 1 == 0: sys.stdout.write( "j/N: %d/%d correct rate so far: %f, cost so far: %f\r" % (it, N, float(n_correct) / n_total, cost)) sys.stdout.flush() print("i:", i, "cost:", cost, "correct rate:", (float(n_correct) / n_total), "time for epoch:", (datetime.now() - t0)) costs.append(cost) plt.plot(costs) plt.show()