def __init__(self, V, D, K, activation):
    self.D = D
    self.f = activation

    # word embedding
    We = init_weight(V, D)

    # linear terms
    W1 = init_weight(D, D)
    W2 = init_weight(D, D)

    # bias
    bh = np.zeros(D)

    # output layer
    Wo = init_weight(D, K)
    bo = np.zeros(K)

    # make the updatable tensorflow variables
    self.We = tf.Variable(We.astype(np.float32))
    self.W1 = tf.Variable(W1.astype(np.float32))
    self.W2 = tf.Variable(W2.astype(np.float32))
    self.bh = tf.Variable(bh.astype(np.float32))
    self.Wo = tf.Variable(Wo.astype(np.float32))
    self.bo = tf.Variable(bo.astype(np.float32))

    self.params = [self.We, self.W1, self.W2, self.Wo]
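The constructor above relies on an `init_weight` helper that is not shown in the listing. A minimal sketch, assuming the usual fan-in/fan-out scaled Gaussian initialization (the exact scaling is an assumption, not taken from the original):

import numpy as np

def init_weight(M1, M2):
    # Hypothetical helper: M1 x M2 matrix of small random values,
    # scaled by the layer's fan-in + fan-out.
    return np.random.randn(M1, M2) / np.sqrt(M1 + M2)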
def __init__(self, Mi, Mo, activation):
    self.Mi = Mi
    self.Mo = Mo
    self.f = activation

    # numpy init
    Wxr = init_weight(Mi, Mo)
    Whr = init_weight(Mo, Mo)
    br = np.zeros(Mo)
    Wxz = init_weight(Mi, Mo)
    Whz = init_weight(Mo, Mo)
    bz = np.zeros(Mo)
    Wxh = init_weight(Mi, Mo)
    Whh = init_weight(Mo, Mo)
    bh = np.zeros(Mo)
    h0 = np.zeros(Mo)

    # theano vars
    self.Wxr = theano.shared(Wxr)
    self.Whr = theano.shared(Whr)
    self.br = theano.shared(br)
    self.Wxz = theano.shared(Wxz)
    self.Whz = theano.shared(Whz)
    self.bz = theano.shared(bz)
    self.Wxh = theano.shared(Wxh)
    self.Whh = theano.shared(Whh)
    self.bh = theano.shared(bh)
    self.h0 = theano.shared(h0)
    self.params = [
        self.Wxr, self.Whr, self.br,
        self.Wxz, self.Whz, self.bz,
        self.Wxh, self.Whh, self.bh,
        self.h0,
    ]
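The listing only shows the GRU's constructor, but fit() later calls ru.output(Z). A minimal sketch of the forward-pass methods this class would need, assuming the standard GRU recurrence (reset gate r, update gate z, candidate state); these are meant to sit on the class above, not standalone functions:

def recurrence(self, x_t, h_t1):
    # one GRU step: gate the previous hidden state, blend it with the candidate
    r = T.nnet.sigmoid(x_t.dot(self.Wxr) + h_t1.dot(self.Whr) + self.br)
    z = T.nnet.sigmoid(x_t.dot(self.Wxz) + h_t1.dot(self.Whz) + self.bz)
    hhat = self.f(x_t.dot(self.Wxh) + (r * h_t1).dot(self.Whh) + self.bh)
    return (1 - z) * h_t1 + z * hhat

def output(self, x):
    # x is a T x Mi sequence; scan applies the recurrence over the time axis
    h, _ = theano.scan(
        fn=self.recurrence,
        sequences=x,
        outputs_info=[self.h0],
        n_steps=x.shape[0],
    )
    return h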
def fit(self, trees, test_trees, reg=1e-3, epochs=8, train_inner_nodes=False):
    D = self.D
    V = self.V
    K = self.K
    N = len(trees)

    We = init_weight(V, D)
    W11 = np.random.randn(D, D, D) / np.sqrt(3 * D)
    W22 = np.random.randn(D, D, D) / np.sqrt(3 * D)
    W12 = np.random.randn(D, D, D) / np.sqrt(3 * D)
    W1 = init_weight(D, D)
    W2 = init_weight(D, D)
    bh = np.zeros(D)
    Wo = init_weight(D, K)
    bo = np.zeros(K)

    self.We = tf.Variable(We.astype(np.float32))
    self.W11 = tf.Variable(W11.astype(np.float32))
    self.W22 = tf.Variable(W22.astype(np.float32))
    self.W12 = tf.Variable(W12.astype(np.float32))
    self.W1 = tf.Variable(W1.astype(np.float32))
    self.W2 = tf.Variable(W2.astype(np.float32))
    self.bh = tf.Variable(bh.astype(np.float32))
    self.Wo = tf.Variable(Wo.astype(np.float32))
    self.bo = tf.Variable(bo.astype(np.float32))
    self.weights = [
        self.We, self.W11, self.W22, self.W12, self.W1, self.W2, self.Wo
    ]

    words = tf.placeholder(tf.int32, shape=(None,), name='words')
    left_children = tf.placeholder(tf.int32, shape=(None,), name='left_children')
    right_children = tf.placeholder(tf.int32, shape=(None,), name='right_children')
    labels = tf.placeholder(tf.int32, shape=(None,), name='labels')

    # save for later
    self.words = words
    self.left = left_children
    self.right = right_children
    self.labels = labels

    def dot1(a, B):
        return tf.tensordot(a, B, axes=[[0], [1]])

    def dot2(B, a):
        return tf.tensordot(B, a, axes=[[1], [0]])

    def recursive_net_transform(hiddens, n):
        h_left = hiddens.read(left_children[n])
        h_right = hiddens.read(right_children[n])
        return self.f(
            dot1(h_left, dot2(self.W11, h_left)) +
            dot1(h_right, dot2(self.W22, h_right)) +
            dot1(h_left, dot2(self.W12, h_right)) +
            dot1(h_left, self.W1) +
            dot1(h_right, self.W2) +
            self.bh)

    def recurrence(hiddens, n):
        w = words[n]
        # any non-word (internal) node has index -1
        h_n = tf.cond(w >= 0,
                      lambda: tf.nn.embedding_lookup(self.We, w),
                      lambda: recursive_net_transform(hiddens, n))
        hiddens = hiddens.write(n, h_n)
        n = tf.add(n, 1)
        return hiddens, n

    def condition(hiddens, n):
        # loop should continue while n < len(words)
        return tf.less(n, tf.shape(words)[0])

    hiddens = tf.TensorArray(
        tf.float32,
        size=0,
        dynamic_size=True,
        clear_after_read=False,
        infer_shape=False,
    )

    hiddens, _ = tf.while_loop(condition,
                               recurrence,
                               [hiddens, tf.constant(0)],
                               parallel_iterations=1)
    h = hiddens.stack()
    logits = tf.matmul(h, self.Wo) + self.bo

    prediction_op = tf.argmax(logits, axis=1)
    self.prediction_op = prediction_op

    rcost = reg * sum(tf.nn.l2_loss(p) for p in self.weights)
    if train_inner_nodes:
        labeled_indices = tf.where(labels >= 0)
        cost_op = tf.reduce_mean(
            tf.nn.sparse_softmax_cross_entropy_with_logits(
                logits=tf.gather(logits, labeled_indices),
                labels=tf.gather(labels, labeled_indices),
            )) + rcost
    else:
        cost_op = tf.reduce_mean(
            tf.nn.sparse_softmax_cross_entropy_with_logits(
                logits=logits[-1],
                labels=labels[-1],
            )) + rcost

    # might have to swap out for a momentum optimizer if using a GPU
    train_op = tf.train.AdagradOptimizer(learning_rate=1e-4).minimize(cost_op)

    self.session = tf.InteractiveSession()
    init_op = tf.global_variables_initializer()
    self.session.run(init_op)

    costs = []
    sequence_indexes = list(range(N))
    for i in range(epochs):
        t0 = datetime.now()
        sequence_indexes = shuffle(sequence_indexes)
        n_correct = 0
        n_total = 0
        cost = 0
        it = 0
        # use a single example per update - stochastic gradient descent
        for j in sequence_indexes:
            words_, left, right, lab = trees[j]
            # print("words_:", words_)
            # print("lab:", lab)
            c, p, _ = self.session.run(
                (cost_op, prediction_op, train_op),
                feed_dict={
                    words: words_,
                    left_children: left,
                    right_children: right,
                    labels: lab
                })
            if np.isnan(c):
                print("Cost is nan! Try decreasing the learning rate.")
                for w in self.weights:
                    print(self.session.run(w).sum())
                exit()
            cost += c
            n_correct += (p[-1] == lab[-1])
            n_total += 1

            it += 1
            if it % 10 == 0:
                sys.stdout.write(
                    "j/N: %d/%d correct rate so far: %f, cost so far: %f\r" %
                    (it, N, float(n_correct) / n_total, cost))
                sys.stdout.flush()

        # calculate the test score
        n_test_correct = 0
        n_test_total = 0
        for words_, left, right, lab in test_trees:
            p = self.session.run(prediction_op,
                                 feed_dict={
                                     words: words_,
                                     left_children: left,
                                     right_children: right,
                                     labels: lab
                                 })
            n_test_correct += (p[-1] == lab[-1])
            n_test_total += 1

        print("i:", i, "cost:", cost,
              "train acc:", float(n_correct) / n_total,
              "test acc:", float(n_test_correct) / n_test_total,
              "time for epoch:", (datetime.now() - t0))
        costs.append(cost)

    print("costs:", costs)
    plt.plot(costs)
    plt.show()
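The while_loop in fit() visits nodes in index order, so here is a hedged sketch of the data layout it appears to expect: each tree is four parallel int32 arrays in which children precede their parent, leaves carry a word index, and internal nodes carry -1. The toy values below are purely illustrative and not from the original.

import numpy as np

# hypothetical toy tree: two leaves (word ids 4 and 7) and one root node;
# children come before their parent, matching the n = 0, 1, 2 loop order
words_ = np.array([4, 7, -1], dtype=np.int32)    # -1 marks an internal node
left = np.array([-1, -1, 0], dtype=np.int32)     # -1 marks a leaf
right = np.array([-1, -1, 1], dtype=np.int32)
labels_ = np.array([1, 0, 1], dtype=np.int32)    # labels_[-1] is the root label

toy_tree = (words_, left, right, labels_)
trees = [toy_tree]    # fit(trees, test_trees, ...) expects lists of such tuples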
def fit(self, X, Y, learning_rate=1e-4, mu=0.99, epochs=20,
        show_fig=True, activation=T.nnet.relu, RecurrentUnit=GRU,
        normalize=False):
    D = self.D
    V = self.V
    N = len(X)

    We = init_weight(V, D)
    self.hidden_layers = []
    Mi = D
    for Mo in self.hidden_layer_sizes:
        ru = RecurrentUnit(Mi, Mo, activation)
        self.hidden_layers.append(ru)
        Mi = Mo

    Wo = init_weight(Mi, self.K)
    bo = np.zeros(self.K)

    self.We = theano.shared(We)
    self.Wo = theano.shared(Wo)
    self.bo = theano.shared(bo)
    self.params = [self.Wo, self.bo]
    for ru in self.hidden_layers:
        self.params += ru.params

    thX = T.ivector('X')
    thY = T.ivector('Y')

    Z = self.We[thX]
    for ru in self.hidden_layers:
        Z = ru.output(Z)
    py_x = T.nnet.softmax(Z.dot(self.Wo) + self.bo)

    # quick sanity check on the output shape
    testf = theano.function(
        inputs=[thX],
        outputs=py_x,
    )
    testout = testf(X[0])
    print("py_x.shape:", testout.shape)

    prediction = T.argmax(py_x, axis=1)

    cost = -T.mean(T.log(py_x[T.arange(thY.shape[0]), thY]))

    # the embedding gets its own momentum update so it can be normalized afterwards
    dWe = theano.shared(self.We.get_value() * 0)
    gWe = T.grad(cost, self.We)
    dWe_update = mu * dWe - learning_rate * gWe
    We_update = self.We + dWe_update
    if normalize:
        We_update /= We_update.norm(2)

    updates = [
        update for param in self.params
        for update in rmsprop_updates(cost, param, learning_rate, mu)
    ] + [(self.We, We_update), (dWe, dWe_update)]

    self.cost_predict_op = theano.function(
        inputs=[thX, thY],
        outputs=[cost, prediction],
        allow_input_downcast=True,
    )
    self.train_op = theano.function(inputs=[thX, thY],
                                    outputs=[cost, prediction],
                                    updates=updates)

    costs = []
    sequence_indexes = list(range(N))
    n_total = sum(len(y) for y in Y)
    for i in range(epochs):
        t0 = datetime.now()
        sequence_indexes = shuffle(sequence_indexes)
        n_correct = 0
        cost = 0
        it = 0
        for j in sequence_indexes:
            c, p = self.train_op(X[j], Y[j])
            cost += c
            n_correct += np.sum(p == Y[j])
            it += 1
            if it % 200 == 0:
                sys.stdout.write(
                    "j/N: %d/%d correct rate so far: %f, cost so far: %f\r" %
                    (it, N, float(n_correct) / n_total, cost))
                sys.stdout.flush()
        print("i:", i + 1, "cost:", cost,
              "correct rate:", (float(n_correct) / n_total),
              "time for epoch:", (datetime.now() - t0))
        costs.append(cost)

    if show_fig:
        plt.plot(costs)
        plt.show()
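rmsprop_updates() is not part of the listing. A minimal sketch, assuming it returns Theano (shared, new value) update pairs for a single parameter and that the fourth argument passed by fit() (mu) is reused as the decay rate of the squared-gradient cache; both assumptions, not taken from the original.

import theano
import theano.tensor as T

def rmsprop_updates(cost, param, learning_rate, decay=0.999, eps=1e-10):
    # keep a running average of the squared gradient per element
    g = T.grad(cost, param)
    cache = theano.shared(param.get_value() * 0)
    new_cache = decay * cache + (1 - decay) * g * g
    new_param = param - learning_rate * g / T.sqrt(new_cache + eps)
    return [(cache, new_cache), (param, new_param)]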
# ================= prepare the parameters and components the model needs ======================
# inputs
inputs = tf.placeholder(
    tf.int32, shape=(None, sequence_length))   # input is a tensor of shape N x T
targets = tf.placeholder(
    tf.int32, shape=(None, sequence_length))   # target is a tensor of shape N x T
num_sample = tf.shape(inputs)[0]               # useful for later

# embedding
We = np.random.randn(V, embedding_dim).astype(np.float32)

# output layer
Wo = init_weight(hidden_layer_size, K).astype(np.float32)
bo = np.zeros(K).astype(np.float32)

# make them tensorflow variables
tfWe = tf.Variable(We)
tfWo = tf.Variable(Wo)
tfbo = tf.Variable(bo)

# make the rnn unit
rnn_unit = GRUCell(num_units=hidden_layer_size, activation=tf.nn.relu)

# ================ model + cost + solver =======================
# get the output from the embedding layer
x = tf.nn.embedding_lookup(tfWe, inputs)   # x is a tensor of shape N x T x M
# convert x from a tensor of shape N x T x M
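# ---- hedged sketch of the continuation (the listing above stops mid-comment) ----
# Assumed next steps following the usual TF1 pattern: unstack the embedded batch
# into T tensors of shape N x M, run them through the GRU cell, project every
# time step onto the K classes, and train with softmax cross-entropy. The
# optimizer choice below is an assumption, not taken from the original.
x = tf.unstack(x, sequence_length, 1)                    # list of T tensors, each N x M
outputs, states = tf.nn.static_rnn(rnn_unit, x, dtype=tf.float32)
outputs = tf.transpose(tf.stack(outputs), (1, 0, 2))     # back to N x T x hidden
outputs = tf.reshape(outputs, (-1, hidden_layer_size))   # (N*T) x hidden
logits = tf.matmul(outputs, tfWo) + tfbo                 # (N*T) x K
predictions = tf.argmax(logits, axis=1)
cost = tf.reduce_mean(
    tf.nn.sparse_softmax_cross_entropy_with_logits(
        logits=logits,
        labels=tf.reshape(targets, (-1,))))
train_op = tf.train.AdamOptimizer(1e-2).minimize(cost)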
def fit(self, X, Y, learning_rate=1e-4, mu=0.99, epochs=30,
        show_fig=True, activation=T.nnet.relu, RecurrentUnit=GRU,
        normalize=False):
    ## ===== first prepare every weight matrix and initial hidden value (h0) the model needs =====
    D = self.D
    V = self.V
    N = len(X)

    We = init_weight(V, D)   # embedding matrix; note this layer has no bias
    self.hidden_layers = []
    Mi = D
    for Mo in self.hidden_layer_size:
        ru = RecurrentUnit(Mi, Mo, activation)
        self.hidden_layers.append(ru)
        Mi = Mo

    Wo = init_weight(Mi, self.K)
    bo = np.zeros(self.K)

    # We is not collected into the same list as the other parameters,
    # because it gets its own weight update
    self.We = theano.shared(We)
    self.Wo = theano.shared(Wo)
    self.bo = theano.shared(bo)
    self.params = [self.Wo, self.bo]
    for ru in self.hidden_layers:
        self.params += ru.params

    thX = T.ivector('X')
    thY = T.ivector('Y')

    ## =========== step 1: model ============================
    Z = self.We[thX]
    for ru in self.hidden_layers:
        Z = ru.output(Z)
    py_x = T.nnet.softmax(Z.dot(self.Wo) + self.bo)

    # this small test function only exists to check what py_x.shape is
    testf = theano.function(
        inputs=[thX],
        outputs=py_x,
    )
    print("py_x.shape:", testf(X[0]).shape)

    prediction = T.argmax(py_x, axis=1)

    ## ========== steps 2, 3: cost and solver ===================
    cost = -T.mean(T.log(py_x[T.arange(thY.shape[0]), thY]))

    # update rule for We
    gWe = T.grad(cost, self.We)
    dWe = theano.shared(self.We.get_value() * 0)
    dWe_update = mu * dWe - learning_rate * gWe
    We_update = self.We + dWe_update
    if normalize:
        We_update /= We_update.norm(2)

    grads = T.grad(cost, self.params)
    dparams = [theano.shared(p.get_value() * 0) for p in self.params]

    # every weight-update rule is collected in this single `updates` list
    updates = [
        (p, p + mu * dp - learning_rate * g)
        for p, dp, g in zip(self.params, dparams, grads)
    ] + [
        (dp, mu * dp - learning_rate * g)
        for dp, g in zip(dparams, grads)
    ] + [(self.We, We_update), (dWe, dWe_update)]

    self.cost_predict_op = theano.function(
        inputs=[thX, thY],
        outputs=[cost, prediction],
        allow_input_downcast=True,
    )
    self.train_op = theano.function(
        inputs=[thX, thY],
        outputs=[cost, prediction],
        updates=updates,
    )

    # =========== training process ==========
    costs = []
    sequence_indexes = list(range(N))
    n_total = sum(len(y) for y in Y)   # used to compute the accuracy
    for i in range(epochs):
        t0 = datetime.now()
        sequence_indexes = shuffle(sequence_indexes)
        n_correct = 0
        cost = 0
        it = 0   # keeps track of which j we are on
        for j in sequence_indexes:
            c, p = self.train_op(X[j], Y[j])
            cost += c
            n_correct += np.sum(p == Y[j])
            it += 1
            if it % 200 == 0:
                sys.stdout.write(
                    "j/N: %d/%d correct rate so far: %f, cost so far: %f\r" %
                    (it, N, float(n_correct) / n_total, cost))
                sys.stdout.flush()
        print("i:", i, "cost:", cost,
              "correct rate:", (float(n_correct) / n_total),
              "time for epoch:", (datetime.now() - t0))
        costs.append(cost)

    if show_fig:
        plt.plot(costs)
        plt.show()
def fit(self, trees, learning_rate=3 * 1e-3, mu=0.99, reg=1e-4, epochs=15,
        activation=T.nnet.relu, train_inner_nodes=False):
    D = self.D
    V = self.V
    K = self.K
    self.f = activation
    N = len(trees)

    We = init_weight(V, D)
    Wh = np.random.randn(2, D, D) / np.sqrt(2 + D + D)
    bh = np.zeros(D)
    Wo = init_weight(D, K)
    bo = np.zeros(K)

    self.We = theano.shared(We)
    self.Wh = theano.shared(Wh)
    self.bh = theano.shared(bh)
    self.Wo = theano.shared(Wo)
    self.bo = theano.shared(bo)
    self.params = [self.We, self.Wh, self.bh, self.Wo, self.bo]

    words = T.ivector('words')
    parents = T.ivector('parents')
    relations = T.ivector('relations')
    labels = T.ivector('labels')

    def recurrence(n, hiddens, words, parents, relations):
        w = words[n]
        # leaf nodes have a word index >= 0; internal nodes get bias + activation
        hiddens = T.switch(
            T.ge(w, 0),
            T.set_subtensor(hiddens[n], self.We[w]),
            T.set_subtensor(hiddens[n], self.f(hiddens[n] + self.bh)))
        r = relations[n]
        p = parents[n]
        # accumulate this node's contribution into its parent
        hiddens = T.switch(
            T.ge(p, 0),
            T.set_subtensor(hiddens[p], hiddens[p] + hiddens[n].dot(self.Wh[r])),
            hiddens)
        return hiddens

    hiddens = T.zeros((words.shape[0], D))

    h, _ = theano.scan(
        fn=recurrence,
        outputs_info=[hiddens],
        n_steps=words.shape[0],
        sequences=T.arange(words.shape[0]),
        non_sequences=[words, parents, relations],
    )

    py_x = T.nnet.softmax(h[-1].dot(self.Wo) + self.bo)
    prediction = T.argmax(py_x, axis=1)

    rcost = reg * T.mean([(p * p).sum() for p in self.params])
    if train_inner_nodes:
        cost = -T.mean(T.log(py_x[T.arange(labels.shape[0]), labels])) + rcost
    else:
        cost = -T.mean(T.log(py_x[-1, labels[-1]])) + rcost

    # plain momentum updates, kept here for reference:
    # grads = T.grad(cost, self.params)
    # dparams = [theano.shared(p.get_value()*0) for p in self.params]
    # updates = [
    #     (p, p + mu*dp - learning_rate*g) for p, dp, g in zip(self.params, dparams, grads)
    # ] + [
    #     (dp, mu*dp - learning_rate*g) for dp, g in zip(dparams, grads)
    # ]
    updates = adagrad(cost, self.params, lr=1e-4)

    self.cost_predict_op = theano.function(
        inputs=[words, parents, relations, labels],
        outputs=[cost, prediction],
        allow_input_downcast=True,
    )
    self.train_op = theano.function(
        inputs=[words, parents, relations, labels],
        outputs=[h, cost, prediction],
        updates=updates)

    costs = []
    sequence_indexes = list(range(N))
    if train_inner_nodes:
        n_total = sum(len(words) for words, _, _, _ in trees)
    else:
        n_total = N
    for i in range(epochs):
        t0 = datetime.now()
        sequence_indexes = shuffle(sequence_indexes)
        n_correct = 0
        cost = 0
        it = 0
        for j in sequence_indexes:
            words_, par, rel, lab = trees[j]
            _, c, p = self.train_op(words_, par, rel, lab)
            cost += c
            if train_inner_nodes:
                n_correct += np.sum(p == lab)
            else:
                n_correct += (p[-1] == lab[-1])
            it += 1
            if it % 1 == 0:
                sys.stdout.write(
                    "j/N: %d/%d correct rate so far: %f, cost so far: %f\r" %
                    (it, N, float(n_correct) / n_total, cost))
                sys.stdout.flush()
        print("i:", i, "cost:", cost,
              "correct rate:", (float(n_correct) / n_total),
              "time for epoch:", (datetime.now() - t0))
        costs.append(cost)

    print('costs:', costs)
    plt.plot(costs)
    plt.show()
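adagrad() is used above but not defined in the listing. A minimal sketch, assuming it returns a list of Theano update pairs implementing AdaGrad (per-parameter step sizes scaled by the accumulated squared gradients); the epsilon and signature details are assumptions.

import theano
import theano.tensor as T

def adagrad(cost, params, lr=1e-4, eps=1e-10):
    grads = T.grad(cost, params)
    updates = []
    for p, g in zip(params, grads):
        acc = theano.shared(p.get_value() * 0)   # accumulated g^2 per element
        new_acc = acc + g * g
        updates.append((acc, new_acc))
        updates.append((p, p - lr * g / T.sqrt(new_acc + eps)))
    return updates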