def model_initializer(self, V, K, sq_length, recurr_unit, nonlin_func,
                      optimizer, optimizer_args, reg):
    """Assemble the tagging graph: embedding, stacked recurrent layers, loss.

    Args:
        V: first dimension of the embedding parameters requested from
            ``self.helper`` (the second is ``self.D``).
        K: second dimension of the output parameters requested from
            ``self.helper``; this is the width of the logits.
        sq_length: fixed sequence length of both placeholders.
        recurr_unit: indexable collection of cell constructors, one per entry
            of ``self.hid_lay_sizes``. Each is called with ``num_units`` and
            ``activation`` keyword arguments.
        nonlin_func: indexable collection of activation specifiers, each
            resolved through ``self.nonlinear``.
        optimizer: forwarded to ``self.optimizer``.
        optimizer_args: forwarded to ``self.optimizer``.
        reg: multiplier applied to the summed L2 penalty.

    Side effects:
        Sets ``self.input_words``, ``self.target_POS``, ``self.hidden_layers``
        (an empty list), ``self.W_embed``, ``self.W_out``, ``self.prediction``,
        ``self.cost`` and ``self.train_op``.
    """
    self.input_words = tf.placeholder(
        tf.int32, shape=(None, sq_length), name="tfX")
    self.target_POS = tf.placeholder(
        tf.int32, shape=(None, sq_length), name="tfT")
    # The batch dimension is left dynamic, so read it from the graph.
    batch_dim = tf.shape(self.input_words)[0]

    self.hidden_layers = []

    # self.helper's result is indexed below: [0] is used as the matrix and,
    # for W_out, [1] is added to the logits (presumably a bias -- TODO confirm).
    self.W_embed = self.helper((V, self.D))
    last_width = self.hid_lay_sizes[-1]
    self.W_out = self.helper((last_width, K))

    embedded = tf.nn.embedding_lookup(self.W_embed[0], self.input_words)
    # Split along axis 1 into sq_length per-time-step tensors.
    layer_io = tf.unstack(embedded, sq_length, 1)

    # Each recurrent layer consumes the previous layer's output sequence.
    for layer_idx, width in enumerate(self.hid_lay_sizes):
        make_cell = recurr_unit[layer_idx]
        activation = self.nonlinear(nonlin_func[layer_idx])
        cell = make_cell(num_units=width, activation=activation)
        layer_io, _ = get_rnn_output(cell, layer_io, dtype=tf.float32)

    # Swap the first two axes (time-major -> batch-major), then flatten so the
    # output layer is a single matmul over all positions.
    batch_major = tf.transpose(layer_io, (1, 0, 2))
    flat = tf.reshape(batch_major, (sq_length * batch_dim, last_width))

    logits = tf.matmul(flat, self.W_out[0]) + self.W_out[1]
    best_class = tf.argmax(logits, axis=1)
    self.prediction = tf.reshape(best_class, (batch_dim, sq_length))

    # L2 penalty over every trainable variable whose name contains neither
    # "noreg" nor "Bias".
    penalties = [
        tf.nn.l2_loss(var)
        for var in tf.trainable_variables()
        if "noreg" not in var.name and "Bias" not in var.name
    ]
    l2_loss = reg * sum(penalties)

    flat_targets = tf.reshape(self.target_POS, [-1])
    cross_entropy = tf.nn.sparse_softmax_cross_entropy_with_logits(
        logits=logits, labels=flat_targets)
    self.cost = tf.reduce_mean(cross_entropy) + l2_loss

    solver = self.optimizer(optimizer, optimizer_args)
    self.train_op = solver.minimize(self.cost)
def forward(self, X):
    """Run ``self.rnn_unit`` over ``X`` and project the last output.

    Only the final element of the outputs returned by ``get_rnn_output`` is
    used; it is multiplied by ``self.Wo`` and offset by ``self.bo``.

    Args:
        X: input passed unchanged to ``get_rnn_output``
            (presumably a sequence of per-step tensors -- TODO confirm).

    Returns:
        The tensor ``matmul(outputs[-1], self.Wo) + self.bo``.
    """
    rnn_outputs, _ = get_rnn_output(self.rnn_unit, X, dtype=tf.float32)
    final_output = rnn_outputs[-1]
    projected = tf.matmul(final_output, self.Wo)
    return projected + self.bo
def fit(self, X, Y, batch_sz=20, learning_rate=0.1, mu=0.9,
        activation=tf.nn.sigmoid, epochs=100, show_fig=False):
    """Build a one-layer RNN graph and train it with momentum SGD.

    The loss is the mean cross-entropy over every time step, while the
    reported classification rate only looks at the last time step of each
    sample.

    Args:
        X: numpy array with three dimensions, unpacked as ``N, T, D``.
        Y: numpy array of integer labels, fed to a ``(batch_sz, T)``
            placeholder; the number of classes is the count of distinct
            values in ``Y``.
        batch_sz: mini-batch size. The placeholders have a fixed batch
            dimension, so trailing samples that do not fill a whole batch
            are not used in an epoch.
        learning_rate: learning rate of the momentum optimizer.
        mu: momentum coefficient.
        activation: activation passed to ``BasicRNNCell``; also stored on
            ``self.f``.
        epochs: maximum number of passes over the data.
        show_fig: when true, plot the per-epoch cost after training.

    Side effects:
        Sets ``self.f``, ``self.Wo`` and ``self.bo``; prints progress; may
        open a matplotlib window.
    """
    N, T, D = X.shape
    K = len(set(Y.flatten()))
    M = self.M
    self.f = activation

    # Only the output layer is created here; the recurrent weights are
    # created by the cell itself.
    self.Wo = tf.Variable(init_weight(M, K).astype(np.float32))
    self.bo = tf.Variable(np.zeros(K, dtype=np.float32))

    tfX = tf.compat.v1.placeholder(
        tf.float32, shape=(batch_sz, T, D), name='inputs')
    tfY = tf.compat.v1.placeholder(
        tf.int64, shape=(batch_sz, T), name='targets')

    # presumably yields T tensors of shape (batch_sz, D) -- TODO confirm
    # against x2sequence
    sequenceX = x2sequence(tfX, T, D, batch_sz)

    rnn_unit = BasicRNNCell(num_units=self.M, activation=self.f)
    outputs, _ = get_rnn_output(rnn_unit, sequenceX, dtype=tf.float32)

    # Swap the first two axes (time-major -> batch-major), then flatten to
    # (T*batch_sz, M) so sample b's last time step is row (b + 1)*T - 1.
    outputs = tf.transpose(a=outputs, perm=(1, 0, 2))
    outputs = tf.reshape(outputs, (T*batch_sz, M))

    # Logits for every time step of every sample.
    logits = tf.matmul(outputs, self.Wo) + self.bo
    predict_op = tf.argmax(input=logits, axis=1)
    targets = tf.reshape(tfY, (T*batch_sz,))

    cost_op = tf.reduce_mean(
        input_tensor=tf.nn.sparse_softmax_cross_entropy_with_logits(
            logits=logits,
            labels=targets
        )
    )
    train_op = tf.compat.v1.train.MomentumOptimizer(
        learning_rate, momentum=mu).minimize(cost_op)

    costs = []
    n_batches = N // batch_sz
    # Number of samples actually scored per epoch. Dividing by N instead
    # would understate the rate, and make the early stop unreachable,
    # whenever N is not a multiple of batch_sz.
    n_scored = n_batches * batch_sz

    init = tf.compat.v1.global_variables_initializer()
    with tf.compat.v1.Session() as session:
        session.run(init)
        for i in range(epochs):
            X, Y = shuffle(X, Y)
            n_correct = 0
            cost = 0
            for j in range(n_batches):
                Xbatch = X[j*batch_sz:(j+1)*batch_sz]
                Ybatch = Y[j*batch_sz:(j+1)*batch_sz]

                _, c, p = session.run(
                    [train_op, cost_op, predict_op],
                    feed_dict={tfX: Xbatch, tfY: Ybatch})
                cost += c

                # p is flat with length T*batch_sz; compare only the last
                # time step of each sample.
                last_step_pred = p.reshape(batch_sz, T)[:, -1]
                n_correct += int(np.sum(last_step_pred == Ybatch[:, -1]))

            # Record the cost before a possible early stop so the final
            # epoch also appears in the plot.
            costs.append(cost)

            rate = float(n_correct) / n_scored if n_scored else 0.0
            solved = n_scored > 0 and n_correct == n_scored
            if i % 10 == 0 or solved:
                print("i:", i, "cost:", cost, "classification rate:", rate)
            if solved:
                break

    if show_fig:
        plt.plot(costs)
        plt.show()
rnn_unit = GRUCell(num_units=hidden_layer_size, activation=tf.nn.relu) # ================ model + cost + solver ======================= # get the output from enbedding layer x = tf.nn.embedding_lookup(tfWe, inputs) # x is a tensor of shape N x T x M # converts x from a tensor of shape N x T x M # into a list of length T, where each element is a tensor of shape N x M # tensorflow的RNN 有個很怪的要求,是輸入tensor型別必須是 T x N x M # 還好tensorflow 有現成的方法來改變 tensor shape x = tf.unstack( x, sequence_length, axis=1 ) # axis=1 (第二個維度) 代表對T dim 做分解 # output x is a tensor of shape T x N x D # get the rnn output output, states = get_rnn_output(rnn_unit, x, dtype=tf.float32) # output are now of size (T, N, M) # so make it (N, T, M) # TODO 這裡可以用unstack嗎?? outputs = tf.transpose(output, (1, 0, 2)) # TODO 確認 transpose/reshape/ unstack 運作邏輯 outputs = tf.reshape( outputs, (num_sample * sequence_length, hidden_layer_size)) # NT x M (這裡詳閱note 1) # final dense layer logits = tf.matmul(outputs, tfWo) + tfbo # NT x K predictions = tf.argmax(logits, axis=1) # (NT, ) predict_op = tf.reshape(predictions, (num_sample, sequence_length)) # N x T labels_flat = tf.reshape(targets, [-1]) # (NT, ) ,這一步的目的是為了後續計算cost,詳閱note2
def train(self, epochs=10, learning_rate=1e-2, mu=0.99, batch_size=32,
          hidden_layer_size=10, embedding_dim=10):
    """Build a GRU sequence tagger, train it with Adam and plot the results.

    Sequences from ``self.Xtrain``/``self.Ytrain``/``self.Xtest``/
    ``self.Ytest`` are padded to the longest input sequence. Accuracy is
    computed only over positions whose label is greater than zero (label 0
    is treated as padding). The loss is a plain mean over all positions,
    padded ones included.

    Args:
        epochs: number of passes over the training data.
        learning_rate: learning rate handed to ``AdamOptimizer``.
        mu: accepted for interface compatibility; not used by this method.
        batch_size: mini-batch size; trailing samples that do not fill a
            whole batch are not used in an epoch.
        hidden_layer_size: number of GRU units.
        embedding_dim: width of the word embedding.

    Side effects:
        Prints progress to stdout and shows a matplotlib figure with the
        per-epoch cost and training accuracy.
    """

    def count_matches(labels, preds):
        """Return (n_correct, n_total) over positions whose label is > 0."""
        mask = labels > 0
        hits = (labels == preds) & mask
        return int(np.sum(hits)), int(np.sum(mask))

    def ratio(correct, total):
        """Return correct/total, or NaN when nothing was counted."""
        return float(correct) / total if total else float("nan")

    # training config
    sequence_length = max(len(x) for x in self.Xtrain + self.Xtest)
    V = self.V
    K = self.K

    # pad sequences to a common length
    Xtrain = pad_sequences(self.Xtrain, maxlen=sequence_length)
    Ytrain = pad_sequences(self.Ytrain, maxlen=sequence_length)
    Xtest = pad_sequences(self.Xtest, maxlen=sequence_length)
    Ytest = pad_sequences(self.Ytest, maxlen=sequence_length)
    print("Xtrain.shape:", Xtrain.shape)
    print("Ytrain.shape:", Ytrain.shape)

    # inputs
    inputs = tf.placeholder(tf.int32, shape=(None, sequence_length))
    targets = tf.placeholder(tf.int32, shape=(None, sequence_length))
    num_samples = tf.shape(inputs)[0]  # dynamic batch size

    # embedding and output layer parameters
    We = np.random.randn(V, embedding_dim).astype(np.float32)
    Wo = init_weight(hidden_layer_size, K).astype(np.float32)
    bo = np.zeros(K).astype(np.float32)
    tfWe = tf.Variable(We)
    tfWo = tf.Variable(Wo)
    tfbo = tf.Variable(bo)

    rnn_unit = GRUCell(num_units=hidden_layer_size, activation=tf.nn.relu)

    # Embedding lookup, then split along axis 1 into sequence_length
    # per-step tensors for the recurrent layer.
    x = tf.nn.embedding_lookup(tfWe, inputs)
    x = tf.unstack(x, sequence_length, 1)

    outputs, _ = get_rnn_output(rnn_unit, x, dtype=tf.float32)

    # Swap the first two axes (time-major -> batch-major), then flatten so
    # the output layer is one matmul over all positions.
    outputs = tf.transpose(outputs, (1, 0, 2))
    outputs = tf.reshape(
        outputs, (sequence_length * num_samples, hidden_layer_size))

    logits = tf.matmul(outputs, tfWo) + tfbo
    predictions = tf.argmax(logits, 1)
    predict_op = tf.reshape(predictions, (num_samples, sequence_length))
    labels_flat = tf.reshape(targets, [-1])

    cost_op = tf.reduce_mean(
        tf.nn.sparse_softmax_cross_entropy_with_logits(
            logits=logits, labels=labels_flat))
    train_op = tf.train.AdamOptimizer(learning_rate).minimize(cost_op)

    accs = []
    costs = []
    n_batches = len(Ytrain) // batch_size

    sess = tf.InteractiveSession()
    try:
        sess.run(tf.global_variables_initializer())

        for i in range(epochs):
            n_total = 0
            n_correct = 0
            t0 = datetime.now()
            Xtrain, Ytrain = shuffle(Xtrain, Ytrain)
            cost = 0

            for j in range(n_batches):
                x_batch = Xtrain[j * batch_size:(j + 1) * batch_size]
                y_batch = Ytrain[j * batch_size:(j + 1) * batch_size]

                # get the cost and predictions, and take one gradient step
                c, p, _ = sess.run(
                    (cost_op, predict_op, train_op),
                    feed_dict={inputs: x_batch, targets: y_batch})
                cost += c

                batch_correct, batch_total = count_matches(y_batch, p)
                n_correct += batch_correct
                n_total += batch_total

                # print progress periodically
                if j % 10 == 0:
                    sys.stdout.write(
                        "j/N: %d/%d correct rate so far: %f, cost so far: %f\r"
                        % (j, n_batches, ratio(n_correct, n_total), cost))
                    sys.stdout.flush()

            # test accuracy for this epoch
            p = sess.run(predict_op,
                         feed_dict={inputs: Xtest, targets: Ytest})
            n_test_correct, n_test_total = count_matches(Ytest, p)
            test_acc = ratio(n_test_correct, n_test_total)
            train_acc = ratio(n_correct, n_total)

            print("i:", i, "cost:", "%.4f" % cost, "train acc:",
                  "%.4f" % train_acc, "test acc:",
                  "%.4f" % test_acc, "time for epoch:",
                  (datetime.now() - t0))
            accs.append(train_acc)
            costs.append(cost)
    finally:
        # Release the session's resources even if training fails.
        sess.close()

    _, plt_arr = plt.subplots(2, sharex=True)
    plt_arr[0].plot(costs)
    plt_arr[0].set_title('costs')
    plt_arr[1].plot(accs)
    plt_arr[1].set_title('acc')
    plt.show()
def fit(self, X, Y, batch_sz=20, learning_rate=0.1, mu=0.9,
        activation=tf.nn.sigmoid, epochs=100, show_fig=False):
    """Build a one-layer RNN graph and train it with momentum SGD.

    The loss is the mean cross-entropy over every time step, while the
    reported classification rate only looks at the last time step of each
    sample.

    Args:
        X: numpy array with three dimensions, unpacked as ``N, T, D``.
        Y: numpy array of integer labels, fed to a ``(batch_sz, T)``
            placeholder; the number of classes is the count of distinct
            values in ``Y``.
        batch_sz: mini-batch size. The placeholders have a fixed batch
            dimension, so trailing samples that do not fill a whole batch
            are not used in an epoch.
        learning_rate: learning rate of the momentum optimizer.
        mu: momentum coefficient.
        activation: activation passed to ``BasicRNNCell``; also stored on
            ``self.f``.
        epochs: maximum number of passes over the data.
        show_fig: when true, plot the per-epoch cost after training.

    Side effects:
        Sets ``self.f``, ``self.Wo`` and ``self.bo``; prints progress; may
        open a matplotlib window.
    """
    N, T, D = X.shape
    K = len(set(Y.flatten()))
    M = self.M
    self.f = activation

    # Only the output layer is created here; the recurrent weights are
    # created by the cell itself.
    self.Wo = tf.Variable(init_weight(M, K).astype(np.float32))
    self.bo = tf.Variable(np.zeros(K, dtype=np.float32))

    tfX = tf.placeholder(tf.float32, shape=(batch_sz, T, D), name='inputs')
    tfY = tf.placeholder(tf.int64, shape=(batch_sz, T), name='targets')

    # presumably yields T tensors of shape (batch_sz, D) -- TODO confirm
    # against x2sequence
    sequenceX = x2sequence(tfX, T, D, batch_sz)

    rnn_unit = BasicRNNCell(num_units=self.M, activation=self.f)
    outputs, _ = get_rnn_output(rnn_unit, sequenceX, dtype=tf.float32)

    # Swap the first two axes (time-major -> batch-major), then flatten to
    # (T*batch_sz, M) so sample b's last time step is row (b + 1)*T - 1.
    outputs = tf.transpose(outputs, (1, 0, 2))
    outputs = tf.reshape(outputs, (T*batch_sz, M))

    # Logits for every time step of every sample.
    logits = tf.matmul(outputs, self.Wo) + self.bo
    predict_op = tf.argmax(logits, 1)
    targets = tf.reshape(tfY, (T*batch_sz,))

    cost_op = tf.reduce_mean(
        tf.nn.sparse_softmax_cross_entropy_with_logits(
            logits=logits,
            labels=targets
        )
    )
    train_op = tf.train.MomentumOptimizer(
        learning_rate, momentum=mu).minimize(cost_op)

    costs = []
    n_batches = N // batch_sz
    # Number of samples actually scored per epoch. Dividing by N instead
    # would understate the rate, and make the early stop unreachable,
    # whenever N is not a multiple of batch_sz.
    n_scored = n_batches * batch_sz

    init = tf.global_variables_initializer()
    with tf.Session() as session:
        session.run(init)
        for i in range(epochs):
            X, Y = shuffle(X, Y)
            n_correct = 0
            cost = 0
            for j in range(n_batches):
                Xbatch = X[j*batch_sz:(j+1)*batch_sz]
                Ybatch = Y[j*batch_sz:(j+1)*batch_sz]

                _, c, p = session.run(
                    [train_op, cost_op, predict_op],
                    feed_dict={tfX: Xbatch, tfY: Ybatch})
                cost += c

                # p is flat with length T*batch_sz; compare only the last
                # time step of each sample.
                last_step_pred = p.reshape(batch_sz, T)[:, -1]
                n_correct += int(np.sum(last_step_pred == Ybatch[:, -1]))

            # Record the cost before a possible early stop so the final
            # epoch also appears in the plot.
            costs.append(cost)

            rate = float(n_correct) / n_scored if n_scored else 0.0
            solved = n_scored > 0 and n_correct == n_scored
            if i % 10 == 0 or solved:
                print("i:", i, "cost:", cost, "classification rate:", rate)
            if solved:
                break

    if show_fig:
        plt.plot(costs)
        plt.show()
# Output-layer parameters: weights from init_weight, zero-initialised bias.
W0 = init_weight(hidden_unit_size, class_num).astype(np.float32)
b0 = np.zeros(class_num, dtype=np.float32)
tfW0 = tf.Variable(W0)
tfb0 = tf.Variable(b0)

# Constrained by the shape of X, tfX has to look like this, which is what
# forces the series of dimension conversions below.
tfX = tf.placeholder(tf.float32,
                     shape=(batch_size, bit_len, D),
                     name='inputs')
tfY = tf.placeholder(tf.int32, shape=(batch_size, bit_len), name='outputs')

# Convert tfX into a sequence: bit_len lists, each holding batch_size x D.
sequenceX = x2sequence(tfX, batch_size, bit_len, D)

rnn_units = BasicRNNCell(num_units=hidden_unit_size,
                         activation=tf.nn.sigmoid)

# outputs has the same layout as sequenceX: bit_len x batch_size x D,
# i.e. bit_len two-dimensional tensors.
# NOTE(review): the reshape below uses hidden_unit_size as the last
# dimension, so the last axis here is presumably hidden_unit_size rather
# than D -- confirm.
outputs_, states = get_rnn_output(rnn_units, sequenceX, dtype=tf.float32)

# Swap the first two axes, then flatten to two dimensions for the matmul.
outputs = tf.transpose(outputs_, perm=(1, 0, 2))
outputs = tf.reshape(outputs,
                     shape=(bit_len * batch_size, hidden_unit_size))

logits = tf.matmul(outputs, tfW0) + tfb0
predict = tf.argmax(logits, axis=1)
targets = tf.reshape(tfY, shape=(bit_len * batch_size, ))

# loss function
loss = tf.reduce_mean(
    tf.nn.sparse_softmax_cross_entropy_with_logits(labels=targets,
                                                   logits=logits))

# optimization algorithm
train_optimize = tf.train.MomentumOptimizer(learning_rate=learning_rate,
                                            momentum=momentum)
train_setp = train_optimize.minimize(loss)
def fit(self, X, Y, batch_sz=20, learning_rate=0.1, mu=0.9,
        activation=tf.nn.sigmoid, epochs=100, show_fig=False):
    """Build a one-layer RNN graph and train it with momentum SGD.

    The loss is the mean cross-entropy over every time step, while the
    reported classification rate only looks at the last time step of each
    sample.

    Args:
        X: numpy array with three dimensions, unpacked as ``N, T, D``.
        Y: numpy array of integer labels, fed to a ``(batch_sz, T)``
            placeholder; the number of classes is the count of distinct
            values in ``Y``.
        batch_sz: mini-batch size. The placeholders have a fixed batch
            dimension, so trailing samples that do not fill a whole batch
            are not used in an epoch.
        learning_rate: learning rate of the momentum optimizer.
        mu: momentum coefficient.
        activation: activation passed to ``BasicRNNCell``; also stored on
            ``self.f``.
        epochs: maximum number of passes over the data.
        show_fig: when true, plot the per-epoch cost after training.

    Side effects:
        Sets ``self.f``, ``self.Wo`` and ``self.bo``; prints progress; may
        open a matplotlib window.
    """
    N, T, D = X.shape
    K = len(set(Y.flatten()))
    M = self.M
    self.f = activation

    # Output-layer parameters come from HiddenLayer; element 0 is used as
    # the weight matrix and element 1 is added to the logits.
    hidden_layer = HiddenLayer(M, K)
    params = hidden_layer.get_hidden_layer_params()
    self.Wo = params[0]
    self.bo = params[1]

    tfX = tf.placeholder(tf.float32, shape=(batch_sz, T, D), name='inputs')
    tfY = tf.placeholder(tf.int64, shape=(batch_sz, T), name='targets')

    # presumably yields T tensors of shape (batch_sz, D) -- TODO confirm
    # against x2sequence
    sequenceX = x2sequence(tfX, T, D, batch_sz)

    rnn_unit = BasicRNNCell(num_units=self.M, activation=self.f)
    outputs, _ = get_rnn_output(rnn_unit, sequenceX, dtype=tf.float32)

    # Swap the first two axes (time-major -> batch-major), then flatten to
    # (T*batch_sz, M) so sample b's last time step is row (b + 1)*T - 1.
    outputs = tf.transpose(outputs, (1, 0, 2))
    outputs = tf.reshape(outputs, (T * batch_sz, M))

    # Logits for every time step of every sample.
    logits = tf.matmul(outputs, self.Wo) + self.bo
    predict_op = tf.argmax(logits, axis=1)
    targets = tf.reshape(tfY, (T * batch_sz, ))

    # mean cross-entropy over all time steps
    cost_op = tf.reduce_mean(
        tf.nn.sparse_softmax_cross_entropy_with_logits(
            logits=logits, labels=targets))
    train_op = tf.train.MomentumOptimizer(
        learning_rate, momentum=mu).minimize(cost_op)

    costs = []
    n_batches = N // batch_sz
    # Number of samples actually scored per epoch. Dividing by N instead
    # would understate the rate, and make the early stop unreachable,
    # whenever N is not a multiple of batch_sz.
    n_scored = n_batches * batch_sz

    init = tf.global_variables_initializer()
    with tf.Session() as session:
        session.run(init)
        for i in range(epochs):
            X, Y = shuffle(X, Y)
            n_correct = 0
            cost = 0
            for j in range(n_batches):
                Xbatch = X[j * batch_sz:(j + 1) * batch_sz]
                Ybatch = Y[j * batch_sz:(j + 1) * batch_sz]

                _, c, p = session.run(
                    [train_op, cost_op, predict_op],
                    feed_dict={tfX: Xbatch, tfY: Ybatch})
                cost += c

                # p is flat with length T*batch_sz; compare only the last
                # time step of each sample.
                last_step_pred = p.reshape(batch_sz, T)[:, -1]
                n_correct += int(np.sum(last_step_pred == Ybatch[:, -1]))

            # Record the cost before a possible early stop so the final
            # epoch also appears in the plot.
            costs.append(cost)

            rate = float(n_correct) / n_scored if n_scored else 0.0
            solved = n_scored > 0 and n_correct == n_scored
            if i % 10 == 0 or solved:
                print("i:", i, "cost:", cost, "classification rate:", rate)
            if solved:
                break

    if show_fig:
        plt.plot(costs)
        plt.show()
# make them tensorflow variables tfWe = tf.Variable(We) tfWo = tf.Variable(Wo) tfbo = tf.Variable(bo) # make the rnn unit rnn_unit = GRUCell(num_units=hidden_layer_size, activation=tf.nn.relu) # get the output x = tf.nn.embedding_lookup(tfWe, inputs) # (N, T, D) # converts x from a tensor of shape (N, T, D) # into a list of length T, where each element is a tensor of shape (N, D) x = tf.unstack(x, sequence_length, 1) # () # get the rnn output outputs, states = get_rnn_output(rnn_unit, x, dtype=tf.float32) # (T, N, M) # outputs are now of size (T, N, M) # so make it (N, T, M) outputs = tf.transpose(outputs, (1, 0, 2)) # (N, T, M) outputs = tf.reshape(outputs, (num_samples * sequence_length, hidden_layer_size)) # (NT, M) # final dense layer logits = tf.matmul(outputs, tfWo) + tfbo # (NT, K) predictions = tf.argmax(logits, 1) # (NT, ) predict_op = tf.reshape(predictions, (num_samples, sequence_length)) # (N, T) labels_flat = tf.reshape(targets, [-1]) # flattens shape into 1-D: (N, T, 1) --> (NT, ) loss_op = tf.reduce_mean( tf.nn.sparse_softmax_cross_entropy_with_logits( logits=logits,
tfWo = tf.Variable(Wo) tfbo = tf.Variable(bo) # make the rnn unit rnn_unit = GRUCell(num_units=hidden_layer_size, activation=tf.nn.relu) # get the output x = tf.nn.embedding_lookup(tfWe, inputs) # converts x from a tensor of shape N x T x M # into a list of length T, where each element is a tensor of shape N x M x = tf.unstack(x, sequence_length, 1) # get the rnn output outputs, states = get_rnn_output(rnn_unit, x, dtype=tf.float32) # outputs are now of size (T, N, M) # so make it (N, T, M) outputs = tf.transpose(outputs, (1, 0, 2)) outputs = tf.reshape(outputs, (sequence_length*num_samples, hidden_layer_size)) # NT x M # final dense layer logits = tf.matmul(outputs, tfWo) + tfbo # NT x K predictions = tf.argmax(logits, 1) predict_op = tf.reshape(predictions, (num_samples, sequence_length)) labels_flat = tf.reshape(targets, [-1]) cost_op = tf.reduce_mean( tf.nn.sparse_softmax_cross_entropy_with_logits(
tfWo = tf.Variable(Wo) tfbo = tf.Variable(bo) # set up the rnn unit rnn_unit = GRUCell(num_units=hidden_layer_size, activation=tf.nn.relu) rnn_unit_dropout = DropoutWrapper(rnn_unit, output_keep_prob=keep_prob) # get the output x = tf.nn.embedding_lookup(tfWe, inputs) # convert x from a tensorof shape N x T x D # into a list of length T, where each element is a tensor of shape N x D x = tf.unstack(x, sequence_length, 1) # get the rnn output outputs, states = get_rnn_output(rnn_unit_dropout, x, dtype=tf.float32) # outputs are now of size (T, N, M) # so make it (N, T, M) outputs = tf.transpose(outputs, (1, 0, 2)) outputs = tf.reshape( outputs, (sequence_length * num_samples, hidden_layer_size)) # NT x M # Linear activation, using rnn inner loop last output logits = tf.matmul(outputs, tfWo) + tfbo # NT x K predictions = tf.argmax(logits, 1) predict_op = tf.reshape(predictions, (num_samples, sequence_length)) labels_flat = tf.reshape(targets, [-1]) cost_op = tf.reduce_mean( tf.nn.sparse_softmax_cross_entropy_with_logits(logits=logits,