def testSequenceLoss(self):
    with self.test_session() as sess:
        logits = [
            constant_op.constant(i + 0.5, shape=[2, 5]) for i in range(3)
        ]
        targets = [
            constant_op.constant(i, dtypes.int32, shape=[2]) for i in range(3)
        ]
        weights = [constant_op.constant(1.0, shape=[2]) for i in range(3)]

        average_loss_per_example = seq2seq_lib.sequence_loss(
            logits, targets, weights,
            average_across_timesteps=True,
            average_across_batch=True)
        res = sess.run(average_loss_per_example)
        self.assertAllClose(1.60944, res)

        average_loss_per_sequence = seq2seq_lib.sequence_loss(
            logits, targets, weights,
            average_across_timesteps=False,
            average_across_batch=True)
        res = sess.run(average_loss_per_sequence)
        self.assertAllClose(4.828314, res)

        total_loss = seq2seq_lib.sequence_loss(
            logits, targets, weights,
            average_across_timesteps=False,
            average_across_batch=False)
        res = sess.run(total_loss)
        self.assertAllClose(9.656628, res)
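The asserted constants follow from the uniform softmax: each logit row is constant across the 5 symbols, so every step contributes ln(5) of cross-entropy. A quick standard-library check:

# Hand check of the constants asserted above (standard library only).
import math

per_step = math.log(5)     # uniform softmax over 5 symbols
print(per_step)            # 1.6094379... -> average_loss_per_example
print(3 * per_step)        # 4.8283137... -> average_loss_per_sequence (3 steps)
print(2 * 3 * per_step)    # 9.6566274... -> total_loss (3 steps x batch of 2)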
def get_model(feed_previous=False):
    learning_rate = tf.Variable(float(init_learning_rate), trainable=False,
                                dtype=tf.float32)
    learning_rate_decay_op = learning_rate.assign(learning_rate * 0.9)

    encoder_inputs = []
    decoder_inputs = []
    target_weights = []
    for i in range(input_seq_len):
        encoder_inputs.append(
            tf.placeholder(tf.int32, shape=[None], name="encoder{0}".format(i)))
    for i in range(output_seq_len + 1):
        decoder_inputs.append(
            tf.placeholder(tf.int32, shape=[None], name="decoder{0}".format(i)))
    for i in range(output_seq_len):
        target_weights.append(
            tf.placeholder(tf.float32, shape=[None], name="weight{0}".format(i)))

    targets = [decoder_inputs[i + 1] for i in range(output_seq_len)]
    cell = tf.contrib.rnn.BasicLSTMCell(size)

    outputs, _ = seq2seq.embedding_attention_seq2seq(
        encoder_inputs,
        decoder_inputs[:output_seq_len],
        cell,
        num_encoder_symbols=num_encoder_symbols,
        num_decoder_symbols=num_decoder_symbols,
        embedding_size=size,
        output_projection=None,
        feed_previous=feed_previous,
        dtype=tf.float32)

    loss = seq2seq.sequence_loss(outputs, targets, target_weights)
    opt = tf.train.GradientDescentOptimizer(learning_rate)
    update = opt.apply_gradients(opt.compute_gradients(loss))
    saver = tf.train.Saver(tf.global_variables())
    return (encoder_inputs, decoder_inputs, target_weights, outputs, loss,
            update, saver, learning_rate_decay_op, learning_rate)
def get_model(feed_previous=False): """构造模型 """ learning_rate = tf.Variable(float(init_learning_rate), trainable=False, dtype=tf.float32) learning_rate_decay_op = learning_rate.assign(learning_rate * 0.9) encoder_inputs = [] decoder_inputs = [] target_weights = [] for i in range(input_seq_len): encoder_inputs.append( tf.placeholder(tf.int32, shape=[None], name="encoder{0}".format(i))) for i in range(output_seq_len + 1): decoder_inputs.append( tf.placeholder(tf.int32, shape=[None], name="decoder{0}".format(i))) for i in range(output_seq_len): target_weights.append( tf.placeholder(tf.float32, shape=[None], name="weight{0}".format(i))) # decoder_inputs左移一个时序作为targets targets = [decoder_inputs[i + 1] for i in range(output_seq_len)] # cell = tf.contrib.rnn.BasicLSTMCell(size) dropout = 1 num_layers = 3 cell = tf.contrib.rnn.BasicLSTMCell(size) cell = tf.contrib.rnn.DropoutWrapper(cell, output_keep_prob=dropout) cell = tf.contrib.rnn.MultiRNNCell([cell] * num_layers) # 纵向上有两个LSTM # 这里输出的状态我们不需要 outputs, _ = seq2seq.embedding_attention_seq2seq( encoder_inputs, decoder_inputs[:output_seq_len], cell, num_encoder_symbols=num_encoder_symbols, num_decoder_symbols=num_decoder_symbols, embedding_size=size, output_projection=None, feed_previous=feed_previous, dtype=tf.float32) # 计算加权交叉熵损失 loss = seq2seq.sequence_loss(outputs, targets, target_weights) # 梯度下降优化器 opt = tf.train.AdamOptimizer(learning_rate).minimize(loss) # 优化目标:让loss最小化 # update = opt.apply_gradients(opt.compute_gradients(loss)) # 模型持久化 saver = tf.train.Saver(tf.global_variables()) return encoder_inputs, decoder_inputs, target_weights, outputs, loss, opt, saver, learning_rate_decay_op, learning_rate
def get_model(feed_previous=False): """ 构造模型:seq2seq feed_previous表示decoder_inputs是我们直接提供训练数据的输入, 还是用前一个RNNCell的输出映射出来的,如果feed_previous为True, 那么就是用前一个RNNCell的输出,并经过Wx+b线性变换成 """ learning_rate = tf.Variable(float(init_learning_rate), trainable=False, dtype=tf.float32) learning_rate_decay_op = learning_rate.assign(learning_rate * 0.9) encoder_inputs = [] decoder_inputs = [] target_weights = [] for i in range(input_seq_len): encoder_inputs.append( tf.placeholder(tf.int32, shape=[None], name="encoder{0}".format(i))) for i in range(output_seq_len + 1): decoder_inputs.append( tf.placeholder(tf.int32, shape=[None], name="decoder{0}".format(i))) for i in range(output_seq_len): target_weights.append( tf.placeholder(tf.float32, shape=[None], name="weight{0}".format(i))) # decoder_inputs左移一个时序作为targets targets = [decoder_inputs[i + 1] for i in range(output_seq_len)] cell = tf.contrib.rnn.BasicLSTMCell(size) # 这里输出的状态我们不需要 outputs, _ = seq2seq.embedding_attention_seq2seq( encoder_inputs, decoder_inputs[:output_seq_len], cell, num_encoder_symbols=num_encoder_symbols, num_decoder_symbols=num_decoder_symbols, embedding_size=size, output_projection=None, feed_previous=feed_previous, dtype=tf.float32) # 计算加权交叉熵损失 loss = seq2seq.sequence_loss(outputs, targets, target_weights) # 梯度下降优化器 opt = tf.train.GradientDescentOptimizer(learning_rate) # 优化目标:让loss最小化 update = opt.apply_gradients(opt.compute_gradients(loss)) # 模型持久化 saver = tf.train.Saver(tf.global_variables()) return encoder_inputs, decoder_inputs, target_weights, outputs, loss, update, saver, learning_rate_decay_op, learning_rate
def ForwardBackward(enc_inp, dec_inp, feed_previous):
    scope_name = "fp_{}".format(feed_previous)
    with variable_scope.variable_scope(scope_name):
        dec_op, _ = seq2seq(enc_inp, dec_inp, feed_previous=feed_previous)
        net_variables = ops.get_collection(ops.GraphKeys.GLOBAL_VARIABLES,
                                           scope_name)
    optimizer = adam.AdamOptimizer(0.03, epsilon=1e-5)
    update_op = optimizer.minimize(
        seq2seq_lib.sequence_loss(dec_op, targets, weights),
        var_list=net_variables)
    return dec_op, update_op, net_variables
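A usage sketch for this helper, mirroring how the enclosing test presumably calls it (enc_inp, dec_inp, targets, and weights are built earlier in that test):

# Hypothetical driver: build two independent copies of the network, one per
# feed_previous mode, each under its own variable scope ("fp_True"/"fp_False").
dec_op_fp_true, update_fp_true, variables_fp_true = ForwardBackward(
    enc_inp, dec_inp, feed_previous=True)
dec_op_fp_false, update_fp_false, variables_fp_false = ForwardBackward(
    enc_inp, dec_inp, feed_previous=False)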
def get_model(feed_previous=False):
    learning_rate = tf.Variable(float(init_learning_rate), trainable=False,
                                dtype=tf.float32)
    learning_rate_decay_op = learning_rate.assign(learning_rate * 0.9)

    encoder_inputs = []
    decoder_inputs = []
    target_weights = []
    for i in range(input_seq_len):
        encoder_inputs.append(
            tf.compat.v1.placeholder(tf.int32, shape=[None],
                                     name="encoder{0}".format(i)))
    for i in range(output_seq_len + 1):
        decoder_inputs.append(
            tf.compat.v1.placeholder(tf.int32, shape=[None],
                                     name="decoder{0}".format(i)))
    for i in range(output_seq_len):
        target_weights.append(
            tf.compat.v1.placeholder(tf.float32, shape=[None],
                                     name="weight{0}".format(i)))

    # Shift decoder_inputs left by one timestep to form the targets.
    targets = [decoder_inputs[i + 1] for i in range(output_seq_len)]
    cell = tf.contrib.rnn.BasicLSTMCell(size)

    # We don't need the final state returned here.
    outputs, _ = seq2seq.embedding_attention_seq2seq(
        encoder_inputs,
        decoder_inputs[:output_seq_len],
        cell,
        num_encoder_symbols=num_encoder_symbols,
        num_decoder_symbols=num_decoder_symbols,
        embedding_size=size,
        # output_projection is a (W, B) tuple: W is a weight matrix of shape
        # [output_size x num_decoder_symbols], B is a bias vector of shape
        # [num_decoder_symbols].
        output_projection=None,
        feed_previous=feed_previous,
        dtype=tf.float32)

    # Cross-entropy loss over the sequence.
    loss = seq2seq.sequence_loss(outputs, targets, target_weights)
    # Gradient descent optimizer; objective: minimize the loss.
    opt = tf.compat.v1.train.GradientDescentOptimizer(learning_rate)
    update = opt.apply_gradients(opt.compute_gradients(loss))
    # Model checkpointing: save all variables.
    saver = tf.compat.v1.train.Saver(tf.compat.v1.global_variables())
    return (encoder_inputs, decoder_inputs, target_weights, outputs, loss,
            update, saver, learning_rate_decay_op, learning_rate)
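Where a projection is actually wanted (e.g., for a sampled softmax), the (W, B) tuple described in the comment above could be built as follows; the variable names are illustrative assumptions:

# Sketch of an explicit output_projection, per the comment above (names are
# assumptions). W: [output_size, num_decoder_symbols]; B: [num_decoder_symbols].
w = tf.get_variable("proj_w", [size, num_decoder_symbols], dtype=tf.float32)
b = tf.get_variable("proj_b", [num_decoder_symbols], dtype=tf.float32)
output_projection = (w, b)
# With a projection, the decoder emits [batch, output_size] tensors; logits are
# recovered as tf.matmul(output, w) + b before computing the loss.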
def get_model():
    # embedding_attention_seq2seq takes: the encoder input tensors, the decoder
    # input tensors, the cell, the vocabulary sizes for each side
    # (num_encoder_symbols is the encoder-side vocabulary, used for the input
    # embedding), the embedding size (which should match the cell size), an
    # optional output projection, whether decoder inputs are fed from the
    # previous RNN cell's output or supplied by us, and finally the dtype.
    '''
    embedding_attention_seq2seq(
        encoder_inputs,
        decoder_inputs,
        cell,
        num_encoder_symbols,
        num_decoder_symbols,
        embedding_size,
        num_heads=1,
        output_projection=None,
        feed_previous=False,
        dtype=None,
        scope=None,
        initial_state_attention=False
    )
    '''
    encoder_inputs = []
    decoder_inputs = []
    targets_weigh = []
    for i in range(input_seq_len):
        encoder_inputs.append(
            tf.placeholder(shape=[None], dtype=tf.int32,
                           name="encoder{0}".format(i)))
    for i in range(output_seq_len):
        decoder_inputs.append(
            tf.placeholder(shape=[None], dtype=tf.int32,
                           name="decode{0}".format(i)))
    for i in range(output_seq_len):
        targets_weigh.append(
            tf.placeholder(shape=[None], dtype=tf.float32,
                           name="weight{0}".format(i)))

    # Targets are decoder_inputs shifted left by one step; the final target is
    # zero-padded (note: this hard-codes a batch size of 2).
    targets = [decoder_inputs[i] for i in range(1, output_seq_len)]
    targets.append(np.zeros(shape=[2], dtype=np.int32))

    cell = tf.nn.rnn_cell.BasicLSTMCell(size)
    outputs, _ = seq2seq.embedding_attention_seq2seq(
        encoder_inputs,
        decoder_inputs,
        cell,
        num_encoder_symbols=num_encoder_symbols,
        num_decoder_symbols=num_decoder_symbols,
        embedding_size=size,
        output_projection=None,
        feed_previous=False,
        dtype=tf.float32)
    loss = seq2seq.sequence_loss(outputs, targets, targets_weigh)
    opt = tf.train.GradientDescentOptimizer(learning_rate=learning_rate)
    update = opt.apply_gradients(opt.compute_gradients(loss))
    saver = tf.train.Saver(tf.global_variables())
    return encoder_inputs, decoder_inputs, targets_weigh, outputs, loss, update, saver
def sequence_loss(self, y_pred, y_true):
    '''
    Loss function for the seq2seq RNN. Reshape predicted and true (label)
    tensors, generate dummy weights, then use seq2seq.sequence_loss to
    actually compute the loss function.
    '''
    # list of [-1, num_decoder_symbols] elements
    logits = tf.unstack(y_pred, axis=1)
    # y_true has shape [-1, self.out_seq_len]; unpack to a list of
    # self.out_seq_len [-1] elements
    targets = tf.unstack(y_true, axis=1)
    weights = [tf.ones_like(yp, dtype=tf.float32) for yp in targets]
    sl = seq2seq.sequence_loss(logits, targets, weights)
    return sl
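A shape sketch for this wrapper, with illustrative sizes: with all-zero logits the softmax is uniform, so the loss comes out to ln(10) ≈ 2.3026:

# Shape sketch (illustrative sizes, not from the source).
y_pred = tf.zeros([4, 3, 10])               # [batch, out_seq_len, num_decoder_symbols]
y_true = tf.zeros([4, 3], dtype=tf.int32)   # [batch, out_seq_len]
# unstack(axis=1) yields 3 logits tensors of [4, 10] and 3 target tensors of
# [4]; with uniform (all-zero) logits, sequence_loss evaluates to ln(10).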
def get_model(feed_previous=False): """构造模型 """ learning_rate = tf.Variable(float(init_learning_rate), trainable=False, dtype=tf.float32) learning_rate_decay_op = learning_rate.assign(learning_rate * 0.99) encoder_inputs = [] decoder_inputs = [] target_weights = [] for i in range(input_seq_len): encoder_inputs.append( tf.placeholder(tf.int32, shape=[None], name="encoder{0}".format(i))) for i in range(output_seq_len + 1): decoder_inputs.append( tf.placeholder(tf.int32, shape=[None], name="decoder{0}".format(i))) for i in range(output_seq_len): target_weights.append( tf.placeholder(tf.float32, shape=[None], name="weight{0}".format(i))) # decoder_inputs左移一个时序作为targets targets = [decoder_inputs[i + 1] for i in range(output_seq_len)] cell = tf.contrib.rnn.LSTMCell(size) # 这里输出的状态我们不需要 outputs, _ = seq2seq.embedding_attention_seq2seq( encoder_inputs, decoder_inputs[:output_seq_len], cell, num_encoder_symbols=num_encoder_symbols, num_decoder_symbols=num_decoder_symbols, embedding_size=size, output_projection=None, feed_previous=feed_previous, dtype=tf.float32) # 计算加权交叉熵损失 loss = seq2seq.sequence_loss(outputs, targets, target_weights) # 梯度下降优化器 update = tf.train.AdamOptimizer(learning_rate).minimize(loss) # 模型持久化 saver = tf.train.Saver(tf.global_variables()) return encoder_inputs, decoder_inputs, target_weights, outputs, loss, update, saver, learning_rate_decay_op, learning_rate
def add_loss_op(self, output): """将损失添加到目标函数上面 Hint:使用tensorflow.python.ops.seq2seq.sequence_loss 来实现序列损失 参数: 输出:一个张量 大小是 (None,self.vocab) 返回: 损失:一个0-d大小的张量 """ all_ones = [tf.ones([self.config.batch_size * self.config.num_steps])] cross_entropy = sequence_loss( [output], [tf.reshape(self.labels_placeholder, [-1])], all_ones, len(self.vocab)) tf.add_to_collection('total_loss', cross_entropy) loss = tf.add_n(tf.get_collection('total_loss')) return loss
def add_loss_op(self, output): """Adds loss ops to the computational graph. Hint: Use tensorflow.python.ops.seq2seq.sequence_loss to implement sequence loss. Args: output: A tensor of shape (None, self.vocab) Returns: loss: A 0-d tensor (scalar) """ ### YOUR CODE HERE ones = tf.ones([self.config.batch_size * self.config.num_steps]) loss = sequence_loss([output], [tf.reshape(self.labels_placeholder, [-1])], [ones]) ### END YOUR CODE return loss
def get_model(feed_previous=False): """构造模型 """ encoder_inputs = [] decoder_inputs = [] target_weights = [] for i in xrange(input_seq_len): encoder_inputs.append( tf.placeholder(tf.int32, shape=[None], name="encoder{0}".format(i))) for i in xrange(output_seq_len + 1): decoder_inputs.append( tf.placeholder(tf.int32, shape=[None], name="decoder{0}".format(i))) for i in xrange(output_seq_len): target_weights.append( tf.placeholder(tf.float32, shape=[None], name="weight{0}".format(i))) # decoder_inputs左移一个时序作为targets targets = [decoder_inputs[i + 1] for i in xrange(output_seq_len)] cell = tf.contrib.rnn.BasicLSTMCell(size) # 这里输出的状态我们不需要 outputs, _ = seq2seq.embedding_attention_seq2seq( encoder_inputs, decoder_inputs[:output_seq_len], cell, num_encoder_symbols=num_encoder_symbols, num_decoder_symbols=num_decoder_symbols, embedding_size=size, output_projection=None, feed_previous=feed_previous, dtype=tf.float32) # 计算加权交叉熵损失 loss = seq2seq.sequence_loss(outputs, targets, target_weights) # 梯度下降优化器 opt = tf.train.GradientDescentOptimizer(learning_rate) # 优化目标:让loss最小化 update = opt.apply_gradients(opt.compute_gradients(loss)) # 模型持久化 saver = tf.train.Saver(tf.global_variables()) return encoder_inputs, decoder_inputs, target_weights, outputs, loss, update, saver
def add_loss_op(self, output): """Adds loss ops to the computational graph. Hint: Use tensorflow.python.ops.seq2seq.sequence_loss to implement sequence loss. Args: output: A tensor of shape (None, self.vocab) Returns: loss: A 0-d tensor (scalar) """ all_ones = [tf.ones([self.config.batch_size * self.config.num_steps])] # [(640,1)] targets = [tf.reshape(self.labels_placeholder, [-1])] cross_entropy = sequence_loss(logits=[output], # [(640,10000)] targets=targets, # [(640,1)] weights=all_ones) # [(640,1)] tf.add_to_collection('total_loss', cross_entropy) loss = tf.add_n(tf.get_collection('total_loss')) return loss
def get_model(feed_previous=False): """构造模型 """ learning_rate = tf.Variable(float(init_learning_rate), trainable=False, dtype=tf.float32) learning_rate_decay_op = learning_rate.assign(learning_rate * 0.9) encoder_inputs = [] decoder_inputs = [] target_weights = [] for i in xrange(input_seq_len): encoder_inputs.append(tf.placeholder(tf.int32, shape=[None], name="encoder{0}".format(i))) for i in xrange(output_seq_len + 1): decoder_inputs.append(tf.placeholder(tf.int32, shape=[None], name="decoder{0}".format(i))) for i in xrange(output_seq_len): target_weights.append(tf.placeholder(tf.float32, shape=[None], name="weight{0}".format(i))) # decoder_inputs左移一个时序作为targets targets = [decoder_inputs[i + 1] for i in xrange(output_seq_len)] cell = tf.contrib.rnn.BasicLSTMCell(size) # 这里输出的状态我们不需要 outputs, _ = seq2seq.embedding_attention_seq2seq( encoder_inputs, decoder_inputs[:output_seq_len], cell, num_encoder_symbols=num_encoder_symbols, num_decoder_symbols=num_decoder_symbols, embedding_size=size, output_projection=None, feed_previous=feed_previous, dtype=tf.float32) # 计算加权交叉熵损失 loss = seq2seq.sequence_loss(outputs, targets, target_weights) # 梯度下降优化器 opt = tf.train.GradientDescentOptimizer(learning_rate) # 优化目标:让loss最小化 update = opt.apply_gradients(opt.compute_gradients(loss)) # 模型持久化 saver = tf.train.Saver(tf.global_variables()) return encoder_inputs, decoder_inputs, target_weights, outputs, loss, update, saver, learning_rate_decay_op, learning_rate
def add_loss_op(self, output): """Adds loss ops to the computational graph. Hint: Use tensorflow.python.ops.seq2seq.sequence_loss to implement sequence loss. Args: output: A tensor of shape (None, self.vocab) Returns: loss: A 0-d tensor (scalar) """ # YOUR CODE HERE all_ones = [tf.ones([self.config.batch_size * self.config.num_steps])] cross_entropy = sequence_loss( [output], [tf.reshape(self.labels_placeholder, [-1])], all_ones, len(self.vocab)) tf.add_to_collection('total_loss', cross_entropy) loss = tf.add_n(tf.get_collection('total_loss')) # END YOUR CODE return loss
def sequence_loss(self, y_pred, y_true):
    '''
    Loss function for the seq2seq RNN. Reshape predicted and true (label)
    tensors, generate dummy weights, then use seq2seq.sequence_loss to
    actually compute the loss function.
    '''
    if self.verbose > 2:
        print("my_sequence_loss y_pred=%s, y_true=%s" % (y_pred, y_true))
    # list of [-1, num_decoder_symbols] elements
    logits = tf.unstack(y_pred, axis=1)
    # y_true has shape [-1, self.out_seq_len]; unstack to a list of
    # self.out_seq_len [-1] elements
    targets = tf.unstack(y_true, axis=1)
    if self.verbose > 2:
        print("my_sequence_loss logits=%s" % (logits,))
        print("my_sequence_loss targets=%s" % (targets,))
    weights = [tf.ones_like(yp, dtype=tf.float32) for yp in targets]
    if self.verbose > 4:
        print("my_sequence_loss weights=%s" % (weights,))
    sl = seq2seq.sequence_loss(logits, targets, weights)
    if self.verbose > 2:
        print("my_sequence_loss return = %s" % sl)
    return sl
        feed_previous=False,
        output_projection=output_projection,
        dtype=tf.float32)

# define our loss function
def sampled_loss(labels, logits):
    return tf.nn.sampled_softmax_loss(
        weights=w_t,
        biases=b,
        labels=tf.reshape(labels, [-1, 1]),
        inputs=logits,
        num_sampled=512,
        num_classes=en_vocab_size)

loss = seq2seq_lib.sequence_loss(outputs, targets, target_weights,
                                 softmax_loss_function=sampled_loss)

def softmax(x):
    n = np.max(x)
    e_x = np.exp(x - n)
    return e_x / e_x.sum()

# feed data into placeholders
def feed_dict(x, y, batch_size=64):
    feed = {}
    idxes = np.random.choice(len(x), size=batch_size, replace=False)
    for i in range(input_seq_len):
        feed[encoder_inputs[i].name] = np.array([x[j][i] for j in idxes])
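The sampled loss above relies on w_t and b being created alongside the decoder's output projection before this excerpt begins; a hedged sketch of those definitions (size stands for the assumed hidden dimension):

# Assumed definitions for w_t and b used by sampled_loss (a sketch; the
# original creates these before the excerpt starts).
w_t = tf.get_variable("proj_w", [en_vocab_size, size], dtype=tf.float32)
w = tf.transpose(w_t)                               # [size, en_vocab_size]
b = tf.get_variable("proj_b", [en_vocab_size], dtype=tf.float32)
output_projection = (w, b)
# tf.nn.sampled_softmax_loss expects weights of shape [num_classes, dim],
# hence the transposed matrix w_t, while the decoder projects with w.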
def rnn_model(model, input_data, output_data, labels, vocab_size,
              batch_size=64, rnn_size=128):
    """
    Construct an RNN seq2seq model.
    :param model: cell type to use ('rnn', 'gru' or 'lstm')
    :param input_data: input data placeholder
    :param output_data: output data placeholder
    :param labels: list of target tensors
    :param vocab_size:
    :param batch_size:
    :param rnn_size:
    :return:
    """
    end_points = {}
    if model == 'rnn':
        cell_fun = tf.contrib.rnn.BasicRNNCell
    elif model == 'gru':
        cell_fun = tf.contrib.rnn.GRUCell
    elif model == 'lstm':
        cell_fun = tf.contrib.rnn.BasicLSTMCell

    cell = cell_fun(rnn_size)
    # cell = tf.contrib.rnn.MultiRNNCell([cell] * num_layers, state_is_tuple=True)

    weights = [tf.ones_like(labels_t, dtype=tf.float32) for labels_t in labels]
    outputs, last_state = seq2seq.embedding_rnn_seq2seq(
        input_data, output_data, cell, vocab_size, vocab_size, len(input_data))
    loss = seq2seq.sequence_loss(outputs, labels, weights)

    tf.summary.scalar("loss", loss)
    magnitude = tf.sqrt(tf.reduce_sum(tf.square(last_state[1])))
    tf.summary.scalar("magnitude at t=1", magnitude)
    summary_op = tf.summary.merge_all()

    learning_rate = 0.05
    momentum = 0.9
    optimizer = tf.train.MomentumOptimizer(learning_rate, momentum)
    train_op = optimizer.minimize(loss)

    logdir = tempfile.mkdtemp()
    print(logdir)
    # Assumes an active session `sess` exists in the enclosing scope.
    summary_writer = tf.summary.FileWriter(logdir, sess.graph)

    # if output_data is not None:
    #     initial_state = cell.zero_state(batch_size, tf.float32)
    # else:
    #     initial_state = cell.zero_state(1, tf.float32)
    # with tf.device("/cpu:0"):
    #     embedding = tf.get_variable(
    #         'embedding',
    #         initializer=tf.random_uniform([vocab_size + 1, rnn_size], -1.0, 1.0))
    #     inputs = tf.nn.embedding_lookup(embedding, input_data)
    #     decoder_inputs = tf.nn.embedding_lookup(embedding, output_data)
    # # [batch_size, ?, rnn_size] = [64, ?, 128]
    # # outputs, last_state = tf.nn.dynamic_rnn(cell, inputs, initial_state=initial_state)
    # outputs, last_state = basic_rnn_seq2seq(inputs, decoder_inputs, cell)
    # output = tf.reshape(outputs, [-1, rnn_size])
    # weights = tf.Variable(tf.truncated_normal([rnn_size, vocab_size + 1]))
    # bias = tf.Variable(tf.zeros(shape=[vocab_size + 1]))
    # logits = tf.nn.bias_add(tf.matmul(output, weights), bias=bias)
    # # [?, vocab_size+1]
    # if output_data is not None:
    #     # output_data must be one-hot encoded
    #     labels = tf.one_hot(tf.reshape(output_data, [-1]), depth=vocab_size + 1)
    #     # should be [?, vocab_size+1]
    #     loss = tf.nn.softmax_cross_entropy_with_logits(labels=labels, logits=logits)
    #     # loss shape should be [?, vocab_size+1]
    #     total_loss = tf.reduce_mean(loss)
    #     train_op = tf.train.AdamOptimizer(learning_rate).minimize(total_loss)
    #     end_points['initial_state'] = initial_state
    #     end_points['output'] = output
    #     end_points['train_op'] = train_op
    #     end_points['total_loss'] = total_loss
    #     end_points['loss'] = loss
    #     end_points['last_state'] = last_state
    # else:
    #     prediction = tf.nn.softmax(logits)
    #     end_points['initial_state'] = initial_state
    #     end_points['last_state'] = last_state
    #     end_points['prediction'] = prediction
    # return end_points
def create_model(self,
                 model_input,
                 vocab_size,
                 num_frames,
                 is_training=True,
                 sparse_labels=None,
                 label_weights=None,
                 **unused_params):
    self.phase_train = is_training
    num_frames = tf.cast(tf.expand_dims(num_frames, 1), tf.float32)
    model_inputs = utils.SampleRandomSequence(model_input, num_frames,
                                              self.max_steps)
    total_vocab_size = vocab_size + 3
    enc_cell = self.get_enc_cell(self.cell_size, total_vocab_size)
    dec_cell = self.get_dec_cell(self.cell_size)
    runtime_batch_size = tf.shape(model_inputs)[0]

    # TODO
    if False:
        with tf.variable_scope("Enc"):
            enc_init_state = enc_cell.zero_state(runtime_batch_size,
                                                 dtype=tf.float32)
            enc_outputs, enc_state = tf.nn.dynamic_rnn(
                enc_cell, model_inputs, initial_state=enc_init_state,
                scope="enc")
    else:
        enc_outputs = model_inputs
        enc_state = dec_cell.zero_state(runtime_batch_size, dtype=tf.float32)

    label_weights = tf.cast(label_weights, tf.float32)
    dec_weights = tf.unstack(label_weights, axis=1)
    dec_input_lists = tf.unstack(sparse_labels, axis=1)
    dec_targets = [
        dec_input_lists[i + 1] for i in xrange(len(dec_input_lists) - 1)
    ]
    dec_targets += [tf.zeros_like(dec_input_lists[0])]

    # enc_outputs_lists = tf.split(enc_outputs,
    #                              num_or_size_splits=self.max_steps, axis=1)
    dec_outputs, _ = attn.embedding_attention_decoder(
        dec_input_lists,
        initial_state=enc_state,
        attention_states=enc_outputs,
        cell=dec_cell,
        num_symbols=total_vocab_size,
        embedding_size=1024,
        output_size=total_vocab_size,
        output_projection=None,
        feed_previous=False,
        dtype=tf.float32,
        scope="LSTMEncDec")

    loss = seq2seq_lib.sequence_loss(dec_outputs, dec_targets, dec_weights,
                                     softmax_loss_function=None)

    # logits = tf.reduce_mean(dec_outputs, axis=0)
    label_num = tf.reduce_sum(label_weights, axis=1, keep_dims=True)
    logits = tf.add_n(dec_outputs) / label_num
    # logits = tf.Print(logits.get_shape(), [logits])
    logits = logits[:, :vocab_size]
    # logits = tf.nn.sigmoid(enc_outputs[:, -1, :])
    return {
        "predictions": dec_outputs,
        # "predictions": logits,
        "loss": loss,
    }
def _seq2seq(self):
    hps = self._hps
    vocab_size = self._vocab.count
    with tf.variable_scope("SumModel"):
        article_lens = self._article_lens
        # sequence_loss expects seq_len * [batch_size]
        targets = tf.unstack(tf.transpose(self._targets))
        loss_weights = tf.unstack(tf.transpose(self._loss_weights))

        with tf.variable_scope('embedding'), tf.device('/cpu:0'):
            embedding = tf.get_variable("embedding",
                                        [vocab_size, hps.emb_dim],
                                        dtype=tf.float32)
            # [batch, seq_len, emb_dim]
            emb_encoder_inputs = tf.nn.embedding_lookup(
                embedding, self._articles)
            emb_decoder_inputs = tf.nn.embedding_lookup(
                embedding, self._abstracts)

        with tf.variable_scope("encoder"):
            cell_fw = LSTMCell(hps.num_hidden,
                               initializer=tf.random_uniform_initializer(
                                   -0.1, 0.1, seed=123),
                               state_is_tuple=False)
            cell_bw = LSTMCell(hps.num_hidden,
                               initializer=tf.random_uniform_initializer(
                                   -0.1, 0.1, seed=113),
                               state_is_tuple=False)
            # outputs: (output_fw, output_bw) =>
            #   output_fw: [batch_size, max_time, cell_fw.output_size]
            # output_states: a tuple (output_state_fw, output_state_bw)
            encoder_outputs, encoder_output_states = \
                tf.nn.bidirectional_dynamic_rnn(
                    cell_fw, cell_bw,
                    inputs=emb_encoder_inputs,
                    dtype=tf.float32,
                    sequence_length=article_lens)
            # encoder_outputs: [batch_size, max_time, 2 * output_size]
            self._enc_outputs = tf.concat(encoder_outputs, axis=2)
            # [batch_size, 2 * output_size]
            encoder_state_fw, _ = encoder_output_states

        with tf.variable_scope("output_projection"):
            w = tf.get_variable(
                "w", [hps.num_hidden, vocab_size], dtype=tf.float32,
                initializer=tf.truncated_normal_initializer(stddev=1e-4))
            v = tf.get_variable(
                "b", [vocab_size], dtype=tf.float32,
                initializer=tf.truncated_normal_initializer(stddev=1e-4))

        with tf.variable_scope("decoder"):
            loop_function = None
            if hps.mode == "test":
                loop_function = _extract_argmax_and_embed(
                    embedding, (w, v), update_embedding=False)
            decoder_cell = LSTMCell(
                hps.num_hidden,
                initializer=tf.random_uniform_initializer(-0.1, 0.1,
                                                          seed=113),
                state_is_tuple=False)
            # Convert the inputs to the layout the decoder expects:
            # [seq_len, batch, emb_dim] => seq_len * [batch, emb_dim]
            emb_decoder_inputs = tf.unstack(
                tf.transpose(emb_decoder_inputs, perm=[1, 0, 2]))
            # [batch, cell_size]
            self._dec_in_state = encoder_state_fw
            initial_state_attention = (hps.mode == 'test')
            # decoder_outputs: seq_len * [batch, hidden_size]
            # self._dec_out_state: [batch, state_size] = [batch, 2*cell_size]
            decoder_outputs, self._dec_out_state = attention_decoder(
                decoder_inputs=emb_decoder_inputs,
                initial_state=self._dec_in_state,
                attention_states=self._enc_outputs,
                cell=decoder_cell,
                num_heads=1,
                loop_function=loop_function,
                initial_state_attention=initial_state_attention)

        with tf.variable_scope("output"):
            # Equivalently: [batch * seq_len, vsize]
            output = tf.reshape(tf.stack(values=decoder_outputs, axis=1),
                                [-1, hps.num_hidden])
            logits = tf.matmul(output, w) + v
            # seq_len * [batch, vsize]
            model_outputs = tf.unstack(
                tf.reshape(logits, [-1, hps.dec_timesteps, vocab_size]),
                axis=1)
            # Shared output layer (alternative):
            # model_outputs = []
            # for i in range(len(decoder_outputs)):
            #     if i > 0:
            #         tf.get_variable_scope().reuse_variables()
            #     model_outputs.append(
            #         tf.nn.xw_plus_b(decoder_outputs[i], w, v))

        with tf.variable_scope("loss"):
            # logits:  seq_len * [batch_size, vsize]
            # targets: seq_len * [batch_size]
            # weights: seq_len * [batch_size]; the weights act as a padding mask
            # 1. sequence_loss first calls sequence_loss_by_example to get a
            #    [batch_size] loss, then divides by batch_size.
            # 2. sequence_loss_by_example uses the weights as a mask to obtain
            #    the average loss over the real time steps. Because sentences
            #    in a batch have different true lengths, the weights are
            #    initialized to zeros and then filled with 1s up to each
            #    sentence's length.
            self._loss = sequence_loss(logits=model_outputs,
                                       targets=targets,
                                       weights=loss_weights)

        if hps.mode == "test":
            with tf.variable_scope("decode_output"):
                # seq_len * [batch, vsize] => seq_len * [batch, 1]
                best_outputs = [tf.arg_max(x, 1) for x in model_outputs]
                # [batch, seq_len]
                self._outputs = tf.concat(
                    axis=1,
                    values=[tf.reshape(x, [-1, 1]) for x in best_outputs])
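The loss comments above describe the padding mask: weights start at zero and are set to 1 for each abstract's real tokens. A minimal numpy sketch of that construction (the batch and lengths here are hypothetical):

# Numpy sketch of the padding-mask weights described above (values illustrative).
import numpy as np

dec_timesteps = 6
abstract_lens = [4, 2, 6]              # true abstract lengths in this batch
loss_weights = np.zeros([len(abstract_lens), dec_timesteps], dtype=np.float32)
for row, length in enumerate(abstract_lens):
    loss_weights[row, :length] = 1.0   # 1 on real tokens, 0 on padding
# This [batch, seq_len] array is what self._loss_weights feeds; it is then
# transposed and unstacked to seq_len * [batch] as above.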
def create_model(self,
                 model_input,
                 vocab_size,
                 num_frames,
                 is_training=True,
                 dense_labels=None,
                 feature_sizes=None,
                 input_weights=None,
                 **unused_params):
    self.is_training = is_training
    feature_size = sum(feature_sizes)
    num_frames = tf.cast(tf.expand_dims(num_frames, 1), tf.float32)
    # TODO
    self.max_steps = 300  # 30
    enc_inputs = utils.SampleRandomSequence(model_input, num_frames,
                                            self.max_steps)
    enc_cell = self.get_enc_cell(self.cell_size, self.cell_size)
    dec_cell = self.get_dec_cell(self.cell_size)
    runtime_batch_size = tf.shape(enc_inputs)[0]
    enc_init_state = enc_cell.zero_state(runtime_batch_size, dtype=tf.float32)
    enc_outputs, enc_state = tf.nn.dynamic_rnn(
        enc_cell, enc_inputs, initial_state=enc_init_state, scope="enc")

    if True:
        enc_outputs_stopped = tf.stop_gradient(enc_outputs)
        input_weights = tf.tile(tf.expand_dims(input_weights, 2),
                                [1, 1, self.cell_size])
        enc_outputs_stopped = enc_outputs_stopped * input_weights
        enc_rep = tf.reduce_sum(enc_outputs_stopped, axis=1) / num_frames
        # enc_rep = tf.reduce_sum(enc_outputs_stopped, axis=1) / self.max_steps
        self.vocab_size = vocab_size
        cls_func = self.moe
        logits = cls_func(enc_rep)
        if cls_func == self.moe:
            epsilon = 1e-12
            labels = tf.cast(dense_labels, tf.float32)
            cross_entropy_loss = labels * tf.log(logits + epsilon) + (
                1 - labels) * tf.log(1 - logits + epsilon)
            cross_entropy_loss = tf.negative(cross_entropy_loss)
            loss = tf.reduce_mean(tf.reduce_sum(cross_entropy_loss, 1))
            predictions = logits
        else:
            loss = tf.nn.sigmoid_cross_entropy_with_logits(
                labels=tf.cast(dense_labels, tf.float32), logits=logits)
            loss = tf.reduce_mean(tf.reduce_sum(loss, 1))
            predictions = tf.nn.sigmoid(logits)
    else:
        # Reconstruct the (reversed) input sequence with an attention decoder.
        dec_targets = tf.unstack(enc_inputs, axis=1)
        dec_targets.reverse()
        dec_inputs = [tf.zeros_like(dec_targets[0])] + dec_targets[:-1]
        dec_outputs, _ = attn.attention_decoder(
            decoder_inputs=dec_inputs,
            initial_state=enc_state,
            attention_states=enc_outputs,
            cell=dec_cell,
            output_size=feature_size,
            dtype=tf.float32)
        dec_weights = []
        for _ in xrange(self.max_steps):
            dec_weights.append(
                tf.ones([runtime_batch_size], dtype=tf.float32))
        loss = seq2seq_lib.sequence_loss(
            dec_outputs, dec_targets, dec_weights,
            softmax_loss_function=self.reconstruct_loss)
        predictions = tf.no_op()

    return {
        "loss": loss,
        "predictions": predictions,
    }
def _build_model(self):
    """
    Builds a model either for training or testing
    :return:
    """
    cell = self._set_cell_type()
    self._build_inputs()
    output_projection = None
    print("Embedding size: ", self.embedding_size)

    if self.use_attn:
        if self.copy:
            print("Using attention of form ", self.attn_type,
                  " with copy mechanism...")
        else:
            print("Using attention of form ", self.attn_type)
        self.outputs, self.states, self.attn_outputs = \
            embedding_attention_seq2seq(
                self.encoder_inputs,
                self.decoder_inputs,
                cell,
                num_encoder_symbols=self.vocab_size,
                num_decoder_symbols=self.vocab_size,
                embedding_size=self.embedding_size,
                output_projection=output_projection,
                feed_previous=self.do_decode,
                dtype=tf.float32,
                copy=self.copy,
                attn_type=self.attn_type)
    else:
        print("Using vanilla seq2seq...")
        self.outputs, self.states = embedding_rnn_seq2seq(
            self.encoder_inputs,
            self.decoder_inputs,
            cell,
            num_encoder_symbols=self.vocab_size,
            num_decoder_symbols=self.vocab_size,
            embedding_size=self.embedding_size,
            output_projection=output_projection,
            feed_previous=self.do_decode,
            dtype=tf.float32)
        self.attn_outputs = None

    # Compute loss -- averaged across batch, with l2 loss added
    trainable_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES)
    # Only penalize non-bias terms
    non_bias_vars = [v for v in trainable_vars if "Bias" not in v.name]
    l2_loss = tf.add_n(
        [self.l2_reg * tf.nn.l2_loss(nb) for nb in non_bias_vars])
    self.total_loss = sequence_loss(self.outputs, self.decoder_inputs,
                                    self.target_weights) + l2_loss
    self.training_op = tf.train.AdamOptimizer(
        learning_rate=0.0001).minimize(self.total_loss)

    # argmax over the vocabulary axis gives per-token ids of [seq, batch];
    # transpose to [batch, seq]
    self.dec_prediction = tf.transpose(tf.argmax(self.outputs, axis=-1),
                                       [1, 0])
    self.predictions = tf.transpose(
        tf.argmax(tf.stack(self.outputs), axis=-1), [1, 0])
    self.saver = tf.train.Saver(max_to_keep=10)
    self.increment_global_step = tf.assign_add(self.global_step, 1,
                                               name='increment_global_step')