def encode_v2(self, question_embeddings, document_embeddings, question_mask, context_mask, encoderb_state_input, dropout_keep_prob):
    """Coattention-style encoder: run question and document through a shared
    LSTM, build the affinity matrix L = D^T * Q, and return the document
    encoding concatenated with its attention context.

    Args (shapes assumed from the transposes below — TODO confirm):
        question_embeddings / document_embeddings: batch x embed x len
            tensors; transposed to batch x len x embed before dynamic_rnn.
        question_mask / context_mask: masks whose per-row sums give the
            true sequence lengths.
        encoderb_state_input, dropout_keep_prob: accepted but unused here.

    Returns:
        final_D: tf.concat(1, [D, C_D]) — document states plus coattention
        context.
    """
    with vs.variable_scope("encoder"):
        # Question -> LSTM -> Q
        lstm_cell = tf.nn.rnn_cell.LSTMCell(self.embedding_size)
        question_length = tf.reduce_sum(tf.cast(question_mask, tf.int32),
                                        reduction_indices=1)
        print("Question length: ", question_length)
        Q_prime, _ = dynamic_rnn(lstm_cell,
                                 tf.transpose(question_embeddings, [0, 2, 1]),
                                 sequence_length=question_length,
                                 time_major=False,
                                 dtype=tf.float32)
        Q_prime = tf.transpose(Q_prime, [0, 2, 1])
        print("Q_prime: ", Q_prime)
        # Non-linear projection layer on top of the question encoding
        W_Q = tf.get_variable("W_Q", (self.embedding_size, self.embedding_size))
        b_Q = tf.get_variable("b_Q", (self.embedding_size, 1))
        # NOTE(review): the projection is applied to the raw
        # question_embeddings, so Q_prime (the LSTM output above) is never
        # consumed — confirm whether Q should be built from Q_prime instead.
        Q = tf.tanh(
            matrix_multiply_with_batch(matrix=W_Q,
                                       batch=question_embeddings,
                                       matrixByBatch=True) + b_Q)
        print("Q: ", Q)
        # Paragraph -> LSTM -> D (reusing the question LSTM's weights)
        tf.get_variable_scope().reuse_variables()
        print("Context mask: ", context_mask)
        context_length = tf.reduce_sum(tf.cast(context_mask, tf.int32),
                                       reduction_indices=1)
        D, _ = dynamic_rnn(lstm_cell,
                           tf.transpose(document_embeddings, [0, 2, 1]),
                           sequence_length=context_length,
                           time_major=False,
                           dtype=tf.float32)
        D = tf.transpose(D, [0, 2, 1])
        print("D: ", D)
        # Affinity matrix and its two softmax normalizations.
        L = tf.matmul(tf.transpose(D, [0, 2, 1]), Q)
        A_Q = tf.nn.softmax(L)
        A_D = tf.nn.softmax(tf.transpose(L, [0, 2, 1]))
        print("A_Q: ", A_Q)
        print("A_D: ", A_D)
        # Attention contexts: C_Q summarizes D per question position;
        # C_D summarizes [Q; C_Q] per document position.
        C_Q = batch_matmul(D, A_Q)
        print("C_Q: ", C_Q)
        concat = tf.concat(1, [Q, C_Q])
        print("concat: ", concat)
        C_D = batch_matmul(tf.concat(1, [Q, C_Q]), A_D)
        print("C_D: ", C_D)
        final_D = tf.concat(1, [D, C_D])
        print("final D: ", final_D)
        return final_D
def decode_v2(self, final_D, W, W_prime, context_mask):
    """Decode the coattention encoding into answer-span logits.

    Args:
        final_D: document encoding produced by encode_v2.
        W: projection producing the answer-start logits.
        W_prime: projection producing the answer-end logits.
        context_mask: mask whose per-row sums give the true context lengths.

    Returns:
        (a_s, a_e): answer-start and answer-end logit tensors.
    """
    with vs.variable_scope("answer_start"):
        # a_s = final_D * W
        a_s = tf.squeeze(
            matrix_multiply_with_batch(matrix=W,
                                       batch=tf.transpose(final_D, [0, 2, 1]),
                                       matrixByBatch=False))
        print("a_s: ", a_s)
    with vs.variable_scope("answer_end"):
        # Run one more LSTM pass over the encoding before scoring the end
        # index.
        lstm_cell = tf.nn.rnn_cell.LSTMCell(self.output_size)
        context_length = tf.reduce_sum(tf.cast(context_mask, tf.int32),
                                       reduction_indices=1)
        print("Context length: ", context_length)
        final_D_prime, _ = dynamic_rnn(lstm_cell,
                                       final_D,
                                       sequence_length=context_length,
                                       time_major=False,
                                       dtype=tf.float32)
        print("final D prime: ", final_D_prime)
        # Fix: the original built this projection twice — the first result
        # was only printed and then discarded, adding a duplicate subgraph.
        # Build it once and squeeze the same tensor.
        a_e = matrix_multiply_with_batch(matrix=W_prime,
                                         batch=tf.transpose(
                                             final_D_prime, [0, 2, 1]),
                                         matrixByBatch=False)
        print("a_e: ", a_e)
        a_e = tf.squeeze(a_e)
        print("a_e: ", a_e)
    return (a_s, a_e)
def encoder(self, x_enc_onehot, len_enc, reuse=False):
    """VAE-style encoder: map a (soft) one-hot token sequence to a latent
    sample z and its Gaussian posterior parameters.

    Args:
        x_enc_onehot: soft one-hot tokens, embedded via
            _soft_embedding_lookup against self.embed.
        len_enc: per-example sequence lengths for dynamic_rnn.
        reuse: whether to reuse the "encoder" variable scope.

    Returns:
        (z, mu, logvar) where z = mu + exp(0.5 * logvar) * epsilon
        (reparameterization trick).
    """
    with tf.variable_scope("encoder", reuse=reuse):
        in_enc = self._soft_embedding_lookup(self.embed, x_enc_onehot)
        # NOTE(review): self.cell() is called twice (once for zero_state,
        # once for the RNN) — confirm self.cell returns/handles a shared
        # cell so no duplicate variables are created.
        initial_state = self.cell().zero_state(self.config.batch_size,
                                               tf.float32)
        out_tuple = dynamic_rnn(cell=self.cell(reuse),
                                inputs=in_enc,
                                sequence_length=len_enc,
                                initial_state=initial_state)
        # Only the final hidden state is used; per-step outputs are dropped.
        (_, encoder_hidden) = out_tuple
        # linear layers for mu and log(var); latent_dim is tied to the RNN
        # hidden size here.
        latent_dim = hidden_size = self.config.hidden_size
        W_mu = tf.get_variable("W_mu", [hidden_size, latent_dim])
        b_mu = tf.get_variable("b_mu", [latent_dim])
        W_logvar = tf.get_variable("W_logvar", [hidden_size, latent_dim])
        b_logvar = tf.get_variable("b_logvar", [latent_dim])
        #l2_loss = tf.nn.l2_loss(W_mu) + tf.nn.l2_loss(W_logvar)
        mu = tf.matmul(encoder_hidden, W_mu) + b_mu
        logvar = tf.matmul(encoder_hidden, W_logvar) + b_logvar
        # sample epsilon ~ N(0, I)
        epsilon = tf.random_normal(tf.shape(logvar), name='epsilon')
        # sample latent variable via the reparameterization trick
        stddev = tf.exp(0.5 * logvar)  # standard deviation
        z = mu + tf.multiply(stddev, epsilon)
        return z, mu, logvar
def discriminator(self, inputs, inputs_length, reuse=False):
    """Sequence discriminator.

    Embeds `inputs` (soft one-hot) via _soft_embedding_lookup, encodes them
    with the model's recurrent cell, and projects the final state to
    vocabulary-sized logits.

    Returns:
        (logits, predicted): the dense-layer outputs and their argmax.
    """
    with tf.variable_scope('discriminator', reuse=reuse):
        embedded = self._soft_embedding_lookup(self.embed, inputs)
        _, final_state = dynamic_rnn(cell=self.cell(reuse),
                                     inputs=embedded,
                                     sequence_length=inputs_length,
                                     dtype=tf.float32)
        projection = Dense(self.config.vocab_num)
        logits = projection(final_state)
        best_ids = tf.argmax(logits, 1)
        return logits, best_ids
def add_prediction_op(self):
    """Transform a batch of input frames into per-frame MFCC predictions.

    Runs a dropout-wrapped LSTM over self.input_placeholder (masked to the
    true number of frames derived from self.input_masks_placeholder), then
    applies a per-timestep linear projection W (Xavier-initialized) plus a
    broadcast bias b.

    Returns:
        mfcc_preds: tensor of shape
            (batch_size, max_num_frames, n_mfcc_features); also stored on
            self.mfcc.
    """
    lstm_cell = tf.contrib.rnn.LSTMCell(self.config.state_size)
    # Dropout is applied to the LSTM inputs only.
    lstm_cell = tf.contrib.rnn.DropoutWrapper(
        lstm_cell, input_keep_prob=self.config.dropout_keep_prob)
    #print "inputs: ", self.input_placeholder
    # Masks are shape (?, 582, 13), but the last dimension is redundant, so
    # we get rid of it when calculating the sequence length for the LSTM
    source_num_frames = tf.reduce_sum(tf.cast(
        self.input_masks_placeholder[:, :, 0], tf.int32),
                                      reduction_indices=1)
    outputs, final_state = dynamic_rnn(lstm_cell,
                                       self.input_placeholder,
                                       sequence_length=source_num_frames,
                                       dtype=tf.float32)
    #print "LSTM outputs: ", outputs
    #print "final state: ", final_state
    xavier = tf.contrib.layers.xavier_initializer()
    W = tf.get_variable("W",
                        shape=(self.config.state_size,
                               self.config.n_mfcc_features),
                        initializer=xavier)
    b = tf.get_variable("b", shape=(1, self.config.n_mfcc_features))
    # NOTE(review): these print the symbolic shape tensor object, not its
    # runtime value — debug leftovers.
    print tf.shape(outputs)
    # Collapse (batch, time) so one matmul projects every timestep at once.
    outputs = tf.reshape(outputs, [-1, self.config.state_size])
    print tf.shape(outputs)
    mfcc_preds = tf.matmul(outputs, W)
    mfcc_preds = tf.reshape(
        mfcc_preds,
        [-1, self.config.max_num_frames, self.config.n_mfcc_features])
    mfcc_preds += b
    #print "mfcc_preds: ", mfcc_preds
    self.mfcc = mfcc_preds
    return mfcc_preds
def build_graph(self):
    """Build the query-classification graph.

    Pipeline: embedding lookup (+dropout) -> dropout-wrapped LSTM ->
    multi-width text CNN with max-over-time pooling -> dropout -> linear
    layer to 4-class logits, plus loss/accuracy ops and a gradient-clipped
    Adam training op.

    Fixes applied to the original:
      * `self,query_length` -> `self.query_length` (syntax error),
      * `tf.variabel_scope` -> `tf.variable_scope` (twice),
      * `tf.matmal` -> `tf.matmul`,
      * `self.out` -> `self.output` (attribute was never defined),
      * conv2d filter made 4-D: [height, width, in_channels, out_channels],
      * `self.label_y` now derived from the one-hot labels, not from the
        1-D prediction vector,
      * `zip(*opt.compute_gradients(...))` unpacking and consistent
        gradient variable names (the original referenced `gradients`
        before assignment).
    """
    self.input_query = tf.placeholder(tf.int32, shape=[None, 50],
                                      name="index_query")
    self.query_length = tf.placeholder(dtype=tf.int32, shape=[None],
                                       name="query_length")
    self.label = tf.placeholder(dtype=tf.int32, shape=[None, 4],
                                name="label")
    self.dropout_pl = tf.placeholder(dtype=tf.float32, shape=[],
                                     name="dropout")
    self.lr_pl = tf.placeholder(dtype=tf.float32, shape=[], name="lr")
    _word_embeddings = tf.Variable(self.embeddings,
                                   dtype=tf.float32,
                                   trainable=self.update_embedding,
                                   name="word_embeddings")
    # NOTE(review): self.dropout is assumed to be a keep-prob attribute set
    # elsewhere; confirm whether the self.dropout_pl placeholder created
    # above was meant to be used instead.
    self.embed_query = tf.nn.dropout(
        tf.nn.embedding_lookup(params=_word_embeddings,
                               ids=self.input_query,
                               name="query_embeddings"), self.dropout)
    with tf.variable_scope('lstm'):
        self.cell_q = SimpleLSTMCell(self.hidden_dim)
        self.cell_q = tf.nn.rnn_cell.DropoutWrapper(
            self.cell_q, output_keep_prob=self.dropout)
        output, _ = dynamic_rnn(self.cell_q, self.embed_query,
                                self.query_length, dtype=tf.float32)
    with tf.variable_scope('cnn'):
        # Add a channel axis: batch x len x hidden -> batch x len x hidden x 1
        outputs = tf.expand_dims(output, -1)
        pooled_outputs = []
        for i, filter_size in enumerate(self.filters_size):
            # conv2d needs a 4-D filter: [h, w, in_channels, out_channels].
            filter_shape = [filter_size, self.hidden_dim, 1,
                            self.num_filters]
            w = tf.Variable(tf.truncated_normal(filter_shape, stddev=0.1),
                            name='w')
            b = tf.Variable(tf.constant(0.1, shape=[self.num_filters]),
                            name='b')
            conv = tf.nn.conv2d(outputs, w, strides=[1, 1, 1, 1],
                                padding='VALID', name='conv')
            h = tf.nn.relu(tf.nn.bias_add(conv, b), name='relu')
            # Max-pool over every valid position of the 50-token query.
            pooled = tf.nn.max_pool(h,
                                    ksize=[1, 50 - filter_size + 1, 1, 1],
                                    strides=[1, 1, 1, 1],
                                    padding='VALID',
                                    name='pool')
            pooled_outputs.append(pooled)
        outputs_ = tf.concat(pooled_outputs, 3)
        # Assumes len(self.filters_size) == 3 (hence 3 * num_filters).
        self.output = tf.reshape(outputs_, shape=[-1, 3 * self.num_filters])
    with tf.variable_scope('output'):
        out_final = tf.nn.dropout(self.output, keep_prob=self.dropout)
        o_w = tf.Variable(tf.truncated_normal([3 * self.num_filters, 4],
                                              stddev=0.1), name='o_w')
        o_b = tf.Variable(tf.constant(0.1, shape=[4]), name='o_b')
        self.logits = tf.matmul(out_final, o_w) + o_b
        self.pred_y = tf.argmax(tf.nn.softmax(self.logits), 1, name="pred")
        # Ground-truth class index from the one-hot label matrix.
        self.label_y = tf.argmax(self.label, 1, name="label_y")
    self.pred = tf.equal(self.pred_y, self.label_y)
    self.accuray = tf.reduce_mean(tf.cast(self.pred, tf.float32),
                                  name="accuracy")
    self.loss = tf.reduce_mean(
        tf.nn.softmax_cross_entropy_with_logits(logits=self.logits,
                                                labels=self.label),
        name="loss")
    self.global_step = tf.Variable(0, trainable=False)
    opt = tf.train.AdamOptimizer(learning_rate=self.lr_pl)
    # compute_gradients returns (gradient, variable) pairs; unzip them.
    grads, variables = zip(*opt.compute_gradients(self.loss))
    gradients, _ = tf.clip_by_global_norm(grads, self.clip_grad)
    self.train_op = opt.apply_gradients(zip(gradients, variables),
                                        global_step=self.global_step)
    tf.summary.scalar("loss", self.loss)
def _build_encoder(self, graph_embed_input):
    """Encode the posts with a stacked GRU.

    Each step's input is the word embedding concatenated with the graph
    embedding: e(x_t) = [w(x_t); g_i].

    Returns:
        (encoder_output, encoder_state) — outputs have shape
        [batch_size, max_time, cell.output_size].
    """
    word_vectors = tf.nn.embedding_lookup(
        self.word_embed, self.posts_word_id)  # batch*len*unit
    rnn_input = tf.concat([word_vectors, graph_embed_input], axis=2)
    stacked_cell = MultiRNNCell(
        [GRUCell(self.num_units) for _ in range(self.num_layers)])
    outputs, state = dynamic_rnn(stacked_cell,
                                 rnn_input,
                                 self.posts_length,
                                 dtype=tf.float32,
                                 scope="encoder")
    return outputs, state
def __init__(self, cfg, word_embd, max_ques_len, input_producer, generated=None):
    """Question discriminator.

    When `generated` is provided, the question arrives as a soft
    distribution over the vocabulary (generator output) and is embedded by
    matmul with the embedding table; otherwise integer-id placeholders are
    created and embedded by lookup. A GRU encodes the question and a dense
    layer scores its final state against the answer vocabulary.
    """
    batch_size = cfg.batch_size
    vocab_size = len(word_embd)
    with tf.variable_scope('disc'):
        # Embedding table initialized from the pre-trained matrix.
        word_embd = tf.get_variable(
            'word_embd',
            shape=word_embd.shape,
            initializer=tf.constant_initializer(word_embd))
        if generated:
            self.ques = generated['ques']
            self.ques_len = generated['ques_len']
            # soft embedding_lookup: expected embedding under the softmax
            # distribution, so gradients can flow to the generator.
            ques = tf.reshape(self.ques, [-1, vocab_size])
            ques = tf.matmul(ques, word_embd)
            ques = tf.reshape(ques, [batch_size, -1, cfg.embed_dim])
        else:
            self.ques = tf.placeholder(tf.int32,
                                       shape=[None, max_ques_len],
                                       name='question')
            self.ques_len = tf.placeholder(tf.int32,
                                           shape=[None],
                                           name='question_length')
            ques = embedding_lookup(word_embd, self.ques)
        self.answ = input_producer.answ_disc
        cell = GRUCell(cfg.hidden_size)
        _, state = dynamic_rnn(cell,
                               ques,
                               sequence_length=self.ques_len,
                               dtype=tf.float32)
        # Score the final GRU state against the whole vocabulary.
        output_layer = Dense(vocab_size)
        logits = output_layer(state)
        labels = tf.one_hot(self.answ, vocab_size)
        self.pred = tf.argmax(logits, 1)
        loss = softmax_cross_entropy_with_logits(labels=labels,
                                                 logits=logits)
        self.loss = tf.reduce_mean(loss)
def __init__(self,
             num_symbols,      # 18430, vocabulary size.
             num_embed_units,  # 300, Size of word embedding.
             num_units,        # 512, Size of each model layer.
             num_layers,       # 1, Number of layers in the model.
             num_labels,       # 5, Number of labels.
             embed,            # (18430, 300), word2vector list.
             learning_rate=0.5,
             max_gradient_norm=5.0):
    """Sentence classifier: string-to-index lookup -> embedding (+dropout)
    -> GRU -> two dense layers with dropout -> num_labels logits, trained
    with gradient-clipped SGD.
    """
    # todo: implement placeholders
    self.texts = tf.placeholder(dtype=tf.string,
                                shape=[None, None],
                                name='texts')  # shape: batch*len
    self.texts_length = tf.placeholder(dtype=tf.int64,
                                       shape=[None],
                                       name='texts_length')  # shape: batch
    self.labels = tf.placeholder(dtype=tf.int64,
                                 shape=[None],
                                 name='labels')  # shape: batch
    self.symbol2index = MutableHashTable(
        key_dtype=tf.string,
        value_dtype=tf.int64,
        default_value=UNK_ID,
        shared_name="in_table",
        name="in_table",
        checkpoint=True)
    # build the vocab table (string to index)
    # initialize the training process
    self.learning_rate = tf.Variable(float(learning_rate),
                                     trainable=False,
                                     dtype=tf.float32)
    self.global_step = tf.Variable(0, trainable=False)
    self.epoch = tf.Variable(0, trainable=False)
    self.epoch_add_op = self.epoch.assign(self.epoch + 1)
    self.index_input = self.symbol2index.lookup(self.texts)  # batch*len
    # build the embedding table (index to vector)
    if embed is None:
        # initialize the embedding randomly
        self.embed = tf.get_variable('embed',
                                     [num_symbols, num_embed_units],
                                     tf.float32)
    else:
        # initialize the embedding by pre-trained word vectors
        self.embed = tf.get_variable('embed',
                                     dtype=tf.float32,
                                     initializer=embed)
    self.embed_input = tf.nn.embedding_lookup(
        self.embed, self.index_input)  #batch*len*embed_unit
    # NOTE(review): only num_layers == 1 is handled; `cell` is undefined
    # for any other value.
    if num_layers == 1:
        # cell = BasicLSTMCell(num_units)
        cell = GRUCell(num_units)
        # cell = BasicRNNCell(num_units)
    keep_prob = 0.95
    dropped_input = tf.nn.dropout(self.embed_input, keep_prob=keep_prob)
    outputs, states = dynamic_rnn(cell,
                                  dropped_input,
                                  self.texts_length,
                                  dtype=tf.float32,
                                  scope="rnn")
    # todo: implement unfinished networks
    # logits = tf.layers.dense(inputs=states, units=num_labels)
    # Two-layer classification head with dropout between layers.
    l1 = tf.nn.dropout(states, keep_prob=keep_prob)
    inner_layer = tf.layers.dense(inputs=l1,
                                  units=256,
                                  activation=tf.nn.relu)
    l2 = tf.nn.dropout(inner_layer, keep_prob=keep_prob)
    logits = tf.layers.dense(inputs=l2, units=num_labels)
    # Summed cross-entropy over the batch; mean_loss drives the gradients.
    self.loss = tf.reduce_sum(
        tf.nn.sparse_softmax_cross_entropy_with_logits(labels=self.labels,
                                                       logits=logits),
        name='loss')
    mean_loss = self.loss / tf.cast(tf.shape(self.labels)[0],
                                    dtype=tf.float32)
    predict_labels = tf.argmax(logits, 1, 'predict_labels')
    # NOTE: "accuracy" here is the COUNT of correct predictions in the
    # batch, not a ratio.
    self.accuracy = tf.reduce_sum(tf.cast(
        tf.equal(self.labels, predict_labels), tf.int32),
                                  name='accuracy')
    self.params = tf.trainable_variables()
    # calculate the gradient of parameters
    opt = tf.train.GradientDescentOptimizer(self.learning_rate)
    gradients = tf.gradients(mean_loss, self.params)
    clipped_gradients, self.gradient_norm = tf.clip_by_global_norm(
        gradients, max_gradient_norm)
    self.update = opt.apply_gradients(zip(clipped_gradients, self.params),
                                      global_step=self.global_step)
    tf.summary.scalar('loss/step', self.loss)
    for each in tf.trainable_variables():
        tf.summary.histogram(each.name, each)
    self.merged_summary_op = tf.summary.merge_all()
    self.saver = tf.train.Saver(write_version=tf.train.SaverDef.V2,
                                max_to_keep=3,
                                pad_step_number=True,
                                keep_checkpoint_every_n_hours=1.0)
def __init__(self,
             num_items,
             num_embed_units,
             num_units,
             num_layers,
             vocab=None,
             embed=None,
             learning_rate=5e-4,
             learning_rate_decay_factor=0.95,
             max_gradient_norm=5.0,
             use_lstm=True):
    """Environment (user-simulator) model for recommendation.

    Encodes an item-click session with a multi-layer LSTM/GRU, scores each
    step's candidate recommendation list against a learned preference
    vector, and (unless simulated data is used) predicts a purchase
    probability. Builds a training graph, a reuse=True inference graph
    (last-step prediction plus Gumbel sampling), and both a pretraining
    update and a score-weighted adversarial update.
    """
    self.epoch = tf.Variable(0, trainable=False, name='env/epoch')
    self.epoch_add_op = self.epoch.assign(self.epoch + 1)
    # Placeholders: session item ids, per-step recommendation lists and
    # their validity mask, index of the clicked ("aim") item within each
    # list, session lengths, and per-step purchase flags.
    self.sessions_input = tf.placeholder(tf.int32, shape=(None, None))
    self.rec_lists = tf.placeholder(tf.int32, shape=(None, None, None))
    self.rec_mask = tf.placeholder(tf.float32, shape=(None, None, None))
    self.aims_idx = tf.placeholder(tf.int32, shape=(None, None))
    self.sessions_length = tf.placeholder(tf.int32, shape=(None))
    self.purchase = tf.placeholder(tf.int32, shape=(None, None))
    if embed is None:
        # initialize the item embedding randomly
        self.embed = tf.get_variable(
            'env/embed', [num_items, num_embed_units],
            tf.float32,
            initializer=tf.truncated_normal_initializer(0, 1))
    else:
        # initialize the embedding with a pre-trained matrix
        self.embed = tf.get_variable('env/embed',
                                     dtype=tf.float32,
                                     initializer=embed)
    batch_size, encoder_length, rec_length = tf.shape(
        self.sessions_input)[0], tf.shape(
            self.sessions_input)[1], tf.shape(self.rec_lists)[2]
    # 1.0 for valid steps (up to sessions_length - 1), 0.0 afterwards.
    encoder_mask = tf.reshape(
        tf.cumsum(tf.one_hot(self.sessions_length - 2, encoder_length),
                  reverse=True,
                  axis=1), [-1, encoder_length])
    self.encoder_input = tf.nn.embedding_lookup(
        self.embed, self.sessions_input)  #batch*len*unit
    self.aims = tf.one_hot(self.aims_idx, rec_length)
    if use_lstm:
        cell = MultiRNNCell(
            [LSTMCell(num_units) for _ in range(num_layers)])
    else:
        cell = MultiRNNCell(
            [GRUCell(num_units) for _ in range(num_layers)])
    # Training
    with tf.variable_scope("env"):
        # [batch_size, length, num_units]
        encoder_output, _ = dynamic_rnn(cell,
                                        self.encoder_input,
                                        self.sessions_length,
                                        dtype=tf.float32,
                                        scope="encoder")
        # [batch_size, length, embed_units]
        preference = tf.layers.dense(encoder_output,
                                     num_embed_units,
                                     name="pref_output")
        # [batch_size, length, rec_length, embed_units]
        self.candidate = tf.reshape(
            tf.gather_nd(self.embed, tf.expand_dims(self.rec_lists, 3)),
            [batch_size, encoder_length, rec_length, num_embed_units])
        # [batch_size, length, rec_length]: mean dot-product score of each
        # candidate against the preference vector.
        logits = tf.reduce_mean(
            tf.multiply(tf.expand_dims(preference, 2), self.candidate), 3)
        mul_prob = tf.nn.softmax(logits) * self.rec_mask
        # [batch_size, length, rec_length] — renormalized over valid slots.
        self.norm_prob = mul_prob / (
            tf.expand_dims(tf.reduce_sum(mul_prob, 2), 2) + 1e-20)
        # [batch_size, length, metric_num]
        _, self.argmax_index = tf.nn.top_k(self.norm_prob,
                                           k=FLAGS['metric'].value + 1)
        # Masked cross-entropy of the clicked item under norm_prob.
        local_predict_loss = tf.reduce_sum(
            -self.aims * tf.log(self.norm_prob + 1e-20), 2) * encoder_mask
        self.predict_loss = tf.reduce_sum(
            local_predict_loss) / tf.reduce_sum(encoder_mask)
        # [batch_size, length, embed_units]: embedding of the clicked item.
        aim_embed = tf.reduce_sum(
            tf.expand_dims(self.aims, 3) * self.candidate, 2)
        if FLAGS['use_simulated_data'].value:
            # Simulated data has no purchase signal — zero everything out.
            self.purchase_prob, local_purchase_loss, self.purchase_loss = tf.zeros(
                [batch_size, encoder_length, 2],
                dtype=tf.float32), tf.zeros([batch_size, encoder_length],
                                            dtype=tf.float32), tf.constant(
                                                0., dtype=tf.float32)
        else:
            # [batch_size, length, 2]; stop_gradient keeps the purchase
            # head from training the encoder.
            self.purchase_prob = tf.nn.softmax(
                tf.layers.dense(tf.multiply(
                    tf.layers.dense(tf.stop_gradient(encoder_output),
                                    num_units,
                                    name="purchase_layer"),
                    tf.layers.dense(tf.stop_gradient(aim_embed),
                                    num_units,
                                    name="purchase_aim")),
                                2,
                                name="purchase_projection"))
            # Positive (purchase=1) steps are up-weighted by 2^5.3.
            local_purchase_loss = tf.reduce_sum(
                -tf.one_hot(self.purchase, 2) *
                tf.log(self.purchase_prob + 1e-20),
                2) * encoder_mask * tf.pow(
                    tf.cast(self.purchase, tf.float32) + 1, 5.3)
            self.purchase_loss = tf.reduce_sum(
                local_purchase_loss) / tf.reduce_sum(encoder_mask)
        self.decoder_loss = self.predict_loss + self.purchase_loss
        # Per-step external scores for adversarial (score-weighted) training.
        self.score = tf.placeholder(tf.float32, (None, None))
        self.score_loss = tf.reduce_sum(
            self.score *
            (local_predict_loss +
             local_purchase_loss)) / tf.reduce_sum(encoder_mask)
    # Inference — same variables (reuse=True), last encoder step only.
    with tf.variable_scope("env", reuse=True):
        # tf.get_variable_scope().reuse_variables()
        # [batch_size, 1, embed_units]
        inf_preference = tf.expand_dims(
            tf.layers.dense(encoder_output[:, -1, :],
                            num_embed_units,
                            name="pref_output"), 1)
        # [batch_size, 1, rec_length, embed_units]
        self.inf_candidate = tf.reshape(
            tf.gather_nd(self.embed, tf.expand_dims(self.rec_lists, 3)),
            [batch_size, 1, rec_length, num_embed_units])
        # [batch_size, 1, rec_length]
        inf_logits = tf.reduce_mean(
            tf.multiply(tf.expand_dims(inf_preference, 2),
                        self.inf_candidate), 3)
        inf_mul_prob = tf.nn.softmax(inf_logits) * self.rec_mask
        self.inf_norm_prob = inf_mul_prob / (
            tf.expand_dims(tf.reduce_sum(inf_mul_prob, 2), 2) + 1e-20)
        # [batch_size, 1, metric_num]
        _, self.inf_argmax_index = tf.nn.top_k(self.inf_norm_prob,
                                               k=FLAGS['metric'].value)
        _, self.inf_all_argmax_index = tf.nn.top_k(
            self.inf_norm_prob, k=tf.shape(self.inf_norm_prob)[-1])

        def gumbel_max(inp, alpha, beta):
            """Gumbel-perturbed softmax for stochastic ranking."""
            # assert len(tf.shape(inp)) == 2
            g = tf.random_uniform(tf.shape(inp), 0.0001, 0.9999)
            g = -tf.log(-tf.log(g))
            inp_g = tf.nn.softmax(
                (tf.nn.log_softmax(inp / 1.0) + g * alpha) * beta)
            return inp_g

        # [batch_size, action_num] — stochastic top-k via Gumbel noise.
        _, self.inf_random_index = tf.nn.top_k(gumbel_max(
            tf.log(self.inf_norm_prob + 1e-20), 1, 1),
                                               k=FLAGS['metric'].value)
        _, self.inf_all_random_index = tf.nn.top_k(
            gumbel_max(tf.log(self.inf_norm_prob + 1e-20), 1, 1),
            k=tf.shape(self.inf_norm_prob)[-1])
        # Embedding of the top-ranked candidate at inference time.
        inf_aim_embed = tf.reduce_sum(
            tf.cast(
                tf.reshape(
                    tf.one_hot(self.inf_argmax_index[:, :, 0], rec_length),
                    [batch_size, 1, rec_length, 1]), tf.float32) *
            self.inf_candidate, 2)
        if FLAGS['use_simulated_data'].value:
            self.inf_purchase_prob = tf.zeros([batch_size, 1, 2],
                                              dtype=tf.float32)
        else:
            # [batch_size, 1, 2]
            self.inf_purchase_prob = tf.nn.softmax(
                tf.layers.dense(tf.multiply(
                    tf.layers.dense(tf.stop_gradient(encoder_output),
                                    num_units,
                                    name="purchase_layer"),
                    tf.layers.dense(tf.stop_gradient(inf_aim_embed),
                                    num_units,
                                    name="purchase_aim")),
                                2,
                                name="purchase_projection"))
    self.learning_rate = tf.Variable(float(learning_rate),
                                     trainable=False,
                                     dtype=tf.float32)
    self.learning_rate_decay_op = self.learning_rate.assign(
        self.learning_rate * learning_rate_decay_factor)
    self.global_step = tf.Variable(0, trainable=False)
    opt = tf.train.AdamOptimizer(self.learning_rate)
    self.params = tf.trainable_variables()
    # For pretraining
    gradients = tf.gradients(self.decoder_loss, self.params)
    clipped_gradients, self.gradient_norm = tf.clip_by_global_norm(
        gradients, max_gradient_norm)
    self.update = opt.apply_gradients(zip(clipped_gradients, self.params),
                                      global_step=self.global_step)
    # For adversarial training
    score_gradients = tf.gradients(self.score_loss, self.params)
    score_clipped_gradients, self.score_gradient_norm = tf.clip_by_global_norm(
        score_gradients, max_gradient_norm)
    self.score_update = opt.apply_gradients(zip(score_clipped_gradients,
                                                self.params),
                                            global_step=self.global_step)
    self.saver = tf.train.Saver(tf.global_variables(),
                                write_version=tf.train.SaverDef.V2,
                                max_to_keep=100,
                                pad_step_number=True,
                                keep_checkpoint_every_n_hours=1.0)
def __init__(self,
             num_symbols,
             num_embed_units,
             num_units,
             num_labels,
             batch_size,
             embed,
             learning_rate=0.001,
             max_gradient_norm=5.0,
             learning_rate_decay_factor=0.9):
    """Sentence-pair classifier with a recurrent attention loop.

    Both sentences are encoded with a shared LSTM (lstm_s); a second LSTM
    (lstm_r) is stepped self.max_length times via tf.while_loop, each step
    delegating to self.attention(...). The final attention state is
    projected to num_labels logits and trained with gradient-clipped SGD.
    """
    # todo: implement placeholders
    self.texts1 = tf.placeholder(tf.string, [batch_size, None],
                                 name='texts1')
    self.texts2 = tf.placeholder(tf.string, [batch_size, None],
                                 name='texts2')  # shape: batch*len
    self.texts_length1 = tf.placeholder(
        tf.int32, [batch_size], name='texts_length1')  # shape: batch
    self.texts_length2 = tf.placeholder(tf.int32, [batch_size],
                                        name='texts_length2')
    self.max_length = tf.placeholder(tf.int32, name='max_length')
    self.labels = tf.placeholder(tf.int64, [batch_size],
                                 name='labels')  # shape: batch
    self.keep_prob = tf.placeholder(tf.float32, name='keep_prob')
    self.embed_units = num_embed_units
    self.num_units = num_units
    self.batch_size = batch_size
    self._initializer = tf.truncated_normal_initializer(stddev=0.1)
    self.symbol2index = MutableHashTable(key_dtype=tf.string,
                                         value_dtype=tf.int64,
                                         default_value=UNK_ID,
                                         shared_name="in_table",
                                         name="in_table",
                                         checkpoint=True)
    # build the vocab table (string to index)
    # initialize the training process
    self.learning_rate = tf.Variable(float(learning_rate),
                                     trainable=False,
                                     dtype=tf.float32)
    self.global_step = tf.Variable(0, trainable=False)
    self.epoch = tf.Variable(0, trainable=False)
    self.epoch_add_op = self.epoch.assign(self.epoch + 1)
    self.learning_rate_decay_op = self.learning_rate.assign(
        self.learning_rate * learning_rate_decay_factor)
    self.index_input1 = self.symbol2index.lookup(self.texts1)  # batch*len
    self.index_input2 = self.symbol2index.lookup(self.texts2)
    # Element-wise max of the two lengths; mask covers the longer sentence.
    self.long_length = tf.maximum(self.texts_length1, self.texts_length2)
    print self.long_length.get_shape()
    self.mask_table = tf.sequence_mask(self.long_length, dtype=tf.float32)
    # build the embedding table (index to vector)
    if embed is None:
        # initialize the embedding randomly
        self.embed = tf.get_variable('embed',
                                     [num_symbols, num_embed_units],
                                     tf.float32)
    else:
        # initialize the embedding by pre-trained word vectors
        self.embed = tf.get_variable('embed',
                                     dtype=tf.float32,
                                     initializer=embed)
    self.embed_input1 = tf.nn.embedding_lookup(
        self.embed, self.index_input1)  # batch*len*embed_unit
    self.embed_input2 = tf.nn.embedding_lookup(self.embed,
                                               self.index_input2)
    with tf.variable_scope('lstm_s'):
        self.lstm_s = tf.contrib.rnn.LSTMCell(
            num_units=num_units,
            initializer=tf.orthogonal_initializer,
            forget_bias=0)
    with tf.variable_scope('lstm_r'):
        self.lstm_r = tf.contrib.rnn.LSTMCell(
            num_units=num_units,
            initializer=tf.orthogonal_initializer,
            forget_bias=0)
    # Shared encoder (same scope 'rnn') over both sentences.
    out_s1, state_s1 = dynamic_rnn(self.lstm_s,
                                   self.embed_input1,
                                   self.texts_length1,
                                   dtype=tf.float32,
                                   scope='rnn')
    out_s2, state_s2 = dynamic_rnn(self.lstm_s,
                                   self.embed_input2,
                                   self.texts_length2,
                                   dtype=tf.float32,
                                   scope='rnn')
    self.h_s1 = out_s1
    self.h_s2 = out_s2
    reshaped_s1 = tf.reshape(self.h_s1, [-1, self.num_units])
    reshaped_s2 = tf.reshape(self.h_s2, [-1, self.num_units])
    with tf.variable_scope('Attn_'):
        # Shared projection of both sentences' hidden states, then
        # reshaped to len x units x batch for the attention loop.
        W_s = tf.get_variable(shape=[self.num_units, self.num_units],
                              initializer=self._initializer,
                              name='W_s')
        self.s_1 = tf.matmul(reshaped_s1, W_s)
        self.s_2 = tf.matmul(reshaped_s2, W_s)
        self.s_1 = tf.transpose(
            tf.reshape(self.s_1, [self.batch_size, -1, self.num_units]),
            [1, 2, 0])
        self.s_2 = tf.transpose(
            tf.reshape(self.s_2, [self.batch_size, -1, self.num_units]),
            [1, 2, 0])
    # Step the attention LSTM max_length times via while_loop; the body
    # delegates to self.attention (defined elsewhere on this class).
    i = tf.constant(0)
    state_r = self.lstm_r.zero_state(batch_size=batch_size,
                                     dtype=tf.float32)

    def c(t, sr):
        # Loop while t < max_length.
        return tf.less(t, self.max_length)

    def b(t, sr):
        return self.attention(t, sr)

    i, state_r = tf.while_loop(cond=c, body=b, loop_vars=(i, state_r))
    with tf.variable_scope('fully_connect'):
        w_fc = tf.get_variable(shape=[self.num_units, num_labels],
                               initializer=self._initializer,
                               name='w_fc')
        b_fc = tf.get_variable(shape=[num_labels],
                               initializer=self._initializer,
                               name='b_fc')
        logits = tf.matmul(state_r.h, w_fc) + b_fc
    #logits = tf.layers.dense(outputs, num_labels)
    # todo: implement unfinished networks
    self.loss = tf.reduce_sum(
        tf.nn.sparse_softmax_cross_entropy_with_logits(labels=self.labels,
                                                       logits=logits),
        name='loss')
    mean_loss = self.loss / \
        tf.cast(tf.shape(self.labels)[0], dtype=tf.float32)
    predict_labels = tf.argmax(logits, 1, 'predict_labels')
    # NOTE: this is the COUNT of correct predictions, not a ratio.
    self.accuracy = tf.reduce_sum(tf.cast(
        tf.equal(self.labels, predict_labels), tf.int64),
                                  name='accuracy')
    self.params = tf.trainable_variables()
    # calculate the gradient of parameters
    for item in tf.global_variables():
        print item
    opt = tf.train.GradientDescentOptimizer(self.learning_rate)
    gradients = tf.gradients(mean_loss, self.params)
    clipped_gradients, self.gradient_norm = tf.clip_by_global_norm(
        gradients, max_gradient_norm)
    self.update = opt.apply_gradients(zip(clipped_gradients, self.params),
                                      global_step=self.global_step)
    #self.train_op = tf.train.AdamOptimizer(self.learning_rate).minimize(mean_loss, global_step=self.global_step,
    #var_list=self.params)
    tf.summary.scalar('loss/step', self.loss)
    for each in tf.trainable_variables():
        tf.summary.histogram(each.name, each)
    self.merged_summary_op = tf.summary.merge_all()
    self.saver = tf.train.Saver(write_version=tf.train.SaverDef.V2,
                                max_to_keep=3,
                                pad_step_number=True,
                                keep_checkpoint_every_n_hours=1.0)
def __init__(self, data, args, embed):
    """Knowledge-grounded hierarchical seq2seq dialogue model.

    Posts are encoded per-turn with a GRU, turn states are summarized by a
    context GRU, a knowledge triple is selected by attention over averaged
    key/value embeddings, and an attention-wrapped GRU decoder (Bahdanau
    attention over the previous post) generates the response. Builds both
    a teacher-forced training decoder and a greedy inference decoder, plus
    the Adam update and checkpoint savers.
    """
    # --- placeholders -----------------------------------------------------
    self.init_states = tf.placeholder(tf.float32, (None, args.ch_size),
                                      'ctx_inps')  # batch*ch_size
    self.posts = tf.placeholder(tf.int32, (None, None),
                                'enc_inps')  # batch*len
    self.posts_length = tf.placeholder(tf.int32, (None, ),
                                       'enc_lens')  # batch
    self.prev_posts = tf.placeholder(tf.int32, (None, None),
                                     'enc_prev_inps')
    self.prev_posts_length = tf.placeholder(tf.int32, (None, ),
                                            'enc_prev_lens')
    self.kgs = tf.placeholder(tf.int32, (None, None, None),
                              'kg_inps')  # batch*len
    self.kgs_h_length = tf.placeholder(tf.int32, (None, None),
                                       'kg_h_lens')  # batch
    self.kgs_hr_length = tf.placeholder(tf.int32, (None, None),
                                        'kg_hr_lens')  # batch
    self.kgs_hrt_length = tf.placeholder(tf.int32, (None, None),
                                         'kg_hrt_lens')  # batch
    self.kgs_index = tf.placeholder(tf.float32, (None, None),
                                    'kg_indices')  # batch
    self.origin_responses = tf.placeholder(tf.int32, (None, None),
                                           'dec_inps')  # batch*len
    self.origin_responses_length = tf.placeholder(tf.int32, (None, ),
                                                  'dec_lens')  # batch
    self.context_length = tf.placeholder(tf.int32, (None, ), 'ctx_lens')
    self.is_train = tf.placeholder(tf.bool)
    # How many past turns each response has (posts are flattened per turn).
    num_past_turns = tf.shape(self.posts)[0] // tf.shape(
        self.origin_responses)[0]
    # deal with original data to adapt encoder and decoder
    batch_size, decoder_len = tf.shape(self.origin_responses)[0], tf.shape(
        self.origin_responses)[1]
    self.responses = tf.split(self.origin_responses,
                              [1, decoder_len - 1], 1)[1]  # no go_id
    self.responses_length = self.origin_responses_length - 1
    self.responses_input = tf.split(self.origin_responses,
                                    [decoder_len - 1, 1], 1)[0]  # no eos_id
    self.responses_target = self.responses
    decoder_len = decoder_len - 1
    self.posts_input = self.posts  # batch*len
    # 1.0 for valid decoder steps, 0.0 after the response ends.
    self.decoder_mask = tf.reshape(
        tf.cumsum(tf.one_hot(self.responses_length - 1, decoder_len),
                  reverse=True,
                  axis=1), [-1, decoder_len])
    # Masks over the flattened (h, r, t) triple tokens: h only, h+r, h+r+t.
    kg_len = tf.shape(self.kgs)[2]
    kg_h_mask = tf.reshape(
        tf.cumsum(tf.one_hot(self.kgs_h_length - 1, kg_len),
                  reverse=True,
                  axis=2), [batch_size, -1, kg_len, 1])
    kg_hr_mask = tf.reshape(
        tf.cumsum(tf.one_hot(self.kgs_hr_length - 1, kg_len),
                  reverse=True,
                  axis=2), [batch_size, -1, kg_len, 1])
    kg_hrt_mask = tf.reshape(
        tf.cumsum(tf.one_hot(self.kgs_hrt_length - 1, kg_len),
                  reverse=True,
                  axis=2), [batch_size, -1, kg_len, 1])
    # Key = head+relation tokens; value = tail tokens only.
    kg_key_mask = kg_hr_mask
    kg_value_mask = kg_hrt_mask - kg_hr_mask
    # initialize the training process
    self.learning_rate = tf.Variable(float(args.lr),
                                     trainable=False,
                                     dtype=tf.float32)
    self.learning_rate_decay_op = self.learning_rate.assign(
        self.learning_rate * args.lr_decay)
    self.global_step = tf.Variable(0, trainable=False)
    # build the embedding table and embedding input
    if embed is None:
        # initialize the embedding randomly
        self.embed = tf.get_variable(
            'embed', [data.vocab_size, args.embedding_size], tf.float32)
    else:
        # initialize the embedding by pre-trained word vectors
        self.embed = tf.get_variable('embed',
                                     dtype=tf.float32,
                                     initializer=embed)
    self.encoder_input = tf.nn.embedding_lookup(self.embed, self.posts)
    self.decoder_input = tf.nn.embedding_lookup(self.embed,
                                                self.responses_input)
    self.kg_input = tf.nn.embedding_lookup(self.embed, self.kgs)
    #self.knowledge_max = tf.reduce_max(tf.where(tf.cast(tf.tile(knowledge_mask, [1, 1, args.embedding_size]), tf.bool), self.knowledge_input, -mask_value), axis=1)
    #self.knowledge_min = tf.reduce_max(tf.where(tf.cast(tf.tile(knowledge_mask, [1, 1, args.embedding_size]), tf.bool), self.knowledge_input, mask_value), axis=1)
    # Mean token embedding of each triple's key / value span (guarding
    # against division by zero for empty spans).
    self.kg_key_avg = tf.reduce_sum(
        self.kg_input * kg_key_mask, axis=2) / tf.maximum(
            tf.reduce_sum(kg_key_mask, axis=2),
            tf.ones_like(tf.expand_dims(self.kgs_hrt_length, -1),
                         dtype=tf.float32))
    self.kg_value_avg = tf.reduce_sum(
        self.kg_input * kg_value_mask, axis=2) / tf.maximum(
            tf.reduce_sum(kg_value_mask, axis=2),
            tf.ones_like(tf.expand_dims(self.kgs_hrt_length, -1),
                         dtype=tf.float32))
    #self.encoder_input = tf.cond(self.is_train,
    #    lambda: tf.nn.dropout(tf.nn.embedding_lookup(self.embed, self.posts_input), 0.8),
    #    lambda: tf.nn.embedding_lookup(self.embed, self.posts_input))  # batch*len*unit
    #self.decoder_input = tf.cond(self.is_train,
    #    lambda: tf.nn.dropout(tf.nn.embedding_lookup(self.embed, self.responses_input), 0.8),
    #    lambda: tf.nn.embedding_lookup(self.embed, self.responses_input))
    # build rnn_cell
    cell_enc = tf.nn.rnn_cell.GRUCell(args.eh_size)
    cell_ctx = tf.nn.rnn_cell.GRUCell(args.ch_size)
    cell_dec = tf.nn.rnn_cell.GRUCell(args.dh_size)
    # build encoder
    with tf.variable_scope('encoder'):
        encoder_output, encoder_state = dynamic_rnn(cell_enc,
                                                    self.encoder_input,
                                                    self.posts_length,
                                                    dtype=tf.float32,
                                                    scope="encoder_rnn")
    # Re-run the same encoder (shared weights) over the previous post only;
    # its outputs become the decoder's attention memory.
    with tf.variable_scope('encoder', reuse=tf.AUTO_REUSE):
        prev_output, _ = dynamic_rnn(cell_enc,
                                     tf.nn.embedding_lookup(
                                         self.embed, self.prev_posts),
                                     self.prev_posts_length,
                                     dtype=tf.float32,
                                     scope="encoder_rnn")
    # Context GRU over the per-turn encoder states.
    with tf.variable_scope('context'):
        encoder_state_reshape = tf.reshape(
            encoder_state, [-1, num_past_turns, args.eh_size])
        _, self.context_state = dynamic_rnn(cell_ctx,
                                            encoder_state_reshape,
                                            self.context_length,
                                            dtype=tf.float32,
                                            scope='context_rnn')
    # get output projection function
    output_fn = MyDense(data.vocab_size, use_bias=True)
    sampled_sequence_loss = output_projection_layer(
        args.dh_size, data.vocab_size, args.softmax_samples)
    # construct attention
    '''
    encoder_len = tf.shape(encoder_output)[1]
    attention_memory = tf.reshape(encoder_output, [batch_size, -1, args.eh_size])
    attention_mask = tf.reshape(tf.sequence_mask(self.posts_length, encoder_len), [batch_size, -1])
    attention_mask = tf.concat([tf.ones([batch_size, 1], tf.bool), attention_mask[:,1:]], axis=1)
    attn_mechanism = MyAttention(args.dh_size, attention_memory, attention_mask)
    '''
    attn_mechanism = tf.contrib.seq2seq.BahdanauAttention(
        args.dh_size,
        prev_output,
        memory_sequence_length=tf.maximum(self.prev_posts_length, 1))
    cell_dec_attn = tf.contrib.seq2seq.AttentionWrapper(
        cell_dec, attn_mechanism, attention_layer_size=args.dh_size)
    # Project the context state into the decoder's state space.
    ctx_state_shaping = tf.layers.dense(self.context_state,
                                        args.dh_size,
                                        activation=None)
    dec_start = cell_dec_attn.zero_state(
        batch_size, dtype=tf.float32).clone(cell_state=ctx_state_shaping)
    # calculate kg embedding
    with tf.variable_scope('knowledge'):
        # Attention query from the dialogue context.
        query = tf.reshape(
            tf.layers.dense(tf.concat(self.context_state, axis=-1),
                            args.embedding_size,
                            use_bias=False),
            [batch_size, 1, args.embedding_size])
        kg_score = tf.reduce_sum(query * self.kg_key_avg, axis=2)
        # Mask out padded (empty) triples with -inf before softmax.
        kg_score = tf.where(tf.greater(self.kgs_hrt_length, 0), kg_score,
                            -tf.ones_like(kg_score) * np.inf)
        kg_alignment = tf.nn.softmax(kg_score)
        kg_max = tf.argmax(kg_alignment, axis=-1)
        kg_max_onehot = tf.one_hot(kg_max,
                                   tf.shape(kg_alignment)[1],
                                   dtype=tf.float32)
        # Fraction of examples where the argmax triple is a gold triple.
        self.kg_acc = tf.reduce_sum(
            kg_max_onehot * self.kgs_index) / tf.maximum(
                tf.reduce_sum(tf.reduce_max(self.kgs_index, axis=-1)),
                tf.constant(1.0))
        self.kg_loss = tf.reduce_sum(
            -tf.log(tf.clip_by_value(kg_alignment, 1e-12, 1.0)) *
            self.kgs_index,
            axis=1) / tf.maximum(tf.reduce_sum(self.kgs_index, axis=1),
                                 tf.ones([batch_size], dtype=tf.float32))
        self.kg_loss = tf.reduce_mean(self.kg_loss)
        # NOTE(review): kg_num_mask is not defined anywhere in this method
        # (only kg_h/hr/hrt masks are) — confirm it is a module-level name
        # or this line raises NameError at graph-build time.
        self.knowledge_embed = tf.reduce_sum(
            tf.expand_dims(kg_alignment, axis=-1) * self.kg_value_avg *
            tf.cast(kg_num_mask, tf.float32),
            axis=1)
        #self.knowledge_embed = tf.Print(self.knowledge_embed, ['acc', self.kg_acc, 'loss', self.kg_loss])
    # Append the selected knowledge vector to every decoder input step.
    knowledge_embed_extend = tf.tile(
        tf.expand_dims(self.knowledge_embed, axis=1), [1, decoder_len, 1])
    self.decoder_input = tf.concat(
        [self.decoder_input, knowledge_embed_extend], axis=2)
    # construct helper
    train_helper = tf.contrib.seq2seq.TrainingHelper(
        self.decoder_input, tf.maximum(self.responses_length, 1))
    infer_helper = MyInferenceHelper(self.embed,
                                     tf.fill([batch_size], data.go_id),
                                     data.eos_id, self.knowledge_embed)
    #infer_helper = tf.contrib.seq2seq.GreedyEmbeddingHelper(self.embed, tf.fill([batch_size], data.go_id), data.eos_id)
    # build decoder (train)
    with tf.variable_scope('decoder'):
        decoder_train = tf.contrib.seq2seq.BasicDecoder(
            cell_dec_attn, train_helper, dec_start)
        train_outputs, _, _ = tf.contrib.seq2seq.dynamic_decode(
            decoder_train, impute_finished=True, scope="decoder_rnn")
        self.decoder_output = train_outputs.rnn_output
        #self.decoder_output = tf.nn.dropout(self.decoder_output, 0.8)
        self.decoder_distribution_teacher, self.decoder_loss, self.decoder_all_loss = \
            sampled_sequence_loss(self.decoder_output,
                                  self.responses_target, self.decoder_mask)
    # build decoder (test)
    with tf.variable_scope('decoder', reuse=True):
        decoder_infer = tf.contrib.seq2seq.BasicDecoder(
            cell_dec_attn, infer_helper, dec_start, output_layer=output_fn)
        infer_outputs, _, _ = tf.contrib.seq2seq.dynamic_decode(
            decoder_infer,
            impute_finished=True,
            maximum_iterations=args.max_sent_length,
            scope="decoder_rnn")
        self.decoder_distribution = infer_outputs.rnn_output
        self.generation_index = tf.argmax(
            tf.split(self.decoder_distribution,
                     [2, data.vocab_size - 2], 2)[1],
            2) + 2  # for removing UNK
    # calculate the gradient of parameters and update
    self.params = [
        k for k in tf.trainable_variables() if args.name in k.name
    ]
    opt = tf.train.AdamOptimizer(self.learning_rate)
    self.loss = self.decoder_loss + self.kg_loss
    gradients = tf.gradients(self.loss, self.params)
    clipped_gradients, self.gradient_norm = tf.clip_by_global_norm(
        gradients, args.grad_clip)
    self.update = opt.apply_gradients(zip(clipped_gradients, self.params),
                                      global_step=self.global_step)
    # save checkpoint
    self.latest_saver = tf.train.Saver(
        write_version=tf.train.SaverDef.V2,
        max_to_keep=args.checkpoint_max_to_keep,
        pad_step_number=True,
        keep_checkpoint_every_n_hours=1.0)
    self.best_saver = tf.train.Saver(write_version=tf.train.SaverDef.V2,
                                     max_to_keep=1,
                                     pad_step_number=True,
                                     keep_checkpoint_every_n_hours=1.0)
    # create summary for tensorboard
    self.create_summary(args)
def __init__(self, num_lstm_units, embed, neg_num=4, gradient_clip_threshold=5.0):
    """Build a DSSM-style query/doc matching graph.

    A query LSTM and a doc LSTM encode token strings to fixed vectors;
    cosine similarity scores one positive doc against `neg_num` negatives,
    and a softmax-over-docs cross-entropy loss is minimised with Nesterov
    momentum. docs[0] is treated as the positive doc (see `hit_prob`).

    Args:
        num_lstm_units: hidden size of both LSTM encoders.
        embed: pre-trained embedding matrix used to initialise the `embed`
            variable (shape is taken from this initializer).
        neg_num: number of negative docs per query.
        gradient_clip_threshold: global-norm gradient clipping bound.
    """
    # Raw string inputs; token->id lookup happens in-graph via word2index.
    self.queries = tf.placeholder(dtype=tf.string,
                                  shape=[None, None])  # shape: batch*len
    self.queries_length = tf.placeholder(dtype=tf.int32,
                                         shape=[None])  # shape: batch
    self.docs = tf.placeholder(dtype=tf.string,
                               shape=[neg_num + 1, None, None
                                      ])  # shape: (neg_num + 1)*batch*len
    self.docs_length = tf.placeholder(
        dtype=tf.int32,
        shape=[neg_num + 1, None])  # shape: (neg_num + 1)*batch
    # In-graph vocabulary table; unknown tokens map to UNK_ID.
    self.word2index = MutableHashTable(key_dtype=tf.string,
                                       value_dtype=tf.int64,
                                       default_value=UNK_ID,
                                       shared_name="in_table",
                                       name="in_table",
                                       checkpoint=True)
    # Non-trainable bookkeeping variables (lr/step/epoch/momentum).
    self.learning_rate = tf.Variable(0.001, trainable=False, dtype=tf.float32)
    self.global_step = tf.Variable(0, trainable=False)
    self.epoch = tf.Variable(0, trainable=False)
    self.epoch_add_op = self.epoch.assign(self.epoch + 1)
    self.momentum = tf.Variable(0.9, trainable=False, dtype=tf.float32)

    self.index_queries = self.word2index.lookup(self.queries)  # batch*len
    # One id tensor per doc slot (positive + neg_num negatives).
    self.index_docs = [
        self.word2index.lookup(doc) for doc in tf.unstack(self.docs)
    ]
    self.embed = tf.get_variable('embed', dtype=tf.float32, initializer=embed)
    self.embed_queries = tf.nn.embedding_lookup(self.embed, self.index_queries)
    self.embed_docs = [
        tf.nn.embedding_lookup(self.embed, index_doc)
        for index_doc in self.index_docs
    ]

    # Separate variable scopes give the two encoders独立 weights is NOT the
    # case here: scopes only name the cells; each cell is reused across all
    # docs via the shared `scope=` argument below.
    with tf.variable_scope('query_lstm'):
        self.cell_q = SimpleLSTMCell(num_lstm_units)
    with tf.variable_scope('doc_lstm'):
        self.cell_d = SimpleLSTMCell(num_lstm_units)

    # [1][1] selects the hidden part (h) of the final LSTM state tuple.
    self.states_q = dynamic_rnn(
        self.cell_q,
        self.embed_queries,
        self.queries_length,
        dtype=tf.float32,
        scope="simple_lstm_cell_query")[1][1]  # shape: batch*num_units
    # All doc slots run through the SAME doc cell/scope (weight sharing).
    self.states_d = [
        dynamic_rnn(self.cell_d,
                    self.embed_docs[i],
                    self.docs_length[i],
                    dtype=tf.float32,
                    scope="simple_lstm_cell_doc")[1][1]
        for i in range(neg_num + 1)
    ]  # shape: (neg_num + 1)*batch*num_units

    # Cosine similarity = dot product / (norm_q * norm_d), per doc slot.
    self.queries_norm = tf.sqrt(
        tf.reduce_sum(tf.square(self.states_q), axis=1))
    self.docs_norm = [
        tf.sqrt(tf.reduce_sum(tf.square(self.states_d[i]), axis=1))
        for i in range(neg_num + 1)
    ]
    self.prods = [
        tf.reduce_sum(tf.multiply(self.states_q, self.states_d[i]), axis=1)
        for i in range(neg_num + 1)
    ]
    self.sims = [(self.prods[i] / (self.queries_norm * self.docs_norm[i]))
                 for i in range(neg_num + 1)]  # shape: (neg_num + 1)*batch
    self.sims = tf.convert_to_tensor(self.sims)
    # Learned temperature on the similarities, scaling factor per the paper.
    self.gamma = tf.Variable(
        initial_value=1.0, expected_shape=[],
        dtype=tf.float32)  # scaling factor according to the paper
    self.sims = self.sims * self.gamma
    # Softmax across the (neg_num + 1) doc slots (dim=0), per query.
    self.prob = tf.nn.softmax(self.sims, dim=0)  # shape: (neg_num + 1)*batch
    # Probability mass on slot 0 — the positive doc.
    self.hit_prob = tf.transpose(self.prob[0])
    self.loss = -tf.reduce_mean(tf.log(self.hit_prob))

    self.params = tf.trainable_variables()
    opt = tf.train.MomentumOptimizer(
        learning_rate=self.learning_rate,
        momentum=self.momentum,
        use_nesterov=True)  # use Nesterov's method, according to the paper
    gradients = tf.gradients(self.loss, self.params)
    clipped_gradients, self.gradient_norm = tf.clip_by_global_norm(
        gradients, gradient_clip_threshold)
    self.update = opt.apply_gradients(zip(clipped_gradients, self.params),
                                      global_step=self.global_step)
    self.saver = tf.train.Saver(write_version=tf.train.SaverDef.V2,
                                max_to_keep=3,
                                pad_step_number=True,
                                keep_checkpoint_every_n_hours=1.0)
def encode(self, question_embeddings, context_embeddings, question_mask, context_mask, encoder_state_input, dropout_keep_prob, batch_size):
    """
    Encode the question and the context paragraph with two dropout-wrapped
    LSTMs and return the hidden state at the last *valid* timestep of each.

    :param question_embeddings: embedded question tokens [batch, q_len, dim]
        -- shape assumed from dynamic_rnn usage, TODO confirm
    :param context_embeddings: embedded paragraph tokens [batch, p_len, dim]
    :param question_mask: 0/1 (or bool) mask; row sums give true question lengths
    :param context_mask: 0/1 (or bool) mask; row sums give true paragraph lengths
    :param encoder_state_input: optional initial hidden state (unused here)
    :param dropout_keep_prob: output keep probability for both LSTM wrappers
    :param batch_size: number of examples in the batch, used to build the
        per-example last-timestep gather indices
    :return: (h_q, h_p) -- question and paragraph encodings, each
        [batch, state_size]
    """
    with vs.variable_scope("encoder", True):
        # Encode question
        with vs.variable_scope("question", True):
            lstm_cell = tf.nn.rnn_cell.LSTMCell(self.state_size)
            lstm_cell = tf.nn.rnn_cell.DropoutWrapper(
                lstm_cell, output_keep_prob=dropout_keep_prob)
            question_length = tf.reduce_sum(tf.cast(question_mask, tf.int32),
                                            reduction_indices=1)
            print("Question length: ", question_length)
            self.H_q, _ = dynamic_rnn(lstm_cell,
                                      question_embeddings,
                                      sequence_length=question_length,
                                      time_major=False,
                                      dtype=tf.float64,
                                      swap_memory=True)
            # BUG FIX: previous code took self.H_q[:, 1, :] -- the hidden
            # state of the *second* timestep. Gather the state at the last
            # valid timestep (question_length - 1) of each example instead,
            # matching the masked lengths fed to dynamic_rnn above.
            last_h_q_indices = tf.stack(
                [tf.range(batch_size), question_length - 1], axis=1)
            self.h_q = tf.gather_nd(self.H_q, last_h_q_indices)
            print("H_q: ", self.H_q)
            print("h_q: ", self.h_q)
        with vs.variable_scope("context", True):
            # Encode context paragraph
            context_length = tf.reduce_sum(tf.cast(context_mask, tf.int32),
                                           reduction_indices=1)
            print("Context length: ", context_length)
            context_lstm_cell = tf.nn.rnn_cell.LSTMCell(self.state_size)
            context_lstm_cell = tf.nn.rnn_cell.DropoutWrapper(
                context_lstm_cell, output_keep_prob=dropout_keep_prob)
            self.H_p, _ = dynamic_rnn(context_lstm_cell,
                                      context_embeddings,
                                      sequence_length=context_length,
                                      time_major=False,
                                      dtype=tf.float64,
                                      swap_memory=True)
            # Same fix as above: last valid timestep, not index 1.
            last_h_p_indices = tf.stack(
                [tf.range(batch_size), context_length - 1], axis=1)
            self.h_p = tf.gather_nd(self.H_p, last_h_p_indices)
            print("H_p: ", self.H_p)
            print("h_p: ", self.h_p)
    return self.h_q, self.h_p
def __init__(self, num_symbols, num_embed_units, num_units, num_layers, num_labels, embed, learning_rate=0.005, max_gradient_norm=5.0):
    """Build an RNN text-classification graph.

    Strings are mapped to ids in-graph, embedded, run through a stacked
    BasicRNNCell, and the last layer's final state is projected to
    `num_labels` logits trained with sparse softmax cross-entropy and
    clipped SGD.

    Args:
        num_symbols: vocabulary size (embedding rows when `embed` is None).
        num_embed_units: embedding dimension.
        num_units: hidden units per RNN layer.
        num_layers: number of stacked RNN layers.
        num_labels: number of output classes.
        embed: optional pre-trained embedding matrix initializer.
        learning_rate: initial SGD learning rate.
        max_gradient_norm: global-norm gradient clipping bound.
    """
    self.texts = tf.placeholder(tf.string, (None, None),
                                'texts')  # shape: [batch, length]
    # Completed the placeholder TODOs: lengths are int32; labels are int64
    # so they compare directly with tf.argmax's int64 output below.
    self.texts_length = tf.placeholder(tf.int32, (None,),
                                       'texts_length')  # shape: [batch]
    self.labels = tf.placeholder(tf.int64, (None,),
                                 'labels')  # shape: [batch]

    # build the vocab table (string to index)
    self.symbol2index = MutableHashTable(
        key_dtype=tf.string,
        value_dtype=tf.int64,
        default_value=UNK_ID,
        shared_name="in_table",
        name="in_table",
        checkpoint=True)

    # initialize the training process
    self.learning_rate = tf.Variable(float(learning_rate),
                                     trainable=False, dtype=tf.float32)
    self.global_step = tf.Variable(0, trainable=False)

    self.index_input = self.symbol2index.lookup(self.texts)  # shape: [batch, length]

    # build the embedding table (index to vector)
    if embed is None:
        # initialize the embedding randomly
        self.embed = tf.get_variable('embed',
                                     [num_symbols, num_embed_units],
                                     tf.float32)
    else:
        # initialize the embedding by pre-trained word vectors
        self.embed = tf.get_variable('embed', dtype=tf.float32,
                                     initializer=embed)
    # Completed TODO: embed the looked-up token ids.
    self.embed_input = tf.nn.embedding_lookup(
        self.embed, self.index_input)  # shape: [batch, length, num_embed_units]

    cell = MultiRNNCell([BasicRNNCell(num_units) for _ in range(num_layers)])
    outputs, states = dynamic_rnn(cell, self.embed_input, self.texts_length,
                                  dtype=tf.float32, scope="rnn")
    # For MultiRNNCell, `states` is a tuple of per-layer final states;
    # use the top layer's state as the sentence vector. (If the cells are
    # swapped for LSTMCell, take states[-1].h instead.)
    vectors = states[-1]

    with tf.variable_scope('logits'):
        weight = tf.get_variable("weights", [num_units, num_labels])
        bias = tf.get_variable("biases", [num_labels])
        # Completed TODO: linear projection [batch, num_units] -> [batch, num_labels].
        logits = tf.matmul(vectors, weight) + bias

    self.loss = tf.reduce_sum(
        tf.nn.sparse_softmax_cross_entropy_with_logits(labels=self.labels,
                                                       logits=logits),
        name='loss')
    predict_labels = tf.argmax(logits, 1, 'predict_labels')
    # Sum (not mean) of correct predictions, as in the original skeleton.
    self.accuracy = tf.reduce_sum(tf.cast(
        tf.equal(self.labels, predict_labels), tf.int32), name='accuracy')
    self.params = tf.trainable_variables()

    # calculate the gradient of parameters
    opt = tf.train.GradientDescentOptimizer(self.learning_rate)
    gradients = tf.gradients(self.loss, self.params)
    clipped_gradients, self.gradient_norm = tf.clip_by_global_norm(
        gradients, max_gradient_norm)
    self.update = opt.apply_gradients(zip(clipped_gradients, self.params),
                                      global_step=self.global_step)

    self.saver = tf.train.Saver(write_version=tf.train.SaverDef.V2,
                                max_to_keep=5, pad_step_number=True)
def __init__(self, data, args, embed):
    """Build an LSTM language-model graph.

    The input sentence (with go/eos markers) is split into LM_input
    (drops the last token) and LM_target (drops the first token); an LSTM
    plus a dense projection produces per-token logits trained with masked
    sparse softmax cross-entropy and clipped momentum SGD.

    Args:
        data: dataset object; only `frequent_vocab_size` is read here.
        args: hyperparameter namespace (lr, lr_decay, dh_size, grad_clip,
            momentum, name, embedding_size, checkpoint_max_to_keep).
        embed: optional pre-trained embedding matrix initializer.
    """
    with tf.variable_scope("input"):
        with tf.variable_scope("embedding"):
            # build the embedding table and embedding input
            if embed is None:
                # initialize the embedding randomly
                self.embed = tf.get_variable(
                    'embed',
                    [data.frequent_vocab_size, args.embedding_size],
                    tf.float32)
            else:
                # initialize the embedding by pre-trained word vectors
                self.embed = tf.get_variable('embed',
                                             dtype=tf.float32,
                                             initializer=embed)

        # input
        self.sentence = tf.placeholder(tf.int32, (None, None),
                                       'sen_inps')  # batch*len
        self.sentence_length = tf.placeholder(tf.int32, (None, ),
                                              'sen_lens')  # batch
        self.use_prior = tf.placeholder(dtype=tf.bool, name="use_prior")
        batch_size, batch_len = tf.shape(self.sentence)[0], tf.shape(
            self.sentence)[1]
        # NOTE: attribute name keeps the original "scentence" spelling —
        # external code may reference it, so it is not renamed here.
        self.scentence_max_len = batch_len - 1

        # data processing: input drops the final (eos) token, target drops
        # the initial (go) token, so logits[t] predicts target[t].
        LM_input = tf.split(self.sentence, [self.scentence_max_len, 1],
                            1)[0]  # no eos_id
        self.LM_input = tf.nn.embedding_lookup(
            self.embed, LM_input)  # batch*(len-1)*unit
        self.LM_target = tf.split(self.sentence,
                                  [1, self.scentence_max_len],
                                  1)[1]  # no go_id, batch*(len-1)
        self.input_len = self.sentence_length - 1
        self.input_mask = tf.sequence_mask(
            self.input_len, self.scentence_max_len,
            dtype=tf.float32)  # 0 for <pad>, batch*(len-1)

    # initialize the training process
    self.learning_rate = tf.Variable(float(args.lr),
                                     trainable=False,
                                     dtype=tf.float32)
    self.learning_rate_decay_op = self.learning_rate.assign(
        self.learning_rate * args.lr_decay)
    self.global_step = tf.Variable(0, trainable=False)

    # build LSTM NN
    basic_cell = tf.nn.rnn_cell.LSTMCell(args.dh_size)
    with tf.variable_scope('rnnlm'):
        LM_output, _ = dynamic_rnn(basic_cell,
                                   self.LM_input,
                                   self.input_len,
                                   dtype=tf.float32,
                                   scope="rnnlm")
    # fullly connected layer projecting hidden states to vocab logits
    LM_output = tf.layers.dense(
        inputs=LM_output, units=data.frequent_vocab_size
    )  # shape of LM_output: (batch_size, batch_len-1, vocab_size)

    # loss
    with tf.variable_scope("loss", initializer=tf.orthogonal_initializer()):
        crossent = tf.nn.sparse_softmax_cross_entropy_with_logits(
            logits=LM_output, labels=self.LM_target)
        crossent = tf.reduce_sum(crossent *
                                 self.input_mask)  # to ignore <pad>s
        # per-sentence loss (sum over tokens, mean over batch)
        self.sen_loss = crossent / tf.to_float(batch_size)
        self.ppl_loss = crossent / tf.reduce_sum(
            self.input_mask)  # crossent per word.
        # self.ppl_loss = tf.Print(self.ppl_loss, [self.ppl_loss] )

    self.decoder_distribution_teacher = tf.nn.log_softmax(LM_output)
    with tf.variable_scope("decode", reuse=True):
        self.decoder_distribution = LM_output  # (batch_size, batch_len-1, vocab_size)
        # for inference: argmax over vocab excluding ids 0/1, then shift
        # the index back by 2 to recover the real vocabulary id.
        self.generation_index = tf.argmax(
            tf.split(self.decoder_distribution,
                     [2, data.frequent_vocab_size - 2], 2)[1],
            2) + 2  # for removing UNK. 0 for <pad> and 1 for <unk>

    self.loss = self.sen_loss

    # calculate the gradient of parameters and update; only variables whose
    # name contains args.name (this model's outer scope) are trained.
    self.params = [
        k for k in tf.trainable_variables() if args.name in k.name
    ]
    gradients = tf.gradients(self.loss, self.params)
    clipped_gradients, self.gradient_norm = tf.clip_by_global_norm(
        gradients, args.grad_clip)
    opt = tf.train.MomentumOptimizer(learning_rate=self.learning_rate,
                                     momentum=args.momentum)
    self.update = opt.apply_gradients(zip(clipped_gradients, self.params),
                                      global_step=self.global_step)

    # save checkpoint
    self.latest_saver = tf.train.Saver(
        write_version=tf.train.SaverDef.V2,
        max_to_keep=args.checkpoint_max_to_keep,
        pad_step_number=True,
        keep_checkpoint_every_n_hours=1.0)
    self.best_saver = tf.train.Saver(write_version=tf.train.SaverDef.V2,
                                     max_to_keep=1,
                                     pad_step_number=True,
                                     keep_checkpoint_every_n_hours=1.0)
    # create summary for tensorboard
    self.create_summary(args)
def __init__(self, num_items, num_embed_units, num_units, num_layers, vocab=None, embed=None, learning_rate=1e-4, learning_rate_decay_factor=0.95, beam_size=5, max_gradient_norm=5.0, num_samples=512, max_length=30, use_lstm=True):
    """Build a session-level discriminator graph.

    An RNN encodes an item-id session; per-step "preference" vectors are
    matched against projected candidate recommendation lists, and a
    masked, (optionally purchase-weighted) score is trained against a
    binary label with sigmoid cross-entropy and clipped Adam.

    Args:
        num_items: item vocabulary size (embedding rows when embed is None).
        num_embed_units: item embedding dimension.
        num_units: RNN hidden units per layer.
        num_layers: number of stacked RNN layers.
        vocab: unused here.
        embed: optional pre-trained item-embedding initializer.
        learning_rate / learning_rate_decay_factor: Adam lr and its decay.
        beam_size, num_samples, max_length: unused in this graph.
        max_gradient_norm: global-norm gradient clipping bound.
        use_lstm: LSTM cells if True, else GRU cells.
    """
    self.epoch = tf.Variable(0, trainable=False, name='dis/epoch')
    self.epoch_add_op = self.epoch.assign(self.epoch + 1)

    # Inputs: sessions are item-id sequences; rec_lists/rec_mask describe
    # the per-step candidate lists; aims_idx indexes the clicked candidate;
    # label is the per-session binary target.
    self.sessions_input = tf.placeholder(tf.int32, shape=(None, None))
    self.sessions_length = tf.placeholder(tf.int32, shape=(None))
    self.rec_lists = tf.placeholder(tf.int32, shape=(None, None, None))
    self.rec_mask = tf.placeholder(tf.float32, shape=(None, None, None))
    self.aims_idx = tf.placeholder(tf.int32, shape=(None, None))
    self.label = tf.placeholder(tf.int32, shape=(None))
    self.purchase = tf.placeholder(tf.int32, shape=(None, None))

    if embed is None:
        self.embed = tf.get_variable('dis/embed',
                                     [num_items, num_embed_units],
                                     tf.float32)
    else:
        self.embed = tf.get_variable('dis/embed',
                                     dtype=tf.float32,
                                     initializer=embed)

    encoder_length, rec_length = tf.shape(
        self.sessions_input)[1], tf.shape(self.rec_lists)[2]
    # 1 for the first (sessions_length - 1) steps, 0 afterwards: a reversed
    # cumsum of a one-hot at position length-2 marks the valid steps.
    encoder_mask = tf.reshape(
        tf.cumsum(tf.one_hot(self.sessions_length - 2, encoder_length),
                  reverse=True,
                  axis=1), [-1, encoder_length])

    self.encoder_input = tf.nn.embedding_lookup(
        self.embed, self.sessions_input)  # batch*len*unit

    if use_lstm:
        cell = MultiRNNCell(
            [LSTMCell(num_units) for _ in range(num_layers)])
    else:
        cell = MultiRNNCell(
            [GRUCell(num_units) for _ in range(num_layers)])

    # rnn encoder
    encoder_output, _ = dynamic_rnn(cell,
                                    self.encoder_input,
                                    self.sessions_length,
                                    dtype=tf.float32,
                                    scope="dis/encoder")
    # [batch_size, length, num_units]: per-step preference vector
    self.preference = tf.layers.dense(encoder_output,
                                      num_units,
                                      name="dis/out2preference")
    # [batch_size, length, rec_len, num_units]: projected candidates
    self.candidate = tf.layers.dense(tf.nn.embedding_lookup(
        self.embed, self.rec_lists),
                                     num_units,
                                     name="dis/rec2candidate")
    # [batch_size, length, rec_len]: preference/candidate dot products
    self.pre_mul_can = tf.reduce_sum(
        tf.expand_dims(self.preference, 2) * self.candidate, 3)
    # Soft (temperature 0.1) argmax over candidates, then expected embedding.
    self.max_embed = tf.reduce_sum(
        tf.expand_dims(tf.nn.softmax(self.pre_mul_can / 0.1), 3) *
        self.candidate, 2)
    # Embedding of the actually-clicked candidate (one-hot on aims_idx).
    self.aim_embed = tf.reduce_sum(
        tf.expand_dims(tf.one_hot(self.aims_idx, rec_length), 3) *
        self.candidate, 2)

    if FLAGS['use_simulated_data'].value:
        purchase_weight = tf.constant(1.0, dtype=tf.float32)
    else:
        # Learned affine re-weighting of steps that ended in a purchase.
        W_p = tf.get_variable("Wp", shape=(), dtype=tf.float32)
        b_p = tf.get_variable("bp", shape=(), dtype=tf.float32)
        purchase_weight = tf.cast(self.purchase, tf.float32) * W_p + b_p

    # Mean (over valid steps) of weighted agreement between the model's
    # soft pick and the actual pick -> per-session logit.
    self.logits = tf.reduce_sum(
        tf.reduce_sum(self.max_embed * self.aim_embed, 2) *
        purchase_weight * encoder_mask, 1) / tf.reduce_sum(encoder_mask, 1)
    self.prob = tf.nn.sigmoid(self.logits)
    self.decoder_loss = tf.reduce_mean(
        tf.nn.sigmoid_cross_entropy_with_logits(logits=self.logits,
                                                labels=tf.cast(
                                                    self.label, tf.float32)))
    self.acc = tf.reduce_mean(
        tf.cast(
            tf.equal(tf.cast(tf.greater(self.prob, 0.5), tf.int32),
                     self.label), tf.float32))

    self.params = tf.trainable_variables()
    self.learning_rate = tf.Variable(float(learning_rate),
                                     trainable=False,
                                     dtype=tf.float32)
    self.learning_rate_decay_op = self.learning_rate.assign(
        self.learning_rate * learning_rate_decay_factor)
    self.global_step = tf.Variable(0, trainable=False)

    gradients = tf.gradients(self.decoder_loss, self.params)
    clipped_gradients, self.gradient_norm = tf.clip_by_global_norm(
        gradients, max_gradient_norm)
    self.update = tf.train.AdamOptimizer(
        self.learning_rate).apply_gradients(zip(clipped_gradients,
                                                self.params),
                                            global_step=self.global_step)

    self.saver = tf.train.Saver(tf.global_variables(),
                                write_version=tf.train.SaverDef.V2,
                                max_to_keep=10,
                                pad_step_number=True,
                                keep_checkpoint_every_n_hours=1.0)
def __init__(self, input_producer, embed_mat, config, is_train):
    """Build a sentence-VAE graph (RNN encoder -> gaussian latent -> RNN decoder).

    The encoder's final hidden state is projected to mu/logvar; z is sampled
    with the reparameterization trick and used as the decoder's initial
    state. The loss is reconstruction (sequence_loss) plus KL, where the KL
    term is scaled by an externally-annealed `KL_weight`.

    Args:
        input_producer: provides x_enc/x_dec/y_dec, lengths, and vocab info.
        embed_mat: optional pre-trained embedding matrix (constant init).
        config: hyperparameters (batch_size, hidden_size, embed_dim, is_GRU,
            word_dropout_keep_prob, max_grad_norm, learning_rate, ...).
        is_train: unused in this graph construction.
    """
    with tf.variable_scope("VAE") as var_scope:
        x_enc = input_producer.x_enc
        x_dec = input_producer.x_dec
        y_dec = input_producer.y_dec
        len_enc = input_producer.len_enc
        len_dec = input_producer.len_dec
        max_len = input_producer.seq_max_length
        vocab_num = input_producer.vocab_num
        batch_size = config.batch_size
        hidden_size = config.hidden_size
        embed_dim = config.embed_dim
        is_GRU = config.is_GRU
        is_argmax_sampling = config.is_argmax_sampling
        word_keep_prob = config.word_dropout_keep_prob
        max_grad_norm = config.max_grad_norm
        learning_rate = config.learning_rate

        # BUG FIX: the original `tf.Variable(0.0, "KL_weight")` passed the
        # string as the positional `trainable` argument (truthy), making the
        # KL annealing weight a *trained* parameter — the optimizer would
        # drive it down to kill the KL term, fighting KL_weight_update below.
        # It must be non-trainable and updated only via the assign op.
        self.KL_weight = tf.Variable(0.0, trainable=False, name="KL_weight")
        self.input_ids = y_dec

        def _lstm_cell():
            return BasicLSTMCell(num_units=hidden_size,
                                 forget_bias=1.0,
                                 state_is_tuple=True,
                                 reuse=tf.get_variable_scope().reuse)

        def _gru_cell():
            return GRUCell(num_units=hidden_size,
                           reuse=tf.get_variable_scope().reuse)

        cell = _gru_cell if is_GRU else _lstm_cell
        self.initial_state = cell().zero_state(batch_size, tf.float32)

        # encoder
        with tf.device("/cpu:0"):
            embed_init = tf.constant_initializer(embed_mat) \
                if (embed_mat is not None) else None
            embedding = tf.get_variable("embedding", [vocab_num, embed_dim],
                                        initializer=embed_init,
                                        trainable=True)
            in_enc = embedding_lookup(embedding, x_enc)

        with tf.variable_scope("encoder"):
            out_tuple = dynamic_rnn(cell=cell(),
                                    inputs=in_enc,
                                    sequence_length=len_enc,
                                    initial_state=self.initial_state)
            (_, encoder_hidden) = out_tuple

            # linear layers for mu and log(var)
            latent_dim = hidden_size  # may have to change this later
            W_mu = tf.get_variable("W_mu", [hidden_size, latent_dim])
            b_mu = tf.get_variable("b_mu", [latent_dim])
            W_logvar = tf.get_variable("W_logvar", [hidden_size, latent_dim])
            b_logvar = tf.get_variable("b_logvar", [latent_dim])
            mu = tf.matmul(encoder_hidden, W_mu) + b_mu
            logvar = tf.matmul(encoder_hidden, W_logvar) + b_logvar

            # reparameterization trick: z = mu + sigma * epsilon
            epsilon = tf.random_normal(tf.shape(logvar), name='epsilon')
            stddev = tf.exp(0.5 * logvar)  # standard deviation
            self.z = mu + tf.multiply(stddev, epsilon)

        # decoder
        with tf.device("/cpu:0"):
            in_dec = embedding_lookup(embedding, x_dec)

        with tf.variable_scope("decoder"):
            helper = WordDropoutTrainingHelper(
                inputs=in_dec,
                sequence_length=len_dec,
                embedding=embedding,
                dropout_keep_prob=word_keep_prob,
                drop_token_id=UNK_ID,
                is_argmax_sampling=is_argmax_sampling)
            # projection layer to vocab logits
            output_layer = Dense(units=vocab_num,
                                 activation=None,
                                 use_bias=True,
                                 trainable=True)
            decoder = BasicDecoder(cell=cell(),
                                   helper=helper,
                                   initial_state=self.z,
                                   output_layer=output_layer)
            out_tuple = dynamic_decode(decoder=decoder,
                                       output_time_major=False,  # speed
                                       impute_finished=True)

        # get all the variables in this scope (kept as-is: external code may
        # rely on self.vars containing every VAE-scope variable)
        self.vars = tf.contrib.framework.get_variables(var_scope)

        # (outputs, state, sequence_length)
        (self.outputs, _, self.cell_outputs_len) = out_tuple
        # (cell_outputs, sample_ids)
        (self.cell_outputs, self.sampled_ids) = self.outputs

        # compute softmax loss (reconstruction), truncated to the longest
        # decoded length in the batch
        len_out = tf.reduce_max(len_dec)
        targets = y_dec[:, :len_out]
        weights = tf.sequence_mask(self.cell_outputs_len, dtype=tf.float32)
        softmax_loss = sequence_loss(logits=self.cell_outputs,
                                     targets=targets,
                                     weights=weights,
                                     average_across_timesteps=True,
                                     average_across_batch=True)
        self.AE_loss = self.AE_loss_mean = softmax_loss

        # compute KL loss (regularization): -0.5 * sum(1 + logvar - mu^2 - e^logvar)
        KL_term = 1 + logvar - tf.pow(mu, 2) - tf.exp(logvar)
        self.KL_loss = -0.5 * tf.reduce_sum(KL_term, reduction_indices=1)
        self.KL_loss_mean = tf.reduce_mean(self.KL_loss)

        # total loss
        self.loss = self.AE_loss + self.KL_weight * self.KL_loss_mean

        # optimization
        self.lr = tf.Variable(learning_rate, trainable=False, name="lr")
        # BUG FIX (companion to KL_weight above): only apply gradients to
        # *trainable* variables; self.vars also holds non-trainable state
        # (KL_weight) that must never be touched by the optimizer.
        train_vars = [v for v in self.vars if v in tf.trainable_variables()]
        grads, _ = tf.clip_by_global_norm(
            tf.gradients(self.loss, train_vars), max_grad_norm)
        optimizer = tf.train.AdamOptimizer(self.lr)
        self.global_step = get_or_create_global_step()
        self.train_op = optimizer.apply_gradients(
            zip(grads, train_vars), global_step=self.global_step)

        # learning_rate update
        self.new_lr = tf.placeholder(tf.float32, shape=[], name="new_lr")
        self.lr_update = tf.assign(self.lr, self.new_lr)
        # KL weight update (annealing is driven externally through this op)
        self.new_KL_weight = tf.placeholder(tf.float32, shape=[], name="new_kl")
        self.KL_weight_update = tf.assign(self.KL_weight, self.new_KL_weight)

        # summaries
        tf.summary.scalar("Loss/AE_mean", self.AE_loss_mean)
        tf.summary.scalar("Loss/KL_mean", self.KL_loss_mean)
        tf.summary.scalar("Loss/Total", self.AE_loss_mean + self.KL_loss_mean)
        tf.summary.scalar("Misc/KL_weight", self.KL_weight)
        tf.summary.scalar("Misc/mu_mean", tf.reduce_mean(mu))
        tf.summary.scalar("Misc/sigma_mean", tf.reduce_mean(stddev))
        tf.summary.scalar("Misc/learning_rate", self.lr)
        self.summary_op = tf.summary.merge_all()
def __init__(self, num_items, num_embed_units, num_units, num_layers, embed=None, learning_rate=1e-4, action_num=10, learning_rate_decay_factor=0.95, max_gradient_norm=5.0, use_lstm=True):
    """Build an RNN recommendation-agent graph with a training branch and a
    reusing inference branch.

    Training: an RNN encodes the session, a shared output projection scores
    all items, candidate-list probabilities are gathered per step, and a
    reward-weighted sequence loss is minimised with clipped Adam.
    Inference: the same weights (scope reuse) run from an externally fed
    LSTM state and emit top-k deterministic and Gumbel-perturbed
    recommendations.

    Args:
        num_items: item vocabulary size.
        num_embed_units: item embedding dimension.
        num_units: RNN hidden units per layer.
        num_layers: number of stacked RNN layers (the inference-state
            placeholder below hard-codes 2 layers of LSTM state).
        embed: optional pre-trained item-embedding initializer.
        learning_rate / learning_rate_decay_factor: Adam lr and its decay.
        action_num: k for the top-k recommendation outputs.
        max_gradient_norm: global-norm gradient clipping bound.
        use_lstm: LSTM cells if True, else GRU cells.
    """
    self.epoch = tf.Variable(0, trainable=False, name='agn/epoch')
    self.epoch_add_op = self.epoch.assign(self.epoch + 1)

    self.sessions_input = tf.placeholder(tf.int32, shape=(None, None))
    self.rec_lists = tf.placeholder(tf.int32, shape=(None, None, None))
    self.rec_mask = tf.placeholder(tf.float32, shape=(None, None, None))
    self.aims_idx = tf.placeholder(tf.int32, shape=(None, None))
    self.sessions_length = tf.placeholder(tf.int32, shape=(None))
    self.reward = tf.placeholder(tf.float32, shape=(None))

    if embed is None:
        self.embed = tf.get_variable(
            'agn/embed', [num_items, num_embed_units],
            tf.float32,
            initializer=tf.truncated_normal_initializer(0, 1))
    else:
        self.embed = tf.get_variable('agn/embed',
                                     dtype=tf.float32,
                                     initializer=embed)

    batch_size, encoder_length, rec_length = tf.shape(
        self.sessions_input)[0], tf.shape(
            self.sessions_input)[1], tf.shape(self.rec_lists)[2]
    # 1 for the first (sessions_length - 1) steps, 0 afterwards.
    encoder_mask = tf.reshape(
        tf.cumsum(tf.one_hot(self.sessions_length - 2, encoder_length),
                  reverse=True,
                  axis=1), [-1, encoder_length])  # [batch_size, length]

    # Next-item targets: the session shifted left by one, padded at the end.
    self.sessions_target = tf.concat([
        self.sessions_input[:, 1:],
        tf.ones([batch_size, 1], dtype=tf.int32) * PAD_ID
    ], 1)
    # [batch_size, length, embed_units]
    self.encoder_input = tf.nn.embedding_lookup(self.embed,
                                                self.sessions_input)
    # [batch_size, length, rec_length]
    self.aims = tf.one_hot(self.aims_idx, rec_length)

    if use_lstm:
        cell = MultiRNNCell(
            [LSTMCell(num_units) for _ in range(num_layers)])
    else:
        cell = MultiRNNCell(
            [GRUCell(num_units) for _ in range(num_layers)])

    # Training
    with tf.variable_scope("agn"):
        output_fn, sampled_sequence_loss = output_projection_layer(
            num_units, num_items)
        self.encoder_output, self.encoder_state = dynamic_rnn(
            cell,
            self.encoder_input,
            self.sessions_length,
            dtype=tf.float32,
            scope="encoder")
        # Build [batch, step, candidate] index triples for gather_nd:
        # dim 1 = batch index, dim 2 = step index, dim 3 = candidate item id.
        tmp_dim_1 = tf.tile(
            tf.reshape(tf.range(batch_size), [batch_size, 1, 1, 1]),
            [1, encoder_length, rec_length, 1])
        tmp_dim_2 = tf.tile(
            tf.reshape(tf.range(encoder_length),
                       [1, encoder_length, 1, 1]),
            [batch_size, 1, rec_length, 1])
        # [batch_size, length, rec_length, 3]
        gather_idx = tf.concat(
            [tmp_dim_1, tmp_dim_2,
             tf.expand_dims(self.rec_lists, 3)], 3)
        # [batch_size, length, num_items], [batch_size*length]
        y_prob, local_loss, total_size = sampled_sequence_loss(
            self.encoder_output, self.sessions_target, encoder_mask)

        # Compute recommendation rank given rec_list; zero out the first
        # two (special) item ids before ranking.
        # [batch_size, length, num_items]
        y_prob = tf.reshape(y_prob, [batch_size, encoder_length, num_items]) * \
            tf.concat([tf.zeros([batch_size, encoder_length, 2], dtype=tf.float32),
                       tf.ones([batch_size, encoder_length, num_items-2], dtype=tf.float32)], 2)
        # [batch_size, length, rec_len]: probability of each candidate
        ini_prob = tf.reshape(tf.gather_nd(y_prob, gather_idx),
                              [batch_size, encoder_length, rec_length])
        # [batch_size, length, rec_len]: mask out padded candidates
        mul_prob = ini_prob * self.rec_mask
        # [batch_size, length, action_num]
        _, self.index = tf.nn.top_k(mul_prob, k=action_num)
        # [batch_size, length, metric_num]
        _, self.metric_index = tf.nn.top_k(mul_prob,
                                           k=(FLAGS['metric'].value + 1))
        # Reward-weighted, length-normalised sequence loss.
        self.loss = tf.reduce_sum(
            tf.reshape(self.reward, [-1]) * local_loss) / total_size

    # Inference (same weights via scope reuse)
    with tf.variable_scope("agn", reuse=True):
        # tf.get_variable_scope().reuse_variables()
        # Externally fed LSTM state: [layer(2), c/h(2), batch, num_units].
        self.lstm_state = tf.placeholder(tf.float32,
                                         shape=(2, 2, None, num_units))
        self.ini_state = (tf.contrib.rnn.LSTMStateTuple(
            self.lstm_state[0, 0, :, :], self.lstm_state[0, 1, :, :]),
                          tf.contrib.rnn.LSTMStateTuple(
                              self.lstm_state[1, 0, :, :],
                              self.lstm_state[1, 1, :, :]))
        # [batch_size, length, num_units]
        self.encoder_output_predict, self.encoder_state_predict = dynamic_rnn(
            cell,
            self.encoder_input,
            self.sessions_length,
            initial_state=self.ini_state,
            dtype=tf.float32,
            scope="encoder")
        # [batch_size, num_units]: last RNN output of each session.
        # NOTE(review): index -1 takes the final padded position, not the
        # position at sessions_length-1 — verify against how the caller
        # pads/feeds inference sessions.
        self.final_output_predict = tf.reshape(
            self.encoder_output_predict[:, -1, :], [-1, num_units])
        # [batch_size, num_items]
        self.rec_logits = output_fn(self.final_output_predict)
        # [batch_size, action_num]; special tokens are skipped then the
        # offset is added back to recover real item ids.
        _, self.rec_index = tf.nn.top_k(
            self.rec_logits[:, len(_START_VOCAB):], action_num)
        self.rec_index += len(_START_VOCAB)

        def gumbel_max(inp, alpha, beta):
            # Gumbel perturbation of the log-softmax for stochastic top-k.
            # assert len(tf.shape(inp)) == 2
            g = tf.random_uniform(tf.shape(inp), 0.0001, 0.9999)
            g = -tf.log(-tf.log(g))
            inp_g = tf.nn.softmax(
                (tf.nn.log_softmax(inp / 1.0) + g * alpha) * beta)
            return inp_g

        # [batch_size, action_num]: exploratory recommendations
        _, self.random_rec_index = tf.nn.top_k(
            gumbel_max(self.rec_logits[:, len(_START_VOCAB):], 1, 1),
            action_num)
        self.random_rec_index += len(_START_VOCAB)

    # initialize the training process
    self.learning_rate = tf.Variable(float(learning_rate),
                                     trainable=False,
                                     dtype=tf.float32)
    self.learning_rate_decay_op = self.learning_rate.assign(
        self.learning_rate * learning_rate_decay_factor)
    self.global_step = tf.Variable(0, trainable=False)

    self.params = tf.trainable_variables()
    gradients = tf.gradients(self.loss, self.params)
    clipped_gradients, self.gradient_norm = tf.clip_by_global_norm(
        gradients, max_gradient_norm)
    self.update = tf.train.AdamOptimizer(
        self.learning_rate).apply_gradients(zip(clipped_gradients,
                                                self.params),
                                            global_step=self.global_step)

    self.saver = tf.train.Saver(tf.global_variables(),
                                write_version=tf.train.SaverDef.V2,
                                max_to_keep=100,
                                pad_step_number=True,
                                keep_checkpoint_every_n_hours=1.0)
def __init__(
        self,
        num_symbols,          # vocabulary size
        num_embed_units,      # word-embedding size
        num_units,            # units per RNN layer
        num_layers,           # number of RNN layers
        embed,                # pre-trained word embeddings (or None)
        entity_embed=None,    # pre-trained entity (TransE) embeddings (or None)
        num_entities=0,       # number of entities (used when entity_embed is None)
        num_trans_units=100,  # entity-embedding size
        learning_rate=0.0001,
        learning_rate_decay_factor=0.95,
        max_gradient_norm=5.0,
        num_samples=500,      # sample count for sampled softmax
        max_length=60,
        mem_use=True,
        output_alignments=True,
        use_lstm=False):
    """Build the training/inference graph of a knowledge-aware seq2seq model.

    The model encodes a post together with per-token knowledge-graph vectors
    (static graph attention over (head, relation, tail) triples) and decodes a
    response that may either emit a vocabulary word or copy an entity word.
    All inputs arrive through string placeholders and are mapped to ids via
    MutableHashTable lookups.
    """
    self.posts = tf.placeholder(tf.string, (None, None),
                                'enc_inps')  # batch_size * encoder_len
    self.posts_length = tf.placeholder(tf.int32, (None),
                                       'enc_lens')  # batch_size
    self.responses = tf.placeholder(tf.string, (None, None),
                                    'dec_inps')  # batch_size * decoder_len
    self.responses_length = tf.placeholder(tf.int32, (None),
                                           'dec_lens')  # batch_size
    self.entities = tf.placeholder(
        tf.string, (None, None, None),
        'entities')  # batch_size * triple_num * triple_len
    self.entity_masks = tf.placeholder(tf.string, (None, None),
                                       'entity_masks')  # unused in this graph
    self.triples = tf.placeholder(
        tf.string, (None, None, None, 3),
        'triples')  # batch_size * triple_num * triple_len * 3
    self.posts_triple = tf.placeholder(
        tf.int32, (None, None, 1),
        'enc_triples')  # batch_size * encoder_len
    self.responses_triple = tf.placeholder(
        tf.string, (None, None, 3),
        'dec_triples')  # batch_size * decoder_len * 3
    self.match_triples = tf.placeholder(
        tf.int32, (None, None, None),
        'match_triples')  # batch_size * decoder_len * triple_num

    # Dynamic batch size and encoder length taken from the post placeholder.
    encoder_batch_size, encoder_len = tf.unstack(tf.shape(self.posts))
    # triple_num: number of (padded) knowledge graphs attached to each post.
    triple_num = tf.shape(self.triples)[1]
    # triple_len: number of (padded) triples inside each knowledge graph.
    triple_len = tf.shape(self.triples)[2]
    # One-hot marks which triple (if any) each decoder step copied from.
    one_hot_triples = tf.one_hot(
        self.match_triples,
        triple_len)  # batch_size * decoder_len * triple_num * triple_len
    # 1.0 at decoder steps whose target token came from a triple.
    use_triples = tf.reduce_sum(one_hot_triples,
                                axis=[2, 3])  # batch_size * decoder_len
    # word -> index hash table
    self.symbol2index = MutableHashTable(
        key_dtype=tf.string,        # key tensor dtype
        value_dtype=tf.int64,       # value tensor dtype
        default_value=UNK_ID,       # default for missing keys
        shared_name=
        "in_table",  # If non-empty, this table will be shared under the given name across multiple sessions
        name="in_table",            # op name
        checkpoint=True
    )  # if True, the contents of the table are saved to and restored from checkpoints. If shared_name is empty for a checkpointed table, it is shared using the table node name.
    # index -> word hash table
    self.index2symbol = MutableHashTable(key_dtype=tf.int64,
                                         value_dtype=tf.string,
                                         default_value='_UNK',
                                         shared_name="out_table",
                                         name="out_table",
                                         checkpoint=True)
    # entity -> index hash table
    self.entity2index = MutableHashTable(key_dtype=tf.string,
                                         value_dtype=tf.int64,
                                         default_value=NONE_ID,
                                         shared_name="entity_in_table",
                                         name="entity_in_table",
                                         checkpoint=True)
    # index -> entity hash table
    self.index2entity = MutableHashTable(key_dtype=tf.int64,
                                         value_dtype=tf.string,
                                         default_value='_NONE',
                                         shared_name="entity_out_table",
                                         name="entity_out_table",
                                         checkpoint=True)
    # Map post strings to word ids.
    self.posts_word_id = self.symbol2index.lookup(
        self.posts)  # batch_size * encoder_len
    # Map post strings to entity ids.
    self.posts_entity_id = self.entity2index.lookup(
        self.posts)  # batch_size * encoder_len
    # Map response strings to word ids (decoder targets).
    self.responses_target = self.symbol2index.lookup(
        self.responses)  # batch_size * decoder_len
    # Decoder batch size / length from the response placeholder.
    batch_size, decoder_len = tf.shape(self.responses)[0], tf.shape(
        self.responses)[1]
    # Decoder inputs: drop the last target column and prepend GO_ID.
    self.responses_word_id = tf.concat([
        tf.ones([batch_size, 1], dtype=tf.int64) * GO_ID,
        tf.split(self.responses_target, [decoder_len - 1, 1], 1)[0]
    ], 1)  # batch_size * decoder_len
    # Response mask: one-hot the (length-1) position, then a reversed cumsum
    # yields 1.0 for every step inside the true length, 0.0 for padding.
    # The trailing reshape is a no-op kept from the original code.
    self.decoder_mask = tf.reshape(
        tf.cumsum(tf.one_hot(self.responses_length - 1, decoder_len),
                  reverse=True,
                  axis=1), [-1, decoder_len])  # batch_size * decoder_len
    # Word / entity embeddings: use provided initializers when given,
    # otherwise initialize randomly.
    if embed is None:
        self.embed = tf.get_variable('word_embed',
                                     [num_symbols, num_embed_units],
                                     tf.float32)
    else:
        self.embed = tf.get_variable('word_embed',
                                     dtype=tf.float32,
                                     initializer=embed)
    if entity_embed is None:
        self.entity_trans = tf.get_variable(
            'entity_embed', [num_entities, num_trans_units],
            tf.float32,
            trainable=False)  # frozen TransE embeddings
    else:
        self.entity_trans = tf.get_variable('entity_embed',
                                            dtype=tf.float32,
                                            initializer=entity_embed,
                                            trainable=False)
    # Dense tanh layer on top of the frozen entity embeddings — this is the
    # only trainable adaptation of the (non-trainable) TransE vectors.
    self.entity_trans_transformed = tf.layers.dense(
        self.entity_trans,
        num_trans_units,
        activation=tf.tanh,
        name='trans_transformation')
    # 7 zero-initialized rows prepended for special/padding entity ids, so
    # that real entity ids start at offset 7 in the lookup table.
    padding_entity = tf.get_variable('entity_padding_embed',
                                     [7, num_trans_units],
                                     dtype=tf.float32,
                                     initializer=tf.zeros_initializer())
    self.entity_embed = tf.concat(
        [padding_entity, self.entity_trans_transformed], axis=0)
    # embedding_lookup adds a trailing dimension per looked-up axis; the
    # reshape folds the (triple_len, 3, num_trans_units) tail into one axis.
    triples_embedding = tf.reshape(
        tf.nn.embedding_lookup(self.entity_embed,
                               self.entity2index.lookup(self.triples)),
        [encoder_batch_size, triple_num, -1, 3 * num_trans_units])
    entities_word_embedding = tf.reshape(
        tf.nn.embedding_lookup(self.embed,
                               self.symbol2index.lookup(self.entities)),
        [encoder_batch_size, -1, num_embed_units
         ])  # [batch_size, triple_num*triple_len, num_embed_units]
    # Split each triple embedding into head, relation, tail.
    head, relation, tail = tf.split(triples_embedding,
                                    [num_trans_units] * 3,
                                    axis=3)
    # Static graph attention: one attention distribution per knowledge graph.
    with tf.variable_scope('graph_attention'):
        # Concatenate head and tail.
        head_tail = tf.concat(
            [head, tail],
            axis=3)  # batch_size * triple_num * triple_len * 200
        # tanh(dot(W, head_tail))
        head_tail_transformed = tf.layers.dense(
            head_tail,
            num_trans_units,
            activation=tf.tanh,
            name='head_tail_transform'
        )  # batch_size * triple_num * triple_len * 100
        # dot(W, relation)
        relation_transformed = tf.layers.dense(
            relation, num_trans_units, name='relation_transform'
        )  # batch_size * triple_num * triple_len * 100
        # Elementwise product + sum == inner product:
        # dot(transpose(dot(W, relation)), tanh(dot(W, head_tail)))
        e_weight = tf.reduce_sum(
            relation_transformed * head_tail_transformed,
            axis=3)  # batch_size * triple_num * triple_len
        # Per-graph attention weights over triples.
        alpha_weight = tf.nn.softmax(
            e_weight)  # batch_size * triple_num * triple_len
        # Weighted sum over triples -> one vector per knowledge graph.
        graph_embed = tf.reduce_sum(
            tf.expand_dims(alpha_weight, 3) * head_tail,
            axis=2)  # batch_size * triple_num * 100
    """Gather the per-position graph vectors for the encoder input.

    tf.range(encoder_batch_size) reshaped to [batch, 1, 1] and tiled to
    [batch, encoder_len, 1] is concatenated with posts_triple on axis 2,
    producing an index matrix like:
    [
      [[0 0], [0 0], [0 1], [0 2], ... encoder_len],
      [[1 0], [1 0], [1 1], [1 2], ... encoder_len],
      ... batch_size
    ]
    tf.gather_nd then selects, for every encoder position, the graph vector
    that position refers to: encoder_batch_size * encoder_len * 100.
    """
    graph_embed_input = tf.gather_nd(
        graph_embed,
        tf.concat([
            tf.tile(
                tf.reshape(tf.range(encoder_batch_size, dtype=tf.int32),
                           [-1, 1, 1]), [1, encoder_len, 1]),
            self.posts_triple
        ],
                  axis=2))
    # Response-side triple embeddings: batch_size * decoder_len * 300.
    triple_embed_input = tf.reshape(
        tf.nn.embedding_lookup(
            self.entity_embed,
            self.entity2index.lookup(self.responses_triple)),
        [batch_size, decoder_len, 3 * num_trans_units])
    # Word embeddings for posts and responses.
    post_word_input = tf.nn.embedding_lookup(
        self.embed, self.posts_word_id)  # batch_size * encoder_len * 300
    response_word_input = tf.nn.embedding_lookup(
        self.embed,
        self.responses_word_id)  # batch_size * decoder_len * 300
    # Encoder input: word embedding concatenated with its graph vector.
    self.encoder_input = tf.concat(
        [post_word_input, graph_embed_input],
        axis=2)  # batch_size * encoder_len * 400
    # Decoder input: word embedding concatenated with its triple embedding.
    self.decoder_input = tf.concat(
        [response_word_input, triple_embed_input],
        axis=2)  # batch_size * decoder_len * 600
    # Deep RNN cells; note each layer gets its own GRUCell instance.
    encoder_cell = MultiRNNCell(
        [GRUCell(num_units) for _ in range(num_layers)])
    decoder_cell = MultiRNNCell(
        [GRUCell(num_units) for _ in range(num_layers)])
    # RNN encoder.
    encoder_output, encoder_state = dynamic_rnn(encoder_cell,
                                                self.encoder_input,
                                                self.posts_length,
                                                dtype=tf.float32,
                                                scope="encoder")
    # Output projection: the vocabulary is too large for a full softmax at the
    # RNN width, so project and (optionally) use sampled softmax.
    # Returns: output fn, selector fn, sequence loss, sampled sequence loss,
    # and total-loss fn.
    output_fn, selector_fn, sequence_loss, sampled_sequence_loss, total_loss = output_projection_layer(
        num_units, num_symbols, num_samples)
    # Training decoder.
    with tf.variable_scope('decoder'):
        # Prepare attention:
        #   attention_keys_init / attention_values_init: attention memory
        #   attention_score_fn_init: computes the attention context
        #   attention_construct_fn_init: concatenates all contexts
        attention_keys_init, attention_values_init, attention_score_fn_init, attention_construct_fn_init \
            = prepare_attention(encoder_output, 'bahdanau', num_units, imem=(graph_embed, triples_embedding), output_alignments=output_alignments and mem_use)#'luong', num_units)
        # Per-timestep training decoder function.
        decoder_fn_train = attention_decoder_fn_train(
            encoder_state,
            attention_keys_init,
            attention_values_init,
            attention_score_fn_init,
            attention_construct_fn_init,
            output_alignments=output_alignments and mem_use,
            max_length=tf.reduce_max(self.responses_length))
        # Outputs, final state, and a TensorArray of alignments.
        self.decoder_output, _, alignments_ta = dynamic_rnn_decoder(
            decoder_cell,
            decoder_fn_train,
            self.decoder_input,
            self.responses_length,
            scope="decoder_rnn")
        if output_alignments:
            # NOTE(review): self.alignments is read here but never assigned in
            # this constructor; only the local alignments_ta exists. As written
            # this line should raise AttributeError when output_alignments is
            # True — presumably a stacked/transposed alignments_ta was meant.
            # TODO confirm against the upstream implementation.
            self.decoder_loss, self.ppx_loss, self.sentence_ppx = total_loss(
                self.decoder_output, self.responses_target,
                self.decoder_mask, self.alignments, triples_embedding,
                use_triples, one_hot_triples)
            self.sentence_ppx = tf.identity(
                self.sentence_ppx,
                name='ppx_loss')  # name the per-sentence ppx op
        else:
            self.decoder_loss = sequence_loss(self.decoder_output,
                                              self.responses_target,
                                              self.decoder_mask)
    # Inference decoder (shares variables with the training decoder).
    with tf.variable_scope('decoder', reuse=True):
        attention_keys, attention_values, attention_score_fn, attention_construct_fn \
            = prepare_attention(encoder_output, 'bahdanau', num_units, reuse=True, imem=(graph_embed, triples_embedding), output_alignments=output_alignments and mem_use)#'luong', num_units)
        decoder_fn_inference = attention_decoder_fn_inference(
            output_fn,
            encoder_state,
            attention_keys,
            attention_values,
            attention_score_fn,
            attention_construct_fn,
            self.embed,
            GO_ID,
            EOS_ID,
            max_length,
            num_symbols,
            imem=(entities_word_embedding,
                  tf.reshape(
                      triples_embedding,
                      [encoder_batch_size, -1, 3 * num_trans_units])),
            selector_fn=selector_fn
        )  # imem: tuple of entity-word embeddings
        #    [batch_size, triple_num*triple_len, num_embed_units] and triple
        #    embeddings [batch_size, triple_num*triple_len, 3*num_trans_units]
        self.decoder_distribution, _, output_ids_ta = dynamic_rnn_decoder(
            decoder_cell, decoder_fn_inference, scope="decoder_rnn")
        output_len = tf.shape(self.decoder_distribution)[1]  # decoder_len
        output_ids = tf.transpose(
            output_ids_ta.gather(
                tf.range(output_len)))  # [batch_size, decoder_len]
        # Positive ids are vocabulary words; clip negatives to 0 (=_PAD/UNK
        # side) so the word lookup stays in range.
        word_ids = tf.cast(tf.clip_by_value(output_ids, 0, num_symbols),
                           tf.int64)  # [batch_size, decoder_len]
        # Negative ids index into entities; convert to flat positions:
        # 1. tf.shape(entities_word_embedding)[1] = triple_num*triple_len
        # 2. tf.range(encoder_batch_size): [batch_size]
        # 3. reshape(range * width, [-1, 1]): per-batch offset into entities
        # 4. tf.clip_by_value(-output_ids, 0, num_symbols): relative entity pos
        # 5. entity_ids: [batch_size * decoder_len] absolute positions
        entity_ids = tf.reshape(
            tf.clip_by_value(-output_ids, 0, num_symbols) + tf.reshape(
                tf.range(encoder_batch_size) *
                tf.shape(entities_word_embedding)[1], [-1, 1]), [-1])
        # Gather the entity strings actually used:
        # entities: [batch_size, triple_num, triple_len] -> flat -> gather
        # -> [batch_size, output_len]
        entities = tf.reshape(
            tf.gather(tf.reshape(self.entities, [-1]), entity_ids),
            [-1, output_len])
        words = self.index2symbol.lookup(word_ids)  # ids -> word strings
        # Where output_ids > 0 take the vocabulary word, else the copied entity.
        self.generation = tf.where(output_ids > 0, words, entities)
        self.generation = tf.identity(self.generation, name='generation')
    # Training setup.
    self.learning_rate = tf.Variable(float(learning_rate),
                                     trainable=False,
                                     dtype=tf.float32)
    self.learning_rate_decay_op = self.learning_rate.assign(
        self.learning_rate * learning_rate_decay_factor)
    # Number of parameter updates performed so far.
    self.global_step = tf.Variable(0, trainable=False)
    # NOTE(review): gradients are taken w.r.t. ALL global variables, not just
    # trainable ones — tf.gradients returns None for unconnected/non-trainable
    # variables; confirm tf.trainable_variables() was not intended.
    self.params = tf.global_variables()
    # Optimizer; note the decay op above multiplies self.learning_rate, but
    # the optimizer is built with the constant initial learning_rate.
    opt = tf.train.AdamOptimizer(learning_rate=learning_rate)
    self.lr = opt._lr  # reaches into a private optimizer attribute
    # Gradients of decoder_loss w.r.t. params, with global-norm clipping.
    gradients = tf.gradients(self.decoder_loss, self.params)
    clipped_gradients, self.gradient_norm = tf.clip_by_global_norm(
        gradients, max_gradient_norm)
    self.update = opt.apply_gradients(zip(clipped_gradients, self.params),
                                      global_step=self.global_step)
    tf.summary.scalar('decoder_loss', self.decoder_loss)
    for each in tf.trainable_variables():
        tf.summary.histogram(each.name, each)
    self.merged_summary_op = tf.summary.merge_all()
    self.saver = tf.train.Saver(write_version=tf.train.SaverDef.V2,
                                max_to_keep=3,
                                pad_step_number=True,
                                keep_checkpoint_every_n_hours=1.0)
    self.saver_epoch = tf.train.Saver(write_version=tf.train.SaverDef.V2,
                                      max_to_keep=1000,
                                      pad_step_number=True)
def __init__(self,
             num_symbols,
             num_embed_units,
             num_units,
             num_layers,
             num_labels,
             embed,
             learning_rate=0.5,
             max_gradient_norm=5.0,
             model='LSTM'):
    """Build an RNN text classifier graph.

    Maps string tokens to ids, embeds them, runs a (possibly multi-layer)
    RNN of type ``model`` ('LSTM', 'RNN' or 'GRU'), and classifies the
    sequence representation into ``num_labels`` classes with a dense layer
    trained by clipped SGD on the mean cross-entropy.

    Fixes over the previous revision:
      * the multi-layer initial state used a hard-coded batch size of 16;
        it now uses the runtime batch size,
      * ``[cell_dr] * num_layers`` reused one cell object for every layer
        (sharing weights across layers); each layer now gets its own cell,
      * the sequence representation in the multi-layer branch read
        ``outputs[:, -1, :]`` which, with padded batches, is a padding
        timestep; it now uses the top layer's final state, matching the
        single-layer branch.
    """
    # Placeholders.
    self.texts = tf.placeholder(dtype=tf.string,
                                shape=[None, None])  # shape: batch*len
    self.texts_length = tf.placeholder(dtype=tf.int32,
                                       shape=None)  # shape: batch
    self.labels = tf.placeholder(dtype=tf.int64, shape=None)  # shape: batch
    self.keep_prob = tf.placeholder(dtype=tf.float32)  # dropout keep prob
    # Vocab table (string -> index).
    self.symbol2index = MutableHashTable(key_dtype=tf.string,
                                         value_dtype=tf.int64,
                                         default_value=UNK_ID,
                                         shared_name="in_table",
                                         name="in_table",
                                         checkpoint=True)
    # Training bookkeeping.
    self.learning_rate = tf.Variable(float(learning_rate),
                                     trainable=False,
                                     dtype=tf.float32)
    self.global_step = tf.Variable(0, trainable=False)
    self.epoch = tf.Variable(0, trainable=False)
    self.epoch_add_op = self.epoch.assign(self.epoch + 1)

    self.index_input = self.symbol2index.lookup(self.texts)  # batch*len

    # Embedding table (index -> vector): random or pre-trained.
    if embed is None:
        self.embed = tf.get_variable('embed',
                                     [num_symbols, num_embed_units],
                                     tf.float32)
    else:
        self.embed = tf.get_variable('embed',
                                     dtype=tf.float32,
                                     initializer=embed)
    self.embed_input = tf.nn.embedding_lookup(
        self.embed, self.index_input)  # batch*len*embed_unit

    # One constructor per supported cell type; shared by both branches.
    cell_constructors = {
        'LSTM': BasicLSTMCell,
        'RNN': BasicRNNCell,
        'GRU': GRUCell,
    }
    make_cell = cell_constructors.get(model)
    if make_cell is None:
        print("Wrong model!")
        return

    def _wrap(cell):
        # Output dropout only; inputs are kept intact.
        return tf.nn.rnn_cell.DropoutWrapper(cell,
                                             input_keep_prob=1.0,
                                             output_keep_prob=self.keep_prob)

    if num_layers == 1:
        cell_dr = _wrap(make_cell(num_units))
        outputs, states = dynamic_rnn(cell_dr,
                                      self.embed_input,
                                      self.texts_length,
                                      dtype=tf.float32,
                                      scope="rnn")
        # LSTM state is a (c, h) tuple; keep index 0 as before.
        if model == 'LSTM':
            h_state = states[0]
        else:
            h_state = states
    else:
        # One independent cell per layer — a single shared instance would
        # tie the weights of all layers together.
        multi_cell = tf.contrib.rnn.MultiRNNCell(
            [_wrap(make_cell(num_units)) for _ in range(num_layers)],
            state_is_tuple=True)
        # Use the runtime batch size, not a hard-coded constant.
        batch_size = tf.shape(self.texts)[0]
        init_state = multi_cell.zero_state(batch_size, tf.float32)
        outputs, state = tf.nn.dynamic_rnn(multi_cell,
                                           self.embed_input,
                                           self.texts_length,
                                           dtype=tf.float32,
                                           scope="rnn",
                                           initial_state=init_state,
                                           time_major=False)
        # Final state of the top layer — valid even for padded sequences,
        # unlike outputs[:, -1, :]. Same component choice as the
        # single-layer branch.
        if model == 'LSTM':
            h_state = state[-1][0]
        else:
            h_state = state[-1]

    logits = tf.layers.dense(h_state, num_labels)

    # Summed cross-entropy (kept as the reported loss) and its mean for the
    # gradient, so the step size is batch-size independent.
    self.loss = tf.reduce_sum(
        tf.nn.sparse_softmax_cross_entropy_with_logits(labels=self.labels,
                                                       logits=logits),
        name='loss')
    mean_loss = self.loss / tf.cast(tf.shape(self.labels)[0],
                                    dtype=tf.float32)
    predict_labels = tf.argmax(logits, 1, 'predict_labels')
    # Count of correct predictions in the batch (not a ratio).
    self.accuracy = tf.reduce_sum(tf.cast(
        tf.equal(self.labels, predict_labels), tf.int32),
                                  name='accuracy')

    self.params = tf.trainable_variables()
    # Clipped SGD on the mean loss.
    opt = tf.train.GradientDescentOptimizer(self.learning_rate)
    gradients = tf.gradients(mean_loss, self.params)
    clipped_gradients, self.gradient_norm = tf.clip_by_global_norm(
        gradients, max_gradient_norm)
    self.update = opt.apply_gradients(zip(clipped_gradients, self.params),
                                      global_step=self.global_step)

    tf.summary.scalar('loss/step', self.loss)
    for each in tf.trainable_variables():
        tf.summary.histogram(each.name, each)
    self.merged_summary_op = tf.summary.merge_all()
    self.saver = tf.train.Saver(write_version=tf.train.SaverDef.V2,
                                max_to_keep=3,
                                pad_step_number=True,
                                keep_checkpoint_every_n_hours=1.0)
def encode_v2(self, question_embeddings, document_embeddings, question_mask,
              context_mask, encoderb_state_input, dropout_keep_prob,
              max_question_len):
    """Coattention encoder over a question and a document.

    Encodes both sequences with a shared (dropout-wrapped) LSTM, computes
    the affinity matrix L = D^T Q, derives the two attention maps A_Q and
    A_D, forms the coattention context [D; C_D], and passes it through a
    bidirectional LSTM.  Returns U, the coattention encoding of the
    context, shape (batch, context_len, 2*state_size).

    NOTE(review): this appears to follow the Dynamic Coattention Network
    encoder (Xiong et al.) — confirm against the paper.  encoderb_state_input
    is accepted but never used in this body.
    """
    # Shared LSTM cell (same weights encode question and document).
    lstm_cell = tf.nn.rnn_cell.LSTMCell(self.state_size)
    lstm_cell = tf.nn.rnn_cell.DropoutWrapper(
        lstm_cell, input_keep_prob=dropout_keep_prob)
    # Question -> LSTM -> Q
    with tf.variable_scope('question_embedding'):
        # True lengths from the boolean/0-1 mask.
        question_length = tf.reduce_sum(tf.cast(question_mask, tf.int32),
                                        reduction_indices=1)
        Q_prime, _ = dynamic_rnn(lstm_cell,
                                 question_embeddings,
                                 sequence_length=question_length,
                                 dtype=tf.float32)
        print("Q_prime: ", Q_prime)
        # Non-linear projection layer on top of the question encoding.
        Q = tf.tanh(batch_linear(Q_prime, max_question_len, True))
        # To (batch, hidden, question_len) for the matmuls below.
        Q = tf.transpose(Q, [0, 2, 1])
        print("Q: ", Q)
    with tf.variable_scope('context_embedding'):
        # Paragraph -> LSTM -> D (variables are fresh in this scope, so no
        # explicit reuse is needed).
        #tf.get_variable_scope().reuse_variables()
        context_length = tf.reduce_sum(tf.cast(context_mask, tf.int32),
                                       reduction_indices=1)
        D, _ = dynamic_rnn(lstm_cell,
                           document_embeddings,
                           sequence_length=context_length,
                           dtype=tf.float32)
        # To (batch, hidden, context_len).
        D = tf.transpose(D, [0, 2, 1])
        print("D: ", D)
    with tf.variable_scope('coattention'):
        # Affinity matrix: (batch, context_len, question_len).
        L = tf.batch_matmul(tf.transpose(D, [0, 2, 1]), Q)
        print("L: ", L)
        # Row-wise softmaxes applied per example via map_fn:
        # A_Q attends over the question, A_D over the document.
        A_Q = tf.map_fn(lambda x: tf.nn.softmax(x), L, dtype=tf.float32)
        A_D = tf.map_fn(lambda x: tf.nn.softmax(x),
                        tf.transpose(L, [0, 2, 1]),
                        dtype=tf.float32)
        print("A_Q: ", A_Q)
        print("A_D: ", A_D)
        # Question-aware summaries of the document.
        C_Q = batch_matmul(D, A_Q)
        print("C_Q: ", C_Q)
        # (Computed only for the debug print; the same concat is rebuilt
        # inline on the next matmul.)
        concat = tf.concat(1, [Q, C_Q])
        print("concat: ", concat)
        # Document-aware summaries of [Q; C_Q].
        C_D = batch_matmul(tf.concat(1, [Q, C_Q]), A_D)
        print("C_D: ", C_D)
        # Final coattention context: (batch size, context length, 3*hidden size)
        co_att = tf.concat(1, [D, C_D])
        co_att = tf.transpose(co_att, [0, 2, 1])
        print("co_att: ", co_att)
    with tf.variable_scope('encoder'):
        # BiLSTM fusing the coattention context along time.
        cell_fw = tf.nn.rnn_cell.LSTMCell(self.state_size)
        cell_bw = tf.nn.rnn_cell.LSTMCell(self.state_size)
        cell_fw = tf.nn.rnn_cell.DropoutWrapper(
            cell_fw, input_keep_prob=dropout_keep_prob)
        cell_bw = tf.nn.rnn_cell.DropoutWrapper(
            cell_bw, input_keep_prob=dropout_keep_prob)
        # Compute coattention encoding.
        (fw_out, bw_out), _ = tf.nn.bidirectional_dynamic_rnn(
            cell_fw,
            cell_bw,
            co_att,
            sequence_length=context_length,
            dtype=tf.float32)
        print("fw out: ", fw_out)
        print("bw out: ", bw_out)
        # Concatenate both directions on the feature axis.
        U = tf.concat(2, [fw_out, bw_out])
        print("U: ", U)
    return U
def __init__(self,
             num_lstm_units,
             embed,
             neg_num=4,
             gradient_clip_threshold=5.0):
    """Build an LSTM-based DSSM-style matching graph.

    Encodes a query and (neg_num + 1) candidate documents — one positive
    followed by neg_num negatives — with separate LSTMs, scores each pair
    by scaled cosine similarity, and minimizes the negative log-probability
    of the positive document under a softmax over candidates.
    """
    self.queries = tf.placeholder(dtype=tf.string,
                                  shape=[None, None])  # shape: batch*len
    self.queries_length = tf.placeholder(dtype=tf.int32,
                                         shape=[None])  # shape: batch
    self.docs = tf.placeholder(
        dtype=tf.string,
        shape=[neg_num + 1, None, None])  # shape: (neg_num + 1)*batch*len
    # NOTE(review): actual shape is (neg_num + 1) * batch, not the
    # transposed order the original comment suggested.
    self.docs_length = tf.placeholder(
        dtype=tf.int32,
        shape=[neg_num + 1, None])  # shape: batch*(neg_num + 1)
    # word -> index hash table.
    self.word2index = MutableHashTable(
        key_dtype=tf.string,
        value_dtype=tf.int64,
        default_value=UNK_ID,
        shared_name="in_table",
        name="in_table",
        checkpoint=True
    )
    # Training bookkeeping.
    self.learning_rate = tf.Variable(0.001,
                                     trainable=False,
                                     dtype=tf.float32)
    self.global_step = tf.Variable(0, trainable=False)
    self.epoch = tf.Variable(0, trainable=False)
    self.epoch_add_op = self.epoch.assign(self.epoch + 1)
    self.momentum = tf.Variable(0.9, trainable=False, dtype=tf.float32)

    # Token-id lookups; docs are unstacked so each candidate list is
    # processed separately.
    self.index_queries = self.word2index.lookup(self.queries)  # batch*len
    self.index_docs = [self.word2index.lookup(doc)
                       for doc in tf.unstack(self.docs)]

    # Shared embedding table initialized from pre-trained vectors.
    self.embed = tf.get_variable('embed',
                                 dtype=tf.float32,
                                 initializer=embed)
    self.embed_queries = tf.nn.embedding_lookup(self.embed,
                                                self.index_queries)
    self.embed_docs = [tf.nn.embedding_lookup(self.embed, index_doc)
                       for index_doc in self.index_docs]

    # Separate LSTMs for queries and documents (separate variable scopes).
    with tf.variable_scope('query_lstm'):
        self.cell_q = SimpleLSTMCell(num_lstm_units)
    with tf.variable_scope('doc_lstm'):
        self.cell_d = SimpleLSTMCell(num_lstm_units)

    # [1][1] selects the hidden state h from the final (c, h) LSTM state.
    self.states_q = dynamic_rnn(
        self.cell_q,
        self.embed_queries,
        self.queries_length,
        dtype=tf.float32,
        scope="simple_lstm_cell_query")[1][1]  # shape: batch*num_units
    # All candidates share the doc LSTM (same scope -> same weights).
    self.states_d = [
        dynamic_rnn(self.cell_d,
                    self.embed_docs[i],
                    self.docs_length[i],
                    dtype=tf.float32,
                    scope="simple_lstm_cell_doc")[1][1]
        for i in range(neg_num + 1)
    ]  # shape: (neg_num + 1)*batch*num_units

    # Cosine similarity between query and each candidate.
    self.queries_norm = tf.sqrt(
        tf.reduce_sum(tf.square(self.states_q), axis=1))
    self.docs_norm = [
        tf.sqrt(tf.reduce_sum(tf.square(self.states_d[i]), axis=1))
        for i in range(neg_num + 1)
    ]
    self.prods = [
        tf.reduce_sum(tf.multiply(self.states_q, self.states_d[i]), axis=1)
        for i in range(neg_num + 1)
    ]
    self.sims = [(self.prods[i] / (self.queries_norm * self.docs_norm[i]))
                 for i in range(neg_num + 1)]  # shape: (neg_num + 1)*batch
    self.sims = tf.convert_to_tensor(self.sims)
    # Learnable scaling factor gamma, per the DSSM paper.
    self.gamma = tf.Variable(
        initial_value=1.0, expected_shape=[],
        dtype=tf.float32)  # scaling factor according to the paper
    self.origin_sims = self.sims  # unscaled similarities, kept for inspection
    self.sims = self.sims * self.gamma
    # Softmax over candidates (axis 0); row 0 is the positive document.
    self.prob = tf.nn.softmax(self.sims,
                              dim=0)  # shape: (neg_num + 1)*batch
    self.hit_prob = tf.transpose(self.prob[0])
    # Negative log-likelihood of the positive candidate.
    self.loss = -tf.reduce_mean(tf.log(self.hit_prob))

    # Nesterov momentum with global-norm gradient clipping.
    self.params = tf.trainable_variables()
    opt = tf.train.MomentumOptimizer(
        learning_rate=self.learning_rate,
        momentum=self.momentum,
        use_nesterov=True)  # use Nesterov's method, according to the paper
    gradients = tf.gradients(self.loss, self.params)
    clipped_gradients, self.gradient_norm = tf.clip_by_global_norm(
        gradients, gradient_clip_threshold)
    self.update = opt.apply_gradients(zip(clipped_gradients, self.params),
                                      global_step=self.global_step)

    self.saver = tf.train.Saver(write_version=tf.train.SaverDef.V2,
                                max_to_keep=3,
                                pad_step_number=True,
                                keep_checkpoint_every_n_hours=1.0)
def __init__(self, data, args, embed):
    """Build a GRU seq2seq graph with teacher-forced training and greedy
    decoding for inference.

    The encoder and decoder share one GRUCell and one 'decoder_rnn' variable
    scope (reuse=tf.AUTO_REUSE), so encoding and decoding use the same RNN
    weights.  At inference the decoder starts from the encoder state; at
    training time ``dec_start`` (used only by the inference decoder) is a
    zero state.
    """
    self.posts = tf.placeholder(tf.int32, (None, None),
                                'enc_inps')  # batch*len
    self.posts_length = tf.placeholder(tf.int32, (None, ),
                                       'enc_lens')  # batch
    self.origin_responses = tf.placeholder(tf.int32, (None, None),
                                           'dec_inps')  # batch*len
    self.origin_responses_length = tf.placeholder(tf.int32, (None, ),
                                                  'dec_lens')  # batch
    self.is_train = tf.placeholder(tf.bool)

    # Adapt the raw responses for the encoder/decoder: input drops the last
    # token, target drops the first, lengths shrink by one accordingly.
    batch_size, decoder_len = tf.shape(self.origin_responses)[0], tf.shape(
        self.origin_responses)[1]
    self.responses_input = tf.split(self.origin_responses,
                                    [decoder_len - 1, 1], 1)[0]
    self.responses_target = tf.split(self.origin_responses,
                                     [1, decoder_len - 1], 1)[1]
    self.responses_length = self.origin_responses_length - 1
    decoder_len = decoder_len - 1
    # Mask: 1.0 inside each sequence's true length, 0.0 on padding
    # (reversed cumsum over the one-hot of the last valid position).
    self.decoder_mask = tf.reshape(
        tf.cumsum(tf.one_hot(self.responses_length - 1, decoder_len),
                  reverse=True,
                  axis=1), [-1, decoder_len])

    # Training bookkeeping.
    self.learning_rate = tf.Variable(float(args.lr),
                                     trainable=False,
                                     dtype=tf.float32)
    self.learning_rate_decay_op = self.learning_rate.assign(
        self.learning_rate * args.lr_decay)
    self.global_step = tf.Variable(0, trainable=False)

    # Embedding table: random or pre-trained.
    if embed is None:
        self.embed = tf.get_variable(
            'embed', [data.vocab_size, args.embedding_size], tf.float32)
    else:
        self.embed = tf.get_variable('embed',
                                     dtype=tf.float32,
                                     initializer=embed)
    self.encoder_input = tf.nn.embedding_lookup(self.embed, self.posts)
    self.decoder_input = tf.nn.embedding_lookup(self.embed,
                                                self.responses_input)
    #self.decoder_input = tf.cond(self.is_train,
    #        lambda: tf.nn.dropout(tf.nn.embedding_lookup(self.embed, self.responses_input), 0.8),
    #        lambda: tf.nn.embedding_lookup(self.embed, self.responses_input))

    # Single GRU cell shared by encoder and decoder.
    cell = tf.nn.rnn_cell.GRUCell(args.eh_size)

    # Output projection (dense layer for inference; sampled-softmax loss
    # closure for training).
    output_fn = MyDense(data.vocab_size, use_bias=True)
    sampled_sequence_loss = output_projection_layer(
        args.dh_size, data.vocab_size, args.softmax_samples)

    # Encoder pass — note it runs inside the 'decoder'/'decoder_rnn' scope,
    # so its weights are the same as the decoder's.
    with tf.variable_scope('decoder', reuse=tf.AUTO_REUSE):
        _, self.encoder_state = dynamic_rnn(cell,
                                            self.encoder_input,
                                            self.posts_length,
                                            dtype=tf.float32,
                                            scope="decoder_rnn")

    # Greedy inference helper starting from eos_id tokens.
    infer_helper = tf.contrib.seq2seq.GreedyEmbeddingHelper(
        self.embed, tf.fill([batch_size], data.eos_id), data.eos_id)
    # Inference decoder start state: zeros during training, encoder state
    # otherwise.
    dec_start = tf.cond(
        self.is_train,
        lambda: tf.zeros([batch_size, args.dh_size], dtype=tf.float32),
        lambda: self.encoder_state)

    # Training decoder (teacher forcing from the encoder state).
    with tf.variable_scope('decoder', reuse=tf.AUTO_REUSE):
        self.decoder_output, _ = dynamic_rnn(
            cell,
            self.decoder_input,
            self.responses_length,
            dtype=tf.float32,
            initial_state=self.encoder_state,
            scope='decoder_rnn')
        #self.decoder_output = tf.nn.dropout(self.decoder_output, 0.8)
        self.decoder_distribution_teacher, self.decoder_loss, self.decoder_all_loss = \
            sampled_sequence_loss(self.decoder_output, self.responses_target, self.decoder_mask)

    # Inference decoder (greedy decoding through the same weights).
    with tf.variable_scope('decoder', reuse=tf.AUTO_REUSE):
        decoder_infer = tf.contrib.seq2seq.BasicDecoder(
            cell, infer_helper, dec_start, output_layer=output_fn)
        infer_outputs, _, _ = tf.contrib.seq2seq.dynamic_decode(
            decoder_infer,
            impute_finished=True,
            maximum_iterations=args.max_sent_length,
            scope="decoder_rnn")
        self.decoder_distribution = infer_outputs.rnn_output
        # Drop the first two vocabulary entries from argmax so PAD/UNK are
        # never generated, then shift indices back.
        self.generation_index = tf.argmax(
            tf.split(self.decoder_distribution,
                     [2, data.vocab_size - 2], 2)[1], 2) + 2  # for removing UNK

    # Update only this model's variables (filtered by args.name) with
    # Adam + global-norm clipping.
    self.params = [
        k for k in tf.trainable_variables() if args.name in k.name
    ]
    opt = tf.train.AdamOptimizer(self.learning_rate)
    gradients = tf.gradients(self.decoder_loss, self.params)
    clipped_gradients, self.gradient_norm = tf.clip_by_global_norm(
        gradients, args.grad_clip)
    self.update = opt.apply_gradients(zip(clipped_gradients, self.params),
                                      global_step=self.global_step)

    # Checkpoint savers: rolling "latest" plus a single "best".
    self.latest_saver = tf.train.Saver(
        write_version=tf.train.SaverDef.V2,
        max_to_keep=args.checkpoint_max_to_keep,
        pad_step_number=True,
        keep_checkpoint_every_n_hours=1.0)
    self.best_saver = tf.train.Saver(write_version=tf.train.SaverDef.V2,
                                     max_to_keep=1,
                                     pad_step_number=True,
                                     keep_checkpoint_every_n_hours=1.0)
    # TensorBoard summaries.
    self.create_summary(args)
def __init__(self,
             num_symbols,
             num_embed_units,
             num_units,
             num_layers,
             beam_size,
             embed,
             learning_rate=0.5,
             remove_unk=False,
             learning_rate_decay_factor=0.95,
             max_gradient_norm=5.0,
             num_samples=512,
             max_length=8,
             use_lstm=False):
    """Build an attention seq2seq graph with three decoders sharing weights:
    a teacher-forced training decoder, a greedy inference decoder, and a
    beam-search decoder whose outputs are also wired into a serving Exporter.
    """
    self.posts = tf.placeholder(tf.string, (None, None),
                                'enc_inps')  # batch*len
    self.posts_length = tf.placeholder(tf.int32, (None),
                                       'enc_lens')  # batch
    self.responses = tf.placeholder(tf.string, (None, None),
                                    'dec_inps')  # batch*len
    self.responses_length = tf.placeholder(tf.int32, (None),
                                           'dec_lens')  # batch

    # Training bookkeeping.
    self.learning_rate = tf.Variable(float(learning_rate),
                                     trainable=False,
                                     dtype=tf.float32)
    self.learning_rate_decay_op = self.learning_rate.assign(
        self.learning_rate * learning_rate_decay_factor)
    self.global_step = tf.Variable(0, trainable=False)

    # word -> index and index -> word hash tables.
    self.symbol2index = MutableHashTable(key_dtype=tf.string,
                                         value_dtype=tf.int64,
                                         default_value=UNK_ID,
                                         shared_name="in_table",
                                         name="in_table",
                                         checkpoint=True)
    self.index2symbol = MutableHashTable(key_dtype=tf.int64,
                                         value_dtype=tf.string,
                                         default_value='_UNK',
                                         shared_name="out_table",
                                         name="out_table",
                                         checkpoint=True)
    # Vocab lookups (string to index).
    self.posts_input = self.symbol2index.lookup(self.posts)  # batch*len
    self.responses_target = self.symbol2index.lookup(
        self.responses)  #batch*len
    batch_size, decoder_len = tf.shape(self.responses)[0], tf.shape(
        self.responses)[1]
    # Decoder inputs: GO_ID prepended, last target column dropped.
    self.responses_input = tf.concat([
        tf.ones([batch_size, 1], dtype=tf.int64) * GO_ID,
        tf.split(self.responses_target, [decoder_len - 1, 1], 1)[0]
    ], 1)  # batch*len
    # Mask: 1.0 inside each response's true length, 0.0 on padding.
    self.decoder_mask = tf.reshape(
        tf.cumsum(tf.one_hot(self.responses_length - 1, decoder_len),
                  reverse=True,
                  axis=1), [-1, decoder_len])

    # Embedding table (index to vector): random or pre-trained.
    if embed is None:
        self.embed = tf.get_variable('embed',
                                     [num_symbols, num_embed_units],
                                     tf.float32)
    else:
        self.embed = tf.get_variable('embed',
                                     dtype=tf.float32,
                                     initializer=embed)
    self.encoder_input = tf.nn.embedding_lookup(
        self.embed, self.posts_input)  #batch*len*unit
    self.decoder_input = tf.nn.embedding_lookup(self.embed,
                                                self.responses_input)

    # NOTE(review): [Cell(num_units)] * num_layers repeats the SAME cell
    # object on every layer, so all layers share one set of weights (the
    # classic TF1 MultiRNNCell pitfall) — confirm whether per-layer cells
    # ([Cell(num_units) for _ in range(num_layers)]) were intended.
    if use_lstm:
        cell = MultiRNNCell([LSTMCell(num_units)] * num_layers)
    else:
        cell = MultiRNNCell([GRUCell(num_units)] * num_layers)

    # RNN encoder.
    encoder_output, encoder_state = dynamic_rnn(cell,
                                                self.encoder_input,
                                                self.posts_length,
                                                dtype=tf.float32,
                                                scope="encoder")

    # Output projection (sampled softmax for training).
    output_fn, sampled_sequence_loss = output_projection_layer(
        num_units, num_symbols, num_samples)

    # Luong attention over the encoder outputs, shared by all decoders.
    attention_keys, attention_values, attention_score_fn, attention_construct_fn \
        = attention_decoder_fn.prepare_attention(encoder_output, 'luong', num_units)

    # Training decoder (teacher forcing).
    with tf.variable_scope('decoder'):
        decoder_fn_train = attention_decoder_fn.attention_decoder_fn_train(
            encoder_state, attention_keys, attention_values,
            attention_score_fn, attention_construct_fn)
        self.decoder_output, _, _ = dynamic_rnn_decoder(
            cell,
            decoder_fn_train,
            self.decoder_input,
            self.responses_length,
            scope="decoder_rnn")
        self.decoder_loss = sampled_sequence_loss(self.decoder_output,
                                                  self.responses_target,
                                                  self.decoder_mask)

    # Greedy inference decoder (same variables as training).
    with tf.variable_scope('decoder', reuse=True):
        decoder_fn_inference = attention_decoder_fn.attention_decoder_fn_inference(
            output_fn, encoder_state, attention_keys, attention_values,
            attention_score_fn, attention_construct_fn, self.embed, GO_ID,
            EOS_ID, max_length, num_symbols)
        self.decoder_distribution, _, _ = dynamic_rnn_decoder(
            cell, decoder_fn_inference, scope="decoder_rnn")
        # Drop the first two vocabulary rows from argmax so PAD/UNK are
        # never generated, then shift indices back.
        self.generation_index = tf.argmax(
            tf.split(self.decoder_distribution, [2, num_symbols - 2], 2)[1],
            2) + 2  # for removing UNK
        self.generation = self.index2symbol.lookup(self.generation_index,
                                                   name='generation')

    # Beam-search decoder (same variables as training).
    with tf.variable_scope('decoder', reuse=True):
        decoder_fn_beam_inference = attention_decoder_fn_beam_inference(
            output_fn, encoder_state, attention_keys, attention_values,
            attention_score_fn, attention_construct_fn, self.embed, GO_ID,
            EOS_ID, max_length, num_symbols, beam_size, remove_unk)
        _, _, self.context_state = dynamic_rnn_decoder(
            cell, decoder_fn_beam_inference, scope="decoder_rnn")
        # Beam TensorArrays: live beams plus the 2x-wide finished results.
        (log_beam_probs, beam_parents, beam_symbols, result_probs,
         result_parents, result_symbols) = self.context_state
        # Stack each TensorArray to [max_length+1, batch*?, width] and move
        # batch first: [batch, time, width].
        self.beam_parents = tf.transpose(tf.reshape(
            beam_parents.stack(), [max_length + 1, -1, beam_size]),
                                         [1, 0, 2],
                                         name='beam_parents')
        self.beam_symbols = tf.transpose(
            tf.reshape(beam_symbols.stack(),
                       [max_length + 1, -1, beam_size]), [1, 0, 2])
        self.beam_symbols = self.index2symbol.lookup(tf.cast(
            self.beam_symbols, tf.int64),
                                                     name="beam_symbols")
        self.result_probs = tf.transpose(tf.reshape(
            result_probs.stack(), [max_length + 1, -1, beam_size * 2]),
                                         [1, 0, 2],
                                         name='result_probs')
        self.result_symbols = tf.transpose(
            tf.reshape(result_symbols.stack(),
                       [max_length + 1, -1, beam_size * 2]), [1, 0, 2])
        self.result_parents = tf.transpose(tf.reshape(
            result_parents.stack(), [max_length + 1, -1, beam_size * 2]),
                                           [1, 0, 2],
                                           name='result_parents')
        self.result_symbols = self.index2symbol.lookup(
            tf.cast(self.result_symbols, tf.int64), name='result_symbols')

    # Clipped SGD on the training decoder's loss.
    self.params = tf.trainable_variables()
    opt = tf.train.GradientDescentOptimizer(self.learning_rate)
    gradients = tf.gradients(self.decoder_loss, self.params)
    clipped_gradients, self.gradient_norm = tf.clip_by_global_norm(
        gradients, max_gradient_norm)
    self.update = opt.apply_gradients(zip(clipped_gradients, self.params),
                                      global_step=self.global_step)

    self.saver = tf.train.Saver(write_version=tf.train.SaverDef.V2,
                                max_to_keep=3,
                                pad_step_number=True,
                                keep_checkpoint_every_n_hours=1.0)

    # Exporter for serving: exposes the beam-search inputs/outputs.
    self.model_exporter = exporter.Exporter(self.saver)
    inputs = {"enc_inps:0": self.posts, "enc_lens:0": self.posts_length}
    outputs = {
        "beam_symbols": self.beam_symbols,
        "beam_parents": self.beam_parents,
        "result_probs": self.result_probs,
        "result_symbols": self.result_symbols,
        "result_parents": self.result_parents
    }
    self.model_exporter.init(tf.get_default_graph().as_graph_def(),
                             named_graph_signatures={
                                 "inputs":
                                 exporter.generic_signature(inputs),
                                 "outputs":
                                 exporter.generic_signature(outputs)
                             })
def __init__(self, num_symbols, num_embed_units, num_units, num_layers, embed,
             entity_embed=None, num_entities=0, num_trans_units=100,
             learning_rate=0.0001, learning_rate_decay_factor=0.95,
             max_gradient_norm=5.0, num_samples=512, max_length=60,
             output_alignments=True, use_lstm=False):
    """Build the graph of a knowledge-triple-grounded seq2seq dialogue model.

    Constructs string placeholders, in-graph string<->id hash tables,
    word/entity embeddings, a GRU encoder, an attention decoder that can
    attend over knowledge-triple embeddings (separate training and inference
    paths sharing variables), the loss, and an Adam training op.
    NOTE(review): `use_lstm` and `learning_rate_decay_factor` are accepted
    but the cells below are always GRU and Adam uses the raw `learning_rate`.
    """
    # ---- placeholders: raw strings / lengths fed by the data pipeline ----
    self.posts = tf.placeholder(tf.string, (None, None), 'enc_inps')  # batch*len
    self.posts_length = tf.placeholder(tf.int32, (None), 'enc_lens')  # batch
    self.responses = tf.placeholder(tf.string, (None, None), 'dec_inps')  # batch*len
    self.responses_length = tf.placeholder(tf.int32, (None), 'dec_lens')  # batch
    self.entities = tf.placeholder(tf.string, (None, None), 'entities')  # batch
    self.entity_masks = tf.placeholder(tf.string, (None, None), 'entity_masks')  # batch
    self.triples = tf.placeholder(tf.string, (None, None, 3), 'triples')  # batch
    self.posts_triple = tf.placeholder(tf.int32, (None, None, 1), 'enc_triples')  # batch
    self.responses_triple = tf.placeholder(tf.string, (None, None, 3), 'dec_triples')  # batch
    self.match_triples = tf.placeholder(tf.int32, (None, None), 'match_triples')  # batch

    encoder_batch_size, encoder_len = tf.unstack(tf.shape(self.posts))
    triple_num = tf.shape(self.triples)[1]

    #use_triples = tf.reduce_sum(tf.cast(tf.greater_equal(self.match_triples, 0), tf.float32), axis=-1)
    # one-hot over the matched-triple index per decoder step; summing the
    # one-hot out marks which steps actually use a knowledge triple
    one_hot_triples = tf.one_hot(self.match_triples, triple_num)
    use_triples = tf.reduce_sum(one_hot_triples, axis=[2])

    # ---- in-graph vocab/entity tables (string <-> index, checkpointed) ----
    self.symbol2index = MutableHashTable(
        key_dtype=tf.string,
        value_dtype=tf.int64,
        default_value=UNK_ID,
        shared_name="in_table",
        name="in_table",
        checkpoint=True)
    self.index2symbol = MutableHashTable(
        key_dtype=tf.int64,
        value_dtype=tf.string,
        default_value='_UNK',
        shared_name="out_table",
        name="out_table",
        checkpoint=True)
    self.entity2index = MutableHashTable(
        key_dtype=tf.string,
        value_dtype=tf.int64,
        default_value=NONE_ID,
        shared_name="entity_in_table",
        name="entity_in_table",
        checkpoint=True)
    self.index2entity = MutableHashTable(
        key_dtype=tf.int64,
        value_dtype=tf.string,
        default_value='_NONE',
        shared_name="entity_out_table",
        name="entity_out_table",
        checkpoint=True)

    # build the vocab table (string to index)
    self.posts_word_id = self.symbol2index.lookup(self.posts)   # batch*len
    self.posts_entity_id = self.entity2index.lookup(self.posts)   # batch*len
    #self.posts_word_id = tf.Print(self.posts_word_id, ['use_triples', use_triples, 'one_hot_triples', one_hot_triples], summarize=1e6)
    self.responses_target = self.symbol2index.lookup(self.responses)   #batch*len

    batch_size, decoder_len = tf.shape(self.responses)[0], tf.shape(self.responses)[1]
    # decoder input = GO followed by the target with its last token dropped
    self.responses_word_id = tf.concat([tf.ones([batch_size, 1], dtype=tf.int64)*GO_ID,
        tf.split(self.responses_target, [decoder_len-1, 1], 1)[0]], 1)   # batch*len
    # 1.0 at every position up to the response length, 0.0 after
    self.decoder_mask = tf.reshape(tf.cumsum(tf.one_hot(self.responses_length-1,
        decoder_len), reverse=True, axis=1), [-1, decoder_len])

    # build the embedding table (index to vector)
    if embed is None:
        # initialize the embedding randomly
        self.embed = tf.get_variable('word_embed', [num_symbols, num_embed_units], tf.float32)
    else:
        # initialize the embedding by pre-trained word vectors
        self.embed = tf.get_variable('word_embed', dtype=tf.float32, initializer=embed)
    if entity_embed is None:
        # initialize the embedding randomly (frozen: trainable=False)
        self.entity_trans = tf.get_variable('entity_embed', [num_entities, num_trans_units],
                                            tf.float32, trainable=False)
    else:
        # initialize the embedding by pre-trained word vectors (frozen)
        self.entity_trans = tf.get_variable('entity_embed', dtype=tf.float32,
                                            initializer=entity_embed, trainable=False)

    # trainable tanh projection on top of the frozen entity embeddings
    self.entity_trans_transformed = tf.layers.dense(self.entity_trans, num_trans_units,
                                                    activation=tf.tanh,
                                                    name='trans_transformation')
    # 7 zero rows prepended — presumably special/padding entity ids; confirm
    padding_entity = tf.get_variable('entity_padding_embed', [7, num_trans_units],
                                     dtype=tf.float32, initializer=tf.zeros_initializer())
    self.entity_embed = tf.concat([padding_entity, self.entity_trans_transformed], axis=0)

    # each triple -> concatenated (head, relation, tail) embedding
    triples_embedding = tf.reshape(tf.nn.embedding_lookup(self.entity_embed,
        self.entity2index.lookup(self.triples)),
        [encoder_batch_size, triple_num, 3 * num_trans_units])
    entities_word_embedding = tf.reshape(tf.nn.embedding_lookup(self.embed,
        self.symbol2index.lookup(self.entities)),
        [encoder_batch_size, -1, num_embed_units])

    self.encoder_input = tf.nn.embedding_lookup(self.embed, self.posts_word_id)   #batch*len*unit
    self.decoder_input = tf.nn.embedding_lookup(self.embed, self.responses_word_id)   #batch*len*unit

    encoder_cell = MultiRNNCell([GRUCell(num_units) for _ in range(num_layers)])
    decoder_cell = MultiRNNCell([GRUCell(num_units) for _ in range(num_layers)])

    # rnn encoder
    encoder_output, encoder_state = dynamic_rnn(encoder_cell, self.encoder_input,
        self.posts_length, dtype=tf.float32, scope="encoder")

    # get output projection function
    output_fn, selector_fn, sequence_loss, sampled_sequence_loss, total_loss = output_projection_layer(num_units, num_symbols, num_samples)

    # ---- training decoder (teacher forcing) ----
    with tf.variable_scope('decoder'):
        # get attention function
        attention_keys_init, attention_values_init, attention_score_fn_init, attention_construct_fn_init \
            = prepare_attention(encoder_output, 'bahdanau', num_units, imem=triples_embedding, output_alignments=output_alignments)#'luong', num_units)
        decoder_fn_train = attention_decoder_fn_train(
            encoder_state, attention_keys_init, attention_values_init,
            attention_score_fn_init, attention_construct_fn_init,
            output_alignments=output_alignments,
            max_length=tf.reduce_max(self.responses_length))
        self.decoder_output, _, alignments_ta = dynamic_rnn_decoder(decoder_cell,
            decoder_fn_train, self.decoder_input, self.responses_length,
            scope="decoder_rnn")
        if output_alignments:
            # alignments come back time-major; flip to batch-major
            self.alignments = tf.transpose(alignments_ta.stack(), perm=[1,0,2])
            #self.alignments = tf.Print(self.alignments, [self.alignments], summarize=1e8)
            self.decoder_loss, self.ppx_loss, self.sentence_ppx = total_loss(
                self.decoder_output, self.responses_target, self.decoder_mask,
                self.alignments, triples_embedding, use_triples, one_hot_triples)
            self.sentence_ppx = tf.identity(self.sentence_ppx, 'ppx_loss')
            #self.decoder_loss = tf.Print(self.decoder_loss, ['decoder_loss', self.decoder_loss], summarize=1e6)
        else:
            self.decoder_loss, self.sentence_ppx = sequence_loss(
                self.decoder_output, self.responses_target, self.decoder_mask)
            self.sentence_ppx = tf.identity(self.sentence_ppx, 'ppx_loss')

    # ---- inference decoder (shares variables with the training decoder) ----
    with tf.variable_scope('decoder', reuse=True):
        # get attention function
        attention_keys, attention_values, attention_score_fn, attention_construct_fn \
            = prepare_attention(encoder_output, 'bahdanau', num_units, reuse=True, imem=triples_embedding, output_alignments=output_alignments)#'luong', num_units)
        decoder_fn_inference = attention_decoder_fn_inference(
            output_fn, encoder_state, attention_keys, attention_values,
            attention_score_fn, attention_construct_fn, self.embed, GO_ID, EOS_ID,
            max_length, num_symbols, imem=entities_word_embedding,
            selector_fn=selector_fn)
        self.decoder_distribution, _, output_ids_ta = dynamic_rnn_decoder(decoder_cell,
            decoder_fn_inference, scope="decoder_rnn")
        if output_alignments:
            output_len = tf.shape(self.decoder_distribution)[1]
            output_ids = tf.transpose(output_ids_ta.gather(tf.range(output_len)))
            # positive ids select vocabulary words; negated ids index into the
            # per-example entity list (offset by example position in the batch)
            word_ids = tf.cast(tf.clip_by_value(output_ids, 0, num_symbols), tf.int64)
            entity_ids = tf.reshape(tf.clip_by_value(-output_ids, 0, num_symbols) +
                tf.reshape(tf.range(encoder_batch_size) * tf.shape(entities_word_embedding)[1], [-1, 1]), [-1])
            entities = tf.reshape(tf.gather(tf.reshape(self.entities, [-1]), entity_ids), [-1, output_len])
            words = self.index2symbol.lookup(word_ids)
            # emit a word where the id is positive, a copied entity otherwise
            self.generation = tf.where(output_ids > 0, words, entities, name='generation')
        else:
            self.generation_index = tf.argmax(self.decoder_distribution, 2)
            self.generation = self.index2symbol.lookup(self.generation_index, name='generation')

    # initialize the training process
    self.learning_rate = tf.Variable(float(learning_rate), trainable=False, dtype=tf.float32)
    self.learning_rate_decay_op = self.learning_rate.assign(
        self.learning_rate * learning_rate_decay_factor)
    self.global_step = tf.Variable(0, trainable=False)

    self.params = tf.global_variables()

    # calculate the gradient of parameters
    #opt = tf.train.GradientDescentOptimizer(self.learning_rate)
    opt = tf.train.AdamOptimizer(learning_rate=learning_rate)
    self.lr = opt._lr  # NOTE(review): reads a private optimizer attribute
    gradients = tf.gradients(self.decoder_loss, self.params)
    clipped_gradients, self.gradient_norm = tf.clip_by_global_norm(gradients, max_gradient_norm)
    self.update = opt.apply_gradients(zip(clipped_gradients, self.params),
        global_step=self.global_step)

    # tensorboard summaries: scalar loss + a histogram per trainable variable
    tf.summary.scalar('decoder_loss', self.decoder_loss)
    for each in tf.trainable_variables():
        tf.summary.histogram(each.name, each)
    self.merged_summary_op = tf.summary.merge_all()

    # rolling checkpoints plus a long-retention per-epoch saver
    self.saver = tf.train.Saver(write_version=tf.train.SaverDef.V2,
        max_to_keep=3, pad_step_number=True, keep_checkpoint_every_n_hours=1.0)
    self.saver_epoch = tf.train.Saver(write_version=tf.train.SaverDef.V2,
        max_to_keep=1000, pad_step_number=True)
def RNN(x):
    """Feed `x` through one GRU layer and return the trailing slice of its output.

    NOTE(review): with batch-major `dynamic_rnn` output ([batch, time, units]),
    `outputs[-1]` selects the last *batch element* across all time steps, not
    the last time step — confirm the caller's data layout makes this the
    intended indexing.
    """
    # single GRU cell; `num_units` and `dataType` come from the enclosing module
    cell = nn.rnn_cell.GRUCell(num_units, name="GRU")
    # unroll over the input and keep only the output tensor
    outputs, _state = nn.dynamic_rnn(cell, x, dtype=dataType)
    return outputs[-1]
def __init__(self, num_symbols, num_embed_units, num_units, num_layers, is_train,
             vocab=None, embed=None, learning_rate=0.1,
             learning_rate_decay_factor=0.95, max_gradient_norm=5.0,
             num_samples=512, max_length=30, use_lstm=True):
    """Build a four-turn dialogue model with shared graph attention per turn.

    Each context turn attends over its (head, relation, tail) entity triples
    using shared 'graph_attention' weights.  Turns are chained: turn k's
    decoder consumes turn k+1's input, and the auxiliary losses on turns 2-4
    are added to the response loss for training.
    """
    # ---- placeholders for the four context turns and the response ----
    self.posts_1 = tf.placeholder(tf.string, shape=(None, None))
    self.posts_2 = tf.placeholder(tf.string, shape=(None, None))
    self.posts_3 = tf.placeholder(tf.string, shape=(None, None))
    self.posts_4 = tf.placeholder(tf.string, shape=(None, None))
    # entity triples per turn: batch * len * triple_num * (head, relation, tail)
    self.entity_1 = tf.placeholder(tf.string, shape=(None, None, None, 3))
    self.entity_2 = tf.placeholder(tf.string, shape=(None, None, None, 3))
    self.entity_3 = tf.placeholder(tf.string, shape=(None, None, None, 3))
    self.entity_4 = tf.placeholder(tf.string, shape=(None, None, None, 3))
    self.entity_mask_1 = tf.placeholder(tf.float32, shape=(None, None, None))
    self.entity_mask_2 = tf.placeholder(tf.float32, shape=(None, None, None))
    self.entity_mask_3 = tf.placeholder(tf.float32, shape=(None, None, None))
    self.entity_mask_4 = tf.placeholder(tf.float32, shape=(None, None, None))
    self.posts_length_1 = tf.placeholder(tf.int32, shape=(None))
    self.posts_length_2 = tf.placeholder(tf.int32, shape=(None))
    self.posts_length_3 = tf.placeholder(tf.int32, shape=(None))
    self.posts_length_4 = tf.placeholder(tf.int32, shape=(None))
    self.responses = tf.placeholder(tf.string, shape=(None, None))
    self.responses_length = tf.placeholder(tf.int32, shape=(None))

    self.epoch = tf.Variable(0, trainable=False, name='epoch')
    self.epoch_add_op = self.epoch.assign(self.epoch + 1)

    if is_train:
        self.symbols = tf.Variable(vocab, trainable=False, name="symbols")
    else:
        # placeholder vocab; the real one is restored from the checkpoint
        self.symbols = tf.Variable(np.array(['.'] * num_symbols), name="symbols")

    # static string -> index table over the vocab variable
    self.symbol2index = HashTable(KeyValueTensorInitializer(
        self.symbols,
        tf.Variable(
            np.array([i for i in range(num_symbols)], dtype=np.int32), False)),
        default_value=UNK_ID,
        name="symbol2index")

    self.posts_input_1 = self.symbol2index.lookup(self.posts_1)
    # turns 2-4 double as decoder targets and as next-turn encoder inputs
    self.posts_2_target = self.posts_2_embed = self.symbol2index.lookup(
        self.posts_2)
    self.posts_3_target = self.posts_3_embed = self.symbol2index.lookup(
        self.posts_3)
    self.posts_4_target = self.posts_4_embed = self.symbol2index.lookup(
        self.posts_4)
    self.responses_target = self.symbol2index.lookup(self.responses)

    batch_size, decoder_len = tf.shape(self.posts_1)[0], tf.shape(
        self.responses)[1]

    # drop the last token and prepend GO -> teacher-forcing inputs, turns 2-4
    self.posts_input_2 = tf.concat([
        tf.ones([batch_size, 1], dtype=tf.int32) * GO_ID,
        tf.split(self.posts_2_embed, [tf.shape(self.posts_2)[1] - 1, 1], 1)[0]
    ], 1)
    self.posts_input_3 = tf.concat([
        tf.ones([batch_size, 1], dtype=tf.int32) * GO_ID,
        tf.split(self.posts_3_embed, [tf.shape(self.posts_3)[1] - 1, 1], 1)[0]
    ], 1)
    self.posts_input_4 = tf.concat([
        tf.ones([batch_size, 1], dtype=tf.int32) * GO_ID,
        tf.split(self.posts_4_embed, [tf.shape(self.posts_4)[1] - 1, 1], 1)[0]
    ], 1)

    # NOTE(review): exact duplicates of the lookups above; harmless but redundant
    self.responses_target = self.symbol2index.lookup(self.responses)
    batch_size, decoder_len = tf.shape(self.posts_1)[0], tf.shape(
        self.responses)[1]

    self.responses_input = tf.concat([
        tf.ones([batch_size, 1], dtype=tf.int32) * GO_ID,
        tf.split(self.responses_target, [decoder_len - 1, 1], 1)[0]
    ], 1)

    # per-turn masks: 1.0 at positions up to the sequence length, 0.0 after
    self.encoder_2_mask = tf.reshape(
        tf.cumsum(tf.one_hot(self.posts_length_2 - 1,
                             tf.shape(self.posts_2)[1]),
                  reverse=True, axis=1), [-1, tf.shape(self.posts_2)[1]])
    self.encoder_3_mask = tf.reshape(
        tf.cumsum(tf.one_hot(self.posts_length_3 - 1,
                             tf.shape(self.posts_3)[1]),
                  reverse=True, axis=1), [-1, tf.shape(self.posts_3)[1]])
    self.encoder_4_mask = tf.reshape(
        tf.cumsum(tf.one_hot(self.posts_length_4 - 1,
                             tf.shape(self.posts_4)[1]),
                  reverse=True, axis=1), [-1, tf.shape(self.posts_4)[1]])
    self.decoder_mask = tf.reshape(
        tf.cumsum(tf.one_hot(self.responses_length - 1, decoder_len),
                  reverse=True, axis=1), [-1, decoder_len])

    # word embedding table: random or pre-trained
    if embed is None:
        self.embed = tf.get_variable('embed', [num_symbols, num_embed_units],
                                     tf.float32)
    else:
        self.embed = tf.get_variable('embed', dtype=tf.float32,
                                     initializer=embed)

    self.encoder_input_1 = tf.nn.embedding_lookup(self.embed, self.posts_input_1)
    self.encoder_input_2 = tf.nn.embedding_lookup(self.embed, self.posts_input_2)
    self.encoder_input_3 = tf.nn.embedding_lookup(self.embed, self.posts_input_3)
    self.encoder_input_4 = tf.nn.embedding_lookup(self.embed, self.posts_input_4)
    self.decoder_input = tf.nn.embedding_lookup(self.embed, self.responses_input)

    # triple embeddings per turn, flattened to 3*embed per triple
    entity_embedding_1 = tf.reshape(
        tf.nn.embedding_lookup(self.embed,
                               self.symbol2index.lookup(self.entity_1)), [
            batch_size,
            tf.shape(self.entity_1)[1],
            tf.shape(self.entity_1)[2], 3 * num_embed_units
        ])
    entity_embedding_2 = tf.reshape(
        tf.nn.embedding_lookup(self.embed,
                               self.symbol2index.lookup(self.entity_2)), [
            batch_size,
            tf.shape(self.entity_2)[1],
            tf.shape(self.entity_2)[2], 3 * num_embed_units
        ])
    entity_embedding_3 = tf.reshape(
        tf.nn.embedding_lookup(self.embed,
                               self.symbol2index.lookup(self.entity_3)), [
            batch_size,
            tf.shape(self.entity_3)[1],
            tf.shape(self.entity_3)[2], 3 * num_embed_units
        ])
    entity_embedding_4 = tf.reshape(
        tf.nn.embedding_lookup(self.embed,
                               self.symbol2index.lookup(self.entity_4)), [
            batch_size,
            tf.shape(self.entity_4)[1],
            tf.shape(self.entity_4)[2], 3 * num_embed_units
        ])
    head_1, relation_1, tail_1 = tf.split(entity_embedding_1,
                                          [num_embed_units] * 3, axis=3)
    head_2, relation_2, tail_2 = tf.split(entity_embedding_2,
                                          [num_embed_units] * 3, axis=3)
    head_3, relation_3, tail_3 = tf.split(entity_embedding_3,
                                          [num_embed_units] * 3, axis=3)
    head_4, relation_4, tail_4 = tf.split(entity_embedding_4,
                                          [num_embed_units] * 3, axis=3)

    # shared-weight graph attention over each turn's triples
    with tf.variable_scope('graph_attention'):
        #[batch_size, max_reponse_length, max_triple_num, 2*embed_units]
        head_tail_1 = tf.concat([head_1, tail_1], axis=3)
        #[batch_size, max_reponse_length, max_triple_num, embed_units]
        head_tail_transformed_1 = tf.layers.dense(
            head_tail_1, num_embed_units, activation=tf.tanh,
            name='head_tail_transform')
        #[batch_size, max_reponse_length, max_triple_num, embed_units]
        relation_transformed_1 = tf.layers.dense(relation_1, num_embed_units,
                                                 name='relation_transform')
        #[batch_size, max_reponse_length, max_triple_num]
        e_weight_1 = tf.reduce_sum(relation_transformed_1 *
                                   head_tail_transformed_1, axis=3)
        #[batch_size, max_reponse_length, max_triple_num]
        alpha_weight_1 = tf.nn.softmax(e_weight_1)
        #[batch_size, max_reponse_length, embed_units]
        graph_embed_1 = tf.reduce_sum(
            tf.expand_dims(alpha_weight_1, 3) *
            (tf.expand_dims(self.entity_mask_1, 3) * head_tail_1), axis=2)

    with tf.variable_scope('graph_attention', reuse=True):
        head_tail_2 = tf.concat([head_2, tail_2], axis=3)
        head_tail_transformed_2 = tf.layers.dense(
            head_tail_2, num_embed_units, activation=tf.tanh,
            name='head_tail_transform')
        relation_transformed_2 = tf.layers.dense(relation_2, num_embed_units,
                                                 name='relation_transform')
        e_weight_2 = tf.reduce_sum(relation_transformed_2 *
                                   head_tail_transformed_2, axis=3)
        alpha_weight_2 = tf.nn.softmax(e_weight_2)
        graph_embed_2 = tf.reduce_sum(
            tf.expand_dims(alpha_weight_2, 3) *
            (tf.expand_dims(self.entity_mask_2, 3) * head_tail_2), axis=2)

    with tf.variable_scope('graph_attention', reuse=True):
        head_tail_3 = tf.concat([head_3, tail_3], axis=3)
        head_tail_transformed_3 = tf.layers.dense(
            head_tail_3, num_embed_units, activation=tf.tanh,
            name='head_tail_transform')
        relation_transformed_3 = tf.layers.dense(relation_3, num_embed_units,
                                                 name='relation_transform')
        e_weight_3 = tf.reduce_sum(relation_transformed_3 *
                                   head_tail_transformed_3, axis=3)
        alpha_weight_3 = tf.nn.softmax(e_weight_3)
        graph_embed_3 = tf.reduce_sum(
            tf.expand_dims(alpha_weight_3, 3) *
            (tf.expand_dims(self.entity_mask_3, 3) * head_tail_3), axis=2)

    with tf.variable_scope('graph_attention', reuse=True):
        head_tail_4 = tf.concat([head_4, tail_4], axis=3)
        head_tail_transformed_4 = tf.layers.dense(
            head_tail_4, num_embed_units, activation=tf.tanh,
            name='head_tail_transform')
        relation_transformed_4 = tf.layers.dense(relation_4, num_embed_units,
                                                 name='relation_transform')
        e_weight_4 = tf.reduce_sum(relation_transformed_4 *
                                   head_tail_transformed_4, axis=3)
        alpha_weight_4 = tf.nn.softmax(e_weight_4)
        graph_embed_4 = tf.reduce_sum(
            tf.expand_dims(alpha_weight_4, 3) *
            (tf.expand_dims(self.entity_mask_4, 3) * head_tail_4), axis=2)

    if use_lstm:
        # NOTE(review): list multiplication shares ONE cell object across layers
        cell = MultiRNNCell([LSTMCell(num_units)] * num_layers)
    else:
        cell = MultiRNNCell([GRUCell(num_units)] * num_layers)

    output_fn, sampled_sequence_loss = output_projection_layer(
        num_units, num_symbols, num_samples)

    # turn 1 encoder; its state seeds the turn-2 "decoder"
    encoder_output_1, encoder_state_1 = dynamic_rnn(cell, self.encoder_input_1,
                                                    self.posts_length_1,
                                                    dtype=tf.float32,
                                                    scope="encoder")
    attention_keys_1, attention_values_1, attention_score_fn_1, attention_construct_fn_1 \
        = attention_decoder_fn.prepare_attention(graph_embed_1, encoder_output_1, 'luong', num_units)
    decoder_fn_train_1 = attention_decoder_fn.attention_decoder_fn_train(
        encoder_state_1, attention_keys_1, attention_values_1,
        attention_score_fn_1, attention_construct_fn_1,
        max_length=tf.reduce_max(self.posts_length_2))
    encoder_output_2, encoder_state_2, alignments_ta_2 = dynamic_rnn_decoder(
        cell, decoder_fn_train_1, self.encoder_input_2, self.posts_length_2,
        scope="decoder")
    self.alignments_2 = tf.transpose(alignments_ta_2.stack(), perm=[1, 0, 2])
    # auxiliary loss: predict turn 2 from turn 1
    self.decoder_loss_2 = sampled_sequence_loss(encoder_output_2,
                                                self.posts_2_target,
                                                self.encoder_2_mask)

    # remaining turns reuse all decoder/attention variables
    with variable_scope.variable_scope('', reuse=True):
        attention_keys_2, attention_values_2, attention_score_fn_2, attention_construct_fn_2 \
            = attention_decoder_fn.prepare_attention(graph_embed_2, encoder_output_2, 'luong', num_units)
        decoder_fn_train_2 = attention_decoder_fn.attention_decoder_fn_train(
            encoder_state_2, attention_keys_2, attention_values_2,
            attention_score_fn_2, attention_construct_fn_2,
            max_length=tf.reduce_max(self.posts_length_3))
        encoder_output_3, encoder_state_3, alignments_ta_3 = dynamic_rnn_decoder(
            cell, decoder_fn_train_2, self.encoder_input_3, self.posts_length_3,
            scope="decoder")
        self.alignments_3 = tf.transpose(alignments_ta_3.stack(), perm=[1, 0, 2])
        self.decoder_loss_3 = sampled_sequence_loss(
            encoder_output_3, self.posts_3_target, self.encoder_3_mask)
        attention_keys_3, attention_values_3, attention_score_fn_3, attention_construct_fn_3 \
            = attention_decoder_fn.prepare_attention(graph_embed_3, encoder_output_3, 'luong', num_units)
        decoder_fn_train_3 = attention_decoder_fn.attention_decoder_fn_train(
            encoder_state_3, attention_keys_3, attention_values_3,
            attention_score_fn_3, attention_construct_fn_3,
            max_length=tf.reduce_max(self.posts_length_4))
        encoder_output_4, encoder_state_4, alignments_ta_4 = dynamic_rnn_decoder(
            cell, decoder_fn_train_3, self.encoder_input_4, self.posts_length_4,
            scope="decoder")
        self.alignments_4 = tf.transpose(alignments_ta_4.stack(), perm=[1, 0, 2])
        self.decoder_loss_4 = sampled_sequence_loss(
            encoder_output_4, self.posts_4_target, self.encoder_4_mask)
        # attention over turn 4 drives the final response decoder
        attention_keys, attention_values, attention_score_fn, attention_construct_fn \
            = attention_decoder_fn.prepare_attention(graph_embed_4, encoder_output_4, 'luong', num_units)

    if is_train:
        with variable_scope.variable_scope('', reuse=True):
            decoder_fn_train = attention_decoder_fn.attention_decoder_fn_train(
                encoder_state_4, attention_keys, attention_values,
                attention_score_fn, attention_construct_fn,
                max_length=tf.reduce_max(self.responses_length))
            self.decoder_output, _, alignments_ta = dynamic_rnn_decoder(
                cell, decoder_fn_train, self.decoder_input,
                self.responses_length, scope="decoder")
            self.alignments = tf.transpose(alignments_ta.stack(), perm=[1, 0, 2])
            self.decoder_loss = sampled_sequence_loss(
                self.decoder_output, self.responses_target, self.decoder_mask)

        self.params = tf.trainable_variables()

        self.learning_rate = tf.Variable(float(learning_rate),
                                         trainable=False, dtype=tf.float32)
        self.learning_rate_decay_op = self.learning_rate.assign(
            self.learning_rate * learning_rate_decay_factor)
        self.global_step = tf.Variable(0, trainable=False)

        #opt = tf.train.GradientDescentOptimizer(self.learning_rate)
        opt = tf.train.MomentumOptimizer(self.learning_rate, 0.9)
        # train on the response loss plus the three auxiliary turn losses
        gradients = tf.gradients(
            self.decoder_loss + self.decoder_loss_2 + self.decoder_loss_3 +
            self.decoder_loss_4, self.params)
        clipped_gradients, self.gradient_norm = tf.clip_by_global_norm(
            gradients, max_gradient_norm)
        self.update = opt.apply_gradients(zip(clipped_gradients, self.params),
                                          global_step=self.global_step)
    else:
        with variable_scope.variable_scope('', reuse=True):
            decoder_fn_inference = attention_decoder_fn.attention_decoder_fn_inference(
                output_fn, encoder_state_4, attention_keys, attention_values,
                attention_score_fn, attention_construct_fn, self.embed, GO_ID,
                EOS_ID, max_length, num_symbols)
            self.decoder_distribution, _, alignments_ta = dynamic_rnn_decoder(
                cell, decoder_fn_inference, scope="decoder")
            output_len = tf.shape(self.decoder_distribution)[1]
            self.alignments = tf.transpose(
                alignments_ta.gather(tf.range(output_len)), [1, 0, 2])
            # argmax over the distribution with the first two ids masked out
            self.generation_index = tf.argmax(
                tf.split(self.decoder_distribution, [2, num_symbols - 2],
                         2)[1], 2) + 2  # for removing UNK
            self.generation = tf.nn.embedding_lookup(self.symbols,
                                                     self.generation_index,
                                                     name="generation")
            self.params = tf.trainable_variables()

    self.saver = tf.train.Saver(tf.global_variables(),
                                write_version=tf.train.SaverDef.V2,
                                max_to_keep=10, pad_step_number=True,
                                keep_checkpoint_every_n_hours=1.0)
def __init__(self, num_symbols, num_embed_units, num_units, num_layers,
             num_labels, embed, learning_rate=0.5, max_gradient_norm=5.0):
    """Build an RNN text classifier over raw string token inputs.

    `num_layers == 1` builds a single dropout-wrapped RNN; any other value
    builds a forward RNN plus an RNN over pre-reversed text (fed separately)
    and sums their final states.  NOTE(review): the output layer is
    hard-coded to 5 units and `num_labels` is never used — confirm they
    should agree.
    """
    #todo: implement placeholders
    self.texts = tf.placeholder(tf.string, [None, None],
                                name="texts")  # shape: batch*len
    self.texts_length = tf.placeholder(tf.int64, [None],
                                       name="texts_length")  # shape: batch
    self.labels = tf.placeholder(tf.int64, [None],
                                 name="labels")  # shape: batch

    self.symbol2index = MutableHashTable(key_dtype=tf.string,
                                         value_dtype=tf.int64,
                                         default_value=UNK_ID,
                                         shared_name="in_table",
                                         name="in_table",
                                         checkpoint=True)
    # build the vocab table (string to index)

    # initialize the training process
    self.learning_rate = tf.Variable(float(learning_rate),
                                     trainable=False, dtype=tf.float32)
    learning_rate_decay_factor = 0.9
    self.learning_rate_decay_op = self.learning_rate.assign(
        self.learning_rate * learning_rate_decay_factor)
    self.global_step = tf.Variable(0, trainable=False)
    self.epoch = tf.Variable(0, trainable=False)
    self.epoch_add_op = self.epoch.assign(self.epoch + 1)

    self.index_input = self.symbol2index.lookup(self.texts)  # batch*len

    # build the embedding table (index to vector)
    if embed is None:
        # initialize the embedding randomly
        self.embed = tf.get_variable('embed', [num_symbols, num_embed_units],
                                     tf.float32)
    else:
        # initialize the embedding by pre-trained word vectors
        self.embed = tf.get_variable('embed', dtype=tf.float32,
                                     initializer=embed)

    self.embed_input = tf.nn.embedding_lookup(
        self.embed, self.index_input)  #batch*len*embed_unit

    # cell type selector; only 'lstm' is ever active here
    model = 'lstm'
    if num_layers == 1:
        if (model == 'rnn'):
            cell = BasicRNNCell(num_units)
        elif (model == 'gru'):
            cell = GRUCell(num_units)
        elif (model == 'lstm'):
            cell = BasicLSTMCell(num_units)
        cell_do = tf.nn.rnn_cell.DropoutWrapper(
            cell, input_keep_prob=1.0, output_keep_prob=FLAGS.keep_prob)
        outputs, states = dynamic_rnn(cell_do, self.embed_input,
                                      self.texts_length, dtype=tf.float32,
                                      scope="rnn")
        #todo: implement unfinished networks
        outputs_flat = tf.reduce_mean(outputs, 1)
        if (model == 'lstm'):
            # LSTM state is a (c, h) tuple; index 0 selects c.
            # NOTE(review): h (index 1) is the conventional classifier
            # feature — confirm c is intended here.
            states = states[0]
        # W_f = weight_variable([tf.app.flags.FLAGS.units, 5])
        # b_f = bias_variable([5])
        # logits = tf.matmul(outputs_flat, W_f) + b_f
        # fc_layer = tf.layers.dense(inputs = states, units = 32, activation = tf.nn.relu)
        logits = tf.layers.dense(inputs=states, units=5, activation=None)
    else:
        # bidirectional variant: caller feeds the reversed text separately
        self.reverse_texts = tf.placeholder(
            tf.string, [None, None],
            name="reverse_texts")  # shape: batch*len
        self.index_reverse_input = self.symbol2index.lookup(
            self.reverse_texts)
        self.embed_reverse_input = tf.nn.embedding_lookup(
            self.embed, self.index_reverse_input)  #batch*len*embed_unit
        if (model == 'rnn'):
            cell1 = BasicRNNCell(num_units)
            cell2 = BasicRNNCell(num_units)
        elif (model == 'gru'):
            cell1 = GRUCell(num_units)
            cell2 = GRUCell(num_units)
        elif (model == 'lstm'):
            cell1 = BasicLSTMCell(num_units)
            cell2 = BasicLSTMCell(num_units)
        cell1_do = tf.nn.rnn_cell.DropoutWrapper(
            cell1, input_keep_prob=1.0, output_keep_prob=FLAGS.keep_prob)
        cell2_do = tf.nn.rnn_cell.DropoutWrapper(
            cell2, input_keep_prob=1.0, output_keep_prob=FLAGS.keep_prob)
        # NOTE(review): both calls use scope="rnn" — presumably intended to
        # share weights between directions; confirm reuse is set up upstream
        outputs1, states1 = dynamic_rnn(cell1_do, self.embed_input,
                                        self.texts_length, dtype=tf.float32,
                                        scope="rnn")
        outputs2, states2 = dynamic_rnn(cell2_do, self.embed_reverse_input,
                                        self.texts_length, dtype=tf.float32,
                                        scope="rnn")
        if (model == 'lstm'):
            states = states1[0] + states2[0]
        else:
            states = states1 + states2
        # fc_layer = tf.layers.dense(inputs = states, units = 32, activation = tf.nn.relu)
        logits = tf.layers.dense(inputs=states, units=5, activation=None)

    # summed (not mean) cross-entropy; mean_loss below is what is optimized
    self.loss = tf.reduce_sum(
        tf.nn.sparse_softmax_cross_entropy_with_logits(labels=self.labels,
                                                       logits=logits),
        name='loss')
    mean_loss = self.loss / tf.cast(tf.shape(self.labels)[0],
                                    dtype=tf.float32)
    predict_labels = tf.argmax(logits, 1, 'predict_labels')
    # count of correct predictions in the batch (not a ratio)
    self.accuracy = tf.reduce_sum(tf.cast(
        tf.equal(self.labels, predict_labels), tf.int32),
                                  name='accuracy')

    self.params = tf.trainable_variables()

    # calculate the gradient of parameters
    opt = tf.train.GradientDescentOptimizer(self.learning_rate)
    # opt = tf.train.AdamOptimizer(self.learning_rate)
    gradients = tf.gradients(mean_loss, self.params)
    clipped_gradients, self.gradient_norm = tf.clip_by_global_norm(
        gradients, max_gradient_norm)
    self.update = opt.apply_gradients(zip(clipped_gradients, self.params),
                                      global_step=self.global_step)

    # tensorboard: scalar loss + a histogram per trainable variable
    tf.summary.scalar('loss/step', self.loss)
    for each in tf.trainable_variables():
        tf.summary.histogram(each.name, each)
    self.merged_summary_op = tf.summary.merge_all()

    self.saver = tf.train.Saver(write_version=tf.train.SaverDef.V2,
                                max_to_keep=3, pad_step_number=True,
                                keep_checkpoint_every_n_hours=1.0)
def __init__(self, data, args, embed):
    """Build a sentence VAE: GRU encoder, Gaussian latent, GRU decoder.

    The training path teacher-forces the decoder from the recognition
    posterior; the inference path (``use_prior=True``) decodes greedily from
    the prior.  Loss = reconstruction cross-entropy + a KL term annealed
    over ``args.full_kl_step`` steps.
    """
    with tf.variable_scope("input"):
        with tf.variable_scope("embedding"):
            # build the embedding table and embedding input
            if embed is None:
                # initialize the embedding randomly
                self.embed = tf.get_variable(
                    'embed', [data.vocab_size, args.embedding_size],
                    tf.float32)
            else:
                # initialize the embedding by pre-trained word vectors
                self.embed = tf.get_variable('embed', dtype=tf.float32,
                                             initializer=embed)

        # sentences arrive already tokenized to int ids (GO ... EOS)
        self.sentence = tf.placeholder(tf.int32, (None, None),
                                       'sen_inps')  # batch*len
        self.sentence_length = tf.placeholder(tf.int32, (None, ),
                                              'sen_lens')  # batch
        # flag: sample z from the prior (inference) or posterior (training)
        self.use_prior = tf.placeholder(dtype=tf.bool, name="use_prior")

        batch_size, batch_len = tf.shape(self.sentence)[0], tf.shape(
            self.sentence)[1]
        self.decoder_max_len = batch_len - 1

        self.encoder_input = tf.nn.embedding_lookup(
            self.embed, self.sentence)  # batch*len*unit
        self.encoder_len = self.sentence_length

        decoder_input = tf.split(self.sentence,
                                 [self.decoder_max_len, 1], 1)[0]  # no eos_id
        self.decoder_input = tf.nn.embedding_lookup(
            self.embed, decoder_input)  # batch*(len-1)*unit
        self.decoder_target = tf.split(
            self.sentence, [1, self.decoder_max_len],
            1)[1]  # no go_id, batch*(len-1)
        self.decoder_len = self.sentence_length - 1
        self.decoder_mask = tf.sequence_mask(
            self.decoder_len, self.decoder_max_len,
            dtype=tf.float32)  # batch*(len-1)

    # initialize the training process
    self.learning_rate = tf.Variable(float(args.lr), trainable=False,
                                     dtype=tf.float32)
    self.learning_rate_decay_op = self.learning_rate.assign(
        self.learning_rate * args.lr_decay)
    self.global_step = tf.Variable(0, trainable=False)

    # build rnn_cell
    cell_enc = tf.nn.rnn_cell.GRUCell(args.eh_size)
    cell_dec = tf.nn.rnn_cell.GRUCell(args.dh_size)

    # build encoder
    with tf.variable_scope('encoder'):
        encoder_output, encoder_state = dynamic_rnn(cell_enc,
                                                    self.encoder_input,
                                                    self.encoder_len,
                                                    dtype=tf.float32,
                                                    scope="encoder_rnn")

    with tf.variable_scope('recognition_net'):
        recog_input = encoder_state
        self.recog_mu = tf.layers.dense(inputs=recog_input, units=args.z_dim,
                                        activation=None, name='recog_mu')
        self.recog_logvar = tf.layers.dense(inputs=recog_input,
                                            units=args.z_dim,
                                            activation=None,
                                            name='recog_logvar')
        # reparameterization trick: z = mu + sigma * eps,  eps ~ N(0, I)
        epsilon = tf.random_normal(tf.shape(self.recog_logvar),
                                   name="epsilon")
        std = tf.exp(0.5 * self.recog_logvar)
        self.recog_z = tf.add(self.recog_mu, tf.multiply(std, epsilon),
                              name='recog_z')
        # KL( N(mu, sigma^2) || N(0, I) ), averaged over the batch
        self.kld = tf.reduce_mean(0.5 * tf.reduce_sum(
            tf.exp(self.recog_logvar) + self.recog_mu * self.recog_mu -
            self.recog_logvar - 1,
            axis=-1))
        self.prior_z = tf.random_normal(tf.shape(self.recog_logvar),
                                        name="prior_z")
        latent_sample = tf.cond(self.use_prior,
                                lambda: self.prior_z,
                                lambda: self.recog_z,
                                name='latent_sample')
        # project z to the decoder's initial hidden state
        dec_init_state = tf.layers.dense(inputs=latent_sample,
                                         units=args.dh_size,
                                         activation=None)

    with tf.variable_scope("output_layer",
                           initializer=tf.orthogonal_initializer()):
        # shared vocab projection used by both decode paths
        self.output_layer = Dense(
            data.vocab_size,
            kernel_initializer=tf.truncated_normal_initializer(stddev=0.1),
            use_bias=True)

    with tf.variable_scope("decode",
                           initializer=tf.orthogonal_initializer()):
        # teacher-forced training decode
        train_helper = tf.contrib.seq2seq.TrainingHelper(
            inputs=self.decoder_input, sequence_length=self.decoder_len)
        train_decoder = tf.contrib.seq2seq.BasicDecoder(
            cell=cell_dec,
            helper=train_helper,
            initial_state=dec_init_state,
            output_layer=self.output_layer)
        train_output, _, _ = tf.contrib.seq2seq.dynamic_decode(
            decoder=train_decoder,
            maximum_iterations=self.decoder_max_len,
            impute_finished=True)
        logits = train_output.rnn_output

        # masked reconstruction cross-entropy
        crossent = tf.nn.sparse_softmax_cross_entropy_with_logits(
            labels=self.decoder_target, logits=logits)
        crossent = tf.reduce_sum(crossent * self.decoder_mask)
        self.sen_loss = crossent / tf.to_float(batch_size)
        # per-token loss, for perplexity reporting
        self.ppl_loss = crossent / tf.reduce_sum(self.decoder_mask)

        self.decoder_distribution_teacher = tf.nn.log_softmax(logits)

    with tf.variable_scope("decode", reuse=True):
        # greedy decoding from GO until EOS (or max length)
        infer_helper = tf.contrib.seq2seq.GreedyEmbeddingHelper(
            self.embed, tf.fill([batch_size], data.go_id), data.eos_id)
        infer_decoder = tf.contrib.seq2seq.BasicDecoder(
            cell=cell_dec,
            helper=infer_helper,
            initial_state=dec_init_state,
            output_layer=self.output_layer)
        infer_output, _, _ = tf.contrib.seq2seq.dynamic_decode(
            decoder=infer_decoder,
            maximum_iterations=self.decoder_max_len,
            impute_finished=True)
        self.decoder_distribution = infer_output.rnn_output
        # argmax with the first two ids masked out
        self.generation_index = tf.argmax(
            tf.split(self.decoder_distribution, [2, data.vocab_size - 2],
                     2)[1], 2) + 2  # for removing UNK

    # KL annealing weight grows linearly to 1 over full_kl_step steps
    self.kl_weights = tf.minimum(
        tf.to_float(self.global_step) / args.full_kl_step, 1.0)
    self.kl_loss = self.kl_weights * tf.maximum(self.kld, args.min_kl)
    self.loss = self.sen_loss + self.kl_loss

    # calculate the gradient of parameters and update
    # (only variables whose name contains args.name are trained)
    self.params = [
        k for k in tf.trainable_variables() if args.name in k.name
    ]
    opt = tf.train.MomentumOptimizer(learning_rate=self.learning_rate,
                                     momentum=args.momentum)
    gradients = tf.gradients(self.loss, self.params)
    clipped_gradients, self.gradient_norm = tf.clip_by_global_norm(
        gradients, args.grad_clip)
    self.update = opt.apply_gradients(zip(clipped_gradients, self.params),
                                      global_step=self.global_step)

    # save checkpoint
    self.latest_saver = tf.train.Saver(
        write_version=tf.train.SaverDef.V2,
        max_to_keep=args.checkpoint_max_to_keep,
        pad_step_number=True,
        keep_checkpoint_every_n_hours=1.0)
    self.best_saver = tf.train.Saver(write_version=tf.train.SaverDef.V2,
                                     max_to_keep=1,
                                     pad_step_number=True,
                                     keep_checkpoint_every_n_hours=1.0)

    # create summary for tensorboard
    self.create_summary(args)
def __init__(self, num_symbols, num_qwords, #modify num_embed_units, num_units, num_layers, is_train, vocab=None, embed=None, question_data=True, learning_rate=0.5, learning_rate_decay_factor=0.95, max_gradient_norm=5.0, num_samples=512, max_length=30, use_lstm=False): self.posts = tf.placeholder(tf.string, shape=(None, None)) # batch*len self.posts_length = tf.placeholder(tf.int32, shape=(None)) # batch self.responses = tf.placeholder(tf.string, shape=(None, None)) # batch*len self.responses_length = tf.placeholder(tf.int32, shape=(None)) # batch self.keyword_tensor = tf.placeholder(tf.float32, shape=(None, 3, None)) #(batch * len) * 3 * numsymbol self.word_type = tf.placeholder(tf.int32, shape=(None)) #(batch * len) # build the vocab table (string to index) if is_train: self.symbols = tf.Variable(vocab, trainable=False, name="symbols") else: self.symbols = tf.Variable(np.array(['.']*num_symbols), name="symbols") self.symbol2index = HashTable(KeyValueTensorInitializer(self.symbols, tf.Variable(np.array([i for i in range(num_symbols)], dtype=np.int32), False)), default_value=UNK_ID, name="symbol2index") self.posts_input = self.symbol2index.lookup(self.posts) # batch*len self.responses_target = self.symbol2index.lookup(self.responses) #batch*len batch_size, decoder_len = tf.shape(self.responses)[0], tf.shape(self.responses)[1] self.responses_input = tf.concat([tf.ones([batch_size, 1], dtype=tf.int32)*GO_ID, tf.split(self.responses_target, [decoder_len-1, 1], 1)[0]], 1) # batch*len #delete the last column of responses_target) and add 'GO at the front of it. self.decoder_mask = tf.reshape(tf.cumsum(tf.one_hot(self.responses_length-1, decoder_len), reverse=True, axis=1), [-1, decoder_len]) # bacth * len print "embedding..." 
# build the embedding table (index to vector) if embed is None: # initialize the embedding randomly self.embed = tf.get_variable('embed', [num_symbols, num_embed_units], tf.float32) else: print len(vocab), len(embed), len(embed[0]) print embed # initialize the embedding by pre-trained word vectors self.embed = tf.get_variable('embed', dtype=tf.float32, initializer=embed) self.encoder_input = tf.nn.embedding_lookup(self.embed, self.posts_input) #batch*len*unit self.decoder_input = tf.nn.embedding_lookup(self.embed, self.responses_input) print "embedding finished" if use_lstm: cell = MultiRNNCell([LSTMCell(num_units)] * num_layers) else: cell = MultiRNNCell([GRUCell(num_units)] * num_layers) # rnn encoder encoder_output, encoder_state = dynamic_rnn(cell, self.encoder_input, self.posts_length, dtype=tf.float32, scope="encoder") # get output projection function output_fn, sampled_sequence_loss = output_projection_layer(num_units, num_symbols, num_qwords, num_samples, question_data) print "encoder_output.shape:", encoder_output.get_shape() # get attention function attention_keys, attention_values, attention_score_fn, attention_construct_fn \ = attention_decoder_fn.prepare_attention(encoder_output, 'luong', num_units) # get decoding loop function decoder_fn_train = attention_decoder_fn.attention_decoder_fn_train(encoder_state, attention_keys, attention_values, attention_score_fn, attention_construct_fn) decoder_fn_inference = attention_decoder_fn.attention_decoder_fn_inference(output_fn, self.keyword_tensor, encoder_state, attention_keys, attention_values, attention_score_fn, attention_construct_fn, self.embed, GO_ID, EOS_ID, max_length, num_symbols) if is_train: # rnn decoder self.decoder_output, _, _ = dynamic_rnn_decoder(cell, decoder_fn_train, self.decoder_input, self.responses_length, scope="decoder") # calculate the loss of decoder # self.decoder_output = tf.Print(self.decoder_output, [self.decoder_output]) self.decoder_loss, self.log_perplexity = 
sampled_sequence_loss(self.decoder_output, self.responses_target, self.decoder_mask, self.keyword_tensor, self.word_type) # building graph finished and get all parameters self.params = tf.trainable_variables() for item in tf.trainable_variables(): print item.name, item.get_shape() # initialize the training process self.learning_rate = tf.Variable(float(learning_rate), trainable=False, dtype=tf.float32) self.learning_rate_decay_op = self.learning_rate.assign( self.learning_rate * learning_rate_decay_factor) self.global_step = tf.Variable(0, trainable=False) # calculate the gradient of parameters opt = tf.train.GradientDescentOptimizer(self.learning_rate) gradients = tf.gradients(self.decoder_loss, self.params) clipped_gradients, self.gradient_norm = tf.clip_by_global_norm(gradients, max_gradient_norm) self.update = opt.apply_gradients(zip(clipped_gradients, self.params), global_step=self.global_step) else: # rnn decoder self.decoder_distribution, _, _ = dynamic_rnn_decoder(cell, decoder_fn_inference, scope="decoder") print("self.decoder_distribution.shape():",self.decoder_distribution.get_shape()) self.decoder_distribution = tf.Print(self.decoder_distribution, ["distribution.shape()", tf.reduce_sum(self.decoder_distribution)]) # generating the response self.generation_index = tf.argmax(tf.split(self.decoder_distribution, [2, num_symbols-2], 2)[1], 2) + 2 # for removing UNK self.generation = tf.nn.embedding_lookup(self.symbols, self.generation_index) self.params = tf.trainable_variables() self.saver = tf.train.Saver(tf.global_variables(), write_version=tf.train.SaverDef.V2, max_to_keep=3, pad_step_number=True, keep_checkpoint_every_n_hours=1.0)