def embedding_attention_seq2seq(encoder_inputs, encoder_mask, decoder_inputs, cell,
                                num_encoder_symbols, num_decoder_symbols,
                                embedding_size, beam_size, num_layers=1, num_heads=1,
                                feed_previous=False, dtype=dtypes.float32, scope=None,
                                initial_state_attention=True):
  """Embedding sequence-to-sequence model with attention.

  Args:
    encoder_inputs: A list of 1D int32 Tensors of shape [batch_size].
    encoder_mask: The mask of input sentences denoting padding positions.
    decoder_inputs: A list of 1D int32 Tensors of shape [batch_size].
    cell: rnn_cell.RNNCell defining the cell function and size.
    num_encoder_symbols: Integer; number of symbols on the encoder side.
    num_decoder_symbols: Integer; number of symbols on the decoder side.
    embedding_size: Integer, the length of the embedding vector for each symbol.
    beam_size: Integer, the beam size used in beam search.
    num_heads: Number of attention heads that read from attention_states.
    feed_previous: Boolean, if True, only the first of decoder_inputs will be
      used (the "GO" symbol).
    dtype: The dtype of the initial RNN state (default: tf.float32).
    scope: VariableScope for the created subgraph; defaults to
      "embedding_attention_seq2seq".
    initial_state_attention: If False (default), initial attentions are zero.
      If True, initialize the attentions from the initial state and attention
      states.

  Returns:
    A tuple of the form (outputs, state, symbols), where:
      outputs: A list of the same length as decoder_inputs of 2D Tensors of
        shape [batch_size x output_size].
      state: The state of each decoder cell at the final time-step. It is a
        2D Tensor of shape [batch_size x cell.state_size].
      symbols: A list of target word ids, the best results returned by
        beam search.
  """
  with variable_scope.variable_scope(scope or "embedding_attention_seq2seq"):
    embedding = variable_scope.get_variable(
        "embedding", [num_encoder_symbols, embedding_size], dtype=dtype,
        initializer=init_ops.random_normal_initializer(0, 0.01, seed=SEED))
    encoder_cell = rnn_cell.EmbeddingWrapper(
        cell, embedding_classes=num_encoder_symbols,
        embedding_size=embedding_size, embedding=embedding)
    encoder_lens = math_ops.reduce_sum(encoder_mask, [1])
    encoder_outputs, _, encoder_state = rnn.bidirectional_rnn(
        encoder_cell, encoder_cell, encoder_inputs,
        sequence_length=encoder_lens, dtype=dtype)
    assert encoder_cell._embedding is embedding

    # First calculate a concatenation of encoder outputs to put attention on.
    top_states = [array_ops.reshape(e, [-1, 1, 2 * cell.output_size])
                  for e in encoder_outputs]
    attention_states = array_ops.concat(1, top_states)

    # Decoder.
    output_size = None
    return embedding_attention_decoder(
        encoder_mask, decoder_inputs, encoder_state, attention_states, cell,
        num_decoder_symbols, embedding_size, beam_size=beam_size,
        num_heads=num_heads, output_size=output_size, num_layers=num_layers,
        feed_previous=feed_previous,
        initial_state_attention=initial_state_attention)
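# The function above returns `symbols`, the word ids found by beam search, but the
# decoder that actually performs the search (embedding_attention_decoder) is not shown
# in this file.  As a standalone illustration, the NumPy sketch below performs one
# beam-search expansion step in the usual formulation (cumulative log-probabilities,
# top-k over beam_size * vocab candidates).  `beam_search_step` is a hypothetical
# helper for illustration, not part of this codebase.
import numpy as np

def beam_search_step(step_log_probs, beam_scores, beam_size):
    """One expansion step of beam search.

    step_log_probs: [beam_size x vocab] log P(next word | hypothesis)
    beam_scores:    [beam_size] cumulative log-probabilities of the hypotheses
    Returns (parent_ids, word_ids, new_scores) for the beam_size best continuations.
    """
    vocab_size = step_log_probs.shape[1]
    total = beam_scores[:, None] + step_log_probs      # [beam_size x vocab]
    flat = total.reshape(-1)
    best = np.argsort(flat)[::-1][:beam_size]          # indices of the top continuations
    parent_ids = best // vocab_size                    # which hypothesis each one extends
    word_ids = best % vocab_size                       # which word extends it
    return parent_ids, word_ids, flat[best]

# Example usage (beam of 2 over a 3-word vocabulary):
# beam_search_step(np.log([[0.6, 0.3, 0.1], [0.2, 0.5, 0.3]]),
#                  np.log([0.7, 0.3]), beam_size=2)
# -> both survivors extend hypothesis 0, with words 0 and 1.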
def embedding_attention_seq2seq(encoder_inputs_1, encoder_inputs_2, encoder_mask_1, encoder_mask_2, decoder_inputs, cell, num_encoder_symbols_1, num_encoder_symbols_2, num_decoder_symbols, # added by al embedding_size, beam_size, # added by shiyue constant_emb_en, # added by al constant_emb_fr, # added by al num_heads=1, output_projection=None, feed_previous=False, dtype=dtypes.float32, scope=None, # initial_state_attention=False #annotated by yfeng initial_state_attention=True # added by yfeng ): """Embedding sequence-to-sequence model with attention. This model first embeds encoder_inputs by a newly created embedding (of shape [num_encoder_symbols x input_size]). Then it runs an RNN to encode embedded encoder_inputs into a state vector. It keeps the outputs of this RNN at every step to use for attention later. Next, it embeds decoder_inputs by another newly created embedding (of shape [num_decoder_symbols x input_size]). Then it runs attention decoder, initialized with the last encoder state, on embedded decoder_inputs and attending to encoder outputs. Args: encoder_inputs: A list of 1D int32 Tensors of shape [batch_size]. decoder_inputs: A list of 1D int32 Tensors of shape [batch_size]. cell: rnn_cell.RNNCell defining the cell function and size. num_encoder_symbols: Integer; number of symbols on the encoder side. num_decoder_symbols: Integer; number of symbols on the decoder side. embedding_size: Integer, the length of the embedding vector for each symbol. num_heads: Number of attention heads that read from attention_states. output_projection: None or a pair (W, B) of output projection weights and biases; W has shape [output_size x num_decoder_symbols] and B has shape [num_decoder_symbols]; if provided and feed_previous=True, each fed previous output will first be multiplied by W and added B. feed_previous: Boolean or scalar Boolean Tensor; if True, only the first of decoder_inputs will be used (the "GO" symbol), and all other decoder inputs will be taken from previous outputs (as in embedding_rnn_decoder). If False, decoder_inputs are used as given (the standard decoder case). dtype: The dtype of the initial RNN state (default: tf.float32). scope: VariableScope for the created subgraph; defaults to "embedding_attention_seq2seq". initial_state_attention: If False (default), initial attentions are zero. If True, initialize the attentions from the initial state and attention states. Returns: A tuple of the form (outputs, state), where: outputs: A list of the same length as decoder_inputs of 2D Tensors with shape [batch_size x num_decoder_symbols] containing the generated outputs. state: The state of each decoder cell at the final time-step. It is a 2D Tensor of shape [batch_size x cell.state_size]. """ with variable_scope.variable_scope(scope or "embedding_attention_seq2seq"): # Encoder. # annotated by yfeng """ encoder_cell = rnn_cell.EmbeddingWrapper( cell, embedding_classes=num_encoder_symbols, embedding_size=embedding_size) encoder_outputs, encoder_state = rnn.rnn( encoder_cell, encoder_inputs, dtype=dtype) # First calculate a concatenation of encoder outputs to put attention on. top_states = [array_ops.reshape(e, [-1, 1, cell.output_size]) for e in encoder_outputs] """ # start by yfeng # sqrt3 = math.sqrt(3) # Uniform(-sqrt(3), sqrt(3)) has variance=1. 
''' embedding_1 = variable_scope.get_variable( "embedding_1", [num_encoder_symbols_1, embedding_size], dtype=dtype, initializer=init_ops.random_normal_initializer(0, 0.01, seed=SEED)) # annotated by yfeng embedding_2 = variable_scope.get_variable( # added by al "embedding_2", [num_encoder_symbols_2, embedding_size], dtype=dtype, initializer=init_ops.random_normal_initializer(0, 0.01, seed=SEED)) # annotated by yfeng ''' embedding_1 = variable_scope.get_variable( "embedding_1", [num_encoder_symbols_1, embedding_size], dtype=dtype, trainable=False, initializer=init_ops.constant_initializer(constant_emb_en)) # for constant embedding embedding_2 = variable_scope.get_variable( # added by al "embedding_2", [num_encoder_symbols_2, embedding_size], dtype=dtype, trainable=False, initializer=init_ops.constant_initializer(constant_emb_fr)) # for constant embedding # initializer = init_ops.random_normal_initializer(0, 0.01, seed=1.0)) #change from uniform to normal by yfeng encoder_lens_1 = math_ops.reduce_sum(encoder_mask_1, [1]) encoder_lens_2 = math_ops.reduce_sum(encoder_mask_2, [1]) with variable_scope.variable_scope("encoder_1"): encoder_cell_1 = rnn_cell.EmbeddingWrapper( cell, embedding_classes=num_encoder_symbols_1, embedding_size=embedding_size, embedding=embedding_1) encoder_outputs_1, _, encoder_state_1 = rnn.bidirectional_rnn( encoder_cell_1, encoder_cell_1, encoder_inputs_1, sequence_length=encoder_lens_1, dtype=dtype) with variable_scope.variable_scope("encoder_2"): encoder_cell_2 = rnn_cell.EmbeddingWrapper( cell, embedding_classes=num_encoder_symbols_2, embedding_size=embedding_size, embedding=embedding_2) encoder_outputs_2, _, encoder_state_2 = rnn.bidirectional_rnn( encoder_cell_2, encoder_cell_2, encoder_inputs_2, sequence_length=encoder_lens_2, dtype=dtype) encoder_state = alpha * encoder_state_1 + beta * encoder_state_2 # this can be changed assert encoder_cell_1._embedding is embedding_1 assert encoder_cell_2._embedding is embedding_2 # First calculate a concatenation of encoder outputs to put attention on. top_states_1 = [array_ops.reshape(e, [-1, 1, 2 * cell.output_size]) for e in encoder_outputs_1] top_states_2 = [array_ops.reshape(e, [-1, 1, 2 * cell.output_size]) for e in encoder_outputs_2] # end by yfeng attention_states_1 = array_ops.concat(1, top_states_1) attention_states_2 = array_ops.concat(1, top_states_2) # Decoder. output_size = None if output_projection is None: cell = rnn_cell.OutputProjectionWrapper(cell, num_decoder_symbols) output_size = num_decoder_symbols if isinstance(feed_previous, bool): return embedding_attention_decoder(encoder_mask_1, encoder_mask_2, decoder_inputs, encoder_state, attention_states_1, attention_states_2, cell, num_decoder_symbols, embedding_size, beam_size=beam_size, # added by shiyue constant_emb_fr=constant_emb_fr, # added by al num_heads=num_heads, output_size=output_size, output_projection=output_projection, feed_previous=feed_previous, initial_state_attention=initial_state_attention) # If feed_previous is a Tensor, we construct 2 graphs and use cond. 
def decoder(feed_previous_bool): reuse = None if feed_previous_bool else True with variable_scope.variable_scope(variable_scope.get_variable_scope(), reuse=reuse): outputs, state, _ = embedding_attention_decoder(encoder_mask_1, encoder_mask_2, # modified by shiyue decoder_inputs, encoder_state, attention_states_1, attention_states_2, cell, num_decoder_symbols, embedding_size, beam_size=beam_size, # added by shiyue constant_emb_fr=constant_emb_fr, # added by al num_heads=num_heads, output_size=output_size, output_projection=output_projection, feed_previous=feed_previous_bool, update_embedding_for_previous=False, initial_state_attention=initial_state_attention) state_list = [state] if nest.is_sequence(state): state_list = nest.flatten(state) return outputs + state_list outputs_and_state = control_flow_ops.cond(feed_previous, lambda: decoder(True), lambda: decoder(False)) outputs_len = len(decoder_inputs) # Outputs length same as decoder inputs. state_list = outputs_and_state[outputs_len:] state = state_list[0] if nest.is_sequence(encoder_state): state = nest.pack_sequence_as(structure=encoder_state, flat_sequence=state_list) return outputs_and_state[:outputs_len], state
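# The two-encoder variant above combines the final encoder states as
# `alpha * encoder_state_1 + beta * encoder_state_2`, but `alpha` and `beta` are not
# defined anywhere in this snippet ("this can be changed"); they appear to be scalar
# mixing weights chosen elsewhere.  A minimal NumPy sketch of that combination,
# assuming fixed scalar weights (an assumption, not taken from the source):
import numpy as np

def mix_encoder_states(state_1, state_2, alpha=0.5, beta=0.5):
    """Interpolate two encoder final states of identical shape [batch x state_size]."""
    return alpha * state_1 + beta * state_2

# e.g. mix_encoder_states(np.ones((3, 4)), np.zeros((3, 4))) -> an array of 0.5s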
def __init__(self, hidden_size=10, vocab_size=42791, embedding_size=50, embedding_matrix=None): #tf.set_random_seed(1234) # Placeholders # ================================================== # (batch_size * max_sentence_count x max_sentence_length) self.sentences = tf.placeholder(tf.int32, [None, None], name="sentences") self.questions = tf.placeholder(tf.int32, [None, None], name="questions") self.labels = tf.placeholder(tf.int32, [ None, ], name="labels") batch_size = tf.shape(self.questions)[0] #self.batch_size = tf.placeholder(tf.int32, name="batch_size") sentences = self.sentences questions = self.questions labels = self.labels # (batch_size * mask_sentence_count x max_sentence_length) sentence_mask = tf.cast(sentences >= 0, tf.int32) self.sentence_mask = sentence_mask #sentence_mask = tf.sequence_mask(doc_lengths, dtype=tf.int32) masked_sentences = tf.mul(sentences, sentence_mask) max_sent_per_doc = tf.cast( tf.shape(sentence_mask)[0] / batch_size, tf.int32) self.max_sent_per_doc = max_sent_per_doc # Input Preparation # ================================================== with tf.variable_scope("embeddings"): if embedding_matrix == None: self.W_embeddings = tf.get_variable(shape=[vocab_size, embedding_size], \ initializer=tf.random_uniform_initializer(-0.01, 0.01),\ name="W_embeddings", dtype=tf.float32) else: ################## option to use pre-trained embeddings ################## self.W_embeddings = tf.Variable(embedding_matrix, \ name="W_embeddings", dtype=tf.float32) # SENTENCES MASKED # (batch_size x max_sent_per_doc) batch_mask = tf.reshape(tf.reduce_max(sentence_mask, 1), [batch_size, -1]) # (batch_size * max_sent_per_doc x 1 x 1) sentence_batch_mask = tf.cast(tf.reshape(batch_mask, [-1, 1, 1]), tf.float32) # batch_size * max_sent_per_doc x max_sentence_length x embedding_size sentence_embeddings = tf.gather(self.W_embeddings, masked_sentences) masked_sentence_embeddings = tf.mul( sentence_embeddings, tf.cast(tf.expand_dims(sentence_mask, -1), tf.float32)) # QUERY MASKED # create mask (batch_size x max_question_length) question_mask = tf.cast(questions > 0, tf.int32) self.question_mask = question_mask masked_question = tf.mul(question_mask, questions) self.masked_question = masked_question # (batch_size x max_question_length x embedding_size) question_embeddings = tf.gather(self.W_embeddings, masked_question) self.question_embeddings = question_embeddings question_mask_float = tf.expand_dims( tf.cast(question_mask, tf.float32), -1) masked_question_embeddings = tf.mul(question_embeddings, question_mask_float) self.masked_question_embeddings = masked_question_embeddings # CBOW Sentence Representation # ================================================== with tf.variable_scope("sentence-representation"): self.forward_cell_d = GRUCell(state_size=hidden_size, input_size=embedding_size, scope="GRU-Forward-D") self.backward_cell_d = GRUCell(state_size=hidden_size, input_size=embedding_size, scope="GRU-Backward-D") self.hidden_states_d, last_state_d = bidirectional_rnn(self.forward_cell_d, self.backward_cell_d, \ sentence_embeddings, tf.cast(sentence_mask, tf.float32), concatenate=True) doc_sentences = tf.reshape(last_state_d, [batch_size, -1, hidden_size * 2]) self.cbow_sentences = doc_sentences # # (batch_size * max_sentence_count x embedding_size) # cbow_sentences = tf.reduce_mean(masked_sentence_embeddings, 1) # self.cbow_sentences = cbow_sentences # # reshape batch to (batch_size x max_doc_length x embedding_size) # doc_sentences = tf.reshape(cbow_sentences, [batch_size, -1, 
embedding_size]) # Query Representation # ================================================== with tf.variable_scope("query-representation"): # easy baseline: cbow # (batch_size x embedding_size) #question_cbow = tf.reduce_mean(masked_question_embeddings, 1) #self.question_cbow = question_cbow self.forward_cell_q = GRUCell(state_size=hidden_size, input_size=embedding_size, scope="GRU-Forward-Q") self.backward_cell_q = GRUCell(state_size=hidden_size, input_size=embedding_size, scope="GRU-Backward-Q") self.hidden_states_q, last_state_q = bidirectional_rnn(self.forward_cell_q, self.backward_cell_q, \ question_embeddings, tf.cast(question_mask, tf.float32), concatenate=True) self.question_cbow = last_state_q # can use RNN representation as well************************************* # Similarity Scoring # ================================================== # Using simple dot product/cosine similiarity as of now (https://arxiv.org/pdf/1605.07427v1.pdf) with tf.variable_scope("similarity-scoring"): # (batch_size x max_sent_per_doc) #""" # dot_prod = tf.squeeze(tf.batch_matmul(doc_sentences, tf.expand_dims(question_cbow, -1)), [-1]) # self.dot_prod = dot_prod # # # softmax # numerator = tf.exp(tf.sub(dot_prod, tf.expand_dims(tf.reduce_max(dot_prod, 1), -1))) * tf.cast(batch_mask, tf.float32) # denom = tf.reduce_sum(numerator, 1) # # # Dimensions (batch x time) # probabilities = tf.div(numerator, tf.expand_dims(denom, 1)) #""" # #(batch_size x max_sent_per_doc) # sentence_norm = tf.sqrt(tf.reduce_sum(tf.mul(doc_sentences, doc_sentences), -1)) # self.sentence_norm = sentence_norm # # (batch_size) # question_norm = tf.sqrt(tf.reduce_sum(tf.mul(question_cbow, question_cbow), 1)) # self.question_norm = question_norm # # denom = tf.mul(sentence_norm, tf.expand_dims(question_norm, -1))+1e-30 # self.denom = denom # # (batch_size x max_sent_per_doc) - scalars between -1 and +1 # cosine_similarity = tf.div(dot_prod, denom) # self.cosine_similarity = cosine_similarity # # masked_pos_cos_sim = tf.sub(tf.add(cosine_similarity, 1), tf.cast(batch_mask < 1, tf.float32)) # self.masked_pos_cos_sim = masked_pos_cos_sim # normalized_cos_sim = tf.div(masked_pos_cos_sim, tf.expand_dims(tf.reduce_sum(masked_pos_cos_sim, 1), -1)) #""" attention = BilinearFunction(attending_size=hidden_size * 2, attended_size=hidden_size * 2) alpha_weights, attend_result = attention(self.question_cbow, attended=doc_sentences, \ time_mask=tf.cast(batch_mask, tf.float32)) probabilities = alpha_weights #""" #probabilities = tf.abs(dot_prod) #normalized_cos_sim self.probabilities = probabilities with tf.variable_scope("prediction"): one_hot_labels = tf.one_hot(labels, max_sent_per_doc, dtype=tf.float32) likelihoods = tf.reduce_sum(tf.mul(probabilities, one_hot_labels), 1) log_likelihoods = tf.log(likelihoods + 0.00000000000000000001) self.loss = tf.mul(tf.reduce_sum(log_likelihoods), -1) correct_vector = tf.cast(tf.equal(labels, tf.cast(tf.argmax(probabilities, 1), tf.int32)), \ tf.float32, name="correct_vector") self.accuracy = tf.reduce_mean(correct_vector)
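# `BilinearFunction` is called here (and in the models below) but is not defined in
# this snippet.  The NumPy sketch below shows the usual bilinear attention such a
# layer implements -- score_i = q^T W h_i, a masked softmax over positions, and an
# attention-weighted sum -- assuming the shapes used above: an attending vector of
# shape [batch x attending_size] against attended memories of shape
# [batch x T x attended_size].  This illustrates the technique; it is not the
# project's actual BilinearFunction code.
import numpy as np

def bilinear_attention(attending, attended, time_mask, W):
    """attending: [B x Da], attended: [B x T x Dh], time_mask: [B x T], W: [Da x Dh]."""
    # Unnormalized scores: (q W) . h_t for every position t.
    scores = np.einsum('bd,btd->bt', attending.dot(W), attended)   # [B x T]
    # Masked, numerically stable softmax over the time dimension.
    scores = scores - scores.max(axis=1, keepdims=True)
    exp_scores = np.exp(scores) * time_mask
    alpha = exp_scores / exp_scores.sum(axis=1, keepdims=True)     # [B x T]
    # Attention-weighted sum of the attended vectors.
    attend_result = np.einsum('bt,btd->bd', alpha, attended)       # [B x Dh]
    return alpha, attend_result

# Example: batch of 2, three positions, last position of the first item padded out.
# alpha, ctx = bilinear_attention(np.random.rand(2, 4), np.random.rand(2, 3, 4),
#                                 np.array([[1., 1., 0.], [1., 1., 1.]]),
#                                 np.random.rand(4, 4))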
def embedding_attention_seq2seq(encoder_inputs, encoder_mask, decoder_inputs, cell,
                                num_encoder_symbols, num_decoder_symbols,
                                embedding_size, beam_size, output_projection=None,
                                num_layers=1, feed_previous=False,
                                dtype=dtypes.float32, scope=None,
                                initial_state_attention=True):
  """Embedding sequence-to-sequence model with attention.

  This model first embeds encoder_inputs by a newly created embedding (of shape
  [num_encoder_symbols x input_size]). Then it runs a bidirectional RNN to encode
  embedded encoder_inputs into a state vector. It keeps the outputs of this
  bidirectional RNN at every step to use for attention later. Next, it embeds
  decoder_inputs by another newly created embedding (of shape
  [num_decoder_symbols x input_size]). Then it runs an attention decoder,
  initialized with the last encoder state, on embedded decoder_inputs and
  attending to encoder outputs.

  Args:
    encoder_inputs: A list of 1D int32 Tensors of shape [batch_size].
    encoder_mask: The mask of encoder inputs marking which positions are PADs.
    decoder_inputs: A list of 1D int32 Tensors of shape [batch_size].
    cell: rnn_cell.RNNCell defining the cell function and size.
    num_encoder_symbols: Integer; number of symbols on the encoder side.
    num_decoder_symbols: Integer; number of symbols on the decoder side.
    embedding_size: Integer, the length of the embedding vector for each symbol.
    output_projection: None or a pair (W, B) of output projection weights and
      biases; W has shape [output_size x num_decoder_symbols] and B has shape
      [num_decoder_symbols]; if provided and feed_previous=True, each fed
      previous output will first be multiplied by W and added B.
    feed_previous: Boolean or scalar Boolean Tensor; if True, only the first of
      decoder_inputs will be used (the "GO" symbol), and all other decoder
      inputs will be taken from previous outputs (as in embedding_rnn_decoder).
      If False, decoder_inputs are used as given (the standard decoder case).
    dtype: The dtype of the initial RNN state (default: tf.float32).
    scope: VariableScope for the created subgraph; defaults to
      "embedding_attention_seq2seq".
    initial_state_attention: If False (default), initial attentions are zero.
      If True, initialize the attentions from the initial state and attention
      states.

  Returns:
    A tuple of the form (outputs, state, symbols), where:
      outputs: A list of the same length as decoder_inputs of 2D Tensors with
        shape [batch_size x num_decoder_symbols] containing the generated
        outputs.
      state: The state of each decoder cell at the final time-step. It is a
        2D Tensor of shape [batch_size x cell.state_size].
      symbols: When training, it is []; when decoding, it is the best
        translation generated by beam search.
  """
  with variable_scope.variable_scope(scope or "embedding_attention_seq2seq"):
    # word embeddings of source words
    embedding = variable_scope.get_variable(
        "embedding", [num_encoder_symbols, embedding_size], dtype=dtype,
        initializer=init_ops.random_normal_initializer(0, 0.01, seed=SEED))
    # wrap encoder cell with embedding
    encoder_cell = rnn_cell.EmbeddingWrapper(
        cell, embedding_classes=num_encoder_symbols,
        embedding_size=embedding_size, embedding=embedding)
    # get the sentence lengths of source sentences
    encoder_lens = math_ops.reduce_sum(encoder_mask, [1])
    # encode source sentences with a bidirectional_rnn encoder
    encoder_outputs, _, encoder_state = rnn.bidirectional_rnn(
        encoder_cell, encoder_cell, encoder_inputs,
        sequence_length=encoder_lens, dtype=dtype)

    # First calculate a concatenation of encoder outputs.
    top_states = [array_ops.reshape(e, [-1, 1, 2 * cell.output_size])
                  for e in encoder_outputs]
    attention_states = array_ops.concat(1, top_states)

    # Decoder.
    output_size = None
    if output_projection is None:
      cell = rnn_cell.OutputProjectionWrapper(cell, num_decoder_symbols)
      output_size = num_decoder_symbols

    return embedding_attention_decoder(
        encoder_mask, decoder_inputs, encoder_state, attention_states, cell,
        num_decoder_symbols, embedding_size, beam_size=beam_size,
        output_size=output_size, output_projection=output_projection,
        num_layers=num_layers, feed_previous=feed_previous,
        initial_state_attention=initial_state_attention)
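# The bidirectional encoder produces one output per time step, each of shape
# [batch x 2 * cell.output_size] (forward and backward halves concatenated), and the
# reshape/concat above packs that list into a single attention tensor of shape
# [batch x T x 2 * cell.output_size].  A standalone NumPy sketch of the same packing
# (illustrative names, not graph tensors from this file):
import numpy as np

def pack_attention_states(encoder_outputs):
    """encoder_outputs: list of T arrays, each [batch x 2H] -> [batch x T x 2H]."""
    top_states = [e[:, None, :] for e in encoder_outputs]   # each becomes [batch x 1 x 2H]
    return np.concatenate(top_states, axis=1)               # stack along the time axis

# e.g. 5 time steps, batch of 3, hidden size 4 per direction:
# pack_attention_states([np.random.rand(3, 8) for _ in range(5)]).shape == (3, 5, 8)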
# Dimensions: batch x max_length x embedding_dim document_embedding = tf.gather(W_embeddings, masked_d) question_embedding = tf.gather(W_embeddings, masked_q) with tf.variable_scope("bidirection_rnn"): # Bidirectional RNNs for Document and Question forward_cell_d = GRUCell(state_size, input_size, scope="GRU-Forward-D") backward_cell_d = GRUCell(state_size, input_size, scope="GRU-Backward-D") forward_cell_q = GRUCell(state_size, input_size, scope="GRU-Forward-Q") backward_cell_q = GRUCell(state_size, input_size, scope="GRU-Backward-Q") # hidden_states_d, last_state_d = rnn(forward_cell_d, document_embedding, seq_lens_d) # hidden_states_q, last_state_q = rnn(forward_cell_q, question_embedding, seq_lens_q) hidden_states_d, last_state_d = bidirectional_rnn(forward_cell_d, backward_cell_d, \ document_embedding, seq_lens_d, concatenate=True) hidden_states_q, last_state_q = bidirectional_rnn(forward_cell_q, backward_cell_q, \ question_embedding, seq_lens_q, concatenate=True) with tf.variable_scope("attention"): time_mask = tf.sequence_mask(seq_lens_d, dtype=tf.float32) # Attention Layer attention = BilinearFunction(attending_size=state_size * 2, attended_size=state_size * 2) alpha_weights, attend_result = attention(attending=last_state_q, attended=hidden_states_d, \ seq_lens=seq_lens_d) with tf.variable_scope("prediction"): W_predict = tf.get_variable(name="predict_weight", shape=[state_size*2, max_entities], \ initializer=tf.random_normal_initializer(mean=0.0, stddev=0.1))
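# `bidirectional_rnn(..., concatenate=True)` is a project-local helper whose code is
# not shown in this snippet.  Judging from the sizes used here (attending_size =
# state_size * 2, W_predict with state_size * 2 rows), `concatenate=True` appears to
# join the forward and backward results along the feature axis.  A NumPy sketch of
# that joining step (an assumption about the helper, not its actual implementation):
import numpy as np

def concat_bidirectional(forward_outputs, backward_outputs, forward_last, backward_last):
    """forward/backward_outputs: [batch x T x H]; forward/backward_last: [batch x H]."""
    hidden_states = np.concatenate([forward_outputs, backward_outputs], axis=-1)  # [B x T x 2H]
    last_state = np.concatenate([forward_last, backward_last], axis=-1)           # [B x 2H]
    return hidden_states, last_state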
def __init__(self, max_entities, hidden_size=128, vocab_size=50000, embedding_dim=100, batch_size=32):
    self.max_entities = max_entities
    tf.set_random_seed(1234)

    # Placeholders
    # can add assert statements to ensure shared None dimensions are equal (batch_size)
    self.input_d = tf.placeholder(tf.int32, [None, None], name="input_d")
    self.input_q = tf.placeholder(tf.int32, [None, None], name="input_q")
    self.input_a = tf.placeholder(tf.int32, [None, ], name="input_a")
    self.input_m = tf.placeholder(tf.int32, [None, ], name="input_m")
    self.cbow_mask = tf.placeholder(tf.float32, [None, None, None], name="cbow_mask")
    self.window_size = tf.placeholder(tf.int32, name="window_size")
    self.num_docs = tf.placeholder(tf.int32, name="num_docs")
    self.max_length = tf.placeholder(tf.int32, name="max_length")

    seq_lens_d = tf.reduce_sum(tf.cast(self.input_d >= 0, tf.int32), 1)
    seq_lens_q = tf.reduce_sum(tf.cast(self.input_q >= 0, tf.int32), 1)

    mask_d = tf.cast(tf.sequence_mask(seq_lens_d), tf.int32)
    mask_q = tf.cast(tf.sequence_mask(seq_lens_q), tf.int32)
    mask_m = tf.cast(tf.sequence_mask(self.input_m, maxlen=max_entities), dtype=tf.float32)

    # Document and Query embeddings; One-hot-encoded answers
    masked_d = tf.mul(self.input_d, mask_d)
    masked_q = tf.mul(self.input_q, mask_q)
    one_hot_a = tf.one_hot(self.input_a, self.max_entities)

    # Building Graph (Network Layers)
    # ==================================================
    with tf.device('/cpu:0'), tf.variable_scope("embedding"):
        W_embeddings = tf.get_variable(shape=[vocab_size, embedding_dim], \
            initializer=tf.random_uniform_initializer(-0.01, 0.01),\
            name="W_embeddings")
        ################## Make option to use pre-trained embeddings ##################

        # Dimensions: batch x max_length x embedding_dim
        document_embedding = tf.gather(W_embeddings, masked_d)
        question_embedding = tf.gather(W_embeddings, masked_q)

        #document_embedding = tf.reshape(document_embedding, shape = [self.num_docs, self.window_size, self.max_length / self.window_size, embedding_dim])
        #document_cbow = tf.reduce_mean(document_embedding, 1)
        # Experimental aspect. Combining the CBOW of a sequence of 20 words in every document.
        #document_cbow = tf.batch_matmul(self.cbow_mask, document_embedding)

    with tf.variable_scope("bidirection_rnn"):
        #seq_lens_cbow = tf.reshape(tf.div(seq_lens_d, self.window_size), [-1])
        mask_d = tf.cast(tf.sequence_mask(seq_lens_d), tf.float32)  #or float64?
#mask_d = tf.cast(tf.sequence_mask(seq_lens_d), tf.float32) mask_q = tf.cast(tf.sequence_mask(seq_lens_q), tf.float32) # Bidirectional RNNs for Document and Question forward_cell_d = GRUCell(state_size=hidden_size, input_size=embedding_dim, scope="GRU-Forward-D") backward_cell_d = GRUCell(state_size=hidden_size, input_size=embedding_dim, scope="GRU-Backward-D") forward_cell_q = GRUCell(state_size=hidden_size, input_size=embedding_dim, scope="GRU-Forward-Q") backward_cell_q = GRUCell(state_size=hidden_size, input_size=embedding_dim, scope="GRU-Backward-Q") hidden_states_d, last_state_d = bidirectional_rnn(forward_cell_d, backward_cell_d, \ document_embedding, mask_d, concatenate=True) hidden_states_q, last_state_q = bidirectional_rnn(forward_cell_q, backward_cell_q, \ question_embedding, mask_q, concatenate=True) with tf.variable_scope("attention"): # Attention Layer attention = BilinearFunction(attending_size=hidden_size * 2, attended_size=hidden_size * 2) self.alpha_weights, self.attend_result = attention(attending=last_state_q, attended=hidden_states_d, \ time_mask=mask_d) with tf.variable_scope("prediction"): W_predict = tf.get_variable(name="predict_weight", shape=[hidden_size*2, self.max_entities], \ initializer=tf.random_normal_initializer(mean=0.0, stddev=0.1)) b_predict = tf.get_variable( name="predict_bias", shape=[self.max_entities], initializer=tf.constant_initializer(0.0)) # Dimensions (batch_size x max_entities) predict_probs = (tf.matmul(self.attend_result, W_predict) + b_predict) * mask_m # Custom Softmax b/c need to use time_mask -------------------- # Also numerical stability: # e_x = exp(x - x.max(axis=1)) # out = e_x / e_x.sum(axis=1) numerator = tf.exp( tf.sub(predict_probs, tf.expand_dims(tf.reduce_max(predict_probs, 1), -1))) * mask_m denom = tf.reduce_sum(numerator, 1) # Transpose so broadcasting scalar division works properly # Dimensions (batch x max_entities) #self.predict_probs_normalized = tf.transpose(tf.div(tf.transpose(numerator), denom)) predict_probs_normalized = tf.div(numerator, tf.expand_dims(denom, 1)) likelihoods = tf.reduce_sum( tf.mul(predict_probs_normalized, one_hot_a), 1) log_likelihoods = tf.log(likelihoods + 0.00000000000000000001) # Negative log-likelihood loss self.loss = tf.mul(tf.reduce_sum(log_likelihoods), -1) correct_vector = tf.cast(tf.equal(tf.argmax(one_hot_a, 1), tf.argmax(predict_probs_normalized, 1)), \ tf.float32, name="correct_vector") self.accuracy = tf.reduce_mean(correct_vector)
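# The "custom softmax" above implements exactly the masked, numerically stable softmax
# described in its comment: subtract the per-row max before exponentiating, zero out
# masked entries, then normalize.  A standalone NumPy version of the same computation
# (illustrative only; the graph above already does this with TF ops):
import numpy as np

def masked_softmax(logits, mask):
    """logits, mask: [batch x n]; mask is 1.0 for valid entries and 0.0 for padding."""
    shifted = logits - logits.max(axis=1, keepdims=True)   # numerical stability
    numerator = np.exp(shifted) * mask                     # drop padded entries
    denom = numerator.sum(axis=1, keepdims=True)
    return numerator / denom

# e.g. masked_softmax(np.array([[1.0, 2.0, 3.0]]), np.array([[1.0, 1.0, 0.0]]))
# -> probabilities over the first two entries only, summing to 1.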
inputs = document_embedding reverse_inputs = tf.reverse_sequence(inputs, seq_lens, seq_dim=1, batch_dim=0) forward_cell = GRUCell(state_size, input_size, scope="GRU-Forward") backward_cell = GRUCell(state_size, input_size, scope="GRU-Backward") #forward_outputs, forward_last_state = rnn(forward_cell, inputs, seq_lens, batch_size, embedding_dim) #backward_outputs, backward_last_state = rnn(backward_cell, reverse_inputs, seq_lens, batch_size, embedding_dim) #LS = tf.concat(1, [forward_last_state, backward_last_state]) #LSS = tf.concat(2, [forward_outputs, backward_outputs]) LS, LSS = bidirectional_rnn(forward_cell, backward_cell, inputs, seq_lens, batch_size, embedding_dim, concatenate=True) sess.run(tf.initialize_all_variables()) """ print(forward_last_state.eval(feed)) print(backward_last_state.eval(feed)) print(forward_last_state.eval(feed).shape) # batch x hidden_state """ print(LS.eval(feed)) print(LS.eval(feed).shape) print(LSS.eval(feed)) print(LSS.eval(feed).shape)
def __init__(self, num_classes, vocab_size, hidden_size=128, \ embedding_dim=100, batch_size=32, bidirectional=False): tf.set_random_seed(1234) # Placeholders # can add assert statements to ensure shared None dimensions are equal (batch_size) self.seq_lens = tf.placeholder(tf.int32, [ None, ], name="seq_lens") self.input_x = tf.placeholder(tf.int32, [None, None], name="input_x") self.input_y = tf.placeholder(tf.int32, [ None, ], name="input_y") mask_x = tf.cast(tf.sequence_mask(self.seq_lens), tf.int32) # Document and Query embeddings; One-hot-encoded answers masked_x = tf.mul(self.input_x, mask_x) one_hot_y = tf.one_hot(self.input_y, num_classes) # Buildling Graph (Network Layers) # ================================================== with tf.variable_scope("embedding"): self.W_embeddings = tf.get_variable(shape=[vocab_size, embedding_dim], \ initializer=tf.random_uniform_initializer(-0.01, 0.01),\ name="W_embeddings") # Dimensions: batch x max_length x embedding_dim input_embedding = tf.gather(self.W_embeddings, masked_x) with tf.variable_scope("rnn"): if bidirectional: # Bidirectional RNNs forward_cell = rnn_cell.GRUCell(state_size=hidden_size, input_size=embedding_dim, scope="GRU-Forward") backward_cell = rnn_cell.GRUCell(state_size=hidden_size, input_size=embedding_dim, scope="GRU-Backward") hidden_states, last_state = rnn.bidirectional_rnn(forward_cell, backward_cell, \ input_embedding, self.seq_lens, concatenate=True) else: # One directional RNN (start to end) cell = rnn_cell.GRUCell(state_size=hidden_size, input_size=embedding_dim, scope="GRU") hidden_states, last_state = rnn.rnn(cell, input_embedding, self.seq_lens) with tf.variable_scope("prediction"): if bidirectional: W_predict = tf.get_variable(name="predict_weight", shape=[hidden_size*2, num_classes], \ initializer=tf.random_normal_initializer(mean=0.0, stddev=0.1)) else: W_predict = tf.get_variable(name="predict_weight", shape=[hidden_size, num_classes], \ initializer=tf.random_normal_initializer(mean=0.0, stddev=0.1)) b_predict = tf.get_variable( name="predict_bias", shape=[num_classes], initializer=tf.constant_initializer(0.0)) # Dimensions (batch_size x num_classes) prediction_probs_unnormalized = tf.matmul(last_state, W_predict) + b_predict # Softmax # Dimensions (batch x time) prediction_probs = tf.nn.softmax(prediction_probs_unnormalized, name="prediction_probs") likelihoods = tf.reduce_sum(tf.mul(prediction_probs, one_hot_y), 1) log_likelihoods = tf.log(likelihoods) # Negative log-likelihood loss self.loss = tf.mul(tf.reduce_sum(log_likelihoods), -1) predictions = tf.argmax(prediction_probs, 1, name="predictions") correct_vector = tf.cast(tf.equal(tf.argmax(one_hot_y, 1), tf.argmax(prediction_probs, 1)), \ tf.float32, name="correct_vector") self.accuracy = tf.reduce_mean(correct_vector)
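# The loss above is a summed negative log-likelihood: the one-hot product picks out each
# example's predicted probability for its true class, the logs are summed and negated,
# and accuracy compares argmaxes.  A standalone NumPy sketch of the same objective
# (illustrative; the graph above computes it with TF ops):
import numpy as np

def nll_loss_and_accuracy(probs, labels):
    """probs: [batch x num_classes] softmax outputs; labels: [batch] integer class ids."""
    batch_size = probs.shape[0]
    likelihoods = probs[np.arange(batch_size), labels]   # P(true class) per example
    loss = -np.sum(np.log(likelihoods))                  # summed (not averaged) NLL
    accuracy = np.mean(np.argmax(probs, axis=1) == labels)
    return loss, accuracy

# e.g. nll_loss_and_accuracy(np.array([[0.7, 0.3], [0.2, 0.8]]), np.array([0, 1]))
# -> (-(log 0.7 + log 0.8), 1.0)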
def __init__(self, hidden_size=128, vocab_size=65011, embedding_size=50, embedding_matrix=None, \ embedding_trainable=False, sentence_rep_str="cbow", question_rep_str="cbow"): tf.set_random_seed(1234) # Placeholders # ================================================== # (batch_size * max_sentence_count x max_sentence_length) self.sentences = tf.placeholder(tf.int32, [None, None], name="sentences") self.questions = tf.placeholder(tf.int32, [None, None], name="questions") self.labels = tf.placeholder(tf.int32, [None, ], name="labels") # initialize dimension variables based on contructor arguments if sentence_rep_str == "cbow": attended_size = embedding_size elif sentence_rep_str == "rnn": attended_size = hidden_size*2 else: raise ValueError("Invalid `sentence_rep_str` argument; choose 'cbow' or 'rnn'.") if question_rep_str == "cbow": attending_size = embedding_size elif question_rep_str == "rnn": attending_size = hidden_size*2 else: raise ValueError("Invalid `question_rep_str` argument; choose 'cbow', 'rnn', or 'rnn-attention'.") # Input Preparation (Mask Creation) # ================================================== with tf.variable_scope("masks"): # MASK SENTENCES # (batch_size * mask_sentence_count x max_sentence_length) sentence_mask = tf.cast(self.sentences >= 0, tf.int32) #sentence_mask = tf.sequence_mask(doc_lengths, dtype=tf.int32) masked_sentences = tf.mul(self.sentences, sentence_mask) batch_size = tf.shape(self.questions)[0] # RESHAPE SENTENCE MASK # (batch_size x max_sent_per_doc) batch_mask = tf.reshape(tf.reduce_max(sentence_mask, 1), [batch_size, -1]) answer_counts = tf.cast(tf.reduce_sum(batch_mask, 1), tf.float32) # (batch_size * max_sent_per_doc x 1 x 1) sentence_batch_mask = tf.cast(tf.reshape(batch_mask, [-1, 1, 1]), tf.float32) # MASK QUESTIONS # create mask (batch_size x max_question_length) question_mask = tf.cast(self.questions >= 0, tf.int32) masked_question = tf.mul(question_mask, self.questions) question_mask_float = tf.expand_dims(tf.cast(question_mask, tf.float32), -1) max_sent_per_doc = tf.cast(tf.shape(sentence_mask)[0]/batch_size, tf.int32) # Embeddings # ================================================== with tf.variable_scope("embeddings"): if embedding_matrix is None: self.W_embeddings = tf.get_variable(shape=[vocab_size, embedding_size], \ initializer=tf.random_uniform_initializer(-0.01, 0.01),\ name="W_embeddings", dtype=tf.float32) else: ################## option to use pre-trained embeddings ################## self.W_embeddings = tf.Variable(embedding_matrix, \ name="W_embeddings", dtype=tf.float32, trainable=embedding_trainable) # batch_size * max_sent_per_doc x max_sentence_length x embedding_size sentence_embeddings = tf.gather(self.W_embeddings, masked_sentences) masked_sentence_embeddings = tf.mul(sentence_embeddings, tf.cast(tf.expand_dims(sentence_mask, -1), tf.float32)) # (batch_size x max_question_length x embedding_size) question_embeddings = tf.gather(self.W_embeddings, masked_question) masked_question_embeddings = tf.mul(question_embeddings, question_mask_float) # Sentence Representation (CBOW or RNN) # ================================================== with tf.variable_scope("sentence-representation"): # CBOW ----------------------------------------- if sentence_rep_str == "cbow": # (batch_size * max_sentence_count x embedding_size) cbow_sentences = tf.reduce_mean(masked_sentence_embeddings, 1) # reshape batch to (batch_size x max_doc_length x embedding_size) doc_sentences = tf.reshape(cbow_sentences, [batch_size, -1, embedding_size]) 
self.sentence_representation = cbow_sentences # RNN ----------------------------------------- elif sentence_rep_str == "rnn": self.forward_cell_d = GRUCell(state_size=hidden_size, input_size=embedding_size, scope="GRU-Forward-D") self.backward_cell_d = GRUCell(state_size=hidden_size, input_size=embedding_size, scope="GRU-Backward-D") self.hidden_states_d, last_state_d = bidirectional_rnn(self.forward_cell_d, self.backward_cell_d, \ sentence_embeddings, tf.cast(sentence_mask, tf.float32), concatenate=True) doc_sentences = tf.reshape(last_state_d, [batch_size, -1, hidden_size*2]) self.sentence_representation = last_state_d # Query Representation (CBOW or RNN) # ================================================== with tf.variable_scope("query-representation"): # CBOW ----------------------------------------- if question_rep_str == "cbow": # (batch_size x embedding_size) question_cbow = tf.reduce_mean(masked_question_embeddings, 1) self.question_representation = question_cbow # RNN ----------------------------------------- elif question_rep_str == "rnn": self.forward_cell_q = GRUCell(state_size=hidden_size, input_size=embedding_size, scope="GRU-Forward-Q") self.backward_cell_q = GRUCell(state_size=hidden_size, input_size=embedding_size, scope="GRU-Backward-Q") self.hidden_states_q, last_state_q = bidirectional_rnn(self.forward_cell_q, self.backward_cell_q, \ question_embeddings, tf.cast(question_mask, tf.float32), concatenate=True) #tf.reduce_mean(self.hidden_states_q, ) self.question_representation = last_state_q # Similarity Scoring # ================================================== # Using simple dot product/cosine similiarity as of now (https://arxiv.org/pdf/1605.07427v1.pdf) with tf.variable_scope("similarity-scoring"): # (batch_size x max_sent_per_doc) attention = BilinearFunction(attending_size=attending_size, attended_size=attended_size) alpha_weights, attend_result = attention(self.question_representation, attended=doc_sentences, \ time_mask=tf.cast(batch_mask, tf.float32)) self.probabilities = alpha_weights # Loss # ================================================== with tf.variable_scope("prediction"): one_hot_labels = tf.one_hot(self.labels, max_sent_per_doc, dtype=tf.float32) likelihoods = tf.reduce_sum(tf.mul(self.probabilities, one_hot_labels), 1) likelihoods = tf.div(likelihoods, answer_counts) log_likelihoods = tf.log(likelihoods+0.00000000000000000001) self.loss = tf.div(tf.mul(tf.reduce_sum(log_likelihoods), -1), tf.cast(batch_size, tf.float32)) self.correct_vector = tf.cast(tf.equal(self.labels, tf.cast(tf.argmax(self.probabilities, 1), tf.int32)), tf.float64, name="correct_vector") self.predict_labels = tf.argmax(self.probabilities, 1) self.accuracy = tf.reduce_mean(self.correct_vector)
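# The "cbow" sentence representation above averages the masked word embeddings of each
# sentence: padded positions are zeroed out first, then tf.reduce_mean averages over the
# (padded) sentence-length axis.  A standalone NumPy sketch of that representation,
# using illustrative names (`cbow_sentences` here is not a tensor from the graph above):
import numpy as np

def cbow_sentences(word_ids, embeddings):
    """word_ids: [num_sentences x max_len] with -1 padding; embeddings: [vocab x emb_dim]."""
    mask = (word_ids >= 0).astype(np.float32)            # 1 for real words, 0 for pads
    looked_up = embeddings[word_ids * (word_ids >= 0)]   # clamp pad ids to 0 before lookup
    masked = looked_up * mask[:, :, None]                # zero out padded embeddings
    # As in the graph above, the mean is taken over the full padded length.
    return masked.mean(axis=1)                           # [num_sentences x emb_dim]

# e.g. cbow_sentences(np.array([[1, 2, -1]]), np.random.rand(5, 4)).shape == (1, 4)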