Example #1
def embedding_attention_seq2seq(encoder_inputs, encoder_mask, decoder_inputs, cell,
                                num_encoder_symbols, num_decoder_symbols, embedding_size,
                                beam_size, num_layers=1, num_heads=1, feed_previous=False, dtype=dtypes.float32,
                                scope=None, initial_state_attention=True):
    """Embedding sequence-to-sequence model with attention.

    Args:
        encoder_inputs: A list of 1D int32 Tensors of shape [batch_size].
        encoder_mask: The mask of input sentences denoting padding positions.
        decoder_inputs: A list of 1D int32 Tensors of shape [batch_size].
        cell: rnn_cell.RNNCell defining the cell function and size.
        num_encoder_symbols: Integer; number of symbols on the encoder side.
        num_decoder_symbols: Integer; number of symbols on the decoder side.
        embedding_size: Integer, the length of the embedding vector for each symbol.
        beam_size: Integer, the beam size used in beam search.
        num_heads: Number of attention heads that read from attention_states.
        feed_previous: Boolean, if True, only the first of decoder_inputs will be used (the "GO" symbol).
        dtype: The dtype of the initial RNN state (default: tf.float32).
        scope: VariableScope for the created subgraph; defaults to "embedding_attention_seq2seq".
        initial_state_attention: If False, initial attentions are zero.
            If True (the default here), initialize the attentions from the
            initial state and attention states.

    Returns:
        A tuple of the form (outputs, state, symbols), where:
            outputs: A list of the same length as decoder_inputs of 2D Tensors of
                  shape [batch_size x output_size].
            state: The state of each decoder cell at the final time-step.
                It is a 2D Tensor of shape [batch_size x cell.state_size].
            symbols: A list of target word ids; the best results returned by beam search.
    """
    with variable_scope.variable_scope(scope or "embedding_attention_seq2seq"):
        embedding = variable_scope.get_variable(
                "embedding", [num_encoder_symbols, embedding_size], dtype=dtype,
                initializer=init_ops.random_normal_initializer(0, 0.01, seed=SEED))
        encoder_cell = rnn_cell.EmbeddingWrapper(cell, embedding_classes=num_encoder_symbols,
                embedding_size=embedding_size, embedding=embedding)

        encoder_lens = math_ops.reduce_sum(encoder_mask, [1])

        encoder_outputs, _, encoder_state = rnn.bidirectional_rnn(
                encoder_cell, encoder_cell, encoder_inputs, sequence_length=encoder_lens, dtype=dtype)

        assert encoder_cell._embedding is embedding

        # First calculate a concatenation of encoder outputs to put attention on.
        top_states = [array_ops.reshape(e, [-1, 1, 2 * cell.output_size]) for e in encoder_outputs]
        attention_states = array_ops.concat(1, top_states)

        # Decoder.
        output_size = None

        return embedding_attention_decoder(encoder_mask, decoder_inputs, encoder_state, attention_states, cell,
                                           num_decoder_symbols, embedding_size, beam_size=beam_size,
                                           num_heads=num_heads, output_size=output_size, num_layers=num_layers,
                                           feed_previous=feed_previous,
                                           initial_state_attention=initial_state_attention)
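
A minimal NumPy sketch (independent of the TensorFlow version used above) of how the 0/1 padding mask becomes per-sentence lengths; this is all that reduce_sum(encoder_mask, [1]) computes:

import numpy as np

encoder_mask = np.array([[1, 1, 1, 0, 0],      # 1 = real token, 0 = PAD
                         [1, 1, 1, 1, 1]])
encoder_lens = encoder_mask.sum(axis=1)        # analogous to reduce_sum(encoder_mask, [1])
print(encoder_lens)                            # [3 5]
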
Example #2
def embedding_attention_seq2seq(encoder_inputs_1, encoder_inputs_2, encoder_mask_1, encoder_mask_2, decoder_inputs, cell,
                                num_encoder_symbols_1, num_encoder_symbols_2, num_decoder_symbols, # added by al
                                embedding_size,
                                beam_size,  # added by shiyue
                                constant_emb_en, # added by al
                                constant_emb_fr, # added by al
                                num_heads=1, output_projection=None,
                                feed_previous=False, dtype=dtypes.float32,
                                scope=None,
                                # initial_state_attention=False  #annotated by yfeng
                                initial_state_attention=True  # added by yfeng
                                ):
    """Embedding sequence-to-sequence model with attention.

    This model first embeds encoder_inputs by a newly created embedding (of shape
    [num_encoder_symbols x input_size]). Then it runs an RNN to encode
    embedded encoder_inputs into a state vector. It keeps the outputs of this
    RNN at every step to use for attention later. Next, it embeds decoder_inputs
    by another newly created embedding (of shape [num_decoder_symbols x
    input_size]). Then it runs the attention decoder, initialized with the last
    encoder state, on embedded decoder_inputs and attending to encoder outputs.

    Args:
      encoder_inputs_1, encoder_inputs_2: Lists of 1D int32 Tensors of shape
        [batch_size], one per encoder.
      encoder_mask_1, encoder_mask_2: The masks of the two encoder inputs,
        denoting padding positions.
      decoder_inputs: A list of 1D int32 Tensors of shape [batch_size].
      cell: rnn_cell.RNNCell defining the cell function and size.
      num_encoder_symbols_1, num_encoder_symbols_2: Integers; number of symbols
        on each encoder side.
      num_decoder_symbols: Integer; number of symbols on the decoder side.
      embedding_size: Integer, the length of the embedding vector for each symbol.
      beam_size: Integer, the beam size used in beam search.
      constant_emb_en, constant_emb_fr: Pre-trained embedding matrices used to
        initialize the constant (non-trainable) embeddings.
      num_heads: Number of attention heads that read from attention_states.
      output_projection: None or a pair (W, B) of output projection weights and
        biases; W has shape [output_size x num_decoder_symbols] and B has
        shape [num_decoder_symbols]; if provided and feed_previous=True, each
        fed previous output will first be multiplied by W and added B.
      feed_previous: Boolean or scalar Boolean Tensor; if True, only the first
        of decoder_inputs will be used (the "GO" symbol), and all other decoder
        inputs will be taken from previous outputs (as in embedding_rnn_decoder).
        If False, decoder_inputs are used as given (the standard decoder case).
      dtype: The dtype of the initial RNN state (default: tf.float32).
      scope: VariableScope for the created subgraph; defaults to
        "embedding_attention_seq2seq".
      initial_state_attention: If False, initial attentions are zero.
        If True (the default here), initialize the attentions from the initial
        state and attention states.

    Returns:
      A tuple of the form (outputs, state), where:
        outputs: A list of the same length as decoder_inputs of 2D Tensors with
          shape [batch_size x num_decoder_symbols] containing the generated
          outputs.
        state: The state of each decoder cell at the final time-step.
          It is a 2D Tensor of shape [batch_size x cell.state_size].
    """
    with variable_scope.variable_scope(scope or "embedding_attention_seq2seq"):
        # Encoder.
        # annotated by yfeng
        """
        encoder_cell = rnn_cell.EmbeddingWrapper(
            cell, embedding_classes=num_encoder_symbols,
            embedding_size=embedding_size)
        encoder_outputs, encoder_state = rnn.rnn(
            encoder_cell, encoder_inputs, dtype=dtype)

        # First calculate a concatenation of encoder outputs to put attention on.
        top_states = [array_ops.reshape(e, [-1, 1, cell.output_size])
                      for e in encoder_outputs]
        """
        # start by yfeng
        # sqrt3 = math.sqrt(3)  # Uniform(-sqrt(3), sqrt(3)) has variance=1.
        '''
        embedding_1 = variable_scope.get_variable(
                "embedding_1", [num_encoder_symbols_1, embedding_size],
                dtype=dtype,
                initializer=init_ops.random_normal_initializer(0, 0.01, seed=SEED))  # annotated by yfeng
        embedding_2 = variable_scope.get_variable( # added by al
                "embedding_2", [num_encoder_symbols_2, embedding_size],
                dtype=dtype,
                initializer=init_ops.random_normal_initializer(0, 0.01, seed=SEED))  # annotated by yfeng
        '''
        embedding_1 = variable_scope.get_variable(
                "embedding_1", [num_encoder_symbols_1, embedding_size],
                dtype=dtype,
                trainable=False,
                initializer=init_ops.constant_initializer(constant_emb_en))  # for constant embedding
        embedding_2 = variable_scope.get_variable( # added by al
                "embedding_2", [num_encoder_symbols_2, embedding_size],
                dtype=dtype,
                trainable=False,
                initializer=init_ops.constant_initializer(constant_emb_fr))  # for constant embedding

        # initializer = init_ops.random_normal_initializer(0, 0.01, seed=1.0)) #change from uniform to normal by yfeng
        encoder_lens_1 = math_ops.reduce_sum(encoder_mask_1, [1])
        encoder_lens_2 = math_ops.reduce_sum(encoder_mask_2, [1])
        
        with variable_scope.variable_scope("encoder_1"):
            encoder_cell_1 = rnn_cell.EmbeddingWrapper(
                    cell, embedding_classes=num_encoder_symbols_1,
                    embedding_size=embedding_size, embedding=embedding_1)
            encoder_outputs_1, _, encoder_state_1 = rnn.bidirectional_rnn(
                    encoder_cell_1, encoder_cell_1, encoder_inputs_1, sequence_length=encoder_lens_1, dtype=dtype)

        
        with variable_scope.variable_scope("encoder_2"):
            encoder_cell_2 = rnn_cell.EmbeddingWrapper(
                    cell, embedding_classes=num_encoder_symbols_2,
                    embedding_size=embedding_size, embedding=embedding_2)
            encoder_outputs_2, _, encoder_state_2 = rnn.bidirectional_rnn(
                    encoder_cell_2, encoder_cell_2, encoder_inputs_2, sequence_length=encoder_lens_2, dtype=dtype)

        # NOTE: alpha and beta (interpolation weights for the two encoder states)
        # are not defined in this snippet and are assumed to be set elsewhere.
        encoder_state = alpha * encoder_state_1 + beta * encoder_state_2  # this can be changed

        assert encoder_cell_1._embedding is embedding_1
        assert encoder_cell_2._embedding is embedding_2

        # First calculate a concatenation of encoder outputs to put attention on.
        top_states_1 = [array_ops.reshape(e, [-1, 1, 2 * cell.output_size])
                      for e in encoder_outputs_1]
        top_states_2 = [array_ops.reshape(e, [-1, 1, 2 * cell.output_size])
                      for e in encoder_outputs_2]
        # end by yfeng
        attention_states_1 = array_ops.concat(1, top_states_1)
        attention_states_2 = array_ops.concat(1, top_states_2)

        # Decoder.
        output_size = None
        if output_projection is None:
            cell = rnn_cell.OutputProjectionWrapper(cell, num_decoder_symbols)
            output_size = num_decoder_symbols

        if isinstance(feed_previous, bool):
            return embedding_attention_decoder(encoder_mask_1, encoder_mask_2, 
                                               decoder_inputs, encoder_state, attention_states_1, attention_states_2, cell,
                                               num_decoder_symbols, embedding_size,
                                               beam_size=beam_size,  # added by shiyue
                                               constant_emb_fr=constant_emb_fr, # added by al
                                               num_heads=num_heads,
                                               output_size=output_size, output_projection=output_projection,
                                               feed_previous=feed_previous,
                                               initial_state_attention=initial_state_attention)

        # If feed_previous is a Tensor, we construct 2 graphs and use cond.
        def decoder(feed_previous_bool):
            reuse = None if feed_previous_bool else True
            with variable_scope.variable_scope(variable_scope.get_variable_scope(),
                                               reuse=reuse):
                outputs, state, _ = embedding_attention_decoder(encoder_mask_1, encoder_mask_2,   # modified by shiyue
                                                                decoder_inputs, encoder_state, attention_states_1, attention_states_2, cell,
                                                                num_decoder_symbols, embedding_size,
                                                                beam_size=beam_size,  # added by shiyue
                                                                constant_emb_fr=constant_emb_fr, # added by al
                                                                num_heads=num_heads,
                                                                output_size=output_size,
                                                                output_projection=output_projection,
                                                                feed_previous=feed_previous_bool,
                                                                update_embedding_for_previous=False,
                                                                initial_state_attention=initial_state_attention)
                state_list = [state]
                if nest.is_sequence(state):
                    state_list = nest.flatten(state)
                return outputs + state_list

        outputs_and_state = control_flow_ops.cond(feed_previous,
                                                  lambda: decoder(True),
                                                  lambda: decoder(False))
        outputs_len = len(decoder_inputs)  # Outputs length same as decoder inputs.
        state_list = outputs_and_state[outputs_len:]
        state = state_list[0]
        if nest.is_sequence(encoder_state):
            state = nest.pack_sequence_as(structure=encoder_state,
                                          flat_sequence=state_list)
        return outputs_and_state[:outputs_len], state
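
The weighted combination of the two encoder states depends on alpha and beta, which the snippet never defines. A NumPy sketch with made-up scalar weights, just to illustrate the intended combination:

import numpy as np

alpha, beta = 0.5, 0.5                          # hypothetical interpolation weights
encoder_state_1 = np.random.rand(32, 256)       # [batch_size, state_size]
encoder_state_2 = np.random.rand(32, 256)
encoder_state = alpha * encoder_state_1 + beta * encoder_state_2
print(encoder_state.shape)                      # (32, 256)
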
Example #3
    def __init__(self,
                 hidden_size=10,
                 vocab_size=42791,
                 embedding_size=50,
                 embedding_matrix=None):
        #tf.set_random_seed(1234)

        # Placeholders
        # ==================================================
        # (batch_size * max_sentence_count x max_sentence_length)
        self.sentences = tf.placeholder(tf.int32, [None, None],
                                        name="sentences")
        self.questions = tf.placeholder(tf.int32, [None, None],
                                        name="questions")
        self.labels = tf.placeholder(tf.int32, [
            None,
        ], name="labels")

        batch_size = tf.shape(self.questions)[0]
        #self.batch_size = tf.placeholder(tf.int32, name="batch_size")
        sentences = self.sentences
        questions = self.questions
        labels = self.labels

        # (batch_size * max_sentence_count x max_sentence_length)
        sentence_mask = tf.cast(sentences >= 0, tf.int32)
        self.sentence_mask = sentence_mask
        #sentence_mask = tf.sequence_mask(doc_lengths, dtype=tf.int32)
        masked_sentences = tf.mul(sentences, sentence_mask)

        max_sent_per_doc = tf.cast(
            tf.shape(sentence_mask)[0] / batch_size, tf.int32)
        self.max_sent_per_doc = max_sent_per_doc

        # Input Preparation
        # ==================================================
        with tf.variable_scope("embeddings"):
            if embedding_matrix is None:
                self.W_embeddings = tf.get_variable(shape=[vocab_size, embedding_size], \
                                               initializer=tf.random_uniform_initializer(-0.01, 0.01),\
                                               name="W_embeddings", dtype=tf.float32)
            else:
                ################## option to use pre-trained embeddings ##################
                self.W_embeddings = tf.Variable(embedding_matrix, \
                                               name="W_embeddings", dtype=tf.float32)

            # SENTENCES MASKED
            # (batch_size x max_sent_per_doc)
            batch_mask = tf.reshape(tf.reduce_max(sentence_mask, 1),
                                    [batch_size, -1])
            # (batch_size * max_sent_per_doc x 1 x 1)
            sentence_batch_mask = tf.cast(tf.reshape(batch_mask, [-1, 1, 1]),
                                          tf.float32)

            # batch_size * max_sent_per_doc x max_sentence_length x embedding_size
            sentence_embeddings = tf.gather(self.W_embeddings,
                                            masked_sentences)
            masked_sentence_embeddings = tf.mul(
                sentence_embeddings,
                tf.cast(tf.expand_dims(sentence_mask, -1), tf.float32))

            # QUERY MASKED
            # create mask (batch_size x max_question_length)
            question_mask = tf.cast(questions > 0, tf.int32)
            self.question_mask = question_mask
            masked_question = tf.mul(question_mask, questions)
            self.masked_question = masked_question

            # (batch_size x max_question_length x embedding_size)
            question_embeddings = tf.gather(self.W_embeddings, masked_question)
            self.question_embeddings = question_embeddings
            question_mask_float = tf.expand_dims(
                tf.cast(question_mask, tf.float32), -1)
            masked_question_embeddings = tf.mul(question_embeddings,
                                                question_mask_float)
            self.masked_question_embeddings = masked_question_embeddings

        # CBOW Sentence Representation
        # ==================================================
        with tf.variable_scope("sentence-representation"):

            self.forward_cell_d = GRUCell(state_size=hidden_size,
                                          input_size=embedding_size,
                                          scope="GRU-Forward-D")
            self.backward_cell_d = GRUCell(state_size=hidden_size,
                                           input_size=embedding_size,
                                           scope="GRU-Backward-D")

            self.hidden_states_d, last_state_d = bidirectional_rnn(self.forward_cell_d, self.backward_cell_d, \
                sentence_embeddings, tf.cast(sentence_mask, tf.float32), concatenate=True)

            doc_sentences = tf.reshape(last_state_d,
                                       [batch_size, -1, hidden_size * 2])
            self.cbow_sentences = doc_sentences

            # # (batch_size * max_sentence_count x embedding_size)
            # cbow_sentences = tf.reduce_mean(masked_sentence_embeddings, 1)
            # self.cbow_sentences = cbow_sentences
            # # reshape batch to (batch_size x max_doc_length x embedding_size)
            # doc_sentences = tf.reshape(cbow_sentences, [batch_size, -1, embedding_size])

        # Query Representation
        # ==================================================
        with tf.variable_scope("query-representation"):
            # easy baseline: cbow
            # (batch_size x embedding_size)

            #question_cbow = tf.reduce_mean(masked_question_embeddings, 1)
            #self.question_cbow = question_cbow

            self.forward_cell_q = GRUCell(state_size=hidden_size,
                                          input_size=embedding_size,
                                          scope="GRU-Forward-Q")
            self.backward_cell_q = GRUCell(state_size=hidden_size,
                                           input_size=embedding_size,
                                           scope="GRU-Backward-Q")

            self.hidden_states_q, last_state_q = bidirectional_rnn(self.forward_cell_q, self.backward_cell_q, \
                question_embeddings, tf.cast(question_mask, tf.float32), concatenate=True)

            self.question_cbow = last_state_q

            # can use RNN representation as well*************************************

        # Similarity Scoring
        # ==================================================
        # Using simple dot product/cosine similarity as of now (https://arxiv.org/pdf/1605.07427v1.pdf)

        with tf.variable_scope("similarity-scoring"):
            # (batch_size x max_sent_per_doc)
            #"""
            # dot_prod = tf.squeeze(tf.batch_matmul(doc_sentences, tf.expand_dims(question_cbow, -1)), [-1])
            # self.dot_prod = dot_prod
            #
            # # softmax
            # numerator = tf.exp(tf.sub(dot_prod, tf.expand_dims(tf.reduce_max(dot_prod, 1), -1))) * tf.cast(batch_mask, tf.float32)
            # denom = tf.reduce_sum(numerator, 1)
            #
            # # Dimensions (batch x time)
            # probabilities = tf.div(numerator, tf.expand_dims(denom, 1))
            #"""

            # #(batch_size x max_sent_per_doc)
            # sentence_norm = tf.sqrt(tf.reduce_sum(tf.mul(doc_sentences, doc_sentences), -1))
            # self.sentence_norm = sentence_norm
            # # (batch_size)
            # question_norm = tf.sqrt(tf.reduce_sum(tf.mul(question_cbow, question_cbow), 1))
            # self.question_norm = question_norm
            #
            # denom = tf.mul(sentence_norm, tf.expand_dims(question_norm, -1))+1e-30
            # self.denom = denom
            # # (batch_size x max_sent_per_doc) - scalars between -1 and +1
            # cosine_similarity = tf.div(dot_prod, denom)
            # self.cosine_similarity = cosine_similarity
            #
            # masked_pos_cos_sim = tf.sub(tf.add(cosine_similarity, 1), tf.cast(batch_mask < 1, tf.float32))
            # self.masked_pos_cos_sim = masked_pos_cos_sim
            # normalized_cos_sim = tf.div(masked_pos_cos_sim, tf.expand_dims(tf.reduce_sum(masked_pos_cos_sim, 1), -1))

            #"""
            attention = BilinearFunction(attending_size=hidden_size * 2,
                                         attended_size=hidden_size * 2)
            alpha_weights, attend_result = attention(self.question_cbow, attended=doc_sentences, \
                time_mask=tf.cast(batch_mask, tf.float32))
            probabilities = alpha_weights

            #"""

            #probabilities = tf.abs(dot_prod) #normalized_cos_sim
            self.probabilities = probabilities

        with tf.variable_scope("prediction"):
            one_hot_labels = tf.one_hot(labels,
                                        max_sent_per_doc,
                                        dtype=tf.float32)

            likelihoods = tf.reduce_sum(tf.mul(probabilities, one_hot_labels),
                                        1)
            log_likelihoods = tf.log(likelihoods + 0.00000000000000000001)
            self.loss = tf.mul(tf.reduce_sum(log_likelihoods), -1)
            correct_vector = tf.cast(tf.equal(labels, tf.cast(tf.argmax(probabilities, 1), tf.int32)), \
                tf.float32, name="correct_vector")
            self.accuracy = tf.reduce_mean(correct_vector)
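
BilinearFunction is defined elsewhere in this project and not shown here. Assuming it implements a standard bilinear attention (score_i = attended_i . W . attending, followed by a masked softmax and a weighted sum), a small NumPy sketch of that idea:

import numpy as np

batch, n_sent, dim = 2, 4, 6
attending = np.random.rand(batch, dim)           # e.g. the question vector
attended  = np.random.rand(batch, n_sent, dim)   # e.g. the sentence vectors
mask = np.array([[1, 1, 1, 0], [1, 1, 1, 1]], dtype=float)
W = np.random.rand(dim, dim)                     # hypothetical bilinear weights

scores = np.einsum('bsd,de,be->bs', attended, W, attending)
scores = np.where(mask > 0, scores, -1e30)       # ignore padded sentences
alpha = np.exp(scores - scores.max(axis=1, keepdims=True)) * mask
alpha = alpha / alpha.sum(axis=1, keepdims=True)          # alpha_weights
attend_result = np.einsum('bs,bsd->bd', alpha, attended)  # weighted sum
print(alpha.shape, attend_result.shape)          # (2, 4) (2, 6)
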
Example #4
def embedding_attention_seq2seq(encoder_inputs,
                                encoder_mask,
                                decoder_inputs,
                                cell,
                                num_encoder_symbols,
                                num_decoder_symbols,
                                embedding_size,
                                beam_size,
                                output_projection=None,
                                num_layers=1,
                                feed_previous=False,
                                dtype=dtypes.float32,
                                scope=None,
                                initial_state_attention=True):
    """Embedding sequence-to-sequence model with attention.

    This model first embeds encoder_inputs by a newly created embedding (of shape
    [num_encoder_symbols x input_size]). Then it runs a bidirectional RNN to encode
    the embedded encoder_inputs into a state vector. It keeps the outputs of this
    bidirectional RNN at every step to use for attention later. Next, it embeds
    decoder_inputs by another newly created embedding (of shape
    [num_decoder_symbols x input_size]). Then it runs the attention decoder,
    initialized with the last encoder state, on the embedded decoder_inputs,
    attending to the encoder outputs.

    Args:
      encoder_inputs: A list of 1D int32 Tensors of shape [batch_size].
      encoder_mask: The mask of the encoder inputs, marking where the PAD positions are.
      decoder_inputs: A list of 1D int32 Tensors of shape [batch_size].
      cell: rnn_cell.RNNCell defining the cell function and size.
      num_encoder_symbols: Integer; number of symbols on the encoder side.
      num_decoder_symbols: Integer; number of symbols on the decoder side.
      embedding_size: Integer, the length of the embedding vector for each symbol.
      output_projection: None or a pair (W, B) of output projection weights and
        biases; W has shape [output_size x num_decoder_symbols] and B has
        shape [num_decoder_symbols]; if provided and feed_previous=True, each
        fed previous output will first be multiplied by W and added B.
      feed_previous: Boolean or scalar Boolean Tensor; if True, only the first
        of decoder_inputs will be used (the "GO" symbol), and all other decoder
        inputs will be taken from previous outputs (as in embedding_rnn_decoder).
        If False, decoder_inputs are used as given (the standard decoder case).
      dtype: The dtype of the initial RNN state (default: tf.float32).
      scope: VariableScope for the created subgraph; defaults to
        "embedding_attention_seq2seq".
      initial_state_attention: If False, initial attentions are zero.
        If True (the default here), initialize the attentions from the initial
        state and attention states.

    Returns:
      A tuple of the form (outputs, state, symbols), where:
        outputs: A list of the same length as decoder_inputs of 2D Tensors with
          shape [batch_size x num_decoder_symbols] containing the generated
          outputs.
        state: The state of each decoder cell at the final time-step.
          It is a 2D Tensor of shape [batch_size x cell.state_size].
        symbols: When training, it is []; when decoding, it is the best translation
          generated by beam search.
    """
    with variable_scope.variable_scope(scope or "embedding_attention_seq2seq"):
        # word embeddings of source words
        embedding = variable_scope.get_variable(
            "embedding", [num_encoder_symbols, embedding_size],
            dtype=dtype,
            initializer=init_ops.random_normal_initializer(0, 0.01, seed=SEED))
        # wrap encoder cell with embedding
        encoder_cell = rnn_cell.EmbeddingWrapper(
            cell,
            embedding_classes=num_encoder_symbols,
            embedding_size=embedding_size,
            embedding=embedding)

        # get the sentence lengths of source sentences
        encoder_lens = math_ops.reduce_sum(encoder_mask, [1])

        # encode source sentences with a bidirectional_rnn encoder
        encoder_outputs, _, encoder_state = rnn.bidirectional_rnn(
            encoder_cell,
            encoder_cell,
            encoder_inputs,
            sequence_length=encoder_lens,
            dtype=dtype)
        # First calculate a concatenation of encoder outputs.
        top_states = [
            array_ops.reshape(e, [-1, 1, 2 * cell.output_size])
            for e in encoder_outputs
        ]
        attention_states = array_ops.concat(top_states, 1)

        # Decoder.
        output_size = None
        if output_projection is None:
            cell = rnn_cell.OutputProjectionWrapper(cell, num_decoder_symbols)
            output_size = num_decoder_symbols

        return embedding_attention_decoder(
            encoder_mask,
            decoder_inputs,
            encoder_state,
            attention_states,
            cell,
            num_decoder_symbols,
            embedding_size,
            beam_size=beam_size,
            output_size=output_size,
            output_projection=output_projection,
            num_layers=num_layers,
            feed_previous=feed_previous,
            initial_state_attention=initial_state_attention)
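
A NumPy sketch of how attention_states is assembled above: each time step's bidirectional output of shape [batch_size, 2 * output_size] is reshaped to [batch_size, 1, 2 * output_size] and the steps are concatenated along the time axis:

import numpy as np

batch, output_size, steps = 8, 64, 5
encoder_outputs = [np.random.rand(batch, 2 * output_size) for _ in range(steps)]
top_states = [e.reshape(-1, 1, 2 * output_size) for e in encoder_outputs]
attention_states = np.concatenate(top_states, axis=1)
print(attention_states.shape)                   # (8, 5, 128)
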
Example #5
    # Dimensions: batch x max_length x embedding_dim
    document_embedding = tf.gather(W_embeddings, masked_d)
    question_embedding = tf.gather(W_embeddings, masked_q)

with tf.variable_scope("bidirection_rnn"):
    # Bidirectional RNNs for Document and Question
    forward_cell_d = GRUCell(state_size, input_size, scope="GRU-Forward-D")
    backward_cell_d = GRUCell(state_size, input_size, scope="GRU-Backward-D")

    forward_cell_q = GRUCell(state_size, input_size, scope="GRU-Forward-Q")
    backward_cell_q = GRUCell(state_size, input_size, scope="GRU-Backward-Q")

    # hidden_states_d, last_state_d = rnn(forward_cell_d, document_embedding, seq_lens_d)
    # hidden_states_q, last_state_q = rnn(forward_cell_q, question_embedding, seq_lens_q)

    hidden_states_d, last_state_d = bidirectional_rnn(forward_cell_d, backward_cell_d, \
        document_embedding, seq_lens_d, concatenate=True)

    hidden_states_q, last_state_q = bidirectional_rnn(forward_cell_q, backward_cell_q, \
        question_embedding, seq_lens_q, concatenate=True)

with tf.variable_scope("attention"):
    time_mask = tf.sequence_mask(seq_lens_d, dtype=tf.float32)
    # Attention Layer
    attention = BilinearFunction(attending_size=state_size * 2,
                                 attended_size=state_size * 2)
    alpha_weights, attend_result = attention(attending=last_state_q, attended=hidden_states_d, \
       seq_lens=seq_lens_d)

with tf.variable_scope("prediction"):
    W_predict = tf.get_variable(name="predict_weight", shape=[state_size*2, max_entities], \
        initializer=tf.random_normal_initializer(mean=0.0, stddev=0.1))
    def __init__(self,
                 max_entities,
                 hidden_size=128,
                 vocab_size=50000,
                 embedding_dim=100,
                 batch_size=32):
        self.max_entities = max_entities
        tf.set_random_seed(1234)

        # Placeholders
        # can add assert statements to ensure shared None dimensions are equal (batch_size)
        self.input_d = tf.placeholder(tf.int32, [None, None], name="input_d")
        self.input_q = tf.placeholder(tf.int32, [None, None], name="input_q")
        self.input_a = tf.placeholder(tf.int32, [
            None,
        ], name="input_a")
        self.input_m = tf.placeholder(tf.int32, [
            None,
        ], name="input_m")
        self.cbow_mask = tf.placeholder(tf.float32, [None, None, None],
                                        name="cbow_mask")
        self.window_size = tf.placeholder(tf.int32, name="window_size")
        self.num_docs = tf.placeholder(tf.int32, name="num_docs")
        self.max_length = tf.placeholder(tf.int32, name="max_length")

        seq_lens_d = tf.reduce_sum(tf.cast(self.input_d >= 0, tf.int32), 1)
        seq_lens_q = tf.reduce_sum(tf.cast(self.input_q >= 0, tf.int32), 1)

        mask_d = tf.cast(tf.sequence_mask(seq_lens_d), tf.int32)
        mask_q = tf.cast(tf.sequence_mask(seq_lens_q), tf.int32)
        mask_m = tf.cast(tf.sequence_mask(self.input_m, maxlen=max_entities),
                         dtype=tf.float32)

        # Document and Query embeddings; One-hot-encoded answers
        masked_d = tf.mul(self.input_d, mask_d)
        masked_q = tf.mul(self.input_q, mask_q)
        one_hot_a = tf.one_hot(self.input_a, self.max_entities)

        # Building Graph (Network Layers)
        # ==================================================
        with tf.device('/cpu:0'), tf.variable_scope("embedding"):
            W_embeddings = tf.get_variable(shape=[vocab_size, embedding_dim], \
                                           initializer=tf.random_uniform_initializer(-0.01, 0.01),\
                                           name="W_embeddings")
            ################## Make option to use pre-trained embeddings ##################

            # Dimensions: batch x max_length x embedding_dim
            document_embedding = tf.gather(W_embeddings, masked_d)
            question_embedding = tf.gather(W_embeddings, masked_q)

            #document_embedding = tf.reshape(document_embedding, shape = [self.num_docs, self.window_size, self.max_length / self.window_size, embedding_dim])
            #document_cbow = tf.reduce_mean(document_embedding, 1)

            # Experimental aspect. Combining the CBOW of a sequence of 20 words in every document.
            #document_cbow = tf.batch_matmul(self.cbow_mask, document_embedding)

        with tf.variable_scope("bidirection_rnn"):
            #seq_lens_cbow = tf.reshape(tf.div(seq_lens_d, self.window_size), [-1])

            mask_d = tf.cast(tf.sequence_mask(seq_lens_d), tf.float32)  # or float64?
            mask_q = tf.cast(tf.sequence_mask(seq_lens_q), tf.float32)

            # Bidirectional RNNs for Document and Question
            forward_cell_d = GRUCell(state_size=hidden_size,
                                     input_size=embedding_dim,
                                     scope="GRU-Forward-D")
            backward_cell_d = GRUCell(state_size=hidden_size,
                                      input_size=embedding_dim,
                                      scope="GRU-Backward-D")

            forward_cell_q = GRUCell(state_size=hidden_size,
                                     input_size=embedding_dim,
                                     scope="GRU-Forward-Q")
            backward_cell_q = GRUCell(state_size=hidden_size,
                                      input_size=embedding_dim,
                                      scope="GRU-Backward-Q")

            hidden_states_d, last_state_d = bidirectional_rnn(forward_cell_d, backward_cell_d, \
                document_embedding, mask_d, concatenate=True)

            hidden_states_q, last_state_q = bidirectional_rnn(forward_cell_q, backward_cell_q, \
                question_embedding, mask_q, concatenate=True)

        with tf.variable_scope("attention"):
            # Attention Layer
            attention = BilinearFunction(attending_size=hidden_size * 2,
                                         attended_size=hidden_size * 2)
            self.alpha_weights, self.attend_result = attention(attending=last_state_q, attended=hidden_states_d, \
                time_mask=mask_d)

        with tf.variable_scope("prediction"):
            W_predict = tf.get_variable(name="predict_weight", shape=[hidden_size*2, self.max_entities], \
                initializer=tf.random_normal_initializer(mean=0.0, stddev=0.1))
            b_predict = tf.get_variable(
                name="predict_bias",
                shape=[self.max_entities],
                initializer=tf.constant_initializer(0.0))
            # Dimensions (batch_size x max_entities)
            predict_probs = (tf.matmul(self.attend_result, W_predict) +
                             b_predict) * mask_m

            # Custom Softmax b/c need to use time_mask --------------------
            # Also numerical stability:

            # e_x = exp(x - x.max(axis=1))
            # out = e_x / e_x.sum(axis=1)
            numerator = tf.exp(
                tf.sub(predict_probs,
                       tf.expand_dims(tf.reduce_max(predict_probs, 1),
                                      -1))) * mask_m
            denom = tf.reduce_sum(numerator, 1)

            # Transpose so broadcasting scalar division works properly
            # Dimensions (batch x max_entities)
            #self.predict_probs_normalized = tf.transpose(tf.div(tf.transpose(numerator), denom))
            predict_probs_normalized = tf.div(numerator,
                                              tf.expand_dims(denom, 1))
            likelihoods = tf.reduce_sum(
                tf.mul(predict_probs_normalized, one_hot_a), 1)
            log_likelihoods = tf.log(likelihoods + 0.00000000000000000001)
            # Negative log-likelihood loss
            self.loss = tf.mul(tf.reduce_sum(log_likelihoods), -1)
            correct_vector = tf.cast(tf.equal(tf.argmax(one_hot_a, 1), tf.argmax(predict_probs_normalized, 1)), \
                tf.float32, name="correct_vector")
            self.accuracy = tf.reduce_mean(correct_vector)
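
The prediction scope implements its own masked, numerically stable softmax. A NumPy sketch of the same computation (subtract the row maximum, zero out invalid entity slots, renormalize):

import numpy as np

logits = np.array([[2.0, 1.0, 0.5, -1.0],
                   [0.3, 0.7, 0.0,  0.0]])
mask_m = np.array([[1., 1., 1., 0.],
                   [1., 1., 0., 0.]])           # 1 = valid entity slot

numerator = np.exp(logits - logits.max(axis=1, keepdims=True)) * mask_m
probs = numerator / numerator.sum(axis=1, keepdims=True)
print(probs.sum(axis=1))                        # each row sums to 1
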
Example #7
inputs = document_embedding
reverse_inputs = tf.reverse_sequence(inputs, seq_lens, seq_dim=1, batch_dim=0)

forward_cell = GRUCell(state_size, input_size, scope="GRU-Forward")
backward_cell = GRUCell(state_size, input_size, scope="GRU-Backward")

#forward_outputs, forward_last_state = rnn(forward_cell, inputs, seq_lens, batch_size, embedding_dim)
#backward_outputs, backward_last_state = rnn(backward_cell, reverse_inputs, seq_lens, batch_size, embedding_dim)

#LS = tf.concat(1, [forward_last_state, backward_last_state])
#LSS = tf.concat(2, [forward_outputs, backward_outputs])

LS, LSS = bidirectional_rnn(forward_cell,
                            backward_cell,
                            inputs,
                            seq_lens,
                            batch_size,
                            embedding_dim,
                            concatenate=True)

sess.run(tf.initialize_all_variables())
"""
print(forward_last_state.eval(feed))
print(backward_last_state.eval(feed))
print(forward_last_state.eval(feed).shape) # batch x hidden_state
"""
print(LS.eval(feed))
print(LS.eval(feed).shape)
print(LSS.eval(feed))
print(LSS.eval(feed).shape)
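
A NumPy sketch of what tf.reverse_sequence does in this snippet: only the first seq_len entries of each row are reversed; the padded tail stays in place:

import numpy as np

inputs = np.array([[1, 2, 3, 0, 0],
                   [4, 5, 6, 7, 8]])
seq_lens = np.array([3, 5])

reverse_inputs = inputs.copy()
for i, n in enumerate(seq_lens):
    reverse_inputs[i, :n] = inputs[i, :n][::-1]
print(reverse_inputs)
# [[3 2 1 0 0]
#  [8 7 6 5 4]]
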
Example #8
    def __init__(self, num_classes, vocab_size, hidden_size=128, \
        embedding_dim=100, batch_size=32, bidirectional=False):

        tf.set_random_seed(1234)

        # Placeholders
        # can add assert statements to ensure shared None dimensions are equal (batch_size)
        self.seq_lens = tf.placeholder(tf.int32, [
            None,
        ], name="seq_lens")
        self.input_x = tf.placeholder(tf.int32, [None, None], name="input_x")
        self.input_y = tf.placeholder(tf.int32, [
            None,
        ], name="input_y")

        mask_x = tf.cast(tf.sequence_mask(self.seq_lens), tf.int32)

        # Document and Query embeddings; One-hot-encoded answers
        masked_x = tf.mul(self.input_x, mask_x)
        one_hot_y = tf.one_hot(self.input_y, num_classes)

        # Building Graph (Network Layers)
        # ==================================================
        with tf.variable_scope("embedding"):
            self.W_embeddings = tf.get_variable(shape=[vocab_size, embedding_dim], \
                                           initializer=tf.random_uniform_initializer(-0.01, 0.01),\
                                           name="W_embeddings")

            # Dimensions: batch x max_length x embedding_dim
            input_embedding = tf.gather(self.W_embeddings, masked_x)

        with tf.variable_scope("rnn"):
            if bidirectional:
                # Bidirectional RNNs
                forward_cell = rnn_cell.GRUCell(state_size=hidden_size,
                                                input_size=embedding_dim,
                                                scope="GRU-Forward")
                backward_cell = rnn_cell.GRUCell(state_size=hidden_size,
                                                 input_size=embedding_dim,
                                                 scope="GRU-Backward")

                hidden_states, last_state = rnn.bidirectional_rnn(forward_cell, backward_cell, \
                    input_embedding, self.seq_lens, concatenate=True)
            else:
                # One directional RNN (start to end)
                cell = rnn_cell.GRUCell(state_size=hidden_size,
                                        input_size=embedding_dim,
                                        scope="GRU")
                hidden_states, last_state = rnn.rnn(cell, input_embedding,
                                                    self.seq_lens)

        with tf.variable_scope("prediction"):
            if bidirectional:
                W_predict = tf.get_variable(name="predict_weight", shape=[hidden_size*2, num_classes], \
                    initializer=tf.random_normal_initializer(mean=0.0, stddev=0.1))
            else:
                W_predict = tf.get_variable(name="predict_weight", shape=[hidden_size, num_classes], \
                    initializer=tf.random_normal_initializer(mean=0.0, stddev=0.1))
            b_predict = tf.get_variable(
                name="predict_bias",
                shape=[num_classes],
                initializer=tf.constant_initializer(0.0))
            # Dimensions (batch_size x num_classes)
            prediction_probs_unnormalized = tf.matmul(last_state,
                                                      W_predict) + b_predict

            # Softmax
            # Dimensions (batch_size x num_classes)
            prediction_probs = tf.nn.softmax(prediction_probs_unnormalized,
                                             name="prediction_probs")
            likelihoods = tf.reduce_sum(tf.mul(prediction_probs, one_hot_y), 1)
            log_likelihoods = tf.log(likelihoods)

            # Negative log-likelihood loss
            self.loss = tf.mul(tf.reduce_sum(log_likelihoods), -1)
            predictions = tf.argmax(prediction_probs, 1, name="predictions")
            correct_vector = tf.cast(tf.equal(tf.argmax(one_hot_y, 1), tf.argmax(prediction_probs, 1)), \
                tf.float32, name="correct_vector")
            self.accuracy = tf.reduce_mean(correct_vector)
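
A NumPy sketch of the loss and accuracy computed in the prediction scope: the negative log-likelihood of the true class under the softmax probabilities, and the mean accuracy of the argmax prediction:

import numpy as np

probs = np.array([[0.7, 0.2, 0.1],
                  [0.1, 0.3, 0.6]])
labels = np.array([0, 2])
one_hot = np.eye(3)[labels]

likelihoods = (probs * one_hot).sum(axis=1)     # probability of the true class
loss = -np.log(likelihoods).sum()               # negative log-likelihood
accuracy = (probs.argmax(axis=1) == labels).mean()
print(loss, accuracy)                           # ~0.87 1.0
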
Example #9
    def __init__(self, hidden_size=128, vocab_size=65011, embedding_size=50, embedding_matrix=None, \
        embedding_trainable=False, sentence_rep_str="cbow", question_rep_str="cbow"):
        tf.set_random_seed(1234)

        # Placeholders
        # ==================================================
        # (batch_size * max_sentence_count x max_sentence_length)
        self.sentences = tf.placeholder(tf.int32, [None, None], name="sentences")
        self.questions = tf.placeholder(tf.int32, [None, None], name="questions")
        self.labels = tf.placeholder(tf.int32, [None, ], name="labels")

        # initialize dimension variables based on constructor arguments
        if sentence_rep_str == "cbow":
            attended_size = embedding_size
        elif sentence_rep_str == "rnn":
            attended_size = hidden_size*2
        else:
            raise ValueError("Invalid `sentence_rep_str` argument; choose 'cbow' or 'rnn'.")

        if question_rep_str == "cbow":
            attending_size = embedding_size
        elif question_rep_str == "rnn":
            attending_size = hidden_size*2
        else:
            raise ValueError("Invalid `question_rep_str` argument; choose 'cbow', 'rnn', or 'rnn-attention'.")

        # Input Preparation (Mask Creation)
        # ==================================================
        with tf.variable_scope("masks"):
            # MASK SENTENCES
            # (batch_size * max_sentence_count x max_sentence_length)
            sentence_mask = tf.cast(self.sentences >= 0, tf.int32)
            #sentence_mask = tf.sequence_mask(doc_lengths, dtype=tf.int32)
            masked_sentences = tf.mul(self.sentences, sentence_mask)

            batch_size = tf.shape(self.questions)[0]

            # RESHAPE SENTENCE MASK
            # (batch_size x max_sent_per_doc)
            batch_mask = tf.reshape(tf.reduce_max(sentence_mask, 1), [batch_size, -1])
            answer_counts = tf.cast(tf.reduce_sum(batch_mask, 1), tf.float32)
            # (batch_size * max_sent_per_doc x 1 x 1)
            sentence_batch_mask = tf.cast(tf.reshape(batch_mask, [-1, 1, 1]), tf.float32)

            # MASK QUESTIONS
            # create mask (batch_size x max_question_length)
            question_mask = tf.cast(self.questions >= 0, tf.int32)
            masked_question = tf.mul(question_mask, self.questions)
            question_mask_float = tf.expand_dims(tf.cast(question_mask, tf.float32), -1)

            max_sent_per_doc = tf.cast(tf.shape(sentence_mask)[0]/batch_size, tf.int32)

        # Embeddings
        # ==================================================
        with tf.variable_scope("embeddings"):
            if embedding_matrix is None:
                self.W_embeddings = tf.get_variable(shape=[vocab_size, embedding_size], \
                                               initializer=tf.random_uniform_initializer(-0.01, 0.01),\
                                               name="W_embeddings", dtype=tf.float32)
            else:
                ################## option to use pre-trained embeddings ##################
                self.W_embeddings = tf.Variable(embedding_matrix, \
                                               name="W_embeddings", dtype=tf.float32, trainable=embedding_trainable)

            # batch_size * max_sent_per_doc x max_sentence_length x embedding_size
            sentence_embeddings = tf.gather(self.W_embeddings, masked_sentences)
            masked_sentence_embeddings = tf.mul(sentence_embeddings, tf.cast(tf.expand_dims(sentence_mask, -1), tf.float32))

            # (batch_size x max_question_length x embedding_size)
            question_embeddings = tf.gather(self.W_embeddings, masked_question)
            masked_question_embeddings = tf.mul(question_embeddings, question_mask_float)

        # Sentence Representation (CBOW or RNN)
        # ==================================================
        with tf.variable_scope("sentence-representation"):

            # CBOW -----------------------------------------
            if sentence_rep_str == "cbow":
                # (batch_size * max_sentence_count x embedding_size)
                cbow_sentences = tf.reduce_mean(masked_sentence_embeddings, 1)
                # reshape batch to (batch_size x max_doc_length x embedding_size)
                doc_sentences = tf.reshape(cbow_sentences, [batch_size, -1, embedding_size])

                self.sentence_representation = cbow_sentences

            # RNN -----------------------------------------
            elif sentence_rep_str == "rnn":
                self.forward_cell_d = GRUCell(state_size=hidden_size, input_size=embedding_size, scope="GRU-Forward-D")
                self.backward_cell_d = GRUCell(state_size=hidden_size, input_size=embedding_size, scope="GRU-Backward-D")

                self.hidden_states_d, last_state_d = bidirectional_rnn(self.forward_cell_d, self.backward_cell_d, \
                    sentence_embeddings, tf.cast(sentence_mask, tf.float32), concatenate=True)

                doc_sentences = tf.reshape(last_state_d, [batch_size, -1, hidden_size*2])

                self.sentence_representation = last_state_d

        # Query Representation (CBOW or RNN)
        # ==================================================
        with tf.variable_scope("query-representation"):

            # CBOW -----------------------------------------
            if question_rep_str == "cbow":
                # (batch_size x embedding_size)
                question_cbow = tf.reduce_mean(masked_question_embeddings, 1)
                self.question_representation = question_cbow

            # RNN -----------------------------------------
            elif question_rep_str == "rnn":
                self.forward_cell_q = GRUCell(state_size=hidden_size, input_size=embedding_size, scope="GRU-Forward-Q")
                self.backward_cell_q = GRUCell(state_size=hidden_size, input_size=embedding_size, scope="GRU-Backward-Q")

                self.hidden_states_q, last_state_q = bidirectional_rnn(self.forward_cell_q, self.backward_cell_q, \
                    question_embeddings, tf.cast(question_mask, tf.float32), concatenate=True)

                #tf.reduce_mean(self.hidden_states_q, )

                self.question_representation = last_state_q


        # Similarity Scoring
        # ==================================================
        # Using simple dot product/cosine similarity as of now (https://arxiv.org/pdf/1605.07427v1.pdf)

        with tf.variable_scope("similarity-scoring"):

            # (batch_size x max_sent_per_doc)
            attention = BilinearFunction(attending_size=attending_size, attended_size=attended_size)
            alpha_weights, attend_result = attention(self.question_representation, attended=doc_sentences, \
                 time_mask=tf.cast(batch_mask, tf.float32))
            self.probabilities = alpha_weights

        # Loss
        # ==================================================
        with tf.variable_scope("prediction"):

            one_hot_labels = tf.one_hot(self.labels, max_sent_per_doc, dtype=tf.float32)

            likelihoods = tf.reduce_sum(tf.mul(self.probabilities, one_hot_labels), 1)
            likelihoods = tf.div(likelihoods, answer_counts)
            log_likelihoods = tf.log(likelihoods+0.00000000000000000001)
            self.loss = tf.div(tf.mul(tf.reduce_sum(log_likelihoods), -1), tf.cast(batch_size, tf.float32))
            self.correct_vector = tf.cast(tf.equal(self.labels, tf.cast(tf.argmax(self.probabilities, 1), tf.int32)), tf.float64, name="correct_vector")
            self.predict_labels = tf.argmax(self.probabilities, 1)
            self.accuracy = tf.reduce_mean(self.correct_vector)
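
A NumPy sketch of the CBOW sentence representation used when sentence_rep_str == "cbow": padded token embeddings are zeroed out and the mean is taken over the length axis (note that reduce_mean, as written above, divides by the padded maximum length, so the zeroed positions still count in the denominator):

import numpy as np

batch_sents, max_len, emb = 2, 4, 3
embeddings = np.random.rand(batch_sents, max_len, emb)
mask = np.array([[1, 1, 0, 0],
                 [1, 1, 1, 1]], dtype=float)    # 1 = real token, 0 = PAD

masked = embeddings * mask[:, :, None]          # zero out padded tokens
cbow_sentences = masked.mean(axis=1)            # mean over the length axis
print(cbow_sentences.shape)                     # (2, 3)
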