Esempio n. 1
0
 def test_call_handles_uneven_higher_order_input(self):
     """Check WeightedSum when the attention has more leading axes than the matrix.

     The matrix input is declared with shape (batch, length_3, embedding_dim)
     and the attention input with shape (batch, length_1, length_2, length_3).
     The test asserts that the layer output has shape
     (batch, length_1, length_2, embedding_dim) and that every output vector
     equals the attention-weighted sum of the matrix rows.
     """
     batch_size = 1
     length_1 = 5
     length_2 = 6
     length_3 = 2
     embedding_dim = 4
     matrix_input = Input(shape=(length_3, embedding_dim), dtype='float32')
     attention_input = Input(shape=(length_1, length_2, length_3),
                             dtype='float32')
     aggregated_vector = WeightedSum()([matrix_input, attention_input])
     model = Model(inputs=[matrix_input, attention_input],
                   outputs=[aggregated_vector])
     sentence_tensor = numpy.random.rand(batch_size, length_3,
                                         embedding_dim)
     attention_tensor = numpy.random.rand(batch_size, length_1, length_2,
                                          length_3)
     aggregated_tensor = model.predict([sentence_tensor, attention_tensor])
     assert aggregated_tensor.shape == (batch_size, length_1, length_2,
                                        embedding_dim)
     # Reference value: contract the shared length_3 axis (k) for every batch
     # item.  Computing it from the actual tensors, rather than spelling out
     # two terms by hand, keeps the check valid if the sizes above change.
     expected_tensor = numpy.einsum('bijk,bkd->bijd', attention_tensor,
                                    sentence_tensor)
     numpy.testing.assert_almost_equal(aggregated_tensor,
                                       expected_tensor,
                                       decimal=5)
Esempio n. 2
0
    def test_call_handles_masking_properly(self):
        """Verify that WeightedSum drops the contribution of masked positions.

        The sentence is embedded with ``mask_zero=True`` and its last token id
        is 0, so the attention weight at that position (1.2) is expected to
        add nothing to the aggregated vector.
        """
        batch_size = 1
        vocab_size = 4
        sentence_length = 5
        embedding_dim = 4
        token_ids = [1, 3, 2, 1, 0]
        weights = [.3, .4, .1, 0, 1.2]

        embedding_weights = numpy.random.rand(vocab_size, embedding_dim)
        embedding = Embedding(vocab_size,
                              embedding_dim,
                              weights=[embedding_weights],
                              mask_zero=True)

        # Build the graph: embedded token ids plus one attention weight per token.
        sentence_input = Input(shape=(sentence_length, ), dtype='int32')
        embedded_sentence = embedding(sentence_input)
        attention_input = Input(shape=(sentence_length, ), dtype='float32')
        weighted = WeightedSum()([embedded_sentence, attention_input])
        model = Model(inputs=[sentence_input, attention_input],
                      outputs=[weighted])

        sentence_tensor = numpy.asarray([token_ids])
        attention_tensor = numpy.asarray([weights])
        aggregated_tensor = model.predict([sentence_tensor, attention_tensor])
        assert aggregated_tensor.shape == (batch_size, embedding_dim)

        # Token id 0 is the masked value, so its weight counts as 0.0.
        effective_weights = [
            0.0 if token == 0 else weight
            for token, weight in zip(token_ids, weights)
        ]
        expected_tensor = sum(
            weight * embedding_weights[token]
            for weight, token in zip(effective_weights, token_ids))
        numpy.testing.assert_almost_equal(aggregated_tensor,
                                          [expected_tensor],
                                          decimal=5)
Esempio n. 3
0
 def test_call_works_on_simple_input(self):
     """Check WeightedSum on one matrix with one attention weight per row.

     The expected output is the sum of the rows of the random sentence matrix,
     each scaled by the matching entry of the fixed attention vector.
     """
     batch_size = 1
     sentence_length = 5
     embedding_dim = 4

     vectors_in = Input(shape=(sentence_length, embedding_dim),
                        dtype='float32')
     weights_in = Input(shape=(sentence_length, ), dtype='float32')
     summed = WeightedSum()([vectors_in, weights_in])
     model = Model(inputs=[vectors_in, weights_in], outputs=[summed])

     sentence_tensor = numpy.random.rand(batch_size, sentence_length,
                                         embedding_dim)
     attention_tensor = numpy.asarray([[.3, .4, .1, 0, 1.2]])
     aggregated_tensor = model.predict([sentence_tensor, attention_tensor])
     assert aggregated_tensor.shape == (batch_size, embedding_dim)

     # Pair each attention weight with its row and add the products in order.
     expected_tensor = sum(
         weight * row
         for weight, row in zip(attention_tensor[0], sentence_tensor[0]))
     numpy.testing.assert_almost_equal(aggregated_tensor,
                                       [expected_tensor],
                                       decimal=5)
def run_biDAF():
    """Build and compile a BiDAF-style model that scores answer options.

    The function takes no arguments; it reads the module-level names
    ``max_len_Q``, ``max_len_P``, ``max_num_options``, ``em_len``, ``emb_dim``,
    ``embeddings`` and ``batch_size``, none of which is defined here.

    The graph has three int32 inputs (question, passage, options).  Question
    and passage are embedded with frozen embeddings initialised from
    ``embeddings``, encoded with bidirectional LSTMs, combined through matrix
    attention in both directions, merged, passed through another
    bidirectional LSTM and reduced to a per-passage-position softmax.  That
    distribution is then aggregated per option and L1-normalised.

    Returns:
        The Keras ``Model`` mapping ``[question_input, passage_input,
        options_input]`` to the normalised option probabilities, already
        compiled with the ``rmsprop`` optimizer, ``categorical_crossentropy``
        loss and the ``accuracy`` metric.
    """
    # Integer inputs for the question and the passage (presumably token ids,
    # since they are fed to Embedding layers below).
    question_input = Input(shape=(max_len_Q,),
                           dtype='int32', name="question_input")
    passage_input = Input(shape=(max_len_P,),
                          dtype='int32', name="passage_input")
    # Options input; per the original note it is used so that only the
    # options' outputs are mapped.
    options_input = Input(shape=(max_num_options,),
                          dtype='int32', name="options_input")
    # Two separate Embedding layers, both initialised from the same
    # `embeddings` matrix and both frozen (trainable=False).
    embedding_layer_P = Embedding(em_len,
                                  emb_dim,
                                  weights=[embeddings],
                                  input_length=max_len_P,
                                  batch_input_shape=(batch_size, max_len_P),
                                  trainable=False)
    embedding_layer_Q = Embedding(em_len,
                                  emb_dim,
                                  weights=[embeddings],
                                  input_length=max_len_Q,
                                  batch_input_shape=(batch_size, max_len_Q),
                                  trainable=False)

    passage_embedding = embedding_layer_P(passage_input)
    question_embedding = embedding_layer_Q(question_input)

    # Contextual encoders: bidirectional LSTMs with 256 units per direction.
    bi_lstm_Q = Bidirectional(LSTM(256, return_sequences=True), batch_input_shape=(batch_size, max_len_Q, emb_dim))(
        question_embedding)
    # NOTE(review): bi_lstm_Q1 is never used after this line and does not
    # feed the model output; confirm whether it can be removed.
    bi_lstm_Q1 = Bidirectional(LSTM(256), batch_input_shape=(batch_size, max_len_Q, emb_dim))(question_embedding)
    bi_lstm_P = Bidirectional(LSTM(256, return_sequences=True), batch_input_shape=(batch_size, max_len_P, emb_dim))(
        passage_embedding)

    # ---- Attention layer ----
    similarity_function_params = {'type': 'linear', 'combination': 'x,y,x*y'}
    matrix_attention_layer = MatrixAttention(similarity_function=similarity_function_params,name='matrix_attention_layer')
    # Shape (per the original notes): (batch_size, num_passage_words, num_question_words)
    passage_question_similarity = matrix_attention_layer([bi_lstm_P, bi_lstm_Q])

    # Shape (per the original notes): (batch_size, num_passage_words, num_question_words),
    # normalized over question words for each passage word.
    passage_question_attention = MaskedSoftmax()(passage_question_similarity)

    # Passage-to-question direction: attention-weighted sum of the question
    # encodings.  Original shape note:
    # (batch_size, num_passage_words, embedding_dim * 2).
    weighted_sum_layer = WeightedSum(name="passage_question_vectors",
                                     use_masking=False)
    passage_question_vectors = weighted_sum_layer([bi_lstm_Q, passage_question_attention])

    # Question-to-passage direction: take the maximum of the similarity matrix
    # over its last axis, which per the original notes selects the most
    # important context words.
    # Shape (per the original notes): (batch_size, num_passage_words)
    question_passage_similarity = Max(axis=-1)(passage_question_similarity)
    # Softmax over those maxima.  Shape (per the original notes): (batch_size, num_passage_words)
    question_passage_attention = MaskedSoftmax()(question_passage_similarity)

    # `weighted_sum_layer` is rebound to a second, differently named layer:
    # attention-weighted sum of the passage encodings.
    weighted_sum_layer = WeightedSum(name="question_passage_vector",
                                     use_masking=False)
    question_passage_vector = weighted_sum_layer([bi_lstm_P, question_passage_attention])

    # Repeat the question-to-passage vector so it can be combined with bi_lstm_P.
    repeat_layer = RepeatLike(axis=1, copy_from_axis=1)
    # Shape (per the original notes): (batch_size, num_passage_words, embedding_dim * 2)
    tiled_question_passage_vector = repeat_layer([question_passage_vector, bi_lstm_P])

    # Merge the three tensors with the combination string '1,2,1*2,1*3'; the
    # original notes call the result G.
    # Shape (per the original notes): (batch_size, num_passage_words, embedding_dim * 8)
    complex_concat_layer = ComplexConcat(combination='1,2,1*2,1*3', name='final_merged_passage')
    final_merged_passage = complex_concat_layer([bi_lstm_P,
                                                 passage_question_vectors,
                                                 tiled_question_passage_vector])
    # Modelling layer: a bidirectional LSTM over the merged passage; the
    # original notes call the result M.
    # NOTE(review): batch_input_shape names emb_dim as the last axis, but the
    # layer is applied to final_merged_passage, whose last axis the note above
    # gives as embedding_dim * 8.  Confirm whether this kwarg has any effect.
    bi_model_passage = Bidirectional(LSTM(256, return_sequences=True),
                                     batch_input_shape=(batch_size, max_len_P, emb_dim))(final_merged_passage)

    # Span-begin scores: concatenate G and M, apply a single-unit Dense layer
    # at every position (Wp1 in the original notes), then softmax.
    span_begin_input = Concatenate()([final_merged_passage, bi_model_passage])
    span_begin_weights = TimeDistributed(Dense(units=1))(span_begin_input)
    # Shape (per the original notes): (batch_size, num_passage_words); the
    # original also notes "(700,)", presumably max_len_P. TODO confirm.
    span_begin_probabilities = MaskedSoftmax(name="span_begin_softmax")(span_begin_weights)

    # Original author's note: following Minjoon's BiDAF, once p1 (the
    # span-start probability) is obtained, the probabilities of all instances
    # of each entity are summed with non-entity values masked out, and the
    # loss is applied without p2.
    multiword_option_mode = 'mean'
    options_sum_layer_minj = OptionAttentionSum(multiword_option_mode, name="options_probability_sum_minj")
    options_probabilities_minj = options_sum_layer_minj([passage_input, span_begin_probabilities, options_input])
    l1_norm_layer = L1Normalize()
    option_normalized_probabilities_cnn = l1_norm_layer(options_probabilities_minj)

    biDAF = Model(inputs=[question_input, passage_input, options_input],
                      outputs=option_normalized_probabilities_cnn)
    biDAF.compile(optimizer='rmsprop',
              loss='categorical_crossentropy',
              metrics=['accuracy'])

    return biDAF
Esempio n. 5
0
def build_model(embedding_layer):
    """Build a question/answer matching model with BiDAF-style attention.

    Two int32 inputs of length ``MAX_SEQUENCE_LENGTH`` (a module-level name)
    are passed through the same ``embedding_layer``, two highway layers, and
    one shared bidirectional GRU encoder.  The encoded answer attends to the
    encoded question and vice versa, the results are merged, run through one
    more bidirectional GRU, reduced with ``Max`` over the last axis, and fed
    to a single sigmoid unit.

    The function prints three intermediate tensor shapes to stdout while
    building the graph.  The returned model is not compiled here.

    Args:
        embedding_layer: Layer applied to both the question input and the
            answer input.

    Returns:
        A ``models.Model`` mapping ``[question_input, answer_input]`` to the
        output of the sigmoid ``prediction`` layer.
    """

    # NOTE(review): the original comments on these inputs said "* 2 since
    # doubling the question and passage", but the declared length is exactly
    # MAX_SEQUENCE_LENGTH; confirm which is intended.
    question_input = layers.Input(
        shape=(MAX_SEQUENCE_LENGTH, ),
        dtype='int32')
    answer_input = layers.Input(
        shape=(MAX_SEQUENCE_LENGTH, ),
        dtype='int32')

    question_embedding = embedding_layer(question_input)
    answer_embedding = embedding_layer(answer_input)

    # Min's model has some highway layers here, with relu activations.  Note that highway
    # layers don't change the tensor's shape.  We need to have two different `TimeDistributed`
    # layers instantiated here, because Keras doesn't like it if a single `TimeDistributed`
    # layer gets applied to two inputs with different numbers of time steps.
    # The same `highway_layer` instance is wrapped by both TimeDistributed
    # layers, so presumably its weights are shared between question and answer.
    highway_layers = 2
    for i in range(highway_layers):
        highway_layer = highway.Highway(activation='relu',
                                        name='highway_{}'.format(i))
        question_layer = layers.TimeDistributed(highway_layer,
                                                name=highway_layer.name +
                                                "_qtd")
        question_embedding = question_layer(question_embedding)
        passage_layer = layers.TimeDistributed(highway_layer,
                                               name=highway_layer.name +
                                               "_ptd")
        answer_embedding = passage_layer(answer_embedding)

    # Then we pass the question and the answer through a seq2seq encoder (a bidirectional
    # GRU).  This essentially pushes phrase-level information into the embeddings of each
    # word.  A single layer instance encodes both sequences.
    phrase_layer = Bidirectional(
        layers.GRU(return_sequences=True,
                   units=500,
                   activation='relu',
                   recurrent_dropout=0.2,
                   dropout=0.3,
                   kernel_regularizer=l2(0.0001),
                   kernel_initializer='he_uniform')
    )

    # Shape: (batch_size, num_question_words, 1000), assuming Bidirectional's
    # default concatenating merge mode (2 * 500 units). TODO confirm.
    encoded_question = phrase_layer(question_embedding)

    # Shape: (batch_size, num_answer_words, 1000), under the same assumption.
    encoded_answer = phrase_layer(answer_embedding)

    # PART 2:
    # Now we compute a similarity between the answer words and the question words, and
    # normalize the matrix in a couple of different ways for input into some more layers.
    matrix_attention_layer = MatrixAttention(
        similarity_function={
            'type': 'linear',
            'combination': 'x,y,x*y'
        },
        name='passage_question_similarity')

    # Shape (per the original notes): (batch_size, num_passage_words, num_question_words)
    answer_question_similarity = matrix_attention_layer(
        [encoded_answer, encoded_question])

    # Shape (per the original notes): (batch_size, num_passage_words, num_question_words),
    # normalized over question words for each passage word.
    answer_question_attention = MaskedSoftmax()(answer_question_similarity)
    # Attention-weighted sum of the question encodings for each answer word.
    weighted_sum_layer = WeightedSum(name="answer_question_vectors",
                                     use_masking=False)
    answer_question_vectors = weighted_sum_layer(
        [encoded_question, answer_question_attention])

    # Min's paper finds, for each document word, the most similar question word to it, and
    # computes a single attention over the whole document using these max similarities.
    # Shape (per the original notes): (batch_size, num_passage_words)
    question_answer_similarity = Max(axis=-1)(answer_question_similarity)
    # Shape (per the original notes): (batch_size, num_passage_words)
    question_answer_attention = MaskedSoftmax()(question_answer_similarity)
    # `weighted_sum_layer` is rebound to a second layer with a different name:
    # a single attention-weighted sum of the answer encodings.
    weighted_sum_layer = WeightedSum(name="question_passage_vector",
                                     use_masking=False)
    question_answer_vector = weighted_sum_layer(
        [encoded_answer, question_answer_attention])

    # Then he repeats this question/passage vector for every word in the passage, and uses it
    # as an additional input to the hidden layers above.
    repeat_layer = RepeatLike(axis=1, copy_from_axis=1)
    tiled_question_answer_vector = repeat_layer(
        [question_answer_vector, encoded_answer])

    # Merge the three tensors with the combination string '1,2,1*2,1*3'.
    # The original note gives the shape as
    # (batch_size, num_passage_words, embedding_dim * 8).
    complex_concat_layer = complex_concat.ComplexConcat(
        combination='1,2,1*2,1*3', name='final_merged_passage')
    final_merged_answer = complex_concat_layer([
        encoded_answer, answer_question_vectors, tiled_question_answer_vector
    ])

    # PART 3:
    # Having computed a combined representation of the answer that includes attended question
    # vectors, we pass it through a further bidirectional encoder layer.  Min calls this part
    # of the network the "modeling layer".  The loop below runs exactly once.
    modeled_answer = final_merged_answer
    for i in range(1):
        hidden_layer = Bidirectional(
            layers.GRU(
                return_sequences=True,
                units=300,
                activation='relu',
                recurrent_dropout=0.2,
                dropout=0.3,
            ))
        modeled_answer = hidden_layer(modeled_answer)

    # PART 4 (credited to Helen in the original comment):
    # Take the maximum over the last axis of the modeled answer; the original
    # note describes this as "the maximum for each word".
    max_answer = Max(axis=-1)(modeled_answer)
    # Debug output of the intermediate shapes.
    print("max answer shape", max_answer.shape)
    print("modeled_answer shape", modeled_answer.shape)

    # Single sigmoid unit producing the final prediction.
    preds = layers.Dense(1,
                         activation='sigmoid',
                         name='prediction',
                         kernel_regularizer=l2(0.0001),
                         kernel_initializer='he_uniform')(max_answer)

    print("pred shape", preds.shape)

    model = models.Model(inputs=[question_input, answer_input], outputs=preds)

    return model