def test_call_handles_uneven_higher_order_input(self):
    """Check WeightedSum when the attention has more axes than the matrix.

    The matrix input is (length_3, embedding_dim) and the attention input is
    (length_1, length_2, length_3); the test asserts one weighted sum of the
    matrix rows for every (length_1, length_2) position.
    """
    batch_size = 1
    length_1, length_2, length_3 = 5, 6, 2
    embedding_dim = 4

    # Build the two-input model around a single WeightedSum layer.
    matrix_input = Input(shape=(length_3, embedding_dim), dtype='float32')
    attention_input = Input(shape=(length_1, length_2, length_3),
                            dtype='float32')
    weighted = WeightedSum()([matrix_input, attention_input])
    model = Model(inputs=[matrix_input, attention_input], outputs=[weighted])

    # Random data; the matrix is drawn before the attention weights.
    sentence_tensor = numpy.random.rand(batch_size, length_3, embedding_dim)
    attention_tensor = numpy.random.rand(batch_size, length_1, length_2,
                                         length_3)
    result = model.predict([sentence_tensor, attention_tensor])

    expected_shape = (batch_size, length_1, length_2, embedding_dim)
    assert result.shape == expected_shape

    rows = sentence_tensor[0]
    for i, j in numpy.ndindex(length_1, length_2):
        weights = attention_tensor[0, i, j]
        # length_3 == 2, so the weighted sum has exactly two terms.
        expected_tensor = weights[0] * rows[0] + weights[1] * rows[1]
        numpy.testing.assert_almost_equal(result[0, i, j],
                                          expected_tensor,
                                          decimal=5)
def test_call_handles_masking_properly(self):
    """Check that WeightedSum ignores positions masked by the embedding.

    The embedding is built with ``mask_zero=True`` and the last token id is 0,
    so the expected value gives that position a weight of 0 even though its
    attention weight is 1.2.
    """
    batch_size = 1
    vocab_size = 4
    sentence_length = 5
    embedding_dim = 4

    embedding_weights = numpy.random.rand(vocab_size, embedding_dim)
    embedding = Embedding(vocab_size,
                          embedding_dim,
                          weights=[embedding_weights],
                          mask_zero=True)

    sentence_input = Input(shape=(sentence_length,), dtype='int32')
    sentence_embedding = embedding(sentence_input)
    attention_input = Input(shape=(sentence_length,), dtype='float32')
    weighted = WeightedSum()([sentence_embedding, attention_input])
    model = Model(inputs=[sentence_input, attention_input],
                  outputs=[weighted])

    token_ids = [1, 3, 2, 1, 0]
    sentence_tensor = numpy.asarray([token_ids])
    attention_tensor = numpy.asarray([[0.3, 0.4, 0.1, 0.0, 1.2]])
    result = model.predict([sentence_tensor, attention_tensor])
    assert result.shape == (batch_size, embedding_dim)

    # Weights as they should act after masking: the last one is 0 because
    # its token id is the masked value 0.
    effective_weights = [0.3, 0.4, 0.1, 0.0, 0.0]
    expected_tensor = sum(
        weight * embedding_weights[token]
        for weight, token in zip(effective_weights, token_ids))
    numpy.testing.assert_almost_equal(result, [expected_tensor], decimal=5)
def test_call_works_on_simple_input(self):
    """Check WeightedSum on one matrix and one attention vector.

    The expected output is the sum of the matrix rows, each scaled by its
    attention weight.
    """
    batch_size = 1
    sentence_length = 5
    embedding_dim = 4

    matrix_input = Input(shape=(sentence_length, embedding_dim),
                         dtype='float32')
    attention_input = Input(shape=(sentence_length,), dtype='float32')
    weighted = WeightedSum()([matrix_input, attention_input])
    model = Model(inputs=[matrix_input, attention_input], outputs=[weighted])

    sentence_tensor = numpy.random.rand(batch_size, sentence_length,
                                        embedding_dim)
    weights = [0.3, 0.4, 0.1, 0.0, 1.2]
    attention_tensor = numpy.asarray([weights])
    result = model.predict([sentence_tensor, attention_tensor])
    assert result.shape == (batch_size, embedding_dim)

    # Accumulate weight * row in position order.
    expected_tensor = sum(
        weight * row for weight, row in zip(weights, sentence_tensor[0]))
    numpy.testing.assert_almost_equal(result, [expected_tensor], decimal=5)
def run_biDAF():
    """Build and compile the BiDAF-style model that scores answer options.

    Reads the module-level names ``max_len_Q``, ``max_len_P``,
    ``max_num_options``, ``em_len``, ``emb_dim``, ``embeddings`` and
    ``batch_size``.

    Returns:
        A ``Model`` over ``[question_input, passage_input, options_input]``
        whose output is the L1-normalised option probabilities, compiled with
        the ``rmsprop`` optimizer, ``categorical_crossentropy`` loss and the
        ``accuracy`` metric.
    """
    lstm_units = 256

    # Integer id inputs for the question and the passage (fed to Embedding).
    question_input = Input(shape=(max_len_Q,), dtype='int32',
                           name="question_input")
    passage_input = Input(shape=(max_len_P,), dtype='int32',
                          name="passage_input")
    # Candidate options; consumed by OptionAttentionSum below so the output
    # covers the options only.
    options_input = Input(shape=(max_num_options,), dtype='int32',
                          name="options_input")

    # Two frozen embedding layers initialised from the same `embeddings`.
    embedding_layer_P = Embedding(em_len, emb_dim, weights=[embeddings],
                                  input_length=max_len_P,
                                  batch_input_shape=(batch_size, max_len_P),
                                  trainable=False)
    embedding_layer_Q = Embedding(em_len, emb_dim, weights=[embeddings],
                                  input_length=max_len_Q,
                                  batch_input_shape=(batch_size, max_len_Q),
                                  trainable=False)
    passage_embedding = embedding_layer_P(passage_input)
    question_embedding = embedding_layer_Q(question_input)

    # Contextual encoders; both return the full sequence.
    bi_lstm_Q = Bidirectional(
        LSTM(lstm_units, return_sequences=True),
        batch_input_shape=(batch_size, max_len_Q, emb_dim))(question_embedding)
    bi_lstm_P = Bidirectional(
        LSTM(lstm_units, return_sequences=True),
        batch_input_shape=(batch_size, max_len_P, emb_dim))(passage_embedding)

    # ---- Attention layer ----
    similarity_function_params = {'type': 'linear', 'combination': 'x,y,x*y'}
    matrix_attention_layer = MatrixAttention(
        similarity_function=similarity_function_params,
        name='matrix_attention_layer')
    # Shape: (batch_size, num_passage_words, num_question_words)
    passage_question_similarity = matrix_attention_layer(
        [bi_lstm_P, bi_lstm_Q])
    # Same shape, normalized over question words for each passage word.
    passage_question_attention = MaskedSoftmax()(passage_question_similarity)
    weighted_sum_layer = WeightedSum(name="passage_question_vectors",
                                     use_masking=False)
    # Shape: (batch_size, num_passage_words, embedding_dim * 2)
    passage_question_vectors = weighted_sum_layer(
        [bi_lstm_Q, passage_question_attention])

    # Query-to-passage attention: take the max similarity over question words
    # for each passage word, then softmax over the passage.
    # Shape: (batch_size, num_passage_words)
    question_passage_similarity = Max(axis=-1)(passage_question_similarity)
    # Shape: (batch_size, num_passage_words)
    question_passage_attention = MaskedSoftmax()(question_passage_similarity)
    weighted_sum_layer = WeightedSum(name="question_passage_vector",
                                     use_masking=False)
    # Attention-weighted sum of the passage encodings.
    question_passage_vector = weighted_sum_layer(
        [bi_lstm_P, question_passage_attention])

    repeat_layer = RepeatLike(axis=1, copy_from_axis=1)
    # Shape: (batch_size, num_passage_words, embedding_dim * 2)
    tiled_question_passage_vector = repeat_layer(
        [question_passage_vector, bi_lstm_P])

    # Shape: (batch_size, num_passage_words, embedding_dim * 8); denoted G.
    complex_concat_layer = ComplexConcat(combination='1,2,1*2,1*3',
                                         name='final_merged_passage')
    final_merged_passage = complex_concat_layer(
        [bi_lstm_P, passage_question_vectors, tiled_question_passage_vector])

    # ---- Modelling layer ----
    # Bidirectional LSTM over the merged passage; denoted M.
    # NOTE(review): batch_input_shape names emb_dim although the input is the
    # merged representation above; it looks unused when the layer is called on
    # a tensor -- confirm before relying on it.
    bi_model_passage = Bidirectional(
        LSTM(lstm_units, return_sequences=True),
        batch_input_shape=(batch_size, max_len_P, emb_dim))(
            final_merged_passage)

    # Span-begin scores: one Dense unit per time step over [G; M], then a
    # softmax over the passage.
    span_begin_input = Concatenate()([final_merged_passage, bi_model_passage])
    span_begin_weights = TimeDistributed(Dense(units=1))(span_begin_input)
    # Shape: (batch_size, num_passage_words)
    span_begin_probabilities = MaskedSoftmax(name="span_begin_softmax")(
        span_begin_weights)

    # Following the reference BiDAF note in the original code: aggregate the
    # span-begin probabilities per option and use no span-end term in the
    # loss.
    multiword_option_mode = 'mean'
    options_sum_layer_minj = OptionAttentionSum(
        multiword_option_mode, name="options_probability_sum_minj")
    options_probabilities_minj = options_sum_layer_minj(
        [passage_input, span_begin_probabilities, options_input])
    l1_norm_layer = L1Normalize()
    option_normalized_probabilities_cnn = l1_norm_layer(
        options_probabilities_minj)

    biDAF = Model(inputs=[question_input, passage_input, options_input],
                  outputs=option_normalized_probabilities_cnn)
    biDAF.compile(optimizer='rmsprop',
                  loss='categorical_crossentropy',
                  metrics=['accuracy'])
    return biDAF
def build_model(embedding_layer):
    """Build the question/answer attention model with a single sigmoid output.

    Both inputs are embedded with ``embedding_layer``, passed through two
    highway layers and one shared bidirectional GRU encoder, combined through
    matrix attention in both directions, run through a bidirectional GRU
    modeling layer, max-reduced over the last axis and fed to a one-unit
    sigmoid ``Dense`` layer named ``prediction``.

    Prints the shapes of the max-reduced tensor, the modeled tensor and the
    prediction tensor while building.

    Args:
        embedding_layer: Layer applied to each int32 input of shape
            ``(MAX_SEQUENCE_LENGTH,)``.

    Returns:
        An uncompiled ``models.Model`` with inputs
        ``[question_input, answer_input]`` and the prediction as output.
    """
    l2_lambda = 0.0001
    highway_layers = 2
    modeling_layers = 1

    question_input = layers.Input(shape=(MAX_SEQUENCE_LENGTH,), dtype='int32')
    answer_input = layers.Input(shape=(MAX_SEQUENCE_LENGTH,), dtype='int32')
    question_embedding = embedding_layer(question_input)
    answer_embedding = embedding_layer(answer_input)

    # Highway layers with relu activations. The same Highway instance is
    # wrapped in two separate TimeDistributed layers because, per the original
    # note, Keras does not accept one TimeDistributed applied to inputs with
    # different numbers of time steps.
    for i in range(highway_layers):
        highway_layer = highway.Highway(activation='relu',
                                        name='highway_{}'.format(i))
        question_layer = layers.TimeDistributed(
            highway_layer, name=highway_layer.name + "_qtd")
        question_embedding = question_layer(question_embedding)
        passage_layer = layers.TimeDistributed(
            highway_layer, name=highway_layer.name + "_ptd")
        answer_embedding = passage_layer(answer_embedding)

    # PART 1: one bidirectional GRU encoder shared by question and answer.
    phrase_layer = Bidirectional(
        layers.GRU(return_sequences=True,
                   units=500,
                   activation='relu',
                   recurrent_dropout=0.2,
                   dropout=0.3,
                   kernel_regularizer=l2(l2_lambda),
                   kernel_initializer='he_uniform'))
    # Shape: (batch_size, num_question_words, embedding_dim * 2)
    encoded_question = phrase_layer(question_embedding)
    # Shape: (batch_size, num_passage_words, embedding_dim * 2)
    encoded_answer = phrase_layer(answer_embedding)

    # PART 2: similarity between answer words and question words, normalized
    # in two different ways.
    matrix_attention_layer = MatrixAttention(
        similarity_function={
            'type': 'linear',
            'combination': 'x,y,x*y'
        },
        name='passage_question_similarity')
    # Shape: (batch_size, num_passage_words, num_question_words)
    answer_question_similarity = matrix_attention_layer(
        [encoded_answer, encoded_question])
    # Same shape, normalized over question words for each passage word.
    answer_question_attention = MaskedSoftmax()(answer_question_similarity)
    # Shape: (batch_size, num_passage_words, embedding_dim * 2)
    weighted_sum_layer = WeightedSum(name="answer_question_vectors",
                                     use_masking=False)
    answer_question_vectors = weighted_sum_layer(
        [encoded_question, answer_question_attention])

    # For each answer word take its highest similarity to any question word,
    # then compute one attention over the whole answer from those maxima.
    # Shape: (batch_size, num_passage_words)
    question_answer_similarity = Max(axis=-1)(answer_question_similarity)
    # Shape: (batch_size, num_passage_words)
    question_answer_attention = MaskedSoftmax()(question_answer_similarity)
    # Shape: (batch_size, embedding_dim * 2)
    weighted_sum_layer = WeightedSum(name="question_passage_vector",
                                     use_masking=False)
    question_answer_vector = weighted_sum_layer(
        [encoded_answer, question_answer_attention])

    # Repeat that single vector for every answer word.
    repeat_layer = RepeatLike(axis=1, copy_from_axis=1)
    # Shape: (batch_size, num_passage_words, embedding_dim * 2)
    tiled_question_answer_vector = repeat_layer(
        [question_answer_vector, encoded_answer])

    # Shape: (batch_size, num_passage_words, embedding_dim * 8)
    complex_concat_layer = complex_concat.ComplexConcat(
        combination='1,2,1*2,1*3', name='final_merged_passage')
    final_merged_answer = complex_concat_layer([
        encoded_answer, answer_question_vectors, tiled_question_answer_vector
    ])

    # PART 3: the "modeling layer" -- further bidirectional encoding of the
    # merged representation.
    modeled_answer = final_merged_answer
    for _ in range(modeling_layers):
        hidden_layer = Bidirectional(
            layers.GRU(return_sequences=True,
                       units=300,
                       activation='relu',
                       recurrent_dropout=0.2,
                       dropout=0.3))
        modeled_answer = hidden_layer(modeled_answer)

    # PART 4: max over the last axis of the modeled answer, then a single
    # sigmoid unit.
    max_answer = Max(axis=-1)(modeled_answer)
    print("max answer shape", max_answer.shape)
    print("modeled_answer shape", modeled_answer.shape)
    preds = layers.Dense(1,
                         activation='sigmoid',
                         name='prediction',
                         kernel_regularizer=l2(l2_lambda),
                         kernel_initializer='he_uniform')(max_answer)
    print("pred shape", preds.shape)

    model = models.Model(inputs=[question_input, answer_input], outputs=preds)
    return model