def bilinear_attention(att_states, att_lengths, queries, query_lengths, size, batch_size=None):
    # project the attention states once so the scores reduce to a bilinear form
    attention_key = tf.contrib.layers.fully_connected(att_states, size,
                                                      activation_fn=None,
                                                      weights_initializer=None)
    # [B, Q, L] -- Q is length of query
    attention_scores = tf.matmul(queries, attention_key, adjoint_b=True)

    max_length = tf.cast(tf.reduce_max(query_lengths), tf.int32)
    max_query_length = tf.cast(tf.reduce_max(att_lengths), tf.int32)
    # mask attention states beyond each sequence length before the softmax
    mask = tfutil.mask_for_lengths(att_lengths, batch_size, max_length=max_query_length)
    mask = tf.tile(tf.expand_dims(mask, 1), tf.stack([1, max_length, 1]))
    attention_scores = attention_scores + mask

    attention_scores_reshaped = tf.reshape(attention_scores, tf.stack([-1, max_query_length]))
    attention_weights = tf.reshape(tf.nn.softmax(attention_scores_reshaped),
                                   tf.shape(attention_scores))
    # [B, Q, L] x [B, L, S] --> [B, Q, S]
    ctxt_aligned_att_states = tf.matmul(attention_weights, att_states)
    return ctxt_aligned_att_states
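
# Hypothetical usage sketch (not part of the original module), illustrating the expected
# tensor shapes: the attention states are aligned to every query position. The placeholder
# names and the size of 128 are assumptions for illustration only.
ctxt_states = tf.placeholder(tf.float32, [None, None, 128], "ctxt_states")            # [B, L, S]
ctxt_lengths = tf.placeholder(tf.int64, [None], "ctxt_lengths")                       # [B]
question_states = tf.placeholder(tf.float32, [None, None, 128], "question_states")    # [B, Q, 128]
question_lengths = tf.placeholder(tf.int64, [None], "question_lengths")               # [B]
# The last dimension of the queries must match `size`, since scores are computed as
# queries x projected(att_states)^T. Result: [B, Q, 128].
question_aligned_ctxt = bilinear_attention(ctxt_states, ctxt_lengths,
                                           question_states, question_lengths, size=128)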

def extract_co_attention_states(affinity_scores, states1, lengths1, states2, lengths2, batch_size=None):
    max_length2 = tf.cast(tf.reduce_max(lengths2), tf.int32)
    max_length1 = tf.cast(tf.reduce_max(lengths1), tf.int32)
    # [B, L1]
    mask1 = tfutil.mask_for_lengths(lengths1, batch_size, max_length=max_length1)
    # [B, L2, L1]
    mask1 = tf.tile(tf.expand_dims(mask1, 1), tf.stack([1, max_length2, 1]))
    # [B, L2]
    mask2 = tfutil.mask_for_lengths(lengths2, batch_size, max_length=max_length2)
    # [B, L1, L2]
    mask2 = tf.tile(tf.expand_dims(mask2, 1), tf.stack([1, max_length1, 1]))

    # [B, L1, L2]
    attention_scores1 = affinity_scores + mask2
    # [B, L2, L1]
    attention_scores2 = tf.transpose(affinity_scores, [0, 2, 1]) + mask1
    # [B, L1, L2]
    attention_weights1 = _my_softmax(attention_scores1)
    # [B, L2, L1]
    attention_weights2 = _my_softmax(attention_scores2)

    # [B, L2, L1] x [B, L1, S] --> [B, L2, S]
    att_states2 = tf.matmul(attention_weights2, states1)
    # [B, L2, 2*S]
    new_states2 = tf.concat(axis=2, values=[att_states2, states2])
    # [B, L1, 2*S]
    att_states1 = tf.matmul(attention_weights1, new_states2)
    # [B, L1, 3*S]
    new_states1 = tf.concat(axis=2, values=[att_states1, states1])
    return new_states1
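
# NOTE: `_my_softmax` is referenced above but not defined in this excerpt. A minimal sketch
# that is consistent with how it is called (a softmax over the last axis of a 3-D score
# tensor) could look like the following; the original helper may differ in detail.
def _my_softmax_sketch(scores):
    # flatten to 2-D, apply the row-wise softmax, then restore the original shape
    last_dim = tf.shape(scores)[-1]
    flat_scores = tf.reshape(scores, tf.stack([-1, last_dim]))
    return tf.reshape(tf.nn.softmax(flat_scores), tf.shape(scores))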

def __call__(self, inputs, state, scope=None):
    with tf.variable_scope(scope or "bilinear_attention_cell"):
        if self._hidden_features is None:
            # [B, L, S]
            attention_states = self._attention_states
            self._hidden_features = []
            for a in range(self._num_heads):
                # [B, L, S]
                self._hidden_features.append(
                    tf.contrib.layers.fully_connected(attention_states,
                                                      inputs.get_shape()[-1].value,
                                                      activation_fn=None,
                                                      weights_initializer=None,
                                                      biases_initializer=None))
            if attention_states.get_shape()[-1].value == inputs.get_shape()[-1].value:
                self._hidden_features[0] = self._hidden_features[0] + attention_states
            self.eval = tf.get_variable("attention_is_eval", dtype=tf.bool,
                                        initializer=False, trainable=False)
            self.set_eval = tf.assign(self.eval, True)

        ds = []  # Results of attention reads will be stored here.
        batch_size = tf.shape(inputs)[0]
        mask = tfutil.mask_for_lengths(self._attention_length, batch_size)
        # some parts are copied from tensorflow attention code-base
        for a in range(self._num_heads):
            with tf.variable_scope("Attention_%d" % a):
                # [B, S]
                query = inputs
                # [B, L, 1]
                s = tf.matmul(self._hidden_features[a], tf.expand_dims(query, 2))
                s = tf.squeeze(s, [2])
                self.attention_scores[a].append(s)
                # [B, L]
                weights = tf.nn.softmax(s + mask)
                # Now calculate the attention-weighted vector d.
                self.attention_weights[a].append(weights)
                d = tf.reduce_sum(tf.expand_dims(weights, 2) * self._attention_states, [1])
                ds.append(d)

        if len(ds) > 1:
            return tf.concat(axis=1, values=ds), None
        else:
            return ds[0], None

def __call__(self, inputs, state, scope=None):
    with tf.variable_scope(scope or "attention_cell"):
        if self._reuse:
            tf.get_variable_scope().reuse_variables()
        if self._hidden_features is None:
            # [B, L, S]
            attention_states = self._attention_states
            self._hidden_features = []
            for a in range(self._num_heads):
                # [B, L, S]
                self._hidden_features.append(
                    tf.contrib.layers.fully_connected(attention_states, self._num_units,
                                                      activation_fn=None,
                                                      weights_initializer=None))

        ds = []  # Results of attention reads will be stored here.
        batch_size = tf.shape(inputs)[0]
        mask = tfutil.mask_for_lengths(self._attention_length, batch_size)
        # some parts are copied from tensorflow attention code-base
        for a in range(self._num_heads):
            with tf.variable_scope("Attention_%d" % a):
                with tf.variable_scope("features%d" % a):
                    # [B, S]
                    y = tf.contrib.layers.fully_connected(inputs, self._num_units,
                                                          activation_fn=None,
                                                          weights_initializer=None)
                    y = tf.tanh(self._hidden_features[a] + tf.expand_dims(y, 1))
                with tf.variable_scope("scores%d" % a):
                    # [B, L, 1]
                    s = tf.contrib.layers.fully_connected(y, 1,
                                                          activation_fn=None,
                                                          weights_initializer=None)
                    s = tf.squeeze(s, [2])
                self.attention_scores[a].append(s)
                # [B, L]
                weights = tf.nn.softmax(s + mask)
                # Now calculate the attention-weighted vector d.
                self.attention_weights[a].append(weights)
                d = tf.reduce_sum(tf.expand_dims(weights, 2) * self._attention_states, [1])
                ds.append(d)

        if len(ds) > 1:
            return tf.concat(axis=1, values=ds), None
        else:
            return ds[0], None
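
# NOTE: `tfutil.mask_for_lengths` is used throughout but not shown in this excerpt. Judging
# from its call sites, it returns a [B, max_length] tensor that is 0.0 inside each sequence
# and a large negative value beyond it (so adding it to scores removes padded positions from
# a subsequent softmax); with an explicit `value` and mask_right=False it instead marks the
# valid positions (e.g. a binary mask when value=1.0). The sketch below is an assumption
# about that behavior, not the original implementation; `batch_size` is kept only for
# signature compatibility.
def mask_for_lengths_sketch(lengths, batch_size=None, max_length=None,
                            value=-1e6, mask_right=True):
    if max_length is None:
        max_length = tf.cast(tf.reduce_max(lengths), tf.int32)
    # [B, max_length]: 1.0 inside the sequence, 0.0 in the padding
    in_range = tf.sequence_mask(lengths, max_length, dtype=tf.float32)
    if mask_right:
        return (1.0 - in_range) * value
    return in_range * value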

def attention(att_states, att_lengths, queries, query_lengths, size, batch_size=None):
    # [B, L, S]
    inter_states = tf.contrib.layers.fully_connected(att_states, size,
                                                     activation_fn=None,
                                                     weights_initializer=None,
                                                     scope="inter_states")
    # [B, Q, S]
    inter_queries = tf.contrib.layers.fully_connected(queries, size,
                                                      activation_fn=None,
                                                      weights_initializer=None,
                                                      scope="inter_queries")
    # [B, Q, L, S] -- pairwise interaction of every query position with every attention state
    inter = tf.tanh(tf.expand_dims(inter_states, 1) + tf.expand_dims(inter_queries, 2))
    # [B, Q, L, 1]
    attention_scores = tf.contrib.layers.fully_connected(inter, 1,
                                                         activation_fn=None,
                                                         weights_initializer=None,
                                                         scope="attention_scores")
    # [B, Q, L]
    attention_scores = tf.squeeze(attention_scores, [3])

    max_length = tf.cast(tf.reduce_max(query_lengths), tf.int32)
    max_question_length = tf.cast(tf.reduce_max(att_lengths), tf.int32)
    # [B, Q, L] -- mask attention states beyond each sequence length
    mask = tfutil.mask_for_lengths(att_lengths, batch_size, max_length=max_question_length)
    mask = tf.tile(tf.expand_dims(mask, 1), tf.stack([1, max_length, 1]))
    attention_scores = attention_scores + mask

    attention_scores_reshaped = tf.reshape(attention_scores, tf.stack([-1, max_question_length]))
    attention_weights = tf.reshape(tf.nn.softmax(attention_scores_reshaped),
                                   tf.shape(attention_scores))
    # [B, Q, L] x [B, L, S] --> [B, Q, S]
    ctxt_aligned_att_states = tf.matmul(attention_weights, att_states)
    return ctxt_aligned_att_states

def _highway_maxout_network(num_layers, pool_size, inputs, states, lengths, max_length, size):
    # [B, S] -- non-linear projection of the (pooled) input vector
    r = tf.contrib.layers.fully_connected(inputs, size,
                                          activation_fn=tf.tanh,
                                          weights_initializer=None,
                                          scope="r")
    # [B, L, S] -- broadcast r to every position
    r_tiled = tf.tile(tf.expand_dims(r, 1), tf.stack([1, max_length, 1]))

    ms = []
    hm_inputs = tf.concat(axis=2, values=[states, r_tiled])
    hm_inputs.set_shape([None, None, size + states.get_shape()[-1].value])
    for i in range(num_layers):
        m = tf.contrib.layers.fully_connected(hm_inputs, size * pool_size,
                                              activation_fn=None,
                                              weights_initializer=None,
                                              scope="m_%d" % i)
        m = tf.reshape(m, tf.stack([-1, max_length, size, pool_size]))
        # maxout: max-pool over the pool dimension
        m = tf.reduce_max(m, [3])
        hm_inputs = m
        ms.append(m)

    if num_layers <= 0:
        out = tf.contrib.layers.fully_connected(hm_inputs, pool_size,
                                                activation_fn=None,
                                                weights_initializer=None,
                                                scope="out")
    else:
        # highway connections: all intermediate maxout layers feed the output layer
        out = tf.contrib.layers.fully_connected(tf.concat(axis=2, values=ms), pool_size,
                                                activation_fn=None,
                                                weights_initializer=None,
                                                scope="out")
    # [B, L]
    out = tf.reduce_max(out, [2])
    # mask positions beyond each sequence length
    out = out + tfutil.mask_for_lengths(lengths, max_length=tf.shape(states)[1])
    return out
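
# Hypothetical call sketch (placeholder names and sizes are assumptions, not from the
# original code): score every context position given a pooled query vector, e.g. for
# predicting a span boundary.
pooled_query = tf.placeholder(tf.float32, [None, 100], "pooled_query")             # [B, S]
ctxt_states_hm = tf.placeholder(tf.float32, [None, None, 200], "ctxt_states_hm")   # [B, L, D]
ctxt_lengths_hm = tf.placeholder(tf.int64, [None], "ctxt_lengths_hm")              # [B]
start_scores = _highway_maxout_network(num_layers=2, pool_size=8,
                                       inputs=pooled_query,
                                       states=ctxt_states_hm,
                                       lengths=ctxt_lengths_hm,
                                       max_length=tf.shape(ctxt_states_hm)[1],
                                       size=100)  # --> [B, L] masked scores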

def add_yesno(self, add_model_scope=True):
    if self.yesno_added:
        return
    self.yesno_added = True
    self._with_yesno = True
    scope = (self.name + "/yesno") if add_model_scope else "yesno"
    with tf.variable_scope(scope):
        with tf.variable_scope("context_representation"):
            # [B, L, 1] -- per-token attention scores over the encoded context
            attention_scores = tf.contrib.layers.fully_connected(self.encoded_ctxt, 1,
                                                                 activation_fn=None,
                                                                 weights_initializer=None,
                                                                 biases_initializer=None,
                                                                 scope="context_attention")
            attention_scores = attention_scores + tf.expand_dims(
                tfutil.mask_for_lengths(self.context_length, self._batch_size,
                                        self.embedder.max_length), 2)
            # normalize attention jointly across all paragraphs of the same question (segment)
            attention_weights = tfutil.segment_softmax(attention_scores, self.context_partition)
            self.context_attention_weights = attention_weights
            self.context_representation = tf.segment_sum(
                tf.reduce_sum(attention_weights * self.encoded_ctxt, [1]),
                self.context_partition)

        with tf.variable_scope("yesno_output_module"):
            input = tf.concat(axis=1, values=[self.question_representation,
                                              self.context_representation])
            input = tf.nn.dropout(input, self.keep_prob)
            hidden = tf.contrib.layers.fully_connected(input, self.size,
                                                       activation_fn=tf.nn.relu,
                                                       scope="hidden")
            # linear layer producing the yes/no logit
            self.yesno_scores = tf.contrib.layers.fully_connected(hidden, 1,
                                                                  activation_fn=None,
                                                                  scope="yesno_scores")
            self.yesno_scores = tf.reshape(self.yesno_scores, [-1])
            self.yesno_probs = tf.nn.sigmoid(self.yesno_scores)

    self._train_variables = [p for p in tf.trainable_variables() if self.name in p.name]

def _init(self):
    ExtractionQAModel._init(self)
    # choose the fused RNN constructor according to the composition function
    if self._composition == "GRU":
        if self._layer_norm:
            rnn_constructor = lambda size: FusedRNNCellAdaptor(LayerNormGRUCell(size), use_dynamic_rnn=True)
        else:
            rnn_constructor = lambda size: FusedRNNCellAdaptor(GRUBlockCell(size), use_dynamic_rnn=True)
    elif self._composition == "RNN":
        rnn_constructor = lambda size: FusedRNNCellAdaptor(BasicRNNCell(size), use_dynamic_rnn=True)
    else:
        if self._layer_norm:
            rnn_constructor = lambda size: FusedRNNCellAdaptor(LayerNormLSTMCell(size), use_dynamic_rnn=True)
        else:
            rnn_constructor = lambda size: LSTMBlockFusedCell(size)

    with tf.device(self._device0):
        self._eval = tf.get_variable("is_eval", initializer=False, trainable=False)
        self._set_train = self._eval.initializer
        self._set_eval = self._eval.assign(True)

        self.context_mask = tfutil.mask_for_lengths(self.context_length, self._batch_size,
                                                    self.embedder.max_length)
        question_binary_mask = tfutil.mask_for_lengths(self.question_length,
                                                       self.question_embedder.batch_size,
                                                       self.question_embedder.max_length,
                                                       value=1.0,
                                                       mask_right=False)

        with tf.variable_scope("preprocessing_layer"):
            question_binary_mask = tf.gather(question_binary_mask, self.context_partition)
            self._embedded_question_not_dropped = tf.gather(self._embedded_question_not_dropped,
                                                            self.context_partition)

            # context
            if self._with_features:
                mask = tf.get_variable("attention_mask",
                                       [1, 1, self._embedded_question_not_dropped.get_shape()[-1].value],
                                       initializer=tf.constant_initializer(1.0))
                # compute word-wise features
                #masked_question = self.question_embedder.output * mask
                # [B, Q, L]
                q2c_scores = tf.matmul(self._embedded_question_not_dropped * mask,
                                       self._embedded_context_not_dropped, adjoint_b=True)
                q2c_scores = q2c_scores + tf.expand_dims(self.context_mask, 1)

                #c2q_weights = tf.reduce_max(q2c_scores / (tf.reduce_max(q2c_scores, [2], keep_dims=True) + 1e-5), [1])

                # [B, L]
                q2c_weights = tf.reduce_sum(tf.nn.softmax(q2c_scores) *
                                            tf.expand_dims(question_binary_mask, 2), [1])

                # [B, L, 2]
                self.context_features = tf.concat(axis=2,
                                                  values=[tf.expand_dims(self._word_in_question, 2),
                                                          #tf.expand_dims(c2q_weights, 2),
                                                          tf.expand_dims(q2c_weights, 2)])

                embedded_ctxt = tf.concat(axis=2, values=[self.embedded_context, self.context_features])
                in_question_feature = tf.ones(tf.stack([self.question_embedder.batch_size,
                                                        self.question_embedder.max_length, 2]))
                embedded_question = tf.concat(axis=2, values=[self.embedded_question, in_question_feature])
            else:
                embedded_ctxt = self.embedded_context
                embedded_question = self.embedded_question

            if self._with_question_type_features:
                # Need to add another zero vector so that the total number of features
                # is even, for LSTM performance reasons.
                question_type_features = tf.stack([self._is_factoid,
                                                   self._is_list,
                                                   self._is_yesno,
                                                   tf.zeros(tf.shape(self._is_list), dtype=tf.bool)],
                                                  axis=1)
                question_type_features = tf.cast(question_type_features, tf.float32)
                question_type_features = tf.expand_dims(question_type_features, 1)
                embedded_question = tf.concat(
                    axis=2,
                    values=[embedded_question,
                            tf.tile(question_type_features,
                                    tf.stack([1, tf.shape(embedded_question)[1], 1]))])

                question_type_features = tf.gather(question_type_features, self.context_partition)
                embedded_ctxt = tf.concat(
                    axis=2,
                    values=[embedded_ctxt,
                            tf.tile(question_type_features,
                                    tf.stack([1, tf.shape(embedded_ctxt)[1], 1]))])

            if self._with_entity_tag_features:
                embedded_question = tf.concat(axis=2,
                                              values=[embedded_question,
                                                      tf.cast(self._question_tags, tf.float32)])
                embedded_ctxt = tf.concat(axis=2,
                                          values=[embedded_ctxt,
                                                  tf.cast(self._context_tags, tf.float32)])

            self.encoded_question = self._preprocessing_layer(rnn_constructor, embedded_question,
                                                              self.question_length,
                                                              projection_scope="question_proj")
            self.encoded_ctxt = self._preprocessing_layer(rnn_constructor, embedded_ctxt,
                                                          self.context_length, share_rnn=True,
                                                          projection_scope="context_proj",
                                                          num_fusion_layers=self._num_intrafusion_layers)

            # single time attention over question
            attention_scores = tf.contrib.layers.fully_connected(self.encoded_question, 1,
                                                                 activation_fn=None,
                                                                 weights_initializer=None,
                                                                 biases_initializer=None,
                                                                 scope="attention")
            attention_scores = attention_scores + tf.expand_dims(
                tfutil.mask_for_lengths(self.question_length,
                                        self.question_embedder.batch_size,
                                        self.question_embedder.max_length), 2)
            attention_weights = tf.nn.softmax(attention_scores, 1)
            self.question_attention_weights = attention_weights
            self.question_representation = tf.reduce_sum(attention_weights * self.encoded_question, [1])

            # duplicate question features for each context paragraph
            self.encoded_question = tf.gather(self.encoded_question, self.context_partition)
            self.question_representation_per_context = tf.gather(self.question_representation,
                                                                 self.context_partition)
            self.question_length = tf.gather(self.question_length, self.context_partition)

        if self._with_inter_fusion:
            with tf.variable_scope("inter_fusion"):
                with tf.variable_scope("associative") as vs:
                    mask = tf.get_variable("attention_mask", [1, 1, self.size],
                                           initializer=tf.constant_initializer(1.0))
                    mask = tf.nn.relu(mask)
                    for i in range(1):
                        # [B, Q, L]
                        inter_scores = tf.matmul(self.encoded_question * mask, self.encoded_ctxt,
                                                 adjoint_b=True)
                        inter_scores = inter_scores + tf.expand_dims(self.context_mask, 1)

                        inter_weights = tf.nn.softmax(inter_scores)
                        inter_weights = inter_weights * tf.expand_dims(question_binary_mask, 2)
                        # [B, L, Q] x [B, Q, S] -> [B, L, S]
                        co_states = tf.matmul(inter_weights, self.encoded_question, adjoint_a=True)

                        # gated update of the context encoding with the co-attended question states
                        u = tf.contrib.layers.fully_connected(
                            tf.concat(axis=2, values=[self.encoded_ctxt, co_states]),
                            self.size,
                            activation_fn=tf.sigmoid,
                            biases_initializer=tf.constant_initializer(1.0),
                            scope="update_gate")
                        self.encoded_ctxt = u * self.encoded_ctxt + (1.0 - u) * co_states
                        vs.reuse_variables()

                with tf.variable_scope("recurrent") as vs:
                    self.encoded_ctxt.set_shape([None, None, self.size])
                    self.encoded_ctxt = dynamic_rnn(GatedAggregationRNNCell(self.size),
                                                    tf.reverse_sequence(self.encoded_ctxt, self.context_length, 1),
                                                    self.context_length, dtype=tf.float32,
                                                    time_major=False, scope="backward")[0]
                    self.encoded_ctxt = dynamic_rnn(GatedAggregationRNNCell(self.size),
                                                    tf.reverse_sequence(self.encoded_ctxt, self.context_length, 1),
                                                    self.context_length, dtype=tf.float32,
                                                    time_major=False, scope="forward")[0]

        # No matching layer, so set matched_output to encoded_ctxt (for compatibility)
        self.matched_output = self.encoded_ctxt

        with tf.variable_scope("pointer_layer"):
            self.predicted_context_indices, \
            self._start_scores, self._start_pointer, self.start_probs, \
            self._end_scores, self._end_pointer, self.end_probs = \
                self._spn_answer_layer(self.question_representation_per_context, self.encoded_ctxt)

        self.yesno_added = False
        if self._with_yesno:
            self.add_yesno(add_model_scope=False)

    self._train_variables = [p for p in tf.trainable_variables() if self.name in p.name]

def _init(self):
    # build the char vocab and reset vocab_size to the size of the actual vocabulary
    conv_width = 5
    pad_right = math.ceil(conv_width / 2)  # fixed PAD to the right side
    self.vocab_size = max(self.vocab.values()) + 1
    max_l = max(len(w) for w in self.vocab) + pad_right
    self.char_vocab = {"PAD": 0}
    self._word_to_chars_arr = np.zeros((self.vocab_size, max_l), np.int16)
    self._word_lengths_arr = np.zeros([self.vocab_size], np.int8)
    for w, i in sorted(self.vocab.items()):
        for k, c in enumerate(w):
            j = self.char_vocab.get(c)
            if j is None:
                j = len(self.char_vocab)
                self.char_vocab[c] = j
            self._word_to_chars_arr[i, k] = j
        self._word_lengths_arr[i] = len(w) + conv_width - 1

    with tf.device("/cpu:0"):
        with tf.variable_scope("embeddings"):
            self._word_to_chars = tf.placeholder(tf.int64, [None, None], "word_to_chars")
            self._word_lengths = tf.placeholder(tf.int64, [None], "word_lengths")
            self.char_embedding_matrix = \
                tf.get_variable("char_embedding_matrix",
                                shape=(len(self.char_vocab), self.size),
                                initializer=tf.random_normal_initializer(0.0, 0.1),
                                trainable=True)

            self._max_length = tf.cast(tf.reduce_max(self.seq_lengths), tf.int32)
            self._batch_size = tf.shape(self.seq_lengths)[0]
            self._sliced_inputs = tf.slice(self.inputs, (0, 0), tf.stack((-1, self.max_length)))
            # unique word ids of the batch; fed externally instead of
            # tf.unique(tf.reshape(self._sliced_inputs, [-1]))
            self.unique_words = tf.placeholder(tf.int64, [None], "unique_words")
            self._word_idx = tf.placeholder(tf.int64, [None], "word_idx")
            self._new_inputs = tf.reshape(self._word_idx, tf.shape(self._sliced_inputs))

            chars = tf.nn.embedding_lookup(self._word_to_chars, self.unique_words)
            wl = tf.nn.embedding_lookup(self._word_lengths, self.unique_words)
            max_word_length = tf.cast(tf.reduce_max(wl), tf.int32)
            chars = tf.slice(chars, [0, 0], tf.stack([-1, max_word_length]))
            embedded_chars = tf.nn.embedding_lookup(self.char_embedding_matrix, chars)
            #embedded_chars_reshaped = tf.reshape(embedded_chars, tf.pack([-1, max_word_length, 4 * self.size]))

            with tf.device(self._device):
                with tf.variable_scope("conv"):
                    # [B, T, S]
                    filter = tf.get_variable("filter", [conv_width * self.size, self.size])
                    filter_reshaped = tf.reshape(filter, [conv_width, self.size, self.size])
                    # [B, T, S]
                    conv_out = tf.nn.conv1d(embedded_chars, filter_reshaped, 1, "SAME")
                    # mask characters beyond each word length before max-pooling
                    conv_mask = tf.expand_dims(
                        tfutil.mask_for_lengths(self._word_lengths - pad_right,
                                                max_length=max_word_length), 2)
                    conv_out = conv_out + conv_mask
                    # one embedding per unique word via max-pooling over characters
                    self.unique_embedded_words = tf.reduce_max(conv_out, [1])

                    embedded_words = tf.gather(self.unique_embedded_words, self._word_idx)
                    self._embedded_words = tf.reshape(embedded_words,
                                                      tf.stack([-1, self.max_length, self.size]))

    self._train_variables = [p for p in tf.trainable_variables()
                             if self.name + "/embeddings" in p.name]
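
# The `unique_words` and `word_idx` placeholders replace the commented-out in-graph
# tf.unique call, so the deduplication presumably happens on the host. A plausible
# (assumed, not from the original code) way to build the corresponding feed values
# from a padded [B, T] batch of word ids:
def char_embedder_feeds_sketch(word_ids_batch):
    # np.unique with return_inverse=True yields the sorted unique word ids plus, for every
    # original position, its index into that unique array -- matching unique_words and
    # word_idx above. The caller would feed these to self.unique_words and self._word_idx.
    unique_ids, inverse = np.unique(word_ids_batch.reshape(-1), return_inverse=True)
    return unique_ids, inverse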