Example #1
def bilinear_attention(att_states,
                       att_lengths,
                       queries,
                       query_lengths,
                       size,
                       batch_size=None):
    attention_key = tf.contrib.layers.fully_connected(att_states,
                                                      size,
                                                      activation_fn=None,
                                                      weights_initializer=None)
    # [B, Q, L] --  Q is length of query
    attention_scores = tf.matmul(queries, attention_key, adjoint_b=True)
    max_query_length = tf.cast(tf.reduce_max(query_lengths), tf.int32)
    max_att_length = tf.cast(tf.reduce_max(att_lengths), tf.int32)
    # additive mask over padded positions of att_states, tiled to [B, Q, L]
    mask = tfutil.mask_for_lengths(att_lengths,
                                   batch_size,
                                   max_length=max_att_length)
    mask = tf.tile(tf.expand_dims(mask, 1), tf.stack([1, max_query_length, 1]))
    attention_scores = attention_scores + mask
    attention_scores_reshaped = tf.reshape(attention_scores,
                                           tf.stack([-1, max_att_length]))
    attention_weights = tf.reshape(tf.nn.softmax(attention_scores_reshaped),
                                   tf.shape(attention_scores))
    # [B, Q, L] x [B, L, S] --> [B, Q, S]
    ctxt_aligned_att_states = tf.matmul(attention_weights, att_states)
    return ctxt_aligned_att_states
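
Every example on this page relies on a project-local helper, tfutil.mask_for_lengths, whose source is not shown here. The following is only a minimal sketch of what it presumably does, inferred from its call sites (an additive mask of large negative values over the padded positions by default; with value=1.0 and mask_right=False, as in Example #8, a binary mask over the valid positions instead). It is not the original implementation, and the batch_size argument is unused in this sketch.

import tensorflow as tf

def mask_for_lengths(lengths, batch_size=None, max_length=None,
                     value=-1000.0, mask_right=True):
    # Returns a float [B, T] tensor that is `value` at masked positions and
    # 0.0 elsewhere. With mask_right=True (default) the padded positions
    # t >= length are masked, giving the usual additive pre-softmax mask;
    # with mask_right=False the valid positions t < length carry `value`
    # instead (used with value=1.0 as a binary token mask in Example #8).
    if max_length is None:
        max_length = tf.cast(tf.reduce_max(lengths), tf.int32)
    valid = tf.sequence_mask(tf.cast(lengths, tf.int32), max_length,
                             dtype=tf.float32)  # [B, T], 1.0 where t < length
    if mask_right:
        return (1.0 - valid) * value
    return valid * value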
Example #2
def extract_co_attention_states(affinity_scores,
                                states1,
                                lengths1,
                                states2,
                                lengths2,
                                batch_size=None):
    max_length2 = tf.cast(tf.reduce_max(lengths2), tf.int32)
    max_length1 = tf.cast(tf.reduce_max(lengths1), tf.int32)

    # [B, L1]
    mask1 = tfutil.mask_for_lengths(lengths1,
                                    batch_size,
                                    max_length=max_length1)
    # [B, L2, L1]
    mask1 = tf.tile(tf.expand_dims(mask1, 1), tf.stack([1, max_length2, 1]))

    # [B, L2]
    mask2 = tfutil.mask_for_lengths(lengths2,
                                    batch_size,
                                    max_length=max_length2)
    # [B, L1, L2]
    mask2 = tf.tile(tf.expand_dims(mask2, 1), tf.stack([1, max_length1, 1]))
    # [B, L1, L2]
    attention_scores1 = affinity_scores + mask2
    # [B, L2, L1]
    attention_scores2 = tf.transpose(affinity_scores, [0, 2, 1]) + mask1

    # [B, L1, L2]
    attention_weights1 = _my_softmax(attention_scores1)
    # [B, L2, L1]
    attention_weights2 = _my_softmax(attention_scores2)

    # [B, L2, L1] x [B, L1, S] --> [B, L2, S]
    att_states2 = tf.matmul(attention_weights2, states1)

    # [B, L2, 2*S]
    new_states2 = tf.concat(axis=2, values=[att_states2, states2])

    # [B, L1, 2*S]
    att_states1 = tf.matmul(attention_weights1, new_states2)

    # [B, L1, 3*S]
    new_states1 = tf.concat(axis=2, values=[att_states1, states1])

    return new_states1
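
_my_softmax is another helper that is not shown. A plausible sketch, assuming it simply normalizes over the last axis of a 3-D score tensor with the same reshape trick used in Example #1:

import tensorflow as tf

def _my_softmax(scores):
    # Softmax over the last axis of a [B, T1, T2] score tensor, implemented
    # by flattening to [B * T1, T2] and reshaping back.
    last_dim = tf.shape(scores)[-1]
    flat = tf.reshape(scores, tf.stack([-1, last_dim]))
    return tf.reshape(tf.nn.softmax(flat), tf.shape(scores))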
Example #3
    def __call__(self, inputs, state, scope=None):
        with tf.variable_scope(scope or "bilinear_attention_cell"):
            if self._hidden_features is None:
                # [B, L, S]
                attention_states = self._attention_states
                self._hidden_features = []

                for a in range(self._num_heads):
                    # [B, L, S]
                    self._hidden_features.append(
                        tf.contrib.layers.fully_connected(
                            attention_states,
                            inputs.get_shape()[-1].value,
                            activation_fn=None,
                            weights_initializer=None,
                            biases_initializer=None))

                if (attention_states.get_shape()[-1].value ==
                        inputs.get_shape()[-1].value):
                    # residual connection when state and input sizes match
                    self._hidden_features[0] = (self._hidden_features[0] +
                                                attention_states)

                self.eval = tf.get_variable("attention_is_eval",
                                            dtype=tf.bool,
                                            initializer=False,
                                            trainable=False)
                self.set_eval = tf.assign(self.eval, True)

            ds = []  # Results of attention reads will be stored here.

            batch_size = tf.shape(inputs)[0]
            mask = tfutil.mask_for_lengths(self._attention_length, batch_size)

            # some parts are copied from tensorflow attention code-base
            for a in range(self._num_heads):
                with tf.variable_scope("Attention_%d" % a):
                    # [B, S]
                    query = inputs
                    # [B, L, 1]
                    s = tf.matmul(self._hidden_features[a],
                                  tf.expand_dims(query, 2))
                    s = tf.squeeze(s, [2])
                    self.attention_scores[a].append(s)
                    # [B, L]
                    weights = tf.nn.softmax(s + mask)
                    # Now calculate the attention-weighted vector d.
                    self.attention_weights[a].append(weights)

                    d = tf.reduce_sum(
                        tf.expand_dims(weights, 2) * self._attention_states,
                        [1])
                    ds.append(d)

            if len(ds) > 1:
                return tf.concat(axis=1, values=ds), None
            else:
                return ds[0], None
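
The __call__ method above depends on several attributes that are set up elsewhere in the class. The skeleton below is purely hypothetical and only illustrates which fields are assumed to exist; it is not the original constructor.

class BilinearAttentionCell(object):
    # Hypothetical skeleton (assumption): only the attributes used by the
    # __call__ method above are initialized here.
    def __init__(self, attention_states, attention_length, num_heads=1):
        self._attention_states = attention_states    # [B, L, S]
        self._attention_length = attention_length    # [B]
        self._num_heads = num_heads
        self._hidden_features = None                 # built lazily in __call__
        self.attention_scores = [[] for _ in range(num_heads)]
        self.attention_weights = [[] for _ in range(num_heads)]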
Example #4
    def __call__(self, inputs, state, scope=None):
        with tf.variable_scope(scope or "attention_cell"):
            if self._reuse:
                tf.get_variable_scope().reuse_variables()
            if self._hidden_features is None:
                # [B, L, S]
                attention_states = self._attention_states
                self._hidden_features = []

                for a in range(self._num_heads):
                    # [B, L, S]
                    self._hidden_features.append(
                        tf.contrib.layers.fully_connected(
                            attention_states,
                            self._num_units,
                            activation_fn=None,
                            weights_initializer=None))

            ds = []  # Results of attention reads will be stored here.

            batch_size = tf.shape(inputs)[0]
            mask = tfutil.mask_for_lengths(self._attention_length, batch_size)

            # some parts are copied from tensorflow attention code-base
            for a in range(self._num_heads):
                with tf.variable_scope("Attention_%d" % a):
                    with tf.variable_scope("features%d" % a):
                        # [B, S]
                        y = tf.contrib.layers.fully_connected(
                            inputs,
                            self._num_units,
                            activation_fn=None,
                            weights_initializer=None)
                        y = tf.tanh(self._hidden_features[a] +
                                    tf.expand_dims(y, 1))
                    with tf.variable_scope("scores%d" % a):
                        # [B, L, 1]
                        s = tf.contrib.layers.fully_connected(
                            y, 1, activation_fn=None, weights_initializer=None)
                    s = tf.squeeze(s, [2])
                    self.attention_scores[a].append(s)
                    # [B, L]
                    weights = tf.nn.softmax(s + mask)
                    # Now calculate the attention-weighted vector d.
                    self.attention_weights[a].append(weights)

                    d = tf.reduce_sum(
                        tf.expand_dims(weights, 2) * self._attention_states,
                        [1])
                    ds.append(d)

            if len(ds) > 1:
                return tf.concat(axis=1, values=ds), None
            else:
                return ds[0], None
Example #5
def attention(att_states,
              att_lengths,
              queries,
              query_lengths,
              size,
              batch_size=None):
    # [B, L, S]
    inter_states = tf.contrib.layers.fully_connected(att_states,
                                                     size,
                                                     activation_fn=None,
                                                     weights_initializer=None,
                                                     scope="inter_states")
    # [B, Q, S]
    inter_queries = tf.contrib.layers.fully_connected(queries,
                                                      size,
                                                      activation_fn=None,
                                                      weights_initializer=None,
                                                      scope="inter_queries")

    # [B, Q, L, S] --  pairwise additive interaction of queries and states
    inter = tf.tanh(
        tf.expand_dims(inter_states, 1) + tf.expand_dims(inter_queries, 2))

    # [B, Q, L, 1]
    attention_scores = tf.contrib.layers.fully_connected(
        inter,
        1,
        activation_fn=None,
        weights_initializer=None,
        scope="attention_scores")

    attention_scores = tf.squeeze(attention_scores, [3])

    max_query_length = tf.cast(tf.reduce_max(query_lengths), tf.int32)
    max_att_length = tf.cast(tf.reduce_max(att_lengths), tf.int32)
    # additive mask over padded positions of att_states, tiled to [B, Q, L]
    mask = tfutil.mask_for_lengths(att_lengths,
                                   batch_size,
                                   max_length=max_att_length)
    mask = tf.tile(tf.expand_dims(mask, 1), tf.stack([1, max_query_length, 1]))
    attention_scores = attention_scores + mask
    attention_scores_reshaped = tf.reshape(attention_scores,
                                           tf.stack([-1, max_att_length]))
    attention_weights = tf.reshape(tf.nn.softmax(attention_scores_reshaped),
                                   tf.shape(attention_scores))
    # [B, Q, L] x [B, L, S] --> [B, Q, S]
    ctxt_aligned_att_states = tf.matmul(attention_weights, att_states)
    return ctxt_aligned_att_states
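
A minimal usage sketch for the two free functions above (not taken from the original repository; shapes and names are assumptions): both take att_states of shape [B, L, S] and queries of shape [B, Q, S] and return the attended states aligned to the query positions, i.e. [B, Q, S].

import tensorflow as tf

size = 64
att_states = tf.placeholder(tf.float32, [None, None, size], "att_states")
att_lengths = tf.placeholder(tf.int32, [None], "att_lengths")
queries = tf.placeholder(tf.float32, [None, None, size], "queries")
query_lengths = tf.placeholder(tf.int32, [None], "query_lengths")
batch_size = tf.shape(att_states)[0]

with tf.variable_scope("bilinear"):
    aligned_bilinear = bilinear_attention(att_states, att_lengths,
                                          queries, query_lengths,
                                          size, batch_size)    # [B, Q, S]
with tf.variable_scope("additive"):
    aligned_additive = attention(att_states, att_lengths,
                                 queries, query_lengths,
                                 size, batch_size)             # [B, Q, S]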
Example #6
def _highway_maxout_network(num_layers, pool_size, inputs, states, lengths,
                            max_length, size):
    r = tf.contrib.layers.fully_connected(inputs,
                                          size,
                                          activation_fn=tf.tanh,
                                          weights_initializer=None,
                                          scope="r")

    r_tiled = tf.tile(tf.expand_dims(r, 1), tf.stack([1, max_length, 1]))

    ms = []
    hm_inputs = tf.concat(axis=2, values=[states, r_tiled])
    hm_inputs.set_shape([None, None, size + states.get_shape()[-1].value])
    for i in range(num_layers):
        m = tf.contrib.layers.fully_connected(hm_inputs,
                                              size * pool_size,
                                              activation_fn=None,
                                              weights_initializer=None,
                                              scope="m_%d" % i)

        m = tf.reshape(m, tf.stack([-1, max_length, size, pool_size]))
        m = tf.reduce_max(m, [3])
        hm_inputs = m
        ms.append(m)

    if num_layers <= 0:
        out = tf.contrib.layers.fully_connected(hm_inputs,
                                                pool_size,
                                                activation_fn=None,
                                                weights_initializer=None,
                                                scope="out")
    else:
        out = tf.contrib.layers.fully_connected(tf.concat(axis=2, values=ms),
                                                pool_size,
                                                activation_fn=None,
                                                weights_initializer=None,
                                                scope="out")
    # [B, L]
    out = tf.reduce_max(out, [2])
    out = out + tfutil.mask_for_lengths(lengths,
                                        max_length=tf.shape(states)[1])

    return out
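
The core of this network is the maxout step: project to size * pool_size units, expose the pool as its own axis, and take the maximum over it. A small standalone illustration with made-up shapes:

import tensorflow as tf

x = tf.random_normal([2, 7, 32])        # [B, L, input_size], toy values
size, pool_size = 16, 4
h = tf.contrib.layers.fully_connected(x, size * pool_size, activation_fn=None)
h = tf.reshape(h, tf.stack([-1, tf.shape(x)[1], size, pool_size]))
maxout = tf.reduce_max(h, [3])          # [B, L, size]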
Example #7
    def add_yesno(self, add_model_scope=True):

        if self.yesno_added:
            return
        self.yesno_added = True
        self._with_yesno = True

        scope = self.name + "/yesno" if add_model_scope else "yesno"

        with tf.variable_scope(scope):

            with tf.variable_scope("context_representation"):

                attention_scores = tf.contrib.layers.fully_connected(self.encoded_ctxt, 1,
                                                                     activation_fn=None,
                                                                     weights_initializer=None,
                                                                     biases_initializer=None,
                                                                     scope="context_attention")
                attention_scores = attention_scores + tf.expand_dims(
                    tfutil.mask_for_lengths(self.context_length, self._batch_size,
                                            self.embedder.max_length), 2)
                attention_weights = tfutil.segment_softmax(attention_scores, self.context_partition)
                self.context_attention_weights = attention_weights
                self.context_representation = tf.segment_sum(
                    tf.reduce_sum(attention_weights * self.encoded_ctxt, [1]),
                    self.context_partition)

            with tf.variable_scope("yesno_output_module"):

                yesno_input = tf.concat(axis=1, values=[self.question_representation, self.context_representation])
                yesno_input = tf.nn.dropout(yesno_input, self.keep_prob)

                hidden = tf.contrib.layers.fully_connected(yesno_input, self.size,
                                                           activation_fn=tf.nn.relu,
                                                           scope="hidden")
                self.yesno_scores = tf.contrib.layers.fully_connected(hidden, 1,
                                                                      activation_fn=None,
                                                                      scope="yesno_scores")
                self.yesno_scores = tf.reshape(self.yesno_scores, [-1])
                self.yesno_probs = tf.nn.sigmoid(self.yesno_scores)

        self._train_variables = [p for p in tf.trainable_variables() if self.name in p.name]
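
tfutil.segment_softmax is a further project-local helper that is not shown. The sketch below is an assumption based on how it is used here: the softmax is normalized jointly over every token of every paragraph that belongs to the same question (paragraphs are grouped by self.context_partition), so that the tf.segment_sum that follows yields one representation per question.

import tensorflow as tf

def segment_softmax(scores, partition):
    # scores: [B, L, 1] attention logits, one row per paragraph;
    # partition: [B] sorted segment ids mapping each paragraph to its question.
    # Normalizes exp(scores) over all tokens of all paragraphs in a segment.
    seg_max = tf.segment_max(tf.reduce_max(scores, [1, 2]), partition)
    scores = scores - tf.reshape(tf.gather(seg_max, partition), [-1, 1, 1])
    exp_scores = tf.exp(scores)
    seg_sum = tf.segment_sum(tf.reduce_sum(exp_scores, [1, 2]), partition)
    return exp_scores / tf.reshape(tf.gather(seg_sum, partition), [-1, 1, 1])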
Example #8
    def _init(self):
        ExtractionQAModel._init(self)
        if self._composition == "GRU":
            if self._layer_norm:
                rnn_constructor = lambda size: FusedRNNCellAdaptor(LayerNormGRUCell(size), use_dynamic_rnn=True)
            else:
                rnn_constructor = lambda size: FusedRNNCellAdaptor(GRUBlockCell(size), use_dynamic_rnn=True)
        elif self._composition == "RNN":
            rnn_constructor = lambda size: FusedRNNCellAdaptor(BasicRNNCell(size), use_dynamic_rnn=True)
        else:
            if self._layer_norm:
                rnn_constructor = lambda size: FusedRNNCellAdaptor(LayerNormLSTMCell(size), use_dynamic_rnn=True)
            else:
                rnn_constructor = lambda size: LSTMBlockFusedCell(size)

        with tf.device(self._device0):
            self._eval = tf.get_variable("is_eval", initializer=False, trainable=False)
            self._set_train = self._eval.initializer
            self._set_eval = self._eval.assign(True)

            self.context_mask = tfutil.mask_for_lengths(self.context_length, self._batch_size, self.embedder.max_length)

            question_binary_mask = tfutil.mask_for_lengths(self.question_length,
                                                           self.question_embedder.batch_size,
                                                           self.question_embedder.max_length,
                                                           value=1.0,
                                                           mask_right=False)

            with tf.variable_scope("preprocessing_layer"):

                question_binary_mask = tf.gather(question_binary_mask, self.context_partition)
                self._embedded_question_not_dropped = tf.gather(self._embedded_question_not_dropped, self.context_partition)

                # context
                if self._with_features:
                    mask = tf.get_variable("attention_mask", [1, 1, self._embedded_question_not_dropped.get_shape()[-1].value],
                                           initializer=tf.constant_initializer(1.0))
                    # compute word wise features
                    #masked_question = self.question_embedder.output * mask
                    # [B, Q, L]
                    q2c_scores = tf.matmul(self._embedded_question_not_dropped * mask,
                                           self._embedded_context_not_dropped, adjoint_b=True)
                    q2c_scores = q2c_scores + tf.expand_dims(self.context_mask, 1)
                    #c2q_weights = tf.reduce_max(q2c_scores / (tf.reduce_max(q2c_scores, [2], keep_dims=True) + 1e-5), [1])

                    q2c_weights = tf.reduce_sum(tf.nn.softmax(q2c_scores) *
                                                tf.expand_dims(question_binary_mask, 2), [1])

                    # [B, L, 2]
                    self.context_features = tf.concat(axis=2, values=[tf.expand_dims(self._word_in_question, 2),
                                                          #tf.expand_dims(c2q_weights, 2),
                                                          tf.expand_dims(q2c_weights,  2)])

                    embedded_ctxt = tf.concat(axis=2, values=[self.embedded_context, self.context_features])


                    in_question_feature = tf.ones(tf.stack([self.question_embedder.batch_size,
                                                           self.question_embedder.max_length, 2]))
                    embedded_question = tf.concat(axis=2, values=[self.embedded_question, in_question_feature])
                else:
                    embedded_ctxt = self.embedded_context
                    embedded_question = self.embedded_question

                if self._with_question_type_features:
                    # Need to add another zero vector so that the total number
                    # of features is even, for LSTM performance reasons.
                    question_type_features = tf.stack([self._is_factoid,
                                                      self._is_list,
                                                      self._is_yesno,
                                                      tf.zeros(tf.shape(self._is_list),
                                                               dtype=tf.bool)],
                                                     axis=1)
                    question_type_features = tf.cast(question_type_features, tf.float32)
                    question_type_features = tf.expand_dims(question_type_features, 1)

                    embedded_question = tf.concat(axis=2, values=[embedded_question,
                                                      tf.tile(question_type_features,
                                                              tf.stack([1, tf.shape(embedded_question)[1], 1]))])

                    question_type_features = tf.gather(question_type_features, self.context_partition)
                    embedded_ctxt = tf.concat(axis=2, values=[embedded_ctxt,
                                                  tf.tile(question_type_features,
                                                          tf.stack([1, tf.shape(embedded_ctxt)[1], 1]))])

                if self._with_entity_tag_features:
                    embedded_question = tf.concat(axis=2, values=[embedded_question,
                                                      tf.cast(self._question_tags, tf.float32)])
                    embedded_ctxt = tf.concat(axis=2, values=[embedded_ctxt,
                                                  tf.cast(self._context_tags, tf.float32)])

                self.encoded_question = self._preprocessing_layer(rnn_constructor, embedded_question,
                                                                  self.question_length, projection_scope="question_proj")

                self.encoded_ctxt = self._preprocessing_layer(rnn_constructor, embedded_ctxt, self.context_length,
                                                              share_rnn=True, projection_scope="context_proj",
                                                              num_fusion_layers=self._num_intrafusion_layers)

                # single time attention over question
                attention_scores = tf.contrib.layers.fully_connected(self.encoded_question, 1,
                                                                     activation_fn=None,
                                                                     weights_initializer=None,
                                                                     biases_initializer=None,
                                                                     scope="attention")
                attention_scores = attention_scores + tf.expand_dims(
                    tfutil.mask_for_lengths(self.question_length, self.question_embedder.batch_size,
                                            self.question_embedder.max_length), 2)
                attention_weights = tf.nn.softmax(attention_scores, 1)
                self.question_attention_weights = attention_weights
                self.question_representation = tf.reduce_sum(attention_weights * self.encoded_question, [1])

                # Multiply question features for each paragraph
                self.encoded_question = tf.gather(self.encoded_question, self.context_partition)
                self.question_representation_per_context = tf.gather(self.question_representation, self.context_partition)
                self.question_length = tf.gather(self.question_length, self.context_partition)

            if self._with_inter_fusion:
                with tf.variable_scope("inter_fusion"):
                    with tf.variable_scope("associative") as vs:
                        mask = tf.get_variable("attention_mask", [1, 1, self.size], initializer=tf.constant_initializer(1.0))
                        mask = tf.nn.relu(mask)
                        for i in range(1):
                            # [B, Q, L]
                            inter_scores = tf.matmul(self.encoded_question * mask, self.encoded_ctxt, adjoint_b=True)
                            inter_scores = inter_scores + tf.expand_dims(self.context_mask, 1)

                            inter_weights = tf.nn.softmax(inter_scores)
                            inter_weights = inter_weights * tf.expand_dims(question_binary_mask, 2)
                            # [B, L, Q] x [B, Q, S] -> [B, L, S]
                            co_states = tf.matmul(inter_weights, self.encoded_question, adjoint_a=True)

                            u = tf.contrib.layers.fully_connected(tf.concat(axis=2, values=[self.encoded_ctxt, co_states]), self.size,
                                                                  activation_fn=tf.sigmoid,
                                                                  biases_initializer=tf.constant_initializer(1.0),
                                                                  scope="update_gate")
                            self.encoded_ctxt = u * self.encoded_ctxt + (1.0 - u) * co_states
                            vs.reuse_variables()

                    with tf.variable_scope("recurrent") as vs:
                        self.encoded_ctxt.set_shape([None, None, self.size])
                        self.encoded_ctxt = dynamic_rnn(GatedAggregationRNNCell(self.size),
                                                        tf.reverse_sequence(self.encoded_ctxt, self.context_length, 1),
                                                        self.context_length,
                                                        dtype=tf.float32, time_major=False, scope="backward")[0]

                        self.encoded_ctxt = dynamic_rnn(GatedAggregationRNNCell(self.size),
                                                        tf.reverse_sequence(self.encoded_ctxt, self.context_length, 1),
                                                        self.context_length,
                                                        dtype=tf.float32, time_major=False, scope="forward")[0]

            # No matching layer, so set matched_output to encoded_ctxt (for compatibility)
            self.matched_output = self.encoded_ctxt

            with tf.variable_scope("pointer_layer"):
                self.predicted_context_indices, \
                self._start_scores, self._start_pointer, self.start_probs, \
                self._end_scores, self._end_pointer, self.end_probs = \
                    self._spn_answer_layer(self.question_representation_per_context, self.encoded_ctxt)

            self.yesno_added = False
            if self._with_yesno:
                self.add_yesno(add_model_scope=False)

            self._train_variables = [p for p in tf.trainable_variables() if self.name in p.name]
Example #9
    def _init(self):
        # build char_vocab
        # reset vocab_size to size of actual vocabulary
        conv_width = 5
        pad_right = math.ceil(conv_width / 2)  # fixed PAD on the right side
        self.vocab_size = max(self.vocab.values()) + 1
        max_l = max(len(w) for w in self.vocab) + pad_right
        self.char_vocab = {"PAD": 0}
        self._word_to_chars_arr = np.zeros((self.vocab_size, max_l), np.int16)
        self._word_lengths_arr = np.zeros([self.vocab_size], np.int8)
        for w, i in sorted(self.vocab.items()):
            for k, c in enumerate(w):
                j = self.char_vocab.get(c)
                if j is None:
                    j = len(self.char_vocab)
                    self.char_vocab[c] = j
                self._word_to_chars_arr[i, k] = j
            self._word_lengths_arr[i] = len(w) + conv_width - 1

        with tf.device("/cpu:0"):
            with tf.variable_scope("embeddings"):
                self._word_to_chars = tf.placeholder(tf.int64, [None, None],
                                                     "word_to_chars")
                self._word_lengths = tf.placeholder(tf.int64, [None],
                                                    "word_lengths")
                self.char_embedding_matrix = \
                    tf.get_variable("char_embedding_matrix", shape=(len(self.char_vocab), self.size),
                                    initializer=tf.random_normal_initializer(0.0, 0.1), trainable=True)

                self._max_length = tf.cast(tf.reduce_max(self.seq_lengths),
                                           tf.int32)
                self._batch_size = tf.shape(self.seq_lengths)[0]
                self._sliced_inputs = tf.slice(self.inputs, (0, 0),
                                               tf.stack((-1, self.max_length)))

                self.unique_words = tf.placeholder(
                    tf.int64, [None], "unique_words"
                )  #tf.unique(tf.reshape(self._sliced_inputs, [-1]))
                self._word_idx = tf.placeholder(tf.int64, [None], "word_idx")
                self._new_inputs = tf.reshape(self._word_idx,
                                              tf.shape(self._sliced_inputs))

                chars = tf.nn.embedding_lookup(self._word_to_chars,
                                               self.unique_words)
                wl = tf.nn.embedding_lookup(self._word_lengths,
                                            self.unique_words)
                max_word_length = tf.cast(tf.reduce_max(wl), tf.int32)
                chars = tf.slice(chars, [0, 0], tf.stack([-1,
                                                          max_word_length]))

                embedded_chars = tf.nn.embedding_lookup(
                    self.char_embedding_matrix, chars)
                #embedded_chars_reshaped = tf.reshape(embedded_chars, tf.pack([-1, max_word_length, 4 *  self.size]))
                with tf.device(self._device):
                    with tf.variable_scope("conv"):
                        # filter: [conv_width * S, S], reshaped to [conv_width, S, S]
                        conv_filter = tf.get_variable(
                            "filter", [conv_width * self.size, self.size])
                        filter_reshaped = tf.reshape(
                            conv_filter, [conv_width, self.size, self.size])
                        # [B, T, S]
                        conv_out = tf.nn.conv1d(embedded_chars,
                                                filter_reshaped, 1, "SAME")
                        conv_mask = tf.expand_dims(
                            tfutil.mask_for_lengths(
                                self._word_lengths - pad_right,
                                max_length=max_word_length), 2)
                        conv_out = conv_out + conv_mask

                    self.unique_embedded_words = tf.reduce_max(conv_out, [1])

                    embedded_words = tf.gather(self.unique_embedded_words,
                                               self._word_idx)
                    self._embedded_words = tf.reshape(
                        embedded_words,
                        tf.stack([-1, self.max_length, self.size]))

        self._train_variables = [
            p for p in tf.trainable_variables()
            if self.name + "/embeddings" in p.name
        ]
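
The unique_words and _word_idx placeholders are fed from outside the graph; the commented-out tf.unique call above hints at the intended feeding scheme. One way the feed might be constructed with numpy, inferred from the graph (embedder and word_ids are hypothetical names, not from the original code):

import numpy as np

# word_ids: a [B, T] numpy array of word ids for one batch, already sliced to
# the batch's max_length (hypothetical input)
unique_ids, inverse_idx = np.unique(word_ids.reshape(-1), return_inverse=True)
feed_dict = {
    embedder._word_to_chars: embedder._word_to_chars_arr,   # full vocab table
    embedder._word_lengths: embedder._word_lengths_arr,
    embedder.unique_words: unique_ids,
    embedder._word_idx: inverse_idx,
}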