Example #1
    def build_graph(self, values, values_mask, keys, keys_mask):
        sentinel_padding = tf.constant(1, shape=[1, 1])
        batch_size = self.FLAGS.batch_size
        with vs.variable_scope("Attention"):
            # Calculate attention distribution
            dense_layer = partial(
                tf.layers.dense,
                activation=tf.nn.tanh,
                kernel_regularizer=tf.contrib.layers.l1_regularizer(0.001))
            projected_values_t = dense_layer(values, self.FLAGS.embedding_size)

            values_t = tf.concat([
                projected_values_t,
                tf.broadcast_to(self.values_sentinel,
                                [batch_size, 1, self.FLAGS.embedding_size])
            ], 1)  # (batch_size, num_values + 1, embedding_size)

            # Augment the context (key) vectors with a sentinel.
            keys_t = tf.concat([
                keys,
                tf.broadcast_to(self.keys_sentinel,
                                [batch_size, 1, self.FLAGS.embedding_size])
            ], 1)

            affinity_scores = tf.matmul(
                keys_t, tf.transpose(values_t, perm=[
                    0, 2, 1
                ]))  # shape (batch_size, num_keys + 1, num_values + 1)

            values_mask_1 = tf.expand_dims(
                tf.concat([
                    values_mask,
                    tf.broadcast_to(sentinel_padding, [batch_size, 1])
                ], 1), 1)  # shape (batch_size, 1, num_values + 1)
            _, C2Q_softmax = masked_softmax(
                affinity_scores, values_mask_1, 2
            )  # shape (batch_size, num_keys + 1, num_values + 1). take softmax over values
            attn_output_1 = tf.matmul(
                C2Q_softmax,
                values_t)  # shape (batch_size, num_keys + 1, embedding_size)

            keys_mask_1 = tf.expand_dims(
                tf.concat([
                    keys_mask,
                    tf.broadcast_to(sentinel_padding, [batch_size, 1])
                ], 1), 2)  # shape (batch_size, num_keys + 1, 1)
            _, Q2C_softmax = masked_softmax(affinity_scores, keys_mask_1, 1)
            Q2C_output = tf.matmul(tf.transpose(Q2C_softmax, perm=[0, 2, 1]),
                                   keys_t)

            attn_output_2 = tf.matmul(C2Q_softmax, Q2C_output)

            key_hidden = tf.concat([attn_output_2, attn_output_1], 2)

            key_hidden = key_hidden[:, :self.FLAGS.context_len, :]

            # Apply dropout
            output = tf.nn.dropout(key_hidden, self.keep_prob)

            return output
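
All of these examples call a masked_softmax helper that is not part of the snippets. A minimal sketch of what it might look like, inferred only from how it is called here (it takes logits, a mask and a dimension, and returns both the masked logits and the probability distribution); this is an assumption, not the original implementation:

import tensorflow as tf

def masked_softmax(logits, mask, dim):
    # Add a very large negative number where mask == 0 so that padded
    # positions get ~zero probability, then softmax over `dim`.
    exp_mask = (1.0 - tf.cast(mask, tf.float32)) * (-1e30)
    masked_logits = tf.add(logits, exp_mask)
    prob_dist = tf.nn.softmax(masked_logits, dim)
    return masked_logits, prob_dist
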
Example #2
    def build_graph(self, values, values_mask, keys):
        """
        Keys attend to values.
        For each key, return an attention distribution and an attention output vector.

        Inputs:
          values: Tensor shape (batch_size, num_values, value_vec_size).
          values_mask: Tensor shape (batch_size, num_values).
            1s where there's real input, 0s where there's padding
          keys: Tensor shape (batch_size, num_keys, key_vec_size)

        Outputs:
          attn_dist: Tensor shape (batch_size, num_keys, num_values).
            For each key, the distribution should sum to 1,
            and should be 0 in the value locations that correspond to padding.
          output: Tensor shape (batch_size, num_keys, hidden_size).
            This is the attention output; the weighted sum of the values
            (using the attention distribution as weights).
        """
        with vs.variable_scope("MultiplicativeAttn"):
            keys_shape = keys.get_shape().as_list()  # (batch_size, num_keys, key_vec_size)
            values_shape = values.get_shape().as_list()  # (batch_size, num_values, value_vec_size)

            # Calculate attention distribution
            W = tf.get_variable(
                'W_mul_attn',
                shape=(self.key_vec_size, self.value_vec_size),
                initializer=tf.contrib.layers.xavier_initializer())
            keys_r = tf.reshape(
                keys,
                [-1, keys_shape[2]])  # (batch_size * num_keys, key_vec_size)
            attn_logits = tf.matmul(
                keys_r, W)  # (batch_size * num_keys, value_vec_size)
            attn_logits = tf.reshape(
                attn_logits, [-1, keys_shape[1], values_shape[2]
                              ])  # (batch_size, num_keys, value_vec_size)
            values_t = tf.transpose(
                values, perm=[0, 2,
                              1])  # (batch_size, value_vec_size, num_values)
            attn_logits = tf.matmul(
                attn_logits, values_t)  # (batch_size, num_keys, num_values)
            attn_logits_mask = tf.expand_dims(
                values_mask, 1)  # shape (batch_size, 1, num_values)
            attn_masked_logits, attn_prob_dist = masked_softmax(
                attn_logits, attn_logits_mask, 2
            )  # shape (batch_size, num_keys, num_values). take softmax over values

            # Use attention distribution to take weighted sum of values
            output = tf.matmul(
                attn_prob_dist,
                values)  # shape (batch_size, num_keys, value_vec_size)

            # Apply dropout
            output = tf.nn.dropout(output, self.keep_prob)

            return attn_masked_logits, attn_prob_dist, output
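
The reshape/matmul sequence above computes keys · W · valuesᵀ. As a sketch using the same tensors and the W variable defined above, the logits can also be written as a single einsum, avoiding the intermediate reshapes:

            # keys:   (batch_size, num_keys, key_vec_size)
            # W:      (key_vec_size, value_vec_size)
            # values: (batch_size, num_values, value_vec_size)
            attn_logits = tf.einsum('bkd,dv,bnv->bkn', keys, W, values)  # (batch_size, num_keys, num_values)
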
Example #3
    def build_graph(self, values, values_mask, keys):
        """
        Keys attend to values.
        For each key, return an attention distribution and an attention output vector.

        Inputs:
          values: Tensor shape (batch_size, num_values, value_vec_size).
          values_mask: Tensor shape (batch_size, num_values).
            1s where there's real input, 0s where there's padding
          keys: Tensor shape (batch_size, num_keys, value_vec_size)

        Outputs:
          attn_dist: Tensor shape (batch_size, num_keys, num_values).
            For each key, the distribution should sum to 1,
            and should be 0 in the value locations that correspond to padding.
          output: Tensor shape (batch_size, num_keys, hidden_size).
            This is the attention output; the weighted sum of the values
            (using the attention distribution as weights).
        """
        with vs.variable_scope("GatedDotAttn"):
            # Calculate attention distribution
            values_t = tf.transpose(
                values, perm=[0, 2,
                              1])  # (batch_size, value_vec_size, num_values)
            attn_logits = tf.matmul(
                keys, values_t)  # shape (batch_size, num_keys, num_values)
            attn_logits_mask = tf.expand_dims(
                values_mask, 1)  # shape (batch_size, 1, num_values)
            _, attn_dist = masked_softmax(
                attn_logits, attn_logits_mask, 2
            )  # shape (batch_size, num_keys, num_values). take softmax over values

            # Use attention distribution to take weighted sum of values
            output = tf.matmul(
                attn_dist,
                values)  # shape (batch_size, num_keys, value_vec_size)

            # Blend
            output = tf.concat([keys, output], axis=2)

            # Apply dropout
            output = tf.nn.dropout(output, self.keep_prob)

        # Compute gate
        with tf.variable_scope('c2qgate'):
            shape = tf.shape(output)
            dim = output.get_shape().as_list()[-1]
            flatten = tf.reshape(output, (-1, dim))
            W = tf.get_variable('Wc2gate', (dim, dim))
            gate = tf.matmul(flatten, W)
            gate = tf.reshape(gate, shape)
            gate = tf.nn.sigmoid(gate)
            output = gate * output
            return attn_dist, output
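
The c2qgate block above is a plain matmul followed by a sigmoid. An equivalent sketch using tf.layers.dense on the `output` tensor (use_bias=False keeps it identical to the explicit matmul, which has no bias term):

            gate = tf.layers.dense(output, output.get_shape().as_list()[-1],
                                   activation=tf.nn.sigmoid, use_bias=False,
                                   name='Wc2gate')
            output = gate * output  # element-wise gating, as in the explicit version
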
Example #4
    def build_graph(self, keys, keys_mask):
        with vs.variable_scope("Attention"):
            dense_layer_1 = partial(
                tf.layers.dense,
                activation=None,
                use_bias=False,
                kernel_regularizer=tf.contrib.layers.l1_regularizer(0.001))
            dense_layer_2 = partial(
                tf.layers.dense,
                activation=None,
                use_bias=False,
                kernel_regularizer=tf.contrib.layers.l1_regularizer(0.001))
            projected_keys_1 = dense_layer_1(
                keys, self.hidden_vec_size
            )  # (batch_size, num_keys, hidden_vec_size)
            projected_keys_2 = dense_layer_2(
                keys, self.hidden_vec_size
            )  # (batch_size, num_keys, hidden_vec_size)
            keys_t = tf.expand_dims(projected_keys_1, 2) + tf.expand_dims(
                projected_keys_2, 1)
            keys_t.set_shape([
                self.FLAGS.batch_size, self.FLAGS.context_len,
                self.FLAGS.context_len, self.hidden_vec_size
            ])
            keys_t = tf.nn.tanh(keys_t)
            V = partial(
                tf.layers.dense,
                activation=None,
                use_bias=False,
                kernel_regularizer=tf.contrib.layers.l1_regularizer(0.001))
            self_attn_keys = tf.squeeze(V(keys_t, 1), axis=-1)  # (batch_size, num_keys, num_keys)

            _, self_attn_softmax = masked_softmax(self_attn_keys,
                                                  tf.expand_dims(keys_mask, 1),
                                                  1)
            output = tf.matmul(
                self_attn_softmax, keys
            )  # no transpose needed since the score matrix is symmetric; shape (batch_size, num_keys, value_vec_size)

            # Apply dropout
            output = tf.nn.dropout(output, self.keep_prob)

            return output
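
In equation form, Example #4 scores every pair of keys additively: score[b, i, j] = v · tanh(W1·k_i + W2·k_j), where W1 and W2 are the two dense projections (dense_layer_1, dense_layer_2) and v is the final projection V; masked_softmax then normalizes these scores into attention weights over the keys.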
Example #5
    def build_mult_graph(self, values, values_mask, keys, FLAGS):
        values = tf.nn.dropout(values, self.keep_prob)
        keys = tf.nn.dropout(keys, self.keep_prob)

        with vs.variable_scope("GatedDotAttn"):
            # Calculate attention distribution
            values_ = tf.nn.relu(dense(values, FLAGS.hidden_size, 'values'))
            values_t = tf.transpose(
                values_, perm=[0, 2,
                               1])  # (batch_size, value_vec_size, num_values)
            keys_ = tf.nn.relu(dense(keys, FLAGS.hidden_size, 'keys'))
            attn_logits = tf.matmul(keys_, values_t) / (
                FLAGS.hidden_size**0.5
            )  # shape (batch_size, num_keys, num_values)
            attn_logits_mask = tf.expand_dims(
                values_mask, 1)  # shape (batch_size, 1, num_values)
            _, attn_dist = masked_softmax(
                attn_logits, attn_logits_mask, 2
            )  # shape (batch_size, num_keys, num_values). take softmax over values

            # Use attention distribution to take weighted sum of values
            output = tf.matmul(
                attn_dist,
                values)  # shape (batch_size, num_keys, value_vec_size)

            # Blend
            output = tf.concat([keys, output], axis=2)

            # Apply dropout
            output = tf.nn.dropout(output, self.keep_prob)

        # Compute gate
        with tf.variable_scope('c2qgate'):
            shape = tf.shape(output)
            dim = output.get_shape().as_list()[-1]
            flatten = tf.reshape(output, (-1, dim))
            W = tf.get_variable('Wc2gate', (dim, dim))
            gate = tf.matmul(flatten, W)
            gate = tf.reshape(gate, shape)
            gate = tf.nn.sigmoid(gate)
            output = gate * output
            return attn_dist, output
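
build_mult_graph also relies on a dense helper that is not shown. Assuming the (inputs, units, scope) signature seen in the calls above, a minimal sketch might be:

def dense(inputs, units, scope):
    # Assumed helper: a linear projection over the last dimension, scoped by name.
    with tf.variable_scope(scope):
        return tf.layers.dense(inputs, units, activation=None)
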
    def build_graph(self):

        # ENCODING
        unstack_context = self.e_context_embs
        unstack_qn = self.e_qn_embs
        with tf.variable_scope('encoding') as scope:

            # Encode with a dynamic bidirectional RNN.
            # TODO: optionally wrap the GRU cells in dropout and make the encoder size configurable.
            emb_fwd_cell = tf.contrib.rnn.GRUCell(self.FLAGS.hidden_size)
            emb_back_cell = tf.contrib.rnn.GRUCell(self.FLAGS.hidden_size)

            (c_fwd, c_back), _ = tf.nn.bidirectional_dynamic_rnn(
                emb_fwd_cell,
                emb_back_cell,
                unstack_context,
                tf.reduce_sum(self.context_mask, reduction_indices=1),
                dtype='float32')

            tf.get_variable_scope().reuse_variables()

            (qn_fwd, qn_back), _ = tf.nn.bidirectional_dynamic_rnn(
                emb_fwd_cell,
                emb_back_cell,
                unstack_qn,
                tf.reduce_sum(self.qn_mask, reduction_indices=1),
                dtype='float32')

            u_Q = tf.concat(
                [qn_fwd, qn_back], 2
            )  # [batch, q_len, 2 * hidden_size] because bidirectional stacks the forward and backward
            u_P = tf.concat([c_fwd, c_back],
                            2)  # [batch, c_len, 2 * hidden_size]

            u_Q = tf.nn.dropout(u_Q, self.keep_prob)
            u_P = tf.nn.dropout(u_P, self.keep_prob)

        # GATED ATTENTION
        v_P = []  # All attention states across time
        # each element of v_P is an attention state for one time point with dim [batch_size, hidden_size]
        print "Gated Attention"
        with tf.variable_scope('Attention_gated') as scope:
            W_uQ = tf.get_variable(
                'W_uQ',
                shape=(2 * self.FLAGS.hidden_size, self.FLAGS.hidden_size),
                initializer=tf.contrib.layers.xavier_initializer())
            W_uP = tf.get_variable(
                'W_uP',
                shape=(2 * self.FLAGS.hidden_size, self.FLAGS.hidden_size),
                initializer=tf.contrib.layers.xavier_initializer())
            W_vP = tf.get_variable(
                'W_vP',
                shape=(self.FLAGS.hidden_size, self.FLAGS.hidden_size),
                initializer=tf.contrib.layers.xavier_initializer())
            v_QP = tf.get_variable(
                'v_QP',
                shape=(self.FLAGS.hidden_size),
                initializer=tf.contrib.layers.xavier_initializer())
            W_g_QP = tf.get_variable('W_g_QP',
                                     shape=(4 * self.FLAGS.hidden_size,
                                            4 * self.FLAGS.hidden_size))

            # TO DO: add drop prob in FLAGS
            QP_cell = tf.contrib.rnn.DropoutWrapper(
                tf.contrib.rnn.GRUCell(self.FLAGS.hidden_size), self.keep_prob)
            zeros_dim = tf.stack([tf.shape(u_Q)[0], self.FLAGS.hidden_size])
            QP_cell_hidden = tf.fill(zeros_dim, 0.0)
            for t in range(0, self.FLAGS.context_len):

                # TODO: MOVE THE VARIABLES TO SOMEWHERE ELSE APPROPRIATE

                WuQ_uQ = tf.tensordot(u_Q, W_uQ,
                                      axes=[[2], [0]
                                            ])  # [batch, q_len, hidden_size]
                u_P_t = tf.reshape(
                    u_P[:, t, :], (-1, 1, 2 * self.FLAGS.hidden_size)
                )  # slice only 1 context word, [batch_size, 1, 2 * hidden_size]
                WuP_uP = tf.tensordot(u_P_t, W_uP,
                                      axes=[[2],
                                            [0]])  # [batch, 1, hidden_size]

                if t == 0:
                    s_t = tf.tensordot(tf.tanh(WuQ_uQ + WuP_uP),
                                       v_QP,
                                       axes=[[2],
                                             [0]])  # returns [batch, q_len]
                else:
                    v_P_t = tf.reshape(v_P[t - 1],
                                       (-1, 1, self.FLAGS.hidden_size
                                        ))  # [batch_size, 1, hidden_size]
                    WvP_vP = tf.tensordot(
                        v_P_t, W_vP,
                        axes=[[2], [0]])  # [batch_size, 1, hidden_size]
                    s_t = tf.tensordot(tf.tanh(WuQ_uQ + WuP_uP + WvP_vP),
                                       v_QP,
                                       axes=[[2],
                                             [0]])  # returns [batch, q_len]

                #a_t = tf.nn.softmax(s_t, 1)
                _, a_t = masked_softmax(s_t, self.qn_mask, 1)  # [batch, q_len]
                # [batch, q_len] , [batch,q_len,2*hidden_size] -> [batch, 2*hidden_size]
                c_t = tf.einsum('ij,ijk->ik', a_t, u_Q)  #[batch,2*hidden_size]

                uPt_ct = tf.concat([tf.squeeze(u_P_t), c_t],
                                   1)  # [batch, 2 * 2 * hidden_size]
                g_t = tf.nn.sigmoid(tf.matmul(
                    uPt_ct, W_g_QP))  # [batch, 2 * 2 * hidden_size]
                uPt_ct_star = tf.einsum('ij,ij->ij', g_t, uPt_ct)

                if t > 0:
                    tf.get_variable_scope().reuse_variables()
                QP_output, QP_cell_hidden = QP_cell(
                    uPt_ct_star, QP_cell_hidden
                )  # both output and hidden [batch_size, hidden_size]
                v_P.append(QP_output)

            v_P = tf.stack(v_P, 1)  # [batch, context_len, hidden_size]
            v_P = tf.nn.dropout(v_P, self.keep_prob)

        #SELF ATTN
        print "self attention"
        with tf.variable_scope("self_matching_attn") as scope:
            SM_input = []
            W_v_P = tf.get_variable(
                'W_v_P',
                shape=(self.FLAGS.hidden_size, self.FLAGS.hidden_size),
                initializer=tf.contrib.layers.xavier_initializer())
            W_v_P_tot = tf.get_variable(
                'W_v_P_tot',
                shape=(self.FLAGS.hidden_size, self.FLAGS.hidden_size),
                initializer=tf.contrib.layers.xavier_initializer())

            v_SM = tf.get_variable('v_SM', shape=(self.FLAGS.hidden_size))
            for t in range(0, self.FLAGS.context_len):

                v_j_P = tf.reshape(
                    v_P[:, t, :],
                    (-1, 1, self.FLAGS.hidden_size
                     ))  #Slice 1 v_P in time t [batch_size, 1, hidden_size]
                WvP_vj = tf.tensordot(v_j_P, W_v_P,
                                      axes=[[2],
                                            [0]])  # [batch, 1, hidden_size]
                WvPtot_vP = tf.tensordot(
                    v_P, W_v_P_tot,
                    axes=[[2], [0]])  # [batch, context_len, hidden_size]

                s_t = tf.tensordot(tf.tanh(WvP_vj + WvPtot_vP),
                                   v_SM,
                                   axes=[[2], [0]])  # [batch, context_len]
                #a_t = tf.nn.softmax(s_t, 1)
                _, a_t = masked_softmax(s_t, self.context_mask, 1)
                c_t = tf.einsum('ij,ijk->ik', a_t, v_P)  #[batch, hidden_size]

                # add the gate
                vPt_ct = tf.concat([tf.squeeze(v_j_P), c_t],
                                   1)  #[batch, 2 * hidden_size]
                g_t = tf.nn.sigmoid(vPt_ct)
                vPt_ct_star = tf.einsum('ij,ij->ij', g_t,
                                        vPt_ct)  # [batch, 2*hidden_size]

                SM_input.append(vPt_ct_star)

            # The reference implementation stacked and then unstacked here; we just stack SM_input directly.

            SM_input = tf.stack(SM_input,
                                1)  # [batch, context_len, 2 * hidden_size]

            SM_fwd_cell = tf.contrib.rnn.DropoutWrapper(
                tf.contrib.rnn.GRUCell(self.FLAGS.hidden_size), self.keep_prob)
            SM_back_cell = tf.contrib.rnn.DropoutWrapper(
                tf.contrib.rnn.GRUCell(self.FLAGS.hidden_size), self.keep_prob)
            (h_P_fwd, h_P_back), SM_final = tf.nn.bidirectional_dynamic_rnn(
                SM_fwd_cell,
                SM_back_cell,
                SM_input,
                tf.reduce_sum(self.context_mask, reduction_indices=1),
                dtype=tf.float32)
            h_P = tf.concat([h_P_fwd, h_P_back], 2)

            h_P = tf.nn.dropout(
                h_P, self.keep_prob)  #[batch, context_len, 2*hidden_size]

        # OUTPUT
        print "output"
        with tf.variable_scope("Output") as scope:
            W_ruQ = tf.get_variable('W_ruQ',
                                    shape=(2 * self.FLAGS.hidden_size,
                                           2 * self.FLAGS.hidden_size))
            V_rQ = tf.get_variable('V_rQ',
                                   shape=(self.FLAGS.question_len,
                                          2 * self.FLAGS.hidden_size))
            W_vQ = tf.get_variable('W_vQ',
                                   shape=(2 * self.FLAGS.hidden_size,
                                          2 * self.FLAGS.hidden_size))

            v_rQ = tf.get_variable('v_rQ', shape=(2 * self.FLAGS.hidden_size))

            WuQ_ujQ = tf.tensordot(
                u_Q, W_ruQ, [[2], [0]])  #[batch, q_len, 2 * hidden_size]
            WvQ_VrQ = tf.tensordot(V_rQ, W_vQ,
                                   [[1], [0]])  #[q_len, 2*hidden_size]

            s_t = tf.tensordot(
                tf.tanh(WuQ_ujQ + WvQ_VrQ), v_rQ, axes=[
                    [2], [0]
                ])  # The addition will broadcast # final shape: [batch, q_len]
            _, a_t = masked_softmax(s_t, self.qn_mask, 1)
            rQ = tf.einsum('ij,ijk->ik', a_t, u_Q)
            rQ = tf.nn.dropout(rQ, self.keep_prob)  #[batch, 2*hidden_size]

            h_a = rQ  # initial ans pointer

            p_t = [None] * 2

            W_hP = tf.get_variable('W_hP',
                                   shape=(2 * self.FLAGS.hidden_size,
                                          self.FLAGS.hidden_size))
            W_ha = tf.get_variable('W_ha',
                                   shape=(2 * self.FLAGS.hidden_size,
                                          self.FLAGS.hidden_size))
            v_ap = tf.get_variable(
                'v_ap', shape=(self.FLAGS.hidden_size))  # answer pointer bias
            ans_cell = tf.contrib.rnn.DropoutWrapper(
                tf.contrib.rnn.GRUCell(2 * self.FLAGS.hidden_size),
                self.keep_prob)

            for t in range(0, 2):
                # run the pointer RNN twice: once for the start position, once for the end position

                WhP_hP = tf.tensordot(
                    h_P, W_hP, [[2], [0]])  #[batch, context_len, hidden_size]
                Wha_ha = tf.reshape(
                    tf.tensordot(h_a, W_ha, [[1], [0]]),
                    (-1, 1, self.FLAGS.hidden_size))  #[batch, 1, encode]

                s_t = tf.tensordot(tf.tanh(WhP_hP + Wha_ha),
                                   v_ap,
                                   axes=[[2], [0]])  # [batch, context_len]
                #a_t = tf.nn.softmax(s_t, 1)
                _, a_t = masked_softmax(s_t, self.context_mask, 1)

                if t == 0:
                    self.logits_start = a_t  #[batch, context_len]
                else:
                    self.logits_end = a_t

                c_t = tf.einsum('ij,ijk->ik', a_t, h_P)  #[batch, 2*encode]

                if t == 0:
                    h_a, _ = ans_cell(c_t, h_a)  # h_a = [batch, 2*encode]

        print "complete"
    def build_graph(self):
        """Builds the main part of the graph for the model, starting from the input embeddings to the final distributions for the answer span.
        Defines:
          self.logits_start, self.logits_end: Both tensors shape (batch_size, context_len).
            These are the logits (i.e. values that are fed into the softmax function) for the start and end distribution.
            Important: these are -large in the pad locations. Necessary for when we feed into the cross entropy function.
          self.probdist_start, self.probdist_end: Both shape (batch_size, context_len). Each row sums to 1.
            These are the result of taking (masked) softmax of logits_start and logits_end.
        """

        # Use a RNN to get hidden states for the context and the question
        # Note: here the RNNEncoder is shared (i.e. the weights are the same)
        # between the context and the question.
        if self.FLAGS.model == "baseline":
            encoder = RNNEncoder(self.FLAGS.hidden_size, self.keep_prob)
        elif self.FLAGS.model in ("bidaf", "bidaf_dynamic", "bidaf_self_attn", "bidaf_dynamic_self_attn"):
            print("INSIDE the BIDAF model")
            encoder = RNNEncoder_LSTM(self.FLAGS.hidden_size, self.keep_prob)
        elif self.FLAGS.model in ("coatt", "coatt_dynamic", "coatt_dynamic_self_attn"):
            encoder = LSTMEncoder(self.FLAGS.hidden_size, self.keep_prob)

        if self.FLAGS.model not in ("coatt", "coatt_dynamic", "coatt_dynamic_self_attn"):
            context_hiddens = encoder.build_graph(self.context_embs, self.context_mask) # (batch_size, context_len, hidden_size*2)
            question_hiddens = encoder.build_graph(self.qn_embs, self.qn_mask) # (batch_size, question_len, hidden_size*2)

        # Attention model
        # Use context hidden states to attend to question hidden states
        if self.FLAGS.model == "baseline":
            attn_layer = BasicAttn(self.keep_prob, self.FLAGS.hidden_size * 2, self.FLAGS.hidden_size * 2)
            _,attn_output = attn_layer.build_graph(question_hiddens, self.qn_mask, context_hiddens)  # attn_output is shape (batch_size, context_len, hidden_size*2)
            # Concat attn_output to context_hiddens to get blended_reps
            blended_reps = tf.concat([context_hiddens, attn_output], axis=2)  # (batch_size, context_len, hidden_size*4)
            # Apply fully connected layer to each blended representation
            # Note, blended_reps_final corresponds to b' in the handout
            # Note, tf.contrib.layers.fully_connected applies a ReLU non-linearity here by default
            blended_reps_final = tf.contrib.layers.fully_connected(blended_reps, num_outputs=self.FLAGS.hidden_size)  # blended_reps_final is shape (batch_size, context_len, hidden_size)

            # Use softmax layer to compute probability distribution for start location
            # Note this produces self.logits_start and self.probdist_start, both of which have shape (batch_size, context_len)
            with vs.variable_scope("StartDist"):
                softmax_layer_start = SimpleSoftmaxLayer()
                self.logits_start, self.probdist_start = softmax_layer_start.build_graph(blended_reps_final,self.context_mask)

            # Use softmax layer to compute probability distribution for end location
            # Note this produces self.logits_end and self.probdist_end, both of which have shape (batch_size, context_len)
            with vs.variable_scope("EndDist"):
                softmax_layer_end = SimpleSoftmaxLayer()
                self.logits_end, self.probdist_end = softmax_layer_end.build_graph(blended_reps_final,self.context_mask)

        # Attention model
        # Use context hidden states to attend to question hidden states
        if self.FLAGS.model == "coatt":
            #context_hiddens = encoder.build_graph(self.context_embs, self.context_mask, "context") # (batch_size, context_len, hidden_size*2)
            #question_hiddens = encoder.build_graph(self.qn_embs, self.qn_mask, "question") # (batch_size, question_len, hidden_size*2)
            context_hiddens, question_hiddens = encoder.build_graph1(self.context_embs, self.qn_embs, self.context_mask, self.qn_mask)

            attn_layer = CoAttention(self.keep_prob, self.FLAGS.hidden_size*2, self.FLAGS.hidden_size*2)
            attn_output = attn_layer.build_graph(question_hiddens, self.qn_mask, context_hiddens, self.context_mask)
            blended_reps_final = attn_output
            #blended_reps = tf.concat([context_hiddens, attn_output], axis=2)
            # Apply fully connected layer to each blended representation
            # Note, blended_reps_final corresponds to b' in the handout
            # Note, tf.contrib.layers.fully_connected applies a ReLU non-linearity here by default
            #blended_reps_final = tf.contrib.layers.fully_connected(blended_reps, num_outputs=self.FLAGS.hidden_size)  # blended_reps_final is shape (batch_size, context_len, hidden_size)

            # Use softmax layer to compute probability distribution for start location
            # Note this produces self.logits_start and self.probdist_start, both of which have shape (batch_size, context_len)
            with vs.variable_scope("StartDist"):
                softmax_layer_start = SimpleSoftmaxLayer()
                self.logits_start, self.probdist_start = softmax_layer_start.build_graph(blended_reps_final,self.context_mask)

            # Use softmax layer to compute probability distribution for end location
            # Note this produces self.logits_end and self.probdist_end, both of which have shape (batch_size, context_len)
            with vs.variable_scope("EndDist"):
                contextLen = tf.reduce_sum(self.context_mask, axis=1)
                cell = tf.contrib.rnn.LSTMBlockCell(2 * self.FLAGS.hidden_size)
                (fw_out, bw_out), _ = tf.nn.bidirectional_dynamic_rnn(cell, cell, attn_output, contextLen, dtype = tf.float32)
                U_1 = tf.concat([fw_out, bw_out], axis=2)
                out = tf.nn.dropout(U_1, self.keep_prob)
                softmax_layer_end = SimpleSoftmaxLayer()
                self.logits_end, self.probdist_end = softmax_layer_end.build_graph(out,self.context_mask)


        elif self.FLAGS.model in ("bidaf", "bidaf_self_attn"):
            attn_layer = BiDafAttn(self.keep_prob, self.FLAGS.hidden_size*2, self.FLAGS.hidden_size*2)
            attn_output_tmp = attn_layer.build_graph(question_hiddens, self.qn_mask, context_hiddens, self.context_mask) # attn_output is shape (batch_size, context_len, hidden_size*8)
            # A set of query-aware feature vectors for each word in the context
            #blended_reps = attn_output  #(batch_size, num_keys, 4*value_vec_size)

            if self.FLAGS.model == "bidaf_self_attn":
                self_attn_layer = SelfAttn(self.keep_prob, self.FLAGS.hidden_size * 8, self.FLAGS.hidden_size * 8)
                _, self_attn_output = self_attn_layer.build_graph(attn_output_tmp, self.context_mask) #(batch_size, context_len, 8*hidden_size)
                attn_output = tf.concat([attn_output_tmp, self_attn_output], axis=2) #(batch_size, context_len, 16*hidden_size)
            else:
                attn_output = attn_output_tmp


            # In BiDAF the attention output is fed to a modeling layer.
            # The modeling layer is a 2-layer LSTM.
            mod_layer = MODEL_LAYER_BIDAF(self.FLAGS.hidden_size, self.keep_prob)
            mod_layer_out = mod_layer.build_graph(attn_output, self.context_mask)  # (batch_size, context_len, hidden_size*2)
            blended_reps_start = tf.concat([attn_output,mod_layer_out], axis=2)  # (batch_size, context_len, hidden_size*10)


            # Use softmax layer to compute probability distribution for start location
            # Note this produces self.logits_start and self.probdist_start, both of which have shape (batch_size, context_len)
            with vs.variable_scope("StartDist"):
                softmax_layer_start = SimpleSoftmaxLayer()
                self.logits_start, self.probdist_start = softmax_layer_start.build_graph(blended_reps_start, self.context_mask)



            # Use softmax layer to compute probability distribution for end location
            # Note this produces self.logits_end and self.probdist_end, both of which have shape (batch_size, context_len)


            with vs.variable_scope("EndDist"):
                # Concatenate the start logits with the modelling layer output to get the input to the
                # end word lstm
                #self.logits_start has a shape of #(batch_size, context_len)
                logits_start_expand = tf.expand_dims(self.logits_start, axis=2) #(batch_size, context_len, 1)
                end_lstm_input = tf.concat([logits_start_expand, mod_layer_out], axis=2) #(batch_size, context_len, 1 + hidden_size*2)

                # LSTM
                end_layer = END_WORD_LAYER(self.FLAGS.hidden_size, self.keep_prob)
                blended_reps_end = end_layer.build_graph(end_lstm_input, self.context_mask)

                blended_reps_end_final = tf.concat([attn_output, blended_reps_end], axis=2)
                softmax_layer_end = SimpleSoftmaxLayer()
                self.logits_end, self.probdist_end = softmax_layer_end.build_graph(blended_reps_end_final, self.context_mask)

        elif self.FLAGS.model in ("bidaf_dynamic", "bidaf_dynamic_self_attn"):
            attn_layer = BiDafAttn(self.keep_prob, self.FLAGS.hidden_size*2, self.FLAGS.hidden_size*2)
            attn_output_tmp = attn_layer.build_graph(question_hiddens, self.qn_mask, context_hiddens, self.context_mask) # attn_output is shape (batch_size, context_len, hidden_size*8)

            if self.FLAGS.model == "bidaf_dynamic_self_attn":
                self_attn_layer = SelfAttn(self.keep_prob, self.FLAGS.hidden_size * 8, self.FLAGS.hidden_size * 8)
                _, self_attn_output = self_attn_layer.build_graph(attn_output_tmp, self.context_mask)  # (batch_size, context_len, 8*hidden_size)
                attn_output = tf.concat([attn_output_tmp, self_attn_output], axis=2) #(batch_size, context_len, 16*hidden_size)
            else:
                attn_output = attn_output_tmp

            # A set of query-aware feature vectors for each word in the context
            #blended_reps = attn_output  #(batch_size, num_keys, 4*value_vec_size)

            # In BiDAF the attention output is fed to a modeling layer.
            # The modeling layer is a 2-layer LSTM.
            mod_layer = MODEL_LAYER_BIDAF(self.FLAGS.hidden_size, self.keep_prob)
            mod_layer_out = mod_layer.build_graph(attn_output, self.context_mask)  # (batch_size, context_len, hidden_size*2)
            blended_reps_start = tf.concat([attn_output,mod_layer_out], axis=2)  # (batch_size, context_len, hidden_size*10)

            # We now feed this to the dynamic decoder module implemented in ANSWER_DECODER.
            # The decoder outputs start, end, alpha_logits and beta_logits;
            # start and end have shape (batch_size, num_iterations),
            # alpha_logits and beta_logits have shape (batch_size, num_iterations, input_dim).
            decoder = ANSWER_DECODER(self.FLAGS.hidden_size, self.keep_prob, self.FLAGS.num_iterations, self.FLAGS.max_pool, self.FLAGS.batch_size)

            u_s_init = mod_layer_out[:,0,:]
            u_e_init = mod_layer_out[:,0,:]
            start_location, end_location, alpha_logits, beta_logits = decoder.build_graph(mod_layer_out, self.context_mask, u_s_init, u_e_init)


            # Use softmax layer to compute probability distribution for start location
            # Note this produces self.logits_start and self.probdist_start, both of which have shape (batch_size, context_len)
            with vs.variable_scope("StartDist"):
                #softmax_layer_start = SimpleSoftmaxLayer()
                logits_start_tmp = [masked_softmax(logits, self.context_mask,1) for logits in alpha_logits]
                self.alpha_logits , alpha_logits_probs = zip(*logits_start_tmp)
                self.logits_start, self.probdist_start = self.alpha_logits[self.FLAGS.num_iterations -1], alpha_logits_probs[self.FLAGS.num_iterations -1]

            # Use softmax layer to compute probability distribution for end location
            # Note this produces self.logits_end and self.probdist_end, both of which have shape (batch_size, context_len)


            with vs.variable_scope("EndDist"):
                logits_end_tmp = [masked_softmax(logits, self.context_mask,1) for logits in beta_logits]
                self.beta_logits , beta_logits_probs = zip(*logits_end_tmp)
                self.logits_end, self.probdist_end = self.beta_logits[self.FLAGS.num_iterations -1], beta_logits_probs[self.FLAGS.num_iterations -1]

        elif self.FLAGS.model in ("coatt_dynamic", "coatt_dynamic_self_attn"):
            context_hiddens, question_hiddens = encoder.build_graph1(self.context_embs, self.qn_embs, self.context_mask, self.qn_mask)

            attn_layer = CoAttention(self.keep_prob, self.FLAGS.hidden_size*2, self.FLAGS.hidden_size*2)

            if self.FLAGS.model == "coatt_dynamic_self_attn":
                CoATT = attn_layer.build_graph1(question_hiddens, self.qn_mask, context_hiddens, self.context_mask)
                self_attn_layer = SelfAttn(self.keep_prob, self.FLAGS.hidden_size * 8, self.FLAGS.hidden_size * 8)
                _, self_attn_output = self_attn_layer.build_graph(CoATT, self.context_mask)  # (batch_size, context_len, 8*hidden_size)
                attn_output = tf.concat([CoATT, self_attn_output], axis=2) #(batch_size, context_len, 16*hidden_size)
            else:
                U = attn_layer.build_graph(question_hiddens, self.qn_mask, context_hiddens, self.context_mask)
                attn_output = U
            #blended_reps = tf.concat([context_hiddens, attn_output], axis=2)
            # Apply fully connected layer to each blended representation
            # Note, blended_reps_final corresponds to b' in the handout
            # Note, tf.contrib.layers.fully_connected applies a ReLU non-linearity here by default
            decoder = ANSWER_DECODER(self.FLAGS.hidden_size, self.keep_prob, self.FLAGS.num_iterations, self.FLAGS.max_pool, self.FLAGS.batch_size)

            u_s_init = attn_output[:,0,:]
            u_e_init = attn_output[:,0,:]
            start_location, end_location, alpha_logits, beta_logits = decoder.build_graph(attn_output, self.context_mask, u_s_init, u_e_init)


            # Use softmax layer to compute probability distribution for start location
            # Note this produces self.logits_start and self.probdist_start, both of which have shape (batch_size, context_len)
            with vs.variable_scope("StartDist"):
                #softmax_layer_start = SimpleSoftmaxLayer()
                logits_start_tmp = [masked_softmax(logits, self.context_mask,1) for logits in alpha_logits]
                self.alpha_logits , alpha_logits_probs = zip(*logits_start_tmp)
                self.logits_start, self.probdist_start = self.alpha_logits[self.FLAGS.num_iterations -1], alpha_logits_probs[self.FLAGS.num_iterations -1]

            # Use softmax layer to compute probability distribution for end location
            # Note this produces self.logits_end and self.probdist_end, both of which have shape (batch_size, context_len)


            with vs.variable_scope("EndDist"):
                logits_end_tmp = [masked_softmax(logits, self.context_mask,1) for logits in beta_logits]
                self.beta_logits , beta_logits_probs = zip(*logits_end_tmp)
                self.logits_end, self.probdist_end = self.beta_logits[self.FLAGS.num_iterations -1], beta_logits_probs[self.FLAGS.num_iterations -1]
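
The docstring above notes that logits_start and logits_end are -large in the pad locations so they can be fed straight into a cross-entropy loss. As a minimal sketch only, assuming a hypothetical ans_span tensor of shape (batch_size, 2) that holds the gold start and end indices:

        loss_start = tf.nn.sparse_softmax_cross_entropy_with_logits(
            logits=self.logits_start, labels=ans_span[:, 0])
        loss_end = tf.nn.sparse_softmax_cross_entropy_with_logits(
            logits=self.logits_end, labels=ans_span[:, 1])
        loss = tf.reduce_mean(loss_start + loss_end)  # hypothetical span loss, averaged over the batch
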
Example #8
    def build_graph(self, values, values_mask, keys, keys_mask):
        with vs.variable_scope("Attention"):
            dense_layer1 = partial(
                tf.layers.dense,
                activation=None,
                use_bias=False,
                kernel_regularizer=tf.contrib.layers.l1_regularizer(0.001))
            dense_layer2 = partial(
                tf.layers.dense,
                activation=None,
                use_bias=False,
                kernel_regularizer=tf.contrib.layers.l1_regularizer(0.001))

            score1 = dense_layer1(keys, 1)  #shape (batch_size, num_keys, 1)
            score2 = dense_layer2(values,
                                  1)  #shape (batch_size, num_values, 1)

            #version1. too much memory. Or do (batch, k_len, 1, ndim) * (batch, 1, v_len, ndim).
            #k = tf.expand_dims(tf.transpose(keys, perm=[0,2,1]), 3)  # shape (batch_size, hidden_size, num_keys, 1).
            #v = tf.expand_dims(tf.transpose(values, perm=[0,2,1]), 2)
            #matrix = tf.transpose(tf.matmul(k, v), perm=[0,2,3,1])

            #version2. seems infeasible.
            # def matrix_func(keys, values, weight):
            #     mat = np.zeros(self.shape)
            #     for k in xrange(self.shape[0]):
            #         for i in xrange(self.shape[1]):
            #             for j in xrange(self.shape[2]):
            #                 for m in xrange(self.vec_size):
            #                     mat[k,i,j] += weight[m]*keys[k,i,m]*values[k,j,m]
            #     return mat
            # weight = tf.Variable(tf.random_normal([self.vec_size]), dtype=tf.float32, name="similarity_weight_3")
            # similarity_scores = tf.cast(tf.py_func(matrix_func, [keys, values, weight], tf.double), tf.float32)
            # similarity_scores.set_shape(self.shape[0:])

            # version 3: memory efficient. Associate the per-channel weight with the keys in advance, then multiply the result with the values.
            weight = tf.Variable(tf.random_normal([1, 1,
                                                   self.hidden_vec_size]),
                                 dtype=tf.float32,
                                 name="similarity_weight_3")
            weighted_keys = weight * keys
            similarity_scores = tf.matmul(weighted_keys,
                                          tf.transpose(values, perm=[0, 2, 1]))
            similarity_scores = score1 + tf.transpose(score2, perm=[
                0, 2, 1
            ]) + similarity_scores  # shape (batch_size, num_keys, num_values)

            attn_logits_mask = tf.expand_dims(
                values_mask, 1)  # shape (batch_size, 1, num_values)
            _, C2Q_softmax = masked_softmax(
                similarity_scores, attn_logits_mask, 2
            )  # shape (batch_size, num_keys, num_values). take softmax over values
            C2Q_output = tf.matmul(
                C2Q_softmax,
                values)  # shape (batch_size, num_keys, value_vec_size)

            max_i = tf.reduce_max(similarity_scores, 2)
            _, Q2C_softmax = masked_softmax(max_i, keys_mask,
                                            1)  # shape(batch_size, num_keys)
            Q2C_softmax = tf.expand_dims(Q2C_softmax, -1)
            Q2C_output = tf.reduce_sum(
                Q2C_softmax * keys, 1, keepdims=True
            )  #or Q2C_output = tf.matmul(tf.transpose(keys, (0, 2, 1)), tf.expand_dims(Q2C_softmax, -1))

            output = tf.concat([
                keys, C2Q_output,
                tf.broadcast_to(Q2C_output, tf.shape(keys))
            ], 2)

            # Apply dropout
            output = tf.nn.dropout(output, self.keep_prob)

            return output
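
Written out, the "version 3" similarity above is the BiDAF-style trilinear score S[b, i, j] = w1 · k_i + w2 · v_j + sum_m w3[m] * k_i[m] * v_j[m], where w1 and w2 are the weights of the two dense layers producing score1 and score2 and w3 is similarity_weight_3; folding w3 into the keys first gives the same scores without ever materializing a (batch_size, num_keys, num_values, hidden_vec_size) tensor.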