Example 1
    def __call__(self, inputs, state, scope=None):
        """

        :param inputs: [N, d + JQ + JQ * d]
        :param state: [N, d]
        :param scope:
        :return:
        """
        with tf.variable_scope(scope or self.__class__.__name__):
            c_prev, h_prev = state
            x = tf.slice(inputs, [0, 0], [-1, self._input_size])
            q_mask = tf.slice(inputs, [0, self._input_size],
                              [-1, self._q_len])  # [N, JQ]
            qs = tf.slice(inputs, [0, self._input_size + self._q_len],
                          [-1, -1])
            qs = tf.reshape(qs,
                            [-1, self._q_len, self._input_size])  # [N, JQ, d]
            x_tiled = tf.tile(tf.expand_dims(x, 1),
                              [1, self._q_len, 1])  # [N, JQ, d]
            h_prev_tiled = tf.tile(tf.expand_dims(h_prev, 1),
                                   [1, self._q_len, 1])  # [N, JQ, d]
            f = tf.tanh(
                linear([qs, x_tiled, h_prev_tiled],
                       self._input_size,
                       True,
                       scope='f'))  # [N, JQ, d]
            a = tf.nn.softmax(
                exp_mask(linear(f, 1, True, squeeze=True, scope='a'),
                         q_mask))  # [N, JQ]
            q = tf.reduce_sum(qs * tf.expand_dims(a, -1), 1)
            z = tf.concat(1, [x, q])  # [N, 2d]
            return self._cell(z, state)
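
The heart of this cell is a masked attention over the JQ question positions: f produces one logit per position, exp_mask (assumed to add a large negative value wherever q_mask is False) blanks out padding, and the softmax weights pool qs into a single query summary q. A minimal NumPy sketch of that pooling step, with made-up shapes and a hypothetical mask:

    import numpy as np

    N, JQ, d = 2, 4, 3
    qs = np.random.randn(N, JQ, d)           # question word vectors
    logits = np.random.randn(N, JQ)          # stands in for linear(f, 1, squeeze=True)
    q_mask = np.array([[1, 1, 1, 0],
                       [1, 1, 0, 0]], bool)  # valid question positions

    masked = np.where(q_mask, logits, -1e30)        # assumed exp_mask behaviour
    a = np.exp(masked - masked.max(-1, keepdims=True))
    a /= a.sum(-1, keepdims=True)                   # softmax over JQ
    q = (qs * a[..., None]).sum(1)                  # [N, d] attended query summary
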
Example 2
    def __call__(self, inputs, state, scope=None):
        with tf.variable_scope(scope or "SHCell"):
            a_size = 1 if self._scalar else self._state_size
            h, u = tf.split(1, 2, inputs)
            if self._logit_func == 'mul_linear':
                args = [h * u]
                a = tf.nn.sigmoid(linear(args, a_size, True, bias_start=self._bias, scope='a'))
                r = tf.nn.sigmoid(linear(args, a_size, True, bias_start=self._bias, scope='r'))
            elif self._logit_func == 'linear':
                args = [h, u]
                a = tf.nn.sigmoid(linear(args, a_size, True, bias_start=self._bias, scope='a'))
                r = tf.nn.sigmoid(linear(args, a_size, True, bias_start=self._bias, scope='r'))
            elif self._logit_func == 'tri_linear':
                args = [h, u, h * u]
                a = tf.nn.sigmoid(linear(args, a_size, True, bias_start=self._bias, scope='a'))
                r = tf.nn.sigmoid(linear(args, a_size, True, bias_start=self._bias, scope='r'))
            elif self._logit_func == 'double':
                args = [h, u]
                a = tf.nn.sigmoid(linear(tf.tanh(linear(args, a_size, True)), self._state_size, True, bias_start=self._bias))
                r = tf.nn.sigmoid(linear(tf.tanh(linear(args, a_size, True)), self._state_size, True, bias_start=self._bias))

            else:
                raise Exception()
            new_state = a * state + r * (1 - a) * h
            outputs = state
            return outputs, new_state
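
The update rule mixes the previous state and the candidate h with a keep gate a, and additionally scales the new content by r. A tiny NumPy illustration of new_state = a * state + r * (1 - a) * h with made-up gate values:

    import numpy as np

    state = np.array([0.2, -0.5])
    h     = np.array([1.0,  1.0])
    a     = np.array([0.9,  0.1])   # keep gate, would come from sigmoid(linear(...))
    r     = np.array([1.0,  0.5])   # write gate on the new content

    new_state = a * state + r * (1 - a) * h
    print(new_state)                # dim 0 keeps mostly the old state, dim 1 mostly the (scaled) candidate
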
Example 3
        def double_linear_controller(inputs, state, A_m):
            """

            :param inputs: [N, i]
            :param state: [N, d]
            :param memory: [N, M, m]
            :return: [N, M]
            """
            if isinstance(state, LSTMStateTuple):
                in_ = tf.concat([inputs, state.c, state.h], -1)
            else:
                in_ = tf.concat([inputs, state], -1)
            A_IS = linear(in_,
                          size,
                          bias,
                          scope='first',
                          input_keep_prob=input_keep_prob,
                          is_train=is_train)

            rank = len(A_m.get_shape())
            _memory_size = tf.shape(A_m)[rank - 2]
            tiled_A_IS = tf.tile(tf.expand_dims(A_IS, 1), [1, _memory_size, 1])

            in_ = tf.tanh(tf.add(tiled_A_IS, A_m))  # [N * M, JX, d]
            out = linear(in_,
                         1,
                         bias,
                         squeeze=True,
                         scope='second',
                         input_keep_prob=input_keep_prob,
                         is_train=is_train)  # [N * M, JX]
            return out
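
double_linear_controller scores every memory slot with a two-layer form: a shared projection of [inputs; state] is tiled over the M slots, added to the pre-projected memory A_m, squashed with tanh, and reduced to one logit per slot. A NumPy sketch of that scoring, with hypothetical weight matrices standing in for the two linear layers:

    import numpy as np

    N, i, d_state, M, size = 2, 3, 4, 5, 6
    inputs = np.random.randn(N, i)
    state  = np.random.randn(N, d_state)
    A_m    = np.random.randn(N, M, size)      # memory already projected to `size`

    W1 = np.random.randn(i + d_state, size)   # stands in for linear(..., scope='first')
    w2 = np.random.randn(size)                # stands in for linear(..., 1, scope='second')

    A_IS = np.concatenate([inputs, state], -1) @ W1   # [N, size]
    logits = np.tanh(A_IS[:, None, :] + A_m) @ w2     # [N, M], one logit per memory slot
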
Example 4
 def __call__(self, inputs, state, scope=None):
     """Gated recurrent unit (GRU) with nunits cells."""
     with tf.variable_scope(scope or type(self).__name__):  # "GRUCell"
         with tf.variable_scope("Gates"):  # Reset gate and update gate.
             # We start with bias of 1.0 to not reset and not update.
             state = tf.reshape(state, inputs.get_shape().as_list()[:-1] + state.get_shape().as_list()[-1:])  # explicit shape definition, to use my linear function
             r, u = tf.split(1, 2, linear([inputs, state],
                                                 2 * self._num_units, True, 1.0))
             r, u = tf.sigmoid(r), tf.sigmoid(u)
         with tf.variable_scope("Candidate"):
             c = tf.tanh(linear([inputs, r * state], self._num_units, True, var_on_cpu=self.var_on_cpu, wd=self.wd))
         new_h = u * state + (1 - u) * c
     return new_h, new_h
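
Apart from the explicit reshape (a workaround so the custom linear helper sees a static shape), this is a standard GRU step. A NumPy sketch of the same arithmetic with hypothetical weights; the +1.0 mirrors the bias_start of 1.0 that biases r and u towards 1 early in training:

    import numpy as np

    def sigmoid(z):
        return 1.0 / (1.0 + np.exp(-z))

    N, in_dim, num_units = 1, 2, 3
    x, h = np.random.randn(N, in_dim), np.random.randn(N, num_units)
    Wg = np.random.randn(in_dim + num_units, 2 * num_units)   # gate weights (hypothetical)
    Wc = np.random.randn(in_dim + num_units, num_units)       # candidate weights (hypothetical)

    r, u = np.split(sigmoid(np.concatenate([x, h], -1) @ Wg + 1.0), 2, axis=-1)
    c = np.tanh(np.concatenate([x, r * h], -1) @ Wc)
    new_h = u * h + (1 - u) * c
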
Example 5
    def __call__(self, inputs, state, scope=None):
        scope = scope or type(self).__name__
        with tf.variable_scope(scope):
            tensors = self.tensors
            N, _ = state.get_shape().as_list()
            R, A, C = self._rel_size, self._arg_size, self._num_args
            with tf.name_scope("Split"):
                ru = tf.slice(state, [0, 0], [-1, R], name='ru')  # [N, d]
                au_flat = tf.slice(state, [0, R], [-1, -1], name='au_flat')
                au = tf.reshape(au_flat, [N, C, A], name='au')

                rf = tf.slice(inputs, [0, 0], [-1, R], name='rf')
                af_flat = tf.slice(inputs, [0, R], [-1, -1], name='af_flat')
                af = tf.reshape(af_flat, [N, C, A], name='af')

            with tf.variable_scope("Attention"):
                p_flat = tf.nn.softmax(linear([ru, rf], 2*C**2, True), name='p_flat')
                p = tf.reshape(p_flat, [N, C, 2*C], name='p')
                p_key = "{}/{}".format(scope, 'p')
                assert p_key not in tensors
                tensors[p_key] = p

            with tf.name_scope("Out"):
                ru_out, _ = self._cell(rf, ru)  # [N, R]
                a = tf.concat(1, [au, af], name='a')
                a_aug = tf.tile(tf.expand_dims(a, 1), [1, C, 1, 1], name='a_aug')
                au_out = tf.reduce_sum(a_aug * tf.expand_dims(p, -1), 2, name='au_out')  # [N, C, A]
                au_out_flat = tf.reshape(au_out, [N, C*A], name='au_out_flat')
                out = tf.concat(1, [ru_out, au_out_flat], name='out')  # [N, R+A*C]
        return out, out
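
The Attention block builds, per batch item, a [C, 2C] mixing matrix p and uses it to recombine the C previous argument slots au and the C fresh slots af into C updated slots. A NumPy sketch of the recombination done in the Out block (shapes made up):

    import numpy as np

    N, C, A = 2, 3, 4
    au = np.random.randn(N, C, A)            # previous argument slots
    af = np.random.randn(N, C, A)            # argument slots derived from the input
    p  = np.random.rand(N, C, 2 * C)         # attention weights, one row per output slot

    a = np.concatenate([au, af], axis=1)                      # [N, 2C, A]
    au_out = (a[:, None, :, :] * p[..., None]).sum(axis=2)    # [N, C, A]
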
Example 6
 def pre(self, inputs, scope=None):
     """Preprocess inputs to be used by the cell. Assumes [N, J, *]
     [x, u]"""
     is_train = self._is_train
     keep_prob = self._keep_prob
     gate_size = self._gate_size
     with tf.variable_scope(scope or "pre"):
         x, u, _, _ = tf.split(2, 4, tf.slice(inputs, [0, 0, gate_size], [-1, -1, -1]))  # [N, J, d]
         a_raw = linear([x * u], gate_size, True, scope='a_raw', var_on_cpu=self._var_on_cpu,
                        wd=self._wd, initializer=self._initializer)
         a = tf.sigmoid(a_raw - self._forget_bias, name='a')
         if keep_prob < 1.0:
             x = tf.cond(is_train, lambda: tf.nn.dropout(x, keep_prob), lambda: x)
             u = tf.cond(is_train, lambda: tf.nn.dropout(u, keep_prob), lambda: u)
         v_t = tf.nn.tanh(linear([x, u], self._num_units, True,
                          var_on_cpu=self._var_on_cpu, wd=self._wd, scope='v_raw'), name='v')
         new_inputs = tf.concat(2, [a, x, u, v_t])  # [N, J, 3*d + 1]
     return new_inputs
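
Subtracting self._forget_bias inside the sigmoid biases the gate a towards 0 at the start of training, so the cell initially lets little of the gated content through. A one-line numeric check (the forget-bias value here is hypothetical):

    import numpy as np

    sigmoid = lambda z: 1.0 / (1.0 + np.exp(-z))
    print(sigmoid(0.0), sigmoid(0.0 - 2.5))   # 0.5 without the shift vs ~0.076 with forget_bias = 2.5
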
Example 7
    def __call__(self, inputs, state, scope=None):
        """

        :param inputs: [N, d + JQ + JQ * d]
        :param state: [N, d]
        :param scope:
        :return:
        """
        with tf.variable_scope(scope or self.__class__.__name__):
            c_prev, h_prev = state
            x = tf.slice(inputs, [0, 0], [-1, self._input_size])
            q_mask = tf.slice(inputs, [0, self._input_size], [-1, self._q_len])  # [N, JQ]
            qs = tf.slice(inputs, [0, self._input_size + self._q_len], [-1, -1])
            qs = tf.reshape(qs, [-1, self._q_len, self._input_size])  # [N, JQ, d]
            x_tiled = tf.tile(tf.expand_dims(x, 1), [1, self._q_len, 1])  # [N, JQ, d]
            h_prev_tiled = tf.tile(tf.expand_dims(h_prev, 1), [1, self._q_len, 1])  # [N, JQ, d]
            f = tf.tanh(linear([qs, x_tiled, h_prev_tiled], self._input_size, True, scope='f'))  # [N, JQ, d]
            a = tf.nn.softmax(exp_mask(linear(f, 1, True, squeeze=True, scope='a'), q_mask))  # [N, JQ]
            q = tf.reduce_sum(qs * tf.expand_dims(a, -1), 1)
            z = tf.concat(1, [x, q])  # [N, 2d]
            return self._cell(z, state)
Example 8
    def __call__(self, inputs, state, name_scope=None):
        """Long short-term memory cell (GRU)."""
        with tf.variable_scope(name_scope or type(self).__name__):  # "BasicLSTMCell"
            # Parameters of gates are concatenated into one multiply for efficiency.
            c, h = tf.split(1, 2, state)
            concat = linear([inputs, h], 4 * self._num_units, True, var_on_cpu=self.var_on_cpu, wd=self.wd)

            # i = input_gate, j = new_input, f = forget_gate, o = output_gate
            i, j, f, o = tf.split(1, 4, concat)

            new_c = c * tf.sigmoid(f + self._forget_bias) + tf.sigmoid(i) * tf.tanh(j)
            new_h = tf.tanh(new_c) * tf.sigmoid(o)

        return new_h, tf.concat(1, [new_c, new_h])
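
Here the LSTM state is a single [N, 2 * num_units] tensor with c and h stored side by side; the update itself is the usual LSTM arithmetic. A NumPy sketch of one step with hypothetical weights:

    import numpy as np

    sigmoid = lambda z: 1.0 / (1.0 + np.exp(-z))

    N, in_dim, num_units, forget_bias = 1, 2, 3, 1.0
    x = np.random.randn(N, in_dim)
    c, h = np.random.randn(N, num_units), np.random.randn(N, num_units)
    W = np.random.randn(in_dim + num_units, 4 * num_units)    # hypothetical weights

    i, j, f, o = np.split(np.concatenate([x, h], -1) @ W, 4, axis=-1)
    new_c = c * sigmoid(f + forget_bias) + sigmoid(i) * np.tanh(j)
    new_h = np.tanh(new_c) * sigmoid(o)
    new_state = np.concatenate([new_c, new_h], -1)            # mirrors tf.concat(1, [new_c, new_h])
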
Example 9
        def linear_controller(inputs, state, memory):
            rank = len(memory.get_shape())
            _memory_size = tf.shape(memory)[rank-2]
            tiled_inputs = tf.tile(tf.expand_dims(inputs, 1), [1, _memory_size, 1])
            if isinstance(state, tuple):
                tiled_states = [tf.tile(tf.expand_dims(each, 1), [1, _memory_size, 1])
                                for each in state]
            else:
                tiled_states = [tf.tile(tf.expand_dims(state, 1), [1, _memory_size, 1])]

            # [N, M, d]
            in_ = tf.concat(2, [tiled_inputs] + tiled_states + [memory])
            out = linear(in_, 1, bias, squeeze=True, input_keep_prob=input_keep_prob, is_train=is_train)
            return out
Example 10
        def linear_controller(inputs, state, memory):
            rank = len(memory.get_shape())
            _memory_size = tf.shape(memory)[rank-2]
            tiled_inputs = tf.tile(tf.expand_dims(inputs, 1), [1, _memory_size, 1])
            if isinstance(state, tuple):
                tiled_states = [tf.tile(tf.expand_dims(each, 1), [1, _memory_size, 1])
                                for each in state]
            else:
                tiled_states = [tf.tile(tf.expand_dims(state, 1), [1, _memory_size, 1])]

            # [N, M, d]
            in_ = tf.concat(2, [tiled_inputs] + tiled_states + [memory])
            out = linear(in_, 1, bias, squeeze=True, input_keep_prob=input_keep_prob, is_train=is_train)
            return out
Example 11
    def __call__(self, inputs, state, scope=None):
        with tf.variable_scope(scope or "SHCell"):
            a_size = 1 if self._scalar else self._state_size
            h, u = tf.split(axis=1, num_or_size_splits=2, value=inputs)
            if self._logit_func == 'mul_linear':
                args = [h * u, state * u]
                a = tf.nn.sigmoid(linear(args, a_size, True))
            elif self._logit_func == 'linear':
                args = [h, u, state]
                a = tf.nn.sigmoid(linear(args, a_size, True))
            elif self._logit_func == 'tri_linear':
                args = [h, u, state, h * u, state * u]
                a = tf.nn.sigmoid(linear(args, a_size, True))
            elif self._logit_func == 'double':
                args = [h, u, state]
                a = tf.nn.sigmoid(
                    linear(tf.tanh(linear(args, a_size, True)),
                           self._state_size, True))

            else:
                raise Exception()
            new_state = a * state + (1 - a) * h
            outputs = state
            return outputs, new_state
Example 12
    def __call__(self, inputs, state, scope=None):
        gate_size = self._gate_size
        with tf.variable_scope(scope or type(self).__name__):  # "RSMCell"
            with tf.name_scope("Split"):  # Reset gate and update gate.
                a = tf.slice(inputs, [0, 0], [-1, gate_size])
                x, u, v_t = tf.split(1, 3, tf.slice(inputs, [0, gate_size], [-1, -1]))
                o = tf.slice(state, [0, 0], [-1, 1])
                h, v = tf.split(1, 2, tf.slice(state, [0, gate_size], [-1, -1]))

            with tf.variable_scope("Main"):
                r_raw = linear([x * u], 1, True, scope='r_raw', var_on_cpu=self._var_on_cpu,
                               initializer=self._initializer)
                r = tf.sigmoid(r_raw, name='a')
                new_o = a * r + (1 - a) * o
                new_v = a * v_t + (1 - a) * v
                g = r * v_t
                new_h = a * g + (1 - a) * h

            with tf.name_scope("Concat"):
                new_state = tf.concat(1, [new_o, new_h, new_v])
                outputs = tf.concat(1, [a, r, x, new_h, new_v, g])

        return outputs, new_state
Example 13
    def __init__(self,
                 cell,
                 memory,
                 size,
                 mask=None,
                 controller=None,
                 mapper=None,
                 input_keep_prob=1.0,
                 is_train=None):
        """
        Early fusion attention cell: uses the (inputs, state) to control the current attention.

        :param cell:
        :param memory: [N, M, m]
        :param mask:
        :param controller: (inputs, prev_state, memory) -> memory_logits
        """
        self._cell = cell
        self._memory = memory
        self._mask = mask
        self._flat_memory = flatten(memory, 2)
        self._flat_mask = flatten(mask, 1)
        if controller is None:
            controller = AttentionCell.get_double_linear_controller(
                size, True, input_keep_prob=input_keep_prob, is_train=is_train)
            self.A_m = linear(self._memory,
                              size,
                              True,
                              scope='memory_prepare',
                              input_keep_prob=input_keep_prob,
                              is_train=is_train)  # [N * M, JX, d]
        self._controller = controller
        if mapper is None:
            mapper = AttentionCell.get_concat_mapper()
        elif mapper == 'sim':
            mapper = AttentionCell.get_sim_mapper()
        self._mapper = mapper
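
All of these snippets lean on a custom linear(...) helper rather than a stock dense layer. Its implementation is not shown here, but from the call sites it appears to accept a tensor or a list of tensors, concatenate them on the last axis, apply a learned affine map of the requested output size, and optionally squeeze a trailing size-1 dimension (plus dropout and weight-decay options). A NumPy stand-in with that assumed behaviour, handy for checking shapes offline; this is inferred, not the repo's actual code:

    import numpy as np

    def linear_np(args, output_size, bias, squeeze=False, seed=0):
        """Shape-checking stand-in for the assumed semantics of linear()."""
        if not isinstance(args, (list, tuple)):
            args = [args]
        x = np.concatenate(args, axis=-1)
        rng = np.random.default_rng(seed)
        W = rng.standard_normal((x.shape[-1], output_size))
        b = np.zeros(output_size) if bias else 0.0
        out = x @ W + b
        if squeeze and output_size == 1:
            out = out[..., 0]
        return out

    # e.g. linear_np([qs, x_tiled, h_prev_tiled], d, True) would give [N, JQ, d]
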
Example 14
    def initialize(self):
        params = self.params
        placeholders = self.placeholders
        tensors = self.tensors
        variables_dict = self.variables_dict
        N, J, V, Q, M = params.batch_size, params.max_sent_size, params.vocab_size, params.max_ques_size, params.mem_size
        d = params.hidden_size
        L = params.mem_num_layers
        att_forget_bias = params.att_forget_bias
        use_vector_gate = params.use_vector_gate
        wd = params.wd
        initializer = tf.random_uniform_initializer(-np.sqrt(3), np.sqrt(3))
        with tf.name_scope("placeholders"):
            x = tf.placeholder('int32', shape=[N, M, J], name='x')
            x_mask = tf.placeholder('bool', shape=[N, M, J], name='x_mask')
            q = tf.placeholder('int32', shape=[N, J], name='q')
            q_mask = tf.placeholder('bool', shape=[N, J], name='q_mask')
            y = tf.placeholder('int32', shape=[N], name='y')
            is_train = tf.placeholder('bool', shape=[], name='is_train')
            placeholders['x'] = x
            placeholders['x_mask'] = x_mask
            placeholders['q'] = q
            placeholders['q_mask'] = q_mask
            placeholders['y'] = y
            placeholders['is_train'] = is_train

        with tf.variable_scope("embedding"):
            A = VariableEmbedder(params,
                                 wd=wd,
                                 initializer=initializer,
                                 name='A')
            Aq = A(q, name='Aq')  # [N, S, J, d]
            Ax = A(x, name='Ax')  # [N, S, J, d]

        with tf.name_scope("encoding"):
            encoder = PositionEncoder(J, d)
            u = encoder(Aq, q_mask)  # [N, d]
            m = encoder(Ax, x_mask)  # [N, M, d]

        with tf.variable_scope("networks"):
            m_mask = tf.reduce_max(tf.cast(x_mask, 'int64'), 2,
                                   name='m_mask')  # [N, M]
            gate_mask = tf.expand_dims(m_mask, -1)
            m_length = tf.reduce_sum(m_mask, 1, name='m_length')  # [N]
            prev_u = tf.tile(tf.expand_dims(u, 1), [1, M, 1])  # [N, M, d]
            reg_layer = VectorReductionLayer(
                N, M, d) if use_vector_gate else ReductionLayer(N, M, d)
            gate_size = d if use_vector_gate else 1
            h = None  # [N, M, d]
            as_, rfs, rbs = [], [], []
            hs = []
            for layer_idx in range(L):
                with tf.name_scope("layer_{}".format(layer_idx)):
                    u_t = tf.tanh(
                        linear([prev_u, m], d, True, wd=wd, scope='u_t'))
                    a = tf.cast(gate_mask, 'float') * tf.sigmoid(
                        linear([prev_u * m],
                               gate_size,
                               True,
                               initializer=initializer,
                               wd=wd,
                               scope='a') - att_forget_bias)
                    h = reg_layer(u_t, a, 1.0 - a, scope='h')
                    if layer_idx + 1 < L:
                        if params.use_reset:
                            rf, rb = tf.split(
                                2, 2,
                                tf.cast(gate_mask, 'float') * tf.sigmoid(
                                    linear([prev_u * m],
                                           2 * gate_size,
                                           True,
                                           initializer=initializer,
                                           wd=wd,
                                           scope='r')))
                        else:
                            rf = rb = tf.ones(a.get_shape().as_list())
                        u_t_rev = tf.reverse_sequence(u_t, m_length, 1)
                        a_rev, rb_rev = tf.reverse_sequence(
                            a, m_length,
                            1), tf.reverse_sequence(rb, m_length, 1)
                        uf = reg_layer(u_t, a * rf, 1.0 - a, scope='uf')
                        ub_rev = reg_layer(u_t_rev,
                                           a_rev * rb_rev,
                                           1.0 - a_rev,
                                           scope='ub_rev')
                        ub = tf.reverse_sequence(ub_rev, m_length, 1)
                        prev_u = uf + ub
                    else:
                        rf = rb = tf.zeros(a.get_shape().as_list())
                    rfs.append(rf)
                    rbs.append(rb)
                    as_.append(a)
                    hs.append(h)
                    tf.get_variable_scope().reuse_variables()

            h_last = tf.squeeze(tf.slice(h, [0, M - 1, 0], [-1, -1, -1]),
                                [1])  # [N, d]
            hs_last = [
                tf.squeeze(tf.slice(each, [0, M - 1, 0], [-1, -1, -1]), [1])
                for each in hs
            ]
            a = tf.transpose(tf.pack(as_, name='a'), [1, 0, 2, 3])
            rf = tf.transpose(tf.pack(rfs, name='rf'), [1, 0, 2, 3])
            rb = tf.transpose(tf.pack(rbs, name='rb'), [1, 0, 2, 3])
            tensors['a'] = a
            tensors['rf'] = rf
            tensors['rb'] = rb

        with tf.variable_scope("class"):
            class_mode = params.class_mode
            use_class_bias = params.use_class_bias
            if class_mode == 'h':
                # W = tf.transpose(A.emb_mat, name='W')
                logits = linear([h_last], V, use_class_bias, wd=wd)
            elif class_mode == 'uh':
                logits = linear([h_last, u], V, use_class_bias, wd=wd)
            elif class_mode == 'hs':
                logits = linear(hs_last, V, use_class_bias, wd=wd)
            elif class_mode == 'hss':
                logits = linear(sum(hs_last), V, use_class_bias, wd=wd)
            else:
                raise Exception("Invalid class mode: {}".format(class_mode))
            yp = tf.cast(tf.argmax(logits, 1), 'int32')
            correct = tf.equal(yp, y)
            tensors['yp'] = yp
            tensors['correct'] = correct

        with tf.name_scope("loss"):
            with tf.name_scope("ans_loss"):
                ce = tf.nn.sparse_softmax_cross_entropy_with_logits(logits,
                                                                    y,
                                                                    name='ce')
                avg_ce = tf.reduce_mean(ce, name='avg_ce')
                tf.add_to_collection('losses', avg_ce)

            losses = tf.get_collection('losses')
            loss = tf.add_n(losses, name='loss')
            tensors['loss'] = loss

        variables_dict['all'] = tf.trainable_variables()
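
PositionEncoder is not defined in this snippet; encoders of this kind in memory networks typically use the position-encoding weighting of Sukhbaatar et al., where word j (of J) and embedding dimension k (of d) get weight (1 - j/J) - (k/d)(1 - 2j/J) before the masked sum. A NumPy sketch of those weights, under the assumption that PositionEncoder follows this scheme:

    import numpy as np

    def position_encoding(J, d):
        """Position-encoding weights as in End-to-End Memory Networks (assumed, not taken from this repo)."""
        j = np.arange(1, J + 1)[:, None]     # word position, 1-indexed
        k = np.arange(1, d + 1)[None, :]     # embedding dimension, 1-indexed
        return (1 - j / J) - (k / d) * (1 - 2 * j / J)    # [J, d]

    # sentence vector: (word_embeddings * position_encoding(J, d) * mask[:, None]).sum(0)
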
Example 15
    def initialize(self):
        params = self.params
        placeholders = self.placeholders
        tensors = self.tensors
        variables_dict = self.variables_dict
        N, J, V, Q, M = (
            params.batch_size,
            params.max_sent_size,
            params.vocab_size,
            params.max_ques_size,
            params.mem_size,
        )
        d = params.hidden_size
        L = params.mem_num_layers
        forget_bias = params.forget_bias
        wd = params.wd
        initializer = tf.random_uniform_initializer(-np.sqrt(3), np.sqrt(3))
        with tf.name_scope("placeholders"):
            x = tf.placeholder("int32", shape=[N, M, J], name="x")
            x_mask = tf.placeholder("bool", shape=[N, M, J], name="x_mask")
            q = tf.placeholder("int32", shape=[N, J], name="q")
            q_mask = tf.placeholder("bool", shape=[N, J], name="q_mask")
            y = tf.placeholder("int32", shape=[N], name="y")
            is_train = tf.placeholder("bool", shape=[], name="is_train")
            placeholders["x"] = x
            placeholders["x_mask"] = x_mask
            placeholders["q"] = q
            placeholders["q_mask"] = q_mask
            placeholders["y"] = y
            placeholders["is_train"] = is_train

        with tf.variable_scope("embedding"):
            A = VariableEmbedder(params, wd=wd, initializer=initializer, name="A")
            Aq = A(q, name="Aq")  # [N, S, J, d]
            Ax = A(x, name="Ax")  # [N, S, J, d]

        with tf.name_scope("encoding"):
            encoder = PositionEncoder(J, d)
            u = encoder(Aq, q_mask)  # [N, d]
            m = encoder(Ax, x_mask)  # [N, M, d]

        with tf.variable_scope("networks"):
            m_mask = tf.reduce_max(tf.cast(x_mask, "int64"), 2, name="m_mask")  # [N, M]
            m_length = tf.reduce_sum(m_mask, 1, name="m_length")  # [N]
            initializer = tf.random_uniform_initializer(-np.sqrt(3), np.sqrt(3))
            cell = RSMCell(d, forget_bias=forget_bias, wd=wd, initializer=initializer)
            us = tf.tile(tf.expand_dims(u, 1, name="u_prev_aug"), [1, M, 1])  # [N, d] -> [N, M, d]
            in_ = tf.concat(2, [tf.ones([N, M, 1]), m, us, tf.zeros([N, M, 2 * d])], name="x_h_in")  # [N, M, 4*d + 1]
            out, fw_state, bw_state, bi_tensors = dynamic_bidirectional_rnn(
                cell, in_, sequence_length=m_length, dtype="float", num_layers=L
            )
            a = tf.slice(out, [0, 0, 0], [-1, -1, 1])  # [N, M, 1]
            _, _, v, g = tf.split(2, 4, tf.slice(out, [0, 0, 1], [-1, -1, -1]))
            fw_h, fw_v = tf.split(1, 2, tf.slice(fw_state, [0, 1], [-1, -1]))
            bw_h, bw_v = tf.split(1, 2, tf.slice(bw_state, [0, 1], [-1, -1]))

            _, fw_u_out, fw_v_out, _ = tf.split(
                2, 4, tf.squeeze(tf.slice(bi_tensors["fw_out"], [0, L - 1, 0, 2], [-1, -1, -1, -1]), [1])
            )
            _, bw_u_out, bw_v_out, _ = tf.split(
                2, 4, tf.squeeze(tf.slice(bi_tensors["bw_out"], [0, L - 1, 0, 2], [-1, -1, -1, -1]), [1])
            )

            tensors["a"] = tf.squeeze(tf.slice(bi_tensors["in"], [0, 0, 0, 0], [-1, -1, -1, 1]), [3])
            tensors["of"] = tf.squeeze(tf.slice(bi_tensors["fw_out"], [0, 0, 0, 1], [-1, -1, -1, 1]), [3])
            tensors["ob"] = tf.squeeze(tf.slice(bi_tensors["bw_out"], [0, 0, 0, 1], [-1, -1, -1, 1]), [3])

        with tf.variable_scope("selection"):
            # w = tf.nn.relu(linear([fw_v + 1e-9*(fw_h+bw_h)], d, True, wd=wd))
            w = fw_v + 1e-9 * (fw_h + bw_h)
            tensors["s"] = a

        with tf.variable_scope("class"):
            if params.use_ques:
                logits = linear([w, u], V, True, wd=wd)
            else:
                # W = tf.transpose(A.emb_mat, name='W')
                W = tf.get_variable("W", shape=[d, V])
                logits = tf.matmul(w, W, name="logits")
            yp = tf.cast(tf.argmax(logits, 1), "int32")
            correct = tf.equal(yp, y)
            tensors["yp"] = yp
            tensors["correct"] = correct

        with tf.name_scope("loss"):
            with tf.name_scope("ans_loss"):
                ce = tf.nn.sparse_softmax_cross_entropy_with_logits(logits, y, name="ce")
                avg_ce = tf.reduce_mean(ce, name="avg_ce")
                tf.add_to_collection("losses", avg_ce)

            losses = tf.get_collection("losses")
            loss = tf.add_n(losses, name="loss")
            tensors["loss"] = loss

        variables_dict["all"] = tf.trainable_variables()
Example 16
    def __init__(self,
                 config,
                 seq_length,
                 emb_dim,
                 hidden_dim,
                 emb_train,
                 embeddings=None,
                 pred_size=3,
                 context_seq_len=None,
                 query_seq_len=None):
        ## Define hyperparameters
        # tf.reset_default_graph()
        self.embedding_dim = emb_dim
        self.dim = hidden_dim
        self.sequence_length = seq_length
        self.pred_size = pred_size
        self.context_seq_len = context_seq_len
        self.query_seq_len = query_seq_len
        # self.config = config

        ## Define the placeholders
        self.premise_x = tf.placeholder(tf.int32, [None, self.sequence_length],
                                        name='premise')
        self.hypothesis_x = tf.placeholder(tf.int32,
                                           [None, self.sequence_length],
                                           name='hypothesis')
        self.premise_pos = tf.placeholder(tf.int32,
                                          [None, self.sequence_length, 47],
                                          name='premise_pos')
        self.hypothesis_pos = tf.placeholder(tf.int32,
                                             [None, self.sequence_length, 47],
                                             name='hypothesis_pos')
        self.premise_char = tf.placeholder(
            tf.int32, [None, self.sequence_length, config.char_in_word_size],
            name='premise_char')
        self.hypothesis_char = tf.placeholder(
            tf.int32, [None, self.sequence_length, config.char_in_word_size],
            name='hypothesis_char')
        self.premise_exact_match = tf.placeholder(
            tf.int32, [None, self.sequence_length, 1],
            name='premise_exact_match')
        self.hypothesis_exact_match = tf.placeholder(
            tf.int32, [None, self.sequence_length, 1],
            name='hypothesis_exact_match')

        self.global_step = tf.Variable(0, name='global_step', trainable=False)

        self.dropout_keep_rate = tf.train.exponential_decay(
            config.keep_rate,
            self.global_step,
            config.dropout_decay_step,
            config.dropout_decay_rate,
            staircase=False,
            name='dropout_keep_rate')
        config.keep_rate = self.dropout_keep_rate
        tf.summary.scalar('dropout_keep_rate', self.dropout_keep_rate)

        self.y = tf.placeholder(tf.int32, [None], name='label_y')
        self.keep_rate_ph = tf.placeholder(tf.float32, [], name='keep_prob')
        self.is_train = tf.placeholder('bool', [], name='is_train')

        ## Function for embedding lookup and dropout at embedding layer
        def emb_drop(E, x):
            emb = tf.nn.embedding_lookup(E, x)
            emb_drop = tf.cond(self.is_train,
                               lambda: tf.nn.dropout(emb, config.keep_rate),
                               lambda: emb)
            return emb_drop

        # Get lengths of unpadded sentences
        prem_seq_lengths, prem_mask = blocks.length(
            self.premise_x)  # mask [N, L , 1]
        hyp_seq_lengths, hyp_mask = blocks.length(self.hypothesis_x)
        self.prem_mask = prem_mask
        self.hyp_mask = hyp_mask

        ### Embedding layer ###
        with tf.variable_scope("emb"):
            with tf.variable_scope("emb_var"), tf.device("/cpu:0"):
                self.E = tf.Variable(embeddings, trainable=emb_train)
                premise_in = emb_drop(self.E, self.premise_x)  #P
                hypothesis_in = emb_drop(self.E, self.hypothesis_x)  #H

        with tf.variable_scope("char_emb"):
            char_emb_mat = tf.get_variable(
                "char_emb_mat",
                shape=[config.char_vocab_size, config.char_emb_size])
            with tf.variable_scope("char") as scope:
                char_pre = tf.nn.embedding_lookup(char_emb_mat,
                                                  self.premise_char)
                char_hyp = tf.nn.embedding_lookup(char_emb_mat,
                                                  self.hypothesis_char)

                filter_sizes = list(
                    map(int, config.out_channel_dims.split(',')))  #[100]
                heights = list(map(int,
                                   config.filter_heights.split(',')))  #[5]
                assert sum(filter_sizes) == config.char_out_size, (
                    filter_sizes, config.char_out_size)
                with tf.variable_scope("conv") as scope:
                    conv_pre = multi_conv1d(char_pre,
                                            filter_sizes,
                                            heights,
                                            "VALID",
                                            self.is_train,
                                            config.keep_rate,
                                            scope='conv')
                    scope.reuse_variables()
                    conv_hyp = multi_conv1d(char_hyp,
                                            filter_sizes,
                                            heights,
                                            "VALID",
                                            self.is_train,
                                            config.keep_rate,
                                            scope='conv')
                    conv_pre = tf.reshape(
                        conv_pre,
                        [-1, self.sequence_length, config.char_out_size])
                    conv_hyp = tf.reshape(
                        conv_hyp,
                        [-1, self.sequence_length, config.char_out_size])
            premise_in = tf.concat([premise_in, conv_pre], axis=2)
            hypothesis_in = tf.concat([hypothesis_in, conv_hyp], axis=2)

        premise_in = tf.concat(
            (premise_in, tf.cast(self.premise_pos, tf.float32)), axis=2)
        hypothesis_in = tf.concat(
            (hypothesis_in, tf.cast(self.hypothesis_pos, tf.float32)), axis=2)

        premise_in = tf.concat(
            [premise_in,
             tf.cast(self.premise_exact_match, tf.float32)],
            axis=2)
        hypothesis_in = tf.concat(
            [hypothesis_in,
             tf.cast(self.hypothesis_exact_match, tf.float32)],
            axis=2)

        with tf.variable_scope("highway") as scope:
            premise_in = highway_network(premise_in,
                                         config.highway_num_layers,
                                         True,
                                         wd=config.wd,
                                         is_train=self.is_train)
            scope.reuse_variables()
            hypothesis_in = highway_network(hypothesis_in,
                                            config.highway_num_layers,
                                            True,
                                            wd=config.wd,
                                            is_train=self.is_train)

        with tf.variable_scope("prepro") as scope:
            pre = premise_in
            hyp = hypothesis_in
            for i in range(config.self_att_enc_layers):
                with tf.variable_scope(tf.get_variable_scope(), reuse=False):
                    p = self_attention_layer(
                        config,
                        self.is_train,
                        pre,
                        p_mask=prem_mask,
                        scope="{}_layer_self_att_enc".format(
                            i))  # [N, len, dim]
                    h = self_attention_layer(
                        config,
                        self.is_train,
                        hyp,
                        p_mask=hyp_mask,
                        scope="{}_layer_self_att_enc_h".format(i))
                    pre = p
                    hyp = h
                    variable_summaries(p,
                                       "p_self_enc_summary_layer_{}".format(i))
                    variable_summaries(h,
                                       "h_self_enc_summary_layer_{}".format(i))

        with tf.variable_scope("main") as scope:

            def model_one_side(config, main, support, main_length,
                               support_length, main_mask, support_mask, scope):
                bi_att_mx = bi_attention_mx(config,
                                            self.is_train,
                                            main,
                                            support,
                                            p_mask=main_mask,
                                            h_mask=support_mask)  # [N, PL, HL]

                bi_att_mx = tf.cond(
                    self.is_train,
                    lambda: tf.nn.dropout(bi_att_mx, config.keep_rate),
                    lambda: bi_att_mx)
                out_final = dense_net(config, bi_att_mx, self.is_train)

                return out_final

            premise_final = model_one_side(config,
                                           p,
                                           h,
                                           prem_seq_lengths,
                                           hyp_seq_lengths,
                                           prem_mask,
                                           hyp_mask,
                                           scope="premise_as_main")
            f0 = premise_final
            print('f0:', f0.get_shape().as_list())

        self.logits = linear(f0,
                             self.pred_size,
                             True,
                             bias_start=0.0,
                             scope="logit",
                             squeeze=False,
                             wd=config.wd,
                             input_keep_prob=config.keep_rate,
                             is_train=self.is_train)

        tf.summary.histogram('logit_histogram', self.logits)

        # Define the cost function
        self.total_cost = tf.reduce_mean(
            tf.nn.sparse_softmax_cross_entropy_with_logits(labels=self.y,
                                                           logits=self.logits))
        self.acc = tf.reduce_mean(
            tf.cast(
                tf.equal(tf.arg_max(self.logits, dimension=1),
                         tf.cast(self.y, tf.int64)), tf.float32))
        tf.summary.scalar('acc', self.acc)

        tf.summary.scalar('loss', self.total_cost)

        # calculate acc

        # L2 Loss
        if config.l2_loss:
            if config.sigmoid_growing_l2loss:
                weights_added = tf.add_n([
                    tf.nn.l2_loss(tensor)
                    for tensor in tf.trainable_variables()
                    if tensor.name.endswith("weights:0")
                    and not tensor.name.endswith("weighted_sum/weights:0")
                    or tensor.name.endswith('kernel:0')
                ])
                full_l2_step = tf.constant(config.weight_l2loss_step_full_reg,
                                           dtype=tf.int32,
                                           shape=[],
                                           name='full_l2reg_step')
                full_l2_ratio = tf.constant(config.l2_regularization_ratio,
                                            dtype=tf.float32,
                                            shape=[],
                                            name='l2_regularization_ratio')
                gs_flt = tf.cast(self.global_step, tf.float32)
                half_l2_step_flt = tf.cast(full_l2_step / 2, tf.float32)

                # (self.global_step - full_l2_step / 2)
                # tf.cast((self.global_step - full_l2_step / 2) * 8, tf.float32) / tf.cast(full_l2_step / 2 ,tf.float32)
                # l2loss_ratio = tf.sigmoid( tf.cast((self.global_step - full_l2_step / 2) * 8, tf.float32) / tf.cast(full_l2_step / 2 ,tf.float32)) * full_l2_ratio
                l2loss_ratio = tf.sigmoid(((gs_flt - half_l2_step_flt) * 8) /
                                          half_l2_step_flt) * full_l2_ratio
                tf.summary.scalar('l2loss_ratio', l2loss_ratio)
                l2loss = weights_added * l2loss_ratio
            else:
                l2loss = tf.add_n([
                    tf.nn.l2_loss(tensor)
                    for tensor in tf.trainable_variables() if tensor.name.
                    endswith("weights:0") or tensor.name.endswith('kernel:0')
                ]) * tf.constant(config.l2_regularization_ratio,
                                 dtype='float',
                                 shape=[],
                                 name='l2_regularization_ratio')
            tf.summary.scalar('l2loss', l2loss)
            self.total_cost += l2loss

        if config.wo_enc_sharing or config.wo_highway_sharing_but_penalize_diff:
            diffs = []
            for i in range(config.self_att_enc_layers):
                for tensor in tf.trainable_variables():
                    print(tensor.name)
                    if tensor.name == "prepro/{}_layer_self_att_enc/self_attention/h_logits/first/kernel:0".format(
                            i):
                        l_lg = tensor
                    elif tensor.name == "prepro/{}_layer_self_att_enc_h/self_attention/h_logits/first/kernel:0".format(
                            i):
                        r_lg = tensor
                    elif tensor.name == "prepro/{}_layer_self_att_enc/self_att_fuse_gate/lhs_1/kernel:0".format(
                            i):
                        l_fg_lhs_1 = tensor
                    elif tensor.name == "prepro/{}_layer_self_att_enc_h/self_att_fuse_gate/lhs_1/kernel:0".format(
                            i):
                        r_fg_lhs_1 = tensor
                    elif tensor.name == "prepro/{}_layer_self_att_enc/self_att_fuse_gate/rhs_1/kernel:0".format(
                            i):
                        l_fg_rhs_1 = tensor
                    elif tensor.name == "prepro/{}_layer_self_att_enc_h/self_att_fuse_gate/rhs_1/kernel:0".format(
                            i):
                        r_fg_rhs_1 = tensor
                    elif tensor.name == "prepro/{}_layer_self_att_enc/self_att_fuse_gate/lhs_2/kernel:0".format(
                            i):
                        l_fg_lhs_2 = tensor
                    elif tensor.name == "prepro/{}_layer_self_att_enc_h/self_att_fuse_gate/lhs_2/kernel:0".format(
                            i):
                        r_fg_lhs_2 = tensor
                    elif tensor.name == "prepro/{}_layer_self_att_enc/self_att_fuse_gate/rhs_2/kernel:0".format(
                            i):
                        l_fg_rhs_2 = tensor
                    elif tensor.name == "prepro/{}_layer_self_att_enc_h/self_att_fuse_gate/rhs_2/kernel:0".format(
                            i):
                        r_fg_rhs_2 = tensor

                    if config.two_gate_fuse_gate:
                        if tensor.name == "prepro/{}_layer_self_att_enc/self_att_fuse_gate/lhs_3/kernel:0".format(
                                i):
                            l_fg_lhs_3 = tensor
                        elif tensor.name == "prepro/{}_layer_self_att_enc_h/self_att_fuse_gate/lhs_3/kernel:0".format(
                                i):
                            r_fg_lhs_3 = tensor
                        elif tensor.name == "prepro/{}_layer_self_att_enc/self_att_fuse_gate/rhs_3/kernel:0".format(
                                i):
                            l_fg_rhs_3 = tensor
                        elif tensor.name == "prepro/{}_layer_self_att_enc_h/self_att_fuse_gate/rhs_3/kernel:0".format(
                                i):
                            r_fg_rhs_3 = tensor

                diffs += [
                    l_lg - r_lg, l_fg_lhs_1 - r_fg_lhs_1,
                    l_fg_rhs_1 - r_fg_rhs_1, l_fg_lhs_2 - r_fg_lhs_2,
                    l_fg_rhs_2 - r_fg_rhs_2
                ]
                if config.two_gate_fuse_gate:
                    diffs += [l_fg_lhs_3 - r_fg_lhs_3, l_fg_rhs_3 - r_fg_rhs_3]

            diff_loss = tf.add_n([tf.nn.l2_loss(tensor)
                                  for tensor in diffs]) * tf.constant(
                                      config.diff_penalty_loss_ratio,
                                      dtype='float',
                                      shape=[],
                                      name='diff_penalty_loss_ratio')
            tf.summary.scalar('diff_penalty_loss', diff_loss)
            self.total_cost += diff_loss

        self.summary = tf.summary.merge_all()

        total_parameters = 0
        for v in tf.global_variables():
            if not v.name.endswith("weights:0") and not v.name.endswith(
                    "biases:0") and not v.name.endswith(
                        'kernel:0') and not v.name.endswith('bias:0'):
                continue
            print(v.name)
            # print(type(v.name))
            shape = v.get_shape().as_list()
            param_num = 1
            for dim in shape:
                param_num *= dim
            print(param_num)
            total_parameters += param_num
        print(total_parameters)
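
The sigmoid_growing_l2loss branch ramps the L2 penalty from roughly zero up to config.l2_regularization_ratio, crossing the halfway point at half of weight_l2loss_step_full_reg. A NumPy sketch of that schedule with hypothetical config values:

    import numpy as np

    full_l2_step, full_l2_ratio = 100000, 9e-5    # hypothetical config values
    sigmoid = lambda z: 1.0 / (1.0 + np.exp(-z))

    for gs in (0, 25000, 50000, 75000, 100000):
        ratio = sigmoid((gs - full_l2_step / 2) * 8 / (full_l2_step / 2)) * full_l2_ratio
        print(gs, ratio)    # ~0 at the start, half of full_l2_ratio at 50k, ~full_l2_ratio by 100k
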
Example 17
    def _build_forward(self):
        config = self.config
        N, M, JX, JQ, VW, VC, d, W = \
            config.batch_size, config.max_num_sents, config.max_sent_size, \
            config.max_ques_size, config.word_vocab_size, config.char_vocab_size, config.hidden_size, \
            config.max_word_size
        JX = tf.shape(self.x)[2]
        JQ = tf.shape(self.q)[1]
        M = tf.shape(self.x)[1]
        dc, dw, dco = config.char_emb_size, config.word_emb_size, config.char_out_size
        with tf.variable_scope("emb"):
            if config.use_char_emb:
                with tf.variable_scope("emb_var"), tf.device("/cpu:0"):
                    char_emb_mat = tf.get_variable("char_emb_mat",
                                                   shape=[VC, dc],
                                                   dtype='float')

                with tf.variable_scope("char"):
                    Acx = tf.nn.embedding_lookup(char_emb_mat,
                                                 self.cx)  # [N, M, JX, W, dc]
                    Acq = tf.nn.embedding_lookup(char_emb_mat,
                                                 self.cq)  # [N, JQ, W, dc]
                    Acx = tf.reshape(Acx, [-1, JX, W, dc])
                    Acq = tf.reshape(Acq, [-1, JQ, W, dc])

                    filter_sizes = list(
                        map(int, config.out_channel_dims.split(',')))
                    heights = list(map(int, config.filter_heights.split(',')))
                    assert sum(filter_sizes) == dco, (filter_sizes, dco)
                    with tf.variable_scope("conv"):
                        xx = multi_conv1d(Acx,
                                          filter_sizes,
                                          heights,
                                          "VALID",
                                          self.is_train,
                                          config.keep_prob,
                                          scope="xx")
                        if config.share_cnn_weights:
                            tf.get_variable_scope().reuse_variables()
                            qq = multi_conv1d(Acq,
                                              filter_sizes,
                                              heights,
                                              "VALID",
                                              self.is_train,
                                              config.keep_prob,
                                              scope="xx")
                        else:
                            qq = multi_conv1d(Acq,
                                              filter_sizes,
                                              heights,
                                              "VALID",
                                              self.is_train,
                                              config.keep_prob,
                                              scope="qq")
                        xx = tf.reshape(xx, [-1, M, JX, dco])
                        qq = tf.reshape(qq, [-1, JQ, dco])

            if config.use_word_emb:
                with tf.variable_scope("emb_var"), tf.device("/cpu:0"):
                    if config.mode == 'train':
                        word_emb_mat = tf.get_variable(
                            "word_emb_mat",
                            dtype='float',
                            shape=[VW, dw],
                            initializer=get_initializer(config.emb_mat))
                    else:
                        word_emb_mat = tf.get_variable("word_emb_mat",
                                                       shape=[VW, dw],
                                                       dtype='float')
                    if config.use_glove_for_unk:
                        word_emb_mat = tf.concat(
                            0, [word_emb_mat, self.new_emb_mat])

                with tf.name_scope("word"):
                    Ax = tf.nn.embedding_lookup(word_emb_mat,
                                                self.x)  # [N, M, JX, d]
                    Aq = tf.nn.embedding_lookup(word_emb_mat,
                                                self.q)  # [N, JQ, d]
                    self.tensor_dict['x'] = Ax
                    self.tensor_dict['q'] = Aq
                if config.use_char_emb:
                    xx = tf.concat(3, [xx, Ax])  # [N, M, JX, di]
                    qq = tf.concat(2, [qq, Aq])  # [N, JQ, di]
                else:
                    xx = Ax
                    qq = Aq

        # highway network
        if config.highway:
            with tf.variable_scope("highway"):
                xx = highway_network(xx,
                                     config.highway_num_layers,
                                     True,
                                     wd=config.wd,
                                     is_train=self.is_train)
                tf.get_variable_scope().reuse_variables()
                qq = highway_network(qq,
                                     config.highway_num_layers,
                                     True,
                                     wd=config.wd,
                                     is_train=self.is_train)

        self.tensor_dict['xx'] = xx
        self.tensor_dict['qq'] = qq

        cell = BasicLSTMCell(d, state_is_tuple=True)
        d_cell = SwitchableDropoutWrapper(
            cell, self.is_train, input_keep_prob=config.input_keep_prob)
        x_len = tf.reduce_sum(tf.cast(self.x_mask, 'int32'), 2)  # [N, M]
        q_len = tf.reduce_sum(tf.cast(self.q_mask, 'int32'), 1)  # [N]

        with tf.variable_scope("prepro"):
            (fw_u, bw_u), ((_, fw_u_f), (_,
                                         bw_u_f)) = bidirectional_dynamic_rnn(
                                             d_cell,
                                             d_cell,
                                             qq,
                                             q_len,
                                             dtype='float',
                                             scope='u1')  # [N, J, d], [N, d]
            u = tf.concat(2, [fw_u, bw_u])
            if config.share_lstm_weights:
                tf.get_variable_scope().reuse_variables()
                (fw_h, bw_h), _ = bidirectional_dynamic_rnn(
                    cell, cell, xx, x_len, dtype='float',
                    scope='u1')  # [N, M, JX, 2d]
                h = tf.concat(3, [fw_h, bw_h])  # [N, M, JX, 2d]
            else:
                (fw_h, bw_h), _ = bidirectional_dynamic_rnn(
                    cell, cell, xx, x_len, dtype='float',
                    scope='h1')  # [N, M, JX, 2d]
                h = tf.concat(3, [fw_h, bw_h])  # [N, M, JX, 2d]
            self.tensor_dict['u'] = u
            self.tensor_dict['h'] = h

        with tf.variable_scope("main"):
            if config.dynamic_att:
                p0 = h
                u = tf.reshape(tf.tile(tf.expand_dims(u, 1), [1, M, 1, 1]),
                               [N * M, JQ, 2 * d])
                q_mask = tf.reshape(
                    tf.tile(tf.expand_dims(self.q_mask, 1), [1, M, 1]),
                    [N * M, JQ])
                first_cell = AttentionCell(
                    cell,
                    u,
                    mask=q_mask,
                    mapper='sim',
                    input_keep_prob=self.config.input_keep_prob,
                    is_train=self.is_train)
            else:
                p0 = attention_layer(config,
                                     self.is_train,
                                     h,
                                     u,
                                     h_mask=self.x_mask,
                                     u_mask=self.q_mask,
                                     scope="p0",
                                     tensor_dict=self.tensor_dict)
                first_cell = d_cell
            self.p = p0
            (fw_g0, bw_g0), _ = bidirectional_dynamic_rnn(
                first_cell, first_cell, p0, x_len, dtype='float',
                scope='g0')  # [N, M, JX, 2d]
            g0 = tf.concat(3, [fw_g0, bw_g0])
            (fw_g1, bw_g1), _ = bidirectional_dynamic_rnn(
                first_cell, first_cell, g0, x_len, dtype='float',
                scope='g1')  # [N, M, JX, 2d]
            g1 = tf.concat(3, [fw_g1, bw_g1])

        with tf.variable_scope("output"):
            if config.model_name == "basic":
                logits = get_logits([g1, p0], d, True, wd=config.wd, \
                        input_keep_prob=config.input_keep_prob,
                        mask=self.x_mask, is_train=self.is_train, \
                        func=config.answer_func, scope='logits1')
                a1i = softsel(tf.reshape(g1, [N, M * JX, 2 * d]), \
                        tf.reshape(logits, [N, M * JX]))
                a1i = tf.tile(tf.expand_dims(tf.expand_dims(a1i, 1), 1), \
                        [1, M, JX, 1])
                (fw_g2, bw_g2), _ = bidirectional_dynamic_rnn(d_cell, d_cell, \
                        tf.concat(3, [p0, g1, a1i, g1 * a1i]),
                        x_len, dtype='float', scope='g2')  # [N, M, JX, 2d]
                g2 = tf.concat(3, [fw_g2, bw_g2])
                logits2 = get_logits([g2, p0], d, True, wd=config.wd, \
                        input_keep_prob=config.input_keep_prob, mask=self.x_mask,
                        is_train=self.is_train, func=config.answer_func,
                        scope='logits2')
                flat_logits = tf.reshape(logits, [-1, M * JX])
                flat_yp = tf.nn.softmax(flat_logits)  # [-1, M*JX]
                yp = tf.reshape(flat_yp, [-1, M, JX])
                flat_logits2 = tf.reshape(logits2, [-1, M * JX])
                flat_yp2 = tf.nn.softmax(flat_logits2)
                yp2 = tf.reshape(flat_yp2, [-1, M, JX])

                self.tensor_dict['g1'] = g1
                self.tensor_dict['g2'] = g2

                self.logits = flat_logits
                self.logits2 = flat_logits2
                self.yp = yp
                self.yp2 = yp2

            elif config.model_name == "basic-class":
                C = 3 if config.data_dir.startswith('data/snli') else 2
                (fw_g2, bw_g2) = (fw_g1, bw_g1)

                if config.classifier == 'maxpool':
                    g2 = tf.concat(3, [fw_g2, bw_g2])  # [N, M, JX, 2d]
                    g2 = tf.reduce_max(g2, 2)  # [N, M, 2d]
                    g2_dim = 2 * d
                elif config.classifier == 'sumpool':
                    g2 = tf.concat(3, [fw_g2, bw_g2])
                    g2 = tf.reduce_sum(g2, 2)
                    g2_dim = 2 * d
                else:
                    fw_g2_ = tf.gather(tf.transpose(fw_g2, [2, 0, 1, 3]),
                                       JX - 1)
                    bw_g2_ = tf.gather(tf.transpose(bw_g2, [2, 0, 1, 3]), 0)
                    g2 = tf.concat(2, [fw_g2_, bw_g2_])
                    g2_dim = 2 * d

                g2_ = tf.reshape(g2, [N, g2_dim])

                logits0 = linear(g2_,
                                 C,
                                 True,
                                 wd=config.wd,
                                 input_keep_prob=config.input_keep_prob,
                                 is_train=self.is_train,
                                 scope='classifier')
                flat_yp0 = tf.nn.softmax(logits0)
                yp0 = tf.reshape(flat_yp0, [N, M, C])
                self.tensor_dict['g1'] = g1
                self.logits0 = logits0
                self.yp0 = yp0
                self.logits = logits0
                self.yp = yp0
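
softsel(target, logits), used in the basic output head above, is a soft selection: softmax the logits and take the expectation of target under those weights. A NumPy sketch under that assumption:

    import numpy as np

    def softsel_np(target, logits):
        """Assumed behaviour of softsel: softmax(logits)-weighted sum over target's second-to-last axis."""
        a = np.exp(logits - logits.max(-1, keepdims=True))
        a /= a.sum(-1, keepdims=True)
        return (target * a[..., None]).sum(-2)

    g1 = np.random.randn(2, 12, 8)       # e.g. [N, M*JX, 2d]
    logits = np.random.randn(2, 12)      # [N, M*JX]
    a1i = softsel_np(g1, logits)         # [N, 2d]
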
Example 18
    def initialize(self):
        params = self.params
        placeholders = self.placeholders
        tensors = self.tensors
        variables_dict = self.variables_dict
        N, J, V, Q, M = params.batch_size, params.max_sent_size, params.vocab_size, params.max_ques_size, params.mem_size
        d = params.hidden_size
        L = params.mem_num_layers
        att_forget_bias = params.att_forget_bias
        use_vector_gate = params.use_vector_gate
        wd = params.wd
        initializer = tf.random_uniform_initializer(-np.sqrt(3), np.sqrt(3))
        with tf.name_scope("placeholders"):
            x = tf.placeholder('int32', shape=[N, M, J], name='x')
            x_mask = tf.placeholder('bool', shape=[N, M, J], name='x_mask')
            q = tf.placeholder('int32', shape=[N, J], name='q')
            q_mask = tf.placeholder('bool', shape=[N, J], name='q_mask')
            y = tf.placeholder('int32', shape=[N], name='y')
            is_train = tf.placeholder('bool', shape=[], name='is_train')
            placeholders['x'] = x
            placeholders['x_mask'] = x_mask
            placeholders['q'] = q
            placeholders['q_mask'] = q_mask
            placeholders['y'] = y
            placeholders['is_train'] = is_train

        with tf.variable_scope("embedding"):
            A = VariableEmbedder(params, wd=wd, initializer=initializer, name='A')
            Aq = A(q, name='Aq')  # [N, S, J, d]
            Ax = A(x, name='Ax')  # [N, S, J, d]

        with tf.name_scope("encoding"):
            encoder = PositionEncoder(J, d)
            u = encoder(Aq, q_mask)  # [N, d]
            m = encoder(Ax, x_mask)  # [N, M, d]

        with tf.variable_scope("networks"):
            m_mask = tf.reduce_max(tf.cast(x_mask, 'int64'), 2, name='m_mask')  # [N, M]
            gate_mask = tf.expand_dims(m_mask, -1)
            m_length = tf.reduce_sum(m_mask, 1, name='m_length')  # [N]
            prev_u = tf.tile(tf.expand_dims(u, 1), [1, M, 1])  # [N, M, d]
            reg_layer = VectorReductionLayer(N, M, d) if use_vector_gate else ReductionLayer(N, M, d)
            gate_size = d if use_vector_gate else 1
            h = None  # [N, M, d]
            as_, rfs, rbs = [], [], []
            hs = []
            for layer_idx in range(L):
                with tf.name_scope("layer_{}".format(layer_idx)):
                    u_t = tf.tanh(linear([prev_u, m], d, True, wd=wd, scope='u_t'))
                    a = tf.cast(gate_mask, 'float') * tf.sigmoid(linear([prev_u * m], gate_size, True, initializer=initializer, wd=wd, scope='a') - att_forget_bias)
                    h = reg_layer(u_t, a, 1.0-a, scope='h')
                    if layer_idx + 1 < L:
                        if params.use_reset:
                            rf, rb = tf.split(2, 2, tf.cast(gate_mask, 'float') *
                                tf.sigmoid(linear([prev_u * m], 2 * gate_size, True, initializer=initializer, wd=wd, scope='r')))
                        else:
                            rf = rb = tf.ones(a.get_shape().as_list())
                        u_t_rev = tf.reverse_sequence(u_t, m_length, 1)
                        a_rev, rb_rev = tf.reverse_sequence(a, m_length, 1), tf.reverse_sequence(rb, m_length, 1)
                        uf = reg_layer(u_t, a*rf, 1.0-a, scope='uf')
                        ub_rev = reg_layer(u_t_rev, a_rev*rb_rev, 1.0-a_rev, scope='ub_rev')
                        ub = tf.reverse_sequence(ub_rev, m_length, 1)
                        prev_u = uf + ub
                    else:
                        rf = rb = tf.zeros(a.get_shape().as_list())
                    rfs.append(rf)
                    rbs.append(rb)
                    as_.append(a)
                    hs.append(h)
                    tf.get_variable_scope().reuse_variables()

            h_last = tf.squeeze(tf.slice(h, [0, M-1, 0], [-1, -1, -1]), [1])  # [N, d]
            hs_last = [tf.squeeze(tf.slice(each, [0, M-1, 0], [-1, -1, -1]), [1]) for each in hs]
            a = tf.transpose(tf.pack(as_, name='a'), [1, 0, 2, 3])
            rf = tf.transpose(tf.pack(rfs, name='rf'), [1, 0, 2, 3])
            rb = tf.transpose(tf.pack(rbs, name='rb'), [1, 0, 2, 3])
            tensors['a'] = a
            tensors['rf'] = rf
            tensors['rb'] = rb

        with tf.variable_scope("class"):
            class_mode = params.class_mode
            use_class_bias = params.use_class_bias
            if class_mode == 'h':
                # W = tf.transpose(A.emb_mat, name='W')
                logits = linear([h_last], V, use_class_bias, wd=wd)
            elif class_mode == 'uh':
                logits = linear([h_last, u], V, use_class_bias, wd=wd)
            elif class_mode == 'hs':
                logits = linear(hs_last, V, use_class_bias, wd=wd)
            elif class_mode == 'hss':
                logits = linear(sum(hs_last), V, use_class_bias, wd=wd)
            else:
                raise Exception("Invalid class mode: {}".format(class_mode))
            yp = tf.cast(tf.argmax(logits, 1), 'int32')
            correct = tf.equal(yp, y)
            tensors['yp'] = yp
            tensors['correct'] = correct

        with tf.name_scope("loss"):
            with tf.name_scope("ans_loss"):
                ce = tf.nn.sparse_softmax_cross_entropy_with_logits(logits, y, name='ce')
                avg_ce = tf.reduce_mean(ce, name='avg_ce')
                tf.add_to_collection('losses', avg_ce)

            losses = tf.get_collection('losses')
            loss = tf.add_n(losses, name='loss')
            tensors['loss'] = loss

        variables_dict['all'] = tf.trainable_variables()
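
Between layers, the network above builds its backward pass by reversing each memory sequence up to its true length (`tf.reverse_sequence`), running the same gated reduction, and reversing the result back. A small NumPy sketch of that padding-aware reversal, assuming padded positions sit at the end of each row:

import numpy as np

def reverse_sequence(x, lengths):
    """Reverse only the first lengths[i] steps of each row; keep padding in place.
    x: [N, M, d], lengths: [N]."""
    out = x.copy()
    for i, L in enumerate(lengths):
        out[i, :L] = x[i, :L][::-1]
    return out

x = np.arange(2 * 4 * 1).reshape(2, 4, 1).astype(float)
lengths = np.array([3, 4])
rev = reverse_sequence(x, lengths)
print(rev[0, :, 0])   # [2. 1. 0. 3.]  -> first 3 steps reversed, padding untouched
print(rev[1, :, 0])   # [7. 6. 5. 4.]
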
Example n. 19
0
    def _build_forward(self):
        config = self.config
        N, M, JX, JQ, VW, VC, d, W = \
            config.batch_size,  config.max_num_sents, config.max_sent_size, \
            config.max_ques_size, config.word_vocab_size, config.char_vocab_size, config.hidden_size, \
            config.max_word_size
        JQ = JX
        print('VW:{}  NEW_EMB:{}'.format(VW, self.new_emb_mat.get_shape()))
        dc, dw, dco = config.char_emb_size, config.word_emb_size, config.char_out_size

        with tf.variable_scope("emb"):
            if config.use_char_emb:
                with tf.variable_scope("emb_var"), tf.device("/cpu:0"):
                    char_emb_mat = tf.get_variable("char_emb_mat",
                                                   shape=[VC, dc],
                                                   dtype='float')

                with tf.variable_scope("char"):
                    Acx = tf.nn.embedding_lookup(char_emb_mat,
                                                 self.cx)  # [N, M, JX, W, dc]
                    Acq = tf.nn.embedding_lookup(char_emb_mat,
                                                 self.cq)  # [N, JQ, W, dc]
                    Acx = tf.reshape(Acx, [-1, JX, W, dc])
                    Acq = tf.reshape(Acq, [-1, JQ, W, dc])

                    filter_sizes = list(
                        map(int, config.out_channel_dims.split(',')))
                    heights = list(map(int, config.filter_heights.split(',')))
                    assert sum(filter_sizes) == dco, (filter_sizes, dco)
                    with tf.variable_scope("conv"):
                        xx = multi_conv1d(Acx,
                                          filter_sizes,
                                          heights,
                                          "VALID",
                                          self.is_train,
                                          config.keep_prob,
                                          scope="xx")
                        if config.share_cnn_weights:
                            tf.get_variable_scope().reuse_variables()
                            qq = multi_conv1d(Acq,
                                              filter_sizes,
                                              heights,
                                              "VALID",
                                              self.is_train,
                                              config.keep_prob,
                                              scope="xx")

                        else:
                            qq = multi_conv1d(Acq,
                                              filter_sizes,
                                              heights,
                                              "VALID",
                                              self.is_train,
                                              config.keep_prob,
                                              scope="qq")
                        xx = tf.reshape(xx, [-1, M, JX, dco])
                        qq = tf.reshape(qq, [-1, JQ, dco])

            if config.use_word_emb:
                with tf.variable_scope("emb_var"), tf.device("/cpu:0"):
                    if config.mode == 'train':
                        word_emb_mat = tf.get_variable(
                            "word_emb_mat",
                            dtype='float',
                            shape=[VW, dw],
                            initializer=get_initializer(config.emb_mat))
                    else:
                        word_emb_mat = tf.get_variable("word_emb_mat",
                                                       shape=[VW, dw],
                                                       dtype='float')
                    if config.use_glove_for_unk:
                        word_emb_mat = tf.concat(
                            axis=0, values=[word_emb_mat, self.new_emb_mat])

                with tf.name_scope("word"):
                    Ax = tf.nn.embedding_lookup(word_emb_mat,
                                                self.x)  # [N, M, JX, d]
                    Aq = tf.nn.embedding_lookup(word_emb_mat,
                                                self.q)  # [N, JQ, d]

                    self.tensor_dict['x'] = Ax
                    self.tensor_dict['q'] = Aq
                if config.use_char_emb:
                    xx = tf.concat(axis=3, values=[xx, Ax])  # [N, M, JX, di]
                    qq = tf.concat(axis=2, values=[qq, Aq])  # [N, JQ, di]

                else:
                    xx = Ax
                    qq = Aq
                    xx = tf.reshape(xx, [-1, M, JX, d])
                    qq = tf.reshape(qq, [-1, JQ, d])
            if config.use_pos_emb:
                with tf.variable_scope("pos_onehot"), tf.device("/cpu:0"):
                    pos_x = tf.one_hot(
                        self.x_pos, depth=config.pos_tag_num)  # [N,M,JX,depth]
                    pos_q = tf.one_hot(
                        self.q_pos, depth=config.pos_tag_num)  # [N,JQ,depth]
                    xx = tf.concat(axis=3, values=[xx,
                                                   pos_x])  # [N, M, JX, di]
                    qq = tf.concat(axis=2, values=[qq, pos_q])
            if config.use_sem_emb:
                with tf.variable_scope("sem_onehot"), tf.device("/cpu:0"):
                    sem_x = tf.one_hot(self.x_sem, depth=3)  # [N,M,JX,3]
                    sem_q = tf.one_hot(self.q_sem, depth=3)  # [N,JQ,3]
                    xx = tf.concat(axis=3, values=[xx, sem_x])
                    qq = tf.concat(axis=2, values=[qq, sem_q])
            if config.use_neg_emb:
                with tf.variable_scope("neg_onehot"), tf.device("/cpu:0"):
                    neg_x = tf.one_hot(self.x_neg, depth=2)  # [N,M,JX,2]
                    neg_q = tf.one_hot(self.q_neg, depth=2)  # [N,JQ,2]
                    xx = tf.concat(axis=3, values=[xx, neg_x])
                    qq = tf.concat(axis=2, values=[qq, neg_q])

        if config.highway:
            with tf.variable_scope("highway"):
                xx = highway_network(xx,
                                     config.highway_num_layers,
                                     True,
                                     wd=config.wd,
                                     is_train=self.is_train)
                tf.get_variable_scope().reuse_variables()
                qq = highway_network(qq,
                                     config.highway_num_layers,
                                     True,
                                     wd=config.wd,
                                     is_train=self.is_train)

        self.tensor_dict['xx'] = xx
        self.tensor_dict['qq'] = qq

        cell_fw = BasicLSTMCell(d, state_is_tuple=True)
        cell_bw = BasicLSTMCell(d, state_is_tuple=True)
        d_cell_fw = SwitchableDropoutWrapper(
            cell_fw, self.is_train, input_keep_prob=config.input_keep_prob)
        d_cell_bw = SwitchableDropoutWrapper(
            cell_bw, self.is_train, input_keep_prob=config.input_keep_prob)
        cell_fw2 = BasicLSTMCell(d, state_is_tuple=True)
        cell_bw2 = BasicLSTMCell(d, state_is_tuple=True)
        d_cell_fw2 = SwitchableDropoutWrapper(
            cell_fw2, self.is_train, input_keep_prob=config.input_keep_prob)
        d_cell_bw2 = SwitchableDropoutWrapper(
            cell_bw2, self.is_train, input_keep_prob=config.input_keep_prob)
        x_len = tf.reduce_sum(tf.cast(self.x_mask, 'int32'), 2)  # [N, M]
        q_len = tf.reduce_sum(tf.cast(self.q_mask, 'int32'), 1)  # [N]
        if config.lstm:
            with tf.variable_scope("prepro"):
                (fw_u, bw_u), ((_, fw_u_f),
                               (_, bw_u_f)) = bidirectional_dynamic_rnn(
                                   d_cell_fw,
                                   d_cell_bw,
                                   qq,
                                   q_len,
                                   dtype='float',
                                   scope='u1')  # [N, J, d], [N, d]
                print('fw_u_f shape:{}'.format(fw_u_f.get_shape()))
                u = tf.concat(axis=2, values=[fw_u, bw_u])  #[N,JQ,2d]
                if config.share_lstm_weights:
                    tf.get_variable_scope().reuse_variables()
                    (fw_h, bw_h), _ = bidirectional_dynamic_rnn(
                        cell_fw, cell_bw, xx, x_len, dtype='float',
                        scope='u1')  # [N, M, JX, 2d]
                    h = tf.concat(axis=3, values=[fw_h,
                                                  bw_h])  # [N, M, JX, 2d]
                    print('fw_u_f nn shape:{}'.format(fw_u_f.get_shape()))
                else:
                    (fw_h, bw_h), _ = bidirectional_dynamic_rnn(
                        cell_fw, cell_bw, xx, x_len, dtype='float',
                        scope='h1')  # [N, M, JX, 2d]
                    h = tf.concat(axis=3, values=[fw_h,
                                                  bw_h])  # [N, M, JX, 2d]
                self.tensor_dict['u'] = u
                self.tensor_dict['h'] = h
        else:
            h = xx
            u = qq
        h1 = h[:, 0, :, :]
        h2 = h[:, 1, :, :]
        h3 = h[:, 2, :, :]
        h4 = h[:, 3, :, :]

        n_1 = tf.reshape(self.x_mask[:, 0, :], [N, JX])
        n_2 = tf.reshape(self.x_mask[:, 1, :], [N, JX])
        n_3 = tf.reshape(self.x_mask[:, 2, :], [N, JX])
        n_4 = tf.reshape(self.x_mask[:, 3, :], [N, JX])

        if config.self_attention:
            with tf.variable_scope("h_self_weight"):

                print(h.get_shape())
                for i in range(2):
                    with tf.variable_scope("self-attention"):

                        h1 = self_attention_layer(
                            config,
                            self.is_train,
                            h1,
                            p_mask=tf.expand_dims(n_1, -1),
                            scope="{}_layer_self_att_enc_e".format(
                                i))  # [N, len, dim]
                        tf.get_variable_scope().reuse_variables()
                        h2 = self_attention_layer(
                            config,
                            self.is_train,
                            h2,
                            p_mask=tf.expand_dims(n_2, -1),
                            scope="{}_layer_self_att_enc_e".format(i))
                        tf.get_variable_scope().reuse_variables()
                        h3 = self_attention_layer(
                            config,
                            self.is_train,
                            h3,
                            p_mask=tf.expand_dims(n_3, -1),
                            scope="{}_layer_self_att_enc_e".format(i))
                        tf.get_variable_scope().reuse_variables()
                        h4 = self_attention_layer(
                            config,
                            self.is_train,
                            h4,
                            p_mask=tf.expand_dims(n_4, -1),
                            scope="{}_layer_self_att_enc_e".format(i))
                    with tf.variable_scope("self-attention"):
                        u = self_attention_layer(
                            config,
                            self.is_train,
                            u,
                            p_mask=tf.expand_dims(self.q_mask, -1),
                            scope="{}_layer_self_att_enc_p".format(i))
        if config.plot_encoder == "concate":
            h = tf.concat([h1, h2, h3, h4], axis=1)
            print("h concate shape".format(h.get_shape()))
            n_n = tf.concat([n_1, n_2, n_3, n_4], axis=1)
        elif config.plot_encoder == "sum":
            h1 = tf.expand_dims(h1, axis=1)
            h2 = tf.expand_dims(h2, axis=1)
            h3 = tf.expand_dims(h3, axis=1)
            h4 = tf.expand_dims(h4, axis=1)
            h = tf.concat([h1, h2, h3, h4], axis=1)

            h = tf.reduce_sum(h, axis=1)
            print("h sum shape".format(h.get_shape()))
        elif config.plot_encoder == "lstm":
            # h1 = tf.reduce_sum(h1, axis=1)
            h1 = tf.expand_dims(tf.reduce_sum(h1, axis=-1), axis=1)
            h2 = tf.expand_dims(tf.reduce_sum(h2, axis=-1), axis=1)
            h3 = tf.expand_dims(tf.reduce_sum(h3, axis=-1), axis=1)
            h4 = tf.expand_dims(tf.reduce_sum(h4, axis=-1), axis=1)
            (fw_u, bw_u), ((_, fw_u_f), (_,
                                         bw_u_f)) = bidirectional_dynamic_rnn(
                                             d_cell_fw2,
                                             d_cell_bw2,
                                             tf.concat([h1, h2, h3, h4],
                                                       axis=1),
                                             dtype='float',
                                             scope='1')  # [N, J, d], [N, d]
            print('fw_u_f shape:{}'.format(fw_u_f.get_shape()))
            h = tf.concat(axis=2, values=[fw_u, bw_u])  # [N,JQ,2d]
            u = tf.expand_dims(tf.reduce_sum(u, axis=-1), axis=1)
            tf.get_variable_scope().reuse_variables()
            (fw_u, bw_u), ((_, fw_u_f), (_,
                                         bw_u_f)) = bidirectional_dynamic_rnn(
                                             d_cell_fw2,
                                             d_cell_bw2,
                                             tf.concat([u], axis=1),
                                             dtype='float',
                                             scope='1')  # [N, J, d], [N, d]
            print('fw_u_f shape:{}'.format(fw_u_f.get_shape()))
            u = tf.concat(axis=2, values=[fw_u, bw_u])  # [N,JQ,2d]

        if config.interact:
            with tf.variable_scope("interact"):

                def get_attention(h, u, m):
                    JX = tf.shape(h)[1]
                    JQ = tf.shape(u)[1]
                    h = tf.expand_dims(h, 2)
                    u = tf.expand_dims(u, 1)
                    h = tf.tile(h, [1, 1, JQ, 1])
                    u = tf.tile(u, [1, JX, 1, 1])
                    attention = h * u  # N,JX,JQ,2d

                    return attention

                if config.plot_encoder == "concate":
                    attention = get_attention(h, u, M)
                else:
                    attention = get_attention(h, u, 1)

            with tf.variable_scope('conv_dense'):
                if config.plot_encoder == "concate":
                    out_final = dense_net(config, attention, self.is_train)
                else:
                    out_final = tf.reshape(attention, shape=[N, -1])

        else:
            h = tf.reshape(h, [-1, M * 2 * d * JX])
            print("h shape {}".format(h.get_shape()))
            u = tf.reshape(u, [-1, 2 * d * JQ])
            print("U shape {}".format(u.get_shape()))
            attention = tf.concat([h, u], axis=-1)
            out_final = attention

            out_final = linear(tf.concat([attention], axis=-1),
                               1000,
                               True,
                               bias_start=0.0,
                               scope="logit8",
                               squeeze=False,
                               wd=config.wd,
                               input_keep_prob=config.output_keep_pro,
                               is_train=self.is_train)
            out_final = tf.nn.relu(out_final)
            out_final = linear(tf.concat([out_final], axis=-1),
                               400,
                               True,
                               bias_start=0.0,
                               scope="logit9",
                               squeeze=False,
                               wd=config.wd,
                               input_keep_prob=config.output_keep_pro,
                               is_train=self.is_train)
            out_final = tf.nn.relu(out_final)

            out_final = linear(out_final,
                               300,
                               True,
                               bias_start=0.0,
                               scope="logit3",
                               squeeze=False,
                               wd=config.wd,
                               input_keep_prob=config.output_keep_pro,
                               is_train=self.is_train)

            out_final = tf.nn.relu(out_final)

        with tf.variable_scope('conv_dense'):

            if config.hao:
                out_final = linear(tf.concat(
                    [out_final, self.haoruopeng_feature], axis=-1),
                                   200,
                                   True,
                                   bias_start=0.0,
                                   scope="logit",
                                   squeeze=False,
                                   wd=config.wd,
                                   input_keep_prob=config.output_keep_pro,
                                   is_train=self.is_train)
                out_final = tf.nn.relu(out_final)
                out_final = linear(out_final,
                                   100,
                                   True,
                                   bias_start=0.0,
                                   scope="logit3",
                                   squeeze=False,
                                   wd=config.wd,
                                   input_keep_prob=config.output_keep_pro,
                                   is_train=self.is_train)

                out_final = tf.nn.relu(out_final)
            else:
                out_final = linear(tf.concat([out_final], axis=-1),
                                   200,
                                   True,
                                   bias_start=0.0,
                                   scope="logit",
                                   squeeze=False,
                                   wd=config.wd,
                                   input_keep_prob=config.output_keep_pro,
                                   is_train=self.is_train)
                out_final = linear(out_final,
                                   100,
                                   True,
                                   bias_start=0.0,
                                   scope="logit3",
                                   squeeze=False,
                                   wd=config.wd,
                                   input_keep_prob=config.output_keep_pro,
                                   is_train=self.is_train)

                out_final = tf.nn.relu(out_final)

            self.tensor_dict['outfinal'] = out_final
            self.prediction = linear(tf.concat([out_final], axis=-1),
                                     1,
                                     True,
                                     bias_start=0.0,
                                     scope="logit2",
                                     squeeze=False,
                                     wd=config.wd,
                                     input_keep_prob=config.output_keep_pro,
                                     is_train=self.is_train)
    def _build_forward(self):
        config = self.config
        N, M, JX, JQ, VW, VC, d, dc, W = \
            config.batch_size, config.max_num_sents, config.max_sent_size, \
            config.max_ques_size, config.word_vocab_size, config.char_vocab_size, \
            config.hidden_size, config.char_emb_size, config.max_word_size
        H = config.max_tree_height

        x_mask = self.x > 0
        q_mask = self.q > 0
        tx_mask = self.tx > 0  # [N, M, H, JX]

        with tf.variable_scope("char_emb"):
            char_emb_mat = tf.get_variable("char_emb_mat", shape=[VC, dc], dtype='float')
            Acx = tf.nn.embedding_lookup(char_emb_mat, self.cx)  # [N, M, JX, W, dc]
            Acq = tf.nn.embedding_lookup(char_emb_mat, self.cq)  # [N, JQ, W, dc]

            filter = tf.get_variable("filter", shape=[1, config.char_filter_height, dc, d], dtype='float')
            bias = tf.get_variable("bias", shape=[d], dtype='float')
            strides = [1, 1, 1, 1]
            Acx = tf.reshape(Acx, [-1, JX, W, dc])
            Acq = tf.reshape(Acq, [-1, JQ, W, dc])
            xxc = tf.nn.conv2d(Acx, filter, strides, "VALID") + bias  # [N*M, JX, W/filter_stride, d]
            qqc = tf.nn.conv2d(Acq, filter, strides, "VALID") + bias  # [N, JQ, W/filter_stride, d]
            xxc = tf.reshape(tf.reduce_max(tf.nn.relu(xxc), 2), [-1, M, JX, d])
            qqc = tf.reshape(tf.reduce_max(tf.nn.relu(qqc), 2), [-1, JQ, d])

        with tf.variable_scope("word_emb"):
            if config.mode == 'train':
                word_emb_mat = tf.get_variable("word_emb_mat", dtype='float', shape=[VW, config.word_emb_size], initializer=get_initializer(config.emb_mat))
            else:
                word_emb_mat = tf.get_variable("word_emb_mat", shape=[VW, config.word_emb_size], dtype='float')
            Ax = tf.nn.embedding_lookup(word_emb_mat, self.x)  # [N, M, JX, d]
            Aq = tf.nn.embedding_lookup(word_emb_mat, self.q)  # [N, JQ, d]
            # Ax = linear([Ax], d, False, scope='Ax_reshape')
            # Aq = linear([Aq], d, False, scope='Aq_reshape')

        xx = tf.concat(3, [xxc, Ax])  # [N, M, JX, 2d]
        qq = tf.concat(2, [qqc, Aq])  # [N, JQ, 2d]
        D = d + config.word_emb_size

        with tf.variable_scope("pos_emb"):
            pos_emb_mat = tf.get_variable("pos_emb_mat", shape=[config.pos_vocab_size, d], dtype='float')
            Atx = tf.nn.embedding_lookup(pos_emb_mat, self.tx)  # [N, M, H, JX, d]

        cell = BasicLSTMCell(D, state_is_tuple=True)
        cell = SwitchableDropoutWrapper(cell, self.is_train, input_keep_prob=config.input_keep_prob)
        x_len = tf.reduce_sum(tf.cast(x_mask, 'int32'), 2)  # [N, M]
        q_len = tf.reduce_sum(tf.cast(q_mask, 'int32'), 1)  # [N]

        with tf.variable_scope("rnn"):
            (fw_h, bw_h), _ = bidirectional_dynamic_rnn(cell, cell, xx, x_len, dtype='float', scope='start')  # [N, M, JX, 2d]
            tf.get_variable_scope().reuse_variables()
            (fw_us, bw_us), (_, (fw_u, bw_u)) = bidirectional_dynamic_rnn(cell, cell, qq, q_len, dtype='float', scope='start')  # [N, J, d], [N, d]
            u = (fw_u + bw_u) / 2.0
            h = (fw_h + bw_h) / 2.0

        with tf.variable_scope("h"):
            no_op_cell = NoOpCell(D)
            tree_rnn_cell = TreeRNNCell(no_op_cell, d, tf.reduce_max)
            initial_state = tf.reshape(h, [N*M*JX, D])  # [N*M*JX, D]
            inputs = tf.concat(4, [Atx, tf.cast(self.tx_edge_mask, 'float')])  # [N, M, H, JX, d+JX]
            inputs = tf.reshape(tf.transpose(inputs, [0, 1, 3, 2, 4]), [N*M*JX, H, d + JX])  # [N*M*JX, H, d+JX]
            length = tf.reshape(tf.reduce_sum(tf.cast(tx_mask, 'int32'), 2), [N*M*JX])
            # length = tf.reshape(tf.reduce_sum(tf.cast(tf.transpose(tx_mask, [0, 1, 3, 2]), 'float'), 3), [-1])
            h, _ = dynamic_rnn(tree_rnn_cell, inputs, length, initial_state=initial_state)  # [N*M*JX, H, D]
            h = tf.transpose(tf.reshape(h, [N, M, JX, H, D]), [0, 1, 3, 2, 4])  # [N, M, H, JX, D]

        u = tf.expand_dims(tf.expand_dims(tf.expand_dims(u, 1), 1), 1)  # [N, 1, 1, 1, 4d]
        dot = linear(h * u, 1, True, squeeze=True, scope='dot')  # [N, M, H, JX]
        # self.logits = tf.reshape(dot, [N, M * H * JX])
        self.logits = tf.reshape(exp_mask(dot, tx_mask), [N, M * H * JX])  # [N, M, H, JX]
        self.yp = tf.reshape(tf.nn.softmax(self.logits), [N, M, H, JX])
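
Both `_build_forward` variants above build word representations from characters with a convolution over the character axis followed by ReLU and max-over-time pooling (the `conv` / `char_emb` scopes). A NumPy sketch of that pattern for a single filter height, with illustrative names and shapes rather than the repository's `multi_conv1d` API:

import numpy as np

def char_cnn(chars_emb, filt, bias):
    """1-D convolution over characters, then ReLU and max-over-time.
    chars_emb: [B, W, dc]  (one word per row of B)
    filt:      [fh, dc, d] (filter height fh along the character axis)
    returns:   [B, d]
    """
    B, W, dc = chars_emb.shape
    fh, _, d = filt.shape
    conv = np.zeros((B, W - fh + 1, d))
    for t in range(W - fh + 1):                        # "VALID" convolution
        window = chars_emb[:, t:t + fh, :].reshape(B, fh * dc)
        conv[:, t, :] = window @ filt.reshape(fh * dc, d) + bias
    return np.maximum(conv, 0.0).max(axis=1)           # ReLU, then max over positions

B, W, dc, fh, d = 3, 7, 8, 5, 16
out = char_cnn(np.random.randn(B, W, dc),
               np.random.randn(fh, dc, d),
               np.zeros(d))
print(out.shape)  # (3, 16)
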
Example n. 21
0
    def initialize(self):
        params = self.params
        placeholders = self.placeholders
        tensors = self.tensors
        variables_dict = self.variables_dict

        self.task = int(params.task)
        self.dstc = self.task % 10 == 6
        self.match = params.use_match
        self.rnn = params.use_rnn

        N, J, Q, M = params.batch_size, params.max_sent_size, params.max_ques_size, params.mem_size
        V, Alist = params.vocab_size

        d = params.hidden_size
        L = params.mem_num_layers
        att_forget_bias = params.att_forget_bias
        use_vector_gate = params.use_vector_gate
        wd = params.wd
        initializer = tf.random_uniform_initializer(-np.sqrt(3), np.sqrt(3))

        self.ans_dic = {
            1: range(5),
            2: range(5),
            3: [0, 5],
            4: [0, 6, 7],
            5: range(8),
            6: range(11)
        }
        self.num_candidate = Alist[0] + 1
        data_task = self.task % 10 if not self.rnn else self.task
        self.ans = self.ans_dic.get(data_task, [0])
        self.num_ans = len(self.ans)
        if self.rnn and self.task == 3: self.num_ans = 3
        elif self.rnn and self.task == 4: self.num_ans = 4
        elif self.rnn: self.num_ans = 6

        with tf.name_scope("placeholders"):
            x = tf.placeholder('int32', shape=[N, M, J], name='x')
            x_mask = tf.placeholder('bool', shape=[N, M, J], name='x_mask')
            q = tf.placeholder('int32', shape=[N, J], name='q')
            q_mask = tf.placeholder('bool', shape=[N, J], name='q_mask')
            y = tf.placeholder('int32', shape=[N, self.num_ans], name='y')
            y_mask = tf.placeholder('bool',
                                    shape=[N, self.num_ans],
                                    name='y_mask')
            y_feats = []
            for i in self.ans[1:]:
                A = Alist[0] if self.rnn else Alist[i]
                y_feats.append(
                    tf.placeholder('int32',
                                   shape=[N, 2, A],
                                   name='y_feat' + str(i)))
            self.y_state_dim = self.num_ans - 2 if self.rnn else self.num_ans - 1
            y_state = tf.placeholder('bool',
                                     shape=[N, self.y_state_dim],
                                     name='y_state')
            is_train = tf.placeholder('bool', shape=[], name='is_train')

            placeholders['x'] = x
            placeholders['x_mask'] = x_mask
            placeholders['q'] = q
            placeholders['q_mask'] = q_mask
            placeholders['y'] = y
            placeholders['y_mask'] = y_mask
            placeholders['y_feats'] = y_feats
            placeholders['y_state'] = y_state
            placeholders['is_train'] = is_train

        with tf.variable_scope("embedding"):
            A = VariableEmbedder(params,
                                 wd=wd,
                                 initializer=initializer,
                                 name='A')
            Aq = A(q, name='Aq')  # [N, J, d]
            Ax = A(x, name='Ax')  # [N, S, J, d]

        with tf.name_scope("encoding"):
            encoder = PositionEncoder(J, d)
            u = encoder(Aq, q_mask)  # [N, d]
            m = encoder(Ax, x_mask)  # [N, M, d]

        with tf.variable_scope("networks"):
            m_mask = tf.reduce_max(tf.cast(x_mask, 'int64'), 2,
                                   name='m_mask')  # [N, M]
            gate_mask = tf.expand_dims(m_mask, -1)
            m_length = tf.reduce_sum(m_mask, 1, name='m_length')  # [N]
            prev_u = tf.tile(tf.expand_dims(u, 1), [1, M, 1])  # [N, M, d]
            reg_layer = VectorReductionLayer(
                N, M, d) if use_vector_gate else ReductionLayer(N, M, d)
            gate_size = d if use_vector_gate else 1
            h = None  # [N, M, d]
            as_, rfs, rbs = [], [], []
            hs = []
            for layer_idx in range(L):
                with tf.name_scope("layer_{}".format(layer_idx)):
                    u_t = tf.tanh(
                        linear([prev_u, m], d, True, wd=wd, scope='u_t'))
                    a = tf.cast(gate_mask, 'float') * tf.sigmoid(
                        linear([prev_u * m],
                               gate_size,
                               True,
                               initializer=initializer,
                               wd=wd,
                               scope='a') - att_forget_bias)
                    h = reg_layer(u_t, a, 1.0 - a, scope='h')
                    if layer_idx + 1 < L:
                        if params.use_reset:
                            rf, rb = tf.split(
                                2, 2,
                                tf.cast(gate_mask, 'float') * tf.sigmoid(
                                    linear([prev_u * m],
                                           2 * gate_size,
                                           True,
                                           initializer=initializer,
                                           wd=wd,
                                           scope='r')))
                        else:
                            rf = rb = tf.ones(a.get_shape().as_list())
                        u_t_rev = tf.reverse_sequence(u_t, m_length, 1)
                        a_rev, rb_rev = tf.reverse_sequence(
                            a, m_length,
                            1), tf.reverse_sequence(rb, m_length, 1)
                        uf = reg_layer(u_t, a * rf, 1.0 - a, scope='uf')
                        ub_rev = reg_layer(u_t_rev,
                                           a_rev * rb_rev,
                                           1.0 - a_rev,
                                           scope='ub_rev')
                        ub = tf.reverse_sequence(ub_rev, m_length, 1)
                        prev_u = uf + ub
                    else:
                        rf = rb = tf.zeros(a.get_shape().as_list())
                    rfs.append(rf)
                    rbs.append(rb)
                    as_.append(a)
                    hs.append(h)
                    tf.get_variable_scope().reuse_variables()

            h_last = tf.squeeze(tf.slice(h, [0, M - 1, 0], [-1, -1, -1]),
                                [1])  # [N, d]
            hs_last = [
                tf.squeeze(tf.slice(each, [0, M - 1, 0], [-1, -1, -1]), [1])
                for each in hs
            ]
            a = tf.transpose(tf.pack(as_, name='a'), [1, 0, 2, 3])
            rf = tf.transpose(tf.pack(rfs, name='rf'), [1, 0, 2, 3])
            rb = tf.transpose(tf.pack(rbs, name='rb'), [1, 0, 2, 3])
            tensors['a'] = a
            tensors['rf'] = rf
            tensors['rb'] = rb

        with tf.variable_scope("class"):
            class_mode = params.class_mode
            use_class_bias = params.use_class_bias
            logits = []
            drop_rate = tf.cond(is_train, lambda: tf.constant(0.5),
                                lambda: tf.constant(1.0))

            if class_mode == 'h':

                if self.rnn:  # rnn decoder
                    hiddens = []  # previous hidden vector
                    A = self.num_candidate
                    for i in range(self.num_ans):
                        # Inverse Embedding Matrix of Answers [A, A]
                        E_inv = tf.get_variable(
                            "E_inv", [A, A],
                            initializer=tf.constant_initializer(0.0))
                        prev_h = h_last
                        if i == 0:
                            # If it is the first answer, use initial y
                            prev_y = tf.reshape(
                                tf.tile(
                                    tf.get_variable(
                                        "Wx",
                                        A,
                                        initializer=tf.constant_initializer(
                                            0.0)), [N]), [N, A])
                        else:
                            # Otherwise, use Inverse Embedding Matrix
                            _prev_y = tf.reshape(
                                tf.gather(tf.transpose(y), i - 1), [N])
                            prev_y = tf.nn.embedding_lookup(E_inv, _prev_y)
                            #prev_h = hiddens[-1]
                        _logit = linear([prev_h],
                                        A,
                                        use_class_bias,
                                        wd=wd,
                                        name='0')
                        logit = _logit * prev_y
                        hiddens.append(prev_h)
                        logits.append(logit)

                        tf.get_variable_scope().reuse_variables()
                else:
                    if self.match:
                        # Input of softmax when using match
                        all_y_feats = [None] + y_feats
                        all_y_states = [y_state
                                        ] + [None] * (len(all_y_feats) - 1)

                    for i, j in enumerate(self.ans):
                        if self.match:
                            logits.append(
                                linear([h_last],
                                       Alist[j],
                                       use_class_bias,
                                       wd=wd,
                                       name=str(i),
                                       feat=all_y_feats[i],
                                       state=all_y_states[i],
                                       drop_rate=drop_rate))

                        else:
                            logits.append(
                                linear([h_last],
                                       Alist[j],
                                       use_class_bias,
                                       wd=wd,
                                       name=str(i)))
            elif class_mode == 'uh':
                logits = linear([h_last, u], A, use_class_bias, wd=wd)
            elif class_mode == 'hs':
                logits = linear(hs_last, A, use_class_bias, wd=wd)
            elif class_mode == 'hss':
                logits = linear(sum(hs_last), A, use_class_bias, wd=wd)
            else:
                raise Exception("Invalid class mode: {}".format(class_mode))

            for i in range(self.num_ans):
                yp_each = tf.cast(tf.expand_dims(tf.argmax(logits[i], 1), 1),
                                  'int32')
                if i == 0: yp = yp_each
                else: yp = tf.concat(1, [yp, yp_each])

            correct_ = tf.cast(tf.equal(yp, y), 'float')
            correct_sum = tf.reduce_sum(correct_ * tf.cast(y_mask, 'float'), 1)
            mask_ = tf.reduce_sum(tf.cast(y_mask, 'float'), 1)
            correct = tf.truediv(correct_sum, mask_)
            tensors['yp'] = yp
            tensors['correct_'] = correct_
            tensors['mask_'] = mask_
            tensors['y_mask'] = y_mask
            tensors['y'] = y
            tensors['correct'] = correct
            tensors['q'] = q
            if self.task > 20:
                tensors['y_state'] = y_state
                for i, j in enumerate(self.ans[1:]):
                    tensors['y_feat' + str(i)] = tf.reshape(
                        y_feats[i], [N, 2 * Alist[j]])

        with tf.name_scope("loss"):
            with tf.name_scope("ans_loss"):
                tot_ce = 0

                for i in range(self.num_ans):
                    _y = tf.gather(tf.transpose(y), i)
                    ce = tf.nn.sparse_softmax_cross_entropy_with_logits(
                        logits[i], _y)
                    m = tf.cast(tf.gather(tf.transpose(y_mask), i), 'float32')
                    tot_ce += tf.reduce_sum(ce * m, name='avg_ce')

                tf.add_to_collection('losses', tot_ce)

            losses = tf.get_collection('losses')
            loss = tf.add_n(losses, name='loss')
            tensors['loss'] = loss

        variables_dict['all'] = tf.trainable_variables()
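
The `class` scope above predicts one answer per slot and scores only the slots that are actually present: per-slot correctness is multiplied by `y_mask` and normalised by the number of unmasked slots. A NumPy sketch of that masked accuracy (names are illustrative):

import numpy as np

def masked_accuracy(yp, y, y_mask):
    """yp, y: [N, num_ans] int predictions/targets; y_mask: [N, num_ans] bool.
    Assumes every row has at least one valid slot."""
    correct = (yp == y).astype(float) * y_mask.astype(float)
    return correct.sum(axis=1) / y_mask.astype(float).sum(axis=1)  # [N]

yp = np.array([[1, 2, 0], [3, 3, 3]])
y = np.array([[1, 2, 9], [3, 0, 0]])
mask = np.array([[1, 1, 0], [1, 1, 1]], dtype=bool)
print(masked_accuracy(yp, y, mask))   # [1.0, 0.3333...]
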
Example n. 22
0
    def _build_forward(self):
        config = self.config
        N, M, JX, JQ, VW, VC, d, dc, W = \
            config.batch_size, config.max_num_sents, config.max_sent_size, \
            config.max_ques_size, config.word_vocab_size, config.char_vocab_size, config.hidden_size, \
            config.char_emb_size, config.max_word_size
        H = config.max_tree_height

        x_mask = self.x > 0
        q_mask = self.q > 0
        tx_mask = self.tx > 0  # [N, M, H, JX]

        with tf.variable_scope("char_emb"):
            char_emb_mat = tf.get_variable("char_emb_mat", shape=[VC, dc], dtype='float')
            Acx = tf.nn.embedding_lookup(char_emb_mat, self.cx)  # [N, M, JX, W, dc]
            Acq = tf.nn.embedding_lookup(char_emb_mat, self.cq)  # [N, JQ, W, dc]

            filter = tf.get_variable("filter", shape=[1, config.char_filter_height, dc, d], dtype='float')
            bias = tf.get_variable("bias", shape=[d], dtype='float')
            strides = [1, 1, 1, 1]
            Acx = tf.reshape(Acx, [-1, JX, W, dc])
            Acq = tf.reshape(Acq, [-1, JQ, W, dc])
            xxc = tf.nn.conv2d(Acx, filter, strides, "VALID") + bias  # [N*M, JX, W/filter_stride, d]
            qqc = tf.nn.conv2d(Acq, filter, strides, "VALID") + bias  # [N, JQ, W/filter_stride, d]
            xxc = tf.reshape(tf.reduce_max(tf.nn.relu(xxc), 2), [-1, M, JX, d])
            qqc = tf.reshape(tf.reduce_max(tf.nn.relu(qqc), 2), [-1, JQ, d])

        with tf.variable_scope("word_emb"):
            if config.mode == 'train':
                word_emb_mat = tf.get_variable("word_emb_mat", dtype='float', shape=[VW, config.word_emb_size], initializer=get_initializer(config.emb_mat))
            else:
                word_emb_mat = tf.get_variable("word_emb_mat", shape=[VW, config.word_emb_size], dtype='float')
            Ax = tf.nn.embedding_lookup(word_emb_mat, self.x)  # [N, M, JX, d]
            Aq = tf.nn.embedding_lookup(word_emb_mat, self.q)  # [N, JQ, d]
            # Ax = linear([Ax], d, False, scope='Ax_reshape')
            # Aq = linear([Aq], d, False, scope='Aq_reshape')

        xx = tf.concat(3, [xxc, Ax])  # [N, M, JX, 2d]
        qq = tf.concat(2, [qqc, Aq])  # [N, JQ, 2d]
        D = d + config.word_emb_size

        with tf.variable_scope("pos_emb"):
            pos_emb_mat = tf.get_variable("pos_emb_mat", shape=[config.pos_vocab_size, d], dtype='float')
            Atx = tf.nn.embedding_lookup(pos_emb_mat, self.tx)  # [N, M, H, JX, d]

        cell = BasicLSTMCell(D, state_is_tuple=True)
        cell = SwitchableDropoutWrapper(cell, self.is_train, input_keep_prob=config.input_keep_prob)
        x_len = tf.reduce_sum(tf.cast(x_mask, 'int32'), 2)  # [N, M]
        q_len = tf.reduce_sum(tf.cast(q_mask, 'int32'), 1)  # [N]

        with tf.variable_scope("rnn"):
            (fw_h, bw_h), _ = bidirectional_dynamic_rnn(cell, cell, xx, x_len, dtype='float', scope='start')  # [N, M, JX, 2d]
            tf.get_variable_scope().reuse_variables()
            (fw_us, bw_us), (_, (fw_u, bw_u)) = bidirectional_dynamic_rnn(cell, cell, qq, q_len, dtype='float', scope='start')  # [N, J, d], [N, d]
            u = (fw_u + bw_u) / 2.0
            h = (fw_h + bw_h) / 2.0

        with tf.variable_scope("h"):
            no_op_cell = NoOpCell(D)
            tree_rnn_cell = TreeRNNCell(no_op_cell, d, tf.reduce_max)
            initial_state = tf.reshape(h, [N*M*JX, D])  # [N*M*JX, D]
            inputs = tf.concat(4, [Atx, tf.cast(self.tx_edge_mask, 'float')])  # [N, M, H, JX, d+JX]
            inputs = tf.reshape(tf.transpose(inputs, [0, 1, 3, 2, 4]), [N*M*JX, H, d + JX])  # [N*M*JX, H, d+JX]
            length = tf.reshape(tf.reduce_sum(tf.cast(tx_mask, 'int32'), 2), [N*M*JX])
            # length = tf.reshape(tf.reduce_sum(tf.cast(tf.transpose(tx_mask, [0, 1, 3, 2]), 'float'), 3), [-1])
            h, _ = dynamic_rnn(tree_rnn_cell, inputs, length, initial_state=initial_state)  # [N*M*JX, H, D]
            h = tf.transpose(tf.reshape(h, [N, M, JX, H, D]), [0, 1, 3, 2, 4])  # [N, M, H, JX, D]

        u = tf.expand_dims(tf.expand_dims(tf.expand_dims(u, 1), 1), 1)  # [N, 1, 1, 1, 4d]
        dot = linear(h * u, 1, True, squeeze=True, scope='dot')  # [N, M, H, JX]
        # self.logits = tf.reshape(dot, [N, M * H * JX])
        self.logits = tf.reshape(exp_mask(dot, tx_mask), [N, M * H * JX])  # [N, M, H, JX]
        self.yp = tf.reshape(tf.nn.softmax(self.logits), [N, M, H, JX])
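
`exp_mask` above pushes the logits of padded tree positions to a very large negative value before the softmax, so masked positions get essentially zero probability. A minimal NumPy version of that masking trick, assuming a boolean mask that is `True` for valid positions (the constant used here is illustrative):

import numpy as np

VERY_NEGATIVE = -1e30

def exp_mask(logits, mask):
    """Add a huge negative number to the logits of masked-out positions."""
    return logits + (1.0 - mask.astype(float)) * VERY_NEGATIVE

def softmax(x):
    e = np.exp(x - x.max(axis=-1, keepdims=True))
    return e / e.sum(axis=-1, keepdims=True)

logits = np.array([2.0, 1.0, 3.0, 0.5])
mask = np.array([True, True, False, False])
print(softmax(exp_mask(logits, mask)))   # probability mass only on the first two entries
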
Example n. 23
0
    def initialize(self):
        params = self.params
        placeholders = self.placeholders
        tensors = self.tensors
        variables_dict = self.variables_dict
        N, J, V, Q, M = params.batch_size, params.max_sent_size, params.vocab_size, params.max_ques_size, params.mem_size
        d = params.hidden_size
        L = params.mem_num_layers
        forget_bias = params.forget_bias
        wd = params.wd
        initializer = tf.random_uniform_initializer(-np.sqrt(3), np.sqrt(3))
        with tf.name_scope("placeholders"):
            x = tf.placeholder('int32', shape=[N, M, J], name='x')
            x_mask = tf.placeholder('bool', shape=[N, M, J], name='x_mask')
            q = tf.placeholder('int32', shape=[N, J], name='q')
            q_mask = tf.placeholder('bool', shape=[N, J], name='q_mask')
            y = tf.placeholder('int32', shape=[N], name='y')
            is_train = tf.placeholder('bool', shape=[], name='is_train')
            placeholders['x'] = x
            placeholders['x_mask'] = x_mask
            placeholders['q'] = q
            placeholders['q_mask'] = q_mask
            placeholders['y'] = y
            placeholders['is_train'] = is_train

        with tf.variable_scope("embedding"):
            A = VariableEmbedder(params,
                                 wd=wd,
                                 initializer=initializer,
                                 name='A')
            Aq = A(q, name='Aq')  # [N, J, d]
            Ax = A(x, name='Ax')  # [N, S, J, d]

        with tf.name_scope("encoding"):
            encoder = PositionEncoder(J, d)
            u = encoder(Aq, q_mask)  # [N, d]
            m = encoder(Ax, x_mask)  # [N, M, d]

        with tf.variable_scope("networks"):
            m_mask = tf.reduce_max(tf.cast(x_mask, 'int64'), 2,
                                   name='m_mask')  # [N, M]
            m_length = tf.reduce_sum(m_mask, 1, name='m_length')  # [N]
            initializer = tf.random_uniform_initializer(
                -np.sqrt(3), np.sqrt(3))
            cell = RSMCell(d,
                           forget_bias=forget_bias,
                           wd=wd,
                           initializer=initializer)
            us = tf.tile(tf.expand_dims(u, 1, name='u_prev_aug'),
                         [1, M, 1])  # [N, d] -> [N, M, d]
            in_ = tf.concat(
                2, [tf.ones([N, M, 1]), m, us,
                    tf.zeros([N, M, 2 * d])],
                name='x_h_in')  # [N, M, 4*d + 1]
            out, fw_state, bw_state, bi_tensors = dynamic_bidirectional_rnn(
                cell,
                in_,
                sequence_length=m_length,
                dtype='float',
                num_layers=L)
            a = tf.slice(out, [0, 0, 0], [-1, -1, 1])  # [N, M, 1]
            _, _, v, g = tf.split(2, 4, tf.slice(out, [0, 0, 1], [-1, -1, -1]))
            fw_h, fw_v = tf.split(1, 2, tf.slice(fw_state, [0, 1], [-1, -1]))
            bw_h, bw_v = tf.split(1, 2, tf.slice(bw_state, [0, 1], [-1, -1]))

            _, fw_u_out, fw_v_out, _ = tf.split(
                2, 4,
                tf.squeeze(
                    tf.slice(bi_tensors['fw_out'], [0, L - 1, 0, 2],
                             [-1, -1, -1, -1]), [1]))
            _, bw_u_out, bw_v_out, _ = tf.split(
                2, 4,
                tf.squeeze(
                    tf.slice(bi_tensors['bw_out'], [0, L - 1, 0, 2],
                             [-1, -1, -1, -1]), [1]))

            tensors['a'] = tf.squeeze(
                tf.slice(bi_tensors['in'], [0, 0, 0, 0], [-1, -1, -1, 1]), [3])
            tensors['of'] = tf.squeeze(
                tf.slice(bi_tensors['fw_out'], [0, 0, 0, 1], [-1, -1, -1, 1]),
                [3])
            tensors['ob'] = tf.squeeze(
                tf.slice(bi_tensors['bw_out'], [0, 0, 0, 1], [-1, -1, -1, 1]),
                [3])

        with tf.variable_scope("selection"):
            # w = tf.nn.relu(linear([fw_v + 1e-9*(fw_h+bw_h)], d, True, wd=wd))
            w = fw_v + 1e-9 * (fw_h + bw_h)
            tensors['s'] = a

        with tf.variable_scope("class"):
            if params.use_ques:
                logits = linear([w, u], V, True, wd=wd)
            else:
                # W = tf.transpose(A.emb_mat, name='W')
                W = tf.get_variable('W', shape=[d, V])
                logits = tf.matmul(w, W, name='logits')
            yp = tf.cast(tf.argmax(logits, 1), 'int32')
            correct = tf.equal(yp, y)
            tensors['yp'] = yp
            tensors['correct'] = correct

        with tf.name_scope("loss"):
            with tf.name_scope("ans_loss"):
                ce = tf.nn.sparse_softmax_cross_entropy_with_logits(logits,
                                                                    y,
                                                                    name='ce')
                avg_ce = tf.reduce_mean(ce, name='avg_ce')
                tf.add_to_collection('losses', avg_ce)

            losses = tf.get_collection('losses')
            loss = tf.add_n(losses, name='loss')
            tensors['loss'] = loss

        variables_dict['all'] = tf.trainable_variables()
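
`PositionEncoder` reduces the [J, d] word embeddings of a sentence to a single [d] vector with a position-dependent weighting before the masked sum. Its exact weights are defined elsewhere in the repository; the sketch below assumes the usual end-to-end memory network position encoding, which is only a guess at that implementation:

import numpy as np

def position_encoding(J, d):
    """Memory-network style position-encoding weights, shape [J, d]
    (an assumption about PositionEncoder, not taken from this repository)."""
    l = np.zeros((J, d))
    for j in range(1, J + 1):          # word position, 1-based
        for k in range(1, d + 1):      # embedding dimension, 1-based
            l[j - 1, k - 1] = (1 - j / J) - (k / d) * (1 - 2 * j / J)
    return l

def encode(Ax, mask, weights):
    """Ax: [J, d] word embeddings, mask: [J] bool -> [d] sentence vector."""
    return (Ax * weights * mask[:, None]).sum(axis=0)

J, d = 6, 4
Ax = np.random.randn(J, d)
mask = np.array([True] * 4 + [False] * 2)
print(encode(Ax, mask, position_encoding(J, d)).shape)  # (4,)
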
Example n. 24
0
    def initialize(self):
        params = self.params
        placeholders = self.placeholders
        tensors = self.tensors
        variables_dict = self.variables_dict

        self.task = int(params.task)
        self.dstc = self.task%10 == 6
        self.match = params.use_match
        self.rnn = params.use_rnn
	

        N, J, Q, M = params.batch_size, params.max_sent_size, params.max_ques_size, params.mem_size
        V, Alist = params.vocab_size

        d = params.hidden_size
        L = params.mem_num_layers
        att_forget_bias = params.att_forget_bias
        use_vector_gate = params.use_vector_gate
        wd = params.wd
        initializer = tf.random_uniform_initializer(-np.sqrt(3), np.sqrt(3))
	
        self.ans_dic = {
            1: range(5), 2: range(5),
            3: [0, 5], 4: [0, 6, 7], 5: range(8), 6: range(11)
        }
        self.num_candidate = Alist[0]+1
        data_task = self.task%10 if not self.rnn else self.task
        self.ans = self.ans_dic.get(data_task, [0])
        self.num_ans = len(self.ans)
        if self.rnn and self.task==3 : self.num_ans = 3
        elif self.rnn and self.task==4: self.num_ans = 4
        elif self.rnn: self.num_ans = 6


        with tf.name_scope("placeholders"):
            x = tf.placeholder('int32', shape=[N, M, J], name='x')
            x_mask = tf.placeholder('bool', shape=[N, M, J], name='x_mask')
            q = tf.placeholder('int32', shape=[N, J], name='q')
            q_mask = tf.placeholder('bool', shape=[N, J], name='q_mask')
            y = tf.placeholder('int32', shape=[N, self.num_ans], name='y')
            y_mask = tf.placeholder('bool', shape=[N, self.num_ans], name='y_mask')
            y_feats = []
            for i in self.ans[1:]:
                A = Alist[0] if self.rnn else Alist[i]
                y_feats.append(tf.placeholder('int32', shape=[N, 2, A], name='y_feat'+str(i)))
            self.y_state_dim = self.num_ans-2 if self.rnn else self.num_ans-1
            y_state =tf.placeholder('bool', shape=[N, self.y_state_dim], name='y_state')
            is_train = tf.placeholder('bool', shape=[], name='is_train')
            
            placeholders['x'] = x
            placeholders['x_mask'] = x_mask
            placeholders['q'] = q
            placeholders['q_mask'] = q_mask
            placeholders['y'] = y
            placeholders['y_mask'] = y_mask
            placeholders['y_feats'] = y_feats
            placeholders['y_state'] = y_state
            placeholders['is_train'] = is_train


        with tf.variable_scope("embedding"):
            A = VariableEmbedder(params, wd=wd, initializer=initializer, name='A')
            Aq = A(q, name='Aq')  # [N, J, d]
            Ax = A(x, name='Ax')  # [N, S, J, d]

        with tf.name_scope("encoding"):
            encoder = PositionEncoder(J, d)
            u = encoder(Aq, q_mask)  # [N, d]
            m = encoder(Ax, x_mask)  # [N, M, d]

        with tf.variable_scope("networks"):
            m_mask = tf.reduce_max(tf.cast(x_mask, 'int64'), 2, name='m_mask')  # [N, M]
            gate_mask = tf.expand_dims(m_mask, -1)
            m_length = tf.reduce_sum(m_mask, 1, name='m_length')  # [N]
            prev_u = tf.tile(tf.expand_dims(u, 1), [1, M, 1])  # [N, M, d]
            reg_layer = VectorReductionLayer(N, M, d) if use_vector_gate else ReductionLayer(N, M, d)
            gate_size = d if use_vector_gate else 1
            h = None  # [N, M, d]
            as_, rfs, rbs = [], [], []
            hs = []
            for layer_idx in range(L):
                with tf.name_scope("layer_{}".format(layer_idx)):
                    u_t = tf.tanh(linear([prev_u, m], d, True, wd=wd, scope='u_t'))
                    a = tf.cast(gate_mask, 'float') * tf.sigmoid(linear([prev_u * m], gate_size, True, initializer=initializer, wd=wd, scope='a') - att_forget_bias)
                    h = reg_layer(u_t, a, 1.0-a, scope='h')
                    if layer_idx + 1 < L:
                        if params.use_reset:
                            rf, rb = tf.split(2, 2, tf.cast(gate_mask, 'float') *
                                tf.sigmoid(linear([prev_u * m], 2 * gate_size, True, initializer=initializer, wd=wd, scope='r')))
                        else:
                            rf = rb = tf.ones(a.get_shape().as_list())
                        u_t_rev = tf.reverse_sequence(u_t, m_length, 1)
                        a_rev, rb_rev = tf.reverse_sequence(a, m_length, 1), tf.reverse_sequence(rb, m_length, 1)
                        uf = reg_layer(u_t, a*rf, 1.0-a, scope='uf')
                        ub_rev = reg_layer(u_t_rev, a_rev*rb_rev, 1.0-a_rev, scope='ub_rev')
                        ub = tf.reverse_sequence(ub_rev, m_length, 1)
                        prev_u = uf + ub
                    else:
                        rf = rb = tf.zeros(a.get_shape().as_list())
                    rfs.append(rf)
                    rbs.append(rb)
                    as_.append(a)
                    hs.append(h)
                    tf.get_variable_scope().reuse_variables()

            h_last = tf.squeeze(tf.slice(h, [0, M-1, 0], [-1, -1, -1]), [1])  # [N, d]
            hs_last = [tf.squeeze(tf.slice(each, [0, M-1, 0], [-1, -1, -1]), [1]) for each in hs]
            a = tf.transpose(tf.pack(as_, name='a'), [1, 0, 2, 3])
            rf = tf.transpose(tf.pack(rfs, name='rf'), [1, 0, 2, 3])
            rb = tf.transpose(tf.pack(rbs, name='rb'), [1, 0, 2, 3])
            tensors['a'] = a
            tensors['rf'] = rf
            tensors['rb'] = rb

        with tf.variable_scope("class"):
            class_mode = params.class_mode
            use_class_bias = params.use_class_bias
            logits = []
            drop_rate = tf.cond(is_train, lambda: tf.constant(0.5),
                                lambda: tf.constant(1.0))

            if class_mode == 'h':

                if self.rnn: # rnn decoder
                    hiddens = [] # previous hidden vector
                    A = self.num_candidate
                    for i in range(self.num_ans):
                        # Inverse Embedding Matrix of Answers [A, A]
                        E_inv = tf.get_variable("E_inv", [A, A], initializer = tf.constant_initializer(0.0))
                        prev_h = h_last
                        if i==0:
                            # If it is the first answer, use initial y
                            prev_y = tf.reshape(tf.tile(tf.get_variable("Wx", A, initializer = tf.constant_initializer(0.0)), [N]), [N, A])
                        else:
                            # Otherwise, use Inverse Embedding Matrix
                            _prev_y = tf.reshape(tf.gather(tf.transpose(y), i-1), [N])
                            prev_y = tf.nn.embedding_lookup(E_inv, _prev_y)
                            #prev_h = hiddens[-1]
                        _logit = linear([prev_h], A, use_class_bias, wd=wd, name='0')
                        logit = _logit * prev_y
                        hiddens.append(prev_h)
                        logits.append(logit)
                        
                        tf.get_variable_scope().reuse_variables()
                else:
                    if self.match:
                        # Input of softmax when using match
                        all_y_feats = [None] + y_feats
                        all_y_states = [y_state] + [None]*(len(all_y_feats)-1)

                    for i, j in enumerate(self.ans):
                        if self.match:
                            logits.append(linear([h_last], Alist[j], use_class_bias, wd=wd, name=str(i), feat = all_y_feats[i], state = all_y_states[i], drop_rate = drop_rate))

                        else:
                            logits.append(linear([h_last], Alist[j], use_class_bias, wd=wd, name=str(i) ))
            elif class_mode == 'uh':
                logits = linear([h_last, u], A, use_class_bias, wd=wd)
            elif class_mode == 'hs':
                logits = linear(hs_last, A, use_class_bias, wd=wd)
            elif class_mode == 'hss':
                logits = linear(sum(hs_last), A, use_class_bias, wd=wd)
            else:
                raise Exception("Invalid class mode: {}".format(class_mode))

	    
            for i in range(self.num_ans):
                yp_each = tf.cast(tf.expand_dims(tf.argmax(logits[i], 1), 1), 'int32')
                if i == 0: yp = yp_each
                else: yp = tf.concat(1, [yp, yp_each])
	    
            correct_ = tf.cast(tf.equal(yp, y), 'float')
            correct_sum = tf.reduce_sum(correct_ * tf.cast(y_mask, 'float'), 1)
            mask_ = tf.reduce_sum(tf.cast(y_mask, 'float'), 1)
            correct = tf.truediv(correct_sum, mask_)
            tensors['yp'] = yp
            tensors['correct_'] = correct_
            tensors['mask_'] = mask_
            tensors['y_mask'] = y_mask
            tensors['y'] = y
            tensors['correct'] = correct
            tensors['q'] = q
            if self.task>20:
                tensors['y_state'] = y_state
                for i, j in enumerate(self.ans[1:]):
                    tensors['y_feat'+str(i)] = tf.reshape(y_feats[i], [N, 2*Alist[j]])

        with tf.name_scope("loss"):
            with tf.name_scope("ans_loss"):
                tot_ce = 0

                for i in range(self.num_ans):
                    _y = tf.gather(tf.transpose(y), i)
                    ce = tf.nn.sparse_softmax_cross_entropy_with_logits(logits[i], _y)
                    m = tf.cast(tf.gather(tf.transpose(y_mask), i), 'float32')
                    tot_ce += tf.reduce_sum(ce*m, name='avg_ce')

                tf.add_to_collection('losses', tot_ce)

            losses = tf.get_collection('losses')
            loss = tf.add_n(losses, name='loss')
            tensors['loss'] = loss

        variables_dict['all'] = tf.trainable_variables()
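
In the `rnn` decoder branch above, the logit for answer slot i is a shared projection of the final hidden state gated element-wise by a vector looked up from `E_inv` with the previous answer (the training graph uses the gold previous answer from `y`). The toy NumPy sketch below substitutes a greedy argmax for that teacher forcing and uses illustrative parameter names:

import numpy as np

def decode_answers(h_last, W, b, E_inv, w0, num_ans):
    """h_last: [N, d]; W: [d, A]; b: [A]; E_inv: [A, A]; w0: [A] initial gate.
    Returns greedy predictions of shape [N, num_ans]."""
    N = h_last.shape[0]
    prev_y = np.tile(w0, (N, 1))                 # gate for the first slot
    preds = []
    for _ in range(num_ans):
        logit = (h_last @ W + b) * prev_y        # shared projection, gated per slot
        yp = logit.argmax(axis=1)                # [N]
        preds.append(yp)
        prev_y = E_inv[yp]                       # next gate comes from the chosen answer
    return np.stack(preds, axis=1)

N, d, A = 2, 8, 5
out = decode_answers(np.random.randn(N, d), np.random.randn(d, A), np.zeros(A),
                     np.random.randn(A, A), np.ones(A), num_ans=3)
print(out.shape)  # (2, 3)
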