    def scaled_dot_product_attention(q, k, v, attn_bias, d_model, dropout_rate):
        """
        Scaled Dot-Product Attention
        """

        # FIXME(guosheng): Optimize the shape in reshape_op or softmax_op.

        # The current implementation of softmax_op only supports 2D tensors,
        # so it cannot be used directly here. Nor can reshape_op help: the
        # shape of `product` inferred at compile time is not the actual
        # run-time shape, so it cannot be used to set reshape_op's shape
        # attribute. Hence softmax is defined here as a temporary workaround.

        def __softmax(x):
            exp_out = layers.exp(x=x)
            sum_out = layers.reduce_sum(exp_out, dim=-1, keep_dim=False)
            return layers.elementwise_div(x=exp_out, y=sum_out, axis=0)

        scaled_q = layers.scale(x=q, scale=d_model**-0.5)
        product = layers.matmul(x=scaled_q, y=k, transpose_y=True)
        weights = __softmax(layers.elementwise_add(x=product, y=attn_bias))
        if dropout_rate:
            weights = layers.dropout(
                weights, dropout_prob=dropout_rate, is_test=False)
        out = layers.matmul(weights, v)
        return out
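For reference, the same computation in plain NumPy (an illustrative sketch, not part of the original code; the max-subtraction is the usual numerically stable softmax, which the temporary workaround above omits):

import numpy as np

def attention_reference(q, k, v, attn_bias, d_model):
    # q, k, v: [batch, seq_len, d]; attn_bias broadcastable to [batch, seq_len, seq_len]
    product = (q * d_model ** -0.5) @ k.transpose(0, 2, 1) + attn_bias
    exp = np.exp(product - product.max(axis=-1, keepdims=True))
    weights = exp / exp.sum(axis=-1, keepdims=True)
    return weights @ v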
    def create_rnn_op(self):
        x = layers.data(
            shape=[self.sent_len, self.batch_size, self.input_dim],
            dtype='float32',
            name='x',
            append_batch_size=False,
            **self.p_info)
        x.stop_gradient = False
        h_boot = layers.data(
            shape=[self.input_dim],
            dtype='float32',
            name='h_boot',
            **self.p_info)
        h_boot.stop_gradient = False

        rnn = layers.StaticRNN(main_program=self.main_program)
        with rnn.step():
            h_pre = rnn.memory(init=h_boot)
            x_t = rnn.step_input(x)

            h = layers.scale(
                x=layers.elementwise_add(
                    x=h_pre, y=x_t, **self.p_info),
                scale=self.py_rnn.scale,
                **self.p_info)

            rnn.update_memory(h_pre, h)
            rnn.output(h)

        return rnn()
    def create_rnn_op(self):
        x = layers.data(
            shape=[self.sent_len, self.batch_size, self.input_dim],
            dtype='float32',
            name='x',
            append_batch_size=False,
            **self.p_info)
        x.stop_gradient = False

        rnn = layers.StaticRNN(main_program=self.main_program)
        with rnn.step():
            mem_pre = rnn.memory(shape=[-1, self.input_dim], batch_ref=x)
            x_t = rnn.step_input(x)
            mem = layers.elementwise_add(x=mem_pre, y=x_t, **self.p_info)
            rnn.update_memory(mem_pre, mem)
            rnn.output(mem)

        return rnn()
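Both create_rnn_op variants above unroll the same elementwise recurrence over the time-major input; a NumPy sketch of the first variant (the second starts from a zero memory instead of h_boot and omits the scale):

import numpy as np

def static_rnn_reference(x, h_boot, scale=1.0):
    # x: [sent_len, batch_size, input_dim]; h_boot: [batch_size, input_dim]
    h, outs = h_boot, []
    for x_t in x:              # one StaticRNN step per time slice
        h = scale * (h + x_t)  # elementwise_add followed by scale
        outs.append(h)
    return np.stack(outs)      # stacked step outputs, as returned by rnn()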
Example #4
            def fibonacci(channel, quit_channel):
                while_op = While(cond=while_cond)
                with while_op.block():
                    result2 = fill_constant(
                        shape=[1], dtype=core.VarDesc.VarType.INT32, value=0)

                    with fluid.Select() as select:
                        with select.case(
                                fluid.channel_send, channel, x, is_copy=True):
                            assign(input=x, output=x_tmp)
                            assign(input=y, output=x)
                            assign(elementwise_add(x=x_tmp, y=y), output=y)

                        with select.case(fluid.channel_recv, quit_channel,
                                         result2):
                            # Quit
                            helper = layer_helper.LayerHelper('assign')
                            helper.append_op(
                                type='assign',
                                inputs={'X': [while_false]},
                                outputs={'Out': [while_cond]})
    def create_rnn_op(self):
        x = layers.data(
            shape=[self.sent_len, self.batch_size, self.input_dim],
            dtype='float32',
            name='x',
            append_batch_size=False,
            **self.p_info)
        x.stop_gradient = False
        h_boot = layers.data(
            shape=[self.input_dim],
            dtype='float32',
            name='h_boot',
            **self.p_info)
        h_boot.stop_gradient = False

        rnn = layers.StaticRNN(main_program=self.main_program)
        with rnn.step():
            h_pre = rnn.memory(init=h_boot)
            x_t = rnn.step_input(x)

            temp_l = layers.fc(input=x_t,
                               size=self.input_dim,
                               param_attr='W',
                               bias_attr=False,
                               **self.p_info)
            temp_r = layers.fc(input=h_pre,
                               size=self.input_dim,
                               param_attr='U',
                               bias_attr=False,
                               **self.p_info)

            h = layers.sigmoid(
                x=layers.elementwise_add(
                    x=temp_l, y=temp_r, **self.p_info),
                **self.p_info)

            rnn.update_memory(h_pre, h)
            rnn.output(h)

        return rnn()
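This variant is a single-layer vanilla RNN without biases; each step computes h_t = sigmoid(x_t W + h_{t-1} U), where W ('W') projects the current input and U ('U') the previous hidden state.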
Example #7
    def create_rnn_op(self):
        x = layers.data(shape=[self.sent_len, self.batch_size, self.input_dim],
                        dtype='float32',
                        name='x',
                        append_batch_size=False)
        x.stop_gradient = False
        h_boot = layers.data(shape=[self.input_dim],
                             dtype='float32',
                             name='h_boot')
        h_boot.stop_gradient = False

        rnn = layers.StaticRNN()
        with rnn.step():
            h_pre = rnn.memory(init=h_boot)
            x_t = rnn.step_input(x)

            h = layers.scale(x=layers.elementwise_add(x=h_pre, y=x_t),
                             scale=self.py_rnn.scale)

            rnn.update_memory(h_pre, h)
            rnn.output(h)

        return rnn()
Example #8
    def attention(self, hidden, encoder_output, encoder_output_proj,
                  encoder_padding_mask):
        decoder_state_proj = layers.unsqueeze(
            layers.fc(hidden, size=self.hidden_size, bias_attr=False), [1])
        # tile the projected decoder state across the source sequence length
        mixed_state = fluid.layers.elementwise_add(
            encoder_output_proj,
            layers.expand(decoder_state_proj,
                          [1, layers.shape(encoder_output_proj)[1], 1]))
        # attn_scores: [batch_size, src_seq_len]
        attn_scores = layers.squeeze(
            layers.fc(input=mixed_state,
                      size=1,
                      num_flatten_dims=2,
                      bias_attr=False), [2])
        if encoder_padding_mask is not None:
            attn_scores = layers.elementwise_add(attn_scores,
                                                 encoder_padding_mask)
        attn_scores = layers.softmax(attn_scores)
        context = layers.reduce_sum(layers.elementwise_mul(encoder_output,
                                                           attn_scores,
                                                           axis=0),
                                    dim=1)
        return context
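A NumPy sketch of what this additive attention computes (illustrative only; fc_q stands in for the decoder projection matrix and fc_v for the 1-unit fc over mixed_state, both assumed given):

import numpy as np

def additive_attention_reference(hidden, encoder_output, encoder_output_proj,
                                 fc_q, fc_v, padding_mask=None):
    # hidden: [batch, hidden]; encoder_output(_proj): [batch, src_len, hidden]
    mixed = encoder_output_proj + (hidden @ fc_q)[:, None, :]
    scores = (mixed @ fc_v)[:, :, 0]          # [batch, src_len]
    if padding_mask is not None:
        scores = scores + padding_mask        # large negatives at padded positions
    scores = np.exp(scores - scores.max(axis=-1, keepdims=True))
    scores = scores / scores.sum(axis=-1, keepdims=True)
    return (encoder_output * scores[:, :, None]).sum(axis=1)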
Example #9
    def create_rnn_op(self):
        x = layers.data(shape=[self.sent_len, self.batch_size, self.input_dim],
                        dtype='float32',
                        name='x',
                        append_batch_size=False)
        x.stop_gradient = False
        h_boot = layers.data(shape=[self.input_dim],
                             dtype='float32',
                             name='h_boot')
        h_boot.stop_gradient = False

        rnn = layers.StaticRNN()
        with rnn.step():
            h_pre = rnn.memory(init=h_boot)
            x_t = rnn.step_input(x)

            temp_l = layers.fc(
                input=x_t,
                size=self.input_dim,
                param_attr=ParamAttr(
                    name='W',
                    initializer=fluid.initializer.ConstantInitializer(1.0)),
                bias_attr=False)
            temp_r = layers.fc(
                input=h_pre,
                size=self.input_dim,
                param_attr=ParamAttr(
                    name='U',
                    initializer=fluid.initializer.ConstantInitializer(0.0)),
                bias_attr=False)

            h = layers.sigmoid(x=layers.elementwise_add(x=temp_l, y=temp_r))

            rnn.update_memory(h_pre, h)
            rnn.output(h)

        return rnn()
    def define_learn(self, obs, action, reward):
        """Update the policy model self.model with the policy gradient
        algorithm. `obs` is the model input; `action` is a (tokens, adjvec)
        pair; `reward` weights the negative log-probabilities.
        """
        tokens = action[0]
        adjvec = action[1]
        with fluid.unique_name.guard():
            [_, softmax, _, sigmoid] = self.model.policy(obs)
            reshape_softmax = layers.reshape(
                softmax, [-1, self.model.parser_args.num_tokens])
            reshape_tokens = layers.reshape(tokens, [-1, 1])
            reshape_tokens.stop_gradient = True
            raw_neglogp_to = layers.softmax_with_cross_entropy(
                soft_label=False,
                logits=reshape_softmax,
                label=fluid.layers.cast(x=reshape_tokens, dtype="int64"))

            action_to_shape_sec = self.model.parser_args.num_nodes * 2
            neglogp_to = layers.reshape(
                fluid.layers.cast(raw_neglogp_to, dtype="float32"),
                [-1, action_to_shape_sec])

            adjvec = layers.cast(x=adjvec, dtype='float32')
            neglogp_ad = layers.sigmoid_cross_entropy_with_logits(x=sigmoid,
                                                                  label=adjvec)

            neglogp = layers.elementwise_add(x=layers.reduce_sum(neglogp_to,
                                                                 dim=1),
                                             y=layers.reduce_sum(neglogp_ad,
                                                                 dim=1))
            reward = layers.cast(reward, dtype="float32")
            cost = layers.reduce_mean(
                fluid.layers.elementwise_mul(x=neglogp, y=reward))
            optimizer = fluid.optimizer.Adam(learning_rate=self.lr)
            train_op = optimizer.minimize(cost)
            return cost
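The cost above is the standard score-function (REINFORCE) objective: the joint negative log-probability of the sampled action, weighted by the reward and averaged over the batch,

cost = (1/B) * sum_b [ (-log p(tokens_b | obs_b) - log p(adjvec_b | obs_b)) * reward_b ]

where the token term comes from softmax_with_cross_entropy and the adjacency term from sigmoid_cross_entropy_with_logits.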
Example #11
def gcn_layer(gw, feature, edge_features, act, name):
    """A graph convolution layer that mixes node and edge features."""
    def send_func(src_feat, dst_feat, edge_feat):
        """Message function: the source node feature plus the edge feature."""
        return src_feat["h"] + edge_feat["h"]

    size = feature.shape[-1]

    msg = gw.send(send_func,
                  nfeat_list=[("h", feature)],
                  efeat_list=[("h", edge_features)])

    output = gw.recv(msg, mean_recv)
    output = layers.fc(output,
                       size=size,
                       bias_attr=False,
                       param_attr=fluid.ParamAttr(name=name))

    bias = layers.create_parameter(shape=[size],
                                   dtype='float32',
                                   is_bias=True,
                                   name=name + '_bias')
    output = layers.elementwise_add(output, bias, act=act)
    return output
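A NumPy sketch of the message passing this layer performs, assuming mean_recv averages the incoming messages (illustrative; here act is a callable standing in for the activation named by `act`, with the fc, bias add, and activation folded in at the end):

import numpy as np

def gcn_layer_reference(edges, h, e, W, b, act=np.tanh):
    # edges: list of (src, dst) pairs; h: [num_nodes, size]; e: [num_edges, size]
    out = np.zeros_like(h)
    deg = np.zeros(len(h))
    for (s, d), ef in zip(edges, e):
        out[d] += h[s] + ef                   # send_func: node feature + edge feature
        deg[d] += 1
    out = out / np.maximum(deg, 1)[:, None]   # mean_recv: average incoming messages
    return act(out @ W + b)                   # fc (no bias), then bias + activation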
    def build_program(self, dtype):
        with fluid.program_guard(self.main_program, self.startup_program):
            self.feed_vars = self._prepare_feed_vars([32, 128], dtype, 2)
            self.feed_vars.append(
                fluid.data(name="data2", shape=[128, 128], dtype=dtype))

            # subgraph with 2 op nodes
            tmp_0 = self.feed_vars[0] * self.feed_vars[1]
            tmp_1 = layers.cast(tmp_0, dtype="float16")
            zero = layers.fill_constant(shape=[128], dtype="float16", value=0)
            # TODO(xreki): fix precision problem when using softmax of float16.
            # tmp_2 = layers.softmax(tmp_1)
            tmp_2 = layers.elementwise_add(tmp_1, zero)
            tmp_3 = layers.mul(tmp_0, self.feed_vars[2])
            # subgraph with 4 op nodes
            tmp_3 = layers.cast(tmp_2, dtype="float16")
            tmp_4 = layers.relu(tmp_1 + tmp_3)
            tmp_5 = layers.cast(tmp_4, dtype=dtype)
            tmp_3 = layers.cast(tmp_2, dtype=dtype)

        self.append_gradients(tmp_5)

        self.num_fused_ops = 4
        self.fetch_list = [tmp_5, self.grad(tmp_0)]
    def increment(cls, x, value, in_place=False):
        """Increment each element in x by value.

        Args:
            x (Variable): the tensor to increment
            value (int/float): the value to add to each element
            in_place (bool): if True, write the result back to x. Default is False.

        Returns:
            The incremented Variable (x itself when in_place is True).
        """
        if len(x.shape) == 1 and x.shape[0] == 1:
            return layers.increment(x, value, in_place)

        value_tensor = layers.fill_constant(shape=[1],
                                            dtype=x.dtype,
                                            value=value)
        y = layers.elementwise_add(x, value_tensor)
        if in_place:
            y = layers.assign(y, x)
            return x
        else:
            return y
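A hypothetical usage sketch (the owning class, here called Ops, and its @classmethod decorator are assumed): the [1]-shaped path dispatches to the native layers.increment, everything else takes the broadcasted elementwise_add path.

counter = layers.fill_constant(shape=[1], dtype='int64', value=0)
counter = Ops.increment(counter, 1, in_place=True)     # native increment path
scores = layers.fill_constant(shape=[4, 5], dtype='float32', value=0.0)
scores = Ops.increment(scores, 0.5)                    # elementwise_add path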
Example #14
def lm_model(hidden_size,
             vocab_size,
             batch_size,
             num_layers=2,
             num_steps=20,
             init_scale=0.1,
             dropout=None,
             rnn_model='static',
             use_py_reader=False):
    def padding_rnn(input_embedding, len=3, init_hidden=None, init_cell=None):
        weight_1_arr = []
        weight_2_arr = []
        bias_arr = []
        hidden_array = []
        cell_array = []
        mask_array = []
        for i in range(num_layers):
            weight_1 = layers.create_parameter(
                [hidden_size * 2, hidden_size * 4],
                dtype="float32",
                name="fc_weight1_" + str(i),
                default_initializer=fluid.initializer.UniformInitializer(
                    low=-init_scale, high=init_scale))
            weight_1_arr.append(weight_1)
            bias_1 = layers.create_parameter(
                [hidden_size * 4],
                dtype="float32",
                name="fc_bias1_" + str(i),
                default_initializer=fluid.initializer.Constant(0.0))
            bias_arr.append(bias_1)

            pre_hidden = layers.slice(init_hidden,
                                      axes=[0],
                                      starts=[i],
                                      ends=[i + 1])
            pre_cell = layers.slice(init_cell,
                                    axes=[0],
                                    starts=[i],
                                    ends=[i + 1])
            pre_hidden = layers.reshape(pre_hidden, shape=[-1, hidden_size])
            pre_cell = layers.reshape(pre_cell, shape=[-1, hidden_size])
            hidden_array.append(pre_hidden)
            cell_array.append(pre_cell)

        input_embedding = layers.transpose(input_embedding, perm=[1, 0, 2])
        rnn = PaddingRNN()

        with rnn.step():
            input = rnn.step_input(input_embedding)
            for k in range(num_layers):
                pre_hidden = rnn.memory(init=hidden_array[k])
                pre_cell = rnn.memory(init=cell_array[k])
                weight_1 = weight_1_arr[k]
                bias = bias_arr[k]

                nn = layers.concat([input, pre_hidden], 1)
                gate_input = layers.matmul(x=nn, y=weight_1)

                gate_input = layers.elementwise_add(gate_input, bias)
                # equivalent to: i, j, f, o = layers.split(gate_input, num_or_sections=4, dim=-1)
                i = layers.slice(gate_input,
                                 axes=[1],
                                 starts=[0],
                                 ends=[hidden_size])
                j = layers.slice(gate_input,
                                 axes=[1],
                                 starts=[hidden_size],
                                 ends=[hidden_size * 2])
                f = layers.slice(gate_input,
                                 axes=[1],
                                 starts=[hidden_size * 2],
                                 ends=[hidden_size * 3])
                o = layers.slice(gate_input,
                                 axes=[1],
                                 starts=[hidden_size * 3],
                                 ends=[hidden_size * 4])

                c = pre_cell * layers.sigmoid(f) + layers.sigmoid(
                    i) * layers.tanh(j)
                m = layers.tanh(c) * layers.sigmoid(o)

                rnn.update_memory(pre_hidden, m)
                rnn.update_memory(pre_cell, c)

                rnn.step_output(m)
                rnn.step_output(c)

                input = m

                if dropout is not None and dropout > 0.0:
                    input = layers.dropout(
                        input,
                        dropout_prob=dropout,
                        dropout_implementation='upscale_in_train')

            rnn.step_output(input)
        rnnout = rnn()

        last_hidden_array = []
        last_cell_array = []
        real_res = rnnout[-1]
        for i in range(num_layers):
            m = rnnout[i * 2]
            c = rnnout[i * 2 + 1]
            m.stop_gradient = True
            c.stop_gradient = True
            last_h = layers.slice(m,
                                  axes=[0],
                                  starts=[num_steps - 1],
                                  ends=[num_steps])
            last_hidden_array.append(last_h)
            last_c = layers.slice(c,
                                  axes=[0],
                                  starts=[num_steps - 1],
                                  ends=[num_steps])
            last_cell_array.append(last_c)
        real_res = layers.transpose(x=real_res, perm=[1, 0, 2])
        last_hidden = layers.concat(last_hidden_array, 0)
        last_cell = layers.concat(last_cell_array, 0)

        return real_res, last_hidden, last_cell

    def encoder_static(input_embedding,
                       len=3,
                       init_hidden=None,
                       init_cell=None):

        weight_1_arr = []
        weight_2_arr = []
        bias_arr = []
        hidden_array = []
        cell_array = []
        mask_array = []
        for i in range(num_layers):
            weight_1 = layers.create_parameter(
                [hidden_size * 2, hidden_size * 4],
                dtype="float32",
                name="fc_weight1_" + str(i),
                default_initializer=fluid.initializer.UniformInitializer(
                    low=-init_scale, high=init_scale))
            weight_1_arr.append(weight_1)
            bias_1 = layers.create_parameter(
                [hidden_size * 4],
                dtype="float32",
                name="fc_bias1_" + str(i),
                default_initializer=fluid.initializer.Constant(0.0))
            bias_arr.append(bias_1)

            pre_hidden = layers.slice(init_hidden,
                                      axes=[0],
                                      starts=[i],
                                      ends=[i + 1])
            pre_cell = layers.slice(init_cell,
                                    axes=[0],
                                    starts=[i],
                                    ends=[i + 1])
            pre_hidden = layers.reshape(pre_hidden,
                                        shape=[-1, hidden_size],
                                        inplace=True)
            pre_cell = layers.reshape(pre_cell,
                                      shape=[-1, hidden_size],
                                      inplace=True)
            hidden_array.append(pre_hidden)
            cell_array.append(pre_cell)

        res = []
        sliced_inputs = layers.split(input_embedding,
                                     num_or_sections=len,
                                     dim=1)

        for index in range(len):
            input = sliced_inputs[index]
            input = layers.reshape(input,
                                   shape=[-1, hidden_size],
                                   inplace=True)
            for k in range(num_layers):
                pre_hidden = hidden_array[k]
                pre_cell = cell_array[k]
                weight_1 = weight_1_arr[k]
                bias = bias_arr[k]

                nn = layers.concat([input, pre_hidden], 1)
                gate_input = layers.matmul(x=nn, y=weight_1)

                gate_input = layers.elementwise_add(gate_input, bias)
                i, j, f, o = layers.split(gate_input,
                                          num_or_sections=4,
                                          dim=-1)

                try:
                    from paddle.fluid.contrib.layers import fused_elemwise_activation
                    # layers.sigmoid(i) * layers.tanh(j)
                    tmp0 = fused_elemwise_activation(
                        x=layers.tanh(j),
                        y=i,
                        functor_list=['elementwise_mul', 'sigmoid'])
                    # pre_cell * layers.sigmoid(f)
                    tmp1 = fused_elemwise_activation(
                        x=pre_cell,
                        y=f,
                        functor_list=['elementwise_mul', 'sigmoid'])
                    c = tmp0 + tmp1
                    # layers.tanh(c) * layers.sigmoid(o)
                    m = fused_elemwise_activation(
                        x=layers.tanh(c),
                        y=o,
                        functor_list=['elementwise_mul', 'sigmoid'])
                except ImportError:
                    c = pre_cell * layers.sigmoid(f) + layers.sigmoid(
                        i) * layers.tanh(j)
                    m = layers.tanh(c) * layers.sigmoid(o)

                hidden_array[k] = m
                cell_array[k] = c
                input = m

                if dropout is not None and dropout > 0.0:
                    input = layers.dropout(
                        input,
                        dropout_prob=dropout,
                        dropout_implementation='upscale_in_train')

            res.append(input)

        last_hidden = layers.concat(hidden_array, 1)
        last_hidden = layers.reshape(last_hidden,
                                     shape=[-1, num_layers, hidden_size],
                                     inplace=True)
        last_hidden = layers.transpose(x=last_hidden, perm=[1, 0, 2])

        last_cell = layers.concat(cell_array, 1)
        last_cell = layers.reshape(last_cell,
                                   shape=[-1, num_layers, hidden_size])
        last_cell = layers.transpose(x=last_cell, perm=[1, 0, 2])

        real_res = layers.concat(res, 0)
        real_res = layers.reshape(real_res,
                                  shape=[len, -1, hidden_size],
                                  inplace=True)
        real_res = layers.transpose(x=real_res, perm=[1, 0, 2])

        return real_res, last_hidden, last_cell

    batch_size_each = batch_size // fluid.core.get_cuda_device_count()
    if use_py_reader:
        feed_shapes = [[batch_size_each, num_steps, 1],
                       [batch_size_each * num_steps, 1]]
        py_reader = fluid.layers.py_reader(capacity=16,
                                           shapes=feed_shapes,
                                           dtypes=['int64', 'int64'])
        x, y = fluid.layers.read_file(py_reader)
    else:
        x = layers.data(name="x",
                        shape=[batch_size_each, num_steps, 1],
                        dtype='int64',
                        append_batch_size=False)
        y = layers.data(name="y",
                        shape=[batch_size_each * num_steps, 1],
                        dtype='int64',
                        append_batch_size=False)

    init_hidden = layers.data(name="init_hidden",
                              shape=[num_layers, batch_size_each, hidden_size],
                              dtype='float32',
                              append_batch_size=False)
    init_cell = layers.data(name="init_cell",
                            shape=[num_layers, batch_size_each, hidden_size],
                            dtype='float32',
                            append_batch_size=False)

    init_cell.persistable = True
    init_hidden.persistable = True

    init_hidden = layers.reshape(init_hidden,
                                 shape=[num_layers, -1, hidden_size])
    init_cell = layers.reshape(init_cell, shape=[num_layers, -1, hidden_size])

    x_emb = layers.embedding(
        input=x,
        size=[vocab_size, hidden_size],
        dtype='float32',
        is_sparse=False,
        param_attr=fluid.ParamAttr(
            name='embedding_para',
            initializer=fluid.initializer.UniformInitializer(low=-init_scale,
                                                             high=init_scale)))

    x_emb = layers.reshape(x_emb,
                           shape=[-1, num_steps, hidden_size],
                           inplace=True)
    if dropout is not None and dropout > 0.0:
        x_emb = layers.dropout(x_emb,
                               dropout_prob=dropout,
                               dropout_implementation='upscale_in_train')

    if rnn_model == "padding":
        rnn_out, last_hidden, last_cell = padding_rnn(x_emb,
                                                      len=num_steps,
                                                      init_hidden=init_hidden,
                                                      init_cell=init_cell)
    elif rnn_model == "static":
        rnn_out, last_hidden, last_cell = encoder_static(
            x_emb, len=num_steps, init_hidden=init_hidden, init_cell=init_cell)
    elif rnn_model == "cudnn":
        x_emb = layers.transpose(x_emb, perm=[1, 0, 2])
        rnn_out, last_hidden, last_cell = layers.lstm(
            x_emb,
            init_hidden,
            init_cell,
            num_steps,
            hidden_size,
            num_layers,
            is_bidirec=False,
            default_initializer=fluid.initializer.UniformInitializer(
                low=-init_scale, high=init_scale))
        rnn_out = layers.transpose(rnn_out, perm=[1, 0, 2])
    else:
        print("type not support")
        return

    rnn_out = layers.reshape(rnn_out,
                             shape=[-1, num_steps, hidden_size],
                             inplace=True)

    softmax_weight = layers.create_parameter(
        [hidden_size, vocab_size],
        dtype="float32",
        name="softmax_weight",
        default_initializer=fluid.initializer.UniformInitializer(
            low=-init_scale, high=init_scale))
    softmax_bias = layers.create_parameter(
        [vocab_size],
        dtype="float32",
        name='softmax_bias',
        default_initializer=fluid.initializer.UniformInitializer(
            low=-init_scale, high=init_scale))

    projection = layers.matmul(rnn_out, softmax_weight)
    projection = layers.elementwise_add(projection, softmax_bias)
    projection = layers.reshape(projection,
                                shape=[-1, vocab_size],
                                inplace=True)

    loss = layers.softmax_with_cross_entropy(logits=projection,
                                             label=y,
                                             soft_label=False)

    loss = layers.reshape(loss, shape=[-1, num_steps], inplace=True)
    loss = layers.reduce_mean(loss, dim=[0])
    loss = layers.reduce_sum(loss)

    loss.persistable = True
    last_cell.persistable = True
    last_hidden.persistable = True

    layers.assign(input=last_cell, output=init_cell)
    layers.assign(input=last_hidden, output=init_hidden)

    feeding_list = ['x', 'y', 'init_hidden', 'init_cell']
    if use_py_reader:
        return loss, last_hidden, last_cell, feeding_list, py_reader
    else:
        return loss, last_hidden, last_cell, feeding_list
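A minimal usage sketch (assuming a CUDA build of PaddlePaddle 1.x, since batch_size is divided by fluid.core.get_cuda_device_count()):

import paddle.fluid as fluid

main_prog, startup_prog = fluid.Program(), fluid.Program()
with fluid.program_guard(main_prog, startup_prog):
    loss, last_hidden, last_cell, feeding_list = lm_model(
        hidden_size=200, vocab_size=10000, batch_size=20,
        num_layers=2, num_steps=20, init_scale=0.1,
        dropout=0.5, rnn_model='static')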
    def __call__(self, step_fn, state):
        """
        Running beam search.

        @param : step_fn : decoding one step
        @type : function

        @param : state : initial state
        @type : dict
        """
        batch_size = state["batch_size"]
        beam_size = self.beam_size

        # shape: [batch_size, 1]
        pos_index = layers.range(0, batch_size, 1, dtype="int64")
        pos_index = layers.scale(pos_index, beam_size)
        pos_index = F.unsqueeze(pos_index, [1])

        # shape: [batch_size, beam_size, 1]
        predictions = layers.fill_constant(shape=[batch_size, beam_size, 1],
                                           dtype="int64",
                                           value=self.bos_id)

        # initial input
        state["pred_token"] = predictions[:, :1]
        # shape: [batch_size, vocab_size]
        scores, state = step_fn(state)

        unk_penalty = np.zeros(self.vocab_size, dtype="float32")
        unk_penalty[self.unk_id] = -1e10
        unk_penalty = layers.assign(unk_penalty)

        eos_penalty = np.zeros(self.vocab_size, dtype="float32")
        eos_penalty[self.eos_id] = -1e10
        eos_penalty = layers.assign(eos_penalty)

        scores_after_end = np.full(self.vocab_size, -1e10, dtype="float32")
        scores_after_end[self.pad_id] = 0
        scores_after_end = layers.assign(scores_after_end)

        if self.ignore_unk:
            scores = scores + unk_penalty
        scores = scores + eos_penalty

        # shape: [batch_size, beam_size]
        sequence_scores, preds = layers.topk(scores, self.beam_size)

        predictions = layers.concat(
            [predictions, F.unsqueeze(preds, [2])], axis=2)
        state = repeat(state, beam_size)

        parent_idx_list = []
        pred_list = []

        for step in range(2, self.max_gen_len + 1):
            pre_ids = predictions[:, :, -1:]
            state["pred_token"] = layers.reshape(
                pre_ids, shape=[batch_size * beam_size, 1, 1])
            state["pred_mask"] = 1 - F.equal(state["pred_token"], self.pad_id)
            state["pred_pos"] = state["pred_pos"] + 1
            scores, state = step_fn(state)

            # Generate next
            # scores shape: [batch_size, beam_size, vocab_size]
            if self.ignore_unk:
                scores = scores + unk_penalty

            if step <= self.min_gen_len:
                scores = scores + eos_penalty

            scores = layers.reshape(
                scores, shape=[batch_size, beam_size, self.vocab_size])

            # previous token is [PAD] or [EOS]
            pre_eos_mask = F.equal(pre_ids, self.eos_id) + F.equal(
                pre_ids, self.pad_id)

            scores = scores * (1 - pre_eos_mask) + \
                layers.expand(pre_eos_mask, [1, 1, self.vocab_size]) * scores_after_end
            if self.length_average:
                scaled_value = pre_eos_mask + (1 - pre_eos_mask) * (1 -
                                                                    1 / step)
                sequence_scores = F.unsqueeze(sequence_scores,
                                              [2]) * scaled_value
                scaled_value = pre_eos_mask + (1 - pre_eos_mask) * (1 / step)
                scores = scores * scaled_value
            elif self.length_penalty >= 0.0:
                scaled_value = pre_eos_mask + (1 - pre_eos_mask) * \
                    (math.pow((4 + step) / (5 + step), self.length_penalty))
                sequence_scores = layers.elementwise_mul(scaled_value,
                                                         sequence_scores,
                                                         axis=0)
                scaled_value = pre_eos_mask + (1 - pre_eos_mask) * \
                    (math.pow(1 / (5 + step), self.length_penalty))
                scores = scores * scaled_value
            scores = layers.elementwise_add(scores, sequence_scores, axis=0)
            scores = layers.reshape(
                scores, shape=[batch_size, beam_size * self.vocab_size])

            topk_scores, topk_indices = layers.topk(scores, beam_size)
            vocab_size = layers.fill_constant(shape=[1],
                                              dtype="int64",
                                              value=self.vocab_size)
            parent_idx = layers.elementwise_floordiv(topk_indices, vocab_size)
            preds = layers.elementwise_mod(topk_indices, vocab_size)

            # Gather state / sequence_scores
            parent_idx = layers.elementwise_add(parent_idx, pos_index, axis=0)
            parent_idx = layers.reshape(parent_idx, [batch_size * beam_size])
            state = gather(state, parent_idx)
            sequence_scores = topk_scores

            predictions = layers.reshape(predictions,
                                         shape=[batch_size * beam_size, step])
            predictions = gather(predictions, parent_idx)
            predictions = layers.reshape(predictions,
                                         shape=[batch_size, beam_size, step])
            predictions = layers.concat(
                [predictions, F.unsqueeze(preds, [2])], axis=2)

        pre_ids = predictions[:, :, -1]
        pre_eos_mask = F.equal(pre_ids, self.eos_id) + F.equal(
            pre_ids, self.pad_id)
        sequence_scores = sequence_scores * pre_eos_mask + layers.scale(
            1 - pre_eos_mask, -1e10)

        _, indices = layers.argsort(sequence_scores, axis=1)
        indices = indices + pos_index
        indices = layers.reshape(indices, [-1])
        sequence_scores = layers.reshape(sequence_scores,
                                         [batch_size * beam_size])
        predictions = layers.reshape(predictions, [batch_size * beam_size, -1])
        sequence_scores = gather(sequence_scores, indices)
        predictions = layers.gather(predictions, indices)
        sequence_scores = layers.reshape(sequence_scores,
                                         [batch_size, beam_size])
        predictions = layers.reshape(predictions, [batch_size, beam_size, -1])

        results = {
            "preds": predictions[:, -1],
            "scores": sequence_scores[:, -1]
        }
        return results
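The length_penalty branch above incrementally maintains a GNMT-style length normalization: after step t the beam score kept in sequence_scores equals

score_t = (sum_{i<=t} log p_i) / (5 + t)^alpha

(up to the constant 6^alpha of the usual ((5+t)/6)^alpha formulation). The two scaled_value factors respectively convert the old denominator to the new one and pre-scale the new token's log-probability, with finished beams (previous token [EOS] or [PAD]) left untouched.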
Example #16
def encoder(x,
            y,
            vocab_size,
            emb_size,
            init_hidden=None,
            init_cell=None,
            para_name='',
            custom_samples=None,
            custom_probabilities=None,
            test_mode=False,
            args=None):
    x_emb = layers.embedding(input=x,
                             size=[vocab_size, emb_size],
                             dtype='float32',
                             is_sparse=False,
                             param_attr=fluid.ParamAttr(name='embedding_para'))
    rnn_input = x_emb
    rnn_outs = []
    rnn_outs_ori = []
    cells = []
    projs = []
    for i in range(args.num_layers):
        rnn_input = dropout(rnn_input, test_mode, args)
        if init_hidden is not None and init_cell is not None:
            h0 = layers.squeeze(layers.slice(init_hidden,
                                             axes=[0],
                                             starts=[i],
                                             ends=[i + 1]),
                                axes=[0])
            c0 = layers.squeeze(layers.slice(init_cell,
                                             axes=[0],
                                             starts=[i],
                                             ends=[i + 1]),
                                axes=[0])
        else:
            h0 = c0 = None
        rnn_out, cell, input_proj = lstmp_encoder(
            rnn_input, args.hidden_size, h0, c0,
            para_name + 'layer{}'.format(i + 1), emb_size, test_mode, args)
        rnn_out_ori = rnn_out
        if i > 0:
            rnn_out = rnn_out + rnn_input
        rnn_out = dropout(rnn_out, test_mode, args)
        cell = dropout(cell, test_mode, args)
        rnn_outs.append(rnn_out)
        rnn_outs_ori.append(rnn_out_ori)
        rnn_input = rnn_out
        cells.append(cell)
        projs.append(input_proj)

    softmax_weight = layers.create_parameter([vocab_size, emb_size],
                                             dtype="float32",
                                             name="softmax_weight")
    softmax_bias = layers.create_parameter([vocab_size],
                                           dtype="float32",
                                           name='softmax_bias')
    projection = layers.matmul(rnn_outs[-1], softmax_weight, transpose_y=True)
    projection = layers.elementwise_add(projection, softmax_bias)

    projection = layers.reshape(projection, shape=[-1, vocab_size])

    if args.sample_softmax and (not test_mode):
        loss = layers.sampled_softmax_with_cross_entropy(
            logits=projection,
            label=y,
            num_samples=args.n_negative_samples_batch,
            seed=args.random_seed)
    else:
        label = layers.one_hot(input=y, depth=vocab_size)
        loss = layers.softmax_with_cross_entropy(logits=projection,
                                                 label=label,
                                                 soft_label=True)
    return [x_emb, projection, loss], rnn_outs, rnn_outs_ori, cells, projs
    def encoder_static(input_embedding,
                       len=3,
                       init_hidden=None,
                       init_cell=None):

        weight_1_arr = []
        weight_2_arr = []
        bias_arr = []
        hidden_array = []
        cell_array = []
        mask_array = []
        for i in range(num_layers):
            weight_1 = layers.create_parameter(
                [hidden_size * 2, hidden_size * 4],
                dtype="float32",
                name="fc_weight1_" + str(i),
                default_initializer=fluid.initializer.UniformInitializer(
                    low=-init_scale, high=init_scale))
            weight_1_arr.append(weight_1)
            bias_1 = layers.create_parameter(
                [hidden_size * 4],
                dtype="float32",
                name="fc_bias1_" + str(i),
                default_initializer=fluid.initializer.Constant(0.0))
            bias_arr.append(bias_1)

            pre_hidden = layers.slice(init_hidden,
                                      axes=[0],
                                      starts=[i],
                                      ends=[i + 1])
            pre_cell = layers.slice(init_cell,
                                    axes=[0],
                                    starts=[i],
                                    ends=[i + 1])
            pre_hidden = layers.reshape(pre_hidden,
                                        shape=[-1, hidden_size],
                                        inplace=True)
            pre_cell = layers.reshape(pre_cell,
                                      shape=[-1, hidden_size],
                                      inplace=True)
            hidden_array.append(pre_hidden)
            cell_array.append(pre_cell)

        res = []
        sliced_inputs = layers.split(input_embedding,
                                     num_or_sections=len,
                                     dim=1)

        for index in range(len):
            input = sliced_inputs[index]
            input = layers.reshape(input,
                                   shape=[-1, hidden_size],
                                   inplace=True)
            for k in range(num_layers):
                pre_hidden = hidden_array[k]
                pre_cell = cell_array[k]
                weight_1 = weight_1_arr[k]
                bias = bias_arr[k]

                nn = layers.concat([input, pre_hidden], 1)
                gate_input = layers.matmul(x=nn, y=weight_1)

                gate_input = layers.elementwise_add(gate_input, bias)
                i, j, f, o = layers.split(gate_input,
                                          num_or_sections=4,
                                          dim=-1)

                c = pre_cell * layers.sigmoid(f) + layers.sigmoid(
                    i) * layers.tanh(j)
                m = layers.tanh(c) * layers.sigmoid(o)

                hidden_array[k] = m
                cell_array[k] = c
                input = m

                if dropout is not None and dropout > 0.0:
                    input = layers.dropout(
                        input,
                        dropout_prob=dropout,
                        dropout_implementation='upscale_in_train')

            res.append(input)

        last_hidden = layers.concat(hidden_array, 1)
        last_hidden = layers.reshape(last_hidden,
                                     shape=[-1, num_layers, hidden_size],
                                     inplace=True)
        last_hidden = layers.transpose(x=last_hidden, perm=[1, 0, 2])

        last_cell = layers.concat(cell_array, 1)
        last_cell = layers.reshape(last_cell,
                                   shape=[-1, num_layers, hidden_size])
        last_cell = layers.transpose(x=last_cell, perm=[1, 0, 2])

        real_res = layers.concat(res, 0)
        real_res = layers.reshape(real_res,
                                  shape=[len, -1, hidden_size],
                                  inplace=True)
        real_res = layers.transpose(x=real_res, perm=[1, 0, 2])

        return real_res, last_hidden, last_cell
def decoder_decode(context, is_sparse):
    init_state = context
    array_len = pd.fill_constant(shape=[1], dtype='int64', value=max_length)
    counter = pd.zeros(shape=[1], dtype='int64', force_cpu=True)

    # fill the first element with init_state
    state_array = pd.create_array('float32')
    pd.array_write(init_state, array=state_array, i=counter)

    # ids, scores as memory
    ids_array = pd.create_array('int64')
    scores_array = pd.create_array('float32')

    init_ids = pd.data(name="init_ids", shape=[1], dtype="int64", lod_level=2)
    init_scores = pd.data(
        name="init_scores", shape=[1], dtype="float32", lod_level=2)

    pd.array_write(init_ids, array=ids_array, i=counter)
    pd.array_write(init_scores, array=scores_array, i=counter)

    cond = pd.less_than(x=counter, y=array_len)

    while_op = pd.While(cond=cond)
    with while_op.block():
        pre_ids = pd.array_read(array=ids_array, i=counter)
        pre_state = pd.array_read(array=state_array, i=counter)
        pre_score = pd.array_read(array=scores_array, i=counter)

        # expand the recursive_sequence_lengths of pre_state to be the same with pre_score
        pre_state_expanded = pd.sequence_expand(pre_state, pre_score)

        pre_ids_emb = pd.embedding(
            input=pre_ids,
            size=[dict_size, word_dim],
            dtype='float32',
            is_sparse=is_sparse)

        # use rnn unit to update rnn
        current_state = pd.fc(input=[pre_state_expanded, pre_ids_emb],
                              size=decoder_size,
                              act='tanh')
        current_state_with_lod = pd.lod_reset(x=current_state, y=pre_score)
        # use score to do beam search
        current_score = pd.fc(input=current_state_with_lod,
                              size=target_dict_dim,
                              act='softmax')
        topk_scores, topk_indices = pd.topk(current_score, k=beam_size)
        # calculate accumulated scores after topk to reduce computation cost
        accu_scores = pd.elementwise_add(
            x=pd.log(topk_scores), y=pd.reshape(
                pre_score, shape=[-1]), axis=0)
        selected_ids, selected_scores = pd.beam_search(
            pre_ids,
            pre_score,
            topk_indices,
            accu_scores,
            beam_size,
            end_id=10,
            level=0)

        pd.increment(x=counter, value=1, in_place=True)

        # update the memories
        pd.array_write(current_state, array=state_array, i=counter)
        pd.array_write(selected_ids, array=ids_array, i=counter)
        pd.array_write(selected_scores, array=scores_array, i=counter)

        # update the break condition: up to the max length or all candidates of
        # source sentences have ended.
        length_cond = pd.less_than(x=counter, y=array_len)
        finish_cond = pd.logical_not(pd.is_empty(x=selected_ids))
        pd.logical_and(x=length_cond, y=finish_cond, out=cond)

    translation_ids, translation_scores = pd.beam_search_decode(
        ids=ids_array, scores=scores_array, beam_size=beam_size, end_id=10)

    # return init_ids, init_scores

    return translation_ids, translation_scores
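A sketch of how the two lod_level=2 inputs might be fed at inference time (illustrative; one source sentence with a single initial candidate, and the start-token id is assumed to be 0):

import numpy as np
import paddle.fluid as fluid

place = fluid.CPUPlace()
init_ids = fluid.create_lod_tensor(
    np.array([[0]], dtype='int64'), [[1], [1]], place)
init_scores = fluid.create_lod_tensor(
    np.array([[1.0]], dtype='float32'), [[1], [1]], place)
# feed as {'init_ids': init_ids, 'init_scores': init_scores, ...}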
Example #19
    def encoder_static(input_embedding, len=3, init_hidden=None,
                       init_cell=None):

        weight_1_arr = []
        weight_2_arr = []
        bias_arr = []
        hidden_array = []
        cell_array = []
        mask_array = []
        for i in range(num_layers):
            weight_1 = layers.create_parameter(
                [hidden_size * 2, hidden_size * 4],
                dtype="float32",
                name="fc_weight1_" + str(i),
                default_initializer=fluid.initializer.UniformInitializer(
                    low=-init_scale, high=init_scale))
            weight_1_arr.append(weight_1)
            bias_1 = layers.create_parameter(
                [hidden_size * 4],
                dtype="float32",
                name="fc_bias1_" + str(i),
                default_initializer=fluid.initializer.Constant(0.0))
            bias_arr.append(bias_1)

            pre_hidden = layers.slice(
                init_hidden, axes=[0], starts=[i], ends=[i + 1])
            pre_cell = layers.slice(
                init_cell, axes=[0], starts=[i], ends=[i + 1])
            pre_hidden = layers.reshape(
                pre_hidden, shape=[-1, hidden_size], inplace=True)
            pre_cell = layers.reshape(
                pre_cell, shape=[-1, hidden_size], inplace=True)
            hidden_array.append(pre_hidden)
            cell_array.append(pre_cell)

        res = []
        sliced_inputs = layers.split(
            input_embedding, num_or_sections=len, dim=1)

        for index in range(len):
            input = sliced_inputs[index]
            input = layers.reshape(input, shape=[-1, hidden_size], inplace=True)
            for k in range(num_layers):
                pre_hidden = hidden_array[k]
                pre_cell = cell_array[k]
                weight_1 = weight_1_arr[k]
                bias = bias_arr[k]

                nn = layers.concat([input, pre_hidden], 1)
                gate_input = layers.matmul(x=nn, y=weight_1)

                gate_input = layers.elementwise_add(gate_input, bias)
                i, j, f, o = layers.split(gate_input, num_or_sections=4, dim=-1)

                try:
                    from paddle.fluid.contrib.layers import fused_elemwise_activation
                    # fluid.contrib.layers.fused_elemwise_activation can do a fused
                    # operation, like:
                    # 1) x + sigmoid(y); x + tanh(y)
                    # 2) tanh(x + y)
                    # Currently the set of unary operations supported by this
                    # fused op is limited; more unary operations and automatic
                    # fusion will be supported in future versions of paddle.fluid.
                    # layers.sigmoid(i) * layers.tanh(j)
                    tmp0 = fused_elemwise_activation(
                        x=layers.tanh(j),
                        y=i,
                        functor_list=['elementwise_mul', 'sigmoid'],
                        save_intermediate_out=False)
                    # pre_cell * layers.sigmoid(f)
                    tmp1 = fused_elemwise_activation(
                        x=pre_cell,
                        y=f,
                        functor_list=['elementwise_mul', 'sigmoid'],
                        save_intermediate_out=False)
                    c = tmp0 + tmp1
                    # layers.tanh(c) * layers.sigmoid(o)
                    m = fused_elemwise_activation(
                        x=layers.tanh(c),
                        y=o,
                        functor_list=['elementwise_mul', 'sigmoid'],
                        save_intermediate_out=False)
                except ImportError:
                    c = pre_cell * layers.sigmoid(f) + layers.sigmoid(
                        i) * layers.tanh(j)
                    m = layers.tanh(c) * layers.sigmoid(o)

                hidden_array[k] = m
                cell_array[k] = c
                input = m

                if dropout is not None and dropout > 0.0:
                    input = layers.dropout(
                        input,
                        dropout_prob=dropout,
                        dropout_implementation='upscale_in_train')

            res.append(input)

        last_hidden = layers.concat(hidden_array, 1)
        last_hidden = layers.reshape(
            last_hidden, shape=[-1, num_layers, hidden_size], inplace=True)
        last_hidden = layers.transpose(x=last_hidden, perm=[1, 0, 2])

        last_cell = layers.concat(cell_array, 1)
        last_cell = layers.reshape(
            last_cell, shape=[-1, num_layers, hidden_size])
        last_cell = layers.transpose(x=last_cell, perm=[1, 0, 2])

        real_res = layers.concat(res, 0)
        real_res = layers.reshape(
            real_res, shape=[len, -1, hidden_size], inplace=True)
        real_res = layers.transpose(x=real_res, perm=[1, 0, 2])

        return real_res, last_hidden, last_cell
Example #20
    def less_than_branch(i, a):
        return layers.cond(i >= 3.0, lambda: layers.elementwise_add(a, a),
                           lambda: layers.elementwise_sub(a, a))
Example #21
    def scaled_dot_product_attention(q, k, v, attn_bias,
                                     biaffine_transformation,
                                     biaffine_transformation_bias,
                                     structure_mask, with_ent_structure, d_key,
                                     dropout_rate):
        """
        Scaled Dot-Product Attention
        """
        scaled_q = layers.scale(x=q, scale=d_key**-0.5)
        product = layers.matmul(x=scaled_q, y=k, transpose_y=True)

        if with_ent_structure:
            # TRANSFORMATION
            # 1.reshape input
            # q: [bs, n_head, seq, hidden] -> [bs, 1, n_head, seq, hidden] -> [bs, 5, n_head, seq, hidden]
            # -> [5, n_head, bs, seq, hidden] -> [5, n_head, bs * seq, hidden]
            # transformation: [dependencies(5), n_head, hidden, hidden]
            # k: [bs, n_head, seq, hidden] -> [bs, 1, n_head, seq, hidden]
            q_ = layers.unsqueeze(scaled_q, [1])
            q_ = layers.expand(q_,
                               [1, biaffine_transformation.shape[0], 1, 1, 1])
            q_ = layers.transpose(q_, perm=[1, 2, 0, 3, 4])
            q_ = layers.reshape(
                q_,
                shape=[0, 0, -1, biaffine_transformation.shape[3]],
                inplace=True)
            k_ = layers.unsqueeze(k, [1])
            k_ = layers.expand(k_,
                               [1, biaffine_transformation.shape[0], 1, 1, 1])

            # 2.implement matmul
            # q * transformation: [5, n_head, bs * seq, hidden]
            # q * transformation: [5, n_head, bs * seq, hidden] -> [5, n_head, bs, seq, hidden]
            # -> [bs, dependencies(5), n_head, seq, hidden]
            # q * transformation * k: [bs, dependencies(5), n_head, seq, seq]
            structured_bias = layers.matmul(x=q_, y=biaffine_transformation)
            structured_bias = layers.reshape(
                structured_bias,
                shape=[0, 0, -1, k_.shape[3], k_.shape[4]],
                inplace=True)
            structured_bias = layers.transpose(structured_bias,
                                               perm=[2, 0, 1, 3, 4])
            structured_bias = layers.matmul(x=structured_bias,
                                            y=k_,
                                            transpose_y=True)

            structured_bias = layers.elementwise_add(
                structured_bias, biaffine_transformation_bias, axis=1)

            # mask & apply
            structured_bias = structured_bias * structure_mask
            structured_bias = layers.reduce_sum(structured_bias, dim=1)
            product += structured_bias

        if attn_bias:
            product += attn_bias
        weights = layers.softmax(product)
        if dropout_rate:
            weights = layers.dropout(weights,
                                     dropout_prob=dropout_rate,
                                     dropout_implementation="upscale_in_train",
                                     is_test=False)
        out = layers.matmul(weights, v)
        return out
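In equation form, the structured bias added to the attention logits is a biaffine score per dependency type d (5 types here), masked and summed over types:

bias_{ij} = sum_{d=1..5} m_d * ( (q_i / sqrt(d_key)) W_d k_j^T + c_d )

where W_d is biaffine_transformation, c_d is biaffine_transformation_bias, and m_d is structure_mask.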
Example #22
    def min_len_penalty():
        """Add the minimum-length penalty to the logits."""
        return layers.elementwise_add(logits, eos_penalty, axis=1)
Example #23
    def beam_search():
        max_len = layers.fill_constant(shape=[1],
                                       dtype=start_tokens.dtype,
                                       value=max_out_len,
                                       force_cpu=True)
        step_idx = layers.fill_constant(shape=[1],
                                        dtype=start_tokens.dtype,
                                        value=0,
                                        force_cpu=True)
        cond = layers.less_than(x=step_idx,
                                y=max_len)  # default force_cpu=True
        while_op = layers.While(cond)
        # array states will be stored for each step.
        ids = layers.array_write(layers.reshape(start_tokens, (-1, 1)),
                                 step_idx)
        scores = layers.array_write(init_scores, step_idx)
        # cell states will be overwritten at each step.
        # caches contains the states of history steps in decoder self-attention
        # and the static encoder output projections in encoder-decoder
        # attention, to reduce redundant computation.
        caches = [
            {
                "k":  # for self attention
                layers.fill_constant_batch_size_like(
                    input=start_tokens,
                    shape=[-1, n_head, 0, d_key],
                    dtype=enc_output.dtype,
                    value=0),
                "v":  # for self attention
                layers.fill_constant_batch_size_like(
                    input=start_tokens,
                    shape=[-1, n_head, 0, d_value],
                    dtype=enc_output.dtype,
                    value=0),
                "static_k":  # for encoder-decoder attention
                layers.create_tensor(dtype=enc_output.dtype),
                "static_v":  # for encoder-decoder attention
                layers.create_tensor(dtype=enc_output.dtype)
            } for i in range(n_layer)
        ]

        with while_op.block():
            pre_ids = layers.array_read(array=ids, i=step_idx)
            # Since beam_search_op doesn't enforce pre_ids' shape, we can do an
            # inplace reshape here, which actually changes the shape of pre_ids.
            pre_ids = layers.reshape(pre_ids, (-1, 1, 1), inplace=True)
            pre_scores = layers.array_read(array=scores, i=step_idx)
            # gather cell states corresponding to selected parent
            pre_src_attn_bias = layers.gather(trg_src_attn_bias,
                                              index=parent_idx)
            pre_pos = layers.elementwise_mul(
                x=layers.fill_constant_batch_size_like(
                    input=pre_src_attn_bias,  # can't use a lod tensor here
                    value=1,
                    shape=[-1, 1, 1],
                    dtype=pre_ids.dtype),
                y=step_idx,
                axis=0)
            logits = wrap_decoder(trg_vocab_size,
                                  max_in_len,
                                  n_layer,
                                  n_head,
                                  d_key,
                                  d_value,
                                  d_model,
                                  d_inner_hid,
                                  prepostprocess_dropout,
                                  attention_dropout,
                                  relu_dropout,
                                  preprocess_cmd,
                                  postprocess_cmd,
                                  weight_sharing,
                                  dec_inputs=(pre_ids, pre_pos, None,
                                              pre_src_attn_bias),
                                  enc_output=enc_output,
                                  caches=caches,
                                  gather_idx=parent_idx,
                                  bos_idx=bos_idx)
            # intra-beam topK
            topk_scores, topk_indices = layers.topk(
                input=layers.softmax(logits), k=beam_size)
            accu_scores = layers.elementwise_add(x=layers.log(topk_scores),
                                                 y=pre_scores,
                                                 axis=0)
            # beam_search op uses lod to differentiate branches.
            accu_scores = layers.lod_reset(accu_scores, pre_ids)
            # topK reduction across beams, with special handling of finished
            # beams and finished sentences (batch reduction)
            selected_ids, selected_scores, gather_idx = layers.beam_search(
                pre_ids=pre_ids,
                pre_scores=pre_scores,
                ids=topk_indices,
                scores=accu_scores,
                beam_size=beam_size,
                end_id=eos_idx,
                return_parent_idx=True)
            layers.increment(x=step_idx, value=1.0, in_place=True)
            # cell states (caches) have been updated in wrap_decoder,
            # only need to update beam search states here.
            layers.array_write(selected_ids, i=step_idx, array=ids)
            layers.array_write(selected_scores, i=step_idx, array=scores)
            layers.assign(gather_idx, parent_idx)
            layers.assign(pre_src_attn_bias, trg_src_attn_bias)
            length_cond = layers.less_than(x=step_idx, y=max_len)
            finish_cond = layers.logical_not(layers.is_empty(x=selected_ids))
            layers.logical_and(x=length_cond, y=finish_cond, out=cond)

        finished_ids, finished_scores = layers.beam_search_decode(
            ids, scores, beam_size=beam_size, end_id=eos_idx)
        return finished_ids, finished_scores
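The accumulated score above is the running log-probability of each candidate: the log of the step's top-k probabilities plus the parent beam's cumulative score, broadcast along the beam axis (the axis=0 in elementwise_add). A numpy sketch of one accumulation step, shapes hypothetical:

import numpy as np

pre_scores = np.array([0.0, -0.5])            # [beam]
topk_probs = np.array([[0.6, 0.3],            # [beam, k]
                       [0.7, 0.2]])
accu_scores = np.log(topk_probs) + pre_scores[:, None]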
Exemple #24
0
    def _graph_common(self, _amp_fun, startup_prog=None):
        size = 3
        n = np.ones([size, size], dtype='float32') * 3.2
        nn = np.ones([size, size], dtype='float32') * -2.7

        n_bf16 = amp.bf16.convert_float_to_uint16(n)
        nn_bf16 = amp.bf16.convert_float_to_uint16(nn)

        with self.static_graph():
            t_bf16 = layers.data(name='t_bf16',
                                 shape=[size, size],
                                 dtype=np.uint16)
            tt_bf16 = layers.data(name='tt_bf16',
                                  shape=[size, size],
                                  dtype=np.uint16)
            t = layers.data(name='t', shape=[size, size], dtype='float32')
            tt = layers.data(name='tt', shape=[size, size], dtype='float32')

            ret = layers.elementwise_add(t, tt)
            ret = layers.elementwise_mul(ret, t)
            ret = layers.reshape(ret, [0, 0])

            with amp.bf16.bf16_guard():
                ret_bf16 = layers.elementwise_add(t_bf16, tt_bf16)
                ret_bf16 = layers.elementwise_mul(ret_bf16, t_bf16)
                ret_bf16 = layers.reshape(ret_bf16, [0, 0])

            with amp.bf16.bf16_guard():
                ret_fp32bf16 = layers.elementwise_add(t, tt)
                ret_fp32bf16 = layers.elementwise_mul(ret_fp32bf16, t)
                ret_fp32bf16 = layers.reshape(ret_fp32bf16, [0, 0])

            static_ret_bf16, static_ret, ret_fp32bf16 = self.get_static_graph_result(
                feed={
                    't': n,
                    'tt': nn,
                    't_bf16': n_bf16,
                    'tt_bf16': nn_bf16,
                },
                fetch_list=[ret_bf16, ret, ret_fp32bf16],
                amp_fun=lambda prog: amp.bf16.rewrite_program_bf16(prog))

        self.assertTrue(np.allclose(static_ret_bf16, static_ret, 1e-2))
        self.assertTrue(np.allclose(static_ret_bf16, ret_fp32bf16, 1e-2))

        with self.static_graph():
            t = layers.data(name='t', shape=[size, size], dtype='float32')
            tt = layers.data(name='tt', shape=[size, size], dtype='float32')

            with amp.bf16.bf16_guard():
                ret = layers.elementwise_add(t, tt)
                ret = layers.reshape(ret, [0, 0], act='elu')
                ret = layers.elementwise_mul(ret, t)
            ret = layers.elementwise_add(ret, tt)

            static_ret_bf16 = \
                self.get_static_graph_result(
                    feed={'t': n, 'tt': nn},
                    fetch_list=[ret],
                    amp_fun=_amp_fun,
                    startup_prog=startup_prog
                )
        self.assertTrue(
            np.allclose(static_ret_bf16,
                        np.ones([size, size], dtype='float32') * -1.1))
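amp.bf16.convert_float_to_uint16 above stores bf16 values in uint16 buffers. A numpy sketch of a truncating fp32-to-bf16 bit conversion (Paddle's converter may round to nearest instead, so treat this as an approximation):

import numpy as np

def float_to_bf16_bits(x):
    # keep the upper 16 bits of each fp32 bit pattern (truncation, no rounding)
    return (x.astype(np.float32).view(np.uint32) >> 16).astype(np.uint16)

print(float_to_bf16_bits(np.array([3.2, -2.7], dtype=np.float32)))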
Exemple #25
0
    def beam_search(self,
                    src_word,
                    src_pos,
                    src_slf_attn_bias,
                    trg_word,
                    trg_src_attn_bias,
                    bos_id=0,
                    eos_id=1,
                    beam_size=4,
                    max_len=256):
        def expand_to_beam_size(tensor, beam_size):
            tensor = layers.reshape(tensor,
                                    [tensor.shape[0], 1] + tensor.shape[1:])
            tile_dims = [1] * len(tensor.shape)
            tile_dims[1] = beam_size
            return layers.expand(tensor, tile_dims)

        def merge_batch_beams(tensor):
            return layers.reshape(tensor, [tensor.shape[0] * tensor.shape[1]] +
                                  tensor.shape[2:])

        def split_batch_beams(tensor):
            return fluid.layers.reshape(tensor,
                                        shape=[-1, beam_size] +
                                        list(tensor.shape[1:]))

        def mask_probs(probs, finished, noend_mask_tensor):
            # TODO: use where_op
            finished = layers.cast(finished, dtype=probs.dtype)
            probs = layers.elementwise_mul(layers.expand(
                layers.unsqueeze(finished, [2]), [1, 1, self.trg_vocab_size]),
                                           noend_mask_tensor,
                                           axis=-1) - layers.elementwise_mul(
                                               probs, (finished - 1), axis=0)
            return probs
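        # mask_probs (assumed intent): finished beams get noend_mask_tensor
        # (0 at EOS, -inf elsewhere) so they can only extend with EOS, while
        # unfinished beams keep their log-probs through the -(finished - 1)
        # factor, which equals 1 when finished == 0.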

        def gather(x, indices, batch_pos):
            topk_coordinates = fluid.layers.stack([batch_pos, indices], axis=2)
            return layers.gather_nd(x, topk_coordinates)
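        # e.g. with batch_pos = [[0, 0], [1, 1]] and indices = [[2, 0], [1, 3]],
        # stack yields coordinates [[[0, 2], [0, 0]], [[1, 1], [1, 3]]], and
        # gather_nd picks x[batch, index] for every (batch, beam) slot.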

        # run encoder
        enc_output = self.encoder(src_word, src_pos, src_slf_attn_bias)

        # constant number
        inf = float(1. * 1e7)
        batch_size = enc_output.shape[0]
        max_len = (enc_output.shape[1] + 20) if max_len is None else max_len
        vocab_size_tensor = layers.fill_constant(shape=[1],
                                                 dtype="int64",
                                                 value=self.trg_vocab_size)
        end_token_tensor = to_variable(
            np.full([batch_size, beam_size], eos_id, dtype="int64"))
        noend_array = [-inf] * self.trg_vocab_size
        noend_array[eos_id] = 0
        noend_mask_tensor = to_variable(np.array(noend_array, dtype="float32"))
        batch_pos = layers.expand(
            layers.unsqueeze(
                to_variable(np.arange(0, batch_size, 1, dtype="int64")), [1]),
            [1, beam_size])

        predict_ids = []
        parent_ids = []
        ### initialize states of beam search ###
        log_probs = to_variable(
            np.array([[0.] + [-inf] * (beam_size - 1)] * batch_size,
                     dtype="float32"))
        finished = to_variable(
            np.full([batch_size, beam_size], 0, dtype="bool"))
        ### initialize inputs and states of transformer decoder ###
        ## init inputs for decoder, shaped `[batch_size*beam_size, ...]`
        trg_word = layers.fill_constant(shape=[batch_size * beam_size, 1],
                                        dtype="int64",
                                        value=bos_id)
        trg_pos = layers.zeros_like(trg_word)
        trg_src_attn_bias = merge_batch_beams(
            expand_to_beam_size(trg_src_attn_bias, beam_size))
        enc_output = merge_batch_beams(
            expand_to_beam_size(enc_output, beam_size))
        ## init states (caches) for the transformer; they must be updated according to the selected beams
        caches = [{
            "k":
            layers.fill_constant(
                shape=[batch_size * beam_size, self.n_head, 0, self.d_key],
                dtype=enc_output.dtype,
                value=0),
            "v":
            layers.fill_constant(
                shape=[batch_size * beam_size, self.n_head, 0, self.d_value],
                dtype=enc_output.dtype,
                value=0),
        } for i in range(self.n_layer)]

        for i in range(max_len):
            trg_pos = layers.fill_constant(shape=trg_word.shape,
                                           dtype="int64",
                                           value=i)
            caches = map_structure(  # skip reshape at step 0: caches are already merged and have a 0-sized time axis
                lambda x: x if i == 0 else merge_batch_beams(x), caches)
            logits = self.decoder(trg_word, trg_pos, None, trg_src_attn_bias,
                                  enc_output, caches)
            caches = map_structure(split_batch_beams, caches)
            step_log_probs = split_batch_beams(
                fluid.layers.log(fluid.layers.softmax(logits)))
            step_log_probs = mask_probs(step_log_probs, finished,
                                        noend_mask_tensor)
            log_probs = layers.elementwise_add(x=step_log_probs,
                                               y=log_probs,
                                               axis=0)
            log_probs = layers.reshape(log_probs,
                                       [-1, beam_size * self.trg_vocab_size])
            scores = log_probs
            topk_scores, topk_indices = fluid.layers.topk(input=scores,
                                                          k=beam_size)
            beam_indices = fluid.layers.elementwise_floordiv(
                topk_indices, vocab_size_tensor)
            token_indices = fluid.layers.elementwise_mod(
                topk_indices, vocab_size_tensor)

            # update states
            caches = map_structure(
                lambda x: gather(x, beam_indices, batch_pos), caches)
            log_probs = gather(log_probs, topk_indices, batch_pos)
            finished = gather(finished, beam_indices, batch_pos)
            finished = layers.logical_or(
                finished, layers.equal(token_indices, end_token_tensor))
            trg_word = layers.reshape(token_indices, [-1, 1])

            predict_ids.append(token_indices)
            parent_ids.append(beam_indices)

            if layers.reduce_all(finished).numpy():
                break

        predict_ids = layers.stack(predict_ids, axis=0)
        parent_ids = layers.stack(parent_ids, axis=0)
        finished_seq = layers.transpose(
            layers.gather_tree(predict_ids, parent_ids), [1, 2, 0])
        finished_scores = topk_scores

        return finished_seq, finished_scores
Exemple #26
0
    def infilling_decode(self):
        if self.task_type == "dialog":
            emb_num = 4
        else:
            emb_num = 3
        input_shapes = [[-1, self.max_seq_len, 1]] * emb_num + \
                       [[-1, self.max_seq_len, self.max_seq_len]]
        input_dtypes = ['int64'] * emb_num + ['float32']
        input_lod_levels = [0] * emb_num + [0]

        shapes = input_shapes + [[-1, self.max_seq_len, 1],
                                 [-1, self.max_seq_len, 1], [-1, 1], [-1],
                                 [-1, 1, self.max_seq_len], [-1, 1]]
        dtypes = input_dtypes + [
            'int64', 'int64', 'float32', 'int32', 'float32', 'int64'
        ]
        lod_levels = input_lod_levels + [2, 2, 2, 0, 0, 0]

        inputs = self.to_ternsor(shapes, dtypes, lod_levels)
        pyreader = fluid.io.DataLoader.from_generator(feed_list=inputs,
                                                      capacity=50,
                                                      iterable=False)

        emb_ids = {}
        for key, value in zip(self.emb_keys, inputs[:emb_num]):
            emb_ids[key] = value

        input_mask = inputs[emb_num]
        tgt_ids, tgt_pos, init_scores, parent_idx, tgt_input_mask, data_ids = inputs[
            -6:]

        ernie = ErnieModel(emb_ids=emb_ids,
                           input_mask=input_mask,
                           config=self.ernie_config,
                           use_fp16=self.use_fp16,
                           task_type=self.task_type,
                           decoding=True,
                           gather_idx=parent_idx)

        max_len = layers.fill_constant(shape=[1],
                                       dtype=tgt_ids.dtype,
                                       value=self.max_dec_len,
                                       force_cpu=True)
        step_idx = layers.fill_constant(shape=[1],
                                        dtype=tgt_ids.dtype,
                                        value=0,
                                        force_cpu=True)
        pos_idx = layers.fill_constant(shape=[1],
                                       dtype=tgt_ids.dtype,
                                       value=1,
                                       force_cpu=True)
        cond = layers.less_than(x=step_idx, y=max_len)
        while_op = layers.While(cond)

        ids = layers.array_write(layers.reshape(tgt_ids, (-1, 1)), step_idx)
        pos_biases = layers.array_write(layers.reshape(tgt_pos, (-1, 1)),
                                        step_idx)
        scores = layers.array_write(init_scores, step_idx)
        tgt_masks = layers.array_write(tgt_input_mask, step_idx)

        with while_op.block():
            pre_ids = layers.array_read(array=ids, i=step_idx)
            pre_ids = layers.reshape(pre_ids, (-1, 1, 1), inplace=True)
            pre_scores = layers.array_read(array=scores, i=step_idx)
            pos_bias = layers.array_read(array=pos_biases, i=step_idx)
            pos_bias = layers.gather(input=pos_bias, index=parent_idx)
            tmp_mask = layers.array_read(tgt_masks, i=step_idx)

            def gen_batch_like(value,
                               dtype="int64",
                               shape=[-1, 1, 1],
                               is_scalar=True):
                if is_scalar:
                    return layers.fill_constant_batch_size_like(
                        input=parent_idx,
                        value=value,
                        shape=shape,
                        dtype=dtype)
                else:
                    return layers.elementwise_mul(
                        x=layers.fill_constant_batch_size_like(
                            input=parent_idx,
                            value=1,
                            shape=shape,
                            dtype=dtype),
                        y=value,
                        axis=0)

            tmp_mask = layers.gather(input=tmp_mask, index=parent_idx)
            append_0_mask = gen_batch_like(0.0, dtype=tmp_mask.dtype)
            append_1_mask = gen_batch_like(1.0, dtype=tmp_mask.dtype)
            tmp_mask = layers.concat([tmp_mask, append_1_mask], axis=2)
            pre_mask = layers.concat([tmp_mask, append_0_mask], axis=2)
            cur_mask = layers.concat([tmp_mask, append_1_mask], axis=2)

            cur_ids = gen_batch_like(self.attn_id)
            pre_pos = gen_batch_like(step_idx, is_scalar=False)
            cur_pos = gen_batch_like(pos_idx, is_scalar=False)
            if self.continuous_position:
                pre_pos = pre_pos + pos_bias
                cur_pos = cur_pos + pos_bias

            dec_emb_ids = {
                "word_embedding": layers.concat([pre_ids, cur_ids], axis=1),
                "pos_embedding": layers.concat([pre_pos, cur_pos], axis=1)
            }
            if self.task_type == "dialog":
                role_ids = gen_batch_like(0)
                turn_ids = gen_batch_like(0)
                dec_emb_ids["role_embedding"] = layers.concat(
                    [role_ids, role_ids], axis=1)
                dec_emb_ids["turn_embedding"] = layers.concat(
                    [turn_ids, turn_ids], axis=1)
            else:
                sent_ids = gen_batch_like(self.tgt_type_id)
                dec_emb_ids["sent_embedding"] = layers.concat(
                    [sent_ids, sent_ids], axis=1)
            dec_mask = layers.concat([pre_mask, cur_mask], axis=1)

            dec_out = ernie.encode(dec_emb_ids,
                                   dec_mask,
                                   parent_idx,
                                   remove_query=True)
            fc_out = self.cal_logit(dec_out[:, 1:, :], None)
            topk_scores, topk_indices = layers.topk(
                input=layers.softmax(fc_out), k=self.beam_size)
            pre_lenpen = layers.pow(
                (5.0 + layers.cast(step_idx, pre_scores.dtype)) / 6.0,
                self.length_penalty)
            cur_lenpen = layers.pow(
                (5.0 + layers.cast(pos_idx, pre_scores.dtype)) / 6.0,
                self.length_penalty)
            accu_scores = layers.elementwise_add(x=layers.log(topk_scores),
                                                 y=pre_scores * pre_lenpen,
                                                 axis=0) / cur_lenpen
            topk_indices = layers.lod_reset(topk_indices, pre_ids)
            accu_scores = layers.lod_reset(accu_scores, pre_ids)
            selected_ids, selected_scores, gather_idx = layers.beam_search(
                pre_ids=pre_ids,
                pre_scores=pre_scores,
                ids=topk_indices,
                scores=accu_scores,
                beam_size=self.beam_size,
                end_id=self.eos_idx,
                return_parent_idx=True)

            layers.increment(x=step_idx, value=1.0, in_place=True)
            layers.increment(x=pos_idx, value=1.0, in_place=True)
            layers.array_write(selected_ids, i=step_idx, array=ids)
            layers.array_write(selected_scores, i=step_idx, array=scores)
            layers.array_write(tmp_mask, i=step_idx, array=tgt_masks)
            layers.array_write(pos_bias, i=step_idx, array=pos_biases)

            layers.assign(gather_idx, parent_idx)
            length_cond = layers.less_than(x=step_idx, y=max_len)
            finish_cond = layers.logical_not(layers.is_empty(x=selected_ids))
            layers.logical_and(x=length_cond, y=finish_cond, out=cond)

        finished_ids, finished_scores = layers.beam_search_decode(
            ids, scores, beam_size=self.beam_size, end_id=self.eos_idx)

        graph_vars = {
            "finished_ids": finished_ids,
            "finished_scores": finished_scores,
            "data_ids": data_ids
        }

        for k, v in graph_vars.items():
            v.persistable = True

        return pyreader, graph_vars
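The score update above uses the GNMT-style length penalty lp(t) = ((5 + t) / 6) ** alpha: the previous cumulative score is rescaled by the old penalty before the sum is divided by the new one. A minimal sketch of the normalization:

def length_penalty(step, alpha):
    # GNMT length penalty: ((5 + step) / 6) ** alpha
    return ((5.0 + step) / 6.0) ** alpha

# accu = (log p_step + prev_score * length_penalty(t - 1, a)) / length_penalty(t, a)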
Exemple #27
0
def point_network_decoder(p_vec, q_vec, hidden_size, args):
    """Output layer - pointer network"""
    tag = 'pn_decoder_'
    init_random = fluid.initializer.Normal(loc=0.0, scale=1.0)

    random_attn = layers.create_parameter(
        shape=[1, hidden_size],
        dtype='float32',
        default_initializer=init_random)
    random_attn = layers.fc(
        input=random_attn,
        size=hidden_size,
        act=None,
        param_attr=fluid.ParamAttr(name=tag + 'random_attn_fc_w'),
        bias_attr=fluid.ParamAttr(name=tag + 'random_attn_fc_b'))
    random_attn = layers.reshape(random_attn, shape=[-1])
    U = layers.fc(input=q_vec,
                  param_attr=fluid.ParamAttr(name=tag + 'q_vec_fc_w'),
                  bias_attr=False,
                  size=hidden_size,
                  act=None) + random_attn
    U = layers.tanh(U)

    logits = layers.fc(input=U,
                       param_attr=fluid.ParamAttr(name=tag + 'logits_fc_w'),
                       bias_attr=fluid.ParamAttr(name=tag + 'logits_fc_b'),
                       size=1,
                       act=None)
    scores = layers.sequence_softmax(input=logits)
    pooled_vec = layers.elementwise_mul(x=q_vec, y=scores, axis=0)
    pooled_vec = layers.sequence_pool(input=pooled_vec, pool_type='sum')

    init_state = layers.fc(
        input=pooled_vec,
        param_attr=fluid.ParamAttr(name=tag + 'init_state_fc_w'),
        bias_attr=fluid.ParamAttr(name=tag + 'init_state_fc_b'),
        size=hidden_size,
        act=None)

    def custom_dynamic_rnn(p_vec, init_state, hidden_size, para_name, args):
        tag = para_name + "custom_dynamic_rnn_"

        def static_rnn(step,
                       p_vec=p_vec,
                       init_state=None,
                       para_name='',
                       args=args):
            tag = para_name + "static_rnn_"
            ctx = layers.fc(
                input=p_vec,
                param_attr=fluid.ParamAttr(name=tag + 'context_fc_w'),
                bias_attr=fluid.ParamAttr(name=tag + 'context_fc_b'),
                size=hidden_size,
                act=None)

            beta = []
            c_prev = init_state
            m_prev = init_state
            for i in range(step):
                m_prev0 = layers.fc(
                    input=m_prev,
                    size=hidden_size,
                    act=None,
                    param_attr=fluid.ParamAttr(name=tag + 'm_prev0_fc_w'),
                    bias_attr=fluid.ParamAttr(name=tag + 'm_prev0_fc_b'))
                m_prev1 = layers.sequence_expand(x=m_prev0, y=ctx)

                Fk = ctx + m_prev1
                Fk = layers.tanh(Fk)
                logits = layers.fc(
                    input=Fk,
                    size=1,
                    act=None,
                    param_attr=fluid.ParamAttr(name=tag + 'logits_fc_w'),
                    bias_attr=fluid.ParamAttr(name=tag + 'logits_fc_b'))

                scores = layers.sequence_softmax(input=logits)
                attn_ctx = layers.elementwise_mul(x=p_vec, y=scores, axis=0)
                attn_ctx = layers.sequence_pool(input=attn_ctx, pool_type='sum')

                hidden_t, cell_t = lstm_step(
                    attn_ctx,
                    hidden_t_prev=m_prev,
                    cell_t_prev=c_prev,
                    size=hidden_size,
                    para_name=tag,
                    args=args)
                m_prev = hidden_t
                c_prev = cell_t
                beta.append(scores)
            return beta

        return static_rnn(
            2, p_vec=p_vec, init_state=init_state, para_name=para_name)

    fw_outputs = custom_dynamic_rnn(p_vec, init_state, hidden_size, tag + "fw_",
                                    args)
    bw_outputs = custom_dynamic_rnn(p_vec, init_state, hidden_size, tag + "bw_",
                                    args)

    start_prob = layers.elementwise_add(
        x=fw_outputs[0], y=bw_outputs[1], axis=0) / 2
    end_prob = layers.elementwise_add(
        x=fw_outputs[1], y=bw_outputs[0], axis=0) / 2
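    # The forward pointer network emits (start, end) while the backward one
    # emits (end, start), so fw_outputs[0] is paired with bw_outputs[1] for the
    # start distribution and fw_outputs[1] with bw_outputs[0] for the end
    # (assumed pairing).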

    return start_prob, end_prob
Exemple #28
0
def lm_model(hidden_size,
             vocab_size,
             batch_size,
             num_layers=2,
             num_steps=20,
             init_scale=0.1,
             dropout=None,
             rnn_model='static',
             use_py_reader=False):
    def padding_rnn(input_embedding, len=3, init_hidden=None, init_cell=None):
        weight_1_arr = []
        weight_2_arr = []
        bias_arr = []
        hidden_array = []
        cell_array = []
        mask_array = []
        for i in range(num_layers):
            weight_1 = layers.create_parameter(
                [hidden_size * 2, hidden_size * 4],
                dtype="float32",
                name="fc_weight1_" + str(i),
                default_initializer=fluid.initializer.UniformInitializer(
                    low=-init_scale, high=init_scale))
            weight_1_arr.append(weight_1)
            bias_1 = layers.create_parameter(
                [hidden_size * 4],
                dtype="float32",
                name="fc_bias1_" + str(i),
                default_initializer=fluid.initializer.Constant(0.0))
            bias_arr.append(bias_1)

            pre_hidden = layers.slice(
                init_hidden, axes=[0], starts=[i], ends=[i + 1])
            pre_cell = layers.slice(
                init_cell, axes=[0], starts=[i], ends=[i + 1])
            pre_hidden = layers.reshape(pre_hidden, shape=[-1, hidden_size])
            pre_cell = layers.reshape(pre_cell, shape=[-1, hidden_size])
            hidden_array.append(pre_hidden)
            cell_array.append(pre_cell)

        input_embedding = layers.transpose(input_embedding, perm=[1, 0, 2])
        rnn = PaddingRNN()

        with rnn.step():
            input = rnn.step_input(input_embedding)
            for k in range(num_layers):
                pre_hidden = rnn.memory(init=hidden_array[k])
                pre_cell = rnn.memory(init=cell_array[k])
                weight_1 = weight_1_arr[k]
                bias = bias_arr[k]

                nn = layers.concat([input, pre_hidden], 1)
                gate_input = layers.matmul(x=nn, y=weight_1)

                gate_input = layers.elementwise_add(gate_input, bias)
                i = layers.slice(
                    gate_input, axes=[1], starts=[0], ends=[hidden_size])
                j = layers.slice(
                    gate_input,
                    axes=[1],
                    starts=[hidden_size],
                    ends=[hidden_size * 2])
                f = layers.slice(
                    gate_input,
                    axes=[1],
                    starts=[hidden_size * 2],
                    ends=[hidden_size * 3])
                o = layers.slice(
                    gate_input,
                    axes=[1],
                    starts=[hidden_size * 3],
                    ends=[hidden_size * 4])
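                # standard LSTM cell update from the four gate slices:
                #   c_t = c_{t-1} * sigmoid(f) + sigmoid(i) * tanh(j)
                #   h_t = tanh(c_t) * sigmoid(o)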

                c = pre_cell * layers.sigmoid(f) + layers.sigmoid(
                    i) * layers.tanh(j)
                m = layers.tanh(c) * layers.sigmoid(o)

                rnn.update_memory(pre_hidden, m)
                rnn.update_memory(pre_cell, c)

                rnn.step_output(m)
                rnn.step_output(c)

                input = m

                if dropout is not None and dropout > 0.0:
                    input = layers.dropout(
                        input,
                        dropout_prob=dropout,
                        dropout_implementation='upscale_in_train')

            rnn.step_output(input)
        rnnout = rnn()

        last_hidden_array = []
        last_cell_array = []
        real_res = rnnout[-1]
        for i in range(num_layers):
            m = rnnout[i * 2]
            c = rnnout[i * 2 + 1]
            m.stop_gradient = True
            c.stop_gradient = True
            last_h = layers.slice(
                m, axes=[0], starts=[num_steps - 1], ends=[num_steps])
            last_hidden_array.append(last_h)
            last_c = layers.slice(
                c, axes=[0], starts=[num_steps - 1], ends=[num_steps])
            last_cell_array.append(last_c)
        real_res = layers.transpose(x=real_res, perm=[1, 0, 2])
        last_hidden = layers.concat(last_hidden_array, 0)
        last_cell = layers.concat(last_cell_array, 0)

        return real_res, last_hidden, last_cell

    def encoder_static(input_embedding, len=3, init_hidden=None,
                       init_cell=None):

        weight_1_arr = []
        weight_2_arr = []
        bias_arr = []
        hidden_array = []
        cell_array = []
        mask_array = []
        for i in range(num_layers):
            weight_1 = layers.create_parameter(
                [hidden_size * 2, hidden_size * 4],
                dtype="float32",
                name="fc_weight1_" + str(i),
                default_initializer=fluid.initializer.UniformInitializer(
                    low=-init_scale, high=init_scale))
            weight_1_arr.append(weight_1)
            bias_1 = layers.create_parameter(
                [hidden_size * 4],
                dtype="float32",
                name="fc_bias1_" + str(i),
                default_initializer=fluid.initializer.Constant(0.0))
            bias_arr.append(bias_1)

            pre_hidden = layers.slice(
                init_hidden, axes=[0], starts=[i], ends=[i + 1])
            pre_cell = layers.slice(
                init_cell, axes=[0], starts=[i], ends=[i + 1])
            pre_hidden = layers.reshape(
                pre_hidden, shape=[-1, hidden_size], inplace=True)
            pre_cell = layers.reshape(
                pre_cell, shape=[-1, hidden_size], inplace=True)
            hidden_array.append(pre_hidden)
            cell_array.append(pre_cell)

        res = []
        sliced_inputs = layers.split(
            input_embedding, num_or_sections=len, dim=1)

        for index in range(len):
            input = sliced_inputs[index]
            input = layers.reshape(input, shape=[-1, hidden_size], inplace=True)
            for k in range(num_layers):
                pre_hidden = hidden_array[k]
                pre_cell = cell_array[k]
                weight_1 = weight_1_arr[k]
                bias = bias_arr[k]

                nn = layers.concat([input, pre_hidden], 1)
                gate_input = layers.matmul(x=nn, y=weight_1)

                gate_input = layers.elementwise_add(gate_input, bias)
                i, j, f, o = layers.split(gate_input, num_or_sections=4, dim=-1)

                try:
                    from paddle.fluid.contrib.layers import fused_elemwise_activation
                    # fluid.contrib.layers.fused_elemwise_activation can perform
                    # a fused operation such as:
                    # 1) x + sigmoid(y); x + tanh(y)
                    # 2) tanh(x + y)
                    # For now the unary operations supported by this fused op are
                    # limited; future versions of paddle.fluid will support more
                    # unary operations and do this kind of fusion automatically.
                    # layers.sigmoid(i) * layers.tanh(j)
                    tmp0 = fused_elemwise_activation(
                        x=layers.tanh(j),
                        y=i,
                        functor_list=['elementwise_mul', 'sigmoid'],
                        save_intermediate_out=False)
                    # pre_cell * layers.sigmoid(f)
                    tmp1 = fused_elemwise_activation(
                        x=pre_cell,
                        y=f,
                        functor_list=['elementwise_mul', 'sigmoid'],
                        save_intermediate_out=False)
                    c = tmp0 + tmp1
                    # layers.tanh(c) * layers.sigmoid(o)
                    m = fused_elemwise_activation(
                        x=layers.tanh(c),
                        y=o,
                        functor_list=['elementwise_mul', 'sigmoid'],
                        save_intermediate_out=False)
                except ImportError:
                    c = pre_cell * layers.sigmoid(f) + layers.sigmoid(
                        i) * layers.tanh(j)
                    m = layers.tanh(c) * layers.sigmoid(o)

                hidden_array[k] = m
                cell_array[k] = c
                input = m

                if dropout is not None and dropout > 0.0:
                    input = layers.dropout(
                        input,
                        dropout_prob=dropout,
                        dropout_implementation='upscale_in_train')

            res.append(input)

        last_hidden = layers.concat(hidden_array, 1)
        last_hidden = layers.reshape(
            last_hidden, shape=[-1, num_layers, hidden_size], inplace=True)
        last_hidden = layers.transpose(x=last_hidden, perm=[1, 0, 2])

        last_cell = layers.concat(cell_array, 1)
        last_cell = layers.reshape(
            last_cell, shape=[-1, num_layers, hidden_size])
        last_cell = layers.transpose(x=last_cell, perm=[1, 0, 2])

        real_res = layers.concat(res, 0)
        real_res = layers.reshape(
            real_res, shape=[len, -1, hidden_size], inplace=True)
        real_res = layers.transpose(x=real_res, perm=[1, 0, 2])

        return real_res, last_hidden, last_cell

    batch_size_each = batch_size
    if use_py_reader:
        feed_shapes = [[batch_size_each, num_steps, 1],
                       [batch_size_each * num_steps, 1]]
        py_reader = fluid.layers.py_reader(
            capacity=16, shapes=feed_shapes, dtypes=['int64', 'int64'])
        x, y = fluid.layers.read_file(py_reader)
    else:
        x = layers.data(
            name="x",
            shape=[batch_size_each, num_steps, 1],
            dtype='int64',
            append_batch_size=False)
        y = layers.data(
            name="y",
            shape=[batch_size_each * num_steps, 1],
            dtype='int64',
            append_batch_size=False)

    init_hidden = layers.data(
        name="init_hidden",
        shape=[num_layers, batch_size_each, hidden_size],
        dtype='float32',
        append_batch_size=False)
    init_cell = layers.data(
        name="init_cell",
        shape=[num_layers, batch_size_each, hidden_size],
        dtype='float32',
        append_batch_size=False)

    init_cell.persistable = True
    init_hidden.persistable = True

    init_hidden_reshape = layers.reshape(
        init_hidden, shape=[num_layers, -1, hidden_size])
    init_cell_reshape = layers.reshape(
        init_cell, shape=[num_layers, -1, hidden_size])

    x_emb = layers.embedding(
        input=x,
        size=[vocab_size, hidden_size],
        dtype='float32',
        is_sparse=False,
        param_attr=fluid.ParamAttr(
            name='embedding_para',
            initializer=fluid.initializer.UniformInitializer(
                low=-init_scale, high=init_scale)))

    x_emb = layers.reshape(
        x_emb, shape=[-1, num_steps, hidden_size], inplace=True)
    if dropout is not None and dropout > 0.0:
        x_emb = layers.dropout(
            x_emb,
            dropout_prob=dropout,
            dropout_implementation='upscale_in_train')

    if rnn_model == "padding":
        rnn_out, last_hidden, last_cell = padding_rnn(
            x_emb,
            len=num_steps,
            init_hidden=init_hidden_reshape,
            init_cell=init_cell_reshape)
    elif rnn_model == "static":
        rnn_out, last_hidden, last_cell = encoder_static(
            x_emb,
            len=num_steps,
            init_hidden=init_hidden_reshape,
            init_cell=init_cell_reshape)
    elif rnn_model == "cudnn":
        x_emb = layers.transpose(x_emb, perm=[1, 0, 2])
        rnn_out, last_hidden, last_cell = layers.lstm(
            x_emb,
            init_hidden_reshape,
            init_cell_reshape,
            num_steps,
            hidden_size,
            num_layers,
            is_bidirec=False,
            default_initializer=fluid.initializer.UniformInitializer(
                low=-init_scale, high=init_scale))
        rnn_out = layers.transpose(rnn_out, perm=[1, 0, 2])
    elif rnn_model == "basic_lstm":
        rnn_out, last_hidden, last_cell = basic_lstm( x_emb, init_hidden, init_cell, hidden_size, \
                num_layers=num_layers, batch_first=True, dropout_prob=dropout, \
                param_attr = ParamAttr( initializer=fluid.initializer.UniformInitializer(low=-init_scale, high=init_scale) ), \
                bias_attr = ParamAttr( initializer = fluid.initializer.Constant(0.0) ), \
                forget_bias = 0.0)
    else:
        print("type not support")
        return

    rnn_out = layers.reshape(
        rnn_out, shape=[-1, num_steps, hidden_size], inplace=True)

    softmax_weight = layers.create_parameter(
        [hidden_size, vocab_size],
        dtype="float32",
        name="softmax_weight",
        default_initializer=fluid.initializer.UniformInitializer(
            low=-init_scale, high=init_scale))
    softmax_bias = layers.create_parameter(
        [vocab_size],
        dtype="float32",
        name='softmax_bias',
        default_initializer=fluid.initializer.UniformInitializer(
            low=-init_scale, high=init_scale))

    projection = layers.matmul(rnn_out, softmax_weight)
    projection = layers.elementwise_add(projection, softmax_bias)
    projection = layers.reshape(
        projection, shape=[-1, vocab_size], inplace=True)

    loss = layers.softmax_with_cross_entropy(
        logits=projection, label=y, soft_label=False)

    loss = layers.reshape(loss, shape=[-1, num_steps], inplace=True)
    loss = layers.reduce_mean(loss, dim=[0])
    loss = layers.reduce_sum(loss)

    loss.persistable = True
    last_cell.persistable = True
    last_hidden.persistable = True

    # Feed last_hidden and last_cell back into init_hidden and init_cell so
    # they can be used directly by the next batch. This avoids fetching
    # last_hidden/last_cell and re-feeding init_hidden/init_cell at every
    # training step.
    layers.assign(input=last_cell, output=init_cell)
    layers.assign(input=last_hidden, output=init_hidden)

    feeding_list = ['x', 'y', 'init_hidden', 'init_cell']
    if use_py_reader:
        return loss, last_hidden, last_cell, feeding_list, py_reader
    else:
        return loss, last_hidden, last_cell, feeding_list
Exemple #29
0
    def inference(self, model, inputs, outputs):
        """
        Run inference.

        Args:
            inputs(dict): Its key is input name(str) and its value is a Variable.
            model(object): A generate model. Need to implement `_generation_network` and `_calc_logits`.

        Returns:
            dict(str:Variable): Its key is output name(str) and its value is a Variable.
        """
        # prepare while loop
        max_len = layers.fill_constant(
            shape=[1], dtype="int64", value=self.max_dec_len, force_cpu=True)
        min_len = layers.fill_constant(
            shape=[1], dtype="int64", value=self.min_dec_len, force_cpu=True)
        step_idx = layers.fill_constant(
            shape=[1], dtype="int64", value=0, force_cpu=True)

        ids = layers.array_write(layers.reshape(inputs["tgt_ids"], (-1, 1)), step_idx)
        pos_biases = layers.array_write(layers.reshape(inputs["tgt_pos"], (-1, 1)), step_idx)
        scores = layers.array_write(inputs["init_score"], step_idx)
        tgt_generation_mask = layers.array_write(inputs["tgt_generation_mask"], step_idx)
        parent_idx = inputs["parent_idx"]

        if self.decoding_strategy == "beam_search":
            beam_size = self.beam_size
        else:
            beam_size = 1

        eos_penalty = np.zeros(self.vocab_size, dtype="float32")
        eos_penalty[self.eos_id] = -1e9
        eos_penalty = layers.assign(eos_penalty)

        token_penalty = np.zeros(self.vocab_size, dtype="float32")
        token_penalty[self.unk_id] = -1e9
        if self.mask_id >= 0:
            token_penalty[self.mask_id] = -1e9
        token_penalty = layers.assign(token_penalty)

        # start while loop
        cond = layers.less_than(x=step_idx, y=max_len)
        while_op = layers.While(cond)
        with while_op.block():
            pre_ids = layers.array_read(array=ids, i=step_idx)
            pre_ids = layers.reshape(pre_ids, (-1, 1, 1), inplace=True)
            pre_scores = layers.array_read(array=scores, i=step_idx)
            pos_bias = layers.array_read(array=pos_biases, i=step_idx)
            pos_bias = layers.gather(input=pos_bias, index=parent_idx)

            tmp_tgt_generation_mask = layers.array_read(tgt_generation_mask, i=step_idx)
            dtype = tmp_tgt_generation_mask.dtype

            append_mask = layers.fill_constant_batch_size_like(
                    input=pre_ids,
                    value=1.0,
                    shape=[-1, 1, 1],
                    dtype=dtype)
            tmp_tgt_generation_mask = layers.concat([tmp_tgt_generation_mask, append_mask], axis=2)
            pre_mask = tmp_tgt_generation_mask = layers.gather(input=tmp_tgt_generation_mask, index=parent_idx)

            pre_sent = layers.fill_constant_batch_size_like(
                    input=pre_mask,
                    value=1,
                    shape=[-1, 1, 1],
                    dtype=pre_ids.dtype)

            if self.continuous_position:
                pre_pos = layers.elementwise_mul(
                    x=layers.fill_constant_batch_size_like(
                        input=pre_mask,
                        value=1,
                        shape=[-1, 1, 1],
                        dtype=pre_ids.dtype), y=step_idx, axis=0) + pos_bias
            else:
                pre_pos = layers.elementwise_mul(
                    x=layers.fill_constant_batch_size_like(
                        input=pre_mask,
                        value=1,
                        shape=[-1, 1, 1],
                        dtype=pre_ids.dtype), y=step_idx, axis=0)

            if self.use_role:
                pre_role = layers.fill_constant_batch_size_like(
                        input=pre_mask,
                        value=0,
                        shape=[-1, 1, 1],
                        dtype=pre_ids.dtype)
            else:
                pre_role = None

            dec_out, _ = model._generation_network(
                token_ids=pre_ids,
                type_ids=pre_sent,
                pos_ids=pre_pos,
                role_ids=pre_role,
                generation_mask=tmp_tgt_generation_mask,
                gather_idx=parent_idx)
            logits = model._calc_logits(dec_out)

            # ignore unk and mask token
            if self.ignore_unk:
                logits = layers.elementwise_add(logits, token_penalty, axis=1)

            # min dec length
            min_len_cond = layers.less_than(x=step_idx, y=min_len)
            def min_len_penalty():
                """Plus minimum length penalty."""
                return layers.elementwise_add(logits, eos_penalty, axis=1)
            def no_penalty():
                """No penalty."""
                return logits
            logits = layers.case([(min_len_cond, min_len_penalty)], default=no_penalty)

            # get probs
            probs = layers.softmax(logits / self.temperature)

            if self.decoding_strategy == "beam_search":
                topk_scores, topk_indices = layers.topk(
                    input=probs, k=beam_size)
            else:
                if self.decoding_strategy.startswith("sampling"):
                    sampling_ids = layers.sampling_id(probs, dtype="int")
                elif self.decoding_strategy.startswith("topk_sampling"):
                    topk_probs, _ = layers.topk(input=probs, k=self.topk)
                    ge_cond = layers.cast(
                        layers.greater_equal(
                            probs,
                            layers.unsqueeze(topk_probs[:, -1], [1])),
                        "float32")
                    old_probs = probs
                    probs = probs * ge_cond / layers.reduce_sum(topk_probs, dim=-1, keep_dim=True)
                    sampling_ids = layers.sampling_id(probs, dtype="int")
                    probs = old_probs
                else:
                    raise ValueError(self.decoding_strategy)

                sampling_scores = layers.one_hot(
                    layers.unsqueeze(sampling_ids, [1]), probs.shape[1]
                )
                sampling_scores = sampling_scores * probs - (1 - sampling_scores) * 1e3
                topk_scores, topk_indices = layers.topk(
                    input=sampling_scores, k=1)

            pre_len = layers.cast(step_idx, "float32")
            layers.increment(x=step_idx, value=1.0, in_place=True)
            cur_len = layers.cast(step_idx, "float32")

            # update scores
            if self.length_average:
                accu_scores = layers.elementwise_add(
                    x=layers.log(topk_scores), y=pre_scores * pre_len, axis=0) / cur_len
            elif self.length_penalty > 0:
                pre_lp = layers.pow((5 + pre_len) / 6, self.length_penalty)
                cur_lp = layers.pow((5 + cur_len) / 6, self.length_penalty)
                accu_scores = layers.elementwise_add(
                    x=layers.log(topk_scores), y=pre_scores * pre_lp, axis=0) / cur_lp
            else:
                accu_scores = layers.elementwise_add(
                    x=layers.log(topk_scores), y=pre_scores, axis=0)
            topk_indices = layers.lod_reset(topk_indices, pre_ids)
            accu_scores = layers.lod_reset(accu_scores, pre_ids)
            selected_ids, selected_scores, gather_idx = layers.beam_search(
                pre_ids=pre_ids,
                pre_scores=pre_scores,
                ids=topk_indices,
                scores=accu_scores,
                beam_size=beam_size,
                end_id=self.eos_id,
                return_parent_idx=True)

            layers.array_write(selected_ids, i=step_idx, array=ids)
            layers.array_write(selected_scores, i=step_idx, array=scores)
            layers.array_write(pre_mask, i=step_idx, array=tgt_generation_mask)
            layers.array_write(pos_bias, i=step_idx, array=pos_biases)

            layers.assign(gather_idx, parent_idx)

            length_cond = layers.less_than(x=step_idx, y=max_len)
            finish_cond = layers.logical_not(layers.is_empty(x=selected_ids))
            layers.logical_and(x=length_cond, y=finish_cond, out=cond)

        finished_ids, finished_scores = layers.beam_search_decode(
            ids, scores, beam_size=beam_size, end_id=self.eos_id)

        predictions = {
            "finished_ids": finished_ids,
            "finished_scores": finished_scores,
            "token_ids": inputs["token_ids"],
            "data_id": inputs["data_id"]
        }
        return predictions
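The topk_sampling branch above zeroes every probability below the k-th largest and renormalizes by the retained top-k mass before drawing a sample. A numpy sketch of the same filter (values hypothetical; ties at the k-th probability are kept here, matching the greater_equal comparison):

import numpy as np

probs = np.array([0.5, 0.2, 0.2, 0.1])
k = 2
kth = np.sort(probs)[-k]                      # smallest retained probability
filtered = np.where(probs >= kth, probs, 0.0)
filtered /= filtered.sum()                    # renormalize the retained mass
sample = np.random.choice(len(probs), p=filtered)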
Exemple #30
0
    def padding_rnn(input_embedding, len=3, init_hidden=None, init_cell=None):
        weight_1_arr = []
        weight_2_arr = []
        bias_arr = []
        hidden_array = []
        cell_array = []
        mask_array = []
        for i in range(num_layers):
            weight_1 = layers.create_parameter(
                [hidden_size * 2, hidden_size * 4],
                dtype="float32",
                name="fc_weight1_" + str(i),
                default_initializer=fluid.initializer.UniformInitializer(
                    low=-init_scale, high=init_scale))
            weight_1_arr.append(weight_1)
            bias_1 = layers.create_parameter(
                [hidden_size * 4],
                dtype="float32",
                name="fc_bias1_" + str(i),
                default_initializer=fluid.initializer.Constant(0.0))
            bias_arr.append(bias_1)

            pre_hidden = layers.slice(
                init_hidden, axes=[0], starts=[i], ends=[i + 1])
            pre_cell = layers.slice(
                init_cell, axes=[0], starts=[i], ends=[i + 1])
            pre_hidden = layers.reshape(pre_hidden, shape=[-1, hidden_size])
            pre_cell = layers.reshape(pre_cell, shape=[-1, hidden_size])
            hidden_array.append(pre_hidden)
            cell_array.append(pre_cell)

        input_embedding = layers.transpose(input_embedding, perm=[1, 0, 2])
        rnn = PaddingRNN()

        with rnn.step():
            input = rnn.step_input(input_embedding)
            for k in range(num_layers):
                pre_hidden = rnn.memory(init=hidden_array[k])
                pre_cell = rnn.memory(init=cell_array[k])
                weight_1 = weight_1_arr[k]
                bias = bias_arr[k]

                nn = layers.concat([input, pre_hidden], 1)
                gate_input = layers.matmul(x=nn, y=weight_1)

                gate_input = layers.elementwise_add(gate_input, bias)
                i = layers.slice(
                    gate_input, axes=[1], starts=[0], ends=[hidden_size])
                j = layers.slice(
                    gate_input,
                    axes=[1],
                    starts=[hidden_size],
                    ends=[hidden_size * 2])
                f = layers.slice(
                    gate_input,
                    axes=[1],
                    starts=[hidden_size * 2],
                    ends=[hidden_size * 3])
                o = layers.slice(
                    gate_input,
                    axes=[1],
                    starts=[hidden_size * 3],
                    ends=[hidden_size * 4])

                c = pre_cell * layers.sigmoid(f) + layers.sigmoid(
                    i) * layers.tanh(j)
                m = layers.tanh(c) * layers.sigmoid(o)

                rnn.update_memory(pre_hidden, m)
                rnn.update_memory(pre_cell, c)

                rnn.step_output(m)
                rnn.step_output(c)

                input = m

                if dropout is not None and dropout > 0.0:
                    input = layers.dropout(
                        input,
                        dropout_prob=dropout,
                        dropout_implementation='upscale_in_train')

            rnn.step_output(input)
        rnnout = rnn()

        last_hidden_array = []
        last_cell_array = []
        real_res = rnnout[-1]
        for i in range(num_layers):
            m = rnnout[i * 2]
            c = rnnout[i * 2 + 1]
            m.stop_gradient = True
            c.stop_gradient = True
            last_h = layers.slice(
                m, axes=[0], starts=[num_steps - 1], ends=[num_steps])
            last_hidden_array.append(last_h)
            last_c = layers.slice(
                c, axes=[0], starts=[num_steps - 1], ends=[num_steps])
            last_cell_array.append(last_c)
        real_res = layers.transpose(x=real_res, perm=[1, 0, 2])
        last_hidden = layers.concat(last_hidden_array, 0)
        last_cell = layers.concat(last_cell_array, 0)

        return real_res, last_hidden, last_cell
Exemple #31
0
    def beam_search():
        max_len = layers.fill_constant(
            shape=[1], dtype=start_tokens.dtype, value=max_out_len)
        step_idx = layers.fill_constant(
            shape=[1], dtype=start_tokens.dtype, value=0)
        cond = layers.less_than(x=step_idx, y=max_len)
        while_op = layers.While(cond)
        # array states will be stored for each step.
        ids = layers.array_write(start_tokens, step_idx)
        scores = layers.array_write(init_scores, step_idx)
        # cell states will be overwritten at each step.
        # caches contain the states of previous steps to reduce redundant
        # computation in the decoder.
        caches = [{
            "k": layers.fill_constant_batch_size_like(
                input=start_tokens,
                shape=[-1, 0, d_model],
                dtype=enc_output.dtype,
                value=0),
            "v": layers.fill_constant_batch_size_like(
                input=start_tokens,
                shape=[-1, 0, d_model],
                dtype=enc_output.dtype,
                value=0)
        } for i in range(n_layer)]
        with while_op.block():
            pre_ids = layers.array_read(array=ids, i=step_idx)
            pre_scores = layers.array_read(array=scores, i=step_idx)
            # sequence_expand can gather sequences according to lod, so it can
            # be used in beam search to select states for the chosen ids.
            pre_src_attn_bias = layers.sequence_expand(
                x=trg_src_attn_bias, y=pre_scores)
            pre_enc_output = layers.sequence_expand(x=enc_output, y=pre_scores)
            pre_caches = [{
                "k": layers.sequence_expand(
                    x=cache["k"], y=pre_scores),
                "v": layers.sequence_expand(
                    x=cache["v"], y=pre_scores),
            } for cache in caches]
            pre_pos = layers.elementwise_mul(
                x=layers.fill_constant_batch_size_like(
                    input=pre_enc_output,  # can't use pre_ids here since it has lod
                    value=1,
                    shape=[-1, 1],
                    dtype=pre_ids.dtype),
                y=layers.increment(
                    x=step_idx, value=1.0, in_place=False),
                axis=0)
            logits = wrap_decoder(
                trg_vocab_size,
                max_in_len,
                n_layer,
                n_head,
                d_key,
                d_value,
                d_model,
                d_inner_hid,
                dropout_rate,
                weight_sharing,
                dec_inputs=(
                    pre_ids, pre_pos, None, pre_src_attn_bias, trg_data_shape,
                    slf_attn_pre_softmax_shape, slf_attn_post_softmax_shape,
                    src_attn_pre_softmax_shape, src_attn_post_softmax_shape),
                enc_output=pre_enc_output,
                caches=pre_caches)
            topk_scores, topk_indices = layers.topk(
                input=layers.softmax(logits), k=beam_size)
            accu_scores = layers.elementwise_add(
                x=layers.log(topk_scores),
                y=layers.reshape(
                    pre_scores, shape=[-1]),
                axis=0)
            # beam_search op uses lod to distinguish branches.
            topk_indices = layers.lod_reset(topk_indices, pre_ids)
            selected_ids, selected_scores = layers.beam_search(
                pre_ids=pre_ids,
                pre_scores=pre_scores,
                ids=topk_indices,
                scores=accu_scores,
                beam_size=beam_size,
                end_id=eos_idx)
            layers.increment(x=step_idx, value=1.0, in_place=True)
            # update states
            layers.array_write(selected_ids, i=step_idx, array=ids)
            layers.array_write(selected_scores, i=step_idx, array=scores)
            layers.assign(pre_src_attn_bias, trg_src_attn_bias)
            layers.assign(pre_enc_output, enc_output)
            for i in range(n_layer):
                layers.assign(pre_caches[i]["k"], caches[i]["k"])
                layers.assign(pre_caches[i]["v"], caches[i]["v"])
            layers.assign(
                layers.elementwise_add(
                    x=slf_attn_pre_softmax_shape,
                    y=attn_pre_softmax_shape_delta),
                slf_attn_pre_softmax_shape)
            layers.assign(
                layers.elementwise_add(
                    x=slf_attn_post_softmax_shape,
                    y=attn_post_softmax_shape_delta),
                slf_attn_post_softmax_shape)
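            # (assumed intent) the pre/post softmax shape variables emulate a
            # dynamic shape: each step they are advanced by a fixed delta so
            # the 2-D softmax reshape tracks the growing decoded length.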

            length_cond = layers.less_than(x=step_idx, y=max_len)
            finish_cond = layers.logical_not(layers.is_empty(x=selected_ids))
            layers.logical_and(x=length_cond, y=finish_cond, out=cond)

        finished_ids, finished_scores = layers.beam_search_decode(
            ids, scores, beam_size=beam_size, end_id=eos_idx)
        return finished_ids, finished_scores
Exemple #32
def network(items_num, hidden_size, step, bs):
    stdv = 1.0 / math.sqrt(hidden_size)

    items = fluid.data(name="items", shape=[bs, -1],
                       dtype="int64")  #[batch_size, uniq_max]
    seq_index = fluid.data(name="seq_index", shape=[bs, -1, 2],
                           dtype="int32")  #[batch_size, seq_max, 2]
    last_index = fluid.data(name="last_index", shape=[bs, 2],
                            dtype="int32")  #[batch_size, 2]
    adj_in = fluid.data(name="adj_in", shape=[bs, -1, -1],
                        dtype="float32")  #[batch_size, seq_max, seq_max]
    adj_out = fluid.data(name="adj_out", shape=[bs, -1, -1],
                         dtype="float32")  #[batch_size, seq_max, seq_max]
    mask = fluid.data(name="mask", shape=[bs, -1, 1],
                      dtype="float32")  #[batch_size, seq_max, 1]
    label = fluid.data(name="label", shape=[bs, 1],
                       dtype="int64")  #[batch_size, 1]

    datas = [items, seq_index, last_index, adj_in, adj_out, mask, label]
    py_reader = fluid.io.DataLoader.from_generator(capacity=256,
                                                   feed_list=datas,
                                                   iterable=False)
    feed_datas = datas

    items_emb = fluid.embedding(
        input=items,
        param_attr=fluid.ParamAttr(name="emb",
                                   initializer=fluid.initializer.Uniform(
                                       low=-stdv, high=stdv)),
        size=[items_num, hidden_size])  #[batch_size, uniq_max, h]

    pre_state = items_emb
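    # `step` rounds of gated message passing over the session graph: features
    # aggregated over incoming and outgoing adjacency are fed to a GRU cell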
    for i in range(step):
        pre_state = layers.reshape(x=pre_state, shape=[bs, -1, hidden_size])
        state_in = layers.fc(
            input=pre_state,
            name="state_in",
            size=hidden_size,
            act=None,
            num_flatten_dims=2,
            param_attr=fluid.ParamAttr(
                initializer=fluid.initializer.Uniform(low=-stdv, high=stdv)),
            bias_attr=fluid.ParamAttr(initializer=fluid.initializer.Uniform(
                low=-stdv, high=stdv)))  #[batch_size, uniq_max, h]
        state_out = layers.fc(
            input=pre_state,
            name="state_out",
            size=hidden_size,
            act=None,
            num_flatten_dims=2,
            param_attr=fluid.ParamAttr(
                initializer=fluid.initializer.Uniform(low=-stdv, high=stdv)),
            bias_attr=fluid.ParamAttr(initializer=fluid.initializer.Uniform(
                low=-stdv, high=stdv)))  #[batch_size, uniq_max, h]

        state_adj_in = layers.matmul(adj_in,
                                     state_in)  #[batch_size, uniq_max, h]
        state_adj_out = layers.matmul(adj_out,
                                      state_out)  #[batch_size, uniq_max, h]

        gru_input = layers.concat([state_adj_in, state_adj_out], axis=2)

        gru_input = layers.reshape(x=gru_input, shape=[-1, hidden_size * 2])
        gru_fc = layers.fc(input=gru_input,
                           name="gru_fc",
                           size=3 * hidden_size,
                           bias_attr=False)
        pre_state, _, _ = fluid.layers.gru_unit(input=gru_fc,
                                                hidden=layers.reshape(
                                                    x=pre_state,
                                                    shape=[-1, hidden_size]),
                                                size=3 * hidden_size)

    final_state = layers.reshape(pre_state, shape=[bs, -1, hidden_size])
    seq = layers.gather_nd(final_state, seq_index)
    last = layers.gather_nd(final_state, last_index)

    seq_fc = layers.fc(
        input=seq,
        name="seq_fc",
        size=hidden_size,
        bias_attr=False,
        act=None,
        num_flatten_dims=2,
        param_attr=fluid.ParamAttr(initializer=fluid.initializer.Uniform(
            low=-stdv, high=stdv)))  #[batch_size, seq_max, h]
    last_fc = layers.fc(
        input=last,
        name="last_fc",
        size=hidden_size,
        bias_attr=False,
        act=None,
        num_flatten_dims=1,
        param_attr=fluid.ParamAttr(initializer=fluid.initializer.Uniform(
            low=-stdv, high=stdv)))  #[batch_size, h]

    seq_fc_t = layers.transpose(seq_fc, perm=[1, 0,
                                              2])  #[seq_max, batch_size, h]
    add = layers.elementwise_add(seq_fc_t, last_fc)  #[seq_max, batch_size, h]
    b = layers.create_parameter(
        shape=[hidden_size],
        dtype='float32',
        default_initializer=fluid.initializer.Constant(value=0.0))  #[h]
    add = layers.elementwise_add(add, b)  #[seq_max, batch_size, h]

    add_sigmoid = layers.sigmoid(add)  #[seq_max, batch_size, h]
    add_sigmoid = layers.transpose(add_sigmoid,
                                   perm=[1, 0, 2])  #[batch_size, seq_max, h]

    weight = layers.fc(
        input=add_sigmoid,
        name="weight_fc",
        size=1,
        act=None,
        num_flatten_dims=2,
        bias_attr=False,
        param_attr=fluid.ParamAttr(initializer=fluid.initializer.Uniform(
            low=-stdv, high=stdv)))  #[batch_size, seq_max, 1]
    weight *= mask
    weight_mask = layers.elementwise_mul(seq, weight,
                                         axis=0)  #[batch_size, seq_max, h]
    global_attention = layers.reduce_sum(weight_mask, dim=1)  #[batch_size, h]

    final_attention = layers.concat([global_attention, last],
                                    axis=1)  #[batch_size, 2*h]
    final_attention_fc = layers.fc(
        input=final_attention,
        name="final_attention_fc",
        size=hidden_size,
        bias_attr=False,
        act=None,
        param_attr=fluid.ParamAttr(initializer=fluid.initializer.Uniform(
            low=-stdv, high=stdv)))  #[batch_size, h]

    all_vocab = layers.create_global_var(shape=[items_num - 1],
                                         value=0,
                                         dtype="int64",
                                         persistable=True,
                                         name="all_vocab")

    all_emb = fluid.embedding(input=all_vocab,
                              param_attr=fluid.ParamAttr(
                                  name="emb",
                                  initializer=fluid.initializer.Uniform(
                                      low=-stdv, high=stdv)),
                              size=[items_num, hidden_size])  #[all_vocab, h]

    logits = layers.matmul(x=final_attention_fc, y=all_emb,
                           transpose_y=True)  #[batch_size, all_vocab]
    softmax = layers.softmax_with_cross_entropy(logits=logits,
                                                label=label)  #[batch_size, 1]
    loss = layers.reduce_mean(softmax)  # [1]
    acc = layers.accuracy(input=logits, label=label, k=50)
    return loss, acc, py_reader, feed_datas, logits
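# Hedged usage sketch (not from the original source): `all_vocab` above is a
# persistable global variable created with value 0, so it has to be filled once
# after the startup program runs; the parallel `net` variant later in this file
# does the equivalent with layers.assign. `place` is an assumption here.
def init_all_vocab(place, items_num):
    exe = fluid.Executor(place)
    exe.run(fluid.default_startup_program())
    all_vocab_t = fluid.global_scope().var("all_vocab").get_tensor()
    all_vocab_t.set(np.arange(1, items_num).astype("int64"), place)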
def knowledge_seq2seq(config):
    """ knowledge seq2seq """
    emb_size = config.embed_size
    hidden_size = config.hidden_size
    input_size = emb_size
    num_layers = config.num_layers
    bi_direc = config.bidirectional
    batch_size = config.batch_size
    vocab_size = config.vocab_size
    run_type = config.run_type

    enc_input = layers.data(name="enc_input",
                            shape=[1],
                            dtype='int64',
                            lod_level=1)  #enc_input --> goal
    enc_mask = layers.data(name="enc_mask", shape=[-1, 1], dtype='float32')
    goal_input = layers.data(name="goal_input",
                             shape=[1],
                             dtype='int64',
                             lod_level=1)  #goal_input --> x
    cue_input = layers.data(name="cue_input",
                            shape=[1],
                            dtype='int64',
                            lod_level=1)  #cue_input --> kg
    #cue_mask = layers.data(name='cue_mask', shape=[-1, 1], dtype='float32')
    memory_mask = layers.data(name='memory_mask',
                              shape=[-1, 1],
                              dtype='float32')
    tar_input = layers.data(name='tar_input',
                            shape=[1],
                            dtype='int64',
                            lod_level=1)  #tar_input --> y
    # tar_mask = layers.data(name="tar_mask", shape=[-1, 1], dtype='float32')

    rnn_hidden_size = hidden_size
    if bi_direc:
        rnn_hidden_size //= 2

    enc_out, enc_last_hidden = \
        rnn_encoder(enc_input, vocab_size, input_size, rnn_hidden_size, batch_size, num_layers, bi_direc,
                    dropout=0.0, batch_first=True, name="rnn_enc")
    goal_out, goal_last_hidden = \
        rnn_encoder(goal_input, vocab_size, input_size, rnn_hidden_size, batch_size, num_layers, bi_direc,
                    dropout=0.0, batch_first=True, name="rnn_enc1")
    context_goal_out = fluid.layers.concat(
        input=[enc_last_hidden, goal_last_hidden], axis=2)
    context_goal_out = layers.reshape(context_goal_out,
                                      shape=[-1, 1, rnn_hidden_size * 4])
    # context_goal_out = layers.squeeze(context_goal_out, axes=[1])
    context_goal_out = fluid.layers.fc(context_goal_out,
                                       size=rnn_hidden_size * 2,
                                       bias_attr=False)
    context_goal_out = layers.unsqueeze(context_goal_out, axes=[0])
    bridge_out = fc(context_goal_out, hidden_size, hidden_size, name="bridge")
    bridge_out = layers.tanh(bridge_out)
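    # the "bridge" projects the concatenated [context; goal] encoding into the
    # decoder's initial hidden state (used as dec_init_hidden below)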

    cue_last_mask = layers.data(name='cue_last_mask',
                                shape=[-1],
                                dtype='float32')
    knowledge_out, knowledge_last_hidden = \
        rnn_encoder(cue_input, vocab_size, input_size, rnn_hidden_size, batch_size, num_layers, bi_direc,
                    dropout=0.0, batch_first=True, last_mask=cue_last_mask, name="knowledge_enc")

    query = layers.slice(bridge_out, axes=[0], starts=[0], ends=[1])
    query = layers.squeeze(query, axes=[0])
    query = layers.unsqueeze(query, axes=[1])
    query = layers.reshape(query, shape=[batch_size, -1, hidden_size])
    cue_memory = layers.slice(knowledge_last_hidden,
                              axes=[0],
                              starts=[0],
                              ends=[1])
    cue_memory = layers.reshape(cue_memory,
                                shape=[batch_size, -1, hidden_size])
    memory_mask = layers.reshape(memory_mask, shape=[batch_size, 1, -1])

    weighted_cue, cue_att = dot_attention(query, cue_memory, mask=memory_mask)

    cue_att = layers.reshape(cue_att, shape=[batch_size, -1])

    knowledge = weighted_cue
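    # prior knowledge selection; when use_posterior is set (training), it is
    # replaced below by attention computed from the target response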
    if config.use_posterior:
        print("config.use_posterior", config.use_posterior)
        target_out, target_last_hidden = \
            rnn_encoder(tar_input, vocab_size, input_size, rnn_hidden_size, batch_size, num_layers, bi_direc,
                        dropout=0.0, batch_first=True, name="knowledge_enc1")
        target_goal_out = fluid.layers.concat(
            input=[target_last_hidden, goal_last_hidden], axis=2)
        target_goal_out = layers.reshape(target_goal_out,
                                         shape=[-1, 1, rnn_hidden_size * 4])
        # target_goal_out = layers.squeeze(target_goal_out, axes=[1])
        target_goal_out = fluid.layers.fc(target_goal_out,
                                          size=rnn_hidden_size * 2,
                                          bias_attr=False)
        target_goal_out = layers.unsqueeze(target_goal_out, axes=[0])

        # get posterior attention over the knowledge memory
        # target_query = layers.slice(target_last_hidden, axes=[0], starts=[0], ends=[1])
        target_query = layers.slice(target_goal_out,
                                    axes=[0],
                                    starts=[0],
                                    ends=[1])
        target_query = layers.squeeze(target_query, axes=[0])
        target_query = layers.unsqueeze(target_query, axes=[1])
        target_query = layers.reshape(target_query,
                                      shape=[batch_size, -1, hidden_size])

        weight_target, target_att = dot_attention(target_query,
                                                  cue_memory,
                                                  mask=memory_mask)
        target_att = layers.reshape(target_att, shape=[batch_size, -1])
        # add to output
        knowledge = weight_target

    enc_memory_mask = layers.data(name="enc_memory_mask",
                                  shape=[-1, 1],
                                  dtype='float32')
    enc_memory_mask = layers.unsqueeze(enc_memory_mask, axes=[1])
    # decoder init_hidden, enc_memory, enc_mask
    dec_init_hidden = bridge_out
    pad_value = fluid.layers.assign(input=np.array([0.0], dtype='float32'))

    enc_memory, orig_len_1 = layers.sequence_pad(x=enc_out,
                                                 pad_value=pad_value)
    enc_memory.persistable = True

    gru_unit = GRU_unit(input_size + hidden_size,
                        hidden_size,
                        num_layers=num_layers,
                        dropout=0.0,
                        name="decoder_gru_unit")

    cue_gru_unit = GRU_unit(hidden_size + hidden_size,
                            hidden_size,
                            num_layers=num_layers,
                            dropout=0.0,
                            name="decoder_cue_gru_unit")

    tgt_vocab_size = config.vocab_size
    if run_type == "train":
        if config.use_bow:
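            # bag-of-words auxiliary loss: predict every target token directly
            # from the selected knowledge representation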
            bow_logits = fc(knowledge,
                            hidden_size,
                            hidden_size,
                            name='bow_fc_1')
            bow_logits = layers.tanh(bow_logits)
            bow_logits = fc(bow_logits,
                            hidden_size,
                            tgt_vocab_size,
                            name='bow_fc_2')
            bow_logits = layers.softmax(bow_logits)

            bow_label = layers.data(name='bow_label',
                                    shape=[-1, config.max_len],
                                    dtype='int64')
            bow_mask = layers.data(name="bow_mask",
                                   shape=[-1, config.max_len],
                                   dtype='float32')

            bow_logits = layers.expand(bow_logits, [1, config.max_len, 1])
            bow_logits = layers.reshape(bow_logits, shape=[-1, tgt_vocab_size])
            bow_label = layers.reshape(bow_label, shape=[-1, 1])
            bow_loss = layers.cross_entropy(bow_logits,
                                            bow_label,
                                            soft_label=False)
            bow_loss = layers.reshape(bow_loss, shape=[-1, config.max_len])

            bow_loss *= bow_mask
            bow_loss = layers.reduce_sum(bow_loss, dim=[1])
            bow_loss = layers.reduce_mean(bow_loss)

        dec_input = layers.data(name="dec_input",
                                shape=[-1, 1, 1],
                                dtype='int64')
        dec_mask = layers.data(name="dec_mask", shape=[-1, 1], dtype='float32')

        # note: weight_target and target_query below exist only when
        # config.use_posterior is enabled
        dec_knowledge = weight_target

        knowledge_goal_out = fluid.layers.concat(
            input=[dec_knowledge, target_query], axis=2)
        knowledge_goal_out = layers.reshape(knowledge_goal_out,
                                            shape=[-1, 1, rnn_hidden_size * 4])
        # knowledge_goal_out = layers.squeeze(knowledge_goal_out, axes=[1])
        knowledge_goal_out = fluid.layers.fc(knowledge_goal_out,
                                             size=rnn_hidden_size * 2,
                                             bias_attr=False)
        knowledge_goal_out = layers.unsqueeze(knowledge_goal_out, axes=[0])

        decoder_logits = \
            rnn_decoder(gru_unit, cue_gru_unit, dec_input, input_size, hidden_size, num_layers,
                         enc_memory, enc_memory_mask, dec_knowledge, vocab_size,
                         init_hidden=dec_init_hidden, mask=dec_mask, dropout=config.dropout)

        target_label = layers.data(name='target_label',
                                   shape=[-1, 1],
                                   dtype='int64')
        target_mask = layers.data(name='target_mask',
                                  shape=[-1, 1],
                                  dtype='float32')

        decoder_logits = layers.reshape(decoder_logits,
                                        shape=[-1, tgt_vocab_size])
        target_label = layers.reshape(target_label, shape=[-1, 1])

        nll_loss = layers.cross_entropy(decoder_logits,
                                        target_label,
                                        soft_label=False)
        nll_loss = layers.reshape(nll_loss, shape=[batch_size, -1])
        nll_loss *= target_mask
        nll_loss = layers.reduce_sum(nll_loss, dim=[1])
        nll_loss = layers.reduce_mean(nll_loss)

        # KL-style regularizer between the posterior and prior knowledge
        # attentions, with gradients blocked through the posterior
        prior_attn = cue_att + 1e-10
        posterior_att = target_att
        posterior_att.stop_gradient = True

        prior_attn = layers.log(prior_attn)

        kl_loss = posterior_att * (layers.log(posterior_att + 1e-10) -
                                   prior_attn)
        kl_loss = layers.reduce_mean(kl_loss)

        kl_and_nll_factor = layers.data(name='kl_and_nll_factor',
                                        shape=[1],
                                        dtype='float32')
        kl_and_nll_factor = layers.reshape(kl_and_nll_factor, shape=[-1])

        # assumes use_bow and use_posterior are both enabled during training
        final_loss = bow_loss + kl_loss * kl_and_nll_factor + nll_loss * kl_and_nll_factor

        return [bow_loss, kl_loss, nll_loss, final_loss]

    elif run_type == "test":
        beam_size = config.beam_size
        batch_size = config.batch_size
        token = layers.fill_constant(shape=[batch_size * beam_size, 1],
                                     value=config.bos_id,
                                     dtype='int64')

        token = layers.reshape(token, shape=[-1, 1])
        max_decode_len = config.max_dec_len

        dec_knowledge = knowledge
        INF = 100000000.0

        # only the first beam of each batch starts at score 0; the rest get
        # -INF, so the first decoding step expands one hypothesis per source
        init_score_np = np.ones([beam_size * batch_size],
                                dtype='float32') * -INF

        for i in range(batch_size):
            init_score_np[i * beam_size] = 0.0

        pre_score = layers.assign(init_score_np)

        pos_index_np = np.arange(batch_size).reshape(-1, 1)
        pos_index_np = \
            np.tile(pos_index_np, (1, beam_size)).reshape(-1).astype('int32') * beam_size

        pos_index = layers.assign(pos_index_np)

        id_array = []
        score_array = []
        index_array = []
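        # tile encoder memory, mask, knowledge and the decoder init state
        # beam_size times so that each beam decodes independently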
        init_enc_memory = layers.expand(enc_memory, [1, beam_size, 1])
        init_enc_memory = layers.reshape(
            init_enc_memory, shape=[batch_size * beam_size, -1, hidden_size])
        init_enc_mask = layers.expand(enc_memory_mask, [1, beam_size, 1])
        init_enc_mask = layers.reshape(init_enc_mask,
                                       shape=[batch_size * beam_size, 1, -1])

        dec_knowledge = layers.reshape(dec_knowledge,
                                       shape=[-1, 1, hidden_size])
        init_dec_knowledge = layers.expand(dec_knowledge, [1, beam_size, 1])
        init_dec_knowledge = layers.reshape(
            init_dec_knowledge,
            shape=[batch_size * beam_size, -1, hidden_size])

        dec_init_hidden = layers.expand(dec_init_hidden, [1, 1, beam_size])
        dec_init_hidden = layers.reshape(dec_init_hidden,
                                         shape=[1, -1, hidden_size])

        length_average = config.length_average
        UNK = config.unk_id
        EOS = config.eos_id
        for i in range(1, max_decode_len + 1):
            dec_emb = get_embedding(token, input_size, vocab_size)
            dec_out, dec_last_hidden = \
                decoder_step(gru_unit, cue_gru_unit,
                             dec_emb, dec_init_hidden, input_size, hidden_size,
                             init_enc_memory, init_enc_mask, init_dec_knowledge, mask=None)
            output_in_size = hidden_size + hidden_size

            rnnout = layers.dropout(dec_out,
                                    dropout_prob=config.dropout,
                                    is_test=True)
            rnnout = fc(rnnout,
                        output_in_size,
                        hidden_size,
                        name='dec_out_fc1')
            rnnout = fc(rnnout, hidden_size, vocab_size, name='dec_out_fc2')

            log_softmax_output = log_softmax(rnnout)
            log_softmax_output = layers.squeeze(log_softmax_output, axes=[1])

            if i > 1:
                if length_average:
                    log_softmax_output = layers.elementwise_add(
                        (log_softmax_output / i),
                        (pre_score * (1.0 - 1.0 / i)),
                        axis=0)
                else:
                    log_softmax_output = layers.elementwise_add(
                        log_softmax_output, pre_score, axis=0)
            else:
                log_softmax_output = layers.elementwise_add(log_softmax_output,
                                                            pre_score,
                                                            axis=0)

            log_softmax_output = layers.reshape(log_softmax_output,
                                                shape=[batch_size, -1])

            topk_score, topk_index = layers.topk(log_softmax_output,
                                                 k=beam_size)
            topk_score = layers.reshape(topk_score, shape=[-1])
            topk_index = layers.reshape(topk_index, shape=[-1])

            vocab_var = layers.fill_constant([1],
                                             dtype='int64',
                                             value=vocab_size)
            new_token = topk_index % vocab_var

            index = topk_index // vocab_var
            id_array.append(new_token)
            index_array.append(index)
            index = index + pos_index

            score_array.append(topk_score)

            eos_ids = layers.fill_constant([beam_size * batch_size],
                                           dtype='int64',
                                           value=EOS)
            unk_ids = layers.fill_constant([beam_size * batch_size],
                                           dtype='int64',
                                           value=UNK)
            eos_eq = layers.cast(layers.equal(new_token, eos_ids),
                                 dtype='float32')

            # penalize finished (EOS) beams so they are not expanded at the
            # next step; the scores already stored in score_array are unchanged
            topk_score += eos_eq * -100000000.0

            unk_eq = layers.cast(layers.equal(new_token, unk_ids),
                                 dtype='float32')
            topk_score += unk_eq * -100000000.0  # likewise suppress UNK

            # update
            token = new_token
            pre_score = topk_score
            token = layers.reshape(token, shape=[-1, 1])

            index = layers.cast(index, dtype='int32')
            dec_last_hidden = layers.squeeze(dec_last_hidden, axes=[0])
            dec_init_hidden = layers.gather(dec_last_hidden, index=index)
            dec_init_hidden = layers.unsqueeze(dec_init_hidden, axes=[0])
            init_enc_memory = layers.gather(init_enc_memory, index)
            init_enc_mask = layers.gather(init_enc_mask, index)
            init_dec_knowledge = layers.gather(init_dec_knowledge, index)

        final_score = layers.concat(score_array, axis=0)
        final_ids = layers.concat(id_array, axis=0)
        final_index = layers.concat(index_array, axis=0)

        final_score = layers.reshape(
            final_score, shape=[max_decode_len, beam_size * batch_size])
        final_ids = layers.reshape(
            final_ids, shape=[max_decode_len, beam_size * batch_size])
        final_index = layers.reshape(
            final_index, shape=[max_decode_len, beam_size * batch_size])

        return final_score, final_ids, final_index
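# Hedged post-processing sketch (not in the original source), assuming the
# three outputs above have been fetched as numpy arrays of shape
# [max_decode_len, beam_size * batch_size]. final_index stores the parent beam
# within its batch, so a hypothesis ending at flat position `pos` on step `t`
# is recovered by walking the parent pointers backwards:
def backtrack_hypothesis(final_ids, final_index, t, pos, beam_size):
    tokens = []
    while t >= 0:
        tokens.append(int(final_ids[t][pos]))
        pos = int(final_index[t][pos]) + (pos // beam_size) * beam_size
        t -= 1
    return tokens[::-1]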
Example #34
    def net(self, inputs, is_infer=False):
        if is_infer:
            bs = self.evaluate_batch_size
        else:
            bs = self.train_batch_size

        stdv = 1.0 / math.sqrt(self.hidden_size)
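
        # layout of `inputs` (inferred from the index usage below and from the
        # standalone `network` variant earlier in this file):
        #   inputs[0] items      [bs, uniq_max]          int64
        #   inputs[1] seq_index  [bs, seq_max, 2]        int32
        #   inputs[2] last_index [bs, 2]                 int32
        #   inputs[3] adj_in     [bs, seq_max, seq_max]  float32
        #   inputs[4] adj_out    [bs, seq_max, seq_max]  float32
        #   inputs[5] mask       [bs, seq_max, 1]        float32
        #   inputs[6] label      [bs, 1]                 int64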

        def embedding_layer(input,
                            table_name,
                            emb_dim,
                            initializer_instance=None):
            emb = fluid.embedding(
                input=input,
                size=[self.dict_size, emb_dim],
                param_attr=fluid.ParamAttr(
                    name=table_name, initializer=initializer_instance))
            return emb

        sparse_initializer = fluid.initializer.Uniform(low=-stdv, high=stdv)
        items_emb = embedding_layer(inputs[0], "emb", self.hidden_size,
                                    sparse_initializer)
        pre_state = items_emb
        for i in range(self.step):
            pre_state = layers.reshape(
                x=pre_state, shape=[bs, -1, self.hidden_size])
            state_in = layers.fc(
                input=pre_state,
                name="state_in",
                size=self.hidden_size,
                act=None,
                num_flatten_dims=2,
                param_attr=fluid.ParamAttr(
                    initializer=fluid.initializer.Uniform(
                        low=-stdv, high=stdv)),
                bias_attr=fluid.ParamAttr(
                    initializer=fluid.initializer.Uniform(
                        low=-stdv, high=stdv)))  # [batch_size, uniq_max, h]
            state_out = layers.fc(
                input=pre_state,
                name="state_out",
                size=self.hidden_size,
                act=None,
                num_flatten_dims=2,
                param_attr=fluid.ParamAttr(
                    initializer=fluid.initializer.Uniform(
                        low=-stdv, high=stdv)),
                bias_attr=fluid.ParamAttr(
                    initializer=fluid.initializer.Uniform(
                        low=-stdv, high=stdv)))  # [batch_size, uniq_max, h]

            state_adj_in = layers.matmul(inputs[3],
                                         state_in)  # [batch_size, uniq_max, h]
            state_adj_out = layers.matmul(
                inputs[4], state_out)  # [batch_size, uniq_max, h]

            gru_input = layers.concat([state_adj_in, state_adj_out], axis=2)

            gru_input = layers.reshape(
                x=gru_input, shape=[-1, self.hidden_size * 2])
            gru_fc = layers.fc(input=gru_input,
                               name="gru_fc",
                               size=3 * self.hidden_size,
                               bias_attr=False)
            pre_state, _, _ = fluid.layers.gru_unit(
                input=gru_fc,
                hidden=layers.reshape(
                    x=pre_state, shape=[-1, self.hidden_size]),
                size=3 * self.hidden_size)

        final_state = layers.reshape(
            pre_state, shape=[bs, -1, self.hidden_size])
        seq = layers.gather_nd(final_state, inputs[1])
        last = layers.gather_nd(final_state, inputs[2])

        seq_fc = layers.fc(
            input=seq,
            name="seq_fc",
            size=self.hidden_size,
            bias_attr=False,
            act=None,
            num_flatten_dims=2,
            param_attr=fluid.ParamAttr(initializer=fluid.initializer.Uniform(
                low=-stdv, high=stdv)))  # [batch_size, seq_max, h]
        last_fc = layers.fc(input=last,
                            name="last_fc",
                            size=self.hidden_size,
                            bias_attr=False,
                            act=None,
                            num_flatten_dims=1,
                            param_attr=fluid.ParamAttr(
                                initializer=fluid.initializer.Uniform(
                                    low=-stdv, high=stdv)))  # [batch_size, h]

        seq_fc_t = layers.transpose(
            seq_fc, perm=[1, 0, 2])  # [seq_max, batch_size, h]
        add = layers.elementwise_add(seq_fc_t,
                                     last_fc)  # [seq_max, batch_size, h]
        b = layers.create_parameter(
            shape=[self.hidden_size],
            dtype='float32',
            default_initializer=fluid.initializer.Constant(value=0.0))  # [h]
        add = layers.elementwise_add(add, b)  # [seq_max, batch_size, h]

        add_sigmoid = layers.sigmoid(add)  # [seq_max, batch_size, h]
        add_sigmoid = layers.transpose(
            add_sigmoid, perm=[1, 0, 2])  # [batch_size, seq_max, h]

        weight = layers.fc(
            input=add_sigmoid,
            name="weight_fc",
            size=1,
            act=None,
            num_flatten_dims=2,
            bias_attr=False,
            param_attr=fluid.ParamAttr(initializer=fluid.initializer.Uniform(
                low=-stdv, high=stdv)))  # [batch_size, seq_max, 1]
        weight *= inputs[5]
        weight_mask = layers.elementwise_mul(
            seq, weight, axis=0)  # [batch_size, seq_max, h]
        global_attention = layers.reduce_sum(
            weight_mask, dim=1)  # [batch_size, h]

        final_attention = layers.concat(
            [global_attention, last], axis=1)  # [batch_size, 2*h]
        final_attention_fc = layers.fc(
            input=final_attention,
            name="final_attention_fc",
            size=self.hidden_size,
            bias_attr=False,
            act=None,
            param_attr=fluid.ParamAttr(initializer=fluid.initializer.Uniform(
                low=-stdv, high=stdv)))  # [batch_size, h]

        # all_vocab = layers.create_global_var(
        #     shape=[items_num - 1],
        #     value=0,
        #     dtype="int64",
        #     persistable=True,
        #     name="all_vocab")
        all_vocab = np.arange(1, self.dict_size).reshape((-1)).astype('int32')
        all_vocab = fluid.layers.cast(
            x=fluid.layers.assign(all_vocab), dtype='int64')

        all_emb = fluid.embedding(
            input=all_vocab,
            param_attr=fluid.ParamAttr(
                name="emb",
                initializer=fluid.initializer.Uniform(
                    low=-stdv, high=stdv)),
            size=[self.dict_size, self.hidden_size])  # [all_vocab, h]

        logits = layers.matmul(
            x=final_attention_fc, y=all_emb,
            transpose_y=True)  # [batch_size, all_vocab]
        softmax = layers.softmax_with_cross_entropy(
            logits=logits, label=inputs[6])  # [batch_size, 1]
        self.loss = layers.reduce_mean(softmax)  # [1]
        self.acc = layers.accuracy(input=logits, label=inputs[6], k=20)

        self._cost = self.loss
        if is_infer:
            self._infer_results['acc'] = self.acc
            self._infer_results['loss'] = self.loss
            return

        self._metrics["LOSS"] = self.loss
        self._metrics["train_acc"] = self.acc
Example #35
def gat_layer(gw,
              feature,
              edge_features,
              hidden_size,
              act,
              name,
              num_heads=1,
              feat_drop=0.1,
              attn_drop=0.1,
              is_test=False):
    """tbd"""
    def send_attention(src_feat, dst_feat, edge_feat):
        """tbd"""
        output = src_feat["left_a"] + dst_feat["right_a"]
        output = layers.leaky_relu(output, alpha=0.2)  # (num_edges, num_heads)
        return {"alpha": output, "h": src_feat["h"] + edge_feat["h"]}

    def reduce_attention(msg):
        """tbd"""
        alpha = msg["alpha"]  # lod-tensor (batch_size, seq_len, num_heads)
        h = msg["h"]
        alpha = paddle_helper.sequence_softmax(alpha)
        old_h = h
        h = layers.reshape(h, [-1, num_heads, hidden_size])
        alpha = layers.reshape(alpha, [-1, num_heads, 1])
        if attn_drop > 1e-15:
            alpha = layers.dropout(alpha,
                                   dropout_prob=attn_drop,
                                   is_test=is_test,
                                   dropout_implementation="upscale_in_train")
        h = h * alpha
        h = layers.reshape(h, [-1, num_heads * hidden_size])
        h = layers.lod_reset(h, old_h)
        return layers.sequence_pool(h, "sum")

    if feat_drop > 1e-15:
        feature = layers.dropout(feature,
                                 dropout_prob=feat_drop,
                                 is_test=is_test,
                                 dropout_implementation='upscale_in_train')

    ft = layers.fc(feature,
                   hidden_size * num_heads,
                   bias_attr=False,
                   param_attr=fluid.ParamAttr(name=name + '_weight'))
    left_a = layers.create_parameter(shape=[num_heads, hidden_size],
                                     dtype='float32',
                                     name=name + '_gat_l_A')
    right_a = layers.create_parameter(shape=[num_heads, hidden_size],
                                      dtype='float32',
                                      name=name + '_gat_r_A')
    reshape_ft = layers.reshape(ft, [-1, num_heads, hidden_size])
    left_a_value = layers.reduce_sum(reshape_ft * left_a, -1)
    right_a_value = layers.reduce_sum(reshape_ft * right_a, -1)

    msg = gw.send(send_attention,
                  nfeat_list=[("h", ft), ("left_a", left_a_value),
                              ("right_a", right_a_value)],
                  efeat_list=[("h", edge_features)])
    output = gw.recv(msg, reduce_attention)
    bias = layers.create_parameter(shape=[hidden_size * num_heads],
                                   dtype='float32',
                                   is_bias=True,
                                   name=name + '_bias')
    bias.stop_gradient = True  # note: the bias is excluded from gradient updates here
    output = layers.elementwise_add(output, bias, act=act)
    return output
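# Hedged call sketch: `gw` is assumed to be a pgl GraphWrapper whose node
# features include "feat" and whose edge features include "e_feat"; the feature
# names and sizes here are illustrative, not from the original source.
def build_gat_block(gw):
    return gat_layer(gw,
                     feature=gw.node_feat["feat"],
                     edge_features=gw.edge_feat["e_feat"],
                     hidden_size=64,
                     act="relu",
                     name="gat_0",
                     num_heads=4)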