Example #1
    def forward(self, q, k, v, lengths, speaker_embed, start_index, 
                force_monotonic=False, prev_coeffs=None, window=None):
        # add position encoding as an inductive bias 
        if self.has_bias: # multi-speaker model
            omega_q = 2 * F.sigmoid(
                F.squeeze(self.q_pos_affine(speaker_embed), axes=[-1]))
            omega_k = 2 * self.omega_initial * F.sigmoid(F.squeeze(
                self.k_pos_affine(speaker_embed), axes=[-1]))
        else: # single-speaker case
            batch_size = q.shape[0]
            omega_q = F.ones((batch_size, ), dtype="float32")
            omega_k = F.ones((batch_size, ), dtype="float32") * self.omega_default
        q += self.position_encoding_weight * positional_encoding(q, start_index, omega_q)
        k += self.position_encoding_weight * positional_encoding(k, 0, omega_k)

        q, k, v = self.q_affine(q), self.k_affine(k), self.v_affine(v)
        activations = F.matmul(q, k, transpose_y=True)
        activations /= np.sqrt(self.attention_dim)

        if self.training:
            # mask the <pad> parts from the encoder
            mask = F.sequence_mask(lengths, dtype="float32")
            attn_bias = F.scale(1. - mask, -1000)
            activations += F.unsqueeze(attn_bias, [1])
        elif force_monotonic:
            assert window is not None
            backward_step, forward_step = window
            T_enc = k.shape[1]
            batch_size, T_dec, _ = q.shape

            # actually T_dec = 1 here
            alpha = F.fill_constant((batch_size, T_dec), value=0, dtype="int64") \
                   if prev_coeffs is None \
                   else F.argmax(prev_coeffs, axis=-1)
            backward = F.sequence_mask(alpha - backward_step, maxlen=T_enc, dtype="bool")
            forward = F.sequence_mask(alpha + forward_step, maxlen=T_enc, dtype="bool")
            mask = F.cast(F.logical_xor(backward, forward), "float32")
            # print("mask's shape:", mask.shape)
            attn_bias = F.scale(1. - mask, -1000)
            activations += attn_bias

        # softmax
        coefficients = F.softmax(activations, axis=-1)
        # context vector
        coefficients = F.dropout(coefficients, 1. - self.keep_prob,
                                 dropout_implementation='upscale_in_train')
        contexts = F.matmul(coefficients, v)
        # context normalization
        enc_lengths = F.cast(F.unsqueeze(lengths, axes=[1, 2]), "float32")
        contexts *= F.sqrt(enc_lengths)
        # out affine
        contexts = self.out_affine(contexts)
        return contexts, coefficients
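The force_monotonic branch above carves out a sliding attention window by XOR-ing two sequence masks around the previous attention peak alpha. A tiny NumPy sketch of that mask construction (an illustration of the logic only, not the original Paddle code):

import numpy as np

def window_attn_bias(alpha, backward_step, forward_step, T_enc):
    positions = np.arange(T_enc)
    # like F.sequence_mask(alpha - backward_step): True strictly below alpha - backward_step
    backward = positions < (alpha - backward_step)
    # like F.sequence_mask(alpha + forward_step): True strictly below alpha + forward_step
    forward = positions < (alpha + forward_step)
    # XOR keeps exactly the band [alpha - backward_step, alpha + forward_step)
    mask = np.logical_xor(backward, forward).astype("float32")
    # outside the band: large negative additive bias applied before the softmax
    return (1.0 - mask) * -1000.0

print(window_attn_bias(alpha=4, backward_step=1, forward_step=3, T_enc=8))
# -1000 everywhere except positions 3..6, which stay 0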
Example #2
def model_func(inputs, is_train=True):
    # inputs = [src, src_sequence_length, trg, trg_sequence_length, label]
    # src = fluid.data(name="src", shape=[None, None], dtype="int64")
    # source-language input
    src = inputs[0]
    src_sequence_length = inputs[1]
    src_embedding = fluid.embedding(
        input=src,
        size=[source_dict_size, hidden_dim],
        dtype="float32",
        param_attr=fluid.ParamAttr(name="src_emb_table"))

    # encoder
    encoder_output, encoder_state = encoder(src_embedding, src_sequence_length)

    encoder_output_proj = layers.fc(input=encoder_output,
                                    size=decoder_size,
                                    num_flatten_dims=2,
                                    bias_attr=False)
    src_mask = layers.sequence_mask(src_sequence_length,
                                    maxlen=layers.shape(src)[1],
                                    dtype="float32")
    encoder_padding_mask = (src_mask - 1.0) * 1e9

    # target-language input: present during training, absent at inference/generation time
    trg = inputs[2] if is_train else None

    # decoder
    output = decoder(encoder_output=encoder_output,
                     encoder_output_proj=encoder_output_proj,
                     encoder_state=encoder_state,
                     encoder_padding_mask=encoder_padding_mask,
                     trg=trg,
                     is_train=is_train)
    return output
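Several examples on this page turn the 0/1 sequence mask into an additive bias with (src_mask - 1.0) * 1e9, which is later added to the attention scores so padded positions get (near) zero probability after the softmax. A minimal dygraph sketch of that step, assuming Paddle 2.x with paddle.nn.functional.sequence_mask:

import paddle
import paddle.nn.functional as F

src_sequence_length = paddle.to_tensor([3, 1], dtype="int64")
# 1.0 for real tokens, 0.0 for padding
src_mask = F.sequence_mask(src_sequence_length, maxlen=4, dtype="float32")
# 0.0 for real tokens, -1e9 for padding
encoder_padding_mask = (src_mask - 1.0) * 1e9
print(encoder_padding_mask.numpy())
# row 0: [0, 0, 0, -1e9]   row 1: [0, -1e9, -1e9, -1e9]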
Example #3
def model_func(inputs, is_train=True):
    src = inputs[0]
    src_sequence_length = inputs[1]
    # source embedding
    src_embeder = lambda x: fluid.embedding(
        input=x,
        size=[source_dict_size, hidden_dim],
        dtype="float32",
        param_attr=fluid.ParamAttr(name="src_emb_table"))
    src_embedding = src_embeder(src)

    # encoder
    encoder_output, encoder_state = encoder(src_embedding, src_sequence_length)

    encoder_output_proj = layers.fc(input=encoder_output,
                                    size=decoder_size,
                                    num_flatten_dims=2,
                                    bias_attr=False)
    src_mask = layers.sequence_mask(src_sequence_length,
                                    maxlen=layers.shape(src)[1],
                                    dtype="float32")
    encoder_padding_mask = (src_mask - 1.0) * 1e9

    trg = inputs[2] if is_train else None

    # decoder
    output = decoder(encoder_output=encoder_output,
                     encoder_output_proj=encoder_output_proj,
                     encoder_state=encoder_state,
                     encoder_padding_mask=encoder_padding_mask,
                     trg=trg,
                     is_train=is_train)
    return output
Example #4
    def test_with_input_lengths(self):
        mp = self.mp.clone()
        sp = self.sp
        rnn1 = self.rnn1
        rnn2 = self.rnn2
        exe = self.executor
        scope = self.scope

        x = np.random.randn(12, 4, 16)
        if not self.time_major:
            x = np.transpose(x, [1, 0, 2])
        sequence_length = np.array([12, 10, 9, 8], dtype=np.int64)

        y1, (h1, c1) = rnn1(x, sequence_length=sequence_length)

        with paddle.fluid.unique_name.guard():
            with paddle.static.program_guard(mp, sp):
                x_data = paddle.data(
                    "input", [-1, -1, 16],
                    dtype=paddle.framework.get_default_dtype())
                seq_len = paddle.data("seq_len", [-1], dtype="int64")
                mask = sequence_mask(seq_len, dtype=paddle.get_default_dtype())
                if self.time_major:
                    mask = paddle.transpose(mask, [1, 0])
                y, (h, c) = rnn2(x_data, sequence_length=seq_len)
                y = paddle.multiply(y, mask, axis=0)

        feed_dict = {x_data.name: x, seq_len.name: sequence_length}

        with paddle.static.scope_guard(scope):
            y2, h2, c2 = exe.run(mp, feed=feed_dict, fetch_list=[y, h, c])

        np.testing.assert_allclose(y1, y2, atol=1e-8, rtol=1e-5)
        np.testing.assert_allclose(h1, h2, atol=1e-8, rtol=1e-5)
        np.testing.assert_allclose(c1, c2, atol=1e-8, rtol=1e-5)
Example #5
    def forward(self, src, src_length):
        # encoding
        encoder_output, encoder_final_state = self.encoder(src, src_length)

        # decoder initial states
        decoder_initial_states = [
            encoder_final_state,
            self.decoder.lstm_attention.cell.get_initial_states(
                batch_ref=encoder_output, shape=[self.hidden_size])
        ]
        # attention mask to avoid paying attention to paddings
        src_mask = layers.sequence_mask(
            src_length,
            maxlen=layers.shape(src)[1],
            dtype=encoder_output.dtype)
        encoder_padding_mask = (src_mask - 1.0) * 1e9
        encoder_padding_mask = layers.unsqueeze(encoder_padding_mask, [1])

        # Tile the batch dimension with beam_size
        encoder_output = BeamSearchDecoder.tile_beam_merge_with_batch(
            encoder_output, self.beam_size)
        encoder_padding_mask = BeamSearchDecoder.tile_beam_merge_with_batch(
            encoder_padding_mask, self.beam_size)

        # dynamic decoding with beam search
        rs, _ = self.beam_search_decoder(
            inits=decoder_initial_states,
            encoder_output=encoder_output,
            encoder_padding_mask=encoder_padding_mask)
        return rs
Example #6
    def _build_decoder(self, enc_final_state, mode='train', beam_size=10):
        output_layer = lambda x: layers.fc(
            x,
            size=self.tar_vocab_size,
            num_flatten_dims=len(x.shape) - 1,
            param_attr=fluid.ParamAttr(
                name="output_w",
                initializer=fluid.initializer.UniformInitializer(
                    low=-self.init_scale, high=self.init_scale)),
            bias_attr=False)

        dec_cell = AttentionDecoderCell(self.num_layers, self.hidden_size,
                                        self.dropout, self.init_scale)
        dec_initial_states = [
            enc_final_state,
            dec_cell.get_initial_states(batch_ref=self.enc_output,
                                        shape=[self.hidden_size])
        ]
        max_src_seq_len = layers.shape(self.src)[1]
        src_mask = layers.sequence_mask(self.src_sequence_length,
                                        maxlen=max_src_seq_len,
                                        dtype='float32')
        enc_padding_mask = (src_mask - 1.0)
        if mode == 'train':
            dec_output, _ = rnn(cell=dec_cell,
                                inputs=self.tar_emb,
                                initial_states=dec_initial_states,
                                sequence_length=None,
                                enc_output=self.enc_output,
                                enc_padding_mask=enc_padding_mask)

            dec_output = output_layer(dec_output)

        elif mode == 'beam_search':
            output_layer = lambda x: layers.fc(
                x,
                size=self.tar_vocab_size,
                num_flatten_dims=len(x.shape) - 1,
                param_attr=fluid.ParamAttr(name="output_w"),
                bias_attr=False)
            beam_search_decoder = BeamSearchDecoder(
                dec_cell,
                self.beam_start_token,
                self.beam_end_token,
                beam_size,
                embedding_fn=self.tar_embeder,
                output_fn=output_layer)
            enc_output = beam_search_decoder.tile_beam_merge_with_batch(
                self.enc_output, beam_size)
            enc_padding_mask = beam_search_decoder.tile_beam_merge_with_batch(
                enc_padding_mask, beam_size)
            outputs, _ = dynamic_decode(beam_search_decoder,
                                        inits=dec_initial_states,
                                        max_step_num=self.beam_max_step_num,
                                        enc_output=enc_output,
                                        enc_padding_mask=enc_padding_mask)
            return outputs

        return dec_output
Example #7
def loss_func(logits, label, trg_sequence_length):
    probs = layers.softmax(logits)
    loss = layers.cross_entropy(input=probs, label=label)
    trg_mask = layers.sequence_mask(trg_sequence_length,
                                    maxlen=layers.shape(logits)[1],
                                    dtype="float32")
    avg_cost = layers.reduce_sum(loss * trg_mask) / layers.reduce_sum(trg_mask)
    return avg_cost
Example #8
def simple_rnn(rnn_input,
               init_hidden,
               hidden_size,
               kernel_param_attr=None,
               recurrent_param_attr=None,
               bias_attr=None,
               act='relu',
               sequence_length=None,
               name='simple_rnn'):

    # Transpose (sequence x batch x hidden)
    rnn_input = layers.transpose(rnn_input, [1, 0, 2])

    # Generate Mask
    mask = None
    if sequence_length:
        max_seq_len = layers.shape(rnn_input)[0]
        mask = layers.sequence_mask(sequence_length,
                                    maxlen=max_seq_len,
                                    dtype='float32')
        mask = layers.transpose(mask, [1, 0])

    # Init
    simple_rnn = SimpleRNN_unit(rnn_input, hidden_size, kernel_param_attr,
                                recurrent_param_attr, bias_attr, act)

    rnn = PaddingRNN()
    with rnn.step():
        step_in = rnn.step_input(rnn_input)

        if mask:
            step_mask = rnn.step_input(mask)

        if init_hidden:
            pre_hidden = rnn.memory(init=init_hidden)
        else:
            pre_hidden = rnn.memory(batch_ref=rnn_input,
                                    shape=[-1, hidden_size])

        last_hidden = simple_rnn(step_in, pre_hidden)

        rnn.update_memory(pre_hidden, last_hidden)

        rnn.step_output(last_hidden)

        step_input = last_hidden

    rnn_out = rnn()

    last_hidden = rnn_out[-1]
    last_hidden = layers.reshape(last_hidden, shape=[1, -1, hidden_size])

    rnn_output = layers.transpose(rnn_out, [1, 0, 2])
    last_hidden = layers.transpose(last_hidden, [1, 0, 2])

    return rnn_output, last_hidden
Example #9
    def learn(self, probs, label, weight=None, length=None):
        loss = layers.cross_entropy(input=probs, label=label, soft_label=False)
        max_seq_len = layers.shape(probs)[1]
        mask = layers.sequence_mask(length, maxlen=max_seq_len, dtype="float32")
        loss = loss * mask
        loss = layers.reduce_mean(loss, dim=[0])
        loss = layers.reduce_sum(loss)
        optimizer = fluid.optimizer.Adam(self.lr)
        optimizer.minimize(loss)
        return loss
Example #10
def loss_func(logits, label, trg_sequence_length):
    probs = layers.softmax(logits)
    # use the cross-entropy loss
    loss = layers.cross_entropy(input=probs, label=label)
    # build a mask from the target lengths and use it to exclude the loss computed on padding
    trg_mask = layers.sequence_mask(trg_sequence_length,
                                    maxlen=layers.shape(logits)[1],
                                    dtype="float32")
    avg_cost = layers.reduce_sum(loss * trg_mask) / layers.reduce_sum(trg_mask)
    return avg_cost
Example #11
def predict_test_util(place, mode):
    place = paddle.set_device(place)
    paddle.seed(123)
    np.random.seed(123)

    class Net(paddle.nn.Layer):
        def __init__(self):
            super(Net, self).__init__()
            self.rnn = getattr(paddle.nn, mode)(16,
                                                32,
                                                2,
                                                direction="bidirectional",
                                                dropout=0.1)

        def forward(self, input):
            return self.rnn(input)

    x = paddle.randn((4, 10, 16))
    x.stop_gradient = False
    seq_len = paddle.to_tensor(np.array([10, 6, 8, 5]))
    mask = sequence_mask(seq_len, maxlen=10, dtype=x.dtype)
    mask = paddle.unsqueeze(mask, [2])
    rnn = Net()
    y, _ = rnn(x)
    y = y * mask
    loss = paddle.mean(y)
    loss.backward()
    optimizer = paddle.optimizer.Adam(
        learning_rate=0.1, parameters=rnn.parameters())
    optimizer.step()
    rnn.eval()
    y, _ = rnn(x)
    # `jit.to_static` would include a train_program, eval mode might cause
    # some errors currently, such as dropout grad op gets `is_test == True`.
    rnn.train()

    rnn = paddle.jit.to_static(
        rnn, [paddle.static.InputSpec(
            shape=[None, None, 16], dtype=x.dtype)])
    paddle.jit.save(rnn, "./inference/%s_infer" % mode)

    paddle.enable_static()

    new_scope = paddle.static.Scope()
    with paddle.static.scope_guard(new_scope):
        exe = paddle.static.Executor(place)
        [inference_program, feed_target_names,
         fetch_targets] = paddle.static.load_inference_model(
             "./inference/%s_infer" % mode, exe)
        results = exe.run(inference_program,
                          feed={feed_target_names[0]: x.numpy()},
                          fetch_list=fetch_targets)
        np.testing.assert_equal(
            y.numpy(), results[0])  # eval results equal predict results
    paddle.disable_static()
Example #12
    def spec_loss(self, decoded, input, num_frames=None):
        if num_frames is None:
            l1_loss = F.reduce_mean(F.abs(decoded - input))
        else:
            # mask the <pad> part of the decoder
            num_channels = decoded.shape[-1]
            l1_loss = F.abs(decoded - input)
            mask = F.sequence_mask(num_frames, dtype="float32")
            l1_loss *= F.unsqueeze(mask, axes=[-1])
            l1_loss = F.reduce_sum(l1_loss) / F.scale(F.reduce_sum(mask), num_channels)
        return l1_loss
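Here the per-frame L1 error is zeroed on padded frames and normalised by the number of valid frames times the channel count. A toy NumPy sketch of that normalisation (illustrative values only):

import numpy as np

decoded = np.array([[[1.0, 1.0], [2.0, 2.0], [9.0, 9.0]]])  # [batch=1, T=3, channels=2]
target  = np.array([[[0.0, 2.0], [2.0, 1.0], [0.0, 0.0]]])
num_frames = np.array([2])                                  # the last frame is padding
mask = (np.arange(3)[None, :] < num_frames[:, None]).astype("float32")
l1 = np.abs(decoded - target) * mask[:, :, None]            # zero out the padded frame
l1_loss = l1.sum() / (mask.sum() * decoded.shape[-1])       # 3.0 / (2 * 2) = 0.75
print(l1_loss)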
Example #13
    def forward(self, outputs, labels):
        predict, (trg_length, label) = outputs[0], labels
        # for target padding mask
        mask = layers.sequence_mask(
            trg_length, maxlen=layers.shape(predict)[1], dtype=predict.dtype)

        cost = layers.softmax_with_cross_entropy(
            logits=predict, label=label, soft_label=False)
        masked_cost = layers.elementwise_mul(cost, mask, axis=0)
        batch_mean_cost = layers.reduce_mean(masked_cost, dim=[0])
        seq_cost = layers.reduce_sum(batch_mean_cost)
        return seq_cost
Example #14
    def _birnn_encoder(self, inputs, input_len, name_lens, name_pos,
                       name_tok_len):
        """forward

        Args:
            inputs (Variable): shape=[batch_size, max_seq_len, hidden_size]
            input_len (Variable): shape=[batch_size]
            name_lens (Variable): shape=[batch_size]
            name_pos (Variable): shape=[batch_size, max_name_len, max_tokens]
            name_tok_len (Variable): shape=[batch_size, max_name_len]

        Returns: TODO

        Raises: NULL

        """
        rnn_output, rnn_final_state = self._rnn_encoder.forward(
            inputs, input_len)

        max_name_len = name_pos.shape[1]
        name_begin = name_pos[:, :, 0]

        name_repr_mask = layers.sequence_mask(name_lens,
                                              max_name_len,
                                              dtype=name_tok_len.dtype)
        len_delta = layers.elementwise_mul(name_tok_len - 1,
                                           name_repr_mask,
                                           axis=0)
        name_end = name_begin + len_delta

        if self._bidirectional:
            name_fwd_repr_gathered = nn_utils.batch_gather_2d(
                rnn_output, name_end)[:, :, :self._hidden_size]
            name_bwd_repr_gathered = nn_utils.batch_gather_2d(
                rnn_output, name_begin)[:, :, self._hidden_size:]
            name_repr_gathered = layers.concat(
                input=[name_fwd_repr_gathered, name_bwd_repr_gathered],
                axis=-1)
            new_hidden_size = self._hidden_size * 2
        else:
            name_repr_gathered = layers.gather_nd(rnn_output, name_end)
            new_hidden_size = self._hidden_size

        name_repr_tmp = layers.reshape(
            name_repr_gathered, shape=[-1, max_name_len, new_hidden_size])
        name_repr_mask = layers.cast(name_repr_mask, dtype=name_repr_tmp.dtype)
        name_repr = layers.elementwise_mul(name_repr_tmp,
                                           name_repr_mask,
                                           axis=0)

        return name_repr, None
Example #15
def def_seq2seq_model(num_layers, hidden_size, dropout_prob, src_vocab_size,
                      trg_vocab_size):
    "vanilla seq2seq model"
    # data
    source = fluid.data(name="src", shape=[None, None], dtype="int64")
    source_length = fluid.data(name="src_sequence_length",
                               shape=[None],
                               dtype="int64")
    target = fluid.data(name="trg", shape=[None, None], dtype="int64")
    target_length = fluid.data(name="trg_sequence_length",
                               shape=[None],
                               dtype="int64")
    label = fluid.data(name="label", shape=[None, None, 1], dtype="int64")

    # embedding
    src_emb = fluid.embedding(source, (src_vocab_size, hidden_size))
    tar_emb = fluid.embedding(target, (src_vocab_size, hidden_size))

    # encoder
    enc_cell = EncoderCell(num_layers, hidden_size, dropout_prob)
    enc_output, enc_final_state = dynamic_rnn(cell=enc_cell,
                                              inputs=src_emb,
                                              sequence_length=source_length)

    # decoder
    dec_cell = DecoderCell(num_layers, hidden_size, dropout_prob)
    dec_output, dec_final_state = dynamic_rnn(cell=dec_cell,
                                              inputs=tar_emb,
                                              initial_states=enc_final_state)
    logits = layers.fc(dec_output,
                       size=trg_vocab_size,
                       num_flatten_dims=len(dec_output.shape) - 1,
                       bias_attr=False)

    # loss
    loss = layers.softmax_with_cross_entropy(logits=logits,
                                             label=label,
                                             soft_label=False)
    loss = layers.unsqueeze(loss, axes=[2])
    max_tar_seq_len = layers.shape(target)[1]
    tar_mask = layers.sequence_mask(target_length,
                                    maxlen=max_tar_seq_len,
                                    dtype="float32")
    loss = loss * tar_mask
    loss = layers.reduce_mean(loss, dim=[0])
    loss = layers.reduce_sum(loss)

    # optimizer
    optimizer = fluid.optimizer.Adam(0.001)
    optimizer.minimize(loss)
    return loss
Example #16
    def __call__(self, src, src_length, trg=None, trg_length=None):
        # encoder
        encoder_output, encoder_final_state = self.encoder(
            self.src_embeder(src), src_length)

        decoder_initial_states = [
            encoder_final_state,
            self.decoder.decoder_cell.get_initial_states(
                batch_ref=encoder_output, shape=[encoder_output.shape[-1]])
        ]
        src_mask = layers.sequence_mask(src_length,
                                        maxlen=layers.shape(src)[1],
                                        dtype="float32")
        encoder_padding_mask = (src_mask - 1.0) * 1e9
        encoder_padding_mask = layers.unsqueeze(encoder_padding_mask, [1])

        # decoder
        decoder_kwargs = {
            "inputs": self.trg_embeder(trg),
            "sequence_length": trg_length,
        } if self.decoder.decoding_strategy == "train_greedy" else (
            {
                "embedding_fn": self.trg_embeder,
                "beam_size": self.beam_size,
                "start_token": self.start_token,
                "end_token": self.end_token
            } if self.decoder.decoding_strategy == "beam_search" else {
                "embedding_fn":
                self.trg_embeder,
                "start_tokens":
                layers.fill_constant_batch_size_like(input=encoder_output,
                                                     shape=[-1],
                                                     dtype=src.dtype,
                                                     value=self.start_token),
                "end_token":
                self.end_token
            })
        decoder_kwargs["output_layer"] = self.output_layer

        (decoder_output, decoder_final_state,
         dec_seq_lengths) = self.decoder(decoder_initial_states,
                                         encoder_output, encoder_padding_mask,
                                         **decoder_kwargs)
        if self.decoder.decoding_strategy == "beam_search":  # for inference
            return decoder_output
        logits, samples, sample_length = (decoder_output.cell_outputs,
                                          decoder_output.sample_ids,
                                          dec_seq_lengths)
        probs = layers.softmax(logits)
        return probs, samples, sample_length
Example #17
    def _compute_loss(self, dec_output):
        loss = layers.softmax_with_cross_entropy(logits=dec_output,
                                                 label=self.label,
                                                 soft_label=False)
        loss = layers.unsqueeze(loss, axes=[2])

        max_tar_seq_len = layers.shape(self.tar)[1]
        tar_mask = layers.sequence_mask(self.tar_sequence_length,
                                        maxlen=max_tar_seq_len,
                                        dtype='float32')
        loss = loss * tar_mask
        loss = layers.reduce_mean(loss, dim=[0])
        loss = layers.reduce_sum(loss)
        return loss
Example #18
    def recv_func(msg):
        pad_value = L.assign(input=np.array([0.0], dtype=np.float32))

        output, length = L.sequence_pad(msg, pad_value, maxlen=max_neigh)
        mask = L.sequence_mask(length, dtype="float32", maxlen=max_neigh)
        mask = L.unsqueeze(mask, [2])
        input_mask = (L.matmul(mask, mask, transpose_y=True) - 1) * -10000
        for layer in range(num_layers):
            output = self_attention_and_residual(output,
                                                 hidden_size,
                                                 input_mask,
                                                 name="cross_feat_%s" % layer,
                                                 maxlen=max_neigh)
        return L.reduce_sum(output * mask, 1) / L.reduce_sum(mask, 1)
Example #19
def _select_table(condition,
                  inputs,
                  table_enc,
                  table_len,
                  table_mask_by_col,
                  ptr_net,
                  grammar,
                  name=None):
    """select_table.

    Args:
        condition (TYPE): NULL
        inputs (Variable): shape = [batch_size, max_len, hidden_size]. At inference time max_len is always 1
        table_enc (TYPE): NULL
        table_len (TYPE): NULL
        ptr_net (TYPE): NULL
        grammar (TYPE): NULL
        name (str):
        table_mask_by_col (Variable):

    Returns: TODO

    Raises: NULL
    """
    condition = layers.cast(condition, dtype='float32')

    table_mask_by_len = layers.sequence_mask(table_len,
                                             maxlen=grammar.MAX_TABLE,
                                             dtype='float32')
    table_mask_by_len = layers.reshape(table_mask_by_len,
                                       [-1, grammar.MAX_TABLE])
    table_mask_by_col = layers.reshape(table_mask_by_col,
                                       [-1, grammar.MAX_TABLE])
    table_mask = layers.elementwise_mul(table_mask_by_len, table_mask_by_col)
    predicts = ptr_net.forward(inputs, table_enc, table_mask)

    zeros_l = tensor.fill_constant_batch_size_like(
        predicts,
        shape=[-1, grammar.grammar_size],
        dtype='float32',
        value=-INF)
    zeros_r = tensor.fill_constant_batch_size_like(
        predicts,
        shape=[-1, grammar.MAX_COLUMN + grammar.MAX_VALUE],
        dtype='float32',
        value=-INF)
    final_output = tensor.concat([zeros_l, predicts, zeros_r], axis=-1)
    true_final_output = layers.elementwise_mul(final_output, condition, axis=0)
    return true_final_output
Example #20
def _select_column(condition,
                   inputs,
                   column_enc,
                   column_len,
                   ptr_net,
                   grammar,
                   column2table_mask,
                   name=None):
    """select_column.

    Args:
        condition (TYPE): NULL
        inputs (Variable): shape = [batch_size, max_len, hidden_size]. At inference time max_len is always 1
        column_enc (TYPE): NULL
        column_len (TYPE): NULL
        ptr_net (TYPE): NULL
        grammar (TYPE): NULL
        column2table_mask (Variable):
        name (str):

    Returns: TODO

    Raises: NULL
    """
    condition = layers.cast(condition, dtype='float32')

    column_mask = layers.sequence_mask(column_len,
                                       maxlen=grammar.MAX_COLUMN,
                                       dtype='float32')
    column_mask = layers.reshape(column_mask, [-1, grammar.MAX_COLUMN])
    predicts = ptr_net.forward(inputs, column_enc, column_mask)

    pred_ids = layers.argmax(predicts, axis=-1)
    valid_table_mask = nn_utils.batch_gather(column2table_mask, pred_ids)

    ## concat zeros to vocab size
    zeros_l = tensor.fill_constant_batch_size_like(
        predicts,
        shape=[-1, grammar.grammar_size + grammar.MAX_TABLE],
        dtype='float32',
        value=-INF)
    zeros_r = tensor.fill_constant_batch_size_like(
        predicts, shape=[-1, grammar.MAX_VALUE], dtype='float32', value=-INF)
    final_output = tensor.concat([zeros_l, predicts, zeros_r], axis=-1)
    true_final_output = layers.elementwise_mul(final_output, condition, axis=0)
    true_valid_table_mask = layers.elementwise_mul(valid_table_mask,
                                                   condition,
                                                   axis=0)
    return true_final_output, true_valid_table_mask
Example #21
    def _compute_loss(self, dec_output):
        loss = layers.softmax_with_cross_entropy(logits=dec_output,
                                                 label=self.label,
                                                 soft_label=False)

        loss = layers.reshape(loss, shape=[self.batch_size, -1])

        max_tar_seq_len = layers.shape(self.tar)[1]
        tar_mask = layers.sequence_mask(self.tar_sequence_length,
                                        maxlen=max_tar_seq_len,
                                        dtype='float32')
        loss = loss * tar_mask
        loss = layers.reduce_mean(loss, dim=[0])
        loss = layers.reduce_sum(loss)

        loss.permissions = True

        return loss
Example #22
    def test_with_input_lengths(self):
        rnn1 = self.rnn1
        rnn2 = self.rnn2

        x = np.random.randn(12, 4, 16)
        if not self.time_major:
            x = np.transpose(x, [1, 0, 2])
        sequence_length = np.array([12, 10, 9, 8], dtype=np.int64)

        y1, h1 = rnn1(x, sequence_length=sequence_length)

        seq_len = paddle.to_variable(sequence_length)
        mask = sequence_mask(seq_len, dtype=paddle.get_default_dtype())
        if self.time_major:
            mask = paddle.transpose(mask, [1, 0])
        y2, h2 = rnn2(paddle.to_variable(x), sequence_length=seq_len)
        y2 = paddle.multiply(y2, mask, axis=0)

        np.testing.assert_allclose(y1, y2.numpy(), atol=1e-8, rtol=1e-5)
        np.testing.assert_allclose(h1, h2.numpy(), atol=1e-8, rtol=1e-5)
Example #23
    def _compute_loss(self, mean, logvars, dec_output):

        kl_loss = self._kl_dvg(mean, logvars)

        rec_loss = layers.softmax_with_cross_entropy(logits=dec_output,
                                                     label=self.label,
                                                     soft_label=False)

        rec_loss = layers.reshape(rec_loss, shape=[self.batch_size, -1])

        max_tar_seq_len = layers.shape(self.tar)[1]
        tar_mask = layers.sequence_mask(self.tar_sequence_length,
                                        maxlen=max_tar_seq_len,
                                        dtype='float32')
        rec_loss = rec_loss * tar_mask
        rec_loss = layers.reduce_mean(rec_loss, dim=[0])
        rec_loss = layers.reduce_sum(rec_loss)

        loss = kl_loss * self.kl_weight + rec_loss

        return loss, kl_loss, rec_loss
Example #24
def _select_value(condition,
                  inputs,
                  value_enc,
                  value_len,
                  ptr_net,
                  grammar,
                  name=None):
    """select_value.

    Args:
        condition (TYPE): NULL
        inputs (TYPE): NULL
        value_enc (TYPE): NULL
        value_len (TYPE): NULL
        ptr_net (TYPE): NULL
        grammar (TYPE): NULL

    Returns: TODO

    Raises: NULL
    """
    condition = layers.cast(condition, dtype='float32')

    value_mask = layers.sequence_mask(value_len,
                                      maxlen=grammar.MAX_VALUE,
                                      dtype='float32')
    value_mask = layers.reshape(value_mask, [-1, grammar.MAX_VALUE])
    predicts = ptr_net.forward(inputs, value_enc, value_mask)

    ## concat zeros to vocab size
    zeros_l = tensor.fill_constant_batch_size_like(
        predicts,
        shape=[
            -1, grammar.grammar_size + grammar.MAX_TABLE + grammar.MAX_COLUMN
        ],
        dtype='float32',
        value=-INF)
    final_output = tensor.concat([zeros_l, predicts], axis=-1)
    true_final_output = layers.elementwise_mul(final_output, condition, axis=0)
    return true_final_output
Example #25
    def test_with_input_lengths(self):
        rnn1 = self.rnn1
        rnn2 = self.rnn2

        x = np.random.randn(12, 4, 16)
        if not self.time_major:
            x = np.transpose(x, [1, 0, 2])
        sequence_length = np.array([12, 10, 9, 8], dtype=np.int64)

        y1, (fw_h1, bw_h1) = rnn1(x, sequence_length=sequence_length)

        seq_len = paddle.to_tensor(sequence_length)
        mask = sequence_mask(seq_len, dtype=paddle.get_default_dtype())
        if self.time_major:
            mask = paddle.transpose(mask, [1, 0])
        y2, (fw_h2, bw_h2) = rnn2(paddle.to_tensor(x), sequence_length=seq_len)
        mask = paddle.unsqueeze(mask, -1)
        y2 = paddle.multiply(y2, mask)

        np.testing.assert_allclose(y1, y2.numpy(), atol=1e-8, rtol=1e-5)
        np.testing.assert_allclose(fw_h1, fw_h2.numpy(), atol=1e-8, rtol=1e-5)
        np.testing.assert_allclose(bw_h1, bw_h2.numpy(), atol=1e-8, rtol=1e-5)
Example #26
    def forward(self, src, src_length, trg):
        # encoder
        encoder_output, encoder_final_state = self.encoder(src, src_length)

        # decoder initial states: use input_feed and the structure is
        # [[h,c] * num_layers, input_feed], consistent with DecoderCell.states
        decoder_initial_states = [
            encoder_final_state,
            self.decoder.lstm_attention.cell.get_initial_states(
                batch_ref=encoder_output, shape=[self.hidden_size])
        ]
        # attention mask to avoid paying attention to paddings
        src_mask = layers.sequence_mask(src_length,
                                        maxlen=layers.shape(src)[1],
                                        dtype=encoder_output.dtype)
        encoder_padding_mask = (src_mask - 1.0) * 1e9
        encoder_padding_mask = layers.unsqueeze(encoder_padding_mask, [1])

        # decoder with attention
        predict = self.decoder(trg, decoder_initial_states, encoder_output,
                               encoder_padding_mask)
        return predict
Example #27
def gen_mask(valid_lengths, max_len, dtype="float32"):
    """
    Generate a mask tensor from valid lengths. Note that it returns a *reverse*
    mask: indices within the valid lengths correspond to 0, and those within
    the padding area correspond to 1.

    Assuming valid_lengths = [2, 5, 7] and max_len = 7, the generated mask is
    [[0, 0, 1, 1, 1, 1, 1],
     [0, 0, 0, 0, 0, 1, 1],
     [0, 0, 0, 0, 0, 0, 0]].

    Args:
        valid_lengths (Variable): shape(B, ), dtype: int64. A rank-1 Tensor containing the valid lengths (timesteps) of each example, where B is the batch size.
        max_len (int): The length (number of time steps) of the mask.
        dtype (str, optional): A string that specifies the data type of the returned mask. Defaults to 'float32'.

    Returns:
        mask (Variable): shape(B, max_len), of the specified dtype; a mask computed from the valid lengths.
    """
    mask = F.sequence_mask(valid_lengths, maxlen=max_len, dtype=dtype)
    mask = 1 - mask
    return mask
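A quick dygraph check of the same reverse-mask convention, written against the Paddle 2.x functional API (assumed available here) instead of the fluid alias F used above:

import paddle
import paddle.nn.functional as F

valid_lengths = paddle.to_tensor([2, 5, 7], dtype="int64")
# 1 - sequence_mask flips the convention: 0 inside the valid length, 1 on padding
mask = 1 - F.sequence_mask(valid_lengths, maxlen=7, dtype="float32")
print(mask.numpy())
# [[0. 0. 1. 1. 1. 1. 1.]
#  [0. 0. 0. 0. 0. 1. 1.]
#  [0. 0. 0. 0. 0. 0. 0.]]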
Example #28
    def _simple_sum_encoder(self, inputs, input_len, name_lens, name_pos,
                            name_tok_len):
        """forward

        Args:
            inputs (Variable): shape=[batch_size, max_seq_len, hidden_size]
            input_len (Variable): shape=[batch_size]
            name_lens (Variable): shape=[batch_size]
            name_pos (Variable): shape=[batch_size, max_name_len, max_tokens]
            name_tok_len (Variable): shape=[batch_size, max_name_len]

        Returns: TODO

        Raises: NULL

        """
        max_name_len = name_pos.shape[1]
        max_name_tok_len = name_pos.shape[2]
        hidden_size = inputs.shape[2]

        name_pos_1d = layers.reshape(
            name_pos, shape=[-1, max_name_len * max_name_tok_len])
        name_enc = nn_utils.batch_gather_2d(inputs, name_pos_1d)
        name_enc = layers.reshape(
            name_enc, shape=[-1, max_name_len, max_name_tok_len, hidden_size])

        # shape = [batch_size, name_len, token_len, hidden_size]
        name_tok_mask = layers.sequence_mask(name_tok_len,
                                             maxlen=max_name_tok_len,
                                             dtype=name_enc.dtype)
        name_enc_masked = layers.elementwise_mul(name_enc,
                                                 name_tok_mask,
                                                 axis=0)
        # shape = [batch_size, name_len, hidden_size]
        output = layers.reduce_sum(name_enc_masked, dim=2)
        return output, None
Example #29
def basic_lstm(input,
               init_hidden,
               init_cell,
               hidden_size,
               num_layers=1,
               sequence_length=None,
               dropout_prob=0.0,
               bidirectional=False,
               batch_first=True,
               param_attr=None,
               bias_attr=None,
               gate_activation=None,
               activation=None,
               forget_bias=1.0,
               dtype='float32',
               name='basic_lstm'):
    """
    LSTM implementation using basic operators, supports multiple layers and bidirectional LSTM.

    .. math::
           i_t &= \sigma(W_{ix}x_{t} + W_{ih}h_{t-1} + b_i)

           f_t &= \sigma(W_{fx}x_{t} + W_{fh}h_{t-1} + b_f + forget_bias )

           o_t &= \sigma(W_{ox}x_{t} + W_{oh}h_{t-1} + b_o)

           \\tilde{c_t} &= tanh(W_{cx}x_t + W_{ch}h_{t-1} + b_c)

           c_t &= f_t \odot c_{t-1} + i_t \odot \\tilde{c_t}

           h_t &= o_t \odot tanh(c_t)

    Args:
        input (Variable): lstm input tensor,
                       if batch_first = False, shape should be ( seq_len x batch_size x input_size )
                       if batch_first = True, shape should be ( batch_size x seq_len x input_size )
        init_hidden(Variable|None): The initial hidden state of the LSTM
                       This is a tensor with shape ( num_layers x batch_size x hidden_size)
                       if is_bidirec = True, shape should be ( num_layers*2 x batch_size x hidden_size)
                       and can be reshaped to a tensor with shape ( num_layers x 2 x batch_size x hidden_size) to use.
                       If it's None, it will be set to all 0.
        init_cell(Variable|None): The initial cell state of the LSTM
                       This is a tensor with shape ( num_layers x batch_size x hidden_size)
                       if is_bidirec = True, shape should be ( num_layers*2 x batch_size x hidden_size)
                       and can be reshaped to a tensor with shape ( num_layers x 2 x batch_size x hidden_size) to use.
                       If it's None, it will be set to all 0.
        hidden_size (int): Hidden size of the LSTM
        num_layers (int): The total number of layers of the LSTM
        sequence_length (Variable|None): A tensor (shape [batch_size]) that stores the real length of each instance.
                        This tensor will be converted to a mask to mask out the padding ids.
                        If it's None, there are no padding ids.
        dropout_prob(float|0.0): Dropout prob, dropout ONLY works after the rnn output of each layer,
                             NOT between time steps
        bidirectional (bool|False): If it is bidirectional
        batch_first (bool|True): The shape format of the input and output tensors. If true,
            the shape format should be :attr:`[batch_size, seq_len, hidden_size]`. If false,
            the shape format should be :attr:`[seq_len, batch_size, hidden_size]`. By default
            this function accepts input and emits output in batch-major form to be consistent
            with most of data format, though a bit less efficient because of extra transposes.
        param_attr(ParamAttr|None): The parameter attribute for the learnable
            weight matrix. Note:
            If it is set to None or one attribute of ParamAttr, lstm_unit will
            create ParamAttr as param_attr. If the Initializer of the param_attr
            is not set, the parameter is initialized with Xavier. Default: None.
        bias_attr (ParamAttr|None): The parameter attribute for the bias
            of LSTM unit.
            If it is set to None or one attribute of ParamAttr, lstm_unit will 
            create ParamAttr as bias_attr. If the Initializer of the bias_attr
            is not set, the bias is initialized zero. Default: None.
        gate_activation (function|None): The activation function for gates (actGate).
                                  Default: 'fluid.layers.sigmoid'
        activation (function|None): The activation function for cell (actNode).
                             Default: 'fluid.layers.tanh'
        forget_bias (float|1.0) : Forget bias used to compute the forget gate
        dtype(string): Data type used in this unit
        name(string): Name used to identify parameters and biases

    Returns:
        rnn_out(Tensor), last_hidden(Tensor), last_cell(Tensor)
            - rnn_out is the LSTM hidden output, with shape (seq_len x batch_size x hidden_size) \
              if is_bidirec is set to True, its shape will be ( seq_len x batch_size x hidden_size*2)
            - last_hidden is the hidden state of the last step of the LSTM \
              with shape ( num_layers x batch_size x hidden_size ) \
              if is_bidirec is set to True, its shape will be ( num_layers*2 x batch_size x hidden_size),
              and can be reshaped to a tensor ( num_layers x 2 x batch_size x hidden_size) to use.
            - last_cell is the cell state of the last step of the LSTM \
              with shape ( num_layers x batch_size x hidden_size ) \
              if is_bidirec is set to True, its shape will be ( num_layers*2 x batch_size x hidden_size),
              and can be reshaped to a tensor ( num_layers x 2 x batch_size x hidden_size) to use.

    Examples:
        .. code-block:: python
            
            import paddle.fluid.layers as layers
            from paddle.fluid.contrib.layers import basic_lstm

            batch_size = 20
            input_size = 128
            hidden_size = 256
            num_layers = 2
            dropout = 0.5
            bidirectional = True
            batch_first = False

            input = layers.data( name = "input", shape = [-1, batch_size, input_size], dtype='float32')
            pre_hidden = layers.data( name = "pre_hidden", shape=[-1, hidden_size], dtype='float32')
            pre_cell = layers.data( name = "pre_cell", shape=[-1, hidden_size], dtype='float32')
            sequence_length = layers.data( name="sequence_length", shape=[-1], dtype='int32')

            rnn_out, last_hidden, last_cell = basic_lstm( input, pre_hidden, pre_cell, \
                    hidden_size, num_layers = num_layers, \
                    sequence_length = sequence_length, dropout_prob=dropout, bidirectional = bidirectional, \
                    batch_first = batch_first)

    """
    fw_unit_list = []

    for i in range(num_layers):
        new_name = name + "_layers_" + str(i)
        if param_attr is not None and param_attr.name is not None:
            layer_param_attr = copy.deepcopy(param_attr)
            layer_param_attr.name += "_fw_w_" + str(i)
        else:
            layer_param_attr = param_attr
        if bias_attr is not None and bias_attr.name is not None:
            layer_bias_attr = copy.deepcopy(bias_attr)
            layer_bias_attr.name += "_fw_b_" + str(i)
        else:
            layer_bias_attr = bias_attr
        fw_unit_list.append(
            BasicLSTMUnit(new_name,
                          hidden_size,
                          param_attr=layer_param_attr,
                          bias_attr=layer_bias_attr,
                          gate_activation=gate_activation,
                          activation=activation,
                          forget_bias=forget_bias,
                          dtype=dtype))
    if bidirectional:
        bw_unit_list = []

        for i in range(num_layers):
            new_name = name + "_reverse_layers_" + str(i)
            if param_attr is not None and param_attr.name is not None:
                layer_param_attr = copy.deepcopy(param_attr)
                layer_param_attr.name += "_bw_w_" + str(i)
            else:
                layer_param_attr = param_attr
            if bias_attr is not None and bias_attr.name is not None:
                layer_bias_attr = copy.deepcopy(bias_attr)
                layer_bias_attr.name += "_bw_b_" + str(i)
            else:
                layer_bias_attr = bias_attr
            bw_unit_list.append(
                BasicLSTMUnit(new_name,
                              hidden_size,
                              param_attr=layer_param_attr,
                              bias_attr=layer_bias_attr,
                              gate_activation=gate_activation,
                              activation=activation,
                              forget_bias=forget_bias,
                              dtype=dtype))

    if batch_first:
        input = layers.transpose(input, [1, 0, 2])

    mask = None
    if sequence_length:
        max_seq_len = layers.shape(input)[0]
        mask = layers.sequence_mask(sequence_length,
                                    maxlen=max_seq_len,
                                    dtype='float32')

        mask = layers.transpose(mask, [1, 0])

    direc_num = 1
    if bidirectional:
        direc_num = 2
        # convert to [num_layers, 2, batch_size, hidden_size]
    if init_hidden:
        init_hidden = layers.reshape(
            init_hidden, shape=[num_layers, direc_num, -1, hidden_size])
        init_cell = layers.reshape(
            init_cell, shape=[num_layers, direc_num, -1, hidden_size])

    # forward direction
    def get_single_direction_output(rnn_input,
                                    unit_list,
                                    mask=None,
                                    direc_index=0):
        rnn = StaticRNN()
        with rnn.step():
            step_input = rnn.step_input(rnn_input)

            if mask:
                step_mask = rnn.step_input(mask)

            for i in range(num_layers):
                if init_hidden:
                    pre_hidden = rnn.memory(init=init_hidden[i, direc_index])
                    pre_cell = rnn.memory(init=init_cell[i, direc_index])
                else:
                    pre_hidden = rnn.memory(batch_ref=rnn_input,
                                            shape=[-1, hidden_size])
                    pre_cell = rnn.memory(batch_ref=rnn_input,
                                          shape=[-1, hidden_size])

                new_hidden, new_cell = unit_list[i](step_input, pre_hidden,
                                                    pre_cell)

                if mask:
                    new_hidden = layers.elementwise_mul(
                        new_hidden, step_mask,
                        axis=0) - layers.elementwise_mul(pre_hidden,
                                                         (step_mask - 1),
                                                         axis=0)
                    new_cell = layers.elementwise_mul(
                        new_cell, step_mask, axis=0) - layers.elementwise_mul(
                            pre_cell, (step_mask - 1), axis=0)

                rnn.update_memory(pre_hidden, new_hidden)
                rnn.update_memory(pre_cell, new_cell)

                rnn.step_output(new_hidden)
                rnn.step_output(new_cell)

                step_input = new_hidden
                if dropout_prob is not None and dropout_prob > 0.0:
                    step_input = layers.dropout(
                        step_input,
                        dropout_prob=dropout_prob,
                        dropout_implementation='upscale_in_train')

            rnn.step_output(step_input)

        rnn_out = rnn()

        last_hidden_array = []
        last_cell_array = []
        rnn_output = rnn_out[-1]
        for i in range(num_layers):
            last_hidden = rnn_out[i * 2]
            last_hidden = last_hidden[-1]
            last_hidden_array.append(last_hidden)
            last_cell = rnn_out[i * 2 + 1]
            last_cell = last_cell[-1]
            last_cell_array.append(last_cell)

        last_hidden_output = layers.concat(last_hidden_array, axis=0)
        last_hidden_output = layers.reshape(
            last_hidden_output, shape=[num_layers, -1, hidden_size])
        last_cell_output = layers.concat(last_cell_array, axis=0)
        last_cell_output = layers.reshape(last_cell_output,
                                          shape=[num_layers, -1, hidden_size])

        return rnn_output, last_hidden_output, last_cell_output
        # seq_len, batch_size, hidden_size

    fw_rnn_out, fw_last_hidden, fw_last_cell = get_single_direction_output(
        input, fw_unit_list, mask, direc_index=0)

    if bidirectional:
        bw_input = layers.reverse(input, axis=[0])
        bw_mask = None
        if mask:
            bw_mask = layers.reverse(mask, axis=[0])
        bw_rnn_out, bw_last_hidden, bw_last_cell = get_single_direction_output(
            bw_input, bw_unit_list, bw_mask, direc_index=1)

        bw_rnn_out = layers.reverse(bw_rnn_out, axis=[0])

        rnn_out = layers.concat([fw_rnn_out, bw_rnn_out], axis=2)
        last_hidden = layers.concat([fw_last_hidden, bw_last_hidden], axis=1)
        last_hidden = layers.reshape(
            last_hidden, shape=[num_layers * direc_num, -1, hidden_size])

        last_cell = layers.concat([fw_last_cell, bw_last_cell], axis=1)
        last_cell = layers.reshape(
            last_cell, shape=[num_layers * direc_num, -1, hidden_size])

        if batch_first:
            rnn_out = layers.transpose(rnn_out, [1, 0, 2])
        return rnn_out, last_hidden, last_cell
    else:

        rnn_out = fw_rnn_out
        last_hidden = fw_last_hidden
        last_cell = fw_last_cell

        if batch_first:
            rnn_out = layers.transpose(rnn_out, [1, 0, 2])

        return rnn_out, last_hidden, last_cell
Example #30
def basic_gru(input,
              init_hidden,
              hidden_size,
              num_layers=1,
              sequence_length=None,
              dropout_prob=0.0,
              bidirectional=False,
              batch_first=True,
              param_attr=None,
              bias_attr=None,
              gate_activation=None,
              activation=None,
              dtype='float32',
              name='basic_gru'):
    """
    GRU implementation using basic operator, supports multiple layers and bidirectional gru.

    .. math::
            u_t & = actGate(W_ux xu_{t} + W_uh h_{t-1} + b_u)

            r_t & = actGate(W_rx xr_{t} + W_rh h_{t-1} + b_r)

            m_t & = actNode(W_cx xm_t + W_ch dot(r_t, h_{t-1}) + b_m)

            h_t & = dot(u_t, h_{t-1}) + dot((1-u_t), m_t)

    Args:
        input (Variable): GRU input tensor,
                       if batch_first = False, shape should be ( seq_len x batch_size x input_size )
                       if batch_first = True, shape should be ( batch_size x seq_len x input_size )
        init_hidden(Variable|None): The initial hidden state of the GRU
                       This is a tensor with shape ( num_layers x batch_size x hidden_size)
                       if is_bidirec = True, shape should be ( num_layers*2 x batch_size x hidden_size)
                       and can be reshaped to tensor with ( num_layers x 2 x batch_size x hidden_size) to use.
                       If it's None, it will be set to all 0.
        hidden_size (int): Hidden size of the GRU
        num_layers (int): The total number of layers of the GRU
        sequence_length (Variable|None): A Tensor (shape [batch_size]) that stores the real length of each instance.
                        This tensor will be converted to a mask to mask out the padding ids.
                        If it's None, there are no padding ids.
        dropout_prob(float|0.0): Dropout prob, dropout ONLY works after the rnn output of each layer,
                             NOT between time steps
        bidirectional (bool|False): If it is bidirectional
        batch_first (bool|True): The shape format of the input and output tensors. If true,
            the shape format should be :attr:`[batch_size, seq_len, hidden_size]`. If false,
            the shape format should be :attr:`[seq_len, batch_size, hidden_size]`. By default
            this function accepts input and emits output in batch-major form to be consistent
            with most of data format, though a bit less efficient because of extra transposes.
        param_attr(ParamAttr|None): The parameter attribute for the learnable
            weight matrix. Note:
            If it is set to None or one attribute of ParamAttr, gru_unit will
            create ParamAttr as param_attr. If the Initializer of the param_attr
            is not set, the parameter is initialized with Xavier. Default: None.
        bias_attr (ParamAttr|None): The parameter attribute for the bias
            of GRU unit.
            If it is set to None or one attribute of ParamAttr, gru_unit will 
            create ParamAttr as bias_attr. If the Initializer of the bias_attr
            is not set, the bias is initialized zero. Default: None.
        gate_activation (function|None): The activation function for gates (actGate).
                                  Default: 'fluid.layers.sigmoid'
        activation (function|None): The activation function for cell (actNode).
                             Default: 'fluid.layers.tanh'
        dtype(string): data type used in this unit
        name(string): name used to identify parameters and biases

    Returns:
        rnn_out(Tensor),last_hidden(Tensor)
            - rnn_out is the GRU hidden output, with shape (seq_len x batch_size x hidden_size) \
              if is_bidirec is set to True, its shape will be ( seq_len x batch_size x hidden_size*2)
            - last_hidden is the hidden state of the last step of the GRU \
              with shape ( num_layers x batch_size x hidden_size ) \
              if is_bidirec is set to True, its shape will be ( num_layers*2 x batch_size x hidden_size),
              and can be reshaped to a tensor with shape ( num_layers x 2 x batch_size x hidden_size)

    Examples:
        .. code-block:: python
            
            import paddle.fluid.layers as layers
            from paddle.fluid.contrib.layers import basic_gru

            batch_size = 20
            input_size = 128
            hidden_size = 256
            num_layers = 2
            dropout = 0.5
            bidirectional = True
            batch_first = False

            input = layers.data( name = "input", shape = [-1, batch_size, input_size], dtype='float32')
            pre_hidden = layers.data( name = "pre_hidden", shape=[-1, hidden_size], dtype='float32')
            sequence_length = layers.data( name="sequence_length", shape=[-1], dtype='int32')


            rnn_out, last_hidden = basic_gru( input, pre_hidden, hidden_size, num_layers = num_layers, \
                    sequence_length = sequence_length, dropout_prob=dropout, bidirectional = bidirectional, \
                    batch_first = batch_first)

    """

    fw_unit_list = []

    for i in range(num_layers):
        new_name = name + "_layers_" + str(i)
        if param_attr is not None and param_attr.name is not None:
            layer_param_attr = copy.deepcopy(param_attr)
            layer_param_attr.name += "_fw_w_" + str(i)
        else:
            layer_param_attr = param_attr
        if bias_attr is not None and bias_attr.name is not None:
            layer_bias_attr = copy.deepcopy(bias_attr)
            layer_bias_attr.name += "_fw_b_" + str(i)
        else:
            layer_bias_attr = bias_attr
        fw_unit_list.append(
            BasicGRUUnit(new_name, hidden_size, layer_param_attr,
                         layer_bias_attr, gate_activation, activation, dtype))
    if bidirectional:
        bw_unit_list = []

        for i in range(num_layers):
            new_name = name + "_reverse_layers_" + str(i)
            if param_attr is not None and param_attr.name is not None:
                layer_param_attr = copy.deepcopy(param_attr)
                layer_param_attr.name += "_bw_w_" + str(i)
            else:
                layer_param_attr = param_attr
            if bias_attr is not None and bias_attr.name is not None:
                layer_bias_attr = copy.deepcopy(bias_attr)
                layer_bias_attr.name += "_bw_b_" + str(i)
            else:
                layer_bias_attr = bias_attr

            bw_unit_list.append(
                BasicGRUUnit(new_name, hidden_size, layer_param_attr,
                             layer_bias_attr, gate_activation, activation,
                             dtype))

    if batch_first:
        input = layers.transpose(input, [1, 0, 2])

    mask = None
    if sequence_length:
        max_seq_len = layers.shape(input)[0]
        mask = layers.sequence_mask(sequence_length,
                                    maxlen=max_seq_len,
                                    dtype='float32')
        mask = layers.transpose(mask, [1, 0])

    direc_num = 1
    if bidirectional:
        direc_num = 2
    if init_hidden:
        init_hidden = layers.reshape(
            init_hidden, shape=[num_layers, direc_num, -1, hidden_size])

    def get_single_direction_output(rnn_input,
                                    unit_list,
                                    mask=None,
                                    direc_index=0):
        rnn = StaticRNN()
        with rnn.step():
            step_input = rnn.step_input(rnn_input)

            if mask:
                step_mask = rnn.step_input(mask)

            for i in range(num_layers):
                if init_hidden:
                    pre_hidden = rnn.memory(init=init_hidden[i, direc_index])
                else:
                    pre_hidden = rnn.memory(batch_ref=rnn_input,
                                            shape=[-1, hidden_size],
                                            ref_batch_dim_idx=1)

                new_hidden = unit_list[i](step_input, pre_hidden)

                if mask:
                    new_hidden = layers.elementwise_mul(
                        new_hidden, step_mask,
                        axis=0) - layers.elementwise_mul(pre_hidden,
                                                         (step_mask - 1),
                                                         axis=0)
                rnn.update_memory(pre_hidden, new_hidden)

                rnn.step_output(new_hidden)

                step_input = new_hidden
                if dropout_prob is not None and dropout_prob > 0.0:
                    step_input = layers.dropout(
                        step_input,
                        dropout_prob=dropout_prob,
                    )

            rnn.step_output(step_input)

        rnn_out = rnn()

        last_hidden_array = []
        rnn_output = rnn_out[-1]
        for i in range(num_layers):
            last_hidden = rnn_out[i]
            last_hidden = last_hidden[-1]
            last_hidden_array.append(last_hidden)

        last_hidden_output = layers.concat(last_hidden_array, axis=0)
        last_hidden_output = layers.reshape(
            last_hidden_output, shape=[num_layers, -1, hidden_size])

        return rnn_output, last_hidden_output
        # seq_len, batch_size, hidden_size

    fw_rnn_out, fw_last_hidden = get_single_direction_output(input,
                                                             fw_unit_list,
                                                             mask,
                                                             direc_index=0)

    if bidirectional:
        bw_input = layers.reverse(input, axis=[0])
        bw_mask = None
        if mask:
            bw_mask = layers.reverse(mask, axis=[0])
        bw_rnn_out, bw_last_hidden = get_single_direction_output(bw_input,
                                                                 bw_unit_list,
                                                                 bw_mask,
                                                                 direc_index=1)

        bw_rnn_out = layers.reverse(bw_rnn_out, axis=[0])

        rnn_out = layers.concat([fw_rnn_out, bw_rnn_out], axis=2)
        last_hidden = layers.concat([fw_last_hidden, bw_last_hidden], axis=1)

        last_hidden = layers.reshape(
            last_hidden, shape=[num_layers * direc_num, -1, hidden_size])

        if batch_first:
            rnn_out = layers.transpose(rnn_out, [1, 0, 2])
        return rnn_out, last_hidden
    else:

        rnn_out = fw_rnn_out
        last_hidden = fw_last_hidden

        if batch_first:
            rnn_out = layers.transpose(rnn_out, [1, 0, 2])

        return rnn_out, last_hidden