Example #1
def epoch_train(args, model, optimizer, loader, epoch):
    """Train in one epoch"""
    model.train()
    total_loss = 0
    pad_index = args.pad_index
    bos_index = args.bos_index
    eos_index = args.eos_index

    for batch, inputs in enumerate(loader(), start=1):
        model.clear_gradients()

        if args.encoding_model.startswith("ernie"):
            words, arcs, rels = inputs
            s_arc, s_rel, words = model(words)
        else:
            words, feats, arcs, rels = inputs
            s_arc, s_rel, words = model(words, feats)

        mask = layers.logical_and(
            layers.logical_and(words != pad_index, words != bos_index),
            words != eos_index,
        )

        loss = loss_function(s_arc, s_rel, arcs, rels, mask)
        loss.backward()

        optimizer.minimize(loss)
        total_loss += loss.numpy().item()
        logging.info(
            "epoch: {}, batch: {}/{}, batch_size: {}, loss: {:.4f}".format(
                epoch, batch, math.ceil(len(loader)), len(words),
                loss.numpy().item()))
    total_loss /= len(loader)
    return total_loss
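
The nested logical_and calls build a token-level mask that is True only for real words, excluding padding and the sentence-boundary markers. A minimal eager-mode sketch of the same pattern, assuming Paddle 2.x and example index values pad=0, bos=1, eos=2:

import numpy as np
import paddle

pad_index, bos_index, eos_index = 0, 1, 2          # assumed index values
words = paddle.to_tensor(np.array([[1, 5, 7, 0, 2]]))
mask = paddle.logical_and(
    paddle.logical_and(words != pad_index, words != bos_index),
    words != eos_index,
)
print(mask.numpy())  # [[False  True  True False False]]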
Example #2
    def get_metrics(self, inputs, outputs):
        """Get metrics."""
        metrics = {}
        pooled_out = self._get_pooled_output(outputs["enc_out"])
        cls_logits = self._get_classifier_output(pooled_out,
                                                 num_classes=self.num_classes,
                                                 name="cls")
        cls_loss, cls_softmax = layers.softmax_with_cross_entropy(
            logits=cls_logits, label=inputs["label"], return_softmax=True)

        cls_acc = layers.accuracy(cls_softmax, inputs["label"])
        mean_cls_loss = layers.mean(cls_loss)

        metrics["loss"] = mean_cls_loss
        metrics["cls_loss"] = mean_cls_loss
        metrics["cls_acc"] = cls_acc

        # statistics for recall & precision & f1
        if self.num_classes == 2:
            pred = layers.argmax(cls_softmax, axis=1)
            label = layers.squeeze(inputs["label"], axes=[1])
            metrics["stat_tp"] = layers.reduce_sum(
                layers.logical_and(pred == 1, label == 1).astype("float32"))
            metrics["stat_fp"] = layers.reduce_sum(
                layers.logical_and(pred == 1, label == 0).astype("float32"))
            metrics["stat_tn"] = layers.reduce_sum(
                layers.logical_and(pred == 0, label == 0).astype("float32"))
            metrics["stat_fn"] = layers.reduce_sum(
                layers.logical_and(pred == 0, label == 1).astype("float32"))
        return metrics
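
The four stat_* entries are raw confusion-matrix counts meant to be accumulated across batches. A plain-Python sketch of how precision, recall, and F1 would typically be derived from them (helper name assumed, not part of the source):

def precision_recall_f1(stat_tp, stat_fp, stat_fn, eps=1e-8):
    # stat_* are the accumulated scalar counts from the metrics dict above
    precision = stat_tp / (stat_tp + stat_fp + eps)
    recall = stat_tp / (stat_tp + stat_fn + eps)
    f1 = 2 * precision * recall / (precision + recall + eps)
    return precision, recall, f1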
Example #3
def get_face_mask(densepose_map):
    """
    Obtain mask of faces. 

    Args:
        densepose_map (3D or 4D tensor)
    """
    need_reshape = len(densepose_map.shape) == 4
    if need_reshape:
        bo, t, h, w = densepose_map.shape
        densepose_map = L.reshape(densepose_map, (-1, h, w))

    b, h, w = densepose_map.shape
    part_map = (densepose_map / 2 + 0.5) * 24
    assert L.reduce_all((part_map >= 0)) and L.reduce_all((part_map < 25))

    mask = dg.to_variable(np.zeros((b, h, w)).astype('bool'))

    for j in [23, 24]:
        mask = L.logical_or(
            mask, L.logical_and((part_map > j - 0.1), (part_map < j + 0.1)))

    if need_reshape:
        mask = L.reshape(mask, (bo, t, h, w))

    return P.cast(mask, "float32")
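
The map is assumed to encode DensePose part indices 0-24 rescaled into [-1, 1], so (x / 2 + 0.5) * 24 recovers the index and the ±0.1 band selects parts 23 and 24 (the face). A small numpy check of that decoding, under the same assumption:

import numpy as np

dp = np.array([-1.0, 0.0, 23 / 12 - 1, 1.0])  # values encoded in [-1, 1]
part = (dp / 2 + 0.5) * 24                    # back to part indices in [0, 24]
face = ((part > 22.9) & (part < 23.1)) | ((part > 23.9) & (part < 24.1))
print(part.round(2), face)  # [ 0. 12. 23. 24.] [False False  True  True]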
Example #4
    def is_finished(self, step_idx, source_length, alive_log_probs, finished_scores, finished_in_finished):
        """Check whether the beam search should stop."""
        base_1 = layers.cast(source_length, 'float32') + 55.0
        base_1 /= 6.0
        max_length_penalty = layers.pow(base_1, self.alpha)

        flat_alive_log_probs = layers.reshape(alive_log_probs, [-1])
        lower_bound_alive_scores_1 = layers.gather(flat_alive_log_probs, [self.get_alive_index])
        
        lower_bound_alive_scores = lower_bound_alive_scores_1 / max_length_penalty
        
        lowest_score_of_finished_in_finish = layers.reduce_min(finished_scores * finished_in_finished, dim=1)

        finished_in_finished = layers.cast(finished_in_finished, 'bool')
        lowest_score_of_finished_in_finish += \
                        ((1.0 - layers.cast(layers.reduce_any(finished_in_finished, 1), 'float32')) * -INF)
        
        bound_is_met = layers.reduce_all(layers.greater_than(lowest_score_of_finished_in_finish,
                                                             lower_bound_alive_scores))

        decode_length = source_length + 50
        length_cond = layers.less_than(x=step_idx, y=decode_length)

        return layers.logical_and(x=layers.logical_not(bound_is_met), y=length_cond)
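
Note that (source_length + 55) / 6 equals (5 + n) / 6 at n = source_length + 50, which is exactly the decode_length bound used above, so max_length_penalty is the GNMT length penalty lp(n) = ((5 + n) / 6) ** alpha evaluated at the longest possible output. In plain Python:

def gnmt_length_penalty(n, alpha):
    # lp(n) from the GNMT paper; here n = source_length + 50
    return ((5.0 + n) / 6.0) ** alpha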
Example #5
def epoch_evaluate(args, model, loader, puncts):
    """Evaluate in one epoch"""
    model.eval()

    total_loss, metric = 0, Metric()

    for words, feats, arcs, rels in loader():
        # ignore the first token of each sentence
        tmp_words = layers.pad(words[:, 1:],
                               paddings=[0, 0, 1, 0],
                               pad_value=args.pad_index)
        mask = tmp_words != args.pad_index

        s_arc, s_rel = model(words, feats)
        loss = loss_function(s_arc, s_rel, arcs, rels, mask)
        arc_preds, rel_preds = decode(args, s_arc, s_rel, mask)
        # ignore all punctuation if not specified
        if not args.punct:
            punct_mask = layers.reduce_all(
                layers.expand(layers.unsqueeze(words, -1),
                              (1, 1, puncts.shape[0])) != layers.expand(
                                  layers.reshape(puncts, (1, 1, -1)),
                                  (*words.shape, 1)),
                dim=-1)
            mask = layers.logical_and(mask, punct_mask)

        metric(arc_preds, rel_preds, arcs, rels, mask)
        total_loss += loss.numpy().item()

    total_loss /= len(loader)

    return total_loss, metric
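
The expand/reduce_all construction tests every word id against every punctuation id and keeps the tokens that match none of them. The same test via numpy broadcasting (a sketch with made-up ids):

import numpy as np

words = np.array([[4, 7, 9]])    # [batch_size, seq_len] word ids
puncts = np.array([7, 12])       # punctuation ids
punct_mask = (words[..., None] != puncts).all(axis=-1)
print(punct_mask)                # [[ True False  True]]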
Example #6
    def forward(self, words, feats=None):
        """Forward network"""
        # batch_size, seq_len = words.shape
        # get embedding
        words, x = self.embed(words, feats)
        mask = layers.logical_and(words != self.args.pad_index,
                                  words != self.args.eos_index)

        # apply MLPs to the BiLSTM output states
        arc_h = self.mlp_arc_h(x)
        arc_d = self.mlp_arc_d(x)
        rel_h = self.mlp_rel_h(x)
        rel_d = self.mlp_rel_d(x)

        # get arc and rel scores from the bilinear attention
        # [batch_size, seq_len, seq_len]
        s_arc = self.arc_attn(arc_d, arc_h)
        # [batch_size, seq_len, seq_len, n_rels]
        s_rel = layers.transpose(self.rel_attn(rel_d, rel_h),
                                 perm=(0, 2, 3, 1))
        # set the scores that exceed the length of each sentence to -1e5
        s_arc_mask = paddle.unsqueeze(mask, 1)
        s_arc = s_arc * s_arc_mask + paddle.scale(paddle.cast(
            s_arc_mask, 'int32'),
                                                  scale=1e5,
                                                  bias=-1,
                                                  bias_after_scale=False)

        return s_arc, s_rel, words
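
With bias_after_scale=False, paddle.scale computes scale * (x + bias), so the added term is (mask - 1) * 1e5: zero where the mask is True and -1e5 where it is False, which pushes out-of-sentence scores down to -1e5. A quick numpy check of the arithmetic:

import numpy as np

mask = np.array([[1, 1, 0]])            # 1 inside the sentence, 0 beyond it
s_arc = np.array([[2.0, 3.0, 4.0]])
out = s_arc * mask + (mask - 1) * 1e5   # same as the scale trick above
print(out)                              # [[ 2.e+00  3.e+00 -1.e+05]]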
Example #7
    def __call__(self, arc_preds, rel_preds, arc_golds, rel_golds, mask):
        """Update attachment counts with one batch of predictions."""
        arc_mask = nn.masked_select(arc_preds == arc_golds, mask)
        rel_mask = layers.logical_and(
            nn.masked_select(rel_preds == rel_golds, mask), arc_mask)
        self.total += len(arc_mask)
        self.correct_arcs += np.sum(arc_mask.numpy()).item()
        self.correct_rels += np.sum(rel_mask.numpy()).item()
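
From these three counters the standard dependency-parsing scores follow directly. Hypothetical accessors (not part of the source) might look like:

    @property
    def uas(self):
        """Unlabeled attachment score: correct heads / scored tokens."""
        return self.correct_arcs / max(self.total, 1)

    @property
    def las(self):
        """Labeled attachment score: correct heads with correct labels / scored tokens."""
        return self.correct_rels / max(self.total, 1)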
Example #8
    def logical_and(cls, x, y, *args, out=None, name=None):
        """Wrapper of paddle.fluid.layers.logical_and that accepts two or
        more inputs and folds them left to right.

        Args:
            x (Variable): first input.
            y (Variable): second input.
            *args (Variable): additional inputs.
            out (Variable, optional): variable to store the result. Default is None.
            name (str, optional): layer name. Default is None.

        Returns:
            Variable: the element-wise logical AND of all inputs.
        """
        tmp = layers.logical_and(x, y, out=out, name=name)
        for var in args:
            tmp = layers.logical_and(tmp, var, out=out, name=name)
        return tmp
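
The wrapper folds any number of conditions left to right, so the multi-condition masks from the parser examples above collapse into a single call. A usage sketch (the owning class is not shown, so the name Tensors is assumed):

mask = Tensors.logical_and(words != pad_index,
                           words != bos_index,
                           words != eos_index)
# equivalent to:
# layers.logical_and(layers.logical_and(words != pad_index,
#                                       words != bos_index),
#                    words != eos_index)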
Example #9
def epoch_predict(env, args, model, loader):
    """Predict in one epoch"""
    connections, deprels, probabilities = [], [], []
    pad_index = args.pad_index
    bos_index = args.bos_index
    eos_index = args.eos_index
    for batch, inputs in enumerate(loader(), start=1):
        if args.encoding_model.startswith("ernie"):
            words = inputs[0]
            connection_prob, deprel_prob, words = model(words)
        else:
            words, feats = inputs
            connection_prob, deprel_prob, words = model(words, feats)
        mask = layers.logical_and(
            layers.logical_and(words != pad_index, words != bos_index),
            words != eos_index,
        )
        lens = nn.reduce_sum(mask, -1)
        connection_predicts, deprel_predicts = decode(args, connection_prob,
                                                      deprel_prob, mask)
        connections.extend(
            layers.split(nn.masked_select(connection_predicts, mask),
                         lens.numpy().tolist()))
        deprels.extend(
            layers.split(nn.masked_select(deprel_predicts, mask),
                         lens.numpy().tolist()))
        if args.prob:
            arc_probs = nn.index_sample(
                layers.softmax(connection_prob, -1),
                layers.unsqueeze(connection_predicts, -1))
            probabilities.extend(
                layers.split(
                    nn.masked_select(layers.squeeze(arc_probs, axes=[-1]),
                                     mask),
                    lens.numpy().tolist(),
                ))
    connections = [seq.numpy().tolist() for seq in connections]
    deprels = [env.REL.vocab[seq.numpy().tolist()] for seq in deprels]
    probabilities = [[round(p, 3) for p in seq.numpy().tolist()]
                     for seq in probabilities]

    return connections, deprels, probabilities
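
nn.masked_select flattens all valid tokens across the batch, and layers.split with the per-sentence lengths restores sentence boundaries. The equivalent in numpy (note that np.split takes cut indices rather than chunk sizes):

import numpy as np

flat = np.array([3, 1, 4, 1, 5])    # masked-selected predictions, batch flattened
lens = [2, 3]                       # valid tokens per sentence
sents = np.split(flat, np.cumsum(lens)[:-1])
print([s.tolist() for s in sents])  # [[3, 1], [4, 1, 5]]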
Example #10
def epoch_evaluate(args, model, loader, punctuation):
    """Evaluate in one epoch"""
    model.eval()
    total_loss, metric = 0, Metric()
    pad_index = args.pad_index
    bos_index = args.bos_index
    eos_index = args.eos_index

    for batch_index, inputs in enumerate(loader(), start=1):
        if args.encoding_model.startswith("ernie"):
            words, connections, deprel = inputs
            connection_prob, deprel_prob, words = model(words)
        else:
            words, feats, connections, deprel = inputs
            connection_prob, deprel_prob, words = model(words, feats)
        mask = layers.logical_and(
            layers.logical_and(words != pad_index, words != bos_index),
            words != eos_index,
        )
        loss = loss_function(connection_prob, deprel_prob, connections, deprel,
                             mask)
        connection_predict, deprel_predict = decode(args, connection_prob,
                                                    deprel_prob, mask)
        # ignore all punctuation if not specified
        if not args.punct:
            punct_mask = layers.reduce_all(
                layers.expand(layers.unsqueeze(words, -1),
                              (1, 1, punctuation.shape[0])) !=
                layers.expand(layers.reshape(punctuation,
                                             (1, 1, -1)), words.shape + [1]),
                dim=-1)

            mask = layers.logical_and(mask, punct_mask)

        metric(connection_predict, deprel_predict, connections, deprel, mask)
        total_loss += loss.numpy().item()

    total_loss /= len(loader)

    return total_loss, metric
Example #11
def _process_type_leaf(condition, decoder, grammar_stack, next_inputs,
                       finished):
    """Process when output type is LEAF

    Args:
        condition (TYPE): NULL
        decoder (TYPE): NULL
        grammar_stack (StackData): (gmr_stack_data, gmr_stack_pos)
        next_inputs (DecoderInputsWrapper): (input_var, action, grammar_mask)
        finished (TYPE): NULL

    Returns: None

    Raises: NULL
    """
    ## pop stack
    next_output, valid_pos, gmr_stack_tmp = data_structure.Stack.pop(
        grammar_stack, mask=True, in_place=False)
    valid_pos = fluider.squeeze(valid_pos, [1])

    ## update next grammar mask
    next_actions = layers.elementwise_mul(decoder.grammar_action(next_output),
                                          layers.cast(
                                              valid_pos,
                                              dtype=next_inputs.action.dtype),
                                          axis=0)
    next_gmr_mask = layers.elementwise_mul(
        decoder.grammar_mask(next_output),
        layers.cast(valid_pos, dtype=next_inputs.gmr_mask.dtype),
        axis=0)

    ## save result, while condition is True
    new_gmr_stack_data, new_gmr_stack_pos, new_actions, new_gmr_mask = nn_utils.ifelse(
        condition,
        [gmr_stack_tmp.data, gmr_stack_tmp.pos, next_actions, next_gmr_mask], [
            grammar_stack.data, grammar_stack.pos, next_inputs.action,
            next_inputs.gmr_mask
        ])

    layers.utils.map_structure(
        layers.assign,
        [new_gmr_stack_data, new_gmr_stack_pos, next_actions, new_gmr_mask], [
            grammar_stack.data, grammar_stack.pos, next_inputs.action,
            next_inputs.gmr_mask
        ])
    layers.logical_or(finished,
                      layers.logical_and(condition,
                                         layers.logical_not(valid_pos)),
                      out=finished)
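
The out=finished form writes the result back into the finished tensor, so the enclosing decode loop observes the update without any reassignment. In eager Paddle 2.x the same update would be an ordinary reassignment (a sketch, names as above):

finished = paddle.logical_or(
    finished,
    paddle.logical_and(condition, paddle.logical_not(valid_pos)))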
Example #12
def select_edges(src, dst, input_mask, num_nodes, max_seqlen):
    # clamp node ids into the valid range [0, num_nodes - 1]
    src = fluid.layers.elementwise_max(src, num_nodes * 0)
    dst = fluid.layers.elementwise_max(dst, num_nodes * 0)
    src = fluid.layers.elementwise_min(src, num_nodes - 1)
    dst = fluid.layers.elementwise_min(dst, num_nodes - 1)

    conditions = []
    conditions.append(L.gather(input_mask, src) > 0.5)
    conditions.append(L.gather(input_mask, dst) > 0.5)
    block_src = src / max_seqlen
    block_dst = dst / max_seqlen
    conditions.append(block_src == block_dst)
    mask = None
    for cond in conditions:
        if mask is None:
            mask = cond
        else:
            mask = L.logical_and(mask, cond)

    dst = masked_select(dst, mask)
    src = masked_select(src, mask)
    return src, dst
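
src / max_seqlen appears to rely on the truncating integer division of the old fluid `/` operator, so two node ids fall in the same block exactly when they belong to the same source sequence. The block test in numpy, with explicit floor division:

import numpy as np

max_seqlen = 4
src = np.array([0, 3, 5])
dst = np.array([1, 6, 7])
same_block = (src // max_seqlen) == (dst // max_seqlen)
print(same_block)  # [ True False  True]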
Example #13
def _check_finished(decoder, next_inputs, finished, outputs_array):
    """check finished instance by next_inputs.action, and
    update finished tag and write END to outputs

    Args:
        decoder (TYPE): NULL
        next_inputs (TYPE): NULL
        finished (TYPE): NULL
        outputs_array (TYPE): NULL

    Returns: TODO

    Raises: NULL
    """
    act_stop = tensor.fill_constant_batch_size_like(
        next_inputs.action,
        shape=next_inputs.action.shape,
        value=decoder._grammar.ACTION_STOP,
        dtype='int64')
    new_finished = layers.logical_and(
        layers.equal(next_inputs.action, act_stop),
        layers.logical_not(finished))

    end_token_id = tensor.fill_constant_batch_size_like(
        outputs_array.data,
        shape=[-1],
        value=decoder._grammar.END,
        dtype=outputs_array.data.dtype)
    out_data_tmp, out_pos_tmp = data_structure.Array.push(outputs_array,
                                                          end_token_id,
                                                          in_place=False)
    new_data, new_pos = nn_utils.ifelse(
        new_finished, [out_data_tmp, out_pos_tmp],
        [outputs_array.data, outputs_array.pos])

    layers.assign(new_data, outputs_array.data)
    layers.assign(new_pos, outputs_array.pos)
    layers.logical_or(finished, new_finished, out=finished)
Example #14
    def forward(self, x, y):
        # x and y are offset by one frame
        u1 = zeros_like(x)
        u2 = zeros_like(x)
        l_t = self.l * self.t
        taut = self.a / self.t

        grad2_x = self.conv_img_grad(y)
        # grad2_x[:, :, :, 0] = 0.5 * (x[:, :, :, 1] - x[:, :, :, 0])
        # grad2_x[:, :, :, -1] = 0.5 * (x[:, :, :, -1] - x[:, :, :, -2])

        grad2_y = self.conv_img_grad2(y)
        # grad2_y[:, :, 0, :] = 0.5 * (x[:, :, 1, :] - x[:, :, 0, :])
        # grad2_y[:, :, -1, :] = 0.5 * (x[:, :, -1, :] - x[:, :, -2, :])

        p11 = zeros_like(x)
        p12 = zeros_like(x)
        p21 = zeros_like(x)
        p22 = zeros_like(x)

        gsqx = grad2_x**2
        gsqy = grad2_y**2
        grad = gsqx + gsqy + 1e-12

        rho_c = y - grad2_x * u1 - grad2_y * u2 - x

        for i in range(self.n_iter):
            rho = rho_c + grad2_x * u1 + grad2_y * u2 + 1e-12

            mask1 = rho < -l_t * grad
            mask2 = rho > l_t * grad
            mask3 = logical_and(logical_not(logical_or(mask1, mask2)),
                                (grad > 1e-12))
            mask1 = cast(mask1, dtype='float32')
            mask2 = cast(mask2, dtype='float32')
            mask3 = cast(mask3, dtype='float32')
            mask1.stop_gradient = True
            mask2.stop_gradient = True
            mask3.stop_gradient = True

            # v1 = v1 + l_t * grad2_x * mask1 - l_t * grad2_x * mask2 - (rho / grad) * grad2_x * mask3
            # v2 = v2 + l_t * grad2_y * mask1 - l_t * grad2_y * mask2 - (rho / grad) * grad2_y * mask3
            v1 = elementwise_add(
                u1,
                elementwise_add(
                    elementwise_mul(l_t * grad2_x, mask1),
                    elementwise_add(
                        elementwise_mul(-l_t * grad2_x, mask2),
                        elementwise_mul(-elementwise_div(rho, grad),
                                        elementwise_mul(grad2_x, mask3)))))
            v2 = elementwise_add(
                u2,
                elementwise_add(
                    elementwise_mul(l_t * grad2_y, mask1),
                    elementwise_add(
                        elementwise_mul(-l_t * grad2_y, mask2),
                        elementwise_mul(-elementwise_div(rho, grad),
                                        elementwise_mul(grad2_y, mask3)))))

            del rho
            del mask1
            del mask2
            del mask3

            # v1 and v2 already start from u1 and u2 (the elementwise_add
            # calls above), so no extra accumulation is needed here.

            u1 = v1 + self.t * self.divergence(p11, p12)
            u2 = v2 + self.t * self.divergence(p21, p22)
            del v1
            del v2

            u1x, u1y = self.forward_grad(u1)
            u2x, u2y = self.forward_grad(u2)

            p11 = (p11 + taut * u1x) / (1. +
                                        taut * sqrt(u1x**2 + u1y**2 + 1e-12))
            p12 = (p12 + taut * u1y) / (1. +
                                        taut * sqrt(u1x**2 + u1y**2 + 1e-12))
            p21 = (p21 + taut * u2x) / (1. +
                                        taut * sqrt(u2x**2 + u2y**2 + 1e-12))
            p22 = (p22 + taut * u2y) / (1. +
                                        taut * sqrt(u2x**2 + u2y**2 + 1e-12))
            del u1x
            del u1y
            del u2x
            del u2y

        return u1, u2
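
The three masks implement the case split of the TV-L1 thresholding step on the residual rho (cf. Zach et al.'s TV-L1 optical flow). In scalar form the update computed above is:

# if   rho < -l_t * grad:  v = u + l_t * grad_img           (mask1)
# elif rho >  l_t * grad:  v = u - l_t * grad_img           (mask2)
# elif grad > 1e-12:       v = u - (rho / grad) * grad_img  (mask3)
# else:                    v = u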
Example #15
    def beam_search():
        max_len = layers.fill_constant(
            shape=[1], dtype=start_tokens.dtype, value=max_out_len)
        step_idx = layers.fill_constant(
            shape=[1], dtype=start_tokens.dtype, value=0)
        cond = layers.less_than(x=step_idx, y=max_len)
        while_op = layers.While(cond)
        # array states will be stored for each step.
        ids = layers.array_write(start_tokens, step_idx)
        scores = layers.array_write(init_scores, step_idx)
        # cell states will be overwritten at each step.
        # caches contains states of history steps to reduce redundant
        # computation in decoder.
        caches = [{
            "k": layers.fill_constant_batch_size_like(
                input=start_tokens,
                shape=[-1, 0, d_model],
                dtype=enc_output.dtype,
                value=0),
            "v": layers.fill_constant_batch_size_like(
                input=start_tokens,
                shape=[-1, 0, d_model],
                dtype=enc_output.dtype,
                value=0)
        } for i in range(n_layer)]
        with while_op.block():
            pre_ids = layers.array_read(array=ids, i=step_idx)
            pre_scores = layers.array_read(array=scores, i=step_idx)
            # sequence_expand can gather sequences according to lod thus can be
            # used in beam search to sift states corresponding to selected ids.
            pre_src_attn_bias = layers.sequence_expand(
                x=trg_src_attn_bias, y=pre_scores)
            pre_enc_output = layers.sequence_expand(x=enc_output, y=pre_scores)
            pre_caches = [{
                "k": layers.sequence_expand(
                    x=cache["k"], y=pre_scores),
                "v": layers.sequence_expand(
                    x=cache["v"], y=pre_scores),
            } for cache in caches]
            pre_pos = layers.elementwise_mul(
                x=layers.fill_constant_batch_size_like(
                    input=pre_enc_output,  # can't use pre_ids here since it has lod
                    value=1,
                    shape=[-1, 1],
                    dtype=pre_ids.dtype),
                y=layers.increment(
                    x=step_idx, value=1.0, in_place=False),
                axis=0)
            logits = wrap_decoder(
                trg_vocab_size,
                max_in_len,
                n_layer,
                n_head,
                d_key,
                d_value,
                d_model,
                d_inner_hid,
                dropout_rate,
                weight_sharing,
                dec_inputs=(
                    pre_ids, pre_pos, None, pre_src_attn_bias, trg_data_shape,
                    slf_attn_pre_softmax_shape, slf_attn_post_softmax_shape,
                    src_attn_pre_softmax_shape, src_attn_post_softmax_shape),
                enc_output=pre_enc_output,
                caches=pre_caches)
            topk_scores, topk_indices = layers.topk(
                input=layers.softmax(logits), k=beam_size)
            accu_scores = layers.elementwise_add(
                x=layers.log(topk_scores),
                y=layers.reshape(
                    pre_scores, shape=[-1]),
                axis=0)
            # beam_search op uses lod to distinguish branches.
            topk_indices = layers.lod_reset(topk_indices, pre_ids)
            selected_ids, selected_scores = layers.beam_search(
                pre_ids=pre_ids,
                pre_scores=pre_scores,
                ids=topk_indices,
                scores=accu_scores,
                beam_size=beam_size,
                end_id=eos_idx)
            layers.increment(x=step_idx, value=1.0, in_place=True)
            # update states
            layers.array_write(selected_ids, i=step_idx, array=ids)
            layers.array_write(selected_scores, i=step_idx, array=scores)
            layers.assign(pre_src_attn_bias, trg_src_attn_bias)
            layers.assign(pre_enc_output, enc_output)
            for i in range(n_layer):
                layers.assign(pre_caches[i]["k"], caches[i]["k"])
                layers.assign(pre_caches[i]["v"], caches[i]["v"])
            layers.assign(
                layers.elementwise_add(
                    x=slf_attn_pre_softmax_shape,
                    y=attn_pre_softmax_shape_delta),
                slf_attn_pre_softmax_shape)
            layers.assign(
                layers.elementwise_add(
                    x=slf_attn_post_softmax_shape,
                    y=attn_post_softmax_shape_delta),
                slf_attn_post_softmax_shape)

            length_cond = layers.less_than(x=step_idx, y=max_len)
            finish_cond = layers.logical_not(layers.is_empty(x=selected_ids))
            layers.logical_and(x=length_cond, y=finish_cond, out=cond)

        finished_ids, finished_scores = layers.beam_search_decode(
            ids, scores, beam_size=beam_size, end_id=eos_idx)
        return finished_ids, finished_scores
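
The termination logic at the end of the block recurs in all the beam-search loops in these examples: decoding continues while the step count is below max_len and layers.beam_search still returns non-empty candidates. A plain-Python restatement (a sketch, not Paddle API):

def should_continue(step_idx, max_len, selected_ids):
    # mirrors cond = logical_and(length_cond, finish_cond)
    return step_idx < max_len and len(selected_ids) > 0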
Example #16
    def gru_attention_infer(self, decoder_boot, max_length, char_num,
                            word_vector_dim, encoded_vector, encoded_proj,
                            decoder_size):
        init_state = decoder_boot
        beam_size = 1
        array_len = layers.fill_constant(
            shape=[1], dtype='int64', value=max_length)
        counter = layers.zeros(shape=[1], dtype='int64', force_cpu=True)

        # fill the first element with init_state
        state_array = layers.create_array('float32')
        layers.array_write(init_state, array=state_array, i=counter)

        # ids, scores as memory
        ids_array = layers.create_array('int64')
        scores_array = layers.create_array('float32')
        rois_shape = layers.shape(init_state)
        batch_size = layers.slice(
            rois_shape, axes=[0], starts=[0], ends=[1]) + 1
        lod_level = layers.range(
            start=0, end=batch_size, step=1, dtype=batch_size.dtype)

        init_ids = layers.fill_constant_batch_size_like(
            input=init_state, shape=[-1, 1], value=0, dtype='int64')
        init_ids = layers.lod_reset(init_ids, lod_level)
        init_ids = layers.lod_append(init_ids, lod_level)

        init_scores = layers.fill_constant_batch_size_like(
            input=init_state, shape=[-1, 1], value=1, dtype='float32')
        init_scores = layers.lod_reset(init_scores, init_ids)
        layers.array_write(init_ids, array=ids_array, i=counter)
        layers.array_write(init_scores, array=scores_array, i=counter)

        full_ids = fluid.layers.fill_constant_batch_size_like(
            input=init_state, shape=[-1, 1], dtype='int64', value=1)

        cond = layers.less_than(x=counter, y=array_len)
        while_op = layers.While(cond=cond)
        with while_op.block():
            pre_ids = layers.array_read(array=ids_array, i=counter)
            pre_state = layers.array_read(array=state_array, i=counter)
            pre_score = layers.array_read(array=scores_array, i=counter)
            pre_ids_emb = layers.embedding(
                input=pre_ids,
                size=[char_num, word_vector_dim],
                dtype='float32')

            context = self.simple_attention(encoded_vector, encoded_proj,
                                            pre_state, decoder_size)

            # expand the recursive_sequence_lengths of pre_state 
            # to be the same with pre_score
            pre_state_expanded = layers.sequence_expand(pre_state, pre_score)
            context_expanded = layers.sequence_expand(context, pre_score)

            fc_1 = layers.fc(input=context_expanded,
                             size=decoder_size * 3,
                             bias_attr=False,
                             name="rnn_fc1")

            fc_2 = layers.fc(input=pre_ids_emb,
                             size=decoder_size * 3,
                             bias_attr=False,
                             name="rnn_fc2")

            decoder_inputs = fc_1 + fc_2
            current_state, _, _ = layers.gru_unit(
                input=decoder_inputs,
                hidden=pre_state_expanded,
                size=decoder_size * 3)
            current_state_with_lod = layers.lod_reset(
                x=current_state, y=pre_score)
            # use score to do beam search
            current_score = layers.fc(input=current_state_with_lod,
                                      size=char_num,
                                      bias_attr=True,
                                      act='softmax',
                                      name="rnn_out_fc")
            topk_scores, topk_indices = layers.topk(current_score, k=beam_size)

            new_ids = fluid.layers.concat([full_ids, topk_indices], axis=1)
            fluid.layers.assign(new_ids, full_ids)

            layers.increment(x=counter, value=1, in_place=True)

            # update the memories
            layers.array_write(current_state, array=state_array, i=counter)
            layers.array_write(topk_indices, array=ids_array, i=counter)
            layers.array_write(topk_scores, array=scores_array, i=counter)

            # update the break condition: 
            # up to the max length or all candidates of
            # source sentences have ended.
            length_cond = layers.less_than(x=counter, y=array_len)
            finish_cond = layers.logical_not(layers.is_empty(x=topk_indices))
            layers.logical_and(x=length_cond, y=finish_cond, out=cond)
        return full_ids
Example #17
    def infilling_decode(self):
        if self.task_type == "dialog":
            emb_num = 4
        else:
            emb_num = 3
        input_shapes = [[-1, self.max_seq_len, 1]] * emb_num + \
                       [[-1, self.max_seq_len, self.max_seq_len]]
        input_dtypes = ['int64'] * emb_num + ['float32']
        input_lod_levels = [0] * emb_num + [0]

        shapes = input_shapes + [[-1, self.max_seq_len, 1],
                                 [-1, self.max_seq_len, 1], [-1, 1], [-1],
                                 [-1, 1, self.max_seq_len], [-1, 1]]
        dtypes = input_dtypes + [
            'int64', 'int64', 'float32', 'int32', 'float32', 'int64'
        ]
        lod_levels = input_lod_levels + [2, 2, 2, 0, 0, 0]

        inputs = self.to_ternsor(shapes, dtypes, lod_levels)
        pyreader = fluid.io.DataLoader.from_generator(feed_list=inputs,
                                                      capacity=50,
                                                      iterable=False)

        emb_ids = {}
        for key, value in zip(self.emb_keys, inputs[:emb_num]):
            emb_ids[key] = value

        input_mask = inputs[emb_num]
        tgt_ids, tgt_pos, init_scores, parent_idx, tgt_input_mask, data_ids = inputs[
            -6:]

        ernie = ErnieModel(emb_ids=emb_ids,
                           input_mask=input_mask,
                           config=self.ernie_config,
                           use_fp16=self.use_fp16,
                           task_type=self.task_type,
                           decoding=True,
                           gather_idx=parent_idx)

        max_len = layers.fill_constant(shape=[1],
                                       dtype=tgt_ids.dtype,
                                       value=self.max_dec_len,
                                       force_cpu=True)
        step_idx = layers.fill_constant(shape=[1],
                                        dtype=tgt_ids.dtype,
                                        value=0,
                                        force_cpu=True)
        pos_idx = layers.fill_constant(shape=[1],
                                       dtype=tgt_ids.dtype,
                                       value=1,
                                       force_cpu=True)
        cond = layers.less_than(x=step_idx, y=max_len)
        while_op = layers.While(cond)

        ids = layers.array_write(layers.reshape(tgt_ids, (-1, 1)), step_idx)
        pos_biases = layers.array_write(layers.reshape(tgt_pos, (-1, 1)),
                                        step_idx)
        scores = layers.array_write(init_scores, step_idx)
        tgt_masks = layers.array_write(tgt_input_mask, step_idx)

        with while_op.block():
            pre_ids = layers.array_read(array=ids, i=step_idx)
            pre_ids = layers.reshape(pre_ids, (-1, 1, 1), inplace=True)
            pre_scores = layers.array_read(array=scores, i=step_idx)
            pos_bias = layers.array_read(array=pos_biases, i=step_idx)
            pos_bias = layers.gather(input=pos_bias, index=parent_idx)
            tmp_mask = layers.array_read(tgt_masks, i=step_idx)

            def gen_batch_like(value,
                               dtype="int64",
                               shape=[-1, 1, 1],
                               is_scalar=True):
                if is_scalar:
                    return layers.fill_constant_batch_size_like(
                        input=parent_idx,
                        value=value,
                        shape=shape,
                        dtype=dtype)
                else:
                    return layers.elementwise_mul(
                        x=layers.fill_constant_batch_size_like(
                            input=parent_idx,
                            value=1,
                            shape=shape,
                            dtype=dtype),
                        y=value,
                        axis=0)

            tmp_mask = layers.gather(input=tmp_mask, index=parent_idx)
            append_0_mask = gen_batch_like(0.0, dtype=tmp_mask.dtype)
            append_1_mask = gen_batch_like(1.0, dtype=tmp_mask.dtype)
            tmp_mask = layers.concat([tmp_mask, append_1_mask], axis=2)
            pre_mask = layers.concat([tmp_mask, append_0_mask], axis=2)
            cur_mask = layers.concat([tmp_mask, append_1_mask], axis=2)

            cur_ids = gen_batch_like(self.attn_id)
            pre_pos = gen_batch_like(step_idx, is_scalar=False)
            cur_pos = gen_batch_like(pos_idx, is_scalar=False)
            if self.continuous_position:
                pre_pos = pre_pos + pos_bias
                cur_pos = cur_pos + pos_bias

            dec_emb_ids = {
                "word_embedding": layers.concat([pre_ids, cur_ids], axis=1),
                "pos_embedding": layers.concat([pre_pos, cur_pos], axis=1)
            }
            if self.task_type == "dialog":
                role_ids = gen_batch_like(0)
                turn_ids = gen_batch_like(0)
                dec_emb_ids["role_embedding"] = layers.concat(
                    [role_ids, role_ids], axis=1)
                dec_emb_ids["turn_embedding"] = layers.concat(
                    [turn_ids, turn_ids], axis=1)
            else:
                sent_ids = gen_batch_like(self.tgt_type_id)
                dec_emb_ids["sent_embedding"] = layers.concat(
                    [sent_ids, sent_ids], axis=1)
            dec_mask = layers.concat([pre_mask, cur_mask], axis=1)

            dec_out = ernie.encode(dec_emb_ids,
                                   dec_mask,
                                   parent_idx,
                                   remove_query=True)
            fc_out = self.cal_logit(dec_out[:, 1:, :], None)
            topk_scores, topk_indices = layers.topk(
                input=layers.softmax(fc_out), k=self.beam_size)
            pre_lenpen = layers.pow(
                (5.0 + layers.cast(step_idx, pre_scores.dtype)) / 6.0,
                self.length_penalty)
            cur_lenpen = layers.pow(
                (5.0 + layers.cast(pos_idx, pre_scores.dtype)) / 6.0,
                self.length_penalty)
            accu_scores = layers.elementwise_add(x=layers.log(topk_scores),
                                                 y=pre_scores * pre_lenpen,
                                                 axis=0) / cur_lenpen
            topk_indices = layers.lod_reset(topk_indices, pre_ids)
            accu_scores = layers.lod_reset(accu_scores, pre_ids)
            selected_ids, selected_scores, gather_idx = layers.beam_search(
                pre_ids=pre_ids,
                pre_scores=pre_scores,
                ids=topk_indices,
                scores=accu_scores,
                beam_size=self.beam_size,
                end_id=self.eos_idx,
                return_parent_idx=True)

            layers.increment(x=step_idx, value=1.0, in_place=True)
            layers.increment(x=pos_idx, value=1.0, in_place=True)
            layers.array_write(selected_ids, i=step_idx, array=ids)
            layers.array_write(selected_scores, i=step_idx, array=scores)
            layers.array_write(tmp_mask, i=step_idx, array=tgt_masks)
            layers.array_write(pos_bias, i=step_idx, array=pos_biases)

            layers.assign(gather_idx, parent_idx)
            length_cond = layers.less_than(x=step_idx, y=max_len)
            finish_cond = layers.logical_not(layers.is_empty(x=selected_ids))
            layers.logical_and(x=length_cond, y=finish_cond, out=cond)

        finished_ids, finished_scores = layers.beam_search_decode(
            ids, scores, beam_size=self.beam_size, end_id=self.eos_idx)

        graph_vars = {
            "finished_ids": finished_ids,
            "finished_scores": finished_scores,
            "data_ids": data_ids
        }

        for k, v in graph_vars.items():
            v.persistable = True

        return pyreader, graph_vars
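
The pre_lenpen/cur_lenpen pair keeps beam scores length-normalized at every step: the stored score is always sum(log p) / lp(length), so each step un-normalizes by the previous penalty before adding the new log-probability and re-normalizes by the current one. In scalar form (a sketch):

def rescore(log_p_step, old_score, step, alpha):
    lp = lambda n: ((5.0 + n) / 6.0) ** alpha
    # old_score == sum_log_p / lp(step); preserve the invariant at step + 1
    return (log_p_step + old_score * lp(step)) / lp(step + 1)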
Example #18
    def inference(self, model, inputs, outputs):
        """
        Run inference.

        Args:
            inputs(dict): Its key is input name(str) and its value is a Variable.
            model(object): A generate model. Need to implement `_generation_network` and `_calc_logits`.

        Returns:
            dict(str:Variable): Its key is output name(str) and its value is a Variable.
        """
        # prepare while loop
        max_len = layers.fill_constant(
            shape=[1], dtype="int64", value=self.max_dec_len, force_cpu=True)
        min_len = layers.fill_constant(
            shape=[1], dtype="int64", value=self.min_dec_len, force_cpu=True)
        step_idx = layers.fill_constant(
            shape=[1], dtype="int64", value=0, force_cpu=True)

        ids = layers.array_write(layers.reshape(inputs["tgt_ids"], (-1, 1)), step_idx)
        pos_biases = layers.array_write(layers.reshape(inputs["tgt_pos"], (-1, 1)), step_idx)
        scores = layers.array_write(inputs["init_score"], step_idx)
        tgt_generation_mask = layers.array_write(inputs["tgt_generation_mask"], step_idx)
        parent_idx = inputs["parent_idx"]

        if self.decoding_strategy == "beam_search":
            beam_size = self.beam_size
        else:
            beam_size = 1

        eos_penalty = np.zeros(self.vocab_size, dtype="float32")
        eos_penalty[self.eos_id] = -1e9
        eos_penalty = layers.assign(eos_penalty)

        token_penalty = np.zeros(self.vocab_size, dtype="float32")
        token_penalty[self.unk_id] = -1e9
        if self.mask_id >= 0:
            token_penalty[self.mask_id] = -1e9
        token_penalty = layers.assign(token_penalty)

        # start while loop
        cond = layers.less_than(x=step_idx, y=max_len)
        while_op = layers.While(cond)
        with while_op.block():
            pre_ids = layers.array_read(array=ids, i=step_idx)
            pre_ids = layers.reshape(pre_ids, (-1, 1, 1), inplace=True)
            pre_scores = layers.array_read(array=scores, i=step_idx)
            pos_bias = layers.array_read(array=pos_biases, i=step_idx)
            pos_bias = layers.gather(input=pos_bias, index=parent_idx)

            tmp_tgt_generation_mask = layers.array_read(tgt_generation_mask, i=step_idx)
            dtype = tmp_tgt_generation_mask.dtype

            append_mask = layers.fill_constant_batch_size_like(
                    input=pre_ids,
                    value=1.0,
                    shape=[-1, 1, 1],
                    dtype=dtype)
            tmp_tgt_generation_mask = layers.concat([tmp_tgt_generation_mask, append_mask], axis=2)
            pre_mask = tmp_tgt_generation_mask = layers.gather(input=tmp_tgt_generation_mask, index=parent_idx)

            pre_sent = layers.fill_constant_batch_size_like(
                    input=pre_mask,
                    value=1,
                    shape=[-1, 1, 1],
                    dtype=pre_ids.dtype)

            if self.continuous_position:
                pre_pos = layers.elementwise_mul(
                    x=layers.fill_constant_batch_size_like(
                        input=pre_mask,
                        value=1,
                        shape=[-1, 1, 1],
                        dtype=pre_ids.dtype), y=step_idx, axis=0) + pos_bias
            else:
                pre_pos = layers.elementwise_mul(
                    x=layers.fill_constant_batch_size_like(
                        input=pre_mask,
                        value=1,
                        shape=[-1, 1, 1],
                        dtype=pre_ids.dtype), y=step_idx, axis=0)

            if self.use_role:
                pre_role = layers.fill_constant_batch_size_like(
                        input=pre_mask,
                        value=0,
                        shape=[-1, 1, 1],
                        dtype=pre_ids.dtype)
            else:
                pre_role = None

            dec_out, _ = model._generation_network(
                token_ids=pre_ids,
                type_ids=pre_sent,
                pos_ids=pre_pos,
                role_ids=pre_role,
                generation_mask=tmp_tgt_generation_mask,
                gather_idx=parent_idx)
            logits = model._calc_logits(dec_out)

            # ignore unk and mask token
            if self.ignore_unk:
                logits = layers.elementwise_add(logits, token_penalty, axis=1)

            # min dec length
            min_len_cond = layers.less_than(x=step_idx, y=min_len)
            def min_len_penalty():
                """Plus minimum length penalty."""
                return layers.elementwise_add(logits, eos_penalty, axis=1)
            def no_penalty():
                """No penalty."""
                return logits
            logits = layers.case([(min_len_cond, min_len_penalty)], default=no_penalty)

            # get probs
            probs = layers.softmax(logits / self.temperature)

            if self.decoding_strategy == "beam_search":
                topk_scores, topk_indices = layers.topk(
                    input=probs, k=beam_size)
            else:
                if self.decoding_strategy.startswith("sampling"):
                    sampling_ids = layers.sampling_id(probs, dtype="int")
                elif self.decoding_strategy.startswith("topk_sampling"):
                    topk_probs, _ = layers.topk(input=probs, k=self.topk)
                    ge_cond = layers.cast(
                        layers.greater_equal(
                            probs,
                            layers.unsqueeze(topk_probs[:, -1], [1])),
                        "float32")
                    old_probs = probs
                    probs = probs * ge_cond / layers.reduce_sum(topk_probs, dim=-1, keep_dim=True)
                    sampling_ids = layers.sampling_id(probs, dtype="int")
                    probs = old_probs
                else:
                    raise ValueError(self.decoding_strategy)

                sampling_scores = layers.one_hot(
                    layers.unsqueeze(sampling_ids, [1]), probs.shape[1]
                )
                sampling_scores = sampling_scores * probs - (1 - sampling_scores) * 1e3
                topk_scores, topk_indices = layers.topk(
                    input=sampling_scores, k=1)

            pre_len = layers.cast(step_idx, "float32")
            layers.increment(x=step_idx, value=1.0, in_place=True)
            cur_len = layers.cast(step_idx, "float32")

            # update scores
            if self.length_average:
                accu_scores = layers.elementwise_add(
                    x=layers.log(topk_scores), y=pre_scores * pre_len, axis=0) / cur_len
            elif self.length_penalty > 0:
                pre_lp = layers.pow((5 + pre_len) / 6, self.length_penalty)
                cur_lp = layers.pow((5 + cur_len) / 6, self.length_penalty)
                accu_scores = layers.elementwise_add(
                    x=layers.log(topk_scores), y=pre_scores * pre_lp, axis=0) / cur_lp
            else:
                accu_scores = layers.elementwise_add(
                    x=layers.log(topk_scores), y=pre_scores, axis=0)
            topk_indices = layers.lod_reset(topk_indices, pre_ids)
            accu_scores = layers.lod_reset(accu_scores, pre_ids)
            selected_ids, selected_scores, gather_idx = layers.beam_search(
                pre_ids=pre_ids,
                pre_scores=pre_scores,
                ids=topk_indices,
                scores=accu_scores,
                beam_size=beam_size,
                end_id=self.eos_id,
                return_parent_idx=True)

            layers.array_write(selected_ids, i=step_idx, array=ids)
            layers.array_write(selected_scores, i=step_idx, array=scores)
            layers.array_write(pre_mask, i=step_idx, array=tgt_generation_mask)
            layers.array_write(pos_bias, i=step_idx, array=pos_biases)

            layers.assign(gather_idx, parent_idx)

            length_cond = layers.less_than(x=step_idx, y=max_len)
            finish_cond = layers.logical_not(layers.is_empty(x=selected_ids))
            layers.logical_and(x=length_cond, y=finish_cond, out=cond)

        finished_ids, finished_scores = layers.beam_search_decode(
            ids, scores, beam_size=beam_size, end_id=self.eos_id)

        predictions = {
            "finished_ids": finished_ids,
            "finished_scores": finished_scores,
            "token_ids": inputs["token_ids"],
            "data_id": inputs["data_id"]
        }
        return predictions
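
The topk_sampling branch zeroes every probability below the k-th largest and renormalizes by the retained mass before sampling. The filtering step in numpy (a sketch):

import numpy as np

probs = np.array([0.5, 0.3, 0.15, 0.05])
k = 2
kth = np.sort(probs)[-k]                  # k-th largest probability
filtered = np.where(probs >= kth, probs, 0.0)
filtered /= filtered.sum()                # renormalize over the kept mass
print(filtered)                           # [0.625 0.375 0.    0.   ]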
Example #19
    def cond_func(step_idx, selected_ids, selected_scores, gather_idx,
                  caches, trg_src_attn_bias):
        length_cond = layers.less_than(x=step_idx, y=max_len)
        finish_cond = layers.logical_not(layers.is_empty(x=selected_ids))
        return layers.logical_and(x=length_cond, y=finish_cond)
Example #20
    def beam_search(enc_output, enc_bias, source_length):
        """
            beam_search
        """
        max_len = layers.fill_constant(
            shape=[1], dtype='int64', value=max_out_len)
        step_idx = layers.fill_constant(
            shape=[1], dtype='int64', value=0)
        cond = layers.less_than(x=step_idx, y=max_len)
        while_op = layers.While(cond)

        caches_batch_size = batch_size * beam_size
        init_score = np.zeros([1, beam_size]).astype('float32')
        init_score[:, 1:] = -INF
        initial_log_probs = layers.assign(init_score)

        alive_log_probs = layers.expand(initial_log_probs, [batch_size, 1])
        # alive seq [batch_size, beam_size, 1]
        initial_ids = layers.zeros([batch_size, 1, 1], 'float32')
        alive_seq = layers.expand(initial_ids, [1, beam_size, 1]) 
        alive_seq = layers.cast(alive_seq, 'int64')

        enc_output = layers.unsqueeze(enc_output, axes=[1])
        enc_output = layers.expand(enc_output, [1, beam_size, 1, 1])
        enc_output = layers.reshape(enc_output, [caches_batch_size, -1, d_model])

        tgt_src_attn_bias = layers.unsqueeze(enc_bias, axes=[1])
        tgt_src_attn_bias = layers.expand(tgt_src_attn_bias, [1, beam_size, n_head, 1, 1]) 
        enc_bias_shape = layers.shape(tgt_src_attn_bias)
        tgt_src_attn_bias = layers.reshape(tgt_src_attn_bias, [-1, enc_bias_shape[2], 
                enc_bias_shape[3], enc_bias_shape[4]])
            
        beam_search = BeamSearch(beam_size, batch_size, decode_alpha, trg_vocab_size, d_model)

        caches = [{
            "k": layers.fill_constant(
                shape=[caches_batch_size, 0, d_model],
                dtype=enc_output.dtype,
                value=0),
            "v": layers.fill_constant(
                shape=[caches_batch_size, 0, d_model],
                dtype=enc_output.dtype,
                value=0)
        } for i in range(n_layer)]
        
        finished_seq = layers.zeros_like(alive_seq)
        finished_scores = layers.fill_constant([batch_size, beam_size], 
                                                dtype='float32', value=-INF)
        finished_flags = layers.fill_constant([batch_size, beam_size], 
                                                dtype='float32', value=0)
        
        with while_op.block():
            pos = layers.fill_constant([caches_batch_size, 1, 1], dtype='int64', value=1)
            pos = layers.elementwise_mul(pos, step_idx, axis=0)

            alive_seq_1 = layers.reshape(alive_seq, [caches_batch_size, -1])
            alive_seq_2 = alive_seq_1[:, -1:] 
            alive_seq_2 = layers.unsqueeze(alive_seq_2, axes=[1])
 
            logits = wrap_decoder(
                trg_vocab_size, max_in_len, n_layer, n_head, d_key,
                d_value, d_model, d_inner_hid, prepostprocess_dropout,
                attention_dropout, relu_dropout, preprocess_cmd,
                postprocess_cmd, weight_sharing, embedding_sharing,
                dec_inputs=(alive_seq_2, alive_seq_2, pos, None, tgt_src_attn_bias),
                enc_output=enc_output, caches=caches, is_train=False, params_type=params_type)

            alive_seq_2, alive_log_probs_2, finished_seq_2, finished_scores_2, finished_flags_2, caches_2 = \
                    beam_search.inner_func(step_idx, logits, alive_seq_1, alive_log_probs, finished_seq, 
                                           finished_scores, finished_flags, caches, enc_output, 
                                           tgt_src_attn_bias)
            
            layers.increment(x=step_idx, value=1.0, in_place=True)
            finish_cond = beam_search.is_finished(step_idx, source_length, alive_log_probs_2, 
                                                  finished_scores_2, finished_flags_2) 

            layers.assign(alive_seq_2, alive_seq)
            layers.assign(alive_log_probs_2, alive_log_probs)
            layers.assign(finished_seq_2, finished_seq)
            layers.assign(finished_scores_2, finished_scores)
            layers.assign(finished_flags_2, finished_flags)

            for i in range(len(caches_2)):
                layers.assign(caches_2[i]["k"], caches[i]["k"])
                layers.assign(caches_2[i]["v"], caches[i]["v"])

            layers.logical_and(x=cond, y=finish_cond, out=cond)

        finished_flags = layers.reduce_sum(finished_flags, dim=1, keep_dim=True) / beam_size
        finished_flags = layers.cast(finished_flags, 'bool')
        mask = layers.cast(layers.reduce_any(input=finished_flags, dim=1, keep_dim=True), 'float32')
        mask = layers.expand(mask, [1, beam_size])

        mask2 = 1.0 - mask
        finished_seq = layers.cast(finished_seq, 'float32')
        alive_seq = layers.cast(alive_seq, 'float32')

        finished_seq = layers.elementwise_mul(finished_seq, mask, axis=0) + \
                        layers.elementwise_mul(alive_seq, mask2, axis = 0)
        finished_seq = layers.cast(finished_seq, 'int32')
        finished_scores = layers.elementwise_mul(finished_scores, mask, axis=0) + \
                            layers.elementwise_mul(alive_log_probs, mask2)
        finished_seq.persistable = True
        finished_scores.persistable = True

        return finished_seq, finished_scores
Example #21
    def beam_search():
        max_len = layers.fill_constant(shape=[1],
                                       dtype=start_tokens.dtype,
                                       value=max_out_len,
                                       force_cpu=True)
        step_idx = layers.fill_constant(shape=[1],
                                        dtype=start_tokens.dtype,
                                        value=0,
                                        force_cpu=True)
        cond = layers.less_than(x=step_idx,
                                y=max_len)  # default force_cpu=True
        while_op = layers.While(cond)
        # array states will be stored for each step.
        ids = layers.array_write(layers.reshape(start_tokens, (-1, 1)),
                                 step_idx)
        scores = layers.array_write(init_scores, step_idx)
        # cell states will be overwritten at each step.
        # caches contains states of history steps in decoder self-attention
        # and static encoder output projections in encoder-decoder attention
        # to reduce redundant computation.
        caches = [
            {
                "k":  # for self attention
                layers.fill_constant_batch_size_like(
                    input=start_tokens,
                    shape=[-1, n_head, 0, d_key],
                    dtype=enc_output.dtype,
                    value=0),
                "v":  # for self attention
                layers.fill_constant_batch_size_like(
                    input=start_tokens,
                    shape=[-1, n_head, 0, d_value],
                    dtype=enc_output.dtype,
                    value=0),
                "static_k":  # for encoder-decoder attention
                layers.create_tensor(dtype=enc_output.dtype),
                "static_v":  # for encoder-decoder attention
                layers.create_tensor(dtype=enc_output.dtype)
            } for i in range(n_layer)
        ]

        with while_op.block():
            pre_ids = layers.array_read(array=ids, i=step_idx)
            # Since beam_search_op doesn't enforce pre_ids' shape, we can do an
            # inplace reshape here, which actually changes the shape of pre_ids.
            pre_ids = layers.reshape(pre_ids, (-1, 1, 1), inplace=True)
            pre_scores = layers.array_read(array=scores, i=step_idx)
            # gather cell states corresponding to selected parent
            pre_src_attn_bias = layers.gather(trg_src_attn_bias,
                                              index=parent_idx)
            pre_pos = layers.elementwise_mul(
                x=layers.fill_constant_batch_size_like(
                    input=pre_src_attn_bias,  # can't use lod tensor here
                    value=1,
                    shape=[-1, 1, 1],
                    dtype=pre_ids.dtype),
                y=step_idx,
                axis=0)
            logits = wrap_decoder(trg_vocab_size,
                                  max_in_len,
                                  n_layer,
                                  n_head,
                                  d_key,
                                  d_value,
                                  d_model,
                                  d_inner_hid,
                                  prepostprocess_dropout,
                                  attention_dropout,
                                  relu_dropout,
                                  preprocess_cmd,
                                  postprocess_cmd,
                                  weight_sharing,
                                  dec_inputs=(pre_ids, pre_pos, None,
                                              pre_src_attn_bias),
                                  enc_output=enc_output,
                                  caches=caches,
                                  gather_idx=parent_idx,
                                  bos_idx=bos_idx)
            # intra-beam topK
            topk_scores, topk_indices = layers.topk(
                input=layers.softmax(logits), k=beam_size)
            accu_scores = layers.elementwise_add(x=layers.log(topk_scores),
                                                 y=pre_scores,
                                                 axis=0)
            # beam_search op uses lod to differentiate branches.
            accu_scores = layers.lod_reset(accu_scores, pre_ids)
            # topK reduction across beams, also contain special handle of
            # end beams and end sentences(batch reduction)
            selected_ids, selected_scores, gather_idx = layers.beam_search(
                pre_ids=pre_ids,
                pre_scores=pre_scores,
                ids=topk_indices,
                scores=accu_scores,
                beam_size=beam_size,
                end_id=eos_idx,
                return_parent_idx=True)
            layers.increment(x=step_idx, value=1.0, in_place=True)
            # cell states(caches) have been updated in wrap_decoder,
            # only need to update beam search states here.
            layers.array_write(selected_ids, i=step_idx, array=ids)
            layers.array_write(selected_scores, i=step_idx, array=scores)
            layers.assign(gather_idx, parent_idx)
            layers.assign(pre_src_attn_bias, trg_src_attn_bias)
            length_cond = layers.less_than(x=step_idx, y=max_len)
            finish_cond = layers.logical_not(layers.is_empty(x=selected_ids))
            layers.logical_and(x=length_cond, y=finish_cond, out=cond)

        finished_ids, finished_scores = layers.beam_search_decode(
            ids, scores, beam_size=beam_size, end_id=eos_idx)
        return finished_ids, finished_scores
Example #22
def _run_paddle_logical_and(x, y):
    x = cast_bool_if_necessary(x)
    y = cast_bool_if_necessary(y)
    return logical_and(x, y)
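
logical_and requires boolean operands, hence the casts. The body of cast_bool_if_necessary is not shown in this example; a hedged reconstruction of what it presumably does:

import paddle

def cast_bool_if_necessary(x):
    # hypothetical sketch: cast non-bool tensors to bool before logical ops
    if x.dtype != paddle.bool:
        x = paddle.cast(x, "bool")
    return x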
Example #23
def decode_with_grammar(decoder, inits, decode_vocab, max_step_num, **kwargs):
    """A modification of paddle.fluid.layers.dynamic_decode(...).
    Dynamic decoding performs :code:`decoder.step()` repeatedly until the returned
    Tensor indicating the finished status contains all True values or the number of
    decoding steps reaches :attr:`max_step_num`.
    :code:`decoder.initialize()` would be called once before the decoding loop.
    If the `decoder` has implemented `finalize` method, :code:`decoder.finalize()`
    would be called once after the decoding loop.

    Args:
        decoder(Decoder): An instance of `Decoder`.
        inits(tuple): Argument passed to `decoder.initialize`. 
        decode_vocab(DecoderDynamicVocab): namedtuple(table table_len column column_len value value_len)
        max_step_num(int): The maximum number of steps.
        **kwargs: Additional keyword arguments. Arguments passed to `decoder.step`. 

    Returns:
        tuple: A tuple( :code:`(final_outputs, final_states)` ) including the final \
            outputs and states, both Tensor or nested structures of Tensor. \
            `final_outputs` has the same structure and data types as \
            :code:`decoder.output_dtype` , and each Tensor in `final_outputs` \
            is the stacked output of all decoding steps, which might be revised \
            by :code:`decoder.finalize` . `final_states` is the last-time-step \
            counterpart of the initial states returned by :code:`decoder.initialize` , \
            thus it has the same structure, with tensors of the same shapes \
            and data types.
    """
    step_cnt = tensor.fill_constant(shape=[1], dtype="int64", value=1)
    max_step_num_tensor = tensor.fill_constant(shape=[1],
                                               dtype="int64",
                                               value=max_step_num - 2)

    # shape = [batch_size, beam_size, ...]
    initial_inputs, initial_states, initial_finished = decoder.initialize(
        inits, decode_vocab)
    global_inputs, global_states, global_finished = (initial_inputs,
                                                     initial_states,
                                                     initial_finished)
    inputs = initial_inputs
    states = initial_states

    # save the decoding outputs
    outputs_arr_data = tensor.fill_constant_batch_size_like(
        inputs.input,
        shape=[-1, decoder.beam_size, max_step_num],
        dtype=decoder.output_dtype.predicted_ids,
        value=0)
    outputs_arr_pos = tensor.fill_constant_batch_size_like(
        inputs.input, shape=[-1, decoder.beam_size, 1], dtype='int64', value=0)
    outputs_array = data_structure.ArrayData(
        decoder.merge_batch_beams(outputs_arr_data),
        decoder.merge_batch_beams(outputs_arr_pos))

    sequence_lengths = tensor.cast(tensor.zeros_like(initial_finished),
                                   "int64")

    # constraint data structures for grammar-based decoding
    grammar_stack_dat = tensor.fill_constant_batch_size_like(
        inputs.input,
        shape=[-1, decoder.beam_size, max_step_num * STACK_EXPAND_TIMES],
        dtype='int64',
        value=0)
    grammar_stack_pos = tensor.fill_constant_batch_size_like(
        inputs.input, shape=[-1, decoder.beam_size, 1], dtype='int64', value=0)
    grammar_stack = data_structure.StackData(
        decoder.merge_batch_beams(grammar_stack_dat),
        decoder.merge_batch_beams(grammar_stack_pos))

    ############   decode in a loop until every sequence is finished   ############
    #   finished is determined from global_finished/next_finished && max_step_num
    cond = layers.logical_not(layers.reduce_all(initial_finished))
    while_op = layers.While(cond)
    with while_op.block():
        # step_outputs --> OutputWrapper
        # next_states  --> StateWrapper
        # next_inputs  --> DecoderInputsWrapper
        step_outputs, next_states, next_inputs = decoder.step(
            inputs, states, **kwargs)
        predicted_ids = step_outputs.predicted_ids
        _save_predict_output(outputs_array, predicted_ids,
                             next_states.finished)

        pred_gmr_type = decoder.grammar_type(predicted_ids)
        cond_type_leaf = layers.equal(pred_gmr_type, decoder.GMR_TYPE.LEAF)
        cond_type_midd = layers.equal(pred_gmr_type, decoder.GMR_TYPE.MID)

        _process_type_leaf(cond_type_leaf, decoder, grammar_stack, next_inputs,
                           next_states.finished)
        _process_type_midd(cond_type_midd, decoder, grammar_stack, next_inputs,
                           predicted_ids)

        ##next_sequence_lengths = layers.elementwise_add(sequence_lengths,
        ##                        tensor.cast(layers.logical_not(global_finished), sequence_lengths.dtype))

        _check_finished(decoder, next_inputs, next_states.finished,
                        outputs_array)

        layers.utils.map_structure(tensor.assign, next_inputs, global_inputs)
        layers.utils.map_structure(tensor.assign, next_states, global_states)
        tensor.assign(next_states.finished, global_finished)
        ##tensor.assign(next_sequence_lengths, sequence_lengths)

        # update the loop condition
        layers.increment(x=step_cnt, value=1.0, in_place=True)
        layers.logical_and(
            layers.logical_not(layers.reduce_all(next_states.finished)),
            layers.less_equal(step_cnt, max_step_num_tensor), cond)

    final_outputs = outputs_array.data
    final_states = global_states

    final_outputs, final_states = decoder.finalize(final_outputs,
                                                   global_states,
                                                   sequence_lengths)

    return final_outputs, final_states
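The loop condition rebuilt at the end of each iteration is the conjunction `not all(finished) and step_cnt <= max_step_num - 2`. A plain-Python skeleton of the same control flow (the toy `step` callable and all values are assumptions for illustration):

import numpy as np

def dynamic_decode_sketch(step, finished, max_step_num):
    # keep stepping until every sequence is finished or the step budget is spent
    step_cnt = 1
    while not finished.all() and step_cnt <= max_step_num - 2:
        finished = np.logical_or(finished, step(step_cnt))
        step_cnt += 1
    return step_cnt

# toy: sequence 0 finishes at step 3, sequence 1 at step 5
last = dynamic_decode_sketch(lambda t: np.array([t >= 3, t >= 5]),
                             np.array([False, False]), max_step_num=10)
print(last)  # 6: the loop exited after both toy sequences finished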
Example #24
        def beam_search():
            """Beam search function"""

            max_len = layers.fill_constant(shape=[1],
                                           dtype=start_tokens.dtype,
                                           value=self.max_out_len,
                                           force_cpu=True)
            min_len = layers.fill_constant(shape=[1],
                                           dtype=start_tokens.dtype,
                                           value=self.min_out_len)
            neg_inf = layers.fill_constant(shape=[1],
                                           dtype='float32',
                                           value=-INF)
            step_idx = layers.fill_constant(shape=[1],
                                            dtype=start_tokens.dtype,
                                            value=0,
                                            force_cpu=True)
            step_next_idx = layers.fill_constant(shape=[1],
                                                 dtype=start_tokens.dtype,
                                                 value=1,
                                                 force_cpu=True)
            cond = layers.less_than(x=step_idx,
                                    y=max_len)  # default force_cpu=True
            while_op = layers.While(cond)
            # array states will be stored for each step.
            ids = layers.array_write(layers.reshape(start_tokens, (-1, 1)),
                                     step_idx)
            scores = layers.array_write(init_scores, step_idx)
            # cell states will be overwritten at each step.
            # caches contains states of history steps in decoder self-attention
            # and static encoder output projections in encoder-decoder attention
            # to reduce redundant computation.
            caches = [
                {
                    "k":  # for self attention
                        layers.fill_constant_batch_size_like(
                            input=start_tokens,
                            shape=[-1, self._n_head, 0, self._emb_size // self._n_head],
                            dtype=enc_words_output.dtype,
                            value=0),
                    "v":  # for self attention
                        layers.fill_constant_batch_size_like(
                            input=start_tokens,
                            shape=[-1, self._n_head, 0, self._emb_size // self._n_head],
                            dtype=enc_words_output.dtype,
                            value=0),
                    "static_k_word":  # for encoder-decoder attention
                        layers.create_tensor(dtype=enc_words_output.dtype),
                    "static_v_word":  # for encoder-decoder attention
                        layers.create_tensor(dtype=enc_words_output.dtype),
                    "static_k_sent":  # for encoder-decoder attention
                        layers.create_tensor(dtype=enc_sents_output.dtype),
                    "static_v_sent":  # for encoder-decoder attention
                        layers.create_tensor(dtype=enc_sents_output.dtype)
                } for i in range(self._dec_n_layer)
            ]

            trigram_blocking = TrigramBlocking(start_tokens,
                                               self.tokenizer,
                                               use_fp16=self._use_fp16,
                                               beam_size=self.beam_size)

            with while_op.block():
                pre_ids = layers.array_read(array=ids, i=step_idx)
                pre_ids = layers.reshape(pre_ids, (-1, 1, 1), inplace=True)
                # Since beam_search_op doesn't enforce pre_ids' shape, the
                # in-place reshape above actually changes the shape of pre_ids.
                pre_scores = layers.array_read(array=scores, i=step_idx)
                # gather cell states corresponding to selected parent
                pre_src_words_attn_bias = layers.gather(
                    tgt_src_words_attn_bias, index=parent_idx)
                pre_src_sents_attn_bias = layers.gather(
                    tgt_src_sents_attn_bias, index=parent_idx)
                pre_graph_attn_bias = layers.gather(graph_attn_bias,
                                                    index=parent_idx)
                pre_pos = layers.elementwise_mul(
                    x=layers.fill_constant_batch_size_like(
                        input=pre_src_sents_attn_bias,  # can't use a LoD tensor here
                        value=1,
                        shape=[-1, 1, 1],
                        dtype=pre_ids.dtype),
                    y=step_idx,
                    axis=0)

                logits = self.decode(
                    dec_input=(pre_ids, pre_pos, None, pre_src_words_attn_bias,
                               pre_src_sents_attn_bias, pre_graph_attn_bias),
                    enc_words_output=enc_words_output,
                    enc_sents_output=enc_sents_output,
                    caches=caches,
                    gather_idx=parent_idx)

                # prevent generating the end token while length is less than min_out_len
                eos_index = layers.fill_constant(
                    shape=[layers.shape(logits)[0]],
                    dtype='int64',
                    value=self.eos_idx)
                eos_index = fluid.one_hot(eos_index, depth=self.voc_size)
                less_cond = layers.cast(layers.less_than(x=step_idx,
                                                         y=min_len),
                                        dtype='float32')
                less_val = layers.elementwise_mul(less_cond, neg_inf)
                eos_val = layers.elementwise_mul(eos_index, less_val, axis=0)
                revised_logits = layers.elementwise_add(logits,
                                                        eos_val,
                                                        axis=0)

                # top-k reduction across beams; also contains special handling of
                # finished beams and finished sentences (batch reduction)
                topk_scores, topk_indices = layers.topk(
                    input=layers.softmax(revised_logits), k=self.beam_size)

                # Roll back the length penalty on the previous scores.
                # The previous scores were already length-penalized, so the penalty
                # must be rolled back before applying this timestep's penalty.
                # Because of this, the penalized score is stored in `scores` while
                # the un-penalized score is used in the calculation.
                # -> safe for step_idx == 0 (initialization), since the previous score == 0
                pre_timestep_length_penalty = fluid.layers.pow(
                    ((5.0 + fluid.layers.cast(step_idx, pre_scores.dtype)) /
                     6.0), self.len_penalty)
                pre_scores_wo_len_penalty = fluid.layers.elementwise_mul(
                    pre_scores, pre_timestep_length_penalty)

                # calc trigram-blocking delta scores for current alive sequence
                if self.block_trigram:
                    trigram_blocking.update_seq(pre_ids, parent_idx)
                    trigram_blocking.expand_cand_seq(topk_indices)
                    fluid.layers.py_func(
                        func=trigram_blocking.blocking_forward,
                        x=[
                            trigram_blocking.cand_seq,
                            trigram_blocking.id2is_full_token
                        ],
                        out=trigram_blocking.delta_score_out,
                        backward_func=None)
                    layers.Print(trigram_blocking.delta_score_out,
                                 summarize=100,
                                 message="trigram_blocking.delta_score_out")
                    pre_scores_wo_len_penalty = fluid.layers.elementwise_add(
                        x=trigram_blocking.delta_score_out,
                        y=pre_scores_wo_len_penalty,
                        axis=0)
                # => [N, topk]

                accu_scores = layers.elementwise_add(
                    x=layers.log(topk_scores),
                    y=pre_scores_wo_len_penalty,
                    axis=0)

                cur_timestep_length_penalty = layers.pow(
                    ((5.0 + layers.cast(step_next_idx, accu_scores.dtype)) /
                     6.0), self.len_penalty)
                curr_scores = layers.elementwise_div(
                    accu_scores, cur_timestep_length_penalty)

                # beam_search op uses lod to differentiate branches.
                curr_scores = layers.lod_reset(curr_scores, pre_ids)
                topk_indices = layers.lod_reset(topk_indices, pre_ids)
                selected_ids, selected_scores, gather_idx = layers.beam_search(
                    pre_ids=pre_ids,
                    pre_scores=pre_scores,
                    ids=topk_indices,
                    scores=curr_scores,
                    beam_size=self.beam_size,
                    end_id=self.eos_idx,
                    return_parent_idx=True)

                layers.increment(x=step_idx, value=1.0, in_place=True)
                layers.increment(x=step_next_idx, value=1.0, in_place=True)
                # cell states (caches) have been updated inside the decoder;
                # only the beam search states need updating here.
                layers.array_write(selected_ids, i=step_idx, array=ids)
                layers.array_write(selected_scores, i=step_idx, array=scores)
                layers.assign(gather_idx, parent_idx)
                layers.assign(pre_src_words_attn_bias, tgt_src_words_attn_bias)
                layers.assign(pre_src_sents_attn_bias, tgt_src_sents_attn_bias)
                layers.assign(pre_graph_attn_bias, graph_attn_bias)

                length_cond = layers.less_than(x=step_idx, y=max_len)
                finish_cond = layers.logical_not(
                    layers.is_empty(x=selected_ids))
                layers.logical_and(x=length_cond, y=finish_cond, out=cond)

            finished_ids, finished_scores = layers.beam_search_decode(
                ids, scores, beam_size=self.beam_size, end_id=self.eos_idx)

            return finished_ids, finished_scores
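The roll-back above hinges on the GNMT length penalty lp(t) = ((5 + t) / 6) ** alpha: `scores` holds penalized values, so multiplying by the penalty they were divided by recovers the raw accumulated log-probability before the current step's penalty is applied. A NumPy check of that round trip (alpha and the score are made-up values):

import numpy as np

alpha = 0.6
def length_penalty(t):
    # GNMT-style penalty, matching the pow((5 + t) / 6, alpha) calls above
    return ((5.0 + t) / 6.0) ** alpha

raw_log_prob = -2.3                             # un-penalized score at step t
t = 4
stored = raw_log_prob / length_penalty(t)       # what `scores` actually holds
rolled_back = stored * length_penalty(t)        # pre_scores_wo_len_penalty
assert np.isclose(rolled_back, raw_log_prob)    # the round trip is exact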
Example #25
    def _build_decoder(self,
                       enc_last_hidden,
                       enc_last_cell,
                       mode='train',
                       beam_size=10):
        softmax_weight = layers.create_parameter(
            [self.hidden_size, self.tar_vocab_size], dtype="float32",
            name="softmax_weight",
            default_initializer=fluid.initializer.UniformInitializer(
                low=-self.init_scale, high=self.init_scale))
        if mode == 'train':
            dec_output, dec_last_hidden, dec_last_cell = basic_lstm(
                self.tar_emb, enc_last_hidden, enc_last_cell,
                self.hidden_size, num_layers=self.num_layers,
                batch_first=self.batch_first, dropout_prob=self.dropout,
                param_attr=ParamAttr(
                    initializer=fluid.initializer.UniformInitializer(
                        low=-self.init_scale, high=self.init_scale)),
                bias_attr=ParamAttr(
                    initializer=fluid.initializer.Constant(0.0)))

            dec_output = layers.matmul(dec_output, softmax_weight)

            return dec_output
        elif mode == 'beam_search' or mode == 'greedy_search':
            dec_unit_list = []
            name = 'basic_lstm'
            for i in range(self.num_layers):
                new_name = name + "_layers_" + str(i)
                dec_unit_list.append(
                    BasicLSTMUnit(new_name, self.hidden_size, dtype='float32'))

            def decoder_step(current_in, pre_hidden_array, pre_cell_array):
                new_hidden_array = []
                new_cell_array = []

                step_in = current_in
                for i in range(self.num_layers):
                    pre_hidden = pre_hidden_array[i]
                    pre_cell = pre_cell_array[i]

                    new_hidden, new_cell = dec_unit_list[i](step_in,
                                                            pre_hidden,
                                                            pre_cell)

                    new_hidden_array.append(new_hidden)
                    new_cell_array.append(new_cell)

                    step_in = new_hidden

                return step_in, new_hidden_array, new_cell_array

            if mode == 'beam_search':
                max_src_seq_len = layers.shape(self.src)[1]
                max_length = max_src_seq_len * 2
                #max_length = layers.fill_constant( [1], dtype='int32', value = 10)
                pre_ids = layers.fill_constant([1, 1], dtype='int64', value=1)
                full_ids = layers.fill_constant([1, 1], dtype='int64', value=1)

                score = layers.fill_constant([1], dtype='float32', value=0.0)

                #eos_ids = layers.fill_constant( [1, 1], dtype='int64', value=2)

                pre_hidden_array = []
                pre_cell_array = []
                pre_feed = layers.fill_constant([beam_size, self.hidden_size],
                                                dtype='float32',
                                                value=0)
                for i in range(self.num_layers):
                    pre_hidden_array.append(
                        layers.expand(enc_last_hidden[i], [beam_size, 1]))
                    pre_cell_array.append(
                        layers.expand(enc_last_cell[i], [beam_size, 1]))

                eos_ids = layers.fill_constant([beam_size],
                                               dtype='int64',
                                               value=2)
                init_score = np.zeros((beam_size)).astype('float32')
                init_score[1:] = -INF
                pre_score = layers.assign(init_score)
                #pre_score = layers.fill_constant( [1,], dtype='float32', value= 0.0)
                tokens = layers.fill_constant([beam_size, 1],
                                              dtype='int64',
                                              value=1)

                enc_memory = layers.expand(self.enc_output, [beam_size, 1, 1])

                pre_tokens = layers.fill_constant([beam_size, 1],
                                                  dtype='int64',
                                                  value=1)

                finished_seq = layers.fill_constant([beam_size, 1],
                                                    dtype='int64',
                                                    value=0)
                finished_scores = layers.fill_constant([beam_size],
                                                       dtype='float32',
                                                       value=-INF)
                finished_flag = layers.fill_constant([beam_size],
                                                     dtype='float32',
                                                     value=0.0)

                step_idx = layers.fill_constant(shape=[1],
                                                dtype='int32',
                                                value=0)
                cond = layers.less_than(x=step_idx,
                                        y=max_length)  # default force_cpu=True

                parent_idx = layers.fill_constant([1], dtype='int32', value=0)
                while_op = layers.While(cond)

                def compute_topk_scores_and_seq(sequences,
                                                scores,
                                                scores_to_gather,
                                                flags,
                                                beam_size,
                                                select_beam=None,
                                                generate_id=None):
                    scores = layers.reshape(scores, shape=[1, -1])
                    _, topk_indices = layers.topk(scores, k=beam_size)

                    topk_indices = layers.reshape(topk_indices, shape=[-1])

                    # gather results

                    top_seq = layers.gather(sequences, topk_indices)
                    topk_flags = layers.gather(flags, topk_indices)
                    topk_gather_scores = layers.gather(scores_to_gather,
                                                       topk_indices)

                    if select_beam:
                        topk_beam = layers.gather(select_beam, topk_indices)
                    else:
                        topk_beam = select_beam

                    if generate_id:
                        topk_id = layers.gather(generate_id, topk_indices)
                    else:
                        topk_id = generate_id
                    return top_seq, topk_gather_scores, topk_flags, topk_beam, topk_id

                def grow_alive(curr_seq, curr_scores, curr_log_probs,
                               curr_finished, select_beam, generate_id):
                    curr_scores += curr_finished * -INF
                    return compute_topk_scores_and_seq(curr_seq,
                                                       curr_scores,
                                                       curr_log_probs,
                                                       curr_finished,
                                                       beam_size,
                                                       select_beam,
                                                       generate_id=generate_id)

                def grow_finished(finished_seq, finished_scores, finished_flag,
                                  curr_seq, curr_scores, curr_finished):
                    finished_seq = layers.concat([
                        finished_seq,
                        layers.fill_constant([beam_size, 1],
                                             dtype='int64',
                                             value=1)
                    ], axis=1)
                    curr_scores += (1.0 - curr_finished) * -INF
                    #layers.Print( curr_scores, message="curr scores")
                    curr_finished_seq = layers.concat([finished_seq, curr_seq],
                                                      axis=0)
                    curr_finished_scores = layers.concat(
                        [finished_scores, curr_scores], axis=0)
                    curr_finished_flags = layers.concat(
                        [finished_flag, curr_finished], axis=0)

                    return compute_topk_scores_and_seq(curr_finished_seq,
                                                       curr_finished_scores,
                                                       curr_finished_scores,
                                                       curr_finished_flags,
                                                       beam_size)

                def is_finished(alive_log_prob, finished_scores,
                                finished_in_finished):

                    max_out_len = 200
                    max_length_penalty = layers.pow(
                        layers.fill_constant([1],
                                             dtype='float32',
                                             value=((5.0 + max_out_len) /
                                                    6.0)), alpha)

                    lower_bound_alive_score = layers.slice(
                        alive_log_prob, starts=[0], ends=[1],
                        axes=[0]) / max_length_penalty

                    lowest_score_of_finished_in_finished = finished_scores * finished_in_finished
                    lowest_score_of_finished_in_finished += (
                        1.0 - finished_in_finished) * -INF
                    lowest_score_of_finished_in_finished = layers.reduce_min(
                        lowest_score_of_finished_in_finished)

                    met = layers.less_than(
                        lower_bound_alive_score,
                        lowest_score_of_finished_in_finished)
                    met = layers.cast(met, 'float32')
                    bound_is_met = layers.reduce_sum(met)  # computed but not used in finish_cond

                    finished_eos_num = layers.reduce_sum(finished_in_finished)

                    finish_cond = layers.less_than(
                        finished_eos_num,
                        layers.fill_constant([1],
                                             dtype='float32',
                                             value=beam_size))

                    return finish_cond

                def grow_top_k(step_idx, alive_seq, alive_log_prob,
                               parent_idx):
                    pre_ids = alive_seq

                    dec_step_emb = layers.embedding(
                        input=pre_ids,
                        size=[self.tar_vocab_size, self.hidden_size],
                        dtype='float32',
                        is_sparse=False,
                        param_attr=fluid.ParamAttr(
                            name='target_embedding',
                            initializer=fluid.initializer.UniformInitializer(
                                low=-self.init_scale, high=self.init_scale)))

                    dec_att_out, new_hidden_array, new_cell_array = decoder_step(
                        dec_step_emb, pre_hidden_array, pre_cell_array)

                    projection = layers.matmul(dec_att_out, softmax_weight)

                    logits = layers.softmax(projection)
                    current_log = layers.elementwise_add(x=layers.log(logits),
                                                         y=alive_log_prob,
                                                         axis=0)
                    base_1 = layers.cast(step_idx, 'float32') + 6.0
                    base_1 /= 6.0
                    length_penalty = layers.pow(base_1, alpha)

                    # len_pen duplicates length_penalty above and is unused
                    len_pen = layers.pow(
                        ((5. + layers.cast(step_idx + 1, 'float32')) / 6.),
                        alpha)

                    current_log = layers.reshape(current_log, shape=[1, -1])

                    current_log = current_log / length_penalty
                    topk_scores, topk_indices = layers.topk(input=current_log,
                                                            k=beam_size)

                    topk_scores = layers.reshape(topk_scores, shape=[-1])

                    topk_log_probs = topk_scores * length_penalty

                    generate_id = layers.reshape(
                        topk_indices, shape=[-1]) % self.tar_vocab_size

                    selected_beam = layers.reshape(
                        topk_indices, shape=[-1]) // self.tar_vocab_size

                    topk_finished = layers.equal(generate_id, eos_ids)

                    topk_finished = layers.cast(topk_finished, 'float32')

                    generate_id = layers.reshape(generate_id, shape=[-1, 1])

                    pre_tokens_list = layers.gather(tokens, selected_beam)

                    full_tokens_list = layers.concat(
                        [pre_tokens_list, generate_id], axis=1)


                    return full_tokens_list, topk_log_probs, topk_scores, topk_finished, selected_beam, generate_id, \
                            dec_att_out, new_hidden_array, new_cell_array

                with while_op.block():
                    (topk_seq, topk_log_probs, topk_scores, topk_finished,
                     topk_beam, topk_generate_id, attention_out,
                     new_hidden_array, new_cell_array) = grow_top_k(
                         step_idx, pre_tokens, pre_score, parent_idx)
                    alive_seq, alive_log_prob, _, alive_beam, alive_id = grow_alive(
                        topk_seq, topk_scores, topk_log_probs, topk_finished,
                        topk_beam, topk_generate_id)

                    finished_seq_2, finished_scores_2, finished_flags_2, _, _ = grow_finished(
                        finished_seq, finished_scores, finished_flag, topk_seq,
                        topk_scores, topk_finished)

                    finished_cond = is_finished(alive_log_prob,
                                                finished_scores_2,
                                                finished_flags_2)

                    layers.increment(x=step_idx, value=1.0, in_place=True)

                    layers.assign(alive_beam, parent_idx)
                    layers.assign(alive_id, pre_tokens)
                    layers.assign(alive_log_prob, pre_score)
                    layers.assign(alive_seq, tokens)
                    layers.assign(finished_seq_2, finished_seq)
                    layers.assign(finished_scores_2, finished_scores)
                    layers.assign(finished_flags_2, finished_flag)

                    # update init_hidden, init_cell, input_feed
                    new_feed = layers.gather(attention_out, parent_idx)
                    layers.assign(new_feed, pre_feed)
                    for i in range(self.num_layers):
                        new_hidden_var = layers.gather(new_hidden_array[i],
                                                       parent_idx)
                        layers.assign(new_hidden_var, pre_hidden_array[i])
                        new_cell_var = layers.gather(new_cell_array[i],
                                                     parent_idx)
                        layers.assign(new_cell_var, pre_cell_array[i])

                    length_cond = layers.less_than(x=step_idx, y=max_length)
                    layers.logical_and(x=length_cond,
                                       y=finished_cond,
                                       out=cond)

                tokens_with_eos = tokens

                all_seq = layers.concat([tokens_with_eos, finished_seq],
                                        axis=0)
                all_score = layers.concat([pre_score, finished_scores], axis=0)
                _, topk_index = layers.topk(all_score, k=beam_size)
                topk_index = layers.reshape(topk_index, shape=[-1])
                final_seq = layers.gather(all_seq, topk_index)
                final_score = layers.gather(all_score, topk_index)

                return final_seq
            elif mode == 'greedy_search':
                max_src_seq_len = layers.shape(self.src)[1]
                max_length = max_src_seq_len * 2
                #max_length = layers.fill_constant( [1], dtype='int32', value = 10)
                pre_ids = layers.fill_constant([1, 1], dtype='int64', value=1)
                full_ids = layers.fill_constant([1, 1], dtype='int64', value=1)

                score = layers.fill_constant([1], dtype='float32', value=0.0)

                eos_ids = layers.fill_constant([1, 1], dtype='int64', value=2)

                pre_hidden_array = []
                pre_cell_array = []
                pre_feed = layers.fill_constant([1, self.hidden_size],
                                                dtype='float32',
                                                value=0)
                for i in range(self.num_layers):
                    pre_hidden_array.append(enc_last_hidden[i])
                    pre_cell_array.append(enc_last_cell[i])
                    #pre_hidden_array.append( layers.fill_constant( [1, hidden_size], dtype='float32', value=0)  )
                    #pre_cell_array.append( layers.fill_constant( [1, hidden_size], dtype='float32', value=0) )

                step_idx = layers.fill_constant(shape=[1],
                                                dtype='int32',
                                                value=0)
                cond = layers.less_than(x=step_idx,
                                        y=max_length)  # default force_cpu=True
                while_op = layers.While(cond)

                with while_op.block():

                    dec_step_emb = layers.embedding(
                        input=pre_ids,
                        size=[self.tar_vocab_size, self.hidden_size],
                        dtype='float32',
                        is_sparse=False,
                        param_attr=fluid.ParamAttr(
                            name='target_embedding',
                            initializer=fluid.initializer.UniformInitializer(
                                low=-self.init_scale, high=self.init_scale)))

                    dec_att_out, new_hidden_array, new_cell_array = decoder_step(
                        dec_step_emb, pre_hidden_array, pre_cell_array)

                    projection = layers.matmul(dec_att_out, softmax_weight)

                    logits = layers.softmax(projection)
                    logits = layers.log(logits)

                    current_log = layers.elementwise_add(logits, score, axis=0)

                    topk_score, topk_indices = layers.topk(input=current_log,
                                                           k=1)

                    new_ids = layers.concat([full_ids, topk_indices])
                    layers.assign(new_ids, full_ids)
                    #layers.Print( full_ids, message="ful ids")
                    layers.assign(topk_score, score)
                    layers.assign(topk_indices, pre_ids)
                    layers.assign(dec_att_out, pre_feed)
                    for i in range(self.num_layers):
                        layers.assign(new_hidden_array[i], pre_hidden_array[i])
                        layers.assign(new_cell_array[i], pre_cell_array[i])

                    layers.increment(x=step_idx, value=1.0, in_place=True)

                    not_eos = layers.not_equal(topk_indices, eos_ids)
                    length_cond = layers.less_than(x=step_idx, y=max_length)
                    layers.logical_and(x=length_cond, y=not_eos, out=cond)

                return full_ids

            raise Exception("error")
        else:
            print("mode not supprt", mode)
Example #26
def _do_beam_search(trg_vocab_size, max_in_len, n_layer, n_head, d_key,
                    d_value, d_model, d_inner_hid, prepostprocess_dropout,
                    attention_dropout, relu_dropout, preprocess_cmd,
                    postprocess_cmd, weight_sharing, beam_size, max_len,
                    bos_idx, eos_idx, ids, scores, parent_idx,
                    trg_src_attn_bias, caches, enc_output, step_idx):
    """
        do beam search
    """
    cond = layers.less_than(x=step_idx, y=max_len)  # default force_cpu=True
    while_op = layers.While(cond)
    with while_op.block():
        pre_ids = layers.array_read(array=ids, i=step_idx)
        # Since beam_search_op doesn't enforce pre_ids' shape, we can do an
        # in-place reshape here, which actually changes the shape of pre_ids.
        pre_ids = layers.reshape(pre_ids, (-1, 1, 1), inplace=True)
        pre_scores = layers.array_read(array=scores, i=step_idx)
        # gather cell states corresponding to selected parent
        pre_src_attn_bias = layers.gather(trg_src_attn_bias, index=parent_idx)
        pre_pos = layers.elementwise_mul(
            x=layers.fill_constant_batch_size_like(
                input=pre_src_attn_bias,  # can't use a LoD tensor here
                value=1,
                shape=[-1, 1, 1],
                dtype=pre_ids.dtype),
            y=step_idx,
            axis=0)
        logits = wrap_decoder(trg_vocab_size,
                              max_in_len,
                              n_layer,
                              n_head,
                              d_key,
                              d_value,
                              d_model,
                              d_inner_hid,
                              prepostprocess_dropout,
                              attention_dropout,
                              relu_dropout,
                              preprocess_cmd,
                              postprocess_cmd,
                              weight_sharing,
                              dec_inputs=(pre_ids, pre_pos, None,
                                          pre_src_attn_bias),
                              enc_output=enc_output,
                              caches=caches,
                              gather_idx=parent_idx,
                              bos_idx=bos_idx)
        # intra-beam top-k
        topk_scores, topk_indices = layers.topk(input=layers.softmax(logits),
                                                k=beam_size)
        accu_scores = layers.elementwise_add(x=layers.log(topk_scores),
                                             y=pre_scores,
                                             axis=0)
        # beam_search op uses lod to differentiate branches.
        accu_scores = layers.lod_reset(accu_scores, pre_ids)
        # top-k reduction across beams; also contains special handling of
        # finished beams and finished sentences (batch reduction)
        selected_ids, selected_scores, gather_idx = layers.beam_search(
            pre_ids=pre_ids,
            pre_scores=pre_scores,
            ids=topk_indices,
            scores=accu_scores,
            beam_size=beam_size,
            end_id=eos_idx,
            return_parent_idx=True)
        layers.increment(x=step_idx, value=1.0, in_place=True)
        # cell states (caches) have been updated in wrap_decoder;
        # only the beam search states need updating here.
        layers.array_write(selected_ids, i=step_idx, array=ids)
        layers.array_write(selected_scores, i=step_idx, array=scores)
        layers.assign(gather_idx, parent_idx)
        layers.assign(pre_src_attn_bias, trg_src_attn_bias)
        length_cond = layers.less_than(x=step_idx, y=max_len)
        finish_cond = layers.logical_not(layers.is_empty(x=selected_ids))
        layers.logical_and(x=length_cond, y=finish_cond, out=cond)
Example #27
def decoder_decode(context, is_sparse):
    init_state = context
    array_len = pd.fill_constant(shape=[1], dtype='int64', value=max_length)
    counter = pd.zeros(shape=[1], dtype='int64', force_cpu=True)

    # fill the first element with init_state
    state_array = pd.create_array('float32')
    pd.array_write(init_state, array=state_array, i=counter)

    # ids, scores as memory
    ids_array = pd.create_array('int64')
    scores_array = pd.create_array('float32')

    init_ids = pd.data(name="init_ids", shape=[1], dtype="int64", lod_level=2)
    init_scores = pd.data(
        name="init_scores", shape=[1], dtype="float32", lod_level=2)

    pd.array_write(init_ids, array=ids_array, i=counter)
    pd.array_write(init_scores, array=scores_array, i=counter)

    cond = pd.less_than(x=counter, y=array_len)

    while_op = pd.While(cond=cond)
    with while_op.block():
        pre_ids = pd.array_read(array=ids_array, i=counter)
        pre_state = pd.array_read(array=state_array, i=counter)
        pre_score = pd.array_read(array=scores_array, i=counter)

        # expand the recursive_sequence_lengths of pre_state to be the same as pre_score
        pre_state_expanded = pd.sequence_expand(pre_state, pre_score)

        pre_ids_emb = pd.embedding(
            input=pre_ids,
            size=[dict_size, word_dim],
            dtype='float32',
            is_sparse=is_sparse)

        # use rnn unit to update rnn
        current_state = pd.fc(input=[pre_state_expanded, pre_ids_emb],
                              size=decoder_size,
                              act='tanh')
        current_state_with_lod = pd.lod_reset(x=current_state, y=pre_score)
        # use score to do beam search
        current_score = pd.fc(input=current_state_with_lod,
                              size=target_dict_dim,
                              act='softmax')
        topk_scores, topk_indices = pd.topk(current_score, k=beam_size)
        # calculate accumulated scores after topk to reduce computation cost
        accu_scores = pd.elementwise_add(
            x=pd.log(topk_scores), y=pd.reshape(
                pre_score, shape=[-1]), axis=0)
        selected_ids, selected_scores = pd.beam_search(
            pre_ids,
            pre_score,
            topk_indices,
            accu_scores,
            beam_size,
            end_id=10,
            level=0)

        pd.increment(x=counter, value=1, in_place=True)

        # update the memories
        pd.array_write(current_state, array=state_array, i=counter)
        pd.array_write(selected_ids, array=ids_array, i=counter)
        pd.array_write(selected_scores, array=scores_array, i=counter)

        # update the break condition: stop when the max length is reached or all
        # candidates of the source sentences have ended.
        length_cond = pd.less_than(x=counter, y=array_len)
        finish_cond = pd.logical_not(pd.is_empty(x=selected_ids))
        pd.logical_and(x=length_cond, y=finish_cond, out=cond)

    translation_ids, translation_scores = pd.beam_search_decode(
        ids=ids_array, scores=scores_array, beam_size=beam_size, end_id=10)

    # return init_ids, init_scores

    return translation_ids, translation_scores
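`beam_search_decode` reconstructs full hypotheses from the per-step `ids`/`scores` arrays by following each step's parent pointers backwards. A pure-Python backtrace over hypothetical per-step selections shows the idea (the arrays are invented; the real op additionally handles LoD-based batching):

def backtrace(step_ids, step_parents, beam):
    # walk from the last step back to the first, following parent indices
    seq = []
    for ids, parents in zip(reversed(step_ids), reversed(step_parents)):
        seq.append(ids[beam])
        beam = parents[beam]
    return list(reversed(seq))

step_ids = [[7, 4], [5, 9], [2, 2]]        # token selected per beam per step
step_parents = [[0, 0], [1, 0], [0, 1]]    # which parent beam each came from
print(backtrace(step_ids, step_parents, beam=0))  # [4, 5, 2]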
Example #28
    def decoder(self, init_state):
        """
        implement decoder in inference mode
        """
        # pd.Print(init_state)
        # define counter variable in the decoding
        array_len = pd.fill_constant(shape=[1],
                                     dtype='int64',
                                     value=self.max_length)
        counter = pd.zeros(shape=[1], dtype='int64', force_cpu=True)
        static_count = pd.zeros(shape=[1], dtype='int64', force_cpu=True)

        # define tensor array to save content at each time step, and write initial id, score and state
        state_h_array = pd.create_array('float32')
        pd.array_write(self.h, array=state_h_array, i=counter)
        state_c_array = pd.create_array('float32')
        pd.array_write(self.c, array=state_c_array, i=counter)

        src_indexes = fluid.layers.data(name='source_index',
                                        shape=[1],
                                        dtype='int64',
                                        lod_level=1)
        src_index_array = pd.create_array('int64')
        pd.array_write(src_indexes, array=src_index_array, i=counter)

        ids_array = pd.create_array('int64')
        scores_array = pd.create_array('float32')

        init_ids = fluid.layers.data(name="init_ids",
                                     shape=[1],
                                     dtype="int64",
                                     lod_level=2)
        init_scores = fluid.layers.data(name="init_scores",
                                        shape=[1],
                                        dtype="float32",
                                        lod_level=2)

        pd.array_write(init_ids, array=ids_array, i=counter)
        pd.array_write(init_scores, array=scores_array, i=counter)

        encoder_vec_array = pd.create_array('float32')
        pd.array_write(self.encoder_vec,
                       array=encoder_vec_array,
                       i=static_count)
        encoder_vec_full_array = pd.create_array('float32')
        pd.array_write(self.encoder_vec_full,
                       array=encoder_vec_full_array,
                       i=static_count)
        encoder_proj_array = pd.create_array('float32')
        pd.array_write(self.encoder_proj,
                       array=encoder_proj_array,
                       i=static_count)

        event_embedding_array = pd.create_array('float32')
        pd.array_write(self.event_embedding,
                       array=event_embedding_array,
                       i=static_count)

        # define conditional variable to stop loop
        cond = pd.less_than(x=counter, y=array_len)
        # define while_op
        while_op = pd.While(cond=cond)
        with while_op.block():  # define the computing of each step
            # pd.Print(counter)

            # obtain the decoder input at the present step: the id chosen at the
            # previous step, with its corresponding score and state.
            pre_ids = pd.array_read(array=ids_array, i=counter)
            pre_h_state = pd.array_read(array=state_h_array, i=counter)
            pre_c_state = pd.array_read(array=state_c_array, i=counter)

            # pre_score = pd.array_read(array=scores_array, i=counter)
            pre_score = pd.array_read(array=scores_array, i=static_count)

            _encoder_input_ids = pd.array_read(array=src_index_array,
                                               i=static_count)

            event_embedding = pd.array_read(array=event_embedding_array,
                                            i=static_count)

            # print("pre_h_state", pre_h_state)
            encoder_vec = pd.array_read(array=encoder_vec_array,
                                        i=static_count)
            encoder_vec_full = pd.array_read(array=encoder_vec_full_array,
                                             i=static_count)
            encoder_proj = pd.array_read(array=encoder_proj_array,
                                         i=static_count)

            # # update input state as state correspondent with id chosen at previous step
            # pre_h_state_expanded = pd.sequence_expand(pre_h_state, pre_score)
            # pre_c_state_expanded = pd.sequence_expand(pre_c_state, pre_score)
            # computing logic of decoder under the same train mode, including input vector and computing unit of decoder
            # compute predicting probability of normalized word
            pre_ids_emb = pd.embedding(
                input=pre_ids,
                size=[self.target_dict_dim, self.embedding_dim],
                dtype='float32',
                param_attr=fluid.ParamAttr(name="trg_embedding"))

            # pd.Print(pre_ids_emb)
            att_context = self.simple_attention(encoder_vec, encoder_proj,
                                                pre_h_state)
            # print("att_context", att_context)
            # print("pre_ids_emb", pre_ids_emb)
            # pd.Print(att_context)

            prob_c = fluid.layers.sequence_expand_as(pre_score, encoder_vec)
            # pd.Print(prob_c)

            current_score, current_h, current_c, this_prob_c = self.copy_decoder(
                pre_ids_emb, encoder_vec, encoder_vec_full, encoder_proj,
                _encoder_input_ids, pre_ids, prob_c, att_context, pre_h_state,
                pre_c_state, event_embedding)

            # decoder_inputs = fluid.layers.concat(
            #     input=[att_context, pre_ids_emb], axis=1)
            # current_h, current_c = self.lstm_step(
            #         decoder_inputs, pre_h_state, pre_c_state, self.decoder_size)
            # # compute predicting probability of normalized word
            # current_score = fluid.layers.fc(input=current_h,
            #                       size=self.target_dict_dim,
            #                       act='softmax',
            #                       param_attr=fluid.ParamAttr(name="out_softmax_w"),
            #                       bias_attr=fluid.ParamAttr(name="out_softmax_b"))

            # # current_state = pd.fc(input=[pre_state_expanded, pre_ids_emb],
            # #                       size=decoder_size,
            # #                       act='tanh')
            # current_state_with_lod = pd.lod_reset(x=current_h, y=pre_score)
            # current_score = pd.fc(input=current_state_with_lod,
            #                       size=self.target_dict_dim,
            #                       act='softmax',
            #                       param_attr=fluid.ParamAttr(name="out_softmax_w"),
            #                       bias_attr=fluid.ParamAttr(name="out_softmax_b"))
            # print(current_score)
            topk_scores, topk_indices = pd.topk(current_score,
                                                k=self.beam_size)
            # pd.Print(topk_indices)
            # pd.Print(topk_scores)
            selected_ids, selected_scores = topk_indices, topk_scores

            # # compute accumulated score and perform beam search
            # accu_scores = pd.elementwise_add(
            #     x=pd.log(topk_scores), y=pd.reshape(pre_score, shape=[-1]), axis=0)
            # selected_ids, selected_scores = pd.beam_search(
            #     pre_ids,
            #     pre_score,
            #     topk_indices,
            #     accu_scores,
            #     self.beam_size,
            #     # end_id=self.end_id,
            #     end_id=999999,
            #     level=0)

            # pd.Print(selected_ids)
            # pd.Print(selected_scores)

            pd.increment(x=counter, value=1, in_place=True)
            # write search result and corresponding hidden layer into tensor array
            pd.array_write(current_h, array=state_h_array, i=counter)
            pd.array_write(current_c, array=state_c_array, i=counter)
            pd.array_write(selected_ids, array=ids_array, i=counter)
            pd.array_write(selected_scores, array=scores_array, i=counter)
            # pd.Print(selected_ids)
            # pd.Print(selected_scores)

            # update condition to stop loop
            length_cond = pd.less_than(x=counter, y=array_len)
            finish_cond = pd.logical_not(pd.is_empty(x=selected_ids))
            pd.logical_and(x=length_cond, y=finish_cond, out=cond)

        # pd.Print(array_len)
        # translation_ids, translation_scores = pd.beam_search_decode(
        #     ids=ids_array, scores=scores_array, beam_size=self.beam_size, end_id=self.end_id)
        # pd.Print(translation_ids)
        translation_ids, translation_ids_index = pd.tensor_array_to_tensor(
            ids_array, axis=1)
        translation_scores, translation_scores_index = pd.tensor_array_to_tensor(
            scores_array, axis=1)

        return translation_ids, translation_scores
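Instead of `beam_search_decode`, this decoder stacks the per-step arrays directly: `tensor_array_to_tensor(..., axis=1)` concatenates the step-wise [batch, 1] tensors along the time axis. The NumPy equivalent (with made-up per-step outputs):

import numpy as np

steps = [np.array([[1], [1]]),   # step 0 ids for a batch of 2
         np.array([[4], [7]]),   # step 1
         np.array([[2], [2]])]   # step 2
translation_ids = np.concatenate(steps, axis=1)
print(translation_ids)  # [[1 4 2]
                        #  [1 7 2]]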
Example #29
    def fast_decode(self):
        """create model for inference"""
        if self.task_type == "dialog":
            emb_num = 4
        else:
            emb_num = 3
        input_shapes = [[-1, self.max_seq_len, 1]] * emb_num + \
                       [[-1, self.max_seq_len, self.max_seq_len]]
        input_dtypes = ['int64'] * emb_num + ['float32']
        input_lod_levels = [0] * emb_num + [0]

        shapes = input_shapes + [[-1, 1, 1], [-1, 1, 1], [-1, 1], [-1],
                                 [-1, 1, self.max_seq_len], [-1, 1]]
        dtypes = input_dtypes + [
            'int64', 'int64', 'float32', 'int32', 'float32', 'int64'
        ]
        lod_levels = input_lod_levels + [2, 2, 2, 0, 0, 0]

        inputs = self.to_tensor(shapes, dtypes, lod_levels)
        pyreader = fluid.io.DataLoader.from_generator(feed_list=inputs,
                                                      capacity=70,
                                                      iterable=False)
        emb_ids = {}
        for key, value in zip(self.emb_keys, inputs[:emb_num]):
            emb_ids[key] = value

        input_mask = inputs[emb_num]
        tgt_ids, tgt_pos, init_scores, parent_idx, tgt_input_mask, data_ids = inputs[
            -6:]

        unimo = UNIMOModel(emb_ids=emb_ids,
                           input_mask=input_mask,
                           config=self.gene_config,
                           task_type=self.task_type,
                           decoding=True,
                           gather_idx=parent_idx)

        max_len = layers.fill_constant(shape=[1],
                                       dtype=tgt_ids.dtype,
                                       value=self.max_out_len,
                                       force_cpu=True)
        min_len = layers.fill_constant(shape=[1],
                                       dtype=tgt_ids.dtype,
                                       value=self.min_out_len,
                                       force_cpu=True)
        neg_inf = layers.fill_constant(shape=[1], dtype='float32', value=-1e18)
        step_idx = layers.fill_constant(shape=[1],
                                        dtype=tgt_ids.dtype,
                                        value=0,
                                        force_cpu=True)
        step_next_idx = layers.fill_constant(shape=[1],
                                             dtype=tgt_ids.dtype,
                                             value=1,
                                             force_cpu=True)
        cond = layers.less_than(x=step_idx, y=max_len)
        while_op = layers.While(cond)

        ids = layers.array_write(layers.reshape(tgt_ids, (-1, 1)), step_idx)
        pos_biases = layers.array_write(tgt_pos, step_idx)
        scores = layers.array_write(init_scores, step_idx)
        tgt_masks = layers.array_write(tgt_input_mask, step_idx)

        trigram_blocking = TrigramBlocking(tgt_ids,
                                           self.tokenizer,
                                           beam_size=self.beam_size)

        with while_op.block():
            pre_ids = layers.array_read(array=ids, i=step_idx)
            pre_ids = layers.reshape(pre_ids, (-1, 1, 1), inplace=True)
            pre_scores = layers.array_read(array=scores, i=step_idx)
            pos_bias = layers.array_read(array=pos_biases, i=step_idx)
            pos_bias = layers.gather(input=pos_bias, index=parent_idx)

            def gen_batch_like(value,
                               dtype="int64",
                               shape=[-1, 1, 1],
                               is_scalar=True):
                """generate batch"""
                if is_scalar:
                    return layers.fill_constant_batch_size_like(
                        input=parent_idx,
                        value=value,
                        shape=shape,
                        dtype=dtype)
                else:
                    return layers.elementwise_mul(
                        x=layers.fill_constant_batch_size_like(
                            input=parent_idx,
                            value=1,
                            shape=shape,
                            dtype=dtype),
                        y=value,
                        axis=0)

            tmp_mask = layers.array_read(tgt_masks, i=step_idx)
            tmp_mask = layers.gather(input=tmp_mask, index=parent_idx)
            append_1_mask = gen_batch_like(1.0, dtype=tmp_mask.dtype)
            pre_mask = layers.concat([tmp_mask, append_1_mask], axis=2)

            pre_pos = gen_batch_like(step_idx, is_scalar=False)
            pre_pos = pre_pos + pos_bias  # pos starts from 2

            pre_sent = gen_batch_like(self.tgt_type_id, dtype=pre_ids.dtype)

            dec_emb_ids = {"word_embedding": pre_ids, "pos_embedding": pre_pos}
            if self.task_type == "dialog":
                role_ids = gen_batch_like(0)
                turn_ids = gen_batch_like(0)
                dec_emb_ids["role_embedding"] = role_ids
                dec_emb_ids["turn_embedding"] = turn_ids
            else:
                dec_emb_ids["sent_embedding"] = pre_sent

            dec_out = unimo.encode(emb_ids=dec_emb_ids,
                                   input_mask=pre_mask,
                                   gather_idx=parent_idx)
            fc_out = self.cal_logit(dec_out, None)

            # prevent generating the end token while the decoded length is
            # shorter than min_out_len
            eos_index = layers.fill_constant(shape=[layers.shape(fc_out)[0]],
                                             dtype='int64',
                                             value=self.eos_id)
            eos_index = fluid.one_hot(eos_index, depth=self.vocab_size)
            less_cond = layers.cast(layers.less_than(x=step_idx, y=min_len),
                                    dtype='float32')
            less_val = layers.elementwise_mul(less_cond, neg_inf)
            eos_val = layers.elementwise_mul(eos_index, less_val, axis=0)
            revised_logits = layers.elementwise_add(fc_out, eos_val, axis=0)
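            # Illustration (not in the original source): while
            # step_idx < min_len, less_cond == 1.0, so eos_val adds -1e18 to
            # the eos column of every row and softmax gives the end token
            # ~zero probability; once step_idx >= min_len, less_cond == 0.0
            # and the logits pass through unchanged.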

            # top-k reduction across beams; also contains special handling of
            # finished beams and finished sentences (batch reduction)
            topk_scores, topk_indices = layers.topk(
                input=layers.softmax(revised_logits), k=self.beam_size)

            # Roll back the previous scores for the length penalty:
            # `scores` stores length-penalized values, so before applying this
            # timestep's penalty we undo the previous one and accumulate using
            # the un-penalized score, then store the re-penalized result back
            # into `scores`. This is safe at step_idx == 0 (initialization),
            # because the previous score is 0.
            pre_timestep_length_penalty = layers.pow(
                ((5.0 + layers.cast(step_idx, pre_scores.dtype)) / 6.0),
                self.length_penalty)
            pre_scores_wo_len_penalty = layers.elementwise_mul(
                pre_scores, pre_timestep_length_penalty)
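            # Worked example (illustrative numbers only): with
            # length_penalty = 0.6 and step_idx = 4, the previous penalty is
            # ((5 + 4) / 6) ** 0.6 ≈ 1.28, so a stored score of -0.50 rolls
            # back to -0.50 * 1.28 = -0.64, the raw accumulated log-prob.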

            # calc trigram-blocking delta scores for current alive sequence
            if self.block_trigram:
                trigram_blocking.update_seq(pre_ids, parent_idx)
                trigram_blocking.expand_cand_seq(topk_indices)
                layers.py_func(func=trigram_blocking.blocking_forward,
                               x=[
                                   trigram_blocking.cand_seq,
                                   trigram_blocking.id2is_full_token
                               ],
                               out=trigram_blocking.delta_score_out,
                               backward_func=None)
                pre_scores_wo_len_penalty = layers.elementwise_add(
                    x=trigram_blocking.delta_score_out,
                    y=pre_scores_wo_len_penalty,
                    axis=0)
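            # Assumption about blocking_forward (defined elsewhere): it writes
            # 0.0 into delta_score_out for candidates that do not repeat a
            # trigram of their own prefix and a large negative value for those
            # that do, so blocked extensions cannot win the beam search below;
            # see the standalone sketch after this function.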
            # => [N, topk]
            accu_scores = layers.elementwise_add(x=layers.log(topk_scores),
                                                 y=pre_scores_wo_len_penalty,
                                                 axis=0)

            cur_timestep_length_penalty = layers.pow(
                ((5.0 + layers.cast(step_next_idx, accu_scores.dtype)) / 6.0),
                self.length_penalty)
            curr_scores = layers.elementwise_div(accu_scores,
                                                 cur_timestep_length_penalty)

            # beam_search op uses lod to differentiate branches.
            curr_scores = layers.lod_reset(curr_scores, pre_ids)
            topk_indices = layers.lod_reset(topk_indices, pre_ids)
            selected_ids, selected_scores, gather_idx = layers.beam_search(
                pre_ids=pre_ids,
                pre_scores=pre_scores,
                ids=topk_indices,
                scores=curr_scores,
                beam_size=self.beam_size,
                end_id=self.eos_id,
                return_parent_idx=True)
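            # beam_search keeps the top beam_size of the beam_size * beam_size
            # candidate (prefix, token) extensions per source sequence;
            # gather_idx records which parent beam each survivor extends and
            # is assigned to parent_idx below to re-gather caches, masks and
            # position biases at the next step.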

            layers.increment(x=step_idx, value=1.0, in_place=True)
            layers.increment(x=step_next_idx, value=1.0, in_place=True)
            # cell states (caches) have been updated in wrap_decoder;
            # only the beam-search states need to be updated here.
            layers.array_write(selected_ids, i=step_idx, array=ids)
            layers.array_write(selected_scores, i=step_idx, array=scores)
            layers.array_write(pre_mask, i=step_idx, array=tgt_masks)
            layers.array_write(pos_bias, i=step_idx, array=pos_biases)
            layers.assign(gather_idx, parent_idx)

            length_cond = layers.less_than(x=step_idx, y=max_len)
            finish_cond = layers.logical_not(layers.is_empty(x=selected_ids))
            layers.logical_and(x=length_cond, y=finish_cond, out=cond)

        finished_ids, finished_scores = layers.beam_search_decode(
            ids, scores, beam_size=self.beam_size, end_id=self.eos_id)

        graph_vars = {
            "finished_ids": finished_ids,
            "finished_scores": finished_scores,
            "data_ids": data_ids
        }

        for v in graph_vars.values():
            v.persistable = True

        return pyreader, graph_vars
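
A minimal, framework-free sketch of the trigram-blocking rule applied above
(`TrigramBlocking` itself is defined elsewhere; the helper below is an
illustrative stand-in, not the original implementation):

def trigram_blocking_delta(seq, cand_token, neg_inf=-1e18):
    """Score delta for appending `cand_token` to the token list `seq`.

    Returns `neg_inf` when the resulting last trigram already occurs in
    `seq`, so adding this delta to a candidate's score (as py_func does
    above) makes repeated trigrams impossible to select; otherwise 0.0.
    """
    if len(seq) < 2:
        return 0.0
    new_trigram = (seq[-2], seq[-1], cand_token)
    seen = {tuple(seq[i:i + 3]) for i in range(len(seq) - 2)}
    return neg_inf if new_trigram in seen else 0.0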