def prepare_encoder(src_word,
                    src_pos,
                    src_vocab_size,
                    src_emb_dim,
                    src_pad_idx,
                    src_max_len,
                    dropout=0.,
                    pos_pad_idx=0,
                    pos_enc_param_name=None):
    """Add word embeddings and position encodings.
    The output tensor has a shape of:
    [batch_size, max_src_length_in_batch, d_model].

    This module is used at the bottom of the encoder stacks.
    """
    src_word_emb = layers.embedding(
        src_word,
        size=[src_vocab_size, src_emb_dim],
        padding_idx=src_pad_idx,
        param_attr=fluid.initializer.Normal(0., 1.))
    src_pos_enc = layers.embedding(
        src_pos,
        size=[src_max_len, src_emb_dim],
        padding_idx=pos_pad_idx,
        param_attr=fluid.ParamAttr(
            name=pos_enc_param_name, trainable=False))
    enc_input = src_word_emb + src_pos_enc

    # FIXME(guosheng): Decouple the program desc with batch_size.
    enc_input = layers.reshape(x=enc_input, shape=[batch_size, -1, src_emb_dim])
    return layers.dropout(
        enc_input, dropout_prob=dropout,
        is_test=False) if dropout else enc_input
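# The non-trainable table loaded into the pos_enc parameter above is normally
# a sinusoid position-encoding table ("Attention Is All You Need"). A minimal
# NumPy sketch of such an initializer; the helper name position_encoding_init
# is an assumption and is not part of the snippet above.
import numpy as np

def position_encoding_init(n_position, d_model):
    """Build a [n_position, d_model] sinusoid position-encoding table."""
    pos = np.arange(n_position)[:, None]                       # [n_position, 1]
    dim = np.arange(d_model)[None, :]                          # [1, d_model]
    angle = pos / np.power(10000., 2 * (dim // 2) / d_model)   # broadcast
    table = np.zeros((n_position, d_model), dtype="float32")
    table[:, 0::2] = np.sin(angle[:, 0::2])   # even dimensions use sin
    table[:, 1::2] = np.cos(angle[:, 1::2])   # odd dimensions use cos
    return table

# The resulting table would then be fed into the parameter named by
# pos_enc_param_name before training or inference starts.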
    def __combine_heads(x):
        """
        Transpose and then reshape the last two dimensions of input tensor x
        so that it becomes one dimension, which is the reverse of __split_heads.
        """
        if len(x.shape) == 3: return x
        if len(x.shape) != 4:
            raise ValueError("Input(x) should be a 4-D Tensor.")

        trans_x = layers.transpose(x, perm=[0, 2, 1, 3])
        # FIXME(guosheng): Decouple the program desc with batch_size.
        return layers.reshape(
            x=trans_x,
            shape=list(map(
                int, [batch_size, -1, trans_x.shape[2] * trans_x.shape[3]])))
    def __split_heads(x, n_head):
        """
        Reshape the last dimension of input tensor x so that it becomes two
        dimensions and then transpose. Specifically, input a tensor with shape
        [bs, max_sequence_length, n_head * hidden_dim] then output a tensor
        with shape [bs, n_head, max_sequence_length, hidden_dim].
        """
        if n_head == 1:
            return x

        hidden_size = x.shape[-1]
        # FIXME(guosheng): Decouple the program desc with batch_size.
        reshaped = layers.reshape(
            x=x, shape=[batch_size, -1, n_head, hidden_size // n_head])

        # permute the dimensions into:
        # [batch_size, n_head, max_sequence_len, hidden_size_per_head]
        return layers.transpose(x=reshaped, perm=[0, 2, 1, 3])
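# A quick NumPy check of the shape round-trip performed by __split_heads and
# __combine_heads above (plain arrays instead of fluid variables, and a
# concrete batch size instead of the FIXME'd batch_size):
import numpy as np

bs, seq_len, n_head, d_per_head = 2, 5, 4, 8
x = np.random.rand(bs, seq_len, n_head * d_per_head)

split = x.reshape(bs, seq_len, n_head, d_per_head).transpose(0, 2, 1, 3)
assert split.shape == (bs, n_head, seq_len, d_per_head)

combined = split.transpose(0, 2, 1, 3).reshape(bs, seq_len, n_head * d_per_head)
assert np.allclose(combined, x)   # __combine_heads undoes __split_heads exactly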
Example #4
def encoder(x,
            y,
            vocab_size,
            emb_size,
            init_hidden=None,
            init_cell=None,
            para_name='',
            custom_samples=None,
            custom_probabilities=None,
            test_mode=False,
            args=None):
    x_emb = layers.embedding(
        input=x,
        size=[vocab_size, emb_size],
        dtype='float32',
        is_sparse=False,
        param_attr=fluid.ParamAttr(name='embedding_para'))
    rnn_input = x_emb
    rnn_outs = []
    rnn_outs_ori = []
    cells = []
    projs = []
    for i in range(args.num_layers):
        rnn_input = dropout(rnn_input, test_mode, args)
        if init_hidden and init_cell:
            h0 = layers.squeeze(
                layers.slice(
                    init_hidden, axes=[0], starts=[i], ends=[i + 1]),
                axes=[0])
            c0 = layers.squeeze(
                layers.slice(
                    init_cell, axes=[0], starts=[i], ends=[i + 1]),
                axes=[0])
        else:
            h0 = c0 = None
        rnn_out, cell, input_proj = lstmp_encoder(
            rnn_input, args.hidden_size, h0, c0,
            para_name + 'layer{}'.format(i + 1), emb_size, test_mode, args)
        rnn_out_ori = rnn_out
        if i > 0:
            rnn_out = rnn_out + rnn_input
        rnn_out = dropout(rnn_out, test_mode, args)
        cell = dropout(cell, test_mode, args)
        rnn_outs.append(rnn_out)
        rnn_outs_ori.append(rnn_out_ori)
        rnn_input = rnn_out
        cells.append(cell)
        projs.append(input_proj)

    softmax_weight = layers.create_parameter(
        [vocab_size, emb_size], dtype="float32", name="softmax_weight")
    softmax_bias = layers.create_parameter(
        [vocab_size], dtype="float32", name='softmax_bias')
    projection = layers.matmul(rnn_outs[-1], softmax_weight, transpose_y=True)
    projection = layers.elementwise_add(projection, softmax_bias)

    projection = layers.reshape(projection, shape=[-1, vocab_size])

    if args.sample_softmax and (not test_mode):
        loss = layers.sampled_softmax_with_cross_entropy(
            logits=projection,
            label=y,
            num_samples=args.n_negative_samples_batch,
            seed=args.random_seed)
        if args.debug:
            layers.Print(loss, message='out_loss', summarize=100)
    else:
        label = layers.one_hot(input=y, depth=vocab_size)
        loss = layers.softmax_with_cross_entropy(
            logits=projection, label=label, soft_label=True)
    return [x_emb, projection, loss], rnn_outs, rnn_outs_ori, cells, projs
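# Shape sketch (NumPy stand-in) of the output projection above: the softmax
# weight is stored as [vocab_size, emb_size], so the matmul uses transpose_y
# to map hidden states of width emb_size onto vocabulary logits, which are
# then flattened to [batch * seq_len, vocab_size] for the loss.
import numpy as np

batch, seq_len, emb_size, vocab_size = 2, 7, 16, 100
hidden = np.random.rand(batch, seq_len, emb_size)         # rnn_outs[-1]
softmax_weight = np.random.rand(vocab_size, emb_size)
softmax_bias = np.random.rand(vocab_size)

logits = hidden @ softmax_weight.T + softmax_bias         # transpose_y=True
logits = logits.reshape(-1, vocab_size)
assert logits.shape == (batch * seq_len, vocab_size)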
Example #5
def wrap_decoder(trg_vocab_size,
                 max_length,
                 n_layer,
                 n_head,
                 d_key,
                 d_value,
                 d_model,
                 d_inner_hid,
                 dropout_rate,
                 dec_inputs=None,
                 enc_output=None):
    """
    The wrapper assembles together all needed layers for the decoder.
    """
    if dec_inputs is None:
        # This is used to implement independent decoder program in inference.
        trg_word, trg_pos, trg_slf_attn_bias, trg_src_attn_bias, \
            enc_output, trg_data_shape, slf_attn_pre_softmax_shape, \
            slf_attn_post_softmax_shape, src_attn_pre_softmax_shape, \
            src_attn_post_softmax_shape = make_all_inputs(
            decoder_data_input_fields + decoder_util_input_fields)
    else:
        trg_word, trg_pos, trg_slf_attn_bias, trg_src_attn_bias, \
            trg_data_shape, slf_attn_pre_softmax_shape, \
            slf_attn_post_softmax_shape, src_attn_pre_softmax_shape, \
            src_attn_post_softmax_shape = dec_inputs

    dec_input = prepare_decoder(
        trg_word,
        trg_pos,
        trg_vocab_size,
        d_model,
        max_length,
        dropout_rate,
        trg_data_shape,
    )
    dec_output = decoder(
        dec_input,
        enc_output,
        trg_slf_attn_bias,
        trg_src_attn_bias,
        n_layer,
        n_head,
        d_key,
        d_value,
        d_model,
        d_inner_hid,
        dropout_rate,
        slf_attn_pre_softmax_shape,
        slf_attn_post_softmax_shape,
        src_attn_pre_softmax_shape,
        src_attn_post_softmax_shape,
    )
    # Return logits for training and probs for inference.
    predict = layers.reshape(x=layers.fc(input=dec_output,
                                         size=trg_vocab_size,
                                         bias_attr=False,
                                         num_flatten_dims=2),
                             shape=[-1, trg_vocab_size],
                             act="softmax" if dec_inputs is None else None)
    return predict
Example #6
    def beam_search():
        max_len = layers.fill_constant(shape=[1],
                                       dtype=start_tokens.dtype,
                                       value=max_out_len,
                                       force_cpu=True)
        step_idx = layers.fill_constant(shape=[1],
                                        dtype=start_tokens.dtype,
                                        value=0,
                                        force_cpu=True)
        cond = layers.less_than(x=step_idx,
                                y=max_len)  # default force_cpu=True
        while_op = layers.While(cond)
        # array states will be stored for each step.
        ids = layers.array_write(layers.reshape(start_tokens, (-1, 1)),
                                 step_idx)
        scores = layers.array_write(init_scores, step_idx)
        # cell states will be overwritten at each step.
        # caches contain the states of previous steps for decoder self-attention
        # and the static encoder output projections for encoder-decoder attention,
        # to reduce redundant computation.
        caches = [
            {
                "k":  # for self attention
                layers.fill_constant_batch_size_like(
                    input=start_tokens,
                    shape=[-1, n_head, 0, d_key],
                    dtype=enc_output.dtype,
                    value=0),
                "v":  # for self attention
                layers.fill_constant_batch_size_like(
                    input=start_tokens,
                    shape=[-1, n_head, 0, d_value],
                    dtype=enc_output.dtype,
                    value=0),
                "static_k":  # for encoder-decoder attention
                layers.create_tensor(dtype=enc_output.dtype),
                "static_v":  # for encoder-decoder attention
                layers.create_tensor(dtype=enc_output.dtype)
            } for i in range(n_layer)
        ]

        with while_op.block():
            pre_ids = layers.array_read(array=ids, i=step_idx)
            # Since beam_search_op doesn't enforce pre_ids' shape, we can do
            # an inplace reshape here which actually changes the shape of pre_ids.
            pre_ids = layers.reshape(pre_ids, (-1, 1, 1), inplace=True)
            pre_scores = layers.array_read(array=scores, i=step_idx)
            # gather cell states corresponding to selected parent
            pre_src_attn_bias = layers.gather(trg_src_attn_bias,
                                              index=parent_idx)
            pre_pos = layers.elementwise_mul(
                x=layers.fill_constant_batch_size_like(
                    input=pre_src_attn_bias,  # can't use lod tensor here
                    value=1,
                    shape=[-1, 1, 1],
                    dtype=pre_ids.dtype),
                y=step_idx,
                axis=0)
            logits = wrap_decoder(trg_vocab_size,
                                  max_in_len,
                                  n_layer,
                                  n_head,
                                  d_key,
                                  d_value,
                                  d_model,
                                  d_inner_hid,
                                  prepostprocess_dropout,
                                  attention_dropout,
                                  relu_dropout,
                                  preprocess_cmd,
                                  postprocess_cmd,
                                  weight_sharing,
                                  dec_inputs=(pre_ids, pre_pos, None,
                                              pre_src_attn_bias),
                                  enc_output=enc_output,
                                  caches=caches,
                                  gather_idx=parent_idx)
            # intra-beam topK
            topk_scores, topk_indices = layers.topk(
                input=layers.softmax(logits), k=beam_size)
            accu_scores = layers.elementwise_add(x=layers.log(topk_scores),
                                                 y=pre_scores,
                                                 axis=0)
            # beam_search op uses lod to differentiate branches.
            topk_indices = layers.lod_reset(topk_indices, pre_ids)
            # topK reduction across beams; this also contains special handling
            # of finished beams and finished sentences (batch reduction).
            selected_ids, selected_scores, gather_idx = layers.beam_search(
                pre_ids=pre_ids,
                pre_scores=pre_scores,
                ids=topk_indices,
                scores=accu_scores,
                beam_size=beam_size,
                end_id=eos_idx,
                return_parent_idx=True)
            layers.increment(x=step_idx, value=1.0, in_place=True)
            # cell states(caches) have been updated in wrap_decoder,
            # only need to update beam search states here.
            layers.array_write(selected_ids, i=step_idx, array=ids)
            layers.array_write(selected_scores, i=step_idx, array=scores)
            layers.assign(gather_idx, parent_idx)
            layers.assign(pre_src_attn_bias, trg_src_attn_bias)
            length_cond = layers.less_than(x=step_idx, y=max_len)
            finish_cond = layers.logical_not(layers.is_empty(x=selected_ids))
            layers.logical_and(x=length_cond, y=finish_cond, out=cond)

        finished_ids, finished_scores = layers.beam_search_decode(
            ids, scores, beam_size=beam_size, end_id=eos_idx)
        return finished_ids, finished_scores
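# The per-step score bookkeeping inside the loop above, in plain NumPy: take
# the top-k next-token probabilities of every beam, move to log space, and
# add the running beam score (elementwise_add with axis=0 broadcasts across
# the vocabulary dimension). Illustrative only -- the real beam_search op
# additionally handles lod branching and finished (EOS) beams.
import numpy as np

beam_size, vocab = 4, 10
probs = np.random.dirichlet(np.ones(vocab), size=beam_size)   # softmax(logits)
pre_scores = np.log(np.random.rand(beam_size))                # running log-probs

topk_idx = np.argsort(-probs, axis=1)[:, :beam_size]          # intra-beam topK
topk_scores = np.take_along_axis(probs, topk_idx, axis=1)
accu_scores = np.log(topk_scores) + pre_scores[:, None]       # accumulate
assert accu_scores.shape == (beam_size, beam_size)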
Example #7
def mask_prob(p, onehot_eos, finished):
    is_finished = L.cast(L.reshape(finished, [-1, 1]) != 0, 'float32')
    p = is_finished * (1. - L.cast(onehot_eos, 'float32')) * -9999. + (
        1. - is_finished) * p
    return p
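# What mask_prob computes, in NumPy: for beams that are already finished,
# every non-EOS entry is pushed to a large negative value so the beam can
# only keep emitting EOS, while unfinished beams keep their original scores.
import numpy as np

vocab, eos_id = 5, 3
p = np.random.rand(2, vocab)               # scores for two beams
onehot_eos = np.eye(vocab)[eos_id]         # one-hot EOS row
finished = np.array([1, 0])                # beam 0 finished, beam 1 still alive

is_finished = (finished.reshape(-1, 1) != 0).astype("float32")
masked = is_finished * (1. - onehot_eos) * -9999. + (1. - is_finished) * p
# beam 0: -9999 everywhere except the EOS column (which becomes 0);
# beam 1: unchanged.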
Example #8
    def get_single_direction_output(rnn_input,
                                    unit_list,
                                    mask=None,
                                    direc_index=0):
        rnn = StaticRNN()
        with rnn.step():
            step_input = rnn.step_input(rnn_input)

            if mask:
                step_mask = rnn.step_input(mask)

            for i in range(num_layers):
                if init_hidden:
                    pre_hidden = rnn.memory(init=init_hidden[i, direc_index])
                    pre_cell = rnn.memory(init=init_cell[i, direc_index])
                else:
                    pre_hidden = rnn.memory(
                        batch_ref=rnn_input, shape=[-1, hidden_size])
                    pre_cell = rnn.memory(
                        batch_ref=rnn_input, shape=[-1, hidden_size])

                new_hidden, new_cell = unit_list[i](step_input, pre_hidden,
                                                    pre_cell)

                if mask:
                    new_hidden = layers.elementwise_mul(
                        new_hidden, step_mask, axis=0) - layers.elementwise_mul(
                            pre_hidden, (step_mask - 1), axis=0)
                    new_cell = layers.elementwise_mul(
                        new_cell, step_mask, axis=0) - layers.elementwise_mul(
                            pre_cell, (step_mask - 1), axis=0)

                rnn.update_memory(pre_hidden, new_hidden)
                rnn.update_memory(pre_cell, new_cell)

                rnn.step_output(new_hidden)
                rnn.step_output(new_cell)

                step_input = new_hidden
                if dropout_prob != None and dropout_prob > 0.0:
                    step_input = layers.dropout(
                        step_input,
                        dropout_prob=dropout_prob,
                        dropout_implementation='upscale_in_train')

            rnn.step_output(step_input)

        rnn_out = rnn()

        last_hidden_array = []
        last_cell_array = []
        rnn_output = rnn_out[-1]
        for i in range(num_layers):
            last_hidden = rnn_out[i * 2]
            last_hidden = last_hidden[-1]
            last_hidden_array.append(last_hidden)
            last_cell = rnn_out[i * 2 + 1]
            last_cell = last_cell[-1]
            last_cell_array.append(last_cell)

        last_hidden_output = layers.concat(last_hidden_array, axis=0)
        last_hidden_output = layers.reshape(
            last_hidden_output, shape=[num_layers, -1, hidden_size])
        last_cell_output = layers.concat(last_cell_array, axis=0)
        last_cell_output = layers.reshape(
            last_cell_output, shape=[num_layers, -1, hidden_size])

        return rnn_output, last_hidden_output, last_cell_output
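# The masked state update above keeps the previous hidden state for padded
# positions: new * mask - pre * (mask - 1) is simply new * mask + pre * (1 - mask).
# A quick NumPy check of that identity:
import numpy as np

pre = np.random.rand(3, 4)                 # previous hidden/cell state
new = np.random.rand(3, 4)                 # candidate state from the LSTM unit
mask = np.array([1., 1., 0.])[:, None]     # third sequence is padding at this step

updated = new * mask - pre * (mask - 1)
assert np.allclose(updated, new * mask + pre * (1 - mask))
assert np.allclose(updated[2], pre[2])     # padded row keeps its old state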
    def beam_search(enc_output, enc_bias, source_length):
        """
            beam_search
        """
        max_len = layers.fill_constant(shape=[1],
                                       dtype='int64',
                                       value=max_out_len)
        step_idx = layers.fill_constant(shape=[1], dtype='int64', value=0)
        cond = layers.less_than(x=step_idx, y=max_len)
        while_op = layers.While(cond)

        caches_batch_size = batch_size * beam_size
        init_score = np.zeros([1, beam_size]).astype('float32')
        init_score[:, 1:] = -INF
        initial_log_probs = layers.assign(init_score)

        alive_log_probs = layers.expand(initial_log_probs, [batch_size, 1])
        # alive seq [batch_size, beam_size, 1]
        initial_ids = layers.zeros([batch_size, 1, 1], 'float32')
        alive_seq = layers.expand(initial_ids, [1, beam_size, 1])
        alive_seq = layers.cast(alive_seq, 'int64')

        enc_output = layers.unsqueeze(enc_output, axes=[1])
        enc_output = layers.expand(enc_output, [1, beam_size, 1, 1])
        enc_output = layers.reshape(enc_output,
                                    [caches_batch_size, -1, d_model])

        tgt_src_attn_bias = layers.unsqueeze(enc_bias, axes=[1])
        tgt_src_attn_bias = layers.expand(tgt_src_attn_bias,
                                          [1, beam_size, n_head, 1, 1])
        enc_bias_shape = layers.shape(tgt_src_attn_bias)
        tgt_src_attn_bias = layers.reshape(
            tgt_src_attn_bias,
            [-1, enc_bias_shape[2], enc_bias_shape[3], enc_bias_shape[4]])

        beam_search = BeamSearch(beam_size, batch_size, decode_alpha,
                                 trg_vocab_size, d_model)

        caches = [{
            "k":
            layers.fill_constant(shape=[caches_batch_size, 0, d_model],
                                 dtype=enc_output.dtype,
                                 value=0),
            "v":
            layers.fill_constant(shape=[caches_batch_size, 0, d_model],
                                 dtype=enc_output.dtype,
                                 value=0)
        } for i in range(n_layer)]

        finished_seq = layers.zeros_like(alive_seq)
        finished_scores = layers.fill_constant([batch_size, beam_size],
                                               dtype='float32',
                                               value=-INF)
        finished_flags = layers.fill_constant([batch_size, beam_size],
                                              dtype='float32',
                                              value=0)

        with while_op.block():
            pos = layers.fill_constant([caches_batch_size, 1, 1],
                                       dtype='int64',
                                       value=1)
            pos = layers.elementwise_mul(pos, step_idx, axis=0)

            alive_seq_1 = layers.reshape(alive_seq, [caches_batch_size, -1])
            alive_seq_2 = alive_seq_1[:, -1:]
            alive_seq_2 = layers.unsqueeze(alive_seq_2, axes=[1])

            logits = wrap_decoder(trg_vocab_size,
                                  max_in_len,
                                  n_layer,
                                  n_head,
                                  d_key,
                                  d_value,
                                  d_model,
                                  d_inner_hid,
                                  prepostprocess_dropout,
                                  attention_dropout,
                                  relu_dropout,
                                  preprocess_cmd,
                                  postprocess_cmd,
                                  weight_sharing,
                                  embedding_sharing,
                                  dec_inputs=(alive_seq_2, alive_seq_2, pos,
                                              None, tgt_src_attn_bias),
                                  enc_output=enc_output,
                                  caches=caches,
                                  is_train=False,
                                  params_type=params_type)

            alive_seq_2, alive_log_probs_2, finished_seq_2, finished_scores_2, finished_flags_2, caches_2 = \
                    beam_search.inner_func(step_idx, logits, alive_seq_1, alive_log_probs, finished_seq,
                                           finished_scores, finished_flags, caches, enc_output,
                                           tgt_src_attn_bias)

            layers.increment(x=step_idx, value=1.0, in_place=True)
            finish_cond = beam_search.is_finished(step_idx, source_length,
                                                  alive_log_probs_2,
                                                  finished_scores_2,
                                                  finished_flags_2)

            layers.assign(alive_seq_2, alive_seq)
            layers.assign(alive_log_probs_2, alive_log_probs)
            layers.assign(finished_seq_2, finished_seq)
            layers.assign(finished_scores_2, finished_scores)
            layers.assign(finished_flags_2, finished_flags)

            for i in range(len(caches_2)):
                layers.assign(caches_2[i]["k"], caches[i]["k"])
                layers.assign(caches_2[i]["v"], caches[i]["v"])

            layers.logical_and(x=cond, y=finish_cond, out=cond)

        finished_flags = layers.reduce_sum(
            finished_flags, dim=1, keep_dim=True) / beam_size
        finished_flags = layers.cast(finished_flags, 'bool')
        mask = layers.cast(
            layers.reduce_any(input=finished_flags, dim=1, keep_dim=True),
            'float32')
        mask = layers.expand(mask, [1, beam_size])

        mask2 = 1.0 - mask
        finished_seq = layers.cast(finished_seq, 'float32')
        alive_seq = layers.cast(alive_seq, 'float32')
        #print mask

        finished_seq = layers.elementwise_mul(finished_seq, mask, axis=0) + \
                        layers.elementwise_mul(alive_seq, mask2, axis = 0)
        finished_seq = layers.cast(finished_seq, 'int32')
        finished_scores = layers.elementwise_mul(finished_scores, mask, axis=0) + \
                            layers.elementwise_mul(alive_log_probs, mask2)
        finished_seq.persistable = True
        finished_scores.persistable = True

        return finished_seq, finished_scores
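# Why init_score above sets every beam except the first to -INF: at step 0
# all beams hold identical copies of the start token, so only one copy is
# allowed to survive the first topK selection. NumPy sketch of that init:
import numpy as np

INF = 1e9
batch_size, beam_size = 2, 4
init_score = np.zeros([1, beam_size], dtype="float32")
init_score[:, 1:] = -INF
alive_log_probs = np.tile(init_score, [batch_size, 1])   # layers.expand
# alive_log_probs[:, 0] == 0 while the other beams are effectively dead
# until real scores accumulate.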
Example #10
def graph_linformer(gw,
                    feature,
                    edge_feature,
                    hidden_size,
                    name,
                    num_heads=4,
                    attn_drop=False,
                    concat=True,
                    skip_feat=True,
                    gate=False,
                    norm=True,
                    relu=True,
                    k_hop=2,
                    is_test=False):
    """Implementation of graph Transformer from UniMP

    This is an implementation of the paper Unified Message Passing Model for Semi-Supervised Classification
    (https://arxiv.org/abs/2009.03509).

    Args:
        name: Graph Transformer layer name.
        
        gw: Graph wrapper object (:code:`StaticGraphWrapper` or :code:`GraphWrapper`)

        feature: A tensor with shape (num_nodes, feature_size).

        hidden_size: The hidden size for graph transformer.

        num_heads: The head number in graph transformer.

        attn_drop: Dropout rate for attention.
        
        edge_feature: A tensor with shape (num_edges, feature_size).

        concat: Whether to reshape the (num_nodes, num_heads, hidden_size) output by concatenation to (num_nodes, hidden_size * num_heads); otherwise average over heads to (num_nodes, hidden_size).

        skip_feat: Whether to use a skip connection.

        gate: Whether to combine skip_feat and the output with a gate weight.

        norm: Whether to apply layer_norm to the output.

        relu: Whether to apply a relu activation to the output.

        k_hop: Number of propagation hops used when aggregating the linearized attention sums.

        is_test: Whether in test phase.

    Return:
        A tensor with shape (num_nodes, hidden_size * num_heads) or (num_nodes, hidden_size)
    """
    def send_attention(src_feat, dst_feat, edge_feat):
        if edge_feat is None or not edge_feat:
            k_h = L.elu(
                L.reshape(src_feat["k_h"],
                          [-1, num_heads, hidden_size, 1])) + 1
            v_h = dst_feat["v_h"]
        else:
            edge_feat = edge_feat["edge"]
            edge_feat = L.reshape(edge_feat, [-1, num_heads, hidden_size])
            k_h = L.elu(src_feat["k_h"] + edge_feat) + 1
            v_h = dst_feat["v_h"] + edge_feat
        k_h = L.reshape(k_h, [-1, num_heads, hidden_size, 1])

        v_h = L.reshape(v_h, [-1, num_heads, hidden_size, 1])
        sum_kTv = L.matmul(k_h, v_h, transpose_y=True)
        sum_k = L.reshape(k_h, [-1, num_heads * hidden_size])
        sum_kTv = L.reshape(sum_kTv,
                            [-1, num_heads * hidden_size * hidden_size])

        return {"sum_k": sum_k, "sum_kTv": sum_kTv}

    def send_copy(src_feat, dst_feat, edge_feat):
        return src_feat

    def reduce_sum(msg):
        return L.sequence_pool(msg, "sum")

    q = L.elu(
        linear(feature,
               hidden_size * num_heads,
               name=name + '_q_weight',
               init_type='gcn')) + 1
    k = linear(feature,
               hidden_size * num_heads,
               name=name + '_k_weight',
               init_type='gcn')
    v = linear(feature,
               hidden_size * num_heads,
               name=name + '_v_weight',
               init_type='gcn')

    reshape_q = L.reshape(q, [-1, num_heads, 1, hidden_size])
    reshape_k = L.reshape(k, [-1, num_heads, hidden_size])
    reshape_v = L.reshape(v, [-1, num_heads, hidden_size])

    msg = gw.send(send_attention,
                  nfeat_list=[("k_h", reshape_k), ("v_h", reshape_v)],
                  efeat_list=[('edge', edge_feature)])

    sum_k = gw.recv(msg["sum_k"], reduce_sum)
    sum_kTv = gw.recv(msg["sum_kTv"], reduce_sum)

    for i in range(1, k_hop):
        msg = gw.send(send_copy,
                      nfeat_list=[("sum_k", sum_k), ("sum_kTv", sum_kTv)])
        sum_k = gw.recv(msg["sum_k"], reduce_sum)
        sum_kTv = gw.recv(msg["sum_kTv"], reduce_sum)
        # sum_k: [-1, num_heads * hidden_size]
        # sum_kTv: [-1, num_heads * hidden_size * hidden_size]
    sum_k = L.reshape(sum_k, [-1, num_heads, 1, hidden_size])
    sum_kTv = L.reshape(sum_kTv, [-1, num_heads, hidden_size, hidden_size])
    out_feat = L.reshape(L.matmul(reshape_q, sum_kTv),
                         [-1, num_heads, hidden_size]) / L.reduce_sum(
                             reshape_q * sum_k, -1)
    if concat:
        out_feat = L.reshape(out_feat, [-1, num_heads * hidden_size])
    else:
        out_feat = L.reduce_mean(out_feat, dim=1)

    if skip_feat:
        if concat:
            skip_feature = linear(feature,
                                  hidden_size * num_heads,
                                  name=name + '_skip_weight',
                                  init_type='lin')
        else:
            skip_feature = linear(feature,
                                  hidden_size,
                                  name=name + '_skip_weight',
                                  init_type='lin')

        if gate:
            temp_output = L.concat(
                [skip_feature, out_feat, out_feat - skip_feature], axis=-1)
            gate_f = L.sigmoid(
                linear(temp_output,
                       1,
                       name=name + '_gate_weight',
                       init_type='lin'))
            out_feat = skip_feature * gate_f + out_feat * (1 - gate_f)
        else:
            out_feat = skip_feature + out_feat

    if norm:
        out_feat = layer_norm(out_feat, name="ln_%s" % name)

    if relu:
        out_feat = L.relu(out_feat)

    return out_feat
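# The send/recv functions above implement linearized attention: with the
# feature map phi(x) = elu(x) + 1, each node only needs the aggregated sums
# of phi(k) and phi(k) v^T from its neighbours, so no per-edge softmax is
# required. Single-head NumPy sketch over one node's incoming neighbours:
import numpy as np

def elu_plus_one(x):
    return np.where(x > 0, x + 1.0, np.exp(x))    # elu(x) + 1, always positive

hidden = 8
q = elu_plus_one(np.random.randn(hidden))         # query of the target node
k = elu_plus_one(np.random.randn(5, hidden))      # keys of 5 neighbours
v = np.random.randn(5, hidden)                    # values of 5 neighbours

sum_k = k.sum(axis=0)                             # "sum_k" message
sum_kTv = k.T @ v                                 # "sum_kTv" message

out = (q @ sum_kTv) / (q @ sum_k)                 # linear-attention output
# identical to a weighted sum of v with weights (q . k_j) / sum_j'(q . k_j')
ref = ((k @ q) / (k @ q).sum())[:, None] * v
assert np.allclose(out, ref.sum(axis=0))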
Example #11
def basic_gru(input,
              init_hidden,
              hidden_size,
              num_layers=1,
              sequence_length=None,
              dropout_prob=0.0,
              bidirectional=False,
              batch_first=True,
              param_attr=None,
              bias_attr=None,
              gate_activation=None,
              activation=None,
              dtype='float32',
              name='basic_gru'):
    r"""
    GRU implementation using basic operator, supports multiple layers and bidirectional gru.

    .. math::
            u_t & = actGate(W_ux xu_{t} + W_uh h_{t-1} + b_u)

            r_t & = actGate(W_rx xr_{t} + W_rh h_{t-1} + b_r)

            m_t & = actNode(W_cx xm_t + W_ch dot(r_t, h_{t-1}) + b_m)

            h_t & = dot(u_t, h_{t-1}) + dot((1-u_t), m_t)

    Args:
        input (Variable): GRU input tensor,
                       if batch_first = False, shape should be ( seq_len x batch_size x input_size )
                       if batch_first = True, shape should be ( batch_size x seq_len x input_size )
        init_hidden(Variable|None): The initial hidden state of the GRU.
                       This is a tensor with shape ( num_layers x batch_size x hidden_size);
                       if bidirectional = True, the shape should be ( num_layers*2 x batch_size x hidden_size)
                       and can be reshaped to a tensor of shape ( num_layers x 2 x batch_size x hidden_size) to use.
                       If it's None, it will be set to all 0.
        hidden_size (int): Hidden size of the GRU
        num_layers (int): The total number of layers of the GRU
        sequence_length (Variable|None): A Tensor of shape [batch_size] that stores the real length of each instance.
                        This tensor will be converted to a mask used to mask the padding ids.
                        If it's None, there are no padding ids.
        dropout_prob(float|0.0): Dropout probability. Dropout ONLY works on the rnn output of each layer,
                             NOT between time steps.
        bidirectional (bool|False): If it is bidirectional
        batch_first (bool|True): The shape format of the input and output tensors. If true,
            the shape format should be :attr:`[batch_size, seq_len, hidden_size]`. If false,
            the shape format should be :attr:`[seq_len, batch_size, hidden_size]`. By default
            this function accepts input and emits output in batch-major form to be consistent
            with most of data format, though a bit less efficient because of extra transposes.
        param_attr(ParamAttr|None): The parameter attribute for the learnable
            weight matrix. Note:
            If it is set to None or one attribute of ParamAttr, gru_unit will
            create ParamAttr as param_attr. If the Initializer of the param_attr
            is not set, the parameter is initialized with Xavier. Default: None.
        bias_attr (ParamAttr|None): The parameter attribute for the bias
            of GRU unit.
            If it is set to None or one attribute of ParamAttr, gru_unit will 
            create ParamAttr as bias_attr. If the Initializer of the bias_attr
            is not set, the bias is initialized zero. Default: None.
        gate_activation (function|None): The activation function for gates (actGate).
                                  Default: 'fluid.layers.sigmoid'
        activation (function|None): The activation function for cell (actNode).
                             Default: 'fluid.layers.tanh'
        dtype(string): data type used in this unit
        name(string): name used to identify parameters and biases

    Returns:
        rnn_out(Tensor),last_hidden(Tensor)
            - rnn_out is the GRU hidden output, with shape (seq_len x batch_size x hidden_size); \
              if bidirectional is set to True, the shape will be ( seq_len x batch_size x hidden_size*2)
            - last_hidden is the hidden state of the last step of the GRU, \
              with shape ( num_layers x batch_size x hidden_size ); \
              if bidirectional is set to True, the shape will be ( num_layers*2 x batch_size x hidden_size),
              which can be reshaped to a tensor with shape ( num_layers x 2 x batch_size x hidden_size)

    Examples:
        .. code-block:: python
            
            import paddle.fluid.layers as layers
            from paddle.fluid.contrib.layers import basic_gru

            batch_size = 20
            input_size = 128
            hidden_size = 256
            num_layers = 2
            dropout = 0.5
            bidirectional = True
            batch_first = False

            input = layers.data( name = "input", shape = [-1, batch_size, input_size], dtype='float32')
            pre_hidden = layers.data( name = "pre_hidden", shape=[-1, hidden_size], dtype='float32')
            sequence_length = layers.data( name="sequence_length", shape=[-1], dtype='int32')


            rnn_out, last_hidden = basic_gru( input, pre_hidden, hidden_size, num_layers = num_layers, \
                    sequence_length = sequence_length, dropout_prob=dropout, bidirectional = bidirectional, \
                    batch_first = batch_first)

    """

    fw_unit_list = []

    for i in range(num_layers):
        new_name = name + "_layers_" + str(i)
        if param_attr is not None and param_attr.name is not None:
            layer_param_attr = copy.deepcopy(param_attr)
            layer_param_attr.name += "_fw_w_" + str(i)
        else:
            layer_param_attr = param_attr
        if bias_attr is not None and bias_attr.name is not None:
            layer_bias_attr = copy.deepcopy(bias_attr)
            layer_bias_attr.name += "_fw_b_" + str(i)
        else:
            layer_bias_attr = bias_attr
        fw_unit_list.append(
            BasicGRUUnit(new_name, hidden_size, layer_param_attr,
                         layer_bias_attr, gate_activation, activation, dtype))
    if bidirectional:
        bw_unit_list = []

        for i in range(num_layers):
            new_name = name + "_reverse_layers_" + str(i)
            if param_attr is not None and param_attr.name is not None:
                layer_param_attr = copy.deepcopy(param_attr)
                layer_param_attr.name += "_bw_w_" + str(i)
            else:
                layer_param_attr = param_attr
            if bias_attr is not None and bias_attr.name is not None:
                layer_bias_attr = copy.deepcopy(bias_attr)
                layer_bias_attr.name += "_bw_b_" + str(i)
            else:
                layer_bias_attr = bias_attr

            bw_unit_list.append(
                BasicGRUUnit(new_name, hidden_size, layer_param_attr,
                             layer_bias_attr, gate_activation, activation,
                             dtype))

    if batch_first:
        input = layers.transpose(input, [1, 0, 2])

    mask = None
    if sequence_length:
        max_seq_len = layers.shape(input)[0]
        mask = layers.sequence_mask(
            sequence_length, maxlen=max_seq_len, dtype='float32')
        mask = layers.transpose(mask, [1, 0])

    direc_num = 1
    if bidirectional:
        direc_num = 2
    if init_hidden:
        init_hidden = layers.reshape(
            init_hidden, shape=[num_layers, direc_num, -1, hidden_size])

    def get_single_direction_output(rnn_input,
                                    unit_list,
                                    mask=None,
                                    direc_index=0):
        rnn = StaticRNN()
        with rnn.step():
            step_input = rnn.step_input(rnn_input)

            if mask:
                step_mask = rnn.step_input(mask)

            for i in range(num_layers):
                if init_hidden:
                    pre_hidden = rnn.memory(init=init_hidden[i, direc_index])
                else:
                    pre_hidden = rnn.memory(
                        batch_ref=rnn_input,
                        shape=[-1, hidden_size],
                        ref_batch_dim_idx=1)

                new_hidden = unit_list[i](step_input, pre_hidden)

                if mask:
                    new_hidden = layers.elementwise_mul(
                        new_hidden, step_mask, axis=0) - layers.elementwise_mul(
                            pre_hidden, (step_mask - 1), axis=0)
                rnn.update_memory(pre_hidden, new_hidden)

                rnn.step_output(new_hidden)

                step_input = new_hidden
                if dropout_prob != None and dropout_prob > 0.0:
                    step_input = layers.dropout(
                        step_input,
                        dropout_prob=dropout_prob, )

            rnn.step_output(step_input)

        rnn_out = rnn()

        last_hidden_array = []
        rnn_output = rnn_out[-1]
        for i in range(num_layers):
            last_hidden = rnn_out[i]
            last_hidden = last_hidden[-1]
            last_hidden_array.append(last_hidden)

        last_hidden_output = layers.concat(last_hidden_array, axis=0)
        last_hidden_output = layers.reshape(
            last_hidden_output, shape=[num_layers, -1, hidden_size])

        return rnn_output, last_hidden_output
        # seq_len, batch_size, hidden_size

    fw_rnn_out, fw_last_hidden = get_single_direction_output(
        input, fw_unit_list, mask, direc_index=0)

    if bidirectional:
        bw_input = layers.reverse(input, axis=[0])
        bw_mask = None
        if mask:
            bw_mask = layers.reverse(mask, axis=[0])
        bw_rnn_out, bw_last_hidden = get_single_direction_output(
            bw_input, bw_unit_list, bw_mask, direc_index=1)

        bw_rnn_out = layers.reverse(bw_rnn_out, axis=[0])

        rnn_out = layers.concat([fw_rnn_out, bw_rnn_out], axis=2)
        last_hidden = layers.concat([fw_last_hidden, bw_last_hidden], axis=1)

        last_hidden = layers.reshape(
            last_hidden, shape=[num_layers * direc_num, -1, hidden_size])

        if batch_first:
            rnn_out = layers.transpose(rnn_out, [1, 0, 2])
        return rnn_out, last_hidden
    else:

        rnn_out = fw_rnn_out
        last_hidden = fw_last_hidden

        if batch_first:
            rnn_out = layers.transpose(rnn_out, [1, 0, 2])

        return rnn_out, last_hidden
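# The recurrence BasicGRUUnit applies at each step, written out in NumPy
# following the equations in the docstring above; the weight matrices here
# are random stand-ins for the learned parameters.
import numpy as np

def sigmoid(x):
    return 1.0 / (1.0 + np.exp(-x))

input_size, hidden_size = 6, 8
x_t = np.random.randn(input_size)
h_prev = np.random.randn(hidden_size)
W_u, W_r, W_c = (np.random.randn(hidden_size, input_size) for _ in range(3))
U_u, U_r, U_c = (np.random.randn(hidden_size, hidden_size) for _ in range(3))
b_u, b_r, b_c = (np.zeros(hidden_size) for _ in range(3))

u_t = sigmoid(W_u @ x_t + U_u @ h_prev + b_u)          # update gate
r_t = sigmoid(W_r @ x_t + U_r @ h_prev + b_r)          # reset gate
m_t = np.tanh(W_c @ x_t + U_c @ (r_t * h_prev) + b_c)  # candidate state
h_t = u_t * h_prev + (1.0 - u_t) * m_t                 # new hidden state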
Example #12
    def padding_rnn(input_embedding, len=3, init_hidden=None, init_cell=None):
        weight_1_arr = []
        weight_2_arr = []
        bias_arr = []
        hidden_array = []
        cell_array = []
        mask_array = []
        for i in range(num_layers):
            weight_1 = layers.create_parameter(
                [hidden_size * 2, hidden_size * 4],
                dtype="float32",
                name="fc_weight1_" + str(i),
                default_initializer=fluid.initializer.UniformInitializer(
                    low=-init_scale, high=init_scale))
            weight_1_arr.append(weight_1)
            bias_1 = layers.create_parameter(
                [hidden_size * 4],
                dtype="float32",
                name="fc_bias1_" + str(i),
                default_initializer=fluid.initializer.Constant(0.0))
            bias_arr.append(bias_1)

            pre_hidden = layers.slice(init_hidden,
                                      axes=[0],
                                      starts=[i],
                                      ends=[i + 1])
            pre_cell = layers.slice(init_cell,
                                    axes=[0],
                                    starts=[i],
                                    ends=[i + 1])
            pre_hidden = layers.reshape(pre_hidden, shape=[-1, hidden_size])
            pre_cell = layers.reshape(pre_cell, shape=[-1, hidden_size])
            hidden_array.append(pre_hidden)
            cell_array.append(pre_cell)

        input_embedding = layers.transpose(input_embedding, perm=[1, 0, 2])
        rnn = PaddingRNN()

        with rnn.step():
            input = rnn.step_input(input_embedding)
            for k in range(num_layers):
                pre_hidden = rnn.memory(init=hidden_array[k])
                pre_cell = rnn.memory(init=cell_array[k])
                weight_1 = weight_1_arr[k]
                bias = bias_arr[k]

                nn = layers.concat([input, pre_hidden], 1)
                gate_input = layers.matmul(x=nn, y=weight_1)

                gate_input = layers.elementwise_add(gate_input, bias)
                i = layers.slice(gate_input,
                                 axes=[1],
                                 starts=[0],
                                 ends=[hidden_size])
                j = layers.slice(gate_input,
                                 axes=[1],
                                 starts=[hidden_size],
                                 ends=[hidden_size * 2])
                f = layers.slice(gate_input,
                                 axes=[1],
                                 starts=[hidden_size * 2],
                                 ends=[hidden_size * 3])
                o = layers.slice(gate_input,
                                 axes=[1],
                                 starts=[hidden_size * 3],
                                 ends=[hidden_size * 4])

                c = pre_cell * layers.sigmoid(f) + layers.sigmoid(
                    i) * layers.tanh(j)
                m = layers.tanh(c) * layers.sigmoid(o)

                rnn.update_memory(pre_hidden, m)
                rnn.update_memory(pre_cell, c)

                rnn.step_output(m)
                rnn.step_output(c)

                input = m

                if dropout != None and dropout > 0.0:
                    input = layers.dropout(
                        input,
                        dropout_prob=dropout,
                        dropout_implementation='upscale_in_train')

            rnn.step_output(input)
        rnnout = rnn()

        last_hidden_array = []
        last_cell_array = []
        real_res = rnnout[-1]
        for i in range(num_layers):
            m = rnnout[i * 2]
            c = rnnout[i * 2 + 1]
            m.stop_gradient = True
            c.stop_gradient = True
            last_h = layers.slice(m,
                                  axes=[0],
                                  starts=[num_steps - 1],
                                  ends=[num_steps])
            last_hidden_array.append(last_h)
            last_c = layers.slice(c,
                                  axes=[0],
                                  starts=[num_steps - 1],
                                  ends=[num_steps])
            last_cell_array.append(last_c)
        real_res = layers.transpose(x=real_res, perm=[1, 0, 2])
        last_hidden = layers.concat(last_hidden_array, 0)
        last_cell = layers.concat(last_cell_array, 0)

        return real_res, last_hidden, last_cell
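# The gate arithmetic inside the rnn.step() block above, in NumPy: a single
# fused matmul produces the i/j/f/o pre-activations, which are then sliced
# apart exactly like the four layers.slice calls (stand-in weights).
import numpy as np

def sigmoid(x):
    return 1.0 / (1.0 + np.exp(-x))

hidden_size = 4
inp = np.random.randn(hidden_size)
pre_hidden = np.random.randn(hidden_size)
pre_cell = np.random.randn(hidden_size)
weight_1 = np.random.randn(hidden_size * 2, hidden_size * 4)
bias = np.zeros(hidden_size * 4)

gate_input = np.concatenate([inp, pre_hidden]) @ weight_1 + bias
i, j, f, o = np.split(gate_input, 4)                   # input, cell, forget, output gates

c = pre_cell * sigmoid(f) + sigmoid(i) * np.tanh(j)    # new cell state
m = np.tanh(c) * sigmoid(o)                            # new hidden state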
Example #13
def graph_transformer(gw,
                      feature,
                      edge_feature,
                      hidden_size,
                      name,
                      num_heads=4,
                      attn_drop=False,
                      concat=True,
                      skip_feat=True,
                      gate=False,
                      norm=True,
                      relu=True,
                      is_test=False):
    """Implementation of graph Transformer from UniMP

    This is an implementation of the paper Unified Message Passing Model for Semi-Supervised Classification
    (https://arxiv.org/abs/2009.03509).

    Args:
        name: Graph Transformer layer name.
        
        gw: Graph wrapper object (:code:`StaticGraphWrapper` or :code:`GraphWrapper`)

        feature: A tensor with shape (num_nodes, feature_size).

        hidden_size: The hidden size for graph transformer.

        num_heads: The head number in graph transformer.

        attn_drop: Dropout rate for attention.
        
        edge_feature: A tensor with shape (num_edges, feature_size).
        
        concat: Whether to reshape the (num_nodes, num_heads, hidden_size) output by concatenation to (num_nodes, hidden_size * num_heads); otherwise average over heads to (num_nodes, hidden_size).

        skip_feat: Whether to use a skip connection.

        gate: Whether to combine skip_feat and the output with a gate weight.

        norm: Whether to apply layer_norm to the output.

        relu: Whether to apply a relu activation to the output.

        is_test: Whether in test phase.

    Return:
        A tensor with shape (num_nodes, hidden_size * num_heads) or (num_nodes, hidden_size)
    """
    def send_attention(src_feat, dst_feat, edge_feat):
        if edge_feat is None or not edge_feat:
            output = src_feat["k_h"] * dst_feat["q_h"]
            output = L.reduce_sum(output, -1)
            output = output / (hidden_size**0.5)
            return {
                "alpha": output,
                "v": src_feat["v_h"]
            }  # batch x h     batch x h x feat
        else:
            edge_feat = edge_feat["edge"]
            edge_feat = L.reshape(edge_feat, [-1, num_heads, hidden_size])
            output = (src_feat["k_h"] + edge_feat) * dst_feat["q_h"]
            output = L.reduce_sum(output, -1)
            output = output / (hidden_size**0.5)
            return {
                "alpha": output,
                "v": (src_feat["v_h"] + edge_feat)
            }  # batch x h     batch x h x feat

    class Reduce_attention():
        def __init__(self, ):
            self.alpha = None

        def __call__(self, msg):
            alpha = msg["alpha"]  # lod-tensor (batch_size, num_heads)
            if attn_drop:
                old_h = alpha
                dropout = F.data(name='attn_drop', shape=[1], dtype="int64")
                u = L.uniform_random(shape=L.cast(L.shape(alpha)[:1], 'int64'),
                                     min=0.,
                                     max=1.)
                keeped = L.cast(u > dropout, dtype="float32")
                self_attn_mask = L.scale(x=keeped,
                                         scale=10000.0,
                                         bias=-1.0,
                                         bias_after_scale=False)
                n_head_self_attn_mask = L.stack(x=[self_attn_mask] * num_heads,
                                                axis=1)
                n_head_self_attn_mask.stop_gradient = True
                alpha = n_head_self_attn_mask + alpha
                alpha = L.lod_reset(alpha, old_h)

            h = msg["v"]
            alpha = paddle_helper.sequence_softmax(alpha)
            self.alpha = alpha
            old_h = h
            h = h * alpha
            h = L.lod_reset(h, old_h)
            h = L.sequence_pool(h, "sum")
            if concat:
                h = L.reshape(h, [-1, num_heads * hidden_size])
            else:
                h = L.reduce_mean(h, dim=1)
            return h

    reduce_attention = Reduce_attention()

    q = linear(feature,
               hidden_size * num_heads,
               name=name + '_q_weight',
               init_type='gcn')
    k = linear(feature,
               hidden_size * num_heads,
               name=name + '_k_weight',
               init_type='gcn')
    v = linear(feature,
               hidden_size * num_heads,
               name=name + '_v_weight',
               init_type='gcn')

    reshape_q = L.reshape(q, [-1, num_heads, hidden_size])
    reshape_k = L.reshape(k, [-1, num_heads, hidden_size])
    reshape_v = L.reshape(v, [-1, num_heads, hidden_size])

    msg = gw.send(send_attention,
                  nfeat_list=[("q_h", reshape_q), ("k_h", reshape_k),
                              ("v_h", reshape_v)],
                  efeat_list=[('edge', edge_feature)])
    out_feat = gw.recv(msg, reduce_attention)

    if skip_feat:
        if concat:
            skip_feature = linear(feature,
                                  hidden_size * num_heads,
                                  name=name + '_skip_weight',
                                  init_type='lin')
        else:
            skip_feature = linear(feature,
                                  hidden_size,
                                  name=name + '_skip_weight',
                                  init_type='lin')

        if gate:
            temp_output = L.concat(
                [skip_feature, out_feat, out_feat - skip_feature], axis=-1)
            gate_f = L.sigmoid(
                linear(temp_output,
                       1,
                       name=name + '_gate_weight',
                       init_type='lin'))
            out_feat = skip_feature * gate_f + out_feat * (1 - gate_f)
        else:
            out_feat = skip_feature + out_feat

    if norm:
        out_feat = layer_norm(out_feat, name="ln_%s" % name)

    if relu:
        out_feat = L.relu(out_feat)

    return out_feat
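# The attention computed by send_attention and Reduce_attention above, for a
# single head and a single target node, in NumPy: scaled dot-product scores
# over incoming neighbours, a softmax over those neighbours, then a weighted
# sum of their values.
import numpy as np

hidden_size = 8
q = np.random.randn(hidden_size)            # query of the target node
k = np.random.randn(3, hidden_size)         # keys of 3 incoming neighbours
v = np.random.randn(3, hidden_size)         # values of 3 incoming neighbours

alpha = (k @ q) / np.sqrt(hidden_size)      # reduce_sum(k * q) / hidden_size**0.5
alpha = np.exp(alpha - alpha.max())
alpha = alpha / alpha.sum()                 # sequence_softmax over the neighbours
out = (alpha[:, None] * v).sum(axis=0)      # sequence_pool(h * alpha, "sum")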
Example #14
def lm_model(hidden_size,
             vocab_size,
             num_layers=2,
             num_steps=20,
             init_scale=0.1,
             dropout=None,
             rnn_model='static',
             use_dataloader=False):
    def padding_rnn(input_embedding, len=3, init_hidden=None, init_cell=None):
        weight_1_arr = []
        weight_2_arr = []
        bias_arr = []
        hidden_array = []
        cell_array = []
        mask_array = []
        for i in range(num_layers):
            weight_1 = layers.create_parameter(
                [hidden_size * 2, hidden_size * 4],
                dtype="float32",
                name="fc_weight1_" + str(i),
                default_initializer=fluid.initializer.UniformInitializer(
                    low=-init_scale, high=init_scale))
            weight_1_arr.append(weight_1)
            bias_1 = layers.create_parameter(
                [hidden_size * 4],
                dtype="float32",
                name="fc_bias1_" + str(i),
                default_initializer=fluid.initializer.Constant(0.0))
            bias_arr.append(bias_1)

            pre_hidden = layers.slice(init_hidden,
                                      axes=[0],
                                      starts=[i],
                                      ends=[i + 1])
            pre_cell = layers.slice(init_cell,
                                    axes=[0],
                                    starts=[i],
                                    ends=[i + 1])
            pre_hidden = layers.reshape(pre_hidden, shape=[-1, hidden_size])
            pre_cell = layers.reshape(pre_cell, shape=[-1, hidden_size])
            hidden_array.append(pre_hidden)
            cell_array.append(pre_cell)

        input_embedding = layers.transpose(input_embedding, perm=[1, 0, 2])
        rnn = PaddingRNN()

        with rnn.step():
            input = rnn.step_input(input_embedding)
            for k in range(num_layers):
                pre_hidden = rnn.memory(init=hidden_array[k])
                pre_cell = rnn.memory(init=cell_array[k])
                weight_1 = weight_1_arr[k]
                bias = bias_arr[k]

                nn = layers.concat([input, pre_hidden], 1)
                gate_input = layers.matmul(x=nn, y=weight_1)

                gate_input = layers.elementwise_add(gate_input, bias)
                i = layers.slice(gate_input,
                                 axes=[1],
                                 starts=[0],
                                 ends=[hidden_size])
                j = layers.slice(gate_input,
                                 axes=[1],
                                 starts=[hidden_size],
                                 ends=[hidden_size * 2])
                f = layers.slice(gate_input,
                                 axes=[1],
                                 starts=[hidden_size * 2],
                                 ends=[hidden_size * 3])
                o = layers.slice(gate_input,
                                 axes=[1],
                                 starts=[hidden_size * 3],
                                 ends=[hidden_size * 4])

                c = pre_cell * layers.sigmoid(f) + layers.sigmoid(
                    i) * layers.tanh(j)
                m = layers.tanh(c) * layers.sigmoid(o)

                rnn.update_memory(pre_hidden, m)
                rnn.update_memory(pre_cell, c)

                rnn.step_output(m)
                rnn.step_output(c)

                input = m

                if dropout != None and dropout > 0.0:
                    input = layers.dropout(
                        input,
                        dropout_prob=dropout,
                        dropout_implementation='upscale_in_train')

            rnn.step_output(input)
        rnnout = rnn()

        last_hidden_array = []
        last_cell_array = []
        real_res = rnnout[-1]
        for i in range(num_layers):
            m = rnnout[i * 2]
            c = rnnout[i * 2 + 1]
            m.stop_gradient = True
            c.stop_gradient = True
            last_h = layers.slice(m,
                                  axes=[0],
                                  starts=[num_steps - 1],
                                  ends=[num_steps])
            last_hidden_array.append(last_h)
            last_c = layers.slice(c,
                                  axes=[0],
                                  starts=[num_steps - 1],
                                  ends=[num_steps])
            last_cell_array.append(last_c)
        real_res = layers.transpose(x=real_res, perm=[1, 0, 2])
        last_hidden = layers.concat(last_hidden_array, 0)
        last_cell = layers.concat(last_cell_array, 0)

        return real_res, last_hidden, last_cell

    def encoder_static(input_embedding,
                       len=3,
                       init_hidden=None,
                       init_cell=None):

        weight_1_arr = []
        weight_2_arr = []
        bias_arr = []
        hidden_array = []
        cell_array = []
        mask_array = []
        for i in range(num_layers):
            weight_1 = layers.create_parameter(
                [hidden_size * 2, hidden_size * 4],
                dtype="float32",
                name="fc_weight1_" + str(i),
                default_initializer=fluid.initializer.UniformInitializer(
                    low=-init_scale, high=init_scale))
            weight_1_arr.append(weight_1)
            bias_1 = layers.create_parameter(
                [hidden_size * 4],
                dtype="float32",
                name="fc_bias1_" + str(i),
                default_initializer=fluid.initializer.Constant(0.0))
            bias_arr.append(bias_1)

            pre_hidden = layers.slice(init_hidden,
                                      axes=[0],
                                      starts=[i],
                                      ends=[i + 1])
            pre_cell = layers.slice(init_cell,
                                    axes=[0],
                                    starts=[i],
                                    ends=[i + 1])
            pre_hidden = layers.reshape(pre_hidden,
                                        shape=[-1, hidden_size],
                                        inplace=True)
            pre_cell = layers.reshape(pre_cell,
                                      shape=[-1, hidden_size],
                                      inplace=True)
            hidden_array.append(pre_hidden)
            cell_array.append(pre_cell)

        res = []
        sliced_inputs = layers.split(input_embedding,
                                     num_or_sections=len,
                                     dim=1)

        for index in range(len):
            input = sliced_inputs[index]
            input = layers.reshape(input,
                                   shape=[-1, hidden_size],
                                   inplace=True)
            for k in range(num_layers):
                pre_hidden = hidden_array[k]
                pre_cell = cell_array[k]
                weight_1 = weight_1_arr[k]
                bias = bias_arr[k]
                nn = layers.concat([input, pre_hidden], 1)
                gate_input = layers.matmul(x=nn, y=weight_1)

                gate_input = layers.elementwise_add(gate_input, bias)
                i, j, f, o = layers.split(gate_input,
                                          num_or_sections=4,
                                          dim=-1)

                c = pre_cell * layers.sigmoid(f) + layers.sigmoid(
                    i) * layers.tanh(j)
                m = layers.tanh(c) * layers.sigmoid(o)

                hidden_array[k] = m
                cell_array[k] = c
                input = m

                if dropout != None and dropout > 0.0:
                    input = layers.dropout(
                        input,
                        dropout_prob=dropout,
                        dropout_implementation='upscale_in_train')

            res.append(input)

        last_hidden = layers.concat(hidden_array, 1)
        last_hidden = layers.reshape(last_hidden,
                                     shape=[-1, num_layers, hidden_size],
                                     inplace=True)
        last_hidden = layers.transpose(x=last_hidden, perm=[1, 0, 2])

        last_cell = layers.concat(cell_array, 1)
        last_cell = layers.reshape(last_cell,
                                   shape=[-1, num_layers, hidden_size])
        last_cell = layers.transpose(x=last_cell, perm=[1, 0, 2])

        real_res = layers.concat(res, 0)
        real_res = layers.reshape(real_res,
                                  shape=[len, -1, hidden_size],
                                  inplace=True)
        real_res = layers.transpose(x=real_res, perm=[1, 0, 2])

        return real_res, last_hidden, last_cell

    x = fluid.data(name="x", shape=[None, num_steps, 1], dtype='int64')
    y = fluid.data(name="y", shape=[None, 1], dtype='int64')

    if use_dataloader:
        dataloader = fluid.io.DataLoader.from_generator(feed_list=[x, y],
                                                        capacity=16,
                                                        iterable=False,
                                                        use_double_buffer=True)

    init_hidden = fluid.data(name="init_hidden",
                             shape=[None, num_layers, hidden_size],
                             dtype='float32')
    init_cell = fluid.data(name="init_cell",
                           shape=[None, num_layers, hidden_size],
                           dtype='float32')
    init_cell.persistable = True
    init_hidden.persistable = True

    init_hidden = layers.transpose(init_hidden, perm=[1, 0, 2])
    init_cell = layers.transpose(init_cell, perm=[1, 0, 2])

    init_hidden_reshape = layers.reshape(init_hidden,
                                         shape=[num_layers, -1, hidden_size])
    init_cell_reshape = layers.reshape(init_cell,
                                       shape=[num_layers, -1, hidden_size])

    x_emb = layers.embedding(
        input=x,
        size=[vocab_size, hidden_size],
        dtype='float32',
        is_sparse=False,
        param_attr=fluid.ParamAttr(
            name='embedding_para',
            initializer=fluid.initializer.UniformInitializer(low=-init_scale,
                                                             high=init_scale)))

    x_emb = layers.reshape(x_emb,
                           shape=[-1, num_steps, hidden_size],
                           inplace=True)
    if dropout is not None and dropout > 0.0:
        x_emb = layers.dropout(x_emb,
                               dropout_prob=dropout,
                               dropout_implementation='upscale_in_train')

    if rnn_model == "padding":
        rnn_out, last_hidden, last_cell = padding_rnn(
            x_emb,
            len=num_steps,
            init_hidden=init_hidden_reshape,
            init_cell=init_cell_reshape)
    elif rnn_model == "static":
        rnn_out, last_hidden, last_cell = encoder_static(
            x_emb,
            len=num_steps,
            init_hidden=init_hidden_reshape,
            init_cell=init_cell_reshape)
    elif rnn_model == "cudnn":
        x_emb = layers.transpose(x_emb, perm=[1, 0, 2])
        rnn_out, last_hidden, last_cell = layers.lstm(
            x_emb,
            init_hidden_reshape,
            init_cell_reshape,
            num_steps,
            hidden_size,
            num_layers,
            is_bidirec=False,
            default_initializer=fluid.initializer.UniformInitializer(
                low=-init_scale, high=init_scale))
        rnn_out = layers.transpose(rnn_out, perm=[1, 0, 2])
    elif rnn_model == "basic_lstm":
        rnn_out, last_hidden, last_cell = basic_lstm( x_emb, init_hidden, init_cell, hidden_size, \
                num_layers=num_layers, batch_first=True, dropout_prob=dropout, \
                param_attr = ParamAttr( initializer=fluid.initializer.UniformInitializer(low=-init_scale, high=init_scale) ), \
                bias_attr = ParamAttr( initializer = fluid.initializer.Constant(0.0) ), \
                forget_bias = 0.0)
    else:
        print("type not support")
        return

    rnn_out = layers.reshape(rnn_out,
                             shape=[-1, num_steps, hidden_size],
                             inplace=True)

    softmax_weight = layers.create_parameter(
        [hidden_size, vocab_size],
        dtype="float32",
        name="softmax_weight",
        default_initializer=fluid.initializer.UniformInitializer(
            low=-init_scale, high=init_scale))
    softmax_bias = layers.create_parameter(
        [vocab_size],
        dtype="float32",
        name='softmax_bias',
        default_initializer=fluid.initializer.UniformInitializer(
            low=-init_scale, high=init_scale))

    projection = layers.matmul(rnn_out, softmax_weight)
    projection = layers.elementwise_add(projection, softmax_bias)
    projection = layers.reshape(projection,
                                shape=[-1, vocab_size],
                                inplace=True)

    loss = layers.softmax_with_cross_entropy(logits=projection,
                                             label=y,
                                             soft_label=False)

    loss = layers.reshape(loss, shape=[-1, num_steps], inplace=True)
    loss = layers.reduce_mean(loss, dim=[0])
    loss = layers.reduce_sum(loss)

    loss.persistable = True
    last_cell.persistable = True
    last_hidden.persistable = True

    # This will feed last_hidden, last_cell to init_hidden, init_cell, which
    # can be used directly in the next batch. This avoids fetching last_hidden
    # and last_cell and feeding init_hidden and init_cell in each training
    # step.
    last_hidden = layers.transpose(last_hidden, perm=[1, 0, 2])
    last_cell = layers.transpose(last_cell, perm=[1, 0, 2])
    feeding_list = ['x', 'y', 'init_hidden', 'init_cell']
    if use_dataloader:
        return loss, last_hidden, last_cell, feeding_list, dataloader
    else:
        return loss, last_hidden, last_cell, feeding_list
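
# --- Usage sketch (not part of the original example) ---
# A minimal, assumed training loop for the graph built above. For simplicity it uses the
# plain fetch-and-feed pattern: the final states of one batch are fetched and fed back as
# init_hidden / init_cell of the next one (the persistable flags above are what allow an
# optimized runner to skip this Python round trip). `lm_model`, `train_data` and the
# hyper-parameter values below are hypothetical placeholders.
import numpy as np
import paddle.fluid as fluid

batch_size, num_steps, num_layers, hidden_size = 16, 20, 2, 200  # illustrative values

loss, last_hidden, last_cell, feeding_list = lm_model()  # hypothetical builder call
exe = fluid.Executor(fluid.CPUPlace())
exe.run(fluid.default_startup_program())

init_h = np.zeros((batch_size, num_layers, hidden_size), dtype='float32')
init_c = np.zeros((batch_size, num_layers, hidden_size), dtype='float32')
for x_data, y_data in train_data:  # hypothetical data iterator
    loss_np, init_h, init_c = exe.run(
        fluid.default_main_program(),
        feed={'x': x_data, 'y': y_data,
              'init_hidden': init_h, 'init_cell': init_c},
        fetch_list=[loss, last_hidden, last_cell])
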
Exemple #15
    def encoder_static(input_embedding,
                       len=3,
                       init_hidden=None,
                       init_cell=None):

        weight_1_arr = []
        weight_2_arr = []
        bias_arr = []
        hidden_array = []
        cell_array = []
        mask_array = []
        for i in range(num_layers):
            weight_1 = layers.create_parameter(
                [hidden_size * 2, hidden_size * 4],
                dtype="float32",
                name="fc_weight1_" + str(i),
                default_initializer=fluid.initializer.UniformInitializer(
                    low=-init_scale, high=init_scale))
            weight_1_arr.append(weight_1)
            bias_1 = layers.create_parameter(
                [hidden_size * 4],
                dtype="float32",
                name="fc_bias1_" + str(i),
                default_initializer=fluid.initializer.Constant(0.0))
            bias_arr.append(bias_1)

            pre_hidden = layers.slice(init_hidden,
                                      axes=[0],
                                      starts=[i],
                                      ends=[i + 1])
            pre_cell = layers.slice(init_cell,
                                    axes=[0],
                                    starts=[i],
                                    ends=[i + 1])
            pre_hidden = layers.reshape(pre_hidden,
                                        shape=[-1, hidden_size],
                                        inplace=True)
            pre_cell = layers.reshape(pre_cell,
                                      shape=[-1, hidden_size],
                                      inplace=True)
            hidden_array.append(pre_hidden)
            cell_array.append(pre_cell)

        res = []
        sliced_inputs = layers.split(input_embedding,
                                     num_or_sections=len,
                                     dim=1)

        for index in range(len):
            input = sliced_inputs[index]
            input = layers.reshape(input,
                                   shape=[-1, hidden_size],
                                   inplace=True)
            for k in range(num_layers):
                pre_hidden = hidden_array[k]
                pre_cell = cell_array[k]
                weight_1 = weight_1_arr[k]
                bias = bias_arr[k]
                nn = layers.concat([input, pre_hidden], 1)
                gate_input = layers.matmul(x=nn, y=weight_1)

                gate_input = layers.elementwise_add(gate_input, bias)
                i, j, f, o = layers.split(gate_input,
                                          num_or_sections=4,
                                          dim=-1)

                c = pre_cell * layers.sigmoid(f) + layers.sigmoid(
                    i) * layers.tanh(j)
                m = layers.tanh(c) * layers.sigmoid(o)

                hidden_array[k] = m
                cell_array[k] = c
                input = m

                if dropout is not None and dropout > 0.0:
                    input = layers.dropout(
                        input,
                        dropout_prob=dropout,
                        dropout_implementation='upscale_in_train')

            res.append(input)

        last_hidden = layers.concat(hidden_array, 1)
        last_hidden = layers.reshape(last_hidden,
                                     shape=[-1, num_layers, hidden_size],
                                     inplace=True)
        last_hidden = layers.transpose(x=last_hidden, perm=[1, 0, 2])

        last_cell = layers.concat(cell_array, 1)
        last_cell = layers.reshape(last_cell,
                                   shape=[-1, num_layers, hidden_size])
        last_cell = layers.transpose(x=last_cell, perm=[1, 0, 2])

        real_res = layers.concat(res, 0)
        real_res = layers.reshape(real_res,
                                  shape=[len, -1, hidden_size],
                                  inplace=True)
        real_res = layers.transpose(x=real_res, perm=[1, 0, 2])

        return real_res, last_hidden, last_cell
Exemple #16
def knowledge_seq2seq(config):
    """ knowledge seq2seq """
    emb_size = config.embed_size
    hidden_size = config.hidden_size
    input_size = emb_size
    num_layers = config.num_layers
    bi_direc = config.bidirectional
    batch_size = config.batch_size
    vocab_size = config.vocab_size
    run_type = config.run_type

    enc_input = layers.data(name="enc_input",
                            shape=[1],
                            dtype='int64',
                            lod_level=1)  #enc_input --> goal
    enc_mask = layers.data(name="enc_mask", shape=[-1, 1], dtype='float32')
    goal_input = layers.data(name="goal_input",
                             shape=[1],
                             dtype='int64',
                             lod_level=1)  #goal_input --> x
    cue_input = layers.data(name="cue_input",
                            shape=[1],
                            dtype='int64',
                            lod_level=1)  #cue_input --> kg
    #cue_mask = layers.data(name='cue_mask', shape=[-1, 1], dtype='float32')
    memory_mask = layers.data(name='memory_mask',
                              shape=[-1, 1],
                              dtype='float32')
    tar_input = layers.data(name='tar_input',
                            shape=[1],
                            dtype='int64',
                            lod_level=1)  #tar_input --> y
    # tar_mask = layers.data(name="tar_mask", shape=[-1, 1], dtype='float32')

    rnn_hidden_size = hidden_size
    if bi_direc:
        rnn_hidden_size //= 2

    enc_out, enc_last_hidden = \
        rnn_encoder(enc_input, vocab_size, input_size, rnn_hidden_size, batch_size, num_layers, bi_direc,
                    dropout=0.0, batch_first=True, name="rnn_enc")
    goal_out, goal_last_hidden = \
        rnn_encoder(goal_input, vocab_size, input_size, rnn_hidden_size, batch_size, num_layers, bi_direc,
                    dropout=0.0, batch_first=True, name="rnn_enc1")
    context_goal_out = fluid.layers.concat(
        input=[enc_last_hidden, goal_last_hidden], axis=2)
    context_goal_out = layers.reshape(context_goal_out,
                                      shape=[-1, 1, rnn_hidden_size * 4])
    # context_goal_out = layers.squeeze(context_goal_out, axes=[1])
    context_goal_out = fluid.layers.fc(context_goal_out,
                                       size=rnn_hidden_size * 2,
                                       bias_attr=False)
    context_goal_out = layers.unsqueeze(context_goal_out, axes=[0])
    bridge_out = fc(context_goal_out, hidden_size, hidden_size, name="bridge")
    bridge_out = layers.tanh(bridge_out)

    cue_last_mask = layers.data(name='cue_last_mask',
                                shape=[-1],
                                dtype='float32')
    knowledge_out, knowledge_last_hidden = \
        rnn_encoder(cue_input, vocab_size, input_size, rnn_hidden_size, batch_size, num_layers, bi_direc,
                    dropout=0.0, batch_first=True, last_mask=cue_last_mask, name="knowledge_enc")

    query = layers.slice(bridge_out, axes=[0], starts=[0], ends=[1])
    query = layers.squeeze(query, axes=[0])
    query = layers.unsqueeze(query, axes=[1])
    query = layers.reshape(query, shape=[batch_size, -1, hidden_size])
    cue_memory = layers.slice(knowledge_last_hidden,
                              axes=[0],
                              starts=[0],
                              ends=[1])
    cue_memory = layers.reshape(cue_memory,
                                shape=[batch_size, -1, hidden_size])
    memory_mask = layers.reshape(memory_mask, shape=[batch_size, 1, -1])

    weighted_cue, cue_att = dot_attention(query, cue_memory, mask=memory_mask)

    cue_att = layers.reshape(cue_att, shape=[batch_size, -1])

    knowledge = weighted_cue
    if config.use_posterior:
        print("config.use_posterior", config.use_posterior)
        target_out, target_last_hidden = \
            rnn_encoder(tar_input, vocab_size, input_size, rnn_hidden_size, batch_size, num_layers, bi_direc,
                        dropout=0.0, batch_first=True, name="knowledge_enc1")
        target_goal_out = fluid.layers.concat(
            input=[target_last_hidden, goal_last_hidden], axis=2)
        target_goal_out = layers.reshape(target_goal_out,
                                         shape=[-1, 1, rnn_hidden_size * 4])
        # target_goal_out = layers.squeeze(target_goal_out, axes=[1])
        target_goal_out = fluid.layers.fc(target_goal_out,
                                          size=rnn_hidden_size * 2,
                                          bias_attr=False)
        target_goal_out = layers.unsqueeze(target_goal_out, axes=[0])

        # get attention
        # target_query = layers.slice(target_last_hidden, axes=[0], starts=[0], ends=[1])
        target_query = layers.slice(target_goal_out,
                                    axes=[0],
                                    starts=[0],
                                    ends=[1])
        target_query = layers.squeeze(target_query, axes=[0])
        target_query = layers.unsqueeze(target_query, axes=[1])
        target_query = layers.reshape(target_query,
                                      shape=[batch_size, -1, hidden_size])

        weight_target, target_att = dot_attention(target_query,
                                                  cue_memory,
                                                  mask=memory_mask)
        target_att = layers.reshape(target_att, shape=[batch_size, -1])
        # add to output
        knowledge = weight_target

    enc_memory_mask = layers.data(name="enc_memory_mask",
                                  shape=[-1, 1],
                                  dtype='float32')
    enc_memory_mask = layers.unsqueeze(enc_memory_mask, axes=[1])
    # decoder init_hidden, enc_memory, enc_mask
    dec_init_hidden = bridge_out
    pad_value = fluid.layers.assign(input=np.array([0.0], dtype='float32'))

    enc_memory, origl_len_1 = layers.sequence_pad(x=enc_out,
                                                  pad_value=pad_value)
    enc_memory.persistable = True

    gru_unit = GRU_unit(input_size + hidden_size,
                        hidden_size,
                        num_layers=num_layers,
                        dropout=0.0,
                        name="decoder_gru_unit")

    cue_gru_unit = GRU_unit(hidden_size + hidden_size,
                            hidden_size,
                            num_layers=num_layers,
                            dropout=0.0,
                            name="decoder_cue_gru_unit")

    tgt_vocab_size = config.vocab_size
    if run_type == "train":
        if config.use_bow:
            bow_logits = fc(knowledge,
                            hidden_size,
                            hidden_size,
                            name='bow_fc_1')
            bow_logits = layers.tanh(bow_logits)
            bow_logits = fc(bow_logits,
                            hidden_size,
                            tgt_vocab_size,
                            name='bow_fc_2')
            bow_logits = layers.softmax(bow_logits)

            bow_label = layers.data(name='bow_label',
                                    shape=[-1, config.max_len],
                                    dtype='int64')
            bow_mask = layers.data(name="bow_mask",
                                   shape=[-1, config.max_len],
                                   dtype='float32')

            bow_logits = layers.expand(bow_logits, [1, config.max_len, 1])
            bow_logits = layers.reshape(bow_logits, shape=[-1, tgt_vocab_size])
            bow_label = layers.reshape(bow_label, shape=[-1, 1])
            bow_loss = layers.cross_entropy(bow_logits,
                                            bow_label,
                                            soft_label=False)
            bow_loss = layers.reshape(bow_loss, shape=[-1, config.max_len])

            bow_loss *= bow_mask
            bow_loss = layers.reduce_sum(bow_loss, dim=[1])
            bow_loss = layers.reduce_mean(bow_loss)

        dec_input = layers.data(name="dec_input",
                                shape=[-1, 1, 1],
                                dtype='int64')
        dec_mask = layers.data(name="dec_mask", shape=[-1, 1], dtype='float32')

        dec_knowledge = weight_target

        knowledge_goal_out = fluid.layers.concat(
            input=[dec_knowledge, target_query], axis=2)
        knowledge_goal_out = layers.reshape(knowledge_goal_out,
                                            shape=[-1, 1, rnn_hidden_size * 4])
        # knowledge_goal_out = layers.squeeze(knowledge_goal_out, axes=[1])
        knowledge_goal_out = fluid.layers.fc(knowledge_goal_out,
                                             size=rnn_hidden_size * 2,
                                             bias_attr=False)
        knowledge_goal_out = layers.unsqueeze(knowledge_goal_out, axes=[0])

        decoder_logits = \
            rnn_decoder(gru_unit, cue_gru_unit, dec_input, input_size, hidden_size, num_layers,
                         enc_memory, enc_memory_mask, dec_knowledge, vocab_size,
                         init_hidden=dec_init_hidden, mask=dec_mask, dropout=config.dropout)

        target_label = layers.data(name='target_label',
                                   shape=[-1, 1],
                                   dtype='int64')
        target_mask = layers.data(name='target_mask',
                                  shape=[-1, 1],
                                  dtype='float32')

        decoder_logits = layers.reshape(decoder_logits,
                                        shape=[-1, tgt_vocab_size])
        target_label = layers.reshape(target_label, shape=[-1, 1])

        nll_loss = layers.cross_entropy(decoder_logits,
                                        target_label,
                                        soft_label=False)
        nll_loss = layers.reshape(nll_loss, shape=[batch_size, -1])
        nll_loss *= target_mask
        nll_loss = layers.reduce_sum(nll_loss, dim=[1])
        nll_loss = layers.reduce_mean(nll_loss)

        prior_attn = cue_att + 1e-10
        posterior_att = target_att
        posterior_att.stop_gradient = True

        prior_attn = layers.log(prior_attn)

        kl_loss = posterior_att * (layers.log(posterior_att + 1e-10) -
                                   prior_attn)
        kl_loss = layers.reduce_mean(kl_loss)

        kl_and_nll_factor = layers.data(name='kl_and_nll_factor',
                                        shape=[1],
                                        dtype='float32')
        kl_and_nll_factor = layers.reshape(kl_and_nll_factor, shape=[-1])

        final_loss = bow_loss + kl_loss * kl_and_nll_factor + nll_loss * kl_and_nll_factor

        return [bow_loss, kl_loss, nll_loss, final_loss]

    elif run_type == "test":
        beam_size = config.beam_size
        batch_size = config.batch_size
        token = layers.fill_constant(shape=[batch_size * beam_size, 1],
                                     value=config.bos_id,
                                     dtype='int64')

        token = layers.reshape(token, shape=[-1, 1])
        max_decode_len = config.max_dec_len

        dec_knowledge = knowledge
        INF = 100000000.0

        init_score_np = np.ones([beam_size * batch_size],
                                dtype='float32') * -INF

        for i in range(batch_size):
            init_score_np[i * beam_size] = 0.0

        pre_score = layers.assign(init_score_np)

        pos_index_np = np.arange(batch_size).reshape(-1, 1)
        pos_index_np = \
            np.tile(pos_index_np, (1, beam_size)).reshape(-1).astype('int32') * beam_size

        pos_index = layers.assign(pos_index_np)

        id_array = []
        score_array = []
        index_array = []
        init_enc_memory = layers.expand(enc_memory, [1, beam_size, 1])
        init_enc_memory = layers.reshape(
            init_enc_memory, shape=[batch_size * beam_size, -1, hidden_size])
        init_enc_mask = layers.expand(enc_memory_mask, [1, beam_size, 1])
        init_enc_mask = layers.reshape(init_enc_mask,
                                       shape=[batch_size * beam_size, 1, -1])

        dec_knowledge = layers.reshape(dec_knowledge,
                                       shape=[-1, 1, hidden_size])
        init_dec_knowledge = layers.expand(dec_knowledge, [1, beam_size, 1])
        init_dec_knowledge = layers.reshape(
            init_dec_knowledge,
            shape=[batch_size * beam_size, -1, hidden_size])

        dec_init_hidden = layers.expand(dec_init_hidden, [1, 1, beam_size])
        dec_init_hidden = layers.reshape(dec_init_hidden,
                                         shape=[1, -1, hidden_size])

        length_average = config.length_average
        UNK = config.unk_id
        EOS = config.eos_id
        for i in range(1, max_decode_len + 1):
            dec_emb = get_embedding(token, input_size, vocab_size)
            dec_out, dec_last_hidden = \
                decoder_step(gru_unit, cue_gru_unit,
                             dec_emb, dec_init_hidden, input_size, hidden_size,
                             init_enc_memory, init_enc_mask, init_dec_knowledge, mask=None)
            output_in_size = hidden_size + hidden_size

            rnnout = layers.dropout(dec_out,
                                    dropout_prob=config.dropout,
                                    is_test=True)
            rnnout = fc(rnnout,
                        output_in_size,
                        hidden_size,
                        name='dec_out_fc1')
            rnnout = fc(rnnout, hidden_size, vocab_size, name='dec_out_fc2')

            log_softmax_output = log_softmax(rnnout)
            log_softmax_output = layers.squeeze(log_softmax_output, axes=[1])

            if i > 1:
                if length_average:
                    log_softmax_output = layers.elementwise_add(
                        (log_softmax_output / i),
                        (pre_score * (1.0 - 1.0 / i)),
                        axis=0)
                else:
                    log_softmax_output = layers.elementwise_add(
                        log_softmax_output, pre_score, axis=0)
            else:
                log_softmax_output = layers.elementwise_add(log_softmax_output,
                                                            pre_score,
                                                            axis=0)

            log_softmax_output = layers.reshape(log_softmax_output,
                                                shape=[batch_size, -1])

            topk_score, topk_index = layers.topk(log_softmax_output,
                                                 k=beam_size)
            topk_score = layers.reshape(topk_score, shape=[-1])
            topk_index = layers.reshape(topk_index, shape=[-1])

            vocab_var = layers.fill_constant([1],
                                             dtype='int64',
                                             value=vocab_size)
            new_token = topk_index % vocab_var

            index = topk_index // vocab_var
            id_array.append(new_token)
            index_array.append(index)
            index = index + pos_index

            score_array.append(topk_score)

            eos_ids = layers.fill_constant([beam_size * batch_size],
                                           dtype='int64',
                                           value=EOS)
            unk_ids = layers.fill_constant([beam_size * batch_size],
                                           dtype='int64',
                                           value=UNK)
            eos_eq = layers.cast(layers.equal(new_token, eos_ids),
                                 dtype='float32')

            topk_score += eos_eq * -100000000.0

            unk_eq = layers.cast(layers.equal(new_token, unk_ids),
                                 dtype='float32')
            topk_score += unk_eq * -100000000.0

            # update
            token = new_token
            pre_score = topk_score
            token = layers.reshape(token, shape=[-1, 1])

            index = layers.cast(index, dtype='int32')
            dec_last_hidden = layers.squeeze(dec_last_hidden, axes=[0])
            dec_init_hidden = layers.gather(dec_last_hidden, index=index)
            dec_init_hidden = layers.unsqueeze(dec_init_hidden, axes=[0])
            init_enc_memory = layers.gather(init_enc_memory, index)
            init_enc_mask = layers.gather(init_enc_mask, index)
            init_dec_knowledge = layers.gather(init_dec_knowledge, index)

        final_score = layers.concat(score_array, axis=0)
        final_ids = layers.concat(id_array, axis=0)
        final_index = layers.concat(index_array, axis=0)

        final_score = layers.reshape(
            final_score, shape=[max_decode_len, beam_size * batch_size])
        final_ids = layers.reshape(
            final_ids, shape=[max_decode_len, beam_size * batch_size])
        final_index = layers.reshape(
            final_index, shape=[max_decode_len, beam_size * batch_size])

        return final_score, final_ids, final_index
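
# --- Post-processing sketch (not part of the original example) ---
# One assumed way to turn the fetched numpy arrays final_score / final_ids / final_index
# (each of shape [max_decode_len, beam_size * batch_size]) back into token sequences:
# pick the best beam of each batch at the last step and walk the parent indices backwards.
import numpy as np

def backtrace(final_score, final_ids, final_index, beam_size, batch_size, eos_id):
    max_len = final_ids.shape[0]
    results = []
    for b in range(batch_size):
        offset = b * beam_size
        # best beam of this batch at the last decoding step
        beam = int(np.argmax(final_score[-1, offset:offset + beam_size]))
        tokens = []
        for t in range(max_len - 1, -1, -1):
            tokens.append(int(final_ids[t, offset + beam]))
            beam = int(final_index[t, offset + beam])
        tokens.reverse()
        if eos_id in tokens:  # truncate at the first EOS, if any
            tokens = tokens[:tokens.index(eos_id) + 1]
        results.append(tokens)
    return results
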
Exemple #17
def static_identity(x):
    x = layers.reshape(x, x.shape)
    return x
Exemple #18
def network(items_num, hidden_size, step, bs):
    stdv = 1.0 / math.sqrt(hidden_size)

    items = fluid.data(name="items", shape=[bs, -1],
                       dtype="int64")  #[batch_size, uniq_max]
    seq_index = fluid.data(name="seq_index", shape=[bs, -1, 2],
                           dtype="int32")  #[batch_size, seq_max, 2]
    last_index = fluid.data(name="last_index", shape=[bs, 2],
                            dtype="int32")  #[batch_size, 2]
    adj_in = fluid.data(name="adj_in", shape=[bs, -1, -1],
                        dtype="float32")  #[batch_size, seq_max, seq_max]
    adj_out = fluid.data(name="adj_out", shape=[bs, -1, -1],
                         dtype="float32")  #[batch_size, seq_max, seq_max]
    mask = fluid.data(name="mask", shape=[bs, -1, 1],
                      dtype="float32")  #[batch_size, seq_max, 1]
    label = fluid.data(name="label", shape=[bs, 1],
                       dtype="int64")  #[batch_size, 1]

    datas = [items, seq_index, last_index, adj_in, adj_out, mask, label]
    py_reader = fluid.io.DataLoader.from_generator(capacity=256,
                                                   feed_list=datas,
                                                   iterable=False)
    feed_datas = datas

    items_emb = fluid.embedding(
        input=items,
        param_attr=fluid.ParamAttr(name="emb",
                                   initializer=fluid.initializer.Uniform(
                                       low=-stdv, high=stdv)),
        size=[items_num, hidden_size])  #[batch_size, uniq_max, h]

    pre_state = items_emb
    for i in range(step):
        pre_state = layers.reshape(x=pre_state, shape=[bs, -1, hidden_size])
        state_in = layers.fc(
            input=pre_state,
            name="state_in",
            size=hidden_size,
            act=None,
            num_flatten_dims=2,
            param_attr=fluid.ParamAttr(
                initializer=fluid.initializer.Uniform(low=-stdv, high=stdv)),
            bias_attr=fluid.ParamAttr(initializer=fluid.initializer.Uniform(
                low=-stdv, high=stdv)))  #[batch_size, uniq_max, h]
        state_out = layers.fc(
            input=pre_state,
            name="state_out",
            size=hidden_size,
            act=None,
            num_flatten_dims=2,
            param_attr=fluid.ParamAttr(
                initializer=fluid.initializer.Uniform(low=-stdv, high=stdv)),
            bias_attr=fluid.ParamAttr(initializer=fluid.initializer.Uniform(
                low=-stdv, high=stdv)))  #[batch_size, uniq_max, h]

        state_adj_in = layers.matmul(adj_in,
                                     state_in)  #[batch_size, uniq_max, h]
        state_adj_out = layers.matmul(adj_out,
                                      state_out)  #[batch_size, uniq_max, h]

        gru_input = layers.concat([state_adj_in, state_adj_out], axis=2)

        gru_input = layers.reshape(x=gru_input, shape=[-1, hidden_size * 2])
        gru_fc = layers.fc(input=gru_input,
                           name="gru_fc",
                           size=3 * hidden_size,
                           bias_attr=False)
        pre_state, _, _ = fluid.layers.gru_unit(input=gru_fc,
                                                hidden=layers.reshape(
                                                    x=pre_state,
                                                    shape=[-1, hidden_size]),
                                                size=3 * hidden_size)

    final_state = layers.reshape(pre_state, shape=[bs, -1, hidden_size])
    seq = layers.gather_nd(final_state, seq_index)
    last = layers.gather_nd(final_state, last_index)

    seq_fc = layers.fc(
        input=seq,
        name="seq_fc",
        size=hidden_size,
        bias_attr=False,
        act=None,
        num_flatten_dims=2,
        param_attr=fluid.ParamAttr(initializer=fluid.initializer.Uniform(
            low=-stdv, high=stdv)))  #[batch_size, seq_max, h]
    last_fc = layers.fc(
        input=last,
        name="last_fc",
        size=hidden_size,
        bias_attr=False,
        act=None,
        num_flatten_dims=1,
        param_attr=fluid.ParamAttr(initializer=fluid.initializer.Uniform(
            low=-stdv, high=stdv)))  #[batch_size, h]

    seq_fc_t = layers.transpose(seq_fc, perm=[1, 0,
                                              2])  #[seq_max, batch_size, h]
    add = layers.elementwise_add(seq_fc_t, last_fc)  #[seq_max, batch_size, h]
    b = layers.create_parameter(
        shape=[hidden_size],
        dtype='float32',
        default_initializer=fluid.initializer.Constant(value=0.0))  #[h]
    add = layers.elementwise_add(add, b)  #[seq_max, batch_size, h]

    add_sigmoid = layers.sigmoid(add)  #[seq_max, batch_size, h]
    add_sigmoid = layers.transpose(add_sigmoid,
                                   perm=[1, 0, 2])  #[batch_size, seq_max, h]

    weight = layers.fc(
        input=add_sigmoid,
        name="weight_fc",
        size=1,
        act=None,
        num_flatten_dims=2,
        bias_attr=False,
        param_attr=fluid.ParamAttr(initializer=fluid.initializer.Uniform(
            low=-stdv, high=stdv)))  #[batch_size, seq_max, 1]
    weight *= mask
    weight_mask = layers.elementwise_mul(seq, weight,
                                         axis=0)  #[batch_size, seq_max, h]
    global_attention = layers.reduce_sum(weight_mask, dim=1)  #[batch_size, h]

    final_attention = layers.concat([global_attention, last],
                                    axis=1)  #[batch_size, 2*h]
    final_attention_fc = layers.fc(
        input=final_attention,
        name="final_attention_fc",
        size=hidden_size,
        bias_attr=False,
        act=None,
        param_attr=fluid.ParamAttr(initializer=fluid.initializer.Uniform(
            low=-stdv, high=stdv)))  #[batch_size, h]

    all_vocab = layers.create_global_var(shape=[items_num - 1],
                                         value=0,
                                         dtype="int64",
                                         persistable=True,
                                         name="all_vocab")

    all_emb = fluid.embedding(input=all_vocab,
                              param_attr=fluid.ParamAttr(
                                  name="emb",
                                  initializer=fluid.initializer.Uniform(
                                      low=-stdv, high=stdv)),
                              size=[items_num, hidden_size])  #[all_vocab, h]

    logits = layers.matmul(x=final_attention_fc, y=all_emb,
                           transpose_y=True)  #[batch_size, all_vocab]
    softmax = layers.softmax_with_cross_entropy(logits=logits,
                                                label=label)  #[batch_size, 1]
    loss = layers.reduce_mean(softmax)  # [1]
    acc = layers.accuracy(input=logits, label=label, k=20)
    return loss, acc, py_reader, feed_datas
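
# --- Setup sketch (not part of the original example) ---
# `all_vocab` above is created as a zero-filled persistable variable; presumably the
# surrounding training script fills it once with the item ids 1..items_num-1 so that
# `all_emb` covers the whole item vocabulary. A hedged sketch of that step, using
# illustrative values for items_num / hidden_size / step / bs:
import numpy as np
import paddle.fluid as fluid

items_num, hidden_size, step, bs = 1000, 100, 1, 32  # illustrative values
loss, acc, data_loader, feed_datas = network(items_num, hidden_size, step, bs)

place = fluid.CPUPlace()
exe = fluid.Executor(place)
exe.run(fluid.default_startup_program())
all_vocab_tensor = fluid.global_scope().var("all_vocab").get_tensor()
all_vocab_tensor.set(np.arange(1, items_num).astype("int64"), place)
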
Exemple #19
def multi_head_attention(queries,
                         keys,
                         values,
                         attn_bias,
                         d_key,
                         d_value,
                         d_model,
                         n_head=1,
                         dropout_rate=0.,
                         cache=None,
                         param_initializer=None,
                         name='multi_head_att'):
    """
    Multi-Head Attention. Note that attn_bias is added to the logits before
    computing the softmax activation to mask certain selected positions so that
    they will not be considered in the attention weights.
    """
    keys = queries if keys is None else keys
    values = keys if values is None else values

    if not (len(queries.shape) == len(keys.shape) == len(values.shape) == 3):
        raise ValueError(
            "Inputs: quries, keys and values should all be 3-D tensors.")

    def __compute_qkv(queries, keys, values, n_head, d_key, d_value):
        """
        Add linear projection to queries, keys, and values.
        """
        q = layers.fc(input=queries,
                      size=d_key * n_head,
                      num_flatten_dims=2,
                      param_attr=fluid.ParamAttr(
                          name=name + '_query_fc.w_0',
                          initializer=param_initializer),
                      bias_attr=name + '_query_fc.b_0')
        k = layers.fc(input=keys,
                      size=d_key * n_head,
                      num_flatten_dims=2,
                      param_attr=fluid.ParamAttr(
                          name=name + '_key_fc.w_0',
                          initializer=param_initializer),
                      bias_attr=name + '_key_fc.b_0')
        v = layers.fc(input=values,
                      size=d_value * n_head,
                      num_flatten_dims=2,
                      param_attr=fluid.ParamAttr(
                          name=name + '_value_fc.w_0',
                          initializer=param_initializer),
                      bias_attr=name + '_value_fc.b_0')
        return q, k, v

    def __split_heads(x, n_head):
        """
        Reshape the last dimension of input tensor x so that it becomes two
        dimensions and then transpose. Specifically, input a tensor with shape
        [bs, max_sequence_length, n_head * hidden_dim] then output a tensor
        with shape [bs, n_head, max_sequence_length, hidden_dim].
        """
        hidden_size = x.shape[-1]
        # The value 0 in shape attr means copying the corresponding dimension
        # size of the input as the output dimension size.
        reshaped = layers.reshape(x=x,
                                  shape=[0, 0, n_head, hidden_size // n_head],
                                  inplace=True)

        # permute the dimensions into:
        # [batch_size, n_head, max_sequence_len, hidden_size_per_head]
        return layers.transpose(x=reshaped, perm=[0, 2, 1, 3])

    def __combine_heads(x):
        """
        Transpose and then reshape the last two dimensions of input tensor x
        so that it becomes one dimension, which is reverse to __split_heads.
        """
        if len(x.shape) == 3: return x
        if len(x.shape) != 4:
            raise ValueError("Input(x) should be a 4-D Tensor.")

        trans_x = layers.transpose(x, perm=[0, 2, 1, 3])
        # The value 0 in shape attr means copying the corresponding dimension
        # size of the input as the output dimension size.
        return layers.reshape(
            x=trans_x,
            shape=[0, 0, trans_x.shape[2] * trans_x.shape[3]],
            inplace=True)

    def scaled_dot_product_attention(q, k, v, attn_bias, d_key, dropout_rate):
        """
        Scaled Dot-Product Attention
        """
        scaled_q = layers.scale(x=q, scale=d_key**-0.5)
        product = layers.matmul(x=scaled_q, y=k, transpose_y=True)
        if attn_bias:
            product += attn_bias
        weights = layers.softmax(product)
        if dropout_rate:
            weights = layers.dropout(weights,
                                     dropout_prob=dropout_rate,
                                     dropout_implementation="upscale_in_train",
                                     is_test=False)
        out = layers.matmul(weights, v)
        return out

    q, k, v = __compute_qkv(queries, keys, values, n_head, d_key, d_value)

    if cache is not None:  # use cache and concat time steps
        # Since the inplace reshape in __split_heads changes the shape of k and
        # v, which is the cache input for next time step, reshape the cache
        # input from the previous time step first.
        k = cache["k"] = layers.concat(
            [layers.reshape(cache["k"], shape=[0, 0, d_model]), k], axis=1)
        v = cache["v"] = layers.concat(
            [layers.reshape(cache["v"], shape=[0, 0, d_model]), v], axis=1)

    q = __split_heads(q, n_head)
    k = __split_heads(k, n_head)
    v = __split_heads(v, n_head)

    ctx_multiheads = scaled_dot_product_attention(q, k, v, attn_bias, d_key,
                                                  dropout_rate)

    out = __combine_heads(ctx_multiheads)

    # Project back to the model size.
    proj_out = layers.fc(input=out,
                         size=d_model,
                         num_flatten_dims=2,
                         param_attr=fluid.ParamAttr(
                             name=name + '_output_fc.w_0',
                             initializer=param_initializer),
                         bias_attr=name + '_output_fc.b_0')
    return proj_out
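
# --- Usage sketch (not part of the original example) ---
# A minimal, assumed self-attention call: pass the same 3-D tensor as queries while keys
# and values default to the queries, and give an attention bias broadcastable to
# [batch, n_head, seq_len, seq_len]. Shapes and names are illustrative only.
import paddle.fluid as fluid

enc_in = fluid.data(name='enc_in', shape=[None, 10, 512], dtype='float32')
attn_bias = fluid.data(name='attn_bias', shape=[None, 8, 10, 10], dtype='float32')
enc_attn_out = multi_head_attention(
    enc_in, None, None, attn_bias,
    d_key=64, d_value=64, d_model=512, n_head=8, dropout_rate=0.1)
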
Exemple #20
def conditional_gru(input,
                    encode_hidden,
                    init_hidden,
                    encode_hidden_size,
                    hidden_size,
                    num_layers=1,
                    sequence_length=None,
                    dropout_prob=0.0,
                    bidirectional=False,
                    batch_first=True,
                    param_attr=None,
                    bias_attr=None,
                    gate_activation=None,
                    activation=None,
                    dtype="float32",
                    name="conditional_gru"):
    """
        Defines a new GRU variant with three extra parameters C_u, C_r and C.
        The updated GRU equations are:
        .. math::
            u_t & = actGate(W_{ux} x_{t} + W_{uh} h_{t-1} + C_u h_i + b_u)

            r_t & = actGate(W_{rx} x_{t} + W_{rh} h_{t-1} + C_r h_i + b_r)

            m_t & = actNode(W_{cx} x_{t} + W_{ch} dot(r_t, h_{t-1}) + C h_i + b_m)

            h_t & = dot(u_t, h_{t-1}) + dot((1 - u_t), m_t)
        All other definitions are the same as the standard GRU.
    Args:
       input (Variable): GRU input tensor,
                      if batch_first = False, shape should be ( seq_len x batch_size x input_size )
                      if batch_first = True, shape should be ( batch_size x seq_len x input_size )
       encode_hidden: The hidden state from the encoder of the GRU. If bidirectional is True, encode_hidden is assumed
                      to contain two parts: the former half is for the forward direction and the latter half is for
                      the backward direction.
       encode_hidden_size: The size of encode_hidden. If bidirectional is True, encode_hidden_size covers both the
                      forward and the backward halves, i.e., the actual size of each half of encode_hidden is
                      encode_hidden_size / 2
       init_hidden(Variable|None): The initial hidden state of the GRU
                      This is a tensor with shape ( num_layers x batch_size x hidden_size)
                      if is_bidirec = True, shape should be ( num_layers*2 x batch_size x hidden_size)
                      and can be reshaped to tensor with ( num_layers x 2 x batch_size x hidden_size) to use.
                      If it's None, it will be set to all 0.
       hidden_size (int): Hidden size of the GRU
       num_layers (int): The total number of layers of the GRU
       sequence_length (Variable|None): A tensor with shape [batch_size] that stores the real length of each instance.
                        It will be converted to a mask that masks the padding ids.
                        If it is None, there are no padding ids.
       dropout_prob(float|0.0): Dropout probability. Dropout ONLY works on the rnn output of each layer,
                            NOT between time steps
       bidirectional (bool|False): If it is bidirectional
       batch_first (bool|True): The shape format of the input and output tensors. If true,
           the shape format should be :attr:`[batch_size, seq_len, hidden_size]`. If false,
           the shape format should be :attr:`[seq_len, batch_size, hidden_size]`. By default
           this function accepts input and emits output in batch-major form to be consistent
           with most of data format, though a bit less efficient because of extra transposes.
       param_attr(ParamAttr|None): The parameter attribute for the learnable
           weight matrix. Note:
           If it is set to None or one attribute of ParamAttr, gru_unit will
           create ParamAttr as param_attr. If the Initializer of the param_attr
           is not set, the parameter is initialized with Xavier. Default: None.
       bias_attr (ParamAttr|None): The parameter attribute for the bias
           of GRU unit.
           If it is set to None or one attribute of ParamAttr, gru_unit will
           create ParamAttr as bias_attr. If the Initializer of the bias_attr
           is not set, the bias is initialized zero. Default: None.
       gate_activation (function|None): The activation function for gates (actGate).
                                 Default: 'fluid.layers.sigmoid'
       activation (function|None): The activation function for cell (actNode).
                            Default: 'fluid.layers.tanh'
       dtype(string): data type used in this unit
       name(string): name used to identify parameters and biases

       Returns:
        rnn_out(Tensor), last_hidden(Tensor), all_hidden(Tensor)
            - rnn_out is the result of the GRU hidden states, with shape (seq_len x batch_size x hidden_size) \
              if is_bidirec set to True, its shape will be ( seq_len x batch_size x hidden_size*2)
            - last_hidden is the hidden state of the last step of the GRU \
              shape is ( num_layers x batch_size x hidden_size ) \
              if is_bidirec set to True, its shape will be ( num_layers*2 x batch_size x hidden_size),
              and can be reshaped to a tensor with shape ( num_layers x 2 x batch_size x hidden_size)
            - all_hidden is all the hidden states of the input, including last_hidden and the intermediate hidden states. \
              shape is (num_layers x seq_len x batch_size x hidden_size); if is_bidirec set to True, the shape will be
              (2 x num_layers x seq_len x batch_size x hidden_size)
    """
    if bidirectional:
        encode_hidden, bw_encode_hidden = layers.split(encode_hidden, num_or_sections=2, dim=-1)
        encode_hidden_size = int(encode_hidden_size / 2)

    fw_unit_list = []

    for i in range(num_layers):
        new_name = name + '_layers_' + str(i)
        if param_attr is not None and param_attr.name is not None:
            layer_param_attr = copy.deepcopy(param_attr)
            layer_param_attr.name += '_fw_w_' + str(i)
        else:
            layer_param_attr = param_attr
        if bias_attr is not None and bias_attr.name is not None:
            layer_bias_attr = copy.deepcopy(bias_attr)
            layer_bias_attr.name += '_fw_b_' + str(i)
        else:
            layer_bias_attr = bias_attr

        fw_unit_list.append(
            ConditionalGRUUnit(new_name, encode_hidden_size, hidden_size, layer_param_attr,
                               layer_bias_attr, gate_activation, activation, dtype)
        )

    if bidirectional:
        bw_unit_list = []

        for i in range(num_layers):
            new_name = name + '_reverse_layers_' + str(i)
            if param_attr is not None and param_attr.name is not None:
                layer_param_attr = copy.deepcopy(param_attr)
                layer_param_attr.name += '_bw_w_' + str(i)
            else:
                layer_param_attr = param_attr
            if bias_attr is not None and bias_attr.name is not None:
                layer_bias_attr = copy.deepcopy(bias_attr)
                layer_bias_attr.name += '_bw_b_' + str(i)
            else:
                layer_bias_attr = bias_attr
            bw_unit_list.append(
                ConditionalGRUUnit(new_name, encode_hidden_size, hidden_size, layer_param_attr,
                                   layer_bias_attr, gate_activation, activation, dtype)
            )

    if batch_first:
        input = layers.transpose(input, [1, 0, 2])

    mask = None
    if sequence_length:
        max_seq_len = layers.shape(input)[0]
        mask = layers.sequence_mask(
            sequence_length, maxlen=max_seq_len, dtype='float32'
        )
        mask = layers.transpose(mask, [1, 0])

    direc_num = 1
    if bidirectional:
        direc_num = 2
    if init_hidden:
        init_hidden = layers.reshape(
            init_hidden, shape=[num_layers, direc_num, -1, hidden_size]
        )

    def get_single_direction_output(rnn_input,
                                    encode_hidden,
                                    unit_list,
                                    mask=None,
                                    direc_index=0):
        rnn = StaticRNN()
        #print(rnn_input.shape)
        with rnn.step():
            step_input = rnn.step_input(rnn_input)

            if mask:
                step_mask = rnn.step_input(mask)

            for i in range(num_layers):
                if init_hidden:
                    pre_hidden = rnn.memory(init=init_hidden[i, direc_index])
                else:
                    pre_hidden = rnn.memory(batch_ref=rnn_input,
                                            shape=[-1, hidden_size],
                                            ref_batch_dim_idx=1)
                encode_h = encode_hidden[i]
                pre_encode_hidden = layers.concat([pre_hidden, encode_h], axis=1)
                new_hidden = unit_list[i](step_input, pre_encode_hidden)

                if mask:
                    new_hidden = layers.elementwise_mul(
                        new_hidden, step_mask, axis=0) - layers.elementwise_mul(
                        pre_hidden, (step_mask - 1), axis=0)
                rnn.update_memory(pre_hidden, new_hidden)

                rnn.step_output(new_hidden)

                step_input = new_hidden
                if dropout_prob is not None and dropout_prob > 0.0:
                    step_input = layers.dropout(step_input, dropout_prob=dropout_prob, )

            rnn.step_output(step_input)

        rnn_out = rnn()

        last_hidden_array = []
        all_hidden_array = []  # also collect all intermediate hidden states
        rnn_output = rnn_out[-1]

        for i in range(num_layers):
            last_hidden = rnn_out[i]
            all_hidden_array.append(last_hidden)
            last_hidden = last_hidden[-1]
            last_hidden_array.append(last_hidden)

        all_hidden_array = layers.concat(all_hidden_array, axis=0)
        all_hidden_array = layers.reshape(all_hidden_array, shape=[num_layers, input.shape[0], -1, hidden_size])
        last_hidden_output = layers.concat(last_hidden_array, axis=0)
        last_hidden_output = layers.reshape(last_hidden_output, shape=[num_layers, -1, hidden_size])

        return rnn_output, last_hidden_output, all_hidden_array

    fw_rnn_out, fw_last_hidden, fw_all_hidden = get_single_direction_output(
        input, encode_hidden, fw_unit_list, mask, direc_index=0)

    if bidirectional:
        bw_input = layers.reverse(input, axis=[0])
        bw_mask = None
        if mask:
            bw_mask = layers.reverse(mask, axis=[0])
        bw_rnn_out, bw_last_hidden, bw_all_hidden = get_single_direction_output(
            bw_input, bw_encode_hidden, bw_unit_list, bw_mask, direc_index=1)

        bw_rnn_out = layers.reverse(bw_rnn_out, axis=[0])

        rnn_out = layers.concat([fw_rnn_out, bw_rnn_out], axis=2)
        last_hidden = layers.concat([fw_last_hidden, bw_last_hidden], axis=1)
        all_hidden = layers.concat([fw_all_hidden, bw_all_hidden], axis=0)

        last_hidden = layers.reshape(
            last_hidden, shape=[num_layers * direc_num, -1, hidden_size])

        if batch_first:
            rnn_out = layers.transpose(rnn_out, [1, 0, 2])
        return rnn_out, last_hidden, all_hidden
    else:

        rnn_out = fw_rnn_out
        last_hidden = fw_last_hidden
        all_hidden = fw_all_hidden

        if batch_first:
            rnn_out = layers.transpose(rnn_out, [1, 0, 2])

        return rnn_out, last_hidden, all_hidden
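
# --- Usage sketch (not part of the original example) ---
# A hedged example of calling conditional_gru as a decoder conditioned on per-layer
# encoder states. ConditionalGRUUnit is assumed to be defined elsewhere in this project;
# the shapes and names below are illustrative only.
import paddle.fluid as fluid

num_layers, hidden_size, encode_size = 1, 128, 256
dec_emb = fluid.data(name='dec_emb', shape=[None, 20, 128], dtype='float32')
# one encoder state per decoder layer: [num_layers, batch_size, encode_hidden_size]
enc_states = fluid.data(name='enc_states',
                        shape=[num_layers, None, encode_size], dtype='float32')
rnn_out, last_hidden, all_hidden = conditional_gru(
    dec_emb, enc_states, None,
    encode_hidden_size=encode_size, hidden_size=hidden_size,
    num_layers=num_layers, batch_first=True)
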
Exemple #21
def basic_lstm(input,
               init_hidden,
               init_cell,
               hidden_size,
               num_layers=1,
               sequence_length=None,
               dropout_prob=0.0,
               bidirectional=False,
               batch_first=True,
               param_attr=None,
               bias_attr=None,
               gate_activation=None,
               activation=None,
               forget_bias=1.0,
               dtype='float32',
               name='basic_lstm'):
    r"""
    LSTM implementation using basic operators, supports multiple layers and bidirectional LSTM.

    .. math::
           i_t &= \sigma(W_{ix}x_{t} + W_{ih}h_{t-1} + b_i)

           f_t &= \sigma(W_{fx}x_{t} + W_{fh}h_{t-1} + b_f + forget_bias )

           o_t &= \sigma(W_{ox}x_{t} + W_{oh}h_{t-1} + b_o)

           \\tilde{c_t} &= tanh(W_{cx}x_t + W_{ch}h_{t-1} + b_c)

           c_t &= f_t \odot c_{t-1} + i_t \odot \\tilde{c_t}

           h_t &= o_t \odot tanh(c_t)

    Args:
        input (Variable): lstm input tensor,
                       if batch_first = False, shape should be ( seq_len x batch_size x input_size )
                       if batch_first = True, shape should be ( batch_size x seq_len x input_size )
        init_hidden(Variable|None): The initial hidden state of the LSTM
                       This is a tensor with shape ( num_layers x batch_size x hidden_size)
                       if is_bidirec = True, shape should be ( num_layers*2 x batch_size x hidden_size)
                       and can be reshaped to a tensor with shape ( num_layers x 2 x batch_size x hidden_size) to use.
                       If it's None, it will be set to all 0.
        init_cell(Variable|None): The initial cell state of the LSTM
                       This is a tensor with shape ( num_layers x batch_size x hidden_size)
                       if is_bidirec = True, shape should be ( num_layers*2 x batch_size x hidden_size)
                       and can be reshaped to a tensor with shape ( num_layers x 2 x batch_size x hidden_size) to use.
                       If it's None, it will be set to all 0.
        hidden_size (int): Hidden size of the LSTM
        num_layers (int): The total number of layers of the LSTM
        sequence_length (Variable|None): A tensor with shape [batch_size] that stores the real length of each instance.
                        It will be converted to a mask that masks the padding ids.
                        If it is None, there are no padding ids.
        dropout_prob(float|0.0): Dropout probability. Dropout ONLY works on the rnn output of each layer,
                             NOT between time steps
        bidirectional (bool|False): If it is bidirectional
        batch_first (bool|True): The shape format of the input and output tensors. If true,
            the shape format should be :attr:`[batch_size, seq_len, hidden_size]`. If false,
            the shape format should be :attr:`[seq_len, batch_size, hidden_size]`. By default
            this function accepts input and emits output in batch-major form to be consistent
            with most of data format, though a bit less efficient because of extra transposes.
        param_attr(ParamAttr|None): The parameter attribute for the learnable
            weight matrix. Note:
            If it is set to None or one attribute of ParamAttr, lstm_unit will
            create ParamAttr as param_attr. If the Initializer of the param_attr
            is not set, the parameter is initialized with Xavier. Default: None.
        bias_attr (ParamAttr|None): The parameter attribute for the bias
            of LSTM unit.
            If it is set to None or one attribute of ParamAttr, lstm_unit will 
            create ParamAttr as bias_attr. If the Initializer of the bias_attr
            is not set, the bias is initialized zero. Default: None.
        gate_activation (function|None): The activation function for gates (actGate).
                                  Default: 'fluid.layers.sigmoid'
        activation (function|None): The activation function for cell (actNode).
                             Default: 'fluid.layers.tanh'
        forget_bias (float|1.0) : Forget bias used to compute the forget gate
        dtype(string): Data type used in this unit
        name(string): Name used to identify parameters and biases

    Returns:
        rnn_out(Tensor), last_hidden(Tensor), last_cell(Tensor)
            - rnn_out is the result of the LSTM hidden states, with shape (seq_len x batch_size x hidden_size) \
              if is_bidirec set to True, its shape will be ( seq_len x batch_size x hidden_size*2)
            - last_hidden is the hidden state of the last step of the LSTM \
              with shape ( num_layers x batch_size x hidden_size ) \
              if is_bidirec set to True, its shape will be ( num_layers*2 x batch_size x hidden_size),
              and can be reshaped to a tensor ( num_layers x 2 x batch_size x hidden_size) to use.
            - last_cell is the cell state of the last step of the LSTM \
              with shape ( num_layers x batch_size x hidden_size ) \
              if is_bidirec set to True, its shape will be ( num_layers*2 x batch_size x hidden_size),
              and can be reshaped to a tensor ( num_layers x 2 x batch_size x hidden_size) to use.

    Examples:
        .. code-block:: python
            
            import paddle.fluid.layers as layers
            from paddle.fluid.contrib.layers import basic_lstm

            batch_size = 20
            input_size = 128
            hidden_size = 256
            num_layers = 2
            dropout = 0.5
            bidirectional = True
            batch_first = False

            input = layers.data( name = "input", shape = [-1, batch_size, input_size], dtype='float32')
            pre_hidden = layers.data( name = "pre_hidden", shape=[-1, hidden_size], dtype='float32')
            pre_cell = layers.data( name = "pre_cell", shape=[-1, hidden_size], dtype='float32')
            sequence_length = layers.data( name="sequence_length", shape=[-1], dtype='int32')

            rnn_out, last_hidden, last_cell = basic_lstm( input, pre_hidden, pre_cell, \
                    hidden_size, num_layers = num_layers, \
                    sequence_length = sequence_length, dropout_prob=dropout, bidirectional = bidirectional, \
                    batch_first = batch_first)
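
            # Illustrative addition (not from the upstream example): the stacked
            # bidirectional states can be split back into forward / backward parts:
            last_hidden = layers.reshape(
                last_hidden, shape=[num_layers, 2, -1, hidden_size])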

    """
    fw_unit_list = []

    for i in range(num_layers):
        new_name = name + "_layers_" + str(i)
        if param_attr is not None and param_attr.name is not None:
            layer_param_attr = copy.deepcopy(param_attr)
            layer_param_attr.name += "_fw_w_" + str(i)
        else:
            layer_param_attr = param_attr
        if bias_attr is not None and bias_attr.name is not None:
            layer_bias_attr = copy.deepcopy(bias_attr)
            layer_bias_attr.name += "_fw_b_" + str(i)
        else:
            layer_bias_attr = bias_attr
        fw_unit_list.append(
            BasicLSTMUnit(
                new_name,
                hidden_size,
                param_attr=layer_param_attr,
                bias_attr=layer_bias_attr,
                gate_activation=gate_activation,
                activation=activation,
                forget_bias=forget_bias,
                dtype=dtype))
    if bidirectional:
        bw_unit_list = []

        for i in range(num_layers):
            new_name = name + "_reverse_layers_" + str(i)
            if param_attr is not None and param_attr.name is not None:
                layer_param_attr = copy.deepcopy(param_attr)
                layer_param_attr.name += "_bw_w_" + str(i)
            else:
                layer_param_attr = param_attr
            if bias_attr is not None and bias_attr.name is not None:
                layer_bias_attr = copy.deepcopy(bias_attr)
                layer_bias_attr.name += "_bw_b_" + str(i)
            else:
                layer_bias_attr = bias_attr
            bw_unit_list.append(
                BasicLSTMUnit(
                    new_name,
                    hidden_size,
                    param_attr=layer_param_attr,
                    bias_attr=layer_bias_attr,
                    gate_activation=gate_activation,
                    activation=activation,
                    forget_bias=forget_bias,
                    dtype=dtype))

    if batch_first:
        input = layers.transpose(input, [1, 0, 2])

    mask = None
    if sequence_length:
        max_seq_len = layers.shape(input)[0]
        mask = layers.sequence_mask(
            sequence_length, maxlen=max_seq_len, dtype='float32')

        mask = layers.transpose(mask, [1, 0])

    direc_num = 1
    if bidirectional:
        direc_num = 2
        # convert to [num_layers, 2, batch_size, hidden_size]
    if init_hidden:
        init_hidden = layers.reshape(
            init_hidden, shape=[num_layers, direc_num, -1, hidden_size])
        init_cell = layers.reshape(
            init_cell, shape=[num_layers, direc_num, -1, hidden_size])

    # forward direction
    def get_single_direction_output(rnn_input,
                                    unit_list,
                                    mask=None,
                                    direc_index=0):
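        # Unroll a stacked LSTM over time with StaticRNN; direc_index selects the
        # forward (0) or backward (1) slice of init_hidden / init_cell.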
        rnn = StaticRNN()
        with rnn.step():
            step_input = rnn.step_input(rnn_input)

            if mask:
                step_mask = rnn.step_input(mask)

            for i in range(num_layers):
                if init_hidden:
                    pre_hidden = rnn.memory(init=init_hidden[i, direc_index])
                    pre_cell = rnn.memory(init=init_cell[i, direc_index])
                else:
                    pre_hidden = rnn.memory(
                        batch_ref=rnn_input, shape=[-1, hidden_size])
                    pre_cell = rnn.memory(
                        batch_ref=rnn_input, shape=[-1, hidden_size])

                new_hidden, new_cell = unit_list[i](step_input, pre_hidden,
                                                    pre_cell)

                if mask:
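                    # For padded positions (step_mask == 0) carry over the
                    # previous state: new = new * mask + pre * (1 - mask).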
                    new_hidden = layers.elementwise_mul(
                        new_hidden, step_mask, axis=0) - layers.elementwise_mul(
                            pre_hidden, (step_mask - 1), axis=0)
                    new_cell = layers.elementwise_mul(
                        new_cell, step_mask, axis=0) - layers.elementwise_mul(
                            pre_cell, (step_mask - 1), axis=0)

                rnn.update_memory(pre_hidden, new_hidden)
                rnn.update_memory(pre_cell, new_cell)

                rnn.step_output(new_hidden)
                rnn.step_output(new_cell)

                step_input = new_hidden
                if dropout_prob is not None and dropout_prob > 0.0:
                    step_input = layers.dropout(
                        step_input,
                        dropout_prob=dropout_prob,
                        dropout_implementation='upscale_in_train')

            rnn.step_output(step_input)

        rnn_out = rnn()

        last_hidden_array = []
        last_cell_array = []
        rnn_output = rnn_out[-1]
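        # rnn() returns, for every layer i, the per-step hidden and cell outputs
        # at indices 2*i and 2*i + 1, with the top-layer output appended last.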
        for i in range(num_layers):
            last_hidden = rnn_out[i * 2]
            last_hidden = last_hidden[-1]
            last_hidden_array.append(last_hidden)
            last_cell = rnn_out[i * 2 + 1]
            last_cell = last_cell[-1]
            last_cell_array.append(last_cell)

        last_hidden_output = layers.concat(last_hidden_array, axis=0)
        last_hidden_output = layers.reshape(
            last_hidden_output, shape=[num_layers, -1, hidden_size])
        last_cell_output = layers.concat(last_cell_array, axis=0)
        last_cell_output = layers.reshape(
            last_cell_output, shape=[num_layers, -1, hidden_size])

        # rnn_output shape: [seq_len, batch_size, hidden_size]
        return rnn_output, last_hidden_output, last_cell_output

    fw_rnn_out, fw_last_hidden, fw_last_cell = get_single_direction_output(
        input, fw_unit_list, mask, direc_index=0)

    if bidirectional:
        bw_input = layers.reverse(input, axis=[0])
        bw_mask = None
        if mask:
            bw_mask = layers.reverse(mask, axis=[0])
        bw_rnn_out, bw_last_hidden, bw_last_cell = get_single_direction_output(
            bw_input, bw_unit_list, bw_mask, direc_index=1)

        bw_rnn_out = layers.reverse(bw_rnn_out, axis=[0])

        rnn_out = layers.concat([fw_rnn_out, bw_rnn_out], axis=2)
        last_hidden = layers.concat([fw_last_hidden, bw_last_hidden], axis=1)
        last_hidden = layers.reshape(
            last_hidden, shape=[num_layers * direc_num, -1, hidden_size])

        last_cell = layers.concat([fw_last_cell, bw_last_cell], axis=1)
        last_cell = layers.reshape(
            last_cell, shape=[num_layers * direc_num, -1, hidden_size])

        if batch_first:
            rnn_out = layers.transpose(rnn_out, [1, 0, 2])
        return rnn_out, last_hidden, last_cell
    else:

        rnn_out = fw_rnn_out
        last_hidden = fw_last_hidden
        last_cell = fw_last_cell

        if batch_first:
            rnn_out = layers.transpose(rnn_out, [1, 0, 2])

        return rnn_out, last_hidden, last_cell
def decode(context, is_sparse):
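    """Beam-search decoding built from a fluid While loop.

    Tensor arrays hold the ids, scores and decoder states produced at every step,
    and beam_search_decode assembles the final translations. This function relies
    on module-level constants (max_length, dict_size, word_dim, decoder_size,
    target_dict_dim, beam_size) defined elsewhere in the example.
    """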
    init_state = context
    array_len = pd.fill_constant(shape=[1], dtype='int64', value=max_length)
    counter = pd.zeros(shape=[1], dtype='int64', force_cpu=True)

    # fill the first element with init_state
    state_array = pd.create_array('float32')
    pd.array_write(init_state, array=state_array, i=counter)

    # ids, scores as memory
    ids_array = pd.create_array('int64')
    scores_array = pd.create_array('float32')

    init_ids = pd.data(name="init_ids", shape=[1], dtype="int64", lod_level=2)
    init_scores = pd.data(name="init_scores",
                          shape=[1],
                          dtype="float32",
                          lod_level=2)

    pd.array_write(init_ids, array=ids_array, i=counter)
    pd.array_write(init_scores, array=scores_array, i=counter)

    cond = pd.less_than(x=counter, y=array_len)

    while_op = pd.While(cond=cond)
    with while_op.block():
        pre_ids = pd.array_read(array=ids_array, i=counter)
        pre_state = pd.array_read(array=state_array, i=counter)
        pre_score = pd.array_read(array=scores_array, i=counter)

        # expand the lod of pre_state to be the same as that of pre_score
        pre_state_expanded = pd.sequence_expand(pre_state, pre_score)

        pre_ids_emb = pd.embedding(input=pre_ids,
                                   size=[dict_size, word_dim],
                                   dtype='float32',
                                   is_sparse=is_sparse)

        # use a simple RNN cell (fc + tanh) to update the decoder state
        current_state = pd.fc(input=[pre_state_expanded, pre_ids_emb],
                              size=decoder_size,
                              act='tanh')
        current_state_with_lod = pd.lod_reset(x=current_state, y=pre_score)
        # use score to do beam search
        current_score = pd.fc(input=current_state_with_lod,
                              size=target_dict_dim,
                              act='softmax')
        topk_scores, topk_indices = pd.topk(current_score, k=beam_size)
        # calculate accumulated scores after topk to reduce computation cost
        accu_scores = pd.elementwise_add(x=pd.log(topk_scores),
                                         y=pd.reshape(pre_score, shape=[-1]),
                                         axis=0)
        selected_ids, selected_scores = pd.beam_search(pre_ids,
                                                       pre_score,
                                                       topk_indices,
                                                       accu_scores,
                                                       beam_size,
                                                       end_id=10,
                                                       level=0)

        pd.increment(x=counter, value=1, in_place=True)

        # update the memories
        pd.array_write(current_state, array=state_array, i=counter)
        pd.array_write(selected_ids, array=ids_array, i=counter)
        pd.array_write(selected_scores, array=scores_array, i=counter)

        # update the loop condition: stop when the max length is reached or when
        # all candidates of the source sentences have ended.
        length_cond = pd.less_than(x=counter, y=array_len)
        finish_cond = pd.logical_not(pd.is_empty(x=selected_ids))
        pd.logical_and(x=length_cond, y=finish_cond, out=cond)

    translation_ids, translation_scores = pd.beam_search_decode(
        ids=ids_array, scores=scores_array, beam_size=beam_size, end_id=10)

    # return init_ids, init_scores

    return translation_ids, translation_scores
def relative_transformer(src_vocab_size,
                         trg_vocab_size,
                         max_length,
                         n_layer,
                         n_head,
                         d_key,
                         d_value,
                         d_model,
                         d_inner_hid,
                         prepostprocess_dropout,
                         attention_dropout,
                         relu_dropout,
                         preprocess_cmd,
                         postprocess_cmd,
                         weight_sharing,
                         embedding_sharing,
                         label_smooth_eps,
                         use_py_reader=False,
                         is_test=False,
                         params_type="normal",
                         all_data_inputs=None):
    """
        transformer: assemble the encoder, the decoder and the label-smoothed training loss
    """
    if embedding_sharing:
        assert src_vocab_size == trg_vocab_size, (
            "Vocabularies in source and target should be the same for embedding sharing."
        )

    data_input_names = encoder_data_input_fields + \
                decoder_data_input_fields[:-1] + label_data_input_fields + dense_bias_input_fields

    if use_py_reader:
        all_inputs = all_data_inputs
    else:
        all_inputs = make_all_inputs(data_input_names)

    enc_inputs_len = len(encoder_data_input_fields)
    dec_inputs_len = len(decoder_data_input_fields[:-1])
    enc_inputs = all_inputs[0:enc_inputs_len]
    dec_inputs = all_inputs[enc_inputs_len:enc_inputs_len + dec_inputs_len]
    real_label = all_inputs[enc_inputs_len + dec_inputs_len]
    weights = all_inputs[enc_inputs_len + dec_inputs_len + 1]
    reverse_label = all_inputs[enc_inputs_len + dec_inputs_len + 2]

    enc_output = wrap_encoder(src_vocab_size,
                              max_length,
                              n_layer,
                              n_head,
                              d_key,
                              d_value,
                              d_model,
                              d_inner_hid,
                              prepostprocess_dropout,
                              attention_dropout,
                              relu_dropout,
                              preprocess_cmd,
                              postprocess_cmd,
                              weight_sharing,
                              embedding_sharing,
                              enc_inputs,
                              params_type=params_type)

    predict = wrap_decoder(trg_vocab_size,
                           max_length,
                           n_layer,
                           n_head,
                           d_key,
                           d_value,
                           d_model,
                           d_inner_hid,
                           prepostprocess_dropout,
                           attention_dropout,
                           relu_dropout,
                           preprocess_cmd,
                           postprocess_cmd,
                           weight_sharing,
                           embedding_sharing,
                           dec_inputs,
                           enc_output,
                           is_train=not is_test,
                           params_type=params_type)

    # Padding indices do not contribute to the total loss. The weights are used
    # to mask out padding positions when calculating the loss.
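    # Uniform label smoothing: the gold token keeps probability 1 - eps and the
    # remaining eps mass is spread evenly over the other trg_vocab_size - 1 tokens.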
    if label_smooth_eps:
        label = layers.one_hot(input=real_label, depth=trg_vocab_size)
        label = label * (1 - label_smooth_eps) + (1 - label) * (
            label_smooth_eps / (trg_vocab_size - 1))
        label.stop_gradient = True
    else:
        label = real_label

    cost = layers.softmax_with_cross_entropy(
        logits=predict,
        label=label,
        soft_label=True if label_smooth_eps else False)
    weighted_cost = cost * weights
    sum_cost = layers.reduce_sum(weighted_cost)
    sum_cost.persistable = True
    token_num = layers.reduce_sum(weights)
    token_num.persistable = True
    token_num.stop_gradient = True
    avg_cost = sum_cost / token_num

    sen_count = layers.shape(dec_inputs[0])[0]
    batch_predict = layers.reshape(
        predict, shape=[sen_count, -1, ModelHyperParams.trg_vocab_size])
    batch_label = layers.reshape(real_label, shape=[sen_count, -1])
    batch_weights = layers.reshape(weights, shape=[sen_count, -1, 1])
    return sum_cost, avg_cost, token_num, batch_predict, cost, sum_cost, batch_label, batch_weights
Exemple #24
0
    def net(self, items_num, hidden_size, step, bs):
        stdv = 1.0 / math.sqrt(hidden_size)

        def embedding_layer(input,
                            table_name,
                            emb_dim,
                            initializer_instance=None):
            emb = fluid.embedding(
                input=input,
                size=[items_num, emb_dim],
                param_attr=fluid.ParamAttr(name=table_name,
                                           initializer=initializer_instance),
            )
            return emb

        sparse_initializer = fluid.initializer.Uniform(low=-stdv, high=stdv)
        items_emb = embedding_layer(self.items, "emb", hidden_size,
                                    sparse_initializer)
        pre_state = items_emb
        for i in range(step):
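            # One gated graph propagation step: project the node states, aggregate
            # them through the incoming / outgoing adjacency matrices, and feed the
            # concatenation into a GRU cell to produce the updated node states.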
            pre_state = layers.reshape(x=pre_state,
                                       shape=[bs, -1, hidden_size])
            state_in = layers.fc(
                input=pre_state,
                name="state_in",
                size=hidden_size,
                act=None,
                num_flatten_dims=2,
                param_attr=fluid.ParamAttr(initializer=fluid.initializer.
                                           Uniform(low=-stdv, high=stdv)),
                bias_attr=fluid.ParamAttr(
                    initializer=fluid.initializer.Uniform(
                        low=-stdv, high=stdv)))  # [batch_size, uniq_max, h]
            state_out = layers.fc(
                input=pre_state,
                name="state_out",
                size=hidden_size,
                act=None,
                num_flatten_dims=2,
                param_attr=fluid.ParamAttr(initializer=fluid.initializer.
                                           Uniform(low=-stdv, high=stdv)),
                bias_attr=fluid.ParamAttr(
                    initializer=fluid.initializer.Uniform(
                        low=-stdv, high=stdv)))  # [batch_size, uniq_max, h]

            state_adj_in = layers.matmul(self.adj_in,
                                         state_in)  # [batch_size, uniq_max, h]
            state_adj_out = layers.matmul(
                self.adj_out, state_out)  # [batch_size, uniq_max, h]

            gru_input = layers.concat([state_adj_in, state_adj_out], axis=2)

            gru_input = layers.reshape(x=gru_input,
                                       shape=[-1, hidden_size * 2])
            gru_fc = layers.fc(input=gru_input,
                               name="gru_fc",
                               size=3 * hidden_size,
                               bias_attr=False)
            pre_state, _, _ = fluid.layers.gru_unit(
                input=gru_fc,
                hidden=layers.reshape(x=pre_state, shape=[-1, hidden_size]),
                size=3 * hidden_size)

        final_state = layers.reshape(pre_state, shape=[bs, -1, hidden_size])
        seq = layers.gather_nd(final_state, self.seq_index)
        last = layers.gather_nd(final_state, self.last_index)

        seq_fc = layers.fc(
            input=seq,
            name="seq_fc",
            size=hidden_size,
            bias_attr=False,
            act=None,
            num_flatten_dims=2,
            param_attr=fluid.ParamAttr(initializer=fluid.initializer.Uniform(
                low=-stdv, high=stdv)))  # [batch_size, seq_max, h]
        last_fc = layers.fc(
            input=last,
            name="last_fc",
            size=hidden_size,
            bias_attr=False,
            act=None,
            num_flatten_dims=1,
            param_attr=fluid.ParamAttr(initializer=fluid.initializer.Uniform(
                low=-stdv, high=stdv)))  # [batch_size, h]

        seq_fc_t = layers.transpose(seq_fc,
                                    perm=[1, 0, 2])  # [seq_max, batch_size, h]
        add = layers.elementwise_add(seq_fc_t,
                                     last_fc)  # [seq_max, batch_size, h]
        b = layers.create_parameter(
            shape=[hidden_size],
            dtype='float32',
            default_initializer=fluid.initializer.Constant(value=0.0))  # [h]
        add = layers.elementwise_add(add, b)  # [seq_max, batch_size, h]

        add_sigmoid = layers.sigmoid(add)  # [seq_max, batch_size, h]
        add_sigmoid = layers.transpose(add_sigmoid,
                                       perm=[1, 0,
                                             2])  # [batch_size, seq_max, h]
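
        # Attention readout: score each position against the transformed last item,
        # mask out padding, and take the weighted sum as a global session embedding
        # that is concatenated with the last-item embedding below.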

        weight = layers.fc(
            input=add_sigmoid,
            name="weight_fc",
            size=1,
            act=None,
            num_flatten_dims=2,
            bias_attr=False,
            param_attr=fluid.ParamAttr(initializer=fluid.initializer.Uniform(
                low=-stdv, high=stdv)))  # [batch_size, seq_max, 1]
        weight *= self.mask
        weight_mask = layers.elementwise_mul(
            seq, weight, axis=0)  # [batch_size, seq_max, h]
        global_attention = layers.reduce_sum(weight_mask,
                                             dim=1)  # [batch_size, h]

        final_attention = layers.concat([global_attention, last],
                                        axis=1)  # [batch_size, 2*h]
        final_attention_fc = layers.fc(
            input=final_attention,
            name="final_attention_fc",
            size=hidden_size,
            bias_attr=False,
            act=None,
            param_attr=fluid.ParamAttr(initializer=fluid.initializer.Uniform(
                low=-stdv, high=stdv)))  # [batch_size, h]

        # all_vocab = layers.create_global_var(
        #     shape=[items_num - 1],
        #     value=0,
        #     dtype="int64",
        #     persistable=True,
        #     name="all_vocab")
        all_vocab = np.arange(1, items_num).reshape((-1)).astype('int32')
        all_vocab = fluid.layers.cast(x=fluid.layers.assign(all_vocab),
                                      dtype='int64')

        all_emb = fluid.embedding(
            input=all_vocab,
            param_attr=fluid.ParamAttr(name="emb",
                                       initializer=fluid.initializer.Uniform(
                                           low=-stdv, high=stdv)),
            size=[items_num, hidden_size])  # [all_vocab, h]

        logits = layers.matmul(x=final_attention_fc,
                               y=all_emb,
                               transpose_y=True)  # [batch_size, all_vocab]
        softmax = layers.softmax_with_cross_entropy(
            logits=logits, label=self.label)  # [batch_size, 1]
        self.loss = layers.reduce_mean(softmax)  # [1]
        self.acc = layers.accuracy(input=logits, label=self.label, k=20)
Exemple #25
0
def wrap_decoder(trg_vocab_size,
                 max_length,
                 n_layer,
                 n_head,
                 d_key,
                 d_value,
                 d_model,
                 d_inner_hid,
                 prepostprocess_dropout,
                 attention_dropout,
                 relu_dropout,
                 preprocess_cmd,
                 postprocess_cmd,
                 weight_sharing,
                 dec_inputs=None,
                 enc_output=None,
                 caches=None,
                 gather_idx=None):
    """
    The wrapper assembles together all needed layers for the decoder.
    """
    if dec_inputs is None:
        # This is used to implement independent decoder program in inference.
        trg_word, trg_pos, trg_slf_attn_bias, trg_src_attn_bias, enc_output = \
            make_all_inputs(decoder_data_input_fields)
    else:
        trg_word, trg_pos, trg_slf_attn_bias, trg_src_attn_bias = dec_inputs

    dec_input = prepare_decoder(trg_word,
                                trg_pos,
                                trg_vocab_size,
                                d_model,
                                max_length,
                                prepostprocess_dropout,
                                word_emb_param_name=word_emb_param_names[0]
                                if weight_sharing else word_emb_param_names[1])
    dec_output = decoder(dec_input,
                         enc_output,
                         trg_slf_attn_bias,
                         trg_src_attn_bias,
                         n_layer,
                         n_head,
                         d_key,
                         d_value,
                         d_model,
                         d_inner_hid,
                         prepostprocess_dropout,
                         attention_dropout,
                         relu_dropout,
                         preprocess_cmd,
                         postprocess_cmd,
                         caches=caches,
                         gather_idx=gather_idx)
    # Reshape to 2D tensor to use GEMM instead of BatchedGEMM
    dec_output = layers.reshape(dec_output,
                                shape=[-1, dec_output.shape[-1]],
                                inplace=True)
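    # With weight sharing, the (transposed) target word embedding matrix doubles
    # as the pre-softmax projection instead of a separate FC layer.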
    if weight_sharing:
        predict = layers.matmul(
            x=dec_output,
            y=fluid.default_main_program().global_block().var(
                word_emb_param_names[0]),
            transpose_y=True)
    else:
        predict = layers.fc(input=dec_output,
                            size=trg_vocab_size,
                            bias_attr=False)
    if dec_inputs is None:
        # Return probs for independent decoder program.
        predict = layers.softmax(predict)
    return predict
Exemple #26
0
    def inference(self, model, inputs, outputs):
        """
        Run inference.

        Args:
            inputs(dict): Its key is the input name (str) and its value is a Variable.
            model(object): A generative model. It needs to implement `_generation_network` and `_calc_logits`.

        Returns:
            dict(str:Variable): Its key is the output name (str) and its value is a Variable.
        """
        # prepare while loop
        max_len = layers.fill_constant(shape=[1],
                                       dtype="int64",
                                       value=self.max_dec_len,
                                       force_cpu=True)
        min_len = layers.fill_constant(shape=[1],
                                       dtype="int64",
                                       value=self.min_dec_len,
                                       force_cpu=True)
        step_idx = layers.fill_constant(shape=[1],
                                        dtype="int64",
                                        value=0,
                                        force_cpu=True)

        ids = layers.array_write(layers.reshape(inputs["tgt_ids"], (-1, 1)),
                                 step_idx)
        pos_biases = layers.array_write(
            layers.reshape(inputs["tgt_pos"], (-1, 1)), step_idx)
        scores = layers.array_write(inputs["init_score"], step_idx)
        tgt_generation_mask = layers.array_write(inputs["tgt_generation_mask"],
                                                 step_idx)
        parent_idx = inputs["parent_idx"]

        if self.decoding_strategy == "beam_search":
            beam_size = self.beam_size
        else:
            beam_size = 1

        eos_penalty = np.zeros(self.vocab_size, dtype="float32")
        eos_penalty[self.eos_id] = -1e9
        eos_penalty = layers.assign(eos_penalty)

        token_penalty = np.zeros(self.vocab_size, dtype="float32")
        token_penalty[self.unk_id] = -1e9
        if self.mask_id >= 0:
            token_penalty[self.mask_id] = -1e9
        token_penalty = layers.assign(token_penalty)
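        # eos_penalty is added to the logits while step_idx < min_len to block an
        # early EOS; token_penalty masks [UNK] (and [MASK]) when ignore_unk is set.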

        # start while loop
        cond = layers.less_than(x=step_idx, y=max_len)
        while_op = layers.While(cond)
        with while_op.block():
            pre_ids = layers.array_read(array=ids, i=step_idx)
            pre_ids = layers.reshape(pre_ids, (-1, 1, 1), inplace=True)
            pre_scores = layers.array_read(array=scores, i=step_idx)
            pos_bias = layers.array_read(array=pos_biases, i=step_idx)
            pos_bias = layers.gather(input=pos_bias, index=parent_idx)

            tmp_tgt_generation_mask = layers.array_read(tgt_generation_mask,
                                                        i=step_idx)
            dtype = tmp_tgt_generation_mask.dtype

            append_mask = layers.fill_constant_batch_size_like(
                input=pre_ids, value=1.0, shape=[-1, 1, 1], dtype=dtype)
            tmp_tgt_generation_mask = layers.concat(
                [tmp_tgt_generation_mask, append_mask], axis=2)
            pre_mask = tmp_tgt_generation_mask = layers.gather(
                input=tmp_tgt_generation_mask, index=parent_idx)

            pre_sent = layers.fill_constant_batch_size_like(
                input=pre_mask, value=1, shape=[-1, 1, 1], dtype=pre_ids.dtype)

            if self.continuous_position:
                pre_pos = layers.elementwise_mul(
                    x=layers.fill_constant_batch_size_like(
                        input=pre_mask,
                        value=1,
                        shape=[-1, 1, 1],
                        dtype=pre_ids.dtype),
                    y=step_idx,
                    axis=0) + pos_bias
            else:
                pre_pos = layers.elementwise_mul(
                    x=layers.fill_constant_batch_size_like(
                        input=pre_mask,
                        value=1,
                        shape=[-1, 1, 1],
                        dtype=pre_ids.dtype),
                    y=step_idx,
                    axis=0)

            dec_out, _ = model._generation_network(
                token_ids=pre_ids,
                type_ids=pre_sent,
                pos_ids=pre_pos,
                generation_mask=tmp_tgt_generation_mask,
                gather_idx=parent_idx)
            logits = model._calc_logits(dec_out)

            # ignore unk and mask token
            if self.ignore_unk:
                logits = layers.elementwise_add(logits, token_penalty, axis=1)

            # min dec length
            min_len_cond = layers.less_than(x=step_idx, y=min_len)

            def min_len_penalty():
                """Plus minimum length penalty."""
                return layers.elementwise_add(logits, eos_penalty, axis=1)

            def no_penalty():
                """No penalty."""
                return logits

            logits = layers.case([(min_len_cond, min_len_penalty)],
                                 default=no_penalty)

            # get probs
            probs = layers.softmax(logits / self.temperature)

            if self.decoding_strategy == "beam_search":
                topk_scores, topk_indices = layers.topk(input=probs,
                                                        k=beam_size)
            else:
                if self.decoding_strategy.startswith("sampling"):
                    sampling_ids = layers.sampling_id(probs, dtype="int")
                elif self.decoding_strategy.startswith("topk_sampling"):
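                    # Top-k sampling: keep only tokens whose probability reaches
                    # the k-th largest value, renormalize by the top-k mass, then
                    # draw a sample.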
                    topk_probs, _ = layers.topk(input=probs, k=self.topk)
                    ge_cond = layers.cast(
                        layers.greater_equal(
                            probs, layers.unsqueeze(topk_probs[:, -1], [1])),
                        "float32")
                    old_probs = probs
                    probs = probs * ge_cond / layers.reduce_sum(
                        topk_probs, dim=-1, keep_dim=True)
                    sampling_ids = layers.sampling_id(probs, dtype="int")
                    probs = old_probs
                elif self.decoding_strategy.startswith("topp_sampling"):
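                    # Top-p (nucleus) sampling: sort tokens by probability, keep the
                    # prefix whose cumulative mass stays below self.topp, renormalize
                    # and sample within it, then map the sampled position back to the
                    # original vocabulary index.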
                    sorted_probs, sorted_idx = layers.argsort(probs,
                                                              descending=True)
                    cum_sorted_probs = layers.cumsum(sorted_probs,
                                                     axis=1,
                                                     exclusive=True)
                    lt_cond = layers.cast(
                        layers.less_than(
                            cum_sorted_probs,
                            layers.fill_constant_batch_size_like(
                                cum_sorted_probs, cum_sorted_probs.shape,
                                cum_sorted_probs.dtype, self.topp)), "float32")
                    old_probs = probs
                    candidate_probs = sorted_probs * lt_cond
                    probs = candidate_probs / layers.reduce_sum(
                        candidate_probs, dim=-1, keep_dim=True)
                    sampling_ids = layers.sampling_id(probs, dtype="int")
                    sampling_ids = layers.index_sample(
                        sorted_idx, layers.unsqueeze(sampling_ids, [1]))
                    sampling_ids = layers.squeeze(sampling_ids, [1])
                    probs = old_probs
                else:
                    raise ValueError(self.decoding_strategy)

                sampling_scores = layers.one_hot(
                    layers.unsqueeze(sampling_ids, [1]), probs.shape[1])
                sampling_scores = sampling_scores * probs - (
                    1 - sampling_scores) * 1e3
                topk_scores, topk_indices = layers.topk(input=sampling_scores,
                                                        k=1)

            pre_len = layers.cast(step_idx, "float32")
            layers.increment(x=step_idx, value=1.0, in_place=True)
            cur_len = layers.cast(step_idx, "float32")

            # update scores
            if self.length_average:
                accu_scores = layers.elementwise_add(x=layers.log(topk_scores),
                                                     y=pre_scores * pre_len,
                                                     axis=0) / cur_len
            elif self.length_penalty > 0:
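                # GNMT-style length penalty: lp(len) = ((5 + len) / 6) ** length_penalty;
                # accumulated scores are normalized by the penalty of the new length.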
                pre_lp = layers.pow((5 + pre_len) / 6, self.length_penalty)
                cur_lp = layers.pow((5 + cur_len) / 6, self.length_penalty)
                accu_scores = layers.elementwise_add(x=layers.log(topk_scores),
                                                     y=pre_scores * pre_lp,
                                                     axis=0) / cur_lp
            else:
                accu_scores = layers.elementwise_add(x=layers.log(topk_scores),
                                                     y=pre_scores,
                                                     axis=0)
            topk_indices = layers.lod_reset(topk_indices, pre_ids)
            accu_scores = layers.lod_reset(accu_scores, pre_ids)
            selected_ids, selected_scores, gather_idx = layers.beam_search(
                pre_ids=pre_ids,
                pre_scores=pre_scores,
                ids=topk_indices,
                scores=accu_scores,
                beam_size=beam_size,
                end_id=self.eos_id,
                return_parent_idx=True)

            layers.array_write(selected_ids, i=step_idx, array=ids)
            layers.array_write(selected_scores, i=step_idx, array=scores)
            layers.array_write(pre_mask, i=step_idx, array=tgt_generation_mask)
            layers.array_write(pos_bias, i=step_idx, array=pos_biases)

            layers.assign(gather_idx, parent_idx)

            length_cond = layers.less_than(x=step_idx, y=max_len)
            finish_cond = layers.logical_not(layers.is_empty(x=selected_ids))
            layers.logical_and(x=length_cond, y=finish_cond, out=cond)

        finished_ids, finished_scores = layers.beam_search_decode(
            ids, scores, beam_size=beam_size, end_id=self.eos_id)

        predictions = {
            "finished_ids": finished_ids,
            "finished_scores": finished_scores,
            "token_ids": inputs["token_ids"],
            "data_id": inputs["data_id"]
        }
        return predictions
Exemple #27
0
    def _grammar_step(self, logits, next_cell_states, decode_states, actions,
                      gmr_mask):
        """Perform one decoding step under the grammar constraints.

        Args:
            logits (Variable): shape = [batch_size, beam_size, vocab_size]
            next_cell_states (Variable): NULL
            decode_states (StateWrapper): NULL

        Returns: TODO

        Raises: NULL

        """
        # decode token logits that conform to the grammar rules
        logits, valid_table_mask = self._output_layer(
            logits, actions, gmr_mask, decode_states.valid_table_mask)

        # initialize the vocab size
        self._vocab_size = logits.shape[-1]
        self._vocab_size_tensor = layers.fill_constant(shape=[1],
                                                       dtype='int64',
                                                       value=logits.shape[-1])

        # compute log probs and mask out the finished beams
        step_log_probs = layers.log(layers.softmax(logits))
        step_log_probs = self._mask_finished_probs(step_log_probs,
                                                   decode_states.finished)

        scores = layers.reshape(step_log_probs,
                                [-1, self._beam_size * self._vocab_size])
        topk_scores, topk_indices = layers.topk(input=scores,
                                                k=self._beam_size)
        topk_scores = layers.reshape(topk_scores, shape=[-1])
        topk_indices = layers.reshape(topk_indices, shape=[-1])

        # beam index corresponding to each top-k entry
        beam_indices = layers.elementwise_floordiv(topk_indices,
                                                   self._vocab_size_tensor)
        # token id corresponding to each top-k entry
        token_indices = layers.elementwise_mod(topk_indices,
                                               self._vocab_size_tensor)

        # regather step_log_probs according to where each top-k entry came from
        next_log_probs = nn_utils.batch_gather(
            layers.reshape(step_log_probs,
                           [-1, self._beam_size * self._vocab_size]),
            topk_indices)

        def _beam_gather(x, beam_indices):
            """reshape x to expose the beam dim, then gather entries by beam_indices
            Args:
                x (TYPE): NULL
            Returns: Variable
            """
            x = self.split_batch_beams(x)
            return nn_utils.batch_gather(x, beam_indices)

        next_cell_states = layers.utils.map_structure(
            lambda x: _beam_gather(x, beam_indices), next_cell_states)
        next_finished = _beam_gather(decode_states.finished, beam_indices)
        next_lens = _beam_gather(decode_states.lengths, beam_indices)

        next_lens = layers.elementwise_add(
            next_lens,
            layers.cast(layers.logical_not(next_finished), next_lens.dtype))
        next_finished = layers.logical_or(
            next_finished, layers.equal(token_indices, self._end_token_tensor))

        decode_output = OutputWrapper(topk_scores, token_indices, beam_indices)
        decode_states = StateWrapper(next_cell_states, next_log_probs,
                                     next_finished, next_lens,
                                     valid_table_mask)

        return decode_output, decode_states
Exemple #28
0
def transformer(
        src_vocab_size,
        trg_vocab_size,
        max_length,
        n_layer,
        n_head,
        d_key,
        d_value,
        d_model,
        d_inner_hid,
        dropout_rate,
        src_pad_idx,
        trg_pad_idx,
        pos_pad_idx, ):
    file_obj = fluid.layers.open_recordio_file(
        filename='./wmt16.recordio',
        shapes=[
            [batch_size * max_length, 1],
            [batch_size * max_length, 1],
            [batch_size * max_length, 1],
            [batch_size * max_length, 1],
            [batch_size, n_head, max_length, max_length],
            [batch_size, n_head, max_length, max_length],
            [batch_size, n_head, max_length, max_length],
            [batch_size * max_length, 1],
            [batch_size * max_length, 1],
        ],
        dtypes=[
            'int64',
            'int64',
            'int64',
            'int64',
            'float32',
            'float32',
            'float32',
            'int64',
            'float32',
        ],
        lod_levels=[0] * 9)

    src_word, src_pos, trg_word, trg_pos, src_slf_attn_bias, trg_slf_attn_bias, trg_src_attn_bias, gold, weights = fluid.layers.read_file(
        file_obj)

    enc_input = prepare_encoder(
        src_word,
        src_pos,
        src_vocab_size,
        d_model,
        src_pad_idx,
        max_length,
        dropout_rate, )
    enc_output = encoder(
        enc_input,
        src_slf_attn_bias,
        n_layer,
        n_head,
        d_key,
        d_value,
        d_model,
        d_inner_hid,
        dropout_rate, )

    dec_input = prepare_decoder(
        trg_word,
        trg_pos,
        trg_vocab_size,
        d_model,
        trg_pad_idx,
        max_length,
        dropout_rate, )
    dec_output = decoder(
        dec_input,
        enc_output,
        trg_slf_attn_bias,
        trg_src_attn_bias,
        n_layer,
        n_head,
        d_key,
        d_value,
        d_model,
        d_inner_hid,
        dropout_rate, )

    # TODO(guosheng): Share the weight matrix between the embedding layers and
    # the pre-softmax linear transformation.
    predict = layers.reshape(
        x=layers.fc(input=dec_output,
                    size=trg_vocab_size,
                    param_attr=fluid.initializer.Xavier(uniform=False),
                    bias_attr=False,
                    num_flatten_dims=2),
        shape=[-1, trg_vocab_size],
        act="softmax")

    cost = layers.cross_entropy(input=predict, label=gold)
    weighted_cost = cost * weights
    return layers.reduce_sum(weighted_cost)
Exemple #29
0
def beam_search_step(state, logits, eos_id, beam_width, is_first_step,
                     length_penalty):
    """logits.shape == [B*W, V]"""
    beam_size, vocab_size = logits.shape  # batch size is 1 in this hub module, so the first dim (bsz * beam_size) equals beam_size
    logits_np = logits.numpy()
    for i in range(beam_size):
        logits_np[i][17963] = 0  # suppress the [UNK] token by zeroing its logit
    logits = D.to_variable(logits_np)

    bsz, beam_width = state.log_probs.shape
    onehot_eos = L.cast(F.one_hot(L.ones([1], 'int64') * eos_id, vocab_size),
                        'int64')  #[1, V]

    probs = L.log(L.softmax(logits))  #[B*W, V]
    probs = mask_prob(probs, onehot_eos, state.finished)  #[B*W, V]
    allprobs = L.reshape(state.log_probs, [-1, 1]) + probs  #[B*W, V]

    not_finished = 1 - L.reshape(state.finished, [-1, 1])  #[B*W,1]
    not_eos = 1 - onehot_eos
    length_to_add = not_finished * not_eos  #[B*W,V]
    alllen = L.reshape(state.lengths, [-1, 1]) + length_to_add
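
    # Lengths grow by one only for unfinished beams on non-EOS tokens; scores and
    # lengths are then flattened to [batch, beam_width * vocab_size] so a single
    # top-k picks the best beam-token pairs across all beams.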

    allprobs = L.reshape(allprobs, [-1, beam_width * vocab_size])
    alllen = L.reshape(alllen, [-1, beam_width * vocab_size])
    allscore = hyp_score(allprobs, alllen, length_penalty)
    if is_first_step:
        allscore = L.reshape(
            allscore,
            [bsz, beam_width, -1])[:, 0, :]  # at the first step, only consider beam 0
    scores, idx = L.topk(allscore, k=beam_width)  #[B, W]
    next_beam_id = idx // vocab_size  #[B, W]
    next_word_id = idx % vocab_size

    gather_idx = L.concat([L.where(idx != -1)[:, :1],
                           L.reshape(idx, [-1, 1])], 1)
    next_probs = L.reshape(L.gather_nd(allprobs, gather_idx), idx.shape)
    next_len = L.reshape(L.gather_nd(alllen, gather_idx), idx.shape)

    gather_idx = L.concat(
        [L.where(next_beam_id != -1)[:, :1],
         L.reshape(next_beam_id, [-1, 1])], 1)
    next_finished = L.reshape(
        L.gather_nd(state.finished, gather_idx), state.finished.shape
    )  # gather the new beam state according to the new beam ids

    next_finished += L.cast(next_word_id == eos_id, 'int64')
    next_finished = L.cast(next_finished > 0, 'int64')

    next_state = BeamSearchState(log_probs=next_probs,
                                 lengths=next_len,
                                 finished=next_finished)
    output = BeamSearchOutput(scores=scores,
                              predicted_ids=next_word_id,
                              beam_parent_ids=next_beam_id)

    return output, next_state
Exemple #30
0
    def build(self):
        args = self.args
        emb_size = args.embed_size
        proj_size = args.embed_size
        hidden_size = args.hidden_size
        batch_size = args.batch_size
        num_layers = args.num_layers
        num_steps = args.num_steps

        lstm_outputs = []

        x_f = layers.data(name="x", shape=[1], dtype='int64', lod_level=1)
        y_f = layers.data(name="y", shape=[1], dtype='int64', lod_level=1)

        x_b = layers.data(name="x_r", shape=[1], dtype='int64', lod_level=1)
        y_b = layers.data(name="y_r", shape=[1], dtype='int64', lod_level=1)

        init_hiddens_ = layers.data(
            name="init_hiddens", shape=[1], dtype='float32')
        init_cells_ = layers.data(
            name="init_cells", shape=[1], dtype='float32')

        if args.debug:
            layers.Print(init_cells_, message='init_cells_', summarize=10)
            layers.Print(init_hiddens_, message='init_hiddens_', summarize=10)

        init_hiddens = layers.reshape(
            init_hiddens_, shape=[2 * num_layers, -1, proj_size])
        init_cells = layers.reshape(
            init_cells_, shape=[2 * num_layers, -1, hidden_size])

        init_hidden = layers.slice(
            init_hiddens, axes=[0], starts=[0], ends=[num_layers])
        init_cell = layers.slice(
            init_cells, axes=[0], starts=[0], ends=[num_layers])
        init_hidden_r = layers.slice(
            init_hiddens, axes=[0], starts=[num_layers],
            ends=[2 * num_layers])
        init_cell_r = layers.slice(
            init_cells, axes=[0], starts=[num_layers], ends=[2 * num_layers])

        if args.use_custom_samples:
            custom_samples = layers.data(
                name="custom_samples",
                shape=[args.n_negative_samples_batch + 1],
                dtype='int64',
                lod_level=1)
            custom_samples_r = layers.data(
                name="custom_samples_r",
                shape=[args.n_negative_samples_batch + 1],
                dtype='int64',
                lod_level=1)
            custom_probabilities = layers.data(
                name="custom_probabilities",
                shape=[args.n_negative_samples_batch + 1],
                dtype='float32',
                lod_level=1)
        else:
            custom_samples = None
            custom_samples_r = None
            custom_probabilities = None

        forward, fw_hiddens, fw_hiddens_ori, fw_cells, fw_projs = encoder(
            x_f,
            y_f,
            self.vocab_size,
            emb_size,
            init_hidden,
            init_cell,
            para_name='fw_',
            custom_samples=custom_samples,
            custom_probabilities=custom_probabilities,
            test_mode=self.test_mode,
            args=args)
        backward, bw_hiddens, bw_hiddens_ori, bw_cells, bw_projs = encoder(
            x_b,
            y_b,
            self.vocab_size,
            emb_size,
            init_hidden_r,
            init_cell_r,
            para_name='bw_',
            custom_samples=custom_samples_r,
            custom_probabilities=custom_probabilities,
            test_mode=self.test_mode,
            args=args)

        losses = layers.concat([forward[-1], backward[-1]])
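        # forward[-1] and backward[-1] are the per-token losses of the two
        # directions; the final training loss is their mean.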
        self.loss = layers.reduce_mean(losses)
        self.loss.permissions = True
        self.loss.persistable = True

        if args.debug:
            x_emb, projection, loss = forward
            layers.Print(init_cells, message='init_cells', summarize=10)
            layers.Print(init_hiddens, message='init_hiddens', summarize=10)
            layers.Print(init_cell, message='init_cell', summarize=10)
            layers.Print(y_b, message='y_b', summarize=10)
            layers.Print(x_emb, message='x_emb', summarize=10)
            layers.Print(projection, message='projection', summarize=10)
            layers.Print(losses, message='losses', summarize=320)
            layers.Print(self.loss, message='loss', summarize=320)
        self.grad_vars = [x_f, y_f, x_b, y_b, self.loss]
        self.grad_vars_name = ['x', 'y', 'x_r', 'y_r', 'final_loss']
        fw_vars_name = ['x_emb', 'proj', 'loss'] + [
            'init_hidden', 'init_cell'
        ] + ['rnn_out', 'rnn_out2', 'cell', 'cell2', 'xproj', 'xproj2']
        bw_vars_name = ['x_emb_r', 'proj_r', 'loss_r'] + [
            'init_hidden_r', 'init_cell_r'
        ] + [
            'rnn_out_r', 'rnn_out2_r', 'cell_r', 'cell2_r', 'xproj_r',
            'xproj2_r'
        ]
        fw_vars = forward + [init_hidden, init_cell
                             ] + fw_hiddens + fw_cells + fw_projs
        bw_vars = backward + [init_hidden_r, init_cell_r
                              ] + bw_hiddens + bw_cells + bw_projs
        for i in range(len(fw_vars_name)):
            self.grad_vars.append(fw_vars[i])
            self.grad_vars.append(bw_vars[i])
            self.grad_vars_name.append(fw_vars_name[i])
            self.grad_vars_name.append(bw_vars_name[i])
        if args.use_custom_samples:
            self.feed_order = [
                'x', 'y', 'x_r', 'y_r', 'custom_samples', 'custom_samples_r',
                'custom_probabilities'
            ]
        else:
            self.feed_order = ['x', 'y', 'x_r', 'y_r']
        self.last_hidden = [
            fluid.layers.sequence_last_step(input=x)
            for x in fw_hiddens_ori + bw_hiddens_ori
        ]
        self.last_cell = [
            fluid.layers.sequence_last_step(input=x)
            for x in fw_cells + bw_cells
        ]
        self.last_hidden = layers.concat(self.last_hidden, axis=0)
        self.last_hidden.persistable = True
        self.last_cell = layers.concat(self.last_cell, axis=0)
        self.last_cell.persistable = True
        if args.debug:
            layers.Print(self.last_cell, message='last_cell', summarize=10)
            layers.Print(self.last_hidden, message='last_hidden', summarize=10)
Exemple #31
0
    def beam_search(self,
                    src_word,
                    src_pos,
                    src_slf_attn_bias,
                    trg_word,
                    trg_src_attn_bias,
                    bos_id=0,
                    eos_id=1,
                    beam_size=4,
                    max_len=256):
        def expand_to_beam_size(tensor, beam_size):
            tensor = layers.reshape(tensor, [tensor.shape[0], 1] +
                                    list(tensor.shape[1:]))
            tile_dims = [1] * len(tensor.shape)
            tile_dims[1] = beam_size
            return layers.expand(tensor, tile_dims)

        def merge_batch_beams(tensor):
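            # Fold the beam dimension into the batch dimension so the decoder sees
            # [batch_size * beam_size, ...] tensors; split_batch_beams is the inverse.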
            var_dim_in_state = 2  # count in beam dim
            tensor = layers.transpose(
                tensor,
                list(range(var_dim_in_state, len(tensor.shape))) +
                list(range(0, var_dim_in_state)))

            tensor = layers.reshape(tensor, [0] *
                                    (len(tensor.shape) - var_dim_in_state) +
                                    [batch_size * beam_size])
            res = layers.transpose(
                tensor,
                list(
                    range((len(tensor.shape) + 1 - var_dim_in_state),
                          len(tensor.shape))) +
                list(range(0, (len(tensor.shape) + 1 - var_dim_in_state))))
            return res

        def split_batch_beams(tensor):
            var_dim_in_state = 1
            tensor = layers.transpose(
                tensor,
                list(range(var_dim_in_state, len(tensor.shape))) +
                list(range(0, var_dim_in_state)))
            tensor = layers.reshape(tensor, [0] *
                                    (len(tensor.shape) - var_dim_in_state) +
                                    [batch_size, beam_size])
            res = layers.transpose(
                tensor,
                list(
                    range((len(tensor.shape) - 1 - var_dim_in_state),
                          len(tensor.shape))) +
                list(range(0, (len(tensor.shape) - 1 - var_dim_in_state))))
            return res

        def mask_probs(probs, finished, noend_mask_tensor):
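            # For finished beams, force every continuation except EOS to -inf so
            # that their accumulated scores stay unchanged.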
            finished = layers.cast(finished, dtype=probs.dtype)
            probs = layers.elementwise_mul(layers.expand(
                layers.unsqueeze(finished, [2]), [1, 1, self.trg_vocab_size]),
                                           noend_mask_tensor,
                                           axis=-1) - layers.elementwise_mul(
                                               probs, (finished - 1), axis=0)
            return probs

        def gather(input, indices, batch_pos):
            topk_coordinates = fluid.layers.stack([batch_pos, indices], axis=2)
            return layers.gather_nd(input, topk_coordinates)

        # run encoder
        enc_output = self.encoder(src_word, src_pos, src_slf_attn_bias)
        batch_size = enc_output.shape[0]

        # constant number
        inf = float(1. * 1e7)
        max_len = (enc_output.shape[1] + 20) if max_len is None else max_len
        vocab_size_tensor = layers.fill_constant(shape=[1],
                                                 dtype="int64",
                                                 value=self.trg_vocab_size)
        end_token_tensor = to_variable(
            np.full([batch_size, beam_size], eos_id, dtype="int64"))
        noend_array = [-inf] * self.trg_vocab_size
        noend_array[eos_id] = 0
        noend_mask_tensor = to_variable(np.array(noend_array, dtype="float32"))
        batch_pos = layers.expand(
            layers.unsqueeze(
                to_variable(np.arange(0, batch_size, 1, dtype="int64")), [1]),
            [1, beam_size])
        predict_ids = []
        parent_ids = []
        ### initialize states of beam search ###
        log_probs = to_variable(
            np.array([[0.] + [-inf] * (beam_size - 1)] * batch_size,
                     dtype="float32"))

        finished = to_variable(
            np.full([batch_size, beam_size], 0, dtype="bool"))

        trg_word = layers.fill_constant(shape=[batch_size * beam_size, 1],
                                        dtype="int64",
                                        value=bos_id)

        trg_src_attn_bias = merge_batch_beams(
            expand_to_beam_size(trg_src_attn_bias, beam_size))
        enc_output = merge_batch_beams(
            expand_to_beam_size(enc_output, beam_size))

        # init states (caches) for transformer, need to be updated according to selected beam
        caches = [{
            "k":
            layers.fill_constant(
                shape=[batch_size, beam_size, self.n_head, 0, self.d_key],
                dtype=enc_output.dtype,
                value=0),
            "v":
            layers.fill_constant(
                shape=[batch_size, beam_size, self.n_head, 0, self.d_value],
                dtype=enc_output.dtype,
                value=0),
        } for i in range(self.n_layer)]

        for i in range(max_len):
            trg_pos = layers.fill_constant(shape=trg_word.shape,
                                           dtype="int64",
                                           value=i)
            caches = map_structure(merge_batch_beams,
                                   caches)  # TODO: modified for dygraph2static
            logits = self.decoder(trg_word, trg_pos, None, trg_src_attn_bias,
                                  enc_output, caches)
            caches = map_structure(split_batch_beams, caches)
            step_log_probs = split_batch_beams(
                fluid.layers.log(fluid.layers.softmax(logits)))

            step_log_probs = mask_probs(step_log_probs, finished,
                                        noend_mask_tensor)
            log_probs = layers.elementwise_add(x=step_log_probs,
                                               y=log_probs,
                                               axis=0)
            log_probs = layers.reshape(log_probs,
                                       [-1, beam_size * self.trg_vocab_size])
            scores = log_probs
            topk_scores, topk_indices = fluid.layers.topk(input=scores,
                                                          k=beam_size)
            beam_indices = fluid.layers.elementwise_floordiv(
                topk_indices, vocab_size_tensor)
            token_indices = fluid.layers.elementwise_mod(
                topk_indices, vocab_size_tensor)

            # update states
            caches = map_structure(
                lambda x: gather(x, beam_indices, batch_pos), caches)
            log_probs = gather(log_probs, topk_indices, batch_pos)
            finished = gather(finished, beam_indices, batch_pos)
            finished = layers.logical_or(
                finished, layers.equal(token_indices, end_token_tensor))
            trg_word = layers.reshape(token_indices, [-1, 1])

            predict_ids.append(token_indices)
            parent_ids.append(beam_indices)

            if layers.reduce_all(finished).numpy():
                break

        predict_ids = layers.stack(predict_ids, axis=0)
        parent_ids = layers.stack(parent_ids, axis=0)
        finished_seq = layers.transpose(
            layers.gather_tree(predict_ids, parent_ids), [1, 2, 0])
        finished_scores = topk_scores

        return finished_seq, finished_scores
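
The backtracking that layers.gather_tree performs above can be pictured with a small NumPy sketch (shapes and the helper name are illustrative assumptions): predict_ids and parent_ids are stacked per step as [num_steps, batch_size, beam_size], and each final beam is traced back through its parent pointers to recover the full token sequence.

import numpy as np

def gather_tree_np(predict_ids, parent_ids):
    # predict_ids / parent_ids: int arrays of shape [num_steps, batch, beam]
    num_steps, batch_size, beam_size = predict_ids.shape
    out = np.zeros_like(predict_ids)
    for b in range(batch_size):
        for k in range(beam_size):
            beam = k
            for t in reversed(range(num_steps)):
                out[t, b, k] = predict_ids[t, b, beam]
                beam = parent_ids[t, b, beam]
    return out

# transposing the result with axes [1, 2, 0] gives the same
# [batch, beam, num_steps] layout as finished_seq above
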
Exemple #32
0
def grammar_output(inputs,
                   actions,
                   gmr_mask,
                   last_col2tbl_mask,
                   decode_vocab,
                   grammar,
                   name=None,
                   column2table=None):
    """output logits according to grammar

    Args:
        inputs (Variable): shape = [batch_size, max_len, hidden_size]. At inference time max_len is always 1.
        actions (Variable): shape = [batch_size, max_len]. At inference time max_len is always 1.
        gmr_mask (Variable): shape = [batch_size, max_len, grammar_size]. At inference time max_len is always 1.
        last_col2tbl_mask (Variable): shape = [batch_size, max_len, max_table]. During decoding, when the previous
                                      step selected a column, this is the table mask corresponding to that column.
        decode_vocab (DecoderDynamicVocab): (table, table_len, column, column_len, value, value_len, column2table_mask).
                                            Here column2table_mask is the table mask in one-to-one correspondence with column.
        grammar (Grammar): NULL
        name (str): prefix for Variable names, used to share parameters across multiple calls.
                    Defaults to None, meaning parameters will not be shared.

    Returns: (Variable, Variable)
        output: output probabilities over the vocabulary
        valid_table_mask: only valid during inference

    Raises: NULL
    """
    batch_size = layers.shape(inputs)[0]
    max_len = inputs.shape[1]
    vocab_size = grammar.vocab_size

    action_shape = [batch_size, max_len]
    act_apply_rule = tensor.fill_constant(shape=action_shape,
                                          value=grammar.ACTION_APPLY,
                                          dtype='int64')
    act_stop = tensor.fill_constant(shape=action_shape,
                                    value=grammar.ACTION_STOP,
                                    dtype='int64')
    act_select_t = tensor.fill_constant(shape=action_shape,
                                        value=grammar.ACTION_SELECT_T,
                                        dtype='int64')
    act_select_c = tensor.fill_constant(shape=action_shape,
                                        value=grammar.ACTION_SELECT_C,
                                        dtype='int64')
    act_select_v = tensor.fill_constant(shape=action_shape,
                                        value=grammar.ACTION_SELECT_V,
                                        dtype='int64')
    cond_apply_rule = layers.logical_or(layers.equal(actions, act_apply_rule),
                                        layers.equal(actions, act_stop))
    cond_select_t = layers.equal(actions, act_select_t)
    cond_select_c = layers.equal(actions, act_select_c)
    cond_select_v = layers.equal(actions, act_select_v)

    # expand vocab to [-1, max_len, ...]
    if max_len == 1:
        expand_to_seq_len = lambda x: layers.unsqueeze(x, [1])
    else:
        expand_to_seq_len = lambda x: layers.expand(layers.unsqueeze(
            x, [1]), [1, max_len] + [1] * (len(x.shape) - 1))
    table_enc = expand_to_seq_len(decode_vocab.table)
    table_len = expand_to_seq_len(decode_vocab.table_len)
    column_enc = expand_to_seq_len(decode_vocab.column)
    column_len = expand_to_seq_len(decode_vocab.column_len)
    value_enc = expand_to_seq_len(decode_vocab.value)
    value_len = expand_to_seq_len(decode_vocab.value_len)
    column2table_mask = expand_to_seq_len(decode_vocab.column2table_mask)

    # merge batch & seq_len dim
    inputs = nn_utils.merge_first_ndim(inputs, n=2)
    actions = nn_utils.merge_first_ndim(actions, n=2)
    gmr_mask = nn_utils.merge_first_ndim(gmr_mask, n=2)
    last_col2tbl_mask = nn_utils.merge_first_ndim(last_col2tbl_mask, n=2)
    table_enc = nn_utils.merge_first_ndim(table_enc, n=2)
    table_len = nn_utils.merge_first_ndim(table_len, n=2)
    column_enc = nn_utils.merge_first_ndim(column_enc, n=2)
    column_len = nn_utils.merge_first_ndim(column_len, n=2)
    value_enc = nn_utils.merge_first_ndim(value_enc, n=2)
    value_len = nn_utils.merge_first_ndim(value_len, n=2)
    column2table_mask = nn_utils.merge_first_ndim(column2table_mask, n=2)
    cond_apply_rule = nn_utils.merge_first_ndim(cond_apply_rule, n=2)
    cond_select_t = nn_utils.merge_first_ndim(cond_select_t, n=2)
    cond_select_c = nn_utils.merge_first_ndim(cond_select_c, n=2)
    cond_select_v = nn_utils.merge_first_ndim(cond_select_v, n=2)

    t_ptr_net = models.PointerNetwork(score_type="affine",
                                      name='gmr_output_t_ptr')
    c_ptr_net = models.PointerNetwork(score_type="affine",
                                      name='gmr_output_c_ptr')
    v_ptr_net = models.PointerNetwork(score_type="affine",
                                      name='gmr_output_v_ptr')

    ## core processing logic ##
    apply_rule_output = _apply_rule(cond_apply_rule,
                                    inputs,
                                    gmr_mask,
                                    grammar,
                                    name=name)
    select_t_output = \
            _select_table(cond_select_t, inputs, table_enc, table_len, last_col2tbl_mask, t_ptr_net, grammar)
    select_c_output, valid_table_mask = \
            _select_column(cond_select_c, inputs, column_enc, column_len, c_ptr_net, grammar, column2table_mask)
    select_v_output = _select_value(cond_select_v, inputs, value_enc,
                                    value_len, v_ptr_net, grammar)

    output = fluider.elementwise_add(apply_rule_output,
                                     select_t_output,
                                     select_c_output,
                                     select_v_output,
                                     axis=0)
    output = layers.reshape(output, shape=[batch_size, max_len, vocab_size])
    return output, valid_table_mask
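
The four branch outputs above are combined by summation, which suggests each _apply_rule/_select_* branch zeroes out the positions where its action condition is false. A minimal NumPy sketch of that idea (all names and shapes here are hypothetical):

import numpy as np

def combine_branches(cond, scores_if_true, scores_if_false):
    # keep each branch's scores only where its condition holds, then sum;
    # every position is covered by exactly one of the two conditions
    mask = cond.astype("float32")[:, None]
    return scores_if_true * mask + scores_if_false * (1.0 - mask)

rule_scores = np.random.rand(4, 6).astype("float32")    # grammar-rule branch
table_scores = np.random.rand(4, 6).astype("float32")   # table-pointer branch
is_apply_rule = np.array([True, False, True, True])
combined = combine_branches(is_apply_rule, rule_scores, table_scores)
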
Exemple #33
0
def expand_to_beam_size(tensor, beam_size):
    tensor = layers.reshape(tensor, [tensor.shape[0], 1] +
                            list(tensor.shape[1:]))
    tile_dims = [1] * len(tensor.shape)
    tile_dims[1] = beam_size
    return layers.expand(tensor, tile_dims)
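
In NumPy terms (with made-up shapes), expand_to_beam_size inserts a beam axis and tiles along it, e.g. turning an encoder output of shape [batch_size, src_len, d_model] into [batch_size, beam_size, src_len, d_model]:

import numpy as np

enc = np.zeros([2, 5, 8], dtype="float32")   # [batch, src_len, d_model]
beam_size = 4
expanded = np.tile(enc[:, None], [1, beam_size] + [1] * (enc.ndim - 1))
assert expanded.shape == (2, 4, 5, 8)
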
def multi_head_attention(queries,
                         keys,
                         values,
                         attn_bias,
                         d_key,
                         d_value,
                         d_model,
                         input_mask,
                         n_head=1,
                         dropout_rate=0.,
                         cache=None,
                         param_initializer=None,
                         name='multi_head_att'):
    """
    Multi-Head Attention. Note that attn_bias is added to the logits before
    computing the softmax activation, so that the selected (masked) positions
    are not considered in the attention weights.
    """
    keys = queries if keys is None else keys
    values = keys if values is None else values

    def __compute_qkv(queries, keys, values, n_head, d_key, d_value):
        """
        Add linear projection to queries, keys, and values.
        """
        q = layers.fc(input=queries,
                      size=d_key * n_head,
                      num_flatten_dims=len(queries.shape) - 1,
                      param_attr=fluid.ParamAttr(
                          name=name + '_query_fc.w_0',
                          initializer=param_initializer),
                      bias_attr=name + '_query_fc.b_0')
        k = layers.fc(input=keys,
                      size=d_key * n_head,
                      num_flatten_dims=len(keys.shape) - 1,
                      param_attr=fluid.ParamAttr(
                          name=name + '_key_fc.w_0',
                          initializer=param_initializer),
                      bias_attr=name + '_key_fc.b_0')
        v = layers.fc(input=values,
                      size=d_value * n_head,
                      num_flatten_dims=len(values.shape) - 1,
                      param_attr=fluid.ParamAttr(
                          name=name + '_value_fc.w_0',
                          initializer=param_initializer),
                      bias_attr=name + '_value_fc.b_0')
        return q, k, v

    def __split_heads(x, n_head):
        """
        Reshape the last dimension of input tensor x so that it becomes two
        dimensions and then transpose. Specifically, transform a tensor with
        shape [bs, max_sequence_length, n_head * hidden_dim] into a tensor
        with shape [bs, n_head, max_sequence_length, hidden_dim].
        """
        hidden_size = x.shape[-1]
        # The value 0 in shape attr means copying the corresponding dimension
        # size of the input as the output dimension size.
        reshaped = layers.reshape(x=x,
                                  shape=[0, 0, n_head, hidden_size // n_head],
                                  inplace=True)

        # permute the dimensions into:
        # [batch_size, n_head, max_sequence_len, hidden_size_per_head]
        return layers.transpose(x=reshaped, perm=[0, 2, 1, 3])

    def __combine_heads(x):
        """
        Transpose and then reshape the last two dimensions of input tensor x
        so that they become one dimension, which is the reverse of __split_heads.
        """
        if len(x.shape) == 3: return x
        if len(x.shape) != 4:
            raise ValueError("Input(x) should be a 4-D Tensor.")
        trans_x = layers.transpose(x, perm=[0, 2, 1, 3])
        # The value 0 in shape attr means copying the corresponding dimension
        # size of the input as the output dimension size.
        #trans_x.desc.set_shape((-1, 1, n_head, d_value))
        return layers.reshape(x=trans_x, shape=[0, 0, d_model], inplace=True)

    q, k, v = __compute_qkv(queries, keys, values, n_head, d_key, d_value)
    q = to_3d(q)
    k = to_3d(k)
    v = to_3d(v)

    if cache is not None:  # use cache and concat time steps
        # Since the inplace reshape in __split_heads changes the shape of k and
        # v, which is the cache input for next time step, reshape the cache
        # input from the previous time step first.
        k = cache["k"] = layers.concat(
            [layers.reshape(cache["k"], shape=[0, 0, d_model]), k], axis=1)
        v = cache["v"] = layers.concat(
            [layers.reshape(cache["v"], shape=[0, 0, d_model]), v], axis=1)

    out, _ = sparse_scaled_dot_product_attention(q, k, v, input_mask,
                                                 dropout_rate, n_head, d_key,
                                                 d_value)

    out = to_2d(out)

    # Project back to the model size.
    proj_out = layers.fc(input=out,
                         size=d_model,
                         num_flatten_dims=len(out.shape) - 1,
                         param_attr=fluid.ParamAttr(
                             name=name + '_output_fc.w_0',
                             initializer=param_initializer),
                         bias_attr=name + '_output_fc.b_0')
    return proj_out, _
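
A minimal NumPy sketch of the attn_bias convention described in the docstring (the -1e9 constant and the shapes are assumptions): key positions beyond each example's valid length receive a large negative bias, so after the softmax their attention weights are effectively zero.

import numpy as np

def softmax(x, axis=-1):
    x = x - x.max(axis=axis, keepdims=True)
    e = np.exp(x)
    return e / e.sum(axis=axis, keepdims=True)

seq_lens = np.array([3, 2])                       # valid key lengths per example
max_len = 4
valid = np.arange(max_len)[None, :] < seq_lens[:, None]
attn_bias = np.where(valid, 0.0, -1e9)            # [batch, key_len]
logits = np.random.rand(2, max_len)               # scores for a single query
weights = softmax(logits + attn_bias)             # masked keys get ~0 weight
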