Example #1
import numpy as np
import paddle.fluid as fluid
import paddle.fluid.layers as layers


def fluid_sequence_pad(input, pad_value, maxlen=None):
    """
    args:
        input: (batch*seq_len, dim)
    returns:
        (batch, max_seq_len, dim)
    """
    pad_value = layers.cast(
        fluid.layers.assign(input=np.array([pad_value], 'float32')),
        input.dtype)
    input_padded, _ = layers.sequence_pad(
        input, pad_value, maxlen=maxlen)  # (batch, max_seq_len, 1), (batch, 1)
    # TODO: maxlen=300 was used as a workaround for https://github.com/PaddlePaddle/Paddle/issues/14164
    return input_padded
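A minimal usage sketch (not from the original source): it assumes the Fluid 1.x static-graph API and the imports above; the layer name, feature size, and sequence lengths are made up for illustration.

# hypothetical usage of fluid_sequence_pad on a LoD batch of two sequences
# (lengths 2 and 3, feature dim 4)
import numpy as np
import paddle.fluid as fluid

place = fluid.CPUPlace()
x = fluid.layers.data(name='x', shape=[4], dtype='float32', lod_level=1)
padded = fluid_sequence_pad(x, pad_value=0.0)    # -> (batch, max_seq_len, 4)

exe = fluid.Executor(place)
exe.run(fluid.default_startup_program())
lod_x = fluid.create_lod_tensor(
    np.random.rand(5, 4).astype('float32'), [[2, 3]], place)
out, = exe.run(feed={'x': lod_x}, fetch_list=[padded])
print(out.shape)                                 # expected: (2, 3, 4)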
Example #2
    def recv_func(msg):
        pad_value = L.assign(input=np.array([0.0], dtype=np.float32))

        output, length = L.sequence_pad(msg, pad_value, maxlen=max_neigh)
        mask = L.sequence_mask(length, dtype="float32", maxlen=max_neigh)
        mask = L.unsqueeze(mask, [2])
        input_mask = (L.matmul(mask, mask, transpose_y=True) - 1) * -10000
        for layer in range(num_layers):
            output = self_attention_and_residual(output,
                                                 hidden_size,
                                                 input_mask,
                                                 name="cross_feat_%s" % layer,
                                                 maxlen=max_neigh)
        return L.reduce_sum(output * mask, 1) / L.reduce_sum(mask, 1)
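The mask arithmetic above can be checked with plain numpy; the sketch below uses made-up lengths and a made-up max_neigh, and only reproduces the sequence_mask / pairwise-bias construction (how the bias is consumed inside self_attention_and_residual is not shown in the example).

# standalone numpy check of the pairwise mask construction (hypothetical values)
import numpy as np

max_neigh = 4
lengths = np.array([2, 4])                               # valid neighbours per sample
mask = (np.arange(max_neigh)[None, :] < lengths[:, None]).astype('float32')
mask = mask[:, :, None]                                  # (batch, max_neigh, 1), as after unsqueeze

pairwise = np.matmul(mask, mask.transpose(0, 2, 1))      # 1 where both positions are real
bias = (pairwise - 1.0) * -10000.0                       # 0 for real pairs, 10000 otherwise
print(bias[0])    # top-left 2x2 block is 0, every other entry is 10000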
Example #3
def knowledge_seq2seq(config):
    """ knowledge seq2seq """
    emb_size = config.embed_size
    hidden_size = config.hidden_size
    input_size = emb_size
    num_layers = config.num_layers
    bi_direc = config.bidirectional
    batch_size = config.batch_size
    vocab_size = config.vocab_size
    run_type = config.run_type

    enc_input = layers.data(name="enc_input",
                            shape=[1],
                            dtype='int64',
                            lod_level=1)  #enc_input --> goal
    enc_mask = layers.data(name="enc_mask", shape=[-1, 1], dtype='float32')
    goal_input = layers.data(name="goal_input",
                             shape=[1],
                             dtype='int64',
                             lod_level=1)  #goal_input --> x
    cue_input = layers.data(name="cue_input",
                            shape=[1],
                            dtype='int64',
                            lod_level=1)  #cue_input --> kg
    #cue_mask = layers.data(name='cue_mask', shape=[-1, 1], dtype='float32')
    memory_mask = layers.data(name='memory_mask',
                              shape=[-1, 1],
                              dtype='float32')
    tar_input = layers.data(name='tar_input',
                            shape=[1],
                            dtype='int64',
                            lod_level=1)  #tar_input --> y
    # tar_mask = layers.data(name="tar_mask", shape=[-1, 1], dtype='float32')

    rnn_hidden_size = hidden_size
    if bi_direc:
        rnn_hidden_size //= 2

    enc_out, enc_last_hidden = \
        rnn_encoder(enc_input, vocab_size, input_size, rnn_hidden_size, batch_size, num_layers, bi_direc,
                    dropout=0.0, batch_first=True, name="rnn_enc")
    goal_out, goal_last_hidden = \
        rnn_encoder(goal_input, vocab_size, input_size, rnn_hidden_size, batch_size, num_layers, bi_direc,
                    dropout=0.0, batch_first=True, name="rnn_enc1")
    context_goal_out = fluid.layers.concat(
        input=[enc_last_hidden, goal_last_hidden], axis=2)
    context_goal_out = layers.reshape(context_goal_out,
                                      shape=[-1, 1, rnn_hidden_size * 4])
    # context_goal_out = layers.squeeze(context_goal_out, axes=[1])
    context_goal_out = fluid.layers.fc(context_goal_out,
                                       size=rnn_hidden_size * 2,
                                       bias_attr=False)
    context_goal_out = layers.unsqueeze(context_goal_out, axes=[0])
    bridge_out = fc(context_goal_out, hidden_size, hidden_size, name="bridge")
    bridge_out = layers.tanh(bridge_out)

    cue_last_mask = layers.data(name='cue_last_mask',
                                shape=[-1],
                                dtype='float32')
    knowledge_out, knowledge_last_hidden = \
        rnn_encoder(cue_input, vocab_size, input_size, rnn_hidden_size, batch_size, num_layers, bi_direc,
                    dropout=0.0, batch_first=True, last_mask=cue_last_mask, name="knowledge_enc")

    query = layers.slice(bridge_out, axes=[0], starts=[0], ends=[1])
    query = layers.squeeze(query, axes=[0])
    query = layers.unsqueeze(query, axes=[1])
    query = layers.reshape(query, shape=[batch_size, -1, hidden_size])
    cue_memory = layers.slice(knowledge_last_hidden,
                              axes=[0],
                              starts=[0],
                              ends=[1])
    cue_memory = layers.reshape(cue_memory,
                                shape=[batch_size, -1, hidden_size])
    memory_mask = layers.reshape(memory_mask, shape=[batch_size, 1, -1])

    weighted_cue, cue_att = dot_attention(query, cue_memory, mask=memory_mask)

    cue_att = layers.reshape(cue_att, shape=[batch_size, -1])

    knowledge = weighted_cue
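    # when use_posterior is on (training), a posterior attention over the same knowledge
    # memory is computed from the target response below and replaces the prior-weighted knowledge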
    if config.use_posterior:
        print("config.use_posterior", config.use_posterior)
        target_out, target_last_hidden = \
            rnn_encoder(tar_input, vocab_size, input_size, rnn_hidden_size, batch_size, num_layers, bi_direc,
                        dropout=0.0, batch_first=True, name="knowledge_enc1")
        target_goal_out = fluid.layers.concat(
            input=[target_last_hidden, goal_last_hidden], axis=2)
        target_goal_out = layers.reshape(target_goal_out,
                                         shape=[-1, 1, rnn_hidden_size * 4])
        # target_goal_out = layers.squeeze(target_goal_out, axes=[1])
        target_goal_out = fluid.layers.fc(target_goal_out,
                                          size=rnn_hidden_size * 2,
                                          bias_attr=False)
        target_goal_out = layers.unsqueeze(target_goal_out, axes=[0])

        # get attention
        # target_query = layers.slice(target_last_hidden, axes=[0], starts=[0], ends=[1])
        target_query = layers.slice(target_goal_out,
                                    axes=[0],
                                    starts=[0],
                                    ends=[1])
        target_query = layers.squeeze(target_query, axes=[0])
        target_query = layers.unsqueeze(target_query, axes=[1])
        target_query = layers.reshape(target_query,
                                      shape=[batch_size, -1, hidden_size])

        weight_target, target_att = dot_attention(target_query,
                                                  cue_memory,
                                                  mask=memory_mask)
        target_att = layers.reshape(target_att, shape=[batch_size, -1])
        # add to output
        knowledge = weight_target

    enc_memory_mask = layers.data(name="enc_memory_mask",
                                  shape=[-1, 1],
                                  dtype='float32')
    enc_memory_mask = layers.unsqueeze(enc_memory_mask, axes=[1])
    # decoder init_hidden, enc_memory, enc_mask
    dec_init_hidden = bridge_out
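    # pad the variable-length (LoD) encoder outputs into a dense batch-major memory
    # tensor so the decoder attention sees fixed-shape states at every step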
    pad_value = fluid.layers.assign(input=np.array([0.0], dtype='float32'))

    enc_memory, origl_len_1 = layers.sequence_pad(x=enc_out,
                                                  pad_value=pad_value)
    enc_memory.persistable = True

    gru_unit = GRU_unit(input_size + hidden_size,
                        hidden_size,
                        num_layers=num_layers,
                        dropout=0.0,
                        name="decoder_gru_unit")

    cue_gru_unit = GRU_unit(hidden_size + hidden_size,
                            hidden_size,
                            num_layers=num_layers,
                            dropout=0.0,
                            name="decoder_cue_gru_unit")

    tgt_vocab_size = config.vocab_size
    if run_type == "train":
        if config.use_bow:
            bow_logits = fc(knowledge,
                            hidden_size,
                            hidden_size,
                            name='bow_fc_1')
            bow_logits = layers.tanh(bow_logits)
            bow_logits = fc(bow_logits,
                            hidden_size,
                            tgt_vocab_size,
                            name='bow_fc_2')
            bow_logits = layers.softmax(bow_logits)
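            # bag-of-words loss: the same knowledge-conditioned distribution is scored
            # against every target token position and masked by bow_mask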

            bow_label = layers.data(name='bow_label',
                                    shape=[-1, config.max_len],
                                    dtype='int64')
            bow_mask = layers.data(name="bow_mask",
                                   shape=[-1, config.max_len],
                                   dtype='float32')

            bow_logits = layers.expand(bow_logits, [1, config.max_len, 1])
            bow_logits = layers.reshape(bow_logits, shape=[-1, tgt_vocab_size])
            bow_label = layers.reshape(bow_label, shape=[-1, 1])
            bow_loss = layers.cross_entropy(bow_logits,
                                            bow_label,
                                            soft_label=False)
            bow_loss = layers.reshape(bow_loss, shape=[-1, config.max_len])

            bow_loss *= bow_mask
            bow_loss = layers.reduce_sum(bow_loss, dim=[1])
            bow_loss = layers.reduce_mean(bow_loss)

        dec_input = layers.data(name="dec_input",
                                shape=[-1, 1, 1],
                                dtype='int64')
        dec_mask = layers.data(name="dec_mask", shape=[-1, 1], dtype='float32')

        dec_knowledge = weight_target

        knowledge_goal_out = fluid.layers.concat(
            input=[dec_knowledge, target_query], axis=2)
        knowledge_goal_out = layers.reshape(knowledge_goal_out,
                                            shape=[-1, 1, rnn_hidden_size * 4])
        # knowledge_goal_out = layers.squeeze(knowledge_goal_out, axes=[1])
        knowledge_goal_out = fluid.layers.fc(knowledge_goal_out,
                                             size=rnn_hidden_size * 2,
                                             bias_attr=False)
        knowledge_goal_out = layers.unsqueeze(knowledge_goal_out, axes=[0])

        decoder_logits = \
            rnn_decoder(gru_unit, cue_gru_unit, dec_input, input_size, hidden_size, num_layers,
                         enc_memory, enc_memory_mask, dec_knowledge, vocab_size,
                         init_hidden=dec_init_hidden, mask=dec_mask, dropout=config.dropout)

        target_label = layers.data(name='target_label',
                                   shape=[-1, 1],
                                   dtype='int64')
        target_mask = layers.data(name='target_mask',
                                  shape=[-1, 1],
                                  dtype='float32')

        decoder_logits = layers.reshape(decoder_logits,
                                        shape=[-1, tgt_vocab_size])
        target_label = layers.reshape(target_label, shape=[-1, 1])

        nll_loss = layers.cross_entropy(decoder_logits,
                                        target_label,
                                        soft_label=False)
        nll_loss = layers.reshape(nll_loss, shape=[batch_size, -1])
        nll_loss *= target_mask
        nll_loss = layers.reduce_sum(nll_loss, dim=[1])
        nll_loss = layers.reduce_mean(nll_loss)
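        # the KL term below pushes the prior knowledge attention (cue_att) towards the
        # posterior attention (target_att), which is treated as a constant via stop_gradient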

        prior_attn = cue_att + 1e-10
        posterior_att = target_att
        posterior_att.stop_gradient = True

        prior_attn = layers.log(prior_attn)

        kl_loss = posterior_att * (layers.log(posterior_att + 1e-10) -
                                   prior_attn)
        kl_loss = layers.reduce_mean(kl_loss)

        kl_and_nll_factor = layers.data(name='kl_and_nll_factor',
                                        shape=[1],
                                        dtype='float32')
        kl_and_nll_factor = layers.reshape(kl_and_nll_factor, shape=[-1])

        final_loss = bow_loss + kl_loss * kl_and_nll_factor + nll_loss * kl_and_nll_factor

        return [bow_loss, kl_loss, nll_loss, final_loss]

    elif run_type == "test":
        beam_size = config.beam_size
        batch_size = config.batch_size
        token = layers.fill_constant(shape=[batch_size * beam_size, 1],
                                     value=config.bos_id,
                                     dtype='int64')

        token = layers.reshape(token, shape=[-1, 1])
        max_decode_len = config.max_dec_len

        dec_knowledge = knowledge
        INF = 100000000.0

        init_score_np = np.ones([beam_size * batch_size],
                                dtype='float32') * -INF

        for i in range(batch_size):
            init_score_np[i * beam_size] = 0.0

        pre_score = layers.assign(init_score_np)

        pos_index_np = np.arange(batch_size).reshape(-1, 1)
        pos_index_np = \
            np.tile(pos_index_np, (1, beam_size)).reshape(-1).astype('int32') * beam_size

        pos_index = layers.assign(pos_index_np)

        id_array = []
        score_array = []
        index_array = []
        init_enc_memory = layers.expand(enc_memory, [1, beam_size, 1])
        init_enc_memory = layers.reshape(
            init_enc_memory, shape=[batch_size * beam_size, -1, hidden_size])
        init_enc_mask = layers.expand(enc_memory_mask, [1, beam_size, 1])
        init_enc_mask = layers.reshape(init_enc_mask,
                                       shape=[batch_size * beam_size, 1, -1])

        dec_knowledge = layers.reshape(dec_knowledge,
                                       shape=[-1, 1, hidden_size])
        init_dec_knowledge = layers.expand(dec_knowledge, [1, beam_size, 1])
        init_dec_knowledge = layers.reshape(
            init_dec_knowledge,
            shape=[batch_size * beam_size, -1, hidden_size])

        dec_init_hidden = layers.expand(dec_init_hidden, [1, 1, beam_size])
        dec_init_hidden = layers.reshape(dec_init_hidden,
                                         shape=[1, -1, hidden_size])

        length_average = config.length_average
        UNK = config.unk_id
        EOS = config.eos_id
        for i in range(1, max_decode_len + 1):
            dec_emb = get_embedding(token, input_size, vocab_size)
            dec_out, dec_last_hidden = \
                decoder_step(gru_unit, cue_gru_unit,
                             dec_emb, dec_init_hidden, input_size, hidden_size,
                             init_enc_memory, init_enc_mask, init_dec_knowledge, mask=None)
            output_in_size = hidden_size + hidden_size

            rnnout = layers.dropout(dec_out,
                                    dropout_prob=config.dropout,
                                    is_test=True)
            rnnout = fc(rnnout,
                        output_in_size,
                        hidden_size,
                        name='dec_out_fc1')
            rnnout = fc(rnnout, hidden_size, vocab_size, name='dec_out_fc2')

            log_softmax_output = log_softmax(rnnout)
            log_softmax_output = layers.squeeze(log_softmax_output, axes=[1])
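            # combine the step log-probs with the running beam scores: a length-normalized
            # running average if length_average is set, otherwise a plain sum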

            if i > 1:
                if length_average:
                    log_softmax_output = layers.elementwise_add(
                        (log_softmax_output / i),
                        (pre_score * (1.0 - 1.0 / i)),
                        axis=0)
                else:
                    log_softmax_output = layers.elementwise_add(
                        log_softmax_output, pre_score, axis=0)
            else:
                log_softmax_output = layers.elementwise_add(log_softmax_output,
                                                            pre_score,
                                                            axis=0)

            log_softmax_output = layers.reshape(log_softmax_output,
                                                shape=[batch_size, -1])

            topk_score, topk_index = layers.topk(log_softmax_output,
                                                 k=beam_size)
            topk_score = layers.reshape(topk_score, shape=[-1])
            topk_index = layers.reshape(topk_index, shape=[-1])

            vocab_var = layers.fill_constant([1],
                                             dtype='int64',
                                             value=vocab_size)
            new_token = topk_index % vocab_var

            index = topk_index // vocab_var
            id_array.append(new_token)
            index_array.append(index)
            index = index + pos_index

            score_array.append(topk_score)
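            # hypotheses that just produced EOS or UNK get a large negative running score,
            # so they are effectively dropped from further expansion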

            eos_ids = layers.fill_constant([beam_size * batch_size],
                                           dtype='int64',
                                           value=EOS)
            unk_ids = layers.fill_constant([beam_size * batch_size],
                                           dtype='int64',
                                           value=UNK)
            eos_eq = layers.cast(layers.equal(new_token, eos_ids),
                                 dtype='float32')

            topk_score += eos_eq * -100000000.0

            unk_eq = layers.cast(layers.equal(new_token, unk_ids),
                                 dtype='float32')
            topk_score += unk_eq * -100000000.0

            # update
            token = new_token
            pre_score = topk_score
            token = layers.reshape(token, shape=[-1, 1])

            index = layers.cast(index, dtype='int32')
            dec_last_hidden = layers.squeeze(dec_last_hidden, axes=[0])
            dec_init_hidden = layers.gather(dec_last_hidden, index=index)
            dec_init_hidden = layers.unsqueeze(dec_init_hidden, axes=[0])
            init_enc_memory = layers.gather(init_enc_memory, index)
            init_enc_mask = layers.gather(init_enc_mask, index)
            init_dec_knowledge = layers.gather(init_dec_knowledge, index)

        final_score = layers.concat(score_array, axis=0)
        final_ids = layers.concat(id_array, axis=0)
        final_index = layers.concat(index_array, axis=0)

        final_score = layers.reshape(
            final_score, shape=[max_decode_len, beam_size * batch_size])
        final_ids = layers.reshape(
            final_ids, shape=[max_decode_len, beam_size * batch_size])
        final_index = layers.reshape(
            final_index, shape=[max_decode_len, beam_size * batch_size])

        return final_score, final_ids, final_index
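The per-step index bookkeeping above (token id = topk_index % vocab_size, parent beam = topk_index // vocab_size, then an offset by pos_index before gathering the beam states) can be verified with plain numpy; the sizes and indices below are made up and independent of the model.

# hypothetical check: batch_size=2, beam_size=3, vocab_size=10
import numpy as np

batch_size, beam_size, vocab_size = 2, 3, 10
# per-batch top-k indices over the flattened (beam_size * vocab_size) candidates
topk_index = np.array([[4, 13, 27],     # batch 0
                       [9, 22, 16]])    # batch 1
topk_index = topk_index.reshape(-1)     # as after layers.reshape(..., shape=[-1])

new_token = topk_index % vocab_size     # token id within the vocabulary
parent = topk_index // vocab_size       # which beam the candidate came from

pos_index = (np.tile(np.arange(batch_size).reshape(-1, 1), (1, beam_size))
             .reshape(-1).astype('int32') * beam_size)
gather_index = parent + pos_index       # row to gather from the (batch*beam) states

print(new_token)      # [4 3 7 9 2 6]
print(parent)         # [0 1 2 0 2 1]
print(gather_index)   # [0 1 2 3 5 4]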
Example #4
    def __init__(self, embedding_dim, encoder_size, decoder_size,
                 source_dict_dim, target_dict_dim, tag_dict_dim, is_generating,
                 beam_size, max_length, source_entity_dim, source_pos_dim,
                 embedding_entity_dim, embedding_pos_dim, end_id):
        # The encoding process. Encodes the input words into tensors.
        self.encoder_size = encoder_size
        self.decoder_size = decoder_size
        self.embedding_dim = embedding_dim
        self.is_generating = is_generating
        self.source_dict_dim = source_dict_dim
        self.target_dict_dim = target_dict_dim
        self.tag_dict_dim = tag_dict_dim
        self.max_length = max_length
        self.end_id = end_id
        self.beam_size = beam_size
        self.no_grad_set = []

        self.dropout_prob = 0.5

        src_word_idx = fluid.layers.data(name='source_sequence',
                                         shape=[1],
                                         dtype='int64',
                                         lod_level=1)
        # print(src_word_idx.shape)
        self.src_word_idx = src_word_idx

        src_embedding = fluid.layers.embedding(
            input=src_word_idx,
            size=[source_dict_dim, embedding_dim],
            dtype='float32',
            param_attr=fluid.ParamAttr(name='emb'))

        src_entity_idx = fluid.layers.data(name='source_entities',
                                           shape=[1],
                                           dtype='int64',
                                           lod_level=1)

        entity_embedding = fluid.layers.embedding(
            input=src_entity_idx,
            size=[source_entity_dim, embedding_entity_dim],
            dtype='float32')

        src_pos_idx = fluid.layers.data(name='source_pos',
                                        shape=[1],
                                        dtype='int64',
                                        lod_level=1)

        pos_embedding = fluid.layers.embedding(
            input=src_pos_idx,
            size=[source_pos_dim, embedding_pos_dim],
            dtype='float32')
        # print(src_embedding)
        # print(entity_embedding)
        # print(pos_embedding)

        embeddings = fluid.layers.concat(
            input=[src_embedding, entity_embedding, pos_embedding], axis=1)
        # print(embeddings)
        # if not is_generating:
        #     embeddings = fluid.layers.dropout(
        #                     embeddings, dropout_prob=self.dropout_prob)

        src_forward, src_reversed = self.bi_lstm_encoder(
            input_seq=embeddings, gate_size=encoder_size)

        encoded_vector = fluid.layers.concat(input=[src_forward, src_reversed],
                                             axis=1)
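        # the sequence_pad below turns the variable-length encoder states into a dense
        # (batch, max_length, encoder_size * 2) tensor plus per-sequence lengths; the
        # op name "copy_score_padding" suggests it feeds a copy-score computation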

        pad_zero = pd.fill_constant(shape=[self.encoder_size * 2],
                                    dtype='float32',
                                    value=0)
        encoded_vector_full, encoded_vector_length = pd.sequence_pad(
            encoded_vector,
            pad_zero,
            maxlen=self.max_length,
            name="copy_score_padding")
        print(encoded_vector_full)

        # if not is_generating:
        #     encoded_vector = fluid.layers.dropout(
        #                     encoded_vector, dropout_prob=self.dropout_prob)
        self.encoder_vec = encoded_vector
        self.encoder_vec_full = encoded_vector_full

        encoded_proj = fluid.layers.fc(input=encoded_vector,
                                       size=decoder_size,
                                       bias_attr=False)
        self.encoder_proj = encoded_proj

        backward_first = fluid.layers.sequence_pool(input=src_reversed,
                                                    pool_type='first')
        decoder_boot = fluid.layers.fc(input=backward_first,
                                       size=decoder_size,
                                       bias_attr=False,
                                       act='tanh')
        cell_init = fluid.layers.fill_constant_batch_size_like(
            input=decoder_boot,
            value=1.0,
            shape=[-1, decoder_size],
            dtype='float32')
        # cell_init.stop_gradient = False
        cell_init.stop_gradient = True

        # Create a RNN state cell by providing the input and hidden states, and
        # specifies the hidden state as output.
        # h = InitState(init=decoder_boot, need_reorder=True)
        self.h = decoder_boot
        self.c = cell_init

        event_cla_id = fluid.layers.data(name='event_class',
                                         shape=[1],
                                         dtype='int64')

        self.event_embedding = fluid.layers.embedding(
            input=event_cla_id,
            size=[self.tag_dict_dim, embedding_entity_dim],
            dtype='float32')

        # self.decoder_lstm = fluid.contrib.layers.BasicLSTMUnit(
        #     "decoder_lstm",
        #     self.decoder_size,
        #     fluid.ParamAttr(initializer=fluid.initializer.UniformInitializer(
        #         low=-0.1, high=0.1)),
        #     fluid.ParamAttr(initializer=fluid.initializer.Constant(0.0)), )

        #####
        # DECODER
        #####
        label = fluid.layers.data(name='label_sequence',
                                  shape=[1],
                                  dtype='int64',
                                  lod_level=1)
        if not is_generating:
            rnn_out = self.train_decoder(decoder_boot)
            predict_label = fluid.layers.argmax(x=rnn_out, axis=1)
            # print(label.shape)
            # print(rnn_out.shape)
            # print(predict_label.shape)
            cost = fluid.layers.cross_entropy(input=rnn_out, label=label)
            avg_cost = fluid.layers.mean(x=cost)
            self.predict = rnn_out
            self.label = predict_label
            self.avg_cost = avg_cost
            feeding_list = [
                "source_sequence", "source_entities", "source_pos",
                "event_class", "source_index", "target_sequence",
                "label_sequence"
            ]
            # return avg_cost, feeding_list

            self.feeding_list = feeding_list
        else:
            # rnn_out = self.train_decoder(decoder_boot)
            # translation_ids = fluid.layers.argmax(x=rnn_out, axis=-1)

            beam_search_out = self.decoder(decoder_boot)
            translation_ids, translation_scores = beam_search_out
            feeding_list = [
                "source_sequence", "source_entities", "source_pos",
                "event_class", "source_index", "label_sequence"
            ]
            # feeding_list = ["source_sequence", "source_entities",
            #         "source_pos", "target_sequence"]
            # feeding_list = ["source_sequence", "source_entities",
            #         "source_pos", "target_sequence", "label_sequence"]

            # return translation_ids, translation_scores, feeding_list
            self.translation_ids = translation_ids
            self.translation_scores = translation_scores
            self.feeding_list = feeding_list

        self.no_grad_set = set(self.no_grad_set)