Example #1
def simple_rnn(rnn_input,
               init_hidden,
               hidden_size,
               kernel_param_attr=None,
               recurrent_param_attr=None,
               bias_attr=None,
               act='relu',
               sequence_length=None,
               name='simple_rnn'):

    # Transpose (sequence x batch x hidden)
    rnn_input = layers.transpose(rnn_input, [1, 0, 2])

    # Generate Mask
    mask = None
    if sequence_length:
        max_seq_len = layers.shape(rnn_input)[0]
        mask = layers.sequence_mask(sequence_length,
                                    maxlen=max_seq_len,
                                    dtype='float32')
        mask = layers.transpose(mask, [1, 0])

    # Init
    simple_rnn = SimpleRNN_unit(rnn_input, hidden_size, kernel_param_attr,
                                recurrent_param_attr, bias_attr, act)

    rnn = PaddingRNN()
    with rnn.step():
        step_in = rnn.step_input(rnn_input)

        if mask:
            step_mask = rnn.step_input(mask)

        if init_hidden:
            pre_hidden = rnn.memory(init=init_hidden)
        else:
            pre_hidden = rnn.memory(batch_ref=rnn_input,
                                    shape=[-1, hidden_size])

        last_hidden = simple_rnn(step_in, pre_hidden)

        rnn.update_memory(pre_hidden, last_hidden)

        rnn.step_output(last_hidden)

        step_input = last_hidden

    rnn_out = rnn()

    last_hidden = rnn_out[-1]
    last_hidden = layers.reshape(last_hidden, shape=[1, -1, hidden_size])

    rnn_output = layers.transpose(rnn_out, [1, 0, 2])
    last_hidden = layers.transpose(last_hidden, [1, 0, 2])

    return rnn_output, last_hidden
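
A minimal call sketch for the helper above, assuming PaddlePaddle fluid 1.x and that SimpleRNN_unit and PaddingRNN are defined/imported as in the surrounding code; the tensor names and sizes are illustrative only.

import paddle.fluid as fluid

# Hypothetical inputs: a batch of padded sequences with 128 features per step.
x = fluid.data(name='x', shape=[None, 20, 128], dtype='float32')    # (batch, seq, feature)
x_len = fluid.data(name='x_len', shape=[None], dtype='int64')       # true length of each sequence

# rnn_out: step-wise hidden states, last_hidden: final state per sample.
rnn_out, last_hidden = simple_rnn(x, init_hidden=None, hidden_size=256,
                                  sequence_length=x_len)
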
Example #2
    def get_single_direction_output(rnn_input,
                                    encode_hidden,
                                    unit_list,
                                    mask=None,
                                    direc_index=0):
        rnn = StaticRNN()
        #print(rnn_input.shape)
        with rnn.step():
            step_input = rnn.step_input(rnn_input)

            if mask:
                step_mask = rnn.step_input(mask)

            for i in range(num_layers):
                if init_hidden:
                    pre_hidden = rnn.memory(init=init_hidden[i, direc_index])
                else:
                    pre_hidden = rnn.memory(batch_ref=rnn_input,
                                            shape=[-1, hidden_size],
                                            ref_batch_dim_idx=1)
                encode_h = encode_hidden[i]
                pre_encode_hidden = layers.concat([pre_hidden, encode_h], axis=1)
                new_hidden = unit_list[i](step_input, pre_encode_hidden)

                if mask:
                    new_hidden = layers.elementwise_mul(
                        new_hidden, step_mask, axis=0) - layers.elementwise_mul(
                        pre_hidden, (step_mask - 1), axis=0)
                rnn.update_memory(pre_hidden, new_hidden)

                rnn.step_output(new_hidden)

                step_input = new_hidden
                if dropout_prob is not None and dropout_prob > 0.0:
                    step_input = layers.dropout(step_input, dropout_prob=dropout_prob)

            rnn.step_output(step_input)

        rnn_out = rnn()

        last_hidden_array = []
        all_hidden_array = []  # added to collect the hidden states of all layers
        rnn_output = rnn_out[-1]

        for i in range(num_layers):
            last_hidden = rnn_out[i]
            all_hidden_array.append(last_hidden)
            last_hidden = last_hidden[-1]
            last_hidden_array.append(last_hidden)

        all_hidden_array = layers.concat(all_hidden_array, axis=0)
        all_hidden_array = layers.reshape(all_hidden_array, shape=[num_layers, input.shape[0], -1, hidden_size])
        last_hidden_output = layers.concat(last_hidden_array, axis=0)
        last_hidden_output = layers.reshape(last_hidden_output, shape=[num_layers, -1, hidden_size])

        return rnn_output, last_hidden_output, all_hidden_array
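
The mask arithmetic above is a convex blend: new*mask - pre*(mask - 1) equals mask*new + (1 - mask)*pre, so time steps whose mask is 0 (padding) simply carry the previous hidden state forward. A small NumPy check of that identity (values are illustrative):

import numpy as np

new_hidden = np.array([[1.0, 2.0], [3.0, 4.0]])
pre_hidden = np.array([[9.0, 9.0], [9.0, 9.0]])
mask = np.array([[1.0], [0.0]])      # second sample is padding at this step

blended = new_hidden * mask - pre_hidden * (mask - 1)
assert np.allclose(blended, mask * new_hidden + (1 - mask) * pre_hidden)
print(blended)                       # [[1. 2.] [9. 9.]]
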
def rnn_decoder(gru_unit,
                cue_gru_unit,
                input,
                input_size,
                hidden_size,
                num_layers,
                memory,
                memory_mask,
                knowledge,
                output_size,
                init_hidden=None,
                mask=None,
                dropout=0.0,
                batch_first=True,
                name="decoder"):
    """ rnn decoder """
    input_emb = get_embedding(input, input_size, output_size)
    if batch_first:
        input_emb = layers.transpose(input_emb, perm=[1, 0, 2])
        if mask:
            trans_mask = layers.transpose(mask, perm=[1, 0])

    rnn = PaddingRNN()
    with rnn.step():
        step_in = rnn.step_input(input_emb)
        step_mask = None

        if mask:
            step_mask = rnn.step_input(trans_mask)

        # split pre_hidden
        pre_hidden_list = []

        pre_hidden = rnn.memory(init=init_hidden)
        real_out, last_hidden = \
            decoder_step(gru_unit, cue_gru_unit, step_in, pre_hidden, input_size,
                         hidden_size, memory, memory_mask, knowledge, mask=step_mask)

        rnn.update_memory(pre_hidden, last_hidden)

        step_in = layers.squeeze(real_out, axes=[1])
        rnn.step_output(step_in)

    rnnout = rnn()
    rnnout = layers.transpose(rnnout, perm=[1, 0, 2])
    rnnout = layers.elementwise_mul(rnnout, mask, axis=0)

    output_in_size = hidden_size + hidden_size
    rnnout = layers.dropout(rnnout, dropout_prob=dropout)
    rnnout = fc(rnnout, output_in_size, hidden_size, name='dec_out_fc1')
    rnnout = fc(rnnout, hidden_size, output_size, name='dec_out_fc2')

    softmax_out = layers.softmax(rnnout)

    return softmax_out
def gru_rnn(input,
            input_size,
            hidden_size,
            init_hidden=None,
            batch_first=False,
            mask=None,
            num_layers=1,
            dropout=0.0,
            name="gru"):
    """ gru rnn """

    gru_unit = GRU_unit(input_size,
                        hidden_size,
                        num_layers=num_layers,
                        dropout=dropout,
                        name=name + "_gru_unit")

    if batch_first:
        input = layers.transpose(x=input, perm=[1, 0, 2])
        if mask:
            mask = layers.transpose(mask, perm=[1, 0])

    rnn = PaddingRNN()
    with rnn.step():
        step_in = rnn.step_input(input)
        step_mask = None

        if mask:
            step_mask = rnn.step_input(mask)

        pre_hidden = rnn.memory(init=init_hidden)
        new_hidden, last_hidden = gru_unit(step_in, pre_hidden, step_mask)
        rnn.update_memory(pre_hidden, last_hidden)
        step_in = new_hidden
        rnn.step_output(step_in)
        rnn.step_output(last_hidden)

    rnn_res = rnn()
    rnn_out = rnn_res[0]
    last_hidden = layers.slice(rnn_res[1],
                               axes=[0],
                               starts=[-1],
                               ends=[1000000000])
    last_hidden = layers.reshape(last_hidden,
                                 shape=[num_layers, -1, hidden_size])

    rnnout = rnn_out
    if batch_first:
        rnnout = layers.transpose(x=rnn_out, perm=[1, 0, 2])

    return rnnout, last_hidden
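
The starts=[-1], ends=[1000000000] slice above is the usual fluid idiom for taking the last entry along axis 0; a NumPy sketch of the extraction it performs (sizes are illustrative):

import numpy as np

# rnn_res[1] stacks last_hidden at every time step: (seq_len, batch, hidden_size).
stacked = np.random.rand(20, 4, 256).astype('float32')
last = stacked[-1:]                  # keep only the final step, like slice(starts=[-1], ends=[huge])
last = last.reshape(1, -1, 256)      # matches the reshape to [num_layers, -1, hidden_size], num_layers = 1
print(last.shape)                    # (1, 4, 256)
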
Example #5
    def padding_rnn(input_embedding, len=3, init_hidden=None, init_cell=None):
        weight_1_arr = []
        weight_2_arr = []
        bias_arr = []
        hidden_array = []
        cell_array = []
        mask_array = []
        for i in range(num_layers):
            weight_1 = layers.create_parameter(
                [hidden_size * 2, hidden_size * 4],
                dtype="float32",
                name="fc_weight1_" + str(i),
                default_initializer=fluid.initializer.UniformInitializer(
                    low=-init_scale, high=init_scale))
            weight_1_arr.append(weight_1)
            bias_1 = layers.create_parameter(
                [hidden_size * 4],
                dtype="float32",
                name="fc_bias1_" + str(i),
                default_initializer=fluid.initializer.Constant(0.0))
            bias_arr.append(bias_1)

            pre_hidden = layers.slice(
                init_hidden, axes=[0], starts=[i], ends=[i + 1])
            pre_cell = layers.slice(
                init_cell, axes=[0], starts=[i], ends=[i + 1])
            pre_hidden = layers.reshape(pre_hidden, shape=[-1, hidden_size])
            pre_cell = layers.reshape(pre_cell, shape=[-1, hidden_size])
            hidden_array.append(pre_hidden)
            cell_array.append(pre_cell)

        input_embedding = layers.transpose(input_embedding, perm=[1, 0, 2])
        rnn = PaddingRNN()

        with rnn.step():
            input = rnn.step_input(input_embedding)
            for k in range(num_layers):
                pre_hidden = rnn.memory(init=hidden_array[k])
                pre_cell = rnn.memory(init=cell_array[k])
                weight_1 = weight_1_arr[k]
                bias = bias_arr[k]

                nn = layers.concat([input, pre_hidden], 1)
                gate_input = layers.matmul(x=nn, y=weight_1)

                gate_input = layers.elementwise_add(gate_input, bias)
                i = layers.slice(
                    gate_input, axes=[1], starts=[0], ends=[hidden_size])
                j = layers.slice(
                    gate_input,
                    axes=[1],
                    starts=[hidden_size],
                    ends=[hidden_size * 2])
                f = layers.slice(
                    gate_input,
                    axes=[1],
                    starts=[hidden_size * 2],
                    ends=[hidden_size * 3])
                o = layers.slice(
                    gate_input,
                    axes=[1],
                    starts=[hidden_size * 3],
                    ends=[hidden_size * 4])

                c = pre_cell * layers.sigmoid(f) + layers.sigmoid(
                    i) * layers.tanh(j)
                m = layers.tanh(c) * layers.sigmoid(o)

                rnn.update_memory(pre_hidden, m)
                rnn.update_memory(pre_cell, c)

                rnn.step_output(m)
                rnn.step_output(c)

                input = m

                if dropout is not None and dropout > 0.0:
                    input = layers.dropout(
                        input,
                        dropout_prob=dropout,
                        dropout_implementation='upscale_in_train')

            rnn.step_output(input)
        rnnout = rnn()

        last_hidden_array = []
        last_cell_array = []
        real_res = rnnout[-1]
        for i in range(num_layers):
            m = rnnout[i * 2]
            c = rnnout[i * 2 + 1]
            m.stop_gradient = True
            c.stop_gradient = True
            last_h = layers.slice(
                m, axes=[0], starts=[num_steps - 1], ends=[num_steps])
            last_hidden_array.append(last_h)
            last_c = layers.slice(
                c, axes=[0], starts=[num_steps - 1], ends=[num_steps])
            last_cell_array.append(last_c)
        real_res = layers.transpose(x=real_res, perm=[1, 0, 2])
        last_hidden = layers.concat(last_hidden_array, 0)
        last_cell = layers.concat(last_cell_array, 0)

        return real_res, last_hidden, last_cell
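
For reference, one step of the fused gate computation above written out in plain NumPy (weights are random and sizes illustrative; the gate order i, j, f, o matches the slices in the code, and the step input width equals hidden_size, as implied by the [hidden_size * 2, hidden_size * 4] weight shape):

import numpy as np

def sigmoid(x):
    return 1.0 / (1.0 + np.exp(-x))

hidden_size = 4
x_t = np.random.rand(2, hidden_size)                   # (batch, hidden_size) step input
h_prev = np.zeros((2, hidden_size))
c_prev = np.zeros((2, hidden_size))
W = np.random.rand(hidden_size * 2, hidden_size * 4)
b = np.zeros(hidden_size * 4)

gates = np.concatenate([x_t, h_prev], axis=1) @ W + b
i, j, f, o = np.split(gates, 4, axis=1)
c = c_prev * sigmoid(f) + sigmoid(i) * np.tanh(j)      # new cell state
h = np.tanh(c) * sigmoid(o)                            # new hidden state
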
Example #6
    def get_single_direction_output(rnn_input,
                                    unit_list,
                                    mask=None,
                                    direc_index=0):
        rnn = StaticRNN()
        with rnn.step():
            step_input = rnn.step_input(rnn_input)

            if mask:
                step_mask = rnn.step_input(mask)

            for i in range(num_layers):
                if init_hidden:
                    pre_hidden = rnn.memory(init=init_hidden[i, direc_index])
                    pre_cell = rnn.memory(init=init_cell[i, direc_index])
                else:
                    pre_hidden = rnn.memory(batch_ref=rnn_input,
                                            shape=[-1, hidden_size])
                    pre_cell = rnn.memory(batch_ref=rnn_input,
                                          shape=[-1, hidden_size])

                new_hidden, new_cell = unit_list[i](step_input, pre_hidden,
                                                    pre_cell)

                if mask:
                    new_hidden = layers.elementwise_mul(
                        new_hidden, step_mask,
                        axis=0) - layers.elementwise_mul(pre_hidden,
                                                         (step_mask - 1),
                                                         axis=0)
                    new_cell = layers.elementwise_mul(
                        new_cell, step_mask, axis=0) - layers.elementwise_mul(
                            pre_cell, (step_mask - 1), axis=0)

                rnn.update_memory(pre_hidden, new_hidden)
                rnn.update_memory(pre_cell, new_cell)

                rnn.step_output(new_hidden)
                rnn.step_output(new_cell)

                step_input = new_hidden
                if dropout_prob is not None and dropout_prob > 0.0:
                    step_input = layers.dropout(
                        step_input,
                        dropout_prob=dropout_prob,
                        dropout_implementation='upscale_in_train')

            rnn.step_output(step_input)

        rnn_out = rnn()

        last_hidden_array = []
        last_cell_array = []
        rnn_output = rnn_out[-1]
        for i in range(num_layers):
            last_hidden = rnn_out[i * 2]
            last_hidden = last_hidden[-1]
            last_hidden_array.append(last_hidden)
            last_cell = rnn_out[i * 2 + 1]
            last_cell = last_cell[-1]
            last_cell_array.append(last_cell)

        last_hidden_output = layers.concat(last_hidden_array, axis=0)
        last_hidden_output = layers.reshape(
            last_hidden_output, shape=[num_layers, -1, hidden_size])
        last_cell_output = layers.concat(last_cell_array, axis=0)
        last_cell_output = layers.reshape(last_cell_output,
                                          shape=[num_layers, -1, hidden_size])

        return rnn_output, last_hidden_output, last_cell_output
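
Because each layer emits two step outputs (hidden, then cell) and one extra top-layer output is appended after the loop, rnn_out is laid out as sketched below (illustrative, for num_layers = 2):

# rnn_out[0]  -> layer 0 hidden states, shape (seq, batch, hidden)
# rnn_out[1]  -> layer 0 cell states
# rnn_out[2]  -> layer 1 hidden states
# rnn_out[3]  -> layer 1 cell states
# rnn_out[-1] -> the final step_output(step_input): the top layer's (possibly dropped-out) sequence output
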
    def _build_decoder(self,
                       enc_last_hidden,
                       enc_last_cell,
                       mode='train',
                       beam_size=10):

        dec_input = layers.transpose(self.tar_emb, [1, 0, 2])
        dec_unit_list = []
        for i in range(self.num_layers):
            new_name = "dec_layers_" + str(i)
            dec_unit_list.append(
                BasicLSTMUnit(
                    new_name,
                    self.hidden_size,
                    ParamAttr(initializer=fluid.initializer.UniformInitializer(
                        low=-self.init_scale, high=self.init_scale)),
                    ParamAttr(initializer=fluid.initializer.Constant(0.0)),
                ))


        attention_weight = layers.create_parameter(
            [self.hidden_size * 2, self.hidden_size],
            dtype="float32",
            name="attention_weight",
            default_initializer=fluid.initializer.UniformInitializer(
                low=-self.init_scale, high=self.init_scale))

        memory_weight = layers.create_parameter(
            [self.hidden_size, self.hidden_size],
            dtype="float32",
            name="memory_weight",
            default_initializer=fluid.initializer.UniformInitializer(
                low=-self.init_scale, high=self.init_scale))

        def dot_attention(query, memory, mask=None):
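            # Plain dot-product attention: scores = query x memory^T, a softmax over the
            # source positions, then a weighted sum of memory. When a mask is passed it
            # is src_mask - 1.0 (0 for real tokens, -1 for padding), so adding
            # mask * 1e9 drives padded scores to -1e9 before the softmax.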
            attn = layers.matmul(query, memory, transpose_y=True)

            if mask:
                attn = layers.transpose(attn, [1, 0, 2])
                attn = layers.elementwise_add(attn, mask * 1000000000, -1)
                attn = layers.transpose(attn, [1, 0, 2])
            weight = layers.softmax(attn)
            weight_memory = layers.matmul(weight, memory)

            return weight_memory, weight

        max_src_seq_len = layers.shape(self.src)[1]
        src_mask = layers.sequence_mask(self.src_sequence_length,
                                        maxlen=max_src_seq_len,
                                        dtype='float32')

        softmax_weight = layers.create_parameter(
            [self.hidden_size, self.tar_vocab_size],
            dtype="float32",
            name="softmax_weight",
            default_initializer=fluid.initializer.UniformInitializer(
                low=-self.init_scale, high=self.init_scale))

        def decoder_step(current_in, pre_feed, pre_hidden_array,
                         pre_cell_array, enc_memory):
            new_hidden_array = []
            new_cell_array = []

            step_input = layers.concat([current_in, pre_feed], 1)

            for i in range(self.num_layers):
                pre_hidden = pre_hidden_array[i]
                pre_cell = pre_cell_array[i]

                new_hidden, new_cell = dec_unit_list[i](step_input, pre_hidden,
                                                        pre_cell)

                new_hidden_array.append(new_hidden)
                new_cell_array.append(new_cell)

                step_input = new_hidden

            memory_mask = src_mask - 1.0
            enc_memory = layers.matmul(enc_memory, memory_weight)
            att_in = layers.unsqueeze(step_input, [1])
            dec_att, _ = dot_attention(att_in, enc_memory)
            dec_att = layers.squeeze(dec_att, [1])
            concat_att_out = layers.concat([dec_att, step_input], 1)
            concat_att_out = layers.matmul(concat_att_out, attention_weight)

            return concat_att_out, new_hidden_array, new_cell_array

        if mode == "train":
            dec_rnn = StaticRNN()
            with dec_rnn.step():
                step_input = dec_rnn.step_input(dec_input)
                input_feed = dec_rnn.memory(batch_ref=dec_input,
                                            shape=[-1, self.hidden_size])
                step_input = layers.concat([step_input, input_feed], 1)
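                # Input feeding: the previous step's attentional output (input_feed,
                # updated below via update_memory with concat_att_out) is concatenated
                # with the current target embedding before the stacked LSTM layers.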

                for i in range(self.num_layers):
                    pre_hidden = dec_rnn.memory(init=enc_last_hidden[i])
                    pre_cell = dec_rnn.memory(init=enc_last_cell[i])

                    new_hidden, new_cell = dec_unit_list[i](step_input,
                                                            pre_hidden,
                                                            pre_cell)

                    dec_rnn.update_memory(pre_hidden, new_hidden)
                    dec_rnn.update_memory(pre_cell, new_cell)

                    step_input = new_hidden

                    if self.dropout is not None and self.dropout > 0.0:
                        print("using dropout", self.dropout)
                        step_input = fluid.layers.dropout(
                            step_input,
                            dropout_prob=self.dropout,
                            dropout_implementation='upscale_in_train')
                memory_mask = src_mask - 1.0
                enc_memory = layers.matmul(self.enc_output, memory_weight)
                att_in = layers.unsqueeze(step_input, [1])
                dec_att, _ = dot_attention(att_in, enc_memory, memory_mask)
                dec_att = layers.squeeze(dec_att, [1])
                concat_att_out = layers.concat([dec_att, step_input], 1)
                concat_att_out = layers.matmul(concat_att_out,
                                               attention_weight)
                #concat_att_out = layers.tanh( concat_att_out )

                dec_rnn.update_memory(input_feed, concat_att_out)

                dec_rnn.step_output(concat_att_out)

            dec_rnn_out = dec_rnn()
            dec_output = layers.transpose(dec_rnn_out, [1, 0, 2])

            dec_output = layers.matmul(dec_output, softmax_weight)

            return dec_output
        elif mode == 'beam_search':

            max_length = max_src_seq_len * 2
            #max_length = layers.fill_constant( [1], dtype='int32', value = 10)
            pre_ids = layers.fill_constant([1, 1], dtype='int64', value=1)
            full_ids = layers.fill_constant([1, 1], dtype='int64', value=1)

            score = layers.fill_constant([1], dtype='float32', value=0.0)

            #eos_ids = layers.fill_constant( [1, 1], dtype='int64', value=2)

            pre_hidden_array = []
            pre_cell_array = []
            pre_feed = layers.fill_constant([beam_size, self.hidden_size],
                                            dtype='float32',
                                            value=0)
            for i in range(self.num_layers):
                pre_hidden_array.append(
                    layers.expand(enc_last_hidden[i], [beam_size, 1]))
                pre_cell_array.append(
                    layers.expand(enc_last_cell[i], [beam_size, 1]))

            eos_ids = layers.fill_constant([beam_size], dtype='int64', value=2)
            init_score = np.zeros((beam_size)).astype('float32')
            init_score[1:] = -INF
            pre_score = layers.assign(init_score)
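            # Only beam 0 starts alive; the remaining beams are initialized to -INF so
            # the first top-k expansion does not select duplicate hypotheses.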
            #pre_score = layers.fill_constant( [1,], dtype='float32', value= 0.0)
            tokens = layers.fill_constant([beam_size, 1],
                                          dtype='int64',
                                          value=1)

            enc_memory = layers.expand(self.enc_output, [beam_size, 1, 1])

            pre_tokens = layers.fill_constant([beam_size, 1],
                                              dtype='int64',
                                              value=1)

            finished_seq = layers.fill_constant([beam_size, 1],
                                                dtype='int64',
                                                value=0)
            finished_scores = layers.fill_constant([beam_size],
                                                   dtype='float32',
                                                   value=-INF)
            finished_flag = layers.fill_constant([beam_size],
                                                 dtype='float32',
                                                 value=0.0)

            step_idx = layers.fill_constant(shape=[1], dtype='int32', value=0)
            cond = layers.less_than(x=step_idx,
                                    y=max_length)  # default force_cpu=True

            parent_idx = layers.fill_constant([1], dtype='int32', value=0)
            while_op = layers.While(cond)

            def compute_topk_scores_and_seq(sequences,
                                            scores,
                                            scores_to_gather,
                                            flags,
                                            beam_size,
                                            select_beam=None,
                                            generate_id=None):
                scores = layers.reshape(scores, shape=[1, -1])
                _, topk_indexs = layers.topk(scores, k=beam_size)

                topk_indexs = layers.reshape(topk_indexs, shape=[-1])

                # gather result

                top_seq = layers.gather(sequences, topk_indexs)
                topk_flags = layers.gather(flags, topk_indexs)
                topk_gather_scores = layers.gather(scores_to_gather,
                                                   topk_indexs)

                if select_beam:
                    topk_beam = layers.gather(select_beam, topk_indexs)
                else:
                    topk_beam = select_beam

                if generate_id:
                    topk_id = layers.gather(generate_id, topk_indexs)
                else:
                    topk_id = generate_id
                return top_seq, topk_gather_scores, topk_flags, topk_beam, topk_id

            def grow_alive(curr_seq, curr_scores, curr_log_probs,
                           curr_finished, select_beam, generate_id):
                curr_scores += curr_finished * -INF
                return compute_topk_scores_and_seq(curr_seq,
                                                   curr_scores,
                                                   curr_log_probs,
                                                   curr_finished,
                                                   beam_size,
                                                   select_beam,
                                                   generate_id=generate_id)

            def grow_finished(finished_seq, finished_scores, finished_flag,
                              curr_seq, curr_scores, curr_finished):
                finished_seq = layers.concat([
                    finished_seq,
                    layers.fill_constant(
                        [beam_size, 1], dtype='int64', value=1)
                ],
                                             axis=1)
                curr_scores += (1.0 - curr_finished) * -INF
                #layers.Print( curr_scores, message="curr scores")
                curr_finished_seq = layers.concat([finished_seq, curr_seq],
                                                  axis=0)
                curr_finished_scores = layers.concat(
                    [finished_scores, curr_scores], axis=0)
                curr_finished_flags = layers.concat(
                    [finished_flag, curr_finished], axis=0)

                return compute_topk_scores_and_seq(curr_finished_seq,
                                                   curr_finished_scores,
                                                   curr_finished_scores,
                                                   curr_finished_flags,
                                                   beam_size)

            def is_finished(alive_log_prob, finished_scores,
                            finished_in_finished):

                max_out_len = 200
                max_length_penalty = layers.pow(
                    layers.fill_constant([1],
                                         dtype='float32',
                                         value=((5.0 + max_out_len) / 6.0)),
                    alpha)

                lower_bound_alive_score = layers.slice(
                    alive_log_prob, starts=[0], ends=[1],
                    axes=[0]) / max_length_penalty

                lowest_score_of_finished_in_finished = finished_scores * finished_in_finished
                lowest_score_of_finished_in_finished += (
                    1.0 - finished_in_finished) * -INF
                lowest_score_of_finished_in_finished = layers.reduce_min(
                    lowest_score_of_finished_in_finished)

                met = layers.less_than(lower_bound_alive_score,
                                       lowest_score_of_finished_in_finished)
                met = layers.cast(met, 'float32')
                bound_is_met = layers.reduce_sum(met)

                finished_eos_num = layers.reduce_sum(finished_in_finished)

                finish_cond = layers.less_than(
                    finished_eos_num,
                    layers.fill_constant([1], dtype='float32',
                                         value=beam_size))

                return finish_cond

            def grow_top_k(step_idx, alive_seq, alive_log_prob, parent_idx):
                pre_ids = alive_seq

                dec_step_emb = layers.embedding(
                    input=pre_ids,
                    size=[self.tar_vocab_size, self.hidden_size],
                    dtype='float32',
                    is_sparse=False,
                    param_attr=fluid.ParamAttr(
                        name='target_embedding',
                        initializer=fluid.initializer.UniformInitializer(
                            low=-self.init_scale, high=self.init_scale)))

                dec_att_out, new_hidden_array, new_cell_array = decoder_step(
                    dec_step_emb, pre_feed, pre_hidden_array, pre_cell_array,
                    enc_memory)

                projection = layers.matmul(dec_att_out, softmax_weight)

                logits = layers.softmax(projection)
                current_log = layers.elementwise_add(x=layers.log(logits),
                                                     y=alive_log_prob,
                                                     axis=0)
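                # GNMT-style length normalization: lp = ((5 + len) / 6) ** alpha, written
                # here as ((step_idx + 6) / 6) ** alpha; dividing the accumulated
                # log-probabilities by lp keeps longer hypotheses competitive.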
                base_1 = layers.cast(step_idx, 'float32') + 6.0
                base_1 /= 6.0
                length_penalty = layers.pow(base_1, alpha)

                len_pen = layers.pow(
                    ((5. + layers.cast(step_idx + 1, 'float32')) / 6.), alpha)

                current_log = layers.reshape(current_log, shape=[1, -1])

                current_log = current_log / length_penalty
                topk_scores, topk_indices = layers.topk(input=current_log,
                                                        k=beam_size)

                topk_scores = layers.reshape(topk_scores, shape=[-1])

                topk_log_probs = topk_scores * length_penalty

                generate_id = layers.reshape(topk_indices,
                                             shape=[-1]) % self.tar_vocab_size

                selected_beam = layers.reshape(
                    topk_indices, shape=[-1]) // self.tar_vocab_size

                topk_finished = layers.equal(generate_id, eos_ids)

                topk_finished = layers.cast(topk_finished, 'float32')

                generate_id = layers.reshape(generate_id, shape=[-1, 1])

                pre_tokens_list = layers.gather(tokens, selected_beam)

                full_tokens_list = layers.concat(
                    [pre_tokens_list, generate_id], axis=1)


                return full_tokens_list, topk_log_probs, topk_scores, topk_finished, selected_beam, generate_id, \
                        dec_att_out, new_hidden_array, new_cell_array

            with while_op.block():
                topk_seq, topk_log_probs, topk_scores, topk_finished, topk_beam, topk_generate_id, attention_out, new_hidden_array, new_cell_array = \
                    grow_top_k(  step_idx, pre_tokens, pre_score, parent_idx)
                alive_seq, alive_log_prob, _, alive_beam, alive_id = grow_alive(
                    topk_seq, topk_scores, topk_log_probs, topk_finished,
                    topk_beam, topk_generate_id)

                finished_seq_2, finished_scores_2, finished_flags_2, _, _ = grow_finished(
                    finished_seq, finished_scores, finished_flag, topk_seq,
                    topk_scores, topk_finished)

                finished_cond = is_finished(alive_log_prob, finished_scores_2,
                                            finished_flags_2)

                layers.increment(x=step_idx, value=1.0, in_place=True)

                layers.assign(alive_beam, parent_idx)
                layers.assign(alive_id, pre_tokens)
                layers.assign(alive_log_prob, pre_score)
                layers.assign(alive_seq, tokens)
                layers.assign(finished_seq_2, finished_seq)
                layers.assign(finished_scores_2, finished_scores)
                layers.assign(finished_flags_2, finished_flag)

                # update init_hidden, init_cell, input_feed
                new_feed = layers.gather(attention_out, parent_idx)
                layers.assign(new_feed, pre_feed)
                for i in range(self.num_layers):
                    new_hidden_var = layers.gather(new_hidden_array[i],
                                                   parent_idx)
                    layers.assign(new_hidden_var, pre_hidden_array[i])
                    new_cell_var = layers.gather(new_cell_array[i], parent_idx)
                    layers.assign(new_cell_var, pre_cell_array[i])

                length_cond = layers.less_than(x=step_idx, y=max_length)
                layers.logical_and(x=length_cond, y=finished_cond, out=cond)

            tokens_with_eos = tokens

            all_seq = layers.concat([tokens_with_eos, finished_seq], axis=0)
            all_score = layers.concat([pre_score, finished_scores], axis=0)
            _, topk_index = layers.topk(all_score, k=beam_size)
            topk_index = layers.reshape(topk_index, shape=[-1])
            final_seq = layers.gather(all_seq, topk_index)
            final_score = layers.gather(all_score, topk_index)

            return final_seq
        elif mode == 'greedy_search':
            max_length = max_src_seq_len * 2
            #max_length = layers.fill_constant( [1], dtype='int32', value = 10)
            pre_ids = layers.fill_constant([1, 1], dtype='int64', value=1)
            full_ids = layers.fill_constant([1, 1], dtype='int64', value=1)

            score = layers.fill_constant([1], dtype='float32', value=0.0)

            eos_ids = layers.fill_constant([1, 1], dtype='int64', value=2)

            pre_hidden_array = []
            pre_cell_array = []
            pre_feed = layers.fill_constant([1, self.hidden_size],
                                            dtype='float32',
                                            value=0)
            for i in range(self.num_layers):
                pre_hidden_array.append(enc_last_hidden[i])
                pre_cell_array.append(enc_last_cell[i])
                #pre_hidden_array.append( layers.fill_constant( [1, hidden_size], dtype='float32', value=0)  )
                #pre_cell_array.append( layers.fill_constant( [1, hidden_size], dtype='float32', value=0) )

            step_idx = layers.fill_constant(shape=[1], dtype='int32', value=0)
            cond = layers.less_than(x=step_idx,
                                    y=max_length)  # default force_cpu=True
            while_op = layers.While(cond)

            with while_op.block():

                dec_step_emb = layers.embedding(
                    input=pre_ids,
                    size=[self.tar_vocab_size, self.hidden_size],
                    dtype='float32',
                    is_sparse=False,
                    param_attr=fluid.ParamAttr(
                        name='target_embedding',
                        initializer=fluid.initializer.UniformInitializer(
                            low=-self.init_scale, high=self.init_scale)))

                dec_att_out, new_hidden_array, new_cell_array = decoder_step(
                    dec_step_emb, pre_feed, pre_hidden_array, pre_cell_array,
                    self.enc_output)

                projection = layers.matmul(dec_att_out, softmax_weight)

                logits = layers.softmax(projection)
                logits = layers.log(logits)

                current_log = layers.elementwise_add(logits, score, axis=0)

                topk_score, topk_indices = layers.topk(input=current_log, k=1)

                new_ids = layers.concat([full_ids, topk_indices])
                layers.assign(new_ids, full_ids)
                #layers.Print( full_ids, message="ful ids")
                layers.assign(topk_score, score)
                layers.assign(topk_indices, pre_ids)
                layers.assign(dec_att_out, pre_feed)
                for i in range(self.num_layers):
                    layers.assign(new_hidden_array[i], pre_hidden_array[i])
                    layers.assign(new_cell_array[i], pre_cell_array[i])

                layers.increment(x=step_idx, value=1.0, in_place=True)

                eos_met = layers.not_equal(topk_indices, eos_ids)
                length_cond = layers.less_than(x=step_idx, y=max_length)
                layers.logical_and(x=length_cond, y=eos_met, out=cond)

            return full_ids
def convlstm2d_rnn(rnn_input,
                   init_hidden,
                   init_cell,
                   padding,
                   hidden_h,
                   hidden_w,
                   filters,
                   filter_size,
                   drop_out=None,
                   sequence_length=None,
                   name='conv_lstm_2d'):

    # transpose : (sequence x batch x C x H x W)
    rnn_input = layers.transpose(rnn_input, [1, 0, 4, 2, 3])

    # generate mask
    mask = None
    if sequence_length:
        max_seq_len = layers.shape(rnn_input)[0]
        mask = layers.sequence_mask(sequence_length,
                                    maxlen=max_seq_len,
                                    dtype='float32')
        mask = layers.transpose(mask, [1, 0])

    # init
    conv_lstm_2d = ConvLSTM2D_unit(filters, filter_size, padding)

    rnn = PaddingRNN()
    with rnn.step():
        step_in = rnn.step_input(rnn_input)

        if mask:
            step_mask = rnn.step_input(mask)

        if init_hidden and init_cell:
            pre_hidden = rnn.memory(init=init_hidden)
            pre_cell = rnn.memory(init=init_cell)
        else:
            pre_hidden = rnn.memory(batch_ref=rnn_input,
                                    shape=[-1, filters, hidden_h, hidden_w])
            pre_cell = rnn.memory(batch_ref=rnn_input,
                                  shape=[-1, filters, hidden_h, hidden_w])

        real_out, last_hidden, last_cell = conv_lstm_2d(
            step_in, pre_hidden, pre_cell)

        if mask:
            last_hidden = dot(last_hidden, step_mask, axis=0) - dot(
                pre_hidden, (step_mask - 1), axis=0)
            last_cell = dot(last_cell, step_mask, axis=0) - dot(
                pre_cell, (step_mask - 1), axis=0)

        rnn.update_memory(pre_hidden, last_hidden)
        rnn.update_memory(pre_cell, last_cell)

        rnn.step_output(last_hidden)
        rnn.step_output(last_cell)

        step_input = last_hidden

        if drop_out is not None and drop_out > 0.0:
            step_input = layers.dropout(
                step_input,
                dropout_prob=drop_out,
                dropout_implementation='upscale_in_train')

    rnn_res = rnn()
    rnn_out = rnn_res[0]
    last_hidden = layers.slice(rnn_res[1],
                               axes=[0],
                               starts=[-1],
                               ends=[1000000000])

    rnn_out = layers.transpose(rnn_out, [1, 0, 3, 4, 2])
    last_hidden = layers.transpose(last_hidden, [1, 0, 3, 4, 2])

    # print('rnn_out ', rnn_out.shape)
    # print('last_hidden ', last_hidden.shape)

    return rnn_out, last_hidden
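
A hypothetical call sketch (shapes and names are illustrative; ConvLSTM2D_unit, PaddingRNN, fluid, layers, and the dot helper are assumed to be defined/imported as in the code above):

import paddle.fluid as fluid

# Hypothetical video-like input: (batch, seq, H, W, C).
frames = fluid.data(name='frames', shape=[None, 10, 64, 64, 3], dtype='float32')

out, last_h = convlstm2d_rnn(frames,
                             init_hidden=None,
                             init_cell=None,
                             padding=1,
                             hidden_h=64,
                             hidden_w=64,
                             filters=16,
                             filter_size=3)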