Example #1
    def func(self, place):
        # the shape of the input variable should be specified explicitly and must not include -1.
        shape = [2, 3, 7, 9]
        eps = 0.0001
        dtype = np.float64

        x = layers.data('x', shape, False, dtype)
        y = layers.data('y', shape, False, dtype)
        x.persistable = True
        y.persistable = True
        out = layers.elementwise_div(x, y, axis=0)
        x_arr = np.random.uniform(-1, 1, shape).astype(dtype)
        y_arr = np.random.uniform(-1, 1, shape).astype(dtype)
        y_arr[np.abs(y_arr) < 0.005] = 0.02

        gradient_checker.double_grad_check([x, y],
                                           out,
                                           x_init=[x_arr, y_arr],
                                           place=place,
                                           eps=eps,
                                           atol=1e-3)
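In Example #1 both inputs share the shape [2, 3, 7, 9], so axis=0 has no visible effect; the axis argument matters when the divisor has fewer dimensions. A minimal NumPy sketch of that broadcast rule with made-up shapes (an analogue of the semantics, not the Paddle API itself):

import numpy as np

# elementwise_div(x, y, axis=k) aligns y's dimensions with x starting at axis k
# and broadcasts y over the remaining trailing dimensions of x.
x = np.random.uniform(-1, 1, (2, 3, 7, 9))
y = np.random.uniform(0.02, 1.0, (2, 3))      # matches x's dims starting at axis 0

y_aligned = y.reshape(2, 3, 1, 1)             # what axis=0 alignment does conceptually
out = x / y_aligned                           # elementwise division with broadcasting
print(out.shape)                              # (2, 3, 7, 9)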
Example #2
    def training_network(self, img, caption):
        # build caption and mask
        target = caption[:, 1:]
        source = caption[:, :-1]
        padding_filled = layers.fill_constant_batch_size_like(target, shape=[-1, decoder_config['sentence_length'] - 1],
                                                              dtype='int64', value=config.dc['padding_idx'])
        mask = layers.equal(target, padding_filled)
        mask = layers.cast(layers.logical_not(mask), 'float32')
        scale_factor = layers.reduce_sum(mask)
        mask.stop_gradient = True
        scale_factor.stop_gradient = True

        # model
        decoder = Decoder(decoder_config['hidden_dim'], rnn_layer=1)
        image_embed, global_image_feat = self._img2feature(img)  # [batch, k+1, hidden], [batch, hidden]

        # TODO: this needs changing - either do the embedding inside the RNN or outside it!
        seq_out = decoder.call(global_image_feat, image_embed, embedding_function, words=source)

        loss = layers.squeeze(ImageCaptionModel.loss(target, seq_out), axes=[2])
        loss = layers.elementwise_mul(loss, mask)
        output_loss = layers.elementwise_div(layers.reduce_sum(loss), scale_factor, name='loss')
        return output_loss
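The mask-and-divide at the end is the usual padding-aware mean: per-token losses at padding positions are zeroed out and the sum is divided by the number of real tokens. A standalone NumPy sketch with a hypothetical padding index and made-up targets:

import numpy as np

padding_idx = 0                                   # hypothetical padding id
target = np.array([[5, 7, 0, 0],
                   [2, 0, 0, 0]])                 # [batch, sentence_length - 1]
token_loss = np.random.rand(*target.shape)        # stand-in for the per-token loss

mask = (target != padding_idx).astype('float32')  # 1 for real tokens, 0 for padding
scale_factor = mask.sum()                         # number of real tokens

output_loss = (token_loss * mask).sum() / scale_factor
print(output_loss)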
Example #3
 def greater_equal_branch(i, a):
     return layers.cond(i < 8.0, lambda: layers.elementwise_mul(a, a),
                        lambda: layers.elementwise_div(a, a))
Example #4
    def _dygraph_clip(self, params_grads):
        sum_square_fp32, sum_square_fp16 = [], []
        unslice_params_fp32, unslice_params_fp16 = [], []

        for p, g in params_grads:
            p_slice = True  # marks whether the parameter is sliced in sharding stage3
            if g is None or getattr(p, 'need_clip', True) is False:
                continue
            if hasattr(p, "unslice"):
                p_slice = False

            merge_grad = g
            if g.type == core.VarDesc.VarType.SELECTED_ROWS:
                merge_grad = layers.get_tensor_from_selected_rows(
                    layers.merge_selected_rows(g))
            square = layers.square(merge_grad)
            sum_square = layers.reduce_sum(square)

            if p.dtype == paddle.float16:
                if p_slice: sum_square_fp16.append(sum_square)
                else: unslice_params_fp16.append(sum_square)
            elif p.dtype == paddle.float32:
                if p_slice: sum_square_fp32.append(sum_square)
                else: unslice_params_fp32.append(sum_square)

        # global norm of non-distributed FP16 params_and_grads
        if len(sum_square_fp16) == 0:
            global_norm_fp16 = paddle.to_tensor([0.], dtype=paddle.float32)
        else:
            global_norm_fp16 = layers.concat(sum_square_fp16)
            global_norm_fp16 = layers.reduce_sum(global_norm_fp16)
            global_norm_fp16 = paddle.cast(
                global_norm_fp16, dtype=paddle.float32)

        # global norm of non-distributed FP16 params_and_grads for unsliced parameters
        if len(unslice_params_fp16) == 0:
            global_unslice_fp16 = paddle.to_tensor([0.], dtype=paddle.float32)
        else:
            global_unslice_fp16 = layers.concat(unslice_params_fp16)
            global_unslice_fp16 = layers.reduce_sum(global_unslice_fp16)
            global_unslice_fp16 = paddle.cast(
                global_unslice_fp16, dtype=paddle.float32)

        # global norm of non-distributed FP32 params_and_grads
        global_norm_fp32 = layers.concat(sum_square_fp32) if len(
            sum_square_fp32) != 0 else paddle.to_tensor(
                [0.], dtype=paddle.float32)
        global_norm_fp32 = layers.reduce_sum(global_norm_fp32)

        # global norm of non-distributed FP32 params_and_grads for unsliced parameters
        global_unslice_fp32 = layers.concat(unslice_params_fp32) if len(
            unslice_params_fp32) != 0 else paddle.to_tensor(
                [0.], dtype=paddle.float32)
        global_unslice_fp32 = layers.reduce_sum(global_unslice_fp32)
        global_unslice_var = global_unslice_fp16 + global_unslice_fp32

        global_norm_var = global_norm_fp16 + global_norm_fp32 + 1.0 / self._group.nranks * global_unslice_var

        # add all reduce to get global norm of distributed params_and_grads
        dev_id = int(self._device.split(":")[1])
        if paddle.device.get_device() == "cpu":
            global_norm_var = global_norm_var.cuda(dev_id)

        with device_guard(dev_id, "gpu"):
            paddle.distributed.all_reduce(global_norm_var, group=self._group)

        global_norm_var = layers.sqrt(global_norm_var)
        max_global_norm = layers.fill_constant(
            shape=[1], dtype=global_norm_var.dtype, value=self.clip_norm)

        clip_var = layers.elementwise_div(
            x=max_global_norm,
            y=layers.elementwise_max(
                x=global_norm_var, y=max_global_norm))
        clip_var_fp16 = paddle.cast(clip_var, paddle.float16)

        for p, g in params_grads:
            if getattr(p, 'need_clip', True) is False or g is None:
                continue
            origin_state = g.stop_gradient
            g.stop_gradient = True
            if p.dtype == paddle.float16:
                g.scale_(clip_var_fp16.item())
            else:
                g.scale_(clip_var.item())
            g.stop_gradient = origin_state
            # p._reset_grad_inplace_version(True)

        return params_grads
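The clipping rule itself is compact: every gradient is scaled by clip_norm / max(global_norm, clip_norm), so nothing changes when the global norm is already within the limit. A minimal NumPy sketch of that arithmetic with made-up gradients:

import numpy as np

clip_norm = 1.0
grads = [np.random.randn(3, 4), np.random.randn(5)]    # hypothetical gradients

# global norm: sqrt of the summed squares over all gradients
global_norm = np.sqrt(sum((g ** 2).sum() for g in grads))

# scale is 1.0 when the norm is within clip_norm, otherwise clip_norm / global_norm
scale = clip_norm / max(global_norm, clip_norm)
clipped = [g * scale for g in grads]

print(np.sqrt(sum((g ** 2).sum() for g in clipped)))   # always <= clip_norm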
Example #5
 def __softmax(x, eps=1e-9):
     exp_out = layers.exp(x=x)
     sum_out = layers.reduce_sum(exp_out, dim=-1, keep_dim=False)
     return layers.elementwise_div(x=exp_out, y=sum_out, axis=0)
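The helper divides exp(x) by its last-axis sum; axis=0 in elementwise_div aligns the reduced tensor with the leading dimensions of exp_out (note that eps is declared but never used). A NumPy equivalent that keeps the reduced axis explicit:

import numpy as np

def softmax_np(x, eps=1e-9):
    # exp / reduce_sum(exp, dim=-1); keeping the reduced axis makes the
    # broadcasting match the axis=0 alignment used in the fluid call
    exp_out = np.exp(x)
    sum_out = exp_out.sum(axis=-1, keepdims=True)
    return exp_out / (sum_out + eps)

print(softmax_np(np.array([[1.0, 2.0, 3.0]])).sum(axis=-1))   # ~[1.]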
Example #6
def norm(inputs, dim):
    tp = [1,0]
    mm = L.sqrt(L.reduce_sum(L.elementwise_mul(inputs, inputs), dim=-dim))
    h = L.elementwise_div(inputs, mm, axis=tp[dim])
    return h
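Stripped of the tp/dim indexing, norm is an L2 normalization: the input is divided by the square root of its sum of squares along one dimension. A NumPy sketch, assuming normalization over the last axis and a small epsilon for stability:

import numpy as np

def l2_normalize_np(inputs, axis=-1, eps=1e-12):
    # sqrt of the sum of squares along the chosen axis, kept for broadcasting
    norm = np.sqrt((inputs * inputs).sum(axis=axis, keepdims=True))
    return inputs / (norm + eps)

x = np.random.randn(4, 6)
print(np.abs(np.linalg.norm(l2_normalize_np(x), axis=-1) - 1.0).max())   # ~0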
Example #7
    def network(self, for_test=False):
        """
        Define the network structure of train_model.
        :return:
        """
        if not for_test:
            before = fluid.data(name='before_train',
                                shape=[-1, self.sent_len],
                                dtype='int64')
            target = fluid.data(name='target_train',
                                shape=[-1, self.sent_len],
                                dtype='int64')
            after = fluid.data(name='after_train',
                               shape=[-1, self.sent_len],
                               dtype='int64')
            # define the data loader
            reader = fluid.io.DataLoader.from_generator(
                feed_list=[before, target, after], capacity=64, iterable=True)
            # forward pass
            rnn_out, encode_hidden = self.forward(target)
            pred_before = self.sent_pred(target,
                                         dir='before',
                                         encode_hidden=encode_hidden,
                                         for_test=False)
            pred_after = self.sent_pred(target,
                                        dir='after',
                                        encode_hidden=encode_hidden,
                                        for_test=False)
        else:
            before = fluid.data(name='before_test',
                                shape=[-1, self.sent_len],
                                dtype='int64')
            target = fluid.data(name='target_test',
                                shape=[-1, self.sent_len],
                                dtype='int64')
            after = fluid.data(name='after_test',
                               shape=[-1, self.sent_len],
                               dtype='int64')
            # define the data loader
            reader = fluid.io.DataLoader.from_generator(
                feed_list=[before, target, after], capacity=64, iterable=True)
            # forward pass
            rnn_out, encode_hidden = self.forward(target)
            pred_before = self.sent_pred(target,
                                         dir='before',
                                         encode_hidden=encode_hidden,
                                         for_test=True)
            pred_after = self.sent_pred(target,
                                        dir='after',
                                        encode_hidden=encode_hidden,
                                        for_test=True)

        # Move batch_size to dim 1 - why not dim 0? Dim 0 is num_layers.
        pred_before = layers.transpose(pred_before, perm=[0, 2, 1, 3])
        pred_after = layers.transpose(pred_after, perm=[0, 2, 1, 3])
        if not for_test:
            before_emb = self.embedding(before)
            after_emb = self.embedding(after)
            vocab_emb = self.embedding.parameters()[0]
        else:
            before_emb = self.test_embedding(before)
            after_emb = self.test_embedding(after)
            vocab_emb = self.test_embedding.parameters()[0]
        #loss_before = layers.cross_entropy(pred_before, before, soft_label=False)
        #loss_after = layers.cross_entropy(pred_after, after, soft_label=False)
        vocab_emb = layers.reshape(
            vocab_emb, shape=[1, 1, 1, vocab_emb.shape[0], vocab_emb.shape[1]])
        new_shape = pred_before.shape[:-1] + (1, ) + pred_before.shape[-1:]
        pred_before = layers.reshape(pred_before, shape=new_shape)
        pred_after = layers.reshape(pred_after, shape=new_shape)
        prob_w_before = layers.reduce_sum(layers.elementwise_mul(
            pred_before, vocab_emb),
                                          dim=[0, 4])
        prob_w_after = layers.reduce_sum(layers.elementwise_mul(
            pred_after, vocab_emb),
                                         dim=[0, 4])
        prob_w_before = layers.reduce_sum(layers.exp(prob_w_before), dim=-1)
        prob_w_after = layers.reduce_sum(layers.exp(prob_w_after), dim=-1)
        new_shape = before_emb.shape[:-1] + (1, ) + before_emb.shape[-1:]
        before_emb = layers.reshape(before_emb, shape=new_shape)
        after_emb = layers.reshape(after_emb, shape=new_shape)
        pred_before = layers.reduce_sum(layers.elementwise_mul(
            pred_before, before_emb),
                                        dim=[0, 3, 4])
        pred_after = layers.reduce_sum(layers.elementwise_mul(
            pred_after, after_emb),
                                       dim=[0, 3, 4])
        prob_before = layers.elementwise_div(layers.exp(pred_before),
                                             prob_w_before + 1e-6)
        prob_after = layers.elementwise_div(layers.exp(pred_after),
                                            prob_w_after + 1e-6)
        loss = -layers.reduce_mean(
            (layers.log(prob_after) + layers.log(prob_before)) / 2.0)
        return loss, reader
Example #8
        def beam_search():
            """Beam search function"""

            max_len = layers.fill_constant(shape=[1],
                                           dtype=start_tokens.dtype,
                                           value=self.max_out_len,
                                           force_cpu=True)
            min_len = layers.fill_constant(shape=[1],
                                           dtype=start_tokens.dtype,
                                           value=self.min_out_len)
            neg_inf = layers.fill_constant(shape=[1],
                                           dtype='float32',
                                           value=-INF)
            step_idx = layers.fill_constant(shape=[1],
                                            dtype=start_tokens.dtype,
                                            value=0,
                                            force_cpu=True)
            step_next_idx = layers.fill_constant(shape=[1],
                                                 dtype=start_tokens.dtype,
                                                 value=1,
                                                 force_cpu=True)
            cond = layers.less_than(x=step_idx,
                                    y=max_len)  # default force_cpu=True
            while_op = layers.While(cond)
            # array states will be stored for each step.
            ids = layers.array_write(layers.reshape(start_tokens, (-1, 1)),
                                     step_idx)
            scores = layers.array_write(init_scores, step_idx)
            # cell states will be overwritten at each step.
            # caches contain the states of previous steps for decoder self-attention
            # and the static encoder output projections for encoder-decoder attention,
            # to reduce redundant computation.
            caches = [
                {
                    "k":  # for self attention
                        layers.fill_constant_batch_size_like(
                            input=start_tokens,
                            shape=[-1, self._n_head, 0, self._emb_size // self._n_head],
                            dtype=enc_words_output.dtype,
                            value=0),
                    "v":  # for self attention
                        layers.fill_constant_batch_size_like(
                            input=start_tokens,
                            shape=[-1, self._n_head, 0, self._emb_size // self._n_head],
                            dtype=enc_words_output.dtype,
                            value=0),
                    "static_k_word":  # for encoder-decoder attention
                        layers.create_tensor(dtype=enc_words_output.dtype),
                    "static_v_word":  # for encoder-decoder attention
                        layers.create_tensor(dtype=enc_words_output.dtype),
                    "static_k_sent":  # for encoder-decoder attention
                        layers.create_tensor(dtype=enc_sents_output.dtype),
                    "static_v_sent":  # for encoder-decoder attention
                        layers.create_tensor(dtype=enc_sents_output.dtype)
                } for i in range(self._dec_n_layer)
            ]

            trigram_blocking = TrigramBlocking(start_tokens,
                                               self.tokenizer,
                                               use_fp16=self._use_fp16,
                                               beam_size=self.beam_size)

            with while_op.block():
                pre_ids = layers.array_read(array=ids, i=step_idx)
                # Since beam_search_op doesn't enforce pre_ids' shape, we can do an
                # inplace reshape here, which actually changes the shape of pre_ids.
                pre_ids = layers.reshape(pre_ids, (-1, 1, 1), inplace=True)
                pre_scores = layers.array_read(array=scores, i=step_idx)
                # gather cell states corresponding to selected parent
                pre_src_words_attn_bias = layers.gather(
                    tgt_src_words_attn_bias, index=parent_idx)
                pre_src_sents_attn_bias = layers.gather(
                    tgt_src_sents_attn_bias, index=parent_idx)
                pre_graph_attn_bias = layers.gather(graph_attn_bias,
                                                    index=parent_idx)
                pre_pos = layers.elementwise_mul(
                    x=layers.fill_constant_batch_size_like(
                        input=pre_src_sents_attn_bias,  # can't use a LoD tensor here
                        value=1,
                        shape=[-1, 1, 1],
                        dtype=pre_ids.dtype),
                    y=step_idx,
                    axis=0)

                logits = self.decode(
                    dec_input=(pre_ids, pre_pos, None, pre_src_words_attn_bias,
                               pre_src_sents_attn_bias, pre_graph_attn_bias),
                    enc_words_output=enc_words_output,
                    enc_sents_output=enc_sents_output,
                    caches=caches,
                    gather_idx=parent_idx)

                # prevent generating the end token if the length is less than min_out_len
                eos_index = layers.fill_constant(
                    shape=[layers.shape(logits)[0]],
                    dtype='int64',
                    value=self.eos_idx)
                eos_index = fluid.one_hot(eos_index, depth=self.voc_size)
                less_cond = layers.cast(layers.less_than(x=step_idx,
                                                         y=min_len),
                                        dtype='float32')
                less_val = layers.elementwise_mul(less_cond, neg_inf)
                eos_val = layers.elementwise_mul(eos_index, less_val, axis=0)
                revised_logits = layers.elementwise_add(logits,
                                                        eos_val,
                                                        axis=0)

                # top-k reduction across beams, also containing special handling of
                # finished beams and finished sentences (batch reduction)
                topk_scores, topk_indices = layers.topk(
                    input=layers.softmax(revised_logits), k=self.beam_size)

                # Roll back the previous scores for the length penalty.
                # The previous scores already include the length penalty; before applying
                # this timestep's penalty we need to roll it back. Because of this, we store
                # the length-penalized score in `scores` while the calculation uses the
                # un-penalized score.
                # -> safe for step_idx == 0 (initialization state), because the previous score is 0
                pre_timestep_length_penalty = fluid.layers.pow(
                    ((5.0 + fluid.layers.cast(step_idx, pre_scores.dtype)) /
                     6.0), self.len_penalty)
                pre_scores_wo_len_penalty = fluid.layers.elementwise_mul(
                    pre_scores, pre_timestep_length_penalty)

                # calc trigram-blocking delta scores for current alive sequence
                if self.block_trigram:
                    trigram_blocking.update_seq(pre_ids, parent_idx)
                    trigram_blocking.expand_cand_seq(topk_indices)
                    fluid.layers.py_func(
                        func=trigram_blocking.blocking_forward,
                        x=[
                            trigram_blocking.cand_seq,
                            trigram_blocking.id2is_full_token
                        ],
                        out=trigram_blocking.delta_score_out,
                        backward_func=None)
                    layers.Print(trigram_blocking.delta_score_out,
                                 summarize=100,
                                 message="trigram_blocking.delta_score_out")
                    pre_scores_wo_len_penalty = fluid.layers.elementwise_add(
                        x=trigram_blocking.delta_score_out,
                        y=pre_scores_wo_len_penalty,
                        axis=0)
                # => [N, topk]

                accu_scores = layers.elementwise_add(
                    x=layers.log(topk_scores),
                    y=pre_scores_wo_len_penalty,
                    axis=0)

                cur_timestep_length_penalty = layers.pow(
                    ((5.0 + layers.cast(step_next_idx, accu_scores.dtype)) /
                     6.0), self.len_penalty)
                curr_scores = layers.elementwise_div(
                    accu_scores, cur_timestep_length_penalty)

                # beam_search op uses lod to differentiate branches.
                curr_scores = layers.lod_reset(curr_scores, pre_ids)
                topk_indices = layers.lod_reset(topk_indices, pre_ids)
                selected_ids, selected_scores, gather_idx = layers.beam_search(
                    pre_ids=pre_ids,
                    pre_scores=pre_scores,
                    ids=topk_indices,
                    scores=curr_scores,
                    beam_size=self.beam_size,
                    end_id=self.eos_idx,
                    return_parent_idx=True)

                layers.increment(x=step_idx, value=1.0, in_place=True)
                layers.increment(x=step_next_idx, value=1.0, in_place=True)
                # cell states (caches) have been updated in wrap_decoder;
                # we only need to update the beam search states here.
                layers.array_write(selected_ids, i=step_idx, array=ids)
                layers.array_write(selected_scores, i=step_idx, array=scores)
                layers.assign(gather_idx, parent_idx)
                layers.assign(pre_src_words_attn_bias, tgt_src_words_attn_bias)
                layers.assign(pre_src_sents_attn_bias, tgt_src_sents_attn_bias)
                layers.assign(pre_graph_attn_bias, graph_attn_bias)

                length_cond = layers.less_than(x=step_idx, y=max_len)
                finish_cond = layers.logical_not(
                    layers.is_empty(x=selected_ids))
                layers.logical_and(x=length_cond, y=finish_cond, out=cond)

            finished_ids, finished_scores = layers.beam_search_decode(
                ids, scores, beam_size=self.beam_size, end_id=self.eos_idx)

            return finished_ids, finished_scores
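The length-penalty bookkeeping above reduces to three steps: multiply the stored (penalized) scores by the previous step's GNMT penalty ((5 + t) / 6) ** alpha to recover the raw scores, add the new log-probabilities, then divide by the next step's penalty. A standalone NumPy sketch with a hypothetical alpha and made-up scores:

import numpy as np

alpha = 0.6                                   # hypothetical length-penalty exponent
step_idx, step_next_idx = 3, 4
pre_scores = np.array([-0.9, -1.2])           # stored scores, already length-penalized
topk_log_probs = np.array([[-0.1, -2.3],
                           [-0.4, -1.7]])     # log-probs of each beam's top-k tokens

pre_penalty = ((5.0 + step_idx) / 6.0) ** alpha
cur_penalty = ((5.0 + step_next_idx) / 6.0) ** alpha

pre_scores_wo_penalty = pre_scores * pre_penalty              # roll back the penalty
accu_scores = topk_log_probs + pre_scores_wo_penalty[:, None]
curr_scores = accu_scores / cur_penalty                       # re-apply for this step
print(curr_scores)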
Example #9
    def forward(self, x, y):
        # x and y are one frame apart
        u1 = zeros_like(x)
        u2 = zeros_like(x)
        l_t = self.l * self.t
        taut = self.a / self.t

        grad2_x = self.conv_img_grad(y)
        # grad2_x[:, :, :, 0] = 0.5 * (x[:, :, :, 1] - x[:, :, :, 0])
        # grad2_x[:, :, :, -1] = 0.5 * (x[:, :, :, -1] - x[:, :, :, -2])

        grad2_y = self.conv_img_grad2(y)
        # grad2_y[:, :, 0, :] = 0.5 * (x[:, :, 1, :] - x[:, :, 0, :])
        # grad2_y[:, :, -1, :] = 0.5 * (x[:, :, -1, :] - x[:, :, -2, :])

        p11 = zeros_like(x)
        p12 = zeros_like(x)
        p21 = zeros_like(x)
        p22 = zeros_like(x)

        gsqx = grad2_x**2
        gsqy = grad2_y**2
        grad = gsqx + gsqy + 1e-12

        rho_c = y - grad2_x * u1 - grad2_y * u2 - x

        for i in range(self.n_iter):
            rho = rho_c + grad2_x * u1 + grad2_y * u2 + 1e-12

            v1 = zeros_like(x)
            v2 = zeros_like(x)
            mask1 = rho < -l_t * grad
            mask2 = rho > l_t * grad
            mask3 = logical_and(logical_not(logical_or(mask1, mask2)),
                                (grad > 1e-12))
            mask1 = cast(mask1, dtype='float32')
            mask2 = cast(mask2, dtype='float32')
            mask3 = cast(mask3, dtype='float32')
            mask1.stop_gradient = True
            mask2.stop_gradient = True
            mask3.stop_gradient = True

            # v1 = v1 + l_t * grad2_x * mask1 - l_t * grad2_x * mask2 - (rho / grad) * grad2_x * mask3
            # v2 = v2 + l_t * grad2_y * mask1 - l_t * grad2_y * mask2 - (rho / grad) * grad2_y * mask3
            v1 = elementwise_add(
                u1,
                elementwise_add(
                    elementwise_mul(l_t * grad2_x, mask1),
                    elementwise_add(
                        elementwise_mul(-l_t * grad2_x, mask2),
                        elementwise_mul(-elementwise_div(rho, grad),
                                        elementwise_mul(grad2_x, mask3)))))
            v2 = elementwise_add(
                u2,
                elementwise_add(
                    elementwise_mul(l_t * grad2_y, mask1),
                    elementwise_add(
                        elementwise_mul(-l_t * grad2_y, mask2),
                        elementwise_mul(-elementwise_div(rho, grad),
                                        elementwise_mul(grad2_y, mask3)))))

            del rho
            del mask1
            del mask2
            del mask3

            v1 += u1
            v2 += u2

            u1 = v1 + self.t * self.divergence(p11, p12)
            u2 = v2 + self.t * self.divergence(p21, p22)
            del v1
            del v2
            u1 = u1
            u2 = u2

            u1x, u1y = self.forward_grad(u1)
            u2x, u2y = self.forward_grad(u2)

            p11 = (p11 + taut * u1x) / (1. +
                                        taut * sqrt(u1x**2 + u1y**2 + 1e-12))
            p12 = (p12 + taut * u1y) / (1. +
                                        taut * sqrt(u1x**2 + u1y**2 + 1e-12))
            p21 = (p21 + taut * u2x) / (1. +
                                        taut * sqrt(u2x**2 + u2y**2 + 1e-12))
            p22 = (p22 + taut * u2y) / (1. +
                                        taut * sqrt(u2x**2 + u2y**2 + 1e-12))
            del u1x
            del u1y
            del u2x
            del u2y

        return u1, u2
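The three masks above implement the standard TV-L1 soft-thresholding of the data term: depending on where rho falls relative to ±l_t * grad, the update adds l_t * grad2, subtracts l_t * grad2, or subtracts (rho / grad) * grad2. A NumPy sketch of that case analysis with made-up values:

import numpy as np

def threshold_update(rho, grad2, grad, l_t):
    # same case analysis as the mask1/mask2/mask3 branches above
    mask1 = rho < -l_t * grad
    mask2 = rho > l_t * grad
    mask3 = ~(mask1 | mask2) & (grad > 1e-12)
    return (l_t * grad2 * mask1
            - l_t * grad2 * mask2
            - (rho / grad) * grad2 * mask3)

rho = np.array([-2.0, 0.1, 3.0])
grad2 = np.array([0.5, 0.5, 0.5])
grad = np.array([1.0, 1.0, 1.0])
print(threshold_update(rho, grad2, grad, l_t=0.15))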
Example #10
    def _dygraph_clip(self, params_grads):
        params_and_grads = []

        sum_square_dist_fp16 = []
        sum_square_dist_fp32 = []
        sum_square_not_dist_fp16 = []
        sum_square_not_dist_fp32 = []

        for p, g in params_grads:
            if g is None:
                continue
            if getattr(p, 'need_clip', True) is False:
                continue
            merge_grad = g
            if g.type == core.VarDesc.VarType.SELECTED_ROWS:
                merge_grad = layers.merge_selected_rows(g)
                merge_grad = layers.get_tensor_from_selected_rows(merge_grad)
            square = layers.square(merge_grad)
            sum_square = layers.reduce_sum(square)

            not_shared_enable = (not hasattr(p, 'is_firstly_shared')) or (
                hasattr(p, 'is_firstly_shared')
                and getattr(p, 'is_firstly_shared', True))

            if not_shared_enable:
                if p.is_distributed:
                    if p.dtype == paddle.float16:
                        sum_square_dist_fp16.append(sum_square)
                    elif p.dtype == paddle.float32:
                        sum_square_dist_fp32.append(sum_square)
                else:
                    if p.dtype == paddle.float16:
                        sum_square_not_dist_fp16.append(sum_square)
                    elif p.dtype == paddle.float32:
                        sum_square_not_dist_fp32.append(sum_square)

        # global norm of distributed FP16 params_and_grads
        if len(sum_square_dist_fp16) == 0:
            global_norm_dist_fp16 = paddle.to_tensor([0.],
                                                     dtype=paddle.float32)
        else:
            global_norm_dist_fp16 = layers.concat(sum_square_dist_fp16)
            global_norm_dist_fp16 = layers.reduce_sum(global_norm_dist_fp16)
            global_norm_dist_fp16 = paddle.cast(global_norm_dist_fp16,
                                                dtype=paddle.float32)

        # global norm of non-distributed FP16 params_and_grads
        if len(sum_square_not_dist_fp16) == 0:
            global_norm_not_dist_fp16 = paddle.to_tensor([0.],
                                                         dtype=paddle.float32)
        else:
            global_norm_not_dist_fp16 = layers.concat(sum_square_not_dist_fp16)
            global_norm_not_dist_fp16 = layers.reduce_sum(
                global_norm_not_dist_fp16)
            global_norm_not_dist_fp16 = paddle.cast(global_norm_not_dist_fp16,
                                                    dtype=paddle.float32)

        # global norm of distributed FP32 params_and_grads
        global_norm_dist_fp32 = layers.concat(sum_square_dist_fp32) if len(
            sum_square_dist_fp32) != 0 else paddle.to_tensor(
                [0.], dtype=paddle.float32)
        global_norm_dist_fp32 = layers.reduce_sum(global_norm_dist_fp32)

        # global norm of non-distributed FP32 params_and_grads
        global_norm_not_dist_fp32 = layers.concat(
            sum_square_not_dist_fp32
        ) if len(sum_square_not_dist_fp32) != 0 else paddle.to_tensor(
            [0.], dtype=paddle.float32)
        global_norm_not_dist_fp32 = layers.reduce_sum(
            global_norm_not_dist_fp32)

        global_norm_var_dist = global_norm_dist_fp16 + global_norm_dist_fp32
        global_norm_var_not_dist = global_norm_not_dist_fp16 + global_norm_not_dist_fp32

        # add all reduce to get global norm of distributed params_and_grads
        if self._hcg.get_model_parallel_world_size() > 1:
            paddle.distributed.all_reduce(
                global_norm_var_dist,
                group=self._hcg.get_check_parallel_group())

        # add all reduce to get global norm of non-distributed params_and_grads in groups of pp
        if self._hcg.get_pipe_parallel_world_size() > 1:
            paddle.distributed.all_reduce(
                global_norm_var_not_dist,
                group=self._hcg.get_pipe_parallel_group())

        # In sharding mode, params and grads are mapped to different ranks in the optimizer.
        # ClipGradByGlobalNorm needs an allreduce to get the global norm.
        if self._hcg.get_sharding_parallel_world_size() > 1:
            paddle.distributed.all_reduce(
                global_norm_var_not_dist,
                group=self._hcg.get_sharding_parallel_group())

        global_norm_var_fp32 = layers.sqrt(global_norm_var_dist +
                                           global_norm_var_not_dist)

        max_global_norm = layers.fill_constant(
            shape=[1], dtype=global_norm_var_fp32.dtype, value=self.clip_norm)
        clip_var = layers.elementwise_div(x=max_global_norm,
                                          y=layers.elementwise_max(
                                              x=global_norm_var_fp32,
                                              y=max_global_norm))
        clip_var_fp16 = paddle.cast(clip_var, paddle.float16)
        for p, g in params_grads:
            if g is None:
                continue
            if getattr(p, 'need_clip', True) is False:
                params_and_grads.append((p, g))
                continue
            if p.dtype == paddle.float16:
                new_grad = layers.elementwise_mul(x=g, y=clip_var_fp16)
            else:
                new_grad = layers.elementwise_mul(x=g, y=clip_var)
            params_and_grads.append((p, new_grad))

        return params_and_grads
Example #11
 def __softmax(x, eps=1e-9):
     exp_out = layers.exp(x=x)
     sum_out = layers.reduce_sum(exp_out, dim=-1, keep_dim=False)
     return layers.elementwise_div(x=exp_out, y=sum_out, axis=0)
Example #12
    def _dygraph_clip(self, params_grads):
        normal_params_grads = []
        moe_params_grads = []

        # separate moe params from normal params
        if self.moe_group is not None and self.moe_group.nranks > 1:
            for p, g in params_grads:
                if self.is_expert_param_func(p):
                    moe_params_grads.append((p, g))
                else:
                    normal_params_grads.append((p, g))
        else:
            normal_params_grads = params_grads

        # Why return sum_dtype?
        # We will call `get_l2_norm_pow` twice and the precisions may differ.
        # For convenience and simplicity, we use sum_dtype directly instead of global_norm_var_normal.dtype.
        global_norm_var_normal, sum_dtype \
            = self.get_l2_norm_pow(normal_params_grads)
        global_norm_var_moe = None
        if len(moe_params_grads) > 0:
            global_norm_var_moe, _ \
                = self.get_l2_norm_pow(moe_params_grads, sum_dtype)
            if global_norm_var_moe is not None:
                collective.all_reduce(global_norm_var_moe,
                                      op=collective.ReduceOp.SUM,
                                      group=self.moe_group)

        if global_norm_var_normal is None and global_norm_var_moe is None:
            return params_grads
        elif global_norm_var_normal is None:
            global_norm_var = global_norm_var_moe
        elif global_norm_var_moe is None:
            global_norm_var = global_norm_var_normal
        else:
            if global_norm_var_normal.dtype != global_norm_var_moe.dtype:
                # Compared with the normal norm, the MoE norm is computed later,
                # so its precision is no lower than that of the normal norm.
                global_norm_var_normal = \
                    global_norm_var_normal.astype(global_norm_var_moe.dtype)
            global_norm_var = global_norm_var_normal + global_norm_var_moe

        params_and_grads = []
        global_norm_var = layers.sqrt(global_norm_var)
        max_global_norm = layers.fill_constant(shape=[1],
                                               dtype=global_norm_var.dtype,
                                               value=self.clip_norm)
        clip_var = layers.elementwise_div(x=max_global_norm,
                                          y=layers.elementwise_max(
                                              x=global_norm_var,
                                              y=max_global_norm))
        for p, g in params_grads:
            if g is None:
                continue
            if getattr(p, 'need_clip', True) is False:
                params_and_grads.append((p, g))
                continue
            # TODO(wangxi): use inplace elementwise_mul
            clip_input = (clip_var.astype('float16') if g.dtype
                          == core.VarDesc.VarType.FP16 else clip_var)
            new_grad = layers.elementwise_mul(x=g, y=clip_input)
            params_and_grads.append((p, new_grad))
        return params_and_grads
Example #13
def log_fun(x, y):
    return F.elementwise_mul(x, (F.elementwise_div(x, y)))
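log_fun chains elementwise_mul and elementwise_div, i.e. it computes x * (x / y) = x² / y elementwise (despite its name, no logarithm is applied). A quick NumPy check with made-up inputs:

import numpy as np

x = np.array([1.0, 2.0, 3.0])
y = np.array([2.0, 4.0, 6.0])
print(np.allclose(x * (x / y), x ** 2 / y))   # True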
Example #14
def compute_weight(v, g, dim, power):
    assert len(g.shape) == 1, "magnitude should be a vector"
    v_normalized = F.elementwise_div(v, (norm_except(v, dim, power) + 1e-12),
                                     axis=dim)
    weight = F.elementwise_mul(v_normalized, g, axis=dim)
    return weight
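This is the weight-normalization reparameterization w = g * v / ||v||, applied along dim. A NumPy sketch assuming dim=0 and an L2 norm (power=2), with the helper norm_except replaced by an explicit reduction over the other axes:

import numpy as np

def compute_weight_np(v, g, dim=0, eps=1e-12):
    # L2 norm over every axis except `dim`, kept for broadcasting
    axes = tuple(i for i in range(v.ndim) if i != dim)
    v_norm = np.sqrt((v * v).sum(axis=axes, keepdims=True))
    shape = [1] * v.ndim
    shape[dim] = -1
    return (v / (v_norm + eps)) * g.reshape(shape)

v = np.random.randn(4, 3)
g = np.ones(4)
print(np.linalg.norm(compute_weight_np(v, g, dim=0), axis=1))   # ~[1. 1. 1. 1.]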
Example #15
    def fast_decode(self):
        """create model for inference"""
        if self.task_type == "dialog":
            emb_num = 4
        else:
            emb_num = 3
        input_shapes = [[-1, self.max_seq_len, 1]] * emb_num + \
                       [[-1, self.max_seq_len, self.max_seq_len]]
        input_dtypes = ['int64'] * emb_num + ['float32']
        input_lod_levels = [0] * emb_num + [0]

        shapes = input_shapes + [[-1, 1, 1], [-1, 1, 1], [-1, 1], [-1],
                                 [-1, 1, self.max_seq_len], [-1, 1]]
        dtypes = input_dtypes + [
            'int64', 'int64', 'float32', 'int32', 'float32', 'int64'
        ]
        lod_levels = input_lod_levels + [2, 2, 2, 0, 0, 0]

        inputs = self.to_tensor(shapes, dtypes, lod_levels)
        pyreader = fluid.io.DataLoader.from_generator(feed_list=inputs,
                                                      capacity=70,
                                                      iterable=False)
        emb_ids = {}
        for key, value in zip(self.emb_keys, inputs[:emb_num]):
            emb_ids[key] = value

        input_mask = inputs[emb_num]
        tgt_ids, tgt_pos, init_scores, parent_idx, tgt_input_mask, data_ids = inputs[
            -6:]

        unimo = UNIMOModel(emb_ids=emb_ids,
                           input_mask=input_mask,
                           config=self.gene_config,
                           task_type=self.task_type,
                           decoding=True,
                           gather_idx=parent_idx)

        max_len = layers.fill_constant(shape=[1],
                                       dtype=tgt_ids.dtype,
                                       value=self.max_out_len,
                                       force_cpu=True)
        min_len = layers.fill_constant(shape=[1],
                                       dtype=tgt_ids.dtype,
                                       value=self.min_out_len,
                                       force_cpu=True)
        neg_inf = layers.fill_constant(shape=[1], dtype='float32', value=-1e18)
        step_idx = layers.fill_constant(shape=[1],
                                        dtype=tgt_ids.dtype,
                                        value=0,
                                        force_cpu=True)
        step_next_idx = layers.fill_constant(shape=[1],
                                             dtype=tgt_ids.dtype,
                                             value=1,
                                             force_cpu=True)
        cond = layers.less_than(x=step_idx, y=max_len)
        while_op = layers.While(cond)

        ids = layers.array_write(layers.reshape(tgt_ids, (-1, 1)), step_idx)
        pos_biases = layers.array_write(tgt_pos, step_idx)
        scores = layers.array_write(init_scores, step_idx)
        tgt_masks = layers.array_write(tgt_input_mask, step_idx)

        trigram_blocking = TrigramBlocking(tgt_ids,
                                           self.tokenizer,
                                           beam_size=self.beam_size)

        with while_op.block():
            pre_ids = layers.array_read(array=ids, i=step_idx)
            pre_ids = layers.reshape(pre_ids, (-1, 1, 1), inplace=True)
            pre_scores = layers.array_read(array=scores, i=step_idx)
            pos_bias = layers.array_read(array=pos_biases, i=step_idx)
            pos_bias = layers.gather(input=pos_bias, index=parent_idx)

            def gen_batch_like(value,
                               dtype="int64",
                               shape=[-1, 1, 1],
                               is_scalar=True):
                """generate batch"""
                if is_scalar:
                    return layers.fill_constant_batch_size_like(
                        input=parent_idx,
                        value=value,
                        shape=shape,
                        dtype=dtype)
                else:
                    return layers.elementwise_mul(
                        x=layers.fill_constant_batch_size_like(
                            input=parent_idx,
                            value=1,
                            shape=shape,
                            dtype=dtype),
                        y=value,
                        axis=0)

            tmp_mask = layers.array_read(tgt_masks, i=step_idx)
            tmp_mask = layers.gather(input=tmp_mask, index=parent_idx)
            append_1_mask = gen_batch_like(1.0, dtype=tmp_mask.dtype)
            pre_mask = layers.concat([tmp_mask, append_1_mask], axis=2)

            pre_pos = gen_batch_like(step_idx, is_scalar=False)
            pre_pos = pre_pos + pos_bias  # pos starts from 2

            pre_sent = gen_batch_like(self.tgt_type_id, dtype=pre_ids.dtype)

            dec_emb_ids = {"word_embedding": pre_ids, "pos_embedding": pre_pos}
            if self.task_type == "dialog":
                role_ids = gen_batch_like(0)
                turn_ids = gen_batch_like(0)
                dec_emb_ids["role_embedding"] = role_ids
                dec_emb_ids["turn_embedding"] = turn_ids
            else:
                dec_emb_ids["sent_embedding"] = pre_sent

            dec_out = unimo.encode(emb_ids=dec_emb_ids,
                                   input_mask=pre_mask,
                                   gather_idx=parent_idx)
            fc_out = self.cal_logit(dec_out, None)

            # prevent generating the end token if the length is less than min_out_len
            eos_index = layers.fill_constant(shape=[layers.shape(fc_out)[0]],
                                             dtype='int64',
                                             value=self.eos_id)
            eos_index = fluid.one_hot(eos_index, depth=self.vocab_size)
            less_cond = layers.cast(layers.less_than(x=step_idx, y=min_len),
                                    dtype='float32')
            less_val = layers.elementwise_mul(less_cond, neg_inf)
            eos_val = layers.elementwise_mul(eos_index, less_val, axis=0)
            revised_logits = layers.elementwise_add(fc_out, eos_val, axis=0)

            # top-k reduction across beams, also containing special handling of
            # finished beams and finished sentences (batch reduction)
            topk_scores, topk_indices = layers.topk(
                input=layers.softmax(revised_logits), k=self.beam_size)

            # Roll back the previous scores for the length penalty.
            # The previous scores already include the length penalty; before applying
            # this timestep's penalty we need to roll it back. Because of this, we store
            # the length-penalized score in `scores` while the calculation uses the
            # un-penalized score.
            # -> safe for step_idx == 0 (initialization state), because the previous score is 0
            pre_timestep_length_penalty = fluid.layers.pow(
                ((5.0 + fluid.layers.cast(step_idx, pre_scores.dtype)) / 6.0),
                self.length_penalty)
            pre_scores_wo_len_penalty = fluid.layers.elementwise_mul(
                pre_scores, pre_timestep_length_penalty)

            # calc trigram-blocking delta scores for current alive sequence
            if self.block_trigram:
                trigram_blocking.update_seq(pre_ids, parent_idx)
                trigram_blocking.expand_cand_seq(topk_indices)
                fluid.layers.py_func(func=trigram_blocking.blocking_forward,
                                     x=[
                                         trigram_blocking.cand_seq,
                                         trigram_blocking.id2is_full_token
                                     ],
                                     out=trigram_blocking.delta_score_out,
                                     backward_func=None)
                pre_scores_wo_len_penalty = fluid.layers.elementwise_add(
                    x=trigram_blocking.delta_score_out,
                    y=pre_scores_wo_len_penalty,
                    axis=0)
            # => [N, topk]
            accu_scores = layers.elementwise_add(x=layers.log(topk_scores),
                                                 y=pre_scores_wo_len_penalty,
                                                 axis=0)

            cur_timestep_length_penalty = layers.pow(
                ((5.0 + layers.cast(step_next_idx, accu_scores.dtype)) / 6.0),
                self.length_penalty)
            curr_scores = layers.elementwise_div(accu_scores,
                                                 cur_timestep_length_penalty)

            # beam_search op uses lod to differentiate branches.
            curr_scores = layers.lod_reset(curr_scores, pre_ids)
            topk_indices = layers.lod_reset(topk_indices, pre_ids)
            selected_ids, selected_scores, gather_idx = layers.beam_search(
                pre_ids=pre_ids,
                pre_scores=pre_scores,
                ids=topk_indices,
                scores=curr_scores,
                beam_size=self.beam_size,
                end_id=self.eos_id,
                return_parent_idx=True)

            layers.increment(x=step_idx, value=1.0, in_place=True)
            layers.increment(x=step_next_idx, value=1.0, in_place=True)
            # cell states (caches) have been updated in wrap_decoder;
            # we only need to update the beam search states here.
            layers.array_write(selected_ids, i=step_idx, array=ids)
            layers.array_write(selected_scores, i=step_idx, array=scores)
            layers.array_write(pre_mask, i=step_idx, array=tgt_masks)
            layers.array_write(pos_bias, i=step_idx, array=pos_biases)
            layers.assign(gather_idx, parent_idx)

            length_cond = layers.less_than(x=step_idx, y=max_len)
            finish_cond = layers.logical_not(layers.is_empty(x=selected_ids))
            layers.logical_and(x=length_cond, y=finish_cond, out=cond)

        finished_ids, finished_scores = layers.beam_search_decode(
            ids, scores, beam_size=self.beam_size, end_id=self.eos_id)

        graph_vars = {
            "finished_ids": finished_ids,
            "finished_scores": finished_scores,
            "data_ids": data_ids
        }

        for k, v in graph_vars.items():
            v.persistable = True

        return pyreader, graph_vars