Example #1
    def forward(self, p, q, pad_mask=None):
        """
        Args:
            p(obj:`Tensor`): the first forward logits of training examples.
            q(obj:`Tensor`): the second forward logits of training examples.
            pad_mask(obj:`Tensor`, optional): the Tensor containing the binary mask to index with; its data type is bool.

        Returns:
            loss(obj:`Tensor`): the R-Drop loss of p and q.
        """
        p_loss = F.kl_div(F.log_softmax(p, axis=-1),
                          F.softmax(q, axis=-1),
                          reduction=self.reduction)
        q_loss = F.kl_div(F.log_softmax(q, axis=-1),
                          F.softmax(p, axis=-1),
                          reduction=self.reduction)

        # pad_mask is for seq-level tasks
        if pad_mask is not None:
            p_loss = paddle.masked_select(p_loss, pad_mask)
            q_loss = paddle.masked_select(q_loss, pad_mask)

        # Choose "sum" or "mean" here depending on your task
        p_loss = p_loss.sum()
        q_loss = q_loss.sum()
        loss = (p_loss + q_loss) / 2
        return loss
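A minimal usage sketch for the R-Drop loss above, assuming a hypothetical `RDropLoss` module that wraps this `forward` plus a weighting coefficient `alpha` (both names are assumptions, not part of the snippet):

import paddle.nn.functional as F

def rdrop_train_step(model, rdrop_loss, input_ids, labels, alpha=4.0):
    # Two forward passes with dropout active yield two different logits.
    logits_1 = model(input_ids)
    logits_2 = model(input_ids)
    ce = (F.cross_entropy(logits_1, labels) + F.cross_entropy(logits_2, labels)) * 0.5
    kl = rdrop_loss(logits_1, logits_2)
    return ce + alpha * kl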
Example #2
 def forward(self, x, y):
     _, z_s = self.ques_encoder(y)
     z_s = paddle.fluid.layers.transpose(z_s, perm=[1, 0, 2]) # [B,1,I]
     for i in range(self.hop):
         z_s_ = paddle.fluid.layers.expand(z_s,expand_times=[1, x.shape[1], 1])  # [B, S, I]
         s = self.FFNs_start[i](paddle.concat([x, z_s_, x * z_s_], axis=2)).squeeze(2)
         p_s = F.softmax(s, axis=1)  # [B, S]
         u_s = p_s.unsqueeze(1).bmm(x)  # [B, 1, I]
         z_e = self.SFUs_start[i](z_s, u_s)  # [B, 1, I]
         z_s_ = paddle.fluid.layers.expand(z_s,expand_times=[1, x.shape[1], 1])  # [B, S, I]
         e = self.FFNs_end[i](paddle.concat([x, z_s_, x * z_s_], axis=2)).squeeze(2)
         p_e = F.softmax(e, axis=1)  # [B, S]
         u_e = p_e.unsqueeze(1).bmm(x)  # [B, 1, I]
         z_s = self.SFUs_end[i](z_e, u_e)
     if self.normalize:
         if self.training:
             # In training we output log-softmax for NLL
             p_s = F.log_softmax(s, axis=1)  # [B, S]
             p_e = F.log_softmax(e, axis=1)  # [B, S]
         else:
             # ...Otherwise 0-1 probabilities
             p_s = F.softmax(s, axis=1)  # [B, S]
             p_e = F.softmax(e, axis=1)  # [B, S]
     else:
         p_s = s.exp()
         p_e = e.exp()
     return p_s, p_e, z_s
Example #3
    def check_api(self, place=fluid.CPUPlace(), axis=None, dtype=None):
        ref_out = ref_log_softmax(self.x, axis, dtype)
        main_program = fluid.Program()
        mylogsoftmax = nn.LogSoftmax(axis)
        with fluid.program_guard(main_program):
            x = fluid.data(name='x', shape=self.x_shape)
            y = functional.log_softmax(x, axis, dtype)
        exe = fluid.Executor(place)
        out = exe.run(main_program, feed={'x': self.x}, fetch_list=[y])
        self.assertTrue(np.allclose(out[0], ref_out))

        with fluid.dygraph.guard(place):
            x = fluid.dygraph.to_variable(self.x)
            y = functional.log_softmax(x, axis, dtype)
        self.assertTrue(np.allclose(y.numpy(), ref_out))
Example #4
def calc_minilm_loss(loss_fct, s, t, attn_mask, num_relation_heads=0):
    # Initialize head_num
    if num_relation_heads > 0 and num_relation_heads != s.shape[1]:
        # s's shape: [bs, seq_len, head_num, head_dim]
        s = tensor.transpose(x=s, perm=[0, 2, 1, 3])
        # s's shape: [bs, seq_len, num_relation_heads, head_dim_new]
        s = tensor.reshape(x=s, shape=[0, 0, num_relation_heads, -1])
        # s's shape: [bs, num_relation_heads, seq_len, head_dim_new]
        s = tensor.transpose(x=s, perm=[0, 2, 1, 3])
    if num_relation_heads > 0 and num_relation_heads != t.shape[1]:
        t = tensor.transpose(x=t, perm=[0, 2, 1, 3])
        t = tensor.reshape(x=t, shape=[0, 0, num_relation_heads, -1])
        t = tensor.transpose(x=t, perm=[0, 2, 1, 3])

    pad_seq_len = s.shape[2]
    s_head_dim, t_head_dim = s.shape[3], t.shape[3]
    scaled_dot_product_s = tensor.matmul(
        x=s, y=s, transpose_y=True) / math.sqrt(s_head_dim)
    del s
    scaled_dot_product_s += attn_mask

    scaled_dot_product_t = tensor.matmul(
        x=t, y=t, transpose_y=True) / math.sqrt(t_head_dim)
    del t
    scaled_dot_product_t += attn_mask
    loss = loss_fct(F.log_softmax(scaled_dot_product_s),
                    F.softmax(scaled_dot_product_t))
    return loss
Example #5
    def check_api(self, axis=-1, dtype=None):
        x = self.x.copy()
        if dtype is not None:
            x = x.astype(dtype)
        ref_out = np.apply_along_axis(ref_log_softmax, axis, x)
        with paddle.static.program_guard(paddle.static.Program()):
            x = paddle.fluid.data(name='x', shape=self.x_shape)
            y = F.log_softmax(x, axis, dtype)
            exe = paddle.static.Executor(self.place)
            out = exe.run(feed={'x': self.x}, fetch_list=[y])
        self.assertTrue(np.allclose(out[0], ref_out))

        paddle.disable_static()
        x = paddle.to_tensor(self.x)
        y = F.log_softmax(x, axis, dtype)
        self.assertTrue(np.allclose(y.numpy(), ref_out))
        paddle.enable_static()
Example #6
 def forward(self, logits, label):
     logits = F.normalize(logits, p=2, axis=1, epsilon=self.eps)
     wn = F.normalize(self.w, p=2, axis=0, epsilon=self.eps)
     cosine = paddle.matmul(logits, wn)
     y = paddle.zeros((logits.shape[0], self.n_classes))
     for i in range(logits.shape[0]):
         y[i, label[i]] = self.margin
     pred = F.log_softmax((cosine - y) * self.scale, -1)
     return self.nll_loss(pred, label), pred
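The per-row loop above writes the margin only at the target class; a vectorized equivalent using `F.one_hot` would look like the sketch below (function name and argument order are assumptions):

import paddle.nn.functional as F

def margin_log_softmax(cosine, label, margin, scale):
    # Subtract `margin` from the target-class cosine only, then scale and log-softmax,
    # matching the loop-based construction of `y` above.
    y = F.one_hot(label, cosine.shape[1]) * margin
    return F.log_softmax((cosine - y) * scale, axis=-1)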
Example #7
 def calc_loss(self, x, target):
     if self._label_smoothing:
         target = self._labelsmoothing(target)
         x = -F.log_softmax(x, axis=-1)
         cost = paddle.sum(x * target, axis=-1)
     else:
         cost = F.cross_entropy(x, label=target)
     avg_cost = self.reduce_loss(cost)
     return avg_cost
Example #8
 def _crossentropy(self, input, target):
     if self._label_smoothing:
         target = self._labelsmoothing(target)
         input = -F.log_softmax(input, axis=-1)
         cost = paddle.sum(target * input, axis=-1)
     else:
         cost = F.cross_entropy(input=input, label=target)
     avg_cost = paddle.mean(cost)
     return avg_cost
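Examples #7 and #8 share the same label-smoothing path: the smoothed one-hot target is multiplied by `-log_softmax(x)` and summed over the class axis. A self-contained sketch of that path (the epsilon value and shapes are assumptions):

import paddle
import paddle.nn.functional as F

def smoothed_cross_entropy(logits, labels, epsilon=0.1):
    n_class = logits.shape[-1]
    one_hot = F.one_hot(labels, n_class)
    # Spread epsilon probability mass over the non-target classes.
    target = one_hot * (1.0 - epsilon) + (1.0 - one_hot) * epsilon / (n_class - 1)
    cost = paddle.sum(-F.log_softmax(logits, axis=-1) * target, axis=-1)
    return cost.mean()

loss = smoothed_cross_entropy(paddle.randn([4, 10]), paddle.randint(0, 10, [4]))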
Example #9
    def forward(self, outputs, targets, length=None):
        targets = F.one_hot(targets, outputs.shape[1])
        try:
            predictions = self.loss_fn(outputs, targets)
        except TypeError:
            predictions = self.loss_fn(outputs)

        predictions = F.log_softmax(predictions, axis=1)
        loss = self.criterion(predictions, targets) / targets.sum()
        return loss
Example #10
    def greedy_search(self, input_ids, logits_processors, max_length,
                      pad_token_id, eos_token_id, **model_kwargs):
        batch_size, cur_len = input_ids.shape
        origin_len = cur_len
        unfinished_flag = paddle.full([batch_size, 1], True, dtype='bool')
        scores = paddle.full([batch_size, 1],
                             0.0,
                             dtype=paddle.get_default_dtype())

        while cur_len < max_length:
            # prepare model inputs & get model output
            model_inputs = self.prepare_inputs_for_generation(
                input_ids, **model_kwargs)
            outputs = self(**model_inputs)
            logits = outputs[0] if isinstance(outputs, tuple) else outputs
            # [batch_size, vocab_size]
            logits = logits[:, -1, :]

            # pre-process distribution
            logits = self.adjust_logits_during_generation(logits)
            logits = logits_processors(input_ids, logits)

            # greedy
            probs = F.log_softmax(logits)
            next_tokens = paddle.argmax(probs, axis=-1).unsqueeze(-1)
            next_scores = paddle.index_sample(probs, next_tokens)

            if eos_token_id is not None:
                next_tokens = paddle.where(
                    unfinished_flag, next_tokens,
                    paddle.full_like(next_tokens, pad_token_id))

            scores = self.update_scores_for_generation(scores, next_scores,
                                                       cur_len - origin_len,
                                                       unfinished_flag)

            cur_len += 1
            input_ids = paddle.concat([input_ids, next_tokens], axis=1)

            if eos_token_id is not None:
                unfinished_flag = paddle.logical_and(
                    unfinished_flag, next_tokens != eos_token_id)

            # Stop when there is a </s> in all sentences
            if not paddle.any(unfinished_flag):
                break

            model_kwargs = self.update_model_kwargs_for_generation(
                outputs, model_kwargs)
        return input_ids[:, origin_len:], scores
Example #11
 def forward(self, x, label):
     loss_dict = {}
     if self.epsilon is not None:
         class_num = x.shape[-1]
         label = self._labelsmoothing(label, class_num)
         x = -F.log_softmax(x, axis=-1)
         loss = paddle.sum(x * label, axis=-1)
     else:
         if label.shape[-1] == x.shape[-1]:
             label = F.softmax(label, axis=-1)
             soft_label = True
         else:
             soft_label = False
         loss = F.cross_entropy(x, label=label, soft_label=soft_label)
     return loss
Example #12
    def forward(self, x):
        x = self.conv1(x)
        # x = self.maxpool(x)

        x = self.stage2(x)
        x = self.stage3(x)
        x = self.stage4(x)

        # global average pooling layer
        x = F.avg_pool2d(x, x.shape[-2:])
        # flatten for input to fully-connected layer
        x = x.flatten(1)
        x = self.fc(x)

        return F.log_softmax(x, axis=1)
Example #13
    def forward(self, confidence, predicted_locations, labels, gt_locations):
        """Compute classification loss and smooth l1 loss.

        Args:
            confidence (batch_size, num_priors, num_classes): class predictions.
            predicted_locations (batch_size, num_priors, 4): predicted locations.
            labels (batch_size, num_priors): real labels of all the priors.
            gt_locations (batch_size, num_priors, 4): real boxes corresponding to all the priors.
        """
        num_classes = confidence.shape[2]
        with paddle.no_grad():
            # derived from cross_entropy=sum(log(p))
            loss = -F.log_softmax(confidence, 2)[:, :, 0]
            mask = box_utils.hard_negative_mining(loss, labels,
                                                  self.neg_pos_ratio)

        confidence = paddle.concat([
            confidence[:, :, 0].masked_select(mask).reshape([-1, 1]),
            confidence[:, :, 1].masked_select(mask).reshape([-1, 1])
        ],
                                   axis=1)
        classification_loss = F.cross_entropy(confidence.reshape(
            [-1, num_classes]),
                                              labels.masked_select(mask),
                                              reduction='sum')
        pos_mask = labels > 0
        predicted_locations = predicted_locations.masked_select(
            paddle.concat([
                pos_mask.reshape(pos_mask.shape + [1]),
                pos_mask.reshape(pos_mask.shape + [1]),
                pos_mask.reshape(pos_mask.shape + [1]),
                pos_mask.reshape(pos_mask.shape + [1])
            ],
                          axis=2)).reshape([-1, 4])
        gt_locations = gt_locations.masked_select(
            paddle.concat([
                pos_mask.reshape(pos_mask.shape + [1]),
                pos_mask.reshape(pos_mask.shape + [1]),
                pos_mask.reshape(pos_mask.shape + [1]),
                pos_mask.reshape(pos_mask.shape + [1])
            ],
                          axis=2)).reshape([-1, 4])
        smooth_l1_loss = F.smooth_l1_loss(predicted_locations,
                                          gt_locations.cast('float32'),
                                          reduction='sum')  # smooth_l1_loss
        # smooth_l1_loss = F.mse_loss(predicted_locations, gt_locations, reduction='sum')  #l2 loss
        num_pos = gt_locations.shape[0]
        return smooth_l1_loss / num_pos, classification_loss / num_pos
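A shape-only sketch of the inputs this loss expects, assuming a hypothetical `MultiBoxLoss` module that holds the `forward` above; the call itself is left commented because `box_utils.hard_negative_mining` is not shown here:

import paddle

batch_size, num_priors, num_classes = 2, 100, 21
confidence = paddle.randn([batch_size, num_priors, num_classes])
predicted_locations = paddle.randn([batch_size, num_priors, 4])
labels = paddle.randint(0, num_classes, [batch_size, num_priors])
gt_locations = paddle.randn([batch_size, num_priors, 4])
# criterion = MultiBoxLoss(neg_pos_ratio=3)
# smooth_l1, cls_loss = criterion(confidence, predicted_locations, labels, gt_locations)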
Example #14
    def forward(self, logits):
        assert logits.ndim == 3, (
            'the input logits must be a '
            '3d tensor of shape [n_spk, n_uttns, emb_dim], '
            f'but received logits.ndim = {logits.ndim}')

        logits = F.normalize(logits, p=2, axis=-1, epsilon=self.eps)
        proto = paddle.mean(logits[:, 1:, :], axis=1, keepdim=False).transpose(
            (1, 0))  # [emb_dim, n_spk]
        query = logits[:, 0, :]  # [n_spk, emb_dim]
        similarity = paddle.matmul(query, proto) * self.s  #[n_spk,n_spk]
        label = paddle.arange(0, similarity.shape[0])
        log_sim = F.log_softmax(similarity, -1)
        return self.nll_loss(log_sim, label), log_sim
Example #15
def calc_minilm_loss(loss_fct, s, t, attn_mask, num_relation_heads=0):
    """
    Calculates loss for Q-Q, K-K, V-V relation from MiniLMv2.
    Args:
        loss_fct (callable):
            Loss function for distillation. It only supports kl_div loss now.
        s (Tensor):
            Q, K, V of Student.
        t (Tensor):
            Q, K, V of teacher.
        attn_mask (Tensor):
            Attention mask for relation.
        num_relation_heads (int):
            The number of relation heads. 0 means `num_relation_heads` equals
            the original head num.
            Defaults to 0.

    Returns:
        Tensor: MiniLM loss value.

    """
    # Initialize head_num
    if num_relation_heads > 0 and num_relation_heads != s.shape[1]:
        # s's shape: [bs, seq_len, head_num, head_dim]
        s = tensor.transpose(x=s, perm=[0, 2, 1, 3])
        # s's shape: [bs, seq_len, num_relation_heads, head_dim_new]
        s = tensor.reshape(x=s, shape=[0, 0, num_relation_heads, -1])
        # s's shape: [bs, num_relation_heads, seq_len, head_dim_new]
        s = tensor.transpose(x=s, perm=[0, 2, 1, 3])
    if num_relation_heads > 0 and num_relation_heads != t.shape[1]:
        t = tensor.transpose(x=t, perm=[0, 2, 1, 3])
        t = tensor.reshape(x=t, shape=[0, 0, num_relation_heads, -1])
        t = tensor.transpose(x=t, perm=[0, 2, 1, 3])

    s_head_dim, t_head_dim = s.shape[3], t.shape[3]
    scaled_dot_product_s = tensor.matmul(
        x=s, y=s, transpose_y=True) / math.sqrt(s_head_dim)
    del s
    scaled_dot_product_s += attn_mask

    scaled_dot_product_t = tensor.matmul(
        x=t, y=t, transpose_y=True) / math.sqrt(t_head_dim)
    del t
    scaled_dot_product_t += attn_mask
    loss = loss_fct(F.log_softmax(scaled_dot_product_s),
                    F.softmax(scaled_dot_product_t))
    return loss
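A call sketch for `calc_minilm_loss`, assuming `paddle.nn.functional.kl_div` (with `reduction='sum'`) as the `loss_fct` mentioned in the docstring; the tensor sizes and `num_relation_heads` value are illustrative only:

import paddle
import paddle.nn.functional as F

def kl_loss_fct(student_log_prob, teacher_prob):
    return F.kl_div(student_log_prob, teacher_prob, reduction='sum')

bs, head_num, seq_len, head_dim = 2, 12, 8, 64
q_student = paddle.randn([bs, head_num, seq_len, head_dim])
q_teacher = paddle.randn([bs, head_num, seq_len, head_dim])
attn_mask = paddle.zeros([bs, 1, 1, seq_len])  # additive mask, 0 means "keep"
loss = calc_minilm_loss(kl_loss_fct, q_student, q_teacher, attn_mask,
                        num_relation_heads=48)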
Example #16
 def forward(self, x, label):
     if isinstance(x, dict):
         x = x["logits"]
     if self.epsilon is not None:
         class_num = x.shape[-1]
         label = self._labelsmoothing(label, class_num)
         x = -F.log_softmax(x, axis=-1)
         loss = paddle.sum(x * label, axis=-1)
     else:
         if label.shape[-1] == x.shape[-1]:
             label = F.softmax(label, axis=-1)
             soft_label = True
         else:
             soft_label = False
         loss = F.cross_entropy(x, label=label, soft_label=soft_label)
     loss = loss.mean()
     return {"CELoss": loss}
Example #17
 def pointer(self, x, state):
     x_ = paddle.fluid.layers.expand(state.unsqueeze(1),expand_times=[1, x.shape[1], 1])
     out = paddle.concat([x, x_], axis=2)
     s0 = F.tanh(self.linear(out))
     s = self.weights(s0).reshape(shape=[x.shape[0], x.shape[1]])
     a = F.softmax(s)
     res = a.unsqueeze(1).bmm(x).squeeze(1)
     if self.normalize:
         if self.training:
             # In training we output log-softmax for NLL
             scores = F.log_softmax(s)
         else:
             # ...Otherwise 0-1 probabilities
             scores = F.softmax(s)
     else:
         scores = a.exp()
     return scores, res
Example #18
    def __call__(self, logits, label, mode="train"):
        loss_dict = {}
        if self.epsilon is not None:
            class_num = logits.shape[-1]
            label = self._labelsmoothing(label, class_num)

            x = -F.log_softmax(logits, axis=-1)
            loss = paddle.sum(x * label, axis=-1)
        else:
            if label.shape[-1] == logits.shape[-1]:
                label = F.softmax(label, axis=-1)
                soft_label = True
            else:
                soft_label = False
            loss = F.cross_entropy(logits, label=label, soft_label=soft_label)

        loss_dict[self.name] = paddle.mean(loss)
        return loss_dict
Example #19
 def forward(self, pred, batch):
     pred = pred.reshape([-1, pred.shape[2]])
     max_len = batch[2].max()
     tgt = batch[1][:, 1:2 + max_len]
     tgt = tgt.reshape([-1])
     if self.smoothing:
         eps = 0.1
         n_class = pred.shape[1]
         one_hot = F.one_hot(tgt, pred.shape[1])
         one_hot = one_hot * (1 - eps) + (1 - one_hot) * eps / (n_class - 1)
         log_prb = F.log_softmax(pred, axis=1)
         non_pad_mask = paddle.not_equal(
             tgt, paddle.zeros(tgt.shape, dtype=tgt.dtype))
         loss = -(one_hot * log_prb).sum(axis=1)
         loss = loss.masked_select(non_pad_mask).mean()
     else:
         loss = self.loss_func(pred, tgt)
     return {'loss': loss}
Example #20
    def forward(self, logits, targets):
        logits = F.normalize(logits, p=2, axis=1, epsilon=1e-8)
        wn = F.normalize(self.w, p=2, axis=0, epsilon=1e-8)
        cosine = logits @ wn

        sine = paddle.sqrt(1.0 - paddle.square(cosine))
        phi = cosine * self.cos_m - sine * self.sin_m  # cos(theta + m)
        if self.easy_margin:
            phi = paddle.where(cosine > 0, phi, cosine)
        else:
            phi = paddle.where(cosine > self.th, phi, cosine - self.mm)
        target_one_hot = F.one_hot(targets, self.n_classes)
        outputs = (target_one_hot * phi) + (
            (1.0 - target_one_hot) * cosine) - target_one_hot * self.margin2
        outputs = self.scale * outputs
        pred = F.log_softmax(outputs, axis=-1)

        return self.nll_loss(pred, targets), pred
Example #21
    def forward(self, logit, label):
        logit = paddle.reshape(
            logit, [logit.shape[0], logit.shape[1], -1])  # N,C,H,W => N,C,H*W
        logit = paddle.transpose(logit, [0, 2, 1])  # N,C,H*W => N,H*W,C
        logit = paddle.reshape(logit,
                               [-1, logit.shape[2]])  # N,H*W,C => N*H*W,C
        label = paddle.reshape(label, [-1, 1])
        range_ = paddle.arange(0, label.shape[0])
        range_ = paddle.unsqueeze(range_, axis=-1)
        label = paddle.cast(label, dtype='int64')
        label = paddle.concat([range_, label], axis=-1)
        logpt = F.log_softmax(logit)
        logpt = paddle.gather_nd(logpt, label)

        pt = paddle.exp(logpt.detach())
        loss = -1 * (1 - pt)**self.gamma * logpt
        loss = paddle.mean(loss)
        return loss
Example #22
 def forward(self, x, label):
     assert len(x.shape) == len(label.shape), \
         "x and label shape length should be same but got {} for x.shape and {} for label.shape".format(x.shape, label.shape)
     if self.epsilon is not None:
         class_num = x.shape[-1]
         label = self._labelsmoothing(label, class_num)
         x = -F.log_softmax(x, axis=self.axis)
         loss = paddle.sum(x * label, axis=self.axis)
     else:
         if label.shape[self.axis] == x.shape[self.axis]:
             if self.label_act == "softmax":
                 label = F.softmax(label, axis=self.axis)
             soft_label = True
         else:
             soft_label = False
         loss = F.cross_entropy(
             x, label=label, soft_label=soft_label, axis=self.axis)
     loss = loss.mean()
     return loss
Example #23
    def forward(self, inputs):
        # common network layers
        x = F.relu(self.conv1(inputs))
        x = F.relu(self.conv2(x))
        x = F.relu(self.conv3(x))
        # action policy network layers
        x_act = F.relu(self.act_conv1(x))
        x_act = paddle.reshape(
                x_act, [-1, 4 * self.board_height * self.board_width])
        
        x_act = F.log_softmax(self.act_fc1(x_act))
        # state value network layers
        x_val = F.relu(self.val_conv1(x))
        x_val = paddle.reshape(
                x_val, [-1, 2 * self.board_height * self.board_width])
        x_val = F.relu(self.val_fc1(x_val))
        x_val = F.tanh(self.val_fc2(x_val))

        return x_act, x_val
Example #24
    def valid_one_epoch(self, epoch):
        losses = []
        accs = []
        for i in range(self.model_num):
            if self.use_data_parallel:
                self.parallel_models[i].eval()
            else:
                self.models[i].eval()
            losses.append(AvgrageMeter())
            accs.append(AvgrageMeter())

        for _, (images, labels) in enumerate(self.valid_loader):
            images, labels = to_variable(images), to_variable(labels)
            batch_size = images.shape[0]

            logits = []
            if self.use_data_parallel:
                for model in self.parallel_models:
                    logits.append(model(images))
            else:
                for model in self.models:
                    logits.append(model(images))
            for i in range(self.model_num):
                gt_loss = self.models[i].loss(logits[i], labels)
                kl_loss = 0
                for j in range(self.model_num):
                    if i != j:
                        x = F.log_softmax(logits[i], axis=1)
                        y = fluid.layers.softmax(logits[j], axis=1)
                        kl_loss += fluid.layers.kldiv_loss(
                            x, y, reduction='batchmean')

                loss = gt_loss
                if (self.model_num > 1):
                    loss += kl_loss / (self.model_num - 1)

                prec = fluid.layers.accuracy(input=logits[i],
                                             label=labels,
                                             k=1)
                losses[i].update(loss.numpy(), batch_size)
                accs[i].update(prec.numpy() * 100, batch_size)
        return losses, accs
Example #25
 def predict_word(dec_seq, enc_output, n_active_inst, n_bm,
                  memory_key_padding_mask):
     tgt_key_padding_mask = self.generate_padding_mask(dec_seq)
     dec_seq = self.embedding(dec_seq).transpose([1, 0, 2])
     dec_seq = self.positional_encoding(dec_seq)
     tgt_mask = self.generate_square_subsequent_mask(
         dec_seq.shape[0])
     dec_output = self.decoder(
         dec_seq,
         enc_output,
         tgt_mask=tgt_mask,
         tgt_key_padding_mask=tgt_key_padding_mask,
         memory_key_padding_mask=memory_key_padding_mask,
     ).transpose([1, 0, 2])
     dec_output = dec_output[:,
                             -1, :]  # Pick the last step: (bh * bm) * d_h
     word_prob = F.log_softmax(self.tgt_word_prj(dec_output),
                               axis=1)
     word_prob = word_prob.reshape([n_active_inst, n_bm, -1])
     return word_prob
Example #26
    def forward(self, logit_1, logit_2, label=None):
        """
        Calculate the KL loss. If the label is not None, it considers the
        ignore_index in label and calculates the masked loss.

        Args:
            logit_1 (Tensor): Logit tensor, the data type is float32 or float64.
                The shape is (N, C), where C is number of classes, and if shape is
                more than 2D, this is (N, C, D1, D2,..., Dk), k >= 1.
            logit_2 (Tensor): Logit tensor, the data type is float32 or float64.
                The shape of logit_2 and logit_1 are the same.
            label (Tensor, optional): Label tensor, the data type is int64.
                The shape is (N), where each value is 0 <= label[i] <= C-1, and
                if shape is more than 2D, this is (N, D1, D2,..., Dk), k >= 1.
        Returns:
            (Tensor): The average loss.
        """
        if logit_1.shape != logit_2.shape:
            raise ValueError(
                'The shape of logit_1 = {} must be the same as the shape of logit_2 = {}.'
                .format(logit_1.shape, logit_2.shape))

        logit_1 = F.log_softmax(logit_1 / self.temperature, axis=1)
        logit_2 = F.softmax(logit_2 / self.temperature, axis=1)
        loss = self.kl_loss(logit_1, logit_2)
        loss = loss * self.temperature * self.temperature

        if label is None:
            avg_loss = paddle.mean(loss)
        else:
            mask = label != self.ignore_index
            mask = paddle.cast(mask, 'float32')
            mask = paddle.unsqueeze(mask, axis=1)
            label.stop_gradient = True
            mask.stop_gradient = True

            loss = loss * mask
            avg_loss = paddle.mean(loss) / (paddle.mean(mask) + self.EPS)
        return avg_loss
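Input shapes matching the docstring above, with a hypothetical `KLLoss(temperature, ignore_index)` module wrapping this `forward`; the call is commented because the class definition is not shown:

import paddle

N, C, H, W = 4, 19, 64, 64
logit_student = paddle.randn([N, C, H, W])
logit_teacher = paddle.randn([N, C, H, W])
label = paddle.randint(0, C, [N, H, W])  # may also contain the ignore index, e.g. 255
# criterion = KLLoss(temperature=1.0, ignore_index=255)
# avg_loss = criterion(logit_student, logit_teacher, label)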
Example #27
    def forward_test(self, src):
        bs = src.shape[0]
        if self.encoder is not None:
            src = self.positional_encoding(src.transpose([1, 0, 2]))
            memory = self.encoder(src)
        else:
            memory = src.squeeze(2).transpose([2, 0, 1])
        dec_seq = paddle.full((bs, 1), 2, dtype=paddle.int64)
        for len_dec_seq in range(1, 25):
            src_enc = memory.clone()
            tgt_key_padding_mask = self.generate_padding_mask(dec_seq)
            dec_seq_embed = self.embedding(dec_seq).transpose([1, 0, 2])
            dec_seq_embed = self.positional_encoding(dec_seq_embed)
            tgt_mask = self.generate_square_subsequent_mask(
                dec_seq_embed.shape[0])
            output = self.decoder(dec_seq_embed,
                                  src_enc,
                                  tgt_mask=tgt_mask,
                                  memory_mask=None,
                                  tgt_key_padding_mask=tgt_key_padding_mask,
                                  memory_key_padding_mask=None)
            dec_output = output.transpose([1, 0, 2])

            dec_output = dec_output[:,
                                    -1, :]  # Pick the last step: (bh * bm) * d_h
            word_prob = F.log_softmax(self.tgt_word_prj(dec_output), axis=1)
            word_prob = word_prob.reshape([1, bs, -1])
            preds_idx = word_prob.argmax(axis=2)

            if paddle.equal_all(
                    preds_idx[-1],
                    paddle.full(preds_idx[-1].shape, 3, dtype='int64')):
                break

            preds_prob = word_prob.max(axis=2)
            dec_seq = paddle.concat(
                [dec_seq, preds_idx.reshape([-1, 1])], axis=1)

        return dec_seq
Example #28
    def _test_base(self, run_ipu=True):
        scope = fluid.core.Scope()
        main_prog = paddle.static.Program()
        startup_prog = paddle.static.Program()
        SEED = self.SEED
        main_prog.random_seed = SEED
        startup_prog.random_seed = SEED

        with fluid.scope_guard(scope):
            with paddle.static.program_guard(main_prog, startup_prog):
                x = paddle.static.data(
                    name=self.feed_list[0],
                    shape=self.feed_shape[0],
                    dtype=self.feed_dtype[0])
                out = F.log_softmax(x, **self.attrs)

                fetch_list = [out.name]

            if run_ipu:
                place = paddle.IPUPlace()
            else:
                place = paddle.CPUPlace()
            exe = paddle.static.Executor(place)
            exe.run(startup_prog)

            if run_ipu:
                feed_list = self.feed_list
                ipu_strategy = compiler.get_ipu_strategy()
                ipu_strategy.is_training = self.is_training
                program = compiler.IPUCompiledProgram(
                    main_prog,
                    ipu_strategy=ipu_strategy).compile(feed_list, fetch_list)
            else:
                program = main_prog

            result = exe.run(program, feed=self.feed, fetch_list=fetch_list)
            return result[0]
Example #29
def soft_cross_entropy(inp, target):
    inp_likelihood = F.log_softmax(inp, axis=-1)
    target_prob = F.softmax(target, axis=-1)
    return -1. * paddle.mean(paddle.sum(inp_likelihood * target_prob, axis=-1))
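A quick check of the helper above on random student/teacher logits (shapes are illustrative):

import paddle

student_logits = paddle.randn([8, 10])
teacher_logits = paddle.randn([8, 10])
distill_loss = soft_cross_entropy(student_logits, teacher_logits)  # scalar Tensor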
Example #30
    def beam_search(self, input_ids, beam_scorer, logits_processors,
                    max_length, pad_token_id, eos_token_id, **model_kwargs):
        batch_size = len(beam_scorer._beam_hyps)
        num_beams = beam_scorer.num_beams

        batch_beam_size, cur_len = input_ids.shape
        origin_len = cur_len

        assert (
            num_beams * batch_size == batch_beam_size
        ), "Batch dimension of `input_ids` should be {}, but received {}.".format(
            num_beams * batch_size, batch_beam_size)

        beam_scores = paddle.zeros((batch_size, num_beams),
                                   dtype=paddle.get_default_dtype())
        beam_scores[:, 1:] = -1e9
        beam_scores = paddle.reshape(beam_scores, [-1])

        while cur_len < max_length:
            # prepare model inputs & get model output
            model_inputs = self.prepare_inputs_for_generation(
                input_ids, **model_kwargs)
            outputs = self(**model_inputs)
            logits = outputs[0] if isinstance(outputs, tuple) else outputs
            # [batch_size, vocab_size]
            logits = logits[:, -1, :]

            # pre-process distribution
            logits = self.adjust_logits_during_generation(logits)
            logits = logits_processors(input_ids, logits)

            # beam search
            # [batch_size * num_beams, vocab_size]
            next_scores = F.log_softmax(logits)

            next_scores = next_scores + beam_scores.unsqueeze(-1)
            # reshape for beam search
            vocab_size = next_scores.shape[-1]
            next_scores = next_scores.reshape(
                [batch_size, num_beams * vocab_size])

            next_scores, next_tokens = paddle.topk(next_scores,
                                                   2 * num_beams,
                                                   axis=1)

            next_indices = next_tokens // vocab_size
            next_tokens = next_tokens % vocab_size

            # stateless
            beam_outputs = beam_scorer.process(
                input_ids,
                next_scores,
                next_tokens,
                next_indices,
                pad_token_id=pad_token_id,
                eos_token_id=eos_token_id,
            )
            beam_scores = beam_outputs["next_beam_scores"]
            beam_next_tokens = beam_outputs["next_beam_tokens"]
            beam_idx = beam_outputs["next_beam_indices"]

            cur_len += 1
            input_ids = paddle.concat([
                paddle.index_select(input_ids, beam_idx),
                beam_next_tokens.unsqueeze(-1)
            ],
                                      axis=-1)

            if beam_scorer.is_done:
                break
            model_kwargs = self.update_model_kwargs_for_generation(
                outputs, model_kwargs)
            if model_kwargs["cache"] is not None:
                # reorder the cache
                model_kwargs["cache"] = map_structure(
                    lambda x: paddle.index_select(x, beam_idx),
                    model_kwargs["cache"])

        pred_ids, scores = beam_scorer.finalize(input_ids,
                                                beam_scores,
                                                next_tokens,
                                                next_indices,
                                                pad_token_id=pad_token_id,
                                                eos_token_id=eos_token_id)
        return pred_ids[:, origin_len:], scores