Example #1
        def build_position_ids(src_ids, dst_ids):
            src_shape = L.shape(src_ids)
            src_batch = src_shape[0]
            src_seqlen = src_shape[1]
            dst_seqlen = src_seqlen - 1 # without cls

            src_position_ids = L.reshape(
                L.range(
                    0, src_seqlen, 1, dtype='int32'), [1, src_seqlen, 1],
                inplace=True) # [1, src_seqlen, 1]
            src_position_ids = L.expand(src_position_ids, [src_batch, 1, 1]) # [B, src_seqlen, 1]
            zero = L.fill_constant([1], dtype='int64', value=0)
            input_mask = L.cast(L.equal(src_ids, zero), "int32")  # assume pad id == 0, [B, src_seqlen, 1]
            src_pad_len = L.reduce_sum(input_mask, 1, keep_dim=True) # [B, 1, 1]

            dst_position_ids = L.reshape(
                L.range(
                    src_seqlen, src_seqlen+dst_seqlen, 1, dtype='int32'), [1, dst_seqlen, 1],
                inplace=True) # [1, dst_seqlen, 1]
            dst_position_ids = L.expand(dst_position_ids, [src_batch, 1, 1]) # [B, dst_seqlen, 1]
            dst_position_ids = dst_position_ids - src_pad_len # [B, dst_seqlen, 1]

            position_ids = L.concat([src_position_ids, dst_position_ids], 1)
            position_ids = L.cast(position_ids, 'int64')
            position_ids.stop_gradient = True
            return position_ids
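
A minimal NumPy sketch of the same position-id arithmetic, assuming pad id == 0 and using made-up token ids; it shows how the dst positions continue right after the last non-pad src position:

import numpy as np

src_ids = np.array([[101, 7, 8, 0, 0],     # two padding tokens
                    [101, 3, 4, 5, 6]])    # no padding
src_batch, src_seqlen = src_ids.shape
dst_seqlen = src_seqlen - 1                # without [CLS]

src_pos = np.tile(np.arange(src_seqlen)[None, :], (src_batch, 1))    # [B, src_seqlen]
src_pad_len = (src_ids == 0).sum(axis=1, keepdims=True)              # [B, 1]
dst_pos = np.tile(np.arange(src_seqlen, src_seqlen + dst_seqlen)[None, :],
                  (src_batch, 1)) - src_pad_len                      # [B, dst_seqlen]

position_ids = np.concatenate([src_pos, dst_pos], axis=1)
print(position_ids)
# [[0 1 2 3 4 3 4 5 6]]   dst positions of row 0 start at 3 = src_seqlen - pad_len
#  [0 1 2 3 4 5 6 7 8]]
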
Example #2
def concat_coord(x):
    ins_feat = x  # [N, c, h, w]

    batch_size = L.shape(x)[0]
    h = L.shape(x)[2]
    w = L.shape(x)[3]
    float_h = L.cast(h, 'float32')
    float_w = L.cast(w, 'float32')

    y_range = L.range(0., float_h, 1., dtype='float32')  # [h, ]
    y_range = 2.0 * y_range / (float_h - 1.0) - 1.0
    x_range = L.range(0., float_w, 1., dtype='float32')  # [w, ]
    x_range = 2.0 * x_range / (float_w - 1.0) - 1.0
    x_range = L.reshape(x_range, (1, -1))  # [1, w]
    y_range = L.reshape(y_range, (-1, 1))  # [h, 1]
    x = L.expand(x_range, [h, 1])  # [h, w]
    y = L.expand(y_range, [1, w])  # [h, w]

    x = L.reshape(x, (1, 1, h, w))  # [1, 1, h, w]
    y = L.reshape(y, (1, 1, h, w))  # [1, 1, h, w]
    x = L.expand(x, [batch_size, 1, 1, 1])  # [N, 1, h, w]
    y = L.expand(y, [batch_size, 1, 1, 1])  # [N, 1, h, w]

    ins_kernel_feat = L.concat([ins_feat, x, y], axis=1)  # [N, c+2, h, w]

    return ins_kernel_feat
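
For reference, a NumPy sketch of the same CoordConv-style coordinate channels, normalized to [-1, 1]; the feature tensor here is just a zero placeholder with made-up sizes:

import numpy as np

N, c, h, w = 2, 3, 4, 5
ins_feat = np.zeros((N, c, h, w), dtype=np.float32)    # placeholder features

y_range = 2.0 * np.arange(h, dtype=np.float32) / (h - 1.0) - 1.0   # [h], in [-1, 1]
x_range = 2.0 * np.arange(w, dtype=np.float32) / (w - 1.0) - 1.0   # [w], in [-1, 1]
x_grid, y_grid = np.meshgrid(x_range, y_range)                     # both [h, w]

x_chan = np.broadcast_to(x_grid[None, None], (N, 1, h, w))
y_chan = np.broadcast_to(y_grid[None, None], (N, 1, h, w))
ins_kernel_feat = np.concatenate([ins_feat, x_chan, y_chan], axis=1)   # [N, c+2, h, w]
print(ins_kernel_feat.shape)   # (2, 5, 4, 5)
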
Example #3
    def _build_position_ids(self, src_ids):
        src_shape = L.shape(src_ids)
        src_seqlen = src_shape[1]
        src_batch = src_shape[0]

        slot_seqlen = self.slot_seqlen

        num_b = (src_seqlen / slot_seqlen) - 1
        a_position_ids = L.reshape(L.range(0, slot_seqlen, 1, dtype='int32'),
                                   [1, slot_seqlen, 1],
                                   inplace=True)  # [1, slot_seqlen, 1]
        a_position_ids = L.expand(
            a_position_ids, [src_batch, 1, 1])  # [B, slot_seqlen, 1]

        zero = L.fill_constant([1], dtype='int64', value=0)
        input_mask = L.cast(L.equal(src_ids[:, :slot_seqlen], zero),
                            "int32")  # assume pad id == 0 [B, slot_seqlen, 1]
        a_pad_len = L.reduce_sum(input_mask, 1)  # [B, 1, 1]

        b_position_ids = L.reshape(L.range(slot_seqlen,
                                           2 * slot_seqlen,
                                           1,
                                           dtype='int32'), [1, slot_seqlen, 1],
                                   inplace=True)  # [1, slot_seqlen, 1]
        b_position_ids = L.expand(
            b_position_ids,
            [src_batch, num_b, 1])  # [B, slot_seqlen * num_b, 1]
        b_position_ids = b_position_ids - a_pad_len  # [B, slot_seqlen * num_b, 1]

        position_ids = L.concat([a_position_ids, b_position_ids], 1)
        position_ids = L.cast(position_ids, 'int64')
        position_ids.stop_gradient = True
        return position_ids
Example #4
def decode(conv_output, anchors, stride, num_class, conf_thresh):
    conv_shape = P.shape(conv_output)
    batch_size = conv_shape[0]
    n_grid = conv_shape[1]
    anchor_per_scale = len(anchors)
    conv_output = P.reshape(
        conv_output,
        (batch_size, n_grid, n_grid, anchor_per_scale, 5 + num_class))
    conv_raw_dxdy = conv_output[:, :, :, :, 0:2]
    conv_raw_dwdh = conv_output[:, :, :, :, 2:4]
    conv_raw_conf = conv_output[:, :, :, :, 4:5]
    conv_raw_prob = conv_output[:, :, :, :, 5:]

    rows = P.range(0, n_grid, 1, 'float32')
    cols = P.range(0, n_grid, 1, 'float32')
    rows = P.expand(P.reshape(rows, (1, -1, 1)), [n_grid, 1, 1])
    cols = P.expand(P.reshape(cols, (-1, 1, 1)), [1, n_grid, 1])
    offset = P.concat([rows, cols], axis=-1)
    offset = P.reshape(offset, (1, n_grid, n_grid, 1, 2))
    offset = P.expand(offset, [batch_size, 1, 1, anchor_per_scale, 1])

    pred_xy = (P.sigmoid(conv_raw_dxdy) + offset) * stride
    pred_wh = (P.exp(conv_raw_dwdh) * P.assign(anchors))
    pred_xywh = P.concat([pred_xy, pred_wh], axis=-1)
    pred_conf = P.sigmoid(conv_raw_conf)
    pred_prob = P.sigmoid(conv_raw_prob)

    pred_xywh = P.reshape(pred_xywh, (batch_size, -1, 4))  # [-1, -1, 4]
    pred_conf = P.reshape(pred_conf, (batch_size, -1, 1))  # [-1, -1, 1]
    pred_prob = P.reshape(pred_prob,
                          (batch_size, -1, num_class))  # [-1, -1, 80]
    return pred_xywh, pred_conf, pred_prob
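
A NumPy sketch of how the per-cell (x, y) grid offsets are assembled for one scale; grid size, anchor count, and batch size are made-up numbers:

import numpy as np

n_grid, anchor_per_scale, batch_size = 13, 3, 2

rows = np.arange(n_grid, dtype=np.float32)
cols = np.arange(n_grid, dtype=np.float32)
rows = np.tile(rows.reshape(1, -1, 1), (n_grid, 1, 1))   # [n, n, 1], varies along axis 1 (x)
cols = np.tile(cols.reshape(-1, 1, 1), (1, n_grid, 1))   # [n, n, 1], varies along axis 0 (y)
offset = np.concatenate([rows, cols], axis=-1)           # [n, n, 2] = (x, y) of each cell
offset = offset.reshape(1, n_grid, n_grid, 1, 2)
offset = np.tile(offset, (batch_size, 1, 1, anchor_per_scale, 1))

print(offset.shape)         # (2, 13, 13, 3, 2)
print(offset[0, 2, 5, 0])   # [5. 2.] -> the cell in row 2, column 5 has offset (x=5, y=2)
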
Example #5
def matrix_nms(seg_masks, cate_labels, cate_scores, kernel='gaussian', sigma=2.0, sum_masks=None):
    """Matrix NMS for multi-class masks.

    Args:
        seg_masks (Tensor): shape (n, h, w), binary masks made of 0s and 1s
        cate_labels (Tensor): shape (n), mask labels in descending order
        cate_scores (Tensor): shape (n), mask scores in descending order
        kernel (str):  'linear' or 'gaussian'
        sigma (float): std in gaussian method
        sum_masks (Tensor):  shape (n, ), areas of the n masks

    Returns:
        Tensor: cate_scores_update, tensors of shape (n)
    """
    n_samples = L.shape(cate_labels)[0]   # number of objects
    seg_masks = L.reshape(seg_masks, (n_samples, -1))   # [n, h*w]
    # inter.
    inter_matrix = L.matmul(seg_masks, seg_masks, transpose_y=True)   # [n, n] masks times their own transpose: pairwise intersection areas
    # union.
    sum_masks_x = L.expand(L.reshape(sum_masks, (1, -1)), [n_samples, 1])     # [n, n]  sum_masks repeated over n rows gives sum_masks_x
    # iou.
    iou_matrix = inter_matrix / (sum_masks_x + L.transpose(sum_masks_x, [1, 0]) - inter_matrix)
    rows = L.range(0, n_samples, 1, 'int32')
    cols = L.range(0, n_samples, 1, 'int32')
    rows = L.expand(L.reshape(rows, (1, -1)), [n_samples, 1])
    cols = L.expand(L.reshape(cols, (-1, 1)), [1, n_samples])
    tri_mask = L.cast(rows > cols, 'float32')
    iou_matrix = tri_mask * iou_matrix   # [n, n]   keep only the upper-triangular part

    # label_specific matrix.
    cate_labels_x = L.expand(L.reshape(cate_labels, (1, -1)), [n_samples, 1])     # [n, n]  cate_labels repeated over n rows gives cate_labels_x
    label_matrix = L.cast(L.equal(cate_labels_x, L.transpose(cate_labels_x, [1, 0])), 'float32')
    label_matrix = tri_mask * label_matrix   # [n, n]   keep only the upper-triangular part

    # IoU compensation
    compensate_iou = L.reduce_max(iou_matrix * label_matrix, dim=0)
    compensate_iou = L.expand(L.reshape(compensate_iou, (1, -1)), [n_samples, 1])     # [n, n]
    compensate_iou = L.transpose(compensate_iou, [1, 0])      # [n, n]

    # IoU decay
    decay_iou = iou_matrix * label_matrix

    # # matrix nms
    if kernel == 'gaussian':
        decay_matrix = L.exp(-1 * sigma * (decay_iou ** 2))
        compensate_matrix = L.exp(-1 * sigma * (compensate_iou ** 2))
        decay_coefficient = L.reduce_min((decay_matrix / compensate_matrix), dim=0)
    elif kernel == 'linear':
        decay_matrix = (1-decay_iou)/(1-compensate_iou)
        decay_coefficient = L.reduce_min(decay_matrix, dim=0)
    else:
        raise NotImplementedError

    # update the score.
    cate_scores_update = cate_scores * decay_coefficient
    return cate_scores_update
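
A NumPy sketch of the score decay on toy values (two masks of the same class, already sorted by score, pairwise IoU 0.8, gaussian kernel); the numbers are invented purely to show how the lower-scored overlapping mask is suppressed:

import numpy as np

n, sigma = 2, 2.0
iou_matrix = np.array([[0.0, 0.8],
                       [0.0, 0.0]])     # strict upper triangle, as after tri_mask * iou
label_matrix = np.array([[0.0, 1.0],
                         [0.0, 0.0]])   # same-class indicator, upper triangle only

decay_iou = iou_matrix * label_matrix
compensate_iou = decay_iou.max(axis=0)                       # per mask: max IoU with a higher-scored mask
compensate_iou = np.tile(compensate_iou[None, :], (n, 1)).T  # [n, n], row i holds the value for mask i

decay_matrix = np.exp(-sigma * decay_iou ** 2)
compensate_matrix = np.exp(-sigma * compensate_iou ** 2)
decay_coefficient = (decay_matrix / compensate_matrix).min(axis=0)

cate_scores = np.array([0.9, 0.7])
print(cate_scores * decay_coefficient)   # ~[0.9, 0.19]: the overlapping, lower-scored mask decays
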
Example #6
    def fast_nms(self, boxes, scores, masks, max_num_detections=100):
        iou_threshold = self.nms_thresh
        top_k = self.top_k

        # Sort each class's boxes by score in descending order
        scores, idx = P.argsort(scores, axis=1, descending=True)

        idx = idx[:, :top_k]
        scores = scores[:, :top_k]

        num_classes, num_dets = P.shape(idx)[0], P.shape(idx)[1]

        idx = P.reshape(idx, (-1, ))
        boxes = P.gather(boxes, idx)
        boxes = P.reshape(boxes, (num_classes, num_dets, 4))
        masks = P.gather(masks, idx)
        masks = P.reshape(masks, (num_classes, num_dets, -1))

        # Build a c×n×n IoU matrix; each n×n slice holds the pairwise IoUs of that class's n candidate boxes
        iou = jaccard(boxes, boxes)
        # Since IoU(A, A) = 1 and IoU(A, B) = IoU(B, A), post-process the IoU matrix
        # from the previous step by zeroing the diagonal and lower-triangular part of every channel
        rows = P.range(0, num_dets, 1, 'int32')
        cols = P.range(0, num_dets, 1, 'int32')
        rows = P.expand(P.reshape(rows, (1, -1)), [num_dets, 1])
        cols = P.expand(P.reshape(cols, (-1, 1)), [1, num_dets])
        tri_mask = P.cast(rows > cols, 'float32')
        tri_mask = P.expand(P.reshape(tri_mask, (1, num_dets, num_dets)),
                            [num_classes, 1, 1])
        iou = tri_mask * iou
        iou_max = P.reduce_max(iou, dim=1)

        # Now just filter out the ones higher than the threshold
        keep = P.where(iou_max <= iou_threshold)

        # Assign each kept detection to its corresponding class
        classes = P.range(0, num_classes, 1, 'int32')
        classes = P.expand(P.reshape(classes, (-1, 1)), [1, num_dets])
        classes = P.gather_nd(classes, keep)

        boxes = P.gather_nd(boxes, keep)
        masks = P.gather_nd(masks, keep)
        scores = P.gather_nd(scores, keep)

        # Only keep the top cfg.max_num_detections highest scores across all classes
        scores, idx = P.argsort(scores, axis=0, descending=True)
        idx = idx[:max_num_detections]
        scores = scores[:max_num_detections]

        classes = P.gather(classes, idx)
        boxes = P.gather(boxes, idx)
        masks = P.gather(masks, idx)

        return boxes, masks, classes, scores
Example #7
def fast_nms(boxes, scores, conf_thresh, nms_thresh, keep_top_k, nms_top_k):
    '''
    :param boxes:    [?, 4]
    :param scores:   [80, ?]
    '''

    # Sort each class's boxes by score in descending order
    scores, idx = P.argsort(scores, axis=1, descending=True)

    idx = idx[:, :keep_top_k]
    scores = scores[:, :keep_top_k]

    num_classes, num_dets = P.shape(idx)[0], P.shape(idx)[1]

    idx = P.reshape(idx, (-1, ))
    boxes = P.gather(boxes, idx)
    boxes = P.reshape(boxes, (num_classes, num_dets, 4))

    # Build a c×n×n IoU matrix; each n×n slice holds the pairwise IoUs of that class's n candidate boxes
    iou = _iou(boxes, boxes)

    # Since IoU(A, A) = 1 and IoU(A, B) = IoU(B, A), post-process the IoU matrix
    # from the previous step by zeroing the diagonal and lower-triangular part of every channel
    rows = P.range(0, num_dets, 1, 'int32')
    cols = P.range(0, num_dets, 1, 'int32')
    rows = P.expand(P.reshape(rows, (1, -1)), [num_dets, 1])
    cols = P.expand(P.reshape(cols, (-1, 1)), [1, num_dets])
    tri_mask = P.cast(rows > cols, 'float32')
    tri_mask = P.expand(P.reshape(tri_mask, (1, num_dets, num_dets)),
                        [num_classes, 1, 1])
    iou = tri_mask * iou
    iou_max = P.reduce_max(iou, dim=1)

    # Within a class, a box is dropped if its highest IoU with any higher-scoring box exceeds nms_thresh; the box at index 0 is always kept.
    keep = P.where(iou_max <= nms_thresh)

    # Assign each kept detection to its corresponding class
    classes = P.range(0, num_classes, 1, 'int32')
    classes = P.expand(P.reshape(classes, (-1, 1)), [1, num_dets])
    classes = P.gather_nd(classes, keep)

    boxes = P.gather_nd(boxes, keep)
    scores = P.gather_nd(scores, keep)

    # Only keep the top cfg.max_num_detections highest scores across all classes
    scores, idx = P.argsort(scores, axis=0, descending=True)
    idx = idx[:nms_top_k]
    scores = scores[:nms_top_k]

    classes = P.gather(classes, idx)
    boxes = P.gather(boxes, idx)

    return boxes, scores, classes
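
A NumPy sketch of the keep rule for a single class (toy IoU values, boxes already sorted by descending score): a box survives only if its maximum IoU with every higher-scoring box stays at or below the threshold.

import numpy as np

nms_thresh = 0.5
iou = np.array([[1.0, 0.7, 0.2],
                [0.7, 1.0, 0.4],
                [0.2, 0.4, 1.0]])
num_dets = iou.shape[0]

rows = np.tile(np.arange(num_dets).reshape(1, -1), (num_dets, 1))   # column index
cols = np.tile(np.arange(num_dets).reshape(-1, 1), (1, num_dets))   # row index
tri_mask = (rows > cols).astype(np.float32)                         # strict upper triangle

iou_max = (tri_mask * iou).max(axis=0)   # per box: max IoU with any higher-scored box
keep = iou_max <= nms_thresh
print(iou_max)   # [0.  0.7 0.4]
print(keep)      # [ True False  True] -> box 1 is suppressed by box 0
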
Example #8
 def __call__(self, input):
     if not self.coord_conv:
         return input
     b = L.shape(input)[0]
     h = L.shape(input)[2]
     w = L.shape(input)[3]
     x_range = L.range(0, w, 1., dtype='float32') / (w - 1) * 2.0 - 1
     y_range = L.range(0, h, 1., dtype='float32') / (h - 1) * 2.0 - 1
     x_range = L.reshape(x_range, (1, 1, 1, -1))  # [1, 1, 1, w]
     y_range = L.reshape(y_range, (1, 1, -1, 1))  # [1, 1, h, 1]
     x_range = L.expand(x_range, [b, 1, h, 1])  # [b, 1, h, w]
     y_range = L.expand(y_range, [b, 1, 1, w])  # [b, 1, h, w]
     offset = L.concat([input, x_range, y_range], axis=1)
     return offset
Example #9
    def _decode(self,
                x,
                y,
                w,
                h,
                anchors,
                stride,
                scale_x_y,
                eps,
                is_gt=False):
        conv_shape = x.shape  # (8, 13, 13, 3)
        batch_size = conv_shape[0]
        n_grid = conv_shape[1]
        anchor_per_scale = conv_shape[3]

        _x = L.unsqueeze(x, 4)
        _y = L.unsqueeze(y, 4)
        conv_raw_dxdy = L.concat([_x, _y], -1)  # (8, 13, 13, 3, 2)
        _w = L.unsqueeze(w, 4)
        _h = L.unsqueeze(h, 4)
        conv_raw_dwdh = L.concat([_w, _h], -1)  # (8, 13, 13, 3, 2)

        rows = L.range(0, n_grid, 1, 'float32')
        cols = L.range(0, n_grid, 1, 'float32')
        rows = L.expand(L.reshape(rows, (1, -1, 1)), [n_grid, 1, 1])
        cols = L.expand(L.reshape(cols, (-1, 1, 1)), [1, n_grid, 1])
        offset = L.concat([rows, cols], axis=-1)
        offset = L.reshape(offset, (1, n_grid, n_grid, 1, 2))
        offset = L.expand(offset, [batch_size, 1, 1, anchor_per_scale, 1])

        if is_gt:
            decode_xy = (conv_raw_dxdy + offset) / n_grid
        else:
            if (abs(scale_x_y - 1.0) < eps):
                decode_xy = L.sigmoid(conv_raw_dxdy)
                decode_xy = (decode_xy + offset) / n_grid
            else:
                # Grid Sensitive
                decode_xy = scale_x_y * L.sigmoid(conv_raw_dxdy) - 0.5 * (
                    scale_x_y - 1.0)
                decode_xy = (decode_xy + offset) / n_grid
        anchor_t = fluid.layers.assign(np.copy(anchors).astype(np.float32))
        decode_wh = (L.exp(conv_raw_dwdh) * anchor_t) / (n_grid * stride)
        decode_xywh = L.concat([decode_xy, decode_wh], axis=-1)
        if is_gt:
            decode_xywh.stop_gradient = True

        return decode_xywh  # (8, 13, 13, 3, 4)
Example #10
 def __call__(self, input):
     if not self.coord_conv:
         return input
     b = input.shape[0]
     h = input.shape[2]
     w = input.shape[3]
     x_range = L.range(0, w, 1., dtype='float32') / (w - 1) * 2.0 - 1
     y_range = L.range(0, h, 1., dtype='float32') / (h - 1) * 2.0 - 1
     # x_range = paddle.to_tensor(x_range, place=input.place)
     # y_range = paddle.to_tensor(y_range, place=input.place)
     x_range = L.reshape(x_range, (1, 1, 1, -1))  # [1, 1, 1, w]
     y_range = L.reshape(y_range, (1, 1, -1, 1))  # [1, 1, h, 1]
     x_range = L.expand(x_range, [b, 1, h, 1])  # [b, 1, h, w]
     y_range = L.expand(y_range, [b, 1, 1, w])  # [b, 1, h, w]
     offset = L.concat([input, x_range, y_range], axis=1)
     return offset
Example #11
def batch_scatter(ref, indices, updates, in_place=False, overwrite=False):
    """Scatter updates to ref, according to corrensponding index in indices
    in each batch. Currently, it only support 2d Tensor.

    Args:
        ref (Variable): with shape [batch_size, ...]
        indices (Variable): with shape [batch_size, 1]
        updates (Variable): with shape [batch_size]
        in_place (bool): if True, the scatter result will be assigned to ref; otherwise,
                         a new Tensor will be returned. Default is False.
        overwrite (bool): if True, scatter will overwrite the corresponding elements.
                          Default is False.

    Returns: Variable with the same shape and dtype as ref, holding the scatter result.

    Raises: NULL

    Examples:
        ref
            [[1, 1, 1],
             [1, 1, 1]]
        indices
            [[2], [1]]
        updates
            [2, 3]

        return
            [[1, 1, 2],
             [1, 3, 1]]

    """
    ref_dtype = ref.dtype
    if ref_dtype not in PaddleVarType.floats:
        ref_in = layers.cast(ref, dtype='float32')
    else:
        ref_in = ref

    if updates.dtype != ref_in.dtype:
        updates = layers.cast(updates, dtype=ref_in.dtype)

    batch_size = layers.cast(layers.shape(ref_in)[0], dtype=indices.dtype)
    zero = layers.fill_constant(shape=[1], dtype=indices.dtype, value=0)
    one = layers.fill_constant(shape=[1], dtype=indices.dtype, value=1)
    batch_indices = layers.unsqueeze(
        layers.range(zero, batch_size, one, dtype=indices.dtype), [1])
    coord = layers.concat([batch_indices, indices], axis=1)
    if overwrite:
        mask = layers.gather_nd(ref_in, coord)
        mask = layers.elementwise_sub(layers.zeros_like(mask), mask)
        ref_in = layers.scatter_nd_add(ref_in, coord, mask)

    output = layers.scatter_nd_add(ref_in, coord, updates)
    if ref_dtype not in PaddleVarType.floats:
        output = layers.cast(output, dtype=ref_dtype)
    if in_place:
        layers.assign(output, ref)
        return ref
    else:
        return output
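
The docstring's return matches the overwrite=True path; a NumPy sketch of the same coordinate-based scatter on those values:

import numpy as np

ref = np.array([[1, 1, 1],
                [1, 1, 1]])
indices = np.array([[2], [1]])
updates = np.array([2, 3])

batch_indices = np.arange(ref.shape[0]).reshape(-1, 1)     # [[0], [1]]
coord = np.concatenate([batch_indices, indices], axis=1)   # [[0, 2], [1, 1]]

out = ref.copy()
out[coord[:, 0], coord[:, 1]] = updates                    # overwrite at each (batch, index)
print(out)
# [[1 1 2]
#  [1 3 1]]
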
Example #12
    def __init__(self, beam_size, batch_size, alpha, vocab_size, hidden_size):
        self.beam_size = beam_size
        self.batch_size = batch_size
        self.alpha = alpha
        self.vocab_size = vocab_size
        self.hidden_size = hidden_size
        self.gather_top2k_append_index = layers.range(0, 2 * self.batch_size * beam_size, 1, 'int64') // \
                                                      (2 * self.beam_size) * (self.beam_size)

        self.gather_topk_append_index = layers.range(0, self.batch_size * beam_size, 1, 'int64') // \
                                                     self.beam_size * (2 * self.beam_size)

        self.gather_finish_topk_append_index = layers.range(0, self.batch_size * beam_size, 1, 'int64') // \
                                                            self.beam_size * (3 * self.beam_size)

        self.eos_id = layers.fill_constant([self.batch_size, 2 * self.beam_size], 'int64', value=1)
        self.get_alive_index = layers.range(0, self.batch_size, 1, 'int64') * self.beam_size
Example #13
def crop(masks, boxes, padding: int = 1):
    """
    "Crop" predicted masks by zeroing out everything not in the predicted bbox.
    Vectorized by Chong (thanks Chong).

    Args:
        - masks should be a size [h, w, n] tensor of masks; n is the number of positive samples
        - boxes should be a size [n, 4] tensor of bbox coords in relative point form
    """
    h, w, n = P.shape(masks)[0], P.shape(masks)[1], P.shape(masks)[2]
    x1, x2 = sanitize_coordinates(boxes[:, 0],
                                  boxes[:, 2],
                                  w,
                                  padding,
                                  cast=False)
    y1, y2 = sanitize_coordinates(boxes[:, 1],
                                  boxes[:, 3],
                                  h,
                                  padding,
                                  cast=False)

    rows = P.range(0, w, 1, 'int32')
    cols = P.range(0, h, 1, 'int32')
    rows = P.expand(P.reshape(rows, (1, -1, 1)), [h, 1, n])
    cols = P.expand(P.reshape(cols, (-1, 1, 1)), [1, w, n])
    rows.stop_gradient = True
    cols.stop_gradient = True

    x1 = P.reshape(x1, (1, 1, -1))
    x2 = P.reshape(x2, (1, 1, -1))
    y1 = P.reshape(y1, (1, 1, -1))
    y2 = P.reshape(y2, (1, 1, -1))
    x1.stop_gradient = True
    x2.stop_gradient = True
    y1.stop_gradient = True
    y2.stop_gradient = True
    masks_left = P.cast(rows >= P.expand(x1, [h, w, 1]), 'float32')
    masks_right = P.cast(rows < P.expand(x2, [h, w, 1]), 'float32')
    masks_up = P.cast(cols >= P.expand(y1, [h, w, 1]), 'float32')
    masks_down = P.cast(cols < P.expand(y2, [h, w, 1]), 'float32')

    crop_mask = masks_left * masks_right * masks_up * masks_down

    return masks * crop_mask
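
A tiny NumPy sketch of the cropping idea on a single toy mask (the box keeps columns 1-2 and rows 0-1; all values are made up):

import numpy as np

h, w = 4, 4
mask = np.ones((h, w), dtype=np.float32)
x1, x2, y1, y2 = 1, 3, 0, 2            # keep columns [x1, x2) and rows [y1, y2)

rows = np.tile(np.arange(w).reshape(1, -1), (h, 1))   # column index of every pixel
cols = np.tile(np.arange(h).reshape(-1, 1), (1, w))   # row index of every pixel

crop_mask = ((rows >= x1) & (rows < x2) & (cols >= y1) & (cols < y2)).astype(np.float32)
print(mask * crop_mask)
# [[0. 1. 1. 0.]
#  [0. 1. 1. 0.]
#  [0. 0. 0. 0.]
#  [0. 0. 0. 0.]]
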
Example #14
def positional_encoding(tensor, start_index, omega):
    """
    tensor: a reference tensor used only for its shape; only T and C are actually needed. Shape (B, T, C)
    start_index: int, the starting position index (start and length together specify the positions).
    omega (B,): speaker position rates

    return (B, T, C), position embedding
    """
    dtype = omega.dtype
    _, length, dimension = tensor.shape
    index = F.range(start_index, start_index + length, 1, dtype=dtype)
    channel = F.range(0, dimension, 2, dtype=dtype)

    p = F.unsqueeze(omega, [1, 2]) \
      * F.unsqueeze(index, [1]) \
      / (10000 ** (channel / float(dimension)))

    encodings = F.concat([F.sin(p), F.cos(p)], axis=2)
    return encodings
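
A NumPy sketch of the same encoding with made-up sizes (B=2, T=5, C=8) and arbitrary speaker position rates:

import numpy as np

B, T, C = 2, 5, 8
omega = np.array([1.0, 0.5], dtype=np.float32)   # speaker position rates
start_index = 0

index = np.arange(start_index, start_index + T, dtype=np.float32)   # [T]
channel = np.arange(0, C, 2, dtype=np.float32)                      # [C/2]

# p[b, t, k] = omega[b] * index[t] / 10000 ** (channel[k] / C)
p = omega[:, None, None] * index[None, :, None] / (10000.0 ** (channel / float(C)))
encodings = np.concatenate([np.sin(p), np.cos(p)], axis=2)          # [B, T, C]
print(encodings.shape)   # (2, 5, 8)
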
Example #15
 def _build_position_ids(self, src_ids):
     d_shape = L.shape(src_ids)
     d_seqlen = d_shape[1]
     d_batch = d_shape[0]
     position_ids = L.reshape(L.range(0, d_seqlen, 1, dtype='int32'),
                              [1, d_seqlen, 1],
                              inplace=True)
     position_ids = L.expand(position_ids, [d_batch, 1, 1])
     position_ids = L.cast(position_ids, 'int64')
     position_ids.stop_gradient = True
     return position_ids
Example #16
    def get_mask_feats(self, inputs):
        name_list = list(inputs.keys())
        name_list = name_list[self.start_level:self.end_level +
                              1]  # [p2, p3, p4, p5]
        inputs2 = [inputs[name] for name in name_list]  # [p2, p3, p4, p5]
        inputs = inputs2

        feature_add_all_level = self.convs_all_levels[0][0](inputs[0])

        for i in range(1, len(inputs)):
            input_p = inputs[i]
            if i == 3:
                input_feat = input_p
                batch_size = L.shape(input_feat)[0]
                h = L.shape(input_feat)[2]
                w = L.shape(input_feat)[3]
                float_h = L.cast(h, 'float32')
                float_w = L.cast(w, 'float32')

                y_range = L.range(0., float_h, 1., dtype='float32')  # [h, ]
                y_range = 2.0 * y_range / (float_h - 1.0) - 1.0
                x_range = L.range(0., float_w, 1., dtype='float32')  # [w, ]
                x_range = 2.0 * x_range / (float_w - 1.0) - 1.0
                x_range = L.reshape(x_range, (1, -1))  # [1, w]
                y_range = L.reshape(y_range, (-1, 1))  # [h, 1]
                x = L.expand(x_range, [h, 1])  # [h, w]
                y = L.expand(y_range, [1, w])  # [h, w]

                x = L.reshape(x, (1, 1, h, w))  # [1, 1, h, w]
                y = L.reshape(y, (1, 1, h, w))  # [1, 1, h, w]
                x = L.expand(x, [batch_size, 1, 1, 1])  # [N, 1, h, w]
                y = L.expand(y, [batch_size, 1, 1, 1])  # [N, 1, h, w]

                input_p = L.concat([input_p, x, y], axis=1)  # [N, c+2, h, w]

            for ly in self.convs_all_levels[i]:
                input_p = ly(input_p)
            feature_add_all_level += input_p

        feature_pred = self.conv_pred(feature_add_all_level)
        return feature_pred
Example #17
    def forward(self, features):
        src_ids, sent_ids = features
        dtype = 'float16' if self.hparam['fp16'] else 'float32'
        zero = L.fill_constant([1], dtype='int64', value=0)
        input_mask = L.cast(L.logical_not(L.equal(src_ids, zero)), dtype) # assume pad id == 0
        #input_mask = L.unsqueeze(input_mask, axes=[2])
        d_shape = L.shape(src_ids)
        seqlen = d_shape[1]
        batch_size = d_shape[0]
        pos_ids = L.unsqueeze(L.range(0, seqlen, 1, dtype='int32'), axes=[0])
        pos_ids = L.expand(pos_ids, [batch_size, 1])
        pos_ids = L.unsqueeze(pos_ids, axes=[2])
        pos_ids = L.cast(pos_ids, 'int64')
        pos_ids.stop_gradient = True
        input_mask.stop_gradient = True
        task_ids = L.zeros_like(src_ids) + self.hparam.task_id  # task ids are unused at the moment
        task_ids.stop_gradient = True

        bert = ErnieModel(
            src_ids=src_ids,
            position_ids=pos_ids,
            sentence_ids=sent_ids,
            task_ids=task_ids,
            input_mask=input_mask,
            config=self.hparam,
            use_fp16=self.hparam['fp16']
        )

        cls_feats = bert.get_pooled_output()

        cls_feats = L.dropout(
            x=cls_feats,
            dropout_prob=0.1,
            dropout_implementation="upscale_in_train"
        )

        logits = L.fc(
            input=cls_feats,
            size=self.hparam['num_label'],
            param_attr=F.ParamAttr(
                name="cls_out_w",
                initializer=F.initializer.TruncatedNormal(scale=0.02)),
            bias_attr=F.ParamAttr(
                name="cls_out_b", initializer=F.initializer.Constant(0.))
        )

        propeller.summary.histogram('pred', logits)

        if self.mode is propeller.RunMode.PREDICT:
            probs = L.softmax(logits)
            return probs
        else:
            return logits
Example #18
def generate_relative_positions_matrix(length,
                                       max_relative_position,
                                       cache=False):
    if not cache:
        range_vec = layers.range(0, length, 1, 'int32')
        range_vec.stop_gradient = True
        shapes = layers.shape(range_vec)
        range_vec = layers.reshape(range_vec, shape=[1, shapes[0]])
        range_mat = layers.expand(range_vec, [shapes[0], 1])
        distance_mat = range_mat - layers.transpose(range_mat, [1, 0])
    else:
        distance_mat = layers.range(-1 * length + 1, 1, 1, 'int32')
        distance_mat.stop_gradient = True
        shapes = layers.shape(distance_mat)
        distance_mat = layers.reshape(distance_mat, [1, shapes[0]])

    distance_mat_clipped = layers.clip(
        layers.cast(distance_mat, dtype="float32"),
        float(-max_relative_position), float(max_relative_position))
    final_mat = layers.cast(distance_mat_clipped,
                            dtype='int32') + max_relative_position
    return final_mat
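
In the non-cache branch the result is just the pairwise offset j - i, clipped to ±max_relative_position and shifted to be non-negative; a NumPy sketch with length=5 and max_relative_position=2:

import numpy as np

length, max_relative_position = 5, 2

range_vec = np.arange(length)
range_mat = np.tile(range_vec[None, :], (length, 1))
distance_mat = range_mat - range_mat.T                    # pairwise offsets j - i
clipped = np.clip(distance_mat, -max_relative_position, max_relative_position)
final_mat = clipped + max_relative_position               # shift into [0, 2 * max_relative_position]
print(final_mat)
# [[2 3 4 4 4]
#  [1 2 3 4 4]
#  [0 1 2 3 4]
#  [0 0 1 2 3]
#  [0 0 0 1 2]]
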
Example #19
def gen_bias(encoder_inputs, decoder_inputs, step):
    decoder_bsz, decoder_seqlen = decoder_inputs.shape[:2]
    attn_bias = L.reshape(L.range(0, decoder_seqlen, 1, dtype='float32') + 1, [1, -1, 1])
    decoder_bias = L.cast((L.matmul(attn_bias, 1. / attn_bias, transpose_y=True) >= 1.),
                          'float32')  # [1, decoder_seqlen, decoder_seqlen]
    encoder_bias = L.unsqueeze(L.cast(L.ones_like(encoder_inputs), 'float32'), [1])  #[bsz, 1, encoderlen]
    encoder_bias = L.expand(encoder_bias, [1, decoder_seqlen, 1])  #[bsz,decoderlen, encoderlen]
    decoder_bias = L.expand(decoder_bias, [decoder_bsz, 1, 1])  #[bsz, decoderlen, decoderlen]
    if step > 0:
        bias = L.concat([encoder_bias, L.ones([decoder_bsz, decoder_seqlen, step], 'float32'), decoder_bias], -1)
    else:
        bias = L.concat([encoder_bias, decoder_bias], -1)
    return bias
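
The matmul(pos, 1/pos, transpose_y=True) >= 1 trick used above builds a lower-triangular causal mask from a 1-based position vector; a NumPy sketch for a decoder length of 4:

import numpy as np

decoder_seqlen = 4
pos = np.arange(decoder_seqlen, dtype=np.float32) + 1.0   # [1, 2, 3, 4]
decoder_bias = (np.outer(pos, 1.0 / pos) >= 1.0).astype(np.float32)
print(decoder_bias)
# [[1. 0. 0. 0.]
#  [1. 1. 0. 0.]
#  [1. 1. 1. 0.]
#  [1. 1. 1. 1.]]
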
Example #20
    def _ranking(self, inputs, predictions):
        """ Reranking generated responses. """
        src_token = inputs["src_token"]
        src_mask = inputs["src_mask"]
        src_pos = inputs["src_pos"]
        src_type = inputs["src_type"]
        src_turn = inputs["src_turn"]
        src_embed = self.embedder(src_token, src_pos, src_type, src_turn)

        batch_size, num_latent, tgt_seq_len = predictions.shape

        # shape: [batch_size, num_latent, seq_len, 1]
        preds_token = F.unsqueeze(predictions, [3])
        preds_mask = F.not_equal(preds_token, self.padding_idx, "int64")
        preds_pos = layers.range(0, tgt_seq_len, 1, dtype="float32")
        preds_pos = F.unsqueeze(preds_pos, [0, 0, 1])
        preds_pos = layers.expand(preds_pos, [batch_size, num_latent, 1, 1])
        preds_pos = layers.cast(preds_pos, "int64")
        preds_type = layers.zeros_like(preds_token)
        preds_turn = layers.zeros_like(preds_token)

        scores = []
        for i in range(num_latent):
            pred_token = preds_token[:, i]
            pred_mask = preds_mask[:, i]
            pred_pos = preds_pos[:, i]
            pred_type = preds_type[:, i]
            pred_turn = preds_turn[:, i]

            input_mask = layers.concat([src_mask, pred_mask], axis=1)
            input_mask.stop_gradient = True
            pred_embed = self.embedder(pred_token, pred_pos, pred_type,
                                       pred_turn)
            embed = layers.concat([src_embed, pred_embed], axis=1)
            embed = self.embed_layer_norm(embed)

            mask_embed = self.mask_embed
            mask_embed = layers.expand(mask_embed, [batch_size, 1, 1])
            mask_embed = self.embed_layer_norm(mask_embed)

            out = layers.concat([mask_embed, embed], axis=1)
            mask = self._create_mask(input_mask, append_head=True)

            for layer in self.layers:
                out = layer(out, mask, None)

            mask_embed = out[:, 0]
            score = self.discriminator(mask_embed)
            scores.append(score[:, 0])
        scores = layers.stack(scores, axis=1)
        return scores
Example #21
def batch_gather(var, indices):
    """Gather slices from var in each batch, according to corrensponding
    index in indices. Currently, it only support 2d Tensor.

    Args:
        var (Variable): with shape [batch_size, ...]
        indices (Variable): with shape [batch_size, 1] or [batch_size]

    Returns: Variable with shape [batch_size]

    Raises: NULL

    Examples:
        var
            [[1, 2, 3],
             [4, 5, 6]]
        indices
            [[2], [1]]

        return
            [[3], [5]]

    """
    if len(indices.shape) >= 2 and indices.shape[-1] != 1:
        raise ValueError(
            'shape of indices error. it should be a 1-D Tensor, or a 2-D Tensor whose '
            'second dimension is 1. but got shape = %s' %
            (str(indices.shape), ))

    if len(indices.shape) == 1:
        indices = layers.reshape(indices, shape=[-1, 1])

    reshape_input = len(var.shape) == 1
    if reshape_input:
        var = PaddleFluidWrapper.reshape(var, shape=[-1, 1])

    batch_size = layers.cast(layers.shape(indices)[0], dtype=indices.dtype)
    zero = layers.fill_constant(shape=[1], dtype=indices.dtype, value=0)
    one = layers.fill_constant(shape=[1], dtype=indices.dtype, value=1)
    batch_indices = layers.unsqueeze(
        layers.range(zero, batch_size, one, dtype=indices.dtype), [1])

    coord = layers.concat([batch_indices, indices], axis=1)
    coord.stop_gradient = True
    output = layers.gather_nd(var, coord)
    if reshape_input:
        output = PaddleFluidWrapper.reshape(output, shape=[-1])
    return output
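
The same (batch, index) coordinate construction in NumPy, reproducing the docstring example:

import numpy as np

var = np.array([[1, 2, 3],
                [4, 5, 6]])
indices = np.array([[2], [1]])

batch_indices = np.arange(var.shape[0]).reshape(-1, 1)     # [[0], [1]]
coord = np.concatenate([batch_indices, indices], axis=1)   # [[0, 2], [1, 1]]
output = var[coord[:, 0], coord[:, 1]].reshape(-1, 1)      # gather_nd over (batch, index) pairs
print(output)
# [[3]
#  [5]]
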
Example #22
 def gather_2d_by_gather(tensor_nd,
                         beam_idx,
                         beam_size,
                         batch_size,
                         need_flat=True):
     batch_idx = layers.range(0, batch_size, 1,
                              dtype="int64") * beam_size
     flat_tensor = merge_beam_dim(tensor_nd) if need_flat else tensor_nd
     idx = layers.reshape(
         layers.elementwise_add(beam_idx, batch_idx, 0), [-1])
     new_flat_tensor = layers.gather(flat_tensor, idx)
     new_tensor_nd = layers.reshape(
         new_flat_tensor,
         shape=[batch_size, beam_idx.shape[1]] +
         tensor_nd.shape[2:]) if need_flat else new_flat_tensor
     return new_tensor_nd
Example #23
def batch_gather_2d(var, indices):
    """Gather slices from var in each batch, according to corrensponding
    index in indices. Currently, it only support 2d Tensor.

    Args:
        var (Variable): with shape [batch_size, ...]
        indices (Variable): with shape [batch_size, max_len]

    Returns: Variable with shape [batch_size]

    Raises: NULL

    Examples:
        var
            [[1, 2, 3],
             [4, 5, 6]]
        indices
            [[2, 0], [1, 2]]

        return
            [[3, 1], [5, 6]]

    """
    if len(indices.shape) != 2:
        raise ValueError('shape of indices error. it should be a 2-D Tensor. '
                         'but got shape = %s' % (str(indices.shape), ))

    batch_size = layers.shape(indices)[0]

    zero = layers.fill_constant(shape=[1], dtype=indices.dtype, value=0)
    one = layers.fill_constant(shape=[1], dtype=indices.dtype, value=1)
    end = layers.cast(batch_size, dtype=indices.dtype)
    batch_indices_1d = layers.unsqueeze(
        layers.range(zero, end, one, dtype=indices.dtype), [1])

    seq_len = indices.shape[1]
    batch_indices = layers.expand(batch_indices_1d, [1, seq_len])

    coord_2d = layers.concat(
        [layers.unsqueeze(batch_indices, [2]),
         layers.unsqueeze(indices, [2])],
        axis=2)
    coord_2d.stop_gradient = True
    coord_1d = layers.reshape(coord_2d, shape=[-1, 2])
    output_1d = layers.gather_nd(var, coord_1d)
    output_2d = layers.reshape(output_1d, [batch_size, seq_len, var.shape[-1]])
    return output_2d
Example #24
def index_sample(x, index):
    """Select input value according to index
    
    Arags:
        input: input matrix
        index: index matrix

    Returns:
        output

    >>> input
    [
        [1, 2, 3],
        [4, 5, 6]
    ]
    >>> index
    [
        [1, 2],
        [0, 1]
    ]
    >>> index_sample(input, index)
    [
        [2, 3],
        [4, 5]
    ]
    """
    x_s = x.shape
    dim = len(index.shape) - 1
    assert x_s[:dim] == index.shape[:dim]
    r_x = layers.reshape(x, shape=(-1, *x_s[dim:]))
    index = layers.reshape(index, shape=(index.shape[0], index.shape[1], 1))
    # generate arange index, shape like index
    # arr_index = layers.arange(start=0, end=layers.cast(layers.shape(x)[0], ), dtype=index.dtype)
    batch_size = layers.cast(layers.shape(index)[0], dtype=index.dtype)
    zero = layers.fill_constant(shape=[1], dtype=index.dtype, value=0)
    one = layers.fill_constant(shape=[1], dtype=index.dtype, value=1)
    arr_index = layers.unsqueeze(
        layers.range(zero, batch_size, one, dtype=index.dtype), [1, 2])

    arr_index = layers.expand_as(arr_index, index)
    # generate new index
    new_index = layers.concat([arr_index, index], -1)
    new_index = layers.reshape(new_index, (-1, 2))
    # get output
    out = layers.gather_nd(r_x, new_index)
    out = layers.reshape(out, (-1, x_s[-1] * 2))
    return out
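
A NumPy sketch reproducing the docstring example of index_sample; per-row fancy indexing plays the role of the gather_nd over (batch, index) coordinates:

import numpy as np

x = np.array([[1, 2, 3],
              [4, 5, 6]])
index = np.array([[1, 2],
                  [0, 1]])

batch = np.arange(x.shape[0]).reshape(-1, 1)            # [[0], [1]]
out = x[np.broadcast_to(batch, index.shape), index]     # pick the index-th elements per row
print(out)
# [[2 3]
#  [4 5]]
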
Example #25
def build_and_run_program(place, batch_size, beam_size, stop_gradient=False):
    fluid.default_startup_program().random_seed = 1
    fluid.default_main_program().random_seed = 1
    np.random.seed(2)

    x = layers.assign(
        np.random.rand(batch_size, beam_size, 32).astype("float32"))
    indices = fluid.data(shape=[None, beam_size], dtype="int64", name="indices")
    step_idx = layers.fill_constant(
        shape=[1], dtype="int64", value=0, force_cpu=True)
    max_len = layers.fill_constant(
        shape=[1], dtype="int64", value=10, force_cpu=True)
    cond = layers.less_than(x=step_idx, y=max_len)
    while_op = layers.While(cond)
    scores = layers.array_write(x, step_idx)
    with while_op.block():
        bs = layers.cast(layers.shape(x)[0], "int64")
        for _ in range(20):
            bs = layers.cast(bs, 'int64')
        bs.stop_gradient = stop_gradient
        batch_pos = layers.expand(
            layers.unsqueeze(
                layers.range(
                    0, bs, 1, dtype=bs.dtype), [1]), [1, beam_size])
        topk_coordinates = layers.stack([batch_pos, indices], axis=2)
        topk_coordinates.stop_gradient = stop_gradient
        score = layers.gather_nd(x, topk_coordinates)
        layers.increment(x=step_idx, value=1.0, in_place=True)
        layers.array_write(score, i=step_idx, array=scores)
        length_cond = layers.less_than(x=step_idx, y=max_len)
        layers.assign(length_cond, cond)

    out = layers.tensor_array_to_tensor(scores, axis=0, use_stack=True)[0]
    loss = layers.reduce_mean(out)
    opt = fluid.optimizer.Adam(0.01)
    opt.minimize(loss)
    exe = fluid.Executor(place)
    data = np.random.random_integers(
        low=0, high=beam_size - 1, size=(batch_size, beam_size)).astype("int64")
    loss_val, = exe.run(feed={"indices": data}, fetch_list=[loss])

    return loss_val
Example #26
 def __init__(self, input_mask):
     super(BigBirdWrapper, self).__init__()
     max_seqlen = L.shape(input_mask)[1]
     input_mask = L.reshape(input_mask, [-1])
     num_nodes = L.shape(input_mask)[0]
     src, dst = build_edges(num_nodes, input_mask, max_seqlen)
     self._edges_src = src
     self._edges_dst = dst
     self._edges_src.stop_gradient = True
     self._edges_dst.stop_gradient = True
     self._num_nodes = num_nodes
     self._num_edges = L.shape(self._edges_src)[0]
     self._node_ids = L.range(0, self._num_nodes, step=1, dtype="int32")
     self._edge_uniq_dst, _, uniq_count = L.unique_with_counts(
         self._edges_dst, dtype="int32")
     self._edge_uniq_dst.stop_gradient = True
     last = L.reduce_sum(uniq_count, keep_dim=True)
     uniq_count = L.cumsum(uniq_count, exclusive=True)
     self._edge_uniq_dst_count = L.concat([uniq_count, last])
     self._edge_uniq_dst_count.stop_gradient = True
Example #27
    def forward(self, features):
        src_ids, sent_ids, input_seqlen = features
        zero = L.fill_constant([1], dtype='int64', value=0)
        input_mask = L.cast(L.logical_not(L.equal(src_ids, zero)),
                            'float32')  # assume pad id == 0
        #input_mask = L.unsqueeze(input_mask, axes=[2])
        d_shape = L.shape(src_ids)
        seqlen = d_shape[1]
        batch_size = d_shape[0]
        pos_ids = L.unsqueeze(L.range(0, seqlen, 1, dtype='int32'), axes=[0])
        pos_ids = L.expand(pos_ids, [batch_size, 1])
        pos_ids = L.unsqueeze(pos_ids, axes=[2])
        pos_ids = L.cast(pos_ids, 'int64')
        pos_ids.stop_gradient = True
        input_mask.stop_gradient = True
        task_ids = L.zeros_like(
            src_ids) + self.hparam.task_id  # task ids are unused at the moment
        task_ids.stop_gradient = True

        model = ErnieModel(src_ids=src_ids,
                           position_ids=pos_ids,
                           sentence_ids=sent_ids,
                           task_ids=task_ids,
                           input_mask=input_mask,
                           config=self.hparam,
                           use_fp16=self.hparam['use_fp16'])

        enc_out = model.get_sequence_output()
        logits = L.fc(
            input=enc_out,
            size=self.num_label,
            num_flatten_dims=2,
            param_attr=F.ParamAttr(
                name="cls_seq_label_out_w",
                initializer=F.initializer.TruncatedNormal(scale=0.02)),
            bias_attr=F.ParamAttr(name="cls_seq_label_out_b",
                                  initializer=F.initializer.Constant(0.)))

        propeller.summary.histogram('pred', logits)

        return logits, input_seqlen
Example #28
    def forward(self, indices, speaker_position_rate=None):
        """
        Args:
            indices (Variable): shape (B, T), dtype: int64, position
                indices, where B means the batch size, T means the time steps.
            speaker_position_rate (Variable | float, optional), position
                rate. It can be a float point number or a Variable with 
                shape (1,), then this speaker_position_rate is used for every 
                example. It can also be a Variable with shape (B, ), which 
                contains a speaker position rate for each utterance.
        Returns:
            out (Variable): shape(B, T, C_pos), dtype float32, position embedding, where C_pos 
                means position embedding size.
        """
        batch_size, time_steps = indices.shape

        # convert speaker_position_rate to a Variable with shape(B, )
        if isinstance(speaker_position_rate, float):
            speaker_position_rate = dg.to_variable(
                np.array([speaker_position_rate]).astype("float32"))
            speaker_position_rate = F.expand(speaker_position_rate,
                                             [batch_size])
        elif isinstance(speaker_position_rate, fluid.framework.Variable) \
            and list(speaker_position_rate.shape) == [1]:
            speaker_position_rate = F.expand(speaker_position_rate,
                                             [batch_size])
        assert len(speaker_position_rate.shape) == 1 and \
            list(speaker_position_rate.shape) == [batch_size]

        weight = compute_position_embedding(self.weight,
                                            speaker_position_rate)  # (B, V, C)
        # make indices for gather_nd
        batch_id = F.expand(
            F.unsqueeze(
                F.range(
                    0, batch_size, 1, dtype="int64"), [1]), [1, time_steps])
        # (B, T, 2)
        gather_nd_id = F.stack([batch_id, indices], -1)

        out = F.gather_nd(weight, gather_nd_id)
        return out
Example #29
def build_edges(num_nodes, input_mask, max_seqlen):
    edges = L.range(start=0, end=num_nodes, step=1, dtype="int32")
    all_edges = []
    # Window
    filter_func = lambda x, y: select_edges(x, y, input_mask, num_nodes,
                                            max_seqlen)

    all_edges.append(filter_func(edges - 1, edges))  # win-1
    all_edges.append(filter_func(edges + 1, edges))  # win-2
    all_edges.append(filter_func(edges, edges))  #self-loop

    # Global Assume [CLS] is the first token.

    # vertical cls-window attention
    cls_position = edges / max_seqlen * max_seqlen
    all_edges.append(filter_func(cls_position, edges))

    # horizontal cls attention
    all_edges.append(filter_func(edges, cls_position))

    # Random
    for i in range(2):
        rand_edge = L.floor(
            L.uniform_random(min=0, max=1, shape=[num_nodes]) *
            L.cast(max_seqlen, dtype="float32"))
        rand_edge = L.cast(rand_edge, dtype="int32") + cls_position
        all_edges.append(filter_func(rand_edge, edges))

    if len(all_edges) > 1:
        src = L.concat([s for s, d in all_edges], 0)
        dst = L.concat([d for s, d in all_edges], 0)
    else:
        src = all_edges[0][0]
        dst = all_edges[0][1]

    # sort edges
    sorted_src, sorted_dst = uniq_edges(src, dst, num_nodes)
    return sorted_src, sorted_dst
Example #30
    def forward(self,
                src_ids,
                sent_ids=None,
                pos_ids=None,
                input_mask=None,
                attn_bias=None,
                past_cache=None,
                use_causal_mask=False):
        """
        Args:
            src_ids (`Variable` of shape `[batch_size, seq_len]`):
                Indices of input sequence tokens in the vocabulary.
            sent_ids (optional, `Variable` of shape `[batch_size, seq_len]`):
                aka token_type_ids, Segment token indices to indicate first and second portions of the inputs.
                if None, assume all tokens come from `segment_a`
            pos_ids(optional, `Variable` of shape `[batch_size, seq_len]`):
                Indices of positions of each input sequence tokens in the position embeddings.
            input_mask(optional `Variable` of shape `[batch_size, seq_len]`):
                Mask to avoid performing attention on the padding token indices of the encoder input.
            attn_bias(optional, `Variable` of shape `[batch_size, seq_len, seq_len]`, or False):
                3D version of `input_mask`; if set, it overrides `input_mask`. If set to False, no attention mask will be applied.
            past_cache(optional, tuple of two lists: cached key and cached value,
                each is a list of `Variable`s of shape `[batch_size, seq_len, hidden_size]`):
                cached key/value tensors that will be concatenated to the generated key/value when performing self attention.
                if set, `attn_bias` should not be None.

        Returns:
            pooled (`Variable` of shape `[batch_size, hidden_size]`):
                output logits of pooler classifier
            encoded(`Variable` of shape `[batch_size, seq_len, hidden_size]`):
                output logits of transformer stack
        """
        assert len(
            src_ids.shape
        ) == 2, 'expect src_ids.shape = [batch, sequence], got %s' % (repr(
            src_ids.shape))
        assert attn_bias is not None if past_cache else True, 'if `past_cache` is specified; attn_bias should not be None'
        d_batch = L.shape(src_ids)[0]
        d_seqlen = L.shape(src_ids)[1]
        if pos_ids is None:
            pos_ids = L.reshape(L.range(0, d_seqlen, 1, dtype='int32'),
                                [1, -1])
            pos_ids = L.cast(pos_ids, 'int64')
        if attn_bias is None:
            if input_mask is None:
                input_mask = L.cast(src_ids != 0, 'float32')
            assert len(input_mask.shape) == 2
            input_mask = L.unsqueeze(input_mask, axes=[-1])
            attn_bias = L.matmul(input_mask, input_mask, transpose_y=True)
            if use_causal_mask:
                sequence = L.reshape(
                    L.range(0, d_seqlen, 1, dtype='float32') + 1.,
                    [1, 1, -1, 1])
                causal_mask = L.cast((L.matmul(
                    sequence, 1. / sequence, transpose_y=True) >= 1.),
                                     'float32')
                attn_bias *= causal_mask
        else:
            assert len(
                attn_bias.shape
            ) == 3, 'expect attn_bias tobe rank 3, got %r' % attn_bias.shape
        attn_bias = (1. - attn_bias) * -10000.0
        attn_bias = L.unsqueeze(attn_bias, [1])
        attn_bias = L.expand(attn_bias,
                             [1, self.n_head, 1, 1])  # avoid broadcast =_=
        attn_bias.stop_gradient = True

        if sent_ids is None:
            sent_ids = L.zeros_like(src_ids)

        src_embedded = self.word_emb(src_ids)
        pos_embedded = self.pos_emb(pos_ids)
        sent_embedded = self.sent_emb(sent_ids)
        embedded = src_embedded + pos_embedded + sent_embedded

        embedded = self.dropout(self.ln(embedded))

        encoded, hidden_list, cache_list = self.encoder_stack(
            embedded, attn_bias, past_cache=past_cache)
        if self.pooler is not None:
            pooled = self.pooler(encoded[:, 0, :])
        else:
            pooled = None

        additional_info = {
            'hiddens': hidden_list,
            'caches': cache_list,
        }

        if self.return_additional_info:
            return pooled, encoded, additional_info
        else:
            return pooled, encoded