def build_position_ids(src_ids, dst_ids):
    """Build position ids for a source segment followed by a target segment.

    Source positions run 0..src_len-1. Target positions continue at src_len,
    cover src_len - 1 steps, and are shifted back per row by the number of
    source entries equal to 0 (assumed to be the pad id).

    Args:
        src_ids: source token ids; presumably [B, src_len, 1] int64 -- TODO confirm.
        dst_ids: not used by this function; kept for interface compatibility.

    Returns:
        int64 tensor holding source and target positions concatenated on
        axis 1, with ``stop_gradient`` set.
    """
    shape = L.shape(src_ids)
    batch, src_len = shape[0], shape[1]
    dst_len = src_len - 1  # target side carries no [CLS]

    # Source positions: [1, src_len, 1] tiled over the batch.
    src_pos = L.range(0, src_len, 1, dtype='int32')
    src_pos = L.reshape(src_pos, [1, src_len, 1], inplace=True)
    src_pos = L.expand(src_pos, [batch, 1, 1])

    # Count pad tokens per row (pad id assumed to be 0); axis 1 is kept so
    # the count can be subtracted from the 3-D target positions.
    pad_id = L.fill_constant([1], dtype='int64', value=0)
    is_pad = L.cast(L.equal(src_ids, pad_id), "int32")
    pad_count = L.reduce_sum(is_pad, 1, keep_dim=True)

    # Target positions continue after the source, minus the source padding.
    dst_pos = L.range(src_len, src_len + dst_len, 1, dtype='int32')
    dst_pos = L.reshape(dst_pos, [1, dst_len, 1], inplace=True)
    dst_pos = L.expand(dst_pos, [batch, 1, 1])
    dst_pos = dst_pos - pad_count

    merged = L.cast(L.concat([src_pos, dst_pos], 1), 'int64')
    merged.stop_gradient = True
    return merged
def concat_coord(x):
    """Append two normalized coordinate channels to a feature map.

    Builds an x-coordinate and a y-coordinate plane whose values run linearly
    from -1 to 1 across the width and height, and concatenates them to ``x``
    along axis 1.

    Args:
        x: feature map, indexed here as [N, c, h, w].

    Returns:
        Tensor [N, c + 2, h, w]: the input followed by the x plane, then the
        y plane.
    """
    features = x
    n = L.shape(x)[0]
    height = L.shape(x)[2]
    width = L.shape(x)[3]
    height_f = L.cast(height, 'float32')
    width_f = L.cast(width, 'float32')

    # Linear ramps in [-1, 1]; note the divisor is (size - 1).
    ys = L.range(0., height_f, 1., dtype='float32')  # [h]
    ys = 2.0 * ys / (height_f - 1.0) - 1.0
    xs = L.range(0., width_f, 1., dtype='float32')  # [w]
    xs = 2.0 * xs / (width_f - 1.0) - 1.0

    xs = L.reshape(xs, (1, -1))  # [1, w]
    ys = L.reshape(ys, (-1, 1))  # [h, 1]
    x_plane = L.expand(xs, [height, 1])  # [h, w]
    y_plane = L.expand(ys, [1, width])  # [h, w]

    x_plane = L.reshape(x_plane, (1, 1, height, width))
    y_plane = L.reshape(y_plane, (1, 1, height, width))
    x_plane = L.expand(x_plane, [n, 1, 1, 1])  # [N, 1, h, w]
    y_plane = L.expand(y_plane, [n, 1, 1, 1])  # [N, 1, h, w]

    return L.concat([features, x_plane, y_plane], axis=1)
def _build_position_ids(self, src_ids):
    """Build position ids for one "a" slot followed by ``num_b`` "b" slots.

    The first ``self.slot_seqlen`` positions count 0..slot_seqlen-1. Each of
    the following slots counts slot_seqlen..2*slot_seqlen-1, shifted back per
    row by the number of entries equal to 0 (assumed pad id) in the first slot.

    Args:
        src_ids: token ids; presumably [B, (num_b + 1) * slot_seqlen, 1]
            int64 -- TODO confirm against the caller.

    Returns:
        int64 position-id tensor with ``stop_gradient`` set.
    """
    src_shape = L.shape(src_ids)
    src_seqlen = src_shape[1]
    src_batch = src_shape[0]
    slot_seqlen = self.slot_seqlen
    # Number of "b" slots that follow the leading "a" slot.
    num_b = (src_seqlen / slot_seqlen) - 1

    a_position_ids = L.reshape(
        L.range(0, slot_seqlen, 1, dtype='int32'),
        [1, slot_seqlen, 1],
        inplace=True)  # [1, slot_seqlen, 1]
    a_position_ids = L.expand(a_position_ids, [src_batch, 1, 1])  # [B, slot_seqlen, 1]

    zero = L.fill_constant([1], dtype='int64', value=0)
    # Marks pad tokens (pad id assumed to be 0) inside the "a" slot.
    input_mask = L.cast(L.equal(src_ids[:, :slot_seqlen], zero), "int32")
    # keep_dim=True keeps the reduced axis so the per-row pad count stays
    # rank-aligned with b_position_ids for the subtraction below.
    a_pad_len = L.reduce_sum(input_mask, 1, keep_dim=True)  # [B, 1, 1]

    b_position_ids = L.reshape(
        L.range(slot_seqlen, 2 * slot_seqlen, 1, dtype='int32'),
        [1, slot_seqlen, 1],
        inplace=True)  # [1, slot_seqlen, 1]
    b_position_ids = L.expand(
        b_position_ids, [src_batch, num_b, 1])  # [B, slot_seqlen * num_b, 1]
    b_position_ids = b_position_ids - a_pad_len  # [B, slot_seqlen * num_b, 1]

    position_ids = L.concat([a_position_ids, b_position_ids], 1)
    position_ids = L.cast(position_ids, 'int64')
    position_ids.stop_gradient = True
    return position_ids
def decode(conv_output, anchors, stride, num_class, conf_thresh):
    """Decode a raw detection-head output into boxes, confidences and class scores.

    The output is viewed as [batch, n_grid, n_grid, anchors, 5 + num_class].
    Box centers are ``(sigmoid(dxdy) + grid_offset) * stride``, box sizes are
    ``exp(dwdh) * anchors``; confidence and class scores go through a sigmoid.

    Args:
        conv_output: raw head output; axis 0 is the batch, axis 1 the grid size.
        anchors: anchor sizes for this scale; ``len(anchors)`` is the number
            of anchors per cell.
        stride: multiplier applied to the decoded center coordinates.
        num_class: number of classes.
        conf_thresh: not used by this function.

    Returns:
        Tuple ``(pred_xywh, pred_conf, pred_prob)`` reshaped to
        [batch, -1, 4], [batch, -1, 1] and [batch, -1, num_class].
    """
    out_shape = P.shape(conv_output)
    batch = out_shape[0]
    grid = out_shape[1]
    num_anchors = len(anchors)

    conv_output = P.reshape(
        conv_output, (batch, grid, grid, num_anchors, 5 + num_class))
    raw_xy = conv_output[:, :, :, :, 0:2]
    raw_wh = conv_output[:, :, :, :, 2:4]
    raw_conf = conv_output[:, :, :, :, 4:5]
    raw_prob = conv_output[:, :, :, :, 5:]

    # Per-cell grid offsets: the last axis holds (index along axis 1,
    # index along axis 0) of each cell.
    grid_a = P.range(0, grid, 1, 'float32')
    grid_b = P.range(0, grid, 1, 'float32')
    grid_a = P.expand(P.reshape(grid_a, (1, -1, 1)), [grid, 1, 1])
    grid_b = P.expand(P.reshape(grid_b, (-1, 1, 1)), [1, grid, 1])
    offset = P.concat([grid_a, grid_b], axis=-1)
    offset = P.reshape(offset, (1, grid, grid, 1, 2))
    offset = P.expand(offset, [batch, 1, 1, num_anchors, 1])

    pred_xy = (P.sigmoid(raw_xy) + offset) * stride
    pred_wh = (P.exp(raw_wh) * P.assign(anchors))
    pred_xywh = P.concat([pred_xy, pred_wh], axis=-1)
    pred_conf = P.sigmoid(raw_conf)
    pred_prob = P.sigmoid(raw_prob)

    # Flatten grid cells and anchors into a single candidates axis.
    pred_xywh = P.reshape(pred_xywh, (batch, -1, 4))
    pred_conf = P.reshape(pred_conf, (batch, -1, 1))
    pred_prob = P.reshape(pred_prob, (batch, -1, num_class))
    return pred_xywh, pred_conf, pred_prob
def matrix_nms(seg_masks, cate_labels, cate_scores, kernel='gaussian', sigma=2.0, sum_masks=None):
    """Matrix NMS for multi-class masks.

    Args:
        seg_masks (Tensor): shape (n, h, w), masks made of 0s and 1s. Must be
            a floating-point tensor because it is fed to ``matmul``.
        cate_labels (Tensor): shape (n), mask labels, sorted by descending score.
        cate_scores (Tensor): shape (n), mask scores in descending order.
        kernel (str): 'gaussian' or 'linear'.
        sigma (float): coefficient of the gaussian decay.
        sum_masks (Tensor, optional): shape (n,), area of each mask. When
            None it is computed by summing each flattened mask.

    Returns:
        Tensor: cate_scores_update, the decayed scores, shape (n).

    Raises:
        NotImplementedError: if ``kernel`` is neither 'gaussian' nor 'linear'.
    """
    n_samples = L.shape(cate_labels)[0]  # number of candidate masks
    seg_masks = L.reshape(seg_masks, (n_samples, -1))  # [n, h*w]
    if sum_masks is None:
        # The signature advertises sum_masks as optional; derive the areas
        # from the flattened masks instead of failing on None.
        sum_masks = L.reduce_sum(seg_masks, dim=1)  # [n]

    # Pairwise intersection areas: masks times their own transpose.
    inter_matrix = L.matmul(seg_masks, seg_masks, transpose_y=True)  # [n, n]
    # Areas repeated over n rows, used to form the pairwise union.
    sum_masks_x = L.expand(L.reshape(sum_masks, (1, -1)), [n_samples, 1])  # [n, n]
    iou_matrix = inter_matrix / (
        sum_masks_x + L.transpose(sum_masks_x, [1, 0]) - inter_matrix)

    # Strict upper-triangle mask: entry (i, j) is 1 only when j > i.
    rows = L.range(0, n_samples, 1, 'int32')
    cols = L.range(0, n_samples, 1, 'int32')
    rows = L.expand(L.reshape(rows, (1, -1)), [n_samples, 1])
    cols = L.expand(L.reshape(cols, (-1, 1)), [1, n_samples])
    tri_mask = L.cast(rows > cols, 'float32')
    iou_matrix = tri_mask * iou_matrix  # keep the upper triangle only

    # Label-specific matrix: 1 where both masks carry the same label.
    cate_labels_x = L.expand(L.reshape(cate_labels, (1, -1)), [n_samples, 1])  # [n, n]
    label_matrix = L.cast(
        L.equal(cate_labels_x, L.transpose(cate_labels_x, [1, 0])), 'float32')
    label_matrix = tri_mask * label_matrix  # keep the upper triangle only

    # IoU decay: same-label IoU of each mask with the masks ranked above it.
    decay_iou = iou_matrix * label_matrix

    # IoU compensation: per-column maximum of the decay IoU, broadcast so
    # that row i holds the value belonging to mask i.
    compensate_iou = L.reduce_max(decay_iou, dim=0)
    compensate_iou = L.expand(L.reshape(compensate_iou, (1, -1)), [n_samples, 1])  # [n, n]
    compensate_iou = L.transpose(compensate_iou, [1, 0])  # [n, n]

    if kernel == 'gaussian':
        decay_matrix = L.exp(-1 * sigma * (decay_iou ** 2))
        compensate_matrix = L.exp(-1 * sigma * (compensate_iou ** 2))
        decay_coefficient = L.reduce_min((decay_matrix / compensate_matrix), dim=0)
    elif kernel == 'linear':
        decay_matrix = (1 - decay_iou) / (1 - compensate_iou)
        decay_coefficient = L.reduce_min(decay_matrix, dim=0)
    else:
        raise NotImplementedError(
            "unsupported kernel %r, expected 'gaussian' or 'linear'" % (kernel, ))

    # Update the scores.
    cate_scores_update = cate_scores * decay_coefficient
    return cate_scores_update
def fast_nms(self, boxes, scores, masks, max_num_detections=100):
    """Fast NMS over boxes with attached mask data.

    Per class, the ``self.top_k`` best-scoring candidates are kept, pairwise
    IoU is computed with ``jaccard``, and a candidate is dropped when its IoU
    with any higher-scored candidate of the same class exceeds
    ``self.nms_thresh``. The survivors of all classes are then sorted by
    score and cut to ``max_num_detections``.

    Args:
        boxes: candidate boxes, gathered by candidate index; 4 values each.
        scores: per-class scores, indexed [class, candidate].
        masks: per-candidate mask data, gathered alongside the boxes.
        max_num_detections: cap on the number of returned detections.

    Returns:
        Tuple ``(boxes, masks, classes, scores)`` of the kept detections.
    """
    thresh = self.nms_thresh
    k = self.top_k

    # Sort each class's candidates by descending score and keep the top k.
    scores, order = P.argsort(scores, axis=1, descending=True)
    order = order[:, :k]
    scores = scores[:, :k]

    n_cls, n_det = P.shape(order)[0], P.shape(order)[1]

    flat_order = P.reshape(order, (-1, ))
    boxes = P.reshape(P.gather(boxes, flat_order), (n_cls, n_det, 4))
    masks = P.reshape(P.gather(masks, flat_order), (n_cls, n_det, -1))

    # [c, n, n] IoU matrix: pairwise IoU of the n candidates of each class.
    iou = jaccard(boxes, boxes)

    # IoU is symmetric and a box's IoU with itself is 1, so the diagonal and
    # the lower triangle of every class slice are zeroed out.
    col_idx = P.range(0, n_det, 1, 'int32')
    row_idx = P.range(0, n_det, 1, 'int32')
    col_idx = P.expand(P.reshape(col_idx, (1, -1)), [n_det, 1])
    row_idx = P.expand(P.reshape(row_idx, (-1, 1)), [1, n_det])
    upper = P.cast(col_idx > row_idx, 'float32')
    upper = P.expand(P.reshape(upper, (1, n_det, n_det)), [n_cls, 1, 1])
    iou = upper * iou
    max_overlap = P.reduce_max(iou, dim=1)

    # Keep only candidates whose overlap does not exceed the threshold.
    keep = P.where(max_overlap <= thresh)

    # Assign each kept detection to its corresponding class.
    classes = P.range(0, n_cls, 1, 'int32')
    classes = P.expand(P.reshape(classes, (-1, 1)), [1, n_det])
    classes = P.gather_nd(classes, keep)

    boxes = P.gather_nd(boxes, keep)
    masks = P.gather_nd(masks, keep)
    scores = P.gather_nd(scores, keep)

    # Only keep the highest-scoring detections across all classes.
    scores, order = P.argsort(scores, axis=0, descending=True)
    order = order[:max_num_detections]
    scores = scores[:max_num_detections]

    classes = P.gather(classes, order)
    boxes = P.gather(boxes, order)
    masks = P.gather(masks, order)
    return boxes, masks, classes, scores
def fast_nms(boxes, scores, conf_thresh, nms_thresh, keep_top_k, nms_top_k):
    """Fast NMS over per-class scored boxes.

    Per class, the ``keep_top_k`` best-scoring candidates are kept, pairwise
    IoU is computed with ``_iou``, and a candidate is dropped when its IoU
    with any higher-scored candidate of the same class exceeds
    ``nms_thresh``. The survivors of all classes are then sorted by score and
    cut to ``nms_top_k``.

    Args:
        boxes: [?, 4] candidate boxes.
        scores: [num_classes, ?] per-class scores.
        conf_thresh: not used by this function.
        nms_thresh: IoU threshold above which a candidate is suppressed.
        keep_top_k: number of candidates kept per class before suppression.
        nms_top_k: number of detections kept overall after suppression.

    Returns:
        Tuple ``(boxes, scores, classes)`` of the kept detections.
    """
    # Sort each class's candidates by descending score.
    scores, order = P.argsort(scores, axis=1, descending=True)
    order = order[:, :keep_top_k]
    scores = scores[:, :keep_top_k]

    n_cls, n_det = P.shape(order)[0], P.shape(order)[1]

    flat_order = P.reshape(order, (-1, ))
    boxes = P.reshape(P.gather(boxes, flat_order), (n_cls, n_det, 4))

    # [c, n, n] IoU matrix: pairwise IoU of the n candidates of each class.
    iou = _iou(boxes, boxes)

    # IoU is symmetric and a box's IoU with itself is 1, so the diagonal and
    # the lower triangle of every class slice are zeroed out.
    col_idx = P.range(0, n_det, 1, 'int32')
    row_idx = P.range(0, n_det, 1, 'int32')
    col_idx = P.expand(P.reshape(col_idx, (1, -1)), [n_det, 1])
    row_idx = P.expand(P.reshape(row_idx, (-1, 1)), [1, n_det])
    upper = P.cast(col_idx > row_idx, 'float32')
    upper = P.expand(P.reshape(upper, (1, n_det, n_det)), [n_cls, 1, 1])
    iou = upper * iou
    max_overlap = P.reduce_max(iou, dim=1)

    # Within a class, a box is dropped when its highest IoU with any
    # higher-scored box exceeds nms_thresh; the box at index 0 always stays.
    keep = P.where(max_overlap <= nms_thresh)

    # Assign each kept detection to its corresponding class.
    classes = P.range(0, n_cls, 1, 'int32')
    classes = P.expand(P.reshape(classes, (-1, 1)), [1, n_det])
    classes = P.gather_nd(classes, keep)

    boxes = P.gather_nd(boxes, keep)
    scores = P.gather_nd(scores, keep)

    # Only keep the highest-scoring detections across all classes.
    scores, order = P.argsort(scores, axis=0, descending=True)
    order = order[:nms_top_k]
    scores = scores[:nms_top_k]

    classes = P.gather(classes, order)
    boxes = P.gather(boxes, order)
    return boxes, scores, classes
def __call__(self, input):
    """Append normalized x/y coordinate channels when coord conv is enabled.

    Sizes are read at run time through ``L.shape``.

    Args:
        input: feature map indexed as [b, c, h, w].

    Returns:
        ``input`` unchanged when ``self.coord_conv`` is falsy; otherwise
        ``input`` concatenated on axis 1 with an x plane and a y plane whose
        values run linearly from -1 to 1.
    """
    if not self.coord_conv:
        return input

    batch, height, width = (L.shape(input)[axis] for axis in (0, 2, 3))

    xs = L.range(0, width, 1., dtype='float32') / (width - 1) * 2.0 - 1
    ys = L.range(0, height, 1., dtype='float32') / (height - 1) * 2.0 - 1

    xs = L.reshape(xs, (1, 1, 1, -1))  # [1, 1, 1, w]
    ys = L.reshape(ys, (1, 1, -1, 1))  # [1, 1, h, 1]
    xs = L.expand(xs, [batch, 1, height, 1])  # [b, 1, h, w]
    ys = L.expand(ys, [batch, 1, 1, width])  # [b, 1, h, w]

    return L.concat([input, xs, ys], axis=1)
def _decode(self, x, y, w, h, anchors, stride, scale_x_y, eps, is_gt=False):
    """Decode per-anchor x/y/w/h maps into normalized xywh boxes.

    Centers are ``(xy + grid_offset) / n_grid``. For predictions ``xy`` is
    the sigmoid of the raw value, optionally rescaled around 0.5 when
    ``scale_x_y`` differs from 1 by at least ``eps``; for ground truth the
    raw value is used directly. Sizes are
    ``exp(wh) * anchors / (n_grid * stride)``.

    Args:
        x, y, w, h: raw maps sharing one shape, e.g. (8, 13, 13, 3); axis 1
            is read as the grid size and axis 3 as the anchors per cell.
        anchors: anchor sizes, copied into a float32 tensor.
        stride: stride of this scale.
        scale_x_y: grid-sensitive scale factor.
        eps: tolerance used when comparing ``scale_x_y`` with 1.0.
        is_gt: True when decoding ground truth; the result then has
            ``stop_gradient`` set.

    Returns:
        Tensor with a trailing axis of 4 (x, y, w, h), e.g. (8, 13, 13, 3, 4).
    """
    in_shape = x.shape  # e.g. (8, 13, 13, 3)
    batch = in_shape[0]
    grid = in_shape[1]
    num_anchors = in_shape[3]

    raw_xy = L.concat([L.unsqueeze(x, 4), L.unsqueeze(y, 4)], -1)  # (..., 2)
    raw_wh = L.concat([L.unsqueeze(w, 4), L.unsqueeze(h, 4)], -1)  # (..., 2)

    # Per-cell grid offsets, broadcast over batch and anchors.
    grid_a = L.range(0, grid, 1, 'float32')
    grid_b = L.range(0, grid, 1, 'float32')
    grid_a = L.expand(L.reshape(grid_a, (1, -1, 1)), [grid, 1, 1])
    grid_b = L.expand(L.reshape(grid_b, (-1, 1, 1)), [1, grid, 1])
    offset = L.concat([grid_a, grid_b], axis=-1)
    offset = L.reshape(offset, (1, grid, grid, 1, 2))
    offset = L.expand(offset, [batch, 1, 1, num_anchors, 1])

    if is_gt:
        cell_xy = raw_xy
    elif abs(scale_x_y - 1.0) < eps:
        cell_xy = L.sigmoid(raw_xy)
    else:
        # Grid Sensitive
        cell_xy = scale_x_y * L.sigmoid(raw_xy) - 0.5 * (scale_x_y - 1.0)
    decode_xy = (cell_xy + offset) / grid

    anchor_t = fluid.layers.assign(np.copy(anchors).astype(np.float32))
    decode_wh = (L.exp(raw_wh) * anchor_t) / (grid * stride)

    decode_xywh = L.concat([decode_xy, decode_wh], axis=-1)
    if is_gt:
        decode_xywh.stop_gradient = True
    return decode_xywh  # e.g. (8, 13, 13, 3, 4)
def __call__(self, input):
    """Append normalized x/y coordinate channels when coord conv is enabled.

    Sizes are read from the static ``input.shape``.

    Args:
        input: feature map indexed as [b, c, h, w].

    Returns:
        ``input`` unchanged when ``self.coord_conv`` is falsy; otherwise
        ``input`` concatenated on axis 1 with an x plane and a y plane whose
        values run linearly from -1 to 1.
    """
    if not self.coord_conv:
        return input

    in_shape = input.shape
    batch, height, width = in_shape[0], in_shape[2], in_shape[3]

    xs = L.range(0, width, 1., dtype='float32') / (width - 1) * 2.0 - 1
    ys = L.range(0, height, 1., dtype='float32') / (height - 1) * 2.0 - 1

    xs = L.reshape(xs, (1, 1, 1, -1))  # [1, 1, 1, w]
    ys = L.reshape(ys, (1, 1, -1, 1))  # [1, 1, h, 1]
    xs = L.expand(xs, [batch, 1, height, 1])  # [b, 1, h, w]
    ys = L.expand(ys, [batch, 1, 1, width])  # [b, 1, h, w]

    return L.concat([input, xs, ys], axis=1)
def batch_scatter(ref, indices, updates, in_place=False, overwrite=False):
    """Scatter one update per batch row into ``ref``.

    For every row ``b`` the value ``updates[b]`` is added to
    ``ref[b, indices[b]]``; with ``overwrite=True`` the previous value is
    cancelled first, so the update replaces it. Non-float inputs are cast to
    float32 for the scatter and cast back afterwards.

    Args:
        ref (Variable): with shape [batch_size, ...]
        indices (Variable): with shape [batch_size, 1]
        updates (Variable): with shape [batch_size]
        in_place (bool): if True, the result is assigned to ``ref`` and
            ``ref`` is returned; otherwise a new tensor is returned.
            Default is False.
        overwrite (bool): if True, the addressed elements are overwritten
            instead of accumulated. Default is False.

    Returns:
        Variable: ``ref`` when ``in_place`` is True, else the scatter result.

    Examples:
        ref
            [[1, 1, 1],
             [1, 1, 1]]
        indices
            [[2], [1]]
        updates
            [2, 3]

        return
            [[1, 1, 2],
             [1, 3, 1]]
    """
    orig_dtype = ref.dtype
    is_float = orig_dtype in PaddleVarType.floats
    work = ref if is_float else layers.cast(ref, dtype='float32')

    if updates.dtype != work.dtype:
        updates = layers.cast(updates, dtype=work.dtype)

    # Pair each row number with its index to get (row, col) coordinates.
    idx_dtype = indices.dtype
    n_rows = layers.cast(layers.shape(work)[0], dtype=idx_dtype)
    start = layers.fill_constant(shape=[1], dtype=idx_dtype, value=0)
    step = layers.fill_constant(shape=[1], dtype=idx_dtype, value=1)
    row_ids = layers.unsqueeze(
        layers.range(start, n_rows, step, dtype=idx_dtype), [1])
    coord = layers.concat([row_ids, indices], axis=1)

    if overwrite:
        # Add the negated current values so the addressed cells become zero
        # before the updates are added.
        current = layers.gather_nd(work, coord)
        negated = layers.elementwise_sub(layers.zeros_like(current), current)
        work = layers.scatter_nd_add(work, coord, negated)

    output = layers.scatter_nd_add(work, coord, updates)
    if not is_float:
        output = layers.cast(output, dtype=orig_dtype)

    if not in_place:
        return output
    layers.assign(output, ref)
    return ref
def __init__(self, beam_size, batch_size, alpha, vocab_size, hidden_size):
    """Store beam-search settings and precompute index offset tensors.

    Args:
        beam_size: number of beams per batch item.
        batch_size: number of batch items.
        alpha: stored as ``self.alpha``; not used in this constructor.
        vocab_size: stored as ``self.vocab_size``.
        hidden_size: stored as ``self.hidden_size``.
    """
    self.beam_size = beam_size
    self.batch_size = batch_size
    self.alpha = alpha
    self.vocab_size = vocab_size
    self.hidden_size = hidden_size

    # One entry per (batch, 2 * beam) slot: batch index * beam_size.
    top2k_positions = layers.range(
        0, 2 * self.batch_size * beam_size, 1, 'int64')
    self.gather_top2k_append_index = \
        top2k_positions // (2 * self.beam_size) * (self.beam_size)

    # One entry per (batch, beam) slot: batch index * (2 * beam_size).
    topk_positions = layers.range(
        0, self.batch_size * beam_size, 1, 'int64')
    self.gather_topk_append_index = \
        topk_positions // self.beam_size * (2 * self.beam_size)

    # One entry per (batch, beam) slot: batch index * (3 * beam_size).
    finish_positions = layers.range(
        0, self.batch_size * beam_size, 1, 'int64')
    self.gather_finish_topk_append_index = \
        finish_positions // self.beam_size * (3 * self.beam_size)

    # [batch_size, 2 * beam_size] tensor filled with the value 1.
    self.eos_id = layers.fill_constant(
        [self.batch_size, 2 * self.beam_size], 'int64', value=1)

    # One entry per batch item: batch index * beam_size.
    self.get_alive_index = \
        layers.range(0, self.batch_size, 1, 'int64') * self.beam_size
def crop(masks, boxes, padding: int = 1):
    """
    "Crop" predicted masks by zeroing out everything outside the predicted bbox.

    Args:
        - masks should be a size [h, w, n] tensor of masks, where n is the
          number of positive samples
        - boxes should be a size [n, 4] tensor of bbox coords in relative
          point form
        - padding is forwarded to ``sanitize_coordinates``

    Returns:
        ``masks`` multiplied by a 0/1 mask that is 1 where
        x1 <= column < x2 and y1 <= row < y2 for the matching box.
    """
    h, w, n = (P.shape(masks)[axis] for axis in range(3))
    x1, x2 = sanitize_coordinates(boxes[:, 0], boxes[:, 2], w, padding, cast=False)
    y1, y2 = sanitize_coordinates(boxes[:, 1], boxes[:, 3], h, padding, cast=False)

    # Column (x) and row (y) index grids, both tiled to [h, w, n].
    x_grid = P.range(0, w, 1, 'int32')
    y_grid = P.range(0, h, 1, 'int32')
    x_grid = P.expand(P.reshape(x_grid, (1, -1, 1)), [h, 1, n])
    y_grid = P.expand(P.reshape(y_grid, (-1, 1, 1)), [1, w, n])
    x_grid.stop_gradient = True
    y_grid.stop_gradient = True

    x1 = P.reshape(x1, (1, 1, -1))
    x2 = P.reshape(x2, (1, 1, -1))
    y1 = P.reshape(y1, (1, 1, -1))
    y2 = P.reshape(y2, (1, 1, -1))
    for bound in (x1, x2, y1, y2):
        bound.stop_gradient = True

    def tile(bound):
        # Repeat a [1, 1, n] box bound over every pixel.
        return P.expand(bound, [h, w, 1])

    inside_left = P.cast(x_grid >= tile(x1), 'float32')
    inside_right = P.cast(x_grid < tile(x2), 'float32')
    inside_top = P.cast(y_grid >= tile(y1), 'float32')
    inside_bottom = P.cast(y_grid < tile(y2), 'float32')

    crop_mask = inside_left * inside_right * inside_top * inside_bottom
    return masks * crop_mask
def positional_encoding(tensor, start_index, omega):
    """Compute sinusoidal position encodings scaled by a per-example rate.

    Args:
        tensor: reference tensor used only for its shape (B, T, C); only T
            and C are read.
        start_index (int): first position index; positions cover
            ``start_index`` to ``start_index + T - 1``.
        omega: shape (B,), speaker position rates; its dtype is used for the
            generated ranges.

    Returns:
        Position encodings: ``sin`` of the phases concatenated with ``cos``
        of the phases along axis 2.
    """
    dtype = omega.dtype
    _, steps, channels = tensor.shape

    positions = F.range(start_index, start_index + steps, 1, dtype=dtype)
    even_channels = F.range(0, channels, 2, dtype=dtype)

    rate = F.unsqueeze(omega, [1, 2])
    time_axis = F.unsqueeze(positions, [1])
    wavelength = 10000 ** (even_channels / float(channels))
    phase = rate * time_axis / wavelength

    return F.concat([F.sin(phase), F.cos(phase)], axis=2)
def _build_position_ids(self, src_ids):
    """Build position ids 0..seq_len-1 for every row of ``src_ids``.

    Args:
        src_ids: token ids; axis 0 is read as the batch and axis 1 as the
            sequence length.

    Returns:
        int64 tensor [batch, seq_len, 1] with ``stop_gradient`` set.
    """
    shape = L.shape(src_ids)
    seq_len = shape[1]
    batch = shape[0]

    steps = L.range(0, seq_len, 1, dtype='int32')
    steps = L.reshape(steps, [1, seq_len, 1], inplace=True)
    tiled = L.expand(steps, [batch, 1, 1])

    result = L.cast(tiled, 'int64')
    result.stop_gradient = True
    return result
def get_mask_feats(self, inputs):
    """Fuse the selected feature levels into a single mask feature map.

    Levels ``self.start_level`` to ``self.end_level`` (e.g. p2..p5) are taken
    in the mapping's key order. Each level runs through its own conv stack in
    ``self.convs_all_levels`` and the results are summed; the level at index 3
    first receives two normalized coordinate channels. The sum is passed
    through ``self.conv_pred``.

    Args:
        inputs: mapping from level name to feature map [N, c, h, w].

    Returns:
        Output of ``self.conv_pred`` on the fused features.
    """
    level_names = list(inputs.keys())
    level_names = level_names[self.start_level:self.end_level + 1]  # [p2, p3, p4, p5]
    levels = [inputs[key] for key in level_names]

    fused = self.convs_all_levels[0][0](levels[0])
    for idx, feat in enumerate(levels[1:], start=1):
        if idx == 3:
            # Append x/y coordinate planes in [-1, 1] to this level.
            n = L.shape(feat)[0]
            height = L.shape(feat)[2]
            width = L.shape(feat)[3]
            height_f = L.cast(height, 'float32')
            width_f = L.cast(width, 'float32')

            ys = L.range(0., height_f, 1., dtype='float32')  # [h]
            ys = 2.0 * ys / (height_f - 1.0) - 1.0
            xs = L.range(0., width_f, 1., dtype='float32')  # [w]
            xs = 2.0 * xs / (width_f - 1.0) - 1.0

            xs = L.reshape(xs, (1, -1))  # [1, w]
            ys = L.reshape(ys, (-1, 1))  # [h, 1]
            x_plane = L.expand(xs, [height, 1])  # [h, w]
            y_plane = L.expand(ys, [1, width])  # [h, w]

            x_plane = L.reshape(x_plane, (1, 1, height, width))
            y_plane = L.reshape(y_plane, (1, 1, height, width))
            x_plane = L.expand(x_plane, [n, 1, 1, 1])  # [N, 1, h, w]
            y_plane = L.expand(y_plane, [n, 1, 1, 1])  # [N, 1, h, w]

            feat = L.concat([feat, x_plane, y_plane], axis=1)  # [N, c+2, h, w]

        for conv in self.convs_all_levels[idx]:
            feat = conv(feat)
        fused += feat

    return self.conv_pred(fused)
def forward(self, features):
    """Run an ERNIE encoder with a classification head on the pooled output.

    Args:
        features: pair ``(src_ids, sent_ids)``; id 0 in ``src_ids`` is
            assumed to be padding.

    Returns:
        Softmax probabilities in PREDICT mode, raw logits otherwise.
    """
    src_ids, sent_ids = features

    use_fp16 = self.hparam['fp16']
    mask_dtype = 'float16' if use_fp16 else 'float32'

    # Mask is 1 for real tokens and 0 for padding (pad id assumed to be 0).
    pad_id = L.fill_constant([1], dtype='int64', value=0)
    not_pad = L.logical_not(L.equal(src_ids, pad_id))
    input_mask = L.cast(not_pad, mask_dtype)

    ids_shape = L.shape(src_ids)
    seqlen = ids_shape[1]
    batch_size = ids_shape[0]

    # Position ids 0..seqlen-1 for every row, shaped [batch, seqlen, 1].
    pos_ids = L.range(0, seqlen, 1, dtype='int32')
    pos_ids = L.unsqueeze(pos_ids, axes=[0])
    pos_ids = L.expand(pos_ids, [batch_size, 1])
    pos_ids = L.unsqueeze(pos_ids, axes=[2])
    pos_ids = L.cast(pos_ids, 'int64')
    pos_ids.stop_gradient = True
    input_mask.stop_gradient = True

    # Constant task id for every token; not used at the moment.
    task_ids = L.zeros_like(src_ids) + self.hparam.task_id
    task_ids.stop_gradient = True

    bert = ErnieModel(
        src_ids=src_ids,
        position_ids=pos_ids,
        sentence_ids=sent_ids,
        task_ids=task_ids,
        input_mask=input_mask,
        config=self.hparam,
        use_fp16=self.hparam['fp16'])

    pooled = bert.get_pooled_output()
    pooled = L.dropout(
        x=pooled,
        dropout_prob=0.1,
        dropout_implementation="upscale_in_train")

    weight_attr = F.ParamAttr(
        name="cls_out_w",
        initializer=F.initializer.TruncatedNormal(scale=0.02))
    bias_attr = F.ParamAttr(
        name="cls_out_b",
        initializer=F.initializer.Constant(0.))
    logits = L.fc(
        input=pooled,
        size=self.hparam['num_label'],
        param_attr=weight_attr,
        bias_attr=bias_attr)

    propeller.summary.histogram('pred', logits)

    if self.mode is not propeller.RunMode.PREDICT:
        return logits
    return L.softmax(logits)
def generate_relative_positions_matrix(length, max_relative_position, cache=False):
    """Build a matrix of clipped, non-negative relative position ids.

    Without cache the result holds ``j - i`` for every pair of positions
    (length x length). With cache it is a single row holding
    ``-(length - 1) .. 0``. Distances are clipped to
    ``[-max_relative_position, max_relative_position]`` and then shifted by
    ``max_relative_position``.

    Args:
        length: sequence length.
        max_relative_position: clipping distance.
        cache: True to build only the single-row variant.

    Returns:
        int32 tensor of relative position ids.
    """
    if cache:
        distances = layers.range(-1 * length + 1, 1, 1, 'int32')
        distances.stop_gradient = True
        size = layers.shape(distances)
        distances = layers.reshape(distances, [1, size[0]])
    else:
        positions = layers.range(0, length, 1, 'int32')
        positions.stop_gradient = True
        size = layers.shape(positions)
        positions = layers.reshape(positions, shape=[1, size[0]])
        tiled = layers.expand(positions, [size[0], 1])
        # Entry (i, j) becomes j - i.
        distances = tiled - layers.transpose(tiled, [1, 0])

    clipped = layers.clip(
        layers.cast(distances, dtype="float32"),
        float(-max_relative_position),
        float(max_relative_position))
    # Shift so the smallest id is 0.
    return layers.cast(clipped, dtype='int32') + max_relative_position
def gen_bias(encoder_inputs, decoder_inputs, step):
    """Build the attention visibility mask for a decoding step.

    The result concatenates, on the last axis: an all-ones block over the
    encoder positions, an all-ones block of width ``step`` (only when
    ``step > 0``), and a lower-triangular block over the decoder positions.

    Args:
        encoder_inputs: encoder ids, presumably [bsz, encoder_len] -- TODO confirm.
        decoder_inputs: decoder ids; the first two static dims are read as
            (bsz, decoder_len).
        step: width of the extra all-ones block.

    Returns:
        float32 tensor [bsz, decoder_len, encoder_len (+ step) + decoder_len].
    """
    bsz, dec_len = decoder_inputs.shape[:2]

    # counts = 1..dec_len; (i+1)/(j+1) >= 1 holds exactly when j <= i, which
    # yields a lower-triangular (causal) mask.
    counts = L.reshape(
        L.range(0, dec_len, 1, dtype='float32') + 1, [1, -1, 1])
    ratio = L.matmul(counts, 1. / counts, transpose_y=True)
    causal = L.cast((ratio >= 1.), 'float32')  # [1, dec_len, dec_len]

    enc_visible = L.unsqueeze(
        L.cast(L.ones_like(encoder_inputs), 'float32'), [1])  # [bsz, 1, enc_len]
    enc_visible = L.expand(enc_visible, [1, dec_len, 1])  # [bsz, dec_len, enc_len]
    causal = L.expand(causal, [bsz, 1, 1])  # [bsz, dec_len, dec_len]

    parts = [enc_visible]
    if step > 0:
        parts.append(L.ones([bsz, dec_len, step], 'float32'))
    parts.append(causal)
    return L.concat(parts, -1)
def _ranking(self, inputs, predictions):
    """Score every generated response candidate for reranking.

    For each latent index the candidate is embedded, appended to the source
    embedding, prefixed with the mask embedding and run through all layers;
    the output at position 0 is fed to ``self.discriminator``.

    Args:
        inputs: dict providing "src_token", "src_mask", "src_pos",
            "src_type" and "src_turn".
        predictions: generated token ids, [batch_size, num_latent, seq_len].

    Returns:
        Scores stacked on axis 1, one per latent index.
    """
    src_mask = inputs["src_mask"]
    src_embed = self.embedder(
        inputs["src_token"],
        inputs["src_pos"],
        inputs["src_type"],
        inputs["src_turn"])

    batch_size, num_latent, tgt_seq_len = predictions.shape

    # shape: [batch_size, num_latent, seq_len, 1]
    cand_tokens = F.unsqueeze(predictions, [3])
    cand_masks = F.not_equal(cand_tokens, self.padding_idx, "int64")

    cand_pos = layers.range(0, tgt_seq_len, 1, dtype="float32")
    cand_pos = F.unsqueeze(cand_pos, [0, 0, 1])
    cand_pos = layers.expand(cand_pos, [batch_size, num_latent, 1, 1])
    cand_pos = layers.cast(cand_pos, "int64")
    cand_types = layers.zeros_like(cand_tokens)
    cand_turns = layers.zeros_like(cand_tokens)

    per_latent_scores = []
    for k in range(num_latent):
        token_k = cand_tokens[:, k]
        mask_k = cand_masks[:, k]
        pos_k = cand_pos[:, k]
        type_k = cand_types[:, k]
        turn_k = cand_turns[:, k]

        input_mask = layers.concat([src_mask, mask_k], axis=1)
        input_mask.stop_gradient = True

        cand_embed = self.embedder(token_k, pos_k, type_k, turn_k)
        embed = layers.concat([src_embed, cand_embed], axis=1)
        embed = self.embed_layer_norm(embed)

        # Prepend the mask embedding as the head position.
        head = layers.expand(self.mask_embed, [batch_size, 1, 1])
        head = self.embed_layer_norm(head)

        hidden = layers.concat([head, embed], axis=1)
        attn_mask = self._create_mask(input_mask, append_head=True)

        for layer in self.layers:
            hidden = layer(hidden, attn_mask, None)

        head_out = hidden[:, 0]
        score = self.discriminator(head_out)
        per_latent_scores.append(score[:, 0])

    return layers.stack(per_latent_scores, axis=1)
def batch_gather(var, indices):
    """Pick one slice per batch row of ``var`` using the matching index.

    Args:
        var (Variable): with shape [batch_size, ...]; a 1-D input is treated
            as [batch_size, 1] and the result is flattened back to 1-D.
        indices (Variable): with shape [batch_size, 1] or [batch_size]

    Returns:
        Variable: ``var[b, indices[b]]`` for every row ``b``.

    Raises:
        ValueError: if ``indices`` has two or more dims and its last dim is
            not 1.

    Examples:
        var
            [[1, 2, 3],
             [4, 5, 6]]
        indices
            [[2], [1]]

        return
            [[3], [5]]
    """
    idx_rank = len(indices.shape)
    if idx_rank >= 2 and indices.shape[-1] != 1:
        raise ValueError(
            'shape of indices error. it should be a 1-D layers, or a 2-D layers which '
            'the 2nd dimension is 1. but got shape = %s' %
            (str(indices.shape), ))
    if idx_rank == 1:
        indices = layers.reshape(indices, shape=[-1, 1])

    var_is_1d = len(var.shape) == 1
    if var_is_1d:
        var = PaddleFluidWrapper.reshape(var, shape=[-1, 1])

    # Pair each row number with its index to get (row, col) coordinates.
    idx_dtype = indices.dtype
    n_rows = layers.cast(layers.shape(indices)[0], dtype=idx_dtype)
    start = layers.fill_constant(shape=[1], dtype=idx_dtype, value=0)
    step = layers.fill_constant(shape=[1], dtype=idx_dtype, value=1)
    row_ids = layers.unsqueeze(
        layers.range(start, n_rows, step, dtype=idx_dtype), [1])

    coord = layers.concat([row_ids, indices], axis=1)
    coord.stop_gradient = True
    gathered = layers.gather_nd(var, coord)

    if var_is_1d:
        gathered = PaddleFluidWrapper.reshape(gathered, shape=[-1])
    return gathered
def gather_2d_by_gather(tensor_nd, beam_idx, beam_size, batch_size, need_flat=True):
    """Select beams per batch item with a single flat ``gather``.

    Beam indices are turned into flat row numbers by adding
    ``batch_index * beam_size`` to every entry of the matching row.

    Args:
        tensor_nd: source tensor; merged with ``merge_beam_dim`` when
            ``need_flat`` is True, otherwise used as given.
        beam_idx: beam indices per batch item, indexed [batch, k].
        beam_size: number of beams per batch item.
        batch_size: number of batch items.
        need_flat: whether to merge the beam dim before gathering and
            restore a [batch_size, k, ...] shape afterwards.

    Returns:
        Gathered tensor: reshaped to [batch_size, k] + ``tensor_nd.shape[2:]``
        when ``need_flat`` is True, otherwise the flat gather result.
    """
    batch_offset = layers.range(0, batch_size, 1, dtype="int64") * beam_size

    if need_flat:
        source = merge_beam_dim(tensor_nd)
    else:
        source = tensor_nd

    flat_idx = layers.elementwise_add(beam_idx, batch_offset, 0)
    flat_idx = layers.reshape(flat_idx, [-1])
    picked = layers.gather(source, flat_idx)

    if not need_flat:
        return picked
    out_shape = [batch_size, beam_idx.shape[1]] + tensor_nd.shape[2:]
    return layers.reshape(picked, shape=out_shape)
def batch_gather_2d(var, indices):
    """Gather several slices per batch row of ``var``.

    For every row ``b`` and every column ``t`` of ``indices`` the result
    holds ``var[b, indices[b, t]]``.

    Args:
        var (Variable): with shape [batch_size, n, ...]
        indices (Variable): with shape [batch_size, max_len]

    Returns:
        Variable with shape [batch_size, max_len] + var.shape[2:]. For a 2-D
        ``var`` this is [batch_size, max_len]; for a 3-D ``var`` it is
        [batch_size, max_len, var.shape[-1]].

    Raises:
        ValueError: if ``indices`` is not 2-D.

    Examples:
        var
            [[1, 2, 3],
             [4, 5, 6]]
        indices
            [[2, 0], [1, 2]]

        return
            [[3, 1], [5, 6]]
    """
    if len(indices.shape) != 2:
        raise ValueError('shape of indices error. it should be a 2-D layers. '
                         'but got shape = %s' % (str(indices.shape), ))

    batch_size = layers.shape(indices)[0]

    zero = layers.fill_constant(shape=[1], dtype=indices.dtype, value=0)
    one = layers.fill_constant(shape=[1], dtype=indices.dtype, value=1)
    end = layers.cast(batch_size, dtype=indices.dtype)
    batch_indices_1d = layers.unsqueeze(
        layers.range(zero, end, one, dtype=indices.dtype), [1])

    seq_len = indices.shape[1]
    batch_indices = layers.expand(batch_indices_1d, [1, seq_len])

    # (row, col) coordinate for every requested element.
    coord_2d = layers.concat(
        [layers.unsqueeze(batch_indices, [2]),
         layers.unsqueeze(indices, [2])],
        axis=2)
    coord_2d.stop_gradient = True
    coord_1d = layers.reshape(coord_2d, shape=[-1, 2])

    output_1d = layers.gather_nd(var, coord_1d)
    # Two-column coordinates consume the first two axes of var, so what is
    # left of each gathered item is var.shape[2:]. Appending exactly those
    # dims (none for a 2-D var) keeps the reshape valid for every rank,
    # including the 2-D example in the docstring.
    output_shape = [batch_size, seq_len] + list(var.shape[2:])
    output_2d = layers.reshape(output_1d, output_shape)
    return output_2d
def index_sample(x, index):
    """Select input values according to index.

    For every row ``b`` and every column ``k`` of ``index`` the item
    ``x[b, index[b, k]]`` is gathered; the gathered items of one row are laid
    out side by side in the output row.

    Args:
        x: input matrix, [batch, n, ...] with static trailing dims.
        index: index matrix, [batch, k].

    Returns:
        Tensor with ``k * prod(x.shape[2:])`` columns; [batch, k] for a 2-D
        ``x``.

    >>> input
    [
        [1, 2, 3],
        [4, 5, 6]
    ]
    >>> index
    [
        [1, 2],
        [0, 1]
    ]
    >>> index_sample(input, index)
    [
        [2, 3],
        [4, 5]
    ]
    """
    x_s = x.shape
    dim = len(index.shape) - 1
    assert x_s[:dim] == index.shape[:dim]
    num_picks = index.shape[1]

    r_x = layers.reshape(x, shape=(-1, *x_s[dim:]))
    index = layers.reshape(index, shape=(index.shape[0], num_picks, 1))

    # Generate an arange over the batch, shaped like index.
    batch_size = layers.cast(layers.shape(index)[0], dtype=index.dtype)
    zero = layers.fill_constant(shape=[1], dtype=index.dtype, value=0)
    one = layers.fill_constant(shape=[1], dtype=index.dtype, value=1)
    arr_index = layers.unsqueeze(
        layers.range(zero, batch_size, one, dtype=index.dtype), [1, 2])
    arr_index = layers.expand_as(arr_index, index)

    # Pair every index with its row number: (row, col) coordinates.
    new_index = layers.concat([arr_index, index], -1)
    new_index = layers.reshape(new_index, (-1, 2))

    out = layers.gather_nd(r_x, new_index)

    # Each gathered item holds prod(x.shape[2:]) values (1 for a 2-D x).
    # The row width therefore follows the number of indices per row rather
    # than a fixed factor of 2, which made the documented example fail.
    item_size = 1
    for extent in x_s[dim + 1:]:
        item_size *= extent
    out = layers.reshape(out, (-1, num_picks * item_size))
    return out
def build_and_run_program(place, batch_size, beam_size, stop_gradient=False):
    """Build a small while-loop program around ``gather_nd`` and run it once.

    The program gathers entries of a random [batch_size, beam_size, 32]
    tensor inside a 10-step while loop, averages the collected scores into a
    loss, minimizes it with Adam and executes a single step.

    Args:
        place: device place handed to ``fluid.Executor``.
        batch_size: number of rows of the random tensor and of the fed indices.
        beam_size: number of beams; also the exclusive upper bound of the
            random indices.
        stop_gradient: value assigned to ``stop_gradient`` on the batch-size
            tensor and on the gather coordinates.

    Returns:
        The fetched loss value.
    """
    fluid.default_startup_program().random_seed = 1
    fluid.default_main_program().random_seed = 1
    np.random.seed(2)

    x = layers.assign(
        np.random.rand(batch_size, beam_size, 32).astype("float32"))
    indices = fluid.data(shape=[None, beam_size], dtype="int64", name="indices")
    step_idx = layers.fill_constant(
        shape=[1], dtype="int64", value=0, force_cpu=True)
    max_len = layers.fill_constant(
        shape=[1], dtype="int64", value=10, force_cpu=True)
    cond = layers.less_than(x=step_idx, y=max_len)
    while_op = layers.While(cond)
    scores = layers.array_write(x, step_idx)
    with while_op.block():
        bs = layers.cast(layers.shape(x)[0], "int64")
        # Chain of 20 casts between the shape op and its use.
        for _ in range(20):
            bs = layers.cast(bs, 'int64')
        bs.stop_gradient = stop_gradient
        batch_pos = layers.expand(
            layers.unsqueeze(
                layers.range(0, bs, 1, dtype=bs.dtype), [1]),
            [1, beam_size])
        topk_coordinates = layers.stack([batch_pos, indices], axis=2)
        topk_coordinates.stop_gradient = stop_gradient
        score = layers.gather_nd(x, topk_coordinates)
        layers.increment(x=step_idx, value=1.0, in_place=True)
        layers.array_write(score, i=step_idx, array=scores)
        length_cond = layers.less_than(x=step_idx, y=max_len)
        layers.assign(length_cond, cond)

    out = layers.tensor_array_to_tensor(scores, axis=0, use_stack=True)[0]
    loss = layers.reduce_mean(out)
    opt = fluid.optimizer.Adam(0.01)
    opt.minimize(loss)
    exe = fluid.Executor(place)

    # np.random.random_integers is deprecated; randint with an exclusive
    # upper bound of high + 1 is the documented replacement and covers the
    # same closed range [0, beam_size - 1].
    data = np.random.randint(
        low=0, high=beam_size,
        size=(batch_size, beam_size)).astype("int64")
    loss_val, = exe.run(feed={"indices": data}, fetch_list=[loss])

    return loss_val
def __init__(self, input_mask):
    """Build the edge structure used by the wrapper from an input mask.

    Every entry of the flattened mask is treated as one node. Edges come from
    ``build_edges``; the unique destinations and the exclusive cumulative
    edge counts per destination are stored as well.

    Args:
        input_mask: mask tensor; axis 1 is read as the maximum sequence length.
    """
    super(BigBirdWrapper, self).__init__()

    max_seqlen = L.shape(input_mask)[1]
    flat_mask = L.reshape(input_mask, [-1])
    num_nodes = L.shape(flat_mask)[0]

    edges_src, edges_dst = build_edges(num_nodes, flat_mask, max_seqlen)
    self._edges_src = edges_src
    self._edges_dst = edges_dst
    self._edges_src.stop_gradient = True
    self._edges_dst.stop_gradient = True

    self._num_nodes = num_nodes
    self._num_edges = L.shape(self._edges_src)[0]
    self._node_ids = L.range(0, self._num_nodes, step=1, dtype="int32")

    self._edge_uniq_dst, _, per_dst_count = L.unique_with_counts(
        self._edges_dst, dtype="int32")
    self._edge_uniq_dst.stop_gradient = True

    # Exclusive prefix sums of the per-destination counts, followed by the
    # total number of counted edges.
    total = L.reduce_sum(per_dst_count, keep_dim=True)
    offsets = L.cumsum(per_dst_count, exclusive=True)
    self._edge_uniq_dst_count = L.concat([offsets, total])
    self._edge_uniq_dst_count.stop_gradient = True
def forward(self, features):
    """Run an ERNIE encoder with a per-token classification head.

    Args:
        features: triple ``(src_ids, sent_ids, input_seqlen)``; id 0 in
            ``src_ids`` is assumed to be padding.

    Returns:
        Tuple ``(logits, input_seqlen)`` where ``logits`` holds one score
        vector of size ``self.num_label`` per token.
    """
    src_ids, sent_ids, input_seqlen = features
    zero = L.fill_constant([1], dtype='int64', value=0)
    # The mask must be 1 for real tokens and 0 for padding (pad id assumed
    # to be 0). A bare equal() marks the padding instead, so it is negated.
    # NOTE(review): relies on ErnieModel treating input_mask as "1 = attend"
    # -- confirm against the model.
    input_mask = L.cast(L.logical_not(L.equal(src_ids, zero)), 'float32')

    d_shape = L.shape(src_ids)
    seqlen = d_shape[1]
    batch_size = d_shape[0]

    # Position ids 0..seqlen-1 for every row, shaped [batch, seqlen, 1].
    pos_ids = L.unsqueeze(L.range(0, seqlen, 1, dtype='int32'), axes=[0])
    pos_ids = L.expand(pos_ids, [batch_size, 1])
    pos_ids = L.unsqueeze(pos_ids, axes=[2])
    pos_ids = L.cast(pos_ids, 'int64')
    pos_ids.stop_gradient = True
    input_mask.stop_gradient = True

    # Constant task id for every token; not used at the moment.
    task_ids = L.zeros_like(src_ids) + self.hparam.task_id
    task_ids.stop_gradient = True

    model = ErnieModel(
        src_ids=src_ids,
        position_ids=pos_ids,
        sentence_ids=sent_ids,
        task_ids=task_ids,
        input_mask=input_mask,
        config=self.hparam,
        use_fp16=self.hparam['use_fp16'])

    enc_out = model.get_sequence_output()
    logits = L.fc(
        input=enc_out,
        size=self.num_label,
        num_flatten_dims=2,
        param_attr=F.ParamAttr(
            name="cls_seq_label_out_w",
            initializer=F.initializer.TruncatedNormal(scale=0.02)),
        bias_attr=F.ParamAttr(
            name="cls_seq_label_out_b",
            initializer=F.initializer.Constant(0.)))

    propeller.summary.histogram('pred', logits)

    return logits, input_seqlen
def forward(self, indices, speaker_position_rate=None):
    """Look up position embeddings scaled by a per-utterance position rate.

    Args:
        indices (Variable): shape (B, T), dtype: int64, position indices,
            where B means the batch size, T means the time steps.
        speaker_position_rate (Variable | float, optional), position rate.
            A float or a Variable with shape (1,) is used for every example.
            A Variable with shape (B, ) holds one rate per utterance.

    Returns:
        out (Variable): shape(B, T, C_pos), dtype float32, position
            embedding, where C_pos means position embedding size.
    """
    batch_size, time_steps = indices.shape

    # Normalize speaker_position_rate to a Variable with shape (B, ).
    rate = speaker_position_rate
    if isinstance(rate, float):
        rate = dg.to_variable(np.array([rate]).astype("float32"))
        rate = F.expand(rate, [batch_size])
    elif isinstance(rate, fluid.framework.Variable) \
            and list(rate.shape) == [1]:
        rate = F.expand(rate, [batch_size])
    assert len(rate.shape) == 1 and \
        list(rate.shape) == [batch_size]

    weight = compute_position_embedding(self.weight, rate)  # (B, V, C)

    # Pair every position index with its batch row for gather_nd.
    row_ids = F.range(0, batch_size, 1, dtype="int64")
    row_ids = F.unsqueeze(row_ids, [1])
    row_ids = F.expand(row_ids, [1, time_steps])
    gather_nd_id = F.stack([row_ids, indices], -1)  # (B, T, 2)

    return F.gather_nd(weight, gather_nd_id)
def build_edges(num_nodes, input_mask, max_seqlen):
    """Build the sparse-attention edge list: window, global and random edges.

    Candidate (src, dst) pairs are generated for every node, each candidate
    set is passed through ``select_edges`` together with ``input_mask``,
    ``num_nodes`` and ``max_seqlen``, and the surviving edges are
    concatenated and handed to ``uniq_edges``.

    Args:
        num_nodes: total number of nodes; used as the end of the node-id
            range and as the shape of the random draw.
        input_mask: forwarded unchanged to ``select_edges``.
        max_seqlen: length of one sequence segment; node ids are grouped
            into consecutive runs of this size.

    Returns:
        tuple: ``(sorted_src, sorted_dst)`` as returned by ``uniq_edges``.
    """
    edges = L.range(start=0, end=num_nodes, step=1, dtype="int32")

    def keep_valid(src, dst):
        # Filter one candidate edge set with the shared mask/size arguments.
        return select_edges(src, dst, input_mask, num_nodes, max_seqlen)

    # Window edges.
    all_edges = [
        keep_valid(edges - 1, edges),  # win-1
        keep_valid(edges + 1, edges),  # win-2
        keep_valid(edges, edges),  # self-loop
    ]

    # Global edges. Assumes [CLS] is the first token of each segment.
    # NOTE(review): this relies on `/` being integer (floor) division for
    # int32 tensors, so the result is the first node id of each
    # max_seqlen-sized segment -- confirm for the Paddle version in use.
    cls_position = edges / max_seqlen * max_seqlen
    # vertical cls-window attention
    all_edges.append(keep_valid(cls_position, edges))
    # horizontal cls attention
    all_edges.append(keep_valid(edges, cls_position))

    # Random edges: two draws, each picking a uniformly random offset in
    # [0, max_seqlen) from the node's own segment start.
    for _ in range(2):
        rand_edge = L.floor(
            L.uniform_random(min=0, max=1, shape=[num_nodes]) *
            L.cast(max_seqlen, dtype="float32"))
        rand_edge = L.cast(rand_edge, dtype="int32") + cls_position
        all_edges.append(keep_valid(rand_edge, edges))

    # all_edges always holds seven entries here, so concatenate directly.
    src = L.concat([s for s, _ in all_edges], 0)
    dst = L.concat([d for _, d in all_edges], 0)

    # sort edges
    sorted_src, sorted_dst = uniq_edges(src, dst, num_nodes)
    return sorted_src, sorted_dst
def forward(self,
            src_ids,
            sent_ids=None,
            pos_ids=None,
            input_mask=None,
            attn_bias=None,
            past_cache=None,
            use_causal_mask=False):
    """
    Args:
        src_ids (`Variable` of shape `[batch_size, seq_len]`):
            Indices of input sequence tokens in the vocabulary.
        sent_ids (optional, `Variable` of shape `[batch_size, seq_len]`):
            aka token_type_ids, Segment token indices to indicate first and second portions of the inputs.
            if None, assume all tokens come from `segment_a`
        pos_ids(optional, `Variable` of shape `[batch_size, seq_len]`):
            Indices of positions of each input sequence tokens in the position embeddings.
            if None, positions `0 .. seq_len-1` are used for every example.
        input_mask(optional `Variable` of shape `[batch_size, seq_len]`):
            Mask to avoid performing attention on the padding token indices of the encoder input.
            if None (and `attn_bias` is None), it is derived as `src_ids != 0`.
        attn_bias(optional, `Variable` of shape `[batch_size, seq_len, seq_len]`):
            3D version of `input_mask`; if set, overrides `input_mask` and `use_causal_mask`.
            Must be rank 3. Entries equal to 1 are attended to; entries equal to 0
            receive an additive bias of -10000.
        past_cache(optional, tuple of two lists: cached key and cached value,
            each is a list of `Variable`s of shape `[batch_size, seq_len, hidden_size]`):
            cached key/value tensor that will be concated to generated key/value when performing self attention.
            if set, `attn_bias` should not be None.
        use_causal_mask(optional, bool):
            if True and `attn_bias` is None, the mask built from `input_mask` is
            additionally restricted so that position i only attends to positions j <= i.

    Returns:
        pooled (`Variable` of shape `[batch_size, hidden_size]`):
            output logits of pooler classifier; None when the model has no pooler
        encoded(`Variable` of shape `[batch_size, seq_len, hidden_size]`):
            output logits of transformer stack
        additional_info (dict, only when `self.return_additional_info` is set):
            `{'hiddens': ..., 'caches': ...}` as returned by the encoder stack
    """
    assert len(
        src_ids.shape
    ) == 2, 'expect src_ids.shape = [batch, sequecen], got %s' % (repr(
        src_ids.shape))
    assert attn_bias is not None if past_cache else True, 'if `past_cache` is specified; attn_bias should not be None'
    d_seqlen = L.shape(src_ids)[1]

    if pos_ids is None:
        # Default positions, shape [1, seq_len].
        pos_ids = L.reshape(L.range(0, d_seqlen, 1, dtype='int32'), [1, -1])
        pos_ids = L.cast(pos_ids, 'int64')

    if attn_bias is None:
        if input_mask is None:
            input_mask = L.cast(src_ids != 0, 'float32')
        assert len(input_mask.shape) == 2
        # Outer product of the mask with itself: a pair (i, j) is kept only
        # when both positions are unmasked.
        input_mask = L.unsqueeze(input_mask, axes=[-1])
        attn_bias = L.matmul(input_mask, input_mask, transpose_y=True)
        if use_causal_mask:
            # (i + 1) / (j + 1) >= 1 holds exactly when j <= i, which gives a
            # lower-triangular mask.
            sequence = L.reshape(
                L.range(
                    0, d_seqlen, 1, dtype='float32') + 1., [1, 1, -1, 1])
            causal_mask = L.cast(
                (L.matmul(
                    sequence, 1. / sequence, transpose_y=True) >= 1.),
                'float32')
            attn_bias *= causal_mask
    else:
        # The shape is wrapped in a 1-tuple so that %-formatting works even
        # when `shape` is itself a tuple; otherwise the message raises
        # TypeError instead of the intended AssertionError.
        assert len(
            attn_bias.shape
        ) == 3, 'expect attn_bias tobe rank 3, got %r' % (attn_bias.shape, )

    # Turn the 0/1 mask into an additive bias: 0 where attention is allowed,
    # -10000 where it is masked; then add and tile the head axis.
    attn_bias = (1. - attn_bias) * -10000.0
    attn_bias = L.unsqueeze(attn_bias, [1])
    attn_bias = L.expand(attn_bias,
                         [1, self.n_head, 1, 1])  # avoid broadcast =_=
    attn_bias.stop_gradient = True

    if sent_ids is None:
        sent_ids = L.zeros_like(src_ids)

    src_embedded = self.word_emb(src_ids)
    pos_embedded = self.pos_emb(pos_ids)
    sent_embedded = self.sent_emb(sent_ids)
    embedded = src_embedded + pos_embedded + sent_embedded

    embedded = self.dropout(self.ln(embedded))

    encoded, hidden_list, cache_list = self.encoder_stack(
        embedded, attn_bias, past_cache=past_cache)

    # Pool from the first token's encoding when a pooler is configured.
    if self.pooler is not None:
        pooled = self.pooler(encoded[:, 0, :])
    else:
        pooled = None

    additional_info = {
        'hiddens': hidden_list,
        'caches': cache_list,
    }

    if self.return_additional_info:
        return pooled, encoded, additional_info
    else:
        return pooled, encoded