def get_feature_by_coordinate(self, x, coord, offset_h, offset_w, padded_x_w):
    # Gather the input features addressed by the sampling coordinates of a deformable convolution.
    x = paddle.reshape(x, [0, 0, -1])  # [N, C, H*W]
    # Flatten the 2-D coordinates into 1-D indices: offset_x * w + offset_y
    index = paddle.cast(
        coord[:, :, :, :self.N] * padded_x_w, dtype='int64') + coord[:, :, :, self.N:]
    index = paddle.unsqueeze(index, 1)
    index = paddle.tile(index, [1, self.in_channel, 1, 1, 1])
    index = paddle.reshape(index, (0, 0, -1))

    # Swap axis 0 and axis 2 of both tensors so the gather can be done on a flat view.
    x_range = list(range(3))
    dim = 2
    x_range[0] = dim
    x_range[dim] = 0
    x_swaped = paddle.transpose(x, perm=x_range)

    index_range = list(range(3))
    index_range[0] = dim
    index_range[dim] = 0
    index_swaped = paddle.transpose(index, perm=index_range)

    x_shape = layers.shape(x_swaped)
    index_shape = layers.shape(index_swaped)
    prod = paddle.prod(x_shape[1:], keepdim=True)

    x_swaped_flattend = paddle.reshape(x_swaped, [-1])
    index_swaped_flattend = paddle.reshape(index_swaped, [-1])
    # Turn per-row indices into indices of the fully flattened tensor.
    index_swaped_flattend *= prod
    bias = paddle.arange(start=0, end=prod, step=1, dtype='float32')
    bias = paddle.tile(bias, index_shape[0])
    index_swaped_flattend += bias

    gathered = paddle.gather(x_swaped_flattend, index_swaped_flattend)
    gathered = paddle.reshape(gathered, layers.shape(index_swaped))

    x_offset = paddle.transpose(gathered, perm=x_range)
    x_offset = paddle.reshape(
        x_offset, (-1, self.in_channel, offset_h, offset_w, self.N))
    return x_offset
def intersect(box_a, box_b):  # area of the intersection region
    """ We resize both tensors to [A,B,2] without new malloc:
    [A,2] -> [A,1,2] -> [A,B,2]
    [B,2] -> [1,B,2] -> [A,B,2]
    Then we compute the area of intersect between box_a and box_b.
    Args:
      box_a: (tensor) bounding boxes, Shape: [n,A,4].
      box_b: (tensor) bounding boxes, Shape: [n,B,4].
    Return:
      (tensor) intersection area, Shape: [n,A,B].
    """
    n = P.shape(box_a)[0]
    A = P.shape(box_a)[1]
    B = P.shape(box_b)[1]
    box_a = P.reshape(box_a, (n, A, 1, 4))
    box_b = P.reshape(box_b, (n, 1, B, 4))
    expand_box_a = P.expand(box_a, [1, 1, B, 1])
    expand_box_b = P.expand(box_b, [1, A, 1, 1])

    # Top-left and bottom-right corners of the intersection rectangles
    left_up = P.elementwise_max(expand_box_a[:, :, :, :2], expand_box_b[:, :, :, :2])
    right_down = P.elementwise_min(expand_box_a[:, :, :, 2:], expand_box_b[:, :, :, 2:])

    inter_section = P.relu(right_down - left_up)
    return inter_section[:, :, :, 0] * inter_section[:, :, :, 1]
def jaccard(box_a, box_b, iscrowd: bool = False):
    """Compute the jaccard overlap of two sets of boxes.

    The jaccard overlap is simply the intersection over union of two boxes.
    Here we operate on ground truth boxes and default boxes. If iscrowd=True,
    put the crowd in box_b.
    E.g.:
        A ∩ B / A ∪ B = A ∩ B / (area(A) + area(B) - A ∩ B)
    Args:
        box_a: (tensor) Ground truth bounding boxes, Shape: [num_objects,4]
        box_b: (tensor) Prior boxes from priorbox layers, Shape: [num_priors,4]
    Return:
        jaccard overlap: (tensor) Shape: [box_a.size(0), box_b.size(0)]
    """
    use_batch = True
    if len(box_a.shape) == 2:
        use_batch = False
        box_a = P.reshape(box_a, (1, P.shape(box_a)[0], P.shape(box_a)[1]))
        box_b = P.reshape(box_b, (1, P.shape(box_b)[0], P.shape(box_b)[1]))

    inter = intersect(box_a, box_b)
    area_a = (box_a[:, :, 2] - box_a[:, :, 0]) * (box_a[:, :, 3] - box_a[:, :, 1])
    area_a = P.reshape(area_a, (P.shape(area_a)[0], P.shape(area_a)[1], 1))
    area_a = P.expand(area_a, [1, 1, P.shape(inter)[2]])
    area_b = (box_b[:, :, 2] - box_b[:, :, 0]) * (box_b[:, :, 3] - box_b[:, :, 1])
    area_b = P.reshape(area_b, (P.shape(area_b)[0], 1, P.shape(area_b)[1]))
    area_b = P.expand(area_b, [1, P.shape(inter)[1], 1])
    union = area_a + area_b - inter

    out = inter / area_a if iscrowd else inter / union
    return out if use_batch else out[0]
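# Illustrative only: a minimal NumPy sketch of the same pairwise-IoU broadcasting trick used by
# intersect()/jaccard() above, assuming corner-format (x0, y0, x1, y1) boxes. It is not part of the
# original model code.
import numpy as np

def pairwise_iou_np(box_a, box_b):
    """box_a: [A, 4], box_b: [B, 4]; returns an [A, B] IoU matrix."""
    a = box_a[:, None, :]                    # [A, 1, 4]
    b = box_b[None, :, :]                    # [1, B, 4]
    lt = np.maximum(a[..., :2], b[..., :2])  # intersection top-left corners
    rb = np.minimum(a[..., 2:], b[..., 2:])  # intersection bottom-right corners
    wh = np.clip(rb - lt, 0, None)           # clamp negative overlap to 0 (same role as relu above)
    inter = wh[..., 0] * wh[..., 1]
    area_a = (box_a[:, 2] - box_a[:, 0]) * (box_a[:, 3] - box_a[:, 1])
    area_b = (box_b[:, 2] - box_b[:, 0]) * (box_b[:, 3] - box_b[:, 1])
    union = area_a[:, None] + area_b[None, :] - inter
    return inter / np.maximum(union, 1e-9)

# quick check: a box against itself gives IoU 1, a disjoint box gives 0
print(pairwise_iou_np(np.array([[0., 0., 2., 2.]]),
                      np.array([[0., 0., 2., 2.], [3., 3., 4., 4.]])))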
def concat_coord(x):
    ins_feat = x  # [N, c, h, w]
    batch_size = L.shape(x)[0]
    h = L.shape(x)[2]
    w = L.shape(x)[3]
    float_h = L.cast(h, 'float32')
    float_w = L.cast(w, 'float32')

    y_range = L.range(0., float_h, 1., dtype='float32')  # [h, ]
    y_range = 2.0 * y_range / (float_h - 1.0) - 1.0
    x_range = L.range(0., float_w, 1., dtype='float32')  # [w, ]
    x_range = 2.0 * x_range / (float_w - 1.0) - 1.0
    x_range = L.reshape(x_range, (1, -1))  # [1, w]
    y_range = L.reshape(y_range, (-1, 1))  # [h, 1]
    x = L.expand(x_range, [h, 1])  # [h, w]
    y = L.expand(y_range, [1, w])  # [h, w]

    x = L.reshape(x, (1, 1, h, w))  # [1, 1, h, w]
    y = L.reshape(y, (1, 1, h, w))  # [1, 1, h, w]
    x = L.expand(x, [batch_size, 1, 1, 1])  # [N, 1, h, w]
    y = L.expand(y, [batch_size, 1, 1, 1])  # [N, 1, h, w]

    ins_kernel_feat = L.concat([ins_feat, x, y], axis=1)  # [N, c+2, h, w]
    return ins_kernel_feat
def fast_nms(self, boxes, scores, masks, max_num_detections=100):
    iou_threshold = self.nms_thresh
    top_k = self.top_k

    # Sort each class's boxes by score in descending order
    scores, idx = P.argsort(scores, axis=1, descending=True)
    idx = idx[:, :top_k]
    scores = scores[:, :top_k]

    num_classes, num_dets = P.shape(idx)[0], P.shape(idx)[1]

    idx = P.reshape(idx, (-1, ))
    boxes = P.gather(boxes, idx)
    boxes = P.reshape(boxes, (num_classes, num_dets, 4))
    masks = P.gather(masks, idx)
    masks = P.reshape(masks, (num_classes, num_dets, -1))

    # Compute a c x n x n IoU matrix: each n x n slice holds the pairwise IoU of
    # that class's n candidate boxes.
    iou = jaccard(boxes, boxes)
    # Because IoU of a box with itself is 1 and IoU(A, B) = IoU(B, A), post-process
    # the IoU matrix by zeroing the diagonal and the lower triangle of every channel.
    rows = P.range(0, num_dets, 1, 'int32')
    cols = P.range(0, num_dets, 1, 'int32')
    rows = P.expand(P.reshape(rows, (1, -1)), [num_dets, 1])
    cols = P.expand(P.reshape(cols, (-1, 1)), [1, num_dets])
    tri_mask = P.cast(rows > cols, 'float32')
    tri_mask = P.expand(P.reshape(tri_mask, (1, num_dets, num_dets)), [num_classes, 1, 1])
    iou = tri_mask * iou
    iou_max = P.reduce_max(iou, dim=1)

    # Now just filter out the ones higher than the threshold
    keep = P.where(iou_max <= iou_threshold)

    # Assign each kept detection to its corresponding class
    classes = P.range(0, num_classes, 1, 'int32')
    classes = P.expand(P.reshape(classes, (-1, 1)), [1, num_dets])
    classes = P.gather_nd(classes, keep)

    boxes = P.gather_nd(boxes, keep)
    masks = P.gather_nd(masks, keep)
    scores = P.gather_nd(scores, keep)

    # Only keep the top cfg.max_num_detections highest scores across all classes
    scores, idx = P.argsort(scores, axis=0, descending=True)
    idx = idx[:max_num_detections]
    scores = scores[:max_num_detections]

    classes = P.gather(classes, idx)
    boxes = P.gather(boxes, idx)
    masks = P.gather(masks, idx)
    return boxes, masks, classes, scores
def fast_nms(boxes, scores, conf_thresh, nms_thresh, keep_top_k, nms_top_k):
    '''
    :param boxes:  [?, 4]
    :param scores: [80, ?]
    '''
    # Sort each class's boxes by score in descending order
    scores, idx = P.argsort(scores, axis=1, descending=True)
    idx = idx[:, :keep_top_k]
    scores = scores[:, :keep_top_k]

    num_classes, num_dets = P.shape(idx)[0], P.shape(idx)[1]

    idx = P.reshape(idx, (-1, ))
    boxes = P.gather(boxes, idx)
    boxes = P.reshape(boxes, (num_classes, num_dets, 4))

    # Compute a c x n x n IoU matrix: each n x n slice holds the pairwise IoU of
    # that class's n candidate boxes.
    iou = _iou(boxes, boxes)
    # Because IoU of a box with itself is 1 and IoU(A, B) = IoU(B, A), post-process
    # the IoU matrix by zeroing the diagonal and the lower triangle of every channel.
    rows = P.range(0, num_dets, 1, 'int32')
    cols = P.range(0, num_dets, 1, 'int32')
    rows = P.expand(P.reshape(rows, (1, -1)), [num_dets, 1])
    cols = P.expand(P.reshape(cols, (-1, 1)), [1, num_dets])
    tri_mask = P.cast(rows > cols, 'float32')
    tri_mask = P.expand(P.reshape(tri_mask, (1, num_dets, num_dets)), [num_classes, 1, 1])
    iou = tri_mask * iou
    iou_max = P.reduce_max(iou, dim=1)

    # Within a class, drop a box if its highest IoU with any higher-scoring box exceeds
    # nms_thresh. The box at index 0 (highest score) is therefore always kept.
    keep = P.where(iou_max <= nms_thresh)

    # Assign each kept detection to its corresponding class
    classes = P.range(0, num_classes, 1, 'int32')
    classes = P.expand(P.reshape(classes, (-1, 1)), [1, num_dets])
    classes = P.gather_nd(classes, keep)

    boxes = P.gather_nd(boxes, keep)
    scores = P.gather_nd(scores, keep)

    # Only keep the top nms_top_k highest scores across all classes
    scores, idx = P.argsort(scores, axis=0, descending=True)
    idx = idx[:nms_top_k]
    scores = scores[:nms_top_k]

    classes = P.gather(classes, idx)
    boxes = P.gather(boxes, idx)
    return boxes, scores, classes
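# Illustrative only: a minimal NumPy sketch of the Fast NMS idea used by both fast_nms() variants
# above. For each class, a box is kept only if its maximum IoU with any higher-scoring box (the
# strict upper triangle of the IoU matrix) stays below the threshold. Not the original Paddle code.
import numpy as np

def fast_nms_keep_np(iou, nms_thresh=0.5):
    """iou: [num_classes, n, n] pairwise IoU of score-descending boxes; returns a keep mask [num_classes, n]."""
    n = iou.shape[-1]
    upper = np.triu(np.ones((n, n), dtype=bool), k=1)   # entries above the diagonal (column index > row index)
    iou = np.where(upper[None, :, :], iou, 0.0)          # zero out the diagonal and lower triangle
    iou_max = iou.max(axis=1)                             # per box: best IoU with any higher-scoring box
    return iou_max <= nms_thresh                           # box 0 (highest score) is always kept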
def _relative_attention_inner(q, k, v, transpose):
    # Computes q @ k^T plus a per-query-position matmul of q with the relative-position tensor v.
    batch_size = layers.shape(q)[0]
    heads = layers.shape(q)[1]
    length = layers.shape(q)[2]

    xy_matmul = layers.matmul(q, k, transpose_y=transpose)
    x_t = layers.transpose(q, [2, 0, 1, 3])
    x_t_r = layers.reshape(x_t, [length, batch_size * heads, -1])
    x_tz_matmul = layers.matmul(x_t_r, v, transpose_y=transpose)
    x_tz_matmul_r = layers.reshape(x_tz_matmul, [length, batch_size, heads, -1])
    x_tz_matmul_r_t = layers.transpose(x_tz_matmul_r, [1, 2, 0, 3])
    return xy_matmul + x_tz_matmul_r_t
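# Illustrative only: a NumPy sketch of what _relative_attention_inner() appears to compute (my
# assumption, in the style of relative-position attention): q @ k^T plus a per-position matmul of q
# with a relative embedding table (named v above). Shapes here are assumptions for illustration.
import numpy as np

def relative_attention_inner_np(q, k, rel, transpose=True):
    """q, k: [batch, heads, length, depth]; rel: [length, length, depth]; returns [batch, heads, length, length]."""
    b, h, t, d = q.shape
    qk = q @ np.swapaxes(k, -1, -2)                      # content term: [b, h, t, t]
    q_t = q.transpose(2, 0, 1, 3).reshape(t, b * h, d)    # [t, b*h, d], one slice per query position
    rel_t = np.swapaxes(rel, -1, -2) if transpose else rel
    q_rel = q_t @ rel_t                                    # [t, b*h, t], per-position relative term
    q_rel = q_rel.reshape(t, b, h, t).transpose(1, 2, 0, 3)
    return qk + q_rel

q = np.random.rand(2, 4, 5, 8); k = np.random.rand(2, 4, 5, 8); rel = np.random.rand(5, 5, 8)
assert relative_attention_inner_np(q, k, rel).shape == (2, 4, 5, 5)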
def get_prediction(self, feats, eval=True):
    name_list = list(feats.keys())
    feats2 = [feats[name] for name in name_list]  # [p2, p3, p4, p5]
    feats = feats2
    # There are 5 tensors with strides [8, 8, 16, 32, 32], so interpolate the first and last
    # tensors first. align_corners=False and align_mode=0 must be set to match the output of
    # the original SOLO implementation.
    new_feats = [
        L.resize_bilinear(feats[0], out_shape=L.shape(feats[1])[2:],
                          align_corners=False, align_mode=0),
        feats[1],
        feats[2],
        feats[3],
        L.resize_bilinear(feats[4], out_shape=L.shape(feats[3])[2:],
                          align_corners=False, align_mode=0)
    ]
    kernel_preds, cls_preds = [], []

    for idx in range(len(self.seg_num_grids)):
        krn_feat = new_feats[idx]  # fed to the kernel branch

        # ============ kernel branch ============
        ins_kernel_feat = concat_coord(krn_feat)  # append coordinate channels, [N, c+2, h, w]
        kernel_feat = ins_kernel_feat  # ins_kernel_feat is not used again
        seg_num_grid = self.seg_num_grids[idx]  # number of grid cells per row (and column) at this level

        # Interpolate kernel_feat to the grid resolution: [N, c+2, seg_num_grid, seg_num_grid]
        kernel_feat = L.resize_bilinear(
            kernel_feat,
            out_shape=[seg_num_grid, seg_num_grid],
            align_corners=False,
            align_mode=0)

        # Drop the 2 inserted coordinate channels to get cls_feat: [N, c, seg_num_grid, seg_num_grid]
        cls_feat = kernel_feat[:, :-2, :, :]

        for kernel_layer in self.krn_convs:
            kernel_feat = kernel_layer(kernel_feat)
        for class_layer in self.cls_convs:
            cls_feat = class_layer(cls_feat)
        kernel_pred = kernel_feat  # [N, 256, seg_num_grid, seg_num_grid]  predicted kernel per grid cell
        cls_pred = cls_feat        # [N, 80, seg_num_grid, seg_num_grid]   per-cell class scores, before sigmoid()

        if eval:
            # [N, seg_num_grid, seg_num_grid, 80]  per-cell class probabilities, after sigmoid()
            cls_pred = L.transpose(points_nms(L.sigmoid(cls_pred), kernel=2), perm=[0, 2, 3, 1])

        kernel_preds.append(kernel_pred)
        cls_preds.append(cls_pred)
    return [kernel_preds, cls_preds]
def __call__(self, input):
    if not self.coord_conv:
        return input
    b = L.shape(input)[0]
    h = L.shape(input)[2]
    w = L.shape(input)[3]
    x_range = L.range(0, w, 1., dtype='float32') / (w - 1) * 2.0 - 1
    y_range = L.range(0, h, 1., dtype='float32') / (h - 1) * 2.0 - 1
    x_range = L.reshape(x_range, (1, 1, 1, -1))  # [1, 1, 1, w]
    y_range = L.reshape(y_range, (1, 1, -1, 1))  # [1, 1, h, 1]
    x_range = L.expand(x_range, [b, 1, h, 1])  # [b, 1, h, w]
    y_range = L.expand(y_range, [b, 1, 1, w])  # [b, 1, h, w]
    offset = L.concat([input, x_range, y_range], axis=1)
    return offset
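# Illustrative only: a NumPy sketch of the CoordConv trick used by concat_coord() and the
# __call__() above, appending two channels with x/y coordinates normalised to [-1, 1]. It is not
# the original Paddle code.
import numpy as np

def add_coord_channels_np(feat):
    """feat: [N, C, H, W] -> [N, C+2, H, W] with normalised x and y coordinate channels appended."""
    n, _, h, w = feat.shape
    x = np.linspace(-1.0, 1.0, w)                              # [-1, 1] across the width
    y = np.linspace(-1.0, 1.0, h)                              # [-1, 1] across the height
    xx = np.broadcast_to(x[None, None, None, :], (n, 1, h, w))  # x channel, constant along rows
    yy = np.broadcast_to(y[None, None, :, None], (n, 1, h, w))  # y channel, constant along columns
    return np.concatenate([feat, xx, yy], axis=1)

out = add_coord_channels_np(np.zeros((2, 3, 4, 5)))
assert out.shape == (2, 5, 4, 5) and out[0, 3, 0, 0] == -1.0 and out[0, 3, 0, -1] == 1.0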
def bbox_iou(boxes1, boxes2):
    '''
    Predicted boxes  boxes1 (?, grid_h, grid_w, 3, 1, 4): the (bx, by, bw, bh) obtained by
    post-processing the network outputs (tx, ty, tw, th).
    All gt boxes of the image  boxes2 (?, 1, 1, 1, 150, 4).
    Paddle does not support the ellipsis, so boxes1_area = boxes1[..., 2] * boxes1[..., 3]
    has to be written out with full colons.
    '''
    boxes1_area = boxes1[:, :, :, :, :, 2] * boxes1[:, :, :, :, :, 3]  # areas of the 3 predicted boxes of every grid cell
    boxes2_area = boxes2[:, :, :, :, :, 2] * boxes2[:, :, :, :, :, 3]  # areas of all ground-truth boxes

    # Convert (x, y, w, h) to (x0, y0, x1, y1)
    boxes1 = P.concat([
        boxes1[:, :, :, :, :, :2] - boxes1[:, :, :, :, :, 2:] * 0.5,
        boxes1[:, :, :, :, :, :2] + boxes1[:, :, :, :, :, 2:] * 0.5
    ], axis=-1)
    boxes2 = P.concat([
        boxes2[:, :, :, :, :, :2] - boxes2[:, :, :, :, :, 2:] * 0.5,
        boxes2[:, :, :, :, :, :2] + boxes2[:, :, :, :, :, 2:] * 0.5
    ], axis=-1)

    # Each grid cell's 3 predicted boxes are matched against the 150 ground-truth boxes,
    # so left_up and right_down have shape (?, grid_h, grid_w, 3, 150, 2).
    # Unlike PyTorch and TF, boxes1 and boxes2 both have to be expanded to the same shape.
    expand_boxes1 = P.expand(boxes1, [1, 1, 1, 1, P.shape(boxes2)[4], 1])
    expand_boxes2 = P.expand(
        boxes2,
        [1, P.shape(boxes1)[1], P.shape(boxes1)[2], P.shape(boxes1)[3], 1, 1])
    left_up = P.elementwise_max(expand_boxes1[:, :, :, :, :, :2],
                                expand_boxes2[:, :, :, :, :, :2])  # top-left corners of the intersections
    right_down = P.elementwise_min(expand_boxes1[:, :, :, :, :, 2:],
                                   expand_boxes2[:, :, :, :, :, 2:])  # bottom-right corners of the intersections

    # Intersection w and h, clipped to 0 when negative. (?, grid_h, grid_w, 3, 150, 2)
    inter_section = P.relu(right_down - left_up)
    # Intersection areas. (?, grid_h, grid_w, 3, 150)
    inter_area = inter_section[:, :, :, :, :, 0] * inter_section[:, :, :, :, :, 1]
    expand_boxes1_area = P.expand(boxes1_area, [1, 1, 1, 1, P.shape(boxes2)[4]])
    expand_boxes2_area = P.expand(boxes2_area, [
        1,
        P.shape(expand_boxes1_area)[1],
        P.shape(expand_boxes1_area)[2],
        P.shape(expand_boxes1_area)[3], 1
    ])
    union_area = expand_boxes1_area + expand_boxes2_area - inter_area  # union areas (?, grid_h, grid_w, 3, 150)
    iou = 1.0 * inter_area / union_area  # IoU (?, grid_h, grid_w, 3, 150)
    return iou
def PredictionModule(x, num_priors, num_classes, mask_dim,
                     shared_conv_w, shared_conv_b,
                     shared_bbox_w, shared_bbox_b,
                     shared_conf_w, shared_conf_b,
                     shared_mask_w, shared_mask_b):
    '''
    Adapted from the PredictionModule of the DSSD algorithm, but using 3x3 convolutions.
    Three branches predict the bbox, conf and mask coefficients respectively.
         x
       / | \
    bbox conf mask
    '''
    x = P.conv2d(x, 256, filter_size=(3, 3), stride=1, padding=1,
                 param_attr=shared_conv_w, bias_attr=shared_conv_b)
    x = P.relu(x)
    bbox_x = x
    conf_x = x
    mask_x = x

    bbox = P.conv2d(bbox_x, num_priors * 4, filter_size=(3, 3), stride=1, padding=1,
                    param_attr=shared_bbox_w, bias_attr=shared_bbox_b)
    bbox = P.transpose(bbox, perm=[0, 2, 3, 1])
    bbox = P.reshape(bbox, (P.shape(bbox)[0], -1, 4))

    conf = P.conv2d(conf_x, num_priors * num_classes, filter_size=(3, 3), stride=1, padding=1,
                    param_attr=shared_conf_w, bias_attr=shared_conf_b)
    conf = P.transpose(conf, perm=[0, 2, 3, 1])
    conf = P.reshape(conf, (P.shape(conf)[0], -1, num_classes))

    mask = P.conv2d(mask_x, num_priors * mask_dim, filter_size=(3, 3), stride=1, padding=1,
                    param_attr=shared_mask_w, bias_attr=shared_mask_b)
    mask = P.transpose(mask, perm=[0, 2, 3, 1])
    mask = P.reshape(mask, (P.shape(mask)[0], -1, mask_dim))
    mask = P.tanh(mask)

    preds = {'loc': bbox, 'conf': conf, 'mask': mask}
    return preds
def __call__(self, msg):
    alpha = msg["alpha"]  # lod-tensor (batch_size, num_heads)
    if attn_drop:
        old_h = alpha
        dropout = F.data(name='attn_drop', shape=[1], dtype="int64")
        u = L.uniform_random(shape=L.cast(L.shape(alpha)[:1], 'int64'), min=0., max=1.)
        keeped = L.cast(u > dropout, dtype="float32")
        self_attn_mask = L.scale(x=keeped, scale=10000.0, bias=-1.0, bias_after_scale=False)
        n_head_self_attn_mask = L.stack(x=[self_attn_mask] * num_heads, axis=1)
        n_head_self_attn_mask.stop_gradient = True
        alpha = n_head_self_attn_mask + alpha
        alpha = L.lod_reset(alpha, old_h)

    h = msg["v"]
    alpha = paddle_helper.sequence_softmax(alpha)
    self.alpha = alpha
    old_h = h
    h = h * alpha
    h = L.lod_reset(h, old_h)
    h = L.sequence_pool(h, "sum")
    if concat:
        h = L.reshape(h, [-1, num_heads * hidden_size])
    else:
        h = L.reduce_mean(h, dim=1)
    return h
def attention(self, hidden, encoder_output, encoder_output_proj, encoder_padding_mask):
    # Attention used to compute the context vector c_i; Bahdanau attention is used here.
    decoder_state_proj = layers.unsqueeze(
        layers.fc(hidden, size=self.hidden_size, bias_attr=False), [1])
    # Combine one decoder state with all of the encoder outputs.
    mixed_state = fluid.layers.elementwise_add(
        encoder_output_proj,
        layers.expand(decoder_state_proj,
                      [1, layers.shape(decoder_state_proj)[1], 1]))
    # A fully-connected layer turns the combined states into scalar scores.
    attn_scores = layers.squeeze(
        layers.fc(input=mixed_state, size=1, num_flatten_dims=2, bias_attr=False), [2])
    if encoder_padding_mask is not None:
        attn_scores = layers.elementwise_add(attn_scores, encoder_padding_mask)
    # Softmax turns the scores into attention weights.
    attn_scores = layers.softmax(attn_scores)
    # The weighted average of the encoder outputs is the context vector for this decoder step.
    context = layers.reduce_sum(
        layers.elementwise_mul(encoder_output, attn_scores, axis=0), dim=1)
    return context
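# Illustrative only: a NumPy sketch (my assumption) of the additive attention computed by
# attention() above: project the decoder state, add it to the projected encoder outputs, score with
# a learned vector, softmax, and take the weighted sum of the encoder outputs as the context.
# W_dec, W_enc and v stand in for the fc-layer parameters and are made-up names.
import numpy as np

def bahdanau_attention_np(hidden, enc_out, W_dec, W_enc, v):
    """hidden: [B, H]; enc_out: [B, T, H]; W_dec, W_enc: [H, H]; v: [H]; returns (context [B, H], weights [B, T])."""
    dec_proj = (hidden @ W_dec)[:, None, :]                   # [B, 1, H]
    enc_proj = enc_out @ W_enc                                 # [B, T, H]
    scores = (enc_proj + dec_proj) @ v                         # [B, T] unnormalised scores (linear scoring, as above)
    weights = np.exp(scores - scores.max(axis=1, keepdims=True))
    weights /= weights.sum(axis=1, keepdims=True)              # softmax over source positions
    context = (weights[:, :, None] * enc_out).sum(axis=1)      # [B, H] weighted sum of encoder outputs
    return context, weights

ctx, w = bahdanau_attention_np(np.zeros((2, 8)), np.random.rand(2, 6, 8),
                               np.eye(8), np.eye(8), np.ones(8))
assert ctx.shape == (2, 8) and np.allclose(w.sum(axis=1), 1.0)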
def model_func(inputs, is_train=True):
    src = inputs[0]
    src_sequence_length = inputs[1]
    # source embedding
    src_embeder = lambda x: fluid.embedding(
        input=x,
        size=[source_dict_size, hidden_dim],
        dtype="float32",
        param_attr=fluid.ParamAttr(name="src_emb_table"))
    src_embedding = src_embeder(src)

    # encoder
    encoder_output, encoder_state = encoder(src_embedding, src_sequence_length)
    encoder_output_proj = layers.fc(input=encoder_output,
                                    size=decoder_size,
                                    num_flatten_dims=2,
                                    bias_attr=False)
    src_mask = layers.sequence_mask(src_sequence_length,
                                    maxlen=layers.shape(src)[1],
                                    dtype="float32")
    encoder_padding_mask = (src_mask - 1.0) * 1e9

    trg = inputs[2] if is_train else None

    # decoder
    output = decoder(encoder_output=encoder_output,
                     encoder_output_proj=encoder_output_proj,
                     encoder_state=encoder_state,
                     encoder_padding_mask=encoder_padding_mask,
                     trg=trg,
                     is_train=is_train)
    return output
def get_seg(self, kernel_preds, cls_preds, mask_protos, ori_shapes, resize_shapes):
    num_levels = len(cls_preds)  # number of output levels = 5
    featmap_size = L.shape(mask_protos)[-2:]  # feature-map size, at stride 4

    result_list = []
    # for img_id in range(len(img_metas)):
    for img_id in range(1):
        cate_pred_list = [
            L.reshape(cls_preds[i][img_id], (-1, self.cate_out_channels))
            for i in range(num_levels)
        ]
        mask_proto = mask_protos[img_id:img_id + 1, :, :, :]
        kernel_pred_list = [
            L.reshape(
                L.transpose(kernel_preds[i][img_id], perm=[1, 2, 0]),
                (-1, self.kernel_out_channels)) for i in range(num_levels)
        ]
        resize_shape = resize_shapes[img_id]
        ori_shape = ori_shapes[img_id]

        cate_pred_list = L.concat(cate_pred_list, axis=0)
        kernel_pred_list = L.concat(kernel_pred_list, axis=0)

        masks, classes, scores = self.get_seg_single(
            cate_pred_list, mask_proto, kernel_pred_list,
            featmap_size, resize_shape, ori_shape)
        # result_list.append(result)
    # return result_list
    return {
        'masks': masks,
        'classes': classes,
        'scores': scores,
    }
def decode(conv_output, anchors, stride, num_class, conf_thresh):
    conv_shape = P.shape(conv_output)
    batch_size = conv_shape[0]
    n_grid = conv_shape[1]
    anchor_per_scale = len(anchors)
    conv_output = P.reshape(
        conv_output,
        (batch_size, n_grid, n_grid, anchor_per_scale, 5 + num_class))
    conv_raw_dxdy = conv_output[:, :, :, :, 0:2]
    conv_raw_dwdh = conv_output[:, :, :, :, 2:4]
    conv_raw_conf = conv_output[:, :, :, :, 4:5]
    conv_raw_prob = conv_output[:, :, :, :, 5:]

    rows = P.range(0, n_grid, 1, 'float32')
    cols = P.range(0, n_grid, 1, 'float32')
    rows = P.expand(P.reshape(rows, (1, -1, 1)), [n_grid, 1, 1])
    cols = P.expand(P.reshape(cols, (-1, 1, 1)), [1, n_grid, 1])
    offset = P.concat([rows, cols], axis=-1)
    offset = P.reshape(offset, (1, n_grid, n_grid, 1, 2))
    offset = P.expand(offset, [batch_size, 1, 1, anchor_per_scale, 1])

    pred_xy = (P.sigmoid(conv_raw_dxdy) + offset) * stride
    pred_wh = (P.exp(conv_raw_dwdh) * P.assign(anchors))
    pred_xywh = P.concat([pred_xy, pred_wh], axis=-1)
    pred_conf = P.sigmoid(conv_raw_conf)
    pred_prob = P.sigmoid(conv_raw_prob)

    pred_xywh = P.reshape(pred_xywh, (batch_size, -1, 4))  # [-1, -1, 4]
    pred_conf = P.reshape(pred_conf, (batch_size, -1, 1))  # [-1, -1, 1]
    pred_prob = P.reshape(pred_prob, (batch_size, -1, num_class))  # [-1, -1, 80]
    return pred_xywh, pred_conf, pred_prob
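# Illustrative only: a NumPy sketch of the box decoding done in decode() above for one YOLO output
# grid: sigmoid(tx, ty) plus the cell offset gives the centre (scaled by the stride), and
# exp(tw, th) times the anchor gives the size. Not the original Paddle code; conf_thresh is not
# used in the snippet above, so it is omitted here too.
import numpy as np

def sigmoid(x):
    return 1.0 / (1.0 + np.exp(-x))

def decode_boxes_np(raw, anchors, stride):
    """raw: [n_grid, n_grid, n_anchors, 4] of (tx, ty, tw, th); anchors: [n_anchors, 2]; returns xywh boxes."""
    n_grid = raw.shape[0]
    gy, gx = np.meshgrid(np.arange(n_grid), np.arange(n_grid), indexing='ij')
    offset = np.stack([gx, gy], axis=-1)[:, :, None, :]   # per-cell (x, y) offsets, [n_grid, n_grid, 1, 2]
    xy = (sigmoid(raw[..., 0:2]) + offset) * stride        # box centres in input-image pixels
    wh = np.exp(raw[..., 2:4]) * anchors                   # box sizes from the anchor priors
    return np.concatenate([xy, wh], axis=-1)

out = decode_boxes_np(np.zeros((2, 2, 3, 4)), np.array([[10., 13.], [16., 30.], [33., 23.]]), stride=8)
assert out.shape == (2, 2, 3, 4)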
def graph_gather(gw, feature, index):
    """Implementation of graph gather

    Gather the corresponding index for each graph.

    Args:
        gw: Graph wrapper object (:code:`StaticGraphWrapper` or :code:`GraphWrapper`)

        feature: A tensor with shape (num_nodes, hidden_size).

        index (int32): A tensor with K-rank where the first dim denotes the graph.
            Shape (num_graph, ) or (num_graph, k1, k2, k3, ..., kn).
            WARNING: We don't support negative index.

    Return:
        A tensor with shape (num_graph, k1, k2, k3, ..., kn, hidden_size)
    """
    shape = L.shape(index)
    output_dim = int(feature.shape[-1])
    index = index + gw.graph_lod[:-1]
    index = L.reshape(index, [-1])
    feature = L.gather(feature, index, overwrite=False)
    new_shape = []
    for i in range(shape.shape[0]):
        new_shape.append(shape[i])
    new_shape.append(output_dim)
    feature = L.reshape(feature, new_shape)
    return feature
def var(input, axis=None, keepdim=False, unbiased=True, out=None, name=None):
    dtype = convert_dtype(input.dtype)
    if dtype not in ["float32", "float64"]:
        raise ValueError("Layer tensor.var() only supports floating-point "
                         "dtypes, but received {}.".format(dtype))
    rank = len(input.shape)
    # axis is iterated over below, so it is expected to be a list of ints;
    # default to reducing over all dimensions.
    axes = axis if axis != None and axis != [] else range(rank)
    axes = [e if e >= 0 else e + rank for e in axes]
    inp_shape = input.shape if in_dygraph_mode() else layers.shape(input)
    mean = layers.reduce_mean(input, dim=axis, keep_dim=True, name=name)
    tmp = layers.reduce_mean(
        (input - mean)**2, dim=axis, keep_dim=keepdim, name=name)

    if unbiased:
        # Bessel's correction: rescale by n / (n - 1), where n is the number of reduced elements.
        n = 1
        for i in axes:
            n *= inp_shape[i]
        if not in_dygraph_mode():
            n = layers.cast(n, dtype)
            zero_const = layers.fill_constant(shape=[1], dtype=dtype, value=0.0)
            factor = where(n > 1.0, n / (n - 1.0), zero_const)
        else:
            factor = n / (n - 1.0) if n > 1.0 else 0.0
        tmp *= factor
    if out:
        layers.assign(input=tmp, output=out)
        return out
    else:
        return tmp
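# Illustrative only: a tiny NumPy check (my addition) of the unbiased-variance formula implemented
# by var() above: the mean of squared deviations, rescaled by n / (n - 1) when unbiased=True.
import numpy as np

x = np.array([[1.0, 2.0, 3.0], [4.0, 6.0, 8.0]])
biased = ((x - x.mean(axis=1, keepdims=True)) ** 2).mean(axis=1)
n = x.shape[1]
unbiased = biased * n / (n - 1)                 # Bessel's correction, as in the factor above
assert np.allclose(unbiased, x.var(axis=1, ddof=1))
print(biased, unbiased)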
def _build_position_ids(self, src_ids):
    src_shape = L.shape(src_ids)
    src_seqlen = src_shape[1]
    src_batch = src_shape[0]

    slot_seqlen = self.slot_seqlen

    num_b = (src_seqlen / slot_seqlen) - 1
    a_position_ids = L.reshape(
        L.range(0, slot_seqlen, 1, dtype='int32'),
        [1, slot_seqlen, 1],
        inplace=True)  # [1, slot_seqlen, 1]
    a_position_ids = L.expand(a_position_ids, [src_batch, 1, 1])  # [B, slot_seqlen, 1]

    zero = L.fill_constant([1], dtype='int64', value=0)
    input_mask = L.cast(L.equal(src_ids[:, :slot_seqlen], zero), "int32")  # assume pad id == 0, [B, slot_seqlen, 1]
    a_pad_len = L.reduce_sum(input_mask, 1)  # [B, 1, 1]

    b_position_ids = L.reshape(
        L.range(slot_seqlen, 2 * slot_seqlen, 1, dtype='int32'),
        [1, slot_seqlen, 1],
        inplace=True)  # [1, slot_seqlen, 1]
    b_position_ids = L.expand(b_position_ids, [src_batch, num_b, 1])  # [B, slot_seqlen * num_b, 1]
    b_position_ids = b_position_ids - a_pad_len  # [B, slot_seqlen * num_b, 1]

    position_ids = L.concat([a_position_ids, b_position_ids], 1)
    position_ids = L.cast(position_ids, 'int64')
    position_ids.stop_gradient = True
    return position_ids
def forward(self, src, src_length):
    # encoding
    encoder_output, encoder_final_state = self.encoder(src, src_length)

    # decoder initial states
    decoder_initial_states = [
        encoder_final_state,
        self.decoder.lstm_attention.cell.get_initial_states(
            batch_ref=encoder_output, shape=[self.hidden_size])
    ]
    # attention mask to avoid paying attention on paddings
    src_mask = layers.sequence_mask(
        src_length, maxlen=layers.shape(src)[1], dtype=encoder_output.dtype)
    encoder_padding_mask = (src_mask - 1.0) * 1e9
    encoder_padding_mask = layers.unsqueeze(encoder_padding_mask, [1])

    # Tile the batch dimension with beam_size
    encoder_output = BeamSearchDecoder.tile_beam_merge_with_batch(
        encoder_output, self.beam_size)
    encoder_padding_mask = BeamSearchDecoder.tile_beam_merge_with_batch(
        encoder_padding_mask, self.beam_size)

    # dynamic decoding with beam search
    rs, _ = self.beam_search_decoder(
        inits=decoder_initial_states,
        encoder_output=encoder_output,
        encoder_padding_mask=encoder_padding_mask)
    return rs
def ce_conf_loss(self, pred_allboxes_conf, labels_pos_mask, labels_neg_mask,
                 class_vectors, labels_pos_cid2, gt_area):
    labels_pos_cid2 = P.reshape(labels_pos_cid2, (-1, ))  # [batch_size*num_priors]
    pred_allboxes_conf_r = P.reshape(
        pred_allboxes_conf,
        (-1, P.shape(pred_allboxes_conf)[2]))  # [batch_size*num_priors, num_classes]
    label_prob = P.gather(class_vectors, labels_pos_cid2)  # one-hot mask (batch_size*num_priors, num_classes)

    pred_prob = P.softmax(pred_allboxes_conf_r)
    pred_prob = P.cast(pred_prob, 'float32')
    prob_loss = label_prob * (0 - P.log(pred_prob + 1e-9))  # tiny constant added to avoid NaN
    prob_loss = P.reduce_sum(prob_loss, dim=1)

    # Keep only the losses of the positive and negative samples
    labels_pos_mask2 = P.reshape(labels_pos_mask, (-1, ))  # [batch_size*num_priors]
    labels_neg_mask2 = P.reshape(labels_neg_mask, (-1, ))  # [batch_size*num_priors]
    conf_loss_scale = 2.0 - gt_area  # the smaller the gt area, the larger the weight
    conf_loss_scale = P.reshape(conf_loss_scale, (-1, ))  # [batch_size*num_priors]
    prob_pos_loss = prob_loss * labels_pos_mask2 * conf_loss_scale
    prob_neg_loss = prob_loss * labels_neg_mask2
    ce_loss = prob_pos_loss + prob_neg_loss
    ce_loss = P.reduce_sum(ce_loss)

    return ce_loss
def build_graph_attn_bias(input_mask, n_head, dtype, slot_seqlen):
    input_shape = L.shape(input_mask)
    input_batch = input_shape[0]
    input_seqlen = input_shape[1]
    num_slot = input_seqlen / slot_seqlen
    num_b = num_slot - 1

    ones = L.ones([num_b], dtype="float32")  # [num_b]
    diag_ones = L.diag(ones)  # [num_b, num_b]
    diag_ones = L.unsqueeze(diag_ones, [1, -1])  # [num_b, 1, num_b, 1]
    diag_ones = L.expand(diag_ones, [1, slot_seqlen, 1, slot_seqlen])  # [num_b, seqlen, num_b, seqlen]
    diag_ones = L.reshape(diag_ones, [1, num_b * slot_seqlen, num_b * slot_seqlen])  # [1, num_b*seqlen, num_b*seqlen]

    graph_attn_bias = L.concat([
        L.ones([1, num_b * slot_seqlen, slot_seqlen], dtype="float32"), diag_ones
    ], 2)
    graph_attn_bias = L.concat([
        L.ones([1, slot_seqlen, num_slot * slot_seqlen], dtype="float32"), graph_attn_bias
    ], 1)  # [1, seq, seq]

    pad_attn_bias = L.matmul(input_mask, input_mask, transpose_y=True)  # [batch, seq, seq]
    attn_bias = graph_attn_bias * pad_attn_bias

    attn_bias = (1. - attn_bias) * -10000.
    attn_bias = L.stack([attn_bias] * n_head, 1)  # [batch, n_head, seq, seq]
    if attn_bias.dtype != dtype:
        attn_bias = L.cast(attn_bias, dtype)
    return attn_bias
def build_position_ids(src_ids, dst_ids):
    src_shape = L.shape(src_ids)
    src_batch = src_shape[0]
    src_seqlen = src_shape[1]
    dst_seqlen = src_seqlen - 1  # without cls

    src_position_ids = L.reshape(
        L.range(0, src_seqlen, 1, dtype='int32'),
        [1, src_seqlen, 1],
        inplace=True)  # [1, src_seqlen, 1]
    src_position_ids = L.expand(src_position_ids, [src_batch, 1, 1])  # [B, src_seqlen, 1]
    zero = L.fill_constant([1], dtype='int64', value=0)
    input_mask = L.cast(L.equal(src_ids, zero), "int32")  # assume pad id == 0, [B, src_seqlen, 1]
    src_pad_len = L.reduce_sum(input_mask, 1, keep_dim=True)  # [B, 1, 1]

    dst_position_ids = L.reshape(
        L.range(src_seqlen, src_seqlen + dst_seqlen, 1, dtype='int32'),
        [1, dst_seqlen, 1],
        inplace=True)  # [1, dst_seqlen, 1]
    dst_position_ids = L.expand(dst_position_ids, [src_batch, 1, 1])  # [B, dst_seqlen, 1]
    dst_position_ids = dst_position_ids - src_pad_len  # [B, dst_seqlen, 1]

    position_ids = L.concat([src_position_ids, dst_position_ids], 1)
    position_ids = L.cast(position_ids, 'int64')
    position_ids.stop_gradient = True
    return position_ids
def model_func(inputs, is_train=True):
    # inputs = [src, src_sequence_length, trg, trg_sequence_length, label]
    # src = fluid.data(name="src", shape=[None, None], dtype="int64")  # source-language input
    src = inputs[0]
    src_sequence_length = inputs[1]
    src_embedding = fluid.embedding(
        input=src,
        size=[source_dict_size, hidden_dim],
        dtype="float32",
        param_attr=fluid.ParamAttr(name="src_emb_table"))

    # encoder
    encoder_output, encoder_state = encoder(src_embedding, src_sequence_length)
    encoder_output_proj = layers.fc(input=encoder_output,
                                    size=decoder_size,
                                    num_flatten_dims=2,
                                    bias_attr=False)
    src_mask = layers.sequence_mask(src_sequence_length,
                                    maxlen=layers.shape(src)[1],
                                    dtype="float32")
    encoder_padding_mask = (src_mask - 1.0) * 1e9

    # target-language input; present during training, absent during inference/generation
    trg = inputs[2] if is_train else None

    # decoder
    output = decoder(encoder_output=encoder_output,
                     encoder_output_proj=encoder_output_proj,
                     encoder_state=encoder_state,
                     encoder_padding_mask=encoder_padding_mask,
                     trg=trg,
                     is_train=is_train)
    return output
def _build_decoder(self, enc_final_state, mode='train', beam_size=10):
    output_layer = lambda x: layers.fc(
        x,
        size=self.tar_vocab_size,
        num_flatten_dims=len(x.shape) - 1,
        param_attr=fluid.ParamAttr(
            name="output_w",
            initializer=fluid.initializer.UniformInitializer(
                low=-self.init_scale, high=self.init_scale)),
        bias_attr=False)

    dec_cell = AttentionDecoderCell(self.num_layers, self.hidden_size,
                                    self.dropout, self.init_scale)
    dec_initial_states = [
        enc_final_state,
        dec_cell.get_initial_states(batch_ref=self.enc_output,
                                    shape=[self.hidden_size])
    ]
    max_src_seq_len = layers.shape(self.src)[1]
    src_mask = layers.sequence_mask(self.src_sequence_length,
                                    maxlen=max_src_seq_len,
                                    dtype='float32')
    enc_padding_mask = (src_mask - 1.0)

    if mode == 'train':
        dec_output, _ = rnn(cell=dec_cell,
                            inputs=self.tar_emb,
                            initial_states=dec_initial_states,
                            sequence_length=None,
                            enc_output=self.enc_output,
                            enc_padding_mask=enc_padding_mask)
        dec_output = output_layer(dec_output)
    elif mode == 'beam_search':
        output_layer = lambda x: layers.fc(
            x,
            size=self.tar_vocab_size,
            num_flatten_dims=len(x.shape) - 1,
            param_attr=fluid.ParamAttr(name="output_w"),
            bias_attr=False)
        beam_search_decoder = BeamSearchDecoder(dec_cell,
                                                self.beam_start_token,
                                                self.beam_end_token,
                                                beam_size,
                                                embedding_fn=self.tar_embeder,
                                                output_fn=output_layer)
        enc_output = beam_search_decoder.tile_beam_merge_with_batch(
            self.enc_output, beam_size)
        enc_padding_mask = beam_search_decoder.tile_beam_merge_with_batch(
            enc_padding_mask, beam_size)
        outputs, _ = dynamic_decode(beam_search_decoder,
                                    inits=dec_initial_states,
                                    max_step_num=self.beam_max_step_num,
                                    enc_output=enc_output,
                                    enc_padding_mask=enc_padding_mask)
        return outputs

    return dec_output
def batch_scatter(ref, indices, updates, in_place=False, overwrite=False):
    """Scatter updates to ref, according to the corresponding index in indices
    in each batch. Currently, it only supports 2d Tensor.

    Args:
        ref (Variable): with shape [batch_size, ...]
        indices (Variable): with shape [batch_size, 1]
        updates (Variable): with shape [batch_size]
        in_place (bool): if True, the scatter result will be assigned to ref;
            otherwise, a new Tensor will be returned. Default is False.
        overwrite (bool): if True, scatter will overwrite the corresponding elements.
            Default is False.

    Returns:
        Variable: the updated tensor (ref itself when in_place is True).

    Raises:
        NULL

    Examples:
        ref
            [[1, 1, 1],
             [1, 1, 1]]
        indices
            [[2], [1]]
        updates
            [2, 3]

        return
            [[1, 1, 2],
             [1, 3, 1]]
    """
    ref_dtype = ref.dtype
    if ref_dtype not in PaddleVarType.floats:
        ref_in = layers.cast(ref, dtype='float32')
    else:
        ref_in = ref

    if updates.dtype != ref_in.dtype:
        updates = layers.cast(updates, dtype=ref_in.dtype)

    batch_size = layers.cast(layers.shape(ref_in)[0], dtype=indices.dtype)
    zero = layers.fill_constant(shape=[1], dtype=indices.dtype, value=0)
    one = layers.fill_constant(shape=[1], dtype=indices.dtype, value=1)
    batch_indices = layers.unsqueeze(
        layers.range(zero, batch_size, one, dtype=indices.dtype), [1])
    coord = layers.concat([batch_indices, indices], axis=1)
    if overwrite:
        # scatter_nd_add only accumulates, so first subtract the old values at coord,
        # then add the updates.
        mask = layers.gather_nd(ref_in, coord)
        mask = layers.elementwise_sub(layers.zeros_like(mask), mask)
        ref_in = layers.scatter_nd_add(ref_in, coord, mask)

    output = layers.scatter_nd_add(ref_in, coord, updates)

    if ref_dtype not in PaddleVarType.floats:
        output = layers.cast(output, dtype=ref_dtype)
    if in_place:
        layers.assign(output, ref)
        return ref
    else:
        return output
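# Illustrative only: a NumPy sketch of the per-row scatter performed by batch_scatter() above,
# covering both the overwrite and the accumulate cases from its docstring example. Not the original
# Paddle code.
import numpy as np

def batch_scatter_np(ref, indices, updates, overwrite=False):
    """ref: [batch, n]; indices: [batch, 1]; updates: [batch]; returns a new array."""
    out = ref.copy()
    rows = np.arange(ref.shape[0])
    cols = indices[:, 0]
    if overwrite:
        out[rows, cols] = updates    # replace the addressed element in each row
    else:
        out[rows, cols] += updates   # accumulate into it (scatter_nd_add behaviour)
    return out

ref = np.ones((2, 3))
print(batch_scatter_np(ref, np.array([[2], [1]]), np.array([2.0, 3.0]), overwrite=True))
# -> [[1. 1. 2.] [1. 3. 1.]], matching the docstring example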
def _iou(box_a, box_b):
    '''
    :param box_a: [c, A, 4]
    :param box_b: [c, B, 4]
    :return: [c, A, B]  pairwise IoU
    '''
    # Convert to top-left and bottom-right corner coordinates
    boxes1 = P.concat([
        box_a[:, :, :2] - box_a[:, :, 2:] * 0.5,
        box_a[:, :, :2] + box_a[:, :, 2:] * 0.5
    ], axis=-1)
    boxes2 = P.concat([
        box_b[:, :, :2] - box_b[:, :, 2:] * 0.5,
        box_b[:, :, :2] + box_b[:, :, 2:] * 0.5
    ], axis=-1)

    c = P.shape(boxes1)[0]
    A = P.shape(boxes1)[1]
    B = P.shape(boxes2)[1]

    box_a = P.reshape(boxes1, (c, A, 1, 4))
    box_b = P.reshape(boxes2, (c, 1, B, 4))
    expand_box_a = P.expand(box_a, [1, 1, B, 1])
    expand_box_b = P.expand(box_b, [1, A, 1, 1])

    # Areas of the two sets of boxes
    boxes1_area = (expand_box_a[:, :, :, 2] - expand_box_a[:, :, :, 0]) * \
                  (expand_box_a[:, :, :, 3] - expand_box_a[:, :, :, 1])
    boxes2_area = (expand_box_b[:, :, :, 2] - expand_box_b[:, :, :, 0]) * \
                  (expand_box_b[:, :, :, 3] - expand_box_b[:, :, :, 1])

    # Top-left and bottom-right corners of the intersection rectangles
    left_up = P.elementwise_max(expand_box_a[:, :, :, :2], expand_box_b[:, :, :, :2])
    right_down = P.elementwise_min(expand_box_a[:, :, :, 2:], expand_box_b[:, :, :, 2:])

    # Intersection area and IoU
    # inter_section = P.elementwise_max(right_down - left_up, 0.0)
    inter_section = P.relu(right_down - left_up)
    inter_area = inter_section[:, :, :, 0] * inter_section[:, :, :, 1]
    union_area = boxes1_area + boxes2_area - inter_area
    iou = inter_area / (union_area + 1e-9)
    return iou
def test_shape(self):
    program = Program()
    with program_guard(program):
        input = layers.data(
            name="input", shape=[3, 100, 100], dtype="float32")
        out = layers.shape(input)
        self.assertIsNotNone(out)
    print(str(program))
def loss_func(logits, label, trg_sequence_length):
    probs = layers.softmax(logits)
    loss = layers.cross_entropy(input=probs, label=label)
    trg_mask = layers.sequence_mask(trg_sequence_length,
                                    maxlen=layers.shape(logits)[1],
                                    dtype="float32")
    avg_cost = layers.reduce_sum(loss * trg_mask) / layers.reduce_sum(trg_mask)
    return avg_cost
def encoder(enc_input,
            input_mask,
            n_layer,
            n_head,
            d_key,
            d_value,
            d_model,
            d_inner_hid,
            prepostprocess_dropout,
            attention_dropout,
            relu_dropout,
            hidden_act,
            preprocess_cmd="n",
            postprocess_cmd="da",
            param_initializer=None,
            name=''):
    """
    The encoder is composed of a stack of identical layers returned by calling
    encoder_layer.
    """
    d_shape = L.shape(input_mask)
    pad_idx = build_pad_idx(input_mask)
    attn_bias = build_attn_bias(input_mask, n_head, enc_input.dtype)

    enc_input = to_2d(enc_input)
    all_hidden = []
    all_attn = []
    all_ffn = []
    for i in range(n_layer):
        enc_output, ctx_multiheads_attn, ffn_output = encoder_layer(
            enc_input,
            input_mask,
            attn_bias,
            n_head,
            d_key,
            d_value,
            d_model,
            d_inner_hid,
            prepostprocess_dropout,
            attention_dropout,
            relu_dropout,
            hidden_act,
            preprocess_cmd,
            postprocess_cmd,
            param_initializer=param_initializer,
            name=name + '_layer_' + str(i))
        all_hidden.append(enc_output)
        all_attn.append(ctx_multiheads_attn)
        all_ffn.append(ffn_output)
        enc_input = enc_output
    enc_output = pre_process_layer(enc_output,
                                   preprocess_cmd,
                                   prepostprocess_dropout,
                                   name="post_encoder")
    enc_output = to_3d(enc_output)
    return enc_output, all_hidden, all_attn, all_ffn