def detect(self, batch_idx, conf_preds, decoded_boxes, mask_data):
    """Perform NMS for only the max scoring class that isn't background (class 0)."""
    # All box coordinates are decoded first, then score filtering is applied.
    # Filtering first and decoding afterwards could be considered instead.
    cur_scores = conf_preds[batch_idx, 1:, :]
    conf_scores = P.reduce_max(cur_scores, dim=0)

    '''
    The GPU build of PaddlePaddle 1.6.2 has a bug: if keep is [None] and is then
    used in gather(), the error
        cudaGetLastError invalid configuration argument errno: 9
    is raised. The CPU build runs fine.
    To avoid this, keep must never be [None], so an extra element keep_extra is
    appended to keep here.
    '''
    keep = P.where(conf_scores > self.conf_thresh)
    keep_extra = P.where(conf_scores < self.conf_thresh)
    keep_extra = keep_extra[:1]
    keep = P.concat([keep, keep_extra], axis=0)
    scores = P.gather(P.transpose(cur_scores, perm=[1, 0]), keep)
    scores = P.transpose(scores, perm=[1, 0])
    boxes = P.gather(decoded_boxes, keep)
    masks = P.gather(mask_data[batch_idx], keep)

    '''
    Because keep_extra was appended above, keep always holds at least one box.
    Once the upstream bug is fixed, delete the keep_extra code above and uncomment
    the code below. The workaround exists because testing whether keep is empty
    is very hard here.
    '''
    # No box may survive the filter, so pad with a bottom-scoring box so fast_nms() can proceed.
    # extra_box = P.fill_constant((1, 4), 'float32', value=-1.0)
    # extra_score = P.fill_constant((P.shape(cur_scores)[0], 1), 'float32', value=-1.0)
    # extra_mask = P.fill_constant((1, P.shape(mask_data)[2]), 'float32', value=-1.0)
    # boxes = P.concat([boxes, extra_box], axis=0)
    # scores = P.concat([scores, extra_score], axis=1)
    # masks = P.concat([masks, extra_mask], axis=0)
    return self.fast_nms(boxes, scores, masks)

def beam_search_step(state, logits, eos_id, beam_width, is_first_step, length_penalty):
    """logits.shape == [B*W, V]"""
    _, vocab_size = logits.shape
    bsz, beam_width = state.log_probs.shape

    onehot_eos = L.cast(F.one_hot(L.ones([1], 'int64') * eos_id, vocab_size), 'int64')  # [1, V]

    probs = L.log(L.softmax(logits))  # [B*W, V]
    probs = mask_prob(probs, onehot_eos, state.finished)  # [B*W, V]
    allprobs = L.reshape(state.log_probs, [-1, 1]) + probs  # [B*W, V]

    not_finished = 1 - L.reshape(state.finished, [-1, 1])  # [B*W, 1]
    not_eos = 1 - onehot_eos
    length_to_add = not_finished * not_eos  # [B*W, V]
    alllen = L.reshape(state.lengths, [-1, 1]) + length_to_add

    allprobs = L.reshape(allprobs, [-1, beam_width * vocab_size])
    alllen = L.reshape(alllen, [-1, beam_width * vocab_size])
    allscore = hyp_score(allprobs, alllen, length_penalty)

    if is_first_step:
        # The first step only considers beam 0.
        allscore = L.reshape(allscore, [bsz, beam_width, -1])[:, 0, :]

    scores, idx = L.topk(allscore, k=beam_width)  # [B, W]
    next_beam_id = idx // vocab_size  # [B, W]
    next_word_id = idx % vocab_size

    gather_idx = L.concat([L.where(idx != -1)[:, :1], L.reshape(idx, [-1, 1])], 1)
    next_probs = L.reshape(L.gather_nd(allprobs, gather_idx), idx.shape)
    next_len = L.reshape(L.gather_nd(alllen, gather_idx), idx.shape)

    gather_idx = L.concat(
        [L.where(next_beam_id != -1)[:, :1], L.reshape(next_beam_id, [-1, 1])], 1)
    # Gather the new beam state according to the new beam id.
    next_finished = L.reshape(L.gather_nd(state.finished, gather_idx), state.finished.shape)

    next_finished += L.cast(next_word_id == eos_id, 'int64')
    next_finished = L.cast(next_finished > 0, 'int64')

    next_state = BeamSearchState(log_probs=next_probs,
                                 lengths=next_len,
                                 finished=next_finished)
    output = BeamSearchOutput(scores=scores,
                              predicted_ids=next_word_id,
                              beam_parent_ids=next_beam_id)
    return output, next_state

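# A minimal NumPy sketch (illustration only, not part of the module) of the flat
# top-k decomposition used above: scores over all beam_width * vocab_size
# continuations are ranked jointly, then floor-division and modulo recover the
# parent beam and the word id.
import numpy as np

beam_width, vocab_size = 2, 5
allscore = np.array([[0.1, 0.3, 0.2, 0.0, 0.1,     # beam 0, flattened
                      0.4, 0.0, 0.1, 0.2, 0.1]])   # beam 1 -> shape [B, W*V]
idx = np.argsort(-allscore, axis=1)[:, :beam_width]  # top-k indices: [[5, 1]]
next_beam_id = idx // vocab_size                     # [[1, 0]]
next_word_id = idx % vocab_size                      # [[0, 1]]
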
def beam_search_step(state, logits, eos_id, beam_width, is_first_step, length_penalty):
    """logits.shape == [B*W, V]"""
    # Batch size is 1 in this hub module, so the first dim (bsz * beam_size) equals beam_size.
    beam_size, vocab_size = logits.shape
    logits_np = logits.numpy()
    for i in range(beam_size):
        logits_np[i][17963] = 0  # make [UNK] prob = 0
    logits = D.to_variable(logits_np)

    bsz, beam_width = state.log_probs.shape
    onehot_eos = L.cast(F.one_hot(L.ones([1], 'int64') * eos_id, vocab_size), 'int64')  # [1, V]

    probs = L.log(L.softmax(logits))  # [B*W, V]
    probs = mask_prob(probs, onehot_eos, state.finished)  # [B*W, V]
    allprobs = L.reshape(state.log_probs, [-1, 1]) + probs  # [B*W, V]

    not_finished = 1 - L.reshape(state.finished, [-1, 1])  # [B*W, 1]
    not_eos = 1 - onehot_eos
    length_to_add = not_finished * not_eos  # [B*W, V]
    alllen = L.reshape(state.lengths, [-1, 1]) + length_to_add

    allprobs = L.reshape(allprobs, [-1, beam_width * vocab_size])
    alllen = L.reshape(alllen, [-1, beam_width * vocab_size])
    allscore = hyp_score(allprobs, alllen, length_penalty)

    if is_first_step:
        # The first step only considers beam 0.
        allscore = L.reshape(allscore, [bsz, beam_width, -1])[:, 0, :]

    scores, idx = L.topk(allscore, k=beam_width)  # [B, W]
    next_beam_id = idx // vocab_size  # [B, W]
    next_word_id = idx % vocab_size

    gather_idx = L.concat([L.where(idx != -1)[:, :1], L.reshape(idx, [-1, 1])], 1)
    next_probs = L.reshape(L.gather_nd(allprobs, gather_idx), idx.shape)
    next_len = L.reshape(L.gather_nd(alllen, gather_idx), idx.shape)

    gather_idx = L.concat(
        [L.where(next_beam_id != -1)[:, :1], L.reshape(next_beam_id, [-1, 1])], 1)
    # Gather the new beam state according to the new beam id.
    next_finished = L.reshape(L.gather_nd(state.finished, gather_idx), state.finished.shape)

    next_finished += L.cast(next_word_id == eos_id, 'int64')
    next_finished = L.cast(next_finished > 0, 'int64')

    next_state = BeamSearchState(log_probs=next_probs,
                                 lengths=next_len,
                                 finished=next_finished)
    output = BeamSearchOutput(scores=scores,
                              predicted_ids=next_word_id,
                              beam_parent_ids=next_beam_id)
    return output, next_state

def matrix_nms(bboxes, scores, score_threshold, post_threshold, nms_top_k,
               keep_top_k, use_gaussian=False, gaussian_sigma=2.):
    scores = L.transpose(scores, [1, 0])
    inds = L.where(scores > score_threshold)
    if len(inds) == 0:
        return L.zeros((0, 6), 'float32') - 1.0

    cate_scores = L.gather_nd(scores, inds)
    cate_labels = inds[:, 1]
    bboxes = L.gather(bboxes, inds[:, 0])

    # sort and keep top nms_top_k
    _, sort_inds = L.argsort(cate_scores, descending=True)
    if nms_top_k > 0 and len(sort_inds) > nms_top_k:
        sort_inds = sort_inds[:nms_top_k]
    bboxes = L.gather(bboxes, sort_inds)
    cate_scores = L.gather(cate_scores, sort_inds)
    cate_labels = L.gather(cate_labels, sort_inds)

    # Matrix NMS
    kernel = 'gaussian' if use_gaussian else 'linear'
    cate_scores = _matrix_nms(bboxes, cate_labels, cate_scores,
                              kernel=kernel, sigma=gaussian_sigma)

    # filter.
    keep = L.where(cate_scores >= post_threshold)
    if len(keep) == 0:
        return L.zeros((0, 6), 'float32') - 1.0
    bboxes = L.gather(bboxes, keep)
    cate_scores = L.gather(cate_scores, keep)
    cate_labels = L.gather(cate_labels, keep)

    # sort and keep keep_top_k
    _, sort_inds = L.argsort(cate_scores, descending=True)
    if len(sort_inds) > keep_top_k:
        sort_inds = sort_inds[:keep_top_k]
    bboxes = L.gather(bboxes, sort_inds)
    cate_scores = L.gather(cate_scores, sort_inds)
    cate_labels = L.gather(cate_labels, sort_inds)

    cate_scores = L.unsqueeze(cate_scores, 1)
    cate_labels = L.unsqueeze(cate_labels, 1)
    cate_labels = L.cast(cate_labels, 'float32')
    pred = L.concat([cate_labels, cate_scores, bboxes], 1)
    return pred

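# Hedged usage sketch for matrix_nms() above (dygraph mode assumed; the threshold
# and top-k values here are illustrative, not taken from the source):
#   bboxes: [N, 4] decoded boxes, scores: [num_classes, N] class scores.
#   pred = matrix_nms(bboxes, scores, score_threshold=0.05, post_threshold=0.05,
#                     nms_top_k=500, keep_top_k=100, use_gaussian=True)
# pred has shape [M, 6], one row per kept detection: (class_id, score, 4 box
# coordinates); when nothing passes the thresholds, an empty [0, 6] tensor is returned.
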
def no_nms(bboxes, scores, score_threshold, keep_top_k):
    scores = L.transpose(scores, [1, 0])
    inds = L.where(scores > score_threshold)
    if len(inds) == 0:
        return L.zeros((0, 6), 'float32') - 1.0

    cate_scores = L.gather_nd(scores, inds)
    cate_labels = inds[:, 1]
    bboxes = L.gather(bboxes, inds[:, 0])

    # sort and keep top keep_top_k
    _, sort_inds = L.argsort(cate_scores, descending=True)
    if keep_top_k > 0 and len(sort_inds) > keep_top_k:
        sort_inds = sort_inds[:keep_top_k]
    bboxes = L.gather(bboxes, sort_inds)
    cate_scores = L.gather(cate_scores, sort_inds)
    cate_labels = L.gather(cate_labels, sort_inds)

    cate_scores = L.unsqueeze(cate_scores, 1)
    cate_labels = L.unsqueeze(cate_labels, 1)
    cate_labels = L.cast(cate_labels, 'float32')
    pred = L.concat([cate_labels, cate_scores, bboxes], 1)
    return pred

def var(input, axis=None, keepdim=False, unbiased=True, out=None, name=None):
    dtype = input.dtype
    if dtype not in ["float32", "float64"]:
        raise ValueError("Layer tensor.var() only supports floating-point "
                         "dtypes, but received {}.".format(dtype))
    rank = len(input.shape)
    axes = axis if axis is not None and axis != [] else range(rank)
    axes = [e if e >= 0 else e + rank for e in axes]
    inp_shape = input.shape if fluid.in_dygraph_mode() else layers.shape(input)
    mean = layers.reduce_mean(input, dim=axis, keep_dim=True, name=name)
    tmp = layers.reduce_mean((input - mean)**2, dim=axis, keep_dim=keepdim, name=name)

    if unbiased:
        n = 1
        for i in axes:
            n *= inp_shape[i]
        if not fluid.in_dygraph_mode():
            n = layers.cast(n, dtype)
            zero_const = layers.fill_constant(shape=[1], dtype=dtype, value=0.0)
            factor = layers.where(n > 1.0, n / (n - 1.0), zero_const)
        else:
            factor = n / (n - 1.0) if n > 1.0 else 0.0
        tmp *= factor

    if out:
        layers.assign(input=tmp, output=out)
        return out
    else:
        return tmp

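# A minimal self-check sketch for var() above (assumes the surrounding fluid
# imports; numpy's ddof=1 is the same unbiased estimator):
import numpy as np
import paddle.fluid as fluid

with fluid.dygraph.guard():
    x = np.random.rand(3, 4).astype('float32')
    t = fluid.dygraph.to_variable(x)
    out = var(t, axis=[1], unbiased=True)
    assert np.allclose(out.numpy(), x.var(axis=1, ddof=1), atol=1e-5)
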
def iou_single(a, b, mask, n_class):
    valid = mask == 1
    valid_flatten = paddle.reshape(valid, (-1, ))
    valid_flatten = paddle.cast(valid_flatten, dtype="int32")
    index = where(valid_flatten == 1)
    if index.shape[0] == 0:
        return paddle.zeros((1, ))

    index = paddle.reshape(index, (1, -1))
    a_flatten = paddle.reshape(a, (1, -1))
    a = paddle.index_sample(a_flatten, index)
    a = paddle.reshape(a, (-1, ))

    b_flatten = paddle.reshape(b, (1, -1))
    b = paddle.index_sample(b_flatten, index)
    b = paddle.reshape(b, (-1, ))

    miou = []
    for i in range(n_class):
        inter = paddle.logical_and(a == i, b == i)
        inter = paddle.cast(inter, dtype='float32')
        union = paddle.logical_or(a == i, b == i)
        union = paddle.cast(union, dtype='float32')
        miou.append(paddle.sum(inter) / (paddle.sum(union) + EPS))
    miou = sum(miou) / len(miou)
    return miou

def masked_select(input, mask):
    """Select values from input according to the boolean mask.

    Args:
        input: input matrix
        mask: mask matrix

    Returns:
        output

    >>> input
    [
        [1, 2, 3],
        [4, 5, 6]
    ]
    >>> mask
    [
        [True, True, False],
        [True, False, False]
    ]
    >>> masked_select(input, mask)
    [1, 2, 4]
    """
    select = layers.where(mask)
    output = layers.gather_nd(input, select)
    return output

def forward(self, src_ids, *args, **kwargs):
    pooled, encoded = ErnieModel.forward(self, src_ids, *args, **kwargs)
    encoded_2d = L.gather_nd(encoded, L.where(src_ids == mask_id))
    encoded_2d = self.mlm(encoded_2d)
    encoded_2d = self.mlm_ln(encoded_2d)
    logits_2d = L.matmul(encoded_2d, self.word_emb.weight, transpose_y=True) + self.mlm_bias
    return logits_2d

def get_subgraph_by_masked(self, graph, mask):
    index = L.where(mask)
    if index.shape[0] > 0:
        edges = graph.edges
        sub_edges = paddle.gather(edges, index, axis=0)
        sg = pgl.Graph(sub_edges, num_nodes=graph.num_nodes)
        return sg
    else:
        return None

def fast_nms(self, boxes, scores, masks, max_num_detections=100):
    iou_threshold = self.nms_thresh
    top_k = self.top_k

    # Sort boxes of the same class by descending score.
    scores, idx = P.argsort(scores, axis=1, descending=True)
    idx = idx[:, :top_k]
    scores = scores[:, :top_k]

    num_classes, num_dets = P.shape(idx)[0], P.shape(idx)[1]

    idx = P.reshape(idx, (-1, ))
    boxes = P.gather(boxes, idx)
    boxes = P.reshape(boxes, (num_classes, num_dets, 4))
    masks = P.gather(masks, idx)
    masks = P.reshape(masks, (num_classes, num_dets, -1))

    # Compute a c x n x n IoU tensor, where each n x n matrix holds the pairwise IoU
    # of the n candidate boxes of that class.
    iou = jaccard(boxes, boxes)

    # Since IoU(A, A) = 1 and IoU(A, B) = IoU(B, A), post-process the IoU tensor:
    # zero out the diagonal and the lower triangle of each channel.
    rows = P.range(0, num_dets, 1, 'int32')
    cols = P.range(0, num_dets, 1, 'int32')
    rows = P.expand(P.reshape(rows, (1, -1)), [num_dets, 1])
    cols = P.expand(P.reshape(cols, (-1, 1)), [1, num_dets])
    tri_mask = P.cast(rows > cols, 'float32')
    tri_mask = P.expand(P.reshape(tri_mask, (1, num_dets, num_dets)), [num_classes, 1, 1])
    iou = tri_mask * iou
    iou_max = P.reduce_max(iou, dim=1)

    # Now just filter out the ones higher than the threshold.
    keep = P.where(iou_max <= iou_threshold)

    # Assign each kept detection to its corresponding class.
    classes = P.range(0, num_classes, 1, 'int32')
    classes = P.expand(P.reshape(classes, (-1, 1)), [1, num_dets])
    classes = P.gather_nd(classes, keep)
    boxes = P.gather_nd(boxes, keep)
    masks = P.gather_nd(masks, keep)
    scores = P.gather_nd(scores, keep)

    # Only keep the top max_num_detections highest scores across all classes.
    scores, idx = P.argsort(scores, axis=0, descending=True)
    idx = idx[:max_num_detections]
    scores = scores[:max_num_detections]
    classes = P.gather(classes, idx)
    boxes = P.gather(boxes, idx)
    masks = P.gather(masks, idx)
    return boxes, masks, classes, scores

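# A minimal NumPy sketch (illustration only) of the upper-triangle trick above:
# after zeroing the diagonal and lower triangle, each box's column maximum is its
# highest IoU against any higher-scoring box of the same class, so the top-scoring
# box (index 0) always survives.
import numpy as np

iou = np.array([[1.0, 0.8, 0.2],
                [0.8, 1.0, 0.1],
                [0.2, 0.1, 1.0]])  # pairwise IoU of 3 score-sorted boxes of one class
iou = np.triu(iou, k=1)           # zero the diagonal and the lower triangle
iou_max = iou.max(axis=0)         # [0.0, 0.8, 0.2]
keep = iou_max <= 0.5             # box 1 is suppressed by box 0; boxes 0 and 2 survive
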
def fast_nms(boxes, scores, conf_thresh, nms_thresh, keep_top_k, nms_top_k):
    '''
    :param boxes:  [?, 4]
    :param scores: [80, ?]
    '''
    # Sort boxes of the same class by descending score.
    scores, idx = P.argsort(scores, axis=1, descending=True)
    idx = idx[:, :keep_top_k]
    scores = scores[:, :keep_top_k]

    num_classes, num_dets = P.shape(idx)[0], P.shape(idx)[1]

    idx = P.reshape(idx, (-1, ))
    boxes = P.gather(boxes, idx)
    boxes = P.reshape(boxes, (num_classes, num_dets, 4))

    # Compute a c x n x n IoU tensor, where each n x n matrix holds the pairwise IoU
    # of the n candidate boxes of that class.
    iou = _iou(boxes, boxes)

    # Since IoU(A, A) = 1 and IoU(A, B) = IoU(B, A), post-process the IoU tensor:
    # zero out the diagonal and the lower triangle of each channel.
    rows = P.range(0, num_dets, 1, 'int32')
    cols = P.range(0, num_dets, 1, 'int32')
    rows = P.expand(P.reshape(rows, (1, -1)), [num_dets, 1])
    cols = P.expand(P.reshape(cols, (-1, 1)), [1, num_dets])
    tri_mask = P.cast(rows > cols, 'float32')
    tri_mask = P.expand(P.reshape(tri_mask, (1, num_dets, num_dets)), [num_classes, 1, 1])
    iou = tri_mask * iou
    iou_max = P.reduce_max(iou, dim=1)

    # Within a class, a box is discarded if its highest IoU with any higher-scoring box
    # exceeds nms_thresh; the box at index 0 is always kept.
    keep = P.where(iou_max <= nms_thresh)

    # Assign each kept detection to its corresponding class.
    classes = P.range(0, num_classes, 1, 'int32')
    classes = P.expand(P.reshape(classes, (-1, 1)), [1, num_dets])
    classes = P.gather_nd(classes, keep)
    boxes = P.gather_nd(boxes, keep)
    scores = P.gather_nd(scores, keep)

    # Only keep the top nms_top_k highest scores across all classes.
    scores, idx = P.argsort(scores, axis=0, descending=True)
    idx = idx[:nms_top_k]
    scores = scores[:nms_top_k]
    classes = P.gather(classes, idx)
    boxes = P.gather(boxes, idx)
    return boxes, scores, classes

def masked_select(input, mask):
    """masked_select

    Slice values out of input according to a boolean mask.

    Args:
        input: input tensor to be selected from.
        mask: a bool tensor used for slicing.

    Return:
        The part of input where mask is True.
    """
    index = L.where(mask)
    return L.gather(input, index)

def ohem_single(score, gt_text, training_mask):
    gt_part = paddle.cast(gt_text > 0.5, dtype='float32')
    gt_tr_part = paddle.cast(paddle.logical_and(gt_text > 0.5, training_mask <= 0.5),
                             dtype='float32')
    pos_num = int(paddle.sum(gt_part)) - int(paddle.sum(gt_tr_part))
    # pos_num = int(np.sum(gt_text.numpy() > 0.5)) - int(np.sum((gt_text.numpy() > 0.5) & (training_mask.numpy() <= 0.5)))
    # pos_num = int(paddle.sum(gt_text > 0.5)) - int(paddle.sum((gt_text > 0.5) & (training_mask <= 0.5)))

    if pos_num == 0:
        # selected_mask = gt_text.copy() * 0  # may be not good
        selected_mask = training_mask
        selected_mask = paddle.reshape(
            selected_mask, (1, selected_mask.shape[0], selected_mask.shape[1]))
        selected_mask = paddle.cast(selected_mask, dtype='float32')
        return selected_mask

    neg_num = int(np.sum(gt_text.numpy() <= 0.5))
    neg_num = int(min(pos_num * 3, neg_num))

    if neg_num == 0:
        selected_mask = training_mask
        # selected_mask = selected_mask.view(1, selected_mask.shape[0], selected_mask.shape[1]).float()
        selected_mask = paddle.reshape(
            selected_mask, (1, selected_mask.shape[0], selected_mask.shape[1]))
        selected_mask = paddle.cast(selected_mask, dtype='float32')
        return selected_mask

    gt_text_flatten = paddle.reshape(gt_text, (-1, ))
    index = where(gt_text_flatten <= 0.5)
    index = paddle.reshape(index, (1, -1))
    score_flatten = paddle.reshape(score, (1, -1))
    neg_score = paddle.index_sample(score_flatten, index)
    neg_score = paddle.reshape(neg_score, (-1, ))

    neg_score_sorted = paddle.sort(-neg_score)
    threshold = -neg_score_sorted[neg_num - 1]
    item1 = paddle.logical_or(score >= threshold, gt_text > 0.5)
    selected_mask = paddle.logical_and(item1, training_mask > 0.5)
    # selected_mask = selected_mask.reshape(1, selected_mask.shape[0], selected_mask.shape[1]).float()
    selected_mask = paddle.reshape(
        selected_mask, (1, selected_mask.shape[0], selected_mask.shape[1]))
    selected_mask = paddle.cast(selected_mask, dtype='float32')
    return selected_mask

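# A minimal NumPy sketch (illustration only) of the hard-negative selection above:
# all positives are kept, plus at most 3 * pos_num of the highest-scoring negatives.
import numpy as np

score   = np.array([0.9, 0.1, 0.8, 0.6, 0.2, 0.3])
gt_text = np.array([1.0, 0.0, 0.0, 0.0, 0.0, 0.0])       # 1 positive, 5 negatives
pos_num = int((gt_text > 0.5).sum())                      # 1
neg_num = min(pos_num * 3, int((gt_text <= 0.5).sum()))   # 3
neg_score_sorted = np.sort(-score[gt_text <= 0.5])        # negatives, descending via negation
threshold = -neg_score_sorted[neg_num - 1]                # 0.3, score of the 3rd hardest negative
selected = (score >= threshold) | (gt_text > 0.5)         # the positive + the 3 hardest negatives
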
def get_loss(self, kernel_preds, cate_preds, mask_feats, ins_labels, cate_labels,
             grid_orders, fg_nums):
    '''
    The LoD information was dropped, so this had to be rewritten.
    :param kernel_preds: each element has shape [N, 256, seg_num_grid, seg_num_grid],
                         the convolution kernel predicted per grid cell, ordered from
                         small to large receptive field.
    :param cate_preds:   each element has shape [N, 80, seg_num_grid, seg_num_grid],
                         the per-grid-cell class scores before sigmoid(), ordered from
                         small to large receptive field.
    :param mask_feats:   [bs, 256, s4, s4] mask prototypes.
    :param ins_labels:   5 elements, one per output level: the masks of that level's
                         positive samples. Each element has shape [M, s4, s4], where the
                         positive-sample masks of all images of that level are concatenated
                         without distinction, from small to large receptive field.
    :param cate_labels:  5 elements, one per output level: the class ids of the positive
                         samples, each of shape [N*seg_num_grid*seg_num_grid, ], from
                         small to large receptive field.
    :param grid_orders:  5 elements, one per output level: the indices of the positive
                         samples inside [N*seg_num_grid*seg_num_grid, ], each of shape
                         [M, ], aligned with the masks in ins_labels.
    :param fg_nums:      [N, ] number of positive samples per image.
    :return:
    '''
    new_kernel_preds = []
    gathered_img_id_list = []
    for kernel_preds_level, grid_orders_level in zip(kernel_preds, grid_orders):
        # First, pull out the kernels predicted for the positive samples.
        # [N, 256, seg_num_grid, seg_num_grid] -> [N, seg_num_grid, seg_num_grid, 256]
        kernel_preds_level = L.transpose(kernel_preds_level, perm=[0, 2, 3, 1])
        reshape_pred = L.reshape(
            kernel_preds_level,
            shape=(-1, L.shape(kernel_preds_level)[-1]))  # [N*seg_num_grid*seg_num_grid, 256]
        # [M=5, 256]; e.g. with batch_size=2, image 1 has 2 positive samples and image 2 has 3.
        gathered_pred = L.gather(reshape_pred, index=grid_orders_level)

        # Then, work out which image each positive sample belongs to.
        batch_size = L.shape(kernel_preds_level)[0]
        seg_num_grid = L.shape(kernel_preds_level)[1]
        img_ids = L.range(0, batch_size, 1, dtype='int32')  # [N, ]
        img_ids = L.unsqueeze(img_ids, axes=[1, 2])  # [N, 1, 1]
        img_ids = L.expand(img_ids, [1, seg_num_grid, seg_num_grid])  # [N, seg_num_grid, seg_num_grid]
        img_ids = L.reshape(img_ids, [-1, ])  # [N*seg_num_grid*seg_num_grid, ]
        gathered_img_id = L.gather(img_ids, index=grid_orders_level)  # [M=5, ] image id of each positive sample

        new_kernel_preds.append(gathered_pred)
        gathered_img_id_list.append(gathered_img_id)

    # Generate the masks.
    ins_pred_list = []
    for kernel_pred, gathered_img_id in zip(new_kernel_preds, gathered_img_id_list):
        # kernel_pred, shape [5, 256]: the first image is padded with one kernel so every
        # image in the batch predicts the same number of kernels.
        # gathered_img_id, data [5, ]: the image id of each positive sample.
        # The positive kernels of image 1 convolve image 1's prototypes, those of image 2
        # convolve image 2's prototypes, and so on.
        batch_size = L.shape(mask_feats)[0]
        cur_ins_pred = []
        for i in range(batch_size):
            mask_feat = mask_feats[i:i + 1]  # [1, 256, s4, s4] mask prototypes
            interest = L.where(gathered_img_id == i)
            kr = L.gather(kernel_pred, interest)  # [m, 256]
            kr = L.unsqueeze(kr, [2, 3])  # [m, 256, 1, 1]
            pred_mask = F.conv2d(mask_feat, kr)  # [1, m, s4, s4]
            cur_ins_pred.append(L.squeeze(pred_mask, [0]))
        cur_ins_pred = L.concat(cur_ins_pred, 0)  # [M, s4, s4]
        ins_pred_list.append(cur_ins_pred)

    num_ins = fluid.layers.reduce_sum(fg_nums)  # total positive samples over all images and output levels
    cate_preds = [  # each element becomes [N*seg_num_grid*seg_num_grid, 80]
        fluid.layers.reshape(
            fluid.layers.transpose(cate_pred, [0, 2, 3, 1]),
            shape=(-1, self.cate_out_channels)) for cate_pred in cate_preds
    ]
    flatten_cate_preds = fluid.layers.concat(cate_preds)  # [N*g1*g1 + N*g2*g2 + ..., 80]

    cate_labels = fluid.layers.concat(cate_labels)
    cate_labels = fluid.layers.unsqueeze(cate_labels, 1)
    loss_ins, loss_cate = self.solov2_loss(ins_pred_list, ins_labels,
                                           flatten_cate_preds, cate_labels, num_ins)

    return {'loss_ins': loss_ins, 'loss_cate': loss_cate}

def fastnms(all_pred_boxes, all_pred_scores, resize_shape, origin_shape,
            conf_thresh, nms_thresh, keep_top_k, nms_top_k, use_yolo_box):
    '''
    :param all_pred_boxes:  [batch_size, -1, 4]
    :param all_pred_scores: [batch_size, -1, 80]
    :param resize_shape:    [batch_size, 2]
    :param origin_shape:    [batch_size, 2]
    '''
    conf_preds = P.transpose(all_pred_scores, perm=[0, 2, 1])  # [1, 80, -1]
    cur_scores = conf_preds[0]  # [80, -1]
    conf_scores = P.reduce_max(cur_scores, dim=0)  # [-1, ]

    # If keep is [None] and is then used in gather(), the error
    #     cudaGetLastError invalid configuration argument errno: 9
    # is raised. To avoid it, keep must never be [None], so when keep is [None]
    # it is assigned the index [[0]] here.
    keep = P.where(conf_scores > conf_thresh)

    def exist_objs_1(keep):
        return keep

    def no_objs_1():
        keep_extra = P.zeros((1, 1), 'int64')
        return keep_extra

    keep = P.cond(P.shape(keep)[0] == 0, no_objs_1, lambda: exist_objs_1(keep))
    scores = P.gather(all_pred_scores[0], keep)
    scores = P.transpose(scores, perm=[1, 0])
    boxes = P.gather(all_pred_boxes[0], keep)

    boxes, scores, classes = fast_nms(boxes, scores, conf_thresh, nms_thresh,
                                      keep_top_k, nms_top_k)

    # Filter by score once more. As noted above, a box is kept as long as its highest
    # class score exceeds the threshold; but when the matrix above is built, that box is
    # effectively replicated 80 times, one copy per class. The scores of the non-top
    # classes may fall below the threshold and must be filtered out. Fast NMS therefore
    # has this quirk: if a box's top score exceeds the threshold and one of its non-top
    # class scores also exceeds it, both copies may survive, sharing the same xywh.
    keep = P.where(scores > conf_thresh)

    def exist_objs_2(keep, boxes, classes, scores):
        boxes = P.gather(boxes, keep)
        classes = P.gather(classes, keep)
        scores = P.gather(scores, keep)
        return boxes, classes, scores

    def no_objs_2(boxes, classes, scores):
        keep = P.zeros((1, 1), 'int64')
        boxes = P.gather(boxes, keep)
        classes = P.gather(classes, keep)
        scores = P.gather(scores, keep)
        scores -= 2.0  # deliberately negative so the Python side filters them out
        return boxes, classes, scores

    boxes, classes, scores = P.cond(
        P.shape(keep)[0] == 0,
        lambda: no_objs_2(boxes, classes, scores),
        lambda: exist_objs_2(keep, boxes, classes, scores))

    # Convert to top-left and bottom-right corner coordinates.
    boxes = P.concat([boxes[:, :2] - boxes[:, 2:] * 0.5,
                      boxes[:, :2] + boxes[:, 2:] * 0.5], axis=-1)

    # Scale back to the original image size.
    resize_shape_f = P.cast(resize_shape, 'float32')
    origin_shape_f = P.cast(origin_shape, 'float32')
    if use_yolo_box:
        scale = origin_shape_f
    else:
        scale = origin_shape_f / resize_shape_f
    scale = P.expand(scale, [1, 2])
    # Only supported with batch size 1: dim 0 of scale is the batch size, while dim 0
    # of boxes is the number of objects predicted for this image.
    boxes *= scale

    # Batch dimension first.
    boxes = P.reshape(boxes, (1, -1, 4), name='boxes')
    scores = P.reshape(scores, (1, -1), name='scores')
    classes = P.reshape(classes, (1, -1), name='classes')
    return [boxes, scores, classes]

def seq2seq(model, tokenizer, args):
    log.info('Training starts with args: %r' % args)
    attn_id = tokenizer.vocab[args.attn_token]

    def gen_mask(batch_ids, mask_type='bidi', query_len=None, pad_value=0):
        if query_len is None:
            query_len = batch_ids.shape[1]
        if mask_type != 'empty':
            mask = (batch_ids != pad_value).astype(np.float32)
            mask = np.tile(np.expand_dims(mask, 1), [1, query_len, 1])
            if mask_type == 'causal':
                assert query_len == batch_ids.shape[1]
                mask = np.tril(mask)
            elif mask_type == 'causal_without_diag':
                assert query_len == batch_ids.shape[1]
                mask = np.tril(mask, -1)
            elif mask_type == 'diag':
                assert query_len == batch_ids.shape[1]
                mask = np.stack([np.diag(np.diag(m)) for m in mask], 0)
        else:
            mask = np.zeros_like(batch_ids).astype(np.float32)
            mask = np.tile(np.expand_dims(mask, 1), [1, query_len, 1])
        return mask

    def make_some_noice(ids):
        if args.use_random_noice:
            noice_ids = np.random.randint(1, len(tokenizer.vocab), size=ids.shape)
        else:
            noice_ids = np.ones_like(ids) * tokenizer.vocab['[NOISE]']
        pos, = np.where(np.ones_like(ids))
        np.random.shuffle(pos)
        pos = pos[:int(args.noise_prob * len(pos))]
        ids[pos, ] = noice_ids[pos, ]
        return ids

    def map_fn(example_id, src_ids, tgt_ids):
        src_ids = src_ids[:args.max_encode_len]
        tgt_ids = tgt_ids[:args.max_decode_len]
        src_ids, src_sids = tokenizer.build_for_ernie(src_ids)
        src_pids = np.arange(len(src_ids))

        tgt_ids, tgt_sids = tokenizer.build_for_ernie(tgt_ids)
        tgt_pids = np.arange(len(tgt_ids)) + len(src_ids)  # continuous positions
        tgt_sids = np.ones_like(tgt_sids) * args.tgt_type_id

        attn_ids = np.ones_like(tgt_ids) * attn_id
        if args.noise_prob > 0.:
            tgt_labels = deepcopy(tgt_ids)
            tgt_ids = make_some_noice(tgt_ids)  # corrupted
        else:
            tgt_labels = tgt_ids
        return (example_id, src_ids, src_pids, src_sids, tgt_ids, tgt_pids,
                tgt_sids, attn_ids, tgt_labels)

    def after_padding(example_id, src_ids, src_pids, src_sids, tgt_ids, tgt_pids,
                      tgt_sids, attn_ids, tgt_labels):
        '''
        attention mask:
        ***   src,  tgt, attn
        src   00,   01,   02
        tgt   10,   11,   12
        attn  20,   21,   22

               s1, s2 | t1 t2 t3 | attn1 attn2 attn3
        s1      1,  1 |  0, 0, 0 |   0,    0,    0,
        s2      1,  1 |  0, 0, 0 |   0,    0,    0,
        -
        t1      1,  1 |  1, 0, 0 |   0,    0,    0,
        t2      1,  1 |  1, 1, 0 |   0,    0,    0,
        t3      1,  1 |  1, 1, 1 |   0,    0,    0,
        -
        attn1   1,  1 |  0, 0, 0 |   1,    0,    0,
        attn2   1,  1 |  1, 0, 0 |   0,    1,    0,
        attn3   1,  1 |  1, 1, 0 |   0,    0,    1,

        for details, see Fig. 3 of https://arxiv.org/abs/2001.11314
        '''
        src_len = src_ids.shape[1]
        tgt_len = tgt_ids.shape[1]
        mask_00 = gen_mask(src_ids, 'bidi', query_len=src_len)
        mask_01 = gen_mask(tgt_ids, 'empty', query_len=src_len)
        mask_02 = gen_mask(attn_ids, 'empty', query_len=src_len)

        mask_10 = gen_mask(src_ids, 'bidi', query_len=tgt_len)
        mask_11 = gen_mask(tgt_ids, 'causal', query_len=tgt_len)
        mask_12 = gen_mask(attn_ids, 'empty', query_len=tgt_len)

        mask_20 = gen_mask(src_ids, 'bidi', query_len=tgt_len)
        mask_21 = gen_mask(tgt_ids, 'causal_without_diag', query_len=tgt_len)
        mask_22 = gen_mask(attn_ids, 'diag', query_len=tgt_len)
        '''
        mask = np.concatenate([
            np.concatenate([mask_00, mask_01, mask_02], 2),
            np.concatenate([mask_10, mask_11, mask_12], 2),
            np.concatenate([mask_20, mask_21, mask_22], 2),
        ], 1)
        ids = np.concatenate([src_ids, tgt_ids, attn_ids], 1)
        pids = np.concatenate([src_pids, tgt_pids, tgt_pids], 1)
        sids = np.concatenate([src_sids, tgt_sids, tgt_sids], 1)
        '''
        mask_src_2_src = mask_00
        mask_tgt_2_srctgt = np.concatenate([mask_10, mask_11], 2)
        mask_attn_2_srctgtattn = np.concatenate([mask_20, mask_21, mask_22], 2)

        tgt_labels = tgt_labels[np.where(tgt_labels != 0)]
        return (example_id, src_ids, src_sids, src_pids, tgt_ids, tgt_sids,
                tgt_pids, attn_ids, mask_src_2_src, mask_tgt_2_srctgt,
                mask_attn_2_srctgtattn, tgt_labels)

    bytes_vocab = {k.encode('utf8'): v for k, v in tokenizer.vocab.items()}
    feature_column = propeller.data.FeatureColumns([
        propeller.data.LabelColumn('id'),
        propeller.data.TextColumn('src', unk_id=tokenizer.unk_id, vocab_dict=bytes_vocab),
        propeller.data.TextColumn('tgt', unk_id=tokenizer.unk_id, vocab_dict=bytes_vocab),
    ])

    train_ds = feature_column.build_dataset('train',
                                            data_dir=os.path.join(args.data_dir, 'train'),
                                            shuffle=False, repeat=True, use_gz=False) \
        .map(map_fn)

    dev_ds = feature_column.build_dataset('dev',
                                          data_dir=os.path.join(args.data_dir, 'dev'),
                                          shuffle=False, repeat=False, use_gz=False) \
        .map(map_fn) \
        .padded_batch(args.eval_bsz) \
        .map(after_padding)

    log.debug('shard %d of %d' % (D.parallel.Env().dev_id, D.parallel.Env().nranks))
    train_ds = train_ds.shard(
        D.parallel.Env().nranks,
        D.parallel.Env().dev_id).shuffle(10000).padded_batch(args.bsz).map(after_padding)
    dev_ds = dev_ds.shard(D.parallel.Env().nranks, D.parallel.Env().dev_id)

    shapes = [[None, None]] * 7 + [[None, None, None]] * 3 + [[None]]
    types = ['int64'] * 11
    train_ds.data_shapes = shapes
    train_ds.data_types = types
    dev_ds.data_shapes = shapes
    dev_ds.data_types = types

    vocab_size, _ = model.word_emb.weight.shape
    ctx = D.parallel.prepare_context()
    model = D.parallel.DataParallel(model, ctx)
    g_clip = F.clip.GradientClipByGlobalNorm(1.0)
    opt = AdamW(learning_rate=LinearDecay(args.lr,
                                          int(args.warmup_proportion * args.max_steps),
                                          args.max_steps),
                parameter_list=model.parameters(),
                weight_decay=args.wd,
                grad_clip=g_clip)
    attn_id = tokenizer.vocab[args.attn_token]
    for step, data in enumerate(train_ds.start(place)):
        (example_id, src_ids, src_sids, src_pids, tgt_ids, tgt_sids, tgt_pids,
         attn_ids, mask_src_2_src, mask_tgt_2_srctgt, mask_attn_2_srctgtattn,
         tgt_labels) = data

        _, __, info = model(src_ids,
                            sent_ids=src_sids,
                            pos_ids=src_pids,
                            attn_bias=mask_src_2_src,
                            encode_only=True)
        cached_k, cached_v = info['caches']
        _, __, info = model(tgt_ids,
                            sent_ids=tgt_sids,
                            pos_ids=tgt_pids,
                            attn_bias=mask_tgt_2_srctgt,
                            past_cache=(cached_k, cached_v),
                            encode_only=True)
        cached_k2, cached_v2 = info['caches']
        past_cache_k = [L.concat([k, k2], 1) for k, k2 in zip(cached_k, cached_k2)]
        past_cache_v = [L.concat([v, v2], 1) for v, v2 in zip(cached_v, cached_v2)]

        if args.label_smooth > 0.:
            tgt_labels = L.label_smooth(F.one_hot(tgt_labels, vocab_size),
                                        epsilon=args.label_smooth)
        loss, _, __ = model(attn_ids,
                            sent_ids=tgt_sids,
                            pos_ids=tgt_pids,
                            attn_bias=mask_attn_2_srctgtattn,
                            past_cache=(past_cache_k, past_cache_v),
                            tgt_labels=tgt_labels,
                            tgt_pos=L.where(attn_ids == attn_id))

        scaled_loss = model.scale_loss(loss)
        scaled_loss.backward()
        model.apply_collective_grads()
        opt.minimize(scaled_loss)
        model.clear_gradients()

        if step % 10 == 0:
            loss = loss.numpy()
            ppl = np.exp(loss)
            log.debug('[step %d]train loss %.5f, ppl %.5f, lr %.3e' %
                      (step, loss, ppl, opt.current_step_lr()))
        if args.save_dir is not None and step % 1000 == 0 and D.parallel.Env().dev_id == 0:
            F.save_dygraph(model.state_dict(), args.save_dir)
        if args.predict_output_dir is not None and step > args.skip_eval_steps and step % args.eval_steps == 0:
            assert os.path.exists(args.predict_output_dir), \
                'predict_output_dir not found: %s' % args.predict_output_dir
            log.debug('doing predict on gpu %d...' % D.parallel.Env().dev_id)
            evaluate(model, dev_ds, step, args)
        if step > args.max_steps:
            break
    evaluate(model, dev_ds, step, args)

    if args.save_dir is not None:
        F.save_dygraph(model.state_dict(), args.save_dir)

def __call__(self,
             predictions,
             labels_pos_mask,         # Shape: [batch_size, 19248, 1]
             labels_neg_mask,         # Shape: [batch_size, 19248, 1]
             labels_allboxes_vector,  # Shape: [batch_size, 19248, 8]
             segment_t,               # list, Shape: [batch_size, 19248, 1]
             label_masks,
             labels_best_truth_idx,
             labels_pos_index,
             labels_pos_cid,          # Shape: [batch_size, 19248]
             labels_pos_cid2,         # Shape: [batch_size, 19248]
             priors,
             class_vectors,
             batch_size,
             use_maskiou=True,
             use_ce_loss=True,
             use_ghm_c_loss=False,
             use_focal_loss=False,
             use_ohem_loss=False):
    pred_allboxes_encode_x0y0x1y1 = predictions['loc']  # Shape: [batch_size, 19248, 4]
    pred_allboxes_conf = predictions['conf']            # Shape: [batch_size, 19248, 1+80]
    pred_allboxes_mask_coef = predictions['mask']       # Shape: [batch_size, 19248, num_prototypes=32]
    pred_proto = predictions['proto']                   # Shape: [batch_size, s4=138, s4=138, num_prototypes=32]
    pred_segm = predictions['segm']                     # Shape: [batch_size, num_classes=80, s8=69, s8=69]

    labels_allboxes_x0y0x1y1 = labels_allboxes_vector[:, :, 0:4]         # Shape: [batch_size, 19248, 4]
    labels_allboxes_decode_x0y0x1y1 = labels_allboxes_vector[:, :, 4:8]  # Shape: [batch_size, 19248, 4]
    losses = {}

    # 1. bbox_loss, computed only for positives.
    # bbox_alpha = 1.5
    # bbox_loss = P.smooth_l1(P.reshape(pred_allboxes_encode_x0y0x1y1, (-1, 4)),
    #                         P.reshape(labels_allboxes_x0y0x1y1, (-1, 4)))
    # bbox_loss = P.reshape(labels_pos_mask, (-1, 1)) * bbox_loss
    # bbox_loss = P.reduce_sum(bbox_loss) * bbox_alpha
    # losses['B'] = bbox_loss

    # 1. bbox_loss: ciou_loss
    pred_x0y0x1y1 = []
    for idx in range(batch_size):
        temp = decode(pred_allboxes_encode_x0y0x1y1[idx], priors)
        pred_x0y0x1y1.append(temp)
    pred_x0y0x1y1 = P.concat(pred_x0y0x1y1, axis=0)                # Shape: [batch_size*num_priors, 4]
    pred_x0y0x1y1 = P.reshape(pred_x0y0x1y1, (batch_size, -1, 4))  # Shape: [batch_size, num_priors, 4]

    ciou = P.reshape(self.bbox_ciou(pred_x0y0x1y1, labels_allboxes_decode_x0y0x1y1),
                     (batch_size, -1, 1))  # (batch_size, num_priors, 1)

    # The ciou_loss weight of each predicted box = 2 - (ground-truth area / image area).
    gt_area = (labels_allboxes_decode_x0y0x1y1[:, :, 2:3] - labels_allboxes_decode_x0y0x1y1[:, :, 0:1]) * \
              (labels_allboxes_decode_x0y0x1y1[:, :, 3:4] - labels_allboxes_decode_x0y0x1y1[:, :, 1:2])
    bbox_loss_scale = 2.0 - gt_area
    ciou_loss = labels_pos_mask * bbox_loss_scale * (1 - ciou)
    bbox_alpha = 1.5
    ciou_loss = P.reduce_sum(ciou_loss) * bbox_alpha
    losses['B'] = ciou_loss

    # 2. mask_loss, computed only for positives.
    mask_h = P.shape(pred_proto)[1]
    mask_w = P.shape(pred_proto)[2]
    loss_m = 0
    maskiou_t_list = []
    maskiou_net_input_list = []
    label_t_list = []
    for idx in range(batch_size):
        # e.g. [[0], [0], ..., [0]]: pick out, for each of the 8 positives, the index of its
        # best-matching gt (its index inside label_x0y0x1y1cid[idx]). With a single gt,
        # every index is 0.
        labels_pos_index[idx].stop_gradient = True
        cur_gt = P.gather(labels_best_truth_idx[idx], labels_pos_index[idx])  # (?, 1)
        cur_gt.stop_gradient = True
        cur_x0y0x1y1 = P.gather(labels_allboxes_decode_x0y0x1y1[idx], labels_pos_index[idx])  # (?, 4)

        proto_masks = pred_proto[idx]  # (138, 138, 32)
        # pred_allboxes_mask_coef: (batch_size, 19248, 32). Pick the mask coefficients
        # predicted by the 8 positives.
        proto_coef = P.gather(pred_allboxes_mask_coef[idx], labels_pos_index[idx])  # (?, 32)

        # (?, 138, 138): pull out the ground-truth masks matched by the 8 positives. Since
        # they match the same gt, it is the same mask repeated 8 times.
        mask_t = P.gather(label_masks[idx], cur_gt)  # (?, 138, 138)
        # (?, ): pull out the ground-truth cids matched by the 8 positives; likewise the
        # same cid repeated 8 times.
        label_t = P.gather(labels_pos_cid[idx], labels_pos_index[idx])  # (?, )

        # Size: (138, 138, ?) = prototypes x transposed coefficients
        pred_masks = P.matmul(proto_masks, proto_coef, transpose_y=True)
        pred_masks = P.sigmoid(pred_masks)  # sigmoid activation
        pred_masks = crop(pred_masks, cur_x0y0x1y1)
        pred_masks = P.transpose(pred_masks, perm=[2, 0, 1])

        masks_pos_loss = mask_t * (0 - P.log(pred_masks + 1e-9))            # binary cross entropy; tiny constant guards against nan
        masks_neg_loss = (1 - mask_t) * (0 - P.log(1 - pred_masks + 1e-9))  # binary cross entropy; tiny constant guards against nan
        pre_loss = (masks_pos_loss + masks_neg_loss)
        pre_loss = P.reduce_sum(pre_loss, dim=[1, 2])

        # The smaller the gt area, the larger the weight of its mask loss.
        cur_cxcywh = center_size(cur_x0y0x1y1)
        gt_box_width = cur_cxcywh[:, 2]
        gt_box_height = cur_cxcywh[:, 3]
        pre_loss = pre_loss / (gt_box_width * gt_box_height)
        loss_m += P.reduce_sum(pre_loss)

        if use_maskiou:
            # Masks in mask_t with area <= 5*5 are supposed to be discarded.
            # discard_mask_area = 5*5
            '''
            The GPU build of PaddlePaddle 1.6.2 has a bug: if select is [None] and is then
            used in gather(), the error
                cudaGetLastError invalid configuration argument errno: 9
            is raised. The CPU build runs fine.
            To avoid it, select must never be [None], so no area filtering is done here
            and all of mask_t is kept.
            '''
            discard_mask_area = -1
            gt_mask_area = P.reduce_sum(mask_t, dim=[1, 2])
            gt_mask_area.stop_gradient = True
            select = P.where(gt_mask_area > discard_mask_area)
            select.stop_gradient = True
            pred_masks = P.gather(pred_masks, select)
            mask_t = P.gather(mask_t, select)
            label_t = P.gather(label_t, select)
            label_t.stop_gradient = True

            maskiou_net_input = P.reshape(pred_masks,
                                          (P.shape(pred_masks)[0], 1, mask_h, mask_w))
            pred_masks = P.cast(pred_masks > 0.5, 'float32')  # round to 0/1
            maskiou_t = self._mask_iou(pred_masks, mask_t)    # (8, )
            maskiou_net_input_list.append(maskiou_net_input)  # (8, 1, 138, 138)
            maskiou_t_list.append(maskiou_t)                  # (8, )
            label_t_list.append(label_t)                      # (8, )
    mask_alpha = 6.125
    losses['M'] = loss_m * mask_alpha / mask_h / mask_w

    # The rest.
    if use_maskiou:
        maskiou_net_input = P.concat(maskiou_net_input_list, axis=0)  # (21, 1, 138, 138) masks predicted by the 21 positives
        maskiou_t = P.concat(maskiou_t_list, axis=0)  # (21, ) IoU between predicted and ground-truth masks of the 21 positives
        label_t = P.concat(label_t_list, axis=0)      # (21, ) cids predicted by the 21 positives
        label_t.stop_gradient = True                  # because they are integers?
        maskiou_targets = [maskiou_net_input, maskiou_t, label_t]

    # 3. conf_loss.
    conf_alpha = 1.0
    if use_ce_loss:
        conf_loss = self.ce_conf_loss(pred_allboxes_conf, labels_pos_mask, labels_neg_mask,
                                      class_vectors, labels_pos_cid2, gt_area)
    elif use_ghm_c_loss:
        conf_loss = self.ghm_c_loss(pred_allboxes_conf, labels_pos_mask, labels_neg_mask,
                                    class_vectors, labels_pos_cid2)
    elif use_focal_loss:
        conf_loss = self.focal_conf_loss(pred_allboxes_conf, labels_pos_mask, labels_neg_mask,
                                         class_vectors, labels_pos_cid2)
    elif use_ohem_loss:
        conf_loss = self.ohem_conf_loss(pred_allboxes_conf, batch_size, labels_neg_mask,
                                        labels_pos_mask, labels_pos_index, class_vectors,
                                        labels_pos_cid)
    losses['C'] = conf_loss * conf_alpha

    # 4. mask_iou_loss, computed only for positives.
    if use_maskiou:
        # maskiou_net_input (21, 1, 138, 138) masks predicted by the 21 positives
        # maskiou_t (21, ) IoU between predicted and ground-truth masks of the 21 positives
        # label_t (21, ) cids predicted by the 21 positives
        maskiou_net_input, maskiou_t, label_t = maskiou_targets
        maskiou_p = maskiou_net(maskiou_net_input, self.num_classes - 1)
        maskiou_p = P.reduce_max(maskiou_p, dim=[2, 3])  # global max pooling, (21, 80)
        temp_mask = P.gather(class_vectors, label_t)     # one-hot mask (21, 81)
        temp_mask = temp_mask[:, 1:]                     # one-hot mask (21, 80)
        maskiou_p = temp_mask * maskiou_p                # keep only the channel of the true class, (21, 80)
        maskiou_p = P.reduce_sum(maskiou_p, dim=1, keep_dim=True)  # (21, 1)
        loss_i = P.smooth_l1(maskiou_p, P.reshape(maskiou_t, (P.shape(maskiou_t)[0], 1)))
        maskiou_alpha = 25.0
        losses['I'] = maskiou_alpha * P.reduce_sum(loss_i)

    # 5. semantic_segmentation_loss, computed only for positives.
    mask_h = P.shape(pred_segm)[2]
    mask_w = P.shape(pred_segm)[3]
    loss_s = 0.0
    for idx in range(batch_size):
        cur_segment = pred_segm[idx]  # (80, 69, 69)
        l = P.sigmoid_cross_entropy_with_logits(cur_segment, segment_t[idx])
        loss_s += P.reduce_sum(l)
    semantic_segmentation_alpha = 1.0
    losses['S'] = loss_s / mask_h / mask_w * semantic_segmentation_alpha

    total_num_pos = P.cast(P.reduce_sum(labels_pos_mask), 'float32')
    for k in losses:
        if k not in ('S', ):
            losses[k] /= total_num_pos
        else:
            losses[k] /= batch_size
    total_loss = 0.0
    for k in losses:
        total_loss += losses[k]

    # Loss Key:
    #  - B: Box Localization Loss
    #  - M: Mask Loss
    #  - C: Class Confidence Loss
    #  - I: MaskIou Loss
    #  - S: Semantic Segmentation Loss
    # return losses['M'], losses['C']
    return losses, total_loss

def ohem_conf_loss(self, pred_allboxes_conf, batch_size, labels_neg_mask,
                   labels_pos_mask, labels_pos_index, class_vectors, labels_pos_cid):
    batch_conf = P.reshape(pred_allboxes_conf, (-1, self.num_classes))
    loss_c = log_sum_exp(batch_conf) - batch_conf[:, 0]
    loss_c = P.reshape(loss_c, (batch_size, -1))  # (batch_size, 19248)
    labels_neg_mask = P.concat(labels_neg_mask, axis=0)  # (batch_size*19248, 1)
    labels_neg_mask = P.reshape(labels_neg_mask, (batch_size, -1))  # (batch_size, 19248)
    loss_c = labels_neg_mask * loss_c  # keep only the negative-sample losses, (batch_size, 19248)
    sorted_loss_c, loss_idx = P.argsort(loss_c, axis=-1, descending=True)

    labels_pos_mask = P.concat(labels_pos_mask, axis=0)  # (batch_size*19248, 1)
    labels_pos_mask = P.reshape(labels_pos_mask, (batch_size, -1))  # (batch_size, 19248)
    num_pos = P.cast(P.reduce_sum(labels_pos_mask, dim=1), 'int32')  # (batch_size, )
    num_neg = self.negpos_ratio * num_pos  # (batch_size, )
    neg_topk_mask = []
    for idx in range(batch_size):
        desc = P.range(num_neg[idx], num_neg[idx] - P.shape(labels_pos_mask)[1], -1, 'int32')
        neg_topk_mask.append(desc)
    neg_topk_mask = P.concat(neg_topk_mask, axis=0)  # (batch_size*19248, )
    neg_topk_mask = P.reshape(neg_topk_mask, (batch_size, -1))  # (batch_size, 19248)
    neg_topk_mask = P.cast(neg_topk_mask > 0, 'float32')  # (batch_size, 19248)
    sorted_loss_c = neg_topk_mask * sorted_loss_c

    selected_poss = []
    selected_negs = []
    selected_pos_class_vectors = []
    selected_neg_class_vectors = []
    for idx in range(batch_size):
        selected_neg_idx_idx = P.where(sorted_loss_c[idx] > 0)
        selected_neg_idx_idx.stop_gradient = True
        selected_neg_idx = P.gather(loss_idx[idx], selected_neg_idx_idx)
        selected_neg_idx.stop_gradient = True
        selected_neg = P.gather(pred_allboxes_conf[idx], selected_neg_idx)
        selected_neg.stop_gradient = True
        selected_negs.append(selected_neg)
        selected_pos = P.gather(pred_allboxes_conf[idx], labels_pos_index[idx])
        selected_pos.stop_gradient = True
        selected_poss.append(selected_pos)

        zeros = P.fill_constant(shape=[P.shape(selected_neg)[0], ], value=0, dtype='int32')
        zeros.stop_gradient = True
        selected_neg_class_vector = P.gather(class_vectors, zeros)
        selected_neg_class_vector.stop_gradient = True
        selected_neg_class_vectors.append(selected_neg_class_vector)

        labels_pos_cid.stop_gradient = True
        labels_pos_index[idx].stop_gradient = True
        selected_pos_cid = P.gather(labels_pos_cid[idx], labels_pos_index[idx])
        selected_pos_cid.stop_gradient = True
        selected_pos_class_vector = P.gather(class_vectors, selected_pos_cid)
        selected_pos_class_vector.stop_gradient = True
        selected_pos_class_vectors.append(selected_pos_class_vector)

    selected_negs = P.concat(selected_negs, axis=0)  # (?, 1+80)
    selected_poss = P.concat(selected_poss, axis=0)  # (?, 1+80)
    pred_ = P.concat([selected_negs, selected_poss], axis=0)  # (?, 1+80)
    selected_neg_class_vectors = P.concat(selected_neg_class_vectors, axis=0)  # (?, 1+80)
    selected_pos_class_vectors = P.concat(selected_pos_class_vectors, axis=0)  # (?, 1+80)
    labels_ = P.concat([selected_neg_class_vectors, selected_pos_class_vectors],
                       axis=0)  # (?, 1+80)

    # Softmax cross entropy.
    fenzi = P.exp(pred_)  # numerator
    fenmu = P.reduce_sum(fenzi, dim=1, keep_dim=True)  # denominator
    pred_prob = fenzi / P.expand_as(fenmu, target_tensor=fenzi)
    conf_loss = labels_ * (0 - P.log(pred_prob + 1e-9))  # cross entropy; tiny constant guards against nan
    conf_loss = P.reduce_sum(conf_loss)
    return conf_loss

def __call__(self, kernel_preds, cls_preds, mask_protos,
             batch_gt_objs_tensors, batch_gt_clss_tensors,
             batch_gt_masks_tensors, batch_gt_pos_idx_tensors):
    '''
    :param kernel_preds: each element has shape [N, 256, seg_num_grid, seg_num_grid],
                         the convolution kernel predicted per grid cell, from small to
                         large receptive field.
    :param cls_preds:    each element has shape [N, 80, seg_num_grid, seg_num_grid],
                         the per-grid-cell class scores before sigmoid(), from small to
                         large receptive field.
    :param mask_protos:  [bs, 256, s4, s4] mask prototypes.
    :param batch_gt_objs_tensors:  each element has shape [N, seg_num_grid, seg_num_grid, 1],
                                   per-grid-cell objectness, from small to large receptive field.
    :param batch_gt_clss_tensors:  each element has shape [N, seg_num_grid, seg_num_grid, 80],
                                   per-grid-cell one-hot ground-truth class, from small to
                                   large receptive field.
    :param batch_gt_masks_tensors: each element has shape [N, -1, s4, s4], ground-truth
                                   masks, from small to large receptive field.
    :param batch_gt_pos_idx_tensors: each element has shape [N, -1, 3], indices of the
                                     positive samples, from small to large receptive field.
    :return:
    '''
    batch_size = self.batch_size
    num_layers = len(kernel_preds)

    # ================= compute the losses =================
    num_ins = 0.  # number of positive samples in this batch
    loss_clss, loss_masks = [], []
    for bid in range(batch_size):
        for lid in range(num_layers):
            # ================ mask loss ======================
            mask_proto = mask_protos[bid]  # [256, s4, s4] mask prototypes of this image
            # [256, seg_num_grid, seg_num_grid] kernels predicted per grid cell
            # (the "mask coefficients" of YOLACT)
            kernel_pred = kernel_preds[lid][bid]
            kernel_pred = L.transpose(kernel_pred, perm=[1, 2, 0])  # [seg_num_grid, seg_num_grid, 256]

            gt_objs = batch_gt_objs_tensors[lid][bid]    # [seg_num_grid, seg_num_grid, 1]
            gt_masks = batch_gt_masks_tensors[lid][bid]  # [-1, s4, s4]
            pmidx = batch_gt_pos_idx_tensors[lid][bid]   # [-1, 3]
            gt_objs.stop_gradient = True
            gt_masks.stop_gradient = True
            pmidx.stop_gradient = True

            idx_sum = L.reduce_sum(pmidx, dim=1)
            keep = L.where(idx_sum > -1)
            keep = L.reshape(keep, (-1, ))
            keep.stop_gradient = True
            pmidx = L.gather(pmidx, keep)  # [M, 3]

            yx_idx = pmidx[:, :2]  # [M, 2]
            m_idx = pmidx[:, 2]    # [M, ]
            yx_idx.stop_gradient = True
            m_idx.stop_gradient = True

            # Pull them out.
            gt_obj = L.gather_nd(gt_objs, yx_idx)       # [M, 1]   whether it is a true positive
            pos_krn = L.gather_nd(kernel_pred, yx_idx)  # [M, 256] kernels (mask coefficients) of the positives
            gt_mask = L.gather(gt_masks, m_idx)         # [M, s4, s4] ground-truth masks

            # Number of positive samples.
            num_ins += L.reduce_sum(gt_obj)

            # Generate the predicted masks.
            mask_proto = L.transpose(mask_proto, perm=[1, 2, 0])     # [s4, s4, 256]
            masks = L.matmul(mask_proto, pos_krn, transpose_y=True)  # [s4, s4, M]
            masks = L.sigmoid(masks)                                 # [s4, s4, M]
            masks = L.transpose(masks, perm=[2, 0, 1])               # [M, s4, s4]
            loss_mask = self.dice_loss(masks, gt_mask, gt_obj)
            loss_masks.append(loss_mask)

            # ================ classification loss: sigmoid_focal_loss() ======================
            gamma = self.loss_gamma
            alpha = self.loss_alpha
            pred_conf = cls_preds[lid][bid]                     # [80, seg_num_grid, seg_num_grid], before sigmoid()
            pred_conf = L.transpose(pred_conf, perm=[1, 2, 0])  # [seg_num_grid, seg_num_grid, 80], before sigmoid()
            pred_conf = L.sigmoid(pred_conf)                    # [seg_num_grid, seg_num_grid, 80], after sigmoid()
            gt_clss = batch_gt_clss_tensors[lid][bid]           # [seg_num_grid, seg_num_grid, 80], one-hot ground-truth class
            gt_clss.stop_gradient = True
            pos_loss = gt_clss * (0 - L.log(pred_conf + 1e-9)) * L.pow(1 - pred_conf, gamma) * alpha
            neg_loss = (1.0 - gt_clss) * (0 - L.log(1 - pred_conf + 1e-9)) * L.pow(pred_conf, gamma) * (1 - alpha)
            focal_loss = pos_loss + neg_loss
            focal_loss = L.reduce_sum(focal_loss, dim=[0, 1])
            loss_clss.append(focal_loss)

    loss_masks = L.concat(loss_masks, axis=0)
    loss_masks = L.reduce_sum(loss_masks) * self.ins_loss_weight
    loss_masks = loss_masks / L.elementwise_max(L.ones((1, ), dtype='float32'), num_ins)

    loss_clss = L.concat(loss_clss, axis=0)
    loss_clss = L.reduce_sum(loss_clss) * self.clss_loss_weight
    loss_clss = loss_clss / L.elementwise_max(L.ones((1, ), dtype='float32'), num_ins)

    loss_all = {"loss_masks": loss_masks, "loss_clss": loss_clss}
    return loss_all

def reorder_(t, parent_id):
    """Reorder cache according to parent beam id."""
    gather_idx = L.where(parent_id != -1)[:, 0] * beam_width + L.reshape(parent_id, [-1])
    t = L.gather(t, gather_idx)
    return t

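# NumPy illustration (not part of the model) of the index arithmetic in reorder_():
# L.where(parent_id != -1)[:, 0] yields each element's batch row, so the flattened
# gather index is batch_row * beam_width + parent_beam.
import numpy as np

beam_width = 3
parent_id = np.array([[2, 0, 1],
                      [1, 1, 0]])  # surviving parent beam per (batch, beam)
rows = np.where(parent_id != -1)[0]                     # [0, 0, 0, 1, 1, 1]
gather_idx = rows * beam_width + parent_id.reshape(-1)  # [2, 0, 1, 4, 4, 3]
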
def build_pad_idx(input_mask):
    pad_idx = L.where(L.cast(L.squeeze(input_mask, [2]), 'bool'))
    return pad_idx

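# A minimal NumPy sketch (illustration only): input_mask has shape
# [batch, seq_len, 1], and the result lists the (batch, position) pair of every
# non-pad token.
import numpy as np

input_mask = np.array([[[1], [1], [0]],
                       [[1], [0], [0]]])           # 2 sequences of length 3
pad_idx = np.argwhere(input_mask.squeeze(2) != 0)  # [[0, 0], [0, 1], [1, 0]]
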
def finetune(
        self,
        train_path,
        dev_path=None,
        save_dir="ernie_gen_result",
        init_ckpt_path=None,
        use_gpu=True,
        max_steps=500,
        batch_size=8,
        max_encode_len=50,
        max_decode_len=50,
        learning_rate=5e-5,
        warmup_proportion=0.1,
        weight_decay=0.1,
        noise_prob=0,
        label_smooth=0,
        beam_width=5,
        length_penalty=1.0,
        log_interval=100,
        save_interval=200,
):
    """
    Finetune with the specified dataset.

    Args:
        train_path(str): the train dataset path.
        dev_path(str): the dev dataset path.
        save_dir(str): the save path for model params and the dev dataset predict results.
        init_ckpt_path(str): incremental training load path.
        use_gpu(bool): use gpu or not.
        max_steps(int): max training steps.
        batch_size(int): the batch size.
        max_encode_len(int): the max encode length.
        max_decode_len(int): the max decode length.
        learning_rate(float): the learning rate.
        warmup_proportion(float): the warmup proportion.
        weight_decay(float): the weight decay magnitude.
        noise_prob(float): the noise probability. See the ERNIE-GEN paper for details.
        label_smooth(float): the label smoothing magnitude.
        beam_width(int): the beam size when evaluating the dev dataset.
        length_penalty(float): the length penalty when evaluating the dev dataset.
        log_interval(int): the log interval.
        save_interval(int): the save interval. The dev set is evaluated after saving.

    Return:
        result(dict): A Dictionary of shape::
            {
                last_save_path(str): last model save path.
                last_ppl(float): last model ppl.
            }
    """
    self.max_encode_len = max_encode_len
    self.max_decode_len = max_decode_len
    self.noise_prob = noise_prob

    place = F.CUDAPlace(0) if use_gpu else F.CPUPlace()

    with F.dygraph.guard(place):
        if init_ckpt_path is not None:
            logger.info('loading checkpoint from %s' % init_ckpt_path)
            sd, _ = D.load_dygraph(init_ckpt_path)
            self.model.set_dict(sd)

        feature_column = propeller.data.FeatureColumns([
            propeller.data.LabelColumn('id'),
            propeller.data.TextColumn('src',
                                      unk_id=self.tokenizer.unk_id,
                                      vocab_dict=self.tokenizer.vocab,
                                      tokenizer=self.tokenizer.tokenize),
            propeller.data.TextColumn('tgt',
                                      unk_id=self.tokenizer.unk_id,
                                      vocab_dict=self.tokenizer.vocab,
                                      tokenizer=self.tokenizer.tokenize),
        ])

        train_ds = feature_column.build_dataset('train', data_file=train_path,
                                                shuffle=False, repeat=True, use_gz=False) \
            .map(self._map_fn).shuffle(10000).padded_batch(batch_size).map(self._after_padding)
        train_ds.data_shapes = [[None, None]] * 7 + [[None, None, None]] * 3 + [[None]]
        train_ds.data_types = ['int64'] * 11

        if dev_path:
            dev_ds = feature_column.build_dataset('dev', data_file=dev_path,
                                                  shuffle=False, repeat=False, use_gz=False) \
                .map(self._map_fn) \
                .padded_batch(1) \
                .map(self._after_padding)
            dev_ds.data_shapes = [[None, None]] * 7 + [[None, None, None]] * 3 + [[None]]
            dev_ds.data_types = ['int64'] * 11

        vocab_size, _ = self.model.word_emb.weight.shape
        g_clip = F.clip.GradientClipByGlobalNorm(1.0)
        opt = AdamW(learning_rate=LinearDecay(learning_rate,
                                              int(warmup_proportion * max_steps),
                                              max_steps),
                    parameter_list=self.model.parameters(),
                    weight_decay=weight_decay,
                    grad_clip=g_clip)

        loss = None
        save_path = None
        ppl = None

        if save_dir and not os.path.exists(save_dir):
            os.makedirs(save_dir)

        for step, data in enumerate(train_ds.start(place)):
            (example_id, src_ids, src_sids, src_pids, tgt_ids, tgt_sids, tgt_pids,
             attn_ids, mask_src_2_src, mask_tgt_2_srctgt, mask_attn_2_srctgtattn,
             tgt_labels) = data

            _, __, info = self.model(src_ids,
                                     sent_ids=src_sids,
                                     pos_ids=src_pids,
                                     attn_bias=mask_src_2_src,
                                     encode_only=True)
            cached_k, cached_v = info['caches']
            _, __, info = self.model(tgt_ids,
                                     sent_ids=tgt_sids,
                                     pos_ids=tgt_pids,
                                     attn_bias=mask_tgt_2_srctgt,
                                     past_cache=(cached_k, cached_v),
                                     encode_only=True)
            cached_k2, cached_v2 = info['caches']
            past_cache_k = [L.concat([k, k2], 1) for k, k2 in zip(cached_k, cached_k2)]
            past_cache_v = [L.concat([v, v2], 1) for v, v2 in zip(cached_v, cached_v2)]

            if label_smooth > 0.:
                tgt_labels = L.label_smooth(F.one_hot(tgt_labels, vocab_size),
                                            epsilon=label_smooth)
            loss, _, __ = self.model(attn_ids,
                                     sent_ids=tgt_sids,
                                     pos_ids=tgt_pids,
                                     attn_bias=mask_attn_2_srctgtattn,
                                     past_cache=(past_cache_k, past_cache_v),
                                     tgt_labels=tgt_labels,
                                     tgt_pos=L.where(attn_ids == self.tokenizer.vocab['[MASK]']))

            loss.backward()
            opt.minimize(loss)
            self.model.clear_gradients()

            if step % log_interval == 0:
                loss_np = loss.numpy()
                ppl = np.exp(loss_np)
                logger.info('[step %d / %d]train loss %.5f, ppl %.5f, elr %.3e' %
                            (step, max_steps, loss_np, ppl, opt.current_step_lr()))

            if save_dir and step % save_interval == 0 and step > 0:
                loss_np = loss.numpy()
                ppl = np.exp(loss_np)
                save_name = "step_%s_ppl_%.5f" % (step, ppl)
                save_path = os.path.join(save_dir, save_name)
                logger.info("save the model in %s" % save_path)
                F.save_dygraph(self.model.state_dict(), save_path)

                if dev_path:
                    logger.info('evaluating...')
                    res = self._evaluate(dev_ds, place, beam_width, length_penalty)
                    output_path = os.path.join(save_dir, "step_%s_ppl_%.5f.txt" % (step, ppl))
                    logger.info('save the predict result in %s' % output_path)
                    with open(output_path, 'w') as fout:
                        fout.write(('\n'.join(res)))

            if step > max_steps:
                break

        if loss:
            loss_np = loss.numpy()
            ppl = np.exp(loss_np)
            logger.info('[final step %d]train loss %.5f, ppl %.5f, elr %.3e' %
                        (step, loss_np, ppl, opt.current_step_lr()))
            if save_dir:
                save_name = "step_%s_ppl_%.5f" % (step, ppl)
                save_path = os.path.join(save_dir, save_name)
                logger.info("save the model in %s" % save_path)
                F.save_dygraph(self.model.state_dict(), save_path)

                if dev_path:
                    logger.info('evaluating...')
                    res = self._evaluate(dev_ds, place, beam_width, length_penalty)
                    output_path = os.path.join(save_dir, "step_%s_ppl_%.5f.txt" % (step, ppl))
                    logger.info('save the predict result in %s' % output_path)
                    with open(output_path, 'w') as fout:
                        fout.write(('\n'.join(res)))

        result = {
            "last_save_path": "%s.pdparams" % save_path,
            "last_ppl": ppl[0],
        }

        return result

def get_seg_single(self, cate_preds, mask_proto, kernel_preds, featmap_size,
                   resize_shape, ori_shape):
    '''
    :param cate_preds:   [num_grid_cells, 80]
    :param mask_proto:   [1, 256, s4, s4] mask prototypes.
    :param kernel_preds: [num_grid_cells, 256] the 1x1 convolution kernel generated per
                         grid cell; its input channel count is 256, the channel count of
                         the mask prototypes.
    :param featmap_size: (s4, s4)
    :param resize_shape: shape=[3, ]
    :param ori_shape:    shape=[3, ]
    :return:
    '''
    # Overall info.
    upsampled_size_out = (featmap_size[0] * 4, featmap_size[1] * 4)  # size of the network input image
    cfg = self.nms_cfg

    # First filter: by score.
    inds = L.where(cate_preds > cfg['score_thr'])  # [M, 2]
    # if len(inds) == 0:
    #     return None
    # Writing conditionals in the static graph is very hard, hence L.cond below.
    def exist_objs_1(inds, cate_preds):
        inds.stop_gradient = True
        scores = L.gather_nd(cate_preds, inds)  # [M, ] scores of the M objects
        return inds, scores

    def no_objs_1(cate_preds):
        inds = L.zeros((1, 2), np.int64)
        inds.stop_gradient = True
        scores = L.gather_nd(cate_preds, inds) - 99.0  # [M, ] scores; filtered out later
        return inds, scores

    # Are there any objects?
    inds, scores = L.cond(L.shape(inds)[0] == 0,
                          lambda: no_objs_1(cate_preds),
                          lambda: exist_objs_1(inds, cate_preds))

    classes = inds[:, 1]  # [M, ] class ids of the M objects
    kernel_preds = L.gather(kernel_preds, inds[:, 0])  # [M, 256] kernels of the M objects

    n_stage = len(self.seg_num_grids)  # 5 output levels
    strides = []
    for ind_ in range(n_stage):
        st = L.zeros((1, ), dtype=np.float32) + self.strides[ind_]
        st = L.expand(st, [self.seg_num_grids[ind_]**2, ])  # [40*40, ]
        strides.append(st)
    strides = L.concat(strides, axis=0)
    strides.stop_gradient = True
    strides = L.gather(strides, inds[:, 0])  # [M, ] downsampling factor of the M objects

    # Mask encoding. In the original SOLO this is written as a 1x1 convolution of the
    # mask prototypes with the predicted kernels:
    #     M, C = kernel_preds.shape
    #     kernel_preds = kernel_preds.view(M, C, 1, 1)  # used as convolution kernels
    #     seg_preds = F.conv2d(seg_preds, kernel_preds, stride=1).squeeze(0).sigmoid()
    # A 1x1 convolution of the prototypes is equivalent to a matrix multiply (note that
    # this does not hold for 3x3 kernels). Since no equivalent API has been found so far,
    # a matrix multiply is used instead; SOLOv2 and YOLACT behave identically here.
    mask_proto = L.squeeze(mask_proto, axes=[0])          # [256, s4, s4]
    mask_proto = L.transpose(mask_proto, perm=[1, 2, 0])  # [s4, s4, 256]
    masks = L.matmul(mask_proto, kernel_preds, transpose_y=True)  # [s4, s4, M]
    masks = L.sigmoid(masks)                    # [s4, s4, M]
    masks = L.transpose(masks, perm=[2, 0, 1])  # [M, s4, s4]

    # Mask.
    seg_masks = L.cast(masks > cfg['mask_thr'], 'float32')  # [M, s4, s4] 1 for foreground
    sum_masks = L.reduce_sum(seg_masks, dim=[1, 2])         # [M, ] mask area of the M objects

    # Second filter: by downsampling factor. A mask survives only if its area exceeds
    # its downsampling factor.
    keep = L.where(sum_masks > strides)
    # if keep.sum() == 0:
    #     return None
    # Writing conditionals in the static graph is very hard, hence L.cond below.
    def exist_objs_2(keep, seg_masks, masks, sum_masks, scores, classes):
        keep = L.reshape(keep, (-1, ))  # [M2, ]
        keep.stop_gradient = True
        seg_masks = L.gather(seg_masks, keep)  # [M2, s4, s4] masks of the M2 objects
        masks = L.gather(masks, keep)          # [M2, s4, s4] mask probabilities of the M2 objects
        sum_masks = L.gather(sum_masks, keep)  # [M2, ] mask areas of the M2 objects
        scores = L.gather(scores, keep)        # [M2, ] scores of the M2 objects
        classes = L.gather(classes, keep)      # [M2, ] class ids of the M2 objects
        return seg_masks, masks, sum_masks, scores, classes

    def no_objs_2(seg_masks, masks, sum_masks, scores, classes):
        keep = L.zeros((1, ), np.int64)
        keep.stop_gradient = True
        seg_masks = L.gather(seg_masks, keep)
        masks = L.gather(masks, keep)
        sum_masks = L.gather(sum_masks, keep)
        scores = L.gather(scores, keep) - 99.0  # negative scores; filtered out later
        classes = L.gather(classes, keep)
        return seg_masks, masks, sum_masks, scores, classes

    # Are there any objects?
    seg_masks, masks, sum_masks, scores, classes = L.cond(
        L.shape(keep)[0] == 0,
        lambda: no_objs_2(seg_masks, masks, sum_masks, scores, classes),
        lambda: exist_objs_2(keep, seg_masks, masks, sum_masks, scores, classes))

    # Mask scoring.
    # [M2, ] sum of the foreground mask probabilities divided by the mask area, i.e. the
    # mean foreground mask probability of each of the M2 objects.
    avg_prob = L.reduce_sum(masks * seg_masks, dim=[1, 2]) / sum_masks
    scores *= avg_prob  # [M2, ] final score = class probability * mean mask probability

    # Third filter: keep only the cfg['nms_pre'] highest-scoring objects.
    _, sort_inds = L.argsort(scores, axis=-1, descending=True)  # indices by descending final score
    sort_inds = sort_inds[:cfg['nms_pre']]      # at most cfg['nms_pre'] objects
    seg_masks = L.gather(seg_masks, sort_inds)  # [M3, s4, s4] masks of the M3 objects
    masks = L.gather(masks, sort_inds)          # [M3, s4, s4] mask probabilities of the M3 objects
    sum_masks = L.gather(sum_masks, sort_inds)  # [M3, ] mask areas of the M3 objects
    scores = L.gather(scores, sort_inds)        # [M3, ] scores of the M3 objects
    classes = L.gather(classes, sort_inds)      # [M3, ] class ids of the M3 objects

    # Matrix NMS
    scores = matrix_nms(seg_masks, classes, scores,
                        kernel=cfg['kernel'], sigma=cfg['sigma'], sum_masks=sum_masks)

    # Fourth filter: by score.
    keep = L.where(scores >= cfg['update_thr'])
    # if keep.sum() == 0:
    #     return None
    def exist_objs_3(keep, masks, classes, scores, upsampled_size_out, resize_shape, ori_shape):
        keep = L.reshape(keep, (-1, ))
        keep.stop_gradient = True
        masks = L.gather(masks, keep)      # [M4, s4, s4] mask probabilities of the M4 objects
        scores = L.gather(scores, keep)    # [M4, ] scores of the M4 objects
        classes = L.gather(classes, keep)  # [M4, ] class ids of the M4 objects

        # Fifth filter: keep only the cfg['max_per_img'] highest-scoring objects.
        _, sort_inds = L.argsort(scores, axis=-1, descending=True)
        sort_inds = sort_inds[:cfg['max_per_img']]
        sort_inds.stop_gradient = True
        masks = L.gather(masks, sort_inds)      # [M5, s4, s4] mask probabilities of the M5 objects
        scores = L.gather(scores, sort_inds)    # [M5, ] scores of the M5 objects
        classes = L.gather(classes, sort_inds)  # [M5, ] class ids of the M5 objects

        masks = L.resize_bilinear(
            L.unsqueeze(masks, axes=[0]),
            out_shape=upsampled_size_out,
            align_corners=False,
            align_mode=0)[:, :, :resize_shape[0], :resize_shape[1]]  # crop the padding
        masks = L.resize_bilinear(masks, out_shape=ori_shape[:2],
                                  align_corners=False, align_mode=0)  # interpolate to original image size
        masks = L.cast(masks > cfg['mask_thr'], 'float32')[0]
        return masks, classes, scores

    def no_objs_3():
        masks = L.zeros([1, 1, 1], 'float32') - 1.0
        classes = L.zeros([1, ], 'int64') - 1
        scores = L.zeros([1, ], 'float32') - 2.0
        return masks, classes, scores

    # Are there any objects?
    masks, classes, scores = L.cond(
        L.shape(keep)[0] == 0,
        no_objs_3,
        lambda: exist_objs_3(keep, masks, classes, scores, upsampled_size_out,
                             resize_shape, ori_shape))
    return masks, classes, scores