def draw_densecap(image, scores, rois, im_info, cap_probs, bbox_pred):
    """Draw the NMS-surviving regions and their captions onto ``image``.

    Args:
        image: image array; ``image.shape`` is used to clip the regressed
            boxes and the array is handed to ``draw_bounding_boxes``.
        scores: per-roi score array; column 1 is used as the detection score.
        rois: [None, 5] array; columns 1:5 are read as box coordinates.
        im_info: indexable whose element 2 is the image scale. It is NOT
            modified by this function (a private copy is used for drawing).
        cap_probs: word probabilities; the argmax over axis 1 is regrouped
            into rows of ``cfg.TIME_STEPS`` word ids.
        bbox_pred: [None, 4] normalized box regression outputs.

    Returns:
        The value returned by ``draw_bounding_boxes`` for the kept regions.
    """
    # Regression outputs are normalized; these constants undo that.
    bbox_mean = np.array(cfg.TRAIN.BBOX_NORMALIZE_MEANS).reshape((1, 4))
    bbox_stds = np.array(cfg.TRAIN.BBOX_NORMALIZE_STDS).reshape((1, 4))

    # Divide out the image scale stored in im_info[2].
    boxes = rois[:, 1:5] / im_info[2]
    # One row of cfg.TIME_STEPS word ids per region.
    cap_ids = np.argmax(cap_probs, axis=1).reshape((-1, cfg.TIME_STEPS))

    # bbox target unnormalization, then decode and clip to the image
    box_deltas = bbox_pred * bbox_stds + bbox_mean
    pred_boxes = bbox_transform_inv(boxes, box_deltas)
    pred_boxes = clip_boxes(pred_boxes, image.shape)

    pos_dets = np.hstack(
        (pred_boxes, scores[:, 1][:, np.newaxis])
    ).astype(np.float32, copy=False)
    keep = nms(pos_dets, cfg.TEST.NMS)

    # NOTE(review): NMS runs on the regressed ``pred_boxes`` but the boxes
    # drawn are the unrefined ``boxes`` -- kept as in the original; confirm
    # whether ``pred_boxes[keep, :]`` was intended.
    pos_boxes = boxes[keep, :]
    cap_ids = cap_ids[keep, :]

    # The boxes are already divided by the scale, so drawing uses scale 1.
    # Work on a copy so the caller's im_info is left untouched.
    draw_info = np.array(im_info, copy=True)
    draw_info[2] = 1.
    return draw_bounding_boxes(image, pos_boxes, draw_info, cap_ids)
def proposal_layer(rpn_cls_prob, rpn_bbox_pred, im_info, cfg_key,
                   _feat_stride, anchors, num_anchors):
    """Convert RPN outputs into scored, NMS-filtered region proposals.

    A simplified version compared to fast/er RCNN. For details please see
    the technical report.

    Args:
        rpn_cls_prob: 4-D array; the channels from ``num_anchors`` onward on
            the last axis are used as the proposal scores.
        rpn_bbox_pred: predicted box deltas, reshaped here to (-1, 4).
        im_info: indexable; ``im_info[:2]`` is passed to ``clip_boxes`` and
            ``im_info[2]`` scales the minimum box size.
        cfg_key: config section name, as ``str`` or ``bytes``.
        _feat_stride: not used by this function.
        anchors: anchor boxes passed to ``bbox_transform_inv`` together with
            the reshaped deltas.
        num_anchors: number of anchors per location (channel split point).

    Returns:
        ``(blob, scores)`` where ``blob`` is a float32 array whose first
        column is all zeros (batch index) followed by the proposal
        coordinates, and ``scores`` holds the matching scores.
    """
    # The key may arrive as bytes; normalize it to text before the lookup.
    if isinstance(cfg_key, bytes):
        cfg_key = cfg_key.decode('utf-8')

    pre_nms_topN = cfg[cfg_key].RPN_PRE_NMS_TOP_N
    post_nms_topN = cfg[cfg_key].RPN_POST_NMS_TOP_N
    nms_thresh = cfg[cfg_key].RPN_NMS_THRESH

    # Get the scores and bounding boxes
    scores = rpn_cls_prob[:, :, :, num_anchors:]
    rpn_bbox_pred = rpn_bbox_pred.reshape((-1, 4))
    scores = scores.reshape((-1, 1))
    proposals = bbox_transform_inv(anchors, rpn_bbox_pred)

    if cfg.DEBUG_ALL:
        print ('number of proposals before clip boxes to image board: {}'.format(
            proposals.shape[0]
        ))
    proposals = clip_boxes(proposals, im_info[:2])

    # remove predicted boxes with either height or width < threshold
    # (NOTE: convert min_size to input image scale stored in im_info[2])
    if cfg.FILTER_SMALL_BOX:
        min_size = cfg[cfg_key].RPN_MIN_SIZE
        keep = _filter_boxes(proposals, min_size * im_info[2])
        proposals = proposals[keep, :]
        scores = scores[keep]

    # Pick the top region proposals (highest score first)
    order = scores.ravel().argsort()[::-1]
    if pre_nms_topN > 0:
        order = order[:pre_nms_topN]
    proposals = proposals[order, :]
    scores = scores[order]

    # Non-maximal suppression
    if cfg.DEBUG_ALL:
        print("number of proposals before nms: {}".format(proposals.shape[0]))
    keep = nms(np.hstack((proposals, scores)), nms_thresh)
    if cfg.DEBUG_ALL:
        print("number of proposals after nms: {}".format(len(keep)))

    # Pick the top region proposals after NMS
    if post_nms_topN > 0:
        keep = keep[:post_nms_topN]
    proposals = proposals[keep, :]
    scores = scores[keep]

    # Only support single image as input, so every batch index is 0
    batch_inds = np.zeros((proposals.shape[0], 1), dtype=np.float32)
    blob = np.hstack((batch_inds, proposals.astype(np.float32, copy=False)))

    return blob, scores
def interpret_objects(cls_prob, bbox_pred, rois, im_info, nms_thres=-1.,
                      min_score=0.00001, use_gt_boxes=False,
                      max_per_image=2000):
    """Collect per-class detections from classification / regression outputs.

    Args:
        cls_prob: tensor-like exposing ``.data.cpu().numpy()``; one row per
            roi, one column per class (column 0 is skipped as background).
        bbox_pred: tensor-like exposing ``.data.cpu().numpy()``; columns
            ``4*j:4*j+4`` are read as the box deltas of class ``j``.
        rois: tensor-like exposing ``.data.cpu().numpy()``; columns 1:5 are
            read as box coordinates.
        im_info: indexable; ``im_info[0][2]`` is the image scale and
            ``im_info[0][:2]`` the size passed (rescaled) to ``clip_boxes``.
        nms_thres: when > 0, per-class NMS is applied with this threshold.
        min_score: only rois scoring strictly above this are kept per class.
        use_gt_boxes: when true, the rois are used as boxes directly and no
            regression or clipping is applied.
        max_per_image: when > 0, cap on the number of detections kept over
            all classes (ties at the cut-off score are all kept).

    Returns:
        A list with one entry per class: an (n, 5) float32 array of
        ``[box(4), score]`` rows, or an empty list for classes without
        detections (always for class 0).
    """
    box_deltas = bbox_pred.data.cpu().numpy()
    cls_prob = cls_prob.data.cpu().numpy()
    rois_np = rois.data.cpu().numpy()
    im_scale = im_info[0][2]
    num_classes = cls_prob.shape[1]

    all_boxes = [[] for _ in range(num_classes)]

    for j in range(1, num_classes):  # skip the background
        inds = np.where(cls_prob[:, j] > min_score)[0]
        if len(inds) == 0:
            continue
        cls_scores = cls_prob[inds, j]
        if use_gt_boxes:
            cls_boxes = rois_np[inds, 1:5] / im_scale
        else:
            # Deltas of class j for the selected rois, in double precision.
            t_box_deltas = box_deltas[inds, (j * 4):(j * 4 + 4)].astype(
                np.float64)
            cls_boxes = bbox_transform_inv_hdn(
                rois_np[inds, 1:5], t_box_deltas) / im_scale
            cls_boxes = clip_boxes(cls_boxes, im_info[0][:2] / im_scale)

        cls_dets = np.hstack(
            (cls_boxes, cls_scores[:, np.newaxis])
        ).astype(np.float32, copy=False)
        if nms_thres > 0.:
            keep = nms(cls_dets, nms_thres)
            cls_dets = cls_dets[keep, :]
        all_boxes[j] = cls_dets

    if max_per_image > 0:
        per_class_scores = [
            dets[:, -1] for dets in all_boxes[1:] if len(dets) > 0
        ]
        # np.hstack raises on an empty list, so only build the pooled score
        # vector when at least one class produced detections.
        if per_class_scores:
            image_scores = np.hstack(per_class_scores)
            if len(image_scores) > max_per_image:
                image_thresh = np.sort(image_scores)[-max_per_image]
                for j in range(1, num_classes):
                    if len(all_boxes[j]) == 0:
                        continue
                    keep = np.where(all_boxes[j][:, -1] >= image_thresh)[0]
                    all_boxes[j] = all_boxes[j][keep, :]

    return all_boxes
def compute_rois_offset(rois, offset, im_info=None):
    """Apply (optionally de-normalized) box offsets to regions of interest.

    Args:
        rois: array with 4 columns, passed to ``bbox_transform_inv``.
        offset: array with 4 columns holding the predicted offsets.
        im_info: optional indexable; when given, the result is clipped with
            ``clip_boxes`` using ``im_info[:2]``.

    Returns:
        The transformed boxes, clipped when ``im_info`` is provided.
    """
    assert rois.shape[1] == 4
    assert offset.shape[1] == 4

    if cfg.TRAIN.BBOX_NORMALIZE_TARGETS_PRECOMPUTED:
        # Targets were normalized by a precomputed mean and stdev --
        # reverse that transformation before decoding.
        stds = np.array(cfg.TRAIN.BBOX_NORMALIZE_STDS)
        means = np.array(cfg.TRAIN.BBOX_NORMALIZE_MEANS)
        raw_offset = offset * stds + means
    else:
        raw_offset = offset.copy()

    decoded = bbox_transform_inv(rois, raw_offset)
    if im_info is None:
        return decoded
    return clip_boxes(decoded, im_info[:2])
def proposal_top_layer(rpn_cls_prob, rpn_bbox_pred, im_info, _feat_stride,
                       anchors, num_anchors):
    """Select the top-scoring region proposals without running NMS.

    For details please see the technical report.

    Args:
        rpn_cls_prob: 4-D array; channels from ``num_anchors`` onward on the
            last axis are used as scores.
        rpn_bbox_pred: predicted box deltas, reshaped here to (-1, 4).
        im_info: indexable; ``im_info[:2]`` is passed to ``clip_boxes``.
        _feat_stride: not used by this function.
        anchors: anchor boxes, indexed by the selected rows.
        num_anchors: number of anchors per location (channel split point).

    Returns:
        ``(blob, scores)``: ``blob`` is a float32 array whose first column is
        all zeros followed by the proposal coordinates; ``scores`` holds the
        scores of the selected rows.
    """
    top_n = cfg.TEST.RPN_TOP_N

    deltas = rpn_bbox_pred.reshape((-1, 4))
    flat_scores = rpn_cls_prob[:, :, :, num_anchors:].reshape((-1, 1))
    total = flat_scores.shape[0]

    if total >= top_n:
        # Indices of the top_n highest scores, best first.
        ranked = flat_scores.argsort(0)[::-1]
        picked = ranked[:top_n].reshape(top_n, )
    else:
        # Random selection, maybe unnecessary and loses good proposals,
        # but such a case rarely happens.
        picked = npr.choice(total, size=top_n, replace=True)

    # Do the selection here
    chosen_anchors = anchors[picked, :]
    chosen_deltas = deltas[picked, :]
    scores = flat_scores[picked]

    # Convert anchors into proposals via bbox transformations, then clip
    # the predicted boxes to the image.
    proposals = clip_boxes(
        bbox_transform_inv(chosen_anchors, chosen_deltas), im_info[:2])

    # Output rois blob. Our RPN implementation only supports a single input
    # image, so all batch inds are 0.
    zeros_col = np.zeros((proposals.shape[0], 1), dtype=np.float32)
    blob = np.hstack((zeros_col, proposals.astype(np.float32, copy=False)))
    return blob, scores
def im_detect(sess, net, im, boxes=None, use_box_at=-1):
    """Detect regions in an image and caption them via beam or greedy search.

    Arguments:
        sess: session object forwarded to the search routine.
        net: network object forwarded to the search routine.
        im (ndarray): color image to test (in BGR order)
        boxes (ndarray): R x 4 array of object proposals or None (for RPN)
        use_box_at (int32): accepted for interface compatibility; this
            function body does not read it.

    Returns:
        scores (ndarray): column 1 of the score array from the search
        pred_boxes (ndarray): decoded boxes clipped to ``im.shape``
        captions: captions as returned by the search routine
    """
    # Constants for bbox unnormalization.
    norm_means = np.array(cfg.TRAIN.BBOX_NORMALIZE_MEANS).reshape((1, 4))
    norm_stds = np.array(cfg.TRAIN.BBOX_NORMALIZE_STDS).reshape((1, 4))

    blobs, im_scales = _get_blobs(im, boxes)
    assert len(im_scales) == 1, "Only single-image batch implemented"

    data_blob = blobs['data']
    blobs['im_info'] = np.array(
        [[data_blob.shape[1], data_blob.shape[2], im_scales[0]]],
        dtype=np.float32)

    # Pick the decoding strategy once, then run it.
    search = beam_search if cfg.TEST.USE_BEAM_SEARCH else greedy_search
    scores, box_offsets, captions, boxes = search(sess, net, blobs, im_scales)

    # Undo target normalization, decode against the proposals and clip.
    deltas = box_offsets * norm_stds + norm_means
    pred_boxes = clip_boxes(bbox_transform_inv(boxes, deltas), im.shape)

    return scores[:, 1], pred_boxes, captions
def proposal_layer(rpn_cls_prob_reshape,
                   rpn_bbox_pred,
                   im_info,
                   cfg_key,
                   _feat_stride=[
                       16,
                   ],
                   anchor_scales=[
                       16,
                   ]):
    """Generate scored proposals (and their deltas) from RPN outputs.

    Parameters
    ----------
    rpn_cls_prob_reshape: (1, H, W, Ax2) outputs of RPN, prob of bg or fg;
        reshaped here to (1, H, W, A, 2) and index 1 of the last axis is
        taken as the fg score.
    rpn_bbox_pred: (1, H, W, Ax4), box regression output of RPN
    im_info: a list of [image_height, image_width, scale_ratios]; only
        element 0 of the outer sequence is used.
    cfg_key: 'TRAIN' or 'TEST', as ``str`` or ``bytes``
    _feat_stride: the downsampling ratio of feature map to the original
        input image (never mutated here)
    anchor_scales: the scales passed to ``generate_anchors`` (never mutated
        here)

    Returns
    ----------
    blob: float32 array with one row per kept proposal, laid out as
        [score, box(4)] (the score is in column 0, not a batch index).
    bbox_deltas: the rows of the reshaped ``rpn_bbox_pred`` matching ``blob``.

    Algorithm:
      for each (H, W) location i
        generate A anchor boxes centered on cell i
        apply predicted bbox deltas at cell i to each of the A anchors
      clip predicted boxes to image
      remove predicted boxes with either height or width < threshold
      sort all (proposal, score) pairs by score from highest to lowest
      take top pre_nms_topN proposals before NMS
      apply NMS to remaining proposals
      take after_nms_topN proposals after NMS
      return the top proposals
    """
    # The key arrives as bytes from some callers and as text from others;
    # decode only when needed so a str key no longer raises AttributeError.
    if isinstance(cfg_key, bytes):
        cfg_key = cfg_key.decode('ascii')

    # TODO: the anchor scales may need tuning later, because text is dense.
    # Base anchors shared by every feature-map cell.
    _anchors = generate_anchors(scales=np.array(anchor_scales))
    _num_anchors = _anchors.shape[0]

    im_info = im_info[0]  # [height, width, scale] of the single image

    assert rpn_cls_prob_reshape.shape[0] == 1, \
        'Only single item batches are supported'

    # Max proposals kept before NMS / after NMS, the NMS threshold, and the
    # minimum box side (scaled by im_info[2] below).
    pre_nms_topN = cfg[cfg_key].RPN_PRE_NMS_TOP_N
    post_nms_topN = cfg[cfg_key].RPN_POST_NMS_TOP_N
    nms_thresh = cfg[cfg_key].RPN_NMS_THRESH
    min_size = cfg[cfg_key].RPN_MIN_SIZE

    height, width = rpn_cls_prob_reshape.shape[1:3]  # feature-map size

    # Keep only the object (fg) probability of each anchor; the non-object
    # probability is not needed. Result shape: (1, H, W, A).
    scores = np.reshape(
        np.reshape(rpn_cls_prob_reshape,
                   [1, height, width, _num_anchors, 2])[:, :, :, :, 1],
        [1, height, width, _num_anchors])

    # The predictions are relative deltas; they are turned into image
    # coordinates further down.
    bbox_deltas = rpn_bbox_pred

    if DEBUG:
        print('im_size: ({}, {})'.format(im_info[0], im_info[1]))
        print('scale: {}'.format(im_info[2]))

    # 1. Generate proposals from bbox deltas and shifted anchors
    if DEBUG:
        print('score map size: {}'.format(scores.shape))

    # Enumerate all shifts: one (x, y, x, y) offset per feature-map cell,
    # which places the base anchors over the whole image.
    shift_x = np.arange(0, width) * _feat_stride
    shift_y = np.arange(0, height) * _feat_stride
    shift_x, shift_y = np.meshgrid(shift_x, shift_y)
    # shifts shape = [height * width, 4]
    shifts = np.vstack((shift_x.ravel(), shift_y.ravel(), shift_x.ravel(),
                        shift_y.ravel())).transpose()

    # Enumerate all shifted anchors:
    #
    # add A anchors (1, A, 4) to
    # cell K shifts (K, 1, 4) to get
    # shift anchors (K, A, 4)
    # reshape to (K*A, 4) shifted anchors
    A = _num_anchors
    K = shifts.shape[0]
    anchors = _anchors.reshape((1, A, 4)) + \
        shifts.reshape((1, K, 4)).transpose((1, 0, 2))
    anchors = anchors.reshape((K * A, 4))  # every anchor on the image

    # Flatten deltas and scores to one row per anchor so that they line up
    # with ``anchors`` (rows ordered by (h, w, a), slowest to fastest).
    bbox_deltas = bbox_deltas.reshape((-1, 4))  # (H*W*A, 4)
    scores = scores.reshape((-1, 1))

    # Convert anchors into proposals via the inverse bbox transformation.
    proposals = bbox_transform_inv(anchors, bbox_deltas)

    # 2. clip predicted boxes to image
    proposals = clip_boxes(proposals, im_info[:2])

    # 3. remove predicted boxes with either height or width < threshold
    # (NOTE: convert min_size to input image scale stored in im_info[2])
    keep = _filter_boxes(proposals, min_size * im_info[2])
    proposals = proposals[keep, :]
    scores = scores[keep]
    bbox_deltas = bbox_deltas[keep, :]

    # 4. sort all (proposal, score) pairs by score from highest to lowest
    # 5. take top pre_nms_topN
    order = scores.ravel().argsort()[::-1]
    if pre_nms_topN > 0:
        order = order[:pre_nms_topN]
    proposals = proposals[order, :]
    scores = scores[order]
    bbox_deltas = bbox_deltas[order, :]

    # 6. apply nms
    # 7. take after_nms_topN
    # 8. return the top proposals
    keep = nms(np.hstack((proposals, scores)), nms_thresh)
    if post_nms_topN > 0:
        keep = keep[:post_nms_topN]
    proposals = proposals[keep, :]
    scores = scores[keep]
    bbox_deltas = bbox_deltas[keep, :]

    # Output blob: the score takes the place of the usual batch-index column.
    blob = np.hstack(
        (scores.astype(np.float32, copy=False),
         proposals.astype(np.float32, copy=False)))

    return blob, bbox_deltas
def interpret_relationships(cls_prob, bbox_pred, rois, cls_prob_predicate,
                            mat_phrase, im_info, nms=-1., clip=True,
                            min_score=0.01, top_N=100, use_gt_boxes=False,
                            triplet_nms=-1., topk=10, reranked_score=None):
    """Decode object detections and rank subject-predicate-object triplets.

    Args:
        cls_prob: tensor of per-object class probabilities; column 0 is
            excluded when picking each object's class.
        bbox_pred: tensor-like exposing ``.data.cpu().numpy()``; columns
            ``4*c:4*c+4`` are read as the deltas of class ``c``.
        rois: tensor-like exposing ``.data.cpu().numpy()``; columns 1:5 are
            read as box coordinates.
        cls_prob_predicate: tensor of per-phrase predicate probabilities;
            column 0 is excluded and the ``topk`` best are kept per phrase.
        mat_phrase: integer array; columns 0 and 1 index the subject and
            object of each phrase.
        im_info: indexable; ``im_info[0][2]`` is the image scale and
            ``im_info[0][:2]`` the size passed (rescaled) to ``clip_boxes``.
        nms: when > 0 (must be < 1), object NMS threshold.
        clip, min_score, top_N: accepted for interface compatibility; this
            function body does not read them (the triplet cut-off is the
            fixed 10000 below).
        use_gt_boxes: when true, the rois are used as boxes directly,
            triplet NMS is disabled and triplet scores are the predicate
            scores alone.
        triplet_nms: when > 0, threshold handed to ``triplet_nms_py``.
        topk: number of predicate candidates kept per phrase.
        reranked_score: optional per-object multiplier for the object scores.

    Returns:
        Tuple ``(pred_boxes, scores, inds, sub_ids, obj_ids, sub_boxes,
        obj_boxes, pred_ids, sub_assignment, obj_assignment, total_scores)``.
    """
    # Best non-background class per object (indices shifted back by one
    # because column 0 was sliced away).
    scores, inds = cls_prob[:, 1:].data.max(1)
    if reranked_score is not None:
        if isinstance(reranked_score, Variable):
            reranked_score = reranked_score.data
        scores *= reranked_score.squeeze()
    inds += 1
    scores, inds = scores.cpu().numpy(), inds.cpu().numpy()

    # Top-k non-background predicates per phrase, flattened row-major so
    # that entry ``p * topk + i`` is the i-th candidate of phrase ``p``.
    predicate_scores, predicate_inds = \
        cls_prob_predicate[:, 1:].data.topk(dim=1, k=topk)
    predicate_inds += 1
    predicate_scores = predicate_scores.cpu().numpy().reshape(-1)
    predicate_inds = predicate_inds.cpu().numpy().reshape(-1)

    # Apply bounding-box regression deltas: pick, for every object, the four
    # deltas belonging to its predicted class. ``np.float64`` replaces the
    # ``np.float`` alias that newer NumPy releases no longer provide.
    raw_deltas = bbox_pred.data.cpu().numpy()
    box_deltas = np.asarray(
        [raw_deltas[i, (c * 4):(c * 4 + 4)] for i, c in enumerate(inds)],
        dtype=np.float64)

    keep = np.arange(scores.shape[0])
    if use_gt_boxes:
        triplet_nms = -1.
        pred_boxes = rois.data.cpu().numpy()[:, 1:5] / im_info[0][2]
    else:
        pred_boxes = bbox_transform_inv_hdn(
            rois.data.cpu().numpy()[:, 1:5], box_deltas) / im_info[0][2]
        pred_boxes = clip_boxes(pred_boxes, im_info[0][:2] / im_info[0][2])

    # nms
    if nms > 0. and pred_boxes.shape[0] > 0:
        assert nms < 1., 'Wrong nms parameters'
        pred_boxes, scores, inds, keep = nms_detections(
            pred_boxes, scores, nms, inds=inds)

    # Map original object ids to their position among the kept objects
    # (-1 marks objects removed by NMS).
    mapping = np.ones(cls_prob.size(0), dtype=np.int64) * -1
    mapping[keep] = np.arange(len(keep))
    sub_list = mapping[mat_phrase[:, 0]]
    obj_list = mapping[mat_phrase[:, 1]]
    # A phrase survives only if both of its endpoints survived.
    pred_remain = np.logical_and(sub_list >= 0, obj_list >= 0)
    pred_list = np.where(pred_remain)[0]
    sub_list = sub_list[pred_remain]
    obj_list = obj_list[pred_remain]

    # expand the sub/obj and pred list to k-column
    pred_list = np.vstack(
        [pred_list * topk + i for i in range(topk)]).transpose().reshape(-1)
    sub_list = np.vstack(
        [sub_list for _ in range(topk)]).transpose().reshape(-1)
    obj_list = np.vstack(
        [obj_list for _ in range(topk)]).transpose().reshape(-1)

    if use_gt_boxes:
        total_scores = predicate_scores[pred_list]
    else:
        total_scores = predicate_scores[pred_list] * \
            scores[sub_list] * scores[obj_list]

    # NOTE(review): the cut-off is a fixed 10000 and ignores ``top_N``.
    top_N_list = total_scores.argsort()[::-1][:10000]
    total_scores = total_scores[top_N_list]
    pred_ids = predicate_inds[pred_list[top_N_list]]  # category of predicates
    sub_assignment = sub_list[top_N_list]  # subjects assignments
    obj_assignment = obj_list[top_N_list]  # objects assignments
    sub_ids = inds[sub_assignment]  # category of subjects
    obj_ids = inds[obj_assignment]  # category of objects
    sub_boxes = pred_boxes[sub_assignment]  # boxes of subjects
    obj_boxes = pred_boxes[obj_assignment]  # boxes of objects

    if triplet_nms > 0.:
        sub_ids, obj_ids, pred_ids, sub_boxes, obj_boxes, keep = \
            triplet_nms_py(sub_ids, obj_ids, pred_ids, sub_boxes, obj_boxes,
                           triplet_nms)
        sub_assignment = sub_assignment[keep]
        obj_assignment = obj_assignment[keep]
        total_scores = total_scores[keep]

    if len(sub_list) == 0:
        print('No Relatinoship remains')

    return pred_boxes, scores, inds, sub_ids, obj_ids, sub_boxes, \
        obj_boxes, pred_ids, sub_assignment, obj_assignment, total_scores
def proposal_layer(rpn_cls_prob_reshape, rpn_bbox_pred, im_infos,
                   _feat_stride, opts, anchor_scales, anchor_ratios,
                   mappings):
    """Generate NMS-filtered proposals for every image of a batch.

    Algorithm, per image:
      for each (H, W) location i
        generate A anchor boxes centered on cell i
        apply predicted bbox deltas at cell i to each of the A anchors
      optionally drop boxes running too far outside the image
      clip predicted boxes to image
      remove predicted boxes with either height or width < threshold
      sort all (proposal, score) pairs by score from highest to lowest
      take top pre_nms_topN proposals before NMS
      apply NMS to remaining proposals
      take after_nms_topN proposals after NMS

    Args:
        rpn_cls_prob_reshape: array indexed as [image, channel, h, w]; the
            channels from A onward are used as fg scores.
        rpn_bbox_pred: array indexed as [image, channel, h, w] holding the
            box deltas.
        im_infos: per-image [height, width, scale] entries.
        _feat_stride: multiplier turning feature-map cells into image
            offsets.
        opts: dict read for 'num_box_pre_NMS', 'num_box_post_NMS',
            'nms_thres', 'min_size' and 'dropout_box_runoff_image'.
        anchor_scales, anchor_ratios: forwarded to
            ``generate_anchors.generate_anchors``.
        mappings: lookup from integer image height/width to the valid
            feature-map height/width.

    Returns:
        A single array concatenating, over all images, rows of
        [batch_index, box(4), score] in float32.
    """
    batch_size = rpn_cls_prob_reshape.shape[0]
    _anchors = generate_anchors.generate_anchors(
        scales=anchor_scales, ratios=anchor_ratios)
    _num_anchors = _anchors.shape[0]
    pre_nms_topN = opts['num_box_pre_NMS']
    post_nms_topN = opts['num_box_post_NMS']
    nms_thres = opts['nms_thres']
    min_size = opts['min_size']

    blob = []

    for i in range(batch_size):
        im_info = im_infos[i]
        # Valid (unpadded) feature-map extent of this image.
        height = mappings[int(im_info[0])]
        width = mappings[int(im_info[1])]
        # the first set of _num_anchors channels are bg probs
        # the second set are the fg probs, which we want
        scores = rpn_cls_prob_reshape[i, _num_anchors:, :height, :width]
        bbox_deltas = rpn_bbox_pred[i, :, :height, :width]

        if DEBUG:
            print('im_size: ({}, {})'.format(im_info[0], im_info[1]))
            print('scale: {}'.format(im_info[2]))

        if DEBUG:
            print('score map size: {}'.format(scores.shape))

        # Enumerate all shifts
        shift_x = np.arange(0, width) * _feat_stride
        shift_y = np.arange(0, height) * _feat_stride
        shift_x, shift_y = np.meshgrid(shift_x, shift_y)
        shifts = np.vstack((shift_x.ravel(), shift_y.ravel(),
                            shift_x.ravel(), shift_y.ravel())).transpose()

        # Enumerate all shifted anchors:
        #
        # add A anchors (1, A, 4) to
        # cell K shifts (K, 1, 4) to get
        # shift anchors (K, A, 4)
        # reshape to (K*A, 4) shifted anchors
        A = _num_anchors
        K = shifts.shape[0]
        anchors = _anchors.reshape((1, A, 4)) + \
            shifts.reshape((1, K, 4)).transpose((1, 0, 2))
        anchors = anchors.reshape((K * A, 4))

        # Move the channel axis last and flatten so rows are ordered by
        # (h, w, a), matching the anchor enumeration above.
        bbox_deltas = bbox_deltas.transpose((1, 2, 0)).reshape((-1, 4))
        scores = scores.transpose((1, 2, 0)).reshape((-1, 1))

        # Convert anchors into proposals via bbox transformations
        proposals = bbox_transform_inv(anchors, bbox_deltas)

        # 2. clip predicted boxes to image
        if opts['dropout_box_runoff_image']:
            _allowed_border = 16
            inds_inside = np.where(
                (proposals[:, 0] >= -_allowed_border) &
                (proposals[:, 1] >= -_allowed_border) &
                (proposals[:, 2] < im_info[1] + _allowed_border) &  # width
                (proposals[:, 3] < im_info[0] + _allowed_border)  # height
            )[0]
            proposals = proposals[inds_inside, :]
            # Keep the scores row-aligned with the surviving proposals;
            # without this every later index would pair boxes with the
            # scores of different anchors.
            scores = scores[inds_inside]
        proposals = clip_boxes(proposals, im_info[:2])

        # 3. remove predicted boxes with either height or width < threshold
        # (NOTE: convert min_size to input image scale stored in im_info[2])
        keep = _filter_boxes(proposals, min_size * im_info[2])
        proposals = proposals[keep, :]
        scores = scores[keep]

        # 4. sort all (proposal, score) pairs by score from highest to lowest
        # 5. take top pre_nms_topN
        order = scores.ravel().argsort()[::-1]
        if pre_nms_topN > 0:
            order = order[:pre_nms_topN]
        proposals = proposals[order, :]
        scores = scores[order]

        # 6. apply nms
        # 7. take after_nms_topN
        # 8. return the top proposals
        keep = nms(np.hstack((proposals, scores)).astype(np.float32),
                   nms_thres)
        if post_nms_topN > 0:
            keep = keep[:post_nms_topN]
        proposals = proposals[keep, :]
        scores = scores[keep]

        # Output rois blob: tag every row with the index of its image.
        batch_inds = np.ones((proposals.shape[0], 1), dtype=np.float32) * i
        blob.append(np.hstack((batch_inds,
                               proposals.astype(np.float32, copy=False),
                               scores.astype(np.float32, copy=False))))

    return np.concatenate(blob, axis=0)
def proposal_layer(rpn_cls_prob_reshape, rpn_bbox_pred, im_info, cfg_key,
                   _feat_stride=[16, ], anchor_scales=[8, 16, 32]):
    """Generate NMS-filtered region proposals from RPN outputs.

    :param rpn_cls_prob_reshape: 4-D channels-last array of fg/bg
        probabilities for the anchors (transposed to channels-first here);
        the second half of the channels is used as the fg scores.
    :param rpn_bbox_pred: 4-D channels-last array of predicted box deltas
        (these are predictions, not ground truth).
    :param im_info: sequence whose element 0 is [height, width, scale].
    :param cfg_key: config section name used to index ``cfg``.
    :param _feat_stride: multiplier turning feature-map cells into image
        offsets (never mutated here).
    :param anchor_scales: scales passed to ``generate_anchors`` (never
        mutated here).
    :return: float32 array with one row per proposal: a zero batch index
        followed by the four box coordinates.

    Algorithm:
      for each (H, W) location i
        generate A anchor boxes centered on cell i
        apply predicted bbox deltas at cell i to each of the A anchors
      clip predicted boxes to image
      remove predicted boxes with either height or width < threshold
      sort all (proposal, score) pairs by score from highest to lowest
      take top pre_nms_topN proposals before NMS
      apply NMS to remaining proposals
      take after_nms_topN proposals after NMS
      return the top proposals
    """
    _anchors = generate_anchors(scales=np.array(anchor_scales))
    _num_anchors = _anchors.shape[0]

    # Bring both inputs to channels-first layout.
    rpn_cls_prob_reshape = np.transpose(rpn_cls_prob_reshape, [0, 3, 1, 2])
    rpn_bbox_pred = np.transpose(rpn_bbox_pred, [0, 3, 1, 2])

    im_info = im_info[0]

    assert rpn_cls_prob_reshape.shape[0] == 1, \
        'Only single item batches are supported'

    # Maximum number of proposals kept before NMS
    pre_nms_topN = cfg[cfg_key].RPN_PRE_NMS_TOP_N
    # Maximum number of proposals kept after NMS
    post_nms_topN = cfg[cfg_key].RPN_POST_NMS_TOP_N
    # Threshold used by NMS
    nms_thresh = cfg[cfg_key].RPN_NMS_THRESH
    # Proposal height and width both need to be greater than RPN_MIN_SIZE
    # (at orig image scale)
    min_size = cfg[cfg_key].RPN_MIN_SIZE

    # the first set of _num_anchors channels are bg probs
    # the second set are the fg probs, which we want
    scores = rpn_cls_prob_reshape[:, _num_anchors:, :, :]
    bbox_deltas = rpn_bbox_pred

    # Single-argument print(...) behaves the same on Python 2 and 3.
    if DEBUG:
        print('im_size: ({}, {})'.format(im_info[0], im_info[1]))
        print('scale: {}'.format(im_info[2]))

    # 1. Generate proposals from bbox deltas and shifted anchors
    height, width = scores.shape[-2:]

    if DEBUG:
        print('score map size: {}'.format(scores.shape))

    # Enumerate all shifts
    shift_x = np.arange(0, width) * _feat_stride
    shift_y = np.arange(0, height) * _feat_stride
    shift_x, shift_y = np.meshgrid(shift_x, shift_y)
    shifts = np.vstack((shift_x.ravel(), shift_y.ravel(),
                        shift_x.ravel(), shift_y.ravel())).transpose()

    # Enumerate all shifted anchors:
    #
    # add A anchors (1, A, 4) to
    # cell K shifts (K, 1, 4) to get
    # shift anchors (K, A, 4)
    # reshape to (K*A, 4) shifted anchors
    A = _num_anchors
    K = shifts.shape[0]
    # This maps feature-map cells to positions in the input image, which
    # makes the later overlap computations straightforward.
    anchors = _anchors.reshape((1, A, 4)) + \
        shifts.reshape((1, K, 4)).transpose((1, 0, 2))
    anchors = anchors.reshape((K * A, 4))

    # Transpose and reshape predicted bbox transformations to get them
    # into the same order as the anchors:
    #
    # bbox deltas are (1, 4 * A, H, W) at this point
    # transpose to (1, H, W, 4 * A)
    # reshape to (1 * H * W * A, 4) where rows are ordered by (h, w, a)
    # in slowest to fastest order
    bbox_deltas = bbox_deltas.transpose((0, 2, 3, 1)).reshape((-1, 4))

    # Same story for the scores:
    #
    # scores are (1, A, H, W) format
    # transpose to (1, H, W, A)
    # reshape to (1 * H * W * A, 1) where rows are ordered by (h, w, a)
    scores = scores.transpose((0, 2, 3, 1)).reshape((-1, 1))

    # Convert anchors into proposals via bbox transformations: an anchor
    # only becomes a proposal (bounding box) after it has been shifted and
    # scaled by the predicted bbox_deltas.
    proposals = bbox_transform_inv(anchors, bbox_deltas)

    # 2. clip predicted boxes to image
    proposals = clip_boxes(proposals, im_info[:2])

    # 3. remove predicted boxes with either height or width < threshold
    # (NOTE: convert min_size to input image scale stored in im_info[2])
    keep = _filter_boxes(proposals, min_size * im_info[2])
    proposals = proposals[keep, :]
    scores = scores[keep]

    # 4. sort all (proposal, score) pairs by score from highest to lowest
    # 5. take top pre_nms_topN
    order = scores.ravel().argsort()[::-1]
    if pre_nms_topN > 0:
        order = order[:pre_nms_topN]
    proposals = proposals[order, :]
    scores = scores[order]

    # 6. apply nms
    # 7. take after_nms_topN
    # 8. return the top proposals
    keep = nms(np.hstack((proposals, scores)), nms_thresh)
    if post_nms_topN > 0:
        keep = keep[:post_nms_topN]
    proposals = proposals[keep, :]
    scores = scores[keep]

    # Output rois blob
    # Our RPN implementation only supports a single input image, so all
    # batch inds are 0
    batch_inds = np.zeros((proposals.shape[0], 1), dtype=np.float32)
    blob = np.hstack((batch_inds, proposals.astype(np.float32, copy=False)))
    return blob
def im_detect(sess, net, im, boxes=None):
    """Detect object classes in an image given object proposals.

    Arguments:
        sess: session object whose ``run`` executes the network.
        net: network exposing ``data``, ``im_info``, ``rois``, ``keep_prob``
            and ``get_output``.
        im (ndarray): color image to test (in BGR order)
        boxes (ndarray): R x 4 array of object proposals

    Returns:
        scores (ndarray): object class scores (raw ``cls_score`` when
            ``cfg.TEST.SVM`` is set, otherwise ``cls_prob``)
        boxes (ndarray): predicted bounding boxes; the input boxes tiled
            once per class when ``cfg.TEST.BBOX_REG`` is off
    """
    blobs, im_scales = _get_blobs(im, boxes)

    has_rpn = cfg.TEST.HAS_RPN
    dedup = cfg.DEDUP_BOXES > 0 and not has_rpn

    # When mapping from image ROIs to feature map ROIs, there's some aliasing
    # (some distinct image ROIs get mapped to the same feature ROI).
    # Here, we identify duplicate feature ROIs, so we only compute features
    # on the unique subset.
    if dedup:
        weights = np.array([1, 1e3, 1e6, 1e9, 1e12])
        roi_hashes = np.round(blobs['rois'] * cfg.DEDUP_BOXES).dot(weights)
        _, index, inv_index = np.unique(
            roi_hashes, return_index=True, return_inverse=True)
        blobs['rois'] = blobs['rois'][index, :]
        boxes = boxes[index, :]

    # Build the inputs of the forward pass.
    if has_rpn:
        data_blob = blobs['data']
        blobs['im_info'] = np.array(
            [[data_blob.shape[1], data_blob.shape[2], im_scales[0]]],
            dtype=np.float32)
        feed_dict = {
            net.data: blobs['data'],
            net.im_info: blobs['im_info'],
            net.keep_prob: 1.0,
        }
    else:
        feed_dict = {
            net.data: blobs['data'],
            net.rois: blobs['rois'],
            net.keep_prob: 1.0,
        }

    fetches = [
        net.get_output('cls_score'),
        net.get_output('cls_prob'),
        net.get_output('bbox_pred'),
        net.get_output('rois'),
    ]
    cls_score, cls_prob, bbox_pred, rois = sess.run(
        fetches, feed_dict=feed_dict)

    if has_rpn:
        assert len(im_scales) == 1, "Only single-image batch implemented"
        boxes = rois[:, 1:5] / im_scales[0]

    # With SVM testing use the raw scores before softmax (under the
    # assumption they were trained as linear SVMs); otherwise use the
    # softmax estimated probabilities.
    scores = cls_score if cfg.TEST.SVM else cls_prob

    if cfg.TEST.BBOX_REG:
        # Apply bounding-box regression deltas
        pred_boxes = clip_boxes(
            bbox_transform_inv(boxes, bbox_pred), im.shape)
    else:
        # Simply repeat the boxes, once for each class
        pred_boxes = np.tile(boxes, (1, scores.shape[1]))

    if dedup:
        # Map scores and predictions back to the original set of boxes
        scores = scores[inv_index, :]
        pred_boxes = pred_boxes[inv_index, :]

    return scores, pred_boxes
def proposal_layer(rpn_cls_prob_reshape, rpn_bbox_pred, im_info, cfg_key,
                   _feat_stride=[16, ], anchor_scales=[16, ]):
    """Produce scored proposals and their deltas from RPN outputs.

    Parameters
    ----------
    rpn_cls_prob_reshape: (1, H, W, Ax2) outputs of RPN, prob of bg or fg
    rpn_bbox_pred: (1, H, W, Ax4), box regression output of RPN
    im_info: a list of [image_height, image_width, scale_ratios]; only
        element 0 of the outer sequence is used
    cfg_key: 'TRAIN' or 'TEST'
    _feat_stride: the downsampling ratio of feature map to the original
        input image
    anchor_scales: the scales passed to ``generate_anchors``

    Returns
    ----------
    blob: float32 array with one [score, box(4)] row per kept proposal
    bbox_deltas: the matching rows of the reshaped ``rpn_bbox_pred``

    Steps: build anchors for every feature-map cell, decode them with the
    predicted deltas, clip to the image, drop boxes that are too small, keep
    the best pre-NMS candidates, run NMS and keep the best post-NMS
    candidates.
    """
    # Base anchors shared by every feature-map cell.
    _anchors = generate_anchors(scales=np.array(anchor_scales))
    _num_anchors = _anchors.shape[0]
    im_info = im_info[0]  # [height, width, scale] of the single image

    assert rpn_cls_prob_reshape.shape[0] == 1, \
        'Only single item batches are supported'

    section = cfg[cfg_key]
    pre_nms_topN = section.RPN_PRE_NMS_TOP_N    # cap before NMS
    post_nms_topN = section.RPN_POST_NMS_TOP_N  # cap after NMS
    nms_thresh = section.RPN_NMS_THRESH         # NMS threshold
    min_size = section.RPN_MIN_SIZE             # minimum box side
    # TODO: this minimum size may need to be lowered later (to 8?)

    height, width = rpn_cls_prob_reshape.shape[1:3]  # feature-map size

    # Only the object (fg) probability of each anchor is of interest; the
    # result has shape (1, H, W, A).
    fg_probs = np.reshape(
        rpn_cls_prob_reshape,
        [1, height, width, _num_anchors, 2])[:, :, :, :, 1]
    scores = np.reshape(fg_probs, [1, height, width, _num_anchors])

    # The predictions are relative values that still have to be turned
    # into coordinates on the image.
    bbox_deltas = rpn_bbox_pred

    if DEBUG:
        print('im_size: ({}, {})'.format(im_info[0], im_info[1]))
        print('scale: {}'.format(im_info[2]))

    # 1. Generate proposals from bbox deltas and shifted anchors
    if DEBUG:
        print('score map size: {}'.format(scores.shape))

    # One (x, y, x, y) shift per feature-map cell; adding these to the base
    # anchors yields every anchor on the image.
    grid_x, grid_y = np.meshgrid(np.arange(0, width) * _feat_stride,
                                 np.arange(0, height) * _feat_stride)
    flat_x = grid_x.ravel()
    flat_y = grid_y.ravel()
    shifts = np.vstack((flat_x, flat_y, flat_x, flat_y)).transpose()

    # (1, A, 4) anchors + (K, 1, 4) shifts -> (K, A, 4) -> (K*A, 4)
    A = _num_anchors
    K = shifts.shape[0]
    per_cell = shifts.reshape((1, K, 4)).transpose((1, 0, 2))
    anchors = (_anchors.reshape((1, A, 4)) + per_cell).reshape((K * A, 4))

    # Flatten deltas and scores to one row per anchor, ordered by (h, w, a)
    # like the anchors above.
    bbox_deltas = bbox_deltas.reshape((-1, 4))  # (H*W*A, 4)
    scores = scores.reshape((-1, 1))

    # Inverse transform: deltas applied to anchors give image coordinates.
    proposals = bbox_transform_inv(anchors, bbox_deltas)

    # 2. clip predicted boxes to image
    proposals = clip_boxes(proposals, im_info[:2])

    # 3. remove predicted boxes with either height or width < threshold
    # (NOTE: convert min_size to input image scale stored in im_info[2])
    big_enough = _filter_boxes(proposals, min_size * im_info[2])
    proposals = proposals[big_enough, :]
    scores = scores[big_enough]
    bbox_deltas = bbox_deltas[big_enough, :]

    # 4. sort all (proposal, score) pairs by score from highest to lowest
    # 5. take top pre_nms_topN
    ranking = scores.ravel().argsort()[::-1]
    if pre_nms_topN > 0:
        ranking = ranking[:pre_nms_topN]
    proposals = proposals[ranking, :]
    scores = scores[ranking]
    bbox_deltas = bbox_deltas[ranking, :]

    # 6. apply nms
    # 7. take after_nms_topN
    # 8. return the top proposals
    survivors = nms(np.hstack((proposals, scores)), nms_thresh)
    if post_nms_topN > 0:
        survivors = survivors[:post_nms_topN]
    proposals = proposals[survivors, :]
    scores = scores[survivors]
    bbox_deltas = bbox_deltas[survivors, :]

    # Output blob: the score occupies the first column.
    blob = np.hstack((scores.astype(np.float32, copy=False),
                      proposals.astype(np.float32, copy=False)))
    return blob, bbox_deltas
def im_detect(sess, net, im, boxes=None, use_box_at=-1):
    """Detect regions in an image and decode a caption for each, step by step.

    Arguments:
        sess: session object forwarded to ``net``.
        net: network exposing ``feed_image`` and ``inference_step``.
        im (ndarray): color image to test (in BGR order)
        boxes (ndarray): R x 4 array of object proposals or None (for RPN)
        use_box_at (int32): accepted for interface compatibility; this
            function body does not read it.

    Returns:
        scores (ndarray): column 1 of the score array from ``feed_image``
        pred_boxes (ndarray): decoded boxes clipped to ``im.shape``
        captions (ndarray): one row per proposal with the word id fed at
            each of the ``cfg.TIME_STEPS - 1`` decoding steps
    """
    # for bbox unnormalization
    bbox_mean = np.array(cfg.TRAIN.BBOX_NORMALIZE_MEANS).reshape((1, 4))
    bbox_stds = np.array(cfg.TRAIN.BBOX_NORMALIZE_STDS).reshape((1, 4))

    blobs, im_scales = _get_blobs(im, boxes)
    assert len(im_scales) == 1, "Only single-image batch implemented"

    im_blob = blobs['data']
    blobs['im_info'] = np.array(
        [[im_blob.shape[1], im_blob.shape[2], im_scales[0]]],
        dtype=np.float32)

    # (TODO wu) for now it only works with "concat" mode
    # get initial states and rois
    cap_state, loc_state, scores, rois = net.feed_image(
        sess, blobs['data'], blobs['im_info'][0])

    # proposal boxes
    boxes = rois[:, 1:5] / im_scales[0]
    proposal_n = rois.shape[0]

    # A single all-ones column makes the first argmax pick word id 0 for
    # every proposal.
    cap_probs = np.ones((proposal_n, 1), dtype=np.int32)
    # index of <EOS> in vocab
    end_idx = 2

    box_offsets = np.zeros((proposal_n, 4), dtype=np.float32)
    bbox_pred = np.zeros((proposal_n, 4), dtype=np.float32)
    token_columns = []

    for _ in range(cfg.TIME_STEPS - 1):
        # dim: [proposal_n, ]
        input_feed = np.argmax(cap_probs, axis=1)
        # Record the chosen word ids before <EOS> entries are zeroed below.
        token_columns.append(input_feed.copy())

        end_ids = np.where(input_feed == end_idx)[0]
        # prepare for seq length in dynamic rnn
        input_feed[end_ids] = 0
        # Proposals that just emitted <EOS> take the box offsets predicted
        # at the previous step.
        box_offsets[end_ids] = bbox_pred[end_ids]

        cap_probs, bbox_pred, cap_state, loc_state = net.inference_step(
            sess, input_feed, cap_state, loc_state)

    if token_columns:
        # dim: [proposal_n, TIME_STEPS - 1]
        captions = np.stack(token_columns, axis=1)
    else:
        # No decoding step was run; return an empty caption per proposal
        # instead of failing on an unbound name.
        captions = np.zeros((proposal_n, 0), dtype=np.intp)

    # bbox target unnormalization
    box_deltas = box_offsets * bbox_stds + bbox_mean

    # do the transformation
    pred_boxes = bbox_transform_inv(boxes, box_deltas)
    pred_boxes = clip_boxes(pred_boxes, im.shape)

    return scores[:, 1], pred_boxes, captions