def demo_test(net, im, pyramid, conf_thresh=0.1, nms_thresh=0.3):
    """Run the detector on one image and return the NMS-filtered detections.

    Args:
        net: Network handle, forwarded unchanged to ``detect_list``.
        im: Input image, forwarded unchanged to ``detect_list``.
        pyramid: Forwarded to ``detect_list`` as its ``pyramid`` keyword.
        conf_thresh: Detections whose score (column 0 of the returned
            ``probs``) is not strictly greater than this value are dropped
            before NMS. Defaults to the previously hard-coded 0.1.
        nms_thresh: Threshold handed to ``nms``. Defaults to the previously
            hard-coded 0.3.

    Returns:
        A float32 array with one row per kept detection: the box columns
        returned by ``detect_list`` followed by the score as last column.
    """
    probs, boxes = detect_list(net, im, pyramid=pyramid)

    # Keep only the confident detections.
    inds = np.where(probs[:, 0] > conf_thresh)[0]
    probs = probs[inds, 0]
    boxes = boxes[inds, :]

    # Stack [box..., score] rows, the layout consumed by nms().
    dets = np.hstack((boxes, probs[:, np.newaxis])).astype(np.float32,
                                                          copy=False)
    keep = nms(dets, nms_thresh)
    return dets[keep, :]
def forward(self, img_path, i):
    """Run the bound executor on a random 500x500 crop of an image.

    The image is randomly cropped (if larger than 500 px on a side) and
    pasted at a random position into a zero-filled 500x500 buffer. The buffer
    is written to ``input.jpg`` in the current directory, fed through
    ``self.exec_``, and the resulting proposals are filtered (top 6000 by
    score, NMS at 0.05, first 300, score > 0.4) and handed to ``draw_boxes``.

    Args:
        img_path: Path of the image to load with ``cv2.imread``.
        i: Value formatted into the ``'res_{}'`` tag passed to ``draw_boxes``.

    Raises:
        IOError: If ``cv2.imread`` cannot read ``img_path``.
    """
    im = cv2.imread(img_path)
    if im is None:
        # cv2.imread reports a missing/unreadable file by returning None;
        # fail here with a clear message instead of at ``im.shape`` below.
        raise IOError('Could not read image: {}'.format(img_path))

    input_size = 500
    imageBuffer = np.zeros([input_size, input_size, 3])

    # Random crop window, clamped to the image. The order of the randint
    # calls is kept so seeded runs stay reproducible.
    crop_y1 = random.randint(0, max(0, im.shape[0] - input_size))
    crop_x1 = random.randint(0, max(0, im.shape[1] - input_size))
    crop_y2 = min(im.shape[0] - 1, crop_y1 + input_size - 1)
    crop_x2 = min(im.shape[1] - 1, crop_x1 + input_size - 1)
    crop_h = crop_y2 - crop_y1 + 1
    crop_w = crop_x2 - crop_x1 + 1

    # Random paste position inside the buffer (only varies when the crop is
    # smaller than the buffer).
    paste_y1 = random.randint(0, input_size - crop_h)
    paste_x1 = random.randint(0, input_size - crop_w)
    paste_y2 = paste_y1 + crop_h - 1
    paste_x2 = paste_x1 + crop_w - 1

    imageBuffer[paste_y1:paste_y2 + 1, paste_x1:paste_x2 + 1, :] = \
        im[crop_y1:crop_y2 + 1, crop_x1:crop_x2 + 1, :]
    cv2.imwrite('input.jpg', imageBuffer)

    # Reverse the channel axis and move it first: (H, W, C) -> (1, C, H, W).
    blob = imageBuffer[:, :, ::-1].transpose(2, 0, 1)
    blob = mx.nd.array(blob[np.newaxis, :, :, :])
    blob.copyto(self.exec_.arg_dict['data'])
    self.exec_.forward(is_train=False)
    outputs = [output.asnumpy() for output in self.exec_._get_outputs()]

    cls_map = outputs[0]
    reg_map = outputs[1]
    bbox_deltas = reg_map.transpose((0, 2, 3, 1)).reshape((-1, 4))
    # Channel 1 of the class map is reshaped to the hard-coded
    # (1, 25, 63, 63) layout before flattening to one score per row.
    scores = cls_map[0, 1:2, :, :].reshape((1, 25, 63, 63))
    scores = scores.transpose((0, 2, 3, 1)).reshape((-1, 1))

    proposals = bbox_transform_inv(self.anchors, bbox_deltas)

    # Keep the 6000 best-scoring proposals before NMS.
    order = scores.ravel().argsort()[::-1]
    order = order[:6000]
    scores = scores[order]
    proposals = proposals[order, :]

    # NMS, then at most 300 survivors.
    keep = nms(np.hstack((proposals, scores)), 0.05)
    keep = keep[:300]
    proposals = proposals[keep, :]
    scores = scores[keep]

    # Final confidence cut.
    keep = np.where(scores > 0.4)[0]
    proposals = proposals[keep, :]
    scores = scores[keep]

    draw_boxes(imageBuffer, proposals, 'res_{}'.format(i))
def nms(self, nms_threshold):
    """Apply non-max suppression in place to every record of ``self.content``.

    For each record that has at least one ROI, the ``'rois'`` and
    ``'scores'`` arrays are concatenated into one tensor, passed to the
    module-level ``nms`` function, and ``'scores'``, ``'rois'`` and
    ``'class_ids'`` are replaced by the kept entries.

    Args:
        nms_threshold: Threshold forwarded to the module-level ``nms``.
    """
    for key_record in range(len(self.content)):
        record = self.content[key_record]

        # Skip records without boxes. Comparing the array against
        # ``np.array([])`` is not a reliable emptiness test (elementwise /
        # broadcast comparison), so check the element count instead.
        if np.size(record['rois']) == 0:
            continue

        dets = torch.cat(
            (torch.from_numpy(record['rois']).float(),
             torch.from_numpy(record['scores']).unsqueeze(1).float()), 1)
        keep = nms(dets, nms_threshold)
        ind = keep.numpy()

        record['scores'] = record['scores'][ind]
        record['rois'] = record['rois'][ind]
        record['class_ids'] = record['class_ids'][ind]
def nms_cuda(boxes_np, nms_thresh=0.7, xyxy=True):
    """Run the CUDA ``nms`` routine on a NumPy detection array.

    Args:
        boxes_np: 2-D array with five columns per row (four box coordinates
            followed by the score).
        nms_thresh: Threshold forwarded to ``nms``.
        xyxy: When true the columns are taken as (x1, y1, x2, y2, score) and
            reordered to (y1, x1, y2, x2, score) before calling ``nms``.

    Returns:
        1-D NumPy array with the indices returned by ``nms`` (a 2-D result
        is flattened).
    """
    if xyxy:
        x1, y1, x2, y2, scores = np.split(boxes_np, 5, axis=1)
        boxes_np = np.hstack([y1, x1, y2, x2, scores])

    boxes_pth = torch.from_numpy(boxes_np).float().cuda()
    pick = nms(boxes_pth, nms_thresh)
    pick = pick.cpu().data.numpy()

    if pick.ndim == 2:
        # Flatten with reshape(-1) rather than squeeze(): squeeze() turns a
        # single surviving index of shape (1, 1) into a 0-d array, and using
        # that as an index silently drops a dimension at the call site.
        pick = pick.reshape(-1)
    return pick
def temporal_nms(bboxes, thresh, score_ind=3):
    """
    One-dimensional non-maximal suppression.

    Uses the module-level ``nms`` when it is available (truthy) and
    ``temporal_nms_fallback`` otherwise.

    :param bboxes: segments laid out as [[st, ed, cls, score], ...]
    :param thresh: threshold forwarded to the NMS routine
    :param score_ind: index of the score inside each segment; honoured by
        both code paths
    :return: on the ``nms`` path, a list with the kept entries of ``bboxes``;
        on the fallback path, whatever ``temporal_nms_fallback`` returns
    """
    if not nms:
        return temporal_nms_fallback(bboxes, thresh, score_ind=score_ind)

    if len(bboxes) == 0:
        # Nothing to suppress; avoids handing a shapeless array to nms().
        return []

    # Build [st, ed, score] rows. The score column previously ignored
    # ``score_ind`` and always read index 3.
    dets = np.array([[x[0], x[1], x[score_ind]] for x in bboxes])
    keep = nms(dets, thresh, device_id=0)
    return [bboxes[i] for i in keep]
def detect_im(net, im, thresh=0.05):
    """Detect objects in a single image at the first configured test scale.

    Args:
        net: Network handle passed to ``forward_net``.
        im: Input image; its ``shape`` drives the scaling factor.
        thresh: Detections whose score (column 0 of ``probs``) is not
            strictly above this value are discarded before NMS.

    Returns:
        float32 array of rows ``[box (4 columns), score]`` that survive NMS
        with ``cfg.TEST.NMS_THRESH``.
    """
    scale = _compute_scaling_factor(im.shape, cfg.TEST.SCALES[0],
                                    cfg.TEST.MAX_SIZE)
    blobs = _get_image_blob(im, [scale])
    probs, boxes = forward_net(net, blobs[0], scale, False)

    # Only the first four box columns are used.
    boxes = boxes[:, 0:4]

    confident = np.where(probs[:, 0] > thresh)[0]
    kept_scores = probs[confident, 0]
    kept_boxes = boxes[confident, :]

    dets = np.hstack((kept_boxes, kept_scores[:, np.newaxis]))
    dets = dets.astype(np.float32, copy=False)

    survivors = nms(dets, cfg.TEST.NMS_THRESH)
    return dets[survivors, :]
def detect_im(net, im, thresh=0.05):
    """Run single-scale detection on ``im`` and return NMS-filtered boxes.

    Args:
        net: Network handle passed to ``forward_net``.
        im: Input image; ``im.shape`` is used to compute the scale factor.
        thresh: Minimum (exclusive) score, read from column 0 of ``probs``.

    Returns:
        float32 array whose rows are four box coordinates followed by the
        score, restricted to the indices kept by ``nms``.
    """
    test_cfg = cfg.TEST
    im_scale = _compute_scaling_factor(im.shape, test_cfg.SCALES[0],
                                       test_cfg.MAX_SIZE)
    im_blob = _get_image_blob(im, [im_scale])
    probs, boxes = forward_net(net, im_blob[0], im_scale, False)

    # Select confident rows, then keep their first four box columns.
    rows = np.where(probs[:, 0] > thresh)[0]
    score_col = probs[rows, 0][:, np.newaxis]
    box_cols = boxes[:, 0:4][rows, :]

    dets = np.hstack((box_cols, score_col)).astype(np.float32, copy=False)
    keep = nms(dets, cfg.TEST.NMS_THRESH)
    cls_dets = dets[keep, :]
    return cls_dets
def _nms_boxes(self, boxes, scores): """ Perform non-maximum supression of similar boxes/detections. Args: boxes: Rois for this image. Array (num_rois, num_classes * 4). scores: Class probabilities for each roi. Array (num_rois, num_classes). Returns: A list of NMSed class detections for this image. """ all_boxes = [[] for _ in range(self.num_classes)] # skip j = 0, because it's the background class for class_id in range(1, self.num_classes): # Whether to use only the top class for each box or # all classes over a certain threshhold. if self.top_class_only: detection_criterion = (np.argmax(scores, axis=1) == class_id) else: detection_criterion = (scores[:, class_id] > self.class_detection_thresh) class_detected_indexes = np.where(detection_criterion)[0] cls_scores = scores[class_detected_indexes, class_id] class_box_start = class_id * 4 class_box_end = class_box_start + 4 cls_boxes = boxes[class_detected_indexes, class_box_start:class_box_end] cls_dets = np.hstack( (cls_boxes, cls_scores[:, np.newaxis])).astype(np.float32, copy=False) if len(cls_dets) > 1: keep = nms(cls_dets, self.nms_thresh, force_cpu=True) cls_dets = cls_dets[keep, :] all_boxes[class_id] = cls_dets return all_boxes
def forward(self, bottom, top):
    """Turn RPN scores and box deltas into proposal RoIs.

    Algorithm:

    for each (H, W) location i
      generate A anchor boxes centered on cell i
      apply predicted bbox deltas at cell i to each of the A anchors
    clip predicted boxes to image
    remove predicted boxes with either height or width < threshold
    sort all (proposal, score) pairs by score from highest to lowest
    take top pre_nms_topN proposals before NMS
    (training only) apply NMS with the configured threshold and take
    post_nms_topN proposals
    return the top proposals (-> RoIs top, scores top)

    Args:
        bottom: Input blobs; ``bottom[0]`` holds the class scores,
            ``bottom[1]`` the bbox deltas and ``bottom[2]`` the image info
            (height, width, scale in its first row).
        top: Output blobs; ``top[0]`` receives the RoIs as
            ``[batch_ind, box (4 columns)]`` rows and, when present,
            ``top[1]`` receives the scores.
    """
    assert bottom[0].data.shape[0] == 1, \
        'Only single item batches are supported'

    if self.phase == 0:
        cfg_key = 'TRAIN'
    elif self.phase == 1:
        cfg_key = 'TEST'
    else:
        cfg_key = str(self.phase)  # either 'TRAIN' or 'TEST'

    if cfg_key == 'TRAIN':
        nms_thresh = cfg[cfg_key].NMS_THRESH
        post_nms_topN = cfg[cfg_key].ANCHOR_N_POST_NMS
        pre_nms_topN = cfg[cfg_key].ANCHOR_N_PRE_NMS
    if cfg_key == 'TEST':
        pre_nms_topN = cfg[cfg_key].N_DETS_PER_MODULE
    min_size = cfg[cfg_key].ANCHOR_MIN_SIZE

    # the first set of _num_anchors channels are bg probs
    # the second set are the fg probs, which we want
    scores = bottom[0].data[:, self._num_anchors:, :, :]
    bbox_deltas = bottom[1].data
    im_info = bottom[2].data[0, :]

    if DEBUG:
        # Single-argument print(...) behaves the same on Python 2 and 3.
        print('im_size: ({}, {})'.format(im_info[0], im_info[1]))
        print('scale: {}'.format(im_info[2]))

    # 1. Generate proposals from bbox deltas and shifted anchors
    height, width = scores.shape[-2:]

    if DEBUG:
        print('score map size: {}'.format(scores.shape))

    # Enumerate all shifts
    shift_x = np.arange(0, width) * self._feat_stride
    shift_y = np.arange(0, height) * self._feat_stride
    shift_x, shift_y = np.meshgrid(shift_x, shift_y)
    shifts = np.vstack((shift_x.ravel(), shift_y.ravel(),
                        shift_x.ravel(), shift_y.ravel())).transpose()

    # Enumerate all shifted anchors:
    #
    # add A anchors (1, A, 4) to
    # cell K shifts (K, 1, 4) to get
    # shift anchors (K, A, 4)
    # reshape to (K*A, 4) shifted anchors
    A = self._num_anchors
    K = shifts.shape[0]
    anchors = self._anchors.reshape((1, A, 4)) + \
        shifts.reshape((1, K, 4)).transpose((1, 0, 2))
    anchors = anchors.reshape((K * A, 4))

    # Transpose and reshape predicted bbox transformations to get them
    # into the same order as the anchors:
    #
    # bbox deltas will be (1, 4 * A, H, W) format
    # transpose to (1, H, W, 4 * A)
    # reshape to (1 * H * W * A, 4) where rows are ordered by (h, w, a)
    # in slowest to fastest order
    bbox_deltas = bbox_deltas.transpose((0, 2, 3, 1)).reshape((-1, 4))

    # Same story for the scores:
    #
    # scores are (1, A, H, W) format
    # transpose to (1, H, W, A)
    # reshape to (1 * H * W * A, 1) where rows are ordered by (h, w, a)
    scores = scores.transpose((0, 2, 3, 1)).reshape((-1, 1))

    # Convert anchors into proposals via bbox transformations
    proposals = bbox_transform_inv(anchors, bbox_deltas)

    # 2. clip predicted boxes to image
    proposals = clip_boxes(proposals, im_info[:2])

    # 3. remove predicted boxes with either height or width < threshold
    # (NOTE: convert min_size to input image scale stored in im_info[2])
    keep = _filter_boxes(proposals, min_size * im_info[2])
    proposals = proposals[keep, :]
    scores = scores[keep]

    # 4. sort all (proposal, score) pairs by score from highest to lowest
    # 5. take top pre_nms_topN
    order = scores.ravel().argsort()[::-1]
    if pre_nms_topN > 0:
        order = order[:pre_nms_topN]
    proposals = proposals[order, :]
    scores = scores[order]

    # 6. apply nms (training only; at test time NMS runs outside this layer)
    # 7. take after_nms_topN
    # 8. return the top proposals (-> RoIs top)
    # Gate on cfg_key rather than ``self.phase == 0`` so the branch is also
    # taken when the phase is supplied as the string 'TRAIN'.
    if cfg_key == 'TRAIN':
        keep = nms(np.hstack((proposals, scores)), nms_thresh)
        if post_nms_topN > 0:
            keep = keep[:post_nms_topN]
        proposals = proposals[keep, :]
        scores = scores[keep]

    # Output rois blob
    # Our RPN implementation only supports a single input image, so all
    # batch inds are 0
    if proposals.shape[0] == 0:
        # No proposal survived: emit one placeholder RoI.
        blob = np.array([[0, 0, 0, 16, 16]], dtype=np.float32)
    else:
        batch_inds = np.zeros((proposals.shape[0], 1), dtype=np.float32)
        blob = np.hstack((batch_inds,
                          proposals.astype(np.float32, copy=False)))
    top[0].reshape(*(blob.shape))
    top[0].data[...] = blob

    # [Optional] output scores blob
    if len(top) > 1:
        top[1].reshape(*(scores.shape))
        top[1].data[...] = scores
def refine_detections(rois, probs, deltas, window, config):
    """Refine classified proposals, filter overlaps and return final
    detections.

    Each ROI is assigned its highest-probability class, shifted by that
    class's box deltas, scaled to image coordinates, clipped to ``window``
    and rounded. Background and low-confidence ROIs are dropped, per-class
    NMS is applied, and the top ``config.DETECTION_MAX_INSTANCES`` remaining
    ROIs by score are returned.

    Inputs:
        rois: [N, (y1, x1, y2, x2)] in normalized coordinates
        probs: [N, num_classes]. Class probabilities.
        deltas: [N, num_classes, (dy, dx, log(dh), log(dw))]. Class-specific
                bounding box deltas.
        window: (y1, x1, y2, x2) in image coordinates. The part of the image
            that contains the image excluding the padding.
        config: Object providing GPU_COUNT, RPN_BBOX_STD_DEV, IMAGE_SHAPE,
            DETECTION_MIN_CONFIDENCE, DETECTION_NMS_THRESHOLD and
            DETECTION_MAX_INSTANCES.

    Returns detections shaped: [N, (y1, x1, y2, x2, class_id, score)]
    """
    # Class IDs per ROI (index of the maximum probability)
    _, class_ids = torch.max(probs, dim=1)

    # Class probability of the top class of each ROI
    # Class-specific bounding box deltas
    idx = torch.arange(class_ids.size()[0]).long()
    if config.GPU_COUNT:
        idx = idx.cuda()
    class_scores = probs[idx, class_ids.data]
    deltas_specific = deltas[idx, class_ids.data]

    # Apply bounding box deltas
    # Shape: [boxes, (y1, x1, y2, x2)] in normalized coordinates
    std_dev = Variable(torch.from_numpy(
        np.reshape(config.RPN_BBOX_STD_DEV, [1, 4])).float(),
        requires_grad=False)
    if config.GPU_COUNT:
        std_dev = std_dev.cuda()
    refined_rois = proposal.apply_box_deltas(rois, deltas_specific * std_dev)

    # Convert coordinates to image domain
    height, width = config.IMAGE_SHAPE[:2]
    scale = Variable(torch.from_numpy(np.array([height, width, height, width])).float(), requires_grad=False)
    if config.GPU_COUNT:
        scale = scale.cuda()
    refined_rois *= scale

    # Clip boxes to image window
    refined_rois = clip_to_window(window, refined_rois)

    # Round and cast to int since we're dealing with pixels now
    # (torch.round keeps the floating dtype; only the values become integral)
    refined_rois = torch.round(refined_rois)

    # TODO: Filter out boxes with zero area

    # Filter out background boxes (class id 0)
    keep_bool = class_ids > 0

    # Filter out low confidence boxes
    if config.DETECTION_MIN_CONFIDENCE:
        keep_bool = keep_bool & (class_scores >= config.DETECTION_MIN_CONFIDENCE)
    keep = torch.nonzero(keep_bool)[:, 0]

    # Apply per-class NMS
    pre_nms_class_ids = class_ids[keep.data]
    pre_nms_scores = class_scores[keep.data]
    pre_nms_rois = refined_rois[keep.data]

    # NOTE(review): ``nms_keep`` is only bound inside this loop; if the loop
    # body never runs (presumably when no ROI passed the filters above), the
    # intersect1d call below would hit an unbound name - confirm callers
    # guarantee at least one surviving ROI.
    for i, class_id in enumerate(util_pytorch.unique1d(pre_nms_class_ids)):
        # Pick detections of this class
        ixs = torch.nonzero(pre_nms_class_ids == class_id)[:, 0]

        # Sort by score, highest first, before handing to nms
        ix_rois = pre_nms_rois[ixs.data]
        ix_scores = pre_nms_scores[ixs]
        ix_scores, order = ix_scores.sort(descending=True)
        ix_rois = ix_rois[order.data, :]

        class_keep = nms(
            torch.cat((ix_rois, ix_scores.unsqueeze(1)), dim=1).data,
            config.DETECTION_NMS_THRESHOLD)

        # Map indices: nms output -> sorted order -> per-class rows -> ``keep``
        class_keep = keep[ixs[order[class_keep].data].data]

        # Accumulate the kept indices across classes
        if i == 0:
            nms_keep = class_keep
        else:
            nms_keep = util_pytorch.unique1d(torch.cat((nms_keep, class_keep)))
    keep = util_pytorch.intersect1d(keep, nms_keep)

    # Keep top detections
    roi_count = config.DETECTION_MAX_INSTANCES
    top_ids = class_scores[keep.data].sort(descending=True)[1][:roi_count]
    keep = keep[top_ids.data]

    # Arrange output as [N, (y1, x1, y2, x2, class_id, score)]
    # Coordinates are in image domain.
    result = torch.cat(
        (refined_rois[keep.data],
         class_ids[keep.data].unsqueeze(1).float(),
         class_scores[keep.data].unsqueeze(1)), dim=1)

    return result
def proposal_layer(inputs, proposal_count, nms_threshold, anchors, config=None):
    """Receives anchor scores and selects a subset to pass as proposals
    to the second stage. Filtering is done based on anchor scores and
    non-max suppression to remove overlaps. It also applies bounding
    box refinement deltas to anchors.

    Inputs:
        inputs: Two-element sequence ``[rpn_probs, rpn_bbox]`` with
            rpn_probs: [batch, anchors, (bg prob, fg prob)]
            rpn_bbox: [batch, anchors, (dy, dx, log(dh), log(dw))]
            The sequence itself is not modified.
        proposal_count: Maximum number of proposals kept after NMS.
        nms_threshold: Threshold forwarded to ``nms``.
        anchors: Anchor boxes, indexed along dim 0 in the same order as the
            scores.
        config: Object providing RPN_BBOX_STD_DEV, GPU_COUNT and IMAGE_SHAPE.
            Despite the ``None`` default it is dereferenced unconditionally,
            so callers must supply it.

    Returns:
        Proposals in normalized coordinates [batch, rois, (y1, x1, y2, x2)]
    """
    # Currently only supports batchsize 1. Squeeze into locals instead of
    # writing the squeezed tensors back into the caller's ``inputs`` list.
    rpn_probs = inputs[0].squeeze(0)
    rpn_bbox = inputs[1].squeeze(0)

    # Box Scores. Use the foreground class confidence.
    scores = rpn_probs[:, 1]

    # Box deltas, scaled by the configured standard deviations
    deltas = rpn_bbox
    std_dev = Variable(torch.from_numpy(
        np.reshape(config.RPN_BBOX_STD_DEV, [1, 4])).float(),
        requires_grad=False)
    if config.GPU_COUNT:
        std_dev = std_dev.cuda()
    deltas = deltas * std_dev

    # Improve performance by trimming to top anchors by score
    # and doing the rest on the smaller subset.
    pre_nms_limit = min(6000, anchors.size()[0])
    scores, order = scores.sort(descending=True)
    order = order[:pre_nms_limit]
    scores = scores[:pre_nms_limit]
    deltas = deltas[order.data, :]  # TODO: Support batch size > 1 ff.
    anchors = anchors[order.data, :]

    # Apply deltas to anchors to get refined anchors.
    boxes = apply_box_deltas(anchors, deltas)

    # Clip to image boundaries.
    height, width = config.IMAGE_SHAPE[:2]
    window = np.array([0, 0, height, width]).astype(np.float32)
    boxes = clip_boxes(boxes, window)

    # Filter out small boxes
    # According to Xinlei Chen's paper, this reduces detection accuracy
    # for small objects, so we're skipping it.

    # Non-max suppression
    keep = nms(torch.cat((boxes, scores.unsqueeze(1)), 1).data, nms_threshold)
    keep = keep[:proposal_count]
    boxes = boxes[keep, :]

    # Normalize dimensions to range of 0 to 1.
    norm = Variable(torch.from_numpy(np.array([height, width, height, width])).float(), requires_grad=False)
    if config.GPU_COUNT:
        norm = norm.cuda()
    normalized_boxes = boxes / norm

    # Add back batch dimension
    normalized_boxes = normalized_boxes.unsqueeze(0)

    return normalized_boxes
def forward(self, bottom, top):
    """Turn multi-class scores and (optionally refined) box deltas into RoIs.

    Algorithm:

    for each (H, W) location i
      generate A anchor boxes centered on cell i
      apply predicted bbox deltas at cell i to each of the A anchors
      apply the ``self._num_refine`` refinement deltas in turn
    clip predicted boxes to image
    (optionally) keep only the subsampled anchor positions
    remove predicted boxes with either height or width < threshold
    sort all (proposal, score) pairs by max foreground score, highest first
    take top pre_nms_topN proposals and, at test time, cut at SCORE_THRESH
    (training only) apply NMS and take post_nms_topN proposals
    return the top proposals (-> RoIs top, scores top)

    Args:
        bottom: Input blobs. ``bottom[-3]`` holds the class scores,
            ``bottom[-2]`` the bbox deltas, ``bottom[-1]`` the image info and
            ``bottom[0 .. self._num_refine - 1]`` the refinement deltas.
        top: Output blobs; ``top[0]`` receives ``[batch_ind, box]`` rows and,
            when present, ``top[1]`` receives the scores.
    """
    assert bottom[0].data.shape[0] == 1, \
        'Only single item batches are supported'

    if self.phase == 0:
        cfg_key = 'TRAIN'
    elif self.phase == 1:
        cfg_key = 'TEST'
    else:
        cfg_key = str(self.phase)  # either 'TRAIN' or 'TEST'

    # SCORE_THRESH is only configured for TEST. It used to be left undefined
    # otherwise, and the resulting NameError was swallowed by a bare
    # ``except`` that truncated the proposals to a single box. ``None`` now
    # explicitly means "no score cut".
    score_thresh = None
    if cfg_key == 'TRAIN':
        nms_thresh = cfg[cfg_key].NMS_THRESH
        post_nms_topN = cfg[cfg_key].ANCHOR_N_POST_NMS
        pre_nms_topN = cfg[cfg_key].ANCHOR_N_PRE_NMS
    if cfg_key == 'TEST':
        pre_nms_topN = cfg[cfg_key].N_DETS_PER_MODULE
        score_thresh = cfg[cfg_key].SCORE_THRESH
    min_size = cfg[cfg_key].ANCHOR_MIN_SIZE

    scores = bottom[-3].data  # For multi-class
    bbox_deltas = bottom[-2].data
    im_info = bottom[-1].data[0, :]

    # 1. Generate proposals from bbox deltas and shifted anchors
    height, width = scores.shape[-2:]

    # Enumerate all shifts
    shift_x = np.arange(0, width) * self._feat_stride[0]
    shift_y = np.arange(0, height) * self._feat_stride[0]
    shift_x, shift_y = np.meshgrid(shift_x, shift_y)
    shifts = np.vstack((shift_x.ravel(), shift_y.ravel(),
                        shift_x.ravel(), shift_y.ravel())).transpose()

    # Enumerate all shifted anchors:
    #
    # add A anchors (1, A, 4) to
    # cell K shifts (K, 1, 4) to get
    # shift anchors (K, A, 4)
    # reshape to (K*A, 4) shifted anchors
    A = self._num_anchors
    K = shifts.shape[0]
    # Floor division keeps this an int on Python 3 as well (it feeds reshape).
    num_classes = scores.shape[1] // (A * self._num_feats)
    anchors = self._anchors.reshape((1, A, 4)) + \
        shifts.reshape((1, K, 4)).transpose((1, 0, 2))
    anchors = anchors.reshape((K * A, 4))
    self.anchors = anchors

    # Transpose and reshape predicted bbox transformations to get them
    # into the same order as the anchors:
    #
    # bbox deltas will be (1, 4 * A, H, W) format
    # transpose to (1, H, W, 4 * A)
    # reshape to (1 * H * W * A, 4) where rows are ordered by (h, w, a)
    # in slowest to fastest order
    bbox_deltas = bbox_deltas.transpose((0, 2, 3, 1)).reshape((-1, 4))

    # Scores: move channels last, split them into (num_classes, A * feats),
    # swap those two axes and flatten to one row of class scores per box.
    scores = scores.transpose((0, 2, 3, 1)).reshape(
        (-1, num_classes, A * self._num_feats)).transpose(
            (0, 2, 1)).reshape((-1, num_classes))

    # Convert anchors into proposals via bbox transformations; each anchor
    # is repeated once per feature.
    new_anchors = np.concatenate(
        [anchors[:, np.newaxis, :]] * self._num_feats,
        axis=1).reshape((-1, 4))
    proposals = bbox_transform_inv(new_anchors, bbox_deltas)

    for i in range(self._num_refine):
        # Do this because a combination of bbox_transform_inv and
        # _compute_targets will cause a larger 3rd and 4th entry of
        # coordinates. We do not do this at the last regression, just to
        # follow the original code.
        proposals[:, 2:4] -= 1
        refine_delta = bottom[i].data
        refine_delta = refine_delta.transpose((0, 2, 3, 1)).reshape((-1, 4))
        proposals = bbox_transform_inv(proposals, refine_delta)

    # 2. clip predicted boxes to image
    proposals = clip_boxes(proposals, im_info[:2])

    if self._subsampled:
        # Mark, per anchor, the grid positions at that anchor's stride
        # relative to the base stride, and keep only those proposals.
        anchor_map = np.zeros((height, width, A))
        for i in range(A):
            stride = self._feat_stride[
                i // len(self._shifts) ** 2] // self._feat_stride[0]
            anchor_map[::stride, ::stride, i] = 1
        anchor_map = anchor_map.reshape((K * A))
        subsampled_inds = np.where(anchor_map)[0]
        proposals = proposals[subsampled_inds, :]
        scores = scores[subsampled_inds, :]

    # 3. remove predicted boxes with either height or width < threshold
    # (NOTE: convert min_size to input image scale stored in im_info[2])
    keep = _filter_boxes(proposals, min_size * im_info[2])
    proposals = proposals[keep, :]
    scores = scores[keep, :]

    # 4. sort all (proposal, score) pairs by score from highest to lowest
    # 5. take top pre_nms_topN
    max_score = np.max(scores[:, 1:], axis=1).ravel()
    order = max_score.argsort()[::-1]
    if score_thresh is not None:
        above = np.where(max_score[order] >= score_thresh)[0]
        # Nothing greater than score_thresh: just keep the largest one.
        thresh_idx = above.max() if above.size else 0
    if pre_nms_topN > 0:
        order = order[:pre_nms_topN]
    if score_thresh is not None:
        order = order[:thresh_idx + 1]
    proposals = proposals[order, :]
    scores = scores[order, :]

    # 6. apply nms (training only; at test time NMS runs outside this layer)
    # 7. take after_nms_topN
    # 8. return the top proposals (-> RoIs top)
    # Gate on cfg_key rather than ``self.phase == 0`` so the branch is also
    # taken when the phase is supplied as the string 'TRAIN'.
    if cfg_key == 'TRAIN':
        keep = nms(np.hstack((proposals, scores)), nms_thresh)
        if post_nms_topN > 0:
            keep = keep[:post_nms_topN]
        proposals = proposals[keep, :]
        scores = scores[keep]

    # Output rois blob
    # Our RPN implementation only supports a single input image, so all
    # batch inds are 0
    if proposals.shape[0] == 0:
        # No proposal survived: emit one placeholder RoI.
        blob = np.array([[0, 0, 0, 16, 16]], dtype=np.float32)
    else:
        batch_inds = np.zeros((proposals.shape[0], 1), dtype=np.float32)
        blob = np.hstack(
            (batch_inds, proposals.astype(np.float32, copy=False)))
    top[0].reshape(*(blob.shape))
    top[0].data[...] = blob

    # [Optional] output scores blob
    if len(top) > 1:
        top[1].reshape(*(scores.shape))
        top[1].data[...] = scores
def forward(self, bottom, top):
    """Generate proposal RoIs and record the index bookkeeping per stage.

    Algorithm:

    for each (H, W) location i
      generate A anchor boxes centered on cell i
      apply predicted transform deltas at cell i to each of the A anchors
    clip predicted boxes to image
    remove predicted boxes with either height or width < threshold
    sort all (proposal, score) pairs by score from highest to lowest
    take top pre_nms_topN proposals before NMS
    apply NMS with the configured threshold to remaining proposals
    take after_nms_topN proposals after NMS
    return the top proposals (-> RoIs top)

    The indices kept at each stage are stored on ``self`` (anchor clip,
    proposal clip, size filter, sort, NMS).

    Args:
        bottom: Input blobs; ``bottom[0]`` holds the class scores,
            ``bottom[1]`` the transform deltas and ``bottom[2]`` the image
            info (height, width, scale in its first row).
        top: Output blobs, addressed through ``self._top_name_map`` by the
            blob names ``'rois'`` and, when enabled, ``'proposal_index'``.
    """
    assert bottom[0].data.shape[0] == 1, \
        'Only single item batches are supported'

    cfg_key = str(self.phase)  # either 'TRAIN' or 'TEST'
    pre_nms_topN = cfg[cfg_key].RPN_PRE_NMS_TOP_N
    post_nms_topN = cfg[cfg_key].RPN_POST_NMS_TOP_N
    nms_thresh = cfg[cfg_key].RPN_NMS_THRESH
    min_size = cfg[cfg_key].RPN_MIN_SIZE

    # the first set of _num_anchors channels are bg probs
    # the second set are the fg probs, which we want
    scores = bottom[0].data[:, self._num_anchors:, :, :]
    bbox_deltas = bottom[1].data
    im_info = bottom[2].data[0, :]

    # 1. Generate proposals from transform deltas and shifted anchors
    height, width = scores.shape[-2:]
    self._height = height
    self._width = width

    # Enumerate all shifts
    shift_x = np.arange(0, self._width) * self._feat_stride
    shift_y = np.arange(0, self._height) * self._feat_stride
    shift_x, shift_y = np.meshgrid(shift_x, shift_y)
    shifts = np.vstack((shift_x.ravel(), shift_y.ravel(),
                        shift_x.ravel(), shift_y.ravel())).transpose()

    # Enumerate all shifted anchors:
    #
    # add A anchors (1, A, 4) to
    # cell K shifts (K, 1, 4) to get
    # shift anchors (K, A, 4)
    # reshape to (K*A, 4) shifted anchors
    A = self._num_anchors
    K = shifts.shape[0]
    anchors = self._anchors.reshape((1, A, 4)) + \
        shifts.reshape((1, K, 4)).transpose((1, 0, 2))
    anchors = anchors.reshape((K * A, 4))
    # Only the index output of clip_boxes is needed for the raw anchors.
    _, keep = clip_boxes(anchors, im_info[:2])
    self._anchor_index_before_clip = keep

    # Transpose and reshape predicted transform deltas to get them
    # into the same order as the anchors:
    #
    # transform deltas will be (1, 4 * A, H, W) format
    # transpose to (1, H, W, 4 * A)
    # reshape to (1 * H * W * A, 4) where rows are ordered by (h, w, a)
    # in slowest to fastest order
    bbox_deltas = bbox_deltas.transpose((0, 2, 3, 1)).reshape((-1, 4))

    # Same story for the scores:
    #
    # scores are (1, A, H, W) format
    # transpose to (1, H, W, A)
    # reshape to (1 * H * W * A, 1) where rows are ordered by (h, w, a)
    scores = scores.transpose((0, 2, 3, 1)).reshape((-1, 1))

    # Convert anchors into proposals via the predicted transformations
    proposals = bbox_transform_inv(anchors, bbox_deltas)

    # 2. clip predicted boxes to image
    proposals, keep = clip_boxes(proposals, im_info[:2])
    # Record the corresponding index before and after clip
    # This step doesn't need unmap
    # We need it to decide whether do back propagation
    self._proposal_index_before_clip = keep

    # 3. remove predicted boxes with either height or width < threshold
    # (NOTE: convert min_size to input image scale stored in im_info[2])
    keep = filter_small_boxes(proposals, min_size * im_info[2])
    proposals = proposals[keep, :]
    scores = scores[keep]
    self._ind_after_filter = keep

    # 4. sort all (proposal, score) pairs by score from highest to lowest
    # 5. take top pre_nms_topN (e.g. 6000)
    order = scores.ravel().argsort()[::-1]
    if pre_nms_topN > 0:
        order = order[:pre_nms_topN]
    proposals = proposals[order, :]
    scores = scores[order]
    self._ind_after_sort = order

    # 6. apply nms (e.g. threshold = 0.7)
    # 7. take after_nms_topN (e.g. 300)
    # 8. return the top proposals (-> RoIs top)
    keep = nms(np.hstack((proposals, scores)), nms_thresh)
    if post_nms_topN > 0:
        keep = keep[:post_nms_topN]
    proposals = proposals[keep, :]
    scores = scores[keep]

    # Output rois blob
    # Our RPN implementation only supports a single input image, so all
    # batch inds are 0
    batch_inds = np.zeros((proposals.shape[0], 1), dtype=np.float32)
    proposals = np.hstack(
        (batch_inds, proposals.astype(np.float32, copy=False)))
    self._proposal_index = keep

    blobs = {'rois': proposals}

    if cfg_key == 'TRAIN':
        if cfg.TRAIN.MIX_INDEX:
            # Map the final proposals back through sort and size filter.
            all_rois_index = self._ind_after_filter[self._ind_after_sort[
                self._proposal_index]].reshape(1, len(keep))
            blobs['proposal_index'] = all_rois_index

    # Copy data to forward to top layer. ``items()`` exists on both
    # Python 2 and 3, unlike ``iteritems()``.
    for blob_name, blob in blobs.items():
        top[self._top_name_map[blob_name]].reshape(*blob.shape)
        top[self._top_name_map[blob_name]].data[...] = blob.astype(
            np.float32, copy=False)
def forward(self, input):
    """Produce a fixed-size, zero-padded tensor of RPN proposals per image.

    Steps: build shifted anchors for every feature-map cell, apply the
    predicted deltas, clip to the image, sort by foreground score, keep the
    top ``RPN_PRE_NMS_TOP_N``, run NMS and keep the top
    ``RPN_POST_NMS_TOP_N``.

    Args:
        input: Sequence of (class scores, bbox deltas, image info, cfg key).
            The second half of the score channels (from
            ``self._num_anchors`` on) is used as the foreground scores; the
            cfg key selects the section of ``cfg`` to read.

    Returns:
        Tensor of shape (batch_size, post_nms_topN, 5). Column 0 holds the
        batch index, columns 1: the proposal box; unused rows stay zero.
    """
    cls_prob, bbox_deltas, im_info, cfg_key = (
        input[0], input[1], input[2], input[3])

    # the first set of _num_anchors channels are bg probs
    # the second set are the fg probs
    fg_scores = cls_prob[:, self._num_anchors:, :, :]

    section = cfg[cfg_key]
    pre_nms_topN = section.RPN_PRE_NMS_TOP_N
    post_nms_topN = section.RPN_POST_NMS_TOP_N
    nms_thresh = section.RPN_NMS_THRESH
    min_size = section.RPN_MIN_SIZE  # read for parity; size filter is off

    batch_size = bbox_deltas.size(0)
    feat_height = fg_scores.size(2)
    feat_width = fg_scores.size(3)

    # One (x, y, x, y) shift per feature-map cell.
    xs = np.arange(0, feat_width) * self._feat_stride
    ys = np.arange(0, feat_height) * self._feat_stride
    grid_x, grid_y = np.meshgrid(xs, ys)
    shift_rows = np.vstack((grid_x.ravel(), grid_y.ravel(),
                            grid_x.ravel(), grid_y.ravel())).transpose()
    shifts = torch.from_numpy(shift_rows)
    shifts = shifts.contiguous().type_as(fg_scores).float()

    A = self._num_anchors
    K = shifts.size(0)

    # (1, A, 4) + (K, 1, 4) -> (K, A, 4), then shared across the batch.
    self._anchors = self._anchors.type_as(fg_scores)
    anchors = self._anchors.view(1, A, 4) + shifts.view(K, 1, 4)
    anchors = anchors.view(1, K * A, 4).expand(batch_size, K * A, 4)

    # Bring deltas and scores into the same (h, w, a) order as the anchors.
    bbox_deltas = bbox_deltas.permute(0, 2, 3, 1).contiguous()
    bbox_deltas = bbox_deltas.view(batch_size, -1, 4)
    flat_scores = fg_scores.permute(0, 2, 3, 1).contiguous()
    flat_scores = flat_scores.view(batch_size, -1)

    # Convert anchors into proposals via bbox transformations
    proposals = bbox_transform_inv(anchors, bbox_deltas, batch_size)

    # 2. clip predicted boxes to image
    proposals = clip_boxes(proposals, im_info, batch_size)

    # Sort every image's scores, highest first.
    _, order = torch.sort(flat_scores, 1, True)

    output = flat_scores.new(batch_size, post_nms_topN, 5).zero_()
    limit_pre_nms = 0 < pre_nms_topN < flat_scores.numel()

    for b in range(batch_size):
        # 4./5. take the top pre_nms_topN (e.g. 6000) by score
        ranked = order[b]
        if limit_pre_nms:
            ranked = ranked[:pre_nms_topN]
        boxes_b = proposals[b][ranked, :]
        scores_b = flat_scores[b][ranked].view(-1, 1)

        # 6. apply nms (e.g. threshold = 0.7)
        # 7. take after_nms_topN (e.g. 300)
        # 8. return the top proposals (-> RoIs top)
        dets_b = torch.cat((boxes_b, scores_b), 1)
        kept = nms(dets_b, nms_thresh, force_cpu=not cfg.USE_GPU_NMS)
        kept = kept.long().view(-1)
        if post_nms_topN > 0:
            kept = kept[:post_nms_topN]
        boxes_b = boxes_b[kept, :]
        scores_b = scores_b[kept, :]

        # padding 0 at the end.
        count = boxes_b.size(0)
        output[b, :, 0] = b
        output[b, :count, 1:] = boxes_b

    return output
def forward(self, input):
    """Produce zero-padded proposals from a precomputed, fixed anchor set.

    The anchors come from ``self.all_anchors`` (moved to CUDA
    unconditionally) instead of being generated from the feature-map size.
    The predicted deltas are applied to them, the boxes are clipped, sorted
    by score, cut to ``RPN_PRE_NMS_TOP_N``, passed through NMS and cut to
    ``RPN_POST_NMS_TOP_N``.

    Args:
        input: Sequence of four items. The shape examples below are the
            original author's and are presumably dataset-specific - confirm
            against the caller:
            input[0]: class scores, e.g. (batch_size, channels, H, W)
                = (1, 24, 19, 20)
            input[1]: bbox deltas, e.g. (1, 12*4, 19, 20)
            input[2]: image info, e.g. [[240, 320]]
            input[3]: "TEST" or "TRAIN", the key into ``cfg``

    Returns:
        Tensor of shape (batch_size, post_nms_topN, 5). Column 0 holds the
        batch index, columns 1: the proposal box; unused rows stay zero.
    """
    all_anchors = self.all_anchors.cuda()
    # Keep the score channels from ``self._num_anchors_type`` onwards
    # (class score, binary, for each feature map pixel per original note).
    scores = input[0][:, self._num_anchors_type:, :, :]
    # Bbox deltas for each feature map pixel.
    bbox_deltas = input[1]
    # Image shape, e.g. [[240, 320]] for jhmdb per the original note.
    # TODO1 (original): change this to [240, 320]
    im_info = input[2]
    cfg_key = input[3]  # TRAIN or TEST
    im_info = np.array(im_info)

    # Example values in the trailing comments are the original author's.
    pre_nms_topN = cfg[cfg_key].RPN_PRE_NMS_TOP_N  # train: 12000, test: 6000
    post_nms_topN = cfg[cfg_key].RPN_POST_NMS_TOP_N  # train: 2000, test: 300
    nms_thresh = cfg[cfg_key].RPN_NMS_THRESH  # train: 0.7, test: 0.7
    # NOTE(review): min_size is read but not used anywhere in this method.
    min_size = cfg[cfg_key].RPN_MIN_SIZE  # train: 8, test: 16

    batch_size = bbox_deltas.size(0)  # mostly 1

    # The anchors are obtained from the dataset, so they are used directly
    # and only expanded to (batch_size, all_num_anchors, 4).
    # TODO (original): this is different from the origin implementation.
    all_anchors = all_anchors.contiguous()
    all_anchors = all_anchors.view(1, self.all_num_anchors, 4).expand(batch_size, self.all_num_anchors, 4)

    # Transpose and reshape predicted bbox transformations to get them
    # into the same order as the anchors: channels last, then rows of 4.
    bbox_deltas = bbox_deltas.permute(0, 2, 3, 1).contiguous()
    bbox_deltas = bbox_deltas.view(batch_size, -1, 4)

    # Same story for the scores. contiguous() is required before view()
    # because permute() only changes strides (the bare string below is the
    # original author's illustration of that and has no effect at runtime).
    scores = scores.permute(0, 2, 3, 1).contiguous()
    ''' x = torch.randn(5, 4) print(x.stride(), x.is_contiguous()) print(x.t().stride(), x.t().is_contiguous()) x.view(4, 5) # ok x.t().view(4, 5) # fails '''
    scores = scores.view(batch_size, -1)

    # Convert anchors into proposals via bbox transformations.
    # Original note: all_anchors.shape = 1x3600x4, bbox_deltas.shape=1x3600x4
    proposals = bbox_transform_inv2(all_anchors, bbox_deltas, batch_size)

    # 2. clip predicted boxes to image.
    # TODO (original): this line is considered useless by the author since
    # the input anchors are already fixed to the image size.
    proposals = clip_boxes(proposals, im_info, batch_size)

    scores_keep = scores  # (batch_size, num_boxes) after the view above
    proposals_keep = proposals
    # Sort each image's scores in descending order.
    _, order = torch.sort(scores_keep, 1, True)

    output = scores.new(batch_size, post_nms_topN, 5).zero_()
    for i in range(batch_size):
        # 3. (size filtering is not performed here)
        # Proposals and scores of image i.
        proposals_single = proposals_keep[i]
        scores_single = scores_keep[i]

        # 4. sort all (proposal, score) pairs by score from highest to lowest
        # 5. take top pre_nms_topN (e.g. 6000)
        order_single = order[i]

        # NOTE(review): numel() counts scores over the whole batch, not per
        # image; slicing past the end is harmless, so this only skips a
        # no-op slice.
        if pre_nms_topN > 0 and pre_nms_topN < scores_keep.numel():
            order_single = order_single[:pre_nms_topN]

        proposals_single = proposals_single[order_single, :]
        scores_single = scores_single[order_single].view(-1, 1)

        # 6. apply nms (e.g. threshold = 0.7)
        # 7. take after_nms_topN (e.g. 300)
        # 8. return the top proposals (-> RoIs top)
        keep_idx_i = nms(torch.cat((proposals_single, scores_single), 1), nms_thresh, force_cpu=not cfg.USE_GPU_NMS)
        keep_idx_i = keep_idx_i.long().view(-1)

        if post_nms_topN > 0:
            keep_idx_i = keep_idx_i[:post_nms_topN]
        proposals_single = proposals_single[keep_idx_i, :]
        scores_single = scores_single[keep_idx_i, :]

        # padding 0 at the end.
        num_proposal = proposals_single.size(0)
        output[i, :, 0] = i
        output[i, :num_proposal, 1:] = proposals_single

    return output
def cpu_mask_voting(masks, boxes, scores, num_classes, max_per_image,
                    im_width, im_height):
    """Merge overlapping instance masks by score-weighted voting (CPU path).

    Per foreground class the boxes are thinned with NMS, the survivors are
    cut to the ``max_per_image`` best scores over all classes, and every
    surviving box gets a new mask/box built by ``mask_aggregation`` from all
    input boxes whose overlap with it is at least
    ``cfg.TEST.MASK_MERGE_IOU_THRESH``, weighted by their class score.

    Args:
        masks: per-box masks; read as ``masks[i, 0]``, so one leading
            channel axis is expected (presumably n x 1 x mask_sz x mask_sz
            -- confirm against the caller).
        boxes: n x 4 array of boxes. Columns 0/2 are used as the horizontal
            extent and 1/3 as the vertical extent when resizing masks.
        scores: n x num_classes array; column 0 (background) is dropped.
        num_classes: number of classes including background.
        max_per_image: maximum number of detections kept over all classes.
        im_width: image width, forwarded to ``mask_aggregation``.
        im_height: image height, forwarded to ``mask_aggregation``.

    Returns:
        ``(result_box, result_mask)``: two lists with one entry per
        foreground class. ``result_box[c]`` is k x 5 (aggregated box plus
        score) and ``result_mask[c]`` is
        k x 1 x cfg.MASK_SIZE x cfg.MASK_SIZE. When nothing survives NMS
        every entry has k == 0.
    """
    # Drop the background column; class index c below is 0-based over the
    # foreground classes only.
    scores = scores[:, 1:]
    num_detect = boxes.shape[0]

    # Loop-invariant casts. np.float64 is what the removed ``np.float``
    # alias resolved to.
    boxes_f32 = boxes.astype(np.float32)
    boxes_f64 = boxes.astype(np.float64)

    # Resize each mask to the pixel size of its (rounded) box.
    res_mask = []
    for i in range(num_detect):
        box = np.round(boxes[i]).astype(int)
        res_mask.append(cv2.resize(masks[i, 0].astype(np.float32),
                                   (box[2] - box[0] + 1,
                                    box[3] - box[1] + 1)))

    # Per-class NMS survivors, sorted by descending score and truncated.
    sup_boxes = []
    sup_scores = []
    tobesort_scores = []
    for c in range(num_classes - 1):
        dets = np.hstack((boxes_f32, scores[:, c:c + 1]))
        inds = nms(dets, cfg.TEST.MASK_MERGE_NMS_THRESH)
        ind_boxes = boxes[inds]
        ind_scores = scores[inds, c]
        order = ind_scores.ravel().argsort()[::-1]
        order = order[0:min(len(order), max_per_image)]
        sup_boxes.append(ind_boxes[order])
        sup_scores.append(ind_scores[order])
        tobesort_scores.extend(ind_scores[order])

    # Score of the max_per_image-th best detection over all classes.
    sorted_scores = np.sort(tobesort_scores)[::-1]
    if len(sorted_scores) == 0:
        # Nothing survived: any threshold works because every per-class
        # score array is empty. Avoids indexing an empty array.
        thresh = np.inf
    else:
        num_keep = min(len(sorted_scores), max_per_image)
        thresh = sorted_scores[num_keep - 1]

    result_box = []
    result_mask = []
    for c in range(num_classes - 1):
        cls_box = sup_boxes[c]
        cls_score = sup_scores[c]
        keep = np.where(cls_score >= thresh)[0]
        new_sup_boxes = cls_box[keep]
        num_sup_box = len(new_sup_boxes)
        masks_ar = np.zeros((num_sup_box, 1, cfg.MASK_SIZE, cfg.MASK_SIZE))
        boxes_ar = np.zeros((num_sup_box, 4))
        for k in range(num_sup_box):
            # Voters: every input box overlapping this survivor enough.
            cur_ov = bbox_overlaps(
                boxes_f64, new_sup_boxes[k, np.newaxis].astype(np.float64))
            cur_inds = np.where(cur_ov >= cfg.TEST.MASK_MERGE_IOU_THRESH)[0]
            # Weights are the voters' class scores, normalised to sum to 1.
            cur_weights = scores[cur_inds, c]
            cur_weights = cur_weights / sum(cur_weights)
            # mask_aggregation takes the box-sized masks as a plain list.
            pass_mask = [res_mask[j] for j in cur_inds]
            tmp_mask, boxes_ar[k] = mask_aggregation(
                boxes[cur_inds], pass_mask, cur_weights, im_width, im_height)
            tmp_mask = cv2.resize(tmp_mask.astype(np.float32),
                                  (cfg.MASK_SIZE, cfg.MASK_SIZE))
            masks_ar[k, 0] = tmp_mask
        # Append the score as the last column of each aggregated box.
        boxes_scored_ar = np.hstack((boxes_ar, cls_score[keep, np.newaxis]))
        result_box.append(boxes_scored_ar)
        result_mask.append(masks_ar)

    return result_box, result_mask
im2show = np.copy(im) for j in xrange(1, imdb.num_classes): inds = torch.nonzero(scores[:, j] > thresh).view(-1) # if there is det if inds.numel() > 0: cls_scores = scores[:, j][inds] _, order = torch.sort(cls_scores, 0, True) if args.class_agnostic: cls_boxes = pred_boxes[inds, :] else: cls_boxes = pred_boxes[inds][:, j * 4:(j + 1) * 4] cls_dets = torch.cat((cls_boxes, cls_scores.unsqueeze(1)), 1) # cls_dets = torch.cat((cls_boxes, cls_scores), 1) cls_dets = cls_dets[order] keep = nms(cls_dets, cfg.TEST.NMS) cls_dets = cls_dets[keep.view(-1).long()] if vis: im2show = vis_detections(im2show, imdb.classes[j], cls_dets.cpu().numpy(), 0.3) all_boxes[j][i] = cls_dets.cpu().numpy() else: all_boxes[j][i] = empty_array # Limit to max_per_image detections *over all classes* if max_per_image > 0: image_scores = np.hstack( [all_boxes[j][i][:, -1] for j in xrange(1, imdb.num_classes)]) if len(image_scores) > max_per_image: image_thresh = np.sort(image_scores)[-max_per_image] for j in xrange(1, imdb.num_classes):
def gpu_mask_voting(masks, boxes, scores, num_classes, max_per_image,
                    im_width, im_height, cfg):
    """Merge overlapping instance masks by score-weighted voting (GPU path).

    The candidate bookkeeping (which input boxes vote for which surviving
    box, and with what weight) is built on the host; the aggregation itself
    is delegated to ``mv``.

    Args:
        masks: n x h x w array of masks (three axes are read when the
            channel axis is inserted before calling ``mv``).
        boxes: n x 4 array of boxes.
        scores: n x num_classes array; column 0 is treated as background
            and skipped.
        num_classes: number of classes including background.
        max_per_image: maximum number of detections kept over all classes.
        im_width: image width, forwarded to ``mv``.
        im_height: image height, forwarded to ``mv``.
        cfg: configuration object; ``TEST_DEFAULT_MASK_MERGE_IOU_THRESH``
            is read from it and it is also passed to ``nms``.

    Returns:
        ``(list_result_mask, list_result_box)``: two lists with one entry
        per foreground class; each box row carries its score as the last
        column.
    """
    # np.float64 is what the removed ``np.float`` alias resolved to.
    boxes_f32 = boxes.astype(np.float32)
    boxes_f64 = boxes.astype(np.float64)

    # Index 0 (background) holds placeholders so that list indices match
    # class ids.
    sup_boxes = [[]]
    sup_scores = [[]]
    tobesort_scores = []
    for c in range(1, num_classes):
        dets = np.hstack((boxes_f32, scores[:, c:c + 1].astype(np.float32)))
        # NOTE(review): the IoU merge threshold is used as the NMS threshold
        # here, while the CPU variant reads a dedicated *_NMS_THRESH key --
        # confirm this is intentional.
        inds = nms(dets, cfg.TEST_DEFAULT_MASK_MERGE_IOU_THRESH, cfg)
        ind_boxes = boxes[inds]
        ind_scores = scores[inds, c]
        num_keep = min(len(ind_scores), max_per_image)
        sup_boxes.append(ind_boxes[0:num_keep, :])
        sup_scores.append(ind_scores[0:num_keep])
        tobesort_scores.extend(ind_scores[0:num_keep])

    # Score of the max_per_image-th best detection over all classes.
    # NOTE(review): raises IndexError when no detection survives NMS; left
    # as is because the behaviour of ``mv`` on empty input is not visible
    # here.
    sorted_scores = np.sort(tobesort_scores)[::-1]
    num_keep = min(len(sorted_scores), max_per_image)
    thresh = sorted_scores[num_keep - 1]

    # Flat index array recording which masks are aggregated together.
    candidate_inds = []
    # Weight of each element of candidate_inds.
    candidate_weights = []
    # Running length of candidate_inds after each surviving box.
    candidate_start = []
    candidate_scores = []
    # Running length of candidate_scores after each class.
    class_bar = []
    for c in range(1, num_classes):
        cls_box = sup_boxes[c]
        cls_score = sup_scores[c]
        keep = np.where(cls_score >= thresh)[0]
        new_sup_boxes = cls_box[keep]
        for k in range(len(new_sup_boxes)):
            cur_ov = bbox_overlaps(
                boxes_f64, new_sup_boxes[k, np.newaxis].astype(np.float64))
            cur_inds = np.where(
                cur_ov >= cfg.TEST_DEFAULT_MASK_MERGE_IOU_THRESH)[0]
            candidate_inds.extend(cur_inds)
            # Weights are the voters' class scores, normalised to sum to 1.
            cur_weights = scores[cur_inds, c]
            cur_weights = cur_weights / sum(cur_weights)
            candidate_weights.extend(cur_weights)
            candidate_start.append(len(candidate_inds))
        candidate_scores.extend(cls_score[keep])
        class_bar.append(len(candidate_scores))

    candidate_inds = np.array(candidate_inds, dtype=np.int32)
    candidate_weights = np.array(candidate_weights, dtype=np.float32)
    candidate_start = np.array(candidate_start, dtype=np.int32)
    candidate_scores = np.array(candidate_scores, dtype=np.float32)

    # Insert a singleton channel axis: (n, h, w) -> (n, 1, h, w).
    masks = np.reshape(
        masks, (masks.shape[0], 1, masks.shape[1], masks.shape[2]))
    result_mask, result_box = mv(boxes_f32, masks.astype(np.float32),
                                 candidate_inds, candidate_start,
                                 candidate_weights, im_height, im_width)
    result_box = np.hstack((result_box, candidate_scores[:, np.newaxis]))

    # Split the flat results back into one chunk per foreground class.
    list_result_box = []
    list_result_mask = []
    for i in range(num_classes - 1):
        cls_start = class_bar[i - 1] if i > 0 else 0
        cls_end = class_bar[i]
        list_result_box.append(result_box[cls_start:cls_end, :])
        list_result_mask.append(result_mask[cls_start:cls_end, :, :, :])

    return list_result_mask, list_result_box
im2show = np.copy(im) for j in xrange(1, len(pascal_classes)): inds = torch.nonzero(scores[:, j] > thresh).view(-1) # if there is det if inds.numel() > 0: cls_scores = scores[:, j][inds] _, order = torch.sort(cls_scores, 0, True) if args.class_agnostic: cls_boxes = pred_boxes[inds, :] else: cls_boxes = pred_boxes[inds][:, j * 4:(j + 1) * 4] cls_dets = torch.cat((cls_boxes, cls_scores.unsqueeze(1)), 1) # cls_dets = torch.cat((cls_boxes, cls_scores), 1) cls_dets = cls_dets[order] keep = nms(cls_dets, args.TEST_NMS) cls_dets = cls_dets[keep.view(-1).long()] if vis: im2show = vis_detections(im2show, pascal_classes[j], cls_dets.cpu().numpy(), 0.5) misc_toc = time.time() nms_time = misc_toc - misc_tic if webcam_num == -1: sys.stdout.write('im_detect: {:d}/{:d} {:.3f}s {:.3f}s \r' \ .format(num_images + 1, len(imglist), detect_time, nms_time)) sys.stdout.flush() if vis and webcam_num == -1: # cv2.imshow('test', im2show) # cv2.waitKey(0)
def refine_detections(rois, probs, deltas, window, config):
    """Refine classified proposals, filter overlaps and return detections.

    Inputs:
        rois: [N, (y1, x1, y2, x2)] in normalized coordinates
        probs: [N, num_classes]. Class probabilities.
        deltas: [N, num_classes, (dy, dx, log(dh), log(dw))]. Class-specific
                bounding box deltas.
        window: (y1, x1, y2, x2) in image coordinates. The part of the image
            that contains the image excluding the padding.
        config: configuration object; GPU_COUNT, USE_NMS,
            DETECTION_MIN_CONFIDENCE, DETECTION_NMS_THRESHOLD and
            DETECTION_MAX_INSTANCES are read from it.

    Returns a tuple ``(result, keep)``:
        result: [M, (y1, x1, y2, x2, class_id, score)] sorted by descending
            score.
        keep: indices into the N input ROIs of the rows in ``result``.
    When nothing is kept, ``([], [])`` is returned instead.
    """
    # Class IDs per ROI (argmax over the class axis)
    _, class_ids = torch.max(probs, dim=1)

    # Class probability of the top class of each ROI
    # Class-specific bounding box deltas
    idx = torch.arange(class_ids.size()[0]).long()
    if config.GPU_COUNT:
        idx = idx.cuda()
    class_scores = probs[idx, class_ids.data]
    deltas_specific = deltas[idx, class_ids.data]

    refined_rois = coordinate_convert(rois, deltas_specific, config, config.GPU_COUNT)

    # Clip boxes to image window
    refined_rois = clip_to_window(window, refined_rois)

    # Round since we're dealing with pixels now
    refined_rois = torch.round(refined_rois)

    # TODO: Filter out boxes with zero area

    # Filter out background boxes
    keep_bool = class_ids > 0

    if config.USE_NMS:
        # Filter out low confidence boxes
        if config.DETECTION_MIN_CONFIDENCE:
            keep_bool = keep_bool & (class_scores >= config.DETECTION_MIN_CONFIDENCE)
        # No foreground ROI left at all
        if max(keep_bool) == 0:
            return [], []
        keep = torch.nonzero(keep_bool)[:, 0]

        # Apply per-class NMS
        pre_nms_class_ids = class_ids[keep.data]
        pre_nms_scores = class_scores[keep.data]
        pre_nms_rois = refined_rois[keep.data]

        for i, class_id in enumerate(unique1d(pre_nms_class_ids)):
            # Pick detections of this class
            ixs = torch.nonzero(pre_nms_class_ids == class_id)[:, 0]

            # Sort by descending score before handing the boxes to nms
            ix_rois = pre_nms_rois[ixs.data]
            ix_scores = pre_nms_scores[ixs]
            ix_scores, order = ix_scores.sort(descending=True)
            ix_rois = ix_rois[order.data, :]

            class_keep = nms(
                torch.cat((ix_rois, ix_scores.unsqueeze(1)), dim=1).data,
                config.DETECTION_NMS_THRESHOLD)

            # Map indices: nms output -> sorted order -> class subset ->
            # positions in the original N ROIs
            class_keep = keep[ixs[order[class_keep].data].data]

            if i == 0:
                nms_keep = class_keep
            else:
                nms_keep = unique1d(torch.cat((nms_keep, class_keep)))
        keep = intersect1d(keep, nms_keep)
    else:
        keep = torch.nonzero(keep_bool).view((-1))
        # Without NMS, cap the foreground ROIs at the 100 best scores
        if len(keep) > 100:
            ix_scores, order = class_scores[keep.data].sort(descending=True)
            keep = keep[order[:100]]
        # else:
        #     ix_scores, order = class_scores[~keep_bool].sort(descending=False)
        #     keep2 = torch.nonzero(~keep_bool).view((-1))[order[:(1000-len(keep))]]
        #     keep = torch.cat((keep,keep2),0)
        ix_scores, order = class_scores[keep.data].sort(descending=True)
        keep = keep[order]

    # Keep top detections
    # NOTE(review): roi_count is assigned but never used; the slice below is
    # [:] so no DETECTION_MAX_INSTANCES limit is applied -- confirm intent.
    roi_count = config.DETECTION_MAX_INSTANCES
    if len(keep.data) > 0:
        top_ids = class_scores[keep.data].sort(descending=True)[1][:]
        keep = keep[top_ids.data]

        # Arrange output as [N, (y1, x1, y2, x2, class_id, score)]
        # Coordinates are in image domain.
        result = torch.cat((refined_rois[keep.data],
                            class_ids[keep.data].unsqueeze(1).float(),
                            class_scores[keep.data].unsqueeze(1)), dim=1)
    else:
        return [], []

    return result, keep
# Reorder the anchors to match the score ordering, then apply the deltas.
anchors = anchors[order.data, :]
boxes = apply_box_deltas(anchors, deltas)

# Clip to image boundaries. [y1, x1, y2, x2]
# Original image size, e.g. [1024, 1024, 3]; this is planned to change.
height, width = config.IMAGE_SHAPE[:2]
window = np.array([0, 0, height, width]).astype(np.float32)
# Reshape bounding boxes that stick out of the image.
boxes = clip_boxes(boxes, window)

# Filter out small boxes
# According to Xinlei Chen's paper, this reduces detection accuracy

# Non-max suppression: drops boxes whose overlap exceeds nms_threshold.
# keep is used below as an index into boxes.
keep = nms(torch.cat((boxes, scores.unsqueeze(1)), 1).data, nms_threshold)
keep = keep[:proposal_count]
boxes = boxes[keep, :]

# Normalize dimensions to range of 0 to 1 (convert to relative values).
# The second element was misspelled ``widht``, which raised NameError.
norm = Variable(
    torch.from_numpy(np.array([height, width, height, width])).float(),
    requires_grad=False)
if config.GPU_COUNT:
    norm = norm.cuda()
normalized_boxes = boxes / norm

# Add the batch dimension back.
# NOTE(review): the result is bound to ``normalize_boxed`` rather than
# ``normalized_boxes``; the name is kept because the code that consumes it
# is not visible here -- confirm which name the caller/return expects.
normalize_boxed = normalized_boxes.unsqueeze(0)
def cpu_mask_voting(masks, boxes, scores, num_classes, max_per_image,
                    im_width, im_height, cfg):
    """Merge overlapping instance masks by score-weighted voting (CPU path).

    Per foreground class the boxes are thinned with NMS, the survivors are
    cut to the ``max_per_image`` best scores over all classes, and every
    surviving box gets a new mask/box built by ``mask_aggregation`` from all
    input boxes whose overlap with it is at least
    ``cfg.TEST_DEFAULT_MASK_MERGE_IOU_THRESH``, weighted by their class
    score.

    Args:
        masks: n x mask_sz x mask_sz array of masks.
        boxes: n x 4 array of boxes. Columns 0/2 are used as the horizontal
            extent and 1/3 as the vertical extent when resizing masks.
        scores: n x num_classes array; column 0 (background) is dropped.
        num_classes: number of classes including background.
        max_per_image: maximum number of detections kept over all classes.
        im_width: image width, forwarded to ``mask_aggregation``.
        im_height: image height, forwarded to ``mask_aggregation``.
        cfg: configuration object; the ``TEST_DEFAULT_MASK_MERGE_*`` and
            ``MAIN_DEFAULT_MASK_SIZE`` keys are read from it and it is
            passed on to ``nms`` and ``mask_aggregation``.

    Returns:
        ``(result_mask, result_box)``: two lists with one entry per
        foreground class. ``result_mask[c]`` is k x mask_size x mask_size
        and ``result_box[c]`` is k x 5 (aggregated box plus score). When
        nothing survives NMS every entry has k == 0.
    """
    # Drop the background column; class index c below is 0-based over the
    # foreground classes only.
    scores = scores[:, 1:]
    num_detect = boxes.shape[0]
    mask_size = cfg.MAIN_DEFAULT_MASK_SIZE

    # Loop-invariant casts. np.float64 is what the removed ``np.float``
    # alias resolved to.
    boxes_f32 = boxes.astype(np.float32)
    boxes_f64 = boxes.astype(np.float64)

    # Resize each mask to the pixel size of its (rounded) box.
    res_mask = []
    for i in range(num_detect):
        box = np.round(boxes[i]).astype(int)
        res_mask.append(cv2.resize(masks[i].astype(np.float32),
                                   (box[2] - box[0] + 1,
                                    box[3] - box[1] + 1)))

    # Per-class NMS survivors, sorted by descending score and truncated.
    sup_boxes = []
    sup_scores = []
    tobesort_scores = []
    for c in range(num_classes - 1):
        dets = np.hstack((boxes_f32, scores[:, c:c + 1]))
        inds = nms(dets, cfg.TEST_DEFAULT_MASK_MERGE_NMS_THRESH, cfg)
        ind_boxes = boxes[inds]
        ind_scores = scores[inds, c]
        order = ind_scores.ravel().argsort()[::-1]
        order = order[0:min(len(order), max_per_image)]
        sup_boxes.append(ind_boxes[order])
        sup_scores.append(ind_scores[order])
        tobesort_scores.extend(ind_scores[order])

    # Score of the max_per_image-th best detection over all classes.
    sorted_scores = np.sort(tobesort_scores)[::-1]
    if len(sorted_scores) == 0:
        # Nothing survived: any threshold works because every per-class
        # score array is empty. Avoids indexing an empty array.
        thresh = np.inf
    else:
        num_keep = min(len(sorted_scores), max_per_image)
        thresh = sorted_scores[num_keep - 1]

    result_box = []
    result_mask = []
    for c in range(num_classes - 1):
        cls_box = sup_boxes[c]
        cls_score = sup_scores[c]
        keep = np.where(cls_score >= thresh)[0]
        new_sup_boxes = cls_box[keep]
        num_sup_box = len(new_sup_boxes)
        masks_ar = np.zeros((num_sup_box, mask_size, mask_size))
        boxes_ar = np.zeros((num_sup_box, 4))
        for k in range(num_sup_box):
            # Voters: every input box overlapping this survivor enough.
            cur_ov = bbox_overlaps(
                boxes_f64, new_sup_boxes[k, np.newaxis].astype(np.float64))
            cur_inds = np.where(
                cur_ov >= cfg.TEST_DEFAULT_MASK_MERGE_IOU_THRESH)[0]
            # Weights are the voters' class scores, normalised to sum to 1.
            cur_weights = scores[cur_inds, c]
            cur_weights = cur_weights / sum(cur_weights)
            # mask_aggregation takes the box-sized masks as a plain list.
            pass_mask = [res_mask[j] for j in cur_inds]
            tmp_mask, boxes_ar[k] = mask_aggregation(
                boxes[cur_inds], pass_mask, cur_weights,
                im_width, im_height, cfg)
            # Bring the aggregated mask back to the fixed mask size.
            tmp_mask = cv2.resize(tmp_mask.astype(np.float32),
                                  (mask_size, mask_size))
            masks_ar[k] = tmp_mask
        # Append the score as the last column of each aggregated box.
        boxes_scored_ar = np.hstack((boxes_ar, cls_score[keep, np.newaxis]))
        result_box.append(boxes_scored_ar)
        result_mask.append(masks_ar)

    return result_mask, result_box
def gpu_mask_voting(masks, boxes, scores, num_classes, max_per_image,
                    im_width, im_height):
    """Merge overlapping instance masks by score-weighted voting (GPU path).

    The candidate bookkeeping (which input boxes vote for which surviving
    box, and with what weight) is built on the host; the aggregation itself
    is delegated to ``mv``.

    Args:
        masks: array of masks, passed to ``mv`` unchanged (presumably
            n x 1 x mask_sz x mask_sz, since the result is sliced with four
            axes -- confirm against the caller).
        boxes: n x 4 array of boxes.
        scores: n x num_classes array; column 0 is treated as background
            and skipped.
        num_classes: number of classes including background.
        max_per_image: maximum number of detections kept over all classes.
        im_width: image width, forwarded to ``mv``.
        im_height: image height, forwarded to ``mv``.

    Returns:
        ``(list_result_mask, list_result_box)``: two lists with one entry
        per foreground class; each box row carries its score as the last
        column.
    """
    # np.float64 is what the removed ``np.float`` alias resolved to.
    boxes_f32 = boxes.astype(np.float32)
    boxes_f64 = boxes.astype(np.float64)

    # Index 0 (background) holds placeholders so that list indices match
    # class ids.
    sup_boxes = [[]]
    sup_scores = [[]]
    tobesort_scores = []
    for c in range(1, num_classes):
        dets = np.hstack((boxes_f32, scores[:, c:c + 1]))
        inds = nms(dets, cfg.TEST.MASK_MERGE_NMS_THRESH)
        ind_boxes = boxes[inds]
        ind_scores = scores[inds, c]
        num_keep = min(len(ind_scores), max_per_image)
        sup_boxes.append(ind_boxes[0:num_keep, :])
        sup_scores.append(ind_scores[0:num_keep])
        tobesort_scores.extend(ind_scores[0:num_keep])

    # Score of the max_per_image-th best detection over all classes.
    # NOTE(review): raises IndexError when no detection survives NMS; left
    # as is because the behaviour of ``mv`` on empty input is not visible
    # here.
    sorted_scores = np.sort(tobesort_scores)[::-1]
    num_keep = min(len(sorted_scores), max_per_image)
    thresh = sorted_scores[num_keep - 1]

    # Flat index array recording which masks are aggregated together.
    candidate_inds = []
    # Weight of each element of candidate_inds.
    candidate_weights = []
    # Running length of candidate_inds after each surviving box.
    candidate_start = []
    candidate_scores = []
    # Running length of candidate_scores after each class.
    class_bar = []
    for c in range(1, num_classes):
        cls_box = sup_boxes[c]
        cls_score = sup_scores[c]
        keep = np.where(cls_score >= thresh)[0]
        new_sup_boxes = cls_box[keep]
        for k in range(len(new_sup_boxes)):
            cur_ov = bbox_overlaps(
                boxes_f64, new_sup_boxes[k, np.newaxis].astype(np.float64))
            cur_inds = np.where(cur_ov >= cfg.TEST.MASK_MERGE_IOU_THRESH)[0]
            candidate_inds.extend(cur_inds)
            # Weights are the voters' class scores, normalised to sum to 1.
            cur_weights = scores[cur_inds, c]
            cur_weights = cur_weights / sum(cur_weights)
            candidate_weights.extend(cur_weights)
            candidate_start.append(len(candidate_inds))
        candidate_scores.extend(cls_score[keep])
        class_bar.append(len(candidate_scores))

    candidate_inds = np.array(candidate_inds, dtype=np.int32)
    candidate_weights = np.array(candidate_weights, dtype=np.float32)
    candidate_start = np.array(candidate_start, dtype=np.int32)
    candidate_scores = np.array(candidate_scores, dtype=np.float32)

    result_mask, result_box = mv(boxes_f32, masks, candidate_inds,
                                 candidate_start, candidate_weights,
                                 im_height, im_width)
    result_box = np.hstack((result_box, candidate_scores[:, np.newaxis]))

    # Split the flat results back into one chunk per foreground class.
    list_result_box = []
    list_result_mask = []
    for i in range(num_classes - 1):
        cls_start = class_bar[i - 1] if i > 0 else 0
        cls_end = class_bar[i]
        list_result_box.append(result_box[cls_start:cls_end, :])
        list_result_mask.append(result_mask[cls_start:cls_end, :, :, :])

    return list_result_mask, list_result_box
def detect(net, im_path, thresh=0.05, visualize=False, timers=None, pyramid=False, visualization_folder=None):
    """
    Run the face detector on one image file.

    :param net: The trained network
    :param im_path: The path to the image
    :param thresh: Detections scoring at or below thresh are dropped
    :param visualize: Whether to visualize the detections
    :param timers: Dict with 'detect' and 'misc' timers (created when empty or None)
    :param pyramid: Whether to run inference on an image pyramid
    :param visualization_folder: Forwarded to the visualizer when visualize=True
    :return: cls_dets (bounding boxes concatenated with scores) and the timers
    """
    timers = timers or {'detect': Timer(), 'misc': Timer()}

    im = cv2.imread(im_path)
    imfname = os.path.basename(im_path)
    sys.stdout.flush()

    timers['detect'].tic()
    if pyramid:
        # Scaling coefficients of the pyramid levels
        base_scale = _compute_scaling_factor(im.shape,
                                             cfg.TEST.PYRAMID_BASE_SIZE[0],
                                             cfg.TEST.PYRAMID_BASE_SIZE[1])
        pyramid_scales = [float(s) / cfg.TEST.PYRAMID_BASE_SIZE[0] * base_scale
                          for s in cfg.TEST.SCALES]
        im_blobs = _get_image_blob(im, pyramid_scales)

        gathered_probs = []
        gathered_boxes = []
        last_level = len(pyramid_scales) - 1
        for level, level_scale in enumerate(pyramid_scales):
            level_probs, level_boxes = forward_net(net, im_blobs[level], level_scale, True)
            last_output = len(level_probs) - 1
            for out_idx, out_probs in enumerate(level_probs):
                # Do not apply M3 to the largest scale
                if level == last_level and out_idx == last_output:
                    continue
                gathered_boxes.append(level_boxes[out_idx][:, 0:4])
                gathered_probs.append(out_probs.copy())
        probs = np.concatenate(gathered_probs)
        boxes = np.concatenate(gathered_boxes)
    else:
        im_scale = _compute_scaling_factor(im.shape, cfg.TEST.SCALES[0], cfg.TEST.MAX_SIZE)
        im_blob = _get_image_blob(im, [im_scale])
        probs, boxes = forward_net(net, im_blob[0], im_scale, False)
        boxes = boxes[:, 0:4]
    timers['detect'].toc()

    timers['misc'].tic()
    # Score filter, then NMS on [box, score] rows
    above = np.where(probs[:, 0] > thresh)[0]
    kept_probs = probs[above, 0]
    kept_boxes = boxes[above, :]
    dets = np.hstack((kept_boxes, kept_probs[:, np.newaxis]))
    dets = dets.astype(np.float32, copy=False)
    cls_dets = dets[nms(dets, cfg.TEST.NMS_THRESH), :]

    if visualize:
        stem = os.path.splitext(imfname)[0]
        plt_name = stem + '_detections_{}'.format(net.name)
        visusalize_detections(im, cls_dets, plt_name=plt_name,
                              visualization_folder=visualization_folder)
    timers['misc'].toc()

    return cls_dets, timers
def forward(self, bottom, top):
    """Turn RPN scores and box deltas into proposals and write them to top.

    Steps:
      1. build A anchors for every (H, W) cell and apply the predicted
         deltas of that cell to them
      2. clip the predicted boxes to the image
      3. drop boxes whose height or width is below the minimum size
      4. sort the (proposal, score) pairs by descending score
      5. keep the top pre_nms_topN
      6. apply NMS
      7. keep the top post_nms_topN
      8. emit the survivors as the 'rois' blob

    The index arrays produced along the way are stored on ``self``
    (``_anchor_index_before_clip``, ``_proposal_index_before_clip``,
    ``_ind_after_filter``, ``_ind_after_sort``, ``_proposal_index``).
    """
    assert bottom[0].data.shape[0] == 1, 'Only single item batches are supported'

    cfg_key = str(self.phase)  # either 'TRAIN' or 'TEST'
    phase_cfg = cfg[cfg_key]
    pre_nms_topN = phase_cfg.RPN_PRE_NMS_TOP_N
    post_nms_topN = phase_cfg.RPN_POST_NMS_TOP_N
    nms_thresh = phase_cfg.RPN_NMS_THRESH
    min_size = phase_cfg.RPN_MIN_SIZE

    A = self._num_anchors
    # The first A channels are bg probs, the second A are the fg probs we
    # want.
    scores = bottom[0].data[:, A:, :, :]
    bbox_deltas = bottom[1].data
    im_info = bottom[2].data[0, :]

    # 1. Generate proposals from transform deltas and shifted anchors
    self._height, self._width = scores.shape[-2:]

    # One (x, y, x, y) shift per feature-map cell
    grid_x, grid_y = np.meshgrid(
        np.arange(0, self._width) * self._feat_stride,
        np.arange(0, self._height) * self._feat_stride)
    flat_x = grid_x.ravel()
    flat_y = grid_y.ravel()
    shifts = np.vstack((flat_x, flat_y, flat_x, flat_y)).transpose()

    # (1, A, 4) anchors + (K, 1, 4) shifts -> (K, A, 4) -> (K * A, 4)
    K = shifts.shape[0]
    base_anchors = self._anchors.reshape((1, A, 4))
    cell_shifts = shifts.reshape((1, K, 4)).transpose((1, 0, 2))
    anchors = (base_anchors + cell_shifts).reshape((K * A, 4))

    _, anchor_keep = clip_boxes(anchors, im_info[:2])
    self._anchor_index_before_clip = anchor_keep

    # Bring deltas and scores into the anchor ordering:
    # (1, 4 * A, H, W) -> (1, H, W, 4 * A) -> (H * W * A, 4), and
    # (1, A, H, W) -> (1, H, W, A) -> (H * W * A, 1),
    # rows ordered by (h, w, a) from slowest to fastest.
    bbox_deltas = bbox_deltas.transpose((0, 2, 3, 1)).reshape((-1, 4))
    scores = scores.transpose((0, 2, 3, 1)).reshape((-1, 1))

    # Convert anchors into proposals via the predicted transformations
    proposals = bbox_transform_inv(anchors, bbox_deltas)

    # 2. clip predicted boxes to image; the returned index is kept to
    # decide later whether to back-propagate (no unmapping needed here)
    proposals, clip_keep = clip_boxes(proposals, im_info[:2])
    self._proposal_index_before_clip = clip_keep

    # 3. remove predicted boxes with either height or width < threshold
    # (min_size is converted to the input image scale stored in im_info[2])
    size_keep = filter_small_boxes(proposals, min_size * im_info[2])
    proposals = proposals[size_keep, :]
    scores = scores[size_keep]
    self._ind_after_filter = size_keep

    # 4. sort by score, highest first; 5. take top pre_nms_topN
    order = scores.ravel().argsort()[::-1]
    if pre_nms_topN > 0:
        order = order[:pre_nms_topN]
    proposals = proposals[order, :]
    scores = scores[order]
    self._ind_after_sort = order

    # 6. apply nms; 7. take top post_nms_topN
    keep = nms(np.hstack((proposals, scores)), nms_thresh)
    if post_nms_topN > 0:
        keep = keep[:post_nms_topN]
    proposals = proposals[keep, :]
    scores = scores[keep]

    # 8. rois blob: only a single input image is supported, so every batch
    # index is 0
    batch_inds = np.zeros((proposals.shape[0], 1), dtype=np.float32)
    proposals = np.hstack((batch_inds, proposals.astype(np.float32, copy=False)))
    self._proposal_index = keep

    blobs = {'rois': proposals}
    if cfg_key == 'TRAIN' and cfg.TRAIN.MIX_INDEX:
        # Map the final proposals back to their index after the size filter
        chained = self._ind_after_filter[self._ind_after_sort[self._proposal_index]]
        blobs['proposal_index'] = chained.reshape(1, len(keep))

    # Copy data to forward to top layer
    for blob_name, blob in blobs.iteritems():
        top_blob = top[self._top_name_map[blob_name]]
        top_blob.reshape(*blob.shape)
        top_blob.data[...] = blob.astype(np.float32, copy=False)
def detect(net, im_path, thresh=0.05, visualize=False, timers=None,
           pyramid=False, dect_visualization_folder=None):
    """
    Main module to detect faces

    :param net: The trained network
    :param im_path: The path to the image
    :param thresh: Detection with a less score than thresh are ignored
    :param visualize: Whether to visualize the detections
    :param timers: Timers for calculating detect time (if None new timers would be created)
    :param pyramid: Whether to use pyramid during inference
    :param dect_visualization_folder: Root folder for saved visualizations
        (used when visualize=True). The name of the image's parent directory
        is appended to it. When None, None is forwarded to the visualizer.
    :return: cls_dets (bounding boxes concatenated with scores) and the timers
    """
    if not timers:
        timers = {'detect': Timer(), 'misc': Timer()}

    im = cv2.imread(im_path)
    imfname = os.path.basename(im_path)
    sys.stdout.flush()

    timers['detect'].tic()
    if not pyramid:
        im_scale = _compute_scaling_factor(im.shape, cfg.TEST.SCALES[0],
                                           cfg.TEST.MAX_SIZE)
        im_blob = _get_image_blob(im, [im_scale])
        probs, boxes = forward_net(net, im_blob[0], im_scale, False)
        boxes = boxes[:, 0:4]
    else:
        all_probs = []
        all_boxes = []
        # Compute the scaling coefficients for the pyramid
        base_scale = _compute_scaling_factor(im.shape,
                                             cfg.TEST.PYRAMID_BASE_SIZE[0],
                                             cfg.TEST.PYRAMID_BASE_SIZE[1])
        pyramid_scales = [
            float(scale) / cfg.TEST.PYRAMID_BASE_SIZE[0] * base_scale
            for scale in cfg.TEST.SCALES
        ]
        im_blobs = _get_image_blob(im, pyramid_scales)

        for i, scale in enumerate(pyramid_scales):
            probs, boxes = forward_net(net, im_blobs[i], scale, True)
            for j in range(len(probs)):
                # Do not apply M3 to the largest scale
                if i < len(pyramid_scales) - 1 or j < len(probs) - 1:
                    all_boxes.append(boxes[j][:, 0:4])
                    all_probs.append(probs[j].copy())
        probs = np.concatenate(all_probs)
        boxes = np.concatenate(all_boxes)
    timers['detect'].toc()

    timers['misc'].tic()
    # Keep detections above the score threshold, then run NMS on
    # [box, score] rows.
    inds = np.where(probs[:, 0] > thresh)[0]
    probs = probs[inds, 0]
    boxes = boxes[inds, :]
    dets = np.hstack((boxes, probs[:, np.newaxis])) \
        .astype(np.float32, copy=False)
    keep = nms(dets, cfg.TEST.NMS_THRESH)
    cls_dets = dets[keep, :]

    if visualize:
        plt_name = os.path.splitext(imfname)[0] + '_detections_{}'.format(
            net.name)
        out_folder = dect_visualization_folder
        if out_folder is not None:
            # Group outputs by the image's parent directory name. Computed
            # only here so a bare filename no longer fails when nothing is
            # visualized, and joining onto a None folder is avoided.
            im_class__file = os.path.basename(os.path.dirname(im_path))
            out_folder = os.path.join(out_folder, im_class__file)
        visusalize_detections(im, cls_dets, plt_name=plt_name,
                              visualization_folder=out_folder)
    timers['misc'].toc()

    return cls_dets, timers