import numpy as np

from object_detection.utils.np_box_ops import iou


def filter_annotations(img_all_annotations, used_classes):
  """Filters out annotations from the unused classes and dontcare regions.

  Filters out the annotations that belong to classes we do not wish to use and
  (optionally) also removes all boxes that overlap with dontcare regions.

  Args:
    img_all_annotations: A dictionary of numpy arrays holding the annotations
      for one image. See documentation of read_annotation_file for more
      details about the format of the annotations.
    used_classes: A list of strings listing the classes we want to keep; if
      the list contains "dontcare", all bounding boxes overlapping with
      dontcare regions will also be filtered out.

  Returns:
    img_filtered_annotations: A dictionary in the same format containing only
      the annotations that have passed the filtering.
  """
  img_filtered_annotations = {}

  # Filter the type of the objects.
  relevant_annotation_indices = [
      i for i, x in enumerate(img_all_annotations['type']) if x in used_classes
  ]

  for key in img_all_annotations.keys():
    img_filtered_annotations[key] = (
        img_all_annotations[key][relevant_annotation_indices])

  if 'dontcare' in used_classes:
    dont_care_indices = [
        i for i, x in enumerate(img_filtered_annotations['type'])
        if x == 'dontcare'
    ]

    # bounding box format [y_min, x_min, y_max, x_max]
    all_boxes = np.stack([
        img_filtered_annotations['2d_bbox_top'],
        img_filtered_annotations['2d_bbox_left'],
        img_filtered_annotations['2d_bbox_bottom'],
        img_filtered_annotations['2d_bbox_right']
    ], axis=1)

    ious = iou(boxes1=all_boxes, boxes2=all_boxes[dont_care_indices])

    # Remove all bounding boxes that overlap with a dontcare region.
    if ious.size > 0:
      boxes_to_remove = np.amax(ious, axis=1) > 0.0
      for key in img_all_annotations.keys():
        img_filtered_annotations[key] = (img_filtered_annotations[key][
            np.logical_not(boxes_to_remove)])

  return img_filtered_annotations
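# A minimal usage sketch (assumed toy data, not from the original source):
# KITTI-style annotations in the dict-of-numpy-arrays layout the function
# expects. Keeping 'dontcare' out of used_classes avoids the IoU-based
# dontcare path, so only numpy is needed to run this.
toy_annotations = {
    'type': np.array(['car', 'pedestrian', 'dontcare']),
    '2d_bbox_top': np.array([10.0, 20.0, 0.0]),
    '2d_bbox_left': np.array([10.0, 40.0, 0.0]),
    '2d_bbox_bottom': np.array([50.0, 80.0, 5.0]),
    '2d_bbox_right': np.array([60.0, 70.0, 5.0]),
}
filtered = filter_annotations(toy_annotations, used_classes=['car'])
print(filtered['type'])  # -> ['car']; the other rows are dropped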
from object_detection.utils import np_box_ops


def iou(boxlist1, boxlist2):
  """Computes pairwise intersection-over-union between box collections.

  Args:
    boxlist1: BoxList holding N boxes
    boxlist2: BoxList holding M boxes

  Returns:
    a numpy array with shape [N, M] representing pairwise iou scores.
  """
  return np_box_ops.iou(boxlist1.get(), boxlist2.get())
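# For reference, a self-contained sketch of the computation np_box_ops.iou
# performs (an illustration written here, not copied from the library):
# pairwise IoU for boxes in [y_min, x_min, y_max, x_max] corner format.
import numpy as np


def pairwise_iou_sketch(boxes1, boxes2):
  """Pairwise IoU between [N, 4] and [M, 4] corner-format boxes -> [N, M]."""
  # Intersection corners, broadcast to [N, M].
  y_min = np.maximum(boxes1[:, None, 0], boxes2[None, :, 0])
  x_min = np.maximum(boxes1[:, None, 1], boxes2[None, :, 1])
  y_max = np.minimum(boxes1[:, None, 2], boxes2[None, :, 2])
  x_max = np.minimum(boxes1[:, None, 3], boxes2[None, :, 3])
  intersect = (np.clip(y_max - y_min, 0, None) *
               np.clip(x_max - x_min, 0, None))
  area1 = (boxes1[:, 2] - boxes1[:, 0]) * (boxes1[:, 3] - boxes1[:, 1])
  area2 = (boxes2[:, 2] - boxes2[:, 0]) * (boxes2[:, 3] - boxes2[:, 1])
  union = area1[:, None] + area2[None, :] - intersect
  return intersect / np.maximum(union, 1e-12)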
import numpy as np


def bb_cls_matching(det_bb, gt_bb, gt_cls, iou_thresh=0.5):
  """Assigns each detected box the class of its best-overlapping gt box."""
  # `iou` here is an array-based pairwise IoU (see the sketch above), not the
  # BoxList wrapper.
  det_cls = np.zeros(len(det_bb))
  iou_array = iou(det_bb, gt_bb)
  # Keep only detections whose best IoU with any gt box passes the threshold.
  pos_ind = np.where(np.amax(iou_array, axis=1) >= iou_thresh)
  if not len(pos_ind[0]):
    print('Warning: No matching bb!')
  iou_array = iou_array[pos_ind]
  # For each surviving detection, copy the class of its best-matching gt box.
  pos_det_cls = gt_cls[np.argmax(iou_array, axis=1)]
  det_cls[pos_ind] = pos_det_cls
  return det_cls, pos_ind[0]
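# Hypothetical usage sketch: assumes the array-based pairwise IoU above is
# bound to the name `iou` that bb_cls_matching calls.
iou = pairwise_iou_sketch  # assumption for this sketch

det_boxes = np.array([[0., 0., 10., 10.], [20., 20., 30., 30.]])
gt_boxes = np.array([[0., 0., 10., 9.], [100., 100., 110., 110.]])
gt_classes = np.array([3, 7])
classes, matched = bb_cls_matching(det_boxes, gt_boxes, gt_classes)
# classes -> [3., 0.]: only the first detection overlaps a gt box with
# IoU >= 0.5 (here 90 / 100 = 0.9), so only it inherits a gt class.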
import numpy as np


def _calc_single_sample_det_info(self, ground_bbox_list, det_bbox_list,
                                 iou_threshold=0.5):
    '''
    Returns detection statistics for one image (true positives, false
    alarms, misses).
    :param ground_bbox_list: list of ground-truth boxes, ymin, xmin, ymax, xmax
    :param det_bbox_list: list of detected boxes, ymin, xmin, ymax, xmax
    :param iou_threshold: minimum IoU for a detection to count as a match
    :return: tp, fp, fn, gt_iou_list
    '''
    # Return an empty IoU list in the degenerate cases so the return arity
    # matches the normal path.
    if len(ground_bbox_list) == 0:
        return 0, len(det_bbox_list), 0, []
    if len(det_bbox_list) == 0:
        return 0, 0, len(ground_bbox_list), []
    iou_mat = box_ops.iou(np.array(ground_bbox_list), np.array(det_bbox_list))
    # Pairs whose IoU passes the threshold count as true positives, TP.
    tp = 0
    gt_over_map = {}
    det_over_map = {}
    gt_iou_list = []
    for i in range(len(ground_bbox_list)):
        if i in gt_over_map:
            continue
        max_j = -1
        max_iou = 0
        gt_iou = 0
        for j in range(len(det_bbox_list)):
            if j in det_over_map:
                continue
            if iou_mat[i, j] > gt_iou:
                gt_iou = iou_mat[i, j]
            if iou_mat[i, j] >= iou_threshold and iou_mat[i, j] > max_iou:
                max_iou = iou_mat[i, j]
                max_j = j
        gt_iou_list.append(gt_iou)
        if max_j >= 0:
            tp += 1
            gt_over_map[i] = True
            det_over_map[max_j] = True
    # Detections with no corresponding ground-truth box are over-detections
    # (false alarms), FP.
    fp = len(det_bbox_list) - tp
    # Ground-truth boxes that were never detected are misses (missed alarms),
    # FN.
    fn = len(ground_bbox_list) - tp
    return tp, fp, fn, gt_iou_list
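# Hypothetical worked example (assumes box_ops.iou is a pairwise IoU such as
# pairwise_iou_sketch above): one matched pair, one spurious detection, one
# missed ground-truth box.
#
#   gt  = [[0, 0, 10, 10], [50, 50, 60, 60]]
#   det = [[0, 0, 10,  9], [90, 90, 99, 99]]
#
# det[0] matches gt[0] with IoU 90 / 100 = 0.9 -> tp = 1; det[1] matches
# nothing -> fp = 1; gt[1] is never detected -> fn = 1; and
# gt_iou_list == [0.9, 0.0].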
import collections

import numpy as np

# External deps (import paths not shown in the original): Sort tracker,
# munkres assignment, and np_box_ops.


def track_dets(dets):
    """Perform tracking for each object class for multi-object multi-class
    tracking.

    Note, classes from dets might already be merged by prepare_dets
    merge_classes
    """
    num_frames = len(dets[list(dets.keys())[0]])
    tracks = collections.defaultdict(dict)
    for class_id in dets:
        tracker = Sort(max_age=2, min_hits=3)
        tracker.reset()
        # process tracks for one frame
        for fn in range(num_frames):
            ndets = len(dets[class_id][fn]['boxes'])
            # prepare box and score for tracker
            fn_bxsc = np.array(dets[class_id][fn]['bxsc'])
            tracks[class_id][fn] = tracker.update(fn_bxsc)

    ## associate the tid with the dets
    # loop over object classes, and frames in video
    for class_id in dets:
        for fn in dets[class_id]:
            # get current det and track boxes
            det_boxes = np.array(dets[class_id][fn]['boxes'])
            trk_boxes = tracks[class_id][fn][:, :4]
            # if no dets or no tracks, go to next frame
            if det_boxes.size == 0 or trk_boxes.size == 0:
                continue
            # do munkres iou between det-boxes and track-boxes
            iou = np_box_ops.iou(det_boxes, trk_boxes)
            dd, tt = munkres(-iou)
            for d, t in zip(dd, tt):
                dets[class_id][fn]['tid'][d] = int(tracks[class_id][fn][t, 4])
    return dets
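# The `munkres(-iou)` call above is a Hungarian (minimum-cost) assignment on
# the negated IoU matrix, i.e. a maximum-IoU matching between detections and
# tracks. A minimal equivalent sketch (an assumption about what `munkres`
# does, not the project's actual helper) using scipy:
import numpy as np
from scipy.optimize import linear_sum_assignment


def munkres_sketch(cost):
    """Return (row_indices, col_indices) of the minimum-cost assignment."""
    return linear_sum_assignment(cost)


# Example: two detections, two tracks. Negating IoU turns maximum-overlap
# matching into minimum-cost matching.
iou_mat = np.array([[0.9, 0.1], [0.2, 0.8]])
dd, tt = munkres_sketch(-iou_mat)
# dd -> [0, 1], tt -> [0, 1]: det 0 pairs with track 0, det 1 with track 1.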
import numpy as np

from object_detection.utils import np_box_ops

# filter_scores_greater_than, sort_by_field and gather are BoxList helpers
# defined alongside this function in the same module.


def non_max_suppression(boxlist,
                        max_output_size=10000,
                        iou_threshold=1.0,
                        score_threshold=-10.0):
  """Non maximum suppression.

  This op greedily selects a subset of detection bounding boxes, pruning away
  boxes that have high IOU (intersection over union) overlap (> thresh) with
  already selected boxes. In each iteration, the detected bounding box with
  highest score in the available pool is selected.

  Args:
    boxlist: BoxList holding N boxes. Must contain a 'scores' field
      representing detection scores. All scores belong to the same class.
    max_output_size: maximum number of retained boxes
    iou_threshold: intersection over union threshold.
    score_threshold: minimum score threshold. Remove the boxes with scores
      less than this value. Default value is set to -10. A very low threshold
      to pass pretty much all the boxes, unless the user sets a different
      score threshold.

  Returns:
    a BoxList holding M boxes where M <= max_output_size

  Raises:
    ValueError: if 'scores' field does not exist
    ValueError: if threshold is not in [0, 1]
    ValueError: if max_output_size < 0
  """
  if not boxlist.has_field('scores'):
    raise ValueError('Field scores does not exist')
  if iou_threshold < 0. or iou_threshold > 1.0:
    raise ValueError('IOU threshold must be in [0, 1]')
  if max_output_size < 0:
    raise ValueError('max_output_size must be non-negative.')

  boxlist = filter_scores_greater_than(boxlist, score_threshold)
  if boxlist.num_boxes() == 0:
    return boxlist

  boxlist = sort_by_field(boxlist, 'scores')

  # Prevent further computation if NMS is disabled.
  if iou_threshold == 1.0:
    if boxlist.num_boxes() > max_output_size:
      selected_indices = np.arange(max_output_size)
      return gather(boxlist, selected_indices)
    else:
      return boxlist

  boxes = boxlist.get()
  num_boxes = boxlist.num_boxes()
  # is_index_valid is True only for all remaining valid boxes.
  is_index_valid = np.full(num_boxes, 1, dtype=bool)
  selected_indices = []
  num_output = 0
  for i in range(num_boxes):
    if num_output < max_output_size:
      if is_index_valid[i]:
        num_output += 1
        selected_indices.append(i)
        is_index_valid[i] = False
        valid_indices = np.where(is_index_valid)[0]
        if valid_indices.size == 0:
          break

        intersect_over_union = np_box_ops.iou(
            np.expand_dims(boxes[i, :], axis=0), boxes[valid_indices, :])
        intersect_over_union = np.squeeze(intersect_over_union, axis=0)
        is_index_valid[valid_indices] = np.logical_and(
            is_index_valid[valid_indices],
            intersect_over_union <= iou_threshold)
  return gather(boxlist, np.array(selected_indices))
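# The same greedy loop as a self-contained sketch on raw arrays (an
# illustration, not the BoxList implementation above), reusing
# pairwise_iou_sketch from earlier: keep the highest-scoring box, drop
# everything overlapping it above the threshold, repeat.
import numpy as np


def greedy_nms_sketch(boxes, scores, iou_threshold=0.5):
  """boxes: [N, 4] corner format, scores: [N] -> indices of kept boxes."""
  order = np.argsort(-scores)  # highest score first
  keep = []
  while order.size > 0:
    i = order[0]
    keep.append(i)
    ious = pairwise_iou_sketch(boxes[i:i + 1], boxes[order[1:]])[0]
    order = order[1:][ious <= iou_threshold]
  return np.array(keep)


# e.g. boxes [[0,0,10,10], [0,0,10,9], [20,20,30,30]] with scores
# [0.9, 0.8, 0.7] keep indices [0, 2]: box 1 overlaps box 0 with IoU 0.9.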
def testIOU(self):
  # self.boxes1 ([N, 4]) and self.boxes2 ([M, 4]) come from the test
  # fixture's setUp, which is not shown here.
  iou = np_box_ops.iou(self.boxes1, self.boxes2)
  expected_iou = np.array(
      [[2.0 / 16.0, 0.0, 6.0 / 400.0], [1.0 / 16.0, 0.0, 5.0 / 400.0]],
      dtype=float)
  self.assertAllClose(iou, expected_iou)
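# Worked arithmetic for one IoU entry (toy boxes chosen here for
# illustration, reusing pairwise_iou_sketch from earlier): for
# a = [0, 0, 2, 2] and b = [1, 1, 3, 3] in corner format, the intersection is
# the 1x1 square [1, 1, 2, 2], so
#   IoU = 1 / (area(a) + area(b) - 1) = 1 / (4 + 4 - 1) = 1 / 7.
import numpy as np

a = np.array([[0., 0., 2., 2.]])
b = np.array([[1., 1., 3., 3.]])
print(pairwise_iou_sketch(a, b))  # -> [[0.14285714]], i.e. 1/7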
import collections
import pdb
import pickle

import numpy as np

# sth_dataset, SEG_FNAME_TEMPLATE, np_box_ops, munkres and filter_mask are
# project-level imports not shown here.


def sthelse_annots_masks(vid_id, sthelse_annots, obj_iou_th=0.3):
    """Create segmentation masks based on sth-else annotations
    """
    oH, oW, oN, _ = sth_dataset.video_properties(vid_id)

    ### load pickle segmentation data
    pkl_fname = SEG_FNAME_TEMPLATE % vid_id
    with open(pkl_fname, 'rb') as fid:
        seg_data = pickle.load(fid)
    # get image H, W
    _, H, W = seg_data[0]['pred_masks'].shape

    ### combine segmentation results with sthelse bounding boxes
    hand_masks = collections.defaultdict(None)
    objs_masks = collections.defaultdict(dict)
    bbox_keys = ['x1', 'y1', 'x2', 'y2']
    # assumes first frame annotation exists, get indices to objects
    num_obj = list(range(len(sthelse_annots[0]['gt_placeholders'])))
    for fn in range(oN):
        # if no labels for this frame, skip
        if fn not in sthelse_annots:
            continue
        # get mask-rcnn data
        seg_frame = seg_data[fn]
        classes = seg_frame['pred_classes']
        boxes = seg_frame['pred_boxes'].astype('int64')  # x1, y1, x2, y2
        masks = seg_frame['pred_masks']  # N x HxW masks
        # process label
        annots = sthelse_annots[fn]['labels']
        hand_label = [label for label in annots if label['category'] == 'hand']
        objs_label = [label for label in annots if label['category'] != 'hand']
        # process hand annotations
        if hand_label:
            # should not have more than one hand in videos
            if len(hand_label) > 1:
                pdb.set_trace()
            person_idx = np.where(classes == 0)[0]
            if person_idx.size > 0:
                # if mask-RCNN outputs don't have a "person" category
                # leave as None, we'll append empty masks later
                hand_bbox = [[label['box2d'][key] for key in bbox_keys]
                             for label in hand_label]
                # M=1 x 4 for computing iou
                hand_bbox = np.array(hand_bbox).astype('int64')
                # compute iou with mask-rcnn "person" category boxes
                ious = np_box_ops.iou(hand_bbox,
                                      boxes[person_idx])[0]  # M=1 x N --> N
                pick_mask = person_idx[np.argmax(ious)]
                # filter mask with label bbox
                hand_masks[fn] = filter_mask(masks[pick_mask], hand_bbox[0])
        # process obj box annotations
        if objs_label:
            obj_idx = np.where(classes != 0)[0]
            if obj_idx.size > 0:
                # if mask-RCNN outputs don't have "non-person" category
                # leave as None, we'll append empty masks later
                objs_bbox = [[label['box2d'][key] for key in bbox_keys]
                             for label in objs_label]
                # M x 4 for computing iou
                objs_bbox = np.array(objs_bbox).astype('int64')
                # compute iou with mask-rcnn "non-person" category boxes
                ious = np_box_ops.iou(objs_bbox, boxes[obj_idx])  # M x N
                # compute amount of object pixels
                # do assignment with munkres
                ll, mm = munkres(-ious)  # label indices, mask-rcnn indices
                for l, m in zip(ll, mm):
                    if ious[l, m] > obj_iou_th:
                        obj_id = int(annots[l]['gt_annotation'].split()[1])
                        objs_masks[fn][obj_id] = filter_mask(
                            masks[obj_idx[m]], objs_bbox[l])

    ### replace None or empty masks by all-zero frames
    all_zero_mask = np.zeros((H, W)).astype(bool)
    for fn in range(oN):
        if fn not in hand_masks:
            hand_masks[fn] = all_zero_mask.copy()
        for obj_id in num_obj:
            if obj_id not in objs_masks[fn]:
                objs_masks[fn][obj_id] = all_zero_mask.copy()

    # make into lists
    list_hand_masks = [hand_masks[k] for k in sorted(hand_masks.keys())]
    list_objs_masks = [[objs_masks[k][o] for k in sorted(objs_masks.keys())]
                       for o in sorted(num_obj)]
    print('id: {}, hand: {}, obj: {}'.format(vid_id, len(list_hand_masks),
                                             len(list_objs_masks)))
    return list_hand_masks, list_objs_masks
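# filter_mask is not shown in this file; a plausible sketch (an assumption,
# not the project's actual helper) zeroes out all mask pixels that fall
# outside the annotated box:
import numpy as np


def filter_mask_sketch(mask, bbox):
    """mask: [H, W] bool, bbox: (x1, y1, x2, y2) -> mask clipped to the box."""
    x1, y1, x2, y2 = bbox
    out = np.zeros_like(mask)
    out[y1:y2, x1:x2] = mask[y1:y2, x1:x2]
    return out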
import collections

import numpy as np
from scipy.stats import mode  # assumed source of `mode` used below

# Sort, np_box_ops, filter_mask and merge_tracklets are project-level
# imports not shown here.


def davidhandobj_masks(video_id, david_preds, dets, seg_data, obj_iou_th=0.3,
                       num_obj=[0]):
    """Create segmentation masks based on a mixture of David's hand-object
    model and MaskRCNN
    """
    oN = len(seg_data)
    # get image H, W
    _, H, W = seg_data[0]['pred_masks'].shape

    ### combine segmentation results with sthelse bounding boxes
    hand_masks = collections.defaultdict(None)
    objs_masks = collections.defaultdict(dict)

    ### process each frame where hand exists
    # we'll do tracking for rest of the frames later
    hobj_info = {}
    for fn in range(oN):
        hobj_info[fn] = {
            'hand': False, 'hbox_sc': None,
            'o0': False, 'o0box_sc': None,
            'o1': False, 'o1box_sc': None
        }
        # david predictions
        state = david_preds[fn]
        # mask-rcnn
        seg_frame = seg_data[fn]
        classes = seg_frame['pred_classes']
        boxes = seg_frame['pred_boxes'].astype('int64')  # x1, y1, x2, y2
        masks = seg_frame['pred_masks']  # N x HxW masks
        ## check if 'hand' in david's predictions
        if state['hand'] is not None:
            # person class in seg-data?
            person_idx = np.where(classes == 0)[0]
            if person_idx.size > 0:
                # if mask-RCNN outputs don't have a "person" category
                # leave as None, we'll append empty masks later
                hobj_info[fn]['hand'] = True
                # M=1 x 4 for computing iou
                hand_bbox = np.array([state['hand'][:4].astype('int64')])
                hobj_info[fn]['hbox_sc'] = \
                    np.hstack((hand_bbox[0], state['hand'][4:5]))
                # compute iou with mask-rcnn "person" category boxes
                ious = np_box_ops.iou(hand_bbox,
                                      boxes[person_idx])[0]  # M=1 x N --> N
                pick_mask = person_idx[np.argmax(ious)]
                # filter mask with label bbox
                hand_masks[fn] = filter_mask(masks[pick_mask], hand_bbox[0])
            ## add object information only if hand is present
            if state['obj'] is not None:
                obj_idx = np.where(classes != 0)[0]
                if obj_idx.size > 0:
                    # if mask-RCNN outputs don't have "non-person" category
                    # leave as None, we'll append empty masks later
                    hobj_info[fn]['o0'] = True
                    idx = 0
                    nobjs, _ = state['obj'].shape
                    if nobjs > 1:
                        idx = np.argmax(state['obj'][:, 4])
                    obj_bbox = state['obj'][idx][:4].astype('int64')
                    # M=1 x 4 for computing iou
                    obj_bbox = np.array([obj_bbox])
                    hobj_info[fn]['o0box_sc'] = \
                        np.hstack((obj_bbox[0], state['obj'][idx][4:5]))

    ### do tracking to fill in other object boxes for frames
    tracker = Sort(max_age=5, min_hits=0)
    tracker.reset()
    # process for each frame
    tracks = {fn: {0: None, 1: None} for fn in range(oN)}
    for fn in range(oN):
        ## object tracker
        if hobj_info[fn]['hand'] and hobj_info[fn]['o0']:
            # selected box david's model
            o0box_sc = np.array([hobj_info[fn]['o0box_sc']])
            tracks[fn][0] = tracker.update(o0box_sc)
        elif dets[1][fn]['bxsc']:  # mask-rcnn detection
            bxsc = np.array(dets[1][fn]['bxsc'])
            tracks[fn][0] = tracker.update(bxsc)
        else:  # no box
            tracks[fn][0] = tracker.update([])
    # merge
    # tracks = merge_tracklets(video_id, tracks, obj_id=0, obj_sim_thr=0.95)

    # get track ids for main object
    pick_tid = [-1, -1]
    tid = []
    for fn in range(oN):
        if hobj_info[fn]['o0']:
            tid.append(tracks[fn][0][0][-1])
    if np.unique(tid).size != 1:
        print('WARNING: David object in more than one track. Using mode')
        pick_tid[0] = mode(tid).mode[0]
    else:
        pick_tid[0] = np.unique(tid)[0]
    # show all boxes for this track
    # for fn in range(oN): idx = np.where(tracks[fn][0][:, 4] == uniq_tid)[0]; print(fn, hobj_info[fn]['o0'], tracks[fn][0][idx])
    # pdb.set_trace()

    ### if looking for more than one object, track all mask-rcnn boxes
    ### to get stable object
    if len(num_obj) == 2:
        tracker1 = Sort(max_age=12, min_hits=0)
        for fn in range(oN):
            tracks[fn][1] = tracker1.update(np.array(dets[1][fn]['bxsc']))
        # object 0 final boxes
        o0boxes = {}
        for fn in tracks.keys():
            fn_tids = tracks[fn][0][:, 4]
            intrack_idx = np.where(fn_tids == pick_tid[0])[0]
            if intrack_idx.size > 0:
                # which box is track-det?
                o0boxes[fn] = np.array(tracks[fn][0][intrack_idx, :4])
            else:
                o0boxes[fn] = None
        # tid specific information
        tid2fn = collections.defaultdict(list)  # which frames they appear in
        fn2tid = collections.defaultdict(list)  # inverse map
        tid_area = collections.defaultdict(list)  # compute normalized area
        tid_olap = collections.defaultdict(
            list)  # overlap between object 0 and object 1
        for fn in range(oN):
            for x1, y1, x2, y2, tid in tracks[fn][1]:
                fn2tid[fn].append(tid)
                tid2fn[tid].append(fn)
                tid_area[tid].append((x2 - x1) * (y2 - y1) / (H * W))
                # get box of object-0 (manipulated)
                fn_tids = tracks[fn][0][:, 4]
                intrack_idx = np.where(fn_tids == pick_tid[0])[0]
                if intrack_idx.size > 0:
                    # which box is track-det?
                    o0box = np.array(tracks[fn][0][intrack_idx, :4])
                    iou = np_box_ops.iou(np.array([[x1, y1, x2, y2]]), o0box)
                    tid_olap[tid].append(iou[0, 0])
                else:
                    tid_olap[tid].append(0.)
        for tid in tid_area.keys():
            tid_area[tid] = np.mean(tid_area[tid])
            tid_olap[tid] = np.mean(tid_olap[tid])
        # get the longest track, and let's assume this is the secondary object
        tid_lens = {k: 1. * len(v) / oN for k, v in tid2fn.items()}
        # normalize lengths by area of coverage
        tid_norm = {
            k: tid_lens[k] * (1 - tid_area[k]) * (1 - tid_olap[k])
            for k in tid_lens
        }
        pick_tid[1] = max(tid_norm, key=tid_norm.get)
        # show boxes of this track id
        # for fn in range(oN): idx = np.where(tracks[fn][1][:, 4] == 24)[0]; print(fn, tracks[fn][1][idx])

    ### replace None or empty masks by all-zero frames
    all_zero_mask = np.zeros((H, W)).astype(bool)
    for fn in range(oN):
        # put zeros for hand-masks
        if fn not in hand_masks:
            hand_masks[fn] = all_zero_mask.copy()
        # put mask-rcnn / zeros for obj-masks
        for obj_id in num_obj:
            # if obj_id not in objs_masks[fn]:
            # check if object tid exists in frame, then copy
            fn_tids = tracks[fn][obj_id][:, 4]
            intrack_idx = np.where(fn_tids == pick_tid[obj_id])[0]
            if intrack_idx.size > 0:
                # which box is track-det?
                fn_tbox = tracks[fn][obj_id][intrack_idx, :4]
                frame_boxes = np.array(dets[1][fn]['boxes'])
                iousc = np_box_ops.iou(fn_tbox, frame_boxes)
                fn_dets_idx = np.argmax(iousc)
                # print(fn, np.max(iousc), iousc)
                if np.max(iousc) > obj_iou_th:
                    fn_mask_idx = dets[1][fn]['idx'][fn_dets_idx]
                    objs_masks[fn][obj_id] = seg_data[fn]['pred_masks'][
                        fn_mask_idx]
                else:
                    objs_masks[fn][obj_id] = all_zero_mask.copy()
            else:
                # if box not in track, let it be
                objs_masks[fn][obj_id] = all_zero_mask.copy()

    # make into lists
    list_hand_masks = [hand_masks[k] for k in sorted(hand_masks.keys())]
    list_objs_masks = [[objs_masks[k][o] for k in sorted(objs_masks.keys())]
                       for o in sorted(num_obj)]
    print('id: {}, hand: {}, obj: {}'.format(video_id, len(list_hand_masks),
                                             len(list_objs_masks)))
    return list_hand_masks, list_objs_masks
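# The Sort tracker used throughout (interface inferred from its usage here
# and in track_dets; the real implementation is an external SORT tracker):
# update() takes an [N, 5] array of [x1, y1, x2, y2, score] detections for
# one frame and returns an [M, 5] array of [x1, y1, x2, y2, track_id] for the
# live tracks. A do-nothing stub illustrating that assumed contract:
import numpy as np


class SortStub:
    """Hypothetical stand-in that gives every detection a fresh track id."""

    def __init__(self, max_age=2, min_hits=3):
        self.max_age, self.min_hits = max_age, min_hits
        self.next_id = 0

    def reset(self):
        self.next_id = 0

    def update(self, dets):
        dets = np.asarray(dets, dtype=float).reshape(-1, 5)
        ids = np.arange(self.next_id, self.next_id + len(dets))
        self.next_id += len(dets)
        return np.hstack([dets[:, :4], ids[:, None].astype(float)])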