def format_track_results(self, results, infos, resfile):
    """Format tracking results into the MOTChallenge txt format.

    Args:
        results (list[list[ndarray]]): Per-frame tracking results. Each
            frame entry is a per-class list of ndarrays whose rows are
            [id, x1, y1, x2, y2, score].
        infos (list[dict]): Image infos of the frames in one video.
        resfile (str): Path of the output txt file.
    """
    results_per_video = []
    for frame_id, result in enumerate(results):
        outs_track = results2outs(bbox_results=result)
        track_ids, bboxes = outs_track['ids'], outs_track['bboxes']
        frame_ids = np.full_like(track_ids, frame_id)
        results_per_frame = np.concatenate(
            (frame_ids[:, None], track_ids[:, None], bboxes), axis=1)
        results_per_video.append(results_per_frame)
    # `results_per_video` is a ndarray with shape (N, 7). Each row denotes
    # (frame_id, track_id, x1, y1, x2, y2, score)
    results_per_video = np.concatenate(results_per_video)

    if self.interpolate_tracks_cfg is not None:
        results_per_video = interpolate_tracks(
            results_per_video, **self.interpolate_tracks_cfg)

    with open(resfile, 'wt') as f:
        for frame_id, info in enumerate(infos):
            # `mot_frame_id` is the actual frame id used for evaluation.
            # It may not start from 0.
            if 'mot_frame_id' in info:
                mot_frame_id = info['mot_frame_id']
            else:
                mot_frame_id = info['frame_id'] + 1
            results_per_frame = \
                results_per_video[results_per_video[:, 0] == frame_id]
            for row in results_per_frame:
                _, track_id, x1, y1, x2, y2, conf = row
                # `track_id` was promoted to float when stacked with the
                # float bboxes above; the MOT format expects an integer
                # id, so cast it back before writing.
                f.write(
                    f'{int(track_id)},{x1:.3f},{y1:.3f},'.replace(
                        f'{int(track_id)},', f'{mot_frame_id},'
                        f'{int(track_id)},', 1) if False else
                    f'{mot_frame_id},{int(track_id)},{x1:.3f},{y1:.3f},'
                    f'{(x2-x1):.3f},{(y2-y1):.3f},{conf:.3f},-1,-1,-1\n')
def test_results2outs():
    """Check results2outs with and without track ids in the bboxes."""
    from mmtrack.core import results2outs

    num_classes = 3
    objects_per_class = [2, 0, 2]
    total_objects = sum(objects_per_class)

    # Expected labels: each class index repeated once per object.
    expected_labels = []
    for cls_id, count in enumerate(objects_per_class):
        expected_labels += [cls_id] * count

    image_size = 100
    bbox_results = [
        np.random.randint(low=0, high=image_size, size=(count, 5))
        for count in objects_per_class
    ]
    bbox_results_with_ids = [
        np.random.randint(low=0, high=image_size, size=(count, 6))
        for count in objects_per_class
    ]
    mask_results = [[
        np.random.randint(0, 2, (image_size, image_size))
        for _ in range(count)
    ] for count in objects_per_class]

    # Case 1: plain (n, 5) bboxes, no ids.
    outs = results2outs(
        bbox_results=bbox_results,
        mask_results=mask_results,
        mask_shape=(image_size, image_size))
    for key in ['bboxes', 'labels', 'masks']:
        assert key in outs
    assert outs['bboxes'].shape == (total_objects, 5)
    assert (outs['labels'] == np.array(expected_labels)).all()
    assert outs['masks'].shape == (total_objects, image_size, image_size)

    # Case 2: (n, 6) bboxes carrying a track id in the first column.
    outs = results2outs(
        bbox_results=bbox_results_with_ids,
        mask_results=mask_results,
        mask_shape=(image_size, image_size))
    for key in ['bboxes', 'labels', 'ids', 'masks']:
        assert key in outs
    assert outs['bboxes'].shape == (total_objects, 5)
    assert (outs['labels'] == np.array(expected_labels)).all()
    assert outs['ids'].shape == (total_objects, )
    assert outs['masks'].shape == (total_objects, image_size, image_size)
def show_result(self,
                img,
                result,
                score_thr=0.0,
                thickness=1,
                font_scale=0.5,
                show=False,
                out_file=None,
                wait_time=0,
                backend='cv2',
                **kwargs):
    """Visualize tracking results.

    Args:
        img (str | ndarray): Filename of loaded image.
        result (dict): Tracking result.

            - The value of key 'track_bboxes' is list with length
              num_classes, and each element in list is ndarray with
              shape(n, 6) in [id, tl_x, tl_y, br_x, br_y, score] format.
            - The value of key 'det_bboxes' is list with length
              num_classes, and each element in list is ndarray with
              shape(n, 5) in [tl_x, tl_y, br_x, br_y, score] format.
        score_thr (float, optional): Minimum score for a box to be
            drawn. Defaults to 0.0.
        thickness (int, optional): Thickness of lines. Defaults to 1.
        font_scale (float, optional): Font scales of texts. Defaults
            to 0.5.
        show (bool, optional): Whether show the visualizations on the
            fly. Defaults to False.
        out_file (str | None, optional): Output filename. Defaults to
            None.
        wait_time (int, optional): Display wait time passed through to
            the drawing backend. Defaults to 0.
        backend (str, optional): Backend to draw the bounding boxes,
            options are `cv2` and `plt`. Defaults to 'cv2'.

    Returns:
        ndarray: Visualized image.
    """
    assert isinstance(result, dict)
    if isinstance(img, str):
        img = mmcv.imread(img)
    outs_track = results2outs(
        bbox_results=result.get('track_bboxes', None),
        mask_results=result.get('track_masks', None),
        mask_shape=img.shape[:2])
    return imshow_tracks(
        img,
        outs_track.get('bboxes', None),
        outs_track.get('labels', None),
        outs_track.get('ids', None),
        outs_track.get('masks', None),
        classes=self.CLASSES,
        score_thr=score_thr,
        thickness=thickness,
        font_scale=font_scale,
        show=show,
        out_file=out_file,
        wait_time=wait_time,
        backend=backend)
def __call__(self, results):
    """Expose public detections as `public_bboxes` in `results`.

    Args:
        results (dict): Pipeline results containing a 'detections' key.

    Returns:
        dict: The same results dict with 'public_bboxes',
        'public_labels' and (when scores are present) 'public_scores'
        filled in, and 'public_bboxes' registered in 'bbox_fields'.
    """
    outs = results2outs(bbox_results=results['detections'])
    det_bboxes = outs['bboxes']
    results['public_bboxes'] = det_bboxes[:, :4]
    # A 5th column, when present, carries the detection score.
    if det_bboxes.shape[1] > 4:
        results['public_scores'] = det_bboxes[:, -1]
    results['public_labels'] = outs['labels']
    results['bbox_fields'].append('public_bboxes')
    return results
def simple_test(self, img, img_metas, rescale=False, **kwargs):
    """Test without augmentations.

    Args:
        img (Tensor): of shape (N, C, H, W) encoding input images.
            Typically these should be mean centered and std scaled.
        img_metas (list[dict]): list of image info dict where each dict
            has: 'img_shape', 'scale_factor', 'flip', and may also contain
            'filename', 'ori_shape', 'pad_shape', and 'img_norm_cfg'.
        rescale (bool, optional): If False, then returned bboxes and masks
            will fit the scale of img, otherwise, returned bboxes and
            masks will fit the scale of original image shape. Defaults to
            False.

    Returns:
        dict[str : list(ndarray)]: The tracking results.
    """
    frame_id = img_metas[0].get('frame_id', -1)
    if frame_id == 0:
        # First frame of a new video: start the tracker from scratch.
        self.tracker.reset()

    detections = self.detector.simple_test(img, img_metas, rescale=rescale)
    assert len(detections) == 1, 'Batch inference is not supported.'
    bbox_results = detections[0]
    num_classes = len(bbox_results)

    outs_det = results2outs(bbox_results=bbox_results)
    det_bboxes = torch.from_numpy(outs_det['bboxes']).to(img)
    det_labels = torch.from_numpy(outs_det['labels']).to(img).long()

    track_bboxes, track_labels, track_ids = self.tracker.track(
        img=img,
        img_metas=img_metas,
        model=self,
        bboxes=det_bboxes,
        labels=det_labels,
        frame_id=frame_id,
        rescale=rescale,
        **kwargs)

    return dict(
        det_bboxes=outs2results(
            bboxes=det_bboxes,
            labels=det_labels,
            num_classes=num_classes)['bbox_results'],
        track_bboxes=outs2results(
            bboxes=track_bboxes,
            labels=track_labels,
            ids=track_ids,
            num_classes=num_classes)['bbox_results'])
def simple_test(self, img, img_metas, rescale=False):
    """Test forward.

    Args:
        img (Tensor): of shape (N, C, H, W) encoding input images.
            Typically these should be mean centered and std scaled.
        img_metas (list[dict]): list of image info dict where each dict
            has: 'img_shape', 'scale_factor', 'flip', and may also contain
            'filename', 'ori_shape', 'pad_shape', and 'img_norm_cfg'.
        rescale (bool): whether to rescale the bboxes.

    Returns:
        dict[str : Tensor]: Track results.
    """
    # TODO inherit from a base tracker
    assert self.with_track_head, 'track head must be implemented.'  # noqa
    frame_id = img_metas[0].get('frame_id', -1)
    # Frame 0 marks the start of a new video: clear tracker state.
    if frame_id == 0:
        self.tracker.reset()

    x = self.detector.extract_feat(img)
    proposal_list = self.detector.rpn_head.simple_test_rpn(x, img_metas)
    det_results = self.detector.roi_head.simple_test(
        x, proposal_list, img_metas, rescale=rescale)
    bbox_results = det_results[0]
    num_classes = len(bbox_results)

    outs_det = results2outs(bbox_results=bbox_results)
    # `torch.from_numpy` (consistent with the sibling trackers) avoids an
    # extra copy before `.to(img)` moves/casts the data anyway.
    det_bboxes = torch.from_numpy(outs_det['bboxes']).to(img)
    det_labels = torch.from_numpy(outs_det['labels']).to(img).long()

    track_bboxes, track_labels, track_ids = self.tracker.track(
        img_metas=img_metas,
        feats=x,
        model=self,
        bboxes=det_bboxes,
        labels=det_labels,
        frame_id=frame_id)

    track_bboxes = outs2results(
        bboxes=track_bboxes,
        labels=track_labels,
        ids=track_ids,
        num_classes=num_classes)['bbox_results']
    return dict(det_bboxes=bbox_results, track_bboxes=track_bboxes)
def format_bbox_results(self, results, infos, resfile):
    """Format detection results into the MOTChallenge det txt format.

    Args:
        results (list[list[ndarray]]): Per-frame detection results; each
            frame entry is a per-class list of (n, 5) ndarrays in
            [x1, y1, x2, y2, score] format.
        infos (list[dict]): Image infos of the frames.
        resfile (str): Path of the output txt file.
    """
    # The `with` block closes the file on exit; the former trailing
    # `f.close()` was redundant and has been removed.
    with open(resfile, 'wt') as f:
        for res, info in zip(results, infos):
            # `mot_frame_id` is the actual frame id used for evaluation;
            # it may not start from 0.
            if 'mot_frame_id' in info:
                frame = info['mot_frame_id']
            else:
                frame = info['frame_id'] + 1

            outs_det = results2outs(bbox_results=res)
            # Labels are not written in this format, so iterate bboxes only.
            for bbox in outs_det['bboxes']:
                x1, y1, x2, y2, conf = bbox
                f.write(f'{frame},-1,{x1:.3f},{y1:.3f},{(x2-x1):.3f},'
                        f'{(y2-y1):.3f},{conf:.3f}\n')
def format_track_results(self, results, infos, resfile):
    """Format tracking results into the MOTChallenge txt format.

    Args:
        results (list[list[ndarray]]): Per-frame tracking results; each
            frame entry is a per-class list of (n, 6) ndarrays in
            [id, x1, y1, x2, y2, score] format.
        infos (list[dict]): Image infos of the frames.
        resfile (str): Path of the output txt file.
    """
    with open(resfile, 'wt') as f:
        for res, info in zip(results, infos):
            # `mot_frame_id` is the actual frame id used for evaluation;
            # it may not start from 0.
            if 'mot_frame_id' in info:
                frame = info['mot_frame_id']
            else:
                frame = info['frame_id'] + 1

            outs_track = results2outs(bbox_results=res)
            # `track_id` avoids shadowing the builtin `id`; labels are not
            # written in this format, so they are not unpacked.
            for bbox, track_id in zip(outs_track['bboxes'],
                                      outs_track['ids']):
                x1, y1, x2, y2, conf = bbox
                f.write(f'{frame},{track_id},{x1:.3f},{y1:.3f},'
                        f'{(x2-x1):.3f},{(y2-y1):.3f},{conf:.3f},'
                        f'-1,-1,-1\n')
def format_results(self, results, resfile_path=None, metrics=['track_segm']):
    """Format the results to a zip file (standard format for
    YouTube-VIS Challenge).

    Args:
        results (dict(list[ndarray])): Testing results of the dataset.
        resfile_path (str, optional): Path to save the formatted results.
            Defaults to None.
        metrics (list[str], optional): The results of the specific metrics
            will be formatted. Defaults to ['track_segm']. (The default
            list is never mutated, so the mutable default is safe here.)

    Returns:
        tuple: (resfiles, tmp_dir), resfiles is the path of the result
        json file, tmp_dir is the temporal directory created for saving
        files.
    """
    assert isinstance(results, dict), 'results must be a dict.'
    if isinstance(metrics, str):
        metrics = [metrics]
    assert 'track_segm' in metrics

    if resfile_path is None:
        tmp_dir = tempfile.TemporaryDirectory()
        resfile_path = tmp_dir.name
    else:
        tmp_dir = None
    resfiles = osp.join(resfile_path, 'results.json')

    # Indices of the first frame of each video; a sentinel at the end
    # makes `inds[i]:inds[i + 1]` slice out video i's frames.
    inds = [i for i, _ in enumerate(self.data_infos) if _['frame_id'] == 0]
    num_vids = len(inds)
    assert num_vids == len(self.vid_ids)
    inds.append(len(self.data_infos))
    vid_infos = self.coco.load_vids(self.vid_ids)

    json_results = []
    for i in range(num_vids):
        video_id = vid_infos[i]['id']
        # collect data for each instance in a video.
        collect_data = dict()
        for frame_id, (bbox_res, mask_res) in enumerate(
                zip(results['track_bboxes'][inds[i]:inds[i + 1]],
                    results['track_masks'][inds[i]:inds[i + 1]])):
            outs_track = results2outs(bbox_results=bbox_res)
            bboxes = outs_track['bboxes']
            labels = outs_track['labels']
            ids = outs_track['ids']
            masks = mmcv.concat_list(mask_res)
            assert len(masks) == len(bboxes)
            # BUGFIX: the inner index must not be `i` — reusing `i` here
            # clobbered the video index, so `inds[i + 1] - inds[i]` below
            # ranged over the wrong video.
            for j, instance_id in enumerate(ids):
                if instance_id not in collect_data:
                    collect_data[instance_id] = dict(
                        category_ids=[], scores=[], segmentations=dict())
                collect_data[instance_id]['category_ids'].append(labels[j])
                collect_data[instance_id]['scores'].append(bboxes[j][4])
                # RLE counts must be str (not bytes) to be JSON-serializable.
                if isinstance(masks[j]['counts'], bytes):
                    masks[j]['counts'] = masks[j]['counts'].decode()
                collect_data[instance_id]['segmentations'][frame_id] = \
                    masks[j]

        # transform the collected data into official format
        for instance_id, id_data in collect_data.items():
            output = dict()
            output['video_id'] = video_id
            output['score'] = np.array(id_data['scores']).mean().item()
            # majority voting for sequence category
            output['category_id'] = np.bincount(
                np.array(id_data['category_ids'])).argmax().item() + 1
            # One entry per frame of the video; None where the instance
            # was not tracked in that frame.
            output['segmentations'] = []
            for frame_id in range(inds[i + 1] - inds[i]):
                if frame_id in id_data['segmentations']:
                    output['segmentations'].append(
                        id_data['segmentations'][frame_id])
                else:
                    output['segmentations'].append(None)
            json_results.append(output)

    mmcv.dump(json_results, resfiles)

    # zip the json file in order to submit to the test server.
    zip_file_name = osp.join(resfile_path, 'submission_file.zip')
    with zipfile.ZipFile(zip_file_name, 'w', zipfile.ZIP_DEFLATED) as zf:
        print_log(f"zip the 'results.json' into '{zip_file_name}', "
                  'please submit the zip file to the test server')
        zf.write(resfiles, 'results.json')
    return resfiles, tmp_dir