def _compute_distance_matrix(
    prev_json_data, prev_boxes, prev_poses,
    cur_json_data, cur_boxes, cur_poses,
    cost_types, cost_weights,
):
    assert(len(cost_types) == len(cost_weights))
    all_Cs = []
    for cost_type, cost_weight in zip(cost_types, cost_weights):
        if cost_weight == 0:
            continue
        if cost_type == 'bbox_overlap':
            all_Cs.append((1 - _compute_pairwise_iou(prev_boxes, cur_boxes)))
        elif cost_type == 'cnn-cosdist':
            all_Cs.append(_compute_pairwise_deep_cosine_dist(
                img_utils.get_image_path(prev_json_data), prev_boxes,
                img_utils.get_image_path(cur_json_data), cur_boxes))
        elif cost_type == 'pose-pck':
            all_Cs.append(_compute_pairwise_kps_pck_distance(prev_poses, cur_poses))
        elif cost_type == 'pose-oks':
            all_Cs.append(_compute_pairwise_kps_oks_distance(
                prev_poses, prev_boxes, cur_poses, cur_boxes))
        else:
            raise NotImplementedError('Unknown cost type {}'.format(cost_type))
        all_Cs[-1] *= cost_weight
    return np.sum(np.stack(all_Cs, axis=0), axis=0)

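# NOTE: `_compute_pairwise_kps_oks_distance`, used by the 'pose-oks' branch
# above, is not part of this listing. Below is a minimal illustrative sketch,
# assuming each pose is an array whose first two rows hold keypoint x and y
# coordinates (as in `_compute_tracks_video` later in this file) and boxes
# are [x1, y1, x2, y2, score]. A single hypothetical falloff constant
# KPS_SIGMA is used instead of the per-keypoint COCO sigmas.
KPS_SIGMA = 0.1  # hypothetical, shared across all keypoints


def _compute_pairwise_kps_oks_distance_sketch(prev_poses, prev_boxes,
                                              cur_poses, cur_boxes):
    """Return an (N_prev x N_cur) matrix of (1 - OKS) distances."""
    dist = np.ones((len(prev_poses), len(cur_poses)))
    for i, p_pose in enumerate(prev_poses):
        # object scale s^2 ~ box area of the previous-frame detection
        p_area = ((prev_boxes[i, 2] - prev_boxes[i, 0]) *
                  (prev_boxes[i, 3] - prev_boxes[i, 1]))
        for j, c_pose in enumerate(cur_poses):
            # squared distances between corresponding keypoints
            d2 = ((p_pose[0] - c_pose[0]) ** 2 +
                  (p_pose[1] - c_pose[1]) ** 2)
            e = d2 / (2.0 * (KPS_SIGMA ** 2) * (p_area + np.spacing(1)))
            oks = np.mean(np.exp(-e))
            dist[i, j] = 1.0 - oks  # lower is better, like the IoU cost
    return dist
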
def _compute_distance_matrix(
    prev_json_data, prev_boxes, prev_poses,
    cur_json_data, cur_boxes, cur_poses,
    cost_types, cost_weights,
):
    assert(len(cost_weights) == len(cost_types))
    all_Cs = []
    for cost_type, cost_weight in zip(cost_types, cost_weights):
        if cost_weight == 0:
            continue
        if cost_type == 'bbox-overlap':
            all_Cs.append((1 - _compute_pairwise_iou(prev_boxes, cur_boxes)))
        elif cost_type == 'cnn-cosdist':
            all_Cs.append(_compute_pairwise_deep_cosine_dist(
                img_utils.get_image_path(prev_json_data), prev_boxes,
                img_utils.get_image_path(cur_json_data), cur_boxes))
        elif cost_type == 'pose-pck':
            kps_names = cur_json_data['dataset'].person_cat_info['keypoints']
            all_Cs.append(_compute_pairwise_kpt_distance(
                prev_poses, cur_poses, kps_names))
        else:
            raise NotImplementedError('Unknown cost type {}'.format(cost_type))
        all_Cs[-1] *= cost_weight
    return np.sum(np.stack(all_Cs, axis=0), axis=0)

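# NOTE: `_compute_pairwise_iou`, used by the 'bbox-overlap' cost above, is
# referenced but not included in this listing. A minimal sketch, assuming
# boxes are numpy arrays whose first four columns are [x1, y1, x2, y2] (as
# the rest of this code assumes); illustrative only, not necessarily the
# original implementation.
def _compute_pairwise_iou_sketch(prev_boxes, cur_boxes):
    """Return an (N_prev x N_cur) matrix of IoU overlaps."""
    ious = np.zeros((prev_boxes.shape[0], cur_boxes.shape[0]))
    for i, a in enumerate(prev_boxes):
        for j, b in enumerate(cur_boxes):
            # intersection rectangle
            ix1, iy1 = max(a[0], b[0]), max(a[1], b[1])
            ix2, iy2 = min(a[2], b[2]), min(a[3], b[3])
            iw, ih = max(0.0, ix2 - ix1), max(0.0, iy2 - iy1)
            inter = iw * ih
            area_a = (a[2] - a[0]) * (a[3] - a[1])
            area_b = (b[2] - b[0]) * (b[3] - b[1])
            union = area_a + area_b - inter
            ious[i, j] = inter / union if union > 0 else 0.0
    return ious
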
def compute_matches_tracks(json_data, dets, lstm_model):
    # Consider all consecutive frames, and match the boxes
    num_imgs = len(json_data)
    all_tracks = [[]] * len(json_data)
    # First split the images into videos
    all_video_roidb = []
    video_entries = []
    for img_id in range(num_imgs):
        if img_id == 0 or _is_same_video(json_data[img_id - 1], json_data[img_id]):
            video_entries.append((json_data[img_id], img_id))
        else:
            all_video_roidb.append(sorted(
                video_entries, key=lambda x: img_utils.get_image_path(x[0])))
            video_entries = [(json_data[img_id], img_id)]
    if len(video_entries) > 0:
        all_video_roidb.append(video_entries)
    # Make sure I got everything
    assert(len(json_data) == len(gen_utils.flatten_list(all_video_roidb)))
    logger.info('Computing tracks for {} videos.'.format(len(all_video_roidb)))
    for vid_id in tqdm(range(len(all_video_roidb)), desc='Tracks compute'):
        if cfg.TRACKING.LSTM_TEST.LSTM_TRACKING_ON:
            tracks = _compute_tracks_video_lstm(
                all_video_roidb[vid_id], dets, lstm_model)
        else:
            tracks = _compute_tracks_video(all_video_roidb[vid_id], dets)
        if cfg.TRACKING.FLOW_SMOOTHING_ON:
            tracks = _smooth_pose_video(
                all_video_roidb[vid_id], dets, tracks)
        # re-sort and assign
        for i, (_, det_id) in enumerate(all_video_roidb[vid_id]):
            all_tracks[det_id] = tracks[i]
            if cfg.TRACKING.DEBUG.DUMMY_TRACKS:
                # Replace with random track IDs
                all_tracks[det_id] = [
                    np.random.randint(FIRST_TRACK_ID, MAX_TRACK_IDS + 1)
                    for _ in tracks[i]]
    dets['all_tracks'] = [[], all_tracks]
    _summarize_track_stats(all_tracks, json_data)
    return dets

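# NOTE: on return, `dets['all_tracks'][1][det_id]` is a list of integer track
# ids parallel to the detections stored for that image. `gen_utils.flatten_list`
# is only used above to verify the per-video split covers every image exactly
# once; a minimal sketch of the assumed behavior (flatten by one level):
def _flatten_list_sketch(list_of_lists):
    return [el for sub in list_of_lists for el in sub]
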
def _known_shot_change(video_json_data, frame_id):
    if not cfg.TRACKING.DEBUG.UPPER_BOUND_3_SHOTS:
        # Only used in debugging mode, to get an upper bound
        return False
    if frame_id == 0:
        return True
    # read the CSV of labeled shot boundaries
    import csv
    D = {}
    with open('/path/to/shot_boundaries_val.csv', 'r') as fin:
        reader = csv.reader(fin)
        for row in reader:
            D[row[0]] = [int(el) for el in row[1].strip().split(',')
                         if len(el) > 0]
    vname = osp.dirname(img_utils.get_image_path(video_json_data[frame_id][0]))
    vname = osp.basename(osp.dirname(vname)) + '/' + osp.basename(vname)
    assert(vname in D)
    # frame_id + 1 since frame_id is 0-indexed and the labels are 1-indexed
    return (frame_id + 1) in D[vname]

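# NOTE (illustrative, not from the original source): given how the CSV is
# parsed above, each row is expected to look like
#   <parent_dir>/<video_dir>,"<frame1>,<frame2>,..."
# e.g. a hypothetical row:
#   some_sequence_dir/012345_mpii,"1,151,320"
# where column 0 is the last two path components of the frame directory and
# column 1 is a comma-separated list of 1-indexed frame numbers at which a
# shot change occurs.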
def _is_same_video(json1, json2):
    return (osp.dirname(img_utils.get_image_path(json1)) ==
            osp.dirname(img_utils.get_image_path(json2)))

def _compute_tracks_video(video_json_data, dets):
    nframes = len(video_json_data)
    video_tracks = []
    next_track_id = FIRST_TRACK_ID
    for frame_id in range(nframes):
        frame_tracks = []
        # each element is (roidb entry, idx in the dets/original roidb)
        frame_data, det_id = video_json_data[frame_id]
        cur_boxes = _get_boxes(dets, det_id)
        cur_poses = _get_poses(dets, det_id)
        if (frame_id == 0 or _known_shot_change(video_json_data, frame_id)) \
                and not cfg.TRACKING.DEBUG.UPPER_BOUND:
            matches = -np.ones((cur_boxes.shape[0], ))
        else:
            cur_frame_data = frame_data
            if cfg.TRACKING.DEBUG.UPPER_BOUND:
                prev_boxes = frame_data['boxes']
                prev_poses = [el for el in frame_data['gt_keypoints']]
                prev_frame_data = {
                    'image': img_utils.get_image_path(cur_frame_data)}
            else:
                prev_boxes = _get_boxes(dets, video_json_data[frame_id - 1][1])
                prev_poses = _get_poses(dets, video_json_data[frame_id - 1][1])
                # 0-index to remove the other index to the dets structure
                prev_frame_data = video_json_data[frame_id - 1][0]
            matches = _compute_matches(
                prev_frame_data, cur_frame_data, prev_boxes, cur_boxes,
                prev_poses, cur_poses,
                cost_types=cfg.TRACKING.DISTANCE_METRICS,
                cost_weights=cfg.TRACKING.DISTANCE_METRIC_WTS,
                bipart_match_algo=cfg.TRACKING.BIPARTITE_MATCHING_ALGO)
        if cfg.TRACKING.DEBUG.UPPER_BOUND:
            prev_tracks = frame_data['tracks'].reshape((-1)).tolist()
            matched = np.where(np.array(matches) != -1)[0]
            # Remove things unmatched
            matches = matches[matched]
            new_boxes = _get_boxes(dets, det_id)[matched]
            # This doesn't help, but made the pose score go low
            # new_boxes[:, -1] = 1.0  # make the detections 100% confidence
            new_poses = [_get_poses(dets, det_id)[el] for el in matched]
            if cfg.TRACKING.DEBUG.UPPER_BOUND_2_GT_KPS:
                # Set the points to be GT points
                new_boxes[:, :4] = frame_data['boxes'][matches]
                for match_id in range(matches.shape[0]):
                    if cfg.TRACKING.DEBUG.UPPER_BOUND_2_GT_KPS_ONLY_CONF:
                        dims_to_replace = np.array(2)
                    else:
                        dims_to_replace = np.arange(3)
                    new_poses[match_id][dims_to_replace, :] = frame_data[
                        'gt_keypoints'][matches[match_id]][dims_to_replace, :]
            _set_boxes(new_boxes, dets, det_id)
            _set_poses(new_poses, dets, det_id)
        else:
            prev_tracks = video_tracks[frame_id - 1] if frame_id > 0 else None
        if cfg.TRACKING.DEBUG.UPPER_BOUND_5_GT_KPS_ONLY:
            gt_prev_boxes = frame_data['boxes']
            gt_prev_poses = [el for el in frame_data['gt_keypoints']]
            gt_prev_frame_data = {
                'image': img_utils.get_image_path(frame_data)}
            matches_gt = _compute_matches(
                gt_prev_frame_data, frame_data, gt_prev_boxes, cur_boxes,
                gt_prev_poses, cur_poses,
                cost_types=('bbox-overlap', ),
                cost_weights=(1.0, ),
                bipart_match_algo='hungarian')
            # replace the predicted poses
            for match_id in range(matches_gt.shape[0]):
                if matches_gt[match_id] == -1:
                    continue
                cur_poses[match_id][:3, :] = gt_prev_poses[
                    matches_gt[match_id]][:3, :]
        for m in matches:
            if m == -1:  # didn't match to any
                frame_tracks.append(next_track_id)
                next_track_id += 1
                if next_track_id >= MAX_TRACK_IDS:
                    logger.warning('Exceeded max track ids ({}) for {}'.format(
                        MAX_TRACK_IDS, frame_data['image']))
                    next_track_id %= MAX_TRACK_IDS
            else:
                frame_tracks.append(prev_tracks[m])
        video_tracks.append(frame_tracks)
    return video_tracks

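# NOTE: `_compute_matches`, called above, is not included in this listing. A
# minimal sketch of the matching step, assuming the 'hungarian' option and
# reusing `_compute_distance_matrix` from above; the real implementation may
# support other `bipart_match_algo` values (e.g. a greedy variant) and more
# arguments.
from scipy.optimize import linear_sum_assignment


def _compute_matches_sketch(prev_frame_data, cur_frame_data,
                            prev_boxes, cur_boxes, prev_poses, cur_poses,
                            cost_types, cost_weights,
                            bipart_match_algo='hungarian'):
    """Return an array of length len(cur_boxes); entry j is the index of the
    matched box in the previous frame, or -1 if unmatched."""
    matches = -np.ones((cur_boxes.shape[0], ), dtype=np.int32)
    if prev_boxes.shape[0] == 0 or cur_boxes.shape[0] == 0:
        return matches
    assert bipart_match_algo == 'hungarian', 'Only hungarian shown in sketch'
    C = _compute_distance_matrix(
        prev_frame_data, prev_boxes, prev_poses,
        cur_frame_data, cur_boxes, cur_poses,
        cost_types, cost_weights)
    # scipy handles rectangular cost matrices; unmatched columns stay at -1
    prev_inds, cur_inds = linear_sum_assignment(C)
    for p, c in zip(prev_inds, cur_inds):
        matches[c] = p
    return matches
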
def _run_posetrack_eval(roidb, det_file, dataset, output_dir):
    with open(det_file, 'rb') as fin:
        dets = pkl.load(fin)
    assert len(roidb) == len(dets['all_boxes'][1]), \
        'Mismatch {} vs {}'.format(len(roidb), len(dets['all_boxes'][1]))
    gen_utils.mkdir_p(output_dir)
    out_filenames = video2filenames(dataset.annotation_directory)
    out_data = {}  # each video to all predictions
    eval_tracking = False
    if 'all_tracks' in dets:
        eval_tracking = True
    for i, entry in enumerate(roidb):
        image_name = get_image_path(entry)[len(dataset.image_directory):]
        video_name = osp.dirname(image_name)
        frame_num = int(osp.basename(image_name).split('.')[0])
        boxes = dets['all_boxes'][1][i]
        kps = dets['all_keyps'][1][i]
        if eval_tracking:  # means there is an "all_tracks" in the dets
            tracks = dets['all_tracks'][1][i]
        else:
            tracks = [1] * len(kps)
        data_el = {
            'image': image_name,
            'imagenum': [frame_num],
            'annorect': _convert_data_to_annorect_struct(boxes, kps, tracks),
        }
        if video_name in out_data:
            out_data[video_name].append(data_el)
        else:
            out_data[video_name] = [data_el]

    logger.info('Saving the JSON files to {}'.format(output_dir))
    # clear out the previous predictions, if any
    gen_utils.run_cmd('rm -r {}/*'.format(output_dir), print_cmd=False)
    for vname in tqdm(out_data.keys(), desc='Writing JSON files for eval'):
        vdata = out_data[vname]
        outfpath = osp.join(output_dir, out_filenames['images' + vname])
        # original indexing: out_filenames[osp.join('images', vname)]
        with open(outfpath, 'w') as fout:
            json.dump({'annolist': vdata}, fout, indent=4)
    logger.info('Wrote all predictions in JSON to {}'.format(output_dir))
    logger.info('Running dataset level evaluation...')
    st_time = time.time()
    # jianbo: added this line and commented out the following two lines
    score_ap, score_mot, apAll, preAll, recAll, mota = _run_eval(
        dataset.annotation_directory, output_dir, eval_tracking)
    # logger.info(_run_eval(dataset.annotation_directory, output_dir, eval_tracking))
    # logger.info('...Done in {}'.format(time.time() - st_time))
    # TODO(rgirdhar): Do this better
    if cfg.EVAL.EVAL_MPII_PER_VIDEO:  # run the evaluation per-video
        res = []
        logger.info('Running per-video evaluation...')
        st_time = time.time()
        pervid_outpath = osp.join(
            osp.dirname(osp.normpath(output_dir)),
            osp.basename(det_file) + '_per_video_scores.txt')
        # Earlier I used multiprocessing to compute the predictions in
        # parallel, but the eval code itself now uses multiprocessing, so it
        # can not be used here (daemon processes can not spawn children).
        # Hence setting num processes to 0.
        res = map(
            partial(_run_eval_single_video,
                    out_filenames=out_filenames,
                    output_dir=output_dir,
                    dataset=dataset,
                    eval_tracking=eval_tracking),
            out_data.keys())
        logger.info('...Done in {} seconds'.format(time.time() - st_time))
        res = sorted(res, key=lambda x: x[1])  # sort on score
        logger.info('Writing per-video scores to {}'.format(pervid_outpath))
        with open(pervid_outpath, 'w') as fout:
            for el in res:
                fout.write('{} {} {}\n'.format(el[0], el[1], el[2]))
    return score_ap, score_mot, apAll, preAll, recAll, mota

def _run_posetrack_eval(roidb, det_file, dataset, output_dir):
    with open(det_file, 'rb') as fin:
        dets = pkl.load(fin)
    assert len(roidb) == len(dets['all_boxes'][1]), \
        'Mismatch {} vs {}'.format(len(roidb), len(dets['all_boxes'][1]))
    gen_utils.mkdir_p(output_dir)
    out_filenames = video2filenames(dataset.annotation_directory)
    out_data = {}  # each video to all predictions
    eval_tracking = False
    if 'all_tracks' in dets:
        eval_tracking = True
    for i, entry in enumerate(roidb):
        image_name = get_image_path(entry)[len(dataset.image_directory):]
        video_name = osp.dirname(image_name)
        frame_num = int(osp.basename(image_name).split('.')[0])
        boxes = dets['all_boxes'][1][i]
        kps = dets['all_keyps'][1][i]
        if eval_tracking:  # means there is an "all_tracks" in the dets
            tracks = dets['all_tracks'][1][i]
        else:
            tracks = [1] * len(kps)
        data_el = {
            'image': image_name,
            'imagenum': [frame_num],
            'annorect': _convert_data_to_annorect_struct(boxes, kps, tracks),
        }
        if video_name in out_data:
            out_data[video_name].append(data_el)
        else:
            out_data[video_name] = [data_el]

    logger.info('Saving the JSON files to {}'.format(output_dir))
    # clear out the previous predictions, if any
    gen_utils.run_cmd('rm -r {}/*'.format(output_dir), print_cmd=False)
    for vname in tqdm(out_data.keys(), desc='Writing JSON files for eval'):
        vdata = out_data[vname]
        outfpath = osp.join(
            output_dir, out_filenames[osp.join('images', vname)])
        with open(outfpath, 'w') as fout:
            json.dump({'annolist': vdata}, fout)
    logger.info('Wrote all predictions in JSON to {}'.format(output_dir))
    logger.info('Running dataset level evaluation...')
    st_time = time.time()
    logger.info(_run_eval(dataset.annotation_directory, output_dir, eval_tracking))
    logger.info('...Done in {}'.format(time.time() - st_time))
    # TODO(rgirdhar): Do this better
    if cfg.EVAL.EVAL_MPII_PER_VIDEO:  # run the evaluation per-video
        res = []
        logger.info('Running per-video evaluation...')
        st_time = time.time()
        pervid_outpath = osp.join(
            osp.dirname(osp.normpath(output_dir)),
            osp.basename(det_file) + '_per_video_scores.txt')
        # Earlier I used multiprocessing to compute the predictions in
        # parallel, but the eval code itself now uses multiprocessing, so it
        # can not be used here (daemon processes can not spawn children).
        # Hence setting num processes to 0.
        res = map(
            partial(_run_eval_single_video,
                    out_filenames=out_filenames,
                    output_dir=output_dir,
                    dataset=dataset,
                    eval_tracking=eval_tracking),
            out_data.keys())
        logger.info('...Done in {} seconds'.format(time.time() - st_time))
        res = sorted(res, key=lambda x: x[1])  # sort on score
        logger.info('Writing per-video scores to {}'.format(pervid_outpath))
        with open(pervid_outpath, 'w') as fout:
            for el in res:
                fout.write('{} {} {}\n'.format(el[0], el[1], el[2]))

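# NOTE: `_convert_data_to_annorect_struct`, used above, is not part of this
# listing. A minimal sketch of the per-person structure the annolist/annorect
# JSON for MPII/PoseTrack evaluation typically expects, assuming `boxes` rows
# are [x1, y1, x2, y2, score], each entry of `kps` is an array whose rows are
# keypoint x, y and confidence, and `tracks` holds one integer track id per
# detection. The real function likely also filters low-confidence detections
# and keypoints.
def _convert_data_to_annorect_struct_sketch(boxes, kps, tracks):
    annorect = []
    for box, kp, track_id in zip(boxes, kps, tracks):
        points = [{
            'id': [j],
            'x': [float(kp[0, j])],
            'y': [float(kp[1, j])],
            'score': [float(kp[2, j])],
        } for j in range(kp.shape[1])]
        annorect.append({
            'x1': [float(box[0])], 'y1': [float(box[1])],
            'x2': [float(box[2])], 'y2': [float(box[3])],
            'score': [float(box[4])],
            'track_id': [int(track_id)],
            'annopoints': [{'point': points}],
        })
    return annorect
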