# Evaluation and visualization utilities: object/tube recall from attention maps,
# video mAP, attention-map plotting, and feature extraction.

import os
import pickle

import cv2
import matplotlib
import matplotlib.pyplot as plt
import numpy as np
import scipy.special
import torch
from matplotlib.cm import get_cmap

# Project-local modules (assumed to live alongside this file): utils provides the class map,
# IoU, and annotation helpers; eval_utils is assumed to provide average_precision and the tube
# scoring/matching helpers (score_tubes, get_tube_predictions, get_gt_tubes) used below.
import utils
import eval_utils
from eval_utils import average_precision, get_gt_tubes, get_tube_predictions, score_tubes


def scatterplot_recall(tubes_recall, objects_recall, fig_width=13, fig_height=9, exclude_classes=None):
    class_map = utils.class2idx_map()
    AUC = []
    for class_label in range(1, len(class_map)):  # for each class
        # area under the recall curve
        AUC.append(eval_utils.average_precision(objects_recall[class_label]))
    name = "tab10"
    cmap = get_cmap(name)
    colors = cmap.colors
    fig = plt.figure(figsize=(fig_width, fig_height))
    plt.grid(alpha=0.4)
    plt.xlabel('AUC (Objects Recall)', fontsize=fig_width)
    plt.ylabel('Tubes Recall', fontsize=fig_width)
    fig.axes[0].tick_params(labelsize=fig_width - 2)
    hex_colors = [matplotlib.colors.to_hex(color) for color in colors]
    for idx, class_label in enumerate(range(1, len(class_map))):
        if exclude_classes is not None:
            if utils.idx2class(class_map, class_label) in exclude_classes:
                continue
        plt.scatter(AUC[idx], tubes_recall[idx], c=hex_colors[idx],
                    label=utils.idx2class(class_map, class_label), s=50)
    fig.axes[0].legend(fontsize=fig_width - 3)
def plot_objects_recall(objects_recall, fig_width=13, fig_height=9):
    class_map = utils.class2idx_map()
    name = "tab10"
    cmap = get_cmap(name)
    colors = cmap.colors
    fig = plt.figure(figsize=(fig_width, fig_height))
    plt.grid(alpha=0.4)
    plt.xlabel('Attention Threshold', fontsize=fig_width)
    plt.ylabel('Recall', fontsize=fig_width)
    plt.xticks(np.linspace(0, 1, len(class_map)))
    plt.yticks(np.linspace(0, 1, len(class_map)))
    fig.axes[0].tick_params(labelsize=fig_width - 2)
    hex_colors = [matplotlib.colors.to_hex(color) for color in colors]
    for idx, class_label in enumerate(objects_recall):
        plt.plot(objects_recall[class_label][:, 1], objects_recall[class_label][:, 0],
                 c=hex_colors[idx], label=utils.idx2class(class_map, class_label), linewidth=2)
    fig.axes[0].legend(fontsize=fig_width - 4)
def get_objects_recall(config_path, epoch, split, num_threshold_points=100):
    with open(os.path.join(config_path, 'config.pkl'), 'rb') as f:
        cfg_dict = pickle.load(f)
    with open(os.path.join(cfg_dict['am_path'], cfg_dict['filename'], split,
                           'am_epoch_' + str(epoch) + '_keyframes' + '.pkl'), 'rb') as f:
        am = pickle.load(f)
    with open(os.path.join(cfg_dict['annot_path'], 'annotated_data.pkl'), 'rb') as f:
        annotated_data = pickle.load(f)
    with open(os.path.join(cfg_dict['annot_path'], 'daly1.1.0.pkl'), 'rb') as f:
        annot = pickle.load(f, encoding='latin1')
    obj_annotations = utils.get_obj_annotations(annotated_data, annot)
    classes_to_exclude = cfg_dict['classes_to_exclude']
    OH = cfg_dict['out_feature_size'][0]
    OW = cfg_dict['out_feature_size'][1]
    T_fm = cfg_dict['out_feature_temp_size']
    class_map = utils.class2idx_map()
    num_layers = cfg_dict['num_layers']
    num_graphs = cfg_dict['num_graphs']

    # collect tubes with IoU > 0.5
    tubes_dict = {}
    for video in annotated_data[split]:
        vid_annot = annotated_data[split][video]
        w, h = vid_annot['(width, height)']
        for instance in vid_annot['action_instances']:
            instance_annot = annotated_data[split][video]['action_instances'][instance]
            keyframes_dict = instance_annot['keyframes']
            keyframe_ids = np.array(list(keyframes_dict.keys()))
            keyframe_boxes = np.copy(np.stack(list(keyframes_dict.values())))
            keyframe_boxes[:, [0, 2]] = np.copy(keyframe_boxes[:, [0, 2]]) * w
            keyframe_boxes[:, [1, 3]] = np.copy(keyframe_boxes[:, [1, 3]]) * h
            for tube_id in instance_annot['tubes']:
                tube = instance_annot['tubes'][tube_id]
                spt_iou = np.mean(utils.get_tube_iou(
                    tube[np.in1d(tube[:, 0], keyframe_ids), 1:5], keyframe_boxes))
                if spt_iou > 0.5:
                    if video not in tubes_dict:
                        tubes_dict[video] = {}
                    if instance not in tubes_dict[video]:
                        tubes_dict[video][instance] = {}
                        tubes_dict[video][instance]['tubes'] = {}
                        tubes_dict[video][instance]['tube_labels'] = []
                    tubes_dict[video][instance]['tubes'][tube_id] = tube
                    tubes_dict[video][instance]['tube_labels'].append(instance_annot['tube_labels'][tube_id])

    objects_recall = {}
    thresholds = np.linspace(0, 1, num_threshold_points)
    for class_label in range(1, len(class_map)):  # recall curve for each class (exclude background)
        if classes_to_exclude is not None:
            # when classes_to_exclude is set in the config, only those classes are evaluated here
            class_name = utils.idx2class(class_map, class_label)
            if class_name not in classes_to_exclude:
                continue

        # calculate total number of false negatives
        fn = 0
        for video in obj_annotations[split]:
            if (video not in am.keys()) or (video not in tubes_dict):
                continue  # no object annotations or no positive tubes
            vid_annot = obj_annotations[split][video]
            for instance in vid_annot['action_instances']:
                if instance not in tubes_dict[video]:
                    continue
                tubes_instance = tubes_dict[video][instance]
                assert len(set(tubes_instance['tube_labels'])) == 1
                instance_label = tubes_instance['tube_labels'][0]
                if class_label != instance_label:
                    continue  # skip instances of different class
                keyframes = list(vid_annot['action_instances'][instance].keys())
                for keyframe in keyframes:
                    fn_keyframe = 0
                    for box_idx in range(len(vid_annot['action_instances'][instance][keyframe])):
                        if (vid_annot['action_instances'][instance][keyframe][box_idx][5] == 1) or \
                                (vid_annot['action_instances'][instance][keyframe][box_idx][6] == 1):
                            continue
                        fn_keyframe += 1
                    fn += fn_keyframe * len(tubes_instance['tube_labels'])  # total number of false negatives

        recall_values = np.zeros([len(thresholds), 2])
        for idx, threshold in enumerate(thresholds):  # for each threshold
            tp = 0
            fn_ = fn
            for video in obj_annotations[split]:
                if (video not in am.keys()) or (video not in tubes_dict):
                    continue  # no object annotations or no positive tubes
                vid_annot = obj_annotations[split][video]
                W, H = obj_annotations[split][video]['(width, height)']
                for instance in vid_annot['action_instances']:
                    if instance not in tubes_dict[video]:
                        continue  # no positive tubes
                    assert len(set(tubes_dict[video][instance]['tube_labels'])) == 1
                    instance_label = tubes_dict[video][instance]['tube_labels'][0]
                    if class_label != instance_label:
                        continue  # skip instances of different class
                    keyframes = list(vid_annot['action_instances'][instance].keys())
                    for tube_id in tubes_dict[video][instance]['tubes']:  # for each (positive) tube
                        for keyframe in keyframes:
                            att_map_list = []
                            for layer_num in range(num_layers):
                                for graph_num in range(num_graphs):
                                    lngn = str(layer_num) + str(graph_num)  # layer number and graph number
                                    att_map = am[video][instance][keyframe][lngn][tube_id]
                                    att_map = att_map.reshape(T_fm, OH, OW)[3]  # fixed temporal slice (index 3)
                                    att_map = att_map.reshape(-1)
                                    att_map = scipy.special.softmax(att_map)
                                    att_map = att_map.reshape(OH, OW)
                                    att_map_list.append(att_map)
                            # get obj annotation for keyframe
                            for box_idx in range(len(vid_annot['action_instances'][instance][keyframe])):
                                # for each object annotation in keyframe
                                obj_box = vid_annot['action_instances'][instance][keyframe][box_idx][0:4]
                                obj_box = obj_box * OH
                                x1 = int(round(obj_box[0]))
                                y1 = int(round(obj_box[1]))
                                x2 = int(round(obj_box[2]))
                                y2 = int(round(obj_box[3]))
                                sum_list = []
                                att_map_idx = 0
                                for layer_num in range(num_layers):
                                    for graph_num in range(num_graphs):
                                        patch = att_map_list[att_map_idx][y1:y2 + 1, x1:x2 + 1]
                                        att_sum = np.sum(patch)  # add attention values inside the object bounding box
                                        sum_list.append(att_sum)
                                        att_map_idx += 1
                                is_positive = any(np.array(sum_list) > threshold)  # if any of the graphs satisfies condition
                                if is_positive:
                                    tp += 1
                                    fn_ -= 1
            recall_values[idx, 0] = tp / (tp + fn_)
            recall_values[idx, 1] = threshold
        objects_recall[class_label] = recall_values
    return objects_recall
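# Illustrative usage sketch (not part of the original pipeline): chains the recall utilities
# above. 'config_path', 'epoch', and 'tubes_recall' are placeholder inputs; the per-class
# tubes_recall values are assumed to be computed elsewhere.
def example_recall_plots(config_path, epoch, tubes_recall, split='test'):
    objects_recall = get_objects_recall(config_path, epoch, split, num_threshold_points=100)
    plot_objects_recall(objects_recall)  # recall vs. attention threshold, one curve per class
    scatterplot_recall(tubes_recall, objects_recall)  # per-class tubes recall vs. AUC of the object-recall curve
    plt.show()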
def videomAP(scores, annot_data, split, cfg, iou_threshold=0.5):
    PR = {}
    scored_tubes = score_tubes(scores, cfg.num_actions)
    pred_tubes = get_tube_predictions(scored_tubes, annot_data, split, cfg.num_actions, nms_threshold=0.2)
    gt_tubes = get_gt_tubes(annot_data, split, list(scored_tubes.keys()), cfg)
    for class_label in range(1, cfg.num_actions):
        class_pred_tubes = pred_tubes[class_label]
        class_gt_tubes = gt_tubes[class_label]
        pr = np.empty((len(class_pred_tubes) + 1, 2), dtype=np.float32)
        pr[0, 0] = 1.0
        pr[0, 1] = 0.0
        fp = 0
        tp = 0
        fn = 0
        covered_gt_tubes = {}
        for video in class_gt_tubes:
            covered_gt_tubes[video] = {}
            instances = class_gt_tubes[video]
            for instance in instances:
                num_gt_tubes = len(class_gt_tubes[video][instance])
                covered_gt_tubes[video][instance] = num_gt_tubes * [0]
                fn += num_gt_tubes
        for i, j in enumerate(np.argsort(-np.array([pred_tube[2] for pred_tube in class_pred_tubes]))):
            video, instance, score, tube_id, tube = class_pred_tubes[j]
            is_positive = False
            if video in class_gt_tubes:
                if instance in class_gt_tubes[video]:
                    gt_kf_tubes = class_gt_tubes[video][instance]
                    ious = []
                    for gt_tube in gt_kf_tubes:
                        keyframes = gt_tube[:, 0]
                        ious.append(np.mean(utils.get_tube_iou(
                            tube[np.in1d(tube[:, 0], keyframes), 1:5], gt_tube[:, 1:5])))
                    amax = np.argmax(ious)
                    if ious[amax] >= iou_threshold:
                        if covered_gt_tubes[video][instance][amax] == 0:
                            is_positive = True
                            covered_gt_tubes[video][instance][amax] = 1
            if is_positive:
                tp += 1
                fn -= 1
            else:
                fp += 1
            pr[i + 1, 0] = tp / (tp + fp)
            pr[i + 1, 1] = tp / (tp + fn)
        PR[utils.idx2class(cfg.class_map, class_label)] = pr
    AP = {class_name: 100 * average_precision(PR[class_name]) for class_name in PR}
    mAP = sum(list(AP.values())) / len(AP)
    return mAP, AP
def plot_attention_maps(config_path, epoch, pred_type, size=5, split='test', fig_width=15, fig_height=25):
    with open(os.path.join(config_path, 'config.pkl'), 'rb') as f:
        cfg_dict = pickle.load(f)
    am_path = cfg_dict['am_path']
    filename = cfg_dict['filename']
    annot_path = cfg_dict['annot_path']
    class_map = cfg_dict['class_map']
    T_fm = cfg_dict['out_feature_temp_size']
    OH, OW = cfg_dict['out_feature_size']
    num_layers = cfg_dict['num_layers']
    num_graphs = cfg_dict['num_graphs']
    h, w = cfg_dict['img_size']
    data_path = cfg_dict['data_path']
    with open(os.path.join(annot_path, 'annotated_data.pkl'), 'rb') as f:
        annotated_data = pickle.load(f)
    with open(os.path.join(am_path, filename, split, 'am_epoch_' + str(epoch) + '.pkl'), 'rb') as f:
        am = pickle.load(f)
    for class_label in range(1, cfg_dict['num_actions']):
        print(utils.idx2class(class_map, class_label))
        print(15 * '-')
        class_preds = pred_type[class_label]
        num_preds = len(class_preds)
        if num_preds == 0:
            print()
            continue
        if num_preds > size:
            sample = np.random.choice(num_preds, size=size, replace=False)
        else:
            sample = np.random.choice(num_preds, size=num_preds, replace=False)
        preds = [class_preds[s] for s in sample]
        for idx, pred in enumerate(preds):
            video, instance, score, tube_id, tube = pred
            tbound = annotated_data[split][video]['action_instances'][instance]['tbound']
            W, H = annotated_data[split][video]['(width, height)']
            tube_label = annotated_data[split][video]['action_instances'][instance]['tube_labels'][tube_id]
            center_frame = int(np.random.choice(list(am[video][instance].keys()), 1))  # sample center frame
            frame_seq = np.array([frame_num for frame_num in range(center_frame - 15, center_frame + 16 + 1)])
            frames_in_tbound = []
            for frame_num in frame_seq:
                if (frame_num > tbound[0]) and (frame_num < tbound[-1]):
                    frames_in_tbound.append(frame_num)
            clip = []
            for frame_num in frame_seq:
                if frame_num in frames_in_tbound:
                    clip.append(frame_num)
                else:
                    if frame_num < center_frame:
                        clip.append(frames_in_tbound[0])
                    else:
                        clip.append(frames_in_tbound[-1])
            clip = clip[3::4]  # get every 4th frame of clip
            print('Prediction:', utils.idx2class(class_map, class_label), '|',
                  'Ground Truth:', utils.idx2class(class_map, tube_label))
            fig, ax = plt.subplots(T_fm, num_layers * num_graphs + 1, figsize=(fig_width, fig_height))
            for t in range(T_fm):
                frame_num = clip[t]
                #frame_num = center_frame
                boxes = np.copy(annotated_data[split][video]['action_instances'][instance]['tubes'][tube_id])
                box = boxes[np.where(boxes[:, 0] == frame_num)[0], 1:5][0]
                box[[0, 2]] = (box[[0, 2]] / W) * w
                box[[1, 3]] = (box[[1, 3]] / H) * h
                img = plt.imread(os.path.join(data_path, video, 'frame' + str(frame_num).zfill(6) + '.jpg'))
                rect = plt.Rectangle((box[0], box[1]), box[2] - box[0], box[3] - box[1],
                                     fill=False, edgecolor='r', linewidth=1)
                ax[t][0].add_patch(rect)
                ax[t][0].imshow(img)
                ax_idx = 0
                for j in range(num_layers):
                    for i in range(num_graphs):
                        ax_idx += 1
                        img2 = plt.imread(os.path.join(data_path, video,
                                                       'frame' + str(frame_num).zfill(6) + '.jpg'))
                        lngn = str(j) + str(i)
                        att_map = scipy.special.softmax(np.copy(am[video][instance][center_frame][lngn][tube_id]))
                        att_map = att_map.reshape(T_fm, OH, OW)[t]  # temporal attention maps
                        #att_map = np.mean(att_map.reshape(T_fm, OH, OW), axis=0)  # temporal average attention map
                        res = cv2.resize(att_map, dsize=(w, h), interpolation=cv2.INTER_CUBIC)
                        extent = 0, w, 0, h
                        ax[t][ax_idx].imshow(img2, extent=extent)
                        ax[t][ax_idx].imshow(res, alpha=0.5, cmap='Reds', extent=extent)
            plt.pause(.5)
        print('===' * 20)
        print()
        print()
def extract_features(dataloader, dataset, model, device, annot_data, annot):
    model.eval()
    obj_annotations = utils.get_obj_annotations(annot_data, annot)
    class_map = utils.class2idx_map(classes_to_exclude=None)
    features_dict = {}
    for idx, batch_data in enumerate(dataloader):
        # unpack a single-sample batch
        imgs = batch_data[0]
        person_boxes = batch_data[1]
        action_labels = batch_data[2]
        num_boxes_per_frame = batch_data[3]
        video_names = batch_data[4]
        instances = batch_data[5]
        center_frames = batch_data[6]
        video_name = video_names[0]
        instance = instances[0].item()
        keyframe = center_frames[0].item()
        # skip samples without object annotations at this keyframe
        if video_name not in obj_annotations[dataset.split]:
            continue
        if instance not in obj_annotations[dataset.split][video_name]['action_instances']:
            continue
        if keyframe not in obj_annotations[dataset.split][video_name]['action_instances'][instance]:
            continue
        num_actors_list = [num_boxes_per_frame[b][15].item() for b in range(imgs.shape[0])]
        batch = [data.to(device=device) for data in [imgs, person_boxes]]
        batch.append(num_boxes_per_frame)
        batch.append(action_labels)
        # forward pass without gradients to get per-graph actor embeddings and context feature maps
        with torch.set_grad_enabled(False):
            actor_features_emb_list, context_features_list = model(batch, 'return_features')
        for graph_num in range(len(actor_features_emb_list)):
            if graph_num not in features_dict:
                features_dict[graph_num] = []
            actor_features_emb = actor_features_emb_list[graph_num].detach().cpu().numpy()
            tube_labels = np.copy(annot_data[dataset.split][video_name]['action_instances'][instance]['tube_labels'])
            for tube_id, tube_label in enumerate(tube_labels):
                if tube_label > 0:  # not background
                    features_dict[graph_num].append([actor_features_emb[tube_id, :],
                                                     utils.idx2class(class_map, tube_label)])
        vid_annot = obj_annotations[dataset.split][video_name]
        for box_idx in range(len(vid_annot['action_instances'][instance][keyframe])):
            obj_box = vid_annot['action_instances'][instance][keyframe][box_idx][0:4]
            obj_box = obj_box * 14  # scale the normalized box to the spatial feature grid
            x1 = int(round(obj_box[0]))
            y1 = int(round(obj_box[1]))
            x2 = int(round(obj_box[2]))
            y2 = int(round(obj_box[3]))
            # ensure the box covers at least one feature cell
            if x1 == x2:
                x1 = int(np.floor(obj_box[0]))
                x2 = int(np.ceil(obj_box[2]))
            if y1 == y2:
                y1 = int(np.floor(obj_box[1]))
                y2 = int(np.ceil(obj_box[3]))
            for graph_num in range(len(context_features_list)):
                # mean-pool the context features inside the object box
                obj_features = context_features_list[graph_num][0, :, 3, y1:y2 + 1, x1:x2 + 1].detach().cpu().numpy()
                obj_features = np.mean(obj_features, axis=(1, 2))
                obj_id = int(vid_annot['action_instances'][instance][keyframe][box_idx][4])
                obj_name = annot['objectList'][obj_id]
                features_dict[graph_num].append([obj_features, obj_name])
    return features_dict
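# Illustrative sketch (an assumption, not part of the original file): per graph index,
# extract_features returns a list of [feature_vector, label] pairs mixing actor-tube
# embeddings and pooled object features. A minimal inspection loop might look like this:
def summarize_features(features_dict):
    for graph_num, entries in features_dict.items():
        labels = [label for _, label in entries]
        dims = {feat.shape[0] for feat, _ in entries}
        print('graph', graph_num, '|', len(entries), 'feature vectors',
              '| dims:', sorted(dims), '| labels:', sorted(set(labels)))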