Example #1
def scatterplot_recall(tubes_recall,
                       objects_recall,
                       fig_width=13,
                       fig_height=9,
                       exclude_classes=None):
    class_map = utils.class2idx_map()
    AUC = []
    for class_label in range(1, len(class_map)):  # for each class
        # area under the curve
        AUC.append(eval_utils.average_precision(objects_recall[class_label]))
    name = "tab10"
    cmap = get_cmap(name)
    colors = cmap.colors
    fig = plt.figure(figsize=(fig_width, fig_height))
    plt.grid(alpha=0.4)
    plt.xlabel('AUC (Objects Recall)', fontsize=fig_width)
    plt.ylabel('Tubes Recall', fontsize=fig_width)
    fig.axes[0].tick_params(labelsize=fig_width - 2)
    hex_colors = [matplotlib.colors.to_hex(color) for color in colors]
    for idx, class_label in enumerate(range(1, len(class_map))):
        if exclude_classes is not None:
            if utils.idx2class(class_map, class_label) in exclude_classes:
                continue
        plt.scatter(AUC[idx],
                    tubes_recall[idx],
                    c=hex_colors[idx],
                    label=utils.idx2class(class_map, class_label),
                    s=50)
    fig.axes[0].legend(fontsize=fig_width - 3)
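
A minimal usage sketch, assuming scatterplot_recall is importable alongside the get_tubes_recall and get_objects_recall helpers shown in Examples #4 and #5; the config directory and epoch below are placeholders:

import matplotlib.pyplot as plt

config_path = 'results/gcn_run/'   # hypothetical directory containing config.pkl
epoch, split = 100, 'test'

tubes_recall = get_tubes_recall(config_path, epoch, split)      # Example #4
objects_recall = get_objects_recall(config_path, epoch, split)  # Example #5

scatterplot_recall(tubes_recall, objects_recall)
plt.show()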
Example #2
def plot_objects_recall(objects_recall, fig_width=13, fig_height=9):
    class_map = utils.class2idx_map()
    name = "tab10"
    cmap = get_cmap(name)
    colors = cmap.colors
    fig = plt.figure(figsize=(fig_width, fig_height))
    plt.grid(alpha=0.4)
    plt.xlabel('Attention Threshold', fontsize=fig_width)
    plt.ylabel('Recall', fontsize=fig_width)
    plt.xticks(np.linspace(0, 1, len(class_map)))
    plt.yticks(np.linspace(0, 1, len(class_map)))
    fig.axes[0].tick_params(labelsize=fig_width - 2)
    hex_colors = [matplotlib.colors.to_hex(color) for color in colors]
    for idx, class_label in enumerate(objects_recall):
        plt.plot(objects_recall[class_label][:, 1],
                 objects_recall[class_label][:, 0],
                 c=hex_colors[idx],
                 label=utils.idx2class(class_map, class_label),
                 linewidth=2)
    fig.axes[0].legend(fontsize=fig_width - 4)
Example #3
    parser.add_argument('--batch_size', type=int, default=3, help='Batch size')
    parser.add_argument('--gpu_device', type=str, default='0', help='GPU device (number) to use; defaults to 0')
    parser.add_argument('--cpu', action='store_true', help='Whether to use CPU instead of GPU; this option overwrites the --gpu_device argument')
    parser.add_argument('--save_scores', action='store_true', help='Whether to save model scores during inference')
    parser.add_argument('--split', default='test', help='Dataset split; possible values are \'training\', \'validation\', \'test\'')

    args = parser.parse_args()

    with open(os.path.join(args.config_path, 'config.pkl'), 'rb') as f:
        cfg_dict = pickle.load(f)
    if 'merge_function' not in cfg_dict:
        cfg_dict['merge_function'] = 'concat'
    if 'zero_shot' not in cfg_dict:
        cfg_dict['zero_shot'] = False
        cfg_dict['classes_to_exclude'] = None
    if 'num_features_mixed5c' not in cfg_dict:
        cfg_dict['num_features_mixed5c'] = 1024
    #if 'i3d_weights_path' not in cfg_dict:
    #    cfg_dict['i3d_weights_path'] = 'models/'
    cfg_dict['i3d_weights_path'] = 'models/'
    if 'class_map' not in cfg_dict:
        cfg_dict['class_map'] = utils.class2idx_map(cfg_dict['classes_to_exclude'])    

    cfg = config.GetConfig(**cfg_dict)
    
    utils.print_config(cfg)

    dataloader, test_set, model, device = prepare_inference(cfg, args)
    inference(dataloader, test_set, model, device, cfg)

Example #4
def get_tubes_recall(config_path, epoch, split, num_threshold_points=100):
    
    with open(os.path.join(config_path, 'config.pkl'), 'rb') as f:
        cfg_dict = pickle.load(f)    
    
    scores_path = cfg_dict['scores_path']
    model_name = cfg_dict['model_name']
    filename = cfg_dict['filename']
    
    with open(os.path.join(scores_path, model_name, filename, split, 'scores_epoch_' + str(epoch) + '.pkl'), 'rb') as f:
        scores = pickle.load(f)
    scored_tubes = score_tubes(scores, cfg_dict['num_actions'])
    
    with open(os.path.join(cfg_dict['annot_path'], 'annotated_data.pkl'), 'rb') as f:
        annotated_data = pickle.load(f)
    
    classes_to_exclude = cfg_dict['classes_to_exclude']
    class_map = utils.class2idx_map()
    
    # collect tubes with IoU > 0.5
    tubes_dict = {}
    for video in annotated_data[split]:
        vid_annot = annotated_data[split][video]
        w, h = vid_annot['(width, height)']
        for instance in vid_annot['action_instances']:
            instance_annot = annotated_data[split][video]['action_instances'][instance]
            keyframes_dict = instance_annot['keyframes']
            keyframe_ids = np.array(list(keyframes_dict.keys()))
            keyframe_boxes = np.copy(np.stack(list(keyframes_dict.values())))
            keyframe_boxes[:, [0, 2]] = np.copy(keyframe_boxes[:, [0, 2]]) * w
            keyframe_boxes[:, [1, 3]] = np.copy(keyframe_boxes[:, [1, 3]]) * h
            for tube_id in instance_annot['tubes']:
                tube = instance_annot['tubes'][tube_id]
                spt_iou = np.mean(utils.get_tube_iou(tube[np.in1d(tube[:, 0], keyframe_ids), 1:5], keyframe_boxes))
                if spt_iou > 0.5:
                    if video not in tubes_dict:
                        tubes_dict[video] = {}
                    if instance not in tubes_dict[video]:
                        tubes_dict[video][instance] = {}
                        tubes_dict[video][instance]['tubes'] = {}
                        tubes_dict[video][instance]['tube_labels'] = []
                    tubes_dict[video][instance]['tubes'][tube_id] = tube
                    tubes_dict[video][instance]['tube_labels'].append(instance_annot['tube_labels'][tube_id])
    
    tubes_recall = []
    for class_label in range(1, len(class_map)): # for each class
        running_corrects = 0
        running_total = 0
        for video in scored_tubes:
            if video not in tubes_dict:
                continue
            for instance in scored_tubes[video]:
                if instance not in tubes_dict[video]:
                    continue
                tubes_instance = tubes_dict[video][instance]
                assert len(set(tubes_instance['tube_labels'])) == 1
                instance_label = tubes_instance['tube_labels'][0]
                if class_label != instance_label:
                    continue
                tube_ids = np.array(list(tubes_dict[video][instance]['tubes'].keys()))
                predicted_labels = scored_tubes[video][instance][tube_ids, 0]
                gt_labels = np.array(tubes_dict[video][instance]['tube_labels'], dtype=predicted_labels.dtype)
                running_corrects += np.sum(predicted_labels == gt_labels)
                running_total += len(gt_labels)
        tubes_recall.append(running_corrects / running_total)
        
    return tubes_recall
Example #5
def get_objects_recall(config_path, epoch, split, num_threshold_points=100):
    
    with open(os.path.join(config_path, 'config.pkl'), 'rb') as f:
        cfg_dict = pickle.load(f)    
    
    with open(os.path.join(cfg_dict['am_path'], cfg_dict['filename'], split, 'am_epoch_' + str(epoch) + '_keyframes' + '.pkl'), 'rb') as f:
        am = pickle.load(f)
        
    with open(os.path.join(cfg_dict['annot_path'], 'annotated_data.pkl'), 'rb') as f:
        annotated_data = pickle.load(f)
    
    with open(os.path.join(cfg_dict['annot_path'], 'daly1.1.0.pkl'), 'rb') as f:
        annot = pickle.load(f, encoding='latin1')
    
    obj_annotations = utils.get_obj_annotations(annotated_data, annot)
    
    classes_to_exclude = cfg_dict['classes_to_exclude']
    OH = cfg_dict['out_feature_size'][0]
    OW = cfg_dict['out_feature_size'][1]
    T_fm = cfg_dict['out_feature_temp_size']
    class_map = utils.class2idx_map()
    num_layers = cfg_dict['num_layers']
    num_graphs = cfg_dict['num_graphs']
    
    # collect tubes with IoU > 0.5
    tubes_dict = {}
    for video in annotated_data[split]:
        vid_annot = annotated_data[split][video]
        w, h = vid_annot['(width, height)']
        for instance in vid_annot['action_instances']:
            instance_annot = annotated_data[split][video]['action_instances'][instance]
            keyframes_dict = instance_annot['keyframes']
            keyframe_ids = np.array(list(keyframes_dict.keys()))
            keyframe_boxes = np.copy(np.stack(list(keyframes_dict.values())))
            keyframe_boxes[:, [0, 2]] = np.copy(keyframe_boxes[:, [0, 2]]) * w
            keyframe_boxes[:, [1, 3]] = np.copy(keyframe_boxes[:, [1, 3]]) * h
            for tube_id in instance_annot['tubes']:
                tube = instance_annot['tubes'][tube_id]
                spt_iou = np.mean(utils.get_tube_iou(tube[np.in1d(tube[:, 0], keyframe_ids), 1:5], keyframe_boxes))
                if spt_iou > 0.5:
                    if video not in tubes_dict:
                        tubes_dict[video] = {}
                    if instance not in tubes_dict[video]:
                        tubes_dict[video][instance] = {}
                        tubes_dict[video][instance]['tubes'] = {}
                        tubes_dict[video][instance]['tube_labels'] = []
                    tubes_dict[video][instance]['tubes'][tube_id] = tube
                    tubes_dict[video][instance]['tube_labels'].append(instance_annot['tube_labels'][tube_id])
    
    objects_recall = {}
    thresholds = np.linspace(0, 1, num_threshold_points)
    for class_label in range(1, len(class_map)): # recall curve for each class (exclude background)
        if classes_to_exclude is not None:
            # zero-shot setting: only evaluate the classes that were held out during training
            class_name = utils.idx2class(class_map, class_label)
            if class_name not in classes_to_exclude:
                continue
        # calculate total number of false negatives
        fn = 0
        for video in obj_annotations[split]:
            if (video not in am.keys()) or (video not in tubes_dict):
                continue # no object annotations or no positive tubes
            vid_annot = obj_annotations[split][video]
            for instance in vid_annot['action_instances']:
                if instance not in tubes_dict[video]:
                    continue
                tubes_instance = tubes_dict[video][instance]
                assert len(set(tubes_instance['tube_labels'])) == 1
                instance_label = tubes_instance['tube_labels'][0]
                if class_label != instance_label:
                    continue # skip instances of different class
                keyframes = list(vid_annot['action_instances'][instance].keys())
                for keyframe in keyframes:
                    fn_keyframe = 0
                    for box_idx in range(len(vid_annot['action_instances'][instance][keyframe])):
                        if (vid_annot['action_instances'][instance][keyframe][box_idx][5] == 1) or (vid_annot['action_instances'][instance][keyframe][box_idx][6] == 1):
                            continue
                        fn_keyframe += 1
                    fn += fn_keyframe * len(tubes_instance['tube_labels']) # total number of false negatives
        
        recall_values = np.zeros([len(thresholds), 2])
        for idx, threshold in enumerate(thresholds): # for each threshold
            tp = 0
            fn_ = fn
            for video in obj_annotations[split]:
                if (video not in am.keys()) or (video not in tubes_dict): 
                    continue # no object annotations or no positive tubes
                vid_annot = obj_annotations[split][video]
                W, H = obj_annotations[split][video]['(width, height)']
                for instance in vid_annot['action_instances']:
                    if instance not in tubes_dict[video]:
                        continue # no positive tubes
                    assert len(set(tubes_dict[video][instance]['tube_labels'])) == 1
                    instance_label = tubes_dict[video][instance]['tube_labels'][0]
                    if class_label != instance_label:
                        continue # skip instances of different class
                    keyframes = list(vid_annot['action_instances'][instance].keys())
                            
                    for tube_id in tubes_dict[video][instance]['tubes']: # for each (positive) tube                   
                        for keyframe in keyframes: 
                            att_map_list = []
                            for layer_num in range(num_layers):
                                for graph_num in range(num_graphs):
                                    lngn = str(layer_num) + str(graph_num) # layer number and graph number
                                    att_map = am[video][instance][keyframe][lngn][tube_id]
                                    att_map = att_map.reshape(T_fm, OH, OW)[3]
                                    att_map = att_map.reshape(-1)
                                    att_map = scipy.special.softmax(att_map)
                                    att_map = att_map.reshape(OH, OW)
                                    att_map_list.append(att_map)

                            # get obj annotation for keyframe
                            for box_idx in range(len(vid_annot['action_instances'][instance][keyframe])): # for each object annotation in keyframe
                                obj_box = vid_annot['action_instances'][instance][keyframe][box_idx][0:4]
                                obj_box = obj_box * OH  # scale normalized box coords to the OH x OW attention grid (OH == OW here)
                                x1 = int(round(obj_box[0]))
                                y1 = int(round(obj_box[1]))
                                x2 = int(round(obj_box[2]))
                                y2 = int(round(obj_box[3]))
                                sum_list = []
                                att_map_idx = 0
                                for layer_num in range(num_layers):
                                    for graph_num in range(num_graphs):
                                        patch = att_map_list[att_map_idx][y1:y2 + 1, x1:x2 + 1]
                                        att_sum = np.sum(patch) # add attention values inside the object bounding box
                                        sum_list.append(att_sum)
                                        att_map_idx += 1
                                is_positive = any(np.array(sum_list) > threshold) # if any of the graphs satisfies condition
                                if is_positive:
                                    tp += 1
                                    fn_ -= 1

            recall_values[idx, 0] = tp / (tp + fn_)
            recall_values[idx, 1] = threshold
        objects_recall[class_label] = recall_values
    return objects_recall
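
A hedged usage sketch tying this helper to plot_objects_recall from Example #2; the config directory and epoch are placeholders:

import matplotlib.pyplot as plt

objects_recall = get_objects_recall('results/gcn_run/', epoch=100, split='test',
                                    num_threshold_points=100)
plot_objects_recall(objects_recall)  # one recall-vs-threshold curve per class
plt.show()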
Example #6
    def __init__(self, args):

        self.img_size = 224, 224  # size of input frame (h, w)
        self.out_feature_size = 14, 14  # size of output feature map (Mixed_4f) (h, w)
        self.out_feature_temp_size = 8  # temporal output size of feature map (Mixed_4f)
        self.num_person_boxes = 20  # max number of tracks in action instance
        self.num_in_frames = 32  # number of input frames

        self.model_name = args.model_name  # 'baseline' or 'gcn'

        self.data_path = args.data_path  # e.g. 'data/DALY/frames/'
        self.annot_path = args.annot_path
        self.results_path = args.results_path
        self.scores_path = args.scores_path
        self.am_path = 'am/'
        self.features_path = 'extracted_features/'
        self.i3d_weights_path = 'models/'
        self.filename = ''

        if not args.cpu:
            self.use_gpu = True
            self.device_list = args.gpu_device
        else:
            self.use_gpu = False

        self.num_actions = 11  # (10 + Background)

        self.label_tracks = False  # whether to compute track annotations (True), or load them instead (False)

        self.training_batch_size = args.batch_size
        self.validation_batch_size = args.batch_size

        self.momentum = 0.9
        self.weight_decay = 0

        # learning rate with cosine annealing schedule
        # 'warmup' refers to linear warm-up
        self.start_epoch = 0
        self.total_epochs = args.total_epochs  # 450 # 'total_steps' is inferred as (per epoch) 'num_steps' * 'total_epochs'
        self.warmup_epochs = args.warmup_epochs  # 0 # 'warmup_steps' is inferred as (per epoch) 'num_steps' * 'warmup_epochs' (0-indexed)
        self.init_lr = args.init_lr  # 4.5e-6
        self.max_lr = args.max_lr  # 4.7e-5
        self.min_lr = args.min_lr

        # if self.warmup_epochs == 0:
        #     if self.init_lr is not None:
        #         warnings.warn("Warning: {warmup_epochs} is 0, while init_lr is greater than 0.\n Defaulting init_lr to None".format(s))
        # if self.init_lr is None:
        #     if self.warmup_epochs > 0:
        #         warnings.warn("Warning: {warmup_epochs} is 0, while init_lr is greater than 0.\n Defaulting init_lr to None".format(s))
        # raise warning

        # if self.model_name == 'baseline':
        #     self.total_epochs = 150
        #     self.warmup_epochs = 0
        #     self.init_lr = None
        #     self.max_lr = 2.5e-4
        #     self.min_lr = 0

        self.num_features_mixed4f = 832  # number of output channels of Mixed_4f
        self.num_features_mixed5c = 1024  # number of output channels of Mixed_5c
        self.num_features_gcn = 256
        self.crop_size = 7, 7  # output size of RoI Pooling
        self.dropout_prob = 0.5
        self.num_layers = args.num_layers  # number of gcn layers
        self.num_graphs = args.num_graphs  # number of graphs per layer
        self.merge_function = args.merge_function  # function to merge output of multiple graphs in final layer: 'sum' or 'concat'

        if self.model_name == 'baseline':
            self.use_i3d_tail = True

        self.zero_shot = args.zero_shot
        self.classes_to_exclude = None
        if self.model_name == 'gcn':
            if self.zero_shot:
                self.classes_to_exclude = [
                    'Ironing', 'TakingPhotosOrVideos'
                ]  # classes to exclude during training
                self.num_actions = self.num_actions - len(
                    self.classes_to_exclude)
        elif self.model_name == 'baseline':
            self.zero_shot = False

        self.class_map = utils.class2idx_map(self.classes_to_exclude)

        #self.save_log = True # whether to save the model, state_dict, and loss every x-number of epochs
        self.set_bn_eval = False  # If set to True, freeze batch normalization layers
        self.save_scores = args.save_scores  # whether to save output (softmax) scores every x-number of epochs
        self.save_am = False  # whether to save the adjacency matrix of every clip
        if self.model_name == 'baseline':
            self.save_am = False
        self.plot_grad_flow = False
        self.num_epochs_to_val = args.num_epochs_to_val

        self.resume_training = args.resume_training  # Load weights from checkpoint to resume training
        if self.resume_training:
            self.checkpoint_path = args.checkpoint_path
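
For reference, a minimal sketch of the learning-rate schedule that the fields above describe (linear warm-up from init_lr to max_lr over warmup_epochs, then cosine annealing down to min_lr over the remaining epochs). This is an assumption based on the comments in the config, not the project's actual scheduler implementation:

import math

def lr_at_step(step, num_steps, warmup_epochs, total_epochs, init_lr, max_lr, min_lr):
    # 'warmup_steps' and 'total_steps' are inferred from the per-epoch step count,
    # mirroring the comments on warmup_epochs / total_epochs above.
    warmup_steps = num_steps * warmup_epochs
    total_steps = num_steps * total_epochs
    if warmup_steps > 0 and step < warmup_steps:
        # linear warm-up from init_lr to max_lr
        return init_lr + (max_lr - init_lr) * step / warmup_steps
    # cosine annealing from max_lr down to min_lr
    progress = (step - warmup_steps) / max(1, total_steps - warmup_steps)
    return min_lr + 0.5 * (max_lr - min_lr) * (1 + math.cos(math.pi * progress))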
Example #7
def plot_tSNE(config_path,
              epoch,
              split,
              plot_type='actions',
              fig_width=14,
              fig_height=9):

    with open(os.path.join(config_path, 'config.pkl'), 'rb') as f:
        cfg_dict = pickle.load(f)
    with open(
            os.path.join(cfg_dict['features_path'], cfg_dict['filename'],
                         split, 'features_epoch_' + str(epoch) + '.pkl'),
            'rb') as f:
        actor_and_obj_features = pickle.load(f)
    with open(os.path.join(cfg_dict['annot_path'], 'daly1.1.0.pkl'),
              'rb') as f:
        annot = pickle.load(f, encoding='latin1')

    filename = cfg_dict['filename']
    if plot_type == 'actions':
        class_map = utils.class2idx_map(cfg_dict['classes_to_exclude'])
        action_list = list(class_map.keys())
        action_list.remove('Background')
        palette = "tab10"
        cmap = get_cmap(palette)
        colors = cmap.colors
        # total of 10 colors
        hex_colors = [matplotlib.colors.to_hex(color) for color in colors]
    elif plot_type == 'objects':
        obj_list = annot['objectList']
        palette = "Paired"
        cmap = get_cmap(palette)
        colors = cmap.colors
        # total of 14 colors
        hex_colors = [matplotlib.colors.to_hex(color) for color in colors]
        hex_colors.append(matplotlib.colors.to_hex('black'))
        hex_colors.append(matplotlib.colors.to_hex('grey'))

    features_dict = {}
    names_dict = {}
    for graph_num in range(len(actor_and_obj_features)):
        features_dict[graph_num] = []
        names_dict[graph_num] = []
    for graph_num in range(len(actor_and_obj_features)):
        for features in actor_and_obj_features[graph_num]:
            name = features[1]
            if (plot_type == 'actions') and (name not in action_list):
                continue
            if (plot_type == 'objects') and (name not in obj_list):
                continue
            features_dict[graph_num].append(features[0])
            names_dict[graph_num].append(features[1])

    for graph_num in features_dict:
        fig = plt.figure(figsize=(fig_width, fig_height))
        features = np.vstack(features_dict[graph_num])
        names = np.array(names_dict[graph_num])
        if plot_type == 'objects':
            # collect names of 14 most frequent objects
            obj_freqs = []
            for obj in obj_list:
                idxs = np.where(names == obj)[0]
                obj_freqs.append((obj, len(idxs)))
            most_freq_obj = sorted(obj_freqs,
                                   key=lambda tup: tup[1],
                                   reverse=True)
            most_freq_obj = most_freq_obj[0:14]
            most_freq_obj = [obj for obj, _ in most_freq_obj]
            # collect features of 14 most frequent objects
            idxs = np.in1d(names, most_freq_obj)
            features = features[idxs, :]
            names = names[idxs]
        np.random.seed(5186)
        print('Fitting t-SNE for graph', str(graph_num) + '...')
        features_emb = TSNE(n_components=2,
                            perplexity=30.0,
                            n_iter=1000,
                            learning_rate=200).fit_transform(features)
        if plot_type == 'actions':
            for idx, action in enumerate(action_list):
                idxs = np.where(names == action)[0]
                plt.scatter(features_emb[idxs, 0],
                            features_emb[idxs, 1],
                            s=13,
                            c=hex_colors[idx],
                            label=action)
        elif plot_type == 'objects':
            for idx, obj in enumerate(most_freq_obj):
                idxs = np.where(names == obj)[0]
                plt.scatter(features_emb[idxs, 0],
                            features_emb[idxs, 1],
                            s=13,
                            c=hex_colors[idx],
                            label=obj)
        if graph_num == 0:
            plt.legend()
        plt.show()
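
A hedged usage sketch; the config directory and epoch are placeholders:

plot_tSNE('results/gcn_run/', epoch=100, split='test', plot_type='objects')

One t-SNE figure is produced per graph stored in the saved features file.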
Example #8
def extract_features(dataloader, dataset, model, device, annot_data, annot):
    
    model.eval()
    
    obj_annotations = utils.get_obj_annotations(annot_data, annot)
    class_map = utils.class2idx_map(classes_to_exclude=None)
    features_dict = {}
    
    for idx, batch_data in enumerate(dataloader):
        
        imgs = batch_data[0]
        person_boxes = batch_data[1]
        action_labels = batch_data[2]
        num_boxes_per_frame = batch_data[3]
        video_names = batch_data[4]
        instances = batch_data[5]
        center_frames = batch_data[6]
        
        video_name = video_names[0]
        instance = instances[0].item()
        keyframe = center_frames[0].item()
            
        if video_name not in obj_annotations[dataset.split]:
            continue
        if instance not in obj_annotations[dataset.split][video_name]['action_instances']:
            continue
        if keyframe not in obj_annotations[dataset.split][video_name]['action_instances'][instance]:
            continue
       
        num_actors_list = [num_boxes_per_frame[b][15].item() for b in range(imgs.shape[0])]  # boxes at frame index 15, roughly the center of the 32-frame clip
        
        batch = [data.to(device=device) for data in [imgs, person_boxes]]
        batch.append(num_boxes_per_frame)
        batch.append(action_labels)

        with torch.set_grad_enabled(False):
            actor_features_emb_list, context_features_list = model(batch, 'return_features')
        
        for graph_num in range(len(actor_features_emb_list)):
            if graph_num not in features_dict:
                features_dict[graph_num] = []
            actor_features_emb = actor_features_emb_list[graph_num].detach().cpu().numpy()    
            tube_labels = np.copy(annot_data[dataset.split][video_name]['action_instances'][instance]['tube_labels'])
            for tube_id, tube_label in enumerate(tube_labels):
                if tube_label > 0: # not background
                    features_dict[graph_num].append([actor_features_emb[tube_id, :], utils.idx2class(class_map, tube_label)])
        
        vid_annot = obj_annotations[dataset.split][video_name]
        for box_idx in range(len(vid_annot['action_instances'][instance][keyframe])):
            obj_box = vid_annot['action_instances'][instance][keyframe][box_idx][0:4]
            obj_box = obj_box * 14  # scale normalized box coords to the 14x14 feature map
            x1 = int(round(obj_box[0]))
            y1 = int(round(obj_box[1]))
            x2 = int(round(obj_box[2]))
            y2 = int(round(obj_box[3]))
            if x1 == x2:
                x1 = int(np.floor(obj_box[0]))
                x2 = int(np.ceil(obj_box[2]))
            if y1 == y2:
                y1 = int(np.floor(obj_box[1]))
                y2 = int(np.ceil(obj_box[3]))
            for graph_num in range(len(context_features_list)):
                obj_features = context_features_list[graph_num][0, :, 3, y1:y2 + 1, x1:x2 + 1].detach().cpu().numpy()
                obj_features = np.mean(obj_features, axis=(1, 2))
                obj_id = int(vid_annot['action_instances'][instance][keyframe][box_idx][4])
                obj_name = annot['objectList'][obj_id]
                features_dict[graph_num].append([obj_features, obj_name])
    
    return features_dict
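
A hedged sketch of persisting the returned dictionary so that plot_tSNE (Example #7) can read it back; the directory layout mirrors the path plot_tSNE loads from, and cfg, epoch, split, dataloader, test_set, model, device, annotated_data, and annot are assumed to come from the surrounding inference code:

import os
import pickle

features_dict = extract_features(dataloader, test_set, model, device, annotated_data, annot)

# Save under <features_path>/<filename>/<split>/features_epoch_<epoch>.pkl,
# matching the path that plot_tSNE opens.
out_dir = os.path.join(cfg.features_path, cfg.filename, split)
os.makedirs(out_dir, exist_ok=True)
with open(os.path.join(out_dir, 'features_epoch_' + str(epoch) + '.pkl'), 'wb') as f:
    pickle.dump(features_dict, f)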