Example #1
def validate(args, val_dataloader, nets, iteration=0, iou_thresh=0.5):
    """
    Test the model on the validation set
    """

    # write results to files for evaluation
    output_files = []
    fouts = []
    for i in range(args.max_iter):
        output_file = args.save_root + 'val_result-' + str(
            iteration) + '-iter' + str(i + 1) + '.csv'
        output_files.append(output_file)
        f = open(output_file, 'w')
        fouts.append(f)

    gt_file = args.save_root + 'val_gt.csv'
    fout = open(gt_file, 'w')

    with torch.no_grad():  # for evaluation
        for num, (images, targets, tubes, infos) in enumerate(val_dataloader):

            if (num + 1) % 100 == 0:
                print("%d / %d" % (num + 1, len(val_dataloader)))

            for b in range(len(infos)):
                for n in range(len(infos[b]['boxes'])):
                    mid = int(len(infos[b]['boxes'][n]) / 2)
                    box = infos[b]['boxes'][n][mid]
                    labels = infos[b]['labels'][n][mid]
                    for label in labels:
                        fout.write(
                            '{0},{1:04},{2:.4},{3:.4},{4:.4},{5:.4},{6}\n'.
                            format(infos[b]['video_name'], infos[b]['fid'],
                                   box[0], box[1], box[2], box[3], label))

            _, _, channels, height, width = images.size()
            images = images.cuda()

            # get conv features
            conv_feat = nets['base_net'](images)
            context_feat = None
            if not args.no_context:
                context_feat = nets['context_net'](conv_feat)

            ############## Inference ##############

            history, _ = inference(args, conv_feat, context_feat, nets,
                                   args.max_iter, tubes)

            #################### Evaluation #################

            # loop over each iteration
            for i in range(len(history)):
                pred_prob = history[i]['pred_prob'].cpu()
                pred_prob = pred_prob[:, int(pred_prob.shape[1] / 2)]
                pred_tubes = history[i]['pred_loc'].cpu()
                pred_tubes = pred_tubes[:, int(pred_tubes.shape[1] / 2)]
                tubes_nums = history[i]['tubes_nums']

                # loop for each sample in a batch
                tubes_count = 0
                for b in range(len(tubes_nums)):
                    info = infos[b]
                    seq_start = tubes_count
                    tubes_count = tubes_count + tubes_nums[b]

                    cur_pred_prob = pred_prob[seq_start:seq_start +
                                              tubes_nums[b]]
                    cur_pred_tubes = pred_tubes[seq_start:seq_start +
                                                tubes_nums[b]]

                    # do NMS first
                    all_scores = []
                    all_boxes = []
                    all_idx = []
                    for cl_ind in range(args.num_classes):
                        scores = cur_pred_prob[:, cl_ind].squeeze().reshape(-1)
                        c_mask = scores.gt(
                            args.conf_thresh)  # greater than minimum threshold
                        scores = scores[c_mask]
                        idx = np.where(c_mask.numpy())[0]
                        if len(scores) == 0:
                            all_scores.append([])
                            all_boxes.append([])
                            all_idx.append([])  # keep lists aligned by class
                            continue
                        boxes = cur_pred_tubes.clone()
                        l_mask = c_mask.unsqueeze(1).expand_as(boxes)
                        boxes = boxes[l_mask].view(-1, 4)

                        boxes = valid_tubes(boxes.view(-1, 1, 4)).view(-1, 4)
                        keep = nms(boxes, scores, args.nms_thresh)
                        boxes = boxes[keep].numpy()
                        scores = scores[keep].numpy()
                        idx = idx[keep]

                        boxes[:, ::2] /= width
                        boxes[:, 1::2] /= height
                        all_scores.append(scores)
                        all_boxes.append(boxes)
                        all_idx.append(idx)

                    # get the top scores
                    scores_list = [(s, cl_ind, j)
                                   for cl_ind, scores in enumerate(all_scores)
                                   for j, s in enumerate(scores)]
                    if args.evaluate_topk > 0:
                        scores_list.sort(key=lambda x: x[0])
                        scores_list = scores_list[::-1]
                        scores_list = scores_list[:args.topk]

                    for s, cl_ind, j in scores_list:
                        # write to files (label_dict, defined elsewhere in the
                        # script, maps a class index to its AVA label id)
                        box = all_boxes[cl_ind][j]
                        fouts[i].write(
                            '{0},{1:04},{2:.4},{3:.4},{4:.4},{5:.4},{6},{7:.4}\n'
                            .format(info['video_name'], info['fid'], box[0],
                                    box[1], box[2], box[3], label_dict[cl_ind],
                                    s))
    fout.close()

    all_metrics = []
    for i in range(args.max_iter):
        fouts[i].close()

        metrics = ava_evaluation(os.path.join(args.data_root, 'label/'),
                                 output_files[i], gt_file)
        all_metrics.append(metrics)

    return all_metrics
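
All of these examples call a `valid_tubes` helper that is not defined in this
section. A minimal sketch of the implied behavior, assuming it only clamps
tube coordinates to the image plane (the default size below is a placeholder,
not taken from the source):

def valid_tubes(tubes, width=400, height=400):
    """Sketch: clamp (x1, y1, x2, y2) tube coordinates to the image bounds.

    Works for both np.ndarray and torch.Tensor inputs, since both types
    implement an element-wise .clip(); the call sites above pass either.
    """
    tubes[..., 0::2] = tubes[..., 0::2].clip(0, width - 1)   # x coordinates
    tubes[..., 1::2] = tubes[..., 1::2].clip(0, height - 1)  # y coordinates
    return tubes
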
Example #2
def main():

    ################## Customize your configuratons here ###################

    checkpoint_path = 'pretrained/ava_step.pth'
    if os.path.isfile(checkpoint_path):
        print("Loading pretrain model from %s" % checkpoint_path)
        map_location = 'cuda:0'
        checkpoint = torch.load(checkpoint_path, map_location=map_location)
        args = checkpoint['cfg']
    else:
        raise ValueError("Pretrain model not found!", checkpoint_path)

    # TODO: Set data_root to the customized input dataset
    args.data_root = '/datasets/demo/frames/'
    args.save_root = os.path.join(os.path.dirname(args.data_root), 'results/')
    if not os.path.isdir(args.save_root):
        os.makedirs(args.save_root)

    # TODO: modify these settings to match the actual frame rate and file names
    source_fps = 30
    im_format = 'frame%04d.jpg'
    conf_thresh = 0.4
    global_thresh = 0.8  # used for cross-class NMS

    ################ Define models #################

    gpu_count = torch.cuda.device_count()
    nets = OrderedDict()
    # backbone network
    nets['base_net'] = BaseNet(args)
    # ROI pooling
    nets['roi_net'] = ROINet(args.pool_mode, args.pool_size)

    # detection network
    for i in range(args.max_iter):
        if args.det_net == "two_branch":
            nets['det_net%d' % i] = TwoBranchNet(args)
        else:
            raise NotImplementedError
    if not args.no_context:
        # context branch
        nets['context_net'] = ContextNet(args)

    for key in nets:
        nets[key] = nets[key].cuda()

    nets['base_net'] = torch.nn.DataParallel(nets['base_net'])
    if not args.no_context:
        nets['context_net'] = torch.nn.DataParallel(nets['context_net'])
    for i in range(args.max_iter):
        nets['det_net%d' % i].to('cuda:%d' % ((i + 1) % gpu_count))
        nets['det_net%d' % i].set_device('cuda:%d' % ((i + 1) % gpu_count))

    # load pretrained model
    nets['base_net'].load_state_dict(checkpoint['base_net'])
    if not args.no_context and 'context_net' in checkpoint:
        nets['context_net'].load_state_dict(checkpoint['context_net'])
    for i in range(args.max_iter):
        pretrained_dict = checkpoint['det_net%d' % i]
        nets['det_net%d' % i].load_state_dict(pretrained_dict)

    ################ DataLoader setup #################

    dataset = CustomizedDataset(args.data_root,
                                args.T,
                                args.NUM_CHUNKS[args.max_iter],
                                source_fps,
                                args.fps,
                                BaseTransform(args.image_size, args.means,
                                              args.stds, args.scale_norm),
                                anchor_mode=args.anchor_mode,
                                im_format=im_format)
    dataloader = torch.utils.data.DataLoader(dataset,
                                             args.batch_size,
                                             num_workers=args.num_workers,
                                             shuffle=False,
                                             collate_fn=detection_collate,
                                             pin_memory=True)

    ################ Inference #################

    for _, net in nets.items():
        net.eval()

    fout = open(os.path.join(args.save_root, 'results.txt'), 'w')
    torch.cuda.synchronize()
    t0 = time.time()
    with torch.no_grad():
        for _, (images, tubes, infos) in enumerate(dataloader):

            _, _, channels, height, width = images.size()
            images = images.cuda()

            # get conv features
            conv_feat = nets['base_net'](images)
            context_feat = None
            if not args.no_context:
                context_feat = nets['context_net'](conv_feat)

            history, _ = inference(args, conv_feat, context_feat, nets,
                                   args.max_iter, tubes)

            # collect result of the last step
            pred_prob = history[-1]['pred_prob'].cpu()
            pred_prob = pred_prob[:, int(pred_prob.shape[1] / 2)]
            pred_tubes = history[-1]['pred_loc'].cpu()
            pred_tubes = pred_tubes[:, int(pred_tubes.shape[1] / 2)]
            tubes_nums = history[-1]['tubes_nums']

            # loop over each sample in the batch
            tubes_count = 0
            for b in range(len(tubes_nums)):
                info = infos[b]
                seq_start = tubes_count
                tubes_count = tubes_count + tubes_nums[b]

                cur_pred_prob = pred_prob[seq_start:seq_start + tubes_nums[b]]
                cur_pred_tubes = pred_tubes[seq_start:seq_start +
                                            tubes_nums[b]]

                # do NMS first
                all_scores = []
                all_boxes = []
                all_idx = []
                for cl_ind in range(args.num_classes):
                    scores = cur_pred_prob[:, cl_ind].squeeze()
                    c_mask = scores.gt(conf_thresh)  # greater than a threshold
                    scores = scores[c_mask]
                    idx = np.where(c_mask.numpy())[0]
                    if len(scores) == 0:
                        all_scores.append([])
                        all_boxes.append([])
                        all_idx.append([])  # keep lists aligned by class
                        continue
                    boxes = cur_pred_tubes.clone()
                    l_mask = c_mask.unsqueeze(1).expand_as(boxes)
                    boxes = boxes[l_mask].view(-1, 4)

                    boxes = valid_tubes(boxes.view(-1, 1, 4)).view(-1, 4)
                    keep = nms(boxes, scores, args.nms_thresh)
                    boxes = boxes[keep].numpy()
                    scores = scores[keep].numpy()
                    idx = idx[keep]

                    boxes[:, ::2] /= width
                    boxes[:, 1::2] /= height
                    all_scores.append(scores)
                    all_boxes.append(boxes)
                    all_idx.append(idx)

                # get the top scores
                scores_list = [(s, cl_ind, j)
                               for cl_ind, scores in enumerate(all_scores)
                               for j, s in enumerate(scores)]
                if args.evaluate_topk > 0:
                    scores_list.sort(key=lambda x: x[0])
                    scores_list = scores_list[::-1]
                    scores_list = scores_list[:args.topk]

                # merge highly overlapping boxes (a simple greedy method)
                merged_result = {}
                flag = [1 for _ in range(len(scores_list))]
                for i in range(len(scores_list)):
                    if flag[i]:
                        s, cl_ind, j = scores_list[i]
                        box = all_boxes[cl_ind][j]
                        temp = ([box], [args.label_dict[cl_ind]], [s])

                        # find all high IoU boxes
                        for ii in range(i + 1, len(scores_list)):
                            if flag[ii]:
                                s2, cl_ind2, j2 = scores_list[ii]
                                box2 = all_boxes[cl_ind2][j2]
                                if compute_box_iou(box, box2) > global_thresh:
                                    flag[ii] = 0
                                    temp[0].append(box2)
                                    temp[1].append(args.label_dict[cl_ind2])
                                    temp[2].append(s2)

                        merged_box = np.mean(np.concatenate(temp[0],
                                                            axis=0).reshape(
                                                                -1, 4),
                                             axis=0)
                        key = ','.join(merged_box.astype(str).tolist())
                        merged_result[key] = [
                            (l, s) for l, s in zip(temp[1], temp[2])
                        ]

                # visualize results
                if not os.path.isdir(
                        os.path.join(args.save_root, info['video_name'])):
                    os.makedirs(
                        os.path.join(args.save_root, info['video_name']))
                overlay_image(os.path.join(args.data_root, info['video_name'],
                                           im_format % info['fid']),
                              os.path.join(args.save_root, info['video_name'],
                                           im_format % info['fid']),
                              pred_boxes=merged_result,
                              id2class=args.id2class)

                # write to files
                for key in merged_result:
                    box = np.asarray(key.split(','), dtype=np.float32)
                    for l, s in merged_result[key]:
                        fout.write(
                            '{0},{1:04},{2:.4},{3:.4},{4:.4},{5:.4},{6},{7:.4}\n'
                            .format(info['video_name'], info['fid'], box[0],
                                    box[1], box[2], box[3], l, s))
            torch.cuda.synchronize()
            t1 = time.time()
            print("Batch time: ", t1 - t0)

            torch.cuda.synchronize()
            t0 = time.time()

    fout.close()
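
The greedy merging step above depends on a `compute_box_iou` helper that is
not shown. A minimal sketch, assuming it returns the scalar IoU of two
(x1, y1, x2, y2) boxes (normalized or pixel coordinates both work):

def compute_box_iou(box_a, box_b):
    """Sketch: intersection-over-union of two (x1, y1, x2, y2) boxes."""
    inter_w = max(0.0, min(box_a[2], box_b[2]) - max(box_a[0], box_b[0]))
    inter_h = max(0.0, min(box_a[3], box_b[3]) - max(box_a[1], box_b[1]))
    inter = inter_w * inter_h
    area_a = (box_a[2] - box_a[0]) * (box_a[3] - box_a[1])
    area_b = (box_b[2] - box_b[0]) * (box_b[3] - box_b[1])
    union = area_a + area_b - inter
    return inter / union if union > 0 else 0.0
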
Example #3
def train_select(step, history, targets, tubes, args):
    """
    Select candidate samples for model training
    Arguments:
        step: int, the current step
        history: dict, inference output
        targets: list, ground truths
        tubes: np.array, initial proposals
        args: configs
    """

    # adaptively get the start chunk
    chunks = args.NUM_CHUNKS[step]
    max_chunks = args.NUM_CHUNKS[args.max_iter]
    T_start = int((max_chunks - chunks) / 2) * args.T
    T_length = chunks * args.T

    cls_thresh = args.cls_thresh[step - 1]
    reg_thresh = args.reg_thresh[step - 1]

    ######### Collect candidates for training ########

    candidates = []
    if step > 1:  # for step > 1
        pred_prob = history['pred_prob'].cpu()
        pred_tubes = history['pred_loc'].cpu()
        tubes_nums = history['tubes_nums']
        tubes_count = 0

        if args.temporal_mode == "predict":
            pred_first_loc = history['pred_first_loc'].cpu()
            pred_last_loc = history['pred_last_loc'].cpu()

    for b in range(len(targets)):
        if step == 1:  # for 1st step
            candidates.append((tubes[b], None))

        else:  # for step > 1
            seq_start = tubes_count
            tubes_count = tubes_count + tubes_nums[b]
            cur_pred_prob = pred_prob[seq_start:seq_start + tubes_nums[b]]
            # get averaged score for each tube
            cur_pred_prob = torch.mean(cur_pred_prob, dim=1)
            cur_pred_tubes = pred_tubes[seq_start:seq_start + tubes_nums[b]]

            # select top-scoring boxes from each class
            all_scores = []
            all_idx = []
            for cl_ind in range(args.num_classes):
                scores = cur_pred_prob[:, cl_ind].squeeze()

                # sort according to the scores
                scores = scores.numpy().reshape(-1)
                ids = np.argsort(scores)[::-1]
                scores = scores[ids]
                idx = ids

                if args.topk > 0:
                    scores = scores[:int(args.topk / args.num_classes) * 2]
                    idx = idx[:int(args.topk / args.num_classes) * 2]
                all_scores.append(scores)
                all_idx.append(idx)

            # get the top scores
            scores_list = [(s, cl_ind, j)
                           for cl_ind, scores in enumerate(all_scores)
                           for j, s in enumerate(scores)]
            scores_list.sort(key=lambda x: x[0])
            scores_list = scores_list[::-1]
            temp_list = []
            temp = set()
            for s, cl_ind, j in scores_list:
                if all_idx[cl_ind][j] not in temp:
                    temp.add(all_idx[cl_ind][j])
                    temp_list.append((s, cl_ind, j))

            if args.topk > 0:
                scores_list = temp_list[:args.topk]
            else:
                scores_list = temp_list

            cur_tubes = []
            cur_scores = []
            for s, cl_ind, j in scores_list:
                cur_tubes.append(cur_pred_tubes[all_idx[cl_ind][j], :].numpy())
                cur_scores.append(s)
            try:
                cur_tubes = np.stack(cur_tubes, axis=0)
            except ValueError:
                # np.stack fails when no tubes were selected; drop into the
                # debugger to inspect (debugging aid)
                pdb.set_trace()
            cur_tubes = valid_tubes(cur_tubes, args.image_size[0],
                                    args.image_size[1])
            cur_scores = np.asarray(cur_scores)

            if args.temporal_mode == "predict":
                cur_pred_first_loc = pred_first_loc[seq_start:seq_start +
                                                    tubes_nums[b]]
                cur_pred_last_loc = pred_last_loc[seq_start:seq_start +
                                                  tubes_nums[b]]

                cur_first_tubes = []
                cur_last_tubes = []
                for s, cl_ind, j in scores_list:
                    cur_first_tubes.append(
                        cur_pred_first_loc[all_idx[cl_ind][j], :])
                    cur_last_tubes.append(
                        cur_pred_last_loc[all_idx[cl_ind][j], :])
                cur_first_tubes = np.stack(cur_first_tubes, axis=0)
                cur_first_tubes = valid_tubes(cur_first_tubes,
                                              args.image_size[0],
                                              args.image_size[1])
                cur_last_tubes = np.stack(cur_last_tubes, axis=0)
                cur_last_tubes = valid_tubes(cur_last_tubes,
                                             args.image_size[0],
                                             args.image_size[1])
            else:
                cur_first_tubes, cur_last_tubes = None, None

            candidates.append(
                (cur_tubes, cur_scores, cur_first_tubes, cur_last_tubes))

    ######### Select training samples ########

    selected_tubes = []
    target_tubes = []
    for b in range(len(targets)):
        cur_tubes = candidates[b][0]
        cur_scores = candidates[b][1]
        selected_pos, selected_neg, ious = select_proposals(
            targets[b][:, int(max_chunks / 2)].reshape(targets[b].shape[0], 1,
                                                       -1),
            cur_tubes[:, int(cur_tubes.shape[1] / 2)].reshape(
                cur_tubes.shape[0], 1, -1), cur_scores, cls_thresh,
            args.max_pos_num, args.selection_sampling, args.neg_ratio)

        cur_selected_tubes = np.zeros(
            (len(selected_pos) + len(selected_neg), cur_tubes.shape[1], 4),
            dtype=np.float32)
        cur_target_tubes = np.zeros(
            (len(selected_pos) + len(selected_neg), 1, 6 + args.num_classes),
            dtype=np.float32)  # only one frame for loss
        mid_idx = int(max_chunks / 2)
        row = 0
        for ii, jj in selected_pos:
            cur_selected_tubes[row] = cur_tubes[jj]
            cur_target_tubes[row, :, :4] = targets[b][ii, mid_idx, :4]
            cur_target_tubes[row, :, 6:] = targets[b][ii, mid_idx, 4:]
            cur_target_tubes[row, :, 5] = 1  # flag for regression
            cur_target_tubes[row, :, 4] = 1  # flag for classification
            row += 1

        for ii, jj in selected_neg:
            cur_selected_tubes[row] = cur_tubes[jj]
            # regression-only samples
            if ious[ii, jj] >= reg_thresh:
                cur_target_tubes[row, :, :4] = targets[b][ii, mid_idx, :4]
                cur_target_tubes[row, :, 6:] = targets[b][ii, mid_idx, 4:]
                cur_target_tubes[row, :, 5] = 1  # for regression
            # FIXME: cur_target_tubes[row,:,4] = 1     # flag for classification
            row += 1

        ###### check whether tube extension is needed ######

        if step - 1 in args.NUM_CHUNKS and args.NUM_CHUNKS[
                step] == args.NUM_CHUNKS[step - 1] + 2:

            if args.temporal_mode == "predict":
                cur_first_tubes = candidates[b][2]
                cur_last_tubes = candidates[b][3]

                cur_selected_first = np.zeros(
                    (len(selected_pos) + len(selected_neg), args.T, 4),
                    dtype=np.float32)
                cur_selected_last = np.zeros(
                    (len(selected_pos) + len(selected_neg), args.T, 4),
                    dtype=np.float32)
                row = 0
                for ii, jj in selected_pos:
                    cur_selected_first[row] = cur_first_tubes[jj]
                    cur_selected_last[row] = cur_last_tubes[jj]
                    row += 1

                for ii, jj in selected_neg:
                    cur_selected_first[row] = cur_first_tubes[jj]
                    cur_selected_last[row] = cur_last_tubes[jj]
                    row += 1

                cur_selected_tubes = np.concatenate(
                    [cur_selected_first, cur_selected_tubes,
                     cur_selected_last], axis=1)

            elif args.temporal_mode == "extrapolate":  # linear extrapolation
                cur_selected_tubes = extrapolate_tubes(cur_selected_tubes,
                                                       args.T)

            else:  # mean tubes
                mean_tubes = np.mean(cur_selected_tubes, axis=1, keepdims=True)
                mean_tubes = np.tile(mean_tubes, (1, args.T, 1))
                cur_selected_tubes = np.concatenate(
                    (mean_tubes, cur_selected_tubes, mean_tubes), axis=1)

        ###### check whether predicting neighboring chunks is needed ######

        cur_target_first = np.zeros(
            (len(selected_pos) + len(selected_neg), 1, 6 + args.num_classes),
            dtype=np.float32)
        cur_target_last = np.zeros(
            (len(selected_pos) + len(selected_neg), 1, 6 + args.num_classes),
            dtype=np.float32)

        if (args.temporal_mode == "predict" and step < args.max_iter
                and args.NUM_CHUNKS[step + 1] == args.NUM_CHUNKS[step] + 2):
            first_idx = int((T_start - args.T) / args.T)
            last_idx = int((T_start + T_length) / args.T)
            row = 0
            for ii, jj in selected_pos:
                cur_target_first[row, :, :4] = targets[b][ii, first_idx, :4]
                if cur_target_first[row, :, :4].sum() > 0:  # valid box
                    cur_target_first[row, :, 5] = 1
                cur_target_first[row, :, 6:] = targets[b][ii, first_idx, 4:]

                cur_target_last[row, :, :4] = targets[b][ii, last_idx, :4]
                if cur_target_last[row, :, :4].sum() > 0:  # valid box
                    cur_target_last[row, :, 5] = 1
                cur_target_last[row, :, 6:] = targets[b][ii, last_idx, 4:]
                row += 1

        cur_target_tubes = np.concatenate(
            [cur_target_first, cur_target_tubes, cur_target_last], axis=1)

        selected_tubes.append(cur_selected_tubes)
        target_tubes.append(cur_target_tubes)

    return selected_tubes, target_tubes
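
Both `train_select` and `inference` can extend tubes temporally with
`extrapolate_tubes`, which is not defined in this section. A minimal sketch,
assuming linear extrapolation: the mean per-frame displacement of each tube
is used as a velocity to synthesize T extra frames on both sides.

import numpy as np

def extrapolate_tubes(tubes, T):
    """Sketch: extend (N, L, 4) tubes by T frames on each side, linearly."""
    N, L, _ = tubes.shape
    velocity = (tubes[:, -1] - tubes[:, 0]) / max(L - 1, 1)  # (N, 4)
    first = np.stack(
        [tubes[:, 0] - velocity * (t + 1) for t in range(T)][::-1], axis=1)
    last = np.stack(
        [tubes[:, -1] + velocity * (t + 1) for t in range(T)], axis=1)
    return np.concatenate([first, tubes, last], axis=1)
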
Example #4
def main():

    ################## Load pretrained model and configurations ###################

    checkpoint_path = 'pretrained/ava_step.pth'
    if os.path.isfile(checkpoint_path):
        print ("Loading pretrain model from %s" % checkpoint_path)
        map_location = 'cuda:0'
        checkpoint = torch.load(checkpoint_path, map_location=map_location)
        args = checkpoint['cfg']
    else:
        raise ValueError("Pretrain model not found!", checkpoint_path)

    if not os.path.isdir(args.save_root):
        os.makedirs(args.save_root)
    
    label_dict = {}
    if args.num_classes == 60:
        label_map = os.path.join(args.data_root, 'label/ava_action_list_v2.1_for_activitynet_2018.pbtxt')
        categories, class_whitelist = read_labelmap(open(label_map, 'r'))
        classes = [(val['id'], val['name']) for val in categories]
        id2class = {c[0]: c[1] for c in classes}    # gt class id (1~80) --> class name
        for i, c in enumerate(sorted(list(class_whitelist))):
            label_dict[i] = c
    else:
        for i in range(80):
            label_dict[i] = i+1

    ################ Define models #################

    gpu_count = torch.cuda.device_count()
    nets = OrderedDict()
    # backbone network
    nets['base_net'] = BaseNet(args)
    # ROI pooling
    nets['roi_net'] = ROINet(args.pool_mode, args.pool_size)

    # detection network
    for i in range(args.max_iter):
        if args.det_net == "two_branch":
            nets['det_net%d' % i] = TwoBranchNet(args)
        else:
            raise NotImplementedError
    if not args.no_context:
        # context branch
        nets['context_net'] = ContextNet(args)

    for key in nets:
        nets[key] = nets[key].cuda()

    nets['base_net'] = torch.nn.DataParallel(nets['base_net'])
    if not args.no_context:
        nets['context_net'] = torch.nn.DataParallel(nets['context_net'])
    for i in range(args.max_iter):
        nets['det_net%d' % i].to('cuda:%d' % ((i+1)%gpu_count))
        nets['det_net%d' % i].set_device('cuda:%d' % ((i+1)%gpu_count))

    # load pretrained weights
    nets['base_net'].load_state_dict(checkpoint['base_net'])
    if not args.no_context and 'context_net' in checkpoint:
        nets['context_net'].load_state_dict(checkpoint['context_net'])
    for i in range(args.max_iter):
        pretrained_dict = checkpoint['det_net%d' % i]
        nets['det_net%d' % i].load_state_dict(pretrained_dict)

    
    ################ DataLoader setup #################

    dataset = AVADataset(args.data_root,
                         'test',
                         args.input_type,
                         args.T,
                         args.NUM_CHUNKS[args.max_iter],
                         args.fps,
                         BaseTransform(args.image_size, args.means, args.stds,
                                       args.scale_norm),
                         proposal_path=args.proposal_path_val,
                         stride=1,
                         anchor_mode=args.anchor_mode,
                         num_classes=args.num_classes,
                         foreground_only=False)
    dataloader = torch.utils.data.DataLoader(dataset,
                                             args.batch_size,
                                             num_workers=args.num_workers,
                                             shuffle=False,
                                             collate_fn=detection_collate,
                                             pin_memory=True)

    ################ Inference #################

    for _, net in nets.items():
        net.eval()

    # write results to files for evaluation
    output_files = []
    fouts = []
    for i in range(args.max_iter):
        output_file = args.save_root+'testing_result-iter'+str(i+1)+'.csv'
        output_files.append(output_file)
        f = open(output_file, 'w')
        fouts.append(f)

    gt_file = args.save_root+'testing_gt.csv'
    fout = open(gt_file, 'w')

    torch.cuda.synchronize()
    t0 = time.time()
    with torch.no_grad():    # for evaluation
        for num, (images, targets, tubes, infos) in enumerate(dataloader):

            if (num+1) % 100 == 0:
                print("%d / %d" % (num+1, len(dataloader)))

            for b in range(len(infos)):
                for n in range(len(infos[b]['boxes'])):
                    mid = int(len(infos[b]['boxes'][n])/2)
                    box = infos[b]['boxes'][n][mid]
                    labels = infos[b]['labels'][n][mid]
                    for label in labels:
                        fout.write('{0},{1:04},{2:.4},{3:.4},{4:.4},{5:.4},{6}\n'.format(
                                    infos[b]['video_name'],
                                    infos[b]['fid'],
                                    box[0], box[1], box[2], box[3],
                                    label))

            _, _, channels, height, width = images.size()
            images = images.cuda()

            # get conv features
            conv_feat = nets['base_net'](images)
            context_feat = None
            if not args.no_context:
                context_feat = nets['context_net'](conv_feat)

            ############## Inference ##############

            history, _ = inference(args, conv_feat, context_feat, nets, args.max_iter, tubes)

            #################### Evaluation #################

            # loop over each iteration
            for i in range(len(history)):
                pred_prob = history[i]['pred_prob'].cpu()
                pred_prob = pred_prob[:,int(pred_prob.shape[1]/2)]
                pred_tubes = history[i]['pred_loc'].cpu()
                pred_tubes = pred_tubes[:,int(pred_tubes.shape[1]/2)]
                tubes_nums = history[i]['tubes_nums']

                # loop for each sample in a batch
                tubes_count = 0
                for b in range(len(tubes_nums)):
                    info = infos[b]
                    seq_start = tubes_count
                    tubes_count = tubes_count + tubes_nums[b]
    
                    cur_pred_prob = pred_prob[seq_start:seq_start+tubes_nums[b]]
                    cur_pred_tubes = pred_tubes[seq_start:seq_start+tubes_nums[b]]

                    # do NMS first
                    all_scores = []
                    all_boxes = []
                    all_idx = []
                    for cl_ind in range(args.num_classes):
                        scores = cur_pred_prob[:, cl_ind].squeeze().reshape(-1)
                        c_mask = scores.gt(args.conf_thresh) # greater than minimum threshold
                        scores = scores[c_mask]
                        idx = np.where(c_mask.numpy())[0]
                        if len(scores) == 0:
                            all_scores.append([])
                            all_boxes.append([])
                            all_idx.append([])  # keep lists aligned by class
                            continue
                        boxes = cur_pred_tubes.clone()
                        l_mask = c_mask.unsqueeze(1).expand_as(boxes)
                        boxes = boxes[l_mask].view(-1, 4)
    
                        boxes = valid_tubes(boxes.view(-1,1,4)).view(-1,4)
                        keep = nms(boxes, scores, args.nms_thresh)
                        boxes = boxes[keep].numpy()
                        scores = scores[keep].numpy()
                        idx = idx[keep]
    
                        boxes[:, ::2] /= width
                        boxes[:, 1::2] /= height
                        all_scores.append(scores)
                        all_boxes.append(boxes)
                        all_idx.append(idx)

                    # get the top scores
                    scores_list = [(s,cl_ind,j) for cl_ind,scores in enumerate(all_scores) for j,s in enumerate(scores)]
                    if args.evaluate_topk > 0:
                        scores_list.sort(key=lambda x: x[0])
                        scores_list = scores_list[::-1]
                        scores_list = scores_list[:args.topk]

                    for s,cl_ind,j in scores_list:
                        # write to files
                        box = all_boxes[cl_ind][j]
                        fouts[i].write('{0},{1:04},{2:.4},{3:.4},{4:.4},{5:.4},{6},{7:.4}\n'.format(
                                                    info['video_name'],
                                                    info['fid'],
                                                    box[0],box[1],box[2],box[3],
                                                    label_dict[cl_ind],
                                                    s))
    fout.close()

    all_metrics = []
    for i in range(args.max_iter):
        fouts[i].close()

        metrics = ava_evaluation(os.path.join(args.data_root, 'label/'), output_files[i], gt_file)
        all_metrics.append(metrics)

    # Logging
    log_name = args.save_root+"testing_results.log"
    log_file = open(log_name, "w", 1)
    prt_str = ''
    for i in range(args.max_iter):
        prt_str += 'Iter '+str(i+1)+': MEANAP =>'+str(all_metrics[i]['PascalBoxes_Precision/mAP@0.5IOU'])+'\n'
    log_file.write(prt_str)
    
    # per-class APs require the label map loaded in the 60-class branch above
    if args.num_classes == 60:
        for i in class_whitelist:
            log_file.write("({}) {}: {}\n".format(i, id2class[i],
                all_metrics[-1]["PascalBoxes_PerformanceByCategory/AP@0.5IOU/{}".format(id2class[i])]))

    log_file.close()
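
The evaluation and demo scripts above all read and write detections in the
same flat CSV layout: video name, frame id, x1, y1, x2, y2 (normalized to
[0, 1]), label id, and, for predictions only, a confidence score. A minimal
reader sketch (the function name is illustrative, not from the source):

import csv

def read_detections(csv_path):
    """Sketch: parse rows of video_name,fid,x1,y1,x2,y2,label[,score]."""
    rows = []
    with open(csv_path) as f:
        for rec in csv.reader(f):
            rows.append({
                'video_name': rec[0],
                'fid': int(rec[1]),
                'box': [float(v) for v in rec[2:6]],
                'label': int(rec[6]),
                # ground-truth files omit the trailing score column
                'score': float(rec[7]) if len(rec) > 7 else None,
            })
    return rows
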
Example #5
def inference(args, conv_feat, context_feat, nets, exec_iter, tubes):
    """
    Inference on two-branch networks of different steps.
    In training, it is used to collect all candidate tubes.
    In testing, it is used to get detection results of each step

    Arguments:
        conv_feat: conv features from the backbone network
        context_feat: context features from the context network (None if the context network is not used)
        nets: a list of two-branch networks
        exec_iter: the number of iterations to execute
        tubes: initial proposal tubes

    return:
        history: collecting output results for each iteration
        trajectory: collecting input for each iteration
    """

    # flatten list of tubes
    flat_tubes, tubes_nums = flatten_tubes(
        tubes, batch_idx=True)  # add batch_idx for ROI pooling
    flat_tubes = torch.FloatTensor(flat_tubes).to(conv_feat)

    history = []
    trajectory = []
    for i in range(1, exec_iter + 1):  # index from 1
        # adaptively get the start chunk
        chunks = args.NUM_CHUNKS[i]
        T_start = int((args.NUM_CHUNKS[args.max_iter] - chunks) / 2) * args.T
        T_length = chunks * args.T
        # chunk_idx points at the middle frame of each chunk
        chunk_idx = [j * args.T + int(args.T / 2) for j in range(chunks)]
        half_T = int(args.T / 2)

        # ROI Pooling
        pooled_feat = nets['roi_net'](
            conv_feat[:, T_start:T_start + T_length].contiguous(), flat_tubes)
        _, C, W, H = pooled_feat.size()
        pooled_feat = pooled_feat.view(-1, T_length, C, W, H)

        # detection head
        temp_context_feat = None
        if not args.no_context:
            temp_context_feat = torch.zeros(
                (pooled_feat.size(0), context_feat.size(1), T_length, 1,
                 1)).to(context_feat)
            for p in range(pooled_feat.size(0)):
                temp_context_feat[p] = context_feat[
                    int(flat_tubes[p, 0, 0].item() / T_length), :,
                    T_start:T_start + T_length].contiguous().clone()

        global_prob, local_loc, first_loc, last_loc, _, _, _ = nets[
            'det_net%d' % (i - 1)](pooled_feat,
                                   context_feat=temp_context_feat,
                                   tubes=None,
                                   targets=None)

        ########## prepare data for next iteration ###########

        pred_prob = global_prob.view(-1, 1, args.num_classes).expand(
            -1, T_length, -1)

        # decode regression results to output tubes
        flat_tubes = flat_tubes.to(local_loc)
        pred_loc = decode_coef(
            flat_tubes.view(-1, 5)[:, 1:], local_loc.view(-1, 4))
        pred_loc = pred_loc.view(local_loc.size())

        if args.temporal_mode == "predict":
            pred_first_loc = decode_coef(
                flat_tubes[:, chunk_idx[0] - half_T:chunk_idx[0] + half_T +
                           1].contiguous().view(-1, 5)[:, 1:],
                first_loc.view(-1, 4))
            pred_first_loc = pred_first_loc.view(
                first_loc.size())  # [N*T, 4*C] --> [N, T, 4*C]

            pred_last_loc = decode_coef(
                flat_tubes[:, chunk_idx[-1] - half_T:chunk_idx[-1] + half_T +
                           1].contiguous().view(-1, 5)[:, 1:],
                last_loc.view(-1, 4))
            pred_last_loc = pred_last_loc.view(
                last_loc.size())  # [N*T, 4*C] --> [N, T, 4*C]

        is_predict = args.temporal_mode == "predict"
        history.append({
            'pred_prob': pred_prob.data,
            'pred_loc': pred_loc.data,
            'pred_first_loc': pred_first_loc.data if is_predict else None,
            'pred_last_loc': pred_last_loc.data if is_predict else None,
            'tubes_nums': tubes_nums
        })

        # loop over each sample in the batch
        cur_trajectory = []
        selected_tubes = []
        tubes_count = 0
        for b in range(len(tubes_nums)):
            seq_start = tubes_count
            tubes_count = tubes_count + tubes_nums[b]

            cur_pred_prob = pred_prob[seq_start:seq_start + tubes_nums[b]]
            cur_pred_tubes = pred_loc[seq_start:seq_start + tubes_nums[b]]
            cur_pred_class = torch.argmax(cur_pred_prob, dim=-1)

            # check whether extending tubes is needed
            if i < args.max_iter and args.NUM_CHUNKS[
                    i + 1] == args.NUM_CHUNKS[i] + 2:
                # check which method to extend tubes
                if args.temporal_mode == "predict":
                    cur_first_tubes = pred_first_loc[seq_start:seq_start +
                                                     tubes_nums[b]]
                    cur_last_tubes = pred_last_loc[seq_start:seq_start +
                                                   tubes_nums[b]]

                    cur_proposals = torch.cat(
                        [cur_first_tubes, cur_pred_tubes, cur_last_tubes],
                        dim=1)  # concatenate along time axis
                    cur_proposals = cur_proposals.cpu().numpy()

                elif args.temporal_mode == "extrapolate":
                    # expand tubes along temporal axis with extrapolation
                    cur_proposals = cur_pred_tubes.cpu().numpy()
                    cur_proposals = extrapolate_tubes(cur_proposals, args.T)

                else:  # mean tubes
                    cur_proposals = cur_pred_tubes.cpu().numpy()
                    mean_tubes = np.mean(cur_proposals, axis=1, keepdims=True)
                    mean_tubes = np.tile(mean_tubes, (1, args.T, 1))
                    cur_proposals = np.concatenate(
                        (mean_tubes, cur_proposals, mean_tubes), axis=1)
            else:
                cur_proposals = cur_pred_tubes.cpu().numpy()
            cur_proposals = valid_tubes(cur_proposals,
                                        width=args.image_size[0],
                                        height=args.image_size[1])
            cur_trajectory.append((cur_proposals, cur_pred_class))

            selected_tubes.append(cur_proposals)
        trajectory.append(cur_trajectory)

        # flatten list of tubes
        flat_tubes, tubes_nums = flatten_tubes(
            selected_tubes, batch_idx=True)  # add batch_idx for ROI pooling
        flat_tubes = torch.FloatTensor(flat_tubes).to(conv_feat)

    return history, trajectory
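
`flatten_tubes` is used throughout but not defined in this section. A minimal
sketch, assuming that with batch_idx=True it prepends each per-frame box with
its index in the flattened (batch * time) image batch, which is consistent
with flat_tubes[p, 0, 0] being divided by T_length above to recover the batch
index (the real implementation may differ):

import numpy as np

def flatten_tubes(tubes_list, batch_idx=True):
    """Sketch: flatten per-sample (n_b, T, 4) tubes into one (sum n_b, T, 5)."""
    flat, nums = [], []
    for b, tubes in enumerate(tubes_list):
        tubes = np.asarray(tubes, dtype=np.float32)
        n, T, _ = tubes.shape
        if batch_idx:
            # column 0 of frame t holds b * T + t, the frame's index in the
            # flattened image batch consumed by ROI pooling
            idx = (b * T + np.arange(T, dtype=np.float32)).reshape(1, T, 1)
            tubes = np.concatenate([np.repeat(idx, n, axis=0), tubes], axis=2)
        flat.append(tubes)
        nums.append(n)
    return np.concatenate(flat, axis=0), nums
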