Example #1
def main(args):
    # Setup config node
    cfg = setup_config(args, random_seed=args.random_seed)

    # For debugging only
    #cfg.defrost()
    #cfg.DATALOADER.NUM_WORKERS = 0
    #cfg.SOLVER.IMS_PER_BATCH = 1

    # Build Trainer from config node.
    if cfg.MODEL.META_ARCHITECTURE == 'ProbabilisticDetr':
        trainer = Detr_Trainer(cfg)
    else:
        trainer = Trainer(cfg)

    # Eval only mode to produce mAP results
    if args.eval_only:
        model = trainer.build_model(cfg)
        DetectionCheckpointer(model, save_dir=cfg.OUTPUT_DIR).resume_or_load(
            cfg.MODEL.WEIGHTS, resume=args.resume)
        res = trainer.test(cfg, model)
        if comm.is_main_process():
            verify_results(cfg, res)
        return res

    trainer.resume_or_load(resume=args.resume)
    return trainer.train()
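A minimal driver for this entry point, assuming the repository's setup_arg_parser() helper (used in the later examples) exposes the eval_only, resume, and random_seed arguments referenced above, could look like this sketch:

if __name__ == "__main__":
    # Hypothetical launcher: parse arguments with the repository's own parser
    # and hand them to main(). The flag names mirror the attributes used above.
    arg_parser = setup_arg_parser()
    args = arg_parser.parse_args()
    print("Command Line Args:", args)
    main(args)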
def get_mAP_results(config_names, configs_list, inference_configs_list):
    # Level 0 is the COCO validation set with no corruption, levels 1-5 are
    # increasing corruption severities, and level 10 is OpenImages.
    image_corruption_levels = [0, 1, 2, 3, 4, 5, 10]

    test_dataset_coco = "coco_2017_custom_val"
    test_dataset_open_images = "openimages_val"

    arg_parser = setup_arg_parser()
    args = arg_parser.parse_args()

    # Initialize dataframe dict
    mAP_results = defaultdict(list)

    for config_name, config, inference_config_name in zip(
            config_names, configs_list, inference_configs_list):
        # Setup config
        args.config_file = config
        args.inference_config = inference_config_name
        args.test_dataset = test_dataset_coco
        cfg = setup_config(args, random_seed=args.random_seed, is_testing=True)
        cfg.defrost()

        # Read coco dataset results
        cfg.ACTUAL_TEST_DATASET = args.test_dataset

        for image_corruption_level in image_corruption_levels:
            # Build path to gt instances and inference output
            args.image_corruption_level = image_corruption_level
            if image_corruption_level == 0:
                image_corruption_level = 'Val'
            elif image_corruption_level == 10:
                image_corruption_level = 'OpenIm'
            else:
                image_corruption_level = 'C' + str(image_corruption_level)

            if 'OpenIm' not in image_corruption_level:
                inference_output_dir = get_inference_output_dir(
                    cfg['OUTPUT_DIR'], args.test_dataset,
                    args.inference_config, args.image_corruption_level)
            else:
                args.image_corruption_level = 0
                args.test_dataset = test_dataset_open_images
                inference_output_dir = get_inference_output_dir(
                    cfg['OUTPUT_DIR'], args.test_dataset,
                    args.inference_config, args.image_corruption_level)

            text_file_name = glob.glob(
                os.path.join(inference_output_dir, 'mAP_res.txt'))[0]
            with open(text_file_name, "r") as f:
                mAP = f.read().strip('][\n').split(', ')[0]
                mAP = float(mAP) * 100

            mAP_results['Method Name'].append(config_name)
            mAP_results['Image Corruption Level'].append(
                image_corruption_level)
            mAP_results['mAP'].append(mAP)

    return mAP_results
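Because get_mAP_results() returns a defaultdict of parallel lists, it maps directly onto a table. A small post-processing sketch, assuming pandas is installed and the three config lists are defined as in the function's signature:

import pandas as pd

# Hypothetical post-processing: one row per method, one column per corruption level.
results = get_mAP_results(config_names, configs_list, inference_configs_list)
df = pd.DataFrame(results)
table = df.pivot(index='Method Name', columns='Image Corruption Level', values='mAP')
print(table.round(2))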
def main(args, cfg=None):
    # Setup config
    if cfg is None:
        cfg = setup_config(args, random_seed=args.random_seed, is_testing=True)

    # Build path to inference output
    inference_output_dir = os.path.join(
        cfg['OUTPUT_DIR'], 'inference', args.test_dataset,
        os.path.split(args.inference_config)[-1][:-5])

    prediction_file_name = os.path.join(inference_output_dir,
                                        'coco_instances_results.json')

    meta_catalog = MetadataCatalog.get(args.test_dataset)

    # Evaluate detection results
    gt_coco_api = COCO(meta_catalog.json_file)
    res_coco_api = gt_coco_api.loadRes(prediction_file_name)
    results_api = COCOeval(gt_coco_api, res_coco_api, iouType='bbox')

    results_api.params.catIds = [
        1, 3
    ]  # Restrict evaluation to a subset of category ids; use
    # list(meta_catalog.thing_dataset_id_to_contiguous_id.keys()) for all classes.

    # Calculate and print aggregate results
    results_api.evaluate()
    results_api.accumulate()
    results_api.summarize()

    # Compute the optimal micro F1-score threshold. We compute the F1 score for
    # every class and score threshold, find the score threshold that maximizes
    # the F1 score of each class, and take the final threshold as the average
    # over all classes.
    precisions = results_api.eval['precision'].mean(0)[:, :, 0, 2]
    recalls = np.expand_dims(results_api.params.recThrs, 1)
    f1_scores = 2 * (precisions * recalls) / (precisions + recalls)
    optimal_f1_score = f1_scores.argmax(0)
    scores = results_api.eval['scores'].mean(0)[:, :, 0, 2]
    optimal_score_threshold = [
        scores[optimal_f1_score_i, i]
        for i, optimal_f1_score_i in enumerate(optimal_f1_score)
    ]
    optimal_score_threshold = np.array(optimal_score_threshold)
    optimal_score_threshold = optimal_score_threshold[
        optimal_score_threshold != 0]
    optimal_score_threshold = optimal_score_threshold.mean()

    print("Classification Score at Optimal F-1 Score: {}".format(
        optimal_score_threshold))

    text_file_name = os.path.join(inference_output_dir, 'mAP_res.txt')

    with open(text_file_name, "w") as text_file:
        print(results_api.stats.tolist() + [
            optimal_score_threshold,
        ],
              file=text_file)
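The mAP_res.txt file written above holds a single Python-style list: the COCO summary stats followed by the optimal score threshold. The other examples read it back with strip/split; a small round-trip sketch of that convention:

# Sketch of the mAP_res.txt convention assumed throughout these examples.
with open('mAP_res.txt', 'r') as f:
    line = f.read()                        # e.g. "[0.337, 0.551, ..., 0.4123]\n"
values = [float(v) for v in line.strip('][\n').split(', ')]
mAP = values[0] * 100                      # first entry: COCO AP averaged over IoUs
min_allowed_score = round(values[-1], 4)   # last entry: classification score at optimal F-1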
Example #4
def main(args):
    # Setup config node
    cfg = setup_config(args,
                       random_seed=args.random_seed)

    # Eval only mode to produce mAP results
    if args.eval_only:
        model = Trainer.build_model(cfg)
        DetectionCheckpointer(model, save_dir=cfg.OUTPUT_DIR).resume_or_load(
            cfg.MODEL.WEIGHTS, resume=args.resume
        )
        res = Trainer.test(cfg, model)
        if comm.is_main_process():
            verify_results(cfg, res)
        return res

    # Build Trainer from config node. Begin Training.
    trainer = Trainer(cfg)
    trainer.resume_or_load(resume=args.resume)
    return trainer.train()
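Scripts like this one are usually started through detectron2's launch() helper when comm.is_main_process() is involved; a hedged sketch, where the num_gpus/num_machines/machine_rank/dist_url arguments are assumed to be exposed by setup_arg_parser() (the repository may wire this differently):

from detectron2.engine import launch

if __name__ == "__main__":
    arg_parser = setup_arg_parser()
    args = arg_parser.parse_args()
    # Hypothetical multi-GPU launch of the main() above.
    launch(
        main,
        args.num_gpus,
        num_machines=args.num_machines,
        machine_rank=args.machine_rank,
        dist_url=args.dist_url,
        args=(args,),
    )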
Example #5
def main(args):
    # Setup config
    cfg = setup_config(args, random_seed=args.random_seed, is_testing=True)
    # Make sure only 1 data point is processed at a time. This simulates
    # deployment.
    cfg.defrost()
    cfg.DATALOADER.NUM_WORKERS = 32
    cfg.SOLVER.IMS_PER_BATCH = 1

    cfg.MODEL.DEVICE = device.type

    # Set up number of CPU threads
    torch.set_num_threads(cfg.DATALOADER.NUM_WORKERS)

    # Create inference output directory and copy inference config file to keep
    # track of experimental settings
    inference_output_dir = get_inference_output_dir(
        cfg['OUTPUT_DIR'], args.test_dataset, args.inference_config,
        args.image_corruption_level)

    os.makedirs(inference_output_dir, exist_ok=True)
    copyfile(
        args.inference_config,
        os.path.join(inference_output_dir,
                     os.path.split(args.inference_config)[-1]))

    # Get category mapping dictionary:
    train_thing_dataset_id_to_contiguous_id = MetadataCatalog.get(
        cfg.DATASETS.TRAIN[0]).thing_dataset_id_to_contiguous_id
    test_thing_dataset_id_to_contiguous_id = MetadataCatalog.get(
        args.test_dataset).thing_dataset_id_to_contiguous_id

    # If both dicts are equal or if we are performing out of distribution
    # detection, just flip the test dict.
    cat_mapping_dict = get_train_contiguous_id_to_test_thing_dataset_id_dict(
        cfg, args, train_thing_dataset_id_to_contiguous_id,
        test_thing_dataset_id_to_contiguous_id)

    # Build predictor
    predictor = build_predictor(cfg)
    test_data_loader = build_detection_test_loader(
        cfg, dataset_name=args.test_dataset)

    final_output_list = []
    if not args.eval_only:
        with torch.no_grad():
            with tqdm.tqdm(total=len(test_data_loader)) as pbar:
                for idx, input_im in enumerate(test_data_loader):
                    # Run inference on the input image
                    outputs = predictor(input_im)

                    # predictor.visualize_inference(input_im, outputs)

                    final_output_list.extend(
                        instances_to_json(outputs, input_im[0]['image_id'],
                                          cat_mapping_dict))
                    pbar.update(1)

        with open(
                os.path.join(inference_output_dir,
                             'coco_instances_results.json'), 'w') as fp:
            json.dump(final_output_list, fp, indent=4, separators=(',', ': '))

    if 'ood' in args.test_dataset:
        compute_ood_probabilistic_metrics.main(args, cfg)
    else:
        compute_average_precision.main(args, cfg)
        compute_probabilistic_metrics.main(args, cfg)
        compute_calibration_errors.main(args, cfg)
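The "just flip the test dict" comment above refers to inverting the metadata mapping so that the model's contiguous training indices can be written back out as dataset category ids. A minimal sketch of that idea (the real get_train_contiguous_id_to_test_thing_dataset_id_dict helper in this code base also covers the case where the train and test dicts differ):

def flip_contiguous_id_dict(test_thing_dataset_id_to_contiguous_id):
    # Hypothetical minimal mapping: contiguous model index -> dataset category id,
    # used when writing detections to COCO-style JSON.
    return {
        contiguous_id: dataset_id
        for dataset_id, contiguous_id in
        test_thing_dataset_id_to_contiguous_id.items()
    }

# e.g. {1: 0, 3: 1} (dataset id -> contiguous index) becomes {0: 1, 1: 3}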
def main(args,
         cfg=None,
         iou_min=None,
         iou_correct=None,
         min_allowed_score=None):
    # Setup config
    if cfg is None:
        cfg = setup_config(args, random_seed=args.random_seed, is_testing=True)

    cfg.defrost()
    cfg.ACTUAL_TEST_DATASET = args.test_dataset

    # Build path to gt instances and inference output
    inference_output_dir = get_inference_output_dir(
        cfg['OUTPUT_DIR'], args.test_dataset, args.inference_config,
        args.image_corruption_level)

    # Get thresholds to perform evaluation on
    if iou_min is None:
        iou_min = args.iou_min
    if iou_correct is None:
        iou_correct = args.iou_correct
    if min_allowed_score is None:
        # Check if F-1 Score has been previously computed ON THE ORIGINAL
        # DATASET such as COCO even when evaluating on VOC.
        try:
            train_set_inference_output_dir = get_inference_output_dir(
                cfg['OUTPUT_DIR'], cfg.DATASETS.TEST[0], args.inference_config,
                0)
            with open(
                    os.path.join(train_set_inference_output_dir,
                                 "mAP_res.txt"), "r") as f:
                min_allowed_score = f.read().strip('][\n').split(', ')[-1]
                min_allowed_score = round(float(min_allowed_score), 4)
        except FileNotFoundError:
            # If not, process all detections. Not recommended as the results might be influenced by very low scoring
            # detections that would normally be removed in robotics/vision
            # applications.
            min_allowed_score = 0.0

    # get preprocessed instances
    preprocessed_predicted_instances, preprocessed_gt_instances = evaluation_utils.get_per_frame_preprocessed_instances(
        cfg, inference_output_dir, min_allowed_score)

    # get metacatalog and image infos
    meta_catalog = MetadataCatalog.get(args.test_dataset)
    with open(meta_catalog.json_file, 'r') as f:
        images_info = json.load(f)['images']

    # Loop over all images and visualize errors
    for image_info in images_info:
        image_id = image_info['id']
        image = cv2.imread(
            os.path.join(meta_catalog.image_root, image_info['file_name']))
        image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)

        predicted_box_means = {
            image_id:
            preprocessed_predicted_instances['predicted_boxes'][image_id]
        }
        predicted_box_covariances = {
            image_id:
            preprocessed_predicted_instances['predicted_covar_mats'][image_id]
        }
        predicted_cls_probs = {
            image_id:
            preprocessed_predicted_instances['predicted_cls_probs'][image_id]
        }
        gt_box_means = {
            image_id: preprocessed_gt_instances['gt_boxes'][image_id]
        }
        gt_cat_idxs = {
            image_id: preprocessed_gt_instances['gt_cat_idxs'][image_id]
        }

        # Perform matching
        matched_results = evaluation_utils.match_predictions_to_groundtruth(
            predicted_box_means,
            predicted_cls_probs,
            predicted_box_covariances,
            gt_box_means,
            gt_cat_idxs,
            iou_min=iou_min,
            iou_correct=iou_correct)

        true_positives = matched_results['true_positives']
        duplicates = matched_results['duplicates']
        localization_errors = matched_results['localization_errors']
        false_positives = matched_results['false_positives']
        false_negatives = matched_results['false_negatives']

        # Plot True Positive Detections In Blue
        v = Visualizer(image, meta_catalog, scale=2.0)

        gt_boxes = true_positives['gt_box_means'].cpu().numpy()
        true_positive_boxes = true_positives['predicted_box_means'].cpu(
        ).numpy()
        false_positives_boxes = false_positives['predicted_box_means'].cpu(
        ).numpy()
        duplicates_boxes = duplicates['predicted_box_means'].cpu().numpy()
        localization_errors_boxes = localization_errors[
            'predicted_box_means'].cpu().numpy()

        # Get category labels
        gt_cat_idxs = true_positives['gt_cat_idxs'].cpu().numpy()
        # Get category mapping dictionary:
        train_thing_dataset_id_to_contiguous_id = MetadataCatalog.get(
            cfg.DATASETS.TRAIN[0]).thing_dataset_id_to_contiguous_id
        test_thing_dataset_id_to_contiguous_id = MetadataCatalog.get(
            args.test_dataset).thing_dataset_id_to_contiguous_id

        thing_dataset_id_to_contiguous_id = evaluation_utils.get_test_thing_dataset_id_to_train_contiguous_id_dict(
            cfg, args, train_thing_dataset_id_to_contiguous_id,
            test_thing_dataset_id_to_contiguous_id)
        class_list = MetadataCatalog.get(
            cfg.DATASETS.TRAIN[0]).as_dict()['thing_classes']

        if gt_cat_idxs.shape[0] > 0:
            gt_labels = [
                class_list[thing_dataset_id_to_contiguous_id[gt_class]]
                for gt_class in gt_cat_idxs[:, 0]
            ]
        else:
            gt_labels = []

        if cfg.MODEL.META_ARCHITECTURE != "ProbabilisticRetinaNet":
            if len(true_positives['predicted_cls_probs']) > 0:
                _, true_positive_classes = true_positives[
                    'predicted_cls_probs'][:, :-1].max(1)
            else:
                true_positive_classes = np.array([])

            if len(duplicates['predicted_cls_probs']) > 0:
                _, duplicates_classes = duplicates[
                    'predicted_cls_probs'][:, :-1].max(1)
            else:
                duplicates_classes = np.array([])

            if len(localization_errors['predicted_cls_probs']) > 0:
                _, localization_errors_classes = localization_errors[
                    'predicted_cls_probs'][:, :-1].max(1)
            else:
                localization_errors_classes = np.array([])

            if len(false_positives['predicted_cls_probs']) > 0:
                _, false_positives_classes = false_positives[
                    'predicted_cls_probs'][:, :-1].max(1)
            else:
                false_positives_classes = np.array([])

        else:
            if len(true_positives['predicted_cls_probs']) > 0:
                _, true_positive_classes = true_positives[
                    'predicted_cls_probs'].max(1)
            else:
                true_positive_classes = np.array([])

            if len(duplicates['predicted_cls_probs']) > 0:

                _, duplicates_classes = duplicates['predicted_cls_probs'].max(
                    1)
            else:
                duplicates_classes = np.array([])

            if len(localization_errors['predicted_cls_probs']) > 0:
                _, localization_errors_classes = localization_errors[
                    'predicted_cls_probs'].max(1)
            else:
                localization_errors_classes = np.array([])

            if len(false_positives['predicted_cls_probs']) > 0:
                _, false_positives_classes = false_positives[
                    'predicted_cls_probs'].max(1)
            else:
                false_positives_classes = np.array([])

        if len(true_positives['predicted_cls_probs']) > 0:
            true_positive_classes = true_positive_classes.cpu().numpy()
            true_positive_labels = [
                class_list[tp_class] for tp_class in true_positive_classes
            ]
        else:
            true_positive_labels = []

        if len(duplicates['predicted_cls_probs']) > 0:
            duplicates_classes = duplicates_classes.cpu().numpy()
            duplicates_labels = [
                class_list[d_class] for d_class in duplicates_classes
            ]
        else:
            duplicates_labels = []

        if len(localization_errors['predicted_cls_probs']) > 0:
            localization_errors_classes = localization_errors_classes.cpu(
            ).numpy()
            localization_errors_labels = [
                class_list[le_class]
                for le_class in localization_errors_classes
            ]
        else:
            localization_errors_labels = []

        if len(false_positives['predicted_cls_probs']) > 0:
            false_positives_classes = false_positives_classes.cpu().numpy()
            false_positives_labels = [
                class_list[fp_class] for fp_class in false_positives_classes
            ]
        else:
            false_positives_labels = []

        # Overlay true positives in blue
        _ = v.overlay_instances(boxes=gt_boxes,
                                assigned_colors=['lime' for _ in gt_boxes],
                                labels=gt_labels,
                                alpha=1.0)
        plotted_true_positive_boxes = v.overlay_instances(
            boxes=true_positive_boxes,
            assigned_colors=['dodgerblue' for _ in true_positive_boxes],
            alpha=1.0,
            labels=true_positive_labels)
        cv2.imshow(
            'True positive detections with IOU greater than {}'.format(
                iou_correct),
            cv2.cvtColor(plotted_true_positive_boxes.get_image(),
                         cv2.COLOR_RGB2BGR))

        # Plot False Positive Detections In Red
        v = Visualizer(image, meta_catalog, scale=2.0)

        _ = v.overlay_instances(boxes=gt_boxes,
                                assigned_colors=['lime' for _ in gt_boxes],
                                labels=gt_labels,
                                alpha=0.7)
        plotted_false_positive_boxes = v.overlay_instances(
            boxes=false_positives_boxes,
            assigned_colors=['red' for _ in false_positives_boxes],
            alpha=1.0,
            labels=false_positives_labels)
        cv2.imshow(
            'False positive detections with IOU less than {}'.format(iou_min),
            cv2.cvtColor(plotted_false_positive_boxes.get_image(),
                         cv2.COLOR_RGB2BGR))

        # Plot Duplicates
        v = Visualizer(image, meta_catalog, scale=2.0)

        _ = v.overlay_instances(boxes=gt_boxes,
                                assigned_colors=['lime' for _ in gt_boxes],
                                labels=gt_labels,
                                alpha=0.7)

        plotted_duplicates_boxes = v.overlay_instances(
            boxes=duplicates_boxes,
            assigned_colors=['magenta' for _ in duplicates_boxes],
            alpha=1.0,
            labels=duplicates_labels)
        cv2.imshow(
            'Duplicate Detections',
            cv2.cvtColor(plotted_duplicates_boxes.get_image(),
                         cv2.COLOR_RGB2BGR))

        # Plot localization errors
        v = Visualizer(image, meta_catalog, scale=2.0)

        _ = v.overlay_instances(boxes=gt_boxes,
                                assigned_colors=['lime' for _ in gt_boxes],
                                labels=gt_labels,
                                alpha=0.7)
        plotted_localization_errors_boxes = v.overlay_instances(
            boxes=localization_errors_boxes,
            assigned_colors=['aqua' for _ in localization_errors_boxes],
            alpha=1.0,
            labels=localization_errors_labels)
        cv2.imshow(
            'Detections with localization errors between minimum IOU = {} and maximum IOU = {}'
            .format(iou_min, iou_correct),
            cv2.cvtColor(plotted_localization_errors_boxes.get_image(),
                         cv2.COLOR_RGB2BGR))

        # Plot False Negative Ground Truth In Coral
        if len(false_negatives['gt_box_means']) > 0:
            false_negatives_boxes = false_negatives['gt_box_means'].cpu(
            ).numpy()
            false_negatives_classes = false_negatives['gt_cat_idxs'].cpu(
            ).numpy()
            false_negatives_labels = [
                class_list[thing_dataset_id_to_contiguous_id[gt_class[0]]]
                for gt_class in false_negatives_classes.tolist()
            ]
        else:
            false_negatives_boxes = np.array([])
            false_negatives_labels = []

        v = Visualizer(image, meta_catalog, scale=2.0)

        plotted_false_negative_boxes = v.overlay_instances(
            boxes=false_negatives_boxes,
            assigned_colors=['coral' for _ in false_negatives_boxes],
            alpha=1.0,
            labels=false_negatives_labels)
        cv2.imshow(
            'False negative ground truth.',
            cv2.cvtColor(plotted_false_negative_boxes.get_image(),
                         cv2.COLOR_RGB2BGR))

        cv2.waitKey(0)
        cv2.destroyAllWindows()
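The long architecture-dependent branches above reduce to one difference in how predicted_cls_probs is laid out; a condensed sketch of that logic, with a hypothetical helper name:

import numpy as np

def top_class_indices(predicted_cls_probs, is_retinanet):
    # Condensed sketch of the branching above: RCNN/DETR-style heads append a
    # background column, which is dropped before taking the arg-max, while
    # RetinaNet-style heads emit per-class scores with no background column.
    if len(predicted_cls_probs) == 0:
        return np.array([])
    if is_retinanet:
        _, classes = predicted_cls_probs.max(1)
    else:
        _, classes = predicted_cls_probs[:, :-1].max(1)
    return classes.cpu().numpy()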
Example #7
def main(
        args,
        cfg=None,
        min_allowed_score=None):
    # Setup config
    if cfg is None:
        cfg = setup_config(args, random_seed=args.random_seed, is_testing=True)

    cfg.defrost()
    cfg.ACTUAL_TEST_DATASET = args.test_dataset

    # Build path to gt instances and inference output
    inference_output_dir = get_inference_output_dir(
        cfg['OUTPUT_DIR'],
        args.test_dataset,
        args.inference_config,
        args.image_corruption_level)

    # Get thresholds to perform evaluation on
    if min_allowed_score is None:
        # Check if F-1 Score has been previously computed.
        try:
            with open(os.path.join(inference_output_dir, "mAP_res.txt"), "r") as f:
                min_allowed_score = f.read().strip('][\n').split(', ')[-1]
                min_allowed_score = round(float(min_allowed_score), 4)
        except FileNotFoundError:
            # If not, process all detections. Not recommended as the results might be influenced by very low scoring
            # detections that would normally be removed in robotics/vision
            # applications.
            min_allowed_score = 0.0

    # get preprocessed instances
    preprocessed_predicted_instances, preprocessed_gt_instances = evaluation_utils.get_per_frame_preprocessed_instances(
        cfg, inference_output_dir, min_allowed_score)

    # get metacatalog and image infos
    meta_catalog = MetadataCatalog.get(args.test_dataset)
    with open(meta_catalog.json_file, 'r') as f:
        images_info = json.load(f)['images']

    # Loop over all images and visualize errors
    for image_info in images_info:
        image_id = image_info['id']
        image = cv2.imread(
            os.path.join(
                meta_catalog.image_root,
                image_info['file_name']))
        image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)

        v = ProbabilisticVisualizer(
            image,
            meta_catalog,
            scale=1.5)
        class_list = v.metadata.as_dict()['thing_classes']

        predicted_box_means = preprocessed_predicted_instances['predicted_boxes'][image_id].cpu(
        ).numpy()
        gt_box_means = preprocessed_gt_instances['gt_boxes'][image_id].cpu(
        ).numpy()
        predicted_box_covariances = preprocessed_predicted_instances[
            'predicted_covar_mats'][image_id].cpu(
        ).numpy()

        predicted_cls_probs = preprocessed_predicted_instances['predicted_cls_probs'][image_id]

        if predicted_cls_probs.shape[0] > 0:
            if cfg.MODEL.META_ARCHITECTURE == "ProbabilisticGeneralizedRCNN" or cfg.MODEL.META_ARCHITECTURE == "ProbabilisticDetr":
                predicted_scores, predicted_classes = predicted_cls_probs[:, :-1].max(
                    1)
                predicted_entropies = entropy(
                    predicted_cls_probs.cpu().numpy(), base=2)

            else:
                predicted_scores, predicted_classes = predicted_cls_probs.max(
                    1)
                predicted_entropies = entropy(
                    np.stack(
                        (predicted_scores.cpu().numpy(),
                         1 - predicted_scores.cpu().numpy())),
                    base=2)
            predicted_classes = predicted_classes.cpu(
            ).numpy()
            predicted_classes = [class_list[p_class]
                                 for p_class in predicted_classes]
            assigned_colors = cm.autumn(predicted_entropies)
            predicted_scores = predicted_scores.cpu().numpy()
        else:
            predicted_scores = np.array([])
            predicted_classes = np.array([])
            assigned_colors = []

        gt_cat_idxs = preprocessed_gt_instances['gt_cat_idxs'][image_id].cpu(
        ).numpy()
        thing_dataset_id_to_contiguous_id = meta_catalog.thing_dataset_id_to_contiguous_id
        if gt_cat_idxs.shape[0] > 0:
            gt_labels = [class_list[thing_dataset_id_to_contiguous_id[gt_class]]
                         for gt_class in gt_cat_idxs[:, 0]]
        else:
            gt_labels = []

        # noinspection PyTypeChecker
        _ = v.overlay_covariance_instances(
            boxes=gt_box_means,
            assigned_colors=[
                'lightgreen' for _ in gt_box_means],
            labels=gt_labels,
            alpha=1.0)
        plotted_detections = v.overlay_covariance_instances(
            boxes=predicted_box_means,
            covariance_matrices=predicted_box_covariances,
            assigned_colors=assigned_colors,
            alpha=1.0,
            labels=predicted_classes)

        cv2.imshow(
            'Detected Instances.',
            cv2.cvtColor(
                plotted_detections.get_image(),
                cv2.COLOR_RGB2BGR))
        cv2.waitKey()
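The entropy-based colouring above can be isolated into a small helper; a sketch assuming scipy and matplotlib are available (the helper name and signature are illustrative, not from the code base):

import numpy as np
from matplotlib import cm
from scipy.stats import entropy

def entropy_colors(cls_probs, has_background_column):
    # Higher classification entropy maps further along the 'autumn' colormap,
    # mirroring the visualization above. cls_probs is an (N, C) numpy array.
    if has_background_column:
        # Full categorical distribution (RCNN/DETR-style heads).
        ent = entropy(cls_probs.T, base=2)
    else:
        # Per-class foreground scores (RetinaNet-style heads): treat the top
        # score as a Bernoulli distribution.
        scores = cls_probs.max(1)
        ent = entropy(np.stack((scores, 1.0 - scores)), base=2)
    return cm.autumn(ent)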
Example #8
def main(
        args,
        cfg=None,
        iou_min=None,
        iou_correct=None,
        min_allowed_score=None):
    # Setup config
    if cfg is None:
        cfg = setup_config(args, random_seed=args.random_seed, is_testing=True)

    cfg.defrost()
    cfg.ACTUAL_TEST_DATASET = args.test_dataset

    # Set up number of torch CPU threads
    torch.set_num_threads(cfg.DATALOADER.NUM_WORKERS)

    # Build path to gt instances and inference output
    inference_output_dir = os.path.join(
        cfg['OUTPUT_DIR'],
        'inference',
        args.test_dataset,
        os.path.split(args.inference_config)[-1][:-5])

    # Get thresholds to perform evaluation on
    if iou_min is None:
        iou_min = args.iou_min
    if iou_correct is None:
        iou_correct = args.iou_correct

    if min_allowed_score is None:
        # Check if F-1 Score has been previously computed ON THE ORIGINAL
        # DATASET such as COCO even when evaluating on VOC.
        try:
            train_set_inference_output_dir = os.path.join(
                cfg['OUTPUT_DIR'],
                'inference',
                cfg.DATASETS.TEST[0],
                os.path.split(args.inference_config)[-1][:-5])
            with open(os.path.join(train_set_inference_output_dir, "mAP_res.txt"), "r") as f:
                min_allowed_score = f.read().strip('][\n').split(', ')[-1]
                min_allowed_score = round(float(min_allowed_score), 4)
        except FileNotFoundError:
            # If not, process all detections. Not recommended as the results might be influenced by very low scoring
            # detections that would normally be removed in robotics/vision
            # applications.
            min_allowed_score = 0.0

    # Get category mapping dictionary:
    train_thing_dataset_id_to_contiguous_id = MetadataCatalog.get(
        cfg.DATASETS.TRAIN[0]).thing_dataset_id_to_contiguous_id
    test_thing_dataset_id_to_contiguous_id = MetadataCatalog.get(
        args.test_dataset).thing_dataset_id_to_contiguous_id

    cat_mapping_dict = get_thing_dataset_id_to_contiguous_id_dict(
        cfg,
        args,
        train_thing_dataset_id_to_contiguous_id,
        test_thing_dataset_id_to_contiguous_id)

    # Get matched results by either generating them or loading from file.
    with torch.no_grad():
        matched_results = evaluation_utils.get_matched_results(
            cfg, inference_output_dir,
            iou_min=iou_min,
            iou_correct=iou_correct,
            min_allowed_score=min_allowed_score)

        # Build preliminary dicts required for computing classification scores.
        for matched_results_key in matched_results.keys():
            if 'gt_cat_idxs' in matched_results[matched_results_key].keys():
                # First we convert the written things indices to contiguous
                # indices.
                gt_converted_cat_idxs = matched_results[matched_results_key]['gt_cat_idxs'].squeeze(
                    1)
                gt_converted_cat_idxs = torch.as_tensor([cat_mapping_dict[class_idx.cpu(
                ).tolist()] for class_idx in gt_converted_cat_idxs]).to(device)
                matched_results[matched_results_key]['gt_converted_cat_idxs'] = gt_converted_cat_idxs.to(
                    device)
                if 'predicted_cls_probs' in matched_results[matched_results_key].keys(
                ):
                    predicted_cls_probs = matched_results[matched_results_key]['predicted_cls_probs']
                    # This is required for evaluation of retinanet based
                    # detections.
                    matched_results[matched_results_key]['predicted_score_of_gt_category'] = torch.gather(
                        predicted_cls_probs, 1, gt_converted_cat_idxs.unsqueeze(1)).squeeze(1)
                matched_results[matched_results_key]['gt_cat_idxs'] = gt_converted_cat_idxs
            else:
                # For false positives, the correct category is background. For retinanet, since no explicit
                # background category is available, this value is computed as 1.0 - score of the predicted
                # category.
                predicted_class_probs, predicted_class_idx = matched_results[matched_results_key]['predicted_cls_probs'].max(
                    1)
                matched_results[matched_results_key]['predicted_score_of_gt_category'] = 1.0 - \
                    predicted_class_probs
                matched_results[matched_results_key]['predicted_cat_idxs'] = predicted_class_idx

        # Load the different detection partitions
        true_positives = matched_results['true_positives']
        false_negatives = matched_results['false_negatives']
        false_positives = matched_results['false_positives']

        # Get the number of elements in each partition
        num_true_positives = true_positives['predicted_box_means'].shape[0]
        num_false_negatives = false_negatives['gt_box_means'].shape[0]
        num_false_positives = false_positives['predicted_box_means'].shape[0]

        per_class_output_list = []
        for class_idx in [1, 3]:
            true_positives_valid_idxs = true_positives['gt_converted_cat_idxs'] == class_idx
            false_positives_valid_idxs = false_positives['predicted_cat_idxs'] == class_idx

            # Compute classification metrics for every partition
            true_positives_cls_analysis = scoring_rules.retinanet_compute_cls_scores(
                true_positives, true_positives_valid_idxs)
            false_positives_cls_analysis = scoring_rules.retinanet_compute_cls_scores(
                false_positives, false_positives_valid_idxs)

            # Compute regression metrics for every partition
            true_positives_reg_analysis = scoring_rules.compute_reg_scores(
                true_positives, true_positives_valid_idxs)
            false_positives_reg_analysis = scoring_rules.compute_reg_scores_fn(
                false_positives, false_positives_valid_idxs)

            per_class_output_list.append(
                {'true_positives_cls_analysis': true_positives_cls_analysis,
                 'true_positives_reg_analysis': true_positives_reg_analysis,
                 'false_positives_cls_analysis': false_positives_cls_analysis,
                 'false_positives_reg_analysis': false_positives_reg_analysis})

        final_accumulated_output_dict = dict()
        final_average_output_dict = dict()

        for key in per_class_output_list[0].keys():
            average_output_dict = dict()
            for inner_key in per_class_output_list[0][key].keys():
                collected_values = [per_class_output[key][inner_key]
                                    for per_class_output in per_class_output_list if per_class_output[key][inner_key] is not None]
                collected_values = np.array(collected_values)

                if key in average_output_dict.keys():
                    # Use nanmean since some classes have no duplicates, or only a
                    # single duplicate, in which case torch.std returns NaN; we
                    # handle those here. This should not affect the final results,
                    # as it only changes the inter-class variance, which we do not
                    # report anyway.
                    average_output_dict[key].update(
                        {inner_key: np.nanmean(collected_values)})
                    final_accumulated_output_dict[key].update(
                        {inner_key: collected_values})
                else:
                    average_output_dict.update(
                        {key: {inner_key: np.nanmean(collected_values)}})
                    final_accumulated_output_dict.update(
                        {key: {inner_key: collected_values}})

            final_average_output_dict.update(average_output_dict)
        # Summarize and print all
        table = PrettyTable()
        table.field_names = (['Output Type',
                              'Number of Instances',
                              'Cls Ignorance Score',
                              'Reg Ignorance Score'])
        table.add_row(
            [
                "True Positives:",
                num_true_positives,
                '{:.4f}'.format(
                    final_average_output_dict['true_positives_cls_analysis']['ignorance_score_mean']),
                '{:.4f}'.format(
                    final_average_output_dict['true_positives_reg_analysis']['ignorance_score_mean'])])

        table.add_row(
            [
                "False Positives:",
                num_false_positives,
                '{:.4f}'.format(
                    final_average_output_dict['false_positives_cls_analysis']['ignorance_score_mean']),
                '{:.4f}'.format(
                    final_average_output_dict['false_positives_reg_analysis']['total_entropy_mean'])])

        table.add_row(["False Negatives:",
                       num_false_negatives,
                       '-',
                       '-'])
        print(table)
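The per-class accumulation loop above can be reduced to a small pattern: for every metric key, gather the per-class values and average them with np.nanmean so that classes with too few matched detections (NaN entries) do not poison the mean. A reduced sketch:

import numpy as np

def average_per_class_metrics(per_class_output_list):
    # Reduced sketch of the accumulation loop above.
    averaged = {}
    for key in per_class_output_list[0]:
        for inner_key in per_class_output_list[0][key]:
            values = np.array([
                per_class[key][inner_key]
                for per_class in per_class_output_list
                if per_class[key][inner_key] is not None
            ])
            averaged.setdefault(key, {})[inner_key] = np.nanmean(values)
    return averaged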
Example #9
def main(args):
    cfg = setup_config(args,
                       random_seed=args.random_seed,
                       is_testing=True)

    inference_output_dir = get_inference_output_dir(
        cfg['OUTPUT_DIR'],
        args.test_dataset,
        args.inference_config,
        args.image_corruption_level)

    # Check if F-1 Score has been previously computed ON THE ORIGINAL
    # DATASET such as COCO even when evaluating on OpenImages.
    try:
        train_set_inference_output_dir = get_inference_output_dir(
            cfg['OUTPUT_DIR'],
            cfg.DATASETS.TEST[0],
            args.inference_config,
            0)
        with open(os.path.join(train_set_inference_output_dir, "mAP_res.txt"), "r") as f:
            min_allowed_score = f.read().strip('][\n').split(', ')[-1]
            min_allowed_score = round(float(min_allowed_score), 4)
    except FileNotFoundError:
        # If not, process all detections. Not recommended as the results might be influenced by very low scoring
        # detections that would normally be removed in robotics/vision
        # applications.
        min_allowed_score = 0.0

    iou_thresholds = np.arange(0.5, 1.0, 0.05).round(2)

    probabilistic_detection_dicts = []
    calibration_dicts = []

    for iou_correct in iou_thresholds:
        print("Processing detections at {} iou threshold...".format(iou_correct))
        probabilistic_scores_file_name = os.path.join(
            inference_output_dir, 'probabilistic_scoring_res_{}_{}_{}.pkl'.format(
                args.iou_min, iou_correct, min_allowed_score))
        calibration_file_name = os.path.join(
            inference_output_dir, 'calibration_errors_res_{}_{}_{}.pkl'.format(
                args.iou_min, iou_correct, min_allowed_score))

        try:
            with open(probabilistic_scores_file_name, "rb") as f:
                probabilistic_scores = pickle.load(f)
        except FileNotFoundError:
            compute_probabilistic_metrics.main(
                args, cfg, iou_correct=iou_correct, print_results=False)
            with open(probabilistic_scores_file_name, "rb") as f:
                probabilistic_scores = pickle.load(f)

        try:
            with open(calibration_file_name, "rb") as f:
                calibration_errors = pickle.load(f)
        except FileNotFoundError:
            compute_calibration_errors.main(
                args, cfg, iou_correct=iou_correct, print_results=False)
            with open(calibration_file_name, "rb") as f:
                calibration_errors = pickle.load(f)

        probabilistic_detection_dicts.append(probabilistic_scores)
        calibration_dicts.append(calibration_errors)

    probabilistic_detection_final_dict = {
        key: {} for key in probabilistic_detection_dicts[0].keys()}
    for key in probabilistic_detection_dicts[0].keys():
        for key_l2 in probabilistic_detection_dicts[0][key].keys():
            accumulated_values = [
                probabilistic_detection_dicts[i][key][key_l2] for i in range(
                    len(probabilistic_detection_dicts))]
            probabilistic_detection_final_dict[key].update(
                {key_l2: np.nanmean(np.array(accumulated_values), 0)})

    calibration_final_dict = {key: None for key in calibration_dicts[0].keys()}
    for key in calibration_dicts[0].keys():
        accumulated_values = [
            calibration_dicts[i][key] for i in range(
                len(calibration_dicts))]
        calibration_final_dict[key] = np.nanmean(
            np.array(accumulated_values), 0)

    dictionary_file_name = os.path.join(
        inference_output_dir,
        'probabilistic_scoring_res_averaged_{}.pkl'.format(min_allowed_score))
    with open(dictionary_file_name, "wb") as pickle_file:
        pickle.dump(probabilistic_detection_final_dict, pickle_file)

    dictionary_file_name = os.path.join(
        inference_output_dir, 'calibration_res_averaged_{}.pkl'.format(
            min_allowed_score))
    with open(dictionary_file_name, "wb") as pickle_file:
        pickle.dump(calibration_final_dict, pickle_file)

    # Summarize and print all
    table = PrettyTable()
    table.field_names = (['Output Type',
                          'Cls Ignorance Score',
                          'Cls Brier/Probability Score',
                          'Reg Ignorance Score',
                          'Reg Energy Score'])

    table.add_row(
        [
            "True Positives:",
            '{:.4f}'.format(
                np.nanmean(probabilistic_detection_final_dict['true_positives_cls_analysis']['ignorance_score_mean'])),
            '{:.4f}'.format(
                np.nanmean(probabilistic_detection_final_dict['true_positives_cls_analysis']['brier_score_mean'])),
            '{:.4f}'.format(
                np.nanmean(probabilistic_detection_final_dict['true_positives_reg_analysis']['ignorance_score_mean'])),
            '{:.4f}'.format(
                np.nanmean(probabilistic_detection_final_dict['true_positives_reg_analysis']['energy_score_mean']))])
    table.add_row(
        [
            "Duplicates:",
            '{:.4f}'.format(
                np.nanmean(probabilistic_detection_final_dict['duplicates_cls_analysis']['ignorance_score_mean'])),
            '{:.4f}'.format(
                np.nanmean(probabilistic_detection_final_dict['duplicates_cls_analysis']['brier_score_mean'])),
            '{:.4f}'.format(
                np.nanmean(probabilistic_detection_final_dict['duplicates_reg_analysis']['ignorance_score_mean'])),
            '{:.4f}'.format(
                np.nanmean(probabilistic_detection_final_dict['duplicates_reg_analysis']['energy_score_mean']))])
    table.add_row(
        [
            "Localization Errors:",
            '{:.4f}'.format(
                np.nanmean(probabilistic_detection_final_dict['localization_errors_cls_analysis']['ignorance_score_mean'])),
            '{:.4f}'.format(
                np.nanmean(probabilistic_detection_final_dict['localization_errors_cls_analysis']['brier_score_mean'])),
            '{:.4f}'.format(
                np.nanmean(probabilistic_detection_final_dict['localization_errors_reg_analysis']['ignorance_score_mean'])),
            '{:.4f}'.format(
                np.nanmean(probabilistic_detection_final_dict['localization_errors_reg_analysis']['energy_score_mean']))])
    table.add_row(
        [
            "False Positives:",
            '{:.4f}'.format(
                np.nanmean(probabilistic_detection_final_dict['false_positives_cls_analysis']['ignorance_score_mean'])),
            '{:.4f}'.format(
                np.nanmean(probabilistic_detection_final_dict['false_positives_cls_analysis']['brier_score_mean'])),
            '{:.4f}'.format(
                np.nanmean(probabilistic_detection_final_dict['false_positives_reg_analysis']['total_entropy_mean'])),
            '{:.4f}'.format(
                np.nanmean(probabilistic_detection_final_dict['false_positives_reg_analysis']['fp_energy_score_mean']))])

    print(table)
    text_file_name = os.path.join(
        inference_output_dir,
        'probabilistic_scoring_res_averaged_{}.txt'.format(
            min_allowed_score))

    with open(text_file_name, "w") as text_file:
        print(table, file=text_file)

    table = PrettyTable()
    table.field_names = (['Cls Marginal Calibration Error',
                          'Reg Expected Calibration Error',
                          'Reg Maximum Calibration Error'])

    table.add_row(
        [
            '{:.4f}'.format(
                calibration_final_dict['cls_marginal_calibration_error']), '{:.4f}'.format(
                calibration_final_dict['reg_expected_calibration_error']), '{:.4f}'.format(
                    calibration_final_dict['reg_maximum_calibration_error'])])

    text_file_name = os.path.join(
        inference_output_dir,
        'calibration_res_averaged_{}.txt'.format(
            min_allowed_score))

    with open(text_file_name, "w") as text_file:
        print(table, file=text_file)

    print(table)
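The averaging over IoU thresholds above follows the same shape for every nested dictionary; a minimal sketch of the pattern:

import numpy as np

def average_over_thresholds(dicts):
    # `dicts` holds one nested result dictionary per IoU threshold; every leaf
    # entry is averaged element-wise with nanmean, as done above.
    averaged = {key: {} for key in dicts[0]}
    for key in dicts[0]:
        for inner_key in dicts[0][key]:
            stacked = np.array([d[key][inner_key] for d in dicts])
            averaged[key][inner_key] = np.nanmean(stacked, 0)
    return averaged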
def get_matched_results_dicts(config_names,
                              configs_list,
                              inference_configs_list,
                              iou_min=0.1,
                              iou_correct=0.5):

    # Level 0 is the COCO validation set with no corruption, level 10 is
    # OpenImages, and level 11 is OpenImages OOD.
    image_corruption_levels = [0, 10, 11]

    test_dataset_coco = "coco_2017_custom_val"
    test_dataset_open_images = "openimages_val"
    test_dataset_open_images_odd = "openimages_odd_val"

    arg_parser = setup_arg_parser()
    args = arg_parser.parse_args()

    # Initialize dataframe dict
    res_dict_clean = defaultdict(
        lambda: defaultdict(lambda: defaultdict(list)))

    for config_name, config, inference_config_name in zip(
            config_names, configs_list, inference_configs_list):
        # Setup config
        args.config_file = config
        args.inference_config = inference_config_name
        args.test_dataset = test_dataset_coco
        cfg = setup_config(args, random_seed=args.random_seed, is_testing=True)
        cfg.defrost()

        # Read coco dataset results
        cfg.ACTUAL_TEST_DATASET = args.test_dataset

        for image_corruption_level in image_corruption_levels:
            # Build path to gt instances and inference output
            args.image_corruption_level = image_corruption_level

            if image_corruption_level == 0:
                image_corruption_level = 'Val'
            elif image_corruption_level == 10:
                image_corruption_level = 'OpenIm'
            elif image_corruption_level == 11:
                image_corruption_level = 'OpenIm OOD'
            else:
                image_corruption_level = 'C' + str(image_corruption_level)
            if 'OpenIm' not in image_corruption_level:
                inference_output_dir = get_inference_output_dir(
                    cfg['OUTPUT_DIR'], args.test_dataset,
                    args.inference_config, args.image_corruption_level)

                # Get matched results by either generating them or loading from
                # file.

                dictionary_file_name = glob.glob(
                    os.path.join(
                        inference_output_dir,
                        "matched_results_{}_{}_*.pth".format(
                            iou_min, iou_correct)))[0]

                matched_results = torch.load(dictionary_file_name,
                                             map_location='cuda')
            elif image_corruption_level == 'OpenIm':
                args.image_corruption_level = 0
                args.test_dataset = test_dataset_open_images
                inference_output_dir = get_inference_output_dir(
                    cfg['OUTPUT_DIR'], args.test_dataset,
                    args.inference_config, args.image_corruption_level)
                dictionary_file_name = glob.glob(
                    os.path.join(
                        inference_output_dir,
                        "matched_results_{}_{}_*.pth".format(
                            iou_min, iou_correct)))[0]
                matched_results = torch.load(dictionary_file_name,
                                             map_location='cuda')
            else:
                args.image_corruption_level = 0
                args.test_dataset = test_dataset_open_images_odd
                inference_output_dir = get_inference_output_dir(
                    cfg['OUTPUT_DIR'], args.test_dataset,
                    args.inference_config, args.image_corruption_level)
                dictionary_file_name = glob.glob(
                    os.path.join(
                        inference_output_dir,
                        "preprocessed_predicted_instances_odd_*.pth"))[0]
                preprocessed_predicted_instances = torch.load(
                    dictionary_file_name, map_location='cuda')

                predicted_boxes = preprocessed_predicted_instances[
                    'predicted_boxes']
                predicted_cov_mats = preprocessed_predicted_instances[
                    'predicted_covar_mats']
                predicted_cls_probs = preprocessed_predicted_instances[
                    'predicted_cls_probs']

                predicted_boxes = list(
                    itertools.chain.from_iterable([
                        predicted_boxes[key] for key in predicted_boxes.keys()
                    ]))
                predicted_cov_mats = list(
                    itertools.chain.from_iterable([
                        predicted_cov_mats[key]
                        for key in predicted_cov_mats.keys()
                    ]))
                predicted_cls_probs = list(
                    itertools.chain.from_iterable([
                        predicted_cls_probs[key]
                        for key in predicted_cls_probs.keys()
                    ]))

                predicted_boxes = torch.stack(predicted_boxes,
                                              1).transpose(0, 1)
                predicted_cov_mats = torch.stack(predicted_cov_mats,
                                                 1).transpose(0, 1)
                predicted_cls_probs = torch.stack(predicted_cls_probs,
                                                  1).transpose(0, 1)
                matched_results = {
                    'predicted_box_means': predicted_boxes,
                    'predicted_box_covariances': predicted_cov_mats,
                    'predicted_cls_probs': predicted_cls_probs
                }

            if image_corruption_level != 'OpenIm OOD':
                all_results_means = torch.cat((
                    matched_results['true_positives']['predicted_box_means'],
                    matched_results['localization_errors']
                    ['predicted_box_means'],
                    matched_results['duplicates']['predicted_box_means'],
                    matched_results['false_positives']['predicted_box_means']))

                all_results_covs = torch.cat((
                    matched_results['true_positives']
                    ['predicted_box_covariances'],
                    matched_results['localization_errors']
                    ['predicted_box_covariances'],
                    matched_results['duplicates']['predicted_box_covariances'],
                    matched_results['false_positives']
                    ['predicted_box_covariances']))

                all_gt_means = torch.cat((
                    matched_results['true_positives']['gt_box_means'],
                    matched_results['localization_errors']['gt_box_means'],
                    matched_results['duplicates']['gt_box_means'],
                    matched_results['false_positives']['predicted_box_means'] *
                    np.NaN))

                predicted_multivariate_normal_dists = torch.distributions.multivariate_normal.MultivariateNormal(
                    all_results_means.to('cpu'),
                    all_results_covs.to('cpu') +
                    1e-2 * torch.eye(all_results_covs.shape[2]).to('cpu'))
                predicted_multivariate_normal_dists.loc = predicted_multivariate_normal_dists.loc.to(
                    'cuda')
                predicted_multivariate_normal_dists.scale_tril = predicted_multivariate_normal_dists.scale_tril.to(
                    'cuda')
                predicted_multivariate_normal_dists._unbroadcasted_scale_tril = predicted_multivariate_normal_dists._unbroadcasted_scale_tril.to(
                    'cuda')
                predicted_multivariate_normal_dists.covariance_matrix = predicted_multivariate_normal_dists.covariance_matrix.to(
                    'cuda')
                predicted_multivariate_normal_dists.precision_matrix = predicted_multivariate_normal_dists.precision_matrix.to(
                    'cuda')
                all_entropy = predicted_multivariate_normal_dists.entropy()

                all_log_prob = -predicted_multivariate_normal_dists.log_prob(
                    all_gt_means)
                # Energy Score.
                sample_set = predicted_multivariate_normal_dists.sample(
                    (3, )).to('cuda')
                sample_set_1 = sample_set[:-1]
                sample_set_2 = sample_set[1:]

                energy_score = torch.norm(
                    (sample_set_1 - all_gt_means),
                    dim=2).mean(0) - 0.5 * torch.norm(
                        (sample_set_1 - sample_set_2), dim=2).mean(0)

                mse_loss = torch.nn.MSELoss(reduction='none')
                mse = mse_loss(all_gt_means, all_results_means).mean(1)

                res_dict_clean[config_name][image_corruption_level][
                    'Entropy'].extend(all_entropy.cpu().numpy())

                res_dict_clean[config_name][image_corruption_level][
                    'MSE'].extend(mse.cpu().numpy())
                res_dict_clean[config_name][image_corruption_level][
                    'NLL'].extend(all_log_prob.cpu().numpy())
                res_dict_clean[config_name][image_corruption_level][
                    'ED'].extend(energy_score.cpu().numpy())

                res_dict_clean[config_name][image_corruption_level][
                    'IOU With GT'].extend(
                        torch.cat(
                            (matched_results['true_positives']
                             ['iou_with_ground_truth'],
                             matched_results['localization_errors']
                             ['iou_with_ground_truth'][:, 0],
                             matched_results['duplicates']
                             ['iou_with_ground_truth'],
                             torch.zeros(
                                 matched_results['false_positives']
                                 ['predicted_box_means'].shape[0]).to('cuda') *
                             np.NaN)).cpu().numpy())

                predicted_multivariate_normal_dists = torch.distributions.multivariate_normal.MultivariateNormal(
                    matched_results['false_positives']
                    ['predicted_box_means'].to('cpu'),
                    matched_results['false_positives']
                    ['predicted_box_covariances'].to('cpu') +
                    1e-2 * torch.eye(matched_results['false_positives'][
                        'predicted_box_covariances'].shape[2]).to('cpu'))
                predicted_multivariate_normal_dists.loc = predicted_multivariate_normal_dists.loc.to(
                    'cuda')
                predicted_multivariate_normal_dists.scale_tril = predicted_multivariate_normal_dists.scale_tril.to(
                    'cuda')
                predicted_multivariate_normal_dists._unbroadcasted_scale_tril = predicted_multivariate_normal_dists._unbroadcasted_scale_tril.to(
                    'cuda')
                predicted_multivariate_normal_dists.covariance_matrix = predicted_multivariate_normal_dists.covariance_matrix.to(
                    'cuda')
                predicted_multivariate_normal_dists.precision_matrix = predicted_multivariate_normal_dists.precision_matrix.to(
                    'cuda')
                FP_Entropy = predicted_multivariate_normal_dists.entropy()
                res_dict_clean[config_name][image_corruption_level][
                    'FP_Entropy'].extend(FP_Entropy.cpu().numpy())

                predicted_cat_dists_fp = matched_results['false_positives'][
                    'predicted_cls_probs']

                if predicted_cat_dists_fp.shape[1] == 80:
                    predicted_cat_dists_fp, _ = predicted_cat_dists_fp.max(
                        dim=1)
                    predicted_cat_dists_fp = 1 - predicted_cat_dists_fp
                    predicted_categorical_dists = torch.distributions.Bernoulli(
                        probs=predicted_cat_dists_fp)
                else:
                    predicted_categorical_dists = torch.distributions.Categorical(
                        probs=matched_results['false_positives']
                        ['predicted_cls_probs'])

                all_pred_ent = predicted_categorical_dists.entropy()
                res_dict_clean[config_name][image_corruption_level][
                    'Cat_Entropy'].extend(all_pred_ent.cpu().numpy())

                if image_corruption_level == 'OpenIm':
                    res_dict_clean[config_name][image_corruption_level][
                        'Truncated'].extend(
                            torch.cat(
                                (matched_results['true_positives']
                                 ['is_truncated'],
                                 matched_results['localization_errors']
                                 ['is_truncated'],
                                 matched_results['duplicates']['is_truncated'],
                                 torch.full(
                                     (matched_results['false_positives']
                                      ['predicted_box_means'].shape[0], ),
                                     -1,
                                     dtype=torch.float32).to('cuda') *
                                 np.NaN)).cpu().numpy())
                    res_dict_clean[config_name][image_corruption_level][
                        'Occluded'].extend(
                            torch.cat(
                                (matched_results['true_positives']
                                 ['is_occluded'],
                                 matched_results['localization_errors']
                                 ['is_occluded'],
                                 matched_results['duplicates']['is_occluded'],
                                 torch.full(
                                     (matched_results['false_positives']
                                      ['predicted_box_means'].shape[0], ),
                                     -1,
                                     dtype=torch.float32).to('cuda') *
                                 np.NaN)).cpu().numpy())
                else:
                    res_dict_clean[config_name][image_corruption_level][
                        'Truncated'].extend(
                            torch.cat(
                                (torch.full(
                                    (matched_results['true_positives']
                                     ['predicted_box_means'].shape[0], ),
                                    -1,
                                    dtype=torch.float32).to('cuda') * np.NaN,
                                 torch.full(
                                     (matched_results['localization_errors']
                                      ['predicted_box_means'].shape[0], ),
                                     -1,
                                      dtype=torch.float32).to('cuda') * np.NaN,
                                 torch.full(
                                     (matched_results['duplicates']
                                      ['predicted_box_means'].shape[0], ),
                                     -1,
                                      dtype=torch.float32).to('cuda') * np.NaN,
                                 torch.full(
                                     (matched_results['false_positives']
                                      ['predicted_box_means'].shape[0], ),
                                     -1,
                                     dtype=torch.float32).to('cuda') *
                                 np.NaN)).cpu().numpy())
                    res_dict_clean[config_name][image_corruption_level][
                        'Occluded'].extend(
                            torch.cat(
                                (torch.full(
                                    (matched_results['true_positives']
                                     ['predicted_box_means'].shape[0], ),
                                    -1,
                                    dtype=torch.float32).to('cuda') * np.NaN,
                                 torch.full(
                                     (matched_results['localization_errors']
                                      ['predicted_box_means'].shape[0], ),
                                     -1,
                                     dtype=torch.float32).to('cuda') * np.NaN,
                                 torch.full(
                                     (matched_results['duplicates']
                                      ['predicted_box_means'].shape[0], ),
                                     -1,
                                     dtype=torch.float32).to('cuda') * np.NaN,
                                 torch.full(
                                     (matched_results['false_positives']
                                      ['predicted_box_means'].shape[0], ),
                                     -1,
                                     dtype=torch.float32).to('cuda') *
                                 np.NaN)).cpu().numpy())
            else:
                predicted_multivariate_normal_dists = torch.distributions.multivariate_normal.MultivariateNormal(
                    matched_results['predicted_box_means'].to('cpu'),
                    matched_results['predicted_box_covariances'].to('cpu') +
                    1e-2 *
                    torch.eye(matched_results['predicted_box_covariances'].
                              shape[2]).to('cpu'))
                predicted_multivariate_normal_dists.loc = predicted_multivariate_normal_dists.loc.to(
                    'cuda')
                predicted_multivariate_normal_dists.scale_tril = predicted_multivariate_normal_dists.scale_tril.to(
                    'cuda')
                predicted_multivariate_normal_dists._unbroadcasted_scale_tril = predicted_multivariate_normal_dists._unbroadcasted_scale_tril.to(
                    'cuda')
                predicted_multivariate_normal_dists.covariance_matrix = predicted_multivariate_normal_dists.covariance_matrix.to(
                    'cuda')
                predicted_multivariate_normal_dists.precision_matrix = predicted_multivariate_normal_dists.precision_matrix.to(
                    'cuda')
                all_entropy = predicted_multivariate_normal_dists.entropy()
                res_dict_clean[config_name][image_corruption_level][
                    'FP_Entropy'].extend(all_entropy.cpu().numpy())
                res_dict_clean[config_name][image_corruption_level][
                    'IOU With GT'].extend(
                        torch.zeros(matched_results['predicted_box_means'].
                                    shape[0]).cpu().numpy())
                res_dict_clean[config_name][image_corruption_level][
                    'Truncated'].extend(
                        torch.full((
                            matched_results['predicted_box_means'].shape[0], ),
                                   -1,
                                   dtype=torch.float32).cpu().numpy() * np.NaN)
                res_dict_clean[config_name][image_corruption_level][
                    'Occluded'].extend(
                        torch.full((
                            matched_results['predicted_box_means'].shape[0], ),
                                   -1,
                                   dtype=torch.float32).cpu().numpy() * np.NaN)

                all_results_cat = matched_results['predicted_cls_probs']
                if all_results_cat.shape[1] == 80:
                    predicted_cat_dists_fp, _ = all_results_cat.max(dim=1)
                    predicted_cat_dists_fp = 1 - predicted_cat_dists_fp
                    predicted_categorical_dists = torch.distributions.Bernoulli(
                        probs=predicted_cat_dists_fp)
                else:
                    predicted_categorical_dists = torch.distributions.Categorical(
                        probs=all_results_cat)

                all_pred_ent = predicted_categorical_dists.entropy()
                res_dict_clean[config_name][image_corruption_level][
                    'Cat_Entropy'].extend(all_pred_ent.cpu().numpy())

    return res_dict_clean
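
Note (added for clarity, not part of the original script): both branches above reduce the predicted box covariances and class probabilities to scalar entropies via torch.distributions. Below is a minimal, self-contained sketch of that reduction with made-up tensor shapes, CPU only.

# Minimal sketch of the entropy computations above (hypothetical shapes, CPU only).
import torch

num_boxes, num_classes = 4, 81
box_means = torch.rand(num_boxes, 4)
box_covs = 0.5 * torch.eye(4).repeat(num_boxes, 1, 1)
cls_probs = torch.softmax(torch.rand(num_boxes, num_classes), dim=1)

# Gaussian entropy of each predicted box distribution; the small diagonal term
# keeps the covariance positive definite, mirroring the 1e-2 * eye term above.
mvn = torch.distributions.MultivariateNormal(box_means,
                                             box_covs + 1e-2 * torch.eye(4))
box_entropy = mvn.entropy()        # shape: (num_boxes,)

# Categorical entropy of each predicted class distribution.
cat_entropy = torch.distributions.Categorical(probs=cls_probs).entropy()

print(box_entropy.shape, cat_entropy.shape)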
def get_clean_results_dict(config_names, configs_list, inference_configs_list):

    # Level 0 is coco validation set with no corruption, level 10 is open
    # images, level 11 is open images ood
    image_corruption_levels = [0, 1, 3, 5, 10, 11]

    test_dataset_coco = "coco_2017_custom_val"
    test_dataset_open_images = "openimages_val"
    test_dataset_open_images_odd = "openimages_odd_val"

    arg_parser = setup_arg_parser()
    args = arg_parser.parse_args()

    # Initiate dataframe dict
    res_dict_clean = defaultdict(lambda: defaultdict(list))

    for config_name, config, inference_config_name in zip(
            config_names, configs_list, inference_configs_list):
        # Setup config
        args.config_file = config
        args.inference_config = inference_config_name
        args.test_dataset = test_dataset_coco
        cfg = setup_config(args, random_seed=args.random_seed, is_testing=True)
        cfg.defrost()

        # Read coco dataset results
        cfg.ACTUAL_TEST_DATASET = args.test_dataset

        for image_corruption_level in image_corruption_levels:
            # Build path to gt instances and inference output
            args.image_corruption_level = image_corruption_level

            if image_corruption_level == 0:
                image_corruption_level = 'Val'
            elif image_corruption_level == 10:
                image_corruption_level = 'OpenIm'
            elif image_corruption_level == 11:
                image_corruption_level = 'OpenIm OOD'
            else:
                image_corruption_level = 'C' + str(image_corruption_level)
            if 'OpenIm' not in image_corruption_level:
                inference_output_dir = get_inference_output_dir(
                    cfg['OUTPUT_DIR'], args.test_dataset,
                    args.inference_config, args.image_corruption_level)

                dictionary_file_name = glob.glob(
                    os.path.join(
                        inference_output_dir,
                        'probabilistic_scoring_res_averaged_*.pkl'))[0]
            else:
                args.image_corruption_level = 0
                args.test_dataset = (
                    test_dataset_open_images
                    if image_corruption_level == 'OpenIm' else
                    test_dataset_open_images_odd)
                inference_output_dir = get_inference_output_dir(
                    cfg['OUTPUT_DIR'], args.test_dataset,
                    args.inference_config, args.image_corruption_level)
                prob_dict_name = (
                    'probabilistic_scoring_res_averaged_*.pkl'
                    if image_corruption_level == 'OpenIm' else
                    'probabilistic_scoring_res_odd_*.pkl')
                dictionary_file_name = glob.glob(
                    os.path.join(inference_output_dir, prob_dict_name))[0]

            with open(dictionary_file_name, "rb") as pickle_file:
                res_dict = pickle.load(pickle_file)

                if image_corruption_level != 'OpenIm OOD':
                    # True Positives Results
                    res_dict_clean['True Positives'][
                        'Negative Log Likelihood (Classification)'].extend(
                            res_dict['true_positives_cls_analysis']
                            ['ignorance_score_mean'])
                    res_dict_clean['True Positives']['Brier Score'].extend(
                        res_dict['true_positives_cls_analysis']
                        ['brier_score_mean'])
                    res_dict_clean['True Positives'][
                        'Negative Log Likelihood (Regression)'].extend(
                            res_dict['true_positives_reg_analysis']
                            ['ignorance_score_mean'])
                    res_dict_clean['True Positives'][
                        'Mean Squared Error'].extend(
                            res_dict['true_positives_reg_analysis']
                            ['mean_squared_error'])
                    res_dict_clean['True Positives']['Energy Score'].extend(
                        res_dict['true_positives_reg_analysis']
                        ['energy_score_mean'])
                    res_dict_clean['True Positives'][
                        'Image Corruption Level'].extend(
                            [image_corruption_level] *
                            res_dict['true_positives_reg_analysis']
                            ['energy_score_mean'].shape[0])
                    res_dict_clean['True Positives']['Method Name'].extend(
                        [config_name] * res_dict['true_positives_reg_analysis']
                        ['energy_score_mean'].shape[0])

                    # Duplicates Results
                    res_dict_clean['Duplicates'][
                        'Negative Log Likelihood (Classification)'].extend(
                            res_dict['duplicates_cls_analysis']
                            ['ignorance_score_mean'])
                    res_dict_clean['Duplicates']['Brier Score'].extend(
                        res_dict['duplicates_cls_analysis']
                        ['brier_score_mean'])
                    res_dict_clean['Duplicates'][
                        'Negative Log Likelihood (Regression)'].extend(
                            res_dict['duplicates_reg_analysis']
                            ['ignorance_score_mean'])
                    res_dict_clean['Duplicates']['Mean Squared Error'].extend(
                        res_dict['duplicates_reg_analysis']
                        ['mean_squared_error'])
                    res_dict_clean['Duplicates']['Energy Score'].extend(
                        res_dict['duplicates_reg_analysis']
                        ['energy_score_mean'])
                    res_dict_clean['Duplicates'][
                        'Image Corruption Level'].extend(
                            [image_corruption_level] *
                            res_dict['duplicates_reg_analysis']
                            ['energy_score_mean'].shape[0])
                    res_dict_clean['Duplicates']['Method Name'].extend(
                        [config_name] * res_dict['duplicates_reg_analysis']
                        ['energy_score_mean'].shape[0])

                    # Localization Error Results
                    res_dict_clean['Localization Errors'][
                        'Negative Log Likelihood (Classification)'].extend(
                            res_dict['localization_errors_cls_analysis']
                            ['ignorance_score_mean'])
                    res_dict_clean['Localization Errors'][
                        'Brier Score'].extend(
                            res_dict['localization_errors_cls_analysis']
                            ['brier_score_mean'])
                    res_dict_clean['Localization Errors'][
                        'Negative Log Likelihood (Regression)'].extend(
                            res_dict['localization_errors_reg_analysis']
                            ['ignorance_score_mean'])
                    res_dict_clean['Localization Errors'][
                        'Mean Squared Error'].extend(
                            res_dict['localization_errors_reg_analysis']
                            ['mean_squared_error'])
                    res_dict_clean['Localization Errors'][
                        'Energy Score'].extend(
                            res_dict['localization_errors_reg_analysis']
                            ['energy_score_mean'])
                    res_dict_clean['Localization Errors'][
                        'Image Corruption Level'].extend(
                            [image_corruption_level] *
                            res_dict['localization_errors_reg_analysis']
                            ['energy_score_mean'].shape[0])
                    res_dict_clean['Localization Errors'][
                        'Method Name'].extend(
                            [config_name] *
                            res_dict['localization_errors_reg_analysis']
                            ['energy_score_mean'].shape[0])

                    # False Positives Results
                    res_dict_clean['False Positives'][
                        'Negative Log Likelihood (Classification)'].extend(
                            res_dict['false_positives_cls_analysis']
                            ['ignorance_score_mean'])
                    res_dict_clean['False Positives']['Brier Score'].extend(
                        res_dict['false_positives_cls_analysis']
                        ['brier_score_mean'])
                    res_dict_clean['False Positives']['Entropy'].extend(
                        res_dict['false_positives_reg_analysis']
                        ['total_entropy_mean'])
                    res_dict_clean['False Positives'][
                        'Image Corruption Level'].extend(
                            [image_corruption_level] *
                            res_dict['false_positives_reg_analysis']
                            ['total_entropy_mean'].shape[0])
                    res_dict_clean['False Positives']['Method Name'].extend(
                        [config_name] *
                        res_dict['false_positives_reg_analysis']
                        ['total_entropy_mean'].shape[0])
                else:
                    # False Positives Results
                    res_dict_clean['False Positives'][
                        'Negative Log Likelihood (Classification)'].append(
                            res_dict['ignorance_score_mean'])
                    res_dict_clean['False Positives']['Brier Score'].append(
                        res_dict['brier_score_mean'])
                    res_dict_clean['False Positives']['Entropy'].append(
                        res_dict['total_entropy_mean'])
                    res_dict_clean['False Positives'][
                        'Image Corruption Level'].append(
                            image_corruption_level)
                    res_dict_clean['False Positives']['Method Name'].append(
                        config_name)
    return res_dict_clean
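
A possible follow-up, not in the original code: every per-partition entry above is a flat list keyed by column name, so the result converts directly to a pandas DataFrame (pandas assumed available; the values below are toy stand-ins).

# Hypothetical usage sketch: turn one partition of the results dict into a
# pandas DataFrame for aggregation or plotting. Toy values only.
import pandas as pd

res_dict_clean = {
    'True Positives': {
        'Brier Score': [0.12, 0.08],
        'Image Corruption Level': ['Val', 'C1'],
        'Method Name': ['retinanet', 'retinanet'],
    }
}

tp_df = pd.DataFrame(res_dict_clean['True Positives'])
print(tp_df.groupby('Image Corruption Level')['Brier Score'].mean())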
Example #12
def main(args, cfg=None, min_allowed_score=None):

    # Setup config
    if cfg is None:
        cfg = setup_config(args, random_seed=args.random_seed, is_testing=True)

    cfg.defrost()
    cfg.ACTUAL_TEST_DATASET = args.test_dataset

    # Setup torch device and num_threads
    torch.set_num_threads(cfg.DATALOADER.NUM_WORKERS)

    # Build path to gt instances and inference output
    inference_output_dir = get_inference_output_dir(
        cfg['OUTPUT_DIR'], args.test_dataset, args.inference_config,
        args.image_corruption_level)

    if min_allowed_score is None:
        # Check if F-1 Score has been previously computed ON THE ORIGINAL
        # DATASET, and not on VOC.
        try:
            train_set_inference_output_dir = get_inference_output_dir(
                cfg['OUTPUT_DIR'], cfg.DATASETS.TEST[0], args.inference_config,
                0)
            with open(
                    os.path.join(train_set_inference_output_dir,
                                 "mAP_res.txt"), "r") as f:
                min_allowed_score = f.read().strip('][\n').split(', ')[-1]
                min_allowed_score = round(float(min_allowed_score), 4)
        except FileNotFoundError:
            # If not, process all detections. Not recommended as the results might be influenced by very low scoring
            # detections that would normally be removed in robotics/vision
            # applications.
            min_allowed_score = 0.0

    # Get matched results by either generating them or loading from file.
    with torch.no_grad():
        try:
            preprocessed_predicted_instances = torch.load(os.path.join(
                inference_output_dir,
                "preprocessed_predicted_instances_odd_{}.pth".format(
                    min_allowed_score)),
                                                          map_location=device)
        # Process predictions
        except FileNotFoundError:
            prediction_file_name = os.path.join(inference_output_dir,
                                                'coco_instances_results.json')
            predicted_instances = json.load(open(prediction_file_name, 'r'))
            preprocessed_predicted_instances = eval_predictions_preprocess(
                predicted_instances,
                min_allowed_score=min_allowed_score,
                is_odd=True)
            torch.save(
                preprocessed_predicted_instances,
                os.path.join(
                    inference_output_dir,
                    "preprocessed_predicted_instances_odd_{}.pth".format(
                        min_allowed_score)))

        predicted_boxes = preprocessed_predicted_instances['predicted_boxes']
        predicted_cov_mats = preprocessed_predicted_instances[
            'predicted_covar_mats']
        predicted_cls_probs = preprocessed_predicted_instances[
            'predicted_cls_probs']

        predicted_boxes = list(
            itertools.chain.from_iterable(
                [predicted_boxes[key] for key in predicted_boxes.keys()]))
        predicted_cov_mats = list(
            itertools.chain.from_iterable([
                predicted_cov_mats[key] for key in predicted_cov_mats.keys()
            ]))
        predicted_cls_probs = list(
            itertools.chain.from_iterable([
                predicted_cls_probs[key] for key in predicted_cls_probs.keys()
            ]))

        num_false_positives = len(predicted_boxes)
        valid_idxs = torch.arange(num_false_positives).to(device)

        predicted_boxes = torch.stack(predicted_boxes, 1).transpose(0, 1)
        predicted_cov_mats = torch.stack(predicted_cov_mats, 1).transpose(0, 1)
        predicted_cls_probs = torch.stack(predicted_cls_probs,
                                          1).transpose(0, 1)

        false_positives_dict = {
            'predicted_box_means': predicted_boxes,
            'predicted_box_covariances': predicted_cov_mats,
            'predicted_cls_probs': predicted_cls_probs
        }

        false_positives_reg_analysis = scoring_rules.compute_reg_scores_fn(
            false_positives_dict, valid_idxs)

        if cfg.MODEL.META_ARCHITECTURE == 'ProbabilisticRetinaNet':
            predicted_class_probs, predicted_class_idx = predicted_cls_probs.max(
                1)
            false_positives_dict['predicted_score_of_gt_category'] = 1.0 - \
                predicted_class_probs
            false_positives_cls_analysis = scoring_rules.sigmoid_compute_cls_scores(
                false_positives_dict, valid_idxs)

        else:
            false_positives_dict[
                'predicted_score_of_gt_category'] = predicted_cls_probs[:, -1]
            _, predicted_class_idx = predicted_cls_probs[:, :-1].max(1)
            false_positives_cls_analysis = scoring_rules.softmax_compute_cls_scores(
                false_positives_dict, valid_idxs)

        # Summarize and print all
        table = PrettyTable()
        table.field_names = ([
            'Output Type', 'Number of Instances', 'Cls Ignorance Score',
            'Cls Brier/Probability Score', 'Reg Ignorance Score',
            'Reg Energy Score'
        ])
        table.add_row([
            "False Positives:", num_false_positives, '{:.4f}'.format(
                false_positives_cls_analysis['ignorance_score_mean'], ),
            '{:.4f}'.format(false_positives_cls_analysis['brier_score_mean']),
            '{:.4f}'.format(
                false_positives_reg_analysis['total_entropy_mean']),
            '{:.4f}'.format(
                false_positives_reg_analysis['fp_energy_score_mean'])
        ])
        print(table)

        text_file_name = os.path.join(
            inference_output_dir,
            'probabilistic_scoring_res_odd_{}.txt'.format(min_allowed_score))

        with open(text_file_name, "w") as text_file:
            print(table, file=text_file)

        dictionary_file_name = os.path.join(
            inference_output_dir,
            'probabilistic_scoring_res_odd_{}.pkl'.format(min_allowed_score))
        false_positives_reg_analysis.update(false_positives_cls_analysis)
        with open(dictionary_file_name, "wb") as pickle_file:
            pickle.dump(false_positives_reg_analysis, pickle_file)
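
A small usage sketch, not part of the original script: reading back the pickled out-of-distribution scoring results written above. The directory path is a placeholder; the file name follows the format string used in the function.

# Sketch: load the pickled OOD scoring results produced by the function above.
import os
import pickle

inference_output_dir = '/path/to/inference/output'   # placeholder
min_allowed_score = 0.0

dictionary_file_name = os.path.join(
    inference_output_dir,
    'probabilistic_scoring_res_odd_{}.pkl'.format(min_allowed_score))

with open(dictionary_file_name, "rb") as pickle_file:
    odd_results = pickle.load(pickle_file)

# The dict merges the regression and classification analyses, e.g. keys such as
# 'total_entropy_mean', 'fp_energy_score_mean', 'ignorance_score_mean',
# 'brier_score_mean'.
print(sorted(odd_results.keys()))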
Example #13
def main(args):
    # Setup config
    cfg = setup_config(args, random_seed=args.random_seed, is_testing=True)

    # Make sure only 1 data point is processed at a time. This simulates
    # deployment.
    cfg.defrost()
    cfg.DATALOADER.NUM_WORKERS = 32
    cfg.SOLVER.IMS_PER_BATCH = 1

    cfg.MODEL.DEVICE = device.type

    # Set up number of cpu threads
    torch.set_num_threads(cfg.DATALOADER.NUM_WORKERS)

    # Create inference output directory and copy inference config file to keep
    # track of experimental settings
    inference_output_dir = os.path.join(
        cfg['OUTPUT_DIR'], 'inference', args.test_dataset,
        os.path.split(args.inference_config)[-1][:-5])
    os.makedirs(inference_output_dir, exist_ok=True)
    copyfile(
        args.inference_config,
        os.path.join(inference_output_dir,
                     os.path.split(args.inference_config)[-1]))

    # Get category mapping dictionary:
    train_thing_dataset_id_to_contiguous_id = MetadataCatalog.get(
        cfg.DATASETS.TRAIN[0]).thing_dataset_id_to_contiguous_id
    test_thing_dataset_id_to_contiguous_id = MetadataCatalog.get(
        args.test_dataset).thing_dataset_id_to_contiguous_id

    # If both dicts are equal or if we are performing out of distribution
    # detection, just flip the test dict.
    if (train_thing_dataset_id_to_contiguous_id
            == test_thing_dataset_id_to_contiguous_id) or (
                cfg.DATASETS.TRAIN[0] == 'coco_not_in_voc_2017_train'):
        cat_mapping_dict = dict(
            (v, k) for k, v in test_thing_dataset_id_to_contiguous_id.items())
    else:
        # If not equal, two situations: 1) BDD to KITTI and 2) COCO to PASCAL
        cat_mapping_dict = dict(
            (v, k) for k, v in test_thing_dataset_id_to_contiguous_id.items())
        if 'voc' in args.test_dataset and 'coco' in cfg.DATASETS.TRAIN[0]:
            dataset_mapping_dict = dict(
                (v, k) for k, v in metadata.COCO_TO_VOC_CONTIGUOUS_ID.items())
        elif 'kitti' in args.test_dataset and 'bdd' in cfg.DATASETS.TRAIN[0]:
            dataset_mapping_dict = dict(
                (v, k) for k, v in metadata.BDD_TO_KITTI_CONTIGUOUS_ID.items())
        else:
            raise ValueError(
                'Cannot generate category mapping dictionary. Please check if training and inference datasets are compatible.'
            )
        cat_mapping_dict = dict(
            (dataset_mapping_dict[k], v) for k, v in cat_mapping_dict.items())

    # Build predictor
    predictor = build_predictor(cfg)
    test_data_loader = build_detection_test_loader(
        cfg, dataset_name=args.test_dataset)

    final_output_list = []
    if not args.eval_only:
        with torch.no_grad():
            with tqdm.tqdm(total=len(test_data_loader)) as pbar:
                for idx, input_im in enumerate(test_data_loader):
                    outputs = predictor(input_im)

                    final_output_list.extend(
                        instances_to_json(outputs, input_im[0]['image_id'],
                                          cat_mapping_dict))
                    pbar.update(1)

        with open(
                os.path.join(inference_output_dir,
                             'coco_instances_results.json'), 'w') as fp:
            json.dump(final_output_list, fp, indent=4, separators=(',', ': '))

    #compute_average_precision.main(args, cfg)
    compute_probabilistic_metrics.main(args, cfg)
    compute_calibration_errors.main(args, cfg)
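
The category mapping built above boils down to inverting the test dataset's thing_dataset_id_to_contiguous_id dict and, when the train and test label spaces differ, re-keying it through a dataset-to-dataset table. A toy sketch of just that dictionary manipulation follows; the id values are made up, not real detectron2 metadata.

# Toy sketch of the category-mapping construction above (hypothetical ids).
test_thing_dataset_id_to_contiguous_id = {1: 0, 3: 1, 7: 2}
dataset_mapping_dict = {0: 10, 1: 11, 2: 12}   # hypothetical cross-dataset table

# Flip the test dict: contiguous id -> original dataset id.
cat_mapping_dict = dict(
    (v, k) for k, v in test_thing_dataset_id_to_contiguous_id.items())

# Re-key through the cross-dataset table, mirroring the last step above.
cat_mapping_dict = dict(
    (dataset_mapping_dict[k], v) for k, v in cat_mapping_dict.items())
print(cat_mapping_dict)   # {10: 1, 11: 3, 12: 7}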
def main(args,
         cfg=None,
         iou_min=None,
         iou_correct=None,
         min_allowed_score=None):
    # Setup config
    if cfg is None:
        cfg = setup_config(args, random_seed=args.random_seed, is_testing=True)

    cfg.defrost()
    cfg.ACTUAL_TEST_DATASET = args.test_dataset

    # Setup torch device and num_threads
    torch.set_num_threads(cfg.DATALOADER.NUM_WORKERS)

    # Build path to gt instances and inference output
    inference_output_dir = os.path.join(
        cfg['OUTPUT_DIR'], 'inference', args.test_dataset,
        os.path.split(args.inference_config)[-1][:-5])

    # Get thresholds to perform evaluation on
    if iou_min is None:
        iou_min = args.iou_min
    if iou_correct is None:
        iou_correct = args.iou_correct
    if min_allowed_score is None:
        # Check if F-1 Score has been previously computed ON THE ORIGINAL
        # DATASET such as COCO even when evaluating on VOC.
        try:
            train_set_inference_output_dir = os.path.join(
                cfg['OUTPUT_DIR'], 'inference', cfg.DATASETS.TEST[0],
                os.path.split(args.inference_config)[-1][:-5])
            with open(
                    os.path.join(train_set_inference_output_dir,
                                 "mAP_res.txt"), "r") as f:
                min_allowed_score = f.read().strip('][\n').split(', ')[-1]
                min_allowed_score = round(float(min_allowed_score), 4)
        except FileNotFoundError:
            # If not, process all detections. Not recommended as the results might be influenced by very low scoring
            # detections that would normally be removed in robotics/vision
            # applications.
            min_allowed_score = 0.0

    # Get category mapping dictionary:
    train_thing_dataset_id_to_contiguous_id = MetadataCatalog.get(
        cfg.DATASETS.TRAIN[0]).thing_dataset_id_to_contiguous_id
    test_thing_dataset_id_to_contiguous_id = MetadataCatalog.get(
        args.test_dataset).thing_dataset_id_to_contiguous_id

    cat_mapping_dict = get_thing_dataset_id_to_contiguous_id_dict(
        cfg, args, train_thing_dataset_id_to_contiguous_id,
        test_thing_dataset_id_to_contiguous_id)

    # Get matched results by either generating them or loading from file.
    with torch.no_grad():
        matched_results = evaluation_utils.get_matched_results(
            cfg,
            inference_output_dir,
            iou_min=iou_min,
            iou_correct=iou_correct,
            min_allowed_score=min_allowed_score)

        # Build preliminary dicts required for computing classification scores.
        for matched_results_key in matched_results.keys():
            if 'gt_cat_idxs' in matched_results[matched_results_key].keys():
                # First, convert the stored thing dataset ids to contiguous
                # indices.
                gt_converted_cat_idxs = matched_results[matched_results_key][
                    'gt_cat_idxs'].squeeze(1)
                gt_converted_cat_idxs = torch.as_tensor([
                    cat_mapping_dict[class_idx.cpu().tolist()]
                    for class_idx in gt_converted_cat_idxs
                ]).to(device)
                matched_results[matched_results_key][
                    'gt_converted_cat_idxs'] = gt_converted_cat_idxs.to(device)
                matched_results[matched_results_key][
                    'gt_cat_idxs'] = gt_converted_cat_idxs
            if 'predicted_cls_probs' in matched_results[
                    matched_results_key].keys():
                predicted_class_probs, predicted_cat_idxs = matched_results[
                    matched_results_key]['predicted_cls_probs'][:, :-1].max(1)

                matched_results[matched_results_key][
                    'predicted_cat_idxs'] = predicted_cat_idxs
                matched_results[matched_results_key][
                    'output_logits'] = predicted_class_probs

        # Load the different detection partitions
        true_positives = matched_results['true_positives']
        duplicates = matched_results['duplicates']
        false_positives = matched_results['false_positives']

        # Get the number of elements in each partition
        cls_min_uncertainty_error_list = []

        reg_maximum_calibration_error_list = []
        reg_expected_calibration_error_list = []
        reg_min_uncertainty_error_list = []

        all_predicted_scores = torch.cat(
            (true_positives['predicted_cls_probs'].flatten(),
             duplicates['predicted_cls_probs'].flatten(),
             false_positives['predicted_cls_probs'].flatten()), 0)
        all_gt_scores = torch.cat(
            (torch.nn.functional.one_hot(
                true_positives['gt_cat_idxs'],
                true_positives['predicted_cls_probs'].shape[1]).flatten().to(
                    device),
             torch.nn.functional.one_hot(
                 duplicates['gt_cat_idxs'], duplicates['predicted_cls_probs'].
                 shape[1]).flatten().to(device),
             torch.zeros_like(false_positives['predicted_cls_probs'].type(
                 torch.LongTensor).flatten()).to(device)), 0)

        # Compute classification calibration error using calibration
        # library
        cls_marginal_calibration_error = cal.get_calibration_error(
            all_predicted_scores.cpu().numpy(),
            all_gt_scores.cpu().numpy())

        for class_idx in cat_mapping_dict.values():
            true_positives_valid_idxs = true_positives[
                'gt_converted_cat_idxs'] == class_idx
            duplicates_valid_idxs = duplicates[
                'gt_converted_cat_idxs'] == class_idx
            false_positives_valid_idxs = false_positives[
                'predicted_cat_idxs'] == class_idx

            # For the rest of the code, gt_scores need to be ones or zeros. All
            # processing is done on a per-class basis
            all_gt_scores = torch.cat(
                (torch.ones_like(true_positives['gt_converted_cat_idxs']
                                 [true_positives_valid_idxs]).to(device),
                 torch.zeros_like(duplicates['gt_converted_cat_idxs']
                                  [duplicates_valid_idxs]).to(device),
                 torch.zeros_like(false_positives['predicted_cat_idxs']
                                  [false_positives_valid_idxs]).to(device)),
                0).type(torch.DoubleTensor)

            # Compute classification minimum uncertainty error
            distribution_params = torch.cat(
                (true_positives['output_logits'][true_positives_valid_idxs],
                 duplicates['output_logits'][duplicates_valid_idxs],
                 false_positives['output_logits'][false_positives_valid_idxs]),
                0)
            all_predicted_cat_entropy = -torch.log(distribution_params)

            random_idxs = torch.randperm(all_predicted_cat_entropy.shape[0])

            all_predicted_cat_entropy = all_predicted_cat_entropy[random_idxs]
            all_gt_scores_cls = all_gt_scores[random_idxs]
            sorted_entropies, sorted_idxs = all_predicted_cat_entropy.sort()
            sorted_gt_idxs_tp = all_gt_scores_cls[sorted_idxs]
            sorted_gt_idxs_fp = 1.0 - sorted_gt_idxs_tp

            tp_cum_sum = torch.cumsum(sorted_gt_idxs_tp, 0)
            fp_cum_sum = torch.cumsum(sorted_gt_idxs_fp, 0)
            cls_u_errors = 0.5 * (sorted_gt_idxs_tp.sum(0) - tp_cum_sum) / \
                sorted_gt_idxs_tp.sum(0) + 0.5 * fp_cum_sum / sorted_gt_idxs_fp.sum(0)
            cls_min_u_error = cls_u_errors.min()
            cls_min_uncertainty_error_list.append(cls_min_u_error)

            # Compute regression calibration errors. False negatives can't be
            # evaluated since they have no associated predictions.
            all_predicted_means = torch.cat(
                (true_positives['predicted_box_means']
                 [true_positives_valid_idxs],
                 duplicates['predicted_box_means'][duplicates_valid_idxs]), 0)

            all_predicted_covariances = torch.cat(
                (true_positives['predicted_box_covariances']
                 [true_positives_valid_idxs],
                 duplicates['predicted_box_covariances'][duplicates_valid_idxs]
                 ), 0)

            all_predicted_gt = torch.cat(
                (true_positives['gt_box_means'][true_positives_valid_idxs],
                 duplicates['gt_box_means'][duplicates_valid_idxs]), 0)

            all_predicted_covariances = torch.diagonal(
                all_predicted_covariances, dim1=1, dim2=2)

            # The assumption of uncorrelated components is not accurate, especially
            # when estimating full covariance matrices. However, using scipy to
            # compute multivariate CDFs is prohibitively slow for this amount of data.
            reg_maximum_calibration_error = []
            reg_expected_calibration_error = []

            # Regression calibration is computed for every box dimension
            # separately, and averaged after.
            for box_dim in range(all_predicted_gt.shape[1]):
                all_predicted_means_current_dim = all_predicted_means[:,
                                                                      box_dim]
                all_predicted_gt_current_dim = all_predicted_gt[:, box_dim]
                all_predicted_covariances_current_dim = all_predicted_covariances[:,
                                                                                  box_dim]
                normal_dists = torch.distributions.Normal(
                    all_predicted_means_current_dim,
                    scale=all_predicted_covariances_current_dim)
                all_predicted_scores = normal_dists.cdf(
                    all_predicted_gt_current_dim)

                reg_calibration_error = []
                histogram_bin_step_size = 1 / 15.0
                for i in torch.arange(0.0, 1.0 - histogram_bin_step_size,
                                      histogram_bin_step_size):
                    # Count predicted CDF scores below the bin's upper edge
                    elements_in_bin = (all_predicted_scores <
                                       (i + histogram_bin_step_size))
                    num_elems_in_bin_i = elements_in_bin.type(
                        torch.FloatTensor).to(device).sum()

                    # Compute calibration error from "Accurate uncertainties for deep
                    # learning using calibrated regression" paper.
                    reg_calibration_error.append(
                        (num_elems_in_bin_i / all_predicted_scores.shape[0] -
                         (i + histogram_bin_step_size))**2)

                calibration_error = torch.stack(reg_calibration_error).to(
                    device)
                reg_maximum_calibration_error.append(calibration_error.max())
                reg_expected_calibration_error.append(calibration_error.mean())

            reg_maximum_calibration_error_list.append(
                reg_maximum_calibration_error)
            reg_expected_calibration_error_list.append(
                reg_expected_calibration_error)

            # Compute regression minimum uncertainty error
            all_predicted_covars = torch.cat((
                true_positives['predicted_box_covariances']
                [true_positives_valid_idxs],
                duplicates['predicted_box_covariances'][duplicates_valid_idxs],
                false_positives['predicted_box_covariances']
                [false_positives_valid_idxs]), 0)

            all_predicted_distributions = torch.distributions.multivariate_normal.MultivariateNormal(
                torch.zeros(all_predicted_covars.shape[0:2]).to(device),
                all_predicted_covars +
                1e-4 * torch.eye(all_predicted_covars.shape[2]).to(device))

            all_predicted_reg_entropy = all_predicted_distributions.entropy()
            random_idxs = torch.randperm(all_predicted_reg_entropy.shape[0])

            all_predicted_reg_entropy = all_predicted_reg_entropy[random_idxs]
            all_gt_scores_reg = all_gt_scores[random_idxs]

            sorted_entropies, sorted_idxs = all_predicted_reg_entropy.sort()
            sorted_gt_idxs_tp = all_gt_scores_reg[sorted_idxs]
            sorted_gt_idxs_fp = 1.0 - sorted_gt_idxs_tp

            tp_cum_sum = torch.cumsum(sorted_gt_idxs_tp, 0)
            fp_cum_sum = torch.cumsum(sorted_gt_idxs_fp, 0)
            reg_u_errors = 0.5 * ((sorted_gt_idxs_tp.sum(0) - tp_cum_sum) /
                                  sorted_gt_idxs_tp.sum(0)) + 0.5 * (
                                      fp_cum_sum / sorted_gt_idxs_fp.sum(0))
            reg_min_u_error = reg_u_errors.min()
            reg_min_uncertainty_error_list.append(reg_min_u_error)

        # Summarize and print all
        table = PrettyTable()
        table.field_names = ([
            'Cls Marginal Calibration Error', 'Reg Expected Calibration Error',
            'Reg Maximum Calibration Error', 'Cls Minimum Uncertainty Error',
            'Reg Minimum Uncertainty Error'
        ])

        reg_expected_calibration_error = torch.stack([
            torch.stack(reg, 0) for reg in reg_expected_calibration_error_list
        ], 0)
        reg_expected_calibration_error = reg_expected_calibration_error[
            ~torch.isnan(reg_expected_calibration_error)].mean()

        reg_maximum_calibration_error = torch.stack([
            torch.stack(reg, 0) for reg in reg_maximum_calibration_error_list
        ], 0)
        reg_maximum_calibration_error = reg_maximum_calibration_error[
            ~torch.isnan(reg_maximum_calibration_error)].mean()

        cls_min_u_error = torch.stack(cls_min_uncertainty_error_list, 0)
        cls_min_u_error = cls_min_u_error[~torch.isnan(cls_min_u_error)].mean()

        reg_min_u_error = torch.stack(reg_min_uncertainty_error_list, 0)
        reg_min_u_error = reg_min_u_error[~torch.isnan(reg_min_u_error)].mean()

        table.add_row([
            '{:.4f}'.format(cls_marginal_calibration_error), '{:.4f}'.format(
                reg_expected_calibration_error.cpu().numpy().tolist()),
            '{:.4f}'.format(
                reg_maximum_calibration_error.cpu().numpy().tolist()),
            '{:.4f}'.format(cls_min_u_error.cpu().numpy().tolist()),
            '{:.4f}'.format(reg_min_u_error.cpu().numpy().tolist())
        ])
        print(table)
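
The minimum uncertainty error computed above can be read as follows: sort all detections by predicted entropy, then find the entropy threshold that best separates true positives (expected to have low entropy) from false positives (expected to have high entropy). A self-contained sketch with toy values, not taken from the original code:

# Sketch of the minimum uncertainty error used above (toy values).
import torch

entropies = torch.tensor([0.1, 0.4, 0.2, 0.9, 0.8])   # predicted entropies
is_tp = torch.tensor([1.0, 1.0, 1.0, 0.0, 0.0])       # 1 = TP, 0 = FP

sorted_entropies, sorted_idxs = entropies.sort()
sorted_tp = is_tp[sorted_idxs]
sorted_fp = 1.0 - sorted_tp

tp_cum_sum = torch.cumsum(sorted_tp, 0)   # TPs at or below each threshold
fp_cum_sum = torch.cumsum(sorted_fp, 0)   # FPs at or below each threshold

# Half the TPs still above the threshold plus half the FPs already below it.
u_errors = (0.5 * (sorted_tp.sum() - tp_cum_sum) / sorted_tp.sum() +
            0.5 * fp_cum_sum / sorted_fp.sum())
print(u_errors.min())   # minimum uncertainty error; 0.0 for perfect separation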
Example #15
def main(args,
         cfg=None,
         iou_min=None,
         iou_correct=None,
         min_allowed_score=None,
         print_results=True):

    # Setup config
    if cfg is None:
        cfg = setup_config(args, random_seed=args.random_seed, is_testing=True)

    cfg.defrost()
    cfg.ACTUAL_TEST_DATASET = args.test_dataset

    # Setup torch device and num_threads
    torch.set_num_threads(cfg.DATALOADER.NUM_WORKERS)

    # Build path to gt instances and inference output
    inference_output_dir = get_inference_output_dir(
        cfg['OUTPUT_DIR'], args.test_dataset, args.inference_config,
        args.image_corruption_level)

    # Get thresholds to perform evaluation on
    if iou_min is None:
        iou_min = args.iou_min
    if iou_correct is None:
        iou_correct = args.iou_correct
    if min_allowed_score is None:
        # Check if F-1 Score has been previously computed ON THE ORIGINAL
        # DATASET such as COCO even when evaluating on OpenImages.
        try:
            train_set_inference_output_dir = get_inference_output_dir(
                cfg['OUTPUT_DIR'], cfg.DATASETS.TEST[0], args.inference_config,
                0)
            with open(
                    os.path.join(train_set_inference_output_dir,
                                 "mAP_res.txt"), "r") as f:
                min_allowed_score = f.read().strip('][\n').split(', ')[-1]
                min_allowed_score = round(float(min_allowed_score), 4)
        except FileNotFoundError:
            # If not, process all detections. Not recommended as the results might be influenced by very low scoring
            # detections that would normally be removed in robotics/vision
            # applications.
            min_allowed_score = 0.0

    # Get category mapping dictionary:
    train_thing_dataset_id_to_contiguous_id = MetadataCatalog.get(
        cfg.DATASETS.TRAIN[0]).thing_dataset_id_to_contiguous_id
    test_thing_dataset_id_to_contiguous_id = MetadataCatalog.get(
        args.test_dataset).thing_dataset_id_to_contiguous_id

    cat_mapping_dict = get_test_thing_dataset_id_to_train_contiguous_id_dict(
        cfg, args, train_thing_dataset_id_to_contiguous_id,
        test_thing_dataset_id_to_contiguous_id)

    # Get matched results by either generating them or loading from file.
    with torch.no_grad():
        matched_results = evaluation_utils.get_matched_results(
            cfg,
            inference_output_dir,
            iou_min=iou_min,
            iou_correct=iou_correct,
            min_allowed_score=min_allowed_score)

        # Build preliminary dicts required for computing classification scores.
        for matched_results_key in matched_results.keys():
            if 'gt_cat_idxs' in matched_results[matched_results_key].keys():
                # First, convert the stored thing dataset ids to contiguous
                # indices.
                gt_converted_cat_idxs = matched_results[matched_results_key][
                    'gt_cat_idxs'].squeeze(1)
                gt_converted_cat_idxs = torch.as_tensor([
                    cat_mapping_dict[class_idx.cpu().tolist()]
                    for class_idx in gt_converted_cat_idxs
                ]).to(device)
                matched_results[matched_results_key][
                    'gt_converted_cat_idxs'] = gt_converted_cat_idxs.to(device)
                matched_results[matched_results_key][
                    'gt_cat_idxs'] = gt_converted_cat_idxs
            if 'predicted_cls_probs' in matched_results[
                    matched_results_key].keys():
                if cfg.MODEL.META_ARCHITECTURE == 'ProbabilisticRetinaNet':
                    # For false positives, the correct category is background. For retinanet, since no explicit
                    # background category is available, this value is computed as 1.0 - score of the predicted
                    # category.
                    predicted_class_probs, predicted_cat_idxs = matched_results[
                        matched_results_key]['predicted_cls_probs'].max(1)
                    matched_results[matched_results_key][
                        'output_logits'] = predicted_class_probs
                else:
                    predicted_class_probs, predicted_cat_idxs = matched_results[
                        matched_results_key][
                            'predicted_cls_probs'][:, :-1].max(1)

                matched_results[matched_results_key][
                    'predicted_cat_idxs'] = predicted_cat_idxs

        # Load the different detection partitions
        true_positives = matched_results['true_positives']
        duplicates = matched_results['duplicates']
        localization_errors = matched_results['localization_errors']
        false_positives = matched_results['false_positives']

        reg_maximum_calibration_error_list = []
        reg_expected_calibration_error_list = []
        if cfg.MODEL.META_ARCHITECTURE == 'ProbabilisticRetinaNet':
            all_predicted_scores = torch.cat(
                (true_positives['predicted_cls_probs'].flatten(),
                 duplicates['predicted_cls_probs'].flatten(),
                 localization_errors['predicted_cls_probs'].flatten(),
                 false_positives['predicted_cls_probs'].flatten()), 0)
            all_gt_scores = torch.cat(
                (torch.nn.functional.one_hot(
                    true_positives['gt_cat_idxs'],
                    true_positives['predicted_cls_probs'].shape[1]).flatten().
                 to(device),
                 torch.nn.functional.one_hot(
                     duplicates['gt_cat_idxs'],
                     duplicates['predicted_cls_probs'].shape[1]).flatten().to(
                         device),
                 torch.zeros_like(
                     localization_errors['predicted_cls_probs'].type(
                         torch.LongTensor).flatten()).to(device),
                 torch.zeros_like(false_positives['predicted_cls_probs'].type(
                     torch.LongTensor).flatten()).to(device)), 0)
        else:
            # For RCNN based networks, a background category is
            # explicitly available.
            all_predicted_scores = torch.cat(
                (true_positives['predicted_cls_probs'],
                 duplicates['predicted_cls_probs'],
                 localization_errors['predicted_cls_probs'],
                 false_positives['predicted_cls_probs']), 0)
            all_gt_scores = torch.cat(
                (true_positives['gt_cat_idxs'], duplicates['gt_cat_idxs'],
                 torch.ones_like(localization_errors['predicted_cls_probs']
                                 [:, 0]).fill_(80.0).type(
                                     torch.LongTensor).to(device),
                 torch.ones_like(false_positives['predicted_cls_probs'][:, 0]).
                 fill_(80.0).type(torch.LongTensor).to(device)), 0)

        # Compute classification calibration error using calibration
        # library
        cls_marginal_calibration_error = cal.get_calibration_error(
            all_predicted_scores.cpu().numpy(),
            all_gt_scores.cpu().numpy())

        for class_idx in cat_mapping_dict.values():
            true_positives_valid_idxs = true_positives[
                'gt_converted_cat_idxs'] == class_idx
            localization_errors_valid_idxs = localization_errors[
                'gt_converted_cat_idxs'] == class_idx
            duplicates_valid_idxs = duplicates[
                'gt_converted_cat_idxs'] == class_idx

            # Compute regression calibration errors. False negatives can't be
            # evaluated since they have no associated predictions.
            all_predicted_means = torch.cat(
                (true_positives['predicted_box_means']
                 [true_positives_valid_idxs],
                 duplicates['predicted_box_means'][duplicates_valid_idxs],
                 localization_errors['predicted_box_means']
                 [localization_errors_valid_idxs]), 0)

            all_predicted_covariances = torch.cat((
                true_positives['predicted_box_covariances']
                [true_positives_valid_idxs],
                duplicates['predicted_box_covariances'][duplicates_valid_idxs],
                localization_errors['predicted_box_covariances']
                [localization_errors_valid_idxs]), 0)

            all_predicted_gt = torch.cat(
                (true_positives['gt_box_means'][true_positives_valid_idxs],
                 duplicates['gt_box_means'][duplicates_valid_idxs],
                 localization_errors['gt_box_means']
                 [localization_errors_valid_idxs]), 0)

            all_predicted_covariances = torch.diagonal(
                all_predicted_covariances, dim1=1, dim2=2)

            # The assumption of uncorrelated components is not accurate, especially
            # when estimating full covariance matrices. However, using scipy to
            # compute multivariate CDFs is prohibitively slow for this amount of data.
            reg_maximum_calibration_error = []
            reg_expected_calibration_error = []

            # Regression calibration is computed for every box dimension
            # separately, and averaged after.
            for box_dim in range(all_predicted_gt.shape[1]):
                all_predicted_means_current_dim = all_predicted_means[:,
                                                                      box_dim]
                all_predicted_gt_current_dim = all_predicted_gt[:, box_dim]
                all_predicted_covariances_current_dim = all_predicted_covariances[:,
                                                                                  box_dim]
                normal_dists = torch.distributions.Normal(
                    all_predicted_means_current_dim,
                    scale=all_predicted_covariances_current_dim)
                all_predicted_scores = normal_dists.cdf(
                    all_predicted_gt_current_dim)

                reg_calibration_error = []
                histogram_bin_step_size = 1 / 15.0
                for i in torch.arange(0.0, 1.0 - histogram_bin_step_size,
                                      histogram_bin_step_size):
                    # Count predicted CDF scores below the bin's upper edge
                    elements_in_bin = (all_predicted_scores <
                                       (i + histogram_bin_step_size))
                    num_elems_in_bin_i = elements_in_bin.type(
                        torch.FloatTensor).to(device).sum()

                    # Compute calibration error from "Accurate uncertainties for deep
                    # learning using calibrated regression" paper.
                    reg_calibration_error.append(
                        (num_elems_in_bin_i / all_predicted_scores.shape[0] -
                         (i + histogram_bin_step_size))**2)

                calibration_error = torch.stack(reg_calibration_error).to(
                    device)
                reg_maximum_calibration_error.append(calibration_error.max())
                reg_expected_calibration_error.append(calibration_error.mean())

            reg_maximum_calibration_error_list.append(
                reg_maximum_calibration_error)
            reg_expected_calibration_error_list.append(
                reg_expected_calibration_error)

        # Summarize and print all
        reg_expected_calibration_error = torch.stack([
            torch.stack(reg, 0) for reg in reg_expected_calibration_error_list
        ], 0)
        reg_expected_calibration_error = reg_expected_calibration_error[
            ~torch.isnan(reg_expected_calibration_error)].mean()

        reg_maximum_calibration_error = torch.stack([
            torch.stack(reg, 0) for reg in reg_maximum_calibration_error_list
        ], 0)
        reg_maximum_calibration_error = reg_maximum_calibration_error[
            ~torch.isnan(reg_maximum_calibration_error)].mean()

        if print_results:
            table = PrettyTable()
            table.field_names = ([
                'Cls Marginal Calibration Error',
                'Reg Expected Calibration Error',
                'Reg Maximum Calibration Error'
            ])

            table.add_row([
                cls_marginal_calibration_error,
                reg_expected_calibration_error.cpu().numpy().tolist(),
                reg_maximum_calibration_error.cpu().numpy().tolist()
            ])
            print(table)

            text_file_name = os.path.join(
                inference_output_dir, 'calibration_errors_{}_{}_{}.txt'.format(
                    iou_min, iou_correct, min_allowed_score))

            with open(text_file_name, "w") as text_file:
                print([
                    cls_marginal_calibration_error,
                    reg_expected_calibration_error.cpu().numpy().tolist(),
                    reg_maximum_calibration_error.cpu().numpy().tolist()
                ],
                      file=text_file)

        dictionary_file_name = os.path.join(
            inference_output_dir, 'calibration_errors_res_{}_{}_{}.pkl'.format(
                iou_min, iou_correct, min_allowed_score))

        final_accumulated_output_dict = {
            'cls_marginal_calibration_error':
            cls_marginal_calibration_error,
            'reg_expected_calibration_error':
            reg_expected_calibration_error.cpu().numpy(),
            'reg_maximum_calibration_error':
            reg_maximum_calibration_error.cpu().numpy()
        }

        with open(dictionary_file_name, "wb") as pickle_file:
            pickle.dump(final_accumulated_output_dict, pickle_file)
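
For reference, the per-dimension regression calibration error used in both calibration loops above follows Kuleshov et al., "Accurate uncertainties for deep learning using calibrated regression": compare the empirical frequency of CDF values falling below each confidence level with the level itself. A standalone sketch with toy tensors, not part of the original script:

# Sketch of the per-dimension regression calibration error (toy tensors, CPU only).
import torch

means = torch.tensor([0.0, 1.0, 2.0, 3.0])
stds = torch.tensor([1.0, 0.5, 2.0, 1.0])
gt = torch.tensor([0.1, 1.2, 1.5, 2.5])

# Predicted CDF evaluated at the ground truth for one box dimension.
scores = torch.distributions.Normal(means, stds).cdf(gt)

bin_step = 1 / 15.0
errors = []
for p in torch.arange(0.0, 1.0 - bin_step, bin_step):
    # Empirical frequency of CDF values below the bin's upper edge.
    empirical = (scores < (p + bin_step)).float().mean()
    errors.append((empirical - (p + bin_step)) ** 2)

errors = torch.stack(errors)
print(errors.mean(), errors.max())   # expected / maximum calibration error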