Example 1
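These snippets appear to come from detectron2-based projects; a plausible set of imports for this first example, assuming detectron2 is installed, would be:

import datetime
import logging
import time

import torch

from detectron2.evaluation import DatasetEvaluators, inference_context
from detectron2.utils.comm import get_world_size
from detectron2.utils.logger import log_every_n_seconds
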
def inference_on_dataset(model, data_loader, evaluator):
    """#NOTE: modified to add time
    Run model on the data_loader and evaluate the metrics with evaluator.
    Also benchmark the inference speed of `model.forward` accurately.
    The model will be used in eval mode.
    Args:
        model (nn.Module): a module which accepts an object from
            `data_loader` and returns some outputs. It will be temporarily set to `eval` mode.
            If you wish to evaluate a model in `training` mode instead, you can
            wrap the given model and override its behavior of `.eval()` and `.train()`.
        data_loader: an iterable object with a length.
            The elements it generates will be the inputs to the model.
        evaluator (DatasetEvaluator): the evaluator to run. Use `None` if you only want
            to benchmark, but don't want to do any evaluation.
    Returns:
        The return value of `evaluator.evaluate()`
    """
    num_devices = get_world_size()
    logger = logging.getLogger(__name__)
    logger.info("Start inference on {} images".format(len(data_loader)))

    total = len(data_loader)  # inference data loader must have a fixed length
    if evaluator is None:
        # create a no-op evaluator
        evaluator = DatasetEvaluators([])
    evaluator.reset()

    num_warmup = min(5, total - 1)
    start_time = time.perf_counter()
    total_compute_time = 0
    with inference_context(model), torch.no_grad():
        for idx, inputs in enumerate(data_loader):
            if idx == num_warmup:
                start_time = time.perf_counter()
                total_compute_time = 0

            start_compute_time = time.perf_counter()
            outputs = model(inputs)
            if torch.cuda.is_available():
                torch.cuda.synchronize()
            cur_compute_time = time.perf_counter() - start_compute_time
            total_compute_time += cur_compute_time
            for _o in outputs:
                _o['time'] = cur_compute_time / len(outputs)
            evaluator.process(inputs, outputs)

            iters_after_start = idx + 1 - num_warmup * int(idx >= num_warmup)
            seconds_per_img = total_compute_time / iters_after_start
            if idx >= num_warmup * 2 or seconds_per_img > 5:
                total_seconds_per_img = (time.perf_counter() -
                                         start_time) / iters_after_start
                eta = datetime.timedelta(seconds=int(total_seconds_per_img *
                                                     (total - idx - 1)))
                log_every_n_seconds(
                    logging.INFO,
                    "Inference done {}/{}. {:.4f} s / img. ETA={}".format(
                        idx + 1, total, seconds_per_img, str(eta)),
                    n=5,
                )

    # Measure the time only for this worker (before the synchronization barrier)
    total_time = time.perf_counter() - start_time
    total_time_str = str(datetime.timedelta(seconds=total_time))
    # NOTE this format is parsed by grep
    logger.info(
        "Total inference time: {} ({:.6f} s / img per device, on {} devices)".
        format(total_time_str, total_time / (total - num_warmup), num_devices))
    total_compute_time_str = str(
        datetime.timedelta(seconds=int(total_compute_time)))
    logger.info(
        "Total inference pure compute time: {} ({:.6f} s / img per device, on {} devices)"
        .format(total_compute_time_str,
                total_compute_time / (total - num_warmup), num_devices))

    results = evaluator.evaluate()
    # An evaluator may return None when not in main process.
    # Replace it by an empty dict instead to make it easier for downstream code to handle
    if results is None:
        results = {}
    return results
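
A minimal usage sketch for this helper, assuming a standard detectron2 setup; the config `cfg`, the checkpoint path, and the dataset name below are placeholders:

# Hypothetical usage; cfg, the checkpoint path, and the dataset name are placeholders.
from detectron2.checkpoint import DetectionCheckpointer
from detectron2.data import build_detection_test_loader
from detectron2.evaluation import COCOEvaluator
from detectron2.modeling import build_model

model = build_model(cfg)
DetectionCheckpointer(model).load("path/to/model_final.pth")
data_loader = build_detection_test_loader(cfg, "my_dataset_val")
evaluator = COCOEvaluator("my_dataset_val", output_dir="./eval_output")
results = inference_on_dataset(model, data_loader, evaluator)
# Each output processed by the evaluator also carries the extra "time" field.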
Example 2
def gdrn_inference_on_dataset(cfg,
                              model,
                              data_loader,
                              evaluator,
                              amp_test=False):
    """Run model on the data_loader and evaluate the metrics with evaluator.
    Also benchmark the inference speed of `model.forward` accurately. The model
    will be used in eval mode.

    Args:
        model (nn.Module): a module which accepts an object from
            `data_loader` and returns some outputs. It will be temporarily set to `eval` mode.

            If you wish to evaluate a model in `training` mode instead, you can
            wrap the given model and override its behavior of `.eval()` and `.train()`.
        data_loader: an iterable object with a length.
            The elements it generates will be the inputs to the model.
        evaluator (DatasetEvaluator): the evaluator to run. Use `None` if you only want
            to benchmark, but don't want to do any evaluation.
        cfg: the config object; passed to `batch_data` to preprocess the inputs.
        amp_test (bool): if True, run the forward pass under `autocast` (mixed precision).

    Returns:
        The return value of `evaluator.evaluate()`
    """
    num_devices = get_world_size()
    logger = logging.getLogger(__name__)
    logger.info("Start inference on {} images".format(len(data_loader)))

    total = len(data_loader)  # inference data loader must have a fixed length
    if evaluator is None:
        # create a no-op evaluator
        evaluator = DatasetEvaluators([])
    evaluator.reset()

    num_warmup = min(5, total - 1)
    start_time = time.perf_counter()
    total_compute_time = 0
    total_process_time = 0
    with inference_context(model), torch.no_grad():
        for idx, inputs in enumerate(data_loader):
            if idx == num_warmup:
                start_time = time.perf_counter()
                total_compute_time = 0
                total_process_time = 0

            start_compute_time = time.perf_counter()
            #############################
            # process input
            batch = batch_data(cfg, inputs, phase="test")
            if evaluator.train_objs is not None:
                roi_labels = batch["roi_cls"].cpu().numpy().tolist()
                obj_names = [evaluator.obj_names[_l] for _l in roi_labels]
                if all(_obj not in evaluator.train_objs for _obj in obj_names):
                    continue

            # if cfg.DEBUG:
            #     for i in range(len(batch["roi_cls"])):
            #         vis_roi_im = batch["roi_img"][i].cpu().numpy().transpose(1,2,0)[:, :, ::-1]
            #         show_ims = [vis_roi_im]
            #         show_titles = ["roi_im"]
            #
            #         vis_coor2d = batch["roi_coord_2d"][i].cpu().numpy()
            #         show_ims.extend([vis_coor2d[0], vis_coor2d[1]])
            #         show_titles.extend(["coord_2d_x", "coord_2d_y"])
            #         grid_show(show_ims, show_titles, row=1, col=3)

            with autocast(enabled=amp_test):
                out_dict = model(
                    batch["roi_img"],
                    roi_classes=batch["roi_cls"],
                    roi_cams=batch["roi_cam"],
                    roi_whs=batch["roi_wh"],
                    roi_centers=batch["roi_center"],
                    resize_ratios=batch["resize_ratio"],
                    roi_coord_2d=batch.get("roi_coord_2d", None),
                    roi_extents=batch.get("roi_extent", None),
                )
            if torch.cuda.is_available():
                torch.cuda.synchronize()
            cur_compute_time = time.perf_counter() - start_compute_time
            total_compute_time += cur_compute_time
            # NOTE: added
            # TODO: add detection time here
            outputs = [{} for _ in range(len(inputs))]
            for _i in range(len(outputs)):
                outputs[_i]["time"] = cur_compute_time

            start_process_time = time.perf_counter()
            evaluator.process(inputs, outputs, out_dict)  # RANSAC/PnP
            cur_process_time = time.perf_counter() - start_process_time
            total_process_time += cur_process_time

            iters_after_start = idx + 1 - num_warmup * int(idx >= num_warmup)
            seconds_per_img = total_compute_time / iters_after_start
            if idx >= num_warmup * 2 or seconds_per_img > 5:
                total_seconds_per_img = (time.perf_counter() -
                                         start_time) / iters_after_start
                eta = datetime.timedelta(seconds=int(total_seconds_per_img *
                                                     (total - idx - 1)))
                log_every_n_seconds(
                    logging.INFO,
                    f"Inference done {idx+1}/{total}. {seconds_per_img:.4f} s / img. ETA={str(eta)}",
                    n=5)

    # Measure the time only for this worker (before the synchronization barrier)
    total_time = time.perf_counter() - start_time
    total_time_str = str(datetime.timedelta(seconds=total_time))
    # NOTE this format is parsed by grep
    logger.info(
        f"Total inference time: {total_time_str} "
        f"({total_time / (total - num_warmup):.6f} s / img per device, on {num_devices} devices)"
    )
    # pure forward time
    total_compute_time_str = str(
        datetime.timedelta(seconds=int(total_compute_time)))
    logger.info(
        "Total inference pure compute time: {} ({:.6f} s / img per device, on {} devices)"
        .format(total_compute_time_str,
                total_compute_time / (total - num_warmup), num_devices))
    # post_process time
    total_process_time_str = str(
        datetime.timedelta(seconds=int(total_process_time)))
    logger.info(
        "Total inference post process time: {} ({:.6f} s / img per device, on {} devices)"
        .format(total_process_time_str,
                total_process_time / (total - num_warmup), num_devices))

    results = evaluator.evaluate()  # results is always None
    # An evaluator may return None when not in main process.
    # Replace it by an empty dict instead to make it easier for downstream code to handle
    if results is None:
        results = {}
    return results
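
The `amp_test` flag only wraps the forward pass in `autocast`; a minimal, self-contained sketch of that mixed-precision-at-test pattern (the toy model and tensor below are assumptions, not part of the original code):

import torch
from torch.cuda.amp import autocast

model = torch.nn.Linear(8, 4).eval()
x = torch.randn(2, 8)
amp_test = torch.cuda.is_available()
if amp_test:
    model, x = model.cuda(), x.cuda()
with torch.no_grad(), autocast(enabled=amp_test):  # mirrors the amp_test flag above
    y = model(x)
print(y.dtype)  # torch.float16 with autocast on GPU, torch.float32 otherwise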
Example 3
def inference_on_dataset(
    model, data_loader, evaluator, num_classes, topk, num_estimate, min_score
):
    """
    Run model on the data_loader and evaluate the metrics with evaluator.
    Also benchmark the inference speed of `model.forward` accurately.
    The model will be used in eval mode.

    Args:
        model (nn.Module): a module which accepts an object from
            `data_loader` and returns some outputs. It will be temporarily set to `eval` mode.

            If you wish to evaluate a model in `training` mode instead, you can
            wrap the given model and override its behavior of `.eval()` and `.train()`.
        data_loader: an iterable object with a length.
            The elements it generates will be the inputs to the model.
        evaluator (DatasetEvaluator): the evaluator to run. Use `None` if you only want
            to benchmark, but don't want to do any evaluation.
        num_classes (int): number of foreground classes.
        topk (int or list[int]): number of top-scoring predictions to keep per
            class; an int is broadcast to all classes.
        num_estimate (int): Number of images used to estimate the initial score
            thresholds.
        min_score (float): minimum score used to initialize the per-class
            thresholds.

    Returns:
        The return value of `evaluator.evaluate()`
    """
    num_devices = get_world_size()
    logger = logging.getLogger(__name__)
    logger.info("Start inference on {} images".format(len(data_loader)))
    if isinstance(topk, int):
        logger.info(f"Collecting top-{topk} images.")
        topk = [topk] * num_classes
    else:
        logger.info(f"Collecting top-k images. Counts:\n{topk}")

    total = len(data_loader)  # inference data loader must have a fixed length
    if evaluator is None:
        # create a no-op evaluator
        evaluator = DatasetEvaluators([])
    evaluator.reset()

    num_warmup = min(5, total - 1)
    start_time = time.perf_counter()
    total_compute_time = 0

    # We keep track of scores from _this_ process (process_scores) and scores from
    # all processes (scores). Every iter, each process updates process_scores and its
    # local scores with the new scores from the model.
    # Every few iterations, all processes pass their process_scores to each other and
    # updates their own global scores.

    # Map category id to min-heap of top scores from this process.
    process_scores = defaultdict(list)
    # Map category id to min-heap of top scores from all processes.
    global_scores = defaultdict(list)
    init_thresholds = torch.full(
        (num_classes + 1,), fill_value=min_score, dtype=torch.float32
    ).to(model.device)
    init_threshold_path = Path(evaluator._output_dir) / "_thresholds_checkpoint.pth"
    if init_threshold_path.exists():
        logger.info("Loading thresholds from disk.")
        init_thresholds = torch.load(init_threshold_path).to(model.device)
    else:
        init_threshold_path.parent.mkdir(exist_ok=True, parents=True)

    # Trying to get exactly the top-k estimates can result in getting slightly fewer
    # than K estimates. This can happen due to subtle differences in the model's forward
    # pass in the first phase vs. the second phase. For example, in the first phase,
    # when we have low thresholds, D2 will use torchvision.ops.boxes.batched_nms for
    # batch NMS. In phase 2, D2 will use a slightly different, customized
    # implementation, which may occasionally result in fewer boxes.
    # To address this, we set thresholds to be a bit looser, targeting 10% more
    # predictions than requested.
    topk_loose = [int(ceil(k * 1.1)) for k in topk]

    def get_thresholds(scores, min_thresholds):
        thresholds = []
        for i in range(num_classes):
            if topk_loose[i] == 0:
                thresholds.append(float("inf"))
            elif len(scores[i]) < topk_loose[i]:
                thresholds.append(-1)
            else:
                thresholds.append(scores[i][0])
        # Add -1 for background
        thresholds = torch.FloatTensor(thresholds + [-1]).to(model.device)
        # Clamp at the provided minimum thresholds
        return torch.max(thresholds, min_thresholds)

    def update_scores(scores, inputs, outputs):
        updated = set()
        for image, output in zip(inputs, outputs):
            if isinstance(output, dict):
                instances = output["instances"]
            else:
                instances = output
            curr_labels = instances.pred_classes.int().tolist()
            curr_scores = instances.scores.cpu().tolist()
            for label, score in zip(curr_labels, curr_scores):
                # label = label.int().item()
                # scores[label].append((image["image_id"], score.cpu().item()))
                if len(scores[label]) >= topk_loose[label]:
                    if score < scores[label][0]:
                        continue
                    else:
                        heapq.heappushpop(scores[label], score)
                else:
                    heapq.heappush(scores[label], score)
                updated.add(label)

    def gather_scores(process_scores):
        # List of scores per process
        scores_list = comm.all_gather(process_scores)
        gathered = defaultdict(list)
        labels = {x for scores in scores_list for x in scores.keys()}
        for label in labels:
            # Sort in descending order.
            sorted_generator = heapq.merge(
                *[sorted(x[label], reverse=True) for x in scores_list], reverse=True
            )
            top_k = itertools.islice(sorted_generator, topk_loose[label])
            top_k_ascending = list(reversed(list(top_k)))  # Return to ascending order
            heapq.heapify(top_k_ascending)
            gathered[label] = top_k_ascending
        return gathered

    with inference_context(model), torch.no_grad():
        #########
        # Phase 1: Compute initial, low score thresholds without mask branch.
        #########
        # First, get an estimate of score thresholds with the mask branch off.
        # Otherwise, in the initial few images, we will run the mask branch on a bunch
        # of useless proposals which makes everything slow.
        num_estimate = min(num_estimate, len(data_loader))
        for idx, inputs in enumerate(
            tqdm(
                data_loader,
                desc="Computing score thresholds",
                total=num_estimate,
                disable=comm.get_rank() != 0,
            )
        ):
            if idx > num_estimate:
                break
            # Gather scores from other processes periodically.
            # In early iterations, the thresholds are low, making inference slow and
            # gather relatively fast, so we gather more often.
            # Later, the thresholds are high enough that inference is fast and gathering
            # is slow, so we stop gathering.
            if (idx < 100 and idx % 10 == 0) or (idx % 500 == 0):
                global_scores = gather_scores(process_scores)

            thresholds = get_thresholds(global_scores, init_thresholds)
            if idx % 1000 == 0:  # Save thresholds for later runs
                torch.save(thresholds, init_threshold_path)

            with per_class_thresholded_inference(model, thresholds, topk):
                with _turn_off_roi_heads(model, ["mask_on", "keypoint_on"]):
                    outputs = model.inference(inputs, do_postprocess=False)
            update_scores(global_scores, inputs, outputs)
            update_scores(process_scores, inputs, outputs)

            if (idx < 100 and idx % 10 == 0) or (idx % 100 == 0):
                logger.info(
                    "Threshold range (%s, %s); # collected: (%s, %s)",
                    thresholds[:-1].min(),
                    thresholds[:-1].max(),
                    min(len(x) for x in global_scores.values()),
                    max(len(x) for x in global_scores.values()),
                )

        del global_scores
        # Necessary to avoid timeout when gathering?
        comm.synchronize()

        # Map class to scores of predictions so far.
        init_scores = gather_scores(process_scores)
        # Minimum thresholds from the estimate stage
        init_thresholds = get_thresholds(init_scores, init_thresholds)
        # Clear scores from estimates; we will start tracking them again.
        scores = defaultdict(list)

        #########
        # Phase 2: Collect top-k predictions, with mask branch enabled.
        #########
        for idx, inputs in enumerate(data_loader):
            if idx == num_warmup:
                start_time = time.perf_counter()
                total_compute_time = 0

            start_compute_time = time.perf_counter()
            thresholds = get_thresholds(scores, init_thresholds)
            with per_class_thresholded_inference(model, thresholds, topk):
                with limit_mask_branch_proposals(model, max_proposals=300):
                    outputs = model(inputs)
            update_scores(scores, inputs, outputs)

            if torch.cuda.is_available():
                torch.cuda.synchronize()
            total_compute_time += time.perf_counter() - start_compute_time
            evaluator.process(inputs, outputs)

            iters_after_start = idx + 1 - num_warmup * int(idx >= num_warmup)
            seconds_per_img = total_compute_time / iters_after_start
            if idx >= num_warmup * 2 or seconds_per_img > 5:
                total_seconds_per_img = (
                    time.perf_counter() - start_time
                ) / iters_after_start
                eta = datetime.timedelta(
                    seconds=int(total_seconds_per_img * (total - idx - 1))
                )
                log_every_n_seconds(
                    logging.INFO,
                    "Inference done {}/{}. {:.4f} s / img. ETA={}".format(
                        idx + 1, total, seconds_per_img, str(eta)
                    ),
                    n=5,
                    name=logger.name,
                )

            # Clear unnecessary predictions every so often.
            if idx < 100 or ((idx + 1) % 10) == 0:
                by_cat = defaultdict(list)
                for pred in evaluator._predictions:
                    for ann in pred["instances"]:
                        by_cat[ann["category_id"]].append(ann)
                topk_preds = []
                for c, anns in by_cat.items():
                    topk_preds.extend(
                        sorted(anns, key=lambda a: a["score"], reverse=True)[: topk[c]]
                    )
                evaluator._predictions = [{"instances": topk_preds}]

    if evaluator._output_dir:
        PathManager.mkdirs(evaluator._output_dir)
        file_path = os.path.join(
            evaluator._output_dir, f"instances_predictions_rank{comm.get_rank()}.pth"
        )
        with PathManager.open(file_path, "wb") as f:
            torch.save(evaluator._predictions, f)

    # Necessary to avoid timeout when gathering?
    comm.synchronize()
    # Limit number of detections per category across workers.
    predictions = comm.gather(evaluator._predictions, dst=0)
    if comm.is_main_process():
        predictions = list(itertools.chain(*predictions))
        by_cat = defaultdict(list)
        for pred in predictions:
            for ann in pred["instances"]:
                by_cat[ann["category_id"]].append(ann)
        logger.info(f"Max per cat: {max([len(v) for v in by_cat.values()])}")
        logger.info(f"Min per cat: {min([len(v) for v in by_cat.values()])}")
        topk_preds = []
        for c, anns in by_cat.items():
            topk_preds.extend(
                sorted(anns, key=lambda a: a["score"], reverse=True)[: topk[c]]
            )
        evaluator._predictions = [{"instances": topk_preds}]
    else:
        evaluator._predictions = []

    # Measure the time only for this worker (before the synchronization barrier)
    total_time = time.perf_counter() - start_time
    total_time_str = str(datetime.timedelta(seconds=total_time))
    # NOTE this format is parsed by grep
    logger.info(
        "Total inference time: {} ({:.6f} s / img per device, on {} devices)".format(
            total_time_str, total_time / (total - num_warmup), num_devices
        )
    )
    total_compute_time_str = str(datetime.timedelta(seconds=int(total_compute_time)))
    logger.info(
        "Total inference pure compute time: {} ({:.6f} s / img per device, on {} devices)".format(
            total_compute_time_str,
            total_compute_time / (total - num_warmup),
            num_devices,
        )
    )

    results = evaluator.evaluate()
    # An evaluator may return None when not in main process.
    # Replace it by an empty dict instead to make it easier for downstream code to handle
    if results is None:
        results = {}
    return results
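
The per-class score tracking in `update_scores` is a bounded min-heap that keeps only the top-k scores; a small standalone sketch of that pattern (the scores and k below are made up for illustration):

import heapq

def push_topk(heap, score, k):
    # Keep only the k largest scores; heap[0] is always the current k-th best.
    if len(heap) >= k:
        if score > heap[0]:
            heapq.heappushpop(heap, score)
    else:
        heapq.heappush(heap, score)

heap = []
for s in [0.2, 0.9, 0.5, 0.7, 0.1, 0.8]:
    push_topk(heap, s, k=3)
print(sorted(heap, reverse=True))  # [0.9, 0.8, 0.7]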
Example 4
def inference_ensemble_on_dataset(models, data_loader, evaluator):
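    """Run an ensemble of `models` on the data_loader and evaluate with evaluator.

    For each image, the per-model predictions are merged by
    `merge_multi_predictions` with `nms_threshold=0.5` before being passed to
    the evaluator. Timing is logged the same way as in `inference_on_dataset`.
    """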
    num_devices = get_world_size()
    logger = logging.getLogger(__name__)
    logger.info("Start inference on {} images".format(len(data_loader)))

    total = len(data_loader)  # inference data loader must have a fixed length
    if evaluator is None:
        # create a no-op evaluator
        evaluator = DatasetEvaluators([])
    evaluator.reset()

    num_warmup = min(5, total - 1)
    start_time = time.perf_counter()
    total_compute_time = 0
    for model in models:
        model.eval()

    with torch.no_grad():
        for idx, inputs in enumerate(data_loader):
            if idx == num_warmup:
                start_time = time.perf_counter()
                total_compute_time = 0

            start_compute_time = time.perf_counter()

            outputs = []
            for model in models:
                outputs.append(model(inputs))

            res = []
            for i in range(len(outputs[0])):
                out_i = [output[i] for output in outputs]
                merged_instances = merge_multi_predictions(
                    out_i,
                    (inputs[i]['height'], inputs[i]['width']),
                    nms_threshold=0.5,
                )
                res.append({"instances": merged_instances})

            outputs = res
            if torch.cuda.is_available():
                torch.cuda.synchronize()
            total_compute_time += time.perf_counter() - start_compute_time
            evaluator.process(inputs, outputs)

            iters_after_start = idx + 1 - num_warmup * int(idx >= num_warmup)
            seconds_per_img = total_compute_time / iters_after_start
            if idx >= num_warmup * 2 or seconds_per_img > 5:
                total_seconds_per_img = (time.perf_counter() - start_time) / iters_after_start
                eta = datetime.timedelta(seconds=int(total_seconds_per_img * (total - idx - 1)))
                log_every_n_seconds(
                    logging.INFO,
                    "Inference done {}/{}. {:.4f} s / img. ETA={}".format(
                        idx + 1, total, seconds_per_img, str(eta)
                    ),
                    n=5,
                )

    # Measure the time only for this worker (before the synchronization barrier)
    total_time = time.perf_counter() - start_time
    total_time_str = str(datetime.timedelta(seconds=total_time))
    # NOTE this format is parsed by grep
    logger.info(
        "Total inference time: {} ({:.6f} s / img per device, on {} devices)".format(
            total_time_str, total_time / (total - num_warmup), num_devices
        )
    )
    total_compute_time_str = str(datetime.timedelta(seconds=int(total_compute_time)))
    logger.info(
        "Total inference pure compute time: {} ({:.6f} s / img per device, on {} devices)".format(
            total_compute_time_str, total_compute_time / (total - num_warmup), num_devices
        )
    )

    results = evaluator.evaluate()
    # An evaluator may return None when not in main process.
    # Replace it by an empty dict instead to make it easier for downstream code to handle
    if results is None:
        results = {}
    return results
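
A hedged usage sketch for the ensemble helper, reusing the detectron2 imports from the Example 1 sketch; the configs, weight paths, and dataset name are placeholders, and `merge_multi_predictions` must be supplied by the surrounding codebase:

# Hypothetical usage; cfg_a, cfg_b, the weight paths, and the dataset name are placeholders.
models = []
for c, w in [(cfg_a, "model_a.pth"), (cfg_b, "model_b.pth")]:
    m = build_model(c)
    DetectionCheckpointer(m).load(w)
    models.append(m)
data_loader = build_detection_test_loader(cfg_a, "my_dataset_val")
evaluator = COCOEvaluator("my_dataset_val", output_dir="./ensemble_eval")
results = inference_ensemble_on_dataset(models, data_loader, evaluator)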