Example #1
    def _do_loss_eval(self):
        # Copying inference_on_dataset from evaluator.py
        total = len(self._data_loader)
        num_warmup = min(5, total - 1)
            
        start_time = time.perf_counter()
        total_compute_time = 0
        losses = []
        for idx, inputs in enumerate(self._data_loader):            
            if idx == num_warmup:
                start_time = time.perf_counter()
                total_compute_time = 0
            # Time the loss computation itself (the original only timed the CUDA
            # synchronize call, so seconds_per_img was effectively zero).
            start_compute_time = time.perf_counter()
            loss_batch = self._get_loss(inputs)
            losses.append(loss_batch)
            if torch.cuda.is_available():
                torch.cuda.synchronize()
            total_compute_time += time.perf_counter() - start_compute_time
            iters_after_start = idx + 1 - num_warmup * int(idx >= num_warmup)
            seconds_per_img = total_compute_time / iters_after_start
            if idx >= num_warmup * 2 or seconds_per_img > 5:
                total_seconds_per_img = (time.perf_counter() - start_time) / iters_after_start
                eta = datetime.timedelta(seconds=int(total_seconds_per_img * (total - idx - 1)))
                log_every_n_seconds(
                    logging.INFO,
                    "Loss on Validation  done {}/{}. {:.4f} s / img. ETA={}".format(
                        idx + 1, total, seconds_per_img, str(eta)
                    ),
                    n=5,
                )
        mean_loss = np.mean(losses)
        self.trainer.storage.put_scalar('validation_loss', mean_loss)
        comm.synchronize()

        return losses
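The _do_loss_eval variants on this page only make sense when driven periodically by a training hook. Below is a minimal, self-contained sketch of that wiring using detectron2's HookBase; the LossEvalHook class, its constructor arguments, and the evaluation period are illustrative assumptions, not code from any of the examples. Once registered via trainer.register_hooks([LossEvalHook(...)]), the trainer injects itself as self.trainer, which is where storage and iter come from above.

import numpy as np
import torch

import detectron2.utils.comm as comm
from detectron2.engine import HookBase


class LossEvalHook(HookBase):
    def __init__(self, eval_period, model, data_loader):
        self._model = model
        self._period = eval_period
        self._data_loader = data_loader

    def _get_loss(self, data):
        # In training mode a detectron2 model returns a dict of losses.
        with torch.no_grad():
            loss_dict = self._model(data)
        return sum(
            v.detach().cpu().item() if isinstance(v, torch.Tensor) else float(v)
            for v in loss_dict.values()
        )

    def after_step(self):
        next_iter = self.trainer.iter + 1
        is_final = next_iter == self.trainer.max_iter
        if is_final or (self._period > 0 and next_iter % self._period == 0):
            losses = [self._get_loss(inputs) for inputs in self._data_loader]
            self.trainer.storage.put_scalar("validation_loss", np.mean(losses))
            comm.synchronize()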
Example #2
    def _do_loss_eval(self) -> float:
        """
        Evaluate the loss function on the validation set.

        Returns:
            mean_loss (float):  Value of the loss.
        """
        # Copying inference_on_dataset from evaluator.py
        num_samples: int = len(self._data_loader)
        self._logger.info("Starting validation on %d samples",
                          num_samples)
        num_warmup: int = min(5, num_samples - 1)

        start_time: float = time.perf_counter()
        total_compute_time: float = 0
        losses: List[float] = []
        for idx, inputs in enumerate(self._data_loader):
            if idx == num_warmup:
                start_time = time.perf_counter()
                total_compute_time = 0

            # Inference for these inputs
            start_compute_time: float = time.perf_counter()
            loss_batch: float = self._get_loss(inputs)
            losses.append(loss_batch)
            if torch.cuda.is_available():
                torch.cuda.synchronize()

            total_compute_time += time.perf_counter() - start_compute_time
            iters_after_start = idx + 1 - num_warmup * int(idx >= num_warmup)

            seconds_per_img = total_compute_time / iters_after_start
            if idx >= num_warmup * 2 or seconds_per_img > 5:
                # Compute average time spent on each image.
                total_seconds_per_img = (time.perf_counter() - start_time) / iters_after_start

                # Compute ETA
                eta = datetime.timedelta(
                    seconds=int(total_seconds_per_img * (num_samples - idx - 1)))

                log_every_n_seconds(lvl=logging.INFO,
                                    msg=f"Loss on Validation done {idx + 1}/{num_samples}."\
                                        f" {seconds_per_img:.4f} s / img. ETA={eta}",
                                    n=100,
                                    name=__name__)

        # Average the losses.
        mean_loss = np.mean(losses)

        # Print the loss value.
        self._logger.info("Validation loss : {mean_loss}")

        # Store the loss value for it to be logged and displayed in TensorBoard.
        self.trainer.storage.put_scalar('validation_loss',
                                        mean_loss)
        comm.synchronize()

        return mean_loss
Example #3
    def evaluate_loss(self, cfg, model):
        """Compute and log the validation loss to Comet

        Args:
            cfg (CfgNode): Detectron Config Object
            model (torch.nn.Module): Detectron Model

        Returns:
            dict: Empty Dict to satisfy Detectron Eval Hook API requirements
        """
        eval_loader = build_detection_test_loader(cfg, cfg.DATASETS.TEST[0],
                                                  DatasetMapper(cfg, True))

        # Copying inference_on_dataset from evaluator.py
        total = len(eval_loader)
        num_warmup = min(5, total - 1)

        start_time = time.perf_counter()
        total_compute_time = 0
        losses = []

        if comm.is_main_process():
            storage = get_event_storage()

            for idx, inputs in enumerate(eval_loader):
                if idx == num_warmup:
                    start_time = time.perf_counter()
                    total_compute_time = 0
                # Time the loss computation itself (the original only timed the
                # CUDA synchronize call, so seconds_per_img was effectively zero).
                start_compute_time = time.perf_counter()
                loss_batch = self._get_loss(model, inputs)
                losses.append(loss_batch)
                if torch.cuda.is_available():
                    torch.cuda.synchronize()
                total_compute_time += time.perf_counter() - start_compute_time
                iters_after_start = idx + 1 - num_warmup * int(
                    idx >= num_warmup)
                seconds_per_img = total_compute_time / iters_after_start
                if idx >= num_warmup * 2 or seconds_per_img > 5:
                    total_seconds_per_img = (time.perf_counter() -
                                             start_time) / iters_after_start
                    eta = datetime.timedelta(
                        seconds=int(total_seconds_per_img * (total - idx - 1)))
                    log_every_n_seconds(
                        logging.INFO,
                        "Loss on Validation  done {}/{}. {:.4f} s / img. ETA={}"
                        .format(idx + 1, total, seconds_per_img, str(eta)),
                        n=5,
                    )
            mean_loss = np.mean(losses)

            # Log to Comet
            self.experiment.log_metric("eval_loss", mean_loss)

            storage.put_scalar("eval_loss", mean_loss)
            comm.synchronize()

        # Returns empty dict to satisfy Detectron Eval Hook requirement
        return {}
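The docstring above mentions the Detectron Eval Hook API: detectron2's EvalHook expects a zero-argument callable that returns a (possibly empty) dict of results, which is why evaluate_loss returns {}. A hedged sketch of registering such a callable with a DefaultTrainer follows; the trainer setup and the comet_helper object are illustrative assumptions.

from detectron2.engine import DefaultTrainer, hooks


def build_trainer_with_loss_eval(cfg, comet_helper):
    # comet_helper is assumed to expose evaluate_loss(cfg, model) as in the example above.
    trainer = DefaultTrainer(cfg)
    trainer.register_hooks([
        hooks.EvalHook(
            cfg.TEST.EVAL_PERIOD,
            # EvalHook calls this with no arguments and expects a dict back.
            lambda: comet_helper.evaluate_loss(cfg, trainer.model),
        )
    ])
    return trainer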
Example #4
def inference(cfg, out_dir):

    # build model
    model = build_model(cfg)
    # resume
    DetectionCheckpointer(model, save_dir=cfg.OUTPUT_DIR).resume_or_load(
        "./output/autoaug_post_train/model_final.pth", resume=True)

    # data_loader
    mapper = DatasetMapper(cfg, False)
    data_loader = build_detection_test_loader(cfg, cfg.DATASETS.TEST[0],
                                              mapper)

    total = len(data_loader)  # inference data loader must have a fixed length

    num_devices = torch.distributed.get_world_size(
    ) if torch.distributed.is_initialized() else 1

    num_warmup = min(5, total - 1)
    start_time = time.perf_counter()
    total_compute_time = 0
    # A bare torch.no_grad() call has no effect; disable gradient tracking explicitly.
    torch.set_grad_enabled(False)
    model.eval()
    for idx, inputs in enumerate(data_loader):

        start_compute_time = time.perf_counter()
        outputs = model(inputs)
        if torch.cuda.is_available():
            torch.cuda.synchronize()
        total_compute_time += time.perf_counter() - start_compute_time

        # log
        iters_after_start = idx + 1 - num_warmup * int(idx >= num_warmup)
        seconds_per_img = total_compute_time / iters_after_start
        if idx >= num_warmup * 2 or seconds_per_img > 5:
            total_seconds_per_img = (time.perf_counter() -
                                     start_time) / iters_after_start
            eta = datetime.timedelta(seconds=int(total_seconds_per_img *
                                                 (total - idx - 1)))
            log_every_n_seconds(
                logging.INFO,
                "Inference done {}/{}. {:.4f} s / img. ETA={}".format(
                    idx + 1, total, seconds_per_img, str(eta)),
                n=5,
            )

        for input, output in zip(inputs, outputs):

            pred_segm = output["sem_seg"].to("cpu")
            pred = torch.max(pred_segm, dim=0)[1].data
            pred = pred.numpy()[:, :, np.newaxis]
            pred = np.dstack((pred, pred, pred))

            cv2.imwrite(
                out_dir +
                input["file_name"].split("/")[-1].replace("jpg", "png"),
                pred * 255)
Example #5
def do_test(cfg, model):
    results = OrderedDict()
    for dataset_name in cfg.DATASETS.TEST:
        if cfg.MULTI_DATASET.ENABLED:
            # TODO: refactor
            try:
                model.set_eval_dataset(dataset_name)
            except AttributeError:
                # the model may be wrapped, e.g. by DistributedDataParallel
                try:
                    model.module.set_eval_dataset(dataset_name)
                except AttributeError:
                    print('set eval dataset failed.')
        data_loader = build_detection_test_loader(cfg, dataset_name)
        logger = logging.getLogger(__name__)
        logger.info("Start inference on {} images".format(len(data_loader)))
        total = min(len(data_loader), cfg.DUMP_NUM_IMG)
        start_time = time.perf_counter()
        model.eval()

        with torch.no_grad():
            for idx, inputs in enumerate(data_loader):
                # stop once `total` images have been processed
                if idx >= total:
                    break
                _ = model(inputs)
                total_seconds_per_img = (time.perf_counter() -
                                         start_time) / (idx + 1)
                eta = datetime.timedelta(seconds=int(total_seconds_per_img *
                                                     (total - idx - 1)))
                log_every_n_seconds(
                    logging.INFO,
                    "Inference done {}/{}. ETA={}".format(
                        idx + 1, total, str(eta)),
                    n=5,
                )

        if cfg.DUMP_CLS_SCORE:
            class_scores = model.roi_heads.class_scores
            class_scores = [[y.tolist() for y in x] for x in class_scores]
            with open('{}/class_scores_{}.json'.format(cfg.OUTPUT_DIR,
                                                       dataset_name), 'w') as f:
                json.dump(class_scores, f)
            model.roi_heads.class_scores = []
            if cfg.DUMP_BBOX:
                boxes = model.roi_heads.dump_boxes
                boxes = [[y.tolist() for y in x] for x in boxes]
                with open('{}/boxes_{}.json'.format(cfg.OUTPUT_DIR,
                                                    dataset_name), 'w') as f:
                    json.dump(boxes, f)
                model.roi_heads.dump_boxes = []

    return
Example #6
    def _do_eval_loss(self, data_loader):
        total = len(data_loader)
        with torch.no_grad():
            for idx, inputs in enumerate(data_loader):
                loss_dict = self._model(inputs)
                # loss_dict_scaled = {k: v * self.weight_dict[k] if k in self.weight_dict else v for k, v in loss_dict.items()}
                device = next(iter(loss_dict.values())).device
                with torch.cuda.stream(torch.cuda.Stream() if device.type ==
                                       "cuda" else None):
                    metrics_dict = {
                        'val_' + k: v.detach().cpu().item()
                        for k, v in loss_dict.items()
                    }
                    all_metrics_dict = comm.gather(metrics_dict)

                if comm.is_main_process():
                    metrics_dict = {
                        k: np.mean([x[k] for x in all_metrics_dict])
                        for k in all_metrics_dict[0].keys()
                    }
                    total_losses_reduced = sum(
                        metrics_dict[k] * self.weight_dict[k.split('val_')[-1]]
                        for k in metrics_dict.keys()
                        if k.split('val_')[-1] in self.weight_dict)
                    if not np.isfinite(total_losses_reduced):
                        raise FloatingPointError(
                            f"Loss became infinite or NaN at iteration={idx}!\n"
                            f"loss_dict = {metrics_dict}")
                    if torch.cuda.is_available():
                        max_mem_mb = torch.cuda.max_memory_allocated(
                        ) / 1024.0 / 1024.0
                    else:
                        max_mem_mb = None
                    log_every_n_seconds(
                        logging.INFO,
                        msg=
                        " iter: {iter}/{total}  val_loss:{val_loss}   {losses}  {memory}"
                        .format(iter=idx + 1,
                                total=total,
                                val_loss='{:.3f}'.format(total_losses_reduced),
                                losses="  ".join([
                                    "{}: {:.3f}".format(
                                        k.split('val_loss_')[-1], v)
                                    for k, v in metrics_dict.items()
                                ]),
                                memory="max_mem: {:.0f}M".format(max_mem_mb)
                                if max_mem_mb is not None else ""),
                        n=5,
                        name=self.logger)

                    storage = get_event_storage()
                    if len(metrics_dict) > 1:
                        storage.put_scalars(
                            total_val_loss=total_losses_reduced,
                            **metrics_dict)
Example #7
    def _log_progress(self, percentage):
        log_every_n_seconds(
            logging.INFO,
            "({:.2f}%) Wrote {} elements to local disk cache, db size: {:.2f} MiB"
            .format(
                percentage,
                len(self._cache.cache),
                self._cache.cache.volume() / 1024**2,
            ),
            n=10,
        )
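All of the snippets on this page throttle their progress messages with log_every_n_seconds. The toy loop below isolates that call pattern, assuming the detectron2.utils.logger import path used by the surrounding examples: the message is emitted at most once every n seconds per call site, so calling it on every iteration is cheap.

import logging
import time

from detectron2.utils.logger import log_every_n_seconds, setup_logger

setup_logger()  # configure detectron2's default logging handlers

for idx in range(1000):
    time.sleep(0.01)  # stand-in for real work
    log_every_n_seconds(
        logging.INFO,
        "processed {} items".format(idx + 1),
        n=2,  # emit at most one log line every 2 seconds
    )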
Example #8
    def _do_loss_eval(self):
        # Copying inference_on_dataset from evaluator.py
        total = len(self._data_loader)
        num_warmup = min(5, total - 1)

        start_time = time.perf_counter()
        total_compute_time = 0
        losses = []
        for idx, inputs in enumerate(self._data_loader):
            if idx == num_warmup:
                start_time = time.perf_counter()
                total_compute_time = 0
            # Time the loss computation itself (the original only timed the CUDA
            # synchronize call, so seconds_per_img was effectively zero).
            start_compute_time = time.perf_counter()
            loss_batch = self._get_loss(inputs)
            losses.append(loss_batch)
            if torch.cuda.is_available():
                torch.cuda.synchronize()
            total_compute_time += time.perf_counter() - start_compute_time
            iters_after_start = idx + 1 - num_warmup * int(idx >= num_warmup)
            seconds_per_img = total_compute_time / iters_after_start
            if idx >= num_warmup * 2 or seconds_per_img > 5:
                total_seconds_per_img = (time.perf_counter() - start_time) / iters_after_start
                eta = datetime.timedelta(seconds=int(total_seconds_per_img * (total - idx - 1)))
                log_every_n_seconds(
                    logging.INFO,
                    "Loss on Validation  done {}/{}. {:.4f} s / img. ETA={}".format(
                        idx + 1, total, seconds_per_img, str(eta)
                    ),
                    n=5,
                )
        mean_loss = np.mean(losses)
        tol = 1e-4  # improvement tolerance (the original `1e10-4` was a typo for 1e-4)
        if mean_loss < self.best_val_loss + tol:
            self.best_val_loss = mean_loss
            self.waiting = 0
            print("Saving best model...")
            self.trainer.checkpointer.save("best_model")
            print("Model saved")
        self.waiting += 1

        self.trainer.storage.put_scalar('validation_loss', mean_loss)

        metrics_dict = {k: v[0] for k, v in self.trainer.storage.latest().items()}
        self.train_process.log_metrics(metrics_dict, self.trainer.iter)

        if self.waiting > self.patience and self.patience >= 0:
            self.trainer.run = False
        comm.synchronize()
        return losses
Example #9
def inference_on_dataset(model, data_loader, evaluator):
    """
    Run model on the data_loader and evaluate the metrics with evaluator.
    Also benchmark the inference speed of `model.forward` accurately.
    The model will be used in eval mode.

    Args:
        model (nn.Module): a module which accepts an object from
            `data_loader` and returns some outputs. It will be temporarily set to `eval` mode.

            If you wish to evaluate a model in `training` mode instead, you can
            wrap the given model and override its behavior of `.eval()` and `.train()`.
        data_loader: an iterable object with a length.
            The elements it generates will be the inputs to the model.
        evaluator (DatasetEvaluator): the evaluator to run. Use `None` if you only want
            to benchmark, but don't want to do any evaluation.

    Returns:
        The return value of `evaluator.evaluate()`
    """
    num_devices = get_world_size()
    logger = logging.getLogger(__name__)
    logger.info("Start inference on {} images".format(len(data_loader)))

    total = len(data_loader)  # inference data loader must have a fixed length
    if evaluator is None:
        # create a no-op evaluator
        evaluator = DatasetEvaluators([])
    evaluator.reset()

    num_warmup = min(5, total - 1)
    start_time = time.perf_counter()
    total_compute_time = 0
    # inference_context temporarily puts the model in eval() mode
    with inference_context(model), torch.no_grad():
        for idx, inputs in enumerate(data_loader):
            if idx == num_warmup:
                start_time = time.perf_counter()
                total_compute_time = 0

            start_compute_time = time.perf_counter()
            """
            # inputs:list[dict]
            dict = {
             {
                "file_name":图片完全路径
                "height": 原始图片高
                "width":原始图片宽
                "image_id":图片id
                "image": tensor(N,H,W) # gt_boxes 用Boxes 封装,里面是一个tensor with shape(num,4)
                "instances":Instances(a class with attr, gt_boxes(Boxes),gt_classes(list[int]),.image_size)
                 # 此.image_size 是经过transfrom 后的
            }
            """
            #outputs: list[dict{"instances":Instances}] with shape [batch_size]
            """
            if without nms, shape is [topk] for each attr
            Instances:
                .pred_boxes(Boxes): Boxes.tensor with shape[topk,4] # 注意box已经根据原图尺寸进行了调整
                .scores(Tensor): shape[topk]
                .pred_classes(Tensor): prediction of class id
            """
            outputs = model(inputs)
            if torch.cuda.is_available():
                torch.cuda.synchronize()
            total_compute_time += time.perf_counter() - start_compute_time
            # feed inputs and outputs to the evaluator
            evaluator.process(inputs, outputs)

            # total_compute_time excludes the warm-up iterations,
            # so iters_after_start must also subtract num_warmup
            iters_after_start = idx + 1 - num_warmup * int(idx >= num_warmup)
            seconds_per_img = total_compute_time / iters_after_start

            if idx >= num_warmup * 2 or seconds_per_img > 5:
                # ETA: estimated time remaining (Estimated Time of Arrival)
                total_seconds_per_img = (time.perf_counter() -
                                         start_time) / iters_after_start
                eta = datetime.timedelta(seconds=int(total_seconds_per_img *
                                                     (total - idx - 1)))
                log_every_n_seconds(
                    logging.INFO,
                    "Inference done {}/{}. {:.4f} s / img. ETA={}".format(
                        idx + 1, total, seconds_per_img, str(eta)),
                    n=5,
                )

    # Measure the time only for this worker (before the synchronization barrier)
    total_time = time.perf_counter() - start_time
    total_time_str = str(datetime.timedelta(seconds=total_time))
    # NOTE this format is parsed by grep
    logger.info(
        "Total inference time: {} ({:.6f} s / img per device, on {} devices)".
        format(total_time_str, total_time / (total - num_warmup), num_devices))
    total_compute_time_str = str(
        datetime.timedelta(seconds=int(total_compute_time)))
    logger.info(
        "Total inference pure compute time: {} ({:.6f} s / img per device, on {} devices)"
        .format(total_compute_time_str,
                total_compute_time / (total - num_warmup), num_devices))
    """ return results:
    res = {
    "AP", "AP50", "AP75", "APs", "APm", "APl", "AP-{#class_name}",...
    }
    """
    results = evaluator.evaluate()
    # An evaluator may return None when not in main process.
    # Replace it by an empty dict instead to make it easier for downstream code to handle
    if results is None:
        results = {}
    return results
Example #10
def inference_on_dataset(model, data_loader, evaluator):
    """
    Run model on the data_loader and evaluate the metrics with evaluator.
    The model will be used in eval mode.

    Args:
        model (nn.Module): a module which accepts an object from
            `data_loader` and returns some outputs. It will be temporarily set to `eval` mode.

            If you wish to evaluate a model in `training` mode instead, you can
            wrap the given model and override its behavior of `.eval()` and `.train()`.
        data_loader: an iterable object with a length.
            The elements it generates will be the inputs to the model.
        evaluator (DatasetEvaluator): the evaluator to run. Use
            :class:`DatasetEvaluators([])` if you only want to benchmark, but
            don't want to do any evaluation.

    Returns:
        The return value of `evaluator.evaluate()`
    """
    num_devices = torch.distributed.get_world_size(
    ) if torch.distributed.is_initialized() else 1
    logger = logging.getLogger(__name__)
    logger.info("Start inference on {} images".format(len(data_loader)))

    total = len(data_loader)  # inference data loader must have a fixed length
    evaluator.reset()

    num_warmup = min(5, total - 1)
    start_time = time.perf_counter()
    total_compute_time = 0
    with inference_context(model), torch.no_grad():
        for idx, inputs in enumerate(data_loader):
            if idx == num_warmup:
                start_time = time.perf_counter()
                total_compute_time = 0

            start_compute_time = time.perf_counter()
            outputs = model(inputs)
            if torch.cuda.is_available():
                torch.cuda.synchronize()
            total_compute_time += time.perf_counter() - start_compute_time
            evaluator.process(inputs, outputs)

            if idx >= num_warmup * 2:
                seconds_per_img = total_compute_time / (idx + 1 - num_warmup)
                eta = datetime.timedelta(seconds=int(seconds_per_img *
                                                     (total - idx - 1)))
                log_every_n_seconds(
                    logging.INFO,
                    "Inference done {}/{}. {:.4f} s / img. ETA={}".format(
                        idx + 1, total, seconds_per_img, str(eta)),
                    n=5,
                )

    # Measure the time only for this worker (before the synchronization barrier)
    total_time = time.perf_counter() - start_time
    total_time_str = str(datetime.timedelta(seconds=total_time))
    # NOTE this format is parsed by grep
    logger.info(
        "Total inference time: {} ({:.6f} s / img per device, on {} devices)".
        format(total_time_str, total_time / (total - num_warmup), num_devices))
    total_compute_time_str = str(
        datetime.timedelta(seconds=int(total_compute_time)))
    logger.info(
        "Total inference pure compute time: {} ({:.6f} s / img per device, on {} devices)"
        .format(total_compute_time_str,
                total_compute_time / (total - num_warmup), num_devices))

    results = evaluator.evaluate()
    # An evaluator may return None when not in main process.
    # Replace it by an empty dict instead to make it easier for downstream code to handle
    if results is None:
        results = {}
    return results
Example #11
def inference_on_dataset(model,
                         data_loader,
                         evaluator,
                         overwrite=True,
                         only_zero_rot=True):
    """
    Run model on the data_loader and evaluate the metrics with evaluator.
    Also benchmark the inference speed of `model.forward` accurately.
    The model will be used in eval mode.

    Args:
        model (nn.Module): a module which accepts an object from
            `data_loader` and returns some outputs. It will be temporarily set to `eval` mode.

            If you wish to evaluate a model in `training` mode instead, you can
            wrap the given model and override its behavior of `.eval()` and `.train()`.
        data_loader: an iterable object with a length.
            The elements it generates will be the inputs to the model.
        evaluator (DatasetEvaluator): the evaluator to run. Use `None` if you only want
            to benchmark, but don't want to do any evaluation.

    Returns:
        The return value of `evaluator.evaluate()`
    """
    num_devices = get_world_size()
    logger = logging.getLogger(__name__)
    logger.info("Start inference on {} images".format(len(data_loader)))

    total = len(data_loader)  # inference data loader must have a fixed length
    if evaluator is None:
        # create a no-op evaluator
        evaluator = DatasetEvaluators([])
    evaluator.reset()

    predictions_save_path = path.join(
        evaluator._output_dir, f'predictions_{evaluator._dataset_name}.pkl')
    if not overwrite and path.exists(predictions_save_path):
        # Load existing predictions if overwrite is false
        print("Loading existing predictions")
        #evaluator._predictions = load_obj(predictions_save_path)
        (evaluator._predictions, evaluator.focussed_comps,
         evaluator.related_comps, evaluator.unrelated_comps, evaluator.n_comps,
         evaluator.pred_bboxes_scores, evaluator.unrelated_names,
         evaluator.focussed_names, evaluator.related_unresolved,
         evaluator.unrelated_unresolved, evaluator.wide_focus,
         evaluator.old_related_unresolved, evaluator.old_unrelated_unresolved,
         evaluator.misboxed_category) = load_obj(predictions_save_path)
    else:

        num_warmup = min(5, total - 1)
        start_time = time.perf_counter()
        total_compute_time = 0
        with inference_context(model), torch.no_grad():
            for idx, inputs in enumerate(data_loader):
                # We only need to evaluate the unrotated images
                #if inputs[0]['file_name'].endswith('ILTJ110530.36+465055.8_radio_DR2_rotated0deg.png'):
                #    print('input filename')
                #    print(inputs[0]['proposals'])

                if only_zero_rot and not inputs[0]['file_name'].endswith(
                        '_rotated0deg.png'):
                    continue
                if idx == num_warmup:
                    start_time = time.perf_counter()
                    total_compute_time = 0

                start_compute_time = time.perf_counter()
                outputs = model(inputs)
                #missing_box = 'ILTJ123057.73+464446.2_radio_DR2_rotated0deg.png'
                #if inputs[0]['file_name'].endswith(missing_box):
                #    print('output filename',missing_box)
                #    print('inputs:',inputs)
                #    print('outputs:', outputs)

                if torch.cuda.is_available():
                    torch.cuda.synchronize()
                total_compute_time += time.perf_counter() - start_compute_time
                # Appends predicted instances to evaluator._predictions
                evaluator.process(inputs, outputs)

                iters_after_start = idx + 1 - num_warmup * int(
                    idx >= num_warmup)
                seconds_per_img = total_compute_time / iters_after_start
                if idx >= num_warmup * 2 or seconds_per_img > 5:
                    total_seconds_per_img = (time.perf_counter() -
                                             start_time) / iters_after_start
                    eta = datetime.timedelta(
                        seconds=int(total_seconds_per_img * (total - idx - 1)))
                    log_every_n_seconds(
                        logging.INFO,
                        "Inference done {}/{}. {:.4f} s / img. ETA={}".format(
                            idx + 1, total, seconds_per_img, str(eta)),
                        n=10,
                    )
        # Save to pickle
        save_obj([
            evaluator._predictions, evaluator.focussed_comps,
            evaluator.related_comps, evaluator.unrelated_comps,
            evaluator.n_comps, evaluator.pred_bboxes_scores,
            evaluator.unrelated_names, evaluator.focussed_names,
            evaluator.related_unresolved, evaluator.unrelated_unresolved,
            evaluator.wide_focus, evaluator.old_related_unresolved,
            evaluator.old_unrelated_unresolved, evaluator.misboxed_category
        ], predictions_save_path)

        # Measure the time only for this worker (before the synchronization barrier)
        total_time = time.perf_counter() - start_time
        total_time_str = str(datetime.timedelta(seconds=total_time))
        # NOTE this format is parsed by grep
        logger.info(
            "Total inference time: {} ({:.6f} s / img per device, on {} devices)"
            .format(total_time_str, total_time / (total - num_warmup),
                    num_devices))
        total_compute_time_str = str(
            datetime.timedelta(seconds=int(total_compute_time)))
        logger.info(
            "Total inference pure compute time: {} ({:.6f} s / img per device, on {} devices)"
            .format(total_compute_time_str,
                    total_compute_time / (total - num_warmup), num_devices))
    results = evaluator.evaluate()
    if not isinstance(results, pd.DataFrame):
        logger.info(
            f"LOFAR Evaluation metrics (for all values 0% is best, 100% is worst):"
        )
        logger.info(f"1. Pred. that fail to cover a single comp. source.")
        logger.info(f"{results['bbox']['assoc_single_fail_fraction']:.2%}")
        logger.info(f"2. Pred. that fail to cover all comp. of a " \
                "multi-comp, source.")
        logger.info(f"{results['bbox']['assoc_multi_fail_fraction']:.2%}")
        logger.info(
            f"3. Pred. that include unassociated comp. for a single comp. source."
        )
        logger.info(f"{results['bbox']['unassoc_single_fail_fraction']:.2%}")
        logger.info(f"4. Pred. that include unassociated comp. for a " \
                "multi-comp. source.")
        logger.info(f"{results['bbox']['unassoc_multi_fail_fraction']:.2%}")
        logger.info(
            f"Catalogue is {results['bbox']['correct_catalogue']} correct.")

    # An evaluator may return None when not in main process.
    # Replace it by an empty dict instead to make it easier for downstream code to handle
    if results is None:
        results = {}
    return results
Example #12
def inference_on_dataset(
    model, data_loader, evaluator: Union[DatasetEvaluator, List[DatasetEvaluator], None]
):
    """
    Run model on the data_loader and evaluate the metrics with evaluator.
    Also benchmark the inference speed of `model.__call__` accurately.
    The model will be used in eval mode.

    Args:
        model (callable): a callable which takes an object from
            `data_loader` and returns some outputs.

            If it's an nn.Module, it will be temporarily set to `eval` mode.
            If you wish to evaluate a model in `training` mode instead, you can
            wrap the given model and override its behavior of `.eval()` and `.train()`.
        data_loader: an iterable object with a length.
            The elements it generates will be the inputs to the model.
        evaluator: the evaluator(s) to run. Use `None` if you only want to benchmark,
            but don't want to do any evaluation.

    Returns:
        The return value of `evaluator.evaluate()`
    """
    num_devices = get_world_size()
    logger = logging.getLogger(__name__)
    logger.info("Start inference on {} batches".format(len(data_loader)))

    total = len(data_loader)  # inference data loader must have a fixed length
    if evaluator is None:
        # create a no-op evaluator
        evaluator = DatasetEvaluators([])
    if isinstance(evaluator, abc.MutableSequence):
        evaluator = DatasetEvaluators(evaluator)
    evaluator.reset()

    num_warmup = min(5, total - 1)
    start_time = time.perf_counter()
    total_compute_time = 0
    with ExitStack() as stack:
        if isinstance(model, nn.Module):
            stack.enter_context(inference_context(model))
        stack.enter_context(torch.no_grad())

        for idx, inputs in enumerate(data_loader):
            if idx == num_warmup:
                start_time = time.perf_counter()
                total_compute_time = 0

            start_compute_time = time.perf_counter()
            outputs = model(inputs)
            if torch.cuda.is_available():
                torch.cuda.synchronize()
            total_compute_time += time.perf_counter() - start_compute_time
            evaluator.process(inputs, outputs)

            iters_after_start = idx + 1 - num_warmup * int(idx >= num_warmup)
            seconds_per_iter = total_compute_time / iters_after_start
            if idx >= num_warmup * 2 or seconds_per_iter > 5:
                total_seconds_per_iter = (time.perf_counter() - start_time) / iters_after_start
                eta = datetime.timedelta(seconds=int(total_seconds_per_iter * (total - idx - 1)))
                log_every_n_seconds(
                    logging.INFO,
                    "Inference done {}/{}. {:.4f} s / iter. ETA={}".format(
                        idx + 1, total, seconds_per_iter, str(eta)
                    ),
                    n=5,
                )

    # Measure the time only for this worker (before the synchronization barrier)
    total_time = time.perf_counter() - start_time
    total_time_str = str(datetime.timedelta(seconds=total_time))
    # NOTE this format is parsed by grep
    logger.info(
        "Total inference time: {} ({:.6f} s / iter per device, on {} devices)".format(
            total_time_str, total_time / (total - num_warmup), num_devices
        )
    )
    total_compute_time_str = str(datetime.timedelta(seconds=int(total_compute_time)))
    logger.info(
        "Total inference pure compute time: {} ({:.6f} s / iter per device, on {} devices)".format(
            total_compute_time_str, total_compute_time / (total - num_warmup), num_devices
        )
    )

    results = evaluator.evaluate()
    # An evaluator may return None when not in main process.
    # Replace it by an empty dict instead to make it easier for downstream code to handle
    if results is None:
        results = {}
    return results
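The example above is essentially the upstream detectron2 implementation of inference_on_dataset. For context, a minimal sketch of how it is typically driven follows; the whole snippet is an illustrative assumption built from standard detectron2 building blocks, not code from any of the repositories above.

from detectron2.checkpoint import DetectionCheckpointer
from detectron2.data import build_detection_test_loader
from detectron2.evaluation import COCOEvaluator, inference_on_dataset
from detectron2.modeling import build_model


def run_eval(cfg):
    model = build_model(cfg)
    DetectionCheckpointer(model).load(cfg.MODEL.WEIGHTS)

    dataset_name = cfg.DATASETS.TEST[0]
    data_loader = build_detection_test_loader(cfg, dataset_name)
    evaluator = COCOEvaluator(dataset_name, output_dir=cfg.OUTPUT_DIR)

    return inference_on_dataset(model, data_loader, evaluator)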
Example #13
def eval_mislabel_detection(dataset_name,
                            cfg,
                            mismatch_thresh=0.3,
                            augment=False):

    data_loader = build_test_loader(cfg, dataset_name)
    model = build_model(cfg)
    DetectionCheckpointer(model).load(cfg.MODEL.WEIGHTS)
    model.eval()

    n = len(data_loader)

    tp = torch.zeros(n)
    fp = torch.zeros(n)

    total = torch.zeros(n)
    npos = torch.zeros(n)

    num_devices = get_world_size()
    logger = logging.getLogger(__name__)
    logger.info("Start mislabel evaluation on {} images".format(n))

    num_warmup = min(5, n - 1)
    start_time = time.perf_counter()
    total_compute_time = 0

    for idx, inputs in enumerate(data_loader):
        if idx == num_warmup:
            start_time = time.perf_counter()
            total_compute_time = 0

        start_compute_time = time.perf_counter()

        gt_mismatch_scores, _, _, gt_mislabeled_ids = detect_mislabeled_annotations_per_image(
            inputs, model)
        pred_mislabeled_ids = gt_mismatch_scores > mismatch_thresh

        total[idx] = gt_mismatch_scores.shape[0]
        npos[idx] = torch.sum(gt_mislabeled_ids).int()

        tp[idx] = torch.sum(
            torch.logical_and(gt_mislabeled_ids, pred_mislabeled_ids)).int()
        fp[idx] = torch.sum(
            torch.logical_and(torch.logical_not(gt_mislabeled_ids),
                              pred_mislabeled_ids)).int()

        total_compute_time += time.perf_counter() - start_compute_time
        iters_after_start = idx + 1 - num_warmup * int(idx >= num_warmup)
        seconds_per_img = total_compute_time / iters_after_start
        if idx >= num_warmup * 2 or seconds_per_img > 5:
            total_seconds_per_img = (time.perf_counter() -
                                     start_time) / iters_after_start
            eta = datetime.timedelta(seconds=int(total_seconds_per_img *
                                                 (n - idx - 1)))
            log_every_n_seconds(
                logging.INFO,
                "Processed {}/{}. {:.4f} s / img. ETA={}".format(
                    idx + 1, n, seconds_per_img, str(eta)),
                n=5,
            )
    # Measure the time only for this worker (before the synchronization barrier)
    total_time = time.perf_counter() - start_time
    total_time_str = str(datetime.timedelta(seconds=total_time))

    logger.info(
        "Total evaluation time: {} ({:.6f} s / img per device, on {} devices)".
        format(total_time_str, total_time / (n - num_warmup), num_devices))
    total_compute_time_str = str(
        datetime.timedelta(seconds=int(total_compute_time)))
    logger.info(
        "Total evaluation pure compute time: {} ({:.6f} s / img per device, on {} devices)"
        .format(total_compute_time_str, total_compute_time / (n - num_warmup),
                num_devices))

    # recall = torch.sum(tp)/ torch.sum(npos)
    # precision = 1 - (torch.sum(tp) + torch.sum(fp))/torch.sum(total)

    # return recall.item(), precision.item()
    return torch.sum(tp).item(), torch.sum(fp).item(), torch.sum(
        npos).item(), torch.sum(total).item()
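The function above returns the raw counts (tp, fp, npos, total) instead of the commented-out metrics. If precision and recall are needed, they can be derived from those counts with the standard definitions, as in this small helper (not part of the original code):

def mislabel_precision_recall(tp, fp, npos):
    # precision = tp / (tp + fp); recall = tp / npos (number of mislabeled GT boxes)
    precision = tp / (tp + fp) if (tp + fp) > 0 else 0.0
    recall = tp / npos if npos > 0 else 0.0
    return precision, recall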
Example #14
def detect_mislabeled_annotations(dataset_name, cfg, mismatch_thresh=0.3):

    class_names = MetadataCatalog.get(dataset_name).thing_classes
    sa_json_dir = sa_setup_project_dir(dataset_name, class_names)
    qa = []
    completed = []

    data_loader = build_test_loader(cfg, dataset_name)
    model = build_model(cfg)
    DetectionCheckpointer(model).load(cfg.MODEL.WEIGHTS)
    model.eval()

    n = len(data_loader)
    num_devices = get_world_size()
    logger = logging.getLogger(__name__)
    logger.info("Start mislabel detection on {} images".format(n))

    num_warmup = min(5, n - 1)
    start_time = time.perf_counter()
    total_compute_time = 0

    for idx, inputs in enumerate(data_loader):

        if idx == num_warmup:
            start_time = time.perf_counter()
            total_compute_time = 0

        start_compute_time = time.perf_counter()

        gt_mismatch_scores, gt_classes, gt_boxes, _ = detect_mislabeled_annotations_per_image(
            inputs, model)
        mislabeled_gt_ids = gt_mismatch_scores > mismatch_thresh

        gt_class_info = [(class_id.item(), class_names[class_id])
                         for class_id in gt_classes]

        if torch.any(mislabeled_gt_ids):
            qa.append(inputs[0]["file_name"])
        else:
            completed.append(inputs[0]["file_name"])

        sa_format_annotations(inputs[0]['image_id'], sa_json_dir, gt_boxes,
                              gt_class_info, mislabeled_gt_ids)

        total_compute_time += time.perf_counter() - start_compute_time
        iters_after_start = idx + 1 - num_warmup * int(idx >= num_warmup)
        seconds_per_img = total_compute_time / iters_after_start
        if idx >= num_warmup * 2 or seconds_per_img > 5:
            total_seconds_per_img = (time.perf_counter() -
                                     start_time) / iters_after_start
            eta = datetime.timedelta(seconds=int(total_seconds_per_img *
                                                 (n - idx - 1)))
            log_every_n_seconds(
                logging.INFO,
                "Proessed {}/{}. {:.4f} s / img. ETA={}".format(
                    idx + 1, n, seconds_per_img, str(eta)),
                n=5,
            )
    # Measure the time only for this worker (before the synchronization barrier)
    total_time = time.perf_counter() - start_time
    total_time_str = str(datetime.timedelta(seconds=total_time))

    logger.info(
        "Total detection time: {} ({:.6f} s / img per device, on {} devices)".
        format(total_time_str, total_time / (n - num_warmup), num_devices))
    total_compute_time_str = str(
        datetime.timedelta(seconds=int(total_compute_time)))
    logger.info(
        "Total detection pure compute time: {} ({:.6f} s / img per device, on {} devices)"
        .format(total_compute_time_str, total_compute_time / (n - num_warmup),
                num_devices))

    sa_write_status_lists(dataset_name, qa, completed)
Example #15
def inference_on_dataset(model,
                         data_loader,
                         distributed=True,
                         output_dir=None):
    num_devices = get_world_size()
    logger = logging.getLogger("detectron2")
    logger.info("Start inference on {} images".format(len(data_loader)))

    total = len(data_loader)  # inference data loader must have a fixed length

    num_warmup = min(5, total - 1)
    start_time = time.perf_counter()
    total_compute_time = 0
    predictions = []
    with inference_context(model), torch.no_grad():
        for idx, inputs in enumerate(data_loader):
            if idx == num_warmup:
                start_time = time.perf_counter()
                total_compute_time = 0

            start_compute_time = time.perf_counter()
            outputs = forward_warpper(model, inputs)

            if torch.cuda.is_available():
                torch.cuda.synchronize()
            total_compute_time += time.perf_counter() - start_compute_time
            predictions.extend(process(inputs, outputs))

            iters_after_start = idx + 1 - num_warmup * int(idx >= num_warmup)
            seconds_per_img = total_compute_time / iters_after_start
            if idx >= num_warmup * 2 or seconds_per_img > 5:
                total_seconds_per_img = (time.perf_counter() -
                                         start_time) / iters_after_start
                eta = datetime.timedelta(seconds=int(total_seconds_per_img *
                                                     (total - idx - 1)))
                log_every_n_seconds(
                    logging.INFO,
                    "Inference done {}/{}. {:.4f} s / img. ETA={}".format(
                        idx + 1, total, seconds_per_img, str(eta)),
                    n=5,
                    name="detectron2",
                )
        # Measure the time only for this worker (before the synchronization barrier)
        total_time = time.perf_counter() - start_time
        total_time_str = str(datetime.timedelta(seconds=total_time))
        # NOTE this format is parsed by grep
        logger.info(
            "Total inference time: {} ({:.6f} s / img per device, on {} devices)"
            .format(total_time_str, total_time / (total - num_warmup),
                    num_devices))
        total_compute_time_str = str(
            datetime.timedelta(seconds=int(total_compute_time)))
        logger.info(
            "Total inference pure compute time: {} ({:.6f} s / img per device, on {} devices)"
            .format(total_compute_time_str,
                    total_compute_time / (total - num_warmup), num_devices))

    if distributed:
        comm.synchronize()
        predictions = comm.gather(predictions, dst=0)
        predictions = list(itertools.chain(*predictions))

        if not comm.is_main_process():
            return {}

    if output_dir:
        PathManager.mkdirs(output_dir)
        file_path = os.path.join(output_dir, "instances_predictions.pth")
        logger.info("Saving results to {}".format(file_path))
        with PathManager.open(file_path, "wb") as f:
            torch.save(predictions, f)

    coco_results = list(itertools.chain(*[x["instances"]
                                          for x in predictions]))
    logger.info(
        "Start converting obj365 results to coco type annotation json file...")
    coco_dict = convert_obj365_res_to_coco_json(coco_results)

    return coco_dict
Example #16
    def inference(model, data_loader, evaluator, k_th, K_fold):

        total = len(
            data_loader)  # inference data loader must have a fixed length

        logger = logging.getLogger("detectron2.trainer")
        logger.info("Start inference on {} images".format(total))

        num_devices = torch.distributed.get_world_size(
        ) if torch.distributed.is_initialized() else 1

        # 1.initialize evaluator counter
        evaluator.reset()

        num_warmup = min(5 * K_fold, total - 1)
        start_time = time.perf_counter()
        total_compute_time = 0
        # A bare torch.no_grad() call has no effect; disable gradient tracking explicitly.
        torch.set_grad_enabled(False)
        model.eval()
        for idx, inputs in enumerate(data_loader):

            # warm up
            if idx == num_warmup:
                start_time = time.perf_counter()
                total_compute_time = 0

            # select sub dataset
            if not idx % K_fold == k_th:
                continue

            start_compute_time = time.perf_counter()

            # 2.evaluate
            outputs = model(inputs)
            if torch.cuda.is_available():
                torch.cuda.synchronize()
            total_compute_time += time.perf_counter() - start_compute_time

            # 3.update evaluator counter
            evaluator.process(inputs, outputs)

            # log
            iters_after_start = idx + 1 - num_warmup * int(idx >= num_warmup)
            seconds_per_img = total_compute_time / iters_after_start
            if idx >= num_warmup * 2 or seconds_per_img > 5:
                total_seconds_per_img = (time.perf_counter() -
                                         start_time) / iters_after_start
                eta = datetime.timedelta(seconds=int(total_seconds_per_img *
                                                     (total - idx - 1)))
                log_every_n_seconds(
                    logging.INFO,
                    "{}_th sub_datasets | Inference done {}/{}. {:.4f} s / img. ETA={}"
                    .format(k_th, idx + 1, total, seconds_per_img, str(eta)),
                    n=5,
                )

        # Measure the time only for this worker (before the synchronization barrier)
        total_time = time.perf_counter() - start_time
        total_time_str = str(datetime.timedelta(seconds=total_time))
        # NOTE this format is parsed by grep
        logger.info(
            "Total inference time: {} ({:.6f} s / img per device, on {} devices)"
            .format(total_time_str, total_time / (total - num_warmup),
                    num_devices))
        total_compute_time_str = str(
            datetime.timedelta(seconds=int(total_compute_time)))
        logger.info(
            "Total inference pure compute time: {} ({:.6f} s / img per device, on {} devices)"
            .format(total_compute_time_str,
                    total_compute_time / (total - num_warmup), num_devices))

        # 4.final evaluate
        results = evaluator.evaluate()
        # An evaluator may return None when not in main process.
        # Replace it by an empty dict instead to make it easier for downstream code to handle
        if results is None:
            results = {}

        return results
Example #17
def inference_on_dataset(model, data_loader, tracker, evaluator):
    """
    Run model on the data_loader and evaluate the metrics with evaluator.
    Also benchmark the inference speed of `model.forward` accurately.
    The model will be used in eval mode.

    Args:
        model (nn.Module): a module which accepts an object from
            `data_loader` and returns some outputs. It will be temporarily set to `eval` mode.

            If you wish to evaluate a model in `training` mode instead, you can
            wrap the given model and override its behavior of `.eval()` and `.train()`.
        data_loader: an iterable object with a length.
            The elements it generates will be the inputs to the model.
        evaluator (DatasetEvaluator): the evaluator to run. Use `None` if you only want
            to benchmark, but don't want to do any evaluation.

    Returns:
        The return value of `evaluator.evaluate()`
    """
    num_devices = get_world_size()
    logger = logging.getLogger("detectron2")
    logger.info("Start inference on {} images".format(len(data_loader)))

    total = len(data_loader)  # inference data loader must have a fixed length
    if evaluator is None:
        # create a no-op evaluator
        evaluator = DatasetEvaluators([])
    evaluator.reset()

    num_warmup = min(5, total - 1)
    start_time = time.perf_counter()
    total_compute_time = 0
    res_tracks = dict()
    with inference_context(model), torch.no_grad():
        for idx, inputs in enumerate(data_loader):
            # pre process.
            assert len(inputs) == 1
            assert isinstance(inputs[0], tuple)
            frame_id = inputs[0][0].get("frame_id", None)
            assert frame_id is not None
            if frame_id == 1:
                tracker.reset_all()
                # warm up for first frame.
                _, pre_embed = model(inputs)
            # add pre embed to inputs.
            inputs[0][0]["pre_embed"] = pre_embed

            if idx == num_warmup:
                start_time = time.perf_counter()
                total_compute_time = 0

            # inference.
            start_compute_time = time.perf_counter()
            outputs, pre_embed = model(inputs)
            if torch.cuda.is_available():
                torch.cuda.synchronize()
            total_compute_time += time.perf_counter() - start_compute_time
            evaluator.process([inputs[0][0]], outputs)

            # post process.
            if frame_id == 1:
                res_track = tracker.init_track(outputs[0]["instances"])
            else:
                res_track = tracker.step(outputs[0]["instances"])
            res_tracks[inputs[0][0]["image_id"]] = res_track

            iters_after_start = idx + 1 - num_warmup * int(idx >= num_warmup)
            seconds_per_img = total_compute_time / iters_after_start
            if idx >= num_warmup * 2 or seconds_per_img > 5:
                total_seconds_per_img = (time.perf_counter() -
                                         start_time) / iters_after_start
                eta = datetime.timedelta(seconds=int(total_seconds_per_img *
                                                     (total - idx - 1)))
                log_every_n_seconds(
                    logging.INFO,
                    "Inference done {}/{}. {:.4f} s / img. ETA={}".format(
                        idx + 1, total, seconds_per_img, str(eta)),
                    n=5,
                    name="detectron2")

    # Measure the time only for this worker (before the synchronization barrier)
    total_time = time.perf_counter() - start_time
    total_time_str = str(datetime.timedelta(seconds=total_time))
    # NOTE this format is parsed by grep
    logger.info(
        "Total inference time: {} ({:.6f} s / img per device, on {} devices)".
        format(total_time_str, total_time / (total - num_warmup), num_devices))
    total_compute_time_str = str(
        datetime.timedelta(seconds=int(total_compute_time)))
    logger.info(
        "Total inference pure compute time: {} ({:.6f} s / img per device, on {} devices)"
        .format(total_compute_time_str,
                total_compute_time / (total - num_warmup), num_devices))

    results = evaluator.evaluate()
    # An evaluator may return None when not in main process.
    # Replace it by an empty dict instead to make it easier for downstream code to handle
    if results is None:
        results = {}
    return results, res_tracks
Example #18
def gdrn_inference_on_dataset(cfg,
                              model,
                              data_loader,
                              evaluator,
                              amp_test=False):
    """Run model on the data_loader and evaluate the metrics with evaluator.
    Also benchmark the inference speed of `model.forward` accurately. The model
    will be used in eval mode.

    Args:
        model (nn.Module): a module which accepts an object from
            `data_loader` and returns some outputs. It will be temporarily set to `eval` mode.

            If you wish to evaluate a model in `training` mode instead, you can
            wrap the given model and override its behavior of `.eval()` and `.train()`.
        data_loader: an iterable object with a length.
            The elements it generates will be the inputs to the model.
        evaluator (DatasetEvaluator): the evaluator to run. Use `None` if you only want
            to benchmark, but don't want to do any evaluation.

    Returns:
        The return value of `evaluator.evaluate()`
    """
    num_devices = get_world_size()
    logger = logging.getLogger(__name__)
    logger.info("Start inference on {} images".format(len(data_loader)))

    total = len(data_loader)  # inference data loader must have a fixed length
    if evaluator is None:
        # create a no-op evaluator
        evaluator = DatasetEvaluators([])
    evaluator.reset()

    num_warmup = min(5, total - 1)
    start_time = time.perf_counter()
    total_compute_time = 0
    total_process_time = 0
    with inference_context(model), torch.no_grad():
        for idx, inputs in enumerate(data_loader):
            if idx == num_warmup:
                start_time = time.perf_counter()
                total_compute_time = 0
                total_process_time = 0

            start_compute_time = time.perf_counter()
            #############################
            # process input
            batch = batch_data(cfg, inputs, phase="test")
            if evaluator.train_objs is not None:
                roi_labels = batch["roi_cls"].cpu().numpy().tolist()
                obj_names = [evaluator.obj_names[_l] for _l in roi_labels]
                if all(_obj not in evaluator.train_objs for _obj in obj_names):
                    continue

            # if cfg.DEBUG:
            #     for i in range(len(batch["roi_cls"])):
            #         vis_roi_im = batch["roi_img"][i].cpu().numpy().transpose(1,2,0)[:, :, ::-1]
            #         show_ims = [vis_roi_im]
            #         show_titles = ["roi_im"]
            #
            #         vis_coor2d = batch["roi_coord_2d"][i].cpu().numpy()
            #         show_ims.extend([vis_coor2d[0], vis_coor2d[1]])
            #         show_titles.extend(["coord_2d_x", "coord_2d_y"])
            #         grid_show(show_ims, show_titles, row=1, col=3)

            with autocast(enabled=amp_test):
                out_dict = model(
                    batch["roi_img"],
                    roi_classes=batch["roi_cls"],
                    roi_cams=batch["roi_cam"],
                    roi_whs=batch["roi_wh"],
                    roi_centers=batch["roi_center"],
                    resize_ratios=batch["resize_ratio"],
                    roi_coord_2d=batch.get("roi_coord_2d", None),
                    roi_extents=batch.get("roi_extent", None),
                )
            if torch.cuda.is_available():
                torch.cuda.synchronize()
            cur_compute_time = time.perf_counter() - start_compute_time
            total_compute_time += cur_compute_time
            # NOTE: added
            # TODO: add detection time here
            outputs = [{} for _ in range(len(inputs))]
            for _i in range(len(outputs)):
                outputs[_i]["time"] = cur_compute_time

            start_process_time = time.perf_counter()
            evaluator.process(inputs, outputs, out_dict)  # RANSAC/PnP
            cur_process_time = time.perf_counter() - start_process_time
            total_process_time += cur_process_time

            iters_after_start = idx + 1 - num_warmup * int(idx >= num_warmup)
            seconds_per_img = total_compute_time / iters_after_start
            if idx >= num_warmup * 2 or seconds_per_img > 5:
                total_seconds_per_img = (time.perf_counter() -
                                         start_time) / iters_after_start
                eta = datetime.timedelta(seconds=int(total_seconds_per_img *
                                                     (total - idx - 1)))
                log_every_n_seconds(
                    logging.INFO,
                    f"Inference done {idx+1}/{total}. {seconds_per_img:.4f} s / img. ETA={str(eta)}",
                    n=5)

    # Measure the time only for this worker (before the synchronization barrier)
    total_time = time.perf_counter() - start_time
    total_time_str = str(datetime.timedelta(seconds=total_time))
    # NOTE this format is parsed by grep
    logger.info(
        f"Total inference time: {total_time_str} "
        f"({total_time / (total - num_warmup):.6f} s / img per device, on {num_devices} devices)"
    )
    # pure forward time
    total_compute_time_str = str(
        datetime.timedelta(seconds=int(total_compute_time)))
    logger.info(
        "Total inference pure compute time: {} ({:.6f} s / img per device, on {} devices)"
        .format(total_compute_time_str,
                total_compute_time / (total - num_warmup), num_devices))
    # post_process time
    total_process_time_str = str(
        datetime.timedelta(seconds=int(total_process_time)))
    logger.info(
        "Total inference post process time: {} ({:.6f} s / img per device, on {} devices)"
        .format(total_process_time_str,
                total_process_time / (total - num_warmup), num_devices))

    results = evaluator.evaluate()  # results is always None
    # An evaluator may return None when not in main process.
    # Replace it by an empty dict instead to make it easier for downstream code to handle
    if results is None:
        results = {}
    return results
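
For reference, a minimal, self-contained sketch of the warmup-and-ETA timing pattern these inference loops share (generic names, no detectron2 dependencies; an illustration, not code taken from the examples above):

import datetime
import time

def timed_loop(items, work, num_warmup=5):
    """Apply `work` to each item, restarting the timers after a short warmup."""
    total = len(items)
    num_warmup = min(num_warmup, total - 1)
    start_time = time.perf_counter()
    total_compute_time = 0.0
    for idx, item in enumerate(items):
        if idx == num_warmup:
            # discard the warmup measurements
            start_time = time.perf_counter()
            total_compute_time = 0.0
        t0 = time.perf_counter()
        work(item)
        total_compute_time += time.perf_counter() - t0
        iters_after_start = idx + 1 - num_warmup * int(idx >= num_warmup)
        seconds_per_item = total_compute_time / iters_after_start
        eta = datetime.timedelta(seconds=int(
            (time.perf_counter() - start_time) / iters_after_start * (total - idx - 1)))
        print(f"{idx + 1}/{total}: {seconds_per_item:.4f} s / item, ETA={eta}")

timed_loop(list(range(20)), lambda _: time.sleep(0.01))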
Ejemplo n.º 19
0
def inference_on_dataset(model, data_loader, evaluator):
    """
    Run model on the data_loader and evaluate the metrics with evaluator.
    Also benchmark the inference speed of `model.forward` accurately.
    The model will be used in eval mode.

    Args:
        model (nn.Module): a module which accepts an object from
            `data_loader` and returns some outputs. It will be temporarily set to `eval` mode.

            If you wish to evaluate a model in `training` mode instead, you can
            wrap the given model and override its behavior of `.eval()` and `.train()`.
        data_loader: an iterable object with a length.
            The elements it generates will be the inputs to the model.
        evaluator (DatasetEvaluator): the evaluator to run. Use `None` if you only want
            to benchmark, but don't want to do any evaluation.

    Returns:
        The return value of `evaluator.evaluate()`
    """
    num_devices = (
        torch.distributed.get_world_size() if torch.distributed.is_initialized() else 1
    )
    logger = logging.getLogger(__name__)
    logger.info("Start inference on {} images".format(len(data_loader)))

    total = len(data_loader)  # inference data loader must have a fixed length
    if evaluator is None:
        # create a no-op evaluator
        evaluator = DatasetEvaluators([])
    evaluator.reset()

    # With forward aggregation the model does not return an output for the current
    # input frame immediately, so inputs are buffered until their output is ready.
    inputs_buffer = deque()

    num_warmup = min(5, total - 1)
    start_time = time.perf_counter()
    total_compute_time = 0
    with inference_context(model), torch.no_grad():
        for idx, inputs in enumerate(data_loader):
            if idx == num_warmup:
                start_time = time.perf_counter()
                total_compute_time = 0

            assert len(inputs) == 1, "Test batch size != 1 ({})".format(len(inputs))
            if not inputs[0]['is_padding']:
                # skip left frame padding (repeated frames at the beginning of each video)
                # skip right frame padding (repeated frames at the end of each video):
                #   - This way, inputs_buffer will be empty at the end of each video. With
                #   each frame in the right padding we remove one actual input from the buffer.
                inputs_buffer.append(inputs)

            start_compute_time = time.perf_counter()
            outputs = model(inputs)
            if torch.cuda.is_available():
                torch.cuda.synchronize()
            total_compute_time += time.perf_counter() - start_compute_time

            # incomplete iteration:
            #   - processing padding at the beginning/end of the current video
            #          or
            #   - more frames are needed to perform the forward aggregation
            # therefore:
            #   - discard outputs(==None)
            #   - do not take into account execution time for incomplete iterations
            if outputs is not None:
                # the current output is related with the first input in the inputs_buffer
                inputs = inputs_buffer.popleft()
                evaluator.process(inputs, outputs)

                iters_after_start = idx + 1 - num_warmup * int(
                    idx >= num_warmup)
                seconds_per_img = total_compute_time / iters_after_start
                if idx >= num_warmup * 2 or seconds_per_img > 5:
                    total_seconds_per_img = (time.perf_counter() -
                                             start_time) / iters_after_start
                    eta = datetime.timedelta(
                        seconds=int(total_seconds_per_img * (total - idx - 1)))
                    log_every_n_seconds(
                        logging.INFO,
                        "Inference done {}/{}. {:.4f} s / img. ETA={}".format(
                            idx + 1, total, seconds_per_img, str(eta)),
                        n=5,
                    )
            else:
                logger.debug("Iteration %d produced no output (padding/aggregation)", idx)

    # Measure the time only for this worker (before the synchronization barrier)
    total_time = time.perf_counter() - start_time
    total_time_str = str(datetime.timedelta(seconds=total_time))
    # NOTE this format is parsed by grep
    logger.info(
        "Total inference time: {} ({:.6f} s / img per device, on {} devices)".
        format(total_time_str, total_time / (total - num_warmup), num_devices))
    total_compute_time_str = str(
        datetime.timedelta(seconds=int(total_compute_time)))
    logger.info(
        "Total inference pure compute time: {} ({:.6f} s / img per device, on {} devices)"
        .format(total_compute_time_str,
                total_compute_time / (total - num_warmup), num_devices))

    results = evaluator.evaluate()
    logger.debug("Evaluator type: %s", type(evaluator))
    # An evaluator may return None when not in main process.
    # Replace it by an empty dict instead to make it easier for downstream code to handle
    if results is None:
        results = {}
    return results
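
A toy sketch of the buffering contract used above: a hypothetical model that needs a window of frames before it can emit the output for the oldest buffered input (the window size, names, and padding handling are assumptions for illustration):

from collections import deque

WINDOW = 3                               # assumed aggregation window
inputs_buffer, pending = deque(), deque()

def toy_model(frame):
    # emits an output only once WINDOW frames have been aggregated
    pending.append(frame)
    if len(pending) < WINDOW:
        return None
    return "output_for_" + pending.popleft()

for frame in ["f0", "f1", "f2", "f3", "f4"]:
    inputs_buffer.append(frame)            # same role as inputs_buffer above
    out = toy_model(frame)
    if out is not None:
        matched = inputs_buffer.popleft()  # the output belongs to the oldest buffered input
        print(matched, "->", out)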
Ejemplo n.º 20
0
def inference_on_dataset(model, data_loader, evaluator):
    """
    Run model on the data_loader and evaluate the metrics with evaluator.
    Also benchmark the inference speed of `model.forward` accurately.
    The model will be used in eval mode.

    Args:
        model (nn.Module): a module which accepts an object from
            `data_loader` and returns some outputs. It will be temporarily set to `eval` mode.

            If you wish to evaluate a model in `training` mode instead, you can
            wrap the given model and override its behavior of `.eval()` and `.train()`.
        data_loader: an iterable object with a length.
            The elements it generates will be the inputs to the model.
        evaluator (DatasetEvaluator): the evaluator to run. Use `None` if you only want
            to benchmark, but don't want to do any evaluation.

    Returns:
        The return value of `evaluator.evaluate()`
    """
    num_devices = get_world_size()
    logger = logging.getLogger(__name__)
    logger.info("Start inference on {} images".format(len(data_loader)))

    total = len(data_loader)  # inference data loader must have a fixed length
    if evaluator is None:
        # create a no-op evaluator
        evaluator = DatasetEvaluators([])
    evaluator.reset()

    # perf profiling
    prof_type = os.getenv('DETECTRON2_PROF', None)

    def prof_func():
        # only profile when explicitly requested via DETECTRON2_PROF
        return torch.autograd.profiler.profile(
            enabled=prof_type is not None, use_cuda=prof_type == 'cuda')

    if prof_type is not None:
        prof_key = '{}_time_total'.format(prof_type if prof_type ==
                                          'cpu' else 'cuda')
        prof_logger = logging.getLogger(
            'detectron2_prof_test_{}'.format(prof_type))
        prof_logger.setLevel(logging.INFO)
        prof_logger.addHandler(
            logging.FileHandler(
                './detectron2_prof_test_{}.log'.format(prof_type), 'w'))

    num_warmup = min(5, total - 1)
    start_time = time.perf_counter()
    total_compute_time = 0

    # perf profiling: add timer for pre, and post-processing
    timer = [0, 0]

    loading_start, loading_time = time.perf_counter(), 0
    with inference_context(model), torch.no_grad(), prof_func() as prof:
        for idx, inputs in enumerate(data_loader):
            loading_time += time.perf_counter() - loading_start
            if idx == num_warmup:
                start_time = time.perf_counter()
                total_compute_time = 0

            start_compute_time = time.perf_counter()
            outputs = model(inputs, timer=timer)
            if torch.cuda.is_available():
                torch.cuda.synchronize()
            total_compute_time += time.perf_counter() - start_compute_time
            evaluator.process(inputs, outputs)

            iters_after_start = idx + 1 - num_warmup * int(idx >= num_warmup)
            seconds_per_img = total_compute_time / iters_after_start
            if idx >= num_warmup * 2 or seconds_per_img > 5:
                total_seconds_per_img = (time.perf_counter() -
                                         start_time) / iters_after_start
                eta = datetime.timedelta(seconds=int(total_seconds_per_img *
                                                     (total - idx - 1)))
                log_every_n_seconds(
                    logging.INFO,
                    "Inference done {}/{}. {:.4f} s / img. ETA={}".format(
                        idx + 1, total, seconds_per_img, str(eta)),
                    n=5,
                )
            loading_start = time.perf_counter()

    # perf profiling logging
    if prof_type is not None:
        prof_logger.info(prof.key_averages().table(sort_by=prof_key))

    # Measure the time only for this worker (before the synchronization barrier)
    total_time = time.perf_counter() - start_time
    total_time_str = str(datetime.timedelta(seconds=total_time))
    # NOTE this format is parsed by grep
    logger.info(
        "Total inference time: {} ({:.6f} s / img per device, on {} devices)".
        format(total_time_str, total_time / (total - num_warmup), num_devices))
    total_compute_time_str = str(
        datetime.timedelta(seconds=int(total_compute_time)))
    logger.info(
        "Total inference pure compute time: {} ({:.6f} s / img per device, on {} devices)"
        .format(total_compute_time_str,
                total_compute_time / (total - num_warmup), num_devices))
    logger.info(
        "Pre-processing time: {:.2f} s, Post-processing time: {:.2f}".format(
            timer[0], timer[1]))

    results = evaluator.evaluate()
    # An evaluator may return None when not in main process.
    # Replace it by an empty dict instead to make it easier for downstream code to handle
    if results is None:
        results = {}
    results['pre_processing_time'] = timer[0]
    results['post_processing_time'] = timer[1]
    results['loading_time'] = loading_time
    results['compute_time'] = total_compute_time
    results['inference_time'] = total_time
    return results
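
The profiling hooks in this variant are driven by the DETECTRON2_PROF environment variable and by a model whose forward accepts a `timer` keyword; a hedged usage sketch (the model, loader, and evaluator are assumed to be built elsewhere):

import os

os.environ["DETECTRON2_PROF"] = "cuda"   # or "cpu"; leave unset to skip the profiler log
results = inference_on_dataset(model, data_loader, evaluator)
print("compute: {:.1f} s, loading: {:.1f} s, post-processing: {:.1f} s".format(
    results["compute_time"], results["loading_time"], results["post_processing_time"]))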
Ejemplo n.º 21
0
def inference_custom(model, data_loader, evaluator):
    """
    Run `model` on `data_loader` in eval mode and evaluate the outputs with
    `evaluator`. A stripped-down variant of `inference_on_dataset` that reports
    only the total and pure-compute inference time.
    """
    num_devices = get_world_size()
    logger = logging.getLogger(__name__)
    logger.info("Start inference on {} images".format(len(data_loader)))

    total = len(data_loader)  # inference data loader must have a fixed length
    if evaluator is None:
        # create a no-op evaluator
        evaluator = DatasetEvaluators([])
    evaluator.reset()

    num_warmup = min(5, total - 1)
    start_time = time.perf_counter()
    total_compute_time = 0

    with inference_context(model), torch.no_grad():
        for idx, inputs in enumerate(data_loader):
            if idx == num_warmup:
                start_time = time.perf_counter()
                total_compute_time = 0

            start_compute_time = time.perf_counter()
            outputs = model(inputs)
            if torch.cuda.is_available():
                torch.cuda.synchronize()
            total_compute_time += time.perf_counter() - start_compute_time
            evaluator.process(inputs, outputs)

            iters_after_start = idx + 1 - num_warmup * int(idx >= num_warmup)
            seconds_per_img = total_compute_time / iters_after_start
            if idx >= num_warmup * 2 or seconds_per_img > 5:
                total_seconds_per_img = (time.perf_counter() -
                                         start_time) / iters_after_start
                eta = datetime.timedelta(seconds=int(total_seconds_per_img *
                                                     (total - idx - 1)))
                log_every_n_seconds(
                    logging.INFO,
                    "Inference done {}/{}. {:.4f} s / img. ETA={}".format(
                        idx + 1, total, seconds_per_img, str(eta)),
                    n=5,
                )

    # Measure the time only for this worker (before the synchronization barrier)
    total_time = time.perf_counter() - start_time
    total_time_str = str(datetime.timedelta(seconds=total_time))
    # NOTE this format is parsed by grep
    logger.info(
        "Total inference time: {} ({:.6f} s / img per device, on {} devices)".
        format(total_time_str, total_time / (total - num_warmup), num_devices))
    total_compute_time_str = str(
        datetime.timedelta(seconds=int(total_compute_time)))
    logger.info(
        "Total inference pure compute time: {} ({:.6f} s / img per device, on {} devices)"
        .format(total_compute_time_str,
                total_compute_time / (total - num_warmup), num_devices))

    results = evaluator.evaluate()
    # An evaluator may return None when not in main process.
    # Replace it by an empty dict instead to make it easier for downstream code to handle
    if results is None:
        results = {}
    return results
Ejemplo n.º 22
0
def inference_on_dataset(
    model, data_loader, evaluator, num_classes, topk, num_estimate, min_score
):
    """
    Run model on the data_loader and evaluate the metrics with evaluator.
    Also benchmark the inference speed of `model.forward` accurately.
    The model will be used in eval mode.

    Args:
        model (nn.Module): a module which accepts an object from
            `data_loader` and returns some outputs. It will be temporarily set to `eval` mode.

            If you wish to evaluate a model in `training` mode instead, you can
            wrap the given model and override its behavior of `.eval()` and `.train()`.
        data_loader: an iterable object with a length.
            The elements it generates will be the inputs to the model.
        evaluator (DatasetEvaluator): the evaluator to run. Use `None` if you only want
            to benchmark, but don't want to do any evaluation.
        num_classes (int): number of foreground categories.
        topk (int or List[int]): number of top-scoring predictions to keep per
            category; an int is broadcast to all categories.
        num_estimate (int): Number of images used to estimate the initial score thresholds.
        min_score (float): minimum score used to initialize the per-category thresholds.

    Returns:
        The return value of `evaluator.evaluate()`
    """
    num_devices = get_world_size()
    logger.info("Start inference on {} images".format(len(data_loader)))
    if isinstance(topk, int):
        logger.info(f"Collecting top-{topk} images.")
        topk = [topk] * num_classes
    else:
        logger.info(f"Collecting top-k images. Counts:\n{topk}")

    total = len(data_loader)  # inference data loader must have a fixed length
    if evaluator is None:
        # create a no-op evaluator
        evaluator = DatasetEvaluators([])
    evaluator.reset()

    num_warmup = min(5, total - 1)
    start_time = time.perf_counter()
    total_compute_time = 0

    # We keep track of scores from _this_ process (process_scores) and scores from
    # all processes (scores). Every iter, each process updates process_scores and its
    # local scores with the new scores from the model.
    # Every few iterations, all processes exchange their process_scores and
    # update their own global scores.

    # Map category id to min-heap of top scores from this process.
    process_scores = defaultdict(list)
    # Map category id to min-heap of top scores from all processes.
    global_scores = defaultdict(list)
    init_thresholds = torch.full(
        (num_classes + 1,), fill_value=min_score, dtype=torch.float32
    ).to(model.device)
    init_threshold_path = Path(evaluator._output_dir) / "_thresholds_checkpoint.pth"
    if init_threshold_path.exists():
        logger.info("Loading thresholds from disk.")
        init_thresholds = torch.load(init_threshold_path).to(model.device)
    else:
        init_threshold_path.parent.mkdir(exist_ok=True, parents=True)

    # Trying to get exactly the top-k estimates can result in getting slightly fewer
    # than K estimates. This can happen due to subtle differences in the model's forward
    # pass in the first phase vs. the second phase. For example, in the first phase,
    # when we have low thresholds, D2 will use torchvision.ops.boxes.batched_nms for
    # batch NMS. In phase 2, D2 will use a slightly different, customized
    # implementation, which may occasionally result in fewer boxes.
    # To address this, we set thresholds to be a bit looser, targeting 10% more
    # predictions than requested.
    topk_loose = [int(ceil(k * 1.1)) for k in topk]
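    # Purely illustrative numbers (not from the original source): with
    # topk = [5, 12, 0], the looser targets are topk_loose = [6, 14, 0];
    # categories with a zero quota stay at zero and are never collected.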

    def get_thresholds(scores, min_thresholds):
        thresholds = []
        for i in range(num_classes):
            if topk_loose[i] == 0:
                thresholds.append(float("inf"))
            elif len(scores[i]) < topk_loose[i]:
                thresholds.append(-1)
            else:
                thresholds.append(scores[i][0])
        # Add -1 for background
        thresholds = torch.FloatTensor(thresholds + [-1]).to(model.device)
        # Clamp at minimum thresholds
        return torch.max(thresholds, init_thresholds)
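    # Assumed illustration of get_thresholds: a category with no quota gets +inf
    # (collect nothing), a category with fewer than topk_loose[i] scores gets -1
    # (accept everything), otherwise the heap minimum, i.e. the current k-th best
    # score; the result is finally clamped from below by init_thresholds.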

    def update_scores(scores, inputs, outputs):
        updated = set()
        for image, output in zip(inputs, outputs):
            if isinstance(output, dict):
                instances = output["instances"]
            else:
                instances = output
            curr_labels = instances.pred_classes.int().tolist()
            curr_scores = instances.scores.cpu().tolist()
            for label, score in zip(curr_labels, curr_scores):
                # label = label.int().item()
                # scores[label].append((image["image_id"], score.cpu().item()))
                if len(scores[label]) >= topk_loose[label]:
                    if score < scores[label][0]:
                        continue
                    else:
                        heapq.heappushpop(scores[label], score)
                else:
                    heapq.heappush(scores[label], score)
                updated.add(label)

    def gather_scores(process_scores):
        # List of scores per process
        scores_list = comm.all_gather(process_scores)
        gathered = defaultdict(list)
        labels = {x for scores in scores_list for x in scores.keys()}
        for label in labels:
            # Sort in descending order.
            sorted_generator = heapq.merge(
                *[sorted(x[label], reverse=True) for x in scores_list], reverse=True
            )
            top_k = itertools.islice(sorted_generator, topk_loose[label])
            top_k_ascending = list(reversed(list(top_k)))  # Return to ascending order
            heapq.heapify(top_k_ascending)
            gathered[label] = top_k_ascending
        return gathered
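    # Illustrative behaviour (assuming two workers and topk_loose[label] == 3):
    # worker heaps [0.2, 0.5, 0.9] and [0.3, 0.6, 0.8] merge in descending order
    # to 0.9, 0.8, 0.6, ..., only the top three survive, and they are returned
    # re-heapified in ascending order as [0.6, 0.8, 0.9].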

    with inference_context(model), torch.no_grad():
        #########
        # Phase 1: Compute initial, low score thresholds without mask branch.
        #########
        # First, get an estimate of score thresholds with the mask branch off.
        # Otherwise, in the initial few images, we will run the mask branch on a bunch
        # of useless proposals which makes everything slow.
        num_estimate = min(num_estimate, len(data_loader))
        for idx, inputs in enumerate(
            tqdm(
                data_loader,
                desc="Computing score thresholds",
                total=num_estimate,
                disable=comm.get_rank() != 0,
            )
        ):
            if idx > num_estimate:
                break
            # Gather scores from other processes periodically.
            # In early iterations, the thresholds are low, making inference slow and
            # gather relatively fast, so we gather more often.
            # Later, the thresholds are high enough that inference is fast and gathering
            # is slow, so we stop gathering.
            if (idx < 100 and idx % 10 == 0) or (idx % 500 == 0):
                global_scores = gather_scores(process_scores)

            thresholds = get_thresholds(global_scores, init_thresholds)
            if idx % 1000 == 0:  # Save thresholds for later runs
                torch.save(thresholds, init_threshold_path)

            with per_class_thresholded_inference(model, thresholds, topk):
                with _turn_off_roi_heads(model, ["mask_on", "keypoint_on"]):
                    outputs = model.inference(inputs, do_postprocess=False)
            update_scores(global_scores, inputs, outputs)
            update_scores(process_scores, inputs, outputs)

            if (idx < 100 and idx % 10 == 0) or (idx % 100 == 0):
                logger.info(
                    "Threshold range (%s, %s); # collected: (%s, %s)",
                    thresholds[:-1].min(),
                    thresholds[:-1].max(),
                    min(len(x) for x in global_scores.values()),
                    max(len(x) for x in global_scores.values()),
                )

        del global_scores
        # Necessary to avoid timeout when gathering?
        comm.synchronize()

        # Map class to scores of predictions so far.
        init_scores = gather_scores(process_scores)
        # Minimum thresholds from the estimate stage
        init_thresholds = get_thresholds(init_scores, init_thresholds)
        # Clear scores from estimates; we will start tracking them again.
        scores = defaultdict(list)

        #########
        # Phase 2: Collect top-k predictions, with mask branch enabled.
        #########
        for idx, inputs in enumerate(data_loader):
            if idx == num_warmup:
                start_time = time.perf_counter()
                total_compute_time = 0

            start_compute_time = time.perf_counter()
            thresholds = get_thresholds(scores, init_thresholds)
            with per_class_thresholded_inference(model, thresholds, topk):
                with limit_mask_branch_proposals(model, max_proposals=300):
                    outputs = model(inputs)
            update_scores(scores, inputs, outputs)

            if torch.cuda.is_available():
                torch.cuda.synchronize()
            total_compute_time += time.perf_counter() - start_compute_time
            evaluator.process(inputs, outputs)

            iters_after_start = idx + 1 - num_warmup * int(idx >= num_warmup)
            seconds_per_img = total_compute_time / iters_after_start
            if idx >= num_warmup * 2 or seconds_per_img > 5:
                total_seconds_per_img = (
                    time.perf_counter() - start_time
                ) / iters_after_start
                eta = datetime.timedelta(
                    seconds=int(total_seconds_per_img * (total - idx - 1))
                )
                log_every_n_seconds(
                    logging.INFO,
                    "Inference done {}/{}. {:.4f} s / img. ETA={}".format(
                        idx + 1, total, seconds_per_img, str(eta)
                    ),
                    n=5,
                    name=logger.name,
                )

            # Clear unnecessary predictions every so often.
            if idx < 100 or ((idx + 1) % 10) == 0:
                by_cat = defaultdict(list)
                for pred in evaluator._predictions:
                    for ann in pred["instances"]:
                        by_cat[ann["category_id"]].append(ann)
                topk_preds = []
                for c, anns in by_cat.items():
                    topk_preds.extend(
                        sorted(anns, key=lambda a: a["score"], reverse=True)[: topk[c]]
                    )
                evaluator._predictions = [{"instances": topk_preds}]

    if evaluator._output_dir:
        PathManager.mkdirs(evaluator._output_dir)
        file_path = os.path.join(
            evaluator._output_dir, f"instances_predictions_rank{comm.get_rank()}.pth"
        )
        with PathManager.open(file_path, "wb") as f:
            torch.save(evaluator._predictions, f)

    # Necessary to avoid timeout when gathering?
    comm.synchronize()
    # Limit number of detections per category across workers.
    predictions = comm.gather(evaluator._predictions, dst=0)
    if comm.is_main_process():
        predictions = list(itertools.chain(*predictions))
        by_cat = defaultdict(list)
        for pred in predictions:
            for ann in pred["instances"]:
                by_cat[ann["category_id"]].append(ann)
        logger.info(f"Max per cat: {max([len(v) for v in by_cat.values()])}")
        logger.info(f"Min per cat: {min([len(v) for v in by_cat.values()])}")
        topk_preds = []
        for c, anns in by_cat.items():
            topk_preds.extend(
                sorted(anns, key=lambda a: a["score"], reverse=True)[: topk[c]]
            )
        evaluator._predictions = [{"instances": topk_preds}]
    else:
        evaluator._predictions = []

    # Measure the time only for this worker (before the synchronization barrier)
    total_time = time.perf_counter() - start_time
    total_time_str = str(datetime.timedelta(seconds=total_time))
    # NOTE this format is parsed by grep
    logger.info(
        "Total inference time: {} ({:.6f} s / img per device, on {} devices)".format(
            total_time_str, total_time / (total - num_warmup), num_devices
        )
    )
    total_compute_time_str = str(datetime.timedelta(seconds=int(total_compute_time)))
    logger.info(
        "Total inference pure compute time: {} ({:.6f} s / img per device, on {} devices)".format(
            total_compute_time_str,
            total_compute_time / (total - num_warmup),
            num_devices,
        )
    )

    results = evaluator.evaluate()
    # An evaluator may return None when not in main process.
    # Replace it by an empty dict instead to make it easier for downstream code to handle
    if results is None:
        results = {}
    return results
Ejemplo n.º 23
0
def main(
    cfg,
    output_dir,
    runner=None,
    is_train=True,
):
    setup_after_launch(cfg, output_dir, runner)

    if is_train:
        data_loader = runner.build_detection_train_loader(cfg)
    else:
        assert len(cfg.DATASETS.TEST) > 0, cfg.DATASETS.TEST
        data_loader = runner.build_detection_test_loader(
            cfg, cfg.DATASETS.TEST[0])

    # run the benchmark for ~100 s when launched locally, otherwise for 10 min
    TOTAL_BENCHMARK_TIME = 100 if get_launch_environment() == "local" else 600
    LOGGING_METER_WINDOW_SIZE = 20
    LOGGING_METER_TIME_INTERVAL = 5
    WARMUP_ITERS = 5

    # initialize
    time_per_iter = HistoryBuffer(max_length=10000)
    total_time = 0

    start = time.time()
    for no, batch in enumerate(data_loader):
        data_time = time.time() - start
        time_per_iter.update(data_time)
        total_time += data_time

        if no == 0:
            logger.info("Show the first batch as example:\n{}".format(batch))

        # Assume batch size is constant
        batch_size = cfg.SOLVER.IMS_PER_BATCH // comm.get_world_size()
        assert len(batch) == batch_size

        median = time_per_iter.median(window_size=LOGGING_METER_WINDOW_SIZE)
        avg = time_per_iter.avg(window_size=LOGGING_METER_WINDOW_SIZE)
        log_every_n_seconds(
            logging.INFO,
            "iter: {};"
            " recent per-iter seconds: {:.4f} (avg) {:.4f} (median);"
            " recent per-image seconds: {:.4f} (avg) {:.4f} (median).".format(
                no,
                avg,
                median,
                avg / batch_size,
                median / batch_size,
            ),
            n=LOGGING_METER_TIME_INTERVAL,
        )

        # Synchronize between processes and exit once every process has been running
        # for long enough. This mimics loss.backward(); the logged time does not
        # include the synchronization itself.
        finished = comm.all_gather(total_time >= TOTAL_BENCHMARK_TIME)
        if all(x for x in finished):
            logger.info(
                "Benchmarking finished after {} seconds".format(total_time))
            break

        start = time.time()

    dataset_name = ":".join(
        cfg.DATASETS.TRAIN) if is_train else cfg.DATASETS.TEST[0]
    time_per_iter = [x[0] for x in time_per_iter.values()]
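    # Discard up to WARMUP_ITERS leading measurements as warmup, but only as many
    # as the run can spare (at most len(time_per_iter) - WARMUP_ITERS, never negative).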
    time_per_iter = time_per_iter[
        min(WARMUP_ITERS, max(len(time_per_iter) - WARMUP_ITERS, 0)):]
    results = {
        "environment": {
            "num_workers": cfg.DATALOADER.NUM_WORKERS,
            "world_size": comm.get_world_size(),
            "processes_per_machine": get_num_processes_per_machine(),
        },
        "main_processes_stats": {
            "batch_size_per_process":
            batch_size,
            "per_iter_avg":
            np.average(time_per_iter),
            "per_iter_p1":
            np.percentile(time_per_iter, 1, interpolation="nearest"),
            "per_iter_p10":
            np.percentile(time_per_iter, 10, interpolation="nearest"),
            "per_iter_p50":
            np.percentile(time_per_iter, 50, interpolation="nearest"),
            "per_iter_p90":
            np.percentile(time_per_iter, 90, interpolation="nearest"),
            "per_iter_p99":
            np.percentile(time_per_iter, 99, interpolation="nearest"),
            "per_image_avg":
            np.average(time_per_iter) / batch_size,
            "per_image_p1":
            np.percentile(time_per_iter, 1, interpolation="nearest") /
            batch_size,
            "per_image_p10":
            np.percentile(time_per_iter, 10, interpolation="nearest") /
            batch_size,
            "per_image_p50":
            np.percentile(time_per_iter, 50, interpolation="nearest") /
            batch_size,
            "per_image_p90":
            np.percentile(time_per_iter, 90, interpolation="nearest") /
            batch_size,
            "per_image_p99":
            np.percentile(time_per_iter, 99, interpolation="nearest") /
            batch_size,
        },
        "data_processes_stats": {},  # TODO: add worker stats
    }
    # Metrics follows the hierarchy of: name -> dataset -> task -> metrics -> number
    metrics = {"_name_": {dataset_name: results}}
    print_metrics_table(metrics)

    return {
        "accuracy": metrics,
        "metrics": metrics,
    }