def inference_on_dataset(model, data_loader, evaluator):
    """Evaluate ``model`` over ``data_loader`` and time its forward pass.

    This variant also records timing on the outputs: every element of the
    model's output for a batch receives a ``"time"`` entry equal to the
    batch's forward time divided by the number of outputs in that batch.

    The loop runs inside ``inference_context(model)`` and ``torch.no_grad()``.

    Args:
        model (nn.Module): callable that accepts one element produced by
            ``data_loader`` and returns an iterable of dict-like outputs.
        data_loader: sized iterable whose elements are fed to ``model``.
        evaluator (DatasetEvaluator): evaluator to feed; pass ``None`` to
            benchmark only (an empty ``DatasetEvaluators`` is substituted).

    Returns:
        The value of ``evaluator.evaluate()``, or ``{}`` when that is ``None``.
    """
    num_devices = get_world_size()
    logger = logging.getLogger(__name__)

    total = len(data_loader)  # inference data loader must have a fixed length
    logger.info("Start inference on {} images".format(total))

    if evaluator is None:
        # Substitute a no-op evaluator so the loop needs no special casing.
        evaluator = DatasetEvaluators([])
    evaluator.reset()

    num_warmup = min(5, total - 1)
    start_time = time.perf_counter()
    total_compute_time = 0
    with inference_context(model), torch.no_grad():
        for idx, inputs in enumerate(data_loader):
            if idx == num_warmup:
                # Warmup is over: restart both the wall clock and the
                # accumulated compute time.
                start_time = time.perf_counter()
                total_compute_time = 0

            tick = time.perf_counter()
            outputs = model(inputs)
            if torch.cuda.is_available():
                # Wait for queued GPU work so the measurement is complete.
                torch.cuda.synchronize()
            elapsed = time.perf_counter() - tick
            total_compute_time += elapsed

            # Share the batch time evenly between the outputs of the batch.
            for result in outputs:
                result["time"] = elapsed / len(outputs)
            evaluator.process(inputs, outputs)

            # Iterations counted since the clocks were last reset.
            timed_iters = idx + 1
            if idx >= num_warmup:
                timed_iters -= num_warmup
            seconds_per_img = total_compute_time / timed_iters
            if idx >= num_warmup * 2 or seconds_per_img > 5:
                wall_per_img = (time.perf_counter() - start_time) / timed_iters
                remaining = total - idx - 1
                eta = datetime.timedelta(seconds=int(wall_per_img * remaining))
                log_every_n_seconds(
                    logging.INFO,
                    "Inference done {}/{}. {:.4f} s / img. ETA={}".format(
                        idx + 1, total, seconds_per_img, str(eta)
                    ),
                    n=5,
                )

    # Measure the time only for this worker (before the synchronization barrier)
    total_time = time.perf_counter() - start_time
    measured_iters = total - num_warmup
    # NOTE this format is parsed by grep
    logger.info(
        "Total inference time: {} ({:.6f} s / img per device, on {} devices)".format(
            str(datetime.timedelta(seconds=total_time)),
            total_time / measured_iters,
            num_devices,
        )
    )
    logger.info(
        "Total inference pure compute time: {} ({:.6f} s / img per device, on {} devices)".format(
            str(datetime.timedelta(seconds=int(total_compute_time))),
            total_compute_time / measured_iters,
            num_devices,
        )
    )

    results = evaluator.evaluate()
    # An evaluator may return None when not in main process; hand back an
    # empty dict instead so downstream code can treat the result uniformly.
    return {} if results is None else results
def gdrn_inference_on_dataset(cfg, model, data_loader, evaluator, amp_test=False):
    """Run ``model`` on batched ROI data and feed the results to ``evaluator``.

    Forward time and evaluator post-processing time are accumulated
    separately and reported at the end. The loop runs inside
    ``inference_context(model)`` and ``torch.no_grad()``.

    Args:
        cfg: configuration object forwarded to ``batch_data``.
        model (nn.Module): called with the batched ROI tensors as keyword
            arguments; its return value is passed to ``evaluator.process``.
        data_loader: sized iterable whose elements are fed to ``batch_data``.
        evaluator (DatasetEvaluator): evaluator to feed; ``None`` is replaced
            by an empty ``DatasetEvaluators``.
            NOTE(review): the loop reads ``evaluator.train_objs`` and
            ``evaluator.obj_names`` and calls ``process`` with three
            arguments -- confirm the no-op fallback supports that.
        amp_test (bool): forwarded as ``autocast(enabled=...)`` around the
            forward pass.

    Returns:
        The value of ``evaluator.evaluate()``, or ``{}`` when that is ``None``.
    """
    num_devices = get_world_size()
    logger = logging.getLogger(__name__)

    total = len(data_loader)  # inference data loader must have a fixed length
    logger.info("Start inference on {} images".format(total))

    if evaluator is None:
        # create a no-op evaluator
        evaluator = DatasetEvaluators([])
    evaluator.reset()

    num_warmup = min(5, total - 1)
    start_time = time.perf_counter()
    total_compute_time = 0
    total_process_time = 0
    with inference_context(model), torch.no_grad():
        for idx, inputs in enumerate(data_loader):
            if idx == num_warmup:
                # Warmup is over: restart every clock.
                start_time = time.perf_counter()
                total_compute_time = 0
                total_process_time = 0

            forward_tick = time.perf_counter()
            # Collate the raw inputs into the batch the model consumes.
            batch = batch_data(cfg, inputs, phase="test")
            if evaluator.train_objs is not None:
                label_ids = batch["roi_cls"].cpu().numpy().tolist()
                names = [evaluator.obj_names[label] for label in label_ids]
                # Skip batches that contain none of the trained objects.
                if not any(name in evaluator.train_objs for name in names):
                    continue

            with autocast(enabled=amp_test):
                out_dict = model(
                    batch["roi_img"],
                    roi_classes=batch["roi_cls"],
                    roi_cams=batch["roi_cam"],
                    roi_whs=batch["roi_wh"],
                    roi_centers=batch["roi_center"],
                    resize_ratios=batch["resize_ratio"],
                    roi_coord_2d=batch.get("roi_coord_2d", None),
                    roi_extents=batch.get("roi_extent", None),
                )
            if torch.cuda.is_available():
                # Wait for queued GPU work so the measurement is complete.
                torch.cuda.synchronize()
            forward_elapsed = time.perf_counter() - forward_tick
            total_compute_time += forward_elapsed

            # NOTE: added
            # TODO: add detection time here
            # One result dict per input; each carries the whole batch time.
            outputs = [{"time": forward_elapsed} for _ in range(len(inputs))]

            process_tick = time.perf_counter()
            evaluator.process(inputs, outputs, out_dict)  # RANSAC/PnP
            total_process_time += time.perf_counter() - process_tick

            # Iterations counted since the clocks were last reset.
            timed_iters = idx + 1
            if idx >= num_warmup:
                timed_iters -= num_warmup
            seconds_per_img = total_compute_time / timed_iters
            if idx >= num_warmup * 2 or seconds_per_img > 5:
                wall_per_img = (time.perf_counter() - start_time) / timed_iters
                eta = datetime.timedelta(seconds=int(wall_per_img * (total - idx - 1)))
                log_every_n_seconds(
                    logging.INFO,
                    f"Inference done {idx+1}/{total}. {seconds_per_img:.4f} s / img. ETA={str(eta)}",
                    n=5,
                )

    # Measure the time only for this worker (before the synchronization barrier)
    total_time = time.perf_counter() - start_time
    total_time_str = str(datetime.timedelta(seconds=total_time))
    measured_iters = total - num_warmup
    # NOTE this format is parsed by grep
    logger.info(
        f"Total inference time: {total_time_str} "
        f"({total_time / (total - num_warmup):.6f} s / img per device, on {num_devices} devices)"
    )

    # Pure forward time and evaluator post-processing time share one layout.
    logger.info(
        "Total inference pure compute time: {} ({:.6f} s / img per device, on {} devices)".format(
            str(datetime.timedelta(seconds=int(total_compute_time))),
            total_compute_time / measured_iters,
            num_devices,
        )
    )
    logger.info(
        "Total inference post process time: {} ({:.6f} s / img per device, on {} devices)".format(
            str(datetime.timedelta(seconds=int(total_process_time))),
            total_process_time / measured_iters,
            num_devices,
        )
    )

    results = evaluator.evaluate()  # original author's note: results is always None
    # An evaluator may return None when not in main process; hand back an
    # empty dict instead so downstream code can treat the result uniformly.
    return {} if results is None else results
def inference_on_dataset(
    model, data_loader, evaluator, num_classes, topk, num_estimate, min_score
):
    """Run ``model`` over ``data_loader`` keeping only the top-k detections per class.

    The work happens in two passes over the data loader:

    1. *Estimate*: the first ``num_estimate`` batches are run with the mask and
       keypoint heads turned off to derive initial per-class score thresholds.
       The thresholds are periodically checkpointed next to the evaluator
       output (``_thresholds_checkpoint.pth``) and reloaded on later runs.
    2. *Collect*: the full loader is run with per-class thresholds applied,
       the evaluator is fed, and its stored predictions are pruned regularly
       to the best ``topk[c]`` per category.

    Afterwards predictions from all workers are gathered on the main process,
    pruned once more, and ``evaluator.evaluate()`` is called.

    Args:
        model (nn.Module): detector; must expose ``device`` and ``inference``.
        data_loader: sized iterable whose elements are the model inputs.
        evaluator (DatasetEvaluator): evaluator to feed.
            NOTE(review): this function reads ``evaluator._output_dir`` and
            ``evaluator._predictions`` directly, so although ``None`` is
            replaced by an empty ``DatasetEvaluators``, that fallback must
            provide those attributes -- confirm before relying on it.
        num_classes (int): number of foreground classes; sizes the threshold
            tensor (one extra slot is appended for background).
        topk (int | Sequence[int]): predictions to keep per class. An int is
            expanded to the same count for every class.
        num_estimate (int): number of batches used to estimate the initial
            score thresholds (capped at ``len(data_loader)``).
        min_score (float): lower bound used to fill the initial thresholds.

    Returns:
        The value of ``evaluator.evaluate()``, or ``{}`` when that is ``None``.
    """
    num_devices = get_world_size()
    logger.info("Start inference on {} images".format(len(data_loader)))
    if isinstance(topk, int):
        logger.info(f"Collecting top-{topk} images.")
        topk = [topk] * num_classes
    else:
        logger.info(f"Collecting top-k images. Counts:\n{topk}")

    total = len(data_loader)  # inference data loader must have a fixed length
    if evaluator is None:
        # create a no-op evaluator
        evaluator = DatasetEvaluators([])
    evaluator.reset()

    num_warmup = min(5, total - 1)
    start_time = time.perf_counter()
    total_compute_time = 0

    # We keep track of scores from _this_ process (process_scores) and scores
    # from all processes (global_scores). Every iteration each process updates
    # process_scores and its local copy of global_scores with the new model
    # scores; every few iterations all processes exchange their process_scores
    # and rebuild global_scores from them.
    # Map category id to min-heap of top scores from this process.
    process_scores = defaultdict(list)
    # Map category id to min-heap of top scores from all processes.
    global_scores = defaultdict(list)

    init_thresholds = torch.full(
        (num_classes + 1,), fill_value=min_score, dtype=torch.float32
    ).to(model.device)
    init_threshold_path = Path(evaluator._output_dir) / "_thresholds_checkpoint.pth"
    if init_threshold_path.exists():
        logger.info("Loading thresholds from disk.")
        init_thresholds = torch.load(init_threshold_path).to(model.device)
    else:
        init_threshold_path.parent.mkdir(exist_ok=True, parents=True)

    # Trying to get exactly the top-k estimates can result in getting slightly
    # fewer than K estimates, because the forward pass differs subtly between
    # the two phases (e.g. a different batched-NMS implementation is used when
    # thresholds are low). To compensate we target 10% more predictions than
    # requested.
    topk_loose = [int(ceil(k * 1.1)) for k in topk]

    def get_thresholds(scores, min_thresholds):
        """Return per-class thresholds from score heaps, clamped below by ``min_thresholds``."""
        thresholds = []
        for i in range(num_classes):
            if topk_loose[i] == 0:
                # Nothing is wanted for this class: reject everything.
                thresholds.append(float("inf"))
            elif len(scores[i]) < topk_loose[i]:
                # Heap not full yet: accept everything.
                thresholds.append(-1)
            else:
                # Heap root is the smallest score currently kept.
                thresholds.append(scores[i][0])
        # Add -1 for background
        thresholds = torch.FloatTensor(thresholds + [-1]).to(model.device)
        # Clamp at the minimum thresholds supplied by the caller.
        return torch.max(thresholds, min_thresholds)

    def update_scores(scores, inputs, outputs):
        """Push the scores in ``outputs`` into the bounded per-class min-heaps."""
        for _image, output in zip(inputs, outputs):
            instances = output["instances"] if isinstance(output, dict) else output
            curr_labels = instances.pred_classes.int().tolist()
            curr_scores = instances.scores.cpu().tolist()
            for label, score in zip(curr_labels, curr_scores):
                limit = topk_loose[label]
                if limit <= 0:
                    # No slot is reserved for this class; also avoids peeking
                    # at the root of an empty heap.
                    continue
                heap = scores[label]
                if len(heap) < limit:
                    heapq.heappush(heap, score)
                elif score >= heap[0]:
                    heapq.heappushpop(heap, score)

    def gather_scores(process_scores):
        """Merge every process's heaps into one top-k min-heap per class."""
        # List of scores per process
        scores_list = comm.all_gather(process_scores)
        gathered = defaultdict(list)
        labels = {x for scores in scores_list for x in scores.keys()}
        for label in labels:
            # Merge in descending order and keep the best entries.
            sorted_generator = heapq.merge(
                *[sorted(x[label], reverse=True) for x in scores_list], reverse=True
            )
            top_k = itertools.islice(sorted_generator, topk_loose[label])
            # Return to ascending order, then restore the heap invariant.
            top_k_ascending = list(reversed(list(top_k)))
            heapq.heapify(top_k_ascending)
            gathered[label] = top_k_ascending
        return gathered

    def group_by_category(prediction_groups):
        """Bucket every annotation of the evaluator-style predictions by category id."""
        by_cat = defaultdict(list)
        for pred in prediction_groups:
            for ann in pred["instances"]:
                by_cat[ann["category_id"]].append(ann)
        return by_cat

    def keep_topk(by_cat):
        """Return the ``topk[c]`` highest-scoring annotations of every category."""
        kept = []
        for c, anns in by_cat.items():
            kept.extend(
                sorted(anns, key=lambda a: a["score"], reverse=True)[: topk[c]]
            )
        return kept

    with inference_context(model), torch.no_grad():
        #########
        # Phase 1: Compute initial, low score thresholds without mask branch.
        #########
        # First, get an estimate of score thresholds with the mask branch off.
        # Otherwise, in the initial few images, we would run the mask branch on
        # a bunch of useless proposals, which makes everything slow.
        num_estimate = min(num_estimate, len(data_loader))
        # islice stops after exactly num_estimate batches; the previous
        # `idx > num_estimate` check processed one batch too many.
        estimate_batches = tqdm(
            itertools.islice(data_loader, num_estimate),
            desc="Computing score thresholds",
            total=num_estimate,
            disable=comm.get_rank() != 0,
        )
        for idx, inputs in enumerate(estimate_batches):
            # Gather scores from other processes periodically.
            # In early iterations the thresholds are low, making inference slow
            # and gathering relatively fast, so we gather more often. Later the
            # thresholds are high enough that inference is fast and gathering
            # is slow, so we gather rarely.
            if (idx < 100 and idx % 10 == 0) or (idx % 500 == 0):
                global_scores = gather_scores(process_scores)

            thresholds = get_thresholds(global_scores, init_thresholds)
            if idx % 1000 == 0:
                # Save thresholds for later runs
                torch.save(thresholds, init_threshold_path)

            with per_class_thresholded_inference(model, thresholds, topk):
                with _turn_off_roi_heads(model, ["mask_on", "keypoint_on"]):
                    outputs = model.inference(inputs, do_postprocess=False)
            update_scores(global_scores, inputs, outputs)
            update_scores(process_scores, inputs, outputs)

            if (idx < 100 and idx % 10 == 0) or (idx % 100 == 0):
                logger.info(
                    "Threshold range (%s, %s); # collected: (%s, %s)",
                    thresholds[:-1].min(),
                    thresholds[:-1].max(),
                    # default=0 keeps the log line working before any class
                    # has been seen.
                    min((len(x) for x in global_scores.values()), default=0),
                    max((len(x) for x in global_scores.values()), default=0),
                )

        del global_scores
        # Necessary to avoid timeout when gathering?
        comm.synchronize()
        # Map class to scores of predictions so far.
        init_scores = gather_scores(process_scores)
        # Minimum thresholds from the estimate stage
        init_thresholds = get_thresholds(init_scores, init_thresholds)
        # Clear scores from estimates; we will start tracking them again.
        scores = defaultdict(list)

        #########
        # Phase 2: Collect top-k predictions, with mask branch enabled.
        #########
        for idx, inputs in enumerate(data_loader):
            if idx == num_warmup:
                # Warmup is over: restart the clocks.
                start_time = time.perf_counter()
                total_compute_time = 0

            start_compute_time = time.perf_counter()
            thresholds = get_thresholds(scores, init_thresholds)
            with per_class_thresholded_inference(model, thresholds, topk):
                with limit_mask_branch_proposals(model, max_proposals=300):
                    outputs = model(inputs)
            update_scores(scores, inputs, outputs)
            if torch.cuda.is_available():
                torch.cuda.synchronize()
            total_compute_time += time.perf_counter() - start_compute_time
            evaluator.process(inputs, outputs)

            iters_after_start = idx + 1 - num_warmup * int(idx >= num_warmup)
            seconds_per_img = total_compute_time / iters_after_start
            if idx >= num_warmup * 2 or seconds_per_img > 5:
                total_seconds_per_img = (
                    time.perf_counter() - start_time
                ) / iters_after_start
                eta = datetime.timedelta(
                    seconds=int(total_seconds_per_img * (total - idx - 1))
                )
                log_every_n_seconds(
                    logging.INFO,
                    "Inference done {}/{}. {:.4f} s / img. ETA={}".format(
                        idx + 1, total, seconds_per_img, str(eta)
                    ),
                    n=5,
                    name=logger.name,
                )

            # Clear unnecessary predictions every so often so the evaluator's
            # buffer does not grow without bound.
            if idx < 100 or ((idx + 1) % 10) == 0:
                pruned = keep_topk(group_by_category(evaluator._predictions))
                evaluator._predictions = [{"instances": pruned}]

    if evaluator._output_dir:
        PathManager.mkdirs(evaluator._output_dir)
        file_path = os.path.join(
            evaluator._output_dir, f"instances_predictions_rank{comm.get_rank()}.pth"
        )
        with PathManager.open(file_path, "wb") as f:
            torch.save(evaluator._predictions, f)

    # Necessary to avoid timeout when gathering?
    comm.synchronize()
    # Limit number of detections per category across workers.
    predictions = comm.gather(evaluator._predictions, dst=0)
    if comm.is_main_process():
        by_cat = group_by_category(itertools.chain(*predictions))
        # default=0 avoids a ValueError when no prediction survived at all.
        logger.info(f"Max per cat: {max((len(v) for v in by_cat.values()), default=0)}")
        logger.info(f"Min per cat: {min((len(v) for v in by_cat.values()), default=0)}")
        evaluator._predictions = [{"instances": keep_topk(by_cat)}]
    else:
        evaluator._predictions = []

    # Wall time of this worker since the end of warmup. Note that it is taken
    # after the synchronize/gather above, so it includes that wait.
    total_time = time.perf_counter() - start_time
    total_time_str = str(datetime.timedelta(seconds=total_time))
    # NOTE this format is parsed by grep
    logger.info(
        "Total inference time: {} ({:.6f} s / img per device, on {} devices)".format(
            total_time_str, total_time / (total - num_warmup), num_devices
        )
    )
    total_compute_time_str = str(datetime.timedelta(seconds=int(total_compute_time)))
    logger.info(
        "Total inference pure compute time: {} ({:.6f} s / img per device, on {} devices)".format(
            total_compute_time_str,
            total_compute_time / (total - num_warmup),
            num_devices,
        )
    )

    results = evaluator.evaluate()
    # An evaluator may return None when not in main process.
    # Replace it by an empty dict instead to make it easier for downstream code to handle
    if results is None:
        results = {}
    return results
def inference_ensemble_on_dataset(models, data_loader, evaluator):
    """Evaluate an ensemble of models on ``data_loader`` and time the forward passes.

    Every model is run on each batch; for each image the per-model outputs
    are combined with ``merge_multi_predictions`` (NMS threshold 0.5) and the
    merged result is handed to the evaluator as ``{"instances": ...}``.

    All models are switched to eval mode for the duration of the call and
    returned to their previous training/eval mode afterwards, even if an
    exception is raised.

    Args:
        models (Iterable[nn.Module]): non-empty collection of models; each
            accepts one element of ``data_loader`` and returns one output per
            input image.
        data_loader: sized iterable whose elements are sequences of dicts
            providing at least ``"height"`` and ``"width"``.
        evaluator (DatasetEvaluator): evaluator to feed; pass ``None`` to
            benchmark only (an empty ``DatasetEvaluators`` is substituted).

    Returns:
        The value of ``evaluator.evaluate()``, or ``{}`` when that is ``None``.

    Raises:
        ValueError: if ``models`` is empty.
    """
    # Materialise once: the collection is iterated for every batch, which
    # would silently exhaust a generator.
    models = list(models)
    if not models:
        raise ValueError("inference_ensemble_on_dataset requires at least one model")

    num_devices = get_world_size()
    logger = logging.getLogger(__name__)
    logger.info("Start inference on {} images".format(len(data_loader)))

    total = len(data_loader)  # inference data loader must have a fixed length
    if evaluator is None:
        # create a no-op evaluator
        evaluator = DatasetEvaluators([])
    evaluator.reset()

    num_warmup = min(5, total - 1)
    start_time = time.perf_counter()
    total_compute_time = 0

    # Remember each model's mode so it can be restored afterwards.
    previous_modes = [model.training for model in models]
    for model in models:
        model.eval()
    try:
        with torch.no_grad():
            for idx, inputs in enumerate(data_loader):
                if idx == num_warmup:
                    # Warmup is over: restart the clocks.
                    start_time = time.perf_counter()
                    total_compute_time = 0

                start_compute_time = time.perf_counter()
                per_model_outputs = [model(inputs) for model in models]

                # Regroup from "per model, per image" to "per image, per model"
                # and merge the predictions of every image.
                outputs = []
                for image, image_outputs in zip(inputs, zip(*per_model_outputs)):
                    merged_instances = merge_multi_predictions(
                        list(image_outputs),
                        (image["height"], image["width"]),
                        nms_threshold=0.5,
                    )
                    outputs.append({"instances": merged_instances})

                if torch.cuda.is_available():
                    # Wait for queued GPU work so the measurement is complete.
                    torch.cuda.synchronize()
                total_compute_time += time.perf_counter() - start_compute_time
                evaluator.process(inputs, outputs)

                iters_after_start = idx + 1 - num_warmup * int(idx >= num_warmup)
                seconds_per_img = total_compute_time / iters_after_start
                if idx >= num_warmup * 2 or seconds_per_img > 5:
                    total_seconds_per_img = (
                        time.perf_counter() - start_time
                    ) / iters_after_start
                    eta = datetime.timedelta(
                        seconds=int(total_seconds_per_img * (total - idx - 1))
                    )
                    log_every_n_seconds(
                        logging.INFO,
                        "Inference done {}/{}. {:.4f} s / img. ETA={}".format(
                            idx + 1, total, seconds_per_img, str(eta)
                        ),
                        n=5,
                    )
    finally:
        # Put every model back into the mode it was in on entry.
        for model, was_training in zip(models, previous_modes):
            model.train(was_training)

    # Measure the time only for this worker (before the synchronization barrier)
    total_time = time.perf_counter() - start_time
    total_time_str = str(datetime.timedelta(seconds=total_time))
    # NOTE this format is parsed by grep
    logger.info(
        "Total inference time: {} ({:.6f} s / img per device, on {} devices)".format(
            total_time_str, total_time / (total - num_warmup), num_devices
        )
    )
    total_compute_time_str = str(datetime.timedelta(seconds=int(total_compute_time)))
    logger.info(
        "Total inference pure compute time: {} ({:.6f} s / img per device, on {} devices)".format(
            total_compute_time_str, total_compute_time / (total - num_warmup), num_devices
        )
    )

    results = evaluator.evaluate()
    # An evaluator may return None when not in main process.
    # Replace it by an empty dict instead to make it easier for downstream code to handle
    if results is None:
        results = {}
    return results