Example #1
    def finalize_use_feature(self):
        pair_df_per_gpu = self.pair_df
        all_pair_df = all_gather(pair_df_per_gpu)
        if not is_main_process():
            return
        pd.concat(all_pair_df).to_pickle(self.output_folder +
                                         "/all_pair_df.pickle")
Example #2
def test_model(cfg, model, distributed, iters_per_epoch, dllogger, args):
    if distributed:
        model = model.module
    torch.cuda.empty_cache()  # TODO check if it helps
    iou_types = ("bbox", )
    if cfg.MODEL.MASK_ON:
        iou_types = iou_types + ("segm", )
    output_folders = [None] * len(cfg.DATASETS.TEST)
    dataset_names = cfg.DATASETS.TEST
    if cfg.OUTPUT_DIR:
        for idx, dataset_name in enumerate(dataset_names):
            output_folder = os.path.join(cfg.OUTPUT_DIR, "inference",
                                         dataset_name)
            mkdir(output_folder)
            output_folders[idx] = output_folder
    data_loaders_val = make_data_loader(cfg,
                                        is_train=False,
                                        is_distributed=distributed)
    results = []
    for output_folder, dataset_name, data_loader_val in zip(
            output_folders, dataset_names, data_loaders_val):
        result = inference(
            model,
            data_loader_val,
            dataset_name=dataset_name,
            iou_types=iou_types,
            box_only=cfg.MODEL.RPN_ONLY,
            device=cfg.MODEL.DEVICE,
            expected_results=cfg.TEST.EXPECTED_RESULTS,
            expected_results_sigma_tol=cfg.TEST.EXPECTED_RESULTS_SIGMA_TOL,
            output_folder=output_folder,
            dllogger=dllogger,
            args=args)
        synchronize()
        results.append(result)
    if is_main_process():
        map_results, raw_results = results[0]
        bbox_map = map_results.results["bbox"]['AP']
        if cfg.MODEL.MASK_ON:
            segm_map = map_results.results["segm"]['AP']
        else:
            segm_map = 0.0
        dllogger.log(step=(
            cfg.SOLVER.MAX_ITER,
            cfg.SOLVER.MAX_ITER / iters_per_epoch,
        ),
                     data={
                         "BBOX_mAP": bbox_map,
                         "MASK_mAP": segm_map
                     })
        dllogger.log(step=tuple(),
                     data={
                         "BBOX_mAP": bbox_map,
                         "MASK_mAP": segm_map
                     })

        args.writer.add_scalar('BBOX_mAP', bbox_map,
                               cfg.SOLVER.MAX_ITER / iters_per_epoch + 1)
        args.writer.add_scalar('MASK_mAP', segm_map,
                               cfg.SOLVER.MAX_ITER / iters_per_epoch + 1)
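Almost every snippet on this page leans on the same small set of distributed helpers (is_main_process, get_rank, get_world_size, synchronize, all_gather), usually imported from a comm-style utility module rather than called on torch.distributed directly. A minimal sketch of what is_main_process and get_rank are assumed to do in these examples:

import torch.distributed as dist


def get_rank():
    # Assumed helper: rank of the current process, or 0 when not running distributed.
    if not dist.is_available() or not dist.is_initialized():
        return 0
    return dist.get_rank()


def is_main_process():
    # Assumed helper: by convention rank 0 owns logging, checkpointing and other I/O.
    return get_rank() == 0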
Example #3
    def __init__(self,
                 ann_file,
                 root,
                 remove_images_without_annotations,
                 transforms=None):
        # ann_f = ann_file
        ann_f = ann_file + ('0' if is_main_process() else '')
        super(COCODataset, self).__init__(root, ann_f)
        synchronize()

        # sort indices for reproducible results
        self.root_json = os.path.dirname(root)
        self.ids = sorted(self.ids)

        # filter images without detection annotations
        # if remove_images_without_annotations:
        #     self.ids = [
        #         img_id
        #         for img_id in self.ids
        #         if len(self.coco.getAnnIds(imgIds=img_id, iscrowd=None)) > 0
        #     ]

        self.json_category_id_to_contiguous_id = {
            v: i + 1
            for i, v in enumerate(self.coco.getCatIds())
        }
        self.contiguous_category_id_to_json_id = {
            v: k
            for k, v in self.json_category_id_to_contiguous_id.items()
        }
        self.id_to_img_map = {k: v for k, v in enumerate(self.ids)}
        self.transforms = transforms
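The two dictionaries built above remap COCO's sparse category ids to a contiguous 1..N range (and back), which is what most detection heads expect. A toy illustration with made-up ids:

cat_ids = [1, 3, 7]  # sparse ids, as coco.getCatIds() might return them
json_to_contiguous = {v: i + 1 for i, v in enumerate(cat_ids)}       # {1: 1, 3: 2, 7: 3}
contiguous_to_json = {v: k for k, v in json_to_contiguous.items()}   # {1: 1, 2: 3, 3: 7}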
Example #4
    def preprocess_coco(self, ann_file, split):
        if split == 5 and not self.is_train:
            return ann_file
        ann_file_new = ann_file + "_" + str(split)
        if not os.path.isfile(ann_file_new) and is_main_process():
            with open(ann_file) as f_in:
                anns = json.load(f_in)
            if split == 5:  # voc / non-voc split
                voc_inds = (0, 1, 2, 3, 4, 5, 6, 8, 14, 15, 16, 17, 18, 19, 39,
                            56, 57, 58, 60, 62)
                split_cat = [
                    a["id"] for i, a in enumerate(anns["categories"])
                    if i not in voc_inds
                ]
            else:
                if not self.is_train:
                    split_cat = [
                        a["id"] for i, a in enumerate(anns["categories"])
                        if i % 4 == (split - 1)
                    ]
                else:
                    split_cat = [
                        a["id"] for i, a in enumerate(anns["categories"])
                        if i % 4 != (split - 1)
                    ]
            anns["annotations"] = [
                v for v in anns['annotations'] if v["category_id"] in split_cat
            ]
            with open(ann_file_new, "w") as f_out:
                json.dump(anns, f_out)

        return ann_file_new
Example #5
def mlperf_test_early_exit(iteration, iters_per_epoch, tester, model,
                           distributed, min_bbox_map, min_segm_map, args):
    if iteration > 0 and iteration % iters_per_epoch == 0:
        epoch = iteration // iters_per_epoch

        dllogger.log(step="PARAMETER", data={"eval_start": True})

        bbox_map, segm_map = test_and_exchange_map(tester, model, distributed,
                                                   args)

        # necessary for correctness, this is for resuming the training
        model.train()
        dllogger.log(step=(
            iteration,
            epoch,
        ),
                     data={
                         "BBOX_mAP": bbox_map,
                         "MASK_mAP": segm_map
                     })

        if is_main_process():
            args.writer.add_scalar('BBOX_mAP', bbox_map, epoch)
            args.writer.add_scalar('MASK_mAP', segm_map, epoch)

        # terminating condition
        if bbox_map >= min_bbox_map and segm_map >= min_segm_map:
            dllogger.log(step="PARAMETER",
                         data={"target_accuracy_reached": True})
            return False  #True #let's continue the training

    return False
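For context, a helper like this is called once per iteration from the training loop; the sketch below is a hypothetical driver, with the actual optimization step elided:

def training_loop(model, optimizer, data_loader, tester, args,
                  iters_per_epoch, min_bbox_map, min_segm_map,
                  start_iter=0, max_iter=90000, distributed=False):
    for iteration in range(start_iter, max_iter):
        ...  # forward/backward/optimizer step for one batch (omitted)
        # A True return would stop training early; the variant above always
        # returns False, so here it only triggers the per-epoch evaluation.
        if mlperf_test_early_exit(iteration, iters_per_epoch, tester, model,
                                  distributed, min_bbox_map, min_segm_map, args):
            break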
Example #6
def inference(
        model,
        data_loader,
        postprocessor,
        semi_loss,
        dataset_name,
        iou_types=("bbox",),
        box_only=False,
        device="cuda",
        expected_results=(),
        expected_results_sigma_tol=4,
        output_folder=None,
        anchor_strides=None
):
    # convert to a torch.device for efficiency
    device = torch.device(device)
    num_devices = get_world_size()
    logger = logging.getLogger("maskrcnn_benchmark.inference")
    dataset = data_loader.dataset
    logger.info("Start evaluation on {} dataset({} images).".format(dataset_name, len(dataset)))
    total_timer = Timer()
    inference_timer = Timer()
    total_timer.tic()
    predictions = compute_on_dataset(model, data_loader, postprocessor,
                                     semi_loss, anchor_strides, device,
                                     inference_timer)
    # wait for all processes to complete before measuring the time
    synchronize()
    total_time = total_timer.toc()
    total_time_str = get_time_str(total_time)
    logger.info(
        "Total run time: {} ({} s / img per device, on {} devices)".format(
            total_time_str, total_time * num_devices / len(dataset), num_devices
        )
    )
    total_infer_time = get_time_str(inference_timer.total_time)
    logger.info(
        "Model inference time: {} ({} s / img per device, on {} devices)".format(
            total_infer_time,
            inference_timer.total_time * num_devices / len(dataset),
            num_devices,
        )
    )

    predictions = _accumulate_predictions_from_multiple_gpus(predictions)
    if not is_main_process():
        return

    if output_folder:
        torch.save(predictions, os.path.join(output_folder, "predictions.pth"))

    extra_args = dict(
        box_only=box_only,
        iou_types=iou_types,
        expected_results=expected_results,
        expected_results_sigma_tol=expected_results_sigma_tol,
    )

    return evaluate(dataset=dataset,
                    predictions=predictions,
                    output_folder=output_folder,
                    **extra_args)
Example #7
def run_test(cfg, model, distributed):
    if distributed:
        model = model.module
    torch.cuda.empty_cache()  # TODO check if it helps
    iou_types = ("bbox", )
    if cfg.MODEL.MASK_ON:
        iou_types = iou_types + ("segm", )
    if cfg.MODEL.KEYPOINT_ON:
        iou_types = iou_types + ("keypoints", )
    output_folders = [None] * len(cfg.DATASETS.TEST)
    dataset_names = cfg.DATASETS.TEST
    if cfg.OUTPUT_DIR:
        for idx, dataset_name in enumerate(dataset_names):
            output_folder = os.path.join(cfg.OUTPUT_DIR, "inference",
                                         dataset_name)
            mkdir(output_folder)
            output_folders[idx] = output_folder
    # evaluate object detection
    data_loaders_val = make_data_loader(cfg,
                                        is_train=False,
                                        is_distributed=distributed)
    for output_folder, dataset_name, data_loader_val in zip(
            output_folders, dataset_names, data_loaders_val):
        result_obj = inference(
            model,
            data_loader_val,
            dataset_name=dataset_name,
            iou_types=iou_types,
            box_only=False if cfg.MODEL.RETINANET_ON else cfg.MODEL.RPN_ONLY,
            device=cfg.MODEL.DEVICE,
            expected_results=cfg.TEST.EXPECTED_RESULTS,
            expected_results_sigma_tol=cfg.TEST.EXPECTED_RESULTS_SIGMA_TOL,
            output_folder=output_folder,
            eval_attributes=False,
        )
        synchronize()
    # evaluate attribute detection
    data_loaders_val = make_data_loader(cfg,
                                        is_train=False,
                                        is_distributed=distributed)
    for output_folder, dataset_name, data_loader_val in zip(
            output_folders, dataset_names, data_loaders_val):
        result_attr = inference(
            model,
            data_loader_val,
            dataset_name=dataset_name,
            iou_types=iou_types,
            box_only=False if cfg.MODEL.RETINANET_ON else cfg.MODEL.RPN_ONLY,
            device=cfg.MODEL.DEVICE,
            expected_results=cfg.TEST.EXPECTED_RESULTS,
            expected_results_sigma_tol=cfg.TEST.EXPECTED_RESULTS_SIGMA_TOL,
            output_folder=output_folder,
            eval_attributes=True,
        )
        synchronize()

    if is_main_process():
        results = {**result_obj, **result_attr}
        print(results)
Example #8
    def _get_tensorboard_writer(log_dir, window_size=20):
        if is_main_process() and log_dir is not None:
            timestamp = datetime.fromtimestamp(
                time.time()).strftime('%Y%m%d-%H:%M')
            return TensorboardXWriter(os.path.join(log_dir, timestamp),
                                      window_size)
        else:
            return None
Example #9
def _accumulate_predictions_from_multiple_gpus(predictions_per_gpu):
    all_predictions = all_gather(predictions_per_gpu)
    if not is_main_process():
        return
    # merge the list of dicts
    predictions = {}
    for p in all_predictions:
        predictions.update(p)

    return predictions
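The all_gather used here is not torch.distributed.all_gather itself; in these codebases it is a wrapper that gathers an arbitrary picklable object from every rank into a list. A minimal sketch, assuming a PyTorch recent enough to have dist.all_gather_object (the original repositories pickle into padded byte tensors instead):

import torch.distributed as dist


def all_gather(data):
    # Assumed helper: collect `data` from every rank into a list ordered by rank.
    if not dist.is_available() or not dist.is_initialized():
        return [data]
    world_size = dist.get_world_size()
    if world_size == 1:
        return [data]
    gathered = [None for _ in range(world_size)]
    dist.all_gather_object(gathered, data)  # each slot receives one rank's object
    return gathered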
Example #10
def cache_url(url, model_dir=None, progress=True):
    r"""Loads the Torch serialized object at the given URL.
    If the object is already present in `model_dir`, it's deserialized and
    returned. The filename part of the URL should follow the naming convention
    ``filename-<sha256>.ext`` where ``<sha256>`` is the first eight or more
    digits of the SHA256 hash of the contents of the file. The hash is used to
    ensure unique names and to verify the contents of the file.
    The default value of `model_dir` is ``$TORCH_HOME/models`` where
    ``$TORCH_HOME`` defaults to ``~/.torch``. The default directory can be
    overridden with the ``$TORCH_MODEL_ZOO`` environment variable.
    Args:
        url (string): URL of the object to download
        model_dir (string, optional): directory in which to save the object
        progress (bool, optional): whether or not to display a progress bar to stderr
    Example:
        >>> cached_file = maskrcnn_benchmark.utils.model_zoo.cache_url('https://s3.amazonaws.com/pytorch/models/resnet18-5c106cde.pth')
    """
    if model_dir is None:
        torch_home = os.path.expanduser(os.getenv('TORCH_HOME', '~/.torch'))
        model_dir = os.getenv('TORCH_MODEL_ZOO',
                              os.path.join(torch_home, 'models'))
    if not os.path.exists(model_dir) and is_main_process():
        os.makedirs(model_dir)
    parts = urlparse(url)
    filename = os.path.basename(parts.path)
    if filename == "model_final.pkl":
        # workaround as pre-trained Caffe2 models from Detectron have all the same filename
        # so make the full path the filename by replacing / with _
        filename = parts.path.replace("/", "_")
    cached_file = os.path.join(model_dir, filename)
    if not os.path.exists(cached_file) and is_main_process():
        sys.stderr.write('Downloading: "{}" to {}\n'.format(url, cached_file))
        hash_prefix = HASH_REGEX.search(filename)
        if hash_prefix is not None:
            hash_prefix = hash_prefix.group(1)
            # workaround: Caffe2 models don't have a hash, but follow the R-50 convention,
            # which matches the hash PyTorch uses. So we skip the hash matching
            # if the hash_prefix is less than 6 characters
            if len(hash_prefix) < 6:
                hash_prefix = None
        _download_url_to_file(url, cached_file, hash_prefix, progress=progress)
    synchronize()
    return cached_file
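A hedged usage sketch (the URL is the one from the docstring): every rank calls cache_url, only the main process downloads, and the trailing synchronize() keeps the other ranks from touching the file before it exists.

import torch

url = "https://s3.amazonaws.com/pytorch/models/resnet18-5c106cde.pth"
cached_file = cache_url(url)                              # barrier inside cache_url
state_dict = torch.load(cached_file, map_location="cpu")  # safe on every rank now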
Example #11
def main():
    parser = argparse.ArgumentParser(
        description="PyTorch Object Detection Training")
    parser.add_argument('--output_dir',
                        default='runs',
                        help='where to write models and stats')
    parser.add_argument('--resume',
                        help='filename of a model to resume training with')
    parser.add_argument("--local_rank", type=int, default=0)
    parser.add_argument("--batch_size", type=int, default=2)
    parser.add_argument("--save_every", type=int, default=500)
    parser.add_argument("--image_every", type=int, default=100)
    parser.add_argument("--print_every", type=int, default=20)
    parser.add_argument("--lr", type=float, default=0.001)
    parser.add_argument("--single-block", action='store_true', default=False)
    parser.add_argument("--fine-tune",
                        "-ft",
                        action='store_true',
                        default=False,
                        help="do not restore optimizer and scheduler state")

    args = parser.parse_args()

    num_gpus = int(
        os.environ["WORLD_SIZE"]) if "WORLD_SIZE" in os.environ else 1
    args.distributed = num_gpus > 1

    if args.distributed:
        torch.cuda.set_device(args.local_rank)
        torch.distributed.init_process_group(backend="nccl",
                                             init_method="env://")
        synchronize()

    if is_main_process():
        logging.basicConfig(level=logging.INFO)
        output_dir = experiment_dir(base_dir=args.output_dir)
        logger = setup_logger("fcos", output_dir, get_rank())
        logger.info("Using {} GPUs".format(num_gpus))
        logger.info(args)
    else:
        output_dir = None

    train(
        output_dir,
        args.local_rank,
        args.distributed,
        resume=args.resume,
        base_lr=args.lr,
        batch_size=args.batch_size,
        single_block=args.single_block,
        fine_tune=args.fine_tune,
        save_every=args.save_every,
        image_every=args.image_every,
        print_every=args.print_every,
    )
Example #12
def mlperf_test_early_exit(iteration, iters_per_epoch, tester, model, distributed, min_bbox_map, min_segm_map):
    # Note: let iters / epoch == 10k, at iter 9999 we've finished epoch 0 and need to test
    if iteration > 0 and (iteration + 1) % iters_per_epoch == 0:
        synchronize()
        epoch = iteration // iters_per_epoch + 1

        log_end(key=constants.EPOCH_STOP, metadata={"epoch_num": epoch})
        log_end(key=constants.BLOCK_STOP, metadata={"first_epoch_num": epoch})
        log_start(key=constants.EVAL_START, metadata={"epoch_num": epoch})
        # set the async evaluator's tag correctly
        set_epoch_tag(epoch)

        # Note: No longer returns anything, underlying future is in another castle
        tester(model=model, distributed=distributed)
        # necessary for correctness
        model.train()
    else:
        # Otherwise, check for finished async results
        results = check_completed_tags()

        # on master process, check each result for terminating condition
        # sentinel for run finishing
        finished = 0
        if is_main_process():
            for result_epoch, (bbox_map, segm_map) in results.items():
                logger = logging.getLogger('maskrcnn_benchmark.trainer')
                logger.info('bbox mAP: {}, segm mAP: {}'.format(bbox_map, segm_map))

                log_event(key=constants.EVAL_ACCURACY,
                          value={"BBOX": bbox_map, "SEGM": segm_map},
                          metadata={"epoch_num": result_epoch})
                log_end(key=constants.EVAL_STOP,
                        metadata={"epoch_num": result_epoch})
                # terminating condition
                if bbox_map >= min_bbox_map and segm_map >= min_segm_map:
                    logger.info("Target mAP reached, exiting...")
                    finished = 1
                    #return True

        # We now know on rank 0 whether or not we should terminate
        # Bcast this flag on multi-GPU
        if get_world_size() > 1:
            with torch.no_grad():
                finish_tensor = torch.tensor([finished], dtype=torch.int32, device=torch.device('cuda'))
                torch.distributed.broadcast(finish_tensor, 0)
    
                # If notified, end.
                if finish_tensor.item() == 1:
                    return True
        else:
            # Single GPU, don't need to create tensor to bcast, just use value directly
            if finished == 1:
                return True

    # Otherwise, default case, continue
    return False
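synchronize() appears in nearly every example as a cross-rank barrier. A minimal sketch of what it is assumed to do (a no-op outside distributed runs):

import torch.distributed as dist


def synchronize():
    # Assumed helper: block until every rank reaches this point.
    if not dist.is_available() or not dist.is_initialized():
        return
    if dist.get_world_size() == 1:
        return
    dist.barrier()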
Example #13
def run_test(cfg, model, distributed, iteration_name):
    global best_val_map, is_best_val_map, cur_val_map, writer
    if distributed:
        model = model.module
    torch.cuda.empty_cache()  # TODO check if it helps
    iou_types = ("bbox", )
    if cfg.MODEL.MASK_ON:
        iou_types = iou_types + ("segm", )
    if cfg.MODEL.KEYPOINT_ON:
        iou_types = iou_types + ("keypoints", )
    output_folders = [None] * len(cfg.DATASETS.TEST)
    dataset_names = cfg.DATASETS.TEST
    if cfg.OUTPUT_DIR:
        for idx, dataset_name in enumerate(dataset_names):
            output_folder = os.path.join(cfg.OUTPUT_DIR, "inference",
                                         dataset_name + '_' + iteration_name)
            mkdir(output_folder)
            output_folders[idx] = output_folder
    data_loaders_val = make_data_loader(cfg,
                                        is_train=False,
                                        is_distributed=distributed)
    for output_folder, dataset_name, data_loader_val in zip(
            output_folders, dataset_names, data_loaders_val):
        results = inference(
            model,
            data_loader_val,
            dataset_name=dataset_name,
            iou_types=iou_types,
            box_only=False if cfg.MODEL.RETINANET_ON else cfg.MODEL.RPN_ONLY,
            device=cfg.MODEL.DEVICE,
            expected_results=cfg.TEST.EXPECTED_RESULTS,
            expected_results_sigma_tol=cfg.TEST.EXPECTED_RESULTS_SIGMA_TOL,
            output_folder=output_folder,
        )
        if not is_main_process():
            synchronize()
            return
        if iteration_name != 'final':
            for k, v in results.results.items():
                for ki, vi in v.items():
                    if ki == 'AP':
                        cur_val_map = vi
                        if vi > best_val_map:
                            best_val_map = vi
                            is_best_val_map = True
                        else:
                            is_best_val_map = False
                    writer.add_scalar(dataset_name + '_' + k + '_' + ki, vi,
                                      int(iteration_name))
                    # print(dataset_name + '_' + k + '_' + ki, vi)
        synchronize()
Example #14
    def _get_tensorboard_writer(log_dir):
        try:
            from tensorboardX import SummaryWriter
        except ImportError:
            raise ImportError('To use tensorboard please install tensorboardX '
                              '[ pip install tensorflow tensorboardX ].')

        if is_main_process():
            timestamp = datetime.datetime.fromtimestamp(
                time.time()).strftime('%Y%m%d-%H:%M')
            tb_logger = SummaryWriter('{}-{}'.format(log_dir, timestamp))
            return tb_logger
        else:
            return None
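Because only the main process gets a writer and every other rank gets None, call sites have to guard each logging call; a hedged usage sketch (loss_value and iteration are placeholders, not names from the example above):

tb_logger = _get_tensorboard_writer(cfg.OUTPUT_DIR)
if tb_logger is not None:  # equivalent to guarding with is_main_process()
    tb_logger.add_scalar("train/loss", loss_value, iteration)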
Example #15
def inference(
        model,
        rngs,
        data_loader,
        iou_types=("bbox", ),
        box_only=False,
        device="cuda",
        expected_results=(),
        expected_results_sigma_tol=4,
        output_folder=None,
):

    # convert to a torch.device for efficiency
    device = torch.device(device)
    num_devices = (torch.distributed.get_world_size()
                   if torch.distributed.is_initialized() else 1)
    dataset = data_loader.dataset
    predictions = compute_on_dataset(model, rngs, data_loader, device)
    # wait for all processes to complete before measuring the time
    synchronize()

    predictions = _accumulate_predictions_from_multiple_gpus(predictions)
    if not is_main_process():
        return

    coco_results = {}
    if "bbox" in iou_types:
        coco_results["bbox"] = prepare_for_coco_detection(predictions, dataset)

    results = COCOResults(*iou_types)
    uuid1 = str(uuid.uuid1())

    for iou_type in iou_types:
        with tempfile.NamedTemporaryFile() as f:
            file_path = f.name
            if output_folder:
                file_path = os.path.join(output_folder,
                                         uuid1 + iou_type + ".json")
            res = evaluate_predictions_on_coco(dataset.coco,
                                               coco_results[iou_type],
                                               file_path, iou_type)
            results.update(res)

        if os.path.isfile(file_path):
            os.remove(file_path)

    return results
Example #16
def evaluator(cfg,args,model,device,iteration):
    meters_val = MetricLogger(delimiter="  ")

    data_loader_val = make_data_loader(cfg, is_train=False, is_distributed=False)[0]
        
    with torch.no_grad():
        # Should be one image for each GPU:
        print('Calculating evaluation loss.')
        for iteration_val, batch in enumerate(data_loader_val):
            #if is_main_process():
            #    print(iteration_val)
            if args.debug and iteration_val > 10:
                break
            images_val, targets_val, _ = batch
            
            skip_batch = False
            nbox = []
            for t in targets_val:
                nbox.append(len(t))
                if len(t) < 1:
                    skip_batch = True
                    break
            if skip_batch:
                continue
            try:
                print(iteration_val, nbox)
                images_val = images_val.to(device)
                targets_val = [target.to(device) for target in targets_val]
                loss_dict = model(images_val, targets_val)
                losses = sum(loss for loss in loss_dict.values())
                loss_dict_reduced = reduce_loss_dict(loss_dict)
                losses_reduced = sum(loss for loss in loss_dict_reduced.values())
                meters_val.update(loss=losses_reduced, **loss_dict_reduced)
            except Exception:
                print('Warning: ground truth error.')
    
        #synchronize()

        if is_main_process():
            print('Save evaluation loss to tensorboard.')
            for name, meter in meters_val.meters.items():
                print(name,meter.global_avg)
                args.writer.add_scalar('EvalMetrics/' + name, meter.global_avg,
                                       iteration / args.iters_per_epoch)
            print('Pass')
Example #17
def check_completed_tags():
    # Evaluator is only valid on master rank - all others will have nothing.
    # So, assemble lists of completed runs on master
    if is_main_process():
        evaluator = get_evaluator()

        # loop over all all epoch, result pairs that have finished
        all_results = {}
        for t, r in evaluator.finished_tasks().items():
            # Note: one indirection due to possibility of multiple test datasets
            # we only care about the first
            map_results = r  # [0]
            bbox_map = map_results.results["bbox"]['AP']
            segm_map = map_results.results["segm"]['AP']
            all_results.update({ t : (bbox_map, segm_map) })

        return all_results
    
    return {}
Example #18
def cache_url(url, model_dir=None, progress=True):
    r"""Loads the Torch serialized object at the given URL.
    If the object is already present in `model_dir`, it's deserialized and
    returned. The filename part of the URL should follow the naming convention
    ``filename-<sha256>.ext`` where ``<sha256>`` is the first eight or more
    digits of the SHA256 hash of the contents of the file. The hash is used to
    ensure unique names and to verify the contents of the file.
    The default value of `model_dir` is ``$TORCH_HOME/models`` where
    ``$TORCH_HOME`` defaults to ``~/.torch``. The default directory can be
    overridden with the ``$TORCH_MODEL_ZOO`` environment variable.
    Args:
        url (string): URL of the object to download
        model_dir (string, optional): directory in which to save the object
        progress (bool, optional): whether or not to display a progress bar to stderr
    Example:
        >>> cached_file = maskrcnn_benchmark.utils.model_zoo.cache_url('https://s3.amazonaws.com/pytorch/models/resnet18-5c106cde.pth')
    """
    if model_dir is None:
        torch_home = os.path.expanduser(os.getenv('TORCH_HOME', '~/.torch'))
        model_dir = os.getenv('TORCH_MODEL_ZOO', os.path.join(torch_home, 'models'))
    if not os.path.exists(model_dir):
        os.makedirs(model_dir)
    parts = urlparse(url)
    filename = os.path.basename(parts.path)
    if filename == "model_final.pkl":
        # workaround as pre-trained Caffe2 models from Detectron have all the same filename
        # so make the full path the filename by replacing / with _
        filename = parts.path.replace("/", "_")
    cached_file = os.path.join(model_dir, filename)
    if not os.path.exists(cached_file) and is_main_process():
        sys.stderr.write('Downloading: "{}" to {}\n'.format(url, cached_file))
        hash_prefix = HASH_REGEX.search(filename)
        if hash_prefix is not None:
            hash_prefix = hash_prefix.group(1)
            # workaround: Caffe2 models don't have a hash, but follow the R-50 convention,
            # which matches the hash PyTorch uses. So we skip the hash matching
            # if the hash_prefix is less than 6 characters
            if len(hash_prefix) < 6:
                hash_prefix = None
        _download_url_to_file(url, cached_file, hash_prefix, progress=progress)
    synchronize()
    return cached_file
Example #19
def inference(
        model,
        data_loader,
        dataset_name,
        device="cuda"
):
    # convert to a torch.device for efficiency
    device = torch.device(device)
    num_devices = get_world_size()
    logger = logging.getLogger("maskrcnn_benchmark.inference")
    dataset = data_loader.dataset
    logger.info("Start evaluation on {} dataset({} images).".format(dataset_name, len(dataset)))
    total_timer = Timer()
    inference_timer = Timer()
    total_timer.tic()
    predictions = compute_on_dataset(model, data_loader, device, inference_timer)
    # wait for all processes to complete before measuring the time
    synchronize()
    total_time = total_timer.toc()
    total_time_str = get_time_str(total_time)
    logger.info(
        "Total run time: {} ({} s / img per device, on {} devices)".format(
            total_time_str, total_time * num_devices / len(dataset), num_devices
        )
    )
    total_infer_time = get_time_str(inference_timer.total_time)
    logger.info(
        "Model inference time: {} ({} s / img per device, on {} devices)".format(
            total_infer_time,
            inference_timer.total_time * num_devices / len(dataset),
            num_devices,
        )
    )

    predictions = _accumulate_predictions_from_multiple_gpus(predictions)
    if not is_main_process():
        return

    results = do_voc_evaluation(dataset=dataset,
                                predictions=predictions,
                                logger=logger)
    return results
Example #20
def inference(
    model,
    data_loader,
    iou_types=("bbox",),
    box_only=False,
    device="cuda",
    expected_results=(),
    expected_results_sigma_tol=4,
    without_nms=False,
    output_folder=None,
):

    # convert to a torch.device for efficiency
    device = torch.device(device)
    num_devices = (
        torch.distributed.deprecated.get_world_size()
        if torch.distributed.deprecated.is_initialized()
        else 1
    )
    logger = logging.getLogger("maskrcnn_benchmark.eval_IR")
    dataset = data_loader.dataset
    logger.info("Start evaluation on {} images".format(len(dataset)))
    start_time = time.time()
    with_overlaps, without_overlaps = compute_on_dataset(model, data_loader, device)
    # wait for all processes to complete before measuring the time
    synchronize()
    total_time = time.time() - start_time
    total_time_str = str(datetime.timedelta(seconds=total_time))
    logger.info(
        "Total inference time: {} ({} s / img per device, on {} devices)".format(
            total_time_str, total_time * num_devices / len(dataset), num_devices
        )
    )

    with_overlaps = _accumulate_predictions_from_multiple_gpus(with_overlaps)
    without_overlaps = _accumulate_predictions_from_multiple_gpus(without_overlaps)
    if not is_main_process():
        return

    logger.info("Evaluating IoU average Recall (IR)")
    results = evaluate_iou_average_recall(with_overlaps, without_overlaps, output_folder)
    logger.info(results)
Example #21
def _accumulate_predictions_from_multiple_gpus(predictions_per_gpu):
    all_predictions = all_gather(predictions_per_gpu)
    if not is_main_process():
        return
    # merge the list of dicts
    predictions = {}
    for p in all_predictions:
        predictions.update(p)
    # convert a dict where the key is the index in a list
    image_ids = list(sorted(predictions.keys()))
    if len(image_ids) != image_ids[-1] + 1:
        logger = logging.getLogger("maskrcnn_benchmark.inference")
        logger.warning(
            "Number of images that were gathered from multiple processes is not "
            "a contiguous set. Some images might be missing from the evaluation"
        )

    # convert to a list
    predictions = [predictions[i] for i in image_ids]
    return predictions
Example #22
def _accumulate_predictions_from_multiple_gpus(predictions_per_gpu,
                                               return_dict=False,
                                               only_gather=False):
    if _dict_to_list is None:
        return
    if get_world_size() == 1:
        return predictions_per_gpu
    all_predictions = all_gather(predictions_per_gpu)
    if only_gather:
        return all_predictions
    if not is_main_process():
        return
    # merge the list of dicts
    predictions = {}
    for p in all_predictions:
        predictions.update(p)

    if return_dict:
        return predictions

    return _dict_to_list(predictions)
Example #23
def test_and_exchange_map(tester, model, distributed):
    results = tester(model=model, distributed=distributed)

    # main process only
    if is_main_process():
        # Note: one indirection due to possibility of multiple test datasets, we only care about the first
        #       tester returns (parsed results, raw results). In our case, don't care about the latter
        map_results, raw_results = results[0]
        bbox_map = map_results.results["bbox"]['AP']
        segm_map = map_results.results["segm"]['AP']
    else:
        bbox_map = 0.
        segm_map = 0.

    if distributed:
        map_tensor = torch.tensor([bbox_map, segm_map], dtype=torch.float32, device=torch.device("cuda"))
        torch.distributed.broadcast(map_tensor, 0)
        bbox_map = map_tensor[0].item()
        segm_map = map_tensor[1].item()

    return bbox_map, segm_map
Example #24
    def preprocess_lvis(self, ann_file, extract_feature):
        suffix = "_freq" if self.is_train else "_common_rare"
        ann_file_new = ann_file + suffix
        if not os.path.isfile(ann_file_new) and is_main_process():
            with open(ann_file) as f_in:
                anns = json.load(f_in)
                cids_lvis_f = [
                    a["id"] for a in anns["categories"]
                    if a["frequency"] == "f"
                ]
            if self.is_train:
                anns["annotations"] = [
                    v for v in anns["annotations"]
                    if v["category_id"] in cids_lvis_f
                ]
            else:
                anns["annotations"] = [
                    v for v in anns["annotations"]
                    if v["category_id"] not in cids_lvis_f
                ]
                # classes not exhaustively annotated should not be support examples of an image
                if not extract_feature:
                    img_ne = {
                        d["id"]: d["not_exhaustive_category_ids"]
                        for d in anns["images"]
                    }
                    anns["annotations"] = [
                        v for v in anns["annotations"]
                        if v["category_id"] not in img_ne[v["image_id"]]
                    ]

                # the file_name field in the lvis val dataset is broken
                replace_file_name = lambda x: (x["file_name"].split("_")[2])
                if "_" in anns["images"][0]["file_name"]:
                    for img in anns["images"]:
                        img.update({"file_name": replace_file_name(img)})

            with open(ann_file_new, "w") as f_out:
                json.dump(anns, f_out)
        return ann_file_new
Example #25
def main():

    run = Run.get_context()
    workspace = run.experiment.workspace

    # First thing to do is try to set up from environment
    configure_nccl_settings_from_env()

    parser = argparse.ArgumentParser(description="PyTorch Object Detection Training")
    parser.add_argument(
        "--config-file",
        default="",
        metavar="FILE",
        help="path to config file",
        type=str,
    )
    parser.add_argument("--local_rank", type=int, default=os.getenv("LOCAL_RANK", 0))
    parser.add_argument(
        "--max_steps",
        type=int,
        default=0,
        help="Override number of training steps in the config",
    )
    parser.add_argument("--dataset", type=str, required=True)
    parser.add_argument(
        "--skip-test",
        dest="skip_test",
        help="Do not test the final model",
        action="store_true",
    )
    parser.add_argument("--fp16", help="Mixed precision training", action="store_true")
    parser.add_argument("--amp", help="Mixed precision training", action="store_true")
    parser.add_argument(
        "--skip_checkpoint",
        default=False,
        action="store_true",
        help="Whether to save checkpoints",
    )
    parser.add_argument(
        "--json-summary",
        help="Out file for DLLogger",
        default="dllogger.out",
        type=str,
    )
    parser.add_argument(
        "opts",
        help="Modify config options using the command-line",
        default=None,
        nargs=argparse.REMAINDER,
    )
    args = parser.parse_args()
    args.fp16 = args.fp16 or args.amp

    num_gpus = get_global_size()
    args.distributed = num_gpus > 1
    args.local_rank = get_local_rank()

    cfg.merge_from_file(args.config_file)
    cfg.merge_from_list(args.opts)

    # Redundant option - Override config parameter with command line input
    if args.max_steps > 0:
        cfg.SOLVER.MAX_ITER = args.max_steps

    if args.skip_checkpoint:
        cfg.SAVE_CHECKPOINT = False

    cfg.freeze()

    output_dir = cfg.OUTPUT_DIR
    if output_dir:
        mkdir(output_dir)

    if is_main_process():
        dllogger.init(
            backends=[
                dllogger.JSONStreamBackend(
                    verbosity=dllogger.Verbosity.VERBOSE, filename=args.json_summary
                ),
                dllogger.StdOutBackend(
                    verbosity=dllogger.Verbosity.VERBOSE, step_format=format_step
                ),
            ]
        )
    else:
        dllogger.init(backends=[])

    dllogger.log(step="PARAMETER", data={"gpu_count": num_gpus})
    # dllogger.log(step="PARAMETER", data={"environment_info": collect_env_info()})
    dllogger.log(step="PARAMETER", data={"config_file": args.config_file})

    dllogger.log(step="PARAMETER", data={"config": cfg})

    if args.fp16:
        fp16 = True
    else:
        fp16 = False

    if args.local_rank == 0:
        dllogger.log(step="WEIGHT DOWNLOAD", data={"complete": False})
        download_weights(cfg.MODEL.WEIGHT, cfg.PATHS_CATALOG)
        dllogger.log(step="WEIGHT DOWNLOAD", data={"complete": True})

        dllogger.log(
            step="DATASET MOUNT", data={"complete": False, "dataset": args.dataset}
        )
        coco2017 = Dataset.get_by_name(workspace, args.dataset)
        cc2017mount = coco2017.mount("/data")
        cc2017mount.start()
        dllogger.log(
            step="DATASET MOUNT", data={"complete": True, "dataset": args.dataset}
        )

    if args.distributed:
        torch.cuda.set_device(args.local_rank)
        torch.distributed.init_process_group(backend="nccl", init_method="env://")
        synchronize()

    model, iters_per_epoch = train(
        cfg, args.local_rank, args.distributed, fp16, dllogger
    )
Example #26
    def finalize_extract_feature(self):
        features_df_per_gpu = self.features_df
        all_features_df = all_gather(features_df_per_gpu)
        if not is_main_process():
            return
        pd.concat(all_features_df).to_pickle(self.df_file)
Example #27
def do_train(
    model,
    data_loader,
    optimizer,
    scheduler,
    checkpointer,
    device,
    checkpoint_period,
    arguments,
    tb_logger,
    cfg,
):
    print('111111111111111111111')
    logger = logging.getLogger("maskrcnn_benchmark.trainer")
    print('2222222222222222222222')
    logger.info("Start training")
    print('4444444444444444444444')
    meters = MetricLogger(delimiter="  ")
    max_iter = len(data_loader)
    start_iter = arguments["iteration"]
    model.train()
    start_training_time = time.time()
    end = time.time()
    kkk = 0
    for iteration, (images, targets, _) in enumerate(data_loader, start_iter):
        data_time = time.time() - end
        arguments["iteration"] = iteration
        #print(kkk)
        kkk += 1
        scheduler.step()

        images = images.to(device)
        targets = [target.to(device) for target in targets]

        loss_dict = model(images, targets)

        losses = sum(loss for loss in loss_dict.values())
        # reduce losses over all GPUs for logging purposes
        loss_dict_reduced = reduce_loss_dict(loss_dict)
        losses_reduced = sum(loss for loss in loss_dict_reduced.values())
        meters.update(loss=losses_reduced, **loss_dict_reduced)

        optimizer.zero_grad()
        losses.backward()
        if cfg.SOLVER.USE_ADAM:
            torch.nn.utils.clip_grad_norm_(model.parameters(), 0.5)
        optimizer.step()

        batch_time = time.time() - end
        end = time.time()
        meters.update(time=batch_time, data=data_time)

        eta_seconds = meters.time.global_avg * (max_iter - iteration)
        eta_string = str(datetime.timedelta(seconds=int(eta_seconds)))

        if iteration % 20 == 0 or iteration == (max_iter - 1):
            #print(kkk * 10000000)
            logger.info(
                meters.delimiter.join(
                    [
                        "eta: {eta}",
                        "iter: {iter}",
                        "{meters}",
                        "lr: {lr:.6f}",
                        "max mem: {memory:.0f}",
                    ]
                ).format(
                    eta=eta_string,
                    iter=iteration,
                    meters=str(meters),
                    lr=optimizer.param_groups[0]["lr"],
                    memory=torch.cuda.max_memory_allocated() / 1024.0 / 1024.0,
                )
            )
            if is_main_process():
                for tag, value in loss_dict_reduced.items():
                    tb_logger.scalar_summary(tag, value.item(), iteration)
        if iteration % checkpoint_period == 0 and iteration > 0:
            checkpointer.save("model_{:07d}".format(iteration), **arguments)

    checkpointer.save("model_{:07d}".format(iteration), **arguments)
    total_training_time = time.time() - start_training_time
    total_time_str = str(datetime.timedelta(seconds=total_training_time))
    logger.info(
        "Total training time: {} ({:.4f} s / it)".format(
            total_time_str, total_training_time / (max_iter)
        )
    )
Example #28
def main():

    parser = argparse.ArgumentParser(
        description="PyTorch Object Detection Training")
    parser.add_argument(
        "--config-file",
        default="",
        metavar="FILE",
        help="path to config file",
        type=str,
    )
    parser.add_argument("--local_rank",
                        type=int,
                        default=os.getenv('LOCAL_RANK', 0))
    parser.add_argument("--max_steps",
                        type=int,
                        default=0,
                        help="Override number of training steps in the config")
    parser.add_argument(
        "--skip-test",
        dest="skip_test",
        help="Do not test the final model",
        action="store_true",
    )
    parser.add_argument("--fp16",
                        help="Mixed precision training",
                        action="store_true")
    parser.add_argument("--amp",
                        help="Mixed precision training",
                        action="store_true")
    parser.add_argument('--skip_checkpoint',
                        default=False,
                        action='store_true',
                        help="Whether to save checkpoints")
    parser.add_argument(
        "--json-summary",
        help="Out file for DLLogger",
        default="dllogger.out",
        type=str,
    )
    parser.add_argument(
        "opts",
        help="Modify config options using the command-line",
        default=None,
        nargs=argparse.REMAINDER,
    )
    args = parser.parse_args()
    args.fp16 = args.fp16 or args.amp

    num_gpus = int(
        os.environ["WORLD_SIZE"]) if "WORLD_SIZE" in os.environ else 1
    args.distributed = num_gpus > 1

    if args.distributed:
        torch.cuda.set_device(args.local_rank)
        torch.distributed.init_process_group(backend="nccl",
                                             init_method="env://")
        synchronize()

    cfg.merge_from_file(args.config_file)
    cfg.merge_from_list(args.opts)

    # Redundant option - Override config parameter with command line input
    if args.max_steps > 0:
        cfg.SOLVER.MAX_ITER = args.max_steps

    if args.skip_checkpoint:
        cfg.SAVE_CHECKPOINT = False

    cfg.freeze()

    output_dir = cfg.OUTPUT_DIR
    if output_dir:
        mkdir(output_dir)

    logger = setup_logger("maskrcnn_benchmark", output_dir, get_rank())
    if is_main_process():
        dllogger.init(backends=[
            dllogger.JSONStreamBackend(verbosity=dllogger.Verbosity.VERBOSE,
                                       filename=args.json_summary),
            dllogger.StdOutBackend(verbosity=dllogger.Verbosity.VERBOSE,
                                   step_format=format_step)
        ])
    else:
        dllogger.init(backends=[])

    dllogger.log(step="PARAMETER", data={"gpu_count": num_gpus})
    # dllogger.log(step="PARAMETER", data={"environment_info": collect_env_info()})
    dllogger.log(step="PARAMETER", data={"config_file": args.config_file})

    with open(args.config_file, "r") as cf:
        config_str = "\n" + cf.read()

    dllogger.log(step="PARAMETER", data={"config": cfg})

    if args.fp16:
        fp16 = True
    else:
        fp16 = False

    model, iters_per_epoch = train(cfg, args.local_rank, args.distributed,
                                   fp16, dllogger)

    if not args.skip_test:
        if not cfg.PER_EPOCH_EVAL:
            test_model(cfg, model, args.distributed, iters_per_epoch, dllogger)
Example #29
def main():
    mlperf_log.ROOT_DIR_MASKRCNN = os.path.dirname(os.path.abspath(__file__))

    parser = argparse.ArgumentParser(
        description="PyTorch Object Detection Training")
    parser.add_argument(
        "--config-file",
        default="",
        metavar="FILE",
        help="path to config file",
        type=str,
    )
    parser.add_argument("--local_rank", type=int, default=0)
    parser.add_argument(
        "opts",
        help="Modify config options using the command-line",
        default=None,
        nargs=argparse.REMAINDER,
    )

    args = parser.parse_args()

    num_gpus = int(
        os.environ["WORLD_SIZE"]) if "WORLD_SIZE" in os.environ else 1
    args.distributed = num_gpus > 1

    if is_main_process():
        # Setting logging file parameters for compliance logging
        os.environ["COMPLIANCE_FILE"] = './MASKRCNN_complVv0.5.0_' + str(
            datetime.datetime.now())
        mlperf_log.LOG_FILE = os.getenv("COMPLIANCE_FILE")
        mlperf_log._FILE_HANDLER = logging.FileHandler(mlperf_log.LOG_FILE)
        mlperf_log._FILE_HANDLER.setLevel(logging.DEBUG)
        mlperf_log.LOGGER.addHandler(mlperf_log._FILE_HANDLER)

    if args.distributed:
        torch.cuda.set_device(args.local_rank)
        torch.distributed.init_process_group(backend="nccl",
                                             init_method="env://")
        synchronize()

        print_mlperf(key=mlperf_log.RUN_START)

        # setting seeds - needs to be timed, so after RUN_START
        if is_main_process():
            master_seed = random.SystemRandom().randint(0, 2**32 - 1)
            seed_tensor = torch.tensor(master_seed,
                                       dtype=torch.float32,
                                       device=torch.device("cuda"))
        else:
            seed_tensor = torch.tensor(0,
                                       dtype=torch.float32,
                                       device=torch.device("cuda"))

        torch.distributed.broadcast(seed_tensor, 0)
        master_seed = int(seed_tensor.item())
    else:
        print_mlperf(key=mlperf_log.RUN_START)
        # random master seed, random.SystemRandom() uses /dev/urandom on Unix
        master_seed = random.SystemRandom().randint(0, 2**32 - 1)

    # actually use the random seed
    args.seed = master_seed
    # random number generator with seed set to master_seed
    random_number_generator = random.Random(master_seed)
    print_mlperf(key=mlperf_log.RUN_SET_RANDOM_SEED, value=master_seed)

    cfg.merge_from_file(args.config_file)
    cfg.merge_from_list(args.opts)
    cfg.freeze()

    output_dir = cfg.OUTPUT_DIR
    if output_dir:
        mkdir(output_dir)

    logger = setup_logger("maskrcnn_benchmark", output_dir, get_rank())
    logger.info("Using {} GPUs".format(num_gpus))
    logger.info(args)

    # generate worker seeds, one seed for every distributed worker
    worker_seeds = generate_seeds(
        random_number_generator,
        torch.distributed.get_world_size()
        if torch.distributed.is_initialized() else 1)

    # todo sharath what if CPU
    # broadcast seeds from rank=0 to other workers
    worker_seeds = broadcast_seeds(worker_seeds, device='cuda')

    # Setting worker seeds
    logger.info("Worker {}: Setting seed {}".format(
        args.local_rank, worker_seeds[args.local_rank]))
    torch.manual_seed(worker_seeds[args.local_rank])

    logger.info("Collecting env info (might take some time)")
    logger.info("\n" + collect_env_info())

    logger.info("Loaded configuration file {}".format(args.config_file))
    with open(args.config_file, "r") as cf:
        config_str = "\n" + cf.read()
        logger.info(config_str)
    logger.info("Running with config:\n{}".format(cfg))

    model = train(cfg, args.local_rank, args.distributed)

    print_mlperf(key=mlperf_log.RUN_FINAL)
Example #30
def do_coco_evaluation(
    dataset,
    predictions,
    box_only,
    output_folder,
    iou_types,
    expected_results,
    expected_results_sigma_tol,
):
    logger = logging.getLogger("maskrcnn_benchmark.inference")

    # Different path here, fast parallel method not available, fall back to effectively the old
    # path.
    if box_only:
        predictions = _accumulate_predictions_from_multiple_gpus(predictions)
        if not is_main_process():
            return

        logger.info("Evaluating bbox proposals")
        areas = {"all": "", "small": "s", "medium": "m", "large": "l"}
        res = COCOResults("box_proposal")
        for limit in [100, 1000]:
            for area, suffix in areas.items():
                stats = evaluate_box_proposals(predictions,
                                               dataset,
                                               area=area,
                                               limit=limit)
                key = "AR{}@{:d}".format(suffix, limit)
                res.results["box_proposal"][key] = stats["ar"].item()
        logger.info(res)
        check_expected_results(res, expected_results,
                               expected_results_sigma_tol)
        if output_folder:
            torch.save(res, os.path.join(output_folder, "box_proposals.pth"))
        return
    logger.info("Preparing results for COCO format")
    coco_results = {}
    if "bbox" in iou_types:
        logger.info("Preparing bbox results")
        coco_results["bbox"] = prepare_for_coco_detection(predictions, dataset)
    if "segm" in iou_types:
        logger.info("Preparing segm results")
        coco_results["segm"] = prepare_for_coco_segmentation(
            predictions, dataset)
    if 'keypoints' in iou_types:
        logger.info('Preparing keypoints results')
        coco_results['keypoints'] = prepare_for_coco_keypoint(
            predictions, dataset)

    # Gather all prepared predictions of each type from all ranks
    if "bbox" in iou_types:
        temp_bbox_list = all_gather(coco_results["bbox"])
    if "segm" in iou_types:
        temp_segm_list = all_gather(coco_results["segm"])
    if "keypoints" in iou_types:
        temp_keypoints_list = all_gather(coco_results["keypoints"])

    # Only main process will call COCO
    if not is_main_process():
        return

    # Unpack the gathered results into a single List[Entry]
    if "bbox" in iou_types:
        coco_results["bbox"] = [i for j in temp_bbox_list for i in j]
    if "segm" in iou_types:
        coco_results["segm"] = [i for j in temp_segm_list for i in j]
    if "keypoints" in iou_types:
        coco_results["keypoints"] = [i for j in temp_keypoints_list for i in j]

    results = evaluate_coco(dataset, coco_results, iou_types, output_folder)
    # Submit to async evaluator
    # get_evaluator().submit_task(get_tag(),
    #                            evaluate_coco,
    #                            dataset,
    #                            coco_results,
    #                            iou_types,
    #                            output_folder)
    # Note: None of these are possible now
    # logger.info(results)
    check_expected_results(results, expected_results,
                           expected_results_sigma_tol)
    # if output_folder:
    #     torch.save(results, os.path.join(output_folder, "coco_results.pth"))

    # Note: results is now empty, the relevant future is held in the hidden
    # AsyncEvaluator object
    return results, coco_results
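The unpacking step above simply flattens the per-rank lists returned by all_gather into one list; a small illustration with dummy entries:

temp_bbox_list = [[{"image_id": 0}], [{"image_id": 1}, {"image_id": 2}]]  # one sublist per rank
flat = [entry for per_rank in temp_bbox_list for entry in per_rank]
# -> [{'image_id': 0}, {'image_id': 1}, {'image_id': 2}]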
Example #31
def main():
    parser = argparse.ArgumentParser(description="PyTorch Object Detection Inference")
    parser.add_argument(
        "--config-file",
        default="/private/home/fmassa/github/detectron.pytorch_v2/configs/e2e_faster_rcnn_R_50_C4_1x_caffe2.yaml",
        metavar="FILE",
        help="path to config file",
    )
    parser.add_argument("--local_rank", type=int, default=0)
    parser.add_argument(
        "opts",
        help="Modify config options using the command-line",
        default=None,
        nargs=argparse.REMAINDER,
    )

    args = parser.parse_args()

    num_gpus = int(os.environ["WORLD_SIZE"]) if "WORLD_SIZE" in os.environ else 1
    distributed = num_gpus > 1

    if distributed:
        torch.cuda.set_device(args.local_rank)
        torch.distributed.init_process_group(
            backend="nccl", init_method="env://"
        )

    cfg.merge_from_file(args.config_file)
    cfg.merge_from_list(args.opts)

    # replace the output directory based on the "task" kind.
    cfg.OUTPUT_DIR = os.path.join(
        os.path.dirname(cfg.OUTPUT_DIR),
        cfg.TASK.KIND.lower(),
        cfg.NAME,
        os.path.basename(cfg.OUTPUT_DIR))
    cfg.freeze()

    output_dir = cfg.OUTPUT_DIR
    if output_dir:
        mkdir(output_dir)

    running_lock = None
    if is_main_process():
        # save the config to the output directory.
        save_config_path = os.path.join(output_dir, "config.yaml")

        with open(save_config_path, "w") as cf:
            cf.write(cfg.dump())
            print("wrote (merged) config to {0}".format(save_config_path))

        running_lock = RunningLockFile(output_dir).start()
        save_code = SaveCodeChanges([os.path.dirname(maskrcnn_benchmark.__path__[0])])
        save_code(output_dir)

        print("saved code changes (against HEAD) to {0}".format(output_dir))

    save_dir = ""
    logger = setup_logger("maskrcnn_benchmark", save_dir, get_rank())
    logger.info("Using {} GPUs".format(num_gpus))
    logger.info(cfg)

    logger.info("Collecting env info (might take some time)")
    logger.info("\n" + collect_env_info())

    model = build_detection_model(cfg)
    model.to(cfg.MODEL.DEVICE)
        
    checkpointer = DetectronCheckpointer(cfg, model, save_dir=output_dir)
    _ = checkpointer.load(cfg.MODEL.WEIGHT)

    chunk_predictions = cfg.TEST.CHUNK_PREDICTIONS

    iou_types = ("bbox",)
    if cfg.MODEL.MASK_ON:
        iou_types = iou_types + ("segm",)

    if cfg.TASK.KIND.lower() == "panoptic":
        # at some point we should run all bbox/segm, etc.
        if cfg.MODEL.PANOPTIC.COMPUTE_CC_RESULTS and not chunk_predictions:
            iou_types = ("ccpan", "pan")
        else:
            iou_types = ("pan",)


    if cfg.TEST.ORDER_ONLY:
        iou_types = ("order",)
            
    output_folders = [None] * len(cfg.DATASETS.TEST)
    dataset_names = cfg.DATASETS.TEST
    if cfg.OUTPUT_DIR:
        for idx, dataset_name in enumerate(dataset_names):
            output_folder = os.path.join(cfg.OUTPUT_DIR, "inference", dataset_name)
            mkdir(output_folder)
            output_folders[idx] = output_folder

    # allow the user to override with existing .pth files.
    # unsure how this will work in the distributed setting.
    if not distributed:
        existing_predictions = cfg.TEST.PREDICTION_PATHS
    else:
        print("distributed... ignoring existing predictions if given")
        existing_predictions = []

    chunk_predictions = cfg.TEST.CHUNK_PREDICTIONS
    if not distributed:
        # only makes sense for multiple GPUS when doing actual prediction.
        chunk_predictions = chunk_predictions and existing_predictions
        
    data_loaders_val = make_data_loader(cfg, is_train=False, is_distributed=distributed)
    dataset_index = 0

    try:
        for output_folder, dataset_name, data_loader_val in zip(output_folders, dataset_names, data_loaders_val):
            predictions = None
            if existing_predictions is not None and dataset_index < len(existing_predictions):
                # todo, check for "chunk" predictions" and delay the load.
                predictions_path = existing_predictions[dataset_index]

                if not chunk_predictions:
                    predictions = torch.load(predictions_path)
                else:
                    # this should be a list of "chunks".
                    predictions = predictions_path
                    print("using chunks: {0}".format(predictions))

            inference(
                model,
                data_loader_val,
                dataset_name=dataset_name,
                iou_types=iou_types,
                box_only=cfg.MODEL.RPN_ONLY,
                device=cfg.MODEL.DEVICE,
                expected_results=cfg.TEST.EXPECTED_RESULTS,
                expected_results_sigma_tol=cfg.TEST.EXPECTED_RESULTS_SIGMA_TOL,
                output_folder=output_folder,
                predictions=predictions,
                working_directory=cfg.TEMPORARY_DIR,
                chunk_predictions=chunk_predictions,
                compute_pre_results=cfg.MODEL.PANOPTIC.COMPUTE_PRE_RESULTS,
                panoptic_confidence_thresh=cfg.MODEL.FUSION.CONFIDENCE_THRESHOLD,
                panoptic_overlap_thresh=cfg.MODEL.FUSION.OVERLAP_THRESHOLD,
                panoptic_stuff_min_area=cfg.MODEL.FUSION.STUFF_MINIMUM_AREA)
            
            synchronize()
            dataset_index += 1
    finally:
        if running_lock:
            running_lock.end()
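Taken together, the examples follow one pattern: do per-rank work, gather it to the main process, let only the main process touch disk or the logger, then re-synchronize. A condensed, hypothetical sketch of that shape, reusing the helpers sketched earlier (gather_and_save itself is not from any of the repositories above):

import os

import torch


def gather_and_save(per_rank_result, output_folder, filename="predictions.pth"):
    results = all_gather(per_rank_result)  # every rank participates in the gather
    if not is_main_process():
        synchronize()                      # wait for rank 0 to finish writing
        return None
    merged = {}
    for partial in results:                # merge the per-rank dicts, as above
        merged.update(partial)
    torch.save(merged, os.path.join(output_folder, filename))
    synchronize()                          # release the other ranks
    return merged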