Example #1
def main(
    cfg,
    output_dir,
    runner=None,
    eval_only=False,
    # NOTE: always enable resume when running on a cluster
    resume=True,
):
    setup_after_launch(cfg, output_dir, runner)

    model = runner.build_model(cfg)
    logger.info("Model:\n{}".format(model))

    if eval_only:
        checkpointer = runner.build_checkpointer(cfg, model, save_dir=output_dir)
        # NOTE: checkpointer.resume_or_load() with resume=False skips additional
        # checkpointables (e.g. EMA states), which may not be desired, so fall back
        # to checkpointer.load() when not resuming.
        if resume and checkpointer.has_checkpoint():
            checkpoint = checkpointer.resume_or_load(cfg.MODEL.WEIGHTS, resume=resume)
        else:
            checkpoint = checkpointer.load(cfg.MODEL.WEIGHTS)
        train_iter = checkpoint.get("iteration", None)
        model.eval()
        metrics = runner.do_test(cfg, model, train_iter=train_iter)
        print_metrics_table(metrics)
        return {
            "accuracy": metrics,
            "model_configs": {},
            "metrics": metrics,
        }

    model = create_ddp_model(
        model,
        fp16_compression=cfg.MODEL.DDP_FP16_GRAD_COMPRESS,
        device_ids=None if cfg.MODEL.DEVICE == "cpu" else [comm.get_local_rank()],
        broadcast_buffers=False,
        find_unused_parameters=cfg.MODEL.DDP_FIND_UNUSED_PARAMETERS,
    )

    trained_cfgs = runner.do_train(cfg, model, resume=resume)
    metrics = runner.do_test(cfg, model)
    print_metrics_table(metrics)

    # dump config files for trained models
    trained_model_configs = dump_trained_model_configs(cfg.OUTPUT_DIR, trained_cfgs)
    return {
        # for e2e_workflow
        "accuracy": metrics,
        # for unit_workflow
        "model_configs": trained_model_configs,
        "metrics": metrics,
    }
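
The example above only defines the entry point; below is a minimal single-process
invocation sketch. The runner class, config path, and checkpoint path are illustrative
assumptions (d2go-style helpers), not part of the original example; multi-GPU runs
would normally go through a launcher such as detectron2.engine.launch rather than a
direct call.

from d2go.runner import GeneralizedRCNNRunner

# Hypothetical setup: build a runner and a default config, then run eval-only.
runner = GeneralizedRCNNRunner()
cfg = runner.get_default_cfg()                 # assumed d2go runner helper
cfg.merge_from_file("configs/my_model.yaml")   # placeholder config path
cfg.MODEL.WEIGHTS = "model_final.pth"          # placeholder checkpoint path

results = main(cfg, output_dir="./output", runner=runner, eval_only=True)
print(results["accuracy"])
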
Example #2
def main(
    cfg,
    output_dir,
    runner,
    # binary specific optional arguments
    predictor_types: typing.List[str],
    device: str = "cpu",
    compare_accuracy: bool = False,
    skip_if_fail: bool = False,
):
    if compare_accuracy:
        raise NotImplementedError(
            "compare_accuracy functionality isn't currently supported.")
        # NOTE: dict for metrics of all exported models (and original pytorch model)
        # ret["accuracy_comparison"] = accuracy_comparison

    cfg = copy.deepcopy(cfg)
    setup_after_launch(cfg, output_dir, runner)

    with temp_defrost(cfg):
        cfg.merge_from_list(["MODEL.DEVICE", device])
    model = runner.build_model(cfg, eval_only=True)

    # NOTE: the train dataset is used to avoid test-set leakage, since the data may be
    # used for quantization calibration. The test loader is used so that inference
    # behaviour is followed (no training-time augmentation is applied).
    datasets = list(cfg.DATASETS.TRAIN)
    data_loader = runner.build_detection_test_loader(cfg, datasets)

    logger.info("Running the pytorch model and print FLOPS ...")
    first_batch = next(iter(data_loader))
    input_args = (first_batch, )
    flops_utils.print_model_flops(model, input_args)

    predictor_paths: typing.Dict[str, str] = {}
    for typ in predictor_types:
        # convert_and_export_predictor might alter the model, copy before calling it
        pytorch_model = copy.deepcopy(model)
        try:
            predictor_path = convert_and_export_predictor(
                cfg,
                pytorch_model,
                typ,
                output_dir,
                data_loader,
            )
            logger.info(
                f"Predictor type {typ} has been exported to {predictor_path}")
            predictor_paths[typ] = predictor_path
        except Exception as e:
            logger.exception(f"Export {typ} predictor failed: {e}")
            if not skip_if_fail:
                raise e

    ret = {"predictor_paths": predictor_paths, "accuracy_comparison": {}}

    return ret
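
A hedged usage sketch of the export entry point above; cfg and runner are assumed to
be built as in the sketch after Example #1, and "torchscript" is only a placeholder
predictor type string (check which types your d2go build supports).

# Hypothetical call into the export main() above.
out = main(
    cfg,
    output_dir="./export_output",
    runner=runner,
    predictor_types=["torchscript"],  # placeholder predictor type
    device="cpu",
    skip_if_fail=False,
)
print(out["predictor_paths"])
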
Example #3
def main(
    cfg: CfgNode,
    output_dir: str,
    task_cls: Type[GeneralizedRCNNTask] = GeneralizedRCNNTask,
    eval_only: bool = False,
    num_machines: int = 1,
    num_processes: int = 1,
) -> TrainOutput:
    """Main function for launching a training with lightning trainer
    Args:
        cfg: D2go config node
        num_machines: Number of nodes used for distributed training
        num_processes: Number of processes on each node.
        eval_only: True if run evaluation only.
    """
    # FIXME: make comm.get_world_size() work properly.
    setup_after_launch(cfg, output_dir, _scale_world_size=False)
    auto_scale_world_size(cfg, new_world_size=num_machines * num_processes)

    task = task_cls.from_config(cfg, eval_only)
    trainer_params = get_trainer_params(cfg, num_machines, num_processes)

    last_checkpoint = os.path.join(cfg.OUTPUT_DIR, "last.ckpt")
    if PathManager.exists(last_checkpoint):
        # resume training from checkpoint
        trainer_params["resume_from_checkpoint"] = last_checkpoint
        logger.info(f"Resuming training from checkpoint: {last_checkpoint}.")

    trainer = pl.Trainer(**trainer_params)
    model_configs = None
    if eval_only:
        do_test(trainer, task)
    else:
        model_configs = do_train(cfg, trainer, task)

    return TrainOutput(
        output_dir=cfg.OUTPUT_DIR,
        tensorboard_log_dir=trainer_params["logger"].log_dir,
        accuracy=task.eval_res,
        model_configs=model_configs,
    )
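
For completeness, a hedged sketch of invoking the Lightning-based entry point above;
the output directory and process counts are placeholders, and cfg is assumed to be a
D2Go CfgNode prepared elsewhere.

# Hypothetical invocation of the Lightning main() above.
train_output = main(
    cfg,
    output_dir="./lightning_output",
    task_cls=GeneralizedRCNNTask,
    eval_only=False,
    num_machines=1,
    num_processes=2,
)
print(train_output.accuracy)
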
Example #4
def main(
    cfg,
    output_dir,
    runner,
    # binary specific optional arguments
    predictor_path,
    num_threads=None,
    caffe2_engine=None,
    caffe2_logging_print_net_summary=0,
):
    torch.backends.quantized.engine = cfg.QUANTIZATION.BACKEND
    logger.info("Run with quantized engine: {}".format(torch.backends.quantized.engine))

    setup_after_launch(cfg, output_dir, runner)
    caffe2_global_init(caffe2_logging_print_net_summary, num_threads)

    predictor = create_predictor(predictor_path)
    metrics = runner.do_test(cfg, predictor)
    print_metrics_table(metrics)
    return {
        "accuracy": metrics,
        "metrics": metrics,
    }
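
Example #4 evaluates the kind of artifact Example #2 exports; below is a hedged sketch
of that hand-off. Since both entry points are named main in the originals, they are
aliased here as export_main and eval_predictor_main; the paths and predictor type are
placeholders.

# Hypothetical glue between Example #2 (export) and Example #4 (evaluation of the
# exported predictor); cfg and runner as in the earlier sketches.
export_out = export_main(cfg, "./export_output", runner, predictor_types=["torchscript"])
eval_out = eval_predictor_main(
    cfg,
    "./eval_output",
    runner,
    predictor_path=export_out["predictor_paths"]["torchscript"],
)
print(eval_out["accuracy"])
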
Example #5
def setup(self, stage: str):
    setup_after_launch(self.cfg, self.cfg.OUTPUT_DIR, runner=None)
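
Example #5 is a bare setup hook; the sketch below shows the kind of Lightning data
module it could live on. The class name and constructor are assumptions for
illustration, not taken from the source.

import pytorch_lightning as pl

class D2GoDataModule(pl.LightningDataModule):  # hypothetical host class
    def __init__(self, cfg):
        super().__init__()
        self.cfg = cfg

    def setup(self, stage: str):
        # per-process setup, mirroring Example #5
        setup_after_launch(self.cfg, self.cfg.OUTPUT_DIR, runner=None)
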
Example #6
def main(
    cfg,
    output_dir,
    runner=None,
    is_train=True,
):
    setup_after_launch(cfg, output_dir, runner)

    if is_train:
        data_loader = runner.build_detection_train_loader(cfg)
    else:
        assert len(cfg.DATASETS.TEST) > 0, cfg.DATASETS.TEST
        data_loader = runner.build_detection_test_loader(cfg, cfg.DATASETS.TEST[0])

    # benchmark for 100 seconds when running locally, otherwise for 10 minutes
    TOTAL_BENCHMARK_TIME = 100 if get_launch_environment() == "local" else 600
    LOGGING_METER_WINDOW_SIZE = 20
    LOGGING_METER_TIME_INTERVAL = 5
    WARMUP_ITERS = 5

    # initialize
    time_per_iter = HistoryBuffer(max_length=10000)
    total_time = 0

    start = time.time()
    for no, batch in enumerate(data_loader):
        data_time = time.time() - start
        time_per_iter.update(data_time)
        total_time += data_time

        if no == 0:
            logger.info("Show the first batch as example:\n{}".format(batch))

        # Assume the batch size is constant across iterations
        batch_size = cfg.SOLVER.IMS_PER_BATCH // comm.get_world_size()
        assert len(batch) == batch_size

        median = time_per_iter.median(window_size=LOGGING_METER_WINDOW_SIZE)
        avg = time_per_iter.avg(window_size=LOGGING_METER_WINDOW_SIZE)
        log_every_n_seconds(
            logging.INFO,
            "iter: {};"
            " recent per-iter seconds: {:.4f} (avg) {:.4f} (median);"
            " recent per-image seconds: {:.4f} (avg) {:.4f} (median).".format(
                no,
                avg,
                median,
                avg / batch_size,
                median / batch_size,
            ),
            n=LOGGING_METER_TIME_INTERVAL,
        )

        # Synchronize between processes and exit once every process has been running
        # for long enough. This mimics loss.backward(); the logged time doesn't include
        # the synchronization itself.
        finished = comm.all_gather(total_time >= TOTAL_BENCHMARK_TIME)
        if all(finished):
            logger.info("Benchmarking finished after {} seconds".format(total_time))
            break

        start = time.time()

    dataset_name = ":".join(
        cfg.DATASETS.TRAIN) if is_train else cfg.DATASETS.TEST[0]
    time_per_iter = [x[0] for x in time_per_iter.values()]
    time_per_iter = time_per_iter[
        min(WARMUP_ITERS, max(len(time_per_iter) - WARMUP_ITERS, 0)):]
    results = {
        "environment": {
            "num_workers": cfg.DATALOADER.NUM_WORKERS,
            "world_size": comm.get_world_size(),
            "processes_per_machine": get_num_processes_per_machine(),
        },
        "main_processes_stats": {
            "batch_size_per_process":
            batch_size,
            "per_iter_avg":
            np.average(time_per_iter),
            "per_iter_p1":
            np.percentile(time_per_iter, 1, interpolation="nearest"),
            "per_iter_p10":
            np.percentile(time_per_iter, 10, interpolation="nearest"),
            "per_iter_p50":
            np.percentile(time_per_iter, 50, interpolation="nearest"),
            "per_iter_p90":
            np.percentile(time_per_iter, 90, interpolation="nearest"),
            "per_iter_p99":
            np.percentile(time_per_iter, 99, interpolation="nearest"),
            "per_image_avg":
            np.average(time_per_iter) / batch_size,
            "per_image_p1":
            np.percentile(time_per_iter, 1, interpolation="nearest") /
            batch_size,
            "per_image_p10":
            np.percentile(time_per_iter, 10, interpolation="nearest") /
            batch_size,
            "per_image_p50":
            np.percentile(time_per_iter, 50, interpolation="nearest") /
            batch_size,
            "per_image_p90":
            np.percentile(time_per_iter, 90, interpolation="nearest") /
            batch_size,
            "per_image_p99":
            np.percentile(time_per_iter, 99, interpolation="nearest") /
            batch_size,
        },
        "data_processes_stats": {},  # TODO: add worker stats
    }
    # Metrics follow the hierarchy: name -> dataset -> task -> metric -> number
    metrics = {"_name_": {dataset_name: results}}
    print_metrics_table(metrics)

    return {
        "accuracy": metrics,
        "metrics": metrics,
    }
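
For reference, a sketch of the metric nesting described by the comment near the end of
Example #6 (name -> dataset -> task -> metric -> number); the dataset name and numbers
below are made-up placeholders, not real results.

# Illustrative shape of the returned metrics dict only; values are placeholders.
example_metrics = {
    "_name_": {                           # name
        "coco_2017_train": {              # dataset (placeholder)
            "main_processes_stats": {     # task-level block from `results`
                "per_iter_avg": 0.123,    # metric -> number (placeholder)
                "per_image_avg": 0.0154,
            },
        },
    },
}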