Code Example #1
def setup_after_launch(cfg, output_dir, runner):
    """
    Set things up after entering DDP, including
        - creating working directory
        - setting up logger
        - logging environment
        - initializing runner
    """
    create_dir_on_global_main_process(output_dir)
    comm.synchronize()
    setup_loggers(output_dir)
    cfg.freeze()
    if cfg.OUTPUT_DIR != output_dir:
        with temp_defrost(cfg):
            logger.warning(
                "Override cfg.OUTPUT_DIR ({}) to be the same as output_dir {}".format(
                    cfg.OUTPUT_DIR, output_dir
                )
            )
            cfg.OUTPUT_DIR = output_dir
    logger.info("Initializing runner ...")
    runner = initialize_runner(runner, cfg)

    log_info(cfg, runner)
    dump_cfg(cfg, os.path.join(output_dir, "config.yaml"))

    auto_scale_world_size(cfg, new_world_size=comm.get_world_size())
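
This first variant temporarily unfreezes the config via `temp_defrost` when overriding `cfg.OUTPUT_DIR`. A minimal sketch of what such a context manager plausibly looks like, inferred from its use above rather than copied from d2go:

from contextlib import contextmanager

@contextmanager
def temp_defrost(cfg):
    # Temporarily unfreeze a yacs CfgNode, restoring its frozen state on exit.
    was_frozen = cfg.is_frozen()
    cfg.defrost()
    try:
        yield cfg
    finally:
        if was_frozen:
            cfg.freeze()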
Code Example #2
File: setup.py Project: iooops/d2go
def setup_after_launch(cfg: CN, output_dir: str, runner):
    _setup_after_launch(cfg, output_dir, runner)
    logger.info("Initializing runner ...")
    runner = initialize_runner(runner, cfg)

    log_info(cfg, runner)

    auto_scale_world_size(cfg, new_world_size=comm.get_world_size())
Code Example #3
File: test_config.py Project: facebookresearch/d2go
def test_not_scale_for_zero_world_size(self):
    """
    when reference world size is 0, no scaling should happen
    """
    cfg = GeneralizedRCNNRunner().get_default_cfg()
    self.assertEqual(cfg.SOLVER.REFERENCE_WORLD_SIZE, 8)
    cfg.SOLVER.REFERENCE_WORLD_SIZE = 0
    batch_size_x8 = cfg.SOLVER.IMS_PER_BATCH
    auto_scale_world_size(cfg, new_world_size=1)
    self.assertEqual(cfg.SOLVER.REFERENCE_WORLD_SIZE, 0)
    self.assertEqual(cfg.SOLVER.IMS_PER_BATCH, batch_size_x8)
Code Example #4
File: test_config.py Project: facebookresearch/d2go
def test_8gpu_to_1gpu(self):
    """
    when scaling an 8-gpu config to a 1-gpu one, the batch size will be reduced by 8x
    """
    cfg = GeneralizedRCNNRunner().get_default_cfg()
    self.assertEqual(cfg.SOLVER.REFERENCE_WORLD_SIZE, 8)
    batch_size_x8 = cfg.SOLVER.IMS_PER_BATCH
    assert batch_size_x8 % 8 == 0, "default batch size is not multiple of 8"
    auto_scale_world_size(cfg, new_world_size=1)
    self.assertEqual(cfg.SOLVER.REFERENCE_WORLD_SIZE, 1)
    self.assertEqual(cfg.SOLVER.IMS_PER_BATCH * 8, batch_size_x8)
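
Together, these two tests pin down the contract of `auto_scale_world_size`: a `REFERENCE_WORLD_SIZE` of 0 disables scaling, while a non-zero reference scales the batch size linearly and records the new world size. A minimal sketch of that behavior, assuming a yacs `CfgNode` (the real d2go helper also rescales related solver fields such as the learning rate and iteration counts):

def sketch_auto_scale_world_size(cfg, new_world_size: int) -> None:
    # Hypothetical re-implementation for illustration only; covers just the
    # fields exercised by the tests above.
    old_world_size = cfg.SOLVER.REFERENCE_WORLD_SIZE
    if old_world_size == 0 or old_world_size == new_world_size:
        return  # REFERENCE_WORLD_SIZE == 0 disables scaling
    was_frozen = cfg.is_frozen()
    cfg.defrost()
    ratio = new_world_size / old_world_size
    cfg.SOLVER.IMS_PER_BATCH = int(round(cfg.SOLVER.IMS_PER_BATCH * ratio))
    cfg.SOLVER.REFERENCE_WORLD_SIZE = new_world_size
    if was_frozen:
        cfg.freeze()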
Code Example #5
File: setup.py Project: facebookresearch/d2go
def setup_after_launch(
    cfg: CfgNode,
    output_dir: str,
    runner: Optional[BaseRunner] = None,
    # HACK: temporarily allow lightning_train_net to bypass this.
    _scale_world_size: bool = True,
):
    """
    Binary-level setup after entering DDP, including
        - creating working directory
        - setting up logger
        - logging environment
        - printing and dumping config
        - (optional) initializing runner
    """

    create_dir_on_global_main_process(output_dir)
    setup_loggers(output_dir)
    log_system_info()

    cfg.freeze()
    maybe_override_output_dir(cfg, output_dir)
    logger.info("Running with full config:\n{}".format(cfg))
    dump_cfg(cfg, os.path.join(output_dir, "config.yaml"))

    if runner:
        logger.info("Initializing runner ...")
        runner = initialize_runner(runner, cfg)
        logger.info("Running with runner: {}".format(runner))

    # save the diff config
    if runner:
        default_cfg = runner.get_default_cfg()
        dump_cfg(
            get_diff_cfg(default_cfg, cfg),
            os.path.join(output_dir, "diff_config.yaml"),
        )
    else:
        # TODO: support getting default_cfg without runner.
        pass

    # scale the config after dumping so that dumped config files keep original world size
    if _scale_world_size:
        auto_scale_world_size(cfg, new_world_size=comm.get_world_size())
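
This version factors the `OUTPUT_DIR` override from Code Example #1 into a `maybe_override_output_dir` helper. A plausible sketch, modeled on that earlier inline block and reusing `temp_defrost` (an assumption, not the verbatim d2go code):

def maybe_override_output_dir(cfg, output_dir):
    # Rewrite cfg.OUTPUT_DIR to the launch-time output_dir when they differ,
    # defrosting the config temporarily as in Code Example #1.
    if output_dir is not None and cfg.OUTPUT_DIR != output_dir:
        with temp_defrost(cfg):
            logger.warning(
                "Override cfg.OUTPUT_DIR ({}) to be the same as output_dir {}".format(
                    cfg.OUTPUT_DIR, output_dir
                )
            )
            cfg.OUTPUT_DIR = output_dir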
Code Example #6
def main(
    cfg: CfgNode,
    output_dir: str,
    task_cls: Type[GeneralizedRCNNTask] = GeneralizedRCNNTask,
    eval_only: bool = False,
    num_machines: int = 1,
    num_processes: int = 1,
) -> TrainOutput:
    """Main function for launching a training with lightning trainer
    Args:
        cfg: D2go config node
        num_machines: Number of nodes used for distributed training
        num_processes: Number of processes on each node.
        eval_only: True if run evaluation only.
    """
    # FIXME: make comm.get_world_size() work properly.
    setup_after_launch(cfg, output_dir, _scale_world_size=False)
    auto_scale_world_size(cfg, new_world_size=num_machines * num_processes)

    task = task_cls.from_config(cfg, eval_only)
    trainer_params = get_trainer_params(cfg, num_machines, num_processes)

    last_checkpoint = os.path.join(cfg.OUTPUT_DIR, "last.ckpt")
    if PathManager.exists(last_checkpoint):
        # resume training from checkpoint
        trainer_params["resume_from_checkpoint"] = last_checkpoint
        logger.info(f"Resuming training from checkpoint: {last_checkpoint}.")

    trainer = pl.Trainer(**trainer_params)
    model_configs = None
    if eval_only:
        do_test(trainer, task)
    else:
        model_configs = do_train(cfg, trainer, task)

    return TrainOutput(
        output_dir=cfg.OUTPUT_DIR,
        tensorboard_log_dir=trainer_params["logger"].log_dir,
        accuracy=task.eval_res,
        model_configs=model_configs,
    )
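
Here the Trainer arguments are factored into `get_trainer_params`. Code Example #7 below builds the same dictionary inline, so a plausible sketch of the helper looks like the following (an assumption; the real helper may set additional keys or handle the GPU count differently):

def get_trainer_params(cfg, num_machines: int, num_processes: int) -> dict:
    # Mirrors the inline dictionary in Code Example #7, minus the "gpus"
    # entry, which this signature does not receive.
    return {
        # training loop is bounded by max_steps; a huge max_epochs ensures
        # max_steps is reached first
        "max_epochs": 10**8,
        "max_steps": cfg.SOLVER.MAX_ITER,
        "val_check_interval": cfg.TEST.EVAL_PERIOD
        if cfg.TEST.EVAL_PERIOD > 0
        else cfg.SOLVER.MAX_ITER,
        "num_nodes": num_machines,
        "num_processes": num_processes,
        "accelerator": get_accelerator(cfg.MODEL.DEVICE),
        "callbacks": _get_trainer_callbacks(cfg),
        "logger": TensorBoardLogger(save_dir=cfg.OUTPUT_DIR),
        "num_sanity_val_steps": 0,
        "progress_bar_refresh_rate": 10,
    }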
Code Example #7
File: lightning_train_net.py Project: iooops/d2go
def main(
    cfg: CfgNode,
    output_dir: Optional[str] = None,
    task_cls: Type[GeneralizedRCNNTask] = GeneralizedRCNNTask,
    eval_only: bool = False,
    num_machines: int = 1,
    num_gpus: int = 0,
    num_processes: int = 1,
) -> TrainOutput:
    """Main function for launching a training with lightning trainer
    Args:
        cfg: D2go config node
        num_machines: Number of nodes used for distributed training
        num_gpus: Number of GPUs to train on each node
        num_processes: Number of processes on each node.
            NOTE: Automatically set to the number of GPUs when using DDP.
            Set a value greater than 1 to mimic distributed training on CPUs.
        eval_only: True if run evaluation only.
    """
    assert (
        num_processes == 1 or num_gpus == 0
    ), "Only set num_processes > 1 when training on CPUs"
    auto_scale_world_size(cfg, num_machines * num_gpus)
    maybe_override_output_dir(cfg, output_dir)

    task = task_cls.from_config(cfg, eval_only)
    tb_logger = TensorBoardLogger(save_dir=cfg.OUTPUT_DIR)

    trainer_params = {
        # training loop is bounded by max_steps; use a large max_epochs to
        # make sure max_steps is met first
        "max_epochs": 10**8,
        "max_steps": cfg.SOLVER.MAX_ITER,
        "val_check_interval": cfg.TEST.EVAL_PERIOD
        if cfg.TEST.EVAL_PERIOD > 0
        else cfg.SOLVER.MAX_ITER,
        "num_nodes": num_machines,
        "gpus": num_gpus,
        "num_processes": num_processes,
        "accelerator": get_accelerator(cfg.MODEL.DEVICE),
        "callbacks": _get_trainer_callbacks(cfg),
        "logger": tb_logger,
        "num_sanity_val_steps": 0,
        "progress_bar_refresh_rate": 10,
    }

    last_checkpoint = os.path.join(cfg.OUTPUT_DIR, "last.ckpt")
    if PathManager.exists(last_checkpoint):
        # resume training from checkpoint
        trainer_params["resume_from_checkpoint"] = last_checkpoint
        logger.info(f"Resuming training from checkpoint: {last_checkpoint}.")

    trainer = pl.Trainer(**trainer_params)
    model_configs = None
    if eval_only:
        do_test(trainer, task)
    else:
        model_configs = do_train(cfg, trainer, task)

    return TrainOutput(
        output_dir=cfg.OUTPUT_DIR,
        tensorboard_log_dir=tb_logger.log_dir,
        accuracy=task.eval_res,
        model_configs=model_configs,
    )
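
A hypothetical invocation of this entry point, assuming the runner used in the tests above supplies the default config; the config path and output directory below are placeholders:

cfg = GeneralizedRCNNRunner().get_default_cfg()
cfg.merge_from_file("configs/my_experiment.yaml")  # hypothetical config path
train_output = main(
    cfg,
    output_dir="./output",  # placeholder working directory
    num_machines=1,
    num_gpus=1,
)
print(train_output.accuracy)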