Example #1
def launch_distributed(
    cfg: AttrDict,
    node_id: int,
    engine_name: str,
    hook_generator: Callable[[Any], List[ClassyHook]],
):
    """
    Launch the distributed training across GPUs according to the cfg.

    Args:
        cfg  -- VISSL yaml configuration
        node_id -- node_id for this node
        engine_name -- what engine to run: train or extract_features
        hook_generator -- Callback to generate all the ClassyVision hooks for this engine
    """
    node_id = get_node_id(node_id)
    dist_run_id = get_dist_run_id(cfg, cfg.DISTRIBUTED.NUM_NODES)
    world_size = cfg.DISTRIBUTED.NUM_NODES * cfg.DISTRIBUTED.NUM_PROC_PER_NODE
    set_env_vars(local_rank=0, node_id=node_id, cfg=cfg)
    copy_to_local(cfg)

    # given the checkpoint folder, we check that there's not already a final checkpoint
    checkpoint_folder = get_checkpoint_folder(cfg)
    if is_training_finished(cfg, checkpoint_folder=checkpoint_folder):
        logging.info(
            f"Training already succeeded on node: {node_id}, exiting.")
        return

    # Get the checkpoint to load from. The get_resume_checkpoint function will
    # automatically detect whether this is a resume or a fresh run.
    symlink_checkpoint_path = f"{checkpoint_folder}/checkpoint.torch"
    if cfg.CHECKPOINT.USE_SYMLINK_CHECKPOINT_FOR_RESUME and PathManager.exists(
            symlink_checkpoint_path):
        checkpoint_path = f"{checkpoint_folder}/checkpoint.torch"
    else:
        checkpoint_path = get_resume_checkpoint(
            cfg, checkpoint_folder=checkpoint_folder)

    try:
        if world_size > 1:
            torch.multiprocessing.spawn(
                _distributed_worker,
                nprocs=cfg.DISTRIBUTED.NUM_PROC_PER_NODE,
                args=(
                    cfg,
                    node_id,
                    dist_run_id,
                    engine_name,
                    checkpoint_path,
                    checkpoint_folder,
                    hook_generator,
                ),
                daemon=False,
            )
        else:
            _distributed_worker(
                local_rank=0,
                cfg=cfg,
                node_id=node_id,
                dist_run_id=dist_run_id,
                engine_name=engine_name,
                checkpoint_path=checkpoint_path,
                checkpoint_folder=checkpoint_folder,
                hook_generator=hook_generator,
            )

    except (KeyboardInterrupt, RuntimeError) as e:
        logging.error("Wrapping up, caught exception: ", e)
        if isinstance(e, RuntimeError):
            raise e
    finally:
        cleanup_local_dir(cfg)

    logging.info("All Done!")
Example #2
def launch_distributed(
    cfg: AttrDict,
    node_id: int,
    engine_name: str,
    hook_generator: Callable[[Any], List[ClassyHook]],
):
    """
    Launch the distributed training across the GPUs of the current node according to the cfg.

    If more than one node is needed for training, this function should be called on each
    of the nodes, each time with a unique node_id in the range [0..N-1], where N is the
    total number of nodes taking part in the training.

    Alternatively, you can use SLURM or any cluster management system to run this function
    for you.

    This function configures the node_id and dist_run_id and sets up the environment variables.

    Args:
        cfg (AttrDict): VISSL yaml configuration
        node_id (int): node_id for this node
        engine_name (str): what engine to run: train or extract_features
        hook_generator (Callable): Callback to generate all the ClassyVision hooks
            for this engine
    """

    setup_logging(__name__)
    node_id = get_node_id(node_id)
    dist_run_id = get_dist_run_id(cfg, cfg.DISTRIBUTED.NUM_NODES)

    # If using GPUs, check that the user has not requested more GPUs than are available on this system.
    if cfg.MACHINE.DEVICE == "gpu":
        assert cfg.DISTRIBUTED.NUM_PROC_PER_NODE <= torch.cuda.device_count(
        ), (f"User system doesn't have requested {cfg.DISTRIBUTED.NUM_PROC_PER_NODE} gpus "
            f"available. Number of gpus found on user system={torch.cuda.device_count()}. "
            "Please set the DISTRIBUTED.NUM_PROC_PER_NODE properly.")

    # set the environment variables including local rank, node id etc.
    set_env_vars(local_rank=0, node_id=node_id, cfg=cfg)

    # given the checkpoint folder, we check that there's not already a final checkpoint
    # and that if there already exists a final checkpoint and user is not overriding
    # to ignore the final checkpoint
    checkpoint_folder = get_checkpoint_folder(cfg)
    if is_training_finished(cfg, checkpoint_folder=checkpoint_folder):
        logging.info(
            f"Training already succeeded on node: {node_id}, exiting.")
        return

    # Get the checkpoint where to resume from. The get_resume_checkpoint function will
    # automatically take care of detecting whether it's a resume or not.
    symlink_checkpoint_path = f"{checkpoint_folder}/checkpoint.torch"
    if cfg.CHECKPOINT.USE_SYMLINK_CHECKPOINT_FOR_RESUME and g_pathmgr.exists(
            symlink_checkpoint_path):
        checkpoint_path = f"{checkpoint_folder}/checkpoint.torch"
    else:
        checkpoint_path = get_resume_checkpoint(
            cfg, checkpoint_folder=checkpoint_folder)

    # assert that if the user set the PARAMS_FILE, it must exist and be valid.
    # we only use the PARAMS_FILE init if the checkpoint doesn't exist for the
    # given training. This ensures that if the same training resumes, then it
    # resumes from the checkpoint and not the weight init
    if checkpoint_path is None and cfg["MODEL"]["WEIGHTS_INIT"]["PARAMS_FILE"]:
        params_file = cfg["MODEL"]["WEIGHTS_INIT"]["PARAMS_FILE"]
        error_message = f"Specified PARAMS_FILE does NOT exist: {params_file}"
        assert g_pathmgr.exists(params_file), error_message

    # copy the data to local if user wants. This can speed up dataloading.
    _copy_to_local(cfg)

    try:
        torch.multiprocessing.spawn(
            _distributed_worker,
            nprocs=cfg.DISTRIBUTED.NUM_PROC_PER_NODE,
            args=(
                cfg,
                node_id,
                dist_run_id,
                engine_name,
                checkpoint_path,
                checkpoint_folder,
                hook_generator,
            ),
            daemon=False,
        )
    except (KeyboardInterrupt, RuntimeError) as e:
        logging.error("Wrapping up, caught exception: ", e)
        if isinstance(e, RuntimeError):
            raise e
    finally:
        _cleanup_local_dir(cfg)

    logging.info("All Done!")
    shutdown_logging()
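
As the docstring notes, for multi-node training this function is run once per node with a unique node_id, or handed off to SLURM. A minimal sketch of how a per-node entry point might pick its node_id is shown below; NODE_ID is a hypothetical override variable, while SLURM_NODEID is the index SLURM exports for each node.

import os

def infer_node_id(default: int = 0) -> int:
    # Prefer an explicit override, then the SLURM-provided node index,
    # and finally 0 for single-node runs.
    for var in ("NODE_ID", "SLURM_NODEID"):
        value = os.environ.get(var)
        if value is not None:
            return int(value)
    return default

node_id = infer_node_id()
# launch_distributed(cfg, node_id=node_id, engine_name="train", hook_generator=...)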
Example #3
def launch_distributed(
    cfg: AttrDict,
    node_id: int,
    engine_name: str,
    hook_generator: Callable[[Any], List[ClassyHook]],
):
    """
    Launch the distributed training across the GPUs of the current node according to the cfg.

    If more than one node is needed for training, this function should be called on each
    of the nodes, each time with a unique node_id in the range [0..N-1], where N is the
    total number of nodes taking part in the training.

    Alternatively, you can use SLURM or any cluster management system to run this function
    for you.

    This function configures the node_id and dist_run_id and sets up the environment variables.

    Args:
        cfg (AttrDict): VISSL yaml configuration
        node_id (int): node_id for this node
        engine_name (str): what engine to run: train or extract_features
        hook_generator (Callable): Callback to generate all the ClassyVision hooks
            for this engine
    """

    setup_logging(__name__)
    node_id = get_node_id(node_id)
    dist_run_id = get_dist_run_id(cfg, cfg.DISTRIBUTED.NUM_NODES)
    world_size = cfg.DISTRIBUTED.NUM_NODES * cfg.DISTRIBUTED.NUM_PROC_PER_NODE
    set_env_vars(local_rank=0, node_id=node_id, cfg=cfg)
    _copy_to_local(cfg)

    # given the checkpoint folder, we check that there's not already a final checkpoint
    checkpoint_folder = get_checkpoint_folder(cfg)
    if is_training_finished(cfg, checkpoint_folder=checkpoint_folder):
        logging.info(f"Training already succeeded on node: {node_id}, exiting.")
        return

    # Get the checkpoint to load from. The get_resume_checkpoint function will
    # automatically detect whether this is a resume or a fresh run.
    symlink_checkpoint_path = f"{checkpoint_folder}/checkpoint.torch"
    if cfg.CHECKPOINT.USE_SYMLINK_CHECKPOINT_FOR_RESUME and PathManager.exists(
        symlink_checkpoint_path
    ):
        checkpoint_path = f"{checkpoint_folder}/checkpoint.torch"
    else:
        checkpoint_path = get_resume_checkpoint(
            cfg, checkpoint_folder=checkpoint_folder
        )

    try:
        if world_size > 1:
            torch.multiprocessing.spawn(
                _distributed_worker,
                nprocs=cfg.DISTRIBUTED.NUM_PROC_PER_NODE,
                args=(
                    cfg,
                    node_id,
                    dist_run_id,
                    engine_name,
                    checkpoint_path,
                    checkpoint_folder,
                    hook_generator,
                ),
                daemon=False,
            )
        else:
            _distributed_worker(
                local_rank=0,
                cfg=cfg,
                node_id=node_id,
                dist_run_id=dist_run_id,
                engine_name=engine_name,
                checkpoint_path=checkpoint_path,
                checkpoint_folder=checkpoint_folder,
                hook_generator=hook_generator,
            )

    except (KeyboardInterrupt, RuntimeError) as e:
        logging.error("Wrapping up, caught exception: ", e)
        if isinstance(e, RuntimeError):
            raise e
    finally:
        _cleanup_local_dir(cfg)

    logging.info("All Done!")
    shutdown_logging()
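
This variant (like Example #1) only spawns subprocesses when world_size > 1 and otherwise calls the worker in-process with local_rank=0. That works because torch.multiprocessing.spawn(fn, args=args, nprocs=N) invokes fn(i, *args) with i in [0..N-1], so the process index plays the role of the local rank. A tiny self-contained sketch of that contract, with a toy worker rather than VISSL's _distributed_worker:

import torch.multiprocessing as mp

def _toy_worker(local_rank: int, run_id: str, world_size: int):
    # torch.multiprocessing.spawn passes the process index as the first argument.
    print(f"worker local_rank={local_rank} run_id={run_id} world_size={world_size}")

if __name__ == "__main__":
    world_size = 2
    if world_size > 1:
        mp.spawn(_toy_worker, nprocs=world_size, args=("run-0", world_size), daemon=False)
    else:
        # Single-process path: call the worker directly with local_rank=0.
        _toy_worker(local_rank=0, run_id="run-0", world_size=world_size)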