Example #1
def make_default_env_context(
    hparams: Dict[str, Any], experiment_config: Optional[Dict] = None, trial_seed: int = 0
) -> det.EnvContext:
    if experiment_config is None:
        experiment_config = make_default_exp_config(hparams, 1)

    # TODO(ryan): Fix the parameter passing so that this doesn't read from environment variables,
    # and we can get rid of the @expose_gpus fixture.
    use_gpu = distutils.util.strtobool(os.environ.get("DET_USE_GPU", "false"))
    gpu_uuids = gpu.get_gpu_uuids_and_validate(use_gpu)

    return det.EnvContext(
        experiment_config=experiment_config,
        initial_workload=workload.Workload(
            workload.Workload.Kind.RUN_STEP, ExperimentID(1), TrialID(1), StepID(1)
        ),
        master_addr="",
        master_port=0,
        container_id="",
        hparams=hparams,
        latest_checkpoint=None,
        use_gpu=use_gpu,
        container_gpus=gpu_uuids,
        slot_ids=[],
        debug=False,
        workload_manager_type="",
        det_rendezvous_ports="",
        det_trial_runner_network_interface=constants.AUTO_DETECT_TRIAL_RUNNER_NETWORK_INTERFACE,
        det_trial_id="1",
        det_experiment_id="1",
        det_cluster_id="uuid-123",
        trial_seed=trial_seed,
    )
Example #2
def create_default_env_context(
        experiment_config: Dict[str, Any]) -> det.EnvContext:
    det_trial_runner_network_interface = constants.AUTO_DETECT_TRIAL_RUNNER_NETWORK_INTERFACE
    return det.EnvContext(
        experiment_config=experiment_config,
        initial_workload=workload.Workload(
            workload.Workload.Kind.RUN_STEP,
            ExperimentID(1),
            TrialID(1),
            StepID(1),
            det.ExperimentConfig(experiment_config).scheduling_unit(),
            0,
        ),
        master_addr="",
        master_port=0,
        use_tls=False,
        master_cert_file=None,
        master_cert_name=None,
        container_id="",
        hparams={"global_batch_size": 32},
        latest_checkpoint=None,
        use_gpu=False,
        container_gpus=[],
        slot_ids=[],
        debug=False,
        workload_manager_type="",
        det_rendezvous_ports="",
        det_trial_unique_port_offset=0,
        det_trial_runner_network_interface=det_trial_runner_network_interface,
        det_trial_id="1",
        det_experiment_id="1",
        det_cluster_id="uuid-123",
        trial_seed=0,
    )
Example #3
def main() -> None:
    for k in ENVIRONMENT_VARIABLE_KEYS:
        if k not in os.environ:
            sys.exit("Environment not set: missing " + k)

    experiment_config = simplejson.loads(os.environ["DET_EXPERIMENT_CONFIG"])
    debug = experiment_config.get("debug", False)
    det._set_logger(debug)

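    # Read the trial's runtime settings from the DET_* environment variables checked above.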
    master_addr = os.environ["DET_MASTER_ADDR"]
    master_port = int(os.environ["DET_MASTER_PORT"])
    agent_id = os.environ["DET_AGENT_ID"]
    container_id = os.environ["DET_CONTAINER_ID"]
    hparams = simplejson.loads(os.environ["DET_HPARAMS"])
    initial_work = workload.Workload.from_json(simplejson.loads(os.environ["DET_INITIAL_WORKLOAD"]))
    latest_checkpoint = simplejson.loads(os.environ["DET_LATEST_CHECKPOINT"])
    use_gpu = distutils.util.strtobool(os.environ.get("DET_USE_GPU", "false"))
    slot_ids = json.loads(os.environ["DET_SLOT_IDS"])
    workload_manager_type = os.environ["DET_WORKLOAD_MANAGER_TYPE"]
    det_rendezvous_ports = os.environ["DET_RENDEZVOUS_PORTS"]
    det_trial_runner_network_interface = os.environ["DET_TRIAL_RUNNER_NETWORK_INTERFACE"]
    det_trial_id = os.environ["DET_TRIAL_ID"]
    det_experiment_id = os.environ["DET_EXPERIMENT_ID"]
    det_cluster_id = os.environ["DET_CLUSTER_ID"]
    trial_seed = int(os.environ["DET_TRIAL_SEED"])

    gpu_uuids = gpu.get_gpu_uuids_and_validate(use_gpu, slot_ids)

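    # Build the EnvContext positionally from the values gathered above.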
    env = det.EnvContext(
        master_addr,
        master_port,
        container_id,
        experiment_config,
        hparams,
        initial_work,
        latest_checkpoint,
        use_gpu,
        gpu_uuids,
        slot_ids,
        debug,
        workload_manager_type,
        det_rendezvous_ports,
        det_trial_runner_network_interface,
        det_trial_id,
        det_experiment_id,
        det_cluster_id,
        trial_seed,
    )

    logging.info(
        f"New trial runner in (container {container_id}) on agent {agent_id}: {env.__dict__}."
    )

    try:
        storage.validate_config(env.experiment_config["checkpoint_storage"])
    except Exception as e:
        logging.error("Checkpoint storage validation failed: {}".format(e))
        sys.exit(1)

    build_and_run_training_pipeline(env)
Example #4
def get_dummy_env() -> det.EnvContext:
    return det.EnvContext(
        master_url="",
        master_cert_file=None,
        master_cert_name=None,
        experiment_config={"resources": {"slots_per_trial": 1, "native_parallel": False}},
        latest_checkpoint=None,
        steps_completed=0,
        use_gpu=False,
        container_gpus=[],
        slot_ids=[],
        debug=False,
        hparams={"global_batch_size": 1},
        det_trial_unique_port_offset=0,
        det_trial_id="1",
        det_agent_id="1",
        det_experiment_id="1",
        det_cluster_id="uuid-123",
        trial_seed=0,
        trial_run_id=1,
        allocation_id="",
        managed_training=True,
        test_mode=False,
        on_cluster=False,
    )
Example #5
def get_dummy_env() -> det.EnvContext:
    return det.EnvContext(
        master_addr="",
        master_port=0,
        container_id="",
        experiment_config={
            "resources": {
                "slots_per_trial": 1,
                "native_parallel": False
            }
        },
        initial_workload=workload.Workload(
            workload.Workload.Kind.RUN_STEP,
            determined_common.types.ExperimentID(1),
            determined_common.types.TrialID(1),
            determined_common.types.StepID(1),
        ),
        latest_checkpoint=None,
        use_gpu=False,
        container_gpus=[],
        slot_ids=[],
        debug=False,
        workload_manager_type="",
        hparams={"global_batch_size": 1},
        det_rendezvous_ports="",
        det_trial_runner_network_interface=constants.AUTO_DETECT_TRIAL_RUNNER_NETWORK_INTERFACE,
        det_trial_id="1",
        det_experiment_id="1",
        det_cluster_id="uuid-123",
        trial_seed=0,
    )
Example #6
def _make_local_execution_env(
    managed_training: bool,
    test_mode: bool,
    config: Optional[Dict[str, Any]],
    hparams: Optional[Dict[str, Any]] = None,
    limit_gpus: Optional[int] = None,
) -> Tuple[det.EnvContext, det.RendezvousInfo, horovod.HorovodContext]:
    config = det.ExperimentConfig(
        _make_local_execution_exp_config(config,
                                         managed_training=managed_training,
                                         test_mode=test_mode))
    hparams = hparams or api.generate_random_hparam_values(
        config.get("hyperparameters", {}))
    use_gpu, container_gpus, slot_ids = _get_gpus(limit_gpus)
    local_rendezvous_ports = (
        f"{constants.LOCAL_RENDEZVOUS_PORT},{constants.LOCAL_RENDEZVOUS_PORT+1}"
    )

    env = det.EnvContext(
        master_addr="",
        master_port=0,
        use_tls=False,
        master_cert_file=None,
        master_cert_name=None,
        container_id="",
        experiment_config=config,
        hparams=hparams,
        initial_workload=workload.train_workload(1, 1, 1,
                                                 config.scheduling_unit()),
        latest_checkpoint=None,
        use_gpu=use_gpu,
        container_gpus=container_gpus,
        slot_ids=slot_ids,
        debug=config.debug_enabled(),
        workload_manager_type="",
        det_rendezvous_ports=local_rendezvous_ports,
        det_trial_unique_port_offset=0,
        det_trial_runner_network_interface=constants.AUTO_DETECT_TRIAL_RUNNER_NETWORK_INTERFACE,
        det_trial_id="",
        det_experiment_id="",
        det_cluster_id="",
        trial_seed=config.experiment_seed(),
        managed_training=managed_training,
        test_mode=test_mode,
        on_cluster=False,
    )
    rendezvous_ports = env.rendezvous_ports()
    rendezvous_info = det.RendezvousInfo(
        addrs=[f"0.0.0.0:{rendezvous_ports[0]}"],
        addrs2=[f"0.0.0.0:{rendezvous_ports[1]}"],
        rank=0)
    hvd_config = horovod.HorovodContext.from_configs(env.experiment_config,
                                                     rendezvous_info,
                                                     env.hparams)

    return env, rendezvous_info, hvd_config
Example #7
def _make_local_test_experiment_env(
    checkpoint_dir: pathlib.Path,
    config: Optional[Dict[str, Any]],
    hparams: Optional[Dict[str, Any]] = None,
) -> Tuple[det.EnvContext, workload.Stream, det.RendezvousInfo,
           horovod.HorovodContext]:
    config = det.ExperimentConfig(_make_local_test_experiment_config(config))
    hparams = hparams or _generate_test_hparam_values(config)
    use_gpu, container_gpus, slot_ids = _get_gpus()
    local_rendezvous_ports = (
        f"{constants.LOCAL_RENDEZVOUS_PORT},{constants.LOCAL_RENDEZVOUS_PORT+1}"
    )

    env = det.EnvContext(
        master_addr="",
        master_port=1,
        container_id="test_mode",
        experiment_config=config,
        hparams=hparams,
        initial_workload=workload.train_workload(1, 1, 1,
                                                 config.batches_per_step()),
        latest_checkpoint=None,
        use_gpu=use_gpu,
        container_gpus=container_gpus,
        slot_ids=slot_ids,
        debug=config.debug_enabled(),
        workload_manager_type="",
        det_rendezvous_ports=local_rendezvous_ports,
        det_trial_runner_network_interface=constants.AUTO_DETECT_TRIAL_RUNNER_NETWORK_INTERFACE,
        det_trial_id="1",
        det_experiment_id="1",
        det_cluster_id="test_mode",
        trial_seed=config.experiment_seed(),
    )
    workloads = _make_test_workloads(checkpoint_dir.joinpath("checkpoint"),
                                     config)
    rendezvous_ports = env.rendezvous_ports()
    rendezvous_info = det.RendezvousInfo(
        addrs=[f"0.0.0.0:{rendezvous_ports[0]}"],
        addrs2=[f"0.0.0.0:{rendezvous_ports[1]}"],
        rank=0)
    hvd_config = horovod.HorovodContext.from_configs(env.experiment_config,
                                                     rendezvous_info,
                                                     env.hparams)

    return env, workloads, rendezvous_info, hvd_config
Example #8
def _make_local_execution_env(
    managed_training: bool,
    test_mode: bool,
    config: Optional[Dict[str, Any]],
    checkpoint_dir: str,
    hparams: Optional[Dict[str, Any]] = None,
    limit_gpus: Optional[int] = None,
) -> Tuple[core.Context, det.EnvContext]:
    config = det.ExperimentConfig(
        _make_local_execution_exp_config(config,
                                         checkpoint_dir,
                                         managed_training=managed_training,
                                         test_mode=test_mode))
    hparams = hparams or api.generate_random_hparam_values(
        config.get("hyperparameters", {}))
    use_gpu, container_gpus, slot_ids = _get_gpus(limit_gpus)

    env = det.EnvContext(
        master_url="",
        master_cert_file=None,
        master_cert_name=None,
        experiment_config=config,
        hparams=hparams,
        latest_checkpoint=None,
        steps_completed=0,
        use_gpu=use_gpu,
        container_gpus=container_gpus,
        slot_ids=slot_ids,
        debug=config.debug_enabled(),
        det_trial_unique_port_offset=0,
        det_trial_id="",
        det_agent_id="",
        det_experiment_id="",
        det_cluster_id="",
        trial_seed=config.experiment_seed(),
        trial_run_id=1,
        allocation_id="",
        managed_training=managed_training,
        test_mode=test_mode,
        on_cluster=False,
    )

    core_context = core._dummy_init()

    return core_context, env
Example #9
def get_dummy_env() -> det.EnvContext:
    return det.EnvContext(
        master_addr="",
        master_port=0,
        use_tls=False,
        master_cert_file=None,
        master_cert_name=None,
        container_id="",
        experiment_config={
            "resources": {
                "slots_per_trial": 1,
                "native_parallel": False
            }
        },
        initial_workload=workload.Workload(
            workload.Workload.Kind.RUN_STEP,
            determined.common.types.ExperimentID(1),
            determined.common.types.TrialID(1),
            determined.common.types.StepID(1),
            constants.DEFAULT_SCHEDULING_UNIT,
            0,
        ),
        latest_checkpoint=None,
        use_gpu=False,
        container_gpus=[],
        slot_ids=[],
        debug=False,
        workload_manager_type="",
        hparams={"global_batch_size": 1},
        det_rendezvous_port="",
        det_trial_unique_port_offset=0,
        det_trial_runner_network_interface=constants.AUTO_DETECT_TRIAL_RUNNER_NETWORK_INTERFACE,
        det_trial_id="1",
        det_agent_id="1",
        det_experiment_id="1",
        det_task_token="",
        det_cluster_id="uuid-123",
        trial_seed=0,
        managed_training=True,
        test_mode=False,
        on_cluster=False,
    )
Example #10
def make_default_env_context(
    hparams: Dict[str, Any],
    experiment_config: Dict,
    trial_seed: int = 0,
    latest_checkpoint: Optional[str] = None,
    steps_completed: int = 0,
    expose_gpus: bool = False,
) -> det.EnvContext:
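    # A latest checkpoint and a nonzero steps_completed must be provided together.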
    assert (latest_checkpoint is None) == (steps_completed == 0)

    if expose_gpus:
        gpu_uuids = gpu.get_gpu_uuids()
        use_gpu = bool(gpu_uuids)
    else:
        gpu_uuids = []
        use_gpu = False

    return det.EnvContext(
        experiment_config=experiment_config,
        master_url="",
        master_cert_file=None,
        master_cert_name=None,
        hparams=hparams,
        latest_checkpoint=latest_checkpoint,
        steps_completed=steps_completed,
        use_gpu=use_gpu,
        container_gpus=gpu_uuids,
        slot_ids=[],
        debug=False,
        det_trial_unique_port_offset=0,
        det_trial_id="1",
        det_experiment_id="1",
        det_agent_id="1",
        det_cluster_id="uuid-123",
        trial_seed=trial_seed,
        trial_run_id=1,
        allocation_id="",
        managed_training=True,
        test_mode=False,
        on_cluster=False,
    )
Example #11
def main() -> None:
    for k in ENVIRONMENT_VARIABLE_KEYS:
        if k not in os.environ:
            sys.exit("Environment not set: missing " + k)

    experiment_config = simplejson.loads(os.environ["DET_EXPERIMENT_CONFIG"])
    debug = experiment_config.get("debug", False)
    determined.common.set_logger(debug)

    master_addr = os.environ["DET_MASTER_ADDR"]
    master_port = int(os.environ["DET_MASTER_PORT"])
    use_tls = distutils.util.strtobool(os.environ.get("DET_USE_TLS", "false"))
    master_cert_file = os.environ.get("DET_MASTER_CERT_FILE")
    master_cert_name = os.environ.get("DET_MASTER_CERT_NAME")
    agent_id = os.environ["DET_AGENT_ID"]
    container_id = os.environ["DET_CONTAINER_ID"]
    hparams = simplejson.loads(os.environ["DET_HPARAMS"])
    initial_work = workload.Workload.from_json(
        simplejson.loads(os.environ["DET_INITIAL_WORKLOAD"]))

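    # DET_LATEST_CHECKPOINT points to a JSON file describing the latest checkpoint.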
    with open(os.environ["DET_LATEST_CHECKPOINT"], "r") as f:
        latest_checkpoint = json.load(f)

    use_gpu = distutils.util.strtobool(os.environ.get("DET_USE_GPU", "false"))
    slot_ids = json.loads(os.environ["DET_SLOT_IDS"])
    workload_manager_type = os.environ["DET_WORKLOAD_MANAGER_TYPE"]
    det_rendezvous_port = os.environ["DET_RENDEZVOUS_PORT"]
    det_trial_unique_port_offset = int(
        os.environ["DET_TRIAL_UNIQUE_PORT_OFFSET"])
    det_trial_runner_network_interface = os.environ[
        "DET_TRIAL_RUNNER_NETWORK_INTERFACE"]
    det_trial_id = os.environ["DET_TRIAL_ID"]
    det_experiment_id = os.environ["DET_EXPERIMENT_ID"]
    det_agent_id = os.environ["DET_AGENT_ID"]
    det_cluster_id = os.environ["DET_CLUSTER_ID"]
    det_task_token = os.environ["DET_TASK_TOKEN"]
    trial_seed = int(os.environ["DET_TRIAL_SEED"])

    gpu_uuids = gpu.get_gpu_uuids_and_validate(use_gpu, slot_ids)

    env = det.EnvContext(
        master_addr,
        master_port,
        use_tls,
        master_cert_file,
        master_cert_name,
        container_id,
        experiment_config,
        hparams,
        initial_work,
        latest_checkpoint,
        use_gpu,
        gpu_uuids,
        slot_ids,
        debug,
        workload_manager_type,
        det_rendezvous_port,
        det_trial_unique_port_offset,
        det_trial_runner_network_interface,
        det_trial_id,
        det_experiment_id,
        det_agent_id,
        det_cluster_id,
        det_task_token,
        trial_seed,
        managed_training=True,
        test_mode=False,
        on_cluster=True,
    )

    logging.info(
        f"New trial runner in (container {container_id}) on agent {agent_id}: {env.__dict__}."
    )

    try:
        storage.validate_config(
            env.experiment_config["checkpoint_storage"],
            container_path=constants.SHARED_FS_CONTAINER_PATH,
        )
    except Exception as e:
        logging.error("Checkpoint storage validation failed: {}".format(e))
        sys.exit(1)

    try:
        build_and_run_training_pipeline(env)
    except det.InvalidHP:
        logging.info("InvalidHP detected, gracefully exiting trial")
        pass
Example #12
def main(train_entrypoint: str) -> int:
    info = det.get_cluster_info()
    assert info is not None, "must be run on-cluster"
    assert info.task_type == "TRIAL", f'must be run with task_type="TRIAL", not "{info.task_type}"'

    # TODO: refactor data_layer and profiling to not use the cli_cert.
    certs.cli_cert = certs.default_load(info.master_url)

    # TODO: Don't include EnvContext object in the future high-level APIs for PyTorch or Keras.
    # It was natural to create this big-blob-of-config object, but it was a mistake to pass it into
    # the lowest layers of the harness code; it's too large of an object to be easily mockable,
    # which is part of why building local training mode has always been a challenge.
    #
    # A better pattern is to pass in exactly the information that is necessary at each layer.  We
    # will use that pattern for the future high-level APIs, but it's not worth refactoring e.g. the
    # TFKerasTrialController or EstimatorTrialController to add that functionality, so for now we
    # continue with the legacy strategy.

    env = det.EnvContext(
        master_url=info.master_url,
        master_cert_file=info.master_cert_file,
        master_cert_name=info.master_cert_name,
        experiment_config=info.trial._config,
        hparams=info.trial.hparams,
        latest_checkpoint=info.latest_checkpoint,
        steps_completed=info.trial._steps_completed,
        use_gpu=bool(info.gpu_uuids),
        container_gpus=info.gpu_uuids,
        slot_ids=info.slot_ids,
        debug=info.trial._debug,
        det_trial_unique_port_offset=info.trial._unique_port_offset,
        det_trial_id=str(info.trial.trial_id),
        det_experiment_id=str(info.trial.experiment_id),
        det_agent_id=info.agent_id,
        det_cluster_id=info.cluster_id,
        trial_seed=info.trial.trial_seed,
        trial_run_id=info.trial._trial_run_id,
        allocation_id=info.allocation_id,
        managed_training=True,
        test_mode=False,
        on_cluster=True,
    )

    det.common.set_logger(env.debug)
    logging.debug("Starting harness.")

    with maybe_periodic_stacktraces(env.debug):
        # Step 1: Load user code.
        # We can't build a core.Context without rank information, and we can't gather rank
        # information until the distributed backend is initialized, and we can't initialize the
        # correct distributed backend until we know which Trial class the user implemented.
        trial_class = load.trial_class_from_entrypoint(train_entrypoint)
        controller_class = load.get_trial_controller_class(trial_class)
        if info.container_rank == 0:
            try:
                analytics.send_analytics("trial_loaded", analytics.get_trial_analytics(trial_class))
            except Exception as e:
                logging.debug(f"Cannot send analytics: {e}")

        # Step 2: Initialize framework-specific details (dtrain framework, random seeds, etc).
        distributed_backend = det._DistributedBackend()
        controller_class.pre_execute_hook(env, distributed_backend)

        # Step 3: Now that the dtrain framework is initialized, build the DistributedContext object.
        # For harness.py, we only support a fixed set of Determined-provided launch layers, since
        # the TrialControllers only support a fixed set of launch layers.
        distributed = None
        if distributed_backend.use_horovod():
            distributed = core.DistributedContext.from_horovod(horovod.hvd)
        elif distributed_backend.use_deepspeed():
            distributed = core.DistributedContext.from_deepspeed()
        elif distributed_backend.use_torch():
            distributed = core.DistributedContext.from_torch_distributed()
        elif len(info.container_addrs) > 1 or len(info.slot_ids) > 1:
            raise ValueError(
                "In multi-slot tasks, the determined.exec.harness module must not be invoked "
                "directly.  Instead, it must be wrapped in one of the following launch layers: "
                "determined.launch.horovod, determined.launch.deepspeed"
            )

        # Step 4: Let core.init() create the core.Context.
        with core.init(
            distributed=distributed,
            preempt_mode=core.PreemptMode.ChiefOnly,
            tensorboard_mode=core.TensorboardMode.MANUAL,
        ) as core_context:
            trial_context = trial_class.trial_context_class(core_context, env)

            # Step 5: Instantiate the user's Trial.
            trial_inst = trial_class(trial_context)

            # Step 6: Create a TrialController and execute training.
            logging.info(f"Creating {controller_class.__name__} with {trial_class.__name__}.")
            controller = controller_class.from_trial(
                trial_inst=trial_inst,
                context=trial_context,
                env=env,
            )

            controller.run()

    return 0