Exemple #1
0
def create_trial_instance(
    trial_def: Type[det.Trial],
    checkpoint_dir: str,
    config: Optional[Dict[str, Any]] = None,
    hparams: Optional[Dict[str, Any]] = None,
) -> det.Trial:
    """
    Create a trial instance from a Trial class definition. This can be a useful
    utility for debugging your trial logic in any development environment.

    Arguments:
        trial_def: A class definition that inherits from the det.Trial interface.
        checkpoint_dir:
            The checkpoint directory that the trial will use for loading and
            saving checkpoints.
        config:
            An optional experiment configuration that is used to initialize the
            :class:`determined.TrialContext`. If not specified, a minimal default
            is used.
    """
    determined_common.set_logger(
        util.debug_mode()
        or det.ExperimentConfig(config or {}).debug_enabled())
    env, rendezvous_info, hvd_config = det._make_local_execution_env(
        False, config, hparams)
    trial_context = trial_def.trial_context_class(env, hvd_config)
    return trial_def(trial_context)
def local_experiment(args: Namespace) -> None:
    try:
        from determined import experimental, load
    except ImportError as e:
        print("--local requires that the `determined` package is installed.")
        raise e

    if not args.test_mode:
        raise NotImplementedError(
            "Local training mode (--local mode without --test mode) is not yet supported. Please "
            "try local test mode by adding the --test flag or cluster training mode by removing "
            "the --local flag.")

    experiment_config = _parse_config_file_or_exit(args.config_file)

    determined_common.set_logger(bool(experiment_config.get("debug", False)))

    # Python typically initializes sys.path[0] as the empty string when
    # invoked interactively, which directs Python to search modules in the
    # current directory first. However, this is _not_ happening when this
    # Python function is invoked via the cli. We add it manually here so
    # that test_one_batch can import the entrypoint by changing the
    # directory to model_def.
    #
    # Reference: https://docs.python.org/3/library/sys.html#sys.path

    with experimental._local_execution_manager(args.model_def.resolve()):
        trial_class = load.load_trial_implementation(
            experiment_config["entrypoint"])
        sys.path = [""] + sys.path
        experimental.test_one_batch(trial_class=trial_class,
                                    config=experiment_config)
Exemple #3
0
def init_native(
    trial_def: Optional[Type[det.Trial]] = None,
    controller_cls: Optional[Type[det.TrialController]] = None,
    native_context_cls: Optional[Type[det.NativeContext]] = None,
    config: Optional[Dict[str, Any]] = None,
    local: bool = False,
    test: bool = False,
    context_dir: str = "",
    command: Optional[List[str]] = None,
    master_url: Optional[str] = None,
) -> Any:
    determined_common.set_logger(
        util.debug_mode()
        or det.ExperimentConfig(config or {}).debug_enabled())

    if local:
        if not test:
            logging.warning("local training is not supported, testing instead")

        with det._local_execution_manager(pathlib.Path(context_dir).resolve()):
            return test_one_batch(
                controller_cls=controller_cls,
                native_context_cls=native_context_cls,
                trial_class=trial_def,
                config=config,
            )

    else:
        return _init_cluster_mode(
            trial_def=trial_def,
            controller_cls=controller_cls,
            native_context_cls=native_context_cls,
            config=config,
            test=test,
            context_dir=context_dir,
            command=command,
            master_url=master_url,
        )
Exemple #4
0
def local_experiment(args: Namespace) -> None:
    try:
        import determined as det
        from determined import experimental, load
    except ImportError as e:
        print("--local requires that the `determined` package is installed.")
        raise e

    if not args.test_mode:
        raise NotImplementedError(
            "Local training mode (--local mode without --test mode) is not yet supported. Please "
            "try local test mode by adding the --test flag or cluster training mode by removing "
            "the --local flag.")

    experiment_config = _parse_config_file_or_exit(args.config_file)

    determined_common.set_logger(bool(experiment_config.get("debug", False)))

    with det._local_execution_manager(args.model_def.resolve()):
        trial_class = load.load_trial_implementation(
            experiment_config["entrypoint"])
        experimental.test_one_batch(trial_class=trial_class,
                                    config=experiment_config)
Exemple #5
0
def main() -> None:
    for k in ENVIRONMENT_VARIABLE_KEYS:
        if k not in os.environ:
            sys.exit("Environment not set: missing " + k)

    experiment_config = simplejson.loads(os.environ["DET_EXPERIMENT_CONFIG"])
    debug = experiment_config.get("debug", False)
    determined_common.set_logger(debug)

    master_addr = os.environ["DET_MASTER_ADDR"]
    master_port = int(os.environ["DET_MASTER_PORT"])
    use_tls = distutils.util.strtobool(os.environ.get("DET_USE_TLS", "false"))
    master_cert_file = os.environ.get("DET_MASTER_CERT_FILE")
    agent_id = os.environ["DET_AGENT_ID"]
    container_id = os.environ["DET_CONTAINER_ID"]
    hparams = simplejson.loads(os.environ["DET_HPARAMS"])
    initial_work = workload.Workload.from_json(simplejson.loads(os.environ["DET_INITIAL_WORKLOAD"]))

    with open(os.environ["DET_LATEST_CHECKPOINT"], "r") as f:
        latest_checkpoint = json.load(f)

    use_gpu = distutils.util.strtobool(os.environ.get("DET_USE_GPU", "false"))
    slot_ids = json.loads(os.environ["DET_SLOT_IDS"])
    workload_manager_type = os.environ["DET_WORKLOAD_MANAGER_TYPE"]
    det_rendezvous_ports = os.environ["DET_RENDEZVOUS_PORTS"]
    det_trial_unique_port_offset = int(os.environ["DET_TRIAL_UNIQUE_PORT_OFFSET"])
    det_trial_runner_network_interface = os.environ["DET_TRIAL_RUNNER_NETWORK_INTERFACE"]
    det_trial_id = os.environ["DET_TRIAL_ID"]
    det_experiment_id = os.environ["DET_EXPERIMENT_ID"]
    det_cluster_id = os.environ["DET_CLUSTER_ID"]
    trial_seed = int(os.environ["DET_TRIAL_SEED"])

    gpu_uuids = gpu.get_gpu_uuids_and_validate(use_gpu, slot_ids)

    env = det.EnvContext(
        master_addr,
        master_port,
        use_tls,
        master_cert_file,
        container_id,
        experiment_config,
        hparams,
        initial_work,
        latest_checkpoint,
        use_gpu,
        gpu_uuids,
        slot_ids,
        debug,
        workload_manager_type,
        det_rendezvous_ports,
        det_trial_unique_port_offset,
        det_trial_runner_network_interface,
        det_trial_id,
        det_experiment_id,
        det_cluster_id,
        trial_seed,
    )

    logging.info(
        f"New trial runner in (container {container_id}) on agent {agent_id}: {env.__dict__}."
    )

    try:
        storage.validate_config(
            env.experiment_config["checkpoint_storage"],
            container_path=constants.SHARED_FS_CONTAINER_PATH,
        )
    except Exception as e:
        logging.error("Checkpoint storage validation failed: {}".format(e))
        sys.exit(1)

    build_and_run_training_pipeline(env)