def create_trial_instance(
    trial_def: Type[det.Trial],
    checkpoint_dir: str,
    config: Optional[Dict[str, Any]] = None,
    hparams: Optional[Dict[str, Any]] = None,
) -> det.Trial:
    """
    Instantiate a Trial class against a locally constructed execution environment.

    This is a convenience for debugging trial logic in any development
    environment.

    Arguments:
        trial_def:
            A class definition that inherits from the det.Trial interface.
        checkpoint_dir:
            The checkpoint directory that the trial will use for loading and
            saving checkpoints.
            NOTE(review): this argument is not referenced anywhere in the
            function body -- confirm whether it should be wired in.
        config:
            An optional experiment configuration that is used to initialize the
            :class:`determined.TrialContext`. It is also consulted (as
            ``config or {}``) to decide whether debug logging is enabled.
        hparams:
            Optional hyperparameters, forwarded unchanged to
            ``det._make_local_execution_env``.

    Returns:
        The object produced by calling ``trial_def`` with a context built from
        ``trial_def.trial_context_class``.
    """
    # Only inspect the experiment config when the process-wide debug mode is
    # off, mirroring an ``a or b`` short-circuit.
    debug = util.debug_mode()
    if not debug:
        debug = det.ExperimentConfig(config or {}).debug_enabled()
    determined_common.set_logger(debug)

    # The middle element of the returned triple is not needed here.
    local_env = det._make_local_execution_env(False, config, hparams)
    env, _rendezvous_info, hvd_config = local_env

    return trial_def(trial_def.trial_context_class(env, hvd_config))
def local_experiment(args: Namespace) -> None:
    """
    Run a local one-batch test of the experiment described by ``args``.

    Reads ``args.test_mode``, ``args.config_file`` and ``args.model_def``.

    Raises:
        ImportError: if the ``determined`` package cannot be imported (a hint
            is printed first).
        NotImplementedError: if ``args.test_mode`` is falsy; only local test
            mode is supported.
    """
    try:
        from determined import experimental, load
    except ImportError as import_error:
        print("--local requires that the `determined` package is installed.")
        raise import_error

    if not args.test_mode:
        raise NotImplementedError(
            "Local training mode (--local mode without --test mode) is not yet supported. Please "
            "try local test mode by adding the --test flag or cluster training mode by removing "
            "the --local flag."
        )

    experiment_config = _parse_config_file_or_exit(args.config_file)

    debug_enabled = bool(experiment_config.get("debug", False))
    determined_common.set_logger(debug_enabled)

    model_dir = args.model_def.resolve()
    with experimental._local_execution_manager(model_dir):
        entrypoint = experiment_config["entrypoint"]
        trial_class = load.load_trial_implementation(entrypoint)

        # Python typically initializes sys.path[0] as the empty string when
        # invoked interactively, which directs Python to search modules in
        # the current directory first. That does not happen when this
        # function is invoked via the cli, so the entry is added manually so
        # that test_one_batch can import the entrypoint relative to the
        # model_def directory.
        #
        # Reference: https://docs.python.org/3/library/sys.html#sys.path
        #
        # NOTE(review): the prepend happens after the entrypoint has already
        # been loaded above -- confirm this ordering is intentional.
        sys.path = [""] + sys.path

        experimental.test_one_batch(trial_class=trial_class, config=experiment_config)
def init_native(
    trial_def: Optional[Type[det.Trial]] = None,
    controller_cls: Optional[Type[det.TrialController]] = None,
    native_context_cls: Optional[Type[det.NativeContext]] = None,
    config: Optional[Dict[str, Any]] = None,
    local: bool = False,
    test: bool = False,
    context_dir: str = "",
    command: Optional[List[str]] = None,
    master_url: Optional[str] = None,
) -> Any:
    """
    Dispatch to local one-batch testing or to cluster-mode initialization.

    Arguments:
        trial_def: Optional trial class, forwarded to the selected backend.
        controller_cls: Optional controller class, forwarded to the selected
            backend.
        native_context_cls: Optional native context class, forwarded to the
            selected backend.
        config: Optional experiment configuration; also consulted (as
            ``config or {}``) to decide whether debug logging is enabled.
        local: When true, run ``test_one_batch`` locally; otherwise call
            ``_init_cluster_mode``.
        test: Forwarded to ``_init_cluster_mode``. In local mode a falsy value
            only triggers a warning, since local mode always tests.
        context_dir: Directory handed to ``det._local_execution_manager``
            (local mode, after resolving) or to ``_init_cluster_mode``.
        command: Forwarded to ``_init_cluster_mode`` only.
        master_url: Forwarded to ``_init_cluster_mode`` only.

    Returns:
        Whatever the selected backend returns.
    """
    # Only inspect the experiment config when the process-wide debug mode is
    # off, mirroring an ``a or b`` short-circuit.
    debug = util.debug_mode()
    if not debug:
        debug = det.ExperimentConfig(config or {}).debug_enabled()
    determined_common.set_logger(debug)

    if not local:
        return _init_cluster_mode(
            trial_def=trial_def,
            controller_cls=controller_cls,
            native_context_cls=native_context_cls,
            config=config,
            test=test,
            context_dir=context_dir,
            command=command,
            master_url=master_url,
        )

    if not test:
        logging.warning("local training is not supported, testing instead")

    resolved_context_dir = pathlib.Path(context_dir).resolve()
    with det._local_execution_manager(resolved_context_dir):
        return test_one_batch(
            controller_cls=controller_cls,
            native_context_cls=native_context_cls,
            trial_class=trial_def,
            config=config,
        )
def local_experiment(args: Namespace) -> None:
    """
    Run a local one-batch test of the experiment described by ``args``.

    Reads ``args.test_mode``, ``args.config_file`` and ``args.model_def``.
    The trial class is loaded and tested while
    ``det._local_execution_manager`` is active for the resolved model
    definition path.

    Raises:
        ImportError: if the ``determined`` package cannot be imported (a hint
            is printed first).
        NotImplementedError: if ``args.test_mode`` is falsy; only local test
            mode is supported.
    """
    try:
        import determined as det
        from determined import experimental, load
    except ImportError as import_error:
        print("--local requires that the `determined` package is installed.")
        raise import_error

    if not args.test_mode:
        raise NotImplementedError(
            "Local training mode (--local mode without --test mode) is not yet supported. Please "
            "try local test mode by adding the --test flag or cluster training mode by removing "
            "the --local flag."
        )

    experiment_config = _parse_config_file_or_exit(args.config_file)

    debug_enabled = bool(experiment_config.get("debug", False))
    determined_common.set_logger(debug_enabled)

    model_dir = args.model_def.resolve()
    with det._local_execution_manager(model_dir):
        entrypoint = experiment_config["entrypoint"]
        trial_class = load.load_trial_implementation(entrypoint)
        experimental.test_one_batch(trial_class=trial_class, config=experiment_config)
def main() -> None:
    """
    Build an ``EnvContext`` from ``DET_*`` environment variables and start training.

    Steps, in order:
      1. Exit with a message if any key in ``ENVIRONMENT_VARIABLE_KEYS`` is
         absent from the process environment.
      2. Configure logging from the ``debug`` entry of the experiment config.
      3. Read and parse the remaining ``DET_*`` variables.
      4. Validate the checkpoint storage configuration; log the error and
         exit with status 1 if validation raises.
      5. Hand the environment to ``build_and_run_training_pipeline``.
    """
    environ = os.environ

    # Report only the first missing key, then stop.
    missing_key = next((k for k in ENVIRONMENT_VARIABLE_KEYS if k not in environ), None)
    if missing_key is not None:
        sys.exit("Environment not set: missing " + missing_key)

    experiment_config = simplejson.loads(environ["DET_EXPERIMENT_CONFIG"])
    debug = experiment_config.get("debug", False)
    determined_common.set_logger(debug)

    # Master connection settings.
    master_addr = environ["DET_MASTER_ADDR"]
    master_port = int(environ["DET_MASTER_PORT"])
    use_tls = distutils.util.strtobool(environ.get("DET_USE_TLS", "false"))
    master_cert_file = environ.get("DET_MASTER_CERT_FILE")

    # Identity of this runner.
    agent_id = environ["DET_AGENT_ID"]
    container_id = environ["DET_CONTAINER_ID"]

    # Trial inputs.
    hparams = simplejson.loads(environ["DET_HPARAMS"])
    initial_workload_json = simplejson.loads(environ["DET_INITIAL_WORKLOAD"])
    initial_work = workload.Workload.from_json(initial_workload_json)

    # DET_LATEST_CHECKPOINT holds a path to a JSON file, not the JSON itself.
    with open(environ["DET_LATEST_CHECKPOINT"], "r") as checkpoint_file:
        latest_checkpoint = json.load(checkpoint_file)

    # Device and scheduling settings.
    use_gpu = distutils.util.strtobool(environ.get("DET_USE_GPU", "false"))
    slot_ids = json.loads(environ["DET_SLOT_IDS"])
    workload_manager_type = environ["DET_WORKLOAD_MANAGER_TYPE"]

    # Networking and identifiers.
    rendezvous_ports = environ["DET_RENDEZVOUS_PORTS"]
    trial_unique_port_offset = int(environ["DET_TRIAL_UNIQUE_PORT_OFFSET"])
    trial_runner_network_interface = environ["DET_TRIAL_RUNNER_NETWORK_INTERFACE"]
    trial_id = environ["DET_TRIAL_ID"]
    experiment_id = environ["DET_EXPERIMENT_ID"]
    cluster_id = environ["DET_CLUSTER_ID"]
    trial_seed = int(environ["DET_TRIAL_SEED"])

    gpu_uuids = gpu.get_gpu_uuids_and_validate(use_gpu, slot_ids)

    # Arguments are positional; their order must not change.
    env = det.EnvContext(
        master_addr,
        master_port,
        use_tls,
        master_cert_file,
        container_id,
        experiment_config,
        hparams,
        initial_work,
        latest_checkpoint,
        use_gpu,
        gpu_uuids,
        slot_ids,
        debug,
        workload_manager_type,
        rendezvous_ports,
        trial_unique_port_offset,
        trial_runner_network_interface,
        trial_id,
        experiment_id,
        cluster_id,
        trial_seed,
    )

    logging.info(
        f"New trial runner in (container {container_id}) on agent {agent_id}: {env.__dict__}."
    )

    try:
        # The lookup stays inside the try so that a missing
        # "checkpoint_storage" entry is reported the same way as a
        # validation failure.
        checkpoint_storage = env.experiment_config["checkpoint_storage"]
        storage.validate_config(
            checkpoint_storage,
            container_path=constants.SHARED_FS_CONTAINER_PATH,
        )
    except Exception as validation_error:
        logging.error("Checkpoint storage validation failed: {}".format(validation_error))
        sys.exit(1)

    build_and_run_training_pipeline(env)