def __init__(self, trial_inst: det.Trial, *args: Any, **kwargs: Any) -> None:
    super().__init__(*args, **kwargs)

    assert isinstance(
        trial_inst, DeepSpeedTrial
    ), "DeepSpeedTrialController needs a DeepSpeedTrial"
    self.trial = trial_inst
    self.context = cast(det_ds.DeepSpeedTrialContext, self.context)
    self.context._set_determined_profiler(self.prof)
    if torch.cuda.is_available():
        self.prof._set_sync_device(self._sync_device)
    self.callbacks = self.trial.build_callbacks()

    if len(self.context.models) == 0:
        raise det.errors.InvalidExperimentException(
            "Must have at least one model engine. "
            "This might be caused by not wrapping your model with wrap_model_engine()"
        )

    self.wlsq = None  # type: Optional[layers.WorkloadSequencer]
    if self.workloads is None:
        self.workloads, self.wlsq = layers.make_compatibility_workloads(
            self.context._core, self.env, self.context.models[0].train_batch_size()
        )

    self.steps_completed = self.env.steps_completed
def __init__(self, *args: Any, **kwargs: Any) -> None:
    super().__init__(*args, **kwargs)

    check_startup_hook_ran = self.env.hparams.get("check_startup_hook_ran", False)
    if check_startup_hook_ran:
        check.true(os.path.isfile("startup-hook-ran"), "File should exist.")

    self.chaos = random.SystemRandom()
    self._batch_size = self.context.get_per_slot_batch_size()
    self.chaos_probability = self.env.hparams.get("chaos_probability", 0)
    self.chaos_probability_train = self.env.hparams.get("chaos_probability_train")
    self.chaos_probability_validate = self.env.hparams.get("chaos_probability_validate")
    self.chaos_probability_checkpoint = self.env.hparams.get("chaos_probability_checkpoint")
    self.nan_probability_validate = self.env.hparams.get("nan_probability_validate", 0)
    self.fail_on_first_validation = self.env.hparams.get("fail_on_first_validation", "")
    self.fail_on_checkpoint_save = self.env.hparams.get("fail_on_checkpoint_save", "")
    self.validation_set_size = self.env.hparams.get("validation_set_size", 32 * 32)
    self.train_batch_secs = self.env.hparams.get("training_batch_seconds", 0)
    self.validation_secs = self.env.hparams.get(
        "validation_seconds",
        self.validation_set_size * self.train_batch_secs / self._batch_size,
    )
    self.num_training_metrics = self.env.hparams.get("num_training_metrics", 1)
    assert self.num_training_metrics > 0
    self.num_validation_metrics = self.env.hparams.get("num_validation_metrics", 1)
    assert self.num_validation_metrics > 0
    self.save_secs = self.env.hparams.get("save_checkpoint_seconds", 0)
    self.load_secs = self.env.hparams.get("load_checkpoint_secs", 0)
    self.metrics_progression = self.env.hparams.get("metrics_progression", "decreasing")
    assert self.metrics_progression in ("increasing", "decreasing", "constant")
    self.metrics_base = self.env.hparams.get("metrics_base", 0.9)
    assert 0 < self.metrics_base < 1
    self.metrics_sigma = self.env.hparams.get("metrics_sigma", 0.0)
    assert 0 <= self.metrics_sigma
    self.write_null = self.env.hparams.get("write_null", False)
    self.request_stop = self.env.hparams.get("request_stop", False)
    self.non_chief_exit_immediately = self.env.hparams.get("non_chief_exit_immediately", False)

    self.wlsq = None
    if self.workloads is None:
        self.workloads, self.wlsq = layers.make_compatibility_workloads(
            self.context._core, self.env, self.context.get_global_batch_size()
        )

    self.steps_completed = self.env.steps_completed

    if self.env.latest_checkpoint is not None:
        with self.context._core.checkpoint.restore_path(
            self.env.latest_checkpoint
        ) as load_path:
            self.load(pathlib.Path(load_path))
    else:
        self.trained_steps = collections.Counter()
def __init__(
    self,
    estimator: tf.estimator.Estimator,
    user_train_spec: tf.estimator.TrainSpec,
    val_spec: tf.estimator.EvalSpec,
    serving_input_receiver_fns: Dict[str, estimator.ServingInputReceiverFn],
    context: estimator.EstimatorTrialContext,
    *args: Any,
    **kwargs: Any,
) -> None:
    super().__init__(context, *args, **kwargs)

    # Catch if the estimator has been configured to use a tf.distribute.Strategy,
    # as this can conflict with Determined's distributed training and lead to
    # crashes/OOM. We cannot reliably tell the user that this was the cause of
    # their failure, because the code may crash before this point in user code
    # during build_estimator(). train_distribute is valid if it is None or if
    # it is an empty tf.contrib.distribute.DistributeConfig.
    if estimator.config.train_distribute is not None:
        check.is_none(
            estimator.config.train_distribute.train_distribute,
            f"TensorFlow's approach to distributed training can conflict with "
            f"Determined's. Currently Determined requires that the train_distribute "
            f"field of the RunConfig not be set. Your estimator has "
            f"train_distribute={str(estimator.config.train_distribute.train_distribute)}",
        )
        check.is_none(
            estimator.config.train_distribute.eval_distribute,
            f"TensorFlow's approach to distributed training can conflict with "
            f"Determined's. Currently Determined requires that the eval_distribute "
            f"field of the RunConfig not be set. Your estimator has "
            f"eval_distribute={str(estimator.config.train_distribute.eval_distribute)}",
        )
    if self.context.distributed.size > 1:
        assert (
            self.use_horovod
        ), "Estimator trial must be run with a horovod backend for distributed training"

    self.estimator = estimator
    self.user_train_spec = user_train_spec
    self.val_spec = val_spec
    self.serving_input_receiver_fns = serving_input_receiver_fns

    self.wlsq = None  # type: Optional[layers.WorkloadSequencer]
    if self.workloads is None:
        self.workloads, self.wlsq = layers.make_compatibility_workloads(
            self.context._core,
            self.env,
            self.context.get_global_batch_size(),
        )

    self._init_model()
def __init__(self, *args: Any, **kwargs: Any) -> None:
    super().__init__(*args, **kwargs)

    self.value = self.env.hparams["starting_base_value"]
    self.training_structure = self.env.hparams["training_structure"]
    self.training_structure["inf"] = math.inf
    self.training_structure["nan"] = math.nan
    self.training_structure["nanarray"] = np.array([math.nan, math.nan])
    self.validation_structure = self.env.hparams["validation_structure"]
    self.validation_structure["neg_inf"] = -1 * math.inf
    self.gain_per_batch = 0

    self.wlsq = None
    if self.workloads is None:
        self.workloads, self.wlsq = layers.make_compatibility_workloads(
            self.context._core, self.env, self.context.get_global_batch_size()
        )
def __init__(self, *args: Any, **kwargs: Any) -> None:
    super().__init__(*args, **kwargs)

    self.value = self.env.hparams["starting_base_value"]
    self.training_structure = self.env.hparams["training_structure"]
    self.validation_structure = self.env.hparams["validation_structure"]
    self.gain_per_batch = self.env.hparams["gain_per_batch"]

    self.wlsq = None
    if self.workloads is None:
        self.workloads, self.wlsq = layers.make_compatibility_workloads(
            self.context._core, self.env, self.context.get_global_batch_size()
        )

    self.steps_completed = self.env.steps_completed

    if self.env.latest_checkpoint is not None:
        with self.context._core.checkpoint.restore_path(
            self.env.latest_checkpoint
        ) as load_path:
            self.load(pathlib.Path(load_path))
def __init__(self, trial_inst: det.Trial, *args: Any, **kwargs: Any) -> None:
    super().__init__(*args, **kwargs)

    check.is_instance(
        trial_inst, PyTorchTrial, "PyTorchTrialController needs a PyTorchTrial"
    )
    self.trial = cast(PyTorchTrial, trial_inst)
    self.context = cast(pytorch.PyTorchTrialContext, self.context)
    self.context._set_determined_profiler(self.prof)
    if torch.cuda.is_available():
        self.prof._set_sync_device(self._sync_device)
    self.callbacks = self.trial.build_callbacks()

    check.gt_eq(
        len(self.context.models),
        1,
        "Must have at least one model. "
        "This might be caused by not wrapping your model with wrap_model()",
    )
    check.gt_eq(
        len(self.context.optimizers),
        1,
        "Must have at least one optimizer. "
        "This might be caused by not wrapping your optimizer with wrap_optimizer()",
    )
    self._check_evaluate_implementation()

    self.wlsq = None  # type: Optional[layers.WorkloadSequencer]
    if self.workloads is None:
        self.workloads, self.wlsq = layers.make_compatibility_workloads(
            self.context._core,
            self.env,
            self.context.get_global_batch_size(),
        )

    self.steps_completed = self.env.steps_completed

    # Currently only horovod and torch backends are supported for distributed training.
    if self.context.distributed.size > 1:
        assert (
            self.use_horovod or self.use_torch
        ), "Must use horovod or torch for distributed training"
def __init__(
    self,
    model: tf.keras.models.Model,
    session: tf.compat.v1.Session,
    train_config: keras.TFKerasTrainConfig,
    trial: "TFKerasTrial",
    *args: Any,
    **kwargs: Any,
) -> None:
    super().__init__(*args, **kwargs)

    self.model = model
    self.session = session
    self.trial = trial

    # Configure optimizers, done for backwards compatibility.
    self.context._select_optimizers()

    keras._check_if_aggregation_frequency_will_work(
        model=self.model,
        use_horovod=self.use_horovod,
        aggregation_frequency=self.context._aggregation_frequency,
    )

    self.training_data = train_config.training_data
    self.validation_data = train_config.validation_data

    # Support the deprecated SequenceAdapter API.
    if isinstance(self.training_data, keras.SequenceAdapter):
        self.context._configure_fit(
            workers=self.training_data.workers,
            use_multiprocessing=self.training_data.use_multiprocessing,
            max_queue_size=self.training_data.max_queue_size,
        )
        # Use the provided Sequence directly.
        self.training_data = self.training_data.sequence
    if isinstance(self.validation_data, keras.SequenceAdapter):
        # Ignore these settings and use the same settings as for the fit call.
        self.validation_data = self.validation_data.sequence

    if self.context.distributed.size > 1:
        assert self.use_horovod, (
            "TF Keras trial must be launched with a horovod backend if "
            "doing distributed training"
        )

    self._check_training_data()
    self._check_validation_data()

    self.enqueuers = []  # type: List[keras._Enqueuer]

    self.wlsq = None  # type: Optional[layers.WorkloadSequencer]
    if self.workloads is None:
        self.workloads, self.wlsq = layers.make_compatibility_workloads(
            self.context._core,
            self.env,
            self.context.get_global_batch_size(),
        )

    # If a load path is provided, load weights and restore the data location.
    self.multiplexer_load_state = None  # type: Optional[Dict]
    if self.env.latest_checkpoint is not None:
        logging.info(f"Restoring trial from checkpoint {self.env.latest_checkpoint}")
        with self.context._core.checkpoint.restore_path(
            self.env.latest_checkpoint
        ) as load_path:
            self._load(load_path)

    self._configure_callbacks(train_config.callbacks)

    self.train_response_func = None  # type: Optional[workload.ResponseFunc]
    self.train_workload_metrics = []  # type: List[Dict[str, Any]]
    self.train_workload_batches = 0
    self.train_workload_inputs = 0
    self.train_workload_len = 0
    self.test_inputs = 0

    self.steps_completed = self.env.steps_completed
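
# A minimal sketch (not part of the source above) of the fallback that every controller in
# this section repeats: when no workload stream is supplied, build one with
# layers.make_compatibility_workloads() and keep a handle to the WorkloadSequencer it
# returns. The helper name `_setup_compatibility_workloads` and its parameters are
# illustrative assumptions, not existing API.
def _setup_compatibility_workloads(controller: Any, global_batch_size: int) -> None:
    controller.wlsq = None  # type: Optional[layers.WorkloadSequencer]
    if controller.workloads is None:
        # Only controllers that generate their own workloads hold on to the sequencer.
        controller.workloads, controller.wlsq = layers.make_compatibility_workloads(
            controller.context._core, controller.env, global_batch_size
        )
    controller.steps_completed = controller.env.steps_completed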