def make_workloads() -> workload.Stream:
    """Drive training steps whose batch count equals the step id.

    For step ids 1 through 9 (``range(1, 10)``), a training workload with
    ``num_batches`` set to the step id is sent through a response
    interceptor, and the reply is asserted to contain exactly that many
    batch metrics. A terminate workload is yielded after the last step.
    """
    training_metrics = []
    responses = workload.WorkloadResponseInterceptor()
    total_steps = 10
    total_batches_processed = 0

    # range() excludes ``total_steps`` itself, so the final step id is 9.
    for step_id in range(1, total_steps):
        # Each step trains for as many batches as its own id.
        expected_batches = step_id
        train = workload.train_workload(
            step_id,
            num_batches=expected_batches,
            total_batches_processed=total_batches_processed,
        )
        yield from responses.send(train, [])

        reply = responses.metrics_result()
        per_batch = reply["metrics"]["batch_metrics"]
        assert len(per_batch) == expected_batches, "did not run for expected num_batches"

        training_metrics.extend(per_batch)
        total_batches_processed += expected_batches

    yield workload.terminate_workload(), [], workload.ignore_workload_response
def make_workloads1() -> workload.Stream: nonlocal controller yield workload.train_workload( 1, 1, 0), [], workload.ignore_workload_response assert controller is not None, "controller was never set!" assert controller.trial.counter.__dict__ == { "validation_steps_started": 0, "validation_steps_ended": 0, "checkpoints_ended": 0, } yield workload.validation_workload( ), [], workload.ignore_workload_response assert controller.trial.counter.__dict__ == { "validation_steps_started": 1, "validation_steps_ended": 1, "checkpoints_ended": 0, } yield workload.checkpoint_workload(), [ checkpoint_dir ], workload.ignore_workload_response assert controller.trial.counter.__dict__ == { "validation_steps_started": 1, "validation_steps_ended": 1, "checkpoints_ended": 1, } yield workload.terminate_workload( ), [], workload.ignore_workload_response
def fake_workload_gen() -> Generator[workload.Workload, None, None]:
    """Yield ``NUM_FAKE_WORKLOADS - 1`` train workloads, then one validation.

    Train workloads are numbered from 1, each with ``num_batches=1`` and
    ``total_batches_processed`` one less than its number. The final
    validation workload receives the last loop value minus one.
    """
    # Generate some fake workloads.
    for step_id in range(1, NUM_FAKE_WORKLOADS):
        yield workload.train_workload(
            step_id,
            num_batches=1,
            total_batches_processed=step_id - 1,
        )

    # Uses the loop variable after the loop, so at least one train workload
    # must have been generated for this line to succeed.
    yield workload.validation_workload(step_id - 1)
def make_test_workloads(
    checkpoint_dir: pathlib.Path, config: det.ExperimentConfig
) -> workload.Stream:
    """Yield the workloads for a small test experiment, printing progress.

    The sequence is: one train workload (asserting that the number of batch
    metrics equals ``config.batches_per_step()``), one validation workload,
    one checkpoint workload given ``checkpoint_dir``, and a terminate
    workload.

    Args:
        checkpoint_dir: Path passed along with the checkpoint workload.
        config: Experiment config; only ``batches_per_step()`` is read here.
    """
    print("Start training a test experiment.")
    responses = workload.WorkloadResponseInterceptor()

    # --- Training ---
    print("Training 1 step.")
    train = workload.train_workload(1)
    yield from responses.send(train, [config.batches_per_step()])
    batch_metrics = responses.metrics_result()["batch_metrics"]
    check.eq(len(batch_metrics), config.batches_per_step())
    print(f"Finished training. Metrics: {batch_metrics}")

    # --- Validation ---
    print("Validating.")
    validate = workload.validation_workload(1)
    yield from responses.send(validate, [])
    v_metrics = responses.metrics_result()["validation_metrics"]
    print(f"Finished validating. Validation metrics: {v_metrics}")

    # --- Checkpoint ---
    print(f"Saving a checkpoint to {checkpoint_dir}")
    checkpoint = workload.checkpoint_workload()
    yield checkpoint, [checkpoint_dir], workload.ignore_workload_response
    print(f"Finished saving a checkpoint to {checkpoint_dir}.")

    # --- Shutdown ---
    yield workload.terminate_workload(), [], workload.ignore_workload_response
    print("The test experiment passed.")
def make_workloads() -> workload.Stream: nonlocal w interceptor = workload.WorkloadResponseInterceptor() for idx, batch in batches: yield from interceptor.send(workload.train_workload(1), [1]) metrics = interceptor.metrics_result() # Calculate what the loss should be. loss = trial_class.calc_loss(w, batch) assert metrics["avg_metrics"]["loss"] == pytest.approx(loss) # Update what the weight should be. w = w - hparams["learning_rate"] * trial_class.calc_gradient( w, batch) if test_checkpointing and idx == 3: # Checkpoint and let the next TrialController finish the work.l yield workload.checkpoint_workload(), [ checkpoint_dir ], workload.ignore_workload_response break yield workload.terminate_workload( ), [], workload.ignore_workload_response
def make_workloads() -> workload.Stream: nonlocal w interceptor = workload.WorkloadResponseInterceptor() for idx, batch in batches: yield from interceptor.send(workload.train_workload(1)) metrics = interceptor.metrics_result() # Calculate what the loss should be. loss = trial_class.calc_loss(w, batch) epsilon = 0.0001 assert abs(metrics["metrics"]["avg_metrics"]["loss"] - loss) < epsilon # Update what the weight should be. w = w - hparams["learning_rate"] * trial_class.calc_gradient( w, batch) if test_checkpointing and idx == 3: # Checkpoint and let the next TrialController finish the work. interceptor = workload.WorkloadResponseInterceptor() yield from interceptor.send(workload.checkpoint_workload()) nonlocal latest_checkpoint, steps_completed latest_checkpoint = interceptor.metrics_result()["uuid"] # steps_completed is unused, but can't be 0. steps_completed = 1 break
def send(
    self,
    steps: int,
    validation_freq: int,
    initial_step_id: int = 1,
    batches_per_step: int = 1,
) -> workload.Stream:
    """Yield train workloads, with a validation every ``validation_freq`` steps.

    Resets ``self._training_metrics`` and ``self._validation_metrics``, then
    collects the batch metrics of every training step and the validation
    metrics of every validation into them.

    Args:
        steps: Number of consecutive step ids to run.
        validation_freq: A validation workload follows each step whose id is
            a multiple of this value.
        initial_step_id: Id of the first step.
        batches_per_step: Passed alongside each train workload; each train
            reply must contain exactly this many batch metrics.
    """
    self._training_metrics = []
    self._validation_metrics = []
    responses = workload.WorkloadResponseInterceptor()

    end_step_id = initial_step_id + steps
    for step_id in range(initial_step_id, end_step_id):
        train = workload.train_workload(step_id)
        yield from responses.send(train, [batches_per_step])

        per_batch = responses.metrics_result()["batch_metrics"]
        assert len(per_batch) == batches_per_step
        self._training_metrics.extend(per_batch)

        # Only steps that are a multiple of validation_freq are validated.
        if step_id % validation_freq != 0:
            continue

        validate = workload.validation_workload(step_id)
        yield from responses.send(validate, [])
        validation_reply = responses.metrics_result()
        self._validation_metrics.append(validation_reply["validation_metrics"])
def _make_local_execution_env(
    managed_training: bool,
    test_mode: bool,
    config: Optional[Dict[str, Any]],
    hparams: Optional[Dict[str, Any]] = None,
    limit_gpus: Optional[int] = None,
) -> Tuple[det.EnvContext, det.RendezvousInfo, horovod.HorovodContext]:
    """Build the environment objects used for local execution.

    Args:
        managed_training: Forwarded to the experiment-config builder and to
            ``det.EnvContext``.
        test_mode: Forwarded to the experiment-config builder and to
            ``det.EnvContext``.
        config: Raw experiment config passed to
            ``_make_local_execution_exp_config``.
        hparams: Hyperparameter values; when falsy, values are generated
            from the config's ``"hyperparameters"`` entry.
        limit_gpus: Forwarded to ``_get_gpus``.

    Returns:
        A tuple ``(env, rendezvous_info, hvd_config)``.
    """
    raw_config = _make_local_execution_exp_config(
        config, managed_training=managed_training, test_mode=test_mode
    )
    exp_config = det.ExperimentConfig(raw_config)

    if not hparams:
        hparams = api.generate_random_hparam_values(exp_config.get("hyperparameters", {}))

    use_gpu, container_gpus, slot_ids = _get_gpus(limit_gpus)

    # Two consecutive ports, comma-separated, starting at the local default.
    local_rendezvous_ports = (
        f"{constants.LOCAL_RENDEZVOUS_PORT},{constants.LOCAL_RENDEZVOUS_PORT+1}"
    )

    first_workload = workload.train_workload(1, 1, 1, exp_config.scheduling_unit())
    env = det.EnvContext(
        master_addr="",
        master_port=0,
        use_tls=False,
        master_cert_file=None,
        master_cert_name=None,
        container_id="",
        experiment_config=exp_config,
        hparams=hparams,
        initial_workload=first_workload,
        latest_checkpoint=None,
        use_gpu=use_gpu,
        container_gpus=container_gpus,
        slot_ids=slot_ids,
        debug=exp_config.debug_enabled(),
        workload_manager_type="",
        det_rendezvous_ports=local_rendezvous_ports,
        det_trial_unique_port_offset=0,
        det_trial_runner_network_interface=(
            constants.AUTO_DETECT_TRIAL_RUNNER_NETWORK_INTERFACE
        ),
        det_trial_id="",
        det_experiment_id="",
        det_cluster_id="",
        trial_seed=exp_config.experiment_seed(),
        managed_training=managed_training,
        test_mode=test_mode,
        on_cluster=False,
    )

    rendezvous_ports = env.rendezvous_ports()
    primary_addrs = [f"0.0.0.0:{rendezvous_ports[0]}"]
    secondary_addrs = [f"0.0.0.0:{rendezvous_ports[1]}"]
    rendezvous_info = det.RendezvousInfo(addrs=primary_addrs, addrs2=secondary_addrs, rank=0)

    hvd_config = horovod.HorovodContext.from_configs(
        env.experiment_config, rendezvous_info, env.hparams
    )
    return env, rendezvous_info, hvd_config
def make_workloads1() -> workload.Stream: nonlocal controller assert controller.trial.counter.trial_startups == 1 yield workload.train_workload(1, 1, 0, 4), workload.ignore_workload_response assert controller is not None, "controller was never set!" assert controller.trial.counter.__dict__ == { "trial_startups": 1, "validation_steps_started": 0, "validation_steps_ended": 0, "checkpoints_ended": 0, "training_started_times": 1, "training_epochs_started": 2, "training_epochs_ended": 2, "trial_shutdowns": 0, } assert controller.trial.legacy_counter.__dict__ == { "legacy_on_training_epochs_start_calls": 2 } yield workload.validation_workload( ), workload.ignore_workload_response assert controller.trial.counter.__dict__ == { "trial_startups": 1, "validation_steps_started": 1, "validation_steps_ended": 1, "checkpoints_ended": 0, "training_started_times": 1, "training_epochs_started": 2, "training_epochs_ended": 2, "trial_shutdowns": 0, } assert controller.trial.legacy_counter.__dict__ == { "legacy_on_training_epochs_start_calls": 2 } interceptor = workload.WorkloadResponseInterceptor() yield from interceptor.send(workload.checkpoint_workload()) nonlocal latest_checkpoint, steps_completed latest_checkpoint = interceptor.metrics_result()["uuid"] steps_completed = 1 assert controller.trial.counter.__dict__ == { "trial_startups": 1, "validation_steps_started": 1, "validation_steps_ended": 1, "checkpoints_ended": 1, "training_started_times": 1, "training_epochs_started": 2, "training_epochs_ended": 2, "trial_shutdowns": 0, } assert controller.trial.legacy_counter.__dict__ == { "legacy_on_training_epochs_start_calls": 2 }
def send(
    self,
    steps: int,
    validation_freq: int,
    initial_step_id: int = 1,
    scheduling_unit: int = 1,
) -> workload.Stream:
    """Yield train workloads, with a validation every ``validation_freq`` steps.

    Resets ``self._training_metrics``, ``self._avg_training_metrics`` and
    ``self._validation_metrics`` and fills them from the replies. The loop
    ends early once a reply has a truthy ``"stop_requested"``; this is only
    allowed on the step equal to ``self.request_stop_step_id``, and that
    step is required to request a stop.

    Args:
        steps: Number of consecutive step ids to run.
        validation_freq: A validation workload follows each step whose id is
            a multiple of this value.
        initial_step_id: Id of the first step.
        scheduling_unit: ``num_batches`` of every train workload; each train
            reply must contain exactly this many batch metrics.
    """
    self._training_metrics = []
    self._avg_training_metrics = []
    self._validation_metrics = []

    batches_so_far = 0
    responses = workload.WorkloadResponseInterceptor()
    end_step_id = initial_step_id + steps

    for step_id in range(initial_step_id, end_step_id):
        stop_requested = False

        # --- Training ---
        train = workload.train_workload(
            step_id,
            num_batches=scheduling_unit,
            total_batches_processed=batches_so_far,
        )
        yield from responses.send(train, [])
        train_reply = responses.metrics_result()

        per_batch = train_reply["metrics"]["batch_metrics"]
        assert len(per_batch) == scheduling_unit
        self._training_metrics.extend(per_batch)
        self._avg_training_metrics.append(train_reply["metrics"]["avg_metrics"])
        batches_so_far += scheduling_unit

        if train_reply["stop_requested"]:
            assert step_id == self.request_stop_step_id
            stop_requested = True

        # --- Validation (only on multiples of validation_freq) ---
        if step_id % validation_freq == 0:
            validate = workload.validation_workload(
                step_id, total_batches_processed=batches_so_far
            )
            yield from responses.send(validate, [])
            validation_reply = responses.metrics_result()

            self._validation_metrics.append(
                validation_reply["metrics"]["validation_metrics"]
            )

            if validation_reply["stop_requested"]:
                assert step_id == self.request_stop_step_id
                stop_requested = True

        if stop_requested:
            break

        # No stop was requested, so this must not be the designated stop step.
        assert step_id != self.request_stop_step_id
def _make_local_test_experiment_env(
    checkpoint_dir: pathlib.Path,
    config: Optional[Dict[str, Any]],
    hparams: Optional[Dict[str, Any]] = None,
) -> Tuple[det.EnvContext, workload.Stream, det.RendezvousInfo, horovod.HorovodContext]:
    """Build the environment objects and workload stream for a local test run.

    Args:
        checkpoint_dir: Directory whose ``"checkpoint"`` child path is handed
            to ``_make_test_workloads``.
        config: Raw experiment config passed to
            ``_make_local_test_experiment_config``.
        hparams: Hyperparameter values; when falsy, values come from
            ``_generate_test_hparam_values``.

    Returns:
        A tuple ``(env, workloads, rendezvous_info, hvd_config)``.
    """
    exp_config = det.ExperimentConfig(_make_local_test_experiment_config(config))

    if not hparams:
        hparams = _generate_test_hparam_values(exp_config)

    use_gpu, container_gpus, slot_ids = _get_gpus()

    # Two consecutive ports, comma-separated, starting at the local default.
    local_rendezvous_ports = (
        f"{constants.LOCAL_RENDEZVOUS_PORT},{constants.LOCAL_RENDEZVOUS_PORT+1}"
    )

    first_workload = workload.train_workload(1, 1, 1, exp_config.batches_per_step())
    env = det.EnvContext(
        master_addr="",
        master_port=1,
        container_id="test_mode",
        experiment_config=exp_config,
        hparams=hparams,
        initial_workload=first_workload,
        latest_checkpoint=None,
        use_gpu=use_gpu,
        container_gpus=container_gpus,
        slot_ids=slot_ids,
        debug=exp_config.debug_enabled(),
        workload_manager_type="",
        det_rendezvous_ports=local_rendezvous_ports,
        det_trial_runner_network_interface=(
            constants.AUTO_DETECT_TRIAL_RUNNER_NETWORK_INTERFACE
        ),
        det_trial_id="1",
        det_experiment_id="1",
        det_cluster_id="test_mode",
        trial_seed=exp_config.experiment_seed(),
    )

    checkpoint_path = checkpoint_dir.joinpath("checkpoint")
    workloads = _make_test_workloads(checkpoint_path, exp_config)

    rendezvous_ports = env.rendezvous_ports()
    primary_addrs = [f"0.0.0.0:{rendezvous_ports[0]}"]
    secondary_addrs = [f"0.0.0.0:{rendezvous_ports[1]}"]
    rendezvous_info = det.RendezvousInfo(addrs=primary_addrs, addrs2=secondary_addrs, rank=0)

    hvd_config = horovod.HorovodContext.from_configs(
        env.experiment_config, rendezvous_info, env.hparams
    )
    return env, workloads, rendezvous_info, hvd_config
def _make_test_workloads(config: det.ExperimentConfig) -> workload.Stream:
    """Yield a train, a validation and a checkpoint workload, logging progress.

    The train reply must contain as many batch metrics as
    ``config.scheduling_unit()`` (checked with ``check.eq``).

    Args:
        config: Experiment config; only ``scheduling_unit()`` is read here.
    """
    responses = workload.WorkloadResponseInterceptor()

    # --- Training ---
    logging.info("Training one batch")
    yield from responses.send(workload.train_workload(1))
    batch_metrics = responses.metrics_result()["metrics"]["batch_metrics"]
    check.eq(len(batch_metrics), config.scheduling_unit())
    logging.info(f"Finished training, metrics: {batch_metrics}")

    # --- Validation ---
    logging.info("Validating one batch")
    yield from responses.send(workload.validation_workload(1))
    v_metrics = responses.metrics_result()["metrics"]["validation_metrics"]
    logging.info(f"Finished validating, validation metrics: {v_metrics}")

    # --- Checkpoint ---
    logging.info("Saving a checkpoint.")
    checkpoint = workload.checkpoint_workload()
    yield checkpoint, workload.ignore_workload_response
    logging.info("Finished saving a checkpoint.")
def send(
    self,
    steps: int,
    validation_freq: int,
    initial_step_id: int = 1,
    batches_per_step: int = 1,
) -> workload.Stream:
    """Yield train workloads, with a validation every ``validation_freq`` steps.

    Resets ``self._training_metrics`` and ``self._validation_metrics`` and
    fills them from the replies. The loop ends early once a reply has a
    truthy ``"stop_requested"``; this is only allowed on the step equal to
    ``self.request_stop_step_id``, and that step is required to request a
    stop.

    Args:
        steps: Number of consecutive step ids to run.
        validation_freq: A validation workload follows each step whose id is
            a multiple of this value.
        initial_step_id: Id of the first step.
        batches_per_step: Passed alongside each train workload; each train
            reply must contain exactly this many batch metrics.
    """
    self._training_metrics = []
    self._validation_metrics = []
    interceptor = workload.WorkloadResponseInterceptor()

    for step_id in range(initial_step_id, initial_step_id + steps):
        stop_requested = False

        # --- Training ---
        yield from interceptor.send(workload.train_workload(step_id), [batches_per_step])
        metrics = interceptor.metrics_result()
        batch_metrics = metrics["metrics"]["batch_metrics"]
        assert len(batch_metrics) == batches_per_step
        self._training_metrics.extend(batch_metrics)

        if metrics["stop_requested"]:
            assert step_id == self.request_stop_step_id
            stop_requested = True

        # --- Validation (only on multiples of validation_freq) ---
        if step_id % validation_freq == 0:
            yield from interceptor.send(workload.validation_workload(step_id), [])
            validation = interceptor.metrics_result()
            # The reply is no longer printed here: it was leftover debug
            # output written to stdout on every validation.
            v_metrics = validation["metrics"]["validation_metrics"]
            self._validation_metrics.append(v_metrics)

            if validation["stop_requested"]:
                assert step_id == self.request_stop_step_id
                stop_requested = True

        if stop_requested:
            break

        # No stop was requested, so this must not be the designated stop step.
        assert step_id != self.request_stop_step_id
def _make_test_workloads(
    checkpoint_dir: pathlib.Path, config: det.ExperimentConfig
) -> workload.Stream:
    """Yield train, validation, checkpoint and terminate workloads, logging progress.

    The train reply must contain as many batch metrics as
    ``config.scheduling_unit()`` (checked with ``check.eq``).

    Args:
        checkpoint_dir: Path passed along with the checkpoint workload.
        config: Experiment config; only ``scheduling_unit()`` is read here.
    """
    responses = workload.WorkloadResponseInterceptor()

    # --- Training ---
    logging.info("Training one batch")
    yield from responses.send(workload.train_workload(1), [])
    batch_metrics = responses.metrics_result()["metrics"]["batch_metrics"]
    check.eq(len(batch_metrics), config.scheduling_unit())
    logging.debug(f"Finished training, metrics: {batch_metrics}")

    # --- Validation ---
    logging.info("Validating one step")
    yield from responses.send(workload.validation_workload(1), [])
    v_metrics = responses.metrics_result()["metrics"]["validation_metrics"]
    logging.debug(f"Finished validating, validation metrics: {v_metrics}")

    # --- Checkpoint ---
    logging.info(f"Saving a checkpoint to {checkpoint_dir}.")
    checkpoint = workload.checkpoint_workload()
    yield checkpoint, [checkpoint_dir], workload.ignore_workload_response
    logging.info(f"Finished saving a checkpoint to {checkpoint_dir}.")

    # --- Shutdown ---
    yield workload.terminate_workload(), [], workload.ignore_workload_response
    logging.info("The test experiment passed.")
def make_workloads() -> workload.Stream:
    """Yield a 100-batch train workload followed by a checkpoint workload.

    The train workload is paired with ``workload.ignore_workload_response``;
    the checkpoint workload is paired with ``checkpoint_response_func`` from
    the enclosing scope.
    """
    train = workload.train_workload(1, num_batches=100)
    yield train, [], workload.ignore_workload_response

    checkpoint = workload.checkpoint_workload()
    yield checkpoint, [], checkpoint_response_func
def make_workloads() -> workload.Stream:
    """Yield a 100-batch train workload followed by a validation workload.

    Both workloads are paired with an empty argument list and
    ``workload.ignore_workload_response``.
    """
    train = workload.train_workload(1, num_batches=100)
    yield train, [], workload.ignore_workload_response

    validate = workload.validation_workload()
    yield validate, [], workload.ignore_workload_response
def fake_workload_gen() -> Generator[workload.Workload, None, None]:
    """Yield ``NUM_FAKE_WORKLOADS - 1`` train workloads, then one validation.

    Train workloads are numbered from 1. The final validation workload
    receives the last loop value minus one.
    """
    # Generate some fake workloads.
    for step_id in range(1, NUM_FAKE_WORKLOADS):
        yield workload.train_workload(step_id)

    # Uses the loop variable after the loop, so at least one train workload
    # must have been generated for this line to succeed.
    yield workload.validation_workload(step_id - 1)
def make_workloads2() -> workload.Stream:
    """Yield a single train workload paired with ``workload.ignore_workload_response``."""
    train = workload.train_workload(1, 1, 0)
    yield train, workload.ignore_workload_response