def test_launch_layer_cifar(collect_trial_profiles: Callable[[int], None]) -> None:
    config = conf.load_config(conf.cv_examples_path("cifar10_pytorch/const.yaml"))
    config = conf.set_max_length(config, {"batches": 200})
    config = conf.set_slots_per_trial(config, 1)
    config = conf.set_profiling_enabled(config)
    config = conf.set_entrypoint(
        config,
        "python3 -m determined.launch.horovod --autohorovod --trial model_def:CIFARTrial",
    )

    experiment_id = exp.run_basic_test_with_temp_config(
        config, conf.cv_examples_path("cifar10_pytorch"), 1
    )
    trials = exp.experiment_trials(experiment_id)
    (
        Determined(conf.make_master_url())
        .get_trial(trials[0].trial.id)
        .select_checkpoint(latest=True)
        .load(map_location="cpu")
    )
    collect_trial_profiles(trials[0].trial.id)

    assert exp.check_if_string_present_in_trial_logs(
        trials[0].trial.id,
        "allocation stopped after resources exited successfully with a zero exit code",
    )


def test_epoch_sync(num_workers: int, global_batch_size: int, dataset_len: int) -> None:
    """
    Test that epoch_idx is synchronized across all workers regardless of whether the
    number of batches is evenly divisible by the number of workers.
    """
    config = conf.load_config(conf.fixtures_path("pytorch_no_op/const.yaml"))
    config = conf.set_slots_per_trial(config, num_workers)
    max_len_batches = 10
    config = conf.set_max_length(config, {"batches": max_len_batches})
    config = conf.set_hparam(config, "dataset_len", dataset_len)
    config = conf.set_global_batch_size(config, global_batch_size)

    e_id = exp.run_basic_test_with_temp_config(config, conf.fixtures_path("pytorch_no_op"), 1)
    t_id = exp.experiment_trials(e_id)[0].trial.id

    # Ceiling division: a trailing partial batch still counts toward the epoch.
    batches_per_epoch = (dataset_len + global_batch_size - 1) // global_batch_size

    for batch_idx in range(max_len_batches):
        epoch_idx = batch_idx // batches_per_epoch
        for rank in range(config["resources"]["slots_per_trial"]):
            assert exp.check_if_string_present_in_trial_logs(
                t_id, f"rank {rank} finished batch {batch_idx} in epoch {epoch_idx}"
            )


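# Illustrative sketch only, not part of the test suite above: it spells out the
# ceiling-division arithmetic that test_epoch_sync relies on when mapping a batch index
# to an epoch index. The helper name `_expected_epoch_for_batch` is hypothetical.
def _expected_epoch_for_batch(batch_idx: int, dataset_len: int, global_batch_size: int) -> int:
    """Return the epoch a batch falls in when the last batch of an epoch may be partial.

    For example, dataset_len=32 and global_batch_size=10 gives batches_per_epoch=4
    (the fourth batch holds only 2 records), so batches 0-3 land in epoch 0 and
    batches 4-7 land in epoch 1.
    """
    batches_per_epoch = (dataset_len + global_batch_size - 1) // global_batch_size
    return batch_idx // batches_per_epoch

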
def test_on_trial_close_callback() -> None:
    config = conf.load_config(conf.fixtures_path("estimator_no_op/const.yaml"))
    config = conf.set_slots_per_trial(config, 8)
    config = conf.set_max_length(config, {"batches": 3})

    exp_id = exp.run_basic_test_with_temp_config(
        config, conf.fixtures_path("estimator_no_op"), 1
    )

    assert exp.check_if_string_present_in_trial_logs(
        exp.experiment_trials(exp_id)[0].trial.id, "rank 0 has completed on_trial_close"
    )


def test_distributed_logging() -> None:
    config = conf.load_config(conf.fixtures_path("pytorch_no_op/const.yaml"))
    config = conf.set_slots_per_trial(config, 8)
    config = conf.set_max_length(config, {"batches": 1})

    e_id = exp.run_basic_test_with_temp_config(config, conf.fixtures_path("pytorch_no_op"), 1)
    t_id = exp.experiment_trials(e_id)[0].trial.id

    # Every worker rank should report finishing its train_batch in the trial logs.
    for i in range(config["resources"]["slots_per_trial"]):
        assert exp.check_if_string_present_in_trial_logs(
            t_id, "finished train_batch for rank {}".format(i)
        )


def test_launch_layer_exit(collect_trial_profiles: Callable[[int], None]) -> None:
    config = conf.load_config(conf.cv_examples_path("cifar10_pytorch/const.yaml"))
    # Point the entrypoint at a launch module that does not exist so the container fails.
    config = conf.set_entrypoint(
        config, "python3 -m nonexistent_launch_module model_def:CIFARTrial"
    )

    experiment_id = exp.run_failure_test_with_temp_config(
        config, conf.cv_examples_path("cifar10_pytorch")
    )
    trials = exp.experiment_trials(experiment_id)
    Determined(conf.make_master_url()).get_trial(trials[0].trial.id)
    collect_trial_profiles(trials[0].trial.id)

    assert exp.check_if_string_present_in_trial_logs(
        trials[0].trial.id, "container failed with non-zero exit code: 1"
    )