def test_no_failure_with_torch_mp(out_dir):
    shutil.rmtree(out_dir, ignore_errors=True)
    path = build_json(out_dir, save_all=True, save_interval="1")
    path = str(path)
    os.environ["SMDEBUG_CONFIG_FILE_PATH"] = path
    device = "cpu"
    dataloader_kwargs = {}
    cpu_count = 2 if mp.cpu_count() > 2 else mp.cpu_count()

    torch.manual_seed(1)
    model = Net().to(device)
    # Gradients are allocated lazily, so they are not shared here.
    model.share_memory()

    # Train the model across `cpu_count` processes.
    processes = []
    for rank in range(cpu_count):
        p = mp.Process(target=train, args=(rank, model, device, dataloader_kwargs))
        p.start()
        processes.append(p)
    for p in processes:
        p.join()

    trial = create_trial(out_dir)
    assert trial.num_workers == 1  # Ensure only one worker saved data
    assert len(trial.tensor_names()) > 20  # Ensure that data was saved
    assert trial.steps() == [0, 1, 2, 3]  # Ensure that steps were saved
    shutil.rmtree(out_dir, ignore_errors=True)
    shutil.rmtree(data_dir, ignore_errors=True)
def mode_allworkers_saveall(out_dir, mode): path = build_json(out_dir, include_workers="all", save_all=True, include_collections=["weights", "gradients"]) num_workers = len(get_available_gpus()) mode_args = list(HOROVOD_ESTIMATOR_TEST_SCRIPT_ARGS) + [ "--model_dir", os.path.join(out_dir, "checkpoint"), ] if mode == "cpu": mode_args += ["--use_only_cpu", "true"] launch_horovod_job( script_file_path= f"examples/tensorflow/sagemaker_official_container/{HOROVOD_ESTIMATOR_TEST_SCRIPT_PATH}", script_args=mode_args, num_workers=num_workers, config_file_path=path, mode=mode, ) tr = create_trial(out_dir) assert len(tr.workers()) == num_workers assert len(tr.tensor_names()) > 99 assert len(tr.tensor( tr.tensor_names(collection="weights")[0]).workers(0)) == num_workers assert len(tr.tensor( tr.tensor_names(collection="losses")[0]).workers(0)) == num_workers
def basic_test(out_dir, mode):
    path = build_json(out_dir, include_workers="one", include_collections=["weights", "gradients"])
    num_workers = len(get_available_gpus())
    mode_args = list(HOROVOD_MNIST_ARGS) + ["--model_dir", os.path.join(out_dir, "checkpoint")]
    if mode == "cpu":
        mode_args += ["--use_only_cpu", "true"]
    launch_horovod_job(
        script_file_path=f"examples/tensorflow/sagemaker_official_container/{HOROVOD_MNIST_SCRIPT_NAME}",
        script_args=mode_args,
        num_workers=num_workers,
        config_file_path=path,
        mode=mode,
    )
    tr = create_trial(out_dir)
    print(tr.tensor_names())
    assert len(tr.workers()) == 1
    assert len(tr.tensor_names()) == 13
    assert len(tr.tensor(tr.tensor_names(collection="weights")[0]).workers(0)) == 1
def test_training_with_no_grad_updates():
    temp_dir = TemporaryDirectory().name
    path = build_json(temp_dir, include_collections=["losses"], save_interval="1")
    os.environ["SMDEBUG_CONFIG_FILE_PATH"] = str(path)
    do_training()
    trial = create_trial(temp_dir)
    assert len(trial.steps()) == 99
def mode_allworkers_saveall(out_dir, mode): path = build_json(out_dir, include_workers="all", save_all=True) num_workers = len(get_available_gpus()) mode_args = ["--model_dir", out_dir] launch_smdataparallel_job( script_file_path=SMDATAPARALLEL_TF2_TEST_MNIST_SCRIPT, script_args=mode_args, num_workers=num_workers, config_file_path=path, mode=mode, ) tr = create_trial(out_dir) assert len(tr.workers()) == num_workers assert len(tr.tensor_names()) == 35 assert len(tr.tensor( tr.tensor_names(collection="weights")[0]).workers(0)) == num_workers assert len(tr.tensor("loss").workers(0)) == num_workers
def mode_allworkers_saveall(out_dir, mode): path = build_json(out_dir, include_workers="all", save_all=True) num_workers = 1 if bool(device_count()) is False else device_count() mode_args = list(SMDATAPARALLEL_PYTORCH_TEST_MNIST_ARGS) launch_smdataparallel_job( script_file_path=SMDATAPARALLEL_PYTORCH_TEST_MNIST_SCRIPT, script_args=mode_args, num_workers=num_workers, config_file_path=path, mode=mode, ) tr = create_trial(out_dir) assert len(tr.workers()) == num_workers assert len(tr.tensor_names()) > 25 assert len(tr.tensor( tr.tensor_names(collection="weights")[0]).workers(0)) == num_workers assert len(tr.tensor( tr.tensor_names(collection="losses")[0]).workers(0)) == num_workers
def mode_allworkers_default_collections(out_dir, mode):
    path = build_json(out_dir, include_workers="all", include_collections=TF_DEFAULT_SAVED_COLLECTIONS)
    num_workers = len(get_available_gpus())
    mode_args = ["--model_dir", out_dir]
    launch_smdataparallel_job(
        script_file_path=SMDATAPARALLEL_TF2_TEST_MNIST_SCRIPT,
        script_args=mode_args,
        num_workers=num_workers,
        config_file_path=path,
        mode=mode,
    )
    tr = create_trial(out_dir)
    assert len(tr.workers()) == num_workers
    assert len(tr.tensor_names()) == 1
    assert len(tr.tensor(tr.tensor_names(collection="losses")[0]).workers(0)) == num_workers
def mode_allworkers(out_dir, mode): path = build_json(out_dir, include_workers="all", include_collections=["weights", "optimizer_variables"]) num_workers = len(get_available_gpus()) mode_args = ["--model_dir", out_dir] launch_smdataparallel_job( script_file_path=SMDATAPARALLEL_TF2_TEST_MNIST_SCRIPT, script_args=mode_args, num_workers=num_workers, config_file_path=path, mode=mode, ) tr = create_trial(out_dir) assert len(tr.workers()) == num_workers print("tensor names: ", tr.tensor_names()) assert len(tr.tensor_names()) == 5 assert len(tr.tensor( tr.tensor_names(collection="weights")[0]).workers(0)) == num_workers
def test_no_failure_with_torch_mp(out_dir):
    shutil.rmtree(out_dir, ignore_errors=True)
    shutil.rmtree(data_dir, ignore_errors=True)

    print("Downloading the MNIST dataset")
    os.system(f"mkdir {data_dir}")
    s3_client = boto3.client("s3")
    s3_client.download_file(
        "smdebug-testing", "datasets/MNIST_pytorch.tar.gz", f"{data_dir}/MNIST_pytorch.tar.gz"
    )
    os.system(f"tar -zxf {data_dir}/MNIST_pytorch.tar.gz")
    os.system(f"mv MNIST {data_dir}")

    path = build_json(out_dir, save_all=True, save_interval="1")
    path = str(path)
    os.environ["SMDEBUG_CONFIG_FILE_PATH"] = path
    device = "cpu"
    dataloader_kwargs = {}
    cpu_count = 2 if mp.cpu_count() > 2 else mp.cpu_count()

    torch.manual_seed(1)
    model = Net().to(device)
    # Gradients are allocated lazily, so they are not shared here.
    model.share_memory()

    # Train the model across `cpu_count` processes.
    processes = []
    print(f"Starting the training with {cpu_count} processes")
    for rank in range(cpu_count):
        p = mp.Process(target=train, args=(rank, model, device, dataloader_kwargs))
        p.start()
        processes.append(p)
    for p in processes:
        p.join()
    print("Finished the training.")

    trial = create_trial(out_dir)
    assert trial.num_workers == 1  # Ensure only one worker saved data
    assert len(trial.tensor_names()) > 20  # Ensure that data was saved
    assert trial.steps() == [0, 1, 2, 3]  # Ensure that steps were saved
    shutil.rmtree(out_dir, ignore_errors=True)
    shutil.rmtree(data_dir, ignore_errors=True)
def mode_allworkers(out_dir, mode): path = build_json(out_dir, include_workers="all", include_collections=["weights", "optimizer_variables"]) num_workers = len(get_available_gpus()) mode_args = list(HOROVOD_KERAS_TEST_SCRIPT_ARGS) + ["--model_dir", out_dir] if mode == "cpu": mode_args += ["--use_only_cpu", "true"] launch_horovod_job( script_file_path=HOROVOD_TF2_TEST_MNIST_SCRIPT, script_args=mode_args, num_workers=num_workers, config_file_path=path, mode=mode, ) tr = create_trial(out_dir) assert len(tr.workers()) == num_workers assert len(tr.tensor_names()) == (13 if is_tf_2_2() else 14) assert len(tr.tensor( tr.tensor_names(collection="weights")[0]).workers(0)) == num_workers
def mode_allworkers_saveall(out_dir, mode): path = build_json(out_dir, include_workers="all", save_all=True) num_workers = 1 if bool(device_count()) is False else device_count() mode_args = [] if mode == "cpu": mode_args += ["--use_only_cpu", "true"] launch_horovod_job( script_file_path=HOROVOD_PYTORCH_TEST_MNIST_SCRIPT, script_args=mode_args, num_workers=num_workers, config_file_path=path, mode=mode, ) tr = create_trial(out_dir) assert len(tr.workers()) == num_workers assert len(tr.tensor_names()) > 25 assert len(tr.tensor( tr.tensor_names(collection="weights")[0]).workers(0)) == num_workers assert len(tr.tensor( tr.tensor_names(collection="losses")[0]).workers(0)) == num_workers
def mode_one_worker(out_dir, mode):
    path = build_json(out_dir, include_workers="one", include_collections=["weights", "gradients"])
    num_workers = device_count()
    mode_args = []
    if mode == "cpu":
        mode_args += ["--use_only_cpu", "true"]
    launch_horovod_job(
        script_file_path=HOROVOD_PYTORCH_TEST_MNIST_SCRIPT,
        script_args=mode_args,
        num_workers=num_workers,
        config_file_path=path,
        mode=mode,
    )
    tr = create_trial(out_dir)
    # Only one worker is expected because the hook config was built with
    # include_workers="one".
    assert len(tr.workers()) == 1
    assert len(tr.tensor_names()) == 13
    assert len(tr.tensor(tr.tensor_names(collection="weights")[0]).workers(0)) == 1
    assert len(tr.tensor(tr.tensor_names(collection="losses")[0]).workers(0)) == 1
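
# Every test above follows the same pattern: a `build_json` helper (defined
# elsewhere in this test suite) writes an smdebug JSON config to disk, and the
# resulting path is exported via SMDEBUG_CONFIG_FILE_PATH so the hook picks it
# up. The function below is only a minimal illustrative sketch of that pattern,
# not the suite's actual helper: its name, signature, and the exact JSON keys
# ("LocalPath", "HookParameters", "CollectionConfigurations") are assumptions
# inferred from how build_json is called in the tests above.
import json
from pathlib import Path


def build_json_sketch(out_dir, include_workers="one", include_collections=None,
                      save_all=False, save_interval=None):
    """Hypothetical stand-in for the suite's build_json helper (sketch only)."""
    config = {
        "LocalPath": str(out_dir),  # directory where the hook writes tensors
        "HookParameters": {"include_workers": include_workers, "save_all": save_all},
        "CollectionConfigurations": [
            {"CollectionName": name} for name in (include_collections or [])
        ],
    }
    if save_interval is not None:
        config["HookParameters"]["save_interval"] = save_interval
    path = Path(out_dir) / "smdebug_config.json"
    path.parent.mkdir(parents=True, exist_ok=True)
    path.write_text(json.dumps(config))
    # Callers then do: os.environ["SMDEBUG_CONFIG_FILE_PATH"] = str(path)
    return path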