def test_distributed_training_from_master_node_use_mpi_with_slot_processes_per_host(
        log_script_invocation, chmod, stat, download_and_install, check_call,
        _wait_for_worker_nodes_to_start_sshd, _start_ssh_daemon, _change_hostname):
    """MPI training with 16 slots per host launches mpirun with 32 processes.

    Runs ``training.train`` on a two-host cluster with ``sagemaker_use_mpi``
    enabled and ``sagemaker_process_slots_per_host`` set to 16, then checks
    the hostname change, the SSH daemon start, the wait for worker nodes, the
    exact ``mpirun`` command line, and the chmod of ``/mpi_script.sh``.

    All parameters are presumably mocks injected by patch decorators that are
    not visible in this view -- TODO confirm.
    """
    cluster_hosts = ['algo-1', 'algo-2']
    training_env = mock_training_env(
        hosts=cluster_hosts, num_gpus=8, network_interface_name='foonet')
    hyperparameters = {
        'sagemaker_use_mpi': True,
        'sagemaker_process_slots_per_host': 16,
    }

    training.train(training_env, hyperparameters)

    # Two hosts with 16 slots each: '--host algo-1:16,algo-2:16' and '-np 32'.
    expected_mpirun_command = [
        'mpirun', '--allow-run-as-root',
        '--host', 'algo-1:16,algo-2:16',
        '-mca', 'btl_tcp_if_include', 'foonet',
        '-mca', 'oob_tcp_if_include', 'foonet',
        '-mca', 'btl', '^openib',
        '-x', 'PATH',
        '-x', 'LD_LIBRARY_PATH',
        '-x', 'LD_PRELOAD=/libchangehostname.so',
        '-mca', 'orte_abort_on_non_zero_status', '1',
        '-x', 'NCCL_DEBUG=INFO',
        '-x', 'NCCL_SOCKET_IFNAME=foonet',
        '-np', '32',
        '/mpi_script.sh',
    ]

    download_and_install.assert_called_with('s3://my/script')
    _change_hostname.assert_called_once_with('algo-1')
    _start_ssh_daemon.assert_called_once()
    _wait_for_worker_nodes_to_start_sshd.assert_called_once_with(cluster_hosts)
    check_call.assert_called_once_with(expected_mpirun_command)
    chmod.assert_called_with('/mpi_script.sh', stat().st_mode.__or__())
def test_distributed_training_from_worker_node_use_mpi_with_sagemaker_additional_mpi_options(
        log_script_invocation, socket, chmod, stat, download_and_install,
        check_call, sleep, _can_connect, popen, system):
    """Extra MPI options are appended to the mpirun command before the script.

    Runs ``training.train`` on a two-host cluster with MPI enabled, 16 slots
    per host and ``sagemaker_additional_mpi_options`` set to
    ``'-x MY_ENVIRONMENT_VARIABLE'``. Checks the hostname change, the sshd
    launch, the connectivity probe to ``algo-2``, the exact ``mpirun`` command
    line, the chmod of ``/mpi_script.sh`` and the script text written out.

    NOTE(review): the test name says "worker node", but no ``current_host`` is
    passed and the hostname assertion uses ``algo-1`` -- confirm which node
    this is meant to exercise.

    All parameters are presumably mocks injected by patch decorators that are
    not visible in this view -- TODO confirm.
    """
    cluster_hosts = ['algo-1', 'algo-2']
    training_env = mock_training_env(
        hosts=cluster_hosts, num_gpus=8, network_interface_name='foonet')
    hyperparameters = {
        'sagemaker_use_mpi': True,
        'sagemaker_process_slots_per_host': 16,
        'sagemaker_additional_mpi_options': '-x MY_ENVIRONMENT_VARIABLE',
    }

    training.train(training_env, hyperparameters)

    # The additional option pair sits between '-np 32' and the script path.
    expected_mpirun_command = [
        'mpirun', '--allow-run-as-root',
        '--host', 'algo-1:16,algo-2:16',
        '-mca', 'btl_tcp_if_include', 'foonet',
        '-mca', 'oob_tcp_if_include', 'foonet',
        '-mca', 'btl', '^openib',
        '-x', 'PATH',
        '-x', 'LD_LIBRARY_PATH',
        '-x', 'LD_PRELOAD=/libchangehostname.so',
        '-mca', 'orte_abort_on_non_zero_status', '1',
        '-x', 'NCCL_DEBUG=INFO',
        '-x', 'NCCL_SOCKET_IFNAME=foonet',
        '-np', '32',
        '-x', 'MY_ENVIRONMENT_VARIABLE',
        '/mpi_script.sh',
    ]

    # NOTE(review): this literal is reproduced exactly as it appears in the
    # reviewed text, where it sits on a single line; confirm its whitespace
    # and line breaks against the script the training module actually writes.
    expected_script = """#!/usr/bin/env bash touch /mpi_is_running %s -m mpi4py -m imagenet EXIT_CODE=$? touch /mpi_is_finished exit ${EXIT_CODE} """ % sys.executable

    download_and_install.assert_called_with('s3://my/script')
    system.assert_called_once_with('change-hostname.sh algo-1')
    popen.assert_called_once_with(["/usr/sbin/sshd", "-D"])
    _can_connect.assert_called_with('algo-2', 22, socket())
    check_call.assert_called_once_with(expected_mpirun_command)
    chmod.assert_called_with('/mpi_script.sh', stat().st_mode.__or__())
    open().write.assert_called_with(expected_script)
def test_single_machine(run_entry, download_and_install):
    """Single-host training runs the entry point with the process runner.

    Calls ``training.train`` with a default mock environment and no
    hyperparameters, then checks that the script is downloaded and that
    ``run_entry`` receives ``framework.runner.ProcessRunnerType`` with empty
    ``extra_opts``.

    NOTE(review): another ``test_single_machine`` appears later in the
    reviewed text; if both live in one module, the later definition replaces
    this one and this test never runs -- confirm.
    """
    training_env = mock_training_env()

    training.train(training_env, {})

    download_and_install.assert_called_with('s3://my/script')
    run_entry.assert_called_with(
        's3://my/script',
        'imagenet',
        training_env.to_cmd_args(),
        training_env.to_env_vars(),
        runner=framework.runner.ProcessRunnerType,
        extra_opts={},
    )
def test_distributed_training_from_worker_node_use_mpi(
        log_script_invocation, isfile, chmod, stat, download_and_install, popen):
    """With MPI enabled, host ``algo-2`` starts sshd and polls for completion.

    Runs ``training.train`` with ``current_host='algo-2'`` on a two-host
    cluster and ``sagemaker_use_mpi`` set, then checks the script download,
    the sshd launch, the ``/mpi_is_finished`` file check and the single chmod
    of ``/mpi_script.sh``.

    All parameters are presumably mocks injected by patch decorators that are
    not visible in this view -- TODO confirm.
    """
    cluster_hosts = ['algo-1', 'algo-2']
    training_env = mock_training_env(current_host='algo-2', hosts=cluster_hosts)
    hyperparameters = {'sagemaker_use_mpi': True}

    training.train(training_env, hyperparameters)

    download_and_install.assert_called_with('s3://my/script')
    popen.assert_called_with(['/usr/sbin/sshd', '-D'])
    isfile.assert_called_with('/mpi_is_finished')
    chmod.assert_called_once_with('/mpi_script.sh', stat().st_mode.__or__())
def test_distributed_training_from_worker_node(isfile, chmod, stat, download_and_install,
                                               _start_ssh_daemon, system):
    """Host ``algo-2`` renames itself, starts SSH and polls for completion.

    Runs ``training.train`` with ``current_host='algo-2'`` on a two-host
    cluster and no hyperparameters, then checks the script download, the
    ``change-hostname.sh algo-2`` system call, the SSH daemon start, the
    ``/mpi_is_finished`` file check and the single chmod of
    ``/mpi_script.sh``.

    All parameters are presumably mocks injected by patch decorators that are
    not visible in this view -- TODO confirm.
    """
    cluster_hosts = ['algo-1', 'algo-2']
    training_env = mock_training_env(current_host='algo-2', hosts=cluster_hosts)

    training.train(training_env, {})

    download_and_install.assert_called_with('s3://my/script')
    system.assert_called_once_with('change-hostname.sh algo-2')
    _start_ssh_daemon.assert_called_once()
    isfile.assert_called_with('/mpi_is_finished')
    chmod.assert_called_once_with('/mpi_script.sh', stat().st_mode.__or__())
def test_distributed_training_from_master_node(
        _create_mpi_script, _run_mpi_on_all_nodes,
        _wait_for_worker_nodes_to_start_sshd, _start_ssh_daemon, _change_hostname):
    """Two-host training with default ``current_host`` drives the MPI launch.

    Runs ``training.train`` on a two-host cluster with no hyperparameters and
    checks that the MPI script is created, the hostname is changed to
    ``algo-1``, the SSH daemon is started, the worker nodes are waited for,
    and MPI is run on all nodes with the environment and an empty dict.

    All parameters are presumably mocks injected by patch decorators that are
    not visible in this view -- TODO confirm.
    """
    cluster_hosts = ['algo-1', 'algo-2']
    training_env = mock_training_env(hosts=cluster_hosts)

    training.train(training_env, {})

    _create_mpi_script.assert_called_with(training_env)
    _change_hostname.assert_called_once_with('algo-1')
    _start_ssh_daemon.assert_called_once()
    _wait_for_worker_nodes_to_start_sshd.assert_called_once_with(cluster_hosts)
    _run_mpi_on_all_nodes.assert_called_once_with(training_env, {})
def test_distributed_training(run_entry, download_and_install):
    """Two-host training runs the entry point with the MPI runner.

    Calls ``training.train`` on a two-host cluster with no hyperparameters,
    then checks that the script is downloaded and that ``run_entry`` receives
    ``framework.runner.MPIRunnerType`` with both MPI process-count options
    set to ``None``.
    """
    cluster_hosts = ['algo-1', 'algo-2']
    training_env = mock_training_env(hosts=cluster_hosts)

    training.train(training_env, {})

    expected_extra_opts = {
        'sagemaker_mpi_num_of_processes_per_host': None,
        'sagemaker_mpi_num_processes': None,
    }

    download_and_install.assert_called_with('s3://my/script')
    run_entry.assert_called_with(
        's3://my/script',
        'imagenet',
        training_env.to_cmd_args(),
        training_env.to_env_vars(),
        runner=framework.runner.MPIRunnerType,
        extra_opts=expected_extra_opts,
    )
def test_single_machine(run_module):
    """Single-host training hands the script to ``run_module``.

    Calls ``training.train`` with a default mock environment and no
    hyperparameters, then checks that ``run_module`` receives the script URI,
    the command-line args, the environment variables and the module name
    ``'imagenet'``.

    NOTE(review): an earlier ``test_single_machine`` appears in the reviewed
    text; if both live in one module, this definition replaces the earlier
    one -- confirm.
    """
    training_env = mock_training_env()

    training.train(training_env, {})

    run_module.assert_called_with(
        's3://my/script',
        training_env.to_cmd_args(),
        training_env.to_env_vars(),
        'imagenet',
    )