Beispiel #1
0
def test_mpi_master_run(training_env, popen, policy, ssh_client):
    """Verify the SSH calls and the mpirun argv issued by ``MasterRunner.run``."""
    runner_settings = {
        'user_entry_point': 'train.sh',
        'args': ['-v', '--lr', '35'],
        'env_vars': {'LD_CONFIG_PATH': '/etc/ld'},
        'master_hostname': 'algo-1',
        'hosts': ['algo-1', 'algo-2'],
        'process_per_host': 2,
        'custom_mpi_options': '-v --lr 35',
        'network_interface_name': 'ethw3',
    }

    # Build and run the master against an emptied os.environ.
    with patch.dict(os.environ, clear=True):
        runner = _mpi.MasterRunner(**runner_settings)
        process = runner.run(wait=False)

        # SSH client interaction: 'algo-2' is the only host asserted on connect.
        ssh = ssh_client()
        ssh.load_system_host_keys.assert_called()
        ssh.set_missing_host_key_policy.assert_called_with(policy())
        ssh.connect.assert_called_with('algo-2', port=22)
        ssh.close.assert_called()

        # Expected argv, grouped by purpose; concatenation order is significant.
        launcher = ['mpirun', '--host', 'algo-1:2,algo-2:2', '-np', '4',
                    '--allow-run-as-root', '--display-map', '--tag-output']
        interface_flags = ['-mca', 'btl_tcp_if_include', 'ethw3',
                           '-mca', 'oob_tcp_if_include', 'ethw3',
                           '-mca', 'plm_rsh_no_tree_spawn', '1']
        placement_flags = ['-bind-to', 'socket', '-map-by', 'slot']
        transport_flags = ['-mca', 'pml', 'ob1',
                           '-mca', 'btl', '^openib',
                           '-mca', 'orte_abort_on_non_zero_status', '1']
        exported_vars = ['-x', 'NCCL_MIN_NRINGS=4',
                         '-x', 'NCCL_SOCKET_IFNAME=ethw3',
                         '-x', 'NCCL_DEBUG=INFO',
                         '-x', 'LD_LIBRARY_PATH',
                         '-x', 'PATH',
                         '-x', 'LD_PRELOAD=%s' % inspect.getfile(gethostname)]
        # Matches custom_mpi_options='-v --lr 35' token by token.
        custom_flags = ['-v', '--lr', '35']
        # Only the name of the env_vars entry appears, not its value.
        user_env_flags = ['-x', 'LD_CONFIG_PATH']
        entry_command = ['/bin/sh', '-c', './train.sh -v --lr 35']

        expected_command = (launcher + interface_flags + placement_flags
                            + transport_flags + exported_vars + custom_flags
                            + user_env_flags + entry_command)

        popen.assert_called_with(expected_command,
                                 cwd=_env.code_dir,
                                 env=ANY,
                                 stderr=None)

        # run(wait=False) is expected to hand back the object popen produced.
        assert process == popen()
def _get_by_runner_type(identifier,
                        user_entry_point=None,
                        args=None,
                        env_vars=None,
                        extra_opts=None):
    """Construct the runner selected by ``identifier``.

    Falsy ``user_entry_point``, ``args`` or ``env_vars`` are replaced with the
    corresponding values of ``sagemaker_containers.training_env()``.

    Args:
        identifier: ``RunnerType`` member, compared by identity.
        user_entry_point: Entry point handed to the runner.
        args: Command-line arguments handed to the runner.
        env_vars: Environment variables handed to the runner.
        extra_opts: Passed as the first argument to ``_mpi_param_value`` (an
            empty dict is used when falsy); only consulted when the MPI
            master runner is built.

    Returns:
        The result of ``_mpi.MasterRunner(...)`` for ``RunnerType.MPI`` when
        ``env.is_master`` is truthy, of ``_mpi.WorkerRunner(...)`` for
        ``RunnerType.MPI`` otherwise, and of ``_process.ProcessRunner(...)``
        for ``RunnerType.Process``.

    Raises:
        ValueError: If ``identifier`` is neither ``RunnerType.MPI`` nor
            ``RunnerType.Process``.
    """
    env = sagemaker_containers.training_env()

    entry_point = user_entry_point or env.user_entry_point
    cmd_args = args or env.to_cmd_args()
    runner_env = env_vars or env.to_env_vars()

    if identifier is RunnerType.MPI:
        if not env.is_master:
            return _mpi.WorkerRunner(entry_point, cmd_args, runner_env,
                                     env.master_hostname)

        mpi_opts = extra_opts or {}

        # Fallback process count per host: the GPU count when positive,
        # otherwise a single process.
        if env.num_gpus > 0:
            fallback_per_host = env.num_gpus
        else:
            fallback_per_host = 1

        per_host = _mpi_param_value(mpi_opts, env,
                                    _params.MPI_PROCESSES_PER_HOST,
                                    fallback_per_host)
        total_processes = _mpi_param_value(mpi_opts, env,
                                           _params.MPI_NUM_PROCESSES)
        mpi_options = _mpi_param_value(mpi_opts, env,
                                       _params.MPI_CUSTOM_OPTIONS, "")

        return _mpi.MasterRunner(
            entry_point,
            cmd_args,
            runner_env,
            env.master_hostname,
            env.hosts,
            per_host,
            mpi_options,
            env.network_interface_name,
            num_processes=total_processes,
        )

    if identifier is RunnerType.Process:
        return _process.ProcessRunner(entry_point, cmd_args, runner_env)

    raise ValueError("Invalid identifier %s" % identifier)
def _get_by_runner_type(identifier):
    """Construct the runner selected by ``identifier`` from the training env.

    Args:
        identifier: ``RunnerType`` member, compared by identity.

    Returns:
        The result of ``_mpi.MasterRunner(...)`` for ``RunnerType.MPI`` when
        ``env.is_master`` is truthy, of ``_mpi.WorkerRunner(...)`` for
        ``RunnerType.MPI`` otherwise, and of ``_process.ProcessRunner(...)``
        for ``RunnerType.Process``.

    Raises:
        ValueError: If ``identifier`` is neither ``RunnerType.MPI`` nor
            ``RunnerType.Process``.
    """
    env = sagemaker_containers.training_env()

    if identifier is RunnerType.MPI:
        if env.is_master:
            # MPI settings come from the additional framework parameters,
            # defaulting to 1 process per host and no custom options.
            framework_params = env.additional_framework_parameters
            per_host = framework_params.get(_params.MPI_PROCESSES_PER_HOST, 1)
            mpi_options = framework_params.get(_params.MPI_CUSTOM_OPTIONS, '')

            return _mpi.MasterRunner(
                env.user_entry_point,
                env.to_cmd_args(),
                env.to_env_vars(),
                env.master_hostname,
                env.hosts,
                per_host,
                mpi_options,
                env.network_interface_name)

        return _mpi.WorkerRunner(
            env.user_entry_point,
            env.to_cmd_args(),
            env.to_env_vars(),
            env.master_hostname)

    if identifier is RunnerType.Process:
        return _process.ProcessRunner(
            env.user_entry_point,
            env.to_cmd_args(),
            env.to_env_vars())

    raise ValueError('Invalid identifier %s' % identifier)
def _get_by_runner_type(identifier,
                        user_entry_point=None,
                        args=None,
                        env_vars=None,
                        extra_opts=None):
    """Construct the runner selected by ``identifier``.

    Falsy ``user_entry_point``, ``args`` or ``env_vars`` are replaced with the
    corresponding values of ``sagemaker_containers.training_env()``.

    Args:
        identifier: ``RunnerType`` member, compared by identity.
        user_entry_point: Entry point handed to the runner.
        args: Command-line arguments handed to the runner.
        env_vars: Environment variables handed to the runner.
        extra_opts: Passed as the first argument to ``_mpi_param_value`` (an
            empty dict is used when falsy); only consulted when the MPI
            master runner is built.

    Returns:
        The result of ``_mpi.MasterRunner(...)`` for ``RunnerType.MPI`` when
        ``env.is_master`` is truthy, of ``_mpi.WorkerRunner(...)`` for
        ``RunnerType.MPI`` otherwise, and of ``_process.ProcessRunner(...)``
        for ``RunnerType.Process``.

    Raises:
        ValueError: If ``identifier`` is neither ``RunnerType.MPI`` nor
            ``RunnerType.Process``.
    """
    env = sagemaker_containers.training_env()

    entry_point = user_entry_point or env.user_entry_point
    cmd_args = args or env.to_cmd_args()
    runner_env = env_vars or env.to_env_vars()

    wants_mpi = identifier is RunnerType.MPI

    if wants_mpi and env.is_master:
        mpi_opts = extra_opts or {}

        # One process per host unless the MPI settings say otherwise.
        per_host = _mpi_param_value(
            mpi_opts, env, _params.MPI_PROCESSES_PER_HOST, 1)
        total_processes = _mpi_param_value(
            mpi_opts, env, _params.MPI_NUM_PROCESSES)
        mpi_options = _mpi_param_value(
            mpi_opts, env, _params.MPI_CUSTOM_OPTIONS, '')

        return _mpi.MasterRunner(
            entry_point,
            cmd_args,
            runner_env,
            env.master_hostname,
            env.hosts,
            per_host,
            mpi_options,
            env.network_interface_name,
            num_processes=total_processes)

    if wants_mpi:
        return _mpi.WorkerRunner(
            entry_point, cmd_args, runner_env, env.master_hostname)

    if identifier is RunnerType.Process:
        return _process.ProcessRunner(entry_point, cmd_args, runner_env)

    raise ValueError('Invalid identifier %s' % identifier)
Beispiel #5
0
def test_mpi_master_run(training_env, popen, policy, ssh_client, path_exists):
    """Verify the SSH calls, the mpirun argv and the sshd path check of ``MasterRunner.run``."""
    runner_settings = {
        "user_entry_point": "train.sh",
        "args": ["-v", "--lr", "35"],
        "env_vars": {"LD_CONFIG_PATH": "/etc/ld"},
        "master_hostname": "algo-1",
        "hosts": ["algo-1", "algo-2"],
        "process_per_host": 2,
        "custom_mpi_options": "-v --lr 35",
        "network_interface_name": "ethw3",
    }

    # Build and run the master against an emptied os.environ.
    with patch.dict(os.environ, clear=True):
        runner = _mpi.MasterRunner(**runner_settings)
        process = runner.run(wait=False)

        # SSH client interaction: "algo-2" is the only host asserted on connect.
        ssh = ssh_client()
        ssh.load_system_host_keys.assert_called()
        ssh.set_missing_host_key_policy.assert_called_with(policy())
        ssh.connect.assert_called_with("algo-2", port=22)
        ssh.close.assert_called()

        # Expected argv, accumulated group by group; order is significant.
        expected_command = ["mpirun", "--host", "algo-1:2,algo-2:2", "-np", "4"]
        expected_command += ["--allow-run-as-root", "--display-map", "--tag-output"]
        expected_command += ["-mca", "btl_tcp_if_include", "ethw3"]
        expected_command += ["-mca", "oob_tcp_if_include", "ethw3"]
        expected_command += ["-mca", "plm_rsh_no_tree_spawn", "1"]
        expected_command += ["-bind-to", "socket", "-map-by", "slot"]
        expected_command += ["-mca", "pml", "ob1"]
        expected_command += ["-mca", "btl", "^openib"]
        expected_command += ["-mca", "orte_abort_on_non_zero_status", "1"]
        expected_command += ["-x", "NCCL_MIN_NRINGS=4"]
        expected_command += ["-x", "NCCL_SOCKET_IFNAME=ethw3"]
        expected_command += ["-x", "NCCL_DEBUG=INFO"]
        expected_command += ["-x", "LD_LIBRARY_PATH"]
        expected_command += ["-x", "PATH"]
        expected_command += ["-x", "LD_PRELOAD=%s" % inspect.getfile(gethostname)]
        # Matches custom_mpi_options="-v --lr 35" token by token.
        expected_command += ["-v", "--lr", "35"]
        # Only the name of the env_vars entry appears, not its value.
        expected_command += ["-x", "LD_CONFIG_PATH"]
        expected_command += ["/bin/sh", "-c", "./train.sh -v --lr 35"]

        popen.assert_called_with(
            expected_command, cwd=_env.code_dir, env=ANY, stdout=None, stderr=None
        )

        # run(wait=False) is expected to hand back the object popen produced.
        assert process == popen()
        path_exists.assert_called_with("/usr/sbin/sshd")