Example #1
0
def test_mpi_worker_run_no_wait(popen, ssh_client):
    """Check ``WorkerRunner.run(wait=False)`` against the supplied fixtures.

    After the run, the ``ssh_client`` fixture must not have been called at
    all, and the most recent ``popen`` call must have received the
    ``/usr/sbin/sshd -D`` command list.
    """
    runner_kwargs = {
        'user_entry_point': 'train.sh',
        'args': ['-v', '--lr', '35'],
        'env_vars': {'LD_CONFIG_PATH': '/etc/ld'},
        'master_hostname': 'algo-1',
    }
    runner = _mpi.WorkerRunner(**runner_kwargs)

    runner.run(wait=False)

    # With wait=False the SSH client fixture is expected to stay untouched.
    ssh_client.assert_not_called()

    expected_sshd_command = ['/usr/sbin/sshd', '-D']
    popen.assert_called_with(expected_sshd_command)
def test_mpi_worker_run_no_wait(popen, ssh_client):
    """Run a worker with ``wait=False`` and verify the recorded calls.

    Expects that ``ssh_client`` was never called and that the last call to
    ``popen`` was made with ``["/usr/sbin/sshd", "-D"]``.
    """
    entry_point = "train.sh"
    cli_args = ["-v", "--lr", "35"]
    environment = {"LD_CONFIG_PATH": "/etc/ld"}
    master = "algo-1"

    runner = _mpi.WorkerRunner(
        user_entry_point=entry_point,
        args=cli_args,
        env_vars=environment,
        master_hostname=master,
    )
    runner.run(wait=False)

    # The SSH client fixture must remain unused when not waiting.
    ssh_client.assert_not_called()
    sshd_command = ["/usr/sbin/sshd", "-D"]
    popen.assert_called_with(sshd_command)
Example #3
0
def test_mpi_worker_run(popen, policy, process_iter, wait_procs, ssh_client):
    """Check the calls made by a default ``WorkerRunner.run()``.

    ``process_iter`` is stubbed to yield a single fake process whose ``info``
    reports the name ``orted``. The test then verifies the SSH client setup
    and teardown calls, that ``wait_procs`` received that process, and that
    the last ``popen`` call launched ``/usr/sbin/sshd -D``.
    """
    orted_process = MagicMock(info={'name': 'orted'})

    def _single_process_listing(attrs):
        # Same fake process regardless of the requested attributes.
        return [orted_process]

    process_iter.side_effect = _single_process_listing

    runner = _mpi.WorkerRunner(
        user_entry_point='train.sh',
        args=['-v', '--lr', '35'],
        env_vars={'LD_CONFIG_PATH': '/etc/ld'},
        master_hostname='algo-1',
    )
    runner.run()

    # Calling the fixture returns the client object whose methods are checked.
    client = ssh_client()
    client.load_system_host_keys.assert_called()
    client.set_missing_host_key_policy.assert_called_with(policy())
    client.connect.assert_called_with('algo-1', port=22)
    client.close.assert_called()

    wait_procs.assert_called_with([orted_process])
    popen.assert_called_with(['/usr/sbin/sshd', '-D'])
def _get_by_runner_type(identifier,
                        user_entry_point=None,
                        args=None,
                        env_vars=None,
                        extra_opts=None):
    """Build the runner object that matches ``identifier``.

    Any falsy override (``None``, empty string, empty list/dict) is replaced
    by the corresponding value from ``sagemaker_containers.training_env()``.

    Args:
        identifier: A ``RunnerType`` member, compared by identity.
        user_entry_point: Entry point override; falls back to
            ``env.user_entry_point``.
        args: Command arguments override; falls back to
            ``env.to_cmd_args()``.
        env_vars: Environment variables override; falls back to
            ``env.to_env_vars()``.
        extra_opts: Passed to ``_mpi_param_value`` as the MPI options (an
            empty dict when falsy). Only consulted for the MPI master.

    Returns:
        ``_mpi.MasterRunner`` for ``RunnerType.MPI`` when ``env.is_master`` is
        truthy, ``_mpi.WorkerRunner`` for ``RunnerType.MPI`` otherwise, or
        ``_process.ProcessRunner`` for ``RunnerType.Process``.

    Raises:
        ValueError: If ``identifier`` is neither ``RunnerType.MPI`` nor
            ``RunnerType.Process``.
    """
    env = sagemaker_containers.training_env()

    if not user_entry_point:
        user_entry_point = env.user_entry_point
    if not args:
        args = env.to_cmd_args()
    if not env_vars:
        env_vars = env.to_env_vars()

    if identifier is RunnerType.MPI:
        if not env.is_master:
            return _mpi.WorkerRunner(
                user_entry_point, args, env_vars, env.master_hostname
            )

        mpi_options = extra_opts or {}

        # One process per GPU; a host without GPUs still gets one process.
        if env.num_gpus > 0:
            fallback_processes_per_host = env.num_gpus
        else:
            fallback_processes_per_host = 1

        per_host = _mpi_param_value(
            mpi_options, env, _params.MPI_PROCESSES_PER_HOST,
            fallback_processes_per_host
        )
        total_processes = _mpi_param_value(
            mpi_options, env, _params.MPI_NUM_PROCESSES
        )
        custom_options = _mpi_param_value(
            mpi_options, env, _params.MPI_CUSTOM_OPTIONS, ""
        )

        return _mpi.MasterRunner(
            user_entry_point,
            args,
            env_vars,
            env.master_hostname,
            env.hosts,
            per_host,
            custom_options,
            env.network_interface_name,
            num_processes=total_processes,
        )

    if identifier is RunnerType.Process:
        return _process.ProcessRunner(user_entry_point, args, env_vars)

    raise ValueError("Invalid identifier %s" % identifier)
def _get_by_runner_type(identifier):
    """Build the runner object that matches ``identifier``.

    All runner inputs are read from ``sagemaker_containers.training_env()``.

    Args:
        identifier: A ``RunnerType`` member, compared by identity.

    Returns:
        ``_mpi.MasterRunner`` for ``RunnerType.MPI`` when ``env.is_master`` is
        truthy, ``_mpi.WorkerRunner`` for ``RunnerType.MPI`` otherwise, or
        ``_process.ProcessRunner`` for ``RunnerType.Process``.

    Raises:
        ValueError: If ``identifier`` is neither ``RunnerType.MPI`` nor
            ``RunnerType.Process``.
    """
    env = sagemaker_containers.training_env()

    if identifier is RunnerType.MPI:
        if not env.is_master:
            return _mpi.WorkerRunner(
                env.user_entry_point,
                env.to_cmd_args(),
                env.to_env_vars(),
                env.master_hostname,
            )

        # MPI settings come from the framework parameters, with defaults of
        # one process per host and no custom options.
        framework_params = env.additional_framework_parameters
        per_host = framework_params.get(_params.MPI_PROCESSES_PER_HOST, 1)
        custom_options = framework_params.get(_params.MPI_CUSTOM_OPTIONS, '')

        return _mpi.MasterRunner(
            env.user_entry_point,
            env.to_cmd_args(),
            env.to_env_vars(),
            env.master_hostname,
            env.hosts,
            per_host,
            custom_options,
            env.network_interface_name,
        )

    if identifier is RunnerType.Process:
        return _process.ProcessRunner(
            env.user_entry_point, env.to_cmd_args(), env.to_env_vars()
        )

    raise ValueError('Invalid identifier %s' % identifier)
def test_mpi_worker_run(popen, policy, process_iter, wait_procs, ssh_client, sleep):
    """Check the calls made by a default ``WorkerRunner.run()``.

    ``process_iter`` is stubbed to yield one fake process whose ``info``
    reports the name ``orted``. The ``sleep`` fixture is requested but not
    referenced in the body. Afterwards the test verifies the SSH client
    calls, that ``wait_procs`` received the fake process, and that the last
    ``popen`` call launched ``/usr/sbin/sshd -D``.
    """
    orted_process = MagicMock(info={"name": "orted"})

    def _single_process_listing(attrs):
        # Same fake process regardless of the requested attributes.
        return [orted_process]

    process_iter.side_effect = _single_process_listing

    runner_kwargs = {
        "user_entry_point": "train.sh",
        "args": ["-v", "--lr", "35"],
        "env_vars": {"LD_CONFIG_PATH": "/etc/ld"},
        "master_hostname": "algo-1",
    }
    runner = _mpi.WorkerRunner(**runner_kwargs)
    runner.run()

    # Calling the fixture returns the client object whose methods are checked.
    client = ssh_client()
    client.load_system_host_keys.assert_called()
    client.set_missing_host_key_policy.assert_called_with(policy())
    client.connect.assert_called_with("algo-1", port=22)
    client.close.assert_called()

    wait_procs.assert_called_with([orted_process])
    popen.assert_called_with(["/usr/sbin/sshd", "-D"])
def _get_by_runner_type(identifier,
                        user_entry_point=None,
                        args=None,
                        env_vars=None,
                        extra_opts=None):
    """Build the runner object that matches ``identifier``.

    Any falsy override (``None``, empty string, empty list/dict) is replaced
    by the corresponding value from ``sagemaker_containers.training_env()``.

    Args:
        identifier: A ``RunnerType`` member, compared by identity.
        user_entry_point: Entry point override; falls back to
            ``env.user_entry_point``.
        args: Command arguments override; falls back to
            ``env.to_cmd_args()``.
        env_vars: Environment variables override; falls back to
            ``env.to_env_vars()``.
        extra_opts: Passed to ``_mpi_param_value`` as the MPI options (an
            empty dict when falsy). Only consulted for the MPI master.

    Returns:
        ``_mpi.MasterRunner`` for ``RunnerType.MPI`` when ``env.is_master`` is
        truthy, ``_mpi.WorkerRunner`` for ``RunnerType.MPI`` otherwise, or
        ``_process.ProcessRunner`` for ``RunnerType.Process``.

    Raises:
        ValueError: If ``identifier`` is neither ``RunnerType.MPI`` nor
            ``RunnerType.Process``.
    """
    env = sagemaker_containers.training_env()

    if not user_entry_point:
        user_entry_point = env.user_entry_point
    if not args:
        args = env.to_cmd_args()
    if not env_vars:
        env_vars = env.to_env_vars()

    if identifier is RunnerType.MPI:
        if not env.is_master:
            return _mpi.WorkerRunner(
                user_entry_point, args, env_vars, env.master_hostname
            )

        mpi_options = extra_opts or {}

        # Processes per host falls back to 1 when not configured.
        per_host = _mpi_param_value(
            mpi_options, env, _params.MPI_PROCESSES_PER_HOST, 1
        )
        total_processes = _mpi_param_value(
            mpi_options, env, _params.MPI_NUM_PROCESSES
        )
        custom_options = _mpi_param_value(
            mpi_options, env, _params.MPI_CUSTOM_OPTIONS, ''
        )

        return _mpi.MasterRunner(
            user_entry_point,
            args,
            env_vars,
            env.master_hostname,
            env.hosts,
            per_host,
            custom_options,
            env.network_interface_name,
            num_processes=total_processes,
        )

    if identifier is RunnerType.Process:
        return _process.ProcessRunner(user_entry_point, args, env_vars)

    raise ValueError('Invalid identifier %s' % identifier)