Esempio n. 1
0
 def get_allocation_info():
     """Returns and sets the static CSM allocation info.

     On the first successful call the allocation is queried with
     ``LSFUtils._CSM_ALLOCATION_QUERY`` for the id found in the
     ``CSM_ALLOCATION_ID`` environment variable, then the first compute node
     is queried with ``LSFUtils._CSM_NODE_QUERY`` to derive the per-node
     core and gpu counts. The result is cached in
     ``LSFUtils._csm_allocation_info`` and returned by later calls.

     Returns:
         The parsed allocation info, extended with the keys
         ``compute_node_cores`` and ``compute_node_gpus`` and with
         ``compute_nodes`` sorted.

     Raises:
         RuntimeError: if either query command exits with a non-zero code.
     """
     if LSFUtils._csm_allocation_info:
         return LSFUtils._csm_allocation_info

     lsf_allocation_id = os.environ["CSM_ALLOCATION_ID"].strip()
     output = io.StringIO()
     exit_code = safe_shell_exec.execute("{cmd} -a {allocation}".format(
         cmd=LSFUtils._CSM_ALLOCATION_QUERY, allocation=lsf_allocation_id),
         stdout=output, stderr=output)
     if exit_code != 0:
         raise RuntimeError(
             "{cmd} failed with exit code {exit_code}".format(
                 cmd=LSFUtils._CSM_ALLOCATION_QUERY, exit_code=exit_code))
     # Build the info in a local first: publishing it to the class-level
     # cache before the node query succeeded would leave a half-populated
     # cache behind that later calls would return as if it were complete.
     allocation_info = yaml.safe_load(output.getvalue())

     # Fetch the total number of cores and gpus for the first host
     output = io.StringIO()
     exit_code = safe_shell_exec.execute("{cmd} -n {node}".format(
         cmd=LSFUtils._CSM_NODE_QUERY,
         node=allocation_info["compute_nodes"][0]),
         stdout=output, stderr=output)
     if exit_code != 0:
         raise RuntimeError(
             "{cmd} failed with exit code {exit_code}".format(
                 cmd=LSFUtils._CSM_NODE_QUERY, exit_code=exit_code))
     node_record = yaml.safe_load(output.getvalue())["Record_1"]
     # Usable cores: discovered cores minus the isolated cores of every socket
     total_core_count = (
         int(node_record["discovered_cores"]) -
         int(node_record["discovered_sockets"]) * allocation_info["isolated_cores"])
     allocation_info["compute_node_cores"] = total_core_count
     allocation_info["compute_node_gpus"] = int(node_record["discovered_gpus"])
     # Sorting LSF hostnames
     allocation_info["compute_nodes"].sort()

     LSFUtils._csm_allocation_info = allocation_info
     return LSFUtils._csm_allocation_info
Esempio n. 2
0
    def do_test_run_with_controller_failure(self, controller, mode, run):
        """Checks that a failing function or command is reported as a failure.

        For controller 'mpi' with run 'cmd', _run is expected to return None
        and the exec object yielded by horovod_args must have been called once
        with ``/bin/sh -c <command>``; executing that command must exit with
        code 1. In every other case _run must raise a RuntimeError matching the
        controller specific message.
        """
        if run == 'cmd':
            command, run_func = 'false', None
        elif run == 'func':
            command, run_func = None, lambda: fn(0)
        else:
            self.fail('unknown run argument {}'.format(run))

        expected_error = (
            'mpirun failed with exit code 1'
            if controller == 'mpi' else
            'Horovod detected that one or more processes exited with non-zero status')

        with self.horovod_args(mode, controller=controller, run_func=run_func, command=command) as (hargs, exec_mock):
            if not (controller == 'mpi' and run == 'cmd'):
                with pytest.raises(RuntimeError, match=expected_error):
                    _run(hargs)
            else:
                self.assertIsNone(_run(hargs))
                exec_mock.assert_called_once()
                call_positional, _call_keywords = exec_mock.call_args
                executable, argv, env = call_positional
                self.assertEqual('/bin/sh', executable)
                self.assertEqual(3, len(argv))
                self.assertEqual('/bin/sh', argv[0])
                self.assertEqual('-c', argv[1])
                # Run the captured shell command for real to see it fail
                status = safe_shell_exec.execute(argv[2], env)
                self.assertEqual(1, status)
Esempio n. 3
0
    def do_test_run_with_controller_success(self, controller, mode, run):
        """Checks that a succeeding function or command run is reported as success.

        For controller 'mpi' with run 'cmd', _run is expected to return None
        and the exec object yielded by horovod_args must have been called once
        with ``/bin/sh -c <command>``; executing that command must exit with
        code 0. Otherwise the result of _run is compared against one
        ``(rank, np)`` tuple per rank for run 'func', or None for run 'cmd'.
        """
        if run == 'cmd':
            command, run_func = 'true', None
        elif run == 'func':
            command, run_func = None, fn
        else:
            self.fail('unknown run argument {}'.format(run))

        with self.horovod_args(mode, controller, run_func=run_func, command=command) as (hargs, exec_mock):
            if not (controller == 'mpi' and run == 'cmd'):
                actual = _run(hargs)
                if run == 'func':
                    expected = [(rank, hargs.np) for rank in range(hargs.np)]
                else:
                    expected = None
                self.assertEqual(expected, actual)
            else:
                self.assertIsNone(_run(hargs))
                exec_mock.assert_called_once()
                call_positional, _call_keywords = exec_mock.call_args
                executable, argv, env = call_positional
                self.assertEqual('/bin/sh', executable)
                self.assertEqual(3, len(argv))
                self.assertEqual('/bin/sh', argv[0])
                self.assertEqual('-c', argv[1])
                # Run the captured shell command for real to see it succeed
                status = safe_shell_exec.execute(argv[2], env)
                self.assertEqual(0, status)
Esempio n. 4
0
    def _exec_command(command, slot_info, events):
        """Executes ``command`` for one slot and returns its exit code and end time.

        When the resolved address of the slot's host is not among the local
        host addresses, the command is prefixed with a ``cd`` into the current
        working directory and wrapped with ``get_remote_command``.
        If ``settings.output_filename`` is set, stdout and stderr are also
        written to ``<output_filename>/rank.<padded rank>/stdout`` and
        ``.../stderr``.

        Args:
            command: Shell command to run.
            slot_info: Slot description; its ``rank`` and ``hostname`` are read.
            events: Events handed through to ``safe_shell_exec.execute``.

        Returns:
            Tuple ``(exit_code, timestamp)``. ``exit_code`` is 1 when
            ``safe_shell_exec.execute`` raised an exception; ``timestamp`` is
            ``time.time()`` taken after the command finished.
        """
        index = slot_info.rank
        host_name = slot_info.hostname

        host_address = network.resolve_host_address(host_name)
        local_addresses = network.get_local_host_addresses()
        if host_address not in local_addresses:
            local_command = quote(
                'cd {pwd} > /dev/null 2>&1 ; {command}'.format(
                    pwd=os.getcwd(), command=command))
            command = get_remote_command(
                local_command,
                host=host_name,
                port=settings.ssh_port,
                identity_file=settings.ssh_identity_file)

        if settings.verbose:
            print(command)

        # Redirect output if requested
        stdout = stderr = None
        stdout_file = stderr_file = None
        if settings.output_filename:
            padded_rank = _pad_rank(index, settings.num_proc)
            output_dir_rank = os.path.join(
                settings.output_filename,
                'rank.{rank}'.format(rank=padded_rank))
            # makedirs(exist_ok=True) removes the exists()/mkdir() race and
            # also creates a missing parent output directory.
            os.makedirs(output_dir_rank, exist_ok=True)

            stdout_file = open(os.path.join(output_dir_rank, 'stdout'), 'w')
            try:
                stderr_file = open(os.path.join(output_dir_rank, 'stderr'), 'w')
            except Exception:
                # Do not leak the stdout file that is already open
                stdout_file.close()
                raise

            stdout = MultiFile([sys.stdout, stdout_file])
            stderr = MultiFile([sys.stderr, stderr_file])

        try:
            exit_code = safe_shell_exec.execute(
                command,
                index=index,
                stdout=stdout,
                stderr=stderr,
                events=events,
                prefix_output_with_timestamp=settings.
                prefix_output_with_timestamp)
            if exit_code != 0:
                print('Process {idx} exit with status code {ec}.'.format(
                    idx=index, ec=exit_code))
        except Exception as e:
            print('Exception happened during safe_shell_exec, exception '
                  'message: {message}'.format(message=e))
            exit_code = 1
        finally:
            if stdout_file:
                stdout_file.close()
            if stderr_file:
                stderr_file.close()
        return exit_code, time.time()
Esempio n. 5
0
 def _run(self, cmd, env):
     """Runs ``cmd`` with ``env`` and returns ``(exit_code, stdout text, stderr text)``."""
     # Both capture buffers are closed when the block is left, also on error
     with io.StringIO() as out_buf, io.StringIO() as err_buf:
         status = safe_shell_exec.execute(cmd, env=env, stdout=out_buf, stderr=err_buf)
         return status, out_buf.getvalue(), err_buf.getvalue()
Esempio n. 6
0
 def _execute_discovery_script(self):
     """Runs the discovery script and returns what it wrote to stdout.

     Raises:
         RuntimeError: if the script exits with a non-zero code.
     """
     captured = io.StringIO()
     status = safe_shell_exec.execute(self._discovery_script, stdout=captured)
     if status == 0:
         return captured.getvalue()
     raise RuntimeError(
         'Failed to execute discovery script: {}. Exit code: {}'.format(
             self._discovery_script, status))
Esempio n. 7
0
 def do_test_safe_shell_exec(self, cmd, expected_exit_code, expected_stdout, expected_stderr, event=None):
     """Runs ``cmd`` through safe_shell_exec and checks exit code and captured output.

     An expected stdout / stderr of None skips the respective comparison.
     ``event``, when given, is passed to the execution as the only event.
     """
     out_buf, err_buf = io.StringIO(), io.StringIO()
     events = [event] if event else None
     exit_code = safe_shell_exec.execute(cmd, stdout=out_buf, stderr=err_buf, events=events)
     self.assertEqual(expected_exit_code, exit_code)
     for expected, buf in ((expected_stdout, out_buf), (expected_stderr, err_buf)):
         if expected is not None:
             self.assertEqual(expected, buf.getvalue())
Esempio n. 8
0
 def get_num_threads():
     """Returns the number of hardware threads.

     The value is read from the output of ``LSFUtils._LSCPU_CMD`` executed
     through ssh on the first compute host.

     Raises:
         RuntimeError: if the command exits with a non-zero code.
     """
     first_host = LSFUtils.get_compute_hosts()[0]
     lscpu_cmd = get_ssh_command(LSFUtils._LSCPU_CMD, host=first_host)
     captured = io.StringIO()
     status = safe_shell_exec.execute(lscpu_cmd, stdout=captured, stderr=captured)
     if status != 0:
         raise RuntimeError(
             "{cmd} failed with exit code {exit_code}".format(
                 cmd=lscpu_cmd, exit_code=status))
     lscpu_info = yaml.safe_load(captured.getvalue())
     return int(lscpu_info[LSFUtils._THREAD_KEY])
Esempio n. 9
0
 def get_num_threads():
     """Returns the number of hardware threads.

     The value is read from the output of ``LSFUtils._LSCPU_CMD`` executed
     through a non-interactive ssh call on the first compute host.

     Raises:
         RuntimeError: if the command exits with a non-zero code.
     """
     first_host = LSFUtils.get_compute_hosts()[0]
     ssh_template = ('ssh -o PasswordAuthentication=no -o StrictHostKeyChecking=no '
                     '{host} {cmd}')
     lscpu_cmd = ssh_template.format(host=first_host, cmd=LSFUtils._LSCPU_CMD)
     captured = io.StringIO()
     status = safe_shell_exec.execute(lscpu_cmd, stdout=captured, stderr=captured)
     if status != 0:
         raise RuntimeError("{cmd} failed with exit code {exit_code}".format(
             cmd=lscpu_cmd, exit_code=status))
     lscpu_info = yaml.safe_load(captured.getvalue())
     return int(lscpu_info[LSFUtils._THREAD_KEY])
Esempio n. 10
0
 def _exec_command(command):
     """Runs ``command`` and returns its exit code.

     On a non-zero exit code the combined stdout/stderr output is printed and
     the current process is terminated with that code via ``os._exit``.
     """
     with io.StringIO() as captured:
         status = safe_shell_exec.execute(command,
                                          stdout=captured,
                                          stderr=captured)
         if status != 0:
             print('Launching horovod task function was not '
                   'successful:\n{host_output}'.format(
                       host_output=captured.getvalue()))
             os._exit(status)
     return status
Esempio n. 11
0
    def exec_command(command):
        """Runs ``command`` up to SSH_ATTEMPTS times, stopping at the first success.

        Returns:
            Tuple ``(exit_code, output_msg)``: the exit code of the last
            attempt and the combined stdout/stderr text of the most recent
            failed attempt ('' if no attempt failed).
        """
        exit_code, output_msg = 1, ''

        for _ in range(SSH_ATTEMPTS):
            # A fresh buffer per attempt, closed when the attempt is over
            with io.StringIO() as captured:
                exit_code = safe_shell_exec.execute(command,
                                                    stdout=captured,
                                                    stderr=captured)
                if exit_code == 0:
                    break
                output_msg = captured.getvalue()
        return exit_code, output_msg
Esempio n. 12
0
    def find_available_hosts_and_slots(self):
        """Runs the discovery script and parses its output into a host -> slots dict.

        Each non-blank output line is either ``<host>`` (the host gets
        ``self._default_slots`` slots) or ``<host>:<slots>``. Surrounding
        whitespace is ignored and blank lines are skipped, so empty script
        output yields an empty dict. When a host is listed more than once,
        the last line wins.

        Returns:
            Dict mapping host name to its number of slots.

        Raises:
            RuntimeError: if the discovery script exits with a non-zero code.
            ValueError: if a line has more than one ':' or a non-integer slot count.
        """
        stdout = io.StringIO()
        exit_code = safe_shell_exec.execute(self._discovery_script, stdout=stdout)
        if exit_code != 0:
            raise RuntimeError('Failed to execute discovery script: {}. Exit code: {}'
                               .format(self._discovery_script, exit_code))

        host_slots = {}
        for raw_line in stdout.getvalue().split('\n'):
            # strip() also drops a trailing '\r' from CRLF line endings
            line = raw_line.strip()
            if not line:
                # a blank line must not become a host named ''
                continue
            if ':' in line:
                host, slots = line.split(':')
                host_slots[host] = int(slots)
            else:
                host_slots[line] = self._default_slots
        return host_slots
Esempio n. 13
0
 def _run(self, cmd, env):
     """Runs ``cmd`` and returns ``(exit_code, stdout text, stderr text)``.

     When ``env`` is given, the current ``PATH`` is added to it unless
     ``env`` already provides its own ``PATH``.
     """
     with io.StringIO() as out_buf, io.StringIO() as err_buf:
         if env is not None:
             # entries of env take precedence over the inherited PATH
             env = {'PATH': os.environ['PATH'], **env}
         status = safe_shell_exec.execute(cmd, env=env, stdout=out_buf, stderr=err_buf)
         return status, out_buf.getvalue(), err_buf.getvalue()
Esempio n. 14
0
 def _run_command(self,
                  command,
                  env,
                  event,
                  stdout=None,
                  stderr=None,
                  index=None,
                  prefix_output_with_timestamp=False):
     """Runs ``command`` and stores its exit code in ``self._command_exit_code``.

     Args:
         command: Command to execute.
         env: Environment passed to the execution.
         event: Event passed to the execution as its only event.
         stdout: Optional stream for the command's stdout; closed afterwards.
         stderr: Optional stream for the command's stderr; closed afterwards.
         index: Passed through to ``safe_shell_exec.execute``.
         prefix_output_with_timestamp: Passed through to
             ``safe_shell_exec.execute``.
     """
     try:
         self._command_exit_code = safe_shell_exec.execute(
             command,
             env=env,
             stdout=stdout,
             stderr=stderr,
             index=index,
             prefix_output_with_timestamp=prefix_output_with_timestamp,
             events=[event])
     finally:
         # Close the streams even when the execution raised, otherwise they leak
         if stdout:
             stdout.close()
         if stderr:
             stderr.close()
Esempio n. 15
0
def execute(command, env=None):
    """
    Runs the command and captures stdout and stderr together in one string.
    :param command: command to execute
    :param env: environment variables to use
    :return: (output, exit code), or None if the execution raised an exception
             (the traceback is printed to stderr in that case)
    """
    with io.StringIO() as captured:
        try:
            exit_code = safe_shell_exec.execute(command,
                                                env=env,
                                                stdout=captured,
                                                stderr=captured)
            output_msg = captured.getvalue()
        except Exception:
            print(traceback.format_exc(), file=sys.stderr)
            return None

    return output_msg, exit_code
Esempio n. 16
0
def js_run(settings, nics, env, command, stdout=None, stderr=None):
    """
    Runs Horovod with jsrun.

    Assembles one jsrun shell command line and either executes it through
    safe_shell_exec (settings.run_func_mode) or hands it to os.execve.

    Args:
        settings: Settings for running jsrun.
                  Note: settings.num_proc and settings.hosts must not be None.
        nics: Interfaces to include by jsrun.
        env: Environment dictionary to use for running jsrun.
             NCCL_SOCKET_IFNAME is added to it when nics are given and the
             key is not present yet.
        command: Command and arguments to run as a list of string.
        stdout: Stdout of the mpi process.
                Only used when settings.run_func_mode is True.
        stderr: Stderr of the mpi process.
                Only used when settings.run_func_mode is True.

    Raises:
        Exception: if no MPI implementation flags are available or jsrun is
                   not installed.
        RuntimeError: if jsrun exits with a non-zero code in run_func_mode.
    """
    impl_flags, _ = _get_mpi_implementation_flags(settings.tcp_flag,
                                                  env=env)
    if impl_flags is None:
        raise Exception(_MPI_NOT_FOUND_ERROR_MSG)

    if not is_jsrun_installed():
        raise Exception(
            'horovod does not find the jsrun command.\n\n'
            'Please, make sure you are running on a cluster with jsrun installed or '
            'use one of the other launchers.')

    if nics and 'NCCL_SOCKET_IFNAME' not in env:
        env['NCCL_SOCKET_IFNAME'] = ','.join(nics)

    smpiargs = ' '.join(impl_flags)
    if settings.extra_mpi_args:
        smpiargs = smpiargs + ' ' + settings.extra_mpi_args

    # Explicit binding arguments win; otherwise bind through a generated rankfile
    binding_args = settings.binding_args
    if not binding_args:
        rankfile = generate_jsrun_rankfile(settings)
        if settings.verbose >= 2:
            safe_shell_exec.execute('cat {rf}'.format(rf=rankfile))
        binding_args = '--erf_input {rf}'.format(rf=rankfile)

    if settings.output_filename:
        output_filename_arg = '--stdio_stderr {file} --stdio_stdout {file}'.format(
            file=settings.output_filename)
    else:
        output_filename_arg = ''

    if smpiargs:
        smpiargs_arg = '--smpiargs {args}'.format(args=quote(smpiargs))
    else:
        smpiargs_arg = ''

    quoted_command = ' '.join(quote(par) for par in command)

    jsrun_template = ('jsrun {binding_args} '
                      '{output_filename_arg} '
                      '{smpiargs} '
                      '{command}')
    jsrun_command = jsrun_template.format(
        binding_args=binding_args,
        output_filename_arg=output_filename_arg,
        smpiargs=smpiargs_arg,
        command=quoted_command)

    if settings.verbose >= 2:
        print(jsrun_command)

    # Execute the jsrun command.
    if not settings.run_func_mode:
        os.execve('/bin/sh', ['/bin/sh', '-c', jsrun_command], env)
    else:
        exit_code = safe_shell_exec.execute(jsrun_command,
                                            env=env,
                                            stdout=stdout,
                                            stderr=stderr)
        if exit_code != 0:
            raise RuntimeError(
                "jsrun failed with exit code {exit_code}".format(
                    exit_code=exit_code))
Esempio n. 17
0
def mpi_run(settings, nics, env, command, stdout=None, stderr=None):
    """
    Runs mpi_run.

    Builds one ``mpirun`` shell command line from the settings and either
    executes it through safe_shell_exec (settings.run_func_mode) or hands it
    to os.execve. Several arguments differ depending on whether the detected
    MPI implementation equals _IMPI_IMPL.

    Args:
        settings: Settings for running MPI.
                  Note: settings.num_proc and settings.hosts must not be None.
        nics: Interfaces to include by MPI.
        env: Environment dictionary to use for running command.
        command: Command and arguments to run as a list of string.
        stdout: Stdout of the mpi process.
                Only used when settings.run_func_mode is True.
        stderr: Stderr of the mpi process.
                Only used when settings.run_func_mode is True.

    Raises:
        Exception: if env is neither None nor a dict, if no MPI implementation
                   flags are available, or if the hosts have different slot
                   counts while the implementation equals _IMPI_IMPL.
        RuntimeError: if mpirun exits with a non-zero code in run_func_mode.
    """
    # NOTE(review): env=None passes this check, but `env.keys()` and
    # `var not in env` further down would fail on None - confirm that callers
    # always pass a dict.
    if env is not None and not isinstance(env, dict):
        raise Exception('env argument must be a dict, not {type}: {env}'
                        .format(type=type(env), env=env))

    mpi_impl_flags, impl_binding_args, mpi = _get_mpi_implementation_flags(settings.tcp_flag, env=env)
    if mpi_impl_flags is None:
        raise Exception(_MPI_NOT_FOUND_ERROR_MSG)

    # True when the detected implementation is _IMPI_IMPL (the error message
    # further down calls it Intel(R) MPI); selects the option spelling below.
    impi = _IMPI_IMPL == mpi

    # ssh options that are forwarded to the launcher used by mpirun
    ssh_args = []
    if settings.ssh_port:
        ssh_args += [f'-p {settings.ssh_port}']
    if settings.ssh_identity_file:
        ssh_args += [f'-i {settings.ssh_identity_file}']

    mpi_ssh_args = ''
    if ssh_args:
        joined_ssh_args = ' '.join(ssh_args)
        mpi_ssh_args = f'-bootstrap=ssh -bootstrap-exec-args \"{joined_ssh_args}\"' if impi else f'-mca plm_rsh_args \"{joined_ssh_args}\"'

    # The tcp interface option is only emitted for the non-_IMPI_IMPL case
    tcp_intf_arg = '-mca btl_tcp_if_include {nics}'.format(
        nics=','.join(nics)) if nics and not impi else ''
    nccl_socket_intf_arg = '-{opt} NCCL_SOCKET_IFNAME={nics}'.format(
        opt='genv' if impi else 'x',
        nics=','.join(nics)) if nics else ''

    # On large cluster runs (e.g. Summit), we need extra settings to work around OpenMPI issues
    host_names, host_to_slots = hosts.parse_hosts_and_slots(settings.hosts)
    if not impi and host_names and len(host_names) >= _LARGE_CLUSTER_THRESHOLD:
        mpi_impl_flags.append('-mca plm_rsh_no_tree_spawn true')
        mpi_impl_flags.append('-mca plm_rsh_num_concurrent {}'.format(len(host_names)))

    # if user does not specify any hosts, mpirun by default uses local host.
    # There is no need to specify localhost.
    hosts_arg = '-{opt} {hosts}'.format(opt='hosts' if impi else 'H',
                hosts=','.join(host_names) if host_names and impi else settings.hosts)

    # For _IMPI_IMPL the slot count is passed once as -ppn, so every host has
    # to provide the same number of slots.
    ppn_arg = ' '
    if host_to_slots and impi:
        ppn = host_to_slots[host_names[0]]
        for h_name in host_names[1:]:
            if ppn != host_to_slots[h_name]:
                raise Exception('''Different slots in -hosts parameter are not supported in Intel(R) MPI.
                                 Use -machinefile <machine_file> for this purpose.''')
        ppn_arg = ' -ppn {} '.format(ppn)

    if settings.prefix_output_with_timestamp and not impi:
        mpi_impl_flags.append('--timestamp-output')

    # User supplied binding arguments are only honoured in the non-_IMPI_IMPL case
    binding_args = settings.binding_args if settings.binding_args and not impi else ' '.join(impl_binding_args)

    basic_args = '-l' if impi else '--allow-run-as-root --tag-output'

    output = []
    if settings.output_filename:
        output.append('-outfile-pattern' if impi else '--output-filename')
        output.append(settings.output_filename)

    # One -x option per exportable environment variable, in sorted key order
    env_list = '' if impi else ' '.join(
                    '-x %s' % key for key in sorted(env.keys()) if env_util.is_exportable(key))

    # Pass all the env variables to the mpirun command.
    mpirun_command = (
        'mpirun {basic_args} '
        '-np {num_proc}{ppn_arg}{hosts_arg} '
        '{binding_args} '
        '{mpi_args} '
        '{mpi_ssh_args} '
        '{tcp_intf_arg} '
        '{nccl_socket_intf_arg} '
        '{output_filename_arg} '
        '{env} {extra_mpi_args} {command}'  # expect a lot of environment variables
        .format(basic_args=basic_args,
                num_proc=settings.num_proc,
                ppn_arg=ppn_arg,
                hosts_arg=hosts_arg,
                binding_args=binding_args,
                mpi_args=' '.join(mpi_impl_flags),
                tcp_intf_arg=tcp_intf_arg,
                nccl_socket_intf_arg=nccl_socket_intf_arg,
                mpi_ssh_args=mpi_ssh_args,
                output_filename_arg=' '.join(output),
                env=env_list,
                extra_mpi_args=settings.extra_mpi_args if settings.extra_mpi_args else '',
                command=' '.join(quote(par) for par in command))
    )

    if settings.verbose >= 2:
        print(mpirun_command)

    # we need the driver's PATH and PYTHONPATH in env to run mpirun,
    # env for mpirun is different to env encoded in mpirun_command
    for var in ['PATH', 'PYTHONPATH']:
        if var not in env and var in os.environ:
            # copy env so we do not leak env modifications
            env = copy.copy(env)
            # copy var over from os.environ
            env[var] = os.environ[var]

    # Execute the mpirun command.
    if settings.run_func_mode:
        exit_code = safe_shell_exec.execute(mpirun_command, env=env, stdout=stdout, stderr=stderr)
        if exit_code != 0:
            raise RuntimeError("mpirun failed with exit code {exit_code}".format(exit_code=exit_code))
    else:
        # Hands the command to /bin/sh via os.execve instead of running it as a child
        os.execve('/bin/sh', ['/bin/sh', '-c', mpirun_command], env)
Esempio n. 18
0
 def _run_command(self, command, env, event):
     """Runs ``command`` with ``env`` and stores its exit code in ``self._command_exit_code``.

     ``event`` is passed to the execution as its only event.
     """
     events = [event]
     self._command_exit_code = safe_shell_exec.execute(
         command, env=env, events=events)
Esempio n. 19
0
 def _exec(cmd):
     """Runs ``cmd`` and raises a RuntimeError unless it exits with code 0."""
     exit_code = safe_shell_exec.execute(cmd)
     if exit_code is not None and exit_code == 0:
         return
     raise RuntimeError(
         'executed command returned non-zero exit code: {}'.format(
             exit_code))
Esempio n. 20
0
import os
import sys
import time

from horovod.runner.common.util import safe_shell_exec


class FakeEvent:
    """Stand-in event whose ``wait`` just sleeps for 999 seconds."""

    def wait(self):
        """Blocks the caller by sleeping for 999 seconds."""
        time.sleep(999)


def write(filename, value):
    """Writes ``str(value)`` to ``filename`` through a temporary ``.tmp`` file."""
    tmp_path = filename + '.tmp'
    with open(tmp_path, 'w') as handle:
        handle.write(str(value))

    # Rename into place only after the content is complete, so a reader
    # never sees a partially written file.
    os.rename(tmp_path, filename)


if __name__ == '__main__':
    # The first argument names the file that receives the pid of this process
    pid_file = sys.argv[1]
    write(pid_file, os.getpid())

    # All remaining arguments are run with the current Python interpreter
    child_cmd = ' '.join([sys.executable, *sys.argv[2:]])

    # Mock out the event to avoid leaking semaphores
    safe_shell_exec._create_event = lambda ctx: FakeEvent()

    safe_shell_exec.execute(child_cmd)
Esempio n. 21
0
def mpi_run(settings, nics, env, command, stdout=None, stderr=None):
    """
    Runs mpi_run.

    Builds one ``mpirun`` shell command line from the settings and either
    executes it through safe_shell_exec (settings.run_func_mode) or hands it
    to os.execve.

    Args:
        settings: Settings for running MPI.
                  Note: settings.num_proc and settings.hosts must not be None.
        nics: Interfaces to include by MPI.
        env: Environment dictionary to use for running command.
        command: Command and arguments to run as a list of string.
        stdout: Stdout of the mpi process.
                Only used when settings.run_func_mode is True.
        stderr: Stderr of the mpi process.
                Only used when settings.run_func_mode is True.

    Raises:
        Exception: if env is neither None nor a dict, or if no MPI
                   implementation flags are available.
        RuntimeError: if mpirun exits with a non-zero code in run_func_mode.
    """
    # NOTE(review): env=None passes this check, but `env.keys()` and
    # `var not in env` further down would fail on None - confirm that callers
    # always pass a dict.
    if env is not None and not isinstance(env, dict):
        raise Exception(
            'env argument must be a dict, not {type}: {env}'.format(
                type=type(env), env=env))

    mpi_impl_flags, impl_binding_args = _get_mpi_implementation_flags(
        settings.tcp_flag, env=env)
    if mpi_impl_flags is None:
        raise Exception(_MPI_NOT_FOUND_ERROR_MSG)

    # Forward a custom ssh port through the plm_rsh_args MCA parameter
    ssh_port_arg = '-mca plm_rsh_args \"-p {ssh_port}\"'.format(
        ssh_port=settings.ssh_port) if settings.ssh_port else ''

    # if user does not specify any hosts, mpirun by default uses local host.
    # There is no need to specify localhost.
    hosts_arg = '-H {hosts}'.format(hosts=settings.hosts)

    tcp_intf_arg = '-mca btl_tcp_if_include {nics}'.format(
        nics=','.join(nics)) if nics else ''
    nccl_socket_intf_arg = '-x NCCL_SOCKET_IFNAME={nics}'.format(
        nics=','.join(nics)) if nics else ''

    # On large cluster runs (e.g. Summit), we need extra settings to work around OpenMPI issues
    host_names, _ = hosts.parse_hosts_and_slots(settings.hosts)
    if host_names and len(host_names) >= _LARGE_CLUSTER_THRESHOLD:
        mpi_impl_flags.append('-mca plm_rsh_no_tree_spawn true')
        mpi_impl_flags.append('-mca plm_rsh_num_concurrent {}'.format(
            len(host_names)))

    # User supplied binding arguments replace the implementation defaults
    binding_args = settings.binding_args if settings.binding_args else ' '.join(
        impl_binding_args)

    # Pass all the env variables to the mpirun command.
    mpirun_command = (
        'mpirun --allow-run-as-root --tag-output '
        '-np {num_proc} {hosts_arg} '
        '{binding_args} '
        '{mpi_args} '
        '{ssh_port_arg} '
        '{tcp_intf_arg} '
        '{nccl_socket_intf_arg} '
        '{output_filename_arg} '
        '{env} {extra_mpi_args} {command}'  # expect a lot of environment variables
        .format(num_proc=settings.num_proc,
                hosts_arg=hosts_arg,
                binding_args=binding_args,
                mpi_args=' '.join(mpi_impl_flags),
                tcp_intf_arg=tcp_intf_arg,
                nccl_socket_intf_arg=nccl_socket_intf_arg,
                ssh_port_arg=ssh_port_arg,
                output_filename_arg='--output-filename ' +
                settings.output_filename if settings.output_filename else '',
                env=' '.join('-x %s' % key for key in sorted(env.keys())
                             if env_util.is_exportable(key)),
                extra_mpi_args=settings.extra_mpi_args
                if settings.extra_mpi_args else '',
                command=' '.join(quote(par) for par in command)))

    if settings.verbose >= 2:
        print(mpirun_command)

    # we need the driver's PATH and PYTHONPATH in env to run mpirun,
    # env for mpirun is different to env encoded in mpirun_command
    for var in ['PATH', 'PYTHONPATH']:
        if var not in env and var in os.environ:
            # copy env so we do not leak env modifications
            env = copy.copy(env)
            # copy var over from os.environ
            env[var] = os.environ[var]

    # Execute the mpirun command.
    if settings.run_func_mode:
        exit_code = safe_shell_exec.execute(mpirun_command,
                                            env=env,
                                            stdout=stdout,
                                            stderr=stderr)
        if exit_code != 0:
            raise RuntimeError(
                "mpirun failed with exit code {exit_code}".format(
                    exit_code=exit_code))
    else:
        # Hands the command to /bin/sh via os.execve instead of running it as a child
        os.execve('/bin/sh', ['/bin/sh', '-c', mpirun_command], env)