Python Settings Exemples, horovod.runner.common.util.settings.Settings Python Exemples

Exemple #1

0

Afficher le fichier

    def test_generate_jsrun_rankfile(self):
        """Check the rankfile generated for 5 processes on three 4-slot hosts."""
        job_settings = hvd_settings.Settings(
            num_proc=5,
            hosts='host1:4,host2:4,host3:4',
        )

        # Ranks 0-3 fill host1, the one remaining rank lands on host2.
        expected_rankfile = '\n'.join([
            'overlapping_rs: allow',
            'cpu_index_using: logical',
            '',
            'rank: 0: { hostname: host1; cpu: {0-3} ; gpu: * ; mem: * }',
            'rank: 1: { hostname: host1; cpu: {4-7} ; gpu: * ; mem: * }',
            'rank: 2: { hostname: host1; cpu: {8-11} ; gpu: * ; mem: * }',
            'rank: 3: { hostname: host1; cpu: {12-15} ; gpu: * ; mem: * }',
            '',
            'rank: 4: { hostname: host2; cpu: {0-3} ; gpu: * ; mem: * }',
            '',  # the file ends with a newline
        ])

        with temppath() as scratch_path:
            generated_path = generate_jsrun_rankfile(job_settings, scratch_path)

            with open(generated_path, 'r') as handle:
                gen_rankfile = handle.read()

            self.assertMultiLineEqual(gen_rankfile, expected_rankfile)

Exemple #2

0

Afficher le fichier

    def test_js_run(self):
        """js_run must hand the expected jsrun command line to the shell executor."""
        if _get_mpi_implementation_flags(False)[0] is None:
            self.skipTest("MPI is not available")

        cmd = ['cmd', 'arg1', 'arg2']
        env = {'env1': 'val1', 'env2': 'val2'}
        stdout = '<stdout>'
        stderr = '<stderr>'
        settings = hvd_settings.Settings(
            verbose=0,
            extra_mpi_args='>mpi-extra args go here<',
            num_proc=4,
            hosts='localhost:2,127.0.0.1:2',
            output_filename='>output filename goes here<',
            run_func_mode=True
        )

        def fake_impl_flags(tcp, env=None):
            # Stand-in for the real MPI implementation lookup.
            return ["--mock-mpi-impl-flags"], []

        flags_patch = mock.patch("horovod.runner.js_run._get_mpi_implementation_flags",
                                 side_effect=fake_impl_flags)
        execute_patch = mock.patch("horovod.runner.js_run.safe_shell_exec.execute",
                                   return_value=0)
        with flags_patch, execute_patch as execute:
            js_run(settings, None, env, cmd, stdout=stdout, stderr=stderr)

            # Ask the mocked _get_mpi_implementation_flags for the flags js_run saw.
            mpi_flags, _ = horovod.runner.js_run._get_mpi_implementation_flags(False)
            self.assertIsNotNone(mpi_flags)

            smpi_args = "'{} >mpi-extra args go here<'".format(' '.join(mpi_flags))
            expected_command = ' '.join([
                'jsrun',
                '--erf_input /tmp/rankfile',
                '--stdio_stderr >output filename goes here<',
                '--stdio_stdout >output filename goes here<',
                '--smpiargs ' + smpi_args,
                'cmd arg1 arg2',
            ])
            expected_env = {'env1': 'val1', 'env2': 'val2'}
            execute.assert_called_once_with(expected_command,
                                            env=expected_env,
                                            stdout=stdout,
                                            stderr=stderr)

Exemple #3

0

Afficher le fichier

    def test_mpi_run_full(self):
        """mpi_run with all settings populated must build the full mpirun command."""
        if not mpi_available():
            self.skipTest("MPI is not available")

        cmd = ['cmd', 'arg1', 'arg2']
        nics = ['eth0', 'eth1']
        env = {'env1': 'val1', 'env2': 'val2'}
        stdout = '<stdout>'
        stderr = '<stderr>'
        start_timeout = timeout.Timeout(5, message='Timed out waiting for something.')
        settings = hvd_settings.Settings(
            verbose=0,
            ssh_port=1022,
            extra_mpi_args='>mpi-extra args go here<',
            binding_args='>binding args go here<',
            key=secret.make_secret_key(),
            start_timeout=start_timeout,
            num_proc=1,
            hosts='localhost:1',
            output_filename='>output filename goes here<',
            run_func_mode=True
        )

        def fake_impl_flags(tcp, env=None):
            # Stand-in for the real MPI implementation lookup.
            return ["--mock-mpi-impl-flags"], []

        flags_patch = mock.patch("horovod.runner.mpi_run._get_mpi_implementation_flags",
                                 side_effect=fake_impl_flags)
        execute_patch = mock.patch("horovod.runner.mpi_run.safe_shell_exec.execute",
                                   return_value=0)
        with flags_patch as impl, execute_patch as execute:
            mpi_run(settings, nics, env, cmd, stdout=stdout, stderr=stderr)

            # mpi_run must have looked up the implementation flags exactly once
            impl.assert_called_once_with(None, env=env)

            # fetch the mocked flags ourselves to assemble the expected command
            mpi_flags, _ = horovod.runner.mpi_run._get_mpi_implementation_flags(False)
            self.assertIsNotNone(mpi_flags)
            expected_command = ' '.join([
                'mpirun',
                '--allow-run-as-root --tag-output',
                '-np 1 -H ' + settings.hosts,
                '>binding args go here<',
                ' '.join(mpi_flags),
                '-mca plm_rsh_args "-p 1022"',
                '-mca btl_tcp_if_include eth0,eth1 -x NCCL_SOCKET_IFNAME=eth0,eth1',
                '--output-filename >output filename goes here<',
                '-x env1 -x env2',
                '>mpi-extra args go here<',
                'cmd arg1 arg2',
            ])

            # Drop PYTHONPATH from the env passed to execute: its exact value
            # cannot be known here, and the handling of PYTHONPATH is covered
            # by the test_mpi_run_*pythonpath* tests.
            self.assertIn('env', execute.call_args.kwargs)
            execute.call_args.kwargs['env'].pop('PYTHONPATH', None)

            expected_env = {'env1': 'val1', 'env2': 'val2', 'PATH': os.environ.get('PATH')}
            execute.assert_called_once_with(expected_command,
                                            env=expected_env,
                                            stdout=stdout,
                                            stderr=stderr)

Exemple #4

0

Afficher le fichier

Fichier : launch.py Projet : tomzhang/horovod

def _run_static(args):
    """Launch a Horovod job on a fixed set of hosts.

    Builds the run settings from the parsed arguments, checks that every
    remote host can be reached over ssh, determines the common network
    interfaces and then launches the job.

    Args:
        args: Parsed command line arguments. The attributes read here include
            ``np``, ``hosts``, ``nics``, ``ssh_port``, ``ssh_identity_file``,
            ``start_timeout``, ``disable_cache``, ``run_func`` and
            ``command``.

    Returns:
        A list holding one result per process when ``args.run_func`` is set,
        otherwise ``None``.

    Raises:
        RuntimeError: If the ssh check of the remote hosts does not succeed.
    """
    nics_set = set(args.nics.split(',')) if args.nics else None

    # horovodrun has to finish all the checks before this timeout runs out.
    if args.start_timeout:
        start_timeout = args.start_timeout
    else:
        # Lookup default timeout from the environment variable.
        start_timeout = int(os.getenv('HOROVOD_START_TIMEOUT', '30'))

    tmout = timeout.Timeout(start_timeout,
                            message='Timed out waiting for {activity}. Please '
                            'check connectivity between servers. You '
                            'may need to increase the --start-timeout '
                            'parameter if you have too many servers.')
    settings = hvd_settings.Settings(verbose=2 if args.verbose else 0,
                                     ssh_port=args.ssh_port,
                                     ssh_identity_file=args.ssh_identity_file,
                                     extra_mpi_args=args.mpi_args,
                                     tcp_flag=args.tcp_flag,
                                     binding_args=args.binding_args,
                                     key=secret.make_secret_key(),
                                     start_timeout=tmout,
                                     num_proc=args.np,
                                     hosts=args.hosts,
                                     output_filename=args.output_filename,
                                     run_func_mode=args.run_func is not None,
                                     nics=nics_set)

    # This cache stores the results of checks performed by horovod
    # during the initialization step. It can be disabled by setting
    # --disable-cache flag.
    fn_cache = None
    if not args.disable_cache:
        # Hash the repr of the whole parameter tuple instead of a plain string
        # concatenation: without separators and with absent fields skipped,
        # different parameter sets could produce the same key (for example
        # port 22 + identity file '2x' versus port 222 + identity file 'x').
        cache_key = repr((args.np, args.hosts, args.ssh_port,
                          args.ssh_identity_file))
        parameters_hash = hashlib.md5(cache_key.encode('utf-8')).hexdigest()
        fn_cache = cache.Cache(CACHE_FOLDER, CACHE_STALENESS_THRESHOLD_MINUTES,
                               parameters_hash)

    all_host_names, _ = hosts.parse_hosts_and_slots(args.hosts)
    if settings.verbose >= 2:
        print('Filtering local host names.')
    remote_host_names = network.filter_local_addresses(all_host_names)
    if settings.verbose >= 2:
        print('Remote host found: ' + ' '.join(remote_host_names))

    if len(remote_host_names) > 0:
        if settings.verbose >= 2:
            print('Checking ssh on all remote hosts.')
        # Check if we can ssh into all remote hosts successfully.
        if not _check_all_hosts_ssh_successful(remote_host_names,
                                               args.ssh_port,
                                               args.ssh_identity_file,
                                               fn_cache=fn_cache):
            raise RuntimeError('could not connect to some hosts via ssh')
        if settings.verbose >= 2:
            print('SSH was successful into all the remote hosts.')

    nics = driver_service.get_common_interfaces(settings, all_host_names,
                                                remote_host_names, fn_cache)

    if args.run_func:
        # get the driver IPv4 address
        driver_ip = network.get_driver_ip(nics)
        # The function to run is published through a key-value store server;
        # the launched tasks are given its address and port on their command line.
        run_func_server = KVStoreServer(verbose=settings.verbose)
        run_func_server_port = run_func_server.start_server()
        put_data_into_kvstore(driver_ip, run_func_server_port, 'runfunc',
                              'func', args.run_func)

        command = [
            sys.executable, '-m', 'horovod.runner.run_task',
            str(driver_ip),
            str(run_func_server_port)
        ]

        try:
            _launch_job(args, settings, nics, command)
            results = [None] * args.np
            # TODO: make it parallel to improve performance
            for i in range(args.np):
                results[i] = read_data_from_kvstore(driver_ip,
                                                    run_func_server_port,
                                                    'runfunc_result', str(i))
            return results
        finally:
            # Always stop the key-value store server, even if the job failed.
            run_func_server.shutdown_server()
    else:
        command = args.command
        _launch_job(args, settings, nics, command)
        return None

Exemple #5

0

Afficher le fichier

Fichier : test_run.py Projet : y742035557/horovod

class RunTests(unittest.TestCase):
    """
    Tests for horovod.runner.
    """
    def __init__(self, *args, **kwargs):
        """Initialise the test case and set the 'module' warnings filter."""
        super().__init__(*args, **kwargs)
        warnings.simplefilter('module')

    def test_params_args(self):
        """Tunable parameter flags must end up in the environment dictionary."""
        cli = [
            'horovodrun', '-np', '2',
            '--fusion-threshold-mb', '10',
            '--cycle-time-ms', '20',
            '--cache-capacity', '512',
            '--hierarchical-allreduce',
            '--hierarchical-allgather',
        ]
        with override_args(*cli):
            parsed = parse_args()
            env = {}
            config_parser.set_env_from_args(env, parsed)

            expected_env = [
                # the threshold is given in MB on the command line
                (config_parser.HOROVOD_FUSION_THRESHOLD, str(10 * 1024 * 1024)),
                (config_parser.HOROVOD_CYCLE_TIME, '20.0'),
                (config_parser.HOROVOD_CACHE_CAPACITY, '512'),
                (config_parser.HOROVOD_HIERARCHICAL_ALLREDUCE, '1'),
                (config_parser.HOROVOD_HIERARCHICAL_ALLGATHER, '1'),
            ]
            for variable, value in expected_env:
                self.assertEqual(env.get(variable), value)

    def test_autotune_args(self):
        """Autotune flags must end up in the environment dictionary."""
        cli = [
            'horovodrun', '-np', '2', '--autotune',
            '--autotune-log-file', '/tmp/autotune.txt',
            '--autotune-warmup-samples', '1',
            '--autotune-steps-per-sample', '5',
            '--autotune-bayes-opt-max-samples', '10',
            '--autotune-gaussian-process-noise', '0.2',
        ]
        with override_args(*cli):
            parsed = parse_args()
            env = {}
            config_parser.set_env_from_args(env, parsed)

            expected_env = [
                (config_parser.HOROVOD_AUTOTUNE, '1'),
                (config_parser.HOROVOD_AUTOTUNE_LOG, '/tmp/autotune.txt'),
                (config_parser.HOROVOD_AUTOTUNE_WARMUP_SAMPLES, '1'),
                (config_parser.HOROVOD_AUTOTUNE_STEPS_PER_SAMPLE, '5'),
                (config_parser.HOROVOD_AUTOTUNE_BAYES_OPT_MAX_SAMPLES, '10'),
                (config_parser.HOROVOD_AUTOTUNE_GAUSSIAN_PROCESS_NOISE, '0.2'),
            ]
            for variable, value in expected_env:
                self.assertEqual(env.get(variable), value)

    def test_autotuning_with_fixed_param(self):
        """With autotune on, only explicitly given parameters are exported."""
        cli = [
            'horovodrun', '-np', '2', '--autotune',
            '--cache-capacity', '1024',
            '--no-hierarchical-allgather',
        ]
        with override_args(*cli):
            parsed = parse_args()
            env = {}
            config_parser.set_env_from_args(env, parsed)

            # parameters that were not given on the command line stay unset
            for unset in (config_parser.HOROVOD_FUSION_THRESHOLD,
                          config_parser.HOROVOD_CYCLE_TIME):
                self.assertNotIn(unset, env)
            self.assertEqual(env.get(config_parser.HOROVOD_CACHE_CAPACITY),
                             '1024')
            self.assertNotIn(config_parser.HOROVOD_HIERARCHICAL_ALLREDUCE, env)
            self.assertEqual(
                env.get(config_parser.HOROVOD_HIERARCHICAL_ALLGATHER), '0')

    def test_timeline_args(self):
        """Timeline flags must end up in the environment dictionary."""
        cli = ['horovodrun', '-np', '2',
               '--timeline-filename', '/tmp/timeline.json',
               '--timeline-mark-cycles']
        with override_args(*cli):
            parsed = parse_args()
            env = {}
            config_parser.set_env_from_args(env, parsed)

            timeline_file = env.get(config_parser.HOROVOD_TIMELINE)
            self.assertEqual(timeline_file, '/tmp/timeline.json')
            mark_cycles = env.get(config_parser.HOROVOD_TIMELINE_MARK_CYCLES)
            self.assertEqual(mark_cycles, '1')

    def test_stall_check_args(self):
        """Stall check flags must end up in the environment dictionary."""

        def env_for(*flags):
            # Parse the given horovodrun flags and return the resulting env.
            with override_args('horovodrun', '-np', '2', *flags):
                parsed = parse_args()
                env = {}
                config_parser.set_env_from_args(env, parsed)
            return env

        # disabling the stall check
        disabled_env = env_for('--no-stall-check')
        self.assertEqual(
            disabled_env.get(config_parser.HOROVOD_STALL_CHECK_DISABLE), '1')

        # configuring the stall check leaves it enabled
        timed_env = env_for('--stall-check-warning-time-seconds', '10',
                            '--stall-check-shutdown-time-seconds', '20')
        self.assertNotIn(config_parser.HOROVOD_STALL_CHECK_DISABLE, timed_env)
        self.assertEqual(
            timed_env.get(config_parser.HOROVOD_STALL_CHECK_TIME_SECONDS), '10')
        self.assertEqual(
            timed_env.get(config_parser.HOROVOD_STALL_SHUTDOWN_TIME_SECONDS),
            '20')

    def test_library_args(self):
        """Library option flags must end up in the environment dictionary."""
        cli = [
            'horovodrun', '-np', '2',
            '--mpi-threads-disable',
            '--num-nccl-streams', '2',
            '--ccl-bgt-affinity', '1',
            '--gloo-timeout-seconds', '60',
        ]
        with override_args(*cli):
            parsed = parse_args()
            env = {}
            config_parser.set_env_from_args(env, parsed)

            expected_env = [
                (config_parser.HOROVOD_MPI_THREADS_DISABLE, '1'),
                (config_parser.HOROVOD_NUM_NCCL_STREAMS, '2'),
                (config_parser.HOROVOD_CCL_BGT_AFFINITY, '1'),
                (config_parser.HOROVOD_GLOO_TIMEOUT_SECONDS, '60'),
            ]
            for variable, value in expected_env:
                self.assertEqual(env.get(variable), value)

    def test_logging_args(self):
        """Logging flags must end up in the environment dictionary."""
        cli = ['horovodrun', '-np', '2',
               '--log-level', 'INFO',
               '--log-hide-timestamp']
        with override_args(*cli):
            parsed = parse_args()
            env = {}
            config_parser.set_env_from_args(env, parsed)

            log_level = env.get(config_parser.HOROVOD_LOG_LEVEL)
            self.assertEqual(log_level, 'INFO')
            hide_time = env.get(config_parser.HOROVOD_LOG_HIDE_TIME)
            self.assertEqual(hide_time, '1')

    def test_config_file(self):
        """Values from the test config file must show up on the parsed args."""
        here = os.path.dirname(__file__)
        config_filename = os.path.join(here, 'data/config.test.yaml')

        with override_args('horovodrun', '-np', '2', '--config-file',
                           config_filename):
            parsed = parse_args()

            def check_equal(**expected):
                # Compare each named attribute of the parsed args, in order.
                for attribute, value in expected.items():
                    self.assertEqual(getattr(parsed, attribute), value)

            self.assertTrue(parsed.use_gloo)

            # Params
            check_equal(fusion_threshold_mb=32,
                        cycle_time_ms=10,
                        cache_capacity=2048)
            self.assertTrue(parsed.hierarchical_allreduce)
            self.assertTrue(parsed.hierarchical_allgather)

            # Autotune
            self.assertTrue(parsed.autotune)
            check_equal(autotune_log_file='horovod_autotune_log.txt',
                        autotune_warmup_samples=5,
                        autotune_steps_per_sample=20,
                        autotune_bayes_opt_max_samples=50,
                        autotune_gaussian_process_noise=0.9)

            # Timeline
            check_equal(timeline_filename='horovod_timeline.json')
            self.assertTrue(parsed.timeline_mark_cycles)

            # Stall Check
            self.assertFalse(parsed.no_stall_check)
            check_equal(stall_check_warning_time_seconds=120,
                        stall_check_shutdown_time_seconds=240)

            # Library Options
            self.assertTrue(parsed.mpi_threads_disable)
            check_equal(num_nccl_streams=2,
                        ccl_bgt_affinity=1,
                        gloo_timeout_seconds=60)

            # Logging
            check_equal(log_level='INFO')
            self.assertTrue(parsed.log_hide_timestamp)

    def test_config_file_override_args(self):
        """Command line values must be the ones present on the parsed args."""
        here = os.path.dirname(__file__)
        config_filename = os.path.join(here, 'data/config.test.yaml')

        # one override is given before --config-file, one after it
        cli = [
            'horovodrun', '-np', '2',
            '--fusion-threshold-mb', '128',
            '--config-file', config_filename,
            '--cycle-time-ms', '20',
        ]
        with override_args(*cli):
            parsed = parse_args()
            self.assertEqual(parsed.fusion_threshold_mb, 128)
            self.assertEqual(parsed.cycle_time_ms, 20)

    def test_validate_config_args(self):
        """A negative fusion threshold must be rejected with a ValueError."""
        cli = ['horovodrun', '-np', '2', '--fusion-threshold-mb', '-1']
        with override_args(*cli), pytest.raises(ValueError):
            parse_args()

    # test_on_event tests in_thread as well, but it does not test args
    def test_in_thread_args(self):
        """in_thread must pass tuple args on to the function and reject non-tuples."""
        accepted_calls = [
            ((1, ), {}),
            ((1, 2), {}),
            ((1, 2), {'silent': True}),
        ]
        for call_args, options in accepted_calls:
            fn = mock.Mock()
            worker = in_thread(fn, args=call_args, **options)
            worker.join(1.0)
            self.assertFalse(worker.is_alive())
            fn.assert_called_once_with(*call_args)

        # anything but a tuple is refused before the function is ever called
        fn = mock.Mock()
        expected_error = ("^args must be a tuple, not <(class|type) 'int'>, "
                          "for a single argument use \\(arg,\\)$")
        with pytest.raises(ValueError, match=expected_error):
            in_thread(fn, args=1)
        fn.assert_not_called()

    def test_on_event(self):
        """Exercise on_event: normal runs, stop events, exceptions and bad arguments."""
        # a happy run without args and stop event
        event = threading.Event()
        fn = mock.Mock()
        thread = on_event(event, fn)
        fn.assert_not_called()
        event.set()
        thread.join(1.0)
        self.assertFalse(thread.is_alive())
        fn.assert_called_once()

        # a happy run with args but without stop event
        event = threading.Event()
        fn = mock.Mock()
        thread = on_event(event, fn, ('a', 1))
        fn.assert_not_called()
        event.set()
        thread.join(1.0)
        self.assertFalse(thread.is_alive())
        fn.assert_called_once()
        fn.assert_called_once_with('a', 1)

        # a happy run with stop event but unused
        event = threading.Event()
        stop = threading.Event()
        fn = mock.Mock()
        thread = on_event(event, fn, stop=stop, check_stop_interval_s=0.01)
        fn.assert_not_called()
        event.set()
        thread.join(1.0)
        self.assertFalse(thread.is_alive())
        fn.assert_called_once()
        # setting stop after the thread has finished must not trigger another call
        stop.set()
        time.sleep(0.1)
        fn.assert_called_once()

        # stop the thread before we set the event
        event = threading.Event()
        stop = threading.Event()
        fn = mock.Mock()
        thread = on_event(event, fn, stop=stop, check_stop_interval_s=0.01)
        fn.assert_not_called()
        stop.set()
        thread.join(1.0)
        self.assertFalse(thread.is_alive())
        fn.assert_not_called()
        # setting the event after the thread was stopped must not call fn
        event.set()
        time.sleep(0.1)
        fn.assert_not_called()

        # test with exception
        def exception():
            raise Exception("Test Exception")

        event = threading.Event()
        fn = mock.Mock(side_effect=exception)
        thread = on_event(event, fn)
        fn.assert_not_called()
        event.set()
        thread.join(1.0)
        self.assertFalse(thread.is_alive())
        fn.assert_called_once()

        # test with exception but silent
        # NOTE(review): this scenario is identical to the previous one and never
        # passes a silent flag; presumably on_event(..., silent=True) was
        # intended -- confirm against the on_event signature.
        event = threading.Event()
        fn = mock.Mock(side_effect=exception)
        thread = on_event(event, fn)
        fn.assert_not_called()
        event.set()
        thread.join(1.0)
        self.assertFalse(thread.is_alive())
        fn.assert_called_once()

        # test None event
        event = None
        fn = mock.Mock()
        with pytest.raises(ValueError, match="^Event must not be None$"):
            on_event(event, fn)
        fn.assert_not_called()

        # test non-tuple args
        event = threading.Event()
        fn = mock.Mock()
        with pytest.raises(
                ValueError,
                match="^args must be a tuple, not <(class|type) 'int'>, "
                "for a single argument use \\(arg,\\)$"):
            on_event(event, fn, args=1)
        fn.assert_not_called()

        # test None stop and non-daemon
        event = threading.Event()
        fn = mock.Mock()
        with pytest.raises(
                ValueError,
                match="^Stop event must be given for non-daemon event thread$"
        ):
            on_event(event, fn, stop=None, daemon=False)
        fn.assert_not_called()

    def test_safe_shell_exec_captures_stdout(self):
        """Output written to stdout must be captured."""
        self.do_test_safe_shell_exec(cmd='echo hello',
                                     expected_exit_code=0,
                                     expected_stdout='hello\n',
                                     expected_stderr='')

    def test_safe_shell_exec_captures_stderr(self):
        """Output written to stderr must be captured."""
        self.do_test_safe_shell_exec(cmd='echo hello >&2',
                                     expected_exit_code=0,
                                     expected_stdout='',
                                     expected_stderr='hello\n')

    def test_safe_shell_exec_captures_last_line_wo_eol(self):
        """A final line without a trailing newline must still be captured."""
        shell_command = 'bash -c "echo -e -n \\"hello\nstdout\\"; echo -e -n \\"hello\nstderr\\" >&2"'
        self.do_test_safe_shell_exec(cmd=shell_command,
                                     expected_exit_code=0,
                                     expected_stdout='hello\nstdout',
                                     expected_stderr='hello\nstderr')

    def test_safe_shell_exec_returns_exit_code(self):
        """The exit code of the executed command must be returned."""
        self.do_test_safe_shell_exec(cmd='false',
                                     expected_exit_code=1,
                                     expected_stdout='',
                                     expected_stderr='')

    @pytest.mark.skip(reason='https://github.com/horovod/horovod/issues/1993')
    def test_safe_shell_exec_interrupts_on_event(self):
        """Setting the interrupt event must end the command before it finishes."""
        # set the interrupt event one second from now
        interrupt = threading.Event()
        interrupt_delay = 1.0
        delay(interrupt.set, interrupt_delay)

        # sleep long enough that only the interrupt can end the command in time
        sleep_s = interrupt_delay + safe_shell_exec.GRACEFUL_TERMINATION_TIME_S + 2.0
        started_at = time.time()
        self.do_test_safe_shell_exec(cmd='sleep {}'.format(sleep_s),
                                     expected_exit_code=143,
                                     expected_stdout='',
                                     expected_stderr=None,
                                     event=interrupt)
        elapsed = time.time() - started_at

        self.assertGreaterEqual(elapsed, interrupt_delay)
        self.assertLess(elapsed, sleep_s - 1.0, 'sleep should not finish')

    def test_safe_shell_exec_interrupts_on_parent_shutdown(self):
        """Killing the parent process must make its child process exit as well."""
        sleep_s = 20
        here = os.path.dirname(__file__)
        parent_script = os.path.join(here, 'data/run_safe_shell_exec.py')
        child_script = os.path.join(here, 'data/sleep.py')

        def get_pid(logfile):
            # Block until the script has reported its PID through the logfile.
            wait(lambda: os.path.exists(logfile), timeout=5)
            with open(logfile, 'r') as pid_file:
                return int(pid_file.read())

        with temppath() as parent_logfile, temppath() as child_logfile:
            # The scripts have to run in a completely separate interpreter that
            # shares as little state as possible with this one, otherwise the
            # semaphore tracker causes trouble.
            argv = [sys.executable, parent_script, parent_logfile,
                    child_script, str(sleep_s), child_logfile]
            launcher = subprocess.Popen(' '.join(argv), shell=True)

            parent = psutil.Process(get_pid(parent_logfile))
            child = psutil.Process(get_pid(child_logfile))

            self.assertTrue(parent.is_running())
            self.assertTrue(child.is_running())

            # Hard kill the parent process
            parent.kill()
            parent.wait(timeout=safe_shell_exec.GRACEFUL_TERMINATION_TIME_S)
            launcher.wait()

            # Child process will exit when pipe breaks
            child_timeout = 2 * safe_shell_exec.GRACEFUL_TERMINATION_TIME_S + 1
            child.wait(timeout=child_timeout)

            self.assertFalse(parent.is_running())
            self.assertFalse(child.is_running())

    def do_test_safe_shell_exec(self,
                                cmd,
                                expected_exit_code,
                                expected_stdout,
                                expected_stderr,
                                event=None):
        """Run cmd through safe_shell_exec.execute and check its outcome.

        :param cmd: command handed to safe_shell_exec.execute
        :param expected_exit_code: value execute is expected to return
        :param expected_stdout: expected captured stdout, None skips the check
        :param expected_stderr: expected captured stderr, None skips the check
        :param event: optional event passed to execute as the only entry of events
        """
        captured_out = io.StringIO()
        captured_err = io.StringIO()

        if event:
            events = [event]
        else:
            events = None

        exit_code = safe_shell_exec.execute(cmd,
                                            stdout=captured_out,
                                            stderr=captured_err,
                                            events=events)
        self.assertEqual(expected_exit_code, exit_code)

        checks = ((expected_stdout, captured_out),
                  (expected_stderr, captured_err))
        for expected_text, stream in checks:
            if expected_text is not None:
                self.assertEqual(expected_text, stream.getvalue())

    def test_hash(self):
        """_hash must return the known digest for a fixed input string."""
        digest = _hash("test string")
        self.assertEqual(digest, '6f8db599de986fab7a21625b7916589c')

    def test_host_hash(self):
        """A non-empty salt changes the host hash, an empty salt does not."""
        unsalted = host_hash()
        salted = host_hash('salt')
        empty_salted = host_hash('')

        self.assertNotEqual(salted, unsalted)
        self.assertEqual(empty_salted, unsalted)

    def test_get_mpi_implementation(self):
        """The MPI implementation must be derived from the 'mpirun --version' result."""
        open_mpi_output = ("mpirun (Open MPI) 2.1.1\n"
                           "Report bugs to http://www.open-mpi.org/community/help/\n")
        hydra_output = ("HYDRA build details:\n"
                        "    Version:           3.3a2\n"
                        "    Configure options: 'MPICHLIB_CFLAGS=-g -O2'\n")

        # (output of the shell call, its exit code, expected implementation);
        # an output of None stands for the shell call returning nothing at all
        cases = [
            (open_mpi_output, 0, _OMPI_IMPL),
            ("OpenRTE", 0, _OMPI_IMPL),
            ("IBM Spectrum MPI", 0, _SMPI_IMPL),
            (hydra_output, 0, _MPICH_IMPL),
            ("Unknown MPI v1.00", 0, _UNKNOWN_IMPL),
            ("output", 1, _MISSING_IMPL),
            (None, 0, _MISSING_IMPL),
        ]

        for output, exit_code, expected in cases:
            shell_result = None if output is None else (output, exit_code)
            env = {'VAR': 'val'}
            with mock.patch("horovod.runner.mpi_run.tiny_shell_exec.execute",
                            return_value=shell_result) as shell:
                detected = _get_mpi_implementation(env)
                self.assertEqual(expected, detected)
                shell.assert_called_once_with('mpirun --version', env)

    def test_run_controller(self):
        """run_controller must pick the right launcher for every flag combination."""
        gloo_not_built = r'^Gloo support has not been built\.  If this is not expected, ensure CMake is installed ' \
                         r'and reinstall Horovod with HOROVOD_WITH_GLOO=1 to debug the build error\.$'
        mpi_not_built = r'^MPI support has not been built\.  If this is not expected, ensure MPI is installed ' \
                        r'and reinstall Horovod with HOROVOD_WITH_MPI=1 to debug the build error\.$'
        no_lsf_job = 'Horovod did not detect an LSF job.  The jsrun launcher can only be used in that environment. ' \
                     'Please, pick a different launcher for other environments.'
        nothing_built = r'Neither MPI nor Gloo support has been built\. Try reinstalling Horovod ensuring that ' \
                        r'either MPI is installed \(MPI\) or CMake is installed \(Gloo\)\.'

        def test(use_gloo, use_mpi, use_js, gloo_is_built, mpi_is_built,
                 lsf_exists, jsrun_installed, expected, exception):
            # one mock launcher per framework, in the order they are checked
            launchers = {
                'gloo': MagicMock(),
                'mpi': MagicMock(),
                'js': MagicMock(),
            }

            def invoke():
                run_controller(use_gloo,
                               launchers['gloo'],
                               use_mpi,
                               launchers['mpi'],
                               use_js,
                               launchers['js'],
                               verbosity=2)

            with is_built(gloo_is_built, mpi_is_built), \
                    lsf_and_jsrun(lsf_exists, jsrun_installed):
                if exception is not None:
                    with pytest.raises(ValueError, match=exception):
                        invoke()
                    return
                invoke()

            if expected not in launchers:
                raise ValueError("unsupported framework: {}".format(expected))
            for framework, launcher in launchers.items():
                if framework == expected:
                    launcher.assert_called_once()
                else:
                    launcher.assert_not_called()

        def predict(use_gloo, use_mpi, use_js, gloo_is_built, mpi_is_built,
                    lsf_exists, jsrun_installed):
            # Returns (expected launcher, expected exception pattern);
            # exactly one of the two is not None.
            if use_gloo:
                return ('gloo', None) if gloo_is_built else (None, gloo_not_built)
            if use_mpi:
                return ('mpi', None) if mpi_is_built else (None, mpi_not_built)
            if use_js:
                if not mpi_is_built:
                    return None, mpi_not_built
                if lsf_exists:
                    return 'js', None
                return None, no_lsf_job
            # no launcher requested explicitly: fall back to what has been built
            if mpi_is_built:
                if lsf_exists and jsrun_installed:
                    return 'js', None
                return 'mpi', None
            if gloo_is_built:
                return 'gloo', None
            return None, nothing_built

        bool_values = [False, True]
        bool_values_and_none = [None, False, True]

        combinations = itertools.product(
            bool_values_and_none, bool_values_and_none, bool_values_and_none,
            bool_values, bool_values,
            bool_values, bool_values)
        for combination in combinations:
            expected, exception = predict(*combination)
            test(*combination, expected, exception)

    """
    Minimal mpi_run settings for tests.
    """
    # Baseline Settings object shared by the tests below through
    # `self.minimal_settings`: two processes on localhost, verbosity 0,
    # run_func_mode enabled.
    minimal_settings = hvd_settings.Settings(
        verbose=0,
        num_proc=2,
        hosts='localhost:2',
        run_func_mode=True,
    )
    """
    Tests mpi_run with minimal settings.
    """

    def test_mpi_run_minimal(self):
        """Verify the command and env that mpi_run hands to execute for the minimal settings.

        Both the MPI implementation-flag lookup and the shell executor are
        patched, so no real mpirun process is started.
        """
        if not mpi_available():
            self.skipTest("MPI is not available")

        def fake_impl_flags(tcp, env=None):
            return ["--mock-mpi-impl-flags"], ["--mock-mpi-binding-args"]

        with mock.patch("horovod.runner.mpi_run._get_mpi_implementation_flags",
                        side_effect=fake_impl_flags), \
                mock.patch("horovod.runner.mpi_run.safe_shell_exec.execute",
                           return_value=0) as execute:
            mpi_run(self.minimal_settings, None, {}, ['cmd'])

            # query the patched helper ourselves to obtain the mocked flags
            flags, binding = horovod.runner.mpi_run._get_mpi_implementation_flags(
                False)
            self.assertIsNotNone(flags)
            command_template = ('mpirun '
                                '--allow-run-as-root --tag-output '
                                '-np 2 -H localhost:2 '
                                '{binding_args} '
                                '{mpi_flags}       '
                                'cmd')
            expected_cmd = command_template.format(
                binding_args=' '.join(binding),
                mpi_flags=' '.join(flags))

            # PYTHONPATH depends on the machine running the test, so it is
            # dropped from the recorded env before comparing; its handling is
            # covered by the test_mpi_run_*pythonpath* tests.
            self.assertIn('env', execute.call_args.kwargs)
            execute.call_args.kwargs['env'].pop('PYTHONPATH', None)

            execute.assert_called_once_with(
                expected_cmd,
                env={'PATH': os.environ.get('PATH')},
                stdout=None,
                stderr=None)

    """
    Tests mpi_run on a large cluster.
    """

    def test_mpi_run_on_large_cluster(self):
        """Verify the extra `plm_rsh` MCA flags expected once the host list reaches large_cluster_threshold.

        The host list is `large_cluster_threshold` repetitions of
        'localhost:1'; the executor and the implementation-flag lookup are
        patched.
        """
        if not mpi_available():
            self.skipTest("MPI is not available")

        # work on a copy so the shared class-level settings stay untouched
        settings = copy.copy(self.minimal_settings)
        settings.hosts = ','.join(['localhost:1'] * large_cluster_threshold)

        def fake_impl_flags(tcp, env=None):
            return ["--mock-mpi-impl-flags"], ["--mock-mpi-binding-args"]

        with mock.patch("horovod.runner.mpi_run._get_mpi_implementation_flags",
                        side_effect=fake_impl_flags), \
                mock.patch("horovod.runner.mpi_run.safe_shell_exec.execute",
                           return_value=0) as execute:
            mpi_run(settings, None, {}, ['cmd'])

            # query the patched helper ourselves to obtain the mocked flags
            flags, binding = horovod.runner.mpi_run._get_mpi_implementation_flags(
                False)
            self.assertIsNotNone(flags)
            flags.extend([
                '-mca plm_rsh_no_tree_spawn true',
                '-mca plm_rsh_num_concurrent {}'.format(
                    large_cluster_threshold),
            ])
            command_template = ('mpirun '
                                '--allow-run-as-root --tag-output '
                                '-np 2 -H {hosts} '
                                '{binding_args} '
                                '{mpi_flags}       '
                                'cmd')
            expected_cmd = command_template.format(
                hosts=settings.hosts,
                binding_args=' '.join(binding),
                mpi_flags=' '.join(flags))

            # PYTHONPATH depends on the machine running the test, so it is
            # dropped from the recorded env before comparing; its handling is
            # covered by the test_mpi_run_*pythonpath* tests.
            self.assertIn('env', execute.call_args.kwargs)
            execute.call_args.kwargs['env'].pop('PYTHONPATH', None)

            execute.assert_called_once_with(
                expected_cmd,
                env={'PATH': os.environ.get('PATH')},
                stdout=None,
                stderr=None)

    """
    Tests mpi_run with full settings.
    """

    def test_mpi_run_full(self):
        """Verify the mpirun command line and env when many settings are populated.

        Covers ssh port, NICs, binding args, extra MPI args, output filename,
        a caller-supplied env and explicit stdout/stderr. The executor and
        the implementation-flag lookup are patched.
        """
        if not mpi_available():
            self.skipTest("MPI is not available")

        env = {'env1': 'val1', 'env2': 'val2'}
        stdout, stderr = '<stdout>', '<stderr>'
        settings = hvd_settings.Settings(
            verbose=0,
            ssh_port=1022,
            extra_mpi_args='>mpi-extra args go here<',
            binding_args='>binding args go here<',
            key=secret.make_secret_key(),
            start_timeout=timeout.Timeout(
                5, message='Timed out waiting for something.'),
            num_proc=1,
            hosts='localhost:1',
            output_filename='>output filename goes here<',
            run_func_mode=True)

        def fake_impl_flags(tcp, env=None):
            return ["--mock-mpi-impl-flags"], []

        with mock.patch("horovod.runner.mpi_run._get_mpi_implementation_flags",
                        side_effect=fake_impl_flags) as impl, \
                mock.patch("horovod.runner.mpi_run.safe_shell_exec.execute",
                           return_value=0) as execute:
            mpi_run(settings, ['eth0', 'eth1'], env, ['cmd', 'arg1', 'arg2'],
                    stdout=stdout, stderr=stderr)

            # must be checked before we call the patched helper ourselves,
            # otherwise the call count would no longer be one
            impl.assert_called_once_with(None, env=env)

            # now fetch the mocked flags for building the expectation
            flags, _ = horovod.runner.mpi_run._get_mpi_implementation_flags(
                False)
            self.assertIsNotNone(flags)
            command_template = (
                'mpirun '
                '--allow-run-as-root --tag-output '
                '-np 1 -H {hosts} '
                '>binding args go here< '
                '{mpi_flags} '
                '-mca plm_rsh_args "-p 1022" '
                '-mca btl_tcp_if_include eth0,eth1 -x NCCL_SOCKET_IFNAME=eth0,eth1 '
                '--output-filename >output filename goes here< '
                '-x env1 -x env2 '
                '>mpi-extra args go here< '
                'cmd arg1 arg2')
            expected_command = command_template.format(
                hosts=settings.hosts,
                mpi_flags=' '.join(flags))

            # PYTHONPATH depends on the machine running the test, so it is
            # dropped from the recorded env before comparing; its handling is
            # covered by the test_mpi_run_*pythonpath* tests.
            self.assertIn('env', execute.call_args.kwargs)
            execute.call_args.kwargs['env'].pop('PYTHONPATH', None)

            expected_env = dict(env1='val1',
                                env2='val2',
                                PATH=os.environ.get('PATH'))
            execute.assert_called_once_with(expected_command,
                                            env=expected_env,
                                            stdout=stdout,
                                            stderr=stderr)

    """
    Tests mpi_run without PYTHONPATH set.
    """

    def test_mpi_run_without_pythonpath(self):
        """PYTHONPATH set nowhere: the env given to execute must not carry it."""
        self.do_test_mpi_run_env_override(sysenv={},
                                          argenv={},
                                          env_var='PYTHONPATH',
                                          expected=None)

    """
    Tests mpi_run with PYTHONPATH set in sys.
    """

    def test_mpi_run_with_sys_pythonpath(self):
        """PYTHONPATH only in the overridden system env: that value is expected."""
        self.do_test_mpi_run_env_override(sysenv={'PYTHONPATH': 'ppath'},
                                          argenv={},
                                          env_var='PYTHONPATH',
                                          expected='ppath')

    """
    Tests mpi_run with PYTHONPATH set in env.
    """

    def test_mpi_run_with_env_pythonpath(self):
        """PYTHONPATH only in the env argument: that value is expected."""
        self.do_test_mpi_run_env_override(sysenv={},
                                          argenv={'PYTHONPATH': 'ppath'},
                                          env_var='PYTHONPATH',
                                          expected='ppath')

    """
    Tests mpi_run with both PYTHONPATH set.
    """

    def test_mpi_run_with_both_pythonpaths(self):
        """PYTHONPATH in both places: the env argument's value is expected."""
        self.do_test_mpi_run_env_override(sysenv={'PYTHONPATH': 'sys-ppath'},
                                          argenv={'PYTHONPATH': 'env-ppath'},
                                          env_var='PYTHONPATH',
                                          expected='env-ppath')

    """
    Tests mpi_run without PATH set.
    """

    def test_mpi_run_without_path(self):
        """PATH set nowhere: the env given to execute must not carry it."""
        self.do_test_mpi_run_env_override(sysenv={},
                                          argenv={},
                                          env_var='PATH',
                                          expected=None)

    """
    Tests mpi_run with PATH set in sys.
    """

    def test_mpi_run_with_sys_path(self):
        """PATH only in the overridden system env: that value is expected."""
        self.do_test_mpi_run_env_override(sysenv={'PATH': 'ppath'},
                                          argenv={},
                                          env_var='PATH',
                                          expected='ppath')

    """
    Tests mpi_run with PATH set in env.
    """

    def test_mpi_run_with_env_path(self):
        """PATH only in the env argument: that value is expected."""
        self.do_test_mpi_run_env_override(sysenv={},
                                          argenv={'PATH': 'ppath'},
                                          env_var='PATH',
                                          expected='ppath')

    """
    Tests mpi_run with both PATH set.
    """

    def test_mpi_run_with_both_paths(self):
        """PATH in both places: the env argument's value is expected."""
        self.do_test_mpi_run_env_override(sysenv={'PATH': 'sys-path'},
                                          argenv={'PATH': 'env-path'},
                                          env_var='PATH',
                                          expected='env-path')

    """
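    Shared helper: checks which value mpi_run passes on for an env variable set in the system env and/or the env argument (the env argument wins when both are set).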
    """

    def do_test_mpi_run_env_override(self, sysenv, argenv, env_var, expected):
        """Run mpi_run with patched internals and check one variable of the env given to execute.

        Args:
            sysenv: Mapping handed to `override_env` for the duration of the
                call (presumably replaces entries of os.environ; confirm
                against that helper).
            argenv: Dict passed to mpi_run as its env argument.
            env_var: Name of the variable to look up in the env that execute
                received.
            expected: Value that variable must have; None means it must be
                absent (or None).
        """
        if not mpi_available():
            self.skipTest("MPI is not available")

        def fake_impl_flags(tcp, env=None):
            return ["--mock-mpi-impl-flags"], ["--mock-mpi-binding-args"]

        with mock.patch("horovod.runner.mpi_run._get_mpi_implementation_flags",
                        side_effect=fake_impl_flags):
            with mock.patch("horovod.runner.mpi_run.safe_shell_exec.execute",
                            return_value=0) as execute:
                with override_env(sysenv):
                    mpi_run(self.minimal_settings, None, argenv, ['cmd'])

                    # inspect the env keyword that execute was called with
                    self.assertIn('env', execute.call_args.kwargs)
                    launched_env = execute.call_args.kwargs['env']
                    self.assertEqual(launched_env.get(env_var), expected)

    def test_mpi_run_with_non_zero_exit(self):
        """mpi_run must raise RuntimeError when the executor reports exit code 1."""
        if not mpi_available():
            self.skipTest("MPI is not available")

        def fake_impl_flags(tcp, env=None):
            return [], []

        # the patched executor returns 1 to simulate a failing mpirun
        with mock.patch("horovod.runner.mpi_run._get_mpi_implementation_flags",
                        side_effect=fake_impl_flags), \
                mock.patch("horovod.runner.mpi_run.safe_shell_exec.execute",
                           return_value=1), \
                pytest.raises(RuntimeError,
                              match="^mpirun failed with exit code 1$"):
            mpi_run(self.minimal_settings, None, {}, ['cmd'])

    """
    Tests mpi_run with os.environ.
    """

    def test_mpi_run_with_os_environ(self):
        """mpi_run must reject os.environ as env because it is not a plain dict."""
        if not mpi_available():
            self.skipTest("MPI is not available")

        def fake_impl_flags(tcp, env=None):
            return ["--mock-mpi-impl-flags"], ["--mock-mpi-binding-args"]

        error_pattern = "^env argument must be a dict, not <class 'os._Environ'>: "

        with mock.patch("horovod.runner.mpi_run._get_mpi_implementation_flags",
                        side_effect=fake_impl_flags), \
                mock.patch("horovod.runner.mpi_run.safe_shell_exec.execute",
                           return_value=0), \
                pytest.raises(Exception, match=error_pattern):
            mpi_run(self.minimal_settings, None, os.environ, ['cmd'])

    """
    Tests gloo_run with minimal settings.
    """

    def test_gloo_run_minimal(self):
        """Run `whoami` through gloo_run with the minimal settings; passes if nothing raises."""
        if not gloo_built:
            self.skipTest("Gloo is not available")

        gloo_run(self.minimal_settings, ['lo'], {}, '127.0.0.1', ['whoami'])

    """
    Tests gloo_run with os.environ.
    """

    def test_gloo_run_with_os_environ(self):
        """Run `whoami` through gloo_run with os.environ as env; passes if nothing raises."""
        if not gloo_built:
            self.skipTest("Gloo is not available")

        gloo_run(self.minimal_settings, ['lo'], os.environ, '127.0.0.1',
                 ['whoami'])

    def test_horovodrun_hostfile(self):
        """parse_host_files must turn `<host> slots=<n>` lines into a `host:n,host:n` string."""
        hostfile_lines = [
            '172.31.32.7 slots=8\n',
            '172.31.33.9 slots=8\n',
        ]
        with temppath() as host_filename:
            with open(host_filename, 'w+') as fp:
                fp.writelines(hostfile_lines)

            parsed = hosts.parse_host_files(host_filename)
            self.assertEqual(parsed, '172.31.32.7:8,172.31.33.9:8')

    """
    Tests js_run.
    """

    @mock.patch('horovod.runner.js_run.is_jsrun_installed',
                MagicMock(return_value=True))
    @mock.patch('horovod.runner.js_run.generate_jsrun_rankfile',
                MagicMock(return_value='/tmp/rankfile'))
    @mock.patch('horovod.runner.util.lsf.LSFUtils.get_num_gpus',
                MagicMock(return_value=2))
    @mock.patch('horovod.runner.util.lsf.LSFUtils.get_num_cores',
                MagicMock(return_value=2))
    def test_js_run(self):
        """Verify the jsrun command line and env that js_run hands to execute.

        The method's decorators patch the jsrun availability check, the
        rankfile generator (fixed to '/tmp/rankfile') and the LSF GPU/core
        counts; the executor and the implementation-flag lookup are patched
        here.
        """
        if _get_mpi_implementation_flags(False)[0] is None:
            self.skipTest("MPI is not available")

        env = {'env1': 'val1', 'env2': 'val2'}
        stdout, stderr = '<stdout>', '<stderr>'
        settings = hvd_settings.Settings(
            verbose=0,
            extra_mpi_args='>mpi-extra args go here<',
            num_proc=4,
            hosts='localhost:2,127.0.0.1:2',
            output_filename='>output filename goes here<',
            run_func_mode=True)

        def fake_impl_flags(tcp, env=None):
            return ["--mock-mpi-impl-flags"], []

        with mock.patch("horovod.runner.js_run._get_mpi_implementation_flags",
                        side_effect=fake_impl_flags), \
                mock.patch("horovod.runner.js_run.safe_shell_exec.execute",
                           return_value=0) as execute:
            js_run(settings, None, env, ['cmd', 'arg1', 'arg2'],
                   stdout=stdout, stderr=stderr)

            # query the patched helper ourselves to obtain the mocked flags
            flags, _ = horovod.runner.js_run._get_mpi_implementation_flags(
                False)
            self.assertIsNotNone(flags)
            command_template = (
                'jsrun '
                '--erf_input /tmp/rankfile '
                '--stdio_stderr >output filename goes here< '
                '--stdio_stdout >output filename goes here< '
                '--smpiargs \'{mpi_args} >mpi-extra args go here<\' '
                'cmd arg1 arg2')
            expected_command = command_template.format(
                mpi_args=' '.join(flags))
            execute.assert_called_once_with(
                expected_command,
                env={'env1': 'val1', 'env2': 'val2'},
                stdout=stdout,
                stderr=stderr)

    """
    Tests generate_jsrun_rankfile.
    """

    @mock.patch('horovod.runner.util.lsf.LSFUtils.get_num_gpus',
                MagicMock(return_value=4))
    @mock.patch('horovod.runner.util.lsf.LSFUtils.get_num_cores',
                MagicMock(return_value=4))
    @mock.patch('horovod.runner.util.lsf.LSFUtils.get_num_threads',
                MagicMock(return_value=4))
    def test_generate_jsrun_rankfile(self):
        settings = hvd_settings.Settings(
            num_proc=5,
            hosts='host1:4,host2:4,host3:4',
        )

        with temppath() as rankfile_path:
            rankfile_path = generate_jsrun_rankfile(settings, rankfile_path)

            with open(rankfile_path, 'r') as file:
                gen_rankfile = file.read()

            expected_rankfile = ("""overlapping_rs: allow
cpu_index_using: logical

rank: 0: { hostname: host1; cpu: {0-3} ; gpu: * ; mem: * }
rank: 1: { hostname: host1; cpu: {4-7} ; gpu: * ; mem: * }
rank: 2: { hostname: host1; cpu: {8-11} ; gpu: * ; mem: * }
rank: 3: { hostname: host1; cpu: {12-15} ; gpu: * ; mem: * }

rank: 4: { hostname: host2; cpu: {0-3} ; gpu: * ; mem: * }
""")

            self.assertMultiLineEqual(gen_rankfile, expected_rankfile)

    """
    Tests horovod.runner.launch._run with jsrun
    """

    @mock.patch('horovod.runner.util.lsf.LSFUtils.using_lsf',
                MagicMock(return_value=True))
    @mock.patch('horovod.runner.util.lsf.LSFUtils.get_compute_hosts',
                MagicMock(return_value=['host1', 'host2']))
    @mock.patch('horovod.runner.util.lsf.LSFUtils.get_num_gpus',
                MagicMock(return_value=2))
    @mock.patch('horovod.runner.util.network.filter_local_addresses',
                MagicMock(return_value=['host1', 'host2']))
    @mock.patch('horovod.runner.launch._check_all_hosts_ssh_successful',
                MagicMock())
    @mock.patch('horovod.runner.launch.run_controller')
    def test_run_with_jsrun(self, mocked_run_controller):
        """_run with default _HorovodArgs must invoke run_controller exactly once.

        The decorators patch the LSF utilities, local-address filtering and
        the ssh check; `mocked_run_controller` is the patched
        `horovod.runner.launch.run_controller`.
        """
        _run(_HorovodArgs())
        mocked_run_controller.assert_called_once()

    def test_get_host_assignments(self):
        """Four processes over two 2-slot hosts must fill worker-0 first, then worker-1."""
        available = parse_hosts('worker-0:2,worker-1:2')
        assignments = get_host_assignments(available, 4)

        shared = dict(size=4, local_size=2, cross_size=2)
        # (hostname, rank, local_rank, cross_rank)
        layout = [
            ('worker-0', 0, 0, 0),
            ('worker-0', 1, 1, 0),
            ('worker-1', 2, 0, 1),
            ('worker-1', 3, 1, 1),
        ]
        expected = [
            SlotInfo(hostname=host,
                     rank=rank,
                     local_rank=local_rank,
                     cross_rank=cross_rank,
                     **shared)
            for host, rank, local_rank, cross_rank in layout
        ]
        self.assertListEqual(assignments, expected)

    def test_get_host_assignments_elastic(self):
        """With min_np=1 and max_np=2 only the two slots of worker-0 are expected."""
        available = parse_hosts('worker-0:2,worker-1:2')
        assignments = get_host_assignments(available, min_np=1, max_np=2)

        shared = dict(size=2, local_size=2, cross_size=1)
        expected = [
            SlotInfo(hostname='worker-0',
                     rank=index,
                     local_rank=index,
                     cross_rank=0,
                     **shared)
            for index in (0, 1)
        ]
        self.assertListEqual(assignments, expected)

    def test_get_host_assignments_heterogeneous(self):
        """Hosts with different slot counts: local and cross sizes differ per slot."""
        available = parse_hosts('worker-0:1,worker-1:2')
        assignments = get_host_assignments(available, 3)

        columns = ('hostname', 'rank', 'local_rank', 'cross_rank',
                   'size', 'local_size', 'cross_size')
        rows = [
            ('worker-0', 0, 0, 0, 3, 1, 2),
            ('worker-1', 1, 0, 1, 3, 2, 2),
            # only worker-1 has a local_rank 1, hence cross_size 1
            ('worker-1', 2, 1, 0, 3, 2, 1),
        ]
        expected = [SlotInfo(**dict(zip(columns, row))) for row in rows]
        self.assertListEqual(assignments, expected)

Exemple #6

0

Afficher le fichier

Fichier : runner.py Projet : lakersdf/horovod

def run(fn,
        args=(),
        kwargs=None,
        num_proc=None,
        start_timeout=None,
        use_mpi=None,
        use_gloo=None,
        extra_mpi_args=None,
        env=None,
        stdout=None,
        stderr=None,
        verbose=1,
        nics=None,
        prefix_output_with_timestamp=False,
        executable=None):
    """
    Runs Horovod on Spark.  Runs `num_proc` processes executing `fn` using the same amount of Spark tasks.

    Args:
        fn: Function to run.
        args: Arguments to pass to `fn`.
        kwargs: Keyword arguments to pass to `fn`. Defaults to no keyword arguments.
        num_proc: Number of Horovod processes.  Defaults to `spark.default.parallelism`.
        start_timeout: Timeout for Spark tasks to spawn, register and start running the code, in seconds.
                       If not set, falls back to `HOROVOD_SPARK_START_TIMEOUT` environment variable value.
                       If it is not set as well, defaults to 600 seconds.
        use_mpi: Forwarded to `is_gloo_used` and `_launch_job` to choose the controller.
        use_gloo: Forwarded to `is_gloo_used` and `_launch_job` to choose the controller.
        extra_mpi_args: Extra arguments for mpi_run. Defaults to no extra args.
        env: Environment dictionary to use in Horovod run.
        stdout: Horovod stdout is redirected to this stream. Defaults to sys.stdout when used with MPI.
        stderr: Horovod stderr is redirected to this stream. Defaults to sys.stderr when used with MPI.
        verbose: Debug output verbosity (0-2). Defaults to 1.
        nics: List of NICs for tcp network communication.
        prefix_output_with_timestamp: shows timestamp in stdout/stderr forwarding on the driver
        executable: Optional executable to run when launching the workers. Defaults to `sys.executable`.

    Returns:
        List of results returned by running `fn` on each rank.

    Raises:
        Exception: If there is no active SparkContext.
    """

    # A fresh dict per call; a `{}` default would be one object shared by all calls.
    if kwargs is None:
        kwargs = {}

    if start_timeout is None:
        # Lookup default timeout from the environment variable.
        start_timeout = int(os.getenv('HOROVOD_SPARK_START_TIMEOUT', '600'))

    # nics needs to be a set
    if nics and not isinstance(nics, set):
        nics = set(nics)

    tmout = timeout.Timeout(
        start_timeout,
        message='Timed out waiting for {activity}. Please check that you have '
        'enough resources to run all Horovod processes. Each Horovod '
        'process runs in a Spark task. You may need to increase the '
        'start_timeout parameter to a larger value if your Spark resources '
        'are allocated on-demand.')
    settings = hvd_settings.Settings(
        verbose=verbose,
        extra_mpi_args=extra_mpi_args,
        key=secret.make_secret_key(),
        start_timeout=tmout,
        nics=nics,
        run_func_mode=True,
        prefix_output_with_timestamp=prefix_output_with_timestamp)

    spark_context = pyspark.SparkContext._active_spark_context
    if spark_context is None:
        raise Exception('Could not find an active SparkContext, are you '
                        'running in a PySpark session?')

    if num_proc is None:
        num_proc = spark_context.defaultParallelism
        if settings.verbose >= 1:
            logging.info(
                'Running %d processes (inferred from spark.default.parallelism)...',
                num_proc)
    else:
        if settings.verbose >= 1:
            logging.info('Running %d processes...', num_proc)
    settings.num_proc = num_proc

    # Holds at most one item: the results produced by the Spark thread.
    result_queue = queue.Queue(1)

    # start Spark driver service and launch settings.num_proc Spark tasks
    spark_job_group = 'horovod.spark.run.%d' % job_id.next_job_id()
    driver = driver_service.SparkDriverService(settings.num_proc,
                                               settings.num_proc, fn, args,
                                               kwargs, settings.key,
                                               settings.nics)
    gloo_is_used = is_gloo_used(use_gloo=use_gloo,
                                use_mpi=use_mpi,
                                use_jsrun=False)
    spark_thread = _make_spark_thread(spark_context,
                                      spark_job_group,
                                      driver,
                                      result_queue,
                                      settings,
                                      use_gloo=gloo_is_used,
                                      is_elastic=False)
    try:
        # wait for all tasks to register, notify them and initiate task-to-task address registration
        _notify_and_register_task_addresses(driver, settings)

        # Determine the index grouping based on host hashes.
        # Barrel shift until index 0 is in the first host.
        host_hashes = sorted(driver.task_host_hash_indices().keys())
        while 0 not in driver.task_host_hash_indices()[host_hashes[0]]:
            host_hashes = host_hashes[1:] + host_hashes[:1]

        settings.hosts = ','.join(
            '%s:%d' %
            (host_hash, len(driver.task_host_hash_indices()[host_hash]))
            for host_hash in host_hashes)

        # Run the job
        _launch_job(use_mpi, use_gloo, settings, driver, env, stdout, stderr,
                    executable)
    except BaseException:
        # Deliberately as broad as a bare `except`: the Spark job is cancelled
        # on any failure, including KeyboardInterrupt, before re-raising.
        spark_context.cancelJobGroup(spark_job_group)

        # Re-raise exception.
        raise
    finally:
        spark_thread.join()
        driver.shutdown()

    # Make sure Spark Job did not fail.
    driver.check_for_spark_job_failure()

    # get ranks from driver
    indices_in_rank_order = _get_indices_in_rank_order(driver)

    # If there's no exception, execution results are in this queue.
    results = result_queue.get_nowait()
    return [results[index] for index in indices_in_rank_order]