def test_generate_jsrun_rankfile(self):
    """Compare the rankfile written by generate_jsrun_rankfile with a fixed text.

    Settings request 5 processes over three hosts with 4 slots each; the
    rankfile is written to a temporary path, read back and compared with
    the expected content via ``assertMultiLineEqual``.
    """
    # NOTE(review): the line breaks (including the blank separator lines) of
    # this literal must match the generator's output exactly — confirm.
    wanted_text = (
"""overlapping_rs: allow
cpu_index_using: logical

rank: 0: { hostname: host1; cpu: {0-3} ; gpu: * ; mem: * }
rank: 1: { hostname: host1; cpu: {4-7} ; gpu: * ; mem: * }
rank: 2: { hostname: host1; cpu: {8-11} ; gpu: * ; mem: * }
rank: 3: { hostname: host1; cpu: {12-15} ; gpu: * ; mem: * }

rank: 4: { hostname: host2; cpu: {0-3} ; gpu: * ; mem: * }
""")

    job_settings = hvd_settings.Settings(
        num_proc=5,
        hosts='host1:4,host2:4,host3:4',
    )

    with temppath() as scratch_path:
        # The generator returns the path it actually wrote to.
        written_path = generate_jsrun_rankfile(job_settings, scratch_path)
        with open(written_path, 'r') as handle:
            produced_text = handle.read()

        self.assertMultiLineEqual(produced_text, wanted_text)
def test_js_run(self):
    """Check the jsrun command line and environment that js_run executes.

    Skipped when ``_get_mpi_implementation_flags(False)`` reports no flags.
    The MPI flag lookup and the shell execution are both mocked, so no
    process is actually started.
    """
    if _get_mpi_implementation_flags(False)[0] is None:
        self.skipTest("MPI is not available")

    command = ['cmd', 'arg1', 'arg2']
    job_env = {'env1': 'val1', 'env2': 'val2'}
    out_stream = '<stdout>'
    err_stream = '<stderr>'
    job_settings = hvd_settings.Settings(
        verbose=0,
        extra_mpi_args='>mpi-extra args go here<',
        num_proc=4,
        hosts='localhost:2,127.0.0.1:2',
        output_filename='>output filename goes here<',
        run_func_mode=True
    )

    def mpi_impl_flags(tcp, env=None):
        return ["--mock-mpi-impl-flags"], []

    flags_patch = mock.patch("horovod.runner.js_run._get_mpi_implementation_flags",
                             side_effect=mpi_impl_flags)
    execute_patch = mock.patch("horovod.runner.js_run.safe_shell_exec.execute",
                               return_value=0)

    with flags_patch, execute_patch as execute:
        js_run(job_settings, None, job_env, command, stdout=out_stream, stderr=err_stream)

        # call the mocked _get_mpi_implementation_flags method
        flags, _ = horovod.runner.js_run._get_mpi_implementation_flags(False)
        self.assertIsNotNone(flags)

        command_template = ('jsrun '
                            '--erf_input /tmp/rankfile '
                            '--stdio_stderr >output filename goes here< '
                            '--stdio_stdout >output filename goes here< '
                            '--smpiargs \'{mpi_args} >mpi-extra args go here<\' '
                            'cmd arg1 arg2')
        wanted_command = command_template.format(mpi_args=' '.join(flags))
        wanted_env = {'env1': 'val1', 'env2': 'val2'}

        execute.assert_called_once_with(wanted_command, env=wanted_env,
                                        stdout=out_stream, stderr=err_stream)
def test_mpi_run_full(self):
    """Check the mpirun command line built by mpi_run from a full Settings object.

    Skipped when ``mpi_available()`` is false. The MPI flag lookup and the
    shell execution are mocked; the test asserts how the flag lookup was
    called and which command/environment reached ``execute``.
    """
    if not mpi_available():
        self.skipTest("MPI is not available")

    command = ['cmd', 'arg1', 'arg2']
    interfaces = ['eth0', 'eth1']
    job_env = {'env1': 'val1', 'env2': 'val2'}
    out_stream = '<stdout>'
    err_stream = '<stderr>'
    start_tmout = timeout.Timeout(5, message='Timed out waiting for something.')
    job_settings = hvd_settings.Settings(
        verbose=0,
        ssh_port=1022,
        extra_mpi_args='>mpi-extra args go here<',
        binding_args='>binding args go here<',
        key=secret.make_secret_key(),
        start_timeout=start_tmout,
        num_proc=1,
        hosts='localhost:1',
        output_filename='>output filename goes here<',
        run_func_mode=True
    )

    def mpi_impl_flags(tcp, env=None):
        return ["--mock-mpi-impl-flags"], []

    flags_patch = mock.patch("horovod.runner.mpi_run._get_mpi_implementation_flags",
                             side_effect=mpi_impl_flags)
    execute_patch = mock.patch("horovod.runner.mpi_run.safe_shell_exec.execute",
                               return_value=0)

    with flags_patch as impl, execute_patch as execute:
        mpi_run(job_settings, interfaces, job_env, command, stdout=out_stream, stderr=err_stream)

        # assert call on _get_mpi_implementation_flags
        impl.assert_called_once_with(None, env=job_env)

        # call the mocked _get_mpi_implementation_flags method ourselves
        flags, _ = horovod.runner.mpi_run._get_mpi_implementation_flags(False)
        self.assertIsNotNone(flags)

        command_template = ('mpirun '
                            '--allow-run-as-root --tag-output '
                            '-np 1 -H {hosts} '
                            '>binding args go here< '
                            '{mpi_flags} '
                            '-mca plm_rsh_args "-p 1022" '
                            '-mca btl_tcp_if_include eth0,eth1 -x NCCL_SOCKET_IFNAME=eth0,eth1 '
                            '--output-filename >output filename goes here< '
                            '-x env1 -x env2 '
                            '>mpi-extra args go here< '
                            'cmd arg1 arg2')
        wanted_command = command_template.format(hosts=job_settings.hosts,
                                                 mpi_flags=' '.join(flags))

        # remove PYTHONPATH from execute's env
        # we cannot know the exact value of that env variable
        # we test right handling of PYTHONPATH in test_mpi_run_*pythonpath* below
        self.assertIn('env', execute.call_args.kwargs)
        execute.call_args.kwargs['env'].pop('PYTHONPATH', None)
        wanted_env = {'env1': 'val1', 'env2': 'val2', 'PATH': os.environ.get('PATH')}

        execute.assert_called_once_with(wanted_command, env=wanted_env,
                                        stdout=out_stream, stderr=err_stream)
def _run_static(args):
    """Run a Horovod job on the hosts described by the parsed launcher ``args``.

    The function builds the runner settings, optionally sets up a cache for
    the initialization checks, verifies ssh access to the remote hosts,
    determines the common network interfaces and finally launches the job.

    Args:
        args: Parsed launcher arguments. The attributes read here are
            ``nics``, ``start_timeout``, ``verbose``, ``ssh_port``,
            ``ssh_identity_file``, ``mpi_args``, ``tcp_flag``,
            ``binding_args``, ``np``, ``hosts``, ``output_filename``,
            ``run_func``, ``disable_cache`` and ``command``.

    Returns:
        When ``args.run_func`` is set, a list with ``args.np`` entries read
        from the key-value store (scope ``'runfunc_result'``, keys ``'0'`` to
        ``str(args.np - 1)``); otherwise ``None``.

    Raises:
        RuntimeError: If the ssh check of the remote hosts is not successful.
    """
    nics_set = set(args.nics.split(',')) if args.nics else None

    # horovodrun has to finish all the checks before this timeout runs out.
    if args.start_timeout:
        start_timeout = args.start_timeout
    else:
        # Lookup default timeout from the environment variable.
        start_timeout = int(os.getenv('HOROVOD_START_TIMEOUT', '30'))

    tmout = timeout.Timeout(start_timeout,
                            message='Timed out waiting for {activity}. Please '
                                    'check connectivity between servers. You '
                                    'may need to increase the --start-timeout '
                                    'parameter if you have too many servers.')
    settings = hvd_settings.Settings(verbose=2 if args.verbose else 0,
                                     ssh_port=args.ssh_port,
                                     ssh_identity_file=args.ssh_identity_file,
                                     extra_mpi_args=args.mpi_args,
                                     tcp_flag=args.tcp_flag,
                                     binding_args=args.binding_args,
                                     key=secret.make_secret_key(),
                                     start_timeout=tmout,
                                     num_proc=args.np,
                                     hosts=args.hosts,
                                     output_filename=args.output_filename,
                                     run_func_mode=args.run_func is not None,
                                     nics=nics_set)

    # This cache stores the results of checks performed by horovod
    # during the initialization step. It can be disabled by setting
    # --disable-cache flag.
    fn_cache = None
    if not args.disable_cache:
        # The concatenation below is kept exactly as it was so that the
        # resulting hash (and thereby any existing cache entry) is unchanged.
        params = ''
        if args.np:
            params += str(args.np) + ' '
        if args.hosts:
            params += str(args.hosts) + ' '
        if args.ssh_port:
            params += str(args.ssh_port)
        if args.ssh_identity_file:
            params += args.ssh_identity_file
        parameters_hash = hashlib.md5(params.encode('utf-8')).hexdigest()
        fn_cache = cache.Cache(CACHE_FOLDER, CACHE_STALENESS_THRESHOLD_MINUTES,
                               parameters_hash)

    all_host_names, _ = hosts.parse_hosts_and_slots(args.hosts)
    if settings.verbose >= 2:
        print('Filtering local host names.')
    remote_host_names = network.filter_local_addresses(all_host_names)
    if settings.verbose >= 2:
        print('Remote host found: ' + ' '.join(remote_host_names))

    if len(remote_host_names) > 0:
        if settings.verbose >= 2:
            print('Checking ssh on all remote hosts.')
        # Check if we can ssh into all remote hosts successfully.
        if not _check_all_hosts_ssh_successful(remote_host_names, args.ssh_port,
                                               args.ssh_identity_file,
                                               fn_cache=fn_cache):
            raise RuntimeError('could not connect to some hosts via ssh')
        if settings.verbose >= 2:
            print('SSH was successful into all the remote hosts.')

    nics = driver_service.get_common_interfaces(settings, all_host_names,
                                                remote_host_names, fn_cache)

    if args.run_func:
        # get the driver IPv4 address
        driver_ip = network.get_driver_ip(nics)
        run_func_server = KVStoreServer(verbose=settings.verbose)
        run_func_server_port = run_func_server.start_server()
        # Everything after start_server() is guarded, so the server is shut
        # down even when publishing the function to the store fails.
        try:
            put_data_into_kvstore(driver_ip, run_func_server_port,
                                  'runfunc', 'func', args.run_func)

            command = [
                sys.executable, '-m', 'horovod.runner.run_task',
                str(driver_ip), str(run_func_server_port)
            ]

            _launch_job(args, settings, nics, command)
            # TODO: make it parallel to improve performance
            return [
                read_data_from_kvstore(driver_ip, run_func_server_port,
                                       'runfunc_result', str(i))
                for i in range(args.np)
            ]
        finally:
            run_func_server.shutdown_server()
    else:
        command = args.command
        _launch_job(args, settings, nics, command)
        return None
class RunTests(unittest.TestCase):
    """
    Tests for horovod.runner.
    """

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        warnings.simplefilter('module')

    def test_params_args(self):
        """Tuning parameters given on the command line end up in the env dict."""
        with override_args('horovodrun', '-np', '2',
                           '--fusion-threshold-mb', '10',
                           '--cycle-time-ms', '20',
                           '--cache-capacity', '512',
                           '--hierarchical-allreduce',
                           '--hierarchical-allgather'):
            args = parse_args()
            env = {}
            config_parser.set_env_from_args(env, args)

            self.assertEqual(env.get(config_parser.HOROVOD_FUSION_THRESHOLD),
                             str(10 * 1024 * 1024))
            self.assertEqual(env.get(config_parser.HOROVOD_CYCLE_TIME), '20.0')
            self.assertEqual(env.get(config_parser.HOROVOD_CACHE_CAPACITY), '512')
            self.assertEqual(
                env.get(config_parser.HOROVOD_HIERARCHICAL_ALLREDUCE), '1')
            self.assertEqual(
                env.get(config_parser.HOROVOD_HIERARCHICAL_ALLGATHER), '1')

    def test_autotune_args(self):
        """Autotune options given on the command line end up in the env dict."""
        with override_args('horovodrun', '-np', '2',
                           '--autotune',
                           '--autotune-log-file', '/tmp/autotune.txt',
                           '--autotune-warmup-samples', '1',
                           '--autotune-steps-per-sample', '5',
                           '--autotune-bayes-opt-max-samples', '10',
                           '--autotune-gaussian-process-noise', '0.2'):
            args = parse_args()
            env = {}
            config_parser.set_env_from_args(env, args)

            self.assertEqual(env.get(config_parser.HOROVOD_AUTOTUNE), '1')
            self.assertEqual(env.get(config_parser.HOROVOD_AUTOTUNE_LOG),
                             '/tmp/autotune.txt')
            self.assertEqual(
                env.get(config_parser.HOROVOD_AUTOTUNE_WARMUP_SAMPLES), '1')
            self.assertEqual(
                env.get(config_parser.HOROVOD_AUTOTUNE_STEPS_PER_SAMPLE), '5')
            self.assertEqual(
                env.get(config_parser.HOROVOD_AUTOTUNE_BAYES_OPT_MAX_SAMPLES),
                '10')
            self.assertEqual(
                env.get(config_parser.HOROVOD_AUTOTUNE_GAUSSIAN_PROCESS_NOISE),
                '0.2')

    def test_autotuning_with_fixed_param(self):
        """With --autotune, only the explicitly given parameters are put into the env."""
        with override_args('horovodrun', '-np', '2',
                           '--autotune',
                           '--cache-capacity', '1024',
                           '--no-hierarchical-allgather'):
            args = parse_args()
            env = {}
            config_parser.set_env_from_args(env, args)

            self.assertNotIn(config_parser.HOROVOD_FUSION_THRESHOLD, env)
            self.assertNotIn(config_parser.HOROVOD_CYCLE_TIME, env)
            self.assertEqual(env.get(config_parser.HOROVOD_CACHE_CAPACITY), '1024')
            self.assertNotIn(config_parser.HOROVOD_HIERARCHICAL_ALLREDUCE, env)
            self.assertEqual(
                env.get(config_parser.HOROVOD_HIERARCHICAL_ALLGATHER), '0')

    def test_timeline_args(self):
        """Timeline options given on the command line end up in the env dict."""
        with override_args('horovodrun', '-np', '2',
                           '--timeline-filename', '/tmp/timeline.json',
                           '--timeline-mark-cycles'):
            args = parse_args()
            env = {}
            config_parser.set_env_from_args(env, args)

            self.assertEqual(env.get(config_parser.HOROVOD_TIMELINE),
                             '/tmp/timeline.json')
            self.assertEqual(
                env.get(config_parser.HOROVOD_TIMELINE_MARK_CYCLES), '1')

    def test_stall_check_args(self):
        """Stall-check options are translated into the env dict."""
        with override_args('horovodrun', '-np', '2',
                           '--no-stall-check'):
            args = parse_args()
            env = {}
            config_parser.set_env_from_args(env, args)

            self.assertEqual(
                env.get(config_parser.HOROVOD_STALL_CHECK_DISABLE), '1')

        with override_args('horovodrun', '-np', '2',
                           '--stall-check-warning-time-seconds', '10',
                           '--stall-check-shutdown-time-seconds', '20'):
            args = parse_args()
            env = {}
            config_parser.set_env_from_args(env, args)

            self.assertNotIn(config_parser.HOROVOD_STALL_CHECK_DISABLE, env)
            self.assertEqual(
                env.get(config_parser.HOROVOD_STALL_CHECK_TIME_SECONDS), '10')
            self.assertEqual(
                env.get(config_parser.HOROVOD_STALL_SHUTDOWN_TIME_SECONDS), '20')

    def test_library_args(self):
        """Library options given on the command line end up in the env dict."""
        with override_args('horovodrun', '-np', '2',
                           '--mpi-threads-disable',
                           '--num-nccl-streams', '2',
                           '--ccl-bgt-affinity', '1',
                           '--gloo-timeout-seconds', '60'):
            args = parse_args()
            env = {}
            config_parser.set_env_from_args(env, args)

            self.assertEqual(
                env.get(config_parser.HOROVOD_MPI_THREADS_DISABLE), '1')
            self.assertEqual(env.get(config_parser.HOROVOD_NUM_NCCL_STREAMS), '2')
            self.assertEqual(env.get(config_parser.HOROVOD_CCL_BGT_AFFINITY), '1')
            self.assertEqual(
                env.get(config_parser.HOROVOD_GLOO_TIMEOUT_SECONDS), '60')

    def test_logging_args(self):
        """Logging options given on the command line end up in the env dict."""
        with override_args('horovodrun', '-np', '2',
                           '--log-level', 'INFO',
                           '--log-hide-timestamp'):
            args = parse_args()
            env = {}
            config_parser.set_env_from_args(env, args)

            self.assertEqual(env.get(config_parser.HOROVOD_LOG_LEVEL), 'INFO')
            self.assertEqual(env.get(config_parser.HOROVOD_LOG_HIDE_TIME), '1')

    def test_config_file(self):
        """Values from data/config.test.yaml show up on the parsed args."""
        config_filename = os.path.join(os.path.dirname(__file__),
                                       'data/config.test.yaml')
        with override_args('horovodrun', '-np', '2',
                           '--config-file', config_filename):
            args = parse_args()

            self.assertTrue(args.use_gloo)

            # Params
            self.assertEqual(args.fusion_threshold_mb, 32)
            self.assertEqual(args.cycle_time_ms, 10)
            self.assertEqual(args.cache_capacity, 2048)
            self.assertTrue(args.hierarchical_allreduce)
            self.assertTrue(args.hierarchical_allgather)

            # Autotune
            self.assertTrue(args.autotune)
            self.assertEqual(args.autotune_log_file, 'horovod_autotune_log.txt')
            self.assertEqual(args.autotune_warmup_samples, 5)
            self.assertEqual(args.autotune_steps_per_sample, 20)
            self.assertEqual(args.autotune_bayes_opt_max_samples, 50)
            self.assertEqual(args.autotune_gaussian_process_noise, 0.9)

            # Timeline
            self.assertEqual(args.timeline_filename, 'horovod_timeline.json')
            self.assertTrue(args.timeline_mark_cycles)

            # Stall Check
            self.assertFalse(args.no_stall_check)
            self.assertEqual(args.stall_check_warning_time_seconds, 120)
            self.assertEqual(args.stall_check_shutdown_time_seconds, 240)

            # Library Options
            self.assertTrue(args.mpi_threads_disable)
            self.assertEqual(args.num_nccl_streams, 2)
            self.assertEqual(args.ccl_bgt_affinity, 1)
            self.assertEqual(args.gloo_timeout_seconds, 60)

            # Logging
            self.assertEqual(args.log_level, 'INFO')
            self.assertTrue(args.log_hide_timestamp)

    def test_config_file_override_args(self):
        """Command line values win over the config file, before and after --config-file."""
        config_filename = os.path.join(os.path.dirname(__file__),
                                       'data/config.test.yaml')
        with override_args(
                'horovodrun', '-np', '2',
                '--fusion-threshold-mb', '128',
                '--config-file', config_filename,
                '--cycle-time-ms', '20',
        ):
            args = parse_args()
            self.assertEqual(args.fusion_threshold_mb, 128)
            self.assertEqual(args.cycle_time_ms, 20)

    def test_validate_config_args(self):
        """A negative fusion threshold makes parse_args raise ValueError."""
        with override_args('horovodrun', '-np', '2',
                           '--fusion-threshold-mb', '-1'):
            with pytest.raises(ValueError):
                parse_args()

    # test_on_event tests in_thread as well, but it does not test args
    def test_in_thread_args(self):
        """in_thread passes the args tuple to the target and rejects non-tuples."""
        fn = mock.Mock()
        thread = in_thread(fn, args=(1, ))
        thread.join(1.0)
        self.assertFalse(thread.is_alive())
        fn.assert_called_once_with(1)

        fn = mock.Mock()
        thread = in_thread(fn, args=(1, 2))
        thread.join(1.0)
        self.assertFalse(thread.is_alive())
        fn.assert_called_once_with(1, 2)

        fn = mock.Mock()
        thread = in_thread(fn, args=(1, 2), silent=True)
        thread.join(1.0)
        self.assertFalse(thread.is_alive())
        fn.assert_called_once_with(1, 2)

        fn = mock.Mock()
        with pytest.raises(
                ValueError,
                match="^args must be a tuple, not <(class|type) 'int'>, "
                      "for a single argument use \\(arg,\\)$"):
            in_thread(fn, args=1)
        fn.assert_not_called()

    def test_on_event(self):
        """Exercise on_event with and without args, stop events and failing callbacks."""
        # a happy run without args and stop event
        event = threading.Event()
        fn = mock.Mock()
        thread = on_event(event, fn)
        fn.assert_not_called()
        event.set()
        thread.join(1.0)
        self.assertFalse(thread.is_alive())
        fn.assert_called_once()

        # a happy run with args but without stop event
        event = threading.Event()
        fn = mock.Mock()
        thread = on_event(event, fn, ('a', 1))
        fn.assert_not_called()
        event.set()
        thread.join(1.0)
        self.assertFalse(thread.is_alive())
        fn.assert_called_once()
        fn.assert_called_once_with('a', 1)

        # a happy run with stop event but unused
        event = threading.Event()
        stop = threading.Event()
        fn = mock.Mock()
        thread = on_event(event, fn, stop=stop, check_stop_interval_s=0.01)
        fn.assert_not_called()
        event.set()
        thread.join(1.0)
        self.assertFalse(thread.is_alive())
        fn.assert_called_once()
        stop.set()
        time.sleep(0.1)
        fn.assert_called_once()

        # stop the thread before we set the event
        event = threading.Event()
        stop = threading.Event()
        fn = mock.Mock()
        thread = on_event(event, fn, stop=stop, check_stop_interval_s=0.01)
        fn.assert_not_called()
        stop.set()
        thread.join(1.0)
        self.assertFalse(thread.is_alive())
        fn.assert_not_called()
        event.set()
        time.sleep(0.1)
        fn.assert_not_called()

        # test with exception
        def exception():
            raise Exception("Test Exception")

        event = threading.Event()
        fn = mock.Mock(side_effect=exception)
        thread = on_event(event, fn)
        fn.assert_not_called()
        event.set()
        thread.join(1.0)
        self.assertFalse(thread.is_alive())
        fn.assert_called_once()

        # test with exception but silent
        # NOTE(review): this call does not pass any silent flag, so it repeats
        # the previous case — confirm whether on_event accepts silent=True.
        event = threading.Event()
        fn = mock.Mock(side_effect=exception)
        thread = on_event(event, fn)
        fn.assert_not_called()
        event.set()
        thread.join(1.0)
        self.assertFalse(thread.is_alive())
        fn.assert_called_once()

        # test None event
        event = None
        fn = mock.Mock()
        with pytest.raises(ValueError, match="^Event must not be None$"):
            on_event(event, fn)
        fn.assert_not_called()

        # test non-tuple args
        event = threading.Event()
        fn = mock.Mock()
        with pytest.raises(
                ValueError,
                match="^args must be a tuple, not <(class|type) 'int'>, "
                      "for a single argument use \\(arg,\\)$"):
            on_event(event, fn, args=1)
        fn.assert_not_called()

        # test None stop and non-daemon
        event = threading.Event()
        fn = mock.Mock()
        with pytest.raises(
                ValueError,
                match="^Stop event must be given for non-daemon event thread$"
        ):
            on_event(event, fn, stop=None, daemon=False)
        fn.assert_not_called()

    def test_safe_shell_exec_captures_stdout(self):
        """Output written to stdout is captured in the stdout stream."""
        self.do_test_safe_shell_exec('echo hello', 0, 'hello\n', '')

    def test_safe_shell_exec_captures_stderr(self):
        """Output written to stderr is captured in the stderr stream."""
        self.do_test_safe_shell_exec('echo hello >&2', 0, '', 'hello\n')

    def test_safe_shell_exec_captures_last_line_wo_eol(self):
        """A final line without a trailing newline is captured as well."""
        cmd = 'bash -c "echo -e -n \\"hello\nstdout\\"; echo -e -n \\"hello\nstderr\\" >&2"'
        self.do_test_safe_shell_exec(cmd, 0, 'hello\nstdout', 'hello\nstderr')

    def test_safe_shell_exec_returns_exit_code(self):
        """The exit code of the command is returned by execute."""
        self.do_test_safe_shell_exec('false', 1, '', '')

    @pytest.mark.skip(reason='https://github.com/horovod/horovod/issues/1993')
    def test_safe_shell_exec_interrupts_on_event(self):
        """Setting the event interrupts the command before it finishes sleeping."""
        # interrupt execute in one second
        interrupt = threading.Event()
        interrupt_delay = 1.0
        delay(lambda: interrupt.set(), interrupt_delay)

        sleep = interrupt_delay + safe_shell_exec.GRACEFUL_TERMINATION_TIME_S + 2.0
        start = time.time()
        self.do_test_safe_shell_exec('sleep {}'.format(sleep), 143, '', None,
                                     interrupt)
        duration = time.time() - start

        self.assertGreaterEqual(duration, interrupt_delay)
        self.assertLess(duration, sleep - 1.0, 'sleep should not finish')

    def test_safe_shell_exec_interrupts_on_parent_shutdown(self):
        """Killing the parent script makes the child process exit too."""
        sleep = 20
        parent_script = os.path.join(os.path.dirname(__file__),
                                     'data/run_safe_shell_exec.py')
        child_script = os.path.join(os.path.dirname(__file__), 'data/sleep.py')

        def get_pid(logfile):
            # Wait until the script has written its PID to the logfile
            wait(lambda: os.path.exists(logfile), timeout=5)
            with open(logfile, 'r') as f:
                return int(f.read())

        with temppath() as parent_logfile, temppath() as child_logfile:
            # It's important that this executes in an entirely different interpreter with as little shared
            # state as possible, to avoid issues with the semaphore tracker.
            cmd = ' '.join([
                sys.executable, parent_script, parent_logfile, child_script,
                str(sleep), child_logfile
            ])
            p = subprocess.Popen(cmd, shell=True)

            parent = psutil.Process(get_pid(parent_logfile))
            child = psutil.Process(get_pid(child_logfile))

            self.assertTrue(parent.is_running())
            self.assertTrue(child.is_running())

            # Hard kill the parent process
            parent.kill()
            parent.wait(timeout=safe_shell_exec.GRACEFUL_TERMINATION_TIME_S)
            p.wait()

            # Child process will exit when pipe breaks
            child.wait(
                timeout=2 * safe_shell_exec.GRACEFUL_TERMINATION_TIME_S + 1)

            self.assertFalse(parent.is_running())
            self.assertFalse(child.is_running())

    def do_test_safe_shell_exec(self, cmd, expected_exit_code, expected_stdout,
                                expected_stderr, event=None):
        """Run ``cmd`` through safe_shell_exec.execute and check its results.

        The exit code is always compared; stdout / stderr are only compared
        when the respective expected value is not None. ``event``, if given,
        is passed to execute as its single event.
        """
        stdout = io.StringIO()
        stderr = io.StringIO()
        res = safe_shell_exec.execute(cmd, stdout=stdout, stderr=stderr,
                                      events=[event] if event else None)
        self.assertEqual(expected_exit_code, res)
        if expected_stdout is not None:
            self.assertEqual(expected_stdout, stdout.getvalue())
        if expected_stderr is not None:
            self.assertEqual(expected_stderr, stderr.getvalue())

    def test_hash(self):
        """_hash of a fixed string yields the expected hex digest."""
        digest = _hash("test string")
        self.assertEqual(digest, '6f8db599de986fab7a21625b7916589c')

    def test_host_hash(self):
        """A non-empty salt changes the host hash, an empty salt does not."""
        unsalted = host_hash()
        salted = host_hash('salt')
        empty_salted = host_hash('')

        self.assertNotEqual(salted, unsalted)
        self.assertEqual(empty_salted, unsalted)

    def test_get_mpi_implementation(self):
        """_get_mpi_implementation classifies the mocked 'mpirun --version' output."""
        def test(output, expected, exit_code=0):
            ret = (output, exit_code) if output is not None else None
            env = {'VAR': 'val'}
            with mock.patch("horovod.runner.mpi_run.tiny_shell_exec.execute",
                            return_value=ret) as m:
                implementation = _get_mpi_implementation(env)
                self.assertEqual(expected, implementation)
                m.assert_called_once_with('mpirun --version', env)

        test(("mpirun (Open MPI) 2.1.1\n"
              "Report bugs to http://www.open-mpi.org/community/help/\n"),
             _OMPI_IMPL)

        test("OpenRTE", _OMPI_IMPL)

        test("IBM Spectrum MPI", _SMPI_IMPL)

        test(("HYDRA build details:\n"
              " Version: 3.3a2\n"
              " Configure options: 'MPICHLIB_CFLAGS=-g -O2'\n"), _MPICH_IMPL)

        test("Unknown MPI v1.00", _UNKNOWN_IMPL)

        test("output", exit_code=1, expected=_MISSING_IMPL)

        test(None, _MISSING_IMPL)

    def test_run_controller(self):
        """run_controller picks the expected launcher for every flag combination."""
        def test(use_gloo, use_mpi, use_js,
                 gloo_is_built, mpi_is_built,
                 lsf_exists, jsrun_installed,
                 expected, exception):
            gloo_run = MagicMock()
            mpi_run = MagicMock()
            js_run = MagicMock()

            with is_built(gloo_is_built, mpi_is_built):
                with lsf_and_jsrun(lsf_exists, jsrun_installed):
                    if exception is not None:
                        with pytest.raises(ValueError, match=exception):
                            run_controller(use_gloo, gloo_run, use_mpi, mpi_run,
                                           use_js, js_run, verbosity=2)
                        return
                    run_controller(use_gloo, gloo_run, use_mpi, mpi_run,
                                   use_js, js_run, verbosity=2)

            if expected == "gloo":
                gloo_run.assert_called_once()
                mpi_run.assert_not_called()
                js_run.assert_not_called()
            elif expected == "mpi":
                gloo_run.assert_not_called()
                mpi_run.assert_called_once()
                js_run.assert_not_called()
            elif expected == "js":
                gloo_run.assert_not_called()
                mpi_run.assert_not_called()
                js_run.assert_called_once()
            else:
                raise ValueError("unsupported framework: {}".format(expected))

        # Expected error patterns; the MPI one is needed in two branches below.
        gloo_not_built = (
            r'^Gloo support has not been built\. If this is not expected, ensure CMake is installed '
            r'and reinstall Horovod with HOROVOD_WITH_GLOO=1 to debug the build error\.$')
        mpi_not_built = (
            r'^MPI support has not been built\. If this is not expected, ensure MPI is installed '
            r'and reinstall Horovod with HOROVOD_WITH_MPI=1 to debug the build error\.$')
        no_lsf_job = (
            'Horovod did not detect an LSF job. The jsrun launcher can only be used in that environment. '
            'Please, pick a different launcher for other environments.')
        nothing_built = (
            r'Neither MPI nor Gloo support has been built\. Try reinstalling Horovod ensuring that '
            r'either MPI is installed \(MPI\) or CMake is installed \(Gloo\)\.')

        bool_values = [False, True]
        bool_values_and_none = [None, False, True]

        combinations = itertools.product(bool_values_and_none,
                                         bool_values_and_none,
                                         bool_values_and_none,
                                         bool_values,
                                         bool_values,
                                         bool_values,
                                         bool_values)
        for (use_gloo, use_mpi, use_js,
             gloo_is_built, mpi_is_built,
             lsf_exists, jsrun_installed) in combinations:
            expected = exception = None
            if use_gloo:
                if gloo_is_built:
                    expected = 'gloo'
                else:
                    exception = gloo_not_built
            elif use_mpi:
                if mpi_is_built:
                    expected = 'mpi'
                else:
                    exception = mpi_not_built
            elif use_js:
                if mpi_is_built:
                    if lsf_exists:
                        expected = 'js'
                    else:
                        exception = no_lsf_job
                else:
                    exception = mpi_not_built
            elif mpi_is_built:
                if lsf_exists and jsrun_installed:
                    expected = 'js'
                else:
                    expected = 'mpi'
            elif gloo_is_built:
                expected = 'gloo'
            else:
                exception = nothing_built

            test(use_gloo, use_mpi, use_js,
                 gloo_is_built, mpi_is_built,
                 lsf_exists, jsrun_installed,
                 expected, exception)

    # Minimal mpi_run settings for tests.
    minimal_settings = hvd_settings.Settings(verbose=0,
                                             num_proc=2,
                                             hosts='localhost:2',
                                             run_func_mode=True)

    def test_mpi_run_minimal(self):
        """Tests mpi_run with minimal settings."""
        if not mpi_available():
            self.skipTest("MPI is not available")

        cmd = ['cmd']
        settings = self.minimal_settings

        def mpi_impl_flags(tcp, env=None):
            return ["--mock-mpi-impl-flags"], ["--mock-mpi-binding-args"]

        with mock.patch("horovod.runner.mpi_run._get_mpi_implementation_flags",
                        side_effect=mpi_impl_flags):
            with mock.patch("horovod.runner.mpi_run.safe_shell_exec.execute",
                            return_value=0) as execute:
                mpi_run(settings, None, {}, cmd)

                # call the mocked _get_mpi_implementation_flags method
                mpi_flags, binding_args = horovod.runner.mpi_run._get_mpi_implementation_flags(
                    False)
                self.assertIsNotNone(mpi_flags)
                expected_cmd = ('mpirun '
                                '--allow-run-as-root --tag-output '
                                '-np 2 -H localhost:2 '
                                '{binding_args} '
                                '{mpi_flags} '
                                'cmd').format(
                                    binding_args=' '.join(binding_args),
                                    mpi_flags=' '.join(mpi_flags))

                # remove PYTHONPATH from execute's env
                # we cannot know the exact value of that env variable
                # we test right handling of PYTHONPATH in test_mpi_run_*pythonpath* below
                self.assertIn('env', execute.call_args.kwargs)
                if 'PYTHONPATH' in execute.call_args.kwargs['env']:
                    execute.call_args.kwargs['env'].pop('PYTHONPATH')
                expected_env = {'PATH': os.environ.get('PATH')}

                execute.assert_called_once_with(expected_cmd, env=expected_env,
                                                stdout=None, stderr=None)

    def test_mpi_run_on_large_cluster(self):
        """Tests mpi_run on a large cluster."""
        if not mpi_available():
            self.skipTest("MPI is not available")

        cmd = ['cmd']
        # Work on a copy so the shared class-level settings stay untouched.
        settings = copy.copy(self.minimal_settings)
        settings.hosts = ','.join(['localhost:1'] * large_cluster_threshold)

        def mpi_impl_flags(tcp, env=None):
            return ["--mock-mpi-impl-flags"], ["--mock-mpi-binding-args"]

        with mock.patch("horovod.runner.mpi_run._get_mpi_implementation_flags",
                        side_effect=mpi_impl_flags):
            with mock.patch("horovod.runner.mpi_run.safe_shell_exec.execute",
                            return_value=0) as execute:
                mpi_run(settings, None, {}, cmd)

                # call the mocked _get_mpi_implementation_flags method
                mpi_flags, binding_args = horovod.runner.mpi_run._get_mpi_implementation_flags(
                    False)
                self.assertIsNotNone(mpi_flags)
                mpi_flags.append('-mca plm_rsh_no_tree_spawn true')
                mpi_flags.append('-mca plm_rsh_num_concurrent {}'.format(
                    large_cluster_threshold))
                expected_cmd = ('mpirun '
                                '--allow-run-as-root --tag-output '
                                '-np 2 -H {hosts} '
                                '{binding_args} '
                                '{mpi_flags} '
                                'cmd').format(
                                    hosts=settings.hosts,
                                    binding_args=' '.join(binding_args),
                                    mpi_flags=' '.join(mpi_flags))

                # remove PYTHONPATH from execute's env
                # we cannot know the exact value of that env variable
                # we test right handling of PYTHONPATH in test_mpi_run_*pythonpath* below
                self.assertIn('env', execute.call_args.kwargs)
                if 'PYTHONPATH' in execute.call_args.kwargs['env']:
                    execute.call_args.kwargs['env'].pop('PYTHONPATH')
                expected_env = {'PATH': os.environ.get('PATH')}

                execute.assert_called_once_with(expected_cmd, env=expected_env,
                                                stdout=None, stderr=None)

    def test_mpi_run_full(self):
        """Tests mpi_run with full settings."""
        if not mpi_available():
            self.skipTest("MPI is not available")

        cmd = ['cmd', 'arg1', 'arg2']
        nics = ['eth0', 'eth1']
        env = {'env1': 'val1', 'env2': 'val2'}
        stdout = '<stdout>'
        stderr = '<stderr>'
        tmout = timeout.Timeout(5, message='Timed out waiting for something.')
        settings = hvd_settings.Settings(
            verbose=0,
            ssh_port=1022,
            extra_mpi_args='>mpi-extra args go here<',
            binding_args='>binding args go here<',
            key=secret.make_secret_key(),
            start_timeout=tmout,
            num_proc=1,
            hosts='localhost:1',
            output_filename='>output filename goes here<',
            run_func_mode=True)

        def mpi_impl_flags(tcp, env=None):
            return ["--mock-mpi-impl-flags"], []

        with mock.patch("horovod.runner.mpi_run._get_mpi_implementation_flags",
                        side_effect=mpi_impl_flags) as impl:
            with mock.patch("horovod.runner.mpi_run.safe_shell_exec.execute",
                            return_value=0) as execute:
                mpi_run(settings, nics, env, cmd, stdout=stdout, stderr=stderr)

                # assert call on _get_mpi_implementation_flags
                impl.assert_called_once_with(None, env=env)

                # call the mocked _get_mpi_implementation_flags method ourselves
                mpi_flags, _ = horovod.runner.mpi_run._get_mpi_implementation_flags(
                    False)
                self.assertIsNotNone(mpi_flags)
                expected_command = (
                    'mpirun '
                    '--allow-run-as-root --tag-output '
                    '-np 1 -H {hosts} '
                    '>binding args go here< '
                    '{mpi_flags} '
                    '-mca plm_rsh_args "-p 1022" '
                    '-mca btl_tcp_if_include eth0,eth1 -x NCCL_SOCKET_IFNAME=eth0,eth1 '
                    '--output-filename >output filename goes here< '
                    '-x env1 -x env2 '
                    '>mpi-extra args go here< '
                    'cmd arg1 arg2').format(hosts=settings.hosts,
                                            mpi_flags=' '.join(mpi_flags))

                # remove PYTHONPATH from execute's env
                # we cannot know the exact value of that env variable
                # we test right handling of PYTHONPATH in test_mpi_run_*pythonpath* below
                self.assertIn('env', execute.call_args.kwargs)
                if 'PYTHONPATH' in execute.call_args.kwargs['env']:
                    execute.call_args.kwargs['env'].pop('PYTHONPATH')
                expected_env = {
                    'env1': 'val1',
                    'env2': 'val2',
                    'PATH': os.environ.get('PATH')
                }

                execute.assert_called_once_with(expected_command,
                                                env=expected_env,
                                                stdout=stdout, stderr=stderr)

    def test_mpi_run_without_pythonpath(self):
        """Tests mpi_run without PYTHONPATH set."""
        self.do_test_mpi_run_env_override({}, {}, 'PYTHONPATH', None)

    def test_mpi_run_with_sys_pythonpath(self):
        """Tests mpi_run with PYTHONPATH set in sys."""
        self.do_test_mpi_run_env_override({'PYTHONPATH': 'ppath'}, {},
                                          'PYTHONPATH', 'ppath')

    def test_mpi_run_with_env_pythonpath(self):
        """Tests mpi_run with PYTHONPATH set in env."""
        self.do_test_mpi_run_env_override({}, {'PYTHONPATH': 'ppath'},
                                          'PYTHONPATH', 'ppath')

    def test_mpi_run_with_both_pythonpaths(self):
        """Tests mpi_run with both PYTHONPATH set."""
        self.do_test_mpi_run_env_override({'PYTHONPATH': 'sys-ppath'},
                                          {'PYTHONPATH': 'env-ppath'},
                                          'PYTHONPATH', 'env-ppath')

    def test_mpi_run_without_path(self):
        """Tests mpi_run without PATH set."""
        self.do_test_mpi_run_env_override({}, {}, 'PATH', None)

    def test_mpi_run_with_sys_path(self):
        """Tests mpi_run with PATH set in sys."""
        self.do_test_mpi_run_env_override({'PATH': 'ppath'}, {}, 'PATH', 'ppath')

    def test_mpi_run_with_env_path(self):
        """Tests mpi_run with PATH set in env."""
        self.do_test_mpi_run_env_override({}, {'PATH': 'ppath'}, 'PATH', 'ppath')

    def test_mpi_run_with_both_paths(self):
        """Tests mpi_run with both PATH set."""
        self.do_test_mpi_run_env_override({'PATH': 'sys-path'},
                                          {'PATH': 'env-path'},
                                          'PATH', 'env-path')

    def do_test_mpi_run_env_override(self, sysenv, argenv, env_var, expected):
        """Actually tests mpi_run overrides arg env with sys env.

        Runs mpi_run with ``argenv`` while the process environment is
        overridden with ``sysenv`` and asserts that ``env_var`` in the
        environment handed to the mocked execute equals ``expected``.
        """
        if not mpi_available():
            self.skipTest("MPI is not available")

        cmd = ['cmd']
        settings = self.minimal_settings

        def mpi_impl_flags(tcp, env=None):
            return ["--mock-mpi-impl-flags"], ["--mock-mpi-binding-args"]

        with mock.patch("horovod.runner.mpi_run._get_mpi_implementation_flags",
                        side_effect=mpi_impl_flags), \
                mock.patch("horovod.runner.mpi_run.safe_shell_exec.execute",
                           return_value=0) as execute, \
                override_env(sysenv):
            mpi_run(settings, None, argenv, cmd)

            # assert the env variable in the execute's env
            self.assertIn('env', execute.call_args.kwargs)
            self.assertEqual(execute.call_args.kwargs['env'].get(env_var),
                             expected)

    def test_mpi_run_with_non_zero_exit(self):
        """mpi_run raises RuntimeError when the mocked execute returns 1."""
        if not mpi_available():
            self.skipTest("MPI is not available")

        cmd = ['cmd']
        settings = self.minimal_settings

        def mpi_impl_flags(tcp, env=None):
            return [], []

        with mock.patch("horovod.runner.mpi_run._get_mpi_implementation_flags",
                        side_effect=mpi_impl_flags):
            with mock.patch("horovod.runner.mpi_run.safe_shell_exec.execute",
                            return_value=1):
                with pytest.raises(RuntimeError,
                                   match="^mpirun failed with exit code 1$"):
                    mpi_run(settings, None, {}, cmd)

    def test_mpi_run_with_os_environ(self):
        """Tests mpi_run with os.environ."""
        if not mpi_available():
            self.skipTest("MPI is not available")

        cmd = ['cmd']
        settings = self.minimal_settings

        def mpi_impl_flags(tcp, env=None):
            return ["--mock-mpi-impl-flags"], ["--mock-mpi-binding-args"]

        with mock.patch("horovod.runner.mpi_run._get_mpi_implementation_flags",
                        side_effect=mpi_impl_flags):
            with mock.patch("horovod.runner.mpi_run.safe_shell_exec.execute",
                            return_value=0):
                with pytest.raises(
                        Exception,
                        match="^env argument must be a dict, not <class 'os._Environ'>: "
                ):
                    mpi_run(settings, None, os.environ, cmd)

    def test_gloo_run_minimal(self):
        """Tests gloo_run with minimal settings."""
        if not gloo_built:
            self.skipTest("Gloo is not available")

        cmd = ['whoami']
        settings = self.minimal_settings
        gloo_run(settings, ['lo'], {}, '127.0.0.1', cmd)

    def test_gloo_run_with_os_environ(self):
        """Tests gloo_run with os.environ."""
        if not gloo_built:
            self.skipTest("Gloo is not available")

        cmd = ['whoami']
        settings = self.minimal_settings
        gloo_run(settings, ['lo'], os.environ, '127.0.0.1', cmd)

    def test_horovodrun_hostfile(self):
        """A host file with 'slots=' entries is parsed into 'host:slots,...'."""
        with temppath() as host_filename:
            with open(host_filename, 'w+') as fp:
                fp.write('172.31.32.7 slots=8\n')
                fp.write('172.31.33.9 slots=8\n')

            hostnames = hosts.parse_host_files(host_filename)
            self.assertEqual(hostnames, '172.31.32.7:8,172.31.33.9:8')

    @mock.patch('horovod.runner.js_run.is_jsrun_installed',
                MagicMock(return_value=True))
    @mock.patch('horovod.runner.js_run.generate_jsrun_rankfile',
                MagicMock(return_value='/tmp/rankfile'))
    @mock.patch('horovod.runner.util.lsf.LSFUtils.get_num_gpus',
                MagicMock(return_value=2))
    @mock.patch('horovod.runner.util.lsf.LSFUtils.get_num_cores',
                MagicMock(return_value=2))
    def test_js_run(self):
        """Tests js_run."""
        if _get_mpi_implementation_flags(False)[0] is None:
            self.skipTest("MPI is not available")

        cmd = ['cmd', 'arg1', 'arg2']
        env = {'env1': 'val1', 'env2': 'val2'}
        stdout = '<stdout>'
        stderr = '<stderr>'
        settings = hvd_settings.Settings(
            verbose=0,
            extra_mpi_args='>mpi-extra args go here<',
            num_proc=4,
            hosts='localhost:2,127.0.0.1:2',
            output_filename='>output filename goes here<',
            run_func_mode=True)

        def mpi_impl_flags(tcp, env=None):
            return ["--mock-mpi-impl-flags"], []

        with mock.patch("horovod.runner.js_run._get_mpi_implementation_flags",
                        side_effect=mpi_impl_flags):
            with mock.patch("horovod.runner.js_run.safe_shell_exec.execute",
                            return_value=0) as execute:
                js_run(settings, None, env, cmd, stdout=stdout, stderr=stderr)

                # call the mocked _get_mpi_implementation_flags method
                mpi_flags, _ = horovod.runner.js_run._get_mpi_implementation_flags(
                    False)
                self.assertIsNotNone(mpi_flags)
                expected_command = (
                    'jsrun '
                    '--erf_input /tmp/rankfile '
                    '--stdio_stderr >output filename goes here< '
                    '--stdio_stdout >output filename goes here< '
                    '--smpiargs \'{mpi_args} >mpi-extra args go here<\' '
                    'cmd arg1 arg2').format(mpi_args=' '.join(mpi_flags))
                expected_env = {'env1': 'val1', 'env2': 'val2'}

                execute.assert_called_once_with(expected_command,
                                                env=expected_env,
                                                stdout=stdout, stderr=stderr)

    @mock.patch('horovod.runner.util.lsf.LSFUtils.get_num_gpus',
                MagicMock(return_value=4))
    @mock.patch('horovod.runner.util.lsf.LSFUtils.get_num_cores',
                MagicMock(return_value=4))
    @mock.patch('horovod.runner.util.lsf.LSFUtils.get_num_threads',
                MagicMock(return_value=4))
    def test_generate_jsrun_rankfile(self):
        """Tests generate_jsrun_rankfile."""
        settings = hvd_settings.Settings(
            num_proc=5,
            hosts='host1:4,host2:4,host3:4',
        )

        with temppath() as rankfile_path:
            rankfile_path = generate_jsrun_rankfile(settings, rankfile_path)

            with open(rankfile_path, 'r') as file:
                gen_rankfile = file.read()

            # NOTE(review): the line breaks (including the blank separator
            # lines) of this literal must match the generator's output
            # exactly — confirm.
            expected_rankfile = (
"""overlapping_rs: allow
cpu_index_using: logical

rank: 0: { hostname: host1; cpu: {0-3} ; gpu: * ; mem: * }
rank: 1: { hostname: host1; cpu: {4-7} ; gpu: * ; mem: * }
rank: 2: { hostname: host1; cpu: {8-11} ; gpu: * ; mem: * }
rank: 3: { hostname: host1; cpu: {12-15} ; gpu: * ; mem: * }

rank: 4: { hostname: host2; cpu: {0-3} ; gpu: * ; mem: * }
""")

            self.assertMultiLineEqual(gen_rankfile, expected_rankfile)

    @mock.patch('horovod.runner.util.lsf.LSFUtils.using_lsf',
                MagicMock(return_value=True))
    @mock.patch('horovod.runner.util.lsf.LSFUtils.get_compute_hosts',
                MagicMock(return_value=['host1', 'host2']))
    @mock.patch('horovod.runner.util.lsf.LSFUtils.get_num_gpus',
                MagicMock(return_value=2))
    @mock.patch('horovod.runner.util.network.filter_local_addresses',
                MagicMock(return_value=['host1', 'host2']))
    @mock.patch('horovod.runner.launch._check_all_hosts_ssh_successful',
                MagicMock())
    @mock.patch('horovod.runner.launch.run_controller')
    def test_run_with_jsrun(self, mocked_run_controller):
        """Tests horovod.runner.launch._run with jsrun"""
        hargs = _HorovodArgs()
        _run(hargs)
        mocked_run_controller.assert_called_once()

    def test_get_host_assignments(self):
        """Four processes over two 2-slot hosts get the expected slot infos."""
        hosts = parse_hosts('worker-0:2,worker-1:2')
        np = 4
        assignments = get_host_assignments(hosts, np)

        sizes = dict(size=4, local_size=2, cross_size=2)
        expected = [
            SlotInfo(hostname='worker-0', rank=0, local_rank=0, cross_rank=0,
                     **sizes),
            SlotInfo(hostname='worker-0', rank=1, local_rank=1, cross_rank=0,
                     **sizes),
            SlotInfo(hostname='worker-1', rank=2, local_rank=0, cross_rank=1,
                     **sizes),
            SlotInfo(hostname='worker-1', rank=3, local_rank=1, cross_rank=1,
                     **sizes)
        ]
        self.assertListEqual(assignments, expected)

    def test_get_host_assignments_elastic(self):
        """With min_np=1 and max_np=2 only the first host's two slots are assigned."""
        hosts = parse_hosts('worker-0:2,worker-1:2')
        min_np = 1
        max_np = 2
        assignments = get_host_assignments(hosts, min_np=min_np, max_np=max_np)

        sizes = dict(size=2, local_size=2, cross_size=1)
        expected = [
            SlotInfo(hostname='worker-0', rank=0, local_rank=0, cross_rank=0,
                     **sizes),
            SlotInfo(hostname='worker-0', rank=1, local_rank=1, cross_rank=0,
                     **sizes)
        ]
        self.assertListEqual(assignments, expected)

    def test_get_host_assignments_heterogeneous(self):
        """Hosts with different slot counts get per-slot local and cross sizes."""
        hosts = parse_hosts('worker-0:1,worker-1:2')
        np = 3
        assignments = get_host_assignments(hosts, np)

        expected = [
            SlotInfo(hostname='worker-0', rank=0, local_rank=0, cross_rank=0,
                     size=3, local_size=1, cross_size=2),
            SlotInfo(hostname='worker-1', rank=1, local_rank=0, cross_rank=1,
                     size=3, local_size=2, cross_size=2),
            SlotInfo(hostname='worker-1', rank=2, local_rank=1, cross_rank=0,
                     size=3, local_size=2, cross_size=1)
        ]
        self.assertListEqual(assignments, expected)
def run(fn, args=(), kwargs=None, num_proc=None, start_timeout=None,
        use_mpi=None, use_gloo=None, extra_mpi_args=None,
        env=None, stdout=None, stderr=None, verbose=1, nics=None,
        prefix_output_with_timestamp=False, executable=None):
    """
    Runs Horovod on Spark.  Runs `num_proc` processes executing `fn` using the same amount of Spark tasks.

    Args:
        fn: Function to run.
        args: Arguments to pass to `fn`.
        kwargs: Keyword arguments to pass to `fn`. Defaults to no keyword arguments.
        num_proc: Number of Horovod processes.  Defaults to `spark.default.parallelism`.
        start_timeout: Timeout for Spark tasks to spawn, register and start running the code, in seconds.
                       If not set, falls back to `HOROVOD_SPARK_START_TIMEOUT` environment variable value.
                       If it is not set as well, defaults to 600 seconds.
        use_mpi: Controller preference; forwarded together with `use_gloo` to the Gloo-vs-MPI
                 decision and to the job launcher.
        use_gloo: Controller preference; forwarded together with `use_mpi` to the Gloo-vs-MPI
                  decision and to the job launcher.
        extra_mpi_args: Extra arguments for mpi_run. Defaults to no extra args.
        env: Environment dictionary to use in Horovod run.
        stdout: Horovod stdout is redirected to this stream. Defaults to sys.stdout when used with MPI.
        stderr: Horovod stderr is redirected to this stream. Defaults to sys.stderr when used with MPI.
        verbose: Debug output verbosity (0-2). Defaults to 1.
        nics: List of NICs for tcp network communication.
        prefix_output_with_timestamp: shows timestamp in stdout/stderr forwarding on the driver
        executable: Optional executable to run when launching the workers. Defaults to `sys.executable`.

    Returns:
        List of results returned by running `fn` on each rank.

    Raises:
        Exception: If there is no active SparkContext.
    """
    # A fresh dict per call; a `{}` default in the signature would be a single
    # object shared by every call of this function.
    if kwargs is None:
        kwargs = {}

    if start_timeout is None:
        # Lookup default timeout from the environment variable.
        start_timeout = int(os.getenv('HOROVOD_SPARK_START_TIMEOUT', '600'))

    # nics needs to be a set
    if nics and not isinstance(nics, set):
        nics = set(nics)

    tmout = timeout.Timeout(
        start_timeout,
        message='Timed out waiting for {activity}. Please check that you have '
                'enough resources to run all Horovod processes. Each Horovod '
                'process runs in a Spark task. You may need to increase the '
                'start_timeout parameter to a larger value if your Spark resources '
                'are allocated on-demand.')
    settings = hvd_settings.Settings(
        verbose=verbose,
        extra_mpi_args=extra_mpi_args,
        key=secret.make_secret_key(),
        start_timeout=tmout,
        nics=nics,
        run_func_mode=True,
        prefix_output_with_timestamp=prefix_output_with_timestamp)

    spark_context = pyspark.SparkContext._active_spark_context
    if spark_context is None:
        raise Exception('Could not find an active SparkContext, are you '
                        'running in a PySpark session?')

    if num_proc is None:
        num_proc = spark_context.defaultParallelism
        if settings.verbose >= 1:
            logging.info(
                'Running %d processes (inferred from spark.default.parallelism)...',
                num_proc)
    else:
        if settings.verbose >= 1:
            logging.info('Running %d processes...', num_proc)
    settings.num_proc = num_proc

    result_queue = queue.Queue(1)

    # start Spark driver service and launch settings.num_proc Spark tasks
    spark_job_group = 'horovod.spark.run.%d' % job_id.next_job_id()
    driver = driver_service.SparkDriverService(settings.num_proc, settings.num_proc,
                                               fn, args, kwargs,
                                               settings.key, settings.nics)
    gloo_is_used = is_gloo_used(use_gloo=use_gloo, use_mpi=use_mpi, use_jsrun=False)
    spark_thread = _make_spark_thread(spark_context, spark_job_group, driver,
                                      result_queue, settings,
                                      use_gloo=gloo_is_used, is_elastic=False)
    try:
        # wait for all tasks to register, notify them and initiate task-to-task address registration
        _notify_and_register_task_addresses(driver, settings)

        # Determine the index grouping based on host hashes.
        # Barrel shift until index 0 is in the first host.
        host_hashes = list(driver.task_host_hash_indices().keys())
        host_hashes.sort()
        while 0 not in driver.task_host_hash_indices()[host_hashes[0]]:
            host_hashes = host_hashes[1:] + host_hashes[:1]

        settings.hosts = ','.join(
            '%s:%d' % (host_hash, len(driver.task_host_hash_indices()[host_hash]))
            for host_hash in host_hashes)

        # Run the job
        _launch_job(use_mpi, use_gloo, settings, driver, env, stdout, stderr, executable)
    except BaseException:
        # Deliberately catches everything (including KeyboardInterrupt and
        # SystemExit) so the Spark job is terminated on any failure.
        spark_context.cancelJobGroup(spark_job_group)

        # Re-raise exception.
        raise
    finally:
        try:
            spark_thread.join()
        finally:
            # Shut the driver service down even if the join is interrupted.
            driver.shutdown()

    # Make sure Spark Job did not fail.
    driver.check_for_spark_job_failure()

    # get ranks from driver
    indices_in_rank_order = _get_indices_in_rank_order(driver)

    # If there's no exception, execution results are in this queue.
    results = result_queue.get_nowait()
    return [results[index] for index in indices_in_rank_order]