def wait_for_available_slots(self, min_np, min_hosts=1):
    extra_message = ' An elastic job also requires that at least two hosts ' \
                    'are available to resolve compatible network interfaces. If you know which interfaces ' \
                    'are compatible in your network, set `--network-interface` to skip this check.' \
        if min_hosts > 1 else ''

    tmout = timeout.Timeout(self._timeout,
                            message='Timed out waiting for {{activity}}. Please check that you have '
                                    'enough resources to run at least {min_np} Horovod processes.{extra_message}'
                                    .format(min_np=min_np, extra_message=extra_message))

    self._wait_hosts_cond.acquire()
    try:
        while True:
            current_hosts = self._host_manager.current_hosts
            avail_slots = current_hosts.count_available_slots()
            logging.debug(f"current available slots: {avail_slots}")
            avail_hosts = len(current_hosts.available_hosts)
            logging.debug(f"current available hosts: {avail_hosts}")
            if avail_slots >= min_np and avail_hosts >= min_hosts:
                return current_hosts
            if self._shutdown.is_set():
                raise RuntimeError('Job has been shutdown, see above error messages for details.')
            self._wait_hosts_cond.wait(tmout.remaining())
            tmout.check_time_out_for('minimum number of slots to become available')
    finally:
        self._wait_hosts_cond.release()
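# Every call site in this file leans on the same small helper. A minimal
# sketch of the interface these call sites assume from
# `horovod.runner.common.util.timeout.Timeout` -- a deadline fixed at
# construction, `remaining()` to bound condition waits, and
# `check_time_out_for(activity)` that raises with the activity interpolated
# into the message once the deadline has passed. This is an illustrative
# stand-in, not the actual Horovod class:
import time


class TimeoutSketch:
    """Illustrative stand-in for timeout.Timeout (the real helper raises a
    dedicated TimeoutException; a builtin is used here to stay self-contained)."""

    def __init__(self, timeout, message):
        self._timeout_at = time.time() + timeout
        self._message = message

    def remaining(self):
        # Seconds left until the deadline, floored at zero so the value is
        # always safe to pass to Condition.wait().
        return max(0.0, self._timeout_at - time.time())

    def check_time_out_for(self, activity):
        if time.time() > self._timeout_at:
            raise TimeoutError(self._message.format(activity=activity))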
def start_timeout(self):
    return timeout.Timeout(self.timeout_s,
                           message="Timed out waiting for {activity}. Please "
                                   "check connectivity between servers. You "
                                   "may need to increase the --start-timeout "
                                   "parameter if you have too many servers.")
def _run_elastic(args):
    # construct host discovery component
    if args.host_discovery_script:
        discover_hosts = discovery.HostDiscoveryScript(args.host_discovery_script, args.slots)
    elif args.hosts:
        _, available_host_slots = hosts.parse_hosts_and_slots(args.hosts)
        if len(available_host_slots) < 2:
            raise ValueError('Cannot run in fault tolerance mode with fewer than 2 hosts.')
        discover_hosts = discovery.FixedHosts(available_host_slots)
    else:
        raise ValueError('One of --host-discovery-script, --hosts, or --hostnames must be provided')

    # horovodrun has to finish all the checks before this timeout runs out.
    if args.start_timeout:
        start_timeout = args.start_timeout
    else:
        # Lookup default timeout from the environment variable.
        start_timeout = int(os.getenv('HOROVOD_START_TIMEOUT', '30'))

    tmout = timeout.Timeout(start_timeout,
                            message='Timed out waiting for {activity}. Please '
                                    'check connectivity between servers. You '
                                    'may need to increase the --start-timeout '
                                    'parameter if you have too many servers.')

    settings = elastic_settings.ElasticSettings(discovery=discover_hosts,
                                                min_num_proc=args.min_num_proc or args.num_proc,
                                                max_num_proc=args.max_num_proc,
                                                elastic_timeout=args.elastic_timeout,
                                                reset_limit=args.reset_limit,
                                                cooldown_range=args.cooldown_range,
                                                num_proc=args.num_proc,
                                                verbose=2 if args.verbose else 0,
                                                ssh_port=args.ssh_port,
                                                ssh_identity_file=args.ssh_identity_file,
                                                extra_mpi_args=args.mpi_args,
                                                key=secret.make_secret_key(),
                                                start_timeout=tmout,
                                                output_filename=args.output_filename,
                                                run_func_mode=args.run_func is not None,
                                                nics=args.nics,
                                                prefix_output_with_timestamp=args.prefix_output_with_timestamp)

    if not gloo_built(verbose=(settings.verbose >= 2)):
        raise ValueError('Gloo support is required to use elastic training, but has not been built. Ensure CMake is '
                         'installed and reinstall Horovod with HOROVOD_WITH_GLOO=1 to debug the build error.')

    env = os.environ.copy()
    config_parser.set_env_from_args(env, args)
    executable = args.executable or sys.executable
    return gloo_run_elastic(settings, env, args.run_func if args.run_func else args.command, executable)
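# `discovery.HostDiscoveryScript` above wraps a user-supplied executable that
# horovodrun re-invokes to learn the current host set. Per the Horovod elastic
# docs, the script prints one host per line, optionally suffixed with
# ':<slots>'. A minimal sketch of such a script (hostnames and slot counts are
# placeholder assumptions):

# discover_hosts.py -- emit the currently reachable workers
for host in ('worker-1', 'worker-2'):
    print('{}:4'.format(host))  # 4 slots per host, assumed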
def test_mpi_run_full(self):
    if not mpi_available():
        self.skipTest("MPI is not available")

    cmd = ['cmd', 'arg1', 'arg2']
    nics = ['eth0', 'eth1']
    env = {'env1': 'val1', 'env2': 'val2'}
    stdout = '<stdout>'
    stderr = '<stderr>'
    tmout = timeout.Timeout(5, message='Timed out waiting for something.')
    settings = hvd_settings.Settings(
        verbose=0,
        ssh_port=1022,
        extra_mpi_args='>mpi-extra args go here<',
        binding_args='>binding args go here<',
        key=secret.make_secret_key(),
        start_timeout=tmout,
        num_proc=1,
        hosts='localhost:1',
        output_filename='>output filename goes here<',
        run_func_mode=True
    )

    def mpi_impl_flags(tcp, env=None):
        return ["--mock-mpi-impl-flags"], []

    with mock.patch("horovod.runner.mpi_run._get_mpi_implementation_flags", side_effect=mpi_impl_flags) as impl:
        with mock.patch("horovod.runner.mpi_run.safe_shell_exec.execute", return_value=0) as execute:
            mpi_run(settings, nics, env, cmd, stdout=stdout, stderr=stderr)

            # assert call on _get_mpi_implementation_flags
            impl.assert_called_once_with(None, env=env)

            # call the mocked _get_mpi_implementation_flags method ourselves
            mpi_flags, _ = horovod.runner.mpi_run._get_mpi_implementation_flags(False)
            self.assertIsNotNone(mpi_flags)
            expected_command = ('mpirun '
                                '--allow-run-as-root --tag-output '
                                '-np 1 -H {hosts} '
                                '>binding args go here< '
                                '{mpi_flags} '
                                '-mca plm_rsh_args "-p 1022" '
                                '-mca btl_tcp_if_include eth0,eth1 -x NCCL_SOCKET_IFNAME=eth0,eth1 '
                                '--output-filename >output filename goes here< '
                                '-x env1 -x env2 '
                                '>mpi-extra args go here< '
                                'cmd arg1 arg2').format(hosts=settings.hosts,
                                                        mpi_flags=' '.join(mpi_flags))

            # remove PYTHONPATH from execute's env
            # we cannot know the exact value of that env variable
            # we test right handling of PYTHONPATH in test_mpi_run_*pythonpath* below
            self.assertIn('env', execute.call_args.kwargs)
            if 'PYTHONPATH' in execute.call_args.kwargs['env']:
                execute.call_args.kwargs['env'].pop('PYTHONPATH')

            expected_env = {'env1': 'val1', 'env2': 'val2', 'PATH': os.environ.get('PATH')}
            execute.assert_called_once_with(expected_command, env=expected_env,
                                            stdout=stdout, stderr=stderr)
def _run_command(self, command, env, event, stdout=None, stderr=None, index=None,
                 prefix_output_with_timestamp=False):
    super(SparkTaskService, self)._run_command(command, env, event, stdout, stderr, index,
                                               prefix_output_with_timestamp)

    if self._minimum_command_lifetime_s is not None:
        self._minimum_command_lifetime = timeout.Timeout(self._minimum_command_lifetime_s,
                                                         message='Just measuring runtime')
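# The Timeout created above is never used to abort anything; its deadline just
# records when the command has lived "long enough". A sketch of how the
# service could consume it before tearing the task down
# (`_wait_for_minimum_lifetime` is a hypothetical name, not a documented
# Horovod method):
import time


def _wait_for_minimum_lifetime(self):
    # Sleep out whatever remains of the minimum command lifetime so that
    # very short-lived commands do not make the task disappear before the
    # driver has observed their exit.
    if self._minimum_command_lifetime is not None:
        time.sleep(self._minimum_command_lifetime.remaining())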
def create_settings(min_np: int = 1,
                    max_np: int = None,
                    reset_limit: int = None,
                    elastic_timeout: int = 600,
                    timeout_s: int = 30,
                    ssh_identity_file: str = None,
                    nics: str = None,
                    **kwargs):
    """Returns a Settings object for ElasticRayExecutor.

    Note that the `discovery` property will be set at runtime.

    Args:
        min_np (int): Minimum number of processes running for
            training to continue. If number of available processes dips
            below this threshold, then training will wait for
            more instances to become available.
        max_np (int): Maximum number of training processes,
            beyond which no additional processes will be created.
            If not specified, then will be unbounded.
        reset_limit (int): Maximum number of times that the training
            job can scale up or down the number of workers after
            which the job is terminated.
        elastic_timeout (int): Timeout for elastic initialisation after
            re-scaling the cluster. The default value is 600 seconds.
            Alternatively, the environment variable
            HOROVOD_ELASTIC_TIMEOUT can also be used.
        timeout_s (int): Horovod performs all the checks and starts the
            processes before the specified timeout. The default value is
            30 seconds.
        ssh_identity_file (str): File on the driver from which the
            identity (private key) is read.
        nics (set): Network interfaces that can be used for communication.
    """
    start_timeout = timeout.Timeout(
        timeout_s,
        message="Timed out waiting for {activity}. Please "
                "check connectivity between servers. You "
                "may need to increase the --start-timeout "
                "parameter if you have too many servers.")
    ssh_identity_file = ssh_identity_file or os.path.expanduser("~/ray_bootstrap_key.pem")
    settings = ElasticSettings(
        discovery=None,
        min_np=min_np,
        max_np=max_np,
        elastic_timeout=elastic_timeout,
        reset_limit=reset_limit,
        num_proc=min_np,
        ssh_identity_file=ssh_identity_file,
        nics=nics,
        start_timeout=start_timeout,
        key=secret.make_secret_key() if secret else None,
        **kwargs)
    return settings
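# A hedged usage sketch for the factory above. The Horovod-on-Ray docs expose
# it as `ElasticRayExecutor.create_settings`; `train_fn`, the cluster address,
# and the sizing values here are placeholders:
import ray
from horovod.ray import ElasticRayExecutor

ray.init(address='auto')  # attach to a running Ray cluster
settings = ElasticRayExecutor.create_settings(min_np=1, max_np=4, timeout_s=60)
executor = ElasticRayExecutor(settings, use_gpu=False, cpus_per_slot=1)
executor.start()
results = executor.run(train_fn)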
def _run_static(args):
    nics_set = set(args.nics.split(',')) if args.nics else None

    # horovodrun has to finish all the checks before this timeout runs out.
    if args.start_timeout:
        start_timeout = args.start_timeout
    else:
        # Lookup default timeout from the environment variable.
        start_timeout = int(os.getenv('HOROVOD_START_TIMEOUT', '30'))

    tmout = timeout.Timeout(start_timeout,
                            message='Timed out waiting for {activity}. Please '
                                    'check connectivity between servers. You '
                                    'may need to increase the --start-timeout '
                                    'parameter if you have too many servers.')

    settings = hvd_settings.Settings(verbose=2 if args.verbose else 0,
                                     ssh_port=args.ssh_port,
                                     ssh_identity_file=args.ssh_identity_file,
                                     extra_mpi_args=args.mpi_args,
                                     tcp_flag=args.tcp_flag,
                                     binding_args=args.binding_args,
                                     key=secret.make_secret_key(),
                                     start_timeout=tmout,
                                     num_proc=args.np,
                                     hosts=args.hosts,
                                     output_filename=args.output_filename,
                                     run_func_mode=args.run_func is not None,
                                     nics=nics_set)

    # This cache stores the results of checks performed by horovod
    # during the initialization step. It can be disabled by setting
    # --disable-cache flag.
    fn_cache = None
    if not args.disable_cache:
        params = ''
        if args.np:
            params += str(args.np) + ' '
        if args.hosts:
            params += str(args.hosts) + ' '
        if args.ssh_port:
            params += str(args.ssh_port)
        if args.ssh_identity_file:
            params += args.ssh_identity_file
        parameters_hash = hashlib.md5(params.encode('utf-8')).hexdigest()
        fn_cache = cache.Cache(CACHE_FOLDER, CACHE_STALENESS_THRESHOLD_MINUTES,
                               parameters_hash)

    all_host_names, _ = hosts.parse_hosts_and_slots(args.hosts)
    if settings.verbose >= 2:
        print('Filtering local host names.')
    remote_host_names = network.filter_local_addresses(all_host_names)
    if settings.verbose >= 2:
        print('Remote host found: ' + ' '.join(remote_host_names))

    if len(remote_host_names) > 0:
        if settings.verbose >= 2:
            print('Checking ssh on all remote hosts.')
        # Check if we can ssh into all remote hosts successfully.
        if not _check_all_hosts_ssh_successful(remote_host_names, args.ssh_port,
                                               args.ssh_identity_file, fn_cache=fn_cache):
            raise RuntimeError('could not connect to some hosts via ssh')
        if settings.verbose >= 2:
            print('SSH was successful into all the remote hosts.')

    nics = driver_service.get_common_interfaces(settings, all_host_names,
                                                remote_host_names, fn_cache)

    if args.run_func:
        # get the driver IPv4 address
        driver_ip = network.get_driver_ip(nics)
        run_func_server = KVStoreServer(verbose=settings.verbose)
        run_func_server_port = run_func_server.start_server()
        put_data_into_kvstore(driver_ip, run_func_server_port, 'runfunc', 'func', args.run_func)

        command = [sys.executable, '-m', 'horovod.runner.run_task',
                   str(driver_ip), str(run_func_server_port)]

        try:
            _launch_job(args, settings, nics, command)
            results = [None] * args.np
            # TODO: make it parallel to improve performance
            for i in range(args.np):
                results[i] = read_data_from_kvstore(driver_ip, run_func_server_port,
                                                    'runfunc_result', str(i))
            return results
        finally:
            run_func_server.shutdown_server()
    else:
        command = args.command
        _launch_job(args, settings, nics, command)
        return None
def run_elastic(fn, args=(), kwargs={},
                num_proc=None, min_num_proc=None, max_num_proc=None,
                start_timeout=None, elastic_timeout=None, reset_limit=None,
                env=None, stdout=None, stderr=None, verbose=1, nics=None,
                prefix_output_with_timestamp=False,
                # min_np is deprecated, use min_num_proc instead
                min_np=None,
                # max_np is deprecated, use max_num_proc instead
                max_np=None):
    """
    Runs Elastic Horovod on Spark.  Runs `num_proc` processes executing `fn` using the same amount of Spark tasks.

    Args:
        fn: Function to run.
        args: Arguments to pass to `fn`.
        kwargs: Keyword arguments to pass to `fn`.
        num_proc: Number of Horovod processes.  Defaults to `spark.default.parallelism`.
        min_num_proc: Minimum number of processes running for training to continue.
                      If number of available processes dips below this threshold,
                      then training will wait for more instances to become available.
        max_num_proc: Maximum number of training processes, beyond which no additional
                      processes will be created. If not specified, then will be unbounded.
        start_timeout: Timeout for Spark tasks to spawn, register and start running the code, in seconds.
                       If not set, falls back to `HOROVOD_SPARK_START_TIMEOUT` environment variable value.
                       If it is not set as well, defaults to 600 seconds.
        elastic_timeout: Timeout for elastic initialisation after re-scaling the cluster.
                         If not set, falls back to `HOROVOD_ELASTIC_TIMEOUT` environment variable value.
                         If it is not set as well, defaults to 600 seconds.
        reset_limit: Maximum number of resets after which the job is terminated.
        env: Environment dictionary to use in Horovod run.  Defaults to `os.environ`.
        stdout: Horovod stdout is redirected to this stream.
        stderr: Horovod stderr is redirected to this stream.
        verbose: Debug output verbosity (0-2). Defaults to 1.
        nics: List of NICs for tcp network communication.
        prefix_output_with_timestamp: shows timestamp in stdout/stderr forwarding on the driver

    Returns:
        List of results returned by running `fn` on each rank.
    """
    if min_np is not None:
        min_num_proc = min_np
        warnings.warn('min_np is deprecated, use min_num_proc instead', DeprecationWarning)
    if max_np is not None:
        max_num_proc = max_np
        warnings.warn('max_np is deprecated, use max_num_proc instead', DeprecationWarning)

    if not gloo_built(verbose=(verbose >= 2)):
        raise ValueError('Gloo support is required to use elastic training, but has not been built. Ensure CMake is '
                         'installed and reinstall Horovod with HOROVOD_WITH_GLOO=1 to debug the build error.')

    spark_context = pyspark.SparkContext._active_spark_context
    if spark_context is None:
        raise Exception('Could not find an active SparkContext, are you '
                        'running in a PySpark session?')

    if start_timeout is None:
        # Lookup default timeout from the environment variable.
        start_timeout = int(os.getenv('HOROVOD_SPARK_START_TIMEOUT', '600'))

    # nics needs to be a set
    if nics and not isinstance(nics, set):
        nics = set(nics)

    if num_proc is None:
        # TODO: #2023 try spark.dynamicAllocation.initialExecutors
        num_proc = spark_context.defaultParallelism
        if verbose >= 1:
            logging.info('Running %d processes (inferred from spark.default.parallelism)...', num_proc)
    else:
        if verbose >= 1:
            logging.info('Running %d processes...', num_proc)

    if min_num_proc is None:
        # TODO: #2023 try spark.dynamicAllocation.minExecutors
        min_num_proc = num_proc
    if max_num_proc is None:
        # TODO: #2023 try spark.dynamicAllocation.maxExecutors
        max_num_proc = num_proc

    # start Spark driver service and launch settings.num_proc Spark tasks
    key = secret.make_secret_key()
    spark_job_group = 'horovod.spark.run.%d' % job_id.next_job_id()
    driver = driver_service.SparkDriverService(num_proc, max_num_proc,
                                               fn, args, kwargs,
                                               key, nics)

    discovery = host_discovery.SparkDriverHostDiscovery(driver)

    tmout = timeout.Timeout(start_timeout,
                            message='Timed out waiting for {activity}. Please check that you have '
                                    'enough resources to run all Horovod processes. Each Horovod '
                                    'process runs in a Spark task. You may need to increase the '
                                    'start_timeout parameter to a larger value if your Spark resources '
                                    'are allocated on-demand.')
    settings = hvd_elastic_settings.ElasticSettings(discovery=discovery,
                                                    min_num_proc=min_num_proc,
                                                    max_num_proc=max_num_proc,
                                                    elastic_timeout=elastic_timeout,
                                                    reset_limit=reset_limit,
                                                    num_proc=num_proc,
                                                    verbose=verbose,
                                                    key=key,
                                                    start_timeout=tmout,
                                                    nics=nics,
                                                    run_func_mode=True,
                                                    prefix_output_with_timestamp=prefix_output_with_timestamp)

    result_queue = queue.Queue(1)

    # launch settings.num_proc / settings.max_num_proc Spark tasks
    spark_thread = _make_spark_thread(spark_context, spark_job_group, driver,
                                      result_queue, settings,
                                      use_gloo=True, is_elastic=True)
    try:
        # Register task addresses of initial num_proc tasks
        _register_task_addresses(driver, settings)

        # Run the job
        gloo_run_elastic(settings, driver, env, stdout, stderr)
    except:
        # Terminate Spark job.
        spark_context.cancelJobGroup(spark_job_group)

        # Re-raise exception.
        raise
    finally:
        spark_thread.join()
        driver.shutdown()

    # Make sure Spark Job did not fail.
    driver.check_for_spark_job_failure()

    # get ranks from driver
    indices_in_rank_order = _get_indices_in_rank_order(driver)

    # If there's no exception, execution results are in this queue.
    results = result_queue.get_nowait()
    return [results[index] for index in indices_in_rank_order]
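# A hedged usage sketch for `run_elastic` above. The training function and
# process counts are placeholders; a real elastic job would additionally wrap
# its training loop in Horovod's elastic state handling:
import horovod.spark


def train():
    # Runs on every Horovod rank inside a Spark task.
    import horovod.torch as hvd
    hvd.init()
    return hvd.rank()


results = horovod.spark.run_elastic(train, num_proc=4,
                                    min_num_proc=2, max_num_proc=8,
                                    start_timeout=600)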
def run(fn, args=(), kwargs={}, num_proc=None, start_timeout=None,
        use_mpi=None, use_gloo=None, extra_mpi_args=None,
        env=None, stdout=None, stderr=None, verbose=1, nics=None,
        prefix_output_with_timestamp=False, executable=None):
    """
    Runs Horovod on Spark.  Runs `num_proc` processes executing `fn` using the same amount of Spark tasks.

    Args:
        fn: Function to run.
        args: Arguments to pass to `fn`.
        kwargs: Keyword arguments to pass to `fn`.
        num_proc: Number of Horovod processes.  Defaults to `spark.default.parallelism`.
        start_timeout: Timeout for Spark tasks to spawn, register and start running the code, in seconds.
                       If not set, falls back to `HOROVOD_SPARK_START_TIMEOUT` environment variable value.
                       If it is not set as well, defaults to 600 seconds.
        extra_mpi_args: Extra arguments for mpi_run. Defaults to no extra args.
        env: Environment dictionary to use in Horovod run.
        stdout: Horovod stdout is redirected to this stream. Defaults to sys.stdout when used with MPI.
        stderr: Horovod stderr is redirected to this stream. Defaults to sys.stderr when used with MPI.
        verbose: Debug output verbosity (0-2). Defaults to 1.
        nics: List of NICs for tcp network communication.
        prefix_output_with_timestamp: shows timestamp in stdout/stderr forwarding on the driver
        executable: Optional executable to run when launching the workers. Defaults to `sys.executable`.

    Returns:
        List of results returned by running `fn` on each rank.
    """
    if start_timeout is None:
        # Lookup default timeout from the environment variable.
        start_timeout = int(os.getenv('HOROVOD_SPARK_START_TIMEOUT', '600'))

    # nics needs to be a set
    if nics and not isinstance(nics, set):
        nics = set(nics)

    tmout = timeout.Timeout(start_timeout,
                            message='Timed out waiting for {activity}. Please check that you have '
                                    'enough resources to run all Horovod processes. Each Horovod '
                                    'process runs in a Spark task. You may need to increase the '
                                    'start_timeout parameter to a larger value if your Spark resources '
                                    'are allocated on-demand.')
    settings = hvd_settings.Settings(verbose=verbose,
                                     extra_mpi_args=extra_mpi_args,
                                     key=secret.make_secret_key(),
                                     start_timeout=tmout,
                                     nics=nics,
                                     run_func_mode=True,
                                     prefix_output_with_timestamp=prefix_output_with_timestamp)

    spark_context = pyspark.SparkContext._active_spark_context
    if spark_context is None:
        raise Exception('Could not find an active SparkContext, are you '
                        'running in a PySpark session?')

    if num_proc is None:
        num_proc = spark_context.defaultParallelism
        if settings.verbose >= 1:
            logging.info('Running %d processes (inferred from spark.default.parallelism)...', num_proc)
    else:
        if settings.verbose >= 1:
            logging.info('Running %d processes...', num_proc)
    settings.num_proc = num_proc

    result_queue = queue.Queue(1)

    # start Spark driver service and launch settings.num_proc Spark tasks
    spark_job_group = 'horovod.spark.run.%d' % job_id.next_job_id()
    driver = driver_service.SparkDriverService(settings.num_proc, settings.num_proc,
                                               fn, args, kwargs,
                                               settings.key, settings.nics)
    gloo_is_used = is_gloo_used(use_gloo=use_gloo, use_mpi=use_mpi, use_jsrun=False)
    spark_thread = _make_spark_thread(spark_context, spark_job_group, driver,
                                      result_queue, settings,
                                      use_gloo=gloo_is_used, is_elastic=False)
    try:
        # wait for all tasks to register, notify them and initiate task-to-task address registration
        _notify_and_register_task_addresses(driver, settings)

        # Determine the index grouping based on host hashes.
        # Barrel shift until index 0 is in the first host.
        host_hashes = list(driver.task_host_hash_indices().keys())
        host_hashes.sort()
        while 0 not in driver.task_host_hash_indices()[host_hashes[0]]:
            host_hashes = host_hashes[1:] + host_hashes[:1]

        settings.hosts = ','.join('%s:%d' % (host_hash, len(driver.task_host_hash_indices()[host_hash]))
                                  for host_hash in host_hashes)

        # Run the job
        _launch_job(use_mpi, use_gloo, settings, driver, env, stdout, stderr, executable)
    except:
        # Terminate Spark job.
        spark_context.cancelJobGroup(spark_job_group)

        # Re-raise exception.
        raise
    finally:
        spark_thread.join()
        driver.shutdown()

    # Make sure Spark Job did not fail.
    driver.check_for_spark_job_failure()

    # get ranks from driver
    indices_in_rank_order = _get_indices_in_rank_order(driver)

    # If there's no exception, execution results are in this queue.
    results = result_queue.get_nowait()
    return [results[index] for index in indices_in_rank_order]
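# And the non-elastic counterpart: a hedged usage sketch for `run` above
# (the training function is again a placeholder; the call returns one result
# per rank, ordered by rank):
import horovod.spark


def train():
    import horovod.tensorflow as hvd
    hvd.init()
    return hvd.rank()


results = horovod.spark.run(train, num_proc=4)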
def _run_command(self, command, env, event):
    super(SparkTaskService, self)._run_command(command, env, event)

    if self._minimum_command_lifetime_s is not None:
        self._minimum_command_lifetime = timeout.Timeout(self._minimum_command_lifetime_s,
                                                         message='Just measuring runtime')
def _handle(self, req, client_address):
    if isinstance(req, RegisterDispatcherRequest):
        self._wait_cond.acquire()
        try:
            if not 0 <= req.dispatcher_id <= self._max_dispatcher_id:
                return IndexError(f'Dispatcher id must be within [0..{self._max_dispatcher_id}]: '
                                  f'{req.dispatcher_id}')

            if self._dispatcher_addresses[req.dispatcher_id] is not None and \
               self._dispatcher_addresses[req.dispatcher_id] != req.dispatcher_address:
                return ValueError(f'Dispatcher with id {req.dispatcher_id} has already been registered under '
                                  f'different address {self._dispatcher_addresses[req.dispatcher_id]}: '
                                  f'{req.dispatcher_address}')

            self._dispatcher_addresses[req.dispatcher_id] = req.dispatcher_address
            self._wait_cond.notify_all()
        finally:
            self._wait_cond.release()
        return network.AckResponse()

    if isinstance(req, WaitForDispatcherRegistrationRequest):
        self._wait_cond.acquire()
        try:
            if not 0 <= req.dispatcher_id <= self._max_dispatcher_id:
                return IndexError(f'Dispatcher id must be within [0..{self._max_dispatcher_id}]: '
                                  f'{req.dispatcher_id}')

            tmout = timeout.Timeout(timeout=req.timeout,
                                    message='Timed out waiting for {activity}. Try to find out what takes '
                                            'the dispatcher so long to register or increase timeout.')

            while self._dispatcher_addresses[req.dispatcher_id] is None:
                self._wait_cond.wait(tmout.remaining())
                tmout.check_time_out_for(f'dispatcher {req.dispatcher_id} to register')
        except TimeoutException as e:
            return e
        finally:
            self._wait_cond.release()
        return WaitForDispatcherRegistrationResponse(self._dispatcher_addresses[req.dispatcher_id])

    if isinstance(req, RegisterDispatcherWorkerRequest):
        self._wait_cond.acquire()
        try:
            if not 0 <= req.dispatcher_id <= self._max_dispatcher_id:
                return IndexError(f'Dispatcher id must be within [0..{self._max_dispatcher_id}]: '
                                  f'{req.dispatcher_id}')

            self._dispatcher_worker_ids[req.dispatcher_id].update({req.worker_id})
            self._wait_cond.notify_all()
        finally:
            self._wait_cond.release()
        return network.AckResponse()

    if isinstance(req, WaitForDispatcherWorkerRegistrationRequest):
        # if there is only a single dispatcher, wait for that one instead of the requested one
        dispatcher_id = req.dispatcher_id if self._max_dispatcher_id > 0 else 0

        self._wait_cond.acquire()
        try:
            if not 0 <= req.dispatcher_id <= self._max_dispatcher_id:
                return IndexError(f'Dispatcher id must be within [0..{self._max_dispatcher_id}]: '
                                  f'{req.dispatcher_id}')

            tmout = timeout.Timeout(timeout=req.timeout,
                                    message='Timed out waiting for {activity}. Try to find out what takes '
                                            'the workers so long to register or increase timeout.')

            while len(self._dispatcher_worker_ids[dispatcher_id]) < self._workers_per_dispatcher:
                self._wait_cond.wait(tmout.remaining())
                tmout.check_time_out_for(f'workers for dispatcher {dispatcher_id} to register')
        except TimeoutException as e:
            return e
        finally:
            self._wait_cond.release()
        return network.AckResponse()

    if isinstance(req, ShutdownRequest):
        in_thread(self.shutdown)
        return network.AckResponse()

    if isinstance(req, WaitForShutdownRequest):
        self._wait_cond.acquire()
        try:
            while not self._shutdown:
                self._wait_cond.wait()
        finally:
            self._wait_cond.release()
        return network.AckResponse()

    return super()._handle(req, client_address)
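# The registration handlers above all share one pattern: wait on a condition
# variable with `tmout.remaining()` as the wait timeout, then let
# `check_time_out_for()` raise once the deadline passes, so both spurious
# wakeups and real notifications are handled correctly. Distilled into a
# standalone sketch (the `wait_for` helper is illustrative, not Horovod API):
import threading

from horovod.runner.common.util import timeout


def wait_for(predicate, cond: threading.Condition, timeout_s: float, activity: str):
    # Block until `predicate()` holds or `timeout_s` seconds have elapsed,
    # raising a timeout error that names the awaited activity.
    tmout = timeout.Timeout(timeout_s, message='Timed out waiting for {activity}.')
    with cond:
        while not predicate():
            cond.wait(tmout.remaining())
            tmout.check_time_out_for(activity)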