def run(): args = parse_args() # if hosts are not specified, either parse from hostfile, or default as # localhost if not args.hosts: if args.hostfile: args.hosts = parse_host_files(args.hostfile) else: # Set hosts to localhost if not specified args.hosts = 'localhost:{np}'.format(np=args.np) host_list = args.hosts.split(',') all_host_names = [] pattern = re.compile(r'^[\w.-]+:\d+$') for host in host_list: if not pattern.match(host.strip()): raise ValueError('Invalid host input, please make sure it has ' 'format as : worker-0:2,worker-1:2.') all_host_names.append(host.strip().split(':')[0]) # horovodrun has to finish all the checks before this timeout runs out. if args.start_timeout: start_timeout = args.start_timeout else: # Lookup default timeout from the environment variable. start_timeout = int(os.getenv('HOROVOD_START_TIMEOUT', '30')) tmout = timeout.Timeout(start_timeout, message='Timed out waiting for {activity}. Please ' 'check connectivity between servers. You ' 'may need to increase the --start-timeout ' 'parameter if you have too many servers.') settings = hvd_settings.Settings(verbose=2 if args.verbose else 0, ssh_port=args.ssh_port, key=secret.make_secret_key(), timeout=tmout, num_hosts=len(all_host_names), num_proc=args.np, hosts=args.hosts, command=args.command) # This cache stores the results of checks performed by horovodrun # during the initialization step. It can be disabled by setting # --disable-cache flag. fn_cache = None if not args.disable_cache: params = '' if args.np: params += str(args.np) + ' ' if args.hosts: params += str(args.hosts) + ' ' if args.ssh_port: params += str(args.ssh_port) parameters_hash = hashlib.md5(params.encode('utf-8')).hexdigest() fn_cache = cache.Cache(CACHE_FOLDER, CACHE_STALENESS_THRESHOLD_MINUTES, parameters_hash) if settings.verbose >= 2: print('Filtering local host names.') remote_host_names = network.filter_local_addresses(all_host_names) if settings.verbose >= 2: print('Remote host found: ' + ' '.join(remote_host_names)) if len(remote_host_names) > 0: if settings.verbose >= 2: print('Checking ssh on all remote hosts.') # Check if we can ssh into all remote hosts successfully. _check_all_hosts_ssh_successful(remote_host_names, args.ssh_port, fn_cache=fn_cache) if settings.verbose >= 2: print('SSH was successful into all the remote hosts.') if len(remote_host_names) > 0: if settings.verbose >= 2: print('Testing interfaces on all the hosts.') local_host_names = set(all_host_names) - set(remote_host_names) # Find the set of common, routed interfaces on all the hosts (remote # and local) and specify it in the args to be used by NCCL. It is # expected that the following function will find at least one interface # otherwise, it will raise an exception. common_intfs = _driver_fn(all_host_names, local_host_names, settings, fn_cache=fn_cache) if settings.verbose >= 2: print('Interfaces on all the hosts were successfully checked.') print('Common interface found: ' + ' '.join(common_intfs)) else: if settings.verbose >= 2: print('All hosts are local, finding the interfaces ' 'with address 127.0.0.1') # If all the given hosts are local, find the interfaces with address # 127.0.0.1 common_intfs = set() for iface, addrs in net_if_addrs().items(): for addr in addrs: if addr.family == AF_INET and addr.address == '127.0.0.1': common_intfs.add(iface) break if len(common_intfs) == 0: raise ValueError('No interface is found for address 127.0.0.1.') if settings.verbose >= 2: print('Local interface found ' + ' '.join(common_intfs)) if args.use_gloo: if not gloo_built(): raise ValueError( 'Gloo support has not been built. If this is not expected, ensure CMake is installed ' 'and reinstall Horovod with HOROVOD_WITH_GLOO=1 to debug the build error.' ) gloo_run(settings, remote_host_names, common_intfs) elif args.use_mpi: if not mpi_built(): raise ValueError( 'MPI support has not been built. If this is not expected, ensure MPI is installed ' 'and reinstall Horovod with HOROVOD_WITH_MPI=1 to debug the build error.' ) mpi_run(settings, common_intfs) else: if mpi_built(): mpi_run(settings, common_intfs) elif gloo_built(): gloo_run(settings, remote_host_names, common_intfs) else: raise ValueError( 'Neither MPI nor Gloo support has been built. Try reinstalling Horovod ensuring that ' 'either MPI is installed (MPI) or CMake is installed (Gloo).')
def run(): args = parse_args() if args.version: print(horovod.__version__) exit(0) if args.host: all_host_names = [ x for x in [y.split(':')[0] for y in args.host.split(',')] ] elif args.hostfile: all_host_names = [ x for x in [line.split()[0] for line in open(args.hostfile)] ] else: all_host_names = [] # horovodrun has to finish all the checks before this timeout runs out. if args.start_timeout: start_timeout = args.start_timeout else: # Lookup default timeout from the environment variable. start_timeout = int(os.getenv('HOROVOD_START_TIMEOUT', '30')) tmout = timeout.Timeout(start_timeout, message='Timed out waiting for {activity}. Please ' 'check connectivity between servers. You ' 'may need to increase the --start-timeout ' 'parameter if you have too many servers.') settings = hvd_settings.Settings(verbose=2 if args.verbose else 0, ssh_port=args.ssh_port, key=secret.make_secret_key(), timeout=tmout, num_hosts=len(all_host_names), num_proc=args.np) # This cache stores the results of checks performed by horovodrun # during the initialization step. It can be disabled by setting # --disable-cache flag. fn_cache = None if not args.disable_cache: params = '' if args.np: params += str(args.np) + ' ' if args.host: params += str(args.host) + ' ' if args.ssh_port: params += str(args.ssh_port) parameters_hash = hashlib.md5(params.encode('utf-8')).hexdigest() fn_cache = cache.Cache(CACHE_FOLDER, CACHE_STALENESS_THRESHOLD_MINUTES, parameters_hash) remote_host_names = [] if args.host or args.hostfile: if settings.verbose >= 2: print("Filtering local host names.") remote_host_names = network.filter_local_addresses(all_host_names) if len(remote_host_names) > 0: if settings.verbose >= 2: print("Checking ssh on all remote hosts.") # Check if we can ssh into all remote hosts successfully. _check_all_hosts_ssh_successful(remote_host_names, args.ssh_port, fn_cache=fn_cache) if settings.verbose >= 2: print("SSH was successful into all the remote hosts.") if args.host: hosts_arg = "-H {hosts}".format(hosts=args.host) else: hosts_arg = "-hostfile {hostfile}".format(hostfile=args.hostfile) else: # if none of --host of --hostfile is specified, localhost will be # used by default # There is no need to specify localhost. hosts_arg = "" if len(remote_host_names) > 0: if settings.verbose >= 2: print("Testing interfaces on all the hosts.") local_host_names = set(all_host_names) - set(remote_host_names) # Find the set of common, routed interfaces on all the hosts (remote # and local) and specify it in the args to be used by NCCL. It is # expected that the following function will find at least one interface # otherwise, it will raise an exception. common_intfs = _driver_fn(all_host_names, local_host_names, settings, fn_cache=fn_cache) tcp_intf_arg = "-mca btl_tcp_if_include {common_intfs}".format( common_intfs=','.join(common_intfs)) nccl_socket_intf_arg = "-x NCCL_SOCKET_IFNAME={common_intfs}".format( common_intfs=','.join(common_intfs)) if settings.verbose >= 2: print("Interfaces on all the hosts were successfully checked.") else: # If all the given hosts are local, no need to specify the interfaces # because MPI does not use network for local execution. tcp_intf_arg = "" nccl_socket_intf_arg = "" # Pass all the env variables to the mpirun command. env = os.environ.copy() # Pass secret key through the environment variables. env[secret.HOROVOD_SECRET_KEY] = codec.dumps_base64(settings.key) if not _is_open_mpi_installed(): raise Exception( 'horovodrun convenience script currently only supports ' 'Open MPI.\n\n' 'Choose one of:\n' '1. Install Open MPI 4.0.0+ and re-install Horovod ' '(use --no-cache-dir pip option).\n' '2. Run distributed ' 'training script using the standard way provided by your' ' MPI distribution (usually mpirun, srun, or jsrun).') if args.ssh_port: ssh_port_arg = "-mca plm_rsh_args \"-p {ssh_port}\"".format( ssh_port=args.ssh_port) else: ssh_port_arg = "" mpirun_command = ( 'mpirun --allow-run-as-root --tag-output ' '-np {num_proc} {hosts_arg} ' '-bind-to none -map-by slot ' '-mca pml ob1 -mca btl ^openib ' '{ssh_port_arg} ' '{tcp_intf_arg} ' '-x NCCL_DEBUG=INFO ' '{nccl_socket_intf_arg} ' '{env} {command}' # expect a lot of environment variables .format(num_proc=settings.num_proc, hosts_arg=hosts_arg, tcp_intf_arg=tcp_intf_arg, nccl_socket_intf_arg=nccl_socket_intf_arg, ssh_port_arg=ssh_port_arg, env=' '.join('-x %s' % key for key in env.keys() if env_util.is_exportable(key)), command=' '.join(quote(par) for par in args.command))) if settings.verbose >= 2: print(mpirun_command) # Execute the mpirun command. os.execve('/bin/sh', ['/bin/sh', '-c', mpirun_command], env)
def _run(args): if args.check_build: check_build(args.verbose) # if hosts are not specified, either parse from hostfile, or default as # localhost if not args.hosts: if args.hostfile: args.hosts = parse_host_files(args.hostfile) else: # Set hosts to localhost if not specified args.hosts = 'localhost:{np}'.format(np=args.np) host_list = args.hosts.split(',') all_host_names = [] pattern = re.compile(r'^[\w.-]+:\d+$') for host in host_list: if not pattern.match(host.strip()): raise ValueError('Invalid host input, please make sure it has ' 'format as : worker-0:2,worker-1:2.') all_host_names.append(host.strip().split(':')[0]) # horovodrun has to finish all the checks before this timeout runs out. if args.start_timeout: start_timeout = args.start_timeout else: # Lookup default timeout from the environment variable. start_timeout = int(os.getenv('HOROVOD_START_TIMEOUT', '30')) tmout = timeout.Timeout(start_timeout, message='Timed out waiting for {activity}. Please ' 'check connectivity between servers. You ' 'may need to increase the --start-timeout ' 'parameter if you have too many servers.') settings = hvd_settings.Settings(verbose=2 if args.verbose else 0, ssh_port=args.ssh_port, extra_mpi_args=args.mpi_args, tcp_flag=args.tcp_flag, key=secret.make_secret_key(), timeout=tmout, num_hosts=len(all_host_names), num_proc=args.np, hosts=args.hosts, output_filename=args.output_filename, run_func_mode=args.run_func is not None, nic=args.nic) # This cache stores the results of checks performed by horovodrun # during the initialization step. It can be disabled by setting # --disable-cache flag. fn_cache = None if not args.disable_cache: params = '' if args.np: params += str(args.np) + ' ' if args.hosts: params += str(args.hosts) + ' ' if args.ssh_port: params += str(args.ssh_port) parameters_hash = hashlib.md5(params.encode('utf-8')).hexdigest() fn_cache = cache.Cache(CACHE_FOLDER, CACHE_STALENESS_THRESHOLD_MINUTES, parameters_hash) if settings.verbose >= 2: print('Filtering local host names.') remote_host_names = network.filter_local_addresses(all_host_names) if settings.verbose >= 2: print('Remote host found: ' + ' '.join(remote_host_names)) if len(remote_host_names) > 0: if settings.verbose >= 2: print('Checking ssh on all remote hosts.') # Check if we can ssh into all remote hosts successfully. _check_all_hosts_ssh_successful(remote_host_names, args.ssh_port, fn_cache=fn_cache) if settings.verbose >= 2: print('SSH was successful into all the remote hosts.') if len(remote_host_names) > 0: if settings.verbose >= 2: print('Testing interfaces on all the hosts.') local_host_names = set(all_host_names) - set(remote_host_names) # Find the set of common, routed interfaces on all the hosts (remote # and local) and specify it in the args to be used by NCCL. It is # expected that the following function will find at least one interface # otherwise, it will raise an exception. common_intfs = _driver_fn(all_host_names, local_host_names, settings, fn_cache=fn_cache) if settings.verbose >= 2: print('Interfaces on all the hosts were successfully checked.') print('Common interface found: ' + ' '.join(common_intfs)) else: if settings.verbose >= 2: print('All hosts are local, finding the interfaces ' 'with address 127.0.0.1') # If all the given hosts are local, find the interfaces with address # 127.0.0.1 common_intfs = set() for iface, addrs in net_if_addrs().items(): if settings.nic and iface != settings.nic: continue for addr in addrs: if addr.family == AF_INET and addr.address == '127.0.0.1': common_intfs.add(iface) break if len(common_intfs) == 0: raise ValueError('No interface is found for address 127.0.0.1.') if settings.verbose >= 2: print('Local interface found ' + ' '.join(common_intfs)) # get the driver IPv4 address driver_ip = _get_driver_ip(common_intfs) if args.run_func: run_func_server = KVStoreServer(verbose=settings.verbose) run_func_server_port = run_func_server.start_server() pickled_exec_func = cloudpickle.dumps(args.run_func) put_data_into_kvstore(driver_ip, run_func_server_port, 'runfunc', 'func', pickled_exec_func) command = [ sys.executable, '-m', 'horovod.run.run_task', str(driver_ip), str(run_func_server_port) ] try: _launch_job(args, remote_host_names, settings, common_intfs, command) results = [None] * args.np # TODO: make it parallel to improve performance for i in range(args.np): pickled_result = read_data_from_kvstore( driver_ip, run_func_server_port, 'runfunc_result', str(i)) results[i] = cloudpickle.loads(pickled_result) return results finally: run_func_server.shutdown_server() else: command = args.command _launch_job(args, remote_host_names, settings, common_intfs, command) return None
def _run(args): if args.check_build: check_build(args.verbose) # If LSF is used, use default values from job config if lsf.LSFUtils.using_lsf(): if not args.np: args.np = lsf.LSFUtils.get_num_processes() if not args.hosts and not args.hostfile: args.hosts = ','.join( '{host}:{np}'.format(host=host, np=lsf.LSFUtils.get_num_gpus()) for host in lsf.LSFUtils.get_compute_hosts()) # if hosts are not specified, either parse from hostfile, or default as # localhost if not args.hosts: if args.hostfile: args.hosts = parse_host_files(args.hostfile) else: # Set hosts to localhost if not specified args.hosts = 'localhost:{np}'.format(np=args.np) host_list = args.hosts.split(',') all_host_names = [] pattern = re.compile(r'^[\w.-]+:\d+$') for host in host_list: if not pattern.match(host.strip()): raise ValueError('Invalid host input, please make sure it has ' 'format as : worker-0:2,worker-1:2.') all_host_names.append(host.strip().split(':')[0]) # horovodrun has to finish all the checks before this timeout runs out. if args.start_timeout: start_timeout = args.start_timeout else: # Lookup default timeout from the environment variable. start_timeout = int(os.getenv('HOROVOD_START_TIMEOUT', '30')) tmout = timeout.Timeout(start_timeout, message='Timed out waiting for {activity}. Please ' 'check connectivity between servers. You ' 'may need to increase the --start-timeout ' 'parameter if you have too many servers.') settings = hvd_settings.Settings(verbose=2 if args.verbose else 0, ssh_port=args.ssh_port, extra_mpi_args=args.mpi_args, tcp_flag=args.tcp_flag, binding_args=args.binding_args, key=secret.make_secret_key(), timeout=tmout, num_hosts=len(all_host_names), num_proc=args.np, hosts=args.hosts, output_filename=args.output_filename, run_func_mode=args.run_func is not None, nic=args.nic) # This cache stores the results of checks performed by horovodrun # during the initialization step. It can be disabled by setting # --disable-cache flag. fn_cache = None if not args.disable_cache: params = '' if args.np: params += str(args.np) + ' ' if args.hosts: params += str(args.hosts) + ' ' if args.ssh_port: params += str(args.ssh_port) parameters_hash = hashlib.md5(params.encode('utf-8')).hexdigest() fn_cache = cache.Cache(CACHE_FOLDER, CACHE_STALENESS_THRESHOLD_MINUTES, parameters_hash) if settings.verbose >= 2: print('Filtering local host names.') remote_host_names = network.filter_local_addresses(all_host_names) if settings.verbose >= 2: print('Remote host found: ' + ' '.join(remote_host_names)) if len(remote_host_names) > 0: if settings.verbose >= 2: print('Checking ssh on all remote hosts.') # Check if we can ssh into all remote hosts successfully. _check_all_hosts_ssh_successful(remote_host_names, args.ssh_port, fn_cache=fn_cache) if settings.verbose >= 2: print('SSH was successful into all the remote hosts.') common_intfs = driver_service._get_common_interfaces( settings, all_host_names, remote_host_names, fn_cache) if args.run_func: # get the driver IPv4 address driver_ip = network._get_driver_ip(common_intfs) run_func_server = KVStoreServer(verbose=settings.verbose) run_func_server_port = run_func_server.start_server() pickled_exec_func = cloudpickle.dumps(args.run_func) put_data_into_kvstore(driver_ip, run_func_server_port, 'runfunc', 'func', pickled_exec_func) command = [ sys.executable, '-m', 'horovod.run.run_task', str(driver_ip), str(run_func_server_port) ] try: _launch_job(args, remote_host_names, settings, common_intfs, command) results = [None] * args.np # TODO: make it parallel to improve performance for i in range(args.np): pickled_result = read_data_from_kvstore( driver_ip, run_func_server_port, 'runfunc_result', str(i)) results[i] = cloudpickle.loads(pickled_result) return results finally: run_func_server.shutdown_server() else: command = args.command _launch_job(args, remote_host_names, settings, common_intfs, command) return None
def get_common_interfaces(settings, all_host_names, remote_host_names=None, fn_cache=None): ''' Find the set of common and routed interfaces on all the hosts. :param settings: the object that contains the setting for running horovod :type settings: Horovod.run.common.util.settings.Settings :param all_host_names: list of the host names :type all_host_names: list(string) :param remote_host_names: list of the remote host names. :type remote_host_names: list(string) :param fn_cache: Cache storing the results of checks performed by horovod :type fn_cache: Horovod.run.util.cache.Cache :return: List of common interfaces ''' # Skipping interface discovery for LSF cluster as it slows down considerably the job start if lsf.LSFUtils.using_lsf(): return None if remote_host_names is None: remote_host_names = network.filter_local_addresses(all_host_names) if len(remote_host_names) > 0: if settings.nics: # If args.nics is provided, we will use those interfaces. All the workers # must have at least one of those interfaces available. nics = settings.nics else: # Find the set of common, routed interfaces on all the hosts (remote # and local) and specify it in the args to be used by NCCL. It is # expected that the following function will find at least one interface # otherwise, it will raise an exception. if settings.verbose >= 2: print('Testing interfaces on all the hosts.') local_host_names = set(all_host_names) - set(remote_host_names) nics = _driver_fn(all_host_names, local_host_names, settings, fn_cache=fn_cache) if settings.verbose >= 2: print('Interfaces on all the hosts were successfully checked.') print('Common interface found: ' + ' '.join(nics)) else: if settings.verbose >= 2: print('All hosts are local, finding the interfaces ' 'with address 127.0.0.1') # If all the given hosts are local, find the interfaces with address # 127.0.0.1 nics = set() for iface, addrs in net_if_addrs().items(): if settings.nics and iface not in settings.nics: continue for addr in addrs: if addr.family == AF_INET and addr.address == '127.0.0.1': nics.add(iface) break if len(nics) == 0: raise ValueError('No interface is found for address 127.0.0.1.') if settings.verbose >= 2: print('Local interface found ' + ' '.join(nics)) return nics
def _run_static(args): nics_set = set(args.nics.split(',')) if args.nics else None # horovodrun has to finish all the checks before this timeout runs out. if args.start_timeout: start_timeout = args.start_timeout else: # Lookup default timeout from the environment variable. start_timeout = int(os.getenv('HOROVOD_START_TIMEOUT', '30')) tmout = timeout.Timeout(start_timeout, message='Timed out waiting for {activity}. Please ' 'check connectivity between servers. You ' 'may need to increase the --start-timeout ' 'parameter if you have too many servers.') settings = hvd_settings.Settings(verbose=2 if args.verbose else 0, ssh_port=args.ssh_port, extra_mpi_args=args.mpi_args, tcp_flag=args.tcp_flag, binding_args=args.binding_args, key=secret.make_secret_key(), start_timeout=tmout, num_proc=args.np, hosts=args.hosts, output_filename=args.output_filename, run_func_mode=args.run_func is not None, nics=nics_set) # This cache stores the results of checks performed by horovod # during the initialization step. It can be disabled by setting # --disable-cache flag. fn_cache = None if not args.disable_cache: params = '' if args.np: params += str(args.np) + ' ' if args.hosts: params += str(args.hosts) + ' ' if args.ssh_port: params += str(args.ssh_port) parameters_hash = hashlib.md5(params.encode('utf-8')).hexdigest() fn_cache = cache.Cache(CACHE_FOLDER, CACHE_STALENESS_THRESHOLD_MINUTES, parameters_hash) all_host_names, _ = hosts.parse_hosts_and_slots(args.hosts) if settings.verbose >= 2: print('Filtering local host names.') remote_host_names = network.filter_local_addresses(all_host_names) if settings.verbose >= 2: print('Remote host found: ' + ' '.join(remote_host_names)) if len(remote_host_names) > 0: if settings.verbose >= 2: print('Checking ssh on all remote hosts.') # Check if we can ssh into all remote hosts successfully. if not _check_all_hosts_ssh_successful( remote_host_names, args.ssh_port, fn_cache=fn_cache): raise RuntimeError('could not connect to some hosts via ssh') if settings.verbose >= 2: print('SSH was successful into all the remote hosts.') nics = driver_service.get_common_interfaces(settings, all_host_names, remote_host_names, fn_cache) if args.run_func: # get the driver IPv4 address driver_ip = network.get_driver_ip(nics) run_func_server = KVStoreServer(verbose=settings.verbose) run_func_server_port = run_func_server.start_server() put_data_into_kvstore(driver_ip, run_func_server_port, 'runfunc', 'func', args.run_func) command = [ sys.executable, '-m', 'horovod.run.run_task', str(driver_ip), str(run_func_server_port) ] try: _launch_job(args, settings, nics, command) results = [None] * args.np # TODO: make it parallel to improve performance for i in range(args.np): results[i] = read_data_from_kvstore(driver_ip, run_func_server_port, 'runfunc_result', str(i)) return results finally: run_func_server.shutdown_server() else: command = args.command _launch_job(args, settings, nics, command) return None