def _launch_job(args, remote_host_names, settings, nics, command):
    """Start `command` with the controller selected by `args`.

    The launcher is chosen from the explicit flags `args.use_gloo`,
    `args.use_mpi` and `args.use_jsrun`, checked in that order. When none of
    them is set, MPI is preferred if it has been built (through jsrun when an
    LSF job is detected and jsrun is installed), otherwise Gloo is used.

    Raises:
        ValueError: if the requested controller has not been built, if jsrun
            is requested outside of an LSF job, or if neither MPI nor Gloo
            is available for automatic selection.
    """
    job_env = os.environ.copy()
    config_parser.set_env_from_args(job_env, args)

    mpi_missing_msg = (
        'MPI support has not been built. If this is not expected, ensure MPI is installed '
        'and reinstall Horovod with HOROVOD_WITH_MPI=1 to debug the build error.'
    )

    def gloo_available():
        return gloo_built(verbose=(settings.verbose >= 2))

    def mpi_available():
        return mpi_built(verbose=(settings.verbose >= 2))

    def launch_with_gloo():
        # The driver IP is only looked up on the Gloo path.
        gloo_run(settings, remote_host_names, nics, job_env,
                 network._get_driver_ip(nics), command)

    def launch_with_mpi():
        mpi_run(settings, nics, job_env, command)

    def launch_with_jsrun():
        js_run(settings, nics, job_env, command)

    # Explicitly requested Gloo.
    if args.use_gloo:
        if not gloo_available():
            raise ValueError(
                'Gloo support has not been built. If this is not expected, ensure CMake is installed '
                'and reinstall Horovod with HOROVOD_WITH_GLOO=1 to debug the build error.'
            )
        launch_with_gloo()
        return

    # Explicitly requested MPI.
    if args.use_mpi:
        if not mpi_available():
            raise ValueError(mpi_missing_msg)
        launch_with_mpi()
        return

    # Explicitly requested jsrun: needs both an MPI build and an LSF job.
    if args.use_jsrun:
        if not mpi_available():
            raise ValueError(mpi_missing_msg)
        if not lsf.LSFUtils.using_lsf():
            raise ValueError(
                'Horovod did not detect an LSF job. The jsrun launcher can only be used in that environment. '
                'Please, pick a different launcher for other environments.')
        launch_with_jsrun()
        return

    # No explicit choice: prefer MPI (via jsrun under LSF), then Gloo.
    if mpi_available():
        prefer_jsrun = lsf.LSFUtils.using_lsf() and is_jsrun_installed()
        launcher = launch_with_jsrun if prefer_jsrun else launch_with_mpi
        launcher()
        return

    if gloo_available():
        launch_with_gloo()
        return

    raise ValueError(
        'Neither MPI nor Gloo support has been built. Try reinstalling Horovod ensuring that '
        'either MPI is installed (MPI) or CMake is installed (Gloo).')
def _launch_job(args, remote_host_names, settings, nics, command):
    """Prepare the job environment and dispatch `command` via `run_controller`.

    A copy of the current process environment is populated from `args` and
    captured by three zero-argument launch callables (Gloo, MPI, jsrun).
    These are passed to `run_controller` together with the matching
    `args.use_*` flags and `args.verbose`.
    NOTE(review): `run_controller` is defined elsewhere; presumably it
    invokes exactly one of the callables - confirm against its definition.

    Args:
        args: parsed launcher arguments; `use_gloo`, `use_mpi`, `use_jsrun`
            and `verbose` are read here.
        remote_host_names: remote hosts, forwarded to `gloo_run` only.
        settings: run settings forwarded to every launcher.
        nics: network interfaces forwarded to every launcher.
        command: the command to launch.
    """
    env = os.environ.copy()
    config_parser.set_env_from_args(env, args)

    def gloo_run_fn():
        # Resolve the driver IP lazily: only the Gloo launcher consumes it,
        # so the lookup (and any error it raises) must not affect MPI or
        # jsrun launches.
        driver_ip = network._get_driver_ip(nics)
        gloo_run(settings, remote_host_names, nics, env, driver_ip, command)

    def mpi_run_fn():
        mpi_run(settings, nics, env, command)

    def js_run_fn():
        js_run(settings, nics, env, command)

    run_controller(args.use_gloo, gloo_run_fn,
                   args.use_mpi, mpi_run_fn,
                   args.use_jsrun, js_run_fn,
                   args.verbose)
def gloo_run_fn():
    """Launch the job through Gloo, looking up the driver IP at call time.

    `settings`, `remote_host_names`, `nics`, `env` and `command` are free
    variables taken from the enclosing scope, which is not visible here.
    """
    gloo_run(settings, remote_host_names, nics, env,
             network._get_driver_ip(nics), command)
def _run(args):
    """Resolve hosts and settings from `args`, run pre-flight checks and launch.

    Steps, in order:
      1. Optionally run the build check.
      2. Fill in `args.np` / `args.hosts` from the LSF job, a host file, or a
         localhost default (note: `args` is mutated in place).
      3. Build the start timeout, the run settings and the optional check cache.
      4. Verify ssh access to remote hosts and determine the common interfaces.
      5. Launch either `args.command` or, when `args.run_func` is set, a task
         runner that fetches the pickled function from a key-value store.

    Returns:
        When `args.run_func` is set, a list with one unpickled result per
        process index in `range(args.np)`; otherwise None.
    """
    if args.check_build:
        check_build(args.verbose)

    # If LSF is used, use default values from job config
    if lsf.LSFUtils.using_lsf():
        if not args.np:
            args.np = lsf.LSFUtils.get_num_processes()
        if not args.hosts and not args.hostfile:
            args.hosts = ','.join(
                '{host}:{np}'.format(host=host, np=lsf.LSFUtils.get_num_gpus())
                for host in lsf.LSFUtils.get_compute_hosts())

    # If hosts are not specified, either parse them from the host file or
    # default to localhost.
    if not args.hosts:
        if args.hostfile:
            args.hosts = parse_host_files(args.hostfile)
        else:
            # Set hosts to localhost if not specified
            args.hosts = 'localhost:{np}'.format(np=args.np)

    all_host_names = parse_host_names(args.hosts)

    nics_set = set(args.nics.split(',')) if args.nics else None

    # horovodrun has to finish all the checks before this timeout runs out.
    if args.start_timeout:
        start_timeout = args.start_timeout
    else:
        # Lookup default timeout from the environment variable.
        start_timeout = int(os.getenv('HOROVOD_START_TIMEOUT', '30'))

    tmout = timeout.Timeout(start_timeout,
                            message='Timed out waiting for {activity}. Please '
                                    'check connectivity between servers. You '
                                    'may need to increase the --start-timeout '
                                    'parameter if you have too many servers.')
    settings = hvd_settings.Settings(verbose=2 if args.verbose else 0,
                                     ssh_port=args.ssh_port,
                                     extra_mpi_args=args.mpi_args,
                                     tcp_flag=args.tcp_flag,
                                     binding_args=args.binding_args,
                                     key=secret.make_secret_key(),
                                     timeout=tmout,
                                     num_hosts=len(all_host_names),
                                     num_proc=args.np,
                                     hosts=args.hosts,
                                     output_filename=args.output_filename,
                                     run_func_mode=args.run_func is not None,
                                     nics=nics_set)

    # This cache stores the results of checks performed by horovod
    # during the initialization step. It can be disabled by setting
    # --disable-cache flag.
    fn_cache = None
    if not args.disable_cache:
        # The exact layout of this string (including trailing spaces) feeds
        # the hash below, so it is kept unchanged to preserve existing keys.
        params = ''
        if args.np:
            params += str(args.np) + ' '
        if args.hosts:
            params += str(args.hosts) + ' '
        if args.ssh_port:
            params += str(args.ssh_port)
        # The digest is only handed to the cache as its parameters hash.
        parameters_hash = hashlib.md5(params.encode('utf-8')).hexdigest()
        fn_cache = cache.Cache(CACHE_FOLDER, CACHE_STALENESS_THRESHOLD_MINUTES,
                               parameters_hash)

    if settings.verbose >= 2:
        print('Filtering local host names.')
    remote_host_names = network.filter_local_addresses(all_host_names)
    if settings.verbose >= 2:
        print('Remote host found: ' + ' '.join(remote_host_names))

    if len(remote_host_names) > 0:
        if settings.verbose >= 2:
            print('Checking ssh on all remote hosts.')
        # Check if we can ssh into all remote hosts successfully.
        _check_all_hosts_ssh_successful(remote_host_names, args.ssh_port,
                                        fn_cache=fn_cache)
        if settings.verbose >= 2:
            print('SSH was successful into all the remote hosts.')

    nics = driver_service.get_common_interfaces(settings, all_host_names,
                                                remote_host_names, fn_cache)

    if not args.run_func:
        _launch_job(args, remote_host_names, settings, nics, args.command)
        return None

    # get the driver IPv4 address
    driver_ip = network._get_driver_ip(nics)
    run_func_server = KVStoreServer(verbose=settings.verbose)
    run_func_server_port = run_func_server.start_server()
    # Everything after the server has started runs under try/finally so the
    # server is shut down even if pickling or publishing the function fails.
    try:
        pickled_exec_func = cloudpickle.dumps(args.run_func)
        put_data_into_kvstore(driver_ip, run_func_server_port,
                              'runfunc', 'func', pickled_exec_func)

        command = [sys.executable, '-m', 'horovod.run.run_task',
                   str(driver_ip), str(run_func_server_port)]
        _launch_job(args, remote_host_names, settings, nics, command)

        # NOTE(review): results are unpickled from data read back from the
        # key-value store; this assumes only trusted workers can write to
        # it - confirm.
        # TODO: make it parallel to improve performance
        return [
            cloudpickle.loads(read_data_from_kvstore(
                driver_ip, run_func_server_port, 'runfunc_result', str(i)))
            for i in range(args.np)
        ]
    finally:
        run_func_server.shutdown_server()