def _run_elastic(args):
    """Build elastic settings from parsed CLI ``args`` and launch via Gloo.

    Host discovery comes from ``args.host_discovery_script`` when set,
    otherwise from the fixed host list in ``args.hosts``.

    Returns:
        Whatever ``gloo_run_elastic`` returns.

    Raises:
        ValueError: if neither a discovery script nor hosts are given, if
            ``args.hosts`` yields fewer than 2 hosts, or if ``gloo_built``
            reports that Gloo support is unavailable.
    """
    # Fail fast when no source of hosts was supplied at all.
    if not args.host_discovery_script and not args.hosts:
        raise ValueError(
            'One of --host-discovery-script, --hosts, or --hostnames must be provided'
        )

    if args.host_discovery_script:
        host_discovery = discovery.HostDiscoveryScript(
            args.host_discovery_script, args.slots)
    else:
        _, host_slots = hosts.parse_hosts_and_slots(args.hosts)
        if len(host_slots) < 2:
            raise ValueError(
                'Cannot run in fault tolerance mode with fewer than 2 hosts.')
        host_discovery = discovery.FixedHosts(host_slots)

    # horovodrun has to finish all the checks before this timeout runs out.
    # Without an explicit --start-timeout, use HOROVOD_START_TIMEOUT
    # (30 when that variable is unset as well).
    timeout_secs = args.start_timeout or int(
        os.getenv('HOROVOD_START_TIMEOUT', '30'))
    start_deadline = timeout.Timeout(
        timeout_secs,
        message='Timed out waiting for {activity}. Please '
                'check connectivity between servers. You '
                'may need to increase the --start-timeout '
                'parameter if you have too many servers.')

    settings = elastic_settings.ElasticSettings(
        discovery=host_discovery,
        min_num_proc=args.min_num_proc or args.num_proc,
        max_num_proc=args.max_num_proc,
        elastic_timeout=args.elastic_timeout,
        reset_limit=args.reset_limit,
        cooldown_range=args.cooldown_range,
        num_proc=args.num_proc,
        verbose=2 if args.verbose else 0,
        ssh_port=args.ssh_port,
        ssh_identity_file=args.ssh_identity_file,
        extra_mpi_args=args.mpi_args,
        key=secret.make_secret_key(),
        start_timeout=start_deadline,
        output_filename=args.output_filename,
        run_func_mode=args.run_func is not None,
        nics=args.nics,
        prefix_output_with_timestamp=args.prefix_output_with_timestamp)

    gloo_available = gloo_built(verbose=(settings.verbose >= 2))
    if not gloo_available:
        raise ValueError(
            'Gloo support is required to use elastic training, but has not been built. Ensure CMake is '
            'installed and reinstall Horovod with HOROVOD_WITH_GLOO=1 to debug the build error.'
        )

    # Workers get a copy of the current environment, updated from the CLI args.
    env = os.environ.copy()
    config_parser.set_env_from_args(env, args)

    executable = args.executable or sys.executable
    # A provided run function takes precedence over the command line.
    target = args.run_func or args.command
    return gloo_run_elastic(settings, env, target, executable)
def elastic_driver_fn(discovery_script="/Users/zuston/iqiyiDev/horovod-opal/dis.sh",
                      slots=3, min_np=2, max_np=4, num_proc=None):
    """Start a rendezvous server and an elastic driver, then wait for hosts.

    Creates a ``RendezvousServer`` and an ``ElasticDriver`` backed by a
    host-discovery script, starts the server, waits until ``num_proc`` slots
    are available, updates the driver's host assignments and resets its
    worker registry to the driver's world size. Progress is printed to
    stdout.

    All parameters are optional; calling with no arguments reproduces the
    previously hard-coded configuration.

    Args:
        discovery_script: Path of the host discovery script handed to
            ``discovery.HostDiscoveryScript``. NOTE(review): the default is
            a developer-local absolute path kept only for backward
            compatibility; callers should pass their own script.
        slots: Second argument passed to ``discovery.HostDiscoveryScript``.
        min_np: ``min_np`` passed to ``ElasticDriver``.
        max_np: ``max_np`` passed to ``ElasticDriver``.
        num_proc: Number of slots to wait for before assigning hosts.
            Defaults to ``min_np``.
    """
    if num_proc is None:
        num_proc = min_np

    global_rendezv = RendezvousServer(verbose=1)
    discover_hosts = discovery.HostDiscoveryScript(discovery_script, slots)
    driver = ElasticDriver(global_rendezv, discover_hosts,
                           min_np=min_np, max_np=max_np)

    handler = create_rendezvous_handler(driver)
    global_rendezv_port = global_rendezv.start(handler)
    print('port: ' + str(global_rendezv_port))

    print('wait for available slots: {}'.format(num_proc))
    current_hosts = driver.wait_for_available_slots(num_proc)
    print("current hosts:" + str(current_hosts))

    # NOTE(review): the two calls below use underscore-prefixed members of
    # ElasticDriver; confirm there is no public equivalent before relying
    # on this outside of experiments.
    pending_slots = driver._update_host_assignments(current_hosts)
    print("pending hosts:" + str(pending_slots))

    driver._worker_registry.reset(driver.world_size())