def start(self):
    """Bring up the rendezvous server and the elastic driver.

    Creates the rendezvous server and the elastic driver (stored on
    ``self.rendezvous`` and ``self.driver``), starts the rendezvous with a
    handler bound to the driver, waits for slots, resolves the common
    network interfaces and the driver IP, and finally stores the resulting
    environment variables on ``self.run_env_vars``.
    """
    settings = self.settings

    rendezvous_server = RendezvousServer(settings.verbose)
    self.rendezvous = rendezvous_server

    driver_options = dict(
        rendezvous=rendezvous_server,
        discovery=settings.discovery,
        min_np=settings.min_np,
        max_np=settings.max_np,
        timeout=settings.elastic_timeout,
        reset_limit=settings.reset_limit,
        verbose=settings.verbose,
    )
    elastic_driver = ElasticDriver(**driver_options)
    self.driver = elastic_driver

    rendezvous_port = rendezvous_server.start(
        create_rendezvous_handler(elastic_driver))

    requested_slots = settings.num_proc
    # NOTE(review): this first wait looks subsumed by the min_hosts-aware
    # wait below -- confirm before removing it.
    elastic_driver.wait_for_available_slots(requested_slots)

    # Detecting the interface shared between hosts needs at least
    # 2 hosts when running an elastic job.
    required_hosts = _get_min_start_hosts(settings)
    available = elastic_driver.wait_for_available_slots(
        requested_slots, min_hosts=required_hosts)

    common_nics = driver_service.get_common_interfaces(
        settings, available.host_assignment_order)
    driver_ip = network.get_driver_ip(common_nics)

    self.run_env_vars = create_run_env_vars(
        driver_ip, common_nics, rendezvous_port, elastic=True)
def start(self):
    """Bring up the rendezvous server and the elastic driver.

    Stores the rendezvous server on ``self.rendezvous``, the elastic driver
    on ``self.driver`` and the computed worker environment variables on
    ``self.run_env_vars``. Progress is reported through ``logger.debug``.
    """
    settings = self.settings

    server = RendezvousServer(settings.verbose)
    self.rendezvous = server

    driver = ElasticDriver(
        rendezvous=server,
        discovery=settings.discovery,
        min_num_proc=settings.min_num_proc,
        max_num_proc=settings.max_num_proc,
        timeout=settings.elastic_timeout,
        reset_limit=settings.reset_limit,
        verbose=settings.verbose,
    )
    self.driver = driver

    rendezvous_handler = create_rendezvous_handler(driver)

    logger.debug("[ray] starting rendezvous")
    port = server.start(rendezvous_handler)

    logger.debug(f"[ray] waiting for {self.settings.num_proc} to start.")
    # NOTE(review): this first wait looks subsumed by the min_hosts-aware
    # wait below -- confirm before removing it.
    driver.wait_for_available_slots(settings.num_proc)

    # Detecting the interface shared between hosts needs at least
    # 2 hosts when running an elastic job.
    host_floor = _get_min_start_hosts(settings)
    ready_hosts = driver.wait_for_available_slots(
        settings.num_proc, min_hosts=host_floor)

    logger.debug("[ray] getting common interfaces")
    interfaces = detect_nics(
        settings,
        all_host_names=ready_hosts.host_assignment_order,
    )

    logger.debug("[ray] getting driver IP")
    # The driver address is whatever the local hostname resolves to.
    local_hostname = socket.gethostname()
    driver_ip = socket.gethostbyname(local_hostname)

    self.run_env_vars = create_run_env_vars(
        driver_ip, interfaces, port, elastic=True)
def start(self,
          executable_cls: type = None,
          executable_args: Optional[List] = None,
          executable_kwargs: Optional[Dict] = None,
          extra_env_vars: Optional[Dict] = None):
    """Bring up the Horovod driver and its supporting services.

    Besides launching the rendezvous server and the elastic driver, this
    records the executable configuration on the instance
    (``executable_cls``, ``executable_args``, ``executable_kwargs`` and
    ``env_vars``).

    Args:
        executable_cls (type): Class to be created inside an actor
            (BaseHorovodWorker), letting Horovod establish its
            connections and set env vars.
        executable_args (List): Positional arguments handed to the
            worker class when it is initialized.
        executable_kwargs (Dict): Keyword arguments handed to the
            worker class when it is initialized.
        extra_env_vars (Dict): Environment variables to set on the
            actors (worker processes) before initialization. A falsy
            value is stored as an empty dict.
    """
    server = RendezvousServer(self.settings.verbose)
    self.rendezvous = server

    driver_config = {
        "rendezvous": server,
        "discovery": self.settings.discovery,
        "min_np": self.min_workers,
        "max_np": self.max_workers,
        "timeout": self.elastic_timeout,
        "reset_limit": self.reset_limit,
        "cooldown_range": self.cooldown_range,
        "verbose": self.settings.verbose,
    }
    driver = ElasticDriver(**driver_config)
    self.driver = driver

    rendezvous_handler = create_rendezvous_handler(driver)

    logger.debug("[ray] starting rendezvous")
    port = server.start(rendezvous_handler)

    logger.debug(f"[ray] waiting for {self.num_workers} to start.")
    # NOTE(review): this first wait looks subsumed by the min_hosts-aware
    # wait below -- confirm before removing it.
    driver.wait_for_available_slots(self.num_workers)

    # Detecting the interface shared between hosts needs at least
    # 2 hosts when running an elastic job.
    host_floor = _get_min_start_hosts(self.settings)
    ready_hosts = driver.wait_for_available_slots(
        self.num_workers, min_hosts=host_floor)

    logger.debug("[ray] getting common interfaces")
    interfaces = detect_nics(
        self.settings,
        all_host_names=ready_hosts.host_assignment_order,
    )

    logger.debug("[ray] getting driver IP")
    # The driver address is whatever the local hostname resolves to.
    local_hostname = socket.gethostname()
    driver_ip = socket.gethostbyname(local_hostname)

    self.run_env_vars = create_run_env_vars(
        driver_ip, interfaces, port, elastic=True)

    # Keep the worker construction details for later use.
    self.executable_cls = executable_cls
    self.executable_args = executable_args
    self.executable_kwargs = executable_kwargs
    self.env_vars = extra_env_vars if extra_env_vars else {}