Example #1
File: horovod.py Project: rlan/ray
    def on_start(self, worker_group: WorkerGroup,
                 backend_config: HorovodConfig):

        # TODO(matt): Implement placement group strategies in BackendExecutor.

        # Initialize workers with Horovod environment variables
        setup_futures = []
        for rank in range(len(worker_group)):
            worker_node_id = worker_group.workers[rank].metadata.node_id
            setup_futures.append(
                worker_group.execute_single_async(rank, init_env_vars, rank,
                                                  len(worker_group),
                                                  worker_node_id))
        ray.get(setup_futures)

        # Use the Horovod Ray Coordinator, with backend_config as its settings.
        self.coordinator = Coordinator(backend_config)

        # Get the hostnames and node IDs of all workers.
        node_ids = [w.metadata.node_id for w in worker_group.workers]
        hostnames = [w.metadata.hostname for w in worker_group.workers]
        # Register each hostname with the coordinator. Assumes the hostname
        # ordering is the same.
        for rank, (hostname, node_id) in enumerate(zip(hostnames, node_ids)):
            self.coordinator.register(hostname, node_id, rank)
        all_info = self.coordinator.finalize_registration()

        setup_futures = []
        for rank, local_cross_env_var in all_info.items():
            setup_futures.append(
                worker_group.execute_single_async(rank, update_env_vars,
                                                  local_cross_env_var))
        ray.get(setup_futures)

        coordinator_envs = self.coordinator.establish_rendezvous()

        nics = detect_nics(
            backend_config,
            all_host_names=list(self.coordinator.hostnames),
            node_workers=worker_group.workers)
        coordinator_envs.update(nics_to_env_var(nics))

        worker_group.execute(update_env_vars, coordinator_envs)
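
The helpers init_env_vars and update_env_vars are defined outside this snippet. Below is a minimal sketch of what such helpers could look like, assuming they simply seed each worker's process environment with its Horovod rank/size and merge in the coordinator-provided variables; the function bodies and variable names are assumptions for illustration, not taken from the source file.

import os

def init_env_vars(world_rank: int, world_size: int, node_id: str):
    # Sketch (assumed): seed per-worker Horovod environment variables.
    os.environ["HOROVOD_HOSTNAME"] = node_id
    os.environ["HOROVOD_RANK"] = str(world_rank)
    os.environ["HOROVOD_SIZE"] = str(world_size)

def update_env_vars(env_vars: dict):
    # Sketch (assumed): merge coordinator-provided variables into the
    # worker's environment, coercing values to strings as os.environ requires.
    os.environ.update({k: str(v) for k, v in env_vars.items()})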
Example #2
    def on_start(self, worker_group: WorkerGroup, backend_config: TorchConfig):
        if len(worker_group) > 1 and dist.is_available():
            # Set the appropriate training backend.
            if backend_config.backend is None:
                if worker_group.num_gpus_per_worker > 0:
                    backend = "nccl"
                else:
                    backend = "gloo"
            else:
                backend = backend_config.backend

            master_addr, master_port = worker_group.execute_single(
                0, get_address_and_port)
            if backend_config.init_method == "env":

                def set_env_vars(addr, port):
                    os.environ["MASTER_ADDR"] = addr
                    os.environ["MASTER_PORT"] = str(port)

                worker_group.execute(set_env_vars,
                                     addr=master_addr,
                                     port=master_port)
                url = "env://"
            elif backend_config.init_method == "tcp":
                url = f"tcp://{master_addr}:{master_port}"
            else:
                raise ValueError(
                    f"The provided init_method ("
                    f"{backend_config.init_method}) is not supported. Must "
                    f"be either 'env' or 'tcp'.")

            setup_futures = []
            for i in range(len(worker_group)):
                setup_futures.append(
                    worker_group.execute_single_async(
                        i,
                        setup_torch_process_group,
                        backend=backend,
                        world_rank=i,
                        world_size=len(worker_group),
                        init_method=url,
                        timeout_s=backend_config.timeout_s))
            ray.get(setup_futures)
        else:
            logger.info("Distributed torch is not being used.")
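
setup_torch_process_group is likewise defined elsewhere in the module. A plausible sketch is shown below, assuming the helper wraps torch.distributed.init_process_group with the arguments passed above; the body is an assumption, not the project's actual implementation.

from datetime import timedelta
import torch.distributed as dist

def setup_torch_process_group(backend: str, world_rank: int, world_size: int,
                              init_method: str, timeout_s: int = 1800):
    # Sketch (assumed): initialize the default process group on this worker
    # so that subsequent torch.distributed collectives can run.
    dist.init_process_group(
        backend=backend,
        init_method=init_method,
        rank=world_rank,
        world_size=world_size,
        timeout=timedelta(seconds=timeout_s))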
Example #3
    def on_start(self, worker_group: WorkerGroup,
                 backend_config: TensorflowConfig):
        if len(worker_group) > 1:
            # Compute URL for initializing distributed setup.
            def get_url():
                address, port = get_address_and_port()
                return f"{address}:{port}"

            urls = worker_group.execute(get_url)

            # Run the setup tasks asynchronously and wait on them so that
            # any failures surface as errors.
            setup_futures = []
            for i in range(len(worker_group)):
                setup_futures.append(
                    worker_group.execute_single_async(
                        i,
                        setup_tensorflow_environment,
                        worker_addresses=urls,
                        index=i))
            ray.get(setup_futures)

        else:
            logger.info("Distributed Tensorflow is not being used.")
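
setup_tensorflow_environment is also defined outside this snippet. The sketch below assumes it publishes the standard TF_CONFIG environment variable that tf.distribute.MultiWorkerMirroredStrategy reads to discover its peers and its own task index; the body is an assumption for illustration only.

import json
import os
from typing import List

def setup_tensorflow_environment(worker_addresses: List[str], index: int):
    # Sketch (assumed): describe the cluster layout and this worker's task
    # index in the TF_CONFIG format expected by distributed TensorFlow.
    tf_config = {
        "cluster": {"worker": worker_addresses},
        "task": {"type": "worker", "index": index},
    }
    os.environ["TF_CONFIG"] = json.dumps(tf_config)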