コード例 #1
0
ファイル: runner.py プロジェクト: fl183/horovod
    def detect_nics(self):
        """Choose the network interfaces to use and expose them as env vars.

        Decomposed version of driver_service.get_common_interfaces().

        The host names come from ``self.coordinator.hostnames_by_rank``.
        If none of them is remote, the interfaces are obtained from
        ``driver_service.get_local_interfaces``. Otherwise
        ``self.settings.nics`` is used when it is non-empty, and
        ``_driver_fn`` is run over ``self.colocators`` as a fallback.

        Returns:
            dict: ``"HOROVOD_GLOO_IFACE"`` mapped to the first interface and
            ``"NCCL_SOCKET_IFNAME"`` mapped to all interfaces joined by
            commas.
        """
        hosts = list(self.coordinator.hostnames_by_rank)
        remote_hosts = network.filter_local_addresses(hosts)

        if len(remote_hosts) == 0:
            # Every host is local: no cross-host check is needed.
            nics = driver_service.get_local_interfaces(self.settings)
        else:
            # Interfaces already present in the settings take precedence.
            nics = self.settings.nics
            if not nics:
                if self.settings.verbose >= 2:
                    print('Testing interfaces on all hosts.')

                local_hosts = set(hosts) - set(remote_hosts)
                nics = _driver_fn(
                    self.colocators, hosts, local_hosts, self.settings)

                if self.settings.verbose >= 2:
                    print('Interfaces on all hosts were successfully checked.')
                    joined = ' '.join(nics)
                    print('Common interface found: ' + joined)

        gloo_iface = list(nics)[0]
        # TODO (carried over from the original; its intent is not stated)
        nccl_ifnames = ",".join(nics)
        return {
            "HOROVOD_GLOO_IFACE": gloo_iface,
            "NCCL_SOCKET_IFNAME": nccl_ifnames,
        }
コード例 #2
0
ファイル: utils.py プロジェクト: zuston/horovod
def detect_nics(settings,
                all_host_names: List[str],
                node_workers: Optional[List] = None) -> List[str]:
    """Return the nics that are available on all of the given nodes.

    Pass the result to `nics_to_env_var` to build the environment
    variables used when starting Horovod.

    This is a decomposed version of driver_service.get_common_interfaces().

    Resolution order:

    1. If ``all_host_names`` contains no remote host name, the local
       interfaces from ``driver_service.get_local_interfaces`` are returned.
    2. Otherwise, if ``settings.nics`` is non-empty it is returned as is.
    3. Otherwise Horovod runs a nic detection scheme that pings each
       adjacent host to find the right nic.

    Args:
        settings: Horovod Settings object.
        all_host_names (list): List of all host names, including localhost.
        node_workers (list): Optional list of Ray Actors used to conduct
            the detection scheme. If no list is provided, Ray will start
            some lightweight actors on each node and stop them after the
            nics are found.

    Returns:
        List of nics (str).
    """
    remote_hosts = network.filter_local_addresses(all_host_names)
    if len(remote_hosts) == 0:
        # Purely local run: no cross-host detection required.
        return driver_service.get_local_interfaces(settings)

    nics = settings.nics
    if nics:
        # Interfaces already present in the settings take precedence.
        return nics

    worker_scope = _maybe_create_workers(
        all_host_names, existing_workers=node_workers)
    with worker_scope as workers:
        if settings.verbose >= 2:
            print('Testing interfaces on all hosts.')

        local_hosts = set(all_host_names) - set(remote_hosts)
        nics = _driver_fn(workers, all_host_names, local_hosts, settings)

        if settings.verbose >= 2:
            print('Interfaces on all hosts were successfully checked.')
            joined = ' '.join(nics)
            print('Common interface found: ' + joined)
    return nics