Ejemplo n.º 1
0
def _start_rabit_tracker(num_workers: int):
    """Launch a Rabit tracker and return the environment describing it.

    The workers connect to this tracker to share their results.

    Args:
        num_workers: Number of workers; passed to the tracker as ``nslave``,
            to ``start``, and stored under ``"DMLC_NUM_WORKER"``.

    Returns:
        A dict holding ``"DMLC_NUM_WORKER"`` plus whatever the tracker's
        ``slave_envs()`` reports.
    """
    # TODO (hme): Cleanup thread and tracker after training.
    tracker = xgb.RabitTracker(
        hostIP=systems_utils.get_private_ip(),
        nslave=num_workers,
    )

    # Seed with the worker count, then merge in the tracker's own settings.
    worker_env = {"DMLC_NUM_WORKER": num_workers}
    worker_env.update(tracker.slave_envs())

    tracker.start(num_workers)

    # Join the tracker on a daemon thread so this call returns immediately
    # and the thread does not keep the interpreter alive on exit.
    Thread(target=tracker.join, daemon=True).start()

    return worker_env
Ejemplo n.º 2
0
def test_head_detection():
    """Check how ``RaySystem`` resolves its head node from ``settings.head_ip``.

    Three configurations are exercised with ``use_head=True``:

    * ``settings.head_ip`` unset: a head node is expected to be found.
    * ``settings.head_ip`` set to ``"1.2.3.4"``: no head node is expected.
    * ``settings.head_ip`` set to this machine's private IP: a head node is
      expected to be found.

    The original ``settings.head_ip`` is restored and Ray is shut down in a
    ``finally`` block so that a failure here does not leak global state into
    other tests.
    """
    ray.init()
    original_head_ip = settings.head_ip
    try:
        assert settings.head_ip is None
        # Named ``system`` rather than ``sys`` to avoid shadowing the stdlib module.
        system = RaySystem(use_head=True)
        system.init()
        assert system._head_node is not None
        system.shutdown()

        settings.head_ip = "1.2.3.4"
        system = RaySystem(use_head=True)
        system.init()
        assert system._head_node is None
        system.shutdown()

        settings.head_ip = systems_utils.get_private_ip()
        system = RaySystem(use_head=True)
        system.init()
        assert system._head_node is not None
        system.shutdown()
    finally:
        # Undo the global mutations made above regardless of the outcome.
        settings.head_ip = original_head_ip
        ray.shutdown()
Ejemplo n.º 3
0
    def init(self):
        """Connect to Ray if needed and classify the cluster's nodes.

        If Ray is already initialized, ``self._manage_ray`` is cleared;
        otherwise, when ``self._manage_ray`` is set, ``ray.init`` is called
        with ``self.num_cpus``.

        The head IP is ``settings.head_ip``, or this machine's private IP when
        that setting is ``None``. A node from ``ray.nodes()`` whose IP matches
        is stored as ``self._head_node``; every other node with CPU resources
        is appended to both ``self._worker_nodes`` and
        ``self._available_nodes``. The head node is appended to
        ``self._available_nodes`` only when ``self._use_head`` is set and it
        has CPU resources.

        ``self._num_nodes`` defaults to the number of available nodes, and
        ``self.init_devices()`` is called last.

        Raises:
            AssertionError: If ``self._num_nodes`` exceeds the number of
                available nodes.
        """
        logger = logging.getLogger(__name__)

        if ray.is_initialized():
            self._manage_ray = False
        if self._manage_ray:
            ray.init(num_cpus=self.num_cpus)

        # Compute available nodes, based on CPU resource.
        if settings.head_ip is None:
            # TODO (hme): Have this be a class argument vs. using what's set in settings directly.
            logger.info("Using driver node ip as head node.")
            head_ip = get_private_ip()
        else:
            head_ip = settings.head_ip

        total_cpus = 0
        for node in ray.nodes():
            node_ip = self._node_ip(node)
            if head_ip == node_ip:
                logger.info("head node %s", node_ip)
                self._head_node = node
            elif self._has_cpu_resources(node):
                logger.info("worker node %s", node_ip)
                total_cpus += node["Resources"]["CPU"]
                self._worker_nodes.append(node)
                self._available_nodes.append(node)

        if self._head_node is None:
            # No node matched the head IP, so every node with CPUs was
            # registered as a worker above. That only contradicts the
            # caller's request when use of the head node was disabled.
            if not self._use_head:
                logger.warning(
                    "Failed to determine which node is the head."
                    " The head node will be used even though"
                    " nums.core.settings.use_head = False.")
        elif self._use_head and self._has_cpu_resources(self._head_node):
            total_cpus += self._head_node["Resources"]["CPU"]
            self._available_nodes.append(self._head_node)
        logger.info("total cpus %s", total_cpus)

        if self._num_nodes is None:
            self._num_nodes = len(self._available_nodes)
        assert self._num_nodes <= len(self._available_nodes)

        self.init_devices()
Ejemplo n.º 4
0
 def init(self):
     """Discover usable Ray nodes and register the remote compute functions.

     Each node from ``ray.nodes()`` must expose exactly one resource key
     containing ``"node"``; the text after the first ``":"`` in that key is
     taken as the node's IP. The node whose IP equals this machine's private
     IP is stored as ``self.head_node`` and is added to
     ``self.available_nodes`` only when ``self.use_head`` is set and it has
     at least one CPU. Every other node with at least one CPU is added to
     ``self.available_nodes``.

     Afterwards, every function extracted from ``self.compute_imp`` is
     wrapped with ``self.remote`` and stored in ``self.remote_functions``.
     The wrapper receives the ``remote_params`` attribute of the
     same-named ``ComputeInterface`` method, or ``{}`` when that attribute
     is absent.

     Raises:
         AssertionError: If a node does not have exactly one resource key
             containing ``"node"``.
         KeyError: If an extracted function has no same-named method on
             ``ComputeInterface``.
     """
     logger = logging.getLogger()

     # Compute available nodes, based on CPU resource.
     local_ip = get_private_ip()
     total_cpus = 0
     for node in ray.nodes():
         resources = node["Resources"]
         node_keys = [key for key in resources if "node" in key]
         assert len(node_keys) == 1
         node_ip = node_keys[0].split(":")[1]
         has_cpu_resources = "CPU" in resources and resources["CPU"] >= 1.0
         if local_ip == node_ip:
             logger.info("head node %s", node_ip)
             self.head_node = node
             if self.use_head and has_cpu_resources:
                 total_cpus += resources["CPU"]
                 self.available_nodes.append(node)
         elif has_cpu_resources:
             logger.info("worker node %s", node_ip)
             total_cpus += resources["CPU"]
             self.available_nodes.append(node)
     logger.info("total cpus %s", total_cpus)

     # Collect compute functions.
     module_functions = extract_functions(self.compute_imp)
     function_signatures: dict = dict(
         inspect.getmembers(ComputeInterface(), predicate=inspect.ismethod))
     for name, func in module_functions.items():
         # Only a missing attribute falls back to the empty default; any
         # other failure is allowed to surface instead of being swallowed.
         remote_params = getattr(function_signatures[name], "remote_params", {})
         self.remote_functions[name] = self.remote(func, remote_params)