Example 1
import numpy as np

import ray
from ray.util.queue import Queue


class WorkQueue:
    def __init__(self, max_depth: int = 8):
        self._queue = Queue(maxsize=max_depth)

    def get_queue(self):
        """
        :return: Ray Queue actor, needed by the consumers.
        """
        return self._queue

    def empty(self):
        """
        :return: True if the queue is empty, False otherwise.
        """
        return self._queue.empty()

    def group(self, labels_all: np.ndarray, probs_all: np.ndarray,
              filename: str, original_shape: tuple, inference_time_sec: float,
              page_number: int) -> dict:
        return {
            "labels_all": labels_all,
            "probs_all": probs_all,
            "filename": filename,
            "original_shape": original_shape,
            "inference_time_sec": inference_time_sec,
            "page_number": page_number
        }

    def ungroup(self, dictionary):
        """
        Use like: labels_all, probs_all, filename, original_shape, inference_time_sec, page_number = ungroup(d)
        :param dictionary: a dictionary created with the group() method.
        :return: the grouped values, in the same order as the group() arguments.
        """
        return (dictionary["labels_all"], dictionary["probs_all"],
                dictionary["filename"], dictionary["original_shape"],
                dictionary["inference_time_sec"], dictionary["page_number"])

    def push(self, dictionary):
        """
        Push a dictionary of params to post-process. Blocks for flow control while
        the queue is full and proceeds once the queue has space again.
        :param dictionary: a dictionary created with group() method.
        :return: None
        """
        # put in object store
        ref = ray.put(dictionary)
        # put ref in queue
        self._queue.put(ref)
        return None

    def pop(self):
        """
        :return: a dictionary created with group() method, use ungroup() to unpack or lookup individually.
        """
        return self._queue.get()
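A minimal producer/consumer sketch around this class (hypothetical driver code; only WorkQueue and its methods come from the example above, and pop() is used per its documented contract of returning the grouped dictionary):

import numpy as np

import ray

ray.init()
wq = WorkQueue(max_depth=8)

# Producer side: push one grouped result (blocks while the queue is full).
wq.push(wq.group(
    labels_all=np.zeros(10, dtype=np.int64),
    probs_all=np.zeros((10, 2)),
    filename="page_001.png",
    original_shape=(1024, 768),
    inference_time_sec=0.12,
    page_number=1,
))

# Consumer side: pop the grouped dictionary and unpack it.
d = wq.pop()
labels, probs, filename, shape, t_sec, page = wq.ungroup(d)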
Example 2
def test_simple_usage(ray_start_regular_shared):

    q = Queue()

    items = list(range(10))

    for item in items:
        q.put(item)

    for item in items:
        assert item == q.get()
Example 3
def test_put(ray_start_regular_shared):

    q = Queue(1)

    item = 0
    q.put(item, block=False)
    assert q.get() == item

    item = 1
    q.put(item, timeout=0.2)
    assert q.get() == item

    with pytest.raises(ValueError):
        q.put(0, timeout=-1)

    q.put(0)
    with pytest.raises(Full):
        q.put_nowait(1)

    with pytest.raises(Full):
        q.put(1, timeout=0.2)
Example 4
def test_async_put(ray_start_regular_shared):
    q = Queue(1)
    q.put(1)
    future = async_put.remote(q, 2)

    with pytest.raises(Full):
        q.put_nowait(3)

    with pytest.raises(GetTimeoutError):
        ray.get(future, timeout=0.1)  # task not canceled on timeout.

    assert q.get() == 1
    assert q.get() == 2
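The async_put helper used above is not defined in this excerpt. A minimal sketch of what it presumably looks like, assuming a Ray remote task that performs a blocking put on the queue it receives:

@ray.remote
def async_put(q, item):
    # Blocks inside the remote task until the queue has free space.
    q.put(item, block=True)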
Example 5
def test_qsize(ray_start_regular_shared):

    q = Queue()

    items = list(range(10))
    size = 0

    assert q.qsize() == size

    for item in items:
        q.put(item)
        size += 1
        assert q.qsize() == size

    for item in items:
        assert q.get() == item
        size -= 1
        assert q.qsize() == size
Example 6
def test_get(ray_start_regular_shared):

    q = Queue()

    item = 0
    q.put(item)
    assert q.get(block=False) == item

    item = 1
    q.put(item)
    assert q.get(timeout=0.2) == item

    with pytest.raises(ValueError):
        q.get(timeout=-1)

    with pytest.raises(Empty):
        q.get_nowait()

    with pytest.raises(Empty):
        q.get(timeout=0.2)
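Examples 2 through 6 are excerpts from a pytest module and rely on imports that are not shown. A plausible set, assuming Ray's queue utilities (exact module paths may differ between Ray versions); ray_start_regular_shared is presumably a fixture that provides a shared local Ray instance:

import pytest

import ray
from ray.exceptions import GetTimeoutError
from ray.util.queue import Empty, Full, Queue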
Example 7
class DistributedPool(DistributedPoolAPI):
    """Distributed implementation of the resource pool API.

    A resource pool object which controls a pool of resources (CPU, GPU, ...)
    to which jobs can be submitted. It supports asynchronous results with
    timeouts and callbacks and has a parallel map implementation.
    """
    def __init__(
        self,
        n_worker: int,
        n_cpu_per_worker: int,
        memory_limit_per_worker: float = 0,
        n_gpu_per_worker: float = 0,
        max_pending_task: int = 10000,
        local_pool_class: Type[LocalPoolAPI] = LocalPool,
    ) -> None:
        """RayDistributedCluster constructor"""

        super().__init__(
            n_worker=n_worker,
            n_cpu_per_worker=n_cpu_per_worker,
            memory_limit_per_worker=memory_limit_per_worker,
            n_gpu_per_worker=n_gpu_per_worker,
            local_pool_class=local_pool_class,
        )
        self.max_pending_task = max_pending_task
        # Initialize before the result-consumer thread starts, since the
        # thread reads this mapping.
        self.processed_results: Dict[uuid.UUID, _RayAsyncResult] = {}
        # create task and results queues
        self.task_queue = Queue(max_pending_task)
        self.result_queue = Queue(max_pending_task)

        # consume processed results from result_queue
        # start consuming result queue
        def consume_result_queue():
            self.started = True
            while self.started:
                try:
                    result = self.result_queue.get(timeout=1, block=True)
                    if type(result) is str:
                        continue

                    if result and result.task_id in self.processed_results:
                        self.processed_results[result.task_id].result = result
                        del self.processed_results[result.task_id]
                except Empty:
                    continue
                except (RayActorError, AttributeError):
                    break

        self.result_consumer_thread = threading.Thread(
            target=consume_result_queue)
        self.result_consumer_thread.start()

        # start actors
        opt = {
            "num_cpus": n_cpu_per_worker,
            "num_gpus": n_gpu_per_worker,
        }

        self.actor_pool = [
            _RayExecutorActor.options(**opt).remote(  # type: ignore
                self.task_queue,
                self.result_queue,
                self.create_local_pool(n_cpu=n_cpu_per_worker,
                                       memory_limit=0,
                                       n_visible_gpu=[]),
            ) for _ in range(n_worker)
        ]
        for a in self.actor_pool:
            a.start.remote()

        # wait agent ready
        # self.result_queue.get(block=True, timeout=30)


    def apply_async(
        self,
        func: Callable[..., _OutputType],
        args: Optional[Iterable[Any]] = None,
        kwds: Optional[Mapping[str, Any]] = None,
        callback: Optional[Callable[[_OutputType], None]] = None,
        error_callback: Optional[Callable[[BaseException], None]] = None,
    ) -> AsyncResult[_OutputType]:
        __doc__ = super().apply_async.__doc__  # noqa: F841
        task = _RayTask(
            task_id=uuid4(),
            func=func,
            args=args,
            kwds=kwds,
            callback=callback,
            error_callback=error_callback,
        )
        self.task_queue.put(task)
        async_res = _RayAsyncResult[_OutputType](task.task_id)
        self.processed_results[task.task_id] = async_res
        return async_res

    def map_async(
        self,
        func: Callable[[_InputType], _OutputType],
        iterable: Iterable[_InputType],
        chunksize: Optional[int] = 1,
        callback: Optional[Callable[[_OutputType], None]] = None,
        error_callback: Optional[Callable[[BaseException], None]] = None,
    ) -> MapResult[_OutputType]:
        __doc__ = super().map_async.__doc__  # noqa: F841

        chunk_async_results: List[AsyncResult[List[_OutputType]]] = []
        for c in mitertools.divide(self.n_worker, iterable=iterable):
            task = _RayMapTask(
                task_id=uuid4(),
                func=func,
                args=c,
                callback=callback,
                error_callback=error_callback,
            )

            chunk_async_result = _RayAsyncResult[List[_OutputType]](
                task.task_id)
            chunk_async_results.append(chunk_async_result)
            self.processed_results[task.task_id] = chunk_async_result

            self.task_queue.put(task)

        async_res: MapResult[_OutputType] = _RayAsyncMapResult[_OutputType](
            async_results=chunk_async_results)

        return async_res

    def terminate(self) -> None:
        __doc__ = super().terminate.__doc__  # noqa: F841

        self.close()
        for a in self.actor_pool:
            ray.kill(a)

    def close(self) -> None:
        __doc__ = super().close.__doc__  # noqa: F841
        self.started = False
        for a in self.actor_pool:
            a.stop.remote()

        ray.kill(self.result_queue.actor)
        ray.kill(self.task_queue.actor)

        sleep(1)

    def create_local_pool(self,
                          n_cpu: int = 0,
                          memory_limit: float = 0,
                          n_visible_gpu: List[int] = []) -> LocalPoolAPI:
        if memory_limit == 0:
            memory_limit = self.memory_limit_per_worker
        return self.local_pool_class(n_cpu,
                                     memory_limit,
                                     n_visible_gpu,
                                     lazy=True)
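A rough usage sketch (hypothetical driver code; it assumes Ray is already initialized and that the returned AsyncResult objects expose a multiprocessing-style get(timeout)):

pool = DistributedPool(n_worker=2, n_cpu_per_worker=1)

# Submit one task and one chunked map.
single = pool.apply_async(pow, args=(2, 10))
squares = pool.map_async(lambda x: x * x, range(8))

print(single.get(timeout=60))   # expected: 1024
print(squares.get(timeout=60))  # expected: [0, 1, 4, ..., 49]

pool.close()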
Example 8
class DockerCluster:
    """Docker cluster wrapper.

    Creates a directory for starting a fake multinode docker cluster.

    Includes APIs to update the cluster config as needed in tests,
    and to start and connect to the cluster.
    """
    def __init__(self, config: Optional[Dict[str, Any]] = None):
        self._base_config_file = os.path.join(os.path.dirname(__file__),
                                              "example_docker.yaml")
        self._tempdir = None
        self._config_file = None
        self._nodes_file = None
        self._nodes = {}
        self._status_file = None
        self._status = {}
        self._partial_config = config
        self._cluster_config = None
        self._docker_image = None

        self._monitor_script = os.path.join(os.path.dirname(__file__),
                                            "docker_monitor.py")
        self._monitor_process = None

        self._execution_thread = None
        self._execution_event = threading.Event()
        self._execution_queue = None

    @property
    def config_file(self):
        return self._config_file

    @property
    def cluster_config(self):
        return self._cluster_config

    @property
    def cluster_dir(self):
        return self._tempdir

    @property
    def gcs_port(self):
        return self._cluster_config.get("provider",
                                        {}).get("host_gcs_port",
                                                FAKE_DOCKER_DEFAULT_GCS_PORT)

    @property
    def client_port(self):
        return self._cluster_config.get("provider", {}).get(
            "host_client_port", FAKE_DOCKER_DEFAULT_CLIENT_PORT)

    def connect(self, client: bool = True, timeout: int = 120, **init_kwargs):
        """Connect to the docker-compose Ray cluster.

        Assumes the cluster is at RAY_TESTHOST (defaults to
        ``127.0.0.1``).

        Args:
            client: If True, uses Ray client to connect to the
                cluster. If False, uses GCS to connect to the cluster.
            timeout: Connection timeout in seconds.
            **init_kwargs: kwargs to pass to ``ray.init()``.

        """
        host = os.environ.get("RAY_TESTHOST", "127.0.0.1")

        if client:
            port = self.client_port
            address = f"ray://{host}:{port}"
        else:
            port = self.gcs_port
            address = f"{host}:{port}"

        timeout_at = time.monotonic() + timeout
        while time.monotonic() < timeout_at:
            try:
                ray.init(address, **init_kwargs)
                self.wait_for_resources({"CPU": 1})
            except ResourcesNotReadyError:
                time.sleep(1)
                continue
            else:
                break

        try:
            ray.cluster_resources()
        except Exception as e:
            raise RuntimeError(f"Timed out connecting to Ray: {e}")

    def remote_execution_api(self) -> "RemoteAPI":
        """Create an object to control cluster state from within the cluster."""
        self._execution_queue = Queue(actor_options={"num_cpus": 0})
        stop_event = self._execution_event

        def entrypoint():
            while not stop_event.is_set():
                try:
                    cmd, kwargs = self._execution_queue.get(timeout=1)
                except Empty:
                    continue

                if cmd == "kill_node":
                    self.kill_node(**kwargs)

        self._execution_thread = threading.Thread(target=entrypoint)
        self._execution_thread.start()

        return RemoteAPI(self._execution_queue)

    @staticmethod
    def wait_for_resources(resources: Dict[str, float], timeout: int = 60):
        """Wait until Ray cluster resources are available

        Args:
            resources: Minimum resources needed before
                this function returns.
            timeout: Timeout in seconds.

        """
        timeout = time.monotonic() + timeout

        available = ray.cluster_resources()
        while any(available.get(k, 0.0) < v for k, v in resources.items()):
            if time.monotonic() > timeout:
                raise ResourcesNotReadyError(
                    f"Timed out waiting for resources: {resources}")
            time.sleep(1)
            available = ray.cluster_resources()

    def update_config(self, config: Optional[Dict[str, Any]] = None):
        """Update autoscaling config.

        Does a deep update of the base config with a new configuration.
        This can change autoscaling behavior.

        Args:
            config: Partial config to update current
                config with.

        """
        assert self._tempdir, "Call setup() first"

        config = config or {}

        if config:
            self._partial_config = config

        if not config.get("provider", {}).get("image"):
            # No image specified, trying to parse from buildkite
            docker_image = os.environ.get("RAY_DOCKER_IMAGE", None)

            if not docker_image:
                # If still no docker image, use one according to Python version
                mj = sys.version_info.major
                mi = sys.version_info.minor

                docker_image = DEFAULT_DOCKER_IMAGE.format(major=mj, minor=mi)

            self._docker_image = docker_image

        with open(self._base_config_file, "rt") as f:
            cluster_config = yaml.safe_load(f)

        if self._partial_config:
            deep_update(cluster_config,
                        self._partial_config,
                        new_keys_allowed=True)

        if self._docker_image:
            cluster_config["provider"]["image"] = self._docker_image

        cluster_config["provider"]["shared_volume_dir"] = self._tempdir

        self._cluster_config = cluster_config

        with open(self._config_file, "wt") as f:
            yaml.safe_dump(self._cluster_config, f)

        logging.info(f"Updated cluster config to: {self._cluster_config}")

    def maybe_pull_image(self):
        if self._docker_image:
            try:
                images_str = subprocess.check_output(
                    f"docker image inspect {self._docker_image}", shell=True)
                images = json.loads(images_str)
            except Exception as e:
                logger.error(
                    f"Error inspecting image {self._docker_image}: {e}")
                return

            if not images:
                try:
                    subprocess.check_output(
                        f"docker pull {self._docker_image}", shell=True)
                except Exception as e:
                    logger.error(
                        f"Error pulling image {self._docker_image}: {e}")

    def setup(self):
        """Setup docker compose cluster environment.

        Creates the temporary directory, writes the initial config file,
        and pulls the docker image, if required.
        """
        self._tempdir = tempfile.mkdtemp(
            dir=os.environ.get("RAY_TEMPDIR", None))
        os.chmod(self._tempdir, 0o777)
        self._config_file = os.path.join(self._tempdir, "cluster.yaml")
        self._nodes_file = os.path.join(self._tempdir, "nodes.json")
        self._status_file = os.path.join(self._tempdir, "status.json")
        self.update_config()
        self.maybe_pull_image()

    def teardown(self, keep_dir: bool = False):
        """Tear down docker compose cluster environment.

        Args:
            keep_dir: If True, cluster directory
                will not be removed after termination.
        """
        if not keep_dir:
            shutil.rmtree(self._tempdir)
        self._tempdir = None
        self._config_file = None

    def _start_monitor(self):
        self._monitor_process = subprocess.Popen(
            ["python", self._monitor_script, self.config_file])
        time.sleep(2)

    def _stop_monitor(self):
        if self._monitor_process:
            self._monitor_process.wait(timeout=30)
            if self._monitor_process.poll() is None:
                self._monitor_process.terminate()
        self._monitor_process = None

    def start(self):
        """Start docker compose cluster.

        Starts the monitor process and runs ``ray up``.
        """
        self._start_monitor()

        subprocess.check_output(
            f"RAY_FAKE_CLUSTER=1 ray up -y {self.config_file}", shell=True)

    def stop(self):
        """Stop docker compose cluster.

        Runs ``ray down`` and stops the monitor process.
        """
        if ray.is_initialized():
            ray.shutdown()

        subprocess.check_output(
            f"RAY_FAKE_CLUSTER=1 ray down -y {self.config_file}", shell=True)

        self._stop_monitor()
        self._execution_event.set()

    def _update_nodes(self):
        with open(self._nodes_file, "rt") as f:
            self._nodes = json.load(f)

    def _update_status(self):
        with open(self._status_file, "rt") as f:
            self._status = json.load(f)

    def _get_node(
        self,
        node_id: Optional[str] = None,
        num: Optional[int] = None,
        rand: Optional[str] = None,
    ) -> str:
        self._update_nodes()
        if node_id:
            assert (not num and not rand
                    ), "Only provide either `node_id`, `num`, or `random`."
        elif num:
            assert (not node_id and not rand
                    ), "Only provide either `node_id`, `num`, or `random`."
            base = "fffffffffffffffffffffffffffffffffffffffffffffffffff"
            node_id = base + str(num).zfill(5)
        elif rand:
            assert (not node_id and not num
                    ), "Only provide either `node_id`, `num`, or `random`."
            assert rand in [
                "worker",
                "any",
            ], "`random` must be one of ['worker', 'any']"
            choices = list(self._nodes.keys())
            if rand == "worker":
                choices.remove(
                    "fffffffffffffffffffffffffffffffffffffffffffffffffff00000")
            # Else: any
            node_id = random.choice(choices)

        assert node_id in self._nodes, f"Node with ID {node_id} is not in active nodes."
        return node_id

    def _get_docker_container(self, node_id: str) -> Optional[str]:
        self._update_status()
        node_status = self._status.get(node_id)
        if not node_status:
            return None

        return node_status["Name"]

    def kill_node(
        self,
        node_id: Optional[str] = None,
        num: Optional[int] = None,
        rand: Optional[str] = None,
    ):
        """Kill node.

        If ``node_id`` is given, kill that node.

        If ``num`` is given, construct node_id from this number, and kill
        that node.

        If ``rand`` is given (as either ``worker`` or ``any``), kill a random
        node.
        """
        node_id = self._get_node(node_id=node_id, num=num, rand=rand)
        container = self._get_docker_container(node_id=node_id)
        subprocess.check_output(f"docker kill {container}", shell=True)
Example 9
def _train(params: Dict,
           dtrain: RayDMatrix,
           *args,
           evals=(),
           num_actors: int = 4,
           cpus_per_actor: int = 0,
           gpus_per_actor: int = -1,
           resources_per_actor: Optional[Dict] = None,
           checkpoint_prefix: Optional[str] = None,
           checkpoint_path: str = "/tmp",
           checkpoint_frequency: int = 5,
           **kwargs) -> Tuple[xgb.Booster, Dict, Dict]:
    _assert_ray_support()

    if not ray.is_initialized():
        ray.init()

    if gpus_per_actor == -1:
        gpus_per_actor = 0
        if "tree_method" in params and params["tree_method"].startswith("gpu"):
            gpus_per_actor = 1

    if cpus_per_actor <= 0:
        cluster_cpus = _ray_get_cluster_cpus() or 1
        cpus_per_actor = min(int(_get_max_node_cpus() or 1),
                             int(cluster_cpus // num_actors))

    if "nthread" in params:
        if params["nthread"] > cpus_per_actor:
            raise ValueError(
                "Specified number of threads greater than number of CPUs. "
                "\nFIX THIS by passing a lower value for the `nthread` "
                "parameter or a higher number for `cpus_per_actor`.")
    else:
        params["nthread"] = cpus_per_actor

    # Create queue for communication from worker to caller.
    # Always create queue.
    queue = Queue()

    # Create remote actors
    actors = [
        _create_actor(i, num_actors, cpus_per_actor, gpus_per_actor,
                      resources_per_actor, queue, checkpoint_prefix,
                      checkpoint_path, checkpoint_frequency)
        for i in range(num_actors)
    ]
    logger.info(f"[RayXGBoost] Created {len(actors)} remote actors.")

    # Split data across workers
    wait_load = []
    for _, actor in enumerate(actors):
        wait_load.extend(_trigger_data_load(actor, dtrain, evals))

    try:
        ray.get(wait_load)
    except Exception:
        _shutdown(actors, queue, force=True)
        raise

    logger.info("[RayXGBoost] Starting XGBoost training.")

    # Start tracker
    env = _start_rabit_tracker(num_actors)
    rabit_args = [("%s=%s" % item).encode() for item in env.items()]

    # Train
    fut = [
        actor.train.remote(rabit_args, params, dtrain, evals, *args, **kwargs)
        for actor in actors
    ]

    callback_returns = [list() for _ in range(len(actors))]
    try:
        not_ready = fut
        while not_ready:
            if queue:
                while not queue.empty():
                    (actor_rank, item) = queue.get()
                    if isinstance(item, Callable):
                        item()
                    else:
                        callback_returns[actor_rank].append(item)
            ready, not_ready = ray.wait(not_ready, timeout=0)
            logger.debug("[RayXGBoost] Waiting for results...")
            ray.get(ready)
        # Once everything is ready
        ray.get(fut)
    # The inner loop should catch all exceptions
    except Exception:
        _shutdown(remote_workers=actors, queue=queue, force=True)
        raise

    # All results should be the same because of Rabit tracking. So we just
    # return the first one.
    res: Dict[str, Any] = ray.get(fut[0])
    bst = res["bst"]
    evals_result = res["evals_result"]
    additional_results = {}

    if callback_returns:
        additional_results["callback_returns"] = callback_returns

    all_res = ray.get(fut)
    total_n = sum(res["train_n"] or 0 for res in all_res)

    logger.info(f"[RayXGBoost] Finished XGBoost training on training data "
                f"with total N={total_n:,}.")

    if checkpoint_prefix:
        _cleanup(checkpoint_prefix, checkpoint_path, num_actors)

    _shutdown(remote_workers=actors, queue=queue, force=False)

    return bst, evals_result, additional_results
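A minimal invocation sketch (hypothetical; it assumes RayDMatrix accepts in-memory arrays plus a label, as in xgboost_ray, and that extra keyword arguments such as num_boost_round are forwarded to the per-actor xgb.train call):

import numpy as np

X = np.random.rand(1000, 10)
y = (X[:, 0] > 0.5).astype(int)

dtrain = RayDMatrix(X, y)
params = {"objective": "binary:logistic", "eval_metric": "logloss"}

bst, evals_result, extra = _train(
    params,
    dtrain,
    evals=[(dtrain, "train")],
    num_actors=2,
    cpus_per_actor=1,
    num_boost_round=10,
)
bst.save_model("model.xgb")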