Exemple #1
0
    def apply(self, fn: Any, remote_args: dict,
              blocks: List[Block[T]]) -> List[ObjectRef[Block]]:
        """Apply ``fn`` to every block using a growing pool of Ray actors.

        The pool starts with one actor. Whenever no task finishes within the
        poll timeout, more than 80% of the actors created so far have
        reported ready, and there is still undispatched input, one more
        actor is started. Each actor processes one block at a time.

        Args:
            fn: Callable invoked inside an actor as ``fn(block)``; its return
                value becomes the output block.
            remote_args: Keyword arguments forwarded to ``ray.remote`` when
                the actor class is created. ``num_cpus`` defaults to 1 when
                absent. The caller's dict is not modified.
            blocks: Input blocks to transform.

        Returns:
            One object ref per input block, in the same order as ``blocks``.
        """
        map_bar = ProgressBar("Map Progress", total=len(blocks))

        class Worker:
            def ready(self):
                return "ok"

            def process_block(self, block: Block[T]) -> Block[U]:
                return fn(block)

        # Work on a copy so the default below never leaks into the caller's
        # dict.
        remote_args = dict(remote_args)
        remote_args.setdefault("num_cpus", 1)
        Worker = ray.remote(**remote_args)(Worker)

        workers = [Worker.remote()]
        # Maps every in-flight object ref (either a `ready` handshake or a
        # `process_block` result) to the actor that will produce it.
        tasks = {w.ready.remote(): w for w in workers}
        ready_workers = set()
        # Stack of (input position, block), reversed so that pop() hands
        # blocks out in input order.
        blocks_in = list(enumerate(blocks))
        blocks_in.reverse()
        # Input position of each output ref, used to restore input order.
        block_index = {}
        blocks_out = []

        try:
            while len(blocks_out) < len(blocks):
                ready, _ = ray.wait(list(tasks),
                                    timeout=0.01,
                                    num_returns=1,
                                    fetch_local=False)
                if not ready:
                    # Scale up only while input remains: an actor started
                    # after the last block was dispatched could never be
                    # given any work.
                    if blocks_in and \
                            len(ready_workers) / len(workers) > 0.8:
                        w = Worker.remote()
                        workers.append(w)
                        tasks[w.ready.remote()] = w
                        map_bar.set_description(
                            "Map Progress ({} actors {} pending)".format(
                                len(ready_workers),
                                len(workers) - len(ready_workers)))
                    continue

                [obj_id] = ready
                worker = tasks.pop(obj_id)

                # Process task result. The first ref completed by an actor is
                # its `ready` handshake; every later one is a block result.
                if worker in ready_workers:
                    blocks_out.append(obj_id)
                    map_bar.update(1)
                else:
                    ready_workers.add(worker)

                # Schedule a new task on the now-idle actor.
                if blocks_in:
                    index, block = blocks_in.pop()
                    out_ref = worker.process_block.remote(block)
                    block_index[out_ref] = index
                    tasks[out_ref] = worker
        finally:
            # Close the bar even if waiting or scheduling raised.
            map_bar.close()

        # Results were collected in completion order; return them in input
        # order.
        blocks_out.sort(key=block_index.__getitem__)
        return blocks_out
Exemple #2
0
    def apply(self, fn: Any, remote_args: dict,
              blocks: Iterable[Block]) -> Iterable[ObjectRef[Block]]:
        """Apply ``fn`` to every block using a growing pool of Ray actors.

        The pool starts with one actor and is stored on ``self.workers``.
        Whenever no task finishes within the poll timeout, more than 80% of
        the actors created so far have reported ready, and there is still
        undispatched input, one more actor is started. Each actor processes
        one block at a time and returns the new block together with freshly
        computed metadata.

        Args:
            fn: Callable invoked inside an actor as ``fn(block)``; its return
                value becomes the output block.
            remote_args: Keyword arguments forwarded to ``ray.remote`` when
                the actor class is created. When empty, ``num_cpus=1`` is
                used. The caller's dict is not modified.
            blocks: Input blocks. Must support ``len()`` and
                ``get_metadata()`` in addition to iteration.

        Returns:
            A ``BlockList`` of output block refs and their metadata, in the
            same order as ``blocks``.
        """

        map_bar = ProgressBar("Map Progress", total=len(blocks))

        class BlockWorker:
            def ready(self):
                return "ok"

            @ray.method(num_returns=2)
            def process_block(self, block: Block,
                              meta: BlockMetadata) -> (Block, BlockMetadata):
                new_block = fn(block)
                accessor = BlockAccessor.for_block(new_block)
                # Size/schema are recomputed from the output block; only the
                # input file list is carried over from the input metadata.
                new_metadata = BlockMetadata(num_rows=accessor.num_rows(),
                                             size_bytes=accessor.size_bytes(),
                                             schema=accessor.schema(),
                                             input_files=meta.input_files)
                return new_block, new_metadata

        # Work on a copy so the default below never leaks into the caller's
        # dict. The default applies only when no options were given at all.
        remote_args = dict(remote_args)
        if not remote_args:
            remote_args["num_cpus"] = 1
        BlockWorker = ray.remote(**remote_args)(BlockWorker)

        self.workers = [BlockWorker.remote()]
        # Output block ref -> ref of its metadata.
        metadata_mapping = {}
        # Output block ref -> position of the corresponding input block.
        block_index = {}
        # Maps every in-flight object ref (either a `ready` handshake or a
        # block result) to the actor that will produce it.
        tasks = {w.ready.remote(): w for w in self.workers}
        ready_workers = set()
        # Stack of (input position, (block, metadata)), reversed so that
        # pop() hands blocks out in input order.
        blocks_in = list(enumerate(zip(blocks, blocks.get_metadata())))
        blocks_in.reverse()
        blocks_out = []

        try:
            while len(blocks_out) < len(blocks):
                ready, _ = ray.wait(list(tasks),
                                    timeout=0.01,
                                    num_returns=1,
                                    fetch_local=False)
                if not ready:
                    # Scale up only while input remains: an actor started
                    # after the last block was dispatched could never be
                    # given any work.
                    if blocks_in and \
                            len(ready_workers) / len(self.workers) > 0.8:
                        w = BlockWorker.remote()
                        self.workers.append(w)
                        tasks[w.ready.remote()] = w
                        map_bar.set_description(
                            "Map Progress ({} actors {} pending)".format(
                                len(ready_workers),
                                len(self.workers) - len(ready_workers)))
                    continue

                [obj_id] = ready
                worker = tasks.pop(obj_id)

                # Process task result. The first ref completed by an actor is
                # its `ready` handshake; every later one is a block result.
                if worker in ready_workers:
                    blocks_out.append(obj_id)
                    map_bar.update(1)
                else:
                    ready_workers.add(worker)

                # Schedule a new task on the now-idle actor.
                if blocks_in:
                    index, (block, meta) = blocks_in.pop()
                    block_ref, meta_ref = worker.process_block.remote(
                        block, meta)
                    metadata_mapping[block_ref] = meta_ref
                    block_index[block_ref] = index
                    tasks[block_ref] = worker

            # Results were collected in completion order; restore input
            # order before pairing each block with its metadata.
            blocks_out.sort(key=block_index.__getitem__)
            new_metadata = ray.get([metadata_mapping[b] for b in blocks_out])
        finally:
            # Close the bar even if waiting, scheduling or fetching raised.
            map_bar.close()
        return BlockList(blocks_out, new_metadata)