Example #1
class WorkQueue:
    def __init__(self, max_depth: int = 8):
        self._queue = Queue(maxsize=max_depth)

    def get_queue(self):
        """
        :return: Ray Queue actor, needed by the consumers.
        """
        return self._queue

    def empty(self):
        """
        :return: True if the queue is empty, False otherwise.
        """
        return self._queue.empty()

    def group(self, labels_all: np.ndarray, probs_all: np.ndarray,
              filename: str, original_shape: tuple, inference_time_sec: float,
              page_number: int) -> dict:
        return {
            "labels_all": labels_all,
            "probs_all": probs_all,
            "filename": filename,
            "original_shape": original_shape,
            "inference_time_sec": inference_time_sec,
            "page_number": page_number
        }

    def ungroup(self, dictionary):
        """
        Use like: labels_all, probs_all, filename, original_shape, inference_time_sec, page_number = ungroup(d)
        :param dictionary: a dictionary created with the group() method.
        :return: the individual values, in the same order as the group() arguments.
        """
        return (dictionary["labels_all"], dictionary["probs_all"],
                dictionary["filename"], dictionary["original_shape"],
                dictionary["inference_time_sec"], dictionary["page_number"])

    def push(self, dictionary):
        """
        Push a dictionary of params to post-process. For flow control, this blocks
        while the queue is full and proceeds once the queue has enough space.
        :param dictionary: a dictionary created with the group() method.
        :return: None
        """
        # put in object store
        ref = ray.put(dictionary)
        # put ref in queue
        self._queue.put(ref)
        return None

    def pop(self):
        """
        :return: a dictionary created with the group() method; use ungroup() to unpack it, or look up keys individually.
        """
        return self._queue.get()
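A minimal sketch of how a consumer might drain this WorkQueue (the postprocess() call is a hypothetical placeholder, not part of the original example):

import ray

@ray.remote
def consume(work_queue):
    # Drain the queue; pop() may hand back the ObjectRef that push() enqueued,
    # so dereference it explicitly to be safe.
    while not work_queue.empty():
        item = work_queue.pop()
        if isinstance(item, ray.ObjectRef):
            item = ray.get(item)
        (labels_all, probs_all, filename, original_shape,
         inference_time_sec, page_number) = work_queue.ungroup(item)
        postprocess(labels_all, probs_all, filename)  # hypothetical post-processing step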
Example #2
def test_simple_usage(ray_start_regular_shared):

    q = Queue()

    items = list(range(10))

    for item in items:
        q.put(item)

    for item in items:
        assert item == q.get()
Example #3
def test_async_get(ray_start_regular_shared):
    q = Queue()
    future = async_get.remote(q)

    with pytest.raises(Empty):
        q.get_nowait()

    with pytest.raises(GetTimeoutError):
        ray.get(future, timeout=0.1)  # task not canceled on timeout.

    q.put(1)
    assert ray.get(future) == 1
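The async_get helper referenced above is defined elsewhere in Ray's queue tests; presumably it is just a remote task that blocks inside the worker until an item arrives, roughly:

import ray

@ray.remote
def async_get(queue):
    # Blocks in the worker until the producer puts an item.
    return queue.get(block=True)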
Example #4
def main(args):
    if not os.path.isdir(args.data_dir):
        os.makedirs(args.data_dir)
    guarantees_path = args.guarantees
    if not os.path.isfile(guarantees_path):
        sys.exit(f'{guarantees_path} is not a file')
    with open(guarantees_path, 'r') as guarantees_file:
        guarantees = json.load(guarantees_file)['patterns']
        logger.info('Read in %d guarantees', len(guarantees))
    #create folder and files
    folder_path = f'{args.data_dir}/{get_folder_name(args)}'
    if not os.path.isdir(folder_path):
        os.mkdir(folder_path)
        logger.info('Created folder %s', folder_path)
    data_gen_stats_file = os.path.join(folder_path, 'data_gen_stats.json')
    flag_filepath = os.path.join(folder_path, 'args.json')
    args_dict = vars(args)
    with open(flag_filepath, 'w') as flag_file:
        json.dump(args_dict, flag_file, indent=2)
    logger.info('Command line arguments written to %s', flag_filepath)
    progress_actor = common.ProgressActor.remote()  # pylint: disable=no-member
    samples_queue = Queue(maxsize=args.num_samples)
    # pylint: disable=no-member
    timeouts_queue = Queue(maxsize=args.num_samples)
    ds_actor = DataSetActor.remote(guarantees, progress_actor, args_dict,
                                   samples_queue, timeouts_queue)
    dataset_writer_result = common.csv_dataset_writer.remote(
        samples_queue, folder_path, args.num_samples, args.train_frac,
        args.val_frac, args.test_frac)
    timeouts_file = os.path.join(folder_path, 'timeouts.csv')
    timeouts_writer_result = common.csv_file_writer.remote(
        timeouts_queue, timeouts_file)
    worker_results = [
        strix.wrapper.worker.remote(ds_actor,
                                    args.strix_bin,
                                    strix_auto=args.strix_auto,
                                    strix_timeout=args.strix_timeout,
                                    id=i) for i in range(args.num_worker)
    ]
    common.progress_bar(progress_actor, args.num_samples, data_gen_stats_file)
    ray.get(worker_results)
    ray.get(dataset_writer_result)
    timeouts_queue.put(None)
    ray.get(timeouts_writer_result)
    split_dataset = from_dir(folder_path)
    stats = split_dataset.circuit_stats(['train', 'val', 'test'])
    stats_file = os.path.join(folder_path, 'circuit-stats.json')
    write_stats(stats, stats_file)
    plot_file = os.path.join(folder_path, 'circuit-stats.png')
    plot_stats(stats, plot_file)
    split_dataset.shuffle()
    split_dataset.save(folder_path)
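In the flow above, the None pushed into timeouts_queue acts as a shutdown sentinel for common.csv_file_writer, which is defined elsewhere in the project. A purely illustrative sketch of that drain-until-sentinel pattern (the row formatting is made up):

import ray

@ray.remote
def csv_file_writer(queue, filepath):
    # Illustrative only: append rows from the queue to a CSV file
    # until the None sentinel signals shutdown.
    with open(filepath, 'w') as f:
        while True:
            row = queue.get(block=True)
            if row is None:
                break
            f.write(','.join(str(value) for value in row) + '\n')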
Example #5
def test_async_put(ray_start_regular_shared):
    q = Queue(1)
    q.put(1)
    future = async_put.remote(q, 2)

    with pytest.raises(Full):
        q.put_nowait(3)

    with pytest.raises(GetTimeoutError):
        ray.get(future, timeout=0.1)  # task not canceled on timeout.

    assert q.get() == 1
    assert q.get() == 2
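As with async_get, the async_put helper used here is presumably a remote task that blocks inside the worker until the queue has space, roughly:

import ray

@ray.remote
def async_put(queue, item):
    # Blocks in the worker until the consumer frees a slot.
    queue.put(item, block=True)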
Example #6
def test_qsize(ray_start_regular_shared):

    q = Queue()

    items = list(range(10))
    size = 0

    assert q.qsize() == size

    for item in items:
        q.put(item)
        size += 1
        assert q.qsize() == size

    for item in items:
        assert q.get() == item
        size -= 1
        assert q.qsize() == size
Example #7
def test_get(ray_start_regular_shared):

    q = Queue()

    item = 0
    q.put(item)
    assert q.get(block=False) == item

    item = 1
    q.put(item)
    assert q.get(timeout=0.2) == item

    with pytest.raises(ValueError):
        q.get(timeout=-1)

    with pytest.raises(Empty):
        q.get_nowait()

    with pytest.raises(Empty):
        q.get(timeout=0.2)
Example #8
def test_put(ray_start_regular_shared):

    q = Queue(1)

    item = 0
    q.put(item, block=False)
    assert q.get() == item

    item = 1
    q.put(item, timeout=0.2)
    assert q.get() == item

    with pytest.raises(ValueError):
        q.put(0, timeout=-1)

    q.put(0)
    with pytest.raises(Full):
        q.put_nowait(1)

    with pytest.raises(Full):
        q.put(1, timeout=0.2)
Example #9
def sweep():
    ''' Run sweep of parameters '''
    if args.server is not None:
        # For remote sweep we create the following directory structure:
        #      1/     2/         3/       4/
        # <repo>/<logs>/<platform>/<design>/
        repo_dir = abspath(LOCAL_DIR + '/../' * 4)
    else:
        repo_dir = abspath('../')
    print(f'[INFO TUN-0012] Log dir {LOCAL_DIR}.')
    queue = Queue()
    for name, content in config_dict.items():
        if not isinstance(content, list):
            continue
        for i in np.arange(*content):
            config_dict[name] = i
            queue.put([repo_dir, config_dict, LOCAL_DIR])
    workers = [consumer.remote(queue) for _ in range(args.jobs)]
    print('[INFO TUN-0009] Waiting for results.')
    ray.get(workers)
    print('[INFO TUN-0010] Sweep complete.')
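The consumer workers used by sweep() are defined elsewhere; given the items pushed above, each one presumably drains parameter sets until the queue is exhausted. An illustrative sketch (run_job() is a hypothetical placeholder for launching one flow run):

import ray
from ray.util.queue import Empty

@ray.remote
def consumer(queue):
    # Drain [repo_dir, config, log_dir] items until the queue is empty.
    while True:
        try:
            repo_dir, config, log_dir = queue.get(block=False)
        except Empty:
            break
        run_job(repo_dir, config, log_dir)  # hypothetical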
Example #10
class Detector(LoggerMixin, SlackMixin):
    def __init__(
        self,
        source: str,
        dest: str,
        batch_size: int,
        tiles: int,
        webhook: str,
        gpu: bool,
    ) -> None:
        SlackMixin.__init__(self, webhook)
        self._source = source
        self._dest = dest
        self._batch_size = batch_size
        self._n_tiles = tiles
        self._threads = []

        self._q_to_file_reader = Queue()
        self._q_freader_to_detector = Queue(maxsize=Config.Q_READER_NET_RUNNER)
        self._q_detector_payload_runner = Queue(
            maxsize=Config.Q_NET_RUNNER_PAYLOAD_RUNNER)
        self._q_payload_runner_fwriter = Queue(
            maxsize=Config.Q_PAYLOAD_RUNNER_WRITER)
        self.logger.info("Queues initialized")

        self._file_reader_thread = FileReaderThread(
            queue_in=self._q_to_file_reader,
            queue_out=self._q_freader_to_detector,
        )
        self._model = YOLOv4("yolov4", device="gpu" if gpu else "cpu")
        self._net_runner_thread = NetRunnerThread(
            queue_in=self._q_freader_to_detector,
            queue_out=self._q_detector_payload_runner,
            model=self._model,
        )
        self._payload_runner = PayloadRunnerActor(
            queue_in=self._q_detector_payload_runner,
            queue_out=self._q_payload_runner_fwriter,
            payload=Config.get_payload(),
        )
        self._result_processor = TheResultProcessor(dest)
        self._result_processor_thread = ResultWriterThread(
            result_writer=self._result_processor,
            queue_in=self._q_payload_runner_fwriter,
        )
        self._threads.append(self._file_reader_thread)
        self._threads.append(self._net_runner_thread)  # type: ignore
        self._threads.append(self._result_processor_thread)  # type: ignore
        self._start()
        self._log_message("Detector started")

    def process_images(self):
        for image_path in self._get_images_to_process():
            self._q_to_file_reader.put(image_path)
            self._log_message(f"Image {os.path.basename(image_path)} "
                              f"sent to file reader")

    def _get_images_to_process(self) -> t.Generator:
        for item in os.listdir(self._source):
            if any(item.endswith(ext.lower()) for ext in Config.ALLOWED_EXTS):
                yield os.path.join(self._source, item)
            else:
                self.logger.warning(
                    f"Cannot process file: {item}. Unsupported extension")

    def _log_message(self, message: str) -> None:
        self.logger.info(message)
        self.slack_msg(message)

    def _start(self) -> None:
        for thread in self._threads:
            thread.start()

    def stop(self) -> None:
        self._q_to_file_reader.put("KILL")
        for thread in self._threads:
            thread.join()
        self._log_message("Detected stopped")
Example #11
    def to_csv(cls, qc, **kwargs):
        if not cls._to_csv_check_support(kwargs):
            return BaseIO.to_csv(qc, **kwargs)

        # The queue holds the id of the partition whose turn it is
        # to write to the file
        queue = Queue(maxsize=1)

        def func(df, **kw):
            if kw["partition_idx"] != 0:
                # only the first partition creates a new file;
                # all later partitions append to it
                if "w" in kwargs["mode"]:
                    kwargs["mode"] = kwargs["mode"].replace("w", "a")
                # only the first partition writes the header
                kwargs["header"] = False

            # for parallelization purposes, each partition is written to an intermediate buffer
            path_or_buf = kwargs["path_or_buf"]
            is_binary = "b" in kwargs["mode"]
            if is_binary:
                kwargs["path_or_buf"] = io.BytesIO()
            else:
                kwargs["path_or_buf"] = io.StringIO()
            df.to_csv(**kwargs)
            content = kwargs["path_or_buf"].getvalue()
            kwargs["path_or_buf"].close()

            # each worker waits for its turn to write to the file;
            # if the index taken from the queue is not its own,
            # it puts the index back and retries
            while True:
                get_value = queue.get(block=True)
                if get_value == kw["partition_idx"]:
                    break
                queue.put(get_value)

            # preparing to write data from the buffer to a file
            with pandas.io.common.get_handle(
                path_or_buf,
                # when a URL is used in implicit text mode, pandas tries
                # to open `path_or_buf` in binary mode
                kwargs["mode"] if is_binary else kwargs["mode"] + "t",
                encoding=kwargs["encoding"],
                errors=kwargs["errors"],
                compression=kwargs["compression"],
                storage_options=kwargs["storage_options"],
                is_text=False,
            ) as handles:
                handles.handle.write(content)

            # signal that the next process can start writing to the file
            queue.put(get_value + 1)

            # used for synchronization purposes
            return 0

        # signaling that the partition with id==0 can be written to the file
        queue.put(0)
        result = qc._modin_frame._frame_mgr_cls.map_axis_partitions(
            axis=1,
            partitions=qc._modin_frame._partitions,
            map_func=func,
            keep_partitioning=True,
            lengths=None,
            enumerate_partitions=True,
        )

        # wait for all partitions to finish writing
        for rows in result:
            for partition in rows:
                wait([partition.oid])
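Stripped of the CSV details, the ordering trick described in the comments above is a token-passing scheme: a maxsize=1 queue holds the index of the partition whose turn it is to write, and any worker that draws the wrong index puts it back. A minimal sketch with made-up helper names:

def wait_for_turn(queue, my_idx):
    # Spin until the token matches this partition's index;
    # otherwise return it to the queue for the rightful owner.
    while True:
        token = queue.get(block=True)
        if token == my_idx:
            return
        queue.put(token)

def finish_turn(queue, my_idx):
    # Pass the token to the next partition in line.
    queue.put(my_idx + 1)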
Example #12
import ray
from ray.util.queue import Queue

ray.init()
# You can pass this object around to different tasks/actors
queue = Queue(maxsize=100)


@ray.remote
def consumer(queue):
    next_item = queue.get(block=True)
    print(f'got work {next_item}')


consumers = [consumer.remote(queue) for _ in range(3)]

for i in range(10):
    queue.put(i)

print(ray.nodes())
Example #13
    def to_csv(cls, qc, **kwargs):
        if not cls._to_csv_check_support(kwargs):
            return BaseIO.to_csv(qc, **kwargs)

        # The queue holds the id of the partition whose turn it is
        # to write to the file
        queue = Queue(maxsize=1)

        def func(df, **kw):
            if kw["partition_idx"] != 0:
                # only the first partition creates a new file;
                # all later partitions append to it
                if "w" in kwargs["mode"]:
                    kwargs["mode"] = kwargs["mode"].replace("w", "a")
                # only the first partition writes the header
                kwargs["header"] = False

            # for parallelization purposes, each partition is written to an intermediate buffer
            path_or_buf = kwargs["path_or_buf"]
            is_binary = "b" in kwargs["mode"]
            if is_binary:
                kwargs["path_or_buf"] = io.BytesIO()
            else:
                kwargs["path_or_buf"] = io.StringIO()
            df.to_csv(**kwargs)
            content = kwargs["path_or_buf"].getvalue()

            # each worker waits for its turn to write to the file;
            # if the index taken from the queue is not its own,
            # it puts the index back and retries
            while True:
                get_value = queue.get(block=True)
                if get_value == kw["partition_idx"]:
                    break
                queue.put(get_value)

            # preparing to write data from the buffer to a file
            open_kwargs = {"mode": kwargs["mode"]}
            if not is_binary:
                # in the buffer, newline symbols have already been translated
                # for the current operating system;
                # in the process of writing the buffer to the file,
                # newline symbols must be left unchanged
                open_kwargs["newline"] = ""
            with open(path_or_buf, **open_kwargs) as _f:
                _f.write(content)
            # signal that the next process can start writing to the file
            queue.put(get_value + 1)

            # used for synchronization purposes
            return 0

        # signaling that the partition with id==0 can be written to the file
        queue.put(0)
        result = qc._modin_frame._frame_mgr_cls.map_axis_partitions(
            axis=1,
            partitions=qc._modin_frame._partitions,
            map_func=func,
            keep_partitioning=True,
            lengths=None,
            enumerate_partitions=True,
        )

        # wait for all partitions to finish writing
        for rows in result:
            for partition in rows:
                wait([partition.oid])
Example #14
class RayHandler:
    def __init__(self,
                 fc_data,
                 behav_data,
                 behav,
                 covars,
                 n_perm=0,
                 **ray_kwargs):
        self.behav_data = behav_data  # For adding kfold_indices

        ray.shutdown()  # Make sure Ray is only initialised once
        self.ray_info = ray.init(**ray_kwargs)

        self.in_queue = Queue()
        self.out_queue = Queue()
        self.status_queue = Queue()
        self.report_queue = Queue()

        self.status_dict = {}
        self.actors_list = []

        # Create dictionaries to keep results. Doing this class-wide lets us add
        # results on-the-fly and keeps them available for later reference if the
        # get_*_results functions are called too early, for example.
        self.fselection_results = {}
        # Sub-dictionary for the original (i.e. non-permuted) data
        self.fselection_results[-1] = {}
        self.prediction_results = {}

        self.data_dict = {}
        self.data_dict['behav'] = behav
        self.data_dict['covars'] = covars
        self.data_dict['n_perm'] = n_perm
        self.data_dict['data'] = fc_data
        self.data_dict['edges'] = self.data_dict['data'].columns.astype(
            str)  # Save edges columns before adding behavioral columns
        # Pingouin needs all the data (edges, behav, and covars) in a single DataFrame
        if covars:
            self.data_dict['data'][covars] = behav_data[covars]
        if n_perm > 0:
            # It seems to be more efficient to create a separate df and concat later;
            # .to_frame() converts Pandas series into a DataFrame on-the-fly
            behav_df = behav_data[behav].to_frame()
            for perm in range(n_perm):
                behav_df["{}-perm-{}".format(
                    behav, perm)] = np.random.permutation(behav_df[behav])
                # Sub-dictionary to keep fselection results for this permutation
                self.fselection_results[perm] = {}
            # To avoid fragmentation (and the corresponding warning),
            # consolidate into a new DataFrame
            behav_df = behav_df.copy()
            self.data_dict['data'] = pd.concat(
                [self.data_dict['data'], behav_df], axis=1)
        else:
            self.data_dict['data'][behav] = behav_data[behav]
        self.data_dict['data'].columns = self.data_dict['data'].columns.astype(
            str)

    def add_kfold_indices(self, n_folds, clean=True):
        subject_ids = self.data_dict['data'].index
        kfold_indices = get_kfold_indices(subject_ids, n_folds)
        if clean:
            kfold_indices = clean_kfold_indices(kfold_indices, self.behav_data)
        self.data_dict['kfold_indices'] = kfold_indices
        printv("You need to (re-) upload data after this operation.")

    def upload_data(self):
        # Allows us to manipulate data in-class before uploading.
        # TODO: Put this and the start_workers function back into __init__()?
        # -> No: permutation and after-the-fact data manipulation require uploading later.
        self.data_object = ray.put(self.data_dict)

    def start_workers(self, n_workers):
        printv("Starting {} workers".format(n_workers))
        self.workers = [
            RayWorker.remote(self.data_object, self.in_queue, self.out_queue,
                             self.status_queue) for _ in range(n_workers)
        ]

    def start_actors(self):
        qsize = self.in_queue.qsize()
        printv("Starting actors for {} jobs...".format(qsize))
        self.actors = [
            RayActor.remote(self.data_object, self.in_queue, self.out_queue,
                            self.status_queue) for _ in range(qsize)
        ]

    def start_fselection(self, train_subs, fold, perm):  # OUTDATED
        actor = RayActor.remote(self.data_object,
                                self.in_queue,
                                self.out_queue,
                                self.status_queue,
                                auto_start=False)
        # We don't need to keep the returned ObjectRef as results are sent to out_queue
        actor.edgewise_pcorr.remote(train_subs, fold, perm)
        self.actors_list.append(actor)

    def submit_fselection(self, train_subs, fold, perm=-1):
        # perm=-1 means original data and is the default
        self.in_queue.put(['fselection', train_subs, fold, perm])

    def submit_prediction(self,
                          mask,
                          kfold_indices_train,
                          kfold_indices_test,
                          fold,
                          perm=-1):
        self.in_queue.put([
            'prediction', mask, kfold_indices_train, kfold_indices_test, fold,
            perm
        ])

    def get_results(self, queue, n=100):
        """
      Common get function utilised by get_{prediction,fselection}_results
      Input: queue to get from, max number of items to get at once
      Output: combined results
      """
        N_total = 0
        results = []
        while not queue.empty():
            N = queue.qsize()
            if N_total < N:
                N_total = N
            # Retrieval is split into batches of at most n items to provide some
            # sort of progress display; never request more than the queue holds.
            if N < n:
                n = N
            printv("Retrieving results: {} of {}".format(
                len(results) + n, N_total),
                   update=True)
            items = queue.get_nowait_batch(n)
            for item in items:
                results.append(item)
        return results

    def get_fselection_results(self):
        results = self.get_results(self.out_queue)
        n = 1
        N = len(results)
        printv("\n")
        for result in results:
            fold = result[0]
            perm = result[1]
            df = result[2]
            printv("Rearranging result {} of {}".format(n, N), update=True)
            self.fselection_results[perm][fold] = df
            n += 1
        #return self.fselection_results

    def get_prediction_results(self):
        results = self.get_results(self.out_queue)
        for results_dict in results:
            if results_dict['perm'] not in self.prediction_results:
                self.prediction_results[results_dict['perm']] = pd.DataFrame()
                self.prediction_results[
                    results_dict['perm']]['observed'] = self.data_dict['data'][
                        self.data_dict['behav']]
            for tail in ('pos', 'neg', 'glm'):
                self.prediction_results[results_dict['perm']].loc[
                    results_dict['test_IDs'], [tail]] = results_dict[tail]
        return self.prediction_results

    def status(self, verbose=True):
        N = self.status_queue.size()
        status_list_list = self.status_queue.get_nowait_batch(N)
        printv("Retrieving {} items from status queue...".format(N))
        for status_list in status_list_list:
            pid = status_list[0]
            node = status_list[1]
            msg = status_list[2]
            self.status_dict[pid] = {"msg": msg, "node": node}
        n = 1
        for pid, info in self.status_dict.items():
            if (info['msg']):  # Only print alive actors (-> msg != None)
                print("Actor {} [{}@{}]: {}".format(n, pid, info['node'],
                                                    info['msg']))
                n += 1
        print("\n")
        out_size = self.out_queue.qsize()
        in_size = self.in_queue.qsize()
        print("Jobs done: {}".format(out_size))
        print("Jobs remaining in queue: {}".format(in_size))

        return out_size, in_size

    def terminate(self):
        ray.shutdown()
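RayWorker and RayActor are defined elsewhere; from the submit_* and get_*_results methods above, each worker presumably pulls job lists from in_queue, dispatches on the first element, and pushes results onto out_queue. A purely illustrative skeleton (the run_* helpers are hypothetical):

from ray.util.queue import Empty

def worker_loop(data, in_queue, out_queue):
    while True:
        try:
            job = in_queue.get(block=False)
        except Empty:
            break
        if job[0] == 'fselection':
            _, train_subs, fold, perm = job
            # get_fselection_results() expects [fold, perm, DataFrame]
            out_queue.put([fold, perm, run_fselection(data, train_subs, perm)])
        elif job[0] == 'prediction':
            _, mask, train_idx, test_idx, fold, perm = job
            # get_prediction_results() expects a dict with 'perm', 'test_IDs',
            # 'pos', 'neg' and 'glm' keys
            out_queue.put(run_prediction(data, mask, train_idx, test_idx, fold, perm))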
Example #15
class DistributedPool(DistributedPoolAPI):
    """PoolAPI is an abstract class defining a resource Pool.
    A resource pool object which controls a pool of ressources (CPU, GPU, ...) to which jobs can be submitted. It supports asynchronous results with timeouts and callbacks and has a parallel map implementation.
    """
    def __init__(
        self,
        n_worker: int,
        n_cpu_per_worker: int,
        memory_limit_per_worker: float = 0,
        n_gpu_per_worker: float = 0,
        max_pending_task: int = 10000,
        local_pool_class: Type[LocalPoolAPI] = LocalPool,
    ) -> None:
        """RayDistributedCluster constructor"""

        super().__init__(
            n_worker=n_worker,
            n_cpu_per_worker=n_cpu_per_worker,
            memory_limit_per_worker=memory_limit_per_worker,
            n_gpu_per_worker=n_gpu_per_worker,
            local_pool_class=local_pool_class,
        )
        self.max_pending_task = max_pending_task
        # create task and results queues
        self.task_queue = Queue(max_pending_task)
        self.result_queue = Queue(max_pending_task)

        # consume processed results from result_queue
        # start consuming result queue
        def consume_result_queue():
            self.started = True
            while self.started:
                try:
                    result = self.result_queue.get(timeout=1, block=True)
                    if type(result) is str:
                        continue

                    if result and result.task_id in self.processed_results:
                        self.processed_results[result.task_id].result = result
                        del self.processed_results[result.task_id]
                except Empty:
                    continue
                except (RayActorError, AttributeError):
                    break

        self.result_consumer_thread = threading.Thread(
            target=consume_result_queue)
        self.result_consumer_thread.start()

        # start actors
        opt = {
            "num_cpus": n_cpu_per_worker,
            "num_gpus": n_gpu_per_worker,
        }

        self.actor_pool = [
            _RayExecutorActor.options(**opt).remote(  # type: ignore
                self.task_queue,
                self.result_queue,
                self.create_local_pool(n_cpu=n_cpu_per_worker,
                                       memory_limit=0,
                                       n_visible_gpu=[]),
            ) for _ in range(n_worker)
        ]
        for a in self.actor_pool:
            a.start.remote()

        # wait agent ready
        # self.result_queue.get(block=True, timeout=30)

        self.processed_results: Dict[uuid.UUID, _RayAsyncResult] = {}

    def apply_async(
        self,
        func: Callable[..., _OutputType],
        args: Optional[Iterable[Any]] = None,
        kwds: Optional[Mapping[str, Any]] = None,
        callback: Optional[Callable[[_OutputType], None]] = None,
        error_callback: Optional[Callable[[BaseException], None]] = None,
    ) -> AsyncResult[_OutputType]:
        __doc__ = super().apply_async.__doc__  # noqa: F841
        task = _RayTask(
            task_id=uuid4(),
            func=func,
            args=args,
            kwds=kwds,
            callback=callback,
            error_callback=error_callback,
        )
        self.task_queue.put(task)
        async_res = _RayAsyncResult[_OutputType](task.task_id)
        self.processed_results[task.task_id] = async_res
        return async_res

    def map_async(
        self,
        func: Callable[[_InputType], _OutputType],
        iterable: Iterable[_InputType],
        chunksize: Optional[int] = 1,
        callback: Optional[Callable[[_OutputType], None]] = None,
        error_callback: Optional[Callable[[BaseException], None]] = None,
    ) -> MapResult[_OutputType]:
        __doc__ = super().map_async.__doc__  # noqa: F841

        chunks_async_results: List[AsyncResult[List[_OutputType]]] = []
        for c in mitertools.divide(self.n_worker, iterable=iterable):
            task = _RayMapTask(
                task_id=uuid4(),
                func=func,
                args=c,
                callback=callback,
                error_callback=error_callback,
            )

            chunk_async_result = _RayAsyncResult[List[_OutputType]](
                task.task_id)
            chunks_async_results.append(chunk_async_result)
            self.processed_results[task.task_id] = chunk_async_result

            self.task_queue.put(task)

        async_res: MapResult[_OutputType] = _RayAsyncMapResult[_OutputType](
            async_results=chunks_async_results)

        return async_res

    def terminate(self) -> None:
        __doc__ = super().terminate.__doc__  # noqa: F841

        self.close()
        for a in self.actor_pool:
            ray.kill(a)

    def close(self) -> None:
        __doc__ = super().close.__doc__  # noqa: F841
        self.started = False
        for a in self.actor_pool:
            a.stop.remote()

        ray.kill(self.result_queue.actor)
        ray.kill(self.task_queue.actor)

        sleep(1)

    def create_local_pool(self,
                          n_cpu: int = 0,
                          memory_limit: float = 0,
                          n_visible_gpu: List[int] = []) -> LocalPoolAPI:
        if memory_limit == 0:
            memory_limit = self.memory_limit_per_worker
        return self.local_pool_class(n_cpu,
                                     memory_limit,
                                     n_visible_gpu,
                                     lazy=True)
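_RayExecutorActor is not shown; given the queues and the result-consumer thread above, its start() loop presumably pulls tasks from task_queue, executes them, and pushes an object carrying the task_id back onto result_queue. An illustrative sketch of that loop, shown as a plain function under assumed names:

from types import SimpleNamespace

from ray.exceptions import RayActorError
from ray.util.queue import Empty

def executor_loop(task_queue, result_queue):
    # Illustrative only: run tasks until the pool shuts the queues down.
    while True:
        try:
            task = task_queue.get(block=True, timeout=1)
        except Empty:
            continue
        except RayActorError:
            break  # close() killed the queue actors
        value = task.func(*(task.args or ()), **(task.kwds or {}))
        # The pool's consumer thread matches this back to an AsyncResult by task_id.
        result_queue.put(SimpleNamespace(task_id=task.task_id, result=value))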
Example #16
File: run.py Project: hughplay/memo
        N_GPU_PER_THREAD = args.gpu
        n_thread = min(len(jobs), args.thread)
        total_cpus = N_CPU_PER_THREAD * n_thread
        total_gpus = N_GPU_PER_THREAD * n_thread
        ray.init(num_cpus=total_cpus, num_gpus=total_gpus)
        print(f"Number of threads: {n_thread}.")
        print(f"CPUs per thread: {N_CPU_PER_THREAD}.")
        print(f"Total CPUs: {total_cpus}.")
        if N_GPU_PER_THREAD > 0:
            print(f"GPUs per thread: {N_GPU_PER_THREAD}.")
            print(f"Total GPUs: {total_gpus}.")
        print(f"-----------------")

        jobs_queue = Queue()
        for job in jobs:
            jobs_queue.put(job)
        pb = ProgressBar(len(jobs_queue))
        actor = pb.actor
        job_list = []
        for _ in range(n_thread):
            job_list.append(process_jobs.remote(jobs_queue, args, actor))

        pb.print_until_done()
        job_results = list(chain(*ray.get(job_list)))
        ray.get(actor.get_counter.remote())

        result_path = Path(args.output_dir) / "result.jsonl"
        with jsonlines.open(result_path, "w") as writer:
            writer.write_all(job_results)
    else:
        print(f"No job to run.")