import numpy as np
import ray
from ray.util.queue import Queue


class WorkQueue:
    def __init__(self, max_depth: int = 8):
        self._queue = Queue(maxsize=max_depth)

    def get_queue(self):
        """
        :return: Ray Queue actor, needed by the consumers.
        """
        return self._queue

    def empty(self):
        """
        :return: True if the queue is empty, False otherwise.
        """
        return self._queue.empty()

    def group(self, labels_all: np.ndarray, probs_all: np.ndarray,
              filename: str, original_shape: tuple,
              inference_time_sec: float, page_number: int) -> dict:
        return {
            "labels_all": labels_all,
            "probs_all": probs_all,
            "filename": filename,
            "original_shape": original_shape,
            "inference_time_sec": inference_time_sec,
            "page_number": page_number
        }

    def ungroup(self, dictionary):
        """
        Use this like:
        labels_all, probs_all, filename, original_shape, \
            inference_time_sec, page_number = ungroup(d)

        :param dictionary: a dictionary created with the group() method.
        :return: the individual fields, in group() order.
        """
        return (dictionary["labels_all"], dictionary["probs_all"],
                dictionary["filename"], dictionary["original_shape"],
                dictionary["inference_time_sec"], dictionary["page_number"])

    def push(self, dictionary):
        """
        Push a dictionary of params to post-process. Blocks while the queue
        is full, for flow control, and proceeds once the queue has enough
        space.

        :param dictionary: a dictionary created with the group() method.
        :return: None
        """
        # Put the payload in the object store, then enqueue the reference
        ref = ray.put(dictionary)
        self._queue.put(ref)
        return None

    def pop(self):
        """
        :return: a dictionary created with the group() method; use ungroup()
            to unpack it, or look fields up individually.
        """
        return self._queue.get()
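A minimal usage sketch for WorkQueue. The `postprocess` consumer task and the placeholder payload values are illustrative, not part of the source; the consumer relies on pop()'s documented behavior that get() yields the group()-ed dictionary.

# Hedged usage sketch; `postprocess` and the payload values are hypothetical.
import ray

@ray.remote
def postprocess(queue):
    # Consumers receive the underlying Ray Queue via get_queue();
    # per pop()'s docstring, get() yields the group()-ed dictionary.
    while not queue.empty():
        d = queue.get()
        print(d["filename"], "page", d["page_number"])

wq = WorkQueue(max_depth=8)
for page in range(4):
    wq.push(wq.group(labels_all=None, probs_all=None, filename="doc.pdf",
                     original_shape=(1024, 768), inference_time_sec=0.1,
                     page_number=page))  # blocks when the queue is full

ray.get(postprocess.remote(wq.get_queue()))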
def test_simple_usage(ray_start_regular_shared):
    q = Queue()

    items = list(range(10))

    for item in items:
        q.put(item)

    for item in items:
        assert item == q.get()
def test_async_get(ray_start_regular_shared):
    q = Queue()
    future = async_get.remote(q)

    with pytest.raises(Empty):
        q.get_nowait()

    with pytest.raises(GetTimeoutError):
        ray.get(future, timeout=0.1)  # task not canceled on timeout.

    q.put(1)
    assert ray.get(future) == 1
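This test relies on an `async_get` remote task (and `test_async_put` further down relies on `async_put`) that blocks on the queue inside a Ray task; a minimal sketch consistent with how they are called:

@ray.remote
def async_get(queue):
    # Blocks inside the remote task until an item arrives
    return queue.get(block=True)

@ray.remote
def async_put(queue, item):
    # Blocks inside the remote task until the queue has room
    queue.put(item, block=True)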
def main(args):
    if not os.path.isdir(args.data_dir):
        os.makedirs(args.data_dir)

    guarantees_path = args.guarantees
    if not os.path.isfile(guarantees_path):
        sys.exit(f'{guarantees_path} is not a file')
    with open(guarantees_path, 'r') as guarantees_file:
        guarantees = json.load(guarantees_file)['patterns']
    logger.info('Read in %d guarantees', len(guarantees))

    # Create folder and files
    folder_path = f'{args.data_dir}/{get_folder_name(args)}'
    if not os.path.isdir(folder_path):
        os.mkdir(folder_path)
        logger.info('Created folder %s', folder_path)
    data_gen_stats_file = os.path.join(folder_path, 'data_gen_stats.json')
    flag_filepath = os.path.join(folder_path, 'args.json')
    args_dict = vars(args)
    with open(flag_filepath, 'w') as flag_file:
        json.dump(args_dict, flag_file, indent=2)
    logger.info('Command line arguments written to %s', flag_filepath)

    progress_actor = common.ProgressActor.remote()  # pylint: disable=no-member
    samples_queue = Queue(maxsize=args.num_samples)  # pylint: disable=no-member
    timeouts_queue = Queue(maxsize=args.num_samples)
    ds_actor = DataSetActor.remote(guarantees, progress_actor, args_dict,
                                   samples_queue, timeouts_queue)
    dataset_writer_result = common.csv_dataset_writer.remote(
        samples_queue, folder_path, args.num_samples, args.train_frac,
        args.val_frac, args.test_frac)
    timeouts_file = os.path.join(folder_path, 'timeouts.csv')
    timeouts_writer_result = common.csv_file_writer.remote(
        timeouts_queue, timeouts_file)
    worker_results = [
        strix.wrapper.worker.remote(ds_actor,
                                    args.strix_bin,
                                    strix_auto=args.strix_auto,
                                    strix_timeout=args.strix_timeout,
                                    id=i) for i in range(args.num_worker)
    ]
    common.progress_bar(progress_actor, args.num_samples, data_gen_stats_file)
    ray.get(worker_results)
    ray.get(dataset_writer_result)
    timeouts_queue.put(None)
    ray.get(timeouts_writer_result)

    split_dataset = from_dir(folder_path)
    stats = split_dataset.circuit_stats(['train', 'val', 'test'])
    stats_file = os.path.join(folder_path, 'circuit-stats.json')
    write_stats(stats, stats_file)
    plot_file = os.path.join(folder_path, 'circuit-stats.png')
    plot_stats(stats, plot_file)
    split_dataset.shuffle()
    split_dataset.save(folder_path)
def test_async_put(ray_start_regular_shared):
    q = Queue(1)
    q.put(1)
    future = async_put.remote(q, 2)

    with pytest.raises(Full):
        q.put_nowait(3)

    with pytest.raises(GetTimeoutError):
        ray.get(future, timeout=0.1)  # task not canceled on timeout.

    assert q.get() == 1
    assert q.get() == 2
def test_qsize(ray_start_regular_shared):
    q = Queue()

    items = list(range(10))
    size = 0

    assert q.qsize() == size

    for item in items:
        q.put(item)
        size += 1
        assert q.qsize() == size

    for item in items:
        assert q.get() == item
        size -= 1
        assert q.qsize() == size
def test_get(ray_start_regular_shared):
    q = Queue()

    item = 0
    q.put(item)
    assert q.get(block=False) == item

    item = 1
    q.put(item)
    assert q.get(timeout=0.2) == item

    with pytest.raises(ValueError):
        q.get(timeout=-1)

    with pytest.raises(Empty):
        q.get_nowait()

    with pytest.raises(Empty):
        q.get(timeout=0.2)
def test_put(ray_start_regular_shared):
    q = Queue(1)

    item = 0
    q.put(item, block=False)
    assert q.get() == item

    item = 1
    q.put(item, timeout=0.2)
    assert q.get() == item

    with pytest.raises(ValueError):
        q.put(0, timeout=-1)

    q.put(0)
    with pytest.raises(Full):
        q.put_nowait(1)

    with pytest.raises(Full):
        q.put(1, timeout=0.2)
def sweep():
    '''Run a sweep of parameters.'''
    if args.server is not None:
        # For a remote sweep we create the following directory structure:
        #      1/      2/        3/        4/
        # <repo>/<logs>/<platform>/<design>/
        repo_dir = abspath(LOCAL_DIR + '/../' * 4)
    else:
        repo_dir = abspath('../')
    print(f'[INFO TUN-0012] Log dir {LOCAL_DIR}.')
    queue = Queue()
    for name, content in config_dict.items():
        if not isinstance(content, list):
            continue
        for i in np.arange(*content):
            config_dict[name] = i
            queue.put([repo_dir, config_dict, LOCAL_DIR])
    workers = [consumer.remote(queue) for _ in range(args.jobs)]
    print('[INFO TUN-0009] Waiting for results.')
    ray.get(workers)
    print('[INFO TUN-0010] Sweep complete.')
class Detector(LoggerMixin, SlackMixin):
    def __init__(
        self,
        source: str,
        dest: str,
        batch_size: int,
        tiles: int,
        webhook: str,
        gpu: bool,
    ) -> None:
        SlackMixin.__init__(self, webhook)
        self._source = source
        self._dest = dest
        self._batch_size = batch_size
        self._n_tiles = tiles
        self._threads = []

        self._q_to_file_reader = Queue()
        self._q_freader_to_detector = Queue(
            maxsize=Config.Q_READER_NET_RUNNER)
        self._q_detector_payload_runner = Queue(
            maxsize=Config.Q_NET_RUNNER_PAYLOAD_RUNNER)
        self._q_payload_runner_fwriter = Queue(
            maxsize=Config.Q_PAYLOAD_RUNNER_WRITER)
        self.logger.info("Queues initialized")

        self._file_reader_thread = FileReaderThread(
            queue_in=self._q_to_file_reader,
            queue_out=self._q_freader_to_detector,
        )
        self._model = YOLOv4("yolov4", device="gpu" if gpu else "cpu")
        self._net_runner_thread = NetRunnerThread(
            queue_in=self._q_freader_to_detector,
            queue_out=self._q_detector_payload_runner,
            model=self._model,
        )
        self._payload_runner = PayloadRunnerActor(
            queue_in=self._q_detector_payload_runner,
            queue_out=self._q_payload_runner_fwriter,
            payload=Config.get_payload(),
        )
        self._result_processor = TheResultProcessor(dest)
        self._result_processor_thread = ResultWriterThread(
            result_writer=self._result_processor,
            queue_in=self._q_payload_runner_fwriter,
        )
        self._threads.append(self._file_reader_thread)
        self._threads.append(self._net_runner_thread)  # type: ignore
        self._threads.append(self._result_processor_thread)  # type: ignore
        self._start()
        self._log_message("Detector started")

    def process_images(self):
        for image_path in self._get_images_to_process():
            self._q_to_file_reader.put(image_path)
            self._log_message(f"Image {os.path.basename(image_path)} "
                              f"sent to file reader")

    def _get_images_to_process(self) -> t.Generator:
        for item in os.listdir(self._source):
            if any(item.endswith(ext.lower()) for ext in Config.ALLOWED_EXTS):
                yield os.path.join(self._source, item)
            else:
                self.logger.warning(
                    f"Cannot process file: {item}. Unsupported extension")

    def _log_message(self, message: str) -> None:
        self.logger.info(message)
        self.slack_msg(message)

    def _start(self) -> None:
        for thread in self._threads:
            thread.start()

    def stop(self) -> None:
        self._q_to_file_reader.put("KILL")
        for thread in self._threads:
            thread.join()
        self._log_message("Detector stopped")
def to_csv(cls, qc, **kwargs):
    if not cls._to_csv_check_support(kwargs):
        return BaseIO.to_csv(qc, **kwargs)

    # The queue holds the id of the partition whose turn it is
    # to write to the file
    queue = Queue(maxsize=1)

    def func(df, **kw):
        if kw["partition_idx"] != 0:
            # A new file is created only for the first partition;
            # all other partitions are written in append mode
            if "w" in kwargs["mode"]:
                kwargs["mode"] = kwargs["mode"].replace("w", "a")
            # It is enough to write the header for the first partition
            kwargs["header"] = False

        # For parallelization purposes, each partition is written
        # to an intermediate buffer
        path_or_buf = kwargs["path_or_buf"]
        is_binary = "b" in kwargs["mode"]
        if is_binary:
            kwargs["path_or_buf"] = io.BytesIO()
        else:
            kwargs["path_or_buf"] = io.StringIO()

        df.to_csv(**kwargs)
        content = kwargs["path_or_buf"].getvalue()
        kwargs["path_or_buf"].close()

        # Each partition waits for its turn to write to the file;
        # if it draws an id other than its own from the queue,
        # it puts the id back
        while True:
            get_value = queue.get(block=True)
            if get_value == kw["partition_idx"]:
                break
            queue.put(get_value)

        # Write the buffered data to the file
        with pandas.io.common.get_handle(
            path_or_buf,
            # when using a URL in implicit text mode, pandas tries
            # to open `path_or_buf` in binary mode
            kwargs["mode"] if is_binary else kwargs["mode"] + "t",
            encoding=kwargs["encoding"],
            errors=kwargs["errors"],
            compression=kwargs["compression"],
            storage_options=kwargs["storage_options"],
            is_text=False,
        ) as handles:
            handles.handle.write(content)

        # Signal that the next partition can start writing to the file
        queue.put(get_value + 1)

        # Used for synchronization purposes
        return 0

    # Signal that the partition with id == 0 can be written to the file
    queue.put(0)

    result = qc._modin_frame._frame_mgr_cls.map_axis_partitions(
        axis=1,
        partitions=qc._modin_frame._partitions,
        map_func=func,
        keep_partitioning=True,
        lengths=None,
        enumerate_partitions=True,
    )

    # Wait for completion
    for rows in result:
        for partition in rows:
            wait([partition.oid])
import ray
from ray.util.queue import Queue

ray.init()

# You can pass this object around to different tasks/actors
queue = Queue(maxsize=100)

@ray.remote
def consumer(queue):
    next_item = queue.get(block=True)
    print(f'got work {next_item}')

consumers = [consumer.remote(queue) for _ in range(3)]

[queue.put(i) for i in range(10)]

print(ray.nodes())
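Each of the three consumers above takes exactly one item, so seven of the ten puts remain enqueued. A sketch of how the driver might drain the leftovers, assuming nothing else reads the queue:

# Wait for the consumer tasks, then drain what they left behind.
ray.get(consumers)
leftovers = queue.get_nowait_batch(queue.qsize())
print(f'left in queue: {leftovers}')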
def to_csv(cls, qc, **kwargs):
    if not cls._to_csv_check_support(kwargs):
        return BaseIO.to_csv(qc, **kwargs)

    # The queue holds the id of the partition whose turn it is
    # to write to the file
    queue = Queue(maxsize=1)

    def func(df, **kw):
        if kw["partition_idx"] != 0:
            # A new file is created only for the first partition;
            # all other partitions are written in append mode
            if "w" in kwargs["mode"]:
                kwargs["mode"] = kwargs["mode"].replace("w", "a")
            # It is enough to write the header for the first partition
            kwargs["header"] = False

        # For parallelization purposes, each partition is written
        # to an intermediate buffer
        path_or_buf = kwargs["path_or_buf"]
        is_binary = "b" in kwargs["mode"]
        if is_binary:
            kwargs["path_or_buf"] = io.BytesIO()
        else:
            kwargs["path_or_buf"] = io.StringIO()

        df.to_csv(**kwargs)
        content = kwargs["path_or_buf"].getvalue()

        # Each partition waits for its turn to write to the file;
        # if it draws an id other than its own from the queue,
        # it puts the id back
        while True:
            get_value = queue.get(block=True)
            if get_value == kw["partition_idx"]:
                break
            queue.put(get_value)

        # Write the buffered data to the file
        open_kwargs = {"mode": kwargs["mode"]}
        if not is_binary:
            # In the buffer, newline symbols have already been translated
            # for the current operating system; while writing the buffer
            # to the file, newline symbols must be left unchanged
            open_kwargs["newline"] = ""
        with open(path_or_buf, **open_kwargs) as _f:
            _f.write(content)

        # Signal that the next partition can start writing to the file
        queue.put(get_value + 1)

        # Used for synchronization purposes
        return 0

    # Signal that the partition with id == 0 can be written to the file
    queue.put(0)

    result = qc._modin_frame._frame_mgr_cls.map_axis_partitions(
        axis=1,
        partitions=qc._modin_frame._partitions,
        map_func=func,
        keep_partitioning=True,
        lengths=None,
        enumerate_partitions=True,
    )

    # Wait for completion
    for rows in result:
        for partition in rows:
            wait([partition.oid])
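Both to_csv variants above use the same synchronization idiom: a maxsize-1 queue holds a "turn token" (the index of the partition currently allowed to write), and every worker spins until it draws its own index. A stripped-down sketch of just that idiom, with hypothetical names:

# Minimal sketch of the turn-token idiom; names are illustrative.
import ray
from ray.util.queue import Queue

ray.init()
queue = Queue(maxsize=1)

@ray.remote
def write_partition(queue, idx):
    while True:
        turn = queue.get(block=True)
        if turn == idx:
            break
        queue.put(turn)  # not our turn: put the token back
    print(f"partition {idx} writing")  # the ordered critical section
    queue.put(turn + 1)  # pass the token to the next partition

queue.put(0)  # partition 0 goes first
ray.get([write_partition.remote(queue, i) for i in range(4)])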
class RayHandler:
    def __init__(self, fc_data, behav_data, behav, covars, n_perm=0,
                 **ray_kwargs):
        self.behav_data = behav_data  # For adding kfold_indices

        ray.shutdown()  # Make sure Ray is only initialised once
        self.ray_info = ray.init(**ray_kwargs)

        self.in_queue = Queue()
        self.out_queue = Queue()
        self.status_queue = Queue()
        self.report_queue = Queue()

        self.status_dict = {}
        self.actors_list = []

        # Create dictionaries to keep results. It makes sense to do this
        # class-wide, so results can be added on-the-fly and referenced
        # later, e.g. if the get-results functions are called too early.
        self.fselection_results = {}
        # Sub-dictionary for original (i.e. non-permuted) data
        self.fselection_results[-1] = {}
        self.prediction_results = {}

        self.data_dict = {}
        self.data_dict['behav'] = behav
        self.data_dict['covars'] = covars
        self.data_dict['n_perm'] = n_perm
        self.data_dict['data'] = fc_data
        # Save edge columns before adding behavioral columns
        self.data_dict['edges'] = self.data_dict['data'].columns.astype(str)

        # Pingouin needs all the data (edges, behav, and covars) in a
        # single DataFrame
        if covars:
            self.data_dict['data'][covars] = behav_data[covars]

        if n_perm > 0:
            # It seems to be more efficient to create a separate df and
            # concat later; .to_frame() converts the Pandas series into a
            # DataFrame on-the-fly
            behav_df = behav_data[behav].to_frame()
            for perm in range(n_perm):
                behav_df["{}-perm-{}".format(behav, perm)] = \
                    np.random.permutation(behav_df[behav])
                # Sub-dictionaries to keep fselection results of permutations
                self.fselection_results[perm] = {}
            # To avoid fragmentation (and the corresponding warning),
            # consolidate into a new DataFrame
            behav_df = behav_df.copy()
            self.data_dict['data'] = pd.concat(
                [self.data_dict['data'], behav_df], axis=1)
        else:
            self.data_dict['data'][behav] = behav_data[behav]

        self.data_dict['data'].columns = \
            self.data_dict['data'].columns.astype(str)

    def add_kfold_indices(self, n_folds, clean=True):
        subject_ids = self.data_dict['data'].index
        kfold_indices = get_kfold_indices(subject_ids, n_folds)
        if clean:
            kfold_indices = clean_kfold_indices(kfold_indices,
                                                self.behav_data)
        self.data_dict['kfold_indices'] = kfold_indices
        printv("You need to (re-)upload data after this operation.")

    def upload_data(self):
        # Allows us to manipulate data in-class before uploading.
        # TODO: Put this and start_workers() in __init__() again?
        # -> No, because of permutation and post-festum data manipulation!
        self.data_object = ray.put(self.data_dict)

    def start_workers(self, n_workers):
        printv("Starting {} workers".format(n_workers))
        self.workers = [
            RayWorker.remote(self.data_object, self.in_queue, self.out_queue,
                             self.status_queue) for _ in range(n_workers)
        ]

    def start_actors(self):
        qsize = self.in_queue.qsize()
        printv("Starting actors for {} jobs...".format(qsize))
        self.actors = [
            RayActor.remote(self.data_object, self.in_queue, self.out_queue,
                            self.status_queue) for _ in range(qsize)
        ]

    def start_fselection(self, train_subs, fold, perm):  # OUTDATED
        actor = RayActor.remote(self.data_object, self.in_queue,
                                self.out_queue, self.status_queue,
                                auto_start=False)
        # We don't need to keep the returned object as results are sent
        # to out_queue
        object = actor.edgewise_pcorr.remote(train_subs, fold, perm)
        self.actors_list.append(actor)

    def submit_fselection(self, train_subs, fold, perm=-1):
        # perm=-1 means original data and is the default
        self.in_queue.put(['fselection', train_subs, fold, perm])

    def submit_prediction(self, mask, kfold_indices_train,
                          kfold_indices_test, fold, perm=-1):
        self.in_queue.put([
            'prediction', mask, kfold_indices_train, kfold_indices_test,
            fold, perm
        ])

    def get_results(self, queue, n=100):
        """
        Common get function utilised by get_{prediction,fselection}_results.

        Input: queue to get from, max number of items to get at once
        Output: combined results
        """
        N_total = 0
        results = []
        while not queue.empty():
            N = queue.qsize()
            if N_total < N:
                N_total = N
            if N < n:  # Split to provide some sort of progress display
                n = N
            printv("Retrieving results: {} of {}".format(
                len(results) + n, N_total), update=True)
            items = queue.get_nowait_batch(n)
            for item in items:
                results.append(item)
        return results

    def get_fselection_results(self):
        results = self.get_results(self.out_queue)
        n = 1
        N = len(results)
        printv("\n")
        for result in results:
            fold = result[0]
            perm = result[1]
            df = result[2]
            printv("Rearranging result {} of {}".format(n, N), update=True)
            self.fselection_results[perm][fold] = df
            n += 1
        # return self.fselection_results

    def get_prediction_results(self):
        results = self.get_results(self.out_queue)
        for results_dict in results:
            if results_dict['perm'] not in self.prediction_results:
                self.prediction_results[results_dict['perm']] = \
                    pd.DataFrame()
                self.prediction_results[
                    results_dict['perm']]['observed'] = self.data_dict[
                        'data'][self.data_dict['behav']]
            for tail in ('pos', 'neg', 'glm'):
                self.prediction_results[results_dict['perm']].loc[
                    results_dict['test_IDs'], [tail]] = results_dict[tail]
        return self.prediction_results

    def status(self, verbose=True):
        N = self.status_queue.qsize()
        status_list_list = self.status_queue.get_nowait_batch(N)
        printv("Retrieving {} items from status queue...".format(N))
        for status_list in status_list_list:
            pid = status_list[0]
            node = status_list[1]
            msg = status_list[2]
            self.status_dict[pid] = {"msg": msg, "node": node}
        n = 1
        for pid, info in self.status_dict.items():
            if info['msg']:  # Only print alive actors (-> msg != None)
                print("Actor {} [{}@{}]: {}".format(n, pid, info['node'],
                                                    info['msg']))
                n += 1
        print("\n")
        out_size = self.out_queue.qsize()
        in_size = self.in_queue.qsize()
        print("Jobs done: {}".format(out_size))
        print("Jobs remaining in queue: {}".format(in_size))
        return out_size, in_size

    def terminate(self):
        ray.shutdown()
class DistributedPool(DistributedPoolAPI):
    """A distributed resource pool.

    Controls a pool of resources (CPU, GPU, ...) to which jobs can be
    submitted. It supports asynchronous results with timeouts and
    callbacks and has a parallel map implementation.
    """

    def __init__(
        self,
        n_worker: int,
        n_cpu_per_worker: int,
        memory_limit_per_worker: float = 0,
        n_gpu_per_worker: float = 0,
        max_pending_task: int = 10000,
        local_pool_class: Type[LocalPoolAPI] = LocalPool,
    ) -> None:
        """DistributedPool constructor"""
        super().__init__(
            n_worker=n_worker,
            n_cpu_per_worker=n_cpu_per_worker,
            memory_limit_per_worker=memory_limit_per_worker,
            n_gpu_per_worker=n_gpu_per_worker,
            local_pool_class=local_pool_class,
        )
        self.max_pending_task = max_pending_task

        # Create task and result queues
        self.task_queue = Queue(max_pending_task)
        self.result_queue = Queue(max_pending_task)

        # Consume processed results from result_queue in a background thread
        def consume_result_queue():
            self.started = True
            while self.started:
                try:
                    result = self.result_queue.get(timeout=1, block=True)
                    if type(result) is str:
                        continue
                    if result and result.task_id in self.processed_results:
                        self.processed_results[result.task_id].result = result
                        del self.processed_results[result.task_id]
                except Empty:
                    continue
                except (RayActorError, AttributeError):
                    break

        self.result_consumer_thread = threading.Thread(
            target=consume_result_queue)
        self.result_consumer_thread.start()

        # Start actors
        opt = {
            "num_cpus": n_cpu_per_worker,
            "num_gpus": n_gpu_per_worker,
        }
        self.actor_pool = [
            _RayExecutorActor.options(**opt).remote(  # type: ignore
                self.task_queue,
                self.result_queue,
                self.create_local_pool(n_cpu=n_cpu_per_worker,
                                       memory_limit=0,
                                       n_visible_gpu=[]),
            ) for _ in range(n_worker)
        ]
        for a in self.actor_pool:
            a.start.remote()
        # Wait until the agents are ready
        # self.result_queue.get(block=True, timeout=30)
        self.processed_results: Dict[uuid.UUID, _RayAsyncResult] = {}

    def apply_async(
        self,
        func: Callable[..., _OutputType],
        args: Optional[Iterable[Any]] = None,
        kwds: Optional[Mapping[str, Any]] = None,
        callback: Optional[Callable[[_OutputType], None]] = None,
        error_callback: Optional[Callable[[BaseException], None]] = None,
    ) -> AsyncResult[_OutputType]:
        __doc__ = super().apply_async.__doc__  # noqa: F841
        task = _RayTask(
            task_id=uuid4(),
            func=func,
            args=args,
            kwds=kwds,
            callback=callback,
            error_callback=error_callback,
        )
        self.task_queue.put(task)
        async_res = _RayAsyncResult[_OutputType](task.task_id)
        self.processed_results[task.task_id] = async_res
        return async_res

    def map_async(
        self,
        func: Callable[[_InputType], _OutputType],
        iterable: Iterable[_InputType],
        chunksize: Optional[int] = 1,
        callback: Optional[Callable[[_OutputType], None]] = None,
        error_callback: Optional[Callable[[BaseException], None]] = None,
    ) -> MapResult[_OutputType]:
        __doc__ = super().map_async.__doc__  # noqa: F841
        chunks_async_results: List[AsyncResult[List[_OutputType]]] = []
        for c in mitertools.divide(self.n_worker, iterable=iterable):
            task = _RayMapTask(
                task_id=uuid4(),
                func=func,
                args=c,
                callback=callback,
                error_callback=error_callback,
            )
            chunk_async_result = _RayAsyncResult[List[_OutputType]](
                task.task_id)
            chunks_async_results.append(chunk_async_result)
            self.processed_results[task.task_id] = chunk_async_result
            self.task_queue.put(task)
        async_res: MapResult[_OutputType] = _RayAsyncMapResult[_OutputType](
            async_results=chunks_async_results)
        return async_res

    def terminate(self) -> None:
        __doc__ = super().terminate.__doc__  # noqa: F841
        self.close()
        for a in self.actor_pool:
            ray.kill(a)

    def close(self) -> None:
        __doc__ = super().close.__doc__  # noqa: F841
        self.started = False
        for a in self.actor_pool:
            a.stop.remote()
        ray.kill(self.result_queue.actor)
        ray.kill(self.task_queue.actor)
        sleep(1)

    def create_local_pool(self,
                          n_cpu: int = 0,
                          memory_limit: float = 0,
                          n_visible_gpu: List[int] = []) -> LocalPoolAPI:
        if memory_limit == 0:
            memory_limit = self.memory_limit_per_worker
        return self.local_pool_class(n_cpu, memory_limit, n_visible_gpu,
                                     lazy=True)
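A short usage sketch based only on the DistributedPool API shown above. The constructor values are illustrative, and it assumes _RayAsyncResult follows the standard multiprocessing AsyncResult interface (get with a timeout), which is not shown in this excerpt.

# Usage sketch; constructor values are illustrative and .get() assumes
# the multiprocessing-style AsyncResult interface.
def square(x: int) -> int:
    return x * x

pool = DistributedPool(n_worker=4, n_cpu_per_worker=2)
single = pool.apply_async(square, args=(9,))
many = pool.map_async(square, range(100))
print(single.get(timeout=30))     # -> 81
print(sum(many.get(timeout=30)))  # -> sum of squares 0..99
pool.terminate()  # close() stops the actors, then they are killed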
if jobs:
    N_GPU_PER_THREAD = args.gpu
    n_thread = min(len(jobs), args.thread)
    total_cpus = N_CPU_PER_THREAD * n_thread
    total_gpus = N_GPU_PER_THREAD * n_thread
    ray.init(num_cpus=total_cpus, num_gpus=total_gpus)
    print(f"Number of threads: {n_thread}.")
    print(f"CPUs per thread: {N_CPU_PER_THREAD}.")
    print(f"Total CPUs: {total_cpus}.")
    if N_GPU_PER_THREAD > 0:
        print(f"GPUs per thread: {N_GPU_PER_THREAD}.")
        print(f"Total GPUs: {total_gpus}.")
    print("-----------------")

    jobs_queue = Queue()
    for job in jobs:
        jobs_queue.put(job)

    # Queue has no __len__, so size the progress bar from the job list
    pb = ProgressBar(len(jobs))
    actor = pb.actor

    job_list = []
    for _ in range(n_thread):
        job_list.append(process_jobs.remote(jobs_queue, args, actor))
    pb.print_until_done()
    job_results = list(chain(*ray.get(job_list)))
    ray.get(actor.get_counter.remote())

    result_path = Path(args.output_dir) / "result.jsonl"
    with jsonlines.open(result_path, "w") as writer:
        writer.write_all(job_results)
else:
    print("No job to run.")
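The snippet above assumes a process_jobs remote task that drains the shared queue; a plausible sketch follows, with the per-job work (run_job) and the progress actor's update method left as hypothetical stand-ins.

# Plausible sketch of the process_jobs worker referenced above; run_job
# and actor.update are hypothetical.
import ray
from ray.util.queue import Empty

@ray.remote
def process_jobs(jobs_queue, args, actor):
    results = []
    while True:
        try:
            job = jobs_queue.get(block=False)
        except Empty:
            break  # queue drained (possibly by another worker)
        results.append(run_job(job, args))  # hypothetical per-job work
        actor.update.remote(1)              # hypothetical progress tick
    return results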