async def test_close_pipeline(self, ray_context):
    # m1:g1 ----------> \
    # m2:g2 -> m3:g2 -> m4:g3
    builder = ParallelPipeline()
    builder.add_module(ModuleTestTeardown('m1', group='g1'))
    builder.add_module(ModuleTestTeardown('m2', group='g2'))
    builder.add_module(
        ModuleTestTeardown('m3', group='g2').depends_on(
            builder.get_module('m2')))
    builder.add_module(
        ModuleTestTeardown('m4', group='g3').depends_on(
            builder.get_module('m3')).depends_on(
                builder.get_module('m1')).expose_result('final'))
    up, down = Queue(), Queue()
    pipeline = await builder.build(dict(up=up, down=down))
    assert up.size() == len(pipeline.modules)
    assert down.size() == 0
    await pipeline.run()
    await pipeline.close()
    assert pipeline.closed
    assert up.size() == len(pipeline.modules)
    assert down.size() == len(pipeline.modules)
    with pytest.raises(ClosedPipelineException):
        await pipeline.run()
    with pytest.raises(ClosedPipelineException):
        await pipeline.process()
def test_shutdown(ray_start_regular_shared):
    q = Queue()
    actor = q.actor
    q.shutdown()
    assert q.actor is None
    with pytest.raises(RayActorError):
        ray.get(actor.empty.remote())
class WorkQueue:
    def __init__(self, max_depth: int = 8):
        self._queue = Queue(maxsize=max_depth)

    def get_queue(self):
        """
        :return: Ray Queue actor, needed by the consumers.
        """
        return self._queue

    def empty(self):
        """
        :return: True if the queue currently holds no items.
        """
        return self._queue.empty()

    def group(self, labels_all: np.ndarray, probs_all: np.ndarray,
              filename: str, original_shape: tuple,
              inference_time_sec: float, page_number: int) -> dict:
        return {
            "labels_all": labels_all,
            "probs_all": probs_all,
            "filename": filename,
            "original_shape": original_shape,
            "inference_time_sec": inference_time_sec,
            "page_number": page_number
        }

    def ungroup(self, dictionary):
        """
        Use this like: labels_all, probs_all, filename, original_shape,
        inference_time_sec, page_number = ungroup(d)

        :param dictionary: a dictionary created with the group() method.
        :return: the grouped values, unpacked in order.
        """
        return (dictionary["labels_all"], dictionary["probs_all"],
                dictionary["filename"], dictionary["original_shape"],
                dictionary["inference_time_sec"], dictionary["page_number"])

    def push(self, dictionary):
        """
        Push a dictionary of params to post-process. Blocks while the queue
        is full for flow control and proceeds once the queue has enough space.

        :param dictionary: a dictionary created with the group() method.
        :return: None
        """
        # put the payload in the object store
        ref = ray.put(dictionary)
        # put only the ref in the queue
        self._queue.put(ref)
        return None

    def pop(self):
        """
        :return: a dictionary created with the group() method; use ungroup()
            to unpack, or look up keys individually.
        """
        return self._queue.get()
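# A minimal usage sketch for WorkQueue. The array values, filename, shape,
# timing and page number below are illustrative only, not from the original
# code.
import numpy as np
import ray

ray.init()
work_queue = WorkQueue(max_depth=4)

# Producer side: group inference outputs and push them.
labels = np.array([0, 1])
probs = np.array([0.9, 0.1])
item = work_queue.group(labels, probs, "page_1.png", (1024, 768), 0.05, 1)
work_queue.push(item)  # blocks when more than 4 items are pending

# Consumer side: pop one item and unpack it.
(labels_all, probs_all, filename, original_shape, inference_time_sec,
 page_number) = work_queue.ungroup(work_queue.pop())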
def test_simple_usage(ray_start_regular_shared):
    q = Queue()

    items = list(range(10))

    for item in items:
        q.put(item)

    for item in items:
        assert item == q.get()
def main(args):
    if not os.path.isdir(args.data_dir):
        os.makedirs(args.data_dir)
    guarantees_path = args.guarantees
    if not os.path.isfile(guarantees_path):
        sys.exit(f'{guarantees_path} is not a file')
    with open(guarantees_path, 'r') as guarantees_file:
        guarantees = json.load(guarantees_file)['patterns']
    logger.info('Read in %d guarantees', len(guarantees))

    # create folder and files
    folder_path = f'{args.data_dir}/{get_folder_name(args)}'
    if not os.path.isdir(folder_path):
        os.mkdir(folder_path)
        logger.info('Created folder %s', folder_path)
    data_gen_stats_file = os.path.join(folder_path, 'data_gen_stats.json')
    flag_filepath = os.path.join(folder_path, 'args.json')
    args_dict = vars(args)
    with open(flag_filepath, 'w') as flag_file:
        json.dump(args_dict, flag_file, indent=2)
    logger.info('Command line arguments written to %s', flag_filepath)

    progress_actor = common.ProgressActor.remote()  # pylint: disable=no-member
    samples_queue = Queue(maxsize=args.num_samples)  # pylint: disable=no-member
    timeouts_queue = Queue(maxsize=args.num_samples)
    ds_actor = DataSetActor.remote(guarantees, progress_actor, args_dict,
                                   samples_queue, timeouts_queue)
    dataset_writer_result = common.csv_dataset_writer.remote(
        samples_queue, folder_path, args.num_samples, args.train_frac,
        args.val_frac, args.test_frac)
    timeouts_file = os.path.join(folder_path, 'timeouts.csv')
    timeouts_writer_result = common.csv_file_writer.remote(
        timeouts_queue, timeouts_file)
    worker_results = [
        strix.wrapper.worker.remote(ds_actor,
                                    args.strix_bin,
                                    strix_auto=args.strix_auto,
                                    strix_timeout=args.strix_timeout,
                                    id=i) for i in range(args.num_worker)
    ]
    common.progress_bar(progress_actor, args.num_samples, data_gen_stats_file)
    ray.get(worker_results)
    ray.get(dataset_writer_result)
    timeouts_queue.put(None)
    ray.get(timeouts_writer_result)

    split_dataset = from_dir(folder_path)
    stats = split_dataset.circuit_stats(['train', 'val', 'test'])
    stats_file = os.path.join(folder_path, 'circuit-stats.json')
    write_stats(stats, stats_file)
    plot_file = os.path.join(folder_path, 'circuit-stats.png')
    plot_stats(stats, plot_file)
    split_dataset.shuffle()
    split_dataset.save(folder_path)
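# The writer tasks above drain their queues until done; main() pushes a
# `None` sentinel into `timeouts_queue` once the workers finish. A minimal
# sketch of such a writer, assuming it is a Ray remote function taking
# (queue, filepath) -- the real common.csv_file_writer may differ:
import csv

import ray


@ray.remote
def csv_file_writer_sketch(queue, filepath):
    with open(filepath, 'w', newline='') as f:
        writer = csv.writer(f)
        while True:
            row = queue.get(block=True)
            if row is None:  # sentinel: no more rows will arrive
                break
            writer.writerow(row)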
def test_custom_resources(ray_start_regular_shared):
    current_resources = ray.available_resources()
    assert current_resources["CPU"] == 1.0

    # By default an actor should not reserve any resources.
    Queue()
    current_resources = ray.available_resources()
    assert current_resources["CPU"] == 1.0

    # Specify resource requirement. The queue should now reserve 1 CPU.
    Queue(actor_options={"num_cpus": 1})
    time.sleep(1)
    current_resources = ray.available_resources()
    assert "CPU" not in current_resources, current_resources
def predict_wcc(num_actors, min_rcode, max_batch_size, model_path, top_k,
                shared_args):
    data_queue = Queue(maxsize=128)
    infer_queue = Queue(maxsize=128)
    db_host = shared_args['db_host']
    db_port = shared_args['db_port']
    db_pwd = shared_args['db_pwd']
    # anchors = shared_args['anchors']
    max_step = shared_args['max_step']
    time_shift = shared_args['time_shift']
    corl_prior = shared_args['corl_prior']
    args = shared_args['args']

    # actors will be retrieved by name in remote functions
    actors = [
        DataLoader.options(name='DataLoader_{}'.format(i)).remote(shared_args)
        for i in range(num_actors)
    ]
    # ray.util.ActorPool(
    #     # [ray.get_actor("DataLoader_" + str(i)) for i in range(num_actors)]
    #     [DataLoader.options(name='DataLoader_{}'.format(i)).remote(
    #         shared_args) for i in range(num_actors)]
    # )

    work = getWorkloadForPredictionFromTags(corl_prior, max_step, time_shift,
                                            db_host, db_port, db_pwd)
    d = _load_data.remote(work, num_actors, min_rcode, shared_args, data_queue)
    s = _save_infer_result.remote(top_k, shared_args, infer_queue)

    # There are many unknown issues running GPU inference in a ray worker...
    # use 0.999 instead of integer 1 to avoid exceeding the resource limit in
    # total
    gpu_alloc = 0.999 / args.parallel
    p = []
    for i in range(args.parallel):
        p.append(
            _predict.options(num_gpus=gpu_alloc).remote(
                model_path, max_batch_size, data_queue, infer_queue, args))
        if i + 1 < args.parallel:
            sleep(0.7)

    if ray.get(d) and ray.get(s) and all(r == True for r in ray.get(p)):
        print('{} inference completed. total workload: {}'.format(
            strftime("%H:%M:%S"), len(work)))
    else:
        print('{} inference completed with exception. total workload: {}'.
              format(strftime("%H:%M:%S"), len(work)))
def execution_loop(self, trainer, tune_enabled: bool = True):
    """Main execution loop for training, testing, & prediction.

    Sets up the torch.distributed process group for each worker. Then
    trigger remote training/testing/eval via ``train_remote`` on each
    worker. If using with Ray Tune, create a communication queue to
    retrieve intermediate results, and process those results. Finally
    retrieve the training results from the rank 0 worker and return.
    """
    # Sets environment variables for all workers.
    self._setup_env_vars()

    self.global_to_local = self.get_local_ranks()

    model = self._model
    model_ref = ray.put(model)
    # Don't pickle the model when training remotely.
    self._model = None

    queue = None
    if tune_enabled and TUNE_INSTALLED and is_session_enabled():
        # Create communication queue and send to all the workers.
        queue = Queue(actor_options={"num_cpus": 0})

    futures = [
        self.workers[i].execute.remote(self.execute_remote, model_ref, i,
                                       queue)
        for i in range(self.num_workers)
    ]

    results = process_results(futures, queue)

    # Get the results, checkpoint path, and model weights from worker 0.
    results, best_path, state_stream = results[0]
    state_dict = load_state_stream(state_stream, to_gpu=self.use_gpu)
    # Set the state for PTL using the output from remote training.
    self._results = results
    self._model = model
    self._model.load_state_dict(state_dict)
    if self.lightning_module.trainer.checkpoint_callback:
        self.lightning_module.trainer.checkpoint_callback \
            .best_model_path = best_path

    if queue:
        # Shutdown the queue.
        queue.shutdown()

    return results
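# `process_results` is defined elsewhere. Conceptually it has to wait on the
# training futures while draining the Tune queue so that intermediate results
# reach the driver. A rough sketch of that polling loop (an assumption about
# its shape, not the actual implementation; the queue protocol is shown as
# deferred callables purely for illustration):
import ray


def process_results_sketch(result_futures, queue):
    not_ready = list(result_futures)
    while not_ready:
        if queue is not None:
            # Forward anything the workers reported in the meantime.
            while not queue.empty():
                item = queue.get_nowait()
                if callable(item):
                    item()  # e.g. a deferred tune.report(...) call
        _, not_ready = ray.wait(not_ready, timeout=0.1)
    return ray.get(result_futures)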
def test_custom_resources(ray_start_regular_shared):
    current_resources = ray.available_resources()
    assert current_resources["CPU"] == 1.0

    # By default an actor should not reserve any resources.
    Queue()
    current_resources = ray.available_resources()
    assert current_resources["CPU"] == 1.0

    # Specify resource requirement. The queue should now reserve 1 CPU.
    Queue(actor_options={"num_cpus": 1})

    def no_cpu_in_resources():
        return "CPU" not in ray.available_resources()

    wait_for_condition(no_cpu_in_resources)
def remote_execution_api(self) -> "RemoteAPI":
    """Create an object to control cluster state from within the cluster."""
    self._execution_queue = Queue(actor_options={"num_cpus": 0})
    stop_event = self._execution_event

    def entrypoint():
        while not stop_event.is_set():
            try:
                cmd, kwargs = self._execution_queue.get(timeout=1)
            except Empty:
                continue
            if cmd == "kill_node":
                self.kill_node(**kwargs)

    self._execution_thread = threading.Thread(target=entrypoint)
    self._execution_thread.start()

    return RemoteAPI(self._execution_queue)
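# `RemoteAPI` itself is not shown here. Judging from the entrypoint loop
# above, it only needs to enqueue (command, kwargs) tuples; a minimal sketch
# under that assumption:
class RemoteAPISketch:
    def __init__(self, execution_queue):
        self._queue = execution_queue

    def kill_node(self, **kwargs):
        # Picked up by the driver-side entrypoint thread, which dispatches
        # "kill_node" commands to `self.kill_node(**kwargs)`.
        self._queue.put(("kill_node", kwargs))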
def sweep():
    '''Run sweep of parameters'''
    if args.server is not None:
        # For remote sweep we create the following directory structure:
        #      1/     2/         3/       4/
        # <repo>/<logs>/<platform>/<design>/
        repo_dir = abspath(LOCAL_DIR + '/../' * 4)
    else:
        repo_dir = abspath('../')
    print(f'[INFO TUN-0012] Log dir {LOCAL_DIR}.')
    queue = Queue()
    for name, content in config_dict.items():
        if not isinstance(content, list):
            continue
        for i in np.arange(*content):
            config_dict[name] = i
            queue.put([repo_dir, config_dict, LOCAL_DIR])
    workers = [consumer.remote(queue) for _ in range(args.jobs)]
    print('[INFO TUN-0009] Waiting for results.')
    ray.get(workers)
    print('[INFO TUN-0010] Sweep complete.')
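# `consumer` is defined elsewhere. Since sweep() pushes no sentinel and simply
# ray.get()s the workers, it presumably drains the queue until empty. A
# minimal sketch under that assumption (the body is a placeholder, not the
# actual flow launch):
import ray
from ray.util.queue import Empty


@ray.remote
def consumer_sketch(queue):
    while True:
        try:
            repo_dir, config, local_dir = queue.get(block=False)
        except Empty:
            return  # queue drained: this worker is done
        print(f'running sweep point from {repo_dir} -> {local_dir}')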
async def test_should_correctly_init_parallel_pipeline_with_hooks_in_dict(
        self, ray_context):
    self.prepare_basic_hooks_test_modules_factory()
    callable_1_counter = Queue()
    callable_2_counter = Queue()

    def callable_1():
        callable_1_counter.put(1)

    def callable_2():
        callable_2_counter.put(2)

    config_file = self.get_config_file(
        'correct_exemplary_config_with_groups.yaml')
    with open(config_file) as config:
        config = config.read()
    pipeline = await ConfigReader.read(
        config,
        ModuleFactory,
        after_created={
            "g1": [callable_1, callable_2, callable_2],
            "g2": [callable_1],
            "g3": []
        })
    assert len(pipeline.modules) == 3
    assert callable_1_counter.qsize() == 3
    assert callable_2_counter.qsize() == 4
def __init__(self, fc_data, behav_data, behav, covars, n_perm=0,
             **ray_kwargs):
    self.behav_data = behav_data  # For adding kfold_indices

    ray.shutdown()  # Make sure Ray is only initialised once
    self.ray_info = ray.init(**ray_kwargs)

    self.in_queue = Queue()
    self.out_queue = Queue()
    self.status_queue = Queue()
    self.report_queue = Queue()

    self.status_dict = {}
    self.actors_list = []

    # Create dictionaries to keep results. It makes sense to do this
    # class-wide, so results can be added on-the-fly and referenced later
    # (for example, if the get-results functions are called too early).
    self.fselection_results = {}
    # Create sub-dictionary for original (i.e. non-permuted) data
    self.fselection_results[-1] = {}
    self.prediction_results = {}

    self.data_dict = {}
    self.data_dict['behav'] = behav
    self.data_dict['covars'] = covars
    self.data_dict['n_perm'] = n_perm
    self.data_dict['data'] = fc_data
    # Save edges columns before adding behavioral columns
    self.data_dict['edges'] = self.data_dict['data'].columns.astype(str)

    # Pingouin needs all the data (edges, behav, and covars) in a single
    # DataFrame
    if covars:
        self.data_dict['data'][covars] = behav_data[covars]

    if n_perm > 0:
        # It seems to be more efficient to create a separate df and concat
        # later; .to_frame() converts the Pandas series into a DataFrame
        # on-the-fly
        behav_df = behav_data[behav].to_frame()
        for perm in range(n_perm):
            behav_df["{}-perm-{}".format(behav, perm)] = \
                np.random.permutation(behav_df[behav])
            # Create sub-dictionaries to keep fselection results for
            # permutations
            self.fselection_results[perm] = {}
        # To avoid fragmentation (and the corresponding warning), consolidate
        # into a new DataFrame
        behav_df = behav_df.copy()
        self.data_dict['data'] = pd.concat(
            [self.data_dict['data'], behav_df], axis=1)
    else:
        self.data_dict['data'][behav] = behav_data[behav]

    self.data_dict['data'].columns = \
        self.data_dict['data'].columns.astype(str)
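# The workers appended to `actors_list` are created elsewhere; the queue
# layout above suggests actors that pull jobs from `in_queue`, push results
# to `out_queue`, and report progress on `status_queue`. A hypothetical
# worker sketch (the job format and method names are illustrative, not the
# actual implementation):
import ray


@ray.remote
class CPMWorkerSketch:
    def run(self, in_queue, out_queue, status_queue):
        while True:
            job = in_queue.get(block=True)
            if job is None:  # sentinel: no more folds/permutations
                break
            status_queue.put(("started", job["id"]))
            # placeholder for feature selection / model fitting
            out_queue.put({"id": job["id"], "result": job})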
def test_async_put(ray_start_regular_shared):
    q = Queue(1)
    q.put(1)
    future = async_put.remote(q, 2)

    with pytest.raises(Full):
        q.put_nowait(3)
    with pytest.raises(GetTimeoutError):
        ray.get(future, timeout=0.1)  # task not canceled on timeout.

    assert q.get() == 1
    assert q.get() == 2
def start_training(self, trainer):
    """Main training loop.

    Trigger remote training via ``train_remote`` on each worker. If using
    with Ray Tune, create a communication queue to retrieve intermediate
    results, and process those results. Finally retrieve the training
    results from the rank 0 worker and return.
    """
    model = self._model
    model_ref = ray.put(model)
    # Don't pickle the model when training remotely.
    self._model = None

    queue = None
    if TUNE_INSTALLED and is_session_enabled():
        # Create communication queue and send to all the workers.
        queue = Queue(actor_options={"num_cpus": 0})

    result_futures = self.executor.run_remote(self.train_remote,
                                              args=[model_ref, queue])
    results = process_results(result_futures, queue)

    results, state_dict, best_path = results[0]
    self._results = results
    self._model = model
    self._model.load_state_dict(state_dict)
    self._model.trainer.accelerator.training_type_plugin = self
    if self.lightning_module.trainer.checkpoint_callback:
        self.lightning_module.trainer.checkpoint_callback \
            .best_model_path = best_path

    if queue:
        # Shutdown the queue.
        queue.shutdown()

    return results
def test_qsize(ray_start_regular_shared):
    q = Queue()

    items = list(range(10))
    size = 0

    assert q.qsize() == size

    for item in items:
        q.put(item)
        size += 1
        assert q.qsize() == size

    for item in items:
        assert q.get() == item
        size -= 1
        assert q.qsize() == size
def test_async_get(ray_start_regular_shared):
    q = Queue()
    future = async_get.remote(q)

    with pytest.raises(Empty):
        q.get_nowait()
    with pytest.raises(GetTimeoutError):
        ray.get(future, timeout=0.1)  # task not canceled on timeout.

    q.put(1)
    assert ray.get(future) == 1
async def test_get_async(ray_start_regular_shared):
    q = Queue()

    item = 0
    await q.put_async(item)
    assert await q.get_async(block=False) == item

    item = 1
    await q.put_async(item)
    assert await q.get_async(timeout=0.2) == item

    with pytest.raises(ValueError):
        await q.get_async(timeout=-1)

    with pytest.raises(Empty):
        await q.get_async(block=False)

    with pytest.raises(Empty):
        await q.get_async(timeout=0.2)
async def test_put_async(ray_start_regular_shared):
    q = Queue(1)

    item = 0
    await q.put_async(item, block=False)
    assert await q.get_async() == item

    item = 1
    await q.put_async(item, timeout=0.2)
    assert await q.get_async() == item

    with pytest.raises(ValueError):
        await q.put_async(0, timeout=-1)

    await q.put_async(0)
    with pytest.raises(Full):
        await q.put_async(1, block=False)

    with pytest.raises(Full):
        await q.put_async(1, timeout=0.2)
async def test_should_call_hooks_in_groups(self, ray_context):
    builder = ParallelPipeline()
    callable_1_counter = Queue()
    callable_2_counter = Queue()

    def callable_1():
        callable_1_counter.put(1)

    def callable_2():
        callable_2_counter.put(2)

    builder.add_group(
        builder.Group('g1', after_created=[callable_1, callable_2]))
    builder.add_group(builder.Group('g2', after_created=[callable_2]))
    builder.add_module(ModuleA('m1', group='g1'))
    builder.add_module(
        ModuleB('m2', group='g2').depends_on(builder.get_module('m1')))
    pipeline = await builder.build()
    assert isinstance(pipeline, ParallelPipeline.Runtime)
    assert len(pipeline.groups) == 2
    assert {g.name for g in pipeline.groups} == {'g1', 'g2'}
    assert callable_1_counter.qsize() == 1
    assert callable_2_counter.qsize() == 2
def run(
    run_or_experiment: Union[str, Callable, Type],
    name: Optional[str] = None,
    metric: Optional[str] = None,
    mode: Optional[str] = None,
    stop: Union[None, Mapping, Stopper, Callable[[str, Mapping], bool]] = None,
    time_budget_s: Union[None, int, float, datetime.timedelta] = None,
    config: Optional[Dict[str, Any]] = None,
    resources_per_trial: Union[None, Mapping[str, Union[
        float, int, Mapping]], PlacementGroupFactory] = None,
    num_samples: int = 1,
    local_dir: Optional[str] = None,
    search_alg: Optional[Union[Searcher, SearchAlgorithm, str]] = None,
    scheduler: Optional[Union[TrialScheduler, str]] = None,
    keep_checkpoints_num: Optional[int] = None,
    checkpoint_score_attr: Optional[str] = None,
    checkpoint_freq: int = 0,
    checkpoint_at_end: bool = False,
    verbose: Union[int, Verbosity] = Verbosity.V3_TRIAL_DETAILS,
    progress_reporter: Optional[ProgressReporter] = None,
    log_to_file: bool = False,
    trial_name_creator: Optional[Callable[[Trial], str]] = None,
    trial_dirname_creator: Optional[Callable[[Trial], str]] = None,
    sync_config: Optional[SyncConfig] = None,
    export_formats: Optional[Sequence] = None,
    max_failures: int = 0,
    fail_fast: bool = False,
    restore: Optional[str] = None,
    server_port: Optional[int] = None,
    resume: bool = False,
    reuse_actors: bool = False,
    trial_executor: Optional[RayTrialExecutor] = None,
    raise_on_failed_trial: bool = True,
    callbacks: Optional[Sequence[Callback]] = None,
    max_concurrent_trials: Optional[int] = None,
    # Deprecated args
    queue_trials: Optional[bool] = None,
    loggers: Optional[Sequence[Type[Logger]]] = None,
    _remote: Optional[bool] = None,
) -> ExperimentAnalysis:
    """Executes training.

    When a SIGINT signal is received (e.g. through Ctrl+C), the tuning run
    will gracefully shut down and checkpoint the latest experiment state.
    Sending SIGINT again (or SIGKILL/SIGTERM instead) will skip this step.

    Many aspects of Tune, such as the frequency of global checkpointing,
    maximum pending placement group trials and the path of the result
    directory can be configured through environment variables. Refer to
    :ref:`tune-env-vars` for a list of environment variables available.

    Examples:

    .. code-block:: python

        # Run 10 trials (each trial is one instance of a Trainable). Tune runs
        # in parallel and automatically determines concurrency.
        tune.run(trainable, num_samples=10)

        # Run 1 trial, stop when trial has reached 10 iterations
        tune.run(my_trainable, stop={"training_iteration": 10})

        # automatically retry failed trials up to 3 times
        tune.run(my_trainable, stop={"training_iteration": 10},
                 max_failures=3)

        # Run 1 trial, search over hyperparameters, stop after 10 iterations.
        space = {"lr": tune.uniform(0, 1), "momentum": tune.uniform(0, 1)}
        tune.run(my_trainable, config=space, stop={"training_iteration": 10})

        # Resumes training if a previous machine crashed
        tune.run(my_trainable, config=space,
                 local_dir=<path/to/dir>, resume=True)

        # Rerun ONLY failed trials after an experiment is finished.
        tune.run(my_trainable, config=space,
                 local_dir=<path/to/dir>, resume="ERRORED_ONLY")

    Args:
        run_or_experiment (function | class | str | :class:`Experiment`): If
            function|class|str, this is the algorithm or model to train.
            This may refer to the name of a built-in algorithm
            (e.g. RLLib's DQN or PPO), a user-defined trainable
            function or class, or the string identifier of a
            trainable function or class registered in the tune registry.
            If Experiment, then Tune will execute training based on
            Experiment.spec. If you want to pass in a Python lambda, you
            will need to first register the function:
            ``tune.register_trainable("lambda_id", lambda x: ...)``. You can
            then use ``tune.run("lambda_id")``.
        metric (str): Metric to optimize. This metric should be reported
            with `tune.report()`. If set, will be passed to the search
            algorithm and scheduler.
        mode (str): Must be one of [min, max]. Determines whether objective
            is minimizing or maximizing the metric attribute. If set, will be
            passed to the search algorithm and scheduler.
        name (str): Name of experiment.
        stop (dict | callable | :class:`Stopper`): Stopping criteria. If
            dict, the keys may be any field in the return result of
            'train()', whichever is reached first. If function, it must take
            (trial_id, result) as arguments and return a boolean (True if
            trial should be stopped, False otherwise). This can also be a
            subclass of ``ray.tune.Stopper``, which allows users to implement
            custom experiment-wide stopping (i.e., stopping an entire Tune
            run based on some time constraint).
        time_budget_s (int|float|datetime.timedelta): Global time budget in
            seconds after which all trials are stopped. Can also be a
            ``datetime.timedelta`` object.
        config (dict): Algorithm-specific configuration for Tune variant
            generation (e.g. env, hyperparams). Defaults to empty dict.
            Custom search algorithms may ignore this.
        resources_per_trial (dict|PlacementGroupFactory): Machine resources
            to allocate per trial, e.g. ``{"cpu": 64, "gpu": 8}``.
            Note that GPUs will not be assigned unless you specify them here.
            Defaults to 1 CPU and 0 GPUs in
            ``Trainable.default_resource_request()``. This can also
            be a PlacementGroupFactory object wrapping arguments to create a
            per-trial placement group.
        num_samples (int): Number of times to sample from the
            hyperparameter space. Defaults to 1. If `grid_search` is
            provided as an argument, the grid will be repeated
            `num_samples` of times. If this is -1, (virtually) infinite
            samples are generated until a stopping condition is met.
        local_dir (str): Local dir to save training results to.
            Defaults to ``~/ray_results``.
        search_alg (Searcher|SearchAlgorithm|str): Search algorithm for
            optimization. You can also use the name of the algorithm.
        scheduler (TrialScheduler|str): Scheduler for executing
            the experiment. Choose among FIFO (default), MedianStopping,
            AsyncHyperBand, HyperBand and PopulationBasedTraining. Refer to
            ray.tune.schedulers for more options. You can also use the
            name of the scheduler.
        keep_checkpoints_num (int): Number of checkpoints to keep. A value of
            `None` keeps all checkpoints. Defaults to `None`. If set, need
            to provide `checkpoint_score_attr`.
        checkpoint_score_attr (str): Specifies by which attribute to rank the
            best checkpoint. Default is increasing order. If attribute starts
            with `min-` it will rank attribute in decreasing order, i.e.
            `min-validation_loss`.
        checkpoint_freq (int): How many training iterations between
            checkpoints. A value of 0 (default) disables checkpointing.
            This has no effect when using the Functional Training API.
        checkpoint_at_end (bool): Whether to checkpoint at the end of the
            experiment regardless of the checkpoint_freq. Default is False.
            This has no effect when using the Functional Training API.
        verbose (Union[int, Verbosity]): 0, 1, 2, or 3. Verbosity mode.
            0 = silent, 1 = only status updates, 2 = status and brief trial
            results, 3 = status and detailed trial results. Defaults to 3.
        progress_reporter (ProgressReporter): Progress reporter for reporting
            intermediate experiment progress. Defaults to CLIReporter if
            running in command-line, or JupyterNotebookReporter if running in
            a Jupyter notebook.
        log_to_file (bool|str|Sequence): Log stdout and stderr to files in
            Tune's trial directories. If this is `False` (default), no files
            are written. If `true`, outputs are written to `trialdir/stdout`
            and `trialdir/stderr`, respectively. If this is a single string,
            this is interpreted as a file relative to the trialdir, to which
            both streams are written. If this is a Sequence (e.g. a Tuple),
            it has to have length 2 and the elements indicate the files to
            which stdout and stderr are written, respectively.
        trial_name_creator (Callable[[Trial], str]): Optional function
            for generating the trial string representation.
        trial_dirname_creator (Callable[[Trial], str]): Function
            for generating the trial dirname. This function should take
            in a Trial object and return a string representing the
            name of the directory. The return value cannot be a path.
        sync_config (SyncConfig): Configuration object for syncing. See
            tune.SyncConfig.
        export_formats (list): List of formats that are exported at the end
            of the experiment. Default is None.
        max_failures (int): Try to recover a trial at least this many times.
            Ray will recover from the latest checkpoint if present.
            Setting to -1 will lead to infinite recovery retries.
            Setting to 0 will disable retries. Defaults to 0.
        fail_fast (bool | str): Whether to fail upon the first error.
            If fail_fast='raise' provided, Tune will automatically
            raise the exception received by the Trainable. fail_fast='raise'
            can easily leak resources and should be used with caution (it
            is best used with `ray.init(local_mode=True)`).
        restore (str): Path to checkpoint. Only makes sense to set if running
            1 trial. Defaults to None.
        server_port (int): Port number for launching TuneServer.
        resume (str|bool): One of "LOCAL", "REMOTE", "PROMPT", "ERRORED_ONLY",
            or bool. LOCAL/True restores the checkpoint from the
            local experiment directory, determined
            by ``name`` and ``local_dir``. REMOTE restores the checkpoint
            from ``upload_dir`` (as passed to ``sync_config``).
            PROMPT provides CLI feedback.
            False forces a new experiment. ERRORED_ONLY resets and reruns
            ERRORED trials upon resume - previous trial artifacts will
            be left untouched. If resume is set but checkpoint does not
            exist, ValueError will be thrown.
        reuse_actors (bool): Whether to reuse actors between different trials
            when possible. This can drastically speed up experiments that
            start and stop actors often (e.g., PBT in time-multiplexing
            mode). This requires trials to have the same resource
            requirements.
        trial_executor (TrialExecutor): Manage the execution of trials.
        raise_on_failed_trial (bool): Raise TuneError if there exists failed
            trial (of ERROR state) when the experiments complete.
        callbacks (list): List of callbacks that will be called at different
            times in the training loop. Must be instances of the
            ``ray.tune.callback.Callback`` class. If not passed,
            `LoggerCallback` and `SyncerCallback` callbacks are automatically
            added.
        max_concurrent_trials (int): Maximum number of trials to run
            concurrently. Must be non-negative. If None or 0, no limit will
            be applied. This is achieved by wrapping the ``search_alg`` in
            a :class:`ConcurrencyLimiter`, and thus setting this argument
            will raise an exception if the ``search_alg`` is already a
            :class:`ConcurrencyLimiter`. Defaults to None.
        _remote (bool): Whether to run the Tune driver in a remote function.
            This is disabled automatically if a custom trial executor is
            passed in. This is enabled by default in Ray client mode.

    Returns:
        ExperimentAnalysis: Object for experiment analysis.

    Raises:
        TuneError: Any trials failed and `raise_on_failed_trial` is True.
    """
    # To be removed in 1.9.
    if queue_trials is not None:
        raise DeprecationWarning(
            "`queue_trials` has been deprecated and is replaced by "
            "the `TUNE_MAX_PENDING_TRIALS_PG` environment variable. "
            "Per default at least one Trial is queued at all times, "
            "so you likely don't need to change anything other than "
            "removing this argument from your call to `tune.run()`")

    # NO CODE IS TO BE ADDED ABOVE THIS COMMENT
    # remote_run_kwargs must be defined before any other
    # code is run to ensure that at this point,
    # `locals()` is equal to args and kwargs
    remote_run_kwargs = locals().copy()
    remote_run_kwargs.pop("_remote")

    if _remote is None:
        _remote = ray.util.client.ray.is_connected()

    if _remote is True and trial_executor:
        raise ValueError("cannot use custom trial executor")

    if not trial_executor or isinstance(trial_executor, RayTrialExecutor):
        _ray_auto_init()

    if _remote:
        remote_run = ray.remote(num_cpus=0)(run)

        # Make sure tune.run is called on the server node.
        remote_run = force_on_current_node(remote_run)

        # JupyterNotebooks don't work with remote tune runs out of the box
        # (e.g. via Ray client) as they don't have access to the main
        # process stdout. So we introduce a queue here that accepts
        # callables, which will then be executed on the driver side.
        if isinstance(progress_reporter, JupyterNotebookReporter):
            execute_queue = Queue(actor_options={
                "num_cpus": 0,
                **force_on_current_node(None)
            })
            progress_reporter.set_output_queue(execute_queue)

            def get_next_queue_item():
                try:
                    return execute_queue.get(block=False)
                except Empty:
                    return None

        else:
            # If we don't need a queue, use this dummy get fn instead of
            # scheduling an unneeded actor
            def get_next_queue_item():
                return None

        def _handle_execute_queue():
            execute_item = get_next_queue_item()
            while execute_item:
                if isinstance(execute_item, Callable):
                    execute_item()
                execute_item = get_next_queue_item()

        remote_future = remote_run.remote(_remote=False, **remote_run_kwargs)

        # ray.wait(...)[1] returns futures that are not ready, yet
        while ray.wait([remote_future], timeout=0.2)[1]:
            # Check if we have items to execute
            _handle_execute_queue()

        # Handle queue one last time
        _handle_execute_queue()

        return ray.get(remote_future)

    del remote_run_kwargs

    all_start = time.time()

    if loggers:
        # Raise DeprecationWarning in 1.9, remove in 1.10/1.11
        warnings.warn(
            "The `loggers` argument is deprecated. Please pass the respective "
            "`LoggerCallback` classes to the `callbacks` argument instead. "
            "See https://docs.ray.io/en/latest/tune/api_docs/logging.html")

    if mode and mode not in ["min", "max"]:
        raise ValueError(
            "The `mode` parameter passed to `tune.run()` has to be one of "
            "['min', 'max']")

    set_verbosity(verbose)

    config = config or {}
    sync_config = sync_config or SyncConfig()
    set_sync_periods(sync_config)

    if num_samples == -1:
        num_samples = sys.maxsize

    result_buffer_length = None

    # Create scheduler here as we need access to some of its properties
    if isinstance(scheduler, str):
        # importing at top level causes a recursive dependency
        from ray.tune.schedulers import create_scheduler
        scheduler = create_scheduler(scheduler)
    scheduler = scheduler or FIFOScheduler()

    if not scheduler.supports_buffered_results:
        # Result buffering with e.g. a Hyperband scheduler is a bad idea, as
        # hyperband tries to stop trials when processing brackets. With result
        # buffering, we might trigger this multiple times when evaluating
        # a single trial, which leads to unexpected behavior.
        env_result_buffer_length = os.getenv("TUNE_RESULT_BUFFER_LENGTH", "")
        if env_result_buffer_length:
            warnings.warn(
                f"You are using a {type(scheduler)} scheduler, but "
                f"TUNE_RESULT_BUFFER_LENGTH is set "
                f"({env_result_buffer_length}). This can lead to undesired "
                f"and faulty behavior, so the buffer length was forcibly set "
                f"to 1 instead.")
        result_buffer_length = 1

    if isinstance(scheduler,
                  (PopulationBasedTraining,
                   PopulationBasedTrainingReplay)) and not reuse_actors:
        warnings.warn(
            "Consider boosting PBT performance by enabling `reuse_actors` as "
            "well as implementing `reset_config` for Trainable.")

    trial_executor = trial_executor or RayTrialExecutor(
        reuse_actors=reuse_actors, result_buffer_length=result_buffer_length)
    if isinstance(run_or_experiment, list):
        experiments = run_or_experiment
    else:
        experiments = [run_or_experiment]

    for i, exp in enumerate(experiments):
        if not isinstance(exp, Experiment):
            experiments[i] = Experiment(
                name=name,
                run=exp,
                stop=stop,
                time_budget_s=time_budget_s,
                config=config,
                resources_per_trial=resources_per_trial,
                num_samples=num_samples,
                local_dir=local_dir,
                sync_config=sync_config,
                trial_name_creator=trial_name_creator,
                trial_dirname_creator=trial_dirname_creator,
                log_to_file=log_to_file,
                checkpoint_freq=checkpoint_freq,
                checkpoint_at_end=checkpoint_at_end,
                keep_checkpoints_num=keep_checkpoints_num,
                checkpoint_score_attr=checkpoint_score_attr,
                export_formats=export_formats,
                max_failures=max_failures,
                restore=restore)
        else:
            logger.debug("Ignoring some parameters passed into tune.run.")

    if fail_fast and max_failures != 0:
        raise ValueError("max_failures must be 0 if fail_fast=True.")

    if isinstance(search_alg, str):
        # importing at top level causes a recursive dependency
        from ray.tune.suggest import create_searcher
        search_alg = create_searcher(search_alg)

    # if local_mode=True is set during ray.init().
    is_local_mode = ray.worker._mode() == ray.worker.LOCAL_MODE

    if is_local_mode:
        max_concurrent_trials = 1

    if not search_alg:
        search_alg = BasicVariantGenerator(
            max_concurrent=max_concurrent_trials or 0)
    elif max_concurrent_trials:
        if isinstance(search_alg, ConcurrencyLimiter):
            if search_alg.max_concurrent != max_concurrent_trials:
                raise ValueError(
                    "You have specified `max_concurrent_trials="
                    f"{max_concurrent_trials}`, but the `search_alg` is "
                    "already a `ConcurrencyLimiter` with `max_concurrent="
                    f"{search_alg.max_concurrent}`. FIX THIS by setting "
                    "`max_concurrent_trials=None`.")
            else:
                logger.warning(
                    "You have specified `max_concurrent_trials="
                    f"{max_concurrent_trials}`, but the `search_alg` is "
                    "already a `ConcurrencyLimiter`. `max_concurrent_trials` "
                    "will be ignored.")
        else:
            if max_concurrent_trials < 1:
                raise ValueError(
                    "`max_concurrent_trials` must be greater than or equal "
                    f"to 1, got {max_concurrent_trials}.")
            if isinstance(search_alg, Searcher):
                search_alg = ConcurrencyLimiter(
                    search_alg, max_concurrent=max_concurrent_trials)
            elif not is_local_mode:
                logger.warning(
                    "You have passed a `SearchGenerator` instance as the "
                    "`search_alg`, but `max_concurrent_trials` requires a "
                    "`Searcher` instance. `max_concurrent_trials` "
                    "will be ignored.")

    if isinstance(search_alg, Searcher):
        search_alg = SearchGenerator(search_alg)

    if config and not set_search_properties_backwards_compatible(
            search_alg.set_search_properties, metric, mode, config,
            **experiments[0].public_spec):
        if has_unresolved_values(config):
            raise ValueError(
                "You passed a `config` parameter to `tune.run()` with "
                "unresolved parameters, but the search algorithm was already "
                "instantiated with a search space. Make sure that `config` "
                "does not contain any more parameter definitions - include "
                "them in the search algorithm's search space if necessary.")

    if not scheduler.set_search_properties(metric, mode):
        raise ValueError(
            "You passed a `metric` or `mode` argument to `tune.run()`, but "
            "the scheduler you are using was already instantiated with their "
            "own `metric` and `mode` parameters. Either remove the arguments "
            "from your scheduler or from your call to `tune.run()`")

    # Create syncer callbacks
    callbacks = create_default_callbacks(
        callbacks, sync_config, metric=metric, loggers=loggers)

    runner = TrialRunner(
        search_alg=search_alg,
        scheduler=scheduler,
        local_checkpoint_dir=experiments[0].checkpoint_dir,
        remote_checkpoint_dir=experiments[0].remote_checkpoint_dir,
        sync_config=sync_config,
        stopper=experiments[0].stopper,
        resume=resume,
        server_port=server_port,
        fail_fast=fail_fast,
        trial_executor=trial_executor,
        callbacks=callbacks,
        metric=metric,
        # Driver should only sync trial checkpoints if
        # checkpoints are not synced to cloud
        driver_sync_trial_checkpoints=not bool(sync_config.upload_dir))

    if not runner.resumed:
        for exp in experiments:
            search_alg.add_configurations([exp])
    else:
        logger.info("TrialRunner resumed, ignoring new add_experiment but "
                    "updating trial resources.")
        if resources_per_trial:
            runner.update_pending_trial_resources(resources_per_trial)

    progress_reporter = progress_reporter or detect_reporter()

    if not progress_reporter.set_search_properties(metric, mode):
        raise ValueError(
            "You passed a `metric` or `mode` argument to `tune.run()`, but "
            "the reporter you are using was already instantiated with their "
            "own `metric` and `mode` parameters. Either remove the arguments "
            "from your reporter or from your call to `tune.run()`")
    progress_reporter.set_total_samples(search_alg.total_samples)

    # Calls setup on callbacks
    runner.setup_experiments(
        experiments=experiments, total_num_samples=search_alg.total_samples)

    # User Warning for GPUs
    if trial_executor.has_gpus():
        if isinstance(resources_per_trial,
                      dict) and "gpu" in resources_per_trial:
            # "gpu" is manually set.
            pass
        elif _check_default_resources_override(experiments[0].run_identifier):
            # "default_resources" is manually overridden.
            pass
        else:
            logger.warning("Tune detects GPUs, but no trials are using GPUs. "
                           "To enable trials to use GPUs, set "
                           "tune.run(resources_per_trial={'gpu': 1}...) "
                           "which allows Tune to expose 1 GPU to each trial. "
                           "You can also override "
                           "`Trainable.default_resource_request` if using the "
                           "Trainable API.")

    original_handler = signal.getsignal(signal.SIGINT)
    state = {signal.SIGINT: False}

    def sigint_handler(sig, frame):
        logger.warning(
            "SIGINT received (e.g. via Ctrl+C), ending Ray Tune run. "
            "This will try to checkpoint the experiment state one last time. "
            "Press CTRL+C one more time (or send SIGINT/SIGKILL/SIGTERM) "
            "to skip.")
        state[signal.SIGINT] = True
        # Restore original signal handler to react to future SIGINT signals
        signal.signal(signal.SIGINT, original_handler)

    if not int(os.getenv("TUNE_DISABLE_SIGINT_HANDLER", "0")):
        signal.signal(signal.SIGINT, sigint_handler)

    tune_start = time.time()
    progress_reporter.set_start_time(tune_start)
    while not runner.is_finished() and not state[signal.SIGINT]:
        runner.step()
        if has_verbosity(Verbosity.V1_EXPERIMENT):
            _report_progress(runner, progress_reporter)
    tune_taken = time.time() - tune_start

    try:
        runner.checkpoint(force=True)
    except Exception as e:
        logger.warning(f"Trial Runner checkpointing failed: {str(e)}")

    if has_verbosity(Verbosity.V1_EXPERIMENT):
        _report_progress(runner, progress_reporter, done=True)

    wait_for_sync()
    runner.cleanup()

    incomplete_trials = []
    for trial in runner.get_trials():
        if trial.status != Trial.TERMINATED:
            incomplete_trials += [trial]

    if incomplete_trials:
        if raise_on_failed_trial and not state[signal.SIGINT]:
            raise TuneError("Trials did not complete", incomplete_trials)
        else:
            logger.error("Trials did not complete: %s", incomplete_trials)

    all_taken = time.time() - all_start
    if has_verbosity(Verbosity.V1_EXPERIMENT):
        logger.info(f"Total run time: {all_taken:.2f} seconds "
                    f"({tune_taken:.2f} seconds for the tuning loop).")

    if state[signal.SIGINT]:
        logger.warning(
            "Experiment has been interrupted, but the most recent state was "
            "saved. You can continue running this experiment by passing "
            "`resume=True` to `tune.run()`")

    trials = runner.get_trials()
    return ExperimentAnalysis(
        runner.checkpoint_file,
        trials=trials,
        default_metric=metric,
        default_mode=mode,
        sync_config=sync_config)
def complex_call(env_name, agent_name, data, checkpoint, plot=False):
    import ray
    from ray.util.queue import Queue

    num_trainers = 1
    num_evaluators = 1
    num_collectors = config["collectors"]
    parallel_calls = num_trainers + num_collectors + num_evaluators

    is_gpu = bool(tf.config.list_physical_devices('GPU'))
    if is_gpu:
        ray.init(num_cpus=parallel_calls, num_gpus=1)
    else:
        ray.init(num_cpus=parallel_calls)
    # interprocess queue to store recent model weights
    queue = Queue(maxsize=100)
    # ray.init(local_mode=True)  # for debugging
    # queue = None  # queue does not work in debug mode

    if checkpoint is not None:
        # due to https://github.com/deepmind/reverb/issues/12
        path = str(Path(checkpoint).parent)
        checkpointer = reverb.checkpointers.DefaultCheckpointer(path=path)
    else:
        checkpointer = None

    if config["buffer"] == "full_episode":
        # 1 table for an episode
        buffer = storage.UniformBuffer(num_tables=1,
                                       min_size=config["batch_size"],
                                       max_size=config["buffer_size"],
                                       checkpointer=checkpointer)
    else:
        # we need several tables, one for each step size
        buffer = storage.UniformBuffer(num_tables=config["n_points"] - 1,
                                       min_size=config["batch_size"],
                                       max_size=config["buffer_size"],
                                       checkpointer=checkpointer)

    agent_object = AGENTS[agent_name]
    if is_gpu:
        trainer_objects = [
            ray.remote(num_gpus=1 / num_trainers)(agent_object)
            for _ in range(num_trainers)
        ]
    else:
        trainer_objects = [
            ray.remote(agent_object) for _ in range(num_trainers)
        ]
    collector_objects = [
        ray.remote(worker.Collector) for _ in range(num_collectors)
    ]
    evaluator_objects = [
        ray.remote(worker.Evaluator) for _ in range(num_evaluators)
    ]

    # global variable to control the order of getting items from the
    # interprocess queue, a done condition, and weight storage for an
    # evaluator
    workers_info = misc.GlobalVarActor.remote()

    # eval = worker.Evaluator(env_name, config, buffer.table_names,
    #                         buffer.server_port, workers_info=workers_info)
    # _, wins = eval.evaluate_episodes()

    # initialization
    trainer_agents = []
    for i, trainer_object in enumerate(trainer_objects):
        # make a buffer checkpoint only in the first worker
        make_checkpoint = True if i == 0 else False
        trainer_agents.append(
            trainer_object.remote(env_name, config, buffer.table_names,
                                  buffer.server_port,
                                  data=data,
                                  make_checkpoint=make_checkpoint,
                                  ray_queue=queue,
                                  workers_info=workers_info))
    collector_agents = []
    for i, collector_object in enumerate(collector_objects):
        collector_agents.append(
            collector_object.remote(env_name, config, buffer.table_names,
                                    buffer.server_port,
                                    data=data,
                                    make_checkpoint=False,
                                    ray_queue=queue,
                                    worker_id=i + 1,
                                    workers_info=workers_info,
                                    num_collectors=num_collectors))
    evaluator_agents = []
    for evaluator_object in evaluator_objects:
        evaluator_agents.append(
            evaluator_object.remote(env_name, config, buffer.table_names,
                                    buffer.server_port,
                                    workers_info=workers_info))

    # remote call
    trainer_futures = [
        agent.do_train.remote(iterations_number=config["iterations_number"],
                              save_interval=config["save_interval"])
        for agent in trainer_agents
    ]
    collect_info_futures = [
        agent.do_collect.remote() for agent in collector_agents
    ]
    eval_info_futures = [
        agent.do_evaluate.remote() for agent in evaluator_agents
    ]

    # get results
    outputs = ray.get(trainer_futures)
    collect_info = ray.get(collect_info_futures)
    print(f"Collect info: {collect_info}")
    _ = ray.get(eval_info_futures)

    # rewards_array = np.empty(num_trainers)
    # steps_array = np.empty(num_trainers)
    # weights_list, mask_list = [], []
    # for count, (weights, mask, reward, steps, _) in enumerate(outputs):
    #     weights_list.append(weights)
    #     mask_list.append(mask)
    #     rewards_array[count] = reward
    #     steps_array[count] = steps
    #     print(f"Proc #{count}: Average reward = {reward:.2f}, "
    #           f"Steps = {steps:.2f}")
    #     if plot:
    #         misc.plot_2d_array(weights[0], "Zero_lvl_with_reward_"
    #                            + str(reward) + "_proc_" + str(count))
    #         misc.plot_2d_array(weights[2], "First_lvl_with_reward_"
    #                            + str(reward) + "_proc_" + str(count))
    # argmax = rewards_array.argmax()
    # argmax = steps_array.argmax()
    # print(f"to save: Reward = {rewards_array[argmax]:.2f}, "
    #       f"Steps = {steps_array[argmax]:.2f}")
    # data = {
    #     'weights': weights_list[argmax],
    #     'mask': mask_list[argmax],
    #     'reward': rewards_array[argmax]
    # }
    # with open('data/data.pickle', 'wb') as f:
    #     pickle.dump(data, f, protocol=4)

    _, _, checkpoint = outputs[0]
    with open('data/checkpoint', 'w') as text_file:
        print(checkpoint, file=text_file)

    ray.shutdown()
    print("Done")
def to_csv(cls, qc, **kwargs):
    if not cls._to_csv_check_support(kwargs):
        return BaseIO.to_csv(qc, **kwargs)

    # The queue carries the id of the partition whose turn it is to write
    # to the file.
    queue = Queue(maxsize=1)

    def func(df, **kw):
        if kw["partition_idx"] != 0:
            # A new file needs to be created only for the first partition;
            # all the others are recorded in append mode.
            if "w" in kwargs["mode"]:
                kwargs["mode"] = kwargs["mode"].replace("w", "a")
            # It is enough to write the header for the first partition only.
            kwargs["header"] = False

        # For parallelization purposes, each partition is written to an
        # intermediate buffer first.
        path_or_buf = kwargs["path_or_buf"]
        is_binary = "b" in kwargs["mode"]
        if is_binary:
            kwargs["path_or_buf"] = io.BytesIO()
        else:
            kwargs["path_or_buf"] = io.StringIO()
        df.to_csv(**kwargs)
        content = kwargs["path_or_buf"].getvalue()
        kwargs["path_or_buf"].close()

        # Each process waits for its turn to write to the file; if a message
        # is received out of order, it is placed back on the queue.
        while True:
            get_value = queue.get(block=True)
            if get_value == kw["partition_idx"]:
                break
            queue.put(get_value)

        # Prepare to write data from the buffer to the file.
        with pandas.io.common.get_handle(
            path_or_buf,
            # in case when using URL in implicit text mode
            # pandas tries to open `path_or_buf` in binary mode
            kwargs["mode"] if is_binary else kwargs["mode"] + "t",
            encoding=kwargs["encoding"],
            errors=kwargs["errors"],
            compression=kwargs["compression"],
            storage_options=kwargs["storage_options"],
            is_text=False,
        ) as handles:
            handles.handle.write(content)

        # Signal that the next process can start writing to the file.
        queue.put(get_value + 1)

        # Used for synchronization purposes.
        return 0

    # Signal that the partition with id == 0 can be written to the file.
    queue.put(0)

    result = qc._modin_frame._frame_mgr_cls.map_axis_partitions(
        axis=1,
        partitions=qc._modin_frame._partitions,
        map_func=func,
        keep_partitioning=True,
        lengths=None,
        enumerate_partitions=True,
    )

    # wait for completion of all writes
    for rows in result:
        for partition in rows:
            wait([partition.oid])
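# The ordering handshake above can be hard to see inside the Modin plumbing.
# The same idea in isolation: a Queue(maxsize=1) carries a "turn token" (the
# id of the worker allowed to write); every worker re-queues tokens that are
# not its own, writes on its turn, then passes the token to id + 1. This is a
# standalone sketch, not Modin code.
import ray
from ray.util.queue import Queue

ray.init()
turn = Queue(maxsize=1)


@ray.remote
def ordered_write(turn, my_id):
    while True:
        token = turn.get(block=True)
        if token == my_id:
            break
        turn.put(token)  # not our turn yet: put the token back
    print(f"partition {my_id} writes now")
    turn.put(my_id + 1)  # hand the turn to the next partition


writers = [ordered_write.remote(turn, i) for i in range(4)]
turn.put(0)  # partition 0 may start
ray.get(writers)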
import ray
from ray.util.queue import Queue

ray.init()

# You can pass this object around to different tasks/actors
queue = Queue(maxsize=100)


@ray.remote
def consumer(queue):
    next_item = queue.get(block=True)
    print(f'got work {next_item}')


consumers = [consumer.remote(queue) for _ in range(3)]

for i in range(10):
    queue.put(i)

print(ray.nodes())
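# Note that each consumer above takes exactly one item, so with ten items and
# three consumers, seven items stay in the queue. A common extension is a
# consumer that drains the queue until a sentinel (a sketch, not part of the
# original example):
@ray.remote
def looping_consumer(queue):
    while True:
        item = queue.get(block=True)
        if item is None:  # sentinel: producer is done
            break
        print(f'got work {item}')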
def test_put(ray_start_regular_shared):
    q = Queue(1)

    item = 0
    q.put(item, block=False)
    assert q.get() == item

    item = 1
    q.put(item, timeout=0.2)
    assert q.get() == item

    with pytest.raises(ValueError):
        q.put(0, timeout=-1)

    q.put(0)
    with pytest.raises(Full):
        q.put_nowait(1)

    with pytest.raises(Full):
        q.put(1, timeout=0.2)
def test_get(ray_start_regular_shared):
    q = Queue()

    item = 0
    q.put(item)
    assert q.get(block=False) == item

    item = 1
    q.put(item)
    assert q.get(timeout=0.2) == item

    with pytest.raises(ValueError):
        q.get(timeout=-1)

    with pytest.raises(Empty):
        q.get_nowait()

    with pytest.raises(Empty):
        q.get(timeout=0.2)
class Detector(LoggerMixin, SlackMixin):
    def __init__(
        self,
        source: str,
        dest: str,
        batch_size: int,
        tiles: int,
        webhook: str,
        gpu: bool,
    ) -> None:
        SlackMixin.__init__(self, webhook)
        self._source = source
        self._dest = dest
        self._batch_size = batch_size
        self._n_tiles = tiles
        self._threads = []

        self._q_to_file_reader = Queue()
        self._q_freader_to_detector = Queue(maxsize=Config.Q_READER_NET_RUNNER)
        self._q_detector_payload_runner = Queue(
            maxsize=Config.Q_NET_RUNNER_PAYLOAD_RUNNER)
        self._q_payload_runner_fwriter = Queue(
            maxsize=Config.Q_PAYLOAD_RUNNER_WRITER)
        self.logger.info("Queues initialized")

        self._file_reader_thread = FileReaderThread(
            queue_in=self._q_to_file_reader,
            queue_out=self._q_freader_to_detector,
        )
        self._model = YOLOv4("yolov4", device="gpu" if gpu else "cpu")
        self._net_runner_thread = NetRunnerThread(
            queue_in=self._q_freader_to_detector,
            queue_out=self._q_detector_payload_runner,
            model=self._model,
        )
        self._payload_runner = PayloadRunnerActor(
            queue_in=self._q_detector_payload_runner,
            queue_out=self._q_payload_runner_fwriter,
            payload=Config.get_payload(),
        )
        self._result_processor = TheResultProcessor(dest)
        self._result_processor_thread = ResultWriterThread(
            result_writer=self._result_processor,
            queue_in=self._q_payload_runner_fwriter,
        )
        self._threads.append(self._file_reader_thread)
        self._threads.append(self._net_runner_thread)  # type: ignore
        self._threads.append(self._result_processor_thread)  # type: ignore
        self._start()
        self._log_message("Detector started")

    def process_images(self):
        for image_path in self._get_images_to_process():
            self._q_to_file_reader.put(image_path)
            self._log_message(f"Image {os.path.basename(image_path)} "
                              f"sent to file reader")

    def _get_images_to_process(self) -> t.Generator:
        for item in os.listdir(self._source):
            if any(item.endswith(ext.lower()) for ext in Config.ALLOWED_EXTS):
                yield os.path.join(self._source, item)
            else:
                self.logger.warning(
                    f"Cannot process file: {item}. Unsupported extension")

    def _log_message(self, message: str) -> None:
        self.logger.info(message)
        self.slack_msg(message)

    def _start(self) -> None:
        for thread in self._threads:
            thread.start()

    def stop(self) -> None:
        self._q_to_file_reader.put("KILL")
        for thread in self._threads:
            thread.join()
        self._log_message("Detector stopped")
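# The thread classes are defined elsewhere. From stop() above, the "KILL"
# sentinel enters the first queue and has to be forwarded downstream so every
# stage terminates. A minimal sketch of what such a stage might look like
# (an assumed interface, not the actual implementation):
import threading


class PipelineStageSketch(threading.Thread):
    def __init__(self, queue_in, queue_out):
        super().__init__(daemon=True)
        self._queue_in = queue_in
        self._queue_out = queue_out

    def run(self):
        while True:
            item = self._queue_in.get(block=True)
            if item == "KILL":
                self._queue_out.put("KILL")  # propagate shutdown downstream
                break
            self._queue_out.put(self.process(item))

    def process(self, item):
        return item  # placeholder for reading/inference/writing work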
def test_batch(ray_start_regular_shared):
    q = Queue(1)

    with pytest.raises(Full):
        q.put_nowait_batch([1, 2])

    with pytest.raises(Empty):
        q.get_nowait_batch(1)

    big_q = Queue(100)
    big_q.put_nowait_batch(list(range(100)))
    assert big_q.get_nowait_batch(100) == list(range(100))
def run(self,
        worker_fn: Callable,
        callbacks: Optional[List[Callable]] = None) -> List[Any]:
    """Executes the provided function on all workers.

    Args:
        worker_fn: Target elastic function that can be executed.
        callbacks: List of callables. Each callback must either
            be a callable function or a class that implements __call__.
            Every callback will be invoked on every value logged
            by the rank 0 worker.

    Returns:
        List of return values from every completed worker.
    """
    return_values = []

    from ray.util.queue import Queue
    import inspect
    args = inspect.getfullargspec(Queue).args
    if "actor_options" not in args:
        # Ray 1.1 and less
        _queue = Queue()
    else:
        _queue = Queue(actor_options={
            "num_cpus": 0,
            "resources": {
                ray.state.current_node_id(): 0.001
            }
        })
    self.driver.start(
        self.settings.num_proc,
        self._create_spawn_worker_fn(return_values, worker_fn, _queue))

    def _process_calls(queue, callbacks, event):
        if not callbacks:
            return
        while queue.actor:
            if not queue.empty():
                result = queue.get_nowait()
                for c in callbacks:
                    c(result)
            # avoid slamming the CI
            elif event.is_set():
                break
            time.sleep(0.1)

    try:
        event = threading.Event()
        _callback_thread = threading.Thread(target=_process_calls,
                                            args=(_queue, callbacks, event),
                                            daemon=True)
        _callback_thread.start()
        res = self.driver.get_results()
        event.set()
        if _callback_thread:
            _callback_thread.join(timeout=60)
    finally:
        if hasattr(_queue, "shutdown"):
            _queue.shutdown()
        else:
            done_ref = _queue.actor.__ray_terminate__.remote()
            done, not_done = ray.wait([done_ref], timeout=5)
            if not_done:
                ray.kill(_queue.actor)
    self.driver.stop()

    if res.error_message is not None:
        raise RuntimeError(res.error_message)

    for name, value in sorted(res.worker_results.items(),
                              key=lambda item: item[1][1]):
        exit_code, timestamp = value
        if exit_code != 0:
            raise RuntimeError(
                'Horovod detected that one or more processes '
                'exited with non-zero status, thus causing the job to be '
                'terminated. The first process to do so was:\n'
                'Process name: {name}\nExit code: {code}\n'.format(
                    name=name, code=exit_code))

    return_values = [
        value for k, value in sorted(return_values, key=lambda kv: kv[0])
    ]
    return return_values