def save_executed_notebook(self, builderSelf):
    error_results = []

    builderSelf.dask_log['scheduler_info'] = builderSelf.client.scheduler_info()
    builderSelf.dask_log['futures'] = []

    ## create an instance of the HTML converter class if configured
    if (builderSelf.config['jupyter_generate_html']):
        builderSelf._convert_class = convertToHtmlWriter(builderSelf)

    # this for loop gathers results in the background
    total_count = len(builderSelf.futures)
    count = 0
    update_count_delayed = 1
    for future, nb in as_completed(builderSelf.futures, with_results=True, raise_errors=False):
        count += 1
        builderSelf._execute_notebook_class.check_execution_completion(
            builderSelf, future, nb, error_results, count, total_count, 'futures')

    for future, nb in as_completed(builderSelf.delayed_futures, with_results=True, raise_errors=False):
        count += 1
        if update_count_delayed == 1:
            update_count_delayed = 0
            total_count += len(builderSelf.delayed_futures)
        builderSelf._execute_notebook_class.check_execution_completion(
            builderSelf, future, nb, error_results, count, total_count, 'delayed_futures')

    return error_results

def run_job(self, job):
    futures = [
        self.client.submit(task, workers=task.get_locations())
        for task in job.get_tasks()
    ]
    for future, result in dd.as_completed(futures, with_results=True):
        yield result

def process_futures_into_list(future_list):
    """
    Take a list of futures and turn them into a list of results.
    Results must be of the form i, data (where i is the output order)

    :param future_list: list(Futures)
    :return output_list: list(Data)
    """
    DaskController = MPControl.client
    output_list = [None] * len(future_list)
    complete_gen = distributed.as_completed(future_list)

    for finished_future in complete_gen:

        # Jobs can be cancelled in certain situations
        if finished_future.cancelled():
            # Restart cancelled futures and put them back into the work pile
            DaskController.client.retry(finished_future)
            complete_gen.update([finished_future])

        # More likely is jobs erroring as a result of cluster instability
        elif finished_future.status == "error":
            error = finished_future.exception()
            utils.Debug.vprint("Restarting job (Error: {er})".format(er=error), level=1)
            # Restart errored futures and put them back into the work pile
            DaskController.client.retry(finished_future)
            complete_gen.update([finished_future])

        # In the event of success, get the data
        # (guarded by else: calling .result() on a cancelled or errored future would raise)
        else:
            i, result_data = finished_future.result()
            output_list[i] = result_data

    return output_list

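# --- A minimal, hypothetical sketch of the retry-and-requeue idiom used above.
# distributed.as_completed is a live iterator: futures rescheduled with
# Client.retry can be pushed back onto it via .update(), so transient failures
# do not end the loop. `flaky_task` and `items` are assumed placeholders.
from dask.distributed import Client, as_completed

def run_with_retries(client, items):
    futures = client.map(flaky_task, items)
    queue = as_completed(futures)
    results = []
    for future in queue:
        if future.status == "error" or future.cancelled():
            client.retry([future])    # reschedule the failed future
            queue.update([future])    # and keep waiting on it
        else:
            results.append(future.result())
    return results
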
def distributed_main():
    """
    Create a large 2D numpy array, do some expensive computation
    on every element ***IN PARALLEL***, return the sum.
    """
    two_d_array = np.random.rand(10000, Y_DIM)

    # Split the large array into smaller arrays along the Y axis
    # Submit each smaller array as a job
    futures = []
    for i in range(NUM_JOBS):
        start = (i * Y_DIM) // NUM_JOBS
        end = ((i + 1) * Y_DIM) // NUM_JOBS
        print([start, end])
        # Sends lots of data over the network to each worker
        future = client.submit(parallel_func, two_d_array[:, start:end])
        futures.append(future)

    progress(futures)

    total = 0
    for future in as_completed(futures):
        total += future.result()
    print(total)
    return total

def main(self, gmrecords): """Process data using steps defined in configuration file. Args: gmrecords: GMrecordsApp instance. """ logging.info('Running subcommand \'%s\'' % self.command_name) self.gmrecords = gmrecords self._get_events() # get the process tag from the user or define by current datetime self.process_tag = (gmrecords.args.label or datetime.utcnow().strftime(TAG_FMT)) logging.info('Processing tag: %s' % self.process_tag) if gmrecords.args.num_processes: # parallelize processing on events try: client = Client(n_workers=gmrecords.args.num_processes) except BaseException as ex: print(ex) print("Could not create a dask client.") print("To turn off paralleization, use '--num-processes 0'.") sys.exit(1) futures = client.map(self._process_event, self.events) for result in as_completed(futures, with_results=True): print(result) # print('Completed event: %s' % result) else: for event in self.events: self._process_event(event) self._summarize_files_created()
async def pick_frame(self, dataset_uuid, x, y):
    ds = self.data.get_dataset(dataset_uuid)
    x = int(x)
    y = int(y)
    slice_ = Slice(origin=(y, x, 0, 0),
                   shape=(1, 1, ds.shape[2], ds.shape[3]))
    job = PickFrameJob(dataset=ds, slice_=slice_)
    executor = self.data.get_executor()
    log.info("picking %d/%d from %s", x, y, dataset_uuid)
    futures = []
    for task in job.get_tasks():
        submit_kwargs = {}
        futures.append(executor.client.submit(task, **submit_kwargs))
    full_result = np.zeros(shape=ds.shape[2:])
    async for future, result in dd.as_completed(futures, with_results=True):
        for tile in result:
            tile.copy_to_result(full_result)
    log.info("picking done, encoding image (dtype=%s)", full_result.dtype)
    image = await run_blocking(
        _encode_image,
        full_result,
        colormap=cm.gist_earth,
        save_kwargs={'format': 'png'},
    )
    log.info("image encoded, sending response")
    return image.read()

def main(self, gmrecords): """ Assemble data and organize it into an ASDF file. Args: gmrecords: GMrecordsApp instance. """ logging.info('Running subcommand \'%s\'' % self.command_name) self.gmrecords = gmrecords self._get_events() print(self.events) logging.info('Number of events to assemble: %s' % len(self.events)) if gmrecords.args.num_processes: # parallelize processing on events try: client = Client(n_workers=gmrecords.args.num_processes) except BaseException as ex: print(ex) print("Could not create a dask client.") print("To turn off paralleization, use '--num-processes 0'.") sys.exit(1) futures = client.map(self._assemble_event, self.events) for result in as_completed(futures, with_results=True): print(result) # print('Completed event: %s' % result) else: for event in self.events: self._assemble_event(event) self._summarize_files_created()
def run_tasks(self, tasks, cancel_id):
    tasks = list(tasks)
    tasks_wrapped = []

    def _id_to_task(task_id):
        return tasks[task_id]

    for idx, orig_task in enumerate(tasks):
        tasks_wrapped.append(TaskProxy(orig_task, idx))

    futures = self._get_futures(tasks_wrapped)
    self._futures[cancel_id] = futures

    try:
        for future, result_wrap in dd.as_completed(futures, with_results=True):
            if future.cancelled():
                del self._futures[cancel_id]
                raise JobCancelledError()
            result = result_wrap['task_result']
            task = _id_to_task(result_wrap['task_id'])
            yield result, task
    finally:
        if cancel_id in self._futures:
            del self._futures[cancel_id]

def as_completed(self, drain=True):
    """Emit submitted jobs as completed, drain all from the work queue if specified"""
    if drain:
        while self.primed:
            self.submit()
    yield from map(lambda fut: fut.result(), as_completed(self.__running))

def main(self, gmrecords): """Compute waveform metrics. Args: gmrecords: GMrecordsApp instance. """ logging.info('Running subcommand \'%s\'' % self.command_name) self.gmrecords = gmrecords self._get_events() if gmrecords.args.num_processes: # parallelize processing on events try: client = Client(n_workers=gmrecords.args.num_processes) except BaseException as ex: print(ex) print("Could not create a dask client.") print("To turn off paralleization, use '--num-processes 0'.") sys.exit(1) futures = client.map(self._compute_event_waveforms, self.events) for result in as_completed(futures, with_results=True): print(result) else: for event in self.events: self._compute_event_waveforms(event) self._summarize_files_created()
def _iter_dask(self):
    safefunc = functools.partial(safely_call, self.task_func)
    allargs = list(self._genargs())
    yield len(allargs)
    cl = self.dask_client
    for fut in as_completed(cl.map(safefunc, cl.scatter(allargs))):
        yield fut.result()

def norm(self, N=2):
    """Function to compute vector N-norm"""
    norms = self.client.map(_call_norm, self.vecDask, N=N, pure=False)
    norm = 0.0
    for future, result in daskD.as_completed(norms, with_results=True):
        norm += np.power(np.float64(result), N)
    return np.power(norm, 1. / N)

async def get_preview_image(self, dataset_uuid):
    ds = self.data.get_dataset(dataset_uuid)
    job = SumFramesJob(dataset=ds)
    executor = self.data.get_executor()
    log.info("creating preview for dataset %s" % dataset_uuid)
    futures = []
    for task in job.get_tasks():
        submit_kwargs = {}
        futures.append(executor.client.submit(task, **submit_kwargs))
    log.info("preview futures created")
    full_result = np.zeros(shape=ds.shape[2:])
    async for future, result in dd.as_completed(futures, with_results=True):
        for tile in result:
            tile.copy_to_result(full_result)
    log.info("preview done, encoding image (dtype=%s)", full_result.dtype)
    image = await run_blocking(
        _encode_image,
        full_result,
        colormap=cm.gist_earth,
        save_kwargs={'format': 'png'},
    )
    log.info("image encoded, sending response")
    return image.read()

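# --- A minimal sketch of the `async for` usage shown in the two coroutines
# above: with an asynchronous client, as_completed can be consumed inside a
# coroutine so the event loop is not blocked between completions.
# `work` is an assumed placeholder function.
import asyncio
from dask.distributed import Client, as_completed

async def sum_in_completion_order():
    client = await Client(asynchronous=True)
    futures = [client.submit(work, i) for i in range(8)]
    total = 0
    async for future, result in as_completed(futures, with_results=True):
        total += result
    await client.close()
    return total

# asyncio.run(sum_in_completion_order())
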
def collect_results(self, **kwa):
    """ collect (and log) results as they become available (this will block) """
    if kwa.get('all'):
        self.logger.info('collect all results')
        futures = as_completed(self.futures)
    else:
        self.logger.info('collect already done results only')
        futures = filter(lambda f: f.done(), self.futures)

    for xi, future in enumerate(futures):
        self.results_collected += 1
        result = future.result()
        key = future.key
        # future.cancel()
        self.logger.debug('[{xi}] future {key} yielded {result}'.format(
            xi=xi, key=key, result=result))
        self.results.append(dict(
            index=xi,
            result=result,
        ))
    self.log_status('collect_results')

def min(self):
    """Function to obtain minimum value within a vector"""
    mins = self.client.map(_call_min, self.vecDask, pure=False)
    min_val = np.inf
    for future, result in daskD.as_completed(mins, with_results=True):
        if result < min_val:
            min_val = result
    return min_val

def max(self):
    """Function to obtain maximum value within a vector"""
    maxs = self.client.map(_call_max, self.vecDask, pure=False)
    max_val = -np.inf
    for future, result in daskD.as_completed(maxs, with_results=True):
        if result > max_val:
            max_val = result
    return max_val

def run_tasks(self, tasks, cancel_id):
    futures = self._get_futures(tasks)
    self._futures[cancel_id] = futures
    for future, result in dd.as_completed(futures, with_results=True):
        if future.cancelled():
            raise JobCancelledError()
        yield result
    del self._futures[cancel_id]

def task_result_thread(ntasks, futures, runs):
    run_map = dict(zip(futures, runs))
    for i, cf in enumerate(as_completed(futures)):
        run = run_map[cf]
        status, addr, elap, loadavg = cf.result()
        if status == 'canceled':
            continue
        logging.info('{:>3}/{:<3} {:<9} {:<27} {:<22} {:4.1f}s {}'.format(
            i + 1, ntasks, status, addr, format_loadavg(loadavg), elap, run))

def run_job(self, job):
    futures = []
    for task in job.get_tasks():
        submit_kwargs = {}
        if not self.is_local:
            submit_kwargs['workers'] = task.get_locations()
        futures.append(self.client.submit(task, **submit_kwargs))
    for future, result in dd.as_completed(futures, with_results=True):
        yield result

def results(self):
    """Blocks until complete"""
    # submit remaining jobs
    while self.primed:
        self.submit()
    for fut in as_completed(self.__running):
        self.__results.append(fut.result())
    return self.__results

def batch_submit(
    func,
    *iterables,
    batch_size=None,
    return_results=False,
    raise_error=False,
    **kwargs,
):
    if not all_equal(len(iterable) for iterable in iterables):
        raise ValueError("iterables do not have the same length")

    with get_client(auto_spawn=False) as client:
        batch_size = batch_size if batch_size is not None else len(client.ncores())
        logger.debug(f"batch submission, size={batch_size}")

        # jump start
        iterables = zip(*iterables)
        futures = []
        for i in range(batch_size):
            try:
                futures.append(client.submit(func, *next(iterables), **kwargs))
            except StopIteration:
                logger.warning(
                    f"batch size ({batch_size}) is larger than number of iterable elements"
                )
                break

        if return_results:
            results = []
        queue = as_completed(futures, with_results=False)
        while queue.count():
            for batches in queue.batches():
                n = len(batches)
                for future in batches:
                    try:
                        result = future.result()
                    except Exception as err:
                        if raise_error:
                            raise
                        elif return_results:
                            result = err
                    if return_results:
                        results.append(result)
                    del future  # release the future
                # submit new tasks if there are any
                for i in range(n):
                    try:
                        queue.add(client.submit(func, *next(iterables), **kwargs))
                    except StopIteration:
                        break

    if return_results:
        return results

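# --- A minimal sketch of the queue.batches() / queue.count() idiom from
# batch_submit above: batches() yields everything that finished since the
# previous iteration, which cuts per-future overhead when completions arrive
# in bursts, and count() reports how many futures are still pending.
# `work` is an assumed placeholder function.
from dask.distributed import Client, as_completed

client = Client()
queue = as_completed([client.submit(work, i) for i in range(100)])
done = 0
for batch in queue.batches():  # each batch: all futures finished since last yield
    done += len(batch)
    print(f"{done} done, {queue.count()} pending")
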
def collect_output(futures, output_path, **kwargs):
    '''
    Collects the output from the list of futures and merges them into a dataframe.
    The dataframe will then be written to a file as specified by the output_path.
    The dataframe df_started_runs is joined with the job outputs to get the real ontime.
    '''
    if output_path is not None:
        logger.info('Concatenating results from each job into {}'.format(output_path))

    n_success = 0
    n_total = 0
    result_iterator = as_completed(futures, with_results=True, raise_errors=False)

    with Writer(output_path) as writer:
        for (future, result) in result_iterator:
            # with raise_errors=False, a failed future yields its error info
            # as a (type, exception, traceback) tuple instead of raising
            if isinstance(result, tuple):
                exc_type, exc_value, tb = result
                logger.error('Exception running job: {}'.format(exc_value))
                logger.error('\n'.join(format_tb(tb)))
                continue

            if not result['success']:
                logger.error('Job errored with reason "{}"'.format(result["reason"]))
                continue

            n_success += 1

            events = result.get('events')
            if events is None:
                output = result['outputfile']
                logger.info('Job wrote output to local file {}'.format(output))
                continue

            n_events = len(events)
            logger.info('There are {} events in the result'.format(n_events))
            if n_events == 0:
                continue
            n_total += n_events

            events.columns = rename_columns(events.columns)
            add_theta_deg_columns(events)
            writer.append(events)
            logger.info('Result written successfully')

    if output_path is not None:
        logger.info('Wrote a total of {} events from {} successful runs to {}'.format(
            n_total, n_success, output_path))

def main():
    args = parse_args()
    logging.info(args)

    cluster = init_cluster(args)
    client = Client(cluster)
    future_list = client.map(dummy_function, range(args.n_jobs))
    logging.info(cluster.job_script())

    for future in as_completed(future_list):
        exception = future.exception()
        traceback.print_exception(type(exception), exception, future.traceback())

def run_search(self):
    syncWarmupFlag = self.HPOConfig['asyncWarmupFlag']
    startTime = time.time()

    if syncWarmupFlag:
        print('sync warmup')
        super().run_search(asyncInitializeFlag=True)
        print('sync warmup complete\n')
        print('continuing with async search')
    else:
        self.reset_swarm()
        self.scatter_data_to_workers()
        self.build_initial_particles()

    futureEvalParticles = self.client.compute(self.delayedEvalParticles)
    particleFutureSeq = as_completed(futureEvalParticles)

    # particleFutureSeq is an iterator of futures, to which we append newly updated particles
    for particleFuture in particleFutureSeq:
        testDataPerf, trainDataPerf, pID, nTrees, evalTime = particleFuture.result()

        self.log_particle_history(testDataPerf, trainDataPerf, pID, nTrees, evalTime)
        self.update_particle(testDataPerf, pID, nTrees, evalTime, wExplore=0)
        self.swarmEvals += 1

        # termination condition
        approximateEpoch = self.swarmEvals // self.nParticles
        if approximateEpoch > self.nEpochs:
            break

        # create delayed evaluations for newly updated particles
        delayedParticle = delayed(evaluate_particle)(
            self.scatteredDataFutures,
            self.particles[pID].pos,
            self.paramRanges,
            self.particles[pID].pID,
            self.dataset.trainObjective,
            cpuFlag=self.cpuFlag)
        futureParticle = self.client.compute(delayedParticle)
        particleFutureSeq.add(futureParticle)

        # print progress update via approximate epoch
        if self.swarmEvals % self.nParticles == 0:
            print(f'> async epoch {approximateEpoch} of {self.nEpochs}')

    self.elapsedTime = time.time() - startTime
    self.report_final_params()

def train_mcts(mcts_dir, num_transforms, train, dataset, model, trainset_dir,
               evalset_dir, max_demos_train, max_demos_eval, cluster, num_gpus,
               exploration_cst, score_name, resume, **unused_kwargs):
    # run all the transformations and save them in <mcts_dir>/transformations
    utils.test_transformations(train, dataset, model, trainset_dir, evalset_dir, mcts_dir)

    print('Initializing GPU workers...')
    client = cluster_utils.make_client(cluster, 'gpu', num_gpus, mcts_dir, no_nanny=True)

    print('Starting the training...')
    mc_tree = make_tree(mcts_dir, num_transforms, exploration_cst, score_name, resume)
    iter_start = mc_tree.iterations
    print('MCTS of depth {} and exploration constant {}'.format(
        num_transforms, exploration_cst))

    args_worker = [
        train, dataset, model, trainset_dir, evalset_dir, mcts_dir,
        max_demos_train, max_demos_eval
    ]
    args_workers_all = [[arg] * num_gpus for arg in [mc_tree] + args_worker]
    # log_ids will be the list of iter_mcts as well (in the beginning)
    log_ids = list(range(iter_start, iter_start + num_gpus))
    futures = client.map(get_score, *args_workers_all, log_ids, log_ids)
    jobs_queue = distributed.as_completed(futures)

    for iter_mcts, future in enumerate(jobs_queue):
        try:
            path, policy_scores = future.result()
            mc_tree.add_path(path, policy_scores)
            mc_tree.save(mcts_dir)
            print('MCTS iteration {}'.format(iter_start + iter_mcts))
            print('\tpath {}\n\tscore {:.3f}\n\terror {:.3f}cm\n'.format(
                path, policy_scores[score_name],
                (1 - policy_scores[score_name]) * 10))
        except Exception:
            print('WARNING: one of the MCTS workers died, gracefully ignoring it')
        iter_mcts_future = iter_start + iter_mcts + num_gpus
        log_id = iter_mcts_future % num_gpus
        new_future = client.submit(get_score, mc_tree, *args_worker,
                                   iter_mcts_future, log_id)
        jobs_queue.add(new_future)
        if iter_mcts > 0 and iter_mcts % 10 == 0:
            utils.print_mcts_score(mc_tree)

def collect_result(self, futures, results, box, submission_time):
    """Compile results from completed workers and recombine their sub-outputs
    into the output for the complete box being worked on.

    :param futures: list(dask.Future), list of futures representing future
        dask worker calculations
    :param results: list[numpy.nd.array], arrays of the appropriate structure
        representing the final output of processed box (need to be in the same
        order as the function passed in submit_workers returns in)
    :param box: numpy.nd.array, the initial complete box being processed
    :param submission_time: time, the time of submission of the dask workers
        (used to determine worker runtimes as a performance diagnostic)
    :return: results: tuple(numpy.nd.arrays), the processed results of the box
    """
    from dask.distributed import as_completed

    num_future = 0
    for future, sub_results in as_completed(futures, with_results=True):
        # message
        num_future += 1
        sub_t = time.time() - submission_time
        print("FUTURE #{} complete. Time used: {:.0f} seconds".format(num_future, sub_t))

        # catch result - sub_box
        # and convert the absolute sub_box into local col/row start/end
        # relative to the primary box, to assemble the result from each worker
        sub_box = sub_results[-1]
        x0, y0, x1, y1 = sub_box
        x0 -= box[0]
        x1 -= box[0]
        y0 -= box[1]
        y1 -= box[1]

        # catch result - matrices
        # and loop across all of the returned data to rebuild the complete box
        for i, sub_result in enumerate(sub_results[:-1]):
            num_dim = sub_result.ndim
            if num_dim == 4:
                results[i][:, :, y0:y1, x0:x1] = sub_result
            elif num_dim == 3:
                results[i][:, y0:y1, x0:x1] = sub_result
            elif num_dim == 2:
                results[i][y0:y1, x0:x1] = sub_result
            else:
                msg = "worker result has unexpected dimension: {}".format(num_dim)
                msg += '\nit should be either 2, 3 or 4!'
                raise Exception(msg)

    return results

def __init__(self, scheduler_host=None, scatter=None, client=None, loop=None,
             wait_for_workers_timeout=10, **submit_kwargs):
    super().__init__()

    if distributed is None:
        msg = ("You are trying to use 'dask' as a joblib parallel backend "
               "but dask is not installed. Please install dask "
               "to fix this error.")
        raise ValueError(msg)

    if client is None:
        if scheduler_host:
            client = Client(scheduler_host, loop=loop, set_as_default=False)
        else:
            try:
                client = get_client()
            except ValueError as e:
                msg = ("To use Joblib with Dask first create a Dask Client"
                       "\n\n"
                       "    from dask.distributed import Client\n"
                       "    client = Client()\n"
                       "or\n"
                       "    client = Client('scheduler-address:8786')")
                raise ValueError(msg) from e

    self.client = client

    if scatter is not None and not isinstance(scatter, (list, tuple)):
        raise TypeError("scatter must be a list/tuple, got "
                        "`%s`" % type(scatter).__name__)

    if scatter is not None and len(scatter) > 0:
        # Keep a reference to the scattered data to keep the ids the same
        self._scatter = list(scatter)
        scattered = self.client.scatter(scatter, broadcast=True)
        self.data_futures = {id(x): f for x, f in zip(scatter, scattered)}
    else:
        self._scatter = []
        self.data_futures = {}

    self.wait_for_workers_timeout = wait_for_workers_timeout
    self.submit_kwargs = submit_kwargs
    self.waiting_futures = as_completed(
        [],
        loop=client.loop,
        with_results=True,
        raise_errors=False)
    self._results = {}
    self._callbacks = {}

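# --- A minimal sketch of the pattern behind `waiting_futures` above: an
# as_completed queue created empty and fed over time via .add(). With
# raise_errors=False and with_results=True, a failed future is still yielded;
# its result slot holds a (type, exception, traceback) triple instead of
# raising (compare collect_output further up, which unpacks the same tuple).
# `work` is an assumed placeholder function.
from dask.distributed import Client, as_completed

client = Client()
queue = as_completed([], with_results=True, raise_errors=False)
for i in range(10):
    queue.add(client.submit(work, i))
for future, result in queue:
    if future.status == "error":
        exc_type, exc_value, tb = result  # error info instead of a raise
        print("failed:", exc_value)
    else:
        print("ok:", result)
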
def dot(self, other):
    """Function to compute dot product between two vectors"""
    checkVector(self, other)
    dots = self.client.map(_call_dot, self.vecDask, other.vecDask, pure=False)
    # Adding all the results together
    dot = 0.0
    for future, result in daskD.as_completed(dots, with_results=True):
        dot += np.float64(result)
    return dot

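# --- A minimal sketch of the streaming-reduction idiom shared by norm, min,
# max and dot above: map a kernel over chunks, then fold the partial results
# in completion order. Arrival order does not matter because the fold is
# commutative. `chunk_sum` and `chunks` are assumed placeholders.
from dask.distributed import Client, as_completed

client = Client()
futures = client.map(chunk_sum, chunks, pure=False)
total = 0.0
for future, partial in as_completed(futures, with_results=True):
    total += partial
print(total)
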
async def run_job(self, uuid, ds, job, full_result):
    self.data.register_job(uuid=uuid, job=job)
    executor = self.data.get_executor()

    futures = []
    for task in job.get_tasks():
        submit_kwargs = {}
        futures.append(executor.client.submit(task, **submit_kwargs))

    self.write(Message(self.data).start_job(job_id=uuid))
    self.finish()
    msg = Message(self.data).start_job(job_id=uuid)
    log_message(msg)
    self.event_registry.broadcast_event(msg)

    async for future, result in dd.as_completed(futures, with_results=True):
        # TODO:
        # + only send PNG of area that has changed (bounding box of all result tiles!)
        # + normalize each channel (per channel: keep running min/max, map data to [0, 1])
        # + if min/max changes, send whole channel (all results up to this point re-normalized)
        # + maybe saturate up to some point (20% over current max => keep current max) and send
        #   whole result image once finished
        # + maybe use visualization framework in-browser (example: GR)
        # TODO: update task_result message:
        # + send bbox for blitting
        for tile in result:
            tile.copy_to_result(full_result)
        images = yield full_result

        # NOTE: make sure the following broadcast_event messages are sent atomically!
        # (that is: keep the code below synchronous, and only send the messages
        # once the images have finished encoding, and then send all at once)
        msg = Message(self.data).task_result(
            job_id=uuid,
            num_images=len(images),
        )
        log_message(msg)
        self.event_registry.broadcast_event(msg)
        for image in images:
            raw_bytes = image.read()
            self.event_registry.broadcast_event(raw_bytes, binary=True)

    images = yield full_result
    msg = Message(self.data).finish_job(
        job_id=uuid,
        num_images=len(images),
    )
    log_message(msg)
    self.event_registry.broadcast_event(msg)
    for image in images:
        raw_bytes = image.read()
        self.event_registry.broadcast_event(raw_bytes, binary=True)

def run_tasks(
    self,
    tasks: Iterable[TaskProtocol],
    params_handle: Any,
    cancel_id: Any,
):
    tasks = list(tasks)
    tasks_w_index = list(enumerate(tasks))

    def _id_to_task(task_id):
        return tasks[task_id]

    workers = self.get_available_workers()
    threaded_executor = workers.has_threaded_workers()

    self._futures[cancel_id] = []
    initial = []

    for w in range(int(len(workers))):
        if not tasks_w_index:
            break
        idx, wrapped_task = tasks_w_index.pop(0)
        future = self._get_future(wrapped_task, workers, idx, params_handle,
                                  threaded_executor)
        initial.append(future)
        self._futures[cancel_id].append(future)

    try:
        as_completed = dd.as_completed(initial, with_results=True, loop=self.client.loop)
        for future, result_wrap in as_completed:
            if future.cancelled():
                del self._futures[cancel_id]
                raise JobCancelledError()
            result = result_wrap['task_result']
            task = _id_to_task(result_wrap['task_id'])
            if tasks_w_index:
                idx, wrapped_task = tasks_w_index.pop(0)
                future = self._get_future(
                    wrapped_task,
                    workers,
                    idx,
                    params_handle,
                    threaded_executor,
                )
                as_completed.add(future)
                self._futures[cancel_id].append(future)
            yield result, task
    finally:
        if cancel_id in self._futures:
            del self._futures[cancel_id]

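# --- A minimal sketch of the bounded-concurrency scheduling used by
# run_tasks above (and by batch_submit earlier): prime the queue with one
# future per slot, then add a new one each time one completes, so at most
# `limit` tasks are in flight at once. `work` is an assumed placeholder.
from dask.distributed import Client, as_completed

def bounded_map(client, work, items, limit=8):
    items = iter(items)
    queue = as_completed(
        [client.submit(work, x) for _, x in zip(range(limit), items)]
    )
    results = []
    for future in queue:
        results.append(future.result())
        try:
            queue.add(client.submit(work, next(items)))
        except StopIteration:
            pass  # no work left; drain the remaining futures
    return results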