def _parallel_run_many(self, generator, execution: MLClientCtx, runobj: RunObject) -> RunList:
    results = RunList()
    tasks = generator.generate(runobj)
    handler = runobj.spec.handler
    self._force_handler(handler)
    set_paths(self.spec.pythonpath)
    _, handler = self._get_handler(handler)

    client, function_name = self._get_dask_client(generator.options)
    parallel_runs = generator.options.parallel_runs or 4
    queued_runs = 0
    num_errors = 0

    def process_result(future):
        nonlocal num_errors
        resp, sout, serr = future.result()
        runobj = RunObject.from_dict(resp)
        try:
            log_std(self._db_conn, runobj, sout, serr, skip=self.is_child)
            resp = self._update_run_state(resp)
        except RunError as err:
            resp = self._update_run_state(resp, err=str(err))
            num_errors += 1
        results.append(resp)
        if num_errors > generator.max_errors:
            logger.error("max errors reached, stopping iterations!")
            return True
        run_results = resp["status"].get("results", {})
        stop = generator.eval_stop_condition(run_results)
        if stop:
            logger.info(
                f"reached early stop condition ({generator.options.stop_condition}), stopping iterations!"
            )
        return stop

    completed_iter = as_completed([])
    for task in tasks:
        resp = client.submit(
            remote_handler_wrapper, task.to_json(), handler, self.spec.workdir)
        completed_iter.add(resp)
        queued_runs += 1
        if queued_runs >= parallel_runs:
            future = next(completed_iter)
            early_stop = process_result(future)
            queued_runs -= 1
            if early_stop:
                break

    for future in completed_iter:
        process_result(future)

    client.close()
    if function_name and generator.options.teardown_dask:
        logger.info("tearing down the dask cluster..")
        mlrun.get_run_db().delete_runtime_object("dask", function_name, force=True)

    return results
def execute_list_of_clustering_tasks_chunked(clustering_tasks, tests_per_batch=None):
    if tests_per_batch is None:
        tests_per_batch = len(clustering_tasks)
    with Client(address=SCHEDULER_HOSTNAME) as client:
        nb_of_chunks = math.ceil(len(clustering_tasks) / tests_per_batch)
        for chunk in tqdm(chunks(clustering_tasks, tests_per_batch), desc="chunks", total=nb_of_chunks):
            dataset_dict = scatter_datasets(client)
            # tasks_ready_to_execute = fill_dataset_in_clustering_tasks(chunk, dataset_dict)
            futures = []
            for task in chunk:
                futures.append(
                    client.submit(ClusteringTask.run, task,
                                  dataset_dict[task.dataset_name], pure=False))
            # futures = client.map(lambda x: ClusteringTask.run(x), tasks_ready_to_execute, pure=False)
            for _ in tqdm(distributed.as_completed(futures, with_results=True, raise_errors=False),
                          total=len(futures), desc="tasks in chunk", leave=False):
                pass
def get_minmax(varname, varpath, client):
    futures = list()
    numfiles = 0
    for root, _, files in os.walk(varpath):
        if not files:
            continue
        numfiles = len(files)
        pbar = tqdm(files, desc="{}".format(varname))
        for i, f in enumerate(sorted(files)):
            inpath = os.path.join(root, f)
            futures.append(client.submit(run_minmax, inpath, varname, i))

    mins = []
    mins2d = [[] for x in range(numfiles)]
    maxs = []
    maxs2d = [[] for x in range(numfiles)]

    for future, minmax in as_completed(futures, with_results=True):
        pbar.update(1)
        mins2d[minmax[2]] = minmax[0]
        maxs2d[minmax[2]] = minmax[1]

    pbar.close()
    for l in mins2d:
        for i in l:
            mins.append(i)
    for l in maxs2d:
        for i in l:
            maxs.append(i)
    return mins, maxs
def main():
    args = parse_args()
    cluster = LocalCluster(
        n_workers=int(args.processes),
        threads_per_worker=1,
        interface='lo')
    client = Client(address=cluster.scheduler_address)

    contents = os.listdir(args.input)
    futures = []
    with open(args.hashfile, 'r') as hashfile:
        # iterate over the lines of the hash file (not the characters of the first line)
        for line in hashfile:
            name, expected_hash = line.split('|')
            _, name = os.path.split(name)
            if name not in contents:
                continue
            futures.append(
                client.submit(
                    checkhash,
                    os.path.join(args.input, name),
                    expected_hash))

    for future in as_completed(futures):
        path, match = future.result()
        print(path, match)
def sample_problems(self, n_min_objects=1, n_max_objects=12, n_configs=100,
                    workspace=((0.25, -0.25), (0.75, 0.25)),
                    radius=0.0375, timeout=2.0, n_max_iter=50):
    """
    n_min_objects, n_max_objects: interval of number of objects to consider for the problems
    n_configs: number of configurations for each number of objects
    workspace: workspace in which to sample objects
    radius: radius for collision checks
    timeout: time limit that defines when a configuration should be discarded
        because more objects cannot be added
    n_max_iter: number of times we allow timeout to be reached
    """
    workspace = np.asarray(workspace).tolist()
    self.infos = dict(workspace=workspace, radius=radius)

    def sample_state(n_objects):
        sampler = StateSampler(workspace=workspace, radius=radius)
        n_iter = 0
        success = False
        while not success:
            if n_iter > n_max_iter:
                raise SamplerError('Too many objects for this workspace.')
            try:
                state = sampler(n_objects, max_time=timeout)
                success = True
            except SamplerError:
                success = False
            n_iter += 1
        return state

    def sample_src_tgt(n_objects, seed):
        np.random.seed(seed)
        src = sample_state(n_objects)
        tgt = sample_state(n_objects)
        return src, tgt, n_objects

    eval_df = defaultdict(list)
    futures = []
    for n in np.arange(n_min_objects, n_max_objects + 1):
        for _ in range(n_configs):
            fut = self.client.submit(sample_src_tgt, n, np.random.randint(2**32 - 1))
            futures.append(fut)

    print("Sampling configurations ...")
    for fut in tqdm(as_completed(futures), total=len(futures)):
        src, tgt, n_objects = fut.result()
        eval_df['src'].append(src)
        eval_df['tgt'].append(tgt)
        eval_df['n_objects'].append(n_objects)

    self.eval_df = pd.DataFrame(eval_df)
    self.eval_df = self.eval_df.sort_values('n_objects').reset_index(drop=True)
def run(self):
    """ Execute the algorithm. """
    self.start_computing_time = time.time()

    population_to_evaluate = self.create_initial_solutions()
    task_pool = as_completed(self.evaluate(population_to_evaluate))

    self.init_progress()

    auxiliar_population = []
    for future in task_pool:
        # The initial population is not full
        if len(auxiliar_population) < self.population_size:
            received_solution = future.result()
            auxiliar_population.append(received_solution)

            new_task = self.client.submit(self.problem.evaluate, self.problem.create_solution())
            task_pool.add(new_task)
        # Perform an algorithm step to create a new solution to be evaluated
        else:
            offspring_population = []
            if not self.stopping_condition_is_met():
                offspring_population.append(future.result())

                # Replacement
                join_population = auxiliar_population + offspring_population
                auxiliar_population = RankingAndCrowdingDistanceSelection(
                    self.population_size).execute(join_population)

                # Selection
                mating_population = []
                for _ in range(2):
                    solution = self.selection_operator.execute(population_to_evaluate)
                    mating_population.append(solution)

                # Reproduction and evaluation
                new_task = self.client.submit(reproduction, mating_population, self.problem,
                                              self.crossover_operator, self.mutation_operator)
                task_pool.add(new_task)
            else:
                for future in task_pool.futures:
                    future.cancel()
                break

        self.evaluations += 1
        self.solutions = auxiliar_population

        self.update_progress()

    self.total_computing_time = time.time() - self.start_computing_time
    self.solutions = auxiliar_population
def end(self) -> None:
    if not self.client:
        raise AirflowException(NOT_STARTED_MESSAGE)
    if not self.futures:
        raise AirflowException(NOT_STARTED_MESSAGE)
    self.client.cancel(list(self.futures.keys()))
    for future in as_completed(self.futures.copy()):
        self._process_future(future)
def executor_map(func, iterable):
    """ orderless parallel map """
    # TODO: queues..
    import distributed  # massively slow to import...
    executor = distributed.Executor()
    futures = executor.map(func, iterable)  # should return immediately
    for future in distributed.as_completed(futures):  # blocks
        assert future.done()
        yield future.result()
def mysum():
    result = 0
    sub_tasks = [delayed(double)(i) for i in range(100)]

    with worker_client() as lc:
        futures = lc.compute(sub_tasks)
        for f in as_completed(futures):
            result += f.result()
    return result
def eval_solver(self, solver, method_name='MyMethod', add_fields=[]):
    """
    solver: The solver to evaluate.
    method_name: Name of the method to evaluate; this name will appear in the results dataframe.
    add_fields: List containing additional fields that should be saved.
        These fields must be present in the output dict of the solver.
    """
    all_outputs = defaultdict(list)
    save_fields = ['success', 'n_moves', 'n_collision_checks'] + list(add_fields)

    def run_solver(solver, problem, problem_idx):
        outputs = solver(problem)
        assert "solved" in outputs
        if outputs['solved']:
            assert 'actions' in outputs
            actions = outputs['actions']
            # Verify that the problem is indeed solved.
            # This will stop the evaluation if the action sequence found doesn't solve the problem
            # or doesn't respect the constraints (objects within workspace and no collisions).
            problem.assert_solution_valid(actions)
            outputs['n_moves'] = len(actions)
            outputs['success'] = True
        else:
            outputs['success'] = False
            outputs['n_moves'] = np.nan
            outputs['n_collision_checks'] = np.nan
        return outputs, problem_idx

    futures = []
    print(f"{method_name} evaluation ...")
    for row_idx, row in enumerate(self.eval_df.itertuples()):
        src = row.src
        tgt = row.tgt
        problem = RearrangementProblem(src=src, tgt=tgt,
                                       workspace=self.infos['workspace'],
                                       radius=self.infos['radius'])
        fut = self.client.submit(run_solver, solver, problem, row_idx)
        futures.append(fut)

    for fut in tqdm(as_completed(futures), total=len(futures)):
        outputs, problem_idx = fut.result()
        all_outputs['problem_idx'].append(problem_idx)
        for k in save_fields:
            all_outputs[f'{method_name}/{k}'].append(outputs[k])
    print("Done")

    sort_ids = np.argsort(all_outputs['problem_idx'])
    del all_outputs['problem_idx']
    all_outputs = pd.DataFrame(all_outputs)
    all_outputs = all_outputs.iloc[sort_ids].reset_index(drop=True)
    self.eval_df = pd.concat((self.eval_df, all_outputs), axis=1)
    return
def test_as_completed(client):
    ac = client.submit(Counter, actor=True).result()
    futures = [ac.increment() for _ in range(10)]
    max = 0
    for future in as_completed(futures):
        value = future.result()
        if value > max:
            max = value
    assert all(future.done() for future in futures)
    assert max == 10
def solve(self):
    # start off some particles
    futures = []
    for particle in self.particles:  # [:max(int((self.num_workers + 1) / self.replications), 1)]:
        futures.append(self.create_parallel_particle_future(particle.name))
        # time.sleep(60)

    completed = as_completed(futures, with_results=True)
    for batch in completed.batches():
        for future, (particle_num, score, position, velocity) in batch:
            self.particles[particle_num].update_score_position_velocity(score, position, velocity)

            # particle not running anymore
            self.particles_running[particle_num] = False

            # update the epoch
            self.particle_epochs_completed[particle_num] += 1

            # see if there's a new best score
            self.update_global(score, position)

            # print("-- tassie devil\n{}\n{}\n{}\n".format([particle.name for particle in self.particles],
            #                                              self.particle_epochs_completed,
            #                                              self.particles_running))

            # find the next particle (min epochs done and not currently running)
            min_epochs = sys.maxsize
            next_particle_pos = -1
            for pos, (is_running, epochs_done) in enumerate(
                    zip(self.particles_running, self.particle_epochs_completed)):
                if not is_running and epochs_done < min_epochs:
                    min_epochs = epochs_done
                    next_particle_pos = pos
            # print("min: {}".format(min_epochs))

            if min_epochs < self.iterations:
                # not done yet - find the min particle
                particle = self.particles[next_particle_pos]
                # print("index={}, particle_num={}".format(particle_num, particle.name))

                # update for the next run
                particle.update_velocity(self.global_best_position)
                particle.update_position()

                # score the particle position
                completed.add(
                    self.create_parallel_particle_future(particle_num))

    # do something with the results now
    print("=========== Done ({}) ===============".format(self.iterations))
    print(self)
def test_as_completed_async_for(c, s, a, b):
    futures = c.map(inc, range(10))
    ac = as_completed(futures)
    results = []

    async def f():
        async for future in ac:
            result = await future
            results.append(result)

    yield f()
    assert set(results) == set(range(1, 11))
def test_as_completed_async_for_results(c, s, a, b):
    futures = c.map(inc, range(10))
    ac = as_completed(futures, with_results=True)
    results = []

    async def f():
        async for future, result in ac:
            results.append(result)

    yield f()
    assert set(results) == set(range(1, 11))
    assert not s.counters['op'].components[0]['gather']
def iterate():
    locs = map(coords.get_loc, coords.drop_duplicates())
    if client is not None:
        futures = client.map(
            self._compute_anomaly,
            [data.isel(**{agedim: sl}) for sl in locs],
            **kwargs)
        for future, result in distributed.as_completed(futures, with_results=True):
            futures.remove(future)
            yield result
    else:
        for sl in locs:
            yield self._compute_anomaly(data.isel(**{agedim: sl}), **kwargs)
def eval_population(population, client, context=context):
    """ Concurrently evaluate all the individuals in the given population

    :param population: to be evaluated
    :param client: dask client
    :param context: for storing count of non-viable individuals
    :return: dask distributed iterator for futures
    """
    # farm out population to worker nodes for evaluation
    worker_futures = client.map(evaluate(context=context), population, pure=False)

    # We'll need this later to catch eval tasks as they complete, and to
    # submit new tasks.
    return distributed.as_completed(worker_futures)
def record_dataset_dask(client, ds_dir, scene_cls, scene_kwargs,
                        n_chunks, n_frames_per_chunk,
                        start_seed=0, resume=False):
    seeds = set(range(start_seed, start_seed + n_chunks))
    if resume:
        done_seeds = (ds_dir / 'seeds_recorded.txt').read_text().strip().split('\n')
        seeds = set(seeds) - set(map(int, done_seeds))
        all_keys = (ds_dir / 'keys_recorded.txt').read_text().strip().split('\n')
    else:
        all_keys = []
    seeds = tuple(seeds)

    future_kwargs = []
    for seed in seeds:
        kwargs = dict(ds_dir=ds_dir, seed=seed,
                      n_frames=n_frames_per_chunk,
                      scene_cls=scene_cls,
                      scene_kwargs=scene_kwargs)
        future_kwargs.append(kwargs)

    futures = []
    for kwargs in future_kwargs:
        futures.append(client.submit(record_chunk, **kwargs))

    iterator = as_completed(futures)
    unit = 'frame'
    unit_scale = n_frames_per_chunk
    n_futures = len(future_kwargs)
    tqdm_iterator = tqdm(iterator, total=n_futures, unit_scale=unit_scale,
                         unit=unit, ncols=80)

    seeds_file = open(ds_dir / 'seeds_recorded.txt', 'a')
    keys_file = open(ds_dir / 'keys_recorded.txt', 'a')

    for future in tqdm_iterator:
        keys, seed = future.result()
        all_keys += keys
        seeds_file.write(f'{seed}\n')
        seeds_file.flush()
        keys_file.write('\n'.join(keys) + '\n')
        keys_file.flush()
        client.cancel(future)

    seeds_file.close()
    keys_file.close()
    return all_keys
def pytest_runtestloop(self, session):
    if (session.testsfailed
            and not session.config.option.continue_on_collection_errors):
        raise session.Interrupted("%d errors during collection" % session.testsfailed)

    unregister_plugins = ['debugging', 'terminalreporter']
    for p in unregister_plugins:
        session.config.pluginmanager.unregister(p)

    if session.config.option.collectonly:
        return True

    def generate_tasks(session):
        for i, item in enumerate(session.items):
            # @delayed(pure=False)
            def run_test(_item):
                # ensure that the plugin manager gets recreated appropriately.
                _item.config.pluginmanager.__recreate__()
                results = self.pytest_runtest_protocol(item=_item, nextitem=None)
                return results

            # hook = item.ihook
            # try to ensure that the module gets treated as a dynamic module that does not exist.
            # delattr(item.module, '__file__')
            # setup = hook.pytest_runtest_setup
            # make_report = hook.pytest_runtest_makereport
            fut = self.client.submit(run_test, item, pure=False)
            yield fut

    with self.remote_syspath_ctx():
        tasks = generate_tasks(session)
        # log these reports to the console.
        for resolved in as_completed(tasks):
            t = resolved.result()
            for report in t:
                session.ihook.pytest_runtest_logreport(report=report)
    return True
def run_actions(unit_context, loop_variable_name=None, loop_variable_value=None,
                in_loop=None, on_dask=None):
    results = []
    triggers = []
    if in_loop:
        if on_dask:
            futures = []
            client = DaskClient().get_dask_client()
            for value in unit_context.stageContext.pipelineContext.variables[loop_variable_name]:
                futures.append(
                    client.submit(run_actions, unit_context, loop_variable_name,
                                  value, False, False, pure=False))
            for future in as_completed(futures):
                result, trigger_pipeline_data_list = future.result()
                results.extend(result)
                triggers.extend(trigger_pipeline_data_list)
        else:
            for value in unit_context.stageContext.pipelineContext.variables[loop_variable_name]:
                result, trigger_pipeline_data_list = run_actions(
                    unit_context, loop_variable_name, value, False, False)
                results.extend(result)
                triggers.extend(trigger_pipeline_data_list)
    else:
        for action in unit_context.unit.do:
            action_context = ActionContext(unit_context, action)
            if loop_variable_name and loop_variable_value:
                action_context.delegateVariableName = loop_variable_name
                action_context.delegateValue = loop_variable_value
            result, trigger_pipeline_data_list = run_action(action_context)
            results.append(result)
            triggers.extend(trigger_pipeline_data_list)
    return results, triggers
def test_as_completed_async_for_cancel(c, s, a, b):
    x = c.submit(inc, 1)
    y = c.submit(sleep, 0.3)
    ac = as_completed([x, y])

    async def _():
        await gen.sleep(0.1)
        await y.cancel(asynchronous=True)

    c.loop.add_callback(_)

    L = []

    async def f():
        async for future in ac:
            L.append(future)

    yield f()

    assert L == [x, y]
def execute_list_of_futures_with_dataset_requirements(tuple_list, tests_per_batch=None):
    """
    :param tuple_list: list of tuples (dataset_name, f(dataset_future) -> future)
    :param tests_per_batch: how many tests per batch you want
    :return:
    """
    if tests_per_batch is None:
        tests_per_batch = len(tuple_list)
    with Client(address=SCHEDULER_HOSTNAME) as client:
        for chunk in chunks(tuple_list, tests_per_batch):
            dataset_dict = scatter_datasets(client)
            futures = fill_dataset_requirements(chunk, client, dataset_dict)
            for _ in tqdm(distributed.as_completed(futures, with_results=True, raise_errors=True),
                          total=len(futures)):
                pass
            client.restart()
def execute_list_of_clustering_tasks(clustering_tasks, tests_per_batch=None):
    if len(clustering_tasks) == 0:
        print("all clustering tasks done")
    if tests_per_batch is None:
        tests_per_batch = len(clustering_tasks)
    with Client(address=SCHEDULER_HOSTNAME) as client:
        with tqdm(total=len(clustering_tasks)) as pbar:
            tasks_to_still_do = deque(clustering_tasks)
            dataset_dict = scatter_datasets(client)

            futures = []
            for _ in range(min(tests_per_batch, len(clustering_tasks))):
                task = tasks_to_still_do.popleft()
                futures.append(
                    client.submit(ClusteringTask.run, task,
                                  dataset_dict[task.dataset_name], pure=False))

            as_completed_futures = distributed.as_completed(futures, with_results=True,
                                                            raise_errors=False)
            for future, _ in as_completed_futures:
                pbar.update(1)
                f: Future = future
                if f.exception() is not None:
                    t = f.traceback()
                    # for line in t.format():
                    #     print(line, end="")
                    # traceback.print_exception(f.exception())
                    traceback.print_tb(f.traceback())
                    print(f.exception())
                if len(tasks_to_still_do) > 0:
                    task = tasks_to_still_do.popleft()
                    future = client.submit(ClusteringTask.run, task,
                                           dataset_dict[task.dataset_name], pure=False)
                    as_completed_futures.add(future)
def _consume_jobs(self, chunk_size=None):
    """Consumes jobs

    If chunk_size is set, the function consumes the specified number of
    finished tasks, or fewer if the sent_jobs_ids queue becomes empty.
    If chunk_size is None, the function consumes jobs until the
    sent_jobs_ids queue becomes empty.
    Jobs with statuses Cancelled, Abandoned, Terminated will be resent
    and their ids added to the sent_jobs_ids queue.

    :param chunk_size: size of consuming chunk
    :return: generator on job results
    """
    logger.debug("Consuming jobs started")

    jobs_to_consume = []
    while not self.sent_jobs.empty():
        job = self.sent_jobs.get()
        jobs_to_consume.append(job)

        if chunk_size is not None:
            chunk_size -= 1
            if chunk_size <= 0:
                break

    for ready_job in distributed.as_completed(jobs_to_consume):
        results = ready_job.result()
        self.sent_jobs_count -= 1

        for result in results:
            (node_id, serialized), exc = result
            logger.debug(
                "Got ready task for node %s, serialized: %s, error: %s",
                node_id, serialized, exc)
            if exc is not None:
                raise exc
            yield node_id, serialized

    logger.debug("Consuming jobs finished")
def parallelStatsDaskSimple(urlSplits, ds, nEpochs, variable, mask, coordinates,
                            reader, outHdfsPath, averagingConfig, sparkConfig,
                            accumulators=['count', 'mean', 'M2', 'min', 'max']):
    '''Compute N-day climatology statistics in parallel using Dask distributed.'''
    if not sparkConfig.startswith('dask,'):
        print("dask: configuration must be of form 'dask,n'", file=sys.stderr)
        sys.exit(1)
    numPartitions = int(sparkConfig.split(',')[1])

    with Timer("Configure Dask distributed"):
        from distributed import Client, as_completed
        client = Client(DaskClientEndpoint)

    print('Starting parallel Stats using Dask . . .', file=sys.stderr)
    start = time.time()
    futures = client.map(
        lambda urls: parallelStatsPipeline(
            urls, ds, nEpochs, variable, mask, coordinates, reader,
            averagingConfig, outHdfsPath, accumulators),
        urlSplits)

    outputFiles = []
    for future in as_completed(futures):
        outputFile = future.result()
        outputFiles.append(outputFile)
        end = time.time()
        print("parallelStats: Completed %s in %0.3f seconds." % (outputFile, (end - start)),
              file=sys.stderr)
    return outputFiles
def run(ctx, path, ncores):
    path = pathlib.Path(path)
    if path.is_dir():
        paths = list(path.rglob("*.py"))
    else:
        paths = [path]

    import distributed

    with distributed.Client(n_workers=ncores) as client:
        print(client)
        futures = {}
        for p in tqdm.tqdm(sorted(paths), desc="Creating tasks"):
            fut = client.submit(run_migrators_on_file, p)
            futures[fut] = p

        progress_iterator = tqdm.tqdm(desc="Scanning :", total=len(futures), miniters=1)
        all_changed = set()
        for resolved_fut in distributed.as_completed(futures):
            path = futures[resolved_fut]
            progress_iterator.update(1)
            progress_iterator.set_description(f"Scanning: {path}")
            try:
                changed = resolved_fut.result()
                if changed:
                    all_changed.add(path)
            except CantParseException as e:
                print(f"Can't parse {path}", file=sys.stderr)
        progress_iterator.close()

        if len(all_changed):
            print("Changed the following files:")
            for path in sorted(all_changed):
                print(f"  {path}")
def _iterate_jobs(self):
    """
    Iterate through all jobs until the domain size is depleted or time runs out.

    :return: completed job
    """
    backlog_half = self.backlog_per_worker / 2
    active_futures = self._init_futures(self.backlog_per_worker)
    next_futures = []

    try:
        while ((self._has_more_work() or
                self.index_completed < self.index_scheduled) and
               not self.canceled):
            iterated = 0
            for future in as_completed(active_futures):
                job = future.result()
                self._mark_job_completed(job)
                iterated += 1
                self.tracer.trace_job(job)

                if iterated >= (backlog_half * self.worker_count):
                    iterated = 0
                    if self._has_more_work():
                        next_futures += self._schedule(backlog_half)

            if self._has_more_work():
                next_futures += self._schedule(backlog_half)

            active_futures = next_futures
            next_futures = []
    except Exception as e:
        haydi_logger.error(traceback.format_exc(e))

    self.completed = True
    printt('[info] master > maxmin finished')
else:
    for i in range(num_worker):
        chunk_data = client.scatter([anchors, chunks[i]])
        workers.append(client.submit(work, chunk_data, f'worker_{i}', cur_iter, *option))

# back up the values in redis in case the iteration fails
# entities_initialized_bak = iter_mget(r, [f'{entity}_v' for entity in entities])
# entities_initialized_bak = np.array([loads(decompress(v)) for v in entities_initialized_bak])
# relations_initialized_bak = iter_mget(r, [f'{relation}_v' for relation in relations])
# relations_initialized_bak = np.array([loads(decompress(v)) for v in relations_initialized_bak])

# client.gather(workers)
# result_iter = [worker.result() for worker in workers]
result_iter = []
ac = as_completed(workers, with_results=True)
for future, result in ac:
    result_iter.append(result)

iterTimes.append(timeit.default_timer() - iterStart)

if all([e[0] for e in result_iter]):
    # iteration succeeded
    printt('[info] master > iter %d - time : %f' % (cur_iter, timeit.default_timer() - timeNow))
    success = True
    trial = 0
    cur_iter += 1

    workTimes = [e[1] for e in result_iter]
    printt('[info] master > Total embedding times : ' + str(workTimes))
    # printt('[info] master > Average total embedding time : ' + str(np.mean(workTimes)))
def run(self, domain, worker_reduce_fn, worker_reduce_init,
        global_reduce_fn, global_reduce_init):
    size = domain.steps
    assert size is not None  # TODO: Iterators without size

    workers = 0
    for name, value in self.executor.ncores().items():
        workers += value

    if workers == 0:
        raise Exception("There are no workers")

    batch_count = workers * 4
    batch_size = max(int(round(size / float(batch_count))), 1)
    batches = self._create_batches(batch_size, size, domain,
                                   worker_reduce_fn, worker_reduce_init)

    logging.info("Qit: starting {} batches with size {}".format(
        batch_count, batch_size))

    if self.job_observer:
        self.job_observer.on_computation_start(batch_count, batch_size)

    futures = self.executor.map(process_batch, batches)

    if self.track_progress:
        distributed.diagnostics.progress(futures)

    if self.write_partial_results is not None:
        result_saver = ResultSaver(self.execution_count,
                                   self.write_partial_results)
    else:
        result_saver = None

    timeouted = False
    results = []

    for future in as_completed(futures):
        job = future.result()
        if result_saver:
            result_saver.handle_result(job.result)
        if self.job_observer:
            self.job_observer.on_job_completed(job)

        results.append(job.result)

        if self.timeout and self.timeout.is_finished():
            logging.info("Qit: timeouted after {} seconds".format(
                self.timeout.timeout))
            timeouted = True
            break

    # order results
    if not timeouted:
        results = [j.result for j in self.executor.gather(futures)]

    self.execution_count += 1

    if worker_reduce_fn is None:
        results = list(itertools.chain.from_iterable(results))

    logging.info("Qit: finished run with size {} (taking {})".format(
        len(results), domain.size))

    results = results[:domain.size]  # trim results to required size

    if global_reduce_fn is None:
        return results
    else:
        if global_reduce_init is None:
            return reduce(global_reduce_fn, results)
        else:
            return reduce(global_reduce_fn, results, global_reduce_init)
def parallel_calculate_chunks(chunks, features, approximate, training_window,
                              verbose, save_progress, entityset, n_jobs,
                              no_unapproximated_aggs, cutoff_df_time_var,
                              target_time, pass_columns, dask_kwargs=None):
    from distributed import as_completed
    from dask.base import tokenize

    client = None
    cluster = None
    try:
        client, cluster = create_client_and_cluster(n_jobs=n_jobs,
                                                    num_tasks=len(chunks),
                                                    dask_kwargs=dask_kwargs)
        # scatter the entityset
        # denote future with leading underscore
        start = time.time()
        es_token = "EntitySet-{}".format(tokenize(entityset))
        if es_token in client.list_datasets():
            print("Using EntitySet persisted on the cluster as dataset %s" % (es_token))
            _es = client.get_dataset(es_token)
        else:
            _es = client.scatter([entityset])[0]
            client.publish_dataset(**{_es.key: _es})

        # save features to a tempfile and scatter it
        pickled_feats = cloudpickle.dumps(features)
        _saved_features = client.scatter(pickled_feats)
        client.replicate([_es, _saved_features])
        end = time.time()
        scatter_time = end - start
        scatter_string = "EntitySet scattered to workers in {:.3f} seconds"
        print(scatter_string.format(scatter_time))

        # map chunks
        # TODO: consider handling task submission dask kwargs
        _chunks = client.map(calculate_chunk,
                             chunks,
                             features=_saved_features,
                             entityset=_es,
                             approximate=approximate,
                             training_window=training_window,
                             profile=False,
                             verbose=False,
                             save_progress=save_progress,
                             no_unapproximated_aggs=no_unapproximated_aggs,
                             cutoff_df_time_var=cutoff_df_time_var,
                             target_time=target_time,
                             pass_columns=pass_columns)

        feature_matrix = []
        iterator = as_completed(_chunks).batches()
        if verbose:
            pbar_str = ("Elapsed: {elapsed} | Remaining: {remaining} | "
                        "Progress: {l_bar}{bar}| "
                        "Calculated: {n}/{total} chunks")
            pbar = make_tqdm_iterator(total=len(_chunks), bar_format=pbar_str)
        for batch in iterator:
            results = client.gather(batch)
            for result in results:
                feature_matrix.append(result)
                if verbose:
                    pbar.update()
        if verbose:
            pbar.close()
    except Exception:
        raise
    finally:
        if 'cluster' not in dask_kwargs and cluster is not None:
            cluster.close()
        if client is not None:
            client.close()

    return feature_matrix
def parallel_calculate_chunks(chunks, features, approximate, training_window,
                              verbose, save_progress, entityset, n_jobs,
                              no_unapproximated_aggs, cutoff_df_time_var,
                              target_time, pass_columns, dask_kwargs=None):
    from distributed import as_completed
    from dask.base import tokenize

    client = None
    cluster = None
    try:
        client, cluster = create_client_and_cluster(n_jobs=n_jobs,
                                                    num_tasks=len(chunks),
                                                    dask_kwargs=dask_kwargs,
                                                    entityset_size=entityset.__sizeof__())
        # scatter the entityset
        # denote future with leading underscore
        if verbose:
            start = time.time()
        es_token = "EntitySet-{}".format(tokenize(entityset))
        if es_token in client.list_datasets():
            if verbose:
                msg = "Using EntitySet persisted on the cluster as dataset {}"
                print(msg.format(es_token))
            _es = client.get_dataset(es_token)
        else:
            _es = client.scatter([entityset])[0]
            client.publish_dataset(**{_es.key: _es})

        # save features to a tempfile and scatter it
        pickled_feats = cloudpickle.dumps(features)
        _saved_features = client.scatter(pickled_feats)
        client.replicate([_es, _saved_features])
        if verbose:
            end = time.time()
            scatter_time = end - start
            scatter_string = "EntitySet scattered to workers in {:.3f} seconds"
            print(scatter_string.format(scatter_time))

        # map chunks
        # TODO: consider handling task submission dask kwargs
        _chunks = client.map(calculate_chunk,
                             chunks,
                             features=_saved_features,
                             entityset=_es,
                             approximate=approximate,
                             training_window=training_window,
                             profile=False,
                             verbose=False,
                             save_progress=save_progress,
                             no_unapproximated_aggs=no_unapproximated_aggs,
                             cutoff_df_time_var=cutoff_df_time_var,
                             target_time=target_time,
                             pass_columns=pass_columns)

        feature_matrix = []
        iterator = as_completed(_chunks).batches()
        if verbose:
            pbar_str = ("Elapsed: {elapsed} | Remaining: {remaining} | "
                        "Progress: {l_bar}{bar}| "
                        "Calculated: {n}/{total} chunks")
            pbar = make_tqdm_iterator(total=len(_chunks), bar_format=pbar_str)
        for batch in iterator:
            results = client.gather(batch)
            for result in results:
                feature_matrix.append(result)
                if verbose:
                    pbar.update()
        if verbose:
            pbar.close()
    except Exception:
        raise
    finally:
        if 'cluster' not in dask_kwargs and cluster is not None:
            cluster.close()
        if client is not None:
            client.close()

    return feature_matrix
def end(self):
    for future in distributed.as_completed(self.futures.copy()):
        self._process_future(future)
def execute(self):
    options = self.config["mitodistances"]
    output_dir = self.config["output-directory"]
    body_svc, mito_svc = self.init_services()

    # Resource manager context must be initialized before resource manager client
    # (to overwrite config values as needed)
    dvid_mgr_config = self.config["dvid-access-manager"]
    dvid_mgr_context = LocalResourceManager(dvid_mgr_config)
    dvid_mgr_client = ResourceManagerClient(dvid_mgr_config["server"], dvid_mgr_config["port"])

    syn_server, syn_uuid, syn_instance = (options['synapse-criteria'][k]
                                          for k in ('server', 'uuid', 'instance'))
    syn_conf = float(options['synapse-criteria']['confidence'])
    syn_types = ['PreSyn', 'PostSyn']
    if options['synapse-criteria']['type'] == 'pre':
        syn_types = ['PreSyn']
    elif options['synapse-criteria']['type'] == 'post':
        syn_types = ['PostSyn']

    bodies = load_body_list(options["bodies"], False)
    skip_flags = [os.path.exists(f'{output_dir}/{body}.csv') for body in bodies]
    bodies_df = pd.DataFrame({'body': bodies, 'should_skip': skip_flags})
    bodies = bodies_df.query('not should_skip')['body']

    # Shuffle for better load balance?
    # TODO: Would be better to sort by synapse count, and put large bodies first,
    #       assigned to partitions in round-robin style.
    #       Then work stealing will be more effective at knocking out the smaller jobs at the end.
    #       This requires knowing all the body sizes, though.
    #       Perhaps mito count would be a decent proxy for synapse count, and it's readily available.
    # bodies = bodies.sample(frac=1.0).values

    os.makedirs('body-logs')
    os.makedirs(output_dir, exist_ok=True)

    mito_server, mito_uuid, mito_instance = (options['mito-labelmap'][k]
                                             for k in ('server', 'uuid', 'instance'))

    @auto_retry(3)
    def _fetch_synapses(body):
        with dvid_mgr_client.access_context(syn_server, True, 1, 1):
            syn_df = fetch_annotation_label(syn_server, syn_uuid, syn_instance, body, format='pandas')
            if len(syn_df) == 0:
                return syn_df
            syn_types, syn_conf  # reference closure variables so the @-names in the query below resolve
            syn_df = syn_df.query('kind in @syn_types and conf >= @syn_conf').copy()
            return syn_df[[*'xyz', 'kind', 'conf']].sort_values([*'xyz']).reset_index(drop=True)

    @auto_retry(3)
    def _fetch_mito_ids(body):
        with dvid_mgr_client.access_context(mito_server, True, 1, 1):
            try:
                return fetch_supervoxels(mito_server, mito_uuid, mito_instance, body)
            except HTTPError:
                return []

    def process_and_save(body):
        tbars = _fetch_synapses(body)
        valid_mitos = _fetch_mito_ids(body)

        # TODO:
        #   Does the stdout_redirected() mechanism work correctly in the context of multiprocessing?
        #   If not, I should probably just use a custom logging handler instead.
        with open(f"body-logs/{body}.log", "w") as f, stdout_redirected(f), Timer() as timer:
            processed_tbars = []
            if len(tbars) == 0:
                logging.getLogger(__name__).warning(f"Body {body}: No synapses found")

            if len(valid_mitos) == 0:
                logging.getLogger(__name__).warning(f"Body {body}: Failed to fetch mito supervoxels")
                processed_tbars = initialize_results(body, tbars)

            if len(valid_mitos) and len(tbars):
                processed_tbars = measure_tbar_mito_distances(
                    body_svc, mito_svc, body, tbars=tbars, valid_mitos=valid_mitos)

        if len(processed_tbars) > 0:
            processed_tbars.to_csv(f'{output_dir}/{body}.csv', header=True, index=False)
            with open(f'{output_dir}/{body}.pkl', 'wb') as f:
                pickle.dump(processed_tbars, f)

        if len(tbars) == 0:
            return (body, 0, 'no-synapses', timer.seconds)

        if len(valid_mitos) == 0:
            return (body, len(processed_tbars), 'no-mitos', timer.seconds)

        return (body, len(tbars), 'success', timer.seconds)

    logger.info(f"Processing {len(bodies)}, skipping {bodies_df['should_skip'].sum()}")

    def process_batch(bodies):
        return [*map(process_and_save, bodies)]

    with dvid_mgr_context:
        batch_size = max(1, len(bodies) // 10_000)
        futures = self.client.map(process_batch, iter_batches(bodies, batch_size))

        # Support synchronous testing with a fake 'as_completed' object
        if hasattr(self.client, 'DEBUG'):
            ac = as_completed_synchronous(futures, with_results=True)
        else:
            ac = distributed.as_completed(futures, with_results=True)

        try:
            results = []
            for f, r in tqdm_proxy(ac, total=len(futures)):
                results.extend(r)
        finally:
            results = pd.DataFrame(results,
                                   columns=['body', 'synapses', 'status', 'processing_time'])
            results.to_csv('results-summary.csv', header=True, index=False)
            num_errors = len(results.query('status == "error"'))
            if num_errors:
                logger.warning(f"Encountered {num_errors} errors. See results-summary.csv")
def parallel_calculate_chunks(chunks, features, approximate, training_window,
                              verbose, save_progress, entityset, n_jobs,
                              no_unapproximated_aggs, cutoff_df_time_var,
                              target_time, pass_columns, dask_kwargs=None):
    from distributed import Client, LocalCluster, as_completed
    from dask.base import tokenize

    client = None
    cluster = None
    try:
        if 'cluster' in dask_kwargs:
            cluster = dask_kwargs['cluster']
        else:
            diagnostics_port = None
            if 'diagnostics_port' in dask_kwargs:
                diagnostics_port = dask_kwargs['diagnostics_port']
                del dask_kwargs['diagnostics_port']

            workers = n_jobs_to_workers(n_jobs)
            workers = min(workers, len(chunks))
            cluster = LocalCluster(n_workers=workers,
                                   threads_per_worker=1,
                                   diagnostics_port=diagnostics_port,
                                   **dask_kwargs)
            # if cluster has bokeh port, notify user if unexpected port number
            if diagnostics_port is not None:
                if hasattr(cluster, 'scheduler') and cluster.scheduler:
                    info = cluster.scheduler.identity()
                    if 'bokeh' in info['services']:
                        msg = "Dashboard started on port {}"
                        print(msg.format(info['services']['bokeh']))

        client = Client(cluster)
        # scatter the entityset
        # denote future with leading underscore
        start = time.time()
        es_token = "EntitySet-{}".format(tokenize(entityset))
        if es_token in client.list_datasets():
            print("Using EntitySet persisted on the cluster as dataset %s" % (es_token))
            _es = client.get_dataset(es_token)
        else:
            _es = client.scatter([entityset])[0]
            client.publish_dataset(**{_es.key: _es})

        # save features to a tempfile and scatter it
        pickled_feats = cloudpickle.dumps(features)
        _saved_features = client.scatter(pickled_feats)
        client.replicate([_es, _saved_features])
        end = time.time()
        scatter_time = end - start
        scatter_string = "EntitySet scattered to workers in {:.3f} seconds"
        print(scatter_string.format(scatter_time))

        # map chunks
        # TODO: consider handling task submission dask kwargs
        _chunks = client.map(calculate_chunk,
                             chunks,
                             features=_saved_features,
                             entityset=_es,
                             approximate=approximate,
                             training_window=training_window,
                             profile=False,
                             verbose=False,
                             save_progress=save_progress,
                             no_unapproximated_aggs=no_unapproximated_aggs,
                             cutoff_df_time_var=cutoff_df_time_var,
                             target_time=target_time,
                             pass_columns=pass_columns)

        feature_matrix = []
        iterator = as_completed(_chunks).batches()
        if verbose:
            pbar_str = ("Elapsed: {elapsed} | Remaining: {remaining} | "
                        "Progress: {l_bar}{bar}| "
                        "Calculated: {n}/{total} chunks")
            pbar = make_tqdm_iterator(total=len(_chunks), bar_format=pbar_str)
        for batch in iterator:
            results = client.gather(batch)
            for result in results:
                feature_matrix.append(result)
                if verbose:
                    pbar.update()
        if verbose:
            pbar.close()
    except Exception:
        raise
    finally:
        if 'cluster' not in dask_kwargs and cluster is not None:
            cluster.close()
        if client is not None:
            client.close()

    return feature_matrix
def as_completed(futures):
    return distributed.as_completed(futures)