def gen_samples():
    ''' generates all monte carlo samples '''
    if DASK:
        # list of delayed operations
        operations = [delayed(gen_sample)(k, STATE[k]) for k in range(NS)]
        # submit futures to client
        futures = CLIENT.compute(operations)
        # progress bar
        if VERBOSE:
            print('----------------------')
            print('performing monte carlo')
            print('----------------------')
            progress(futures)
    elif PARALLEL:
        operations = [delayed(gen_sample)(k, STATE[k]) for k in range(NS)]
        futures = Parallel(n_jobs=NTHREAD, backend='threading',
                           verbose=VERBOSE)(operations)
    else:
        # loop through pressures
        if VERBOSE:
            print('----------------------')
            print('performing monte carlo')
            print('----------------------')
            futures = [gen_sample(k, STATE[k]) for k in tqdm(range(NS))]
        else:
            futures = [gen_sample(k, STATE[k]) for k in range(NS)]
    return futures
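# gen_samples() and the related drivers later in this section (write_outputs,
# calculate_cdfs, init_samples, init_headers, gen_mc_params) all lean on
# module-level globals and imports that live elsewhere in their source module.
# A minimal sketch of the setup they assume; the names mirror the globals they
# reference, but every value here is an illustrative placeholder, not from the
# source:
from dask import delayed
from distributed import Client, LocalCluster, progress
from joblib import Parallel
from tqdm import tqdm

DASK = True        # dispatch through dask.distributed
PARALLEL = False   # fall back to joblib threading when DASK is False
VERBOSE = True     # print banners and progress bars
NTHREAD = 4        # worker/thread count
NS = 64            # number of monte carlo samples
STATE = [{} for _ in range(NS)]  # hypothetical per-sample state objects

CLIENT = Client(LocalCluster(n_workers=NTHREAD)) if DASK else None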
def build_cli(args=None):
    if not args:
        args = parse_args()
    else:
        args = parse_args(args)
    filter_dirty = any(args.packages) or not args._all
    outputs = get_dask_outputs(args.path, packages=args.packages,
                               filter_dirty=filter_dirty, git_rev=args.git_rev,
                               stop_rev=args.stop_rev, steps=args.steps,
                               max_downstream=args.max_downstream,
                               visualize=args.visualize, test=args.test)
    if args.visualize:
        # setattr(nx.drawing, 'graphviz_layout', nx.nx_pydot.graphviz_layout)
        # graphviz_graph = nx.draw_graphviz(graph, 'dot')
        # graphviz_graph.draw(args.visualize)
        visualize(*outputs, filename=args.visualize)  # create a neat-looking graph
    else:
        # many threads, because this is just the dispatch; it takes very little
        # compute and mostly waits for builds to complete
        cluster = LocalCluster(n_workers=1, threads_per_worker=args.threads,
                               nanny=False)
        client = Client(cluster)
        futures = client.persist(outputs)
        progress(futures)
def write_outputs():
    ''' writes outputs for all samples '''
    if DASK:
        operations = [delayed(write_output)(OUTPUT[k], STATE[k]) for k in range(NS)]
        futures = CLIENT.compute(operations)
        if VERBOSE:
            print('\n---------------')
            print('writing outputs')
            print('---------------')
            progress(futures)
    elif PARALLEL:
        operations = [delayed(write_output)(OUTPUT[k], STATE[k]) for k in range(NS)]
        futures = Parallel(n_jobs=NTHREAD, backend='threading',
                           verbose=VERBOSE)(operations)
    else:
        if VERBOSE:
            print('writing outputs')
            print('---------------')
            for k in tqdm(range(NS)):
                write_output(OUTPUT[k], STATE[k])
        else:
            for k in range(NS):
                write_output(OUTPUT[k], STATE[k])
def submit(
    self,
    client: Client = None,
    scheduler_address: str = None,
    priority: int = None,
    resources: Dict[str, Any] = None,
    show_progress=False,
    **kwargs,
) -> None:
    if not priority:
        priority = self.priority
    if not resources:
        resources = self.resources
    if not client:
        client = Client(scheduler_address)
    self.scheduler_address = client.scheduler.address
    computation = client.compute(self.graph, retries=3, priority=priority,
                                 resources=resources)
    if show_progress:
        progress(computation)
    fire_and_forget(computation)
    if scheduler_address:
        client.close()
    return None
def calculate_cdfs():
    ''' calculate cdfs for all samples '''
    if VERBOSE:
        print('computing %s %s samples' % (NS, EL.lower()))
    if DASK:
        operations = [delayed(calculate_cdf)(NATOMS[i], BOX[i], BR, POS[i], RV, CD)
                      for i in range(NS)]
        futures = CLIENT.compute(operations)
        if VERBOSE:
            progress(futures)
            print('\n')
    elif PARALLEL:
        operations = [delayed(calculate_cdf)(NATOMS[i], BOX[i], BR, POS[i], RV, CD)
                      for i in range(NS)]
        futures = Parallel(n_jobs=NTHREAD, backend='threading',
                           verbose=VERBOSE)(operations)
    else:
        if VERBOSE:
            futures = [calculate_cdf(NATOMS[i], BOX[i], BR, POS[i], RV, CD)
                       for i in tqdm(range(NS))]
        else:
            futures = [calculate_cdf(NATOMS[i], BOX[i], BR, POS[i], RV, CD)
                       for i in range(NS)]
    return futures
def map_func(func, arg_list, n_workers, **kwargs):
    with Cluster(**kwargs) as cluster:
        cluster.start_workers(n_workers)
        with Client(cluster) as client:
            _results = client.map(func, arg_list)
            progress(_results, notebook=False)
            result_list = client.gather(_results)
            print("")  # if there will be more output after a progress bar
    return result_list
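# map_func depends on a Cluster class from its enclosing module (for example,
# a dask_jobqueue cluster with a start_workers method). For reference, a
# self-contained variant using distributed.LocalCluster; this is a sketch
# under that assumption, not part of the original source. LocalCluster takes
# the worker count up front, so there is no separate start_workers() step.
from distributed import Client, LocalCluster, progress

def map_func_local(func, arg_list, n_workers=4):
    with LocalCluster(n_workers=n_workers) as cluster:
        with Client(cluster) as client:
            futures = client.map(func, arg_list)
            progress(futures, notebook=False)  # text progress bar, blocks until done
            results = client.gather(futures)
            print("")  # newline after the progress bar
    return results

# example usage: square 0..99 across local workers
if __name__ == "__main__":
    print(map_func_local(lambda x: x * x, list(range(100))))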
def get_step(self, res, show_progress=False):
    futures, step_t_start = res
    if show_progress:
        distributed.progress(futures)
        print()
    step_results = self.get_chunk(futures)
    descriptors, po_returns, po_lengths, po_novelties, extra = collect_po_results(
        step_results
    )
    episodes_this_step = 2 * len(po_returns)
    timesteps_this_step = po_lengths.sum()
    logger.info(
        "Optimizer {} finished running {} episodes, {} timesteps".format(
            self.optim_id, episodes_this_step, timesteps_this_step
        )
    )
    self.niche.update_extra(extra)
    self.update_population(descriptors, po_returns, po_novelties)
    logger.info("Optimizer {} finished updating population".format(self.optim_id))
    step_t_end = time.time()
    return (
        descriptors,
        po_returns,
        po_novelties,
        StepStats(
            po_returns_mean=po_returns.mean(),
            po_returns_median=np.median(po_returns),
            po_returns_std=po_returns.std(),
            po_returns_max=po_returns.max(),
            po_returns_min=po_returns.min(),
            po_novelties_mean=po_novelties.mean(),
            po_novelties_median=np.median(po_novelties),
            po_novelties_var=po_novelties.var(),
            po_novelties_max=po_novelties.max(),
            po_novelties_min=po_novelties.min(),
            po_len_mean=po_lengths.mean(),
            po_len_std=po_lengths.std(),
            episodes_this_step=episodes_this_step,
            timesteps_this_step=timesteps_this_step,
            time_elapsed_this_step=step_t_end - step_t_start,
        ),
    )
def init_samples():
    ''' initializes all samples '''
    if DASK:
        operations = [delayed(init_sample)(k) for k in range(NS)]
        futures = CLIENT.compute(operations)
        if VERBOSE:
            print('initializing samples')
            print('--------------------')
            progress(futures)
            print('\n')
    elif PARALLEL:
        operations = [delayed(init_sample)(k) for k in range(NS)]
        futures = Parallel(n_jobs=NTHREAD, backend='threading',
                           verbose=VERBOSE)(operations)
    else:
        if VERBOSE:
            print('initializing samples')
            print('--------------------')
            futures = [init_sample(k) for k in tqdm(range(NS))]
        else:
            futures = [init_sample(k) for k in range(NS)]
    return futures
def init_headers():
    ''' writes headers for all samples '''
    if DASK:
        operations = [delayed(init_header)(k, OUTPUT[k]) for k in range(NS)]
        futures = CLIENT.compute(operations)
        if VERBOSE:
            print('initializing headers')
            print('--------------------')
            progress(futures)
    elif PARALLEL:
        operations = [delayed(init_header)(k, OUTPUT[k]) for k in range(NS)]
        futures = Parallel(n_jobs=NTHREAD, backend='threading',
                           verbose=VERBOSE)(operations)
    else:
        if VERBOSE:
            print('initializing headers')
            print('--------------------')
            for k in tqdm(range(NS)):
                init_header(k, OUTPUT[k])
        else:
            for k in range(NS):
                init_header(k, OUTPUT[k])
def gen_mc_params():
    ''' generate adaptive monte carlo parameters for all samples '''
    if DASK:
        # list of delayed operations
        operations = [delayed(gen_mc_param)(STATE[k]) for k in range(NS)]
        # submit futures to client
        futures = CLIENT.compute(operations)
        # progress bar
        if VERBOSE:
            print('\n------------------')
            print('updating mc params')
            print('------------------')
            progress(futures)
    elif PARALLEL:
        operations = [delayed(gen_mc_param)(STATE[k]) for k in range(NS)]
        futures = Parallel(n_jobs=NTHREAD, backend='threading',
                           verbose=VERBOSE)(operations)
    else:
        # loop through pressures
        if VERBOSE:
            print('updating mc params')
            print('------------------')
        futures = [gen_mc_param(STATE[k]) for k in range(NS)]
    return futures
def dask_executor(items, function, accumulator, **kwargs):
    """Execute using dask futures

    Parameters
    ----------
    items : list
        List of input arguments
    function : callable
        A function to be called on each input, which returns an accumulator instance
    accumulator : AccumulatorABC
        An accumulator to collect the output of the function
    client : distributed.client.Client
        A dask distributed client instance
    treereduction : int, optional
        Tree reduction factor for output accumulators (default: 20)
    status : bool, optional
        If true (default), enable progress bar
    compression : int, optional
        Compress accumulator outputs in flight with LZ4, at level specified (default 1).
        Set to ``None`` for no compression.
    priority : int, optional
        Task priority, default 0
    retries : int, optional
        Number of retries for failed tasks (default: 3)
    heavy_input : serializable, optional
        Any value placed here will be broadcast to workers and joined to input
        items in a tuple (item, heavy_input) that is passed to function.
    function_name : str, optional
        Name of the function being passed

    .. note:: If ``heavy_input`` is set, ``function`` is assumed to be pure.
    """
    from dask.delayed import delayed
    if len(items) == 0:
        return accumulator
    client = kwargs.pop('client')
    ntree = kwargs.pop('treereduction', 20)
    status = kwargs.pop('status', True)
    clevel = kwargs.pop('compression', 1)
    priority = kwargs.pop('priority', 0)
    retries = kwargs.pop('retries', 3)
    heavy_input = kwargs.pop('heavy_input', None)
    function_name = kwargs.pop('function_name', None)
    reducer = _reduce()
    # secret options
    direct_heavy = kwargs.pop('direct_heavy', None)
    worker_affinity = kwargs.pop('worker_affinity', False)

    if clevel is not None:
        function = _compression_wrapper(clevel, function, name=function_name)
        reducer = _compression_wrapper(clevel, reducer)

    if heavy_input is not None:
        heavy_token = client.scatter(heavy_input, broadcast=True, hash=False,
                                     direct=direct_heavy)
        items = list(zip(items, repeat(heavy_token)))

    work = []
    if worker_affinity:
        workers = list(client.run(lambda: 0))

        def belongsto(workerindex, item):
            if heavy_input is not None:
                item = item[0]
            hashed = _hash((item.fileuuid, item.treename,
                            item.entrystart, item.entrystop))
            return hashed % len(workers) == workerindex

        for workerindex, worker in enumerate(workers):
            work.extend(client.map(
                function,
                [item for item in items if belongsto(workerindex, item)],
                pure=(heavy_input is not None),
                priority=priority,
                retries=retries,
                workers={worker},
                allow_other_workers=False,
            ))
    else:
        work = client.map(
            function,
            items,
            pure=(heavy_input is not None),
            priority=priority,
            retries=retries,
        )
    while len(work) > 1:
        work = client.map(
            reducer,
            [work[i:i + ntree] for i in range(0, len(work), ntree)],
            pure=True,
            priority=priority,
            retries=retries,
        )
    work = work[0]
    if status:
        from distributed import progress
        # FIXME: fancy widget doesn't appear, have to live with boring pbar
        progress(work, multi=True, notebook=False)
    accumulator += _maybe_decompress(work.result())
    return accumulator
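# A hedged sketch of a call site for dask_executor. The private helpers it
# uses (_reduce, _compression_wrapper, _maybe_decompress, _hash) are assumed
# to come from the same module; `process`, `my_items`, `counts0`, and the
# scheduler address below are all hypothetical stand-ins.
from distributed import Client

client = Client('tcp://scheduler:8786')   # assumed scheduler address
counts = dask_executor(
    my_items,             # hypothetical list of work units, e.g. file chunks
    process,              # hypothetical callable returning an accumulator
    counts0,              # hypothetical empty accumulator to fill
    client=client,
    treereduction=20,     # merge outputs 20 at a time
    compression=None,     # skip the LZ4 wrapping for simplicity
    status=True,          # show the distributed progress bar
)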
# - Display the head of the dataframe.
# - Display the number of rows of this dataframe.
# - Compute the total number of passengers.
# - Count occurrences in the payment_type column, both for the full dataset and filtered by zero tip (tip_amount == 0).
# - Create a new column, tip_fraction.
# - Plot the average of the new column tip_fraction grouped by day of week.
# - Plot the average of the new column tip_fraction grouped by hour of day.
#
# [Dask dataframe documentation](http://docs.dask.org/en/latest/dataframe.html)

# +
import dask.dataframe as dd
from distributed import Client, progress

c = Client('127.0.0.1:8786')

nyc2014 = dd.read_csv('hdfs://svmass2.mass.uhb.fr:54310/user/datasets/nyc-tlc/2014/yellow*.csv',
                      parse_dates=['pickup_datetime', 'dropoff_datetime'],
                      skipinitialspace=True)

nyc2015 = dd.read_csv('hdfs://svmass2.mass.uhb.fr:54310/user/datasets/nyc-tlc/2015/yellow*.csv',
                      parse_dates=['tpep_pickup_datetime', 'tpep_dropoff_datetime'])

nyc2014, nyc2015 = c.persist([nyc2014, nyc2015])
progress(nyc2014, nyc2015)
# -
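# The cell above only loads and persists the data. A sketch of the listed
# exercises follows, using the 2015 column names (passenger_count, tip_amount,
# fare_amount, payment_type, tpep_pickup_datetime); this is an illustration,
# not part of the original notebook, and the 2014 files name their columns
# differently.

# +
nyc2015.head()                                    # head of the dataframe
len(nyc2015)                                      # number of rows
nyc2015.passenger_count.sum().compute()           # total number of passengers

# payment_type counts: full dataset, then the zero-tip subset
nyc2015.payment_type.value_counts().compute()
nyc2015[nyc2015.tip_amount == 0].payment_type.value_counts().compute()

# new column: tip as a fraction of the fare
nyc2015['tip_fraction'] = nyc2015.tip_amount / nyc2015.fare_amount

# average tip_fraction by day of week and by hour of day
day = nyc2015.groupby(nyc2015.tpep_pickup_datetime.dt.dayofweek).tip_fraction.mean()
hour = nyc2015.groupby(nyc2015.tpep_pickup_datetime.dt.hour).tip_fraction.mean()
day.compute().plot(kind='bar')
hour.compute().plot()
# -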
    tiles.append(gvi_)

sng_date_utm_strip = merge_arrays(tiles, method='max', nodata=np.NAN)
sng_date_reproj = sng_date_utm_strip.rio.reproject('EPSG:4326')
dates_da.append(sng_date_reproj)

# time_agg = xr.combine_nested(dates_da, concat_dim=['time'])
time_agg = xr.combine_nested(dates_da, concat_dim=[['time', 'x', 'y']])

GVDM = xr.apply_ufunc(_decades, time_agg,
                      input_core_dims=[['time']],
                      exclude_dims={'time'},
                      dask='parallelized',
                      dask_gufunc_kwargs={'allow_rechunk': True},
                      vectorize=True)
GVDM.name = 'GVI'
GVDM_f = GVDM.to_netcdf(rf'L:\HSL\observations\S2\GVI_S2{sensor}_{decad}_{alg}.nc',
                        compute=False,
                        encoding={'GVI': {'_FillValue': -999}}).persist()
progress(GVDM_f)
client.close()
import os
import random
import time
from pathlib import Path

from distributed import Client, progress

# get_cluster, num_workers, and mean_runtime are assumed to be defined
# elsewhere in the original module


def process_file(fname, delay):
    # return the size in bytes of the file
    result = os.path.getsize(fname)
    # simulate a long-running process by sleeping
    time.sleep(delay * random.random())
    return result


if __name__ == "__main__":
    # wrapping the client in a context manager ensures that the client
    # and the cluster get cleaned up
    with get_cluster() as clust, Client(clust) as cl:
        print(f"Cluster dashboard running at {cl.cluster.dashboard_link}")
        # add workers
        cl.cluster.scale(num_workers)
        # list all the files in the home directory
        home_files = list(Path.home().glob("*"))
        # ensure that the total runtime is ~mean_runtime
        delays = ((mean_runtime * num_workers) / len(home_files),) * len(home_files)
        # map the function `process_file` over the arguments `home_files` and
        # `delays`; this returns a collection of futures
        futures = cl.map(process_file, home_files, delays)
        progress(futures)
        # block until all the futures are finished
        result = cl.gather(futures)
        print(*zip(map(lambda v: str(v.name), home_files), result))