def forward_modeling_multi_shots(c, par_files, d):
    """Model every record of ``d`` on the cluster and sum the per-shot results.

    Parameters
    ----------
    c : client object
        Must provide ``scatter`` and ``submit``; it is also printed.
    par_files
        Forwarded unchanged to ``forward_modeling_single_shot``.
    d : dict
        Its keys are the records to model; the whole dict is scattered with
        ``broadcast=True`` and handed to every task as ``table``.

    Returns
    -------
    numpy.ndarray
        A copy of the first shot's result with all other results added in place.
    """
    shared_table = c.scatter(d, broadcast=True)
    shot_ids = list(d.keys())
    print(shot_ids)

    pending = [
        c.submit(forward_modeling_single_shot, shot_id,
                 table=shared_table, par_files=par_files)
        for shot_id in shot_ids
    ]

    # Show progress, then block until every shot has finished.
    progress(pending)
    wait(pending)

    # The first result seeds the accumulator (np.array makes a copy).
    final_image = np.array(pending[0].result())

    print('\n::: start user output :::')
    print(c)
    print('::: end user output :::\n')

    # Accumulate the remaining shots into the seed image.
    for shot in pending[1:]:
        final_image[:] += shot.result()
    return final_image
def main():
    """Connect to a Dask scheduler and run two small demo workloads.

    The scheduler address comes from the ``DASK_SCHEDULER_HOST`` and
    ``DASK_SCHEDULER_PORT`` environment variables, defaulting to
    ``localhost`` and ``8786``.
    """
    scheduler_host = os.getenv('DASK_SCHEDULER_HOST', default='localhost')
    scheduler_port = os.getenv('DASK_SCHEDULER_PORT', default=8786)
    print(scheduler_host, scheduler_port)
    client = Client(f'{scheduler_host}:{scheduler_port}')
    # client.run(init_logging)
    # client.run_on_scheduler(init_logging)

    # Stage 1: run some mock functions and gather a result.
    listings = client.map(print_listdir, range(10))
    summary = client.submit(print_values, listings)
    progress(summary)
    print('')
    print(client.gather(summary))

    # Stage 2: a three-step map pipeline reduced with sum().
    print('here A')
    values = client.map(set_value, range(100))
    print('here B')
    squared = client.map(square, values)
    print('here C')
    negated = client.map(neg, squared)
    print('here D')
    # Submit a function application to the scheduler
    total = client.submit(sum, negated)
    print('here E')
    progress(total)
    print(total.result())
    print('here F')
def launch_actor(self, _class):
    """
    Instantiate ``_class``, on a worker when a client is configured.

    Parameters
    ----------
    _class : class object
        class to put on a worker

    Returns
    -------
    actor
        With a dask client: the result of the actor future (a pointer to the
        remote object). Without a client: a plain local ``_class()`` instance.

    Raises
    ------
    Exception
        If a client is set but ``self.library[0]`` is not ``'dask'``.
    """
    # No client configured: build the object locally.
    if self.client is None:
        return _class()

    if self.library[0] != 'dask':
        raise Exception(
            f"{self.library} is supported, but without actor launch functionality!"
        )

    # Create a _class on the worker selected by the current counter.
    target_worker = self.workers[self.worker_counter]
    future = self.client.submit(_class, workers=[target_worker], actor=True)
    distributed.progress(future)
    # Get back a pointer to that object.
    return future.result()
def enrich(self, input_path: str, output_path: str, dataset_id: str,
           threshold: float = 0.8, spans: int = 20):
    """
    main method - calls all other processing methods and outputs enriched parquet file

    Every ``*.parquet`` file in ``input_path`` is read, optionally limited to
    ``dataset_id``, split into one frame per ``pdf_name``, enriched on the
    cluster and written to ``output_path`` under the same base name. Files
    lacking the needed columns are written through unchanged. If the dataset
    filter leaves no documents, the (empty) filtered frame is written without
    enrichment instead of failing in ``pd.concat``.

    :param input_path: a directory full of parquets (ingest output) to process
    :param output_path: a directory to put the output context enriched parquets
    :param dataset_id: ingest process dataset_id
    :param threshold: float cut off for postprocess table detection score to process as table caption
    :param spans: number of words each side of label to pull in as context for each table label in content text
                  if None will use regex to pull out full stop to full stop span around the table label
    """
    for pq in glob.glob(os.path.join(input_path, '*.parquet')):
        logger.info(f'processing file: {pq}')
        df = pd.read_parquet(pq)
        basename = os.path.basename(pq)
        needed_columns = [
            'content',
            'postprocess_cls',
            'postprocess_score'
        ]
        if needed_columns_are_in_df(needed_columns, list(df.columns)):
            if dataset_id:
                logger.info(f'limit enrichment to dataset id: {dataset_id}')
                df = df[df['dataset_id'] == dataset_id]

            # GET ALL DOCUMENTS, LIST OF DFs
            pdf_names = list(df.pdf_name.unique())
            logger.info('split ingest output into docs')
            single_doc_dfs = [df[df['pdf_name'] == name] for name in tqdm(pdf_names)]

            if single_doc_dfs:
                partial_get_context = functools.partial(
                    Enrich.get_contexts, threshold, spans)
                logger.info('start enrichment processing')
                enriched = [
                    self.client.submit(partial_get_context, doc_df,
                                       resources={'process': 1})
                    for doc_df in single_doc_dfs
                ]
                progress(enriched)
                logger.info('collecting all enriched docs')
                enriched = [e.result() for e in tqdm(enriched)]
                df = pd.concat(enriched)
                df = df.reset_index(drop=True)
            else:
                # pd.concat([]) raises ValueError; there is nothing to enrich.
                logger.warning(
                    f'no documents to enrich in {pq}; writing it without enrichment')

        df.to_parquet(os.path.join(output_path, basename))
def dask_executor(items, function, accumulator, **kwargs):
    """Execute using dask futures

    Parameters
    ----------
        items : list
            List of input arguments
        function : callable
            A function to be called on each input, which returns an accumulator instance
        accumulator : AccumulatorABC
            An accumulator to collect the output of the function
        client : distributed.client.Client
            A dask distributed client instance
        treereduction : int, optional
            Tree reduction factor for output accumulators (default: 20).
            Must be at least 2 whenever more than one item is processed.
        status : bool, optional
            If true (default), enable progress bar
        compression : int, optional
            Compress accumulator outputs in flight with LZ4, at level specified (default 1)
            Set to ``None`` for no compression.
        priority : int, optional
            Task priority, default 0
        heavy_input : serializable, optional
            Any value placed here will be broadcast to workers and joined to input
            items in a tuple (item, heavy_input) that is passed to function.
        function_name : str, optional
            Name of the function being passed

    Returns
    -------
        The ``accumulator`` argument after the reduced output has been added
        to it in place; returned untouched when ``items`` is empty.

    Raises
    ------
        ValueError
            If more than one item is given and ``treereduction`` is below 2.
    """
    if len(items) == 0:
        return accumulator
    client = kwargs.pop('client')
    ntree = kwargs.pop('treereduction', 20)
    status = kwargs.pop('status', True)
    clevel = kwargs.pop('compression', 1)
    priority = kwargs.pop('priority', 0)
    heavy_input = kwargs.pop('heavy_input', None)
    function_name = kwargs.pop('function_name', None)

    # A factor of 1 never shrinks the future list (endless loop), 0 breaks
    # range(), and a negative factor yields no chunks at all. Fail before
    # any work is submitted. A single item needs no reduction, so it is
    # still accepted with any factor.
    if len(items) > 1 and ntree < 2:
        raise ValueError(
            f"treereduction must be >= 2 to reduce {len(items)} items, got {ntree!r}"
        )

    reducer = _reduce()
    if clevel is not None:
        function = _compression_wrapper(clevel, function, name=function_name)
        reducer = _compression_wrapper(clevel, reducer)

    if heavy_input is not None:
        # Broadcast once and pair every item with the same token.
        heavy_token = client.scatter(heavy_input, broadcast=True, hash=False)
        items = list(zip(items, repeat(heavy_token)))

    futures = client.map(function, items, priority=priority)
    # Tree reduction: merge groups of ``ntree`` futures until one remains.
    while len(futures) > 1:
        futures = client.map(
            reducer,
            [futures[i:i + ntree] for i in range(0, len(futures), ntree)],
            priority=priority,
        )
    if status:
        from dask.distributed import progress
        # FIXME: fancy widget doesn't appear, have to live with boring pbar
        progress(futures, multi=True, notebook=False)
    accumulator += _maybe_decompress(futures.pop().result())
    return accumulator
def enrich(self, file_path: str, dataset_id: str, threshold: float, spans: int):
    """
    iterate over all ingest output parquets and run distributed context enrichment process

    Each ``*.parquet`` file in ``file_path`` is rewritten IN PLACE with the
    enriched rows. Files lacking the needed columns are rewritten unchanged.
    If the dataset filter leaves no documents, the file is left untouched
    instead of failing in ``pd.concat``.

    :param file_path: a directory full of parquets (ingest output) to process
    :param dataset_id: ingest process dataset_id
    :param threshold: float cut off for postprocess table detection score to process as table caption
    :param spans: number of words each side of label to pull in as context for each table label in content text
                  if None will use regex to pull out full stop to full stop span around the table label
    """
    for pq in glob.glob(os.path.join(file_path, '*.parquet')):
        logger.info(f'processing file: {pq}')
        df = pd.read_parquet(pq)
        basename = os.path.basename(pq)
        needed_columns = [
            'content',
            'postprocess_cls',
            'postprocess_score'
        ]
        if Ingest.needed_columns_are_in_df(needed_columns, list(df.columns)):
            if dataset_id:
                logger.info(f'limit enrichment to dataset id: {dataset_id}')
                df = df[df['dataset_id'] == dataset_id]

            # GET ALL DOCUMENTS, LIST OF DFs
            all_pdf_names = list(df.pdf_name.unique())
            logger.info('split ingest output into docs')
            single_doc_dfs = [df[df['pdf_name'] == name] for name in all_pdf_names]

            if not single_doc_dfs:
                # pd.concat([]) raises ValueError. Because the output path is
                # the input path, skip the write so the file is not replaced
                # by an empty frame.
                logger.warning(f'no documents to enrich in {pq}; leaving file unchanged')
                continue

            partial_get_context = functools.partial(Ingest.get_contexts, threshold, spans)
            logger.info(f'start enrichment processing with doc count {len(single_doc_dfs)}')
            enriched = [self.client.submit(partial_get_context, doc_df, resources={'process': 1})
                        for doc_df in single_doc_dfs]
            progress(enriched)
            logger.info('collecting all enriched docs')
            enriched = [e.result() for e in enriched]
            df = pd.concat(enriched)
            logger.info(f'size of df returned from enrichment: {len(df)}')
            df = df.reset_index(drop=True)

        logger.info(f'outputting data: {os.path.join(file_path, basename)}')
        df.to_parquet(os.path.join(file_path, basename))
def parallel_write(filename: str, darray: dask.array) -> None:
    """Distribute Zarr writing task to workers using dask.

    Input filename should have extension .zarr

    A ``Client`` is created for the duration of the call. The write is
    submitted, its progress is displayed, and the call blocks until the write
    has finished, so the data is on disk when the function returns. The
    client is closed on exit.

    Args:
        filename: Target Zarr store; passed to ``darray.to_zarr``.
        darray: Object providing ``to_zarr`` (used as a dask array).
    """
    with Client() as client:
        # Build the write graph lazily so it can be submitted and tracked.
        out = darray.to_zarr(filename,
                             compressor=Blosc(cname='zstd', clevel=3,
                                              shuffle=Blosc.BITSHUFFLE),
                             compute=False)
        try:
            fut = client.compute(out)
            # progress() needs the future; given the client it has nothing to track.
            progress(fut)
            # Block until the write is done; the future would otherwise be
            # dropped together with the client.
            fut.result()
        except BrokenPipeError:
            print('Process complete (likely)...')
def write_file(ctx, *args, **kwargs):
    """Write the output dataset to file.

    Nothing happens when ``ctx.obj["output"]`` is ``None``. Extra positional
    and keyword arguments are accepted and ignored.
    """
    settings = ctx.obj
    target = settings["output"]
    if target is None:
        return

    if settings["verbose"]:
        click.echo(f"Writing to file {target}")

    with ProgressBar():
        pending_write = settings["ds_out"].to_netcdf(target, compute=False)
        if settings["dask_nthreads"] is not None:
            progress(pending_write.data)
        pending_write.compute()

    if settings["dask_nthreads"] is not None:
        click.echo("")  # Distributed's progress doesn't print a final \n.
def save_multiple_images(
    array: da.Array, output_file: Path, write_mode: str = "x"
) -> None:
    """
    Compute a Dask array and store it in an HDF5 file via a temporary Zarr store.

    The array is computed with the Dask distributed scheduler and ends up in a
    data set 'data' in the root group of the HDF5 file. The distributed
    scheduler manages worker memory better than the default scheduler, whose
    workers can sometimes demand more memory than is available.

    h5py.File objects are not serialisable, so the distributed scheduler
    cannot write to HDF5 directly. The values are therefore first written to a
    Zarr DirectoryStore next to the output file (same name, '.zarr' suffix),
    then copied into the HDF5 file, after which the Zarr store is cleared.

    Multithreading is used, as the calculation is assumed to be I/O bound.

    Args:
        array:        A Dask array to be calculated and stored.
        output_file:  Path to the output HDF5 file.
        write_mode:   HDF5 file opening mode.  See :class:`h5py.File`.
    """
    # More generous connection timeouts than the default 30s.
    generous_timeouts = {
        "distributed.comm.timeouts.connect": "60s",
        "distributed.comm.timeouts.tcp": "60s",
        "distributed.deploy.lost-worker-timeout": "60s",
        "distributed.scheduler.idle-timeout": "600s",
        "distributed.scheduler.locks.lease-timeout": "60s",
    }
    with dask.config.set(generous_timeouts):
        zarr_path = str(output_file.with_suffix(".zarr"))

        # Overwrite any existing Zarr storage; defer the computation and get
        # the stored Array back so it can be computed with a progress bar.
        to_zarr_options = {"overwrite": True, "compute": False, "return_stored": True}
        stored = array.to_zarr(zarr_path, component="data", **to_zarr_options)

        # Compute and store the values, showing a progress bar.
        progress(stored.persist())
        print("\nTransferring the images to the output file.")

        store = zarr.DirectoryStore(zarr_path)
        with h5py.File(output_file, write_mode) as f:
            zarr.copy_all(zarr.open(store), f, **Bitshuffle())

        # Delete the Zarr store.
        store.clear()
def get_dask_data(df, target_name):
    """Turn a pandas frame into dask arrays and split them into train/test.

    The frame is partitioned in chunks of 100 rows, the ``target_name`` column
    is separated from the features, and the features are one-hot encoded.
    Returns whatever ``train_test_split_dask`` returns for a 20% test split
    with ``random_state=42``.
    """
    frame = dd.from_pandas(df, chunksize=100)
    labels = frame[target_name]
    del frame[target_name]  # Remove target information from dataframe

    # Ask Dask to start work on these in the background.
    frame, labels = persist(frame, labels)
    progress(frame, labels)

    encoded = dd.get_dummies(frame.categorize()).persist()
    X = encoded.to_dask_array(lengths=True)
    y = labels.to_dask_array(lengths=True)
    return train_test_split_dask(X, y, test_size=.2, random_state=42)
def complexity_with_distributed(file_list):
    """Compute the complexity of each file through the dask master server.

    Args:
        file_list: list of files to be computed

    Returns:
        JSON string of the gathered per-file results.
    """
    # Time the submission plus the progress display.
    started_at = time.time()
    pending = dask_master_server.map(
        repo_complexity_analyzer.complexity_analyzer, file_list)
    progress(pending)
    elapsed = time.time() - started_at
    print("--- %s seconds ---" % elapsed)

    return json.dumps(dask_master_server.gather(pending))
def _(w):
    """Callback taking one argument ``w`` (unused); refreshes the map or the line plot.

    Without a marker on the map, a precipitation field is selected (nearest
    to ``at_time`` when it is set, otherwise summed over the
    ``from_time``..``to_time`` slice), persisted, and overlaid on the map
    while a progress bar is displayed in ``out``. With a marker, the series
    at the marker location is computed and pushed to ``line``.
    """
    if map_menu.marker is None:
        if at_time is not None:
            # Single time step: the field closest to the requested time.
            t = at_time.value
            da = ds.precipitationCal.sel(time=t, method='nearest').persist()
        else:
            # Time range: sum over the selected slice.
            t0 = from_time.value
            t1 = to_time.value
            da = ds.precipitationCal.sel(time=slice(t0, t1)).sum(
                ['time']).persist()
        # Replace the contents of the output area with the progress widget.
        pbar = progress(da, notebook=True, multi=False)
        out.clear_output()
        with out:
            display(pbar)
        #hbox.children = list(hbox.children) + [pbar.bar_widget]
        # Keep a computed copy restricted to latitudes -85..85 on the menu.
        map_menu.da = da.sel(lat=slice(-85, 85)).compute()
        io = overlay(map_menu.m, map_menu.current_io, da, label)
        map_menu.current_io = io
    else:
        lat, lon = map_menu.marker.location
        #label.value = str(ds)
        # Series at the grid point nearest to the marker.
        da = ds.precipitationCal.sel(lat=lat, lon=lon, method='nearest').compute()
        s = da.to_series()
        line.x = s.index.values
        line.y = s
def save_multiple_image_sequences(
    array: da.Array,
    intermediate_store: Union[Path, str],
    output_files: Iterable[Path],
    write_mode: str = "x",
) -> None:
    """
    Compute several image sequences and store each one in its own HDF5 file.

    Iterating over ``array`` yields the sub-arrays. Sub-array ``i`` is first
    written to component ``"<i>/data"`` of a Zarr store, then group ``i`` of
    that store is copied into the ``i``-th output file. The Zarr store is
    cleared at the end.

    Args:
        array:               Collection of Dask arrays, one per output file.
        intermediate_store:  Location of the temporary Zarr store; its suffix
                             is replaced by '.zarr'.
        output_files:        Output HDF5 paths, in the same order as ``array``.
        write_mode:          HDF5 file opening mode.  See :class:`h5py.File`.
    """
    intermediate_store = Path(intermediate_store).with_suffix(".zarr")

    # More generous connection timeouts than the default 30s.
    generous_timeouts = {
        "distributed.comm.timeouts.connect": "60s",
        "distributed.comm.timeouts.tcp": "60s",
        "distributed.deploy.lost-worker-timeout": "60s",
        "distributed.scheduler.idle-timeout": "600s",
        "distributed.scheduler.locks.lease-timeout": "60s",
    }
    with dask.config.set(generous_timeouts):
        # Overwrite any existing Zarr storage; defer the computation and get
        # the stored Arrays back so they can be computed with a progress bar.
        to_zarr_options = {"overwrite": True, "compute": False, "return_stored": True}
        stored = [
            sub_array.to_zarr(
                intermediate_store, component=f"{i:d}/data", **to_zarr_options
            )
            for i, sub_array in enumerate(array)
        ]

        # Compute and store the values, showing a progress bar.
        progress([sub_array.persist() for sub_array in stored])
        print()
        print("Transferring the images to the output files.")

        store = zarr.DirectoryStore(str(intermediate_store))
        arrays = zarr.open(store)

        # The function name is kept: dask derives the task names from it.
        @delayed
        def sequence_to_disk(i, output_file):
            with h5py.File(output_file, write_mode) as f:
                return zarr.copy_all(arrays[i], f, **Bitshuffle())

        transfer = [
            sequence_to_disk(i, o).persist() for i, o in enumerate(output_files)
        ]
        progress(transfer)
        da.compute(transfer)
        print()

        # Delete the Zarr store.
        store.clear()
def main():
    """Run a small map/reduce demo against the scheduler at localhost:8786.

    Maps ``set_value`` over ``range(100)``, then ``square``, then ``neg``,
    reduces with ``sum`` on the cluster and prints the total.
    """
    client = Client('localhost:8786')
    values = client.map(set_value, range(100))
    squared = client.map(square, values)
    negated = client.map(neg, squared)
    total = client.submit(sum, negated)
    # progress() is called for its display; printing its return value only
    # emitted "None" in a script. End the progress-bar line instead.
    progress(total)
    print()
    print(total.result())
def main():
    """Run a small map/reduce demo against the scheduler at localhost:8786.

    Maps ``set_value`` over ``range(100)``, then ``square``, then ``neg``,
    reduces with ``sum`` on the cluster and prints the total.
    """
    client = Client('localhost:8786')
    values = client.map(set_value, range(100))
    squared = client.map(square, values)
    negated = client.map(neg, squared)
    total = client.submit(sum, negated)
    # progress() is called for its display; printing its return value only
    # emitted "None" in a script. End the progress-bar line instead.
    progress(total)
    print()
    print(total.result())
def distributed_main2():
    """Submit ``NUM_JOBS`` tasks, sum their results, close the client.

    Every task gets the same argument, ``Y_DIM // NUM_JOBS``. The total is
    printed and returned.
    """
    # Sends very little data over the network to each worker.
    # NOTE(review): all submissions share identical arguments - confirm that
    # the scheduler treating them as one task is acceptable.
    futures = [
        client.submit(parallel_func2, Y_DIM // NUM_JOBS)
        for _ in range(NUM_JOBS)
    ]
    progress(futures)

    total = sum(future.result() for future in as_completed(futures))
    print(total)
    client.close()
    return total
def progress(self, futures):
    """
    wrapper to log the progress of futures

    Does nothing when no client is configured.

    Arguments
    ---------
    futures : list of <generalized> futures
        futures that are to be gathered

    Raises
    ------
    Exception
        If a client is set but ``self.library[0]`` is not ``'dask'``.
    """
    if self.client is None:
        return

    if self.library[0] != 'dask':
        # The message used to say "actor launch functionality", copied from
        # launch_actor; name the feature that is actually missing.
        raise Exception(
            f"{self.library} is supported, but without progress functionality!"
        )

    distributed.progress(futures)
def get_data(df, target, is_dask=False, chunksize=200):
    """One-hot encode the features, separate the target and split train/test.

    With ``is_dask`` the frame is converted to a dask dataframe (partitions of
    ``chunksize`` rows) and dask arrays are produced; otherwise NumPy arrays
    are produced and the ``target`` column is deleted from the caller's frame.
    Returns whatever ``train_test_split_normal`` returns for a 10% test split
    with ``random_state=18``.
    """
    if is_dask:
        frame = dd.from_pandas(df, chunksize=chunksize)
        labels = frame[target]
        del frame[target]
        # Start computing both pieces in the background and show progress.
        frame, labels = persist(frame, labels)
        progress(frame, labels)
        encoded = dd.get_dummies(frame.categorize()).persist()
        y = labels.to_dask_array(lengths=True)
        X = encoded.to_dask_array(lengths=True)
    else:
        y = df[target].to_numpy()
        del df[target]
        X = pd.get_dummies(df).to_numpy()

    return train_test_split_normal(X, y, test_size=.1, random_state=18)
def cache_sofk(fout, fjson, Sk, nx, verbose=True, para=True):
    """Average ``Sk`` over all rows of a JSON table and save the result.

    For every row of the table read from ``fjson``, ``Sk(kvecs, positions)``
    is evaluated. The mean and its standard error over rows are passed to
    ``shavg`` and the returned columns are written to ``fout``.

    Args:
        fout: output text file, written with ``np.savetxt``
        fjson: JSON file read with ``pandas.read_json``; the first row's
            ``'box'`` and every row's ``'positions'`` are used
        Sk: callable ``Sk(kvecs, positions)``
        nx: passed to ``legal_kvecs`` together with the box
        verbose: show a progress bar
        para: evaluate through dask with a local, thread-based client
    """
    import pandas as pd

    client = None
    if para:
        import dask
        from dask.distributed import Client, progress
        client = Client(processes=False)
        Sk = dask.delayed(Sk)

    try:
        mdf = pd.read_json(fjson)
        box = mdf.iloc[0]['box']
        kvecs = legal_kvecs(nx, box)

        # The text progress bar is only used for the serial path.
        show_bar = verbose and (not para)
        if show_bar:
            from progressbar import ProgressBar, Bar, ETA
            widgets = [Bar('>'), ETA()]
            bar = ProgressBar(widgets=widgets, maxval=len(mdf))
            bar.start()

        skl = []
        sk2l = []
        for isk, (label, row) in enumerate(mdf.iterrows(), start=1):
            com = np.array(row['positions'])
            sk = Sk(kvecs, com)
            skl.append(sk)
            sk2l.append(sk**2)
            if show_bar:
                bar.update(isk)

        # Mean and standard error of the mean over rows.
        skm = np.mean(skl, axis=0)
        ske = (np.mean(sk2l, axis=0)-skm**2)**0.5/len(skl)**0.5
        if para:
            skm, ske = dask.persist(skm, ske)
            if verbose:
                progress(skm, ske)
            skm, ske = dask.compute(skm, ske)
    finally:
        # Shut the local cluster down even if reading or computing failed.
        if client is not None:
            client.shutdown()

    # spherical average
    uk, uskm, uske = shavg(kvecs, skm, ske)
    np.savetxt(fout, np.array([uk, uskm, uske]).T)
def main_body(cluster, IO, DATA, RESULT, RESTART, futures, nfutures):
    """Submit one ``cosipy_core`` task per unmasked grid point and store the results.

    ``futures`` is appended to in place. Each completed task's result is
    handed to ``IO.write_results_future`` (element 0) and
    ``IO.write_restart_future`` (element 1). ``RESULT`` and ``RESTART`` are
    not used here. The module-level ``restart`` flag decides whether a
    restart grid is passed to the tasks.
    """
    rule = '--------------------------------------------------------------'

    with Client(cluster, processes=False) as client:
        print(rule)
        print('\t Starting clients ... \n')
        print(client)
        print(rule + ' \n')

        # Do only consider non-nan fields
        DATA = DATA.where(DATA.MASK == 1, drop=True)

        # Go over the whole grid
        for i, j in product(DATA.lat, DATA.lon):
            cell_mask = DATA.MASK.sel(lat=i, lon=j)

            if cell_mask == 1:
                # Provide restart grid if necessary
                if restart == False:
                    nfutures = nfutures + 1
                    futures.append(
                        client.submit(cosipy_core, DATA.sel(lat=i, lon=j)))
                elif restart == True:
                    nfutures = nfutures + 1
                    futures.append(
                        client.submit(cosipy_core, DATA.sel(lat=i, lon=j),
                                      IO.create_grid_restart().sel(lat=i, lon=j)))

        # Finally, do the calculations and print the progress
        progress(futures)

        if restart == True:
            IO.get_grid_restart().close()

        print('\n')
        print(rule)
        print('Copy local results to global')
        print(rule + ' \n')

        for future in as_completed(futures):
            results = future.result()
            IO.write_results_future(results[0])
            IO.write_restart_future(results[1])
def multiprocess(jobs, client):
    """Process jobs using a dask cluster.

    Inputs:
    jobs   = list of jobs where each job is a list of workflow scripts and parameters
    client = dask cluster client object

    The client is shut down once the progress display returns.

    :param jobs: list
    :param client: distributed.client.Client
    """
    # Submit every job to the scheduler and keep the futures.
    processed = [client.submit(_process_images_multiproc, job) for job in jobs]

    # Watch job progress and print a progress bar
    progress(processed)

    # Each job outputs results to disk so we do not need to gather results here
    client.shutdown()
def gather_actor_result(self, future):
    """
    wrapper to pull the .result() of a method called to an actor

    Without a client the argument is returned unchanged.

    Arguments
    ---------
    future : <generalized> future
        the future object to be collected from an actor

    Raises
    ------
    Exception
        If a client is set but ``self.library[0]`` is not ``'dask'``.
    """
    # No client: the "future" already is the value.
    if self.client is None:
        return future

    if self.library[0] != 'dask':
        raise Exception(
            f"{self.library} is supported, but without actor-gather functionality!"
        )

    distributed.progress(future)
    return future.result()
def write_images_for_annotation(self, pdf_dir, img_dir):
    """
    Helper function that will write images from PDFs for annotation.

    Each PDF is converted on the cluster; every resulting page image is
    copied to ``img_dir`` with a ``.png`` suffix appended to its name, and
    ``self.tmp_dir`` is removed afterwards.

    :param pdf_dir: Path to PDFs to write images for
    :param img_dir: Output directory where images will be written
    """
    logger.info(f"Converting PDFs to images and writing to target directory: {img_dir}")
    pdfnames = get_pdf_names(pdf_dir)
    render_pdf = functools.partial(Ingest.pdf_to_images, 'na', self.images_tmp)

    pending = [
        self.client.submit(render_pdf, pdf, resources={'process': 1})
        for pdf in pdfnames
    ]
    progress(pending)
    per_pdf_pages = [future.result() for future in pending]

    # Skip PDFs that produced nothing and flatten the rest into page paths.
    page_paths = [
        f'{tmp_dir}/{pdf_name}_{pn}'
        for pages in per_pdf_pages
        if pages is not None
        for tmp_dir, pdf_name, pn in pages
    ]

    for page_path in page_paths:
        png_name = os.path.basename(page_path) + '.png'
        shutil.copy(page_path, os.path.join(img_dir, png_name))

    logger.info('Done.')
    shutil.rmtree(self.tmp_dir)
def _log_prob_pt_samples_dask(log_p_pt, samples, nthreads=1, cluster=None):
    """Evaluate ``log_p_pt`` on every sample with dask and stack the results.

    Parameters
    ----------
    log_p_pt : callable
        Applied to each element of ``samples``.
    samples : sequence
        Inputs, distributed as a dask bag.
    nthreads : int
        Number of single-threaded workers of the local cluster that is
        started when ``cluster`` is None.
    cluster : optional
        Existing dask cluster to use; it is not closed by this function.

    Returns
    -------
    numpy.ndarray
        ``np.stack`` of the per-sample results.
    """
    from dask.distributed import Client, LocalCluster, progress
    import dask.bag as db

    owns_cluster = cluster is None
    if owns_cluster:
        # start local dask cluster
        _cl = LocalCluster(n_workers=nthreads, threads_per_worker=1)
    else:
        # use provided dask cluster
        _cl = cluster

    try:
        with Client(_cl):
            # calculate the point-wise probabilities and stack them together.
            # Persist first: a lazy bag holds no futures, so progress() would
            # have nothing to display.
            _log_pred = db.from_sequence(samples).map(log_p_pt).persist()
            progress(_log_pred)
            ret = np.stack(_log_pred.compute())
    finally:
        # Close the cluster we started, also when the computation fails.
        if owns_cluster:
            _cl.close()
    return ret
def search(model, X, y, params, method="randomized", n_iter=30, cv=5, **kwargs):
    """Run a cross-validated search for hyperparameters.

    Args:
        model: estimator handed to the search class
        X, y: data passed to ``fit``
        params: parameter distributions / grid / search spaces
        method: ``'randomized'``, ``'grid'`` or ``'bayes'`` (case-insensitive)
        n_iter: number of iterations (ignored for ``'grid'``)
        cv: cross-validation setting forwarded to the search class
        **kwargs: accepted and ignored

    Returns:
        The fitted search object.

    Raises:
        ValueError: if ``method`` is not one of the supported names.
    """
    method_key = method.lower()
    if method_key == "randomized":
        searcher = RandomizedSearchCV(model, param_distributions=params,
                                      n_iter=n_iter, cv=cv)
    elif method_key == "grid":
        searcher = GridSearchCV(model, param_grid=params, cv=cv)
    elif method_key == "bayes":
        searcher = BayesSearchCV(model, search_spaces=params,
                                 n_iter=n_iter, cv=cv)
    else:
        message = ("'method' must be either 'randomized', 'grid' or 'bayes'."
                   " Got method='{}'".format(method))
        LOGGER.error(message)
        raise ValueError(message)

    method_name = method.capitalize() + "SearchCV"
    LOGGER.info("Beginning " + method_name)
    when_started = time()
    # NOTE(review): progress() receives the return value of fit(), i.e. after
    # fitting has finished - confirm this call is still wanted.
    progress(searcher.fit(X, y))
    total_time = time() - when_started
    n_settings = len(searcher.cv_results_['params'])
    # Logger.warn is a deprecated alias; use warning().
    LOGGER.warning(
        "{} took {:.2f} seconds for {} candidates parameter settings.".format(
            method_name, total_time, n_settings))
    return searcher
def run_parallel(collector, seed_epoch, collect):
    """Apply ``collector`` to every element of ``seed_epoch``.

    ``collect`` selects the backend: with ``collect['dask']`` a ``CPUCluster``
    with ``collect['workers']`` workers is started and the gathered results
    are returned; otherwise more than one worker uses ``Parallel``, and a
    single worker returns a lazy generator. Rendering is only allowed in the
    single-worker case.
    """
    if collect['dask']:
        # Launching Dask on cluster
        assert not collect['render'], 'Can not render using dask'
        from daskoia import CPUCluster
        cluster = CPUCluster(mem_req=6000)
        cluster.start_workers(collect['workers'])
        client = Client(cluster)
        print('Scheduler Info {}'.format(client.scheduler_info()))
        futures_traj = client.map(collector, seed_epoch)
        progress(futures_traj)
        return client.gather(futures_traj)

    if collect['workers'] > 1:
        assert not collect['render'], 'Can not render using multiple processes'
        pool = Parallel(n_jobs=collect['workers'])
        return pool(delayed(collector)(se) for se in tqdm(seed_epoch))

    # Single worker: evaluated lazily, as the caller iterates.
    return (collector(se) for se in tqdm(seed_epoch))
def run_link(input_path, output_path, cluster, dataset_id):
    """Link entities in every parquet of ``input_path`` and build an entity table.

    For each non-empty ``*.parquet`` file, ``link`` is run on every value of
    the ``content`` column; the file is written to ``output_path`` with the
    added columns ``ents_linked`` and ``ents_unlinked``. Afterwards
    ``construct_linked_kb`` is run on every collected entity set and the
    combined, de-duplicated result is written to
    ``<output_path>/<dataset_id>_entities.parquet``. If no entity frames are
    produced, that last file is not written.

    :param input_path: directory containing the input parquet files
    :param output_path: directory receiving the linked parquet files
    :param cluster: address/cluster passed to ``Client``
    :param dataset_id: value of the ``dataset_id`` column and output file prefix
    """
    logger.info("Setting up client")
    client = Client(cluster, serializers=['msgpack', 'dask'],
                    deserializers=['msgpack', 'dask', 'pickle'])
    logger.info(client)
    full_ent_set = []
    # Assumption: The input parquet fits into memory. Will need to switch to dask distributed otherwise
    for pq in glob.glob(os.path.join(input_path, '*.parquet')):
        df = pd.read_parquet(pq)
        if len(df) == 0:
            logger.warning(f"{pq} is empty -- skipping.")
            continue
        contents = df['content'].tolist()
        results = [client.submit(link, c, resources={'linking': 1}) for c in contents]
        progress(results)
        results = [r.result() for r in results]
        nonlinked_lists, ent_set = zip(*results)
        # The former test `e is not None or len(e) > 0` called len(None) for
        # a None entry. Non-None entries (empty ones included) are still
        # converted to lists exactly as before.
        ent_set = [list(e) if e is not None else None for e in ent_set]
        full_ent_set.extend(ent_set)
        nonlinked_lists = [list(e) for e in nonlinked_lists]
        df['ents_linked'] = ent_set
        df['ents_unlinked'] = nonlinked_lists
        basename = os.path.basename(pq)
        df.to_parquet(os.path.join(output_path, basename))

    logger.info('Starting entity info extraction')
    dfs = [client.submit(construct_linked_kb, e, resources={'linking': 1})
           for e in full_ent_set]
    progress(dfs)
    dfs = [d.result() for d in dfs]
    dfs = [d for d in dfs if d is not None]
    if not dfs:
        # pd.concat([]) raises ValueError; nothing to write in that case.
        logger.warning('No entity information extracted -- skipping entities output.')
        return
    dfs = pd.concat(dfs)
    dfs.drop_duplicates(inplace=True)
    dfs['aliases'] = dfs.apply(lambda row: list(row['aliases']), axis=1)
    dfs['types'] = dfs.apply(lambda row: list(row['types']), axis=1)
    dfs['dataset_id'] = dataset_id
    dfs.to_parquet(os.path.join(output_path, f'{dataset_id}_entities.parquet'))
def run(src_dir, dst_dir):
    """Read the TIFFs under ``src_dir``, save them as Zarr, then convert to HDF5.

    Three stages run on the cluster, each followed by a progress display:
    persisting the loaded data, writing one Zarr per file, and converting
    each Zarr to HDF5. Output paths are built under ``dst_dir``.
    """
    client = get_client()

    # load data
    tiff_paths = find_src_files(src_dir, "tif")
    tiff_images = tiff_paths.map(read_tiff)

    # create destination
    create_dst_dir(dst_dir)

    # # downsample
    # bin4_data = raw_data.map(partial(downsample_naive, ratio=(1, 4, 4)))
    # bin4_data = bin4_data.map(da.rechunk)
    # bin4_data = client.persist(bin4_data)
    #
    # logger.info("downsampling")
    # progress(bin4_data)

    logger.info("persist data on cluster")
    persisted = client.persist(tiff_images, priority=-10)
    progress(persisted)

    # save intermediate result
    zarr_paths = tiff_paths.map(partial(build_zarr_path, dst_dir))
    zarr_writes = db.zip(zarr_paths, persisted).starmap(write_zarr, path="raw")

    logger.info("save as zarr")
    progress(client.compute(zarr_writes, priority=10))

    # convert to h5 for ingestion
    h5_paths = zarr_paths.map(partial(build_h5_path, dst_dir))
    h5_conversions = db.zip(zarr_paths, h5_paths).starmap(convert_hdf5)

    logger.info("convert zarr to h5")
    progress(client.compute(h5_conversions, priority=20))
def main():
    """Command-line entry point: rebuild issues and upload them to S3.

    Options are parsed with docopt from the module docstring. A dask client
    is created (kubernetes cluster, local in-process cluster, or an existing
    scheduler). For ``rebuild_articles`` every batch of the filter config is
    rebuilt year by year, then compressed, uploaded and cleaned up.
    """
    # Bound before the handler is installed: the handler reads this variable
    # and would otherwise fail if SIGINT arrived before a cluster was created.
    cluster = None

    def signal_handler(*args):
        # Handle any cleanup here
        print('SIGINT or CTRL-C detected. Exiting gracefully'
              ' and shutting down the dask kubernetes cluster')
        if cluster:
            cluster.close()
        exit(0)

    arguments = docopt(__doc__)
    clear_output = arguments["--clear"]
    bucket_name = f's3://{arguments["--input-bucket"]}'
    output_bucket_name = arguments["--output-bucket"]
    outp_dir = arguments["--output-dir"]
    filter_config_file = arguments["--filter-config"]
    output_format = arguments["--format"]
    scheduler = arguments["--scheduler"]
    log_file = arguments["--log-file"]
    launch_kubernetes = arguments["--k8"]
    log_level = logging.DEBUG if arguments["--verbose"] else logging.INFO
    languages = arguments["--languages"]

    signal.signal(signal.SIGINT, signal_handler)

    if languages:
        languages = languages.split(',')

    init_logging(log_level, log_file)

    # clean output directory if existing
    if outp_dir is not None and os.path.exists(outp_dir):
        if clear_output is not None and clear_output:
            shutil.rmtree(outp_dir)
            os.mkdir(outp_dir)

    with open(filter_config_file, 'r') as file:
        config = json.load(file)

    # start the dask local cluster
    if scheduler is None:
        if launch_kubernetes:
            cluster = DaskCluster(
                namespace="dhlab",
                cluster_id="impresso-pycommons-k8-rebuild",
                scheduler_pod_spec=make_scheduler_configuration(),
                worker_pod_spec=make_worker_configuration(
                    docker_image="ic-registry.epfl.ch/dhlab/impresso_pycommons:v1",
                    memory="5G"))
            try:
                cluster.create()
                cluster.scale(50, blocking=True)
                client = cluster.make_dask_client()
                print(client.get_versions(check=False))
            except Exception as e:
                print(e)
                cluster.close()
                exit(0)
            print(client)
        else:
            cluster = None
            client = Client(processes=False, n_workers=8, threads_per_worker=1)
    else:
        cluster = None
        client = Client(scheduler)

    logger.info(f"Dask cluster: {client}")

    if arguments["rebuild_articles"]:
        try:
            for n, batch in enumerate(config):
                rebuilt_issues = []
                print(f'Processing batch {n + 1}/{len(config)} [{batch}]')
                newspaper = list(batch.keys())[0]
                start_year, end_year = batch[newspaper]

                for year in range(start_year, end_year):
                    print(f'Processing year {year}')
                    print('Retrieving issues...')
                    try:
                        input_issues = read_s3_issues(newspaper, year, bucket_name)
                    except FileNotFoundError:
                        print(f'{newspaper}-{year} not found in {bucket_name}')
                        continue

                    issue_key, json_files = rebuild_issues(
                        issues=input_issues,
                        input_bucket=bucket_name,
                        output_dir=outp_dir,
                        dask_client=client,
                        format=output_format,
                        filter_language=languages)
                    rebuilt_issues.append((issue_key, json_files))

                print((f"Uploading {len(rebuilt_issues)} rebuilt bz2files "
                       f"to {output_bucket_name}"))
                b = db.from_sequence(rebuilt_issues) \
                    .starmap(compress, output_dir=outp_dir) \
                    .starmap(upload, bucket_name=output_bucket_name) \
                    .starmap(cleanup)
                future = b.persist()
                progress(future)
        except Exception as e:
            traceback.print_tb(e.__traceback__)
            print(e)
        finally:
            # Single place for closing; it also runs after an exception, so
            # the cluster is no longer closed twice on the error path.
            if cluster:
                cluster.close()
    elif arguments["rebuild_pages"]:
        print("\nFunction not yet implemented (sorry!).\n")
def main(): parser = argparse.ArgumentParser( description = 'Simple example for using dask-joqueue in SLURM') parser.add_argument('--proc_per_job', type = int, default = 1, help = 'Number of processes per job.') parser.add_argument('--cores_per_proc', type = float, default = 2, help = 'Number of cores per process.') parser.add_argument('--n_jobs', type = int, default = 1, help = 'Number of jobs') parser.add_argument('--array', type = int, default = 0, help = 'EXPERIMENTAL. If >0, then submit an job-array '+\ 'of this size. The total number of jobs will'+\ ' be `array * n_jobs`.') parser.add_argument('--container', type = str, help = 'Path to singularity container. If `None`, '+\ 'then assumes conda environment.') parser.add_argument('--qos', type = str, help = 'QOS to use.') parser.add_argument('--dry', action = 'store_true', help = 'Print job script and exit (no submission)') parser.add_argument('--load', type = int, default = 1000, help = 'Load for the function.') args = parser.parse_args() n_procs = args.proc_per_job * args.n_jobs params = { 'cores' : int(args.cores_per_proc * args.proc_per_job), 'memory' : '{0:d}00MB'.format(args.proc_per_job*5), 'processes' : args.proc_per_job, # The name to assign to each worker 'name' : 'dask_test' } job_extra = ['--requeue'] env_extra = [] if not args.qos is None: job_extra.append('--qos {}'.format(args.qos)) if args.array > 0: n_procs = n_procs * args.array job_extra.append('--array 0-{0:d}'.format(args.array - 1)) """ This is added to ensure that each worker has a unique ID. This may be unnecessary. """ env_extra.append( 'JOB_ID=${SLURM_ARRAY_JOB_ID%;*}_${SLURM_ARRAY_TASK_ID%;*}') if not args.container is None: """ When using a container, dask needs to know how to enter the python environment. Note: The binding `-B..` is cluster(OpenMind) specific but can generalize. The binding is required since `singularity` will not bind by default. 
""" cont = os.path.normpath(args.container) bind = cont.split(os.sep)[1] bind = '-B /{0!s}:/{0!s}'.format(bind) py = 'singularity exec {0!s} {1!s} python3'.format(bind, cont) params.update({'python' : py}) """ Dask will generate a job script but some elements will be missing due to the way the singularity container with interface with slurm. The `modules` need to initialized and `singularity` needs to be added. """ env_extra += [ 'source /etc/profile.d/modules.sh', 'module add openmind/singularity/2.6.0'] params.update({ 'job_extra' : job_extra, 'env_extra' : env_extra}) cluster = SLURMCluster(**params) """ Display the job script. """ print(cluster.job_script()) pprint(params) t0 = time.time() num_crunch(100) expected_dur = (time.time() - t0) * args.load print('Expected time of linear call: {0:f}'.format(expected_dur)) if args.dry: return """ Scale the cluster to the number of jobs. """ print('Scaling by {}'.format(args.n_jobs)) cluster.scale_up(args.proc_per_job * args.n_jobs) """ Setup a client that interfaces with the workers """ client = distributed.Client(cluster) time.sleep(10) print(cluster) print(client) pprint(client.has_what()) # pprint(client.scheduler_info()) """ Generate a transaction. """ futures = client.map(num_crunch, range(args.load)) t0 = time.time() """ Compute (and then discard) while keeping track of progress. """ distributed.progress(futures) dur = time.time() - t0 msg = '\n\nSpeed up of {0:f}x ({1:f}/{2:f})'.format((expected_dur / dur), expected_dur, dur) print(msg) msg = 'Ideal speed up is {0:f}x'.format(n_procs) print(msg) """
# sigma = [0.1]
# r_fit = [6]
# rep = [0, 1, 2, 3]

# Full factorial design: one row per combination of the parameter values.
param_list = [n, t, r, sigma, r_fit, rep, const]
params = list(itertools.product(*param_list))
param_df = pd.DataFrame(
    params,
    columns=['n', 't', 'r', 'sigma', 'r_fit', 'rep', 'const'],
)

# setup dask job
client = Client()

# One delayed simulation per parameter combination.
lazy_results = [
    dask.delayed(run_simulation)(*parameters) for parameters in param_df.values
]

# Start the work in the background and show its progress.
futures = dask.persist(*lazy_results)
progress(futures)

# call computation
results = dask.compute(*futures)

data = pd.DataFrame(
    results,
    columns=[
        'loss_true', 'max_qnorm_ub_true', 'loss_fit', 'max_qnorm_ub_fit',
        'gen_err_fit',
    ],
)

# param_df.to_csv("params_max.csv")
# data.to_csv("results_max.csv")

# Parameters and results side by side, joined on the row index.
table = param_df.join(data)
table.to_csv("max_n.csv")