def test_run_dask(fix_task_env):
    import numpy as np
    from dask import delayed as dl
    from dask.distributed import Client

    dc = Client(processes=False)
    input_task_example, gathered_task_example, post_processing_task_example = fix_task_env

    parts = {'a': [0., 1., 2.], 'b': [-3., 10., 2.], 'c': [20.]}
    numpoints = 20
    prefactor = 0.1

    input_delayed = dl(input_task_example)(parts)
    gathered_delayed = dl(gathered_task_example, nout=1)([input_delayed],
                                                         [numpoints])[0]
    post_proc_delayed = dl(post_processing_task_example)(input_delayed,
                                                         gathered_delayed,
                                                         prefactor)
    input_future = dc.compute(input_delayed)
    gathered_future = dc.compute(gathered_delayed)
    post_proc_future = dc.compute(post_proc_delayed)
    input_data = input_future.result()
    gathered_data = gathered_future.result()
    post_proc_data = post_proc_future.result()

    assert input_data == parts
    gather_results = {}
    for part in parts:
        gather_results[part] = np.linspace(0.0, 1.0, numpoints)
    for part in gather_results:
        assert np.all(gathered_data[part] == gather_results[part])
    post_proc_results = 0.0
    for part in parts:
        post_proc_results += prefactor * np.sum(input_data[part]) * \
            np.sum(gather_results[part])
    assert (post_proc_data == post_proc_results)
class DaskCUDF:
    def __init__(self):
        self._dask = None
        self._client = None
        self._cluster = None

    def create(self, *args, **kwargs):
        logger.print(JUST_CHECKING)
        logger.print("-----")
        logger.print(STARTING_DASK)

        cluster = LocalCUDACluster()
        self._client = Client(cluster, *args, **kwargs)
        self._cluster = cluster
        return self

    def load(self, session):
        self._client = session
        return self

    @property
    def dask(self):
        return self._client

    def compute(self, *args, **kwargs):
        return self._client.compute(*args, **kwargs)

    def info(self):
        return self._client.scheduler_info()

    def cuda_cluster(self):
        return self._cluster
def test_wait_for_future():
    client = Client(processes=False,
                    n_workers=1,
                    threads_per_worker=1,
                    dashboard_address=None)

    fut = client.compute(slow_compute(1))
    rr = list(wait_for_future(fut, 0.1))
    assert fut.done()
    assert len(rr) > 1

    # Check that exception doesn't leak out
    fut = client.compute(slow_compute(1, fail=True))
    rr = list(wait_for_future(fut, 0.1))
    assert fut.done()
    assert fut.status == "error"
    assert len(rr) > 1
    print(fut)
class DaskManager(metaclass=Singleton):
    def __init__(self):
        self.client = Client(
            f'{settings.DASK_SCHEDULER_HOST}:{settings.DASK_SCHEDULER_PORT}')

    def compute(self, graph):
        future = self.client.compute(graph)
        future.add_done_callback(self.task_complete)
        dask_task = DaskTask.objects.create(task_key=future.key)
        return dask_task

    def get_future_status(self, task_key):
        return Future(key=task_key, client=self.client).status

    @staticmethod
    def task_complete(future):
        task = DaskTask.objects.get(pk=future.key)
        if future.status == 'finished':
            task.status = future.status
            task.result = future.result()
            task.save()
        elif future.status == 'error':
            task.status = future.status
            task.result = traceback.extract_tb(
                future.traceback()) + [future.exception()]
            task.save()
            # will cause exception to be thrown here
            future.result()
        else:
            logger.error('Task completed with unhandled status: ' +
                         future.status)
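# A hypothetical caller-side sketch for the DaskManager above: submit a small
# delayed graph, keep the returned DaskTask, and poll its status by key. The
# workload and the polling loop are illustrative only and assume a configured
# Django settings module plus a reachable scheduler.
import time
import dask

manager = DaskManager()
task = manager.compute(dask.delayed(sum)([1, 2, 3]))
while manager.get_future_status(task.task_key) not in ('finished', 'error'):
    time.sleep(0.5)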
class Dask:
    def __init__(self):
        self._dask = None
        self._client = None

    def create(self, *args, **kwargs):
        logger.print(JUST_CHECKING)
        logger.print("-----")
        logger.print(STARTING_DASK)

        # Create Dask client
        self._client = Client(*args, **kwargs)

        # Print cluster info
        # self._client.scheduler_info()["workers"]
        return self

    def load(self, session):
        self._client = session
        # self._sc = session.sparkContext
        return self

    @property
    def dask(self):
        return self._client

    def compute(self, *args, **kwargs):
        return self._client.compute(*args, **kwargs)
def test_docker_sweep(fix_task_env):
    import subprocess
    from dask import delayed as dl
    from dask.distributed import Client
    import numpy as np

    # First, set up dask cluster, which for now is just one scheduler and one worker
    # TODO: Make this run in series of docker containers (e.g. with docker-compose)
    scheduler_command = ["dask-scheduler", "--port", "8781", "--no-bokeh"]
    worker_command = [
        "dask-worker",
        "--nthreads", "1",
        "--nprocs", "1",
        "--no-bokeh",
        "0.0.0.0:8781",
    ]
    processes = []
    try:
        processes.append(subprocess.Popen(scheduler_command))
        processes.append(subprocess.Popen(worker_command))
        client = Client("0.0.0.0:8781")

        # Next, perform the same sweep as before:
        input_task_example, gathered_task_example, post_processing_task_example = (
            fix_task_env)
        delayeds = []
        collected_inputs = []
        for tag1 in np.linspace(0.0, 10.0, 3):
            parts = {"a": [tag1, 1.0, 2.0], "b": [-3.0, 10.0, 2.0], "c": [20.0]}
            collected_inputs += [dl(input_task_example)(parts)]
        for tag2 in range(1, 3):
            # use the builtin int; np.int has been removed from recent numpy releases
            num_grid_vec = tag2 * np.ones((len(collected_inputs), ), dtype=int)
            collected_outputs = dl(gathered_task_example,
                                   nout=3)(collected_inputs, num_grid_vec)
            for i, output in enumerate(collected_outputs):
                input_data = collected_inputs[i]
                for tag3 in np.linspace(-1.0, 1.0, 4):
                    delayeds += [
                        dl(post_processing_task_example)(input_data, output, tag3)
                    ]
        results = []
        for obj in delayeds:
            results += [client.compute(obj).result()]
        assert len(results) == 24 and results[3] == 0.0
    finally:
        # Clean up processes
        for p in processes:
            p.kill()
def test_pagerank():
    gc.collect()
    input_data_path = r"../datasets/hibench_small/1/part-00000.csv"

    # Networkx Call
    pd_df = pd.read_csv(input_data_path, delimiter='\t', names=['src', 'dst'])
    G = nx.DiGraph()
    for i in range(0, len(pd_df)):
        G.add_edge(pd_df['src'][i], pd_df['dst'][i])
    nx_pr = nx.pagerank(G, alpha=0.85)
    nx_pr = sorted(nx_pr.items(), key=lambda x: x[0])

    # Cugraph snmg pagerank Call
    cluster = LocalCUDACluster(threads_per_worker=1)
    client = Client(cluster)

    t0 = time.time()
    chunksize = dcg.get_chunksize(input_data_path)
    ddf = dask_cudf.read_csv(input_data_path,
                             chunksize=chunksize,
                             delimiter='\t',
                             names=['src', 'dst'],
                             dtype=['int32', 'int32'])
    y = ddf.to_delayed()
    x = client.compute(y)
    wait(x)
    t1 = time.time()
    print("Reading Csv time: ", t1 - t0)

    new_ddf = dcg.drop_duplicates(x)
    t2 = time.time()
    pr = dcg.pagerank(new_ddf, alpha=0.85, max_iter=50)
    wait(pr)
    t3 = time.time()
    print("Running PR algo time: ", t3 - t2)

    t4 = time.time()
    res_df = pr.compute()
    t5 = time.time()
    print("Compute time: ", t5 - t4)
    print(res_df)

    t6 = time.time()
    # For bigdatax4, chunksize=100000000 to avoid oom on write csv
    res_df.to_csv('~/pagerank.csv', header=False, index=False)
    t7 = time.time()
    print("Write csv time: ", t7 - t6)

    # Comparison
    err = 0
    tol = 1.0e-05
    for i in range(len(res_df)):
        if (abs(res_df['pagerank'][i] - nx_pr[i][1]) > tol * 1.1):
            err = err + 1
    print("Mismatches:", err)
    assert err < (0.02 * len(res_df))

    client.close()
    cluster.close()
def process_tasks(tasks: Iterable[Task],
                  proc: TaskProc,
                  client: Client,
                  sink: S3COGSink,
                  check_exists: bool = True,
                  chunked_persist: int = 0,
                  verbose: bool = True) -> Iterator[str]:

    def prep_stage(
        tasks: Iterable[Task], proc: TaskProc
    ) -> Iterator[Tuple[Union[xr.Dataset, xr.DataArray, None], Task, str]]:
        for task in tasks:
            path = sink.uri(task)
            if check_exists:
                if sink.exists(task):
                    yield (None, task, path)
                    continue
            ds = proc(task)
            yield (ds, task, path)

    in_flight_cogs: Set[Future] = set()
    for ds, task, path in _with_lookahead1(prep_stage(tasks, proc)):
        if ds is None:
            if verbose:
                print(f"..skipping: {path} (exists already)")
            yield path
            continue

        if chunked_persist > 0:
            assert isinstance(ds, xr.DataArray)
            ds = chunked_persist_da(ds, chunked_persist, client)
        else:
            ds = client.persist(ds, fifo_timeout='1ms')

        if len(in_flight_cogs):
            done, in_flight_cogs = drain(in_flight_cogs, 1.0)
            for r in done:
                yield r

        if isinstance(ds, xr.DataArray):
            attrs = ds.attrs.copy()
            ds = ds.to_dataset(dim='band')
            for dv in ds.data_vars.values():
                dv.attrs.update(attrs)

        cog = client.compute(sink.dump(task, ds), fifo_timeout='1ms')
        rr = dask_wait(ds)
        assert len(rr.not_done) == 0
        del ds, rr

        in_flight_cogs.add(cog)

    done, _ = drain(in_flight_cogs)
    for r in done:
        yield r
def test_tree_reduce_delayed(n_parts, cluster):
    client = Client(cluster)

    func = delayed(sum)
    a = [delayed(i) for i in range(n_parts)]
    b = tree_reduce(a, func=func)
    c = client.compute(b, sync=True)

    assert (sum(range(n_parts)) == c)
def parallel_write(filename: str, darray: dask.array) -> None:
    """Distribute Zarr writing task to workers using dask.

    Input filename should have extension .zarr"""
    client = Client()
    out = darray.to_zarr(filename,
                         compressor=Blosc(cname='zstd',
                                          clevel=3,
                                          shuffle=Blosc.BITSHUFFLE),
                         compute=False)
    try:
        progress(client)  # I believe this is for visualization purpose.
        fut = client.compute(out)
    except BrokenPipeError:
        print('Process complete (likely)...')
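# A minimal usage sketch for parallel_write above: build a chunked dask array
# and hand it to the function. The array contents and the .zarr target path are
# illustrative; the Blosc/Client imports are assumed to come from the enclosing
# module, as in the function itself.
import dask.array as da

if __name__ == "__main__":
    darr = da.random.random((4096, 4096), chunks=(1024, 1024))
    parallel_write("example_output.zarr", darr)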
def test_tree_reduce_futures(n_parts, cluster):
    client = Client(cluster)
    try:
        a = [client.submit(s, i) for i in range(n_parts)]
        b = tree_reduce(a)
        c = client.compute(b, sync=True)

        assert (sum(range(n_parts)) == c)
    finally:
        client.close()
def test_docker_sweep(fix_task_env, fix_setup_docker):
    import subprocess
    from dask import delayed as dl
    from dask.distributed import Client
    import numpy as np

    # First, set up the docker + dask cluster, which for now is just one scheduler and one worker
    scheduler_command = [
        'dask-scheduler', '--port', '8781', '--bokeh-port', '8780'
    ]
    worker_command = [
        'dask-worker', '--nthreads', '1', '--nprocs', '1', 'localhost:8781'
    ]
    docker_command = ['docker', 'run', '-d', '--network', 'host', 'qmt:master']
    containers = []
    try:
        containers.append(
            subprocess.check_output(docker_command +
                                    scheduler_command).splitlines()[0])
        containers.append(
            subprocess.check_output(docker_command +
                                    worker_command).splitlines()[0])
        client = Client('localhost:8781')

        # Next, perform the same sweep as before:
        input_task_example, gathered_task_example, post_processing_task_example = fix_task_env
        delayeds = []
        collected_inputs = []
        for tag1 in np.linspace(0., 10., 3):
            parts = {'a': [tag1, 1., 2.], 'b': [-3., 10., 2.], 'c': [20.]}
            collected_inputs += [dl(input_task_example)(parts)]
        for tag2 in range(1, 3):
            # use the builtin int; np.int has been removed from recent numpy releases
            num_grid_vec = tag2 * np.ones((len(collected_inputs), ), dtype=int)
            collected_outputs = dl(gathered_task_example,
                                   nout=3)(collected_inputs, num_grid_vec)
            for i, output in enumerate(collected_outputs):
                input_data = collected_inputs[i]
                for tag3 in np.linspace(-1.0, 1., 4):
                    delayeds += [
                        dl(post_processing_task_example)(input_data, output, tag3)
                    ]
        results = []
        for obj in delayeds:
            results += [client.compute(obj).result()]
        assert len(results) == 24 and results[3] == 0.0
    finally:
        # Clean up the docker containers
        for c in containers:
            subprocess.check_output(['docker', 'kill', c])
def main(src_dir):
    logging.getLogger("tifffile").setLevel(logging.ERROR)
    coloredlogs.install(level="DEBUG",
                        fmt="%(asctime)s %(levelname)s %(message)s",
                        datefmt="%H:%M:%S")

    # assume we have tunneled the scheduler to local
    scheduler = "localhost:8786"
    logger.info(f'connecting to scheduler at "{scheduler}"')
    client = Client(scheduler, timeout="300s")  # 5 min
    print(client)

    src_dir = os.path.abspath(src_dir)
    files = glob.glob(os.path.join(src_dir, "*_predictions.h5"))
    logger.info(f"{len(files)} tile(s) to convert")

    dname = os.path.basename(src_dir)
    dname = dname.rsplit("_", 1)[0]
    dname = f"{dname}_labels"
    dst_dir = os.path.join(os.path.dirname(src_dir), dname)
    create_dir(dst_dir)

    futures = []
    for f in files:
        probabilities = read_h5(f, "predictions")
        label = as_label(probabilities, dtype=np.uint16)

        fname = os.path.basename(f)
        fname, _ = os.path.splitext(fname)
        fname = f"{fname}.tif"
        dst_path = os.path.join(dst_dir, fname)
        dst_path = write_tiff(dst_path, label)

        future = client.compute(dst_path)
        futures.append(future)

    with tqdm(total=len(futures)) as pbar:
        for future in as_completed(futures):
            try:
                dst_path = future.result()
                pbar.set_description(dst_path)
            except Exception as error:
                logger.exception(error)
            finally:
                pbar.update(1)
                del future

    logger.info("closing scheduler connection")
    client.close()
def connnect_glue():
    # os.system('dask-ssh 128.104.222.{103,104,105,107}')
    # subprocess.call('dask-ssh', '128.104.222.{103,104,105,107}')
    # time.sleep(10)
    import numpy as np

    client = Client('128.104.222.103:8786')
    client.restart()
    x = da.from_zarr('/mnt/cephfs/smltar_numpyarr/zarr_data_full')
    print(x)
    # y = x[0:1]
    # z = x[100:101]
    # m = x[1000:1001]
    # n = x[1500:1501]
    # p = x[1400:1401]
    y = x[0:30]
    z = x[100:130]
    m = x[1000:1030]
    n = x[1500:1530]
    p = x[1400:1430]
    # zc = x[108:208]
    # mc = x[1008:1108]
    # nc = x[1508:1608]
    # pc = x[1601:1701]
    sum = (y + z - m + p) * n  # note: shadows the builtin `sum`
    # sum2 = (zc + mc + nc + pc) * sum
    # sum3 = sum2 + (zc + mc + nc + pc) * sum
    # print(sum2)
    # sum.visualize('sum3')
    # frm = sum[15]
    fu = client.compute(sum)
    # p = r.result()
    # print(type(p))
    # return p
    re = fu.result()
    re = np.array(re)
    np.save("/mnt/cephfs/result/test", re[15])
    # print(p)
    return re[15]
def code_range(path_dict):
    """
    path_dict = RANGE_CATEGORIES[5]
    """
    # Create a numeric dictionary for these keys
    key_dict = {key: i + 1 for i, key in enumerate(path_dict.keys())}
    number_dict = {k: i for i, k in key_dict.items()}
    vals = key_dict.values()
    combos = [[c for c in combinations(vals, i + 1)] for i in range(len(vals))]
    combos = [c for sc in combos for c in sc]
    combo_keys = {}
    for combo in combos:
        key = "-".join([number_dict[c] for c in combo])
        value = seq_int(combo)
        combo_keys[key] = value

    # Assign each raster a unique value
    arrays = []
    for key, path in path_dict.items():
        value = key_dict[key]
        full_path = DP.join(path)
        array = xr.open_rasterio(full_path, chunks=CHUNKS)[0].data
        array[da.isnan(array)] = 0
        array[array > 0] = value
        arrays.append(array)

    # Stack everything together - we might have to save this to a temporary file
    stack = da.stack(arrays, axis=0)
    stack = stack.rechunk((stack.shape[0], 5000, 5000))
    stack = stack[:, 4000:10000, 4000:10000]

    # Try to map the function to each point
    client = Client()
    codes = da.apply_along_axis(seq_int, 0, stack, dtype="uint8")
    future = client.compute(codes)
    result = future.result()
    client.shutdown()
    client.close()

    # Save to temp and delete
    template = rasterio.open(full_path)
    temp_path = DP2.join("test.tif")
    with rasterio.Env():
        profile = template.profile
        profile.update(dtype=rasterio.uint8, count=1, compress='lzw')
        with rasterio.open(temp_path, 'w', **profile) as dst:
            dst.write(result)
def compute_tasks(tasks: Iterable[Any], client: Client,
                  max_in_flight: int = 3) -> Iterable[Any]:
    """ Parallel compute stream with back pressure.

    Equivalent to:

    .. code-block:: python

        (client.compute(task).result()
          for task in tasks)

    but with up to ``max_in_flight`` tasks being processed at the same time.
    Input/Output order is preserved, so there is a possibility of head of
    line blocking.

    .. note::

          lower limit is 3 concurrent tasks to simplify implementation,
          there is no point calling this function if you want one active
          task and supporting exactly 2 active tasks is not worth the
          complexity, for now. We might special-case 2 at some point.

    """
    # New thread:
    #    1. Take dask task from iterator
    #    2. Submit to client for processing
    #    3. Send it off to wrk_q
    #
    # Calling thread:
    #    1. Pull scheduled future from wrk_q
    #    2. Wait for result of the future
    #    3. yield result to calling code
    from .generic import it2q, qmap

    # (max_in_flight - 2) -- one on each side of queue
    wrk_q = queue.Queue(maxsize=max(1, max_in_flight - 2))  # type: queue.Queue

    # fifo_timeout='0ms' ensures that priority of later tasks is lower
    futures = (client.compute(task, fifo_timeout='0ms') for task in tasks)

    in_thread = threading.Thread(target=it2q, args=(futures, wrk_q))
    in_thread.start()

    yield from qmap(lambda f: f.result(), wrk_q)

    in_thread.join()
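# A minimal usage sketch for compute_tasks above. The delayed workload and the
# local client are illustrative; compute_tasks itself is assumed to be imported
# from the module where it is defined (it relies on the package-local
# it2q/qmap helpers).
import dask
from dask.distributed import Client


def _square(x):
    return x * x


if __name__ == "__main__":
    client = Client(processes=False)
    tasks = (dask.delayed(_square)(i) for i in range(10))
    # Results stream back in submission order, with at most
    # max_in_flight tasks active on the cluster at any time.
    for result in compute_tasks(tasks, client):
        print(result)
    client.close()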
def test_reduce_futures(n_parts, cluster):
    def s(x):
        return x

    client = Client(cluster)
    try:
        a = [client.submit(s, i) for i in range(n_parts)]
        b = reduce(a, sum)
        c = client.compute(b, sync=True)

        # Testing this gets the correct result for now.
        assert (sum(range(n_parts)) == c)
    finally:
        client.close()
def run_dask_compute(h5_main):
    raw_data = h5_main[()]
    # cpu_cores = int(cpu_cores/8)
    # dask_raw_data = da.from_array(raw_data, chunks='auto')
    # cluster = LocalCluster(n_workers=cpu_cores/8)
    # client = Client(cluster, processes=True)
    # map = dask_raw_data.map_blocks(find_all_peaks, [20, 60], num_steps=30)
    # results = map.compute()
    client = Client(processes=False)

    dask_raw_data = client.scatter(raw_data)
    args = [[20, 60]]
    kwargs = {'num_steps': 30}
    L = client.submit(find_all_peaks, dask_raw_data, args, kwargs)
    dask_results = client.compute(L)

    cores = client.ncores()
    client.close()

    return cores
def test_pagerank():
    gc.collect()
    input_data_path = r"datasets/hibench_small/1/part-00000.csv"

    # Networkx Call
    import pandas as pd
    pd_df = pd.read_csv(input_data_path, delimiter='\t', names=['src', 'dst'])
    import networkx as nx
    G = nx.DiGraph()
    for i in range(0, len(pd_df)):
        G.add_edge(pd_df['src'][i], pd_df['dst'][i])
    nx_pr = nx.pagerank(G, alpha=0.85)
    nx_pr = sorted(nx_pr.items(), key=lambda x: x[0])

    # Cugraph snmg pagerank Call
    cluster = LocalCUDACluster(threads_per_worker=1)
    client = Client(cluster)

    import dask_cudf
    import dask_cugraph.pagerank as dcg

    t0 = time.time()
    chunksize = dcg.get_chunksize(input_data_path)
    ddf = dask_cudf.read_csv(input_data_path,
                             chunksize=chunksize,
                             delimiter='\t',
                             names=['src', 'dst'],
                             dtype=['int32', 'int32'])
    y = ddf.to_delayed()
    x = client.compute(y)
    wait(x)
    t1 = time.time()
    print("Reading Csv time: ", t1 - t0)

    pr = dcg.pagerank(x, alpha=0.85, max_iter=50)
    t2 = time.time()
    print("Running PR algo time: ", t2 - t1)
    res_df = pr.compute()

    # Comparison
    err = 0
    tol = 1.0e-05
    for i in range(len(res_df)):
        if (abs(res_df['pagerank'][i] - nx_pr[i][1]) > tol * 1.1):
            err = err + 1
    print("Mismatches:", err)
    assert err < (0.02 * len(res_df))
def main(src_dir):
    logging.getLogger("tifffile").setLevel(logging.ERROR)
    coloredlogs.install(level="DEBUG",
                        fmt="%(asctime)s %(levelname)s %(message)s",
                        datefmt="%H:%M:%S")

    # assume we have tunneled the scheduler to local
    scheduler = "localhost:8786"
    logger.info(f'connecting to scheduler at "{scheduler}"')
    client = Client(scheduler, timeout="300s")  # 5 min
    print(client)

    src_dir = os.path.abspath(src_dir)
    files = glob.glob(os.path.join(src_dir, "*.tif"))
    logger.info(f"{len(files)} tile(s) to convert")

    dname = os.path.basename(src_dir)
    dname = f"{dname}_h5"
    dst_dir = os.path.join(os.path.dirname(src_dir), dname)
    create_dir(dst_dir)

    # write back
    write_back_tasks = []
    for i, src_path in enumerate(files):
        data = delayed(imageio.volread)(src_path)

        fname = f"tile_{i:04d}.h5"
        dst_path = os.path.join(dst_dir, fname)
        future = pack_arrays(dst_path, data)
        write_back_tasks.append(future)

    # submit task
    futures = client.compute(write_back_tasks, scheduler="processes")
    with tqdm(total=len(futures)) as pbar:
        for future in as_completed(futures):
            pbar.update(1)

    logger.info("closing scheduler connection")
    client.close()
def test_handles(cluster):
    client = Client(cluster)

    def _has_handle(sessionId):
        return local_handle(sessionId) is not None

    try:
        cb = Comms(verbose=True)
        cb.init()

        dfs = [
            client.submit(_has_handle, cb.sessionId, pure=False, workers=[w])
            for w in cb.worker_addresses
        ]
        wait(dfs, timeout=5)

        assert all(client.compute(dfs, sync=True))
    finally:
        cb.destroy()
        client.close()
def main(threads=1):
    headers = dict(EDP=[0, 1, 2, 3], DM=[0, 1, 2], DV=[0, 1, 2, 3])

    use_dask = threads > 1

    if use_dask:
        log_msg('{} threads requested. Using DASK.'.format(threads))

        from dask.distributed import Client, LocalCluster
        from dask import delayed
        import math

        @delayed
        def read_csv_files(file_list, header):
            return [
                pd.read_csv(fn, header=header, index_col=0) for fn in file_list
            ]

        def read_csv_np(file, header):
            res = np.loadtxt(file, delimiter=',', dtype=str)
            first_row = header[-1] + 1
            data = res[first_row:].T[1:].T
            data[data == ''] = np.nan

            tuples = [tuple(h) for h in res[:first_row].T[1:]]
            MI = pd.MultiIndex.from_tuples(tuples, names=res[:first_row].T[0])

            df = pd.DataFrame(data,
                              columns=MI,
                              index=res[first_row:].T[0],
                              dtype=float)
            return df

        @delayed
        def read_csv_files_np(file_list, header):
            return [read_csv_np(fn, header=header) for fn in file_list]

        cluster = LocalCluster()
        client = Client(cluster)
        log_msg('Cluster initialized.')
        log_msg(client)

    for res_type in ['EDP', 'DM', 'DV']:
        #for res_type in ['EDP', 'DV']:
        log_msg('Loading {} files...'.format(res_type))

        files = glob.glob('./results/{}/*/{}_*.csv'.format(res_type, res_type))
        #files = files[:1000]

        if use_dask:
            file_count = len(files)
            chunk = math.ceil(file_count / threads)
            df_list = []

            for t_i in range(threads):
                if t_i * chunk < file_count - 1:
                    df_list_i = delayed(read_csv_files)(
                        files[t_i * chunk:(t_i + 1) * chunk], headers[res_type])
                    df_i = delayed(pd.concat)(df_list_i, axis=0, sort=False)
                    df_list.append(df_i)
                elif t_i * chunk == file_count - 1:
                    df_i = delayed(read_csv_files)(
                        files[t_i * chunk:(t_i + 1) * chunk], headers[res_type])
                    df_i = df_i[0]
                    df_list.append(df_i)

            df_all = delayed(pd.concat)(df_list, axis=0, sort=False)
            df_all = client.compute(df_all)
            df_all = client.gather(df_all)
        else:
            log_msg('Loading all files')
            df_list = [
                pd.read_csv(resFileName, header=headers[res_type], index_col=0)
                for resFileName in files
            ]

            log_msg('Concatenating all files')
            df_all = pd.concat(df_list, axis=0, sort=False)

        df_all.sort_index(axis=0, inplace=True)

        # save the results
        log_msg('Saving results')
        df_all.index = df_all.index.astype(np.int32)
        df_all.to_hdf('{}.hd5'.format(res_type),
                      'data',
                      mode='w',
                      format='fixed',
                      complevel=1,
                      complib='blosc:snappy')
        #df_all.to_csv('{}.csv'.format(res_type))

    if use_dask:
        log_msg('Closing cluster...')
        cluster.close()
        client.close()

    # aggregate the realizations files
    log_msg('Aggregating individual realizations...')

    files = glob.glob('./results/{}/*/{}_*.hd5'.format('realizations',
                                                       'realizations'))
    log_msg('Number of files: {}'.format(len(files)))

    # get the keys from the first file
    if len(files) > 0:
        first_file = pd.HDFStore(files[0])
        keys = first_file.keys()
        first_file.close()

        for key in keys:
            log_msg('Processing realizations for key {key}'.format(key=key))
            df_list = [pd.read_hdf(resFileName, key) for resFileName in files]

            log_msg('\t\tConcatenating files')
            df_all = pd.concat(df_list, axis=0, sort=False)

            df_all.index = df_all.index.astype(np.int32)
            df_all.sort_index(axis=0, inplace=True)

            df_all.astype(np.float16).to_hdf('realizations.hd5',
                                             key,
                                             mode='a',
                                             format='fixed',
                                             complevel=1,
                                             complib='blosc:snappy')

            log_msg('\t\tResults saved for {key}.'.format(key=key))

    log_msg('End of script')
        print(("File " + tar_name + " not found in \n" + tar_path))
        sys.exit('File not found.')

    #----make temp-folder for extracting the tar file:------
    temp_folder = tempfile.mkdtemp()
    print(('Temp-folder ' + temp_folder))

    #---uncompress Data:-------------
    print("Now Uncompressing " + tar_name)
    uncompressTGZ(tar_path + tar_name, temp_folder)

    #----calculate cloudiness----------
    # cc,ASCAtime,cloudmask,scale_factor = cloudiness(temp_folder,client)
    compute_me = delayed(cloudiness)(temp_folder)
    computed = client.compute(compute_me)
    cc, ASCAtime, cloudmask, scale_factor = client.gather(computed)

    #-------Remove temp-folder------
    print("Removing temporary folder")
    try:
        shutil.rmtree(temp_folder)
        print("Successfully removed.")
    except:
        print(("Temporary folder {0} could not be found or deleted".format(
            temp_folder)))

    #------write to netCDF----------
    print("Writing results to netCDF")
    ncName = "CloudCoverage_" + ASCAtime[0].strftime("%Y%m%d") + ".nc"
def main(config_path, src_dir):
    logging.getLogger("tifffile").setLevel(logging.ERROR)
    coloredlogs.install(level="DEBUG",
                        fmt="%(asctime)s %(levelname)s %(message)s",
                        datefmt="%H:%M:%S")

    # assume we have tunneled the scheduler to local
    scheduler = "localhost:8786"
    logger.info(f'connecting to scheduler at "{scheduler}"')
    client = Client(scheduler, timeout="300s")  # 5 min
    print(client)

    src_dir = os.path.abspath(src_dir)

    # load dataset
    src_ds = open_dataset(src_dir)
    desc = tuple(f"{k}={v}"
                 for k, v in zip(("x", "y", "z"), reversed(src_ds.tile_shape)))
    logger.info(f"tiling dimension ({', '.join(desc)})")

    # generate tile index list (TODO deal with multi-color/view here)
    def groupby_tiles(inventory, index: List[str]):
        """
        Aggregation function that generates the proper internal list layout
        for all the tiles in their natural N-D layout.

        Args:
            inventory (pd.DataFrame): the listing inventory
            index (list of str): the column header
        """
        tiles = []
        for _, tile in inventory.groupby(index[0]):
            if len(index) > 1:
                # we are not at the fastest dimension yet, decrease 1 level
                tiles.extend(groupby_tiles(tile, index[1:]))
            else:
                # fastest dimension, call retrieval function
                tiles.append(src_ds[tile])
        return tiles

    index = ["tile_y", "tile_x"]
    if "tile_z" in src_ds.index.names:
        index = ["tile_z"] + index
    logger.info(f"a {len(index)}-D tiled dataset")

    tiles = groupby_tiles(src_ds, index)
    logger.info(f"{len(tiles)} to process")

    # downsample
    tiles_bin4 = [tile[:, ::4, ::4] for tile in tiles]

    dname = os.path.basename(src_dir)
    dname = f"{dname}_bin4"
    dst_dir = os.path.join(os.path.dirname(src_dir), dname)
    create_dir(dst_dir)

    # write back
    write_back_tasks = []
    for i, tile in enumerate(tiles_bin4):
        fname = f"tile_{i:04d}.tif"
        path = os.path.join(dst_dir, fname)
        future = delayed(imageio.volwrite)(path, tile)
        write_back_tasks.append(future)

    futures = client.compute(write_back_tasks, scheduler="processes")
    with tqdm(total=len(futures)) as pbar:
        for future in as_completed(futures):
            print(future.result())
            pbar.update(1)

    logger.info("closing scheduler connection")
    client.close()
class DaskManager(metaclass=Singleton):
    def __init__(self):
        if settings.DASK_SCHEDULER_HOST is None:
            self.client = Client(preload='daskworkerinit.py',
                                 n_workers=2,
                                 threads_per_worker=1,
                                 memory_limit='4GB')
        else:
            self.client = Client(settings.DASK_SCHEDULER_HOST + ':' +
                                 settings.DASK_SCHEDULER_PORT)

    def compute(self, graph):
        future = self.client.compute(graph)
        future.add_done_callback(self.task_complete)
        dask_task = DaskTask.objects.create(task_key=future.key)
        return dask_task

    def get_future_status(self, task_key):
        return Future(key=task_key, client=self.client).status

    @staticmethod
    def task_complete(future):
        task = DaskTask.objects.get(pk=future.key)
        if future.status == 'finished':
            task.status = future.status
            task.result = future.result()
            task.end_time = timezone.now()
            task.duration_ms = int(
                (task.end_time - task.start_time).total_seconds() * 1000)
            task.save()
        elif future.status == 'error':
            task.status = future.status
            task.result = traceback.extract_tb(
                future.traceback()) + [future.exception()]
            task.end_time = timezone.now()
            task.duration_ms = int(
                (task.end_time - task.start_time).total_seconds() * 1000)
            task.save()
        else:
            logger.error(
                f'Task completed with unhandled status: {future.status}')

        # send email that a task has finished, could be much more complex, just keeping it simple
        if future.status == 'finished':
            # The format requested, I would do a more generic message for all tasks by just passing task id and
            # a preview of the result, but if this is just a micro service for this particular task then it can work
            result = future.result()
            formatted_time = task.end_time.strftime("%d/%m/%Y %H:%M")
            message = f'{formatted_time} - {result["lines"]} entries processed, sum: {result["sum"]}'
        elif future.status == 'error':
            message = f'Task id {task.task_key} failed.'
        else:
            message = f'Task id {task.task_key} completed with unknown status: {future.status}'

        send_mail(
            'Task Completed',
            message,
            settings.DEFAULT_FROM_EMAIL,
            [settings.TASK_INFO_EMAIL],
            fail_silently=False,
        )
def tsmask_one_iteration(ncpu, mem, block, crs, out_crs, start_of_epoch,
                         end_of_epoch, dirc, loc_str):

    [y1, y2, x1, x2] = block

    # Datacube object
    dc = datacube.Datacube(app='load_clearsentinel')

    tg_ds = tsf.load_s2_nbart_dask(dc, y1, y2, x1, x2, start_of_epoch,
                                   end_of_epoch, {
                                       "time": 1,
                                   }, crs, out_crs)

    memstr = str(mem) + 'GB'

    client = Client(n_workers=ncpu, threads_per_worker=2, memory_limit=memstr)

    client.compute(tg_ds)

    client.close()

    irow = tg_ds['y'].size
    icol = tg_ds['x'].size
    tn = tg_ds['time'].size

    print(tn, irow, icol)

    # Create numpy array to store TSmask results
    tsmask = np.zeros((tn, irow, icol), dtype=np.uint8)

    print("Time series cloud and shadow detection for area (", y1, y2, x1, x2, ")")

    # Run time series cloud mask algorithm on the data
    tsmask = tsmask_filter_onearea(tg_ds, ncpu, tsmask)

    print("Begin applying spatial filter")

    results = []

    # number of processes for the pool object
    number_of_workers = ncpu

    # Create a Pool object with a number of processes
    p = Pool(number_of_workers)

    # create a list of scenes
    paralist = [tsmask[i, :, :] for i in range(tn)]

    # Start running the spatial filter function using a pool of independent processes
    results = p.map(cym.spatial_filter_v2, paralist)

    # Finish the parallel runs
    p.close()

    # Join the results and put them back in the correct order
    p.join()

    # Save the cloud/shadow masks to the 'tsmask' dataarray in the s2_ds dataset
    for i in range(tn):
        tsmask[i, :, :] = results[i]

    print("Begin calculating long term of the indice set")

    bgids = bg_indices_one_iteration(ncpu, tg_ds, dirc, loc_str, tsmask,
                                     start_of_epoch, end_of_epoch)

    print(bgids.shape)

    # print("Begin creating input features for Nmask ANN model")

    # create_ip_data(tg_ds, bgids, loc_str, dirc)

    tg_ds.close()
    k: bool(int(v))
    for k, v in read_pairs_list('data/modifiers.txt')
}
modifiers_dict[None] = True

candidates = make_candidates(matched, modifiers_dict.keys())
print('Number of candidates : {}'.format(len(candidates)))

if True:
    # Dask processing
    cluster = LocalCluster(n_workers=48)
    client = Client(cluster)
    b = db.from_sequence(failed, partition_size=200)
    [c] = client.scatter(
        [candidates],
        broadcast=True)  # Broadcast the list of candidates to the workers
    r = b.map(_fn, c)
    f = client.compute(r)
    progress(f)
    matching_results = f.result()
else:
    # Multiprocessing
    matching_results = []
    with Pool(40) as p:
        for simple_result in tqdm(p.imap(_fn, failed, chunksize=300),
                                  total=len(failed)):
            matching_results.append(simple_result)

matching_results = sorted(matching_results,
                          key=lambda x: x[1][0][0],
                          reverse=True)

# normalized_str -> (normalized_name, ({'id':_,'name':_}, modifier_str) )
matching_dict = {
def run(self, current_date: datetime, dry_run: bool = False) -> None:
    """
    Run analysis using mozanalysis for a specific experiment.
    """
    global _dask_cluster
    logger.info("Analysis.run invoked for experiment %s",
                self.config.experiment.normandy_slug)

    self.check_runnable(current_date)
    assert self.config.experiment.start_date is not None  # for mypy

    self.ensure_enrollments(current_date)

    # set up dask
    _dask_cluster = _dask_cluster or LocalCluster(
        dashboard_address=DASK_DASHBOARD_ADDRESS,
        processes=True,
        threads_per_worker=1,
        n_workers=DASK_N_PROCESSES,
    )
    client = Client(_dask_cluster)

    results = []

    if self.log_config:
        log_plugin = LogPlugin(self.log_config)
        client.register_worker_plugin(log_plugin)

        # add profiling plugins
        # resource_profiling_plugin = ResourceProfilingPlugin(
        #     scheduler=_dask_cluster.scheduler,
        #     project_id=self.log_config.log_project_id,
        #     dataset_id=self.log_config.log_dataset_id,
        #     table_id=self.log_config.task_profiling_log_table_id,
        #     experiment=self.config.experiment.normandy_slug,
        # )
        # _dask_cluster.scheduler.add_plugin(resource_profiling_plugin)

        # task_monitoring_plugin = TaskMonitoringPlugin(
        #     scheduler=_dask_cluster.scheduler,
        #     project_id=self.log_config.log_project_id,
        #     dataset_id=self.log_config.log_dataset_id,
        #     table_id=self.log_config.task_monitoring_log_table_id,
        #     experiment=self.config.experiment.normandy_slug,
        # )
        # _dask_cluster.scheduler.add_plugin(task_monitoring_plugin)

    table_to_dataframe = dask.delayed(self.bigquery.table_to_dataframe)

    for period in self.config.metrics:
        segment_results = []
        time_limits = self._get_timelimits_if_ready(period, current_date)

        if time_limits is None:
            logger.info(
                "Skipping %s (%s); not ready",
                self.config.experiment.normandy_slug,
                period.value,
            )
            continue

        exp = mozanalysis.experiment.Experiment(
            experiment_slug=self.config.experiment.normandy_slug,
            start_date=self.config.experiment.start_date.strftime("%Y-%m-%d"),
            app_id=self._app_id_to_bigquery_dataset(self.config.experiment.app_id),
        )

        analysis_bases = []
        for m in self.config.metrics[period]:
            for analysis_basis in m.metric.analysis_bases:
                analysis_bases.append(analysis_basis)

        analysis_bases = list(set(analysis_bases))

        if len(analysis_bases) == 0:
            continue

        for analysis_basis in analysis_bases:
            metrics_table = self.calculate_metrics(
                exp, time_limits, period, analysis_basis, dry_run
            )

            if dry_run:
                results.append(metrics_table)
            else:
                metrics_dataframe = table_to_dataframe(metrics_table)

            if dry_run:
                logger.info(
                    "Not calculating statistics %s (%s); dry run",
                    self.config.experiment.normandy_slug,
                    period.value,
                )
                continue

            segment_labels = ["all"] + [s.name for s in self.config.experiment.segments]
            for segment in segment_labels:
                segment_data = self.subset_to_segment(segment, metrics_dataframe)
                for m in self.config.metrics[period]:
                    segment_results += self.calculate_statistics(
                        m,
                        segment_data,
                        segment,
                        analysis_basis,
                    ).to_dict()["data"]

                segment_results += self.counts(segment_data, segment, analysis_basis).to_dict()[
                    "data"
                ]

        results.append(
            self.save_statistics(
                period,
                segment_results,
                self._table_name(period.value, len(time_limits.analysis_windows)),
            )
        )

    result_futures = client.compute(results)
    client.gather(result_futures)  # block until futures have finished
def beta_parallel_disk_detection(dataset,
                                 probe,
                                 #rxmin=None,  # these would allow selecting a sub section
                                 #rxmax=None,
                                 #rymin=None,
                                 #rymax=None,
                                 #qxmin=None,
                                 #qxmax=None,
                                 #qymin=None,
                                 #qymax=None,
                                 probe_type="FT",
                                 dask_client=None,
                                 dask_client_params: dict = None,
                                 restart_dask_client=True,
                                 close_dask_client=False,
                                 return_dask_client=True,
                                 *args,
                                 **kwargs):
    """
    This is not fully validated currently, so it may not work; please report bugs on the py4DSTEM github page.

    This parallelises the disk detection for all probe positions. It can operate on either in-memory or out-of-memory datasets.

    There is an assumption that, unless specified otherwise, you are parallelising on a single local machine.
    If this is not the case, it is probably best to pass the dask_client into the function, although you can also just pass the required arguments to dask_client_params.
    If no dask_client arguments are passed, it will create a dask_client for a local machine.

    Note:
        Do not pass the "peaks" argument as a kwarg, like you might in "_find_Bragg_disks_single_DP_FK", as the results will be unreliable and may cause the calculation to crash.

    Args:
        dataset (py4dSTEM datacube): 4DSTEM dataset
        probe (ndarray): can be regular probe kernel or fourier transformed
        probe_type (str): "FT" or None
        dask_client (distributed.client.Client): dask client
        dask_client_params (dict): parameters to pass to dask client or dask cluster
        restart_dask_client (bool): if True, function will attempt to restart the dask_client.
        close_dask_client (bool): if True, function will attempt to close the dask_client.
        return_dask_client (bool): if True, function will return the dask_client.
        *args, **kwargs: will be passed to "_find_Bragg_disks_single_DP_FK" e.g. corrPower, sigma, edgeBoundary...

    Returns:
        peaks (PointListArray): the Bragg peak positions and the correlation intensities
        dask_client (optional) (distributed.client.Client): dask_client for use later.
    """
    #TODO add asserts about peaks not being passed
    # Dask Client stuff
    #TODO how to guess at default params for client, sqrt no. cores. Something to do with the size of the diffraction pattern
    # write a function which can do this.
    #TODO replace dask part with a with statement for easier clean up e.g.
    # with LocalCluster(params) as cluster, Client(cluster) as client:
    #     ... dask stuff.
    #TODO add assert statements and other checks. Think about reordering operations

    if dask_client == None:
        if dask_client_params != None:
            dask.config.set({
                'distributed.worker.memory.spill': False,
                'distributed.worker.memory.target': False
            })
            cluster = LocalCluster(**dask_client_params)
            dask_client = Client(cluster, **dask_client_params)
        else:
            # AUTO MAGICALLY SET?
            # LET DASK SET?
            # HAVE A FUNCTION WHICH RUNS ON A SUBSET OF THE DATA TO PICK OPTIMAL VALUE?
            # psutil could be used to count cores.
            dask.config.set({
                'distributed.worker.memory.spill': False,   # stops spilling to disk
                'distributed.worker.memory.target': False,  # stops spilling to disk and erroring out
            })
            cluster = LocalCluster()
            dask_client = Client(cluster)
    else:
        assert type(dask_client) == distributed.client.Client
        if restart_dask_client:
            try:
                dask_client.restart()
            except Exception as e:
                print('Could not restart dask client.\nTry manually restarting outside or passing "restart_dask_client=False"')  # WARNING STATEMENT
                return e
        else:
            pass

    # Probe stuff
    assert (probe.shape == dataset.data.shape[2:]), "Probe and Diffraction Pattern Shapes are Mismatched"
    if probe_type != "FT":
        #TODO clean up and pull out redundant parts
        #if probe.dtype != (np.complex128 or np.complex64 or np.complex256):
        #DO FFT SHIFT THING
        probe_kernel_FT = np.conj(np.fft.fft2(probe))
        dask_probe_array = da.from_array(probe_kernel_FT,
                                         chunks=(dataset.Q_Nx, dataset.Q_Ny))
        dask_probe_delayed = dask_probe_array.to_delayed()
        # delayed_probe_kernel_FT = delayed(probe_kernel_FT)
    else:
        probe_kernel_FT = probe
        dask_probe_array = da.from_array(probe_kernel_FT,
                                         chunks=(dataset.Q_Nx, dataset.Q_Ny))
        dask_probe_delayed = dask_probe_array.to_delayed()

    # GET DATA
    #TODO add another elif: if it is a dask array then pass
    if type(dataset.data) == np.ndarray:
        dask_data = da.from_array(dataset.data,
                                  chunks=(1, 1, dataset.Q_Nx, dataset.Q_Ny))
    elif dataset.stack_pointer != None:
        dask_data = da.from_array(dataset.stack_pointer,
                                  chunks=(1, 1, dataset.Q_Nx, dataset.Q_Ny))
    else:
        print("Couldn't access the data")
        return None

    # Convert the data to delayed
    dataset_delayed = dask_data.to_delayed()
    # TODO Trim data e.g. rx, ry, qx, qy
    # I can pass the index values in here; I should trim the probe and diffraction pattern first

    # Into the meat of the function

    # create an empty list to which we will append the delayed functions
    res = []
    # loop over dataset_delayed and create a delayed function for each chunk
    for x in np.ndindex(dataset_delayed.shape):
        temp = delayed(_find_Bragg_disks_single_DP_FK_dask_wrapper)(
            dataset_delayed[x],
            probe_kernel_FT=dask_probe_delayed[0, 0],
            #probe_kernel_FT=delayed_probe_kernel_FT,
            *args,
            **kwargs)  # passing through args from earlier, or should I use
            #corrPower=corrPower,
            #sigma=sigma_gaussianFilter,
            #edgeBoundary=edgeBoundary,
            #minRelativeIntensity=minRelativeIntensity,
            #minPeakSpacing=minPeakSpacing,
            #maxNumPeaks=maxNumPeaks,
            #subpixel='poly')
        res.append(temp)

    _temp_peaks = dask_client.compute(res, optimize_graph=True)  # creates futures and starts computing

    output = dask_client.gather(_temp_peaks)  # gather the future objects

    coords = [('qx', float), ('qy', float), ('intensity', float)]
    peaks = PointListArray(coordinates=coords, shape=dataset.data.shape[:-2])

    # temp_peaks[0][0]

    # operating over a list, so we need the size (0->count) and re-create the probe positions (0->rx, 0->ry)
    for (count, (rx, ry)) in zip([i for i in range(dataset.data[..., 0, 0].size)],
                                 np.ndindex(dataset.data.shape[:-2])):
        #peaks.get_pointlist(rx, ry).add_pointlist(temp_peaks[0][count])
        #peaks.get_pointlist(rx, ry).add_pointlist(output[count][0])
        peaks.get_pointlist(rx, ry).add_pointlist(output[count])

    # Clean up
    dask_client.cancel(_temp_peaks)  # removes from the dask workers
    del _temp_peaks  # deletes the object
    if close_dask_client:
        dask_client.close()
        return peaks
    elif close_dask_client == False and return_dask_client == True:
        return peaks, dask_client
    elif close_dask_client and return_dask_client == False:
        return peaks
    else:
        print('Dask Client in unknown state, this may result in unpredictable behaviour later')
        return peaks
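# A small, self-contained sketch of the "with statement" clean-up pattern that
# the TODO in beta_parallel_disk_detection mentions: the cluster and client are
# torn down automatically when the block exits. The tiny delayed workload here
# is illustrative only.
import dask
from dask.distributed import Client, LocalCluster


def _double(v):
    return 2 * v


if __name__ == "__main__":
    with LocalCluster(n_workers=2, threads_per_worker=1) as cluster, \
            Client(cluster) as client:
        tasks = [dask.delayed(_double)(i) for i in range(8)]
        futures = client.compute(tasks)  # futures start computing immediately
        print(client.gather(futures))    # [0, 2, 4, 6, 8, 10, 12, 14]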
def add(x, y):
    time.sleep(7)
    return x + y


x = delayed(inc)(1)
y = delayed(dec)(2)
total = delayed(add)(x, y)


# In[6]:


# notice the difference from total.compute()
# notice that this cell completes immediately
fut = c.compute(total)


# In[7]:


c.gather(fut)


# Critically, each future represents a result held, or being evaluated, by the cluster. Thus we can control caching of
# intermediate values: when a future is no longer referenced, its value is forgotten. For example, although we can
# explicitly pass data into the cluster using `scatter()`, we normally want to cause the workers to load as much of
# their own data as possible to avoid excessive communication overhead.
#
# The [full API](http://distributed.readthedocs.io/en/latest/api.html) of the distributed scheduler gives details of
# interacting with the cluster, which, remember, can be on your local machine or possibly on a massive computational
# resource.

# The futures API offers a work-submission style that can easily emulate the map/reduce paradigm (see `c.map()`) that
# may be familiar to many people. The intermediate results, represented by futures, can be passed to new tasks without
# having to pull them locally from the cluster, and new work can be assigned to operate on the output of previous jobs
# that haven't even begun yet.
#
# Generally, any Dask operation that is executed using `.compute()` can be submitted for asynchronous execution using
# `c.compute()` instead, and this applies to all collections. Here is an example with the calculation previously seen
# in the Bag chapter. We have replaced the `.compute()` method there with the distributed client version, so, again,
# we could continue to submit more work (perhaps based on the result of the calculation), or, in the next cell, follow
# the progress of the computation. A similar progress bar appears in the monitoring UI page.

# In[8]: