import os
import time
from random import shuffle

import dask
import dask.array as da
import h5py
import numpy as np
from dask.diagnostics import CacheProfiler, Profiler, ResourceProfiler, visualize

# Project-local helpers (get_dask_array_from_hdf5, disable_clustering,
# clean_files, create_tests, run_test, paths, args, etc.) are assumed to be
# imported or defined elsewhere in this script.


def verify_results_split(R, I, input_array_path, datadir):
    from dask_io.optimizer.cases.resplit_utils import get_blocks_shape

    splitfiles_partition = get_blocks_shape(R, I)
    print("split files partition:", splitfiles_partition)
    all_true = True
    orig_arr = get_dask_array_from_hdf5(input_array_path, "/data", logic_cs=tuple(I))
    for i in range(splitfiles_partition[0]):
        for j in range(splitfiles_partition[1]):
            for k in range(splitfiles_partition[2]):
                splitfilename = f"{i}_{j}_{k}.hdf5"
                split_filepath = os.path.join(datadir, splitfilename)
                print("opening", split_filepath)
                splitarray = get_dask_array_from_hdf5(split_filepath, "/data")
                print(
                    f"Slices from ground truth {i*I[0]}:{(i+1)*I[0]}, "
                    f"{j*I[1]}:{(j+1)*I[1]}, {k*I[2]}:{(k+1)*I[2]}"
                )
                ground_truth_arr = orig_arr[i * I[0]:(i + 1) * I[0],
                                            j * I[1]:(j + 1) * I[1],
                                            k * I[2]:(k + 1) * I[2]]
                verify_task = da.allclose(ground_truth_arr, splitarray)
                print("VERIFY TASK: ", verify_task)
                disable_clustering()
                _res = verify_task.compute()
                print("RESULT: ", _res)
                if not _res:
                    print(f"[Error] Split failed for {splitfilename}")
                    all_true = False
    clean_files()
    return all_true
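# Hedged sketch (not part of the original script): demonstrates the lazy
# `da.allclose` verification pattern used in verify_results_split. The helper
# name `_demo_lazy_allclose` and the array/chunk shapes are illustrative
# assumptions, not project API.
def _demo_lazy_allclose():
    a = da.from_array(np.arange(8, dtype=np.float32).reshape(2, 2, 2),
                      chunks=(1, 2, 2))
    b = a.rechunk((2, 1, 2))  # same values, different chunking
    verify_task = da.allclose(a, b)  # lazy: builds a task graph, reads nothing yet
    return bool(verify_task.compute())  # execution happens only here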
def clean(self):
    name = self.case['name']
    if name:
        if name == 'merge_hdf5_multiple':
            try:
                f = self.case['params']['out_file']
                if f:
                    f.close()
                clean_files()
            except Exception:  # best-effort cleanup: never fail while cleaning
                pass
def verify_results_merge(input_array_path, merged_array_path):
    original_array = get_dask_array_from_hdf5(input_array_path, "/data")
    merged_array = get_dask_array_from_hdf5(merged_array_path, "/data")
    verify_task = da.allclose(original_array, merged_array)
    print("VERIFY TASK: ", verify_task)
    disable_clustering()
    _res = verify_task.compute()
    print("RESULT: ", _res)
    if not _res:
        print("[Error] Rechunk failed")
    clean_files()
    return _res
def experiment1():
    """
    Split multidimensional arrays using vanilla dask and the clustered
    strategy implemented in dask_io.
    """
    tests = create_tests() * args.nb_repetitions
    shuffle(tests)
    results = list()
    for i, test in enumerate(tests):
        print(f'\n\nProcessing test {i + 1}/{len(tests)} ~')
        print('Creating test array if needed...')
        create_test_array(test, create_random_dask_array, save_to_hdf5)
        clean_files()
        print('Done. Running test...')
        result = run_test(test, paths)
        results.append(result)
    write_csv(results, paths["outdir"], create_csv_file)
def clean(self):
    name = self.case['name']
    if name:
        if name in ('split_hdf5', 'split_npy'):
            try:
                f = self.case['params']['out_file']
                if f:
                    f.close()
                clean_files()
            except Exception:  # best-effort cleanup: never fail while cleaning
                pass
        elif name == 'split_hdf5_multiple':
            for f in self.case['params']['out_files']:
                try:
                    if f:
                        f.close()
                    clean_files()
                except Exception:  # keep closing the remaining files
                    pass
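# Hedged sketch (illustrative, not project API): the `clean` methods above
# silence every error while closing files. A narrower variant suppresses only
# the expected failure modes; `_close_quietly` is a hypothetical helper name.
def _close_quietly(f):
    import contextlib
    with contextlib.suppress(AttributeError, OSError):
        if f:
            f.close()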
def rechunk_vanilla_dask(indir_path, outdir_path, nthreads, R, O, model):
    """
    Rechunk the input blocks into output blocks of shape O using vanilla dask.
    """
    in_arrays = load_input_files(indir_path)  # open the input HDF5 files

    case = Merge('samplename')
    case.merge_hdf5_multiple(indir_path, store=False)
    reconstructed_array = case.get()

    out_files = list()  # keep output files open during processing
    sources = list()
    targets = list()
    outfiles_partition = get_blocks_shape(R, O)
    for i in range(outfiles_partition[0]):
        for j in range(outfiles_partition[1]):
            for k in range(outfiles_partition[2]):
                out_filename = f'{i}_{j}_{k}.hdf5'
                out_file = h5py.File(os.path.join(outdir_path, out_filename), 'w')
                dset = out_file.create_dataset('/data', shape=O, dtype=np.float16)
                tmp_array = reconstructed_array[i * O[0]:(i + 1) * O[0],
                                                j * O[1]:(j + 1) * O[1],
                                                k * O[2]:(k + 1) * O[2]]
                print(f'{i*O[0]}: {(i+1)*O[0]}, {j*O[1]}: {(j+1)*O[1]}, '
                      f'{k*O[2]}: {(k+1)*O[2]}')
                out_files.append(out_file)
                sources.append(tmp_array)
                targets.append(dset)

    rechunk_task = da.store(sources, targets, compute=False)

    with Profiler() as prof, ResourceProfiler(dt=0.25) as rprof, \
            CacheProfiler() as cprof:
        scheduler = 'single-threaded' if nthreads == 1 else 'threads'
        with dask.config.set(scheduler=scheduler):
            try:
                t = time.time()
                rechunk_task.compute()
                t = time.time() - t
            except Exception as e:
                print(e, "\nSomething went wrong during graph execution.")
                t = None

    diagnostics = os.path.join(outdir_path, 'exp5_' + str(model) + '.html')
    visualize([prof, rprof, cprof], diagnostics, show=False)
    clean_files()

    for f in out_files:
        f.close()
    return t
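# Hedged sketch (illustrative, not the original experiment): the core of
# rechunk_vanilla_dask is `da.store(sources, targets, compute=False)`, which
# returns one lazy task that writes every dask slice into its matching HDF5
# dataset. The file name, shape, and chunking below are demo assumptions.
def _demo_store_to_hdf5(path="demo_out.hdf5"):
    arr = da.ones((4, 4, 4), chunks=(2, 2, 2), dtype=np.float16)
    with h5py.File(path, "w") as f:
        dset = f.create_dataset('/data', shape=arr.shape, dtype=arr.dtype)
        store_task = da.store([arr], [dset], compute=False)  # lazy write plan
        store_task.compute()  # data hits disk here, while the file is open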