def test_create_buffers_slabs():
    """ Test that the buffering follows the clustered-write strategy when
    processing slabs. The only strategy that should be used is "block slices".
    """
    cs = (5, 100, 100)  # 20 chunks
    case = Split(pytest.test_array_path, cs)
    case.split_hdf5("./split_file.hdf5", nb_blocks=None)
    arr = case.get()
    _, dicts = get_used_proxies(arr.dask.dicts)
    origarr_name = list(dicts['origarr_to_obj'].keys())[0]

    nb_bytes_per_block = 100 * 100 * 5
    byte_size = 2
    l1 = [[i] for i in range(20)]
    l2 = [list(range(10)), list(range(10, 20))]
    l3 = [list(range(7)), list(range(7, 14)), list(range(14, 20))]
    experiment_params = {
        nb_bytes_per_block * byte_size: l1,       # 1 slab per buffer
        nb_bytes_per_block * byte_size * 10: l2,  # 10 slabs per buffer
        nb_bytes_per_block * byte_size * 7: l3,   # 7 slabs per buffer
    }

    for buffer_size, expected in experiment_params.items():
        logging.info("\nTesting buffer %s", buffer_size)
        logging.debug("Expecting %s", expected)
        enable_clustering(buffer_size, mem_limit=True)
        buffers = create_buffers(origarr_name, dicts, cs)
        logging.debug("Got %s", buffers)
        assert buffers == expected
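
# For intuition, a minimal standalone sketch (no project imports; an assumption
# about the intended grouping rule, not the project's actual implementation):
# with (5, 100, 100) chunks over a 100^3 array there are 20 slabs of
# 100 * 100 * 5 voxels at 2 bytes each, so a buffer of k slab-sizes should hold
# k consecutive slabs, which is exactly what the expected lists above encode.
def _group_slabs_sketch(nb_slabs, slab_bytes, buffer_bytes):
    per_buffer = max(1, buffer_bytes // slab_bytes)
    return [list(range(i, min(i + per_buffer, nb_slabs)))
            for i in range(0, nb_slabs, per_buffer)]


# e.g. a 7-slab buffer reproduces l3 above:
# _group_slabs_sketch(20, 100 * 100 * 5 * 2, 100 * 100 * 5 * 2 * 7)
# == [list(range(7)), list(range(7, 14)), list(range(14, 20))]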
def test_sum(shape_to_test, nb_chunks):
    """ Test that the sum of two blocks yields the correct result when using
    our optimization function.
    """
    logger.info("testing shape %s", shape_to_test)

    # prepare test case
    case = Split(pytest.test_array_path, shape_to_test)
    case.sum(nb_chunks)

    # non optimized run
    disable_clustering()
    result_non_opti = case.get().compute()

    # optimized run
    enable_clustering(buffer_size)
    result_opti = case.get().compute()

    assert np.array_equal(result_non_opti, result_opti)
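
# Analogous sanity pattern in plain dask (a standalone sketch, independent of
# the Split helper): the same graph computed two different ways must agree,
# just as the optimized and non-optimized runs must agree above.
def _sum_equivalence_sketch():
    import dask.array as da
    x = da.ones((10, 10, 10), chunks=(5, 10, 10)) + 1
    a = x.compute(scheduler='single-threaded')
    b = x.compute(scheduler='threads')
    assert np.array_equal(a, b)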
def run_test(test, paths):
    """ Wrapper around the 'run' function for diagnostics.

    Arguments:
    ----------
        test: test object exposing 'params', 'splitcase' and 'mergecase'.
        paths: paths configuration (not used in this function).
    """
    test.print_config()
    uid = uuid.uuid4()
    print("Test ID is ", str(uid))

    params = getattr(test, 'params')
    splitcase = getattr(test, 'splitcase')
    mergecase = getattr(test, 'mergecase')

    if params["optimized"]:
        enable_clustering(params["buffer_size"])
    else:
        disable_clustering()

    flush_cache()
    try:
        arr = splitcase.get()
        tsplit, diagnostics_split, monitor_split = run_to_hdf5(
            arr, params, uid, str(params["chunk_shape"]), params["optimized"])
    except Exception as e:
        print(e)
        return [
            params["hardware"], params["cuboid_name"], params["array_shape"],
            params["chunk_type"], params["chunk_shape"], params["optimized"],
            params["buffer_size"], params["nthreads"],
            None, None, None, None, None, None, None, None
        ]
    finally:
        splitcase.clean()

    R = cuboids[params["cuboid_name"]]['shape']
    I = splitcase.chunks_shape
    print(f'R: {R}')
    print(f'I: {I}')
    if 'auto' not in I:
        success_run_split = verify_results_split(
            R, I, getattr(test, 'cuboid_filepath'), getattr(test, 'hardware_path'))
    else:
        success_run_split = None
    print(f'[Split] Find the diagnostics output file at {diagnostics_split}')
    print(f'[Split] Find the monitor output file at {monitor_split}')

    flush_cache()
    try:
        arr = mergecase.get()
        tmerge, diagnostics_merge, monitor_merge = run_to_hdf5(
            arr, params, uid, str(params["chunk_shape"]), params["optimized"])
    except Exception as e:
        print(e)
        return [
            params["hardware"], params["cuboid_name"], params["array_shape"],
            params["chunk_type"], params["chunk_shape"], params["optimized"],
            params["buffer_size"], params["nthreads"],
            round(tsplit, 4), None, None, None, None, None, None, None
        ]
    finally:
        mergecase.clean()

    success_run_merge = verify_results_merge(
        getattr(test, 'cuboid_filepath'), getattr(test, 'merge_filepath'))
    print(f'[Merge] Find the diagnostics output file at {diagnostics_merge}')
    print(f'[Merge] Find the monitor output file at {monitor_merge}')

    datadir = getattr(test, 'hardware_path')
    merged_filepath = getattr(test, 'merge_filepath')
    clean_directory(datadir)
    os.remove(merged_filepath)

    sample_res = [
        params["hardware"], params["cuboid_name"], params["array_shape"],
        params["chunk_type"], params["chunk_shape"], params["optimized"],
        params["buffer_size"], params["nthreads"],
        round(tsplit, 4), round(tmerge, 4),
        diagnostics_split, diagnostics_merge,
        monitor_split, monitor_merge,
        success_run_split, success_run_merge
    ]
    print("-------------RESULT\n", sample_res)
    return sample_res
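
# The 16-element rows returned by run_test can be collected into a CSV; a
# sketch under the assumption that the column names simply mirror the row
# layout above (the project's real reporting code may differ).
RESULT_COLUMNS = [
    "hardware", "cuboid_name", "array_shape", "chunk_type", "chunk_shape",
    "optimized", "buffer_size", "nthreads", "tsplit", "tmerge",
    "diagnostics_split", "diagnostics_merge", "monitor_split",
    "monitor_merge", "success_run_split", "success_run_merge",
]


def write_results_sketch(rows, out_path):
    import csv
    with open(out_path, 'w', newline='') as f:
        writer = csv.writer(f)
        writer.writerow(RESULT_COLUMNS)
        writer.writerows(rows)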
create_array(input_filepath, input_array_shape)

# split
times = list()
for buffer in buffers_to_test:
    print("RUNNING BUFFER ", buffer)

    with h5py.File(input_filepath, 'r') as f_in:  # open original array
        dset = f_in['/data']
        in_arr = da.from_array(dset, chunks=split_cs)

        with h5py.File(output_filepath, 'x') as f_out:  # open split array
            # run optimized
            split_arr = split_to_hdf5(in_arr, f_out, nb_blocks=None)
            print("RUNNING OPTIMIZED")
            enable_clustering(buffer)
            flush_cache()
            with Profiler() as prof, ResourceProfiler() as rprof, \
                    CacheProfiler(metric=nbytes) as cprof:
                with dask.config.set(scheduler='single-threaded'):
                    t = time.time()
                    _ = split_arr.compute()
                    t = time.time() - t
                    times.append([buffer, t, "optimized"])
            visualize([prof, rprof, cprof],
                      os.path.join(output_directory, str(buffer) + "opti" + ".html"),
                      show=False)

        os.remove(output_filepath)  # remove output file for next run

        with h5py.File(output_filepath, 'x') as f_out:  # open split array
def create_test_array():
    if not pytest.test_array_path:
        create_test_array_nochunk(path, (100, 100, 100))
        pytest.test_array_path = path
    enable_clustering(buffer_size, mem_limit=True)
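
# For reference, a minimal sketch of what a helper like
# create_test_array_nochunk might do, assuming it writes random data to
# '/data' without HDF5 chunking (the real helper lives elsewhere in the suite):
def _create_test_array_nochunk_sketch(path, shape):
    import h5py
    import numpy as np
    with h5py.File(path, 'w') as f:
        f.create_dataset('/data', data=np.random.random(shape))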
def test_split(optimized, nb_chunks, shape_to_test):
    def create_arrays_for_comparison():
        """ Load the chunks as dask arrays to compare them against the split files.
        """
        arr = get_dask_array_from_hdf5(pytest.test_array_path,
                                       '/data',
                                       logic_cs=shape_to_test)
        arr_list = get_arr_chunks(arr, nb_chunks=nb_chunks)
        return arr_list

    def apply_sanity_check(split_filepath):
        """ Check that the split file is not empty.
        """
        logger.info("Checking split file integrity...")
        with h5py.File(split_filepath, 'r') as f:
            keys_list = list(f.keys())
            logger.info("file : %s", f)
            logger.info("Number of datasets in hdf5 file : %s", len(keys_list))
            logger.info("First item: %s", keys_list[0])
            assert len(keys_list) != 0
        logger.info("Integrity check passed.\n")

    def store_correct():
        """ Compare the original chunks to the split datasets to verify the split.
        """
        logger.info("Testing %s matches...", len(arr_list))
        with h5py.File(split_filepath, 'r') as f:
            for i, a in enumerate(arr_list):
                stored_a = da.from_array(f['/data' + str(i)])
                # logger.info("split shape: %s", stored_a.shape)
                # rechunk returns a new array; it must be reassigned
                stored_a = stored_a.rechunk(chunks=shape_to_test)
                # logger.info("split rechunked to: %s", stored_a.shape)
                # logger.info("will be compared to: %s", a.shape)
                # logger.info("Testing allclose...")
                test = da.allclose(stored_a, a)
                disable_clustering()  # TODO: remove this, make it work even for allclose
                assert test.compute()
        logger.info("Passed.\n")

    def split():
        # overwrite if split file already exists
        if os.path.isfile(split_filepath):
            os.remove(split_filepath)
        case = Split(pytest.test_array_path, shape_to_test)
        case.split_hdf5(split_filepath, nb_blocks=nb_chunks)
        case.get().compute()
        return

    logger.info("PARAMETERS:")
    logger.info("Optimized: %s", optimized)
    logger.info("Nb_chunk: %s", nb_chunks)
    logger.info("Shape: %s \n", shape_to_test)

    # setup config
    split_filepath = "./split_file.hdf5"
    if optimized:
        enable_clustering(buffer_size)
    else:
        disable_clustering()

    # test
    split()
    apply_sanity_check(split_filepath)

    # assert
    arr_list = create_arrays_for_comparison()
    store_correct()
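
# Standalone sketch of a coarser round-trip check without the test harness,
# assuming (as store_correct does) that the split file stores one dataset per
# chunk named '/data0', '/data1', ...: the voxel counts of all split datasets
# must add up to the source array's size.
def _split_volume_sketch(split_filepath, expected_voxels=100 ** 3):
    with h5py.File(split_filepath, 'r') as f:
        total = sum(int(f[k].size) for k in f.keys())
    assert total == expected_voxels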
def test_create_buffers_blocks():
    """ Test if the buffering works according to clustered writes in all 3
    possible configurations.

    Data:
    -----
        input array shape: 100x100x100
        input arr created with 2 bytes per pixel
        block shape: 20x20x20

    Which gives us:
    ---------------
        - nb blocks per row = 5
        - nb blocks per slice = 25
        - block size in bytes: (20*20*20) * 2 bytes = 16000
    """
    cs = (20, 20, 20)
    case = Split(pytest.test_array_path, cs)
    case.split_hdf5("./split_file.hdf5", nb_blocks=None)
    arr = case.get()
    _, dicts = get_used_proxies(arr.dask.dicts)
    origarr_name = list(dicts['origarr_to_obj'].keys())[0]

    # EXPECTED BEHAVIOR FOR CLUSTERED WRITES
    l1 = [[i] for i in range(125)]  # 1 block

    l2 = list()  # 3 blocks
    for i in range(25):
        o = (i * 5)
        l2.append([0 + o, 1 + o, 2 + o])
        l2.append([3 + o, 4 + o])

    l3 = list()  # 1 block column
    for i in range(25):
        l3.append(list(range(i * 5, i * 5 + 5)))

    l4 = list()  # 2 block columns
    for i in range(5):
        o = i * 25  # offset
        l4.append(list(range(0 + o, 10 + o)))
        l4.append(list(range(10 + o, 20 + o)))
        l4.append(list(range(20 + o, 25 + o)))

    l5 = list()  # 1 block slice
    for i in range(5):
        l5.append(list(range((i * 25), (i * 25) + 25)))

    l6 = list()  # 3 block slices
    l6.append(list(range(0, 25 * 3)))
    l6.append(list(range(75, 125)))

    l7 = [list(range(125))]  # whole array

    nb_bytes_per_block = 20 * 20 * 20
    byte_size = 2
    experiment_params = {
        nb_bytes_per_block * byte_size: l1,              # 1 block
        nb_bytes_per_block * byte_size * 3: l2,          # some blocks (3)
        nb_bytes_per_block * byte_size * 5: l3,          # 1 block column
        nb_bytes_per_block * byte_size * 5 * 2: l4,      # some block columns (2)
        nb_bytes_per_block * byte_size * 5 * 5: l5,      # 1 block slice
        nb_bytes_per_block * byte_size * 5 * 5 * 3: l6,  # some block slices (3)
        nb_bytes_per_block * byte_size * 5 * 5 * 5: l7,  # whole array
    }

    for buffer_size, expected in experiment_params.items():
        logging.info("\nTesting buffer %s", buffer_size)
        logging.debug("Expecting %s", expected)
        enable_clustering(buffer_size, mem_limit=True)
        buffers = create_buffers(origarr_name, dicts, cs)
        logging.debug("Got %s", buffers)
        assert buffers == expected
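
# A standalone sketch of the grouping rule the expected lists above encode
# (an assumption about the intended strategy, not the project's actual
# implementation): a buffer of b blocks is filled with consecutive block
# indices, but never straddles a row boundary unless whole rows fit, nor a
# slice boundary unless whole slices fit.
def _group_blocks_sketch(total, per_row, per_slice, b):
    container = per_row if b < per_row else per_slice if b < per_slice else total
    out = list()
    for start in range(0, total, container):
        for s in range(start, start + container, b):
            out.append(list(range(s, min(s + b, start + container))))
    return out


# e.g. b=3 reproduces l2 and b=10 reproduces l4:
# _group_blocks_sketch(125, 5, 25, 3)[:2] == [[0, 1, 2], [3, 4]]
# _group_blocks_sketch(125, 5, 25, 10)[:3]
# == [list(range(10)), list(range(10, 20)), list(range(20, 25))]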