class TestConfig:
    def __init__(self, params, paths):
        self.params = params
        self.create_split_case(params, paths)
        self.create_merge_case(params, paths)

    def create_split_case(self, params, paths):
        try:
            if params["hardware"] == "hdd":
                self.hardware_path = paths["hdd_path"]
            else:
                self.hardware_path = paths["ssd_path"]

            self.cuboid_filepath = os.path.join(
                self.hardware_path, params["cuboid_name"] + ".hdf5")
            self.splitcase = Split(self.cuboid_filepath, params["chunk_shape"])
            self.splitcase.split_hdf5_multiple(self.hardware_path, nb_blocks=None)
        except Exception:
            print(traceback.format_exc())
            print("Something went wrong while creating case config.")
            exit(1)

    def create_merge_case(self, params, paths):
        try:
            if params["hardware"] == "hdd":
                self.hardware_path = paths["hdd_path"]
            else:
                self.hardware_path = paths["ssd_path"]

            self.merge_filepath = os.path.join(self.hardware_path, "merged.hdf5")
            self.mergecase = Merge(self.merge_filepath)
            self.mergecase.merge_hdf5_multiple(self.hardware_path, data_key='/data', store=True)
        except Exception:
            print(traceback.format_exc())
            print("Something went wrong while creating case config.")
            exit(1)

    def print_config(self):
        print('\n-------------------')
        print('Test configuration')
        print('-------------------')
        print('\nTest configurations:')
        print(f'\tHardware: {self.params["hardware"]}')
        print(f'\tCuboid name: {self.params["cuboid_name"]}')
        print(f'\tCuboid shape: "{self.params["array_shape"]}"')
        print(f'\tChunk shape: "{self.params["chunk_shape"]}"')
        print(f'\tChunk type: "{self.params["chunk_type"]}"')
        print('\nDask configuration:')
        print(f'\tOptimization enabled: {self.params["optimized"]}')
        print(f'\tBuffer size: {self.params["buffer_size"]} bytes')
        print(f'\tNb threads: {self.params["nthreads"]}')
        return
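# Illustrative usage sketch only: the key names below mirror those accessed by
# TestConfig above, but the concrete values (paths, shapes, sizes) are hypothetical
# and the cuboid file is assumed to already exist at the chosen location.
params = {
    "hardware": "hdd",
    "cuboid_name": "small_cuboid",
    "array_shape": (100, 100, 100),
    "chunk_shape": (20, 20, 20),
    "chunk_type": "blocks",
    "optimized": True,
    "buffer_size": 4 * 1024 ** 3,
    "nthreads": 1,
}
paths = {
    "hdd_path": "/mnt/hdd/experiments",
    "ssd_path": "/mnt/ssd/experiments",
}

config = TestConfig(params, paths)
config.print_config()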
def test_create_buffer_node():
    # preparation
    cs = (20, 20, 20)
    case = Split(pytest.test_array_path, cs)
    case.split_hdf5("./split_file.hdf5", nb_blocks=None)
    arr = case.get()
    graph = arr.dask.dicts
    _, dicts = get_used_proxies(graph)
    origarr_name = list(dicts['origarr_to_obj'].keys())[0]
    buffers = create_buffers(origarr_name, dicts, cs)

    # apply function
    keys = list()
    for buffer in buffers:
        key = create_buffer_node(graph, origarr_name, dicts, buffer, cs)
        keys.append(key)

    # test output
    buffers_key = origarr_name.split('-')[-1] + '-merged'

    indices = set()
    for buffer_key in graph[buffers_key].keys():
        _, start, end = buffer_key
        indices.add((start, end))

    buffers = set([(b[0], b[-1]) for b in buffers])

    assert buffers_key in graph.keys()
    assert len(indices) == len(buffers)
    assert buffers == indices
def test_split_and_merge_multiple(shape_to_test, nb_chunks):
    """ TODO: add asserts -> retrieve chunks and compare to what has been stored.
    """
    # remove split files from previous tests
    fileslist = list()
    for infilepath in glob.glob("[0-9]*_[0-9]*_[0-9]*.hdf5"):
        fileslist.append(infilepath)
    fileslist.append('./reconstructed.hdf5')
    for fn in fileslist:
        if os.path.isfile(fn):
            os.remove(fn)

    # split
    out_dirpath = './'
    case = Split(pytest.test_array_path, shape_to_test)
    case.split_hdf5_multiple(out_dirpath, nb_blocks=None)
    arr = case.get()
    arr.compute()
    case.clean()

    # merge back into one file
    in_dirpath = out_dirpath
    case = Merge('./reconstructed.hdf5')
    case.merge_hdf5_multiple(in_dirpath)
    arr = case.get()
    arr.compute()
    case.clean()

    logger.info("Inspecting filepath: './reconstructed.hdf5'")
    with h5py.File('./reconstructed.hdf5', 'r') as f:
        inspect_h5py_file(f)
        assert f['/data'].shape == (100, 100, 100)
def test_create_buffers_slabs():
    """ Test if the buffering works according to clustered writes when processing slabs.
    The only strategy that should be used is "block slices".
    """
    cs = (5, 100, 100)  # 20 chunks
    case = Split(pytest.test_array_path, cs)
    case.split_hdf5("./split_file.hdf5", nb_blocks=None)
    arr = case.get()
    _, dicts = get_used_proxies(arr.dask.dicts)
    origarr_name = list(dicts['origarr_to_obj'].keys())[0]

    nb_bytes_per_block = 100 * 100 * 5
    byte_size = 2
    l1 = [[i] for i in range(20)]
    l2 = [list(range(10)), list(range(10, 20))]
    l3 = [list(range(7)), list(range(7, 14)), list(range(14, 20))]
    experiment_params = {
        nb_bytes_per_block * byte_size: l1,
        nb_bytes_per_block * byte_size * 10: l2,
        nb_bytes_per_block * byte_size * 7: l3
    }

    for buffer_size, expected in experiment_params.items():
        logging.info("\nTesting buffer %s", buffer_size)
        logging.debug("Expecting %s", expected)
        enable_clustering(buffer_size, mem_limit=True)
        buffers = create_buffers(origarr_name, dicts, cs)
        logging.debug("Got %s", buffers)
        assert buffers == expected
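# Standalone sanity check of the slab arithmetic behind the thresholds above;
# only the shapes used in the test are assumed.
slab_size = 5 * 100 * 100 * 2   # bytes per (5, 100, 100) slab at 2 bytes per element
nb_slabs = 100 // 5             # 20 slabs in a 100x100x100 array
assert slab_size * nb_slabs == 100 * 100 * 100 * 2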
def split(inputfilepath, I):
    filetosplitpath = inputfilepath
    splitfilesshape = I
    case = Split(filetosplitpath, splitfilesshape)
    case.split_hdf5_multiple('./', nb_blocks=None)  # split all blocks into different files
    arr = case.get()
    arr.compute()
    case.clean()
def split():
    # overwrite if split file already exists
    if os.path.isfile(split_filepath):
        os.remove(split_filepath)

    case = Split(pytest.test_array_path, shape_to_test)
    case.split_hdf5(split_filepath, nb_blocks=nb_chunks)
    case.get().compute()
    return
def test_get_graph_from_dask():
    """ Test if it runs well.
    TODO: Better test function.
    """
    # create config for the test
    case = Split(pytest.test_array_path, "auto")
    case.sum(nb_chunks=None)
    dask_array = case.get()

    # test function
    dask_graph = dask_array.dask.dicts
    graph = get_graph_from_dask(dask_graph, undirected=False)
def test_split_multiple(shape_to_test, nb_chunks):
    """ TODO: add asserts -> retrieve chunks and compare to what has been stored.
    """
    out_dirpath = './'
    case = Split(pytest.test_array_path, shape_to_test)
    case.split_hdf5_multiple(out_dirpath, nb_blocks=None)
    arr = case.get()
    # arr.visualize(filename='/tmp/dask_io_visualize_split_multiple.svg')
    arr.compute()
    case.clean()

    for filepath in glob.glob("*.hdf5"):
        logger.info("Inspecting filepath: %s", filepath)
        with h5py.File(filepath, 'r') as f:
            inspect_h5py_file(f)
def test_sum(shape_to_test, nb_chunks):
    """ Test that the sum of two blocks yields the correct result when using our optimization function.
    """
    logger.info("testing shape %s", shape_to_test)

    # prepare test case
    case = Split(pytest.test_array_path, shape_to_test)
    case.sum(nb_chunks)

    # non-optimized run
    disable_clustering()
    result_non_opti = case.get().compute()

    # optimized run
    enable_clustering(buffer_size)
    result_opti = case.get().compute()

    assert np.array_equal(result_non_opti, result_opti)
def split(inputfilepath, I, datadir):
    """ Split the input array stored at inputfilepath into output files of shape I, written to datadir.

    Arguments:
    ----------
        inputfilepath: Path to the input file we want to split.
        I: Output file shape, i.e. the shape of the chunk stored in each output file.
        datadir: Path to the directory in which to store the output files.
    """
    print("[preprocessing] Splitting input array...")
    case = Split(inputfilepath, I)
    case.split_hdf5_multiple(datadir, nb_blocks=None)
    arr = case.get()

    buffer_shape = ONE_GIG * 5  # buffer size in bytes
    # enable_clustering(buffer_shape)
    with dask.config.set(scheduler='single-threaded'):
        arr.compute()
    # disable_clustering()

    case.clean()
    print('Split done.')
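# Hypothetical invocation of the preprocessing helper above; the input file,
# output-file shape I and data directory are placeholders only.
split("/data/input_cuboid.hdf5", (20, 20, 20), "/data/split_files/")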
def test_get_blocks_used():
    cs = (20, 20, 20)
    case = Split(pytest.test_array_path, cs)
    case.split_hdf5("./split_file.hdf5", nb_blocks=None)
    arr = case.get()

    # routine to get the needed data
    # we assume those functions have been tested before get_blocks_used
    cs_confirmed, dicts = get_used_proxies(arr.dask.dicts)
    assert cs == cs_confirmed

    origarr_name = list(dicts['origarr_to_obj'].keys())[0]
    arr_obj = dicts['origarr_to_obj'][origarr_name]
    strategy, max_blocks_per_load = get_load_strategy(ONE_GIG, cs, (100, 100, 100))

    # actual test of the function
    blocks_used, block_to_proxies = get_blocks_used(dicts, origarr_name, arr_obj, cs)

    blocks_used.sort()
    expected = list(range(125))
    assert blocks_used == expected
def split(datadir, filepath, cs, split_files=True):
    """
    Arguments:
    ----------
        split_files: if True, split into multiple files;
            if False, split inside a single hdf5 file.
    """
    print("Splitting...")
    splitcase = Split(filepath, chunk_shapes[cs])
    if split_files:
        splitcase.split_hdf5_multiple(datadir, nb_blocks=None)
    else:
        out_filepath = os.path.join(datadir, "split.hdf5")
        splitcase.split_hdf5(out_filepath, nb_blocks=None)
    arr = splitcase.get()

    try:
        with dask.config.set(scheduler='single-threaded'):
            tsplit = run(arr)
        splitcase.clean()
        return tsplit
    except Exception as e:
        print(e, "\nOops something went wrong... Aborting.")
        splitcase.clean()
        sys.exit(1)
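# Hypothetical driver for the helper above; it assumes the module-level
# `chunk_shapes` mapping and the `run` timing helper referenced in split() exist.
# The paths and the "small" key are placeholders.
chunk_shapes = {"small": (20, 20, 20)}
tsplit = split("/data/outdir", "/data/input_cuboid.hdf5", "small", split_files=True)
print(f"split time: {tsplit}")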
def test_create_buffers_blocks():
    """ Test if the buffering works according to clustered writes in all 3 possible configurations.

    Data:
    -----
        input array shape: 100x100x100
        input array created with 2 bytes per pixel
        block shape: 20x20x20

    Which gives us:
    ---------------
        - nb blocks per row = 5
        - nb blocks per slice = 25
        - block size in bytes: (20*20*20) * 2 bytes = 16000
    """
    cs = (20, 20, 20)
    case = Split(pytest.test_array_path, cs)
    case.split_hdf5("./split_file.hdf5", nb_blocks=None)
    arr = case.get()
    _, dicts = get_used_proxies(arr.dask.dicts)
    origarr_name = list(dicts['origarr_to_obj'].keys())[0]

    # EXPECTED BEHAVIOR FOR CLUSTERED WRITES
    l1 = [[i] for i in range(125)]  # 1 block

    l2 = list()  # 3 blocks
    for i in range(25):
        o = (i * 5)
        l2.append([0 + o, 1 + o, 2 + o])
        l2.append([3 + o, 4 + o])

    l3 = list()  # 1 block column
    for i in range(25):
        l3.append(list(range(i * 5, i * 5 + 5)))

    l4 = list()  # 2 block columns
    for i in range(5):
        o = i * 25  # offset
        l4.append(list(range(0 + o, 10 + o)))
        l4.append(list(range(10 + o, 20 + o)))
        l4.append(list(range(20 + o, 25 + o)))

    l5 = list()  # 1 block slice
    for i in range(5):
        l5.append(list(range((i * 25), (i * 25) + 25)))

    l6 = list()  # 3 block slices
    l6.append(list(range(0, 25 * 3)))
    l6.append(list(range(75, 125)))

    l7 = [list(range(125))]  # whole array

    nb_bytes_per_block = 20 * 20 * 20
    byte_size = 2
    experiment_params = {
        nb_bytes_per_block * byte_size: l1,              # 1 block
        nb_bytes_per_block * byte_size * 3: l2,          # some blocks (3)
        nb_bytes_per_block * byte_size * 5: l3,          # 1 block column
        nb_bytes_per_block * byte_size * 5 * 2: l4,      # some block columns (2)
        nb_bytes_per_block * byte_size * 5 * 5: l5,      # 1 block slice
        nb_bytes_per_block * byte_size * 5 * 5 * 3: l6,  # some block slices (3)
        nb_bytes_per_block * byte_size * 5 * 5 * 5: l7,  # whole array
    }

    for buffer_size, expected in experiment_params.items():
        logging.info("\nTesting buffer %s", buffer_size)
        logging.debug("Expecting %s", expected)
        enable_clustering(buffer_size, mem_limit=True)
        buffers = create_buffers(origarr_name, dicts, cs)
        logging.debug("Got %s", buffers)
        assert buffers == expected
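# Standalone sanity check of the buffer-size arithmetic used in the test above;
# only the numbers stated in its docstring are assumed.
block_size = 20 * 20 * 20 * 2             # bytes per 20x20x20 block at 2 bytes per element
blocks_per_row = 100 // 20                # blocks along one axis of the 100^3 array
blocks_per_slice = blocks_per_row ** 2
total_blocks = blocks_per_row ** 3
assert block_size == 16000
assert (blocks_per_row, blocks_per_slice, total_blocks) == (5, 25, 125)
# a buffer holding exactly one block column therefore needs block_size * 5 bytes:
assert block_size * blocks_per_row == 80000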