import logging
import os

import pytest

# NOTE: Split, get_used_proxies, create_buffers, create_buffer_node,
# enable_clustering, get_load_strategy, get_blocks_used, ONE_GIG and
# apply_clustered_strategy come from the dask_io package; their exact
# module paths, like the `logger`, `current_dir` and `logfilename`
# globals used below, are defined elsewhere in the project.


def test_create_buffer_node():
    # preparation
    cs = (20, 20, 20)
    case = Split(pytest.test_array_path, cs)
    case.split_hdf5("./split_file.hdf5", nb_blocks=None)
    arr = case.get()

    graph = arr.dask.dicts
    _, dicts = get_used_proxies(graph)
    origarr_name = list(dicts['origarr_to_obj'].keys())[0]
    buffers = create_buffers(origarr_name, dicts, cs)

    # apply function
    keys = list()
    for buffer in buffers:
        key = create_buffer_node(graph, origarr_name, dicts, buffer, cs)
        keys.append(key)

    # test output
    buffers_key = origarr_name.split('-')[-1] + '-merged'

    indices = set()
    for buffer_key in graph[buffers_key].keys():
        _, start, end = buffer_key
        indices.add((start, end))

    buffers = set([(b[0], b[-1]) for b in buffers])

    assert buffers_key in graph.keys()
    assert len(indices) == len(buffers)
    assert buffers == indices
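
# A hedged illustration of the graph layout the test above asserts (the
# "abc123" prefix is hypothetical; the real one is the hash part of
# origarr_name). Each buffer b becomes one sub-key recording its first
# and last block index:
#
#   graph["abc123-merged"] = {
#       ("abc123-merged", b[0], b[-1]): <task loading blocks b[0]..b[-1]>,
#       ...  # one entry per buffer returned by create_buffers()
#   }
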
def test_create_buffers_slabs():
    """ Test if the buffering works according to clustered writes when
    processing slabs. The only strategy that should be used is
    "block slices".
    """
    cs = (5, 100, 100)  # 20 chunks
    case = Split(pytest.test_array_path, cs)
    case.split_hdf5("./split_file.hdf5", nb_blocks=None)
    arr = case.get()
    _, dicts = get_used_proxies(arr.dask.dicts)
    origarr_name = list(dicts['origarr_to_obj'].keys())[0]

    nb_bytes_per_block = 100 * 100 * 5
    byte_size = 2
    l1 = [[i] for i in range(20)]
    l2 = [list(range(10)), list(range(10, 20))]
    l3 = [list(range(7)), list(range(7, 14)), list(range(14, 20))]
    experiment_params = {
        nb_bytes_per_block * byte_size: l1,
        nb_bytes_per_block * byte_size * 10: l2,
        nb_bytes_per_block * byte_size * 7: l3,
    }

    for buffer_size, expected in experiment_params.items():
        logging.info("\nTesting buffer %s", buffer_size)
        logging.debug("Expecting %s", expected)
        enable_clustering(buffer_size, mem_limit=True)
        buffers = create_buffers(origarr_name, dicts, cs)
        logging.debug("Got %s", buffers)
        assert buffers == expected
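
# A minimal sketch (ours, not part of the library) of the grouping the test
# above expects: with slabs, consecutive slab indices are packed into buffers
# of buffer_size // block_bytes blocks each, "block slices" style.
def _expected_slab_buffers(nb_blocks, blocks_per_buffer):
    """Hypothetical helper: group consecutive slab indices into buffers."""
    return [list(range(i, min(i + blocks_per_buffer, nb_blocks)))
            for i in range(0, nb_blocks, blocks_per_buffer)]

# _expected_slab_buffers(20, 1)  == l1
# _expected_slab_buffers(20, 10) == l2
# _expected_slab_buffers(20, 7)  == l3
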
def clustered_optimization(graph):
    """ Apply the clustered I/O optimization on a Dask graph.

    Arguments:
    ----------
        graph: dask_array.dask.dicts
    """
    logger.info("Configuration file is at %s",
                os.path.join(current_dir, 'logging_config.ini'))
    logger.info("Log file: %s", logfilename)

    logger.info("Finding proxies.")
    chunk_shape, dicts = get_used_proxies(graph)

    if chunk_shape is None or dicts is None:
        logger.error("Chunk shape or dicts is None. Aborting dask_io optimization.")
        raise ValueError("Chunk shape or dicts is None.")

    logger.info("Launching optimization algorithm.")
    apply_clustered_strategy(graph, dicts, chunk_shape)
    return graph
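
# Usage sketch (an assumption, not shown in this excerpt): the tests feed
# `arr.dask.dicts` to the graph helpers, so the optimizer would be invoked
# the same way on a dask array `arr` built by Split.get():
#
#   arr = case.get()
#   optimized = clustered_optimization(arr.dask.dicts)
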
def test_get_blocks_used():
    cs = (20, 20, 20)
    case = Split(pytest.test_array_path, cs)
    case.split_hdf5("./split_file.hdf5", nb_blocks=None)
    arr = case.get()

    # routine to get the needed data
    # we assume those functions have been tested before get_blocks_used
    cs_confirmed, dicts = get_used_proxies(arr.dask.dicts)
    assert cs == cs_confirmed

    origarr_name = list(dicts['origarr_to_obj'].keys())[0]
    arr_obj = dicts['origarr_to_obj'][origarr_name]
    strategy, max_blocks_per_load = get_load_strategy(ONE_GIG, cs,
                                                      (100, 100, 100))

    # actual test of the function
    blocks_used, block_to_proxies = get_blocks_used(dicts, origarr_name,
                                                    arr_obj, cs)

    blocks_used.sort()
    expected = list(range(125))
    assert blocks_used == expected
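
# Why expected == list(range(125)): splitting a 100x100x100 array into
# 20x20x20 blocks touches every block, and there are ceil(100/20)**3 = 125
# of them. A small sketch of that arithmetic (helper name is ours):
def _nb_blocks(array_shape, block_shape):
    """Hypothetical helper: number of blocks in a blocked array."""
    n = 1
    for dim, block in zip(array_shape, block_shape):
        n *= (dim + block - 1) // block  # ceil division
    return n

# _nb_blocks((100, 100, 100), (20, 20, 20)) == 125
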
def keep_algorithm(dsk, keys):
    print("Inside the keep algorithm")
    dask_graph = dsk.dicts
    chunk_shape, dicts = get_used_proxies(dask_graph)
    return dsk
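
# Usage sketch: `keep_algorithm` inspects the proxies but returns the graph
# unchanged, and its (dsk, keys) signature matches a dask optimization
# callback. Registering it through dask's configuration (this wiring is an
# assumption, not shown in the source) would look like:
#
#   import dask
#   with dask.config.set({"optimizations": [keep_algorithm]}):
#       result = arr.compute()
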
def test_create_buffers_blocks():
    """ Test if the buffering works according to clustered writes in all
    3 possible configurations.

    Data:
    -----
        input array shape: 100x100x100
        input array created with 2 bytes per voxel
        block shape: 20x20x20

    Which gives us:
    ---------------
        - nb blocks per row = 5
        - nb blocks per slice = 25
        - block size in bytes: (20*20*20) * 2 bytes = 16000
    """
    cs = (20, 20, 20)
    case = Split(pytest.test_array_path, cs)
    case.split_hdf5("./split_file.hdf5", nb_blocks=None)
    arr = case.get()
    _, dicts = get_used_proxies(arr.dask.dicts)
    origarr_name = list(dicts['origarr_to_obj'].keys())[0]

    # EXPECTED BEHAVIOR FOR CLUSTERED WRITES
    l1 = [[i] for i in range(125)]  # 1 block

    l2 = list()  # 3 blocks
    for i in range(25):
        o = i * 5  # offset
        l2.append([0 + o, 1 + o, 2 + o])
        l2.append([3 + o, 4 + o])

    l3 = list()  # 1 block column
    for i in range(25):
        l3.append(list(range(i * 5, i * 5 + 5)))

    l4 = list()  # 2 block columns
    for i in range(5):
        o = i * 25  # offset
        l4.append(list(range(0 + o, 10 + o)))
        l4.append(list(range(10 + o, 20 + o)))
        l4.append(list(range(20 + o, 25 + o)))

    l5 = list()  # 1 block slice
    for i in range(5):
        l5.append(list(range(i * 25, (i * 25) + 25)))

    l6 = list()  # 3 block slices
    l6.append(list(range(0, 25 * 3)))
    l6.append(list(range(75, 125)))

    l7 = [list(range(125))]  # whole array

    nb_bytes_per_block = 20 * 20 * 20
    byte_size = 2
    experiment_params = {
        nb_bytes_per_block * byte_size: l1,              # 1 block
        nb_bytes_per_block * byte_size * 3: l2,          # some blocks (3)
        nb_bytes_per_block * byte_size * 5: l3,          # 1 block column
        nb_bytes_per_block * byte_size * 5 * 2: l4,      # some block columns (2)
        nb_bytes_per_block * byte_size * 5 * 5: l5,      # 1 block slice
        nb_bytes_per_block * byte_size * 5 * 5 * 3: l6,  # some block slices (3)
        nb_bytes_per_block * byte_size * 5 * 5 * 5: l7,  # whole array
    }

    for buffer_size, expected in experiment_params.items():
        logging.info("\nTesting buffer %s", buffer_size)
        logging.debug("Expecting %s", expected)
        enable_clustering(buffer_size, mem_limit=True)
        buffers = create_buffers(origarr_name, dicts, cs)
        logging.debug("Got %s", buffers)
        assert buffers == expected
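
# A compact restatement (ours, not the library's) of the thresholds the
# expected lists above encode, given 16000-byte blocks, 5 blocks per row
# and 25 blocks per slice:
def _expected_strategy(buffer_size, block_bytes=16000,
                       blocks_per_row=5, blocks_per_slice=25):
    """Hypothetical helper: which grouping a given buffer size allows."""
    nb_blocks = buffer_size // block_bytes
    if nb_blocks >= blocks_per_slice:
        return "block slices"   # l5, l6, l7
    if nb_blocks >= blocks_per_row:
        return "block columns"  # l3, l4
    return "blocks"             # l1, l2
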