def test_merge_cached_volumes():
    # prep case
    R = R_test
    B = B_test
    O = O_test
    buffers_partition = get_blocks_shape(R, B)
    buffers_volumes = get_named_volumes(buffers_partition, B)
    outfiles_partition = get_blocks_shape(R, O)
    outfiles_volumes = get_named_volumes(outfiles_partition, O)

    buff_to_vols = get_buff_to_vols(R, B, O, buffers_volumes, buffers_partition)
    test_arrays = get_arrays_dict(buff_to_vols, buffers_volumes, outfiles_volumes, outfiles_partition)

    # do the merge
    merge_cached_volumes(test_arrays, volumes_to_keep_test)

    # assert
    expected = {
        0: 1,
        1: 1,  # << modified
        2: 1,
        3: 2,
        4: 3,  # << modified
        5: 2,
        6: 1,
        7: 2,
        8: 1
    }
    test_arrays_lengths = {k: len(v) for (k, v) in test_arrays.items()}

    # logger.debug("----------After merge:")
    # neat_print(test_arrays)

    for k, v in expected.items():
        assert test_arrays_lengths[k] == v
def test_clean_arrays_dict():
    # prep case
    R = R_test
    B = B_test
    O = O_test
    buffers_partition = get_blocks_shape(R, B)
    buffers_volumes = get_named_volumes(buffers_partition, B)
    outfiles_partition = get_blocks_shape(R, O)
    outfiles_volumes = get_named_volumes(outfiles_partition, O)

    buff_to_vols = get_buff_to_vols(R, B, O, buffers_volumes, buffers_partition)
    test_arrays = get_arrays_dict(buff_to_vols, buffers_volumes, outfiles_volumes, outfiles_partition)
    merge_cached_volumes(test_arrays, volumes_to_keep_test)

    # do the clean
    clean_arrays_dict(test_arrays)

    # logger.debug("----------After cleaning:")
    # logger.debug(test_arrays)

    for outputfile_key, expected_array_list in d_arrays_expected.items():
        arrays_list = test_arrays[outputfile_key]
        expected_array_list = list(map(str, expected_array_list))
        arrays_list = list(map(str, arrays_list))
        for e in expected_array_list:
            assert e in arrays_list
def test_get_dirty_arrays_dict():
    """ By "dirty" we mean not cleaned -> see the clean function. """
    R = R_test
    B = B_test
    O = O_test
    buffers_partition = get_blocks_shape(R, B)
    buffers_volumes = get_named_volumes(buffers_partition, B)
    outfiles_partition = get_blocks_shape(R, O)
    outfiles_volumes = get_named_volumes(outfiles_partition, O)

    buff_to_vols = get_buff_to_vols(R, B, O, buffers_volumes, buffers_partition)
    test_arrays = get_arrays_dict(buff_to_vols, buffers_volumes, outfiles_volumes, outfiles_partition)
    test_arrays_lengths = {k: len(v) for (k, v) in test_arrays.items()}

    expected = {
        0: 1,
        1: 2,
        2: 1,
        3: 2,
        4: 4,
        5: 2,
        6: 1,
        7: 2,
        8: 1
    }

    logger.debug("----------Before merge:")
    neat_print(test_arrays)

    for k, v in expected.items():
        assert test_arrays_lengths[k] == v
def test_get_buff_to_vols():
    # smoke test: check that get_buff_to_vols runs on the test case without raising
    R = R_test
    B = B_test
    O = O_test
    buffers_partition = get_blocks_shape(R, B)
    buffers_volumes = get_named_volumes(buffers_partition, B)
    outfiles_partition = get_blocks_shape(R, O)
    outfiles_volumes = get_named_volumes(outfiles_partition, O)

    buff_to_vols = get_buff_to_vols(R, B, O, buffers_volumes, buffers_partition)
def test_get_volumes():
    """ Test getmain and gethidden (the main/hidden volume computations). """
    R = (1, 120, 120)
    B = (1, 60, 60)
    O = (1, 40, 40)

    logger.debug("FUNCTION test_get_volumes ---")

    from dask_io.optimizer.utils.utils import numeric_to_3d_pos
    from dask_io.optimizer.cases.resplit_utils import get_blocks_shape

    buffers_partition = get_blocks_shape(R, B)
    for bufferindex in range(4):
        logger.debug("buffer %s", bufferindex)
        _3d_index = numeric_to_3d_pos(bufferindex, buffers_partition, order='F')

        # for each dimension, C is the distance from the last output-file border
        # crossed by this buffer to the buffer's end, and T = B - C is the offset
        # of that border inside the buffer
        T = list()
        for dim in range(3):
            nb = _3d_index[dim] + 1
            logger.debug("nb:%s", nb)
            C = (nb * B[dim]) % O[dim]
            if C == 0 and B[dim] != O[dim]:
                C = O[dim]
            T.append(B[dim] - C)
            logger.debug("C: %s", C)
            logger.debug("T: %s", T)

        main_volumes = get_main_volumes(B, T)
        assert len(main_volumes) == 4
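# To make the C / T arithmetic in test_get_volumes concrete, here is the same
# computation for a single axis with B = 60 and O = 40, as a standalone sketch.
# The interpretation given in the comments is an assumption drawn from the test
# code above, not from the library documentation.
def _example_theta_arithmetic():
    B_dim, O_dim = 60, 40          # buffer and output-block lengths along one axis
    for nb in (1, 2):              # nb = 1-based index of the buffer along the axis
        C = (nb * B_dim) % O_dim   # distance from the last output border to the buffer end
        if C == 0 and B_dim != O_dim:
            C = O_dim              # buffer end coincides with a border: wrap to O
        T = B_dim - C              # offset of that border inside the buffer
        print(f"buffer #{nb}: C={C}, T={T}")
    # expected output:
    # buffer #1: C=20, T=40
    # buffer #2: C=40, T=20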
def verify_results(outdir_path, original_array_path, R, O):
    """ Compare each output file to the corresponding block of the original array. """
    from dask_io.optimizer.cases.resplit_utils import get_blocks_shape

    outfiles_partition = get_blocks_shape(R, O)
    all_true = True

    with h5py.File(original_array_path, 'r') as f:
        orig_arr = f["/data"]

        for i in range(outfiles_partition[0]):
            for j in range(outfiles_partition[1]):
                for k in range(outfiles_partition[2]):
                    outfilename = f"{i}_{j}_{k}.hdf5"
                    with h5py.File(os.path.join(outdir_path, outfilename), 'r') as f_out:
                        data_stored = f_out["/data"]
                        print(f"Slices from ground truth {i*O[0]}:{(i+1)*O[0]}, {j*O[1]}:{(j+1)*O[1]}, {k*O[2]}:{(k+1)*O[2]}")
                        ground_truth = orig_arr[i * O[0]:(i + 1) * O[0],
                                                j * O[1]:(j + 1) * O[1],
                                                k * O[2]:(k + 1) * O[2]]

                        # print(data_stored[()])
                        # print(ground_truth)

                        try:
                            assert np.allclose(data_stored[()], ground_truth)
                            print(f"Good output file {outfilename}")
                        except AssertionError:
                            print(f"Error: bad rechunking {outfilename}")
                            all_true = False  # do not return here, so we see all failures

    return all_true
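# Possible call site for verify_results() after a rechunking run. The paths and
# shapes below are illustrative placeholders, not values taken from the actual
# experiment configuration.
def _example_verify_results():
    R = (1, 120, 120)   # assumed shape of the reconstructed array
    O = (1, 40, 40)     # assumed shape of each output block
    ok = verify_results("output_blocks/", "original.hdf5", R, O)
    print("all output files match the original array:", ok)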
def verify_results_split(R, I, input_array_path, datadir):
    from dask_io.optimizer.cases.resplit_utils import get_blocks_shape

    splitfiles_partition = get_blocks_shape(R, I)
    print("split files partition:", splitfiles_partition)

    all_true = True
    orig_arr = get_dask_array_from_hdf5(input_array_path, "/data", logic_cs=tuple(I))

    for i in range(splitfiles_partition[0]):
        for j in range(splitfiles_partition[1]):
            for k in range(splitfiles_partition[2]):
                splitfilename = f"{i}_{j}_{k}.hdf5"
                split_filepath = os.path.join(datadir, splitfilename)
                print("opening", split_filepath)
                splitarray = get_dask_array_from_hdf5(split_filepath, "/data")

                print(f"Slices from ground truth {i*I[0]}:{(i+1)*I[0]}, {j*I[1]}:{(j+1)*I[1]}, {k*I[2]}:{(k+1)*I[2]}")
                ground_truth_arr = orig_arr[i * I[0]:(i + 1) * I[0],
                                            j * I[1]:(j + 1) * I[1],
                                            k * I[2]:(k + 1) * I[2]]

                verify_task = da.allclose(ground_truth_arr, splitarray)
                print("VERIFY TASK: ", verify_task)

                disable_clustering()
                _res = verify_task.compute()
                print("RESULT: ", _res)

                if not _res:
                    print(f"[Error] Split failed for {splitfilename}")
                    all_true = False

    clean_files()
    return all_true
def test_regions_dict():
    """ Given arrays_dict, check that get_regions_dict returns the expected regions_dict. """
    logger.debug("== Function == test_regions_dict")
    R = R_test
    O = O_test
    outfiles_partition = get_blocks_shape(R, O)
    outfiles_volumes = get_named_volumes(outfiles_partition, O)
    regions_dict = get_regions_dict(d_arrays_expected, outfiles_volumes)

    for outputfile_key, expected_regions_list in regions_dict.items():
        regions_list = regions_dict[outputfile_key]
        expected_regions_list = list(map(str, expected_regions_list))
        regions_list = list(map(str, regions_list))

        logger.debug("Outfile n°%s", outputfile_key)
        logger.debug("Associated regions:")
        for e in regions_list:
            logger.debug("\t%s", e)

        for e in expected_regions_list:
            assert e in regions_list
def apply_store(B, O, R, volumestokeep, reconstructed_array):
    """ Apply da.store, using the keep strategy. """
    # create the data needed by the dask store function
    d_arrays, d_regions = compute_zones(B, O, R, volumestokeep)

    out_files = list()  # to keep outfiles open during processing
    sources = list()
    targets = list()
    regions = list()

    for outfile_index in sorted(d_arrays.keys()):
        sliceslistoflist = d_arrays[outfile_index]

        # create file
        outfiles_partition = get_blocks_shape(R, O)
        _3d_pos = numeric_to_3d_pos(outfile_index, outfiles_partition, order='F')
        i, j, k = _3d_pos
        out_filename = f'{i}_{j}_{k}.hdf5'
        out_file = h5py.File(os.path.join(outdir_path, out_filename), 'w')
        out_files.append(out_file)

        # create dset
        dset = out_file.create_dataset('/data', shape=O, dtype=np.float16)

        for s_index, st in enumerate(sliceslistoflist):
            tmp_array = reconstructed_array[st[0], st[1], st[2]]
            # print("Volume to be stored shape: ", tmp_array.shape)
            reg = d_regions[outfile_index][s_index]

            tmp_array = tmp_array.rechunk(tmp_array.shape)
            sources.append(tmp_array)
            targets.append(dset)
            regions.append(reg)

    return da.store(sources, targets, regions=regions, compute=False), out_files
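# Illustrative driver for apply_store(), assembled from calls that appear
# elsewhere in this repository (Merge, merge_hdf5_multiple, da.store). The
# shapes, the volumestokeep list and the input directory are placeholder
# assumptions; note that apply_store() writes into the module-level
# outdir_path, so that global must be set beforehand.
def _example_apply_store(indir_path="input_blocks/"):
    import dask

    R = (1, 120, 120)      # assumed shape of the reconstructed array
    B = (1, 60, 60)        # assumed buffer shape
    O = (1, 40, 40)        # assumed output block shape
    volumestokeep = [1]    # assumed list of volumes to keep cached

    # rebuild the full array lazily from the input blocks
    case = Merge('samplename')
    case.merge_hdf5_multiple(indir_path, store=False)
    reconstructed_array = case.get()

    # build the delayed store task, then execute it
    store_task, out_files = apply_store(B, O, R, volumestokeep, reconstructed_array)
    with dask.config.set(scheduler='single-threaded'):
        store_task.compute()
    for f in out_files:
        f.close()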
def check_outputs():
    # sanity check
    outfiles = list()
    for fpath in glob.glob("[0-9].hdf5"):  # single-digit names only: excludes split files from previous tests
        print(f'Filename: {fpath}')
        with h5py.File(fpath, 'r') as f:
            inspect_h5py_file(f)

    # prepare ground truth for verification
    arrays_expected = dict()
    outfiles_partition = get_blocks_shape((1, 120, 120), O)
    outfiles_volumes = get_named_volumes(outfiles_partition, O)
    for outfilekey, volume in outfiles_volumes.items():
        slices = convert_Volume_to_slices(volume)
        arrays_expected[outfilekey] = reconstructed_array[slices[0], slices[1], slices[2]]

    # verify
    for fpath in glob.glob("[0-9].hdf5"):
        outputfile_index = int(fpath.split('.')[0])
        print(f'Output file index: {outputfile_index}')

        array_stored = get_dask_array_from_hdf5(fpath, '/data', logic_cs="dataset_shape")
        arr_expected = arrays_expected[outputfile_index]
        print("equal:", da.allclose(array_stored, arr_expected).compute())
        print("stored:", array_stored[slice(0, 1, None), slice(0, 1, None), slice(0, 10, None)].compute())
        print("expected", arr_expected[slice(0, 1, None), slice(0, 1, None), slice(0, 10, None)].compute())
import logging
import logging.config

logging.config.dictConfig({
    'version': 1,
    'disable_existing_loggers': True,
})

for case in cases:
    _type, R, O, I, B, volumestokeep = (int(case["type"]), tuple(case["R"]),
                                        tuple(case["O"]), tuple(case["I"]),
                                        tuple(case["B"]), case["volumestokeep"])
    print(f'Current run ------ \nType: {_type}\nR: {R},\nO: {O},\nI: {I}\nvolumestokeep: {volumestokeep}')

    buffers_partition = get_blocks_shape(R, B)
    buffers_volumes = get_named_volumes(buffers_partition, B)

    # find omega and theta max
    omega_max = [0, 0, 0]
    T_max = [0, 0, 0]
    for buffer_index in buffers_volumes.keys():
        _3d_index = numeric_to_3d_pos(buffer_index, buffers_partition, order='F')
        T, Cs = get_theta(buffers_volumes, buffer_index, _3d_index, O, B)

        for i in range(3):
            if Cs[i] > omega_max[i]:
                omega_max[i] = Cs[i]
            if T[i] > T_max[i]:
                T_max[i] = T[i]
def rechunk_vanilla_dask(indir_path, outdir_path, nthreads, R, O, model):
    """ Rechunk using vanilla dask. """
    in_arrays = load_input_files(indir_path)

    case = Merge('samplename')
    case.merge_hdf5_multiple(indir_path, store=False)
    reconstructed_array = case.get()

    out_files = list()  # to keep outfiles open during processing
    sources = list()
    targets = list()
    outfiles_partition = get_blocks_shape(R, O)
    for i in range(outfiles_partition[0]):
        for j in range(outfiles_partition[1]):
            for k in range(outfiles_partition[2]):
                out_filename = f'{i}_{j}_{k}.hdf5'
                out_file = h5py.File(os.path.join(outdir_path, out_filename), 'w')
                dset = out_file.create_dataset('/data', shape=O, dtype=np.float16)

                tmp_array = reconstructed_array[i * O[0]:(i + 1) * O[0],
                                                j * O[1]:(j + 1) * O[1],
                                                k * O[2]:(k + 1) * O[2]]
                print(f'{i*O[0]}: {(i+1)*O[0]}, {j*O[1]}: {(j+1)*O[1]}, {k*O[2]}: {(k+1)*O[2]}')

                out_files.append(out_file)
                sources.append(tmp_array)
                targets.append(dset)

    rechunk_task = da.store(sources, targets, compute=False)

    # rechunk_task.visualize(filename="tmp_dir/test_graph_vanilla.png")
    # sys.exit()

    with Profiler() as prof, ResourceProfiler(dt=0.25) as rprof, CacheProfiler() as cprof:
        scheduler = 'single-threaded' if nthreads == 1 else 'threads'
        with dask.config.set(scheduler=scheduler):
            try:
                t = time.time()
                rechunk_task.compute()
                t = time.time() - t
                # visualize([prof, rprof, cprof])
            except Exception as e:
                print(e, "\nSomething went wrong during graph execution.")
                t = None

    diagnostics = os.path.join(outdir_path, 'exp5_' + str(model) + '.html')
    visualize([prof, rprof, cprof], diagnostics, show=False)
    clean_files()

    for f in out_files:
        f.close()
    return t
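# Minimal sketch of how rechunk_vanilla_dask() might be invoked; all argument
# values below are illustrative placeholders.
def _example_rechunk_vanilla_dask():
    elapsed = rechunk_vanilla_dask(
        indir_path="input_blocks/",    # directory holding the split HDF5 files
        outdir_path="output_blocks/",  # directory receiving the rechunked files
        nthreads=1,                    # 1 -> single-threaded scheduler, otherwise threads
        R=(1, 120, 120),               # shape of the reconstructed array
        O=(1, 40, 40),                 # shape of each output block
        model="vanilla")               # label used to name the diagnostics HTML file
    if elapsed is not None:
        print(f"rechunking took {elapsed:.2f} s")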