def keep_reading(B, I, R):
    """ Count the input block openings and seeks needed to fill the read buffers.

    Arguments:
    ----------
        B: read buffer shape
        I: inblock shape
        R: original image shape
    """
    buffer_partition = get_blocks_shape(R, B)
    read_buffers = get_named_volumes(buffer_partition, B)
    infiles_partition = get_blocks_shape(R, I)
    inblocks = get_named_volumes(infiles_partition, I)

    nb_inblocks_openings = 0
    nb_inblocks_seeks = 0
    for buffer_index in sorted(read_buffers.keys()):
        read_buffer = read_buffers[buffer_index]

        for inblock in inblocks.values():
            if hypercubes_overlap(read_buffer, inblock):
                nb_inblock_seeks_tmp = write_buffer(read_buffer, inblock, I)
                nb_inblocks_seeks += nb_inblock_seeks_tmp
                nb_inblocks_openings += 1

    print(f"[Reality] Number inblocks opening: {nb_inblocks_openings}")
    print(f"[Reality] Number inblocks seeks: {nb_inblocks_seeks}")
    return nb_inblocks_openings + nb_inblocks_seeks
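# For reference, a minimal sketch of what get_blocks_shape is assumed to compute: the
# number of blocks of shape `block_shape` along each axis of an array of shape
# `array_shape` (ceil division, in case the shapes do not divide evenly). This is a
# hypothetical re-implementation for illustration, not the library's actual code.
import math

def get_blocks_shape_sketch(array_shape, block_shape):
    """Return the number of blocks along each of the 3 axes."""
    return tuple(math.ceil(a / b) for a, b in zip(array_shape, block_shape))

# Example: a (4, 4, 4) image cut into (2, 2, 2) blocks gives a (2, 2, 2) partition.
assert get_blocks_shape_sketch((4, 4, 4), (2, 2, 2)) == (2, 2, 2)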
def baseline_rechunk(O, I, R):
    """ Count the file openings and seeks performed by the naive (baseline) rechunk,
    without doing any actual I/O.

    Arguments:
    ----------
        O: outblock shape (tuple)
        I: inblock shape (tuple)
        R: original image shape (tuple)
    """
    infiles_partition = get_blocks_shape(R, I)
    inblocks = get_named_volumes(infiles_partition, I)
    outfiles_partition = get_blocks_shape(R, O)
    outblocks = get_named_volumes(outfiles_partition, O)

    nb_infile_openings = 0
    nb_infile_seeks = 0
    nb_outfile_openings = 0
    nb_outfile_seeks = 0

    for buffer_index in sorted(inblocks.keys()):
        read_buffer = inblocks[buffer_index]
        nb_infile_openings += 1

        for outblock in outblocks.values():
            if hypercubes_overlap(read_buffer, outblock):
                nb_outfile_seeks_tmp = write_buffer(read_buffer, outblock, O)
                nb_outfile_seeks += nb_outfile_seeks_tmp
                nb_outfile_openings += 1

    return [
        nb_outfile_openings,
        nb_outfile_seeks,
        nb_infile_openings,
        nb_infile_seeks
    ]
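# Hypothetical usage of the two cost models above on a toy configuration. The concrete
# counts depend on write_buffer's seek model, so nothing is asserted about them here.
def _demo_cost_models():
    R, I, O, B = (4, 4, 4), (2, 2, 2), (4, 4, 1), (4, 4, 2)
    read_cost = keep_reading(B, I, R)
    out_openings, out_seeks, in_openings, in_seeks = baseline_rechunk(O, I, R)
    print("keep_reading cost:", read_cost)
    print("baseline model:", out_openings, out_seeks, in_openings, in_seeks)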
def create_case(args):
    """ Create the input data of an experiment case. """
    paths = load_json(args.paths_config)
    for k, v in paths.items():
        if "PYTHONPATH" in k:
            sys.path.insert(0, v)

    from repartition_experiments.scripts_exp.exp_utils import create_empty_dir, create_input_chunks, create_input_chunks_distributed
    from repartition_experiments.algorithms.clustered_writes import clustered_writes
    from repartition_experiments.algorithms.utils import get_file_manager, get_blocks_shape

    # preprocessing
    fm = get_file_manager(args.file_format)
    R_stringlist, I_stringlist = args.R.split('_'), args.I.split('_')
    R, I = tuple(map(lambda e: int(e), R_stringlist)), tuple(map(lambda e: int(e), I_stringlist))
    print(R, I)
    indir_path, outdir_path = os.path.join(paths["ssd_path"], 'indir'), os.path.join(paths["ssd_path"], 'outdir')
    partition = get_blocks_shape(R, I)

    if args.distributed:
        # only create the input blocks, without creating the big image first and splitting it,
        # and store the chunks in a round-robin fashion on the different disks of the cluster
        create_input_chunks_distributed(I, partition, indir_path, args.file_format)
        return

    if not args.splits_only:
        # create the input image, then split it
        origarr_filepath = create_input_file(R, paths["ssd_path"], fm)
        print("creating input file...", origarr_filepath)
        bpv = 2
        R_size = R[0] * R[1] * R[2] * bpv
        create_empty_dir(indir_path)
        create_empty_dir(outdir_path)
        clustered_writes(origarr_filepath, R, I, bpv, R_size, args.file_format, indir_path)
    else:
        # only create the input blocks, without creating the big image first and splitting it
        create_input_chunks(I, partition, indir_path, args.file_format)
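# A minimal sketch of the argparse namespace create_case expects. The argument names are
# inferred from the attributes used above (args.paths_config, args.R, args.I, ...); the
# actual experiment scripts may define them differently.
import argparse

def _build_create_case_parser():
    parser = argparse.ArgumentParser(description="Create the input data of a rechunking case.")
    parser.add_argument("paths_config", help="JSON file with paths (ssd_path, PYTHONPATH entries, ...)")
    parser.add_argument("R", help="original image shape, underscore-separated, e.g. '400_400_400'")
    parser.add_argument("I", help="input block shape, underscore-separated, e.g. '100_100_100'")
    parser.add_argument("file_format", help="e.g. 'HDF5'")
    parser.add_argument("--splits_only", action="store_true",
                        help="create the input blocks directly, without the big image")
    parser.add_argument("--distributed", action="store_true",
                        help="spread the input blocks over the disks of the cluster")
    return parser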
def verify_results(outdir_path, original_array_path, R, O, file_format, addition, split_merge=False):
    """ Compare the content of each output file against the expected subarray of the original array.

    WARNING: this function opens all output files + the original array.
    """
    if file_format == "HDF5":
        file_manager = HDF5_manager()
    else:
        print("File format not supported yet. Aborting...")
        sys.exit(1)

    partition = get_blocks_shape(R, O)
    orig_arr_data = file_manager.read_all(original_array_path)
    all_true = True

    if split_merge:
        result_arrpath = os.path.join(outdir_path, "0_0_0.hdf5")
        return file_manager.check_split_merge(original_array_path, result_arrpath)

    for i in range(partition[0]):
        for j in range(partition[1]):
            for k in range(partition[2]):
                outfilepath = os.path.join(outdir_path, str(i) + "_" + str(j) + "_" + str(k) + ".hdf5")
                data_stored = file_manager.read_all(outfilepath)
                ground_truth = orig_arr_data[i*O[0]:(i+1)*O[0], j*O[1]:(j+1)*O[1], k*O[2]:(k+1)*O[2]]

                if addition:
                    ground_truth = ground_truth + 1

                try:
                    assert np.allclose(data_stored, ground_truth, rtol=1e-02)
                    # print(f"Good output file {outfilepath}")
                except AssertionError:
                    print(f"Error: bad rechunking {outfilepath}")
                    print(f"Slices from ground truth {i*O[0]}:{(i+1)*O[0]}, {j*O[1]}:{(j+1)*O[1]}, {k*O[2]}:{(k+1)*O[2]}")
                    print("data_stored", data_stored)
                    print("ground_truth", ground_truth)
                    all_true = False  # do not return here, so that all failures are reported

    file_manager.close_infiles()  # close all files
    return all_true
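# Hypothetical post-run check: verify the output blocks written by a rechunk against the
# original array. Paths and shapes are placeholders.
def _demo_verify():
    ok = verify_results(
        outdir_path="/tmp/outdir",
        original_array_path="/tmp/original_array.hdf5",
        R=(400, 400, 400),
        O=(100, 100, 100),
        file_format="HDF5",
        addition=False)
    print("rechunk output is correct:", ok)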
def clustered_writes(origarr_filepath, R, cs, bpv, m, ff, outdir_path):
    """ Implementation of the clustered strategy for splitting a 3D array.

    Output file names follow the pattern: outdir_path/{i}_{j}_{k}.extension

    WARNING: this implementation loads the whole input array in RAM. We had 250GB of RAM
    for our experiments so we decided to use it.

    Arguments:
    ----------
        origarr_filepath: path to the original array
        R: original array shape
        cs: chunk shape
        bpv: number of bytes per voxel
        m: memory available for the buffer
        ff: file format
        outdir_path: where to write the splits
    """
    strategies = {
        0: "blocks",
        1: "block_rows",
        2: "block_slices"
    }

    file_manager = get_file_manager(ff)
    partition = get_blocks_shape(R, cs)
    bs, brs, bss = get_entity_sizes(cs, bpv, partition)
    strategy = get_strategy(m, bs, brs, bss)

    origarr_size = R[0] * R[1] * R[2] * bpv
    buffers = compute_buffers(m, strategy, origarr_size, cs, bs, brs, bss, partition, R, bpv)

    origarr = file_manager.get_dataset(origarr_filepath, '/data')
    for buffer_index in range(len(buffers)):
        buffer = buffers[buffer_index]
        buffer_data = read_buffer(origarr, file_manager, buffer)
        write_splits(file_manager, buffer, buffer_data, cs, outdir_path)

    file_manager.close_infiles()
    get_opened_files()
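# A plausible reading of get_strategy, for illustration only: pick the largest entity
# (block slice > block row > single block) whose size fits in the memory budget m, so that
# each buffer read covers as many output chunks as possible. The real implementation lives
# in repartition_experiments.algorithms and may differ.
def get_strategy_sketch(m, bs, brs, bss):
    """m: memory budget in bytes; bs/brs/bss: sizes of a block, a block row, a block slice."""
    if bss <= m:
        return 2  # "block_slices"
    if brs <= m:
        return 1  # "block_rows"
    if bs <= m:
        return 0  # "blocks"
    raise ValueError("Not enough memory to buffer even a single block.")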
def write_splits(file_manager, buffer, buffer_data, cs, outdir_path):
    p1, p2 = buffer.get_corners()
    first_index = (p1[0] / cs[0], p1[1] / cs[1], p1[2] / cs[2])
    buffer_shape = (p2[0] - p1[0], p2[1] - p1[1], p2[2] - p1[2])
    buff_partition = get_blocks_shape(buffer_shape, cs)

    _3d_index = first_index
    for i in range(buff_partition[0]):
        for j in range(buff_partition[1]):
            for k in range(buff_partition[2]):
                split_data = buffer_data[
                    i * cs[0]:(i+1) * cs[0],
                    j * cs[1]:(j+1) * cs[1],
                    k * cs[2]:(k+1) * cs[2]]

                region = ((0, cs[0]), (0, cs[1]), (0, cs[2]))
                file_manager.write_data(
                    int(_3d_index[0] + i),
                    int(_3d_index[1] + j),
                    int(_3d_index[2] + k),
                    outdir_path, split_data, region, cs)
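# Worked example of the index arithmetic above, with hypothetical values: a buffer whose
# corners are p1 = (0, 0, 200) and p2 = (100, 100, 400), split into chunks of shape
# cs = (100, 100, 100), has buff_partition = (1, 1, 2) and starts at output-grid index
# (0, 0, 2); its two splits therefore land in files "0_0_2.<ext>" and "0_0_3.<ext>".
p1_example, cs_example = (0, 0, 200), (100, 100, 100)
first_index_example = tuple(int(p / c) for p, c in zip(p1_example, cs_example))
assert first_index_example == (0, 0, 2)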
def baseline_rechunk(indir_path, outdir_path, O, I, R, file_format, addition, distributed, debug_mode=False, clean_out_dir=False, dont_write=False):
    """ Naive rechunk implementation in plain Python.

    The input directory is supposed to contain the input files (output of the split process).

    WARNING: does not clean the output directory after use by default.
    """
    print("Setting arguments...")
    global DEBUG_LOCAL
    global DONT_WRITE
    global tracker
    global outdirs_dict, outdir_index

    outdirs_dict = dict()
    outdir_index = 0
    tracker = Tracker()
    DEBUG_LOCAL = True if debug_mode else False
    DONT_WRITE = True if dont_write else False
    print("Addition mode:", addition)
    print("DONT_WRITE: ", DONT_WRITE)

    O, I, R = tuple(O), tuple(I), tuple(R)
    file_manager = get_file_manager(file_format)
    infiles_partition = get_blocks_shape(R, I)
    infiles_volumes = get_named_volumes(infiles_partition, I)
    outfiles_partition = get_blocks_shape(R, O)
    outfiles_volumes = get_named_volumes(outfiles_partition, O)
    outfiles_volumes = outfiles_volumes.values()

    if distributed:
        repartition_dict = None
        json_filename = '/disk0/gtimothee/repartition_dict.json'
        if not os.path.isfile(json_filename):
            # print("cannot find association dict json file")
            sys.exit(1)
        else:
            pass  # print("json file found")

        try:
            with open(json_filename) as f:
                repartition_dict = json.load(f)
        except Exception as e:
            print(e)
            # print("error (1)")
            sys.exit(1)

        if repartition_dict is None:
            # print("error (2)")
            sys.exit(1)
        else:
            pass  # print(f"Found repartition dict: {repartition_dict}")

        input_files = repartition_dict.values()
    else:
        input_files = file_manager.get_input_files(indir_path)

    t_read = 0
    t_write = 0
    vols_written = list()
    nb_infile_openings = 0
    nb_infile_seeks = 0
    nb_outfile_openings = 0
    nb_outfile_seeks = 0

    buffer_index = 1
    for input_file in input_files:
        print(f"Treating buffer: {buffer_index}...")
        buffer_index += 1
        nb_infile_openings += 1

        involume = get_volume(input_file, infiles_volumes, infiles_partition)

        t1 = time.time()
        if not DONT_WRITE:
            data = file_manager.read_data_from_fp(input_file, slices=None)
        else:
            data = None
        t1 = time.time() - t1
        t_read += t1

        for outvolume in outfiles_volumes:
            if hypercubes_overlap(involume, outvolume):
                shape, t2, nb_outfile_seeks_tmp = write_to_outfile(
                    involume, outvolume, data, outfiles_partition,
                    outdir_path, O, file_manager, addition, tracker)
                t_write += t2
                vols_written.append(shape)
                # nb_outfile_openings += 1  already included in nb_outfile_seeks
                nb_outfile_seeks += nb_outfile_seeks_tmp

    file_manager.close_infiles()

    if DONT_WRITE:
        assert tracker.is_complete(((0, 0, 0), R))

    # print("\nShapes written:")
    # for row in vols_written:
    #     print(row)

    if clean_out_dir:
        print("Cleaning output directory")
        file_manager.clean_directory(outdir_path)

    get_opened_files()
    return t_read, t_write, [
        nb_outfile_openings,
        nb_outfile_seeks,
        nb_infile_openings,
        nb_infile_seeks
    ]
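# Hypothetical end-to-end call of the naive rechunk above: repartition (100, 100, 100)-shaped
# input blocks of a (400, 400, 400) image into (400, 400, 100)-shaped output blocks. Paths
# are placeholders; timings and seek counts depend on the file manager and the machine.
def _demo_baseline_rechunk():
    t_read, t_write, counts = baseline_rechunk(
        indir_path="/tmp/indir",
        outdir_path="/tmp/outdir",
        O=(400, 400, 100), I=(100, 100, 100), R=(400, 400, 400),
        file_format="HDF5",
        addition=False,
        distributed=False,
        clean_out_dir=True)
    nb_outfile_openings, nb_outfile_seeks, nb_infile_openings, nb_infile_seeks = counts
    print(f"read: {t_read:.2f}s, write: {t_write:.2f}s, seeks: {nb_outfile_seeks + nb_infile_seeks}")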