def test_r_hat():
    '''
    get_r_hat on a (3500, 3500, 3500) array with (875, 875, 875) input
    blocks and (700, 875, 700) output blocks must return (875, 875, 875).
    '''
    full = Partition((3500, 3500, 3500), name='array')
    source = Partition((875, 875, 875), array=full, name='in')
    target = Partition((700, 875, 700), array=full, name='out')
    expected = (875, 875, 875)
    assert keep.get_r_hat(source, target) == expected
def test_r_hat_1():
    '''
    get_r_hat with a single (20, 20, 20) input block and (20, 10, 2)
    output blocks must return the input block shape, (20, 20, 20).
    '''
    full = Partition((20, 20, 20), name='array')
    source = Partition((20, 20, 20), array=full, name='in')
    target = Partition((20, 10, 2), array=full, name='out')
    expected = (20, 20, 20)
    assert keep.get_r_hat(source, target) == expected
def test_find_shape_with_constraint():
    '''
    find_shape_with_constraint must return the expected (shape, memory)
    pair both without a memory limit and with a limit of 3000.
    '''
    array = Partition((100, 100, 100), name='array', fill='random')
    # NOTE(review): the input partition is named 'out', same as the output
    # partition below. Looks like a copy-paste slip ('in' expected) --
    # confirm before renaming, as the name may matter to Partition.
    in_blocks = Partition((10, 10, 10), name='out', array=array)
    out_blocks = Partition((50, 50, 50), name='out', array=array)

    # (memory limit, expected (shape, memory) result)
    cases = (
        (None, ((50, 50, 50), 125000)),
        (3000, ((1, 50, 50), 2500)),
    )
    for limit, expected in cases:
        read_shape, memory = keep.find_shape_with_constraint(
            in_blocks, out_blocks, limit)
        assert (read_shape, memory) == expected
def test_seek_model():
    '''
    Check the seek counts reported by keep.keep and keep.baseline on a
    (2, 2, 2) array split into (2, 1, 2) blocks.
    '''
    array = Partition((2, 2, 2), name='array', fill='random')
    out_blocks = Partition((2, 1, 2), name='out', array=array)

    # Seek count is the third element of the returned 4-tuple.
    _, _, keep_seeks, _ = keep.keep(array, out_blocks, None, array)
    assert keep_seeks == 3

    # Here the (2, 1, 2) partition is the input and the array is the output.
    _, _, baseline_seeks, _ = keep.baseline(out_blocks, array, None, array)
    assert baseline_seeks == 6
def test_get_f_blocks_1():
    '''
    get_F_blocks on the single (12, 12, 12) array block against a
    (4, 4, 4) partition must yield one full-size block followed by
    seven None entries.
    '''
    array = Partition((12, 12, 12), name='array', fill='random')
    in_blocks = Partition((4, 4, 4), name='in', array=array)

    f_blocks = keep.get_F_blocks(array.blocks[(0, 0, 0)], in_blocks)
    rendered = [str(block) for block in f_blocks]

    expected = (
        ['Block: origin (0, 0, 0); shape (12, 12, 12); data in mem: 0B']
        + ['None'] * 7
    )
    assert rendered == expected
def test_partition_to_end_coords():
    '''
    partition_to_end_coords must return, for each of the three axes, the
    list of last indices of the blocks along that axis.
    '''
    # 12^3 array in 4^3 blocks: blocks end at 3, 7, 11 on every axis.
    side = 12
    array = Partition((side, side, side), name='array')
    in_blocks = Partition((4, 4, 4), name='in', array=array)
    ends = [3, 7, 11]
    assert keep.partition_to_end_coords(in_blocks) == (ends, ends, ends)

    # 3500^3 array in 500^3 blocks: blocks end at 499, 999, ..., 3499.
    side = 3500
    array = Partition((side, side, side), name='array')
    in_blocks = Partition((500, 500, 500), name='in', array=array)
    ends = list(range(499, 3500, 500))
    assert keep.partition_to_end_coords(in_blocks) == (ends, ends, ends)
def test_repartition_baseline_2(cleanup_blocks):
    '''
    Round trip with the baseline algorithm: one (10, 20, 30) block is
    split into (10, 10, 15) blocks, merged back into one block, and the
    bytes of the merged block must equal those of the initial block.
    '''
    origin = (0, 0, 0)

    def load_bytes(partition):
        # Read the block at the origin, then return its data as bytes.
        block = partition.blocks[origin]
        block.read()
        return block.data.bytes()

    array = Partition((10, 20, 30), name='array')
    in_blocks = Partition((10, 20, 30), name='in', array=array, fill='random')
    in_data = load_bytes(in_blocks)

    # Split...
    out_blocks = Partition((10, 10, 15), name='out', array=array)
    in_blocks.repartition(out_blocks, None, keep.baseline)

    # ...and merge back.
    rein_blocks = Partition((10, 20, 30), name='rein', array=array)
    out_blocks.repartition(rein_blocks, None, keep.baseline)

    assert load_bytes(rein_blocks) == in_data
def test_repartition_keep_3(cleanup_blocks):
    '''
    Chain array -> in -> out repartitions with the keep algorithm, first
    on a 6^3 array (run only, nothing asserted), then on a 12^3 array
    where the out blocks are merged back into a single block whose bytes
    must equal those of the array block.
    '''
    def array_to_in_to_out(array_side, in_side, out_side):
        # Create a random cubic array, repartition it into cubic input
        # blocks, then repartition those into cubic output blocks.
        array = Partition((array_side,) * 3, name='array', fill='random')
        array.clear()
        in_blocks = Partition((in_side,) * 3, name='in', array=array)
        array.repartition(in_blocks, None, keep.keep)
        in_blocks.clear()
        out_blocks = Partition((out_side,) * 3, name='out', array=array)
        in_blocks.repartition(out_blocks, None, keep.keep)
        return array, out_blocks

    array_to_in_to_out(6, 3, 2)
    array, out_blocks = array_to_in_to_out(12, 4, 3)

    rein_blocks = Partition((12, 12, 12), name='rein', array=array)
    out_blocks.repartition(rein_blocks, None, keep.keep)

    rein_block = rein_blocks.blocks[(0, 0, 0)]
    rein_block.read()
    rein_data = rein_block.data.bytes()

    array_block = array.blocks[(0, 0, 0)]
    array_block.read()
    array_data = array_block.data.bytes()

    assert rein_data == array_data
def test_repartition_keep_1(cleanup_blocks):
    '''
    Split a (5, 6, 7) array into two (5, 3, 7) blocks with the keep
    algorithm, check the first and last 20 bytes, then merge the blocks
    back and check that the full content is identical to the array.
    '''
    def data_of(partition, origin):
        # Bytes currently held by the block of `partition` at `origin`.
        return partition.blocks[origin].data.bytes()

    array = Partition((5, 6, 7), name='array', fill='random')
    out_blocks = Partition((5, 3, 7), name='out', array=array)
    array.repartition(out_blocks, None, keep.keep)

    array.blocks[(0, 0, 0)].read()
    out_blocks.blocks[(0, 0, 0)].read()
    out_blocks.blocks[(0, 3, 0)].read()

    # The array starts like the first out block...
    assert data_of(array, (0, 0, 0))[:20] == data_of(out_blocks, (0, 0, 0))[:20]
    # ...and ends like the second one.
    assert (data_of(array, (0, 0, 0))[-20:]
            == data_of(out_blocks, (0, 3, 0))[-20:])

    # Merge back into a single block and compare everything.
    rein_blocks = Partition((5, 6, 7), name='rein')
    out_blocks.repartition(rein_blocks, None, keep.keep)
    rein_blocks.blocks[(0, 0, 0)].read()
    assert data_of(array, (0, 0, 0)) == data_of(rein_blocks, (0, 0, 0))
def test_repartition_keep(cleanup_blocks):
    '''
    Split a (2, 2, 2) array into two (2, 1, 2) blocks with the keep
    algorithm and check where each 2-byte run of the array ends up, then
    merge the blocks back and check that the result matches the array.
    '''
    array = Partition((2, 2, 2), name='array', fill='random')
    out_blocks = Partition((2, 1, 2), name='out', array=array)
    array.repartition(out_blocks, None, keep.keep)

    array.blocks[(0, 0, 0)].read()
    out_blocks.blocks[(0, 0, 0)].read()
    out_blocks.blocks[(0, 1, 0)].read()

    # Successive 2-byte runs of the array alternate between the two
    # output blocks.
    assert (array.blocks[(0, 0, 0)].data.bytes()[:2]
            == out_blocks.blocks[(0, 0, 0)].data.bytes()[:2])
    assert (array.blocks[(0, 0, 0)].data.bytes()[2:4]
            == out_blocks.blocks[(0, 1, 0)].data.bytes()[:2])
    assert (array.blocks[(0, 0, 0)].data.bytes()[4:6]
            == out_blocks.blocks[(0, 0, 0)].data.bytes()[2:4])
    assert (array.blocks[(0, 0, 0)].data.bytes()[6:8]
            == out_blocks.blocks[(0, 1, 0)].data.bytes()[2:4])

    # Merge back into a single block.
    rein_blocks = Partition((2, 2, 2), name='rein')
    out_blocks.repartition(rein_blocks, None, keep.keep)
    rein_blocks.blocks[(0, 0, 0)].read()

    # The merged block was previously read but never checked; verify the
    # round trip like test_repartition_keep_1 does.
    assert (array.blocks[(0, 0, 0)].data.bytes()
            == rein_blocks.blocks[(0, 0, 0)].data.bytes())
def test_get_f_blocks():
    '''
    get_F_blocks on the first (4, 4, 4) input block against a (3, 3, 3)
    output partition must return eight blocks with the origins and
    shapes listed below.
    '''
    array = Partition((12, 12, 12), name='array', fill='random')
    in_blocks = Partition((4, 4, 4), name='in', array=array)
    out_blocks = Partition((3, 3, 3), name='out', array=array)

    f_blocks = keep.get_F_blocks(in_blocks.blocks[(0, 0, 0)], out_blocks)
    rendered = [str(block) for block in f_blocks]

    expected = [
        'Block: origin (0, 0, 0); shape (3, 3, 3); data in mem: 0B',
        'Block: origin (0, 0, 3); shape (3, 3, 1); data in mem: 0B',
        'Block: origin (0, 3, 0); shape (3, 1, 3); data in mem: 0B',
        'Block: origin (0, 3, 3); shape (3, 1, 1); data in mem: 0B',
        'Block: origin (3, 0, 0); shape (1, 3, 3); data in mem: 0B',
        'Block: origin (3, 0, 3); shape (1, 3, 1); data in mem: 0B',
        'Block: origin (3, 3, 0); shape (1, 1, 3); data in mem: 0B',
        'Block: origin (3, 3, 3); shape (1, 1, 1); data in mem: 0B',
    ]
    assert rendered == expected
def baseline(in_blocks, out_blocks, m, array):
    '''
    Baseline implementation of
    get_read_blocks_and_cache(in_blocks, out_blocks, m, array), as used
    in Partition.repartition: read blocks have the shape of the input
    blocks.

    Arguments:
        in_blocks: input partition, to be repartitioned
        out_blocks: output partition, to be written to disk
        m: max memory to be used by the repartitioning. Ignored by this
           baseline implementation; present only so that the signature
           matches what Partition.repartition expects.
        array: partitioned array, passed on to the read-block partition.

    Returns:
        A 4-tuple (read_blocks, cache, seeks, n) where read_blocks is a
        new partition with the shape of in_blocks, cache is a
        BaselineCache, seeks comes from baseline_seek_count and n is the
        product of the dimensions of in_blocks.shape.
    '''
    # TODO: creating a new partition makes memory estimates correct,
    # but it adds an in-memory copy, this could be fixed
    read_blocks = Partition(in_blocks.shape, 'read_blocks', array)
    cache = BaselineCache()
    seeks = baseline_seek_count(in_blocks, out_blocks)
    block_volume = math.prod(in_blocks.shape)
    return read_blocks, cache, seeks, block_volume
def keep(in_blocks, out_blocks, m, array):
    '''
    Keep heuristic (Algorithm 2 in the paper) implementing
    get_read_blocks_and_cache(in_blocks, out_blocks, m, array), as used
    in Partition.repartition.

    Arguments:
        in_blocks: input partition, to be repartitioned
        out_blocks: output partition, to be written to disk
        m: max memory to be used by the repartitioning. If None, memory
           constraint is ignored.
        array: partitioned array. Doesn't need to contain data, used just
               to get total dimensions of the array.

    Returns:
        A 4-tuple (read_blocks, cache, seeks, peak_mem).
    '''
    read_shape, peak_mem = find_shape_with_constraint(
        in_blocks, out_blocks, m)
    read_blocks = Partition(read_shape, 'read_blocks', array=array)
    write_blocks, cache = create_write_blocks(read_blocks, out_blocks)
    return (
        read_blocks,
        cache,
        # Technically this count is not necessary
        keep_seek_count(in_blocks, read_blocks, write_blocks, out_blocks),
        peak_mem,
    )
def create_write_blocks(read_blocks, out_blocks):
    '''
    Build the write blocks and the cache associated with a partition of
    read blocks.

    Each read block is cut into up to 8 F blocks (get_F_blocks). F0
    stays with its own read block; every other non-None F block is
    assigned to the read block index given by destination_F0. The F
    blocks gathered for each read block are then merged into one write
    block.

    Arguments:
        read_blocks: partition
        out_blocks: partition

    Returns:
        (write_blocks, cache): a non-uniform partition holding the merged
        blocks keyed by origin, and a KeepCache built from out_blocks and
        the mapping (read block key, F index) -> merged block.
    '''
    # One bucket of F blocks per read block, in iteration order.
    buckets = [[] for _ in range(len(read_blocks.blocks))]
    # (read block key, F index) -> index of the bucket it was put in.
    bucket_of = {}

    for index, key in enumerate(read_blocks.blocks):
        pieces = get_F_blocks(read_blocks.blocks[key], out_blocks,
                              get_data=False)
        # don't move F0
        buckets[index].append(pieces[0])
        bucket_of[(key, 0)] = index
        for f in range(1, 8):
            piece = pieces[f]
            if piece is None:
                continue
            target = destination_F0(read_blocks, index, f)
            buckets[target].append(piece)
            bucket_of[(key, f)] = target

    merged_blocks = [merge_blocks(bucket) for bucket in buckets]
    match = {pair: merged_blocks[target]
             for pair, target in bucket_of.items()}
    by_origin = {merged.origin: merged for merged in merged_blocks}

    # Warning: write_blocks are a partition but a non-uniform one
    # This may have side effects. This is also the reason for the
    # weird create_blocks param
    write_blocks = Partition((1, 1, 1), name='write_blocks',
                             array=read_blocks.array, create_blocks=False)
    write_blocks.blocks = by_origin

    return write_blocks, KeepCache(out_blocks, match)
def peak_memory(read_shape, in_blocks, out_blocks):
    '''
    Estimate the amount of memory required to repartition in_blocks into
    out_blocks when reading with blocks of shape read_shape.

    The estimate is obtained from a dry run of the repartitioning;
    in_blocks and out_blocks are cleared afterwards.

    Arguments:
        read_shape: shape of the read blocks
        in_blocks: input partition
        out_blocks: output partition

    Returns:
        The third element of the tuple returned by the dry-run
        repartition (the peak memory).
    '''
    read_blocks = Partition(read_shape, 'read_blocks',
                            array=in_blocks.array)
    _write_blocks, cache = create_write_blocks(read_blocks, out_blocks)

    def precomputed_read_blocks_and_cache(in_blocks, out_blocks, m, array):
        # Hand back the read blocks and cache built above, whatever the
        # arguments; seek count and memory are not needed here.
        return read_blocks, cache, None, None

    dry_run_result = in_blocks.repartition(
        out_blocks, None, precomputed_read_blocks_and_cache, dry_run=True)
    (_, _, peak_mem, _, _) = dry_run_result

    in_blocks.clear()
    out_blocks.clear()
    return peak_mem
def test_partition_clear(cleanup_blocks):
    '''
    Smoke test: clear() on a partition created with fill='random' must
    not raise.
    '''
    partition = Partition((12, 12, 12), name='array', fill='random')
    partition.clear()
def main(args=None):
    '''
    Command-line entry point.

    Positional arguments are the array shape (A), the input block shape
    (I), the output block shape (O) and the repartitioning method
    ("baseline" or "keep"). At most one of --create, --repartition,
    --delete and --test-data may be given. Shapes are converted with
    make_tuple.

    Arguments:
        args: list of command-line arguments; if None, argparse falls
              back to sys.argv. Unrecognized arguments are ignored.
    '''
    parser = ArgumentParser()
    parser.add_argument("A", action="store",
                        help="shape of the reconstructed array")
    parser.add_argument(
        "I",
        action="store",
        help="shape of the input blocks. Input blocks "
        "called 'in...' must be stored on disk",
    )
    parser.add_argument(
        "O",
        action="store",
        help="shape of the output blocks. Output blocks"
        " called 'out...' will be created on disk",
    )
    commands = parser.add_mutually_exclusive_group()
    commands.add_argument(
        "--create",
        action="store_true",
        help="create input blocks on disk" " before repartitioning.",
    )
    commands.add_argument(
        "--repartition",
        action="store_true",
        help="repartition input blocks to output block dimensions",
    )
    commands.add_argument(
        "--delete",
        action="store_true",
        help="delete output blocks after repartitioning.",
    )
    commands.add_argument(
        "--test-data",
        action="store_true",
        help="reconstruct array from input blocks, "
        "reconstruct array from output blocks, "
        "check that data is identical in both "
        "reconstructions.",
    )
    # type=int makes argparse report a usage error on a malformed value
    # instead of letting a ValueError escape later.
    parser.add_argument(
        "--max-mem",
        action="store",
        type=int,
        help="max memory to use, in bytes",
    )
    parser.add_argument(
        "method",
        action="store",
        help="repartitioning method to use",
        choices=["baseline", "keep"],
    )

    # Unknown extra arguments are tolerated and discarded.
    args, _ = parser.parse_known_args(args)
    mem = args.max_mem  # int, or None when --max-mem is not given

    repart_func = {"baseline": keep.baseline, "keep": keep.keep}

    array = Partition(make_tuple(args.A), name="array")

    if args.create:
        fill = "random"
        log("Creating input blocks", 1)
    else:
        fill = None
        log("Using existing input blocks", 1)
    in_blocks = Partition(make_tuple(args.I), name="in", array=array,
                          fill=fill)
    in_blocks.clear()

    # The options are mutually exclusive, so out_blocks is always defined
    # when one of the branches below runs.
    if not args.create:
        out_blocks = Partition(make_tuple(args.O), name="out", array=array)

    # Repartitioning
    if args.repartition:
        log("Repartitioning input blocks into output blocks", 1)
        out_blocks.delete()
        out_blocks.clear()  # shouldn't be necessary but just in case
        start = time.time()
        (
            total_bytes,
            seeks,
            peak_mem,
            read_time,
            write_time,
        ) = in_blocks.repartition(out_blocks, mem, repart_func[args.method])
        end = time.time()
        total_time = end - start
        # Sanity checks on the figures returned by repartition.
        assert total_time > read_time + write_time
        assert total_bytes == 2 * math.prod(array.shape)
        log(
            f"Seeks, peak memory (B), read time (s),"
            f" write time (s), elapsed time (s):"
            + os.linesep
            + f"{seeks},{peak_mem},{round(read_time,2)},"
            f"{round(write_time,2)},{round(total_time,2)}",
            2,
        )

    if args.test_data:
        log("Testing data", 1)
        # Rebuild the array from the input blocks, then from the output
        # blocks, and compare the two files byte for byte.
        in_blocks.repartition(array, mem, repart_func[args.method])
        with open(array.blocks[(0, 0, 0)].file_name, "rb") as f:
            in_data = f.read()
        array.delete()
        out_blocks.repartition(array, mem, repart_func[args.method])
        with open(array.blocks[(0, 0, 0)].file_name, "rb") as f:
            out_data = f.read()
        assert in_data == out_data

    if args.delete:
        log("Deleting output blocks", 1)
        out_blocks.delete()
def test_r_hat_2():
    '''
    get_r_hat must raise when called with (2, 2, 2) input blocks and
    (5, 5, 5) output blocks on a (10, 10, 10) array.
    '''
    array = Partition((10, 10, 10), name='array')
    in_blocks = Partition((2, 2, 2), array=array, name='in')
    out_blocks = Partition((5, 5, 5), array=array, name='out')
    with pytest.raises(Exception):
        # Only the exception matters; the return value is not bound.
        keep.get_r_hat(in_blocks, out_blocks)