def test_r_hat():
    '''
    get_r_hat on a 3500^3 array split into 875^3 input blocks and
    (700, 875, 700) output blocks is expected to return (875, 875, 875).
    '''
    whole = Partition((3500, 3500, 3500), name='array')
    source = Partition((875, 875, 875), array=whole, name='in')
    target = Partition((700, 875, 700), array=whole, name='out')

    expected = (875, 875, 875)
    assert keep.get_r_hat(source, target) == expected
def test_r_hat_1():
    '''
    get_r_hat with a single 20^3 input block and (20, 10, 2) output blocks
    is expected to return the input block shape, (20, 20, 20).
    '''
    whole = Partition((20, 20, 20), name='array')
    source = Partition((20, 20, 20), array=whole, name='in')
    target = Partition((20, 10, 2), array=whole, name='out')

    expected = (20, 20, 20)
    assert keep.get_r_hat(source, target) == expected
def test_find_shape_with_constraint():
    '''
    Check find_shape_with_constraint with and without a memory limit.

    For a 100^3 array going from 10^3 input blocks to 50^3 output blocks:
      - with no limit (None), the expected result is shape (50, 50, 50)
        and a memory value of 125000;
      - with a limit of 3000, the expected result is shape (1, 50, 50)
        and a memory value of 2500.
    '''
    array = Partition((100, 100, 100), name='array', fill='random')
    # Input and output partitions carry distinct names ('in' / 'out'), as in
    # the other tests, so the two partitions cannot be mistaken for one
    # another.
    in_blocks = Partition((10, 10, 10), name='in', array=array)
    out_blocks = Partition((50, 50, 50), name='out', array=array)

    shape, mc = keep.find_shape_with_constraint(in_blocks, out_blocks, None)
    assert (shape, mc) == ((50, 50, 50), 125000)

    shape, mc = keep.find_shape_with_constraint(in_blocks, out_blocks, 3000)
    assert (shape, mc) == ((1, 50, 50), 2500)
def test_seek_model():
    '''
    Compare the seek counts returned (third element of the result) by
    keep.keep and keep.baseline on a 2x2x2 array and (2, 1, 2) blocks.
    '''
    whole = Partition((2, 2, 2), name='array', fill='random')
    halves = Partition((2, 1, 2), name='out', array=whole)

    # array -> (2, 1, 2) blocks with the keep heuristic
    _reads, _cache, keep_seeks, _mem = keep.keep(whole, halves, None, whole)
    assert keep_seeks == 3

    # (2, 1, 2) blocks -> array with the baseline
    _reads, _cache, baseline_seeks, _mem = keep.baseline(
        halves, whole, None, whole)
    assert baseline_seeks == 6
def test_get_f_blocks_1():
    '''
    get_F_blocks applied to the single block of a 12^3 array against a
    4^3 partition: only the first of the 8 entries is expected to be a
    block (covering the whole array), the 7 others are expected to be None.
    '''
    whole = Partition((12, 12, 12), name='array', fill='random')
    cubes = Partition((4, 4, 4), name='in', array=whole)

    f_blocks = keep.get_F_blocks(whole.blocks[(0, 0, 0)], cubes)

    expected = [
        'Block: origin (0, 0, 0); shape (12, 12, 12); data in mem: 0B'
    ] + ['None'] * 7
    assert list(map(str, f_blocks)) == expected
def test_partition_to_end_coords():
    '''
    partition_to_end_coords is expected to return, for each of the three
    dimensions, the list of last coordinates of the blocks along it.
    '''
    # (array side, block side, expected end coordinates along one axis)
    cases = (
        (12, 4, [3, 7, 11]),
        (3500, 500, [499, 999, 1499, 1999, 2499, 2999, 3499]),
    )

    for side, block_side, ends in cases:
        array = Partition((side, side, side), name='array')
        in_blocks = Partition((block_side, block_side, block_side),
                              name='in',
                              array=array)
        coords = keep.partition_to_end_coords(in_blocks)
        assert coords == (list(ends), list(ends), list(ends))
def test_repartition_baseline_2(cleanup_blocks):
    '''
    Round trip with the baseline: one (10, 20, 30) block is split into
    (10, 10, 15) blocks, then merged back; the bytes of the rebuilt block
    must equal the bytes of the initial one.
    '''

    def first_block_bytes(partition):
        # Load block (0, 0, 0) and return its content as bytes
        partition.blocks[(0, 0, 0)].read()
        return partition.blocks[(0, 0, 0)].data.bytes()

    array = Partition((10, 20, 30), name='array')
    in_blocks = Partition((10, 20, 30), name='in', array=array, fill='random')
    original = first_block_bytes(in_blocks)

    out_blocks = Partition((10, 10, 15), name='out', array=array)
    in_blocks.repartition(out_blocks, None, keep.baseline)

    rein_blocks = Partition((10, 20, 30), name='rein', array=array)
    out_blocks.repartition(rein_blocks, None, keep.baseline)

    assert first_block_bytes(rein_blocks) == original
Beispiel #8
0
def create_write_blocks(read_blocks, out_blocks):
    '''
    Build the write blocks and the cache associated with a partition of
    read blocks.

    Each read block is cut into up to 8 F-blocks by get_F_blocks. F0 stays
    with the read block it comes from; every other F-block that is not None
    is moved to the read block index given by destination_F0. The F-blocks
    gathered for each index are then merged into a single write block.

    Arguments:
        read_blocks: partition of read blocks
        out_blocks: partition of output blocks

    Returns:
        (write_blocks, cache) where write_blocks is a Partition whose
        blocks are the merged blocks, keyed by their origin, and cache is
        a KeepCache built from out_blocks and from the mapping
        (read block key, F index) -> merged block.
    '''

    gathered = [[] for _ in read_blocks.blocks]
    destination = {}  # (read block key, F index) -> index in gathered

    for index, key in enumerate(read_blocks.blocks):
        pieces = get_F_blocks(read_blocks.blocks[key],
                              out_blocks,
                              get_data=False)

        # F0 is never moved
        gathered[index].append(pieces[0])
        destination[(key, 0)] = index

        for f in range(1, 8):
            if pieces[f] is None:
                continue
            target = destination_F0(read_blocks, index, f)
            gathered[target].append(pieces[f])
            destination[(key, f)] = target

    merged_blocks = [merge_blocks(group) for group in gathered]
    match = {
        pair: merged_blocks[slot]
        for pair, slot in destination.items()
    }
    by_origin = {merged.origin: merged for merged in merged_blocks}

    # Warning: write_blocks are a partition but a non-uniform one
    # This may have side effects. This is also the reason for the
    # weird create_blocks param
    write_blocks = Partition((1, 1, 1),
                             name='write_blocks',
                             array=read_blocks.array,
                             create_blocks=False)
    write_blocks.blocks = by_origin

    return write_blocks, KeepCache(out_blocks, match)
def test_repartition_keep_1(cleanup_blocks):
    '''
    Split a (5, 6, 7) array into two (5, 3, 7) blocks with keep.keep,
    compare the first and last 20 bytes, then merge the blocks back and
    compare the whole content with the initial array.
    '''
    first = (0, 0, 0)
    second = (0, 3, 0)

    array = Partition((5, 6, 7), name='array', fill='random')
    out_blocks = Partition((5, 3, 7), name='out', array=array)
    array.repartition(out_blocks, None, keep.keep)

    array.blocks[first].read()
    for origin in (first, second):
        out_blocks.blocks[origin].read()

    # The array starts like the first output block and ends like the second
    head = out_blocks.blocks[first].data.bytes()[:20]
    assert array.blocks[first].data.bytes()[:20] == head
    tail = out_blocks.blocks[second].data.bytes()[-20:]
    assert array.blocks[first].data.bytes()[-20:] == tail

    rein_blocks = Partition((5, 6, 7), name='rein')
    out_blocks.repartition(rein_blocks, None, keep.keep)
    rein_blocks.blocks[first].read()

    rebuilt = rein_blocks.blocks[first].data.bytes()
    assert array.blocks[first].data.bytes() == rebuilt
Beispiel #10
0
def test_get_f_blocks():
    '''
    get_F_blocks applied to the first 4^3 input block against a 3^3 output
    partition is expected to return 8 blocks, described below by their
    origin and shape.
    '''
    array = Partition((12, 12, 12), name='array', fill='random')
    in_blocks = Partition((4, 4, 4), name='in', array=array)
    out_blocks = Partition((3, 3, 3), name='out', array=array)

    f_blocks = keep.get_F_blocks(in_blocks.blocks[(0, 0, 0)], out_blocks)

    # (origin, shape) of the expected F-blocks, in order
    geometry = [
        ((0, 0, 0), (3, 3, 3)),
        ((0, 0, 3), (3, 3, 1)),
        ((0, 3, 0), (3, 1, 3)),
        ((0, 3, 3), (3, 1, 1)),
        ((3, 0, 0), (1, 3, 3)),
        ((3, 0, 3), (1, 3, 1)),
        ((3, 3, 0), (1, 1, 3)),
        ((3, 3, 3), (1, 1, 1)),
    ]
    expected = [
        f'Block: origin {origin}; shape {shape}; data in mem: 0B'
        for origin, shape in geometry
    ]

    assert list(map(str, f_blocks)) == expected
def test_repartition_keep(cleanup_blocks):
    '''
    Split a (2, 2, 2) array into two (2, 1, 2) blocks with keep.keep and
    check, 2 bytes at a time, where the bytes of the array end up in the
    output blocks. Then merge the output blocks back into a (2, 2, 2)
    block and check that it holds the same bytes as the initial array.
    '''
    array = Partition((2, 2, 2), name='array', fill='random')
    out_blocks = Partition((2, 1, 2), name='out', array=array)
    array.repartition(out_blocks, None, keep.keep)
    array.blocks[(0, 0, 0)].read()
    out_blocks.blocks[(0, 0, 0)].read()
    out_blocks.blocks[(0, 1, 0)].read()

    # Consecutive 2-byte slices of the array alternate between the two
    # output blocks
    assert (array.blocks[(0, 0, 0)].data.bytes()[:2] == out_blocks.blocks[(
        0, 0, 0)].data.bytes()[:2])
    assert (array.blocks[(0, 0, 0)].data.bytes()[2:4] == out_blocks.blocks[(
        0, 1, 0)].data.bytes()[:2])
    assert (array.blocks[(0, 0, 0)].data.bytes()[4:6] == out_blocks.blocks[(
        0, 0, 0)].data.bytes()[2:4])
    assert (array.blocks[(0, 0, 0)].data.bytes()[6:8] == out_blocks.blocks[(
        0, 1, 0)].data.bytes()[2:4])

    rein_blocks = Partition((2, 2, 2), name='rein')
    out_blocks.repartition(rein_blocks, None, keep.keep)
    rein_blocks.blocks[(0, 0, 0)].read()

    # The round trip must give back the initial content; without this
    # check the second repartition is only exercised, never verified
    assert (array.blocks[(0, 0, 0)].data.bytes() == rein_blocks.blocks[(
        0, 0, 0)].data.bytes())
Beispiel #12
0
def baseline(in_blocks, out_blocks, m, array):
    '''
    Baseline implementation of
    get_read_blocks_and_cache(in_blocks, out_blocks, m, array), the callback
    used in Partition.repartition: read blocks have the shape of the input
    blocks.

    Arguments:
        in_blocks: input partition, to be repartitioned
        out_blocks: output partition, to be written to disk
        m: max memory to be used by the repartitioning. Ignored by the
           baseline; present only so that the signature matches what
           Partition.repartition expects.
        array: partitioned array, passed to the read blocks partition.

    Returns:
        (read_blocks, cache, seeks, block_size) where read_blocks is a new
        partition with the shape of in_blocks, cache is a BaselineCache,
        seeks is the result of baseline_seek_count and block_size is the
        product of the dimensions of in_blocks.shape.
    '''
    # TODO: creating a new partition makes memory estimates correct,
    # but it adds an in-memory copy, this could be fixed
    read_blocks = Partition(in_blocks.shape, 'read_blocks', array)
    cache = BaselineCache()
    seeks = baseline_seek_count(in_blocks, out_blocks)
    block_size = math.prod(in_blocks.shape)
    return (read_blocks, cache, seeks, block_size)
Beispiel #13
0
def keep(in_blocks, out_blocks, m, array):
    '''
    Keep heuristic (Algorithm 2 in the paper), usable as the
    get_read_blocks_and_cache(in_blocks, out_blocks, m, array) callback of
    Partition.repartition.

    Arguments:
        in_blocks: input partition, to be repartitioned
        out_blocks: output partition, to be written to disk
        m: max memory to be used by the repartitioning. If None, memory
           constraint is ignored.
        array: partitioned array. Doesn't need to contain data, used just
               to get total dimensions of the array.

    Returns:
        (read_blocks, cache, seeks, peak_mem): the read blocks partition,
        the cache created along with the write blocks, the seek count
        computed by keep_seek_count and the memory value returned by
        find_shape_with_constraint.
    '''

    read_shape, peak_mem = find_shape_with_constraint(
        in_blocks, out_blocks, m)
    read_blocks = Partition(read_shape, 'read_blocks', array=array)
    write_blocks, cache = create_write_blocks(read_blocks, out_blocks)

    # Technically this count is not necessary
    seek_count = keep_seek_count(in_blocks, read_blocks, write_blocks,
                                 out_blocks)

    return (read_blocks, cache, seek_count, peak_mem)
Beispiel #14
0
def peak_memory(read_shape, in_blocks, out_blocks):
    '''
    Return the estimated amount of memory required to repartition in_blocks
    into out_blocks when the read blocks have shape read_shape.

    The estimate is obtained from a dry run of the repartitioning
    (Partition.repartition called with dry_run=True). Both in_blocks and
    out_blocks are cleared before returning.

    Arguments:
        read_shape: shape of the read blocks
        in_blocks: input partition
        out_blocks: output partition
    '''

    dry_read_blocks = Partition(read_shape,
                                'read_blocks',
                                array=in_blocks.array)
    _write_blocks, dry_cache = create_write_blocks(dry_read_blocks,
                                                   out_blocks)

    def fixed_read_blocks_and_cache(in_blocks, out_blocks, m, array):
        # Ignore the arguments: always hand back the read blocks and the
        # cache prepared above
        return dry_read_blocks, dry_cache, None, None

    stats = in_blocks.repartition(out_blocks,
                                  None,
                                  fixed_read_blocks_and_cache,
                                  dry_run=True)
    # Only the third of the five returned values is needed here
    _, _, estimate, _, _ = stats

    in_blocks.clear()
    out_blocks.clear()
    return estimate
def test_partition_clear(cleanup_blocks):
    '''
    Smoke test: clear() on a randomly filled 12^3 partition must not raise.
    '''
    Partition((12, 12, 12), name='array', fill='random').clear()
def test_repartition_keep_3(cleanup_blocks):
    '''
    Chain several repartitions with keep.keep.

    First chain (no assertion, it only has to run): 6^3 array -> 3^3
    blocks -> 2^3 blocks.
    Second chain: 12^3 array -> 4^3 blocks -> 3^3 blocks -> 12^3 block,
    whose bytes must equal the bytes of the initial array.
    '''
    origin = (0, 0, 0)

    # First chain
    array = Partition((6, 6, 6), name='array', fill='random')
    array.clear()
    in_blocks = Partition((3, 3, 3), name='in', array=array)
    array.repartition(in_blocks, None, keep.keep)
    in_blocks.clear()
    out_blocks = Partition((2, 2, 2), name='out', array=array)
    in_blocks.repartition(out_blocks, None, keep.keep)

    # Second chain
    array = Partition((12, 12, 12), name='array', fill='random')
    array.clear()
    in_blocks = Partition((4, 4, 4), name='in', array=array)
    array.repartition(in_blocks, None, keep.keep)
    in_blocks.clear()
    out_blocks = Partition((3, 3, 3), name='out', array=array)
    in_blocks.repartition(out_blocks, None, keep.keep)
    rein_blocks = Partition((12, 12, 12), name='rein', array=array)
    out_blocks.repartition(rein_blocks, None, keep.keep)

    rebuilt_block = rein_blocks.blocks[origin]
    rebuilt_block.read()
    rebuilt = rebuilt_block.data.bytes()

    initial_block = array.blocks[origin]
    initial_block.read()
    initial = initial_block.data.bytes()

    assert rebuilt == initial
Beispiel #17
0
def main(args=None):
    """Command-line entry point of the repartitioning tool.

    Positional arguments are the shape of the array (A), of the input
    blocks (I), of the output blocks (O), and the repartitioning method
    ("baseline" or "keep"). Shapes are given as strings and converted with
    make_tuple. At most one of the following options can be given:

      --create       create the input blocks with random content, and
                     nothing else
      --repartition  delete existing output blocks, repartition the input
                     blocks into output blocks and log seeks, peak memory
                     and timings
      --test-data    rebuild the array once from the input blocks and once
                     from the output blocks, and check that both files are
                     identical
      --delete       delete the output blocks

    Arguments:
        args: list of command-line arguments; if None, argparse reads them
              from sys.argv.

    Raises:
        AssertionError: if the repartitioning statistics are inconsistent
            or if the two reconstructions of --test-data differ.
    """
    parser = ArgumentParser()

    parser.add_argument("A",
                        action="store",
                        help="shape of the reconstructed array")
    parser.add_argument(
        "I",
        action="store",
        help="shape of the input blocks. Input blocks "
        "called 'in...' must be stored on disk",
    )
    parser.add_argument(
        "O",
        action="store",
        help="shape of the output blocks. Output blocks"
        " called 'out...' will be created on disk",
    )
    commands = parser.add_mutually_exclusive_group()
    commands.add_argument(
        "--create",
        action="store_true",
        help="create input blocks on disk"
        " before repartitioning.",
    )
    commands.add_argument(
        "--repartition",
        action="store_true",
        help="repartition input blocks to output block dimensions",
    )
    commands.add_argument(
        "--delete",
        action="store_true",
        help="delete output blocks after repartitioning.",
    )
    commands.add_argument(
        "--test-data",
        action="store_true",
        help="reconstruct array from input blocks, "
        "reconstruct array from output blocks, "
        "check that data is identical in both "
        "reconstructions.",
    )
    parser.add_argument("--max-mem",
                        action="store",
                        help="max memory to use, in bytes")
    parser.add_argument(
        "method",
        action="store",
        help="repartitioning method to use",
        choices=["baseline", "keep"],
    )

    # Unrecognized arguments are tolerated and ignored
    args, _unknown = parser.parse_known_args(args)
    mem = args.max_mem
    if mem is not None:
        mem = int(mem)

    repart_func = {"baseline": keep.baseline, "keep": keep.keep}

    array = Partition(make_tuple(args.A), name="array")

    if args.create:
        fill = "random"
        log("Creating input blocks", 1)
    else:
        fill = None
        log("Using existing input blocks", 1)

    in_blocks = Partition(make_tuple(args.I),
                          name="in",
                          array=array,
                          fill=fill)

    in_blocks.clear()

    if not args.create:
        out_blocks = Partition(make_tuple(args.O), name="out", array=array)

        # Repartitioning
        if args.repartition:
            log("Repartitioning input blocks into output blocks", 1)
            out_blocks.delete()
            out_blocks.clear()  # shouldn't be necessary but just in case
            start = time.time()
            (
                total_bytes,
                seeks,
                peak_mem,
                read_time,
                write_time,
            ) = in_blocks.repartition(out_blocks, mem,
                                      repart_func[args.method])
            end = time.time()
            total_time = end - start
            # Sanity checks on the statistics: reads and writes cannot take
            # longer than the whole run, and the total byte count must be
            # twice the number of elements of the array
            assert total_time > read_time + write_time
            assert total_bytes == 2 * math.prod(array.shape)
            header = ("Seeks, peak memory (B), read time (s),"
                      " write time (s), elapsed time (s):")
            values = (f"{seeks},{peak_mem},{round(read_time,2)},"
                      f"{round(write_time,2)},{round(total_time,2)}")
            log(header + os.linesep + values, 2)

        if args.test_data:
            log("Testing data", 1)
            # Reconstruction from the input blocks
            in_blocks.repartition(array, mem, repart_func[args.method])
            with open(array.blocks[(0, 0, 0)].file_name, "rb") as f:
                in_data = f.read()
            array.delete()
            # Reconstruction from the output blocks
            out_blocks.repartition(array, mem, repart_func[args.method])
            with open(array.blocks[(0, 0, 0)].file_name, "rb") as f:
                out_data = f.read()
            assert in_data == out_data

        if args.delete:
            log("Deleting output blocks", 1)
            out_blocks.delete()
Beispiel #18
0
def test_r_hat_2():
    '''
    get_r_hat is expected to raise when the input blocks (2^3) are smaller
    than the output blocks (5^3).
    '''
    whole = Partition((10, 10, 10), name='array')
    source = Partition((2, 2, 2), array=whole, name='in')
    target = Partition((5, 5, 5), array=whole, name='out')

    with pytest.raises(Exception):
        keep.get_r_hat(source, target)