import logging
import os

import pytest

# NOTE: Split, get_used_proxies, create_buffers, create_buffer_node,
# enable_clustering, get_load_strategy, get_blocks_used, ONE_GIG and
# apply_clustered_strategy come from the dask_io package; their exact
# module paths, like the `logger`, `current_dir` and `logfilename`
# globals used below, are defined elsewhere in the project.


def test_create_buffer_node():
    # preparation
    cs = (20, 20, 20)
    case = Split(pytest.test_array_path, cs)
    case.split_hdf5("./split_file.hdf5", nb_blocks=None)
    arr = case.get()

    graph = arr.dask.dicts
    _, dicts = get_used_proxies(graph)
    origarr_name = list(dicts['origarr_to_obj'].keys())[0]
    buffers = create_buffers(origarr_name, dicts, cs)

    # apply function
    keys = list()
    for buffer in buffers:
        key = create_buffer_node(graph, origarr_name, dicts, buffer, cs)
        keys.append(key)

    # test output
    buffers_key = origarr_name.split('-')[-1] + '-merged'

    indices = set()
    for buffer_key in graph[buffers_key].keys():
        _, start, end = buffer_key
        indices.add((start, end))

    buffers = set([(b[0], b[-1]) for b in buffers])

    assert buffers_key in graph.keys()
    assert len(indices) == len(buffers)
    assert buffers == indices
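
# A hedged illustration of the graph layout the test above asserts (the
# "abc123" prefix is hypothetical; the real one is the hash part of
# origarr_name). Each buffer b becomes one sub-key recording its first
# and last block index:
#
#   graph["abc123-merged"] = {
#       ("abc123-merged", b[0], b[-1]): <task loading blocks b[0]..b[-1]>,
#       ...  # one entry per buffer returned by create_buffers()
#   }
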
def test_create_buffers_slabs():
    """ Test if the buffering works according to clustered writes when
    processing slabs. The only strategy that should be used is
    "block slices".
    """
    cs = (5, 100, 100)  # 20 chunks
    case = Split(pytest.test_array_path, cs)
    case.split_hdf5("./split_file.hdf5", nb_blocks=None)
    arr = case.get()
    _, dicts = get_used_proxies(arr.dask.dicts)
    origarr_name = list(dicts['origarr_to_obj'].keys())[0]

    nb_bytes_per_block = 100 * 100 * 5
    byte_size = 2
    l1 = [[i] for i in range(20)]
    l2 = [list(range(10)), list(range(10, 20))]
    l3 = [list(range(7)), list(range(7, 14)), list(range(14, 20))]
    experiment_params = {
        nb_bytes_per_block * byte_size: l1,
        nb_bytes_per_block * byte_size * 10: l2,
        nb_bytes_per_block * byte_size * 7: l3,
    }

    for buffer_size, expected in experiment_params.items():
        logging.info("\nTesting buffer %s", buffer_size)
        logging.debug("Expecting %s", expected)
        enable_clustering(buffer_size, mem_limit=True)
        buffers = create_buffers(origarr_name, dicts, cs)
        logging.debug("Got %s", buffers)
        assert buffers == expected
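
# A minimal sketch (ours, not part of the library) of the grouping the test
# above expects: with slabs, consecutive slab indices are packed into buffers
# of buffer_size // block_bytes blocks each, "block slices" style.
def _expected_slab_buffers(nb_blocks, blocks_per_buffer):
    """Hypothetical helper: group consecutive slab indices into buffers."""
    return [list(range(i, min(i + blocks_per_buffer, nb_blocks)))
            for i in range(0, nb_blocks, blocks_per_buffer)]

# _expected_slab_buffers(20, 1)  == l1
# _expected_slab_buffers(20, 10) == l2
# _expected_slab_buffers(20, 7)  == l3
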
def clustered_optimization(graph):
    """ Apply the clustered I/O optimization on a Dask graph.

    Arguments:
    ----------
        graph: dask_array.dask.dicts
    """
    logger.info("Configuration file is at %s",
                os.path.join(current_dir, 'logging_config.ini'))
    logger.info("Log file: %s", logfilename)

    logger.info("Finding proxies.")
    chunk_shape, dicts = get_used_proxies(graph)

    if chunk_shape is None or dicts is None:
        logger.error("Chunk shape or dicts is None. Aborting dask_io optimization.")
        raise ValueError("Chunk shape or dicts is None.")

    logger.info("Launching optimization algorithm.")
    apply_clustered_strategy(graph, dicts, chunk_shape)
    return graph
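
# Usage sketch (an assumption, not shown in this excerpt): the tests feed
# `arr.dask.dicts` to the graph helpers, so the optimizer would be invoked
# the same way on a dask array `arr` built by Split.get():
#
#   arr = case.get()
#   optimized = clustered_optimization(arr.dask.dicts)
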
def test_get_blocks_used():
    cs = (20, 20, 20)
    case = Split(pytest.test_array_path, cs)
    case.split_hdf5("./split_file.hdf5", nb_blocks=None)
    arr = case.get()

    # routine to get the needed data
    # we assume those functions have been tested before get_blocks_used
    cs_confirmed, dicts = get_used_proxies(arr.dask.dicts)
    assert cs == cs_confirmed

    origarr_name = list(dicts['origarr_to_obj'].keys())[0]
    arr_obj = dicts['origarr_to_obj'][origarr_name]
    strategy, max_blocks_per_load = get_load_strategy(ONE_GIG, cs,
                                                      (100, 100, 100))

    # actual test of the function
    blocks_used, block_to_proxies = get_blocks_used(dicts, origarr_name,
                                                    arr_obj, cs)

    blocks_used.sort()
    expected = list(range(125))
    assert blocks_used == expected
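
# Why expected == list(range(125)): splitting a 100x100x100 array into
# 20x20x20 blocks touches every block, and there are ceil(100/20)**3 = 125
# of them. A small sketch of that arithmetic (helper name is ours):
def _nb_blocks(array_shape, block_shape):
    """Hypothetical helper: number of blocks in a blocked array."""
    n = 1
    for dim, block in zip(array_shape, block_shape):
        n *= (dim + block - 1) // block  # ceil division
    return n

# _nb_blocks((100, 100, 100), (20, 20, 20)) == 125
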
def keep_algorithm(dsk, keys):
    print("Inside the keep algorithm")
    dask_graph = dsk.dicts
    chunk_shape, dicts = get_used_proxies(dask_graph)
    return dsk
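
# Usage sketch: `keep_algorithm` inspects the proxies but returns the graph
# unchanged, and its (dsk, keys) signature matches a dask optimization
# callback. Registering it through dask's configuration (this wiring is an
# assumption, not shown in the source) would look like:
#
#   import dask
#   with dask.config.set({"optimizations": [keep_algorithm]}):
#       result = arr.compute()
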
def test_create_buffers_blocks():
    """ Test if the buffering works according to clustered writes in all
    3 possible configurations.

    Data:
    -----
        input array shape: 100x100x100
        input array created with 2 bytes per voxel
        block shape: 20x20x20

    Which gives us:
    ---------------
        - nb blocks per row = 5
        - nb blocks per slice = 25
        - block size in bytes: (20*20*20) * 2 bytes = 16000
    """
    cs = (20, 20, 20)
    case = Split(pytest.test_array_path, cs)
    case.split_hdf5("./split_file.hdf5", nb_blocks=None)
    arr = case.get()
    _, dicts = get_used_proxies(arr.dask.dicts)
    origarr_name = list(dicts['origarr_to_obj'].keys())[0]

    # EXPECTED BEHAVIOR FOR CLUSTERED WRITES
    l1 = [[i] for i in range(125)]  # 1 block

    l2 = list()  # 3 blocks
    for i in range(25):
        o = i * 5  # offset
        l2.append([0 + o, 1 + o, 2 + o])
        l2.append([3 + o, 4 + o])

    l3 = list()  # 1 block column
    for i in range(25):
        l3.append(list(range(i * 5, i * 5 + 5)))

    l4 = list()  # 2 block columns
    for i in range(5):
        o = i * 25  # offset
        l4.append(list(range(0 + o, 10 + o)))
        l4.append(list(range(10 + o, 20 + o)))
        l4.append(list(range(20 + o, 25 + o)))

    l5 = list()  # 1 block slice
    for i in range(5):
        l5.append(list(range(i * 25, (i * 25) + 25)))

    l6 = list()  # 3 block slices
    l6.append(list(range(0, 25 * 3)))
    l6.append(list(range(75, 125)))

    l7 = [list(range(125))]  # whole array

    nb_bytes_per_block = 20 * 20 * 20
    byte_size = 2
    experiment_params = {
        nb_bytes_per_block * byte_size: l1,              # 1 block
        nb_bytes_per_block * byte_size * 3: l2,          # some blocks (3)
        nb_bytes_per_block * byte_size * 5: l3,          # 1 block column
        nb_bytes_per_block * byte_size * 5 * 2: l4,      # some block columns (2)
        nb_bytes_per_block * byte_size * 5 * 5: l5,      # 1 block slice
        nb_bytes_per_block * byte_size * 5 * 5 * 3: l6,  # some block slices (3)
        nb_bytes_per_block * byte_size * 5 * 5 * 5: l7,  # whole array
    }

    for buffer_size, expected in experiment_params.items():
        logging.info("\nTesting buffer %s", buffer_size)
        logging.debug("Expecting %s", expected)
        enable_clustering(buffer_size, mem_limit=True)
        buffers = create_buffers(origarr_name, dicts, cs)
        logging.debug("Got %s", buffers)
        assert buffers == expected
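
# A compact restatement (ours, not the library's) of the thresholds the
# expected lists above encode, given 16000-byte blocks, 5 blocks per row
# and 25 blocks per slice:
def _expected_strategy(buffer_size, block_bytes=16000,
                       blocks_per_row=5, blocks_per_slice=25):
    """Hypothetical helper: which grouping a given buffer size allows."""
    nb_blocks = buffer_size // block_bytes
    if nb_blocks >= blocks_per_slice:
        return "block slices"   # l5, l6, l7
    if nb_blocks >= blocks_per_row:
        return "block columns"  # l3, l4
    return "blocks"             # l1, l2
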