def test_create_buffers_slabs():
    """ Test that the buffering follows the clustered-write strategy when
    processing slabs. The only strategy that should be used is "block slices".
    """
    cs = (5, 100, 100)  # 20 chunks
    case = Split(pytest.test_array_path, cs)
    case.split_hdf5("./split_file.hdf5", nb_blocks=None)
    arr = case.get()
    _, dicts = get_used_proxies(arr.dask.dicts)
    origarr_name = list(dicts['origarr_to_obj'].keys())[0]

    nb_bytes_per_block = 100 * 100 * 5
    byte_size = 2
    l1 = [[i] for i in range(20)]
    l2 = [list(range(10)), list(range(10, 20))]
    l3 = [list(range(7)), list(range(7, 14)), list(range(14, 20))]
    experiment_params = {
        nb_bytes_per_block * byte_size: l1,       # 1 slab per buffer
        nb_bytes_per_block * byte_size * 10: l2,  # 10 slabs per buffer
        nb_bytes_per_block * byte_size * 7: l3,   # 7 slabs per buffer
    }

    for buffer_size, expected in experiment_params.items():
        logging.info("\nTesting buffer %s", buffer_size)
        logging.debug("Expecting %s", expected)
        enable_clustering(buffer_size, mem_limit=True)
        buffers = create_buffers(origarr_name, dicts, cs)
        logging.debug("Got %s", buffers)
        assert buffers == expected
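
# For intuition, a minimal standalone sketch (no project imports; an assumption
# about the intended grouping rule, not the project's actual implementation):
# with (5, 100, 100) chunks over a 100^3 array there are 20 slabs of
# 100 * 100 * 5 voxels at 2 bytes each, so a buffer of k slab-sizes should hold
# k consecutive slabs, which is exactly what the expected lists above encode.
def _group_slabs_sketch(nb_slabs, slab_bytes, buffer_bytes):
    per_buffer = max(1, buffer_bytes // slab_bytes)
    return [list(range(i, min(i + per_buffer, nb_slabs)))
            for i in range(0, nb_slabs, per_buffer)]


# e.g. a 7-slab buffer reproduces l3 above:
# _group_slabs_sketch(20, 100 * 100 * 5 * 2, 100 * 100 * 5 * 2 * 7)
# == [list(range(7)), list(range(7, 14)), list(range(14, 20))]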
def test_sum(shape_to_test, nb_chunks):
    """ Test that the sum of two blocks yields the correct result when using
    our optimization function.
    """
    logger.info("testing shape %s", shape_to_test)

    # prepare test case
    case = Split(pytest.test_array_path, shape_to_test)
    case.sum(nb_chunks)

    # non optimized run
    disable_clustering()
    result_non_opti = case.get().compute()

    # optimized run
    enable_clustering(buffer_size)
    result_opti = case.get().compute()

    assert np.array_equal(result_non_opti, result_opti)
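
# Analogous sanity pattern in plain dask (a standalone sketch, independent of
# the Split helper): the same graph computed two different ways must agree,
# just as the optimized and non-optimized runs must agree above.
def _sum_equivalence_sketch():
    import dask.array as da
    x = da.ones((10, 10, 10), chunks=(5, 10, 10)) + 1
    a = x.compute(scheduler='single-threaded')
    b = x.compute(scheduler='threads')
    assert np.array_equal(a, b)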
def run_test(test, paths):
    """ Wrapper around the 'run' function for diagnostics.

    Arguments:
    ----------
        test: test object exposing 'params', 'splitcase' and 'mergecase'.
        paths: paths configuration (not used in this function).
    """
    test.print_config()
    uid = uuid.uuid4()
    print("Test ID is ", str(uid))

    params = getattr(test, 'params')
    splitcase = getattr(test, 'splitcase')
    mergecase = getattr(test, 'mergecase')

    if params["optimized"]:
        enable_clustering(params["buffer_size"])
    else:
        disable_clustering()

    flush_cache()
    try:
        arr = splitcase.get()
        tsplit, diagnostics_split, monitor_split = run_to_hdf5(
            arr, params, uid, str(params["chunk_shape"]), params["optimized"])
    except Exception as e:
        print(e)
        return [
            params["hardware"], params["cuboid_name"], params["array_shape"],
            params["chunk_type"], params["chunk_shape"], params["optimized"],
            params["buffer_size"], params["nthreads"],
            None, None, None, None, None, None, None, None
        ]
    finally:
        splitcase.clean()

    R = cuboids[params["cuboid_name"]]['shape']
    I = splitcase.chunks_shape
    print(f'R: {R}')
    print(f'I: {I}')
    if 'auto' not in I:
        success_run_split = verify_results_split(
            R, I, getattr(test, 'cuboid_filepath'), getattr(test, 'hardware_path'))
    else:
        success_run_split = None
    print(f'[Split] Find the diagnostics output file at {diagnostics_split}')
    print(f'[Split] Find the monitor output file at {monitor_split}')

    flush_cache()
    try:
        arr = mergecase.get()
        tmerge, diagnostics_merge, monitor_merge = run_to_hdf5(
            arr, params, uid, str(params["chunk_shape"]), params["optimized"])
    except Exception as e:
        print(e)
        return [
            params["hardware"], params["cuboid_name"], params["array_shape"],
            params["chunk_type"], params["chunk_shape"], params["optimized"],
            params["buffer_size"], params["nthreads"],
            round(tsplit, 4), None, None, None, None, None, None, None
        ]
    finally:
        mergecase.clean()

    success_run_merge = verify_results_merge(
        getattr(test, 'cuboid_filepath'), getattr(test, 'merge_filepath'))
    print(f'[Merge] Find the diagnostics output file at {diagnostics_merge}')
    print(f'[Merge] Find the monitor output file at {monitor_merge}')

    datadir = getattr(test, 'hardware_path')
    merged_filepath = getattr(test, 'merge_filepath')
    clean_directory(datadir)
    os.remove(merged_filepath)

    sample_res = [
        params["hardware"], params["cuboid_name"], params["array_shape"],
        params["chunk_type"], params["chunk_shape"], params["optimized"],
        params["buffer_size"], params["nthreads"],
        round(tsplit, 4), round(tmerge, 4),
        diagnostics_split, diagnostics_merge,
        monitor_split, monitor_merge,
        success_run_split, success_run_merge
    ]
    print("-------------RESULT\n", sample_res)
    return sample_res
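
# The 16-element rows returned by run_test can be collected into a CSV; a
# sketch under the assumption that the column names simply mirror the row
# layout above (the project's real reporting code may differ).
RESULT_COLUMNS = [
    "hardware", "cuboid_name", "array_shape", "chunk_type", "chunk_shape",
    "optimized", "buffer_size", "nthreads", "tsplit", "tmerge",
    "diagnostics_split", "diagnostics_merge", "monitor_split",
    "monitor_merge", "success_run_split", "success_run_merge",
]


def write_results_sketch(rows, out_path):
    import csv
    with open(out_path, 'w', newline='') as f:
        writer = csv.writer(f)
        writer.writerow(RESULT_COLUMNS)
        writer.writerows(rows)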
create_array(input_filepath, input_array_shape)

# split
times = list()
for buffer in buffers_to_test:
    print("RUNNING BUFFER ", buffer)

    with h5py.File(input_filepath, 'r') as f_in:  # open original array
        dset = f_in['/data']
        in_arr = da.from_array(dset, chunks=split_cs)

        with h5py.File(output_filepath, 'x') as f_out:  # open split array
            # run optimized
            split_arr = split_to_hdf5(in_arr, f_out, nb_blocks=None)
            print("RUNNING OPTIMIZED")
            enable_clustering(buffer)
            flush_cache()
            with Profiler() as prof, ResourceProfiler() as rprof, \
                    CacheProfiler(metric=nbytes) as cprof:
                with dask.config.set(scheduler='single-threaded'):
                    t = time.time()
                    _ = split_arr.compute()
                    t = time.time() - t
                    times.append([buffer, t, "optimized"])
            visualize([prof, rprof, cprof],
                      os.path.join(output_directory, str(buffer) + "opti" + ".html"),
                      show=False)

        os.remove(output_filepath)  # remove output file for next run

        with h5py.File(output_filepath, 'x') as f_out:  # open split array
def create_test_array():
    if not pytest.test_array_path:
        create_test_array_nochunk(path, (100, 100, 100))
        pytest.test_array_path = path
    enable_clustering(buffer_size, mem_limit=True)
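
# For reference, a minimal sketch of what a helper like
# create_test_array_nochunk might do, assuming it writes random data to
# '/data' without HDF5 chunking (the real helper lives elsewhere in the suite):
def _create_test_array_nochunk_sketch(path, shape):
    import h5py
    import numpy as np
    with h5py.File(path, 'w') as f:
        f.create_dataset('/data', data=np.random.random(shape))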
def test_split(optimized, nb_chunks, shape_to_test):
    def create_arrays_for_comparison():
        """ Load the chunks as dask arrays to compare them against the split files.
        """
        arr = get_dask_array_from_hdf5(pytest.test_array_path,
                                       '/data',
                                       logic_cs=shape_to_test)
        arr_list = get_arr_chunks(arr, nb_chunks=nb_chunks)
        return arr_list

    def apply_sanity_check(split_filepath):
        """ Check that the split file is not empty.
        """
        logger.info("Checking split file integrity...")
        with h5py.File(split_filepath, 'r') as f:
            keys_list = list(f.keys())
            logger.info("file : %s", f)
            logger.info("Number of datasets in hdf5 file : %s", len(keys_list))
            logger.info("First item: %s", keys_list[0])
            assert len(keys_list) != 0
        logger.info("Integrity check passed.\n")

    def store_correct():
        """ Compare the original chunks to the split datasets to verify the split.
        """
        logger.info("Testing %s matches...", len(arr_list))
        with h5py.File(split_filepath, 'r') as f:
            for i, a in enumerate(arr_list):
                stored_a = da.from_array(f['/data' + str(i)])
                # logger.info("split shape: %s", stored_a.shape)
                # rechunk returns a new array; it must be reassigned
                stored_a = stored_a.rechunk(chunks=shape_to_test)
                # logger.info("split rechunked to: %s", stored_a.shape)
                # logger.info("will be compared to: %s", a.shape)
                # logger.info("Testing allclose...")
                test = da.allclose(stored_a, a)
                disable_clustering()  # TODO: remove this, make it work even for allclose
                assert test.compute()
        logger.info("Passed.\n")

    def split():
        # overwrite if split file already exists
        if os.path.isfile(split_filepath):
            os.remove(split_filepath)
        case = Split(pytest.test_array_path, shape_to_test)
        case.split_hdf5(split_filepath, nb_blocks=nb_chunks)
        case.get().compute()
        return

    logger.info("PARAMETERS:")
    logger.info("Optimized: %s", optimized)
    logger.info("Nb_chunk: %s", nb_chunks)
    logger.info("Shape: %s \n", shape_to_test)

    # setup config
    split_filepath = "./split_file.hdf5"
    if optimized:
        enable_clustering(buffer_size)
    else:
        disable_clustering()

    # test
    split()
    apply_sanity_check(split_filepath)

    # assert
    arr_list = create_arrays_for_comparison()
    store_correct()
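
# Standalone sketch of a coarser round-trip check without the test harness,
# assuming (as store_correct does) that the split file stores one dataset per
# chunk named '/data0', '/data1', ...: the voxel counts of all split datasets
# must add up to the source array's size.
def _split_volume_sketch(split_filepath, expected_voxels=100 ** 3):
    with h5py.File(split_filepath, 'r') as f:
        total = sum(int(f[k].size) for k in f.keys())
    assert total == expected_voxels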
def test_create_buffers_blocks():
    """ Test if the buffering works according to clustered writes in all 3
    possible configurations.

    Data:
    -----
        input array shape: 100x100x100
        input arr created with 2 bytes per pixel
        block shape: 20x20x20

    Which gives us:
    ---------------
        - nb blocks per row = 5
        - nb blocks per slice = 25
        - block size in bytes: (20*20*20) * 2 bytes = 16000
    """
    cs = (20, 20, 20)
    case = Split(pytest.test_array_path, cs)
    case.split_hdf5("./split_file.hdf5", nb_blocks=None)
    arr = case.get()
    _, dicts = get_used_proxies(arr.dask.dicts)
    origarr_name = list(dicts['origarr_to_obj'].keys())[0]

    # EXPECTED BEHAVIOR FOR CLUSTERED WRITES
    l1 = [[i] for i in range(125)]  # 1 block

    l2 = list()  # 3 blocks
    for i in range(25):
        o = (i * 5)
        l2.append([0 + o, 1 + o, 2 + o])
        l2.append([3 + o, 4 + o])

    l3 = list()  # 1 block column
    for i in range(25):
        l3.append(list(range(i * 5, i * 5 + 5)))

    l4 = list()  # 2 block columns
    for i in range(5):
        o = i * 25  # offset
        l4.append(list(range(0 + o, 10 + o)))
        l4.append(list(range(10 + o, 20 + o)))
        l4.append(list(range(20 + o, 25 + o)))

    l5 = list()  # 1 block slice
    for i in range(5):
        l5.append(list(range((i * 25), (i * 25) + 25)))

    l6 = list()  # 3 block slices
    l6.append(list(range(0, 25 * 3)))
    l6.append(list(range(75, 125)))

    l7 = [list(range(125))]  # whole array

    nb_bytes_per_block = 20 * 20 * 20
    byte_size = 2
    experiment_params = {
        nb_bytes_per_block * byte_size: l1,              # 1 block
        nb_bytes_per_block * byte_size * 3: l2,          # some blocks (3)
        nb_bytes_per_block * byte_size * 5: l3,          # 1 block column
        nb_bytes_per_block * byte_size * 5 * 2: l4,      # some block columns (2)
        nb_bytes_per_block * byte_size * 5 * 5: l5,      # 1 block slice
        nb_bytes_per_block * byte_size * 5 * 5 * 3: l6,  # some block slices (3)
        nb_bytes_per_block * byte_size * 5 * 5 * 5: l7,  # whole array
    }

    for buffer_size, expected in experiment_params.items():
        logging.info("\nTesting buffer %s", buffer_size)
        logging.debug("Expecting %s", expected)
        enable_clustering(buffer_size, mem_limit=True)
        buffers = create_buffers(origarr_name, dicts, cs)
        logging.debug("Got %s", buffers)
        assert buffers == expected
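
# A standalone sketch of the grouping rule the expected lists above encode
# (an assumption about the intended strategy, not the project's actual
# implementation): a buffer of b blocks is filled with consecutive block
# indices, but never straddles a row boundary unless whole rows fit, nor a
# slice boundary unless whole slices fit.
def _group_blocks_sketch(total, per_row, per_slice, b):
    container = per_row if b < per_row else per_slice if b < per_slice else total
    out = list()
    for start in range(0, total, container):
        for s in range(start, start + container, b):
            out.append(list(range(s, min(s + b, start + container))))
    return out


# e.g. b=3 reproduces l2 and b=10 reproduces l4:
# _group_blocks_sketch(125, 5, 25, 3)[:2] == [[0, 1, 2], [3, 4]]
# _group_blocks_sketch(125, 5, 25, 10)[:3]
# == [list(range(10)), list(range(10, 20)), list(range(20, 25))]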