Example 1
def chunked_emd(tmpdir_factory):
    lt_ctx = Context(executor=InlineJobExecutor())
    datadir = tmpdir_factory.mktemp('hdf5_chunked_data')
    filename = os.path.join(datadir, 'chunked.emd')

    chunks = (32, 32, 128, 128)

    with h5py.File(filename, mode="w") as f:
        f.attrs.create('version_major', 0)
        f.attrs.create('version_minor', 2)

        f.create_group('experimental/science_data')
        group = f['experimental/science_data']
        group.attrs.create('emd_group_type', 1)

        data = np.ones((256, 256, 128, 128), dtype=np.float32)

        group.create_dataset(name='data', data=data, chunks=chunks)
        group.create_dataset(name='dim1', data=range(256))
        group['dim1'].attrs.create('name', b'dim1')
        group['dim1'].attrs.create('units', b'units1')
        group.create_dataset(name='dim2', data=range(256))
        group['dim2'].attrs.create('name', b'dim2')
        group['dim2'].attrs.create('units', b'units2')
        group.create_dataset(name='dim3', data=range(128))
        group['dim3'].attrs.create('name', b'dim3')
        group['dim3'].attrs.create('units', b'units3')
        group.create_dataset(name='dim4', data=range(128))
        group['dim4'].attrs.create('name', b'dim4')
        group['dim4'].attrs.create('units', b'units4')

    yield lt_ctx.load("auto",
                      path=filename,
                      ds_path="/experimental/science_data/data")
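
A rough usage sketch (not part of the original example): a test consuming this fixture could run LiberTEM's SumUDF over the chunked data with a fresh inline Context. The imports and the re-use of the dataset with a new Context are assumptions.

# Hypothetical usage sketch for the chunked_emd fixture above
import numpy as np
from libertem.api import Context
from libertem.executor.inline import InlineJobExecutor
from libertem.udf.sum import SumUDF


def test_chunked_emd_sum(chunked_emd):
    ctx = Context(executor=InlineJobExecutor())
    # SumUDF sums over the 256x256 navigation positions; the source data is all ones
    res = ctx.run_udf(dataset=chunked_emd, udf=SumUDF())
    assert np.allclose(res['intensity'].data, 256 * 256)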
Example 2
    def __init__(self, path: str, continuous=False, rois=None, max_runs=-1):
        """
        Parameters
        ----------

        path
            Path to the HDR file

        continuous
            If set to True, will continuously output data

        rois: List[np.ndarray]
            If a list of ROIs is given, in continuous mode, cycle through
            these ROIs from the source data

        max_runs: int
            Maximum number of continuous runs
        """
        if rois is None:
            rois = []
        if not path.lower().endswith(".hdr"):
            raise ValueError("please pass the path to the HDR file!")
        self._path = path
        self._continuous = continuous
        self._rois = rois
        self._ctx = Context(executor=InlineJobExecutor())
        self._ds = None
        self._max_runs = max_runs
        self._mmaps = {}
Example 3
def test_dask_array_2(dask_executor):
    # NOTE: keep in sync with the example in docs/source/api.rst!
    # Construct a Dask array from the dataset
    # The second return value contains information
    # on workers that hold parts of a dataset in local
    # storage to ensure optimal data locality
    ctx = Context(executor=dask_executor)
    dataset = ctx.load("memory", datashape=(16, 16, 16), sig_dims=2)
    dask_array, workers = make_dask_array(dataset)

    # Use the Dask.distributed client of LiberTEM, since it may not be
    # the default client:
    ctx.executor.client.compute(dask_array.sum(axis=(-1, -2))).result()
Example 4
def test_connect_default(local_cluster_url):
    try:
        executor = DaskJobExecutor.connect(
            local_cluster_url, client_kwargs={'set_as_default': True})
        ctx = Context(executor=executor)
        # This queries Dask which scheduler it is using
        ctx2 = Context.make_with("dask-integration")
        # make sure the second uses the Client of the first
        assert ctx2.executor.client is ctx.executor.client
    finally:
        # Only close the Client, keep the cluster running
        # since that is test infrastructure
        executor.client.close()
        ctx.close()
Example 5
def ctx(request, dask_executor):
    if request.param == 'inline':
        yield Context.make_with('inline')
    elif request.param == "dask_executor":
        yield Context(executor=dask_executor)
    elif request.param == "delayed_default":
        yield Context(executor=DelayedJobExecutor())
    elif request.param == "delayed_dist":
        with distributed.Client(n_workers=2,
                                threads_per_worker=4,
                                processes=True) as _:
            yield Context(executor=DelayedJobExecutor())
    elif request.param == "dask_make_default":
        try:
            ctx = Context.make_with('dask-make-default')
            yield ctx
        finally:
            # cleanup: Close cluster and client
            # This is also tested below, here just to make
            # sure things behave as expected.
            assert isinstance(ctx.executor, DaskJobExecutor)
            ctx.executor.is_local = True
            ctx.close()
    elif request.param == "dask_integration":
        with distributed.Client(n_workers=2,
                                threads_per_worker=4,
                                processes=False) as _:
            yield Context.make_with("dask-integration")
    elif request.param == "concurrent":
        yield Context.make_with("threads")
    elif request.param == "delayed":
        yield Context(executor=DelayedJobExecutor())
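
The @pytest.fixture decorator supplying request.param is not shown in this snippet. Judging from the branches above, it presumably lists the parameter names handled there, roughly along these lines (the exact params list is an assumption):

# Plausible reconstruction of the stripped fixture decorator (assumption)
import pytest


@pytest.fixture(params=[
    'inline', 'dask_executor', 'delayed_default', 'delayed_dist',
    'dask_make_default', 'dask_integration', 'concurrent', 'delayed',
])
def ctx(request, dask_executor):
    ...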
Example 6
def test_make_default():
    try:
        ctx = Context.make_with("dask-make-default")
        # This queries Dask which scheduler it is using
        ctx2 = Context.make_with("dask-integration")
        # make sure the second uses the Client of the first
        assert ctx2.executor.client is ctx.executor.client
    finally:
        # dask-make-default starts a Client that will persist
        # and not be closed automatically. We have to make sure
        # to close everything ourselves
        if ctx.executor.client.cluster is not None:
            ctx.executor.client.cluster.close(timeout=30)
        ctx.executor.client.close()
        ctx.close()
Example 7
def test_multiple_clients(local_cluster_url, default_raw):
    ex1 = DaskJobExecutor.connect(local_cluster_url)

    # This creates a second Client, and even though `set_as_default=False` is passed,
    # that Client is then used by functions like `dd.as_completed`. That is because
    # `set_as_default` only controls the dask scheduler config ("dask.distributed");
    # it does not affect whether the _client_ becomes the global default `Client`!
    # So any time `as_completed` is called, the `loop` needs to be set correctly,
    # otherwise this may result in strange hangs and crashes.
    DaskJobExecutor.connect(local_cluster_url)

    udf = SumUDF()

    cx1 = Context(executor=ex1)
    cx1.run_udf(dataset=default_raw, udf=udf)
Example 8
def test_no_dangling_client():
    # Within the whole test suite and LiberTEM we should not have
    # a dangling dask.distributed Client set as the default Dask scheduler.
    # To confirm that, we check that we get a ConcurrentJobExecutor in the
    # default case.
    ctx = Context.make_with("dask-integration")
    assert isinstance(ctx.executor, ConcurrentJobExecutor)
Example 9
def test_use_threads():
    with dask.config.set(scheduler="threads"):
        ctx = Context.make_with("dask-integration")
        assert isinstance(ctx.executor, ConcurrentJobExecutor)
        assert isinstance(ctx.executor.client, (
            concurrent.futures.ThreadPoolExecutor,
            multiprocessing.pool.ThreadPool,
        ))
Example 10
def test_use_distributed():
    # This Client is pretty cheap to start
    # since it only uses threads
    with distributed.Client(n_workers=1, threads_per_worker=1,
                            processes=False) as c:
        ctx = Context.make_with("dask-integration")
        assert isinstance(ctx.executor, DaskJobExecutor)
        assert ctx.executor.client is c
Example 11
    def test_sumsig_delayed(self, shared_dist_ctx_globaldask, my_ds, benchmark):
        ctx = Context(executor=DelayedJobExecutor())
        udf = MySumSigUDF()
        resources = DelayedJobExecutor.get_resources_from_udfs(udf)

        def doit():
            result = ctx.run_udf(dataset=my_ds, udf=udf)
            return result['intensity'].delayed_raw_data.compute(resources=resources)

        benchmark(doit)
Example 12
    def test_large_delayed_merge(self, shared_dist_ctx_globaldask, my_ds, benchmark):
        ctx = Context(executor=DelayedJobExecutor())
        udf = EchoMergeUDF()
        resources = DelayedJobExecutor.get_resources_from_udfs(udf)

        def doit():
            result = ctx.run_udf(dataset=my_ds, udf=udf)
            return result['intensity'].delayed_raw_data.sum(axis=0).compute(resources=resources)

        benchmark(doit)
Example 13
def test_threads_per_worker(default_raw, dask_executor):
    ctx = Context(executor=dask_executor)
    inline_ctx = Context(executor=InlineJobExecutor())
    res = ctx.run_udf(dataset=default_raw,
                      udf=ThreadsPerWorkerUDF())['num_threads']
    res_inline = inline_ctx.run_udf(dataset=default_raw,
                                    udf=ThreadsPerWorkerUDF())['num_threads']
    assert np.allclose(res, 1)
    assert np.allclose(res_inline, psutil.cpu_count(logical=False))
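
ThreadsPerWorkerUDF is a helper from the LiberTEM test utilities. A minimal sketch of what such a UDF could look like, assuming it simply reports UDFMeta.threads_per_worker per navigation position (the real helper may differ):

# Hypothetical sketch, not the actual ThreadsPerWorkerUDF implementation
from libertem.udf import UDF


class ThreadsPerWorkerSketchUDF(UDF):
    def get_result_buffers(self):
        return {
            'num_threads': self.buffer(kind='nav', dtype=int),
        }

    def process_frame(self, frame):
        # record how many threads this worker is allowed to use
        self.results.num_threads[:] = self.meta.threads_per_worker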
Example 14
def test_threads_per_worker_vanilla(default_raw, monkeypatch):
    old_threads = os.environ.get('NUMBA_NUM_THREADS')
    # Triggers #1053
    monkeypatch.delenv('NUMBA_NUM_THREADS', raising=False)
    ctx = Context()
    assert 'NUMBA_NUM_THREADS' not in os.environ
    # We have to reset it properly since it is set in pytest.ini
    # and Numba will complain if it is changed
    if old_threads:
        os.environ['NUMBA_NUM_THREADS'] = old_threads
    inline_ctx = Context(executor=InlineJobExecutor())
    res = ctx.run_udf(dataset=default_raw, udf=ThreadsPerWorkerUDF())
    res_inline = inline_ctx.run_udf(dataset=default_raw,
                                    udf=ThreadsPerWorkerUDF())
    print(res['num_threads'].data)
    assert np.all(res['num_threads'].data == 1)
    print(res_inline['num_threads'].data)
    assert np.all(res_inline['num_threads'].data == psutil.cpu_count(
        logical=False))
Example 15
def test_context_arguments():
    with pytest.raises(ValueError):
        # refs https://github.com/LiberTEM/LiberTEM/issues/918
        Context(executor=InlineJobExecutor)
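
The failure here comes from passing the executor class instead of an instance; Context rejects that with a ValueError (see the linked issue). The working form, used throughout the other examples, is:

# Correct: pass an executor *instance*, not the class
Context(executor=InlineJobExecutor())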
Example 16
def test_make_with_unrecognized():
    with pytest.raises(ValueError):
        Context.make_with('not_an_executor')
Example 17
def test_make_with_threads(concurrent_executor):
    ctx = Context.make_with('threads')
    assert isinstance(ctx.executor, concurrent_executor.__class__)
Example 18
def test_make_with_inline(inline_executor):
    ctx = Context.make_with('inline')
    assert isinstance(ctx.executor, inline_executor.__class__)
Example 19
class DataSocketSimulator:
    def __init__(self, path: str, continuous=False, rois=None, max_runs=-1):
        """
        Parameters
        ----------

        path
            Path to the HDR file

        continuous
            If set to True, will continuously output data

        rois: List[np.ndarray]
            If a list of ROIs is given, in continuous mode, cycle through
            these ROIs from the source data

        max_runs: int
            Maximum number of continuous runs
        """
        if rois is None:
            rois = []
        if not path.lower().endswith(".hdr"):
            raise ValueError("please pass the path to the HDR file!")
        self._path = path
        self._continuous = continuous
        self._rois = rois
        self._ctx = Context(executor=InlineJobExecutor())
        self._ds = None
        self._max_runs = max_runs
        self._mmaps = {}

    def open(self):
        ds = self._ctx.load("mib", path=self._path)
        print("dataset shape: %s" % (ds.shape, ))
        self._ds = ds
        self._warmup()

    def get_chunks(self):
        """
        generator of `bytes` for the given configuration
        """
        # first, send acquisition header:
        with open(self._path, 'rb') as f:
            # FIXME: possibly change header in continuous mode?
            hdr = f.read()
            yield get_mpx_header(len(hdr))
            yield hdr
        if self._continuous:
            print("yielding from continuous")
            yield from self._get_continuous()
        else:
            print("yielding from single scan")
            roi = np.ones(self._ds.shape.nav, dtype=bool)
            t = tqdm(total=np.count_nonzero(roi))
            try:
                for item in self._get_single_scan(roi):
                    yield item
                    t.update(1)
            finally:
                t.close()

    def _read_frame_w_header(self, fh, frame_idx, full_frame_size):
        """
        Parameters
        ----------

        fh : LocalFile

        frame_idx : int
            File-relative frame index

        full_frame_size : int
            Size of header plus frame in bytes
        """
        if fh._file is None:
            fh.open()
        f = fh._file
        fileno = f.fileno()
        if fileno not in self._mmaps:
            self._mmaps[fileno] = raw_mmap = mmap.mmap(
                fileno=f.fileno(),
                length=0,
                offset=0,
                access=mmap.ACCESS_READ,
            )
        else:
            raw_mmap = self._mmaps[fileno]

        start = full_frame_size * frame_idx
        stop = full_frame_size * (frame_idx + 1)
        return bytearray(raw_mmap[start:stop])

    def _warmup(self):
        fileset = self._ds._get_fileset()
        ds_shape = self._ds.shape
        tiling_scheme = TilingScheme.make_for_shape(
            tileshape=Shape((1, ) + tuple(ds_shape.sig),
                            sig_dims=ds_shape.sig.dims),
            dataset_shape=ds_shape,
        )
        slices, ranges, scheme_indices = fileset.get_read_ranges(
            start_at_frame=0,
            stop_before_frame=int(np.prod(self._ds.shape.nav)),
            dtype=np.float32,  # FIXME: don't really care...
            tiling_scheme=tiling_scheme,
            roi=None,
        )

    def _get_single_scan(self, roi):
        fileset = self._ds._get_fileset()
        ds_shape = self._ds.shape
        tiling_scheme = TilingScheme.make_for_shape(
            tileshape=Shape((1, ) + tuple(ds_shape.sig),
                            sig_dims=ds_shape.sig.dims),
            dataset_shape=ds_shape,
        )
        slices, ranges, scheme_indices = fileset.get_read_ranges(
            start_at_frame=0,
            stop_before_frame=int(np.prod(self._ds.shape.nav)),
            dtype=np.float32,  # FIXME: don't really care...
            tiling_scheme=tiling_scheme,
            roi=roi,
        )

        first_file = self._ds._files_sorted[0]
        header_size = first_file.fields['header_size_bytes']

        full_frame_size = header_size + first_file.fields['image_size_bytes']

        mpx_header = get_mpx_header(full_frame_size)

        for idx in range(slices.shape[0]):
            origin = slices[idx, 0]
            # shape = slices[idx, 1]
            # origin, shape = slices[idx]
            tile_ranges = ranges[idx][0]
            file_idx = tile_ranges[0]
            fh = fileset[file_idx]
            global_idx = origin[0]
            local_idx = global_idx - fh.start_idx
            frame_w_header = self._read_frame_w_header(fh, local_idx,
                                                       full_frame_size)
            yield mpx_header
            yield frame_w_header

    def _get_continuous(self):
        if self._rois:
            rois = self._rois
        else:
            rois = [np.ones(self._ds.shape.nav, dtype=bool)]

        i = 0
        for roi in itertools.cycle(rois):
            t0 = time.time()
            yield from self._get_single_scan(roi)
            t1 = time.time()
            print("cycle %d took %.05fs" % (i, t1 - t0))
            i += 1
            if self._max_runs != -1 and i >= self._max_runs:
                raise RuntimeError("max_runs exceeded")

    def handle_conn(self, conn):
        for chunk in self.get_chunks():
            conn.sendall(chunk)
        conn.close()
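
handle_conn only needs a connected socket-like object offering sendall() and close(). As a rough sketch (not from the original source), the simulator could be driven by a small TCP server; host, port and the single-client accept loop below are placeholder assumptions:

# Hypothetical driver for DataSocketSimulator; host/port are placeholders
import socket


def serve(path, host='127.0.0.1', port=6342):
    sim = DataSocketSimulator(path=path, continuous=False)
    sim.open()
    with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as srv:
        srv.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1)
        srv.bind((host, port))
        srv.listen(1)
        while True:
            conn, _addr = srv.accept()
            sim.handle_conn(conn)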
Example 20
def lt_ctx():
    return Context(executor=InlineJobExecutor())
Example 21
def main(path, scheduler_uri, stackheight, scan_size, method, num_masks,
         num_workers, num_nodes, warmup_rounds):
    scan_size = tuple(int(x) for x in scan_size.split(","))
    if num_nodes is not None and scheduler_uri is None:
        raise Exception("num_nodes limit only works for non-local cluster")
    if scheduler_uri is None:
        dask_executor = BenchmarkDaskExecutor.make_local(
            cluster_kwargs={
                'threads_per_worker': 1,
                'n_workers': num_workers,
            })
    else:
        dask_executor = BenchmarkDaskExecutor.connect(scheduler_uri,
                                                      node_limit=num_nodes)
    ctx = Context(executor=dask_executor)

    workers = ctx.executor.get_available_workers()
    for worker in workers:
        ctx.executor.client.run(_preload, workers=[worker['name']])

    def _load():
        if method == "direct":
            ds = DirectRawFileDataSet(
                path=path,
                dtype="float32",
                scan_size=scan_size,
                detector_size=(128, 128),
                stackheight=stackheight,
            )
        elif method == "read":
            ds = DirectRawFileDataSet(
                path=path,
                dtype="float32",
                scan_size=scan_size,
                detector_size=(128, 128),
                stackheight=stackheight,
                enable_direct=False,
            )
        elif method == "mmap":
            ds = RawFileDataSet(
                path=path,
                dtype="float32",
                scan_size=scan_size,
                detector_size_raw=(128, 128),
                crop_detector_to=(128, 128),
                tileshape=(1, stackheight, 128, 128),
            )
        ds = ds.initialize()
        return ds

    def _getsize():
        return os.stat(path).st_size

    ds = dask_executor.run_function(_load)
    dask_executor.run_function(ds.check_valid)

    total_size = dask_executor.run_function(_getsize)
    assert total_size == np.dtype(ds.dtype).itemsize * ds.shape.size

    def _make_random_mask():
        return np.random.randn(128, 128).astype("float32")

    apply_mask = ctx.create_mask_analysis(
        dataset=ds,
        factories=num_masks * [_make_random_mask],
    )

    # warmup rounds:
    for i in range(warmup_rounds):
        ctx.run(apply_mask)

    # timed run:
    t0 = time.time()
    ctx.run(apply_mask)
    t1 = time.time()
    delta = t1 - t0

    tilesize_bytes = stackheight * 128 * 128 * 4

    results = {
        "path": path,
        "num_masks": num_masks,
        "bytes": total_size,
        "time": delta,
        "throughput_mib": total_size / delta / 1024 / 1024,
        "tilesize_bytes": tilesize_bytes,
        "method": method,
        "num_nodes": num_nodes,
        "workers": workers,
    }
    print(json.dumps(results, indent=4))
Example 22
def test_use_synchronous():
    with dask.config.set(scheduler="synchronous"):
        ctx = Context.make_with("dask-integration")
        assert isinstance(ctx.executor, InlineJobExecutor)
Example 23
def test_threads_per_worker(dask_executor, default_raw):
    ctx = Context(executor=dask_executor)
    res = ctx.run_udf(dataset=default_raw,
                      udf=ThreadsPerWorkerUDF())['num_threads']
    assert np.allclose(res, 1)
Example 24
def reference(load_kwargs):
    ctx = Context(executor=InlineJobExecutor())
    return _calculate(ctx, load_kwargs)