def get_layer_list(channels, nd2_func, path, frame_shape, frame_dtype, n_timepoints):
    """Build napari ``LayerData`` tuples, one image layer per channel.

    For every channel a lazy dask array is assembled by stacking one delayed
    frame per timepoint. Each layer is named after, and rendered with, the
    colormap selected for it by cycling through ``color_maps``.

    Parameters
    ----------
    channels
        Iterable with one entry per channel; only the position of each entry
        is used here.
    nd2_func
        Callable such that ``nd2_func(path, channel_index)`` returns a
        callable taking a timepoint index and producing one frame.
    path
        Forwarded to ``nd2_func`` and ``get_metadata``.
    frame_shape
        Shape declared for every delayed frame.
    frame_dtype
        Dtype declared for every delayed frame.
    n_timepoints : int
        Number of frames stacked along the first axis.

    Returns
    -------
    list of tuple
        ``(data, add_kwargs, "image")`` tuples, in channel order.
    """
    # Keep (colormap name, array) pairs in a list instead of a dict keyed by
    # the colormap: once there are more channels than colormaps the cycle
    # repeats and a dict would silently overwrite an earlier channel.
    entries = []
    for i, _channel in enumerate(channels):
        arr = da.stack([
            da.from_delayed(
                delayed(nd2_func(path, i))(j),
                shape=frame_shape,
                dtype=frame_dtype,
            )
            for j in range(n_timepoints)
        ])
        entries.append((color_maps[i % len(color_maps)], dask.optimize(arr)[0]))

    print("channel_dict", dict(entries))
    if not entries:
        return []

    # The metadata is read from ``path`` only, so fetch it once for all layers.
    meta = get_metadata(path)
    return [
        (
            data,
            {
                "name": colormap_name,
                "colormap": colormap_name,
                "blending": 'additive',
                "rendering": "mip",
                **meta,
            },
            "image",
        )
        for colormap_name, data in entries
    ]
def test_atop_non_atop_output():
    """Check that optimizing leaves TOP indices untouched and keeps TOP layers."""
    x = da.ones(10, chunks=(5,))
    y = x + 1 + 2 + 3
    w = y.sum()
    z = y * 2 * 3 * 4

    def top_indices():
        # Snapshot of the indices stored on z's own layer.
        return tuple(z.dask.dicts[z.name].indices)

    indices_before = top_indices()
    (_optimized,) = dask.optimize(z)
    indices_after = top_indices()
    assert indices_before == indices_after, "z_top mutated"

    # Optimizing z alone should leave exactly one TOP layer.
    z_keys = list(dask.core.flatten(z.__dask_keys__()))
    dsk = optimize_atop(z.dask, keys=z_keys)
    assert isinstance(dsk, dask.sharedict.ShareDict)
    assert sum(1 for layer in dsk.dicts.values() if isinstance(layer, TOP)) == 1

    # Optimizing z together with the reduction w.
    merged = dask.sharedict.merge(w.dask, z.dask)
    all_keys = list(dask.core.flatten([w.__dask_keys__(), z.__dask_keys__()]))
    dsk = optimize_atop(merged, keys=all_keys)
    assert isinstance(dsk, dask.sharedict.ShareDict)
    assert sum(1 for layer in z.dask.dicts.values() if isinstance(layer, TOP)) >= 1
def get_layer_list(channels, nd2_func, path, frame_shape, frame_dtype, n_timepoints):
    """Build napari ``LayerData`` tuples, one image layer per named channel.

    Each channel becomes a lazy dask array made of one delayed frame per
    timepoint. Visibility comes from membership in ``VISIBLE`` and the
    colormap runs from black to the channel's entry in ``CHANNEL_COLORS``.

    Parameters
    ----------
    channels
        Iterable of channel names; names must be keys of ``CHANNEL_COLORS``.
    nd2_func
        Callable such that ``nd2_func(path, channel_index)`` returns a
        callable taking a timepoint index and producing one frame.
    path
        Forwarded to ``nd2_func`` and ``get_metadata``.
    frame_shape
        Shape declared for every delayed frame.
    frame_dtype
        Dtype declared for every delayed frame.
    n_timepoints : int
        Number of frames stacked along the first axis.

    Returns
    -------
    list of tuple
        ``(data, add_kwargs, "image")`` tuples, in channel order.
    """
    # A plain dict is enough: insertion order already follows ``channels``,
    # so there is no need to pre-fill it with placeholder values.
    channel_dict = {}
    for i, channel in enumerate(channels):
        frames = [
            da.from_delayed(
                delayed(nd2_func(path, i))(j),
                shape=frame_shape,
                dtype=frame_dtype,
            )
            for j in range(n_timepoints)
        ]
        channel_dict[channel] = dask.optimize(da.stack(frames))[0]

    if not channel_dict:
        return []

    # The metadata is read from ``path`` only, so fetch it once for all layers.
    meta = get_metadata(path)
    layer_list = []
    for channel_name, data in channel_dict.items():
        visible = channel_name in VISIBLE
        color = Colormap([[0, 0, 0], list(CHANNEL_COLORS[channel_name])])
        add_kwargs = {
            "name": channel_name,
            "visible": visible,
            "colormap": color,
            # Hidden layers fall back to translucent blending.
            "blending": 'additive' if visible else 'translucent',
            **meta,
        }
        layer_list.append((data, add_kwargs, "image"))
    return layer_list
def compute_with_trace(*args):
    """Run Dask ``compute()`` with Eliot tracing added.

    Dask executes a graph of tasks while Eliot logs trees, so the graph is
    emulated with a tree: every task gets its own Eliot action which lists
    the tasks it depends on.

    The approach:

    1. Open a top-level action.
    2. For each entry of the dask graph create a child via
       ``serialize_task_id``, in likely execution order, so that when B
       depends on A the task level of B is higher than that of A.
    3. Wrap each function so it continues the matching task ID (using
       ``Action.continue_task``) and records what it depends on.

    Known issues:

    1. Retries will confuse Eliot; a different distributed-tree mechanism
       inside Eliot is probably needed to solve that.
    """
    # Step 1: the top-level Eliot action.
    with start_action(action_type="dask:compute"):
        # Logging is added as an extra optimization pass so that it is
        # attached to the already optimized graph, keeping logs less verbose.
        traced = optimize(*args, optimizations=[_add_logging])
        results = compute(*traced, optimize_graph=False)
        return results
def optimize_chunk(arr: xr.DataArray, chk: dict) -> xr.DataArray:
    """
    Rechunk a `xr.DataArray` using constrained "rechunk-merge" tasks.

    The array is rechunked, then its dask graph is optimized with
    `custom_arr_optimize` configured to rewrite "rechunk-merge" as
    "merge_restricted".

    Parameters
    ----------
    arr : xr.DataArray
        The array to be rechunked.
    chk : dict
        The desired chunk size.

    Returns
    -------
    arr_chk : xr.DataArray
        The rechunked array.
    """
    rechunked = arr.chunk(chk)
    optimizer = fct.partial(
        custom_arr_optimize,
        fast_funcs=FAST_FUNCTIONS + [darr.core.concatenate3],
        rewrite_dict={"rechunk-merge": "merge_restricted"},
    )
    # The custom optimizer is only active inside this config context.
    with da.config.set(array_optimize=optimizer):
        optimized = da.optimize(rechunked.data)
        rechunked.data = optimized[0]
    return rechunked
def get_layer_list(channels, nd2_func, path, frame_shape, frame_dtype, n_timepoints):
    """Build napari ``LayerData`` tuples, one image layer per named channel.

    Each channel becomes a lazy dask array made of one delayed frame per
    timepoint. Colors, scale and translation are taken from the dict
    returned by ``get_metadata(path)``.

    Parameters
    ----------
    channels
        Iterable of channel names; names must be keys of
        ``get_metadata(path)['channels']``.
    nd2_func
        Callable such that ``nd2_func(path, channel_index)`` returns a
        callable taking a timepoint index and producing one frame.
    path
        Forwarded to ``nd2_func`` and ``get_metadata``.
    frame_shape
        Shape declared for every delayed frame.
    frame_dtype
        Dtype declared for every delayed frame.
    n_timepoints : int
        Number of frames stacked along the first axis.

    Returns
    -------
    list of tuple
        ``(data, add_kwargs, "image")`` tuples, in channel order.
    """
    channel_dict = {}
    for i, channel in enumerate(channels):
        frames = [
            da.from_delayed(
                delayed(nd2_func(path, i))(j),
                shape=frame_shape,
                dtype=frame_dtype,
            )
            for j in range(n_timepoints)
        ]
        channel_dict[channel] = dask.optimize(da.stack(frames))[0]

    if not channel_dict:
        return []

    # The metadata is read from ``path`` only, so fetch it once for all layers.
    meta = get_metadata(path)
    layer_list = []
    for channel_name, data in channel_dict.items():
        channel_color = meta['channels'][channel_name]
        color = Colormap([[0, 0, 0], channel_color[:-1]])  # ignore alpha
        add_kwargs = {
            "name": channel_name,
            # Every layer is shown, hence always additive blending.
            "visible": True,
            "colormap": color,
            "blending": 'additive',
            "scale": meta['scale'],
            "translate": meta['translate'],
        }
        layer_list.append((data, add_kwargs, "image"))
    return layer_list
def test_annotations_survive_optimization():
    """Annotations on a layer must still be present after ``dask.optimize``."""
    expected_annotations = {"foo": "bar"}

    with dask.annotate(foo="bar"):
        graph = HighLevelGraph.from_collections(
            "b",
            {"a": 1, "b": (inc, "a"), "c": (inc, "b")},
            [],
        )
        d = Delayed("b", graph)

    layers_before = d.dask.layers
    assert type(d.dask) is HighLevelGraph
    assert len(layers_before) == 1
    assert len(layers_before["b"]) == 3
    assert layers_before["b"].annotations == expected_annotations

    # Optimizing a Delayed object must return a HighLevelGraph
    # and must not lose the annotations.
    (d_opt,) = dask.optimize(d)
    layers_after = d_opt.dask.layers
    assert type(d_opt.dask) is HighLevelGraph
    assert len(layers_after) == 1
    assert len(layers_after["b"]) == 2  # c is culled
    assert layers_after["b"].annotations == expected_annotations
def test_delayed_optimize():
    """Optimizing a Delayed drops tasks its key does not depend on."""
    graph = {'a': 1, 'b': (inc, 'a'), 'c': (inc, 'b')}
    target = Delayed('b', graph)
    (culled,) = dask.optimize(target)
    # Delayed's __dask_optimize__ culls out 'c'
    remaining_keys = sorted(culled.dask.keys())
    assert remaining_keys == ['a', 'b']
def test_blockwise_non_blockwise_output():
    """Optimizing must not mutate Blockwise indices and must keep Blockwise layers."""

    def count_blockwise(layers):
        # Number of Blockwise layers among the given graph layers.
        return len([layer for layer in layers if isinstance(layer, Blockwise)])

    x = da.ones(10, chunks=(5,))
    y = x + 1 + 2 + 3
    w = y.sum()
    z = y * 2 * 3 * 4

    z_top_before = tuple(z.dask.dicts[z.name].indices)
    (_optimized,) = dask.optimize(z)
    z_top_after = tuple(z.dask.dicts[z.name].indices)
    assert z_top_before == z_top_after, "z_top mutated"

    z_keys = list(dask.core.flatten(z.__dask_keys__()))
    dsk = optimize_blockwise(z.dask, keys=z_keys)
    assert isinstance(dsk, HighLevelGraph)
    assert count_blockwise(dsk.dicts.values()) == 1

    merged = HighLevelGraph.merge(w.dask, z.dask)
    merged_keys = list(dask.core.flatten([w.__dask_keys__(), z.__dask_keys__()]))
    dsk = optimize_blockwise(merged, keys=merged_keys)
    assert isinstance(dsk, HighLevelGraph)
    assert count_blockwise(z.dask.dicts.values()) >= 1
def test_delayed_optimize():
    """``dask.optimize`` on a Delayed keeps only the tasks its key needs."""
    x = Delayed('b', dict(a=1, b=(inc, 'a'), c=(inc, 'b')))
    optimized = dask.optimize(x)
    (x2,) = optimized
    # Delayed's __dask_optimize__ culls out 'c'
    assert sorted(x2.dask.keys()) == ['a', 'b']
def compute_with_trace(*args):
    """Do Dask compute(), with Eliot tracing layered on top.

    Eliot logs trees whereas Dask runs a graph of tasks, so the graph has to
    be emulated with a tree. Each task gets an Eliot action of its own, and
    that action lists the tasks it depends on.

    Algorithm:

    1. Create a top-level action.
    2. Create a child with serialize_task_id for every entry of the dask
       graph. This is done in likely order of execution, so that if B
       depends on A the task level of B is higher than the task level of A.
    3. Replace every function with a wrapper that picks up the matching task
       ID (with Action.continue_task) and also records which other things
       the function depends on.

    Known issues:

    1. Retries will confuse Eliot. A different distributed-tree mechanism
       within Eliot is probably required to solve that.
    """
    # 1. Top-level Eliot Action.
    with start_action(action_type="dask:compute"):
        # To keep logging less verbose, logging is added to the graph
        # after it has been optimized.
        logged_collections = optimize(*args, optimizations=[_add_logging])
        return compute(*logged_collections, optimize_graph=False)
def run_photoz_dask(runs, modelD, galcat, output_dir, fit_bands, ip_dask):
    """Run the photo-z on a Dask cluster.

    The input catalogue is read from ``galcat_in.pq`` inside ``output_dir``
    and the result is written to ``pzcat.pq`` in the same directory. Nothing
    is done when the output already exists.

    Parameters
    ----------
    runs
        Object whose ``EBV`` attribute is converted to a dict and passed to
        the fitting function.
    modelD
        Model passed through ``fix_model`` and scattered to the cluster.
    galcat
        Unused: the catalogue is always re-read from ``galcat_in.pq``.
    output_dir : str or Path
        Directory holding the input catalogue and receiving the output.
    fit_bands
        Bands forwarded to ``fix_model`` and the fitting function.
    ip_dask
        Scheduler address; ``None`` starts a local cluster.
    """
    # Normalise once so both the input and output paths accept str or Path
    # (the input path used to fail with TypeError for a plain string).
    output_dir = Path(output_dir)
    path_out = output_dir / 'pzcat.pq'
    if path_out.exists():
        print('Photo-z catalogue already exists.')
        return

    # If not specified, we start up a local cluster.
    client = Client(ip_dask) if ip_dask is not None else Client()
    xnew_modelD = client.scatter(fix_model(modelD, fit_bands))

    galcat = dd.read_parquet(str(output_dir / 'galcat_in.pq'))

    # NOTE(review): the partition count is hard-coded (9900 / 10, formerly
    # 302138 / 10) -- presumably catalogue size over rows per partition.
    npartitions = int(9900 / 10) + 1
    galcat = galcat.reset_index().repartition(npartitions=npartitions).set_index('ref_id')

    ebvD = dict(runs.EBV)
    pzcat = galcat.map_partitions(
        bcnz.fit.photoz_flatten, xnew_modelD, ebvD, fit_bands)

    pzcat = pzcat.repartition(npartitions=100)
    pzcat = dask.optimize(pzcat)[0]
    pzcat.to_parquet(str(path_out))
def test_imread_use_dask_false(resources_dir):
    """With dask reads disabled the image graph collapses to three tasks.

    Also verifies that no file handle on the image stays open.
    """

    def open_paths(process):
        # Paths of every file currently held open by the process.
        return [handle.path for handle in process.open_files()]

    with dask_utils.cluster_and_client(processes=False) as (cluster, client):
        image_path = resources_dir / BIG_OME_FILE

        # No open file pointers after init.
        proc = Process()
        assert str(image_path) not in open_paths(proc)

        # A client must exist.
        get_client()

        # Don't use dask for reads.
        use_dask(False)

        # Read the image without dask.
        img = AICSImage(image_path)
        assert img.data.shape == (3, 1, 3, 5, 325, 475)

        # The file was read with the base reader and then rechunked with
        # dask. Normally the task count for this file is 90.
        optimized_graph = optimize(img.dask_data)[0].__dask_graph__()
        assert len(optimized_graph) == 3

        # No open file pointers after basics.
        assert str(image_path) not in open_paths(proc)
def test_fuse_roots():
    """The optimized graph of a two-root expression holds five real tasks."""
    ones = da.ones(10, chunks=(2,))
    zeros = da.zeros(10, chunks=(2,))
    z = (ones + 1) + (2 * zeros ** 2)
    (zz,) = dask.optimize(z)
    # assert len(zz.dask) == 5
    # Count real tasks only: there are some aliases in the graph.
    n_tasks = sum(1 for value in zz.dask.values() if dask.istask(value))
    assert n_tasks == 5
    assert_eq(zz, z)
def initbf(varr: xr.DataArray, A: xr.DataArray,
           C: xr.DataArray) -> Tuple[xr.DataArray, xr.DataArray]:
    """
    Initialize background terms from the spatial and temporal cell components.

    The estimated cell activity movie (dimensions "height", "width" and
    "frame") is the product of the spatial and temporal component matrices
    over the "unit_id" dimension. Subtracting it from the input movie and
    clipping at zero gives the residual movie. The spatial footprint of the
    background `b` is the mean of the residual over "frame", and the temporal
    component of the background `f` is its mean over "height" and "width".

    Parameters
    ----------
    varr : xr.DataArray
        Input movie data. Should have dimensions ("frame", "height", "width").
    A : xr.DataArray
        Estimation of spatial footprints of cells. Should have dimensions
        ("unit_id", "height", "width").
    C : xr.DataArray
        Estimation of temporal activities of cells. Should have dimensions
        ("unit_id", "frame").

    Returns
    -------
    b : xr.DataArray
        Initial estimation of the spatial footprint of background. Has
        dimensions ("height", "width").
    f : xr.DataArray
        Initial estimation of the temporal activity of background. Has
        dimension "frame".
    """
    # Spatial footprints are converted block-wise to sparse COO and computed.
    footprints = A.data.map_blocks(sparse.COO).compute()
    cell_activity = darr.tensordot(C, footprints, axes=[(0, ), (0, )])
    residual = (varr - cell_activity).clip(0)
    b = residual.mean("frame")
    f = residual.mean(["height", "width"])
    restricted_opt = fct.partial(
        custom_arr_optimize,
        rename_dict={"tensordot": "tensordot_restricted"},
    )
    with da.config.set(array_optimize=restricted_opt):
        b = da.optimize(b)[0]
        f = da.optimize(f)[0]
    # Both results are evaluated in a single compute call.
    computed = da.compute([b, f])[0]
    b, f = computed
    return b, f
def test_basic(client: Client):
    """A shuffled timeseries optimizes to five layers and matches a task shuffle."""
    df = dd.demo.make_timeseries(freq="15D", partition_freq="30D")
    shuffled = shuffle(df, "id")

    (opt,) = dask.optimize(shuffled)
    second_layer = hlg_layer_topological(opt.dask, 1)
    assert isinstance(second_layer, Blockwise)
    # setup -> blockwise -> barrier -> unpack -> drop_by_shallow_copy
    assert len(opt.dask.layers) == 5

    expected = df.shuffle("id", shuffle="tasks")
    dd.utils.assert_eq(shuffled, expected, scheduler=client)
def cg_iterate(A, state, persist=True):
    """Advance the conjugate-gradient state ``(x, r, p)`` by one iteration.

    Parameters
    ----------
    A
        Operator exposing ``dot``.
    state
        Triple ``(x, r, p)`` from the previous iteration.
    persist : bool, optional
        When true, the optimized results are persisted before returning.

    Returns
    -------
    tuple
        The updated ``(x, r, p)``.
    """
    x_prev, r_prev, p_prev = state
    Ap = A.dot(p_prev)
    alpha = r_prev.dot(r_prev) / p_prev.dot(Ap)
    x_next = x_prev + alpha * p_prev
    r_next = r_prev - alpha * Ap
    p_next = r_next + p_prev * r_next.dot(r_next) / r_prev.dot(r_prev)
    new_state = dask.optimize(x_next, r_next, p_next)
    if persist:
        # Graphs are already optimized above, so skip a second pass.
        new_state = dask.persist(*new_state, optimize_graph=False)
    x_next, r_next, p_next = new_state
    return x_next, r_next, p_next
def optimize(self, *args, **kwargs):
    """ Run optimisation of graphs

    Only has an effect when dask is in use and optimisation is enabled;
    otherwise the first positional argument is handed back untouched.

    :param args: Positional arguments; forwarded when optimising
    :param kwargs: Keyword arguments; forwarded when optimising
    :return: First element of the optimised result, or args[0]
    """
    if not (self.using_dask and self._optimize):
        return args[0]
    return optimize(*args, **kwargs)[0]
def random_mode(dataset_file: object) -> tuple:
    """Describe a dataset in random mode.

    Runs ``independent_mode`` without computing, then makes every attribute
    distribution uniform and sets every missing rate to zero before the
    description is optimized and computed.

    Parameters
    ----------
    dataset_file
        Forwarded unchanged to ``independent_mode``.

    Returns
    -------
    tuple
        ``(dd, data)``: the computed description and the second value
        returned by ``independent_mode``.
    """
    dd, data = independent_mode(dataset_file, alone=False)

    # After running independent attribute mode:
    # 1) make all distributions uniform; 2) set missing rate to zero.
    for attr in dd['meta']['attrs']:
        distribution = dd['distribution']['probs'][attr]
        uniform_distribution = utils.normalize_given_distribution(
            np.ones_like(distribution)).tolist()
        dd['distribution']['probs'][attr] = uniform_distribution
        # ``independent_mode`` stores the rates under 'missing_rates';
        # the former 'missing_rate' key is never created and raised KeyError.
        dd['missing_rates'][attr] = 0

    dd = dask.optimize(dd)[0]
    dd = dask.compute(dd)[0]
    return dd, data
def persist_with_trace(*args):
    """Run Dask ``persist()`` with Eliot tracing added.

    Known issues:

    1. Retries will confuse Eliot; a different distributed-tree mechanism
       inside Eliot is probably needed to solve that.
    """
    # Step 1: the top-level Eliot action.
    with start_action(action_type="dask:persist"):
        # Logging is added as an extra optimization pass so that it is
        # attached to the already optimized graph, keeping logs less verbose.
        traced = optimize(*args, optimizations=[_add_logging])
        persisted = persist(*traced, optimize_graph=False)
        return persisted
def ntasks(obj, optimize=False):
    """Return the length of an object's dask graph.

    Parameters
    ----------
    obj
        Object exposing ``__dask_graph__``.
    optimize: bool, optional
        Optimize the graph before measuring it?

    Returns
    -------
    number of tasks, int
    """
    target = dask.optimize(obj)[0] if optimize else obj
    return len(target.__dask_graph__())
def cg_calcs_proto(shape, chunks, dtype, dsk, key, optimize=False, **options):
    """Wrap the x/r/p graphs as arrays, persist them and compute the norm of r.

    Parameters
    ----------
    shape, chunks, dtype
        Passed to ``da.Array`` for each of the three arrays.
    dsk
        Graph containing the ``'x-' + key``, ``'r-' + key`` and
        ``'p-' + key`` blocks.
    key : str
        Name suffix shared by the three arrays.
    optimize : bool, optional
        Optimize the three arrays together before persisting.
    **options
        ``finish`` (default ``False``): alias every block under a final name
        first. ``name`` (default ``'cg-output'``): that final name; only read
        when ``finish`` is truthy.

    Returns
    -------
    tuple
        ``(x, r, p, res)`` where ``res`` is the computed norm of ``r``.
    """
    prefixes = ('x-', 'r-', 'p-')

    if options.pop('finish', False):
        key_final = options.pop('name', 'cg-output')
        # One alias per block, pointing the final name at the current one.
        dsk_final = {
            (prefix + key_final, i): (prefix + key, i)
            for i in range(len(chunks[0]))
            for prefix in prefixes
        }
        dsk = dask.sharedict.merge(dsk, dsk_final)
        key = key_final

    arrays = [
        da.Array(dsk, prefix + key, shape=shape, chunks=chunks, dtype=dtype)
        for prefix in prefixes
    ]
    x, r, p = arrays
    if optimize:
        x, r, p = dask.optimize(x, r, p)
    x, r, p = dask.persist(x, r, p, optimize_graph=False, traverse=False)
    res = dask.compute(da.linalg.norm(r))[0]
    return x, r, p, res
def test_imread_from_delayed(resources_dir, filename, expected_shape, expected_tasks):
    """A lazily read image has the expected shape and optimized task count.

    Also verifies that no file handle on the image stays open.
    """

    def is_open(path, process):
        # True when the process currently holds the given path open.
        return str(path) in [handle.path for handle in process.open_files()]

    with dask_utils.cluster_and_client(processes=False) as (cluster, client):
        image_path = resources_dir / filename

        # No open file pointers after init.
        proc = Process()
        assert not is_open(image_path, proc)

        # Read the image.
        img = imread_dask(image_path)
        assert img.shape == expected_shape
        optimized_graph = optimize(img)[0].__dask_graph__()
        assert len(optimized_graph) == expected_tasks

        # No open file pointers after basics.
        assert not is_open(image_path, proc)
def test_atop_non_atop_output():
    """``dask.optimize`` must not mutate TOP indices; TOP layers must survive."""

    def count_top(layers):
        # Number of TOP layers among the given graph layers.
        return len([layer for layer in layers if isinstance(layer, TOP)])

    x = da.ones(10, chunks=(5,))
    y = x + 1 + 2 + 3
    w = y.sum()
    z = y * 2 * 3 * 4

    z_top_before = tuple(z.dask.dicts[z.name].indices)
    (_unused,) = dask.optimize(z)
    z_top_after = tuple(z.dask.dicts[z.name].indices)
    assert z_top_before == z_top_after, "z_top mutated"

    single_keys = list(dask.core.flatten(z.__dask_keys__()))
    dsk = optimize_atop(z.dask, keys=single_keys)
    assert isinstance(dsk, dask.sharedict.ShareDict)
    assert count_top(dsk.dicts.values()) == 1

    combined = dask.sharedict.merge(w.dask, z.dask)
    combined_keys = list(
        dask.core.flatten([w.__dask_keys__(), z.__dask_keys__()]))
    dsk = optimize_atop(combined, keys=combined_keys)
    assert isinstance(dsk, dask.sharedict.ShareDict)
    assert count_top(z.dask.dicts.values()) >= 1
def test_blockwise_non_blockwise_output():
    """``dask.optimize`` must not mutate Blockwise indices; layers must survive."""
    x = da.ones(10, chunks=(5,))
    y = x + 1 + 2 + 3
    w = y.sum()
    z = y * 2 * 3 * 4

    def current_indices():
        # Snapshot of the indices stored on z's own layer.
        return tuple(z.dask.dicts[z.name].indices)

    before = current_indices()
    (_unused,) = dask.optimize(z)
    after = current_indices()
    assert before == after, "z_top mutated"

    # Optimizing z alone should leave exactly one Blockwise layer.
    keys_z = list(dask.core.flatten(z.__dask_keys__()))
    dsk = optimize_blockwise(z.dask, keys=keys_z)
    assert isinstance(dsk, HighLevelGraph)
    assert sum(1 for layer in dsk.dicts.values()
               if isinstance(layer, Blockwise)) == 1

    # Optimizing z together with the reduction w.
    graph_wz = HighLevelGraph.merge(w.dask, z.dask)
    keys_wz = list(dask.core.flatten([w.__dask_keys__(), z.__dask_keys__()]))
    dsk = optimize_blockwise(graph_wz, keys=keys_wz)
    assert isinstance(dsk, HighLevelGraph)
    assert sum(1 for layer in z.dask.dicts.values()
               if isinstance(layer, Blockwise)) >= 1
def test_blockwise_different_optimization(c):
    """Regression test for GH-7632.

    SubgraphCallable.__eq__ did not correctly handle subgraphs with the same
    outputs and arity but different internals, which gave incorrect results.
    Distributed triggers the bug because it uses a function cache.
    """
    da = pytest.importorskip("dask.array")
    np = pytest.importorskip("numpy")

    u = da.from_array(np.arange(3))
    v = da.from_array(np.array([10 + 2j, 7 - 3j, 8 + 1j]))

    # Same product twice: once with the raw conjugate, once with the
    # conjugate after it went through dask.optimize.
    conj_v = v.conj()
    x = u * conj_v
    (conj_v,) = dask.optimize(conj_v)
    y = u * conj_v

    expected = np.array([0 + 0j, 7 + 3j, 16 - 2j])
    with dask.config.set({"optimization.fuse.active": False}):
        x_value = x.compute()
        y_value = y.compute()

    for value in (x_value, y_value):
        np.testing.assert_equal(value, expected)
def independent_mode(dataset_file: object, alone=True) -> tuple:
    """Describe a dataset in independent attribute mode.

    Builds the description dict ``dd`` step by step; each helper receives the
    partially built ``dd``, so the order of the statements matters. The
    module-level ``describer`` supplies datatypes, categories and key
    candidates.

    Parameters
    ----------
    dataset_file
        Forwarded to ``read_csv``.
    alone : bool, optional
        When true, ``dd`` is optimized and computed with dask before being
        returned; otherwise it is returned as built.

    Returns
    -------
    tuple
        ``(dd, datos)``: the description and a dict holding the loaded data
        under ``'data'`` and its ``dropna`` result under ``'dropna'``.
    """
    datos = {}
    datos['data'] = read_csv(dataset_file)
    datos['dropna'] = dropna(datos['data'])

    dd = {}
    dd['meta'] = init_meta(datos, describer)
    for field in ('datatypes', 'categories', 'key_candidates'):
        dd['meta'][field] = describer[field]

    dd['mins'] = attributes.get_mins(datos, dd, describer)
    dd['maxes'] = attributes.get_maxes(datos, dd, describer)
    dd['missing_rates'] = attributes.get_missing_rates(datos, dd, describer)

    probs, bins = attributes.get_distribution(datos, dd, describer)
    dd['distribution'] = {'probs': probs, 'bins': bins}

    # First noise injection for differential privacy
    dd['distribution']['probs'] = attributes.inject_laplace_noise(datos, dd)

    if alone:
        dd = dask.optimize(dd)[0]
        dd = dask.compute(dd)[0]
    return dd, datos
def test_dask_bag_fusing(driver, function_store, driver_name, skip_eager,
                         existing_cube):
    """
    See kartothek/tests/io/cube/test_build.py::test_dask_bag_fusing
    """
    if driver_name == "dask_dataframe":
        pytest.skip("not relevant for dask.dataframe")

    n_partitions = 4
    partition_size = 1 if driver_name == "dask_bag_bs1" else 3

    def make_frames(i):
        # One pair of dataset frames for partition value ``i``.
        xs = [2 * i, 2 * i + 1]
        return {
            "a": pd.DataFrame({"x": xs, "p": i, "v3": 42}),
            "b": pd.DataFrame({"x": list(xs), "p": i, "v4": 1337}),
        }

    dfs = [make_frames(i) for i in range(partition_size * n_partitions)]

    cube = Cube(
        dimension_columns=["x"],
        partition_columns=["p"],
        uuid_prefix="cube",
        seed_dataset="source",
    )

    counted = db.from_sequence(dfs, partition_size=partition_size).map(
        _count_execution_to_store, store=function_store)
    bag = extend_cube_from_bag(
        data=counted,
        cube=cube,
        store=function_store,
        ktk_cube_dataset_ids=["a", "b"],
    )

    graph = dask.optimize(bag)[0].__dask_graph__()
    task_keys = {key for key, value in graph.items() if dask.core.istask(value)}
    assert len(task_keys) == (n_partitions + 1)
def convert_dask_collection(dc):
    """
    Convert a dask collection into a mars.core.Object via the remote API.

    Parameters
    ----------
    dc: dask collection
        Dask collection object to be converted.

    Returns
    -------
    Object
        Mars Object.

    Raises
    ------
    TypeError
        If ``dc`` is not a dask collection.
    ValueError
        If the first key of the optimized graph is neither ``str`` nor
        ``tuple``.
    """
    if not is_dask_collection(dc):
        raise TypeError(
            f"'{type(dc).__name__}' object is not a valid dask collection")

    dc.__dask_graph__().validate()
    dsk = optimize(dc)[0].__dask_graph__()

    first_key = next(iter(dsk.keys()))
    if isinstance(first_key, tuple):
        # Gather every key sharing the first key's name, ordered by index.
        same_name = [k for k in dsk.keys() if k[0] == first_key[0]]
        key = sorted(same_name, key=lambda k: k[1])
    elif isinstance(first_key, str):
        key = [first_key]
    else:
        raise ValueError(
            f"Dask collection object seems be broken, with unexpected key type:'{type(first_key).__name__}'"
        )

    res = reduce(mars_dask_get(dsk, [key]))
    if not isinstance(dc, Bag):
        return res
    # Bags: wrap the result in a remote call that turns it into a list.
    return spawn(lambda x: list(x[0][0]), args=(res, ))
def _read_channel_stack(aics_img, cindex, number_of_time_points, nz, pixel_size):
    """Return ``(dask_array, scale)`` for channel ``cindex`` of scene 0.

    Time is stacked as a leading axis when there is more than one timepoint
    and the Z axis is kept only when there is more than one plane; ``scale``
    has one entry per axis of the returned array.
    """
    pixel_x, pixel_y, pixel_z = pixel_size
    if number_of_time_points > 1:
        if nz > 1:
            arr = da.stack([
                aics_img.get_image_dask_data('ZYX', S=0, C=cindex, T=tindex)
                for tindex in range(number_of_time_points)
            ])
            return arr, [1, pixel_z, pixel_y, pixel_x]
        arr = da.stack([
            aics_img.get_image_dask_data('YX', S=0, C=cindex, T=tindex, Z=0)
            for tindex in range(number_of_time_points)
        ])
        return arr, [1, pixel_y, pixel_x]
    if nz > 1:
        arr = aics_img.get_image_dask_data('ZYX', S=0, C=cindex, T=0)
        return arr, [pixel_z, pixel_y, pixel_x]
    arr = aics_img.get_image_dask_data('YX', S=0, C=cindex, T=0, Z=0)
    return arr, [pixel_y, pixel_x]


def reader_function(path):
    """Take a path or list of paths and return a list of LayerData tuples.

    Readers are expected to return data as a list of tuples, where each tuple
    is (data, [add_kwargs, [layer_type]]), "add_kwargs" and "layer_type" are
    both optional.

    Parameters
    ----------
    path : str or list of str
        Path to file, or list of paths.

    Returns
    -------
    layer_data : list of tuples
        A list of LayerData tuples where each tuple in the list contains
        (data, metadata, layer_type), where data is a dask array, metadata is
        a dict of keyword arguments for the corresponding viewer.add_* method
        in napari, and layer_type is a lower-case string naming the type of
        layer. One tuple is produced per channel.
    """
    print("reading file ", path)
    aics_img = AICSImage(path)

    # dims are normally in "STCZYX"
    number_of_channels = aics_img.size_c
    number_of_time_points = aics_img.size_t
    nz = aics_img.size_z
    ny = aics_img.size_y
    nx = aics_img.size_x
    name_of_channels = aics_img.get_channel_names()
    pixel_x, pixel_y, pixel_z = aics_img.get_physical_pixel_size()
    pixel_size = (pixel_x, pixel_y, pixel_z)
    scale = [1, pixel_z, pixel_y, pixel_x]

    print("number_of_channels", number_of_channels)
    print("number_of_time_points", number_of_time_points)
    print("name_of_channels", name_of_channels)
    print("scale", scale)
    print("nz", nz)

    channel_dict = {}
    if number_of_channels > 1:
        print("number_of_channels > 1")
        for cindex, channel_name in enumerate(name_of_channels):
            if number_of_time_points > 1:
                print("number_of_time_points > 1")
            elif nz <= 1:
                print("number_of_time_points < 1")
                print("nz < 1")
                print("cindex: ", cindex)
                print("channel_name: ", channel_name)
            arr, scale = _read_channel_stack(
                aics_img, cindex, number_of_time_points, nz, pixel_size)
            print("arr.shape", arr.shape)
            channel_dict[channel_name] = {
                "data": dask.optimize(arr)[0],
                "colormap": color_maps[cindex % len(color_maps)]
            }
    else:
        # This branch used to reference ``channel_name`` without ever binding
        # it (NameError). Use the reported name of the only channel, with a
        # positional fallback in case no names were reported.
        channel_name = name_of_channels[0] if name_of_channels else "channel_0"
        arr, scale = _read_channel_stack(
            aics_img, 0, number_of_time_points, nz, pixel_size)
        channel_dict[channel_name] = {
            "data": dask.optimize(arr)[0],
            "colormap": color_maps[0]
        }

    layer_list = []
    for channel_name, channel in channel_dict.items():
        print("creating layer channel_name", channel_name)
        add_kwargs = {
            "name": channel_name,
            "blending": 'additive',
            "rendering": "mip",
            "scale": scale,
            "colormap": channel['colormap']
        }
        print("channel['data'].shape", channel['data'].shape)
        layer_list.append((
            channel['data'],  # data
            add_kwargs,  # kwargs
            "image"  # layer type
        ))
    return layer_list
def test_delayed_optimize():
    """Tasks a Delayed's key does not need are gone after ``dask.optimize``."""
    tasks = {"a": 1, "b": (inc, "a"), "c": (inc, "b")}
    (optimized,) = dask.optimize(Delayed("b", tasks))
    # Delayed's __dask_optimize__ culls out 'c'
    kept = sorted(optimized.dask.keys())
    assert kept == ["a", "b"]
def est_motion_part(varr: darr.Array, npart: int, chunk_nfm: int, alt_error=5, **kwargs) -> Tuple[darr.Array, darr.Array]:
    """
    Construct dask graph for the recursive motion estimation algorithm.

    A first pass runs `est_motion_chunk` on every block of `varr`. The
    resulting templates and shifts are then merged, `npart` blocks at a time,
    by further `est_motion_chunk` calls until a single template remains.

    Parameters
    ----------
    varr : darr.Array
        Input dask array representing movie data.
    npart : int
        Number of frames/chunks to combine for the recursive algorithm.
    chunk_nfm : int
        Number of frames in each parallel task. If `None`, the current chunk
        size of `varr` along the first axis is used.
    alt_error : int, optional
        Error threshold between estimated shifts from two alternative methods,
        specified in pixels. By default `5`.
    **kwargs
        Forwarded to `est_motion_chunk`. A truthy `mesh_size` entry
        additionally changes the declared shape of the shifts.

    Returns
    -------
    temps : darr.Array
        Registration template for the movie.
    shifts : darr.Array
        Estimated motion.

    See Also
    --------
    estimate_motion
    """
    if chunk_nfm is None:
        chunk_nfm = varr.chunksize[0]
    # Chunk along the first axis only; the other two axes are left as they are.
    varr = varr.rechunk((chunk_nfm, None, None))
    arr_opt = fct.partial(custom_arr_optimize, keep_patterns=["^est_motion_chunk"])
    if kwargs.get("mesh_size", None):
        # NOTE: this computes the first frame eagerly while building the graph.
        param = get_bspline_param(varr[0].compute(), kwargs["mesh_size"])
    # First pass: one delayed `est_motion_chunk` call per block of the movie.
    tmp_ls = []
    sh_ls = []
    for blk in varr.blocks:
        res = da.delayed(est_motion_chunk)(blk, None, alt_error=alt_error, npart=npart, **kwargs)
        # With a truthy `alt_error` the template is declared with a leading
        # axis of length 3; otherwise it is a single 2-D frame.
        if alt_error:
            tmp = darr.from_delayed(res[0], shape=(3, blk.shape[1], blk.shape[2]), dtype=blk.dtype)
        else:
            tmp = darr.from_delayed(res[0], shape=(blk.shape[1], blk.shape[2]), dtype=blk.dtype)
        if kwargs.get("mesh_size", None):
            # Shifts get two extra trailing axes sized from `param`
            # (presumably the b-spline mesh dimensions -- TODO confirm).
            sh = darr.from_delayed(
                res[1],
                shape=(blk.shape[0], 2, int(param[1]), int(param[0])),
                dtype=float,
            )
        else:
            sh = darr.from_delayed(res[1], shape=(blk.shape[0], 2), dtype=float)
        tmp_ls.append(tmp)
        sh_ls.append(sh)
    # Only the first-pass graphs are optimized with the custom optimizer.
    with da.config.set(array_optimize=arr_opt):
        temps = da.optimize(darr.stack(tmp_ls, axis=0))[0]
        shifts = da.optimize(darr.concatenate(sh_ls, axis=0))[0]
    # Recursive passes: merge `npart` blocks at a time until one template is left.
    while temps.shape[0] > 1:
        tmp_ls = []
        sh_ls = []
        for idx in np.arange(0, temps.numblocks[0], npart):
            tmps = temps.blocks[idx:idx + npart]
            sh_org = shifts.blocks[idx:idx + npart]
            # Pass the existing shifts block by block.
            sh_org_ls = [sh_org.blocks[i] for i in range(sh_org.numblocks[0])]
            res = da.delayed(est_motion_chunk)(tmps, sh_org_ls, alt_error=alt_error, npart=npart, **kwargs)
            if alt_error:
                tmp = darr.from_delayed(res[0], shape=(3, tmps.shape[1], tmps.shape[2]), dtype=tmps.dtype)
            else:
                tmp = darr.from_delayed(res[0], shape=(tmps.shape[1], tmps.shape[2]), dtype=tmps.dtype)
            # Updated shifts keep the shape and dtype of the shifts they replace.
            sh_new = darr.from_delayed(res[1], shape=sh_org.shape, dtype=sh_org.dtype)
            tmp_ls.append(tmp)
            sh_ls.append(sh_new)
        temps = darr.stack(tmp_ls, axis=0)
        shifts = darr.concatenate(sh_ls, axis=0)
    return temps, shifts
def time_optimize_getitem(self):
    """Time ``dask.optimize`` on a dataframe read lazily from parquet."""
    frame = dd.read_parquet("data.parquet", engine="pyarrow")
    # Only the optimization is of interest; the result is discarded.
    _ = dask.optimize(frame)
def load_videos(
    vpath: str,
    pattern=r"msCam[0-9]+\.avi$",
    dtype: Union[str, type] = np.float64,
    downsample: Optional[dict] = None,
    downsample_strategy="subset",
    post_process: Optional[Callable] = None,
) -> xr.DataArray:
    """
    Load every matching video in a folder as one `xr.DataArray`.

    Files in `vpath` whose names match the regex `pattern` are sorted with
    :func:`natsort.natsorted`, loaded lazily and concatenated along the frame
    axis. The result can optionally be cast, downsampled and passed through a
    user-supplied post-processing callable.

    Parameters
    ----------
    vpath : str
        The folder containing the videos to load.
    pattern : regexp, optional
        Regexp matched against the filenames. The default matches names
        containing "msCam" followed by at least one digit and ending in
        ".avi".
    dtype : Union[str, type], optional
        Datatype of the resulting DataArray, by default `np.float64`.
    downsample : dict, optional
        Mapping from dimension name ("height", "width" or "frame") to an
        integer downsampling factor. By default `None`.
    downsample_strategy : str, optional
        Only used when `downsample` is given. `"subset"` keeps one data point
        per interval; `"mean"` averages the data within each interval. By
        default `"subset"`.
    post_process : Callable, optional
        Custom function called as `post_process(varr, vpath, vlist,
        varr_list)` with the resulting DataArray, the normalized input path,
        the list of matched file paths and the list of per-file arrays before
        concatenation. It must return a valid DataArray. By default `None`.

    Returns
    -------
    varr : xr.DataArray
        The array representation of the input movie, with dimensions
        ("frame", "height", "width").

    Raises
    ------
    FileNotFoundError
        if no files under `vpath` match the pattern `pattern`
    ValueError
        if the first matched file does not have extension ".avi", ".mkv" or
        ".tif"
    NotImplementedError
        if `downsample_strategy` is not "subset" or "mean"
    """
    vpath = os.path.normpath(vpath)
    matched_names = [name for name in os.listdir(vpath) if re.search(pattern, name)]
    vlist = natsorted([vpath + os.sep + name for name in matched_names])
    if not vlist:
        raise FileNotFoundError(
            "No data with pattern {} found in the specified folder {}".format(
                pattern, vpath
            )
        )
    print("loading {} videos in folder {}".format(len(vlist), vpath))

    # The loader is picked from the extension of the first file only.
    loaders = {
        ".avi": load_avi_lazy,
        ".mkv": load_avi_lazy,
        ".tif": load_tif_lazy,
    }
    file_extension = os.path.splitext(vlist[0])[1]
    movie_load_func = loaders.get(file_extension)
    if movie_load_func is None:
        raise ValueError("Extension not supported.")

    varr_list = [movie_load_func(v) for v in vlist]
    stacked = darr.concatenate(varr_list, axis=0)
    n_frame, n_height, n_width = stacked.shape[0], stacked.shape[1], stacked.shape[2]
    varr = xr.DataArray(
        stacked,
        dims=["frame", "height", "width"],
        coords={
            "frame": np.arange(n_frame),
            "height": np.arange(n_height),
            "width": np.arange(n_width),
        },
    )
    if dtype:
        varr = varr.astype(dtype)
    if downsample:
        if downsample_strategy == "subset":
            strides = {d: slice(None, None, w) for d, w in downsample.items()}
            varr = varr.isel(**strides)
        elif downsample_strategy == "mean":
            varr = varr.coarsen(**downsample, boundary="trim", coord_func="min").mean()
        else:
            raise NotImplementedError("unrecognized downsampling strategy")
    varr = varr.rename("fluorescence")
    if post_process:
        varr = post_process(varr, vpath, vlist, varr_list)
    # Optimize the graph with the custom optimizer, keeping the loader tasks.
    arr_opt = fct.partial(custom_arr_optimize, keep_patterns=["^load_avi_ffmpeg"])
    with da.config.set(array_optimize=arr_opt):
        varr = da.optimize(varr)[0]
    return varr