def stretch_linear(self, cutoffs=(0.005, 0.005)):
    """Stretch linearly the contrast of the current image.

    Use *cutoffs* for left and right trimming.
    """
    logger.debug("Perform a linear contrast stretch.")
    logger.debug("Calculate the histogram quantiles: ")
    logger.debug("Left and right quantiles: " +
                 str(cutoffs[0]) + " " + str(cutoffs[1]))
    cutoff_type = np.float64
    # numpy percentile (which quantile calls) returns 64-bit floats
    # unless the value is a higher order float
    if np.issubdtype(self.data.dtype, np.floating) and \
            np.dtype(self.data.dtype).itemsize > 8:
        cutoff_type = self.data.dtype
    left, right = dask.delayed(self._compute_quantile, nout=2)(
        self.data.data, self.data.dims, cutoffs)
    left_data = da.from_delayed(left,
                                shape=(self.data.sizes['bands'],),
                                dtype=cutoff_type)
    left = xr.DataArray(left_data, dims=('bands',),
                        coords={'bands': self.data['bands']})
    right_data = da.from_delayed(right,
                                 shape=(self.data.sizes['bands'],),
                                 dtype=cutoff_type)
    right = xr.DataArray(right_data, dims=('bands',),
                         coords={'bands': self.data['bands']})
    self.crude_stretch(left, right)
def change_dtype(data: list, output_dtype: str, offset: np.array) -> list:
    """Lazy histogram-preserving datatype adjustment of a collection of
    array-likes.

    Signed datatypes (int8, int16) are converted to their unsigned
    counterparts (uint8, uint16) by upcasting to a signed type with higher
    precision, shifting all values by a constant, then downcasting to the
    final unsigned datatype. The resulting arrays have a global minimum of
    0, with the original min-max distance preserved.
    """
    @dask.delayed
    def adjuster(arr, upcast, offset, dtype):
        assert arr.ndim == offset.ndim
        return (arr.astype(upcast) - offset).astype(dtype)

    if output_dtype == "same":
        return data
    elif output_dtype == "uint8":
        assert (data[0].dtype == "int8") or (data[0].dtype == ">i1")
        upcast = "int16"
    elif output_dtype == "uint16":
        assert (data[0].dtype == "int16") or (data[0].dtype == ">i2")
        upcast = "int32"
    else:
        # guard against falling through with `upcast` undefined
        raise ValueError("Unsupported output_dtype: %s" % output_dtype)

    return [
        da.from_delayed(
            adjuster(d, upcast=upcast, offset=offset.reshape(-1, 1, 1),
                     dtype=output_dtype),
            dtype=output_dtype,
            shape=d.shape,
        )
        for d in data
    ]
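# Usage sketch (not from the original source): shift a lazy int8 stack into
# uint8 while preserving relative spacing. The per-channel `offset` is
# assumed to be the global minimum of each channel; change_dtype reshapes it
# to broadcast over (channel, y, x) stacks.
import dask.array as da
import numpy as np

stacks = [da.from_array(np.arange(-4, 4, dtype="int8").reshape(2, 2, 2))]
offset = np.array([-4, -4])  # hypothetical per-channel minima
out = change_dtype(stacks, "uint8", offset)
assert out[0].compute().min() == 0 and str(out[0].dtype) == "uint8"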
def transform(self, X):
    """Transform a sequence of documents to a document-term matrix.

    Transformation is done in parallel, and correctly handles dask
    collections.

    Parameters
    ----------
    X : dask.Bag of raw text documents, length = n_samples
        Samples. Each sample must be a text document (either bytes or
        unicode strings, file name or file object depending on the
        constructor argument) which will be tokenized and hashed.

    Returns
    -------
    X : dask.array.Array, shape = (n_samples, self.n_features)
        Document-term matrix. Each block of the array is a scipy sparse
        matrix.

    Notes
    -----
    The returned dask Array is composed of scipy sparse matrices. If you
    need to compute on the result immediately, you may need to convert
    the individual blocks to ndarrays or pydata/sparse matrices.

    >>> import sparse
    >>> X.map_blocks(sparse.COO.from_scipy_sparse)  # doctest: +SKIP

    See the :doc:`examples/text-vectorization` for more.
    """
    transformer = super(HashingVectorizer, self).transform
    msg = "'X' should be a 1-dimensional array with length 'num_samples'."

    if not dask.is_dask_collection(X):
        return transformer(X)

    if isinstance(X, db.Bag):
        bag2 = X.map_partitions(transformer)
        objs = bag2.to_delayed()
        arrs = [
            da.from_delayed(obj, (np.nan, self.n_features), self.dtype)
            for obj in objs
        ]
        result = da.concatenate(arrs, axis=0)
    elif isinstance(X, dd.Series):
        result = X.map_partitions(transformer)
    elif isinstance(X, da.Array):
        # dask.Array
        chunks = ((np.nan,) * X.numblocks[0], (self.n_features,))
        if X.ndim == 1:
            result = X.map_blocks(
                transformer, dtype="f8", chunks=chunks, new_axis=1
            )
        else:
            raise ValueError(msg)
    else:
        raise ValueError(msg)

    return result
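# Usage sketch (not from the original source): hashing a small dask Bag of
# raw documents. The import path assumes the dask-ml package this method
# belongs to.
import dask.bag as db
from dask_ml.feature_extraction.text import HashingVectorizer

docs = db.from_sequence(["the quick brown fox", "lazy dogs sleep"],
                        npartitions=2)
X = HashingVectorizer(n_features=2 ** 10).transform(docs)
print(X.shape)  # (nan, 1024): the row count stays unknown until computed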
def test_clock_tec_solve_dask():
    np.random.seed(1234)
    import pylab as plt
    times = np.arange(2)
    freqs = np.linspace(110e6, 170e6, 1000)
    cs = np.array([1, 1])
    tec = np.array([0.1, 0.2])
    delay = np.ones(len(times)) * 2e-9  # 2 ns
    phase = np.multiply.outer(np.ones(len(freqs)), cs) \
        + 8.44797256e-7 * TECU * np.multiply.outer(1. / freqs, tec) \
        + 2. * np.pi * np.multiply.outer(freqs, delay)
    phase += 15 * np.pi / 180. * np.random.normal(
        size=[len(freqs), len(times)])
    #plt.imshow(phase, origin='lower',
    #           extent=(times[0], times[-1], freqs[0], freqs[-1]),
    #           aspect='auto')
    #plt.colorbar()
    #plt.xlabel('times (s)')
    #plt.ylabel('freqs (Hz)')
    #plt.show()
    m, cov = least_squares_solve(phase, freqs, times, 15, Ct_ratio=0.01)
    m_exact = np.array([delay, tec, cs]).T
    import dask.array as da
    solsMH = [
        da.from_delayed(clock_tec_solve_dask(phase[:, i], freqs, m[i, :],
                                             cov[i, :, :], 15, 0.01),
                        shape=(3,), dtype=np.double)
        for i in range(len(times))
    ]
    sol_stacked = da.stack(solsMH, axis=0)
    sol = sol_stacked.compute()
    print(sol)
def arrays_from_delayed(args, shapes=None, dtypes=None):
    """
    Parameters
    ----------
    args: a collection of dask.delayed objects representing lazy-loaded
        arrays.

    shapes: a collection of tuples specifying the shape of each array in
        args, or None. If None, the first array will be loaded using local
        computation, and the shape of that array will be used for all
        subsequent arrays.

    dtypes: a collection of strings specifying the datatype of each array
        in args, or None. If None, the first array will be loaded using
        local computation and the dtype of that array will be used for all
        subsequent arrays.

    Returns
    -------
    a list of dask arrays.
    """
    if shapes is None or dtypes is None:
        sample = args[0].compute(scheduler="threads")
        if shapes is None:
            shapes = (sample.shape,) * len(args)
        if dtypes is None:
            dtypes = (sample.dtype,) * len(args)

    assert len(shapes) == len(args) and len(dtypes) == len(args)

    arrays = [
        da.from_delayed(args[ind], shape=shapes[ind], dtype=dtypes[ind])
        for ind in range(len(args))
    ]
    return arrays
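# Usage sketch (not from the original source): wrap a few delayed loaders as
# dask arrays, letting the helper infer shape/dtype from the first element.
import dask
import dask.array as da
import numpy as np

loaders = [dask.delayed(np.full)((4, 4), i) for i in range(3)]
arrays = arrays_from_delayed(loaders)
stacked = da.stack(arrays)  # lazy (3, 4, 4) stack
assert stacked.compute().shape == (3, 4, 4)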
def test_bag_array_conversion():
    import dask.bag as db
    b = db.range(10, npartitions=1)
    x, = b.map_partitions(np.asarray).to_delayed()
    x, = [da.from_delayed(a, shape=(10,), dtype=int) for a in [x]]
    z = da.concatenate([x])
    assert_eq(z, np.arange(10), check_graph=False)
def _band_hist(band_data):
    cdf = da.arange(0., 1., 1. / nwidth, chunks=nwidth)
    if approximate:
        # need a 1D array
        flat_data = band_data.ravel()
        # replace with nanpercentile in the future, if available
        # dask < 0.17 returns all NaNs for this
        bins = da.percentile(flat_data[da.notnull(flat_data)], cdf * 100.)
    else:
        bins = dask.delayed(np.nanpercentile)(band_data, cdf * 100.)
        bins = da.from_delayed(bins, shape=(nwidth,), dtype=cdf.dtype)
    res = dask.delayed(np.interp)(band_data, bins, cdf)
    res = da.from_delayed(res, shape=band_data.shape, dtype=band_data.dtype)
    return res
def _map_all(self, function, inplace=True, **kwargs):
    calc_result = dd(function)(self.data, **kwargs)
    if inplace:
        self.data = da.from_delayed(calc_result,
                                    shape=self.data.shape,
                                    dtype=self.data.dtype)
        return None
    return self._deepcopy_with_new_data(calc_result)
def file_reader(filename, **kwds):
    """Read data from any format supported by PIL.

    Parameters
    ----------
    filename: str
    """
    dc = _read_data(filename)
    lazy = kwds.pop('lazy', False)
    if lazy:
        # load the image fully to check the dtype and shape, should be
        # cheap. Then store this info for later re-loading when required.
        from dask.array import from_delayed
        from dask import delayed
        val = delayed(_read_data, pure=True)(filename)
        dc = from_delayed(val, shape=dc.shape, dtype=dc.dtype)
    return [{'data': dc,
             'metadata': {
                 'General': {
                     'original_filename': os.path.split(filename)[1]},
                 'Signal': {'signal_type': "",
                            'record_by': 'image'},
             }}]
def _get_measurement(datasources, geobox, resampling, no_data, dtype,
                     fuse_func=None):
    """Gets the measurement array of a band of data."""
    # pylint: disable=broad-except, protected-access

    def copyto_fuser(dest, src):
        """
        :type dest: numpy.ndarray
        :type src: numpy.ndarray
        """
        where_nodata = (dest == no_data) if not numpy.isnan(no_data) \
            else numpy.isnan(dest)
        numpy.copyto(dest, src, where=where_nodata)
        return dest

    fuse_func = fuse_func or copyto_fuser
    destination = _make_destination(geobox.shape, no_data, dtype)
    for source in datasources:
        buffer = delayed(_read_file)(source, geobox,
                                     band=source.get_bandnumber(),
                                     no_data=no_data,
                                     resampling=resampling)
        destination = delayed(fuse_func)(destination, buffer)
    return da.from_delayed(destination, geobox.shape, dtype)
def get_dataset(self, key, info):
    """Load a dataset."""
    if self._channel != key.name:
        return
    logger.debug('Reading %s.', key.name)
    # FIXME: get this from MTD_MSIL1C.xml
    quantification_value = 10000.
    jp2 = glymur.Jp2k(self.filename)
    bitdepth = 0
    for seg in jp2.codestream.segment:
        try:
            bitdepth = max(bitdepth, seg.bitdepth[0])
        except AttributeError:
            pass

    jp2.dtype = (np.uint8 if bitdepth <= 8 else np.uint16)

    # Initialize the jp2 reader / doesn't work in a multi-threaded context.
    # jp2[0, 0]
    # data = da.from_array(jp2, chunks=CHUNK_SIZE) / quantification_value * 100

    data = da.from_delayed(delayed(jp2.read)(), jp2.shape, jp2.dtype)
    data = data.rechunk(CHUNK_SIZE) / quantification_value * 100

    proj = DataArray(data, dims=['y', 'x'])
    proj.attrs = info.copy()
    proj.attrs['units'] = '%'
    proj.attrs['platform_name'] = self.platform_name
    return proj
def get_da_images(files, which="data", shape=ZTF_IMAGE_SHAPE, dtype="float32"):
    """Get a stacked dask array of the ziff images you want.

    = Works only with single ziff =
    """
    lazy_arrays = [dask.delayed(get_ziff_single_image)(f_, which=which)
                   for f_ in files]
    lazy_arrays = [da.from_delayed(x_, shape=shape, dtype=dtype)
                   for x_ in lazy_arrays]
    return da.stack(lazy_arrays)
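# Usage sketch (not from the original source): build a lazy image cube from a
# list of ziff files and reduce it without loading everything into memory.
# `files` is a placeholder for paths understood by get_ziff_single_image.
cube = get_da_images(files, which="data")  # shape (n_files, *ZTF_IMAGE_SHAPE)
mean_image = cube.mean(axis=0)             # still lazy
result = mean_image.compute()              # triggers the delayed reads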
def fit(self, X, y=None):
    q_min, q_max = self.quantile_range
    if not 0 <= q_min <= q_max <= 100:
        raise ValueError("Invalid quantile range: %s" %
                         str(self.quantile_range))

    if isinstance(X, dd.DataFrame):
        n_columns = len(X.columns)
        partition_lengths = X.map_partitions(len).compute()
        dtype = np.find_common_type(X.dtypes, [])
        blocks = X.to_delayed()
        X = da.vstack(
            [
                da.from_delayed(block.values, shape=(length, n_columns),
                                dtype=dtype)
                for block, length in zip(blocks, partition_lengths)
            ]
        )

    quantiles = [da.percentile(col, [q_min, 50.0, q_max]) for col in X.T]
    quantiles = da.vstack(quantiles).compute()
    self.center_ = quantiles[:, 1]
    self.scale_ = quantiles[:, 2] - quantiles[:, 0]
    self.scale_ = skdata._handle_zeros_in_scale(self.scale_, copy=False)
    return self
def get_data_lazy(image: ImageWrapper, c_index: int = 0) -> da.Array:
    """Get n-dimensional dask array, with delayed reading from OMERO image."""
    size_z = image.getSizeZ()
    size_t = image.getSizeT()
    size_x = image.getSizeX()
    size_y = image.getSizeY()
    pixels = image.getPrimaryPixels()

    @delayed
    @timer
    def get_plane(plane_name):
        z, c, t = [int(n) for n in plane_name.split(",")]
        p = pixels.getPlane(z, c, t)
        return p

    dtype = PIXEL_TYPES.get(pixels.getPixelsType().value, None)
    plane_names = [f"{z},{c_index},{t}"
                   for t in range(size_t)
                   for z in range(size_z)]
    lazy_arrays = [get_plane(pn) for pn in plane_names]
    dask_arrays = [
        da.from_delayed(delayed_reader, shape=(size_y, size_x), dtype=dtype)
        for delayed_reader in lazy_arrays
    ]
    # Stack into one large dask.array
    if size_z == 1 or size_t == 1:
        return da.stack(dask_arrays, axis=0)

    z_stacks = []
    for t in range(size_t):
        z_stacks.append(
            da.stack(dask_arrays[t * size_z:(t + 1) * size_z], axis=0))
    stack = da.stack(z_stacks, axis=0)
    return stack
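# A minimal, self-contained sketch of the same pattern (not from the original
# source): the OMERO plane fetch is replaced by a synthetic reader to show
# how per-plane delayed calls become one (t, z, y, x) dask array.
import dask
import dask.array as da
import numpy as np

size_t, size_z, size_y, size_x = 2, 3, 8, 8

@dask.delayed
def fake_get_plane(z, t):
    return np.full((size_y, size_x), t * 10 + z, dtype=np.uint16)

planes = [
    da.from_delayed(fake_get_plane(z, t), shape=(size_y, size_x),
                    dtype=np.uint16)
    for t in range(size_t) for z in range(size_z)
]
stack = da.stack([da.stack(planes[t * size_z:(t + 1) * size_z])
                  for t in range(size_t)])
assert stack.shape == (size_t, size_z, size_y, size_x)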
def _split(self, test_start, test_stop, n_samples, chunks, seeds):
    train_objs = []
    test_objs = []
    train_sizes = []
    test_sizes = []

    offset = 0
    for chunk, seed in zip(chunks, seeds):
        start, stop = offset, offset + chunk

        test_id_start = max(test_start, start)
        test_id_stop = min(test_stop, stop)
        if test_id_start < test_id_stop:
            test_objs.append(
                dask.delayed(_generate_offset_idx)(chunk, test_id_start,
                                                   test_id_stop, offset,
                                                   seed))
            test_sizes.append(test_id_stop - test_id_start)

        train_id_stop = min(test_id_start, stop)
        if train_id_stop > start:
            train_objs.append(
                dask.delayed(_generate_offset_idx)(chunk, start,
                                                   train_id_stop, offset,
                                                   seed))
            train_sizes.append(train_id_stop - start)

        train_id_start = max(test_id_stop, start)
        if train_id_start < stop:
            train_objs.append(
                dask.delayed(_generate_offset_idx)(chunk, train_id_start,
                                                   stop, offset, seed))
            train_sizes.append(stop - train_id_start)

        offset = stop

    train_idx = da.concatenate([
        da.from_delayed(obj, (train_size,), np.dtype("int"))
        for obj, train_size in zip(train_objs, train_sizes)
    ])
    test_idx = da.concatenate([
        da.from_delayed(obj, (test_size,), np.dtype("int"))
        for obj, test_size in zip(test_objs, test_sizes)
    ])

    return train_idx, test_idx
def compute(self, data, cache_id=None, fill_value=0, weight_count=10000,
            weight_min=0.01, weight_distance_max=1.0,
            weight_delta_max=1.0, weight_sum_min=-1.0,
            maximum_weight_mode=False, grid_coverage=0, **kwargs):
    """Resample the data according to the precomputed X/Y coordinates."""
    rows = self.cache["rows"]
    cols = self.cache["cols"]

    # if the data is scan based then check its metadata or the passed
    # kwargs otherwise assume the entire input swath is one large
    # "scanline"
    rows_per_scan = kwargs.get(
        'rows_per_scan', data.attrs.get("rows_per_scan", data.shape[0]))

    if data.ndim == 3 and 'bands' in data.dims:
        data_in = tuple(data.sel(bands=band).data
                        for band in data['bands'])
    elif data.ndim == 2:
        data_in = data.data
    else:
        raise ValueError("Unsupported data shape for EWA resampling.")

    res = dask.delayed(self._call_fornav)(
        cols, rows, self.target_geo_def, data_in,
        grid_coverage=grid_coverage,
        rows_per_scan=rows_per_scan, weight_count=weight_count,
        weight_min=weight_min, weight_distance_max=weight_distance_max,
        weight_delta_max=weight_delta_max, weight_sum_min=weight_sum_min,
        maximum_weight_mode=maximum_weight_mode)
    if isinstance(data_in, tuple):
        new_shape = (len(data_in),) + self.target_geo_def.shape
    else:
        new_shape = self.target_geo_def.shape
    data_arr = da.from_delayed(res, new_shape, data.dtype)
    # from_delayed creates one large chunk, break it up a bit if we can
    data_arr = data_arr.rechunk([CHUNK_SIZE] * data_arr.ndim)
    if data.ndim == 3 and data.dims[0] == 'bands':
        dims = ('bands', 'y', 'x')
    elif data.ndim == 2:
        dims = ('y', 'x')
    else:
        dims = data.dims

    res = xr.DataArray(data_arr, dims=dims, attrs=data.attrs.copy())
    return update_resampled_coords(data, res, self.target_geo_def)
def dask_from_mov(path):
    vid = imageio.get_reader(path, 'ffmpeg')
    shape = vid.get_meta_data()['size'][::-1] + (3,)
    lazy_imread = delayed(vid.get_data)
    return da.stack([
        da.from_delayed(lazy_imread(i), shape=shape, dtype=np.uint8)
        for i in range(vid.count_frames())
    ])
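# Usage sketch (not from the original source): lazily open a movie and
# compute a per-frame mean brightness without decoding all frames up front.
# 'movie.mov' is a placeholder path.
frames = dask_from_mov("movie.mov")       # (n_frames, h, w, 3) uint8
brightness = frames.mean(axis=(1, 2, 3))  # one small value per frame
print(brightness.compute())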
def make_da(delayed_list, length):
    sample = delayed_list[0].compute()
    arrays = [
        da.from_delayed(item, dtype=sample.dtype, shape=sample.shape)
        for item in delayed_list
    ]
    result = da.concatenate(arrays, axis=0)[:length]
    return result
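# Usage sketch (not from the original source): concatenate equally-shaped
# delayed chunks and trim to a known total length.
import dask
import numpy as np

chunks = [dask.delayed(np.arange)(4) for _ in range(3)]
arr = make_da(chunks, length=10)  # 3 x 4 elements available, keep first 10
assert arr.compute().shape == (10,)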
def get_lazy_arrays(glob_filenames, imread_sample):
    lazy_arrays = [dask.delayed(imread)(fn) for fn in glob_filenames]
    lazy_arrays = [
        da.from_delayed(x, shape=imread_sample.shape,
                        dtype=imread_sample.dtype)
        for x in lazy_arrays
    ]
    return lazy_arrays
def as_known(X, lengths):
    blocks = X.to_delayed().flatten()
    P = X.shape[1]
    arrays = [
        da.from_delayed(x, dtype=X.dtype, shape=(length, P))
        for x, length in zip(blocks, lengths)
    ]
    return da.concatenate(arrays, axis=0)
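# Usage sketch (not from the original source): rebuild a 2-D dask array with
# unknown chunk lengths (e.g. after boolean filtering) into one with known
# chunks, using per-block lengths computed up front.
import dask
import dask.array as da
import numpy as np

x = da.from_array(np.arange(20).reshape(10, 2), chunks=(4, 2))
filtered = x[x[:, 0] > 6]  # row-chunk sizes become unknown (nan)
blocks = filtered.to_delayed().flatten()
lengths = dask.compute(*[dask.delayed(lambda b: b.shape[0])(blk)
                         for blk in blocks])
known = as_known(filtered, lengths)
assert known.chunks[0] == tuple(lengths)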
def hdulists_keyword_to_dask_array(all_hduls, keyword, ext=0, dtype=float):
    arr = da.stack([
        da.from_delayed(_kw_to_0d_seq(hdul, ext, keyword),
                        shape=(), dtype=dtype)
        for hdul in all_hduls
    ])
    log.info(f"Header keyword {keyword} extracted to new {arr.shape} sequence")
    return arr
def image(self):
    """Returns a delayed dask call for fetching the image for a data
    point."""
    token = gbdx.gbdx_connection.access_token
    load = load_image(self.links["image"]["href"], token,
                      self.imshape, dtype=self.dtype)
    return da.from_delayed(load, shape=self.imshape, dtype=self.dtype)
def func(band_data, kernel=kernel, mode=mode, index=None):
    del index
    delay = dask.delayed(_three_d_effect_delayed)(band_data, kernel, mode)
    new_data = da.from_delayed(delay, shape=band_data.shape,
                               dtype=band_data.dtype)
    return new_data
def test_from_delayed_meta():
    def f():
        return sparse.COO.from_numpy(np.eye(3))

    d = dask.delayed(f)()
    x = da.from_delayed(d, shape=(3, 3),
                        meta=sparse.COO.from_numpy(np.eye(1)))
    assert isinstance(x._meta, sparse.COO)
    assert_eq(x, x)
def _graph_standard_degrid(vis_dataset, grid, briggs_factors, cgk_1D,
                           grid_parms):
    import dask
    import dask.array as da
    import xarray as xr
    import time
    import itertools

    # Getting data for gridding
    chan_chunk_size = \
        vis_dataset[grid_parms["imaging_weight_name"]].chunks[2][0]
    freq_chan = da.from_array(vis_dataset.coords['chan'].values,
                              chunks=(chan_chunk_size))

    n_chunks_in_each_dim = \
        vis_dataset[grid_parms["imaging_weight_name"]].data.numblocks
    chunk_indx = []

    iter_chunks_indx = itertools.product(
        np.arange(n_chunks_in_each_dim[0]),
        np.arange(n_chunks_in_each_dim[1]),
        np.arange(n_chunks_in_each_dim[2]),
        np.arange(n_chunks_in_each_dim[3]))
    #n_delayed = np.prod(n_chunks_in_each_dim)
    chunk_sizes = vis_dataset[grid_parms["imaging_weight_name"]].chunks
    n_chan_chunks_img = n_chunks_in_each_dim[2]
    list_of_degrids = []
    list_of_sum_weights = []
    list_of_degrids = ndim_list(n_chunks_in_each_dim)

    # Build graph
    for c_time, c_baseline, c_chan, c_pol in iter_chunks_indx:
        if grid_parms['chan_mode'] == 'cube':
            a_c_chan = c_chan
        else:
            a_c_chan = 0

        if grid_parms['do_imaging_weight']:
            sub_degrid = dask.delayed(
                _standard_imaging_weight_degrid_numpy_wrap)(
                grid.partitions[0, 0, a_c_chan, c_pol],
                vis_dataset[grid_parms["uvw_name"]]
                    .data.partitions[c_time, c_baseline, 0],
                vis_dataset[grid_parms["imaging_weight_name"]]
                    .data.partitions[c_time, c_baseline, c_chan, c_pol],
                briggs_factors.partitions[:, a_c_chan, c_pol],
                freq_chan.partitions[c_chan],
                dask.delayed(grid_parms))
            single_chunk_size = (chunk_sizes[0][c_time],
                                 chunk_sizes[1][c_baseline],
                                 chunk_sizes[2][c_chan],
                                 chunk_sizes[3][c_pol])
            list_of_degrids[c_time][c_baseline][c_chan][c_pol] = \
                da.from_delayed(sub_degrid, single_chunk_size,
                                dtype=np.double)
        else:
            print('Degridding of visibilities and psf still needs to be '
                  'implemented')
            #sub_grid_and_sum_weights = dask.delayed(_standard_grid_numpy_wrap)(
            #    vis_dataset[grid_parms["data"]].data.partitions[c_time, c_baseline, c_chan, c_pol],
            #    vis_dataset[grid_parms["uvw"]].data.partitions[c_time, c_baseline, 0],
            #    vis_dataset[grid_parms["imaging_weight"]].data.partitions[c_time, c_baseline, c_chan, c_pol],
            #    freq_chan.partitions[c_chan],
            #    dask.delayed(cgk_1D), dask.delayed(grid_parms))

    degrid = da.block(list_of_degrids)
    return degrid
def compute_gradient_dask(rays, g, dobs, i0, K_ne, m_tci, m_prior, CdCt,
                          sigma_m, Nkernel, size_cell, cov_obj=None):
    L_m = Nkernel * size_cell
    # #i not eq i0 mask
    # mask = np.ones(rays.shape[0], dtype=np.bool)
    # mask[i0] = False
    # rays = rays[mask, :, :, :, :]
    # g = g[mask, :, :]
    # dobs = dobs[mask, :, :]
    # CdCt = CdCt[mask, :, :]

    # residuals
    # g.shape, dobs.shape [Na, Nt, Nd]
    dd = g - dobs
    # weighted residuals
    # Cd.shape [Na, Nt, Nd] i.e. diagonal
    # CdCt^-1 = 1./CdCt
    dd /= (CdCt + 1e-15)
    # get ray info
    Na, Nt, Nd, _, Ns = rays.shape
    # if Na < Nd:
    #     # parallelize over antennas
    #     gradient = da.sum(da.stack([da.from_delayed(delayed(do_gradient)(
    #         rays[i, :, :, :, :], dd[i, :, :], K_ne, m_tci, sigma_m,
    #         Nkernel, size_cell), (m_tci.nx, m_tci.ny, m_tci.nz),
    #         dtype=np.double) for i in range(Na)], axis=-1), axis=-1)
    # else:
    #     # parallelize over directions
    #     gradient = da.sum(da.stack([da.from_delayed(delayed(do_gradient)(
    #         rays[:, :, d, :, :], dd[:, :, d], K_ne, m_tci, sigma_m,
    #         Nkernel, size_cell), (m_tci.nx, m_tci.ny, m_tci.nz),
    #         dtype=np.double) for d in range(Nd)], axis=-1), axis=-1)

    # parallelize over directions
    ne_tci = m_tci.copy()
    np.exp(ne_tci.M, out=ne_tci.M)
    ne_tci.M *= K_ne / TECU
    gradient = da.sum(da.stack([
        da.from_delayed(delayed(do_gradient)(
            rays[:, :, d, :, :], dd[:, :, d], ne_tci, sigma_m, Nkernel,
            size_cell, i0), (m_tci.nx, m_tci.ny, m_tci.nz),
            dtype=np.double)
        for d in range(Nd)
    ], axis=-1), axis=-1)
    gradient = gradient.compute(get=get)
    gradient -= gradient[i0, ...]
    if cov_obj is not None:
        dm = m_tci.M - m_prior
        # the original `gradient + cov_obj.contract(dm)` was a no-op;
        # the in-place add is clearly what was intended
        gradient += cov_obj.contract(dm)
    # gradient += m_tci.M
    # gradient -= m_prior
    return gradient
def interpret_raw_file_delayed(name, nx, ny, layers, dx, dy):
    """Use dask.delayed to lazily load a single output file.

    While this can be used as is, it is intended to be an internal
    function called by `open_mfdataset`.
    """
    d = dsa.from_delayed(delayed(interpret_raw_file)(name, nx, ny, layers),
                         (layers, ny + dy, nx + dx), float)
    return d
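# Usage sketch (not from the original source): lazily wrap several raw
# output files and stack them along a new time axis. The file names are
# hypothetical placeholders.
import dask.array as dsa

names = ["output.0000000000.data", "output.0000000100.data"]
snapshots = dsa.stack(
    [interpret_raw_file_delayed(n, nx=100, ny=100, layers=10, dx=0, dy=0)
     for n in names]
)  # lazy (2, 10, 100, 100) array; nothing is read until .compute()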
def array_images():
    custom_imread = dask.delayed(skimage.io.imread, pure=True)
    images = [
        custom_imread(
            '/Users/nivethamahalakshmibalasamy/Documents/ECI-PolarScience/'
            'dask_stuff/grayscale-xy-%d.png' % i)
        for i in range(1376, 1396)
    ]
    #print images
    # `sample` (an eagerly loaded reference image) is assumed to be defined
    # elsewhere; it supplies the shape/dtype for from_delayed.
    image_array = [
        da.from_delayed(i, sample.shape, sample.dtype) for i in images
    ]
    sizes = [j.shape for j in image_array]
    #print sizes
    stack = da.stack(image_array, axis=0)
    print(stack)
    #print stack[0]

    # Combining chunks - A chunk consists of 5 images
    stack = stack.rechunk((5, 2000, 2000))
    print("After rechunking: ")
    temp = stack
    #temp.visualize()
    print("Before distributing to workers:")
    print(stack.mean().compute())
    print(stack[1, :].compute())
    print(stack[19, :].mean().compute())
    stack.visualize()

    # Distribute array components over workers and centralized scheduler
    cluster = LocalCluster()
    client = Client(cluster)
    print(client)

    # Load the entire distributed array on the cluster (4 workers, 4 cores)
    stack = client.persist(stack)
    #print stack.shape
    #print "After distributing to workers: "
    print(stack.mean().compute())

    # map the otsu thresholding function
    #print stack[0]
    stack = da.map_blocks(otsu_thresholding, stack, chunks=(5, 2000, 2000),
                          dtype=sample.dtype)
    stack = da.map_blocks(blob_detection, stack, chunks=(5, 2000, 2000),
                          dtype=sample.dtype)
    stack = client.persist(stack)
    #th = client.persist(th)
    #thresholded.visualize()
    #th = client.persist(thresholded)
    #print thresholded.mean().compute()
    #print thresholded
    #print stack.shape
    print(stack.mean().compute())
    stack.visualize()
def read_prob_map(h5_path, array_info):
    shape, dtype = array_info
    data = delayed(read_h5)(h5_path)
    data = da.from_delayed(data, shape=shape, dtype=dtype,
                           name=os.path.basename(h5_path))
    return data
def _preprocess(self, collection, chunks=64, size=None):
    h, w = size
    images = [self.read_image(file, (h, w)) for file in collection]
    images = [da.from_delayed(image, shape=(h, w), dtype=numpy.uint8)
              for image in images]
    images = da.stack(images, axis=0)
    images = images.rechunk(chunks=(chunks, h, w))
    return images
def read_tiff(tiff_path, array_info):
    shape, dtype = array_info
    data = delayed(imageio.volread)(tiff_path)
    data = da.from_delayed(data, shape=shape, dtype=dtype,
                           name=os.path.basename(tiff_path))
    return data
def scatter_array(arr, dask_client):
    """Scatter a large numpy array into workers.

    Return the equivalent dask array.
    """
    future_arr = dask_client.scatter(arr)
    return da.from_delayed(future_arr, shape=arr.shape, dtype=arr.dtype,
                           meta=np.zeros_like(arr, shape=()))
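# Usage sketch (not from the original source): push one large array to the
# cluster once, then reuse it from many tasks without re-serializing it.
import numpy as np
import dask.array as da
from dask.distributed import Client

client = Client()  # local cluster for the demo
big = np.random.random((2000, 2000))
lazy = scatter_array(big, client)  # one future backing a dask array
print((lazy @ lazy.T).mean().compute())
client.close()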
def func(band_data, luts=luts, index=-1):
    # NaN/null values will become 0
    lut = luts[:, index] if len(luts.shape) == 2 else luts
    band_data = band_data.clip(0, lut.size - 1).astype(np.uint8)
    new_delay = dask.delayed(_lookup_delayed)(lut, band_data)
    new_data = da.from_delayed(new_delay, shape=band_data.shape,
                               dtype=luts.dtype)
    return new_data
def file_reader(filename, record_by=None, order=None, lazy=False,
                optimize=True):
    """Reads a DM3 file and loads the data into the appropriate class.

    data_id can be specified to load a given image within a DM3 file that
    contains more than one dataset.

    Parameters
    ----------
    record_by: Str
        One of: SI, Signal2D
    order : Str
        One of 'C' or 'F'
    lazy : bool, default False
        Load the signal lazily.
    %s
    """
    with open(filename, "rb") as f:
        dm = DigitalMicrographReader(f)
        dm.parse_file()
        images = [ImageObject(imdict, f, order=order, record_by=record_by)
                  for imdict in dm.get_image_dictionaries()]
        imd = []
        del dm.tags_dict['ImageList']
        dm.tags_dict['ImageList'] = {}

        for image in images:
            dm.tags_dict['ImageList'][
                'TagGroup0'] = image.imdict.as_dictionary()
            axes = image.get_axes_dict()
            mp = image.get_metadata()
            mp['General']['original_filename'] = os.path.split(filename)[1]
            post_process = []
            if image.to_spectrum is True:
                post_process.append(
                    lambda s: s.to_signal1D(optimize=optimize))
            post_process.append(lambda s: s.squeeze())
            if lazy:
                image.filename = filename
                from dask.array import from_delayed
                import dask.delayed as dd
                val = dd(image.get_data, pure=True)()
                data = from_delayed(val, shape=image.shape,
                                    dtype=image.dtype)
            else:
                data = image.get_data()
            imd.append({'data': data,
                        'axes': axes,
                        'metadata': mp,
                        'original_metadata': dm.tags_dict,
                        'post_process': post_process,
                        'mapping': image.get_mapping(),
                        })
    return imd


file_reader.__doc__ %= (OPTIMIZE_ARG.replace('False', 'True'))
def func(band_data, luts=luts):
    # NaN/null values will become 0
    band_data = band_data.clip(0, luts.size - 1).astype(np.uint8)

    def _delayed(luts, band_data):
        # can't use luts.__getitem__ for some reason
        return luts[band_data]

    new_delay = dask.delayed(_delayed)(luts, band_data)
    new_data = da.from_delayed(new_delay, shape=band_data.shape,
                               dtype=luts.dtype)
    return new_data
def func(band_data, kernel=kernel, mode=mode):
    def _delayed(band_data, kernel, mode):
        band_data = band_data.reshape(band_data.shape[1:])
        new_data = convolve2d(band_data, kernel, mode=mode)
        return new_data.reshape((1, band_data.shape[0],
                                 band_data.shape[1]))

    delay = dask.delayed(_delayed)(band_data, kernel, mode)
    new_data = da.from_delayed(delay, shape=band_data.shape,
                               dtype=band_data.dtype)
    return new_data
def compute(self, data, cache_id=None, fill_value=0, weight_count=10000,
            weight_min=0.01, weight_distance_max=1.0,
            weight_delta_max=1.0, weight_sum_min=-1.0,
            maximum_weight_mode=False, grid_coverage=0, **kwargs):
    """Resample the data according to the precomputed X/Y coordinates.

    :param grid_coverage: minimum ratio of number of output grid pixels
        covered with swath pixels
    """
    rows = self.cache["rows"]
    cols = self.cache["cols"]

    # if the data is scan based then check its metadata or the passed
    # kwargs otherwise assume the entire input swath is one large
    # "scanline"
    rows_per_scan = kwargs.get('rows_per_scan',
                               data.attrs.get("rows_per_scan",
                                              data.shape[0]))

    if data.ndim == 3 and 'bands' in data.dims:
        data_in = tuple(data.sel(bands=band).data
                        for band in data['bands'])
    elif data.ndim == 2:
        data_in = data.data
    else:
        raise ValueError("Unsupported data shape for EWA resampling.")

    res = dask.delayed(self._call_fornav)(
        cols, rows, self.target_geo_def, data_in,
        grid_coverage=grid_coverage,
        rows_per_scan=rows_per_scan, weight_count=weight_count,
        weight_min=weight_min, weight_distance_max=weight_distance_max,
        weight_delta_max=weight_delta_max, weight_sum_min=weight_sum_min,
        maximum_weight_mode=maximum_weight_mode)
    if isinstance(data_in, tuple):
        new_shape = (len(data_in),) + self.target_geo_def.shape
    else:
        new_shape = self.target_geo_def.shape
    data_arr = da.from_delayed(res, new_shape, data.dtype)
    # from_delayed creates one large chunk, break it up a bit if we can
    data_arr = data_arr.rechunk([CHUNK_SIZE] * data_arr.ndim)
    if data.ndim == 3 and data.dims[0] == 'bands':
        dims = ('bands', 'y', 'x')
    elif data.ndim == 2:
        dims = ('y', 'x')
    else:
        dims = data.dims

    return xr.DataArray(data_arr, dims=dims, attrs=data.attrs.copy())
def _map_iterate(self, function, iterating_kwargs=(), show_progressbar=None,
                 parallel=None, ragged=None, inplace=True, **kwargs):
    if ragged not in (True, False):
        raise ValueError('"ragged" kwarg has to be bool for lazy signals')
    _logger.debug("Entering '_map_iterate'")

    size = max(1, self.axes_manager.navigation_size)
    from hyperspy.misc.utils import (create_map_objects,
                                     map_result_construction)
    func, iterators = create_map_objects(function, size, iterating_kwargs,
                                         **kwargs)
    iterators = (self._iterate_signal(),) + iterators
    res_shape = self.axes_manager._navigation_shape_in_array
    # no navigation
    if not len(res_shape) and ragged:
        res_shape = (1,)

    all_delayed = [dd(func)(data) for data in zip(*iterators)]

    if ragged:
        sig_shape = ()
        sig_dtype = np.dtype('O')
    else:
        one_compute = all_delayed[0].compute()
        sig_shape = one_compute.shape
        sig_dtype = one_compute.dtype
    pixels = [
        da.from_delayed(res, shape=sig_shape, dtype=sig_dtype)
        for res in all_delayed
    ]

    for step in reversed(res_shape):
        _len = len(pixels)
        starts = range(0, _len, step)
        ends = range(step, _len + step, step)
        pixels = [da.stack(pixels[s:e], axis=0)
                  for s, e in zip(starts, ends)]
    result = pixels[0]
    res = map_result_construction(self, inplace, result, ragged,
                                  sig_shape, lazy=True)
    return res
def as_lazy_data(data, chunks=None, asarray=False):
    """
    Convert the input array `data` to a dask array.

    Args:

    * data:
        An array. This will be converted to a dask array.

    Kwargs:

    * chunks:
        Describes how the created dask array should be split up. Defaults
        to a value first defined in biggus (being `8 * 1024 * 1024 * 2`).
        For more information see
        http://dask.pydata.org/en/latest/array-creation.html#chunks.

    * asarray:
        If True, then chunks will be converted to instances of `ndarray`.
        Set to False (default) to pass chunks through unchanged.

    Returns:
        The input array converted to a dask array.
    """
    if chunks is None:
        # Default to the shape of the wrapped array-like,
        # but reduce it if larger than a default maximum size.
        chunks = _limited_shape(data.shape)
    if not is_lazy_data(data):
        if data.shape == ():
            # Workaround for https://github.com/dask/dask/issues/2823.
            # Make sure scalar dask arrays return numpy objects.
            dtype = data.dtype
            data = _getall_delayed(data)
            data = da.from_delayed(data, (), dtype)
        else:
            data = da.from_array(data, chunks=chunks, asarray=asarray)
    return data
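# A standalone sketch (not from the original source) of the 0-d workaround
# used above: wrap a scalar array in a delayed call so computing the result
# yields a numpy object instead of going through da.from_array.
import dask
import dask.array as da
import numpy as np

scalar = np.array(3.14)  # shape ()
lazy = da.from_delayed(dask.delayed(lambda: scalar)(), (), scalar.dtype)
assert lazy.shape == () and float(lazy.compute()) == 3.14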
def dask_win_func(n):
    return dsar.from_delayed(
        delayed(numpy_win_func, pure=True)(n), (n,), float)
def ser_reader(filename, objects=None, *args, **kwds):
    """Reads the information from the file and returns it in the HyperSpy
    required format.
    """
    header, data = load_ser_file(filename)
    record_by = guess_record_by(header['DataTypeID'])
    ndim = int(header['NumberDimensions'])
    date, time = None, None
    if objects is not None:
        objects_dict = convert_xml_to_dict(objects[0])
        date, time = _get_date_time(objects_dict.ObjectInfo.AcquireDate)
    if "PositionY" in data.dtype.names and len(data['PositionY']) > 1 and \
            (data['PositionY'][0] == data['PositionY'][1]):
        # The spatial dimensions are stored in F order i.e. X, Y, ...
        order = "F"
    else:
        # The spatial dimensions are stored in C order i.e. ..., Y, X
        order = "C"
    if ndim == 0 and header["ValidNumberElements"] != 0:
        # The calibration of the axes are not stored in the header.
        # We try to guess from the position coordinates.
        array_shape, axes = get_axes_from_position(header=header,
                                                   data=data)
    else:
        axes = []
        array_shape = [None, ] * int(ndim)
        spatial_axes = ["x", "y"][:ndim]
        for i in range(ndim):
            idim = 1 + i if order == "C" else ndim - i
            if (record_by == "spectrum" or
                    header['Dim-%i_DimensionSize' % (i + 1)][0] != 1):
                units = (header['Dim-%i_Units' % (idim)][0].decode('utf-8')
                         if header['Dim-%i_UnitsLength' % (idim)] > 0
                         else t.Undefined)
                if units == "meters":
                    name = (spatial_axes.pop() if order == "F"
                            else spatial_axes.pop(-1))
                else:
                    name = t.Undefined
                axes.append({
                    'offset': header['Dim-%i_CalibrationOffset' % idim][0],
                    'scale': header['Dim-%i_CalibrationDelta' % idim][0],
                    'units': units,
                    'size': header['Dim-%i_DimensionSize' % idim][0],
                    'name': name,
                })
                array_shape[i] = \
                    header['Dim-%i_DimensionSize' % idim][0]
        # Spectral dimension
        if record_by == "spectrum":
            axes.append({
                'offset': data['CalibrationOffset'][0],
                'scale': data['CalibrationDelta'][0],
                'size': data['ArrayLength'][0],
                'index_in_array': header['NumberDimensions'][0]
            })
            # FEI seems to use the international system of units (SI) for
            # the energy scale (eV).
            axes[-1]['units'] = 'eV'
            axes[-1]['name'] = 'Energy'
            array_shape.append(data['ArrayLength'][0])
        elif record_by == 'image':
            if objects is not None:
                units = _guess_units_from_mode(objects_dict, header)
            else:
                units = "meters"
            # Y axis
            axes.append({
                'name': 'y',
                'offset': data['CalibrationOffsetY'][0] -
                data['CalibrationElementY'][0] *
                data['CalibrationDeltaY'][0],
                'scale': data['CalibrationDeltaY'][0],
                'units': units,
                'size': data['ArraySizeY'][0],
            })
            array_shape.append(data['ArraySizeY'][0])
            # X axis
            axes.append({
                'name': 'x',
                'offset': data['CalibrationOffsetX'][0] -
                data['CalibrationElementX'][0] *
                data['CalibrationDeltaX'][0],
                'scale': data['CalibrationDeltaX'][0],
                'size': data['ArraySizeX'][0],
                'units': units,
            })
            array_shape.append(data['ArraySizeX'][0])
    # FEI seems to use the international system of units (SI) for the
    # spatial scale. However, we prefer to work in nm.
    for axis in axes:
        if axis['units'] == 'meters':
            axis['units'] = 'nm'
            axis['scale'] *= 10 ** 9
        elif axis['units'] == '1/meters':
            axis['units'] = '1/nm'
            axis['scale'] /= 10 ** 9
    # Remove Nones from array_shape caused by squeezing size 1 dimensions
    array_shape = [dim for dim in array_shape if dim is not None]
    lazy = kwds.pop('lazy', False)
    if lazy:
        from dask import delayed
        from dask.array import from_delayed
        val = delayed(load_only_data, pure=True)(filename, array_shape,
                                                 record_by, len(axes))
        dc = from_delayed(val, shape=array_shape,
                          dtype=data['Array'].dtype)
    else:
        dc = load_only_data(filename, array_shape, record_by, len(axes),
                            data=data)

    if ordict:
        original_metadata = OrderedDict()
    else:
        original_metadata = {}
    header_parameters = sarray2dict(header)
    sarray2dict(data, header_parameters)
    # We remove the Array key to save memory avoiding duplication
    del header_parameters['Array']
    original_metadata['ser_header_parameters'] = header_parameters
    metadata = {'General': {
        'original_filename': os.path.split(filename)[1]},
        "Signal": {
            'signal_type': "",
            'record_by': record_by,
        },
    }
    if date is not None and time is not None:
        metadata['General']['date'] = date
        metadata['General']['time'] = time
    dictionary = {
        'data': dc,
        'metadata': metadata,
        'axes': axes,
        'original_metadata': original_metadata,
        'mapping': mapping}
    return dictionary
def file_reader(filename, record_by='image', force_read_resolution=False,
                **kwds):
    """Read data from tif files using Christoph Gohlke's tifffile library.

    The units and the scale of images saved with ImageJ or Digital
    Micrograph is read. There is limited support for reading the scale of
    files created with Zeiss and FEI SEMs.

    Parameters
    ----------
    filename: str
    record_by: {'image'}
        Has no effect because this format only supports recording by
        image.
    force_read_resolution: Bool
        Default: False.
        Force reading the x_resolution, y_resolution and the
        resolution_unit of the tiff tags. See
        http://www.awaresystems.be/imaging/tiff/tifftags/resolutionunit.html
    **kwds, optional
    """
    _logger.debug('************* Loading *************')
    # For testing the use of local and skimage tifffile library
    import_local_tifffile = False
    if 'import_local_tifffile' in kwds.keys():
        import_local_tifffile = kwds.pop('import_local_tifffile')

    imsave, TiffFile = _import_tifffile_library(import_local_tifffile)
    lazy = kwds.pop('lazy', False)
    memmap = kwds.pop('memmap', False)
    with TiffFile(filename, **kwds) as tiff:

        # change in the tifffile API
        if hasattr(tiff.series[0], 'axes'):
            # in newer versions the axes is an attribute
            axes = tiff.series[0].axes
        else:
            # old version
            axes = tiff.series[0]['axes']
        is_rgb = tiff.is_rgb
        _logger.debug("Is RGB: %s" % is_rgb)
        series = tiff.series[0]
        if hasattr(series, 'shape'):
            shape = series.shape
            dtype = series.dtype
        else:
            shape = series['shape']
            dtype = series['dtype']
        if is_rgb:
            axes = axes[:-1]
            names = ['R', 'G', 'B', 'A']
            lastshape = shape[-1]
            dtype = np.dtype({'names': names[:lastshape],
                              'formats': [dtype] * lastshape})
            shape = shape[:-1]
        op = {}
        for key, tag in tiff[0].tags.items():
            op[key] = tag.value
        names = [axes_label_codes[axis] for axis in axes]
        _logger.debug('Tiff tags list: %s' % op.keys())
        _logger.debug("Photometric: %s" % op['photometric'])
        _logger.debug('is_imagej: {}'.format(tiff[0].is_imagej))

        # workaround for 'palette' photometric, keep only 'X' and 'Y' axes
        sl = None
        if op['photometric'] == 3:
            sl = [0] * len(shape)
            names = []
            for i, axis in enumerate(axes):
                if axis == 'X' or axis == 'Y':
                    sl[i] = slice(None)
                    names.append(axes_label_codes[axis])
                else:
                    axes.replace(axis, '')
            shape = tuple(_sh for _s, _sh in zip(sl, shape)
                          if isinstance(_s, slice))
        _logger.debug("names: {0}".format(names))

        scales = [1.0] * len(names)
        offsets = [0.0] * len(names)
        units = [t.Undefined] * len(names)
        try:
            scales_d, units_d, offsets_d, intensity_axis = \
                _parse_scale_unit(tiff, op, shape, force_read_resolution)
            for i, name in enumerate(names):
                if name == 'height':
                    scales[i], units[i] = scales_d['x'], units_d['x']
                    offsets[i] = offsets_d['x']
                elif name == 'width':
                    scales[i], units[i] = scales_d['y'], units_d['y']
                    offsets[i] = offsets_d['y']
                elif name in ['depth', 'image series', 'time']:
                    scales[i], units[i] = scales_d['z'], units_d['z']
                    offsets[i] = offsets_d['z']
        except:
            _logger.info("Scale and units could not be imported")

        axes = [{'size': size,
                 'name': str(name),
                 'scale': scale,
                 'offset': offset,
                 'units': unit,
                 }
                for size, name, scale, offset, unit
                in zip(shape, names, scales, offsets, units)]

        md = {'General': {'original_filename': os.path.split(filename)[1]},
              'Signal': {'signal_type': "",
                         'record_by': "image",
                         },
              }

        if 'units' in intensity_axis.keys():
            md['Signal']['quantity'] = intensity_axis['units']
        if 'scale' in intensity_axis.keys() and \
                'offset' in intensity_axis.keys():
            dic = {'gain_factor': intensity_axis['scale'],
                   'gain_offset': intensity_axis['offset']}
            md['Signal']['Noise_properties'] = {'Variance_linear_model': dic}

    data_args = TiffFile, filename, is_rgb, sl
    if lazy:
        from dask import delayed
        from dask.array import from_delayed
        memmap = True
        val = delayed(_load_data, pure=True)(*data_args, memmap=memmap,
                                             **kwds)
        dc = from_delayed(val, dtype=dtype, shape=shape)
        # TODO: maybe just pass the memmap from tifffile?
    else:
        dc = _load_data(*data_args, memmap=memmap, **kwds)

    return [{'data': dc,
             'original_metadata': op,
             'axes': axes,
             'metadata': md,
             }]
def load_from_unf(cls, filename, lazy=False):
    """Load a `.unf`-file into a :class:`~.SemperFormat` object.

    Parameters
    ----------
    filename : string
        The name of the unf-file from which to load the data. Standard
        format is '\*.unf'.

    Returns
    -------
    semper : :class:`~.SemperFormat` (N=1)
        SEMPER file format object containing the loaded information.
    """
    metadata = OrderedDict()
    with open(filename, 'rb') as f:
        # Read header:
        rec_length = np.fromfile(f, dtype='<i4',
                                 count=1)[0]  # length of header
        header = np.fromfile(f,
                             dtype=cls.HEADER_DTYPES[:rec_length // 2],
                             count=1)
        metadata.update(sarray2dict(header))
        assert np.frombuffer(f.read(4), dtype=np.int32)[0] == rec_length, \
            'Error while reading the header (length is not correct)!'
        data_format = cls.IFORM_DICT[metadata['IFORM']]
        iversn, remain = divmod(metadata['IFLAG'], 10000)
        ilabel, ntitle = divmod(remain, 1000)
        metadata.update({'IVERSN': iversn, 'ILABEL': ilabel,
                         'NTITLE': ntitle})
        # Read title:
        title = ''
        if ntitle > 0:
            assert np.fromfile(f, dtype='<i4',
                               count=1)[0] == ntitle  # length of title
            title = b''.join(np.fromfile(f, dtype='c', count=ntitle))
            title = title.decode()
            metadata['TITLE'] = title
            assert np.fromfile(f, dtype='<i4', count=1)[0] == ntitle
        if ilabel:
            try:
                metadata.update(cls._read_label(f))
            except Exception as e:
                warning = ('Could not read label, trying to proceed '
                           'without it!')
                warning += ' (Error message: {})'.format(str(e))
                warnings.warn(warning)
        # Read picture data:
        pos = f.tell()
        shape = metadata['NLAY'], metadata['NROW'], metadata['NCOL']
        if lazy:
            from dask.array import from_delayed
            from dask import delayed
            task = delayed(_read_data)(f, filename, pos, data_format,
                                       shape)
            data = from_delayed(task, shape=shape, dtype=data_format)
        else:
            data = _read_data(f, filename, pos, data_format, shape)
    offsets = (metadata.get('X0V0', 0.),
               metadata.get('Y0V2', 0.),
               metadata.get('Z0V4', 0.))
    scales = (metadata.get('DXV1', 1.),
              metadata.get('DYV3', 1.),
              metadata.get('DZV5', 1.))
    units = (metadata.get('XUNIT', Undefined),
             metadata.get('YUNIT', Undefined),
             metadata.get('ZUNIT', Undefined))
    return cls(data, title, offsets, scales, units, metadata)