def position_grid(shape, blocksize):
    """Return an int16 grid of voxel coordinates, rechunked to the given blocksize."""
    coords = da.meshgrid(*[range(x) for x in shape], indexing='ij')
    coords = da.stack(coords, axis=-1).astype(np.int16)
    return da.rechunk(coords, chunks=tuple(blocksize) + (3,))
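# Hypothetical usage sketch for position_grid above (added; not part of the original
# snippet): for a 4x4x4 volume and blocksize (2, 2, 2), the coordinate grid gains a
# trailing axis of length 3 and is rechunked into 2x2x2 spatial blocks, with the
# length-3 coordinate axis kept whole in each chunk.
import numpy as np
import dask.array as da

grid = position_grid((4, 4, 4), (2, 2, 2))
assert grid.shape == (4, 4, 4, 3)
assert grid.chunks == ((2, 2), (2, 2), (2, 2), (3,))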
def __call__(self, projectables, optional_datasets=None, **info): """Get the corrected reflectance when removing Rayleigh scattering. Uses pyspectral. """ from pyspectral.rayleigh import Rayleigh if not optional_datasets or len(optional_datasets) != 4: vis, red = self.match_data_arrays(projectables) sata, satz, suna, sunz = self.get_angles(vis) red.data = da.rechunk(red.data, vis.data.chunks) else: vis, red, sata, satz, suna, sunz = self.match_data_arrays( projectables + optional_datasets) sata, satz, suna, sunz = optional_datasets # get the dask array underneath sata = sata.data satz = satz.data suna = suna.data sunz = sunz.data # First make sure the two azimuth angles are in the range 0-360: sata = sata % 360. suna = suna % 360. ssadiff = da.absolute(suna - sata) ssadiff = da.minimum(ssadiff, 360 - ssadiff) del sata, suna atmosphere = self.attrs.get('atmosphere', 'us-standard') aerosol_type = self.attrs.get('aerosol_type', 'marine_clean_aerosol') rayleigh_key = (vis.attrs['platform_name'], vis.attrs['sensor'], atmosphere, aerosol_type) logger.info( "Removing Rayleigh scattering with atmosphere '%s' and " "aerosol type '%s' for '%s'", atmosphere, aerosol_type, vis.attrs['name']) if rayleigh_key not in self._rayleigh_cache: corrector = Rayleigh(vis.attrs['platform_name'], vis.attrs['sensor'], atmosphere=atmosphere, aerosol_type=aerosol_type) self._rayleigh_cache[rayleigh_key] = corrector else: corrector = self._rayleigh_cache[rayleigh_key] try: refl_cor_band = corrector.get_reflectance(sunz, satz, ssadiff, vis.attrs['name'], red.data) except (KeyError, IOError): logger.warning( "Could not get the reflectance correction using band name: %s", vis.attrs['name']) logger.warning( "Will try use the wavelength, however, this may be ambiguous!") refl_cor_band = corrector.get_reflectance( sunz, satz, ssadiff, vis.attrs['wavelength'][1], red.data) proj = vis - refl_cor_band proj.attrs = vis.attrs self.apply_modifier_info(vis, proj) return proj
def filt_blocks_da(dask_array, i_starts, i_end=None, func=None, *args):
    """
    Apply a function to each block of a dask array separately (by default NaNs are
    interpolated; another function can be provided, for example, to filter the array)
    :param dask_array: dask array to filter, may have unknown chunks as for dask series.values
    :param i_starts: numpy array, indexes of starts of blocks
    :param i_end: len(dask_array). If None, the last element of i_starts must be equal
        to it, else i_end should not be in i_starts
        # specifying this removes warning 'invalid value encountered in less'
    :param func: function applied to each block; if None, NaNs are filled by np.interp
    :return: dask array of the same size as dask_array with func applied per block

    >>> Pfilt = filt_blocks_da(a['P'].values, i_burst, i_end=len(a))
    ... sum(~isfinite(a['P'].values.compute())), sum(~isfinite(Pfilt))  # some NaNs were removed
    : (6, 0)
    # other values unchanged
    >>> allclose(Pfilt[isfinite(a['P'].values.compute())],
    ...          a['P'].values[isfinite(a['P'].values)].compute())
    : True
    """
    if func is None:
        # documented default: fill NaNs within each block by linear interpolation
        def func(y):
            ok = np.isfinite(y)
            return np.interp(np.arange(y.size), np.flatnonzero(ok), y[ok])
    if i_end:
        i_starts = np.append(i_starts, i_end)
    else:
        i_end = i_starts[-1]
    if np.isnan(dask_array.size):  # unknown chunks: build from delayed
        dask_array = da.from_delayed(dask_array.to_delayed()[0], shape=(i_end,),
                                     dtype=np.float64, name='filt')
    y = da.rechunk(dask_array, chunks=(tuple(np.diff(i_starts).tolist()),))
    y_out = y.map_blocks(func, dtype=np.float64, name='blocks_da')
    return y_out
def zero_pad(arr, shape, chunks):
    """Zero pad an array with zeros

    Args:
      arr: the array to pad
      shape: the shape of the new array
      chunks: how to rechunk the new array

    Returns:
      the new padded version of the array

    >>> print(
    ...     zero_pad(
    ...         np.arange(4).reshape([1, 2, 2, 1]),
    ...         (1, 4, 5, 1),
    ...         None
    ...     )[0,...,0].compute()
    ... )
    [[0 0 0 0 0]
     [0 0 0 1 0]
     [0 0 2 3 0]
     [0 0 0 0 0]]

    >>> print(zero_pad(np.arange(4).reshape([2, 2]), (4, 5), None).compute())
    [[0 0 0 0 0]
     [0 0 0 1 0]
     [0 0 2 3 0]
     [0 0 0 0 0]]

    >>> zero_pad(np.arange(4).reshape([2, 2]), (4, 5, 1), None)
    Traceback (most recent call last):
    ...
    RuntimeError: length of shape is incorrect

    >>> zero_pad(np.arange(4).reshape([2, 2]), (1, 2), None)
    Traceback (most recent call last):
    ...
    RuntimeError: resize shape is too small

    >>> arr = da.from_array(np.arange(4).reshape((2, 2)), chunks=(2, 1))
    >>> out = zero_pad(arr, (4, 3), (-1, 1))
    >>> out.shape
    (4, 3)
    >>> out.chunks
    ((4,), (1, 1, 1))
    """
    if len(shape) != len(arr.shape):
        raise RuntimeError("length of shape is incorrect")

    if not np.all(shape >= arr.shape):
        raise RuntimeError("resize shape is too small")

    return pipe(
        np.array(shape) - np.array(arr.shape),
        lambda x: np.concatenate(
            ((x - (x // 2))[..., None], (x // 2)[..., None]), axis=1
        ),
        fmap(tuple),
        tuple,
        lambda x: da.pad(arr, x, "constant", constant_values=0),
        lambda x: da.rechunk(x, chunks=chunks or x.shape),
    )
def comb_distance(spec_dist, temp_dist, spat_dist):
    if logWeight == True:
        spec_dist = da.log(spec_dist + 1)
        temp_dist = da.log(temp_dist + 1)
    comb_dist = da.rechunk(spec_dist * temp_dist * spat_dist,
                           chunks=spec_dist.chunksize)
    print("Done comb distance!", comb_dist)
    return comb_dist
def __call__(self, projectables, optional_datasets=None, **info): """Get the corrected reflectance when removing Rayleigh scattering. Uses pyspectral. """ from pyspectral.rayleigh import Rayleigh if not optional_datasets or len(optional_datasets) != 4: vis, red = self.check_areas(projectables) sata, satz, suna, sunz = self.get_angles(vis) red.data = da.rechunk(red.data, vis.data.chunks) else: vis, red, sata, satz, suna, sunz = self.check_areas( projectables + optional_datasets) sata, satz, suna, sunz = optional_datasets # get the dask array underneath sata = sata.data satz = satz.data suna = suna.data sunz = sunz.data LOG.info('Removing Rayleigh scattering and aerosol absorption') # First make sure the two azimuth angles are in the range 0-360: sata = sata % 360. suna = suna % 360. ssadiff = da.absolute(suna - sata) ssadiff = da.minimum(ssadiff, 360 - ssadiff) del sata, suna atmosphere = self.attrs.get('atmosphere', 'us-standard') aerosol_type = self.attrs.get('aerosol_type', 'marine_clean_aerosol') rayleigh_key = (vis.attrs['platform_name'], vis.attrs['sensor'], atmosphere, aerosol_type) if rayleigh_key not in self._rayleigh_cache: corrector = Rayleigh(vis.attrs['platform_name'], vis.attrs['sensor'], atmosphere=atmosphere, aerosol_type=aerosol_type) self._rayleigh_cache[rayleigh_key] = corrector else: corrector = self._rayleigh_cache[rayleigh_key] try: refl_cor_band = corrector.get_reflectance(sunz, satz, ssadiff, vis.attrs['name'], red.data) except (KeyError, IOError): LOG.warning("Could not get the reflectance correction using band name: %s", vis.attrs['name']) LOG.warning("Will try use the wavelength, however, this may be ambiguous!") refl_cor_band = corrector.get_reflectance(sunz, satz, ssadiff, vis.attrs['wavelength'][1], red.data) proj = vis - refl_cor_band proj.attrs = vis.attrs self.apply_modifier_info(vis, proj) return proj
def _call_pandas_groupby_statistics(self, scipy_method, data, fill_value=None, skipna=None):
    """Calculate statistics (min/max) for each bin with drop-in-a-bucket resampling."""
    import dask.dataframe as dd
    import pandas as pd

    if isinstance(data, xr.DataArray):
        data = data.data
    data = data.ravel()

    # Remove NaN values from the data when used as weights
    weights = da.where(np.isnan(data), 0, data)

    # Rechunk indices to match the data chunking
    if weights.chunks != self.idxs.chunks:
        self.idxs = da.rechunk(self.idxs, weights.chunks)

    # Calculate the min/max of the data falling to each bin
    out_size = self.target_area.size

    # Merge into one DataFrame
    df = dd.concat([dd.from_dask_array(self.idxs), dd.from_dask_array(weights)],
                   axis=1)
    df.columns = ['x', 'values']

    if scipy_method == 'min':
        statistics = df.map_partitions(lambda part: part.groupby(
            np.digitize(part.x, bins=np.linspace(0, out_size, out_size)))[
                'values'].min())
    elif scipy_method == 'max':
        statistics = df.map_partitions(lambda part: part.groupby(
            np.digitize(part.x, bins=np.linspace(0, out_size, out_size)))[
                'values'].max())

    # Fill missing indices
    statistics = (statistics + pd.Series(np.zeros(out_size))).fillna(0)

    counts = self.get_sum(np.logical_not(np.isnan(data)).astype(np.int64)).ravel()

    # TODO remove following line in favour of weights = data when dask histogram bug (issue #6935) is fixed
    statistics = self._mask_bins_with_nan_if_not_skipna(skipna, data, out_size, statistics)

    # Set bins without data to the fill value
    statistics = da.where(counts == 0, fill_value, statistics)

    return statistics.reshape(self.target_area.shape)
def get_chunk_index(self, chunk_shape, nav_shape):
    # Only use nav_shape when it is given and its size differs from NumFrames
    if nav_shape is not None and np.prod(nav_shape) != self.image_dict["NumFrames"]:
        num_frames = np.prod(nav_shape)
    else:
        num_frames = self.image_dict["NumFrames"]
    indexes = da.arange(num_frames)
    if nav_shape is not None:
        indexes = da.reshape(indexes, nav_shape)
    indexes = da.rechunk(indexes, chunks=chunk_shape)
    return indexes
def partition(image, folder):
    # create a dask array from the image in chunks (31 x 150)
    image_da = da.from_array(image, chunks=(windowSize, image.shape[1]))
    # padding the array before and after with 15 pixels
    image_pad = da.pad(image_da, windowSize // 2, mode='constant')
    for i in range(0, windowSize):
        row = str(i)
        block_i = image_pad[i:, :]
        block_i_da = da.rechunk(block_i, chunks=(windowSize, image_pad.shape[1]))
        block_i_da.map_blocks(block2row, dtype=int, row=row, folder=folder).compute()
def partition(image, folder):
    image_da = da.from_array(image, chunks=(windowSize, image.shape[1]))
    image_pad = da.pad(image_da, windowSize // 2, mode='constant')
    for i in range(0, windowSize):
        row = str(i)
        block_i = image_pad[i:, :]
        block_i_da = da.rechunk(block_i, chunks=(windowSize, image_pad.shape[1]))
        block_i_da.map_blocks(block2row, dtype=int, row=row, folder=folder).compute()
def _permute(self, array):
    # Ensure only a single chunk on the baseline axis, so that we can do
    # the permutation on a chunk-by-chunk basis. This is more efficient
    # than using dask to do the permutation.
    #
    # Currently katdal already does this step, but the code is left here
    # in case that changes in future.
    if array.numblocks[2] != 1:
        array = da.rechunk(array, chunks={2: -1})    # pragma: nocover
    index = np.s_[:, :, self._corr_product_permutation]
    return da.map_blocks(lambda block: block[index], array,
                         chunks=(array.chunks[0], array.chunks[1],
                                 (len(self._corr_product_permutation),)),
                         dtype=array.dtype)
def da_stack(folder, shape):
    da_list = []
    full_path = path + folder
    max_blocks = shape[0] // windowSize + 1
    for block in range(1, max_blocks + 1):
        for row in range(0, windowSize):
            name = str(block) + 'r' + str(row)
            full_name = full_path + name + '.zarr'
            try:
                da_array = da.from_zarr(full_name)
                da_list.append(da_array)
            except Exception:
                continue
    return da.rechunk(da.concatenate(da_list, axis=0),
                      chunks=(shape[1], windowSize**2))
def two_point_stats(arr1, arr2, periodic_boundary=True, cutoff=None):
    """Calculate the 2-points stats for two arrays

    Args:
      arr1: array used to calculate cross-correlations (n_samples,n_x,n_y)
      arr2: array used to calculate cross-correlations (n_samples,n_x,n_y)
      periodic_boundary: whether to assume a periodic boundary (default is true)
      cutoff: the subarray of the 2 point stats to keep

    Returns:
      the snipped 2-points stats

    >>> two_point_stats(
    ...     da.from_array(np.arange(10).reshape(2, 5), chunks=(2, 5)),
    ...     da.from_array(np.arange(10).reshape(2, 5), chunks=(2, 5)),
    ... ).shape
    (2, 5)
    """
    cutoff_ = int((np.min(arr1.shape[1:]) - 1) / 2)
    if cutoff is None:
        cutoff = cutoff_
    cutoff = min(cutoff, cutoff_)

    nonperiodic_padder = sequence(
        dapad(
            pad_width=[(0, 0)] + [(cutoff, cutoff)] * (arr1.ndim - 1),
            mode="constant",
            constant_values=0,
        ),
        lambda x: da.rechunk(x, (x.chunks[0],) + x.shape[1:]),
    )

    padder = identity if periodic_boundary else nonperiodic_padder

    nonperiodic_normalize = lambda x: x / auto_correlation(padder(np.ones_like(arr1)))

    normalize = identity if periodic_boundary else nonperiodic_normalize

    return sequence(
        map_(padder),
        list,
        star(cross_correlation),
        normalize,
        center_slice(cutoff=cutoff),
    )([arr1, arr2])
def get_sum(self, data, skipna=True):
    """Calculate sums for each bin with drop-in-a-bucket resampling.

    Parameters
    ----------
    data : Numpy or Dask array
        Data to be binned and summed.
    skipna : boolean (optional)
        If True, skips NaN values for the sum calculation
        (similarly to Numpy's `nansum`). Buckets containing only NaN are set to zero.
        If False, sets the bucket to NaN if one or more NaN values are present in the bucket
        (similarly to Numpy's `sum`).
        In both cases, empty buckets are set to 0.
        Default: True

    Returns
    -------
    data : Numpy or Dask array
        Bin-wise sums in the target grid
    """
    LOG.info("Get sum of values in each location")

    if isinstance(data, xr.DataArray):
        data = data.data
    data = data.ravel()

    # Remove NaN values from the data when used as weights
    weights = da.where(np.isnan(data), 0, data)

    # Rechunk indices to match the data chunking
    if weights.chunks != self.idxs.chunks:
        self.idxs = da.rechunk(self.idxs, weights.chunks)

    # Calculate the sum of the data falling to each bin
    out_size = self.target_area.size
    sums, _ = da.histogram(self.idxs, bins=out_size, range=(0, out_size),
                           weights=weights, density=False)

    # TODO remove following line in favour of weights = data when dask histogram bug (issue #6935) is fixed
    sums = self._mask_bins_with_nan_if_not_skipna(skipna, data, out_size, sums)

    return sums.reshape(self.target_area.shape)
def single_window(df, rgeno, tgeno, threads=1, max_memory=None, justd=False,
                  extend=False):
    ridx = df.i_ref.values
    tidx = df.i_tar.values
    rg = rgeno[:, ridx]
    tg = tgeno[:, tidx]
    if extend:
        # extend the genotype at both ends to avoid edge effects
        ridx_a, ridx_b = np.array_split(ridx, 2)
        tidx_a, tidx_b = np.array_split(tidx, 2)
        rg = da.concatenate(
            [rgeno[:, (ridx_a[::-1][:-1])], rg, rgeno[:, (ridx_b[::-1][1:])]],
            axis=1)
        tg = da.concatenate(
            [tgeno[:, (tidx_a[::-1][:-1])], tg, tgeno[:, (tidx_b[::-1][1:])]],
            axis=1)
        D_r = da.dot(rg.T, rg) / rg.shape[0]
        D_t = da.dot(tg.T, tg) / tg.shape[0]
        # remove the extras
        D_r = D_r[:, (ridx_a.shape[0] + 1):][:, :(ridx.shape[0])]
        D_r = D_r[(ridx_a.shape[0] + 1):, :][:(ridx.shape[0]), :]
        D_t = D_t[:, (tidx_a.shape[0] + 1):][:, :(tidx.shape[0])]
        D_t = D_t[(tidx_a.shape[0] + 1):, :][:(tidx.shape[0]), :]
        assert D_r.shape[1] == ridx.shape[0]
        assert D_t.shape[1] == tidx.shape[0]
    else:
        D_r = da.dot(rg.T, rg) / rg.shape[0]
        D_t = da.dot(tg.T, tg) / tg.shape[0]
    if justd:
        return df.snp, D_r, D_t
    cot = da.diag(da.dot(D_r, D_t))
    ref = da.diag(da.dot(D_r, D_r))
    tar = da.diag(da.dot(D_t, D_t))
    stacked = da.stack([df.snp, ref, tar, cot], axis=1)
    chunks = estimate_chunks(stacked.shape, threads, max_memory)
    stacked = da.rechunk(stacked, chunks=chunks)
    columns = ['snp', 'ref', 'tar', 'cotag']
    return dd.from_dask_array(stacked, columns=columns).compute()
def executeLabeledTraining(self, client: Client = None):
    # Train Model over the labeled instances
    if client is not None:
        with joblib.parallel_backend("dask"):
            self._ml_technique.fit(
                da.rechunk(self._X[self._label_idx.index, :]),
                da.rechunk(self._Y[self._label_idx.index]))

            # predict the results over the labeled test instances
            if hasattr(self._ml_technique, 'predict_classes'):
                label_pred = self._ml_technique.predict_classes(
                    da.rechunk(self._X[self._test_idx, :]))
            else:
                label_pred = self._ml_technique.predict(
                    da.rechunk(self._X[self._test_idx, :]))
    else:
        self._ml_technique.fit(
            da.rechunk(self._X[self._label_idx.index, :]),
            da.rechunk(self._Y[self._label_idx.index]))

        # predict the results over the labeled test instances
        if hasattr(self._ml_technique, 'predict_classes'):
            label_pred = self._ml_technique.predict_classes(
                da.rechunk(self._X[self._test_idx, :]))
        else:
            label_pred = self._ml_technique.predict(
                da.rechunk(self._X[self._test_idx, :]))

    # performance calc for all metrics
    label_perf = []
    for metric in self._performance_metrics:
        value = delayed(metric.compute(y_true=self._Y[self._test_idx],
                                       y_pred=label_pred))
        label_perf.append(delayed({"name": metric.metric_name, "value": value}))

    return label_pred, compute(label_perf)[0]
def get_sum(self, data, mask_all_nan=False): """Calculate sums for each bin with drop-in-a-bucket resampling. Parameters ---------- data : Numpy or Dask array mask_all_nan : boolean (optional) Mask bins that have only NaN results, default: False Returns ------- data : Numpy or Dask array Bin-wise sums in the target grid """ LOG.info("Get sum of values in each location") if isinstance(data, xr.DataArray): data = data.data data = data.ravel() # Remove NaN values from the data when used as weights weights = da.where(np.isnan(data), 0, data) # Rechunk indices to match the data chunking if weights.chunks != self.idxs.chunks: self.idxs = da.rechunk(self.idxs, weights.chunks) # Calculate the sum of the data falling to each bin out_size = self.target_area.size sums, _ = da.histogram(self.idxs, bins=out_size, range=(0, out_size), weights=weights, density=False) if mask_all_nan: nans = np.isnan(data) nan_sums, _ = da.histogram(self.idxs[nans], bins=out_size, range=(0, out_size)) counts = self.get_count().ravel() sums = da.where(nan_sums == counts, np.nan, sums) return sums.reshape(self.target_area.shape)
def dask_arr(self, videos, freq='S'):
    '''xarray representation of all videos in one folder (event)'''
    start_time = self._r_timestamp(videos[0])
    lazy = [self.read_video(video) for video in videos]
    sample = lazy[0].compute()
    _, h, w, c = sample.shape
    da_array = [
        da.from_delayed(arr, dtype=np.uint8, shape=sample.shape)
        for arr in lazy
    ]
    da_array = da.stack(da_array)
    # da_array = da.reshape(da_array, (da_array.shape[0]*da_array.shape[1], 1080, 1920, 3), chunks=(1, 1080, 1920, 3))
    da_array = da_array.reshape(da_array.shape[0] * da_array.shape[1], h, w, c)
    da_array = da.rechunk(da_array, (1, h, w, c))
    print(da_array)
    end_time = start_time + datetime.timedelta(seconds=da_array.shape[0] - 1)
    return da_array, pd.date_range(start_time, end_time, freq='S')
def weighting(spec_dist, temp_dist, comb_dist, similar_pixels_filtered):
    # Assign max weight (1) when the temporal or spectral distance is zero
    zero_spec_dist = da.where(spec_dist[:, mid_idx][:, None] == 1, 1, 0)
    zero_temp_dist = da.where(temp_dist[:, mid_idx][:, None] == 1, 1, 0)
    zero_dist_mid = da.where((zero_spec_dist == 1), zero_spec_dist, zero_temp_dist)
    shape = da.subtract(spec_dist.shape, (0, 1))
    zero_dist = da.zeros(shape, chunks=(spec_dist.shape[0], shape[1]))
    zero_dist = da.insert(zero_dist, [mid_idx], zero_dist_mid, axis=1)
    weights = da.where((da.sum(zero_dist, 1)[:, None] == 1), zero_dist, comb_dist)

    # Calculate weights only for the filtered spectrally similar pixels
    weights_filt = weights * similar_pixels_filtered

    # Normalize weights
    norm_weights = da.rechunk(weights_filt / (da.sum(weights_filt, 1)[:, None]),
                              chunks=spec_dist.chunksize)
    print("Done weighting!", norm_weights)
    return norm_weights
def dec10216(inbuf):
    """Decode 10 bits data into 16 bits words.

    ::

        /*
         * pack 4 10-bit words in 5 bytes into 4 16-bit words
         *
         * 0       1       2       3       4       5
         * 01234567890123456789012345678901234567890
         * 0         1         2         3         4
         */
        ip = &in_buffer[i];
        op = &out_buffer[j];

        op[0] = ip[0]*4 + ip[1]/64;
        op[1] = (ip[1] & 0x3F)*16 + ip[2]/16;
        op[2] = (ip[2] & 0x0F)*64 + ip[3]/4;
        op[3] = (ip[3] & 0x03)*256 + ip[4];

    """
    arr10 = inbuf.astype(np.uint16)
    arr16_len = int(len(arr10) * 4 / 5)
    arr10_len = int((arr16_len * 5) / 4)
    arr10 = arr10[:arr10_len]  # adjust size

    # dask is slow with indexing
    arr10_0 = arr10[::5]
    arr10_1 = arr10[1::5]
    arr10_2 = arr10[2::5]
    arr10_3 = arr10[3::5]
    arr10_4 = arr10[4::5]

    arr16_0 = (arr10_0 << 2) + (arr10_1 >> 6)
    arr16_1 = ((arr10_1 & 63) << 4) + (arr10_2 >> 4)
    arr16_2 = ((arr10_2 & 15) << 6) + (arr10_3 >> 2)
    arr16_3 = ((arr10_3 & 3) << 8) + arr10_4
    arr16 = da.stack([arr16_0, arr16_1, arr16_2, arr16_3], axis=-1).ravel()
    arr16 = da.rechunk(arr16, arr16.shape[0])

    return arr16
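# Hypothetical sanity check for dec10216 above (added; not from the original source):
# five packed bytes encoding the 10-bit values 1, 2, 3 and 4 should decode back to
# exactly those four 16-bit words.
import numpy as np
import dask.array as da

packed = da.from_array(np.array([0, 64, 32, 12, 4], dtype=np.uint8), chunks=5)
assert dec10216(packed).compute().tolist() == [1, 2, 3, 4]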
def _call_bin_statistic(self, statistic_method, data, fill_value=None, skipna=None):
    """Calculate statistics (min/max) for each bin with drop-in-a-bucket resampling."""
    if isinstance(data, xr.DataArray):
        data = data.data
    data = data.ravel()

    # Rechunk indices to match the data chunking
    if data.chunks != self.idxs.chunks:
        self.idxs = da.rechunk(self.idxs, data.chunks)

    out_shape = self.target_area.shape

    statistics = da.from_delayed(
        _get_statistics(statistic_method, data, self.idxs, out_shape),
        shape=out_shape, dtype=np.float64)

    return statistics
def correlations_multiple(data, correlations, periodic_boundary=True, cutoff=None):
    """Calculate 2-point stats for a multiple auto/cross correlation

    Args:
      data: the discretized data (n_samples,n_x,n_y,n_correlation)
      correlations: the correlation pairs
      periodic_boundary: whether to assume a periodic boundary (default is true)
      cutoff: the subarray of the 2 point stats to keep

    Returns:
      the 2-points stats array

    >>> data = np.arange(18).reshape(1, 3, 3, 2)
    >>> out = correlations_multiple(data, [[0, 1], [1, 1]])
    >>> out
    dask.array<stack, shape=(1, 3, 3, 2), dtype=float64, chunksize=(1, 3, 3, 1)>
    >>> answer = np.array([[[58, 62, 58], [94, 98, 94], [58, 62, 58]]]) + 2. / 3.
    >>> assert np.allclose(out.compute()[..., 0], answer)
    """
    return pipe(
        range(data.shape[-1]),
        map_(lambda x: (0, x)),
        lambda x: correlations if correlations else x,
        map_(lambda x: two_point_stats(
            data[..., x[0]],
            data[..., x[1]],
            periodic_boundary=periodic_boundary,
            cutoff=cutoff,
        )),
        list,
        lambda x: da.stack(x, axis=-1),
        lambda x: da.rechunk(x, x.chunks[:-1] + (-1,)),
    )
def _apply(func, datasets, chunk=CHUNK, pad=None, relabel=False, stack=False, compute=True, out=None, normalize=False, **kwargs): """ Appplies a function to a given set of datasets. Wraps a standard function call of the form: func(*datasets, **kwargs) Named parameters gives extra functionality. Parameters ---------- func: callable Function to be mapped across datasets. datasets: list of numpy array-like Input datasets. chunk: boolean If `True` then input datasets will be assumed tobe `Dask.Array`s and the function will be mapped across arrays blocks. pad: None, int or iterable The padding to apply (only if `chunk = True`). If `pad != None` then `dask.array.ghost.map_overlap` will be used to map the function across overlapping blocks, otherwise `dask.array.map_blocks` will be used. relabel: boolean Some of the labelling functions will yield local labelling if `chunk=True`. If `func` is a labelling function, set `relabel = True` to map the result for global consistency. See `survos2.improc.utils.dask_relabel_chunks` for more details. compute: boolean If `True` the result will be computed and returned in numpy array form, otherwise a `dask.delayed` will be returned if `chunk = True`. out: None or numpy array-like if `out != None` then the result will be stored in there. **kwargs: other keyword arguments Arguments to be passed to `func`. Returns ------- result: numpy array-like The computed result if `compute = True` or `chunk = False`, the result of the lazy wrapping otherwise. """ if stack and len(datasets) > 1: dataset = da.stack(datasets, axis=0) dataset = da.rechunk(dataset, chunks=(dataset.shape[0], ) + dataset.chunks[1:]) datasets = [dataset] if chunk == True: kwargs.setdefault('dtype', out.dtype if out else datasets[0].dtype) kwargs.setdefault('drop_axis', 0 if stack else None) if pad is None or pad == False: result = da.map_blocks(func, *datasets, **kwargs) elif len(datasets) == 1: if np.isscalar(pad): pad = [pad] * datasets[0].ndim if stack: pad[0] = 0 # don't pad feature channel depth = {i: d for i, d in enumerate(pad)} trim = {i: d for i, d in enumerate(pad[1:])} else: depth = trim = {i: d for i, d in enumerate(pad)} g = da.ghost.ghost(datasets[0], depth=depth, boundary='reflect') r = g.map_blocks(func, **kwargs) result = da.ghost.trim_internal(r, trim) else: raise ValueError('`pad` only works with single') rchunks = result.chunks if not relabel and normalize: result = result / da.nanmax(da.fabs(result)) if out is not None: result.store(out, compute=True) elif compute: result = result.compute() if relabel: if out is not None: result = dask_relabel_chunks(da.from_array(out, chunks=rchunks)) result.store(out, compute=True) else: result = dask_relabel_chunks( da.from_array(result, chunks=rchunks)) if compute: result = result.compute() else: result = func(*datasets, **kwargs) if out is not None: out[...] = result if out is None: return result
def image_tikhonov(self, vis_arr, sphere, alpha, scale=True, usedask=False): n_s = sphere.pixels.shape[0] n_v = self.u_arr.shape[0] lambduh = alpha / np.sqrt(n_s) if not usedask: gamma = self.make_gamma(sphere) logger.info("augmented: {}".format(gamma.shape)) vis_aux = vis_to_real(vis_arr) logger.info("vis mean: {} shape: {}".format( np.mean(vis_aux), vis_aux.shape)) tol = min(alpha / 1e4, 1e-10) logger.info("Solving tol={} ...".format(tol)) # reg = linear_model.ElasticNet(alpha=alpha/np.sqrt(n_s), # tol=1e-6, # l1_ratio = 0.01, # max_iter=100000, # positive=True) if False: ( sky, lstop, itn, r1norm, r2norm, anorm, acond, arnorm, xnorm, var, ) = scipy.sparse.linalg.lsqr(gamma, vis_aux, damp=alpha, show=True) logger.info( "Alpha: {}: Iterations: {}: rnorm: {}: xnorm: {}".format( alpha, itn, r2norm, xnorm)) else: reg = linear_model.Ridge(alpha=alpha, tol=tol, solver="lsqr", max_iter=100000) reg.fit(gamma, vis_aux) logger.info(" Solve Complete, iter={}".format(reg.n_iter_)) sky = da.from_array(reg.coef_) residual = vis_aux - gamma @ sky sky, residual_norm, solution_norm = da.compute( sky, np.linalg.norm(residual)**2, np.linalg.norm(sky)**2) score = reg.score(gamma, vis_aux) logger.info("Alpha: {}: Loss: {}: rnorm: {}: snorm: {}".format( alpha, score, residual_norm, solution_norm)) else: from dask_ml.linear_model import LinearRegression import dask_glm from dask.distributed import Client, LocalCluster from dask.diagnostics import ProgressBar import dask logger.info("Starting Dask Client") if True: cluster = LocalCluster(dashboard_address=":8231", processes=False) client = Client(cluster) else: client = Client("tcp://localhost:8786") logger.info("Client = {}".format(client)) harmonic_list = [] p2j = 2 * np.pi * 1.0j dl = sphere.l dm = sphere.m dn = sphere.n n_arr_minus_1 = dn - 1 du = self.u_arr dv = self.v_arr dw = self.w_arr for u, v, w in zip(du, dv, dw): harmonic = da.from_array( np.exp(p2j * (u * dl + v * dm + w * n_arr_minus_1)) / np.sqrt(sphere.npix), chunks=(n_s, ), ) harminc = client.persist(harmonic) harmonic_list.append(harmonic) gamma = da.stack(harmonic_list) logger.info("Gamma Shape: {}".format(gamma.shape)) # gamma = gamma.reshape((n_v, n_s)) gamma = gamma.conj() gamma = client.persist(gamma) logger.info("Gamma Shape: {}".format(gamma.shape)) logger.info("Building Augmented Operator...") proj_operator_real = da.real(gamma) proj_operator_imag = da.imag(gamma) proj_operator = da.block([[proj_operator_real], [proj_operator_imag]]) proj_operator = client.persist(proj_operator) logger.info("Proj Operator shape {}".format(proj_operator.shape)) vis_aux = da.from_array( np.array( np.concatenate((np.real(vis_arr), np.imag(vis_arr))), dtype=np.float32, )) # logger.info("Solving...") en = dask_glm.regularizers.ElasticNet(weight=0.01) en = dask_glm.regularizers.L2() # dT = da.from_array(proj_operator, chunks=(-1, 'auto')) ##dT = da.from_array(proj_operator, chunks=(-1, 'auto')) # dv = da.from_array(vis_aux) dask.config.set({"array.chunk-size": "1024MiB"}) A = da.rechunk(proj_operator, chunks=("auto", n_s)) A = client.persist(A) y = vis_aux # da.rechunk(vis_aux, chunks=('auto', n_s)) y = client.persist(y) # sky = dask_glm.algorithms.proximal_grad(A, y, regularizer=en, lambduh=alpha, max_iter=10000) logger.info("Rechunking completed.. 
A= {}.".format(A.shape)) reg = LinearRegression( penalty=en, C=1.0 / lambduh, fit_intercept=False, solver="lbfgs", max_iter=1000, tol=1e-8, ) sky = reg.fit(A, y) sky = reg.coef_ score = reg.score(proj_operator, vis_aux) logger.info("Loss function: {}".format(score.compute())) logger.info("Solving Complete: sky = {}".format(sky.shape)) sphere.set_visible_pixels(sky, scale=False) return sky.reshape(-1, 1)
def fft_dask(self, src_fname, src_dset, dst_fname, dst_dset, axis,
             background_subtraction=True, window=False):
    """Perform an out of core FFT along a given axis using the DASK module.

    Requires the data to be in a .hdf5 file. Allows FFT to be performed on large
    datasets that do not fit into memory. Takes the source .hdf5 file name and
    dataset as well as the destination file name and dataset as inputs.
    """
    if (src_fname == dst_fname):
        print('must write to new .hdf5 file')
        return 1

    # open the hdf5 files
    with hd.File(src_fname, 'r', libver='latest') as s:
        with hd.File(dst_fname, 'w', libver='latest') as d:
            # create a destination dataset
            dshape = s[src_dset].shape
            cshape = s[src_dset].chunks
            d.create_dataset(dst_dset, dshape, chunks=cshape, dtype=complex)

    # CAN WE CLOSE THE FILES HERE AND REOPEN THEM LATER?
    with hd.File(src_fname, 'r', libver='latest') as s:
        # make a dask array from the dset
        data = da.from_array(s[src_dset], s[src_dset].chunks)

        # weld chunks together to span the fft axis
        newcshape = sp.array(cshape)
        newcshape[axis] = dshape[axis]
        newcshape = tuple(newcshape)

        # rechunk dask array in order to perform fft
        data = da.rechunk(data, newcshape)

        # make optional background subtraction
        if (background_subtraction == True):
            background = data[:, :, :, :, 0]
            data = data - background[:, :, :, :, None]

        # make optional windowing before fourier transform
        if (window != False):
            try:
                w = eval('signal.' + window + '(data.shape[axis])')
                dim_arr = sp.ones((1, w.ndim), int).ravel()
                dim_arr[axis] = -1
                window_reshaped = w.reshape(dim_arr)
                data = data * window_reshaped
            except:
                print('invalid window function, skipping windowing.\nLook up scipy.signal docs')
                pass

        # fft and write to destination dataset on disk
        fft_data = da.fft.fft(data, axis=axis)
        fft_data = fft_data.astype('complex64')
        with ProgressBar():
            fft_data.to_hdf5(dst_fname, dst_dset, libver='latest')  # , chunks=cshape, dtype=complex, compression='lzf')

    return 0
def blocked_rank(array):
    chunks = list(array.chunks)
    chunks[axis] = -1
    array = array.rechunk(chunks)
    return dask.array.map_blocks(rank_along_axis, array)
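# Hypothetical supporting definitions for blocked_rank above (added; not from the
# original source): `axis` is assumed to be captured from an enclosing scope, and
# rank_along_axis is assumed to rank the values of each block along that axis, for
# example with scipy.stats.rankdata. Because blocked_rank first rechunks so the whole
# extent of `axis` sits in one chunk, per-block ranks are global along that axis.
from scipy.stats import rankdata

axis = -1  # assumed closure variable

def rank_along_axis(block):
    # Rank the entries of each 1-D slice taken along `axis` within this block.
    return rankdata(block, axis=axis)

# Example call (sketch): ranked = blocked_rank(some_dask_array)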
def single_window(df, rg, tg, threads=1, max_memory=None, justd=False,
                  extend=False):
    """
    Helper function to compute the correlation between variants from a genotype array

    :param df: Merged dataframe mapping of the positions in the genotypes
    :param rg: slice of Genotype array of the reference population
    :param tg: slice of Genotype array of the target population
    :param threads: Number of threads to estimate memory use
    :param max_memory: Memory limit
    :param justd: Return the raw LD matrices instead of their dot product
    :param extend: 'Circularize' the genome by extending both ends
    :return:
    """
    if not df.empty:
        # set Cache to protect memory spilling
        if max_memory is not None:
            available_memory = max_memory
        else:
            available_memory = psutil.virtual_memory().available / 2
        cache = Chest(available_memory=available_memory)
        # Make sure chunks make sense
        chunk_opts = dict(threads=threads, memory=available_memory)
        if not isinstance(rg, np.ndarray):
            rg = rg.rechunk(estimate_chunks(shape=rg.shape, **chunk_opts))
            tg = tg.rechunk(estimate_chunks(shape=tg.shape, **chunk_opts))
        # extend the genotype at both ends to avoid edge effects
        if extend:
            # get the indices of the subset genotype array
            nidx = np.arange(rg.shape[1])
            # Split the array in half (approximately)
            idx_a, idx_b = np.array_split(nidx, 2)
            # Get the extended indices
            i = np.concatenate([idx_a[::-1][:-1], nidx, idx_b[::-1][1:]])
            # Re-subset the genotype arrays with the extensions
            rg, tg = rg[:, i], tg[:, i]
            assert rg.shape[1] == tg.shape[1]
            # Compute the correlation as X'X/N
            rho_r = da.dot(rg.T, rg) / rg.shape[0]
            rho_t = da.dot(tg.T, tg) / tg.shape[0]
            # remove the extras
            idx = np.arange(i.shape[0])[idx_a.shape[0] - 1:(nidx.shape[0] +
                                                            idx_b.shape[0])]
            rho_r, rho_t = rho_r[idx, :], rho_t[idx, :]
            rho_r, rho_t = rho_r[:, idx], rho_t[:, idx]
            # Make sure the shapes match
            assert rho_r.shape[1] == rho_r.shape[0]
            assert rho_t.shape[1] == rho_t.shape[0]
        else:
            # Just compute the correlations
            rho_r = da.dot(rg.T, rg) / rg.shape[0]
            rho_t = da.dot(tg.T, tg) / tg.shape[0]
        if justd:
            # return the raw LD matrices
            return df.snp, rho_r, rho_t
        gc.collect()
        # compute the cotagging/tagging scores
        cot = da.diag(da.dot(rho_r, rho_t))
        ref = da.diag(da.dot(rho_r, rho_r))
        tar = da.diag(da.dot(rho_t, rho_t))
        stacked = da.stack([df.snp, ref, tar, cot], axis=1)
        chunks = estimate_chunks(stacked.shape, threads, max_memory)
        stacked = da.rechunk(stacked, chunks=chunks)
        columns = ['snp', 'ref', 'tar', 'cotag']
        return dd.from_dask_array(stacked, columns=columns).compute(cache=cache)
def dask_safeslice(data, indices, chunks=None): """ COPIED FROM https://github.com/dask/dask/issues/5540#issuecomment-601150129 Added fancy indexing xarray.core.indexing.DaskIndexingAdapter Return a subset of a dask array, but with indexing applied independently to each slice of the input array, *prior* to their recombination to produce the result array. Args: * data (dask array): input data * indices (int or slice or tuple(int or slice)): required sub-section of the data. Kwargs: * chunks (list of (int or "auto")): chunking argument for 'rechunk' applied to the input. If set, forces the input to be rechunked as specified. ( This replaces the normal operation, which is to rechunk the input making the indexed dimensions undivided ). Mainly for testing on small arrays. .. note:: 'indices' currently does not support Ellipsis or newaxis. """ from collections.abc import Iterable import dask.array as da # The idea is to "push down" the indexing operation to "underneath" the # result concatenation, so it gets done _before_ that. # This 'result concatenation' is actually implicit: the _implied_ # concatenation of all the result chunks into a single output array. # We assume that any *one* chunk *can* be successfully computed. # By applying the indexing operation to each chunk, prior to the # complete result (re-)construction, we hope to make this work. # Normalise input to a list over all data dimensions. # NOTE: FOR NOW, this does not support Ellipsis. # TODO: that could easily be fixed. # Convert the slicing indices to a list of (int or slice). # ( NOTE: not supporting Ellipsis. ) if not isinstance(indices, Iterable): # Convert a single key (slice or integer) to a length-1 list. indices = [indices] else: # Convert other iterable types to lists. indices = list(indices) n_data_dims = data.ndim assert len(indices) <= n_data_dims # Extend with ":" in all the additional (trailing) dims. all_slice = slice(None) indices += (n_data_dims - len(indices)) * [all_slice] assert len(indices) == n_data_dims # Discriminate indexed and non-indexed dims. # An "indexed" dim is where input index is *anything* other than a ":". dim_is_indexed = [index != all_slice for index in indices] # Work out which indices are simple integer values. # ( by definition, all of these will be "indexed" dims ) dim_is_removed = [isinstance(key, int) for key in indices] # Replace single-value indices with length-1 indices, so the indexing # preserves all dimensions (as this makes reconstruction easier). # ( We use the above 'dim_is_removed' to correct this afterwards. ) indices = [slice(key, key + 1) if isinstance(key, int) else key for key in indices] # We will now rechunk to get "our chunks" : but these must not be divided # in dimensions affected by the requested indexing. # So we rechunk, but insist that those dimensions are kept whole. # ( Obviously, not always optimal ... ) # As the indexed dimensions will always be _reduced_ by the indexing, this # is obviously over-conservative + may give chunks which are rather too # small. Let's just ignore that problem for now! if chunks is not None: rechunk_dim_specs = list(chunks) else: rechunk_dim_specs = ["auto"] * n_data_dims for i_dim in range(n_data_dims): if dim_is_indexed[i_dim]: rechunk_dim_specs[i_dim] = -1 data = da.rechunk(data, chunks=rechunk_dim_specs) # Calculate multidimensional indexings of the original data array which # correspond to all these chunks. # Note: following the "-1"s in the above rechunking spec, the indexed dims # should all have only one chunk in them. 
assert all( len(data.chunks[i_dim]) == 1 for i_dim in range(n_data_dims) if dim_is_removed[i_dim] ) # Make an array of multidimensional indexes corresponding to all chunks. chunks_shape = [len(chunk_lengths) for chunk_lengths in data.chunks] chunks_shape += [n_data_dims] chunk_indices = np.zeros(chunks_shape, dtype=object) # The chunk_indices array ... # * has dimensions of n-data-dims + 1 # * has shape of "chunks-shape" + (n_data_dims,) # * each entry[i0, i1, iN-1] --> n_data_dims * slice-objects. # Pre-fill indexes array with [:, :, ...] chunk_indices[...] = all_slice # Set slice ranges for each dimension at a time. for i_dim in range(n_data_dims): # Fix all keys for this data dimension : chunk_indices[..., i_dim] dim_inds = [all_slice] * n_data_dims + [i_dim] if dim_is_indexed[i_dim]: # This is a user-indexed dim, so should be un-chunked. assert len(data.chunks[i_dim]) == 1 # Set keys for this dim to the user-requested indexing. if EMBED_INDEXES: chunk_indices[tuple(dim_inds)] = indices[i_dim] else: # Replace keys for this dim with the slice range for the # relevant chunk, for each chunk in the dim. startend_positions = np.cumsum([0] + list(data.chunks[i_dim])) starts, ends = startend_positions[:-1], startend_positions[1:] for i_key, (i_start, i_end) in enumerate(zip(starts, ends)): dim_inds[i_dim] = i_key chunk_indices[tuple(dim_inds)] = slice(i_start, i_end) # E.G. chunk_indices[:, :, 1, :][2] = slice(3,6) # Make actual addressed chunks by indexing the original array, arrange them # in the same pattern, and re-combine them all to make a result array. # This needs to be a list-of-lists construction, as da.block requires it. # ( an array of arrays is presumably too confusing ?!? ) def get_chunks(multidim_indices): if multidim_indices.ndim > 1: # Convert the "array of chunks" dims --> lists-of-lists result = [ get_chunks(multidim_indices[i_part]) for i_part in range(multidim_indices.shape[0]) ] else: # Innermost dim contains n-dims * slice-objects # Convert these into a slice of the data array. result = data.__getitem__(tuple(multidim_indices)) if not EMBED_INDEXES: # Now *also* apply the required indexing to this chunk. # It initially seemed *essential* that this be an independent # operation, so that the memory associated with the whole chunk # can be released. # But ACTUALLY this is not so, given the next step (see on). try: result = result.__getitem__(tuple(indices)) except NotImplementedError: result = data for axis, subkey in reversed(list(enumerate(tuple(indices)))): result = result[(slice(None),) * axis + (subkey,)] # AND FINALLY : apply a numpy copy to this indexed-chunk. # This is essential, to release the source chunks ?? # see: https://github.com/dask/dask/issues/3595#issuecomment-449546228 result = result.map_blocks(np.copy) return result listoflists_of_chunks = get_chunks(chunk_indices) result = da.block(listoflists_of_chunks) assert result.ndim == n_data_dims # Unchanged as 'da.block' concatenates. # Finally remove the extra dimensions for single-value indices. assert all( result.shape[i_dim] == 1 for i_dim in range(n_data_dims) if dim_is_removed[i_dim] ) all_dim_indices = [ 0 if dim_is_removed[i_dim] else all_slice for i_dim in range(n_data_dims) ] result = result.__getitem__(tuple(all_dim_indices)) return result
def two_point_stats(arr1, arr2, mask=None, periodic_boundary=True, cutoff=None): """Calculate the 2-points stats for two arrays Args: arr1: array used to calculate cross-correlations (n_samples,n_x,n_y) arr2: array used to calculate cross-correlations (n_samples,n_x,n_y) mask: array specifying confidence in the measurement at a pixel (n_samples,n_x,n_y). In range [0,1]. periodic_boundary: whether to assume a periodic boundary (default is true) cutoff: the subarray of the 2 point stats to keep Returns: the snipped 2-points stats >>> two_point_stats( ... da.from_array(np.arange(10).reshape(2, 5), chunks=(2, 5)), ... da.from_array(np.arange(10).reshape(2, 5), chunks=(2, 5)), ... ).shape (2, 5) Test masking >>> array = da.array([[[1, 0 ,0], [0, 1, 1], [1, 1, 0]]]) >>> mask = da.array([[[1, 1, 1], [1, 1, 1], [1, 0, 0]]]) >>> norm_mask = da.array([[[2, 4, 3], [4, 7, 4], [3, 4, 2]]]) >>> expected = da.array([[[1, 0, 1], [1, 4, 1], [1, 0, 1]]]) / norm_mask >>> assert np.allclose( ... two_point_stats(array, array, mask=mask, periodic_boundary=False), ... expected ... ) The mask must be in the range 0 to 1. >>> array = da.array([[[1, 0], [0, 1]]]) >>> mask = da.array([[[2, 0], [0, 1]]]) >>> two_point_stats(array, array, mask) Traceback (most recent call last): ... RuntimeError: Mask must be in range [0,1] """ cutoff_ = int((np.min(arr1.shape[1:]) - 1) / 2) if cutoff is None: cutoff = cutoff_ cutoff = min(cutoff, cutoff_) nonperiodic_padder = sequence( dapad( pad_width=[(0, 0)] + [(cutoff, cutoff)] * (arr1.ndim - 1), mode="constant", constant_values=0, ), lambda x: da.rechunk(x, (x.chunks[0], ) + x.shape[1:]), ) padder = identity if periodic_boundary else nonperiodic_padder if mask is not None: if da.max(mask).compute() > 1.0 or da.min(mask).compute() < 0.0: raise RuntimeError("Mask must be in range [0,1]") mask_array = lambda arr: arr * mask normalize = lambda x: x / auto_correlation(padder(mask)) else: mask_array = identity if periodic_boundary: # The periodic normalization could always be the # auto_correlation of the mask. But for the sake of # efficiency, we specify the periodic normalization in the # case there is no mask. normalize = lambda x: x / arr1[0].size else: normalize = lambda x: x / auto_correlation( padder(np.ones_like(arr1))) return sequence( map_(mask_array), map_(padder), list, star(cross_correlation), normalize, center_slice(cutoff=cutoff), )([arr1, arr2])
def two_point_stats(arr1, arr2, periodic_boundary=True, cutoff=None, mask=None): r"""Calculate the 2-points stats for two arrays The discretized two point statistics are given by .. math:: f[r \; \vert \; l, l'] = \frac{1}{S} \sum_s m[s, l] m[s + r, l'] where :math:`f[r \; \vert \; l, l']` is the conditional probability of finding the local states :math:`l` and :math:`l` at a distance and orientation away from each other defined by the vector :math:`r`. `See this paper for more details on the notation. <https://doi.org/10.1007/s40192-017-0089-0>`_ The array ``arr1[i]`` (state :math:`l`) is correlated with ``arr2[i]`` (state :math:`l'`) for each sample ``i``. Both arrays must have the same number of samples and nominal states (integer value) or continuous variables. To calculate multiple different correlations for each sample, see :func:`~pymks.correlations_multiple`. To use ``two_point_stats`` as part of a Scikit-learn pipeline, see :class:`~pymks.TwoPointCorrelation`. Args: arr1: array used to calculate cross-correlations, shape ``(n_samples,n_x,n_y)`` arr2: array used to calculate cross-correlations, shape ``(n_samples,n_x,n_y)`` periodic_boundary: whether to assume a periodic boundary (default is ``True``) cutoff: the subarray of the 2 point stats to keep mask: array specifying confidence in the measurement at a pixel, shape ``(n_samples,n_x,n_y)``. In range [0,1]. Returns: the snipped 2-points stats If both arrays are Dask arrays then a Dask array is returned. >>> out = two_point_stats( ... da.from_array(np.arange(10).reshape(2, 5), chunks=(2, 5)), ... da.from_array(np.arange(10).reshape(2, 5), chunks=(2, 5)), ... ) >>> out.chunks ((2,), (5,)) >>> out.shape (2, 5) If either of the arrays are Numpy then a Numpy array is returned. >>> two_point_stats( ... np.arange(10).reshape(2, 5), ... np.arange(10).reshape(2, 5), ... ) array([[ 3., 4., 6., 4., 3.], [48., 49., 51., 49., 48.]]) Test masking >>> array = da.array([[[1, 0 ,0], [0, 1, 1], [1, 1, 0]]]) >>> mask = da.array([[[1, 1, 1], [1, 1, 1], [1, 0, 0]]]) >>> norm_mask = da.array([[[2, 4, 3], [4, 7, 4], [3, 4, 2]]]) >>> expected = da.array([[[1, 0, 1], [1, 4, 1], [1, 0, 1]]]) / norm_mask >>> assert np.allclose( ... two_point_stats(array, array, mask=mask, periodic_boundary=False)[:, 1:-1, 1:-1], ... expected ... ) The mask must be in the range 0 to 1. >>> array = da.array([[[1, 0], [0, 1]]]) >>> mask = da.array([[[2, 0], [0, 1]]]) >>> two_point_stats(array, array, mask=mask) Traceback (most recent call last): ... RuntimeError: Mask must be in range [0,1] """ # noqa: #501 n_is_even = 1 - np.array(arr1.shape[1:]) % 2 padding = np.array(arr1.shape[1:]) // 2 nonperiodic_padder = sequence( dapad( pad_width=[(0, 0)] + list(zip(padding, padding + n_is_even)), mode="constant", constant_values=0, ), lambda x: da.rechunk(x, (x.chunks[0], ) + x.shape[1:]), ) padder = identity if periodic_boundary else nonperiodic_padder if mask is not None: if da.max(mask).compute() > 1.0 or da.min(mask).compute() < 0.0: raise RuntimeError("Mask must be in range [0,1]") mask_array = lambda arr: arr * mask normalize = lambda x: x / auto_correlation(padder(mask)) else: mask_array = identity if periodic_boundary: # The periodic normalization could always be the # auto_correlation of the mask. But for the sake of # efficiency, we specify the periodic normalization in the # case there is no mask. 
normalize = sequence( lambda x: x / arr1[0].size, dapad( pad_width=[(0, 0)] + list(zip(0 * n_is_even, n_is_even)), mode="wrap", ), lambda x: da.rechunk(x, (x.chunks[0], ) + x.shape[1:]), ) else: normalize = lambda x: x / auto_correlation( padder(np.ones_like(arr1))) return sequence( map_(mask_array), map_(padder), list, star(cross_correlation), normalize, center_slice(cutoff=cutoff), )([arr1, arr2])
def correlations_multiple(data, correlations, periodic_boundary=True, cutoff=None): r"""Calculate 2-point stats for a multiple auto/cross correlation The discretized two point statistics are given by .. math:: f[r \; \vert \; l, l'] = \frac{1}{S} \sum_s m[s, l] m[s + r, l'] where :math:`f[r \; \vert \; l, l']` is the conditional probability of finding the local states :math:`l` and :math:`l'` at a distance and orientation away from each other defined by the vector :math:`r`. `See this paper for more details on the notation. <https://doi.org/10.1007/s40192-017-0089-0>`_ The correlations are calulated based on pairs given in ``correlations`` for each sample. To calculate a single correlation for two arrays, see :func:`~pymks.two_point_stats`. To use ``correlations_multiple`` as part of a Scikit-learn pipeline, see :class:`~pymks.TwoPointCorrelation`. Args: data: the discretized data with shape ``(n_samples, n_x, n_y, n_state)`` correlations: the correlation pairs, ``[[i0, j0], [i1, j1], ...]`` periodic_boundary: whether to assume a periodic boundary (default is true) cutoff: the subarray of the 2 point stats to keep Returns: the 2-points stats array If ``data`` is a Numpy array then ``correlations_multiple`` will return a Numpy array. >>> data = np.arange(18).reshape(1, 3, 3, 2) >>> out_np = correlations_multiple(data, [[0, 1], [1, 1]]) >>> out_np.shape (1, 3, 3, 2) >>> answer = np.array([[[58, 62, 58], [94, 98, 94], [58, 62, 58]]]) + 2. / 3. >>> assert np.allclose(out_np[..., 0], answer) However, if ``data`` is a Dask array then a Dask array is returned. >>> data = da.from_array(data, chunks=(1, 3, 3, 2)) >>> out = correlations_multiple(data, [[0, 1], [1, 1]]) >>> out.shape (1, 3, 3, 2) >>> out.chunks ((1,), (3,), (3,), (2,)) >>> assert np.allclose(out[..., 0], answer) """ return pipe( range(data.shape[-1]), map_(lambda x: (0, x)), lambda x: correlations if correlations else x, map_(lambda x: two_point_stats( data[..., x[0]], data[..., x[1]], periodic_boundary=periodic_boundary, cutoff=cutoff, )), list, lambda x: da.stack(x, axis=-1), lambda x: da.rechunk(x, x.chunks[:-1] + (-1, )), )