def show_images():
    plt.figure(1)
    plt.clf()
    plt.imshow(sky, vmin=da.min(sky), vmax=da.max(sky))
    plt.title('sky')
    plt.show(block=False)

    plt.figure(2)
    plt.clf()
    plt.imshow(psf, vmin=da.min(psf), vmax=da.max(psf))
    plt.title('psf')
    plt.show(block=False)

    plt.figure(3)
    plt.clf()
    plt.imshow(dirty, vmin=da.min(dirty), vmax=da.max(dirty))
    plt.title('dirty')
    plt.show(block=False)

    while plt.fignum_exists(1) and plt.fignum_exists(2) and plt.fignum_exists(3):
        try:
            plt.pause(10000000)
            plt.close("all")
        except:
            break
def load_data(statistic, axis):
    import dask.array as da
    import numpy as np
    from glue.utils import view_shape

    x = da.from_zarr('/mnt/cephfs/zarr_data_full')
    f = 1500
    scale = 2
    lh = []
    for k in range(scale):
        lc = []
        for i in range(scale):
            lr = []
            for j in range(scale):
                lr.append(x[f % 3500])
                f = f + 1
            lc.append(da.concatenate(lr))
        lh.append(da.concatenate(lc, 1))
    z = da.concatenate(lh, 2)

    if statistic == 'minimum':
        return da.min(z, axis).compute()
    elif statistic == 'maximum':
        return da.max(z, axis).compute()
    elif statistic == 'mean' or statistic == 'median':
        return da.mean(z, axis).compute()
    elif statistic == 'percentile':
        # NOTE: 'percentile' is not defined in this function; it is assumed to
        # come from the enclosing scope in the original code.
        return percentile / 100
    elif statistic == 'sum':
        return da.sum(z, axis).compute()
    return 0
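# Illustrative sketch (not part of the original snippet): the reductions dispatched
# above (da.min/da.max/da.mean/da.sum) all accept an axis argument and stay lazy
# until .compute(). A minimal standalone demonstration:
import numpy as np
import dask.array as da

_z = da.from_array(np.arange(24).reshape(2, 3, 4), chunks=(1, 3, 4))
print(da.min(_z, axis=0).compute().shape)   # (3, 4)
print(da.sum(_z, axis=(1, 2)).compute())    # [ 66 210]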
def extract(self):
    df_path = pd.read_csv('path_to_file.csv', sep=';')
    df_path = df_path.rename(columns={'Unnamed: 0': 'id'})
    df_path = df_path.set_index('id')
    print(df_path)

    ds_batch = xr.open_mfdataset(df_path['path'], parallel=True)  # loading netCDF files
    print(ds_batch)
    print("--- Total size (GB):")
    print(ds_batch.nbytes * (2**-30))  # size of the dataset in GB

    # getting average albedos over the whole time period (used for maps and scatter plots)
    darr = ds_batch['QFLAG']  # getting data for a specific band
    print(darr)

    # res = darr.mean(['lon', 'lat'])
    # res = da.count_nonzero(da.bitwise_and(darr // 2**5, 1), ['lon', 'lat'])
    # res = (darr == 32).sum(['lon', 'lat'])
    # res = xr.ufunc.bitwise_and(darr, 0b100000).sum(['lon', 'lat'])
    func = lambda x: np.bitwise_and(np.right_shift(x, 5), np.uint64(1))
    func = lambda x: np.bitwise_and(x, np.uint64(1))
    res = xr.apply_ufunc(func, darr,
                         input_core_dims=[['lon', 'lat']],
                         dask='parallelized',
                         vectorize=True)
    # res = np.bitwise_and(np.right_shift(darr, 5), 1).sum(['lon', 'lat'])
    # res = (darr == 32).max(['lon', 'lat'])
    print(np.array(res))
    sys.exit()

    # Everything below is unreachable after sys.exit(); 'da', 'da_mean' and 'DS'
    # are assumed to be defined elsewhere in the original code.
    da_count = ((da >> 5) & 1)  # calculate mean over time
    # da_mean_lowres = da_mean.sel(lat=slice(70, 30)).sel(lon=slice(-25, 70))  # zoom in over Europe
    da_mean_lowres = da_mean.isel(lat=slice(None, None, 10)).isel(
        lon=slice(None, None, 10))  # downsampling for faster plotting

    # getting average, min and max albedos for each time step (used to plot the timeline)
    da_timeline_mean = da.mean(['lon', 'lat'])
    da_timeline_max = da.max(['lon', 'lat'])
    da_timeline_min = da.min(['lon', 'lat'])

    # closing arrays to free memory
    DS.close()
    da.close()
    da_mean.close()

    return da_mean_lowres, da_timeline_mean, da_timeline_max, da_timeline_min

    # unreachable (placed after the return in the original)
    da_mean_lowres.close()
    da_timeline_mean.close()
    da_timeline_max.close()
    da_timeline_min.close()
def plot_subfigure(X, Y, subplot, transform):
    if transform == "pca":
        X = PCA(n_components=2).fit_transform(X)
    elif transform == "cca":
        X = CCA(n_components=2).fit(X, Y).transform(X)
    else:
        raise ValueError

    min_x = da.min(X[:, 0])
    max_x = da.max(X[:, 0])
    min_y = da.min(X[:, 1])
    max_y = da.max(X[:, 1])

    classif = OneVsRestClassifier(LogisticRegression())
    classif.fit(X, Y)
    y_pred = classif.predict(X)
    print('{} + OneVsRestClassifier + LogisticRegression accuracy_score {}'.format(
        transform, accuracy_score(Y, y_pred)))

    plt.subplot(1, 2, subplot)
    plt.scatter(X[:, 0], X[:, 1], s=15, c='gray', edgecolors=(0, 0, 0))
    for i in da.unique(Y.argmax(axis=1)):
        class_ = da.where(Y[:, i])
        plt.scatter(X[class_, 0], X[class_, 1], s=25, linewidths=2,
                    label='Class {}'.format(str(i)))
    for i in range(len(classif.estimators_)):
        plot_hyperplane(classif.estimators_[i], min_x, max_x, 'k--',
                        'Boundary\nfor class {}'.format(str(i)))

    plt.xticks(())
    plt.yticks(())
    plt.xlim(min_x - .1 * max_x, max_x + .1 * max_x)
    plt.ylim(min_y - .1 * max_y, max_y + .1 * max_y)
def add_data(workspace: String, dataset: String):
    import dask.array as da
    from survos2.improc.utils import optimal_chunksize

    ws = get(workspace)
    with dataset_from_uri(dataset, mode='r') as data:
        chunk_size = optimal_chunksize(data, Config['computing.chunk_size'])
        data = da.from_array(data, chunks=chunk_size)
        data -= da.min(data)
        data /= da.max(data)
        ds = ws.add_data(data)
    logger.info(type(ds))
    return ds
def show_images():
    plt.figure(1)
    plt.clf()
    plt.imshow(quad, vmin=da.min(quad), vmax=da.max(quad))
    plt.title('quad')
    plt.show(block=False)

    while plt.fignum_exists(1):
        try:
            plt.pause(100000)
            plt.close("all")
        except:
            break
def show_results():
    plt.figure(1)
    plt.clf()
    plt.imshow(hub, vmin=da.min(hub), vmax=da.max(hub))
    plt.title('huber')
    plt.show(block=False)

    while plt.fignum_exists(1):
        try:
            plt.pause(10000000)
            plt.close("all")
        except:
            break
def show_images():
    for i in range(len(dirty)):
        plt.figure(i + 1)
        plt.clf()
        plt.imshow(quad[i], vmin=da.min(quad[i]), vmax=da.max(quad[i]))
        plt.title('quad' + str(i))
        plt.show(block=False)

    while plt.fignum_exists(1):
        try:
            plt.pause(100000)
            plt.close("all")
        except:
            break
def _perlin_dask_numpy(data: da.Array, freq: tuple, seed: int) -> da.Array:
    np.random.seed(seed)
    p = np.random.permutation(2**20)
    p = np.append(p, p)

    height, width = data.shape
    linx = da.linspace(0, freq[0], width, endpoint=False, dtype=np.float32)
    liny = da.linspace(0, freq[1], height, endpoint=False, dtype=np.float32)
    x, y = da.meshgrid(linx, liny)

    _func = partial(_perlin, p)
    data = da.map_blocks(_func, x, y, meta=np.array((), dtype=np.float32))

    data = (data - da.min(data)) / da.ptp(data)
    return data
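# Illustrative sketch (not part of the original snippet): the final step above rescales
# the noise to [0, 1]. The same min-max normalisation on a standalone dask array,
# written with max - min instead of ptp; `_field` is a hypothetical stand-in input.
import numpy as np
import dask.array as da

_field = da.random.random((256, 256), chunks=(128, 128))
_norm = (_field - da.min(_field)) / (da.max(_field) - da.min(_field))
print(float(da.min(_norm).compute()), float(da.max(_norm).compute()))  # ~0.0 and ~1.0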
def test_reductions():
    x = np.arange(5).astype('f4')
    a = da.from_array(x, blockshape=(2,))

    assert eq(da.all(a), np.all(x))
    assert eq(da.any(a), np.any(x))
    assert eq(da.argmax(a, axis=0), np.argmax(x, axis=0))
    assert eq(da.argmin(a, axis=0), np.argmin(x, axis=0))
    assert eq(da.max(a), np.max(x))
    assert eq(da.mean(a), np.mean(x))
    assert eq(da.min(a), np.min(x))
    assert eq(da.nanargmax(a, axis=0), np.nanargmax(x, axis=0))
    assert eq(da.nanargmin(a, axis=0), np.nanargmin(x, axis=0))
    assert eq(da.nanmax(a), np.nanmax(x))
    assert eq(da.nanmin(a), np.nanmin(x))
    assert eq(da.nansum(a), np.nansum(x))
    assert eq(da.nanvar(a), np.nanvar(x))
    assert eq(da.nanstd(a), np.nanstd(x))
def test_reductions():
    x = np.arange(5).astype('f4')
    a = da.from_array(x, chunks=(2,))

    assert eq(da.all(a), np.all(x))
    assert eq(da.any(a), np.any(x))
    assert eq(da.argmax(a, axis=0), np.argmax(x, axis=0))
    assert eq(da.argmin(a, axis=0), np.argmin(x, axis=0))
    assert eq(da.max(a), np.max(x))
    assert eq(da.mean(a), np.mean(x))
    assert eq(da.min(a), np.min(x))
    assert eq(da.nanargmax(a, axis=0), np.nanargmax(x, axis=0))
    assert eq(da.nanargmin(a, axis=0), np.nanargmin(x, axis=0))
    assert eq(da.nanmax(a), np.nanmax(x))
    assert eq(da.nanmin(a), np.nanmin(x))
    assert eq(da.nansum(a), np.nansum(x))
    assert eq(da.nanvar(a), np.nanvar(x))
    assert eq(da.nanstd(a), np.nanstd(x))
def add_data(workspace: String, data_fname: String):
    import dask.array as da
    from survos2.improc.utils import optimal_chunksize

    ws = get(workspace)
    logger.info(f"Adding data to workspace {ws}")

    with dataset_from_uri(data_fname, mode="r") as data:
        chunk_size = optimal_chunksize(data, Config["computing.chunk_size"])
        logger.debug(
            f'Calculating optimal chunk size using chunk_size {Config["computing.chunk_size"]}: {chunk_size}'
        )
        data = da.from_array(data, chunks=chunk_size)
        data -= da.min(data)
        data /= da.max(data)
        ds = ws.add_data(data)
        # ds.set_attr("chunk_size", chunk_size)
    return ds
def statistics(self, data, pca_stats=None):
    # set headers
    if pca_stats:  # for pca
        if pca_stats["eigenvals"] is not None:
            self.stats_header.setText("Eigenvalue: {} ({}%)".format(
                round(pca_stats["eigenvals"][self.pc_id - 1], 2),
                round(pca_stats["eigenvals_%"][self.pc_id - 1], 2)))
            self.stats_header.setToolTip(
                "It shows how are the dispersion of the data with respect to its component")
        else:
            self.stats_header.setText("Eigenvalue: --")
            self.stats_header.setToolTip(
                "Is only available when the components are computed with the plugin")
    else:  # for aoi
        self.stats_header.setText("Pixels in AOI: {}".format(
            round(data.size if data.size > 1 else 0, 2)))
        self.stats_header.setToolTip("")

    # restore or compute the statistics
    if self.QCBox_StatsLayer.currentText() == self.pc_name and self.stats_pc is not None:
        min, max, std, p25, p50, p75 = self.stats_pc
    else:
        da_data = da.from_array(data, chunks=(8000000,))
        min = da.min(da_data).compute()
        max = da.max(da_data).compute()
        std = da.std(da_data).compute()
        p25 = da.percentile(da_data, 25).compute()[0]
        p50 = da.percentile(da_data, 50).compute()[0]
        p75 = da.percentile(da_data, 75).compute()[0]
        if self.QCBox_StatsLayer.currentText() == self.pc_name:
            self.stats_pc = (min, max, std, p25, p50, p75)

    # set in dialog
    self.stats_min.setText(str(round(min, 2)))
    self.stats_max.setText(str(round(max, 2)))
    self.stats_std.setText(str(round(std, 2)))
    self.stats_p25.setText(str(round(p25, 2)))
    self.stats_p50.setText(str(round(p50, 2)))
    self.stats_p75.setText(str(round(p75, 2)))
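# Illustrative sketch (not part of the original snippet): the statistics above are
# built lazily and materialised with .compute(); da.percentile works on 1-D arrays
# and returns an array, hence the [0] indexing. A small standalone version:
import numpy as np
import dask.array as da

_values = da.from_array(np.random.normal(size=1_000_000), chunks=250_000)
_p25, _p50, _p75 = (da.percentile(_values, q).compute()[0] for q in (25, 50, 75))
print(round(_p25, 2), round(_p50, 2), round(_p75, 2))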
def test_workspace():
    ws = Workspace(".")
    workspace_fpath = "./newws1"
    ws = ws.create(workspace_fpath)

    data_fname = "./tmp/testvol_4x4x4b.h5"
    with dataset_from_uri(data_fname, mode="r") as data:
        chunk_size = optimal_chunksize(data, Config["computing.chunk_size"])
        data = da.from_array(data, chunks=chunk_size)
        data -= da.min(data)
        data /= da.max(data)
        ds = ws.add_data(data)
        # ds.set_attr("chunk_size", chunk_size)

    ws.add_dataset("testds", "float32")

    assert ws.exists(workspace_fpath)
    assert ws.has_data()
    assert ws.available_datasets() == ['testds']

    ws.add_session('newsesh')
    assert ws.has_session('newsesh')

    ws.delete()
def nearestPD(A, threads=1):
    """Find the nearest positive-definite matrix to input A

    Python/Numpy port of John D'Errico's `nearestSPD` MATLAB code [1], which
    credits [2] from Ahmed Fasih

    [1] https://www.mathworks.com/matlabcentral/fileexchange/42885-nearestspd
    [2] N.J. Higham, "Computing a nearest symmetric positive semidefinite
        matrix" (1988): https://doi.org/10.1016/0024-3795(88)90223-6
    """
    isPD = lambda x: da.all(np.linalg.eigvals(x) > 0).compute()

    B = (A + A.T) / 2
    _, s, V = da.linalg.svd(B)
    H = da.dot(V.T, da.dot(da.diag(s), V))
    A2 = (B + H) / 2
    A3 = (A2 + A2.T) / 2

    if isPD(A3):
        return A3

    spacing = da.spacing(da.linalg.norm(A))
    # The above is different from [1]. It appears that MATLAB's `chol` Cholesky
    # decomposition will accept matrixes with exactly 0-eigenvalue, whereas
    # Numpy's will not. So where [1] uses `eps(mineig)` (where `eps` is Matlab
    # for `np.spacing`), we use the above definition. CAVEAT: our `spacing`
    # will be much larger than [1]'s `eps(mineig)`, since `mineig` is usually on
    # the order of 1e-16, and `eps(1e-16)` is on the order of 1e-34, whereas
    # `spacing` will, for Gaussian random matrixes of small dimension, be on
    # the order of 1e-16. In practice, both ways converge, as the unit test
    # below suggests.
    eye_chunk = estimate_chunks((A.shape[0], A.shape[0]), threads=threads)[0]
    I = da.eye(A.shape[0], chunks=eye_chunk)
    k = 1
    while not isPD(A3):
        mineig = da.min(da.real(np.linalg.eigvals(A3)))
        A3 += I * (-mineig * k**2 + spacing)
        k += 1
    return A3
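# Illustrative sketch (not part of the original snippet): the isPD check above
# materialises the array and tests that all eigenvalues are positive; the same test
# on a small symmetric matrix, with the dask-to-numpy conversion made explicit.
import numpy as np
import dask.array as da

_M = da.from_array(np.array([[2.0, 0.5], [0.5, 1.0]]), chunks=2)
_eigvals = np.linalg.eigvals(np.asarray(_M))  # np.asarray triggers the dask computation
print(bool(np.all(_eigvals > 0)))  # True: both eigenvalues are positive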
def two_point_stats(arr1, arr2, periodic_boundary=True, cutoff=None, mask=None):
    r"""Calculate the 2-points stats for two arrays

    The discretized two point statistics are given by

    .. math::

       f[r \; \vert \; l, l'] = \frac{1}{S} \sum_s m[s, l] m[s + r, l']

    where :math:`f[r \; \vert \; l, l']` is the conditional probability of
    finding the local states :math:`l` and :math:`l'` at a distance and
    orientation away from each other defined by the vector :math:`r`. `See
    this paper for more details on the notation.
    <https://doi.org/10.1007/s40192-017-0089-0>`_

    The array ``arr1[i]`` (state :math:`l`) is correlated with ``arr2[i]``
    (state :math:`l'`) for each sample ``i``. Both arrays must have the same
    number of samples and nominal states (integer value) or continuous
    variables.

    To calculate multiple different correlations for each sample, see
    :func:`~pymks.correlations_multiple`.

    To use ``two_point_stats`` as part of a Scikit-learn pipeline, see
    :class:`~pymks.TwoPointCorrelation`.

    Args:
      arr1: array used to calculate cross-correlations, shape
        ``(n_samples,n_x,n_y)``
      arr2: array used to calculate cross-correlations, shape
        ``(n_samples,n_x,n_y)``
      periodic_boundary: whether to assume a periodic boundary (default is
        ``True``)
      cutoff: the subarray of the 2 point stats to keep
      mask: array specifying confidence in the measurement at a pixel, shape
        ``(n_samples,n_x,n_y)``. In range [0,1].

    Returns:
      the snipped 2-points stats

    If both arrays are Dask arrays then a Dask array is returned.

    >>> out = two_point_stats(
    ...     da.from_array(np.arange(10).reshape(2, 5), chunks=(2, 5)),
    ...     da.from_array(np.arange(10).reshape(2, 5), chunks=(2, 5)),
    ... )
    >>> out.chunks
    ((2,), (5,))
    >>> out.shape
    (2, 5)

    If either of the arrays are Numpy then a Numpy array is returned.

    >>> two_point_stats(
    ...     np.arange(10).reshape(2, 5),
    ...     np.arange(10).reshape(2, 5),
    ... )
    array([[ 3.,  4.,  6.,  4.,  3.],
           [48., 49., 51., 49., 48.]])

    Test masking

    >>> array = da.array([[[1, 0 ,0], [0, 1, 1], [1, 1, 0]]])
    >>> mask = da.array([[[1, 1, 1], [1, 1, 1], [1, 0, 0]]])
    >>> norm_mask = da.array([[[2, 4, 3], [4, 7, 4], [3, 4, 2]]])
    >>> expected = da.array([[[1, 0, 1], [1, 4, 1], [1, 0, 1]]]) / norm_mask
    >>> assert np.allclose(
    ...     two_point_stats(array, array, mask=mask, periodic_boundary=False)[:, 1:-1, 1:-1],
    ...     expected
    ... )

    The mask must be in the range 0 to 1.

    >>> array = da.array([[[1, 0], [0, 1]]])
    >>> mask = da.array([[[2, 0], [0, 1]]])
    >>> two_point_stats(array, array, mask=mask)
    Traceback (most recent call last):
    ...
    RuntimeError: Mask must be in range [0,1]

    """  # noqa: #501
    n_is_even = 1 - np.array(arr1.shape[1:]) % 2
    padding = np.array(arr1.shape[1:]) // 2
    nonperiodic_padder = sequence(
        dapad(
            pad_width=[(0, 0)] + list(zip(padding, padding + n_is_even)),
            mode="constant",
            constant_values=0,
        ),
        lambda x: da.rechunk(x, (x.chunks[0],) + x.shape[1:]),
    )
    padder = identity if periodic_boundary else nonperiodic_padder

    if mask is not None:
        if da.max(mask).compute() > 1.0 or da.min(mask).compute() < 0.0:
            raise RuntimeError("Mask must be in range [0,1]")

        mask_array = lambda arr: arr * mask
        normalize = lambda x: x / auto_correlation(padder(mask))
    else:
        mask_array = identity

        if periodic_boundary:
            # The periodic normalization could always be the
            # auto_correlation of the mask. But for the sake of
            # efficiency, we specify the periodic normalization in the
            # case there is no mask.
            normalize = sequence(
                lambda x: x / arr1[0].size,
                dapad(
                    pad_width=[(0, 0)] + list(zip(0 * n_is_even, n_is_even)),
                    mode="wrap",
                ),
                lambda x: da.rechunk(x, (x.chunks[0],) + x.shape[1:]),
            )
        else:
            normalize = lambda x: x / auto_correlation(padder(np.ones_like(arr1)))

    return sequence(
        map_(mask_array),
        map_(padder),
        list,
        star(cross_correlation),
        normalize,
        center_slice(cutoff=cutoff),
    )([arr1, arr2])
def triclustering(Z, nclusters_row, nclusters_col, nclusters_bnd, errobj,
                  niters, epsilon, row_clusters_init=None,
                  col_clusters_init=None, bnd_clusters_init=None):
    """
    Run the tri-clustering, Dask implementation

    :param Z: d x m x n data matrix
    :param nclusters_row: number of row clusters
    :param nclusters_col: number of column clusters
    :param nclusters_bnd: number of band clusters
    :param errobj: convergence threshold for the objective function
    :param niters: maximum number of iterations
    :param epsilon: numerical parameter, avoids zero arguments in log
    :param row_clusters_init: initial row cluster assignment
    :param col_clusters_init: initial column cluster assignment
    :param bnd_clusters_init: initial band cluster assignment
    :return: has converged, number of iterations performed, final row,
        column, and band clustering, error value
    """
    client = get_client()

    Z = da.array(Z) if not isinstance(Z, da.Array) else Z

    [d, m, n] = Z.shape
    bnd_chunks, row_chunks, col_chunks = Z.chunksize

    row_clusters = da.array(row_clusters_init) \
        if row_clusters_init is not None \
        else _initialize_clusters(m, nclusters_row, chunks=row_chunks)
    col_clusters = da.array(col_clusters_init) \
        if col_clusters_init is not None \
        else _initialize_clusters(n, nclusters_col, chunks=col_chunks)
    bnd_clusters = da.array(bnd_clusters_init) \
        if bnd_clusters_init is not None \
        else _initialize_clusters(d, nclusters_bnd, chunks=bnd_chunks)
    R = _setup_cluster_matrix(nclusters_row, row_clusters)
    C = _setup_cluster_matrix(nclusters_col, col_clusters)
    B = _setup_cluster_matrix(nclusters_bnd, bnd_clusters)

    e, old_e = 2 * errobj, 0
    s = 0
    converged = False

    Gavg = Z.mean()

    while (not converged) & (s < niters):
        logger.debug(f'Iteration # {s} ..')
        # Calculate number of elements in each tri-cluster
        nel_row_clusters = da.bincount(row_clusters, minlength=nclusters_row)
        nel_col_clusters = da.bincount(col_clusters, minlength=nclusters_col)
        nel_bnd_clusters = da.bincount(bnd_clusters, minlength=nclusters_bnd)
        logger.debug(
            'num of populated clusters: row {}, col {}, bnd {}'.format(
                da.sum(nel_row_clusters > 0).compute(),
                da.sum(nel_col_clusters > 0).compute(),
                da.sum(nel_bnd_clusters > 0).compute()))
        nel_clusters = da.einsum('i,j->ij', nel_row_clusters, nel_col_clusters)
        nel_clusters = da.einsum('i,jk->ijk', nel_bnd_clusters, nel_clusters)

        # calculate tri-cluster averages (epsilon takes care of empty clusters)
        # first sum values in each tri-cluster ..
        TriCavg = da.einsum('ij,ilm->jlm', B, Z)  # .. along band axis
        TriCavg = da.einsum('ij,kim->kjm', R, TriCavg)  # .. along row axis
        TriCavg = da.einsum('ij,kli->klj', C, TriCavg)  # .. along col axis
        # finally divide by number of elements in each tri-cluster
        TriCavg = (TriCavg + Gavg * epsilon) / (nel_clusters + epsilon)

        # unpack tri-cluster averages ..
        avg_unpck = da.einsum('ij,jkl->ikl', B, TriCavg)  # .. along band axis
        avg_unpck = da.einsum('ij,klj->kli', C, avg_unpck)  # .. along col axis
        # use these for the row cluster assignment
        idx = (1, 0, 2)
        d_row = _distance(Z.transpose(idx), avg_unpck.transpose(idx), epsilon)
        row_clusters = da.argmin(d_row, axis=1)
        R = _setup_cluster_matrix(nclusters_row, row_clusters)

        # unpack tri-cluster averages ..
        avg_unpck = da.einsum('ij,jkl->ikl', B, TriCavg)  # .. along band axis
        avg_unpck = da.einsum('ij,kjl->kil', R, avg_unpck)  # .. along row axis
        # use these for the col cluster assignment
        idx = (2, 0, 1)
        d_col = _distance(Z.transpose(idx), avg_unpck.transpose(idx), epsilon)
        col_clusters = da.argmin(d_col, axis=1)
        C = _setup_cluster_matrix(nclusters_col, col_clusters)

        # unpack tri-cluster averages ..
        avg_unpck = da.einsum('ij,kjl->kil', R, TriCavg)  # .. along row axis
        avg_unpck = da.einsum('ij,klj->kli', C, avg_unpck)  # .. along col axis
        # use these for the band cluster assignment
        d_bnd = _distance(Z, avg_unpck, epsilon)
        bnd_clusters = da.argmin(d_bnd, axis=1)
        B = _setup_cluster_matrix(nclusters_bnd, bnd_clusters)

        # Error value (actually just the band component really)
        old_e = e
        minvals = da.min(d_bnd, axis=1)
        # power 1 divergence, power 2 euclidean
        e = da.sum(da.power(minvals, 1))

        row_clusters, R, col_clusters, C, bnd_clusters, B, e = client.persist(
            [row_clusters, R, col_clusters, C, bnd_clusters, B, e])

        e = e.compute()
        logger.debug(f'Error = {e:+.15e}, dE = {e - old_e:+.15e}')
        converged = abs(e - old_e) < errobj
        s = s + 1
    if converged:
        logger.debug(f'Triclustering converged in {s} iterations')
    else:
        logger.debug(f'Triclustering not converged in {s} iterations')
    return converged, s, row_clusters, col_clusters, bnd_clusters, e
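# Illustrative sketch (not part of the original snippet): the clustering loop above
# counts cluster occupancy with da.bincount; shown here on a toy assignment vector.
import numpy as np
import dask.array as da

_assignments = da.from_array(np.array([0, 2, 1, 0, 2, 2]), chunks=3)
print(da.bincount(_assignments, minlength=4).compute())  # [2 1 3 0] -> cluster 3 is empty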
def plot_dataset(X, y, images=None, labels=None, gray=False, save=None,
                 y_original=None):
    print('data size {}'.format(X.shape))
    uni_y = len(da.unique(y).compute())
    x_min, x_max = da.min(X, 0), da.max(X, 0)
    X = (X - x_min) / (x_max - x_min)

    # if save is not None:
    #     plt.figure(figsize=(27, 18), dpi=600)
    # else:
    fig = plt.figure(figsize=(27, 18), dpi=100)
    ax = plt.subplot(111)
    for i in tqdm(range(X.shape[0])):
        plt.text(X[i, 0], X[i, 1], str(y[i]),
                 color=plt.cm.Set1(y[i] / uni_y),
                 fontdict={'weight': 'bold', 'size': 9})

    if images is not None:
        if hasattr(offsetbox, 'AnnotationBbox'):
            # only print thumbnails with matplotlib > 1.0
            shown_images = da.array([[1., 1.]])  # just something big
            for i in range(X.shape[0]):
                dist = da.sum((X[i] - shown_images)**2, 1)
                if da.min(dist) < 4e-3:
                    # don't show points that are too close
                    continue
                if labels is not None:
                    if y_original is not None:
                        plt.text(X[i, 0] - 0.01, X[i, 1] - 0.033,
                                 labels[y_original[i]],
                                 fontdict={'weight': 'bold', 'size': 15})
                    else:
                        plt.text(X[i, 0] - 0.01, X[i, 1] - 0.033,
                                 labels[y[i]],
                                 fontdict={'weight': 'bold', 'size': 15})
                shown_images = da.r_[shown_images, [X[i]]]
                if gray:
                    image_ = offsetbox.OffsetImage(
                        da.expand_dims(util.invert(images[i]), axis=0))
                else:
                    image_ = offsetbox.OffsetImage(images[i], cmap=plt.cm.gray_r)
                imagebox = offsetbox.AnnotationBbox(image_, X[i])
                ax.add_artist(imagebox)

    plt.xticks([]), plt.yticks([])
    for item in [fig, ax]:
        item.patch.set_visible(False)
    ax.axis('off')

    if save is not None:
        print('Saving Image {} ...'.format(save))
        plt.title('epoch ' + save.split('.')[0].split()[-1],
                  fontdict={'fontsize': 20}, loc='left')
        plt.savefig(save)
        plt.close()
    else:
        plt.show()

    del X, y, fig, ax
    gc.collect()
def two_point_stats(arr1, arr2, mask=None, periodic_boundary=True, cutoff=None):
    """Calculate the 2-points stats for two arrays

    Args:
      arr1: array used to calculate cross-correlations (n_samples,n_x,n_y)
      arr2: array used to calculate cross-correlations (n_samples,n_x,n_y)
      mask: array specifying confidence in the measurement at a pixel
        (n_samples,n_x,n_y). In range [0,1].
      periodic_boundary: whether to assume a periodic boundary (default is true)
      cutoff: the subarray of the 2 point stats to keep

    Returns:
      the snipped 2-points stats

    >>> two_point_stats(
    ...     da.from_array(np.arange(10).reshape(2, 5), chunks=(2, 5)),
    ...     da.from_array(np.arange(10).reshape(2, 5), chunks=(2, 5)),
    ... ).shape
    (2, 5)

    Test masking

    >>> array = da.array([[[1, 0 ,0], [0, 1, 1], [1, 1, 0]]])
    >>> mask = da.array([[[1, 1, 1], [1, 1, 1], [1, 0, 0]]])
    >>> norm_mask = da.array([[[2, 4, 3], [4, 7, 4], [3, 4, 2]]])
    >>> expected = da.array([[[1, 0, 1], [1, 4, 1], [1, 0, 1]]]) / norm_mask
    >>> assert np.allclose(
    ...     two_point_stats(array, array, mask=mask, periodic_boundary=False),
    ...     expected
    ... )

    The mask must be in the range 0 to 1.

    >>> array = da.array([[[1, 0], [0, 1]]])
    >>> mask = da.array([[[2, 0], [0, 1]]])
    >>> two_point_stats(array, array, mask)
    Traceback (most recent call last):
    ...
    RuntimeError: Mask must be in range [0,1]

    """
    cutoff_ = int((np.min(arr1.shape[1:]) - 1) / 2)
    if cutoff is None:
        cutoff = cutoff_
    cutoff = min(cutoff, cutoff_)

    nonperiodic_padder = sequence(
        dapad(
            pad_width=[(0, 0)] + [(cutoff, cutoff)] * (arr1.ndim - 1),
            mode="constant",
            constant_values=0,
        ),
        lambda x: da.rechunk(x, (x.chunks[0],) + x.shape[1:]),
    )

    padder = identity if periodic_boundary else nonperiodic_padder

    if mask is not None:
        if da.max(mask).compute() > 1.0 or da.min(mask).compute() < 0.0:
            raise RuntimeError("Mask must be in range [0,1]")

        mask_array = lambda arr: arr * mask
        normalize = lambda x: x / auto_correlation(padder(mask))
    else:
        mask_array = identity

        if periodic_boundary:
            # The periodic normalization could always be the
            # auto_correlation of the mask. But for the sake of
            # efficiency, we specify the periodic normalization in the
            # case there is no mask.
            normalize = lambda x: x / arr1[0].size
        else:
            normalize = lambda x: x / auto_correlation(padder(np.ones_like(arr1)))

    return sequence(
        map_(mask_array),
        map_(padder),
        list,
        star(cross_correlation),
        normalize,
        center_slice(cutoff=cutoff),
    )([arr1, arr2])
def coclustering(Z, nclusters_row, nclusters_col, errobj, niters, epsilon,
                 col_clusters_init=None, row_clusters_init=None,
                 run_on_worker=False):
    """
    Run the co-clustering, Dask implementation

    :param Z: m x n data matrix
    :param nclusters_row: number of row clusters
    :param nclusters_col: number of column clusters
    :param errobj: convergence threshold for the objective function
    :param niters: maximum number of iterations
    :param epsilon: numerical parameter, avoids zero arguments in log
    :param row_clusters_init: initial row cluster assignment
    :param col_clusters_init: initial column cluster assignment
    :param run_on_worker: whether the function is submitted to a Dask worker
    :return: has converged, number of iterations performed, final row and
        column clustering, error value
    """
    client = get_client()

    Z = da.array(Z) if not isinstance(Z, da.Array) else Z

    [m, n] = Z.shape
    row_chunks, col_chunks = Z.chunksize

    row_clusters = da.array(row_clusters_init) \
        if row_clusters_init is not None \
        else _initialize_clusters(m, nclusters_row, chunks=row_chunks)
    col_clusters = da.array(col_clusters_init) \
        if col_clusters_init is not None \
        else _initialize_clusters(n, nclusters_col, chunks=col_chunks)
    R = _setup_cluster_matrix(nclusters_row, row_clusters)
    C = _setup_cluster_matrix(nclusters_col, col_clusters)

    e, old_e = 2 * errobj, 0
    s = 0
    converged = False

    Gavg = Z.mean()

    while (not converged) & (s < niters):
        logger.debug(f'Iteration # {s} ..')
        # Calculate cluster based averages
        # nel_clusters is a matrix with the number of elements per co-cluster
        # originally computed as: da.dot(da.dot(R.T, da.ones((m, n))), C)
        nel_row_clusters = da.bincount(row_clusters, minlength=nclusters_row)
        nel_col_clusters = da.bincount(col_clusters, minlength=nclusters_col)
        logger.debug('num of populated clusters: row {}, col {}'.format(
            da.sum(nel_row_clusters > 0).compute(),
            da.sum(nel_col_clusters > 0).compute()))
        nel_clusters = da.outer(nel_row_clusters, nel_col_clusters)
        CoCavg = (da.matmul(da.matmul(R.T, Z), C) + Gavg * epsilon) / \
                 (nel_clusters + epsilon)

        # Calculate distance based on row approximation
        d_row = _distance(Z, da.matmul(C, CoCavg.T), epsilon)
        # Assign to best row cluster
        row_clusters = da.argmin(d_row, axis=1)
        R = _setup_cluster_matrix(nclusters_row, row_clusters)

        # Calculate distance based on column approximation
        d_col = _distance(Z.T, da.matmul(R, CoCavg), epsilon)
        # Assign to best column cluster
        col_clusters = da.argmin(d_col, axis=1)
        C = _setup_cluster_matrix(nclusters_col, col_clusters)

        # Error value (actually just the column components really)
        old_e = e
        minvals = da.min(d_col, axis=1)
        # power 1 divergence, power 2 euclidean
        e = da.sum(da.power(minvals, 1))

        row_clusters, R, col_clusters, C, e = client.persist(
            [row_clusters, R, col_clusters, C, e])

        if run_on_worker:
            # this is a workaround for e.compute() for a function that runs
            # on a worker with multiple threads
            # https://github.com/dask/distributed/issues/3827
            e = client.compute(e)
            secede()
            e = e.result()
            rejoin()
        else:
            e = e.compute()
        logger.debug(f'Error = {e:+.15e}, dE = {e - old_e:+.15e}')
        converged = abs(e - old_e) < errobj
        s = s + 1
    if converged:
        logger.debug(f'Coclustering converged in {s} iterations')
    else:
        logger.debug(f'Coclustering not converged in {s} iterations')
    return converged, s, row_clusters, col_clusters, e
def density_flux(population, total_population, carrying_capacity, distance,
                 csx, csy, **kwargs):
    """
    'density-based dispersion'

    Dispersal is calculated using the following sequence of methods:

    Portions of populations at each element (node, or grid cell) in the study
    area array (raster) are moved to surrounding elements (a neighbourhood)
    within a radius that is defined by the input distance (:math:`d`), as
    presented in the conceptual figure below.

    .. image:: images/density_flux_neighbourhood.png
        :align: center

    .. attention:: No dispersal will occur if the provided distance is less
        than the distance between elements (grid cells) in the model domain,
        as none will be included in the neighbourhood

    The mean density (:math:`\\rho`) of all elements in the neighbourhood is
    calculated as:

    .. math::
        \\rho=\\frac{\\sum_{i=1}^{n} \\frac{pop_T(i)}{k_T(i)}}{n}

    where,

    :math:`pop_T` is the total population (of the entire species) at each element (:math:`i`); and\n
    :math:`k_T` is the total carrying capacity for the species

    The density gradient at each element (:math:`\\Delta`) with respect to the
    mean is calculated as:

    .. math::
        \\Delta(i)=\\frac{pop_T(i)}{k_T(i)}-\\rho

    If the centroid element is above the mean :math:`[\\Delta(i_0) > 0]`, it is
    able to release a portion of its population to elements in the
    neighbourhood. The eligible population to be received by surrounding
    elements is equal to the sum of populations at elements with negative
    density gradients, the :math:`candidates`:

    .. math::
        candidates=\\sum_{i=1}^{n} \\Delta(i)[\\Delta(i) < 0]k_T(i)

    The minimum of either the population above the mean at the centroid
    element - :math:`source=\\Delta(i_0)*k_T(i_0)`, or the :math:`candidates`
    are used to determine the total population that is dispersed from the
    centroid element to the other elements in the neighbourhood:

    .. math::
        dispersal=min\{source, candidates\}

    The population at the centroid element becomes:

    .. math::
        pop_a(i_0)=pop_a(i_0)-\\frac{pop_a(i_0)}{pop_T(i_0)}dispersal

    where,

    :math:`pop_a` is the age (stage) group population, which is a
    sub-population of the total.

    The populations of the candidate elements in the neighbourhood become
    (a net gain due to negative gradients):

    .. math::
        pop_a(i)=pop_a(i)-\\frac{\\Delta(i)[\\Delta(i) < 0]k_T(i)}{candidates}dispersal\\frac{pop_a(i)}{pop_T(i)}

    :param da.Array population: Sub-population to redistribute (subset of the ``total_population``)
    :param da.Array total_population: Total population
    :param da.Array carrying_capacity: Total Carrying Capacity (k)
    :param float distance: Maximum dispersal distance
    :param float csx: Cell size of the domain in the x-direction
    :param float csy: Cell size of the domain in the y-direction

    .. Attention:: Ensure the cell sizes are in the same units as the specified direction

    :Keyword Arguments:
        **mask** (*array*) --
            A weighting mask that scales dispersal based on the normalized
            mask value (default: None)

    :return: Redistributed population
    """
    if any([not isinstance(a, da.Array)
            for a in [population, total_population, carrying_capacity]]):
        raise DispersalError('Inputs must be dask arrays')

    if distance == 0:
        # Don't do anything
        return population

    chunks = tuple(c[0] if c else 0 for c in population.chunks)[:2]

    mask = kwargs.get('mask', None)
    if mask is None:
        mask = da.ones(shape=population.shape, dtype='float32', chunks=chunks)

    # Normalize the mask
    mask_min = da.min(mask)
    _range = da.max(mask) - mask_min
    mask = da.where(_range > 0, (mask - mask_min) / _range, 1.)

    # Calculate the kernel indices and shape
    kernel = calculate_kernel(distance, csx, csy)
    if kernel is None:
        # Not enough distance to cover a grid cell
        return population
    kernel, m, n = kernel
    m = int(m)
    n = int(n)

    a = da.pad(da.dstack([population, total_population, carrying_capacity, mask]),
               ((m, m), (n, n), (0, 0)), 'constant', constant_values=0)

    _m = -m
    if m == 0:
        _m = None
    _n = -n
    if n == 0:
        _n = None

    output = delayed(density_flux_task)(a, kernel, m, n)[m:_m, n:_n, 0]
    output = da.from_delayed(output, population.shape, np.float32)

    return output.rechunk(chunks)
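# Illustrative sketch (not part of the original snippet): density_flux hands a padded
# dask array to a delayed task and re-enters the dask graph via da.from_delayed; the
# same round trip with a hypothetical numpy-only task standing in for density_flux_task.
import numpy as np
import dask.array as da
from dask import delayed

def _eager_halve(block):
    # hypothetical stand-in; receives a concrete numpy array when the graph runs
    return (block * 0.5).astype(np.float32)

_arr = da.ones((6, 6), chunks=(3, 3))
_out = da.from_delayed(delayed(_eager_halve)(_arr), shape=_arr.shape, dtype=np.float32)
print(float(_out.compute().sum()))  # 18.0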