import logging

import dask
import dask.array as da
import numpy as np
from dask.distributed import get_client, rejoin, secede

logger = logging.getLogger(__name__)


def triclustering(Z, nclusters_row, nclusters_col, nclusters_bnd, errobj,
                  niters, epsilon, row_clusters_init=None,
                  col_clusters_init=None, bnd_clusters_init=None):
    """
    Run the tri-clustering, Dask implementation

    :param Z: d x m x n data matrix
    :param nclusters_row: number of row clusters
    :param nclusters_col: number of column clusters
    :param nclusters_bnd: number of band clusters
    :param errobj: convergence threshold for the objective function
    :param niters: maximum number of iterations
    :param epsilon: numerical parameter, avoids zero arguments in log
    :param row_clusters_init: initial row cluster assignment
    :param col_clusters_init: initial column cluster assignment
    :param bnd_clusters_init: initial band cluster assignment
    :return: has converged, number of iterations performed, final row,
        column, and band clustering, error value
    """
    client = get_client()

    Z = da.array(Z) if not isinstance(Z, da.Array) else Z

    [d, m, n] = Z.shape
    bnd_chunks, row_chunks, col_chunks = Z.chunksize

    row_clusters = da.array(row_clusters_init) \
        if row_clusters_init is not None \
        else _initialize_clusters(m, nclusters_row, chunks=row_chunks)
    col_clusters = da.array(col_clusters_init) \
        if col_clusters_init is not None \
        else _initialize_clusters(n, nclusters_col, chunks=col_chunks)
    bnd_clusters = da.array(bnd_clusters_init) \
        if bnd_clusters_init is not None \
        else _initialize_clusters(d, nclusters_bnd, chunks=bnd_chunks)
    R = _setup_cluster_matrix(nclusters_row, row_clusters)
    C = _setup_cluster_matrix(nclusters_col, col_clusters)
    B = _setup_cluster_matrix(nclusters_bnd, bnd_clusters)

    e, old_e = 2 * errobj, 0
    s = 0
    converged = False

    Gavg = Z.mean()

    while (not converged) & (s < niters):
        logger.debug(f'Iteration # {s} ..')
        # Calculate number of elements in each tri-cluster
        nel_row_clusters = da.bincount(row_clusters, minlength=nclusters_row)
        nel_col_clusters = da.bincount(col_clusters, minlength=nclusters_col)
        nel_bnd_clusters = da.bincount(bnd_clusters, minlength=nclusters_bnd)
        logger.debug(
            'num of populated clusters: row {}, col {}, bnd {}'.format(
                da.sum(nel_row_clusters > 0).compute(),
                da.sum(nel_col_clusters > 0).compute(),
                da.sum(nel_bnd_clusters > 0).compute()))
        nel_clusters = da.einsum('i,j->ij', nel_row_clusters,
                                 nel_col_clusters)
        nel_clusters = da.einsum('i,jk->ijk', nel_bnd_clusters, nel_clusters)

        # calculate tri-cluster averages (epsilon takes care of empty
        # clusters): first sum values in each tri-cluster ..
        TriCavg = da.einsum('ij,ilm->jlm', B, Z)  # .. along band axis
        TriCavg = da.einsum('ij,kim->kjm', R, TriCavg)  # .. along row axis
        TriCavg = da.einsum('ij,kli->klj', C, TriCavg)  # .. along col axis
        # finally divide by number of elements in each tri-cluster
        TriCavg = (TriCavg + Gavg * epsilon) / (nel_clusters + epsilon)

        # unpack tri-cluster averages ..
        avg_unpck = da.einsum('ij,jkl->ikl', B, TriCavg)  # .. along band axis
        avg_unpck = da.einsum('ij,klj->kli', C, avg_unpck)  # .. along col axis
        # use these for the row cluster assignment
        idx = (1, 0, 2)
        d_row = _distance(Z.transpose(idx), avg_unpck.transpose(idx), epsilon)
        row_clusters = da.argmin(d_row, axis=1)
        R = _setup_cluster_matrix(nclusters_row, row_clusters)

        # unpack tri-cluster averages ..
        avg_unpck = da.einsum('ij,jkl->ikl', B, TriCavg)  # .. along band axis
        avg_unpck = da.einsum('ij,kjl->kil', R, avg_unpck)  # .. along row axis
        # use these for the col cluster assignment
        idx = (2, 0, 1)
        d_col = _distance(Z.transpose(idx), avg_unpck.transpose(idx), epsilon)
        col_clusters = da.argmin(d_col, axis=1)
        C = _setup_cluster_matrix(nclusters_col, col_clusters)

        # unpack tri-cluster averages ..
        avg_unpck = da.einsum('ij,kjl->kil', R, TriCavg)  # .. along row axis
        avg_unpck = da.einsum('ij,klj->kli', C, avg_unpck)  # .. along col axis
        # use these for the band cluster assignment
        d_bnd = _distance(Z, avg_unpck, epsilon)
        bnd_clusters = da.argmin(d_bnd, axis=1)
        B = _setup_cluster_matrix(nclusters_bnd, bnd_clusters)

        # Error value (actually just the band component)
        old_e = e
        minvals = da.min(d_bnd, axis=1)
        # power 1 divergence, power 2 euclidean
        e = da.sum(da.power(minvals, 1))

        row_clusters, R, col_clusters, C, bnd_clusters, B, e = client.persist(
            [row_clusters, R, col_clusters, C, bnd_clusters, B, e])

        e = e.compute()
        logger.debug(f'Error = {e:+.15e}, dE = {e - old_e:+.15e}')
        converged = abs(e - old_e) < errobj
        s = s + 1
    if converged:
        logger.debug(f'Triclustering converged in {s} iterations')
    else:
        logger.debug(f'Triclustering not converged in {s} iterations')
    return converged, s, row_clusters, col_clusters, bnd_clusters, e
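
# ---------------------------------------------------------------------------
# The clustering routines in this section call three private helpers that are
# not shown: _initialize_clusters, _setup_cluster_matrix and _distance. What
# follows is a minimal sketch of what they could look like, NOT the package's
# actual implementation: it assumes a random balanced initialization, a
# Boolean one-hot occupation matrix, and the I-divergence distance implied by
# the "power 1 divergence" comments. It relies on the module-level imports
# above (numpy as np, dask.array as da).
# ---------------------------------------------------------------------------


def _initialize_clusters(n_el, n_clusters, chunks=None):
    """Randomly assign n_el elements to n_clusters labels (sketch).

    Cycling through the labels before permuting guarantees that every
    cluster starts with at least one element when n_el >= n_clusters.
    """
    cluster_idx = np.random.permutation(np.arange(n_el) % n_clusters)
    return da.from_array(cluster_idx, chunks=chunks or n_el)


def _setup_cluster_matrix(n_clusters, cluster_idx):
    """Build the one-hot cluster occupation matrix (sketch).

    Entry (i, j) is True iff element i is assigned to cluster j, so that
    e.g. da.matmul(R.T, Z) sums the rows of Z within each row cluster.
    """
    return da.eye(n_clusters, dtype=bool)[cluster_idx]


def _distance(Z, Y, epsilon):
    """I-divergence distance between samples and cluster prototypes (sketch).

    For the 2-D co-clustering calls below, Y holds one prototype per column;
    for the 3-D tri-clustering calls, Y holds one prototype per entry of its
    first axis and the trailing axes are summed out. Either way the result
    is a (samples x clusters) matrix. The Z * log(Z) term of the full
    I-divergence is omitted since it is constant w.r.t. the assignment.
    """
    Y = Y + epsilon  # epsilon avoids log(0) for empty clusters
    if Z.ndim == 2:
        return Y.sum(axis=0, keepdims=True) - da.matmul(Z, da.log(Y))
    sum_axes = tuple(range(1, Z.ndim))
    return (Y.sum(axis=sum_axes)[None, :]
            - da.tensordot(Z, da.log(Y), axes=(sum_axes, sum_axes)))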
def __call__(self, datasets, **info):
    """Create the composite DataArray object for ERFDNB."""
    if len(datasets) != 4:
        raise ValueError("Expected 4 datasets, got %d" % (len(datasets),))
    from scipy.special import erf
    dnb_data = datasets[0]
    sza_data = datasets[1]
    lza_data = datasets[2]
    output_dataset = dnb_data.where(~(dnb_data.isnull() | sza_data.isnull()))
    # this algorithm assumes units of "W cm-2 sr-1" so if there are other
    # units we need to adjust for that
    if dnb_data.attrs.get("units", "W m-2 sr-1") == "W m-2 sr-1":
        unit_factor = 10000.
    else:
        unit_factor = 1.

    # convert to decimal instead of %
    moon_illum_fraction = da.mean(datasets[3].data) * 0.01

    # From Steve Miller and Curtis Seaman
    # maxval = 10.^(-1.7 - (((2.65+moon_factor1+moon_factor2))*(1+erf((solar_zenith-95.)/(5.*sqrt(2.0))))))
    # minval = 10.^(-4. - ((2.95+moon_factor2)*(1+erf((solar_zenith-95.)/(5.*sqrt(2.0))))))
    # scaled_radiance = (radiance - minval) / (maxval - minval)
    # radiance = sqrt(scaled_radiance)
    #
    # Version 2: Update from Curtis Seaman
    # maxval = 10.^(-1.7 - (((2.65+moon_factor1+moon_factor2))*(1+erf((solar_zenith-95.)/(5.*sqrt(2.0))))))
    # minval = 10.^(-4. - ((2.95+moon_factor2)*(1+erf((solar_zenith-95.)/(5.*sqrt(2.0))))))
    # saturated_pixels = where(radiance gt maxval, nsatpx)
    # saturation_pct = float(nsatpx)/float(n_elements(radiance))
    # print, 'Saturation (%) = ', saturation_pct
    #
    # while saturation_pct gt 0.005 do begin
    #   maxval = maxval*1.1
    #   saturated_pixels = where(radiance gt maxval, nsatpx)
    #   saturation_pct = float(nsatpx)/float(n_elements(radiance))
    #   print, saturation_pct
    # endwhile
    #
    # scaled_radiance = (radiance - minval) / (maxval - minval)
    # radiance = sqrt(scaled_radiance)
    moon_factor1 = 0.7 * (1.0 - moon_illum_fraction)
    moon_factor2 = 0.0022 * lza_data.data
    erf_portion = 1 + erf((sza_data.data - 95.0) / (5.0 * np.sqrt(2.0)))
    max_val = da.power(
        10, -1.7 - (2.65 + moon_factor1 + moon_factor2) * erf_portion
    ) * unit_factor
    min_val = da.power(
        10, -4.0 - (2.95 + moon_factor2) * erf_portion) * unit_factor

    # Update from Curtis Seaman, increase max radiance curve until less
    # than 0.5% is saturated
    if self.saturation_correction:
        delayed = dask.delayed(self._saturation_correction)(
            output_dataset.data, unit_factor, min_val, max_val)
        output_dataset.data = da.from_delayed(delayed,
                                              output_dataset.shape,
                                              output_dataset.dtype)
        output_dataset.data = output_dataset.data.rechunk(
            dnb_data.data.chunks)
    else:
        inner_sqrt = (output_dataset - min_val) / (max_val - min_val)
        # clip negative values to 0 before the sqrt
        inner_sqrt = inner_sqrt.where(inner_sqrt > 0, 0)
        output_dataset.data = np.sqrt(inner_sqrt).data

    info = dnb_data.attrs.copy()
    info.update(self.attrs)
    info["standard_name"] = "equalized_radiance"
    info["mode"] = "L"
    output_dataset.attrs = info
    return output_dataset
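
# To make the Miller/Seaman stretch above concrete, here is a hypothetical
# single-pixel version of the non-saturation branch. All input values are
# illustrative assumptions (a moonless scene, deep night, radiance already
# in "W cm-2 sr-1", i.e. unit_factor == 1); only the formulas come from the
# method above.
if __name__ == '__main__':
    from scipy.special import erf

    sza = 110.0  # solar zenith angle, deg: well past the terminator
    lza = 45.0   # lunar zenith angle, deg
    moon_illum_fraction = 0.0  # new moon

    moon_factor1 = 0.7 * (1.0 - moon_illum_fraction)
    moon_factor2 = 0.0022 * lza
    erf_portion = 1 + erf((sza - 95.0) / (5.0 * np.sqrt(2.0)))

    # valid radiance window for this pixel
    max_val = 10 ** (-1.7 - (2.65 + moon_factor1 + moon_factor2) * erf_portion)
    min_val = 10 ** (-4.0 - (2.95 + moon_factor2) * erf_portion)

    radiance = 5.0e-10  # assumed DNB radiance, W cm-2 sr-1
    scaled = (radiance - min_val) / (max_val - min_val)
    equalized = np.sqrt(max(scaled, 0.0))  # clip negatives to 0, then sqrt
    print(f'window [{min_val:.2e}, {max_val:.2e}] -> equalized {equalized:.2f}')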
def coclustering(Z, nclusters_row, nclusters_col, errobj, niters, epsilon,
                 col_clusters_init=None, row_clusters_init=None,
                 run_on_worker=False):
    """
    Run the co-clustering, Dask implementation

    :param Z: m x n data matrix
    :param nclusters_row: number of row clusters
    :param nclusters_col: number of column clusters
    :param errobj: convergence threshold for the objective function
    :param niters: maximum number of iterations
    :param epsilon: numerical parameter, avoids zero arguments in log
    :param row_clusters_init: initial row cluster assignment
    :param col_clusters_init: initial column cluster assignment
    :param run_on_worker: whether the function is submitted to a Dask worker
    :return: has converged, number of iterations performed, final row and
        column clustering, error value
    """
    client = get_client()

    Z = da.array(Z) if not isinstance(Z, da.Array) else Z

    [m, n] = Z.shape
    row_chunks, col_chunks = Z.chunksize

    row_clusters = da.array(row_clusters_init) \
        if row_clusters_init is not None \
        else _initialize_clusters(m, nclusters_row, chunks=row_chunks)
    col_clusters = da.array(col_clusters_init) \
        if col_clusters_init is not None \
        else _initialize_clusters(n, nclusters_col, chunks=col_chunks)
    R = _setup_cluster_matrix(nclusters_row, row_clusters)
    C = _setup_cluster_matrix(nclusters_col, col_clusters)

    e, old_e = 2 * errobj, 0
    s = 0
    converged = False

    Gavg = Z.mean()

    while (not converged) & (s < niters):
        logger.debug(f'Iteration # {s} ..')
        # Calculate cluster based averages
        # nel_clusters is a matrix with the number of elements per co-cluster
        # originally computed as: da.dot(da.dot(R.T, da.ones((m, n))), C)
        nel_row_clusters = da.bincount(row_clusters, minlength=nclusters_row)
        nel_col_clusters = da.bincount(col_clusters, minlength=nclusters_col)
        logger.debug('num of populated clusters: row {}, col {}'.format(
            da.sum(nel_row_clusters > 0).compute(),
            da.sum(nel_col_clusters > 0).compute()))
        nel_clusters = da.outer(nel_row_clusters, nel_col_clusters)
        CoCavg = (da.matmul(da.matmul(R.T, Z), C) + Gavg * epsilon) / \
            (nel_clusters + epsilon)

        # Calculate distance based on row approximation
        d_row = _distance(Z, da.matmul(C, CoCavg.T), epsilon)
        # Assign to best row cluster
        row_clusters = da.argmin(d_row, axis=1)
        R = _setup_cluster_matrix(nclusters_row, row_clusters)

        # Calculate distance based on column approximation
        d_col = _distance(Z.T, da.matmul(R, CoCavg), epsilon)
        # Assign to best column cluster
        col_clusters = da.argmin(d_col, axis=1)
        C = _setup_cluster_matrix(nclusters_col, col_clusters)

        # Error value (actually just the column component)
        old_e = e
        minvals = da.min(d_col, axis=1)
        # power 1 divergence, power 2 euclidean
        e = da.sum(da.power(minvals, 1))

        row_clusters, R, col_clusters, C, e = client.persist(
            [row_clusters, R, col_clusters, C, e])

        if run_on_worker:
            # this is a workaround for e.compute() for a function that runs
            # on a worker with multiple threads
            # https://github.com/dask/distributed/issues/3827
            e = client.compute(e)
            secede()
            e = e.result()
            rejoin()
        else:
            e = e.compute()
        logger.debug(f'Error = {e:+.15e}, dE = {e - old_e:+.15e}')
        converged = abs(e - old_e) < errobj
        s = s + 1
    if converged:
        logger.debug(f'Coclustering converged in {s} iterations')
    else:
        logger.debug(f'Coclustering not converged in {s} iterations')
    return converged, s, row_clusters, col_clusters, e
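
# A minimal usage sketch for coclustering, assuming the helper sketches above
# and a local Dask scheduler. All parameter values are illustrative; the data
# is kept strictly positive since the I-divergence involves logarithms.
if __name__ == '__main__':
    from dask.distributed import Client

    # get_client() inside coclustering picks up this default client
    client = Client(processes=False)

    # toy problem: a 100 x 80 matrix, 4 row clusters, 3 column clusters
    Z = da.from_array(np.random.rand(100, 80) + 1.0, chunks=(50, 40))

    converged, n_iter, row_clusters, col_clusters, error = coclustering(
        Z, nclusters_row=4, nclusters_col=3,
        errobj=1.e-8, niters=100, epsilon=1.e-8)

    print(f'converged: {converged} after {n_iter} iterations, '
          f'error {error:.3e}')
    print('row labels:', row_clusters.compute())  # one label per row of Z
    print('col labels:', col_clusters.compute())  # one label per column of Z

    client.close()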