def fit(self, X, y=None):
    """Fit the transform on the training samples ``X``.

    The column-wise mean of ``X`` is stored in ``self.input_subtract``, the
    lower-triangular Cholesky factor of the inverse covariance matrix is
    stored in ``self.weights`` and ``self.input_divide`` is fixed to 1.0.

    :param X: training samples, either a dask array or something numpy/scipy
        can handle; the mean is taken over axis 0
    :param y: ignored
    :return: ``self``
    """
    # Choose the numerical backend that matches the container type of X.
    if not isinstance(X, dask.array.Array):
        import numpy as numerical_module
        from scipy.linalg import cholesky, inv
    else:
        import dask.array as numerical_module
        from dask.array.linalg import cholesky, inv

    # Step 1: first and second order statistics of the training set.
    mean_vector = numerical_module.mean(X, axis=0)
    covariance = numerical_module.cov(X.T)

    # Step 2: invert the covariance (pseudo-inverse when ``self.pinv`` is set).
    invert = pinv if self.pinv else inv
    inverse_covariance = invert(covariance)

    # Step 3: Cholesky factor of the inverse covariance. ``lower=True`` keeps
    # the result identical to the previous implementation.
    self.weights = cholesky(inverse_covariance, lower=True)

    self.input_subtract = mean_vector
    self.input_divide = 1.0
    return self
def target(query):
    """Return the covariance of columns 1.. with column 0 projected out.

    One delayed block is built per ``(filename, size)`` row of ``query``;
    the blocks are concatenated and their covariance matrix is computed
    under a progress bar. The result is the Schur complement of the
    ``[0, 0]`` entry of that covariance matrix.

    :param query: table exposing ``filename`` and ``size`` columns
    :return: the reduced covariance matrix (one row/column fewer than the
        full covariance)
    """
    blocks = []
    for _, filename, size in query[['filename', 'size']].itertuples():
        blocks.append(
            from_delayed(process(filename), [size, to - fr + 1], 'float64')
        )
    stacked = concatenate(blocks)

    with ProgressBar():
        full_cov = cov(stacked.T).compute()

    # Split the covariance into the block without variable 0 and the rank-one
    # term explained by variable 0.
    inner_block = full_cov[1:, 1:]
    column = full_cov[1:, 0][:, None]
    row = full_cov[0, 1:][None, :]
    explained = column @ row / full_cov[0, 0]
    return inner_block - explained
def compute_unsupervised_metrics(latent, y, discretize):
    """Collect the unsupervised scores for a latent representation.

    :param latent: latent codes; their covariance is taken with ``da.cov``
    :param y: passed through to ``compute_mig``
    :param discretize: passed through to ``compute_mig``
    :return: dict with the keys ``gaussian_total_correlation``,
        ``gaussian_wasserstein_correlation``,
        ``gaussian_wasserstein_correlation_norm`` and ``mutual_info_score``
    """
    covariance = da.cov(latent)

    # Gaussian total correlation.
    total_correlation = gaussian_total_correlation(covariance).compute()

    # Gaussian Wasserstein correlation, raw and divided by the trace of the
    # covariance matrix.
    wasserstein = gaussian_wasserstein_correlation(covariance).compute()
    trace = np.sum(np.diag(covariance))
    wasserstein_norm = delayed(wasserstein / trace).compute()

    return {
        "gaussian_total_correlation": total_correlation,
        "gaussian_wasserstein_correlation": wasserstein,
        "gaussian_wasserstein_correlation_norm": wasserstein_norm,
        "mutual_info_score": compute_mig(latent, y, discretize),
    }
def test_cov():
    """Check that ``da.cov`` matches ``np.cov`` for one and two operands.

    Covers the ``rowvar``, ``ddof`` and ``bias`` keywords, 2-D/1-D operand
    combinations, and the ``ValueError`` raised for a non-integer ``ddof``.
    """
    # Local import keeps the block self-contained; ``pytest.warns(None)`` is
    # deprecated since pytest 7 and rejected by pytest 8, so the expected
    # warning is silenced with the standard library instead.
    import warnings

    x = np.arange(56).reshape((7, 8))
    d = da.from_array(x, chunks=(4, 4))

    assert_eq(da.cov(d), np.cov(x))
    assert_eq(da.cov(d, rowvar=0), np.cov(x, rowvar=0))
    with warnings.catch_warnings():
        # warning dof <= 0 for slice
        warnings.simplefilter("ignore")
        assert_eq(da.cov(d, ddof=10), np.cov(x, ddof=10))
    assert_eq(da.cov(d, bias=1), np.cov(x, bias=1))
    assert_eq(da.cov(d, d), np.cov(x, x))

    y = np.arange(8)
    e = da.from_array(y, chunks=(4,))

    assert_eq(da.cov(d, e), np.cov(x, y))
    assert_eq(da.cov(e, d), np.cov(y, x))

    with pytest.raises(ValueError):
        da.cov(d, ddof=1.5)
def test_cov():
    """Compare ``da.cov`` with ``np.cov`` on a chunked 7x8 integer matrix.

    Exercises the default call, ``rowvar``, ``ddof``, ``bias``, a second
    operand (2-D and 1-D, in both orders) and the ``ValueError`` for a
    fractional ``ddof``.
    """
    # ``pytest.warns(None)`` no longer works on current pytest releases
    # (deprecated in 7.0, an error in 8.0); use the stdlib warnings machinery
    # to tolerate the warning instead.
    import warnings

    x = np.arange(56).reshape((7, 8))
    d = da.from_array(x, chunks=(4, 4))

    assert_eq(da.cov(d), np.cov(x))
    assert_eq(da.cov(d, rowvar=0), np.cov(x, rowvar=0))

    with warnings.catch_warnings():
        # warning dof <= 0 for slice: silence it on both the dask and the
        # numpy side so warning-as-error configurations do not fail here.
        warnings.simplefilter("ignore")
        assert_eq(da.cov(d, ddof=10), np.cov(x, ddof=10))

    assert_eq(da.cov(d, bias=1), np.cov(x, bias=1))
    assert_eq(da.cov(d, d), np.cov(x, x))

    y = np.arange(8)
    e = da.from_array(y, chunks=(4,))

    assert_eq(da.cov(d, e), np.cov(x, y))
    assert_eq(da.cov(e, d), np.cov(y, x))

    with pytest.raises(ValueError):
        da.cov(d, ddof=1.5)
def test_cov():
    """``da.cov`` must agree with ``np.cov`` for every supported call form."""
    dense = np.arange(56).reshape((7, 8))
    lazy = da.from_array(dense, chunks=(4, 4))

    # Single-operand calls, one per keyword combination.
    for options in ({}, {"rowvar": 0}, {"ddof": 10}, {"bias": 1}):
        assert eq(da.cov(lazy, **options), np.cov(dense, **options))

    # Two-operand call with the matrix paired with itself.
    assert eq(da.cov(lazy, lazy), np.cov(dense, dense))

    # Two-operand calls mixing the matrix with a 1-D vector, in both orders.
    dense_vector = np.arange(8)
    lazy_vector = da.from_array(dense_vector, chunks=(4,))
    assert eq(da.cov(lazy, lazy_vector), np.cov(dense, dense_vector))
    assert eq(da.cov(lazy_vector, lazy), np.cov(dense_vector, dense))

    # A fractional ddof is rejected.
    assert raises(ValueError, lambda: da.cov(lazy, ddof=1.5))
def test_cov():
    """Verify ``da.cov`` against ``np.cov`` on a 7x8 matrix and an 8-vector."""
    matrix = np.arange(56).reshape((7, 8))
    chunked_matrix = da.from_array(matrix, chunks=(4, 4))
    vector = np.arange(8)
    chunked_vector = da.from_array(vector, chunks=(4, ))

    # Default call and each keyword on its own.
    result = da.cov(chunked_matrix)
    assert eq(result, np.cov(matrix))
    result = da.cov(chunked_matrix, rowvar=0)
    assert eq(result, np.cov(matrix, rowvar=0))
    result = da.cov(chunked_matrix, ddof=10)
    assert eq(result, np.cov(matrix, ddof=10))
    result = da.cov(chunked_matrix, bias=1)
    assert eq(result, np.cov(matrix, bias=1))

    # Second operand: the matrix itself, then the vector on either side.
    result = da.cov(chunked_matrix, chunked_matrix)
    assert eq(result, np.cov(matrix, matrix))
    result = da.cov(chunked_matrix, chunked_vector)
    assert eq(result, np.cov(matrix, vector))
    result = da.cov(chunked_vector, chunked_matrix)
    assert eq(result, np.cov(vector, matrix))

    # Non-integer ddof must raise.
    assert raises(ValueError, lambda: da.cov(chunked_matrix, ddof=1.5))
def pca(A, B, n_pc, estimator_matrix, out_dir, n_threads, block_size):
    """Calculate the principal components for the vertical stack A or with
    combinations of the stack B

    :param A: first input raster data (first period)
    :param B: second input raster data (second period) or None
    :param n_pc: number of principal components to output
    :param estimator_matrix: "Correlation" or "Covariance", the matrix whose
        eigen-decomposition defines the components
    :param out_dir: directory to save the outputs
    :param n_threads: size of the thread pool handed to dask
    :param block_size: forwarded to ``read_raster``
    :return: pca files list and statistics (``eigenvals``, ``eigenvals_%``
        and the first ``n_pc`` ``eigenvectors``)
    :raises ValueError: if ``estimator_matrix`` is not one of the two
        supported names
    """
    # Fail early: with any other value the matrix below would stay as
    # uninitialised memory from np.empty and the result would be garbage.
    if estimator_matrix not in ("Correlation", "Covariance"):
        raise ValueError(
            "estimator_matrix must be 'Correlation' or 'Covariance', "
            "got {!r}".format(estimator_matrix))

    # init dask as threads (shared memory is required)
    dask.config.set(pool=ThreadPool(n_threads))

    def get_profile(path):
        """Get geospatial metadata profile such as projections, pixel sizes, etc"""
        with rasterio.open(path) as src:
            return src.profile.copy()

    if B:
        raw_image_a = read_raster(A, block_size=block_size)
        raw_image_b = read_raster(B, block_size=block_size)
        raw_image = da.vstack((raw_image_a, raw_image_b))
    else:
        raw_image = read_raster(A, block_size=block_size)

    # flat each dimension (bands): axis 0 is kept, the two others are merged
    flat_dims = raw_image.reshape(
        (raw_image.shape[0], raw_image.shape[1] * raw_image.shape[2]))

    n_bands = raw_image.shape[0]

    ########
    # compute the mean of each band, in order to center the matrix.
    band_mean = dask.compute(
        *[dask.delayed(da.mean)(flat_dims[i]) for i in range(n_bands)])

    ########
    # compute the matrix correlation/covariance
    # The estimator is loop-invariant, so pick it once.
    if estimator_matrix == "Correlation":
        pair_estimator = da.corrcoef
    else:
        pair_estimator = da.cov

    estimation_matrix = np.empty((n_bands, n_bands))
    for i in range(n_bands):
        deviation_scores_band_i = flat_dims[i] - band_mean[i]
        for j in range(i, n_bands):
            deviation_scores_band_j = flat_dims[j] - band_mean[j]
            # the matrix is symmetric: fill both triangles at once
            estimation_matrix[j][i] = estimation_matrix[i][j] = \
                pair_estimator(deviation_scores_band_i,
                               deviation_scores_band_j)[0][1]

    ########
    # calculate eigenvectors & eigenvalues of the matrix
    # use 'eigh' rather than 'eig' since estimation_matrix
    # is symmetric, the performance gain is substantial
    eigenvals, eigenvectors = np.linalg.eigh(estimation_matrix)

    # sort eigenvalue in decreasing order
    idx_eigenvals = np.argsort(eigenvals)[::-1]
    eigenvectors = eigenvectors[:, idx_eigenvals]
    # sort eigenvectors according to same index
    eigenvals = eigenvals[idx_eigenvals]
    # select the first n eigenvectors (n is desired dimension
    # of rescaled data array, or dims_rescaled_data)
    eigenvectors = eigenvectors[:, :n_pc]

    ########
    # save the principal components separated in tif images

    # output image profile
    prof = get_profile(A)
    prof.update(count=1, driver='GTiff', dtype=np.float32)

    @dask.delayed
    def get_principal_component(i, j):
        return eigenvectors[j, i] * (raw_image[j] - band_mean[j])

    pca_files = []
    for i in range(n_pc):
        pc = dask.delayed(sum)(
            [get_principal_component(i, j) for j in range(n_bands)])
        pc = pc.astype(np.float32)
        # save component as file
        tmp_pca_file = Path(out_dir, 'pc_{}.tif'.format(i + 1))
        write_raster(tmp_pca_file, pc.compute(), **prof)
        pca_files.append(tmp_pca_file)

    # compute the pyramids for each pc image
    @dask.delayed
    def pyramids(pca_file):
        # NOTE(review): the path is interpolated into a shell string; a path
        # containing a double quote would break the command.
        call('gdaladdo --config BIGTIFF_OVERVIEW YES "{}"'.format(pca_file),
             shell=True)

    dask.compute(*[pyramids(pca_file) for pca_file in pca_files],
                 num_workers=2)

    ########
    # pca statistics
    pca_stats = {}
    pca_stats["eigenvals"] = eigenvals
    # Share of each eigenvalue in the total. ``eigenvals`` is not truncated,
    # so its sum is the trace of the matrix: n_bands for a correlation
    # matrix, the total variance for a covariance matrix. Dividing by
    # n_bands was only right in the correlation case.
    pca_stats["eigenvals_%"] = eigenvals * 100 / eigenvals.sum()
    pca_stats["eigenvectors"] = eigenvectors

    return pca_files, pca_stats
def _cov_diaged(da: daskarr) -> daskarr: a = daskarr.cov(da, rowvar=0) a[a == np.inf] = 0 a[a == np.nan] = 0 return a + np.eye(a.shape[0])
def pca(a, b, n_pc, estimator_matrix, out_dir, n_threads, block_size, nodata):
    """Calculate the principal components for the vertical stack A or with
    combinations of the stack B, and write them to ``pca-stack.tif``

    :param a: first input raster data (first period), path opened with GDAL
    :param b: second input raster data (second period) or None
    :param n_pc: number of principal components to output
    :param estimator_matrix: "correlation" or "covariance", the matrix whose
        eigen-decomposition defines the components
    :param out_dir: directory to save the outputs
    :param n_threads: size of the thread pool handed to dask
    :param block_size: the stacked bands are rechunked to
        ``(1, block_size**2)``
    :param nodata: value marking invalid pixels; when None it is read from
        the first band of ``a`` (and may still be None)
    :return: None, the components are written as bands of
        ``<out_dir>/pca-stack.tif``
    :raises ValueError: if ``estimator_matrix`` is not one of the two
        supported names
    """
    # Fail early: with any other value the matrix below would stay as
    # uninitialised memory from np.empty and the result would be garbage.
    if estimator_matrix not in ("correlation", "covariance"):
        raise ValueError(
            "estimator_matrix must be 'correlation' or 'covariance', "
            "got {!r}".format(estimator_matrix))

    A = a
    B = b
    # get/set the nodata
    if nodata is None:
        ds = gdal.Open(A, gdal.GA_ReadOnly)
        nodata = ds.GetRasterBand(1).GetNoDataValue()
        del ds

    print("\nPRINCIPAL COMPONENTS ANALYSIS")
    print(" Compute {} components for:".format(n_pc))
    print(" A: {}".format(A))
    if B is not None:
        print(" B: {}".format(B))

    # init dask as threads (shared memory is required)
    dask.config.set(pool=ThreadPool(n_threads))
    # registered Dask progress bar
    pbar = ProgressBar()
    pbar.register()

    print("\nRead and prepare data:")

    raw_image = []
    nodata_mask = None
    src_ds_A = gdal.Open(A, gdal.GA_ReadOnly)
    src_ds_B = None
    for band in range(src_ds_A.RasterCount):
        ds = src_ds_A.GetRasterBand(band + 1).ReadAsArray().flatten().astype(
            np.float32)
        if nodata is not None:
            # accumulate: a pixel is invalid if any band holds nodata there
            nodata_mask = ds == nodata if nodata_mask is None else np.logical_or(
                nodata_mask, ds == nodata)
        raw_image.append(ds)
    if B is not None:
        src_ds_B = gdal.Open(B, gdal.GA_ReadOnly)
        for band in range(src_ds_B.RasterCount):
            ds = src_ds_B.GetRasterBand(band + 1).ReadAsArray().flatten().astype(
                np.float32)
            if nodata is not None:
                nodata_mask = np.logical_or(nodata_mask, ds == nodata)
            raw_image.append(ds)

    # pair-masking data, let only the valid data across all dimensions/bands
    if nodata is not None:
        raw_image = [band_values[~nodata_mask] for band_values in raw_image]
    # flat each dimension (bands)
    flat_dims = da.vstack(raw_image).rechunk((1, block_size**2))
    # bands
    n_bands = flat_dims.shape[0]

    ########
    # compute the mean of each band, in order to center the matrix.
    band_mean = dask.compute(
        *[dask.delayed(np.mean)(flat_dims[i]) for i in range(n_bands)])

    ########
    # compute the matrix correlation/covariance
    print("\nComputing the estimator matrix:")
    # The estimator is loop-invariant, so pick it once.
    if estimator_matrix == "correlation":
        pair_estimator = da.corrcoef
    else:
        pair_estimator = da.cov

    estimation_matrix = np.empty((n_bands, n_bands))
    for i in range(n_bands):
        deviation_scores_band_i = flat_dims[i] - band_mean[i]
        for j in range(i, n_bands):
            deviation_scores_band_j = flat_dims[j] - band_mean[j]
            # the matrix is symmetric: fill both triangles at once
            estimation_matrix[j][i] = estimation_matrix[i][j] = \
                pair_estimator(deviation_scores_band_i,
                               deviation_scores_band_j)[0][1]

    # free mem
    del raw_image, flat_dims, src_ds_B, ds

    ########
    # calculate eigenvectors & eigenvalues of the matrix
    # use 'eigh' rather than 'eig' since estimation_matrix
    # is symmetric, the performance gain is substantial
    eigenvals, eigenvectors = np.linalg.eigh(estimation_matrix)

    # sort eigenvalue in decreasing order
    idx_eigenvals = np.argsort(eigenvals)[::-1]
    eigenvectors = eigenvectors[:, idx_eigenvals]
    # sort eigenvectors according to same index
    eigenvals = eigenvals[idx_eigenvals]
    # select the first n eigenvectors (n is desired dimension
    # of rescaled data array, or dims_rescaled_data)
    eigenvectors = eigenvectors[:, :n_pc]

    ########
    # save the principal components separated in tif images

    def get_raw_band_from_stack(band):
        """Re-read one full (unmasked) band of the A+B stack as flat float32."""
        src_ds_A = gdal.Open(A, gdal.GA_ReadOnly)
        if band < src_ds_A.RasterCount:
            return src_ds_A.GetRasterBand(band + 1).ReadAsArray().flatten().astype(
                np.float32)
        if band >= src_ds_A.RasterCount:
            # indices past the last band of A address the bands of B
            src_ds_B = gdal.Open(B, gdal.GA_ReadOnly)
            return src_ds_B.GetRasterBand(
                band - src_ds_A.RasterCount + 1).ReadAsArray().flatten().astype(
                np.float32)

    @dask.delayed
    def get_principal_component(i, j):
        return eigenvectors[j, i] * (get_raw_band_from_stack(j) - band_mean[j])

    print("\nComputing and saving the components in pca-stack.tif:")

    # save component as file
    tmp_pca_file = Path(out_dir) / 'pca-stack.tif'
    driver = gdal.GetDriverByName("GTiff")
    out_pc = driver.Create(str(tmp_pca_file), src_ds_A.RasterXSize,
                           src_ds_A.RasterYSize, n_pc, gdal.GDT_Float32)

    for i in range(n_pc):
        pc = dask.delayed(sum)(
            [get_principal_component(i, j) for j in range(n_bands)])
        pc = pc.astype(np.float32)
        pc = np.array(pc.compute())
        if nodata is not None:
            # invalid pixels are written as 0, which is declared as the
            # nodata value of the output band below
            pc[nodata_mask] = 0
        pc = pc.reshape((src_ds_A.RasterYSize, src_ds_A.RasterXSize))
        pcband = out_pc.GetRasterBand(i + 1)
        if nodata is not None:
            pcband.SetNoDataValue(0)
        pcband.WriteArray(pc)
        del pc, pcband

    # set projection and geotransform
    if src_ds_A.GetGeoTransform() is not None:
        out_pc.SetGeoTransform(src_ds_A.GetGeoTransform())
    if src_ds_A.GetProjection() is not None:
        out_pc.SetProjection(src_ds_A.GetProjection())
    out_pc.FlushCache()

    # free mem
    del src_ds_A, nodata_mask, out_pc

    print("\nDONE")
# f['photon_diagnostics/FEL01/I0_monitor/iom_sh_a_pc'][...] # .astype('float64') # ) intensities = arrs.sum(1) where = bunches % bp != 0 return append(intensities[where, None], arrs[where, :], axis=1) # %% run = 442 globbed = glob('/home/ldm/ExperimentalData/Online4LDM/20144078' '/Test/Run_{:03d}/rawdata/*.h5'.format(run)) arr = concatenate([from_delayed(process(g), [shapes(g), to-fr+1], 'float64') for g in globbed]) with ProgressBar(): img = cov(arr.T).compute() # %% def scale(arr): m = min(abs(arr.min()), abs(arr.max()))*0.5 return -m, m img_cov = img[1:, 1:] img_icov = img[1:, 0][:, None] @ img[0, 1:][None, :] / img[0, 0] img_pcov = img_cov - img_icov plt.figure() plt.subplot(221) plt.pcolormesh(img_pcov, cmap='RdBu') plt.clim(*scale(img_pcov))
def pca(A, B, n_pc, estimator_matrix, out_dir, n_threads, block_size, nodata=None):
    """Calculate the principal components for the vertical stack A or with
    combinations of the stack B

    :param A: first input raster data (first period), path opened with GDAL
    :param B: second input raster data (second period) or None
    :param n_pc: number of principal components to output
    :param estimator_matrix: "Correlation" or "Covariance", the matrix whose
        eigen-decomposition defines the components
    :param out_dir: directory to save the outputs
    :param n_threads: size of the thread pool handed to dask
    :param block_size: the stacked bands are rechunked to
        ``(1, block_size**2)``
    :param nodata: value marking invalid pixels, or None to use every pixel
    :return: pca files list and statistics (``eigenvals``, ``eigenvals_%``
        and the first ``n_pc`` ``eigenvectors``)
    :raises ValueError: if ``estimator_matrix`` is not one of the two
        supported names
    """
    # Fail early: with any other value the matrix below would stay as
    # uninitialised memory from np.empty and the result would be garbage.
    if estimator_matrix not in ("Correlation", "Covariance"):
        raise ValueError(
            "estimator_matrix must be 'Correlation' or 'Covariance', "
            "got {!r}".format(estimator_matrix))

    # init dask as threads (shared memory is required)
    dask.config.set(pool=ThreadPool(n_threads))

    raw_image = []
    nodata_mask = None
    src_ds_A = gdal.Open(str(A), gdal.GA_ReadOnly)
    src_ds_B = None
    for band in range(src_ds_A.RasterCount):
        ds = src_ds_A.GetRasterBand(band + 1).ReadAsArray().flatten().astype(
            np.float32)
        if nodata is not None:
            # accumulate: a pixel is invalid if any band holds nodata there
            nodata_mask = ds == nodata if nodata_mask is None else np.logical_or(
                nodata_mask, ds == nodata)
        raw_image.append(ds)
    if B:
        src_ds_B = gdal.Open(str(B), gdal.GA_ReadOnly)
        for band in range(src_ds_B.RasterCount):
            ds = src_ds_B.GetRasterBand(band + 1).ReadAsArray().flatten().astype(
                np.float32)
            if nodata is not None:
                nodata_mask = np.logical_or(nodata_mask, ds == nodata)
            raw_image.append(ds)

    # pair-masking data, let only the valid data across all dimensions/bands
    if nodata is not None:
        raw_image = [b[~nodata_mask] for b in raw_image]
    # flat each dimension (bands)
    flat_dims = da.vstack(raw_image).rechunk((1, block_size**2))
    # bands
    n_bands = flat_dims.shape[0]

    ########
    # compute the mean of each band, in order to center the matrix.
    band_mean = dask.compute(
        *[dask.delayed(np.mean)(flat_dims[i]) for i in range(n_bands)])

    ########
    # compute the matrix correlation/covariance
    # The estimator is loop-invariant, so pick it once.
    if estimator_matrix == "Correlation":
        pair_estimator = da.corrcoef
    else:
        pair_estimator = da.cov

    estimation_matrix = np.empty((n_bands, n_bands))
    for i in range(n_bands):
        deviation_scores_band_i = flat_dims[i] - band_mean[i]
        for j in range(i, n_bands):
            deviation_scores_band_j = flat_dims[j] - band_mean[j]
            # the matrix is symmetric: fill both triangles at once
            estimation_matrix[j][i] = estimation_matrix[i][j] = \
                pair_estimator(deviation_scores_band_i,
                               deviation_scores_band_j)[0][1]

    # free mem
    del raw_image, flat_dims, src_ds_B, ds

    ########
    # calculate eigenvectors & eigenvalues of the matrix
    # use 'eigh' rather than 'eig' since estimation_matrix
    # is symmetric, the performance gain is substantial
    eigenvals, eigenvectors = np.linalg.eigh(estimation_matrix)

    # sort eigenvalue in decreasing order
    idx_eigenvals = np.argsort(eigenvals)[::-1]
    eigenvectors = eigenvectors[:, idx_eigenvals]
    # sort eigenvectors according to same index
    eigenvals = eigenvals[idx_eigenvals]
    # select the first n eigenvectors (n is desired dimension
    # of rescaled data array, or dims_rescaled_data)
    eigenvectors = eigenvectors[:, :n_pc]

    ########
    # save the principal components separated in tif images

    def get_raw_band_from_stack(band):
        """Re-read one full (unmasked) band of the A+B stack as flat float32."""
        src_ds_A = gdal.Open(str(A), gdal.GA_ReadOnly)
        if band < src_ds_A.RasterCount:
            return src_ds_A.GetRasterBand(band + 1).ReadAsArray().flatten().astype(
                np.float32)
        if band >= src_ds_A.RasterCount:
            # indices past the last band of A address the bands of B
            src_ds_B = gdal.Open(str(B), gdal.GA_ReadOnly)
            return src_ds_B.GetRasterBand(
                band - src_ds_A.RasterCount + 1).ReadAsArray().flatten().astype(
                np.float32)

    pca_files = []
    for i in range(n_pc):
        # component i = sum over bands of loading * centred band
        pc = 0
        for j in range(n_bands):
            pc = pc + eigenvectors[j, i] * (get_raw_band_from_stack(j) - band_mean[j])

        if nodata is not None:
            # invalid pixels are written as 0, which is declared as the
            # nodata value of the output band below
            pc[nodata_mask] = 0
        pc = pc.reshape((src_ds_A.RasterYSize, src_ds_A.RasterXSize))
        # save component as file
        tmp_pca_file = Path(out_dir) / 'pc_{}.tif'.format(i + 1)
        driver = gdal.GetDriverByName("GTiff")
        out_pc = driver.Create(str(tmp_pca_file), src_ds_A.RasterXSize,
                               src_ds_A.RasterYSize, 1, gdal.GDT_Float32)
        pcband = out_pc.GetRasterBand(1)
        if nodata is not None:
            pcband.SetNoDataValue(0)
        pcband.WriteArray(pc)
        # set projection and geotransform
        if src_ds_A.GetGeoTransform() is not None:
            out_pc.SetGeoTransform(src_ds_A.GetGeoTransform())
        if src_ds_A.GetProjection() is not None:
            out_pc.SetProjection(src_ds_A.GetProjection())
        out_pc.FlushCache()
        del pc, pcband, out_pc

        pca_files.append(tmp_pca_file)

    # free mem
    del src_ds_A, nodata_mask

    # compute the pyramids for each pc image
    for pca_file in pca_files:
        # NOTE(review): the path is interpolated into a shell string; a path
        # containing a double quote would break the command.
        call('gdaladdo -q --config BIGTIFF_OVERVIEW YES "{}"'.format(pca_file),
             shell=True)

    ########
    # pca statistics
    pca_stats = {}
    pca_stats["eigenvals"] = eigenvals
    # Share of each eigenvalue in the total. ``eigenvals`` is not truncated,
    # so its sum is the trace of the matrix: n_bands for a correlation
    # matrix, the total variance for a covariance matrix. Dividing by
    # n_bands was only right in the correlation case.
    pca_stats["eigenvals_%"] = eigenvals * 100 / eigenvals.sum()
    pca_stats["eigenvectors"] = eigenvectors

    return pca_files, pca_stats