def test_corrcoef():
    """Check that ``da.corrcoef`` matches ``np.corrcoef`` on equivalent inputs.

    Covers a single 2-D input, ``rowvar=0``, a 2-D/2-D pair, and both
    orderings of a 2-D/1-D pair.
    """
    matrix = np.arange(56).reshape((7, 8))
    lazy_matrix = da.from_array(matrix, chunks=(4, 4))

    # single-argument forms
    assert_eq(da.corrcoef(lazy_matrix), np.corrcoef(matrix))
    assert_eq(
        da.corrcoef(lazy_matrix, rowvar=0),
        np.corrcoef(matrix, rowvar=0),
    )

    # two-argument form with identical 2-D inputs
    assert_eq(
        da.corrcoef(lazy_matrix, lazy_matrix),
        np.corrcoef(matrix, matrix),
    )

    # two-argument form mixing a 2-D input with a 1-D one, in both orders
    vector = np.arange(8)
    lazy_vector = da.from_array(vector, chunks=(4,))
    assert_eq(
        da.corrcoef(lazy_matrix, lazy_vector),
        np.corrcoef(matrix, vector),
    )
    assert_eq(
        da.corrcoef(lazy_vector, lazy_matrix),
        np.corrcoef(vector, matrix),
    )
def test_corrcoef():
    """Compare ``da.corrcoef`` against ``np.corrcoef`` for several call shapes."""

    def agree(lazy_args, eager_args, **options):
        # Same call on the dask inputs and on the NumPy inputs must match.
        assert_eq(
            da.corrcoef(*lazy_args, **options),
            np.corrcoef(*eager_args, **options),
        )

    x = np.arange(56).reshape((7, 8))
    d = da.from_array(x, chunks=(4, 4))
    agree((d,), (x,))
    agree((d,), (x,), rowvar=0)
    agree((d, d), (x, x))

    y = np.arange(8)
    e = da.from_array(y, chunks=(4,))
    agree((d, e), (x, y))
    agree((e, d), (y, x))
def pearson_1xn(
    x: da.Array,
    data: da.Array,
    value_range: Optional[Tuple[float, float]] = None,
    k: Optional[int] = None,
) -> Tuple[np.ndarray, np.ndarray]:
    """Correlate ``x`` with every column of ``data`` and filter the result.

    For each column ``j`` the rows where either ``x`` or ``data[:, j]`` is
    NaN are dropped, and the off-diagonal entry of the 2x2 ``da.corrcoef``
    matrix of the remaining values is collected. The collected coefficients
    are then handed to ``corr_filter`` together with ``value_range`` and
    ``k``.

    Parameters
    ----------
    x : da.Array
        The array correlated against each column of ``data``.
    data : da.Array
        Two-dimensional array; one coefficient is produced per column.
    value_range : Optional[Tuple[float, float]] = None
        Forwarded unchanged to ``corr_filter``.
    k : Optional[int] = None
        Forwarded unchanged to ``corr_filter``.

    Returns
    -------
    Tuple[np.ndarray, np.ndarray]
        Whatever ``corr_filter`` returns for the per-column coefficients.
    """
    _, ncols = data.shape

    # ``x`` is the same for every column, so materialize it and its NaN
    # mask once instead of recomputing both on each iteration.
    x_values = np.array(x)
    x_is_nan = np.isnan(x_values)

    corrs = []
    for j in range(ncols):
        # Materialize the column once; it is needed for both the mask and
        # the values passed to corrcoef.
        column = np.array(data[:, j])
        mask = ~(x_is_nan | np.isnan(column))
        # corrcoef yields a 2x2 matrix; take the entry in row 1, column 0.
        _, (corr, _) = da.corrcoef(x_values[mask], column[mask])
        corrs.append(corr)

    # Evaluate all lazy coefficients in a single dask pass.
    (corrs,) = da.compute(corrs)
    corrs = np.asarray(corrs)
    return corr_filter(corrs, value_range, k)
def estimate_ld(hd5, filename, chromosome, threads, memory):
    """Write the squared correlation matrix of one chromosome dataset to HDF5.

    Reads the dataset stored under ``/<chromosome>`` in ``hd5``, wraps it in
    a dask array chunked according to ``estimate_chunks``, computes the
    squared ``da.corrcoef`` of the transposed, invalid-masked data, and
    stores the result under the same node in ``<filename>_ld.hdf5``.

    :param hd5: mapping-like HDF5 handle indexed by node path
    :param filename: prefix of the output file name
    :param chromosome: chromosome identifier used to build the node path
    :param threads: forwarded to ``estimate_chunks``
    :param memory: forwarded to ``estimate_chunks``
    :return: tuple ``(chromosome, output_filename)``
    """
    print("Estimating LD for Chromosome", chromosome)

    node = '/%s' % chromosome
    values = hd5[node][:]
    lazy_values = da.from_array(
        values, chunks=estimate_chunks(values.shape, threads, memory))

    # Drop the local reference to the in-memory copy and collect right
    # away, before the correlation graph is built.
    del values
    gc.collect()

    masked = da.ma.masked_invalid(lazy_values)
    rho = da.corrcoef(masked.T) ** 2

    out_name = '%s_ld.hdf5' % filename
    da.to_hdf5(out_name, node, rho)
    return chromosome, out_name
def missing_heatmap(df: EDAFrame) -> Optional[pd.DataFrame]:
    """Compute the nullity correlation used for the missing-value heatmap.

    Returns ``da.corrcoef`` of ``df.nulls`` with ``rowvar=False``.
    """
    null_indicators = df.nulls
    nullity_corr = da.corrcoef(null_indicators, rowvar=False)
    return nullity_corr
def pca(A, B, n_pc, estimator_matrix, out_dir, n_threads, block_size):
    """Calculate the principal components for the vertical stack A or with
    combinations of the stack B

    The bands of ``A`` (stacked with those of ``B`` when ``B`` is given) are
    the variables. A band-by-band correlation or covariance matrix is built,
    decomposed with ``np.linalg.eigh``, and the first ``n_pc`` components
    are written as ``pc_<i>.tif`` files in ``out_dir``. Overviews are then
    built for each file with ``gdaladdo``.

    :param A: first input raster data (first period)
    :param B: second input raster data (second period) or None
    :param n_pc: number of principal components to output
    :param estimator_matrix: "Correlation" or "Covariance"
        (matched case-insensitively)
    :param out_dir: directory to save the outputs
    :param n_threads: size of the thread pool dask is configured with
    :param block_size: forwarded to ``read_raster``
    :return: pca files list and statistics; the statistics dict has the
        keys "eigenvals", "eigenvals_%" and "eigenvectors"
    :raises ValueError: if ``estimator_matrix`` is not a supported name
    """
    # Reject unknown estimators up front. The matrix below is allocated with
    # np.empty, so an unmatched name used to leave it uninitialized and the
    # eigen-decomposition ran on garbage.
    estimators = {"correlation": da.corrcoef, "covariance": da.cov}
    try:
        estimator = estimators[str(estimator_matrix).lower()]
    except KeyError:
        raise ValueError(
            "estimator_matrix must be 'Correlation' or 'Covariance', "
            "got {!r}".format(estimator_matrix)) from None

    # init dask as threads (shared memory is required)
    dask.config.set(pool=ThreadPool(n_threads))

    def get_profile(path):
        """Get geospatial metadata profile such as projections, pixel sizes, etc"""
        with rasterio.open(path) as src:
            return src.profile.copy()

    if B:
        raw_image_a = read_raster(A, block_size=block_size)
        raw_image_b = read_raster(B, block_size=block_size)
        raw_image = da.vstack((raw_image_a, raw_image_b))
    else:
        raw_image = read_raster(A, block_size=block_size)

    # flat each dimension (bands)
    flat_dims = raw_image.reshape(
        (raw_image.shape[0], raw_image.shape[1] * raw_image.shape[2]))

    n_bands = raw_image.shape[0]

    ########
    # compute the mean of each band, in order to center the matrix.
    band_mean = dask.compute(
        *[dask.delayed(da.mean)(flat_dims[i]) for i in range(n_bands)])

    ########
    # compute the matrix correlation/covariance
    estimation_matrix = np.empty((n_bands, n_bands))
    for i in range(n_bands):
        deviation_scores_band_i = flat_dims[i] - band_mean[i]
        for j in range(i, n_bands):
            deviation_scores_band_j = flat_dims[j] - band_mean[j]
            # Compute the lazy value once; assigning the dask object to
            # both cells directly would evaluate the graph twice.
            value = estimator(deviation_scores_band_i,
                              deviation_scores_band_j)[0][1].compute()
            estimation_matrix[j][i] = estimation_matrix[i][j] = value

    ########
    # calculate eigenvectors & eigenvalues of the matrix
    # use 'eigh' rather than 'eig' since estimation_matrix
    # is symmetric, the performance gain is substantial
    eigenvals, eigenvectors = np.linalg.eigh(estimation_matrix)

    # sort eigenvalue in decreasing order
    idx_eigenvals = np.argsort(eigenvals)[::-1]
    eigenvectors = eigenvectors[:, idx_eigenvals]
    # sort eigenvectors according to same index
    eigenvals = eigenvals[idx_eigenvals]
    # select the first n eigenvectors (n is desired dimension
    # of rescaled data array, or dims_rescaled_data)
    eigenvectors = eigenvectors[:, :n_pc]

    ########
    # save the principal components separated in tif images

    # output image profile
    prof = get_profile(A)
    prof.update(count=1, driver='GTiff', dtype=np.float32)

    @dask.delayed
    def get_principal_component(i, j):
        return eigenvectors[j, i] * (raw_image[j] - band_mean[j])

    pca_files = []
    for i in range(n_pc):
        pc = dask.delayed(sum)(
            [get_principal_component(i, j) for j in range(n_bands)])
        pc = pc.astype(np.float32)
        # save component as file
        tmp_pca_file = Path(out_dir, 'pc_{}.tif'.format(i + 1))
        write_raster(tmp_pca_file, pc.compute(), **prof)
        pca_files.append(tmp_pca_file)

    # compute the pyramids for each pc image
    @dask.delayed
    def pyramids(pca_file):
        # Argument list instead of a shell string, so a path containing
        # quotes or shell metacharacters cannot break or alter the command.
        call(['gdaladdo', '--config', 'BIGTIFF_OVERVIEW', 'YES',
              str(pca_file)])

    dask.compute(*[pyramids(pca_file) for pca_file in pca_files],
                 num_workers=2)

    ########
    # pca statistics
    pca_stats = {}
    pca_stats["eigenvals"] = eigenvals
    # Share of the total: the eigenvalue sum equals n_bands for a
    # correlation matrix, but not for a covariance matrix.
    pca_stats["eigenvals_%"] = eigenvals * 100 / eigenvals.sum()
    pca_stats["eigenvectors"] = eigenvectors

    return pca_files, pca_stats
def pca(a, b, n_pc, estimator_matrix, out_dir, n_threads, block_size, nodata):
    """Calculate the principal components for the vertical stack A or with
    combinations of the stack B

    Every band of ``a`` (and of ``b`` when given) is read with GDAL and
    flattened. Pixels equal to ``nodata`` in any band are excluded from the
    statistics. The first ``n_pc`` components are written as the bands of
    ``<out_dir>/pca-stack.tif``, with masked pixels set to 0.

    :param a: first input raster data (first period)
    :param b: second input raster data (second period) or None
    :param n_pc: number of principal components to output
    :param estimator_matrix: "correlation" or "covariance"
        (matched case-insensitively)
    :param out_dir: directory to save the outputs
    :param n_threads: size of the thread pool dask is configured with
    :param block_size: the flattened stack is rechunked to
        ``(1, block_size**2)``
    :param nodata: nodata value; when None it is read from band 1 of ``a``
    :return: None; the result is the file written to ``out_dir``
    :raises ValueError: if ``estimator_matrix`` is not a supported name
    """
    # Reject unknown estimators up front. The matrix below is allocated with
    # np.empty, so an unmatched name used to leave it uninitialized and the
    # eigen-decomposition ran on garbage.
    estimators = {"correlation": da.corrcoef, "covariance": da.cov}
    try:
        estimator = estimators[str(estimator_matrix).lower()]
    except KeyError:
        raise ValueError(
            "estimator_matrix must be 'correlation' or 'covariance', "
            "got {!r}".format(estimator_matrix)) from None

    A = a
    B = b
    # get/set the nodata
    if nodata is None:
        ds = gdal.Open(A, gdal.GA_ReadOnly)
        nodata = ds.GetRasterBand(1).GetNoDataValue()
        del ds

    print("\nPRINCIPAL COMPONENTS ANALYSIS")
    print(" Compute {} components for:".format(n_pc))
    print(" A: {}".format(A))
    if B is not None:
        print(" B: {}".format(B))

    # init dask as threads (shared memory is required)
    dask.config.set(pool=ThreadPool(n_threads))
    # registered Dask progress bar
    # NOTE(review): the bar is never unregistered, so it stays active after
    # this function returns - confirm that is intended.
    pbar = ProgressBar()
    pbar.register()

    print("\nRead and prepare data:")

    raw_image = []
    nodata_mask = None
    src_ds_A = gdal.Open(A, gdal.GA_ReadOnly)
    src_ds_B = None
    for band in range(src_ds_A.RasterCount):
        ds = src_ds_A.GetRasterBand(band + 1).ReadAsArray().flatten().astype(
            np.float32)
        if nodata is not None:
            # accumulate the union of nodata pixels across bands
            nodata_mask = ds == nodata if nodata_mask is None else \
                np.logical_or(nodata_mask, ds == nodata)
        raw_image.append(ds)
    if B is not None:
        src_ds_B = gdal.Open(B, gdal.GA_ReadOnly)
        for band in range(src_ds_B.RasterCount):
            ds = src_ds_B.GetRasterBand(band + 1).ReadAsArray().flatten().astype(
                np.float32)
            if nodata is not None:
                nodata_mask = np.logical_or(nodata_mask, ds == nodata)
            raw_image.append(ds)

    # pair-masking data, let only the valid data across all dimensions/bands
    if nodata is not None:
        raw_image = [values[~nodata_mask] for values in raw_image]
    # flat each dimension (bands)
    flat_dims = da.vstack(raw_image).rechunk((1, block_size**2))
    # bands
    n_bands = flat_dims.shape[0]

    ########
    # compute the mean of each band, in order to center the matrix.
    band_mean = dask.compute(
        *[dask.delayed(np.mean)(flat_dims[i]) for i in range(n_bands)])

    ########
    # compute the matrix correlation/covariance
    print("\nComputing the estimator matrix:")
    estimation_matrix = np.empty((n_bands, n_bands))
    for i in range(n_bands):
        deviation_scores_band_i = flat_dims[i] - band_mean[i]
        for j in range(i, n_bands):
            deviation_scores_band_j = flat_dims[j] - band_mean[j]
            # Compute the lazy value once; assigning the dask object to
            # both cells directly would evaluate the graph twice.
            value = estimator(deviation_scores_band_i,
                              deviation_scores_band_j)[0][1].compute()
            estimation_matrix[j][i] = estimation_matrix[i][j] = value

    # free mem
    del raw_image, flat_dims, src_ds_B, ds

    ########
    # calculate eigenvectors & eigenvalues of the matrix
    # use 'eigh' rather than 'eig' since estimation_matrix
    # is symmetric, the performance gain is substantial
    eigenvals, eigenvectors = np.linalg.eigh(estimation_matrix)

    # sort eigenvalue in decreasing order
    idx_eigenvals = np.argsort(eigenvals)[::-1]
    eigenvectors = eigenvectors[:, idx_eigenvals]
    # sort eigenvectors according to same index
    eigenvals = eigenvals[idx_eigenvals]
    # select the first n eigenvectors (n is desired dimension
    # of rescaled data array, or dims_rescaled_data)
    eigenvectors = eigenvectors[:, :n_pc]

    ########
    # save the principal components separated in tif images

    def get_raw_band_from_stack(band):
        # Re-read one full (unmasked) band from A, or from B once the index
        # runs past A's band count.
        src_ds_A = gdal.Open(A, gdal.GA_ReadOnly)
        if band < src_ds_A.RasterCount:
            return src_ds_A.GetRasterBand(band + 1).ReadAsArray().flatten().astype(
                np.float32)
        if band >= src_ds_A.RasterCount:
            src_ds_B = gdal.Open(B, gdal.GA_ReadOnly)
            return src_ds_B.GetRasterBand(
                band - src_ds_A.RasterCount + 1).ReadAsArray().flatten().astype(
                np.float32)

    @dask.delayed
    def get_principal_component(i, j):
        return eigenvectors[j, i] * (get_raw_band_from_stack(j) - band_mean[j])

    print("\nComputing and saving the components in pca-stack.tif:")

    # save component as file
    tmp_pca_file = Path(out_dir) / 'pca-stack.tif'
    driver = gdal.GetDriverByName("GTiff")
    out_pc = driver.Create(str(tmp_pca_file), src_ds_A.RasterXSize,
                           src_ds_A.RasterYSize, n_pc, gdal.GDT_Float32)

    for i in range(n_pc):
        pc = dask.delayed(sum)(
            [get_principal_component(i, j) for j in range(n_bands)])
        pc = pc.astype(np.float32)
        pc = np.array(pc.compute())
        if nodata is not None:
            pc[nodata_mask] = 0
        pc = pc.reshape((src_ds_A.RasterYSize, src_ds_A.RasterXSize))

        pcband = out_pc.GetRasterBand(i + 1)
        if nodata is not None:
            pcband.SetNoDataValue(0)
        pcband.WriteArray(pc)
        del pc, pcband

    # set projection and geotransform
    if src_ds_A.GetGeoTransform() is not None:
        out_pc.SetGeoTransform(src_ds_A.GetGeoTransform())
    if src_ds_A.GetProjection() is not None:
        out_pc.SetProjection(src_ds_A.GetProjection())
    out_pc.FlushCache()

    # free mem
    del src_ds_A, nodata_mask, out_pc

    print("\nDONE")
def fit(self):
    """Ensure ``self.corr_mat`` is populated and flag the instance as fitted.

    When no correlation matrix is present, ``self.matrix`` is converted
    lazily through ``toarray``, wrapped as a dask array with the same shape
    and dtype, and its ``da.corrcoef`` result is computed and stored.
    """
    missing_corr = self.corr_mat is None
    if missing_corr:
        source = self.matrix
        lazy_dense = delayed(toarray)(source)
        dense = da.from_delayed(
            lazy_dense, shape=source.shape, dtype=source.dtype)
        self.corr_mat = da.corrcoef(dense).compute()
    self.fitted = True
def pca(A, B, n_pc, estimator_matrix, out_dir, n_threads, block_size, nodata=None):
    """Calculate the principal components for the vertical stack A or with
    combinations of the stack B

    Every band of ``A`` (and of ``B`` when given) is read with GDAL and
    flattened. Pixels equal to ``nodata`` in any band are excluded from the
    statistics. Each of the first ``n_pc`` components is written to its own
    ``pc_<i>.tif`` in ``out_dir``, with masked pixels set to 0, and
    overviews are then built with ``gdaladdo``.

    :param A: first input raster data (first period)
    :param B: second input raster data (second period) or None
    :param n_pc: number of principal components to output
    :param estimator_matrix: "Correlation" or "Covariance"
        (matched case-insensitively)
    :param out_dir: directory to save the outputs
    :param n_threads: size of the thread pool dask is configured with
    :param block_size: the flattened stack is rechunked to
        ``(1, block_size**2)``
    :param nodata: value marking invalid pixels, or None for no masking
    :return: pca files list and statistics; the statistics dict has the
        keys "eigenvals", "eigenvals_%" and "eigenvectors"
    :raises ValueError: if ``estimator_matrix`` is not a supported name
    """
    # Reject unknown estimators up front. The matrix below is allocated with
    # np.empty, so an unmatched name used to leave it uninitialized and the
    # eigen-decomposition ran on garbage.
    estimators = {"correlation": da.corrcoef, "covariance": da.cov}
    try:
        estimator = estimators[str(estimator_matrix).lower()]
    except KeyError:
        raise ValueError(
            "estimator_matrix must be 'Correlation' or 'Covariance', "
            "got {!r}".format(estimator_matrix)) from None

    # init dask as threads (shared memory is required)
    dask.config.set(pool=ThreadPool(n_threads))

    raw_image = []
    nodata_mask = None
    src_ds_A = gdal.Open(str(A), gdal.GA_ReadOnly)
    src_ds_B = None
    for band in range(src_ds_A.RasterCount):
        ds = src_ds_A.GetRasterBand(band + 1).ReadAsArray().flatten().astype(
            np.float32)
        if nodata is not None:
            # accumulate the union of nodata pixels across bands
            nodata_mask = ds == nodata if nodata_mask is None else \
                np.logical_or(nodata_mask, ds == nodata)
        raw_image.append(ds)
    if B:
        src_ds_B = gdal.Open(str(B), gdal.GA_ReadOnly)
        for band in range(src_ds_B.RasterCount):
            ds = src_ds_B.GetRasterBand(band + 1).ReadAsArray().flatten().astype(
                np.float32)
            if nodata is not None:
                nodata_mask = np.logical_or(nodata_mask, ds == nodata)
            raw_image.append(ds)

    # pair-masking data, let only the valid data across all dimensions/bands
    if nodata is not None:
        raw_image = [values[~nodata_mask] for values in raw_image]
    # flat each dimension (bands)
    flat_dims = da.vstack(raw_image).rechunk((1, block_size**2))
    # bands
    n_bands = flat_dims.shape[0]

    ########
    # compute the mean of each band, in order to center the matrix.
    band_mean = dask.compute(
        *[dask.delayed(np.mean)(flat_dims[i]) for i in range(n_bands)])

    ########
    # compute the matrix correlation/covariance
    estimation_matrix = np.empty((n_bands, n_bands))
    for i in range(n_bands):
        deviation_scores_band_i = flat_dims[i] - band_mean[i]
        for j in range(i, n_bands):
            deviation_scores_band_j = flat_dims[j] - band_mean[j]
            # Compute the lazy value once; assigning the dask object to
            # both cells directly would evaluate the graph twice.
            value = estimator(deviation_scores_band_i,
                              deviation_scores_band_j)[0][1].compute()
            estimation_matrix[j][i] = estimation_matrix[i][j] = value

    # free mem
    del raw_image, flat_dims, src_ds_B, ds

    ########
    # calculate eigenvectors & eigenvalues of the matrix
    # use 'eigh' rather than 'eig' since estimation_matrix
    # is symmetric, the performance gain is substantial
    eigenvals, eigenvectors = np.linalg.eigh(estimation_matrix)

    # sort eigenvalue in decreasing order
    idx_eigenvals = np.argsort(eigenvals)[::-1]
    eigenvectors = eigenvectors[:, idx_eigenvals]
    # sort eigenvectors according to same index
    eigenvals = eigenvals[idx_eigenvals]
    # select the first n eigenvectors (n is desired dimension
    # of rescaled data array, or dims_rescaled_data)
    eigenvectors = eigenvectors[:, :n_pc]

    ########
    # save the principal components separated in tif images

    def get_raw_band_from_stack(band):
        # Re-read one full (unmasked) band from A, or from B once the index
        # runs past A's band count.
        src_ds_A = gdal.Open(str(A), gdal.GA_ReadOnly)
        if band < src_ds_A.RasterCount:
            return src_ds_A.GetRasterBand(band + 1).ReadAsArray().flatten().astype(
                np.float32)
        if band >= src_ds_A.RasterCount:
            src_ds_B = gdal.Open(str(B), gdal.GA_ReadOnly)
            return src_ds_B.GetRasterBand(
                band - src_ds_A.RasterCount + 1).ReadAsArray().flatten().astype(
                np.float32)

    pca_files = []
    for i in range(n_pc):
        pc = 0
        for j in range(n_bands):
            pc = pc + eigenvectors[j, i] * \
                (get_raw_band_from_stack(j) - band_mean[j])

        if nodata is not None:
            pc[nodata_mask] = 0
        pc = pc.reshape((src_ds_A.RasterYSize, src_ds_A.RasterXSize))
        # save component as file
        tmp_pca_file = Path(out_dir) / 'pc_{}.tif'.format(i + 1)
        driver = gdal.GetDriverByName("GTiff")
        out_pc = driver.Create(str(tmp_pca_file), src_ds_A.RasterXSize,
                               src_ds_A.RasterYSize, 1, gdal.GDT_Float32)
        pcband = out_pc.GetRasterBand(1)
        if nodata is not None:
            pcband.SetNoDataValue(0)
        pcband.WriteArray(pc)
        # set projection and geotransform
        if src_ds_A.GetGeoTransform() is not None:
            out_pc.SetGeoTransform(src_ds_A.GetGeoTransform())
        if src_ds_A.GetProjection() is not None:
            out_pc.SetProjection(src_ds_A.GetProjection())
        out_pc.FlushCache()
        del pc, pcband, out_pc

        pca_files.append(tmp_pca_file)

    # free mem
    del src_ds_A, nodata_mask

    # compute the pyramids for each pc image
    for pca_file in pca_files:
        # Argument list instead of a shell string, so a path containing
        # quotes or shell metacharacters cannot break or alter the command.
        call(['gdaladdo', '-q', '--config', 'BIGTIFF_OVERVIEW', 'YES',
              str(pca_file)])

    ########
    # pca statistics
    pca_stats = {}
    pca_stats["eigenvals"] = eigenvals
    # Share of the total: the eigenvalue sum equals n_bands for a
    # correlation matrix, but not for a covariance matrix.
    pca_stats["eigenvals_%"] = eigenvals * 100 / eigenvals.sum()
    pca_stats["eigenvectors"] = eigenvectors

    return pca_files, pca_stats