Exemple #1
0
    def fit(self, X, y=None):
        """Estimate the centering vector and the transformation weights from ``X``.

        The mean and covariance of ``X`` are computed, the covariance is
        inverted, and the lower Cholesky factor of that inverse is stored in
        ``self.weights``. ``self.input_subtract`` receives the mean and
        ``self.input_divide`` is set to ``1.0``.

        :param X: training samples; dask arrays are processed with dask
            routines, anything else with numpy/scipy
        :param y: accepted but not used
        :return: ``self``
        """
        # Choose the numerical backend that matches the input container
        if isinstance(X, dask.array.Array):
            import dask.array as numerical_module
            from dask.array.linalg import cholesky, inv
        else:
            import numpy as numerical_module
            from scipy.linalg import cholesky, inv

        # Statistics of the training set
        mean_vector = numerical_module.mean(X, axis=0)
        covariance = numerical_module.cov(X.T)

        # Inverse of the covariance matrix
        # NOTE(review): `pinv` is not imported in this method; it must be
        # provided by the enclosing module -- confirm.
        if self.pinv:
            precision = pinv(covariance)
        else:
            precision = inv(covariance)

        # Lower-triangular Cholesky factor of the inverse covariance
        self.weights = cholesky(precision, lower=True)
        self.input_subtract = mean_vector
        self.input_divide = 1.0

        return self
def target(query):
    """Return the covariance of columns 1.. with the column-0 contribution removed.

    One delayed block is built per row of ``query`` (columns ``filename`` and
    ``size``), the blocks are concatenated, and the covariance of the
    transposed stack is computed under a progress bar.

    NOTE(review): `process`, `to` and `fr` are defined outside this function;
    their meaning cannot be confirmed from here.

    :param query: table exposing ``filename`` and ``size`` columns
    :return: ``mat[1:, 1:] - mat[1:, 0] mat[0, 1:] / mat[0, 0]`` where ``mat``
        is the computed covariance matrix
    """
    blocks = []
    for _, filename, size in query[['filename', 'size']].itertuples():
        block = from_delayed(process(filename), [size, to - fr + 1], 'float64')
        blocks.append(block)
    stacked = concatenate(blocks)

    with ProgressBar():
        full = cov(stacked.T).compute()

    # Covariance among the remaining columns
    inner = full[1:, 1:]
    # Outer product of the column-0 cross terms, scaled by the column-0 variance
    first_col = full[1:, 0][:, None]
    first_row = full[0, 1:][None, :]
    explained = first_col @ first_row / full[0, 0]
    return inner - explained
Exemple #3
0
def compute_unsupervised_metrics(latent, y, discretize):
    """Collect unsupervised scores for a latent representation.

    :param latent: latent codes, passed to ``da.cov`` and ``compute_mig``
    :param y: forwarded unchanged to ``compute_mig``
    :param discretize: forwarded unchanged to ``compute_mig``
    :return: dict with the keys ``gaussian_total_correlation``,
        ``gaussian_wasserstein_correlation``,
        ``gaussian_wasserstein_correlation_norm`` and ``mutual_info_score``
    """
    latent_cov = da.cov(latent)

    # Gaussian total correlation.
    total_correlation = gaussian_total_correlation(latent_cov).compute()

    # Gaussian Wasserstein correlation, raw and divided by the covariance trace.
    wasserstein = gaussian_wasserstein_correlation(latent_cov).compute()
    wasserstein_norm = delayed(wasserstein / np.sum(np.diag(latent_cov))).compute()

    mig = compute_mig(latent, y, discretize)

    return {
        "gaussian_total_correlation": total_correlation,
        "gaussian_wasserstein_correlation": wasserstein,
        "gaussian_wasserstein_correlation_norm": wasserstein_norm,
        "mutual_info_score": mig,
    }
def test_cov():
    """Check that ``da.cov`` agrees with ``np.cov`` for one and two inputs.

    Covers the ``rowvar``, ``ddof`` and ``bias`` options, mixed 2-D/1-D
    inputs, and the ``ValueError`` raised for a non-integer ``ddof``.
    """
    import warnings

    x = np.arange(56).reshape((7, 8))
    d = da.from_array(x, chunks=(4, 4))

    assert_eq(da.cov(d), np.cov(x))
    assert_eq(da.cov(d, rowvar=0), np.cov(x, rowvar=0))
    # ddof exceeds the number of observations, which triggers a
    # "degrees of freedom <= 0" warning. `pytest.warns(None)` was deprecated
    # in pytest 7 and is rejected by pytest 8, so silence it with the stdlib.
    with warnings.catch_warnings():
        warnings.simplefilter("ignore")
        assert_eq(da.cov(d, ddof=10), np.cov(x, ddof=10))
    assert_eq(da.cov(d, bias=1), np.cov(x, bias=1))
    assert_eq(da.cov(d, d), np.cov(x, x))

    y = np.arange(8)
    e = da.from_array(y, chunks=(4, ))

    assert_eq(da.cov(d, e), np.cov(x, y))
    assert_eq(da.cov(e, d), np.cov(y, x))

    with pytest.raises(ValueError):
        da.cov(d, ddof=1.5)
Exemple #5
0
def test_cov():
    """Check that ``da.cov`` agrees with ``np.cov`` for one and two inputs.

    Covers the ``rowvar``, ``ddof`` and ``bias`` options, mixed 2-D/1-D
    inputs, and the ``ValueError`` raised for a non-integer ``ddof``.
    """
    import warnings

    x = np.arange(56).reshape((7, 8))
    d = da.from_array(x, chunks=(4, 4))

    assert_eq(da.cov(d), np.cov(x))
    assert_eq(da.cov(d, rowvar=0), np.cov(x, rowvar=0))
    # ddof exceeds the number of observations, which triggers a
    # "degrees of freedom <= 0" warning. `pytest.warns(None)` was deprecated
    # in pytest 7 and is rejected by pytest 8, so silence it with the stdlib.
    with warnings.catch_warnings():
        warnings.simplefilter("ignore")
        assert_eq(da.cov(d, ddof=10), np.cov(x, ddof=10))
    assert_eq(da.cov(d, bias=1), np.cov(x, bias=1))
    assert_eq(da.cov(d, d), np.cov(x, x))

    y = np.arange(8)
    e = da.from_array(y, chunks=(4,))

    assert_eq(da.cov(d, e), np.cov(x, y))
    assert_eq(da.cov(e, d), np.cov(y, x))

    with pytest.raises(ValueError):
        da.cov(d, ddof=1.5)
Exemple #6
0
def test_cov():
    """Compare ``da.cov`` against ``np.cov`` for one- and two-argument calls."""
    x = np.arange(56).reshape((7, 8))
    d = da.from_array(x, chunks=(4, 4))
    y = np.arange(8)
    e = da.from_array(y, chunks=(4,))

    # Single-input calls, each with a different keyword option
    for options in ({}, {"rowvar": 0}, {"ddof": 10}, {"bias": 1}):
        assert eq(da.cov(d, **options), np.cov(x, **options))

    # Two-input calls: 2-D with 2-D, then 2-D mixed with 1-D in both orders
    paired_inputs = (
        ((d, d), (x, x)),
        ((d, e), (x, y)),
        ((e, d), (y, x)),
    )
    for lazy_args, eager_args in paired_inputs:
        assert eq(da.cov(*lazy_args), np.cov(*eager_args))

    # A fractional ddof must be rejected
    assert raises(ValueError, lambda: da.cov(d, ddof=1.5))
Exemple #7
0
def test_cov():
    """Compare ``da.cov`` against ``np.cov`` for one- and two-argument calls."""
    x = np.arange(56).reshape((7, 8))
    d = da.from_array(x, chunks=(4, 4))
    y = np.arange(8)
    e = da.from_array(y, chunks=(4, ))

    # Single-input calls, each with a different keyword option
    single_input_options = [
        {},
        {"rowvar": 0},
        {"ddof": 10},
        {"bias": 1},
    ]
    for options in single_input_options:
        assert eq(da.cov(d, **options), np.cov(x, **options))

    # Two-input calls: 2-D with 2-D, then 2-D mixed with 1-D in both orders
    for lazy_args, eager_args in [((d, d), (x, x)),
                                  ((d, e), (x, y)),
                                  ((e, d), (y, x))]:
        assert eq(da.cov(*lazy_args), np.cov(*eager_args))

    # A fractional ddof must be rejected
    assert raises(ValueError, lambda: da.cov(d, ddof=1.5))
def pca(A, B, n_pc, estimator_matrix, out_dir, n_threads, block_size):
    """Calculate the principal components for the vertical stack A or with
    combinations of the stack B

    :param A: path of the first input raster (first period)
    :param B: path of the second input raster (second period), or None/empty
        to process A alone
    :param n_pc: number of principal components to output
    :param estimator_matrix: "Correlation" or "Covariance"
    :param out_dir: directory to save the outputs (``pc_<n>.tif`` files)
    :param n_threads: number of threads of the pool given to dask
    :param block_size: forwarded to ``read_raster``
    :return: tuple ``(pca_files, pca_stats)``: the list of written component
        files and a dict with the keys ``eigenvals``, ``eigenvals_%`` and
        ``eigenvectors``
    :raises ValueError: if ``estimator_matrix`` is not a supported value
    """
    # fail early: any other value would leave the estimator matrix filled with
    # the uninitialised memory returned by np.empty
    if estimator_matrix not in ("Correlation", "Covariance"):
        raise ValueError(
            "estimator_matrix must be 'Correlation' or 'Covariance', got {!r}"
            .format(estimator_matrix))

    # init dask as threads (shared memory is required)
    dask.config.set(pool=ThreadPool(n_threads))

    def get_profile(path):
        """Get geospatial metadata profile such as projections, pixel sizes, etc"""
        with rasterio.open(path) as src:
            return src.profile.copy()

    if B:
        raw_image_a = read_raster(A, block_size=block_size)
        raw_image_b = read_raster(B, block_size=block_size)
        raw_image = da.vstack((raw_image_a, raw_image_b))
    else:
        raw_image = read_raster(A, block_size=block_size)

    # flat each dimension (bands)
    flat_dims = raw_image.reshape(
        (raw_image.shape[0], raw_image.shape[1] * raw_image.shape[2]))

    n_bands = raw_image.shape[0]

    ########
    # compute the mean of each band, in order to center the matrix.
    band_mean = []
    for i in range(n_bands):
        band_mean.append(dask.delayed(da.mean)(flat_dims[i]))
    band_mean = dask.compute(*band_mean)

    ########
    # compute the matrix correlation/covariance
    estimator = da.corrcoef if estimator_matrix == "Correlation" else da.cov
    estimation_matrix = np.empty((n_bands, n_bands))
    for i in range(n_bands):
        deviation_scores_band_i = flat_dims[i] - band_mean[i]
        # the matrix is symmetric: compute the upper triangle and mirror it
        for j in range(i, n_bands):
            deviation_scores_band_j = flat_dims[j] - band_mean[j]
            estimation_matrix[j][i] = estimation_matrix[i][j] = \
                estimator(deviation_scores_band_i, deviation_scores_band_j)[0][1]

    ########
    # calculate eigenvectors & eigenvalues of the matrix
    # use 'eigh' rather than 'eig' since estimation_matrix
    # is symmetric, the performance gain is substantial
    eigenvals, eigenvectors = np.linalg.eigh(estimation_matrix)

    # sort eigenvalue in decreasing order
    idx_eigenvals = np.argsort(eigenvals)[::-1]
    eigenvectors = eigenvectors[:, idx_eigenvals]
    # sort eigenvectors according to same index
    eigenvals = eigenvals[idx_eigenvals]
    # select the first n eigenvectors (n is desired dimension
    # of rescaled data array, or dims_rescaled_data)
    eigenvectors = eigenvectors[:, :n_pc]

    ########
    # save the principal components separated in tif images

    # output image profile
    prof = get_profile(A)
    prof.update(count=1, driver='GTiff', dtype=np.float32)

    @dask.delayed
    def get_principal_component(i, j):
        return eigenvectors[j, i] * (raw_image[j] - band_mean[j])

    pca_files = []
    for i in range(n_pc):
        pc = dask.delayed(sum)(
            [get_principal_component(i, j) for j in range(n_bands)])
        pc = pc.astype(np.float32)
        # save component as file
        tmp_pca_file = Path(out_dir, 'pc_{}.tif'.format(i + 1))
        write_raster(tmp_pca_file, pc.compute(), **prof)
        pca_files.append(tmp_pca_file)

    # compute the pyramids for each pc image
    @dask.delayed
    def pyramids(pca_file):
        call('gdaladdo --config BIGTIFF_OVERVIEW YES "{}"'.format(pca_file),
             shell=True)

    dask.compute(*[pyramids(pca_file) for pca_file in pca_files],
                 num_workers=2)

    ########
    # pca statistics
    pca_stats = {}
    pca_stats["eigenvals"] = eigenvals
    # share of each eigenvalue in the total: the eigenvalues sum to the trace
    # of the estimator matrix, which equals n_bands only for a correlation
    # matrix, so normalise by the actual sum to stay correct for covariance
    pca_stats["eigenvals_%"] = eigenvals * 100 / eigenvals.sum()
    pca_stats["eigenvectors"] = eigenvectors

    return pca_files, pca_stats
Exemple #9
0
def _cov_diaged(da: daskarr) -> daskarr:
    """Return the column-wise covariance of ``da`` with the identity added.

    Positive-infinite and NaN entries of the covariance are replaced by 0
    before the identity matrix is added.

    :param da: input array; its columns are treated as variables (``rowvar=0``)
    :return: cleaned covariance matrix plus ``np.eye`` of matching size
    """
    a = daskarr.cov(da, rowvar=0)
    a[a == np.inf] = 0
    # NaN never compares equal to anything (itself included), so the former
    # `a == np.nan` mask was always empty; isnan selects the NaN entries.
    a[np.isnan(a)] = 0
    return a + np.eye(a.shape[0])
Exemple #10
0
def pca(a, b, n_pc, estimator_matrix, out_dir, n_threads, block_size, nodata):
    """Calculate the principal components for the vertical stack A or with
    combinations of the stack B, and write them as the bands of
    ``pca-stack.tif`` inside ``out_dir``

    :param a: path of the first input raster (first period)
    :param b: path of the second input raster (second period) or None
    :param n_pc: number of principal components to output
    :param estimator_matrix: "correlation" or "covariance"
    :param out_dir: directory to save the output
    :param n_threads: number of threads of the pool given to dask
    :param block_size: the flattened bands are rechunked in pieces of
        ``block_size**2`` values
    :param nodata: value marking invalid pixels; when None it is read from the
        first band of ``a``. Pixels flagged in any band are left out of the
        statistics and written as 0 in the output
    :return: None
    :raises ValueError: if ``estimator_matrix`` is not a supported value
    """
    # fail early: any other value would leave the estimator matrix filled with
    # the uninitialised memory returned by np.empty
    if estimator_matrix not in ("correlation", "covariance"):
        raise ValueError(
            "estimator_matrix must be 'correlation' or 'covariance', got {!r}"
            .format(estimator_matrix))

    A = a
    B = b
    # get/set the nodata
    if nodata is None:
        ds = gdal.Open(A, gdal.GA_ReadOnly)
        nodata = ds.GetRasterBand(1).GetNoDataValue()
        del ds

    print("\nPRINCIPAL COMPONENTS ANALYSIS")
    print("    Compute {} components for:".format(n_pc))
    print("    A: {}".format(A))
    if B is not None:
        print("    B: {}".format(B))

    # init dask as threads (shared memory is required)
    dask.config.set(pool=ThreadPool(n_threads))
    # registered Dask progress bar
    # NOTE(review): the bar is registered globally and never unregistered here
    pbar = ProgressBar()
    pbar.register()

    print("\nRead and prepare data:")

    raw_image = []
    nodata_mask = None
    src_ds_A = gdal.Open(A, gdal.GA_ReadOnly)
    src_ds_B = None
    for band in range(src_ds_A.RasterCount):
        ds = src_ds_A.GetRasterBand(band + 1).ReadAsArray().flatten().astype(
            np.float32)
        if nodata is not None:
            # accumulate the pixels that are nodata in at least one band
            nodata_mask = ds == nodata if nodata_mask is None else np.logical_or(
                nodata_mask, ds == nodata)
        raw_image.append(ds)
    if B is not None:
        src_ds_B = gdal.Open(B, gdal.GA_ReadOnly)
        for band in range(src_ds_B.RasterCount):
            ds = src_ds_B.GetRasterBand(band +
                                        1).ReadAsArray().flatten().astype(
                                            np.float32)
            if nodata is not None:
                nodata_mask = np.logical_or(nodata_mask, ds == nodata)
            raw_image.append(ds)

    # pair-masking data, let only the valid data across all dimensions/bands
    if nodata is not None:
        raw_image = [b[~nodata_mask] for b in raw_image]
    # flat each dimension (bands)
    flat_dims = da.vstack(raw_image).rechunk((1, block_size**2))
    # bands
    n_bands = flat_dims.shape[0]

    ########
    # compute the mean of each band, in order to center the matrix.
    band_mean = []
    for i in range(n_bands):
        band_mean.append(dask.delayed(np.mean)(flat_dims[i]))
    band_mean = dask.compute(*band_mean)

    ########
    # compute the matrix correlation/covariance
    print("\nComputing the estimator matrix:")
    estimator = da.corrcoef if estimator_matrix == "correlation" else da.cov
    estimation_matrix = np.empty((n_bands, n_bands))
    for i in range(n_bands):
        deviation_scores_band_i = flat_dims[i] - band_mean[i]
        # the matrix is symmetric: compute the upper triangle and mirror it
        for j in range(i, n_bands):
            deviation_scores_band_j = flat_dims[j] - band_mean[j]
            estimation_matrix[j][i] = estimation_matrix[i][j] = \
                estimator(deviation_scores_band_i, deviation_scores_band_j)[0][1]
    # free mem
    del raw_image, flat_dims, src_ds_B, ds

    ########
    # calculate eigenvectors & eigenvalues of the matrix
    # use 'eigh' rather than 'eig' since estimation_matrix
    # is symmetric, the performance gain is substantial
    eigenvals, eigenvectors = np.linalg.eigh(estimation_matrix)

    # sort eigenvalue in decreasing order
    idx_eigenvals = np.argsort(eigenvals)[::-1]
    eigenvectors = eigenvectors[:, idx_eigenvals]
    # sort eigenvectors according to same index
    eigenvals = eigenvals[idx_eigenvals]
    # select the first n eigenvectors (n is desired dimension
    # of rescaled data array, or dims_rescaled_data)
    eigenvectors = eigenvectors[:, :n_pc]

    ########
    # save the principal components as bands of a single tif image

    def get_raw_band_from_stack(band):
        # re-read the full (unmasked) band from disk; indexes past the band
        # count of A address the bands of B
        src_ds_A = gdal.Open(A, gdal.GA_ReadOnly)
        if band < src_ds_A.RasterCount:
            return src_ds_A.GetRasterBand(band +
                                          1).ReadAsArray().flatten().astype(
                                              np.float32)
        if band >= src_ds_A.RasterCount:
            src_ds_B = gdal.Open(B, gdal.GA_ReadOnly)
            return src_ds_B.GetRasterBand(band - src_ds_A.RasterCount +
                                          1).ReadAsArray().flatten().astype(
                                              np.float32)

    @dask.delayed
    def get_principal_component(i, j):
        return eigenvectors[j, i] * (get_raw_band_from_stack(j) - band_mean[j])

    print("\nComputing and saving the components in pca-stack.tif:")

    # save component as file
    tmp_pca_file = Path(out_dir) / 'pca-stack.tif'
    driver = gdal.GetDriverByName("GTiff")
    out_pc = driver.Create(str(tmp_pca_file), src_ds_A.RasterXSize,
                           src_ds_A.RasterYSize, n_pc, gdal.GDT_Float32)

    for i in range(n_pc):
        pc = dask.delayed(sum)(
            [get_principal_component(i, j) for j in range(n_bands)])
        pc = pc.astype(np.float32)
        pc = np.array(pc.compute())
        if nodata is not None:
            pc[nodata_mask] = 0
        pc = pc.reshape((src_ds_A.RasterYSize, src_ds_A.RasterXSize))

        pcband = out_pc.GetRasterBand(i + 1)
        if nodata is not None:
            pcband.SetNoDataValue(0)
        pcband.WriteArray(pc)
        del pc, pcband
    # set projection and geotransform
    if src_ds_A.GetGeoTransform() is not None:
        out_pc.SetGeoTransform(src_ds_A.GetGeoTransform())
    if src_ds_A.GetProjection() is not None:
        out_pc.SetProjection(src_ds_A.GetProjection())
    out_pc.FlushCache()

    # free mem
    del src_ds_A, nodata_mask, out_pc

    print("\nDONE")
        #     f['photon_diagnostics/FEL01/I0_monitor/iom_sh_a_pc'][...]
        #         .astype('float64')
        # )
        intensities = arrs.sum(1)
        where = bunches % bp != 0
    return append(intensities[where, None], arrs[where, :], axis=1)


# %%
# Run number inserted (zero-padded to three digits) into the glob pattern below.
run = 442
# Every HDF5 file in the raw-data directory of that run (hard-coded location).
globbed = glob('/home/ldm/ExperimentalData/Online4LDM/20144078'
               '/Test/Run_{:03d}/rawdata/*.h5'.format(run))
# One float64 block per file, built with from_delayed and declared with the
# shape [shapes(g), to-fr+1], then concatenated into a single array.
# NOTE(review): `process`, `shapes`, `to` and `fr` are defined outside this
# excerpt -- confirm what they provide.
arr = concatenate([from_delayed(process(g), [shapes(g), to-fr+1], 'float64')
                   for g in globbed])
# Evaluate the covariance of the transposed array while showing progress.
with ProgressBar():
    img = cov(arr.T).compute()

# %%
def scale(arr):
    """Return symmetric limits ``(-m, m)`` for displaying ``arr``.

    ``m`` is half of the smaller of ``|arr.min()|`` and ``|arr.max()|``.
    """
    lowest_magnitude = abs(arr.min())
    highest_magnitude = abs(arr.max())
    if highest_magnitude < lowest_magnitude:
        bound = highest_magnitude * 0.5
    else:
        bound = lowest_magnitude * 0.5
    return -bound, bound


# Covariance among all columns except the first one.
img_cov = img[1:, 1:]
# Outer product of the first column's cross-covariances, divided by the
# variance of that first column (img[0, 0]).
img_icov = img[1:, 0][:, None] @ img[0, 1:][None, :] / img[0, 0]
# Covariance with the first-column contribution subtracted.
img_pcov = img_cov - img_icov

# Show the result in the first cell of a 2x2 subplot grid, with a diverging
# colormap and symmetric color limits taken from scale().
plt.figure()
plt.subplot(221)
plt.pcolormesh(img_pcov, cmap='RdBu')
plt.clim(*scale(img_pcov))
Exemple #12
0
def pca(A,
        B,
        n_pc,
        estimator_matrix,
        out_dir,
        n_threads,
        block_size,
        nodata=None):
    """Calculate the principal components for the vertical stack A or with
    combinations of the stack B

    :param A: path of the first input raster (first period)
    :param B: path of the second input raster (second period), or None/empty
        to process A alone
    :param n_pc: number of principal components to output
    :param estimator_matrix: "Correlation" or "Covariance"
    :param out_dir: directory to save the outputs (``pc_<n>.tif`` files)
    :param n_threads: number of threads of the pool given to dask
    :param block_size: the flattened bands are rechunked in pieces of
        ``block_size**2`` values
    :param nodata: value marking invalid pixels, or None to use every pixel.
        Pixels flagged in any band are left out of the statistics and written
        as 0 in the outputs
    :return: tuple ``(pca_files, pca_stats)``: the list of written component
        files and a dict with the keys ``eigenvals``, ``eigenvals_%`` and
        ``eigenvectors``
    :raises ValueError: if ``estimator_matrix`` is not a supported value
    """
    # fail early: any other value would leave the estimator matrix filled with
    # the uninitialised memory returned by np.empty
    if estimator_matrix not in ("Correlation", "Covariance"):
        raise ValueError(
            "estimator_matrix must be 'Correlation' or 'Covariance', got {!r}"
            .format(estimator_matrix))

    # init dask as threads (shared memory is required)
    dask.config.set(pool=ThreadPool(n_threads))

    raw_image = []
    nodata_mask = None
    src_ds_A = gdal.Open(str(A), gdal.GA_ReadOnly)
    src_ds_B = None
    for band in range(src_ds_A.RasterCount):
        ds = src_ds_A.GetRasterBand(band + 1).ReadAsArray().flatten().astype(
            np.float32)
        if nodata is not None:
            # accumulate the pixels that are nodata in at least one band
            nodata_mask = ds == nodata if nodata_mask is None else np.logical_or(
                nodata_mask, ds == nodata)
        raw_image.append(ds)
    if B:
        src_ds_B = gdal.Open(str(B), gdal.GA_ReadOnly)
        for band in range(src_ds_B.RasterCount):
            ds = src_ds_B.GetRasterBand(band +
                                        1).ReadAsArray().flatten().astype(
                                            np.float32)
            if nodata is not None:
                nodata_mask = np.logical_or(nodata_mask, ds == nodata)
            raw_image.append(ds)

    # pair-masking data, let only the valid data across all dimensions/bands
    if nodata is not None:
        raw_image = [b[~nodata_mask] for b in raw_image]
    # flat each dimension (bands)
    flat_dims = da.vstack(raw_image).rechunk((1, block_size**2))
    # bands
    n_bands = flat_dims.shape[0]

    ########
    # compute the mean of each band, in order to center the matrix.
    band_mean = []
    for i in range(n_bands):
        band_mean.append(dask.delayed(np.mean)(flat_dims[i]))
    band_mean = dask.compute(*band_mean)

    ########
    # compute the matrix correlation/covariance
    estimator = da.corrcoef if estimator_matrix == "Correlation" else da.cov
    estimation_matrix = np.empty((n_bands, n_bands))
    for i in range(n_bands):
        deviation_scores_band_i = flat_dims[i] - band_mean[i]
        # the matrix is symmetric: compute the upper triangle and mirror it
        for j in range(i, n_bands):
            deviation_scores_band_j = flat_dims[j] - band_mean[j]
            estimation_matrix[j][i] = estimation_matrix[i][j] = \
                estimator(deviation_scores_band_i, deviation_scores_band_j)[0][1]
    # free mem
    del raw_image, flat_dims, src_ds_B, ds

    ########
    # calculate eigenvectors & eigenvalues of the matrix
    # use 'eigh' rather than 'eig' since estimation_matrix
    # is symmetric, the performance gain is substantial
    eigenvals, eigenvectors = np.linalg.eigh(estimation_matrix)

    # sort eigenvalue in decreasing order
    idx_eigenvals = np.argsort(eigenvals)[::-1]
    eigenvectors = eigenvectors[:, idx_eigenvals]
    # sort eigenvectors according to same index
    eigenvals = eigenvals[idx_eigenvals]
    # select the first n eigenvectors (n is desired dimension
    # of rescaled data array, or dims_rescaled_data)
    eigenvectors = eigenvectors[:, :n_pc]

    ########
    # save the principal components separated in tif images

    def get_raw_band_from_stack(band):
        # re-read the full (unmasked) band from disk; indexes past the band
        # count of A address the bands of B
        src_ds_A = gdal.Open(str(A), gdal.GA_ReadOnly)
        if band < src_ds_A.RasterCount:
            return src_ds_A.GetRasterBand(band +
                                          1).ReadAsArray().flatten().astype(
                                              np.float32)
        if band >= src_ds_A.RasterCount:
            src_ds_B = gdal.Open(str(B), gdal.GA_ReadOnly)
            return src_ds_B.GetRasterBand(band - src_ds_A.RasterCount +
                                          1).ReadAsArray().flatten().astype(
                                              np.float32)

    pca_files = []
    for i in range(n_pc):
        # weighted sum of the centered bands using the i-th eigenvector
        pc = 0
        for j in range(n_bands):
            pc = pc + eigenvectors[j, i] * (get_raw_band_from_stack(j) -
                                            band_mean[j])

        if nodata is not None:
            pc[nodata_mask] = 0
        pc = pc.reshape((src_ds_A.RasterYSize, src_ds_A.RasterXSize))
        # save component as file
        tmp_pca_file = Path(out_dir) / 'pc_{}.tif'.format(i + 1)
        driver = gdal.GetDriverByName("GTiff")
        out_pc = driver.Create(str(tmp_pca_file), src_ds_A.RasterXSize,
                               src_ds_A.RasterYSize, 1, gdal.GDT_Float32)
        pcband = out_pc.GetRasterBand(1)
        if nodata is not None:
            pcband.SetNoDataValue(0)
        pcband.WriteArray(pc)
        # set projection and geotransform
        if src_ds_A.GetGeoTransform() is not None:
            out_pc.SetGeoTransform(src_ds_A.GetGeoTransform())
        if src_ds_A.GetProjection() is not None:
            out_pc.SetProjection(src_ds_A.GetProjection())
        out_pc.FlushCache()
        del pc, pcband, out_pc

        pca_files.append(tmp_pca_file)

    # free mem
    del src_ds_A, nodata_mask

    # compute the pyramids for each pc image
    for pca_file in pca_files:
        call('gdaladdo -q --config BIGTIFF_OVERVIEW YES "{}"'.format(pca_file),
             shell=True)

    ########
    # pca statistics
    pca_stats = {}
    pca_stats["eigenvals"] = eigenvals
    # share of each eigenvalue in the total: the eigenvalues sum to the trace
    # of the estimator matrix, which equals n_bands only for a correlation
    # matrix, so normalise by the actual sum to stay correct for covariance
    pca_stats["eigenvals_%"] = eigenvals * 100 / eigenvals.sum()
    pca_stats["eigenvectors"] = eigenvectors

    return pca_files, pca_stats