Example #1

import logging

import dask.array as da
from dask.distributed import get_client

logger = logging.getLogger(__name__)

# _initialize_clusters, _setup_cluster_matrix and _distance are module-level
# helpers not included in this snippet; hedged sketches follow the example.

def triclustering(Z,
                  nclusters_row,
                  nclusters_col,
                  nclusters_bnd,
                  errobj,
                  niters,
                  epsilon,
                  row_clusters_init=None,
                  col_clusters_init=None,
                  bnd_clusters_init=None):
    """
    Run the tri-clustering, Dask implementation

    :param Z: d x m x n data matrix
    :param nclusters_row: number of row clusters
    :param nclusters_col: number of column clusters
    :param nclusters_bnd: number of band clusters
    :param errobj: convergence threshold for the objective function
    :param niters: maximum number of iterations
    :param epsilon: numerical parameter, avoids zero arguments in log
    :param row_clusters_init: initial row cluster assignment
    :param col_clusters_init: initial column cluster assignment
    :param bnd_clusters_init: initial band cluster assignment
    :return: has converged, number of iterations performed, final row,
    column, and band clustering, and the error value
    """
    client = get_client()

    Z = da.array(Z) if not isinstance(Z, da.Array) else Z

    [d, m, n] = Z.shape
    bnd_chunks, row_chunks, col_chunks = Z.chunksize

    row_clusters = da.array(row_clusters_init) \
        if row_clusters_init is not None \
        else _initialize_clusters(m, nclusters_row, chunks=row_chunks)
    col_clusters = da.array(col_clusters_init) \
        if col_clusters_init is not None \
        else _initialize_clusters(n, nclusters_col, chunks=col_chunks)
    bnd_clusters = da.array(bnd_clusters_init) \
        if bnd_clusters_init is not None \
        else _initialize_clusters(d, nclusters_bnd, chunks=bnd_chunks)
    R = _setup_cluster_matrix(nclusters_row, row_clusters)
    C = _setup_cluster_matrix(nclusters_col, col_clusters)
    B = _setup_cluster_matrix(nclusters_bnd, bnd_clusters)

    e, old_e = 2 * errobj, 0
    s = 0
    converged = False

    Gavg = Z.mean()

    while not converged and s < niters:
        logger.debug(f'Iteration # {s} ..')
        # Calculate number of elements in each tri-cluster
        nel_row_clusters = da.bincount(row_clusters, minlength=nclusters_row)
        nel_col_clusters = da.bincount(col_clusters, minlength=nclusters_col)
        nel_bnd_clusters = da.bincount(bnd_clusters, minlength=nclusters_bnd)
        logger.debug(
            'num of populated clusters: row {}, col {}, bnd {}'.format(
                da.sum(nel_row_clusters > 0).compute(),
                da.sum(nel_col_clusters > 0).compute(),
                da.sum(nel_bnd_clusters > 0).compute()))
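        # outer products of the per-axis counts give the number of
        # elements in each (band, row, column) tri-cluster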
        nel_clusters = da.einsum('i,j->ij', nel_row_clusters, nel_col_clusters)
        nel_clusters = da.einsum('i,jk->ijk', nel_bnd_clusters, nel_clusters)

        # calculate tri-cluster averages (epsilon takes care of empty clusters)
        # first sum values in each tri-cluster ..
        TriCavg = da.einsum('ij,ilm->jlm', B, Z)  # .. along band axis
        TriCavg = da.einsum('ij,kim->kjm', R, TriCavg)  # .. along row axis
        TriCavg = da.einsum('ij,kli->klj', C, TriCavg)  # .. along col axis
        # finally divide by number of elements in each tri-cluster
        TriCavg = (TriCavg + Gavg * epsilon) / (nel_clusters + epsilon)

        # unpack tri-cluster averages ..
        avg_unpck = da.einsum('ij,jkl->ikl', B, TriCavg)  # .. along band axis
        avg_unpck = da.einsum('ij,klj->kli', C, avg_unpck)  # .. along col axis
        # use these for the row cluster assignment
        idx = (1, 0, 2)
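        # Z -> (m, d, n), averages -> (nclusters_row, d, n): distances
        # then come out with shape (m, nclusters_row)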
        d_row = _distance(Z.transpose(idx), avg_unpck.transpose(idx), epsilon)
        row_clusters = da.argmin(d_row, axis=1)
        R = _setup_cluster_matrix(nclusters_row, row_clusters)

        # unpack tri-cluster averages ..
        avg_unpck = da.einsum('ij,jkl->ikl', B, TriCavg)  # .. along band axis
        avg_unpck = da.einsum('ij,kjl->kil', R, avg_unpck)  # .. along row axis
        # use these for the col cluster assignment
        idx = (2, 0, 1)
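        # Z -> (n, d, m), averages -> (nclusters_col, d, m): distances
        # then come out with shape (n, nclusters_col)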
        d_col = _distance(Z.transpose(idx), avg_unpck.transpose(idx), epsilon)
        col_clusters = da.argmin(d_col, axis=1)
        C = _setup_cluster_matrix(nclusters_col, col_clusters)

        # unpack tri-cluster averages ..
        avg_unpck = da.einsum('ij,kjl->kil', R, TriCavg)  # .. along row axis
        avg_unpck = da.einsum('ij,klj->kli', C, avg_unpck)  # .. along col axis
        # use these for the band cluster assignment
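        # no transpose needed: the band axis is already leading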
        d_bnd = _distance(Z, avg_unpck, epsilon)
        bnd_clusters = da.argmin(d_bnd, axis=1)
        B = _setup_cluster_matrix(nclusters_bnd, bnd_clusters)

        # Error value (computed from the band component only)
        old_e = e
        minvals = da.min(d_bnd, axis=1)
        # power 1 divergence, power 2 euclidean
        e = da.sum(da.power(minvals, 1))
        row_clusters, R, col_clusters, C, bnd_clusters, B, e = client.persist(
            [row_clusters, R, col_clusters, C, bnd_clusters, B, e])
        e = e.compute()
        logger.debug(f'Error = {e:+.15e}, dE = {e - old_e:+.15e}')
        converged = abs(e - old_e) < errobj
        s = s + 1
    if converged:
        logger.debug(f'Triclustering converged in {s} iterations')
    else:
        logger.debug(f'Triclustering not converged in {s} iterations')
    return converged, s, row_clusters, col_clusters, bnd_clusters, e
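
The module-level helpers _initialize_clusters, _setup_cluster_matrix and
_distance are not part of this listing. Below is a minimal sketch of what
they could look like, assuming a random balanced initialization, boolean
one-hot occupation matrices, and an I-divergence distance (consistent with
the docstring note that epsilon avoids zero arguments in the log). Names
and details are illustrative, not the package's actual code.

import numpy as np
import dask.array as da


def _initialize_clusters(n_el, n_clusters, chunks=None):
    # Hypothetical sketch: random, roughly balanced initial labels
    labels = np.random.permutation(np.arange(n_el) % n_clusters)
    return da.from_array(labels, chunks=chunks or n_el)


def _setup_cluster_matrix(n_clusters, cluster_idx):
    # Hypothetical sketch: boolean one-hot occupation matrix with
    # shape (n_elements, n_clusters)
    return da.eye(n_clusters, dtype=bool)[cluster_idx]


def _distance(Z, Y, epsilon):
    # Hypothetical I-divergence sketch for the 3D (tri-clustering) case:
    # Z is (n_el, a, b), Y is (n_clusters, a, b); returns (n_el, n_clusters)
    Y = Y + epsilon
    return Y.sum(axis=(1, 2)) - da.einsum('iab,kab->ik', Z, da.log(Y))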
Example #2

# This is a method of a compositor class; the surrounding module is assumed
# to provide:
#   import dask
#   import dask.array as da
#   import numpy as np

    def __call__(self, datasets, **info):
        """Create the composite DataArray object for ERFDNB."""
        if len(datasets) != 4:
            raise ValueError("Expected 4 datasets, got %d" % (len(datasets), ))

        from scipy.special import erf
        dnb_data = datasets[0]
        sza_data = datasets[1]
        lza_data = datasets[2]
        output_dataset = dnb_data.where(~(dnb_data.isnull()
                                          | sza_data.isnull()))
        # this algorithm assumes units of "W cm-2 sr-1" so if there are other
        # units we need to adjust for that
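        # (1 W cm-2 sr-1 == 1e4 W m-2 sr-1, hence the factor below)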
        if dnb_data.attrs.get("units", "W m-2 sr-1") == "W m-2 sr-1":
            unit_factor = 10000.
        else:
            unit_factor = 1.

        # convert to decimal instead of %
        moon_illum_fraction = da.mean(datasets[3].data) * 0.01

        # From Steve Miller and Curtis Seaman
        # maxval = 10.^(-1.7 - (((2.65+moon_factor1+moon_factor2))*(1+erf((solar_zenith-95.)/(5.*sqrt(2.0))))))
        # minval = 10.^(-4. - ((2.95+moon_factor2)*(1+erf((solar_zenith-95.)/(5.*sqrt(2.0))))))
        # scaled_radiance = (radiance - minval) / (maxval - minval)
        # radiance = sqrt(scaled_radiance)

        # Version 2: Update from Curtis Seaman
        # maxval = 10.^(-1.7 - (((2.65+moon_factor1+moon_factor2))*(1+erf((solar_zenith-95.)/(5.*sqrt(2.0))))))
        # minval = 10.^(-4. - ((2.95+moon_factor2)*(1+erf((solar_zenith-95.)/(5.*sqrt(2.0))))))
        # saturated_pixels = where(radiance gt maxval, nsatpx)
        # saturation_pct = float(nsatpx)/float(n_elements(radiance))
        # print, 'Saturation (%) = ', saturation_pct
        #
        # while saturation_pct gt 0.005 do begin
        #   maxval = maxval*1.1
        #   saturated_pixels = where(radiance gt maxval, nsatpx)
        #   saturation_pct = float(nsatpx)/float(n_elements(radiance))
        #   print, saturation_pct
        # endwhile
        #
        # scaled_radiance = (radiance - minval) / (maxval - minval)
        # radiance = sqrt(scaled_radiance)

        moon_factor1 = 0.7 * (1.0 - moon_illum_fraction)
        moon_factor2 = 0.0022 * lza_data.data
        erf_portion = 1 + erf((sza_data.data - 95.0) / (5.0 * np.sqrt(2.0)))
        max_val = da.power(
            10, -1.7 - (2.65 + moon_factor1 + moon_factor2) * erf_portion
        ) * unit_factor
        min_val = da.power(
            10, -4.0 - (2.95 + moon_factor2) * erf_portion
        ) * unit_factor

        # Update from Curtis Seaman, increase max radiance curve until less
        # than 0.5% is saturated
        if self.saturation_correction:
            delayed = dask.delayed(self._saturation_correction)(
                output_dataset.data, unit_factor, min_val, max_val)
            output_dataset.data = da.from_delayed(delayed,
                                                  output_dataset.shape,
                                                  output_dataset.dtype)
            output_dataset.data = output_dataset.data.rechunk(
                dnb_data.data.chunks)
        else:
            inner_sqrt = (output_dataset - min_val) / (max_val - min_val)
            # clip negative values to 0 before the sqrt
            inner_sqrt = inner_sqrt.where(inner_sqrt > 0, 0)
            output_dataset.data = np.sqrt(inner_sqrt).data

        info = dnb_data.attrs.copy()
        info.update(self.attrs)
        info["standard_name"] = "equalized_radiance"
        info["mode"] = "L"
        output_dataset.attrs = info
        return output_dataset
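
As a quick numeric check of the Miller/Seaman curves above, here is a
standalone sketch (plain NumPy/SciPy; the input values are arbitrary) that
evaluates the min/max radiance bounds for a single pixel:

import numpy as np
from scipy.special import erf

# arbitrary sample inputs: solar zenith 100 deg, lunar zenith 30 deg,
# moon illuminated fraction 80% (already converted from % to a fraction)
sza, lza, moon_frac = 100.0, 30.0, 0.8
unit_factor = 10000.  # data assumed to be in "W m-2 sr-1"

moon_factor1 = 0.7 * (1.0 - moon_frac)
moon_factor2 = 0.0022 * lza
erf_portion = 1 + erf((sza - 95.0) / (5.0 * np.sqrt(2.0)))
max_val = 10 ** (-1.7 - (2.65 + moon_factor1 + moon_factor2) * erf_portion) * unit_factor
min_val = 10 ** (-4.0 - (2.95 + moon_factor2) * erf_portion) * unit_factor

# a radiance is then scaled as sqrt((radiance - min_val) / (max_val - min_val)),
# with negative arguments clipped to zero before the square root
print(f"min_val={min_val:.3e}, max_val={max_val:.3e}")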
Example #3

import logging

import dask.array as da
from dask.distributed import get_client, rejoin, secede

logger = logging.getLogger(__name__)

# _initialize_clusters and _setup_cluster_matrix are the module-level helpers
# sketched after Example #1; a 2D _distance variant is sketched after this
# example.

def coclustering(Z,
                 nclusters_row,
                 nclusters_col,
                 errobj,
                 niters,
                 epsilon,
                 col_clusters_init=None,
                 row_clusters_init=None,
                 run_on_worker=False):
    """
    Run the co-clustering, Dask implementation

    :param Z: m x n data matrix
    :param nclusters_row: number of row clusters
    :param nclusters_col: number of column clusters
    :param errobj: convergence threshold for the objective function
    :param niters: maximum number of iterations
    :param epsilon: numerical parameter, avoids zero arguments in log
    :param row_clusters_init: initial row cluster assignment
    :param col_clusters_init: initial column cluster assignment
    :param run_on_worker: whether the function is submitted to a Dask worker
    :return: has converged, number of iterations performed, final row and
    column clustering, and the error value
    """
    client = get_client()

    Z = da.array(Z) if not isinstance(Z, da.Array) else Z

    [m, n] = Z.shape
    row_chunks, col_chunks = Z.chunksize

    row_clusters = da.array(row_clusters_init) \
        if row_clusters_init is not None \
        else _initialize_clusters(m, nclusters_row, chunks=row_chunks)
    col_clusters = da.array(col_clusters_init) \
        if col_clusters_init is not None \
        else _initialize_clusters(n, nclusters_col, chunks=col_chunks)
    R = _setup_cluster_matrix(nclusters_row, row_clusters)
    C = _setup_cluster_matrix(nclusters_col, col_clusters)

    e, old_e = 2 * errobj, 0
    s = 0
    converged = False

    Gavg = Z.mean()

    while not converged and s < niters:
        logger.debug(f'Iteration # {s} ..')
        # Calculate cluster based averages
        # nel_clusters is a matrix with the number of elements per co-cluster
        # originally computed as:  da.dot(da.dot(R.T, da.ones((m, n))), C)
        nel_row_clusters = da.bincount(row_clusters, minlength=nclusters_row)
        nel_col_clusters = da.bincount(col_clusters, minlength=nclusters_col)
        logger.debug('num of populated clusters: row {}, col {}'.format(
            da.sum(nel_row_clusters > 0).compute(),
            da.sum(nel_col_clusters > 0).compute()))
        nel_clusters = da.outer(nel_row_clusters, nel_col_clusters)
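        # co-cluster averages: per-cluster sums of Z over element counts,
        # with epsilon pulling empty clusters toward the global average Gavg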
        CoCavg = (da.matmul(da.matmul(R.T, Z), C) + Gavg * epsilon) / \
                 (nel_clusters + epsilon)

        # Calculate distance based on row approximation
        d_row = _distance(Z, da.matmul(C, CoCavg.T), epsilon)
        # Assign to best row cluster
        row_clusters = da.argmin(d_row, axis=1)
        R = _setup_cluster_matrix(nclusters_row, row_clusters)

        # Calculate distance based on column approximation
        d_col = _distance(Z.T, da.matmul(R, CoCavg), epsilon)
        # Assign to best column cluster
        col_clusters = da.argmin(d_col, axis=1)
        C = _setup_cluster_matrix(nclusters_col, col_clusters)

        # Error value (computed from the column component only)
        old_e = e
        minvals = da.min(d_col, axis=1)
        # power 1 divergence, power 2 euclidean
        e = da.sum(da.power(minvals, 1))
        row_clusters, R, col_clusters, C, e = client.persist(
            [row_clusters, R, col_clusters, C, e])
        if run_on_worker:
            # this is workaround for e.compute() for a function that runs
            # on a worker with multiple threads
            # https://github.com/dask/distributed/issues/3827
            e = client.compute(e)
            secede()
            e = e.result()
            rejoin()
        else:
            e = e.compute()
        logger.debug(f'Error = {e:+.15e}, dE = {e - old_e:+.15e}')
        converged = abs(e - old_e) < errobj
        s = s + 1
    if converged:
        logger.debug(f'Coclustering converged in {s} iterations')
    else:
        logger.debug(f'Coclustering not converged in {s} iterations')
    return converged, s, row_clusters, col_clusters, e
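
A minimal end-to-end usage sketch for coclustering, assuming a local
dask.distributed client and the helper sketches shown after Example #1,
plus a hypothetical 2D variant of _distance:

import dask.array as da
from dask.distributed import Client


def _distance(Z, Y, epsilon):
    # Hypothetical 2D I-divergence sketch: Z is (m, n), Y is (n, k);
    # returns an (m, k) matrix of distances to each cluster profile
    Y = Y + epsilon
    return Y.sum(axis=0, keepdims=True) - da.matmul(Z, da.log(Y))


if __name__ == '__main__':
    client = Client()  # get_client() inside coclustering picks this up
    Z = da.random.random((500, 400), chunks=(250, 200)) + 1.0  # positive data
    converged, n_iter, row_clusters, col_clusters, error = coclustering(
        Z, nclusters_row=5, nclusters_col=4,
        errobj=1e-6, niters=100, epsilon=1e-8,
    )
    print(converged, n_iter, error)
    client.close()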