Example #1
0
def test_twoNN(data_Fig1, output_xy_test1):
    """Check twoNearestNeighbors constructor values and the fitted dimension.

    Parameters
    ----------
    data_Fig1 : fixture
        Data set passed to ``fit``.
    output_xy_test1 : fixture
        Benchmark values; currently unused because the comparison against
        them is disabled (see the TODO below).
    """
    est = twoNearestNeighbors(frac=0.8, n_jobs=-1)
    # These three parameters are not passed above, so the assertions pin the
    # values the constructor assigns on its own.
    assert est.blockAn is True
    assert est.block_ratio == 20
    assert est.metric == "euclidean"

    est.fit(data_Fig1)
    assert hasattr(est, 'is_fitted_')

    # TODO: re-enable the comparison of est.x_ / est.y_ against the 'x' and
    # 'y' columns of output_xy_test1 (npt.assert_almost_equal, decimal=5).

    # The estimated dimension must round to 2 for this data set.
    assert int(round(est.dim_)) == 2
Example #2
0
    def fit(self, X, y=None):
        """Fit the PAk estimator on the data.

        Parameters
        ----------
        X : array [n_samples, n_samples] if metric == ``precomputed``, or,
            [n_samples, n_features] otherwise
            The input samples.

        y : Ignored
            Not used, present here for API consistency by convention.

        Returns
        -------
        self : object
            Returns self.

        Raises
        ------
        ValueError
            If ``dim_algo == "twoNN"`` and ``block_ratio`` is not smaller
            than the number of samples, or if the number of neighbors to
            use (``k_max``, or its value reduced to fit a small data set)
            is below 3.
        """
        # Input validation
        X = check_array(X,
                        accept_sparse=['csr', 'csc', 'coo'],
                        dtype=np.float64,
                        ensure_min_samples=2)
        n_samples = X.shape[0]

        # Intrinsic dimension: keep the user-supplied value when it is truthy,
        # otherwise derive it according to ``dim_algo``.
        self.dim_ = self.dim
        if not self.dim:
            if self.dim_algo == "auto":
                self.dim_ = X.shape[1]
            elif self.dim_algo == "twoNN":
                if self.block_ratio >= n_samples:
                    raise ValueError(
                        "block_ratio is larger than the sample size, the "
                        "minimum size for block analysis would be zero. "
                        "Please set a lower value.")
                self.dim_ = twoNearestNeighbors(blockAn=self.blockAn,
                                                block_ratio=self.block_ratio,
                                                metric=self.metric,
                                                frac=self.frac,
                                                n_jobs=self.n_jobs).fit(X).dim_
            # Any other ``dim_algo`` leaves ``dim_`` at the user-supplied value.

        k_max_error = (
            "k_max is below 3, the minimum value required for "
            "statistical significance. Please use a larger datasets.")
        if self.k_max < 3:
            raise ValueError(k_max_error)
        self.k_max_ = self.k_max

        # check if NN matrix is precomputed:
        if self.nn_distances is not None and self.nn_indices is not None:
            # overwrite the self.k_max_ with the width of the supplied matrix
            # (minus one column; presumably the point itself, as in the
            # ``k_max_ + 1`` neighbors requested below)
            self.k_max_ = self.nn_distances.shape[1] - 1
            self.distances_ = self.nn_distances
            self.indices_ = self.nn_indices
        else:
            # ``k_max_ + 1`` neighbors are requested below, so k_max_ must be
            # strictly smaller than the number of samples.
            if self.k_max_ >= n_samples:
                # the following value is chosen to better address very small data set
                self.k_max_ = int(n_samples * 0.4)
                # The reduced value must still satisfy the minimum; this is
                # the case the "use a larger datasets" message refers to.
                if self.k_max_ < 3:
                    raise ValueError(k_max_error)
            nbrs = NearestNeighbors(
                n_neighbors=self.k_max_ + 1,  # The point i is counted in its neighborhood
                algorithm="brute" if self.metric == "precomputed" else "auto",
                metric=self.metric,
                n_jobs=self.n_jobs).fit(X)
            self.distances_, self.indices_ = nbrs.kneighbors(X)

        self.densities_, self.err_densities_, self.k_hat_, self.dc_ = _PointAdaptive_kNN(
            self.distances_,
            self.indices_,
            k_max=self.k_max_,
            D_thr=self.D_thr,
            dim=self.dim_)
        self.is_fitted_ = True

        return self
Example #3
0
    def fit(self, X, y=None):
        """Fit the DPA clustering on the data.

        Parameters
        ----------
        X : array [n_samples, n_samples] if metric == ``precomputed``, or,
            [n_samples, n_features] otherwise
            The input samples. Similarities / affinities between
            instances if ``affinity='precomputed'``.

        y : Ignored
            Not used, present here for API consistency by convention.

        Returns
        -------
        self : object
            Returns self.

        Raises
        ------
        ValueError
            If ``dim_algo == "twoNN"`` and ``block_ratio`` is not smaller
            than the number of samples, or if ``densities``,
            ``err_densities`` and ``k_hat`` are not all provided and
            ``density_algo`` is not ``"PAk"``.
        """
        # Input validation
        X = check_array(X,
                        accept_sparse=['csr', 'csc', 'coo'],
                        dtype=np.float64,
                        ensure_min_samples=2)
        n_samples = X.shape[0]

        allow_squared = self.affinity in [
            "precomputed", "precomputed_nearest_neighbors"
        ]
        if n_samples == X.shape[1] and not allow_squared:
            warnings.warn("The DPA clustering API has changed. ``fit`` "
                          "now constructs an affinity matrix from data. To use"
                          " a custom affinity matrix, "
                          "set ``affinity=precomputed``.")

        self.k_max_ = self.k_max

        # Intrinsic dimension: keep the user-supplied value when it is truthy,
        # otherwise derive it according to ``dim_algo``.
        self.dim_ = self.dim
        if not self.dim:
            if self.dim_algo == "auto":
                self.dim_ = X.shape[1]
            elif self.dim_algo == "twoNN":
                if self.block_ratio >= n_samples:
                    raise ValueError(
                        "block_ratio is larger than the sample size, the "
                        "minimum size for block analysis would be zero. "
                        "Please set a value lower than " + str(n_samples))
                self.dim_ = twoNearestNeighbors(blockAn=self.blockAn,
                                                block_ratio=self.block_ratio,
                                                metric=self.metric,
                                                frac=self.frac,
                                                n_jobs=self.n_jobs).fit(X).dim_
            # Any other ``dim_algo`` leaves ``dim_`` at the user-supplied value.

        self.densities_ = self.densities
        self.err_densities_ = self.err_densities
        self.k_hat_ = self.k_hat

        has_nn = self.nn_distances is not None and self.nn_indices is not None
        has_densities = (self.densities_ is not None
                         and self.err_densities_ is not None
                         and self.k_hat_ is not None)

        if has_densities:
            # If densities, uncertainties and k_hat are provided as input,
            # compute only the matrix of nearest neighbors (or reuse the
            # precomputed one).
            self.k_max_ = max(self.k_hat_)
            if has_nn:
                self.distances_ = self.nn_distances
                self.indices_ = self.nn_indices
            else:
                nbrs = NearestNeighbors(
                    n_neighbors=self.k_max_ + 1,  # The point i is counted in its neighborhood
                    algorithm="brute" if self.metric == "precomputed" else "auto",
                    metric=self.metric,
                    n_jobs=self.n_jobs).fit(X)
                self.distances_, self.indices_ = nbrs.kneighbors(X)
        elif self.density_algo == "PAk":
            pak_kwargs = dict(D_thr=self.D_thr,
                              metric=self.metric,
                              dim_algo=self.dim_algo,
                              blockAn=self.blockAn,
                              block_ratio=self.block_ratio,
                              frac=self.frac,
                              dim=self.dim_,
                              n_jobs=self.n_jobs)
            if has_nn:
                # A precomputed neighbor matrix fixes k_max_ through its
                # width and is forwarded to the density estimator as is.
                self.k_max_ = self.nn_distances.shape[1] - 1
                pak_kwargs.update(nn_distances=self.nn_distances,
                                  nn_indices=self.nn_indices)
            pak = PointAdaptive_kNN(k_max=self.k_max_, **pak_kwargs).fit(X)
            self.distances_ = pak.distances_
            self.indices_ = pak.indices_
            self.densities_ = pak.densities_
            self.err_densities_ = pak.err_densities_
            self.k_hat_ = pak.k_hat_
            self.k_max_ = max(self.k_hat_)
        else:
            # TODO: implement option for kNN
            # Fail early: without densities and neighbors the clustering
            # step below has nothing valid to work on.
            raise ValueError(
                "density_algo={!r} is not supported: only 'PAk' is "
                "implemented, unless densities, err_densities and k_hat "
                "are all provided.".format(self.density_algo))

        self.labels_, self.halos_, self.topography_, self.g_, self.centers_ = _DensityPeakAdvanced(
            self.densities_, self.err_densities_, self.k_hat_, self.distances_,
            self.indices_, self.Z)

        self.is_fitted_ = True

        return self
Example #4
0
# Elapsed time of the preceding timed step; ``start`` and ``end`` are expected
# to be defined before this point in the script.
print(end - start)

# Bare expression: presumably shown as cell output when run as a notebook.
est.topography_

# Running again with a different Z without the need of recomputing the
# neighbors and densities: the parameters returned by get_computed_params()
# are fed back into the estimator before refitting.

params = est.get_computed_params()
est.set_params(**params)
est.set_params(Z=1)
# Time the second fit so it can be compared with the first one.
start = time.time()
est.fit(data_F1)
end = time.time()
print(end - start)

# The PAk and twoNN estimators can be used independently of the DPA clustering method.

# +
from Pipeline import PAk
from Pipeline import twoNN

# Both estimators are built with their constructor defaults.
rho_est = PAk.PointAdaptive_kNN()
d_est = twoNN.twoNearestNeighbors()

# +
# fit() returns the fitted estimator; print the first ten entries of densities_.
results = rho_est.fit(data_F1)
print(results.densities_[:10])

# Dimension estimated by twoNN on the same data.
dim = d_est.fit(data_F1).dim_
print(dim)
# -