def test_twoNN(data_Fig1, output_xy_test1):
    """Check the twoNearestNeighbors defaults and its dimension estimate.

    The estimator is built with ``frac=0.8`` and ``n_jobs=-1``; the
    hyper-parameters that are not passed explicitly are compared with their
    expected default values. After fitting on ``data_Fig1`` the estimated
    intrinsic dimension must round to 2.

    Both arguments are fixtures supplied by the test suite;
    ``output_xy_test1`` is only needed by the benchmark comparison that is
    currently disabled below.
    """
    estimator = twoNearestNeighbors(frac=0.8, n_jobs=-1)

    # Hyper-parameters left untouched by the constructor call above.
    expected_defaults = (
        ("blockAn", True),
        ("block_ratio", 20),
        ("metric", "euclidean"),
    )
    for param_name, expected_value in expected_defaults:
        assert getattr(estimator, param_name) == expected_value
    #assert estimator.frac == 1

    estimator.fit(data_Fig1)
    assert hasattr(estimator, 'is_fitted_')

    benchmark = output_xy_test1
    # Disabled element-wise comparison against the benchmark values:
    #npt.assert_almost_equal(estimator.x_, benchmark['x'], decimal=5)
    #npt.assert_almost_equal(estimator.y_, benchmark['y'], decimal=5)

    estimated_dim = int(round(estimator.dim_))
    assert estimated_dim == 2
def fit(self, X, y=None):
    """Fit the PAk estimator on the data.

    Parameters
    ----------
    X : array [n_samples, n_samples] if metric == ``precomputed``, or,
        [n_samples, n_features] otherwise
        The input samples.

    y : Ignored
        Not used, present here for API consistency by convention.

    Returns
    -------
    self : object
        Returns self, with ``dim_``, ``k_max_``, ``distances_``,
        ``indices_``, ``densities_``, ``err_densities_``, ``k_hat_``,
        ``dc_`` and ``is_fitted_`` set.

    Raises
    ------
    ValueError
        If ``dim_algo == "twoNN"`` and ``block_ratio`` is not smaller than
        the number of samples; if ``k_max`` is below 3; or if the
        neighbourhood size actually used to query the neighbours (``k_max``
        after being reduced to fit the sample size) is below 3.
    """
    # Input validation
    X = check_array(X, accept_sparse=['csr', 'csc', 'coo'],
                    dtype=np.float64, ensure_min_samples=2)
    n_samples = X.shape[0]

    # Intrinsic dimension: a user-supplied value wins; otherwise it is taken
    # from the number of columns ("auto") or estimated with twoNN.
    # Any other ``dim_algo`` leaves ``dim_`` equal to ``dim``.
    self.dim_ = self.dim
    if not self.dim:
        if self.dim_algo == "auto":
            self.dim_ = X.shape[1]
        elif self.dim_algo == "twoNN":
            if self.block_ratio >= n_samples:
                raise ValueError(
                    "block_ratio is larger than the sample size, the "
                    "minimum size for block analysis would be zero. "
                    "Please set a lower value.")
            self.dim_ = twoNearestNeighbors(blockAn=self.blockAn,
                                            block_ratio=self.block_ratio,
                                            metric=self.metric,
                                            frac=self.frac,
                                            n_jobs=self.n_jobs).fit(X).dim_

    k_max_error = ("k_max is below 3, the minimum value required for "
                   "statistical significance. Please use a larger datasets.")

    self.k_max_ = self.k_max
    # k_max_ + 1 neighbours are requested below (each point counts itself),
    # so k_max must be strictly smaller than the number of samples.
    if self.k_max >= n_samples:
        # the following value is chosen to better address very small data set
        self.k_max_ = int(n_samples * 0.4)
    if self.k_max < 3:
        raise ValueError(k_max_error)

    # check if NN matrix is precomputed:
    if self.nn_distances is not None and self.nn_indices is not None:
        # overwrite the self.k_max_ with the width of the supplied matrix
        self.k_max_ = self.nn_distances.shape[1] - 1
        self.distances_ = self.nn_distances
        self.indices_ = self.nn_indices
    else:
        # The clamped value, not only the requested one, must be large
        # enough: a very small data set can push it below the minimum.
        if self.k_max_ < 3:
            raise ValueError(k_max_error)
        # A precomputed distance matrix is searched with the brute-force
        # algorithm; otherwise the choice is left to scikit-learn.
        algorithm = "brute" if self.metric == "precomputed" else "auto"
        nbrs = NearestNeighbors(
            n_neighbors=self.k_max_ + 1,  # The point i is counted in its neighborhood
            algorithm=algorithm,
            metric=self.metric,
            n_jobs=self.n_jobs).fit(X)
        self.distances_, self.indices_ = nbrs.kneighbors(X)

    (self.densities_, self.err_densities_,
     self.k_hat_, self.dc_) = _PointAdaptive_kNN(self.distances_,
                                                 self.indices_,
                                                 k_max=self.k_max_,
                                                 D_thr=self.D_thr,
                                                 dim=self.dim_)
    self.is_fitted_ = True
    return self
def fit(self, X, y=None):
    """Fit the DPA clustering on the data.

    Parameters
    ----------
    X : array [n_samples, n_samples] if metric == ``precomputed``, or,
        [n_samples, n_features] otherwise
        The input samples. Similarities / affinities between
        instances if ``affinity='precomputed'``.

    y : Ignored
        Not used, present here for API consistency by convention.

    Returns
    -------
    self : object
        Returns self, with ``labels_``, ``halos_``, ``topography_``,
        ``g_``, ``centers_``, ``densities_``, ``err_densities_``,
        ``k_hat_``, ``distances_``, ``indices_``, ``k_max_``, ``dim_``
        and ``is_fitted_`` set.

    Raises
    ------
    ValueError
        If ``dim_algo == "twoNN"`` and ``block_ratio`` is not smaller than
        the number of samples, or if no densities are supplied and
        ``density_algo`` is not ``"PAk"`` (the only implemented option).
    """
    # Input validation
    X = check_array(X, accept_sparse=['csr', 'csc', 'coo'],
                    dtype=np.float64, ensure_min_samples=2)
    n_samples = X.shape[0]

    allow_squared = self.affinity in [
        "precomputed", "precomputed_nearest_neighbors"
    ]
    if X.shape[0] == X.shape[1] and not allow_squared:
        warnings.warn("The DPA clustering API has changed. ``fit`` "
                      "now constructs an affinity matrix from data. To use"
                      " a custom affinity matrix, "
                      "set ``affinity=precomputed``.")

    self.k_max_ = self.k_max

    # Intrinsic dimension: a user-supplied value wins; otherwise it is taken
    # from the number of columns ("auto") or estimated with twoNN.
    # Any other ``dim_algo`` leaves ``dim_`` equal to ``dim``.
    self.dim_ = self.dim
    if not self.dim:
        if self.dim_algo == "auto":
            self.dim_ = X.shape[1]
        elif self.dim_algo == "twoNN":
            if self.block_ratio >= n_samples:
                raise ValueError(
                    "block_ratio is larger than the sample size, the "
                    "minimum size for block analysis would be zero. "
                    "Please set a value lower than " + str(n_samples))
            self.dim_ = twoNearestNeighbors(blockAn=self.blockAn,
                                            block_ratio=self.block_ratio,
                                            metric=self.metric,
                                            frac=self.frac,
                                            n_jobs=self.n_jobs).fit(X).dim_

    self.densities_ = self.densities
    self.err_densities_ = self.err_densities
    self.k_hat_ = self.k_hat

    densities_given = (self.densities_ is not None
                       and self.err_densities_ is not None
                       and self.k_hat_ is not None)
    nn_given = (self.nn_distances is not None
                and self.nn_indices is not None)

    if densities_given:
        # If densities, uncertainties and k_hat are provided as input,
        # compute only the matrix of nearest neighbor.
        self.k_max_ = max(self.k_hat_)
        if nn_given:
            self.distances_ = self.nn_distances
            self.indices_ = self.nn_indices
        else:
            # A precomputed distance matrix is searched with the brute-force
            # algorithm; otherwise the choice is left to scikit-learn.
            algorithm = "brute" if self.metric == "precomputed" else "auto"
            nbrs = NearestNeighbors(
                n_neighbors=self.k_max_ + 1,  # The point i is counted in its neighborhood
                algorithm=algorithm,
                metric=self.metric,
                n_jobs=self.n_jobs).fit(X)
            self.distances_, self.indices_ = nbrs.kneighbors(X)
    elif self.density_algo == "PAk":
        pak_kwargs = dict(k_max=self.k_max_,
                          D_thr=self.D_thr,
                          metric=self.metric,
                          dim_algo=self.dim_algo,
                          blockAn=self.blockAn,
                          block_ratio=self.block_ratio,
                          frac=self.frac,
                          dim=self.dim_,
                          n_jobs=self.n_jobs)
        if nn_given:
            # The precomputed neighbour matrix fixes the neighbourhood size
            # and is forwarded so that it is not computed again.
            self.k_max_ = self.nn_distances.shape[1] - 1
            pak_kwargs.update(k_max=self.k_max_,
                              nn_distances=self.nn_distances,
                              nn_indices=self.nn_indices)
        pak = PointAdaptive_kNN(**pak_kwargs).fit(X)
        self.distances_ = pak.distances_
        self.indices_ = pak.indices_
        self.densities_ = pak.densities_
        self.err_densities_ = pak.err_densities_
        self.k_hat_ = pak.k_hat_
        self.k_max_ = max(self.k_hat_)
    else:
        # TODO: implement option for kNN
        # Without densities and neighbours the clustering step below cannot
        # run, so fail here with a message that names the actual cause.
        raise ValueError(
            "density_algo=%r is not supported: only 'PAk' is implemented, "
            "unless densities, err_densities and k_hat are all provided."
            % (self.density_algo,))

    (self.labels_, self.halos_, self.topography_,
     self.g_, self.centers_) = _DensityPeakAdvanced(self.densities_,
                                                    self.err_densities_,
                                                    self.k_hat_,
                                                    self.distances_,
                                                    self.indices_,
                                                    self.Z)
    self.is_fitted_ = True
    return self
# Difference between the `end` and `start` values set earlier in the script
# (presumably the wall-clock time of the previous fit — confirm).
print(end - start)

# Bare expression: presumably shown as cell output when run as a notebook.
est.topography_

# Run again with a different Z, without recomputing the neighbors and densities.

# NOTE(review): get_computed_params() is assumed to return the quantities
# computed by the previous fit so they can be passed back as parameters — confirm.
params = est.get_computed_params()
est.set_params(**params)
est.set_params(Z=1)
start = time.time()
est.fit(data_F1)
end = time.time()
# Wall-clock time, in seconds, of the second fit.
print(end - start)

# The PAk and twoNN estimators can be used independently from the DPA clustering method.

# +
from Pipeline import PAk
from Pipeline import twoNN

rho_est = PAk.PointAdaptive_kNN()
d_est = twoNN.twoNearestNeighbors()

# +
# fit() returns the fitted estimator, so its attributes can be read directly.
results = rho_est.fit(data_F1)
print(results.densities_[:10])

dim = d_est.fit(data_F1).dim_
print(dim)
# -