Esempio n. 1
0
    def fit(self, X, y=None):
        """Fit the DiviK model to ``X`` and store the resulting segmentation.

        Parameters
        ----------
        X : array-like or sparse matrix, shape=(n_samples, n_features)
            Training instances to cluster. Earlier documentation states that
            the data is converted to C ordering (copying it when it is not
            C-contiguous); that conversion is not performed in this method
            itself, so it presumably happens further down the call chain.
        y : Ignored
            Not used, present here for API consistency by convention.

        Returns
        -------
        self
            The estimator, with ``result_``, ``labels_``, ``paths_``,
            ``reverse_paths_``, ``filters_``, ``centroids_``, ``depth_`` and
            ``n_clusters_`` populated.

        Raises
        ------
        ValueError
            If ``X`` contains any NaN value.
        """
        contains_nan = np.isnan(X).any()
        if contains_nan:
            raise ValueError("NaN values are not supported.")

        # The progress reporter is sized by the number of rows of X;
        # ``context_if`` presumably only creates it when ``verbose`` is set.
        with context_if(self.verbose, tqdm.tqdm, total=X.shape[0],
                        file=sys.stdout, smoothing=0) as bar:
            self.result_ = self._divik(X, bar)

        unsplit = self.result_ is None

        if unsplit:
            # No result: all samples share label 0 with the single path (0,).
            self.labels_ = np.zeros((X.shape[0], ), dtype=int)
            self.paths_ = {0: (0, )}
        else:
            merged = summary.merged_partition(self.result_, return_paths=True)
            self.labels_, self.paths_ = merged

        # Invert label -> path into path -> label.
        inverted = {}
        for label, path in self.paths_.items():
            inverted[path] = label
        self.reverse_paths_ = inverted

        if unsplit:
            # A single all-True row, one entry per column of X.
            self.filters_ = np.ones([1, X.shape[1]], dtype=bool)
        else:
            per_path = [self._get_filter(path) for path in self.reverse_paths_]
            self.filters_ = np.array(per_path, dtype=bool)

        # Mean of the rows of X within each label, labels in sorted order.
        by_label = pd.DataFrame(X).groupby(self.labels_, sort=True)
        self.centroids_ = by_label.mean().values

        self.depth_ = summary.depth(self.result_)
        self.n_clusters_ = summary.total_number_of_clusters(self.result_)

        return self
 def test_without_rejection_updates_merged_and_nothing_else(self):
     """Check ``reject_split`` with argument 0 keeps score, depth and merge.

     The best clustering score and the tree depth must equal those of
     ``DUMMY_RESULT``, and ``merged`` must equal the merged partition of
     ``DUMMY_RESULT``. Going by the test name, the value 0 is presumably a
     threshold that rejects no split.
     """
     outcome = sm.reject_split(DUMMY_RESULT, 0)

     score_after = outcome.clustering.best_score_
     score_before = DUMMY_RESULT.clustering.best_score_
     self.assertEqual(score_after, score_before)

     depth_after = sm.depth(outcome)
     self.assertEqual(depth_after, sm.depth(DUMMY_RESULT))

     merged_after = outcome.merged
     npt.assert_equal(merged_after, sm.merged_partition(DUMMY_RESULT))
 def test_returns_paths_to_partitions(self):
     """Check the label -> path mapping returned with ``return_paths=True``.

     Labels 0, 1 and 4 must map to the paths (0,), (1, 0) and (2, 1)
     respectively, and label 6 must be absent from the mapping.
     """
     # Only the paths are inspected; the partition itself is discarded.
     _, paths = sm.merged_partition(DUMMY_RESULT, return_paths=True)

     expected_paths = {0: (0, ), 1: (1, 0), 4: (2, 1)}
     for label, path in expected_paths.items():
         self.assertEqual(paths[label], path)

     self.assertNotIn(6, paths)
 def test_merges_disjoint_regions(self):
     """Check the merged partition of ``DUMMY_RESULT`` has six regions.

     The distinct labels must be exactly 0..5 and their occurrence counts
     must be 10, 1, 4, 2, 3 and 5, in label order.
     """
     labels = sm.merged_partition(DUMMY_RESULT)
     found = np.unique(labels, return_counts=True)
     regions, sizes = found

     npt.assert_equal([10, 1, 4, 2, 3, 5], sizes)
     npt.assert_equal(np.arange(6), regions)
Esempio n. 5
0
def make_merged(result: Optional[DivikResult]) -> np.ndarray:
    """Stack the merged partitions for successive limits side by side.

    For every ``level`` from 1 up to ``summary.depth(result)`` inclusive,
    ``summary.merged_partition(result, level)`` is computed and reshaped into
    a single column; the columns are then concatenated horizontally, so
    column ``i`` holds the partition obtained with limit ``i + 1``
    (presumably a depth limit -- confirm against ``merged_partition``).

    Parameters
    ----------
    result : Optional[DivikResult]
        Result passed unchanged to ``summary.depth`` and
        ``summary.merged_partition``.

    Returns
    -------
    np.ndarray
        Two-dimensional array with one column per level.
    """
    columns = []
    for level in range(1, summary.depth(result) + 1):
        labels = summary.merged_partition(result, level)
        columns.append(labels.reshape(-1, 1))
    return np.hstack(columns)
Esempio n. 6
0
 def test_returns_paths_to_partitions(self):
     """Check the label -> path mapping returned with ``return_paths=True``.

     Labels 0, 1 and 4 must map to the paths (0,), (1, 0) and (2, 1)
     respectively, and label 6 must be absent from the mapping.
     """
     # Only the paths are inspected; the partition itself is discarded.
     _, paths = sm.merged_partition(DUMMY_RESULT, return_paths=True)

     known = ((0, (0,)), (1, (1, 0)), (4, (2, 1)))
     for label, path in known:
         assert paths[label] == path

     assert 6 not in paths