Example #1
    def __init__(self, g_matrix, g_l, g_k, g_first_center, g_l_multipler):
        '''

        :param g_matrix: the term matrix.

        :param g_l: the l value used by scalable k-means++; the number of candidate
                    centers drawn in each initialization round.

        :param g_k: the final number of centers.

        :param g_first_center: the number of initial centers.

        :param g_l_multipler: determines how many final centers are drawn from the
                              initial candidate set. If this value is set too high, the
                              amount of computation drops, but the number of candidate
                              centers may fall below k, in which case k-means cannot run;
                              a relatively small value yields better centers at the cost
                              of more computation time.
                              Example: k = 350, l = 0.5
                              Intermediate result: 696 candidate centers selected
                              Time to reduce them down to k (350): roughly 10 min 30 s
        '''
        self.matrix = g_matrix
        self.l = g_l
        self.k = g_k
        self.multipler = g_l_multipler
        self.norm = kmean.row_norms(self.matrix, True)
        self.fc = g_first_center
        self.init_center, self.init_center_index = self.PickFirstCenter()
        self.process_center = self.init_center
        self.process_center_index = self.init_center_index
        self.center_distance = 0
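The docstring above describes the oversampling step of scalable k-means++ (k-means||): each round draws roughly l candidate centers with probability proportional to their squared distance from the current centers, and the oversampled candidate set is then reduced to the final k centers (which is where the 696 -> 350 reduction in the example comes from). The sketch below is only a minimal NumPy illustration of that idea; the names oversample_centers and reduce_to_k are hypothetical and do not reflect this class's actual API, and the reduction step here uses plain (unweighted) scikit-learn KMeans, whereas the real algorithm weights candidates by the number of points they attract.

import numpy as np
from sklearn.cluster import KMeans

def oversample_centers(X, k, l, n_rounds=5, seed=0):
    # k-means|| oversampling (hypothetical helper): each round keeps every point
    # as a candidate with probability ~ l * d^2 / sum(d^2), so ~l picks per round
    rng = np.random.default_rng(seed)
    centers = [X[rng.integers(len(X))]]                     # one random seed center
    for _ in range(n_rounds):
        d2 = ((X[:, None, :] - np.asarray(centers)[None]) ** 2).sum(-1).min(axis=1)
        probs = np.minimum(1.0, l * d2 / d2.sum())
        centers.extend(X[rng.random(len(X)) < probs])
    return np.asarray(centers)

def reduce_to_k(candidates, k, seed=0):
    # reduce the oversampled candidates to the final k centers; this fails,
    # as the docstring warns, if fewer than k candidates were collected
    return KMeans(n_clusters=k, n_init=1, random_state=seed).fit(candidates).cluster_centers_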
Example #2
 def test_fit_given_init(self, X_blobs):
     X_ = X_blobs.compute()
     x_squared_norms = k_means_.row_norms(X_, squared=True)
     rs = np.random.RandomState(0)
     init = k_means_._k_init(X_, 3, x_squared_norms, rs)
     dkkm = DKKMeans(3, init=init, random_state=rs)
     skkm = SKKMeans(3, init=init, random_state=rs, n_init=1)
     dkkm.fit(X_blobs)
     skkm.fit(X_)
     assert_eq(dkkm.inertia_, skkm.inertia_)
Example #3
 def test_fit_given_init(self):
     X, y = sklearn.datasets.make_blobs(n_samples=1000, n_features=4, random_state=1)
     X = da.from_array(X, chunks=500)
     X_ = X.compute()
     x_squared_norms = k_means_.row_norms(X_, squared=True)
     rs = np.random.RandomState(0)
     init = k_means_._k_init(X_, 3, x_squared_norms, rs)
     dkkm = DKKMeans(3, init=init, random_state=0)
     skkm = SKKMeans(3, init=init, random_state=0, n_init=1)
     dkkm.fit(X)
     skkm.fit(X_)
     assert_eq(dkkm.inertia_, skkm.inertia_)
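Examples #2 and #3 exercise the same pattern: build one explicit init array with scikit-learn's private _k_init helper, hand the identical array to both dask-ml's KMeans (DKKMeans) and scikit-learn's KMeans (SKKMeans), and check that the fitted inertia_ values agree. A self-contained version of Example #3 might look like the sketch below; it assumes an older scikit-learn release in which the private sklearn.cluster.k_means_ module still exists (it has since been removed), and it reuses the DKKMeans/SKKMeans aliases from the excerpts.

import dask.array as da
import numpy as np
import sklearn.datasets
from dask.array.utils import assert_eq
from dask_ml.cluster import KMeans as DKKMeans
from sklearn.cluster import KMeans as SKKMeans
from sklearn.cluster import k_means_   # private module; removed in recent scikit-learn

X, _ = sklearn.datasets.make_blobs(n_samples=1000, n_features=4, random_state=1)
X_da = da.from_array(X, chunks=500)                 # dask array for the dask-ml estimator
x_squared_norms = k_means_.row_norms(X, squared=True)
init = k_means_._k_init(X, 3, x_squared_norms, np.random.RandomState(0))

dkkm = DKKMeans(3, init=init, random_state=0).fit(X_da)
skkm = SKKMeans(3, init=init, random_state=0, n_init=1).fit(X)
assert_eq(dkkm.inertia_, skkm.inertia_)             # same explicit init -> same inertia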
Example #4
def test_row_norms(X_blobs):
    result = row_norms(X_blobs, squared=True)
    expected = k_means_.row_norms(X_blobs.compute(), squared=True)
    assert_eq(result, expected)
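Example #4 only checks that dask-ml's row_norms agrees with scikit-learn's on the computed array. The quantity being compared is simply the squared Euclidean norm of each row; a minimal dense-only NumPy equivalent (an illustration, not either library's implementation, which also handles sparse and chunked input) is:

import numpy as np

def row_norms_squared(X):
    # squared L2 norm of every row, i.e. row_norms(X, squared=True) for dense X
    return np.einsum("ij,ij->i", X, X)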
Example #5
    def fit(self, X, y=None):
        """Compute the centroids on X by chunking it into mini-batches.

        Parameters
        ----------
        X : array-like or sparse matrix, shape=(n_samples, n_features)
            Training instances to cluster.

        y : Ignored

        """
        random_state = check_random_state(self.random_state)
        X = check_array(X,
                        accept_sparse="csr",
                        order='C',
                        dtype=[np.float64, np.float32])
        n_samples, n_features = X.shape
        if n_samples < self.n_clusters:
            raise ValueError("Number of samples smaller than number "
                             "of clusters.")

        n_init = self.n_init
        if hasattr(self.init, '__array__'):
            self.init = np.ascontiguousarray(self.init, dtype=X.dtype)
            if n_init != 1:
                warnings.warn(
                    'Explicit initial center position passed: '
                    'performing only one init in MiniBatchKMeans instead of '
                    'n_init=%d' % self.n_init,
                    RuntimeWarning,
                    stacklevel=2)
                n_init = 1

        x_squared_norms = k_means_.row_norms(X, squared=True)

        if self.tol > 0.0:
            tol = k_means_._tolerance(X, self.tol)

            # using tol-based early stopping needs the allocation of a
            # dedicated buffer, which can be expensive for high-dimensional
            # data: hence we allocate it outside of the main loop
            old_center_buffer = np.zeros(n_features, dtype=X.dtype)
        else:
            tol = 0.0
            # no need for the center buffer if tol-based early stopping is
            # disabled
            old_center_buffer = np.zeros(0, dtype=X.dtype)

        distances = np.zeros(self.batch_size, dtype=X.dtype)
        n_batches = int(np.ceil(float(n_samples) / self.batch_size))
        n_iter = int(self.max_iter * n_batches)

        init_size = self.init_size
        if init_size is None:
            init_size = 3 * self.batch_size
        if init_size > n_samples:
            init_size = n_samples
        self.init_size_ = init_size

        validation_indices = random_state.randint(0, n_samples, init_size)
        X_valid = X[validation_indices]
        x_squared_norms_valid = x_squared_norms[validation_indices]

        # perform several inits with random sub-sets
        best_inertia = None
        for init_idx in range(n_init):
            if self.verbose:
                print("Init %d/%d with method: %s" %
                      (init_idx + 1, n_init, self.init))
            counts = np.zeros(self.n_clusters, dtype=np.int32)

            # TODO: once the `k_means` function works with sparse input we
            # should refactor the following init to use it instead.

            # Initialize the centers using only a fraction of the data as we
            # expect n_samples to be very large when using MiniBatchKMeans
            cluster_centers = k_means_._init_centroids(
                X,
                self.n_clusters,
                self.init,
                random_state=random_state,
                x_squared_norms=x_squared_norms,
                init_size=init_size)

            # Compute the label assignment on the init dataset
            batch_inertia, centers_squared_diff = k_means_._mini_batch_step(
                X_valid,
                x_squared_norms[validation_indices],
                cluster_centers,
                counts,
                old_center_buffer,
                False,
                distances=None,
                verbose=self.verbose)

            # Keep only the best cluster centers across independent inits on
            # the common validation set
            _, inertia = k_means_._labels_inertia(X_valid,
                                                  x_squared_norms_valid,
                                                  cluster_centers)
            if self.verbose:
                print("Inertia for init %d/%d: %f" %
                      (init_idx + 1, n_init, inertia))
            if best_inertia is None or inertia < best_inertia:
                self.cluster_centers_ = cluster_centers
                self.counts_ = counts
                best_inertia = inertia

        # Empty context to be used inplace by the convergence check routine
        convergence_context = {}

        # Perform the iterative optimization until the final convergence
        # criterion
        for iteration_idx in range(n_iter):
            # Sample a minibatch from the full dataset
            minibatch_indices = random_state.randint(0, n_samples,
                                                     self.batch_size)

            # Perform the actual update step on the minibatch data
            batch_inertia, centers_squared_diff = k_means_._mini_batch_step(
                X[minibatch_indices],
                x_squared_norms[minibatch_indices],
                self.cluster_centers_,
                self.counts_,
                old_center_buffer,
                tol > 0.0,
                distances=distances,
                # Here we randomly choose whether to perform
                # random reassignment: the choice is done as a function
                # of the iteration index, and the minimum number of
                # counts, in order to force this reassignment to happen
                # every once in a while
                random_reassign=((iteration_idx + 1) %
                                 (10 + self.counts_.min()) == 0),
                random_state=random_state,
                reassignment_ratio=self.reassignment_ratio,
                verbose=self.verbose)

            # Monitor convergence and do early stopping if necessary
            if k_means_._mini_batch_convergence(self,
                                                iteration_idx,
                                                n_iter,
                                                tol,
                                                n_samples,
                                                centers_squared_diff,
                                                batch_inertia,
                                                convergence_context,
                                                verbose=self.verbose):
                break

        self.n_iter_ = iteration_idx + 1

        if self.compute_labels:
            self.labels_, self.inertia_ = self._labels_inertia_minibatch(X)

        return self
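The fit above follows scikit-learn's MiniBatchKMeans recipe: run n_init initializations, score each on a fixed validation slice of init_size samples, keep the centers with the lowest inertia, then refine them for up to max_iter * n_batches mini-batch steps with occasional random reassignment and an optional tol-based early stop. A hedged usage sketch, using sklearn.cluster.MiniBatchKMeans as a stand-in for whatever the surrounding class is actually called (the constructor arguments shown are the standard scikit-learn ones):

import numpy as np
from sklearn.cluster import MiniBatchKMeans   # stand-in for the class defined above

X = np.random.RandomState(0).normal(size=(10000, 16)).astype(np.float32)

mbk = MiniBatchKMeans(n_clusters=8, batch_size=256, max_iter=100,
                      n_init=3, tol=1e-4, random_state=0)
mbk.fit(X)
print(mbk.n_iter_, mbk.inertia_, mbk.cluster_centers_.shape)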
Example #6
    def fit(self, X, y, sample_weight=None):
        """Compute k-means-- clustering.

        Parameters
        ----------
        X : array-like or sparse matrix, shape=(n_samples, n_features)
            Training instances to cluster. It must be noted that the data
            will be converted to C ordering, which will cause a memory
            copy if the given data is not C-contiguous.

        y : Ignored
            Not used, present here for API consistency by convention.

        sample_weight : array-like, shape (n_samples,), optional
            The weights for each observation in X. If None, all observations
            are assigned equal weight (default: None).

        Returns
        -------
        self
            Fitted estimator.
        """
        random_state = check_random_state(self.random_state)

        n_init = self.n_init
        if n_init <= 0:
            raise ValueError("Invalid number of initializations."
                             " n_init=%d must be bigger than zero." % n_init)

        if self.max_iter <= 0:
            raise ValueError(
                "Number of iterations should be a positive number,"
                " got %d instead" % self.max_iter)

        # avoid forcing order when copy_x=False
        order = "C" if self.copy_x else None
        X = check_array(X,
                        accept_sparse="csr",
                        dtype=[np.float64, np.float32],
                        order=order,
                        copy=self.copy_x)
        # verify that the number of samples given is at least k
        if _num_samples(X) < self.n_clusters:
            raise ValueError("n_samples=%d should be >= n_clusters=%d" %
                             (_num_samples(X), self.n_clusters))

        tol = _tolerance(X, self.tol)

        # If the distances are precomputed every job will create a matrix of
        # shape (n_clusters, n_samples). To stop KMeans from eating up memory
        # we only activate this if the created matrix is guaranteed to be
        # under 100MB. 12 million entries consume a little under 100MB if they
        # are of type double.
        precompute_distances = self.precompute_distances
        if precompute_distances == "auto":
            n_samples = X.shape[0]
            precompute_distances = (self.n_clusters * n_samples) < 12e6
        elif isinstance(precompute_distances, bool):
            pass
        else:
            raise ValueError(
                "precompute_distances should be 'auto' or True/False"
                ", but a value of %r was passed" % precompute_distances)

        # Validate init array
        init = self.init
        if hasattr(init, "__array__"):
            init = check_array(init, dtype=X.dtype.type, copy=True)
            _validate_center_shape(X, self.n_clusters, init)

            if n_init != 1:
                warnings.warn(
                    "Explicit initial center position passed: "
                    "performing only one init in k-means instead of n_init=%d"
                    % n_init,
                    RuntimeWarning,
                    stacklevel=2,
                )
                n_init = 1

        # subtract mean of X for more accurate distance computations
        if not sp.issparse(X):
            X_mean = X.mean(axis=0)
            # The copy was already done above
            X -= X_mean

            if hasattr(init, "__array__"):
                init -= X_mean

        # precompute squared norms of data points
        x_squared_norms = row_norms(X, squared=True)

        best_labels, best_inertia, best_centers = None, None, None

        kmeans_single = _k_means_minus_minus

        seeds = random_state.randint(np.iinfo(np.int32).max, size=n_init)
        if effective_n_jobs(self.n_jobs) == 1:
            # For a single thread, less memory is needed if we just store one
            # set of the best results (as opposed to one set per run per
            # thread).
            for seed in seeds:
                # run a k-means once
                labels, inertia, centers, n_iter_ = kmeans_single(
                    X,
                    sample_weight,
                    self.n_clusters,
                    self.prop_outliers,
                    max_iter=self.max_iter,
                    init=init,
                    verbose=self.verbose,
                    precompute_distances=precompute_distances,
                    tol=tol,
                    x_squared_norms=x_squared_norms,
                    random_state=seed,
                )
                # determine if these results are the best so far
                if best_inertia is None or inertia < best_inertia:
                    best_labels = labels.copy()
                    best_centers = centers.copy()
                    best_inertia = inertia
                    best_n_iter = n_iter_
        else:
            # parallelisation of k-means runs
            results = Parallel(n_jobs=self.n_jobs, verbose=0)(
                delayed(kmeans_single)(
                    X,
                    sample_weight,
                    self.n_clusters,
                    self.prop_outliers,
                    max_iter=self.max_iter,
                    init=init,
                    verbose=self.verbose,
                    tol=tol,
                    precompute_distances=precompute_distances,
                    x_squared_norms=x_squared_norms,
                    # Change seed to ensure variety
                    random_state=seed,
                ) for seed in seeds)
            # Get results with the lowest inertia
            labels, inertia, centers, n_iters = zip(*results)
            best = np.argmin(inertia)
            best_labels = labels[best]
            best_inertia = inertia[best]
            best_centers = centers[best]
            best_n_iter = n_iters[best]

        if not sp.issparse(X):
            if not self.copy_x:
                X += X_mean
            best_centers += X_mean

        distinct_clusters = len(set(best_labels))
        if distinct_clusters < self.n_clusters:
            warnings.warn(
                "Number of distinct clusters ({}) found smaller than "
                "n_clusters ({}). Possibly due to duplicate points "
                "in X.".format(distinct_clusters, self.n_clusters),
                ConvergenceWarning,
                stacklevel=2,
            )

        self.cluster_centers_ = best_centers
        self.labels_ = best_labels
        self.inertia_ = best_inertia
        self.n_iter_ = best_n_iter
        return self
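Example #6 mirrors scikit-learn's KMeans.fit almost line for line; the substantive differences are the fixed single-run routine _k_means_minus_minus and the extra prop_outliers argument, which k-means-- (Chawla and Gionis) uses to leave the farthest fraction of points out of the center updates. The single update step below is a hedged NumPy sketch of that idea only; it is not the code behind _k_means_minus_minus, and the helper name is hypothetical.

import numpy as np

def kmeans_minus_minus_step(X, centers, prop_outliers):
    # assign every point to its nearest center
    d2 = ((X[:, None, :] - centers[None, :, :]) ** 2).sum(-1)      # (n_samples, k)
    labels = d2.argmin(axis=1)
    nearest = d2[np.arange(len(X)), labels]
    # drop the ceil(prop_outliers * n) farthest points as outliers for this step
    n_out = int(np.ceil(prop_outliers * len(X)))
    keep = np.argsort(nearest)[: len(X) - n_out] if n_out else np.arange(len(X))
    # recompute each center from its remaining (inlier) members only
    new_centers = np.array([
        X[keep][labels[keep] == j].mean(axis=0)
        if np.any(labels[keep] == j) else centers[j]
        for j in range(len(centers))
    ])
    return labels, new_centers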