Code Example #1
File: kmeans.py    Project: mthomure/glimpse-project
    def partial_fit(self, X, y=None, weights=None):
        """Update k means estimate on a single mini-batch X.

        Parameters
        ----------
        X: array-like, shape = [n_samples, n_features]
            Coordinates of the data points to cluster.
        """

        X = check_arrays(X, sparse_format="csr", copy=False)[0]
        n_samples, n_features = X.shape
        if hasattr(self.init, '__array__'):
            self.init = np.ascontiguousarray(self.init, dtype=np.float64)

        if n_samples == 0:
            return self

        x_squared_norms = _squared_norms(X)
        self.random_state_ = check_random_state(self.random_state)
        if (not hasattr(self, 'counts_')
                or not hasattr(self, 'cluster_centers_')):
            # this is the first call to partial_fit on this object:
            # initialize the cluster centers
            self.cluster_centers_ = _init_centroids(
                X, self.n_clusters, self.init,
                random_state=self.random_state_,
                x_squared_norms=x_squared_norms, init_size=self.init_size, weights=weights)

            self.initial_cluster_centers_ = self.cluster_centers_.copy()

            self.counts_ = np.zeros(self.n_clusters, dtype=np.int32)
            random_reassign = False
        else:
            # The lower the minimum count is, the more likely we are to do a
            # random reassignment; however, we don't want to reassign too
            # often, so that counts have time to build up
            random_reassign = self.random_state_.randint(
                10 * (1 + self.counts_.min())) == 0

        _mini_batch_step(X, x_squared_norms, self.cluster_centers_,
                         self.counts_, np.zeros(0, np.double), 0,
                         random_reassign=random_reassign,
                         random_state=self.random_state_,
                         reassignment_ratio=self.reassignment_ratio,
                         verbose=self.verbose, weights=weights, sphered=self.sphered)

        if self.compute_labels:
            self.labels_, self.inertia_ = _labels_inertia(
                X, x_squared_norms, self.cluster_centers_, weights=weights)

        return self
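The class that owns partial_fit is not shown in this excerpt; it is a MiniBatchKMeans-style estimator defined in the project's kmeans.py. The following is only a rough usage sketch of how such a streaming update is typically driven; the class name, import path, constructor arguments, and data are assumptions for illustration, not taken from the project:

    import numpy as np
    from kmeans import MiniBatchKMeans  # hypothetical import path and class name

    rng = np.random.RandomState(0)
    km = MiniBatchKMeans(n_clusters=8, random_state=0)  # assumed constructor arguments

    # Feed the data one mini-batch at a time; each call refines the running
    # centroid estimate (cluster_centers_) and the per-cluster counts_.
    for _ in range(100):
        X_batch = rng.randn(256, 16)
        w_batch = np.ones(len(X_batch))  # optional per-sample weights
        km.partial_fit(X_batch, weights=w_batch)

    print(km.cluster_centers_.shape)  # (8, 16) once fitted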
Code Example #2
File: kmeans.py    Project: mthomure/glimpse-project
    def fit(self, X, y=None, weights=None):
        """Compute the centroids on X by chunking it into mini-batches.

        Parameters
        ----------
        X: array-like, shape = [n_samples, n_features]
            Coordinates of the data points to cluster
        """
        random_state = check_random_state(self.random_state)
        if self.k is not None:
            warnings.warn(
                "Parameter k has been replaced by 'n_clusters'"
                " and will be removed in release 0.14.",
                DeprecationWarning,
                stacklevel=2)
            self.n_clusters = self.k
        X = check_arrays(X,
                         sparse_format="csr",
                         copy=False,
                         check_ccontiguous=True,
                         dtype=np.float64)[0]
        if sp.issparse(X):
            raise ValueError("Only dense arrays are supported")

        n_samples, n_features = X.shape
        if n_samples < self.n_clusters:
            raise ValueError("Number of samples smaller than number "
                             "of clusters.")

        if hasattr(self.init, '__array__'):
            self.init = np.ascontiguousarray(self.init, dtype=np.float64)

        x_squared_norms = _squared_norms(X)

        if self.tol > 0.0:
            tol = _tolerance(X, self.tol)

            # using tol-based early stopping needs the allocation of a
            # dedicated buffer, which can be expensive for high-dim data:
            # hence we allocate it outside of the main loop
            old_center_buffer = np.zeros(n_features, np.double)
        else:
            tol = 0.0
            # no need for the center buffer if tol-based early stopping is
            # disabled
            old_center_buffer = np.zeros(0, np.double)

        distances = np.zeros(self.batch_size, dtype=np.float64)
        n_batches = int(np.ceil(float(n_samples) / self.batch_size))
        n_iter = int(self.max_iter * n_batches)

        init_size = self.init_size
        if init_size is None:
            init_size = 3 * self.batch_size
        if init_size > n_samples:
            init_size = n_samples
        self.init_size_ = init_size

        validation_indices = random_state.random_integers(
            0, n_samples - 1, init_size)
        X_valid = X[validation_indices]
        x_squared_norms_valid = x_squared_norms[validation_indices]
        weights_valid = (weights[validation_indices]
                         if weights is not None else None)

        # perform several inits with random sub-sets
        best_inertia = None
        for init_idx in range(self.n_init):
            if self.verbose:
                print "Init %d/%d with method: %s" % (init_idx + 1,
                                                      self.n_init, self.init)
            counts = np.zeros(self.n_clusters, dtype=np.int32)

            # TODO: once the `k_means` function works with sparse input we
            # should refactor the following init to use it instead.

            # Initialize the centers using only a fraction of the data as we
            # expect n_samples to be very large when using MiniBatchKMeans
            cluster_centers = _init_centroids(X,
                                              self.n_clusters,
                                              self.init,
                                              random_state=random_state,
                                              x_squared_norms=x_squared_norms,
                                              init_size=init_size,
                                              weights=weights,
                                              sphered=self.sphered)

            # Compute the label assignment on the init dataset
            batch_inertia, centers_squared_diff = _mini_batch_step(
                X_valid,
                x_squared_norms[validation_indices],
                cluster_centers,
                counts,
                old_center_buffer,
                False,
                distances=distances,
                verbose=self.verbose,
                weights=weights_valid,
                sphered=self.sphered)

            # Keep only the best cluster centers across independent inits on
            # the common validation set
            _, inertia = _labels_inertia(X_valid,
                                         x_squared_norms_valid,
                                         cluster_centers,
                                         weights=weights_valid)
            if self.verbose:
                print "Inertia for init %d/%d: %f" % (init_idx + 1,
                                                      self.n_init, inertia)
            if best_inertia is None or inertia < best_inertia:
                self.cluster_centers_ = cluster_centers
                self.counts_ = counts
                best_inertia = inertia

        # Empty context dict, updated in place by the convergence check routine
        convergence_context = {}

        self.initial_cluster_centers_ = self.cluster_centers_.copy()

        # Perform the iterative optimization until the final convergence
        # criterion
        for iteration_idx in xrange(n_iter):

            # Sample a minibatch from the full dataset
            minibatch_indices = random_state.random_integers(
                0, n_samples - 1, self.batch_size)

            minibatch_weights = (weights[minibatch_indices]
                                 if weights is not None else None)

            # Perform the actual update step on the minibatch data
            batch_inertia, centers_squared_diff = _mini_batch_step(
                X[minibatch_indices],
                x_squared_norms[minibatch_indices],
                self.cluster_centers_,
                self.counts_,
                old_center_buffer,
                tol > 0.0,
                distances=distances,
                # Here we randomly choose whether to perform random
                # reassignment: the choice is a function of the iteration
                # index and of the minimum cluster count, so that a
                # reassignment is forced to happen every once in a while
                random_reassign=((iteration_idx + 1) %
                                 (10 + self.counts_.min()) == 0),
                random_state=random_state,
                reassignment_ratio=self.reassignment_ratio,
                verbose=self.verbose,
                weights=minibatch_weights,
                sphered=self.sphered)

            # Monitor convergence and do early stopping if necessary
            if _mini_batch_convergence(self,
                                       iteration_idx,
                                       n_iter,
                                       tol,
                                       n_samples,
                                       centers_squared_diff,
                                       batch_inertia,
                                       convergence_context,
                                       verbose=self.verbose):
                break

        if self.compute_labels:
            if self.verbose:
                print 'Computing label assignments and total inertia'
            self.labels_, self.inertia_ = _labels_inertia(
                X, x_squared_norms, self.cluster_centers_, weights=weights)

        return self
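For reference, the amount of work scheduled by fit follows directly from the listing above: n_batches = ceil(n_samples / batch_size), n_iter = max_iter * n_batches, and init_size defaults to 3 * batch_size, capped at n_samples. A small self-contained check of those formulas (the concrete numbers are illustrative only):

    import numpy as np

    n_samples, batch_size, max_iter = 10000, 100, 100
    n_batches = int(np.ceil(float(n_samples) / batch_size))  # 100 mini-batches per pass
    n_iter = int(max_iter * n_batches)                        # 10000 mini-batch steps in total
    init_size = min(3 * batch_size, n_samples)                # 300 samples for init/validation
    print(n_batches, n_iter, init_size)                       # prints: 100 10000 300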