def score(self, X, y=None): """Opposite of the value of X on the K-means objective. Parameters ---------- X : {array-like, sparse matrix}, shape = [n_samples, n_features] New data. Returns ------- score : float Opposite of the value of X on the K-means objective. """ check_is_fitted(self, 'cluster_centers_') X = self._check_test_data(X) x_squared_norms = row_norms(X, squared=True) return -_labels_inertia(X, x_squared_norms, self.cluster_centers_)[1]
def predict(self, X): """Predict the closest cluster each sample in X belongs to. In the vector quantization literature, `cluster_centers_` is called the code book and each value returned by `predict` is the index of the closest code in the code book. Parameters ---------- X : {array-like, sparse matrix}, shape = [n_samples, n_features] New data to predict. Returns ------- labels : array, shape [n_samples,] Index of the cluster each sample belongs to. """ check_is_fitted(self, 'cluster_centers_') X = self._check_test_data(X) x_squared_norms = row_norms(X, squared=True) return _labels_inertia(X, x_squared_norms, self.cluster_centers_)[0]
def _init_centroids(X, k, init, random_state=None, x_squared_norms=None, init_size=None): """Compute the initial centroids Parameters ---------- X : array, shape (n_samples, n_features) k : int number of centroids init : {'k-means++', 'random' or ndarray or callable} optional Method for initialization random_state : int, RandomState instance or None, optional, default: None If int, random_state is the seed used by the random number generator; If RandomState instance, random_state is the random number generator; If None, the random number generator is the RandomState instance used by `np.random`. x_squared_norms : array, shape (n_samples,), optional Squared euclidean norm of each data point. Pass it if you have it at hands already to avoid it being recomputed here. Default: None init_size : int, optional Number of samples to randomly sample for speeding up the initialization (sometimes at the expense of accuracy): the only algorithm is initialized by running a batch KMeans on a random subset of the data. This needs to be larger than k. Returns ------- centers : array, shape(k, n_features) """ random_state = check_random_state(random_state) n_samples = X.shape[0] if x_squared_norms is None: x_squared_norms = row_norms(X, squared=True) if init_size is not None and init_size < n_samples: if init_size < k: warnings.warn("init_size=%d should be larger than k=%d. " "Setting it to 3*k" % (init_size, k), RuntimeWarning, stacklevel=2) init_size = 3 * k init_indices = random_state.randint(0, n_samples, init_size) X = X[init_indices] x_squared_norms = x_squared_norms[init_indices] n_samples = X.shape[0] elif n_samples < k: raise ValueError("n_samples=%d should be larger than k=%d" % (n_samples, k)) if isinstance(init, string_types) and init == 'k-means++': centers = _k_init(X, k, random_state=random_state, x_squared_norms=x_squared_norms) elif isinstance(init, string_types) and init == 'random': seeds = random_state.permutation(n_samples)[:k] centers = X[seeds] elif hasattr(init, '__array__'): # ensure that the centers have the same dtype as X # this is a requirement of fused types of cython centers = np.array(init, dtype=X.dtype) elif callable(init): centers = init(X, k, random_state=random_state) centers = np.asarray(centers, dtype=X.dtype) else: raise ValueError("the init parameter for the k-means should " "be 'k-means++' or 'random' or an ndarray, " "'%s' (type '%s') was passed." % (init, type(init))) if sp.issparse(centers): centers = centers.toarray() _validate_center_shape(X, k, centers) return centers
def normalize(X, norm='l2', axis=1, copy=True, return_norm=False): """Scale input vectors individually to unit norm (vector length). Read more in the :ref:`User Guide <preprocessing_normalization>`. Parameters ---------- X : {array-like, sparse matrix}, shape [n_samples, n_features] The data to normalize, element by element. scipy.sparse matrices should be in CSR format to avoid an un-necessary copy. norm : 'l1', 'l2', or 'max', optional ('l2' by default) The norm to use to normalize each non zero sample (or each non-zero feature if axis is 0). axis : 0 or 1, optional (1 by default) axis used to normalize the data along. If 1, independently normalize each sample, otherwise (if 0) normalize each feature. copy : boolean, optional, default True set to False to perform inplace row normalization and avoid a copy (if the input is already a numpy array or a scipy.sparse CSR matrix and if axis is 1). return_norm : boolean, default False whether to return the computed norms Returns ------- X : {array-like, sparse matrix}, shape [n_samples, n_features] Normalized input X. norms : array, shape [n_samples] if axis=1 else [n_features] An array of norms along given axis for X. When X is sparse, a NotImplementedError will be raised for norm 'l1' or 'l2'. See also -------- Normalizer: Performs normalization using the ``Transformer`` API (e.g. as part of a preprocessing :class:`sklearn.pipeline.Pipeline`). Notes ----- For a comparison of the different scalers, transformers, and normalizers, see :ref:`examples/preprocessing/plot_all_scaling.py <sphx_glr_auto_examples_preprocessing_plot_all_scaling.py>`. """ if norm not in ('l1', 'l2', 'max'): raise ValueError("'%s' is not a supported norm" % norm) if axis == 0: sparse_format = 'csc' elif axis == 1: sparse_format = 'csr' else: raise ValueError("'%d' is not a supported axis" % axis) X = check_array(X, sparse_format, copy=copy, estimator='the normalize function', dtype=FLOAT_DTYPES) if axis == 0: X = X.T if sparse.issparse(X): if return_norm and norm in ('l1', 'l2'): raise NotImplementedError("return_norm=True is not implemented " "for sparse matrices with norm 'l1' " "or norm 'l2'") if norm == 'l1': inplace_csr_row_normalize_l1(X) elif norm == 'l2': inplace_csr_row_normalize_l2(X) elif norm == 'max': _, norms = min_max_axis(X, 1) norms_elementwise = norms.repeat(np.diff(X.indptr)) mask = norms_elementwise != 0 X.data[mask] /= norms_elementwise[mask] else: if norm == 'l1': norms = np.abs(X).sum(axis=1) elif norm == 'l2': norms = row_norms(X) elif norm == 'max': norms = np.max(X, axis=1) norms = _handle_zeros_in_scale(norms, copy=False) X /= norms[:, np.newaxis] if axis == 0: X = X.T if return_norm: return X, norms else: return X
def euclidean_distances(X, Y=None, Y_norm_squared=None, squared=False, X_norm_squared=None): """ Considering the rows of X (and Y=X) as vectors, compute the distance matrix between each pair of vectors. For efficiency reasons, the euclidean distance between a pair of row vector x and y is computed as:: dist(x, y) = sqrt(dot(x, x) - 2 * dot(x, y) + dot(y, y)) This formulation has two advantages over other ways of computing distances. First, it is computationally efficient when dealing with sparse data. Second, if one argument varies but the other remains unchanged, then `dot(x, x)` and/or `dot(y, y)` can be pre-computed. However, this is not the most precise way of doing this computation, and the distance matrix returned by this function may not be exactly symmetric as required by, e.g., ``scipy.spatial.distance`` functions. Read more in the :ref:`User Guide <metrics>`. Parameters ---------- X : {array-like, sparse matrix}, shape (n_samples_1, n_features) Y : {array-like, sparse matrix}, shape (n_samples_2, n_features) Y_norm_squared : array-like, shape (n_samples_2, ), optional Pre-computed dot-products of vectors in Y (e.g., ``(Y**2).sum(axis=1)``) squared : boolean, optional Return squared Euclidean distances. X_norm_squared : array-like, shape = [n_samples_1], optional Pre-computed dot-products of vectors in X (e.g., ``(X**2).sum(axis=1)``) Returns ------- distances : {array, sparse matrix}, shape (n_samples_1, n_samples_2) Examples -------- >>> from sklearn.metrics.pairwise import euclidean_distances >>> X = [[0, 1], [1, 1]] >>> # distance between rows of X >>> euclidean_distances(X, X) array([[ 0., 1.], [ 1., 0.]]) >>> # get distance to origin >>> euclidean_distances(X, [[0, 0]]) array([[ 1. ], [ 1.41421356]]) See also -------- paired_distances : distances betweens pairs of elements of X and Y. """ X, Y = check_pairwise_arrays(X, Y) if X_norm_squared is not None: XX = check_array(X_norm_squared) if XX.shape == (1, X.shape[0]): XX = XX.T elif XX.shape != (X.shape[0], 1): raise ValueError( "Incompatible dimensions for X and X_norm_squared") else: XX = row_norms(X, squared=True)[:, np.newaxis] if X is Y: # shortcut in the common case euclidean_distances(X, X) YY = XX.T elif Y_norm_squared is not None: YY = np.atleast_2d(Y_norm_squared) if YY.shape != (1, Y.shape[0]): raise ValueError( "Incompatible dimensions for Y and Y_norm_squared") else: YY = row_norms(Y, squared=True)[np.newaxis, :] distances = safe_sparse_dot(X, Y.T, dense_output=True) distances *= -2 distances += XX distances += YY np.maximum(distances, 0, out=distances) if X is Y: # Ensure that distances between vectors and themselves are set to 0.0. # This may not be the case due to floating point rounding errors. distances.flat[::distances.shape[0] + 1] = 0.0 return distances if squared else np.sqrt(distances, out=distances)
def pairwise_distances_argmin_min(X, Y, axis=1, metric="euclidean", batch_size=500, metric_kwargs=None): """Compute minimum distances between one point and a set of points. This function computes for each row in X, the index of the row of Y which is closest (according to the specified distance). The minimal distances are also returned. This is mostly equivalent to calling: (pairwise_distances(X, Y=Y, metric=metric).argmin(axis=axis), pairwise_distances(X, Y=Y, metric=metric).min(axis=axis)) but uses much less memory, and is faster for large arrays. Parameters ---------- X : {array-like, sparse matrix}, shape (n_samples1, n_features) Array containing points. Y : {array-like, sparse matrix}, shape (n_samples2, n_features) Arrays containing points. axis : int, optional, default 1 Axis along which the argmin and distances are to be computed. metric : string or callable, default 'euclidean' metric to use for distance computation. Any metric from scikit-learn or scipy.spatial.distance can be used. If metric is a callable function, it is called on each pair of instances (rows) and the resulting value recorded. The callable should take two arrays as input and return one value indicating the distance between them. This works for Scipy's metrics, but is less efficient than passing the metric name as a string. Distance matrices are not supported. Valid values for metric are: - from scikit-learn: ['cityblock', 'cosine', 'euclidean', 'l1', 'l2', 'manhattan'] - from scipy.spatial.distance: ['braycurtis', 'canberra', 'chebyshev', 'correlation', 'dice', 'hamming', 'jaccard', 'kulsinski', 'mahalanobis', 'matching', 'minkowski', 'rogerstanimoto', 'russellrao', 'seuclidean', 'sokalmichener', 'sokalsneath', 'sqeuclidean', 'yule'] See the documentation for scipy.spatial.distance for details on these metrics. batch_size : integer To reduce memory consumption over the naive solution, data are processed in batches, comprising batch_size rows of X and batch_size rows of Y. The default value is quite conservative, but can be changed for fine-tuning. The larger the number, the larger the memory usage. metric_kwargs : dict, optional Keyword arguments to pass to specified metric function. Returns ------- argmin : numpy.ndarray Y[argmin[i], :] is the row in Y that is closest to X[i, :]. distances : numpy.ndarray distances[i] is the distance between the i-th row in X and the argmin[i]-th row in Y. See also -------- sklearn.metrics.pairwise_distances sklearn.metrics.pairwise_distances_argmin """ dist_func = None if metric in PAIRWISE_DISTANCE_FUNCTIONS: dist_func = PAIRWISE_DISTANCE_FUNCTIONS[metric] elif not callable(metric) and not isinstance(metric, str): raise ValueError("'metric' must be a string or a callable") X, Y = check_pairwise_arrays(X, Y) if metric_kwargs is None: metric_kwargs = {} if axis == 0: X, Y = Y, X # Allocate output arrays indices = np.empty(X.shape[0], dtype=np.intp) values = np.empty(X.shape[0]) values.fill(np.infty) for chunk_x in gen_batches(X.shape[0], batch_size): X_chunk = X[chunk_x, :] for chunk_y in gen_batches(Y.shape[0], batch_size): Y_chunk = Y[chunk_y, :] if dist_func is not None: if metric == 'euclidean': # special case, for speed d_chunk = safe_sparse_dot(X_chunk, Y_chunk.T, dense_output=True) d_chunk *= -2 d_chunk += row_norms(X_chunk, squared=True)[:, np.newaxis] d_chunk += row_norms(Y_chunk, squared=True)[np.newaxis, :] np.maximum(d_chunk, 0, d_chunk) else: d_chunk = dist_func(X_chunk, Y_chunk, **metric_kwargs) else: d_chunk = pairwise_distances(X_chunk, Y_chunk, metric=metric, **metric_kwargs) # Update indices and minimum values using chunk min_indices = d_chunk.argmin(axis=1) min_values = d_chunk[np.arange(chunk_x.stop - chunk_x.start), min_indices] flags = values[chunk_x] > min_values indices[chunk_x][flags] = min_indices[flags] + chunk_y.start values[chunk_x][flags] = min_values[flags] if metric == "euclidean" and not metric_kwargs.get("squared", False): np.sqrt(values, values) return indices, values
def k_means_constrained(X, n_clusters, size_min=None, size_max=None, init='k-means++', distance_func=cdist, n_init=10, max_iter=300, verbose=False, tol=1e-4, random_state=None, copy_x=True, n_jobs=1, return_n_iter=False): """K-Means clustering with minimum and maximum cluster size constraints. Read more in the :ref:`User Guide <k_means>`. Parameters ---------- X : array-like, shape (n_samples, n_features) The observations to cluster. size_min : int, optional, default: None Constrain the label assignment so that each cluster has a minimum size of size_min. If None, no constrains will be applied size_max : int, optional, default: None Constrain the label assignment so that each cluster has a maximum size of size_max. If None, no constrains will be applied n_clusters : int The number of clusters to form as well as the number of centroids to generate. init : {'k-means++', 'random', or ndarray, or a callable}, optional Method for initialization, default to 'k-means++': 'k-means++' : selects initial cluster centers for k-mean clustering in a smart way to speed up convergence. See section Notes in k_init for more details. 'random': generate k centroids from a Gaussian with mean and variance estimated from the data. If an ndarray is passed, it should be of shape (n_clusters, n_features) and gives the initial centers. If a callable is passed, it should take arguments X, k and and a random state and return an initialization. n_init : int, optional, default: 10 Number of time the k-means algorithm will be run with different centroid seeds. The final results will be the best output of n_init consecutive runs in terms of inertia. max_iter : int, optional, default 300 Maximum number of iterations of the k-means algorithm to run. verbose : boolean, optional Verbosity mode. tol : float, optional The relative increment in the results before declaring convergence. random_state : int, RandomState instance or None, optional, default: None If int, random_state is the seed used by the random number generator; If RandomState instance, random_state is the random number generator; If None, the random number generator is the RandomState instance used by `np.random`. copy_x : boolean, optional When pre-computing distances it is more numerically accurate to center the data first. If copy_x is True, then the original data is not modified. If False, the original data is modified, and put back before the function returns, but small numerical differences may be introduced by subtracting and then adding the data mean. n_jobs : int The number of jobs to use for the computation. This works by computing each of the n_init runs in parallel. If -1 all CPUs are used. If 1 is given, no parallel computing code is used at all, which is useful for debugging. For n_jobs below -1, (n_cpus + 1 + n_jobs) are used. Thus for n_jobs = -2, all CPUs but one are used. return_n_iter : bool, optional Whether or not to return the number of iterations. Returns ------- centroid : float ndarray with shape (k, n_features) Centroids found at the last iteration of k-means. label : integer ndarray with shape (n_samples,) label[i] is the code or index of the centroid the i'th observation is closest to. inertia : float The final value of the inertia criterion (sum of squared distances to the closest centroid for all observations in the training set). best_n_iter : int Number of iterations corresponding to the best results. Returned only if `return_n_iter` is set to True. """ if sp.issparse(X): raise NotImplementedError("Not implemented for sparse X") if n_init <= 0: raise ValueError("Invalid number of initializations." " n_init=%d must be bigger than zero." % n_init) random_state = check_random_state(random_state) if max_iter <= 0: raise ValueError('Number of iterations should be a positive number,' ' got %d instead' % max_iter) X = as_float_array(X, copy=copy_x) tol = _tolerance(X, tol) # Validate init array if hasattr(init, '__array__'): init = check_array(init, dtype=X.dtype.type, copy=True) _validate_center_shape(X, n_clusters, init) if n_init != 1: warnings.warn( 'Explicit initial center position passed: ' 'performing only one init in k-means instead of n_init=%d' % n_init, RuntimeWarning, stacklevel=2) n_init = 1 # subtract of mean of x for more accurate distance computations if not sp.issparse(X): X_mean = X.mean(axis=0) # The copy was already done above X -= X_mean if hasattr(init, '__array__'): init -= X_mean # precompute squared norms of data points x_squared_norms = row_norms(X, squared=True) best_labels, best_inertia, best_centers = None, None, None if n_jobs == 1: # For a single thread, less memory is needed if we just store one set # of the best results (as opposed to one set per run per thread). for it in range(n_init): # run a k-means once labels, inertia, centers, n_iter_ = kmeans_constrained_single( X, n_clusters, size_min=size_min, size_max=size_max, distance_func=distance_func, max_iter=max_iter, init=init, verbose=verbose, tol=tol, x_squared_norms=x_squared_norms, random_state=random_state) # determine if these results are the best so far if best_inertia is None or inertia < best_inertia: best_labels = labels.copy() best_centers = centers.copy() best_inertia = inertia best_n_iter = n_iter_ else: # parallelisation of k-means runs seeds = random_state.randint(np.iinfo(np.int32).max, size=n_init) results = Parallel(n_jobs=n_jobs, verbose=0)( delayed(kmeans_constrained_single)( X, n_clusters, size_min=size_min, size_max=size_max, max_iter=max_iter, init=init, distance_func=distance_func, verbose=verbose, tol=tol, x_squared_norms=x_squared_norms, # Change seed to ensure variety random_state=seed) for seed in seeds) # Get results with the lowest inertia labels, inertia, centers, n_iters = zip(*results) best = np.argmin(inertia) best_labels = labels[best] best_inertia = inertia[best] best_centers = centers[best] best_n_iter = n_iters[best] if not sp.issparse(X): if not copy_x: X += X_mean best_centers += X_mean if return_n_iter: return best_centers, best_labels, best_inertia, best_n_iter else: return best_centers, best_labels, best_inertia