def _check_fit_data(self, X): """Verify that the number of samples given is larger than k""" X = check_array(X, accept_sparse='csr', dtype=[np.float64, np.float32]) if X.shape[0] < self.n_clusters: raise ValueError("n_samples=%d should be >= n_clusters=%d" % (X.shape[0], self.n_clusters)) return X
def _check_test_data(self, X): X = check_array(X, accept_sparse='csr', dtype=FLOAT_DTYPES) n_samples, n_features = X.shape expected_n_features = self.cluster_centers_.shape[1] if not n_features == expected_n_features: raise ValueError("Incorrect number of features. " "Got %d features, expected %d" % (n_features, expected_n_features)) return X
def normalize(X, norm='l2', axis=1, copy=True, return_norm=False): """Scale input vectors individually to unit norm (vector length). Read more in the :ref:`User Guide <preprocessing_normalization>`. Parameters ---------- X : {array-like, sparse matrix}, shape [n_samples, n_features] The data to normalize, element by element. scipy.sparse matrices should be in CSR format to avoid an un-necessary copy. norm : 'l1', 'l2', or 'max', optional ('l2' by default) The norm to use to normalize each non zero sample (or each non-zero feature if axis is 0). axis : 0 or 1, optional (1 by default) axis used to normalize the data along. If 1, independently normalize each sample, otherwise (if 0) normalize each feature. copy : boolean, optional, default True set to False to perform inplace row normalization and avoid a copy (if the input is already a numpy array or a scipy.sparse CSR matrix and if axis is 1). return_norm : boolean, default False whether to return the computed norms Returns ------- X : {array-like, sparse matrix}, shape [n_samples, n_features] Normalized input X. norms : array, shape [n_samples] if axis=1 else [n_features] An array of norms along given axis for X. When X is sparse, a NotImplementedError will be raised for norm 'l1' or 'l2'. See also -------- Normalizer: Performs normalization using the ``Transformer`` API (e.g. as part of a preprocessing :class:`sklearn.pipeline.Pipeline`). Notes ----- For a comparison of the different scalers, transformers, and normalizers, see :ref:`examples/preprocessing/plot_all_scaling.py <sphx_glr_auto_examples_preprocessing_plot_all_scaling.py>`. """ if norm not in ('l1', 'l2', 'max'): raise ValueError("'%s' is not a supported norm" % norm) if axis == 0: sparse_format = 'csc' elif axis == 1: sparse_format = 'csr' else: raise ValueError("'%d' is not a supported axis" % axis) X = check_array(X, sparse_format, copy=copy, estimator='the normalize function', dtype=FLOAT_DTYPES) if axis == 0: X = X.T if sparse.issparse(X): if return_norm and norm in ('l1', 'l2'): raise NotImplementedError("return_norm=True is not implemented " "for sparse matrices with norm 'l1' " "or norm 'l2'") if norm == 'l1': inplace_csr_row_normalize_l1(X) elif norm == 'l2': inplace_csr_row_normalize_l2(X) elif norm == 'max': _, norms = min_max_axis(X, 1) norms_elementwise = norms.repeat(np.diff(X.indptr)) mask = norms_elementwise != 0 X.data[mask] /= norms_elementwise[mask] else: if norm == 'l1': norms = np.abs(X).sum(axis=1) elif norm == 'l2': norms = row_norms(X) elif norm == 'max': norms = np.max(X, axis=1) norms = _handle_zeros_in_scale(norms, copy=False) X /= norms[:, np.newaxis] if axis == 0: X = X.T if return_norm: return X, norms else: return X
def check_pairwise_arrays(X, Y, precomputed=False, dtype=None): """ Set X and Y appropriately and checks inputs If Y is None, it is set as a pointer to X (i.e. not a copy). If Y is given, this does not happen. All distance metrics should use this function first to assert that the given parameters are correct and safe to use. Specifically, this function first ensures that both X and Y are arrays, then checks that they are at least two dimensional while ensuring that their elements are floats (or dtype if provided). Finally, the function checks that the size of the second dimension of the two arrays is equal, or the equivalent check for a precomputed distance matrix. Parameters ---------- X : {array-like, sparse matrix}, shape (n_samples_a, n_features) Y : {array-like, sparse matrix}, shape (n_samples_b, n_features) precomputed : bool True if X is to be treated as precomputed distances to the samples in Y. dtype : string, type, list of types or None (default=None) Data type required for X and Y. If None, the dtype will be an appropriate float type selected by _return_float_dtype. .. versionadded:: 0.18 Returns ------- safe_X : {array-like, sparse matrix}, shape (n_samples_a, n_features) An array equal to X, guaranteed to be a numpy array. safe_Y : {array-like, sparse matrix}, shape (n_samples_b, n_features) An array equal to Y if Y was not None, guaranteed to be a numpy array. If Y was None, safe_Y will be a pointer to X. """ X, Y, dtype_float = _return_float_dtype(X, Y) warn_on_dtype = dtype is not None estimator = 'check_pairwise_arrays' if dtype is None: dtype = dtype_float if Y is X or Y is None: X = Y = check_array(X, accept_sparse='csr', dtype=dtype, warn_on_dtype=warn_on_dtype, estimator=estimator) else: X = check_array(X, accept_sparse='csr', dtype=dtype, warn_on_dtype=warn_on_dtype, estimator=estimator) Y = check_array(Y, accept_sparse='csr', dtype=dtype, warn_on_dtype=warn_on_dtype, estimator=estimator) if precomputed: if X.shape[1] != Y.shape[0]: raise ValueError("Precomputed metric requires shape " "(n_queries, n_indexed). Got (%d, %d) " "for %d indexed." % (X.shape[0], X.shape[1], Y.shape[0])) elif X.shape[1] != Y.shape[1]: raise ValueError("Incompatible dimension for X and Y matrices: " "X.shape[1] == %d while Y.shape[1] == %d" % (X.shape[1], Y.shape[1])) return X, Y
def euclidean_distances(X, Y=None, Y_norm_squared=None, squared=False, X_norm_squared=None): """ Considering the rows of X (and Y=X) as vectors, compute the distance matrix between each pair of vectors. For efficiency reasons, the euclidean distance between a pair of row vector x and y is computed as:: dist(x, y) = sqrt(dot(x, x) - 2 * dot(x, y) + dot(y, y)) This formulation has two advantages over other ways of computing distances. First, it is computationally efficient when dealing with sparse data. Second, if one argument varies but the other remains unchanged, then `dot(x, x)` and/or `dot(y, y)` can be pre-computed. However, this is not the most precise way of doing this computation, and the distance matrix returned by this function may not be exactly symmetric as required by, e.g., ``scipy.spatial.distance`` functions. Read more in the :ref:`User Guide <metrics>`. Parameters ---------- X : {array-like, sparse matrix}, shape (n_samples_1, n_features) Y : {array-like, sparse matrix}, shape (n_samples_2, n_features) Y_norm_squared : array-like, shape (n_samples_2, ), optional Pre-computed dot-products of vectors in Y (e.g., ``(Y**2).sum(axis=1)``) squared : boolean, optional Return squared Euclidean distances. X_norm_squared : array-like, shape = [n_samples_1], optional Pre-computed dot-products of vectors in X (e.g., ``(X**2).sum(axis=1)``) Returns ------- distances : {array, sparse matrix}, shape (n_samples_1, n_samples_2) Examples -------- >>> from sklearn.metrics.pairwise import euclidean_distances >>> X = [[0, 1], [1, 1]] >>> # distance between rows of X >>> euclidean_distances(X, X) array([[ 0., 1.], [ 1., 0.]]) >>> # get distance to origin >>> euclidean_distances(X, [[0, 0]]) array([[ 1. ], [ 1.41421356]]) See also -------- paired_distances : distances betweens pairs of elements of X and Y. """ X, Y = check_pairwise_arrays(X, Y) if X_norm_squared is not None: XX = check_array(X_norm_squared) if XX.shape == (1, X.shape[0]): XX = XX.T elif XX.shape != (X.shape[0], 1): raise ValueError( "Incompatible dimensions for X and X_norm_squared") else: XX = row_norms(X, squared=True)[:, np.newaxis] if X is Y: # shortcut in the common case euclidean_distances(X, X) YY = XX.T elif Y_norm_squared is not None: YY = np.atleast_2d(Y_norm_squared) if YY.shape != (1, Y.shape[0]): raise ValueError( "Incompatible dimensions for Y and Y_norm_squared") else: YY = row_norms(Y, squared=True)[np.newaxis, :] distances = safe_sparse_dot(X, Y.T, dense_output=True) distances *= -2 distances += XX distances += YY np.maximum(distances, 0, out=distances) if X is Y: # Ensure that distances between vectors and themselves are set to 0.0. # This may not be the case due to floating point rounding errors. distances.flat[::distances.shape[0] + 1] = 0.0 return distances if squared else np.sqrt(distances, out=distances)
def k_means_constrained(X, n_clusters, size_min=None, size_max=None, init='k-means++', distance_func=cdist, n_init=10, max_iter=300, verbose=False, tol=1e-4, random_state=None, copy_x=True, n_jobs=1, return_n_iter=False): """K-Means clustering with minimum and maximum cluster size constraints. Read more in the :ref:`User Guide <k_means>`. Parameters ---------- X : array-like, shape (n_samples, n_features) The observations to cluster. size_min : int, optional, default: None Constrain the label assignment so that each cluster has a minimum size of size_min. If None, no constrains will be applied size_max : int, optional, default: None Constrain the label assignment so that each cluster has a maximum size of size_max. If None, no constrains will be applied n_clusters : int The number of clusters to form as well as the number of centroids to generate. init : {'k-means++', 'random', or ndarray, or a callable}, optional Method for initialization, default to 'k-means++': 'k-means++' : selects initial cluster centers for k-mean clustering in a smart way to speed up convergence. See section Notes in k_init for more details. 'random': generate k centroids from a Gaussian with mean and variance estimated from the data. If an ndarray is passed, it should be of shape (n_clusters, n_features) and gives the initial centers. If a callable is passed, it should take arguments X, k and and a random state and return an initialization. n_init : int, optional, default: 10 Number of time the k-means algorithm will be run with different centroid seeds. The final results will be the best output of n_init consecutive runs in terms of inertia. max_iter : int, optional, default 300 Maximum number of iterations of the k-means algorithm to run. verbose : boolean, optional Verbosity mode. tol : float, optional The relative increment in the results before declaring convergence. random_state : int, RandomState instance or None, optional, default: None If int, random_state is the seed used by the random number generator; If RandomState instance, random_state is the random number generator; If None, the random number generator is the RandomState instance used by `np.random`. copy_x : boolean, optional When pre-computing distances it is more numerically accurate to center the data first. If copy_x is True, then the original data is not modified. If False, the original data is modified, and put back before the function returns, but small numerical differences may be introduced by subtracting and then adding the data mean. n_jobs : int The number of jobs to use for the computation. This works by computing each of the n_init runs in parallel. If -1 all CPUs are used. If 1 is given, no parallel computing code is used at all, which is useful for debugging. For n_jobs below -1, (n_cpus + 1 + n_jobs) are used. Thus for n_jobs = -2, all CPUs but one are used. return_n_iter : bool, optional Whether or not to return the number of iterations. Returns ------- centroid : float ndarray with shape (k, n_features) Centroids found at the last iteration of k-means. label : integer ndarray with shape (n_samples,) label[i] is the code or index of the centroid the i'th observation is closest to. inertia : float The final value of the inertia criterion (sum of squared distances to the closest centroid for all observations in the training set). best_n_iter : int Number of iterations corresponding to the best results. Returned only if `return_n_iter` is set to True. """ if sp.issparse(X): raise NotImplementedError("Not implemented for sparse X") if n_init <= 0: raise ValueError("Invalid number of initializations." " n_init=%d must be bigger than zero." % n_init) random_state = check_random_state(random_state) if max_iter <= 0: raise ValueError('Number of iterations should be a positive number,' ' got %d instead' % max_iter) X = as_float_array(X, copy=copy_x) tol = _tolerance(X, tol) # Validate init array if hasattr(init, '__array__'): init = check_array(init, dtype=X.dtype.type, copy=True) _validate_center_shape(X, n_clusters, init) if n_init != 1: warnings.warn( 'Explicit initial center position passed: ' 'performing only one init in k-means instead of n_init=%d' % n_init, RuntimeWarning, stacklevel=2) n_init = 1 # subtract of mean of x for more accurate distance computations if not sp.issparse(X): X_mean = X.mean(axis=0) # The copy was already done above X -= X_mean if hasattr(init, '__array__'): init -= X_mean # precompute squared norms of data points x_squared_norms = row_norms(X, squared=True) best_labels, best_inertia, best_centers = None, None, None if n_jobs == 1: # For a single thread, less memory is needed if we just store one set # of the best results (as opposed to one set per run per thread). for it in range(n_init): # run a k-means once labels, inertia, centers, n_iter_ = kmeans_constrained_single( X, n_clusters, size_min=size_min, size_max=size_max, distance_func=distance_func, max_iter=max_iter, init=init, verbose=verbose, tol=tol, x_squared_norms=x_squared_norms, random_state=random_state) # determine if these results are the best so far if best_inertia is None or inertia < best_inertia: best_labels = labels.copy() best_centers = centers.copy() best_inertia = inertia best_n_iter = n_iter_ else: # parallelisation of k-means runs seeds = random_state.randint(np.iinfo(np.int32).max, size=n_init) results = Parallel(n_jobs=n_jobs, verbose=0)( delayed(kmeans_constrained_single)( X, n_clusters, size_min=size_min, size_max=size_max, max_iter=max_iter, init=init, distance_func=distance_func, verbose=verbose, tol=tol, x_squared_norms=x_squared_norms, # Change seed to ensure variety random_state=seed) for seed in seeds) # Get results with the lowest inertia labels, inertia, centers, n_iters = zip(*results) best = np.argmin(inertia) best_labels = labels[best] best_inertia = inertia[best] best_centers = centers[best] best_n_iter = n_iters[best] if not sp.issparse(X): if not copy_x: X += X_mean best_centers += X_mean if return_n_iter: return best_centers, best_labels, best_inertia, best_n_iter else: return best_centers, best_labels, best_inertia