def _daal4py_compute_starting_centroids(X, X_fptype, nClusters, cluster_centers_0, random_state):

    def is_string(s, target_str):
        return isinstance(s, string_types) and s == target_str

    if is_string(cluster_centers_0, 'k-means++'):
        # _seed = random_state.randint(np.iinfo('i').max)
        # daal_engine = daal4py.engines_mt19937(fptype=X_fptype, method='defaultDense')
        # kmeans_init = daal4py.kmeans_init(nClusters, fptype=X_fptype, method='plusPlusDense', engine=daal_engine)
        # kmeans_init_res = kmeans_init.compute(X)
        # centroids_ = kmeans_init_res.centroids
        centroids_ = _k_init(X, nClusters, np.square(X).sum(axis=1), random_state)
    elif is_string(cluster_centers_0, 'random'):
        _seed = random_state.randint(np.iinfo('i').max)
        daal_engine = daal4py.engines_mt19937(seed=_seed, fptype=X_fptype, method='defaultDense')
        kmeans_init = daal4py.kmeans_init(nClusters, fptype=X_fptype,
                                          method='randomDense', engine=daal_engine)
        kmeans_init_res = kmeans_init.compute(X)
        centroids_ = kmeans_init_res.centroids
    elif hasattr(cluster_centers_0, '__array__'):
        cc_arr = np.ascontiguousarray(cluster_centers_0, dtype=X.dtype)
        _validate_center_shape(X, nClusters, cc_arr)
        centroids_ = cc_arr
    elif callable(cluster_centers_0):
        cc_arr = cluster_centers_0(X, nClusters, random_state)
        cc_arr = np.ascontiguousarray(cc_arr, dtype=X.dtype)
        _validate_center_shape(X, nClusters, cc_arr)
        centroids_ = cc_arr
    elif is_string(cluster_centers_0, 'deterministic'):
        kmeans_init = daal4py.kmeans_init(nClusters, fptype=X_fptype, method='defaultDense')
        kmeans_init_res = kmeans_init.compute(X)
        centroids_ = kmeans_init_res.centroids
    else:
        raise ValueError("Cluster centers should either be 'k-means++', 'random', 'deterministic' or an array")
    return centroids_
def _daal4py_compute_starting_centroids(X, X_fptype, nClusters, cluster_centers_0, random_state):

    def is_string(s, target_str):
        return isinstance(s, str) and s == target_str

    deterministic = False
    if is_string(cluster_centers_0, 'k-means++'):
        _seed = random_state.randint(np.iinfo('i').max)
        daal_engine = daal4py.engines_mt19937(fptype=X_fptype, method='defaultDense', seed=_seed)
        _n_local_trials = 2 + int(np.log(nClusters))
        kmeans_init = daal4py.kmeans_init(nClusters, fptype=X_fptype, nTrials=_n_local_trials,
                                          method='plusPlusDense', engine=daal_engine)
        kmeans_init_res = kmeans_init.compute(X)
        centroids_ = kmeans_init_res.centroids
    elif is_string(cluster_centers_0, 'random'):
        _seed = random_state.randint(np.iinfo('i').max)
        daal_engine = daal4py.engines_mt19937(seed=_seed, fptype=X_fptype, method='defaultDense')
        kmeans_init = daal4py.kmeans_init(nClusters, fptype=X_fptype,
                                          method='randomDense', engine=daal_engine)
        kmeans_init_res = kmeans_init.compute(X)
        centroids_ = kmeans_init_res.centroids
    elif hasattr(cluster_centers_0, '__array__'):
        deterministic = True
        cc_arr = np.ascontiguousarray(cluster_centers_0, dtype=X.dtype)
        _validate_center_shape(X, nClusters, cc_arr)
        centroids_ = cc_arr
    elif callable(cluster_centers_0):
        cc_arr = cluster_centers_0(X, nClusters, random_state)
        cc_arr = np.ascontiguousarray(cc_arr, dtype=X.dtype)
        _validate_center_shape(X, nClusters, cc_arr)
        centroids_ = cc_arr
    elif is_string(cluster_centers_0, 'deterministic'):
        deterministic = True
        kmeans_init = daal4py.kmeans_init(nClusters, fptype=X_fptype, method='defaultDense')
        kmeans_init_res = kmeans_init.compute(X)
        centroids_ = kmeans_init_res.centroids
    else:
        raise ValueError("Cluster centers should either be 'k-means++', "
                         "'random', 'deterministic' or an array")
    return deterministic, centroids_
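
# Illustrative sketch only, not part of the original module: exercising the helper
# above directly. Assumes daal4py is installed and that the module-level imports
# (numpy as np, sklearn.utils.check_random_state) are available. The fptype string
# 'double' matches float64 input; 'float' would be used for float32.
def _example_daal4py_starting_centroids():
    import numpy as np
    from sklearn.utils import check_random_state

    X_demo = np.asarray(np.random.RandomState(0).random_sample((100, 4)), dtype=np.float64)
    deterministic, centroids = _daal4py_compute_starting_centroids(
        X_demo, 'double', 3, 'k-means++', check_random_state(42))
    print(deterministic, centroids.shape)  # expected: False (3, 4)
    return centroids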
def subspace_k_means(X, n_clusters, sample_weight=None, init='k-means++',
                     n_init=10, max_iter=300, tol=1e-4, tol_eig=-1e-10,
                     verbose=False, random_state=None, copy_x=True,
                     n_jobs=1, return_n_iter=False):
    if sp.issparse(X):
        raise ValueError("SubspaceKMeans does not support sparse matrix")
    if n_init <= 0:
        raise ValueError("Invalid number of initializations."
                         " n_init=%d must be bigger than zero." % n_init)
    random_state = check_random_state(random_state)

    if max_iter <= 0:
        raise ValueError('Number of iterations should be a positive number,'
                         ' got %d instead' % max_iter)

    X = as_float_array(X, copy=copy_x)
    tol = _tolerance(X, tol)

    # Validate init array
    if hasattr(init, '__array__'):
        init = check_array(init, dtype=X.dtype.type, copy=True)
        _validate_center_shape(X, n_clusters, init)

        if n_init != 1:
            warnings.warn(
                'Explicit initial center position passed: '
                'performing only one init in k-means instead of n_init=%d'
                % n_init, RuntimeWarning, stacklevel=2)
            n_init = 1

    # Subtract the mean of X for more accurate distance computations
    X_mean = X.mean(axis=0)
    # The copy was already done above
    X -= X_mean

    if hasattr(init, '__array__'):
        init -= X_mean

    # Precompute squared norms of data points
    x_squared_norms = row_norms(X, squared=True)

    best_labels, best_inertia, best_centers = None, None, None

    seeds = random_state.randint(np.iinfo(np.int32).max, size=n_init)

    if n_jobs == 1:
        # For a single thread, less memory is needed if we just store one set
        # of the best results (as opposed to one set per run per thread).
        for it in range(n_init):
            # run a k-means once
            labels, inertia, centers, n_iter_ = subspace_kmeans_single(
                X, sample_weight, n_clusters, init=init, max_iter=max_iter,
                tol=tol, tol_eig=tol_eig, verbose=verbose,
                x_squared_norms=x_squared_norms, random_state=seeds[it])
            # determine if these results are the best so far
            if best_inertia is None or inertia < best_inertia:
                best_labels = labels.copy()
                best_centers = centers.copy()
                best_inertia = inertia
                best_n_iter = n_iter_
    else:
        # parallelisation of k-means runs
        results = Parallel(n_jobs=n_jobs, verbose=0)(
            delayed(subspace_kmeans_single)(
                X, sample_weight, n_clusters, init=init, max_iter=max_iter,
                tol=tol, tol_eig=tol_eig, verbose=verbose,
                x_squared_norms=x_squared_norms,
                # Change seed to ensure variety
                random_state=seed)
            for seed in seeds)
        # Get results with the lowest inertia
        labels, inertia, centers, n_iters = zip(*results)
        best = np.argmin(inertia)
        best_labels = labels[best]
        best_inertia = inertia[best]
        best_centers = centers[best]
        best_n_iter = n_iters[best]

    if not copy_x:
        X += X_mean
    best_centers += X_mean

    if return_n_iter:
        return best_centers, best_labels, best_inertia, best_n_iter
    else:
        return best_centers, best_labels, best_inertia
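
# Minimal usage sketch, not part of the original module: running subspace_k_means as
# defined above on a small dense dataset. Assumes numpy is installed and that
# subspace_kmeans_single and the other private helpers referenced above are available
# in this module.
def _example_subspace_k_means():
    import numpy as np

    rng = np.random.RandomState(0)
    # Three well-separated blobs in 5 dimensions.
    X_demo = np.vstack([rng.normal(loc=c, size=(50, 5)) for c in (-3.0, 0.0, 3.0)])
    centers, labels, inertia = subspace_k_means(X_demo, n_clusters=3, random_state=0)
    print(centers.shape, labels.shape, inertia)  # expected shapes: (3, 5) (150,)
    return centers, labels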
def k_means_constrained(X, n_clusters, size_min=None, size_max=None, init='k-means++',
                        n_init=10, max_iter=300, verbose=False,
                        tol=1e-4, random_state=None, copy_x=True, n_jobs=1,
                        return_n_iter=False):
    """K-Means clustering with minimum and maximum cluster size constraints.

    Read more in the :ref:`User Guide <k_means>`.

    Parameters
    ----------
    X : array-like or sparse matrix, shape (n_samples, n_features)
        The observations to cluster.

    size_min : int, optional, default: None
        Constrain the label assignment so that each cluster has a minimum
        size of size_min. If None, no constraints will be applied.

    size_max : int, optional, default: None
        Constrain the label assignment so that each cluster has a maximum
        size of size_max. If None, no constraints will be applied.

    n_clusters : int
        The number of clusters to form as well as the number of
        centroids to generate.

    init : {'k-means++', 'random', or ndarray, or a callable}, optional
        Method for initialization, default to 'k-means++':

        'k-means++' : selects initial cluster centers for k-means
        clustering in a smart way to speed up convergence. See section
        Notes in k_init for more details.

        'random': generate k centroids from a Gaussian with mean and
        variance estimated from the data.

        If an ndarray is passed, it should be of shape (n_clusters, n_features)
        and gives the initial centers.

        If a callable is passed, it should take arguments X, k and
        a random state and return an initialization.

    n_init : int, optional, default: 10
        Number of times the k-means algorithm will be run with different
        centroid seeds. The final results will be the best output of
        n_init consecutive runs in terms of inertia.

    max_iter : int, optional, default: 300
        Maximum number of iterations of the k-means algorithm to run.

    verbose : boolean, optional
        Verbosity mode.

    tol : float, optional
        The relative increment in the results before declaring convergence.

    random_state : int, RandomState instance or None, optional, default: None
        If int, random_state is the seed used by the random number generator;
        If RandomState instance, random_state is the random number generator;
        If None, the random number generator is the RandomState instance used
        by `np.random`.

    copy_x : boolean, optional
        When pre-computing distances it is more numerically accurate to center
        the data first. If copy_x is True, then the original data is not
        modified. If False, the original data is modified, and put back before
        the function returns, but small numerical differences may be introduced
        by subtracting and then adding the data mean.

    n_jobs : int
        The number of jobs to use for the computation. This works by computing
        each of the n_init runs in parallel.

        If -1 all CPUs are used. If 1 is given, no parallel computing code is
        used at all, which is useful for debugging. For n_jobs below -1,
        (n_cpus + 1 + n_jobs) are used. Thus for n_jobs = -2, all CPUs but one
        are used.

    return_n_iter : bool, optional
        Whether or not to return the number of iterations.

    Returns
    -------
    centroid : float ndarray with shape (k, n_features)
        Centroids found at the last iteration of k-means.

    label : integer ndarray with shape (n_samples,)
        label[i] is the code or index of the centroid the
        i'th observation is closest to.

    inertia : float
        The final value of the inertia criterion (sum of squared distances to
        the closest centroid for all observations in the training set).

    best_n_iter : int
        Number of iterations corresponding to the best results.
        Returned only if `return_n_iter` is set to True.
    """
    if n_init <= 0:
        raise ValueError("Invalid number of initializations."
                         " n_init=%d must be bigger than zero." % n_init)
    random_state = check_random_state(random_state)

    if max_iter <= 0:
        raise ValueError('Number of iterations should be a positive number,'
                         ' got %d instead' % max_iter)

    X = as_float_array(X, copy=copy_x)
    tol = _tolerance(X, tol)

    # Validate init array
    if hasattr(init, '__array__'):
        init = check_array(init, dtype=X.dtype.type, copy=True)
        _validate_center_shape(X, n_clusters, init)

        if n_init != 1:
            warnings.warn(
                'Explicit initial center position passed: '
                'performing only one init in k-means instead of n_init=%d'
                % n_init, RuntimeWarning, stacklevel=2)
            n_init = 1

    # Subtract the mean of X for more accurate distance computations
    if not sp.issparse(X):
        X_mean = X.mean(axis=0)
        # The copy was already done above
        X -= X_mean

        if hasattr(init, '__array__'):
            init -= X_mean

    # Precompute squared norms of data points
    x_squared_norms = row_norms(X, squared=True)

    best_labels, best_inertia, best_centers = None, None, None

    if n_jobs == 1:
        # For a single thread, less memory is needed if we just store one set
        # of the best results (as opposed to one set per run per thread).
        for it in range(n_init):
            # run a k-means once
            labels, inertia, centers, n_iter_ = kmeans_constrained_single(
                X, n_clusters, size_min=size_min, size_max=size_max,
                max_iter=max_iter, init=init, verbose=verbose, tol=tol,
                x_squared_norms=x_squared_norms, random_state=random_state)
            # determine if these results are the best so far
            if best_inertia is None or inertia < best_inertia:
                best_labels = labels.copy()
                best_centers = centers.copy()
                best_inertia = inertia
                best_n_iter = n_iter_
    else:
        # parallelisation of k-means runs
        seeds = random_state.randint(np.iinfo(np.int32).max, size=n_init)
        results = Parallel(n_jobs=n_jobs, verbose=0)(
            delayed(kmeans_constrained_single)(
                X, n_clusters, size_min=size_min, size_max=size_max,
                max_iter=max_iter, init=init, verbose=verbose, tol=tol,
                x_squared_norms=x_squared_norms,
                # Change seed to ensure variety
                random_state=seed)
            for seed in seeds)
        # Get results with the lowest inertia
        labels, inertia, centers, n_iters = zip(*results)
        best = np.argmin(inertia)
        best_labels = labels[best]
        best_inertia = inertia[best]
        best_centers = centers[best]
        best_n_iter = n_iters[best]

    if not sp.issparse(X):
        if not copy_x:
            X += X_mean
        best_centers += X_mean

    if return_n_iter:
        return best_centers, best_labels, best_inertia, best_n_iter
    else:
        return best_centers, best_labels, best_inertia
def spherical_k_means(X, n_clusters, init='k-means++', n_init=10,
                      max_iter=300, verbose=False, tol=1e-4, random_state=None,
                      copy_x=True, n_jobs=1, algorithm="auto", return_n_iter=False):
    """Modified from sklearn.cluster.k_means_.k_means.
    """
    if n_init <= 0:
        raise ValueError("Invalid number of initializations."
                         " n_init=%d must be bigger than zero." % n_init)
    random_state = check_random_state(random_state)

    if max_iter <= 0:
        raise ValueError('Number of iterations should be a positive number,'
                         ' got %d instead' % max_iter)

    best_inertia = np.infty
    X = as_float_array(X, copy=copy_x)
    tol = _tolerance(X, tol)

    if hasattr(init, '__array__'):
        init = check_array(init, dtype=X.dtype.type, copy=True)
        _validate_center_shape(X, n_clusters, init)

        if n_init != 1:
            warnings.warn(
                'Explicit initial center position passed: '
                'performing only one init in k-means instead of n_init=%d'
                % n_init, RuntimeWarning, stacklevel=2)
            n_init = 1

    # precompute squared norms of data points
    x_squared_norms = row_norms(X, squared=True)

    if n_jobs == 1:
        # For a single thread, less memory is needed if we just store one set
        # of the best results (as opposed to one set per run per thread).
        for it in range(n_init):
            # run a k-means once
            labels, inertia, centers, n_iter_ = _spherical_kmeans_single_lloyd(
                X, n_clusters, max_iter=max_iter, init=init, verbose=verbose,
                tol=tol, x_squared_norms=x_squared_norms,
                random_state=random_state)
            # determine if these results are the best so far
            if best_inertia is None or inertia < best_inertia:
                best_labels = labels.copy()
                best_centers = centers.copy()
                best_inertia = inertia
                best_n_iter = n_iter_
    else:
        # parallelisation of k-means runs
        seeds = random_state.randint(np.iinfo(np.int32).max, size=n_init)
        results = Parallel(n_jobs=n_jobs, verbose=0)(
            delayed(_spherical_kmeans_single_lloyd)(
                X, n_clusters, max_iter=max_iter, init=init, verbose=verbose,
                tol=tol, x_squared_norms=x_squared_norms,
                # Change seed to ensure variety
                random_state=seed)
            for seed in seeds)
        # Get results with the lowest inertia
        labels, inertia, centers, n_iters = zip(*results)
        best = np.argmin(inertia)
        best_labels = labels[best]
        best_inertia = inertia[best]
        best_centers = centers[best]
        best_n_iter = n_iters[best]

    if return_n_iter:
        return best_centers, best_labels, best_inertia, best_n_iter
    else:
        return best_centers, best_labels, best_inertia
def spherical_k_means(
    X,
    n_clusters,
    sample_weight=None,
    init="k-means++",
    n_init=10,
    max_iter=300,
    verbose=False,
    tol=1e-4,
    random_state=None,
    copy_x=True,
    n_jobs=1,
    algorithm="auto",
    return_n_iter=False,
):
    """Modified from sklearn.cluster.k_means_.k_means.
    """
    if n_init <= 0:
        raise ValueError(
            "Invalid number of initializations."
            " n_init=%d must be bigger than zero." % n_init
        )
    random_state = check_random_state(random_state)

    if max_iter <= 0:
        raise ValueError(
            "Number of iterations should be a positive number,"
            " got %d instead" % max_iter
        )

    best_inertia = np.infty
    # avoid forcing order when copy_x=False
    order = "C" if copy_x else None
    X = check_array(
        X, accept_sparse="csr", dtype=[np.float64, np.float32], order=order, copy=copy_x
    )
    # verify that the number of samples given is larger than k
    if _num_samples(X) < n_clusters:
        raise ValueError(
            "n_samples=%d should be >= n_clusters=%d" % (_num_samples(X), n_clusters)
        )
    tol = _tolerance(X, tol)

    if hasattr(init, "__array__"):
        init = check_array(init, dtype=X.dtype.type, order="C", copy=True)
        _validate_center_shape(X, n_clusters, init)

        if n_init != 1:
            warnings.warn(
                "Explicit initial center position passed: "
                "performing only one init in k-means instead of n_init=%d" % n_init,
                RuntimeWarning,
                stacklevel=2,
            )
            n_init = 1

    # precompute squared norms of data points
    x_squared_norms = row_norms(X, squared=True)

    if n_jobs == 1:
        # For a single thread, less memory is needed if we just store one set
        # of the best results (as opposed to one set per run per thread).
        for it in range(n_init):
            # run a k-means once
            labels, inertia, centers, n_iter_ = _spherical_kmeans_single_lloyd(
                X,
                n_clusters,
                sample_weight,
                max_iter=max_iter,
                init=init,
                verbose=verbose,
                tol=tol,
                x_squared_norms=x_squared_norms,
                random_state=random_state,
            )
            # determine if these results are the best so far
            if best_inertia is None or inertia < best_inertia:
                best_labels = labels.copy()
                best_centers = centers.copy()
                best_inertia = inertia
                best_n_iter = n_iter_
    else:
        # parallelisation of k-means runs
        seeds = random_state.randint(np.iinfo(np.int32).max, size=n_init)
        results = Parallel(n_jobs=n_jobs, verbose=0)(
            delayed(_spherical_kmeans_single_lloyd)(
                X,
                n_clusters,
                sample_weight,
                max_iter=max_iter,
                init=init,
                verbose=verbose,
                tol=tol,
                x_squared_norms=x_squared_norms,
                # Change seed to ensure variety
                random_state=seed,
            )
            for seed in seeds
        )
        # Get results with the lowest inertia
        labels, inertia, centers, n_iters = zip(*results)
        best = np.argmin(inertia)
        best_labels = labels[best]
        best_inertia = inertia[best]
        best_centers = centers[best]
        best_n_iter = n_iters[best]

    if return_n_iter:
        return best_centers, best_labels, best_inertia, best_n_iter
    else:
        return best_centers, best_labels, best_inertia
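
# Illustrative sketch, not part of the original module: spherical k-means as defined
# above works on directional data, so rows are L2-normalized before clustering.
# Assumes numpy/scikit-learn are installed and that _spherical_kmeans_single_lloyd
# and the other helpers referenced above are available in this module.
def _example_spherical_k_means():
    import numpy as np
    from sklearn.preprocessing import normalize

    rng = np.random.RandomState(0)
    X_demo = normalize(rng.normal(size=(200, 10)))  # unit-norm rows
    centers, labels, inertia = spherical_k_means(
        X_demo, n_clusters=4, n_init=3, random_state=0
    )
    print(centers.shape, np.bincount(labels), inertia)
    return centers, labels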
def movMF(X, n_clusters, posterior_type='soft', force_weights=None,
          n_init=10, n_jobs=1, max_iter=300, verbose=False,
          init='random-class', random_state=None, tol=1e-6, copy_x=True, Int=None):
    """Wrapper for parallelization of _movMF and running n_init times.
    """
    if n_init <= 0:
        raise ValueError("Invalid number of initializations."
                         " n_init=%d must be bigger than zero." % n_init)
    random_state = check_random_state(random_state)

    if max_iter <= 0:
        raise ValueError('Number of iterations should be a positive number,'
                         ' got %d instead' % max_iter)

    best_inertia = np.infty
    X = as_float_array(X, copy=copy_x)
    tol = _tolerance(X, tol)

    if hasattr(init, '__array__'):
        init = check_array(init, dtype=X.dtype.type, copy=True)
        _validate_center_shape(X, n_clusters, init)

        if n_init != 1:
            warnings.warn(
                'Explicit initial center position passed: '
                'performing only one init in k-means instead of n_init=%d'
                % n_init, RuntimeWarning, stacklevel=2)
            n_init = 1

    # defaults
    best_centers = None
    best_labels = None
    best_weights = None
    best_concentrations = None
    best_posterior = None
    best_inertia = None

    # if I==-1:
    #     chosenmovmf = _movMFI
    # else:
    #     chosenmovmf = _movMF

    if n_jobs == 1:
        # For a single thread, less memory is needed if we just store one set
        # of the best results (as opposed to one set per run per thread).
        for it in range(n_init):
            # cluster on the sphere
            (centers, weights, concentrations, posterior, labels,
             inertia) = _movMF(X, n_clusters,
                               posterior_type=posterior_type,
                               force_weights=force_weights,
                               max_iter=max_iter, verbose=verbose,
                               init=init, random_state=random_state,
                               tol=tol, Int=Int)
            # determine if these results are the best so far
            if best_inertia is None or inertia < best_inertia:
                best_centers = centers.copy()
                best_labels = labels.copy()
                best_weights = weights.copy()
                best_concentrations = concentrations.copy()
                best_posterior = posterior.copy()
                best_inertia = inertia
    else:
        # parallelisation of movMF runs
        seeds = random_state.randint(np.iinfo(np.int32).max, size=n_init)
        results = Parallel(n_jobs=n_jobs, verbose=0)(
            delayed(_movMF)(X, n_clusters,
                            posterior_type=posterior_type,
                            force_weights=force_weights,
                            max_iter=max_iter, verbose=verbose,
                            init=init,
                            # Change seed to ensure variety
                            random_state=seed,
                            tol=tol, Int=Int)
            for seed in seeds)
        # Get results with the lowest inertia
        centers, weights, concentrations, posterior, labels, inertia = (
            zip(*results))
        best = np.argmin(inertia)
        best_labels = labels[best]
        best_inertia = inertia[best]
        best_centers = centers[best]
        best_concentrations = concentrations[best]
        best_weights = weights[best]
        best_posterior = posterior[best]

    return (best_centers, best_labels, best_inertia, best_weights,
            best_concentrations, best_posterior)
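
# Illustrative sketch, not part of the original module: fitting a mixture of
# von Mises-Fisher distributions with the movMF wrapper above on L2-normalized data.
# Assumes numpy/scikit-learn are installed and that _movMF and the other helpers
# referenced above are available; the extra `Int` argument is left at its default.
def _example_movMF():
    import numpy as np
    from sklearn.preprocessing import normalize

    rng = np.random.RandomState(0)
    X_demo = normalize(rng.normal(size=(150, 8)))  # directional data on the unit sphere
    centers, labels, inertia, weights, concentrations, posterior = movMF(
        X_demo, n_clusters=3, posterior_type='soft', n_init=2, random_state=0)
    print(centers.shape, weights, concentrations)
    return labels, posterior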