def execute(cls, ctx, op: "KMeansPlusPlusInit"):
    """Run k-means++ seeding for ``op`` and store the chosen centers.

    Moves the input array and its precomputed squared row norms onto the
    same device, then delegates the actual seeding to scikit-learn's
    ``_k_init`` and writes the result under the op's output key.
    """
    # ``_k_init`` moved from ``sklearn.cluster.k_means_`` to
    # ``sklearn.cluster._kmeans`` in newer releases; prefer the new path.
    try:
        from sklearn.cluster._kmeans import _k_init
    except ImportError:  # pragma: no cover
        from sklearn.cluster.k_means_ import _k_init

    raw_inputs = [ctx[inp.key] for inp in op.inputs]
    (x, x_squared_norms), device_id, _ = as_same_device(
        raw_inputs, device=op.device, ret_extra=True)

    with device(device_id):
        ctx[op.outputs[0].key] = _k_init(
            x, op.n_clusters, x_squared_norms, op.state, op.n_local_trials)
def fit(self, X):
    """Project X to a lower dimension and seed cluster centers.

    Stores the projection vectors, the projected data and its
    per-dimension extrema, plus k-means++ centers computed both in the
    projected space (``JLKcenters``) and in the original high-dimensional
    space (``KMcenters``).
    """
    self.high_dim_data = X
    self.highdim = self.high_dim_data.shape[1]

    print("Projecting {}-dimensional data to {} dimensions ...".format(
        self.highdim, self.n_dims))
    t0 = time.time()
    # One random projection vector per target dimension.
    self.proj_vectors = [rand_vector(self.highdim)
                         for _ in range(self.n_dims)]
    if self.n_dims == self.highdim:
        # Target dimensionality equals the input's: skip projecting and
        # just keep a copy of the data.
        self.low_dim_data = np.copy(X)
    else:
        self.low_dim_data = [np.array(self.project(row))
                             for row in self.high_dim_data]
    print("Projection completed in {} seconds.".format(time.time() - t0))

    # Per-dimension extrema of the projected data.
    columns = list(zip(*self.low_dim_data))
    self.datamax = [max(col) for col in columns]
    self.datamin = [min(col) for col in columns]

    x_squared_norms = row_norms(self.low_dim_data, squared=True)
    t0 = time.time()
    # k-means++ seeding in the projected space and, for comparison, in
    # the original high-dimensional space.
    jlk_seed_centers = _k_init(np.array(self.low_dim_data), self.n_clusters,
                               x_squared_norms, RandomState())
    km_seed_centers = _k_init(np.array(self.high_dim_data), self.n_clusters,
                              row_norms(self.high_dim_data, squared=True),
                              RandomState())
    print("Initialization of centers completed in {} seconds.".format(
        time.time() - t0))

    self.JLKcenters = [HDNode(center=c) for c in jlk_seed_centers]
    self.KMcenters = [HDNode(center=np.copy(c)) for c in km_seed_centers]
def _get_initialization(self):
    """Run k-means++ seeding once per configured seed.

    For each seed in ``self.seeds``, computes ``self.n_clusters`` initial
    centers from ``self.X_norm`` and stores the list of initializations
    on ``self.inits``.
    """
    # Row-wise squared norms. ``np.einsum`` performs the same per-row
    # dot product as the previous ``np.apply_along_axis(np.inner, ...)``
    # but in a single vectorized C call instead of a Python-level loop.
    squared_norms = np.einsum('ij,ij->i', self.X_norm, self.X_norm)
    self.inits = [
        _k_init(X=self.X_norm,
                n_clusters=self.n_clusters,
                x_squared_norms=squared_norms,
                random_state=seed)
        for seed in self.seeds
    ]
def init_model(
    lm_model,
    tokenizer,
    data_loader,
    n_clusters,
    embedding_extractor=concat_cls_n_hidden_states,
    device='cpu',
    random_state=None,
    **kwargs,
):
    """Build a ClusterLM whose centroids are k-means++ seeds over the
    corpus embeddings.

    Parameters
    ----------
    lm_model : model whose ``base_model`` produces the hidden states.
    tokenizer : tokenizer matching ``lm_model``.
    data_loader : iterable yielding ``(batch_texts, batch_labels)`` pairs.
    n_clusters : int, number of centroids to initialize.
    embedding_extractor : callable mapping model outputs to a 2-D array
        of per-example embeddings.
    device : device the forward passes run on.
    random_state : ``np.random.RandomState`` or None. If None, a fresh
        ``RandomState(42)`` is created for this call.
    **kwargs : forwarded to ``ClusterLM``.

    Returns
    -------
    (model, initial_centroids, initial_embeddings)
    """
    # BUG FIX: the default used to be ``random_state=np.random.RandomState(42)``.
    # Default arguments are evaluated once at definition time, so repeated
    # calls shared and advanced the same generator, silently producing
    # different centroids per call. A fresh generator per call restores the
    # intended reproducibility.
    if random_state is None:
        random_state = np.random.RandomState(42)

    initial_embeddings = []
    labels = []  # collected but not returned — presumably for debugging; kept for parity
    for batch_texts, batch_labels in data_loader:
        inputs = tokenizer(list(batch_texts), return_tensors='pt',
                           padding=True, truncation=True)
        inputs = inputs.to(device)
        with torch.no_grad():  # inference only; no gradients needed
            outputs = lm_model.base_model(**inputs)
        extracted_embeddings = embedding_extractor(
            outputs).cpu().detach().numpy()
        initial_embeddings.append(extracted_embeddings)
        labels.extend(batch_labels.numpy().astype('int'))
    initial_embeddings = np.vstack(initial_embeddings)

    # k-means++ seeding over the full embedding matrix.
    initial_centroids = _k_init(
        initial_embeddings,
        n_clusters=n_clusters,
        x_squared_norms=row_norms(initial_embeddings, squared=True),
        random_state=random_state)

    model = ClusterLM(lm_model=lm_model,
                      tokenizer=tokenizer,
                      embedding_extractor=embedding_extractor,
                      initial_centroids=torch.from_numpy(initial_centroids),
                      device=device,
                      **kwargs)
    return model, initial_centroids, initial_embeddings
def get_initialization_centroids(self, input_data):
    """Return initial centroids according to the "--initialization" setting.

    :param input_data: Matrix shape nxd (n: number of observations;
        d: dimensionality of observations)
    :return: array of shape (self["--nb-cluster"], d)
    :raises NotImplementedError: for an unknown initialization name.
    """
    strategy = self["--initialization"]
    if strategy == "random":
        # BUG FIX: the shape tuple must be the ``size`` argument;
        # previously it was passed positionally as ``loc`` (the mean),
        # which broadcast to a length-2 vector instead of producing a
        # (nb_cluster, d) matrix of standard-normal centroids.
        return np.random.normal(
            size=(self["--nb-cluster"], input_data.shape[1]))
    elif strategy == "uniform_sampling":
        # Pick nb_cluster distinct observations uniformly at random.
        return input_data[np.random.permutation(
            input_data.shape[0])[:self["--nb-cluster"]]]
    elif strategy == "kmeans++":
        seed = np.random.RandomState(self["--seed"])
        x_squared_norms = row_norms(input_data, squared=True)
        centers = _k_init(input_data, self["--nb-cluster"],
                          x_squared_norms, random_state=seed)
        return centers
    else:
        raise NotImplementedError("Unknown initialization.")
def _kmeans_plusplus(*args, **kwargs):
    """Adapter around ``_k_init`` returning a two-tuple.

    NOTE(review): presumably mirrors the signature of newer scikit-learn's
    ``_kmeans_plusplus`` (which returns centers and indices); ``None``
    stands in for the second element — confirm against callers.
    """
    centers = _k_init(*args, **kwargs)
    return centers, None
"""Demo: compare k-means++ seeding against uniform random center picks."""
from sklearn.cluster import KMeans  # noqa: F401  NOTE(review): unused here; kept in case other code relies on it
import matplotlib.pyplot as plt
import numpy as np
from sklearn.cluster._kmeans import _k_init
from sklearn.utils.extmath import row_norms
from sklearn.datasets import make_blobs


def main(k=4, n_samples=1000, seed=0):
    """Generate blob data, seed ``k`` centers two ways, and plot both.

    :param k: number of clusters / centers to pick.
    :param n_samples: number of generated observations.
    :param seed: seed for the shared RandomState.
    """
    rng = np.random.RandomState(seed)
    X, _ = make_blobs(n_samples, centers=k)
    x_squared_norms = row_norms(X, squared=True)
    # k-means++ seeding vs. a uniform random sample of the data points.
    centers = _k_init(X, k, x_squared_norms, random_state=rng)
    random_centers = X[rng.permutation(X.shape[0])[:k]]

    plt.scatter(X[:, 0], X[:, 1], c="c")
    plt.scatter(centers[:, 0], centers[:, 1], c="g", label="kmeans++")
    plt.scatter(random_centers[:, 0], random_centers[:, 1], c="r",
                label="random uniform")
    plt.legend()
    plt.show()


# Guard the demo so importing this module has no plotting side effects.
if __name__ == "__main__":
    main()