Example #1
0
    def execute(cls, ctx, op: "KMeansPlusPlusInit"):
        """Run k-means++ seeding for this op and store the centers in *ctx*."""
        # sklearn relocated the seeding helper; prefer the modern module path
        # and fall back to the legacy one for older releases.
        try:
            from sklearn.cluster._kmeans import _k_init
        except ImportError:  # pragma: no cover
            from sklearn.cluster.k_means_ import _k_init

        raw_inputs = [ctx[inp.key] for inp in op.inputs]
        (x, x_squared_norms), device_id, _ = as_same_device(
            raw_inputs, device=op.device, ret_extra=True)

        with device(device_id):
            centers = _k_init(x, op.n_clusters, x_squared_norms,
                              op.state, op.n_local_trials)
            ctx[op.outputs[0].key] = centers
    def fit(self, X):
        """Randomly project *X* to ``self.n_dims`` dimensions, then seed
        cluster centers with k-means++ in both the low- and high-dimensional
        spaces.

        Side effects: sets ``high_dim_data``, ``highdim``, ``proj_vectors``,
        ``low_dim_data``, ``datamax``/``datamin``, ``JLKcenters`` (low-dim
        seeds) and ``KMcenters`` (high-dim seeds).
        """
        self.high_dim_data = X
        # Input dimensionality (assumes X is 2-D: samples x features).
        self.highdim = self.high_dim_data.shape[1]

        print("Projecting {}-dimensional data to {} dimensions ...".format(
            self.highdim, self.n_dims))
        starttime = time.time()
        # One projection vector per target dimension; rand_vector presumably
        # draws a random direction in the original space — TODO confirm.
        self.proj_vectors = [
            rand_vector(self.highdim) for n in range(self.n_dims)
        ]
        if self.n_dims == self.highdim:
            # Target dimension equals the input dimension: projection is a
            # no-op, so just copy the data.
            self.low_dim_data = np.copy(X)
        else:
            self.low_dim_data = [
                np.array(self.project(x)) for x in self.high_dim_data
            ]
        print("Projection completed in {} seconds.".format(time.time() -
                                                           starttime))

        # Per-dimension extrema of the projected data (zip(*...) transposes).
        self.datamax = [max(i) for i in zip(*self.low_dim_data)]
        self.datamin = [min(i) for i in zip(*self.low_dim_data)]

        x_squared_norms = row_norms(self.low_dim_data, squared=True)

        starttime = time.time()
        # k-means++ seeding on the projected data ...
        cents = _k_init(np.array(self.low_dim_data), self.n_clusters,
                        x_squared_norms, RandomState())
        # ... and, for comparison, on the original high-dimensional data.
        kmcents = _k_init(np.array(self.high_dim_data), self.n_clusters,
                          row_norms(self.high_dim_data, squared=True),
                          RandomState())
        print("Initialization of centers completed in {} seconds.".format(
            time.time() - starttime))

        # Wrap the seeds in HDNode containers (project type; semantics not
        # visible here).
        self.JLKcenters = [HDNode(center=cent) for cent in cents]

        self.KMcenters = [HDNode(center=np.copy(cent)) for cent in kmcents]
    def _get_initialization(self):
        """Seed centers with k-means++ once per entry in ``self.seeds``.

        Stores one center array per seed in ``self.inits``.
        """
        def sq_norm(row):
            # Squared Euclidean norm of a single row.
            return np.inner(row, row)

        norms = np.apply_along_axis(func1d=sq_norm,
                                    axis=1,
                                    arr=self.X_norm)
        self.inits = [
            _k_init(X=self.X_norm,
                    n_clusters=self.n_clusters,
                    x_squared_norms=norms,
                    random_state=seed)
            for seed in self.seeds
        ]
Example #4
0
def init_model(
        lm_model,
        tokenizer,
        data_loader,
        n_clusters,
        embedding_extractor=concat_cls_n_hidden_states,
        device='cpu',
        random_state=None,
        **kwargs,
):
    """Embed every batch from *data_loader* with *lm_model*, seed *n_clusters*
    centroids with k-means++, and build a :class:`ClusterLM` around them.

    Parameters mirror the original API; ``random_state`` now defaults to
    ``None`` and a fresh ``np.random.RandomState(42)`` is created per call.
    The previous default (``random_state=np.random.RandomState(42)`` in the
    signature) was a mutable default argument: it was evaluated once at
    definition time, so successive calls shared — and advanced — the same RNG,
    making results call-order dependent.

    Returns:
        (model, initial_centroids, initial_embeddings)
    """
    if random_state is None:
        random_state = np.random.RandomState(42)

    initial_embeddings = []
    # NOTE(review): the original also accumulated batch labels into a list
    # that was never read or returned; that dead work is dropped here.
    for batch_texts, _batch_labels in data_loader:
        inputs = tokenizer(list(batch_texts),
                           return_tensors='pt',
                           padding=True,
                           truncation=True)
        inputs = inputs.to(device)
        # No gradients needed: embeddings are used only for centroid seeding.
        with torch.no_grad():
            outputs = lm_model.base_model(**inputs)
        extracted_embeddings = embedding_extractor(
            outputs).cpu().detach().numpy()
        initial_embeddings.append(extracted_embeddings)

    initial_embeddings = np.vstack(initial_embeddings)

    # k-means++ seeding over all extracted embeddings.
    initial_centroids = _k_init(initial_embeddings,
                                n_clusters=n_clusters,
                                x_squared_norms=row_norms(initial_embeddings,
                                                          squared=True),
                                random_state=random_state)

    model = ClusterLM(lm_model=lm_model,
                      tokenizer=tokenizer,
                      embedding_extractor=embedding_extractor,
                      initial_centroids=torch.from_numpy(initial_centroids),
                      device=device,
                      **kwargs)

    return model, initial_centroids, initial_embeddings
Example #5
0
 def get_initialization_centroids(self, input_data):
     """Return initial centroids for *input_data*.

     :param input_data: Matrix shape nxd (n: number of observations; d: dimensionality of observations)
     :return: array of shape (nb_cluster, d) with the chosen centroids
     :raises NotImplementedError: for an unknown ``--initialization`` value
     """
     if self["--initialization"] == "random":
         # BUG FIX: the shape tuple must be passed as ``size=``. Previously it
         # was positionally bound to ``loc`` (the mean), so this returned an
         # array of shape (2,) instead of (nb_cluster, d).
         return np.random.normal(
             size=(self["--nb-cluster"], input_data.shape[1]))
     elif self["--initialization"] == "uniform_sampling":
         # Pick nb_cluster distinct observations uniformly at random.
         return input_data[np.random.permutation(
             input_data.shape[0])[:self["--nb-cluster"]]]
     elif self["--initialization"] == "kmeans++":
         seed = np.random.RandomState(self["--seed"])
         x_squared_norms = row_norms(input_data, squared=True)
         centers = _k_init(input_data,
                           self["--nb-cluster"],
                           x_squared_norms,
                           random_state=seed)
         return centers
     else:
         raise NotImplementedError("Unknown initialization.")
Example #6
0
 def _kmeans_plusplus(*args, **kwargs):
     """Compatibility shim: emulate the newer sklearn ``_kmeans_plusplus``
     return signature (centers, indices) on top of the older ``_k_init``,
     which returns centers only; indices are reported as ``None``.
     """
     centers = _k_init(*args, **kwargs)
     return centers, None
Example #7
0
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt
import numpy as np
from sklearn.cluster._kmeans import _k_init
from sklearn.utils.extmath import row_norms
from sklearn.datasets import make_blobs

# Demo: compare k-means++ seeding against uniform random center selection
# on a synthetic blob dataset.
NUM_CLUSTERS = 4
rng = np.random.RandomState(0)
data, labels = make_blobs(1000, centers=NUM_CLUSTERS)
sq_norms = row_norms(data, squared=True)
pp_centers = _k_init(data, NUM_CLUSTERS, sq_norms, random_state=rng)


# Baseline: pick the centers uniformly at random from the data points.
uniform_centers = data[rng.permutation(data.shape[0])[:NUM_CLUSTERS]]
plt.scatter(data[:, 0], data[:, 1], c="c")
plt.scatter(pp_centers[:, 0], pp_centers[:, 1], c="g", label="kmeans++")
plt.scatter(uniform_centers[:, 0], uniform_centers[:, 1], c="r", label="random uniform")
plt.legend()
plt.show()