def _fit(self, num_iters=10):
    """Run repeated sklearn KMeans fits on ``self.points`` and report them.

    Each of the ``num_iters`` runs builds a fresh estimator with
    ``self.num_clusters`` clusters, k-means++ initialisation, a single
    init (``n_init=1``), at most 50 iterations, and a run-specific seed
    (``run index * 42``). The resulting ``inertia_`` of every run is
    collected and handed to ``self._report`` together with the wall-clock
    start and end timestamps.

    Parameters
    ----------
    num_iters : int, optional
        Number of independent KMeans fits to perform (default 10).
    """
    inertias = []
    started_at = time.time()

    for run_idx in range(num_iters):
        print('Starting sklearn KMeans: %d' % run_idx)
        estimator = SklearnKMeans(
            n_clusters=self.num_clusters,
            init='k-means++',
            max_iter=50,
            n_init=1,
            tol=1e-4,
            # A distinct, reproducible seed per run.
            random_state=run_idx * 42,
        )
        estimator.fit(self.points)
        inertias.append(estimator.inertia_)

    finished_at = time.time()
    self._report(num_iters, started_at, finished_at, inertias)
def fit(self, X, K, sample_labels=None, estimator_params=None):
    """Fit a KMeans model to X and rank each cluster's members by distance.

    After fitting, every sample is grouped by its assigned cluster and the
    Euclidean distance to that cluster's centroid is recorded, sorted in
    ascending order, in ``self.sample_distances_``.

    Parameters
    ----------
    X : ndarray, shape (n_samples, n_features)
        Training data. It is indexed with a boolean row mask and its
        ``shape`` is read, so it must behave like a 2-D NumPy array.
    K : int
        The number of clusters.
    sample_labels : array-like, shape (n_samples,), optional
        Labels for each of the samples in X. When omitted, labels
        ``"sample_0"``, ``"sample_1"``, ... are generated.
    estimator_params : dict, optional
        Parameters applied to the KMeans estimator via ``set_params``
        before fitting.

    Returns
    -------
    self or None
        ``self`` on success. ``None`` is returned as soon as a cluster
        without any member is encountered; in that case
        ``self.sample_distances_`` only holds the clusters processed
        before the empty one.
    """
    self._reset()

    # Note: previously set n_init=50
    self.model_ = SklearnKMeans(K)
    if estimator_params is not None:
        assert isinstance(estimator_params, dict), \
            "estimator_params must be a dict"
        self.model_.set_params(**estimator_params)

    # Compute Kmeans model
    self.model_.fit(X)

    if sample_labels is None:
        sample_labels = ["sample_{}".format(i) for i in range(X.shape[0])]
    assert len(sample_labels) == X.shape[0], \
        "sample_labels must contain one label per row of X"
    self.sample_labels_ = np.array(sample_labels)
    self.n_clusters_ = K

    # Record sample label/distance from its cluster center
    self.sample_distances_ = OrderedDict()
    for cluster_label in range(self.n_clusters_):
        # Build the membership mask once and reuse it for rows and labels.
        in_cluster = self.cluster_labels_ == cluster_label
        member_rows = X[in_cluster, :]
        member_labels = self.sample_labels_[in_cluster]

        # "All clusters must have at least 1 member!"
        if member_rows.shape[0] == 0:
            return None

        # One vectorized call: distances of all members to the centroid.
        # cdist needs 2-D inputs, so the centroid becomes a (1, n_features)
        # array and the (n_members, 1) result is flattened to 1-D.
        centroid = np.expand_dims(self.cluster_centers_[cluster_label], axis=0)
        dists = cdist(member_rows, centroid, "euclidean")[:, 0]

        # Sort the distances/labels in ascending order
        sort_order = np.argsort(dists)
        self.sample_distances_[cluster_label] = {
            "sample_labels": member_labels[sort_order],
            "distances": dists[sort_order],
        }
    return self
def fit(self, X, K, sample_labels=None, estimator_params=None):
    """Fits a Sklearn KMeans model to X.

    Besides fitting the estimator, the samples of every cluster are
    recorded in ``self.sample_distances_`` together with their Euclidean
    distance to the cluster centroid, sorted in ascending order.

    Parameters
    ----------
    X : ndarray, shape (n_samples, n_features)
        Training data. It is indexed with a boolean row mask and its
        ``shape`` is read, so it must behave like a 2-D NumPy array.
    K : int
        The number of clusters.
    sample_labels : array-like, shape (n_samples), optional
        Labels for each of the samples in X. When omitted, labels
        ``"sample_0"``, ``"sample_1"``, ... are generated.
    estimator_params : dict, optional
        The parameters to pass to the KMeans estimators (applied with
        ``set_params`` before fitting).

    Returns
    -------
    self or None
        ``self`` on success. ``None`` is returned as soon as a cluster
        without any member is encountered; in that case
        ``self.sample_distances_`` only holds the clusters processed
        before the empty one.
    """
    self._reset()

    # Note: previously set n_init=50
    self.model_ = SklearnKMeans(K)
    if estimator_params is not None:
        assert isinstance(estimator_params, dict), \
            "estimator_params must be a dict"
        self.model_.set_params(**estimator_params)

    # Compute Kmeans model
    self.model_.fit(X)

    if sample_labels is None:
        sample_labels = ["sample_{}".format(i) for i in range(X.shape[0])]
    assert len(sample_labels) == X.shape[0], \
        "sample_labels must contain one label per row of X"
    self.sample_labels_ = np.array(sample_labels)
    self.n_clusters_ = K

    # Record sample label/distance from its cluster center
    self.sample_distances_ = OrderedDict()
    for cluster_label in range(self.n_clusters_):
        # Build the membership mask once and reuse it for rows and labels.
        in_cluster = self.cluster_labels_ == cluster_label
        member_rows = X[in_cluster, :]
        member_labels = self.sample_labels_[in_cluster]

        # "All clusters must have at least 1 member!"
        if member_rows.shape[0] == 0:
            return None

        # One vectorized call: distances of all members to the centroid.
        # cdist needs 2-D inputs, so the centroid becomes a (1, n_features)
        # array and the (n_members, 1) result is flattened to 1-D.
        centroid = np.expand_dims(self.cluster_centers_[cluster_label], axis=0)
        dists = cdist(member_rows, centroid, "euclidean")[:, 0]

        # Sort the distances/labels in ascending order
        sort_order = np.argsort(dists)
        self.sample_distances_[cluster_label] = {
            "sample_labels": member_labels[sort_order],
            "distances": dists[sort_order],
        }
    return self