def _initialize(self):
    """Set the initial weights, means and covs (with full covariance matrix).

    weights: the prior of the clusters (what percentage of data does a cluster have)
    means: the mean points of the clusters
    covs: the covariance matrix of the clusters

    Reads ``self.K``, ``self.init``, ``self.X``, ``self.n_samples`` and, for
    the 'kmeans' strategy, ``self.max_iters``.

    Raises:
        ValueError: if ``self.init`` is neither 'random' nor 'kmeans'.
    """
    self.weights = np.ones(self.K)
    if self.init == 'random':
        # Pick K distinct rows of X as the starting means.
        self.means = [
            self.X[x] for x in random.sample(range(self.n_samples), self.K)
        ]
        # Each component starts from the covariance of the whole data set.
        # The count has to come from self.K; a bare ``K`` is not defined here.
        self.covs = [np.cov(self.X.T) for _ in range(self.K)]
    elif self.init == 'kmeans':
        # Seed from a short KMeans run (a third of this model's iteration budget).
        kmeans = KMeans(K=self.K, max_iters=self.max_iters // 3, init='++')
        kmeans.fit(self.X)
        self.assignments = kmeans.predict()
        self.means = kmeans.centroids
        self.covs = []
        # NOTE(review): only labels that actually occur are visited, so a
        # cluster with no points keeps weight 1 and gets no covs entry.
        for i in np.unique(self.assignments):
            self.weights[int(i)] = (self.assignments == i).sum()
            self.covs.append(np.cov(self.X[self.assignments == i].T))
    else:
        raise ValueError('Unknown type of init parameter')
    # Turn the per-cluster counts (or the uniform ones) into proportions.
    self.weights /= self.weights.sum()
def kmeans_example(plot=False):
    """Run KMeans on synthetic blob data.

    The data comes from ``make_blobs`` (4 centers, 500 samples, 2 features,
    shuffled, ``random_state=42``). The number of clusters requested equals
    the number of distinct labels returned with the data.

    Args:
        plot: when truthy, call the model's ``plot()`` after prediction.
    """
    features, labels = make_blobs(
        centers=4,
        n_samples=500,
        n_features=2,
        shuffle=True,
        random_state=42,
    )
    n_clusters = len(np.unique(labels))

    model = KMeans(K=n_clusters, max_iters=150, init='++')
    model.fit(features)
    model.predict()

    if not plot:
        return
    model.plot()
def robust_example(plot=False):
    """Run KMeans with five clusters on the data returned by ``load_robust``.

    The labels returned alongside the data are not used.

    Args:
        plot: when truthy, call the model's ``plot()`` after prediction.
    """
    features, _labels = load_robust()
    n_clusters = 5

    model = KMeans(K=n_clusters, max_iters=50, init="++")
    model.fit(features)
    model.predict()

    if not plot:
        return
    model.plot()
def _initialize(self):
    """Set the initial weights, means and covs (with full covariance matrix).

    weights: the prior of the clusters (what percentage of data does a cluster have)
    means: the mean points of the clusters
    covs: the covariance matrix of the clusters

    Reads ``self.K``, ``self.init``, ``self.X``, ``self.n_samples`` and, for
    the 'kmeans' strategy, ``self.max_iters``.

    Raises:
        ValueError: if ``self.init`` is neither 'random' nor 'kmeans'.
    """
    self.weights = np.ones(self.K)
    if self.init == 'random':
        # Pick K distinct rows of X as the starting means.
        self.means = [self.X[x] for x in random.sample(range(self.n_samples), self.K)]
        # Each component starts from the covariance of the whole data set.
        # The count has to come from self.K; a bare ``K`` is not defined here.
        self.covs = [np.cov(self.X.T) for _ in range(self.K)]
    elif self.init == 'kmeans':
        # Seed from a short KMeans run (a third of this model's iteration budget).
        kmeans = KMeans(K=self.K, max_iters=self.max_iters // 3, init='++')
        kmeans.fit(self.X)
        self.assignments = kmeans.predict()
        self.means = kmeans.centroids
        self.covs = []
        # NOTE(review): only labels that actually occur are visited, so a
        # cluster with no points keeps weight 1 and gets no covs entry.
        for i in np.unique(self.assignments):
            self.weights[int(i)] = (self.assignments == i).sum()
            self.covs.append(np.cov(self.X[self.assignments == i].T))
    else:
        raise ValueError('Unknown type of init parameter')
    # Turn the per-cluster counts (or the uniform ones) into proportions.
    self.weights /= self.weights.sum()
def KMeans_and_GMM(K):
    """Draw ground truth, KMeans and Gaussian Mixture results side by side.

    Data comes from ``make_clusters(skew=True, n_samples=1500, centers=K)``.
    Three subplots are created in one row: the true labels, the KMeans
    assignments with their centroids, and the Gaussian Mixture's own plot.

    Args:
        K: number of clusters to generate and to fit in both models. Labels
            index into a 7-letter colour string, so they must stay below 7.
    """
    COLOR = 'bgrcmyk'
    X, y = make_clusters(skew=True, n_samples=1500, centers=K)
    _, axes = plt.subplots(1, 3)

    # Ground Truth
    axes[0].scatter(X[:, 0], X[:, 1], c=[COLOR[int(assignment)] for assignment in y])
    axes[0].set_title("Ground Truth")

    # KMeans
    kmeans = KMeans(K=K, init='++')
    kmeans.fit(X)
    y_kmeans = kmeans.predict()
    c_kmeans = np.array(kmeans.centroids)
    axes[1].scatter(X[:, 0], X[:, 1], c=[COLOR[int(assignment)] for assignment in y_kmeans])
    # Pass the centroid colours as an explicit list: a multi-letter string
    # such as 'bgrc' is not accepted as a colour sequence by current
    # Matplotlib releases (the shorthand was deprecated in 3.2).
    axes[1].scatter(c_kmeans[:, 0], c_kmeans[:, 1], c=list(COLOR[:K]), marker="o", s=500)
    axes[1].set_title("KMeans")

    # Gaussian Mixture (initialised from KMeans)
    gmm = GaussianMixture(K=K, init='kmeans')
    gmm.fit(X)
    axes[2].set_title("Gaussian Mixture")
    gmm.plot(ax=axes[2])
def iris_example(plot=False):
    """Run KMeans on the data returned by ``load_iris``.

    The number of clusters equals the number of distinct labels; the labels
    are also passed to ``fit``. A two-column array is then built from the
    model's stored ``X`` for plotting.

    Args:
        plot: when truthy, call the model's ``plot`` with the reduced data.
    """
    features, labels = load_iris()
    n_clusters = len(np.unique(labels))

    model = KMeans(K=n_clusters, max_iters=50, init="++")
    model.fit(features, labels)
    model.predict()

    # Dimension reducing: collapse the four columns into two products.
    reduced = np.zeros([model.n_samples, 2])
    stored = model.X
    # Sepal width*length
    reduced[:, 0] = stored[:, 0] * stored[:, 1]
    # Petal width*length
    reduced[:, 1] = stored[:, 2] * stored[:, 3]

    if not plot:
        return
    model.plot(reduced)
def KMeans_and_GMM(K):
    """Draw ground truth, KMeans and Gaussian Mixture results side by side.

    Data comes from ``make_clusters(skew=True, n_samples=1500, centers=K)``.
    Three subplots are created in one row; the two models draw themselves
    through their own ``plot`` methods.

    Args:
        K: number of clusters to generate and to fit in both models.
    """
    palette = 'bgrcmyk'
    points, truth = make_clusters(skew=True, n_samples=1500, centers=K)
    _, panels = plt.subplots(1, 3)

    # Panel 0: the generated labels, one colour letter per label.
    truth_colours = [palette[int(label)] for label in truth]
    panels[0].scatter(points[:, 0], points[:, 1], c=truth_colours)
    panels[0].set_title("Ground Truth")

    # Panel 1: KMeans, drawn by the model itself.
    km_model = KMeans(K=K, init='++')
    km_model.fit(points)
    km_model.predict()
    panels[1].set_title("KMeans")
    km_model.plot(ax=panels[1], holdon=True)

    # Panel 2: Gaussian Mixture initialised from KMeans.
    mixture = GaussianMixture(K=K, init='kmeans')
    mixture.fit(points)
    panels[2].set_title("Gaussian Mixture")
    mixture.plot(ax=panels[2])
def test_initialization():
    """Check centroid initialisation for invalid and valid ``init`` names."""
    # An unrecognised strategy must end in ValueError somewhere in the
    # construct / fit / initialise sequence.
    with pytest.raises(ValueError):
        rejected = KMeans(init='test', K=2)
        rejected.fit(data)
        rejected._initialize_cetroids('test')

    # Each supported strategy must produce exactly K centroids.
    for strategy in ('random', '++'):
        model = KMeans(init=strategy, K=2)
        model.fit(data)
        model._initialize_cetroids(strategy)
        assert len(model.centroids) == model.K