def findClustering(self, cluster):
    """Split *cluster* into 2 sub-clusters with KMeans and score the split.

    Parameters
    ----------
    cluster : sequence of data points (each convertible to ``np.array``).

    Returns
    -------
    tuple
        ``(sse, predictedClass)`` — the sum of squared distances of each
        point to its *assigned* centroid, and the per-point cluster labels.
    """
    # Use KMeans to form 2 clusters
    kmeans = KMeans()
    kmeans.setK(2)
    kmeans.fit(cluster)
    predictedClass = kmeans.predict(cluster)
    centroids = kmeans.centroids

    # Compute SSE for the clustering.
    # BUGFIX: the original accumulated the distance of EVERY point to
    # EVERY centroid; SSE counts only the distance of each point to the
    # centroid it was assigned to. (Also renamed the accumulator, which
    # shadowed the builtin `sum`.)
    sse = 0.0
    for element, label in zip(cluster, predictedClass):
        sse += np.linalg.norm(
            np.array(element) - np.array(centroids[label])) ** 2
    return sse, predictedClass
def main():
    """Demo: cluster 4 synthetic blobs with KMeans and plot the result."""
    seed = 0
    n_iter = 50
    init = 'kmeans++'

    X, y_true = make_blobs(n_samples=300, centers=4,
                           cluster_std=0.60, random_state=seed)
    plt.scatter(X[:, 0], X[:, 1], s=4, c='blue')

    model = KMeans()
    # Alternative API kept for reference:
    #kmeans.fit_range(X, list(range(3, 7)), random_seed=random_seed, iteration=iteration, init_method=init_method)
    model.fit(X, 4, random_seed=seed, iteration=n_iter, init_method=init)
    labels = model.predict(X)

    fig = plt.figure()
    ax = fig.add_subplot(111)
    ax.scatter(X[:, 0], X[:, 1], c=labels, s=4, cmap='viridis')
    centers = model.centroids
    ax.scatter(centers[:, 0], centers[:, 1], c='red', s=15, alpha=0.5)
    plt.show()
def main():
    """Cluster the iris data set into 3 groups and print per-row predictions."""
    km = KMeans(3)
    iris = pd.read_csv("iris.csv")
    data = np.array(
        iris[["Sepal.Length", "Sepal.Width",
              "Petal.Length", "Petal.Width"]].values.tolist())
    km.fit(data)
    print("cluster centers: %s" % km.cluster_centers)
    for d in iris.values:
        # BUGFIX: the original passed d[2] twice and skipped d[1], so the
        # prediction features did not match the 4 columns used in fit().
        # Row layout implied by d[5] being the species label:
        # [row-id, Sepal.Length, Sepal.Width, Petal.Length, Petal.Width, Species].
        prediction = km.predict([[d[1], d[2], d[3], d[4]]])
        print(d[5] + " - " + str(prediction[0]))
def fit_predict(self, X):
    """Fit a spectral-clustering model on the data set and cluster it.

    Parameters
    ----------
    X : class:`ndarray<numpy.ndarray>` of shape (N,M)
        The training set holds N samples, each with M attributes.

    Returns
    -------
    Per-sample cluster labels, as returned by ``KMeans.predict``.
    """
    # Build the affinity (similarity) matrix W.
    if self.affinity == "full_link":
        w = self.full_link(X, dist=self.rbf)
    elif self.affinity == "nearest_neighbors":
        w = self.knn_nearest(X)
    # NOTE(review): any other affinity value leaves `w` unbound and raises
    # NameError below — presumably validated upstream; confirm.
    norm_laplacians = self.laplacians_matrix(w)
    # Spectral embedding: columns of H are the eigenvectors belonging to
    # the n_clusters smallest eigenvalues of the (normalized) Laplacian.
    eigval, eigvec = np.linalg.eig(norm_laplacians)
    ix = np.argsort(eigval)[0:self.n_clusters]
    H = eigvec[:, ix]
    # Cluster the embedded samples with plain k-means.
    kmeans = KMeans(n_clusters=self.n_clusters)
    kmeans.fit(H)
    pred = kmeans.predict(H)
    return pred
class GMM():
    """Gaussian Mixture Model fitted with EM, initialized from k-means."""

    def __init__(self, initializer='support', cov_type='full'):
        """Configure the model.

        initializer : 'support' (weights from k-means cluster sizes) or
            'uniform' (equal weights).
        cov_type : 'full', 'tied', 'diag', or 'spherical'.
        """
        assert initializer in [
            'support', 'uniform'
        ], 'Please select initialization scheme as support or uniform'
        assert cov_type in [
            'full', 'tied', 'diag', 'spherical'
        ], 'Please select covariance type as full, tied, diag, or spherical'
        self.kmeans_cls_ = KMeans()      # used only to seed means/assignments
        self.means_ = None               # (k, d) component means
        self.cov_ = None                 # component covariances; layout depends on cov_type_
        self.mixture_weights_ = None     # (k,) mixing proportions
        self.membership_weights_ = None  # (n, k) posterior responsibilities
        self.k_ = None                   # number of components
        self.ll_graph_ = []              # log-likelihood recorded each EM iteration
        self.initializer_ = initializer
        self.cov_type_ = cov_type

    def fit(self, X, k, tol_=1e-6):
        """Run EM on X with k components until the log-likelihood gain <= tol_."""
        self.k_ = k
        self.initialize(X)
        new_ll = self.get_log_likelihood(X)
        # Seed old_ll so the first loop test is guaranteed to pass.
        old_ll = new_ll - tol_ * 10
        # Loop while (new_ll - old_ll) > tol_, i.e. still improving.
        while old_ll - new_ll < -tol_:
            self.ll_graph_.append(new_ll)
            # E-step: recompute responsibilities in-place.
            self.gaussian_probabilities_multiple(X, normalized=True)
            # M-step: update weights, means, then covariances.
            self.update_mixture_weights()
            self.update_means(X)
            self.update_var(X)
            old_ll = new_ll
            new_ll = self.get_log_likelihood(X)

    def get_cov_from_init(self, X, predictions):
        """Initial per-component covariances from the k-means hard assignment.

        Returns an array whose layout depends on cov_type_:
        'full' -> (k, d, d); 'diag'/'spherical' -> (k, d).
        """
        if self.cov_type_ == 'full':
            return np.array(
                [np.cov(X[predictions == k].T) for k in range(self.k_)])
        if self.cov_type_ == 'tied':
            # NOTE(review): 'tied' is not implemented here — this branch
            # falls through and the method returns None; confirm intent.
            pass
        if self.cov_type_ == 'diag':
            # Per-dimension sample variance within each k-means cluster.
            return np.array([
                (1 / (len(X[predictions == k]) - 1)) *
                np.einsum('ij->j', (X[predictions == k] - self.means_[k])**2)
                for k in range(self.k_)
            ])
        if self.cov_type_ == 'spherical':
            # Mean of the per-dimension variances, broadcast to all d dims.
            return np.repeat(np.mean(np.array([
                (1 / (len(X[predictions == k]) - 1)) *
                np.einsum('ij->j', (X[predictions == k] - self.means_[k])**2)
                for k in range(self.k_)
            ]), axis=-1)[:, np.newaxis], X.shape[-1], axis=1)

    def initialize(self, X):
        """Seed means, covariances and mixture weights from a k-means fit."""
        self.kmeans_cls_.fit(X, self.k_)
        predictions_ = self.kmeans_cls_.predict(X)
        self.means_ = self.kmeans_cls_.means_
        self.cov_ = self.get_cov_from_init(X, predictions_)
        if self.initializer_ == 'support':
            # Weight each component by its k-means cluster's share of points.
            self.mixture_weights_ = np.array([
                sum(predictions_ == k) / len(predictions_)
                for k in range(self.k_)
            ])
        if self.initializer_ == 'uniform':
            self.mixture_weights_ = (np.array([1] * self.k_)) / self.k_

    def gaussian_probabilities_multiple(self, X, normalized=True):
        """Evaluate all component densities at every sample.

        With normalized=True, stores the (n, k) posterior responsibilities
        in membership_weights_ and returns None (E-step); otherwise returns
        the raw (n, k) density values.
        """
        d = X.shape[-1]
        input_ = X[:, None, :]  # (n, 1, d) for broadcasting against (k, d) means
        if self.cov_type_ in ['full', 'tied']:
            # Mahalanobis quadratic form per (sample, component).
            exp_part = -0.5 * np.einsum(
                'ijk,jkl,ijl->ij', input_ - self.means_,
                np.array(list(map(np.linalg.inv, self.cov_))),
                input_ - self.means_)
            # Gaussian normalization constant uses det(cov)^(1/2).
            output = (1 / ((2 * np.pi)**(d / 2) * np.array(
                list(map(lambda x: np.linalg.det(x)**
                         (1 / 2), self.cov_)))))[None, :] * np.exp(exp_part)
        elif self.cov_type_ in ['diag', 'spherical']:
            # Diagonal covariance: quadratic form is a weighted squared distance.
            exp_part = -0.5 * np.einsum('ijk,jk->ij',
                                        (input_ - self.means_[None, :, :])**2,
                                        1 / self.cov_)
            output = (1 / ((2 * np.pi)**(d / 2) * np.prod(self.cov_, axis=1)**
                           (1 / 2)))[None, :] * np.exp(exp_part)
        if normalized:
            # Bayes rule: responsibility ∝ weight * density, rows sum to 1.
            output = np.einsum('ij,j->ij', output, self.mixture_weights_)
            output = output / np.sum(output, axis=1, keepdims=True)
            self.membership_weights_ = output
        else:
            return output

    def update_mixture_weights(self):
        """M-step: each weight is the mean responsibility of its component."""
        self.mixture_weights_ = np.einsum(
            'ij->j',
            self.membership_weights_) / self.membership_weights_.shape[0]

    def update_means(self, X):
        """M-step: responsibility-weighted mean of the data per component."""
        self.means_ = np.einsum('id,ik->kd', X,
                                self.membership_weights_) / np.einsum(
                                    'ik->k', self.membership_weights_)[:, None]

    def update_var(self, X):
        """M-step: responsibility-weighted covariance per component.

        NOTE(review): 'tied' is handled like 'full' here (one covariance per
        component rather than a single shared one) — confirm intent.
        """
        input_ = X[:, None, :]
        if self.cov_type_ in ['full', 'tied']:
            self.cov_ = np.einsum(
                'ij,ijk,ijl->jlk', self.membership_weights_,
                input_ - self.means_, input_ - self.means_) / np.einsum(
                    'ik->k', self.membership_weights_)[:, None, None]
        elif self.cov_type_ in ['diag', 'spherical']:
            # Diagonal variance via E[x^2] - 2*E[x]*mu + mu^2 per dimension.
            self.cov_ = (np.einsum('ij,ilk,ilk->jk', self.membership_weights_,
                                   input_, input_) -
                         2 * np.einsum('ij,ilk,jk->jk',
                                       self.membership_weights_, input_,
                                       self.means_)) / np.einsum(
                                           'ik->k', self.membership_weights_
                                       )[:, None] + self.means_**2

    def get_log_likelihood(self, X):
        """Total data log-likelihood under the current mixture parameters."""
        output = self.gaussian_probabilities_multiple(X, normalized=False)
        output = np.log(np.einsum('ij,j->i', output, self.mixture_weights_))
        return np.einsum('i->', output)
# Accumulators for per-algorithm accuracy (fragment: kmeans_accuracy and
# sk_kmeans_accuracy are initialized before this chunk, and the k-fold
# loop body continues past it).
sk_agglo_accuracy_average = 0
dbscan_accuracy = 0
sk_dbscan_accuracy = 0
print ('=== ACCURACY FROM PREDICT ===')
print ()
k = 0
# Cross-validated comparison of the custom KMeans vs scikit-learn's.
for train_index, test_index in kf.split(X, y):
    print (str(k) + '-fold')
    X_train, y_train = X.iloc[train_index], y.iloc[train_index]
    X_test, y_test = X.iloc[test_index], y.iloc[test_index]
    # KMeans
    kmeans.fit(np.asarray(X_train))
    result = kmeans.predict(np.asarray(X_test))
    # NOTE(review): `dict` shadows the builtin; it holds the
    # real-class -> cluster mapping returned by the scorer.
    accuracy, dict = clustering_accuracy_score(np.asarray(y_test),
                                               np.asarray(result))
    kmeans_accuracy += accuracy
    print ('KMeans')
    print ('Accuracy\t', accuracy)
    print ('Format {Real class : cluster}')
    print ('Dict\t\t', str(dict))
    print ()
    # Same fold scored with scikit-learn's KMeans for comparison.
    sk_kmeans.fit(X_train)
    sk_result = sk_kmeans.predict(X_test)
    accuracy, dict = clustering_accuracy_score(np.asarray(y_test),
                                               np.asarray(sk_result))
    sk_kmeans_accuracy += accuracy
    print ('Sklearn KMeans')
    print ('Accuracy\t', accuracy)
    print ('Format {Real class : cluster}')
display_clusters(y, "Настоящие метки")


def display_metrics(n_clusters, metrics, title):
    """Plot a metric curve against the number of clusters."""
    plt.figure(figsize=(8, 6))
    plt.grid(linestyle='--')
    plt.plot(n_clusters, metrics, linestyle='-', marker='.', color='r')
    plt.title(title)
    plt.xlabel("Количество кластеров")
    plt.ylabel("Значение метрики")
    plt.show()


# Sweep k = 1..10, collecting one external and one internal quality metric.
external_metrics = []
internal_metrics = []
for n in range(1, 11):
    model = KMeans(k=n)
    centers = model.fit(X_norm)
    labels = model.predict(X_norm)
    # The silhouette score is undefined for a single cluster.
    score = 0.0 if n == 1 else silhouette(X_norm, labels, centers)
    internal_metrics.append(score)
    external_metrics.append(adjusted_rand_index(y, labels))
    display_clusters(labels, str(n) + ' кластеров')
display_metrics(range(1, 11), external_metrics, 'Внешняя метрика')
display_metrics(range(1, 11), internal_metrics, 'Внутренняя метрика')
from KMeans import KMeans
import matplotlib.pyplot as plt
from sklearn.datasets import make_blobs
import numpy as np

# Demo: fit the custom KMeans on 3 gaussian blobs, then inspect the loss
# curve, the training assignments, and the induced partition of the plane.
X, y = make_blobs(n_samples=1000, n_features=2, centers=3,
                  center_box=(-15, 15))

model = KMeans(n_clusters=3)
model.fit(X)
labels = model.predict(X)

# Figure 1: loss per iteration.
history = model.loss
plt.figure(1)
plt.plot(range(len(history)), history)

# Figure 2: training points coloured by assigned cluster.
plt.figure(2)
plt.scatter(X[:, 0], X[:, 1], c=labels)

# Figure 3: predictions over uniformly sampled points show the
# decision regions learned by the model.
plt.figure(3)
grid = np.random.uniform(-15, 15, size=(5000, 2))
plt.scatter(grid[:, 0], grid[:, 1], c=model.predict(grid))

plt.show()
# Select the attribute set for the chosen preset.
if preset == "Regular":
    attributeList = bigAttributes
elif preset == "Wings":
    attributeList = wingAttributes
X = dataBase.makePlayersList(attributeList)
print("Working with", len(X), "players with", len(attributeList), "attributes each.")
# Independent copies so each clustering algorithm sees unmodified data.
kX = X.copy()
bkX = X.copy()
# Clustering with KMeans
kmeans = KMeans()
kmeans.setK(clusters)
kmeans.fit(kX)
pred = kmeans.predict(kX)
# Make plot for KMeans
# Convert data points to 2D points for plotting
pca = PCA(n_components=2)
kX = pca.fit_transform(kX)
# Make labels based on player position
""" labels for regular: 0 - goalkeepers 1 - defenders 2 - midfielders 3 - forwards
from KMeans import KMeans
from sklearn.cluster import KMeans as km
import numpy as np
from sklearn.datasets import make_blobs
import matplotlib.pyplot as plt
import time

# Benchmark the custom KMeans against scikit-learn on the same blobs.
reduced_data, check = make_blobs(n_samples=1000, n_features=2,
                                 centers=3, cluster_std=7)

start_time = time.time()
kmeans = KMeans(n_cluster=3, total_iter=300)
kmeans.fit(reduced_data)
pred = kmeans.predict(reduced_data)
print(time.time() - start_time)

# Plot each point in its cluster colour (one scatter call per point,
# matching the original's plotting behaviour).
plt.figure(1)
colours = {0: "b", 1: "g", 2: "y"}
for point, label in zip(reduced_data, pred):
    if label in colours:
        plt.scatter(point[0], point[1], c=colours[label], alpha=.5)
# Mark the three learned centroids.
for c in range(3):
    plt.scatter(kmeans.centroids[c, 0], kmeans.centroids[c, 1],
                marker="*", c="r")

# Time scikit-learn's implementation on the same data for comparison.
start_time = time.time()
kmeans = km(n_clusters=3)
kmeans.fit(reduced_data)
pred = kmeans.predict(reduced_data)