Code example #1
    def findClustering(self, cluster):
        # Use KMeans to form 2 clusters
        kmeans = KMeans()
        kmeans.setK(2)
        kmeans.fit(cluster)
        predictedClass = kmeans.predict(cluster)
        centroids = kmeans.centroids
        # Compute the SSE: sum each point's squared distance to the
        # centroid of the cluster it was assigned to
        sse = 0
        for index, element in enumerate(cluster):
            assigned = centroids[predictedClass[index]]
            sse += np.linalg.norm(np.array(element) - np.array(assigned)) ** 2
        return sse, predictedClass
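Since findClustering returns the SSE, the same pattern extends to an elbow-style scan over k. A minimal sketch, assuming setK accepts any positive k and that a NumPy array data is available (data is an illustrative name, not from the project):

import numpy as np
import matplotlib.pyplot as plt

# Hypothetical elbow scan: sweep k and record the SSE for each clustering
sse_per_k = []
for k in range(1, 8):
    kmeans = KMeans()
    kmeans.setK(k)  # assumes setK accepts arbitrary k
    kmeans.fit(data)
    labels = kmeans.predict(data)
    centroids = kmeans.centroids
    sse = sum(np.linalg.norm(np.array(x) - np.array(centroids[c])) ** 2
              for x, c in zip(data, labels))
    sse_per_k.append(sse)

plt.plot(range(1, 8), sse_per_k, marker='o')
plt.xlabel('k')
plt.ylabel('SSE')
plt.show()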
Code example #2
File: main.py Project: bluesilence/python
import matplotlib.pyplot as plt
from sklearn.datasets import make_blobs


def main():
    random_seed = 0
    iteration = 50
    init_method = 'kmeans++'
    # Four well-separated Gaussian blobs as toy data
    X, y_true = make_blobs(n_samples=300, centers=4, cluster_std=0.60, random_state=random_seed)
    plt.scatter(X[:, 0], X[:, 1], s=4, c='blue')
    kmeans = KMeans()
    # Alternative: sweep several cluster counts in a single call
    #kmeans.fit_range(X, list(range(3, 7)), random_seed=random_seed, iteration=iteration, init_method=init_method)
    kmeans.fit(X, 4, random_seed=random_seed, iteration=iteration, init_method=init_method)
    y_pred = kmeans.predict(X)

    fig = plt.figure()
    ax = fig.add_subplot(111)
    ax.scatter(X[:, 0], X[:, 1], c=y_pred, s=4, cmap='viridis')
    centers = kmeans.centroids
    ax.scatter(centers[:, 0], centers[:, 1], c='red', s=15, alpha=0.5)
    plt.show()
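To sanity-check the choice of 4 clusters, the labeling could be scored with scikit-learn's silhouette score. This is a cross-check with a standard library function, not part of this project's API:

from sklearn.metrics import silhouette_score

# Higher silhouette (max 1.0) means tighter, better-separated clusters
print('silhouette:', silhouette_score(X, y_pred))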
Code example #3
File: main.py Project: chachalaca/K-means
import pandas as pd


def main():

    km = KMeans(3)

    # R-style iris.csv: four measurement columns plus a Species column
    iris = pd.read_csv("iris.csv")
    data = iris[["Sepal.Length", "Sepal.Width", "Petal.Length", "Petal.Width"]].to_numpy()

    km.fit(data)

    print("cluster centers: %s" % km.cluster_centers)

    for d in iris.values:
        # With the CSV's leading row-index column, the four features sit
        # at d[1]..d[4] and the species name at d[5]
        prediction = km.predict([[d[1], d[2], d[3], d[4]]])
        print(d[5] + " - " + str(prediction[0]))
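Cluster IDs are arbitrary, so comparing them to species is easier as a contingency table. A minimal sketch, assuming the species column is named "Species" as in the R dataset and that km.predict accepts the full 2-D array at once:

# Predict every row in one call and tabulate clusters against species
preds = km.predict(data)
print(pd.crosstab(iris["Species"], preds))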
Code example #4
    def fit_predict(self, X):
        """
        Fit a spectral clustering model on the dataset and return a
        cluster assignment for every sample.

        Parameters
        ----------
        X : :class:`ndarray <numpy.ndarray>` of shape (N, M)
            The training set contains N samples, each with M features.
        """

        # Build the affinity (similarity) matrix
        if self.affinity == "full_link":
            w = self.full_link(X, dist=self.rbf)
        elif self.affinity == "nearest_neighbors":
            w = self.knn_nearest(X)
        norm_laplacians = self.laplacians_matrix(w)

        # The normalized Laplacian is symmetric, so eigh yields real
        # eigenpairs; the eigenvectors of the n_clusters smallest
        # eigenvalues form the spectral embedding H
        eigval, eigvec = np.linalg.eigh(norm_laplacians)
        ix = np.argsort(eigval)[0:self.n_clusters]
        H = eigvec[:, ix]

        # Cluster the rows of the embedding with plain k-means
        kmeans = KMeans(n_clusters=self.n_clusters)
        kmeans.fit(H)
        pred = kmeans.predict(H)
        return pred
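The helpers full_link, rbf, and laplacians_matrix are not shown. A minimal sketch of what they might compute, using the standard RBF affinity and symmetric normalized Laplacian (the function names and the gamma parameter here are assumptions, not the project's code):

import numpy as np

def rbf_affinity(X, gamma=1.0):
    # Pairwise squared Euclidean distances, then exp(-gamma * d^2)
    sq = np.sum(X ** 2, axis=1)
    d2 = sq[:, None] + sq[None, :] - 2 * X @ X.T
    return np.exp(-gamma * np.clip(d2, 0, None))

def normalized_laplacian(w):
    # L_sym = I - D^{-1/2} W D^{-1/2}
    d_inv_sqrt = 1.0 / np.sqrt(w.sum(axis=1))
    return np.eye(len(w)) - d_inv_sqrt[:, None] * w * d_inv_sqrt[None, :]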
Code example #5
class GMM:
    def __init__(self, initializer='support', cov_type='full'):
        assert initializer in [
            'support', 'uniform'
        ], 'Please select initialization scheme as support or uniform'
        assert cov_type in [
            'full', 'tied', 'diag', 'spherical'
        ], 'Please select covariance type as full, tied, diag, or spherical'
        # k-means supplies the initial means and hard assignments
        self.kmeans_cls_ = KMeans()
        self.means_ = None
        self.cov_ = None
        self.mixture_weights_ = None
        self.membership_weights_ = None
        self.k_ = None
        self.ll_graph_ = []
        self.initializer_ = initializer
        self.cov_type_ = cov_type

    def fit(self, X, k, tol_=1e-6):
        self.k_ = k
        self.initialize(X)
        new_ll = self.get_log_likelihood(X)
        old_ll = new_ll - tol_ * 10
        # Run EM until the log-likelihood improves by less than tol_
        while new_ll - old_ll > tol_:
            self.ll_graph_.append(new_ll)
            # E-step: membership weights; M-step: weights, means, covariances
            self.gaussian_probabilities_multiple(X, normalized=True)
            self.update_mixture_weights()
            self.update_means(X)
            self.update_var(X)
            old_ll = new_ll
            new_ll = self.get_log_likelihood(X)

    def get_cov_from_init(self, X, predictions):
        if self.cov_type_ == 'full':
            return np.array(
                [np.cov(X[predictions == k].T) for k in range(self.k_)])
        if self.cov_type_ == 'tied':
            # 'tied' shares one pooled covariance; repeat it per component
            # so the per-component einsum code downstream applies unchanged
            return np.repeat(np.cov(X.T)[None, :, :], self.k_, axis=0)
        if self.cov_type_ == 'diag':
            return np.array([
                (1 / (len(X[predictions == k]) - 1)) *
                np.einsum('ij->j', (X[predictions == k] - self.means_[k])**2)
                for k in range(self.k_)
            ])
        if self.cov_type_ == 'spherical':
            return np.repeat(np.mean(np.array([
                (1 / (len(X[predictions == k]) - 1)) *
                np.einsum('ij->j', (X[predictions == k] - self.means_[k])**2)
                for k in range(self.k_)
            ]),
                                     axis=-1)[:, np.newaxis],
                             X.shape[-1],
                             axis=1)

    def initialize(self, X):
        self.kmeans_cls_.fit(X, self.k_)
        predictions_ = self.kmeans_cls_.predict(X)
        self.means_ = self.kmeans_cls_.means_
        self.cov_ = self.get_cov_from_init(X, predictions_)
        if self.initializer_ == 'support':
            self.mixture_weights_ = np.array([
                sum(predictions_ == k) / len(predictions_)
                for k in range(self.k_)
            ])
        if self.initializer_ == 'uniform':
            self.mixture_weights_ = (np.array([1] * self.k_)) / self.k_

    def gaussian_probabilities_multiple(self, X, normalized=True):
        # Evaluate every component density at every sample; with
        # normalized=True, store the E-step membership weights instead of
        # returning the raw densities
        d = X.shape[-1]

        input_ = X[:, None, :]
        if self.cov_type_ in ['full', 'tied']:
            exp_part = -0.5 * np.einsum(
                'ijk,jkl,ijl->ij', input_ - self.means_,
                np.array(list(map(np.linalg.inv, self.cov_))),
                input_ - self.means_)
            output = (1 / ((2 * np.pi)**(d / 2) * np.array(
                list(map(lambda x: np.linalg.det(x)**
                         (1 / 2), self.cov_)))))[None, :] * np.exp(exp_part)
        elif self.cov_type_ in ['diag', 'spherical']:
            exp_part = -0.5 * np.einsum('ijk,jk->ij',
                                        (input_ - self.means_[None, :, :])**2,
                                        1 / self.cov_)
            output = (1 / ((2 * np.pi)**(d / 2) * np.prod(self.cov_, axis=1)**
                           (1 / 2)))[None, :] * np.exp(exp_part)

        if normalized:
            output = np.einsum('ij,j->ij', output, self.mixture_weights_)
            output = output / np.sum(output, axis=1, keepdims=True)
            self.membership_weights_ = output
        else:
            return output

    def update_mixture_weights(self):
        self.mixture_weights_ = np.einsum(
            'ij->j',
            self.membership_weights_) / self.membership_weights_.shape[0]

    def update_means(self, X):
        self.means_ = np.einsum('id,ik->kd', X,
                                self.membership_weights_) / np.einsum(
                                    'ik->k', self.membership_weights_)[:, None]

    def update_var(self, X):
        input_ = X[:, None, :]
        if self.cov_type_ in ['full', 'tied']:
            # Membership-weighted outer products ('tied' is updated per
            # component here, exactly like 'full')
            self.cov_ = np.einsum(
                'ij,ijk,ijl->jlk', self.membership_weights_,
                input_ - self.means_, input_ - self.means_) / np.einsum(
                    'ik->k', self.membership_weights_)[:, None, None]
        elif self.cov_type_ in ['diag', 'spherical']:
            # Per-dimension update via E[x^2] - 2*E[x]*mu + mu^2
            self.cov_ = (np.einsum('ij,ilk,ilk->jk', self.membership_weights_,
                                   input_, input_) - 2 *
                         np.einsum('ij,ilk,jk->jk', self.membership_weights_,
                                   input_, self.means_)) / np.einsum(
                                       'ik->k', self.membership_weights_
                                   )[:, None] + self.means_**2

    def get_log_likelihood(self, X):
        output = self.gaussian_probabilities_multiple(X, normalized=False)
        output = np.log(np.einsum('ij,j->i', output, self.mixture_weights_))
        return np.einsum('i->', output)
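A minimal usage sketch for this class, assuming the project-local KMeans used by initialize is importable and that its fit(X, k) / predict(X) / means_ match what initialize expects (the blob data is illustrative):

import matplotlib.pyplot as plt
from sklearn.datasets import make_blobs

X, _ = make_blobs(n_samples=500, centers=3, random_state=0)

gmm = GMM(initializer='support', cov_type='full')
gmm.fit(X, k=3)

# The per-iteration log-likelihood should increase monotonically under EM
plt.plot(gmm.ll_graph_)
plt.xlabel('EM iteration')
plt.ylabel('log-likelihood')
plt.show()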
Code example #6
File: main.py Project: deborrrrrah/Clustering
sk_agglo_accuracy_average = 0
dbscan_accuracy = 0
sk_dbscan_accuracy = 0

print('=== ACCURACY FROM PREDICT ===')
print()

k = 0
for train_index, test_index in kf.split(X, y):
    print(str(k) + '-fold')
    X_train, y_train = X.iloc[train_index], y.iloc[train_index]
    X_test, y_test = X.iloc[test_index], y.iloc[test_index]

    # KMeans
    kmeans.fit(np.asarray(X_train))
    result = kmeans.predict(np.asarray(X_test))
    # 'mapping' avoids shadowing the built-in dict
    accuracy, mapping = clustering_accuracy_score(np.asarray(y_test), np.asarray(result))
    kmeans_accuracy += accuracy
    print('KMeans')
    print('Accuracy\t', accuracy)
    print('Format {Real class : cluster}')
    print('Dict\t\t', str(mapping))
    print()

    sk_kmeans.fit(X_train)
    sk_result = sk_kmeans.predict(X_test)
    accuracy, mapping = clustering_accuracy_score(np.asarray(y_test), np.asarray(sk_result))
    sk_kmeans_accuracy += accuracy
    print('Sklearn KMeans')
    print('Accuracy\t', accuracy)
    print('Format {Real class : cluster}')
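clustering_accuracy_score is project code that is not shown. A minimal sketch of the behavior its outputs suggest, mapping each true class to its majority cluster and scoring the matches; this is an assumption about the function, not the project's actual implementation (inputs assumed to be NumPy arrays):

import numpy as np
from collections import Counter

def clustering_accuracy_score_sketch(y_true, y_pred):
    # Map each true class to its most common cluster, then score matches
    mapping = {}
    for cls in np.unique(y_true):
        mapping[cls] = Counter(y_pred[y_true == cls]).most_common(1)[0][0]
    correct = sum(mapping[t] == p for t, p in zip(y_true, y_pred))
    return correct / len(y_true), mapping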
Code example #7
File: main.py Project: olezhko9/ITMO-ML-8-sem
display_clusters(y, "True labels")


def display_metrics(n_clusters, metrics, title):
    plt.figure(figsize=(8, 6))
    plt.grid(linestyle='--')
    plt.plot(n_clusters, metrics, linestyle='-', marker='.', color='r')
    plt.title(title)
    plt.xlabel("Количество кластеров")
    plt.ylabel("Значение метрики")
    plt.show()


external_metrics = []
internal_metrics = []
for i in range(1, 11):
    kMean = KMeans(k=i)
    centroids = kMean.fit(X_norm)
    y_pred = kMean.predict(X_norm)
    # The silhouette is undefined for a single cluster, so record 0.0
    if i == 1:
        internal_metrics.append(0.0)
    else:
        internal_metrics.append(silhouette(X_norm, y_pred, centroids))

    external_metrics.append(adjusted_rand_index(y, y_pred))
    display_clusters(y_pred, str(i) + ' clusters')

display_metrics(range(1, 11), external_metrics, 'External metric')
display_metrics(range(1, 11), internal_metrics, 'Internal metric')
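The custom silhouette and adjusted_rand_index helpers are not shown; scikit-learn ships reference implementations that could cross-check them. A sanity-check sketch, not part of this project:

from sklearn.metrics import silhouette_score, adjusted_rand_score

# Compare the hand-rolled metrics against sklearn's for one clustering
print('sklearn silhouette:', silhouette_score(X_norm, y_pred))
print('sklearn ARI:', adjusted_rand_score(y, y_pred))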
Code example #8
from KMeans import KMeans
import matplotlib.pyplot as plt
from sklearn.datasets import make_blobs
import numpy as np

X, y = make_blobs(n_samples=1000,
                  n_features=2,
                  centers=3,
                  center_box=(-15, 15))

kmeans = KMeans(n_clusters=3)
kmeans.fit(X)
prediction = kmeans.predict(X)
# Loss recorded per iteration during fit
loss = kmeans.loss
plt.figure(1)
plt.plot(range(len(loss)), loss)
plt.figure(2)
plt.scatter(X[:, 0], X[:, 1], c=prediction)
plt.figure(3)
# Predict on uniform random points to visualize how the plane is partitioned
test = np.random.uniform(-15, 15, size=(5000, 2))
test_prediction = kmeans.predict(test)
plt.scatter(test[:, 0], test[:, 1], c=test_prediction)
plt.show()
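The random test scatter in figure 3 approximates the Voronoi regions of the centroids; a meshgrid plus contourf renders the same partition more cleanly. A sketch reusing the fitted model above:

# Evaluate the model on a dense grid and draw filled cluster regions
xs, ys = np.meshgrid(np.linspace(-15, 15, 300), np.linspace(-15, 15, 300))
grid = np.column_stack([xs.ravel(), ys.ravel()])
zz = np.asarray(kmeans.predict(grid)).reshape(xs.shape)

plt.figure(4)
plt.contourf(xs, ys, zz, alpha=0.3)
plt.scatter(X[:, 0], X[:, 1], c=prediction, s=4)
plt.show()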
Code example #9
    if preset == "Regular":
        attributeList = bigAttributes
    elif preset == "Wings":
        attributeList = wingAttributes

    X = dataBase.makePlayersList(attributeList)
    print("Working with", len(X), "players with", len(attributeList),
          "attributes each.")
    kX = X.copy()
    bkX = X.copy()

    # Clustering with KMeans
    kmeans = KMeans()
    kmeans.setK(clusters)
    kmeans.fit(kX)
    pred = kmeans.predict(kX)

    # Make plot for KMeans

    # Convert data points to 2D points for plotting
    pca = PCA(n_components=2)
    kX = pca.fit_transform(kX)

    # Make labels based on player position
    """
    labels for regular:
        0 - goalkeepers
        1 - defenders
        2 - midfielders
        3 - forwards
        
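The snippet above is cut off mid-docstring. A minimal sketch of how the 2-D PCA projection might then be plotted, with styling that is an assumption rather than the project's code:

import matplotlib.pyplot as plt

# kX now holds the PCA-projected points; color them by cluster id
plt.scatter(kX[:, 0], kX[:, 1], c=pred, s=6, cmap='viridis')
plt.title('KMeans clusters (PCA projection)')
plt.show()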
Code example #10
from KMeans import KMeans
from sklearn.cluster import KMeans as km
import numpy as np
from sklearn.datasets import make_blobs
import matplotlib.pyplot as plt
import time
reduced_data, check = make_blobs(n_samples=1000,
                                 n_features=2,
                                 centers=3,
                                 cluster_std=7)
start_time = time.time()
kmeans = KMeans(n_cluster=3, total_iter=300)
kmeans.fit(reduced_data)
pred = kmeans.predict(reduced_data)
print(time.time() - start_time)
plt.figure(1)
# Color each predicted cluster and mark the three centroids with red stars
colors = {0: "b", 1: "g", 2: "y"}
for label, color in colors.items():
    mask = np.asarray(pred) == label
    plt.scatter(reduced_data[mask, 0], reduced_data[mask, 1], c=color, alpha=.5)
plt.scatter(kmeans.centroids[:, 0], kmeans.centroids[:, 1], marker="*",
            c="r")  # wrong kmeans, good plots
# Time scikit-learn's KMeans on the same data for comparison
start_time = time.time()
kmeans = km(n_clusters=3)
kmeans.fit(reduced_data)
pred = kmeans.predict(reduced_data)
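The snippet ends before reporting the second timing. A natural continuation, sketched to mirror the custom-KMeans block above:

print(time.time() - start_time)

# sklearn exposes the fitted centroids as cluster_centers_
plt.figure(2)
plt.scatter(reduced_data[:, 0], reduced_data[:, 1], c=pred, alpha=.5, s=6)
plt.scatter(kmeans.cluster_centers_[:, 0], kmeans.cluster_centers_[:, 1],
            marker="*", c="r")
plt.show()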