예제 #1
0
def elbow(X, range_clusters=range(2, 6)):
    """Draw an elbow plot of K-Means inertia against the number of clusters.

    For every ``k`` in ``range_clusters`` a ``KMeans`` model (seeded with
    ``random_state=12``) is fitted on ``X.values`` and its ``inertia``
    attribute is recorded; the collected values are then plotted and shown.

    Args:
        X: Data container exposing a ``.values`` attribute
            (presumably a pandas DataFrame -- TODO confirm).
        range_clusters: Iterable of cluster counts to evaluate.
    """

    def _inertia_for(n_clusters):
        # Fit one model and report how tightly it packs the samples.
        fitted = KMeans(n_clusters=n_clusters, random_state=12)
        fitted.fit(X.values)
        return fitted.inertia

    ks = range_clusters
    inertias = [_inertia_for(k) for k in ks]

    plt.plot(ks, inertias, '-o', color='black')
    plt.xlabel('number of clusters, k')
    plt.ylabel('inertia')
    plt.xticks(ks)
    plt.show()
예제 #2
0
def elbow(X,
          range_clusters=range(2, 6),
          alg='kmeans',
          cat_features=None,
          random_state=42):
    """Draw an elbow plot of model inertia against the number of clusters.

    For every ``k`` in ``range_clusters`` a clustering model selected by
    ``alg`` is fitted on ``X.values`` and its ``inertia`` attribute is
    recorded; the collected values are then plotted and shown.

    Args:
        X: Data container exposing a ``.values`` attribute
            (presumably a pandas DataFrame -- TODO confirm).
        range_clusters: Iterable of cluster counts to evaluate.
        alg: ``'kmeans'`` or ``'kmodes'`` build a ``KMeans`` model; any
            other value builds a ``KPrototypes`` model. Also used as the
            plot title.
        cat_features: Categorical feature specification forwarded to
            ``KPrototypes``. ``None`` (the default) is treated as an empty
            list.
        random_state: Seed forwarded to the model constructor.
    """
    # A fresh list per call avoids sharing one mutable default object
    # between invocations.
    if cat_features is None:
        cat_features = []

    inertias = []
    ks = range_clusters
    for k in ks:
        if alg in ('kmeans', 'kmodes'):
            # NOTE(review): 'kmodes' has always built a KMeans model here,
            # which looks like a copy-paste slip -- confirm whether a
            # dedicated k-modes estimator was intended.
            model = KMeans(n_clusters=k, random_state=random_state)
        else:
            model = KPrototypes(n_clusters=k,
                                cat_features=cat_features,
                                random_state=random_state)

        model.fit(X.values)
        inertias.append(model.inertia)

    plt.plot(ks, inertias, '-o', color='black')
    plt.xlabel('number of clusters, k')
    plt.ylabel('inertia')
    plt.title(alg)
    plt.xticks(ks)
    plt.show()
예제 #3
0
def make_plots():
    """Make scatter plots of each e-step and m-step of the k-means algorithm.

    Loads ``data/faithful.txt``, normalizes it, and alternates the
    estimator's ``_e_step`` and ``_m_step`` starting from two fixed initial
    centers. One image is requested from ``plotscatterhist`` for the initial
    state and one after every step, named ``tmp/faithful_kmeans_<n>.png``.
    Iteration stops once the summed squared movement of the centers drops
    below ``1e-2`` or after the loop range is exhausted.

    Raises:
        OSError: If the ``tmp`` output directory cannot be created.
    """
    X = np.loadtxt(os.path.join('data', 'faithful.txt'))
    X = normalize(X)

    init_means = np.array([[-1.75, 1], [1.75, -1]])

    clf = KMeans(init_centers=init_means, n_clusters=2)

    tmpdir = 'tmp'
    try:
        os.makedirs(tmpdir)
    except OSError:
        # An already-existing directory is fine; anything else (permissions,
        # a regular file in the way, ...) must not be silently ignored.
        if not os.path.isdir(tmpdir):
            raise
    basename = os.path.join(tmpdir, 'faithful_kmeans')
    centers = init_means

    # Frame 0: raw data with the initial centers, no cluster assignment yet.
    plotscatterhist(X, [], '{0}_{1}.png'.format(basename, 0),
                    centers, lumped=True, dpi=200)

    # i numbers the e-step frame, i + 1 the following m-step frame.
    for i in range(1, 50, 2):
        centers_old = centers.copy()
        labels, inertia = clf._e_step(X, centers)
        # Single-argument print(...) emits the same text under Python 2
        # and Python 3.
        print('iteration: {0} inertia: {1}'.format(i, inertia))
        plotscatterhist(X, labels, '{0}_{1}.png'.format(basename, i),
                        centers, lumped=False, dpi=200)
        centers = clf._m_step(X, labels)
        plotscatterhist(X, labels, '{0}_{1}.png'.format(basename, i + 1),
                        centers, lumped=False, dpi=200)
        # Converged: the centers barely moved during this iteration.
        if np.sum((centers_old - centers) ** 2) < 1e-2:
            break
    print('starting video encoding... done.')
예제 #4
0
    def __init__(self, data, labels, k=None, radius=None, metric='euclidean'):
        """Store the inputs and derive centroids and a default radius.

        Args:
            data: 2-D array of samples (its ``shape`` is unpacked into two
                values below).
            labels: Labels associated with ``data``; stored unchanged.
            k: Number of clusters. When falsy, ``GMeans`` chooses the number
                of clusters and ``self.k`` is taken from its result.
            radius: Radius to use. When falsy, a heuristic value is computed
                from the spread of the data and the number of clusters.
            metric: Name of the distance metric; stored unchanged.
        """
        # defined attributes
        self.data = data
        self.labels = labels
        self.k = k
        self.radius = radius
        self.metric = metric

        # computed attributes
        if k:
            cluster = KMeans(data, k).cluster()
        else:
            cluster = GMeans(data).cluster()
            self.k = cluster.k

        self.centroids = cluster.centroids
        self.weights = None

        # compute heuristic for radius if none provided
        if not radius:
            _, d = self.data.shape
            centroid = np.mean(data, axis=0)
            # Largest distance from the global centroid to any sample.
            max_dist = np.max(distance.cdist(centroid[np.newaxis, :], data))
            # Use self.k rather than the k argument: k is None whenever the
            # cluster count was chosen by GMeans above. 1.0 / d keeps the
            # exponent fractional under Python 2 integer division as well.
            self.radius = max_dist / (self.k ** (1.0 / d))
예제 #5
0
def get_clusterer(n_clusters,
                  cat_features=None,
                  alg='kmeans',
                  agglo_params=None,
                  random_state=10):
    """Build the clustering estimator selected by ``alg``.

    Args:
        n_clusters: Number of clusters passed to the estimator.
        cat_features: Categorical feature specification forwarded to
            ``KPrototypes`` (only used when ``alg == 'kproto'``). ``None``
            (the default) is treated as an empty list.
        alg: One of ``'agglo'``, ``'kmeans'``, ``'kmodes'``, ``'fuzzy'`` or
            ``'kproto'``.
        agglo_params: Indexable pair ``(affinity, linkage)``; required when
            ``alg == 'agglo'`` and ignored otherwise.
        random_state: Seed forwarded to every estimator except the
            agglomerative one.

    Returns:
        A ``(clusterer, metric)`` tuple where ``metric`` is the distance name
        paired with the estimator (``'euclidean'`` or ``'manhattan'``). For
        an unrecognised ``alg`` the result is ``(None, '')``.

    Raises:
        TypeError: If ``alg == 'agglo'`` and ``agglo_params`` is ``None``.
    """
    # A fresh list per call avoids sharing one mutable default object
    # between invocations.
    if cat_features is None:
        cat_features = []

    clusterer = None
    metric = ''
    if alg == 'agglo':
        if agglo_params is None:
            # Indexing None raised an opaque TypeError before; keep the
            # exception type but say what is actually missing.
            raise TypeError("agglo_params must be an (affinity, linkage) "
                            "pair when alg='agglo'")
        clusterer = AgglomerativeClustering(affinity=agglo_params[0],
                                            compute_full_tree='auto',
                                            linkage=agglo_params[1],
                                            memory=None,
                                            n_clusters=n_clusters,
                                            pooling_func='deprecated')
        metric = 'euclidean'
    elif alg == 'kmeans':
        clusterer = KMeans(n_clusters=n_clusters, random_state=random_state)
        metric = 'euclidean'
    elif alg == 'kmodes':
        clusterer = KModes(n_clusters=n_clusters, random_state=random_state)
        metric = 'manhattan'
    elif alg == 'fuzzy':
        clusterer = FuzzyCMeans(n_clusters=n_clusters,
                                random_state=random_state)
        metric = 'euclidean'
    elif alg == 'kproto':
        clusterer = KPrototypes(n_clusters=n_clusters,
                                cat_features=cat_features,
                                random_state=random_state)
        metric = 'manhattan'

    return clusterer, metric

# Perform K-means with one-hot encoding
# 

# In[17]:


from cluster.kmeans import KMeans

# Try 25 seeds and keep the run whose labels agree best with y_encoded,
# measured by the adjusted Rand score.
best_clusters = best_centroids = best_r = None
best_score = -9999
for r in range(25):
    kmeans = KMeans(n_clusters=3, random_state=r)
    kmeans.fit(df_features_OHE.values)
    centroids = kmeans.centroids
    clusters = kmeans.labels
    inertia = kmeans.inertia
    score = adjusted_rand_score(y_encoded, clusters)
    if score > best_score:
        # Strictly better: on ties the earliest seed is kept.
        best_clusters, best_centroids = clusters, centroids
        best_score, best_r = score, r
kmeans_clusters = best_clusters
# Bare expression so the notebook cell displays the winning score.
best_score


# In[18]:
# ## K-Means
#
# Implement your own K-Means (KM) algorithm and apply it to the data in the file. Note that you are not allowed to use the sklearn library.

# Choose the best distance metric among euclidean, manhattan and cosine.

# In[120]:

# Notebook cell output: preview of the scaled/encoded features via head()
# (X_scaled_encoded is presumably a pandas DataFrame -- TODO confirm).
X_scaled_encoded.head()

# In[121]:

from cluster.kmeans import KMeans

# Fit 3-cluster K-Means on the scaled/encoded features.
kme = KMeans(n_clusters=3, random_state=11)
kme.fit(X_scaled_encoded.values)

# Samples in the space of the first two principal components,
# coloured by their assigned cluster.
_pca_points = X_scaled_encoded_pca.values
plt.scatter(_pca_points[:, 0],
            _pca_points[:, 1],
            s=50,
            c=kme.labels,
            cmap='viridis')

# NOTE(review): a separate PCA is fitted on the centroids alone, so their
# projection is not guaranteed to share axes with the sample projection
# plotted above -- confirm this is intended.
_centroid_pca = PCA(n_components=n_comp)
centroids_pcs = _centroid_pca.fit_transform(kme.centroids)

# Centroids drawn as large red crosses.
plt.scatter(centroids_pcs[:, 0], centroids_pcs[:, 1], s=200, c='r', marker='x')
plt.title('K-means')
예제 #8
0
from cluster.kmeans import KMeans
import matplotlib.pyplot as plt
from sklearn.datasets import make_blobs

# Synthetic dataset: 500 samples around 3 blob centers.
X, y = make_blobs(centers=3, n_samples=500, random_state=1)

# k-means has to be run 10 times (n_init=10); otherwise there is a certain
# probability of ending up with a very poor result.
cls = KMeans(n_clusters=3, init="k-means++", n_init=10)
cls.fit(X)

# One colour per cluster label for the samples.
group_colors = ['skyblue', 'coral', 'lightgreen']
colors = [group_colors[label] for label in cls.labels]

fig, ax = plt.subplots(figsize=(4, 4))
ax.scatter(X[:, 0], X[:, 1], alpha=0.5, color=colors)

# Cluster centers drawn on top in darker shades.
_centers = cls.cluster_centers
ax.scatter(_centers[:, 0],
           _centers[:, 1],
           marker='o',
           lw=2,
           color=['blue', 'darkred', 'green'])
ax.set_xlabel('$x_0$')
ax.set_ylabel('$x_1$')
plt.show()
#
# Before analysing the K-means algorithm results for the Breast Cancer W. dataset, it is important to know which
# random state yields the best clustering. To find out, we use the adjusted Rand score as the metric for comparing random states.

# In[92]:

# best random score:
from cluster.kmeans import KMeans
from sklearn.metrics import adjusted_rand_score

# Try 20 seeds and keep the run whose labels agree best with y,
# measured by the adjusted Rand score.
best_clusters = best_centroids = best_r = None
best_score = -9999
for r in range(20):
    kme = KMeans(n_clusters=2, random_state=r)
    kme.fit(X_num_scaled.values)
    score = adjusted_rand_score(y, kme.labels)
    if score > best_score:
        # Strictly better: on ties the earliest seed is kept.
        best_clusters, best_centroids = kme.labels, kme.centroids
        best_score, best_r = score, r
# NOTE(review): the name says "fcm" but these labels come from KMeans --
# confirm the intended variable name before relying on it downstream.
fcm_clusters = best_clusters
print('Best score:', best_score)
print('Best random state value:', best_r)

# In[93]:

# Refit a 2-cluster K-Means on the scaled numeric features with seed 0.
# NOTE(review): the preceding search stores its winning seed in best_r, but
# this refit hard-codes random_state=0 -- confirm which one is intended.
kme = KMeans(n_clusters=2, random_state=0)
kme.fit(X_num_scaled.values)
예제 #10
0
파일: main.py 프로젝트: reeechart/rakluster
from cluster.kmeans import KMeans
from cluster.kmedoids import KMedoids
from cluster.agglomerative_clustering import AgglomerativeClustering
from cluster.dbscan import DBSCAN
from cluster.metrics import purity
from sklearn import datasets

iris = datasets.load_iris()


def _fit_and_report(model):
    """Fit ``model`` on the iris data, then print its labels and purity."""
    model.fit(iris.data)
    print(model.labels_)
    print(purity(model.labels_, iris.target))


# Each estimator is bound to its name before fitting, then fitted and scored
# against the true iris targets.
kmeans = KMeans(n_clusters=3, max_iter=100)
_fit_and_report(kmeans)

kmedoids = KMedoids(n_clusters=3, max_iter=100)
_fit_and_report(kmedoids)

single_agglo = AgglomerativeClustering(
    n_clusters=3,
    linkage=AgglomerativeClustering.SINGLE_LINKAGE,
    affinity=AgglomerativeClustering.EUCLIDEAN_DISTANCE)
_fit_and_report(single_agglo)

# Only constructed here; it is not fitted within the visible lines.
complete_agglo = AgglomerativeClustering(
    n_clusters=3,
    linkage=AgglomerativeClustering.COMPLETE_LINKAGE,
    affinity=AgglomerativeClustering.EUCLIDEAN_DISTANCE)
예제 #11
0
def silhouette(X, X_pca, range_clusters=range(2, 5)):
    """Plot a silhouette analysis for several K-Means cluster counts.

    Adapted from the scikit-learn silhouette-analysis example:
    https://scikit-learn.org/stable/auto_examples/cluster/plot_kmeans_silhouette_analysis.html

    For each cluster count one figure with two panels is created: the left
    panel shows the per-sample silhouette values grouped by cluster, the
    right panel scatters the first two columns of ``X_pca.values`` coloured
    by cluster, with the model's centroids drawn on top. All figures are
    shown once at the end.

    Args:
        X: Feature container exposing ``.values`` and ``len()``
            (presumably a pandas DataFrame -- TODO confirm).
        X_pca: Container exposing ``.values`` with at least two columns,
            used for the right-hand scatter plot.
        range_clusters: Iterable of cluster counts to analyse.
    """
    band_gap = 10  # blank rows separating the silhouette bands

    for n_clusters in range_clusters:
        # One row, two panels: silhouettes on the left, scatter on the right.
        figure, (sil_ax, scatter_ax) = plt.subplots(1, 2)
        figure.set_size_inches(18, 7)

        # Silhouette coefficients lie in [-1, 1]; the axis is clipped to
        # [-0.1, 1] as in the reference example.
        sil_ax.set_xlim([-0.1, 1])
        # Room for every sample plus a gap around each cluster band.
        sil_ax.set_ylim([0, len(X) + (n_clusters + 1) * band_gap])

        # Fixed seed of 10 for reproducibility.
        model = KMeans(n_clusters=n_clusters, random_state=10)
        model.fit(X.values)
        labels = model.labels

        # Mean silhouette over all samples.
        mean_silhouette = silhouette_score(X, labels)
        print("For n_clusters =", n_clusters,
              "The average silhouette_score is :", mean_silhouette)

        # Silhouette value of every individual sample.
        per_sample = silhouette_samples(X, labels)

        band_bottom = band_gap
        for cluster_id in range(n_clusters):
            # Sorted silhouette values of the samples in this cluster.
            band_values = per_sample[labels == cluster_id]
            band_values.sort()

            band_size = band_values.shape[0]
            band_top = band_bottom + band_size

            shade = cm.nipy_spectral(float(cluster_id) / n_clusters)
            sil_ax.fill_betweenx(np.arange(band_bottom, band_top),
                                 0,
                                 band_values,
                                 alpha=0.7,
                                 facecolor=shade,
                                 edgecolor=shade)

            # Cluster number placed at the vertical middle of its band.
            sil_ax.text(-0.05, band_bottom + 0.5 * band_size, str(cluster_id))

            # The next band starts one gap above this one.
            band_bottom = band_top + band_gap

        sil_ax.set_title("The silhouette plot for the various clusters.")
        sil_ax.set_xlabel("The silhouette coefficient values")
        sil_ax.set_ylabel("Cluster label")

        # Dashed red line marking the mean silhouette value.
        sil_ax.axvline(x=mean_silhouette, color="red", linestyle="--")

        sil_ax.set_yticks([])  # no y-axis ticks / labels
        sil_ax.set_xticks([-0.1, 0, 0.2, 0.4, 0.6, 0.8, 1])

        # Right panel: samples in the first two X_pca columns.
        point_colors = cm.nipy_spectral(labels.astype(float) / n_clusters)
        projected = X_pca.values
        scatter_ax.scatter(projected[:, 0],
                           projected[:, 1],
                           c=point_colors,
                           marker='.',
                           s=30,
                           lw=0,
                           alpha=0.7,
                           edgecolor='k')

        # NOTE(review): the centroids come straight from the model fitted on
        # X.values, while the samples above are taken from X_pca -- confirm
        # both live in the same coordinate space.
        centers = model.centroids
        # White discs at the cluster centers ...
        scatter_ax.scatter(centers[:, 0],
                           centers[:, 1],
                           c="white",
                           marker='o',
                           s=200,
                           alpha=1,
                           edgecolor='k')

        # ... each overlaid with its cluster number.
        for center_idx, center in enumerate(centers):
            scatter_ax.scatter(center[0],
                               center[1],
                               marker='$%d$' % center_idx,
                               s=50,
                               alpha=1,
                               edgecolor='k')

        scatter_ax.set_title("The visualization of the clustered data.")
        scatter_ax.set_xlabel("Feature space for the 1st feature")
        scatter_ax.set_ylabel("Feature space for the 2nd feature")

        figure_title = ("Silhouette analysis for KMeans clustering on sample data "
                        "with n_clusters = %d" % n_clusters)
        plt.suptitle(figure_title, fontsize=14, fontweight='bold')

    plt.show()