Ejemplo n.º 1
0
    def OptimumCluster(self,df):
        """Display the K-Means elbow curve to pick a cluster count.

        df -- feature matrix to cluster (passed straight to the visualizer)
        """
        from yellowbrick.cluster import KElbowVisualizer

        estimator = KMeans()
        elbow = KElbowVisualizer(estimator, k=(1, 15))
        elbow.fit(df)
        elbow.poof()
Ejemplo n.º 2
0
def kelbow_optimization(df):
    """Plot and save the K-Means elbow curve for ``df``.

    Fits a KElbowVisualizer over k in [1, 10) and writes the chart to
    "Elbow Kmeans Cluster.pdf".  Returns ``df`` unchanged so the call
    can be chained in a pipeline.
    """
    model = KMeans()
    visualizer = KElbowVisualizer(model, k=(1, 10))
    visualizer.fit(df)
    # Bug fix: the original called poof() (a deprecated alias of show())
    # and then show(outpath=...), finalizing and drawing the figure
    # twice.  A single show() both renders and saves the chart.
    visualizer.show(outpath="Elbow Kmeans Cluster.pdf")
    return df
Ejemplo n.º 3
0
def showElbow():
    """Build an 8-blob synthetic dataset and display its elbow curve."""
    # Synthetic data with 8 cluster centers
    features, _labels = make_blobs(centers=8)

    # Mini-batch k-means elbow search over k = 4..11
    elbow = KElbowVisualizer(MiniBatchKMeans(), k=(4, 12))
    elbow.fit(features)  # score each candidate k on the training data
    elbow.poof()         # finalize and render the chart
 def elbow(matrix):
     """
     Helper for choosing 'k': not called from the main pipeline, it only
     draws the distortion (elbow) curve for the given matrix.

     :param matrix: tf-idf matrix
     :return: shows graph with the degree of distortion
     """
     viz = KElbowVisualizer(KMeans(), k=10)
     viz.fit(matrix)
     viz.poof()
Ejemplo n.º 5
0
 def elbow_kmeans_dist(self, corpus):
     """Run the elbow method for k-means scored by distortion.

     The resulting curve is written to ``<self.folder>elbow_distortion.png``.

     Keyword Arguments:
         corpus -- corpus to train on
     """
     estimator = KMeans(init='k-means++')
     candidate_ks = range(self.start, self.stop, self.step)
     elbow = KElbowVisualizer(estimator, k=candidate_ks, timings=False)
     elbow.fit(corpus.vectors)
     elbow.poof(outpath=self.folder + 'elbow_distortion.png')
     print('Saved elbow curve.')
Ejemplo n.º 6
0
def dendo(modelo):
    """Show a truncated weighted-linkage dendrogram for ``modelo``, then an
    elbow curve (calinski_harabasz) fitted on the linkage matrix."""
    datos = modelo.copy()
    enlaces = linkage(datos, method='weighted')

    print('weighted')
    plt.figure(figsize=(25, 10))
    dendrogram(enlaces, truncate_mode='lastp', p=12)
    plt.show()

    # NOTE(review): the elbow visualizer is fitted on the linkage matrix
    # itself rather than on the original data -- confirm this is intended.
    elbow = KElbowVisualizer(KMeans(), k=(2, 20), metric='calinski_harabasz')
    elbow.fit(enlaces)
    elbow.poof()
Ejemplo n.º 7
0
def elbow(Xl, Yl):
    """Elbow-method sweep, one curve per class label (1..16).

    Implementation of the Elbow method
    (https://en.wikipedia.org/wiki/Determining_the_number_of_clusters_in_a_data_set)
    to find how many clusters each dataset needs; used later during
    sample selection.  The cluster count is chosen by eye from where the
    elbow appears, balanced against training time.  IMPORTANT: for some
    classes the elbow is not well defined.
    """
    for clase in range(1, 17):
        # Rows belonging to the current class label
        idx = np.where(Yl == clase)[0]
        subset = Xl[idx, :]
        estimator = KMeans(n_jobs=-1)
        viz = KElbowVisualizer(
            estimator,
            k=(1, 12),
            title='Método Elbow para la clase ' + str(clase))
        viz.fit(subset)
        viz.poof()
Ejemplo n.º 8
0
def draw_elbow(path="images/elbow.png"):
    """Render the elbow curve for a synthetic blob dataset and save it.

    path -- output file for the finished chart
    """
    # Reproducible synthetic dataset: 8 blobs, 12 features, 1000 samples
    features, _labels = make_blobs(centers=8,
                                   n_features=12,
                                   n_samples=1000,
                                   shuffle=True,
                                   random_state=42)

    # Dedicated axes so the chart does not clobber an existing figure
    _, axes = plt.subplots()

    elbow = KElbowVisualizer(KMeans(), ax=axes, k=(4, 12))
    elbow.fit(features)       # compute elbow scores for each k
    elbow.poof(outpath=path)  # finalize and write the figure
Ejemplo n.º 9
0
 def elbow_kmeans_ch(self, corpus):
     """Perform elbow method for k-means clustering using the
     Calinski-Harabasz score.

     The resulting curve is written to ``<self.folder>elbow_c_h.png``.

     Keyword Arguments:
         corpus -- corpus to train on
     """
     print('Iterating kmeans over range of topics...')
     km = KMeans(init='k-means++')
     # Fix: 'calinski_harabaz' was renamed 'calinski_harabasz' in
     # scikit-learn/yellowbrick (the sibling example above already uses
     # the corrected spelling); the old name fails on current versions.
     visualizer = KElbowVisualizer(km,
                                   k=range(self.start, self.stop,
                                           self.step),
                                   metric='calinski_harabasz',
                                   timings=False)
     visualizer.fit(corpus.vectors)
     visualizer.poof(outpath=self.folder + 'elbow_c_h.png')
     print('Saved elbow curve.')
     return
Ejemplo n.º 10
0
def run_kmeans(X, y, title):
    """Draw three elbow curves for ``X``, one per scoring metric.

    Sweeps k in [2, 30) with silhouette, distortion, and
    Calinski-Harabasz scoring.  ``y`` is accepted for signature
    compatibility but is not used by the elbow method; ``title`` is the
    heading placed on each chart.
    """
    model = KMeans()
    # The original repeated this stanza three times verbatim; a loop over
    # the metric names removes the duplication.  Note: the deprecated
    # spelling 'calinski_harabaz' is corrected to 'calinski_harabasz'
    # (renamed in current scikit-learn/yellowbrick).
    for metric in ('silhouette', 'distortion', 'calinski_harabasz'):
        visualizer = KElbowVisualizer(model,
                                      k=(2, 30),
                                      metric=metric,
                                      title=title)
        visualizer.fit(X)  # Fit the data to the visualizer
        visualizer.poof()  # Draw/show/poof the data
Ejemplo n.º 11
0
# Columns 0..6 are the features; the last column is the target label
features = dataset.iloc[:, 0:7]
target = dataset.iloc[:, -1]
'''
print('----- features')
print(features)
print('----- target')
print(target)
exit()
'''

# Elbow curve to choose the cluster count (k in [1, 10))
model = KMeans()
visualizer = KElbowVisualizer(model, k=(1, 10))

visualizer.fit(features)  # Fit the data to the visualizer
visualizer.poof()  # Draw/show/poof the data

# Final clustering with the chosen k = 3
kmeans = KMeans(n_clusters=3)
kmeans.fit(features)
# NOTE(review): fit_predict() refits the estimator, so the fit() call
# above is redundant work and its fitted state is discarded.
cluster_labels = kmeans.fit_predict(features)

# NOTE(review): bare attribute access has no effect -- likely notebook
# residue; assign or print the centers, or remove this line.
kmeans.cluster_centers_

# Mean silhouette coefficient over all samples for the 3-cluster fit
silhouette_avg = metrics.silhouette_score(features, cluster_labels)
print('silhouette coefficient for the above clutering = ', silhouette_avg)


def purity_score(y_true, y_pred):
    # compute contingency matrix (also called confusion matrix)
    contingency_matrix = metrics.cluster.contingency_matrix(y_true, y_pred)
    return np.sum(np.amax(contingency_matrix,
Ejemplo n.º 12
0
# Clustering Evaluation Imports
from functools import partial

from sklearn.cluster import MiniBatchKMeans
from sklearn.datasets import make_blobs as sk_make_blobs

from yellowbrick.cluster import KElbowVisualizer

# Dataset-generation defaults shared by the example
N_SAMPLES = 1000
N_FEATURES = 12
SHUFFLE = True

# Pre-configured blob generator: callers only choose the center count
make_blobs = partial(sk_make_blobs,
                     n_samples=N_SAMPLES,
                     n_features=N_FEATURES,
                     shuffle=SHUFFLE)

if __name__ == '__main__':
    # Synthetic dataset with 8 cluster centers
    X, y = make_blobs(centers=8)

    # Mini-batch k-means elbow search over k = 4..11
    visualizer = KElbowVisualizer(MiniBatchKMeans(), k=(4, 12))
    visualizer.fit(X)                            # score each candidate k
    visualizer.poof(outpath="images/elbow.png")  # save the finished chart
Ejemplo n.º 13
0
# Inspect the transformer fitted upstream (clf/X_train defined earlier)
print(clf)

# Project the training data into the reduced (PCA) feature space
results = clf.fit_transform(X_train)

# NOTE(review): n_jobs was removed from KMeans in scikit-learn 1.0 --
# confirm the pinned sklearn version still accepts it.
model = KMeans(
    random_state=0,
    n_jobs=-1,
)

# https://www.scikit-yb.org/en/latest/api/cluster/elbow.html
visualizer = KElbowVisualizer(model, k=(1, 20))

visualizer.fit(results)  # Fit the data to the visualizer
# Finalize and render the figure
visualizer.show(outpath="charts/income.k-means.PCA.KElbowVisualizer.png")
# NOTE(review): poof() after show() finalizes the figure a second time;
# one call should suffice.
visualizer.poof()

# Intercluster distance map for the chosen k = 4
model = KMeans(
    n_clusters=4,
    random_state=0,
    n_jobs=-1,
)
visualizer = InterclusterDistance(model)

visualizer.fit(results)  # Fit the data to the visualizer
# Finalize and render the figure
visualizer.show(outpath="charts/income.k-means.PCA.InterclusterDistance.png")
visualizer.poof()

# Silhouette plot for the same k = 4 clustering
model = KMeans(n_clusters=4, random_state=0)
visualizer = SilhouetteVisualizer(model)
Ejemplo n.º 14
0
def elbow_method(matrix, k):
    """Display the K-Means elbow (distortion) curve for ``matrix``
    over the cluster range ``k``."""
    viz = KElbowVisualizer(KMeans(), k=k)
    viz.fit(matrix)
    viz.poof()
Ejemplo n.º 15
0
    model = KMeans(random_state=5)

    print("Prepping silhoutte plot...")
    plt.close()
    plt.figure()
    if dataset == "QSAR":
        visualizer = KElbowVisualizer(model,
                                      metric='silhouette',
                                      k=[2, 5, 10, 15, 20, 25, 30, 35, 40])
    else:
        visualizer = KElbowVisualizer(model,
                                      metric='silhouette',
                                      k=[2, 5, 10, 15, 20])
    visualizer.fit(dataX)  # Fit the data to the visualizer
    visualizer.poof(
        outpath=out +
        "{}_kmeans_sil.png".format(dataset))  # Draw/show/poof the data

    print("Prepping distortion plot...")
    plt.close()
    plt.figure()
    if dataset == "QSAR":
        visualizer = KElbowVisualizer(model,
                                      metric='distortion',
                                      k=[2, 5, 10, 15, 20, 25, 30, 35, 40])
    else:
        visualizer = KElbowVisualizer(model,
                                      metric='distortion',
                                      k=[2, 5, 10, 15, 20])
    visualizer.fit(dataX)  # Fit the data to the visualizer
    visualizer.poof(
def k_elbow(data, k_min, k_max, locate_elbow=True):
    """Show the K-Means elbow curve for ``data`` over k in [k_min, k_max).

    locate_elbow -- when True, yellowbrick annotates the detected
    "knee" on the chart.
    """
    model = KMeans(init="k-means++", n_jobs=-1)
    # Bug fix: the caller's locate_elbow argument was previously ignored
    # because the keyword was hard-coded to True.
    visualizer = KElbowVisualizer(model, k=(k_min, k_max),
                                  locate_elbow=locate_elbow)
    visualizer.fit(data)  # Fit the data to the visualizer
    visualizer.poof()  # Draw/show/poof the data
Ejemplo n.º 17
0
# Sweep the candidate cluster counts and print each silhouette score
for cluster in n_clusters:
    model = KMeans(cluster, random_state=42)
    preds = model.fit_predict(
        Data
    )  # Since we had 10 clusters, we have 10 labels in the output i.e. 0 to 9
    score = silhouette_score(Data, preds)
    print(cluster, " : ", score)
# the maximum score corresponds to the best number of clusters

# Visualize the silhouette distribution for each candidate cluster count
for cluster in n_clusters:
    model = KMeans(cluster, random_state=42)
    visualizer = SilhouetteVisualizer(model, colors='yellowbrick')

    visualizer.fit(Data)
    # NOTE(review): poof() and show() both finalize the figure; one call
    # should suffice.
    visualizer.poof()
    visualizer.show()
""" a better method to visualize silhouette score method """

import matplotlib.cm as cm  # changes the default colormap
range_n_clusters = [2, 3, 4, 5, 6, 7, 8, 9, 10]
for n_clusters in range_n_clusters:
    # Create a subplot with 1 row and 2 columns
    fig, (ax1, ax2) = plt.subplots(1, 2)
    fig.set_size_inches(13, 8)
    clusterer = KMeans(n_clusters=n_clusters, random_state=1)
    cluster_labels = clusterer.fit_predict(Data)
    silhouette_avg = silhouette_score(Data, cluster_labels)
    print("For n_clusters =", n_clusters, "The average silhouette_score is :",
          silhouette_avg)
    # Per-sample silhouette values, used for the detailed per-cluster plot
    sample_silhouette_values = silhouette_samples(Data, cluster_labels)
Ejemplo n.º 18
0
import warnings
warnings.filterwarnings('ignore')
from yellowbrick.cluster import KElbowVisualizer
from sklearn.metrics import silhouette_samples, silhouette_score
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt

df_test_res = feature_extract(df_test_res)  # feature-extraction helper (defined elsewhere)
# Keep columns from index 3 onward -- presumably the leading columns are
# ids/metadata; confirm against feature_extract's output layout.
data_feature = df_test_res.iloc[:, 3:]
# Standardize the features before clustering
scaler = StandardScaler()
data_feature_scaled = scaler.fit_transform(data_feature)

kmeans = KMeans(random_state=123)

# Instantiate the KElbowVisualizer with the number of clusters and the metric
Visualizer = KElbowVisualizer(kmeans,
                              k=(2, 7),
                              metric='silhouette',
                              timings=False)
plt.figure(figsize=(5, 3))
# Fit the data and visualize
Visualizer.fit(data_feature_scaled)
Visualizer.poof()