def OptimumCluster(self, df):
    """Display an elbow curve (k = 1..14) to help choose a cluster count for *df*."""
    from yellowbrick.cluster import KElbowVisualizer

    estimator = KMeans()
    elbow_viz = KElbowVisualizer(estimator, k=(1, 15))
    elbow_viz.fit(df)
    elbow_viz.poof()
def kelbow_optimization(df):
    """Render the k-means elbow curve for *df* (k = 1..9) and save it as a PDF.

    Bug fix: the original called ``visualizer.poof()`` and then
    ``visualizer.show(outpath=...)``; both finalize the figure, and the first
    call can clear it so the subsequent save writes an empty plot. A single
    ``show(outpath=...)`` call both renders and saves.

    :param df: feature matrix to cluster
    :return: *df*, unchanged (kept for call-chaining compatibility)
    """
    model = KMeans()
    visualizer = KElbowVisualizer(model, k=(1, 10))
    visualizer.fit(df)
    # show() draws the elbow curve and, with outpath set, also writes it to disk.
    visualizer.show(outpath="Elbow Kmeans Cluster.pdf")
    return df
def showElbow():
    """Fit a MiniBatchKMeans elbow visualizer on an 8-blob dataset and display it."""
    # Synthetic data: 8 Gaussian blobs (labels unused here).
    X, _ = make_blobs(centers=8)
    # Sweep k over [4, 12) and draw the elbow curve.
    elbow_viz = KElbowVisualizer(MiniBatchKMeans(), k=(4, 12))
    elbow_viz.fit(X)
    elbow_viz.poof()
def elbow(matrix):
    """
    Helper (not wired into the pipeline) for deciding 'k' for clustering.

    :param matrix: tf-idf matrix
    :return: None; shows a graph with the degree of distortion
    """
    viz = KElbowVisualizer(KMeans(), k=10)
    viz.fit(matrix)
    viz.poof()
def elbow_kmeans_dist(self, corpus):
    """Run the k-means elbow method (distortion metric) and save the curve.

    Keyword Arguments:
    corpus -- corpus to train on; its ``vectors`` attribute is clustered
    """
    candidate_ks = range(self.start, self.stop, self.step)
    estimator = KMeans(init='k-means++')
    elbow_viz = KElbowVisualizer(estimator, k=candidate_ks, timings=False)
    elbow_viz.fit(corpus.vectors)
    # Write the rendered curve into the configured output folder.
    elbow_viz.poof(outpath=self.folder + 'elbow_distortion.png')
    print('Saved elbow curve.')
def dendo(modelo):
    """Show a truncated 'weighted'-linkage dendrogram for *modelo*, then an
    elbow curve (calinski_harabasz metric) computed over the linkage matrix."""
    data = modelo.copy()
    linked = linkage(data, method='weighted')
    print('weighted')
    plt.figure(figsize=(25, 10))
    # Truncate the dendrogram to the last 12 merges for readability.
    dendrogram(linked, truncate_mode='lastp', p=12)
    plt.show()
    # NOTE(review): the elbow visualizer is fit on the linkage matrix itself,
    # not on the raw data — preserved as-is, but worth confirming upstream.
    km = KMeans()
    elbow_viz = KElbowVisualizer(km, k=(2, 20), metric='calinski_harabasz')
    elbow_viz.fit(linked)
    elbow_viz.poof()
def elbow(Xl, Yl):
    """Per-class elbow-method sweep
    (https://en.wikipedia.org/wiki/Determining_the_number_of_clusters_in_a_data_set)
    to gauge how many clusters each dataset class needs; used later when
    selecting samples. The final k is picked by eye from where the elbow
    appears, weighed against training time. NB: for some classes the elbow
    is not well defined.
    """
    for clase in range(1, 17):
        # Row indices of Xl belonging to the current class label.
        class_rows = np.where(Yl == clase)[0]
        class_data = Xl[class_rows, :]
        model = KMeans(n_jobs=-1)
        elbow_viz = KElbowVisualizer(
            model,
            k=(1, 12),
            title=('Método Elbow para la clase ') + str(clase),
        )
        elbow_viz.fit(class_data)
        elbow_viz.poof()
def draw_elbow(path="images/elbow.png"):
    """Fit a KMeans elbow visualizer on synthetic blobs and save the figure to *path*."""
    # Reproducible synthetic dataset: 8 blobs, 12 features, 1000 samples.
    X, _ = make_blobs(centers=8, n_features=12, n_samples=1000,
                      shuffle=True, random_state=42)
    # Dedicated axes so the visualizer draws on a fresh figure.
    _, ax = plt.subplots()
    elbow_viz = KElbowVisualizer(KMeans(), ax=ax, k=(4, 12))
    elbow_viz.fit(X)
    elbow_viz.poof(outpath=path)  # render and write to disk
def elbow_kmeans_ch(self, corpus):
    """Perform the elbow method for k-means using the calinski_harabasz metric.

    Keyword Arguments:
    corpus -- corpus to train on; its ``vectors`` attribute is clustered

    Fix: metric name corrected from the deprecated sklearn spelling
    'calinski_harabaz' to 'calinski_harabasz', which current
    yellowbrick/scikit-learn releases expect (and which the rest of this
    file already uses).
    """
    print('Iterating kmeans over range of topics...')
    km = KMeans(init='k-means++')
    visualizer = KElbowVisualizer(km,
                                  k=range(self.start, self.stop, self.step),
                                  metric='calinski_harabasz',
                                  timings=False)
    visualizer.fit(corpus.vectors)
    # Save the curve into the configured output folder.
    visualizer.poof(outpath=self.folder + 'elbow_c_h.png')
    print('Saved elbow curve.')
    return
def run_kmeans(X, y, title):
    """Draw elbow curves for KMeans over k in [2, 30) with three metrics.

    Fixes: 'calinski_harabaz' corrected to 'calinski_harabasz' (the old
    sklearn spelling is rejected by current yellowbrick/scikit-learn), and
    the three copy-pasted visualizer stanzas collapsed into one loop.

    :param X: feature matrix to cluster
    :param y: accepted for interface compatibility; unused here
    :param title: title passed to each elbow plot
    """
    model = KMeans()
    for metric in ('silhouette', 'distortion', 'calinski_harabasz'):
        visualizer = KElbowVisualizer(model, k=(2, 30), metric=metric, title=title)
        visualizer.fit(X)   # Fit the data to the visualizer
        visualizer.poof()   # Draw/show/poof the data
features = dataset.iloc[:, 0:7] target = dataset.iloc[:, -1] ''' print('----- features') print(features) print('----- target') print(target) exit() ''' model = KMeans() visualizer = KElbowVisualizer(model, k=(1, 10)) visualizer.fit(features) # Fit the data to the visualizer visualizer.poof() # Draw/show/poof the data kmeans = KMeans(n_clusters=3) kmeans.fit(features) cluster_labels = kmeans.fit_predict(features) kmeans.cluster_centers_ silhouette_avg = metrics.silhouette_score(features, cluster_labels) print('silhouette coefficient for the above clutering = ', silhouette_avg) def purity_score(y_true, y_pred): # compute contingency matrix (also called confusion matrix) contingency_matrix = metrics.cluster.contingency_matrix(y_true, y_pred) return np.sum(np.amax(contingency_matrix,
# Clustering Evaluation Imports
from functools import partial

from sklearn.cluster import MiniBatchKMeans
from sklearn.datasets import make_blobs as sk_make_blobs
from yellowbrick.cluster import KElbowVisualizer

# Defaults baked into the dataset helper below.
N_SAMPLES = 1000
N_FEATURES = 12
SHUFFLE = True

# Partial that fixes the sample/feature counts for every blob dataset we make.
make_blobs = partial(sk_make_blobs, n_samples=N_SAMPLES,
                     n_features=N_FEATURES, shuffle=SHUFFLE)

if __name__ == '__main__':
    # Build an 8-blob dataset (labels unused), fit the elbow visualizer
    # over k in [4, 12), and save the rendered curve.
    X, _ = make_blobs(centers=8)
    elbow_viz = KElbowVisualizer(MiniBatchKMeans(), k=(4, 12))
    elbow_viz.fit(X)
    elbow_viz.poof(outpath="images/elbow.png")
# NOTE(review): this fragment reads `clf` and `X_train` from earlier in the
# file; `InterclusterDistance` / `SilhouetteVisualizer` are assumed imported
# from yellowbrick.cluster above.
print(clf)
results = clf.fit_transform(X_train)

model = KMeans(
    random_state=0,
    n_jobs=-1,
)
# https://www.scikit-yb.org/en/latest/api/cluster/elbow.html
visualizer = KElbowVisualizer(model, k=(1, 20))
visualizer.fit(results)  # Fit the data to the visualizer
# Fix: show() already finalizes and renders the figure, so the trailing
# poof() calls that followed each show() were redundant (they re-render an
# already-finalized figure) and have been dropped.
visualizer.show(outpath="charts/income.k-means.PCA.KElbowVisualizer.png")

model = KMeans(
    n_clusters=4,
    random_state=0,
    n_jobs=-1,
)
visualizer = InterclusterDistance(model)
visualizer.fit(results)  # Fit the data to the visualizer
visualizer.show(outpath="charts/income.k-means.PCA.InterclusterDistance.png")

model = KMeans(n_clusters=4, random_state=0)
visualizer = SilhouetteVisualizer(model)
def elbow_method(matrix, k):
    """Fit a KMeans elbow visualizer on *matrix* for the given *k* spec and show it."""
    viz = KElbowVisualizer(KMeans(), k=k)
    viz.fit(matrix)
    viz.poof()
model = KMeans(random_state=5) print("Prepping silhoutte plot...") plt.close() plt.figure() if dataset == "QSAR": visualizer = KElbowVisualizer(model, metric='silhouette', k=[2, 5, 10, 15, 20, 25, 30, 35, 40]) else: visualizer = KElbowVisualizer(model, metric='silhouette', k=[2, 5, 10, 15, 20]) visualizer.fit(dataX) # Fit the data to the visualizer visualizer.poof( outpath=out + "{}_kmeans_sil.png".format(dataset)) # Draw/show/poof the data print("Prepping distortion plot...") plt.close() plt.figure() if dataset == "QSAR": visualizer = KElbowVisualizer(model, metric='distortion', k=[2, 5, 10, 15, 20, 25, 30, 35, 40]) else: visualizer = KElbowVisualizer(model, metric='distortion', k=[2, 5, 10, 15, 20]) visualizer.fit(dataX) # Fit the data to the visualizer visualizer.poof(
def k_elbow(data, k_min, k_max, locate_elbow=True):
    """Plot the k-means elbow curve for *data* over k in [k_min, k_max).

    Fix: the ``locate_elbow`` argument was previously accepted but ignored
    (the visualizer call hard-coded ``locate_elbow=True``); it is now
    forwarded, so callers can disable automatic elbow detection. The default
    preserves the original behavior.

    :param data: feature matrix to cluster
    :param k_min: inclusive lower bound of the k sweep
    :param k_max: exclusive upper bound of the k sweep
    :param locate_elbow: whether the visualizer should mark the detected elbow
    """
    model = KMeans(init="k-means++", n_jobs=-1)
    visualizer = KElbowVisualizer(model, k=(k_min, k_max),
                                  locate_elbow=locate_elbow)
    visualizer.fit(data)   # Fit the data to the visualizer
    visualizer.poof()      # Draw/show/poof the data
# NOTE(review): this fragment reads `n_clusters` and `Data` from enclosing
# scope; both are assumed to be defined earlier in the file.

# Score k-means for each candidate cluster count.
for cluster in n_clusters:
    model = KMeans(cluster, random_state=42)
    preds = model.fit_predict(
        Data
    )  # Since we had 10 clusters, we have 10 labels in the output i.e. 0 to 9
    score = silhouette_score(Data, preds)
    print(cluster, " : ", score)  # the maximum score corresponds to the best number of clusters

# Visualize the silhouette distribution for each candidate cluster count.
for cluster in n_clusters:
    model = KMeans(cluster, random_state=42)
    visualizer = SilhouetteVisualizer(model, colors='yellowbrick')
    visualizer.fit(Data)  # Fit the data to the visualizer
    visualizer.poof()
    visualizer.show()
""" a better method to visualize silhouette score method """
import matplotlib.cm as cm  # changes the default colormap
range_n_clusters = [2, 3, 4, 5, 6, 7, 8, 9, 10]
for n_clusters in range_n_clusters:
    # Create a subplot with 1 row and 2 columns
    fig, (ax1, ax2) = plt.subplots(1, 2)
    fig.set_size_inches(13, 8)
    clusterer = KMeans(n_clusters=n_clusters, random_state=1)
    cluster_labels = clusterer.fit_predict(Data)
    # Average silhouette over all samples for this k.
    silhouette_avg = silhouette_score(Data, cluster_labels)
    print("For n_clusters =", n_clusters,
          "The average silhouette_score is :", silhouette_avg)
    # Per-sample silhouette values — presumably plotted on ax1 further
    # below; verify against the continuation of this loop.
    sample_silhouette_values = silhouette_samples(Data, cluster_labels)
import warnings
warnings.filterwarnings('ignore')  # silence library warning chatter
from yellowbrick.cluster import KElbowVisualizer
from sklearn.metrics import silhouette_samples, silhouette_score
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt

# Feature-extraction helper; defined elsewhere in the project.
df_test_res = feature_extract(df_test_res)
# Keep everything from the 4th column onward as the feature matrix.
data_feature = df_test_res.iloc[:, 3:]

# Standardize features (zero mean / unit variance) before clustering.
scaler = StandardScaler()
data_feature_scaled = scaler.fit_transform(data_feature)

kmeans = KMeans(random_state=123)
# Instantiate the KElbowVisualizer with the number of clusters and the metric
Visualizer = KElbowVisualizer(kmeans, k=(2, 7), metric='silhouette',
                              timings=False)
plt.figure(figsize=(5, 3))
# Fit the data and visualize
Visualizer.fit(data_feature_scaled)
Visualizer.poof()