def goKmeans():
    # Flask view: expects a form field 'clusteringNum' and a JSON-encoded
    # 'dataset' (list of rows). Requires json, numpy as np, flask's request
    # and jsonify, sklearn.cluster.KMeans, and yellowbrick's
    # SilhouetteVisualizer / KElbowVisualizer; BN.is_number is an external
    # helper from the surrounding project.
    clusteringNum = request.form['clusteringNum']
    dataset = json.loads(request.form.get('dataset'))
    if clusteringNum == '' or int(float(clusteringNum)) < 2:
        clusteringNum = 2
    dataset = np.array(dataset)
    #dataset = np.delete(dataset, 0, 1)
    # Keep only the numeric cells of each row, cast to float
    new_list = list(
        list(float(a) for a in b if BN.is_number(a)) for b in dataset)
    kmeans = KMeans(n_clusters=int(float(clusteringNum)),
                    random_state=0).fit(new_list)
    new_list_as_array = np.array(new_list)
    SilhouetteVisualize = SilhouetteVisualizer(kmeans)
    SilhouetteVisualize.fit(new_list_as_array)
    # Cap the elbow search at k=10, or at the sample count if smaller
    if len(new_list) > 10:
        k_upper_bound = 10
    else:
        k_upper_bound = len(new_list)
    KElbowVisualize = KElbowVisualizer(KMeans(), k=k_upper_bound)
    KElbowVisualize.fit(new_list_as_array)  # Fit the data to the visualizer
    silhouette = SilhouetteVisualize.silhouette_score_
    elbow = KElbowVisualize.elbow_value_
    return jsonify({
        'inputArray': list(new_list),
        'kmeansLabels': kmeans.labels_.tolist(),
        'elbowValue': str(elbow),
        'silhouetteValue': '%.3f' % silhouette
    })
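# A hypothetical client-side call for the goKmeans view above; the route
# path, host/port, and payload values are assumptions, not part of the
# original (the view reads form fields, with 'dataset' JSON-encoded).
import json
import requests

payload = {
    'clusteringNum': '3',
    'dataset': json.dumps([[1.0, 2.0], [1.2, 1.9], [8.0, 9.0],
                           [8.1, 9.2], [4.0, 4.5], [4.2, 4.4]]),
}
resp = requests.post('http://localhost:5000/goKmeans', data=payload)
print(resp.json()['kmeansLabels'], resp.json()['silhouetteValue'])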
def investigate_cluster_nr(k_min, k_max, pca_scores):
    # One silhouette plot per candidate k, laid out on a shared grid.
    # (Parameters renamed from min/max so they no longer shadow builtins;
    # requires matplotlib.pyplot as plt, sklearn's KMeans, and
    # yellowbrick's SilhouetteVisualizer.)
    rows, cols = 6, 6  # REMEMBER TO ADJUST so that rows * cols >= k_max - k_min
    fig, ax = plt.subplots(rows, cols, figsize=(15, 8))
    for i in range(k_min, k_max):  # was 5, 30
        row, col = divmod(i - k_min, cols)  # walk the grid row by row
        kmeans_pca = KMeans(n_clusters=i, init='k-means++', random_state=42)
        kmeans_pca.fit(pca_scores)
        visualizer = SilhouetteVisualizer(kmeans_pca,
                                          colors='yellowbrick',
                                          ax=ax[row][col])
        visualizer.fit(pca_scores)
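# A usage sketch for investigate_cluster_nr above; the blob data and PCA
# step are illustrative assumptions, and any (n_samples, n_components)
# array works as pca_scores.
from sklearn.datasets import make_blobs
from sklearn.decomposition import PCA

X, _ = make_blobs(n_samples=500, centers=8, n_features=10, random_state=42)
pca_scores = PCA(n_components=4).fit_transform(X)
investigate_cluster_nr(5, 30, pca_scores)  # 25 panels fit the 6x6 grid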
def silhouette(ax):
    from sklearn.cluster import KMeans
    from yellowbrick.cluster import SilhouetteVisualizer
    from sklearn.datasets import make_blobs
    kws = {
        'centers': 8,
        'n_samples': 1000,
        'n_features': 12,
        'shuffle': True,
    }
    # Actually use the keyword arguments defined above (the original made a
    # throwaway call to make_blobs() and ignored kws)
    X, y = make_blobs(**kws)
    visualizer = SilhouetteVisualizer(KMeans(6), ax=ax)
    visualizer.title = "Silhouette Clusters for K-Means (k=6) on an 8 Blob Dataset"
    visualizer.fit(X)
    return visualizer
def silhouette_score_plot(self, directory):  # name fixed from "shiloutte"
    self._check_model()
    plt.figure(figsize=(10, 10))
    visualizer = SilhouetteVisualizer(
        self.best_estimator_.named_steps['clustering'],
        colors='yellowbrick',
        is_fitted=True)
    visualizer.fit(self.data_preprocessed)
    # show() finalizes the figure itself, so no separate finalize() is needed
    visualizer.show(directory + "/silhouette_score.png")
    plt.close()
def get_elbow_plot(X):
    output_text = ""
    try:
        # Sweep k over 1..29 to find the elbow, then refit at that k
        model = KMeans(random_state=40)
        elbow_score = KElbowVisualizer(model, k=(1, 30))
        elbow_score.fit(X)
        elbow_value = elbow_score.elbow_value_
        model = KMeans(elbow_value, random_state=42)
        silhouette_visualizer = SilhouetteVisualizer(model, colors='yellowbrick')
        silhouette_visualizer.fit(X)
        output_text = ("The optimal number of clusters is "
                       + str(silhouette_visualizer.n_clusters_)
                       + " and the silhouette score is "
                       + str(np.round(silhouette_visualizer.silhouette_score_, 2)))
    except ValueError as e:
        print(e)
    return output_text
def sc(f, g, krange):
    # Silhouette plots for k in [krange[0], krange[1]], three panels per row.
    # Requires math, os, pandas as pd, matplotlib.pyplot as plt,
    # sklearn.cluster.KMeans, and yellowbrick's SilhouetteVisualizer.
    df = pd.read_table(f, index_col=0, header=0)
    if g == 'c':
        df = df.T  # cluster the columns instead of the rows
    min_k = krange[0]
    max_k = krange[1]
    num_fig = max_k - min_k + 1
    num_row = math.ceil(num_fig / 3.0)
    plt.figure(figsize=(12, num_row * 4))
    for k in range(min_k, max_k + 1):
        model = KMeans(k, random_state=1)
        visualizer = SilhouetteVisualizer(model, colors='yellowbrick')
        plt.subplot(num_row, 3, k - min_k + 1)
        visualizer.fit(df)
        plt.title('k=%d silhouette score %0.2f' %
                  (k, visualizer.silhouette_score_))
        plt.grid(False)
    plt.tight_layout()
    plt.savefig(os.path.basename(f).replace('.txt', '_silhouette.pdf'))
def plot_cluster_silhouette(estimator, dataset, version):
    visualizer = SilhouetteVisualizer(estimator, colors='yellowbrick')
    visualizer.fit(data.DATA[dataset][version]['x_train'])
    visualizer.show(
        f'{PLOTS_FOLDER}/{dataset}/{version}/{dataset}_{version}_'
        f'{estimator.__class__.__name__}_cluster_silhouettes_k{estimator.n_clusters}.png'
    )
    plt.clf()
def log_silhouette_chart(model, X, experiment=None, **kwargs):
    """Log Silhouette Coefficients charts for KMeans clusterer.

    Charts are computed for j = 2, 3, ..., n_clusters.

    Make sure you created an experiment by using ``neptune.create_experiment()``
    before you use this method.

    Tip:
        Check `Neptune documentation
        <https://docs.neptune.ai/integrations/scikit_learn.html>`_
        for the full example.

    Args:
        model (:obj:`KMeans`):
            | KMeans object.
        X (:obj:`ndarray`):
            | Training instances to cluster.
        experiment (:obj:`neptune.experiments.Experiment`, optional, default is ``None``):
            | Neptune ``Experiment`` object to control to which experiment you log the data.
            | If ``None``, log to the currently active, most recent experiment.
        kwargs:
            KMeans parameters.

    Returns:
        ``None``

    Examples:
        .. code:: python3

            km = KMeans(n_init=11, max_iter=270)
            X, y = make_blobs(n_samples=579, n_features=17, centers=7, random_state=28743)

            neptune.init('my_workspace/my_project')
            neptune.create_experiment()

            log_silhouette_chart(km, X=X, n_clusters=12)
    """
    assert isinstance(model, KMeans), 'Model should be sklearn KMeans instance.'

    exp = _validate_experiment(experiment)

    model.set_params(**kwargs)
    n_clusters = model.get_params()['n_clusters']

    for j in range(2, n_clusters + 1):
        model.set_params(**{'n_clusters': j})
        model.fit(X)

        try:
            fig, ax = plt.subplots()
            visualizer = SilhouetteVisualizer(model, is_fitted=True, ax=ax)
            visualizer.fit(X)
            visualizer.finalize()
            exp.log_image(
                'charts_sklearn',
                fig,
                image_name='Silhouette Coefficients for k={}'.format(j))
            plt.close(fig)
        except Exception as e:
            print('Did not log Silhouette Coefficients chart. Error {}'.format(e))
def clustering(self, k):
    word_vectors = self.__model_p__.wv
    # random_state=True in the original is equivalent to random_state=1
    KM_model = KMeans(n_clusters=k,
                      max_iter=1000,
                      random_state=1,
                      n_init=50).fit(X=word_vectors.vectors)
    # For each cluster, collect the 15 words closest to its centroid
    center_closest = []
    for i in range(k):
        center_closest.append([
            el[0] for el in word_vectors.similar_by_vector(
                KM_model.cluster_centers_[i], topn=15, restrict_vocab=None)
        ])
    metric_str = 'euclidean'
    score = silhouette_score(word_vectors.vectors,
                             KM_model.predict(word_vectors.vectors),
                             metric=metric_str)
    print("silhouette_score:", score)
    SVmodel = SilhouetteVisualizer(KM_model, is_fitted=True)
    SVmodel.fit(word_vectors.vectors)
    SVmodel.show()
    # Build a per-word table: vector, assigned cluster, and closeness to it
    words = pd.DataFrame(word_vectors.vocab.keys(), columns=['words'])
    words['vectors'] = words.words.apply(lambda x: word_vectors[f'{x}'])
    words['cluster'] = words.vectors.apply(
        lambda x: KM_model.predict([np.array(x)]))
    words.cluster = words.cluster.apply(lambda x: x[0])
    words['closeness_score'] = words.apply(
        lambda x: 1 / (KM_model.transform([x.vectors]).min()), axis=1)
    return KM_model, center_closest, score, words
def silhouettevisual(model, X, graph):
    visualizer = SilhouetteVisualizer(
        model,
        colors='yellowbrick',
        title="Silhouette Plot of KMeans Clustering for " + graph)
    visualizer.fit(X)
    visualizer.show()
def do_kmeans(df, k):
    k_means = KMeans(init='k-means++',
                     n_clusters=k,
                     n_init=10,
                     max_iter=1000,
                     random_state=40)
    k_means.fit(df)
    wcss = k_means.inertia_
    sil = silhouette_score(df, k_means.labels_)

    plt.style.use('default')
    # Scale marker sizes by per-sample silhouette values; clip at zero so
    # badly-clustered samples (negative silhouette) don't get negative sizes
    sample_silhouette_values = silhouette_samples(df, k_means.labels_)
    sizes = 200 * np.clip(sample_silhouette_values, 0, None)
    plt.figure(figsize=(16, 10))
    plt.grid(True)
    plt.scatter(df.iloc[:, 0], df.iloc[:, 1], s=sizes, c=k_means.labels_)
    plt.scatter(k_means.cluster_centers_[:, 0],
                k_means.cluster_centers_[:, 1],
                marker='x', s=300, c="black")
    plt.title("K-Means (K={}, WCSS={:.2f}, Sil={:.2f})".format(k, wcss, sil),
              fontsize=20)
    plt.xlabel('Age', fontsize=22)
    plt.ylabel('Income', fontsize=22)
    plt.xticks(fontsize=18)
    plt.yticks(fontsize=18)
    plt.show()

    visualizer = SilhouetteVisualizer(k_means)
    visualizer.fit(df)
    visualizer.poof()
    fig = visualizer.ax.get_figure()
    print("K={}, WCSS={:.2f}, Sil={:.2f}".format(k, wcss, sil))
def kmeans_exp():
    with open('features_GMM.csv', mode='r') as feature_file:
        feature_reader = csv.reader(feature_file,
                                    delimiter=',',
                                    quotechar='"',
                                    quoting=csv.QUOTE_MINIMAL)
        i = 0
        for fs in feature_reader:
            if i > 0:  # skip the header row
                print(f"user n#{i}")
                print(fs)
                fs.pop()
                df_training = pd.DataFrame()
                training_list, testing_dict, training_fs_list, validation_fs_dict = load_dataset(
                    i)
                training_list = training_list[0:16] + training_list[
                    21:25] + training_list[36:40]
                for element in training_list:
                    df_training = df_training.append(element)
                # knn = KNeighborsClassifier()
                model = KMeans(n_clusters=6, random_state=42).fit(df_training)
                for signature in testing_dict['genuine']:
                    cluster = model.predict(signature)
                    print("testing signature:")
                    occurrences = Counter(cluster)
                    print(occurrences)
                visualizer = SilhouetteVisualizer(model)
                visualizer.fit(df_training)
                visualizer.show()
            i += 1
def silhouette_plot(self, latent_data):
    """Silhouette Plots and Scores to determine optimal K in KMeans"""
    fig, ax = plt.subplots(2, 2, figsize=(15, 8))
    for i in [2, 3, 4, 5]:
        # Create KMeans instance for different number of clusters
        km = KMeans(n_clusters=i, max_iter=10, random_state=0)
        # Map k=2..5 onto the 2x2 axes grid
        q, mod = divmod(i, 2)
        # Create SilhouetteVisualizer instance with KMeans instance,
        # then fit the visualizer
        visualizer = SilhouetteVisualizer(km,
                                          colors='yellowbrick',
                                          ax=ax[q - 1][mod])
        visualizer.fit(latent_data)
    fig.suptitle('Silhouette Plots for 2, 3, 4, 5 Cluster Centers',
                 fontsize=18)
    plt.savefig('Silhouette-Visualization.png', bbox_inches='tight')
    plt.close('all')
def silhouette(ax=None):
    X, y = make_blobs(centers=12, n_samples=1000, n_features=16, shuffle=True)
    viz = SilhouetteVisualizer(KMeans(9), ax=ax)
    viz.fit(X)
    viz.finalize()
    return viz
def showSilhouette():
    # Make 8 blobs dataset
    X, y = make_blobs(centers=8)
    # Instantiate the clustering model and visualizer
    model = MiniBatchKMeans(6)
    visualizer = SilhouetteVisualizer(model)
    visualizer.fit(X)  # Fit the training data to the visualizer
    visualizer.poof()  # Draw/show/poof the data
def create_silhouette_chart(model, X, **kwargs):
    """Create silhouette coefficients charts for KMeans clusterer.

    Charts are computed for j = 2, 3, ..., n_clusters.

    Tip:
        Check Sklearn-Neptune integration
        `documentation <https://docs-beta.neptune.ai/essentials/integrations/machine-learning-frameworks/sklearn>`_
        for the full example.

    Args:
        model (:obj:`KMeans`):
            | KMeans object.
        X (:obj:`ndarray`):
            | Training instances to cluster.
        kwargs:
            KMeans parameters.

    Returns:
        ``neptune.types.FileSeries`` object that you can assign to run's ``base_namespace``.

    Examples:
        .. code:: python3

            import neptune.new.integrations.sklearn as npt_utils

            km = KMeans(n_init=11, max_iter=270)
            X, y = make_blobs(n_samples=579, n_features=17, centers=7, random_state=28743)

            run = neptune.init(project='my_workspace/my_project')
            run['kmeans/silhouette'] = npt_utils.create_silhouette_chart(km, X, n_clusters=12)
    """
    assert isinstance(model, KMeans), 'Model should be sklearn KMeans instance.'

    charts = []

    model.set_params(**kwargs)
    n_clusters = model.get_params()['n_clusters']

    for j in range(2, n_clusters + 1):
        model.set_params(**{'n_clusters': j})
        model.fit(X)

        try:
            fig, ax = plt.subplots()
            visualizer = SilhouetteVisualizer(model, is_fitted=True, ax=ax)
            visualizer.fit(X)
            visualizer.finalize()
            charts.append(neptune.types.File.as_image(fig))
            plt.close(fig)
        except Exception as e:
            print('Did not log Silhouette Coefficients chart. Error {}'.format(e))

    return neptune.types.FileSeries(charts)
def Silhouette_plot(x, from_k, to_k):
    sil_score = []
    for k in range(from_k, to_k + 1):
        # Instantiate the clustering model and visualizer
        m = KMeans(n_clusters=k)
        visualizer = SilhouetteVisualizer(m)
        visualizer.fit(x)
        # Draw/show/poof the data
        visualizer.poof()
        sil_score.append([visualizer.silhouette_score_.round(3), k])
    return sil_score
def silhouette(matrix, k):
    """
    This function is not explicitly used either, since it only shows whether
    the chosen 'k' is good or not.
    :param matrix: tf-idf matrix
    :param k: chosen k (from the elbow method)
    :return: shows a graph with each cluster's internal similarity and its
             separation from the other clusters.
    """
    model_kmeans = KMeans(n_clusters=k, max_iter=200)
    silhouette = SilhouetteVisualizer(model_kmeans)
    silhouette.fit(matrix)
    silhouette.poof()
def silhouette_plot(text, model, cv):
    '''
    Loads in a saved model and produces a silhouette score plot
    '''
    path = 'models/{}'.format(model)
    pipe = load(path)
    kmeans = pipe.named_steps['kmeans']
    svd = pipe.named_steps['truncatedsvd']
    # Note: fit_transform refits the SVD step on the new counts; use
    # svd.transform(cv) instead to reuse the projection learned at train time
    X = svd.fit_transform(cv)
    visualizer = SilhouetteVisualizer(kmeans, colors='sns_deep')
    visualizer.fit(X)
    visualizer.show(outpath="plots/Silhouette.png")
    plt.close()
def clustering(fname="clustering.png"):
    # Create side-by-side axes grid
    _, axes = plt.subplots(ncols=2, figsize=(18, 6))
    X, y = make_blobs(centers=7)

    # Add K-Elbow to the left
    oz = KElbowVisualizer(MiniBatchKMeans(), k=(3, 12), ax=axes[0])
    oz.fit(X, y)
    oz.finalize()

    # Add SilhouetteVisualizer to the right
    oz = SilhouetteVisualizer(Birch(n_clusters=5), ax=axes[1])
    oz.fit(X, y)
    oz.finalize()

    # Save figure
    path = os.path.join(FIGURES, fname)
    plt.tight_layout()
    plt.savefig(path)
def main():
    xtrain1, xtest1, ytrain1, ytest1, xtrain2, xtest2, ytrain2, ytest2 = load_data()
    km = KMeans(4)
    visualizer = SilhouetteVisualizer(km, colors='yellowbrick')
    visualizer.fit(xtrain1)
    visualizer.show()
    # Cluster labels for the training set, scored against the true labels
    # (variable renamed from the misleading "ytest")
    cluster_labels = km.fit_predict(xtrain1)
    print(metrics.homogeneity_score(ytrain1, cluster_labels))
    score(xtrain2, 20, ytrain2)
    elbowplot(xtrain2, 20, "distortion",
              "K Means Clustering Distortion vs Number of Clusters dat2",
              "figs/kmeans/kmeans_elbow_dat2.png")
    elbowplot(xtrain1, 100, "distortion",
              "K Means Clustering Distortion vs Number of Clusters dat1",
              "figs/kmeans/kmeans_elbow_dat1.png")
    elbowplot(xtrain2, 40, "silhouette",
              "K Means Clustering Silhouette Score vs Number of Clusters dat2",
              "figs/kmeans/kmeans_silhouette_dat2.png",
              elbow=False)
    elbowplot(xtrain1, 100, "silhouette",
              "K Means Clustering Silhouette Score vs Number of Clusters dat1",
              "figs/kmeans/kmeans_silhouette_dat1.png",
              elbow=False)
    elbowplot(
        xtrain2, 20, "calinski_harabasz",
        "K Means Clustering Calinski Harabasz Score vs Number of Clusters dat2",
        "figs/kmeans/kmeans_calinski_dat2.png",
        elbow=False)
    elbowplot(
        xtrain1, 100, "calinski_harabasz",
        "K Means Clustering Calinski Harabasz Score vs Number of Clusters dat1",
        "figs/kmeans/kmeans_calinski_dat1.png",
        elbow=False)
def silhouette_yellowbrick(  # name fixed from "silhoutte"
    X,
    y,
    features,
):
    plt.switch_backend('agg')
    plt.clf()
    # Plot on a small stratified 1% sample to keep the visualizer fast
    X_train, X_test, y_train, y_test = train_test_split(X[features],
                                                        y,
                                                        stratify=y,
                                                        test_size=0.01)
    X = pd.DataFrame(X_test, columns=features)
    y = pd.Series(y_test)
    n_clusters = y.nunique()
    model = MiniBatchKMeans(n_clusters)
    visualizer_sil = SilhouetteVisualizer(model, colors='yellowbrick')
    visualizer_sil.fit(X)
    visualizer_sil.finalize()
    return plt
def kmeans_silhouette_plots(tfidf, num_clusters=[3, 5, 7, 9, 11]):
    '''
    Vectorizer results are normalized, which makes KMeans behave as
    spherical k-means for better results. Since LSA/SVD results are not
    normalized, we have to redo the normalization.

    The best silhouette value is 1 and the worst value is -1. Values near 0
    indicate overlapping clusters. Negative values generally indicate that a
    sample has been assigned to the wrong cluster, as a different cluster is
    more similar.
    '''
    print('\nUse kmeans silhouette score to visualize Silhouette Coefficients')
    for k in num_clusters:
        start = datetime.now()
        # Reduce the tf-idf matrix with LSA, then re-normalize the output
        svd = TruncatedSVD(n_components=50, n_iter=10, random_state=0)
        normalizer = Normalizer(copy=False)
        lsa = make_pipeline(svd, normalizer)
        reduced = lsa.fit_transform(tfidf)
        model = KMeans(n_clusters=k, init='k-means++')
        # Instantiate the clustering model and visualizer
        visualizer = SilhouetteVisualizer(model)
        # Fit the training data to the visualizer
        visualizer.fit(reduced)
        visualizer.finalize()
        filename = (r'images/silhouette_plots/kmeans_silh_plot_' + str(k) +
                    '_clusters_' + str(tfidf.shape[0]) + '_docs.png')
        plt.savefig(filename)
        plt.close()
        end = datetime.now()
        print(' ' + filename)
        print(" Time taken: {}".format(end - start))
visualizer.poof()

model = KMeans(
    n_clusters=4,
    random_state=0,
    n_jobs=-1,  # note: KMeans' n_jobs parameter was removed in scikit-learn 1.0
)
visualizer = InterclusterDistance(model)
visualizer.fit(results)  # Fit the data to the visualizer
# Finalize and render the figure
visualizer.show(outpath="charts/income.k-means.PCA.InterclusterDistance.png")
visualizer.poof()

model = KMeans(n_clusters=4, random_state=0)
visualizer = SilhouetteVisualizer(model)
visualizer.fit(results)  # Fit the data to the visualizer
# Finalize and render the figure
visualizer.show(outpath="charts/income.k-means.PCA.SilhouetteVisualizer.png")

# Model selection for a Gaussian mixture via BIC over covariance types
lowest_bic = np.infty
bic = []
n_components_range = range(1, 7)
cv_types = ['spherical', 'tied', 'diag', 'full']
for cv_type in cv_types:
    for n_components in n_components_range:
        # Fit a Gaussian mixture with EM
        gmm = GaussianMixture(n_components=n_components,
                              covariance_type=cv_type)
        gmm.fit(results)
# Clustering Evaluation Imports
from functools import partial

from sklearn.cluster import MiniBatchKMeans
from sklearn.datasets import make_blobs as sk_make_blobs

from yellowbrick.cluster import SilhouetteVisualizer

# Helpers for easy dataset creation
N_SAMPLES = 1000
N_FEATURES = 12
SHUFFLE = True

# Make blobs partial
make_blobs = partial(sk_make_blobs,
                     n_samples=N_SAMPLES,
                     n_features=N_FEATURES,
                     shuffle=SHUFFLE)

if __name__ == '__main__':
    # Make 8 blobs dataset
    X, y = make_blobs(centers=8)

    # Instantiate the clustering model and visualizer
    model = MiniBatchKMeans(6)
    visualizer = SilhouetteVisualizer(model)

    visualizer.fit(X)  # Fit the training data to the visualizer
    visualizer.poof(outpath="images/silhouette.png")  # Draw/show/poof the data
        silhouette_avg = silhouette_score(x, m.labels_).round(3)
        sils.append([silhouette_avg, k])
        # Compute the silhouette score for each sample
        # sample_silhouette_value = silhouette_samples(x, m.labels_)
        # print(sample_silhouette_value)
    return sils


ss = sil_score(x, 2, 5)
print(f'score={ss}')
print(f'optimum number of clusters = {max(ss)[1]}')

# Visualize Silhouette
# Instantiate the clustering model and visualizer
model = KMeans(n_clusters=3)
visualizer = SilhouetteVisualizer(model)
# Fit the training data to the visualizer
visualizer.fit(x)
# Draw/show/poof the data
visualizer.poof()
print(visualizer.silhouette_score_)  # near 1 is good


def Silhouette_plot(x, from_k, to_k):
    sil_score = []
    for k in range(from_k, to_k + 1):
        # Instantiate the clustering model and visualizer
        m = KMeans(n_clusters=k)
        visualizer = SilhouetteVisualizer(m)
        visualizer.fit(x)
        # Draw/show/poof the data
    model = KMeans(n_clusters=k, random_state=1)  # select the model
    model.fit(X)  # train the model
    kmeans_models.append(model)
    inertias.append(model.inertia_)
    silhouettes.append(silhouette_score(X, model.labels_))

print('inertias:', inertias)
print('silhouettes:', silhouettes)

# Plot of k vs. silhouette score
plt.plot(k_ranges[1:], silhouettes[1:], marker='o')
plt.xlabel('k')
plt.ylabel('silhouette score')
plt.show()

# Silhouette score = (b - a) / max(a, b)
# a: mean distance to the other samples in the same cluster
# b: mean distance to the samples in the nearest other cluster
# -1 <= ss <= 1
# The number of clusters of the model with the highest silhouette
# score is an appropriate choice for k.
# ss = 1:  samples are tightly grouped inside their own cluster and
#          far away from the other clusters
# ss = 0:  samples sit on the boundary between clusters
# ss = -1: samples have been assigned to the wrong cluster

# Silhouette diagram
# pip install yellowbrick
for model in kmeans_models[1:]:  # for each trained KMeans model
    # fixed: the parameter is "colors", not "color"
    visualizer = SilhouetteVisualizer(model, colors='yellowbrick')
    visualizer.fit(X)
    visualizer.show()
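# A minimal worked example of the silhouette formula above (the values of
# a and b are made up for illustration): for a sample with mean
# intra-cluster distance a = 0.2 and mean nearest-cluster distance b = 0.8,
a, b = 0.2, 0.8
s = (b - a) / max(a, b)
print(s)  # (0.8 - 0.2) / 0.8 = 0.75 -> close to 1, i.e. well clustered

# The same per-sample values for a whole dataset come from scikit-learn:
# from sklearn.metrics import silhouette_samples
# per_sample_scores = silhouette_samples(X, model.labels_)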
def silhouette():
    X, _ = make_blobs(centers=8)
    oz = SilhouetteVisualizer(MiniBatchKMeans(6), ax=newfig())
    oz.fit(X)
    savefig(oz, "silhouette")
def explore_DBSCAN_clustering(
    df,
    num_cols=None,
    metric="euclidean",
    eps=[0.5],
    min_samples=[5],
    include_silhouette=True,
    include_PCA=True,
    random_state=None,
):
    """fit and plot DBSCAN clustering algorithms

    Parameters
    ----------
    df : pandas.DataFrame
        the dataset, should be transformed with StandardScaler
    num_cols : list, optional
        list of numeric column names, in case of None, get all numeric columns
    metric : str, optional
        metric, by default "euclidean"
    eps : list, optional
        list of eps hyperparams, by default [0.5]
    min_samples : list, optional
        list of min_samples hyperparams, by default [5]
    include_silhouette : bool, optional
        whether Silhouette plots should be generated, by default True
    include_PCA : bool, optional
        whether PCA plots should be generated, by default True
    random_state : int, optional
        a number that determines random number generation for centroid
        initialization, by default None

    Returns
    -------
    tuple
        list
            a list of n_clusters values returned by DBSCAN models
        dict
            a dictionary with key=type of plot, value=list of plots

    Examples
    --------
    >>> original_df = pd.read_csv("/data/menu.csv")
    >>> numeric_features = eda.get_numeric_columns(original_df)
    >>> numeric_transformer = make_pipeline(SimpleImputer(), StandardScaler())
    >>> preprocessor = make_column_transformer(
    >>>     (numeric_transformer, numeric_features)
    >>> )
    >>> df = pd.DataFrame(
    >>>     data=preprocessor.fit_transform(original_df), columns=numeric_features
    >>> )
    >>> n_clusters, dbscan_plots = explore_DBSCAN_clustering(df)
    """
    if num_cols is None:
        num_cols = get_numeric_columns(df)
    else:
        _verify_numeric_cols(df, num_cols)

    x = df[num_cols]
    results = {}
    n_clusters = []
    s_plots = []
    pca_plots = []
    print("------------------------")
    print("DBSCAN CLUSTERING")
    print("------------------------")
    for e in eps:
        for ms in min_samples:
            dbscan = DBSCAN(eps=e, min_samples=ms, metric=metric)
            dbscan.fit(x)
            # Count clusters, excluding the -1 noise label only when present
            k = len(set(dbscan.labels_)) - (1 if -1 in dbscan.labels_ else 0)
            n_clusters.append(k)
            print(f"eps={e}, min_samples={ms}, n_cluster={k}")
            if include_silhouette and k > 0:
                # generate Silhouette plot; DBSCAN has no n_clusters or
                # predict, so both are patched onto the fitted instance to
                # make it look like a clusterer yellowbrick understands
                dbscan.n_clusters = k
                dbscan.predict = lambda x: dbscan.labels_
                fig, ax = plt.subplots()
                s_visualizer = SilhouetteVisualizer(dbscan,
                                                    colors="yellowbrick",
                                                    ax=ax)
                s_visualizer.fit(x)
                s_visualizer.show()
                s_plots.append(fig)
                # plt.clf()
                plt.close()
            else:
                s_plots.append(None)
            if include_PCA:
                # generate PCA plot
                p_lot = plot_pca_clusters(x,
                                          dbscan.labels_,
                                          random_state=random_state)
                pca_plots.append(p_lot)
            else:
                pca_plots.append(None)
    results["Silhouette"] = s_plots
    results["PCA"] = pca_plots
    return n_clusters, results
    # Load the data from the files in the corpus (tail of a load_corpus
    # helper whose opening lines are not shown here)
    for cat in categories:
        for name in os.listdir(os.path.join(path, cat)):
            files.append(os.path.join(path, cat, name))
            target.append(cat)

            with open(os.path.join(path, cat, name), 'r') as f:
                data.append(f.read())

    # Return the data bunch for use similar to the newsgroups example
    return Bunch(
        categories=categories,
        files=files,
        data=data,
        target=target,
    )


corpus = load_corpus('hobbies')
tfidf = TfidfVectorizer(stop_words='english')
docs = tfidf.fit_transform(corpus.data)

# Instantiate the clustering model and visualizer
visualizer = SilhouetteVisualizer(KMeans(n_clusters=6))
visualizer.fit(docs)
visualizer.poof()

# Instantiate the clustering model and visualizer
# (a tuple sweeps k over range(4, 10); the original list [4, 10] would
# only have evaluated k=4 and k=10)
visualizer = KElbowVisualizer(KMeans(), metric='silhouette', k=(4, 10))
visualizer.fit(docs)
visualizer.poof()
    '869Individual_AssignmentQ1_graphs/jelwery-kmeans-elbow-interia.png')

plt.figure()
plt.grid(True)
plt.plot(list(silhouettes.keys()), list(silhouettes.values()))
plt.title('K-Means, Silhouette Scores')
plt.xlabel("Number of clusters, K")
plt.ylabel("Silhouette")
plt.savefig(
    '869Individual_AssignmentQ1_graphs/jelwery-kmeans-elbow-silhouette.png')

# In[18]:

### sklearn.metrics.davies_bouldin_score(X, k_means.labels_)
visualizer = SilhouetteVisualizer(k_means)
visualizer.fit(X)
visualizer.poof()
fig = visualizer.ax.get_figure()
fig.savefig(
    '869Individual_AssignmentQ1_graphs/jelwery-kmeans-5-silhouette.png',
    transparent=False)

# # Answer to Question [1], Part [c]

# In[19]:

### Interpreting the Clusters
### Means
k_means.cluster_centers_