def plot_cluster_distances(estimator, dataset, version):
    visualizer = InterclusterDistance(estimator)
    visualizer.fit(data.DATA[dataset][version]['x_train'])
    visualizer.show(
        f'{PLOTS_FOLDER}/{dataset}/{version}/{dataset}_{version}_{estimator.__class__.__name__}_cluster_distances_k{estimator.n_clusters}.png'
    )
    plt.clf()
def intercluster_distance(ax=None):
    X, y = make_blobs(centers=12, n_samples=1000, n_features=16, shuffle=True)
    viz = InterclusterDistance(KMeans(9), ax=ax)
    viz.fit(X)
    viz.finalize()
    return viz
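# A possible way to use the helper above: hand it an Axes so the intercluster
# distance map lands on an existing figure and can be saved. The figure size and
# output path below are illustrative only.
import matplotlib.pyplot as plt

fig, ax = plt.subplots(figsize=(9, 6))
viz = intercluster_distance(ax=ax)
fig.savefig("intercluster_distance.png")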
def cluster_distances(model, X, graph):
    visualizer = InterclusterDistance(
        model, legend=True, legend_loc='upper left',
        title=" KMeans Intercluster Distance Map for " + graph)
    visualizer.fit(X)
    visualizer.show()
def ica(training_set, test_set, y_train):
    # https://www.ritchieng.com/machine-learning-dimensionality-reduction-feature-transform/
    ica_avg_kurtosis_curve(training_set)

    km = KMeans(n_clusters=2, init='k-means++', n_init=10, max_iter=300, random_state=RAND)
    ica = FastICA(n_components=10, random_state=RAND, max_iter=1000)
    X_train = ica.fit_transform(training_set)
    plot_silhouette(km, X_train, title="ICA(10), K=2")
    visualizer = InterclusterDistance(km)
    visualizer.fit(X_train)  # Fit the data to the visualizer
    visualizer.show()        # Finalize and render the figure

    km = KMeans(n_clusters=5, init='k-means++', n_init=10, max_iter=300, random_state=RAND)
    ica = FastICA(n_components=10, random_state=RAND, max_iter=1000)
    X_train = ica.fit_transform(training_set)
    plot_silhouette(km, X_train, title="ICA(10), K=5")

    km = KMeans(n_clusters=3, init='k-means++', n_init=10, max_iter=300, random_state=RAND)
    ica = FastICA(n_components=10, random_state=RAND, max_iter=1000)
    X_train = ica.fit_transform(training_set)
    plot_silhouette(km, X_train, title="ICA(10), K=3")

    km = KMeans(n_clusters=2, init='k-means++', n_init=10, max_iter=300, random_state=RAND)
    km.fit(X_train)
    hs = metrics.homogeneity_score(y_train, km.labels_)
    print("homogeneity score for K=2:", hs)
    y_train_inverse = (~y_train.astype(bool)).astype(int)
    hs = metrics.homogeneity_score(y_train_inverse, km.labels_)
    print("homogeneity score for K=2 (inverse):", hs)
def cluster_distance_map(text, model, cv):
    path = 'models/{}'.format(model)
    pipe = load(path)
    kmeans = pipe.named_steps['kmeans']
    svd = pipe.named_steps['truncatedsvd']
    X = svd.fit_transform(cv)

    visualizer = InterclusterDistance(
        kmeans,
        embedding='mds',
    )
    visualizer.fit(X)
    visualizer.show(outpath="plots/ClusterMap.png")
    plt.close()
def distance_yellowbrick(X, y, features):
    plt.switch_backend('agg')
    plt.clf()
    X_train, X_test, y_train, y_test = train_test_split(X[features], y, stratify=y, test_size=0.01)
    X = pd.DataFrame(X_test, columns=features)
    y = pd.Series(y_test)
    n_clusters = y.nunique()
    model = MiniBatchKMeans(n_clusters)
    visualizer_dist = InterclusterDistance(model)
    visualizer_dist.fit(X)
    visualizer_dist.finalize()
    return plt
def kMeans():
    # citation: https://realpython.com/k-means-clustering-python/
    digits = load_digits()
    # features
    digits_features = digits.data[:, 0:-1]
    # label
    label = digits.data[:, -1]
    scaler = StandardScaler()
    scaled_features = scaler.fit_transform(digits_features)

    # citation: hands on machine learning
    gm = GaussianMixture(covariance_type='spherical', n_components=8, n_init=10)
    gm.fit(scaled_features)
    print("GM Converged", gm.converged_)
    print("GM Convergence Iterations", gm.n_iter_)
    print("GM weights", gm.weights_)
    gm.predict(scaled_features)
    gm.predict_proba(scaled_features)
    gm.score_samples(scaled_features)

    # sweep the number of mixture components for the AIC/BIC comparison
    aic = []
    bic = []
    for i in range(1, 21):
        gm = GaussianMixture(covariance_type='spherical', n_components=i, n_init=10)
        gm.fit(scaled_features)
        aic.append(gm.aic(scaled_features))
        bic.append(gm.bic(scaled_features))
    plt.plot(aic, label="AIC")
    plt.plot(bic, label="BIC")
    # plt.xticks(np.arange(min(x), max(x) + 1, 1.0))
    # plt.xticks(range(1,18))
    plt.xlabel("Number of Clusters")
    plt.ylabel("Information Criterion")
    plt.legend()
    plt.show()

    # x_centered = digits_features - digits_features.mean(axis=0)
    # U, s, Vt = np.linalg.svd(x_centered)
    # c1 = Vt.T[:, 0]
    # c2 = Vt.T[:, 1]
    # W2 = Vt.T[:, :2]
    # X2D = x_centered.dot(W2)
    # pca = PCA()
    # pca.fit(scaled_features)
    # cumsum = np.cumsum(pca.explained_variance_ratio_)
    # d = np.argmax(cumsum >= 0.95) + 1
    # pca = PCA(n_components=0.95)
    # X_reduced = pca.fit_transform(scaled_features)

    explained_variance = []
    for i in range(1, 64):
        pca = PCA(n_components=i)
        pca.fit(scaled_features)
        cumsum = np.cumsum(pca.explained_variance_ratio_)
    plt.plot(cumsum, label="Explained Variance Ratio")
    # plt.xticks(np.arange(min(x), max(x) + 1, 1.0))
    # plt.xticks(range(1,18))
    plt.xlabel("Number of Dimensions")
    plt.ylabel("Explained Variance Ratio")
    plt.legend()
    plt.show()

    digits_trainingX, digits_testingX, digits_trainingY, digits_testingY = train_test_split(
        digits_features, label)

    # PCA reconstruction error
    # citation: https://stackoverflow.com/questions/36566844/pca-projection-and-reconstruction-in-scikit-learn
    error = []
    for i in range(1, 50):
        pca = PCA(n_components=i)
        pca.fit(digits_trainingX)
        U, S, VT = np.linalg.svd(digits_trainingX - digits_trainingX.mean(0))
        x_train_pca = pca.transform(digits_trainingX)
        x_train_pca2 = (digits_trainingX - pca.mean_).dot(pca.components_.T)
        x_projected = pca.inverse_transform(x_train_pca)
        x_projected2 = x_train_pca.dot(pca.components_) + pca.mean_
        loss = ((digits_trainingX - x_projected)**2).mean()
        error.append(loss)
    plt.clf()
    plt.figure(figsize=(15, 15))
    plt.title("reconstruction error")
    plt.plot(error, 'r')
    plt.xticks(range(len(error)), range(1, 50), rotation='vertical')
    plt.xlim([-1, len(error)])
    plt.show()

    clf = MLPClassifier(alpha=0.001, hidden_layer_sizes=(8,), random_state=1, solver='lbfgs')
    clf.fit(digits_trainingX, digits_trainingY)
    y_pred = clf.predict(digits_testingX)
    print("Accuracy Score Normal", accuracy_score(digits_testingY, y_pred))

    k_acc = []
    k_gm = []
    time_arr = []
    for k in range(1, 15):
        kmeans = KMeans(n_clusters=k)
        X_train = kmeans.fit_transform(digits_trainingX)
        X_test = kmeans.transform(digits_testingX)
        start_time = time.time()
        clf = MLPClassifier(alpha=0.001, hidden_layer_sizes=(8,), random_state=1, solver='lbfgs')
        clf.fit(X_train, digits_trainingY)
        total_time = time.time() - start_time
        y_pred = clf.predict(X_test)
        score = accuracy_score(digits_testingY, y_pred)
        k_acc.append(score)
        time_arr.append(total_time)
    plt.plot(k_acc, label="K-Means")
    plt.plot(time_arr, label="Computation Time")
    # plt.xticks(np.arange(min(x), max(x) + 1, 1.0))
    # plt.xticks(range(1,18))
    plt.xlabel("k # of clusters")
    plt.ylabel("NN Accuracy")
    plt.legend()
    plt.show()

    acc = []
    acc_ica = []
    acc_rca = []
    for i in range(1, 40):
        pca = PCA(n_components=i)
        X_train = pca.fit_transform(digits_trainingX)
        X_test = pca.transform(digits_testingX)
        clf = MLPClassifier(alpha=0.001, hidden_layer_sizes=(8,), random_state=1, solver='lbfgs')
        clf.fit(X_train, digits_trainingY)
        y_pred = clf.predict(X_test)
        score = accuracy_score(digits_testingY, y_pred)
        acc.append(score)

        ica = FastICA(n_components=i)
        x_train_i = ica.fit_transform(digits_trainingX)
        x_test_i = ica.transform(digits_testingX)
        clf.fit(x_train_i, digits_trainingY)
        y_pred_i = clf.predict(x_test_i)
        score_i = accuracy_score(digits_testingY, y_pred_i)
        acc_ica.append(score_i)

        rca = GaussianRandomProjection(n_components=i)
        x_train_r = rca.fit_transform(digits_trainingX)
        x_test_r = rca.transform(digits_testingX)
        clf.fit(x_train_r, digits_trainingY)
        y_pred_r = clf.predict(x_test_r)
        score_r = accuracy_score(digits_testingY, y_pred_r)
        acc_rca.append(score_r)
    plt.plot(acc, label="PCA")
    plt.plot(acc_ica, label="ICA")
    plt.plot(acc_rca, label="RCA")
    # plt.xticks(np.arange(min(x), max(x) + 1, 1.0))
    # plt.xticks(range(1,18))
    plt.xlabel("Components")
    plt.ylabel("NN Accuracy")
    plt.legend()
    plt.show()

    # cumsum = np.cumsum(pca.explained_variance_ratio_)
    # d = np.argmax(cumsum >= 0.95) + 1

    # randomized projections
    rnd_pca = PCA(n_components=50, svd_solver="randomized")
    X_reduced_rand = rnd_pca.fit_transform(scaled_features)

    # X_reduced_inc is assumed to come from an IncrementalPCA fit (the projection
    # itself is not shown here); the component count is a placeholder
    inc_pca = IncrementalPCA(n_components=50)
    X_reduced_inc = inc_pca.fit_transform(scaled_features)

    # citation: https://scikit-learn.org/stable/auto_examples/feature_selection/plot_feature_selection.html#sphx-glr-auto-examples-feature-selection-plot-feature-selection-py
    # k best
    scaler = MinMaxScaler()
    digits_indices = np.arange(digits_features.shape[-1])
    scaled_features_norm = scaler.fit_transform(scaled_features)
    k_selected = SelectKBest(f_classif, k=50)
    k_selected.fit(scaled_features_norm, label)
    scores = -np.log10(k_selected.pvalues_)
    plt.bar(digits_indices - .45, scores, width=.2,
            label=r'Univariate score ($-Log(p_{value})$)')
    plt.xlabel("Features")
    plt.ylabel("F-Score")
    plt.show()

    gm = GaussianMixture(covariance_type='spherical', n_components=8, n_init=10)
    gm.fit(X_reduced_inc)
    print("GM Converged - PCA Inc", gm.converged_)
    print("GM Convergence Iterations", gm.n_iter_)
    print("GM weights", gm.weights_)
    gm.predict(X_reduced_inc)
    gm.predict_proba(X_reduced_inc)
    gm.score_samples(X_reduced_inc)

    kmeans = KMeans(init="random", n_clusters=63, n_init=10, max_iter=300, random_state=42)
    kmeans.fit(scaled_features)
    # the lowest SSE value
    print("KMeans Inertia", kmeans.inertia_)
    # final locations of the centroid
    print("KMeans Cluster Centers", kmeans.cluster_centers_)
    # num of iterations required to converge
    print("KMeans Iterations Required To Converge", kmeans.n_iter_)
    # labels
    print("KMeans Labels", kmeans.labels_[:5])

    kmeans_kwargs = {
        "init": "random",
        "n_init": 10,
        "max_iter": 300,
        "random_state": 42,
    }
    sse = []
    for k in range(1, 63):
        kmeans = KMeans(n_clusters=k, **kmeans_kwargs)
        kmeans.fit(scaled_features)
        sse.append(kmeans.inertia_)
    kl = KneeLocator(range(1, 63), sse, curve="convex", direction="decreasing")
    # optimal k (number of clusters) for this dataset
    print("Elbow", kl.elbow)

    clf = MLPClassifier(alpha=0.001, hidden_layer_sizes=(8,), random_state=1, solver='lbfgs')
    clf.fit(digits_trainingX, digits_trainingY)
    y_pred = clf.predict(digits_testingX)
    model = KMeans(n_clusters=5)
    kmeans.fit(scaled_features)
    labels = kmeans.fit_predict(digits_testingX)
    print("Accuracy Score Normal", accuracy_score(digits_testingY, y_pred))
    print("Accuracy Score K-Means", accuracy_score(digits_testingY, labels))

    elbow_visualizer = KElbowVisualizer(model, k=(2, 63))
    elbow_visualizer.fit(digits_features)
    elbow_visualizer.show()
    silhouette_visualizer = SilhouetteVisualizer(model, colors='yellowbrick')
    silhouette_visualizer.fit(digits_features)
    silhouette_visualizer.show()
    ic_visualizer = InterclusterDistance(model)
    ic_visualizer.fit(digits_features)
    ic_visualizer.show()

    # gmm = GaussianMixture(n_components=7).fit(digits_features)
    # labels = gmm.predict(digits_features)
    # plt.scatter(digits_features[:, 0], digits_features[:, 1], c=labels, s=40, cmap='viridis')
    # plt.show()
    # digits_features_pd = pd.DataFrame(data=digits_features[1:, 1:],
    #                                   index=digits_features[1:, 0],
    #                                   columns=digits_features[0, 1:])
    # pd.plotting.scatter_matrix(digits_features_pd)
    # probs = GaussianMixture.predict_proba(digits_features)
    # print(probs[:5].round(3))

    kmeans = KMeans(init="random", n_clusters=18, n_init=10, max_iter=300, random_state=42)
    kmeans.fit(X_reduced_inc)
    # the lowest SSE value
    print("KMeans Inertia", kmeans.inertia_)
    # final locations of the centroid
    print("KMeans Cluster Centers", kmeans.cluster_centers_)
    # num of iterations required to converge
    print("KMeans Iterations Required To Converge", kmeans.n_iter_)
    # labels
    print("KMeans Labels", kmeans.labels_[:5])

    kmeans_kwargs = {
        "init": "random",
        "n_init": 10,
        "max_iter": 300,
        "random_state": 42,
    }
    sse = []
    for k in range(1, 18):
        kmeans = KMeans(n_clusters=k, **kmeans_kwargs)
        kmeans.fit(scaled_features)
        sse.append(kmeans.inertia_)
    kl = KneeLocator(range(1, 18), sse, curve="convex", direction="decreasing")
    # optimal k (number of clusters) for this dataset
    print("Elbow", kl.elbow)

    model = KMeans()
    elbow_visualizer = KElbowVisualizer(model, k=(2, 18))
    elbow_visualizer.fit(X_reduced_inc)
    elbow_visualizer.show()
    silhouette_visualizer = SilhouetteVisualizer(model, colors='yellowbrick')
    silhouette_visualizer.fit(X_reduced_inc)
    silhouette_visualizer.show()
    ic_visualizer = InterclusterDistance(model)
    ic_visualizer.fit(X_reduced_inc)
    ic_visualizer.show()
def icdm():
    X, _ = make_blobs(centers=12, n_samples=1000, n_features=16, shuffle=True)
    oz = InterclusterDistance(KMeans(9), ax=newfig())
    oz.fit(X)
    savefig(oz, "icdm")
model.fit(X_scaled)
print("Predicted labels ----")
model.predict(X_scaled)
df['cluster'] = model.predict(X_scaled)

plt.figure(figsize=(12, 9))
model = MiniBatchKMeans(n_clusters=2).fit(X_scaled)
visualizer = SilhouetteVisualizer(model, colors='yellowbrick')
visualizer.fit(X_scaled)
visualizer.show()

plt.figure(figsize=(12, 9))
visualizer = InterclusterDistance(model, min_size=10000)
visualizer.fit(X_scaled)
visualizer.show()

df = pd.concat([df, X_scaled], axis=1)

"""k-prototypes clustering algorithm"""
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"
%matplotlib inline

import pandas as pd
import numpy as np
from sklearn import preprocessing
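# The header above introduces k-prototypes (clustering for mixed numeric and
# categorical data), but the cell stops after the imports. A minimal sketch of the
# usual next step, assuming the third-party `kmodes` package is available and
# treating the categorical column positions (here [0]) as placeholders:
from kmodes.kprototypes import KPrototypes

kproto = KPrototypes(n_clusters=2, init='Cao', random_state=42)
# fit_predict needs the positions of the categorical columns in the matrix
clusters = kproto.fit_predict(df.values, categorical=[0])
df['kproto_cluster'] = clusters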
        st.text(classification_report(data_target, pred))

        # Confusion matrix
        plot_confusion_matrix(data_target, pred, figsize=(7, 5), cmap="PuBuGn")
        bottom, top = plt.ylim()
        plt.ylim(bottom + 0.5, top - 0.5)
        st.pyplot()

        # Elbow Method
        visualizer = KElbowVisualizer(KmeansClus, k=(1, 10))
        visualizer.fit(data_feature)
        visualizer.show()
        st.pyplot()

        # Inter Cluster Distances
        visualizer_inter = InterclusterDistance(KmeansClus)
        visualizer_inter.fit(data_feature)
        visualizer_inter.show()
        st.pyplot()

    except:
        st.write("Fill all parameters.")

########################################
# Mini-Batch k-means
########################################
if ML_option == "Mini-Batch k-means":
    try:
        # Mini Batch parameters
        Nk = st.number_input("Number of clusters: ", min_value=1, step=1)
        MBatchClus = MiniBatchKMeans(n_clusters=Nk)
        MBatchClus.fit(data_feature)
def kMeans():
    twitterX, twitterY, twitter_dataset, scaled_features = preprocess()

    gm = GaussianMixture(covariance_type='tied', n_components=18, n_init=10)
    gm.fit(scaled_features)
    print("GM Converged", gm.converged_)
    print("GM Convergence Iterations", gm.n_iter_)
    print("GM weights", gm.weights_)
    gm.predict(scaled_features)
    gm.predict_proba(scaled_features)
    gm.score_samples(scaled_features)

    # sweep the number of mixture components for the AIC/BIC comparison
    aic = []
    bic = []
    for i in range(1, 10):
        gm = GaussianMixture(covariance_type='spherical', n_components=i, n_init=10)
        gm.fit(scaled_features)
        aic.append(gm.aic(scaled_features))
        bic.append(gm.bic(scaled_features))
    plt.plot(aic, label="AIC")
    plt.plot(bic, label="BIC")
    # plt.xticks(np.arange(min(x), max(x) + 1, 1.0))
    # plt.xticks(range(1,18))
    plt.xlabel("Number of Clusters")
    plt.ylabel("Information Criterion")
    plt.legend()
    plt.show()

    twitter_trainingX, twitter_testingX, twitter_trainingY, twitter_testingY = train_test_split(
        twitterX, twitterY)

    # reconstruction error for FastICA components
    # citation: https://stackoverflow.com/questions/36566844/pca-projection-and-reconstruction-in-scikit-learn
    error = []
    for i in range(1, 8):
        pca = FastICA(n_components=i)
        pca.fit(twitter_trainingX)
        U, S, VT = np.linalg.svd(twitter_trainingX - twitter_trainingX.mean(0))
        x_train_pca = pca.transform(twitter_trainingX)
        x_train_pca2 = (twitter_trainingX - pca.mean_).dot(pca.components_.T)
        x_projected = pca.inverse_transform(x_train_pca)
        x_projected2 = x_train_pca.dot(pca.components_) + pca.mean_
        loss = ((twitter_trainingX - x_projected) ** 2).mean()
        error.append(loss)
    plt.clf()
    plt.figure(figsize=(15, 15))
    plt.title("reconstruction error")
    plt.plot(error, 'r')
    plt.xticks(range(len(error)), range(1, 8), rotation='vertical')
    plt.xlim([-1, len(error)])
    plt.show()

    clf = MLPClassifier(alpha=0.001, hidden_layer_sizes=(8,), random_state=1, solver='lbfgs')
    clf.fit(twitter_trainingX, twitter_trainingY)
    y_pred = clf.predict(twitter_testingX)
    print("Accuracy Score Normal", accuracy_score(twitter_testingY, y_pred))

    kmeans = KMeans(
        init="random",
        n_clusters=3,
        n_init=10,
        max_iter=300,
        random_state=42
    )
    kmeans.fit(scaled_features)
    labels = kmeans.fit_predict(twitter_testingX)
    print("Accuracy Score K-Means", accuracy_score(twitter_testingY, labels))

    for i in range(1, 9):
        pca = PCA(n_components=i)
        pca.fit(scaled_features)
        cumsum = np.cumsum(pca.explained_variance_ratio_)
    plt.plot(cumsum, label="Explained Variance Ratio")
    # plt.xticks(np.arange(min(x), max(x) + 1, 1.0))
    # plt.xticks(range(1,18))
    plt.xlabel("Number of Dimensions")
    plt.ylabel("Explained Variance Ratio")
    plt.legend()
    plt.show()

    # incremental PCA
    num_batches = 100
    inc_pca = IncrementalPCA(n_components=5)
    for X_batch in np.array_split(scaled_features, num_batches):
        inc_pca.partial_fit(X_batch)
    X_reduced_inc = inc_pca.transform(scaled_features)

    # randomized projections
    rnd_pca = PCA(n_components=5, svd_solver="randomized")
    X_reduced_rand = rnd_pca.fit_transform(scaled_features)

    # citation: https://scikit-learn.org/stable/auto_examples/feature_selection/plot_feature_selection.html#sphx-glr-auto-examples-feature-selection-plot-feature-selection-py
    # k best
    scaler = MinMaxScaler()
    digits_indices = np.arange(twitterX.shape[-1])
    scaled_features_norm = scaler.fit_transform(scaled_features)
    k_selected = SelectKBest(f_classif, k=8)
    k_selected.fit(scaled_features_norm, twitterY)
    scores = -np.log10(k_selected.pvalues_)
    plt.bar(digits_indices - .45, scores, width=.2,
            label=r'Univariate score ($-Log(p_{value})$)')
    plt.xlabel("Features")
    plt.ylabel("F-Score")
    plt.show()

    kmeans = KMeans(
        init="random",
        n_clusters=5,
        n_init=10,
        max_iter=300,
        random_state=42
    )
    kmeans.fit(scaled_features)
    labels = kmeans.fit_predict(twitter_dataset)
    # the lowest SSE value
    print("KMeans Inertia", kmeans.inertia_)
    # final locations of the centroid
    print("KMeans Cluster Centers", kmeans.cluster_centers_)
    # num of iterations required to converge
    print("KMeans Iterations Required To Converge", kmeans.n_iter_)
    # labels
    print("KMeans Labels", kmeans.labels_[:5])

    kmeans_kwargs = {
        "init": "random",
        "n_init": 10,
        "max_iter": 300,
        "random_state": 42,
    }
    sse = []
    for k in range(1, 18):
        kmeans = KMeans(n_clusters=k, **kmeans_kwargs)
        kmeans.fit(scaled_features)
        sse.append(kmeans.inertia_)

    model = KMeans(n_clusters=9)
    elbow_visualizer = KElbowVisualizer(model, k=(2, 18))
    elbow_visualizer.fit(twitterX)
    elbow_visualizer.show()
    silhouette_visualizer = SilhouetteVisualizer(model, colors='yellowbrick')
    silhouette_visualizer.fit(twitterX)
    silhouette_visualizer.show()
    ic_visualizer = InterclusterDistance(model)
    ic_visualizer.fit(twitterX)
    ic_visualizer.show()

    # unfinished scatter-plot stub:
    # X = twitter_dataset[:, []]
    # plt.scatter()
def intercluster(X):
    model = KMeans(3)
    visualizer = InterclusterDistance(model)
    visualizer.fit(X)
    visualizer.show()
def cluster_metrics(i_patches, a_patches, g_patches, city_names, K, save_path, g_indices):
    # intra-cluster distances: ssd of samples to the nearest cluster centre
    sum_of_squared_distances = []
    silhouette_scores = []
    calinski_harabasz_scores = []
    davies_bouldin_scores = []
    k_mean_list = []
    for k in K:
        model, k_means, A = get_kmeans144_result(a_patches, k)
        k_mean_list.append(k_means)
        sum_of_squared_distances.append(k_means.inertia_)
        labels = k_means.labels_
        score = metrics.silhouette_score(A, labels, metric='euclidean')
        silhouette_scores.append(score)
        score = metrics.calinski_harabasz_score(A, labels)
        calinski_harabasz_scores.append(score)
        score = metrics.davies_bouldin_score(A, labels)
        davies_bouldin_scores.append(score)
        mydict = dict_cluster(i_patches, a_patches, g_patches, city_names, k_means)
        save_path_k = '{}_{}'.format(save_path, k)
        gt_ratio = gt_metric(mydict, save_path_k)

    plot_figure(K, sum_of_squared_distances, save_path, 'sum_of_squared_distances')
    plot_figure(K, silhouette_scores, save_path, 'silhouette_scores')
    plot_figure(K, calinski_harabasz_scores, save_path, 'calinski_harabasz_scores')
    plot_figure(K, davies_bouldin_scores, save_path, 'davies_bouldin_score')

    ssd_best_index = sum_of_squared_distances.index(max(sum_of_squared_distances))
    sil_best_index = silhouette_scores.index(max(silhouette_scores))
    ch_best_index = calinski_harabasz_scores.index(max(calinski_harabasz_scores))
    db_best_index = davies_bouldin_scores.index(max(davies_bouldin_scores))
    # gtr_best_index = gt_ratio.index(max(gt_ratio))
    all_indices = [ssd_best_index, sil_best_index, ch_best_index, db_best_index]  # , gtr_best_index]  # , axis=None)
    best_k = np.array(K)[np.unique(all_indices)]

    for ind in range(len(K)):  # best_k:
        # Visualize output clusters of K means in 2D
        k_means = k_mean_list[ind]
        visualizer = InterclusterDistance(k_means)
        visualizer.fit(A)  # Fit the data to the visualizer
        # visualizer.show()  # Finalize and render the figure
        visualizer.show(outpath='{}_{}_InterclusterDistance.png'.format(save_path, ind))
        visualizer.poof()

    # Visualize through TSNE
    A_embedded = TSNE().fit_transform(A)
    plt.figure()
    palette = sns.color_palette("bright", 2)
    y_ = np.asarray(g_indices)
    y = y_.astype(np.float32)
    sns.scatterplot(A_embedded[:, 0], A_embedded[:, 1], hue=y, legend='full', palette=palette)
    plt.savefig('{}_tsne.png'.format(save_path))
    return
# Clustering Evaluation Imports
from functools import partial

from sklearn.cluster import KMeans
from sklearn.datasets import make_blobs as sk_make_blobs

from yellowbrick.cluster import InterclusterDistance

# Helpers for easy dataset creation
N_SAMPLES = 1000
N_FEATURES = 12
SHUFFLE = True

# Make blobs partial
make_blobs = partial(sk_make_blobs, n_samples=N_SAMPLES, n_features=N_FEATURES, shuffle=SHUFFLE)


if __name__ == '__main__':
    # Make a 12-blob dataset
    X, y = make_blobs(centers=12)

    # Instantiate the clustering model and visualizer
    visualizer = InterclusterDistance(KMeans(9))

    visualizer.fit(X)                           # Fit the training data to the visualizer
    visualizer.poof(outpath="images/icdm.png")  # Draw/show/poof the data
)

# https://www.scikit-yb.org/en/latest/api/cluster/elbow.html
visualizer = KElbowVisualizer(model, k=(1, 20))
visualizer.fit(results)  # Fit the data to the visualizer
# Finalize and render the figure
visualizer.show(outpath="charts/income.k-means.PCA.KElbowVisualizer.png")
visualizer.poof()

model = KMeans(
    n_clusters=4,
    random_state=0,
    n_jobs=-1,
)
visualizer = InterclusterDistance(model)
visualizer.fit(results)  # Fit the data to the visualizer
# Finalize and render the figure
visualizer.show(outpath="charts/income.k-means.PCA.InterclusterDistance.png")
visualizer.poof()

model = KMeans(n_clusters=4, random_state=0)
visualizer = SilhouetteVisualizer(model)
visualizer.fit(results)  # Fit the data to the visualizer
# Finalize and render the figure
visualizer.show(outpath="charts/income.k-means.PCA.SilhouetteVisualizer.png")

lowest_bic = np.infty
bic = []
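# The last two lines above set up what is usually a BIC-based model-selection sweep
# over Gaussian mixtures, but the fragment stops before the loop. A hedged sketch of
# how such a sweep typically continues (the component range and covariance types are
# placeholders, and `results` is assumed to be the same matrix fitted above):
from sklearn.mixture import GaussianMixture

best_gmm = None
for n_components in range(1, 10):
    for cov_type in ("spherical", "tied", "diag", "full"):
        gmm = GaussianMixture(n_components=n_components, covariance_type=cov_type, random_state=0)
        gmm.fit(results)
        bic.append(gmm.bic(results))
        if bic[-1] < lowest_bic:
            lowest_bic = bic[-1]
            best_gmm = gmm  # keep the model with the lowest BIC so far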
visualizerRadViz = RadViz(classes=classes, features=features, title=' ')
visualizerRadViz.fit(X, y)     # Fit the data to the visualizer
visualizerRadViz.transform(X)  # Transform the data
locationFileNameRVZ = os.path.join(
    '/home/ak/Documents/Research/Papers/figures',
    str(symbols[symbolIdx]) + '_idx_' + str(idx) + '_label_' + str(labelsIdx)
    + '_date_' + str(dateIdx) + '_radviz.png')
visualizerRadViz.show(outpath=locationFileNameRVZ)
plt.show()

## MDS
# Instantiate the clustering model and visualizer
model = KMeans(6)
plt.figure()
plt.xlabel('features', fontsize=12)
plt.ylabel('features', fontsize=12)
plt.xticks(fontsize=14)
plt.yticks(fontsize=12)
visualizerID = InterclusterDistance(model)
visualizerID.fit(X)  # Fit the data to the visualizer
locationFileNameID = os.path.join(
    '/home/ak/Documents/Research/Papers/figures',
    str(symbols[symbolIdx]) + '_idx_' + str(idx) + '_KMeans_MDS.png')
visualizerID.show(outpath=locationFileNameID)  # Finalize and render the figure
plt.show()
def pca(training_set, test_set):
    pca = PCA()
    pca.fit_transform(training_set)
    pca.transform(test_set)
    explained_variance = pca.explained_variance_ratio_
    components = 16
    print("for " + str(components) + " components")
    top_n = explained_variance[:components]
    print(top_n)
    print("captures ")
    print(np.sum(top_n))
    print("percent")
    pca_cum_variance(pca)

    pca = PCA(n_components=16)
    X_train = pca.fit_transform(training_set)
    X_test = pca.transform(test_set)

    distortions = []
    for i in range(1, 11):
        km = KMeans(n_clusters=i, init='k-means++', n_init=10, max_iter=300, random_state=RAND)
        km.fit(X_train)
        distortions.append(km.inertia_)
    plt.plot(range(1, 11), distortions, marker='o')
    plt.xlabel('Number of clusters')
    plt.ylabel('Distortion')
    plt.title("Distortion vs # Clusters PCA-20")
    plt.tight_layout()
    plt.show()

    km = KMeans(n_clusters=3, init='k-means++', n_init=10, max_iter=300, random_state=RAND)
    plot_silhouette(km, X_train, title="PCA20, K=3")
    visualizer = InterclusterDistance(km)
    visualizer.fit(X_train)  # Fit the data to the visualizer
    visualizer.show()        # Finalize and render the figure

    km = KMeans(n_clusters=2, init='k-means++', n_init=10, max_iter=300, random_state=RAND)
    plot_silhouette(km, X_train, title="PCA20, K=2")
    visualizer = InterclusterDistance(km)
    visualizer.fit(X_train)  # Fit the data to the visualizer
    visualizer.show()        # Finalize and render the figure

    km = KMeans(n_clusters=4, init='k-means++', n_init=10, max_iter=300, random_state=RAND)
    plot_silhouette(km, X_train, title="PCA20, K=4")

    km = KMeans(n_clusters=5, init='k-means++', n_init=10, max_iter=300, random_state=RAND)
    plot_silhouette(km, X_train, title="PCA20, K=5")
def plain_clustering():
    distortions = []
    for i in range(1, 11):
        km = KMeans(n_clusters=i, init='k-means++', n_init=10, max_iter=300, random_state=RAND)
        km.fit(X_train)
        distortions.append(km.inertia_)
    plt.plot(range(1, 11), distortions, marker='o')
    plt.xlabel('Number of clusters')
    plt.ylabel('Distortion')
    plt.tight_layout()
    plt.show()

    km = KMeans(n_clusters=3, init='k-means++', n_init=10, max_iter=300, random_state=RAND)
    y_km = km.fit_predict(X_train)
    visualizer = InterclusterDistance(km)
    visualizer.fit(X_train)  # Fit the data to the visualizer
    visualizer.show()        # Finalize and render the figure

    # cluster_labels = np.unique(y_km)
    # n_clusters = cluster_labels.shape[0]
    # silhouette_vals = silhouette_samples(X, y_km, metric='euclidean')
    # y_ax_lower, y_ax_upper = 0, 0
    # yticks = []
    # for i, c in enumerate(cluster_labels):
    #     c_silhouette_vals = silhouette_vals[y_km == c]
    #     c_silhouette_vals.sort()
    #     y_ax_upper += len(c_silhouette_vals)
    #     color = cm.jet(float(i) / n_clusters)
    #     plt.barh(range(y_ax_lower, y_ax_upper), c_silhouette_vals, height=1.0,
    #              edgecolor='none', color=color)
    #
    #     yticks.append((y_ax_lower + y_ax_upper) / 2.)
    #     y_ax_lower += len(c_silhouette_vals)
    #
    # silhouette_avg = np.mean(silhouette_vals)
    # plt.axvline(silhouette_avg, color="red", linestyle="--")
    #
    # plt.yticks(yticks, cluster_labels + 1)
    # plt.ylabel('Cluster')
    # plt.xlabel('Silhouette coefficient')
    #
    # plt.tight_layout()
    # # plt.savefig('images/11_04.png', dpi=300)
    # plt.show()

    km = KMeans(n_clusters=2, init='k-means++', n_init=10, max_iter=300, random_state=RAND)
    y_km = km.fit_predict(X_train)
    visualizer = InterclusterDistance(km)
    visualizer.fit(X_train)  # Fit the data to the visualizer
    visualizer.show()        # Finalize and render the figure

    cluster_labels = np.unique(y_km)
    n_clusters = cluster_labels.shape[0]
    silhouette_vals = silhouette_samples(X, y_km, metric='euclidean')
    y_ax_lower, y_ax_upper = 0, 0
    yticks = []
    for i, c in enumerate(cluster_labels):
        c_silhouette_vals = silhouette_vals[y_km == c]
        c_silhouette_vals.sort()
        y_ax_upper += len(c_silhouette_vals)
        color = cm.jet(float(i) / n_clusters)
        plt.barh(range(y_ax_lower, y_ax_upper), c_silhouette_vals, height=1.0,
                 edgecolor='none', color=color)
        yticks.append((y_ax_lower + y_ax_upper) / 2.)
        y_ax_lower += len(c_silhouette_vals)
    silhouette_avg = np.mean(silhouette_vals)
    plt.axvline(silhouette_avg, color="red", linestyle="--")
    plt.yticks(yticks, cluster_labels + 1)
    plt.ylabel('Cluster')
    plt.xlabel('Silhouette coefficient')
    plt.tight_layout()
    # plt.savefig('images/11_04.png', dpi=300)
    plt.show()
plt.title("K-Means (Dot Size = Silhouette Distance)", fontsize=20) plt.xlabel('Annual Income (K)', fontsize=22) plt.ylabel('Spending Score', fontsize=22) plt.xticks(fontsize=18) plt.yticks(fontsize=18) # plt.savefig('out/mall-kmeans-5-silhouette-size.png'); visualizer = SilhouetteVisualizer(k_means) visualizer.fit(X) visualizer.poof() fig = visualizer.ax.get_figure() # fig.savefig('out/mall-kmeans-5-silhouette.png', transparent=False); # Instantiate the clustering model and visualizer visualizer = InterclusterDistance(k_means) visualizer.fit(X) # Fit the training data to the visualizer visualizer.poof() # Draw/show/poof the data # plt.savefig('out/mall-kmeans-5-tsne.png', transparent=False); # Elbow Method (Manual) inertias = {} silhouettes = {} for k in range(2, 11): kmeans = KMeans(init='k-means++', n_init=10, n_clusters=k, max_iter=1000, random_state=42).fit(X) inertias[ k] = kmeans.inertia_ # Inertia: Sum of distances of samples to their closest cluster center
plt.xlabel('Number of clusters')
plt.ylabel('WCSS')
plt.show()

kmeans = KMeans(n_clusters=3, init='k-means++', max_iter=300, n_init=10, random_state=0)
pred_y_train = kmeans.fit_predict(X_train)
print("K Cluster Train Accuracy")
# homogeneity_score expects (labels_true, labels_pred)
homo_score = metrics.homogeneity_score(Y_train_encoded, pred_y_train)
print("Homogeneity Score")
print(homo_score)
print(accuracy_score(pred_y_train, Y_train_encoded))

visualizer = InterclusterDistance(kmeans)
# visualizer.fit(X_train)
# visualizer.show()

pred_y_test = kmeans.fit_predict(X_validation)
print("K Cluster Test Accuracy")
homo_score_test = metrics.homogeneity_score(Y_test_encoded, pred_y_test)
print("Homogeneity Score")
print(homo_score_test)
print(accuracy_score(pred_y_test, Y_test_encoded))

visualizer.fit(X_validation)
visualizer.show()

# Using K-means clusters as features
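# The closing comment above points at using the K-means cluster assignment as an
# extra input feature for a supervised model. A minimal sketch of that idea, assuming
# X_train / X_validation are array-like feature matrices and using LogisticRegression
# purely as a placeholder classifier:
import numpy as np
from sklearn.linear_model import LogisticRegression

kmeans.fit(X_train)                                     # fit clusters on the training split only
train_cluster = kmeans.predict(X_train).reshape(-1, 1)
valid_cluster = kmeans.predict(X_validation).reshape(-1, 1)

X_train_aug = np.hstack([X_train, train_cluster])       # original features + cluster id
X_valid_aug = np.hstack([X_validation, valid_cluster])  # (one-hot encoding the id is often preferable)

clf = LogisticRegression(max_iter=1000)
clf.fit(X_train_aug, Y_train_encoded)
print("Validation accuracy with cluster feature:",
      clf.score(X_valid_aug, Y_test_encoded))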