# Run KMeans on the current dataset and persist the label assignments.
res_df, k_labels_df = run_kmeans(X, y)
kmeans_labels_csv = f"data/kmeans_labels_{data_key}.csv"
k_labels_df.to_csv(kmeans_labels_csv, index=False)

# Plot every column of res_df in its own subplot and save the figure.
kmeans_plot_options = {
    "subplots": True,
    "style": ".-",
    "title": f"KMeans performance vs n_clusters on {data_key} data",
}
res_df.plot(**kmeans_plot_options)
plt.xlabel("n_clusters")
plt.savefig(f"output/kmeans_{data_key}.png")
plt.close()

if "credit" in data_key:
    # Both scatter plots use the same two columns: the first two entries of
    # TOP_FEATURES, with axis labels taken from the credit frame's columns.
    top_two_features = TOP_FEATURES[:2]
    credit_points = X.values[:, top_two_features]
    credit_axis_names = DATA["credit"][0].columns[top_two_features]

    # Reference plot using the supplied labels y.
    examine_credit_cluster(
        credit_points,
        y,
        title="True Label",
        xylabel=credit_axis_names,
        fname=f"output/true_cluster_{data_key}.png",
    )

    # Same points, labelled with the k_labels_df column selected by the
    # number of distinct values in y.
    kmeans_credit_labels = k_labels_df[y.nunique()]
    examine_credit_cluster(
        credit_points,
        kmeans_credit_labels,
        title="KMeans",
        xylabel=credit_axis_names,
        fname=f"output/kmeans_cluster_{data_key}.png",
    )

if data_key == "fashion":
    # Pick the k_labels_df column whose key equals the number of unique
    # values in y.
    fashion_class_count = len(np.unique(y))
    plot_fashion_cluster(
        X,
        k_labels_df[fashion_class_count],
        fname="output/kmeans_cluster_fashion.png",
    )
X_ica = ica.fit_transform(X) kurt = kurtosis(X_ica) plt.plot(kurt) plt.xlabel("IC") plt.ylabel("kurtosis") plt.title(f"ICA kurtosis on {data_key} data") plt.savefig(f"output/ica_kurtosis_{data_key}.png") plt.close() top_kurt_ind = (-kurt).argsort() # IC examine_credit_cluster( X_ica[:, top_kurt_ind[:2]], y, title=f"ICA transformation on {data_key} data", xylabel=["IC1", "IC2"], fname=f"output/ica_ic_cluster_{data_key}.png", ) if data_key == "fashion": # IC plot_fashion_cluster( ica.components_[top_kurt_ind[:25], :], range(25), fname="output/ica_ic_fashion.png", ) # reconstructed image X_recon = ica.inverse_transform(X_ica) plot_fashion_cluster(X_recon,
print(f"Running KMeans on {data_key} data")
# DATA[data_key] is iterated for its keys; each key indexes one (X, y) pair.
for algo_key in DATA[data_key]:
    print(f"..Running on {algo_key} transformed data")
    X, y = DATA[data_key][algo_key]

    # Cluster, then persist the label assignments for this combination.
    res_df, k_labels_df = run_kmeans(X, y)
    kmeans_labels_csv = f"data/kmeans_labels_{algo_key}_{data_key}.csv"
    k_labels_df.to_csv(kmeans_labels_csv, index=False)

    # Plot every column of res_df in its own subplot and save the figure.
    kmeans_plot_options = {
        "subplots": True,
        "style": ".-",
        "title": f"KMeans performance vs n_clusters on {algo_key} transformed {data_key} data",
    }
    res_df.plot(**kmeans_plot_options)
    plt.xlabel("n_clusters")
    plt.savefig(f"output/kmeans_{algo_key}_{data_key}.png")
    plt.close()

    # Both scatter plots use the first two columns of the transformed data.
    leading_two_columns = X[:, :2]

    # Reference plot using the supplied labels y.
    examine_credit_cluster(
        leading_two_columns,
        y,
        title=f"True label of {algo_key} transformed {data_key} data",
        fname=f"output/true_cluster_{algo_key}_{data_key}.png",
    )

    # Same points, labelled with the k_labels_df column selected by the
    # number of unique values in y.
    class_count = len(np.unique(y))
    examine_credit_cluster(
        leading_two_columns,
        k_labels_df[class_count],
        title=f"KMeans on {algo_key} transformed {data_key} data",
        fname=f"output/kmeans_cluster_{algo_key}_{data_key}.png",
    )
# Run EM clustering on every dataset in DATA that is enabled in RUN_DATA.
for data_key in DATA:
    if data_key not in RUN_DATA:
        continue

    print(f"Running EM on {data_key} data")
    X, y = DATA[data_key]

    # Cluster, then persist the label assignments.
    res_df, k_labels_df = run_em(X, y)
    em_labels_csv = f"data/EM_labels_{data_key}.csv"
    k_labels_df.to_csv(em_labels_csv, index=False)

    # Plot every column of res_df in its own subplot and save the figure.
    em_plot_options = {
        "subplots": True,
        "style": ".-",
        "title": f"EM performance vs n_clusters on {data_key} data",
    }
    res_df.plot(**em_plot_options)
    plt.xlabel("n_clusters")
    plt.savefig(f"output/EM_{data_key}.png")
    plt.close()

    if "credit" in data_key:
        # Scatter the columns named by the first two entries of TOP_FEATURES,
        # labelled with the k_labels_df column selected by the number of
        # distinct values in y.
        top_two_features = TOP_FEATURES[:2]
        credit_points = X.values[:, top_two_features]
        em_credit_labels = k_labels_df[y.nunique()]
        credit_axis_names = DATA["credit"][0].columns[top_two_features]
        examine_credit_cluster(
            credit_points,
            em_credit_labels,
            title="EM",
            xylabel=credit_axis_names,
            fname=f"output/EM_cluster_{data_key}.png",
        )

    if data_key == "fashion":
        # Pick the k_labels_df column whose key equals the number of unique
        # values in y.
        fashion_class_count = len(np.unique(y))
        plot_fashion_cluster(
            X,
            k_labels_df[fashion_class_count],
            fname="output/EM_cluster_fashion.png",
        )
index=NUM_CLUSTERS, ) k_labels_df = pd.DataFrame(data=k_labels) return res_df, k_labels_df for data_key in DATA: if data_key not in RUN_DATA: continue print(f"Running EM on {data_key} data") for algo_key in DATA[data_key]: print(f"..Running on {algo_key} transformed data") X, y = DATA[data_key][algo_key] res_df, k_labels_df = run_em(X, y) k_labels_df.to_csv(f"data/EM_labels_{algo_key}_{data_key}.csv", index=False) res_df.plot( subplots=True, style=".-", title=f"EM performance vs n_clusters on {algo_key} transformed {data_key} data", ) plt.xlabel("n_clusters") plt.savefig(f"output/EM_{algo_key}_{data_key}.png") plt.close() examine_credit_cluster( X[:, :2], k_labels_df[len(np.unique(y))], title=f"EM on {algo_key} transformed {data_key} data", fname=f"output/EM_cluster_{algo_key}_{data_key}.png", )
# Fit PCA on X and plot the per-component explained variance.
X_pca = pca.fit_transform(X)
eigenvalues = pca.explained_variance_
plt.plot(eigenvalues)
plt.title(f"PCA eigenvalue on {data_key} data")
plt.xlabel("PC")
plt.ylabel("eigenvalue")
if data_key == "fashion":
    # Only the fashion figure uses a logarithmic y axis.
    plt.yscale("log")
plt.savefig(f"output/pca_eigenval_{data_key}.png")
plt.close()

# PC: scatter of the first two transformed columns, with y as the labels.
pc_scatter_options = {
    "title": f"PCA transformation on {data_key} data",
    "xylabel": ["PC1", "PC2"],
    "fname": f"output/pca_pc_cluster_{data_key}.png",
}
examine_credit_cluster(X_pca[:, :2], y, **pc_scatter_options)

if data_key == "fashion":
    # PC image: the first 25 rows of components_, labelled 0..24.
    leading_components = pca.components_[:25, :]
    plot_fashion_cluster(
        leading_components,
        range(25),
        fname="output/pca_pc_fashion.png",
    )

    # Reconstructed image: refit with a 0.95 variance threshold and
    # whitening, then map the reduced data back to the input space.
    # NOTE: this rebinds `pca` to the newly created estimator.
    pca = PCA(n_components=0.95, whiten=True, random_state=0)
    X_reduced = pca.fit_transform(X)
    X_recon = pca.inverse_transform(X_reduced)
    print(f"...Keep {pca.n_components_} components for {data_key} data...")
    plot_fashion_cluster(
        X_recon,
        y,
        fname="output/pca_reconstructed_fashion.png",
    )
from sklearn.manifold import TSNE

from load_data import DATA
from examine_cluster import examine_credit_cluster

# Keys of DATA that this script processes; any other key is skipped.
RUN_DATA = ["credit", "fashion"]

# For each enabled dataset, embed it with t-SNE and save a scatter plot of
# the embedding with y passed as the labels.
for data_key in DATA:
    if data_key not in RUN_DATA:
        continue
    print(f"Running TSNE on {data_key} data")
    X, y = DATA[data_key]

    # The figure is a 2-D scatter, so embed directly into two dimensions.
    # t-SNE optimises all output axes jointly and does not order them, so
    # keeping two columns of a 3-D embedding (the previous behaviour) gives
    # an arbitrary projection rather than a proper 2-D embedding.
    tsne = TSNE(n_components=2, random_state=0, n_jobs=-1)
    X_tsne = tsne.fit_transform(X)

    examine_credit_cluster(
        X_tsne,
        y,
        title=f"TSNE on {data_key} data",
        xylabel=["", ""],  # t-SNE axes carry no meaning, so leave them unlabelled
        fname=f"output/tsne_cluster_{data_key}.png",
    )