def B1(pca=False):
    '''
    Sweep K over each digits embedding and plot WC_SSD and SC against K.

    For every dataset a single KMeans model is fitted per value of K. The
    first two values returned by ``get_evals()`` are stored as WC_SSD and SC.
    Two figures are drawn (one per metric, one curve per dataset) and shown.

    pca: accepted for call compatibility; this function does not read it.
    '''
    datasets = [
        'digits-embedding.csv',
        'digits-embedding-2467.csv',
        'digits-embedding-67.csv'
    ]
    K = [2, 4, 6, 8, 16, 32]

    # Rows follow the dataset order, columns follow the order of K.
    table_shape = (len(datasets), len(K))
    ssd_table = zeros(table_shape)
    sc_table = zeros(table_shape)

    for row, path in enumerate(datasets):
        # The first two CSV columns are dropped; the rest are clustered.
        features = genfromtxt(path, delimiter=',')[:, 2:]
        for col, n_clusters in enumerate(K):
            model = KMeans(n_clusters=n_clusters)
            model.fit(features)
            ssd, sc, _ = model.get_evals()
            ssd_table[row, col] = ssd
            sc_table[row, col] = sc

    # One figure per metric, WC_SSD first, then SC.
    for table, heading in ((ssd_table, 'WC_SSD v.s. K'),
                           (sc_table, 'SC v.s. K')):
        figure()
        for path, curve in zip(datasets, table):
            plot(K, curve, label=path)
        legend()
        title(heading)
    show()
def B4(pca=False):
    '''
    Cluster each digits embedding with a fixed K, report NMI and draw a
    2D scatter of a random sample coloured by cluster.

    The three datasets are clustered with K = 8, 4 and 2 respectively.
    The third value returned by ``get_evals()`` is stored as that dataset's
    NMI. For each dataset a new figure shows up to 1000 randomly chosen
    points, using the first two feature columns as coordinates and one
    random RGB colour per cluster. The file names and NMI values are
    printed before the figures are shown.

    pca: accepted for call compatibility; this function does not read it.
    '''
    fnames = [
        'digits-embedding.csv',
        'digits-embedding-2467.csv',
        'digits-embedding-67.csv'
    ]
    cluster_counts = [8, 4, 2]
    nmi = zeros(len(fnames))

    for slot, (path, k) in enumerate(zip(fnames, cluster_counts)):
        table = genfromtxt(path, delimiter=',')
        points = table[:, 2:]
        labels = get_normalized_labels(table[:, 1])

        model = KMeans(n_clusters=k)
        # fit() returns what is used below as the per-point cluster index.
        assignment = model.fit(points, labels)
        _, _, score = model.get_evals()
        nmi[slot] = score

        figure()
        # Draw the sample first, then the palette, so random numbers are
        # consumed in the same order as before.
        sample = permutation(points.shape[0])[:1000]
        shown = points[sample]
        shown_clusters = assignment[sample]
        palette = rand(k, 3)
        scatter(shown[:, 0], shown[:, 1], c=palette[shown_clusters, :],
                alpha=0.9, s=30)

    print(fnames)
    print("NMI =", nmi)
    show()
def Bonus4():
    '''
    Repeat B1, B2, B4 with PCA embedding.

    For each PCA-embedding CSV and each K, KMeans is fitted 10 times and
    the three values returned by ``get_evals()`` are recorded as WC_SSD,
    SC and NMI. The three result arrays are saved to ``Bonus_*.npy``.
    WC_SSD and SC are then reloaded and plotted against K as mean with
    standard deviation error bars over the 10 runs. The mean NMI per
    (dataset, K) is printed.
    '''
    K = [2, 4, 6, 8, 16, 32]
    fnames = [
        'digits-pca-embedding.csv',
        'digits-pca-embedding-2467.csv',
        'digits-pca-embedding-67.csv'
    ]
    # Result arrays are indexed as [dataset, K index, run]; 10 runs each.
    wc_ssd_val = zeros((len(fnames), len(K), 10))
    sc_val = zeros((len(fnames), len(K), 10))
    nmi_val = zeros((len(fnames), len(K), 10))
    for i, fname in enumerate(fnames):
        raw = genfromtxt(fname, delimiter=',')
        # Column 1 is passed to get_normalized_labels; columns 2+ are the
        # data that gets clustered.
        X = raw[:, 2:]
        y = get_normalized_labels(raw[:, 1])
        for j, k in enumerate(K):
            for m in range(10):
                # A fresh model per run, so every run is fitted from scratch.
                kmeans = KMeans(n_clusters=k)
                ind = kmeans.fit(X, y)
                wc_ssd_val[i, j, m], sc_val[i, j, m], nmi_val[i, j, m] = kmeans.get_evals()
        # NOTE(review): the nesting of this scatter block could not be
        # recovered from the collapsed source. It is placed at dataset level,
        # where rebinding X cannot affect later fits - confirm against the
        # original file. At this level `k` and `ind` are left over from the
        # last run of the last K.
        figure()
        # Plot at most 1000 randomly chosen points.
        perm = permutation(X.shape[0])[:1000]
        X = X[perm]
        ind = ind[perm]
        # One random RGB colour per cluster, looked up by each point's index.
        colors = rand(k, 3)[ind, :]
        scatter(X[:, 0], X[:, 1], c=colors, alpha=0.9, s=30)
    # Save the raw per-run results.
    save('Bonus_wc_ssd_val.npy', wc_ssd_val)
    save('Bonus_sc_val.npy', sc_val)
    save('Bonus_nmi_val.npy', nmi_val)
    # WC_SSD and SC are read back from the files just written. nmi_val is
    # not reloaded; the in-memory array is used for the printout below.
    wc_ssd_val = load('Bonus_wc_ssd_val.npy')
    sc_val = load('Bonus_sc_val.npy')
    # Mean and standard deviation over the 10 runs (axis 2).
    ssd_means = mean(wc_ssd_val, axis=2)
    sc_means = mean(sc_val, axis=2)
    ssd_std = std(wc_ssd_val, axis=2)
    sc_std = std(sc_val, axis=2)
    # Plot WC_SSD against K, one error-bar curve per dataset.
    figure()
    for i, fname in enumerate(fnames):
        errorbar(K, ssd_means[i], ssd_std[i], capsize=4, label=fname)
    legend()
    title('WC_SSD v.s. K')
    # Plot SC against K, one error-bar curve per dataset.
    figure()
    for i, fname in enumerate(fnames):
        errorbar(K, sc_means[i], sc_std[i], capsize=4, label=fname)
    legend()
    title('SC v.s. K')
    print(fnames)
    # One row per dataset, one column per K.
    print("NMI =", mean(nmi_val, axis=2))
    show()
def B3():
    '''
    Repeat the K sweep 10 times per setting and plot mean +/- std.

    For each digits embedding and each K, KMeans is fitted 10 times. The
    first two values returned by ``get_evals()`` are recorded as WC_SSD and
    SC. The per-run results are written to ``B3_wc_ssd_val.npy`` and
    ``B3_sc_val.npy``, read back, and plotted against K as error-bar
    curves (mean with standard deviation over the runs), one figure per
    metric.
    '''
    datasets = [
        'digits-embedding.csv',
        'digits-embedding-2467.csv',
        'digits-embedding-67.csv'
    ]
    K = [2, 4, 6, 8, 16, 32]
    n_runs = 10

    # Result arrays are indexed as [dataset, K index, run].
    runs_shape = (len(datasets), len(K), n_runs)
    ssd_runs = zeros(runs_shape)
    sc_runs = zeros(runs_shape)

    for row, path in enumerate(datasets):
        # The first two CSV columns are dropped; the rest are clustered.
        features = genfromtxt(path, delimiter=',')[:, 2:]
        for col, n_clusters in enumerate(K):
            for run in range(n_runs):
                model = KMeans(n_clusters=n_clusters)
                model.fit(features)
                ssd, sc, _ = model.get_evals()
                ssd_runs[row, col, run] = ssd
                sc_runs[row, col, run] = sc

    # Save the raw runs, then read them back from disk before plotting.
    save('B3_wc_ssd_val.npy', ssd_runs)
    save('B3_sc_val.npy', sc_runs)
    ssd_runs = load('B3_wc_ssd_val.npy')
    sc_runs = load('B3_sc_val.npy')

    # (mean over runs, std over runs, figure title) per metric, WC_SSD first.
    panels = (
        (mean(ssd_runs, axis=2), std(ssd_runs, axis=2), 'WC_SSD v.s. K'),
        (mean(sc_runs, axis=2), std(sc_runs, axis=2), 'SC v.s. K'),
    )
    for centers, spreads, heading in panels:
        figure()
        for path, center, spread in zip(datasets, centers, spreads):
            errorbar(K, center, spread, capsize=4, label=path)
        legend()
        title(heading)
    show()