def fn(pipe):
    """Append the clustering / classification / evaluation stages to ``pipe``.

    The stages are chained in this order: ``split(5)``, ``kmeans(clusters)``,
    ``seeding_centroids(0.1)`` and ``label_consensus()`` (both through ``.y``),
    ``knn(neighbors)``, ``predict()``, ``evaluate()`` and finally a ``merge``
    of the ``'evaluation'`` key with ``average('evaluation')``.

    ``clusters`` and ``neighbors`` are not parameters; they are looked up in
    the enclosing scope when the function is called.

    Returns the object produced by the final ``.merge`` call.
    """
    return (
        pipe
        .split(5)
        .pipe(kmeans(clusters))
        .y(seeding_centroids(0.1))
        .y(label_consensus())
        .pipe(knn(neighbors))
        .pipe(predict())
        .pipe(evaluate())
        .merge('evaluation', average('evaluation'))
    )
# sigma2 is the mean of the squared Euclidean distance over every ordered
# pair (x, y) of elements of data, including each element paired with itself
# (hence the division by len(data)**2).
sigma2 = sum([np.linalg.norm(x-y)**2 for x in data for y in data])/(len(data)**2)
sigma = np.sqrt(sigma2)

# Two alternative functions of the pairwise distance, both scaled by sigma:
# rho_exp uses the distance itself, rho_gauss uses the squared distance.
rho_exp = lambda x, y: 2-2*np.exp(-np.linalg.norm(x-y)/(2*sigma))
rho_gauss = lambda x, y: 2-2*np.exp(-np.linalg.norm(x-y)**2/(2*(sigma)**2))

# NOTE(review): `rho` is not defined in this section; presumably it is set
# earlier in the file -- confirm. The commented lines switch to the
# sigma-scaled alternatives defined above.
G = eclust.kernel_matrix(data, rho)
#G = eclust.kernel_matrix(data, rho_gauss)
#G = eclust.kernel_matrix(data, rho_exp)

# k is passed as the first argument to every algorithm below.
k = 3

# r collects one result per algorithm, in the same order as `algos` below;
# the two lists are zipped together, so keep them aligned when toggling the
# commented-out variants.
r = []
r.append(wrapper.kmeans(k, data, run_times=5))
r.append(wrapper.gmm(k, data, run_times=5))
r.append(wrapper.spectral_clustering(k, data, G, run_times=5))
r.append(wrapper.spectral(k, data, G, run_times=5))
r.append(wrapper.kernel_kmeans(k, data, G, run_times=5, ini='random'))
#r.append(wrapper.kernel_kmeans(k, data, G, run_times=5, ini='k-means++'))
#r.append(wrapper.kernel_kmeans(k, data, G, run_times=5, ini='spectral'))
r.append(wrapper.kernel_kgroups(k,data,G,run_times=5, ini='random'))
#r.append(wrapper.kernel_kgroups(k,data,G,run_times=5, ini='k-means++'))
#r.append(wrapper.kernel_kgroups(k,data,G,run_times=5, ini='spectral'))

# Results table: one row per algorithm (the loop body that fills it lies
# beyond this section).
t = PrettyTable(['Algorithm', 'Accuracy', 'A-Rand'])
algos = ['kmeans', 'GMM', 'spectral clustering', 'spectral', 'kernel k-means', 'kernel k-groups']
for algo, zh in zip(algos, r):
# NOTE(review): the statements up to `return X, z` are the tail of a function
# whose header is outside this section (presumably `generate_data`, which is
# called in the loop below -- confirm). `D`, `d` and `total_points` are
# defined outside this section.
    # First component: zero vector of length D paired with a D x D identity.
    m1 = np.zeros(D)
    s1 = np.eye(D)
    # Second component: 0.7 in the first d coordinates, 0 in the remaining
    # D - d, again paired with a D x D identity.
    m2 = np.concatenate((0.7 * np.ones(d), np.zeros(D - d)))
    s2 = np.eye(D)
    # Split total_points between the two components with equal probability.
    n1, n2 = np.random.multinomial(total_points, [0.5, 0.5])
    # presumably [m1, m2] are means, [s1, s2] covariances and [n1, n2] sample
    # counts -- verify against data.multivariate_normal.
    X, z = data.multivariate_normal([m1, m2], [s1, s2], [n1, n2])
    return X, z


# r accumulates rows of the form [method name, dim, accuracy].
r = []
for _ in range(num_experiments):
    for dim in dimensions:
        # Fresh sample for every (repetition, dim) pair.
        X, z = generate_data(dim)
        # Matrix built from the plain Euclidean distance; only the spectral
        # and kernel methods below receive it.
        G = eclust.kernel_matrix(X, lambda x, y: np.linalg.norm(x - y))

        zh = wrapper.kmeans(k, X)
        a = metric.accuracy(z, zh)
        r.append(['k-means', dim, a])

        zh = wrapper.gmm(k, X)
        a = metric.accuracy(z, zh)
        r.append(['gmm', dim, a])

        zh = wrapper.spectral_clustering(k, X, G)
        a = metric.accuracy(z, zh)
        r.append(['spectral clustering', dim, a])

        zh = wrapper.kernel_kmeans(k, X, G)
        a = metric.accuracy(z, zh)
        r.append(['kernel k-means', dim, a])
# NOTE(review): the statements up to `return X, z` are the tail of a function
# whose header is outside this section (presumably `generate_data`, which is
# called in the loop below -- confirm). `D` and `total_points` are defined
# outside this section.
    # Number of leading coordinates in which the second component is shifted.
    d = 10
    # First component: zero vector of length D paired with a D x D identity.
    m1 = np.zeros(D)
    s1 = np.eye(D)
    # Second component: 0.7 in the first d coordinates, 0 in the remaining
    # D - d, again paired with a D x D identity.
    m2 = np.concatenate((0.7*np.ones(d), np.zeros(D-d)))
    s2 = np.eye(D)
    # Split total_points between the two components with equal probability.
    n1, n2 = np.random.multinomial(total_points, [0.5, 0.5])
    # presumably [m1, m2] are means, [s1, s2] covariances and [n1, n2] sample
    # counts -- verify against data.multivariate_normal.
    X, z = data.multivariate_normal([m1, m2], [s1, s2], [n1, n2])
    return X, z


# r accumulates rows of the form [method name, dim, accuracy].
r = []
for _ in range(num_experiments):
    for dim in dimensions:
        # Fresh sample for every (repetition, dim) pair.
        X, z = generate_data(dim)
        # Matrix built from the plain Euclidean distance; only the spectral
        # and kernel methods below receive it.
        G = eclust.kernel_matrix(X, lambda x, y: np.linalg.norm(x-y))

        zh = wrapper.kmeans(k, X)
        a = metric.accuracy(z, zh)
        r.append(['k-means', dim, a])

        zh = wrapper.gmm(k, X)
        a = metric.accuracy(z, zh)
        r.append(['gmm', dim, a])

        zh = wrapper.spectral_clustering(k, X, G)
        a = metric.accuracy(z, zh)
        r.append(['spectral clustering', dim, a])

        zh = wrapper.kernel_kmeans(k, X, G)
        a = metric.accuracy(z, zh)
        r.append(['kernel k-means', dim, a])
# Parameters of the second component; m1, s1, n1, n2 and k are defined
# before this section.
m2 = 1.5
s2 = 0.3
X, z = data.univariate_normal([m1, m2], [s1, s2], [n1, n2])
# Alternative data source (goes with the alternative low/high range below):
#X, z = data.univariate_lognormal([m1, m2], [s1, s2], [n1, n2])

# Wrap every sample in its own one-element row so the clustering routines
# receive a two-dimensional array.
Y = np.array([[x] for x in X])

bw = 0.5  # bandwidth
num_points = 1500  # number points for linspace
low = -6
high = 6
#low = -2
#high = 20

### clustering
t = PrettyTable(['Method', 'Accuracy'])
# Matrix built from the plain Euclidean distance; only kernel k-groups
# receives it.
G = eclust.kernel_matrix(Y, lambda x, y: np.linalg.norm(x - y))

zh_kmeans = wrapper.kmeans(k, Y)
t.add_row(['k-means', metric.accuracy(z, zh_kmeans)])

zh_gmm = wrapper.gmm(k, Y)
t.add_row(['gmm', metric.accuracy(z, zh_gmm)])

zh_kgroups = wrapper.kernel_kgroups(k, Y, G)
t.add_row(['kernel k-groups', metric.accuracy(z, zh_kgroups)])

# Call form instead of the Python 2 print statement: with a single argument
# the output is the same on Python 2, and it is valid syntax on Python 3.
print(t)

# Evaluation grid as a column vector of num_points values in [low, high].
X_plot = np.linspace(low, high, num_points)[:, np.newaxis]

### kernel density estimation
# Samples of each true class (labels 0 and 1), reshaped to column vectors.
x1_true = X[np.where(z == 0)][:, np.newaxis]
x2_true = X[np.where(z == 1)][:, np.newaxis]

fig = plt.figure()
ax = fig.add_subplot(111)
# delete missing entries delete_missing = np.where(data=='?')[0] data = np.delete(data, delete_missing, axis=0) data = np.array(data, dtype=float) z = np.delete(z, delete_missing, axis=0) # normalize data data = (data - data.mean(axis=0))/data.std(axis=0) G = eclust.kernel_matrix(data, rho) #G = energy.eclust.kernel_matrix(data, rho_gauss) #G = energy.eclust.kernel_matrix(data, rho_exp) r = [] r.append(wrapper.kmeans(6, data, run_times=10)) r.append(wrapper.gmm(6, data, run_times=10)) r.append(wrapper.spectral_clustering(6, data, G, run_times=10)) r.append(wrapper.spectral(6, data, G, run_times=10)) #r.append(wrapper.kernel_kmeans(6, data, G, run_times=10, ini='random')) r.append(wrapper.kernel_kmeans(6, data, G, run_times=10, ini='k-means++')) #r.append(wrapper.kernel_kmeans(6, data, G, run_times=10, ini='spectral')) #r.append(wrapper.kernel_kgroups(6,data,G,run_times=10, ini='random')) r.append(wrapper.kernel_kgroups(6,data,G,run_times=10, ini='k-means++')) #r.append(wrapper.kernel_kgroups(6,data,G,run_times=10, ini='spectral')) t = PrettyTable(['Algorithm', 'Accuracy', 'A-Rand']) algos = ['kmeans', 'GMM', 'spectral clustering', 'spectral',
### generate data
k = 2
n = 2000
# Split the n samples between the two components with equal probability.
n1, n2 = np.random.multinomial(n, [0.5, 0.5])
m1 = 0
s1 = 1.5
m2 = 1.5
s2 = 0.3
# Alternative data source:
#X, z = data.univariate_normal([m1, m2], [s1, s2], [n1, n2])
X, z = data.univariate_lognormal([m1, m2], [s1, s2], [n1, n2])

# Wrap every sample in its own one-element row so the clustering routines
# receive a two-dimensional array.
Y = np.array([[x] for x in X])

### clustering
t = PrettyTable(['Method', 'Accuracy'])
# Matrix built from the plain Euclidean distance; only kernel k-groups
# receives it.
G = eclust.kernel_matrix(Y, lambda x, y: np.linalg.norm(x - y))

zh_kmeans = wrapper.kmeans(k, Y)
t.add_row(['k-means', metric.accuracy(z, zh_kmeans)])

zh_gmm = wrapper.gmm(k, Y)
t.add_row(['gmm', metric.accuracy(z, zh_gmm)])

zh_kgroups = wrapper.kernel_kgroups(k, Y, G)
t.add_row(['kernel k-groups', metric.accuracy(z, zh_kgroups)])

# Call form instead of the Python 2 print statement: with a single argument
# the output is the same on Python 2, and it is valid syntax on Python 3.
print(t)

### estimated classes
# Samples split by true label (z) and by each method's predicted label.
x1_true = X[np.where(z == 0)]
x2_true = X[np.where(z == 1)]
x1_kmeans = X[np.where(zh_kmeans == 0)]
x2_kmeans = X[np.where(zh_kmeans == 1)]
x1_gmm = X[np.where(zh_gmm == 0)]
from pipe import Pipe
from wrapper import kmeans
from pipetools import *
from dataset import *

# Load the dataset object; its X and cluster_cnt attributes are used below.
# (get_pendigits and stop presumably come from the star imports above.)
dataset = get_pendigits()

# Feed dataset.X into a new Pipe, add a kmeans stage configured with
# dataset.cluster_cnt, then connect the stop() stage.
a = (
    Pipe()
    .x(dataset.X)
    .pipe(kmeans(dataset.cluster_cnt))
    .connect(stop())
)

print(a)
# delete missing entries delete_missing = np.where(data == '?')[0] data = np.delete(data, delete_missing, axis=0) data = np.array(data, dtype=float) z = np.delete(z, delete_missing, axis=0) # normalize data data = (data - data.mean(axis=0)) / data.std(axis=0) G = eclust.kernel_matrix(data, rho) #G = energy.eclust.kernel_matrix(data, rho_gauss) #G = energy.eclust.kernel_matrix(data, rho_exp) r = [] r.append(wrapper.kmeans(6, data, run_times=10)) r.append(wrapper.gmm(6, data, run_times=10)) r.append(wrapper.spectral_clustering(6, data, G, run_times=10)) r.append(wrapper.spectral(6, data, G, run_times=10)) #r.append(wrapper.kernel_kmeans(6, data, G, run_times=10, ini='random')) r.append(wrapper.kernel_kmeans(6, data, G, run_times=10, ini='k-means++')) #r.append(wrapper.kernel_kmeans(6, data, G, run_times=10, ini='spectral')) #r.append(wrapper.kernel_kgroups(6,data,G,run_times=10, ini='random')) r.append(wrapper.kernel_kgroups(6, data, G, run_times=10, ini='k-means++')) #r.append(wrapper.kernel_kgroups(6,data,G,run_times=10, ini='spectral')) t = PrettyTable(['Algorithm', 'Accuracy', 'A-Rand']) algos = [