def gauss_dimensions_mean(dimensions=range(2, 100, 20), total_points=200,
                          num_experiments=100, d=None):
    # data distribution
    k = 2
    delta = 0.7
    if not d:
        d = dimensions[0]
    table = np.zeros((num_experiments * len(dimensions), 6))
    count = 0
    for D in dimensions:
        for i in range(num_experiments):

            ### generate data
            m1 = np.zeros(D)
            s1 = np.eye(D)
            m2 = np.concatenate((delta * np.ones(d), np.zeros(D - d)))
            s2 = np.eye(D)
            # flip a fair coin to split the points between the two clusters
            n1, n2 = np.random.multinomial(total_points, [0.5, 0.5])
            # get data, construct Gram matrix
            X, z = data.multivariate_normal([m1, m2], [s1, s2], [n1, n2])
            rho = lambda x, y: np.linalg.norm(x - y)
            G = eclust.kernel_matrix(X, rho)

            ### cluster with different algorithms
            # can change the number of times we execute each experiment
            # and the initialization as well
            table[count, 0] = D
            table[count, 1] = run_clustering.energy_hartigan(k, X, G, z,
                                        init="k-means++", run_times=5)
            table[count, 2] = run_clustering.energy_lloyd(k, X, G, z,
                                        init="k-means++", run_times=5)
            table[count, 3] = run_clustering.spectral(k, X, G, z, run_times=5)
            table[count, 4] = run_clustering.kmeans(k, X, z,
                                        init="k-means++", run_times=5)
            table[count, 5] = run_clustering.gmm(k, X, z,
                                        init="kmeans", run_times=5)

            count += 1

    return table
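# A minimal usage sketch (not part of the original experiments): aggregate the
# table returned above into per-dimension mean accuracies. Column 0 holds the
# dimension D and columns 1-5 hold the five methods' accuracies; the other
# gauss_dimensions_* experiments below use the same layout.
def summarize_by_dimension(table):
    """Return (dimensions, mean accuracy of each method per dimension)."""
    dims = np.unique(table[:, 0])
    means = np.array([table[table[:, 0] == D, 1:].mean(axis=0) for D in dims])
    return dims, means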
def cigars_circles(num_experiments=10, run_times=5, kind='cigars'):
    """Compare several kernels on parallel cigars or concentric circles."""
    table = []
    for i in range(num_experiments):

        this_experiment = []

        if kind == 'cigars':
            m1 = [0, 0]
            m2 = [6.5, 0]
            s1 = np.array([[1, 0], [0, 20]])
            s2 = np.array([[1, 0], [0, 20]])
            X, z = data.multivariate_normal([m1, m2], [s1, s2], [200, 200])
            k = 2
            init = 'k-means++'
        elif kind == '2circles':
            X, z = data.circles([1, 3], [0.2, 0.2], [400, 400])
            k = 2
            init = 'random'
        elif kind == '3circles':
            X, z = data.circles([1, 3, 5], [0.2, 0.2, 0.2], [400, 400, 400])
            k = 3
            init = 'random'
        else:
            raise ValueError("Don't know which example to sample.")

        #sigma = 2
        sigma = 1
        G = eclust.kernel_matrix(X, rho_standard)
        G_half = eclust.kernel_matrix(X, rho_half)
        G_exp = eclust.kernel_matrix(X, lambda x, y: rho_exp(x, y, sigma))
        G_rbf = eclust.kernel_matrix(X, lambda x, y: rho_rbf(x, y, sigma))

        this_experiment.append(
            run_clustering.energy_hartigan(k, X, G, z,
                                           init=init, run_times=run_times))
        this_experiment.append(
            run_clustering.energy_hartigan(k, X, G_half, z,
                                           init=init, run_times=run_times))
        this_experiment.append(
            run_clustering.energy_hartigan(k, X, G_exp, z,
                                           init=init, run_times=run_times))
        this_experiment.append(
            run_clustering.energy_hartigan(k, X, G_rbf, z,
                                           init=init, run_times=run_times))
        #this_experiment.append(
        #    run_clustering.spectral(k, X, G_exp, z, run_times=run_times))
        this_experiment.append(
            run_clustering.spectral(k, X, G_rbf, z, run_times=run_times))
        this_experiment.append(
            run_clustering.kmeans(k, X, z, init=init, run_times=run_times))
        this_experiment.append(
            run_clustering.gmm(k, X, z, init="kmeans", run_times=run_times))
        this_experiment.append(energy.metric.accuracy(z, energy.gmm.gmm(k, X)))

        table.append(this_experiment)

    table = np.array(table)
    for i in range(8):
        print table[:, i].mean(), scipy.stats.sem(table[:, i])
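# The semimetrics rho_standard, rho_half, rho_exp and rho_rbf used above are
# defined elsewhere in the project. A plausible set of definitions, consistent
# with the kernels rho, rho2 and rho3 used in the other experiments in this
# file (an assumption, not the author's exact code):
rho_standard = lambda x, y: np.linalg.norm(x - y)
rho_half = lambda x, y: np.power(np.linalg.norm(x - y), 0.5)
rho_exp = lambda x, y, sigma: 2 - 2 * np.exp(-np.linalg.norm(x - y) / (2 * sigma))
rho_rbf = lambda x, y, sigma: 2 - 2 * np.exp(-np.linalg.norm(x - y)**2 / (2 * sigma**2))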
def gauss_dimensions_cov(dimensions=range(2, 100, 20), total_points=200,
                         num_experiments=100, d=10):
    """High dimensions but with nontrivial covariance."""
    k = 2
    if not d:
        d = dimensions[0]
    table = np.zeros((num_experiments * len(dimensions), 6))
    count = 0
    for D in dimensions:
        for i in range(num_experiments):

            # generate data
            m1 = np.zeros(D)
            m2 = np.concatenate((np.ones(d), np.zeros(D - d)))
            s1 = np.eye(D)
            # diagonal entries drawn once from Uniform(1, 5)
            s2_1 = np.array([1.367, 3.175, 3.247, 4.403, 1.249,
                             1.969, 4.035, 4.237, 2.813, 3.637])
            s2 = np.diag(np.concatenate((s2_1, np.ones(D - d))))
            n1, n2 = np.random.multinomial(total_points, [0.5, 0.5])
            X, z = data.multivariate_normal([m1, m2], [s1, s2], [n1, n2])
            rho = lambda x, y: np.linalg.norm(x - y)
            G = eclust.kernel_matrix(X, rho)

            # can change the number of times we execute each experiment
            # and the initialization method as well
            table[count, 0] = D
            table[count, 1] = run_clustering.energy_hartigan(k, X, G, z,
                                        init="k-means++", run_times=5)
            table[count, 2] = run_clustering.energy_lloyd(k, X, G, z,
                                        init="k-means++", run_times=5)
            table[count, 3] = run_clustering.spectral(k, X, G, z, run_times=5)
            table[count, 4] = run_clustering.kmeans(k, X, z,
                                        init="k-means++", run_times=5)
            table[count, 5] = run_clustering.gmm(k, X, z,
                                        init="kmeans", run_times=5)

            count += 1

    return table
def gauss_dimensions_pi(num_points=range(0, 180, 10), num_experiments=10):
    """Test unbalanced clusters."""
    k = 2
    D = 4
    d = 2
    N = 300
    table = np.zeros((num_experiments * len(num_points), 6))
    count = 0
    for p in num_points:
        for i in range(num_experiments):

            # generate data; p controls how unbalanced the two clusters are
            m1 = np.zeros(D)
            s1 = np.eye(D)
            m2 = np.concatenate((1.5 * np.ones(d), np.zeros(D - d)))
            s2 = np.diag(np.concatenate((.5 * np.ones(d), np.ones(D - d))))
            # use float-safe division so the mixture weights are correct
            # under Python 2 integer division as well
            pi1 = (N - p) / (2. * N)
            pi2 = (N + p) / (2. * N)
            n1, n2 = np.random.multinomial(N, [pi1, pi2])
            X, z = data.multivariate_normal([m1, m2], [s1, s2], [n1, n2])
            rho = lambda x, y: np.linalg.norm(x - y)
            G = eclust.kernel_matrix(X, rho)

            table[count, 0] = p
            table[count, 1] = run_clustering.energy_hartigan(k, X, G, z,
                                        init="k-means++", run_times=5)
            table[count, 2] = run_clustering.energy_lloyd(k, X, G, z,
                                        init="k-means++", run_times=5)
            table[count, 3] = run_clustering.spectral(k, X, G, z, run_times=5)
            table[count, 4] = run_clustering.kmeans(k, X, z,
                                        init="k-means++", run_times=5)
            table[count, 5] = run_clustering.gmm(k, X, z,
                                        init="kmeans", run_times=5)

            count += 1

    return table
def normal_or_lognormal(numpoints=range(10, 100, 10), num_experiments=100,
                        kind='normal'):
    """Compare energy kernels on normal versus lognormal data."""
    table = np.zeros((num_experiments * len(numpoints), 6))
    count = 0
    k = 2
    for n in numpoints:
        for i in range(num_experiments):

            # generate data
            D = 20
            d = 5
            m1 = np.zeros(D)
            s1 = 0.5 * np.eye(D)
            m2 = 0.5 * np.concatenate((np.ones(d), np.zeros(D - d)))
            s2 = np.eye(D)
            n1, n2 = np.random.multinomial(n, [0.5, 0.5])
            if kind == 'normal':
                X, z = data.multivariate_normal([m1, m2], [s1, s2], [n1, n2])
            else:
                X, z = data.multivariate_lognormal([m1, m2], [s1, s2],
                                                   [n1, n2])

            rho = lambda x, y: np.linalg.norm(x - y)
            G = eclust.kernel_matrix(X, rho)

            rho2 = lambda x, y: np.power(np.linalg.norm(x - y), 0.5)
            G2 = eclust.kernel_matrix(X, rho2)

            rho3 = lambda x, y: 2 - 2 * np.exp(-np.linalg.norm(x - y) / 2)
            G3 = eclust.kernel_matrix(X, rho3)

            table[count, 0] = n
            table[count, 1] = run_clustering.energy_hartigan(k, X, G, z,
                                        init="k-means++", run_times=5)
            table[count, 2] = run_clustering.energy_hartigan(k, X, G2, z,
                                        init="k-means++", run_times=5)
            table[count, 3] = run_clustering.energy_hartigan(k, X, G3, z,
                                        init="k-means++", run_times=5)
            table[count, 4] = run_clustering.kmeans(k, X, z,
                                        init="k-means++", run_times=5)
            table[count, 5] = run_clustering.gmm(k, X, z,
                                        init="kmeans", run_times=5)

            count += 1

    return table
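# Hedged usage sketch (an assumption, not original code): run the experiment
# above on both distributions and print each kernel's overall mean accuracy.
def compare_normal_lognormal():
    for kind in ('normal', 'lognormal'):
        t = normal_or_lognormal(kind=kind, num_experiments=10)
        # columns 1-5: energy (Euclidean, half-power, exponential), k-means, GMM
        print("%s: %s" % (kind, t[:, 1:].mean(axis=0)))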
def normal_or_lognormal(numpoints=range(10, 100, 10), num_experiments=100,
                        kind='normal'):
    """Same experiment as above but with exactly n points in each cluster."""
    table = np.zeros((num_experiments * len(numpoints), 6))
    count = 0
    k = 2
    for n in numpoints:
        for i in range(num_experiments):

            # generate data
            D = 20
            d = 5
            m1 = np.zeros(D)
            s1 = 0.5 * np.eye(D)
            m2 = 0.5 * np.concatenate((np.ones(d), np.zeros(D - d)))
            s2 = np.eye(D)
            if kind == 'normal':
                X, z = data.multivariate_normal([m1, m2], [s1, s2], [n, n])
            else:
                X, z = data.multivariate_lognormal([m1, m2], [s1, s2], [n, n])

            rho = lambda x, y: np.linalg.norm(x - y)
            G = eclust.kernel_matrix(X, rho)

            rho2 = lambda x, y: np.power(np.linalg.norm(x - y), 0.5)
            G2 = eclust.kernel_matrix(X, rho2)

            rho3 = lambda x, y: 2 - 2 * np.exp(-np.linalg.norm(x - y) / 2)
            G3 = eclust.kernel_matrix(X, rho3)

            table[count, 0] = n
            table[count, 1] = run_clustering.energy_hartigan(k, X, G, z,
                                        init="k-means++", run_times=5)
            table[count, 2] = run_clustering.energy_hartigan(k, X, G2, z,
                                        init="k-means++", run_times=5)
            table[count, 3] = run_clustering.energy_hartigan(k, X, G3, z,
                                        init="k-means++", run_times=5)
            table[count, 4] = run_clustering.kmeans(k, X, z,
                                        init="k-means++", run_times=5)
            table[count, 5] = run_clustering.gmm(k, X, z,
                                        init="kmeans", run_times=5)

            count += 1

    return table
def normal_or_lognormal_difference(numpoints=range(10, 100, 10),
                                   num_experiments=100, kind='normal'):
    """Accuracy of Hartigan's method relative to Lloyd's and spectral
    clustering, on normal or lognormal data."""
    k = 2
    table = []
    for n in numpoints:
        for i in range(num_experiments):

            this_res = [n]

            # generate data
            D = 20
            d = 5
            m1 = np.zeros(D)
            s1 = 0.5 * np.eye(D)
            m2 = 0.5 * np.concatenate((np.ones(d), np.zeros(D - d)))
            s2 = np.eye(D)
            n1, n2 = np.random.multinomial(n, [0.5, 0.5])
            if kind == 'normal':
                X, z = data.multivariate_normal([m1, m2], [s1, s2], [n1, n2])
            else:
                X, z = data.multivariate_lognormal([m1, m2], [s1, s2],
                                                   [n1, n2])

            rho = lambda x, y: 2 - 2 * np.exp(-np.linalg.norm(x - y) / 2)
            G = eclust.kernel_matrix(X, rho)

            hart = run_clustering.energy_hartigan(k, X, G, z,
                                        init="k-means++", run_times=5)
            lloyd = run_clustering.energy_lloyd(k, X, G, z,
                                        init="k-means++", run_times=5)
            spectral = run_clustering.spectral(k, X, G, z, run_times=5)

            this_res.append(hart - lloyd)
            this_res.append(hart - spectral)

            table.append(this_res)

    table = np.array(table)
    return table
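# Hedged usage sketch (an assumption, not original code): summarize the paired
# differences returned above with their standard errors, per sample size. The
# column layout [n, hart - lloyd, hart - spectral] follows the function above.
def summarize_differences(table):
    for n in np.unique(table[:, 0]):
        rows = table[table[:, 0] == n]
        print("n=%d hart-lloyd=%.3f (sem %.3f) hart-spectral=%.3f (sem %.3f)" % (
            n, rows[:, 1].mean(), scipy.stats.sem(rows[:, 1]),
            rows[:, 2].mean(), scipy.stats.sem(rows[:, 2])))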
def gauss_dimensions_mean(dimensions=range(2, 100, 20), num_points=[100, 100],
                          num_experiments=100, d=None):
    """Variant of the experiment above with fixed cluster sizes and
    energy-based spectral clustering."""
    # data distribution
    k = 2
    delta = 0.7
    if not d:
        d = dimensions[0]
    n1, n2 = num_points
    table = np.zeros((num_experiments * len(dimensions), 6))
    count = 0
    for D in dimensions:
        for i in range(num_experiments):

            # generate data
            m1 = np.zeros(D)
            s1 = np.eye(D)
            m2 = np.concatenate((delta * np.ones(d), np.zeros(D - d)))
            s2 = np.eye(D)
            X, z = data.multivariate_normal([m1, m2], [s1, s2], [n1, n2])
            rho = lambda x, y: np.linalg.norm(x - y)
            G = eclust.kernel_matrix(X, rho)

            # can change the number of times we execute each experiment
            # and the initialization method as well
            table[count, 0] = D
            table[count, 1] = run_clustering.energy_hartigan(k, X, G, z,
                                        init="k-means++", run_times=5)
            table[count, 2] = run_clustering.energy_lloyd(k, X, G, z,
                                        init="k-means++", run_times=5)
            table[count, 3] = run_clustering.energy_spectral(k, X, G, z,
                                        init="k-means++", run_times=5)
            table[count, 4] = run_clustering.kmeans(k, X, z,
                                        init="k-means++", run_times=5)
            table[count, 5] = run_clustering.gmm(k, X, z,
                                        init="kmeans", run_times=5)

            count += 1

    return table
def gauss_dimensions_pi(num_points=range(0, 180, 10), num_experiments=10):
    """Test unbalanced clusters."""
    k = 2
    D = 4
    d = 2
    N = 250
    table = np.zeros((num_experiments * len(num_points), 6))
    count = 0
    for p in num_points:
        for i in range(num_experiments):

            # generate data
            m1 = np.zeros(D)
            s1 = np.eye(D)
            m2 = np.concatenate((1.5 * np.ones(d), np.zeros(D - d)))
            s2 = np.diag(np.concatenate((.5 * np.ones(d), np.ones(D - d))))
            n1 = N - p
            n2 = N + p
            X, z = data.multivariate_normal([m1, m2], [s1, s2], [n1, n2])
            rho = lambda x, y: np.linalg.norm(x - y)
            G = eclust.kernel_matrix(X, rho)

            table[count, 0] = p
            table[count, 1] = run_clustering.energy_hartigan(k, X, G, z,
                                        init="k-means++", run_times=5)
            table[count, 2] = run_clustering.energy_lloyd(k, X, G, z,
                                        init="k-means++", run_times=5)
            table[count, 3] = run_clustering.energy_spectral(k, X, G, z,
                                        init="k-means++", run_times=5)
            table[count, 4] = run_clustering.kmeans(k, X, z,
                                        init="k-means++", run_times=5)
            table[count, 5] = run_clustering.gmm(k, X, z,
                                        init="kmeans", run_times=5)

            count += 1

    return table
#s1 = np.eye(D)
#m2 = np.concatenate((delta*np.ones(d), np.zeros(D-d)))
#s2 = np.eye(D)
#n1, n2 = np.random.multinomial(N, [0.5,0.5])
#X, z = data.multivariate_normal([m1, m2], [s1, s2], [n1, n2])

d = 10
D = 200
N = 200
m1 = np.zeros(D)
m2 = np.concatenate((np.ones(d), np.zeros(D - d)))
s1 = np.eye(D)
s2_1 = np.array([1.367, 3.175, 3.247, 4.403, 1.249,
                 1.969, 4.035, 4.237, 2.813, 3.637])
s2 = np.diag(np.concatenate((s2_1, np.ones(D - d))))
n1, n2 = np.random.multinomial(N, [0.5, 0.5])
X, z = data.multivariate_normal([m1, m2], [s1, s2], [n1, n2])

# keep the first few coordinates plus the true class label for plotting
numcols = 5
Y = np.zeros(shape=(N, numcols + 1))
Y[:, :numcols] = X[:, :numcols]
idx0 = np.where(z == 0)
idx1 = np.where(z == 1)
Y[idx0, numcols] = 0
Y[idx1, numcols] = 1

df = pd.DataFrame(Y, columns=[r"$x_%i$" % i
                              for i in range(1, numcols + 1)] + ["class"])
g = sns.PairGrid(df, hue="class",
                 #palette="hls",
                 vars=[r"$x_%i$" % i for i in range(1, numcols + 1)])

def scatter_fake_diag(x, y, *a, **kw):
    # body truncated in the source; assumed minimal body, judging from the
    # name and signature: draw a plain scatter on every panel, diagonal included
    plt.scatter(x, y, *a, **kw)
import matplotlib.pyplot as plt
#sns.set_style("ticks", {"xtick.direction":"in", "ytick.direction": "in"})
from customize_plots import *
import sys

# synthetic data for testing
m1 = np.zeros(10)
m2 = 2 * np.ones(10)
m3 = 4 * np.ones(10)
m4 = 5 * np.ones(10)
s1 = np.eye(10)
s2 = np.eye(10)
s3 = np.eye(10)
s4 = np.eye(10)
n1 = n2 = n3 = n4 = 30
X, z = data.multivariate_normal([m1, m2, m3, m4], [s1, s2, s3, s4],
                                [n1, n2, n3, n4])

# max number of clusters
K = 10

# gap statistic with k-means
k_hat, df_kmeans = gap_statistics(X, B=50, K=K, cluster_func=kmeans,
                                  type_ref="svd")
print k_hat

rho = lambda x, y: np.power(np.linalg.norm(x - y), 1)
G = eclust.kernel_matrix(X, rho)
gaps = eigenvalues(G, K)

rho2 = lambda x, y: 2 - 2 * np.exp(-np.linalg.norm(x - y)**2 / (2. * 2**2))
G2 = eclust.kernel_matrix(X, rho2)
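# Hedged sketch (an assumption, not original code): if `gaps` holds the top-K
# eigenvalues of G, a common heuristic picks the number of clusters at the
# largest drop between consecutive eigenvalues.
def largest_eigengap(eigvals):
    eigvals = np.sort(np.asarray(eigvals))[::-1]   # decreasing order
    return int(np.argmax(-np.diff(eigvals))) + 1   # count before largest drop

print(largest_eigengap(gaps))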