def cigars_circles(num_experiments=10, run_times=5, kind='cigars'):
    table = []
    for i in range(num_experiments):
        this_experiment = []
        
        if kind == 'cigars':
            m1 = [0,0]
            m2 = [6.5,0]
            s1 = np.array([[1,0],[0,20]])
            s2 = np.array([[1,0],[0,20]])
            X, z = data.multivariate_normal([m1, m2], [s1, s2], [200, 200])
            k = 2
            init = 'k-means++'
        elif kind == '2circles':
            X, z = data.circles([1, 3], [0.2, 0.2], [400, 400])
            k = 2
            init = 'random'
        elif kind == '3circles':
            X, z = data.circles([1, 3, 5], [0.2, 0.2, 0.2], [400, 400, 400])
            init = 'random'
            k = 3
        else:
            raise ValueError("Don't know which example to sample.")

        # bandwidth for the exponential and RBF semimetrics below
        #sigma = 2
        sigma = 1
        G = eclust.kernel_matrix(X, rho_standard)
        G_half = eclust.kernel_matrix(X, rho_half)
        G_exp = eclust.kernel_matrix(X, lambda x,y: rho_exp(x, y, sigma))
        G_rbf = eclust.kernel_matrix(X, lambda x,y: rho_rbf(x, y, sigma))

        this_experiment.append(
            run_clustering.energy_hartigan(k,X,G,z,init=init,
                                           run_times=run_times))
        this_experiment.append(
            run_clustering.energy_hartigan(k,X,G_half,z,
                                           init=init,run_times=run_times))
        this_experiment.append(
            run_clustering.energy_hartigan(k,X,G_exp,z,
                                           init=init,run_times=run_times))
        this_experiment.append(
            run_clustering.energy_hartigan(k,X,G_rbf,z,
                                           init=init,run_times=run_times))
        #this_experiment.append(
        #    run_clustering.spectral(k,X,G_exp,z,run_times=run_times))
        
        this_experiment.append(
            run_clustering.spectral(k,X,G_rbf,z,run_times=run_times))
        
        this_experiment.append(
            run_clustering.kmeans(k,X,z,init=init,run_times=run_times))
        this_experiment.append(
            run_clustering.gmm(k,X,z,init="kmeans",run_times=run_times))
        this_experiment.append(energy.metric.accuracy(z, energy.gmm.gmm(k,X)))
        
        table.append(this_experiment)
    
    table = np.array(table)
    for i in range(table.shape[1]):
        print table[:,i].mean(), scipy.stats.sem(table[:,i])
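
# The semimetrics rho_standard, rho_half, rho_exp, and rho_rbf used above are
# defined elsewhere in the original module.  A minimal sketch of plausible
# definitions, assuming the same forms as the rho/rho2/rho3 kernels that
# appear in the experiments further down:
import numpy as np

def rho_standard(x, y):
    # plain Euclidean distance
    return np.linalg.norm(x - y)

def rho_half(x, y):
    # Euclidean distance raised to the power 1/2
    return np.power(np.linalg.norm(x - y), 0.5)

def rho_exp(x, y, sigma):
    # exponential semimetric with bandwidth sigma
    return 2 - 2 * np.exp(-np.linalg.norm(x - y) / (2 * sigma))

def rho_rbf(x, y, sigma):
    # Gaussian (RBF) semimetric with bandwidth sigma
    return 2 - 2 * np.exp(-np.linalg.norm(x - y) ** 2 / (2 * sigma ** 2))
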
def mnist(num_experiments=10, digits=[0,1,2], num_points=100, run_times=5):
    
    k = len(digits)
    f = gzip.open('experiments_data/mnist.pkl.gz', 'rb')
    train_set, valid_set, test_set = cPickle.load(f)
    f.close()
    images_test, labels_test = train_set
    images, labels = valid_set

    # use the training set to estimate the kernel bandwidth sigma
    X_train, z_train = sample_digits(digits, images_test, labels_test, 
                                     num_points)
    n, _ = X_train.shape
    sigma = np.sqrt(sum([np.linalg.norm(X_train[i]-X_train[j])**2 
                         for i in range(n) for j in range(n)])/(n**2))
    print sigma
    print
    
    table = []
    init = 'k-means++'
    for i in range(num_experiments):
        this_experiment = []
        
        # now cluster on validation set
        X, z = sample_digits(digits, images, labels, num_points)
        
        G = eclust.kernel_matrix(X, rho_standard)
        G_half = eclust.kernel_matrix(X, rho_half)
        G_exp = eclust.kernel_matrix(X, lambda x,y: rho_exp(x, y, sigma))
        G_rbf = eclust.kernel_matrix(X, lambda x,y: rho_rbf(x, y, sigma))
        
        this_experiment.append(
            run_clustering.energy_hartigan(k,X,G,z,init=init,
                                           run_times=run_times))
        this_experiment.append(
            run_clustering.energy_hartigan(k,X,G_half,z,
                                           init=init,run_times=run_times))
        this_experiment.append(
            run_clustering.energy_hartigan(k,X,G_exp,z,
                                           init=init,run_times=run_times))
        this_experiment.append(
            run_clustering.energy_hartigan(k,X,G_rbf,z,
                                           init=init,run_times=run_times))
        this_experiment.append(
            run_clustering.spectral(k,X,G_rbf,z,
                                        run_times=run_times))
        
        this_experiment.append(
            run_clustering.kmeans(k,X,z,init="k-means++",run_times=run_times))
        this_experiment.append(
            run_clustering.gmm(k,X,z,init="kmeans",run_times=run_times))
        # my GMM implementation was breaking for an unknown reason
        #this_experiment.append(energy.metric.accuracy(z, energy.gmm.gmm(k,X)))

        table.append(this_experiment)

    table = np.array(table)
    for i in range(table.shape[1]):
        print table[:,i].mean(), scipy.stats.sem(table[:,i])
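
# The bandwidth heuristic above sums all pairwise squared distances with a
# double Python loop; a vectorized sketch of the same estimate, assuming
# scipy is available (pdist returns each unordered pair once, so the full
# double sum is twice the sum over pairs):
import numpy as np
from scipy.spatial.distance import pdist

def rbf_bandwidth(X):
    n = X.shape[0]
    sq = pdist(X, metric='sqeuclidean')      # ||x_i - x_j||^2 for i < j
    return np.sqrt(2.0 * sq.sum() / n ** 2)  # matches the double-loop value
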
def gauss_dimensions_mean(dimensions=range(2, 100, 20),
                          total_points=200,
                          num_experiments=100,
                          d=None):
    # data distribution
    k = 2
    delta = 0.7
    if not d:
        d = dimensions[0]
    table = np.zeros((num_experiments * len(dimensions), 6))
    count = 0

    for D in dimensions:
        for i in range(num_experiments):

            ### generate data
            m1 = np.zeros(D)
            s1 = np.eye(D)
            m2 = np.concatenate((delta * np.ones(d), np.zeros(D - d)))
            s2 = np.eye(D)
            # flip Bernoulli coins to get number of points in each cluster
            n1, n2 = np.random.multinomial(total_points, [0.5, 0.5])
            # get data, construct gram Matrix
            X, z = data.multivariate_normal([m1, m2], [s1, s2], [n1, n2])
            rho = lambda x, y: np.linalg.norm(x - y)
            G = eclust.kernel_matrix(X, rho)
            ##################

            ### cluster with different algorithms
            # can change number of times we execute each experiment
            # and initialization as well
            table[count, 0] = D
            table[count, 1] = run_clustering.energy_hartigan(k,
                                                             X,
                                                             G,
                                                             z,
                                                             init="k-means++",
                                                             run_times=5)
            table[count, 2] = run_clustering.energy_lloyd(k,
                                                          X,
                                                          G,
                                                          z,
                                                          init="k-means++",
                                                          run_times=5)
            table[count, 3] = run_clustering.spectral(k, X, G, z, run_times=5)
            table[count, 4] = run_clustering.kmeans(k,
                                                    X,
                                                    z,
                                                    init="k-means++",
                                                    run_times=5)
            table[count, 5] = run_clustering.gmm(k,
                                                 X,
                                                 z,
                                                 init="kmeans",
                                                 run_times=5)
            ####################

            count += 1

    return table
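
# Hypothetical helper (not part of the original code): summarize the table
# returned above as mean accuracy and standard error per dimension, in the
# same spirit as the print loops used in the earlier experiments.
import numpy as np
import scipy.stats

def summarize_by_dimension(table):
    summary = {}
    for D in np.unique(table[:, 0]):
        rows = table[table[:, 0] == D, 1:]
        summary[D] = (rows.mean(axis=0), scipy.stats.sem(rows, axis=0))
    return summary
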
Example #4
 def run(self):
     k = 2
     ncols = 3 + 1  # number of clustering methods + 1
     table = np.zeros((self.experiments * len(self.numpoints), ncols))
     count = 0
     init = 'k-means++'
     for i, n in enumerate(self.numpoints):
         for ne in range(self.experiments):
             Y, z = self.get_sample(n)
             G = eclust.kernel_matrix(Y, lambda x, y: np.linalg.norm(x - y))
             table[count, 0] = n
             table[count, 1] = run_clustering.energy_hartigan(k,
                                                              Y,
                                                              G,
                                                              z,
                                                              init=init,
                                                              run_times=5)
             table[count, 2] = run_clustering.kmeans(k,
                                                     Y,
                                                     z,
                                                     init=init,
                                                     run_times=5)
             table[count, 3] = run_clustering.gmm(k,
                                                  Y,
                                                  z,
                                                  init='kmeans',
                                                  run_times=5)
             count += 1
     return table
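
# The run() method above is shown without its enclosing class.  A minimal
# sketch of a host class, assuming only the attributes the method actually
# uses (numpoints, experiments) and a get_sample(n) hook; the class name and
# constructor are illustrative, not from the original code.
class TwoClusterExperiment(object):

    def __init__(self, numpoints, experiments):
        self.numpoints = numpoints      # list of sample sizes to sweep
        self.experiments = experiments  # repetitions per sample size

    def get_sample(self, n):
        # subclasses return (Y, z): an n-point data matrix and true labels
        raise NotImplementedError
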
def normal_or_lognormal(numpoints=range(10,100,10), num_experiments=100,
                        kind='normal'):
    table = np.zeros((num_experiments*len(numpoints), 6))
    count = 0
    k = 2

    for n in numpoints:
        for i in range(num_experiments):

            # generate data
            D = 20
            d = 5
            m1 = np.zeros(D)
            s1 = 0.5*np.eye(D)
            m2 = 0.5*np.concatenate((np.ones(d), np.zeros(D-d)))
            s2 = np.eye(D)
            n1, n2 = np.random.multinomial(n, [0.5,0.5])

            if kind == 'normal':
                X, z = data.multivariate_normal([m1,m2], [s1,s2], [n1,n2])
            else:
                X, z = data.multivariate_lognormal([m1,m2], [s1,s2], [n1,n2])

            rho = lambda x, y: np.linalg.norm(x-y)
            G = eclust.kernel_matrix(X, rho)

            rho2 = lambda x, y: np.power(np.linalg.norm(x-y), 0.5)
            G2 = eclust.kernel_matrix(X, rho2)

            rho3 = lambda x, y: 2-2*np.exp(-np.linalg.norm(x-y)/2)
            G3 = eclust.kernel_matrix(X, rho3)

            table[count, 0] = n
            table[count, 1] = run_clustering.energy_hartigan(k, X, G, z, 
                                                init="k-means++", run_times=5)
            table[count, 2] = run_clustering.energy_hartigan(k, X, G2, z, 
                                                init="k-means++", run_times=5)
            table[count, 3] = run_clustering.energy_hartigan(k, X, G3, z, 
                                                init="k-means++", run_times=5)
            table[count, 4] = run_clustering.kmeans(k, X, z, 
                                                init="k-means++", run_times=5)
            table[count, 5] = run_clustering.gmm(k, X, z, 
                                                init="kmeans", run_times=5)
            
            count += 1

    return table
Example #6
def normal_or_lognormal(numpoints=range(10,100,10), num_experiments=100,
                        kind='normal'):
    table = np.zeros((num_experiments*len(numpoints), 6))
    count = 0
    k = 2

    for n in numpoints:
        for i in range(num_experiments):

            # generate data
            D = 20
            d = 5
            m1 = np.zeros(D)
            s1 = 0.5*np.eye(D)
            m2 = 0.5*np.concatenate((np.ones(d), np.zeros(D-d)))
            s2 = np.eye(D)

            if kind == 'normal':
                X, z = data.multivariate_normal([m1, m2], [s1, s2], [n, n])
            else:
                X, z = data.multivariate_lognormal([m1, m2], [s1, s2], [n, n])

            rho = lambda x, y: np.linalg.norm(x-y)
            G = eclust.kernel_matrix(X, rho)

            rho2 = lambda x, y: np.power(np.linalg.norm(x-y), 0.5)
            G2 = eclust.kernel_matrix(X, rho2)

            rho3 = lambda x, y: 2-2*np.exp(-np.linalg.norm(x-y)/2)
            G3 = eclust.kernel_matrix(X, rho3)

            table[count, 0] = n
            table[count, 1] = run_clustering.energy_hartigan(k, X, G, z, 
                                                init="k-means++", run_times=5)
            table[count, 2] = run_clustering.energy_hartigan(k, X, G2, z, 
                                                init="k-means++", run_times=5)
            table[count, 3] = run_clustering.energy_hartigan(k, X, G3, z, 
                                                init="k-means++", run_times=5)
            table[count, 4] = run_clustering.kmeans(k, X, z, 
                                                init="k-means++", run_times=5)
            table[count, 5] = run_clustering.gmm(k, X, z, 
                                                init="kmeans", run_times=5)
            
            count += 1

    return table
def gauss_dimensions_cov(
        dimensions=range(2, 100, 20), total_points=200, num_experiments=100,
        d=10):
    """High dimensions but with nontrivial covariance."""
    k = 2
    if not d:
        d = dimensions[0]
    table = np.zeros((num_experiments * len(dimensions), 6))
    count = 0

    for D in dimensions:
        for l in range(num_experiments):

            # generate data
            m1 = np.zeros(D)
            m2 = np.concatenate((np.ones(d), np.zeros(D - d)))
            s1 = np.eye(D)
            # diagonal entries of s2 drawn from Uniform(1, 5)
            s2_1 = np.array([
                1.367, 3.175, 3.247, 4.403, 1.249, 1.969, 4.035, 4.237, 2.813,
                3.637
            ])
            s2 = np.diag(np.concatenate((s2_1, np.ones(D - d))))
            n1, n2 = np.random.multinomial(total_points, [0.5, 0.5])
            X, z = data.multivariate_normal([m1, m2], [s1, s2], [n1, n2])

            rho = lambda x, y: np.linalg.norm(x - y)
            G = eclust.kernel_matrix(X, rho)

            # can change the number of times we execute each experiment
            # and initialization method as well
            table[count, 0] = D
            table[count, 1] = run_clustering.energy_hartigan(k,
                                                             X,
                                                             G,
                                                             z,
                                                             init="k-means++",
                                                             run_times=5)
            table[count, 2] = run_clustering.energy_lloyd(k,
                                                          X,
                                                          G,
                                                          z,
                                                          init="k-means++",
                                                          run_times=5)
            table[count, 3] = run_clustering.spectral(k, X, G, z, run_times=5)
            table[count, 4] = run_clustering.kmeans(k,
                                                    X,
                                                    z,
                                                    init="k-means++",
                                                    run_times=5)
            table[count, 5] = run_clustering.gmm(k,
                                                 X,
                                                 z,
                                                 init="kmeans",
                                                 run_times=5)
            count += 1

    return table
def gauss_dimensions_pi(num_points=range(0, 180, 10), num_experiments=10):
    """Test unbalanced clusters."""
    k = 2
    D = 4
    d = 2
    N = 300
    table = np.zeros((num_experiments * len(num_points), 6))
    count = 0

    for p in num_points:
        for i in range(num_experiments):

            # generate data
            m1 = np.zeros(D)
            s1 = np.eye(D)
            m2 = np.concatenate((1.5 * np.ones(d), np.zeros(D - d)))
            s2 = np.diag(np.concatenate((.5 * np.ones(d), np.ones(D - d))))
            pi1 = (N - p) / (2.0 * N)
            pi2 = (N + p) / (2.0 * N)
            n1, n2 = np.random.multinomial(N, [pi1, pi2])
            X, z = data.multivariate_normal([m1, m2], [s1, s2], [n1, n2])
            rho = lambda x, y: np.linalg.norm(x - y)
            G = eclust.kernel_matrix(X, rho)

            table[count, 0] = p
            table[count, 1] = run_clustering.energy_hartigan(k,
                                                             X,
                                                             G,
                                                             z,
                                                             init="k-means++",
                                                             run_times=5)
            table[count, 2] = run_clustering.energy_lloyd(k,
                                                          X,
                                                          G,
                                                          z,
                                                          init="k-means++",
                                                          run_times=5)
            table[count, 3] = run_clustering.spectral(k, X, G, z, run_times=5)
            table[count, 4] = run_clustering.kmeans(k,
                                                    X,
                                                    z,
                                                    init="k-means++",
                                                    run_times=5)
            table[count, 5] = run_clustering.gmm(k,
                                                 X,
                                                 z,
                                                 init="kmeans",
                                                 run_times=5)

            count += 1

    return table
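
# Quick check of the mixing proportions swept above (N = 300): p = 0 gives a
# balanced mixture, and larger p shifts mass toward the second cluster.
N = 300
for p in (0, 90, 150):
    pi1 = (N - p) / (2.0 * N)   # 0.50, 0.35, 0.25
    pi2 = (N + p) / (2.0 * N)   # 0.50, 0.65, 0.75
    assert abs(pi1 + pi2 - 1.0) < 1e-12
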
def gauss_dimensions_mean(dimensions=range(2,100,20), total_points=200,
                          num_experiments=100, d=None):
    # data distribution
    k = 2
    delta = 0.7
    if not d:
        d = dimensions[0]
    table = np.zeros((num_experiments*len(dimensions), 6))
    count = 0

    for D in dimensions:
        for i in range(num_experiments):

            ### generate data
            m1 = np.zeros(D)
            s1 = np.eye(D)
            m2 = np.concatenate((delta*np.ones(d), np.zeros(D-d)))
            s2 = np.eye(D)
            # flip Bernoulli coins to get number of points in each cluster
            n1, n2 = np.random.multinomial(total_points, [0.5,0.5])
            # get data, construct gram Matrix
            X, z = data.multivariate_normal([m1, m2], [s1, s2], [n1, n2])
            rho = lambda x, y: np.linalg.norm(x-y)
            G = eclust.kernel_matrix(X, rho)
            ##################

            ### cluster with different algorithms
            # can change number of times we execute each experiment
            # and initialization as well
            table[count, 0] = D
            table[count, 1] = run_clustering.energy_hartigan(k, X, G, z, 
                                                init="k-means++", run_times=5)
            table[count, 2] = run_clustering.energy_lloyd(k, X, G, z, 
                                                init="k-means++", run_times=5)
            table[count, 3] = run_clustering.spectral(k, X, G, z, 
                                                run_times=5)
            table[count, 4] = run_clustering.kmeans(k, X, z, 
                                                init="k-means++", run_times=5)
            table[count, 5] = run_clustering.gmm(k, X, z, 
                                                init="kmeans", run_times=5)
            ####################
            
            count += 1

    return table
Example #10
def normal_or_lognormal_difference(
        numpoints=range(10, 100, 10), num_experiments=100, kind='normal'):
    k = 2
    table = []
    for n in numpoints:
        for i in range(num_experiments):
            this_res = [n]

            # generate data
            D = 20
            d = 5
            m1 = np.zeros(D)
            s1 = 0.5 * np.eye(D)
            m2 = 0.5 * np.concatenate((np.ones(d), np.zeros(D - d)))
            s2 = np.eye(D)
            n1, n2 = np.random.multinomial(n, [0.5, 0.5])

            if kind == 'normal':
                X, z = data.multivariate_normal([m1, m2], [s1, s2], [n1, n2])
            else:
                X, z = data.multivariate_lognormal([m1, m2], [s1, s2],
                                                   [n1, n2])

            rho = lambda x, y: 2 - 2 * np.exp(-np.linalg.norm(x - y) / 2)
            G = eclust.kernel_matrix(X, rho)

            hart = run_clustering.energy_hartigan(k,
                                                  X,
                                                  G,
                                                  z,
                                                  init="k-means++",
                                                  run_times=5)
            lloyd = run_clustering.energy_lloyd(k,
                                                X,
                                                G,
                                                z,
                                                init="k-means++",
                                                run_times=5)
            spectral = run_clustering.spectral(k, X, G, z, run_times=5)
            this_res.append(hart - lloyd)
            this_res.append(hart - spectral)

            table.append(this_res)
    table = np.array(table)
    return table
 def run(self):
     k = 2
     ncols = 3 + 1 # number of clustering methods + 1
     table = np.zeros((self.experiments*len(self.numpoints), ncols))
     count = 0
     init = 'k-means++'
     for i, n in enumerate(self.numpoints):
         for ne in range(self.experiments):
             Y, z = self.get_sample(n)
             G = eclust.kernel_matrix(Y, lambda x, y: np.linalg.norm(x-y))
             table[count, 0] = n
             table[count, 1] = run_clustering.energy_hartigan(k, Y, G, z,
                                                 init=init, run_times=5)
             table[count, 2] = run_clustering.kmeans(k, Y, z, init=init,
                                                 run_times=5)
             table[count, 3] = run_clustering.gmm(k, Y, z, init='kmeans',
                                                 run_times=5)
             count += 1
     return table
def gauss_dimensions_cov(dimensions=range(2,100,20), total_points=200,
                         num_experiments=100, d=10):
    """High dimensions but with nontrivial covariance."""
    k = 2
    if not d:
        d = dimensions[0]
    table = np.zeros((num_experiments*len(dimensions), 6))
    count = 0

    for D in dimensions:
        for l in range(num_experiments):

            # generate data
            m1 = np.zeros(D)
            m2 = np.concatenate((np.ones(d), np.zeros(D-d)))
            s1 = np.eye(D)
            # diagonal entries of s2 drawn from Uniform(1, 5)
            s2_1 = np.array([1.367, 3.175, 3.247, 4.403, 1.249,
                             1.969, 4.035, 4.237, 2.813, 3.637])
            s2 = np.diag(np.concatenate((s2_1, np.ones(D-d))))
            n1, n2 = np.random.multinomial(total_points, [0.5,0.5])
            X, z = data.multivariate_normal([m1, m2], [s1, s2], [n1, n2])

            rho = lambda x, y: np.linalg.norm(x-y)
            G = eclust.kernel_matrix(X, rho)
            
            # can change the number of times we execute each experiment
            # and initialization method as well
            table[count, 0] = D
            table[count, 1] = run_clustering.energy_hartigan(k, X, G, z, 
                                                init="k-means++", run_times=5)
            table[count, 2] = run_clustering.energy_lloyd(k, X, G, z, 
                                                init="k-means++", run_times=5)
            table[count, 3] = run_clustering.spectral(k, X, G, z, 
                                                      run_times=5)
            table[count, 4] = run_clustering.kmeans(k, X, z, 
                                                init="k-means++", run_times=5)
            table[count, 5] = run_clustering.gmm(k, X, z, 
                                                init="kmeans", run_times=5)
            count += 1

    return table
def gauss_dimensions_mean(dimensions=range(2,100,20), num_points=[100, 100],
                          num_experiments=100, d=None):
    # data distribution
    k = 2
    delta = 0.7
    if not d:
        d = dimensions[0]
    n1, n2 = num_points
    table = np.zeros((num_experiments*len(dimensions), 6))
    count = 0

    for D in dimensions:
        for i in range(num_experiments):

            # generate data
            m1 = np.zeros(D)
            s1 = np.eye(D)
            m2 = np.concatenate((delta*np.ones(d), np.zeros(D-d)))
            s2 = np.eye(D)

            X, z = data.multivariate_normal([m1, m2], [s1, s2], [n1, n2])
            rho = lambda x, y: np.linalg.norm(x-y)
            G = eclust.kernel_matrix(X, rho)

            # can change the number of times we execute each experiment
            # and initialization method as well
            table[count, 0] = D
            table[count, 1] = run_clustering.energy_hartigan(k, X, G, z, 
                                                init="k-means++", run_times=5)
            table[count, 2] = run_clustering.energy_lloyd(k, X, G, z, 
                                                init="k-means++", run_times=5)
            table[count, 3] = run_clustering.energy_spectral(k, X, G, z, 
                                                init="k-means++", run_times=5)
            table[count, 4] = run_clustering.kmeans(k, X, z, 
                                                init="k-means++", run_times=5)
            table[count, 5] = run_clustering.gmm(k, X, z, 
                                                init="kmeans", run_times=5)
            
            count += 1

    return table
def gauss_dimensions_pi(num_points=range(0, 180, 10), num_experiments=10):
    """Test unbalanced clusters."""
    k = 2
    D = 4
    d = 2
    N = 250
    table = np.zeros((num_experiments*len(num_points), 6))
    count = 0

    for p in num_points:
        for i in range(num_experiments):

            # generate data
            m1 = np.zeros(D)
            s1 = np.eye(D)
            m2 = np.concatenate((1.5*np.ones(d), np.zeros(D-d)))
            s2 = np.diag(np.concatenate((.5*np.ones(d), np.ones(D-d))))
            n1 = N-p
            n2 = N+p

            X, z = data.multivariate_normal([m1, m2], [s1, s2], [n1, n2])
            rho = lambda x, y: np.linalg.norm(x-y)
            G = eclust.kernel_matrix(X, rho)

            table[count, 0] = p
            table[count, 1] = run_clustering.energy_hartigan(k, X, G, z,
                                                init="k-means++", run_times=5)
            table[count, 2] = run_clustering.energy_lloyd(k, X, G, z,
                                                init="k-means++", run_times=5)
            table[count, 3] = run_clustering.energy_spectral(k, X, G, z,
                                                init="k-means++", run_times=5)
            table[count, 4] = run_clustering.kmeans(k, X, z,
                                                init="k-means++", run_times=5)
            table[count, 5] = run_clustering.gmm(k, X, z,
                                                init="kmeans", run_times=5)

            count += 1

    return table
def normal_or_lognormal_difference(numpoints=range(10,100,10), 
                                   num_experiments=100,
                                   kind='normal'):
    k = 2
    table = []
    for n in numpoints:
        for i in range(num_experiments):
            this_res = [n]

            # generate data
            D = 20
            d = 5
            m1 = np.zeros(D)
            s1 = 0.5*np.eye(D)
            m2 = 0.5*np.concatenate((np.ones(d), np.zeros(D-d)))
            s2 = np.eye(D)
            n1, n2 = np.random.multinomial(n, [0.5,0.5])

            if kind == 'normal':
                X, z = data.multivariate_normal([m1, m2], [s1, s2], [n1,n2])
            else:
                X, z = data.multivariate_lognormal([m1, m2], [s1, s2], [n1,n2])

            rho = lambda x, y: 2-2*np.exp(-np.linalg.norm(x-y)/2)
            G = eclust.kernel_matrix(X, rho)

            hart = run_clustering.energy_hartigan(k, X, G, z, 
                                        init="k-means++", run_times=5)
            lloyd = run_clustering.energy_lloyd(k, X, G, z, 
                                        init="k-means++", run_times=5)
            spectral = run_clustering.spectral(k, X, G, z, 
                                            run_times=5)
            this_res.append(hart-lloyd)
            this_res.append(hart-spectral)
            
            table.append(this_res)
    table = np.array(table)
    return table
    # gap statistic with k-means
    #k_hat, df_kmeans = gap_statistics(X, B=50, K=K, cluster_func=kmeans, 
    #                                     type_ref="svd")
    #print k_hat
    #df_kmeans.to_csv("experiments_data2/kmeans_synapse_gap.csv")
    #plot_gap(infile="experiments_data2/kmeans_synapse_gap.csv", 
    #         output="kmeans_synapse_gap.pdf", 
    #         xlabel="$k$", 
    #         ylabel1=r"$g_k$", 
    #         ylabel2=r"$J_k$")

    rho2 = lambda x, y: np.power(np.linalg.norm(x-y), 1)
    #rho3 = lambda x, y: np.power(np.linalg.norm(x-y), 0.5)
    rho4 = lambda x, y: np.power(np.linalg.norm(x-y), 0.25)
    G2 = eclust.kernel_matrix(X, rho2)
    #G3 = eclust.kernel_matrix(X, rho3)
    G4 = eclust.kernel_matrix(X, rho4)
    
    #gaps2 = eigenvalues(G2, K)
    #gaps3 = eigenvalues(G3, K)
    #gaps4 = eigenvalues(G4, K)
    #plot_eigenvalue_gaps([gaps2,gaps3,gaps4], output="eigen_gap.pdf")

    #elb2 = elbow_kernel(X, G2, K+2, cluster_func=energy_hartigan)
    #elb2 = np.log(elb2)
    #elb2 = elb2[1:] - elb2[:-1]
    #elb3 = elbow_kernel(X, G3, K+2, cluster_func=energy_hartigan)
    #elb3 = np.log(elb3)
    #elb3 = elb3[1:] - elb3[:-1]
    #elb4 = elbow_kernel(X, G4, K+2, cluster_func=energy_hartigan)
Example #17
    print "# Class 1:", len(labels[np.where(labels == 1)])
    print

    n0 = 500
    n1 = 500
    data_class0 = data[np.where(labels == 0)]
    data_class1 = data[np.where(labels == 1)]
    idx0 = np.random.choice(range(len(data_class0)), n0, replace=True)
    idx1 = np.random.choice(range(len(data_class1)), n1, replace=True)
    data, labels = shuffle_data([data_class0[idx0], data_class1[idx1]])

    #data = (data - data.mean(axis=0))/data.std(axis=0)

    rho = lambda x, y: np.power(np.linalg.norm(x - y), 1)
    #rho = lambda x, y: 2-2*np.exp(-np.power(np.linalg.norm(x-y),1)/(2*1**2))
    G = eclust.kernel_matrix(data, rho)

    labels_hat = run_clustering.kmeans(2, data)
    print accuracy(labels, labels_hat)
    print type_errors(labels, labels_hat)
    print

    labels_hat = run_clustering.gmm(2, data)
    print accuracy(labels, labels_hat)
    print type_errors(labels, labels_hat)
    print

    labels_hat = run_clustering.energy_hartigan(2,
                                                data,
                                                G,
                                                run_times=5,
                                                init="gmm")
Example #18
def kernel_gap_statistics(X, B, K, cluster_func, rho, type_ref='svd'):
    """Implement gap statistics from Tibshrani using any clustering method
    that operates on a kernel matrix defined by a semi metric.

    ================================================================
    NOTE: for some reason this method does not work well.
    There is a huge variability when clustering the reference data,
    specially when using the SVD approach.
    
    DON'T USE THIS METHOD!
    ================================================================
    
    Parameters:
        
        X: data set
        B: number of reference samples
        K: maximum number of clusters 
        cluster_func: function used for clusterin, accept k and X and G, 
            returns objective function value
        rho: the semimetric to generate pairwise kernel matrix
        type_ref: reference distribution method {"uniform", "svd"}
    
    """
    K = K + 1
    Wks = []  # will contain objective function values for each k
    gaps = []  # will contain the gaps
    sks = []  # will contain the variances
    log_Wkbs_sem = []  # standard error of the mean

    G = eclust.kernel_matrix(X, rho)  # generate kernel on data

    for k in range(1, K):

        # cluster original data
        Wk = cluster_func(k, X, G)
        Wks.append(Wk)

        # generate reference data and cluster this data
        Wkbs = []
        for b in range(B):
            # generate reference distribution
            if type_ref == 'svd':
                Z = draw_svd(X)
            else:
                Z = draw_uniform(X)
            # cluster reference set
            G_ref = eclust.kernel_matrix(Z, rho)
            Wkb = cluster_func(k, Z, G_ref)
            Wkbs.append(Wkb)
        log_Wkbs = np.log(Wkbs)
        l_bar = np.mean(log_Wkbs)
        log_Wkbs_sem.append(stats.sem(log_Wkbs))

        # gap statistic
        gap = l_bar - np.log(Wk)
        gaps.append(gap)

        # compute variance
        sdk = np.std(log_Wkbs)
        sk = sdk * np.sqrt(1 + 1.0 / B)
        sks.append(sk)

    # find number of clusters
    #for i in range(len(gaps)-1):
    #    delta = gaps[i] - gaps[i+1] - sks[i+1]
    #    if delta <= 0:
    #        break
    #k_hat = i+1

    # collect data for plotting purposes
    d = {
        'score': np.array(Wks),
        'gap': np.array(gaps),
        'var': np.array(sks),
        'sem': np.array(log_Wkbs_sem)
    }
    df = pd.DataFrame(data=d, index=range(1, K))
    gap_k = df['gap'][:-1].values
    gap_k_plus_1 = df['gap'][1:].values
    sigma_k_plus_1 = df['var'][1:].values
    gap2 = gap_k - (gap_k_plus_1 - sigma_k_plus_1)
    df.drop(df.tail(1).index, inplace=True)
    df['gap2'] = pd.Series(gap2, index=df.index)
    return df
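
# A hedged usage sketch; the rho and cluster_func below are illustrative
# assumptions, not fixed by the original code.  The 'gap2' column computed
# above is gap_k - (gap_{k+1} - s_{k+1}); Tibshirani's rule picks the
# smallest k with gap2 >= 0.
def pick_k_from_gap(df):
    hits = df.index[(df['gap2'] >= 0).values]
    return hits[0] if len(hits) > 0 else df.index[-1]

# Example call, assuming X and an energy-style cluster_func are in scope:
# rho = lambda x, y: np.linalg.norm(x - y)
# df = kernel_gap_statistics(X, B=50, K=10,
#                            cluster_func=energy_hartigan, rho=rho)
# k_hat = pick_k_from_gap(df)
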
    s2 = np.eye(10)
    s3 = np.eye(10)
    s4 = np.eye(10)
    n1 = n2 = n3 = n4 = 30
    X, z = data.multivariate_normal([m1,m2,m3,m4], [s1,s2,s3,s4], [n1,n2,n3,n4])

    # max number of clusters
    K = 10
    
    # gap statistic with k-means
    k_hat, df_kmeans = gap_statistics(X, B=50, K=K, cluster_func=kmeans, 
                                         type_ref="svd")
    print k_hat

    rho = lambda x, y: np.power(np.linalg.norm(x-y), 1)
    G = eclust.kernel_matrix(X, rho)
    gaps = eigenvalues(G, K)
    
    rho2 = lambda x, y: 2-2*np.exp(-np.power(np.linalg.norm(x-y),2)/2/((2)**2))
    G2 = eclust.kernel_matrix(X, rho2)
    gaps2 = eigenvalues(G2, K)
    
    plot_eigenvalue_gaps([gaps, gaps2], output="eigen_gap.pdf")

    #elb2 = elbow_kernel(X, G2, K+2, cluster_func=energy_hartigan)
    #elb2 = np.log(elb2)
    #elb2 = elb2[1:] - elb2[:-1]
    #elb3 = elbow_kernel(X, G3, K+2, cluster_func=energy_hartigan)
    #elb3 = np.log(elb3)
    #elb3 = elb3[1:] - elb3[:-1]
    #elb4 = elbow_kernel(X, G4, K+2, cluster_func=energy_hartigan)
Example #20
    k = 3
    
    rho = lambda x, y: 2-2*np.exp(-np.linalg.norm(x-y)**2/2/4)
    G = eclust.kernel_matrix(X, rho)
    """

    n = 400
    n1, n2 = np.random.multinomial(n, [0.5, 0.5])
    m1 = 1.5
    s1 = 0.3
    m2 = 0
    s2 = 1.5
    #X, z = data.univariate_normal([m1, m2], [s1, s2], [n1, n2])
    X, z = data.univariate_lognormal([m1, m2], [s1, s2], [n1, n2])
    Y = np.array([[x] for x in X])
    G = eclust.kernel_matrix(Y, lambda x, y: np.linalg.norm(x - y))

    k = 2
    init = "k-means++"
    #init="random"

    print "Energy-spectral:", energy_spectral(k,
                                              Y,
                                              G,
                                              z,
                                              init=init,
                                              run_times=5)
    print "Spectral Clustering:", spectral(k, Y, G, z, run_times=5)
    print "Energy-Lloyd:", energy_lloyd(k, Y, G, z, init=init, run_times=5)
    print "Energy-Hartigan:", energy_hartigan(k,
                                              Y,
def kernel_gap_statistics(X, B, K, cluster_func, rho, type_ref='svd'):
    """Implement gap statistics from Tibshrani using any clustering method
    that operates on a kernel matrix defined by a semi metric.

    ================================================================
    NOTE: for some reason this method does not work well.
    There is a huge variability when clustering the reference data,
    specially when using the SVD approach.
    
    DON'T USE THIS METHOD!
    ================================================================
    
    Parameters:
        
        X: data set
        B: number of reference samples
        K: maximum number of clusters 
        cluster_func: function used for clusterin, accept k and X and G, 
            returns objective function value
        rho: the semimetric to generate pairwise kernel matrix
        type_ref: reference distribution method {"uniform", "svd"}
    
    """
    K = K+1
    Wks = [] # will contain objective function values for each k
    gaps = [] # will contain the gaps
    sks = [] # will contain the variances
    log_Wkbs_sem = [] # standard error of the mean
    
    G = eclust.kernel_matrix(X, rho) # generate kernel on data

    for k in range(1, K):
    
        # cluster original data
        Wk = cluster_func(k, X, G)
        Wks.append(Wk)
        
        # generate reference data and cluster this data
        Wkbs = []
        for b in range(B):
            # generate reference distribution
            if type_ref == 'svd':
                Z = draw_svd(X)
            else:
                Z = draw_uniform(X)
            # cluster reference set
            G_ref = eclust.kernel_matrix(Z, rho) 
            Wkb = cluster_func(k, Z, G_ref)
            Wkbs.append(Wkb)
        log_Wkbs = np.log(Wkbs)
        l_bar = np.mean(log_Wkbs)
        log_Wkbs_sem.append(stats.sem(log_Wkbs))

        # gap statistic
        gap = l_bar - np.log(Wk)
        gaps.append(gap)

        # compute variance
        sdk = np.std(log_Wkbs)
        sk = sdk*np.sqrt(1 + 1.0/B)
        sks.append(sk)

    # find number of clusters
    #for i in range(len(gaps)-1):
    #    delta = gaps[i] - gaps[i+1] - sks[i+1]
    #    if delta <= 0:
    #        break
    #k_hat = i+1

    # collect data for plotting purposes
    d = {
            'score': np.array(Wks), 
            'gap': np.array(gaps), 
            'var': np.array(sks),
            'sem': np.array(log_Wkbs_sem)
    }
    df = pd.DataFrame(data=d, index=range(1,K))
    gap_k = df['gap'][:-1].values
    gap_k_plus_1 = df['gap'][1:].values
    sigma_k_plus_1 = df['var'][1:].values
    gap2 = gap_k - (gap_k_plus_1 - sigma_k_plus_1)
    df.drop(df.tail(1).index, inplace=True)
    df['gap2'] = pd.Series(gap2, index=df.index)
    return df
Example #22
    X, z = data.multivariate_normal([m1, m2, m3, m4], [s1, s2, s3, s4],
                                    [n1, n2, n3, n4])

    # max number of clusters
    K = 10

    # gap statistic with k-means
    k_hat, df_kmeans = gap_statistics(X,
                                      B=50,
                                      K=K,
                                      cluster_func=kmeans,
                                      type_ref="svd")
    print k_hat

    rho = lambda x, y: np.power(np.linalg.norm(x - y), 1)
    G = eclust.kernel_matrix(X, rho)
    gaps = eigenvalues(G, K)

    rho2 = lambda x, y: 2 - 2 * np.exp(-np.power(np.linalg.norm(x - y), 2) / 2
                                       / ((2)**2))
    G2 = eclust.kernel_matrix(X, rho2)
    gaps2 = eigenvalues(G2, K)

    plot_eigenvalue_gaps([gaps, gaps2], output="eigen_gap.pdf")

    #elb2 = elbow_kernel(X, G2, K+2, cluster_func=energy_hartigan)
    #elb2 = np.log(elb2)
    #elb2 = elb2[1:] - elb2[:-1]
    #elb3 = elbow_kernel(X, G3, K+2, cluster_func=energy_hartigan)
    #elb3 = np.log(elb3)
    #elb3 = elb3[1:] - elb3[:-1]
Example #23
    # gap statistic with k-means
    #k_hat, df_kmeans = gap_statistics(X, B=50, K=K, cluster_func=kmeans,
    #                                     type_ref="svd")
    #print k_hat
    #df_kmeans.to_csv("experiments_data2/kmeans_synapse_gap.csv")
    #plot_gap(infile="experiments_data2/kmeans_synapse_gap.csv",
    #         output="kmeans_synapse_gap.pdf",
    #         xlabel="$k$",
    #         ylabel1=r"$g_k$",
    #         ylabel2=r"$J_k$")

    rho2 = lambda x, y: np.power(np.linalg.norm(x - y), 1)
    #rho3 = lambda x, y: np.power(np.linalg.norm(x-y), 0.5)
    rho4 = lambda x, y: np.power(np.linalg.norm(x - y), 0.25)
    G2 = eclust.kernel_matrix(X, rho2)
    #G3 = eclust.kernel_matrix(X, rho3)
    G4 = eclust.kernel_matrix(X, rho4)

    #gaps2 = eigenvalues(G2, K)
    #gaps3 = eigenvalues(G3, K)
    #gaps4 = eigenvalues(G4, K)
    #plot_eigenvalue_gaps([gaps2,gaps3,gaps4], output="eigen_gap.pdf")

    #elb2 = elbow_kernel(X, G2, K+2, cluster_func=energy_hartigan)
    #elb2 = np.log(elb2)
    #elb2 = elb2[1:] - elb2[:-1]
    #elb3 = elbow_kernel(X, G3, K+2, cluster_func=energy_hartigan)
    #elb3 = np.log(elb3)
    #elb3 = elb3[1:] - elb3[:-1]
    #elb4 = elbow_kernel(X, G4, K+2, cluster_func=energy_hartigan)
    print "# Class 1:", len(labels[np.where(labels==1)])
    print

    n0 = 500
    n1 = 500
    data_class0 = data[np.where(labels==0)]
    data_class1 = data[np.where(labels==1)]
    idx0 = np.random.choice(range(len(data_class0)), n0, replace=True)
    idx1 = np.random.choice(range(len(data_class1)), n1, replace=True)
    data, labels = shuffle_data([data_class0[idx0], data_class1[idx1]])

    #data = (data - data.mean(axis=0))/data.std(axis=0)

    rho = lambda x, y: np.power(np.linalg.norm(x-y), 1)
    #rho = lambda x, y: 2-2*np.exp(-np.power(np.linalg.norm(x-y),1)/(2*1**2))
    G = eclust.kernel_matrix(data, rho)
    
    labels_hat = run_clustering.kmeans(2, data)
    print accuracy(labels, labels_hat)
    print type_errors(labels, labels_hat)
    print
    
    labels_hat = run_clustering.gmm(2, data)
    print accuracy(labels, labels_hat)
    print type_errors(labels, labels_hat)
    print
    
    labels_hat = run_clustering.energy_hartigan(2, data, G, run_times=5,
                                                    init="gmm")
    print accuracy(labels, labels_hat)
    print type_errors(labels, labels_hat)
def gauss_dimensions_mean(dimensions=range(2, 100, 20),
                          num_points=[100, 100],
                          num_experiments=100,
                          d=None):
    # data distribution
    k = 2
    delta = 0.7
    if not d:
        d = dimensions[0]
    n1, n2 = num_points
    table = np.zeros((num_experiments * len(dimensions), 6))
    count = 0

    for D in dimensions:
        for i in range(num_experiments):

            # generate data
            m1 = np.zeros(D)
            s1 = np.eye(D)
            m2 = np.concatenate((delta * np.ones(d), np.zeros(D - d)))
            s2 = np.eye(D)

            X, z = data.multivariate_normal([m1, m2], [s1, s2], [n1, n2])
            rho = lambda x, y: np.linalg.norm(x - y)
            G = eclust.kernel_matrix(X, rho)

            # can change the number of times we execute each experiment
            # and initialization method as well
            table[count, 0] = D
            table[count, 1] = run_clustering.energy_hartigan(k,
                                                             X,
                                                             G,
                                                             z,
                                                             init="k-means++",
                                                             run_times=5)
            table[count, 2] = run_clustering.energy_lloyd(k,
                                                          X,
                                                          G,
                                                          z,
                                                          init="k-means++",
                                                          run_times=5)
            table[count, 3] = run_clustering.energy_spectral(k,
                                                             X,
                                                             G,
                                                             z,
                                                             init="k-means++",
                                                             run_times=5)
            table[count, 4] = run_clustering.kmeans(k,
                                                    X,
                                                    z,
                                                    init="k-means++",
                                                    run_times=5)
            table[count, 5] = run_clustering.gmm(k,
                                                 X,
                                                 z,
                                                 init="kmeans",
                                                 run_times=5)

            count += 1

    return table
    k = 3
    
    rho = lambda x, y: 2-2*np.exp(-np.linalg.norm(x-y)**2/2/4)
    G = eclust.kernel_matrix(X, rho)
    """

    n = 400
    n1, n2 = np.random.multinomial(n, [0.5, 0.5])
    m1 = 1.5
    s1 = 0.3
    m2 = 0
    s2 = 1.5
    #X, z = data.univariate_normal([m1, m2], [s1, s2], [n1, n2])
    X, z = data.univariate_lognormal([m1, m2], [s1, s2], [n1, n2])
    Y = np.array([[x] for x in X])
    G = eclust.kernel_matrix(Y, lambda x, y: np.linalg.norm(x-y))
    
    k = 2
    init="k-means++"
    #init="random"

    print "Energy-spectral:", energy_spectral(k, Y, G, z, init=init,
                                                run_times=5)
    print "Spectral Clustering:", spectral(k, Y, G, z, run_times=5)
    print "Energy-Lloyd:", energy_lloyd(k, Y, G, z, init=init,
                                        run_times=5)
    print "Energy-Hartigan:", energy_hartigan(k, Y, G, z, init=init,
                                        run_times=5)
    print "k-means:", kmeans(k, Y, z, run_times=5, init=init)
    print  "GMM:", gmm(k, Y, z, run_times=5, init="kmeans")