Example #1
def run(self):
    k = 2
    ncols = 3 + 1  # number of clustering methods + 1
    table = np.zeros((self.experiments * len(self.numpoints), ncols))
    count = 0
    init = 'k-means++'
    for i, n in enumerate(self.numpoints):
        for ne in range(self.experiments):
            Y, z = self.get_sample(n)
            G = eclust.kernel_matrix(Y, lambda x, y: np.linalg.norm(x - y))
            table[count, 0] = n
            table[count, 1] = run_clustering.energy_hartigan(k,
                                                             Y,
                                                             G,
                                                             z,
                                                             init=init,
                                                             run_times=5)
            table[count, 2] = run_clustering.kmeans(k,
                                                    Y,
                                                    z,
                                                    init=init,
                                                    run_times=5)
            table[count, 3] = run_clustering.gmm(k,
                                                 Y,
                                                 z,
                                                 init='kmeans',
                                                 run_times=5)
            count += 1
    return table
def gauss_dimensions_mean(dimensions=range(2, 100, 20),
                          total_points=200,
                          num_experiments=100,
                          d=None):
    # data distribution
    k = 2
    delta = 0.7
    if not d:
        d = dimensions[0]
    table = np.zeros((num_experiments * len(dimensions), 6))
    count = 0

    for D in dimensions:
        for i in range(num_experiments):

            ### generate data
            m1 = np.zeros(D)
            s1 = np.eye(D)
            m2 = np.concatenate((delta * np.ones(d), np.zeros(D - d)))
            s2 = np.eye(D)
            # flip Bernoulli coins to get number of points in each cluster
            n1, n2 = np.random.multinomial(total_points, [0.5, 0.5])
            # get data, construct the Gram matrix
            X, z = data.multivariate_normal([m1, m2], [s1, s2], [n1, n2])
            rho = lambda x, y: np.linalg.norm(x - y)
            G = eclust.kernel_matrix(X, rho)
            ##################

            ### cluster with different algorithms
            # can change number of times we execute each experiment
            # and initialization as well
            table[count, 0] = D
            table[count, 1] = run_clustering.energy_hartigan(k,
                                                             X,
                                                             G,
                                                             z,
                                                             init="k-means++",
                                                             run_times=5)
            table[count, 2] = run_clustering.energy_lloyd(k,
                                                          X,
                                                          G,
                                                          z,
                                                          init="k-means++",
                                                          run_times=5)
            table[count, 3] = run_clustering.spectral(k, X, G, z, run_times=5)
            table[count, 4] = run_clustering.kmeans(k,
                                                    X,
                                                    z,
                                                    init="k-means++",
                                                    run_times=5)
            table[count, 5] = run_clustering.gmm(k,
                                                 X,
                                                 z,
                                                 init="kmeans",
                                                 run_times=5)
            ####################

            count += 1

    return table
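The table returned by gauss_dimensions_mean has one row per (dimension, experiment) pair: column 0 holds the dimension D and columns 1-5 the accuracies of energy_hartigan, energy_lloyd, spectral, k-means and GMM. A minimal way to summarize it, averaging over experiments for each dimension, is sketched below; this aggregation code is not part of the original excerpt and only assumes numpy and scipy.stats are imported as in the rest of the file.

def summarize_by_dimension(table):
    # mean accuracy and standard error per clustering method, grouped by D
    for D in np.unique(table[:, 0]):
        rows = table[table[:, 0] == D]
        print D, rows[:, 1:].mean(axis=0), scipy.stats.sem(rows[:, 1:], axis=0)

summarize_by_dimension(gauss_dimensions_mean(num_experiments=10))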
def cigars_circles(num_experiments=10, run_times=5, kind='cigars'):
    table = []
    for i in range(num_experiments):
        this_experiment = []
        
        if kind == 'cigars':
            m1 = [0,0]
            m2 = [6.5,0]
            s1 = np.array([[1,0],[0,20]])
            s2 = np.array([[1,0],[0,20]])
            X, z = data.multivariate_normal([m1, m2], [s1, s2], [200, 200])
            k = 2
            init = 'k-means++'
        elif kind == '2circles':
            X, z = data.circles([1, 3], [0.2, 0.2], [400, 400])
            k = 2
            init = 'random'
        elif kind == '3circles':
            X, z = data.circles([1, 3, 5], [0.2, 0.2, 0.2], [400, 400, 400])
            init = 'random'
            k = 3
        else:
            raise ValueError("Don't know which example to sample.")

        #sigma = 2
        sigma = 1
        G = eclust.kernel_matrix(X, rho_standard)
        G_half = eclust.kernel_matrix(X, rho_half)
        G_exp = eclust.kernel_matrix(X, lambda x,y: rho_exp(x, y, sigma))
        G_rbf = eclust.kernel_matrix(X, lambda x,y: rho_rbf(x, y, sigma))

        this_experiment.append(
            run_clustering.energy_hartigan(k,X,G,z,init=init,
                                           run_times=run_times))
        this_experiment.append(
            run_clustering.energy_hartigan(k,X,G_half,z,
                                           init=init,run_times=run_times))
        this_experiment.append(
            run_clustering.energy_hartigan(k,X,G_exp,z,
                                           init=init,run_times=run_times))
        this_experiment.append(
            run_clustering.energy_hartigan(k,X,G_rbf,z,
                                           init=init,run_times=run_times))
        #this_experiment.append(
        #    run_clustering.spectral(k,X,G_exp,z,run_times=run_times))
        
        this_experiment.append(
            run_clustering.spectral(k,X,G_rbf,z,run_times=run_times))
        
        this_experiment.append(
            run_clustering.kmeans(k,X,z,init=init,run_times=run_times))
        this_experiment.append(
            run_clustering.gmm(k,X,z,init="kmeans",run_times=run_times))
        this_experiment.append(energy.metric.accuracy(z, energy.gmm.gmm(k,X)))
        
        table.append(this_experiment)
    
    table = np.array(table)
    for i in range(8):
        print table[:,i].mean(), scipy.stats.sem(table[:,i])
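The semimetrics rho_standard, rho_half, rho_exp and rho_rbf are referenced above but not defined in this excerpt. Judging from the rho, rho2 and rho3 lambdas used in the other functions here, they are presumably along the following lines; this is a sketch, and the exact constants in the repository may differ.

def rho_standard(x, y):
    # plain Euclidean distance
    return np.linalg.norm(x - y)

def rho_half(x, y):
    # Euclidean distance raised to the power 1/2
    return np.power(np.linalg.norm(x - y), 0.5)

def rho_exp(x, y, sigma):
    # exponential semimetric, cf. the rho3 lambda below
    return 2 - 2 * np.exp(-np.linalg.norm(x - y) / (2 * sigma))

def rho_rbf(x, y, sigma):
    # Gaussian (RBF-style) semimetric on the squared distance
    return 2 - 2 * np.exp(-np.linalg.norm(x - y) ** 2 / (2 * sigma ** 2))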
def mnist(num_experiments=10, digits=[0,1,2], num_points=100, run_times=5):
    
    k = len(digits)
    f = gzip.open('experiments_data/mnist.pkl.gz', 'rb')
    train_set, valid_set, test_set = cPickle.load(f)
    f.close()
    images_train, labels_train = train_set
    images, labels = valid_set

    # use the training set to compute sigma
    X_train, z_train = sample_digits(digits, images_train, labels_train,
                                     num_points)
    n, _ = X_train.shape
    sigma = np.sqrt(sum([np.linalg.norm(X_train[i]-X_train[j])**2 
                         for i in range(n) for j in range(n)])/(n**2))
    print sigma
    print
    
    table = []
    init = 'k-means++'
    for i in range(num_experiments):
        this_experiment = []
        
        # now cluster on validation set
        X, z = sample_digits(digits, images, labels, num_points)
        
        G = eclust.kernel_matrix(X, rho_standard)
        G_half = eclust.kernel_matrix(X, rho_half)
        G_exp = eclust.kernel_matrix(X, lambda x,y: rho_exp(x, y, sigma))
        G_rbf = eclust.kernel_matrix(X, lambda x,y: rho_rbf(x, y, sigma))
        
        this_experiment.append(
            run_clustering.energy_hartigan(k,X,G,z,init=init,
                                           run_times=run_times))
        this_experiment.append(
            run_clustering.energy_hartigan(k,X,G_half,z,
                                           init=init,run_times=run_times))
        this_experiment.append(
            run_clustering.energy_hartigan(k,X,G_exp,z,
                                           init=init,run_times=run_times))
        this_experiment.append(
            run_clustering.energy_hartigan(k,X,G_rbf,z,
                                           init=init,run_times=run_times))
        this_experiment.append(
            run_clustering.spectral(k,X,G_rbf,z,
                                        run_times=run_times))
        
        this_experiment.append(
            run_clustering.kmeans(k,X,z,init="k-means++",run_times=run_times))
        this_experiment.append(
            run_clustering.gmm(k,X,z,init="kmeans",run_times=run_times))
        # my gmm was breaking for some unknown reason
        #this_experiment.append(energy.metric.accuracy(z, energy.gmm.gmm(k,X)))

        table.append(this_experiment)

    table = np.array(table)
    for i in range(table.shape[1]):
        print table[:,i].mean(), scipy.stats.sem(table[:,i])
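The sample_digits helper used above is not shown in this excerpt. A minimal sketch of what it presumably does follows, assuming num_points means points drawn per digit and that labels are re-indexed to 0..k-1; both assumptions are guesses from the call sites.

def sample_digits(digits, images, labels, num_points):
    # draw num_points images of each requested digit and relabel them 0..k-1
    X, z = [], []
    for new_label, digit in enumerate(digits):
        idx = np.where(labels == digit)[0]
        idx = np.random.choice(idx, num_points, replace=False)
        X.append(images[idx])
        z.append(new_label * np.ones(num_points, dtype=int))
    X = np.vstack(X)
    z = np.concatenate(z)
    # shuffle so the classes are not contiguous
    perm = np.random.permutation(len(z))
    return X[perm], z[perm]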
def gauss_dimensions_cov(
        dimensions=range(2, 100, 20), total_points=200, num_experiments=100,
        d=10):
    """High dimensions but with nontrivial covariance."""
    k = 2
    if not d:
        d = dimensions[0]
    table = np.zeros((num_experiments * len(dimensions), 6))
    count = 0

    for D in dimensions:
        for l in range(num_experiments):

            # generate data
            m1 = np.zeros(D)
            m2 = np.concatenate((np.ones(d), np.zeros(D - d)))
            s1 = np.eye(D)
            # diagonal covariance entries drawn from Uniform(1, 5)
            s2_1 = np.array([
                1.367, 3.175, 3.247, 4.403, 1.249, 1.969, 4.035, 4.237, 2.813,
                3.637
            ])
            s2 = np.diag(np.concatenate((s2_1, np.ones(D - d))))
            n1, n2 = np.random.multinomial(total_points, [0.5, 0.5])
            X, z = data.multivariate_normal([m1, m2], [s1, s2], [n1, n2])

            rho = lambda x, y: np.linalg.norm(x - y)
            G = eclust.kernel_matrix(X, rho)

            # can change the number of times we execute each experiment
            # and initialization method as well
            table[count, 0] = D
            table[count, 1] = run_clustering.energy_hartigan(k,
                                                             X,
                                                             G,
                                                             z,
                                                             init="k-means++",
                                                             run_times=5)
            table[count, 2] = run_clustering.energy_lloyd(k,
                                                          X,
                                                          G,
                                                          z,
                                                          init="k-means++",
                                                          run_times=5)
            table[count, 3] = run_clustering.spectral(k, X, G, z, run_times=5)
            table[count, 4] = run_clustering.kmeans(k,
                                                    X,
                                                    z,
                                                    init="k-means++",
                                                    run_times=5)
            table[count, 5] = run_clustering.gmm(k,
                                                 X,
                                                 z,
                                                 init="kmeans",
                                                 run_times=5)
            count += 1

    return table
def gauss_dimensions_pi(num_points=range(0, 180, 10), num_experiments=10):
    """Test unbalanced clusters."""
    k = 2
    D = 4
    d = 2
    N = 300
    table = np.zeros((num_experiments * len(num_points), 6))
    count = 0

    for p in num_points:
        for i in range(num_experiments):

            # generate data
            m1 = np.zeros(D)
            s1 = np.eye(D)
            m2 = np.concatenate((1.5 * np.ones(d), np.zeros(D - d)))
            s2 = np.diag(np.concatenate((.5 * np.ones(d), np.ones(D - d))))
            pi1 = (N - p) / (2.0 * N)  # mixing proportions; pi1 + pi2 = 1
            pi2 = (N + p) / (2.0 * N)
            n1, n2 = np.random.multinomial(N, [pi1, pi2])
            X, z = data.multivariate_normal([m1, m2], [s1, s2], [n1, n2])
            rho = lambda x, y: np.linalg.norm(x - y)
            G = eclust.kernel_matrix(X, rho)

            table[count, 0] = p
            table[count, 1] = run_clustering.energy_hartigan(k,
                                                             X,
                                                             G,
                                                             z,
                                                             init="k-means++",
                                                             run_times=5)
            table[count, 2] = run_clustering.energy_lloyd(k,
                                                          X,
                                                          G,
                                                          z,
                                                          init="k-means++",
                                                          run_times=5)
            table[count, 3] = run_clustering.spectral(k, X, G, z, run_times=5)
            table[count, 4] = run_clustering.kmeans(k,
                                                    X,
                                                    z,
                                                    init="k-means++",
                                                    run_times=5)
            table[count, 5] = run_clustering.gmm(k,
                                                 X,
                                                 z,
                                                 init="kmeans",
                                                 run_times=5)

            count += 1

    return table
def normal_or_lognormal(numpoints=range(10,100,10), num_experiments=100,
                        kind='normal'):
    table = np.zeros((num_experiments*len(numpoints), 6))
    count = 0
    k = 2

    for n in numpoints:
        for i in range(num_experiments):

            # generate data
            D = 20
            d = 5
            m1 = np.zeros(D)
            s1 = 0.5*np.eye(D)
            m2 = 0.5*np.concatenate((np.ones(d), np.zeros(D-d)))
            s2 = np.eye(D)
            n1, n2 = np.random.multinomial(n, [0.5,0.5])

            if kind == 'normal':
                X, z = data.multivariate_normal([m1,m2], [s1,s2], [n1,n2])
            else:
                X, z = data.multivariate_lognormal([m1,m2], [s1,s2], [n1,n2])

            rho = lambda x, y: np.linalg.norm(x-y)
            G = eclust.kernel_matrix(X, rho)

            rho2 = lambda x, y: np.power(np.linalg.norm(x-y), 0.5)
            G2 = eclust.kernel_matrix(X, rho2)

            rho3 = lambda x, y: 2-2*np.exp(-np.linalg.norm(x-y)/2)
            G3 = eclust.kernel_matrix(X, rho3)

            table[count, 0] = n
            table[count, 1] = run_clustering.energy_hartigan(k, X, G, z, 
                                                init="k-means++", run_times=5)
            table[count, 2] = run_clustering.energy_hartigan(k, X, G2, z, 
                                                init="k-means++", run_times=5)
            table[count, 3] = run_clustering.energy_hartigan(k, X, G3, z, 
                                                init="k-means++", run_times=5)
            table[count, 4] = run_clustering.kmeans(k, X, z, 
                                                init="k-means++", run_times=5)
            table[count, 5] = run_clustering.gmm(k, X, z, 
                                                init="kmeans", run_times=5)
            
            count += 1

    return table
Example #8
def normal_or_lognormal(numpoints=range(10,100,10), num_experiments=100,
                        kind='normal'):
    table = np.zeros((num_experiments*len(numpoints), 6))
    count = 0
    k = 2

    for n in numpoints:
        for i in range(num_experiments):

            # generate data
            D = 20
            d = 5
            m1 = np.zeros(D)
            s1 = 0.5*np.eye(D)
            m2 = 0.5*np.concatenate((np.ones(d), np.zeros(D-d)))
            s2 = np.eye(D)

            if kind == 'normal':
                X, z = data.multivariate_normal([m1, m2], [s1, s2], [n, n])
            else:
                X, z = data.multivariate_lognormal([m1, m2], [s1, s2], [n, n])

            rho = lambda x, y: np.linalg.norm(x-y)
            G = eclust.kernel_matrix(X, rho)

            rho2 = lambda x, y: np.power(np.linalg.norm(x-y), 0.5)
            G2 = eclust.kernel_matrix(X, rho2)

            rho3 = lambda x, y: 2-2*np.exp(-np.linalg.norm(x-y)/2)
            G3 = eclust.kernel_matrix(X, rho3)

            table[count, 0] = n
            table[count, 1] = run_clustering.energy_hartigan(k, X, G, z, 
                                                init="k-means++", run_times=5)
            table[count, 2] = run_clustering.energy_hartigan(k, X, G2, z, 
                                                init="k-means++", run_times=5)
            table[count, 3] = run_clustering.energy_hartigan(k, X, G3, z, 
                                                init="k-means++", run_times=5)
            table[count, 4] = run_clustering.kmeans(k, X, z, 
                                                init="k-means++", run_times=5)
            table[count, 5] = run_clustering.gmm(k, X, z, 
                                                init="kmeans", run_times=5)
            
            count += 1

    return table
def gauss_dimensions_mean(dimensions=range(2,100,20), total_points=200,
                          num_experiments=100, d=None):
    # data distribution
    k = 2
    delta = 0.7
    if not d:
        d = dimensions[0]
    table = np.zeros((num_experiments*len(dimensions), 6))
    count = 0

    for D in dimensions:
        for i in range(num_experiments):

            ### generate data
            m1 = np.zeros(D)
            s1 = np.eye(D)
            m2 = np.concatenate((delta*np.ones(d), np.zeros(D-d)))
            s2 = np.eye(D)
            # flip Bernoulli coins to get number of points in each cluster
            n1, n2 = np.random.multinomial(total_points, [0.5,0.5])
            # get data, construct the Gram matrix
            X, z = data.multivariate_normal([m1, m2], [s1, s2], [n1, n2])
            rho = lambda x, y: np.linalg.norm(x-y)
            G = eclust.kernel_matrix(X, rho)
            ##################

            ### cluster with different algorithms
            # can change number of times we execute each experiment
            # and initialization as well
            table[count, 0] = D
            table[count, 1] = run_clustering.energy_hartigan(k, X, G, z, 
                                                init="k-means++", run_times=5)
            table[count, 2] = run_clustering.energy_lloyd(k, X, G, z, 
                                                init="k-means++", run_times=5)
            table[count, 3] = run_clustering.spectral(k, X, G, z, 
                                                run_times=5)
            table[count, 4] = run_clustering.kmeans(k, X, z, 
                                                init="k-means++", run_times=5)
            table[count, 5] = run_clustering.gmm(k, X, z, 
                                                init="kmeans", run_times=5)
            ####################
            
            count += 1

    return table
def run(self):
    k = 2
    ncols = 3 + 1  # number of clustering methods + 1
    table = np.zeros((self.experiments*len(self.numpoints), ncols))
    count = 0
    init = 'k-means++'
    for i, n in enumerate(self.numpoints):
        for ne in range(self.experiments):
            Y, z = self.get_sample(n)
            G = eclust.kernel_matrix(Y, lambda x, y: np.linalg.norm(x-y))
            table[count, 0] = n
            table[count, 1] = run_clustering.energy_hartigan(k, Y, G, z,
                                                init=init, run_times=5)
            table[count, 2] = run_clustering.kmeans(k, Y, z, init=init,
                                                run_times=5)
            table[count, 3] = run_clustering.gmm(k, Y, z, init='kmeans',
                                                run_times=5)
            count += 1
    return table
def run(self):
    k = 2
    ncols = 3 + 1  # number of clustering methods + 1
    table = np.zeros((self.experiments*len(self.numpoints), ncols))
    count = 0
    for i, n in enumerate(self.numpoints):
        for ne in range(self.experiments):
            X, z = self.get_sample(n)
            # reshape the 1-D sample into column vectors for kmeans/gmm
            Y = np.array([[x] for x in X])
            #G = eclust.kernel_matrix(Y, lambda x, y: np.linalg.norm(x-y))
            table[count, 0] = n
            table[count, 1] = run_clustering.energy1D(X, z)
            table[count, 2] = run_clustering.kmeans(k, Y, z, run_times=5)
            table[count, 3] = run_clustering.gmm(k, Y, z, run_times=5)
            #table[count, 4] = run_clustering.energy_hartigan(k, X, G, z,
            #                            init="k-means++", run_times=3)
            #table[count, 4] = run_clustering.energy_hartigan(k, Y, G, z,
            #                    init="spectral")
            count += 1
    return table
def gauss_dimensions_cov(dimensions=range(2,100,20), total_points=200,
                         num_experiments=100, d=10):
    """High dimensions but with nontrivial covariance."""
    k = 2
    if not d:
        d = dimensions[0]
    table = np.zeros((num_experiments*len(dimensions), 6))
    count = 0

    for D in dimensions:
        for l in range(num_experiments):

            # generate data
            m1 = np.zeros(D)
            m2 = np.concatenate((np.ones(d), np.zeros(D-d)))
            s1 = np.eye(D)
            # diagonal covariance entries drawn from Uniform(1, 5)
            s2_1 = np.array([1.367, 3.175, 3.247, 4.403, 1.249,
                             1.969, 4.035, 4.237, 2.813, 3.637])
            s2 = np.diag(np.concatenate((s2_1, np.ones(D-d))))
            n1, n2 = np.random.multinomial(total_points, [0.5,0.5])
            X, z = data.multivariate_normal([m1, m2], [s1, s2], [n1, n2])

            rho = lambda x, y: np.linalg.norm(x-y)
            G = eclust.kernel_matrix(X, rho)
            
            # can change the number of times we execute each experiment
            # and initialization method as well
            table[count, 0] = D
            table[count, 1] = run_clustering.energy_hartigan(k, X, G, z, 
                                                init="k-means++", run_times=5)
            table[count, 2] = run_clustering.energy_lloyd(k, X, G, z, 
                                                init="k-means++", run_times=5)
            table[count, 3] = run_clustering.spectral(k, X, G, z, 
                                                      run_times=5)
            table[count, 4] = run_clustering.kmeans(k, X, z, 
                                                init="k-means++", run_times=5)
            table[count, 5] = run_clustering.gmm(k, X, z, 
                                                init="kmeans", run_times=5)
            count += 1

    return table
def gauss_dimensions_mean(dimensions=range(2,100,20), num_points=[100, 100],
                          num_experiments=100, d=None):
    # data distribution
    k = 2
    delta = 0.7
    if not d:
        d = dimensions[0]
    n1, n2 = num_points
    table = np.zeros((num_experiments*len(dimensions), 6))
    count = 0

    for D in dimensions:
        for i in range(num_experiments):

            # generate data
            m1 = np.zeros(D)
            s1 = np.eye(D)
            m2 = np.concatenate((delta*np.ones(d), np.zeros(D-d)))
            s2 = np.eye(D)

            X, z = data.multivariate_normal([m1, m2], [s1, s2], [n1, n2])
            rho = lambda x, y: np.linalg.norm(x-y)
            G = eclust.kernel_matrix(X, rho)

            # can change the number of times we execute each experiment
            # and initialization method as well
            table[count, 0] = D
            table[count, 1] = run_clustering.energy_hartigan(k, X, G, z, 
                                                init="k-means++", run_times=5)
            table[count, 2] = run_clustering.energy_lloyd(k, X, G, z, 
                                                init="k-means++", run_times=5)
            table[count, 3] = run_clustering.energy_spectral(k, X, G, z, 
                                                init="k-means++", run_times=5)
            table[count, 4] = run_clustering.kmeans(k, X, z, 
                                                init="k-means++", run_times=5)
            table[count, 5] = run_clustering.gmm(k, X, z, 
                                                init="kmeans", run_times=5)
            
            count += 1

    return table
def gauss_dimensions_pi(num_points=range(0, 180, 10), num_experiments=10):
    """Test unbalanced clusters."""
    k = 2
    D = 4
    d = 2
    N = 250
    table = np.zeros((num_experiments*len(num_points), 6))
    count = 0

    for p in num_points:
        for i in range(num_experiments):

            # generate data
            m1 = np.zeros(D)
            s1 = np.eye(D)
            m2 = np.concatenate((1.5*np.ones(d), np.zeros(D-d)))
            s2 = np.diag(np.concatenate((.5*np.ones(d), np.ones(D-d))))
            n1 = N-p
            n2 = N+p

            X, z = data.multivariate_normal([m1, m2], [s1, s2], [n1, n2])
            rho = lambda x, y: np.linalg.norm(x-y)
            G = eclust.kernel_matrix(X, rho)

            table[count, 0] = p
            table[count, 1] = run_clustering.energy_hartigan(k, X, G, z,
                                                init="k-means++", run_times=5)
            table[count, 2] = run_clustering.energy_lloyd(k, X, G, z,
                                                init="k-means++", run_times=5)
            table[count, 3] = run_clustering.energy_spectral(k, X, G, z,
                                                init="k-means++", run_times=5)
            table[count, 4] = run_clustering.kmeans(k, X, z,
                                                init="k-means++", run_times=5)
            table[count, 5] = run_clustering.gmm(k, X, z,
                                                init="kmeans", run_times=5)

            count += 1

    return table
def gauss_dimensions_mean(dimensions=range(2, 100, 20),
                          num_points=[100, 100],
                          num_experiments=100,
                          d=None):
    # data distribution
    k = 2
    delta = 0.7
    if not d:
        d = dimensions[0]
    n1, n2 = num_points
    table = np.zeros((num_experiments * len(dimensions), 6))
    count = 0

    for D in dimensions:
        for i in range(num_experiments):

            # generate data
            m1 = np.zeros(D)
            s1 = np.eye(D)
            m2 = np.concatenate((delta * np.ones(d), np.zeros(D - d)))
            s2 = np.eye(D)

            X, z = data.multivariate_normal([m1, m2], [s1, s2], [n1, n2])
            rho = lambda x, y: np.linalg.norm(x - y)
            G = eclust.kernel_matrix(X, rho)

            # can change the number of times we execute each experiment
            # and initialization method as well
            table[count, 0] = D
            table[count, 1] = run_clustering.energy_hartigan(k,
                                                             X,
                                                             G,
                                                             z,
                                                             init="k-means++",
                                                             run_times=5)
            table[count, 2] = run_clustering.energy_lloyd(k,
                                                          X,
                                                          G,
                                                          z,
                                                          init="k-means++",
                                                          run_times=5)
            table[count, 3] = run_clustering.energy_spectral(k,
                                                             X,
                                                             G,
                                                             z,
                                                             init="k-means++",
                                                             run_times=5)
            table[count, 4] = run_clustering.kmeans(k,
                                                    X,
                                                    z,
                                                    init="k-means++",
                                                    run_times=5)
            table[count, 5] = run_clustering.gmm(k,
                                                 X,
                                                 z,
                                                 init="kmeans",
                                                 run_times=5)

            count += 1

    return table
Example #16
# delete missing entries
delete_missing = np.where(data == '?')[0]
data = np.delete(data, delete_missing, axis=0)
data = np.array(data, dtype=float)
true_labels = np.delete(true_labels, delete_missing, axis=0)

# normalize data
data = (data - data.mean(axis=0)) / data.std(axis=0)

G = energy.eclust.kernel_matrix(data, rho)
#G = energy.eclust.kernel_matrix(data, rho_gauss)
#G = energy.eclust.kernel_matrix(data, rho_exp)

kmeans_labels = cluster.kmeans(6, data, run_times=10, init="k-means++")
gmm_labels = cluster.gmm(6, data, run_times=10, init="kmeans")
spectral_labels = cluster.spectral(6, data, G, run_times=10)
energy_spectral_labels = cluster.energy_spectral(6, data, G, run_times=10)
lloyd_labels = cluster.energy_lloyd(6, data, G, run_times=10, init="spectral")
hart_labels = cluster.energy_hartigan(6,
                                      data,
                                      G,
                                      run_times=10,
                                      init="spectral")

t = PrettyTable([
    'Algorithm', 'Accuracy', 'A-Rand', 'Mutual Info', 'V-Measure',
    'Fowlkes-Mallows'
])

algos = [
Example #17
    data_class0 = data[np.where(labels == 0)]
    data_class1 = data[np.where(labels == 1)]
    idx0 = np.random.choice(range(len(data_class0)), n0, replace=True)
    idx1 = np.random.choice(range(len(data_class1)), n1, replace=True)
    data, labels = shuffle_data([data_class0[idx0], data_class1[idx1]])

    #data = (data - data.mean(axis=0))/data.std(axis=0)

    rho = lambda x, y: np.power(np.linalg.norm(x - y), 1)
    #rho = lambda x, y: 2-2*np.exp(-np.power(np.linalg.norm(x-y),1)/(2*1**2))
    G = eclust.kernel_matrix(data, rho)

    labels_hat = run_clustering.kmeans(2, data)
    print accuracy(labels, labels_hat)
    print type_errors(labels, labels_hat)
    print

    labels_hat = run_clustering.gmm(2, data)
    print accuracy(labels, labels_hat)
    print type_errors(labels, labels_hat)
    print

    labels_hat = run_clustering.energy_hartigan(2,
                                                data,
                                                G,
                                                run_times=5,
                                                init="gmm")
    print accuracy(labels, labels_hat)
    print type_errors(labels, labels_hat)
    print
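The accuracy and type_errors helpers used above are not defined in this excerpt. For the two-cluster case here, a minimal accuracy helper along these lines would match the usage; this is a sketch, not the repository's energy.metric implementation, and type_errors is not sketched.

def accuracy(z_true, z_pred):
    # two-cluster accuracy, invariant to swapping the cluster labels
    z_true = np.asarray(z_true)
    z_pred = np.asarray(z_pred)
    direct = np.mean(z_true == z_pred)
    flipped = np.mean(z_true == 1 - z_pred)
    return max(direct, flipped)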
# delete missing entries
delete_missing = np.where(data=='?')[0]
data = np.delete(data, delete_missing, axis=0)
data = np.array(data, dtype=float)
true_labels = np.delete(true_labels, delete_missing, axis=0)

# normalize data
data = (data - data.mean(axis=0))/data.std(axis=0)

G = energy.eclust.kernel_matrix(data, rho)
#G = energy.eclust.kernel_matrix(data, rho_gauss)
#G = energy.eclust.kernel_matrix(data, rho_exp)

kmeans_labels = cluster.kmeans(6, data, run_times=10, init="k-means++")
gmm_labels = cluster.gmm(6, data, run_times=10, init="kmeans")
spectral_labels = cluster.spectral(6, data, G, run_times=10)
energy_spectral_labels = cluster.energy_spectral(6, data, G, run_times=10)
lloyd_labels = cluster.energy_lloyd(6, data, G, run_times=10, init="spectral")
hart_labels = cluster.energy_hartigan(6,data,G,run_times=10,init="spectral")

t = PrettyTable(['Algorithm', 'Accuracy', 'A-Rand', 'Mutual Info', 'V-Measure', 
                'Fowlkes-Mallows'])

algos = ['kmeans', 'GMM', 'spectral', 'energy_spectral', 'energy_lloyd',
         'energy_hartigan']
pred_labels = [kmeans_labels, gmm_labels, spectral_labels,
               energy_spectral_labels, lloyd_labels, hart_labels]

for algo, pred_label in zip(algos, pred_labels):
    t.add_row([algo, 
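Both copies of this snippet break off inside the add_row call. A plausible completion is sketched below, assuming the table columns map onto the usual scikit-learn clustering scores plus an accuracy helper like the repository's energy.metric.accuracy; these function choices are an assumption, not the original code.

from sklearn import metrics

for algo, pred_label in zip(algos, pred_labels):
    t.add_row([algo,
               accuracy(true_labels, pred_label),  # assumed helper
               metrics.adjusted_rand_score(true_labels, pred_label),
               metrics.normalized_mutual_info_score(true_labels, pred_label),
               metrics.v_measure_score(true_labels, pred_label),
               metrics.fowlkes_mallows_score(true_labels, pred_label)])
print t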
    n1 = 500
    data_class0 = data[np.where(labels==0)]
    data_class1 = data[np.where(labels==1)]
    idx0 = np.random.choice(range(len(data_class0)), n0, replace=True)
    idx1 = np.random.choice(range(len(data_class1)), n1, replace=True)
    data, labels = shuffle_data([data_class0[idx0], data_class1[idx1]])

    #data = (data - data.mean(axis=0))/data.std(axis=0)

    rho = lambda x, y: np.power(np.linalg.norm(x-y), 1)
    #rho = lambda x, y: 2-2*np.exp(-np.power(np.linalg.norm(x-y),1)/(2*1**2))
    G = eclust.kernel_matrix(data, rho)
    
    labels_hat = run_clustering.kmeans(2, data)
    print accuracy(labels, labels_hat)
    print type_errors(labels, labels_hat)
    print
    
    labels_hat = run_clustering.gmm(2, data)
    print accuracy(labels, labels_hat)
    print type_errors(labels, labels_hat)
    print
    
    labels_hat = run_clustering.energy_hartigan(2, data, G, run_times=5,
                                                    init="gmm")
    print accuracy(labels, labels_hat)
    print type_errors(labels, labels_hat)
    print