def gauss_dimensions_mean(dimensions=range(2, 100, 20),
                          total_points=200,
                          num_experiments=100,
                          d=None):
    """Benchmark five clustering routines on two Gaussians with shifted means.

    For every ambient dimension D in `dimensions`, `num_experiments` data
    sets are drawn from two identity-covariance Gaussians: one centred at the
    origin, the other with its first `d` coordinates set to 0.7.  The
    `total_points` samples are split between the two classes by a fair
    multinomial draw.

    Parameters
    ----------
    dimensions : sequence of int
        Ambient dimensions to test (must support len() and indexing).
    total_points : int
        Total number of samples per data set.
    num_experiments : int
        Number of repetitions per dimension.
    d : int, optional
        Number of leading coordinates that carry the mean shift.  Any falsy
        value (None or 0) falls back to ``dimensions[0]``.

    Returns
    -------
    numpy.ndarray
        Array of shape (num_experiments * len(dimensions), 6).  Column 0 is
        D; columns 1-5 hold whatever run_clustering.energy_hartigan,
        energy_lloyd, spectral, kmeans and gmm return, in that order
        (presumably an accuracy score -- confirm in run_clustering).
    """
    n_clusters = 2
    shift = 0.7
    if not d:
        d = dimensions[0]

    def euclidean(x, y):
        # plain Euclidean distance, used to build the Gram matrix
        return np.linalg.norm(x - y)

    # options shared by every k-means++ initialised method
    kmeanspp_opts = dict(init="k-means++", run_times=5)
    table = np.zeros((num_experiments * len(dimensions), 6))

    for dim_idx, D in enumerate(dimensions):
        for rep in range(num_experiments):
            # rows are filled dimension by dimension, repetition by repetition
            row = dim_idx * num_experiments + rep

            # sample the two classes
            means = [np.zeros(D),
                     np.concatenate((shift * np.ones(d), np.zeros(D - d)))]
            covs = [np.eye(D), np.eye(D)]
            size_a, size_b = np.random.multinomial(total_points, [0.5, 0.5])
            X, z = data.multivariate_normal(means, covs, [size_a, size_b])
            G = eclust.kernel_matrix(X, euclidean)

            # run every algorithm on the same sample
            table[row, 0] = D
            table[row, 1] = run_clustering.energy_hartigan(
                n_clusters, X, G, z, **kmeanspp_opts)
            table[row, 2] = run_clustering.energy_lloyd(
                n_clusters, X, G, z, **kmeanspp_opts)
            table[row, 3] = run_clustering.spectral(
                n_clusters, X, G, z, run_times=5)
            table[row, 4] = run_clustering.kmeans(
                n_clusters, X, z, **kmeanspp_opts)
            table[row, 5] = run_clustering.gmm(
                n_clusters, X, z, init="kmeans", run_times=5)

    return table
def cigars_circles(num_experiments=10, run_times=5, kind='cigars'):
    """Run several clustering methods on a 2D toy data set and print stats.

    Each experiment samples one data set, builds four kernel matrices from
    the module-level functions rho_standard, rho_half, rho_exp and rho_rbf
    (the last two with sigma = 1), and records eight results in this order:
    energy_hartigan on each of the four kernels, spectral on the rbf kernel,
    kmeans, gmm via run_clustering, and energy.metric.accuracy of
    energy.gmm.gmm.  After all experiments, one line per method is printed
    with the column mean and its standard error (scipy.stats.sem).

    Parameters
    ----------
    num_experiments : int
        Number of independent data sets to sample.  With 0 nothing is
        printed (previously this crashed while indexing an empty array).
    run_times : int
        Forwarded as ``run_times`` to every run_clustering call.
    kind : {'cigars', '2circles', '3circles'}
        Which example to sample: two elongated Gaussians with 200 points
        each, or two/three circles with 400 points each.

    Returns
    -------
    None

    Raises
    ------
    ValueError
        If `kind` is not one of the supported examples (raised when the
        first experiment is sampled).
    """
    table = []
    for _ in range(num_experiments):
        if kind == 'cigars':
            m1 = [0, 0]
            m2 = [6.5, 0]
            s1 = np.array([[1, 0], [0, 20]])
            s2 = np.array([[1, 0], [0, 20]])
            X, z = data.multivariate_normal([m1, m2], [s1, s2], [200, 200])
            k = 2
            init = 'k-means++'
        elif kind == '2circles':
            X, z = data.circles([1, 3], [0.2, 0.2], [400, 400])
            k = 2
            init = 'random'
        elif kind == '3circles':
            X, z = data.circles([1, 3, 5], [0.2, 0.2, 0.2], [400, 400, 400])
            init = 'random'
            k = 3
        else:
            raise ValueError("Don't know which example to sample.")

        sigma = 1
        G = eclust.kernel_matrix(X, rho_standard)
        G_half = eclust.kernel_matrix(X, rho_half)
        G_exp = eclust.kernel_matrix(X, lambda x, y: rho_exp(x, y, sigma))
        G_rbf = eclust.kernel_matrix(X, lambda x, y: rho_rbf(x, y, sigma))

        # The list literal is evaluated left to right, so the methods run in
        # the same order as before.
        table.append([
            run_clustering.energy_hartigan(k, X, G, z, init=init,
                                           run_times=run_times),
            run_clustering.energy_hartigan(k, X, G_half, z, init=init,
                                           run_times=run_times),
            run_clustering.energy_hartigan(k, X, G_exp, z, init=init,
                                           run_times=run_times),
            run_clustering.energy_hartigan(k, X, G_rbf, z, init=init,
                                           run_times=run_times),
            run_clustering.spectral(k, X, G_rbf, z, run_times=run_times),
            run_clustering.kmeans(k, X, z, init=init, run_times=run_times),
            run_clustering.gmm(k, X, z, init="kmeans", run_times=run_times),
            energy.metric.accuracy(z, energy.gmm.gmm(k, X)),
        ])

    if not table:
        # no experiments were run, so there is nothing to summarise
        return

    table = np.array(table)
    # Iterate over the real number of columns instead of a hard-coded 8, and
    # format explicitly so the output is the same on Python 2 and Python 3.
    for col in range(table.shape[1]):
        print("%s %s" % (table[:, col].mean(), scipy.stats.sem(table[:, col])))
def gauss_dimensions_cov(
        dimensions=range(2, 100, 20), total_points=200, num_experiments=100,
        d=10):
    """High dimensions but with nontrivial covariance.

    For every ambient dimension D in `dimensions`, `num_experiments` data
    sets are drawn from two Gaussians.  The first is standard normal; the
    second has its first `d` mean coordinates set to 1 and a diagonal
    covariance whose leading entries are ten fixed values (originally drawn
    from uniform(1, 5)) followed by ones.

    Parameters
    ----------
    dimensions : sequence of int
        Ambient dimensions to test; every entry must be >= d.
    total_points : int
        Total number of samples, split between classes by a fair
        multinomial draw.
    num_experiments : int
        Number of repetitions per dimension.
    d : int
        Number of leading coordinates with shifted mean.  A falsy value
        falls back to ``dimensions[0]``.

    Returns
    -------
    numpy.ndarray
        Array of shape (num_experiments * len(dimensions), 6).  Column 0 is
        D; columns 1-5 hold the values returned by
        run_clustering.energy_hartigan, energy_lloyd, spectral, kmeans and
        gmm, in that order.

    Raises
    ------
    ValueError
        If any entry of `dimensions` is smaller than `d`.  Note that the
        default arguments (dimensions starting at 2, d=10) fall in this
        case: they used to fail inside numpy with "negative dimensions are
        not allowed" and are now rejected up front with a clear message.
    """
    k = 2
    if not d:
        d = dimensions[0]

    # Fail fast: np.zeros(D - d) below cannot be built when D < d.
    too_small = [dim for dim in dimensions if dim < d]
    if too_small:
        raise ValueError(
            "every entry of dimensions must be >= d=%s; got %s" %
            (d, too_small))

    table = np.zeros((num_experiments * len(dimensions), 6))
    count = 0

    for D in dimensions:
        for l in range(num_experiments):

            # generate data
            m1 = np.zeros(D)
            m2 = np.concatenate((np.ones(d), np.zeros(D - d)))
            s1 = np.eye(D)
            # from uniform 1, 5
            s2_1 = np.array([
                1.367, 3.175, 3.247, 4.403, 1.249, 1.969, 4.035, 4.237, 2.813,
                3.637
            ])
            # NOTE(review): s2_1 always has 10 entries, so s2 is D x D only
            # when d == 10; other values of d give a covariance whose size
            # differs from the mean's -- confirm the intended behaviour.
            s2 = np.diag(np.concatenate((s2_1, np.ones(D - d))))
            n1, n2 = np.random.multinomial(total_points, [0.5, 0.5])
            X, z = data.multivariate_normal([m1, m2], [s1, s2], [n1, n2])

            rho = lambda x, y: np.linalg.norm(x - y)
            G = eclust.kernel_matrix(X, rho)

            # can change the number of times we execute each experiment
            # and initialization method as well
            table[count, 0] = D
            table[count, 1] = run_clustering.energy_hartigan(k,
                                                             X,
                                                             G,
                                                             z,
                                                             init="k-means++",
                                                             run_times=5)
            table[count, 2] = run_clustering.energy_lloyd(k,
                                                          X,
                                                          G,
                                                          z,
                                                          init="k-means++",
                                                          run_times=5)
            table[count, 3] = run_clustering.spectral(k, X, G, z, run_times=5)
            table[count, 4] = run_clustering.kmeans(k,
                                                    X,
                                                    z,
                                                    init="k-means++",
                                                    run_times=5)
            table[count, 5] = run_clustering.gmm(k,
                                                 X,
                                                 z,
                                                 init="kmeans",
                                                 run_times=5)
            count += 1

    return table
def gauss_dimensions_pi(num_points=range(0, 180, 10), num_experiments=10):
    """Test unbalanced clusters.

    Two Gaussians in D = 4 dimensions are sampled with N = 300 points in
    total.  For each imbalance `p` in `num_points` the class sizes are drawn
    from a multinomial with probabilities (N - p) / (2N) and (N + p) / (2N),
    so p = 0 is balanced and larger p favours the second class.

    Parameters
    ----------
    num_points : sequence of int
        Imbalance values p to test (must support len()).
    num_experiments : int
        Number of repetitions per imbalance value.

    Returns
    -------
    numpy.ndarray
        Array of shape (num_experiments * len(num_points), 6).  Column 0 is
        p; columns 1-5 hold the values returned by
        run_clustering.energy_hartigan, energy_lloyd, spectral, kmeans and
        gmm, in that order.
    """
    k = 2
    D = 4
    d = 2
    N = 300
    table = np.zeros((num_experiments * len(num_points), 6))
    count = 0

    for p in num_points:
        # float() forces true division.  Under Python 2, (N - p) / N with
        # integer operands truncates to 0 for every p > 0, which silently
        # turned the class probabilities into [0, 0.5].
        pi1 = float(N - p) / N / 2.
        pi2 = float(N + p) / N / 2.

        for i in range(num_experiments):

            # generate data
            m1 = np.zeros(D)
            s1 = np.eye(D)
            m2 = np.concatenate((1.5 * np.ones(d), np.zeros(D - d)))
            s2 = np.diag(np.concatenate((.5 * np.ones(d), np.ones(D - d))))
            n1, n2 = np.random.multinomial(N, [pi1, pi2])
            X, z = data.multivariate_normal([m1, m2], [s1, s2], [n1, n2])
            rho = lambda x, y: np.linalg.norm(x - y)
            G = eclust.kernel_matrix(X, rho)

            table[count, 0] = p
            table[count, 1] = run_clustering.energy_hartigan(k,
                                                             X,
                                                             G,
                                                             z,
                                                             init="k-means++",
                                                             run_times=5)
            table[count, 2] = run_clustering.energy_lloyd(k,
                                                          X,
                                                          G,
                                                          z,
                                                          init="k-means++",
                                                          run_times=5)
            table[count, 3] = run_clustering.spectral(k, X, G, z, run_times=5)
            table[count, 4] = run_clustering.kmeans(k,
                                                    X,
                                                    z,
                                                    init="k-means++",
                                                    run_times=5)
            table[count, 5] = run_clustering.gmm(k,
                                                 X,
                                                 z,
                                                 init="kmeans",
                                                 run_times=5)

            count += 1

    return table
def normal_or_lognormal(numpoints=range(10,100,10), num_experiments=100,
                        kind='normal'):
    """Compare energy clustering under three metrics against kmeans and gmm.

    For each total sample size n in `numpoints`, `num_experiments` data sets
    are drawn in 20 dimensions from two classes (the first 5 coordinates of
    the second mean are 0.5) using data.multivariate_normal when
    ``kind == 'normal'`` and data.multivariate_lognormal for any other
    value.  The n points are split by a fair multinomial draw.

    Returns an array of shape (num_experiments * len(numpoints), 6): column
    0 is n, columns 1-3 are run_clustering.energy_hartigan results for the
    Euclidean distance, its square root, and 2 - 2 exp(-||x - y|| / 2);
    columns 4 and 5 are the run_clustering.kmeans and gmm results.
    """
    n_clusters = 2
    ambient_dim = 20
    shifted_dims = 5
    kmeanspp_opts = dict(init="k-means++", run_times=5)

    def dist(x, y):
        return np.linalg.norm(x - y)

    def sqrt_dist(x, y):
        return np.power(np.linalg.norm(x - y), 0.5)

    def exp_dist(x, y):
        return 2 - 2 * np.exp(-np.linalg.norm(x - y) / 2)

    table = np.zeros((num_experiments * len(numpoints), 6))

    for size_idx, n in enumerate(numpoints):
        for rep in range(num_experiments):
            row = size_idx * num_experiments + rep

            # sample the two classes
            means = [
                np.zeros(ambient_dim),
                0.5 * np.concatenate((np.ones(shifted_dims),
                                      np.zeros(ambient_dim - shifted_dims))),
            ]
            covs = [0.5 * np.eye(ambient_dim), np.eye(ambient_dim)]
            size_a, size_b = np.random.multinomial(n, [0.5, 0.5])
            sampler = (data.multivariate_normal if kind == 'normal'
                       else data.multivariate_lognormal)
            X, z = sampler(means, covs, [size_a, size_b])

            # one Gram matrix per metric
            G = eclust.kernel_matrix(X, dist)
            G2 = eclust.kernel_matrix(X, sqrt_dist)
            G3 = eclust.kernel_matrix(X, exp_dist)

            table[row, 0] = n
            for col, gram in enumerate((G, G2, G3), 1):
                table[row, col] = run_clustering.energy_hartigan(
                    n_clusters, X, gram, z, **kmeanspp_opts)
            table[row, 4] = run_clustering.kmeans(
                n_clusters, X, z, **kmeanspp_opts)
            table[row, 5] = run_clustering.gmm(
                n_clusters, X, z, init="kmeans", run_times=5)

    return table
# Example #6
# 0
def normal_or_lognormal(numpoints=range(10,100,10), num_experiments=100,
                        kind='normal'):
    """Compare energy clustering under three metrics against kmeans and gmm.

    For each per-class sample size n in `numpoints`, `num_experiments` data
    sets with exactly n points per class are drawn in 20 dimensions (the
    first 5 coordinates of the second mean are 0.5), using
    data.multivariate_normal when ``kind == 'normal'`` and
    data.multivariate_lognormal for any other value.

    Returns an array of shape (num_experiments * len(numpoints), 6): column
    0 is n, columns 1-3 are run_clustering.energy_hartigan results for the
    Euclidean distance, its square root, and 2 - 2 exp(-||x - y|| / 2);
    columns 4 and 5 are the run_clustering.kmeans and gmm results.
    """
    n_clusters = 2
    ambient_dim = 20
    shifted_dims = 5

    def dist(x, y):
        return np.linalg.norm(x - y)

    def sqrt_dist(x, y):
        return np.power(np.linalg.norm(x - y), 0.5)

    def exp_dist(x, y):
        return 2 - 2 * np.exp(-np.linalg.norm(x - y) / 2)

    table = np.zeros((num_experiments * len(numpoints), 6))
    row = 0

    for n in numpoints:
        for _ in range(num_experiments):
            # both classes get exactly n points
            mean_a = np.zeros(ambient_dim)
            cov_a = 0.5 * np.eye(ambient_dim)
            mean_b = 0.5 * np.concatenate(
                (np.ones(shifted_dims), np.zeros(ambient_dim - shifted_dims)))
            cov_b = np.eye(ambient_dim)

            if kind != 'normal':
                X, z = data.multivariate_lognormal(
                    [mean_a, mean_b], [cov_a, cov_b], [n, n])
            else:
                X, z = data.multivariate_normal(
                    [mean_a, mean_b], [cov_a, cov_b], [n, n])

            # one Gram matrix per metric
            G = eclust.kernel_matrix(X, dist)
            G2 = eclust.kernel_matrix(X, sqrt_dist)
            G3 = eclust.kernel_matrix(X, exp_dist)

            table[row, 0] = n
            table[row, 1] = run_clustering.energy_hartigan(
                n_clusters, X, G, z, init="k-means++", run_times=5)
            table[row, 2] = run_clustering.energy_hartigan(
                n_clusters, X, G2, z, init="k-means++", run_times=5)
            table[row, 3] = run_clustering.energy_hartigan(
                n_clusters, X, G3, z, init="k-means++", run_times=5)
            table[row, 4] = run_clustering.kmeans(
                n_clusters, X, z, init="k-means++", run_times=5)
            table[row, 5] = run_clustering.gmm(
                n_clusters, X, z, init="kmeans", run_times=5)

            row += 1

    return table
# Example #7
# 0
def normal_or_lognormal_difference(
        numpoints=range(10, 100, 10), num_experiments=100, kind='normal'):
    """Record how energy_hartigan differs from energy_lloyd and spectral.

    For each total sample size n in `numpoints`, `num_experiments` data sets
    are drawn in 20 dimensions (the first 5 coordinates of the second mean
    are 0.5) with data.multivariate_normal when ``kind == 'normal'`` and
    data.multivariate_lognormal otherwise; the n points are split by a fair
    multinomial draw.  All three methods use the Gram matrix built from
    2 - 2 exp(-||x - y|| / 2).

    Returns a numpy array with one row per experiment:
    [n, hartigan - lloyd, hartigan - spectral].  With no experiments the
    result is an empty 1-D array.
    """
    n_clusters = 2
    ambient_dim = 20
    shifted_dims = 5

    def exp_dist(x, y):
        return 2 - 2 * np.exp(-np.linalg.norm(x - y) / 2)

    rows = []
    for n in numpoints:
        for _ in range(num_experiments):
            # sample the two classes
            means = [
                np.zeros(ambient_dim),
                0.5 * np.concatenate((np.ones(shifted_dims),
                                      np.zeros(ambient_dim - shifted_dims))),
            ]
            covs = [0.5 * np.eye(ambient_dim), np.eye(ambient_dim)]
            size_a, size_b = np.random.multinomial(n, [0.5, 0.5])
            sampler = (data.multivariate_normal if kind == 'normal'
                       else data.multivariate_lognormal)
            X, z = sampler(means, covs, [size_a, size_b])

            G = eclust.kernel_matrix(X, exp_dist)

            hartigan_res = run_clustering.energy_hartigan(
                n_clusters, X, G, z, init="k-means++", run_times=5)
            lloyd_res = run_clustering.energy_lloyd(
                n_clusters, X, G, z, init="k-means++", run_times=5)
            spectral_res = run_clustering.spectral(
                n_clusters, X, G, z, run_times=5)

            rows.append([n,
                         hartigan_res - lloyd_res,
                         hartigan_res - spectral_res])

    return np.array(rows)
def gauss_dimensions_mean(dimensions=range(2,100,20), total_points=200,
                          num_experiments=100, d=None):
    """Compare clustering methods on two mean-shifted Gaussians as D grows.

    For every ambient dimension D in `dimensions`, `num_experiments` data
    sets of `total_points` samples are drawn from two identity-covariance
    Gaussians; the second has its first `d` mean coordinates equal to 0.7.
    Class sizes come from a fair multinomial draw.  A falsy `d` (None or 0)
    is replaced by ``dimensions[0]``.

    Returns an array of shape (num_experiments * len(dimensions), 6) whose
    columns are D followed by the values returned by
    run_clustering.energy_hartigan, energy_lloyd, spectral, kmeans and gmm.
    """
    n_clusters = 2
    mean_shift = 0.7
    if not d:
        d = dimensions[0]

    def euclidean(x, y):
        return np.linalg.norm(x - y)

    results = np.zeros((num_experiments * len(dimensions), 6))
    row = 0

    for D in dimensions:
        for _ in range(num_experiments):
            # sample both classes and build the Gram matrix
            mean_a, cov_a = np.zeros(D), np.eye(D)
            mean_b = np.concatenate((mean_shift * np.ones(d),
                                     np.zeros(D - d)))
            cov_b = np.eye(D)
            size_a, size_b = np.random.multinomial(total_points, [0.5, 0.5])
            X, z = data.multivariate_normal(
                [mean_a, mean_b], [cov_a, cov_b], [size_a, size_b])
            G = eclust.kernel_matrix(X, euclidean)

            # run every algorithm on the same sample, in column order
            results[row, 0] = D
            results[row, 1] = run_clustering.energy_hartigan(
                n_clusters, X, G, z, init="k-means++", run_times=5)
            results[row, 2] = run_clustering.energy_lloyd(
                n_clusters, X, G, z, init="k-means++", run_times=5)
            results[row, 3] = run_clustering.spectral(
                n_clusters, X, G, z, run_times=5)
            results[row, 4] = run_clustering.kmeans(
                n_clusters, X, z, init="k-means++", run_times=5)
            results[row, 5] = run_clustering.gmm(
                n_clusters, X, z, init="kmeans", run_times=5)

            row += 1

    return results
def gauss_dimensions_cov(dimensions=range(2,100,20), total_points=200,
                         num_experiments=100, d=10):
    """High dimensions but with nontrivial covariance.

    For every ambient dimension D in `dimensions`, `num_experiments` data
    sets of `total_points` samples (split by a fair multinomial draw) are
    drawn from two Gaussians.  The first is standard normal; the second has
    its first `d` mean coordinates set to 1 and a diagonal covariance whose
    leading entries are ten fixed values followed by ones.  A falsy `d` is
    replaced by ``dimensions[0]``.

    Returns an array of shape (num_experiments * len(dimensions), 6) whose
    columns are D followed by the values returned by
    run_clustering.energy_hartigan, energy_lloyd, spectral, kmeans and gmm.

    Raises ValueError if any entry of `dimensions` is smaller than `d`.
    The default arguments (dimensions starting at 2, d=10) fall in this
    case: they used to fail inside numpy with "negative dimensions are not
    allowed" and are now rejected up front with a clear message.
    """
    k = 2
    if not d:
        d = dimensions[0]

    # Fail fast: np.zeros(D-d) below cannot be built when D < d.
    too_small = [dim for dim in dimensions if dim < d]
    if too_small:
        raise ValueError(
            "every entry of dimensions must be >= d=%s; got %s" %
            (d, too_small))

    table = np.zeros((num_experiments*len(dimensions), 6))
    count = 0

    for D in dimensions:
        for l in range(num_experiments):

            # generate data
            m1 = np.zeros(D)
            m2 = np.concatenate((np.ones(d), np.zeros(D-d)))
            s1 = np.eye(D)
            # from uniform 1, 5
            s2_1 = np.array([1.367, 3.175, 3.247, 4.403, 1.249,
                             1.969, 4.035, 4.237, 2.813, 3.637])
            # NOTE(review): s2_1 always has 10 entries, so s2 is D x D only
            # when d == 10; other values of d give a covariance whose size
            # differs from the mean's -- confirm the intended behaviour.
            s2 = np.diag(np.concatenate((s2_1, np.ones(D-d))))
            n1, n2 = np.random.multinomial(total_points, [0.5,0.5])
            X, z = data.multivariate_normal([m1, m2], [s1, s2], [n1, n2])

            rho = lambda x, y: np.linalg.norm(x-y)
            G = eclust.kernel_matrix(X, rho)

            # can change the number of times we execute each experiment
            # and initialization method as well
            table[count, 0] = D
            table[count, 1] = run_clustering.energy_hartigan(k, X, G, z,
                                                init="k-means++", run_times=5)
            table[count, 2] = run_clustering.energy_lloyd(k, X, G, z,
                                                init="k-means++", run_times=5)
            table[count, 3] = run_clustering.spectral(k, X, G, z,
                                                      run_times=5)
            table[count, 4] = run_clustering.kmeans(k, X, z,
                                                init="k-means++", run_times=5)
            table[count, 5] = run_clustering.gmm(k, X, z,
                                                init="kmeans", run_times=5)
            count += 1

    return table
def gauss_dimensions_mean(dimensions=range(2,100,20), num_points=(100, 100),
                          num_experiments=100, d=None):
    """Compare clustering methods on two mean-shifted Gaussians as D grows.

    For every ambient dimension D in `dimensions`, `num_experiments` data
    sets are drawn from two identity-covariance Gaussians; the second has
    its first `d` mean coordinates equal to 0.7.  Class sizes are fixed.

    Parameters
    ----------
    dimensions : sequence of int
        Ambient dimensions to test (must support len() and indexing).
    num_points : pair of int
        Number of samples in the first and second class.  The default is an
        immutable tuple rather than a shared list; any two-element sequence
        is still accepted.
    num_experiments : int
        Number of repetitions per dimension.
    d : int, optional
        Number of leading coordinates that carry the mean shift.  Any falsy
        value (None or 0) falls back to ``dimensions[0]``.

    Returns
    -------
    numpy.ndarray
        Array of shape (num_experiments * len(dimensions), 6).  Column 0 is
        D; columns 1-5 hold the values returned by
        run_clustering.energy_hartigan, energy_lloyd, energy_spectral,
        kmeans and gmm, in that order.
    """
    # data distribution
    k = 2
    delta = 0.7
    if not d:
        d = dimensions[0]
    n1, n2 = num_points
    table = np.zeros((num_experiments*len(dimensions), 6))
    count = 0

    for D in dimensions:
        for i in range(num_experiments):

            # generate data
            m1 = np.zeros(D)
            s1 = np.eye(D)
            m2 = np.concatenate((delta*np.ones(d), np.zeros(D-d)))
            s2 = np.eye(D)

            X, z = data.multivariate_normal([m1, m2], [s1, s2], [n1, n2])
            rho = lambda x, y: np.linalg.norm(x-y)
            G = eclust.kernel_matrix(X, rho)

            # can change the number of times we execute each experiment
            # and initialization method as well
            table[count, 0] = D
            table[count, 1] = run_clustering.energy_hartigan(k, X, G, z,
                                                init="k-means++", run_times=5)
            table[count, 2] = run_clustering.energy_lloyd(k, X, G, z,
                                                init="k-means++", run_times=5)
            table[count, 3] = run_clustering.energy_spectral(k, X, G, z,
                                                init="k-means++", run_times=5)
            table[count, 4] = run_clustering.kmeans(k, X, z,
                                                init="k-means++", run_times=5)
            table[count, 5] = run_clustering.gmm(k, X, z,
                                                init="kmeans", run_times=5)

            count += 1

    return table
def gauss_dimensions_pi(num_points=range(0, 180, 10), num_experiments=10):
    """Test unbalanced clusters.

    Two Gaussians in 4 dimensions are sampled with deterministic class
    sizes 250 - p and 250 + p for each imbalance p in `num_points`; every
    setting is repeated `num_experiments` times.

    Returns an array of shape (num_experiments * len(num_points), 6) whose
    columns are p followed by the values returned by
    run_clustering.energy_hartigan, energy_lloyd, energy_spectral, kmeans
    and gmm.
    """
    n_clusters = 2
    ambient_dim = 4
    shifted_dims = 2
    base_size = 250
    kmeanspp_opts = dict(init="k-means++", run_times=5)

    def euclidean(x, y):
        return np.linalg.norm(x - y)

    table = np.zeros((num_experiments * len(num_points), 6))

    for p_idx, p in enumerate(num_points):
        for rep in range(num_experiments):
            row = p_idx * num_experiments + rep

            # second class: shifted mean and halved variance on the first
            # `shifted_dims` coordinates
            means = [
                np.zeros(ambient_dim),
                np.concatenate((1.5 * np.ones(shifted_dims),
                                np.zeros(ambient_dim - shifted_dims))),
            ]
            covs = [
                np.eye(ambient_dim),
                np.diag(np.concatenate((.5 * np.ones(shifted_dims),
                                        np.ones(ambient_dim - shifted_dims)))),
            ]
            sizes = [base_size - p, base_size + p]

            X, z = data.multivariate_normal(means, covs, sizes)
            G = eclust.kernel_matrix(X, euclidean)

            table[row, 0] = p
            table[row, 1] = run_clustering.energy_hartigan(
                n_clusters, X, G, z, **kmeanspp_opts)
            table[row, 2] = run_clustering.energy_lloyd(
                n_clusters, X, G, z, **kmeanspp_opts)
            table[row, 3] = run_clustering.energy_spectral(
                n_clusters, X, G, z, **kmeanspp_opts)
            table[row, 4] = run_clustering.kmeans(
                n_clusters, X, z, **kmeanspp_opts)
            table[row, 5] = run_clustering.gmm(
                n_clusters, X, z, init="kmeans", run_times=5)

    return table
def normal_or_lognormal_difference(numpoints=range(10,100,10), 
                                   num_experiments=100,
                                   kind='normal'):
    """Measure the gap between energy_hartigan and two alternatives.

    For each total sample size n in `numpoints`, `num_experiments` data sets
    are drawn in 20 dimensions (first 5 coordinates of the second mean are
    0.5), with data.multivariate_normal when ``kind == 'normal'`` and
    data.multivariate_lognormal for any other value.  Class sizes come from
    a fair multinomial split of n.  The Gram matrix uses the metric
    2 - 2 exp(-||x - y|| / 2).

    Returns a numpy array with one row [n, hartigan - lloyd,
    hartigan - spectral] per experiment (an empty 1-D array when no
    experiment is run).
    """
    n_clusters = 2
    ambient_dim = 20
    shifted_dims = 5
    kmeanspp_opts = dict(init="k-means++", run_times=5)

    def exp_dist(x, y):
        return 2 - 2 * np.exp(-np.linalg.norm(x - y) / 2)

    collected = []
    for n in numpoints:
        for _ in range(num_experiments):
            # sample the two classes
            mean_a = np.zeros(ambient_dim)
            cov_a = 0.5 * np.eye(ambient_dim)
            mean_b = 0.5 * np.concatenate(
                (np.ones(shifted_dims), np.zeros(ambient_dim - shifted_dims)))
            cov_b = np.eye(ambient_dim)
            size_a, size_b = np.random.multinomial(n, [0.5, 0.5])

            if kind != 'normal':
                X, z = data.multivariate_lognormal(
                    [mean_a, mean_b], [cov_a, cov_b], [size_a, size_b])
            else:
                X, z = data.multivariate_normal(
                    [mean_a, mean_b], [cov_a, cov_b], [size_a, size_b])

            G = eclust.kernel_matrix(X, exp_dist)

            score_hartigan = run_clustering.energy_hartigan(
                n_clusters, X, G, z, **kmeanspp_opts)
            score_lloyd = run_clustering.energy_lloyd(
                n_clusters, X, G, z, **kmeanspp_opts)
            score_spectral = run_clustering.spectral(
                n_clusters, X, G, z, run_times=5)

            collected.append([n,
                              score_hartigan - score_lloyd,
                              score_hartigan - score_spectral])

    return np.array(collected)
def gauss_dimensions_mean(dimensions=range(2, 100, 20),
                          num_points=(100, 100),
                          num_experiments=100,
                          d=None):
    """Compare clustering methods on two mean-shifted Gaussians as D grows.

    For every ambient dimension D in `dimensions`, `num_experiments` data
    sets are drawn from two identity-covariance Gaussians; the second has
    its first `d` mean coordinates equal to 0.7.  Class sizes are fixed.

    Parameters
    ----------
    dimensions : sequence of int
        Ambient dimensions to test (must support len() and indexing).
    num_points : pair of int
        Number of samples in the first and second class.  The default is an
        immutable tuple rather than a shared list; any two-element sequence
        is still accepted.
    num_experiments : int
        Number of repetitions per dimension.
    d : int, optional
        Number of leading coordinates that carry the mean shift.  Any falsy
        value (None or 0) falls back to ``dimensions[0]``.

    Returns
    -------
    numpy.ndarray
        Array of shape (num_experiments * len(dimensions), 6).  Column 0 is
        D; columns 1-5 hold the values returned by
        run_clustering.energy_hartigan, energy_lloyd, energy_spectral,
        kmeans and gmm, in that order.
    """
    # data distribution
    k = 2
    delta = 0.7
    if not d:
        d = dimensions[0]
    n1, n2 = num_points
    table = np.zeros((num_experiments * len(dimensions), 6))
    count = 0

    for D in dimensions:
        for i in range(num_experiments):

            # generate data
            m1 = np.zeros(D)
            s1 = np.eye(D)
            m2 = np.concatenate((delta * np.ones(d), np.zeros(D - d)))
            s2 = np.eye(D)

            X, z = data.multivariate_normal([m1, m2], [s1, s2], [n1, n2])
            rho = lambda x, y: np.linalg.norm(x - y)
            G = eclust.kernel_matrix(X, rho)

            # can change the number of times we execute each experiment
            # and initialization method as well
            table[count, 0] = D
            table[count, 1] = run_clustering.energy_hartigan(k,
                                                             X,
                                                             G,
                                                             z,
                                                             init="k-means++",
                                                             run_times=5)
            table[count, 2] = run_clustering.energy_lloyd(k,
                                                          X,
                                                          G,
                                                          z,
                                                          init="k-means++",
                                                          run_times=5)
            table[count, 3] = run_clustering.energy_spectral(k,
                                                             X,
                                                             G,
                                                             z,
                                                             init="k-means++",
                                                             run_times=5)
            table[count, 4] = run_clustering.kmeans(k,
                                                    X,
                                                    z,
                                                    init="k-means++",
                                                    run_times=5)
            table[count, 5] = run_clustering.gmm(k,
                                                 X,
                                                 z,
                                                 init="kmeans",
                                                 run_times=5)

            count += 1

    return table
#m2 = np.concatenate((delta*np.ones(d), np.zeros(D-d)))
#s2 = np.eye(D)
#n1, n2 = np.random.multinomial(N, [0.5,0.5])
#X, z = data.multivariate_normal([m1, m2], [s1, s2], [n1, n2])

# Script: sample two Gaussian classes in D dimensions and collect the first
# few coordinates plus the class label into a DataFrame.
d = 10  # number of leading coordinates that differ between the classes
D = 200  # ambient dimension
N = 200  # total number of samples
m1 = np.zeros(D)
# second class: mean 1 on the first d coordinates, 0 elsewhere
m2 = np.concatenate((np.ones(d), np.zeros(D - d)))
s1 = np.eye(D)
# fixed variances for the first 10 coordinates of the second class
s2_1 = np.array(
    [1.367, 3.175, 3.247, 4.403, 1.249, 1.969, 4.035, 4.237, 2.813, 3.637])
# diagonal covariance: s2_1 on the first d coordinates, ones on the rest
s2 = np.diag(np.concatenate((s2_1, np.ones(D - d))))
# split the N samples between the two classes with a fair multinomial draw
n1, n2 = np.random.multinomial(N, [0.5, 0.5])
# presumably X holds the points row-wise and z the 0/1 labels -- confirm
# against data.multivariate_normal
X, z = data.multivariate_normal([m1, m2], [s1, s2], [n1, n2])

# Y = first `numcols` coordinates of X plus one trailing class column
numcols = 5
Y = np.zeros(shape=(N, numcols + 1))
Y[:, :numcols] = X[:, :numcols]
idx0 = np.where(z == 0)
idx1 = np.where(z == 1)
# rows labelled 0 keep class 0 (already zero from np.zeros; kept explicit)
Y[idx0, numcols] = 0
Y[idx1, numcols] = 1
# columns are named "$x_1$" ... "$x_5$" followed by "class"
df = pd.DataFrame(Y,
                  columns=[r"$x_%i$" % i
                           for i in range(1, numcols + 1)] + ["class"])

g = sns.PairGrid(
    df,
    hue="class",  #palette="hls",
#s1 = np.eye(D)
#m2 = np.concatenate((delta*np.ones(d), np.zeros(D-d)))
#s2 = np.eye(D)
#n1, n2 = np.random.multinomial(N, [0.5,0.5])
#X, z = data.multivariate_normal([m1, m2], [s1, s2], [n1, n2])

# Script: sample two Gaussian classes in D dimensions, collect the first few
# coordinates plus the class label into a DataFrame, and set up a seaborn
# PairGrid coloured by class.
d = 10  # number of leading coordinates that differ between the classes
D = 200  # ambient dimension
N = 200  # total number of samples
m1 = np.zeros(D)
# second class: mean 1 on the first d coordinates, 0 elsewhere
m2 = np.concatenate((np.ones(d), np.zeros(D-d)))
s1 = np.eye(D)
# fixed variances for the first 10 coordinates of the second class
s2_1 = np.array([1.367,  3.175,  3.247,  4.403,  1.249,                                             1.969, 4.035,   4.237,  2.813,  3.637])
# diagonal covariance: s2_1 on the first d coordinates, ones on the rest
s2 = np.diag(np.concatenate((s2_1, np.ones(D-d))))
# split the N samples between the two classes with a fair multinomial draw
n1, n2 = np.random.multinomial(N, [0.5,0.5])
# presumably X holds the points row-wise and z the 0/1 labels -- confirm
# against data.multivariate_normal
X, z = data.multivariate_normal([m1, m2], [s1, s2], [n1, n2])

# Y = first `numcols` coordinates of X plus one trailing class column
numcols=5
Y = np.zeros(shape=(N,numcols+1))
Y[:,:numcols] = X[:,:numcols]
idx0 = np.where(z==0)
idx1 = np.where(z==1)
# rows labelled 0 keep class 0 (already zero from np.zeros; kept explicit)
Y[idx0,numcols] = 0
Y[idx1,numcols] = 1
# columns are named "$x_1$" ... "$x_5$" followed by "class"
df = pd.DataFrame(Y, 
        columns=[r"$x_%i$"%i for i in range(1, numcols+1)]+["class"])

# pairwise grid over the x columns only; the class column drives the hue
g = sns.PairGrid(df, hue="class", #palette="hls",
                    vars=[r"$x_%i$"%i for i in range(1, numcols+1)])

def scatter_fake_diag(x, y, *a, **kw):
Beispiel #16
0
    import matplotlib.pyplot as plt
    #sns.set_style("ticks", {"xtick.direction":"in", "ytick.direction": "in"})
    from customize_plots import *
    import sys

    # syntetic data for testing
    m1 = np.zeros(10)
    m2 = 2 * np.ones(10)
    m3 = 4 * np.ones(10)
    m4 = 5 * np.ones(10)
    s1 = np.eye(10)
    s2 = np.eye(10)
    s3 = np.eye(10)
    s4 = np.eye(10)
    n1 = n2 = n3 = n4 = 30
    X, z = data.multivariate_normal([m1, m2, m3, m4], [s1, s2, s3, s4],
                                    [n1, n2, n3, n4])

    # max number of clusters
    K = 10

    # gap statistic with k-means
    k_hat, df_kmeans = gap_statistics(X,
                                      B=50,
                                      K=K,
                                      cluster_func=kmeans,
                                      type_ref="svd")
    print k_hat

    rho = lambda x, y: np.power(np.linalg.norm(x - y), 1)
    G = eclust.kernel_matrix(X, rho)
    gaps = eigenvalues(G, K)
    import matplotlib.pyplot as plt
    #sns.set_style("ticks", {"xtick.direction":"in", "ytick.direction": "in"})
    from customize_plots import *
    import sys

    # syntetic data for testing
    m1 = np.zeros(10)
    m2 = 2*np.ones(10)
    m3 = 4*np.ones(10)
    m4 = 5*np.ones(10)
    s1 = np.eye(10)
    s2 = np.eye(10)
    s3 = np.eye(10)
    s4 = np.eye(10)
    n1 = n2 = n3 = n4 = 30
    X, z = data.multivariate_normal([m1,m2,m3,m4], [s1,s2,s3,s4], [n1,n2,n3,n4])

    # max number of clusters
    K = 10
    
    # gap statistic with k-means
    k_hat, df_kmeans = gap_statistics(X, B=50, K=K, cluster_func=kmeans, 
                                         type_ref="svd")
    print k_hat

    rho = lambda x, y: np.power(np.linalg.norm(x-y), 1)
    G = eclust.kernel_matrix(X, rho)
    gaps = eigenvalues(G, K)
    
    rho2 = lambda x, y: 2-2*np.exp(-np.power(np.linalg.norm(x-y),2)/2/((2)**2))
    G2 = eclust.kernel_matrix(X, rho2)