import numpy as np

# Assumed project-local modules: synthetic data generators (data), kernel /
# Gram-matrix utilities (eclust), and wrappers around each clustering
# algorithm (run_clustering).
import data
import eclust
import run_clustering


def gauss_dimensions_mean(dimensions=range(2, 100, 20),
                          total_points=200,
                          num_experiments=100,
                          d=None):
    """Two Gaussians whose means differ by delta on the first d coordinates;
    the remaining D - d dimensions carry no signal."""
    # data distribution
    k = 2
    delta = 0.7
    if not d:
        d = dimensions[0]
    table = np.zeros((num_experiments * len(dimensions), 6))
    count = 0

    for D in dimensions:
        for i in range(num_experiments):

            ### generate data
            m1 = np.zeros(D)
            s1 = np.eye(D)
            m2 = np.concatenate((delta * np.ones(d), np.zeros(D - d)))
            s2 = np.eye(D)
            # flip Bernoulli coins to get number of points in each cluster
            n1, n2 = np.random.multinomial(total_points, [0.5, 0.5])
            # get data, construct the Gram matrix
            X, z = data.multivariate_normal([m1, m2], [s1, s2], [n1, n2])
            rho = lambda x, y: np.linalg.norm(x - y)
            G = eclust.kernel_matrix(X, rho)
            ##################

            ### cluster with different algorithms
            # can change number of times we execute each experiment
            # and initialization as well
            table[count, 0] = D
            table[count, 1] = run_clustering.energy_hartigan(k,
                                                             X,
                                                             G,
                                                             z,
                                                             init="k-means++",
                                                             run_times=5)
            table[count, 2] = run_clustering.energy_lloyd(k,
                                                          X,
                                                          G,
                                                          z,
                                                          init="k-means++",
                                                          run_times=5)
            table[count, 3] = run_clustering.spectral(k, X, G, z, run_times=5)
            table[count, 4] = run_clustering.kmeans(k,
                                                    X,
                                                    z,
                                                    init="k-means++",
                                                    run_times=5)
            table[count, 5] = run_clustering.gmm(k,
                                                 X,
                                                 z,
                                                 init="kmeans",
                                                 run_times=5)
            ####################

            count += 1

    return table
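A minimal usage sketch, assuming the modules imported above are on the path: each row of the returned table holds the dimension D followed by one score per algorithm (whatever value the corresponding run_clustering wrapper returns), so per-dimension averages fall out of a boolean mask.

# Sketch: run a small version of the experiment and average per dimension.
table = gauss_dimensions_mean(dimensions=range(2, 42, 20),
                              total_points=100,
                              num_experiments=5)
for D in np.unique(table[:, 0]):
    rows = table[table[:, 0] == D]
    # mean score per algorithm at this dimension
    print(int(D), rows[:, 1:].mean(axis=0))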
def gauss_dimensions_cov(
        dimensions=range(10, 100, 20), total_points=200, num_experiments=100,
        d=10):
    """High dimensions but with nontrivial covariance.

    The second cluster gets the fixed diagonal covariance s2_1 on its first
    d coordinates, so every D in dimensions must satisfy D >= d (with the
    original default range(2, 100, 20), D = 2 < d = 10 crashes on
    np.ones(D - d)).
    """
    k = 2
    if not d:
        d = dimensions[0]
    assert min(dimensions) >= d, "s2_1 has d entries, so D >= d is required"
    table = np.zeros((num_experiments * len(dimensions), 6))
    count = 0

    for D in dimensions:
        for i in range(num_experiments):

            # generate data
            m1 = np.zeros(D)
            m2 = np.concatenate((np.ones(d), np.zeros(D - d)))
            s1 = np.eye(D)
            # from uniform 1, 5
            s2_1 = np.array([
                1.367, 3.175, 3.247, 4.403, 1.249, 1.969, 4.035, 4.237, 2.813,
                3.637
            ])
            s2 = np.diag(np.concatenate((s2_1, np.ones(D - d))))
            n1, n2 = np.random.multinomial(total_points, [0.5, 0.5])
            X, z = data.multivariate_normal([m1, m2], [s1, s2], [n1, n2])

            rho = lambda x, y: np.linalg.norm(x - y)
            G = eclust.kernel_matrix(X, rho)

            # can change the number of times we execute each experiment
            # and initialization method as well
            table[count, 0] = D
            table[count, 1] = run_clustering.energy_hartigan(k,
                                                             X,
                                                             G,
                                                             z,
                                                             init="k-means++",
                                                             run_times=5)
            table[count, 2] = run_clustering.energy_lloyd(k,
                                                          X,
                                                          G,
                                                          z,
                                                          init="k-means++",
                                                          run_times=5)
            table[count, 3] = run_clustering.spectral(k, X, G, z, run_times=5)
            table[count, 4] = run_clustering.kmeans(k,
                                                    X,
                                                    z,
                                                    init="k-means++",
                                                    run_times=5)
            table[count, 5] = run_clustering.gmm(k,
                                                 X,
                                                 z,
                                                 init="kmeans",
                                                 run_times=5)
            count += 1

    return table
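To persist a run, the table can go straight to disk; a small sketch using only numpy, where the header names simply mirror the column layout above and dimensions start at 10 because d = 10 here:

# Sketch: save the results table as CSV for later analysis.
table = gauss_dimensions_cov(dimensions=range(10, 50, 20), num_experiments=3)
np.savetxt("gauss_cov_results.csv", table, delimiter=",",
           header="D,energy_hartigan,energy_lloyd,spectral,kmeans,gmm")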
def gauss_dimensions_pi(num_points=range(0, 180, 10), num_experiments=10):
    """Test unbalanced clusters."""
    k = 2
    D = 4
    d = 2
    N = 300
    table = np.zeros((num_experiments * len(num_points), 6))
    count = 0

    for p in num_points:
        for i in range(num_experiments):

            # generate data
            m1 = np.zeros(D)
            s1 = np.eye(D)
            m2 = np.concatenate((1.5 * np.ones(d), np.zeros(D - d)))
            s2 = np.diag(np.concatenate((.5 * np.ones(d), np.ones(D - d))))
            pi1 = (N - p) / (2.0 * N)
            pi2 = (N + p) / (2.0 * N)
            n1, n2 = np.random.multinomial(N, [pi1, pi2])
            X, z = data.multivariate_normal([m1, m2], [s1, s2], [n1, n2])
            rho = lambda x, y: np.linalg.norm(x - y)
            G = eclust.kernel_matrix(X, rho)

            table[count, 0] = p
            table[count, 1] = run_clustering.energy_hartigan(k,
                                                             X,
                                                             G,
                                                             z,
                                                             init="k-means++",
                                                             run_times=5)
            table[count, 2] = run_clustering.energy_lloyd(k,
                                                          X,
                                                          G,
                                                          z,
                                                          init="k-means++",
                                                          run_times=5)
            table[count, 3] = run_clustering.spectral(k, X, G, z, run_times=5)
            table[count, 4] = run_clustering.kmeans(k,
                                                    X,
                                                    z,
                                                    init="k-means++",
                                                    run_times=5)
            table[count, 5] = run_clustering.gmm(k,
                                                 X,
                                                 z,
                                                 init="kmeans",
                                                 run_times=5)

            count += 1

    return table
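A plotting sketch, assuming matplotlib is available: average each algorithm's column over the repetitions and plot it against the imbalance parameter p.

# Sketch: visualize mean score as a function of the imbalance parameter p.
import matplotlib.pyplot as plt

table = gauss_dimensions_pi(num_points=range(0, 180, 30), num_experiments=3)
ps = np.unique(table[:, 0])
for col, name in [(1, "energy_hartigan"), (4, "k-means"), (5, "GMM")]:
    means = [table[table[:, 0] == p, col].mean() for p in ps]
    plt.plot(ps, means, label=name)
plt.xlabel("imbalance p")
plt.ylabel("mean score")
plt.legend()
plt.show()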
Example #5
def normal_or_lognormal_difference(
        numpoints=range(10, 100, 10), num_experiments=100, kind='normal'):
    """Score gaps of energy_hartigan over energy_lloyd and over spectral
    clustering, on normal or lognormal data, as the sample size n grows."""
    k = 2
    table = []
    for n in numpoints:
        for i in range(num_experiments):
            this_res = [n]

            # generate data
            D = 20
            d = 5
            m1 = np.zeros(D)
            s1 = 0.5 * np.eye(D)
            m2 = 0.5 * np.concatenate((np.ones(d), np.zeros(D - d)))
            s2 = np.eye(D)
            n1, n2 = np.random.multinomial(n, [0.5, 0.5])

            if kind == 'normal':
                X, z = data.multivariate_normal([m1, m2], [s1, s2], [n1, n2])
            else:
                X, z = data.multivariate_lognormal([m1, m2], [s1, s2],
                                                   [n1, n2])

            rho = lambda x, y: 2 - 2 * np.exp(-np.linalg.norm(x - y) / 2)
            G = eclust.kernel_matrix(X, rho)

            hart = run_clustering.energy_hartigan(k,
                                                  X,
                                                  G,
                                                  z,
                                                  init="k-means++",
                                                  run_times=5)
            lloyd = run_clustering.energy_lloyd(k,
                                                X,
                                                G,
                                                z,
                                                init="k-means++",
                                                run_times=5)
            spectral = run_clustering.spectral(k, X, G, z, run_times=5)
            this_res.append(hart - lloyd)
            this_res.append(hart - spectral)

            table.append(this_res)
    table = np.array(table)
    return table
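A quick sketch of how the returned differences might be summarized, under the same module assumptions: column 1 is hart - lloyd and column 2 is hart - spectral, so a positive mean favors energy_hartigan.

# Sketch: average score gaps for each data distribution.
for kind in ("normal", "lognormal"):
    tab = normal_or_lognormal_difference(numpoints=range(20, 80, 20),
                                         num_experiments=3, kind=kind)
    print(kind, tab[:, 1].mean(), tab[:, 2].mean())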
def gauss_dimensions_mean(dimensions=range(2,100,20), num_points=(100, 100),
                          num_experiments=100, d=None):
    """Variant with fixed cluster sizes given by num_points (no multinomial
    draw) and an energy_spectral column in place of plain spectral."""
    # data distribution
    k = 2
    delta = 0.7
    if not d:
        d = dimensions[0]
    n1, n2 = num_points
    table = np.zeros((num_experiments*len(dimensions), 6))
    count = 0

    for D in dimensions:
        for i in range(num_experiments):

            # generate data
            m1 = np.zeros(D)
            s1 = np.eye(D)
            m2 = np.concatenate((delta*np.ones(d), np.zeros(D-d)))
            s2 = np.eye(D)

            X, z = data.multivariate_normal([m1, m2], [s1, s2], [n1, n2])
            rho = lambda x, y: np.linalg.norm(x-y)
            G = eclust.kernel_matrix(X, rho)

            # can change the number of times we execute each experiment
            # and initialization method as well
            table[count, 0] = D
            table[count, 1] = run_clustering.energy_hartigan(k, X, G, z, 
                                                init="k-means++", run_times=5)
            table[count, 2] = run_clustering.energy_lloyd(k, X, G, z, 
                                                init="k-means++", run_times=5)
            table[count, 3] = run_clustering.energy_spectral(k, X, G, z, 
                                                init="k-means++", run_times=5)
            table[count, 4] = run_clustering.kmeans(k, X, z, 
                                                init="k-means++", run_times=5)
            table[count, 5] = run_clustering.gmm(k, X, z, 
                                                init="kmeans", run_times=5)
            
            count += 1

    return table
def gauss_dimensions_pi(num_points=range(0, 180, 10), num_experiments=10):
    """Test unbalanced clusters."""
    k = 2
    D = 4
    d = 2
    N = 250
    table = np.zeros((num_experiments*len(num_points), 6))
    count = 0

    for p in num_points:
        for i in range(num_experiments):

            # generate data
            m1 = np.zeros(D)
            s1 = np.eye(D)
            m2 = np.concatenate((1.5*np.ones(d), np.zeros(D-d)))
            s2 = np.diag(np.concatenate((.5*np.ones(d), np.ones(D-d))))
            n1 = N-p
            n2 = N+p

            X, z = data.multivariate_normal([m1, m2], [s1, s2], [n1, n2])
            rho = lambda x, y: np.linalg.norm(x-y)
            G = eclust.kernel_matrix(X, rho)

            table[count, 0] = p
            table[count, 1] = run_clustering.energy_hartigan(k, X, G, z,
                                                init="k-means++", run_times=5)
            table[count, 2] = run_clustering.energy_lloyd(k, X, G, z,
                                                init="k-means++", run_times=5)
            table[count, 3] = run_clustering.energy_spectral(k, X, G, z,
                                                init="k-means++", run_times=5)
            table[count, 4] = run_clustering.kmeans(k, X, z,
                                                init="k-means++", run_times=5)
            table[count, 5] = run_clustering.gmm(k, X, z,
                                                init="kmeans", run_times=5)

            count += 1

    return table
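Since this variant fixes n1 = N - p and n2 = N + p deterministically, paired per-row gaps are meaningful; a small sketch comparing energy_hartigan (column 1) against k-means (column 4) as imbalance grows:

# Sketch: mean score gap of energy_hartigan over k-means per imbalance p.
table = gauss_dimensions_pi(num_points=range(0, 180, 60), num_experiments=3)
for p in np.unique(table[:, 0]):
    rows = table[table[:, 0] == p]
    print(int(p), (rows[:, 1] - rows[:, 4]).mean())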
Example #11
# Snippet from a larger evaluation script: `data`, `true_labels`,
# `delete_missing`, and the semimetric `rho` are assumed to be defined
# earlier (e.g., when the dataset is loaded).
data = np.delete(data, delete_missing, axis=0)
data = np.array(data, dtype=float)
true_labels = np.delete(true_labels, delete_missing, axis=0)

# normalize data
data = (data - data.mean(axis=0)) / data.std(axis=0)

G = energy.eclust.kernel_matrix(data, rho)
#G = energy.eclust.kernel_matrix(data, rho_gauss)
#G = energy.eclust.kernel_matrix(data, rho_exp)

kmeans_labels = cluster.kmeans(6, data, run_times=10, init="k-means++")
gmm_labels = cluster.gmm(6, data, run_times=10, init="kmeans")
spectral_labels = cluster.spectral(6, data, G, run_times=10)
energy_spectral_labels = cluster.energy_spectral(6, data, G, run_times=10)
lloyd_labels = cluster.energy_lloyd(6, data, G, run_times=10, init="spectral")
hart_labels = cluster.energy_hartigan(6,
                                      data,
                                      G,
                                      run_times=10,
                                      init="spectral")

t = PrettyTable([
    'Algorithm', 'Accuracy', 'A-Rand', 'Mutual Info', 'V-Measure',
    'Fowlkes-Mallows'
])

algos = [
    'kmeans', 'GMM', 'spectral', 'energy_spectral', 'energy_lloyd',
    'energy_hartigan'
]
pred_labels = [kmeans_labels, gmm_labels, spectral_labels,
               energy_spectral_labels, lloyd_labels, hart_labels]

for algo, pred_label in zip(algos, pred_labels):
    t.add_row([algo, 
        energy.metric.accuracy(true_labels, pred_label),
        sklearn.metrics.adjusted_rand_score(true_labels, pred_label),
        sklearn.metrics.adjusted_mutual_info_score(true_labels, pred_label),
        sklearn.metrics.v_measure_score(true_labels, pred_label),
        sklearn.metrics.fowlkes_mallows_score(true_labels, pred_label)])

print(t)  # display the comparison table
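The two commented-out kernel_matrix calls above assume rho_gauss and rho_exp are defined earlier in the full script; here is a hypothetical sketch of such semimetrics. The bandwidth sigma is an assumption, and with sigma = 1 rho_exp matches the kernel used in normal_or_lognormal_difference above.

import numpy as np

sigma = 1.0  # assumed bandwidth; not specified in this snippet

def rho_gauss(x, y):
    # Gaussian-kernel semimetric: 2 - 2 * exp(-||x - y||^2 / (2 sigma^2))
    return 2 - 2 * np.exp(-np.linalg.norm(x - y) ** 2 / (2 * sigma ** 2))

def rho_exp(x, y):
    # exponential (Laplacian) semimetric: 2 - 2 * exp(-||x - y|| / (2 sigma))
    return 2 - 2 * np.exp(-np.linalg.norm(x - y) / (2 * sigma))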