def two_moons_clustering():
    """
    Cluster the two-moons data with spectral clustering and plot the result.

    600 samples are drawn from two_moons and the number of clusters is the
    number of distinct ground-truth labels. A similarity graph is built with
    k = 3 and var = 1.0, its Laplacian is computed with the 'unn'
    normalization, and spectral_clustering is run on the eigenvalue indices
    1, 2 and 3 of the ordered spectrum. The recovered labels are plotted next
    to a k-means labelling of the raw points.

    Used in question 2.7
    """
    # Data and number of clusters
    X, Y = two_moons(600)
    num_classes = np.unique(Y).size

    # Graph parameters: var is the exponential_euclidean's sigma^2
    graph_params = {'var': 1.0, 'k': 3}
    eig_indices = [1, 2, 3]  # positions in the ordered eigenvalue list

    # Similarity graph -> Laplacian -> spectral labels
    W = build_similarity_graph(X, **graph_params)
    L = build_laplacian(W, 'unn')
    Y_rec = spectral_clustering(L, eig_indices, num_classes=num_classes)

    # k-means on the raw coordinates serves as the comparison labelling
    kmeans_labels = KMeans(num_classes).fit_predict(X)
    plot_clustering_result(X, Y, L, Y_rec, kmeans_labels)
Example #2
0
def two_moons_clustering():
    """
    Cluster the two-moons data with adaptive spectral clustering and plot it.

    600 samples are drawn from two_moons. The similarity graph is built with
    k = 0, eps = 0.8 and var = 1.0, the Laplacian uses the 'rw'
    normalization, and spectral_clustering_adaptive produces the labels.
    The result is plotted together with a k-means labelling of the raw
    points.

    Used in question 2.7
    """
    # Data and number of clusters
    X, Y = two_moons(600)
    num_classes = np.unique(Y).size

    # Graph parameters: var is the exponential_euclidean's sigma^2
    graph_params = {'var': 1.0, 'eps': 0.8, 'k': 0}

    # Similarity graph -> Laplacian
    W = build_similarity_graph(X, **graph_params)
    L = build_laplacian(W, 'rw')

    # Spectral labels, then the k-means comparison labelling
    Y_rec = spectral_clustering_adaptive(L, num_classes=num_classes)
    kmeans_labels = KMeans(num_classes).fit_predict(X)

    plot_clustering_result(X, Y, L, Y_rec, kmeans_labels)
Example #3
0
def two_moons_clustering(eig_max=15):
    """
    Cluster the two-moons data, choosing the eigenvalues automatically.

    600 samples are drawn from two_moons. When k == 0 (the value set below)
    the epsilon threshold is derived from the minimal spanning tree of the
    pairwise euclidean distances: with d_max the largest distance over the
    tree's edges, eps = exp(-d_max^2 / (2 * var)). The Laplacian uses the
    'unn' normalization; its sorted eigenvalues are handed to
    choose_eigenvalues, and the returned indices drive spectral_clustering.
    The labels are plotted together with a k-means labelling of the raw
    points.

    eig_max: forwarded to choose_eigenvalues as its eig_max argument.

    Used in question 2.7
    """
    # Generate data and compute number of clusters
    X, Y = two_moons(600)
    num_classes = len(np.unique(Y))

    # Choose parameters
    k = 0
    var = 1.0  # exponential_euclidean's sigma^2
    laplacian_normalization = 'unn'

    # Build the similarity graph. eps only exists in the k == 0 case, so the
    # two cases call build_similarity_graph separately instead of sharing one
    # call that would hit an unbound eps for any other k.
    if k == 0:
        # dists[i, j] = euclidean distance between x_i and x_j
        dists = sd.cdist(X, X, 'euclidean')
        # Indicator matrix of the spanning-tree edges, used as a boolean mask
        tree_edges = np.asarray(min_span_tree(dists), dtype=bool)
        distance_threshold = dists[tree_edges].max()
        eps = np.exp(-(distance_threshold)**2.0 / (2 * var))
        W = build_similarity_graph(X, var=var, eps=eps, k=k)
    else:
        W = build_similarity_graph(X, var=var, k=k)
    L = build_laplacian(W, laplacian_normalization)

    # Only the ordered eigenvalues are needed to choose the indices
    eigenvalues = np.linalg.eigvals(L)
    eigenvalues = eigenvalues[np.argsort(eigenvalues)]
    chosen_eig_indices = choose_eigenvalues(eigenvalues, eig_max=eig_max)

    Y_rec = spectral_clustering(L, chosen_eig_indices, num_classes=num_classes)

    plot_clustering_result(X, Y, L, Y_rec, KMeans(num_classes).fit_predict(X))
Example #4
0
def parameter_sensitivity():
    """
    Plot how the clustering score varies with the epsilon threshold.

    For each of 9 evenly spaced epsilon values in [0.2, 1], a fresh two-moons
    sample of 500 points is drawn, a similarity graph is built with that
    epsilon (no var is passed, so build_similarity_graph uses its own
    default), the 'rw' Laplacian is formed, and
    spectral_clustering_adaptive labels the points. The adjusted Rand score
    against the ground truth is recorded and the scores are plotted against
    the epsilon values.

    Used in question 2.9
    """
    num_samples = 500
    laplacian_normalization = 'rw'

    # Candidate epsilon thresholds
    parameter_candidate = np.linspace(0.2, 1, 9)

    def _score_for(eps_value):
        # One run: new sample -> graph -> Laplacian -> labels -> score
        X, Y = two_moons(num_samples)
        num_classes = len(np.unique(Y))
        W = build_similarity_graph(X, eps=eps_value)
        L = build_laplacian(W, laplacian_normalization)
        Y_rec = spectral_clustering_adaptive(L, num_classes)
        return skm.adjusted_rand_score(Y, Y_rec)

    parameter_performance = [
        _score_for(candidate) for candidate in parameter_candidate
    ]

    plt.figure()
    plt.plot(parameter_candidate, parameter_performance)
    plt.title('parameter sensitivity')
    plt.show()
Example #5
0
def two_moons_clustering():
    """
    Run adaptive spectral clustering on the two-moons data and plot it.

    600 samples are drawn from two_moons. With k == 0 (the value set below)
    the epsilon threshold is exp(-d_max^2 / (2 * var)), where d_max is the
    largest pairwise euclidean distance selected by the min_span_tree mask;
    the value is printed before the similarity graph is built. For any other
    k the graph is built without an explicit eps. The Laplacian uses the
    'unn' normalization and spectral_clustering_adaptive produces the
    labels, which are plotted together with a k-means labelling of the raw
    points.

    Used in question 2.7
    """
    # Data and number of clusters
    X, Y = two_moons(600)
    num_classes = np.unique(Y).size

    # Parameters
    k = 0
    var = 1.0  # exponential_euclidean's sigma^2
    laplacian_normalization = 'unn'

    # Collect the graph arguments; eps is added only when it is derived here
    graph_kwargs = {'var': var, 'k': k}
    if k == 0:
        pairwise = sd.cdist(X, X, metric="euclidean")
        tree_mask = min_span_tree(pairwise)
        longest_edge = np.max(pairwise[tree_mask])
        eps = np.exp(-longest_edge**2.0 / (2 * var))
        print(eps)
        graph_kwargs['eps'] = eps
    W = build_similarity_graph(X, **graph_kwargs)
    L = build_laplacian(W, laplacian_normalization)

    Y_rec_adaptive = spectral_clustering_adaptive(L, num_classes=num_classes)

    # k-means on the raw coordinates serves as the comparison labelling
    kmeans_labels = KMeans(num_classes).fit_predict(X)
    plot_clustering_result(X, Y, L, Y_rec_adaptive, kmeans_labels)
Example #6
0
def parameter_sensitivity(eig_max=15):
    """
    Plot how the clustering score varies with the graph parameter k.

    For each k in 0..10 a fresh sample is drawn with
    two_moons(500, 1, 0.02). For k == 0 the epsilon threshold is derived from
    the minimal spanning tree of the pairwise euclidean distances
    (eps = exp(-d_max^2 / (2 * var)), d_max being the largest distance over
    the tree's edges); for k > 0 the graph is built from k alone. The 'unn'
    Laplacian's sorted eigenvalues are passed to choose_eigenvalues, the
    returned indices drive spectral_clustering, and the adjusted Rand score
    against the ground truth is recorded. The scores are plotted against k.

    eig_max: forwarded to choose_eigenvalues as its eig_max argument.

    Used in question 2.9
    """
    # the number of samples to generate
    num_samples = 500

    # Choose parameters
    var = 1.0  # exponential_euclidean's sigma^2
    laplacian_normalization = 'unn'

    # Choose candidate parameters: the number of neighbours for the graph,
    # with 0 meaning "use the epsilon threshold instead"
    parameter_candidate = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
    parameter_performance = []

    for k in parameter_candidate:
        # Generate data
        X, Y = two_moons(num_samples, 1, 0.02)
        num_classes = len(np.unique(Y))

        if k == 0:  # compute epsilon
            # dists[i, j] = euclidean distance between x_i and x_j
            dists = sd.cdist(X, X, 'euclidean')
            # Indicator matrix of the spanning-tree edges, used as a mask
            tree_edges = np.asarray(min_span_tree(dists), dtype=bool)
            distance_threshold = dists[tree_edges].max()
            eps = np.exp(-(distance_threshold)**2.0 / (2 * var))
            W = build_similarity_graph(X, var=var, eps=eps, k=k)
        else:
            # var is passed explicitly so both branches use the same kernel
            # width whatever build_similarity_graph's default is
            W = build_similarity_graph(X, var=var, k=k)
        L = build_laplacian(W, laplacian_normalization)

        # Only the ordered eigenvalues are needed to choose the indices
        eigenvalues = np.linalg.eigvals(L)
        eigenvalues = eigenvalues[np.argsort(eigenvalues)]
        chosen_eig_indices = choose_eigenvalues(eigenvalues, eig_max=eig_max)

        Y_rec = spectral_clustering(L, chosen_eig_indices, num_classes)

        parameter_performance.append(skm.adjusted_rand_score(Y, Y_rec))

    plt.figure()
    plt.plot(parameter_candidate, parameter_performance)
    plt.title('parameter sensitivity')
    plt.show()


#parameter_sensitivity()
def how_to_choose_epsilon(gen_pam, k):
    """
    Choose epsilon from the minimal spanning tree and plot the resulting graph.

    Consider the distance matrix with entries dist(x_i, x_j) (the euclidean
    distance between x_i and x_j) representing a fully connected graph.
    One way to choose the parameter epsilon to build a graph is to choose the
    maximum value of dist(x_i, x_j) where (i, j) is an edge that is present in
    the minimal spanning tree of the fully connected graph. Then, the
    threshold epsilon can be chosen as exp(-dist(x_i, x_j)**2.0/(2*sigma^2)).

    The data is a two_moons sample of 100 points and sigma^2 (var) is 1.0.

    gen_pam: currently unused. It is the option of worst_case_blob (see
        generate_data.py); that generator is disabled here in favour of
        two_moons.
    k: forwarded unchanged to build_similarity_graph.

    Returns the tuple (eps, X, Y, W): the chosen threshold, the samples, their
    labels, and the matrix returned by build_similarity_graph.
    """
    # the number of samples to generate
    num_samples = 100

    # get the data (alternative: worst_case_blob(num_samples, gen_pam))
    X, Y = two_moons(num_samples)

    # var: the exponential_euclidean's sigma2 parameter
    var = 1.0
    # dists[i, j] = euclidean distance between x_i and x_j
    dists = sd.cdist(X, X, 'euclidean')
    # (n x n) indicator matrix for the edges in the minimal spanning tree,
    # coerced to bool so it can be used as a mask on dists
    min_tree = np.asarray(min_span_tree(dists), dtype=bool)

    # set threshold epsilon from the max weight in min_tree
    distance_threshold = dists[min_tree].max()
    eps = np.exp(-distance_threshold**2.0 / (2 * var))

    # W: (n x n) matrix representing the adjacency matrix of the graph
    W = build_similarity_graph(X, var=var, eps=eps, k=k)
    plot_graph_matrix(X, Y, W)
    return eps, X, Y, W


#if __name__ == '__main__':
#    for gp in [0,1,10,100]:
#        print(gp)
#        how_to_choose_epsilon(gp,0)
#    for k in [0,1,2,5,10]:
#        how_to_choose_epsilon(0,k)
Example #8
0
    distance_threshold = np.max(dists[min_tree])
    eps = np.exp(-distance_threshold**2 / (2 * var))
    """
    use the build_similarity_graph function to build the graph W  
     W: (n x n) dimensional matrix representing                    
        the adjacency matrix of the graph
       use plot_graph_matrix to plot the graph                    
    """
    W = build_similarity_graph(X, var=var, eps=eps, k=0)
    plot_graph_matrix(X, Y, W)


# Script entry point: generate the toy datasets, then build and plot two
# similarity graphs of the two-moons sample.
if __name__ == '__main__':
    n = 300
    # All four datasets are generated, but only the two-moons one is used below.
    blobs_data, blobs_clusters = blobs(n)
    moons_data, moons_clusters = two_moons(n)
    point_circle_data, point_circle_clusters = point_and_circle(n)
    worst_blobs_data, worst_blobs_clusters = worst_case_blob(n, 1.0)

    var = 1

    X, Y = moons_data, moons_clusters
    n_samples = X.shape[0]
    # Pairwise distances between the samples, forced to (n_samples, n_samples).
    dists = pairwise_distances(X).reshape((n_samples, n_samples))
    min_tree = min_span_tree(dists)
    # Threshold derived from the largest distance selected by the min_tree mask.
    # NOTE(review): this eps is never used afterwards; the graph below is built
    # with the literal eps=0.6. Confirm whether eps=eps was intended.
    eps = np.exp(-np.max(dists[min_tree])**2 / (2 * var))
    W_eps = build_similarity_graph(X, var=var, eps=0.6)
    # No var is passed here, so build_similarity_graph's own default applies.
    W_knn = build_similarity_graph(X, k=15)

    plot_graph_matrix(X, Y, W_eps)
    plot_graph_matrix(X, Y, W_knn)