def two_blobs_clustering():
    """
    TO BE COMPLETED

    Clustering of two blobs. Used in questions 2.1 and 2.2
    """

    # Get data and compute number of classes
    X, Y = blobs(600, n_blobs=2, blob_var=0.15, surplus=0)
    num_classes = len(np.unique(Y))
    """
    Choose parameters
    """
    k = 3
    var = 1.0  # exponential_euclidean's sigma^2

    laplacian_normalization = 'unn'
    chosen_eig_indices = [1, 2, 3]  # indices of the ordered eigenvalues to pick

    # build laplacian
    W = build_similarity_graph(X, var=var, k=k)
    L = build_laplacian(W, laplacian_normalization)

    # run spectral clustering
    Y_rec = spectral_clustering(L, chosen_eig_indices, num_classes=num_classes)

    # Plot results
    plot_clustering_result(X, Y, L, Y_rec, KMeans(num_classes).fit_predict(X))
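
The spectral_clustering helper called above is not shown in these examples. As a point of reference, a minimal sketch of what a function with that signature typically does (an assumed implementation, not the assignment's actual one): embed each point with the chosen eigenvectors of the Laplacian, then run k-means on the embedding.

import numpy as np
import scipy.linalg
from sklearn.cluster import KMeans


def spectral_clustering(L, chosen_eig_indices, num_classes=2):
    """Sketch: spectral embedding with the chosen eigenvectors of L, then k-means."""
    # eigendecomposition of the graph Laplacian, ordered by increasing eigenvalue
    eigenvalues, eigenvectors = scipy.linalg.eig(L)
    order = np.argsort(eigenvalues.real)
    eigenvectors = eigenvectors[:, order].real

    # spectral embedding: one row per sample, one column per chosen eigenvector
    U = eigenvectors[:, chosen_eig_indices]

    # cluster the embedded points; returns (n,) cluster assignments in [0, num_classes - 1]
    return KMeans(n_clusters=num_classes).fit_predict(U)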
Example #2
def find_the_bend():
    """
    TO BE COMPLETED

    Used in question 2.3
    :return:
    """

    # the number of samples to generate
    num_samples = 600

    # Generate blobs and compute number of clusters
    X, Y = blobs(num_samples, 4, 0.2)
    num_classes = len(np.unique(Y))
    """
    Choose parameters
    """
    k = 0
    var = 1  # exponential_euclidean's sigma^2
    laplacian_normalization = 'unn'  # either 'unn'normalized, 'sym'metric normalization or 'rw' random-walk normalization

    # build laplacian
    if k == 0:
        dists = sd.cdist(X, X, metric="euclidean")
        min_tree = min_span_tree(dists)
        distance_threshold = dists[min_tree].max()
        eps = np.exp(-distance_threshold**2.0 / (2 * var))
        print("epsilon =", eps)
        W = build_similarity_graph(X, var=var, k=k, eps=eps)
    else:
        W = build_similarity_graph(X, var=var, k=k)
    L = build_laplacian(W, laplacian_normalization)
    """
    compute first 15 eigenvalues and call choose_eigenvalues() to choose which ones to use. 
    """
    eigenvalues, _ = scipy.linalg.eig(L)
    eigenvalues = sorted(eigenvalues.real)
    # for ind, val in enumerate(eigenvalues[:15]):
    #     plt.scatter(ind, val)
    # plt.xlabel("index of the eigenvalue")
    # plt.ylabel("value of the eigenvalue")
    # chosen_eig_indices = [0, 1, 2, 3]  # indices of the ordered eigenvalues to pick
    """
    compute spectral clustering solution using a non-adaptive method first, and an adaptive one after (see handout) 
    Y_rec = (n x 1) cluster assignments [0,1,..., c-1]    
    """
    # run spectral clustering
    # Y_rec = spectral_clustering(L, chosen_eig_indices, num_classes=num_classes)
    Y_rec_adaptive = spectral_clustering_adaptive(L, num_classes=num_classes)

    # plot_the_bend(X, Y, L, Y_rec, eigenvalues)
    plot_the_bend(X, Y, L, Y_rec_adaptive, eigenvalues)
Example #3
def two_blobs_clustering():
    """
    TO BE COMPLETED

    Clustering of two blobs. Used in questions 2.1 and 2.2
    """

    # Get data and compute number of classes
    X, Y = blobs(50, n_blobs=2, blob_var=0.15, surplus=0)
    num_classes = len(np.unique(Y))
    """
    Choose parameters
    """
    k = 0
    var = 1.0  # exponential_euclidean's sigma^2

    laplacian_normalization = 'unn'
    chosen_eig_indices = [0, 1, 2]  # indices of the ordered eigenvalues to pick

    if k == 0:  # compute epsilon
        dists = sd.cdist(X, X, 'euclidean')  # dists[i, j] = euclidean distance between x_i and x_j

        min_tree = min_span_tree(dists)

        # collect the lengths of the edges kept in the minimal spanning tree
        mst_edge_lengths = []
        n1, m1 = min_tree.shape

        for i in range(n1):
            for j in range(m1):
                if min_tree[i, j]:
                    mst_edge_lengths.append(dists[i, j])

        # distance_threshold = sorted(mst_edge_lengths)[-1]  # largest MST edge
        distance_threshold = sorted(mst_edge_lengths)[-2]  # second-largest MST edge

        eps = np.exp(-distance_threshold**2.0 / (2 * var))
    #####

    # build laplacian
    W = build_similarity_graph(X, var=var, eps=eps, k=k)
    plot_graph_matrix(X, Y, W)

    L = build_laplacian(W, laplacian_normalization)

    # run spectral clustering
    Y_rec = spectral_clustering(L, chosen_eig_indices, num_classes=num_classes)

    # Plot results
    plot_clustering_result(X, Y, L, Y_rec, KMeans(num_classes).fit_predict(X))
Example #4
def two_blobs_clustering():
    """
    TO BE COMPLETED

    Clustering of two blobs. Used in questions 2.1 and 2.2
    """

    question = '2.2'

    # Get data and compute number of classes
    X, Y = blobs(600, n_blobs=2, blob_var=0.15, surplus=0)
    num_classes = len(np.unique(Y))
    n = X.shape[0]
    """
    Choose parameters
    """
    var = 1.0  # exponential_euclidean's sigma^2
    laplacian_normalization = 'rw'

    if question == '2.1':
        # the graph has to be connected in this question, so we construct an epsilon-graph using an MST
        dists = pairwise_distances(X)  # dists[i, j] = euclidean distance between x_i and x_j
        min_tree = min_span_tree(dists)
        distance_threshold = np.max(dists[min_tree])
        eps = np.exp(-distance_threshold**2 / (2 * var))

        # choice of eigenvectors to use
        chosen_eig_indices = [1]  # indices of the ordered eigenvalues to pick

        # build similarity graph and laplacian
        W = build_similarity_graph(X, var=var, eps=eps)
        L = build_laplacian(W, laplacian_normalization)

    elif question == '2.2':
        # choice of eigenvectors to use
        chosen_eig_indices = [0, 1]

        # choice of k for the k-nn graph
        k = 20

        # build similarity graph and laplacian
        W = build_similarity_graph(X, var=var, k=k)
        L = build_laplacian(W, laplacian_normalization)

    # run spectral clustering
    Y_rec = spectral_clustering(L, chosen_eig_indices, num_classes=num_classes)

    # Plot results
    plot_clustering_result(X, Y, L, Y_rec, KMeans(num_classes).fit_predict(X))
Example #5
def how_to_choose_epsilon():
    """
    TO BE COMPLETED.

    Consider the distance matrix with entries dist(x_i, x_j) (the euclidean distance between x_i and x_j)
    representing a fully connected graph.
    One way to choose the parameter epsilon to build a graph is to choose the maximum value of dist(x_i, x_j) where
    (i,j) is an edge that is present in the minimal spanning tree of the fully connected graph. Then, the threshold
    epsilon can be chosen as exp(-dist(x_i, x_j)**2.0/(2*sigma^2)).
    """
    # the number of samples to generate
    num_samples = 100

    # the option necessary for worst_case_blob, try different values
    gen_pam = 25  # to understand the meaning of the parameter, read worst_case_blob in generate_data.py

    # get blob data
    #X, Y = worst_case_blob(num_samples, gen_pam)
    #X, Y = two_moons(num_samples)
    X, Y = blobs(600, n_blobs=4, surplus=500)
    """
     use the distance function and the min_span_tree function to build the minimal spanning tree min_tree                   
     - var: the exponential_euclidean's sigma2 parameter          
     - dists: (n x n) matrix with euclidean distance between all possible couples of points                   
     - min_tree: (n x n) indicator matrix for the edges in the minimal spanning tree                           
    """
    var = 1.0
    dists = sd.cdist(X, X, metric="euclidean")  # dists[i, j] = euclidean distance between x_i and x_j
    min_tree = min_span_tree(dists)
    """
    set threshold epsilon to the max weight in min_tree 
    """
    distance_threshold = dists[min_tree].max()
    eps = np.exp(-distance_threshold**2.0 / (2 * var))
    """
    use the build_similarity_graph function to build the graph W  
     W: (n x n) dimensional matrix representing                    
        the adjacency matrix of the graph
       use plot_graph_matrix to plot the graph                    
    """
    W = build_similarity_graph(X, var=var, eps=eps, k=0)
    plot_graph_matrix(X, Y, W)
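
min_span_tree is used throughout as a boolean (n x n) indicator of the MST edges, so that dists[min_tree] picks out the MST edge lengths. Its implementation is not included in these snippets; a minimal sketch with that behaviour, assuming scipy is available:

import numpy as np
from scipy.sparse.csgraph import minimum_spanning_tree


def min_span_tree(dists):
    """Sketch: boolean (n x n) indicator of the edges of a minimal spanning tree of dists."""
    # minimum_spanning_tree returns a sparse matrix whose non-zero entries are the MST edge weights
    mst = minimum_spanning_tree(dists)
    return mst.toarray() > 0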
Example #6
def find_the_bend():
    """
    TO BE COMPLETED

    Used in question 2.3
    :return:
    """

    # the number of samples to generate
    num_samples = 600

    # Generate blobs and compute number of clusters
    # var_blobs = 0.03  # question 2.3.
    var_blobs = 0.2  # question 2.4.
    X, Y = blobs(num_samples, 4, var_blobs)
    num_classes = len(np.unique(Y))
    """
    Choose parameters
    """
    k = 20
    var = 1.0  # exponential_euclidean's sigma^2
    laplacian_normalization = 'rw'  # 'unn'normalized, 'sym'metric normalization or 'rw' random-walk normalization

    # build laplacian
    W = build_similarity_graph(X, var=var, k=k)
    L = build_laplacian(W, laplacian_normalization)
    """
    compute first 15 eigenvalues and call choose_eigenvalues() to choose which ones to use. 
    """
    eigenvalues, _ = scipy.linalg.eig(L)
    eigenvalues = eigenvalues[np.argsort(eigenvalues)].real
    eigenvalues = eigenvalues[:15]
    chosen_eig_indices = choose_eigenvalues(
        eigenvalues)  # indices of the ordered eigenvalues to pick
    """
    compute spectral clustering solution using a non-adaptive method first, and an adaptive one after (see handout) 
    Y_rec = (n x 1) cluster assignments [0,1,..., c-1]    
    """
    # run spectral clustering
    Y_rec = spectral_clustering(L, chosen_eig_indices, num_classes=num_classes)
    # Y_rec_adaptive = spectral_clustering_adaptive(L, num_classes=num_classes)

    plot_the_bend(X, Y, L, Y_rec, eigenvalues)
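
choose_eigenvalues is not shown either; the "bend" referred to here is usually located with the eigengap heuristic, i.e. keeping every eigenvalue before the largest gap in the sorted spectrum. A minimal sketch under that assumption:

import numpy as np


def choose_eigenvalues(eigenvalues, eig_max=15):
    """Sketch: eigengap heuristic -- keep every eigenvalue before the largest gap."""
    eigenvalues = np.asarray(eigenvalues)[:eig_max + 1]  # restrict to the smallest eigenvalues
    gaps = np.diff(eigenvalues)                          # gap between consecutive sorted eigenvalues
    cut = int(np.argmax(gaps))                           # position of the largest gap (the bend)
    return list(range(cut + 1))                          # indices 0, 1, ..., cut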
Example #7
def two_blobs_clustering():
    """
    TO BE COMPLETED

    Clustering of two blobs. Used in questions 2.1 and 2.2
    """

    # Get data and compute number of classes
    X, Y = blobs(600, n_blobs=2, blob_var=0.15, surplus=0)
    num_classes = len(np.unique(Y))
    """
    Choose parameters
    """
    k = 0
    var = 1.0  # exponential_euclidean's sigma^2

    laplacian_normalization = 'unn'
    chosen_eig_indices = [0, 1]  # indices of the ordered eigenvalues to pick

    # build laplacian
    if k == 0:
        dists = sd.cdist(X, X, metric="euclidean")
        min_tree = min_span_tree(dists)
        distance_threshold = dists[min_tree].max()
        eps = np.exp(-distance_threshold**2.0 / (2 * var))

        W = build_similarity_graph(X, var=var, k=k, eps=eps)
    else:
        W = build_similarity_graph(X, var=var, k=k)

    L = build_laplacian(W, laplacian_normalization)

    # run spectral clustering
    Y_rec = spectral_clustering(L, chosen_eig_indices, num_classes=num_classes)

    # Plot results
    plot_clustering_result(X, Y, L, Y_rec, KMeans(num_classes).fit_predict(X))
Example #8
def find_the_bend(eig_max=15, blob_var=0.03):
    """
    TO BE COMPLETED

    Used in question 2.3
    :return:
    """
    eig_max -= 1  # to count starting from 0
    # the number of samples to generate
    num_samples = 600

    # Generate blobs and compute number of clusters
    X, Y = blobs(num_samples, 4, blob_var)
    num_classes = len(np.unique(Y))
    """
    Choose parameters
    """
    k = 0
    var = 1.0  # exponential_euclidean's sigma^2
    laplacian_normalization = 'sym'  # either 'unn'normalized, 'sym'metric normalization or 'rw' random-walk normalization

    if k == 0:  # compute epsilon
        dists = sd.cdist(X, X, 'euclidean')  # dists[i, j] = euclidean distance between x_i and x_j

        min_tree = min_span_tree(dists)

        # collect the lengths of the edges kept in the minimal spanning tree
        mst_edge_lengths = []
        n1, m1 = min_tree.shape

        for i in range(n1):
            for j in range(m1):
                if min_tree[i, j]:
                    mst_edge_lengths.append(dists[i, j])

        # distance_threshold = sorted(mst_edge_lengths)[-1]  # largest MST edge
        distance_threshold = sorted(mst_edge_lengths)[-num_classes]  # num_classes-th largest MST edge, so the longest edges fall below eps

        eps = np.exp(-distance_threshold**2.0 / (2 * var))

    # build laplacian
    W = build_similarity_graph(X, var=var, eps=eps, k=k)
    L = build_laplacian(W, laplacian_normalization)
    """
    compute first 15 eigenvalues and call choose_eigenvalues() to choose which ones to use. 
    """
    eigenvalues, U = np.linalg.eig(L)
    indexes = np.argsort(eigenvalues)
    eigenvalues = eigenvalues[indexes]
    U = U[:, indexes]

    chosen_eig_indices = choose_eigenvalues(
        eigenvalues,
        eig_max=eig_max)  # indices of the ordered eigenvalues to pick

    plt.plot(eigenvalues, [i for i in range(len(eigenvalues))], 'r+')
    """
    compute spectral clustering solution using a non-adaptive method first, and an adaptive one after (see handout) 
    Y_rec = (n x 1) cluster assignments [0,1,..., c-1]    
    """
    # run spectral clustering
    Y_rec = spectral_clustering(L, chosen_eig_indices, num_classes=num_classes)
    Y_rec_adaptive = spectral_clustering_adaptive(L,
                                                  num_classes=num_classes,
                                                  eig_max=eig_max)

    plot_the_bend(X, Y, L, Y_rec_adaptive, eigenvalues)
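
spectral_clustering_adaptive is the variant that picks the eigenvector indices itself rather than taking them as an argument. Assuming the choose_eigenvalues and spectral_clustering sketches above, it could look like:

import numpy as np


def spectral_clustering_adaptive(L, num_classes=2, eig_max=15):
    """Sketch: choose the eigenvector indices with the eigengap heuristic, then cluster."""
    eigenvalues, _ = np.linalg.eig(L)
    eigenvalues = np.sort(eigenvalues.real)
    chosen_eig_indices = choose_eigenvalues(eigenvalues, eig_max=eig_max)
    return spectral_clustering(L, chosen_eig_indices, num_classes=num_classes)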
Example #9
    """
    distance_threshold = np.max(dists[min_tree])
    eps = np.exp(-distance_threshold**2 / (2 * var))
    """
    use the build_similarity_graph function to build the graph W  
     W: (n x n) dimensional matrix representing                    
        the adjacency matrix of the graph
       use plot_graph_matrix to plot the graph                    
    """
    W = build_similarity_graph(X, var=var, eps=eps, k=0)
    plot_graph_matrix(X, Y, W)


if __name__ == '__main__':
    n = 300
    blobs_data, blobs_clusters = blobs(n)
    moons_data, moons_clusters = two_moons(n)
    point_circle_data, point_circle_clusters = point_and_circle(n)
    worst_blobs_data, worst_blobs_clusters = worst_case_blob(n, 1.0)

    var = 1

    X, Y = moons_data, moons_clusters
    n_samples = X.shape[0]
    dists = pairwise_distances(X)  # dists[i, j] = euclidean distance between x_i and x_j
    min_tree = min_span_tree(dists)
    eps = np.exp(-np.max(dists[min_tree])**2 / (2 * var))  # MST-based threshold (not used below)
    W_eps = build_similarity_graph(X, var=var, eps=0.6)  # epsilon-graph with a fixed threshold
    W_knn = build_similarity_graph(X, k=15)  # k-nn graph (built but not plotted below)

    plot_graph_matrix(X, Y, W_eps)
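
build_similarity_graph itself is also left to the reader in these examples. The call sites only require that eps builds an epsilon-graph and k builds a k-nn graph, both with exponential-euclidean weights; a minimal sketch consistent with that usage (not the assignment's actual implementation):

import numpy as np
import scipy.spatial.distance as sd


def build_similarity_graph(X, var=1.0, eps=0.0, k=0):
    """Sketch: exponential-euclidean similarities, thresholded at eps (k == 0) or pruned to a k-nn graph."""
    dists = sd.cdist(X, X, metric="euclidean")
    similarities = np.exp(-dists**2 / (2 * var))  # exponential_euclidean weights
    np.fill_diagonal(similarities, 0)             # no self-loops

    if k == 0:
        # epsilon-graph: keep only the edges whose weight is at least eps
        W = similarities * (similarities >= eps)
    else:
        # k-nn graph: keep each node's k strongest edges, then symmetrize
        W = np.zeros_like(similarities)
        nearest = np.argsort(-similarities, axis=1)[:, :k]
        for i, neighbours in enumerate(nearest):
            W[i, neighbours] = similarities[i, neighbours]
        W = np.maximum(W, W.T)
    return W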