def regularized_spectral_clustering(adj_matrix, tau, n_clusters, algo='scan'):
    """
    :param adj_matrix: adjacency matrix of the graph, where entry [m][n] > 0 is the weight of the edge between nodes m and n
    :param tau: regularization parameter passed to regularized_laplacian_matrix
    :param n_clusters: number of clusters to partition into
    :param algo: the label-assignment algorithm, either 'kmeans++' or 'scan'
    :return: labels, number of clustering iterations needed, size of the smallest cluster found, execution time
    """
    start = timer()
    regularized_laplacian = regularized_laplacian_matrix(adj_matrix, tau)
    eigen_values, eigen_vectors = eigen_solver(regularized_laplacian, n_clusters=n_clusters)
    if algo == 'kmeans++':
        _, labels, _, num_iterations = k_means(eigen_vectors,
                                               n_clusters=n_clusters,
                                               return_n_iter=True)
    else:
        if n_clusters == 2:  # cluster based on sign
            second_eigen_vector_index = np.argsort(eigen_values)[1]
            second_eigen_vector = eigen_vectors.T[second_eigen_vector_index]
            labels = [0 if val <= 0 else 1 for val in second_eigen_vector]  # use only the second eigenvector
            num_iterations = 1
        else:  # bisecting it into k-ways, use all eigenvectors
            labels = discretize(eigen_vectors)
            num_iterations = 20  # assume the worst case of 20 restarts
    end = timer()
    execution_time = end - start
    smallest_cluster_size = np.min(np.bincount(labels))  # size of the smallest cluster, valid for any k
    return labels, num_iterations, smallest_cluster_size, execution_time
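The helpers regularized_laplacian_matrix and eigen_solver above come from the surrounding project and are not shown. For reference, a minimal sketch of the Laplacian helper under one common convention (D_tau = D + tau * I, as in degree-regularized spectral clustering) could look like the following; the exact form used above may differ.

import numpy as np
from scipy import sparse

def regularized_laplacian_matrix(adj_matrix, tau):
    # Hypothetical sketch, not the project's implementation: the
    # regularized normalized Laplacian
    # L_tau = I - D_tau^{-1/2} A D_tau^{-1/2}, with D_tau = D + tau * I.
    A = sparse.csr_matrix(adj_matrix, dtype=np.float64)
    degrees = np.asarray(A.sum(axis=1)).ravel()
    d_inv_sqrt = sparse.diags(1.0 / np.sqrt(degrees + tau))
    return sparse.identity(A.shape[0]) - d_inv_sqrt @ A @ d_inv_sqrt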
Example #2
    def spectral_clustering_sg(self,
                               affinity,
                               max_clusters=8,
                               eigen_solver=None,
                               random_state=None,
                               n_init=10,
                               eigen_tol=0.0,
                               assign_labels='kmeans'):

        if assign_labels not in ('kmeans', 'discretize'):
            raise ValueError("The 'assign_labels' parameter should be "
                             "'kmeans' or 'discretize', but '%s' was given" %
                             assign_labels)

        random_state = check_random_state(random_state)
        n_components = max_clusters
        maps, lambdas = self.spectral_embedding(affinity,
                                                n_components=n_components,
                                                eigen_solver=eigen_solver,
                                                random_state=random_state,
                                                eigen_tol=eigen_tol,
                                                drop_first=False)

        # determine n_clusters by the spectral gap
        n_clusters = self.estimate_num_of_clusters(lambdas)
        if assign_labels == 'kmeans':
            _, labels, _ = k_means(maps,
                                   n_clusters,
                                   random_state=0,
                                   n_init=n_init)
        else:
            labels = discretize(maps, random_state=random_state)
        return labels
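estimate_num_of_clusters belongs to the surrounding class and is not shown. A typical eigengap heuristic it might implement (an assumption, not the verbatim method) is:

import numpy as np

def estimate_num_of_clusters(lambdas, max_clusters=8):
    # Hypothetical sketch: choose k at the largest gap between
    # consecutive (sorted) Laplacian eigenvalues.
    lambdas = np.sort(np.asarray(lambdas))[:max_clusters]
    if lambdas.size < 2:
        return 1
    return int(np.argmax(np.diff(lambdas))) + 1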
Example #3
def signed_spectral_clustering(affinity,
                               random_state=None,
                               n_clusters=2,
                               eigen_tol=0.0):

    maps = signed_spectral_embedding(affinity, random_state, n_clusters,
                                     eigen_tol)
    clusters = discretize(maps, random_state=random_state)
    return clusters
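signed_spectral_embedding is defined elsewhere; a minimal sketch under the usual signed-Laplacian convention (L = D_bar - A with D_bar = diag(sum_j |A_ij|), which stays positive semidefinite even with negative edge weights) might be:

import numpy as np
from scipy.linalg import eigh

def signed_spectral_embedding(affinity, random_state=None, n_clusters=2,
                              eigen_tol=0.0):
    # Hypothetical sketch; random_state and eigen_tol are kept only for
    # signature compatibility and are unused here.
    A = np.asarray(affinity, dtype=float)
    L = np.diag(np.abs(A).sum(axis=1)) - A
    _, vecs = eigh(L)  # eigenvalues in ascending order
    return vecs[:, :n_clusters]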
Example #4
def spectral_clustering(affinity,
                        n_clusters=8,
                        n_components=None,
                        eigen_solver=None,
                        random_state=None,
                        n_init=10,
                        eigen_tol=0.0,
                        assign_labels='kmeans',
                        fuzzy_m=2,
                        fuzzy_error=0.0005,
                        fuzzy_maxiter=10000,
                        fuzzy_label_threshold=None):
    if assign_labels not in ('kmeans', 'fuzzy_cmeans', 'discretize'):
        raise ValueError(
            "The 'assign_labels' parameter should be "
            "'kmeans', 'fuzzy_cmeans' or 'discretize', but '%s' was given" %
            assign_labels)

    random_state_ = sp.check_random_state(random_state)
    n_components = n_clusters if n_components is None else n_components
    maps = spectral_embedding(affinity,
                              n_components=n_components,
                              eigen_solver=eigen_solver,
                              random_state=random_state,
                              eigen_tol=eigen_tol,
                              drop_first=False)

    if assign_labels == 'kmeans':
        _, labels, _ = sp.k_means(maps,
                                  n_clusters,
                                  random_state=random_state_,
                                  n_init=n_init)
    elif assign_labels == 'fuzzy_cmeans':
        if fuzzy_label_threshold is None:
            fuzzy_label_threshold = 1. / n_clusters

        _, u, _, _, _, _, _ = fuzz.cluster.cmeans(np.exp(maps.T),
                                                  n_clusters,
                                                  seed=random_state,
                                                  m=fuzzy_m,
                                                  error=fuzzy_error,
                                                  maxiter=fuzzy_maxiter)
        # from sklearn.mixture import GMM
        # gmm = GMM(n_components=n_clusters, covariance_type='full', random_state=random_state, n_init=n_init).fit(maps)
        # u = gmm.predict_proba(maps)
        # u = u.T
        assignments = np.argwhere(u.T >= fuzzy_label_threshold)
        labels = [[] for _ in range(u.shape[1])]
        for row in assignments:
            labels[row[0]].append(row[1])
    else:
        labels = sp.discretize(maps, random_state=random_state_)

    return labels
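Note the changed return type for assign_labels='fuzzy_cmeans': instead of one label per sample, labels[i] is the list of clusters whose membership exceeds fuzzy_label_threshold. A hypothetical call (requires the scikit-fuzzy package, imported as fuzz above):

import numpy as np
from sklearn.metrics.pairwise import rbf_kernel

X = np.random.RandomState(0).randn(30, 2)
affinity = rbf_kernel(X)  # symmetric (n_samples, n_samples) affinity
labels = spectral_clustering(affinity, n_clusters=3,
                             assign_labels='fuzzy_cmeans',
                             fuzzy_label_threshold=0.4)
# labels[i] is a list such as [0] or [0, 2] when sample i exceeds the
# membership threshold in more than one cluster.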
Example #5
    def fit(self, X, y=None):
        """Creates an affinity matrix for X using the selected affinity,
        then applies spectral clustering to this affinity matrix.
        Parameters
        ----------
        X : array-like or sparse matrix, shape (n_samples, n_features)
            OR, if affinity==`precomputed`, a precomputed affinity
            matrix of shape (n_samples, n_samples)
        """

        # this class has not been tested with sparse matrices;
        # contributions (reports, code) are welcome!
        X = check_array(X,
                        accept_sparse=['csr', 'csc', 'coo'],
                        dtype=np.float64)

        ell = self.n_clusters + 1  # +1 for drop_first, x2 for zero suppression in frequent_direction.
        k = self.n_buffer_rows
        if self.affinity == 'rbf':
            self.affinity_matrix_, dd = laplacian_sketch_rbf_kernel(
                X, ell, k, normed=self.normed, gamma=self.gamma)
        elif self.affinity == 'cosine':
            self.affinity_matrix_, dd = laplacian_sketch_cosine_similarity(
                X, ell, k, normed=self.normed)
        else:
            params = self.kernel_params
            if params is None:
                params = {}
            if callable(self.affinity):
                self.affinity_matrix_, dd = laplacian_sketch(
                    X, ell, k, False, self.normed, self.affinity, params)
            else:
                warnings.warn("%s is an unknown kernel" % self.affinity)

        random_state = check_random_state(self.random_state)

        # spectral embedding post process.
        maps = spectral_embedding_imitation(self.affinity_matrix_,
                                            dd,
                                            n_components=self.n_clusters,
                                            random_state=random_state,
                                            drop_first=False)

        if self.assign_labels == 'kmeans':
            _, self.labels_, _ = k_means(maps,
                                         self.n_clusters,
                                         random_state=random_state,
                                         n_init=self.n_init)
        else:
            self.labels_ = discretize(maps, random_state=random_state)
Example #6
def test_discretize(seed=36):
    # Test the discretize using a noise assignment matrix
    LB = LabelBinarizer()
    for n_sample in [50, 100, 150, 500]:
        for n_class in range(2, 10):
            # random class labels
            random_state = np.random.RandomState(seed)
            y_true = random_state.randint(0, n_class + 1, n_sample)
            # noise class assignment matrix
            y_true_noisy = (LB.fit_transform(y_true)
                            + 0.1 * random_state.randn(n_sample, n_class + 1))
            y_pred = discretize(y_true_noisy)
            assert adjusted_rand_score(y_true, y_pred) > 0.9
Example #7
def test_discretize(seed=36):
    # Test the discretize using a noise assignment matrix
    LB = LabelBinarizer()
    for n_sample in [50, 100, 150, 500]:
        for n_class in range(2, 10):
            # random class labels
            random_state = np.random.RandomState(seed)
            y_true = random_state.randint(0, n_class + 1, n_sample)
            # noise class assignment matrix
            y_true_noisy = (LB.fit_transform(y_true)
                            + 0.1 * random_state.randn(n_sample, n_class + 1))
            y_pred = discretize(y_true_noisy, random_state=random_state)
            assert adjusted_rand_score(y_true, y_pred) > 0.9
Example #8
def test_discretize(seed=8):
    # Test the discretize using a noise assignment matrix
    random_state = np.random.RandomState(seed)
    for n_samples in [50, 100, 150, 500]:
        for n_class in range(2, 10):
            # random class labels
            y_true = random_state.randint(0, n_class + 1, n_samples)  # keep int for use as sparse indices
            # noise class assignment matrix
            y_indicator = sparse.coo_matrix(
                (np.ones(n_samples), (np.arange(n_samples), y_true)), shape=(n_samples, n_class + 1)
            )
            y_true_noisy = y_indicator.toarray() + 0.1 * random_state.randn(n_samples, n_class + 1)
            y_pred = discretize(y_true_noisy, random_state=random_state)
            assert adjusted_rand_score(y_true, y_pred) > 0.8
Example #9
@pytest.mark.parametrize("n_samples", [50, 100, 150, 500])
def test_discretize(n_samples):
    # Test the discretize using a noise assignment matrix
    random_state = np.random.RandomState(seed=8)
    for n_class in range(2, 10):
        # random class labels
        y_true = random_state.randint(0, n_class + 1, n_samples)
        # noise class assignment matrix
        y_indicator = sparse.coo_matrix(
            (np.ones(n_samples), (np.arange(n_samples), y_true)),
            shape=(n_samples, n_class + 1))
        y_true_noisy = (y_indicator.toarray() +
                        0.1 * random_state.randn(n_samples, n_class + 1))
        y_pred = discretize(y_true_noisy, random_state=random_state)
        assert adjusted_rand_score(y_true, y_pred) > 0.8
Example #10
def Discretize(V, **kwargs):
    try:
        from sklearn.cluster.spectral import discretize
    except ImportError:
        raise ImportError('Use of this function (Discretize) requires the '
                          'installation of sklearn.')

    copy = kwargs.pop('copy', True)
    max_svd_restarts = kwargs.pop('max_svd_restarts', 30)
    n_iter_max = kwargs.pop('n_iter_max', 20)
    random_state = kwargs.pop('random_state', None)

    labels = discretize(V, copy=copy, max_svd_restarts=max_svd_restarts, 
                        n_iter_max=n_iter_max, random_state=random_state)
    return labels
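A hypothetical call, where V stands in for an (n_samples, n_clusters) spectral embedding:

import numpy as np

V = np.random.RandomState(0).randn(100, 3)  # stand-in embedding for 3 clusters
labels = Discretize(V, max_svd_restarts=30, random_state=0)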
Example #11
def spectral_clustering(affinity,
                        n_clusters=8,
                        n_components=None,
                        eigen_solver=None,
                        random_state=None,
                        n_init=10,
                        eigen_tol=0.0,
                        assign_labels='kmeans',
                        size_min=None,
                        size_max=None):
    if assign_labels not in ('kmeans', 'neo-kmeans', 'discretize'):
        raise ValueError("The 'assign_labels' parameter should be "
                         "'kmeans' or 'discretize', but '%s' was given" %
                         assign_labels)

    random_state = check_random_state(random_state)
    n_components = n_clusters if n_components is None else n_components

    # The first eigen vector is constant only for fully connected graphs
    # and should be kept for spectral clustering (drop_first = False)
    # See spectral_embedding documentation.
    maps = spectral_embedding(affinity,
                              n_components=n_components,
                              eigen_solver=eigen_solver,
                              random_state=random_state,
                              eigen_tol=eigen_tol,
                              drop_first=False)

    if assign_labels == 'kmeans':
        _, labels, _ = k_means_constrained(maps,
                                           n_clusters,
                                           random_state=random_state,
                                           n_init=n_init,
                                           size_min=size_min,
                                           size_max=size_max)
    elif assign_labels == 'neo-kmeans':
        raise ValueError(
            f"assign_labels: {assign_labels} is not currently supported.")
    else:
        labels = discretize(maps, random_state=random_state)

    return labels
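A hypothetical call producing a balanced 4-way cut, where every cluster must hold between 20 and 40 samples (the kmeans branch relies on the k-means-constrained package):

import numpy as np
from sklearn.datasets import make_blobs
from sklearn.metrics.pairwise import rbf_kernel

X, _ = make_blobs(n_samples=120, centers=4, random_state=0)
affinity = rbf_kernel(X, gamma=0.5)
labels = spectral_clustering(affinity, n_clusters=4,
                             size_min=20, size_max=40, random_state=0)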
Example #12
def spectral_hg_partitioning(hg,
                             n_clusters,
                             assign_labels='kmeans',
                             n_components=None,
                             random_state=None,
                             n_init=10):
    """
    :param hg: instance of HyperG
    :param n_clusters: int, number of clusters to extract
    :param assign_labels: str, {'kmeans', 'discretize'}, default: 'kmeans'
    :param n_components: int, number of eigen vectors to use for the spectral embedding
    :param random_state: int or None (default)
    :param n_init: int, number of times the k-means algorithm will be run
    with different centroid seeds.
    :return: numpy array, shape = (n_samples,), labels of each point
    """

    assert isinstance(hg, HyperG)
    assert n_clusters <= hg.num_nodes()

    random_state = check_random_state(random_state)

    if n_components is None:
        n_components = n_clusters

    L = hg.laplacian().toarray()
    L = check_symmetric(L)

    eigenval, eigenvec = eigh(L)
    embeddings = eigenvec[:, :n_components]

    if assign_labels == 'kmeans':
        _, labels, _ = k_means(embeddings,
                               n_clusters,
                               random_state=random_state,
                               n_init=n_init)
    else:
        labels = discretize(embeddings, random_state=random_state)

    return labels
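HyperG and its laplacian() come from an external hypergraph package. For orientation, a minimal sketch of Zhou et al.'s normalized hypergraph Laplacian (an assumption about the convention used, not the package's actual code), given an incidence matrix H of shape (n_nodes, n_edges):

import numpy as np
from scipy import sparse

def hypergraph_laplacian(H, w=None):
    # Hypothetical sketch: Delta = I - Dv^{-1/2} H W De^{-1} H^T Dv^{-1/2},
    # assuming no isolated nodes and no empty hyperedges.
    H = sparse.csr_matrix(H, dtype=float)
    n_nodes, n_edges = H.shape
    w = np.ones(n_edges) if w is None else np.asarray(w, dtype=float)
    d_v = H @ w                               # node degrees
    d_e = np.asarray(H.sum(axis=0)).ravel()   # edge degrees
    Dv_inv_sqrt = sparse.diags(1.0 / np.sqrt(d_v))
    theta = (Dv_inv_sqrt @ H @ sparse.diags(w) @ sparse.diags(1.0 / d_e)
             @ H.T @ Dv_inv_sqrt)
    return sparse.identity(n_nodes) - theta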
Example #13
def parcel_selection(X, grp_mask, write_dir='/tmp/', method='ward',
                     k_range=KRANGE, criterion='ll', verbose=True):
    """ Function dedicated to parcel selection """
    # Define the structure A of the data. Pixels connected to their neighbors.
    n_voxels, n_contrasts, n_subjects = X.shape
    n_components = 100

    # Define a spatial model
    shape = grp_mask.shape
    connectivity = grid_to_graph(shape[0], shape[1], shape[2], grp_mask).tocsr()

    # concatenate the data spatially
    Xv = np.reshape(X, (n_voxels, n_contrasts * n_subjects))
    X_ = PCA(n_components=n_components).fit_transform(Xv)

    i, j = connectivity.nonzero()
    sigma = np.sum((Xv[i] - Xv[j]) ** 2, 1).mean()
    if method == 'spectral':
        i, j = connectivity.nonzero()
        sigma = np.sum((Xv[i] - Xv[j]) ** 2, 1).mean()
        connectivity.data = np.exp( - np.sum((Xv[i] - Xv[j]) ** 2, 1) /
                                      (2 * sigma))
        
        maps = spectral_embedding(connectivity, n_components=n_components,
                              eigen_solver='arpack',
                              random_state=None,
                              eigen_tol=0.0, drop_first=False)
        
    del Xv
   
    # parcel selection
    all_bic = {}
    all_crit = {}
    for k in k_range:
        if method == 'ward':
            ward = Ward(n_clusters=k, 
                        connectivity=connectivity).fit(X_)
            labels = ward.labels_
        elif method == 'spectral':
            if k <= n_components:
                for i in range(10):
                    labels = discretize(maps[:, :k])
                    if len(np.unique(labels)) == k:
                        break
            else:
                _, labels, _ = k_means(maps[:, :100], n_clusters=k, n_init=1,
                         precompute_distances=False, max_iter=10)
        elif method == 'geometric':
            xyz = np.array(np.where(grp_mask)).T
            _, labels, _ = k_means(xyz, n_clusters=k, n_init=1,
                                   precompute_distances=False, max_iter=10)
        elif method in ['k-means', 'kmeans']:                
            _, labels, _ = k_means(X_, n_clusters=k, n_init=1,
                                   precompute_distances=False, max_iter=10)
        elif method == 'gmm':
            from sklearn.mixture import GMM
            labels = GMM(n_components=k, covariance_type='spherical', n_iter=10,
                      n_init=1).fit(X_).predict(X_)
            
        ll, bic = 0, 0
        for contrast in range(n_contrasts):
            ll1, mu_, sigma1_, sigma2_, bic_ = parameter_map(
                X[:, contrast], labels, null=False)
            bic += bic_.sum()
            if criterion == 'log-LR':
                ll2, _, _, _, bic_ = parameter_map(
                    X[:, contrast], labels, null=True)
                ll += np.sum((ll1 - ll2))
            elif criterion == 'll':
                ll += np.sum(ll1)
            elif criterion == 'sigma':
                ll = (sigma1_.mean(), sigma2_.mean())
            elif criterion == 'kfold':
                ll += score_spatial_model(X[:, contrast], labels, cv='kfold')
        all_crit[k] = ll
        all_bic[k] = bic
        if verbose:
            print('k:', k, ' bic:', bic, ' crit:', ll)
    criterion_files = {'log-LR': 'all_llr', 'll': 'all_ll',
                       'sigma': 'all_sigma', 'kfold': 'all_kfold'}
    if criterion in criterion_files:
        fname = '%s_%s.pck' % (criterion_files[criterion], method)
        with open(path.join(write_dir, fname), 'wb') as f:
            pickle.dump(all_crit, f)
    with open(path.join(write_dir, 'all_bic_%s.pck' % method), 'wb') as f:
        pickle.dump(all_bic, f)
    return all_crit, all_bic
Example #14
def parcel_cv(X, grp_mask, write_dir='/tmp/', method='ward', n_folds=10,
              k_range=KRANGE, verbose=True):
    """ Function dedicated to parcel selection using 10-fold cross-validation """
    from sklearn.cross_validation import KFold, ShuffleSplit
    # Define the structure A of the data. Pixels connected to their neighbors.
    n_voxels, n_contrasts, n_subjects = X.shape
    n_components = 100

    # Define a spatial model
    shape = grp_mask.shape
    connectivity = grid_to_graph(shape[0], shape[1], shape[2], grp_mask).tocsr()
    ic, jc = connectivity.nonzero()

    # concatenate the data spatially
    Xv = np.reshape(X, (n_voxels, n_contrasts * n_subjects))
    sigma = np.sum((Xv[ic] - Xv[jc]) ** 2, 1).mean()

    # pre-compute PCA for the cross_validation loops
    if n_folds == int(n_folds):
        cv = KFold(X.shape[2], n_folds)
    else:
        cv = ShuffleSplit(X.shape[2], 10, .2)
    maps = []
    for (train, test) in cv:
        X_ = np.reshape(X[:, :, train], (n_voxels, n_contrasts * len(train)))
        
        if method == 'spectral':
            connectivity.data = np.exp( 
                - np.sum((X_[ic] - X_[jc]) ** 2, 1) / (2 * sigma))
            maps.append(spectral_embedding(
                    connectivity, n_components=n_components,
                    eigen_solver='arpack', random_state=None,
                    eigen_tol=0.0, drop_first=False))
        else:
            maps.append(PCA(n_components=n_components).fit_transform(X_))

    # parcel selection
    all_crit = {}
    for k in k_range:
        ll, ll_cv = 0, 0
        for (it, (train, test)) in enumerate(cv):
            if method == 'ward':
                ward = Ward(n_clusters=k, 
                            connectivity=connectivity).fit(maps[it])
                labels = ward.labels_
            elif method in ['k-means', 'kmeans']:
                _, labels, _ = k_means(maps[it], n_clusters=k, n_init=1,
                         precompute_distances=False, max_iter=10)
            elif method == 'spectral':
                if k <= n_components:
                    for i in range(10):
                        labels = discretize(maps[it][:, :k])
                        if len(np.unique(labels)) == k:
                            break
                else:
                    _, labels, _ = k_means(
                        maps[it], n_clusters=k, n_init=1,
                        precompute_distances=False, max_iter=10)
            elif method == 'geometric':
                xyz = np.array(np.where(grp_mask)).T
                _, labels, _ = k_means(xyz, n_clusters=k, n_init=1,
                                       precompute_distances=False, max_iter=10)
            for contrast in range(n_contrasts):
                ll1, mu_, sigma1_, sigma2_, bic_ = parameter_map(
                    X[:, contrast, train], labels, null=False)
                ll += ll1.sum()
                ll2 = log_likelihood_map(
                    X[:, contrast, test], labels, mu_, sigma1_, sigma2_)

                ll_cv += ll2.sum()
        all_crit[k] = ll_cv
        if verbose:
            print('k:', k, 'll:', ll, ' ll_cv:', ll_cv)
    
    with open(path.join(write_dir, 'll_cv_%s.pck' % method), 'wb') as f:
        pickle.dump(all_crit, f)
    return all_crit
Example #15
def parcel_selection(X,
                     grp_mask,
                     write_dir='/tmp/',
                     method='ward',
                     k_range=KRANGE,
                     criterion='ll',
                     verbose=True):
    """ Functiond edicated to parcel selection """
    # Define the structure A of the data. Pixels connected to their neighbors.
    n_voxels, n_contrasts, n_subjects = X.shape
    n_components = 100

    # Define a spatial model
    shape = grp_mask.shape
    connectivity = grid_to_graph(shape[0], shape[1], shape[2],
                                 grp_mask).tocsr()

    # concatenate the data spatially
    Xv = np.reshape(X, (n_voxels, n_contrasts * n_subjects))
    X_ = PCA(n_components=n_components).fit_transform(Xv)

    i, j = connectivity.nonzero()
    sigma = np.sum((Xv[i] - Xv[j])**2, 1).mean()
    if method == 'spectral':
        i, j = connectivity.nonzero()
        sigma = np.sum((Xv[i] - Xv[j])**2, 1).mean()
        connectivity.data = np.exp(-np.sum(
            (Xv[i] - Xv[j])**2, 1) / (2 * sigma))

        maps = spectral_embedding(connectivity,
                                  n_components=n_components,
                                  eigen_solver='arpack',
                                  random_state=None,
                                  eigen_tol=0.0,
                                  drop_first=False)

    del Xv

    # parcel selection
    all_bic = {}
    all_crit = {}
    for k in k_range:
        if method == 'ward':
            ward = Ward(n_clusters=k, connectivity=connectivity).fit(X_)
            labels = ward.labels_
        elif method == 'spectral':
            if k <= n_components:
                for i in range(10):
                    labels = discretize(maps[:, :k])
                    if len(np.unique(labels)) == k:
                        break
            else:
                _, labels, _ = k_means(maps[:, :100],
                                       n_clusters=k,
                                       n_init=1,
                                       precompute_distances=False,
                                       max_iter=10)
        elif method == 'geometric':
            xyz = np.array(np.where(grp_mask)).T
            _, labels, _ = k_means(xyz,
                                   n_clusters=k,
                                   n_init=1,
                                   precompute_distances=False,
                                   max_iter=10)
        elif method in ['k-means', 'kmeans']:
            _, labels, _ = k_means(X_,
                                   n_clusters=k,
                                   n_init=1,
                                   precompute_distances=False,
                                   max_iter=10)
        elif method == 'gmm':
            from sklearn.mixture import GMM
            labels = GMM(n_components=k,
                         covariance_type='spherical',
                         n_iter=10,
                         n_init=1).fit(X_).predict(X_)

        ll, bic = 0, 0
        for contrast in range(n_contrasts):
            ll1, mu_, sigma1_, sigma2_, bic_ = parameter_map(X[:, contrast],
                                                             labels,
                                                             null=False)
            bic += bic_.sum()
            if criterion == 'log-LR':
                ll2, _, _, _, bic_ = parameter_map(X[:, contrast],
                                                   labels,
                                                   null=True)
                ll += np.sum((ll1 - ll2))
            elif criterion == 'll':
                ll += np.sum(ll1)
            elif criterion == 'sigma':
                ll = (sigma1_.mean(), sigma2_.mean())
            elif criterion == 'kfold':
                ll += score_spatial_model(X[:, contrast], labels, cv='kfold')
        all_crit[k] = ll
        all_bic[k] = bic
        if verbose:
            print('k:', k, ' bic:', bic, ' crit:', ll)
    criterion_files = {'log-LR': 'all_llr', 'll': 'all_ll',
                       'sigma': 'all_sigma', 'kfold': 'all_kfold'}
    if criterion in criterion_files:
        fname = '%s_%s.pck' % (criterion_files[criterion], method)
        with open(path.join(write_dir, fname), 'wb') as f:
            pickle.dump(all_crit, f)
    with open(path.join(write_dir, 'all_bic_%s.pck' % method), 'wb') as f:
        pickle.dump(all_bic, f)
    return all_crit, all_bic
Example #16
def spectral_clustering(affinity,
                        n_clusters=8,
                        n_components=None,
                        eigen_solver=None,
                        random_state=None,
                        n_init=10,
                        eigen_tol=0.0,
                        assign_labels='kmeans',
                        norm_laplacian=True):
    """Apply clustering to a projection to the normalized laplacian.

    In practice Spectral Clustering is very useful when the structure of
    the individual clusters is highly non-convex or more generally when
    a measure of the center and spread of the cluster is not a suitable
    description of the complete cluster. For instance, when clusters are
    nested circles in the 2D plane.

    If affinity is the adjacency matrix of a graph, this method can be
    used to find normalized graph cuts.

    Read more in the :ref:`User Guide <spectral_clustering>`.

    Parameters
    -----------
    affinity : array-like or sparse matrix, shape: (n_samples, n_samples)
        The affinity matrix describing the relationship of the samples to
        embed. **Must be symmetric**.

        Possible examples:
          - adjacency matrix of a graph,
          - heat kernel of the pairwise distance matrix of the samples,
          - symmetric k-nearest neighbours connectivity matrix of the samples.

    n_clusters : integer, optional
        Number of clusters to extract.

    n_components : integer, optional, default is n_clusters
        Number of eigen vectors to use for the spectral embedding

    eigen_solver : {None, 'arpack', 'lobpcg', or 'amg'}
        The eigenvalue decomposition strategy to use. AMG requires pyamg
        to be installed. It can be faster on very large, sparse problems,
        but may also lead to instabilities

    random_state : int seed, RandomState instance, or None (default)
        A pseudo random number generator used for the initialization
        of the lobpcg eigen vectors decomposition when eigen_solver == 'amg'
        and by the K-Means initialization.

    n_init : int, optional, default: 10
        Number of times the k-means algorithm will be run with different
        centroid seeds. The final results will be the best output of
        n_init consecutive runs in terms of inertia.

    eigen_tol : float, optional, default: 0.0
        Stopping criterion for eigendecomposition of the Laplacian matrix
        when using arpack eigen_solver.

    assign_labels : {'kmeans', 'discretize'}, default: 'kmeans'
        The strategy to use to assign labels in the embedding
        space. There are two ways to assign labels after the laplacian
        embedding. k-means can be applied and is a popular choice, but it can
        also be sensitive to initialization. Discretization is another
        approach which is less sensitive to random initialization. See
        the 'Multiclass spectral clustering' paper referenced below for
        more details on the discretization approach.

    norm_laplacian : bool, optional, default: True
        If True, then compute the symmetric normalized Laplacian.

    Returns
    -------
    labels : array of integers, shape: n_samples
        The labels of the clusters.

    References
    ----------

    - Normalized cuts and image segmentation, 2000
      Jianbo Shi, Jitendra Malik
      http://citeseer.ist.psu.edu/viewdoc/summary?doi=10.1.1.160.2324

    - A Tutorial on Spectral Clustering, 2007
      Ulrike von Luxburg
      http://citeseerx.ist.psu.edu/viewdoc/summary?doi=10.1.1.165.9323

    - Multiclass spectral clustering, 2003
      Stella X. Yu, Jianbo Shi
      http://www1.icsi.berkeley.edu/~stellayu/publication/doc/2003kwayICCV.pdf

    Notes
    ------
    The graph should contain only one connected component; otherwise
    the results make little sense.

    This algorithm solves the normalized cut for k=2: it is a
    normalized spectral clustering.
    """
    if assign_labels not in ('kmeans', 'discretize'):
        raise ValueError("The 'assign_labels' parameter should be "
                         "'kmeans' or 'discretize', but '%s' was given" %
                         assign_labels)

    random_state = check_random_state(random_state)
    n_components = n_clusters if n_components is None else n_components
    maps = spectral_embedding(affinity,
                              n_components=n_components,
                              eigen_solver=eigen_solver,
                              random_state=random_state,
                              eigen_tol=eigen_tol,
                              drop_first=False,
                              norm_laplacian=norm_laplacian)

    if assign_labels == 'kmeans':
        _, labels, _ = k_means(maps,
                               n_clusters,
                               random_state=random_state,
                               n_init=n_init)
    else:
        labels = discretize(maps, random_state=random_state)

    return labels
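A usage sketch on the nested-circles case mentioned in the docstring, where k-means on the raw coordinates fails but spectral clustering on an RBF affinity succeeds (the gamma value here is illustrative):

import numpy as np
from sklearn.datasets import make_circles
from sklearn.metrics.pairwise import rbf_kernel

X, y = make_circles(n_samples=200, factor=0.5, noise=0.05, random_state=0)
affinity = rbf_kernel(X, gamma=20.0)
labels = spectral_clustering(affinity, n_clusters=2,
                             assign_labels='discretize', random_state=0)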
Example #17
def reproducibility_selection(X,
                              grp_mask,
                              niter=2,
                              method='ward',
                              k_range=KRANGE,
                              write_dir='/tmp',
                              verbose=True):
    """ Returns a reproducibility metric on bootstraped models
    
    Parameters
    ----------
    X: array of shape (n_voxels, n_contrasts, n_subjects)
       the input data
    grp_mask: array of shape (image_shape),
              the non-zeros elements yield the spatial model
    niter: int, number of bootstrap samples estimated
    method: string, one of 'ward', 'kmeans', 'spectral'
    k_range: list of ints, 
             the possible number of parcels to be tested
    """
    n_voxels, n_contrasts, n_subjects = X.shape
    n_components = 100

    # Define a spatial model
    shape = grp_mask.shape
    connectivity = grid_to_graph(shape[0], shape[1], shape[2],
                                 grp_mask).tocsr()
    # concatenate the data spatially
    Xv = np.reshape(X, (n_voxels, n_contrasts * n_subjects))
    # pre-computed stuff
    ic, jc = connectivity.nonzero()
    sigma = np.sum((Xv[ic] - Xv[jc])**2, 1).mean()

    maps = []
    for i in range(niter):
        bootstrap = (np.random.rand(Xv.shape[1]) * Xv.shape[1]).astype(int)
        X_ = Xv[:, bootstrap]
        if method == 'spectral':
            connectivity.data = np.exp(-np.sum(
                (X_[ic] - X_[jc])**2, 1) / (2 * sigma))
            maps.append(
                spectral_embedding(connectivity,
                                   n_components=n_components,
                                   eigen_solver='arpack',
                                   random_state=None,
                                   eigen_tol=0.0,
                                   drop_first=False))
        else:
            maps.append(PCA(n_components=n_components).fit_transform(X_))

    ars_score = {}
    ami_score = {}
    vm_score = {}
    for (ik, k_) in enumerate(k_range):
        label_ = []
        for i in range(niter):
            bootstrap = (np.random.rand(Xv.shape[1]) * Xv.shape[1]).astype(int)
            if method == 'spectral':
                if k_ <= n_components:
                    for _ in range(10):
                        labels = discretize(maps[i][:, :k_])
                        if len(np.unique(labels)) == k_:
                            break
                else:
                    _, labels, _ = k_means(maps[i],
                                           n_clusters=k_,
                                           n_init=1,
                                           precompute_distances=False,
                                           max_iter=10)
            elif method == 'ward':
                ward = Ward(n_clusters=k_,
                            connectivity=connectivity).fit(maps[i])
                labels = ward.labels_
            elif method in ['k-means', 'kmeans']:
                _, labels, _ = k_means(maps[i],
                                       n_clusters=k_,
                                       n_init=1,
                                       precompute_distances=False,
                                       max_iter=10)
            elif method == 'geometric':
                xyz = np.array(np.where(grp_mask)).T
                _, labels, _ = k_means(xyz,
                                       n_clusters=k_,
                                       n_init=1,
                                       precompute_distances=False,
                                       max_iter=10)
            label_.append(labels)
        ars_score[k_] = reproducibility_rating(label_, 'ars')
        ami_score[k_] = reproducibility_rating(label_, 'ami')
        vm_score[k_] = reproducibility_rating(label_, 'vm')
        if verbose:
            print('k:', k_, '  ari:', ars_score[k_], 'ami:', ami_score[k_],
                  ' vm:', vm_score[k_])
    with open(path.join(write_dir, 'ari_score_%s.pck' % method), 'wb') as f:
        pickle.dump(ars_score, f)
    with open(path.join(write_dir, 'ami_score_%s.pck' % method), 'wb') as f:
        pickle.dump(ami_score, f)
    with open(path.join(write_dir, 'vm_score_%s.pck' % method), 'wb') as f:
        pickle.dump(vm_score, f)
    return ars_score, ami_score, vm_score
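reproducibility_rating is an external helper; a plausible sketch (an assumption about its behavior, not the original code) is the average pairwise agreement between bootstrap labelings:

import numpy as np
from itertools import combinations
from sklearn.metrics import (adjusted_rand_score,
                             adjusted_mutual_info_score, v_measure_score)

def reproducibility_rating(labelings, score='ars'):
    # Hypothetical sketch: mean pairwise agreement across labelings.
    scorer = {'ars': adjusted_rand_score,
              'ami': adjusted_mutual_info_score,
              'vm': v_measure_score}[score]
    return np.mean([scorer(a, b) for a, b in combinations(labelings, 2)])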
Example #18
def parcel_cv(X,
              grp_mask,
              write_dir='/tmp/',
              method='ward',
              n_folds=10,
              k_range=KRANGE,
              verbose=True):
    """ Functiond edicated to parcel selection using 10-fold cross-validation"""
    from sklearn.cross_validation import KFold, ShuffleSplit
    # Define the structure A of the data. Pixels connected to their neighbors.
    n_voxels, n_contrasts, n_subjects = X.shape
    n_components = 100

    # Define a spatial model
    shape = grp_mask.shape
    connectivity = grid_to_graph(shape[0], shape[1], shape[2],
                                 grp_mask).tocsr()
    ic, jc = connectivity.nonzero()

    # concatenate the data spatially
    Xv = np.reshape(X, (n_voxels, n_contrasts * n_subjects))
    sigma = np.sum((Xv[ic] - Xv[jc])**2, 1).mean()

    # pre-compute PCA for the cross_validation loops
    if n_folds == int(n_folds):
        cv = KFold(X.shape[2], n_folds)
    else:
        cv = ShuffleSplit(X.shape[2], 10, .2)
    maps = []
    for (train, test) in cv:
        X_ = np.reshape(X[:, :, train], (n_voxels, n_contrasts * len(train)))

        if method == 'spectral':
            connectivity.data = np.exp(-np.sum(
                (X_[ic] - X_[jc])**2, 1) / (2 * sigma))
            maps.append(
                spectral_embedding(connectivity,
                                   n_components=n_components,
                                   eigen_solver='arpack',
                                   random_state=None,
                                   eigen_tol=0.0,
                                   drop_first=False))
        else:
            maps.append(PCA(n_components=n_components).fit_transform(X_))

    # parcel selection
    all_crit = {}
    for k in k_range:
        ll, ll_cv = 0, 0
        for (it, (train, test)) in enumerate(cv):
            if method == 'ward':
                ward = Ward(n_clusters=k,
                            connectivity=connectivity).fit(maps[it])
                labels = ward.labels_
            elif method in ['k-means', 'kmeans']:
                _, labels, _ = k_means(maps[it],
                                       n_clusters=k,
                                       n_init=1,
                                       precompute_distances=False,
                                       max_iter=10)
            elif method == 'spectral':
                if k <= n_components:
                    for i in range(10):
                        labels = discretize(maps[it][:, :k])
                        if len(np.unique(labels)) == k:
                            break
                else:
                    _, labels, _ = k_means(maps[it],
                                           n_clusters=k,
                                           n_init=1,
                                           precompute_distances=False,
                                           max_iter=10)
            elif method == 'geometric':
                xyz = np.array(np.where(grp_mask)).T
                _, labels, _ = k_means(xyz,
                                       n_clusters=k,
                                       n_init=1,
                                       precompute_distances=False,
                                       max_iter=10)
            for contrast in range(n_contrasts):
                ll1, mu_, sigma1_, sigma2_, bic_ = parameter_map(X[:, contrast,
                                                                   train],
                                                                 labels,
                                                                 null=False)
                ll += ll1.sum()
                ll2 = log_likelihood_map(X[:, contrast, test], labels, mu_,
                                         sigma1_, sigma2_)

                ll_cv += ll2.sum()
        all_crit[k] = ll_cv
        if verbose:
            print('k:', k, 'll:', ll, ' ll_cv:', ll_cv)

    with open(path.join(write_dir, 'll_cv_%s.pck' % method), 'wb') as f:
        pickle.dump(all_crit, f)
    return all_crit
Example #19
def reproducibility_selection(X, grp_mask, niter=2, method='ward',
                              k_range=KRANGE, write_dir='/tmp',
                              verbose=True):
    """ Returns a reproducibility metric on bootstrapped models
    
    Parameters
    ----------
    X: array of shape (n_voxels, n_contrasts, n_subjects)
       the input data
    grp_mask: array of shape (image_shape),
              the non-zeros elements yield the spatial model
    niter: int, number of bootstrap samples estimated
    method: string, one of 'ward', 'kmeans', 'spectral'
    k_range: list of ints, 
             the possible number of parcels to be tested
    """
    n_voxels, n_contrasts, n_subjects = X.shape
    n_components = 100

    # Define a spatial model
    shape = grp_mask.shape
    connectivity = grid_to_graph(shape[0], shape[1], shape[2], grp_mask).tocsr()
    # concatenate the data spatially
    Xv = np.reshape(X, (n_voxels, n_contrasts * n_subjects))
    # pre-computed stuff
    ic, jc = connectivity.nonzero()
    sigma = np.sum((Xv[ic] - Xv[jc]) ** 2, 1).mean()
    
    maps = []
    for i in range(niter):
        bootstrap = (np.random.rand(Xv.shape[1]) * Xv.shape[1]).astype(int)
        X_ = Xv[:, bootstrap]
        if method == 'spectral':
            connectivity.data = np.exp( 
                - np.sum((X_[ic] - X_[jc]) ** 2, 1) / (2 * sigma))
            maps.append(spectral_embedding(connectivity,
                                           n_components=n_components,
                                           eigen_solver='arpack',
                                           random_state=None,
                                           eigen_tol=0.0, drop_first=False))
        else:
            maps.append(PCA(n_components=n_components).fit_transform(X_))
            
    ars_score = {}
    ami_score = {}
    vm_score = {}
    for (ik, k_) in enumerate(k_range):
        label_ = []
        for i in range(niter):
            bootstrap = (np.random.rand(Xv.shape[1]) * Xv.shape[1]).astype(int)
            if method == 'spectral':
                if k_ <= n_components:
                    for _ in range(10):
                        labels = discretize(maps[i][:, :k_])
                        if len(np.unique(labels)) == k_:
                            break
                else:
                    _, labels, _ = k_means(
                        maps[i], n_clusters=k_, n_init=1,
                        precompute_distances=False, max_iter=10)
            elif method == 'ward':
                ward = Ward(n_clusters=k_,
                            connectivity=connectivity).fit(maps[i])
                labels = ward.labels_
            elif method in ['k-means', 'kmeans']:
                _, labels, _ = k_means(maps[i], n_clusters=k_, n_init=1,
                                       precompute_distances=False, max_iter=10)
            elif method == 'geometric':
                xyz = np.array(np.where(grp_mask)).T
                _, labels, _ = k_means(xyz, n_clusters=k_, n_init=1,
                                       precompute_distances=False, max_iter=10)
            label_.append(labels)
        ars_score[k_] = reproducibility_rating(label_, 'ars')
        ami_score[k_] = reproducibility_rating(label_, 'ami')
        vm_score[k_] = reproducibility_rating(label_, 'vm')
        if verbose:
            print('k:', k_, '  ari:', ars_score[k_], 'ami:', ami_score[k_],
                  ' vm:', vm_score[k_])
    with open(path.join(write_dir, 'ari_score_%s.pck' % method), 'wb') as f:
        pickle.dump(ars_score, f)
    with open(path.join(write_dir, 'ami_score_%s.pck' % method), 'wb') as f:
        pickle.dump(ami_score, f)
    with open(path.join(write_dir, 'vm_score_%s.pck' % method), 'wb') as f:
        pickle.dump(vm_score, f)
    return ars_score, ami_score, vm_score
Example #20
def spectral_clustering(affinity, n_clusters=8, n_components=None,
                        eigen_solver=None, random_state=None, n_init=10,
                        eigen_tol=0.0, assign_labels='kmeans',
                        norm_laplacian=True):
    """Apply clustering to a projection to the normalized laplacian.

    In practice Spectral Clustering is very useful when the structure of
    the individual clusters is highly non-convex or more generally when
    a measure of the center and spread of the cluster is not a suitable
    description of the complete cluster. For instance, when clusters are
    nested circles in the 2D plane.

    If affinity is the adjacency matrix of a graph, this method can be
    used to find normalized graph cuts.

    Read more in the :ref:`User Guide <spectral_clustering>`.

    Parameters
    -----------
    affinity : array-like or sparse matrix, shape: (n_samples, n_samples)
        The affinity matrix describing the relationship of the samples to
        embed. **Must be symmetric**.

        Possible examples:
          - adjacency matrix of a graph,
          - heat kernel of the pairwise distance matrix of the samples,
          - symmetric k-nearest neighbours connectivity matrix of the samples.

    n_clusters : integer, optional
        Number of clusters to extract.

    n_components : integer, optional, default is n_clusters
        Number of eigen vectors to use for the spectral embedding

    eigen_solver : {None, 'arpack', 'lobpcg', or 'amg'}
        The eigenvalue decomposition strategy to use. AMG requires pyamg
        to be installed. It can be faster on very large, sparse problems,
        but may also lead to instabilities

    random_state : int seed, RandomState instance, or None (default)
        A pseudo random number generator used for the initialization
        of the lobpcg eigen vectors decomposition when eigen_solver == 'amg'
        and by the K-Means initialization.

    n_init : int, optional, default: 10
        Number of times the k-means algorithm will be run with different
        centroid seeds. The final results will be the best output of
        n_init consecutive runs in terms of inertia.

    eigen_tol : float, optional, default: 0.0
        Stopping criterion for eigendecomposition of the Laplacian matrix
        when using arpack eigen_solver.

    assign_labels : {'kmeans', 'discretize'}, default: 'kmeans'
        The strategy to use to assign labels in the embedding
        space. There are two ways to assign labels after the laplacian
        embedding. k-means can be applied and is a popular choice, but it can
        also be sensitive to initialization. Discretization is another
        approach which is less sensitive to random initialization. See
        the 'Multiclass spectral clustering' paper referenced below for
        more details on the discretization approach.

    norm_laplacian : bool, optional, default: True
        If True, then compute the symmetric normalized Laplacian.

    Returns
    -------
    labels : array of integers, shape: n_samples
        The labels of the clusters.

    References
    ----------

    - Normalized cuts and image segmentation, 2000
      Jianbo Shi, Jitendra Malik
      http://citeseer.ist.psu.edu/viewdoc/summary?doi=10.1.1.160.2324

    - A Tutorial on Spectral Clustering, 2007
      Ulrike von Luxburg
      http://citeseerx.ist.psu.edu/viewdoc/summary?doi=10.1.1.165.9323

    - Multiclass spectral clustering, 2003
      Stella X. Yu, Jianbo Shi
      http://www1.icsi.berkeley.edu/~stellayu/publication/doc/2003kwayICCV.pdf

    Notes
    ------
    The graph should contain only one connected component; otherwise
    the results make little sense.

    This algorithm solves the normalized cut for k=2: it is a
    normalized spectral clustering.
    """
    if assign_labels not in ('kmeans', 'discretize'):
        raise ValueError("The 'assign_labels' parameter should be "
                         "'kmeans' or 'discretize', but '%s' was given"
                         % assign_labels)

    random_state = check_random_state(random_state)
    n_components = n_clusters if n_components is None else n_components
    maps = spectral_embedding(affinity, n_components=n_components,
                              eigen_solver=eigen_solver,
                              random_state=random_state,
                              eigen_tol=eigen_tol, drop_first=False,
                              norm_laplacian=norm_laplacian)

    if assign_labels == 'kmeans':
        _, labels, _ = k_means(maps, n_clusters, random_state=random_state,
                               n_init=n_init)
    else:
        labels = discretize(maps, random_state=random_state)

    return labels