Example #1
import numpy as np

# statistics, subgraphs, ClusterTree, and Cluster are project-local
# helpers; their import paths are not shown in this snippet.


def Ward(D, adj_list, n_clusters):
    """
    Method to apply Ward Clustering to feature data.

    Parameters:
    - - - - -
    D : array
        input feature matrix
    adj_list : dictionary
                adjacency list

    Returns:
    - - - -
    c : array
        parent links for ward clusters
    """

    # normalize features, then build a correlation-based similarity matrix
    D_norm = statistics.Normalize(D)
    similarity = np.corrcoef(D_norm)

    # build an adjacency-constrained linkage tree, cut it into n_clusters
    # clusters, and extract one spanning tree (parent links) per cluster
    linkage = ClusterTree(similarity, adj_list)
    z = Cluster(linkage, n=n_clusters)
    c = subgraphs.ClusterSpanningTrees().fit(adj_list, z)

    return c
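
For context, a minimal usage sketch follows. The variable names, data shapes, and the toy chain adjacency are illustrative assumptions, not part of the original code:

# Hypothetical usage of Ward; data and adjacency are toy examples.
import numpy as np

n = 100
features = np.random.rand(n, 20)  # n samples, 20 features each
adjacency = {i: [j for j in (i - 1, i + 1) if 0 <= j < n]
             for i in range(n)}   # chain-graph adjacency list

parent_links = Ward(features, adjacency, n_clusters=10)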
Example #2
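The method below is taken from a class and references several module-level names (time, np, sampling, statistics, subgraphs, ward_clustering) that the snippet does not import. A plausible import block for the enclosing module; the relative paths are an assumption:

import time
import numpy as np

# Assumed relative imports of the project-local modules used below.
from . import sampling, statistics, subgraphs, ward_clustering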
    def fit(self,
            features,
            adj_list,
            init_z=None,
            init_c=None,
            gt_z=None,
            edge_prior=None):
        """
        Main function to fit the distance-dependent Chinese Restaurant Process.
        Parameters:
        - - - - -
        features : array
                data array of features for each sample
        adj_list : dictionary
                adjacency list of samples
        init_z : array
            initialized cortical map
        init_c : array
                initialized parent linkage matrix
        gt_z : array
                ground truth map for computing normalized mutual information
        edge_prior : dictionary
                nested dictionary, probability of neighboring vertices beloning
                to same parcels
        """

        # initialize Gibbs sampling object
        gibbs = sampling.Gibbs()
        nvox = len(adj_list)

        # normalize each feature to have zero mean, unit variance
        features = statistics.Normalize(features)

        stats = {
            'times': [],
            'lp': [],
            'max_lp': [],
            'K': [],
            'z': np.empty((0, nvox)),
            'c': np.empty((0, nvox)),
            'NMI': [],
            'deltaC': [],
            'boundC': []
        }

        # If initial clustering provided, generate spanning trees from it
        # Otherwise, if desired, generate spanning trees from Ward Clustering
        if np.any(init_z):
            init_c = subgraphs.ClusterSpanningTrees().fit(adj_list, init_z)
        elif self.ward:
            init_c = ward_clustering.Ward(features, adj_list, self.n_clusters)

        # build the initial parent-link array c and sparse link graph G
        [c, G] = subgraphs.sparse_linkage(adj_list, nvox, init_c)

        # compute initial parcel count and parcel assignments
        [K, z, parcels] = subgraphs.connected_components(G)
        self.init_z = z

        # compute log-likelihood of initial cortical map
        curr_lp = self._fullProbabilityDDCRP(parcels, features)

        max_lp = -1. * np.inf
        map_z, boundC, deltaC = [], [], []
        t0 = time.time()
        steps = 0

        order = np.arange(nvox)

        # perform mcmc_passes sweeps over all samples
        for mcmc_pass in np.arange(self.mcmc_passes):

            # shuffle sample order for each MCMC pass
            np.random.shuffle(order)

            for i in order:

                # if the current log-probability exceeds the running max,
                # store the current map as the best (MAP) estimate
                if curr_lp > max_lp:
                    max_lp = curr_lp
                    map_z = z

                # periodically record diagnostic statistics
                if steps % self.stats_interval == 0:
                    stats = statistics.UpdateStats(stats, t0,
                                                   curr_lp, max_lp, K, list(z),
                                                   list(c), steps, gt_z, map_z,
                                                   deltaC, boundC,
                                                   self.verbose)

                # remove current link to parent
                G[i, c[i]] = 0

                # if link was self-link
                if c[i] == i:
                    # Removing self-loop, parcellation won't change
                    rem_delta_lp = -np.log(self.alpha)
                    z_rem = z
                    parcels_rem = parcels
                else:
                    # otherwise compute new connected components
                    K_rem, z_rem, parcels_rem = subgraphs.connected_components(
                        G)

                    # if number of components changed
                    if K_rem != K:
                        # We split a cluster, compute change in likelihood
                        rem_delta_lp = -self._LogProbDifference(
                            parcels_rem, z_rem[i], z_rem[c[i]], features)

                    else:
                        rem_delta_lp = 0

                # get neighbors of sample i
                adj_list_i = adj_list[i]

                # initialize log-prob vector over candidate parents: one
                # entry per neighbor, plus a final entry for a self-link,
                # whose ddCRP prior weight is alpha
                lp = np.zeros((len(adj_list_i) + 1, ))
                lp[-1] = np.log(self.alpha)

                for j, n in enumerate(adj_list_i):
                    # linking into the old parent's parcel just undoes the
                    # split; subtract the self-link prior term if the
                    # removed link was a self-link
                    if z_rem[n] == z_rem[c[i]]:
                        lp[j] = -rem_delta_lp - (c[i] == i) * np.log(
                            self.alpha)

                    # (possibly) new merge
                    elif z_rem[n] != z_rem[i]:
                        lp[j] = self._LogProbDifference(
                            parcels_rem, z_rem[i], z_rem[n], features)

                # sample new neighbor according to Gibbs
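                # (assumption: Gibbs.sample normalizes the unnormalized
                # log-probabilities, e.g. via log-sum-exp, and draws one
                # index from the resulting categorical distribution; the
                # final entry corresponds to a self-link)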
                new_neighbor = gibbs.sample(lp)
                if new_neighbor < len(adj_list_i):
                    c[i] = adj_list_i[new_neighbor]
                else:
                    c[i] = i

                # update current full log-likelihood with new parcels
                curr_lp = curr_lp + rem_delta_lp + lp[new_neighbor]
                # add new edge to parent graph
                G[i, c[i]] = 1
                # compute new connected components
                [K_new, z_new, parcels_new] = subgraphs.connected_components(G)

                # track parcel-membership changes and boundary vertices
                deltaC = statistics.delta_C(parcels, parcels_new)
                boundC = statistics.boundaries(z_new, adj_list)
                K, z, parcels = K_new, z_new, parcels_new
                steps += 1

        # update diagnostic statistics
        stats = statistics.UpdateStats(stats, t0, curr_lp, max_lp, K, list(z),
                                       list(c), steps, gt_z, map_z, deltaC,
                                       boundC, self.verbose)

        # relabel parcel 0 with a new unique id, for visualization purposes
        map_z[np.where(map_z == 0)[0]] = map_z.max() + 1

        self.map_z_ = map_z
        self.stats_ = stats
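
A hedged end-to-end sketch of how this fit method might be called. The class name ddCRP and its constructor signature are assumptions inferred from the attributes used above (self.alpha, self.mcmc_passes, self.stats_interval, self.ward, self.n_clusters, self.verbose), not confirmed by the snippet:

# Hypothetical usage; the class name and constructor are assumptions.
import numpy as np

n = 500
features = np.random.rand(n, 20)  # n vertices, 20 features each
adj_list = {i: [j for j in (i - 1, i + 1) if 0 <= j < n]
            for i in range(n)}    # chain-graph adjacency list

model = ddCRP(alpha=10, mcmc_passes=30, stats_interval=100,
              ward=True, n_clusters=10, verbose=True)
model.fit(features, adj_list)

print(model.map_z_)        # MAP parcel label for each vertex
print(model.stats_['lp'])  # log-probability trace over sampling steps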