def Ward(D, adj_list, n_clusters):
    """
    Apply Ward clustering to feature data and convert the result
    to spanning-tree parent links.

    Parameters:
    - - - - -
    D : array
        input feature matrix
    adj_list : dictionary
        adjacency list
    n_clusters : int
        number of clusters to cut the linkage tree into

    Returns:
    - - - -
    c : array
        parent links for ward clusters
    """
    # zero-mean / unit-variance each feature before correlating
    normalized = statistics.Normalize(D)

    # pairwise sample similarity via correlation of normalized features
    similarity_matrix = np.corrcoef(normalized)

    # build the constrained linkage tree and cut it at n_clusters
    tree = ClusterTree(similarity_matrix, adj_list)
    labels = Cluster(tree, n=n_clusters)

    # convert the flat cluster labels into spanning-tree parent links
    return subgraphs.ClusterSpanningTrees().fit(adj_list, labels)
def fit(self, features, adj_list, init_z=None, init_c=None, gt_z=None,
        edge_prior=None):
    """
    Main function to fit the distance-dependent Chinese Restaurant Process.

    Runs `self.mcmc_passes` Gibbs-sampling sweeps over all samples,
    resampling each sample's parent link and tracking the maximum-
    a-posteriori parcellation.  Results are stored on the instance as
    `self.map_z_` (best map found) and `self.stats_` (diagnostics);
    nothing is returned.

    Parameters:
    - - - - -
    features : array
        data array of features for each sample
    adj_list : dictionary
        adjacency list of samples
    init_z : array
        initialized cortical map
    init_c : array
        initialized parent linkage matrix
    gt_z : array
        ground truth map for computing normalized mutual information
    edge_prior : dictionary
        nested dictionary, probability of neighboring vertices belonging
        to same parcels
        NOTE(review): `edge_prior` is never read in this body — confirm
        whether it is consumed elsewhere or is dead.
    """
    # initialize Gibbs sampling object
    gibbs = sampling.Gibbs()
    nvox = len(adj_list)

    # normalize each feature to have zero mean, unit variance
    features = statistics.Normalize(features)

    # diagnostic history, appended to by statistics.UpdateStats
    stats = {
        'times': [], 'lp': [], 'max_lp': [], 'K': [],
        'z': np.empty((0, nvox)), 'c': np.empty((0, nvox)),
        'NMI': [], 'deltaC': [], 'boundC': []}

    # If initial clustering provided, generate spanning trees from it
    # Otherwise, if desired, generate spanning trees from Ward Clustering
    if np.any(init_z):
        init_c = subgraphs.ClusterSpanningTrees().fit(adj_list, init_z)
    elif self.ward:
        init_c = ward_clustering.Ward(features, adj_list, self.n_clusters)

    # compute initial MST component array matrix
    # c: per-sample parent link, G: sparse parent graph
    [c, G] = subgraphs.sparse_linkage(adj_list, nvox, init_c)

    # compute initial parcel count and parcel assignments
    [K, z, parcels] = subgraphs.connected_components(G)
    self.init_z = z

    # compute log-likelihood of initial cortical map
    curr_lp = self._fullProbabilityDDCRP(parcels, features)

    # running maximum log-probability and the map that achieved it
    max_lp = -1. * np.inf
    map_z, boundC, deltaC = [], [], []
    t0 = time.time()
    steps = 0
    order = np.arange(nvox)

    # perform mcmc_passes sweeps over all samples
    for mcmc_pass in np.arange(self.mcmc_passes):
        # shuffle sample order for each MCMC pass
        np.random.shuffle(order)
        for i in order:
            # if current map log-probability greater than current max
            # set current map to best map
            if curr_lp > max_lp:
                max_lp = curr_lp
                map_z = z
            # periodically record diagnostics
            if steps % self.stats_interval == 0:
                stats = statistics.UpdateStats(
                    stats, t0, curr_lp, max_lp, K, list(z), list(c),
                    steps, gt_z, map_z, deltaC, boundC, self.verbose)

            # remove current link to parent
            G[i, c[i]] = 0

            # if link was self-link
            if c[i] == i:
                # Removing self-loop, parcellation won't change;
                # only the alpha term of the prior is removed
                rem_delta_lp = -np.log(self.alpha)
                z_rem = z
                parcels_rem = parcels
            else:
                # otherwise compute new connected components
                K_rem, z_rem, parcels_rem = subgraphs.connected_components(
                    G)
                # if number of components changed
                if K_rem != K:
                    # We split a cluster, compute change in likelihood
                    rem_delta_lp = -self._LogProbDifference(
                        parcels_rem, z_rem[i], z_rem[c[i]], features)
                else:
                    # removal did not split anything: likelihood unchanged
                    rem_delta_lp = 0

            # get neighbors of sample i
            adj_list_i = adj_list[i]

            # initialize empty log-prob vector
            # (one slot per neighbor + final slot for the self-link)
            lp = np.zeros((len(adj_list_i) + 1, ))
            lp[-1] = np.log(self.alpha)

            for j, n in enumerate(adj_list_i):
                # just undoing split
                if z_rem[n] == z_rem[c[i]]:
                    lp[j] = -rem_delta_lp - (c[i] == i) * np.log(
                        self.alpha)
                # (possibly) new merge
                elif z_rem[n] != z_rem[i]:
                    lp[j] = self._LogProbDifference(
                        parcels_rem, z_rem[i], z_rem[n], features)
                # else: neighbor already in i's parcel — delta stays 0

            # sample new neighbor according to Gibbs
            new_neighbor = gibbs.sample(lp)
            if new_neighbor < len(adj_list_i):
                c[i] = adj_list_i[new_neighbor]
            else:
                # last slot sampled: i becomes its own parent
                c[i] = i

            # update current full log-likelihood with new parcels
            curr_lp = curr_lp + rem_delta_lp + lp[new_neighbor]

            # add new edge to parent graph
            G[i, c[i]] = 1

            # compute new connected components
            [K_new, z_new, parcels_new] = subgraphs.connected_components(G)
            deltaC = statistics.delta_C(parcels, parcels_new)
            boundC = statistics.boundaries(z_new, adj_list)
            K, z, parcels = K_new, z_new, parcels_new
            steps += 1

    # update diagnostic statistics one final time after all passes
    stats = statistics.UpdateStats(
        stats, t0, curr_lp, max_lp, K, list(z), list(c),
        steps, gt_z, map_z, deltaC, boundC, self.verbose)

    # for visualization purposes
    # (relabel parcel 0 so every parcel id is nonzero)
    map_z[np.where(map_z == 0)[0]] = map_z.max() + 1

    self.map_z_ = map_z
    self.stats_ = stats