def cluster_data(img, thr, xyz_a, k=26):
    """Cluster the above-threshold voxels of img into connected components."""
    # Import packages
    from scipy.sparse import coo_matrix, cs_graph_components
    import numpy as np

    # Threshold the entire correlation map and find connected components,
    # storing the result in a sparse matrix
    val_idx = img > thr                   # mask of above-threshold voxels
    xyz_th = xyz_a[val_idx]               # 3D indices of the above-threshold voxels
    i, j, d = graph_3d_grid(xyz_th, k=k)  # edges between neighboring above-threshold voxels
    nvoxs = xyz_th.shape[0]               # number of correlated voxels in the entire network
    # store the connected nodes and weights in a sparse matrix
    adj = coo_matrix((d, (i, j)), shape=(nvoxs, nvoxs))

    # Identify the connected components (clusters) within the graph
    nc, labels = cs_graph_components(adj)

    # Copy the node labels to their voxel equivalents
    lbl_img = np.zeros(img.shape)  # map to store the label data
    # add 2 so that labels corresponding to unconnected voxels (-2)
    # will be zero in lbl_img, and label==0 will now equal 2
    lbl_img[val_idx] = labels + 2

    return lbl_img
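# The `labels + 2` offset above leans on the old cs_graph_components
# convention of labelling unconnected nodes -2 (exercised by the scipy test
# further below). A toy check of just that bookkeeping, with made-up labels:
import numpy as np

toy_labels = np.array([0, 0, 1, -2, 1])  # two clusters plus one isolated voxel
toy_img = toy_labels + 2                 # isolated -> 0 (background), clusters -> 2, 3
assert (toy_img == np.array([2, 2, 3, 0, 3])).all()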
def tree_information_sparse(forest, n_features):
    """Compute the mutual-information objective from a forest.

    Parameters
    ----------
    forest : sparse matrix
        Graph containing the trees representing the clusters.
    n_features : int
        Dimensionality of the input space.
    """
    entropy = 0
    sym_forest = forest + forest.T
    n_components, components = sparse.cs_graph_components(sym_forest)

    if np.any(components < 0):
        # there is a lonely node
        entropy -= 1e10

    for i in xrange(n_components):
        inds = np.where(components == i)[0]
        subforest = forest[inds[:, np.newaxis], inds]
        L = subforest.sum()
        n_samples_c = len(inds)
        if L == 0:
            warnings.warn("L is zero. This means there are identical points"
                          " in the dataset.")
            L = 1e-10
        entropy += n_samples_c * ((n_features - 1) * np.log(n_samples_c)
                                  - n_features * np.log(L))
    return entropy
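# Reading the objective: each component contributes
#     n_samples_c * ((n_features - 1) * log(n_samples_c) - n_features * log(L)),
# so trees that pack many points into a small total edge length L score
# higher. A back-of-the-envelope check with made-up numbers:
import numpy as np

n_samples_c, L, n_features = 4, 2.0, 3
term = n_samples_c * ((n_features - 1) * np.log(n_samples_c)
                      - n_features * np.log(L))
print(term)  # 4 * (2*ln 4 - 3*ln 2) ~= 2.77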
def plot_clustering(X, y=None, axes=None, three_d=False, forest=None):
    if y is None and forest is None:
        raise ValueError("give me y or a sparse matrix representing the"
                         " forest")
    if y is None:
        _, y = sparse.cs_graph_components(forest + forest.T)

    # project down to two or three dimensions for plotting if necessary
    if three_d and X.shape[1] > 3:
        X = RandomizedPCA(n_components=3).fit_transform(X)
    elif not three_d and X.shape[1] > 2:
        X = RandomizedPCA(n_components=2).fit_transform(X)

    if axes is None or three_d:
        plt.figure()
        axes = plt.gca()
    if three_d:
        axes = plt.gca(axes=axes, projection='3d')

    colors = np.array([x for x in 'bgrcmykbgrcmykbgrcmykbgrcmyk'] * 10)
    color = colors[y]
    if three_d:
        axes.scatter(X[:, 0], X[:, 1], X[:, 2], color=color)
    else:
        axes.scatter(X[:, 0], X[:, 1], color=color, s=10)

    if forest is not None:
        # draw the tree edges between the (projected) points
        for edge in np.vstack(forest.nonzero()).T:
            i, j = edge
            axes.plot([X[i, 0], X[j, 0]], [X[i, 1], X[j, 1]], c=color[i])

    axes.set_xticks(())
    axes.set_yticks(())
    return axes
def clusterByAABB(poly, opts, argv):
    # compute an axis-aligned bounding box (AABB) for every line cell
    aabbs = []
    aabbsOut = np.zeros((poly.GetLines().GetNumberOfCells(), 6))
    for (i, idl) in enumerate(lineGenerator(poly.GetLines())):
        pts = idListToPoints(idl, poly.GetPoints())
        aabb = AABB(pts)
        aabbs.append(aabb)
        aabbsOut[i, :] = aabb.toArray()

    # symmetric matrix of pairwise AABB overlaps
    overlaps = np.zeros((len(aabbs), len(aabbs)))
    for (i1, ab1) in enumerate(aabbs):
        for i2 in range(i1, len(aabbs)):
            if i1 == i2:
                overlaps[i1, i2] = 1
            else:
                ab2 = aabbs[i2]
                overlaps[i1, i2] = ab1.Intersect(ab2)
                overlaps[i2, i1] = overlaps[i1, i2]

    # threshold the overlaps into an adjacency matrix and take its
    # connected components as the clusters
    threshold = np.vectorize(lambda x: 1 if x > opts.overlap else 0,
                             otypes=[np.int32])
    adjacency = threshold(overlaps)
    conComps = scsp.cs_graph_components(adjacency)

    addCellIntArray(poly, "VortexCluster", conComps[1])
    aabbGlyphs = glyphAABB(aabbs)
    addCellIntArray(aabbGlyphs, "VortexCluster", conComps[1])

    if opts.aabbOut != "":
        np.savetxt(opts.aabbOut, aabbsOut)
    if opts.aabbLabels != "":
        np.savetxt(opts.aabbLabels, conComps[1])

    return (poly, aabbGlyphs)
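# The clustering step above boils down to connected components of a
# thresholded overlap matrix. A self-contained 1-D illustration with the
# current scipy.sparse.csgraph API (the interval data and the overlap
# threshold of zero are made up; the AABB class is not used here):
import numpy as np
from scipy.sparse.csgraph import connected_components

boxes = np.array([[0., 2.], [1., 3.], [10., 12.]])  # toy (lo, hi) intervals
n = len(boxes)
toy_adjacency = np.zeros((n, n), dtype=np.int32)
for a in range(n):
    for b in range(n):
        overlap = min(boxes[a, 1], boxes[b, 1]) - max(boxes[a, 0], boxes[b, 0])
        toy_adjacency[a, b] = 1 if overlap > 0 else 0

n_comp, labels = connected_components(toy_adjacency, directed=False)
assert n_comp == 2 and labels[0] == labels[1]  # boxes 0 and 1 overlap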
def euclidean_mst(X, neighbors_estimator, verbose=2):
    n_neighbors = min(2, X.shape[0])
    while True:
        # make sure we have a connected minimum spanning tree;
        # otherwise we need to consider more neighbors
        n_neighbors = 2 * n_neighbors
        if verbose > 1:
            print("Trying to build mst with %d neighbors." % n_neighbors)
        distances = neighbors_estimator.kneighbors_graph(
            X, n_neighbors=n_neighbors, mode='distance')
        n_components, component_indicators = \
            sparse.cs_graph_components(distances + distances.T)
        if len(np.unique(component_indicators)) > 1:
            continue
        distances.sort_indices()
        forest = minimum_spanning_tree(distances)
        _, inds = sparse.cs_graph_components(forest + forest.T)
        assert len(np.unique(inds)) == 1
        break
    return forest
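# cs_graph_components is gone from current scipy, so here is a rough,
# self-contained sketch of the same neighbor-doubling strategy using
# scipy.sparse.csgraph.connected_components and made-up data:
import numpy as np
from sklearn.neighbors import NearestNeighbors
from scipy.sparse.csgraph import connected_components, minimum_spanning_tree

X_toy = np.random.RandomState(0).rand(20, 2)
nn = NearestNeighbors().fit(X_toy)

n_neighbors = 2
while True:
    # double the neighborhood size until the kNN graph is connected
    n_neighbors = min(2 * n_neighbors, X_toy.shape[0] - 1)
    g = nn.kneighbors_graph(X_toy, n_neighbors=n_neighbors, mode='distance')
    if connected_components(g + g.T, directed=False)[0] == 1:
        break

mst = minimum_spanning_tree(g)
assert mst.nnz == X_toy.shape[0] - 1  # a spanning tree has n - 1 edges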
def __init__(self, clf, A=None, n_jobs=-1, copy=True):
    if copy and A is not None:
        self.A = A.copy()
    else:
        self.A = A
    self.copy = copy
    self.clf = clf
    self.n_jobs = n_jobs
    # record how many connected components the graph A has
    if A is not None:
        self.n_components_A = sparse.cs_graph_components(A)[0]
    else:
        self.n_components_A = 1
def test_cs_graph_components(self):
    import numpy as np
    from scipy.sparse import csr_matrix, cs_graph_components

    D = np.eye(4, dtype=np.bool)

    n_comp, flag = cs_graph_components(csr_matrix(D))
    assert_(n_comp == 4)
    assert_equal(flag, [0, 1, 2, 3])

    D[0, 1] = D[1, 0] = 1

    n_comp, flag = cs_graph_components(csr_matrix(D))
    assert_(n_comp == 3)
    assert_equal(flag, [0, 0, 1, 2])

    # A pathological case: a node with no edges at all gets the
    # sentinel label -2 and is not counted as a component
    D[2, 2] = 0
    n_comp, flag = cs_graph_components(csr_matrix(D))
    assert_(n_comp == 2)
    assert_equal(flag, [0, 0, -2, 1])
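# cs_graph_components was deprecated and later removed from scipy.sparse;
# its replacement, scipy.sparse.csgraph.connected_components, counts an
# isolated node as an ordinary one-node component instead of flagging it
# with -2. A runnable contrast on the same matrix as the test above:
import numpy as np
from scipy.sparse import csr_matrix
from scipy.sparse.csgraph import connected_components

D = np.eye(4, dtype=bool)
D[0, 1] = D[1, 0] = 1
D[2, 2] = 0  # node 2 now has no edges at all

n_comp, labels = connected_components(csr_matrix(D), directed=False)
assert n_comp == 3                   # {0, 1}, {2}, {3}
assert len(np.unique(labels)) == 3   # no -2 sentinel, all labels >= 0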
def cc(self):
    """Compute the different connected components of the graph.

    Returns
    -------
    label : array of shape (self.V)
        Labelling of the vertices.
    """
    try:
        from scipy.sparse import cs_graph_components
        _, label = cs_graph_components(self.adjacency())
    except:
        # fall back to a pure-Python labelling if scipy's
        # cs_graph_components is unavailable
        lil = self.to_coo_matrix().tolil().rows.tolist()
        label = lil_cc(lil)
    return label
def get_from_fiber_graph(self, G):
    # connected components of the symmetrized graph; with the old
    # cs_graph_components API, isolated vertices are labelled -2
    self.ncc, vertexCC = sp.cs_graph_components(G + G.transpose())

    self.n = vertexCC.shape[0]

    # relabel components by size: largest -> 1, second largest -> 2, ...
    # and map the isolated-vertex sentinel -2 to 0
    noniso = np.nonzero(np.not_equal(vertexCC, -2))[0]
    cccounter = Counter(vertexCC[noniso])
    cc_badLabel, _ = zip(*cccounter.most_common())
    cc_dict = dict(zip(cc_badLabel, np.arange(self.ncc) + 1))
    cc_dict[-2] = 0

    self.vertexCC = np.array([cc_dict[v] for v in vertexCC])
    self.ccsize = Counter(vertexCC)
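# The relabelling above is easiest to see on toy data: components are
# renumbered by size (largest -> 1) and the -2 isolated-vertex sentinel is
# mapped to 0. A small self-contained check with made-up labels:
import numpy as np
from collections import Counter

vertexCC_toy = np.array([0, 0, 0, 1, 1, -2, 2])
noniso = np.nonzero(vertexCC_toy != -2)[0]
by_size, _ = zip(*Counter(vertexCC_toy[noniso]).most_common())
cc_dict = dict(zip(by_size, np.arange(len(by_size)) + 1))
cc_dict[-2] = 0
relabeled = np.array([cc_dict[v] for v in vertexCC_toy])
assert list(relabeled) == [1, 1, 1, 2, 2, 0, 3]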
def fit(self, X):
    self.nearest_neighbors_ = NearestNeighbors(
        algorithm=self.nearest_neighbor_algorithm)
    self.nearest_neighbors_.fit(X)
    forest = euclidean_mst(X, self.nearest_neighbors_)

    # consider the MST edges from heaviest to lightest
    weights = forest.data
    inds = np.argsort(weights)[::-1]
    edges = np.vstack(forest.nonzero()).T
    n_samples = len(edges) + 1
    i = 0
    while len(forest.nonzero()[0]) > n_samples - self.n_clusters:
        e = edges[inds[i]]
        forest[e[0], e[1]] = 0
        if np.min(sparse.cs_graph_components(forest + forest.T)[1]) < 0:
            # only one node in the new component, which messes up
            # cs_graph_components; put the edge back
            forest[e[0], e[1]] = weights[inds[i]]
        elif np.min(np.bincount(
                sparse.cs_graph_components(forest + forest.T)[1])) < 2:
            # disallow small clusters; put the edge back
            forest[e[0], e[1]] = weights[inds[i]]
        i += 1
    self.labels_ = sparse.cs_graph_components(forest + forest.T)[1]
    return self
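# At heart the loop above is single-linkage with a minimum-size constraint:
# delete the heaviest remaining MST edges, but veto any cut that strands a
# component of fewer than two points. A stripped-down sketch of the basic
# cut (no size constraint), with toy data and the current csgraph API:
import numpy as np
from scipy.sparse.csgraph import minimum_spanning_tree, connected_components

X_toy = np.array([[0.], [1.], [10.], [11.]])  # two well-separated pairs
dist = np.abs(X_toy - X_toy.T)
mst = minimum_spanning_tree(dist).tolil()

# cutting the single heaviest MST edge yields two clusters
i, j = np.unravel_index(np.argmax(mst.toarray()), mst.shape)
mst[i, j] = 0
n_comp, labels = connected_components(mst + mst.T, directed=False)
assert n_comp == 2 and labels[0] == labels[1] != labels[2]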
def get_lcc_idx(G):
    """Determines and sorts the connected components of G.

    Each vertex in G is assigned a label corresponding to its connected
    component. The largest connected component is labelled 0, the second
    largest 1, etc.

    **NOTE**: All isolated vertices (i.e. vertices with no incident
    edges) are put in one connected component.
    """
    ncc, vertexCC = sp.cs_graph_components(G)

    # order components by size so the largest gets the smallest label
    cc_size = Counter(vertexCC)
    cc_size = sorted(cc_size.iteritems(), key=lambda cc: cc[1], reverse=True)
    cc_badLabel, _ = zip(*cc_size)
    cc_dict = dict(zip(cc_badLabel, np.arange(ncc + 1)))

    vertexCC = [cc_dict[vcc] for vcc in vertexCC]

    return np.array(vertexCC)
def build_sym_geom_adjacency(geoms, max_gnn=100):
    """Return the sparsest yet maximally connected symmetric geometrical
    adjacency matrix.
    """
    global INTERNAL_PARAMETERS
    min_gnn = INTERNAL_PARAMETERS['min_geom_neighbors']
    assert min_gnn < max_gnn, "Too high a minimum number of neighbors"

    n_pts = geoms.shape[0]
    for n_neighbors in range(min_gnn, max_gnn + 1):
        # find the lowest number of NN s.t. the graph is not too disconnected
        C = build_geom_neighbor_graph(geoms, n_neighbors)
        neighbs = C.indices.reshape((n_pts, n_neighbors))
        C = C + C.T
        C.data[:] = 1
        n_comp, _ = sparse.cs_graph_components(C)
        if n_comp == 1:
            print "# use n_neighbors=%d" % n_neighbors
            break
        elif n_comp < 1:
            raise ValueError('Bug: n_comp=%d' % n_comp)
    if n_comp > 1:
        print "# use maximum n_neighbors=%d (%d components)" % (
            n_neighbors, n_comp)
    return n_comp, C, neighbs
def fit(self, X):
    """
    Parameters
    ----------
    X : ndarray, shape (n_samples, n_features)
        Input data.

    Returns
    -------
    self
    """
    n_samples, n_features = X.shape
    self.nearest_neighbors_ = NearestNeighbors(
        algorithm=self.nearest_neighbor_algorithm)
    if self.verbose:
        print("Fitting neighbors data structure.")
    self.nearest_neighbors_.fit(X)
    if self.verbose:
        print("Data structure used: %s" % self.nearest_neighbors_._fit_method)
    if self.verbose:
        print("Building minimum spanning tree.")
    forest = euclidean_mst(X, self.nearest_neighbors_,
                           verbose=self.verbose)

    # the dimensionality of the space can be at most n_samples
    if self.infer_dimensionality:
        if self.verbose:
            print("Estimating dimensionality.")
        intrinsic_dimensionality = estimate_dimension(
            X, neighbors_estimator=self.nearest_neighbors_)
        if self.verbose > 0:
            print("Estimated dimensionality: %d" % intrinsic_dimensionality)
    elif n_samples < n_features:
        warnings.warn("Got dataset with n_samples < n_features. Setting"
                      " intrinsic dimensionality to n_samples. This is most"
                      " likely too high, leading to uneven clusters."
                      " It is recommended to set infer_dimensionality=True.")
        intrinsic_dimensionality = n_samples
    else:
        intrinsic_dimensionality = n_features

    if self.verbose:
        print("Cutting spanning tree.")
    clusters = [(forest, np.arange(n_samples))]
    cut_improvement = [itm_binary(forest.copy(), intrinsic_dimensionality,
                                  return_edge=True)]
    # init cluster_infos to anything; it doesn't matter
    # as there is only one component to start with
    cluster_infos = [0]
    y = np.zeros(n_samples, dtype=np.int)
    removed_edges = []
    # keep all possible next splits, pick the one with the highest gain
    while len(clusters) < self.n_clusters:
        if self.verbose > 1:
            print("Looking for split %d." % len(clusters))
        possible_improvements = (
            np.array([cut_i[1] * cut_i[0].shape[0]
                      for cut_i in cut_improvement])
            - np.array(cluster_infos))
        i_to_split = np.argmax(possible_improvements)
        split, info, edge = cut_improvement.pop(i_to_split)
        # get rid of the old cluster
        cluster_infos.pop(i_to_split)
        # we need the indices of the nodes in the cluster to keep
        # track of where each datapoint went
        _, old_inds = clusters.pop(i_to_split)
        removed_edges.append((old_inds[list(edge[:2])], edge[2]))

        n_split_components, split_components_indicator = \
            sparse.cs_graph_components(split + split.T)
        assert n_split_components == 2
        assert len(np.unique(split_components_indicator)) == 2

        for i in xrange(n_split_components):
            inds = np.where(split_components_indicator == i)[0]
            clusters.append((split[inds[np.newaxis, :], inds],
                             old_inds[inds]))
            mi = tree_information_sparse(clusters[-1][0],
                                         intrinsic_dimensionality)
            cluster_infos.append(mi)
            imp = itm_binary(clusters[-1][0].copy(),
                             intrinsic_dimensionality, return_edge=True)
            cut_improvement.append(imp)

    # the correspondence of nodes to datapoints is not stored in the
    # sparse matrices, but we saved the indices
    c_inds = [c[1] for c in clusters]
    y = np.empty(n_samples, dtype=np.int)
    assert len(np.hstack(c_inds)) == n_samples
    for i, c in enumerate(c_inds):
        y[c] = i

    # for computing the objective, we don't care about the indices
    result = block_diag([c[0] for c in clusters], format='csr')
    self.labels_ = y
    self.tree_information_ = (tree_information_sparse(
        result, intrinsic_dimensionality) / n_samples)
    return self