Example #1
def cluster_data(img, thr, xyz_a, k=26):
    '''Threshold a correlation map and label its connected voxel clusters.'''

    # Import packages
    from scipy.sparse import coo_matrix, cs_graph_components
    import numpy as np

    # Threshold the entire correlation map and find connected components,
    # storing the result in a sparse matrix
    val_idx = img > thr                   # store valid indices
    xyz_th = xyz_a[val_idx]               # 3D indices corresponding to the above-threshold voxels
    i, j, d = graph_3d_grid(xyz_th, k=k)  # connect the above-threshold voxels
    nvoxs = xyz_th.shape[0]               # number of correlated voxels in the entire network
    adj = coo_matrix((d, (i, j)), shape=(nvoxs, nvoxs))  # connected nodes and weights as a sparse matrix

    # Identify the connected components (clusters) within the graph
    nc, labels = cs_graph_components(adj)

    # Copy the node labels to their voxel equivalents
    lbl_img = np.zeros(img.shape)  # init lbl_img - map to store label data
    # add 2 so that labels corresponding to unconnected voxels (-2)
    # will be zero in lbl_img, and label==0 will now equal 2
    lbl_img[val_idx] = labels + 2
    return lbl_img
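
The examples on this page use scipy.sparse.cs_graph_components, which was deprecated in SciPy 0.11 and removed in later releases. A minimal sketch of the same labelling step with the replacement API, scipy.sparse.csgraph.connected_components (note it has no -2 sentinel, so the "+ 2" shift above becomes unnecessary):

# toy adjacency: one 3-node component plus an isolated node
import numpy as np
from scipy.sparse import coo_matrix
from scipy.sparse.csgraph import connected_components

i = np.array([0, 1])
j = np.array([1, 2])
d = np.ones(2)
adj = coo_matrix((d, (i, j)), shape=(4, 4))

n_components, labels = connected_components(adj, directed=False)
print(n_components)  # 2: the component {0, 1, 2} and the isolated node {3}
print(labels)        # [0 0 0 1] -- isolated nodes get ordinary labels, not -2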
Example #2
def tree_information_sparse(forest, n_features):
    """Compute the mutual information objective from a forest.

    Parameters
    ----------
    forest : sparse matrix
        Graph containing the trees representing the clusters.
    n_features : int
        Dimensionality of the input space.
    """
    import warnings
    import numpy as np
    from scipy import sparse

    entropy = 0
    sym_forest = forest + forest.T
    n_components, components = sparse.cs_graph_components(sym_forest)
    if np.any(components < 0):
        # cs_graph_components flags isolated ("lonely") nodes with -2
        entropy -= 1e10

    for i in xrange(n_components):
        inds = np.where(components == i)[0]
        subforest = forest[inds[:, np.newaxis], inds]
        L = subforest.sum()
        n_samples_c = len(inds)
        if L == 0:
            warnings.warn("L is zero. This means there are identical points "
                          "in the dataset.")
            L = 1e-10
        entropy += n_samples_c * ((n_features - 1) * np.log(n_samples_c) -
                                  n_features * np.log(L))
    return entropy
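
The quantity accumulated per cluster is n_c * ((d - 1) * log(n_c) - d * log(L_c)), with L_c the total edge weight of the cluster's spanning tree and d = n_features. A hypothetical toy call on a hand-built two-tree forest, assuming a SciPy old enough to still ship sparse.cs_graph_components:

from scipy import sparse

# two disjoint trees over four points: edge 0-1 (length 1) and edge 2-3
# (length 2), stored upper-triangular as minimum_spanning_tree would return
forest = sparse.lil_matrix((4, 4))
forest[0, 1] = 1.0
forest[2, 3] = 2.0
print(tree_information_sparse(forest.tocsr(), n_features=2))  # ~0.0 here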
Example #3
def plot_clustering(X, y=None, axes=None, three_d=False, forest=None):
    # local imports, following the style of the other snippets; RandomizedPCA
    # lived in sklearn.decomposition in this era of scikit-learn
    import numpy as np
    import matplotlib.pyplot as plt
    from scipy import sparse
    from sklearn.decomposition import RandomizedPCA

    if y is None and forest is None:
        raise ValueError("give me y or a sparse matrix representing the "
                         "forest")
    if y is None:
        _, y = sparse.cs_graph_components(forest + forest.T)
    if three_d and X.shape[1] > 3:
        X = RandomizedPCA(n_components=3).fit_transform(X)
    elif not three_d and X.shape[1] > 2:
        X = RandomizedPCA(n_components=2).fit_transform(X)
    if axes is None or three_d:
        plt.figure()
        axes = plt.gca()
    if three_d:
        axes = plt.gca(projection='3d')

    colors = np.array([x for x in 'bgrcmykbgrcmykbgrcmykbgrcmyk'] * 10)
    color = colors[y]
    if three_d:
        axes.scatter(X[:, 0], X[:, 1], X[:, 2], color=color)
    else:
        axes.scatter(X[:, 0], X[:, 1], color=color, s=10)
    if forest is not None:
        for edge in np.vstack(forest.nonzero()).T:
            i, j = edge
            axes.plot([X[i, 0], X[j, 0]], [X[i, 1], X[j, 1]], c=color[i])
    axes.set_xticks(())
    axes.set_yticks(())
    return axes
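
A hypothetical smoke test for plot_clustering, assuming matplotlib plus an era-appropriate scikit-learn (RandomizedPCA was removed in scikit-learn 0.20):

import matplotlib.pyplot as plt
from sklearn.datasets import make_blobs

# three well-separated 2-D blobs, coloured by their known labels
X, y = make_blobs(n_samples=100, centers=3, random_state=0)
plot_clustering(X, y=y)
plt.show()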
Example #4
def clusterByAABB(poly, opts, argv):
    aabbs = []
    aabbsOut = np.zeros((poly.GetLines().GetNumberOfCells(), 6))
    for (i, idl) in enumerate(lineGenerator(poly.GetLines())):
        pts = idListToPoints(idl, poly.GetPoints())
        aabb = AABB(pts)
        aabbs.append(aabb)
        aabbsOut[i, :] = aabb.toArray()
    overlaps = np.zeros((len(aabbs), len(aabbs)))
    for i1, ab1 in enumerate(aabbs):
        for i2 in range(i1, len(aabbs)):
            if i1 == i2:
                overlaps[i1, i2] = 1
            else:
                ab2 = aabbs[i2]
                overlaps[i1, i2] = ab1.Intersect(ab2)
                overlaps[i2, i1] = overlaps[i1, i2]
    # elementwise threshold; equivalent to (overlaps > opts.overlap).astype(np.int32)
    threshold = np.vectorize(lambda x: 1 if x > opts.overlap else 0,
                             otypes=[np.int32])
    adjacency = threshold(overlaps)
    conComps = scsp.cs_graph_components(adjacency)
    addCellIntArray(poly, "VortexCluster", conComps[1])
    aabbGlyphs = glyphAABB(aabbs)
    addCellIntArray(aabbGlyphs, "VortexCluster", conComps[1])
    if (opts.aabbOut != ""):
        np.savetxt(opts.aabbOut, aabbsOut)
    if (opts.aabbLabels != ""):
        np.savetxt(opts.aabbLabels, conComps[1])
    return (poly, aabbGlyphs)
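
lineGenerator, idListToPoints, AABB, addCellIntArray and glyphAABB are VTK helpers from the surrounding module, so the function is not runnable on its own. A self-contained sketch of just the core idea, with hypothetical axis-aligned boxes: threshold pairwise overlaps into a dense adjacency matrix and cluster it with cs_graph_components (which, as in the example above, also accepts dense arrays on old SciPy):

import numpy as np
import scipy.sparse as scsp

def overlap_1d(a, b):
    # fractional overlap of two intervals (lo, hi)
    inter = max(0.0, min(a[1], b[1]) - max(a[0], b[0]))
    return inter / min(a[1] - a[0], b[1] - b[0])

# boxes given as ((xlo, xhi), (ylo, yhi)); 0 and 1 overlap, 2 is far away
boxes = [((0, 2), (0, 2)), ((1, 3), (1, 3)), ((10, 12), (10, 12))]
n = len(boxes)
overlaps = np.eye(n)
for i in range(n):
    for j in range(i + 1, n):
        overlaps[i, j] = overlaps[j, i] = min(
            overlap_1d(boxes[i][0], boxes[j][0]),
            overlap_1d(boxes[i][1], boxes[j][1]))
adjacency = (overlaps > 0.25).astype(np.int32)
n_comp, labels = scsp.cs_graph_components(adjacency)
print(labels)  # boxes 0 and 1 share a cluster; box 2 gets its own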
Example #8
def euclidean_mst(X, neighbors_estimator, verbose=2):
    # assumes numpy and scipy; minimum_spanning_tree lives in
    # scipy.sparse.csgraph, cs_graph_components in the old scipy.sparse
    import numpy as np
    from scipy import sparse
    from scipy.sparse.csgraph import minimum_spanning_tree

    n_neighbors = min(2, X.shape[0])
    while True:
        # make sure we have a connected minimum spanning tree;
        # otherwise we need to consider more neighbors
        n_neighbors = 2 * n_neighbors
        if verbose > 1:
            print("Trying to build mst with %d neighbors." % n_neighbors)
        distances = neighbors_estimator.kneighbors_graph(
            X, n_neighbors=n_neighbors, mode='distance')
        n_components, component_indicators = \
            sparse.cs_graph_components(distances + distances.T)
        if len(np.unique(component_indicators)) > 1:
            # k-NN graph is disconnected: double the neighborhood and retry
            continue
        distances.sort_indices()
        forest = minimum_spanning_tree(distances)
        _, inds = sparse.cs_graph_components(forest + forest.T)
        assert len(np.unique(inds)) == 1
        break
    return forest
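
A hypothetical smoke test, assuming scikit-learn's NearestNeighbors and a SciPy (~0.11) in which sparse.cs_graph_components and csgraph.minimum_spanning_tree coexist:

import numpy as np
from sklearn.neighbors import NearestNeighbors

X = np.random.RandomState(0).rand(50, 2)
nn = NearestNeighbors().fit(X)
mst = euclidean_mst(X, nn, verbose=0)
print(mst.nnz)  # a spanning tree over 50 points has 49 edges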
Example #10
    def __init__(self, clf, A=None, n_jobs=-1, copy=True):
        if copy and A is not None:
            self.A = A.copy()
        else:
            self.A = A
        self.copy = copy
        self.clf = clf
        self.n_jobs = n_jobs
        if A is not None:
            # number of connected components of the fixed adjacency matrix A
            self.n_components_A = sparse.cs_graph_components(A)[0]
        else:
            self.n_components_A = 1
Example #11
    def test_cs_graph_components(self):
        import numpy as np
        from numpy.testing import assert_, assert_equal
        from scipy.sparse import csr_matrix, cs_graph_components

        D = np.eye(4, dtype=np.bool)

        n_comp, flag = cs_graph_components(csr_matrix(D))
        assert_(n_comp == 4)
        assert_equal(flag, [0, 1, 2, 3])

        D[0, 1] = D[1, 0] = 1

        n_comp, flag = cs_graph_components(csr_matrix(D))
        assert_(n_comp == 3)
        assert_equal(flag, [0, 0, 1, 2])

        # A pathological case: a node with no stored entries at all,
        # not even a diagonal one, is flagged -2 rather than labelled
        D[2, 2] = 0
        n_comp, flag = cs_graph_components(csr_matrix(D))
        assert_(n_comp == 2)
        assert_equal(flag, [0, 0, -2, 1])
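
This test pins down the convention the other examples work around: a vertex with no stored entries at all is flagged -2 instead of being counted as a component, which is why cluster_data shifts its labels by +2 and get_from_fiber_graph below maps -2 to 0. The modern replacement, scipy.sparse.csgraph.connected_components, gives isolated vertices ordinary labels instead.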
Example #13
    def cc(self):
        """Compute the different connected components of the graph.

        Returns
        -------
        label: array of shape (self.V), labelling of the vertices
        """
        try:
            from scipy.sparse import cs_graph_components
            _, label = cs_graph_components(self.adjacency())
        except ImportError:
            # fall back to a pure-Python labelling of the adjacency lists
            lil = self.to_coo_matrix().tolil().rows.tolist()
            label = lil_cc(lil)
        return label
Example #14
    def get_from_fiber_graph(self, G):
        self.ncc, vertexCC = sp.cs_graph_components(G + G.transpose())

        self.n = vertexCC.shape[0]

        # indices of non-isolated vertices (isolated vertices are flagged -2)
        noniso = np.nonzero(np.not_equal(vertexCC, -2))[0]

        # relabel components 1..ncc by decreasing size; isolated vertices get 0
        cccounter = Counter(vertexCC[noniso])
        cc_badLabel, _ = zip(*cccounter.most_common())
        cc_dict = dict(zip(cc_badLabel, np.arange(self.ncc) + 1))
        cc_dict[-2] = 0

        self.vertexCC = np.array([cc_dict[v] for v in vertexCC])
        self.ccsize = Counter(vertexCC)
Example #15
    def fit(self, X):
        self.nearest_neighbors_ = NearestNeighbors(
            algorithm=self.nearest_neighbor_algorithm)
        self.nearest_neighbors_.fit(X)
        forest = euclidean_mst(X, self.nearest_neighbors_)
        weights = forest.data
        inds = np.argsort(weights)[::-1]  # edge indices, heaviest first
        edges = np.vstack(forest.nonzero()).T
        n_samples = len(edges) + 1
        i = 0
        # cut the heaviest MST edges until n_clusters components remain
        while len(forest.nonzero()[0]) > n_samples - self.n_clusters:
            e = edges[inds[i]]
            forest[e[0], e[1]] = 0
            if np.min(sparse.cs_graph_components(forest + forest.T)[1]) < 0:
                # only one node in the new component; that messes up
                # cs_graph_components, so restore the edge (note the weight of
                # edges[inds[i]] is weights[inds[i]], not weights[i])
                forest[e[0], e[1]] = weights[inds[i]]
            elif np.min(np.bincount(
                    sparse.cs_graph_components(forest + forest.T)[1])) < 2:
                # disallow small clusters: restore the edge
                forest[e[0], e[1]] = weights[inds[i]]

            i += 1
        self.labels_ = sparse.cs_graph_components(forest + forest.T)[1]
        return self
Example #17
def get_lcc_idx(G):
    """Determine and sort the connected components of G.

    Each vertex in G is assigned a label corresponding to its connected
    component. The largest connected component is labelled 0, the second
    largest 1, etc.

    **NOTE**: All isolated vertices (i.e. vertices with no incident edges)
    are put into a single connected component.
    """
    ncc, vertexCC = sp.cs_graph_components(G)

    # order the component labels by decreasing component size
    cc_size = Counter(vertexCC)
    cc_size = sorted(cc_size.iteritems(), key=lambda cc: cc[1], reverse=True)
    cc_badLabel, _ = zip(*cc_size)
    cc_dict = dict(zip(cc_badLabel, np.arange(ncc + 1)))

    vertexCC = [cc_dict[vcc] for vcc in vertexCC]

    return np.array(vertexCC)
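
A hypothetical usage of get_lcc_idx on a small symmetric graph, assuming the module-level imports of its source (numpy as np, scipy.sparse as sp, collections.Counter) and an old SciPy:

import numpy as np
import scipy.sparse as sp

# two components: {0, 1, 2} (largest) and {3, 4}
rows = np.array([0, 1, 3])
cols = np.array([1, 2, 4])
A = sp.coo_matrix((np.ones(3), (rows, cols)), shape=(5, 5))
print(get_lcc_idx((A + A.T).tocsr()))  # [0 0 0 1 1]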
Example #19
def build_sym_geom_adjacency(geoms, max_gnn=100):
    """ Return the sparsest yet maximally connected symetric geometrical adjacency matrix
    """
    global INTERNAL_PARAMETERS
    min_gnn = INTERNAL_PARAMETERS['min_geom_neighbors']
    assert min_gnn < max_gnn, "Minimum number of neighbors is too high"
    n_pts = geoms.shape[0]
    for n_neighbors in range(min_gnn, max_gnn + 1):
        # find the lowest number of NN s.t. the graph is not too disconnected
        C = build_geom_neighbor_graph(geoms, n_neighbors)
        neighbs = C.indices.reshape((n_pts, n_neighbors))
        C = C + C.T
        C.data[:] = 1
        n_comp, _ = sparse.cs_graph_components(C)
        if n_comp == 1:
            print "# use n_neighbors=%d" % n_neighbors
            break
        elif n_comp < 1:
            raise ValueError('Bug: n_comp=%d' % n_comp)
    if n_comp > 1:
        print "# use maximum n_neighbors=%d (%d components)" % (
            n_neighbors, n_comp)
    return n_comp, C, neighbs
Example #20
    def fit(self, X):
        """
        Parameters
        ----------
        X : ndarray, shape (n_samples, n_features)
            Input data.

        Returns
        -------
        self
        """
        n_samples, n_features = X.shape

        self.nearest_neighbors_ = NearestNeighbors(algorithm=self.nearest_neighbor_algorithm)
        if self.verbose:
            print("Fitting neighbors data structure.")
        self.nearest_neighbors_.fit(X)
        if self.verbose:
            print("Datastructure used: %s" % self.nearest_neighbors_._fit_method)
        if self.verbose:
            print("Bulding minimum spanning tree.")
        forest = euclidean_mst(X, self.nearest_neighbors_, verbose=self.verbose)

        # the dimensionality of the space can at most be n_samples
        if self.infer_dimensionality:
            if self.verbose:
                print("Estimating dimensionality.")
            intrinsic_dimensionality = estimate_dimension(
                X, neighbors_estimator=self.nearest_neighbors_)
            if self.verbose > 0:
                print("Estimated dimensionality: %d" % intrinsic_dimensionality)
        elif n_samples < n_features:
            warnings.warn("Got dataset with n_samples < n_features. Setting"
                          "intrinsic dimensionality to n_samples. This is most"
                          " likely to high, leading to uneven clusters."
                          " It is recommendet to set infer_dimensionality=True.")
            intrinsic_dimensionality = n_samples
        else:
            intrinsic_dimensionality = n_features

        if self.verbose:
            print("Cutting spanning tree.")
        clusters = [(forest, np.arange(n_samples))]
        cut_improvement = [itm_binary(forest.copy(), intrinsic_dimensionality,
                                      return_edge=True)]
        # init cluster_infos to anything.
        # doesn't matter any way as there is only one component
        cluster_infos = [0]
        removed_edges = []
        # keep all possible next splits, pick the one with highest gain.
        while len(clusters) < self.n_clusters:
            if self.verbose > 1:
                print("Finding for split %d." % len(clusters))
            possible_improvements = (np.array([cut_i[1] * cut_i[0].shape[0] for
                                               cut_i in cut_improvement]) -
                                     np.array(cluster_infos))
            i_to_split = np.argmax(possible_improvements)
            split, info, edge = cut_improvement.pop(i_to_split)
            # get rid of old cluster
            cluster_infos.pop(i_to_split)
            # need the indices of the nodes in the cluster to keep track
            # of where our datapoint went
            _, old_inds = clusters.pop(i_to_split)
            removed_edges.append((old_inds[list(edge[:2])], edge[2]))

            n_split_components, split_components_indicator = \
                sparse.cs_graph_components(split + split.T)
            assert n_split_components == 2
            assert len(np.unique(split_components_indicator)) == 2

            for i in xrange(n_split_components):
                inds = np.where(split_components_indicator == i)[0]
                # inds[:, np.newaxis] selects the full square submatrix of
                # this component, matching tree_information_sparse above
                clusters.append((split[inds[:, np.newaxis], inds],
                                 old_inds[inds]))
                mi = tree_information_sparse(clusters[-1][0], intrinsic_dimensionality)
                cluster_infos.append(mi)
                imp = itm_binary(clusters[-1][0].copy(), intrinsic_dimensionality,
                                 return_edge=True)
                cut_improvement.append(imp)

        # correspondence of nodes to datapoints not present in sparse matrices
        # but we saved the indices.
        c_inds = [c[1] for c in clusters]
        y = np.empty(n_samples, dtype=np.int)
        assert len(np.hstack(c_inds)) == n_samples

        for i, c in enumerate(c_inds):
            y[c] = i

        # for computing the objective, we don't care about the indices
        result = block_diag([c[0] for c in clusters], format='csr')
        self.labels_ = y
        self.tree_information_ = (tree_information_sparse(result, intrinsic_dimensionality) /
                                  n_samples)
        return self