def compute_connected_components(adjacency_matrix):
    """
    Given an adjacency matrix of a graph, computes the number of connected components.

    Parameters
    ----------
    adjacency_matrix : adjacency matrix of the graph

    Returns
    -------
    cc : number of connected components in the graph represented by the adjacency matrix
    """
    if issparse(adjacency_matrix):
        difference_matrix = adjacency_matrix - adjacency_matrix.transpose()
        is_symmetric_p = np.all(1e-10 > difference_matrix.data)
        is_symmetric_n = np.all(difference_matrix.data > -1e-10)
    else:
        difference_matrix = adjacency_matrix - adjacency_matrix.T
        is_symmetric_p = np.all(1e-10 > difference_matrix)
        is_symmetric_n = np.all(difference_matrix > -1e-10)
    if is_symmetric_p and is_symmetric_n:
        return connected_components(adjacency_matrix, False)
    else:
        return connected_components(adjacency_matrix, True)
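# A minimal usage sketch (an assumption, not part of the original snippet): it relies on
# the same `np`, `issparse`, and `connected_components` imports the function above uses.
# A symmetric 4-node adjacency matrix with two components should yield a count of 2.
import numpy as np
from scipy.sparse import csr_matrix

adj = csr_matrix(np.array([[0, 1, 0, 0],
                           [1, 0, 0, 0],
                           [0, 0, 0, 1],
                           [0, 0, 1, 0]]))
n_cc, labels = compute_connected_components(adj)
print(n_cc)  # expected: 2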
def updateItemClusters(self, userID, chosenItem, itemClusterNum, articlePool):
    m = self.itemNum
    n = len(self.users)
    #UserNeighbor = {}
    for a in articlePool:
        if self.IGraph[chosenItem.id][a.id] == 1:
            #UserNeighbor[a.id] = np.ones([n,n])
            for i in range(n):
                diff = math.fabs(np.dot(self.users[userID].UserTheta, a.featureVector)
                                 - np.dot(self.users[i].UserTheta, a.featureVector))
                CB = self.alpha_2 * (np.sqrt(np.dot(np.dot(a.featureVector, self.users[userID].AInv), a.featureVector))
                                     + np.sqrt(np.dot(np.dot(a.featureVector, self.users[i].AInv), a.featureVector))) \
                     * np.sqrt(np.log10(self.time + 1))
                if diff > CB:
                    self.UserNeighbor[a.id][userID][i] = 0
                    self.UserNeighbor[a.id][i][userID] = 0
            if not np.array_equal(self.UserNeighbor[a.id], self.UGraph[itemClusterNum]):
                self.IGraph[chosenItem.id][a.id] = 0
                self.IGraph[a.id][chosenItem.id] = 0
                #print 'delete edge'
    self.N_components_Item, component_list_Item = connected_components(csr_matrix(self.IGraph))
    self.Iclusters = component_list_Item
    # For each new item cluster, allocate a new connected graph over users representing a single user cluster
    self.UGraph = []
    self.Uclusters = []
    for i in range(self.N_components_Item):
        if self.cluster_init == 'Erdos-Renyi':
            p = 3 * math.log(len(self.users)) / len(self.users)
            self.UGraph.append(np.random.choice([0, 1], size=(len(self.users), len(self.users)), p=[1 - p, p]))
        else:
            self.UGraph.append(np.ones([len(self.users), len(self.users)]))
        self.Uclusters.append([])
        N_components_U, components_U = connected_components(csr_matrix(self.UGraph[i]))
        self.Uclusters[i] = components_U
    return self.N_components_Item
def test_strong_connections():
    X1de = np.array([[0, 1, 0],
                     [0, 0, 0],
                     [0, 0, 0]])
    X2de = X1de + X1de.T

    X1sp = csgraph.csgraph_from_dense(X1de, null_value=0)
    X2sp = csgraph.csgraph_from_dense(X2de, null_value=0)

    for X in X1sp, X1de:
        n_components, labels = \
            csgraph.connected_components(X, directed=True,
                                         connection='strong')
        assert_equal(n_components, 3)
        labels.sort()
        assert_array_almost_equal(labels, [0, 1, 2])

    for X in X2sp, X2de:
        n_components, labels = \
            csgraph.connected_components(X, directed=True,
                                         connection='strong')
        assert_equal(n_components, 2)
        labels.sort()
        assert_array_almost_equal(labels, [0, 0, 1])
def __init__(self, dimension, alpha, lambda_, n, alpha_2, cluster_init="Complete"):
    self.time = 0
    #N_LinUCBAlgorithm.__init__(dimension = dimension, alpha=alpha,lambda_ = lambda_,n=n)
    self.users = []
    #algorithm have n users, each user has a user structure
    for i in range(n):
        self.users.append(CLUBUserStruct(dimension, lambda_, i))
    self.dimension = dimension
    self.alpha = alpha
    self.alpha_2 = alpha_2
    if (cluster_init == "Erdos-Renyi"):
        p = 3 * math.log(n) / n
        self.Graph = np.random.choice([0, 1], size=(n, n), p=[1 - p, p])
        self.clusters = []
        g = csr_matrix(self.Graph)
        N_components, components = connected_components(g)
    else:
        self.Graph = np.ones([n, n])
        self.clusters = []
        g = csr_matrix(self.Graph)
        N_components, components = connected_components(g)
    self.CanEstimateCoUserPreference = False
    self.CanEstimateUserPreference = False
    self.CanEstimateW = False
def test_grid_to_graph():
    # Checking that the function works with graphs containing no edges
    size = 2
    roi_size = 1
    # Generating two convex parts with one vertex
    # Thus, edges will be empty in _to_graph
    mask = np.zeros((size, size), dtype=np.bool)
    mask[0:roi_size, 0:roi_size] = True
    mask[-roi_size:, -roi_size:] = True
    mask = mask.reshape(size ** 2)
    A = grid_to_graph(n_x=size, n_y=size, mask=mask, return_as=np.ndarray)
    assert_true(connected_components(A)[0] == 2)

    # Checking that the function works whatever the type of mask is
    mask = np.ones((size, size), dtype=np.int16)
    A = grid_to_graph(n_x=size, n_y=size, n_z=size, mask=mask)
    assert_true(connected_components(A)[0] == 1)

    # Checking dtype of the graph
    mask = np.ones((size, size))
    A = grid_to_graph(n_x=size, n_y=size, n_z=size, mask=mask, dtype=np.bool)
    assert_true(A.dtype == np.bool)
    A = grid_to_graph(n_x=size, n_y=size, n_z=size, mask=mask, dtype=np.int)
    assert_true(A.dtype == np.int)
    A = grid_to_graph(n_x=size, n_y=size, n_z=size, mask=mask,
                      dtype=np.float64)
    assert_true(A.dtype == np.float64)
def _randomly_divide_connected_graph(adj, n_regions):
    """
    Divide the provided connected graph into `n_regions` regions.

    Parameters
    ----------
    adj : :class:`scipy.sparse.csr_matrix`
        Adjacency matrix.
    n_regions : int
        The desired number of clusters. Must be > 0 and <= number of nodes.

    Returns
    -------
    labels : :class:`numpy.ndarray`
        Each element (an integer in {0, ..., `n_regions` - 1}) specifies the
        region an area (defined by the index in the array) belongs to.

    Examples
    --------
    >>> from scipy.sparse import diags
    >>> n_nodes = 10
    >>> adj_diagonal = [1] * (n_nodes-1)
    >>> # 10x10 adjacency matrix representing the path 0-1-2-...-8-9
    >>> adj = diags([adj_diagonal, adj_diagonal], offsets=[-1, 1])
    >>> n_regions_desired = 4
    >>> labels = _randomly_divide_connected_graph(adj, n_regions_desired)
    >>> n_regions_obtained = len(set(labels))
    >>> n_regions_desired == n_regions_obtained
    True
    """
    if not n_regions > 0:
        msg = "n_regions is {} but must be positive.".format(n_regions)
        raise ValueError(msg)
    n_areas = adj.shape[0]
    if not n_regions <= n_areas:
        msg = "n_regions is {} but must be less than or equal to " \
              "the number of nodes which is {}".format(n_regions, n_areas)
        raise ValueError(msg)
    mst = csg.minimum_spanning_tree(adj)
    for _ in range(n_regions - 1):
        # try different links to cut and pick the one leading to the most
        # balanced solution
        best_link = None
        max_region_size = float("inf")
        for __ in range(5):
            mst_copy = mst.copy()
            nonzero_i, nonzero_j = mst_copy.nonzero()
            random_position = random.randrange(len(nonzero_i))
            i, j = nonzero_i[random_position], nonzero_j[random_position]
            mst_copy[i, j] = 0
            mst_copy.eliminate_zeros()
            labels = csg.connected_components(mst_copy, directed=False)[1]
            max_size = max(np.unique(labels, return_counts=True)[1])
            if max_size < max_region_size:
                best_link = (i, j)
                max_region_size = max_size
        mst[best_link[0], best_link[1]] = 0
        mst.eliminate_zeros()
    return csg.connected_components(mst)[1]
def intra_encounter_matching(): import numpy as np from scipy.sparse import coo_matrix, csgraph qreq_, cm_list = testdata_workflow() # qaids = [cm.qaid for cm in cm_list] # top_aids = [cm.get_top_aids(5) for cm in cm_list] aid_pairs = np.array([(cm.qaid, daid) for cm in cm_list for daid in cm.get_top_aids(5)]) top_scores = ut.flatten([cm.get_top_scores(5) for cm in cm_list]) N = aid_pairs.max() + 1 mat = coo_matrix((top_scores, aid_pairs.T), shape=(N, N)) csgraph.connected_components(mat) tree = csgraph.minimum_spanning_tree(mat) # NOQA import plottool as pt dense = mat.todense() pt.imshow(dense / dense.max() * 255) pt.show_if_requested() # baseline jobid import opengm # https://github.com/opengm/opengm/blob/master/src/interfaces/python/examples/tutorial/OpenGM%20tutorial.ipynb numVar = 10 unaries = np.ones([numVar, 3], dtype=opengm.value_type) gm = opengm.gm(np.ones(numVar, dtype=opengm.label_type) * 3) unary_fids = gm.addFunctions(unaries) gm.addFactors(unary_fids, np.arange(numVar)) infParam = opengm.InfParam( workflow=ut.ensure_ascii('(IC)(TTC-I,CC-I)'), ) inf = opengm.inference.Multicut(gm, parameter=infParam) visitor = inf.verboseVisitor(printNth=1, multiline=False) inf.infer(visitor) arg = inf.arg() # gridVariableIndices = opengm.secondOrderGridVis(img.shape[0], img.shape[1]) # fid = gm.addFunction(regularizer) # gm.addFactors(fid, gridVariableIndices) # regularizer = opengm.pottsFunction([3, 3], 0.0, beta) # gridVariableIndices = opengm.secondOrderGridVis(img.shape[0], img.shape[1]) # fid = gm.addFunction(regularizer) # gm.addFactors(fid, gridVariableIndices) unaries = np.random.rand(10, 10, 2) potts = opengm.PottsFunction([2, 2], 0.0, 0.4) gm = opengm.grid2d2Order(unaries=unaries, regularizer=potts) inf = opengm.inference.GraphCut(gm) inf.infer() arg = inf.arg() # NOQA """
def test_connect_regions_with_grid():
    try:
        face = sp.face(gray=True)
    except AttributeError:
        # Newer versions of scipy have face in misc
        from scipy import misc
        face = misc.face(gray=True)
    mask = face > 50
    graph = grid_to_graph(*face.shape, mask=mask)
    assert_equal(ndimage.label(mask)[1], connected_components(graph)[0])

    mask = face > 150
    graph = grid_to_graph(*face.shape, mask=mask, dtype=None)
    assert_equal(ndimage.label(mask)[1], connected_components(graph)[0])
def join_CCs_simple(X, W, num_ccs=1, verbose=False):
    """Old method for connecting the graph. Use join_CCs now."""
    n, labels = connected_components(W, directed=False, return_labels=True)
    CC_labels = labels
    while n > num_ccs:
        if verbose:
            print(n, 'connected components')
        Dcenter, min_edge_idxs = inter_cluster_distance(X, n, labels)
        p_inds, q_inds = min_k_indices(Dcenter, 2).T  # self + 1 == 2
        ii, jj = min_edge_idxs[p_inds, q_inds].T
        W[ii, jj] = 1
        W[jj, ii] = 1
        n, labels = connected_components(W, directed=False, return_labels=True)
    return CC_labels
def visCC(self):
    """fix me.... :/"""
    """to visualize the neighbours"""
    if isVisualize:
        fig888 = plt.figure()
        ax = plt.subplot(1, 1, 1)
    """ visualization, see if connected components make sense"""
    s111, c111 = connected_components(sparsemtx)  # s is the total CComponent, c is the label
    color = np.array([np.random.randint(0, 255) for _ in range(3 * int(s111))]).reshape(s111, 3)
    fig888 = plt.figure(888)
    ax = plt.subplot(1, 1, 1)
    # im = plt.imshow(np.zeros([528,704,3]))
    for i in range(s111):
        ind = np.where(c111 == i)[0]
        print(ind)
        for jj in range(len(ind)):
            startlimit = np.min(np.where(x[ind[jj], :] != 0))
            endlimit = np.max(np.where(x[ind[jj], :] != 0))
            # lines = ax.plot(x[ind[jj],startlimit:endlimit], y[ind[jj],startlimit:endlimit],color = (0,1,0),linewidth=2)
            lines = ax.plot(x[ind[jj], startlimit:endlimit], y[ind[jj], startlimit:endlimit],
                            color=(color[i - 1].T) / 255., linewidth=2)
        fig888.canvas.draw()
        plt.pause(0.0001)
    plt.show()
def markov_stationary_components(P, tol=1e-12):
    """
    Split the chain first to connected components, and solve the
    stationary state for the smallest one
    """
    n = P.shape[0]

    # 0. Drop zero edges
    P = P.tocsr()
    P.eliminate_zeros()

    # 1. Separate to connected components
    n_components, labels = csgraph.connected_components(P, directed=True, connection='strong')

    # 2. Pick the smallest one
    sizes = [(labels == j).sum() for j in range(n_components)]
    min_j = np.argmin(sizes)
    indices = np.flatnonzero(labels == min_j)
    #print("Solving for component {0}/{1} of size {2}".format(min_j, n_components, indices.size))

    # 3. Solve stationary state for it
    p = np.zeros(n)
    if indices.size == 1:
        # Simple case
        p[indices] = 1
    else:
        p[indices] = markov_stationary_one(P[indices, :][:, indices], tol=tol)

    return p
def assertSingleClass(self, P):
    """
    Check whether the rate/probability matrix consists of a single connected class.
    Otherwise, the steady state distribution is not well defined.
    """
    components, _ = csgraph.connected_components(P, directed=True, connection='weak')
    assert components == 1, "The Markov chain has %r communicating classes. " \
                            "Make sure there is a single communicating class." % components
def __init__(self, g_map=None, poly_map=None, connect_using_avg_resistances=False,
             connect_four_neighbors_only=False, g_graph=None, node_names=None):
    if g_map is not None:
        self.is_network = False
        self.g_map = g_map
        self.poly_map = poly_map
        self.connect_using_avg_resistances = connect_using_avg_resistances
        self.connect_four_neighbors_only = connect_four_neighbors_only

        self.node_map = HabitatGraph._construct_node_map(g_map, poly_map)
        (component_map, components) = HabitatGraph._construct_component_map(
            g_map, self.node_map, connect_using_avg_resistances, connect_four_neighbors_only)
        self.component_map = component_map
        self.components = components
        self.num_components = components.max()
        self.num_nodes = self.node_map.max()
    else:
        self.is_network = True
        self.g_graph = g_graph      # is the sparse CSR matrix
        self.node_map = node_names  # list of node names

        (_num_components, C) = connected_components(g_graph)
        C += 1  # shift labels so that component ids start at 1
        self.components = C
        self.num_components = C.max()
        self.num_nodes = self.node_map.size
def get_cc(self):
    """
    :return: (number of weakly connected components, array of component labels)
    """
    return csgraph.connected_components(self.edges, directed=True, connection='weak',
                                        return_labels=True)
def clusters(self):
    from scipy.sparse.csgraph import connected_components
    from scipy.sparse import csr_matrix

    n_samples = self.problem.n_samples
    n_features = self.problem.n_features

    # two centers are "the same" if they're much closer to each other than any
    # 2 data points are (excepting overlapping points)
    norm = self.problem.norm
    X, w = self.problem.X, self.problem.w
    distances = [norm(X[i] - X[j]) for (i, j, _) in iterrows(w)]
    distances = [d for d in distances if d > 0]
    if len(distances) == 0:
        epsilon = 1e-5  # arbitrary
    else:
        epsilon = min(d for d in distances if d > 0) * 1e-2

    edgelist = []
    for l, (i, j, _) in enumerate(iterrows(self.problem.w)):
        if np.linalg.norm(self.v[l]) / n_features < epsilon:
            edgelist.append((1, i, j))
            edgelist.append((1, j, i))

    if len(edgelist) > 0:
        vals, rows, cols = zip(*edgelist)
    else:
        vals, rows, cols = [], [], []

    adjacency = csr_matrix((vals, (rows, cols)), shape=(n_samples, n_samples))
    n_components, labels = connected_components(adjacency, directed=False)
    return labels
def _connected(cm, nodes, connection):
    """Test connectivity for the connectivity matrix."""
    if nodes is not None:
        cm = cm[np.ix_(nodes, nodes)]

    num_components, _ = connected_components(cm, connection=connection)
    return num_components < 2
def merge_candidates_scan(candidates, seriesuid, distance=5.):
    distances = pdist(candidates, metric='euclidean')
    adjacency_matrix = squareform(distances)

    # Determine nodes within distance, replace by 1 (=adjacency matrix)
    adjacency_matrix = np.where(adjacency_matrix <= distance, 1, 0)

    # Determine all connected components in the graph
    n, labels = connected_components(adjacency_matrix)
    new_candidates = np.zeros((n, 3))

    # Take the mean for these connected components
    for cluster_i in range(n):
        points = candidates[np.where(labels == cluster_i)]
        center = np.mean(points, axis=0)
        new_candidates[cluster_i, :] = center

    x = new_candidates[:, 0]
    y = new_candidates[:, 1]
    z = new_candidates[:, 2]
    labels = [seriesuid] * len(x)
    class_name = [0] * len(x)

    data = zip(labels, x, y, z, class_name)

    new_candidates = pd.DataFrame(data, columns=CANDIDATES_COLUMNS)

    return new_candidates
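# Hedged illustration (not from the original code) of the clustering idea used in
# merge_candidates_scan: threshold pairwise distances into an adjacency matrix and let
# connected_components group nearby candidates. The 5-unit threshold mirrors the
# function's default `distance` argument.
import numpy as np
from scipy.spatial.distance import pdist, squareform
from scipy.sparse.csgraph import connected_components

points = np.array([[0.0, 0.0, 0.0],
                   [1.0, 0.0, 0.0],      # within 5 units of the first point
                   [50.0, 50.0, 50.0]])  # far away, forms its own cluster
adjacency = np.where(squareform(pdist(points)) <= 5.0, 1, 0)
n_clusters, labels = connected_components(adjacency)
print(n_clusters, labels)  # expected: 2 [0 0 1]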
def split_dist_matrix(dist_matrix, overwrite=False):
    # Create the minimum spanning tree.
    # `overwrite=True` will make changes in place, which is more efficient.
    mst = minimum_spanning_tree(csr_matrix(dist_matrix), overwrite=overwrite)
    mst = mst.toarray()

    # Get the index of the maximum value.
    # `argmax` returns the index of the _flattened_ array;
    # `unravel_index` converts it back.
    idx = np.unravel_index(mst.argmax(), mst.shape)

    # Clear out the maximum value to split the tree.
    mst[idx] = 0

    # Label connected components.
    num_graphs, labels = connected_components(mst, directed=False)

    # We should have two trees.
    assert(num_graphs == 2)

    # Use indices as node ids and group them according to their graph.
    results = [[] for i in range(max(labels) + 1)]
    for idx, label in enumerate(labels):
        results[label].append(idx)

    return results
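# Minimal usage sketch for split_dist_matrix (an added example; assumes numpy plus the
# csr_matrix, minimum_spanning_tree and connected_components imports used by the
# function): cutting the longest MST edge of a 1-D point set splits it at the largest gap.
import numpy as np

points = np.array([0.0, 1.0, 2.0, 10.0, 11.0])
dist_matrix = np.abs(points[:, None] - points[None, :])
print(split_dist_matrix(dist_matrix))  # expected: [[0, 1, 2], [3, 4]]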
def compute_Ws(X, num_ccs):
    with Timer('Calculating pairwise distances...'):
        D = pairwise_distances(X, metric='sqeuclidean')
    np.save('mnist_D.npy', D)

    # k-nn
    with Timer('Calculating knn graph...'):
        for k in range(1, 10):
            Wknn = neighbor_graph(D, precomputed=True, k=k, symmetrize=True)
            n = connected_components(Wknn, directed=False, return_labels=False)
            if n <= num_ccs:
                break
        else:
            assert False, 'k too low'
    np.save('mnist_Wknn.npy', Wknn)
    print('knn (k=%d)' % k)

    # b-matching
    with Timer('Calculating b-matching graph...'):
        # using 8 decimal places kills the disk
        Wbma = hacky_b_matching(D, k, fmt='%.1f')
    np.save('mnist_Wbma.npy', Wbma)

    # msg
    with Timer('Calculating MSG graph...'):
        Wmsg = manifold_spanning_graph(X, 2, num_ccs=num_ccs)
    np.save('mnist_Wmsg.npy', Wmsg)

    return D, Wknn, Wbma, Wmsg
def is_connected(C, directed=True):
    r"""Return true, if the input count matrix is completely connected.
    Effectively checking if the number of connected components equals one.
    (EMMA function)

    Parameters
    ----------
    C : scipy.sparse matrix or numpy ndarray
        Count matrix specifying edge weights.
    directed : bool, optional
       Whether to compute connected components for a directed or
       undirected graph. Default is True.

    Returns
    -------
    connected : boolean, returning true only if C is connected.
    """
    from scipy.sparse import csr_matrix
    from scipy.sparse.sputils import isdense
    import scipy.sparse.csgraph as csgraph
    if isdense(C):
        C = csr_matrix(C)
    nc = csgraph.connected_components(C, directed=directed, connection='strong',
                                      return_labels=False)
    return nc == 1
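# Short usage sketch for the count-matrix is_connected above (an added example, assuming
# numpy is available and the function's own imports resolve): a chain whose last state is
# absorbing is weakly but not strongly connected.
import numpy as np

C = np.array([[10, 1, 0],
              [1, 10, 2],
              [0, 0, 10]])  # state 2 never transitions back
print(is_connected(C))                  # False: no single strongly connected component
print(is_connected(C, directed=False))  # True: ignoring direction, the graph is connected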
def assert_feasible(solution, adj, n_regions=None):
    """
    Parameters
    ----------
    solution : :class:`numpy.ndarray`
        Array of region labels.
    adj : :class:`scipy.sparse.csr_matrix`
        Adjacency matrix representing the contiguity relation.
    n_regions : `int` or `None`
        An `int` represents the desired number of regions.
        If `None`, then the number of regions is not checked.

    Raises
    ------
    exc : `ValueError`
        A `ValueError` is raised if clustering is not spatially contiguous.
        Given the `n_regions` argument is not `None`, a `ValueError` is raised
        also if the number of regions is not equal to the `n_regions` argument.
    """
    if n_regions is not None:
        if len(set(solution)) != n_regions:
            raise ValueError("The number of regions is {} but "
                             "should be {}".format(len(set(solution)), n_regions))
    for region_label in set(solution):
        _, comp_labels = csg.connected_components(adj)
        # check whether equal region_label implies equal comp_label
        comp_labels_in_region = comp_labels[solution == region_label]
        if not all_elements_equal(comp_labels_in_region):
            raise ValueError("Region {} is not spatially "
                             "contiguous.".format(region_label))
def is_ergodic(T, tol):
    """
    checks if T is 'ergodic'

    Parameters
    ----------
    T : scipy.sparse matrix
        Transition matrix
    tol : float
        tolerance

    Returns
    -------
    Truth value : bool
        True, if # strongly connected components = 1
        False, otherwise
    """
    if isdense(T):
        # a dense ndarray has no .tocsr(); convert it to sparse explicitly
        from scipy.sparse import csr_matrix
        T = csr_matrix(T)
    if not is_transition_matrix(T, tol):
        raise ValueError("given matrix is not a valid transition matrix.")

    num_components = connected_components(T, directed=True,
                                          connection='strong',
                                          return_labels=False)

    return num_components == 1
def _spectral_embedding(self, affinity_matrix):
    """
    Computes spectral embedding.
    First calculates normalized laplacian
    Then does the eigenvalue decomposition
    """
    numComponents, labels = connected_components(affinity_matrix)

    if numComponents > 1:
        # for each component figure out embedding, return the complete embedding
        embedding = []
        connected_component = np.zeros(affinity_matrix.shape)
        for i in range(numComponents):
            for j in range(affinity_matrix.shape[0]):
                if labels[j] == i:
                    # keep the affinity columns belonging to component i
                    connected_component[:, j] = affinity_matrix[:, j]
            embedding.append(self._spectral_embedding(connected_component))
        return embedding

    self.n_components += 1
    L, diag_vector = laplacian(affinity_matrix, normed=True, return_diag=True)
    D = np.diag(diag_vector)
    # eigvals, eigvects = eigsh(-L, k=self.n_components, sigma=1.0, which='LM')
    eigvals, eigvects = eigh(L)
    embedding = eigvects.T[: self.n_components] * diag_vector
    return embedding[1: self.n_components].T
def is_connected(adj):
    """
    Parameters
    ----------
    adj : :class:`scipy.sparse.csr_matrix`
        Adjacency matrix.

    Returns
    -------
    connected : `bool`
        `True` if graph defined by adjacency matrix `adj` is connected.
        `False` otherwise.

    Examples
    --------
    >>> import numpy as np
    >>> from scipy.sparse import csr_matrix
    >>> connected = csr_matrix(np.array([[0, 1],
    ...                                  [1, 0]]))
    >>> is_connected(connected)
    True
    >>> disconnected = csr_matrix(np.array([[0, 0],
    ...                                     [0, 0]]))
    >>> is_connected(disconnected)
    False
    """
    n_connected_components = csg.connected_components(adj, directed=False,
                                                      return_labels=False)
    return True if n_connected_components == 1 else False
def nearest_neighbor_grouping(connectivity, data_matrix, n_clusters, thr):
    """ Cluster according to nn and reduce the data and connectivity
    """
    # Nearest neighbor connectivity
    nn_connectivity = _nn_connectivity(connectivity, thr)
    n_features = connectivity.shape[0]
    n_labels = n_features - (nn_connectivity + nn_connectivity.T).nnz / 2

    if n_labels < n_clusters:
        # cut some links to achieve the desired number of clusters
        alpha = n_features - n_clusters
        nn_connectivity = nn_connectivity + nn_connectivity.T
        edges_ = np.array(nn_connectivity.nonzero())
        plop = edges_[0] - edges_[1]
        select = np.argsort(plop)[:alpha]
        nn_connectivity = coo_matrix(
            (np.ones(2 * alpha),
             np.hstack((edges_[:, select], edges_[::-1, select]))),
            (n_features, n_features))

    # Clustering step: getting the connected components of the nn matrix
    n_labels, labels = csgraph.connected_components(nn_connectivity)

    # Reduction step: reduction by averaging
    reduced_connectivity, reduced_data_matrix = reduce_data_and_connectivity(
        labels, n_labels, connectivity, data_matrix, thr)

    return reduced_connectivity, reduced_data_matrix, labels
def test_strongly_connected():
    # A disconnected matrix.
    cm1 = np.array([[0, 0, 1],
                    [0, 1, 0],
                    [1, 0, 0]])
    # A strongly connected matrix.
    cm2 = np.array([[0, 1, 0],
                    [0, 0, 1],
                    [1, 0, 0]])
    # A weakly connected matrix.
    cm3 = np.array([[0, 1, 0],
                    [0, 0, 1],
                    [0, 1, 0]])
    assert connected_components(csr_matrix(cm1), connection='strong')[0] > 1
    assert connected_components(csr_matrix(cm2), connection='strong')[0] == 1
    assert connected_components(csr_matrix(cm3), connection='strong')[0] > 1
def number_of_islands(geom, mask):
    """
    Search a given pixel mask for connected clusters.
    This can be used to separate between gamma and hadronic showers.

    Parameters
    ----------
    geom: `~ctapipe.instrument.CameraGeometry`
        Camera geometry information
    mask: ndarray
        input mask (array of booleans)

    Returns
    -------
    num_islands: int
        Total number of clusters
    island_labels: ndarray
        Contains cluster membership of each pixel.
        Dimension equals input mask.
        Entries range from 0 (not in the pixel mask) to num_islands.
    """
    # compress sparse neighbor matrix
    neighbor_matrix_compressed = geom.neighbor_matrix_sparse[mask][:, mask]
    # pixels in no cluster have label == 0
    island_labels = np.zeros(geom.n_pixels)

    num_islands, island_labels_compressed = connected_components(
        neighbor_matrix_compressed, directed=False
    )

    # count clusters from 1 onwards
    island_labels[mask] = island_labels_compressed + 1

    return num_islands, island_labels
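# Hedged illustration (not part of the original module) of the masking pattern used in
# number_of_islands: restrict a sparse neighbor matrix to the pixels inside the mask
# before labelling islands.
import numpy as np
from scipy.sparse import csr_matrix
from scipy.sparse.csgraph import connected_components

neighbor_matrix = csr_matrix(np.array([[0, 1, 0, 0],
                                       [1, 0, 1, 0],
                                       [0, 1, 0, 0],
                                       [0, 0, 0, 0]]))
mask = np.array([True, True, False, True])
compressed = neighbor_matrix[mask][:, mask]
num_islands, labels = connected_components(compressed, directed=False)
print(num_islands, labels)  # expected: 2 [0 0 1]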
def steadystate_solve(K):
    # Reformulate K to remove sink/source states
    n_components, component_assignments = csgraph.connected_components(K, connection="strong")
    largest_component = Counter(component_assignments).most_common(1)[0][0]
    components = np.where(component_assignments == largest_component)[0]

    ii = np.ix_(components, components)
    K_mod = K[ii]
    K_mod = normalize(K_mod)

    eigvals, eigvecs = np.linalg.eig(K_mod.T)
    eigvals = np.real(eigvals)
    eigvecs = np.real(eigvecs)

    maxi = np.argmax(eigvals)

    if not np.allclose(np.abs(eigvals[maxi]), 1.0):
        print('WARNING: Steady-state undetermined for current iteration')
        bin_prob = K.diagonal().copy()
        bin_prob = bin_prob / np.sum(bin_prob)
        return bin_prob

    sub_bin_prob = eigvecs[:, maxi] / np.sum(eigvecs[:, maxi])

    bin_prob = np.zeros(K.shape[0])
    bin_prob[components] = sub_bin_prob

    return bin_prob
def tree_information_sparse(forest, n_features):
    """Computes mutual information objective from forest.

    Parameters
    ----------
    forest: sparse matrix
        graph containing trees representing cluster
    n_features: int
        dimensionality of input space.
    """
    entropy = 0
    sym_forest = forest + forest.T
    n_components, components = connected_components(sym_forest)
    if np.any(components < 0):
        # there is a lonely node
        entropy -= 1e10

    # n_samples = len(components)

    for i in range(n_components):
        inds = np.where(components == i)[0]
        subforest = forest[inds[:, np.newaxis], inds]
        L = subforest.sum()
        n_samples_c = len(inds)
        if L == 0:
            warnings.warn("L is zero. This means there are identical points in"
                          " the dataset")
            L = 1e-10
        entropy += (n_samples_c * ((n_features - 1) * np.log(n_samples_c)
                                   - n_features * np.log(L)))
    return entropy
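# Hedged usage sketch for tree_information_sparse (an added example; assumes numpy,
# warnings and connected_components are imported as the function expects): evaluate the
# objective on the minimum spanning tree of a tiny 2-D dataset.
import numpy as np
from scipy.sparse import csr_matrix
from scipy.sparse.csgraph import minimum_spanning_tree
from scipy.spatial.distance import pdist, squareform

X = np.array([[0.0, 0.0], [0.0, 1.0], [5.0, 5.0], [5.0, 6.0]])
forest = minimum_spanning_tree(csr_matrix(squareform(pdist(X))))
print(tree_information_sparse(forest, n_features=2))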
def generate_threshold_mesh(self, min_value=0.0, max_value=1.0e9):
    r"""
    Generates a mesh excluding all blocks below the min_value arg. Regions
    that are isolated by the thresholding are also automatically removed.
    """
    #
    # thresholding the data and then checking for isolated clusters
    self._field.threshold_data(min_value, max_value, repl=0.0)
    self._field.copy_data(self)
    #
    adj_matrix = self._field.create_adjacency_matrix()
    num_cs, cs_ids = csgraph.connected_components(csgraph=adj_matrix, directed=False)
    # only saving the largest cluster
    if num_cs > 1:
        cs_count = sp.zeros(num_cs, dtype=int)
        for cs_num in cs_ids:
            cs_count[cs_num] += 1
        self.data_vector[sp.where(cs_ids != sp.argmax(cs_count))[0]] = 0.0
        self.data_map = sp.reshape(self.data_vector, (self.nz, self.nx))
    #
    self._field.data_map = self.data_map
    self._field.data_vector = sp.ravel(self.data_map)
    #
    # generating blocks and vertices
    mask = self.data_map > 0.0
    self._generate_masked_mesh(cell_mask=mask)
def mergeable(self, clusters, parent=None):
    clusters = list(clusters)
    return connected_components(self._neighbours.loc[clusters, clusters],
                                directed=False, return_labels=False) == 1
def reduction_implication_network(self, rounding_parameter): """ Use an implication network to reduce the current problem instance. :param rounding_parameter: the projection data will be truncated to [rounding_parameter], in order to be able to use integer capacities in the implication network. """ current_N = self.A.shape[1] A_csc = self.A.tocsc() B = 10**rounding_parameter * A_csc.transpose() * A_csc int_y = (10**rounding_parameter * self.y + 0.1 * np.ones_like(self.y)).astype(int) diagonal = B.diagonal() B -= csr_matrix(np.diag(diagonal)) A_y = self.A.transpose() @ int_y outeredges = csr_matrix(2 * A_y - diagonal) Adjacency_matrix = 2 * sparse.bmat( [[None, 2 * B, None, None], [None, None, None, outeredges.transpose()], [outeredges, None, None, None], [None, None, 0, None]], format='csr') max_flow_output = csgraph.maximum_flow(Adjacency_matrix, 2 * current_N, 2 * current_N + 1) flow = max_flow_output.residual symmetric_central_flow = flow[:current_N, current_N:2 * current_N] + flow[:current_N, current_N:2 * current_N].transpose() symmetric_central_flow.data //= 2 flow[:current_N, current_N:2 * current_N] = symmetric_central_flow flow[current_N:2 * current_N, :current_N] = -symmetric_central_flow symmetric_outer_flow = \ flow[2*current_N, :current_N] + flow[current_N: 2 * current_N, 2*current_N + 1].transpose() symmetric_outer_flow.data //= 2 flow[2 * current_N, :current_N] = symmetric_outer_flow flow[current_N:2 * current_N, 2 * current_N + 1] = symmetric_outer_flow.transpose() residual = Adjacency_matrix - flow residual.eliminate_zeros() n_components, labels = csgraph.connected_components( residual, connection='strong') component_type = np.zeros(n_components, dtype=int) # Type 14: u and 1 - u are both contained in the component # Type 15: u is contained in the component, 1 - u is not, no path from u to u - 1 # Type 16: u is contained in the component, 1 - u is not, there exists a path from u to u - 1 indices = [] vals = [] for i in range(current_N): component = labels[i] if component_type[component] == 0: if component == labels[i + current_N]: component_type[component] = 14 else: reachable = csgraph.breadth_first_order( residual, i, return_predecessors=False) if i + current_N in reachable: component_type[component] = 16 else: component_type[component] = 15 if component_type[component] == 15: indices.append(i) vals.append(1) elif component_type[component] == 16: indices.append(i) vals.append(0) no_reductions = len(indices) order = np.array(indices).argsort() for j in range(no_reductions): i = order[no_reductions - j - 1] self.problem_reduction_single(indices[i], vals[i]) return 0
def mergeROIS(Y_res, A, b, C, f, d1, d2, P_, thr=0.8, mx=50, sn=None, deconv_method='spgl1', min_size=3, max_size=8, dist=3, method_exp='ellipse', expandCore=iterate_structure(generate_binary_structure(2, 1), 2).astype(int)): """ merging of spatially overlapping components that have highly correlated tmeporal activity % The correlation threshold for merging overlapping components is user specified in P.merge_thr (default value 0.85) % Inputs: % Y_res: residual movie after subtracting all found components % A: matrix of spatial components % b: spatial background % C: matrix of temporal components % f: temporal background % P: parameter struct % Outputs: % A: matrix of new spatial components % C: matrix of new temporal components % nr: new number of components % merged_ROIs: list of old components that were merged % Written by: % Andrea Giovannucci from implementation of Eftychios A. Pnevmatikakis, Simons Foundation, 2015 """ #% nr = A.shape[1] [d, T] = np.shape(Y_res) C_corr = np.corrcoef(C[:nr, :], C[:nr, :])[:nr, :nr] FF1 = C_corr >= thr #find graph of strongly correlated temporal components A_corr = A.T * A A_corr.setdiag(0) FF2 = A_corr > 0 # % find graph of overlapping spatial components FF3 = np.logical_and(FF1, FF2.todense()) FF3 = coo_matrix(FF3) c, l = csgraph.connected_components(FF3) # % extract connected components p = len(P_[0]['gn']) MC = [] for i in range(c): if np.sum(l == i) > 1: MC.append((l == i).T) MC = np.asarray(MC).T if MC.ndim > 1: cor = np.zeros((np.shape(MC)[1], 1)) for i in range(np.size(cor)): fm = np.where(MC[:, i])[0] for j1 in range(np.size(fm)): for j2 in range(j1 + 1, np.size(fm)): print j1, j2 cor[i] = cor[i] + C_corr[fm[j1], fm[j2]] Y_res = Y_res + np.dot(b, f) if np.size(cor) > 1: ind = np.argsort(np.squeeze(cor))[::-1] else: ind = [0] nm = min((np.size(ind), mx)) # number of merging operations A_merged = coo_matrix((d, nm)).tocsr() C_merged = np.zeros((nm, T)) P_merged = [] merged_ROIs = [] #% for i in range(nm): P_cycle = dict() merged_ROI = np.where(MC[:, ind[i]])[0] merged_ROIs.append(merged_ROI) nC = np.sqrt(np.sum(C[merged_ROI, :]**2, axis=1)) # A_merged[:,i] = np.squeeze((A[:,merged_ROI]*spdiags(nC,0,len(nC),len(nC))).sum(axis=1)) A_merged[:, i] = csr_matrix( (A[:, merged_ROI] * spdiags(nC, 0, len(nC), len(nC))).sum(axis=1)) Y_res = Y_res + A[:, merged_ROI] * C[merged_ROI, :] aa_1 = scipy.sparse.linalg.spsolve( spdiags(nC, 0, len(nC), len(nC)), C[merged_ROI, :]) aa_2 = (aa_1).mean(axis=0) ff = np.nonzero(A_merged[:, i])[0] cc, _, _, Ptemp = update_temporal_components( np.asarray(Y_res[ff, :]), A_merged[ff, i], b[ff], aa_2, f, p=p, deconv_method=deconv_method) aa, bb, cc = update_spatial_components(np.asarray(Y_res), cc, f, A_merged[:, i], d1=d1, d2=d2, sn=sn, min_size=min_size, max_size=max_size, dist=dist, method=method_exp, expandCore=expandCore) A_merged[:, i] = aa.tocsr() cc, _, _, Ptemp = update_temporal_components( Y_res[ff, :], A_merged[ff, i], bb[ff], cc, f, p=p, deconv_method=deconv_method) P_cycle = P_[merged_ROI[0]].copy() P_cycle['gn'] = Ptemp[0]['gn'] P_cycle['b'] = Ptemp[0]['b'] P_cycle['c1'] = Ptemp[0]['c1'] P_cycle['neuron_sn'] = Ptemp[0]['neuron_sn'] P_merged.append(P_cycle) C_merged[i, :] = cc if i + 1 < nm: Y_res[ff, :] = Y_res[ff, :] - A_merged[ff, i] * cc #% neur_id = np.unique(np.hstack(merged_ROIs)) good_neurons = np.setdiff1d(range(nr), neur_id) A = scipy.sparse.hstack((A[:, good_neurons], A_merged.tocsc())) C = np.vstack((C[good_neurons, :], C_merged)) # P_new=list(P_[good_neurons].copy()) P_new = [P_[pp] for pp in 
good_neurons] for p in P_merged: P_new.append(p) nr = nr - len(neur_id) + nm else: warnings.warn('No neurons merged!') merged_ROIs = [] P_new = P_ return A, C, nr, merged_ROIs, P_new
def merge_quantified_calls(args, dbo_args, endpoint_args): myprint('merging candidate calls') quantified_svcall_list = read_object_file(args.quantified_bk_pair_file, QuantifiedBKCand) edge_list = list() myprint('building edges for candidate calls') frm_id_set_list = list() for i in range(0, len(quantified_svcall_list)): frm_id_set = quantified_svcall_list[i].all_frm_id_set() frm_id_set_list.append(frm_id_set) for i in range(0, len(quantified_svcall_list)): for j in range(i + 1, len(quantified_svcall_list)): frm_id_set1 = frm_id_set_list[i] frm_id_set2 = frm_id_set_list[j] shared_frm_id_set = frm_id_set1.intersection(frm_id_set2) n_frm_id1 = len(frm_id_set1) n_frm_id2 = len(frm_id_set2) n_shared_frm = len(shared_frm_id_set) if n_shared_frm >= min(n_frm_id1, n_frm_id2) / 2: edge_list.append((i, j)) edge_list.append((j, i)) row = list() col = list() data = list() for edge in edge_list: row.append(edge[0]) col.append(edge[1]) data.append(1) n_node = len(quantified_svcall_list) myprint('connected components') bedpe_csr_matrix = csr_matrix((data, (row, col)), shape=[n_node, n_node]) n_components, label_list = connected_components(bedpe_csr_matrix, directed=False) component_element_db = [0] * n_components for i in range(0, len(component_element_db)): component_element_db[i] = list() # component_element_db[component_id] = list of bedpe index for i in range(0, len(label_list)): component_element_db[label_list[i]].append(i) merged_call_list = list() for component_id in range(0, len(component_element_db)): bedpe_merge_group = list() for index in component_element_db[component_id]: bedpe_merge_group.append(quantified_svcall_list[index]) merged_call = merge1call_group(bedpe_merge_group) merged_call_list.append(merged_call) merged_call_bedpe_file = args.merged_bedpe_file merged_call_bedpe_fp = open(merged_call_bedpe_file, 'w') for merged_call in merged_call_list: if merged_call.score < 20: continue merged_call_bedpe_fp.write(merged_call.output_core() + endl) merged_call_bedpe_fp.close() return
for proto, conns in node['connections'].items(): for conn in conns: if isinstance(conn, str): conn = {'id': conn, 'quality': 1.0, 'state': 'open'} if conn['id'] not in ids or conn['state'] != 'open': continue if proto == 'webrtc-stream': stream_df.at[node['id'], conn['id']] = 1 else: mesh_df.at[node['id'], conn['id']] = conn['quality'] mesh_path = csg.dijkstra(mesh_df, directed=False, unweighted=True) dm = pd.DataFrame(mesh_path, index=ids, columns=ids) dm = clean_frame(dm) if dm.get(router, pd.Series([])).any(): connected_components, labels = csg.connected_components(dm.values) average_distance_to_router = dm.get(router).sum() / len(dm) components, _ = np.histogram(labels.T, labels.max() + 1) component_distribution = ','.join(str(c) for c in sorted(components)) largest_component = max(components) analysis.append({ 'average_distance_to_router': average_distance_to_router, 'number_of_connected_nodes': len(dm), 'number_of_total_nodes': len(mesh), 'connected_components': connected_components, 'component_distribution': component_distribution, 'largest_component': largest_component }) if mesh_index == len(mesh_series) - 1: for node_id, distance in dm.get(router).to_dict().items():
def filter_hypothesis(hypothesisPoints, votingDirection, scoreSum, pointsDirection, votingScore, similiarityThresh, neighThresh, scoreThres, minClusterSize, returnCopies=False): # Filter hypothesis according to minimum number of neighbour, minimum score # and minimum size of cluster they belong to. if returnCopies: print( 'gagggggggg\ngagagagawwge\ngawegewagagaew\ngwagawegaweg\ngawegawgwgeaag' ) hypothesisPoints = hypothesisPoints.clone() votingDirection = votingDirection.clone() scoreSum = scoreSum.clone() pointsDirection = pointsDirection.clone() votingScore = votingScore.clone() # Keep hypotheses with enough score hasEnoughScore = scoreSum > scoreThres hypothesisPoints = hypothesisPoints[hasEnoughScore] votingDirection = votingDirection[hasEnoughScore] scoreSum = scoreSum[hasEnoughScore] pointsDirection = pointsDirection[hasEnoughScore] votingScore = votingScore[hasEnoughScore] # Keep removing hypotheses until all fulfill the different minimum thresholds. keepHypotheses = torch.tensor([0]).byte() while not torch.all(keepHypotheses): # Find hypotheses with enough neighbours pointsDistances = torch.from_numpy(squareform( pdist(hypothesisPoints))).cuda() votingDistances = torch.from_numpy(squareform( pdist(votingDirection))).cuda() affinity_matrix = torch.exp(-(pointsDistances * 1 / 5)**2) * torch.exp( -(votingDistances * 2)**2) adjMatrix = (affinity_matrix > similiarityThresh) & ~torch.eye( affinity_matrix.shape[0]).byte().cuda() nNeighbours = torch.sum(adjMatrix, dim=1) hasEnoughNeighbours = nNeighbours > neighThresh # Partition hypotheses into subclusters uding adjacancy matrix n_clusters, clusterLabels = connected_components(csgraph=adjMatrix, directed=False, return_labels=True) # Determine the size of the subclusters clusterLabels = torch.from_numpy(clusterLabels).cuda() uniqueClusterLabels = torch.unique(clusterLabels) clusterVotes = clusterLabels == uniqueClusterLabels[:, None] clusterSizes = torch.sum(clusterVotes, dim=1) # Find which clusters are big enough and keep the belonging hypotheses isBigCluster = clusterSizes > minClusterSize uniqueClusterLabelsKeep = uniqueClusterLabels[isBigCluster] hasEnoughClusterSize = torch.any( clusterLabels == uniqueClusterLabelsKeep[:, None], dim=0) # Keep hypotheses which are both belonging to a big enough cluster and have # enough neighbours keepHypotheses = hasEnoughNeighbours & hasEnoughClusterSize hypothesisPoints = hypothesisPoints[keepHypotheses] votingDirection = votingDirection[keepHypotheses] scoreSum = scoreSum[keepHypotheses] pointsDirection = pointsDirection[keepHypotheses] votingScore = votingScore[keepHypotheses] clusterLabels = clusterLabels[keepHypotheses] # Create weights for hypotheses according to 1/(cluster size they belong to) clusterWeight = torch.sum(clusterVotes.float() / clusterSizes[:, None].float(), dim=0) # Create weights for hypotheses according to their inlierCount/(total inlierCount of cluster they belong to) weight = clusterVotes.float() * scoreSum[None] scoreAndClusterWeight = torch.sum(weight / torch.sum(weight, dim=1)[:, None], dim=0) if returnCopies: return hypothesisPoints, votingDirection, scoreSum, pointsDirection, votingScore, adjMatrix, nNeighbours, clusterWeight, scoreAndClusterWeight, clusterLabels else: return adjMatrix, nNeighbours, clusterWeight, scoreAndClusterWeight, clusterLabels
import numpy as np
from scipy.sparse.csgraph import connected_components
from scipy.sparse import csr_matrix

l = [[0, 1, 1, 0, 0],
     [0, 0, 1, 0, 0],
     [0, 0, 0, 0, 0],
     [0, 0, 0, 0, 1],
     [0, 0, 0, 0, 0]]

# n, labels = connected_components(l)
# AttributeError: 'list' object has no attribute 'dtype'

a = np.array(l)
print(type(a))
# <class 'numpy.ndarray'>

n, labels = connected_components(a)
print(n)
# 2
print(labels)
# [0 0 0 1 1]

csr = csr_matrix(l)
print(csr)
#   (0, 1)    1
#   (0, 2)    1
#   (1, 2)    1
#   (3, 4)    1
print(type(csr))
# <class 'scipy.sparse.csr.csr_matrix'>
def test_planted_distribution_model(): """ Tests the main graph generator with statistics and visualized degree distribution and edge adjacency matrix """ print("\n--- 'planted_distribution_model_H', 'planted_distribution_model_P', 'number_of_connectedComponents', 'create_blocked_matrix_from_graph' --") CHOICE = 21 print("CHOICE:", CHOICE) debug = 0 # directed = True # !!! TODO: not yet clear what undirected means here, only P accepts directed backEdgesAllowed = True # ??? should be enforced in code sameInAsOutDegreeRanking = False distribution = 'powerlaw' exponent = -0.3 VERSION_P = True # --- AAAI figures --- if CHOICE in [1, 2, 3, 4, 5, 6]: n = 120 alpha0 = [1/6, 1/3, 1/2] h = 8 P = np.array([[1, h, 1], [1, 1, h], [h, 1, 1]]) if CHOICE == 1: # P (equivalent to 2), AAAI 2 m = 1080 elif CHOICE == 2: # H (equivalent to 1) H0 = row_normalize_matrix(P) d_vec = [18, 9, 6] VERSION_P = False elif CHOICE == 3: # H (equivalent to 4), AAAI 3 H0 = row_normalize_matrix(P) d_vec = 9 VERSION_P = False elif CHOICE == 4: # P (equivalent to 3) P = np.array([[1, h, 1], [2, 2, 2*h], [3*h, 3, 3]]) m = 1080 elif CHOICE == 5: # H (equivalent to 2), but backedges=False H0 = row_normalize_matrix(P) d_vec = [18, 9, 6] VERSION_P = False backEdgesAllowed = False elif CHOICE == 6: # P undirected, AAAI 4 P = np.array([[1, h, 1], [h, 1, 1], [1, 1, h]]) directed = False backEdgesAllowed = False m = 540 # --- AGAIN DIRECTED --- if CHOICE == 12: n = 1001 alpha0 = [0.6, 0.2, 0.2] P = np.array([[0.1, 0.8, 0.1], [0.8, 0.1, 0.1], [0.1, 0.1, 0.8]]) m = 3000 distribution = 'uniform' # uniform powerlaw exponent = None backEdgesAllowed = False # ??? should be enforced in code if CHOICE == 13: # Nice for block matrix visualization n = 1000 alpha0 = [0.334, 0.333, 0.333] h = 2 P = np.array([[1, h, 1], [h, 1, 1], [1, 1, h]]) m = 2000 distribution = 'uniform' # uniform powerlaw exponent = None backEdgesAllowed = False # ??? 
should be enforced in code if CHOICE == 14: n = 1000 alpha0 = [0.3334, 0.3333, 0.3333] h = 10 P = np.array([[1, h, 1], [h, 1, 1], [1, 1, h]]) m = 10000 exponent = -0.55 # --- UNDIRECTED --- if CHOICE == 20: n = 100 alpha0 = [0.6, 0.2, 0.2] h = 1.4 P = np.array([[1, h, 1], [h, 1, 1], [1, 1, h]]) H0 = row_normalize_matrix(P) d_vec = 5 directed = False exponent = -0.3 VERSION_P = False elif CHOICE == 21: n = 1001 alpha0 = [0.6, 0.2, 0.2] h = 4 P = np.array([[1, h, 1], [h, 1, 1], [1, 1, h]]) H0 = row_normalize_matrix(P) d_vec = 3.4 # don't specify vector for undirected distribution = 'uniform' # uniform powerlaw exponent = -0.5 directed = False backEdgesAllowed = True # ignored in code for undirected VERSION_P = False sameInAsOutDegreeRanking = True # ignored in code for undirected elif CHOICE == 22: n = 1000 m = 3000 alpha0 = [0.6, 0.2, 0.2] h = 4 P = np.array([[1, 3*h, 1], [2*h, 1, 1], [1, 1, h]]) distribution = 'uniform' # uniform powerlaw exponent = -0.5 directed = False backEdgesAllowed = False # ignored in code for undirected sameInAsOutDegreeRanking = True # ignored in code for undirected debug=0 VERSION_P = True H0 = row_normalize_matrix(P) # --- Create the graph start = time.time() if VERSION_P: W, Xd = planted_distribution_model(n, alpha=alpha0, P=P, m=m, distribution=distribution, exponent=exponent, directed=directed, backEdgesAllowed=backEdgesAllowed, sameInAsOutDegreeRanking=sameInAsOutDegreeRanking, debug=debug) else: W, Xd = planted_distribution_model_H(n, alpha=alpha0, H=H0, d_out=d_vec, distribution=distribution, exponent=exponent, directed=directed, backEdgesAllowed=backEdgesAllowed, sameInAsOutDegreeRanking=sameInAsOutDegreeRanking, debug=debug) time_est = time.time()-start print("Time for graph generation: {}".format(time_est)) # - Undirectd degrees: In + Out W_und = W.multiply(W.transpose()) """if backEdgesAllowed then there can be edges in both directions.""" # W_und.data[:] = np.sign(W_und.data) # W contains weighted edges -> unweighted before counting edges with Ptot print("Fraction of edges that go in both directions: {}".format(np.sum(W_und.data) / np.sum(W.data))) # --- Statistics on created graph print("\n- 'calculate_Ptot_from_graph':") P_tot = calculate_Ptot_from_graph(W, Xd) print("P_tot:\n{}".format(P_tot)) print("sum(P_tot): {}".format(np.sum(P_tot))) print("P (normalized to sum=1):\n{}".format(1. 
* P_tot / np.sum(P_tot))) # Potential: normalized sum = 1 H = row_normalize_matrix(P_tot) print("H (row-normalized):\n{}".format(H)) print("\n- 'calculate_nVec_from_Xd':") n_vec = calculate_nVec_from_Xd(Xd) print("n_vec: {}".format(n_vec)) print("alpha: {}".format(1.*n_vec / sum(n_vec))) print("\n- Average Out/Indegree 'calculate_average_outdegree_from_graph' (assumes directed for total; for undirected the totals are incorrect):") print("Average outdegree: {}".format(calculate_average_outdegree_from_graph(W))) print("Average indegree: {}".format(calculate_average_outdegree_from_graph(W.transpose()))) print("Average total degree: {}".format(calculate_average_outdegree_from_graph(W + W.transpose()))) print("Average outdegree per class: {}".format(calculate_average_outdegree_from_graph(W, Xd))) print("Average indegree per class: {}".format(calculate_average_outdegree_from_graph(W.transpose(), Xd))) print("Average total degree per class: {}".format(calculate_average_outdegree_from_graph(W + W.transpose(), Xd))) # - Overall degree distribution: In / out print("\n- Overall Out/In/Total degree distribution 'calculate_outdegree_distribution_from_graph':") print("Overall Out and Indegree distribution:") d_out_vec_tot = calculate_outdegree_distribution_from_graph(W, Xd=None) d_in_vec_tot = calculate_outdegree_distribution_from_graph(W.transpose(), Xd=None) print("Outdegree distribution (degree / number):\n{}".format(np.array([d_out_vec_tot.keys(), d_out_vec_tot.values()]))) print("Indegree distribution (degree / number):\n{}".format(np.array([d_in_vec_tot.keys(), d_in_vec_tot.values()]))) # - Overall degree distribution: In + Out d_tot_vec_tot = calculate_outdegree_distribution_from_graph(W + W.transpose(), Xd=None) print("Total degree distribution (degree / number):\n{}".format(np.array([d_tot_vec_tot.keys(), d_tot_vec_tot.values()]))) # - Per-class degree distribution: In / out print("\n- Per-class Out/In/Total degree distribution 'calculate_outdegree_distribution_from_graph':") print("\nOutdegree distribution per class:") d_out_vec = calculate_outdegree_distribution_from_graph(W, Xd) for i in range(len(d_out_vec)): print("Class {}:".format(i)) print(np.array([d_out_vec[i].keys(), d_out_vec[i].values()])) print("Indegree distribution per class:") d_in_vec = calculate_outdegree_distribution_from_graph(W.transpose(), Xd) for i in range(len(d_in_vec)): print("Class {}:".format(i)) print(np.array([d_in_vec[i].keys(), d_in_vec[i].values()])) # - per-class degree distribution: In + out print("\nTotal degree distribution per class:") d_vec_und = calculate_outdegree_distribution_from_graph(W + W.transpose(), Xd) for i in range(len(d_vec_und)): print("Class {}:".format(i)) print(np.array([d_vec_und[i].keys(), d_vec_und[i].values()])) print("\n- number of weakly connected components':") print("Number of weakly connected components: {}".format(connected_components(W, directed=True, connection='weak', return_labels=False))) # --- convergence boundary # print("\n- '_out_eps_convergence_directed_linbp', 'eps_convergence_linbp'") # if directed: # eps_noEcho = _out_eps_convergence_directed_linbp(P, W, echo=False) # eps_Echo = _out_eps_convergence_directed_linbp(P, W, echo=True) # else: Hc = to_centering_beliefs(H) eps_noEcho = eps_convergence_linbp(Hc, W, echo=False) eps_Echo = eps_convergence_linbp(Hc, W, echo=True) print("Eps (w/ echo): {}".format(eps_Echo)) print("Eps (no echo): {}".format(eps_noEcho)) # --- Fig1: Draw edge distributions print("\n- Fig1: Draw degree distributions") params = {'backend': 'pdf', 
'lines.linewidth': 4, 'font.size': 10, 'axes.labelsize': 24, # fontsize for x and y labels (was 10) 'axes.titlesize': 22, 'xtick.labelsize': 20, 'ytick.labelsize': 20, 'legend.fontsize': 8, 'figure.figsize': [5, 4], 'font.family': 'sans-serif' } mpl.rcdefaults() mpl.rcParams.update(params) fig = plt.figure(1) ax = fig.add_axes([0.15, 0.15, 0.8, 0.8]) # main axes ax.xaxis.labelpad = -12 ax.yaxis.labelpad = -12 # A: Draw directed degree distribution y_vec = [] for i in range(len(d_out_vec)): y = np.repeat(list(d_out_vec[i].keys()), list(d_out_vec[i].values()) ) # !!! np.repeat y = -np.sort(-y) y_vec.append(y) # print ("Class {}:\n{}".format(i,y)) y_tot = np.repeat(list(d_out_vec_tot.keys()), list(d_out_vec_tot.values())) # total outdegree y_tot = -np.sort(-y_tot) plt.loglog(range(1, len(y_vec[0])+1), y_vec[0], lw=4, color='orange', label=r"A out", linestyle='-') # !!! plot default index starts from 0 otherwise plt.loglog(range(1, len(y_vec[1])+1), y_vec[1], lw=4, color='blue', label=r"B out", linestyle='--') plt.loglog(range(1, len(y_vec[2])+1), y_vec[2], lw=4, color='green', label=r"C out", linestyle=':') plt.loglog(range(1, len(y_tot)+1), y_tot, lw=1, color='black', label=r"tot out", linestyle='-') # B: Draw second edge distribution of undirected degree distribution y_vec = [] for i in range(len(d_vec_und)): y = np.repeat(list(d_vec_und[i].keys()), list(d_vec_und[i].values()) ) # !!! np.repeat y = -np.sort(-y) y_vec.append(y) # print ("Class {}:\n{}".format(i,y)) y_tot = np.repeat(list(d_tot_vec_tot.keys()), list(d_tot_vec_tot.values())) # total outdegree y_tot = -np.sort(-y_tot) plt.loglog(range(1, len(y_vec[0])+1), y_vec[0], lw=4, color='orange', label=r"A", linestyle='-') plt.loglog(range(1, len(y_vec[1])+1), y_vec[1], lw=4, color='blue', label=r"B", linestyle='--') plt.loglog(range(1, len(y_vec[2])+1), y_vec[2], lw=4, color='green', label=r"C", linestyle=':') plt.loglog(range(1, len(y_tot)+1), y_tot, lw=1, color='black', label=r"tot", linestyle='-') plt.legend(loc='upper right', labelspacing=0) filename = 'figs/Fig_test_planted_distribution_model1_{}.pdf'.format(CHOICE) plt.savefig(filename, dpi=None, facecolor='w', edgecolor='w', orientation='portrait', papertype='letter', format='pdf', transparent=True, bbox_inches='tight', pad_inches=0.1, # frameon=None, # TODO: frameon deprecated ) os.system("open " + filename) # --- Fig2: Draw block matrix print("\n- Fig2: 'create_blocked_matrix_from_graph'") W_new, Xd_new = create_blocked_matrix_from_graph(W, Xd) fig = plt.figure(2) row, col = W_new.nonzero() # transform the sparse W back to row col format plt.plot(col, row, 'o', color='r', markersize=2, markeredgewidth=2, lw=0, zorder=3) # Notice (col, row) because first axis is vertical in matrices # plt.matshow(W_new.todense(), cmap=plt.cm.Greys) # cmap=plt.cm.gray / Blues # alternative that does not work as well plt.gca().invert_yaxis() # invert the y-axis to start on top and go down # Show quadrants d1 = alpha0[0] * n d2 = (alpha0[0] + alpha0[1]) * n plt.grid(which='major', color='0.7', linestyle='-', linewidth=1) plt.xticks([0, d1, d2, n]) plt.yticks([0, d1, d2, n]) plt.xlabel('to', labelpad=-1) plt.ylabel('from', rotation=90, labelpad=0) frame = plt.gca() # frame.axes.xaxis.set_ticklabels([]) # would hide the labels # frame.axes.yaxis.set_ticklabels([]) frame.tick_params(direction='inout', width=1, length=10) filename = 'figs/Fig_test_planted_distribution_model2_{}.pdf'.format(CHOICE) plt.savefig(filename, dpi=None, facecolor='w', edgecolor='w', orientation='portrait', papertype='letter', 
format='pdf', transparent=True, bbox_inches='tight', pad_inches=0.1) os.system("open " + filename)
def scc(n, m, edge_array):
    tmp = np.ones(m, dtype=np.int32).T
    graph = csr_matrix((tmp, (edge_array[:])), (n, n))
    return connected_components(graph, directed=True, connection='strong')
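# Minimal usage sketch for scc (an added example; assumes the numpy, csr_matrix and
# connected_components imports used by the function): a directed 3-cycle plus one
# dangling edge gives two strongly connected components.
import numpy as np

edges = np.array([[0, 1, 2, 2],
                  [1, 2, 0, 3]])  # edges 0->1, 1->2, 2->0, 2->3
n_comp, labels = scc(n=4, m=4, edge_array=edges)
print(n_comp, labels)  # expected: 2 components, node 3 in its own component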
def compute_neighbors( self, n_neighbors: int = 30, knn: bool = True, n_pcs: Optional[int] = None, use_rep: Optional[str] = None, method: _Method = 'umap', random_state: AnyRandom = 0, write_knn_indices: bool = False, metric: _Metric = 'euclidean', metric_kwds: Mapping[str, Any] = MappingProxyType({}), ) -> None: """\ Compute distances and connectivities of neighbors. Parameters ---------- n_neighbors Use this number of nearest neighbors. knn Restrict result to `n_neighbors` nearest neighbors. {n_pcs} {use_rep} Returns ------- Writes sparse graph attributes `.distances` and `.connectivities`. Also writes `.knn_indices` and `.knn_distances` if `write_knn_indices==True`. """ from sklearn.metrics import pairwise_distances start_neighbors = logg.debug('computing neighbors') if n_neighbors > self._adata.shape[0]: # very small datasets n_neighbors = 1 + int(0.5*self._adata.shape[0]) logg.warning(f'n_obs too small: adjusting to `n_neighbors = {n_neighbors}`') if method == 'umap' and not knn: raise ValueError('`method = \'umap\' only with `knn = True`.') if method == 'rapids' and metric != 'euclidean': raise ValueError("`method` 'rapids' only supports the 'euclidean' `metric`.") if method not in {'umap', 'gauss', 'rapids'}: raise ValueError("`method` needs to be 'umap', 'gauss', or 'rapids'.") if self._adata.shape[0] >= 10000 and not knn: logg.warning('Using high n_obs without `knn=True` takes a lot of memory...') self.n_neighbors = n_neighbors self.knn = knn X = _choose_representation(self._adata, use_rep=use_rep, n_pcs=n_pcs) # neighbor search use_dense_distances = (metric == 'euclidean' and X.shape[0] < 8192) or knn == False if use_dense_distances: _distances = pairwise_distances(X, metric=metric, **metric_kwds) knn_indices, knn_distances = _get_indices_distances_from_dense_matrix( _distances, n_neighbors) if knn: self._distances = _get_sparse_matrix_from_indices_distances_numpy( knn_indices, knn_distances, X.shape[0], n_neighbors) else: self._distances = _distances elif method == 'rapids': knn_indices, knn_distances = compute_neighbors_rapids(X, n_neighbors) else: # non-euclidean case and approx nearest neighbors if X.shape[0] < 4096: X = pairwise_distances(X, metric=metric, **metric_kwds) metric = 'precomputed' knn_indices, knn_distances, forest = compute_neighbors_umap( X, n_neighbors, random_state, metric=metric, metric_kwds=metric_kwds) # very cautious here try: if forest: self._rp_forest = _make_forest_dict(forest) except: pass # write indices as attributes if write_knn_indices: self.knn_indices = knn_indices self.knn_distances = knn_distances start_connect = logg.debug('computed neighbors', time=start_neighbors) if not use_dense_distances or method in {'umap', 'rapids'}: # we need self._distances also for method == 'gauss' if we didn't # use dense distances self._distances, self._connectivities = _compute_connectivities_umap( knn_indices, knn_distances, self._adata.shape[0], self.n_neighbors, ) # overwrite the umap connectivities if method is 'gauss' # self._distances is unaffected by this if method == 'gauss': self._compute_connectivities_diffmap() logg.debug('computed connectivities', time=start_connect) self._number_connected_components = 1 if issparse(self._connectivities): from scipy.sparse.csgraph import connected_components self._connected_components = connected_components(self._connectivities) self._number_connected_components = self._connected_components[0]
def set_saturation_level_imposed_joined_coarse(self): levels = self.data_impress['LEVEL'].copy() dual_flag = self.data_impress['DUAL_1'].copy() gid1 = self.data_impress['GID_1'] gid0 = self.data_impress['GID_0'] level_0_ini = set(gid0[levels==0]) saturation = self.data_impress['saturation'] all_wells = set(self.all_wells_ids) gids_lv1_sat = set() gidsc = np.unique(gid1) internal_faces = self.elements_lv0['internal_faces'] v0 = self.elements_lv0['neig_internal_faces'] ds = saturation[v0] ds = np.absolute(ds[:,1] - ds[:,0]) inds = ds >= self.delta_sat_max levels[v0[inds][:,0]] = 0 levels[v0[inds][:,1]] = 0 all_lv0 = set(gid0[levels==0]) for gidc in gidsc: gids0 = gid0[gid1==gidc] vertex = gids0[dual_flag[gids0]==3] if (levels[gids0].max()-levels[gids0].min())>0: facs=np.unique(np.concatenate(self.elements_lv0['volumes_face_faces'][gids0])) facs=np.intersect1d(facs,internal_faces) ad=np.vstack(self.elements_lv0['faces_face_volumes'][facs]) ad0=ad[:,0] ad1=ad[:,1] l0=levels[ad0] l1=levels[ad1] map_lid=-np.ones(max(gid0)+1) map_lid[gids0]=np.arange(len(gids0)) l0[map_lid[ad0]<0]=-1 l1[map_lid[ad1]<0]=-1 fadj1=l0+l1>=0 # import pdb; pdb.set_trace() lines=map_lid[ad0[fadj1]].astype(int) cols=map_lid[ad1[fadj1]].astype(int) data=np.ones(len(lines)) graph=csc_matrix((data,(lines,cols)),shape=(len(gids0),len(gids0))) n_l,labels=csgraph.connected_components(graph,connection='weak') groups=[gids0[labels==k] for k in range(n_l)] ls=np.array([len(g) for g in groups]) print(ls.max()) if ls.max()>1: vols_nv1=np.array(groups)[ls==ls.max()][0] levels[np.setdiff1d(gids0,vols_nv1)]=0 # import pdb; pdb.set_trace() for gidc in gidsc: gids0 = gid0[gid1==gidc] if set(gids0) & all_lv0: gids_fora = np.array(list(set(gids0) - all_lv0)) if len(gids_fora) > 0: levels[gids_fora] = 1 gids_lv1_sat.add(gidc) cids_neigh = self.ml_data['coarse_id_neig_face_level_'+str(1)] cids_level = self.ml_data['coarse_primal_id_level_'+str(1)] for gidc in gids_lv1_sat: vizs = cids_neigh[cids_level==gidc][0] for viz in vizs: if set([viz]) & gids_lv1_sat: continue gids0 = gid0[gid1==viz] if set(gids0) & all_lv0: gids_fora = np.array(list(set(gids0) - all_lv0)) levels[gids_fora] = 1 else: levels[gids0] = 1 self.data_impress['LEVEL'] = levels.copy()
def get_separable_problems(problem): """Return a list of separable problems whose sum is the original one. Parameters ---------- problem : Problem A problem that consists of separable (sub)problems. Returns ------- List A list of problems which are separable whose sum is the original one. """ # obj_terms contains the terms in the objective functions. We have to # deal with the special case where the objective function is not a sum. if isinstance(problem.objective.args[0], cvxtypes.add_expr()): obj_terms = problem.objective.args[0].args else: obj_terms = [problem.objective.args[0]] # Remove constant terms, which will be appended to the first separable # problem. constant_terms = [term for term in obj_terms if term.is_constant()] obj_terms = [term for term in obj_terms if not term.is_constant()] constraints = problem.constraints num_obj_terms = len(obj_terms) num_terms = len(obj_terms) + len(constraints) # Objective terms and constraints are indexed from 0 to num_terms - 1. var_sets = [frozenset(func.variables()) for func in obj_terms + constraints ] all_vars = frozenset().union(*var_sets) adj_matrix = dok_matrix((num_terms, num_terms), dtype=bool) for var in all_vars: # Find all functions that contain this variable term_ids = [i for i, var_set in enumerate(var_sets) if var in var_set] # Add an edge between any two objetive terms/constraints sharing # this variable. if len(term_ids) > 1: for i, j in itertools.combinations(term_ids, 2): adj_matrix[i, j] = adj_matrix[j, i] = True num_components, labels = csgraph.connected_components(adj_matrix, directed=False) # After splitting, construct subproblems from appropriate objective # terms and constraints. term_ids_per_subproblem = [[] for _ in range(num_components)] for i, label in enumerate(labels): term_ids_per_subproblem[label].append(i) problem_list = [] for index in range(num_components): terms = [obj_terms[i] for i in term_ids_per_subproblem[index] if i < num_obj_terms] # If we just call sum, we'll have an extra 0 in the objective. obj = sum(terms[1:], terms[0]) if terms else Constant(0) constrs = [constraints[i - num_obj_terms] for i in term_ids_per_subproblem[index] if i >= num_obj_terms] problem_list.append(Problem(problem.objective.copy([obj]), constrs)) # Append constant terms to the first separable problem. if constant_terms: # Avoid adding an extra 0 in the objective sum_constant_terms = sum(constant_terms[1:], constant_terms[0]) if problem_list: problem_list[0].objective.args[0] += sum_constant_terms else: problem_list.append(Problem(problem.objective.copy( [sum_constant_terms]))) return problem_list
def train_val_test_split_adjacency(A, p_val=0.10, p_test=0.05, seed=0, neg_mul=1, every_node=True, connected=False, undirected=False, use_edge_cover=True, set_ops=True, asserts=False): """ Split the edges of the adjacency matrix into train, validation and test edges and randomly samples equal amount of validation and test non-edges. Parameters ---------- A : scipy.sparse.spmatrix Sparse unweighted adjacency matrix p_val : float Percentage of validation edges. Default p_val=0.10 p_test : float Percentage of test edges. Default p_test=0.05 seed : int Seed for numpy.random. Default seed=0 neg_mul : int What multiplicity of negative samples (non-edges) to have in the test/validation set w.r.t the number of edges, i.e. len(non-edges) = L * len(edges). Default neg_mul=1 every_node : bool Make sure each node appears at least once in the train set. Default every_node=True connected : bool Make sure the training graph is still connected after the split undirected : bool Whether to make the split undirected, that is if (i, j) is in val/test set then (j, i) is there as well. Default undirected=False use_edge_cover: bool Whether to use (approximate) edge_cover to find the minimum set of edges that cover every node. Only active when every_node=True. Default use_edge_cover=True set_ops : bool Whether to use set operations to construction the test zeros. Default setwise_zeros=True Otherwise use a while loop. asserts : bool Unit test like checks. Default asserts=False Returns ------- train_ones : array-like, shape [n_train, 2] Indices of the train edges val_ones : array-like, shape [n_val, 2] Indices of the validation edges val_zeros : array-like, shape [n_val, 2] Indices of the validation non-edges test_ones : array-like, shape [n_test, 2] Indices of the test edges test_zeros : array-like, shape [n_test, 2] Indices of the test non-edges """ assert p_val + p_test > 0 assert A.max() == 1 # no weights assert A.min() == 0 # no negative edges assert A.diagonal().sum() == 0 # no self-loops #assert not np.any(A.sum(0).A1 + A.sum(1).A1 == 0) # no dangling nodes is_undirected = (A != A.T).nnz == 0 if undirected: assert is_undirected # make sure is directed A = sp.tril(A).tocsr() # consider only upper triangular A.eliminate_zeros() else: if is_undirected: warnings.warn( 'Graph appears to be undirected. Did you forgot to set undirected=True?' ) np.random.seed(seed) E = A.nnz N = A.shape[0] s_train = int(E * (1 - p_val - p_test)) idx = np.arange(N) # hold some edges so each node appears at least once if every_node: if connected: #assert connected_components(A)[0] == 1 # make sure original graph is connected A_hold = minimum_spanning_tree(A) else: A.eliminate_zeros( ) # makes sure A.tolil().rows contains only indices of non-zero elements d = A.sum(1).A1 if use_edge_cover: hold_edges = np.array(list(nx.maximal_matching(nx.DiGraph(A)))) not_in_cover = np.array( list(set(range(N)).difference(hold_edges.flatten()))) # makes sure the training percentage is not smaller than N/E when every_node is set to True min_size = hold_edges.shape[0] + len(not_in_cover) if min_size > s_train: raise ValueError( 'Training percentage too low to guarantee every node. 
Min train size needed {:.2f}' .format(min_size / E)) d_nic = d[not_in_cover] hold_edges_d1 = np.column_stack( (not_in_cover[d_nic > 0], np.row_stack( map(np.random.choice, A[not_in_cover[d_nic > 0]].tolil().rows)))) if np.any(d_nic == 0): hold_edges_d0 = np.column_stack((np.row_stack( map(np.random.choice, A[:, not_in_cover[d_nic == 0]].T.tolil().rows)), not_in_cover[d_nic == 0])) hold_edges = np.row_stack( (hold_edges, hold_edges_d0, hold_edges_d1)) else: hold_edges = np.row_stack((hold_edges, hold_edges_d1)) else: # makes sure the training percentage is not smaller than N/E when every_node is set to True if N > s_train: raise ValueError( 'Training percentage too low to guarantee every node. Min train size needed {:.2f}' .format(N / E)) hold_edges_d1 = np.column_stack( (idx[d > 0], np.row_stack(map(np.random.choice, A[d > 0].tolil().rows)))) if np.any(d == 0): hold_edges_d0 = np.column_stack((np.row_stack( map(np.random.choice, A[:, d == 0].T.tolil().rows)), idx[d == 0])) hold_edges = np.row_stack((hold_edges_d0, hold_edges_d1)) else: hold_edges = hold_edges_d1 if asserts: assert np.all(A[hold_edges[:, 0], hold_edges[:, 1]]) assert len(np.unique(hold_edges.flatten())) == N A_hold = edges_to_sparse(hold_edges, N) A_hold[A_hold > 1] = 1 A_hold.eliminate_zeros() A_sample = A - A_hold s_train = s_train - A_hold.nnz else: A_sample = A idx_ones = np.random.permutation(A_sample.nnz) ones = np.column_stack(A_sample.nonzero()) train_ones = ones[idx_ones[:s_train]] test_ones = ones[idx_ones[s_train:]] # return back the held edges if every_node: train_ones = np.row_stack( (train_ones, np.column_stack(A_hold.nonzero()))) n_test = len(test_ones) * neg_mul if set_ops: # generate slightly more completely random non-edge indices than needed and discard any that hit an edge # much faster compared a while loop # in the future: estimate the multiplicity (currently fixed 1.3/2.3) based on A_obs.nnz if undirected: random_sample = np.random.randint(0, N, [int(2.3 * n_test), 2]) random_sample = random_sample[random_sample[:, 0] > random_sample[:, 1]] else: random_sample = np.random.randint(0, N, [int(1.3 * n_test), 2]) random_sample = random_sample[random_sample[:, 0] != random_sample[:, 1]] test_zeros = random_sample[A[random_sample[:, 0], random_sample[:, 1]].A1 == 0] test_zeros = np.row_stack(test_zeros)[:n_test] #assert test_zeros.shape[0] == n_test else: test_zeros = [] while len(test_zeros) < n_test: i, j = np.random.randint(0, N, 2) if A[i, j] == 0 and (not undirected or i > j) and (i, j) not in test_zeros: test_zeros.append((i, j)) test_zeros = np.array(test_zeros) # split the test set into validation and test set s_val_ones = int(len(test_ones) * p_val / (p_val + p_test)) s_val_zeros = int(len(test_zeros) * p_val / (p_val + p_test)) val_ones = test_ones[:s_val_ones] test_ones = test_ones[s_val_ones:] val_zeros = test_zeros[:s_val_zeros] test_zeros = test_zeros[s_val_zeros:] if undirected: # put (j, i) edges for every (i, j) edge in the respective sets and form back original A symmetrize = lambda x: np.row_stack( (x, np.column_stack((x[:, 1], x[:, 0])))) train_ones = symmetrize(train_ones) val_ones = symmetrize(val_ones) val_zeros = symmetrize(val_zeros) test_ones = symmetrize(test_ones) test_zeros = symmetrize(test_zeros) A = A.maximum(A.T) if asserts: set_of_train_ones = set(map(tuple, train_ones)) assert train_ones.shape[0] + test_ones.shape[0] + val_ones.shape[ 0] == A.nnz assert (edges_to_sparse( np.row_stack((train_ones, test_ones, val_ones)), N) != A).nnz == 0 assert 
set_of_train_ones.intersection(set(map(tuple, test_ones))) == set() assert set_of_train_ones.intersection(set(map(tuple, val_ones))) == set() assert set_of_train_ones.intersection(set(map(tuple, test_zeros))) == set() assert set_of_train_ones.intersection(set(map(tuple, val_zeros))) == set() assert len(set(map(tuple, test_zeros))) == len(test_ones) * neg_mul assert len(set(map(tuple, val_zeros))) == len(val_ones) * neg_mul assert not connected or connected_components(A_hold)[0] == 1 assert not every_node or ((A_hold - A) > 0).sum() == 0 return train_ones, val_ones, val_zeros, test_ones, test_zeros
def find_cut(self, MSF, data=None, quorum=-np.inf, labels=None, target_label=None, make=False, verbose=False): """ Find the best cut from the MSF. MSF: (N,N) scipy sparse matrix with zero elements removed. Represents the adjacency matrix for the minimum spanning forest. Constructed from sparse.csgraph.sparse_from_dense or using MSF.eliminate_zeros(). You MUST remove zero entries for this to work, otherwise they are considered no-cost paths. data: (N,p) attribute matrix. If not provided, replaced with (N,1) vector of ones. quorum: int denoting the minimum number of elements in the region labels: (N,) flat vector of labels for each point. Represents the "cluster labels" for disconnected components of the graph. target_label: int from the labels array to subset the MSF. If passed along with `labels`, then a cut will be found that is restricted to that subset of the MSF. make: bool, whether or not to modify the input MSF in order to make the best cut that was found. verbose: bool/int, denoting how much output to provide to the user, in terms of print statements or progressbars Returns a namedtuple with in_node, out_node, and score. """ if data is None: data = np.ones(MSF.shape) if (labels is None) != (target_label is None): raise ValueError("Both labels and target_label must be supplied! Only {} provided."\ .format(['labels', 'target_label'][int(target_label is None)])) if verbose: try: from tqdm import tqdm except ImportError: def tqdm(noop, desc=''): return noop else: def tqdm(noop, desc=''): return noop zero_in = (labels is not None) and (target_label is not None) current_n_subtrees, current_labels = cg.connected_components( MSF, directed=False) best_deletion = deletion(np.nan, np.nan, np.inf) for in_node, out_node in tqdm( np.vstack(MSF.nonzero()).T, desc='finding cut...'): # iterate over MSF edges if zero_in: if labels[in_node] != target_label: continue local_MSF = copy.deepcopy(MSF) # delete a candidate edge local_MSF[in_node, out_node] = 0 local_MSF.eliminate_zeros() # get the connected components local_n_subtrees, local_labels = cg.connected_components( local_MSF, directed=False) if local_n_subtrees <= current_n_subtrees: raise Exception('Malformed MSF!') # compute the score of these components score = self.score(data, labels=local_labels, quorum=quorum) # if the score is lower than the best score and quorum is met if score < best_deletion.score: best_deletion = deletion(in_node, out_node, score) if make: return self.make_cut(*best_deletion, MSF=MSF) return best_deletion
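# A minimal sketch of the cut evaluation in find_cut on a toy forest: deleting one edge
# of a minimum spanning forest (with explicit zeros eliminated) must raise the number of
# connected components by exactly one. All values below are illustrative.
import numpy as np
from scipy.sparse import csr_matrix
import scipy.sparse.csgraph as cg

MSF = csr_matrix(np.array([[0, 2, 0, 0],
                           [0, 0, 3, 0],
                           [0, 0, 0, 1],
                           [0, 0, 0, 0]], dtype=float))
MSF.eliminate_zeros()                 # zero entries would otherwise count as free paths

before, _ = cg.connected_components(MSF, directed=False)
local_MSF = MSF.tolil()               # LIL allows cheap single-entry edits
local_MSF[1, 2] = 0                   # delete a candidate edge
local_MSF = local_MSF.tocsr()
local_MSF.eliminate_zeros()
after, labels = cg.connected_components(local_MSF, directed=False)

assert after == before + 1            # the cut splits one subtree into two
print(labels)                         # -> [0 0 1 1]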
def fit(self, n_clusters, W, data=None, quorum=-np.inf, trace=False, islands='increase', verbose=False): """ n_clusters : int of clusters wanted W : pysal W object expressing the neighbor relationships between observations. Should be symmetric and binary, so Queen/Rook, DistanceBand, or a symmetrized KNN. data: np.ndarray of (N,P) shape with N observations and P features quorum: floor on the size of regions. trace: bool denoting whether to store intermediate labelings as the tree gets pruned islands: string describing what to do with islands. If "ignore", will discover `n_clusters` regions, treating islands as their own regions. If "increase", will discover `n_clusters` regions, treating islands as separate from n_clusters. verbose: bool/int describing how much output to provide to the user, in terms of print statements or progressbars. NOTE: Optimization occurs with respect to a *dissimilarity* metric, so the problem *minimizes* the map dissimilarity. So, lower scores are better. """ if trace: self._trace = [] if data is None: attribute_kernel = np.ones((W.n, W.n)) data = np.ones((W.n, 1)) else: attribute_kernel = self.metric(data) W.transform = 'b' W = W.sparse start = time.time() super_verbose = verbose > 1 start_W = time.time() dissim = W.multiply(attribute_kernel) dissim.eliminate_zeros() end_W = time.time() - start_W if super_verbose: print('Computing Affinity Kernel took {:.2f}s'.format(end_W)) tree_time = time.time() MSF = cg.minimum_spanning_tree(dissim) tree_time = time.time() - tree_time if super_verbose: print('Computing initial MST took {:.2f}s'.format(tree_time)) initial_component_time = time.time() current_n_subtrees, current_labels = cg.connected_components( MSF, directed=False) initial_component_time = time.time() - initial_component_time if super_verbose: print('Computing connected components took {:.2f}s.'.format( initial_component_time)) if current_n_subtrees > 1: island_warnings = ['Increasing `n_clusters` from {} to {} in order to account for islands.'\ .format(n_clusters, n_clusters+current_n_subtrees), 'Counting islands towards the remaining {} clusters.'\ .format(n_clusters - (current_n_subtrees))] ignoring_islands = int(islands.lower() == 'ignore') chosen_warning = island_warnings[ignoring_islands] warn("By default, the graph is disconnected! {}".format( chosen_warning), OptimizeWarning, stacklevel=2) if not ignoring_islands: n_clusters += (current_n_subtrees) _, island_populations = np.unique(current_labels, return_counts=True) if (island_populations < quorum).any(): raise ValueError( "Islands must be larger than the quorum. If not, drop the small islands and solve for" " clusters in the remaining field.") if trace: self._trace.append( (current_labels, deletion(np.nan, np.nan, np.inf))) if super_verbose: print(self._trace[-1]) while current_n_subtrees < n_clusters: # while we don't have enough regions best_deletion = self.find_cut(MSF, data, quorum=quorum, labels=None, target_label=None, verbose=verbose) if np.isfinite(best_deletion.score): # if our search succeeds # accept the best move as *the* move if super_verbose: print('making cut {}...'.format(best_deletion)) MSF, current_n_subtrees, current_labels = self.make_cut( *best_deletion, MSF=MSF) else: # otherwise, it means the MSF admits no further cuts (no backtracking here) current_n_subtrees, current_labels = cg.connected_components( MSF, directed=False) warn( "MSF contains no valid moves after finding {} subtrees." "Decrease the size of your quorum to find the remaining {} subtrees." 
.format(current_n_subtrees, n_clusters - current_n_subtrees), OptimizeWarning, stacklevel=2) self.current_labels_ = current_labels self.minimum_spanning_forest_ = MSF self._elapsed_time = time.time() - start return self if trace: self._trace.append((current_labels, best_deletion)) self.current_labels_ = current_labels self.minimum_spanning_forest_ = MSF self._elapsed_time = time.time() - start return self
def transitive_closure(M):
    # WARNING: not for large M! `labels == labels[:, None]` materialises a dense
    # N x N boolean matrix before it is converted back to sparse.
    # `csr` is assumed to be an alias for scipy.sparse.csr_matrix.
    labels = connected_components(M)[1]
    closure = csr(labels == labels[:, None])
    return closure
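# A small usage sketch mirroring the body of transitive_closure above (inlined here so
# the example does not depend on the `csr` alias):
import numpy as np
from scipy.sparse import csr_matrix
from scipy.sparse.csgraph import connected_components

M = csr_matrix(np.array([[0, 1, 0],
                         [0, 0, 0],
                         [0, 0, 0]]))
labels = connected_components(M)[1]             # defaults: directed=True, connection='weak'
closure = csr_matrix(labels == labels[:, None])
print(closure.toarray().astype(int))            # -> [[1 1 0], [1 1 0], [0 0 1]]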
def __init__(self, adata: AnnData, n_dcs: Optional[int] = None): self._adata = adata self._init_iroot() # use the graph in adata info_str = '' self.knn: Optional[bool] = None self._distances: Union[np.ndarray, csr_matrix, None] = None self._connectivities: Union[np.ndarray, csr_matrix, None] = None self._transitions_sym: Union[np.ndarray, csr_matrix, None] = None self._number_connected_components: Optional[int] = None self._rp_forest: Optional[RPForestDict] = None if 'neighbors' in adata.uns: if 'distances' in adata.uns['neighbors']: self.knn = issparse(adata.uns['neighbors']['distances']) self._distances = adata.uns['neighbors']['distances'] if 'connectivities' in adata.uns['neighbors']: self.knn = issparse(adata.uns['neighbors']['connectivities']) self._connectivities = adata.uns['neighbors']['connectivities'] if 'rp_forest' in adata.uns['neighbors']: self._rp_forest = adata.uns['neighbors']['rp_forest'] if 'params' in adata.uns['neighbors']: self.n_neighbors = adata.uns['neighbors']['params'][ 'n_neighbors'] else: def count_nonzero(a: Union[np.ndarray, csr_matrix]) -> int: return a.count_nonzero() if issparse( a) else np.count_nonzero(a) # estimating n_neighbors if self._connectivities is None: self.n_neighbors = int( count_nonzero(self._distances) / self._distances.shape[0]) else: self.n_neighbors = int( count_nonzero(self._connectivities) / self._connectivities.shape[0] / 2) info_str += '`.distances` `.connectivities` ' self._number_connected_components = 1 if issparse(self._connectivities): from scipy.sparse.csgraph import connected_components self._connected_components = connected_components( self._connectivities) self._number_connected_components = self._connected_components[ 0] if 'X_diffmap' in adata.obsm_keys(): self._eigen_values = _backwards_compat_get_full_eval(adata) self._eigen_basis = _backwards_compat_get_full_X_diffmap(adata) if n_dcs is not None: if n_dcs > len(self._eigen_values): raise ValueError( 'Cannot instantiate using `n_dcs`={}. ' 'Compute diffmap/spectrum with more components first.'. format(n_dcs)) self._eigen_values = self._eigen_values[:n_dcs] self._eigen_basis = self._eigen_basis[:, :n_dcs] self.n_dcs = len(self._eigen_values) info_str += '`.eigen_values` `.eigen_basis` `.distances_dpt`' else: self._eigen_values = None self._eigen_basis = None self.n_dcs = None if info_str != '': logg.debug(f' initialized {info_str}')
def autoSegment2(mol, sel='(protein or resname ACE NME)', basename='P', fields=('segid', ), residgaps=False, residgaptol=1, chaingaps=True, mode='alphanumeric', _logger=True): """ Detects bonded segments in a selection and assigns incrementing segid to each segment Parameters ---------- mol : :class:`Molecule <moleculekit.molecule.Molecule>` object The Molecule object sel : str Atom selection string on which to check for gaps. See more `here <http://www.ks.uiuc.edu/Research/vmd/vmd-1.9.2/ug/node89.html>`__ basename : str The basename for segment ids. For example if given 'P' it will name the segments 'P1', 'P2', ... fields : tuple of strings Field to fix. Can be "segid" (default) or any other Molecule field or combinations thereof. residgaps : bool Set to True to consider gaps in resids as structural gaps. Set to False to ignore resids residgaptol : int Above what resid difference is considered a gap. I.e. with residgaptol 1, 235-233 = 2 > 1 hence is a gap. We set default to 2 because in many PDBs single residues are missing in the proteins without any gaps. chaingaps : bool Set to True to consider changes in chains as structural gaps. Set to False to ignore chains mode : str If set to 'numeric' it will use numbers for segment IDs. If set to 'alphabetic' it will use letters for segment IDs. If set to 'alphanumeric' it will use both numbers and letters for segment IDs. Returns ------- newmol : :class:`Molecule <moleculekit.molecule.Molecule>` object A new Molecule object with modified segids Example ------- >>> newmol = autoSegment2(mol) """ from scipy.sparse import csr_matrix from scipy.sparse.csgraph import connected_components if isinstance(fields, str): fields = (fields, ) sel += ' and backbone or (resname NME ACE and name N C O CH3)' # Looking for bonds only over the backbone of the protein idx = mol.atomselect( sel, indexes=True ) # Keep the original atom indexes to map from submol to mol submol = mol.copy( ) # We filter out everything not on the backbone to calculate only those bonds submol.filter(sel, _logger=False) bonds = submol._getBonds() # Calculate both file and guessed bonds if residgaps: # Remove bonds between residues without continuous resids bondresiddiff = np.abs(submol.resid[bonds[:, 0]] - submol.resid[bonds[:, 1]]) bonds = bonds[bondresiddiff <= residgaptol, :] else: # Warning about bonds bonding non-continuous resids bondresiddiff = np.abs(submol.resid[bonds[:, 0]] - submol.resid[bonds[:, 1]]) if _logger and np.any(bondresiddiff > 1): for i in np.where(bondresiddiff > residgaptol)[0]: logger.warning( 'Bonds found between resid gaps: resid {} and {}'.format( submol.resid[bonds[i, 0]], submol.resid[bonds[i, 1]])) if chaingaps: # Remove bonds between residues without same chain bondsamechain = submol.chain[bonds[:, 0]] == submol.chain[bonds[:, 1]] bonds = bonds[bondsamechain, :] else: # Warning about bonds bonding different chains bondsamechain = submol.chain[bonds[:, 0]] == submol.chain[bonds[:, 1]] if _logger and np.any(bondsamechain == False): for i in np.where(bondsamechain == False)[0]: logger.warning( 'Bonds found between chain gaps: resid {}/{} and {}/{}'. 
format(submol.resid[bonds[i, 0]], submol.chain[bonds[i, 0]], submol.resid[bonds[i, 1]], submol.chain[bonds[i, 1]])) # Calculate connected components using the bonds sparsemat = csr_matrix( ( np.ones(bonds.shape[0] * 2), # Values ( np.hstack((bonds[:, 0], bonds[:, 1])), # Rows np.hstack((bonds[:, 1], bonds[:, 0])))), shape=[submol.numAtoms, submol.numAtoms]) # Columns numcomp, compidx = connected_components(sparsemat, directed=False) # Letters to be used for chains, if free: 0123456789abcd...ABCD..., minus chain symbols already used used_chains = set(mol.chain) chain_alphabet = _getChainAlphabet(mode) available_chains = [x for x in chain_alphabet if x not in used_chains] mol = mol.copy() prevsegres = None for i in range(numcomp): # For each connected component / segment segid = basename + str(i) backboneSegIdx = idx[compidx == i] # The backbone atoms of the segment segres = mol.atomselect('same residue as index {}'.format(' '.join( map(str, backboneSegIdx)))) # Get whole residues # Warning about separating segments with continuous resids if _logger and i > 0 and (np.min(mol.resid[segres]) - np.max(mol.resid[prevsegres])) == 1: logger.warning( 'Separated segments {} and {}, despite continuous resids, due to lack of bonding.' .format(basename + str(i - 1), segid)) # Add the new segment ID to all fields the user specified for f in fields: if f != 'chain': if np.any(mol.__dict__[f] == segid): raise RuntimeError( 'Segid {} already exists in the molecule. Please choose different prefix.' .format(segid)) mol.__dict__[f][ segres] = segid # Assign the segid to the correct atoms else: mol.__dict__[f][segres] = available_chains[ i % len(available_chains)] if _logger: logger.info('Created segment {} between resid {} and {}.'.format( segid, np.min(mol.resid[segres]), np.max(mol.resid[segres]))) prevsegres = segres # Store old segment atom indexes for the warning about continuous resids return mol
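# A minimal sketch of the segment-detection core of autoSegment2: symmetrise a list of
# bonded atom pairs into a sparse adjacency matrix and label every atom by its connected
# component. `bonds` and `n_atoms` are toy values, not taken from a real Molecule.
import numpy as np
from scipy.sparse import csr_matrix
from scipy.sparse.csgraph import connected_components

n_atoms = 6
bonds = np.array([[0, 1], [1, 2], [3, 4]])   # two bonded fragments, atom 5 isolated
sparsemat = csr_matrix(
    (np.ones(bonds.shape[0] * 2),
     (np.hstack((bonds[:, 0], bonds[:, 1])),
      np.hstack((bonds[:, 1], bonds[:, 0])))),
    shape=(n_atoms, n_atoms))

numcomp, compidx = connected_components(sparsemat, directed=False)
print(numcomp, compidx)   # -> 3 segments, labels e.g. [0 0 0 1 1 2]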
def calc_scc(graph: csr_matrix):
    # Label each node with the id of its strongly connected component.
    return connected_components(csgraph=graph, directed=True,
                                connection="strong", return_labels=True)[1]
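# A small usage sketch mirroring calc_scc above: nodes 0 and 1 form a strongly connected
# component because edges run in both directions between them, while node 2 does not.
import numpy as np
from scipy.sparse import csr_matrix
from scipy.sparse.csgraph import connected_components

graph = csr_matrix(np.array([[0, 1, 0],
                             [1, 0, 0],
                             [0, 1, 0]]))
labels = connected_components(csgraph=graph, directed=True,
                              connection="strong", return_labels=True)[1]
print(labels)   # nodes 0 and 1 share a label, node 2 gets its own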
def _fix_connectivity(X, connectivity, affinity): """ Fixes the connectivity matrix. The different steps are: - copies it - makes it symmetric - converts it to LIL if necessary - completes it if necessary. Parameters ---------- X : array-like of shape (n_samples, n_features) Feature matrix representing `n_samples` samples to be clustered. connectivity : sparse matrix, default=None Connectivity matrix. Defines for each sample the neighboring samples following a given structure of the data. The matrix is assumed to be symmetric and only the upper triangular half is used. Default is `None`, i.e, the Ward algorithm is unstructured. affinity : {"euclidean", "precomputed"}, default="euclidean" Which affinity to use. At the moment `precomputed` and ``euclidean`` are supported. `euclidean` uses the negative squared Euclidean distance between points. Returns ------- connectivity : sparse matrix The fixed connectivity matrix. n_connected_components : int The number of connected components in the graph. """ n_samples = X.shape[0] if connectivity.shape[0] != n_samples or connectivity.shape[1] != n_samples: raise ValueError( "Wrong shape for connectivity matrix: %s when X is %s" % (connectivity.shape, X.shape) ) # Make the connectivity matrix symmetric: connectivity = connectivity + connectivity.T # Convert connectivity matrix to LIL if not sparse.isspmatrix_lil(connectivity): if not sparse.isspmatrix(connectivity): connectivity = sparse.lil_matrix(connectivity) else: connectivity = connectivity.tolil() # Compute the number of nodes n_connected_components, labels = connected_components(connectivity) if n_connected_components > 1: warnings.warn( "the number of connected components of the " "connectivity matrix is %d > 1. Completing it to avoid " "stopping the tree early." % n_connected_components, stacklevel=2, ) # XXX: Can we do without completing the matrix? connectivity = _fix_connected_components( X=X, graph=connectivity, n_connected_components=n_connected_components, component_labels=labels, metric=affinity, mode="connectivity", ) return connectivity, n_connected_components
def getMoves(): updateIsMunched() updateAdjMatrix() n_comp, labels = connected_components(adjmatrix, False, 'weak', True) numMunchersToDeploy = getNumMunchersToDeploy() print numMunchersToDeploy updateMunched() munched.update(myMuncherPositions) nodesToCheck = [] program_opp = [] program_gen = [] program = [] if len(otherLiveMunchers) > 0: nodesToCheck = getOpponentsNeighborNodes() if len(nodesToCheck) > 0: munchers, prog, pathCount = getMunchers(nodesToCheck) munch = [] program = [] count = [] for i in xrange(len(munchers)): if munchers[i] not in munch: munch.append(munchers[i]) program.append(prog[i]) count.append(pathCount[i]) program = zip(munch, program, count) program_opp = sorted(program, key=lambda prog: prog[2], reverse=True) #if len(nodesToCheck) < numMunchersToDeploy: components = getNLargestConnectedComponents(numMunchersToDeploy, n_comp, labels) nodesToCheck = [] for component in components: if len(component) > 2: nodesToCheck.append(getNodesWithMinWeight(component)) else: nodesToCheck.append(component) munchers, prog, pathCount = getMunchers(nodesToCheck) munch = [] program = [] count = [] for i in xrange(len(munchers)): if munchers[i] not in munch: munch.append(munchers[i]) program.append(prog[i]) count.append(pathCount[i]) program = zip(munch, program, count) program_gen = sorted(program, key=lambda prog: prog[2], reverse=True) print program_opp print program_gen if len(program_opp) > 0: program = [] temp_opp = [] temp_gen = [] for prog in program_opp: if prog[2] > 0: program.append(prog) else: temp_opp.append(prog) for prog in program_gen: if prog[2] > 0: program.append(prog) else: temp_gen.append(prog) for prog in temp_opp: program.append(prog) for prog in temp_gen: program.append(prog) print program else: components = getNLargestConnectedComponents(numMunchersToDeploy, n_comp, labels) for component in components: if len(component) > 2: nodesToCheck.append(getNodesWithMinWeight(component)) else: nodesToCheck.append(component) munchers, prog, pathCount = getMunchers(nodesToCheck) munch = [] program = [] count = [] for i in xrange(len(munchers)): if munchers[i] not in munch: munch.append(munchers[i]) program.append(prog[i]) count.append(pathCount[i]) program = zip(munch, program, count) program = sorted(program, key=lambda prog: prog[2], reverse=True) prog = [] for item in program: prog.append(str(item[0]) + '/' + str(item[1])) #prog = ['{}/{}'.format(a, b) for a, b in zip(munchers, prog)] if len(otherLiveMunchers) > 0: if remainingStuff[0] >= numMunchersToDeploy: prog = prog[:numMunchersToDeploy] else: prog = prog[:remainingStuff[0]] else: prog = prog[:1] prog = str(len(prog)) + ':' + ','.join(prog) print prog return prog
a, b = map(int, input().split())
a -= 1
b -= 1
abl.append((a, b))
adict[a] += 1
adict[b] += 1
A[a][b] = 1
A[b][a] = 1

cdl = []
for _ in range(k):
    c, d = map(int, input().split())
    c -= 1
    d -= 1
    cdl.append((c, d))

kk, labels = connected_components(np.array(A))
ldict = [0 for _ in range(max(labels) + 1)]
for label in labels:
    ldict[label] += 1

# Count, per vertex, how many of the (c, d) pairs fall inside that vertex's component.
cdict = [0 for _ in range(n)]
for (c, d) in cdl:
    if labels[c] == labels[d]:
        cdict[c] += 1
        cdict[d] += 1

s = ""
def __compute_neighbors(self, swarm, k): """Helper method to compute the adjacency matrix of the topology This method computes the adjacency matrix of the topology using the randomized algorithm proposed in [TSWJ2013]. The resulting topology is a connected graph. This is achieved by creating three matrices: * adj_matrix : The adjacency matrix of the generated graph. It's initialized as an identity matrix to make sure that every particle has itself as a neighbour. This matrix is the return value of the method. * neighbor_matrix : The matrix of randomly generated neighbors. This matrix is a matrix of shape :code:`(swarm.n_particles, k)`: with randomly generated elements. It's used to create connections in the adj_matrix. * dist_matrix : The distance matrix computed with Dijkstra's algorithm. It is used to determine where the graph needs edges to change it to a connected graph. Parameters ---------- swarm : pyswarms.backend.swarms.Swarm a Swarm instance k : int number of neighbors to be considered. Must be a positive integer less than :code:`n_particles-1` Returns ------- numpy.ndarray Adjacency matrix of the topology """ adj_matrix = np.identity(swarm.n_particles, dtype=int) neighbor_matrix = np.array([ np.random.choice( # Exclude i from the array np.setdiff1d(np.arange(swarm.n_particles), np.array([i])), k, replace=False, ) for i in range(swarm.n_particles) ]) # Set random elements to one using the neighbor matrix adj_matrix[np.arange(swarm.n_particles).reshape(swarm.n_particles, 1), neighbor_matrix, ] = 1 adj_matrix[ neighbor_matrix, np.arange(swarm.n_particles).reshape(swarm.n_particles, 1), ] = 1 dist_matrix = dijkstra( adj_matrix, directed=False, return_predecessors=False, unweighted=True, ) # Generate connected graph. while (connected_components( adj_matrix, directed=False, return_labels=False) != 1): for i, j in itertools.product(range(swarm.n_particles), repeat=2): if dist_matrix[i][j] == np.inf: adj_matrix[i][j] = 1 return adj_matrix
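# A minimal sketch of the connectivity repair loop above: keep adding edges between
# nodes at infinite Dijkstra distance until the adjacency matrix has a single connected
# component. The 4-node matrix is illustrative only.
import itertools
import numpy as np
from scipy.sparse.csgraph import connected_components, dijkstra

adj_matrix = np.array([[1, 1, 0, 0],
                       [1, 1, 0, 0],
                       [0, 0, 1, 1],
                       [0, 0, 1, 1]])

while connected_components(adj_matrix, directed=False, return_labels=False) != 1:
    dist_matrix = dijkstra(adj_matrix, directed=False,
                           return_predecessors=False, unweighted=True)
    for i, j in itertools.product(range(adj_matrix.shape[0]), repeat=2):
        if dist_matrix[i][j] == np.inf:
            adj_matrix[i][j] = 1

print(connected_components(adj_matrix, directed=False, return_labels=False))   # -> 1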
def simplify_links(n): ## Complex multi-node links are folded into end-points logger.info("Simplifying connected link components") if n.links.empty: return n, n.buses.index.to_series() # Determine connected link components, ignore all links but DC adjacency_matrix = n.adjacency_matrix( branch_components=['Link'], weights=dict(Link=(n.links.carrier == 'DC').astype(float))) _, labels = connected_components(adjacency_matrix, directed=False) labels = pd.Series(labels, n.buses.index) G = n.graph() def split_links(nodes): nodes = frozenset(nodes) seen = set() supernodes = { m for m in nodes if len(G.adj[m]) > 2 or (set(G.adj[m]) - nodes) } for u in supernodes: for m, ls in iteritems(G.adj[u]): if m not in nodes or m in seen: continue buses = [u, m] links = [list(ls)] #[name for name in ls]] while m not in (supernodes | seen): seen.add(m) for m2, ls in iteritems(G.adj[m]): if m2 in seen or m2 == u: continue buses.append(m2) links.append(list(ls)) # [name for name in ls]) break else: # stub break m = m2 if m != u: yield pd.Index((u, m)), buses, links seen.add(u) busmap = n.buses.index.to_series() connection_costs_per_link = _prepare_connection_costs_per_link(n) connection_costs_to_bus = pd.DataFrame( 0., index=n.buses.index, columns=list(connection_costs_per_link)) for lbl in labels.value_counts().loc[lambda s: s > 2].index: for b, buses, links in split_links(labels.index[labels == lbl]): if len(buses) <= 2: continue logger.debug('nodes = {}'.format(labels.index[labels == lbl])) logger.debug('b = {}\nbuses = {}\nlinks = {}'.format( b, buses, links)) m = sp.spatial.distance_matrix( n.buses.loc[b, ['x', 'y']], n.buses.loc[buses[1:-1], ['x', 'y']]) busmap.loc[buses] = b[np.r_[0, m.argmin(axis=0), 1]] connection_costs_to_bus.loc[ buses] += _compute_connection_costs_to_bus( n, busmap, connection_costs_per_link, buses) all_links = [i for _, i in sum(links, [])] p_max_pu = snakemake.config['links'].get('p_max_pu', 1.) lengths = n.links.loc[all_links, 'length'] name = lengths.idxmax() + '+{}'.format(len(links) - 1) params = dict(carrier='DC', bus0=b[0], bus1=b[1], length=sum(n.links.loc[[i for _, i in l], 'length'].mean() for l in links), p_nom=min(n.links.loc[[i for _, i in l], 'p_nom'].sum() for l in links), underwater_fraction=sum( lengths / lengths.sum() * n.links.loc[all_links, 'underwater_fraction']), p_max_pu=p_max_pu, p_min_pu=-p_max_pu, underground=False, under_construction=False) logger.info( "Joining the links {} connecting the buses {} to simple link {}" .format(", ".join(all_links), ", ".join(buses), name)) n.mremove("Link", all_links) static_attrs = n.components["Link"]["attrs"].loc[ lambda df: df.static] for attr, default in static_attrs.default.iteritems(): params.setdefault(attr, default) n.links.loc[name] = pd.Series(params) # n.add("Link", **params) logger.debug("Collecting all components using the busmap") _aggregate_and_move_components(n, busmap, connection_costs_to_bus) return n, busmap
def generate_process_mining(dataframe, user_id_column='user_id', time_column='time', event_label_columns=['event_name', 'event_type'], types_to_include=None, filter_encoding_dict={}, num_nodes=15, edge_weight_lower_bound=5): df_f = dataframe abbreviation_dict = filter_encoding_dict # node name preprocessing print('Creating the column "node_name"...') if types_to_include: type_df_filter = pd.DataFrame({'event_type': types_to_include}) df_f = df_f.merge(type_df_filter, how='inner', on='event_type') df_f = df_f.sort_values(by=[user_id_column, time_column]).reset_index( drop=True) num_users = df_f['user_id'].nunique() num_actions = len(df_f['user_id']) df_freq = df_f.groupby([ 'event_name', 'event_type' ]).size().reset_index(name='counts').sort_values(by='counts', ascending=False) # filter out the too prevalent actions (more than 2X of the cases) df_low_count = df_freq[df_freq.counts < (num_users * 2)] df_f_f1 = df_f.merge(df_low_count, how='inner', on=['event_name', 'event_type']) # Apply abbreviations if len(abbreviation_dict.keys()) > 0: Apply_ABB = True # Apply abbreviation else: Apply_ABB = False # Create The column node_name if isinstance(event_label_columns, str): df_f['node_name'] = df_f[event_label_columns] else: df_f['node_name'] = df_f[event_label_columns[0]] for i in range(1, len(event_label_columns)): df_f['node_name'] = df_f['node_name'] + ' - ' + df_f[ event_label_columns[i]] # Node name edits and abbrevations if Apply_ABB == True: for x in abbreviation_dict.keys(): df_f['node_name'] = df_f['node_name'].replace( abbreviation_dict, regex=True) #replace(x,ABB[x]) # choose top k most frequent events k = num_nodes df_enc_event = df_f[[user_id_column, time_column, 'node_name']] df_enc_event = df_enc_event.rename(columns={ user_id_column: 'enc_id', time_column: 'time_diff' }) # drop duplicate node_names (might be of different time_diff) df_enc_event = df_enc_event.drop_duplicates(subset=['enc_id', 'node_name']) # find the sorted event types def my_f(x): d = [] d.append(len(x['node_name'])) d.append(x['time_diff'].median()) return pd.Series(d) #, index=[['count', 'time_avg']]) df_sorted = df_enc_event.groupby('node_name').apply( my_f).reset_index().rename(columns={ 0: 'count', 1: 'time_avg' }).sort_values('count', ascending=False) #.sort_values('count',ascending=False) df_sorted = df_sorted.head(k) # filter the events to include top k events df_enc_event = df_enc_event.merge(df_sorted, how='inner', on='node_name')[[ 'enc_id', 'node_name', 'time_diff', 'time_avg' ]].sort_values(['enc_id', 'time_diff'], ascending=True).reset_index(drop=True) df_enc_event = df_enc_event[df_enc_event.time_diff >= 0].reset_index( drop=True) # Create table of unique encounter id unq_PC_enc_event = df_enc_event.groupby([ 'enc_id' ])['node_name'].apply(list).reset_index().rename(columns={ 'enc_id': 'enc_id', 0: 'event_list' }).sort_values('enc_id', ascending=True) unq_PC_enc_time = df_enc_event.groupby([ 'enc_id' ])['time_diff'].apply(list).reset_index().rename(columns={ 'enc_id': 'enc_id', 0: 'time_diff' }).sort_values('enc_id', ascending=True) unq_PC_enc = unq_PC_enc_event.merge(unq_PC_enc_time, how='left', on='enc_id') superlist = unq_PC_enc['node_name'].tolist() superlist_time = unq_PC_enc['time_diff'].tolist() # Get consecuative pairs A = {} A['node1'] = [] A['node2'] = [] A['time_weight'] = [] A['node1_time'] = [] A['node2_time'] = [] df_g = pd.DataFrame(A) for i in range(len(superlist)): t = 0 for x, y in zip(superlist[i], superlist[i][1:]): t += 1 td = superlist_time[i][t] - superlist_time[i][t - 1] 
df_temp = pd.DataFrame({ 'node1': [x], 'node2': [y], 'time_weight': [td], 'node1_time': [df_sorted[df_sorted.node_name == x].time_avg.iloc[0]], 'node2_time': [df_sorted[df_sorted.node_name == y].time_avg.iloc[0]] }) df_g = pd.concat([df_g, df_temp]) # Save the weighted edges E = {} # wighted frequency T = {} Node1_Time = {} Node2_Time = {} for i in range(len(superlist)): t = 0 for x, y in zip(superlist[i], superlist[i][1:]): t += 1 td = superlist_time[i][t] - superlist_time[i][t - 1] t1 = df_sorted[df_sorted.node_name == x].time_avg.iloc[0] t2 = df_sorted[df_sorted.node_name == y].time_avg.iloc[0] if (x, y) not in E.keys(): E[(x, y)] = 1 T[(x, y)] = td Node1_Time[(x, y)] = t1 Node2_Time[(x, y)] = t2 else: E[(x, y)] += 1 T[(x, y)] += td # Filter edges and save to a list of edges with different weight types alpha = edge_weight_lower_bound # lower boud for edge frequency weight to show in the final graph A = [] E_max = 0 # maximum frequency of the edges for e in E.keys(): if E[e] > alpha and e[0] != e[1]: t1 = Node1_Time[e] #/E[e] t2 = Node2_Time[e] #/E[e] rtw = T[e] / E[e] # average relative time of all this type edges tw = t2 - t1 e_type = (e[0], e[1], tw, E[e], t1, t2, rtw) if True: #t1<=t2 and tw<up_threshold and A.append(e_type) # find maximum freq of edges in order to normalize the freq if E[e] > E_max: E_max = E[e] # Find Adjacency matrix # assign index for the vertices I = {} # index dict i = 0 for e in A: if e[0] not in I.keys(): I[e[0]] = i i += 1 if e[1] not in I.keys(): I[e[1]] = i i += 1 ####################### # create adjacency matrix I_inv = {value: key for (key, value) in I.items()} Adj = np.zeros((i, i)) Adj_lag = np.zeros((i, i)) for e in A: Adj[I[e[0]], I[e[1]]] = e[3] Adj_lag[I[e[0]], I[e[1]]] = e[6] I_org = I # Normalize the weights to be probability for i in range(Adj.shape[0]): if np.sum(Adj[i]) > 0: Adj[i] = Adj[i] / np.sum(Adj[i]) if True: #not node_clustering: dot = graphviz.Digraph() for e in A: prob = Adj[I[e[0]], I[ e[1]]] # find the probability weight from the adjacency matrix lag = Adj_lag[I[e[0]], I[ e[1]]] # find the probability weight from the adjacency matrix label = '(' + str(int(prob * 100) / 100) + ', ' + str( int(lag * 100) / 100) + ')' dot.edge(e[0], e[1], label=label, penwidth=str(3 * prob)) #, penwidth=1) #dot.edge.attr['penwidth'] = 1 engin = sorted(graphviz.ENGINES)[0] dot.render('process_graph_no_clustering.gv', view=False) ### Apply graph clustering ############################# # Create A_eta: threshold graph eta = 0.1 # threshold for finding connected graphs # find edges with less than threshold and the same type A_eta = [ e for e in A if (e[2] <= eta and e[0].split("-")[0] == e[1].split("-")[0]) ] ############################# # assign index for the vertices I = {} # index dict i = 0 for e in A_eta: if e[0] not in I.keys(): I[e[0]] = i i += 1 if e[1] not in I.keys(): I[e[1]] = i i += 1 ####################### # create adjacency matrix I_inv = {value: key for (key, value) in I.items()} Adj_sh = np.zeros((i, i)) for e in A_eta: Adj_sh[I[e[0]], I[e[1]]] = 1 Adj_sh[I[e[1]], I[e[0]]] = 1 ######################## #### extract connected components graph = csr_matrix(Adj_sh) n_components, labels = connected_components(csgraph=graph, directed=False, return_labels=True) #Create Node names for the clustered components: W = {} for i in range(np.max(labels) + 1): I_eq = np.arange(labels.shape[0])[labels == i] W[i] = I_inv[I_eq[0]].split("-")[ 0] #+' : [' #+' : [' + [I_inv[i].split("-")[1] for i in ] W[i] = W[i] + ' [' k = 0 for j in I_eq: if k > 0: W[i] = 
W[i] + ' ,\n' k += 1 W[i] = W[i] + I_inv[j].split("-")[1] W[i] = W[i] + ' ]' W[i] = W[i].replace("&", "&") W[i] = W[i].replace(":", "") ##### Assign hashing from graph nodes to the component labels C = { } # C {dict} contains the event names that can be grouped together (keys are event names) for i in range(labels.shape[0]): C[I_inv[i]] = W[labels[i]] ################### ###### Create a new graph (list of edges) with shrinked nodes A_new = [] for e in A: # consider the edges from the original graph, if one node is in the connected components, # then create a new edge with the new aggregated nodes e0, e1 = e[0], e[1] # nodes of a edge e0_org = e0 e1_org = e1 if e0 in C: e0 = C[e0] # new node if e1 in C: e1 = C[e1] # new node # add the edge if not a self-loop and not already existing if e0 != e1: # compute the weights prob = Adj[I_org[e0_org], I_org[ e1_org]] # find the probability weight from the adjacency matrix lag = Adj_lag[I_org[e0_org], I_org[ e1_org]] # find the probability weight from the adjacency matrix # check if the edge is already in the list isinlist = False for i in range(len(A_new)): if A_new[i][0] == e0 and A_new[i][1] == e1: isinlist = True # add to the edge count new_prob = np.min([A_new[i][2] + prob, 1]) new_lag = ( (A_new[i][3] * A_new[i][4]) + prob) / (1 + A_new[i][4]) new_count = 1 + A_new[i][4] A_new[i] = (e0, e1, new_prob, new_lag, new_count) if isinlist == False: count = 1 A_new.append((e0, e1, prob, lag, count)) ################# ####### draw the new graph if True: dot = graphviz.Digraph() for e in A_new: label = '(' + str(int(e[2] * 100) / 100) + ', ' + str( int(e[3] * 100) / 100) + ')' dot.edge(e[0], e[1], label=label) engin = sorted(graphviz.ENGINES)[0] dot.render('process_graph_with_clustering.gv', view=True) # Create print('Process Mining File Saved!') # Conformity Scores list_of_the_scores = [] for i in range(1, len(unq_PC_enc)): total_cost = 0 total_edge_count = 0 L = unq_PC_enc[unq_PC_enc.enc_id == i].node_name.to_numpy()[0] # for on all of the nodes of the patient i for j in range(len(L) - 1): e0 = L[j] e1 = L[j + 1] # is there any aggregation? if e0 in C: e0 = C[e0] # new node if e1 in C: e1 = C[e1] # new node total_edge_count += 1 for e in A_new: if e[0] == e0 and e[1] == e1: total_cost += e[2] total_edge_count = max(total_edge_count, 1) list_of_the_scores.append(total_cost / total_edge_count) list_of_the_scores = np.array(list_of_the_scores) # Some printing functions def return_node_list(i): edge_list = [] L = unq_PC_enc[unq_PC_enc.enc_id == i].node_name.to_numpy()[0] # for on all of the nodes of the patient i node_list = [] for j in range(len(L)): e0 = L[j] # is there any aggregation? 
if e0 in I_org: if e0 in C: e0 = C[e0] # new node node_list.append(e0) return node_list def return_edge_list(i): edge_list = [] node_list = return_node_list(i) for j in range(len(node_list) - 1): e0 = node_list[j] e1 = node_list[j + 1] edge_list.append((e0, e1)) return edge_list def print_major_events(i): print(df_enc_event[df_enc_event.enc_id == i]) return 0 def print_event_log(i): print(dataframe[dataframe.user_id == i]) return 0 # print the highest and lowest conformity scores i = np.argmax(list_of_the_scores) + 1 max_path = return_node_list(i) print('The user index with the highst conformity score: ', i) print('Conformity score: ', list_of_the_scores[i - 1]) print( 'Event log (major events) of the user with the highst conformity score:' ) print_major_events(i) ####### draw the new graph G = nx.DiGraph() for e in A_new: label = '(' + str(int(e[2] * 100) / 100) + ', ' + str( int(e[3] * 100) / 100) + ')' if (e[0], e[1]) in max_path: G.add_edge(e[0], e[1], label=label, penwidth="3", color="green") #, penwidth=str(3*prob))#, penwidth=1) else: G.add_edge(e[0], e[1], label=label) new_G = nx.nx_agraph.to_agraph(G) display(new_G) new_G.draw('process_graph_with_max_path.png', prog='dot') # print the highest and lowest conformity scores i = np.argmin(list_of_the_scores) + 1 max_path = return_node_list(i) print('The user index with the lowest conformity score: ', i) print('Conformity score: ', list_of_the_scores[i - 1]) print( 'Event log (major events) of the user with the lowest conformity score:' ) print_major_events(i)
mat.sum_duplicates()
print(mat)
print()

# Converting from csr to csc with the tocsc() method
newarr = csr_matrix(arr).tocsc()
print(newarr)
print()

# SciPy Graphs

# Connected Components
from scipy.sparse.csgraph import connected_components, dijkstra, floyd_warshall, bellman_ford, depth_first_order, breadth_first_order
from scipy.sparse import csr_matrix

arr = np.array([[0, 1, 2],
                [1, 0, 0],
                [2, 0, 0]])
newarr = csr_matrix(arr)

print(connected_components(newarr))
print()

# Dijkstra
print(dijkstra(newarr, return_predecessors=True, indices=0))
print()

# Floyd Warshall
print(floyd_warshall(newarr, return_predecessors=True))
print()

# Bellman Ford
print(bellman_ford(newarr, return_predecessors=True, indices=0))
print()

# Depth First Order
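# The snippet breaks off at the "Depth First Order" heading; a plausible continuation in
# the same style, using the traversal helpers already imported above (an assumption, not
# taken from the original source):
print(depth_first_order(newarr, 1))
print()

# Breadth First Order
print(breadth_first_order(newarr, 1))
print()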
def compute_neighbors(self, n_neighbors: int = 30, knn: bool = True, random_state: Optional[Union[RandomState, int]] = 0, write_knn_indices: bool = False, metric: str = 'euclidean', metric_kwds: Mapping[str, Any] = {}, smoothknn: bool = True) -> None: """\ Compute distances and connectivities of neighbors. Parameters ---------- n_neighbors Use this number of nearest neighbors. knn Restrict result to `n_neighbors` nearest neighbors. Returns ------- Writes sparse graph attributes `.distances` and `.connectivities`. Also writes `.knn_indices` and `.knn_distances` if `write_knn_indices==True`. """ if n_neighbors > self._data.shape[0]: # very small datasets n_neighbors = 1 + int(0.5 * self._data.shape[0]) print('Warning: n_obs too small: adjusting to `n_neighbors = {}`'. format(n_neighbors)) if self._data.shape[0] >= 10000 and not knn: print( 'Warning: Using high n_obs without `knn=True` takes a lot of memory...' ) self.n_neighbors = n_neighbors self.knn = knn X = self._data # neighbor search knn_indices, knn_distances = compute_neighbors_umap( X, n_neighbors, random_state, metric=metric, metric_kwds=metric_kwds) # write indices as attributes if write_knn_indices: self.knn_indices = knn_indices self.knn_distances = knn_distances if smoothknn: # we need self._distances also for method == 'gauss' if we didn't # use dense distances self._distances, self._connectivities = compute_connectivities_umap( knn_indices, knn_distances, self._data.shape[0], self.n_neighbors) else: s = np.repeat(np.arange(knn_indices.shape[0]), knn_indices.shape[1]) t = knn_indices.flatten() w = np.ones(t.shape) self._connectivities = scipy.sparse.csr_matrix( (w, (s.astype(np.int), t.astype(np.int))), (X.shape[0], X.shape[0])) self._distances = scipy.sparse.csr_matrix( (knn_distances.flatten(), (s.astype(np.int), t.astype(np.int))), (X.shape[0], X.shape[0])) self._number_connected_components = 1 if issparse(self._connectivities): from scipy.sparse.csgraph import connected_components self._connected_components = connected_components( self._connectivities) self._number_connected_components = self._connected_components[0]
def merge_components(Y, A, b, C, f, S, sn_pix, temporal_params, spatial_params, dview=None, thr=0.85, fast_merge=True, mx=1000, bl=None, c1=None, sn=None, g=None): """ Merging of spatially overlapping components that have highly correlated temporal activity The correlation threshold for merging overlapping components is user specified in thr Parameters: ----------- Y: np.ndarray residual movie after subtracting all found components (Y_res = Y - A*C - b*f) (d x T) A: sparse matrix matrix of spatial components (d x K) b: np.ndarray spatial background (vector of length d) C: np.ndarray matrix of temporal components (K x T) f: np.ndarray temporal background (vector of length T) S: np.ndarray matrix of deconvolved activity (spikes) (K x T) sn_pix: ndarray noise standard deviation for each pixel temporal_params: dictionary all the parameters that can be passed to the update_temporal_components function spatial_params: dictionary all the parameters that can be passed to the update_spatial_components function thr: scalar between 0 and 1 correlation threshold for merging (default 0.85) mx: int maximum number of merging operations (default 50) sn_pix: nd.array noise level for each pixel (vector of length d) fast_merge: bool if true perform rank 1 merging, otherwise takes best neuron bl: baseline for fluorescence trace for each row in C c1: initial concentration for each row in C g: discrete time constant for each row in C sn: noise level for each row in C Returns: -------- A: sparse matrix matrix of merged spatial components (d x K) C: np.ndarray matrix of merged temporal components (K x T) nr: int number of components after merging merged_ROIs: list index of components that have been merged S: np.ndarray matrix of merged deconvolved activity (spikes) (K x T) bl: float baseline for fluorescence trace c1: float initial concentration g: float discrete time constant sn: float noise level Raise: ----- Exception("The number of elements of bl\c1\g\sn must match the number of components") See Also: -------- """ #tests and initialization nr = A.shape[1] if bl is not None and len(bl) != nr: raise Exception( "The number of elements of bl must match the number of components") if c1 is not None and len(c1) != nr: raise Exception( "The number of elements of c1 must match the number of components") if sn is not None and len(sn) != nr: raise Exception( "The number of elements of sn must match the number of components") if g is not None and len(g) != nr: raise Exception( "The number of elements of g must match the number of components") [d, t] = np.shape(Y) # % find graph of overlapping spatial components A_corr = scipy.sparse.triu(A.T * A) A_corr.setdiag(0) A_corr = A_corr.tocsc() FF2 = A_corr > 0 C_corr = scipy.sparse.csc_matrix(A_corr.shape) for ii in range(nr): overlap_indeces = A_corr[ii, :].nonzero()[1] if len(overlap_indeces) > 0: #we chesk the correlation of the calcium traces for eahc overlapping components corr_values = [ scipy.stats.pearsonr(C[ii, :], C[jj, :])[0] for jj in overlap_indeces ] C_corr[ii, overlap_indeces] = corr_values FF1 = (C_corr + C_corr.T) > thr FF3 = FF1.multiply(FF2) nb, connected_comp = csgraph.connected_components( FF3) # % extract connected components p = temporal_params['p'] list_conxcomp = [] for i in range(nb): # we list them if np.sum(connected_comp == i) > 1: list_conxcomp.append((connected_comp == i).T) list_conxcomp = np.asarray(list_conxcomp).T if list_conxcomp.ndim > 1: cor = np.zeros((np.shape(list_conxcomp)[1], 1)) for i in range(np.size(cor)): fm = 
np.where(list_conxcomp[:, i])[0] for j1 in range(np.size(fm)): for j2 in range(j1 + 1, np.size(fm)): cor[i] = cor[i] + C_corr[fm[j1], fm[j2]] # if not fast_merge: # Y_res = Y - A.dot(C) #residuals=background=noise if np.size(cor) > 1: ind = np.argsort(np.squeeze(cor))[::-1] #we get the size (indeces) else: ind = [0] nbmrg = min((np.size(ind), mx)) # number of merging operations #we initialize the values A_merged = lil_matrix((d, nbmrg)) C_merged = np.zeros((nbmrg, t)) S_merged = np.zeros((nbmrg, t)) bl_merged = np.zeros((nbmrg, 1)) c1_merged = np.zeros((nbmrg, 1)) sn_merged = np.zeros((nbmrg, 1)) g_merged = np.zeros((nbmrg, p)) merged_ROIs = [] for i in range(nbmrg): merged_ROI = np.where(list_conxcomp[:, ind[i]])[0] merged_ROIs.append(merged_ROI) #we l2 the traces to have normalization values C_to_norm = np.sqrt( [computedC.dot(computedC) for computedC in C[merged_ROI]]) # fast_merge = False # from here we are computing initial values for C and A Acsc = A.tocsc()[:, merged_ROI] Ctmp = np.array(C)[merged_ROI, :] print((merged_ROI.T)) #this is a big normalization value that for every one of the merged neuron C_to_norm = np.sqrt( np.ravel(Acsc.power(2).sum(axis=0)) * np.sum(Ctmp**2, axis=1)) indx = np.argmax(C_to_norm) if fast_merge: #we normalize the values of different A's to be able to compare them efficiently. we then sum them computedA = Acsc.dot( scipy.sparse.diags( C_to_norm, 0, (len(C_to_norm), len(C_to_norm)))).sum(axis=1) for _ in range( 10 ): # we operate a rank one NMF, refining it multiple times (see cnmf demos ) computedC = np.maximum( Acsc.T.dot(computedA).T.dot(Ctmp) / (computedA.T * computedA), 0) computedA = np.maximum( Acsc.dot(Ctmp.dot(computedC.T)) / (computedC * computedC.T), 0) else: print('Simple Merging Take Best Neuron') computedC = Ctmp[indx] computedA = Acsc[:, indx] # then we de-normalize them using A_to_norm A_to_norm = np.sqrt( computedA.T.dot(computedA)[0, 0] / Acsc.power(2).sum(0).max()) computedA /= A_to_norm computedC *= A_to_norm # we then compute the traces ( deconvolution ) to have a clean c and noise in the background if g is not None: computedC, bm, cm, gm, sm, ss, lam_ = constrained_foopsi( np.array(computedC).squeeze(), g=g[merged_ROI[indx]], **temporal_params) else: computedC, bm, cm, gm, sm, ss, lam_ = constrained_foopsi( np.array(computedC).squeeze(), g=None, **temporal_params) A_merged[:, i] = computedA C_merged[i, :] = computedC S_merged[i, :] = ss[:t] bl_merged[i] = bm c1_merged[i] = cm sn_merged[i] = sm g_merged[i, :] = gm #we want to remove merged neuron from the initial part and replace them with merged ones neur_id = np.unique(np.hstack(merged_ROIs)) good_neurons = np.setdiff1d(list(range(nr)), neur_id) A = scipy.sparse.hstack((A.tocsc()[:, good_neurons], A_merged.tocsc())) C = np.vstack((C[good_neurons, :], C_merged)) #we continue for the variables if S is not None: S = np.vstack((S[good_neurons, :], S_merged)) if bl is not None: bl = np.hstack((bl[good_neurons], np.array(bl_merged).flatten())) if c1 is not None: c1 = np.hstack((c1[good_neurons], np.array(c1_merged).flatten())) if sn is not None: sn = np.hstack((sn[good_neurons], np.array(sn_merged).flatten())) if g is not None: g = np.vstack((np.vstack(g)[good_neurons], g_merged)) nr = nr - len(neur_id) + nbmrg else: print('No neurons merged!') merged_ROIs = [] return A, C, nr, merged_ROIs, S, bl, c1, sn, g
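# A minimal sketch of the merge graph built above: components overlap spatially (FF2)
# and correlate temporally above `thr` (FF1); merge candidates are the connected
# components of the elementwise product of the two graphs. All matrices are toy data.
import numpy as np
import scipy.sparse
from scipy.sparse import csgraph

thr = 0.85
FF2 = scipy.sparse.csc_matrix(np.array([[0, 1, 0],
                                        [0, 0, 0],
                                        [0, 0, 0]], dtype=bool))   # upper-triangular spatial overlap
C_corr = scipy.sparse.csc_matrix(np.array([[0.0, 0.9, 0.0],
                                           [0.0, 0.0, 0.0],
                                           [0.0, 0.0, 0.0]]))      # upper-triangular correlations
FF1 = (C_corr + C_corr.T) > thr
FF3 = FF1.multiply(FF2)
nb, connected_comp = csgraph.connected_components(FF3)
print(nb, connected_comp)   # components 0 and 1 share a label -> merge candidates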
def get_visible_points(verPredOrig, classMask, filterSize=8, normThresh=0.07, pointsDistanceThresh=5): nKeypoints = verPredOrig.shape[1] // 2 # Create conv layer which does mean operation over a (filterSize x filterSize) pixel area meanFilter = torch.nn.Conv2d(nKeypoints * 2, nKeypoints * 2, filterSize, stride=1, padding=filterSize // 2, dilation=1, groups=nKeypoints * 2, bias=False).cuda() meanFilter.state_dict()['weight'][:] = 1 / filterSize**2 # Use mean filter on vertex field, exclude image border element to make # shapes match (since an even filterSize should be used). filteredVerPred = meanFilter(verPredOrig)[:, :, :-1, :-1] _, _, height, width = filteredVerPred.shape maskedPixels, _ = matrixToIndices(classMask) maskedPixels = torch.index_select(maskedPixels, 1, torch.tensor([1, 0]).cuda()) verPred = torch.reshape(verPredOrig, [nKeypoints, 2, height, width]).squeeze() verPredPixels = verPred[:, :, maskedPixels[:, 1], maskedPixels[:, 0]] verPredPixels = verPredPixels / torch.norm(verPredPixels, dim=1)[:, None] verPredPixels = verPredPixels.permute(0, 2, 1) filteredVerPredAlt = torch.reshape(filteredVerPred, [nKeypoints, 2, height, width]) # Calculate norm of mean filtered vertex filed verNorms = torch.norm(filteredVerPredAlt, dim=1) verNorms_np = verNorms.cpu().detach().numpy() # Find pixels within segmented areas whose filtered norms are small visibilityMatrix = (verNorms < normThresh) & classMask visibilityMatrix_np = visibilityMatrix.cpu().detach().numpy() visiblePointsList = [] for iKeypoint in range(nKeypoints): # Reshape visible pixels from matrix to point form #visibleClusterPoints=np.stack(np.where(visibilityMatrix_np[iKeypoint])).T visibleClusterPoints, _ = matrixToIndices(visibilityMatrix[iKeypoint]) if len(visibleClusterPoints) == 0: visiblePointsList.append([]) continue # Find connectivity between visible points adjMatrix = radius_neighbors_graph( visibleClusterPoints, radius=pointsDistanceThresh, include_self=False, mode='connectivity').toarray().astype(bool) # Cluster points that are connected, i.e. belong to the same GT point n_components, labels = connected_components(csgraph=adjMatrix, directed=False, return_labels=True) labels = torch.from_numpy(labels).cuda() # For each different cluster of visible points, find the one with the smallest norm # TODO: change to the one with most inlier counts? visiblePoints = torch.zeros((n_components, 2)) for iFeature in range(n_components): # Find points belonging to current cluster isLabel = labels == (iFeature) # Get filtered vertex norms for current cluster points pointCluster = visibleClusterPoints[isLabel] #pointClusterNorms = verNorms[iKeypoint][pointCluster[:,0],pointCluster[:,1]] pointsDirection = getPointDirections(pointCluster, maskedPixels, normalized=True) votingFunction = lambda x, y: innerProductExponentiated( x, y, innerProductExponent=1, frequencyMultiplierExponent=0, threshold=0.999) votingScore = getVotingScore(pointsDirection, verPredPixels[iKeypoint], votingFunction) scoreSum, _ = getScoreSum(votingScore, pointsDirection) biggestScoreIdx = torch.argmax(scoreSum) bestPoint = pointCluster[biggestScoreIdx] visiblePoints[iFeature] = torch.index_select( bestPoint, 0, torch.tensor([1, 0]).cuda()) - 0.5 # Select point with smallest norm (deprecated) #verNormSMallIdx = np.argmin(pointClusterNorms) #visiblePoints[iFeature] = pointCluster[verNormSMallIdx,::-1] - 0.5 # Create a list of arrays which holds the detected visible points visiblePointsList.append(visiblePoints) return visiblePointsList
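# A minimal sketch of the point-clustering step above: pixels within pointsDistanceThresh
# of each other are linked in a radius-neighbors graph and grouped with
# connected_components. The coordinates are toy values.
import numpy as np
from scipy.sparse.csgraph import connected_components
from sklearn.neighbors import radius_neighbors_graph

visibleClusterPoints = np.array([[0, 0], [1, 0], [0, 1],   # one tight cluster
                                 [10, 10], [11, 10]])      # a second cluster far away
pointsDistanceThresh = 5
adjMatrix = radius_neighbors_graph(visibleClusterPoints, radius=pointsDistanceThresh,
                                   include_self=False,
                                   mode='connectivity').toarray().astype(bool)
n_components, labels = connected_components(csgraph=adjMatrix, directed=False,
                                             return_labels=True)
print(n_components, labels)   # -> 2 clusters, labels e.g. [0 0 0 1 1]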
def compute_neighbors(self, n_neighbors: int = 30, knn: bool = True, n_pcs: Optional[int] = None, use_rep: Optional[str] = None, method: str = 'umap', random_state: Optional[Union[RandomState, int]] = 0, write_knn_indices: bool = False, metric: str = 'euclidean', metric_kwds: Mapping[str, Any] = {}) -> None: """\ Compute distances and connectivities of neighbors. Parameters ---------- n_neighbors Use this number of nearest neighbors. knn Restrict result to `n_neighbors` nearest neighbors. {n_pcs} {use_rep} Returns ------- Writes sparse graph attributes `.distances` and `.connectivities`. Also writes `.knn_indices` and `.knn_distances` if `write_knn_indices==True`. """ if n_neighbors > self._adata.shape[0]: # very small datasets n_neighbors = 1 + int(0.5 * self._adata.shape[0]) logg.warn( 'n_obs too small: adjusting to `n_neighbors = {}`'.format( n_neighbors)) if method == 'umap' and not knn: raise ValueError('`method = \'umap\' only with `knn = True`.') if method not in {'umap', 'gauss'}: raise ValueError('`method` needs to be \'umap\' or \'gauss\'.') if self._adata.shape[0] >= 10000 and not knn: logg.warn( 'Using high n_obs without `knn=True` takes a lot of memory...') self.n_neighbors = n_neighbors self.knn = knn X = choose_representation(self._adata, use_rep=use_rep, n_pcs=n_pcs) # neighbor search use_dense_distances = (metric == 'euclidean' and X.shape[0] < 8192) or knn == False if use_dense_distances: _distances = pairwise_distances(X, metric=metric, **metric_kwds) knn_indices, knn_distances = get_indices_distances_from_dense_matrix( _distances, n_neighbors) if knn: self._distances = get_sparse_matrix_from_indices_distances_numpy( knn_indices, knn_distances, X.shape[0], n_neighbors) else: self._distances = _distances else: # non-euclidean case and approx nearest neighbors if X.shape[0] < 4096: X = pairwise_distances(X, metric=metric, **metric_kwds) metric = 'precomputed' knn_indices, knn_distances = compute_neighbors_umap( X, n_neighbors, random_state, metric=metric, metric_kwds=metric_kwds) # write indices as attributes if write_knn_indices: self.knn_indices = knn_indices self.knn_distances = knn_distances logg.msg('computed neighbors', t=True, v=4) if not use_dense_distances or method == 'umap': # we need self._distances also for method == 'gauss' if we didn't # use dense distances self._distances, self._connectivities = compute_connectivities_umap( knn_indices, knn_distances, self._adata.shape[0], self.n_neighbors) # overwrite the umap connectivities if method is 'gauss' # self._distances is unaffected by this if method == 'gauss': self._compute_connectivities_diffmap() logg.msg('computed connectivities', t=True, v=4) self._number_connected_components = 1 if issparse(self._connectivities): from scipy.sparse.csgraph import connected_components self._connected_components = connected_components( self._connectivities) self._number_connected_components = self._connected_components[0]