def stack_on_matching_axis(m1: Union[sparse.spmatrix, np.ndarray],
                           m2: sparse.spmatrix,
                           default: str = 'cols'):
    if m1.ndim == 1:
        m1 = m1.reshape(-1, 1)
    if m2.ndim == 1:
        m2 = m2.reshape(-1, 1)
    if m1.shape == m2.shape:
        choice = default
    elif m1.shape[0] == m2.shape[0]:
        choice = 'cols'
    elif m1.shape[1] == m2.shape[1]:
        choice = 'rows'
    elif m1.shape[0] == m2.shape[1]:
        m2 = m2.transpose()
        choice = 'cols'
    elif m1.shape[1] == m2.shape[0]:
        m2 = m2.transpose()
        choice = 'rows'
    else:
        raise AttributeError(
            'Unable to find an axis on which the two matrices might be stacked'
        )
    if choice == 'cols':
        return matrix_hstack((m1, m2))
    elif choice == 'rows':
        return matrix_vstack((m1, m2))

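# A minimal usage sketch for stack_on_matching_axis; it assumes that the
# helpers matrix_hstack / matrix_vstack (not shown above) behave like
# scipy.sparse.hstack / scipy.sparse.vstack.
def _demo_stack_on_matching_axis():
    from scipy import sparse
    m1 = sparse.random(4, 3, density=0.5, format='csr')
    m2 = sparse.random(4, 2, density=0.5, format='csr')
    stacked = stack_on_matching_axis(m1, m2)  # row counts match -> column stack
    print(stacked.shape)  # (4, 5)
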
def document_similarity(td_matrix: sp.spmatrix, doc_id: int):
    """
    Takes a topic-document matrix and calculates the similarity of a given
    document against all other documents that share a topic with it.
    :param td_matrix: topic-document matrix
    :param doc_id: the id of the given document
    :return: a coo_matrix consisting of one document's similarity scores (one row).
    """
    doc = td_matrix.getrow(doc_id)
    topics_in_doc = doc.nonzero()[1]
    rows = np.array([])
    cols = np.array([])
    vals = np.array([])
    for topic_id in topics_in_doc:
        topic = td_matrix.getcol(topic_id)
        docs_in_topic = topic.nonzero()[0]
        # Filter docs that have already been processed, i.e. documents earlier in the loop.
        docs_in_topic = [d for d in docs_in_topic if doc_id < d]
        # Put documents that share the same topic as the main document into a
        # dictionary, mapping doc_id -> topic distribution value.
        Y = {y: topic[y].data[0] for y in docs_in_topic}
        x = topic[doc_id].data[0]
        similarity_sum = similarity_function(x, Y)
        # Append to the row_ids, column_ids and values arrays,
        # so that a coo_matrix can be constructed efficiently.
        rows = np.concatenate((rows, np.zeros(len(similarity_sum))))
        cols = np.concatenate((cols, np.array(list(similarity_sum.keys()))))
        vals = np.concatenate((vals, np.array(list(similarity_sum.values()))))
    # Construct the similarity coo_matrix over the documents that share topics
    # with the main document.
    sim_matrix = sp.coo_matrix((vals, (rows, cols)), shape=(1, td_matrix.shape[0]))
    print(f"Doc: {doc_id} done.")
    return sim_matrix

def normalize_adj(adj: sps.spmatrix) -> sps.spmatrix:
    """Symmetrically normalize adjacency matrix."""
    rowsum = np.array(adj.sum(1))
    d_inv_sqrt = np.power(rowsum, -0.5).flatten()
    d_inv_sqrt[np.isinf(d_inv_sqrt)] = 0.
    d_mat_inv_sqrt = sps.diags(d_inv_sqrt)
    return adj.dot(d_mat_inv_sqrt).transpose().dot(d_mat_inv_sqrt)

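# A minimal usage sketch for normalize_adj: for a symmetric adjacency matrix
# the result is D^{-1/2} A D^{-1/2}.
def _demo_normalize_adj():
    import numpy as np
    import scipy.sparse as sps
    adj = sps.csr_matrix(np.array([[0., 1., 1.],
                                   [1., 0., 0.],
                                   [1., 0., 0.]]))
    adj_norm = normalize_adj(adj)
    # Node 0 has degree 2, nodes 1 and 2 have degree 1, so the nonzero
    # entries are 1 / sqrt(2 * 1) ~= 0.707.
    print(adj_norm.toarray())
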
def to_normalized_sparse_tensor(sparse_matrix: sp.spmatrix):
    """
    Args:
        sparse_matrix: Sparse matrix to be normalized and converted into a tensor.

    Returns:
        Sparse tensor normalized according to the rules specified in the
        original GCN paper. See https://openreview.net/pdf?id=SJU4ayYgl
        for further information.
    """
    sparse_matrix = sp.coo_matrix(sparse_matrix,
                                  shape=sparse_matrix.shape,
                                  dtype=np.float32)
    # Add the identity matrix for inclusion of self features.
    sparse_matrix += sp.identity(
        sparse_matrix.shape[0], dtype=np.float32)  # Data type conversion is important.
    row_sum = np.array(sparse_matrix.sum(axis=1))
    # deg is the D_hat^(-1/2) matrix in the paper: the inverse square root of
    # the diagonal degree matrix of the identity-augmented adjacency matrix.
    deg = sp.diags((row_sum**-0.5).ravel())
    # Normalized adjacency matrix.
    adj_norm = sp.coo_matrix(sparse_matrix.dot(deg).transpose().dot(deg),
                             shape=sparse_matrix.shape,
                             dtype=np.float32)
    adj_norm.eliminate_zeros()
    return to_sparse_tensor(adj_norm)

def sparseMatVariance(mat: spmatrix):
    """
    Calculates the variance of the entries of a given spmatrix.

    :param mat: The matrix.
    :type mat: spmatrix
    """
    # Var(X) = E[X^2] - (E[X])^2 over all matrix entries.
    return mat.power(2).mean() - mat.mean()**2

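# A minimal usage sketch for sparseMatVariance: the result matches the dense
# variance over all matrix entries.
def _demo_sparseMatVariance():
    import numpy as np
    from scipy.sparse import csr_matrix
    mat = csr_matrix(np.array([[0., 1.], [2., 3.]]))
    print(sparseMatVariance(mat))  # 1.25
    print(mat.toarray().var())     # 1.25
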
def _transform_a_spectral(a: spmatrix) -> spmatrix:
    if not isspmatrix_csr(a):
        a = a.tocsr()
    degrees = np.squeeze(np.array(np.sqrt(1.0 / a.sum(axis=0))))
    a = a.multiply(outer(a.indices, a.indptr, degrees))
    a.eliminate_zeros()
    return a

def block_diagonalize_permutation(A: sp.spmatrix):
    A = A.tocsc()
    ATA = A.transpose().dot(A)
    n, cmp = csg.connected_components(ATA)
    col_list = [list(np.where(cmp == j)[0]) for j in range(n)]
    row_list = [
        list(np.unique(A[:, col_list[j]].nonzero()[0])) for j in range(n)
    ]
    return row_list, col_list

def normalize_sparse(A: sp.spmatrix) -> sp.spmatrix:
    """Get (D ** -0.5) * A * (D ** -0.5), where D is the diagonalized row sum."""
    A = sp.coo_matrix(A)
    A.eliminate_zeros()
    rowsum = np.array(A.sum(1))
    assert np.all(rowsum >= 0)
    d_inv_sqrt = np.power(rowsum, -0.5).flatten()
    d_inv_sqrt[np.isinf(d_inv_sqrt)] = 0.0
    d_mat_inv_sqrt = sp.diags(d_inv_sqrt)
    return d_mat_inv_sqrt.dot(A).dot(d_mat_inv_sqrt)

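# A minimal usage sketch for normalize_sparse: on a 2-regular triangle graph
# every edge weight becomes 1 / sqrt(2 * 2) = 0.5.
def _demo_normalize_sparse():
    import numpy as np
    import scipy.sparse as sp
    ring = sp.csr_matrix(np.array([[0., 1., 1.],
                                   [1., 0., 1.],
                                   [1., 1., 0.]]))
    print(normalize_sparse(ring).toarray())  # off-diagonal entries are 0.5
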
def __init__(self, adj: sp.spmatrix, dims):
    super().__init__()
    self.weights = nn.Parameter(
        nn.init.xavier_normal_(torch.zeros(dims, device="cuda")))
    self.bias = nn.Parameter(
        nn.init.normal_(torch.zeros(dims[1], device="cuda")))
    self.D, self.H = dims
    if adj.format != "csr":
        adj = adj.tocsr()
    self.adj = adj.astype("float32")
    self.adj_tensor = sparse_tensor(self.adj).cuda()

def apply_bc(A: sparse.spmatrix,
             b: numpy.ndarray,
             dofs: numpy.ndarray,
             value: float = 0):
    if sparse.isspmatrix_csr(A):
        for dof in dofs:
            A.data[A.indptr[dof]:A.indptr[dof + 1]] = 0.0
            A[dof, dof] = 1.0
        A.eliminate_zeros()
    else:
        raise TypeError("Matrix must be of csr format.")
    b[dofs] = value

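# A minimal usage sketch for apply_bc: enforce u[0] = 1.0 in a small
# tridiagonal system by zeroing row 0, placing a unit diagonal entry and
# setting the corresponding right-hand side value.
def _demo_apply_bc():
    import numpy as np
    from scipy import sparse
    A = sparse.csr_matrix(np.array([[2., -1., 0.],
                                    [-1., 2., -1.],
                                    [0., -1., 2.]]))
    b = np.zeros(3)
    apply_bc(A, b, dofs=np.array([0]), value=1.0)
    print(A.toarray()[0])  # [1. 0. 0.]
    print(b)               # [1. 0. 0.]
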
def append(self, sparse_matrix: ss.spmatrix):
    # Prep variables
    shape = self.shape
    if isinstance(sparse_matrix, SparseDataset):
        sparse_matrix = sparse_matrix.to_backed()

    # Check input
    if not ss.isspmatrix(sparse_matrix):
        raise NotImplementedError(
            "Currently, only sparse matrices of equivalent format can be "
            "appended to a SparseDataset."
        )
    if self.format_str not in {"csr", "csc"}:
        raise NotImplementedError(
            f"The append method for format {self.format_str} "
            f"is not implemented."
        )
    if self.format_str != get_format_str(sparse_matrix):
        raise ValueError(
            f"Matrices must have same format. Currently are "
            f"'{self.format_str}' and '{get_format_str(sparse_matrix)}'"
        )

    # shape
    if self.format_str == "csr":
        assert (
            shape[1] == sparse_matrix.shape[1]
        ), "CSR matrices must have same size of dimension 1 to be appended."
        new_shape = (shape[0] + sparse_matrix.shape[0], shape[1])
    elif self.format_str == "csc":
        assert (
            shape[0] == sparse_matrix.shape[0]
        ), "CSC matrices must have same size of dimension 0 to be appended."
        new_shape = (shape[0], shape[1] + sparse_matrix.shape[1])
    else:
        assert False, "We forgot to update this branching to a new format"
    if "h5sparse_shape" in self.group.attrs:
        del self.group.attrs["h5sparse_shape"]
    self.group.attrs["shape"] = new_shape

    # data
    data = self.group["data"]
    orig_data_size = data.shape[0]
    data.resize((orig_data_size + sparse_matrix.data.shape[0],))
    data[orig_data_size:] = sparse_matrix.data

    # indptr
    indptr = self.group["indptr"]
    orig_data_size = indptr.shape[0]
    append_offset = indptr[-1]
    indptr.resize((orig_data_size + sparse_matrix.indptr.shape[0] - 1,))
    indptr[orig_data_size:] = (
        sparse_matrix.indptr[1:].astype(np.int64) + append_offset
    )

    # indices
    indices = self.group["indices"]
    orig_data_size = indices.shape[0]
    indices.resize((orig_data_size + sparse_matrix.indices.shape[0],))
    indices[orig_data_size:] = sparse_matrix.indices

def solve_kfe(A: spmatrix,
              g0: np.ndarray,
              weight_mat: spmatrix,
              maxit_kfe: int = 1000,
              tol_kfe: float = 1e-12,
              d_kfe: float = 1e6):
    if weight_mat.shape != A.shape:
        raise Exception('Dimension of weight matrix is incorrect.')
    weight_mat = csr_matrix(weight_mat)
    dim_size = A.shape[0]
    gg = g0.flatten(order='F')  # stack distribution matrix into a vector

    # Solve the linear system iteratively.
    for ikfe in range(maxit_kfe):
        gg_tilde = weight_mat @ gg  # weight distribution points by their measure across wealth
        # np.complex was removed in NumPy 1.24; use np.complex128 instead.
        gg1_tilde = spsolve(
            (speye(dim_size, dtype=np.complex128) - d_kfe * A.conj().T),
            gg_tilde)
        gg1_tilde = gg1_tilde / gg1_tilde.sum()
        gg1 = spsolve(weight_mat, gg1_tilde)

        # Check iteration for convergence.
        err_kfe = np.max(np.abs(gg1 - gg))
        if err_kfe < tol_kfe:
            break
        gg = gg1
    return gg.flatten(order='F')

def _sds(P: spmatrix) -> np.ndarray:
    # Get the top two eigenvalues and vectors so we can check for irreducibility.
    vals, vecs = eigs(P.transpose(), k=2, which="LR", ncv=None)

    # Check for irreducibility.
    if np.allclose(vals, 1, rtol=1e2 * EPS, atol=1e2 * EPS):
        raise ValueError("This matrix is reducible.")

    # Sort by real part and take the top one.
    p = np.argsort(vals.real)[::-1]
    vecs = vecs[:, p]
    top_vec = vecs[:, 0]

    # Check for an imaginary component.
    imaginary_component = top_vec.imag
    if not np.allclose(imaginary_component, 0, rtol=EPS, atol=EPS):
        raise ValueError("Top eigenvector has imaginary component.")
    top_vec = top_vec.real

    # Check the sign structure.
    if not (top_vec > -1e4 * EPS).all() and not (top_vec < 1e4 * EPS).all():
        raise ValueError(
            "Top eigenvector has both positive and negative entries.")
    top_vec = np.abs(top_vec)

    # Normalize to 1 and return.
    return top_vec / np.sum(top_vec)

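# A minimal usage sketch for _sds (assumes EPS, from the surrounding module,
# is on the order of machine epsilon): the stationary distribution pi of an
# irreducible chain satisfies P^T pi = pi. Note that eigs(k=2) needs at
# least 4 states.
def _demo_sds():
    import numpy as np
    from scipy.sparse import csr_matrix
    P = csr_matrix(np.array([[0.5, 0.5, 0.0, 0.0],
                             [0.25, 0.25, 0.5, 0.0],
                             [0.0, 0.5, 0.25, 0.25],
                             [0.0, 0.0, 0.5, 0.5]]))
    pi = _sds(P)
    print(np.allclose(P.T @ pi, pi))  # True
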
def compute_node_pattern_score(pattern_features: np.ndarray,
                               adj_mat: sp.spmatrix,
                               neighbors: Dict[int, Set[int]],
                               node_labels: np.ndarray) -> np.ndarray:
    """
    Scores nodes based on local structures.
    """
    n_labels = node_labels.max() + 1
    n_nodes = adj_mat.shape[0]
    count_mat = np.zeros([n_labels, n_labels])

    # Local structure
    node_local_features = np.zeros((n_nodes, n_labels * (n_labels + 1) // 2))
    for u in tqdm.tqdm(range(n_nodes), desc='NodeLocalFeature'):
        count_mat.fill(0)
        for v in neighbors[u]:
            i, j = node_labels[u], node_labels[v]
            i, j = (i, j) if i < j else (j, i)
            count_mat[i, j] += 1
        arr = count_mat[np.triu_indices_from(count_mat)]
        arr /= arr.sum()
        node_local_features[u] = arr

    # First order: Pass 1 in the paper.
    node_first_order_scores = node_local_features @ pattern_features.T
    # Second order: Pass 2 in the paper.
    deg_vec = np.array(adj_mat.sum(1)).squeeze()
    node_second_order_scores = sp.diags(
        (adj_mat @ deg_vec)**-1) @ adj_mat @ (deg_vec[:, None] *
                                              node_first_order_scores)
    node_pattern_scores = node_first_order_scores + node_second_order_scores
    return node_pattern_scores

def csr2bow(csr: sp.spmatrix) -> Iterable[List[Tuple[int, float]]]:
    """
    Converts a CSR matrix to a bag-of-words representation.

    In the standard CSR representation, the column indices for row i are
    stored in indices[indptr[i]:indptr[i+1]] and their corresponding values
    in data[indptr[i]:indptr[i+1]]:

    - data is an array containing all the non-zero elements of the sparse matrix.
    - indices is an array mapping each element in data to its column in the sparse matrix.
    - indptr maps the elements of data and indices to the rows of the sparse matrix.
    """
    assert sp.issparse(csr)
    if not sp.isspmatrix_csr(csr):
        logger.warning("csr2bow: called with non-CSR matrix (inefficient)")
        csr = csr.tocsr()
    data = csr.data
    indices = csr.indices
    indptr = csr.indptr
    bow: Iterable[List[Tuple[int, float]]] = (
        list(zip(indices[indptr[i]:indptr[i + 1]],
                 data[indptr[i]:indptr[i + 1]]))
        for i in range(csr.shape[0]))
    return bow

def scipy_to_torch(mat: sp.spmatrix) -> torch.sparse.Tensor:
    mat: sp.coo_matrix = mat.tocoo()
    indices = np.vstack((mat.row, mat.col))
    indices = torch.LongTensor(indices)
    values = torch.FloatTensor(mat.data)
    tensor = torch.sparse_coo_tensor(indices, values, torch.Size(mat.shape))
    return tensor

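# A minimal usage sketch for scipy_to_torch: round-trip a random sparse
# matrix and compare against the dense original.
def _demo_scipy_to_torch():
    import numpy as np
    import scipy.sparse as sp
    import torch
    mat = sp.random(5, 5, density=0.3, format='csr', dtype=np.float32)
    t = scipy_to_torch(mat)
    print(torch.allclose(t.to_dense(), torch.from_numpy(mat.toarray())))  # True
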
def is_bipartite_fix_scipy(
        A: spmatrix,
        fix_flag: bool = False) -> Tuple[bool, List[int], spmatrix]:
    A = A.tolil()
    n_node = A.shape[-1]
    vtx2reach = set(range(n_node))
    vtx_color = [-1 for _ in range(n_node)]
    flag = True
    while len(vtx2reach) > 0:
        r = random.sample(list(vtx2reach), 1)[0]  # [r] --> r
        q = deque([r])
        while len(q) > 0:
            cur_node = q.popleft()
            vtx2reach.remove(cur_node)
            if vtx_color[cur_node] == -1:
                vtx_color[cur_node] = 0
            nbr = A[cur_node].rows[0]
            for i in nbr:
                if vtx_color[i] == -1:
                    vtx_color[i] = 1 - vtx_color[cur_node]
                    q.append(i)
                elif vtx_color[i] == vtx_color[cur_node]:
                    if fix_flag:
                        A[cur_node, i] = 0
                    else:  # do not fix
                        flag = False
                        return flag, vtx_color, A
    return flag, vtx_color, A

def sparse_matrix_to_tensor(X: sp.spmatrix) -> tf.SparseTensor:
    coo = X.tocoo()
    # np.mat is deprecated; stack the coordinates into an (nnz, 2) array instead.
    indices = np.vstack((coo.row, coo.col)).transpose()
    return tf.SparseTensor(indices, np.array(coo.data, dtype=np.float32),
                           coo.shape)

def sparse_tensor(spmat: sp.spmatrix, grad: bool = False):
    """
    Convert a scipy.sparse matrix to a torch.SparseTensor.

    Parameters
    ----------
    spmat: sp.spmatrix
        The input (sparse) matrix.
    grad: bool
        Whether the resulting tensor should have "requires_grad".

    Returns
    -------
    sparse_tensor: torch.SparseTensor
        The output sparse tensor.
    """
    if str(spmat.dtype) == "float32":
        dtype = torch.float32
    elif str(spmat.dtype) == "float64":
        dtype = torch.float64
    elif str(spmat.dtype) == "int32":
        dtype = torch.int32
    elif str(spmat.dtype) == "int64":
        dtype = torch.int64
    elif str(spmat.dtype) == "bool":
        dtype = torch.uint8
    else:
        dtype = torch.float32
    return torch.sparse_coo_tensor(spmat.nonzero(),
                                   spmat.data,
                                   size=spmat.shape,
                                   dtype=dtype,
                                   requires_grad=grad).coalesce()

def build_pc_ilu(A: spmatrix,
                 drop_tol: Optional[float] = 1e-4,
                 fill_factor: Optional[float] = 20) -> spl.LinearOperator:
    """Incomplete LU preconditioner."""
    P = spl.spilu(A.tocsc(), drop_tol=drop_tol, fill_factor=fill_factor)
    M = spl.LinearOperator(A.shape, matvec=P.solve)
    return M

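# A minimal usage sketch for build_pc_ilu: pass the resulting LinearOperator
# as the preconditioner of an iterative solver such as GMRES.
def _demo_build_pc_ilu():
    import numpy as np
    import scipy.sparse as sp
    import scipy.sparse.linalg as spl
    n = 100
    A = sp.diags([-1.0, 2.0, -1.0], [-1, 0, 1], shape=(n, n), format='csc')
    b = np.ones(n)
    M = build_pc_ilu(A)
    x, info = spl.gmres(A, b, M=M)
    print(info)  # 0 on successful convergence
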
def __init__(self,
             adj_matrix: sp.spmatrix,
             attr_matrix: Union[np.ndarray, sp.spmatrix] = None,
             labels: Union[np.ndarray, sp.spmatrix] = None,
             node_names: np.ndarray = None,
             attr_names: np.ndarray = None,
             class_names: np.ndarray = None,
             metadata: Any = None):
    # Make sure that the dimensions of the matrices / arrays all agree.
    if sp.isspmatrix(adj_matrix):
        adj_matrix = adj_matrix.tocsr().astype(np.float32)
    else:
        raise ValueError(
            "Adjacency matrix must be in sparse format (got {0} instead)."
            .format(type(adj_matrix)))

    if adj_matrix.shape[0] != adj_matrix.shape[1]:
        raise ValueError("Dimensions of the adjacency matrix don't agree.")

    if attr_matrix is not None:
        if sp.isspmatrix(attr_matrix):
            attr_matrix = attr_matrix.tocsr().astype(np.float32)
        elif isinstance(attr_matrix, np.ndarray):
            attr_matrix = attr_matrix.astype(np.float32)
        else:
            raise ValueError(
                "Attribute matrix must be a sp.spmatrix or a np.ndarray (got {0} instead)."
                .format(type(attr_matrix)))

        if attr_matrix.shape[0] != adj_matrix.shape[0]:
            raise ValueError(
                "Dimensions of the adjacency and attribute matrices don't agree."
            )

    if labels is not None:
        if labels.shape[0] != adj_matrix.shape[0]:
            raise ValueError(
                "Dimensions of the adjacency matrix and the label vector don't agree."
            )

    if node_names is not None:
        if len(node_names) != adj_matrix.shape[0]:
            raise ValueError(
                "Dimensions of the adjacency matrix and the node names don't agree."
            )

    if attr_names is not None:
        if len(attr_names) != attr_matrix.shape[1]:
            raise ValueError(
                "Dimensions of the attribute matrix and the attribute names don't agree."
            )

    self.adj_matrix = adj_matrix
    self.attr_matrix = attr_matrix
    self.labels = labels
    self.node_names = node_names
    self.attr_names = attr_names
    self.class_names = class_names
    self.metadata = metadata

def interpolate_adjacencies(
    positive_adj: sparse.spmatrix,
    negative_adj: sparse.spmatrix,
    interpolate: int = 0,
    random_state: Optional[np.random.RandomState] = None,
) -> Tuple[List[sparse.spmatrix], List[float]]:
    """
    Examples:
        >>> from scipy import sparse
        >>> positive_adj = sparse.coo_matrix((
        ...     np.ones(11), (
        ...         np.r_[np.arange(7), [0, 0, 5, 6]],
        ...         np.r_[np.arange(7), [5, 6, 0, 0]]), ))
        >>> negative_adj = sparse.coo_matrix((np.ones(7), (np.arange(7), np.arange(7)), ))
        >>> interpolated_adjs, interpolated_targets = interpolate_adjacencies(
        ...     positive_adj, negative_adj, 1, np.random.RandomState(42))
        >>> interpolated_targets
        [0.5]
        >>> interpolated_adjs[0].row
        array([0, 0, 1, 2, 3, 4, 5, 6, 6], dtype=int32)
        >>> interpolated_adjs[0].col
        array([0, 6, 1, 2, 3, 4, 5, 0, 6], dtype=int32)
    """
    if random_state is None:
        random_state = np.random.RandomState()
    interpolated_adjs = []
    fraction_positive = np.linspace(1, 0, interpolate + 2)[1:-1]
    if interpolate:
        # TODO: This can be sped up significantly if we actually use it.
        positive_adj = positive_adj.tocsr()
        negative_adj = negative_adj.tocsr()
        mismatches = np.array(
            np.where((positive_adj != negative_adj).todense()))
        mismatches = mismatches[:, mismatches[0, :] <= mismatches[1, :]]
        for i in range(interpolate):
            fp = fraction_positive[i]
            # Sample among the mismatch pairs, which are the *columns* of the
            # (2, n_mismatches) array (the original indexed axis 0 here).
            idxs = random_state.choice(np.arange(mismatches.shape[1]),
                                       int(round(mismatches.shape[1] * fp)),
                                       replace=False)
            adj = negative_adj.tocsr()
            idx_1, idx_2 = mismatches[:, idxs]
            adj[idx_1, idx_2] = positive_adj[idx_1, idx_2]
            adj[idx_2, idx_1] = positive_adj[idx_2, idx_1]
            interpolated_adjs.append(adj.tocoo())
    return interpolated_adjs, fraction_positive.tolist()

def take_top(matrix: sparse.spmatrix, n: int) -> sparse.lil_matrix:
    """Keep only the top n nearest neighbours in each row of a sparse distance matrix."""
    arr_ll = matrix.tolil(copy=True)
    for i in range(arr_ll.shape[0]):
        d, r = min_n(np.array(arr_ll.data[i]), np.array(arr_ll.rows[i]), n)
        arr_ll.data[i] = d.tolist()
        arr_ll.rows[i] = r.tolist()
    return arr_ll

def penalize(A: spmatrix,
             b: Optional[Union[ndarray, spmatrix]] = None,
             x: Optional[ndarray] = None,
             I: Optional[DofsCollection] = None,
             D: Optional[DofsCollection] = None,
             epsilon: Optional[float] = None,
             overwrite: bool = False) -> LinearSystem:
    r"""Penalize degrees-of-freedom of a linear system.

    Parameters
    ----------
    A
        The system matrix.
    b
        Optionally, the right hand side vector.
    x
        The values of the penalized degrees-of-freedom. If not given, assumed
        to be zero.
    I
        Specify either this or ``D``: The set of degree-of-freedom indices to
        solve for.
    D
        Specify either this or ``I``: The set of degree-of-freedom indices to
        enforce (rows/diagonal set to zero/one).
    epsilon
        Very small value, the reciprocal of which penalizes deviations from
        the Dirichlet condition.
    overwrite
        Optionally, the original system is both modified (for performance) and
        returned (for compatibility with :func:`skfem.utils.solve`). By
        default, ``False``.

    Returns
    -------
    LinearSystem
        A linear system with the penalized diagonal and RHS entries set to
        very large values, 1/epsilon and x/epsilon, respectively.

    """
    b, x, I, D = _init_bc(A, b, x, I, D)

    Aout = A if overwrite else A.copy()

    d = Aout.diagonal()

    if epsilon is None:
        epsilon = 1e-10 / np.linalg.norm(d[D], np.inf).astype(float)

    d[D] = 1. / epsilon
    Aout.setdiag(d)

    if b is None:
        return Aout

    bout = b if overwrite else b.copy()
    # Nothing needs doing for the mass matrix, but the RHS vector needs the
    # penalty factor.
    if not isinstance(b, spmatrix):
        bout[D] = x[D] / epsilon
    return Aout, bout

def scipy_spmatrix_to_j_sparse_vector_wrapper(sv: spmatrix):
    assert sv.shape[0] == 1, \
        "The first dimension of the sparse matrix must have size 1 to be convertible to a SparseVector."
    # Convert to CSR representation to get indices and values.
    sv: csr_matrix = sv.tocsr()
    sparse_vector_wrapper_cls = gateway.jvm.__getattr__(
        SPARSE_VECTOR_WRAPPER_CLASS_NAME)
    return sparse_vector_wrapper_cls.fromPy(sv.indices.tobytes(),
                                            sv.data.tobytes(), sv.shape[1])

def construct_laplacian(A: sp.spmatrix,
                        type: str = 'unnormalized') -> sp.csr_matrix:
    """Construct the Laplacian of a graph given by an adjacency matrix.

    Parameters
    ----------
    A
        Symmetric adjacency matrix in scipy sparse format.
    type
        One of {'unnormalized', 'random_walk', 'symmetrized'}, default
        'unnormalized'. Type of the Laplacian to compute.

        unnormalized = D - A
        random_walk = I - D^{-1} A
        symmetrized = I - D^{-1/2} A D^{-1/2}

    Returns
    -------
    sp.csr_matrix
        Laplacian matrix in the same format as A.

    """
    if (A != A.T).sum() != 0:
        warnings.warn(
            "Adjacency matrix is not symmetric, the Laplacian might not be PSD."
        )
    # Make sure that there are no self-loops.
    A.setdiag(0)
    A.eliminate_zeros()

    num_nodes = A.shape[0]
    D = np.ravel(A.sum(1))
    D[D == 0] = 1  # avoid division by 0 error

    if type == 'unnormalized':
        L = sp.diags(D) - A
    elif type == 'random_walk':
        L = sp.eye(num_nodes, dtype=A.dtype) - A / D[:, None]
    elif type == 'symmetrized':
        D_sqrt = np.sqrt(D)
        L = sp.eye(num_nodes, dtype=A.dtype) - A / D_sqrt[:, None] / D_sqrt[None, :]
    else:
        raise ValueError("Unsupported Laplacian type {}.".format(type))

    return L

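# A minimal usage sketch for construct_laplacian: rows of the unnormalized
# Laplacian L = D - A sum to zero.
def _demo_construct_laplacian():
    import numpy as np
    import scipy.sparse as sp
    A = sp.csr_matrix(np.array([[0., 1., 0.],
                                [1., 0., 1.],
                                [0., 1., 0.]]))
    L = construct_laplacian(A, type='unnormalized')
    print(np.ravel(L.sum(1)))  # [0. 0. 0.]
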
def post_process(adj: ssp.spmatrix, nodes_dict: dict, dataset: str):
    # makedirs also creates missing parent directories (os.mkdir would fail
    # if ./output did not exist yet).
    os.makedirs(f'./output/{dataset}', exist_ok=True)
    path = os.path.join('output', dataset, 'nodes_dict.pkl')
    pickle.dump(nodes_dict, open(path, 'wb'))
    degs = np.array(adj.sum(axis=1)).squeeze()
    P, P_ = construct_P(degs, nodes_dict, dataset)
    adj_s = P_.T @ adj @ P_
    ssp.save_npz(os.path.join('output', dataset, 'A_s.npz'), adj_s)

def yield_nnz(mat: sp.spmatrix) -> Iterator[Tuple[int, int]]:
    """
    Helper function to extract nonzero values from a scipy.sparse matrix.
    Returns an iterator of nonzero coordinates, in the form of (row, col)
    tuples.
    """
    # Split nonzero coordinates into rows and columns.
    nnr, nnc = mat.nonzero()
    nnr = nnr.tolist()
    nnc = nnc.tolist()
    return zip(nnr, nnc)

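# A minimal usage sketch for yield_nnz: iterate over the (row, col)
# coordinates of the nonzero entries.
def _demo_yield_nnz():
    import numpy as np
    import scipy.sparse as sp
    mat = sp.csr_matrix(np.array([[1., 0.], [0., 2.]]))
    print(list(yield_nnz(mat)))  # [(0, 0), (1, 1)]
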
def save_sparse(sparse_matrix: sparse.spmatrix, output_filename: str):
    assert sparse.issparse(sparse_matrix)
    if sparse.isspmatrix_coo(sparse_matrix):
        coo = sparse_matrix
    else:
        coo = sparse_matrix.tocoo()
    row = coo.row
    col = coo.col
    data = coo.data
    shape = coo.shape
    np.savez(output_filename, row=row, col=col, data=data, shape=shape)

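# A minimal usage sketch for save_sparse; load_sparse below is a hypothetical
# counterpart, not part of the code above.
def _demo_save_sparse():
    import numpy as np
    from scipy import sparse

    def load_sparse(filename):
        loader = np.load(filename)
        return sparse.coo_matrix(
            (loader['data'], (loader['row'], loader['col'])),
            shape=tuple(loader['shape']))

    mat = sparse.random(4, 4, density=0.25, format='coo')
    save_sparse(mat, 'mat.npz')
    restored = load_sparse('mat.npz')
    print(np.allclose(mat.toarray(), restored.toarray()))  # True
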
def _itmd(T: spmatrix, tol: float = 1e-12) -> bool:
    T = ensure_ndarray_or_sparse(T, ndim=2, uniform=True, kind="numeric")
    if not isspmatrix_csr(T):
        T = csr_matrix(T)  # compressed sparse row for fast row slicing
    values = T.data  # non-zero entries of T

    # Check entry-wise positivity.
    is_positive: bool = np.allclose(values, np.abs(values), rtol=tol)
    # Check row normalization.
    is_normed: bool = np.allclose(T.sum(axis=1), 1.0, rtol=tol)

    return is_positive and is_normed

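# A minimal usage sketch for _itmd (assumes ensure_ndarray_or_sparse, from
# the surrounding module, passes valid square inputs through): a
# row-stochastic matrix passes, one with a negative entry does not.
def _demo_itmd():
    import numpy as np
    from scipy.sparse import csr_matrix
    T_good = csr_matrix(np.array([[0.9, 0.1], [0.5, 0.5]]))
    T_bad = csr_matrix(np.array([[1.5, -0.5], [0.5, 0.5]]))
    print(_itmd(T_good))  # True
    print(_itmd(T_bad))   # False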