def mat_all_point(m_mat: coo_matrix, vertex: list, alpha: float):
    """
    Compute E - alpha * m_mat.T (the identity matrix minus the scaled, transposed transition matrix)
    :param m_mat: sparse transition matrix
    :param vertex: total user and item points
    :param alpha: the probability of continuing the random walk
    :return: sparse CSR matrix of shape (len(vertex), len(vertex))
    """
    total_len = len(vertex)
    # Build a sparse identity matrix E of size total_len x total_len
    row = np.array(range(total_len))
    col = np.array(range(total_len))
    data = np.ones(total_len)
    eye_t = coo_matrix((data, (row, col)), shape=(total_len, total_len))
    return eye_t.tocsr() - alpha * m_mat.tocsr().transpose()
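# A minimal usage sketch (the vertex list and transition values below are toy assumptions,
# not taken from the original source): build a small transition matrix and form E - alpha * M^T.
import numpy as np
from scipy.sparse import coo_matrix

vertex = ['user_A', 'user_B', 'item_1', 'item_2']
rows = np.array([0, 0, 1, 2, 3])
cols = np.array([2, 3, 2, 0, 0])
data = np.array([0.5, 0.5, 1.0, 1.0, 1.0])         # toy transition probabilities
m_mat = coo_matrix((data, (rows, cols)), shape=(len(vertex), len(vertex)))

lhs = mat_all_point(m_mat, vertex, alpha=0.8)       # CSR matrix of shape (4, 4)
print(lhs.todense())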
def layout_knn(self, knn: sparse.coo_matrix) -> np.ndarray:
    """Lay out the KNN graph after reweighting each edge by the Jaccard similarity of its endpoints"""
    edges = np.stack((knn.row, knn.col), axis=1)
    # Calculate Jaccard similarities (shared neighbours over the union of neighbours)
    js = []  # type: List[float]
    knncsr = knn.tocsr()
    for i, j in edges:
        r = knncsr.getrow(i)
        c = knncsr.getrow(j)
        shared = r.minimum(c).nnz
        total = r.maximum(c).nnz
        js.append(shared / total if total > 0 else 0)
    weights = np.array(js) + 0.00001  # OpenOrd doesn't like 0 weights
    self.graph = nx.Graph()
    self.graph.add_nodes_from(range(knn.shape[0]))
    for i, edge in enumerate(edges):
        self.graph.add_edge(edge[0], edge[1], weight=weights[i])
    return self.layout(self.graph)
def _iter_ids(self, ids: ndarray, mat: coo_matrix, n_dim: int) -> Iterator[ndarray]:
    """
    Iterate over metadata vectors of size 'n_dim' encoded in 'mat'.

    If too few metadata elements are found to satisfy the 'n_dim' requirement,
    the vectors will be padded with zeros to ensure homogeneity.

    See Also
    --------
    Dataset._iter_meta
    """
    if mat is not None:
        yield from self._iter_meta(ids, mat.tocsr(), n_dim)
    elif n_dim > 1:
        # No metadata matrix: pad each id with n_dim - 1 zeros
        ids = np.c_[ids, np.zeros((len(ids), n_dim - 1), dtype=int)]
        yield from ids
    else:
        yield from ids.reshape(-1, 1)
def fit_predict(self, knn: sparse.coo_matrix) -> np.ndarray:
    """
    Given a sparse adjacency matrix, perform Louvain-Jaccard clustering

    Args:
        knn:    The sparse adjacency matrix

    Returns:
        labels: The cluster labels

    Remarks:
        After clustering, the Louvain-Jaccard weighted undirected graph is available as
        the property 'graph' of type nx.Graph, and also in the form of a sparse adjacency
        matrix as the property 'lj_knn' of type scipy.sparse.coo_matrix
    """
    if self.jaccard:
        edges = np.stack((knn.row, knn.col), axis=1)
        # Calculate Jaccard similarities (shared neighbours over the union of neighbours)
        js = []  # type: List[float]
        knncsr = knn.tocsr()
        for i, j in edges:
            r = knncsr.getrow(i)
            c = knncsr.getrow(j)
            shared = r.minimum(c).nnz
            total = r.maximum(c).nnz
            if total > 0:
                js.append(shared / total)
            else:
                js.append(0)
        weights = np.array(js) + 0.00001  # OpenOrd doesn't like 0 weights
        self.lj_knn = sparse.coo_matrix((weights, (knn.row, knn.col)), shape=knn.shape)
        self.graph = nx.Graph()
        self.graph.add_nodes_from(range(knn.shape[0]))  # keep isolated nodes so every id gets a label
        for i, edge in enumerate(edges):
            self.graph.add_edge(edge[0], edge[1], weight=weights[i])
    else:
        self.graph = nx.from_scipy_sparse_matrix(knn)
    partitions = community.best_partition(self.graph, resolution=self.resolution)
    return np.array([partitions[key] for key in range(knn.shape[0])])
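# A minimal usage sketch (the class name `LouvainJaccard` and its constructor arguments are
# assumptions, not confirmed by the source): cluster a small KNN graph built with scikit-learn.
import numpy as np
from scipy import sparse
from sklearn.neighbors import kneighbors_graph

X = np.random.rand(100, 10)                                     # toy data: 100 samples x 10 features
knn = kneighbors_graph(X, n_neighbors=5, mode='connectivity')   # CSR adjacency matrix
knn = knn.tocoo()                                               # fit_predict expects a coo_matrix

lj = LouvainJaccard(resolution=1.0, jaccard=True)               # hypothetical constructor
labels = lj.fit_predict(knn)
print(np.unique(labels))                                        # one label per sample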
def crop_roi_mask(roi_mask: coo_matrix) -> Optional[coo_matrix]:
    """Crop an ROI mask to the smallest rectangle that fits all nonzero elements

    Parameters
    ----------
    roi_mask : coo_matrix
        The ROI mask to crop

    Returns
    -------
    coo_matrix
        A cropped ROI mask, or None if the coo_matrix is empty
    """
    bounds = roi_bounds(roi_mask)

    if bounds is None:
        return None

    # Convert coo to csr matrix so we can take advantage of slicing
    cropped_mask = roi_mask.tocsr()[bounds[0]:bounds[1], bounds[2]:bounds[3]]
    return cropped_mask.tocoo()
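# A minimal usage sketch (assumes the `roi_bounds` helper from the same module is available):
# crop a sparse ROI mask whose nonzero pixels occupy only a small corner of the frame.
import numpy as np
from scipy.sparse import coo_matrix

dense = np.zeros((512, 512), dtype=np.uint8)
dense[100:110, 200:220] = 1                       # a 10 x 20 ROI inside a 512 x 512 frame
roi = coo_matrix(dense)

cropped = crop_roi_mask(roi)
print(cropped.shape)                              # expected (10, 20), given the roi_bounds behaviour described above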
def stage_data(spots: pd.DataFrame, coo: coo_matrix) -> Tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]:
    """
    Reads the spots and the label image that are passed in and calculates which cell (if any)
    encircles any given spot within its boundaries. It also retrieves the coordinates of the
    cell boundaries, the cell centroids and the cell area
    """
    logger.info(' Number of spots passed-in: %d' % spots.shape[0])
    logger.info(' Number of segmented cells: %d' % len(set(coo.data)))
    logger.info(' Segmentation array implies that image has width: %dpx and height: %dpx'
                % (coo.shape[1], coo.shape[0]))

    # Keep only the spots that fall inside the image
    mask_x = (spots.x >= 0) & (spots.x <= coo.shape[1])
    mask_y = (spots.y >= 0) & (spots.y <= coo.shape[0])
    spots = spots[mask_x & mask_y]

    # 1. Find which cell the spots lie within
    yx_coords = spots[['y', 'x']].values.T
    inc = inside_cell(coo.tocsr(), yx_coords)
    spots = spots.assign(label=inc)

    # 2. Get cell centroids and area
    props = skmeas.regionprops(coo.toarray().astype(np.int32))
    props_df = pd.DataFrame(data=[(d.label, d.area, d.centroid[1], d.centroid[0]) for d in props],
                            columns=['label', 'area', 'x_cell', 'y_cell'])

    # 3. Get the cell boundaries
    cell_boundaries = extract_borders_dip(coo.toarray().astype(np.uint32), 0, 0, [0])

    assert props_df.shape[0] == cell_boundaries.shape[0] == coo.data.max()
    assert set(spots.label[spots.label > 0]) <= set(props_df.label)

    cells = props_df.merge(cell_boundaries)
    cells = cells.sort_values(by=['label', 'x_cell', 'y_cell'])
    assert cells.shape[0] == cell_boundaries.shape[0] == props_df.shape[0]

    # Join spots and cells on the cell label so you can get the x, y coords of the cell for any given spot
    spots = spots.merge(cells, how='left', on=['label'])

    _cells = cells[['label', 'area', 'x_cell', 'y_cell']].rename(columns={'x_cell': 'x', 'y_cell': 'y'})
    _cell_boundaries = cells[['label', 'coords']]
    _spots = spots[['x', 'y', 'label', 'Gene', 'x_cell', 'y_cell']].rename(
        columns={'Gene': 'target', 'x': 'x_global', 'y': 'y_global'})

    return _cells, _cell_boundaries, _spots
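# A minimal usage sketch (assumes the module helpers this function relies on, e.g. inside_cell
# and extract_borders_dip, are importable; the label image and spot table are toy data):
import numpy as np
import pandas as pd
from scipy.sparse import coo_matrix

label_img = np.zeros((100, 100), dtype=np.int32)
label_img[10:30, 10:30] = 1                       # cell with label 1
label_img[50:80, 40:70] = 2                       # cell with label 2
coo = coo_matrix(label_img)

spots = pd.DataFrame({'x': [15, 55, 90],
                      'y': [12, 60, 5],
                      'Gene': ['Vip', 'Sst', 'Plp1']})

cells, cell_boundaries, spots_out = stage_data(spots, coo)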
def get_entity_corr_coef(interactions: coo_matrix, entity_id: int, entity_type: str,
                         embeddings: dict, ignore_sparse_zeros=True, use_zero_mean=False,
                         corr_type='pearson', neg_sampling=False, check_normal_dist=True):
    """
    Assumes a rating matrix with rows for users and columns for items
    """
    p = embeddings['user'].shape[1]
    cov_for_p_variables = []
    if entity_type == 'user':
        embed = embeddings['user'][entity_id]
        # embedding used for covariance computation
        cov_embed = embeddings['item']
        # ratings used for covariance computation
        ratings = np.squeeze(np.asarray(interactions.tocsr()[entity_id, :].todense()))
    elif entity_type == 'item':
        embed = embeddings['item'][entity_id]
        # embedding used for covariance computation
        cov_embed = embeddings['user']
        # ratings used for covariance computation
        ratings = np.squeeze(np.asarray(interactions.tocsr()[:, entity_id].todense()))

    if ignore_sparse_zeros:
        idx = np.where(ratings != 0)[0]
        ratings = ratings[idx]

        # TODO: Use `sample_items` method
        # Use this for BPR
        if neg_sampling:
            if entity_type == 'user':
                n_sample = interactions.shape[1]
            else:
                n_sample = interactions.shape[0]
            neg_idx = np.random.randint(n_sample, size=len(idx))
            # neg_idx = np.random.choice(np.setdiff1d(np.arange(interactions.n_items),
            #                                         pos_idx), size=len(pos_idx),
            #                            replace=False)
            neg_ratings = [0] * len(ratings)
            idx = np.concatenate([idx, neg_idx])
            ratings = np.concatenate([ratings, neg_ratings])

        cov_embed = cov_embed[idx]

    # Covariance between the observed ratings and each latent dimension k
    for k in range(p):
        cov_embed_latent_variables_at_k = cov_embed[:, k]
        cov_mat_for_k = get_cov(ratings, cov_embed_latent_variables_at_k,
                                use_zero_mean=use_zero_mean)
        cov_for_k = cov_mat_for_k[0, 1]
        cov_for_p_variables.append(cov_for_k)

    # TODO: Change from printing back to logging
    if check_normal_dist:
        alpha = 1e-3
        p_embed = normaltest(embed)[1]
        p_cov_for_p_variables = normaltest(cov_for_p_variables)[1]
        if p_embed < alpha:
            print(f"{entity_type}-{entity_id}: Entity Embeddings are unlikely normally distributed.")
        if p_cov_for_p_variables < alpha:
            print(f"{entity_type}-{entity_id}: Covariances are unlikely normally distributed.")

    cov_for_p_variables = np.array(cov_for_p_variables)
    corr_coef = get_corr_coef(embed, cov_for_p_variables, corr_type=corr_type)
    return corr_coef
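# A minimal usage sketch (assumes the module's get_cov and get_corr_coef helpers are importable;
# the rating matrix and embeddings below are random toy data, not real model output):
import numpy as np
from scipy.sparse import coo_matrix

rng = np.random.default_rng(0)
n_users, n_items, p = 20, 30, 16
dense = rng.integers(0, 6, size=(n_users, n_items))          # ratings 0..5, 0 = unobserved
interactions = coo_matrix(dense)
embeddings = {'user': rng.normal(size=(n_users, p)),
              'item': rng.normal(size=(n_items, p))}

coef = get_entity_corr_coef(interactions, entity_id=3, entity_type='user',
                            embeddings=embeddings, check_normal_dist=False)
print(coef)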