Example #1
import numpy as np
from scipy.sparse import coo_matrix


def mat_all_point(m_mat: coo_matrix, vertex: list, alpha: float):
    """
    Compute E - alpha * m_mat.T
    :param m_mat: transition matrix of the graph, in COO format
    :param vertex: all user and item vertices
    :param alpha: the probability of continuing the random walk
    :return: the sparse matrix E - alpha * m_mat.T, in CSR format
    """
    total_len = len(vertex)
    row = np.array(range(total_len))
    col = np.array(range(total_len))
    data = np.ones(total_len)
    # Build the identity matrix E in COO format
    eye_t = coo_matrix((data, (row, col)), shape=(total_len, total_len))
    return eye_t.tocsr() - alpha * m_mat.tocsr().transpose()
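
A minimal usage sketch; the 3-vertex graph, its transition weights, and the vertex names below are invented for illustration:

import numpy as np
from scipy.sparse import coo_matrix

# Toy transition matrix over 3 vertices (hypothetical data)
row = np.array([0, 1, 2])
col = np.array([2, 2, 0])
data = np.array([1.0, 1.0, 0.5])
m_mat = coo_matrix((data, (row, col)), shape=(3, 3))

vertex = ['user_a', 'user_b', 'item_x']  # hypothetical vertex list
result = mat_all_point(m_mat, vertex, alpha=0.8)
print(result.todense())  # dense view of E - 0.8 * m_mat.T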
Example #2
    def layout_knn(self, knn: sparse.coo_matrix) -> np.ndarray:
        edges = np.stack((knn.row, knn.col), axis=1)

        # Calculate Jaccard similarities
        js = []  # type: List[float]
        knncsr = knn.tocsr()
        for i, j in edges:
            r = knncsr.getrow(i)
            c = knncsr.getrow(j)
            shared = r.minimum(c).nnz
            total = r.maximum(c).nnz
            js.append(shared / total if total > 0 else 0)  # guard against empty rows
        weights = np.array(js) + 0.00001  # OpenOrd doesn't like 0 weights

        self.graph = nx.Graph()
        self.graph.add_nodes_from(range(knn.shape[0]))
        for i, edge in enumerate(edges):
            self.graph.add_edge(edge[0], edge[1], weight=weights[i])

        return self.layout(self.graph)
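
Since layout_knn is a method of an unnamed class, the Jaccard edge weighting at its core can be exercised standalone; the 4-node symmetric kNN graph below is invented for illustration:

import numpy as np
from scipy import sparse

# Toy symmetric adjacency matrix (hypothetical)
knn = sparse.coo_matrix(
    (np.ones(6), ([0, 1, 1, 2, 2, 3], [1, 0, 2, 1, 3, 2])), shape=(4, 4))
edges = np.stack((knn.row, knn.col), axis=1)
knncsr = knn.tocsr()
for i, j in edges:
    r, c = knncsr.getrow(i), knncsr.getrow(j)
    total = r.maximum(c).nnz  # size of the union of the two neighbourhoods
    jaccard = r.minimum(c).nnz / total if total > 0 else 0.0
    print(f"edge ({i}, {j}): Jaccard = {jaccard:.2f}")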
Example #3
    def _iter_ids(self, ids: ndarray, mat: coo_matrix,
                  n_dim: int) -> Iterator[ndarray]:
        """
        Iterate over metadata vectors of size 'n_dim' encoded in 'mat'.

        If too few metadata elements are found to satisfy 'n_dim' requirement, the
        vectors will be padded with zeros to ensure homogeneity.

        See Also
        --------
        Dataset._iter_meta

        """

        if mat is not None:
            yield from self._iter_meta(ids, mat.tocsr(), n_dim)
        elif n_dim > 1:
            # No metadata matrix: pad each id with zeros to a length-n_dim vector
            ids = np.c_[ids, np.zeros((len(ids), n_dim - 1), dtype=int)]
            yield from ids
        else:
            yield from ids.reshape(-1, 1)
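
The zero-padding branch is easy to see in isolation; a sketch with made-up ids:

import numpy as np

ids = np.array([3, 7, 9])  # hypothetical metadata ids
n_dim = 4
# Mirrors the n_dim > 1 branch above: each id becomes a length-4 vector
padded = np.c_[ids, np.zeros((len(ids), n_dim - 1), dtype=int)]
for vec in padded:
    print(vec)  # [3 0 0 0], [7 0 0 0], [9 0 0 0]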
Example #4
    def fit_predict(self, knn: sparse.coo_matrix) -> np.ndarray:
        """
		Given a sparse adjacency matrix, perform Louvain-Jaccard clustering

		Args:
			knn:	The sparse adjacency matrix

		Returns:
			labels:	The cluster labels

		Remarks:
			After clustering, the Louvain-Jaccard weighted undirected graph is available as
			the property 'graph' of type nx.Graph, and also in the form of a sparse adjacency
			matrix as the property 'lj_knn' of type scipy.sparse.coo_matrix
		"""
        if self.jaccard:
            edges = np.stack((knn.row, knn.col), axis=1)
            # Calculate Jaccard similarities
            js = []  # type: List[float]
            knncsr = knn.tocsr()
            for i, j in edges:
                r = knncsr.getrow(i)
                c = knncsr.getrow(j)
                shared = r.minimum(c).nnz
                total = r.maximum(c).nnz
                if total > 0:
                    js.append(shared / total)
                else:
                    js.append(0)
            weights = np.array(js) + 0.00001  # OpenOrd doesn't like 0 weights

            self.lj_knn = sparse.coo_matrix((weights, (knn.row, knn.col)),
                                            shape=knn.shape)
            self.graph = nx.Graph()
            # Add all nodes explicitly so isolated vertices keep their labels
            self.graph.add_nodes_from(range(knn.shape[0]))
            for i, edge in enumerate(edges):
                self.graph.add_edge(edge[0], edge[1], weight=weights[i])
        else:
            self.graph = nx.from_scipy_sparse_matrix(knn)
        partitions = community.best_partition(self.graph,
                                              resolution=self.resolution)
        return np.array([partitions[key] for key in range(knn.shape[0])])
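
A hypothetical end-to-end call; the class name LouvainJaccard and the way its jaccard and resolution attributes are set are assumptions, not shown in the snippet:

import numpy as np
from scipy import sparse

# Same invented 4-node symmetric adjacency matrix as above
knn = sparse.coo_matrix(
    (np.ones(6), ([0, 1, 1, 2, 2, 3], [1, 0, 2, 1, 3, 2])), shape=(4, 4))

clusterer = LouvainJaccard()  # hypothetical class exposing fit_predict
clusterer.jaccard = True
clusterer.resolution = 1.0
labels = clusterer.fit_predict(knn)  # one integer cluster label per node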
Example #5
from typing import Optional

from scipy.sparse import coo_matrix


def crop_roi_mask(roi_mask: coo_matrix) -> Optional[coo_matrix]:
    """Crop ROI mask into smallest rectangle that fits all nonzero elements

    Parameters
    ----------
    roi_mask : coo_matrix

    Returns
    -------
    Optional[coo_matrix]
        The cropped ROI mask, or None if the input matrix is empty

    """

    # roi_bounds (defined elsewhere) returns the bounding box of the nonzero
    # entries, or None for an empty mask
    bounds = roi_bounds(roi_mask)
    if bounds is None:
        return None

    # Convert coo to csr matrix so we can take advantage of indexing
    cropped_mask = roi_mask.tocsr()[bounds[0]:bounds[1], bounds[2]:bounds[3]]

    return cropped_mask.tocoo()
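
A usage sketch with a toy mask; roi_bounds is an external helper, assumed here to return exclusive (row_min, row_max, col_min, col_max) bounds:

import numpy as np
from scipy.sparse import coo_matrix

# Hypothetical 4x4 ROI mask with a small blob of nonzero pixels
mask = coo_matrix(np.array([[0, 0, 0, 0],
                            [0, 1, 1, 0],
                            [0, 1, 0, 0],
                            [0, 0, 0, 0]]))
cropped = crop_roi_mask(mask)
print(cropped.toarray())  # [[1 1]
                          #  [1 0]]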
Example #6
import logging
from typing import Tuple

import numpy as np
import pandas as pd
import skimage.measure as skmeas
from scipy.sparse import coo_matrix

# inside_cell and extract_borders_dip are helpers defined elsewhere in this module
logger = logging.getLogger(__name__)


def stage_data(
        spots: pd.DataFrame,
        coo: coo_matrix) -> Tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]:
    """
    Reads the spots and the label image that are passed in and determines which
    cell (if any) contains each spot within its boundaries. It also retrieves the
    coordinates of the cell boundaries, the cell centroids and the cell area.
    """
    logger.info(' Number of spots passed-in: %d' % spots.shape[0])
    logger.info(' Number of segmented cells: %d' % len(set(coo.data)))
    logger.info(
        ' Segmentation array implies that image has width: %dpx and height: %dpx'
        % (coo.shape[1], coo.shape[0]))
    mask_x = (spots.x >= 0) & (spots.x <= coo.shape[1])
    mask_y = (spots.y >= 0) & (spots.y <= coo.shape[0])
    spots = spots[mask_x & mask_y]

    # 1. Find which cell the spots lie within
    yx_coords = spots[['y', 'x']].values.T
    inc = inside_cell(coo.tocsr(), yx_coords)
    spots = spots.assign(label=inc)

    # 2. Get cell centroids and area
    props = skmeas.regionprops(coo.toarray().astype(np.int32))
    props_df = pd.DataFrame(data=[
        (d.label, d.area, d.centroid[1], d.centroid[0]) for d in props
    ],
                            columns=['label', 'area', 'x_cell', 'y_cell'])

    # 3. Get the cell boundaries
    cell_boundaries = extract_borders_dip(coo.toarray().astype(np.uint32), 0,
                                          0, [0])

    assert props_df.shape[0] == cell_boundaries.shape[0] == coo.data.max()
    assert set(spots.label[spots.label > 0]) <= set(props_df.label)

    cells = props_df.merge(cell_boundaries)
    cells = cells.sort_values(by=['label', 'x_cell', 'y_cell'])
    assert cells.shape[0] == cell_boundaries.shape[0] == props_df.shape[0]

    # join spots and cells on the cell label so you can get the x,y coords of the cell for any given spot
    spots = spots.merge(cells, how='left', on=['label'])

    _cells = cells[['label', 'area', 'x_cell', 'y_cell']].rename(columns={
        'x_cell': 'x',
        'y_cell': 'y'
    })
    _cell_boundaries = cells[['label', 'coords']]
    _spots = spots[['x', 'y', 'label', 'Gene', 'x_cell',
                    'y_cell']].rename(columns={
                        'Gene': 'target',
                        'x': 'x_global',
                        'y': 'y_global'
                    })

    return _cells, _cell_boundaries, _spots
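
A hypothetical invocation; the label image and spot coordinates below are invented, and the helpers inside_cell and extract_borders_dip must already be importable:

import numpy as np
import pandas as pd
from scipy.sparse import coo_matrix

# 3x3 label image with two cells (labels 1 and 2), invented for illustration
label_image = np.array([[0, 1, 1],
                        [0, 1, 1],
                        [2, 2, 0]])
spots = pd.DataFrame({'x': [1.2, 0.4],
                      'y': [0.5, 2.1],
                      'Gene': ['Pvalb', 'Sst']})
cells, cell_boundaries, spots_out = stage_data(spots, coo_matrix(label_image))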
Example #7
import numpy as np
from scipy.sparse import coo_matrix
from scipy.stats import normaltest

# get_cov and get_corr_coef are helpers defined elsewhere in this module


def get_entity_corr_coef(interactions: coo_matrix,
                         entity_id: int,
                         entity_type: str,
                         embeddings: dict,
                         ignore_sparse_zeros=True,
                         use_zero_mean=False,
                         corr_type='pearson',
                         neg_sampling=False,
                         check_normal_dist=True):
    """
    Assumes a rating matrix with rows for users and columns for items.
    """
    p = embeddings['user'].shape[1]
    cov_for_p_variables = []

    if entity_type == 'user':
        embed = embeddings['user'][entity_id]
        # embedding used for covariance computation
        cov_embed = embeddings['item']
        # ratings used for covariance computation
        ratings = np.squeeze(
            np.asarray(interactions.tocsr()[entity_id, :].todense()))
    elif entity_type == 'item':
        embed = embeddings['item'][entity_id]
        # embedding used for covariance computation
        cov_embed = embeddings['user']
        # ratings used for covariance computation
        ratings = np.squeeze(
            np.asarray(interactions.tocsr()[:, entity_id].todense()))
    else:
        raise ValueError(f"Unknown entity_type: {entity_type!r}")

    if ignore_sparse_zeros:
        idx = np.where(ratings != 0)[0]
        ratings = ratings[idx]
    else:
        # Keep every index so cov_embed[idx] below stays well defined
        idx = np.arange(len(ratings))

    # TODO: Use `sample_items` method
    # Use this for BPR
    if neg_sampling:
        if entity_type == 'user':
            n_sample = interactions.shape[1]
        else:
            n_sample = interactions.shape[0]
        neg_idx = np.random.randint(n_sample, size=len(idx))
        # neg_idx = np.random.choice(np.setdiff1d(np.arange(interactions.n_items),
        #                                         pos_idx), size=len(pos_idx),
        #                            replace=False)
        neg_ratings = [0] * len(ratings)
        idx = np.concatenate([idx, neg_idx])
        ratings = np.concatenate([ratings, neg_ratings])

    cov_embed = cov_embed[idx]

    for k in range(p):
        cov_embed_latent_variables_at_k = cov_embed[:, k]
        cov_mat_for_k = get_cov(ratings,
                                cov_embed_latent_variables_at_k,
                                use_zero_mean=use_zero_mean)
        cov_for_k = cov_mat_for_k[0, 1]
        cov_for_p_variables.append(cov_for_k)

    # TODO: Change from printing back to logging
    if check_normal_dist:
        alpha = 1e-3
        p_embed = normaltest(embed)[1]
        p_cov_for_p_variables = normaltest(cov_for_p_variables)[1]
        if p_embed < alpha:
            print(
                f"{entity_type}-{entity_id}: Entity embeddings are unlikely to be normally distributed."
            )
        if p_cov_for_p_variables < alpha:
            print(
                f"{entity_type}-{entity_id}: Covariances are unlikely to be normally distributed."
            )

    cov_for_p_variables = np.array(cov_for_p_variables)
    corr_coef = get_corr_coef(embed, cov_for_p_variables, corr_type=corr_type)

    return corr_coef
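
A hypothetical call with random data; get_cov and get_corr_coef are helpers not shown in the snippet, and the matrix sizes here are arbitrary:

import numpy as np
from scipy.sparse import coo_matrix

rng = np.random.default_rng(0)
# 10 users x 6 items rating matrix with random 0/1 entries (invented)
interactions = coo_matrix(rng.integers(0, 2, size=(10, 6)).astype(float))
embeddings = {'user': rng.normal(size=(10, 4)),
              'item': rng.normal(size=(6, 4))}
coef = get_entity_corr_coef(interactions, entity_id=0, entity_type='user',
                            embeddings=embeddings,
                            check_normal_dist=False)  # normaltest needs >= 8 samples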