Ejemplo n.º 1
0
def weight_matrix_by_user_feature_counts(dataMatrix: sps.csr_matrix,
                                         UCM: sps.csr_matrix,
                                         strategy="linear"):
    """
    Weight the stored entries of dataMatrix by the popularity of the feature of the user (row) they
    belong to.

    Rows of dataMatrix are assumed to be users. The popularity of a feature is the number of users
    with a positive value for it in UCM. Each user is mapped to the popularity of one of its stored
    features (if a user has several, the one stored last in COO order wins); users without any
    stored feature get a count of 1.

    :param dataMatrix: sparse matrix whose rows are users
    :param UCM: user content matrix with as many rows as dataMatrix
    :param strategy: strategy to use in order to put weights (forwarded to _weight_matrix)
    :return: whatever _weight_matrix returns for dataMatrix and the per-entry feature counts
    :raises ValueError: if UCM and dataMatrix do not have the same number of rows
    """
    if UCM.shape[0] != dataMatrix.shape[0]:
        raise ValueError("UCM does not contain all users in dataMatrix")

    # ravel() always yields a 1-D array; squeeze() would collapse to 0-d when UCM has a single
    # column and make the fancy indexing below fail
    UCM_popularity = np.asarray((UCM > 0).sum(axis=0)).ravel()

    UCM_coo = UCM.tocoo()
    user_feature_list = np.full(shape=dataMatrix.shape[0], fill_value=1)
    user_feature_list[UCM_coo.row] = UCM_popularity[UCM_coo.col]

    # One weight per stored entry of dataMatrix, looked up through the entry's row (user)
    users = dataMatrix.tocoo().row
    feature_list_for_user = np.array(user_feature_list[users],
                                     dtype=np.float32)

    return _weight_matrix(dataMatrix, feature_list_for_user, strategy)
Ejemplo n.º 2
0
def sparse_adj_to_edge(adj_matrix: sp.csr_matrix):
    """Split a Scipy sparse matrix into an (edge_index, edge_weight) pair.

    `edge_index` stacks the row indices and the column indices of the stored
    entries into a `[2, num_edges]` array; `edge_weight` is a copy of the
    stored values.
    """
    coo = adj_matrix.tocoo(copy=False)
    rows, cols = coo.row, coo.col
    return np.stack((rows, cols)), np.array(coo.data, copy=True)
Ejemplo n.º 3
0
 def _save(
     filepath: Union[str, Path],
     matrix: csr_matrix,
     cell_ids: pd.DataFrame,
     features: pd.DataFrame,
     save_pickle: bool = False,
     save_rds: bool = False,
     save_h5ad: bool = False,
     save_loom: bool = False,
     meta: Optional[pd.DataFrame] = None,
 ):
     """Write the count matrix in each of the requested output formats.

     The pickle store is written to ``filepath``; the RDS conversion is run on
     ``filepath``'s parent directory, and ``rna.h5ad`` / ``rna.loom`` are
     written into that same directory. When ``meta`` is given it is used as
     the AnnData observation table instead of ``cell_ids`` (its index name is
     set to ``"index"`` in place).
     """
     filepath = Path(filepath)
     out_dir = filepath.parent

     if save_pickle:
         build_counts_store(matrix.tocoo(),
                            cell_ids,
                            features,
                            save_path=filepath)
     if save_rds:
         Convert.pickle_to_rds_dir(out_dir)

     # Nothing below is needed unless an AnnData-backed format was requested
     if not (save_h5ad or save_loom):
         return

     if meta is not None:
         obs = meta
     else:
         obs = pd.DataFrame(cell_ids).set_index(0)
     obs.index.name = "index"

     var = features.rename(columns={"ensgs": "gene_ids", "genes": "index"})
     var = var.set_index("index")

     adata = AnnData(matrix.tocsr(), obs, var)
     if save_h5ad:
         adata.write_h5ad(out_dir / "rna.h5ad")
     if save_loom:
         adata.write_loom(out_dir / "rna.loom")
Ejemplo n.º 4
0
def convert_URM_to_FM(URM: csr_matrix):
    """
    Convert positive interactions of an URM in the way that is needed for the FM model.
    - Each stored interaction of the URM becomes one row (sample) of the output
    - In each row there are 2 non-zero entries: 1 for the user (column `user`) and 1 for the item
      (column `n_users + item`)
    - Only positive samples are encoded here

    Note: this method works only for implicit dataset (the stored values of the URM are ignored,
    every encoded entry is 1)

    :param URM: URM to be preprocessed
    :return: csr_matrix of shape (n_interactions, n_users + n_items) containing the URM preprocessed
             in the described way
    """
    n_users, n_items = URM.shape
    URM_coo = URM.tocoo()
    n_sample = URM_coo.row.size

    # Two entries per sample, laid out as [user_0, item_0, user_1, item_1, ...]
    rows = np.repeat(np.arange(n_sample), 2)
    cols = np.column_stack((URM_coo.row, URM_coo.col + n_users)).ravel()
    data = np.ones(2 * n_sample, dtype=np.float32)

    FM_matrix = sps.coo_matrix((data, (rows, cols)),
                               shape=(n_sample, n_users + n_items))
    return FM_matrix.tocsr()
Ejemplo n.º 5
0
def sparse2dict(matrix: csr_matrix):
    """Convert a csr_matrix into a plain dictionary (COO layout).

    The result holds the stored values (`data`) and their row / column indices
    (`row`, `col`) as Python lists, plus the matrix `shape`.
    """
    as_coo = matrix.tocoo()
    return {
        "data": as_coo.data.tolist(),
        "row": as_coo.row.tolist(),
        "col": as_coo.col.tolist(),
        "shape": as_coo.shape,
    }
Ejemplo n.º 6
0
def calculate_normalized_affinity(
    W: csr_matrix
) -> Tuple[csr_matrix, np.ndarray, np.ndarray]:
    """Symmetrically normalize a sparse affinity matrix by its row sums.

    Every stored entry ``W[i, j]`` is divided by ``sqrt(d[i]) * sqrt(d[j])``,
    where ``d`` is the vector of row sums of ``W``. ``W`` itself is not
    modified.

    Args:
        W: Sparse affinity matrix. The column indices are used to index the
            row sums, so it is expected to be square.

    Returns:
        A tuple ``(W_norm, diag, diag_half)``: the normalized matrix in CSR
        format, the row sums, and their element-wise square roots.
    """
    # np.asarray(...).ravel() works whether .sum() returns np.matrix or ndarray
    diag = np.asarray(W.sum(axis=1)).ravel()
    diag_half = np.sqrt(diag)

    W_norm = W.tocoo(copy=True)
    if not np.issubdtype(W_norm.dtype, np.inexact):
        # In-place true division cannot write floats into an int/bool buffer
        W_norm = W_norm.astype(np.float64)
    W_norm.data /= diag_half[W_norm.row]
    W_norm.data /= diag_half[W_norm.col]
    W_norm = W_norm.tocsr()

    return W_norm, diag, diag_half
Ejemplo n.º 7
0
 def fit(self, graph: sp.csr_matrix):
     """
     Fitting a NodeSketch model.

     Stores the graph, builds a binarized COO copy of it (``self._sla``, kept
     unchanged in ``self._sla_original``), then runs ``_do_single_sketch()``
     once followed by ``self.iterations - 1`` rounds of ``_augment_sla()`` and
     ``_do_single_sketch()``.

     :param graph: sparse adjacency matrix of the graph; its number of rows is
         taken as the number of nodes
     """
     self._graph = graph
     self._num_nodes = graph.shape[0]
     self._hash_values = self._generate_hash_values()
     self._sla = graph.tocoo()
     # Binarize: every stored entry gets weight 1 (vectorized, no Python loop)
     self._sla.data = np.ones(len(self._sla.data), dtype=int)
     self._sla_original = self._sla.copy()
     self._do_single_sketch()
     for _ in range(self.iterations - 1):
         self._augment_sla()
         self._do_single_sketch()
    def _get_log_distances(self,
                           y_distances: csr_matrix,
                           base=0.5) -> csr_matrix:
        """
        Returns the logarithmic version (base default: 0.5) of the distance matrix returned by TagEmebddingClassifier.
        This must be used in order to compute valid precision@k scores
        since small Distances should be ranked better than great ones.
        :param y_distances: sparse distance matrix (multilabel matrix with distances instead of binary indicators)
        :param base: base of the log function (must be smaller then one)
        :return: sparse matrix with the log values
        """

        log_y_distances = y_distances.tocoo()
        log_y_distances.data = np.log(log_y_distances.data) / np.log(base)
        return log_y_distances.tocsr()
Ejemplo n.º 9
0
    def csr2tensor(self, matrix: sp.csr_matrix):
        r"""Convert csr_matrix to tensor.

        Args:
            matrix (scipy.csr_matrix): Sparse matrix to be converted.

        Returns:
            torch.Tensor: Sparse COO tensor of dtype ``float32`` with the same
            shape and stored entries as ``matrix``, moved to ``self.device``.
        """
        coo = matrix.tocoo()
        # Row and column indices stacked into the (2, nnz) layout torch expects
        indices = torch.as_tensor(np.vstack((coo.row, coo.col)),
                                  dtype=torch.long)
        values = torch.as_tensor(coo.data.astype(np.float32))
        # torch.sparse_coo_tensor replaces the deprecated
        # torch.sparse.FloatTensor(indices, values, shape) constructor
        x = torch.sparse_coo_tensor(indices,
                                    values,
                                    coo.shape,
                                    dtype=torch.float32).to(self.device)
        return x
Ejemplo n.º 10
0
def dropcols_coo(csr_mat: sp.csr_matrix, idx_to_drop):
    """
    Drop columns of sparse matrix
    http://stackoverflow.com/questions/23966923/delete-columns-of-matrix-of-csr-format-in-python

    The input matrix is not modified. Duplicated indices in `idx_to_drop` are
    counted once; the indices are expected to be valid column indices of
    `csr_mat`.

    Parameters
    ----------
    csr_mat: sparse matrix to drop columns from.
    idx_to_drop: column index (or array-like of column indices) to remove.

    Returns
    ----------
    A new csr_matrix with the selected columns removed and the remaining
    columns shifted left to fill the gaps.
    """
    idx_to_drop = np.unique(idx_to_drop)
    coo_mat = csr_mat.tocoo()
    keep = ~np.isin(coo_mat.col, idx_to_drop)

    kept_cols = coo_mat.col[keep]
    # each surviving column moves left by the number of dropped columns
    # that precede it
    kept_cols = kept_cols - idx_to_drop.searchsorted(kept_cols)

    new_shape = (coo_mat.shape[0], coo_mat.shape[1] - len(idx_to_drop))
    # build a fresh matrix instead of overwriting the private `_shape`
    trimmed = sp.coo_matrix((coo_mat.data[keep], (coo_mat.row[keep], kept_cols)),
                            shape=new_shape)
    return trimmed.tocsr()
Ejemplo n.º 11
0
def weight_matrix_by_user_profile(dataMatrix: sps.csr_matrix,
                                  URM,
                                  strategy="linear"):
    """
    Weight the stored entries of dataMatrix by the profile length of the user (row) they belong to.

    The profile length of a user is the number of positive entries in the corresponding row of URM.

    :param dataMatrix: sparse matrix whose rows are users
    :param URM: user rating matrix with as many rows as dataMatrix
    :param strategy: strategy to use in order to put weights (forwarded to _weight_matrix)
    :return: whatever _weight_matrix returns for dataMatrix and the per-entry profile lengths
    :raises ValueError: if URM and dataMatrix do not have the same number of rows
    """
    if URM.shape[0] != dataMatrix.shape[0]:
        raise ValueError("URM does not contain all users in dataMatrix")

    # ravel() always yields a 1-D array; squeeze() would collapse to 0-d when there is a single
    # user and make the fancy indexing below fail
    user_activity = np.asarray((URM > 0).sum(axis=1)).ravel()

    # One weight per stored entry of dataMatrix, looked up through the entry's row (user)
    users = dataMatrix.tocoo().row
    user_profile_for_user = np.array(user_activity[users], dtype=np.float32)

    return _weight_matrix(dataMatrix, user_profile_for_user, strategy)
Ejemplo n.º 12
0
def weight_matrix_by_item_feature_value(dataMatrix: sps.csr_matrix,
                                        ICM: sps.csr_matrix,
                                        strategy="linear"):
    """
    Weight the stored entries of dataMatrix by the ICM feature value of the item (row) they belong
    to. Rows of dataMatrix are assumed to be items.

    Entries whose item has a feature value of 0 use the mean of the per-entry feature values
    instead.

    :param dataMatrix: sparse matrix whose rows are items
    :param ICM: ICM with only one column and as many rows as dataMatrix
    :param strategy: strategy to use in order to put weights (forwarded to _weight_matrix)
    :return: whatever _weight_matrix returns for dataMatrix and the per-entry feature values
    """
    n_items_in_data = dataMatrix.shape[0]
    if n_items_in_data != ICM.shape[0]:
        raise ValueError("ICM does not contain all items in dataMatrix")

    # One feature value per stored entry of dataMatrix, looked up through the entry's row (item)
    rows_of_entries = dataMatrix.tocoo().row
    per_entry_value = ICM[rows_of_entries].toarray().squeeze()

    # The mean is taken before the zero entries are overwritten
    zero_mask = per_entry_value == 0
    per_entry_value[zero_mask] = per_entry_value.mean()

    return _weight_matrix(dataMatrix, per_entry_value, strategy)
Ejemplo n.º 13
0
def weight_matrix_by_item_popularity(dataMatrix: sps.csr_matrix,
                                     R_iu: sps.csr_matrix,
                                     strategy="linear"):
    """
    Weight the stored entries of dataMatrix by the popularity of the item (row) they belong to.
    Assumes that dataMatrix has items as row.

    The popularity of an item is the number of positive entries in the corresponding row of R_iu.

    :param dataMatrix: csr matrix with items as rows
    :param R_iu: Rating item x user matrix with as many rows as dataMatrix
    :param strategy: strategy to use in order to put weights (forwarded to _weight_matrix)
    :return: whatever _weight_matrix returns for dataMatrix and the per-entry item popularity
    :raises ValueError: if R_iu and dataMatrix do not have the same number of rows
    """
    if R_iu.shape[0] != dataMatrix.shape[0]:
        raise ValueError("R_iu does not contain all items in dataMatrix")

    # ravel() always yields a 1-D array; squeeze() would collapse to 0-d when there is a single
    # item and make the fancy indexing below fail
    item_popularity = np.asarray((R_iu > 0).sum(axis=1)).ravel()

    # One weight per stored entry of dataMatrix, looked up through the entry's row (item)
    items = dataMatrix.tocoo().row
    item_popularity_for_item = np.array(item_popularity[items],
                                        dtype=np.float32)

    return _weight_matrix(dataMatrix, item_popularity_for_item, strategy)
Ejemplo n.º 14
0
    def _reweight_values(self,
                         doc_term_matrix: sp.csr_matrix) -> sp.csr_matrix:
        """
        Re-weight values in a doc-term matrix according to parameters specified
        in :class:`Vectorizer` initialization: binary or tf-idf weighting,
        sublinear term-frequency, document-normalized weights.

        The steps run in this order: local term-frequency re-weighting
        (``self.tf_type``), global idf weighting (``self.idf_type``), document
        length normalization (``self.dl_type``, skipped when ``tf_type`` is
        "bm25"), and finally row normalization (``self.norm``).

        Note:
            Several branches write into ``doc_term_matrix.data`` in place or
            rebind it on the passed-in object, so the caller's matrix may be
            modified.

        Args:
            doc_term_matrix: Sparse matrix of term frequencies; rows are
                treated as documents (document lengths are looked up by row).

        Returns:
            Reweighted doc-term matrix.

        Raises:
            ValueError: If ``self.tf_type`` is not one of "binary", "bm25",
                "sqrt", "log", or "linear".
        """
        # re-weight the local components (term freqs)
        if self.tf_type == "binary":
            # every stored value becomes 1, in place
            doc_term_matrix.data.fill(1)
        elif self.tf_type == "bm25":
            if not self.dl_type:
                # bm25 saturation without length normalization:
                # tf * (k1 + 1) / (k1 + tf)
                doc_term_matrix.data = (doc_term_matrix.data *
                                        (BM25_K1 + 1.0) /
                                        (BM25_K1 + doc_term_matrix.data))
            else:
                # bm25 with length normalization: k1 is scaled per document by
                # (1 - b) + b * (doc length / average doc length)
                dls = get_doc_lengths(doc_term_matrix, type_=self.dl_type)
                length_norm = (1 - BM25_B) + (BM25_B *
                                              (dls / self._avg_doc_length))
                # COO format exposes the row index of every stored value, so
                # each value can be paired with its own document's norm
                doc_term_matrix = doc_term_matrix.tocoo(copy=False)
                doc_term_matrix.data = (
                    doc_term_matrix.data * (BM25_K1 + 1.0) /
                    (doc_term_matrix.data +
                     (BM25_K1 * length_norm[doc_term_matrix.row])))
                doc_term_matrix = doc_term_matrix.tocsr(copy=False)
        elif self.tf_type == "sqrt":
            # sqrt(tf), written back into the same data buffer; the ufunc's
            # return value is discarded
            _ = np.sqrt(doc_term_matrix.data,
                        doc_term_matrix.data,
                        casting="unsafe")
        elif self.tf_type == "log":
            # log(tf) written back into the same data buffer, then + 1
            _ = np.log(doc_term_matrix.data,
                       doc_term_matrix.data,
                       casting="unsafe")
            doc_term_matrix.data += 1.0
        elif self.tf_type == "linear":
            pass  # tfs are already linear
        else:
            # this should never raise, i'm just being a worrywart
            raise ValueError(
                errors.value_invalid_msg(
                    "tf_type", self.tf_type,
                    {"binary", "bm25", "sqrt", "log", "linear"}))

        # apply the global component (idfs), column-wise
        if self.idf_type:
            # right-multiplication by self._idf_diag (presumably a diagonal
            # matrix of per-term idf weights -- defined outside this method)
            doc_term_matrix = doc_term_matrix * self._idf_diag

        # apply normalizations, row-wise
        # unless we've already handled it for bm25-style tf
        if self.dl_type and self.tf_type != "bm25":
            n_docs, _ = doc_term_matrix.shape
            dls = get_doc_lengths(doc_term_matrix, type_=self.dl_type)
            # left-multiplying by diag(1 / dls) divides each row by its
            # document length
            dl_diag = sp.spdiags(1.0 / dls,
                                 diags=0,
                                 m=n_docs,
                                 n=n_docs,
                                 format="csr")
            doc_term_matrix = dl_diag * doc_term_matrix
        if self.norm is not None:
            doc_term_matrix = normalize_mat(doc_term_matrix,
                                            norm=self.norm,
                                            axis=1,
                                            copy=False)

        return doc_term_matrix
Ejemplo n.º 15
0
def augment_adj(adj_matrix: sp.csr_matrix, nodes: Union[list, int, np.ndarray],
                edge_weight: np.ndarray = None, *,
                nbrs_to_link: Union[list, np.ndarray, None] = None,
                common_nbrs: Union[list, np.ndarray, None] = None,
                fill_weight: float = 1.0) -> sp.csr_matrix:
    """Augment a specified adjacency matrix by linking nodes to
        each element in `nbrs_to_link`.

    Examples
    ----------
    # add 2 nodes adjacent to [2,3] and 3, respectively.
    >>> augmented_adj = augment_adj(adj_matrix, nodes=2,
                                nbrs_to_link=[[2,3],3],
                                fill_weight=1.0)


    # add 2 nodes all adjacent to [1,2,3].
    >>> augmented_adj = augment_adj(adj_matrix, nodes=2,
                                common_nbrs=[1,2,3],
                                fill_weight=1.0)

     # add 3 edges, [3,1], [4,2], [5,3].
    >>> augmented_adj = augment_adj(adj_matrix, nodes=[3,4,5],
                                common_nbrs=[1,2,3],
                                fill_weight=1.0)
    Parameters
    ----------
    adj_matrix: shape [num_nodes, num_nodes].
        A Scipy sparse adjacency matrix.
    nodes: the nodes that will be linked to the graph.
        list or np.array: the nodes connected to `nbrs_to_link`
        int: new added nodes connected to `nbrs_to_link`,
            node ids [num_nodes, ..., num_nodes+nodes-1].
    edge_weight: currently unused; the existing edge weights are always
        taken from `adj_matrix.data`. Kept for interface compatibility.
    nbrs_to_link: a list of N elements,
        where N is the length of 'nodes'.
        the specified neighbor(s) for each added node.
        if `None`, it will be set to `[0, ..., N-1]`.
    common_nbrs: shape [None,].
        specified common neighbors for each added node.
    fill_weight: edge weight for the augmented edges.

    Returns
    ----------
    A square Scipy CSR adjacency matrix containing the original and the
    augmented edges, with explicit zeros removed. It is never smaller than
    `adj_matrix`.

    NOTE:
    ----------
    Both `nbrs_to_link` and `common_nbrs` should not be specified together.


    See Also
    ----------
    graphgallery.functional.augment_edge

    """

    adj_matrix = adj_matrix.tocoo(copy=False)
    edge_index = adj_matrix.row, adj_matrix.col

    augmented_edge_index, augmented_edge_weight = augment_edge(edge_index, nodes,
                                                               edge_weight=adj_matrix.data,
                                                               nbrs_to_link=nbrs_to_link,
                                                               common_nbrs=common_nbrs,
                                                               fill_weight=fill_weight)

    # Sizing from the largest edge index alone would drop trailing isolated
    # nodes (they appear in no edge), so never go below the input size.
    N = max(max(adj_matrix.shape), augmented_edge_index.max() + 1)
    augmented_adj = sp.csr_matrix((augmented_edge_weight, augmented_edge_index),
                                  shape=(N, N))

    augmented_adj.eliminate_zeros()
    return augmented_adj