import logging
from math import sqrt

import numpy as np
from scipy.sparse import csr_matrix
from sklearn.metrics import mean_squared_error


def RMSE(prediction: np.ndarray, ground_truth: csr_matrix) -> float:
    """
    Calculate the Root Mean Square Error over the observed (nonzero) entries
    Params:
        prediction: predicted matrix
        ground_truth: real matrix
    """
    logger = logging.getLogger(__name__)
    logger.debug('RMSE calculating...')
    prediction = prediction[ground_truth.nonzero()].flatten()
    logger.debug("Predict: %s   length: %d", prediction, len(prediction))
    ground_truth = ground_truth[ground_truth.nonzero()].A.flatten()
    logger.debug("Test: %s   length: %d", ground_truth, len(ground_truth))
    ret = sqrt(mean_squared_error(prediction, ground_truth))
    logger.info('RMSE: %s', ret)
    return ret
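
A minimal usage sketch (not from the original source), reusing the imports above: the metric only sees the observed entries of a toy rating matrix, so predictions at unrated cells are ignored.

ground_truth = csr_matrix(np.array([[5., 0., 3.],
                                    [0., 1., 0.]]))
prediction = np.array([[4.5, 2.0, 3.5],
                       [1.0, 1.5, 0.5]])
# Only the observed ratings (5, 3, 1) enter the error, each off by 0.5: RMSE = 0.5
print(RMSE(prediction, ground_truth))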
Example 2
import numpy as np
import torch
from scipy import sparse


def make_sparse_tensor(x: sparse.csr_matrix):
    # x.data is aligned with x.nonzero() for a canonical CSR (no stored zeros)
    rows, cols = x.nonzero()
    i = torch.LongTensor(np.vstack([rows, cols]))
    v = torch.FloatTensor(x.data)
    # torch.sparse.FloatTensor is deprecated; sparse_coo_tensor is the current API
    res = torch.sparse_coo_tensor(i, v, torch.Size(x.shape))
    return res
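
Reusing the imports above, a quick round-trip check (a sketch, assuming a torch install is available): densifying both representations must give the same array.

x = sparse.csr_matrix(np.array([[0., 2.],
                                [3., 0.]]))
t = make_sparse_tensor(x)
assert np.allclose(t.to_dense().numpy(), x.toarray())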
Example 3
def _csr_swap_zero_nonzero_in_row(row: sparse.csr_matrix,
                                  rng: np.random.Generator,
                                  p: float = 0.1) -> sparse.csr_matrix:
    """
    Swap 0 and nonzero values.
    Since most values are 0, the number of swaps is based on the nonzero count:
    exactly round(p * num_nonzero) entries are relocated, never fewer or more.
    """
    assert row.shape[0] == 1
    nonzero_idx = row.nonzero()[1]
    arr = row.toarray().squeeze()
    zero_idx = np.where(arr == 0)[0]
    # Because # nonzero << # zero, we use # nonzero to determine number of swaps
    n = int(round(len(nonzero_idx) * p))
    # Choose indices to swap
    zero_idx_swap = rng.choice(zero_idx, n, replace=False)
    nonzero_idx_swap = rng.choice(nonzero_idx, n, replace=False)
    # Transfer nonzero values to selected "zero" indices
    arr[zero_idx_swap] = arr[nonzero_idx_swap]
    # Zero out the original values at the nonzero indices
    arr[nonzero_idx_swap] = 0
    retval = sparse.csr_matrix(arr)
    assert retval.shape == row.shape
    assert len(retval.nonzero()[1]) == len(nonzero_idx)
    return retval
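
A usage sketch under the function's own assumptions (a single CSR row and a seeded numpy Generator); none of these values come from the original repository.

import numpy as np
from scipy import sparse

rng = np.random.default_rng(0)
row = sparse.csr_matrix(np.array([[0., 1., 0., 2., 0., 3., 0., 0., 0., 4.]]))
# p=0.5 relocates round(4 * 0.5) = 2 of the 4 nonzero values into zero cells
print(_csr_swap_zero_nonzero_in_row(row, rng, p=0.5).toarray())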
Example 4
import math
from typing import Dict

from scipy.sparse import csr_matrix, lil_matrix


def calculate_sparse_tf_idf_matrix(
        term_frequencies_sparse_matrix: csr_matrix,
        documents_frequencies: Dict[int, int]) -> csr_matrix:
    """
    Computes the TF-IDF matrix from a TF matrix and a DF vector
    :param term_frequencies_sparse_matrix: sparse TF matrix of shape
    (number of documents, vocabulary size) holding term frequencies per document
    :param documents_frequencies: dictionary (vector) of per-term document
    frequencies, keyed by token id
    :return: sparse TF-IDF matrix
    """
    # Get the total number of documents and the vocabulary size
    num_documents, vocab_size = term_frequencies_sparse_matrix.shape
    # Build in LIL format: element-wise assignment into a CSR matrix is slow
    # and raises a SparseEfficiencyWarning
    tf_idf_sparse_matrix = lil_matrix((num_documents, vocab_size), dtype=float)
    non_empty_row_ids, non_empty_col_ids = term_frequencies_sparse_matrix.nonzero()
    # Iterate over the nonzero elements of the sparse TF matrix
    for doc_id, token_id in zip(non_empty_row_ids, non_empty_col_ids):
        # Take the term's frequency in the document from the TF matrix
        tf = term_frequencies_sparse_matrix[doc_id, token_id]
        # Compute the term's inverse document frequency (IDF)
        idf = num_documents / documents_frequencies[token_id]
        # TF-IDF = TF * log(IDF)
        tf_idf_sparse_matrix[doc_id, token_id] = tf * math.log2(idf)
    return tf_idf_sparse_matrix.tocsr()
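
A toy invocation (a sketch with hypothetical counts): two documents over a three-term vocabulary, with document frequencies keyed by column index.

import numpy as np

tf = csr_matrix(np.array([[2, 1, 0],
                          [0, 1, 3]]))
df = {0: 1, 1: 2, 2: 1}
# Term 1 occurs in both documents, so its IDF is 1 and its TF-IDF weight is 0
print(calculate_sparse_tf_idf_matrix(tf, df).toarray())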
Example 5
import numpy as np
import scipy.sparse as sps


def augment_with_user_similarity_best_scores(urm: sps.csr_matrix,
                                             similarity,
                                             topK,
                                             value=0.5,
                                             remove_seen=True,
                                             users=None):
    # Create a copy of the urm (np.float was removed in NumPy >= 1.20)
    augmented_urm = urm.tolil(copy=True).astype(np.float64)

    # Compute the score matrix
    score_matrix = similarity.dot(urm).astype(np.float64)

    # Remove items that have already been seen
    if remove_seen:
        indices_seen = urm.nonzero()
        score_matrix[indices_seen] = float("-inf")

    # Keep only the rows of the users in the given list
    if users is not None:
        score_matrix = score_matrix[users]

    # Find the topK generated interactions
    top_indices = score_matrix.data.argpartition(-topK)[-topK:]
    max_k = score_matrix.data[top_indices].min()
    x = sps.find(score_matrix)
    user_item_data = zip(x[0], x[1], x[2])
    user_item = [(user, item) for user, item, data in user_item_data
                 if data >= max_k]

    # Insert the best items in the urm matrix; if the rows were filtered above,
    # map the row positions back to the original user ids
    for user, item in user_item:
        if users is not None:
            user = users[user]
        augmented_urm[user, item] += value

    # Return the augmented urm
    return augmented_urm.tocsr()
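
A small sketch with a hand-made two-user URM and user-user similarity (hypothetical data, not from the original repository); each user gets a 0.5 boost on the unseen items of the other.

urm = sps.csr_matrix(np.array([[1., 0., 1.],
                               [0., 1., 0.]]))
similarity = sps.csr_matrix(np.array([[0., 1.],
                                      [1., 0.]]))  # swap the two users' profiles
print(augment_with_user_similarity_best_scores(urm, similarity, topK=2).toarray())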
Example 6
def _get_igraph_from_adjacency(adj: csr_matrix, simplify=True):
    """Get an undirected igraph graph from adjacency matrix.
    Better than Graph.Adjacency for sparse matrices.

    Parameters
    ----------
    adj
        sparse, weighted, symmetrical adjacency matrix.
    """
    sources, targets = adj.nonzero()
    weights = adj[sources, targets]
    if isinstance(weights, np.matrix):
        weights = weights.A1
    if isinstance(weights, csr_matrix):
        # this is the case when len(sources) == len(targets) == 0, see #236
        weights = weights.toarray()

    g = ig.Graph(directed=not simplify)
    g.add_vertices(adj.shape[0])  # this adds adj.shape[0] vertices
    g.add_edges(list(zip(sources, targets)))

    g.es["weight"] = weights

    if g.vcount() != adj.shape[0]:
        logging.warning(
            f"The constructed graph has only {g.vcount()} nodes. "
            "Your adjacency matrix contained redundant nodes.")  # type: ignore

    if simplify:
        # since we start from a symmetrical matrix, and the graph is undirected,
        # it is fine to take either of the two edges when simplifying.
        g.simplify(combine_edges="first")

    return g
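
A sketch on a two-node symmetric adjacency matrix, assuming python-igraph is installed and imported as ig, as the snippet expects:

import numpy as np
import igraph as ig
from scipy.sparse import csr_matrix

adj = csr_matrix(np.array([[0., 1.],
                           [1., 0.]]))
g = _get_igraph_from_adjacency(adj)
# The two mirrored edges collapse into one when the graph is simplified
print(g.vcount(), g.ecount())  # 2 1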
Example 7
def construct_graph(W: csr_matrix,
                    directed: bool = False,
                    adjust_weights: bool = True) -> "igraph":

    assert issparse(W)

    s, t = W.nonzero()
    w = W.data

    if not directed:
        idx = s < t
        s = s[idx]
        t = t[idx]
        w = w[idx]

    if adjust_weights:
        w = ((w / np.median(w)) * 100.0 +
             0.5).astype(int) / 100.0  # round to 2 decimal points
        idx = w > 0.0
        if idx.sum() < w.size:
            s = s[idx]
            t = t[idx]
            w = w[idx]

    G = igraph.Graph(directed=directed)
    G.add_vertices(W.shape[0])
    G.add_edges(zip(s, t))
    G.es["weight"] = w

    return G
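
A matching sketch for construct_graph, again assuming the igraph package and issparse from scipy.sparse are in scope:

import numpy as np
import igraph
from scipy.sparse import csr_matrix, issparse

W = csr_matrix(np.array([[0., 2.],
                         [2., 0.]]))
G = construct_graph(W)
# One undirected edge; adjust_weights divides by the median weight before rounding
print(G.ecount(), G.es["weight"])  # 1 [1.0]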
Example 8
 def _X_to_df(self, X: sps.csr_matrix, user_ids: List[Any]) -> pd.DataFrame:
     if self.item_ids is None:
         raise RuntimeError(
             "Setting item_ids is required to use this method.")
     X.sort_indices()
     row, col = X.nonzero()
     data = X.data
     return pd.DataFrame(
         dict(
             user_id=[user_ids[r] for r in row],
             item_id=[self.item_ids[c] for c in col],
             rating=data,
         ))
Example 9
def _compute_sparse_gradient(hat_vect_matrix: sparse.csr_matrix,
                             X: sparse.csr_matrix, z: np.ndarray,
                             y: np.ndarray) -> np.ndarray:
    # grad_z = (hat_vect_matrix.multiply(z @ y.T - X)).sum(axis=1) <-- compressed but memory inefficient (`A`=z @ y.T is dense) implementation
    # return grad_z.A
    # sparse matrices are represented by (data, rows, cols) triples
    sparse_X_tuple = (X.data, *X.nonzero())
    # create difference matrix sparse representation to avoid passing `hat_vect_matrix` as full dense matrix
    diff_matrix = sparse.csr_matrix(
        (_compute_sparse_difference_matrix(sparse_X_tuple, z, y)),
        shape=X.shape,
        dtype=z.dtype)

    # print("Norm-check", np.linalg.norm(hat_vect_matrix.multiply(diff_matrix).toarray() - hat_vect_matrix.multiply(z @ y.T - X).toarray() ) )

    # sum over rows (axis=1); np.asarray converts the np.matrix result to ndarray,
    # matching the declared return type
    return np.asarray(hat_vect_matrix.multiply(diff_matrix).sum(axis=1))
Example 10
def row_normalize_csr_matrix(matrix: csr_matrix) -> csr_matrix:
    """
    Row normalize a csr matrix without mutating the input
    :param matrix: scipy.sparse.csr_matrix instance
    """
    if not isinstance(matrix, csr_matrix):
        raise TypeError('expected input to be a scipy csr_matrix')
    if any(matrix.data == 0):
        raise ValueError(
            'input must be scipy.sparse.csr_matrix and must not store zeros')
    # get row index for every nonzero element in matrix
    row_idx, col_idx = matrix.nonzero()
    # compute unraveled row sums
    row_sums = matrix.sum(axis=1).A1
    # divide data by (broadcasted) row sums
    normalized = matrix.data / row_sums[row_idx]
    return csr_matrix((normalized, (row_idx, col_idx)), shape=matrix.shape)
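
A quick sketch showing each row summing to 1 while the input stays untouched:

import numpy as np
from scipy.sparse import csr_matrix

m = csr_matrix(np.array([[1., 3.],
                         [2., 2.]]))
print(row_normalize_csr_matrix(m).toarray())  # [[0.25 0.75] [0.5 0.5]]
print(m.toarray())  # unchanged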
Example 11
def extreme_multilabel_classification_report(
    y_true: csr_matrix, y_score: csr_matrix,
    k_range: Iterable = range(1, 11)) -> dict:
    """
    Unused function to get an overview of prediction results
    1. Precision at k
    2. DCG at k
    3. nDCG at k
    4. F1 (macro) score
    :param y_true:
    :param y_score:
    :param k_range:
    :return:
    """
    # TODO use sklearn function to check dimensions
    if y_true.shape != y_score.shape:
        raise ValueError('y_true and y_score must have the same dimensions')

    # init dict
    result = dict()
    result['precision@k'] = {}
    result['dcg@k'] = {}

    # precision at k
    for k in k_range:
        result['precision@k'][str(k)] = sparse_average_precision_at_k(y_true,
                                                                      y_score,
                                                                      k=k)
        result['dcg@k'][str(k)] = average_discounted_cumulative_gain_at_k(
            y_true, y_score, k=k)

    # TODO nDCG

    # F1 Macro Average
    # cast scores to binary matrix
    binary_pred = lil_matrix(y_score.shape, dtype='int8')
    binary_pred[y_score.nonzero()] = 1
    # binary_pred = binary_pred.tocsr()

    result['f1_macro'] = f1_score(y_true, binary_pred, average='macro')
    result['label_ranking_average_precision_score'] = \
        label_ranking_average_precision_score(y_true.toarray(), y_score.toarray())

    return result
Example 12
def rank_nodes(network: sparse.csr_matrix, num_walks=1024, max_walk_length=10):
    samples = np.random.uniform(0, 1, num_walks)
    distribution = np.histogram(samples, max_walk_length)[0]
    sparse_pointers = network.indptr
    sparse_neighbors = network.indices
    hashes = []
    degree = Counter(network.nonzero()[0])
    degree = [degree[i] if i in degree else 0 for i in range(network.shape[0])]
    for i in range(network.shape[0]):
        generated_walks = []
        # Generate walks
        for j, num in enumerate(distribution):
            walk_matrix = -np.ones((num, (j + 2)), dtype=np.uint32, order='C')
            walk_matrix = np.reshape(walk_matrix, (walk_matrix.size,), order='C')
            numba_walk_kernel(walk_matrix, i, sparse_pointers, sparse_neighbors, num_steps=j + 1, num_walks=num)
            wm = walk_matrix.tolist()
            generated_walks += [np.mean([degree[node] for node in wm[k:k + num]]) for k in range(0, len(wm), num)]
        hashes.append(np.mean(generated_walks))
    return hashes
Example 13
def _csr_swap_in_row(row: sparse.csr_matrix,
                     rng: np.random.Generator,
                     p: float = 0.1) -> sparse.csr_matrix:
    """
    Helper function for swapping nonzero values in a given row
    """
    assert row.shape[0] == 1, "Did not get a row!"
    nonzero_idx = row.nonzero()[1]
    shuffle_idx = np.arange(len(nonzero_idx))
    # Randomly choose a proportion of the nonzero indices to shuffle
    n = int(round(len(shuffle_idx) * p))
    swap_idx = nonzero_idx[rng.choice(shuffle_idx, size=n, replace=False)]
    # Shuffle the indices we chose above
    dest_idx = rng.choice(swap_idx, size=len(swap_idx), replace=False)
    assert swap_idx.shape == dest_idx.shape

    arr = row.toarray().squeeze()
    assert np.all(arr[swap_idx] != 0)
    arr[dest_idx] = arr[swap_idx]
    retval = sparse.csr_matrix(arr)
    return retval
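
A sketch with p=1.0, so every nonzero value in the row is shuffled among the existing nonzero positions while the zeros stay put:

import numpy as np
from scipy import sparse

rng = np.random.default_rng(1)
row = sparse.csr_matrix(np.array([[5., 0., 6., 0., 7., 8.]]))
print(_csr_swap_in_row(row, rng, p=1.0).toarray())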
Example 14
def sparse_mat_get_rmse(u_mat: ss.csr_matrix, v_mat: ss.csr_matrix, user_preference: ss.csr_matrix,
                        show_process: bool = True) -> np.float64:
    """
    稀疏矩阵情况下计算 RMSE
    :param u_mat: U
    :param v_mat: V
    :param user_preference: 用户偏好矩阵
    :param show_process: 是否显示计算进度
    :return: RMSE
    """
    non_zero = user_preference.nonzero()
    residue = 0
    total = non_zero[0].size
    for i in range(non_zero[0].size):
        if show_process:
            print('step', i, 'of', total)
        conducted = u_mat[non_zero[0][i], :].dot(v_mat[:, non_zero[1][i]])
        user_conducted = user_preference[non_zero[0][i], non_zero[1][i]]
        # print("user_conducted", user_conducted, "conducted", conducted)
        residue_each_element = user_conducted - conducted[0, 0]
        residue += residue_each_element ** 2
    # average over the observed entries accumulated above
    return np.sqrt(residue / total)
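
A tiny factorization check (hypothetical toy matrices, with scipy.sparse imported as ss to match the signature):

import numpy as np
import scipy.sparse as ss

u = ss.csr_matrix(np.array([[1., 0.],
                            [0., 1.]]))
v = ss.csr_matrix(np.array([[3., 0.],
                            [0., 4.]]))
prefs = ss.csr_matrix(np.array([[3., 0.],
                                [0., 2.]]))
# Residuals on the two observed cells are 0 and 2: sqrt((0 + 4) / 2) ~ 1.414
print(sparse_mat_get_rmse(u, v, prefs, show_process=False))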
Example 15
    def to_csv(self, filename: str, X: sparse.csr_matrix):
        """ Dump csr sparse matrix to csv file restoring original MovieLens format.
        Args:
            filename (str): Path of the output CSV file.
            X (scipy.sparse.csr_matrix): Matrix of ratings to dump.
        """

        data, rows, cols = X.data, *X.nonzero()
        with open(filename, mode='w') as file:
            file_matrix = csv.writer(file,
                                     delimiter=',',
                                     quotechar='"',
                                     quoting=csv.QUOTE_MINIMAL)
            file_matrix.writerow(['UserId', 'MovieId', 'Rating'])
            for rating, user_id, movie_id in zip(data, rows, cols):
                # user ids start from 1 in MovieLens
                user_id += 1
                # restore ratings to their original scale
                rating = self._rescale_back_rating(rating)
                # restore movie id to MovieLens system
                movie_id = self.inverse_movie_map[movie_id]
                file_matrix.writerow([user_id, movie_id, rating])
Example 16
    def _iter_meta(ids: ndarray, meta: csr_matrix,
                   n_dim: int) -> Iterator[List[int]]:
        """
        Lazily evaluate metadata in the provided CSR matrix.

        Parameters
        ----------
        ids: ndarray
            An array of IDs. For items, this will correspond to individual item IDs.
            For users, this will correspond to individual user IDs.
        meta: csr_matrix
            A sparse matrix of (NxM) dimensions, where N corresponds to the number of
            user/item IDs (above) and M corresponds to the number of user/item metadata
            features (vocab) in the dataset.
        n_dim: int
            The length of the output vectors. Make sure this is large enough to
            actually append some metadata to your output vectors (i.e. > 1).

        Returns
        -------
        output: Iterator
            An iterator, where each ID in the list is mapped to corresponding metadata.
            The output shape of each element is then a list of 'n_dim' length.

        """

        groups = defaultdict(list)
        _ids, tags = meta.nonzero()

        for _id, _tag in zip(_ids, tags):
            groups[_id].append(_tag)

        for _id in ids:
            group = groups[_id]
            padding = [0] * max(0, n_dim - len(group))
            features = [_id, *group, *padding][:n_dim]
            yield features
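
A sketch calling the method as a plain function (its signature takes no self), with the imports the snippet assumes in scope:

from collections import defaultdict
from typing import Iterator, List

import numpy as np
from numpy import ndarray
from scipy.sparse import csr_matrix

meta = csr_matrix(np.array([[0, 1, 1],
                            [1, 0, 0]]))
for vec in _iter_meta(np.array([0, 1]), meta, n_dim=3):
    print(vec)  # id 0 -> [0, 1, 2]; id 1 -> [1, 0, 0]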
Example 17
def normalize_vectors(mx: sparse.csr_matrix, axis: int) -> sparse.csr_matrix:
    """Performs normalization of vectors (i.e. divide each vector
    by its corresponding Euclidean norm).

    Parameter `axis` can be 0 (column-vectors) or 1 (row-vectors)

    :param mx: sparse matrix
    :param axis: 0 or 1
    :return: sparse matrix
    """

    if axis not in {0, 1}:
        raise ValueError('Axis must be either 0 or 1.')

    mx = mx.copy().astype(np.float64)

    mx_norms = mx.copy()
    mx_norms.data **= 2
    mx_norms = mx_norms.sum(axis=axis).A.flatten()**0.5
    mx_norms = mx_norms[mx.nonzero()[1 - axis]]

    mx.data /= mx_norms

    return mx
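
A sketch normalizing row-vectors, where each stored value is divided by the Euclidean norm of its row:

import numpy as np
from scipy import sparse

mx = sparse.csr_matrix(np.array([[3., 4.],
                                 [0., 2.]]))
print(normalize_vectors(mx, axis=1).toarray())  # [[0.6 0.8] [0. 1.]]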
Example 18
 def __save_to_docword_file(self, bag_of_words: csr_matrix,
                          issues: List[TokenizedIssue], target_dir: str) -> None:
     """
     Save words to docword file in following format:
     D (documents number)
     W (words number)
     NNZ (total rows)
     docID wordID count
     docID wordID count
     .....
     :param bag_of_words: Matrix where each cell represents number of word appearance in document
     :param issues: Tokenized issues
     :param target_dir: Target directory where docword file will be created
     :return: None
     """
     target_path = os.path.join(target_dir, "docword.issues.txt")
     with open(target_path, "w") as docword_file:
         docword_file.write(str(len(issues)) + "\n")
         docword_file.write(str(len(self.count_vectorizer.get_feature_names())) + "\n")
         docword_file.write(str(bag_of_words.nnz) + "\n")
         nnz_x, nnz_y = bag_of_words.nonzero()
         for x, y in zip(nnz_x, nnz_y):
             docword_file.write(
                 "%s %s %s\n" % (str(issues[x].id), str(y + 1), str(bag_of_words[x, y])))
Example 19
def csr_to_dicts(x: csr_matrix, dim_names=None):
    """Convert each row of a csr matrix to a {column name: value} dict."""
    if dim_names is None:
        dim_names = list(range(x.shape[1]))
    row_idx, col_idx = x.nonzero()
    row_dicts = [{} for _ in range(x.shape[0])]
    # x.data is aligned with the (row, col) pairs from nonzero() for a canonical csr
    for r, c, v in zip(row_idx, col_idx, x.data):
        row_dicts[r][dim_names[c]] = v
    return row_dicts
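
A one-line demonstration with named columns:

import numpy as np
from scipy.sparse import csr_matrix

x = csr_matrix(np.array([[1, 0, 2],
                         [0, 3, 0]]))
print(csr_to_dicts(x, dim_names=['a', 'b', 'c']))  # row 0 -> {'a': 1, 'c': 2}; row 1 -> {'b': 3}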
Example 20
def _mutual_proximity_empiric_sparse(S: csr_matrix,
                                     test_set_ind: np.ndarray = None,
                                     min_nnz=0,
                                     verbose: int = 0,
                                     log=None,
                                     n_jobs=None):
    """MP empiric for sparse similarity matrices.

    Please do not directly use this function, but invoke via 
    mutual_proximity_empiric()
    """
    if verbose and log:
        log.message("Starting MP empiric for sparse matrices.")
    self_value = 1.  # similarity matrix
    n = S.shape[0]
    if not n_jobs:
        n_jobs = 1
    elif n_jobs == -1:
        n_jobs = cpu_count()
    else:
        pass

    # This will become S_mp.data
    shared_data = Array(ctypes.c_double, S.data.size)
    shared_data_np = np.ctypeslib.as_array(shared_data.get_obj())

    if verbose and log:
        log.message("Spawning processes and starting MP computation.")
    with Pool(processes=n_jobs,
              initializer=_mpes_init,
              initargs=(S, shared_data)) as pool:
        S_nonzero = filterfalse(lambda ij: ij[0] > ij[1], zip(*S.nonzero()))
        for _ in pool.imap(func=partial(_mpes_sec_dist,
                                        args=(verbose, log, n, min_nnz)),
                           iterable=S_nonzero,
                           chunksize=int(1e5)):
            pass  # output stored by function in shared array
    pool.join()
    if verbose and log:
        log.message("Assemble upper-triangular MP matrix.")
    S_mp = csr_matrix((shared_data_np, S.indices, S.indptr),
                      shape=S.shape,
                      copy=False).tolil()
    del shared_data, shared_data_np
    if verbose and log:
        log.message("Symmetrizing matrix.")
    S_mp += S_mp.T
    # Retain original distances for objects with too few neighbors.
    # That is, keep distances FROM these objects to others (rows), but
    # set distances of other objects TO them to NaN (columns).
    # Returned matrix is thus NOT SYMMETRIC.
    if verbose and log:
        log.message(("Retain original similarities for objects with too few "
                     "neighbors. If there are any, the output matrix will "
                     "not be symmetric anymore! (Rows corresponding to these "
                     "objects will be in original space; corresponding "
                     "columns will contain NaN)."))
    for row in np.argwhere(S.getnnz(axis=1) <= min_nnz):
        row = row[0]  # use scalar for indexing instead of array
        S_mp[row, :] = S.getrow(row)
    if verbose and log:
        log.message("Setting self similarities.")
    for i in range(n):
        S_mp[i, i] = self_value  # need to set self values
    if verbose and log:
        log.message("Converting to CSR matrix and returning.")
    return S_mp.tocsr()
Example 21
 def _getPossiblePositiveEdgeIdxs(self, mtx: sp.csr_matrix) -> np.ndarray:
     # Pair each row index with its column index as an (nnz, 2) array
     return np.column_stack(mtx.nonzero())
Example 22
 def X_to_df(X: sps.csr_matrix, uids: np.ndarray) -> pd.DataFrame:
     rows, cols = X.nonzero()
     return pd.DataFrame(
         dict(user_id=[uids[row] for row in rows], item_id=unique_item_ids[cols])
     )
Example 23
 def X_to_df(X: sps.csr_matrix, uids: List[Any]) -> pd.DataFrame:
     rows, cols = X.nonzero()
     return pd.DataFrame(
         dict(user_id=[uids[row] for row in rows], item_id=item_id_reprod[cols])
     )