Example #1
def sparse_mat_get_rmse(u_mat: ss.csr_matrix, v_mat: ss.csr_matrix, user_preference: ss.csr_matrix,
                        show_process: bool = True) -> np.float64:
    """
    稀疏矩阵情况下计算 RMSE
    :param u_mat: U
    :param v_mat: V
    :param user_preference: 用户偏好矩阵
    :param show_process: 是否显示计算进度
    :return: RMSE
    """
    non_zero = user_preference.nonzero()
    residue = 0
    total = non_zero[0].size
    for i in range(non_zero[0].size):
        if show_process:
            print('step', i, 'of', total)
        conducted = u_mat[non_zero[0][i], :].dot(v_mat[:, non_zero[1][i]])
        user_conducted = user_preference[non_zero[0][i], non_zero[1][i]]
        # print("user_conducted", user_conducted, "conducted", conducted)
        residue_each_element = user_conducted - conducted[0, 0]
        residue += residue_each_element ** 2
    return np.sqrt(residue / np.size(user_preference))
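
A minimal usage sketch, assuming `ss` is `scipy.sparse` and `np` is `numpy` as in the example; the factor matrices and the preference matrix below are made up for illustration.
import numpy as np
import scipy.sparse as ss

rng = np.random.default_rng(0)
# Hypothetical low-rank factors: u_mat is (n_users x k), v_mat is (k x n_items).
u_mat = ss.csr_matrix(rng.random((4, 2)))
v_mat = ss.csr_matrix(rng.random((2, 5)))
# Sparse preference matrix with three observed ratings at (0, 0), (1, 2) and (3, 4).
user_preference = ss.csr_matrix(([5.0, 3.0, 1.0], ([0, 1, 3], [0, 2, 4])), shape=(4, 5))
print(sparse_mat_get_rmse(u_mat, v_mat, user_preference, show_process=False))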
Example #2
def is_acyclic(adjacency: sparse.csr_matrix) -> bool:
    """Check whether a graph has no cycle.

    Parameters
    ----------
    adjacency:
        Adjacency matrix of the graph.

    Returns
    -------
    is_acyclic : bool
        A boolean with value True if the graph has no cycle and False otherwise
    """
    n_nodes = adjacency.shape[0]
    n_cc = sparse.csgraph.connected_components(adjacency,
                                               directed=not is_symmetric(adjacency),
                                               connection='strong',
                                               return_labels=False)
    if n_cc == n_nodes:
        # check for self-loops (= cycles)
        return (adjacency.diagonal() == 0).all()
    else:
        return False
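
A small usage sketch, assuming `is_symmetric` behaves like the stand-in below (the original presumably imports this helper from its own utilities).
import numpy as np
from scipy import sparse

def is_symmetric(adjacency: sparse.csr_matrix) -> bool:
    # Minimal stand-in for the helper called above.
    return (adjacency != adjacency.T).nnz == 0

# Directed acyclic graph 0 -> 1 -> 2: every node is its own strongly connected component.
dag = sparse.csr_matrix(np.array([[0, 1, 0],
                                  [0, 0, 1],
                                  [0, 0, 0]]))
print(is_acyclic(dag))      # expected True

# Adding the edge 2 -> 0 closes a cycle.
cyclic = sparse.csr_matrix(np.array([[0, 1, 0],
                                     [0, 0, 1],
                                     [1, 0, 0]]))
print(is_acyclic(cyclic))   # expected False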
Example #3
def sparse_average_precision_at_k(y_true: csr_matrix, y_scores: csr_matrix, k: int = 5) -> float:
    """
    Computes the average precision at k for sparse binary matrices.
    :param y_true: ground truth in binary format (n_samples, n_labels)
    :param y_scores: predictions in a representation that can be ranked (e.g. probabilities)
    :param k: top k labels to check
    :return: average precision at k score
    """
    if y_true.shape != y_scores.shape:
        raise Exception('y_true and y_pred must have the same shape')
    if y_true.shape[1] < k:
        raise Exception('Fewer labels than k')

    # get indices of k top values of y_pred
    top_idx = top_n_idx_sparse(y_scores, k)
    # create new matrix with shape == y_true.shape with only top ranked labels
    y_pred_binary_only_top = lil_matrix(y_true.shape, dtype='int8')
    for index, idx_row in enumerate(top_idx):
        y_pred_binary_only_top[index, idx_row] = 1
    y_pred_binary_only_top = y_pred_binary_only_top.tocsr()
    # compute precision

    # get correct predicted labels
    correct_labelled = y_true.multiply(y_pred_binary_only_top)
    summed_precision = []

    for index, (row, score_row) in enumerate(zip(correct_labelled, y_scores)):
        # check special case that corresponding y_true row is empty => unlabeled instance
        if y_true[index].count_nonzero() == 0:
            # if no labels were predicted, add 1 to the sum
            if score_row.count_nonzero() == 0:
                summed_precision.append(1.0)
            else:
                summed_precision.append(0)
        else:
            summed_precision.append(row.count_nonzero() / k)

    return sum(summed_precision) / len(summed_precision)
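
The helper `top_n_idx_sparse` is not shown above; a plausible stand-in and a tiny usage sketch (both assumptions, not the original code) might look like this.
import numpy as np
from scipy.sparse import csr_matrix

def top_n_idx_sparse(matrix: csr_matrix, n: int):
    # Assumed behaviour: column indices of the n largest stored values in each row.
    top_idx = []
    for start, end in zip(matrix.indptr[:-1], matrix.indptr[1:]):
        order = np.argsort(matrix.data[start:end])[::-1][:n]
        top_idx.append(matrix.indices[start:end][order])
    return top_idx

y_true = csr_matrix(np.array([[1, 0, 1, 0],
                              [0, 1, 0, 0]]))
y_scores = csr_matrix(np.array([[0.9, 0.1, 0.3, 0.2],
                                [0.2, 0.8, 0.1, 0.4]]))
print(sparse_average_precision_at_k(y_true, y_scores, k=2))   # expected 0.75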
Example #4
    def train(self, x_train: csr_matrix, y_train: csr_matrix,
              x_test: csr_matrix, y_test: csr_matrix, epochs, mem_size,
              batch_size):
        saved_weights = find_checkpoint_file('.', self.model_name)

        k_start = 1
        if len(saved_weights) != 0:
            print('[INFO] Saved weights found, loading...')
            # Recover the epoch number embedded in the checkpoint file name.
            epoch_start = saved_weights.find('epoch_') + 6
            epoch = saved_weights[epoch_start:saved_weights.find('_', epoch_start)]
            self.model.load_weights(saved_weights)
            k_start = int(epoch) + 1

        test_data = (self._vectorize(x_test, self.X_vocab_len),
                     self._vectorize(y_test, self.y_vocab_len))
        y_test_array = y_test.toarray()

        prev_acc = 0.0
        epoch = k_start
        lr = self.learning_rate
        while epoch <= epochs and lr > 0.0000001:
            # for epoch in range(k_start, epochs + 1):
            acc = self._train_epoch(epoch, batch_size, mem_size, test_data,
                                    x_train, y_test_array, y_train)
            print('Accuracy', acc)
            if acc >= prev_acc:
                self.model.save_weights(
                    f'{self.model_name}_epoch_{epoch}_{acc}_{self.embedding_dim}_{self.hidden_dim}_{self.layer_num}_{self.dropout}_{lr}.hdf5'
                )
                epoch += 1
                prev_acc = acc
            else:
                saved_weights = find_checkpoint_file('.', self.model_name)
                self.model.load_weights(saved_weights)
                lr *= 0.1
                self.model.optimizer.lr.assign(lr)
Example #5
    def fit(self, X: sp.csr_matrix, n_samples: int):
        """Learn the idf vector (global term weights).

        Arguments:
            X: A matrix of term/token counts.
            n_samples: Number of total documents
        """
        X = check_array(X, accept_sparse=('csr', 'csc'))
        if not sp.issparse(X):
            X = sp.csr_matrix(X)
        dtype = np.float64

        if self.use_idf:
            _, n_features = X.shape
            self.df = np.squeeze(np.asarray(X.sum(axis=0)))
            idf = np.log(n_samples / self.df)
            self._idf_diag = sp.diags(idf,
                                      offsets=0,
                                      shape=(n_features, n_features),
                                      format='csr',
                                      dtype=dtype)

        return self
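
A standalone sketch of the same idf computation on a made-up count matrix (the surrounding class and its `use_idf` flag are omitted; names here are illustrative).
import numpy as np
import scipy.sparse as sp

# Hypothetical counts for 3 documents over 4 terms; every term occurs somewhere.
X = sp.csr_matrix(np.array([[1, 0, 2, 1],
                            [0, 1, 1, 0],
                            [1, 1, 0, 1]]))
n_samples = X.shape[0]
df = np.squeeze(np.asarray(X.sum(axis=0)))   # summed counts per term, as in fit()
idf = np.log(n_samples / df)
idf_diag = sp.diags(idf, offsets=0, format='csr')
print(idf_diag.diagonal())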
Example #6
 def __init__(self, A: sparse.csr_matrix, L: sparse.csr_matrix, batch_size=1):
     '''
     A trick dataset for graphs: batch_size is passed in here so that, during training, the DataLoader always uses batch_size=1.
     :param A:
     :param L:
     :param batch_size:
     '''
     # self.dts = []
     # dataset_size = A.shape[0]
     # steps_per_epoch = (dataset_size - 1) // batch_size + 1
     # for i in range(steps_per_epoch):
     #     index = np.arange(
     #         i * batch_size, min((i + 1) * batch_size, dataset_size))
     #     A_train = A[index, :].todense()
     #     L_train = L[index][:, index].todense()
     #
     #     A_train = torch.tensor(A_train)
     #     L_train = torch.tensor(L_train)
     #     batch_inp = [A_train, L_train]
     #     self.dts.append(batch_inp)
     self.A = A
     self.L = L
     self.size = A.get_shape()[0]
Example #7
def make_weights(distribution: str, adjacency: sparse.csr_matrix) -> np.ndarray:
    """Array of weights from a matrix and a desired distribution.

    Parameters
    ----------
    distribution:
        Distribution for node sampling. Only ``'degree'`` or ``'uniform'`` are accepted.
    adjacency:
        The adjacency matrix of the neighbors.

    Returns
    -------
    node_weights: np.ndarray
        Valid weights of nodes.
    """
    n = adjacency.shape[0]
    if distribution == 'degree':
        node_weights_vec = adjacency.dot(np.ones(adjacency.shape[1]))
    elif distribution == 'uniform':
        node_weights_vec = np.ones(n)
    else:
        raise ValueError('Unknown distribution of node weights.')
    return node_weights_vec
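
A quick usage sketch on a made-up 3-node path graph.
import numpy as np
from scipy import sparse

adjacency = sparse.csr_matrix(np.array([[0, 1, 0],
                                        [1, 0, 1],
                                        [0, 1, 0]]))
print(make_weights('degree', adjacency))    # expected [1. 2. 1.]
print(make_weights('uniform', adjacency))   # expected [1. 1. 1.]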
Example #8
def _split_features_to_input(x: ss.csr_matrix, idx1: int, idx2: int)\
        -> Tuple[np.ndarray, np.ndarray, np.ndarray]:
    """
    Convert individual features to arrays corresponding to the three inputs for
    the neural network.

    Parameters
    ----------
    x : ss.csr_matrix
        Sparse feature array with encodings as rows.
    idx1 : int
        First index to split the feature arrays.
    idx2 : int
        Second index to split the feature arrays.

    Returns
    -------
    Tuple[np.ndarray, np.ndarray, np.ndarray]
        The features are split in three arrays according to the two split
        indexes.
    """
    x = x.toarray()
    return x[:, :idx1], x[:, idx1:idx2], x[:, idx2:]
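
For illustration, splitting a made-up feature matrix with eight columns at indices 3 and 5.
import numpy as np
import scipy.sparse as ss

x = ss.csr_matrix(np.arange(16).reshape(2, 8))
a, b, c = _split_features_to_input(x, 3, 5)
print(a.shape, b.shape, c.shape)   # expected (2, 3) (2, 2) (2, 3)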
Example #9
    def _iter_meta(ids: ndarray, meta: csr_matrix,
                   n_dim: int) -> Iterator[List[int]]:
        """
        Lazily evaluate metadata in the provided CSR matrix.

        Parameters
        ----------
        ids: ndarray
            An array of IDs. For items, this will correspond to individual item IDs.
            For users, this will correspond to individual user IDs.
        meta: csr_matrix
            A sparse matrix of (NxM) dimensions, where N corresponds to the number of
            user/item IDs (above) and M corresponds to the number of user/item metadata
            features (vocab) in the dataset.
        n_dim: int
            The length of the output vectors. Make sure this is large enough to
            actually append some metadata to your output vectors (i.e. > 1).

        Returns
        -------
        output: Iterator
            An iterator, where each ID in the list is mapped to corresponding metadata.
            The output shape of each element is then a list of 'n_dim' length.

        """

        groups = defaultdict(list)
        _ids, tags = meta.nonzero()

        for _id, _tag in zip(_ids, tags):
            groups[_id].append(_tag)

        for _id in ids:
            group = groups[_id]
            padding = [0] * max(0, n_dim - len(group))
            features = [_id, *group, *padding][:n_dim]
            yield features
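
Since `_iter_meta` takes no `self`, it can be exercised directly, assuming it is exposed as a static method or module-level helper; the metadata below is made up.
import numpy as np
from scipy.sparse import csr_matrix

# Two IDs; ID 0 carries metadata features 2 and 5, ID 1 carries feature 3.
meta = csr_matrix(np.array([[0, 0, 1, 0, 0, 1],
                            [0, 0, 0, 1, 0, 0]]))
for vec in _iter_meta(np.array([0, 1]), meta, n_dim=4):
    print(vec)   # expected [0, 2, 5, 0] then [1, 3, 0, 0]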
Example #10
 def fit(self,
         X: spa.csr_matrix,
         Y: spa.csr_matrix,
         X_val=None,
         Y_val=None):
     ''' Fit model to data. X_val and Y_val are only used to report accuracy
     during optimization; they do not affect the fitted W, b parameters. '''
     if X.ndim == 1:
         X = X.reshape(1, -1)
     N, D = X.shape
     self.encoder = LabelEncoder()
     y = self.encoder.fit_transform(Y)
     K = len(self.encoder.classes_)
     Z = np.zeros((N, K), dtype=int)
     Z[np.arange(N), y] = 1
     if not (X_val is None):
         N_val = len(X_val)
         y_val = self.encoder.transform(Y_val)
         Z_val = np.zeros((N_val, K), dtype=int)
         Z_val[np.arange(N_val), y_val] = 1
     else:
         Z_val = None
     b_guess = np.zeros(K)
     W_guess = np.random.normal(0, 1, (K, D)) / np.sqrt(D)
     self.b, self.W = LR.optimize_logistic_weights(
         X,
         Z,
         b_guess,
         W_guess,
         X_val=X_val,
         Z_val=Z_val,
         penalty=self.penalty,
         learning_rate=self.learning_rate,
         batch_size=self.batch_size,
         tol=self.tol,
         max_iter=self.max_iter,
         verbose=self.verbose)
Example #11
def with_attribute_anomolies(
        node_attrs: sp.csr_matrix,
        num_candidates: int,
        num_anomolies: int = 1) -> Tuple[sp.csr_matrix, np.ndarray]:
    """
    Get attribute matrix with some rows replaced with others.

    For each anomaly, we replace the node's attributes with those of the candidate
    (out of `num_candidates` randomly chosen nodes) whose attributes are furthest
    from the original w.r.t. the Euclidean norm.

    Args:
        node_attrs: [num_nodes, num_attrs] sparse attributes.
        num_candidates: number of candidates per anomaly.
        num_anomolies: number of anomalies to overwrite.

    Returns:
        augmented_node_attrs: node attributes with anomalous node attributes replaced.
        mapping: [num_anomolies, 2] int32 array, where
        `augmented_node_attrs[mapping[i, 1]] == node_attrs[mapping[i, 0]]`
    """
    num_nodes = node_attrs.shape[0]
    node_attrs_lil = node_attrs.tolil()
    anomolies = np.random.choice(num_nodes, num_anomolies, replace=False)
    anomolies.sort()
    mapping = np.empty((num_anomolies, 2), dtype=np.int32)
    for i, a in enumerate(anomolies):
        candidates = np.random.choice(num_nodes, num_candidates, replace=False)
        norms = np.linalg.norm(node_attrs[a].todense() -
                               node_attrs[candidates].todense(),
                               axis=-1)
        max_norm = np.argmax(norms)
        replacement = candidates[max_norm]
        node_attrs_lil[a] = node_attrs[replacement]
        mapping[i] = a, replacement
    return node_attrs_lil.tocsr(), mapping
Example #12
def _mutual_proximity_empiric_sparse(S: csr_matrix, test_set_ind: np.ndarray = None, verbose: int = 0, log=None):
    """MP empiric for sparse similarity matrices.

    Please do not use this function directly; invoke it via
    mutual_proximity_empiric().
    """
    self_value = 1.0  # similarity matrix
    n = S.shape[0]
    S_mp = lil_matrix(S.shape)

    for i, j in zip(*triu(S).nonzero()):
        if verbose and log and ((i + 1) % 1000 == 0 or i == n - 2):
            log.message("MP_empiric: {} of {}.".format(i + 1, n - 1), flush=True)
        d = S[j, i]
        dI = S.getrow(i).toarray()
        dJ = S.getrow(j).toarray()
        nz = (dI > 0) & (dJ > 0)
        S_mp[i, j] = (nz & (dI <= d) & (dJ <= d)).sum() / (nz.sum() - 1)

    S_mp += S_mp.T
    for i in range(n):
        S_mp[i, i] = self_value  # need to set self values

    return S_mp.tocsr()
Example #13
def ndcg(X_true: csr_matrix, X_top_k: np.array, R=100) -> np.array:
    """ Calculate ndcg@R for each users in X_true and X_pred matrices

    Args:
        X_true: Matrix containing True values for user-item interactions
        X_top_k: Matrix containing inidices picked by model
        R: Number of elements taken into consideration

    Returns:
        Numpy array containing calculated ndcg@R for each user
    """

    penalties = 1. / np.log2(np.arange(2, R + 2))
    # X_true is sparse; densify before gathering the relevance of each top-R pick,
    # then reduce over the R positions to get one DCG value per user.
    selected = np.take_along_axis(X_true.toarray(), X_top_k[:, :R], axis=-1)

    DCG = (selected * penalties).sum(axis=-1)

    cpenalties = np.empty(R + 1)
    np.cumsum(penalties, out=cpenalties[1:])
    cpenalties[0] = 0
    maxhit = np.minimum(X_true.getnnz(axis=1), R)
    IDCG = cpenalties[maxhit]

    return DCG / IDCG
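
A toy sanity check, assuming the dense gather and per-user reduction shown above; the numbers are illustrative.
import numpy as np
from scipy.sparse import csr_matrix

# Two users, four items; X_true marks true interactions, X_top_k is a made-up ranking.
X_true = csr_matrix(np.array([[1, 0, 1, 0],
                              [0, 1, 0, 0]]))
X_top_k = np.array([[2, 1, 0, 3],
                    [1, 3, 2, 0]])
print(ndcg(X_true, X_top_k, R=2))   # roughly [0.61, 1.0]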
Example #14
def calculate_min_violations(A: csr_matrix) -> (float, float):
    """
    Calculate the minimum number of violations in a graph for all possible rankings
    A violation is an edge going from a lower-ranked node to a higher-ranked one.
    Minimum number is calculated by summing bidirectional interactions.
    Input:
        A: graph adjacency matrix where A[i,j] is the weight of an edge from node i to j
    Output:
        minimum number of violations
        proportion of all edges against minimum violations
    """

    ii, ji, v = scipy.sparse.find(
        A
    )  # I,J,V contain the row, column indices, and values of the nonzero entries.

    min_viol = 0.0
    for e in range(len(v)):  # for all nodes interactions
        i, j = ii[e], ji[e]
        if A[i, j] > 0 and A[j, i] > 0:
            min_viol = min_viol + min(A[i, j], A[j, i])

    m = A.sum()
    return (min_viol, min_viol / m)
Example #15
def compute_tf_idf(doc_matrix: sparse.csr_matrix) -> sparse.csr_matrix:

    # Assume doc_matrix rows already sum to 1 (term frequencies), so only the idf part needs computing.

    # First, find the coordinates of all non-zero entries.
    i, nonzero_cols, v = sparse.find(doc_matrix)

    # Then count the number of non-zero entries in each column.
    nonzero_cols, nonzero_col_appearences = np.unique(nonzero_cols,
                                                      return_counts=True)

    # Some columns may be entirely zero, so start from a zero indicator and fill in the observed columns.
    indicator = np.zeros(shape=(
        1,
        doc_matrix.shape[1],
    ), dtype=np.float32)
    indicator[0, nonzero_cols] = nonzero_col_appearences[:]
    n_articles = doc_matrix.shape[0]
    indicator = np.log(n_articles / indicator)

    # Element-wise multiplication.
    tf_idf = sparse.csr_matrix(doc_matrix.multiply(indicator))

    return tf_idf
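
A small usage sketch on a row-normalised term-frequency matrix (values are made up).
import numpy as np
from scipy import sparse

tf = np.array([[0.5, 0.5, 0.0],
               [1.0, 0.0, 0.0],
               [0.0, 0.5, 0.5]])
print(compute_tf_idf(sparse.csr_matrix(tf)).toarray())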
Example #16
 def __init__(self, k, data: csr_matrix):
     self.__spent_norm_diff = 0
     self.k = k
     if issparse(data) or isinstance(data, np.ndarray):
         self.__toArray = lambda x: x.toarray() if hasattr(
             data[0], 'toarray') else lambda x: np.array(x)
         try:  # Whenever possible, use dense arrays: they are far faster to operate on
             if isinstance(data, np.ndarray):
                 tmp = data - np.array(data[0])
             else:
                 tmp = data.toarray() - data[0].toarray()
             self._data = tmp
             self.isSparse = False
         except BaseException as e:
             print("Warning: sparse data", e)
             self._data = data
             self.isSparse = True
         self._dataLen = dataLen = data.shape[0]
     elif isinstance(data, list) or isinstance(data, tuple):
         self._data = np.array(data)
         self._dataLen = dataLen = len(data)
         self.isSparse = False
     else:
         raise TypeError(
             """The 'data' argument must be one of the following wypes:
   <scipy_sparse_matrix>, np.ndarray, list os tuple""")
     if not 0 < self.k <= self._dataLen:
         raise ValueError(
             "The 'k' number of centers must be in the range [1, data_sample]"
         )
     self._centroidsIndex = np.zeros(k, dtype=np.uint16)
     self._computedCentroids = 0
     self._probab = np.full((k, dataLen), -np.inf)
     self._minDistanceToNearestCentroid = np.full(
         (1, dataLen), np.inf, dtype=np.float64
     )  # the distance to any centroid is initially infinite
Example #17
def snn_dissimilarity_func(graph: csr_matrix, n_neighbors: int, *args,
                           **kwargs) -> csr_matrix:
    """Default SNN dissimilarity function

    Computes the dissimilarity between two points in terms of shared nearest neighbors

    Args:
        graph (scipy.sparse.csr_matrix): sparse matrix with dimensions (n_samples, n_samples),
         where element ij represents the distance between points i and j
        n_neighbors (int): number of neighbors in the k-neighborhood search
    """

    graph.data[graph.data > 0] = 1
    n_samples = graph.shape[0]

    # Add the point as its own neighbor
    graph += spdiags(np.ones(n_samples), diags=0, m=n_samples, n=n_samples)
    matrix = graph * graph.transpose()
    matrix.sort_indices()

    # The lower the "closer"
    matrix.data = n_neighbors - matrix.data

    return matrix
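
A usage sketch; building the input k-neighbour distance graph with scikit-learn here is an assumption, and any (n_samples, n_samples) CSR distance graph would do.
import numpy as np
from sklearn.neighbors import kneighbors_graph

# Hypothetical 2-D points: three close together, one far away.
points = np.array([[0.0, 0.0], [0.1, 0.0], [0.2, 0.1], [5.0, 5.0]])
knn = kneighbors_graph(points, n_neighbors=2, mode='distance')   # CSR distance graph
dissim = snn_dissimilarity_func(knn, n_neighbors=2)
print(dissim.toarray())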
Example #18
 def __save_to_docword_file(self, bag_of_words: csr_matrix,
                          issues: List[TokenizedIssue], target_dir: str) -> None:
     """
     Save words to a docword file in the following format:
     D (documents number)
     W (words number)
     NNZ (total rows)
     docID wordID count
     docID wordID count
     .....
     :param bag_of_words: Matrix where each cell holds the number of occurrences of a word in a document
     :param issues: Tokenized issues
     :param target_dir: Target directory where docword file will be created
     :return: None
     """
     target_path = os.path.join(target_dir, "docword.issues.txt")
     with open(target_path, "w") as docword_file:
         docword_file.write(str(len(issues)) + "\n")
         docword_file.write(str(len(self.count_vectorizer.get_feature_names())) + "\n")
         docword_file.write(str(bag_of_words.nnz) + "\n")
         nnz_x, nnz_y = bag_of_words.nonzero()
         for x, y in zip(nnz_x, nnz_y):
             docword_file.write(
                 "%s %s %s\n" % (str(issues[x].id), str(y + 1), str(bag_of_words[x, y])))
Example #19
def csr_to_dicts(x: csr_matrix, dim_names=None):
    if dim_names is None:
        dim_names = [i for i in range(x.shape[1])]
    vert_idx, horiz_idx = x.nonzero()
    return [{dim_names[k]: v
             for k, v in zip(horiz_idx[np.where(vert_idx == row_idx)],
                             x.data[np.where(vert_idx == row_idx)])}
            for row_idx in range(x.shape[0])]
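
For example (made-up values):
import numpy as np
from scipy.sparse import csr_matrix

x = csr_matrix(np.array([[0, 2, 0],
                         [1, 0, 3]]))
print(csr_to_dicts(x, dim_names=['a', 'b', 'c']))   # expected [{'b': 2}, {'a': 1, 'c': 3}]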
Example #20
 def left_sparse_dot(self, matrix: sparse.csr_matrix):
     """Left dot product with a sparse matrix"""
     self.backward = matrix.dot(self.backward)
     return self
Example #21
def svg_bigraph(biadjacency: sparse.csr_matrix,
                names_row: Optional[np.ndarray] = None, names_col: Optional[np.ndarray] = None,
                labels_row: Optional[Union[dict, np.ndarray]] = None,
                labels_col: Optional[Union[dict, np.ndarray]] = None,
                scores_row: Optional[Union[dict, np.ndarray]] = None,
                scores_col: Optional[Union[dict, np.ndarray]] = None,
                membership_row: Optional[sparse.csr_matrix] = None,
                membership_col: Optional[sparse.csr_matrix] = None,
                seeds_row: Union[list, dict] = None, seeds_col: Union[list, dict] = None,
                position_row: Optional[np.ndarray] = None, position_col: Optional[np.ndarray] = None,
                reorder: bool = True, width: Optional[float] = 400,
                height: Optional[float] = 300, margin: float = 20, margin_text: float = 3, scale: float = 1,
                node_size: float = 7, node_size_min: float = 1, node_size_max: float = 20,
                display_node_weight: bool = False,
                node_weights_row: Optional[np.ndarray] = None, node_weights_col: Optional[np.ndarray] = None,
                node_width: float = 1, node_width_max: float = 3,
                color_row: str = 'gray', color_col: str = 'gray', label_colors: Optional[Iterable] = None,
                display_edges: bool = True, edge_labels: Optional[list] = None, edge_width: float = 1,
                edge_width_min: float = 0.5, edge_width_max: float = 10, edge_color: str = 'black',
                display_edge_weight: bool = True,
                font_size: int = 12, filename: Optional[str] = None) -> str:
    """Return SVG image of a bigraph.

    Parameters
    ----------
    biadjacency :
        Biadjacency matrix of the graph.
    names_row :
        Names of the rows.
    names_col :
        Names of the columns.
    labels_row :
        Labels of the rows (negative values mean no label).
    labels_col :
        Labels of the columns (negative values mean no label).
    scores_row :
        Scores of the rows (measure of importance).
    scores_col :
        Scores of the columns (measure of importance).
    membership_row :
        Membership of the rows (label distribution).
    membership_col :
        Membership of the columns (label distribution).
    seeds_row :
        Rows to be highlighted (if dict, only keys are considered).
    seeds_col :
        Columns to be highlighted (if dict, only keys are considered).
    position_row :
        Positions of the rows.
    position_col :
        Positions of the columns.
    reorder :
        Use clustering to order nodes.
    width :
        Width of the image.
    height :
        Height of the image.
    margin :
        Margin of the image.
    margin_text :
        Margin between node and text.
    scale :
        Multiplicative factor on the dimensions of the image.
    node_size :
        Size of nodes.
    node_size_min :
        Minimum size of nodes.
    node_size_max :
        Maximum size of nodes.
    display_node_weight :
        If ``True``, display node weights through node size.
    node_weights_row :
        Weights of rows (used only if **display_node_weight** is ``True``).
    node_weights_col :
        Weights of columns (used only if **display_node_weight** is ``True``).
    node_width :
        Width of node circle.
    node_width_max :
        Maximum width of node circle.
    color_row :
        Default color of rows (svg color).
    color_col :
        Default color of cols (svg color).
    label_colors :
        Colors of the labels (svg color).
    display_edges :
        If ``True``, display edges.
    edge_labels :
        Labels of the edges, as a list of tuples (source, destination, label)
    edge_width :
        Width of edges.
    edge_width_min :
        Minimum width of edges.
    edge_width_max :
        Maximum width of edges.
    display_edge_weight :
        If ``True``, display edge weights through edge widths.
    edge_color :
        Default color of edges (svg color).
    font_size :
        Font size.
    filename :
        Filename for saving image (optional).

    Returns
    -------
    image : str
        SVG image.

    Example
    -------
    >>> from sknetwork.data import movie_actor
    >>> biadjacency = movie_actor()
    >>> from sknetwork.visualization import svg_bigraph
    >>> image = svg_bigraph(biadjacency)
    >>> image[1:4]
    'svg'
    """
    n_row, n_col = biadjacency.shape

    # node positions
    if position_row is None or position_col is None:
        position_row = np.zeros((n_row, 2))
        position_col = np.ones((n_col, 2))
        if reorder:
            bilouvain = BiLouvain()
            bilouvain.fit(biadjacency)
            index_row = np.argsort(bilouvain.labels_row_)
            index_col = np.argsort(bilouvain.labels_col_)
        else:
            index_row = np.arange(n_row)
            index_col = np.arange(n_col)
        position_row[index_row, 1] = np.arange(n_row)
        position_col[index_col, 1] = np.arange(n_col) + .5 * (n_row - n_col)
    position = np.vstack((position_row, position_col))

    # node colors
    colors_row = get_node_colors(n_row, labels_row, scores_row, membership_row, color_row, label_colors)
    colors_col = get_node_colors(n_col, labels_col, scores_col, membership_col, color_col, label_colors)

    # node sizes
    if node_weights_row is None:
        node_weights_row = biadjacency.dot(np.ones(n_col))
    if node_weights_col is None:
        node_weights_col = biadjacency.T.dot(np.ones(n_row))
    node_sizes_row, node_sizes_col = get_node_sizes_bipartite(node_weights_row, node_weights_col,
                                                              node_size, node_size_min, node_size_max,
                                                              display_node_weight)

    # node widths
    node_widths_row = get_node_widths(n_row, seeds_row, node_width, node_width_max)
    node_widths_col = get_node_widths(n_col, seeds_col, node_width, node_width_max)

    # rescaling
    if not width and not height:
        raise ValueError("You must specify either the width or the height of the image.")
    position, width, height = rescale(position, width, height, margin, node_size, node_size_max, display_node_weight)

    # node names
    if names_row is not None:
        text_length = np.max(np.array([len(str(name)) for name in names_row]))
        position[:, 0] += text_length * font_size * .5
        width += text_length * font_size * .5
    if names_col is not None:
        text_length = np.max(np.array([len(str(name)) for name in names_col]))
        width += text_length * font_size * .5

    # scaling
    position *= scale
    height *= scale
    width *= scale
    position_row = position[:n_row]
    position_col = position[n_row:]

    svg = """<svg width="{}" height="{}"  xmlns="http://www.w3.org/2000/svg">\n""".format(width, height)

    # edges
    if display_edges:
        biadjacency_coo = sparse.coo_matrix(biadjacency)

        if edge_color is None:
            if names_row is None and names_col is None:
                edge_color = 'black'
            else:
                edge_color = 'gray'

        edge_colors, edge_order, edge_colors_residual = get_edge_colors(biadjacency, edge_labels, edge_color,
                                                                        label_colors)
        edge_widths = get_edge_widths(biadjacency_coo, edge_width, edge_width_min, edge_width_max, display_edge_weight)

        for ix in edge_order:
            i = biadjacency_coo.row[ix]
            j = biadjacency_coo.col[ix]
            color = edge_colors[ix]
            svg += svg_edge(pos_1=position_row[i], pos_2=position_col[j], edge_width=edge_widths[ix], edge_color=color)

        for i, j, color in edge_colors_residual:
            svg += svg_edge(pos_1=position_row[i], pos_2=position_col[j], edge_width=edge_width, edge_color=color)

    # nodes
    for i in range(n_row):
        if membership_row is None:
            svg += svg_node(position_row[i], node_sizes_row[i], colors_row[i], node_widths_row[i])
        else:
            if membership_row[i].nnz == 1:
                index = membership_row[i].indices[0]
                svg += svg_node(position_row[i], node_sizes_row[i], colors_row[index], node_widths_row[i])
            else:
                svg += svg_pie_chart_node(position_row[i], node_sizes_row[i], membership_row[i].todense(),
                                          colors_row, node_widths_row[i])

    for i in range(n_col):
        if membership_col is None:
            svg += svg_node(position_col[i], node_sizes_col[i], colors_col[i], node_widths_col[i])
        else:
            if membership_col[i].nnz == 1:
                index = membership_col[i].indices[0]
                svg += svg_node(position_col[i], node_sizes_col[i], colors_col[index], node_widths_col[i])
            else:
                svg += svg_pie_chart_node(position_col[i], node_sizes_col[i], membership_col[i].todense(),
                                          colors_col, node_widths_col[i])
    # text
    if names_row is not None:
        for i in range(n_row):
            svg += svg_text(position_row[i] - (margin_text + node_sizes_row[i], 0), names_row[i], font_size, True)
    if names_col is not None:
        for i in range(n_col):
            svg += svg_text(position_col[i] + (margin_text + node_sizes_col[i], 0), names_col[i], font_size)
    svg += """</svg>\n"""

    if filename is not None:
        with open(filename + '.svg', 'w') as f:
            f.write(svg)

    return svg
Example #22
 def predict(self, X: csr_matrix) -> np.array:
     return self.clf.predict(X.toarray())
Example #23
 def weight_matrix(self, dataMatrix: sps.csr_matrix, feature_data):
     feature_data[feature_data > 1] = np.log(feature_data[feature_data > 1])
     dataMatrix.data = dataMatrix.data * feature_data
     return dataMatrix
Example #24
def compare_news_vector_with_1(arr, vec: csr_matrix):
    return 1 - vec._mul_vector(arr)
Example #25
def compare_news_vector_with_(arr, vec: csr_matrix):
    return 1 - vec.dot(arr)
Example #26
 def predict(self, X: csr_matrix):
     X = X.toarray()
     return self.clf.predict(X)
Example #27
 def _eliminate(matrix: sp.csr_matrix, user_indices, item_indices):
     matrix = matrix.copy()
     # `lil_matrix` is too slow
     matrix[list(user_indices), list(item_indices)] = 0
     matrix.eliminate_zeros()
     return matrix
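
A sketch of what the helper does on a made-up interaction matrix, assuming the static helper is reachable directly.
import numpy as np
import scipy.sparse as sp

ratings = sp.csr_matrix(np.array([[5, 0, 3],
                                  [0, 4, 0]]))
# Zero out entries (0, 0) and (1, 1), then drop them from the sparse storage.
cleaned = _eliminate(ratings, user_indices=[0, 1], item_indices=[0, 1])
print(cleaned.toarray())   # [[0 0 3], [0 0 0]]
print(cleaned.nnz)         # expected 1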
Example #28
 def predict(self, X: csr_matrix):
     X = X.todense()  # TensorFlow/Skflow doesn't support sparse matrices
     return self.clf.predict(X)
Example #29
def advanced_subclass_handling(data_frame: pd.DataFrame,
                               URM_train: csr_matrix,
                               path="../../data/",
                               add_subclass=False):
    """
    Here we want to include subclass information in the training set in the following way:
    - A column encoding the mean of 'label' for a given (user, subclass) pair: i.e. how many
      items of that subclass the user liked
    - Information about the popularity of the subclass (how many items belong to that subclass)
    - Ratings of that subclass

    :param URM_train: mean response will be retrieved from here
    :param data_frame: dataframe being pre-processed for boosting
    :param path: path to the folder containing subclass dataframe
    :return: dataframe with augmented information
    """
    print("Adding subclass and feature engineering subclass...")
    data_frame = data_frame.copy()

    df_subclass: pd.DataFrame = pd.read_csv(path + "data_ICM_sub_class.csv")
    df_subclass = df_subclass[['row', 'col']]
    df_subclass = df_subclass.rename(columns={"col": "subclass"})

    # Merging sub class information
    data_frame = pd.merge(data_frame,
                          df_subclass,
                          right_on="row",
                          left_on="item_id")
    data_frame = data_frame.drop(columns=["row"], inplace=False)

    print("\t- Add items present for each subclass")
    # Add subclass item-popularity: how many items are present of that subclass
    subclass_item_count = df_subclass.groupby("subclass").count()
    data_frame = pd.merge(data_frame,
                          subclass_item_count,
                          right_index=True,
                          left_on="subclass")
    data_frame = data_frame.rename(columns={"row": "item_per_subclass"})

    print("\t- Add ratings popularity for each subclass")
    # Add subclass ratings-popularity: how many interactions we have for each subclass
    URM_train_csc = URM_train.tocsc()
    n_ratings_sub = []

    sorted_sub_indices = np.argsort(df_subclass['subclass'].values)
    sorted_sub = df_subclass['subclass'][sorted_sub_indices].values
    sorted_item_subclass = df_subclass['row'][sorted_sub_indices].values

    unique_sorted_sub, sub_indptr = np.unique(sorted_sub, return_index=True)
    sub_indptr = np.concatenate([sub_indptr, [sorted_sub.size]])
    for i, sub in tqdm(enumerate(unique_sorted_sub),
                       total=unique_sorted_sub.size,
                       desc="\t\tProcessing"):
        item_sub = sorted_item_subclass[sub_indptr[i]:sub_indptr[i + 1]]
        n_ratings_sub.append(URM_train_csc[:, item_sub].data.size)

    ratings_sub = np.array([unique_sorted_sub, n_ratings_sub])
    ratings_per_sub_df = pd.DataFrame(
        data=np.transpose(ratings_sub),
        columns=["subclass", "global_ratings_per_subclass"])

    data_frame = pd.merge(data_frame,
                          ratings_per_sub_df,
                          left_on="subclass",
                          right_on="subclass")

    # Add subclass ratings-popularity for each user using rating percentage
    print("\t- Add ratings popularity for pairs (user, subclass)")
    users = data_frame['user_id'].values
    sub = data_frame['subclass'].values

    perc_array = np.zeros(users.size)
    rat_array = np.zeros(users.size)
    for i, user in tqdm(enumerate(users),
                        total=users.size,
                        desc="\t\tProcessing"):
        curr_sub = sub[i]
        curr_sub_index = np.searchsorted(unique_sorted_sub, curr_sub)

        # Find items of this subclass
        item_sub = sorted_item_subclass[
            sub_indptr[curr_sub_index]:sub_indptr[curr_sub_index + 1]]
        user_item = URM_train.indices[URM_train.indptr[user]:URM_train.
                                      indptr[user + 1]]

        total_user_likes = user_item.size
        mask = np.in1d(item_sub, user_item)
        likes_per_sub = item_sub[mask].size
        user_p = likes_per_sub / total_user_likes
        perc_array[i] = user_p
        rat_array[i] = likes_per_sub

    data_frame["subclass_user_like_perc"] = perc_array
    data_frame["subclass_user_like_quantity"] = rat_array

    if not add_subclass:
        data_frame = data_frame.drop(columns=["subclass"], inplace=False)

    return data_frame
Example #30
 def weight_matrix(self, dataMatrix: sps.csr_matrix, feature_data):
     dataMatrix.data = dataMatrix.data * feature_data
     return dataMatrix
Example #31
def compare_news_vector_with_1(arr, vec: csr_matrix):
    return 1 - vec._mul_vector(arr)
Example #32
 def weight_matrix(self, dataMatrix: sps.csr_matrix, feature_data):
     dataMatrix.data = dataMatrix.data * (1 / np.log1p(feature_data))
     return dataMatrix
Example #33
 def predict_proba(self, X: csr_matrix):
     return self.clf.predict_proba(X.todense())
Example #34
def compare_news_vector_with_(arr, vec: csr_matrix):
    return 1 - vec.dot(arr)
Example #35
 def predict_proba(self, X: csr_matrix):
     return self.clf.predict_proba(X.todense())
Example #36
def compute_norms(matrix: sparse.csr_matrix) -> np.ndarray:
    """Computes norms for each row."""
    return np.sqrt(matrix.multiply(matrix).sum(axis=1).A).flatten()
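
For example:
import numpy as np
from scipy import sparse

m = sparse.csr_matrix(np.array([[3.0, 4.0, 0.0],
                                [0.0, 0.0, 2.0]]))
print(compute_norms(m))   # expected [5. 2.]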
Example #37
 def predict_proba(self, X: csr_matrix):
     return self.clf.predict(X.toarray())
Example #38
def unshift_label_matrix(L_sparse: csr_matrix) -> np.ndarray:
    """Unshift a sparse label matrix (ABATAIN as 0) to a dense one (ABSTAIN as -1)."""
    return L_sparse.toarray() - 1
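
For example, with labels shifted so that 0 means ABSTAIN:
import numpy as np
from scipy.sparse import csr_matrix

L_sparse = csr_matrix(np.array([[0, 1],
                                [2, 0]]))
print(unshift_label_matrix(L_sparse))   # expected [[-1  0], [ 1 -1]]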
Example #39
 def __init__(self, X: csr_matrix, Y: np.array, tune_parameters=False):
     super().__init__(X, Y, tune_parameters)
     self.X, self.Y = X.toarray(), Y
     self.classifier = SVC(decision_function_shape='ovo')
     self.clf = BaggingClassifier(self.classifier, n_estimators=self.estimators, n_jobs=8,
                                  max_samples=self.max_samples, max_features=self.max_features)