def sparse_mat_get_rmse(u_mat: ss.csr_matrix, v_mat: ss.csr_matrix,
                        user_preference: ss.csr_matrix,
                        show_process: bool = True) -> np.float64:
    """
    Compute the RMSE when the matrices are sparse.

    :param u_mat: U
    :param v_mat: V
    :param user_preference: user preference matrix
    :param show_process: whether to print computation progress
    :return: RMSE
    """
    non_zero = user_preference.nonzero()
    residue = 0
    total = non_zero[0].size
    for i in range(total):
        if show_process:
            print('step', i, 'of', total)
        conducted = u_mat[non_zero[0][i], :].dot(v_mat[:, non_zero[1][i]])
        user_conducted = user_preference[non_zero[0][i], non_zero[1][i]]
        residue_each_element = user_conducted - conducted[0, 0]
        residue += residue_each_element ** 2
    # np.size on a sparse matrix returns the number of stored entries,
    # i.e. the number of observed ratings.
    return np.sqrt(residue / np.size(user_preference))
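# Usage sketch (not from the original source): assuming `np` is numpy and `ss`
# is scipy.sparse as in the snippet above, a toy factorization check could look
# like this; the expected value is roughly 0.378 (sqrt(1/7)).
import numpy as np
import scipy.sparse as ss

u_mat = ss.csr_matrix(np.array([[1.0, 0.0],
                                [0.0, 1.0],
                                [1.0, 1.0]]))
v_mat = ss.csr_matrix(np.array([[1.0, 0.0, 2.0, 0.0],
                                [0.0, 1.0, 0.0, 2.0]]))
user_preference = ss.csr_matrix(np.array([[1.0, 0.0, 1.0, 0.0],
                                          [0.0, 1.0, 0.0, 2.0],
                                          [1.0, 1.0, 0.0, 0.0]]))
print(sparse_mat_get_rmse(u_mat, v_mat, user_preference, show_process=False))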
def is_acyclic(adjacency: sparse.csr_matrix) -> bool: """Check whether a graph has no cycle. Parameters ---------- adjacency: Adjacency matrix of the graph. Returns ------- is_acyclic : bool A boolean with value True if the graph has no cycle and False otherwise """ n_nodes = adjacency.shape[0] n_cc = sparse.csgraph.connected_components(adjacency, (not is_symmetric(adjacency)), 'strong', False) if n_cc == n_nodes: # check for self-loops (= cycles) return (adjacency.diagonal() == 0).all() else: return False
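# Usage sketch (not from the original source): `is_acyclic` relies on an
# `is_symmetric` helper that is not shown in this snippet, so a hypothetical
# stand-in is defined here for the example.
import numpy as np
from scipy import sparse
import scipy.sparse.csgraph  # make sure the csgraph submodule is loaded

def is_symmetric(adjacency: sparse.csr_matrix) -> bool:
    # hypothetical helper: True if the adjacency matrix equals its transpose
    return (adjacency != adjacency.T).nnz == 0

# directed acyclic graph: 0 -> 1 -> 2
dag = sparse.csr_matrix(np.array([[0, 1, 0],
                                  [0, 0, 1],
                                  [0, 0, 0]]))
# directed cycle: 0 -> 1 -> 2 -> 0
cycle = sparse.csr_matrix(np.array([[0, 1, 0],
                                    [0, 0, 1],
                                    [1, 0, 0]]))
print(is_acyclic(dag))    # expected: True
print(is_acyclic(cycle))  # expected: False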
def sparse_average_precision_at_k(y_true: csr_matrix, y_scores: csr_matrix,
                                  k: int = 5) -> float:
    """
    Computes the average precision at k for sparse binary matrices.

    :param y_true: ground truth in binary format (n_samples, n_labels)
    :param y_scores: predictions in a representation that can be ranked (e.g. probabilities)
    :param k: top k labels to check
    :return: average precision at k score
    """
    if y_true.shape != y_scores.shape:
        raise Exception('y_true and y_scores must have the same shape')
    if y_true.shape[1] < k:
        raise Exception('Fewer labels than k')

    # get indices of the k top values of y_scores
    top_idx = top_n_idx_sparse(y_scores, k)

    # create a new matrix with shape == y_true.shape containing only the top ranked labels
    y_pred_binary_only_top = lil_matrix(y_true.shape, dtype='int8')
    for index, idx_row in enumerate(top_idx):
        y_pred_binary_only_top[index, idx_row] = 1
    y_pred_binary_only_top = y_pred_binary_only_top.tocsr()

    # compute precision: keep only the correctly predicted labels
    correct_labelled = y_true.multiply(y_pred_binary_only_top)

    summed_precision = []
    for index, (row, score_row) in enumerate(zip(correct_labelled, y_scores)):
        # special case: the corresponding y_true row is empty => unlabeled instance
        if y_true[index].count_nonzero() == 0:
            # if no labels were predicted either, count it as a perfect prediction
            if score_row.count_nonzero() == 0:
                summed_precision.append(1.0)
            else:
                summed_precision.append(0)
        else:
            summed_precision.append(row.count_nonzero() / k)

    return sum(summed_precision) / len(summed_precision)
def train(self, x_train: csr_matrix, y_train: csr_matrix, x_test: csr_matrix, y_test: csr_matrix, epochs, mem_size, batch_size): saved_weights = find_checkpoint_file('.', self.model_name) k_start = 1 if len(saved_weights) != 0: print('[INFO] Saved weights found, loading...') epoch = saved_weights[saved_weights.find('epoch_') + 6:saved_weights. find('_', saved_weights.find('epoch_') + 6)] self.model.load_weights(saved_weights) k_start = int(epoch) + 1 test_data = (self._vectorize(x_test, self.X_vocab_len), self._vectorize(y_test, self.y_vocab_len)) y_test_array = y_test.toarray() prev_acc = 0.0 epoch = k_start lr = self.learning_rate while epoch <= epochs and lr > 0.0000001: # for epoch in range(k_start, epochs + 1): acc = self._train_epoch(epoch, batch_size, mem_size, test_data, x_train, y_test_array, y_train) print('Accuracy', acc) if acc >= prev_acc: self.model.save_weights( f'{self.model_name}_epoch_{epoch}_{acc}_{self.embedding_dim}_{self.hidden_dim}_{self.layer_num}_{self.dropout}_{lr}.hdf5' ) epoch += 1 prev_acc = acc else: saved_weights = find_checkpoint_file('.', self.model_name) self.model.load_weights(saved_weights) lr *= 0.1 self.model.optimizer.lr.assign(lr)
def fit(self, X: sp.csr_matrix, n_samples: int): """Learn the idf vector (global term weights). Arguments: X: A matrix of term/token counts. n_samples: Number of total documents """ X = check_array(X, accept_sparse=('csr', 'csc')) if not sp.issparse(X): X = sp.csr_matrix(X) dtype = np.float64 if self.use_idf: _, n_features = X.shape self.df = np.squeeze(np.asarray(X.sum(axis=0))) idf = np.log(n_samples / self.df) self._idf_diag = sp.diags(idf, offsets=0, shape=(n_features, n_features), format='csr', dtype=dtype) return self
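# Standalone sketch (not from the original source) of the same idf weighting as
# the fit() above, since the surrounding class is not shown here; note the plain
# log(n_samples / df) form with no smoothing.
import numpy as np
import scipy.sparse as sp

# toy term-count matrix: 3 documents, 4 terms (every term occurs at least once)
X = sp.csr_matrix(np.array([[1, 0, 2, 1],
                            [0, 1, 1, 0],
                            [1, 1, 0, 1]]))
n_samples, n_features = X.shape

df = np.squeeze(np.asarray(X.sum(axis=0)))  # per-term counts over the corpus
idf = np.log(n_samples / df)
idf_diag = sp.diags(idf, offsets=0, shape=(n_features, n_features), format='csr')

print(X.dot(idf_diag).toarray())  # counts re-weighted by idf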
def __init__(self, A: sparse.csr_matrix, L: sparse.csr_matrix, batch_size=1):
    '''
    A trick dataset for graphs: batch_size is handled here, so the DataLoader
    used during training always keeps batch_size=1.
    :param A:
    :param L:
    :param batch_size:
    '''
    # self.dts = []
    # dataset_size = A.shape[0]
    # steps_per_epoch = (dataset_size - 1) // batch_size + 1
    # for i in range(steps_per_epoch):
    #     index = np.arange(
    #         i * batch_size, min((i + 1) * batch_size, dataset_size))
    #     A_train = A[index, :].todense()
    #     L_train = L[index][:, index].todense()
    #
    #     A_train = torch.tensor(A_train)
    #     L_train = torch.tensor(L_train)
    #     batch_inp = [A_train, L_train]
    #     self.dts.append(batch_inp)
    self.A = A
    self.L = L
    self.size = A.get_shape()[0]
def make_weights(distribution: str, adjacency: sparse.csr_matrix) -> np.ndarray: """Array of weights from a matrix and a desired distribution. Parameters ---------- distribution: Distribution for node sampling. Only ``'degree'`` or ``'uniform'`` are accepted. adjacency: The adjacency matrix of the neighbors. Returns ------- node_weights: np.ndarray Valid weights of nodes. """ n = adjacency.shape[0] if distribution == 'degree': node_weights_vec = adjacency.dot(np.ones(adjacency.shape[1])) elif distribution == 'uniform': node_weights_vec = np.ones(n) else: raise ValueError('Unknown distribution of node weights.') return node_weights_vec
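# Usage sketch (not from the original source): toy adjacency with node 0
# connected to nodes 1 and 2.
import numpy as np
from scipy import sparse

adjacency = sparse.csr_matrix(np.array([[0, 1, 1],
                                        [1, 0, 0],
                                        [1, 0, 0]]))
print(make_weights('degree', adjacency))   # expected: [2. 1. 1.]
print(make_weights('uniform', adjacency))  # expected: [1. 1. 1.]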
def _split_features_to_input(x: ss.csr_matrix, idx1: int, idx2: int)\
        -> Tuple[np.ndarray, np.ndarray, np.ndarray]:
    """
    Convert individual features to arrays corresponding to the three inputs
    for the neural network.

    Parameters
    ----------
    x : ss.csr_matrix
        Sparse feature array with encodings as rows.
    idx1 : int
        First index to split the feature arrays.
    idx2 : int
        Second index to split the feature arrays.

    Returns
    -------
    Tuple[np.ndarray, np.ndarray, np.ndarray]
        The features split in three arrays according to the two split indexes.
    """
    x = x.toarray()
    return x[:, :idx1], x[:, idx1:idx2], x[:, idx2:]
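# Usage sketch (not from the original source): splitting a 6-column feature
# matrix at columns 2 and 4 yields three (2, 2) dense blocks.
import numpy as np
import scipy.sparse as ss

x = ss.csr_matrix(np.arange(12).reshape(2, 6))
a, b, c = _split_features_to_input(x, 2, 4)
print(a.shape, b.shape, c.shape)  # expected: (2, 2) (2, 2) (2, 2)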
def _iter_meta(ids: ndarray, meta: csr_matrix, n_dim: int) -> Iterator[List[int]]:
    """
    Lazily evaluate metadata in the provided CSR matrix.

    Parameters
    ----------
    ids: ndarray
        An array of IDs. For items, this will correspond to individual item
        IDs. For users, this will correspond to individual user IDs.
    meta: csr_matrix
        A sparse matrix of (NxM) dimensions, where N corresponds to the number
        of user/item IDs (above) and M corresponds to the number of user/item
        metadata features (vocab) in the dataset.
    n_dim: int
        The length of the output vectors. Make sure this is large enough to
        actually append some metadata to your output vectors (i.e. > 1).

    Returns
    -------
    output: Iterator
        An iterator, where each ID in the list is mapped to corresponding
        metadata. The output shape of each element is then a list of
        'n_dim' length.
    """
    groups = defaultdict(list)
    _ids, tags = meta.nonzero()
    for _id, _tag in zip(_ids, tags):
        groups[_id].append(_tag)

    for _id in ids:
        group = groups[_id]
        padding = [0] * max(0, n_dim - len(group))
        features = [_id, *group, *padding][:n_dim]
        yield features
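# Usage sketch (not from the original source), assuming the snippet's own
# imports (numpy, scipy.sparse, collections.defaultdict) are in place: item 0
# carries tags {0, 2}, item 1 carries tag {1}.
import numpy as np
from scipy.sparse import csr_matrix

meta = csr_matrix(np.array([[1, 0, 1],
                            [0, 1, 0]]))
ids = np.array([0, 1])
print(list(_iter_meta(ids, meta, n_dim=4)))
# yields [0, 0, 2, 0] for item 0 and [1, 1, 0, 0] for item 1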
def fit(self, X: spa.csr_matrix, Y: spa.csr_matrix, X_val=None, Y_val=None): ''' Fit model to data. X_val and Y_val are only used to report accuracy during optimization they do not affect the fitted W,b parameters''' if X.ndim == 1: X = X.reshape(1, -1) N, D = X.shape self.encoder = LabelEncoder() y = self.encoder.fit_transform(Y) K = len(self.encoder.classes_) Z = np.zeros((N, K), dtype=int) Z[np.arange(N), y] = 1 if not (X_val is None): N_val = len(X_val) y_val = self.encoder.transform(Y_val) Z_val = np.zeros((N_val, K), dtype=int) Z_val[np.arange(N_val), y_val] = 1 else: Z_val = None b_guess = np.zeros(K) W_guess = np.random.normal(0, 1, (K, D)) / np.sqrt(D) self.b, self.W = LR.optimize_logistic_weights( X, Z, b_guess, W_guess, X_val=X_val, Z_val=Z_val, penalty=self.penalty, learning_rate=self.learning_rate, batch_size=self.batch_size, tol=self.tol, max_iter=self.max_iter, verbose=self.verbose)
def with_attribute_anomolies(
        node_attrs: sp.csr_matrix, num_candidates: int,
        num_anomolies: int = 1) -> Tuple[sp.csr_matrix, np.ndarray]:
    """
    Get attribute matrix with some rows replaced with others.

    For each anomaly, we replace the attributes with those of the node whose
    attributes are furthest away from the original w.r.t. Euclidean norm,
    chosen among `num_candidates` candidates.

    Args:
        node_attrs: [num_nodes, num_attrs] sparse attributes.
        num_candidates: number of candidates per anomaly.
        num_anomolies: number of anomalies to overwrite.

    Returns:
        augmented_node_attrs: node attributes with anomalous node attributes
            replaced.
        mapping: [num_anomolies, 2] int32 array, where
            `augmented_node_attrs[mapping[i, 0]] == node_attrs[mapping[i, 1]]`
    """
    num_nodes = node_attrs.shape[0]
    node_attrs_lil = node_attrs.tolil()
    anomolies = np.random.choice(num_nodes, num_anomolies, replace=False)
    anomolies.sort()
    mapping = np.empty((num_anomolies, 2), dtype=np.int32)
    for i, a in enumerate(anomolies):
        candidates = np.random.choice(num_nodes, num_candidates, replace=False)
        norms = np.linalg.norm(node_attrs[a].todense() -
                               node_attrs[candidates].todense(), axis=-1)
        max_norm = np.argmax(norms)
        replacement = candidates[max_norm]
        node_attrs_lil[a] = node_attrs[replacement]
        mapping[i] = a, replacement
    return node_attrs_lil.tocsr(), mapping
def _mutual_proximity_empiric_sparse(S: csr_matrix, test_set_ind: np.ndarray = None, verbose: int = 0, log=None): """MP empiric for sparse similarity matrices. Please do not directly use this function, but invoke via mutual_proximity_empiric() """ self_value = 1.0 # similarity matrix n = S.shape[0] S_mp = lil_matrix(S.shape) for i, j in zip(*triu(S).nonzero()): if verbose and log and ((i + 1) % 1000 == 0 or i == n - 2): log.message("MP_empiric: {} of {}.".format(i + 1, n - 1), flush=True) d = S[j, i] dI = S.getrow(i).toarray() dJ = S.getrow(j).toarray() nz = (dI > 0) & (dJ > 0) S_mp[i, j] = (nz & (dI <= d) & (dJ <= d)).sum() / (nz.sum() - 1) S_mp += S_mp.T for i in range(n): S_mp[i, i] = self_value # need to set self values return S_mp.tocsr()
def ndcg(X_true: csr_matrix, X_top_k: np.array, R=100) -> np.array:
    """
    Calculate ndcg@R for each user in the X_true and X_top_k matrices

    Args:
        X_true: Matrix containing true values for user-item interactions
        X_top_k: Matrix containing indices picked by the model
        R: Number of elements taken into consideration

    Returns:
        Numpy array containing the calculated ndcg@R for each user
    """
    penalties = 1. / np.log2(np.arange(2, R + 2))
    # fancy indexing on a sparse matrix returns np.matrix; convert to ndarray
    selected = np.asarray(np.take_along_axis(X_true, X_top_k[:, :R], axis=-1))
    # sum the discounted gains over the top-R positions for each user
    DCG = (selected * penalties).sum(axis=-1)
    cpenalties = np.empty(R + 1)
    np.cumsum(penalties, out=cpenalties[1:])
    cpenalties[0] = 0
    maxhit = np.minimum(X_true.getnnz(axis=1), R)
    IDCG = cpenalties[maxhit]
    return DCG / IDCG
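# Toy check (not from the original source): 2 users, 4 items, with X_top_k
# holding the item indices ranked by the model; the expected output is roughly
# [0.92, 1.0].
import numpy as np
from scipy.sparse import csr_matrix

X_true = csr_matrix(np.array([[1, 0, 1, 0],
                              [0, 1, 0, 0]], dtype=float))
X_top_k = np.array([[2, 1, 0],
                    [1, 3, 0]])
print(ndcg(X_true, X_top_k, R=3))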
def calculate_min_violations(A: csr_matrix) -> (float, float):
    """
    Calculate the minimum number of violations in a graph over all possible rankings.

    A violation is an edge going from a lower ranked node to a higher ranked one.
    The minimum number is calculated by summing bidirectional interactions.

    Input:
        A: graph adjacency matrix where A[i, j] is the weight of an edge from node i to j
    Output:
        minimum number of violations
        proportion of all edges against minimum violations
    """
    # ii, ji, v contain the row indices, column indices, and values of the nonzero entries
    ii, ji, v = scipy.sparse.find(A)
    min_viol = 0.0
    for e in range(len(v)):  # for all node interactions
        i, j = ii[e], ji[e]
        # note: each reciprocated pair is visited once per direction by this loop
        if A[i, j] > 0 and A[j, i] > 0:
            min_viol = min_viol + min(A[i, j], A[j, i])
    m = A.sum()
    return (min_viol, min_viol / m)
def compute_tf_idf(doc_matrix: sparse.csr_matrix) -> sparse.csr_matrix:
    # Assume doc_matrix is already row-normalized (each row sums to 1),
    # so only the idf part needs to be computed here.
    # First, find the coordinates of all nonzero entries.
    i, nonzero_cols, v = sparse.find(doc_matrix)
    # Then count the number of nonzero entries in each column.
    nonzero_cols, nonzero_col_appearences = np.unique(nonzero_cols, return_counts=True)
    # Some columns may be all zeros, so start from a zero indicator
    # and only fill in the columns that actually appear.
    indicator = np.zeros(shape=(1, doc_matrix.shape[1]), dtype=np.float32)
    indicator[0, nonzero_cols] = nonzero_col_appearences[:]
    n_articles = doc_matrix.shape[0]
    indicator = np.log(n_articles / indicator)
    # Element-wise multiplication here.
    tf_idf = sparse.csr_matrix(doc_matrix.multiply(indicator))
    return tf_idf
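# Usage sketch (not from the original source): a toy row-normalized
# term-frequency matrix with 3 documents and 3 terms.
import numpy as np
from scipy import sparse

tf = sparse.csr_matrix(np.array([[0.5, 0.5, 0.0],
                                 [0.0, 1.0, 0.0],
                                 [0.5, 0.0, 0.5]]))
print(compute_tf_idf(tf).toarray())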
def __init__(self, k, data: csr_matrix):
    self.__spent_norm_diff = 0
    self.k = k
    if issparse(data) or isinstance(data, np.ndarray):
        # pick a converter matching the input type
        self.__toArray = (lambda x: x.toarray()) if hasattr(
            data[0], 'toarray') else (lambda x: np.array(x))
        try:
            # Whenever possible, use dense arrays since they are vastly
            # faster to operate on.
            if isinstance(data, np.ndarray):
                tmp = data
            else:
                tmp = data.toarray()
            # shift by the first sample
            tmp = tmp - np.array(tmp[0])
            self._data = tmp
            self.isSparse = False
        except BaseException as e:
            print("Warning: sparse data", e)
            self._data = data
            self.isSparse = True
        self._dataLen = dataLen = data.shape[0]
    elif isinstance(data, list) or isinstance(data, tuple):
        self._data = np.array(data)
        self._dataLen = dataLen = len(data)
        self.isSparse = False
    else:
        raise TypeError(
            """The 'data' argument must be one of the following types:
            <scipy_sparse_matrix>, np.ndarray, list or tuple""")
    if not 0 < self.k <= self._dataLen:
        raise ValueError(
            "The 'k' number of centers must be in the range [1, data_sample]"
        )
    self._centroidsIndex = np.zeros(k, dtype=np.uint16)
    self._computedCentroids = 0
    self._probab = np.full((k, dataLen), -np.inf)
    # the distance to any centroid is initially infinite
    self._minDistanceToNearestCentroid = np.full(
        (1, dataLen), np.inf, dtype=np.float64)
def snn_dissimilarity_func(graph: csr_matrix, n_neighbors: int, *args, **kwargs) -> csr_matrix: """Default SNN dissimilarity function Computes the dissimilarity between two points in terms of shared nearest neighbors Args: graph (scipy.sparse.csr_matrix): sparse matrix with dimensions (n_samples, n_samples), where the element ij represents the distance between the point i and j n_neighbors (int): number of neighbors in the k-neighborhood search """ graph.data[graph.data > 0] = 1 n_samples = graph.shape[0] # Add the point as its own neighbor graph += spdiags(np.ones(n_samples), diags=0, m=n_samples, n=n_samples) matrix = graph * graph.transpose() matrix.sort_indices() # The lower the "closer" matrix.data = n_neighbors - matrix.data return matrix
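# Usage sketch (not from the original source), assuming scikit-learn is
# available to build the k-nearest-neighbor distance graph that this function
# expects as input.
import numpy as np
from sklearn.neighbors import kneighbors_graph

rng = np.random.RandomState(0)
X = rng.rand(10, 2)
n_neighbors = 3

knn_graph = kneighbors_graph(X, n_neighbors, mode='distance')
snn = snn_dissimilarity_func(knn_graph, n_neighbors)
print(snn.toarray())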
def __save_to_docword_file(self, bag_of_words: csr_matrix, issues: List[TokenizedIssue], target_dir: str) -> None: """ Save words to docword file in following format: D (documents number) W (words number) NNZ (total rows) docID wordID count docID wordID count ..... :param bag_of_words: Matrix where each cell represents number of word appearance in document :param issues: Tokenized issues :param target_dir: Target directory where docword file will be created :return: None """ target_path = os.path.join(target_dir, "docword.issues.txt") with open(target_path, "w") as docword_file: docword_file.write(str(len(issues)) + "\n") docword_file.write(str(len(self.count_vectorizer.get_feature_names())) + "\n") docword_file.write(str(bag_of_words.nnz) + "\n") nnz_x, nnz_y = bag_of_words.nonzero() for x, y in zip(nnz_x, nnz_y): docword_file.write( "%s %s %s\n" % (str(issues[x].id), str(y + 1), str(bag_of_words[x, y])))
def csr_to_dicts(x: csr_matrix, dim_names=None):
    if dim_names is None:
        dim_names = [i for i in range(x.shape[1])]
    vert_idx, horiz_idx = x.nonzero()
    return [{dim_names[k]: v
             for k, v in zip(horiz_idx[np.where(vert_idx == row_idx)],
                             x.data[np.where(vert_idx == row_idx)])}
            for row_idx in range(x.shape[0])]
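# Usage sketch (not from the original source): each row becomes a dict keyed by
# the names of its nonzero columns.
import numpy as np
from scipy.sparse import csr_matrix

x = csr_matrix(np.array([[1, 0, 2],
                         [0, 3, 0]]))
print(csr_to_dicts(x, dim_names=['a', 'b', 'c']))
# expected values: [{'a': 1, 'c': 2}, {'b': 3}]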
def left_sparse_dot(self, matrix: sparse.csr_matrix): """Left dot product with a sparse matrix""" self.backward = matrix.dot(self.backward) return self
def svg_bigraph(biadjacency: sparse.csr_matrix, names_row: Optional[np.ndarray] = None, names_col: Optional[np.ndarray] = None, labels_row: Optional[Union[dict, np.ndarray]] = None, labels_col: Optional[Union[dict, np.ndarray]] = None, scores_row: Optional[Union[dict, np.ndarray]] = None, scores_col: Optional[Union[dict, np.ndarray]] = None, membership_row: Optional[sparse.csr_matrix] = None, membership_col: Optional[sparse.csr_matrix] = None, seeds_row: Union[list, dict] = None, seeds_col: Union[list, dict] = None, position_row: Optional[np.ndarray] = None, position_col: Optional[np.ndarray] = None, reorder: bool = True, width: Optional[float] = 400, height: Optional[float] = 300, margin: float = 20, margin_text: float = 3, scale: float = 1, node_size: float = 7, node_size_min: float = 1, node_size_max: float = 20, display_node_weight: bool = False, node_weights_row: Optional[np.ndarray] = None, node_weights_col: Optional[np.ndarray] = None, node_width: float = 1, node_width_max: float = 3, color_row: str = 'gray', color_col: str = 'gray', label_colors: Optional[Iterable] = None, display_edges: bool = True, edge_labels: Optional[list] = None, edge_width: float = 1, edge_width_min: float = 0.5, edge_width_max: float = 10, edge_color: str = 'black', display_edge_weight: bool = True, font_size: int = 12, filename: Optional[str] = None) -> str: """Return SVG image of a bigraph. Parameters ---------- biadjacency : Biadjacency matrix of the graph. names_row : Names of the rows. names_col : Names of the columns. labels_row : Labels of the rows (negative values mean no label). labels_col : Labels of the columns (negative values mean no label). scores_row : Scores of the rows (measure of importance). scores_col : Scores of the columns (measure of importance). membership_row : Membership of the rows (label distribution). membership_col : Membership of the columns (label distribution). seeds_row : Rows to be highlighted (if dict, only keys are considered). seeds_col : Columns to be highlighted (if dict, only keys are considered). position_row : Positions of the rows. position_col : Positions of the columns. reorder : Use clustering to order nodes. width : Width of the image. height : Height of the image. margin : Margin of the image. margin_text : Margin between node and text. scale : Multiplicative factor on the dimensions of the image. node_size : Size of nodes. node_size_min : Minimum size of nodes. node_size_max : Maximum size of nodes. display_node_weight : If ``True``, display node weights through node size. node_weights_row : Weights of rows (used only if **display_node_weight** is ``True``). node_weights_col : Weights of columns (used only if **display_node_weight** is ``True``). node_width : Width of node circle. node_width_max : Maximum width of node circle. color_row : Default color of rows (svg color). color_col : Default color of cols (svg color). label_colors : Colors of the labels (svg color). display_edges : If ``True``, display edges. edge_labels : Labels of the edges, as a list of tuples (source, destination, label) edge_width : Width of edges. edge_width_min : Minimum width of edges. edge_width_max : Maximum width of edges. display_edge_weight : If ``True``, display edge weights through edge widths. edge_color : Default color of edges (svg color). font_size : Font size. filename : Filename for saving image (optional). Returns ------- image : str SVG image. 
Example ------- >>> from sknetwork.data import movie_actor >>> biadjacency = movie_actor() >>> from sknetwork.visualization import svg_bigraph >>> image = svg_bigraph(biadjacency) >>> image[1:4] 'svg' """ n_row, n_col = biadjacency.shape # node positions if position_row is None or position_col is None: position_row = np.zeros((n_row, 2)) position_col = np.ones((n_col, 2)) if reorder: bilouvain = BiLouvain() bilouvain.fit(biadjacency) index_row = np.argsort(bilouvain.labels_row_) index_col = np.argsort(bilouvain.labels_col_) else: index_row = np.arange(n_row) index_col = np.arange(n_col) position_row[index_row, 1] = np.arange(n_row) position_col[index_col, 1] = np.arange(n_col) + .5 * (n_row - n_col) position = np.vstack((position_row, position_col)) # node colors colors_row = get_node_colors(n_row, labels_row, scores_row, membership_row, color_row, label_colors) colors_col = get_node_colors(n_col, labels_col, scores_col, membership_col, color_col, label_colors) # node sizes if node_weights_row is None: node_weights_row = biadjacency.dot(np.ones(n_col)) if node_weights_col is None: node_weights_col = biadjacency.T.dot(np.ones(n_row)) node_sizes_row, node_sizes_col = get_node_sizes_bipartite(node_weights_row, node_weights_col, node_size, node_size_min, node_size_max, display_node_weight) # node widths node_widths_row = get_node_widths(n_row, seeds_row, node_width, node_width_max) node_widths_col = get_node_widths(n_col, seeds_col, node_width, node_width_max) # rescaling if not width and not height: raise ValueError("You must specify either the width or the height of the image.") position, width, height = rescale(position, width, height, margin, node_size, node_size_max, display_node_weight) # node names if names_row is not None: text_length = np.max(np.array([len(str(name)) for name in names_row])) position[:, 0] += text_length * font_size * .5 width += text_length * font_size * .5 if names_col is not None: text_length = np.max(np.array([len(str(name)) for name in names_col])) width += text_length * font_size * .5 # scaling position *= scale height *= scale width *= scale position_row = position[:n_row] position_col = position[n_row:] svg = """<svg width="{}" height="{}" xmlns="http://www.w3.org/2000/svg">\n""".format(width, height) # edges if display_edges: biadjacency_coo = sparse.coo_matrix(biadjacency) if edge_color is None: if names_row is None and names_col is None: edge_color = 'black' else: edge_color = 'gray' edge_colors, edge_order, edge_colors_residual = get_edge_colors(biadjacency, edge_labels, edge_color, label_colors) edge_widths = get_edge_widths(biadjacency_coo, edge_width, edge_width_min, edge_width_max, display_edge_weight) for ix in edge_order: i = biadjacency_coo.row[ix] j = biadjacency_coo.col[ix] color = edge_colors[ix] svg += svg_edge(pos_1=position_row[i], pos_2=position_col[j], edge_width=edge_widths[ix], edge_color=color) for i, j, color in edge_colors_residual: svg += svg_edge(pos_1=position_row[i], pos_2=position_col[j], edge_width=edge_width, edge_color=color) # nodes for i in range(n_row): if membership_row is None: svg += svg_node(position_row[i], node_sizes_row[i], colors_row[i], node_widths_row[i]) else: if membership_row[i].nnz == 1: index = membership_row[i].indices[0] svg += svg_node(position_row[i], node_sizes_row[i], colors_row[index], node_widths_row[i]) else: svg += svg_pie_chart_node(position_row[i], node_sizes_row[i], membership_row[i].todense(), colors_row, node_widths_row[i]) for i in range(n_col): if membership_col is None: svg += 
svg_node(position_col[i], node_sizes_col[i], colors_col[i], node_widths_col[i]) else: if membership_col[i].nnz == 1: index = membership_col[i].indices[0] svg += svg_node(position_col[i], node_sizes_col[i], colors_col[index], node_widths_col[i]) else: svg += svg_pie_chart_node(position_col[i], node_sizes_col[i], membership_col[i].todense(), colors_col, node_widths_col[i]) # text if names_row is not None: for i in range(n_row): svg += svg_text(position_row[i] - (margin_text + node_sizes_row[i], 0), names_row[i], font_size, True) if names_col is not None: for i in range(n_col): svg += svg_text(position_col[i] + (margin_text + node_sizes_col[i], 0), names_col[i], font_size) svg += """</svg>\n""" if filename is not None: with open(filename + '.svg', 'w') as f: f.write(svg) return svg
def predict(self, X: csr_matrix) -> np.array: return self.clf.predict(X.toarray())
def weight_matrix(self, dataMatrix: sps.csr_matrix, feature_data): feature_data[feature_data > 1] = np.log(feature_data[feature_data > 1]) dataMatrix.data = dataMatrix.data * feature_data return dataMatrix
def compare_news_vector_with_1(arr, vec: csr_matrix): return 1 - vec._mul_vector(arr)
def compare_news_vector_with_(arr, vec: csr_matrix): return 1 - vec.dot(arr)
def predict(self, X: csr_matrix): X = X.toarray() return self.clf.predict(X)
def _eliminate(matrix: sp.csr_matrix, user_indices, item_indices): matrix = matrix.copy() # `lil_matrix` is too slow matrix[list(user_indices), list(item_indices)] = 0 matrix.eliminate_zeros() return matrix
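# Usage sketch (not from the original source): zero out the (user, item)
# pairs (0, 0) and (1, 1) in a toy interaction matrix.
import numpy as np
from scipy.sparse import csr_matrix

ratings = csr_matrix(np.array([[5, 0, 3],
                               [0, 4, 0]]))
filtered = _eliminate(ratings, user_indices=[0, 1], item_indices=[0, 1])
print(filtered.toarray())
# expected:
# [[0 0 3]
#  [0 0 0]]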
def predict(self, X: csr_matrix): X = X.todense() # TensorFlow/Skflow doesn't support sparse matrices return self.clf.predict(X)
def advanced_subclass_handling(data_frame: pd.DataFrame, URM_train: csr_matrix, path="../../data/", add_subclass=False): """ Here we want to include in the training set sub class information in the following way: - A column encoding the mean of 'label' for a certain couple (user, subclass): i.e. how many items of that subclass the user liked - Including information about the popularity of the subclass (how many items for that subclass - Including ratings of that subclass :param URM_train: mean response will be retrieved from here :param data_frame: dataframe being pre-processed for boosting :param path: path to the folder containing subclass dataframe :return: dataframe with augmented information """ print("Adding subclass and feature engineering subclass...") data_frame = data_frame.copy() df_subclass: pd.DataFrame = pd.read_csv(path + "data_ICM_sub_class.csv") df_subclass = df_subclass[['row', 'col']] df_subclass = df_subclass.rename(columns={"col": "subclass"}) # Merging sub class information data_frame = pd.merge(data_frame, df_subclass, right_on="row", left_on="item_id") data_frame = data_frame.drop(columns=["row"], inplace=False) print("\t- Add items present for each subclass") # Add subclass item-popularity: how many items are present of that subclass subclass_item_count = df_subclass.groupby("subclass").count() data_frame = pd.merge(data_frame, subclass_item_count, right_index=True, left_on="subclass") data_frame = data_frame.rename(columns={"row": "item_per_subclass"}) print("\t- Add ratings popularity for each subclass") # Add subclass ratings-popularity: how many interactions we have for each subclass URM_train_csc = URM_train.tocsc() n_ratings_sub = [] sorted_sub_indices = np.argsort(df_subclass['subclass'].values) sorted_sub = df_subclass['subclass'][sorted_sub_indices].values sorted_item_subclass = df_subclass['row'][sorted_sub_indices].values unique_sorted_sub, sub_indptr = np.unique(sorted_sub, return_index=True) sub_indptr = np.concatenate([sub_indptr, [sorted_sub.size]]) for i, sub in tqdm(enumerate(unique_sorted_sub), total=unique_sorted_sub.size, desc="\t\tProcessing"): item_sub = sorted_item_subclass[sub_indptr[i]:sub_indptr[i + 1]] n_ratings_sub.append(URM_train_csc[:, item_sub].data.size) ratings_sub = np.array([unique_sorted_sub, n_ratings_sub]) ratings_per_sub_df = pd.DataFrame( data=np.transpose(ratings_sub), columns=["subclass", "global_ratings_per_subclass"]) data_frame = pd.merge(data_frame, ratings_per_sub_df, left_on="subclass", right_on="subclass") # Add subclass ratings-popularity for each user using rating percentage print("\t- Add ratings popularity for pairs (user, subclass)") users = data_frame['user_id'].values sub = data_frame['subclass'].values perc_array = np.zeros(users.size) rat_array = np.zeros(users.size) for i, user in tqdm(enumerate(users), total=users.size, desc="\t\tProcessing"): curr_sub = sub[i] curr_sub_index = np.searchsorted(unique_sorted_sub, curr_sub) # Find items of this subclass item_sub = sorted_item_subclass[ sub_indptr[curr_sub_index]:sub_indptr[curr_sub_index + 1]] user_item = URM_train.indices[URM_train.indptr[user]:URM_train. 
indptr[user + 1]] total_user_likes = user_item.size mask = np.in1d(item_sub, user_item) likes_per_sub = item_sub[mask].size user_p = likes_per_sub / total_user_likes perc_array[i] = user_p rat_array[i] = likes_per_sub data_frame["subclass_user_like_perc"] = perc_array data_frame["subclass_user_like_quantity"] = rat_array if not add_subclass: data_frame = data_frame.drop(columns=["subclass"], inplace=False) return data_frame
def weight_matrix(self, dataMatrix: sps.csr_matrix, feature_data): dataMatrix.data = dataMatrix.data * feature_data return dataMatrix
def weight_matrix(self, dataMatrix: sps.csr_matrix, feature_data): dataMatrix.data = dataMatrix.data * (1 / np.log1p(feature_data)) return dataMatrix
def predict_proba(self, X: csr_matrix): return self.clf.predict_proba(X.todense())
def compute_norms(matrix: sparse.csr_matrix) -> np.ndarray: """Computes norms for each row.""" return np.sqrt(matrix.multiply(matrix).sum(axis=1).A).flatten()
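# Usage sketch (not from the original source): row norms of a toy matrix.
import numpy as np
from scipy import sparse

m = sparse.csr_matrix(np.array([[3, 4, 0],
                                [0, 0, 2]]))
print(compute_norms(m))  # expected: [5. 2.]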
def predict_proba(self, X: csr_matrix):
    # densify before delegating to the underlying classifier's probability estimates
    return self.clf.predict_proba(X.toarray())
def unshift_label_matrix(L_sparse: csr_matrix) -> np.ndarray:
    """Unshift a sparse label matrix (ABSTAIN as 0) to a dense one (ABSTAIN as -1)."""
    return L_sparse.toarray() - 1
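# Usage sketch (not from the original source): 0 entries (ABSTAIN) become -1,
# labels 1..k shift down by one.
import numpy as np
from scipy.sparse import csr_matrix

L_sparse = csr_matrix(np.array([[0, 1, 2],
                                [2, 0, 0]]))
print(unshift_label_matrix(L_sparse))
# expected:
# [[-1  0  1]
#  [ 1 -1 -1]]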
def __init__(self, X: csr_matrix, Y: np.array, tune_parameters=False): super().__init__(X, Y, tune_parameters) self.X, self.Y = X.toarray(), Y self.classifier = SVC(decision_function_shape='ovo') self.clf = BaggingClassifier(self.classifier, n_estimators=self.estimators, n_jobs=8, max_samples=self.max_samples, max_features=self.max_features)