def get_S_incremental_and_set_W(self):
    """Fetch the similarity learned by the Cython epoch object and store it as `W_sparse` (CSR).

    When training with sparse weights the incremental matrix is used as-is;
    otherwise it is pruned to the top-K entries per item first.
    """
    self.S_incremental = self.cythonEpoch.get_S()

    if self.train_with_sparse_weights:
        candidate = self.S_incremental
    else:
        candidate = similarityMatrixTopK(self.S_incremental, k=self.topK)

    # CSR is the format expected downstream for scoring
    self.W_sparse = check_matrix(candidate, format="csr")
def fit(self, topK=None, l2_norm=1e3, normalize_matrix=False, verbose=True):
    """Fit a closed-form item-similarity model (EASE-style ridge regression).

    :param topK: if not None, keep only the top-K entries per column of the learned matrix
    :param l2_norm: L2 regularization added to the diagonal of the Gram matrix
    :param normalize_matrix: if True, L2-normalize URM rows then columns before fitting
    :param verbose: controls logging via self._print
    """
    self.verbose = verbose

    start_time = time.time()
    self._print("Fitting model... ")

    if normalize_matrix:
        # Normalize rows and then columns
        self.URM_train = normalize(self.URM_train, norm="l2", axis=1)
        self.URM_train = normalize(self.URM_train, norm="l2", axis=0)
        self.URM_train = sps.csr_matrix(self.URM_train)

    # Grahm matrix is X X^t, compute dot product
    # (topK = n_items so no pruning happens at this stage)
    similarity = Compute_Similarity(
        self.URM_train,
        shrink=0,
        topK=self.URM_train.shape[1],
        normalize=False,
        similarity="cosine",
    )
    grahm_matrix = similarity.compute_similarity().toarray()

    # Ridge regularization: add l2_norm on the diagonal before inverting
    diag_indices = np.diag_indices(grahm_matrix.shape[0])
    grahm_matrix[diag_indices] += l2_norm

    P = np.linalg.inv(grahm_matrix)

    # Closed-form solution: B = -P / diag(P), with zero diagonal
    B = P / (-np.diag(P))
    B[diag_indices] = 0.0

    new_time_value, new_time_unit = seconds_to_biggest_unit(time.time() - start_time)
    self._print("Fitting model... done in {:.2f} {}".format(new_time_value, new_time_unit))

    # Check if the matrix should be saved in a sparse or dense format
    # The matrix is sparse, regardless of the presence of the topK, if nonzero
    # cells are less than sparse_threshold_quota %
    if topK is not None:
        B = similarityMatrixTopK(B, k=topK, verbose=False)

    if self._is_content_sparse_check(B):
        self._print("Detected model matrix to be sparse, changing format.")
        self.W_sparse = check_matrix(B, format="csr", dtype=np.float32)
    else:
        # Dense model: keep as ndarray and switch the scoring function accordingly.
        # NOTE(review): nesting reconstructed from collapsed source — the dense
        # branch appears to own the scorer switch; confirm against the original file.
        self.W_sparse = check_matrix(B, format="npy", dtype=np.float32)
        self._W_sparse_format_checked = True
        self._compute_item_score = self._compute_score_W_dense
def __init__(self, URM_train, Similarity_1, Similarity_2):
    """Linear combination of two item similarity matrices of identical shape."""
    super(ItemKNNSimilarityHybridRecommender, self).__init__(URM_train)

    shape_1, shape_2 = Similarity_1.shape, Similarity_2.shape
    if shape_1 != shape_2:
        raise ValueError(
            "ItemKNNSimilarityHybridRecommender: similarities have different size, S1 is {}, S2 is {}".format(
                shape_1, shape_2))

    # Store private CSR copies: CSR is faster during evaluation
    self.Similarity_1 = check_matrix(Similarity_1.copy(), 'csr')
    self.Similarity_2 = check_matrix(Similarity_2.copy(), 'csr')
def __init__(self, URM_train, Similarity_1, Similarity_2, sparse_weights=True):
    """Hybrid of two same-shaped item similarities.

    `sparse_weights` is accepted for interface compatibility but not used here.
    """
    super(ItemKNNSimilarityHybridRecommender, self).__init__(URM_train)

    if Similarity_1.shape != Similarity_2.shape:
        message = (
            "ItemKNNSimilarityHybridRecommender: similarities have different size, "
            "S1 is {}, S2 is {}".format(Similarity_1.shape, Similarity_2.shape)
        )
        raise ValueError(message)

    # keep defensive CSR copies so callers' matrices are never mutated
    self.Similarity_1 = check_matrix(Similarity_1.copy(), "csr")
    self.Similarity_2 = check_matrix(Similarity_2.copy(), "csr")
def __init__(self, urm_train, Similarity_1, Similarity_2, verbose=True):
    """Hybrid of two user similarity matrices; shapes must match."""
    super(UserSimilarityHybridRecommender, self).__init__(urm_train, verbose=verbose)

    shape_1, shape_2 = Similarity_1.shape, Similarity_2.shape
    if shape_1 != shape_2:
        raise ValueError(
            "UserSimilarityHybridRecommender: similarities have different size, S1 is {}, S2 is {}".format(
                shape_1, shape_2))

    # Private CSR copies: CSR is faster during evaluation
    self.Similarity_1 = check_matrix(Similarity_1.copy(), 'csr')
    self.Similarity_2 = check_matrix(Similarity_2.copy(), 'csr')
def fit(self, lambda_user=10, lambda_item=25):
    """Fit a global-effects bias model: global mean, regularized item bias, regularized user bias.

    :param lambda_user: shrinkage term added to each user's interaction count
    :param lambda_item: shrinkage term added to each item's interaction count
    """
    self.lambda_user = lambda_user
    self.lambda_item = lambda_item
    self.n_items = self.URM_train.shape[1]

    # convert to csc matrix for faster column-wise sum
    self.URM_train = check_matrix(self.URM_train, "csc", dtype=np.float32)

    # 1) global average over all stored ratings
    self.mu = (
        self.URM_train.data.sum(dtype=np.float32) / self.URM_train.data.shape[0]
    )

    # 2) item average bias
    # compute the number of non-zero elements for each column
    col_nnz = np.diff(self.URM_train.indptr)
    # it is equivalent to:
    # col_nnz = X.indptr[1:] - X.indptr[:-1]
    # and it is **much faster** than
    # col_nnz = (X != 0).sum(axis=0)

    URM_train_unbiased = self.URM_train.copy()
    URM_train_unbiased.data -= self.mu
    # shrunk column mean: sum of de-meaned ratings / (count + lambda_item)
    self.item_bias = URM_train_unbiased.sum(axis=0) / (col_nnz + self.lambda_item)
    self.item_bias = np.asarray(
        self.item_bias
    ).ravel()  # converts 2-d matrix to 1-d array without any copy

    # 3) user average bias
    # NOTE: the user bias is *useless* for the sake of ranking items. We just show it here for educational purposes.
    # first subtract the item biases from each column
    # then repeat each element of the item bias vector a number of times equal to col_nnz
    # and subtract it from the data vector
    URM_train_unbiased.data -= np.repeat(self.item_bias, col_nnz)

    # now convert the csc matrix to csr for efficient row-wise computation
    URM_train_unbiased_csr = URM_train_unbiased.tocsr()
    row_nnz = np.diff(URM_train_unbiased_csr.indptr)
    # finally, let's compute the bias
    self.user_bias = URM_train_unbiased_csr.sum(axis=1).ravel() / (
        row_nnz + self.lambda_user
    )

    # 4) precompute the item ranking by using the item bias only
    # the global average and user bias won't change the ranking, so there is no need to use them
    # self.item_ranking = np.argsort(self.bi)[::-1]

    # restore CSR for evaluation-time row access
    self.URM_train = check_matrix(self.URM_train, "csr", dtype=np.float32)
def __init__(self, URM_train, UCM_train, ICM_train, verbose=True):
    """Wrapper recommender holding URM plus user/item content matrices and the
    derived interaction/user/item graphs used by the underlying HERS model.

    :param URM_train: user-item interaction matrix
    :param UCM_train: user content matrix (n_users x n_user_features)
    :param ICM_train: item content matrix (n_items x n_item_features)
    """
    super(HERSWrapper, self).__init__(URM_train, verbose=verbose)

    # Validate that the content matrices are aligned with the URM dimensions
    assert self.n_users == UCM_train.shape[
        0], "{}: URM_train has {} users but UCM_train has {}".format(
            self.RECOMMENDER_NAME, self.n_users, UCM_train.shape[0])

    self.UCM_train = check_matrix(UCM_train.copy(), 'csr', dtype=np.float32)
    self.UCM_train.eliminate_zeros()

    # Users with an empty UCM row have no content features
    self._cold_user_CBF_mask = np.ediff1d(self.UCM_train.indptr) == 0

    if self._cold_user_CBF_mask.any():
        print("{}: UCM Detected {} ({:.2f} %) cold users.".format(
            self.RECOMMENDER_NAME, self._cold_user_CBF_mask.sum(),
            self._cold_user_CBF_mask.sum() / self.n_users * 100))

    assert self.n_items == ICM_train.shape[
        0], "{}: URM_train has {} items but ICM_train has {}".format(
            self.RECOMMENDER_NAME, self.n_items, ICM_train.shape[0])

    self.ICM_train = check_matrix(ICM_train.copy(), 'csr', dtype=np.float32)
    self.ICM_train.eliminate_zeros()

    # Items with an empty ICM row have no content features
    self._cold_item_CBF_mask = np.ediff1d(self.ICM_train.indptr) == 0

    if self._cold_item_CBF_mask.any():
        print("{}: ICM Detected {} ({:.2f} %) items with no features.".
              format(self.RECOMMENDER_NAME, self._cold_item_CBF_mask.sum(),
                     self._cold_item_CBF_mask.sum() / self.n_items * 100))

    # Edge list of (user, item) interaction pairs, shape (nnz, 2)
    self.G_ui = np.swapaxes(
        np.asarray(self.URM_train.nonzero(), dtype=np.int32), 0, 1)

    # NOTE(review): nx.convert_matrix.from_scipy_sparse_matrix was removed in
    # networkx 3.0 (replaced by from_scipy_sparse_array) — confirm the pinned
    # networkx version supports it.
    self.G_user = nx.convert_matrix.from_scipy_sparse_matrix(
        self.UCM_train, create_using=nx.DiGraph())
    self.G_user = self.G_user.to_undirected()
    # drop nodes with no edges (users without features / features without users)
    self.G_user.remove_nodes_from(list(nx.isolates(self.G_user)))

    self.G_item = nx.convert_matrix.from_scipy_sparse_matrix(
        self.ICM_train, create_using=nx.DiGraph())
    self.G_item = self.G_item.to_undirected()
    self.G_item.remove_nodes_from(list(nx.isolates(self.G_item)))

    # This is used in _compute_item_score
    self._item_indices = np.arange(0, self.n_items, dtype=np.int32)
def fit(self, l1_ratio=0.1, positive_only=True, topK=100,
        workers=multiprocessing.cpu_count(), max_iter=100, alpha=0.01):
    """Fit one ElasticNet regression per item, in parallel, and assemble the
    resulting coefficients into the sparse item-item weight matrix `W_sparse`.

    :param l1_ratio: ElasticNet mixing parameter, must be in [0, 1]
    :param positive_only: if True, constrain coefficients to be non-negative
    :param topK: number of coefficients kept per item
    :param workers: number of worker processes
    :param max_iter: maximum ElasticNet iterations
    :param alpha: ElasticNet regularization strength
    """
    assert l1_ratio >= 0 and l1_ratio <= 1, \
        "SLIM_ElasticNet: l1_ratio must be between 0 and 1, provided value was {}".format(l1_ratio)

    self.l1_ratio = l1_ratio
    self.positive_only = positive_only
    self.topK = topK
    self.max_iter = max_iter
    self.alpha = alpha
    self.workers = workers

    # CSC gives fast column slicing for the per-item regressions
    self.URM_train = check_matrix(self.URM_train, 'csc', dtype=np.float32)
    n_items = self.URM_train.shape[1]

    # fit item's factors in parallel:
    # partial object binding the fixed part of the input (URM, hyperparameters)
    _pfit = partial(self._partial_fit,
                    X=self.URM_train,
                    topK=self.topK,
                    alpha=self.alpha,
                    max_iter=self.max_iter)

    # FIX: the pool was previously never closed/joined, leaking worker
    # processes; the context manager terminates workers once map() returns.
    with Pool(processes=self.workers) as pool:
        # map over the remaining variable parameter, the item index
        res = pool.map(_pfit, np.arange(n_items))

    # res contains a vector of (values, rows, cols) tuples
    values, rows, cols = [], [], []
    for values_, rows_, cols_ in res:
        values.extend(values_)
        rows.extend(rows_)
        cols.extend(cols_)

    # generate the sparse weight matrix
    self.W_sparse = sps.csr_matrix((values, (rows, cols)),
                                   shape=(n_items, n_items),
                                   dtype=np.float32)

    # restore CSR for evaluation-time row access
    self.URM_train = check_matrix(self.URM_train, 'csr', dtype=np.float32)
def __init__(self, URM_train, ICM_train):
    """Hybrid combining an item-based CF recommender and an item CBF recommender."""
    super(ScoresHybridKNNCFKNNCBF, self).__init__(URM_train)

    # CSR copy so the hybrid never mutates the caller's matrix
    self.URM_train = check_matrix(URM_train.copy(), 'csr')
    self.ICM_train = ICM_train

    # underlying recommenders; fitted later by the caller
    self.itemKNNCF = ItemKNNCFRecommender.ItemKNNCFRecommender(URM_train)
    self.itemKNNCBF = ItemKNNCBFRecommender.ItemKNNCBFRecommender(URM_train, ICM_train)
def __init__(self, URM_train):
    """PureSVD recommender: stores the URM and wires the SVD-based scorer."""
    super(PureSVDRecommender, self).__init__()

    # CSR is faster during evaluation
    self.URM_train = check_matrix(URM_train, 'csr')
    self.compute_item_score = self.compute_score_SVD
def __init__(self, urm_train, eurm=False):
    """Hybrid of an item-CBF and a user-CBF recommender, both pre-fitted here
    with tuned hyperparameters.

    :param eurm: flag stored for use elsewhere in the class
    """
    super(HybridGenRecommender, self).__init__(urm_train)

    self.data_folder = Path(__file__).parent.parent.absolute()
    self.eurm = eurm
    self.num_users = urm_train.shape[0]

    data = DataManager()
    urm_train = check_matrix(urm_train.copy(), 'csr')

    # content matrices for items and users
    icm_price, icm_asset, icm_sub, icm_all = data.get_icm()
    ucm_age, ucm_region, ucm_all = data.get_ucm()

    item_cbf = ItemKNNCBFRecommender(urm_train, icm_all)
    item_cbf.fit(shrink=40, topK=20, feature_weighting='BM25')

    user_cbf = UserKNNCBFRecommender(urm_train, ucm_all)
    user_cbf.fit(shrink=1777, topK=1998, similarity='tversky',
                 feature_weighting='BM25',
                 tversky_alpha=0.1604953616,
                 tversky_beta=0.9862348646)

    self.recommender_1 = item_cbf
    self.recommender_2 = user_cbf
def __init__(self, URM_train):
    """Base recommender: stores a clean CSR copy of the URM, initializes the
    filtering flags, and detects cold users/items (rows/columns with no
    interactions).
    """
    super(BaseRecommender, self).__init__()

    self.URM_train = check_matrix(URM_train.copy(), 'csr', dtype=np.float32)
    self.URM_train.eliminate_zeros()

    self.n_users, self.n_items = self.URM_train.shape
    self.normalize = False

    self.filterTopPop = False
    # FIX: np.int was deprecated in NumPy 1.20 and removed in 1.24; the
    # builtin int is the documented replacement (same resulting dtype).
    self.filterTopPop_ItemsID = np.array([], dtype=int)

    self.items_to_ignore_flag = False
    self.items_to_ignore_ID = np.array([], dtype=int)

    # users with an empty URM row have no training interactions
    self._cold_user_mask = np.ediff1d(self.URM_train.indptr) == 0

    if self._cold_user_mask.any():
        print("{}: Detected {} ({:.2f} %) cold users.".format(
            self.RECOMMENDER_NAME, self._cold_user_mask.sum(),
            self._cold_user_mask.sum() / len(self._cold_user_mask) * 100))

    # items with an empty URM column have no training interactions
    self._cold_item_mask = np.ediff1d(self.URM_train.tocsc().indptr) == 0

    if self._cold_item_mask.any():
        print("{}: Detected {} ({:.2f} %) cold items.".format(
            self.RECOMMENDER_NAME, self._cold_item_mask.sum(),
            self._cold_item_mask.sum() / len(self._cold_item_mask) * 100))
def remove_empty_rows_and_cols(URM, ICM=None):
    """Drop URM rows (users) and columns (items) that contain no interactions.

    :param URM: sparse user-item matrix
    :param ICM: optional item content matrix, filtered with the same item mask
    :return: (URM, [ICM,] removedUsers, removedItems) where the removed arrays
             hold the original indices of the dropped rows/columns
    """
    URM = check_matrix(URM, "csr")

    # keep users with at least one interaction
    user_mask = np.ediff1d(URM.indptr) >= 1
    URM = URM[user_mask, :]

    # keep items with at least one interaction (counted on the filtered URM)
    item_mask = np.ediff1d(URM.tocsc().indptr) >= 1
    URM = URM[:, item_mask]

    removedUsers = np.arange(0, len(user_mask))[np.logical_not(user_mask)]
    removedItems = np.arange(0, len(item_mask))[np.logical_not(item_mask)]

    if ICM is not None:
        ICM = ICM[item_mask, :]
        return URM.tocsr(), ICM.tocsr(), removedUsers, removedItems

    return URM.tocsr(), removedUsers, removedItems
def __init__(self, URM_train, Recommender_1, Recommender_2, Recommender_3):
    """Score-level hybrid of three already-constructed recommenders."""
    super(ScoresHybrid3Recommender, self).__init__(URM_train)

    # private CSR copy of the interaction matrix
    self.URM_train = check_matrix(URM_train.copy(), 'csr')

    self.Recommender_1 = Recommender_1
    self.Recommender_2 = Recommender_2
    self.Recommender_3 = Recommender_3
def __init__(self, URM_train, Recommender_1, Recommender_2):
    """Normalized score-level hybrid of two already-constructed recommenders."""
    super(ItemKNNScoresHybridRecommender_Normalized, self).__init__(URM_train)

    # private CSR copy of the interaction matrix
    self.URM_train = check_matrix(URM_train.copy(), 'csr')

    self.Recommender_1 = Recommender_1
    self.Recommender_2 = Recommender_2
def fit(self, URM_train, topK=500, alpha=1., min_rating=0, implicit=False,
        normalize_similarity=False, tuning=False, similarity_path=SIMILARITY_PATH):
    """Fit the model, optionally caching the similarity matrix on disk.

    :param tuning: if True, reuse a previously exported similarity matrix when
                   one exists at `cwd + similarity_path`, otherwise fit and export it
    :param similarity_path: path suffix (relative to cwd) of the cached matrix
    """
    self.URM_train = check_matrix(URM_train.copy(), 'csr', dtype=np.float32)
    self.URM_train.eliminate_zeros()

    self.n_users, self.n_items = self.URM_train.shape

    # hyperparameters consumed by run_fit()
    self.topK = topK
    self.alpha = alpha
    self.min_rating = min_rating
    self.implicit = implicit
    self.normalize_similarity = normalize_similarity

    if tuning:
        if not os.path.exists(os.getcwd() + similarity_path):
            self.run_fit()
            # NOTE(review): RECOMMENDER_NAME is referenced as a bare name here,
            # not self.RECOMMENDER_NAME — presumably a module-level constant;
            # confirm, otherwise this raises NameError when the cache is cold.
            self.helper.export_similarity_matrix(os.getcwd() + similarity_path,
                                                 self.W_sparse,
                                                 name=RECOMMENDER_NAME)
        # always reload from the cache so the in-memory matrix matches disk
        self.W_sparse = self.helper.import_similarity_matrix(
            os.getcwd() + similarity_path)
    else:
        self.run_fit()

    # precompute user-item scores as URM · W
    self.similarityProduct = self.URM_train.dot(self.W_sparse)
def __init__(self, URM_train, ICM, recommender_list, d_weights=None, dynamic=False, weights=None, URM_validation=None, sparse_weights=True): super(Recommender, self).__init__() # CSR is faster during evaluation self.URM_train = check_matrix(URM_train, 'csr') self.URM_validation = URM_validation self.dynamic = dynamic self.dataset = None self.d_weights = d_weights self.sparse_weights = sparse_weights self.recommender_list = [] self.weights = weights for recommender in recommender_list: if recommender in [ SLIM_BPR_Cython, MatrixFactorization_BPR_Cython ]: print("class recognized") self.recommender_list.append( recommender(URM_train, URM_validation=URM_validation)) elif recommender is ItemKNNCBFRecommender: self.recommender_list.append(recommender(ICM, URM_train)) else: self.recommender_list.append(recommender(URM_train))
def fit(self, similarities, weights=None, topK=100, normalize_weights=True):
    """Combine several similarity matrices into one weighted sum, then keep
    the top-K entries per item.

    :param similarities: non-empty list of sparse similarity matrices, all same shape
    :param weights: one weight per similarity; defaults to all ones
    :param topK: entries kept per item in the combined matrix
    :param normalize_weights: if True, divide all weights by the largest one
    """
    # Initialize weights array if not already initialized
    if weights is None:
        weights = np.array([1 for _ in similarities])

    # Checking the input parameters are well formatted
    assert len(similarities) == len(weights)
    assert len(similarities) > 0

    # FIX: np.float was deprecated in NumPy 1.20 and removed in 1.24; the
    # builtin float is the documented replacement (same float64 dtype).
    weights = np.array(weights, dtype=float)

    # Normalize the weights
    if normalize_weights:
        weights /= weights.max()

    # Create a list of pairs (similarity, weight)
    similarity_and_weight = zip(similarities, weights)

    # Initialize the result
    W_sparse = sps.csr_matrix(similarities[0].shape, dtype=float)

    # Compute the new Similarity matrix
    for similarity, weight in similarity_and_weight:
        W_sparse += (similarity * weight)

    self.W_sparse = similarityMatrixTopK(W_sparse, k=topK)
    self.W_sparse = check_matrix(self.W_sparse, format='csr')
def __init__(self, urm_train):
    """Hybrid of three pre-fitted recommenders (RP3beta, ItemKNN-CF, UserKNN-CF)
    with tuned hyperparameters.
    """
    super(HybridNorm3Recommender, self).__init__(urm_train)

    urm_train = check_matrix(urm_train.copy(), 'csr')
    self.num_users = urm_train.shape[0]

    rp3 = RP3betaRecommender(urm_train)
    rp3.fit(topK=16,
            alpha=0.03374950051351756,
            beta=0.24087176329409027,
            normalize_similarity=True)

    user_cf = UserKNNCFRecommender(urm_train)
    user_cf.fit(shrink=2, topK=600, normalize=True)

    item_cf = ItemKNNCFRecommender(urm_train)
    item_cf.fit(topK=5, shrink=500, feature_weighting='BM25',
                similarity='tversky', normalize=False,
                tversky_alpha=0.0, tversky_beta=1.0)

    self.recommender_1 = rp3
    self.recommender_2 = item_cf
    self.recommender_3 = user_cf
def applyPearsonCorrelation(self):
    """
    Remove from every data point the average for the corresponding column
    :return:
    """
    self.dataMatrix = check_matrix(self.dataMatrix, 'csc')

    col_counts = np.diff(self.dataMatrix.indptr)
    has_data = col_counts > 0
    col_sums = np.asarray(self.dataMatrix.sum(axis=0)).ravel()

    # mean of each non-empty column; empty columns keep a zero mean
    col_means = np.zeros_like(col_sums)
    col_means[has_data] = col_sums[has_data] / col_counts[has_data]

    # Walk the columns in fixed-size blocks so the data array is never duplicated
    blockSize = 1000
    start_col = 0

    while start_col < self.n_columns:
        end_col = min(self.n_columns, start_col + blockSize)

        lo = self.dataMatrix.indptr[start_col]
        hi = self.dataMatrix.indptr[end_col]
        self.dataMatrix.data[lo:hi] -= np.repeat(col_means[start_col:end_col],
                                                 col_counts[start_col:end_col])

        start_col = end_col
def __init__(self, URM_train):
    """GlobalEffects recommender: stores a CSC URM (fast column sums) and
    wires the global-effects scorer."""
    super(GlobalEffects, self).__init__()

    self.URM_train = check_matrix(URM_train, 'csc', dtype=np.float32)
    self.compute_item_score = self.compute_score_global_effects
def applyAdjustedCosine(self):
    """
    Remove from every data point the average for the corresponding row
    :return:
    """
    self.dataMatrix = check_matrix(self.dataMatrix, 'csr')

    row_counts = np.diff(self.dataMatrix.indptr)
    has_data = row_counts > 0
    row_sums = np.asarray(self.dataMatrix.sum(axis=1)).ravel()

    # mean of each non-empty row; empty rows keep a zero mean
    row_means = np.zeros_like(row_sums)
    row_means[has_data] = row_sums[has_data] / row_counts[has_data]

    # Walk the rows in fixed-size blocks so the data array is never duplicated
    blockSize = 1000
    start_row = 0

    while start_row < self.n_rows:
        end_row = min(self.n_rows, start_row + blockSize)

        lo = self.dataMatrix.indptr[start_row]
        hi = self.dataMatrix.indptr[end_row]
        self.dataMatrix.data[lo:hi] -= np.repeat(row_means[start_row:end_row],
                                                 row_counts[start_row:end_row])

        start_row = end_row
def fit(self, topK=50, shrink=100, similarity='cosine', normalize=True,
        feature_weighting="none", **similarity_args):
    """Fit a user-CBF KNN: optionally reweight the UCM, then compute the
    user-user similarity matrix.

    :param feature_weighting: one of self.FEATURE_WEIGHTING_VALUES
    :raises ValueError: on an unrecognized feature_weighting value
    """
    self.topK = topK
    self.shrink = shrink

    if feature_weighting not in self.FEATURE_WEIGHTING_VALUES:
        raise ValueError(
            "Value for 'feature_weighting' not recognized. Acceptable values are {}, provided was '{}'"
            .format(self.FEATURE_WEIGHTING_VALUES, feature_weighting))

    if feature_weighting == "BM25":
        self.UCM_train = okapi_BM_25(self.UCM_train.astype(np.float32))
    elif feature_weighting == "TF-IDF":
        self.UCM_train = TF_IDF(self.UCM_train.astype(np.float32))

    # similarity over UCM columns transposed: users x users
    # (local name chosen so the `similarity` parameter is not shadowed)
    similarity_builder = Compute_Similarity(self.UCM_train.T,
                                            shrink=shrink,
                                            topK=topK,
                                            normalize=normalize,
                                            similarity=similarity,
                                            **similarity_args)

    self.W_sparse = check_matrix(similarity_builder.compute_similarity(), format='csr')
def __init__(self, URM_train):
    """Random recommender: stores a CSR URM and wires the random scorer.

    (The code stores CSR; an earlier comment mentioning CSC was stale.)
    """
    super(Random, self).__init__()

    self.URM_train = check_matrix(URM_train, 'csr', dtype=np.float32)
    self.compute_item_score = self.compute_score_random
def __init__(self, URM_train, verbose=True):
    """Base recommender: stores a clean CSR copy of the URM, initializes
    filtering flags, and reports cold users/items via self._print.

    :param verbose: controls logging through self._print
    """
    super(BaseRecommender, self).__init__()

    self.URM_train = check_matrix(URM_train.copy(), "csr", dtype=np.float32)
    self.URM_train.eliminate_zeros()

    self.n_users, self.n_items = self.URM_train.shape
    self.verbose = verbose

    self.filterTopPop = False
    # FIX: np.int was deprecated in NumPy 1.20 and removed in 1.24; the
    # builtin int is the documented replacement (same resulting dtype).
    self.filterTopPop_ItemsID = np.array([], dtype=int)

    self.items_to_ignore_flag = False
    self.items_to_ignore_ID = np.array([], dtype=int)

    # users with an empty URM row have no training interactions
    self._cold_user_mask = np.ediff1d(self.URM_train.indptr) == 0

    if self._cold_user_mask.any():
        self._print(
            "URM Detected {} ({:.2f} %) cold users.".format(
                self._cold_user_mask.sum(),
                self._cold_user_mask.sum() / self.n_users * 100,
            )
        )

    # items with an empty URM column have no training interactions
    self._cold_item_mask = np.ediff1d(self.URM_train.tocsc().indptr) == 0

    if self._cold_item_mask.any():
        self._print(
            "URM Detected {} ({:.2f} %) cold items.".format(
                self._cold_item_mask.sum(),
                self._cold_item_mask.sum() / self.n_items * 100,
            )
        )
def __init__(self, URM_train):
    """RP3beta graph-based recommender: stores a float32 CSR URM."""
    super(RP3betaRecommender, self).__init__()

    self.URM_train = check_matrix(URM_train, format='csr', dtype=np.float32)
    self.sparse_weights = True
def fit(self, item_weights, URM_train, selectTopK=False): self.URM_train = check_matrix(URM_train, format='csc') # If no topK selection is required, just save the similarity if (not selectTopK): if isinstance(item_weights, np.ndarray): #self.W = item_weights #self.sparse_weights = False self.W_sparse = sps.csr_matrix(item_weights) self.sparse_weights = True else: self.W_sparse = check_matrix(item_weights, format='csr') self.sparse_weights = True return # If matrix is not dense, make it dense to select top K if not isinstance(item_weights, np.ndarray): item_weights = item_weights.toarray() idx_sorted = np.argsort(item_weights, axis=0) # sort by column # for each column, keep only the top-k scored items if not self.sparse_weights: self.W = item_weights.copy() # index of the items that don't belong to the top-k similar items of each column not_top_k = idx_sorted[:-self.k, :] # use numpy fancy indexing to zero-out the values in sim without using a for loop self.W[not_top_k, np.arange(item_weights.shape[1])] = 0.0 else: # iterate over each column and keep only the top-k similar items values, rows, cols = [], [], [] nitems = self.URM_train.shape[1] for i in range(nitems): top_k_idx = idx_sorted[-self.k:, i] values.extend(item_weights[top_k_idx, i]) rows.extend(np.arange(nitems)[top_k_idx]) cols.extend(np.ones(self.k) * i) # During testing CSR is faster self.W_sparse = sps.csr_matrix((values, (rows, cols)), shape=(nitems, nitems), dtype=np.float32)
def _build_confidence_matrix(self, confidence_scaling):
    """Build the confidence matrix C with the requested scaling ('linear' or
    log otherwise) and cache a float32 CSC copy for column access."""
    scaler = (self._linear_scaling_confidence
              if confidence_scaling == 'linear'
              else self._log_scaling_confidence)
    self.C = scaler()

    self.C_csc = check_matrix(self.C.copy(), format="csc", dtype=np.float32)
def __init__(self, URM_train):
    """TopPop recommender: stores a cleaned CSC URM (fast column-wise sums
    for popularity counting) and wires the top-pop scorer."""
    super(TopPop, self).__init__()

    self.URM_train = check_matrix(URM_train, 'csc', dtype=np.float32)
    self.URM_train.eliminate_zeros()

    self.compute_item_score = self.compute_score_top_pop
def __init__(self, URM_train, ICM_train):
    """Hybrid combining a P3alpha graph recommender and an item-CBF recommender."""
    super(ScoresHybridSpecializedV2Mid, self).__init__(URM_train)

    # CSR copy so the hybrid never mutates the caller's matrix
    self.URM_train = check_matrix(URM_train.copy(), 'csr')
    self.ICM_train = ICM_train

    # underlying recommenders; fitted later by the caller
    self.P3alpha = P3alphaRecommender.P3alphaRecommender(URM_train)
    self.itemKNNCBF = ItemKNNCBFRecommender.ItemKNNCBFRecommender(URM_train, ICM_train)