def get_S_incremental_and_set_W(self):

    self.S_incremental = self.cythonEpoch.get_S()

    if self.train_with_sparse_weights:
        self.W_sparse = self.S_incremental
        self.W_sparse = check_matrix(self.W_sparse, format='csr')
    else:
        self.W_sparse = similarityMatrixTopK(self.S_incremental, k=self.topK)
        self.W_sparse = check_matrix(self.W_sparse, format='csr')

def __init__(self, URM_train, verbose=True):

    super(BaseRecommender, self).__init__()

    self.URM_train = check_matrix(URM_train.copy(), 'csr', dtype=np.float32)
    self.URM_train.eliminate_zeros()

    self.n_users, self.n_items = self.URM_train.shape
    self.verbose = verbose

    self.filterTopPop = False
    self.filterTopPop_ItemsID = np.array([], dtype=int)

    self.items_to_ignore_flag = False
    self.items_to_ignore_ID = np.array([], dtype=int)

    self._cold_user_mask = np.ediff1d(self.URM_train.indptr) == 0

    if self._cold_user_mask.any():
        self._print("URM Detected {} ({:.2f} %) cold users.".format(
            self._cold_user_mask.sum(), self._cold_user_mask.sum() / self.n_users * 100))

    self._cold_item_mask = np.ediff1d(self.URM_train.tocsc().indptr) == 0

    if self._cold_item_mask.any():
        self._print("URM Detected {} ({:.2f} %) cold items.".format(
            self._cold_item_mask.sum(), self._cold_item_mask.sum() / self.n_items * 100))

def applyPearsonCorrelation(self):
    """
    Remove from every data point the average for the corresponding column
    :return:
    """

    self.dataMatrix = check_matrix(self.dataMatrix, 'csc')

    interactionsPerCol = np.diff(self.dataMatrix.indptr)

    nonzeroCols = interactionsPerCol > 0
    sumPerCol = np.asarray(self.dataMatrix.sum(axis=0)).ravel()

    colAverage = np.zeros_like(sumPerCol)
    colAverage[nonzeroCols] = sumPerCol[nonzeroCols] / interactionsPerCol[nonzeroCols]

    # Split in blocks to avoid duplicating the whole data structure
    start_col = 0
    end_col = 0
    blockSize = 1000

    while end_col < self.n_columns:

        end_col = min(self.n_columns, end_col + blockSize)

        self.dataMatrix.data[self.dataMatrix.indptr[start_col]:self.dataMatrix.indptr[end_col]] -= \
            np.repeat(colAverage[start_col:end_col], interactionsPerCol[start_col:end_col])

        start_col += blockSize

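# A minimal sketch of the column-centering trick used above (illustrative only, the
# variable names and toy matrix are assumptions): in CSC format the entries of each
# column are contiguous in .data, so the per-column average can be subtracted with a
# single np.repeat instead of a loop over columns.
import numpy as np
import scipy.sparse as sps

X = sps.csc_matrix(np.array([[1.0, 0.0],
                             [3.0, 2.0]]))
interactions_per_col = np.diff(X.indptr)          # [2, 1] interactions per column
col_average = np.asarray(X.sum(axis=0)).ravel() / np.maximum(interactions_per_col, 1)
X.data -= np.repeat(col_average, interactions_per_col)
print(X.toarray())                                # each column is now centered on its own mean
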
def applyAdjustedCosine(self):
    """
    Remove from every data point the average for the corresponding row
    :return:
    """

    self.dataMatrix = check_matrix(self.dataMatrix, 'csr')

    interactionsPerRow = np.diff(self.dataMatrix.indptr)

    nonzeroRows = interactionsPerRow > 0
    sumPerRow = np.asarray(self.dataMatrix.sum(axis=1)).ravel()

    rowAverage = np.zeros_like(sumPerRow)
    rowAverage[nonzeroRows] = sumPerRow[nonzeroRows] / interactionsPerRow[nonzeroRows]

    # Split in blocks to avoid duplicating the whole data structure
    start_row = 0
    end_row = 0
    blockSize = 1000

    while end_row < self.n_rows:

        end_row = min(self.n_rows, end_row + blockSize)

        self.dataMatrix.data[self.dataMatrix.indptr[start_row]:self.dataMatrix.indptr[end_row]] -= \
            np.repeat(rowAverage[start_row:end_row], interactionsPerRow[start_row:end_row])

        start_row += blockSize

def fit(self, lambda_user=10, lambda_item=25):

    self.lambda_user = lambda_user
    self.lambda_item = lambda_item
    self.n_items = self.URM_train.shape[1]

    # convert to csc matrix for faster column-wise sum
    self.URM_train = check_matrix(self.URM_train, 'csc', dtype=np.float32)

    # 1) global average
    self.mu = self.URM_train.data.sum(dtype=np.float32) / self.URM_train.data.shape[0]

    # 2) item average bias
    # compute the number of non-zero elements for each column
    col_nnz = np.diff(self.URM_train.indptr)

    # it is equivalent to:
    # col_nnz = X.indptr[1:] - X.indptr[:-1]
    # and it is **much faster** than
    # col_nnz = (X != 0).sum(axis=0)

    URM_train_unbiased = self.URM_train.copy()
    URM_train_unbiased.data -= self.mu
    self.item_bias = URM_train_unbiased.sum(axis=0) / (col_nnz + self.lambda_item)
    self.item_bias = np.asarray(self.item_bias).ravel()  # converts 2-d matrix to 1-d array without any copy

    # 3) user average bias
    # NOTE: the user bias is *useless* for the sake of ranking items. We just show it here for educational purposes.

    # first subtract the item biases from each column:
    # repeat each element of the item bias vector a number of times equal to col_nnz
    # and subtract it from the data vector
    URM_train_unbiased.data -= np.repeat(self.item_bias, col_nnz)

    # now convert the csc matrix to csr for efficient row-wise computation
    URM_train_unbiased_csr = URM_train_unbiased.tocsr()
    row_nnz = np.diff(URM_train_unbiased_csr.indptr)

    # finally, let's compute the bias
    self.user_bias = URM_train_unbiased_csr.sum(axis=1).ravel() / (row_nnz + self.lambda_user)

    # 4) precompute the item ranking by using the item bias only
    # the global average and user bias won't change the ranking, so there is no need to use them
    # self.item_ranking = np.argsort(self.bi)[::-1]

    self.URM_train = check_matrix(self.URM_train, 'csr', dtype=np.float32)

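# A minimal usage sketch of the regularized item bias computed above (illustrative
# only, not part of the original class; toy_URM and its values are assumptions):
# b_i = sum_u (r_ui - mu) / (N_i + lambda_item), where N_i is the number of ratings of item i.
import numpy as np
import scipy.sparse as sps

toy_URM = sps.csc_matrix(np.array([[5.0, 0.0],
                                   [3.0, 4.0],
                                   [0.0, 2.0]], dtype=np.float32))
lambda_item = 25

mu = toy_URM.data.sum() / toy_URM.data.shape[0]            # global average, here 3.5
col_nnz = np.diff(toy_URM.indptr)                           # ratings per item, here [2, 2]
unbiased = toy_URM.copy()
unbiased.data -= mu
item_bias = np.asarray(unbiased.sum(axis=0)).ravel() / (col_nnz + lambda_item)
print(mu, item_bias)                                        # strong shrinkage: biases close to zero
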
def fit(self, l1_ratio=0.1, positive_only=True, topK=100, workers=multiprocessing.cpu_count(),
        # ElasticNet parameters
        alpha=1.0, fit_intercept=False, selection='random', max_iter=100, tol=1e-4,
        random_state=None, bm_25_norm=False):

    assert 0 <= l1_ratio <= 1, \
        "SLIM_ElasticNet: l1_ratio must be between 0 and 1, provided value was {}".format(l1_ratio)

    self.l1_ratio = l1_ratio
    self.positive_only = positive_only
    self.topK = topK

    self.alpha = alpha
    self.fit_intercept = fit_intercept
    self.selection = selection
    self.max_iter = max_iter
    self.tol = tol
    self.random_state = random_state

    self.workers = workers

    if bm_25_norm:
        self.URM_train = Helper().bm25_normalization(self.URM_train)

    self.URM_train = check_matrix(self.URM_train, 'csc', dtype=np.float32)
    n_items = self.URM_train.shape[1]

    # fit the item factors in parallel

    # partial object that pre-binds the fixed part of the input of _partial_fit
    _pfit = partial(self._partial_fit, X=self.URM_train, topK=self.topK)

    # create a pool with the requested number of worker processes
    pool = Pool(processes=self.workers)

    # run the pool, passing the partially-applied function and the remaining, variable parameter
    print("Starting parallelized fit...")
    res = pool.map(_pfit, np.arange(n_items))
    pool.close()
    print("Done!")

    # res contains a vector of (values, rows, cols) tuples
    values, rows, cols = [], [], []
    for values_, rows_, cols_ in res:
        values.extend(values_)
        rows.extend(rows_)
        cols.extend(cols_)

    # generate the sparse weight matrix
    print("Now generating W matrix...")
    self.W_sparse = sps.csr_matrix((values, (rows, cols)), shape=(n_items, n_items), dtype=np.float32)
    print("Done!")

def __init__(self, URM_recommendations_items):

    super(PredefinedListRecommender, self).__init__()

    # convert to csr matrix for fast row-wise access to each user's precomputed recommendation list
    self.URM_recommendations = check_matrix(URM_recommendations_items, 'csr', dtype=int)

    self.URM_train = sps.csr_matrix(self.URM_recommendations.shape)

def _remove_seen_on_scores(self, user_id, scores):

    URM_train = check_matrix(self.URM_train, 'csr', dtype=np.float32)

    assert URM_train.getformat() == "csr", \
        "Recommender_Base_Class: URM_train is not CSR, this will cause errors in filtering seen items"

    seen = URM_train.indices[URM_train.indptr[user_id]:URM_train.indptr[user_id + 1]]

    scores[seen] = -np.inf
    return scores

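# Tiny illustration of the masking above (toy arrays and names are assumptions):
# items already in the user profile get a score of -inf, so any later top-K
# selection over the scores ranks them last.
import numpy as np

scores = np.array([0.3, 0.9, 0.5])
seen = np.array([1])            # item indices already interacted with
scores[seen] = -np.inf
print(np.argsort(-scores))      # [2 0 1] -> the seen item 1 ends up at the bottom
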
def _build_confidence_matrix(self, confidence_scaling):

    if confidence_scaling == 'linear':
        self.C = self._linear_scaling_confidence()
    else:
        self.C = self._log_scaling_confidence()

    self.C_csc = check_matrix(self.C.copy(), format="csc", dtype=np.float32)

def fit(self, W_sparse, selectTopK=False, topK=100):

    assert W_sparse.shape[0] == W_sparse.shape[1], \
        "ItemKNNCustomSimilarityRecommender: W_sparse matrix is not square. Current shape is {}".format(W_sparse.shape)

    assert self.URM_train.shape[1] == W_sparse.shape[0], \
        "ItemKNNCustomSimilarityRecommender: URM_train and W_sparse matrices are not consistent. " \
        "The number of columns in URM_train must be equal to the rows in W_sparse. " \
        "Current shapes are: URM_train {}, W_sparse {}".format(self.URM_train.shape, W_sparse.shape)

    if selectTopK:
        W_sparse = similarityMatrixTopK(W_sparse, k=topK)

    self.W_sparse = check_matrix(W_sparse, format='csr')

def set_URM_train(self, URM_train_new, **kwargs):

    assert self.URM_train.shape == URM_train_new.shape, \
        "{}: set_URM_train old and new URM train have different shapes".format(self.RECOMMENDER_NAME)

    if len(kwargs) > 0:
        print("{}: set_URM_train keyword arguments not supported for this recommender class. Received: {}".format(
            self.RECOMMENDER_NAME, kwargs))

    self.URM_train = check_matrix(URM_train_new.copy(), 'csr', dtype=np.float32)
    self.URM_train.eliminate_zeros()

    self._cold_user_mask = np.ediff1d(self.URM_train.indptr) == 0

    if self._cold_user_mask.any():
        print("{}: Detected {} ({:.2f} %) cold users.".format(
            self.RECOMMENDER_NAME, self._cold_user_mask.sum(),
            self._cold_user_mask.sum() / len(self._cold_user_mask) * 100))

def fit(self, l1_ratio=0.1, positive_only=True, topK=100, workers=multiprocessing.cpu_count()):

    assert 0 <= l1_ratio <= 1, \
        "SLIM_ElasticNet: l1_ratio must be between 0 and 1, provided value was {}".format(l1_ratio)

    self.l1_ratio = l1_ratio
    self.positive_only = positive_only
    self.topK = topK
    self.workers = workers

    self.URM_train = check_matrix(self.URM_train, 'csc', dtype=np.float32)
    n_items = self.URM_train.shape[1]

    # fit the item factors in parallel

    # partial object that pre-binds the fixed part of the input of _partial_fit
    _pfit = partial(self._partial_fit, X=self.URM_train, topK=self.topK)

    # create a pool with the requested number of worker processes
    pool = Pool(processes=self.workers)

    # run the pool, passing the partially-applied function and the remaining, variable parameter
    res = pool.map(_pfit, np.arange(n_items))

    # res contains a vector of (values, rows, cols) tuples
    values, rows, cols = [], [], []
    for values_, rows_, cols_ in res:
        values.extend(values_)
        rows.extend(rows_)
        cols.extend(cols_)

    # generate the sparse weight matrix
    self.W_sparse = sps.csr_matrix((values, (rows, cols)), shape=(n_items, n_items), dtype=np.float32)

def _check_format(self):

    if not self._URM_train_format_checked:

        if self.URM_train.getformat() != "csr":
            self._print("PERFORMANCE ALERT compute_item_score: {} is not {}, "
                        "this will significantly slow down the computation.".format("URM_train", "csr"))

        self.URM_train = check_matrix(self.URM_train, 'csr', dtype=np.float32)
        self._URM_train_format_checked = True

    if not self._W_sparse_format_checked:

        if self.W_sparse.getformat() != "csr":
            self._print("PERFORMANCE ALERT compute_item_score: {} is not {}, "
                        "this will significantly slow down the computation.".format("W_sparse", "csr"))

        self._W_sparse_format_checked = True

def _linear_scaling_confidence(self):

    C = check_matrix(self.URM_train, format="csr", dtype=np.float32)
    C.data = 1.0 + self.alpha * C.data

    return C

def compute_similarity(self, start_col=None, end_col=None, block_size=100):
    """
    Compute the similarity for the given dataset
    :param self:
    :param start_col: column to begin with
    :param end_col: column to stop before, end_col is excluded
    :return:
    """

    values = []
    rows = []
    cols = []

    start_time = time.time()
    start_time_print_batch = start_time
    processedItems = 0

    if self.adjusted_cosine:
        self.applyAdjustedCosine()
    elif self.pearson_correlation:
        self.applyPearsonCorrelation()
    elif self.tanimoto_coefficient or self.dice_coefficient or self.tversky_coefficient:
        self.useOnlyBooleanInteractions()

    # We explore the matrix column-wise
    self.dataMatrix = check_matrix(self.dataMatrix, 'csc')

    # Compute sum of squared values to be used in normalization
    sumOfSquared = np.array(self.dataMatrix.power(2).sum(axis=0)).ravel()

    # Tanimoto does not require the square root to be applied
    if not (self.tanimoto_coefficient or self.dice_coefficient or self.tversky_coefficient):
        sumOfSquared = np.sqrt(sumOfSquared)

    if self.asymmetric_cosine:
        sumOfSquared_to_1_minus_alpha = np.power(sumOfSquared, 2 * (1 - self.asymmetric_alpha))
        sumOfSquared_to_alpha = np.power(sumOfSquared, 2 * self.asymmetric_alpha)

    self.dataMatrix = check_matrix(self.dataMatrix, 'csc')

    start_col_local = 0
    end_col_local = self.n_columns

    if start_col is not None and start_col > 0 and start_col < self.n_columns:
        start_col_local = start_col

    if end_col is not None and end_col > start_col_local and end_col < self.n_columns:
        end_col_local = end_col

    start_col_block = start_col_local
    this_block_size = 0

    # Compute all similarities for each item using vectorization
    while start_col_block < end_col_local:

        end_col_block = min(start_col_block + block_size, end_col_local)
        this_block_size = end_col_block - start_col_block

        # All data points for a given item
        item_data = self.dataMatrix[:, start_col_block:end_col_block]
        item_data = item_data.toarray().squeeze()

        # If only 1 feature avoid last dimension to disappear
        if item_data.ndim == 1:
            item_data = np.atleast_2d(item_data)

        if self.use_row_weights:
            this_block_weights = self.dataMatrix_weighted.T.dot(item_data)
        else:
            # Compute item similarities
            this_block_weights = self.dataMatrix.T.dot(item_data)

        for col_index_in_block in range(this_block_size):

            if this_block_size == 1:
                this_column_weights = this_block_weights
            else:
                this_column_weights = this_block_weights[:, col_index_in_block]

            columnIndex = col_index_in_block + start_col_block
            this_column_weights[columnIndex] = 0.0

            # Apply normalization and shrinkage, ensure denominator != 0
            if self.normalize:

                if self.asymmetric_cosine:
                    denominator = sumOfSquared_to_alpha[columnIndex] * sumOfSquared_to_1_minus_alpha + self.shrink + 1e-6
                else:
                    denominator = sumOfSquared[columnIndex] * sumOfSquared + self.shrink + 1e-6

                this_column_weights = np.multiply(this_column_weights, 1 / denominator)

            # Apply the specific denominator for Tanimoto
            elif self.tanimoto_coefficient:
                denominator = sumOfSquared[columnIndex] + sumOfSquared - this_column_weights + self.shrink + 1e-6
                this_column_weights = np.multiply(this_column_weights, 1 / denominator)

            elif self.dice_coefficient:
                denominator = sumOfSquared[columnIndex] + sumOfSquared + self.shrink + 1e-6
                this_column_weights = np.multiply(this_column_weights, 1 / denominator)

            elif self.tversky_coefficient:
                denominator = this_column_weights + \
                              (sumOfSquared[columnIndex] - this_column_weights) * self.tversky_alpha + \
                              (sumOfSquared - this_column_weights) * self.tversky_beta + self.shrink + 1e-6
                this_column_weights = np.multiply(this_column_weights, 1 / denominator)

            # If no normalization or tanimoto is selected, apply only shrink
            elif self.shrink != 0:
                this_column_weights = this_column_weights / self.shrink

            # this_column_weights = this_column_weights.toarray().ravel()

            # Sort indices and select TopK
            # Sorting is done in three steps. Faster than plain np.argsort for a high number of items
            # - Partition the data to extract the set of relevant items
            # - Sort only the relevant items
            # - Get the original item index
            relevant_items_partition = (-this_column_weights).argpartition(self.TopK - 1)[0:self.TopK]
            relevant_items_partition_sorting = np.argsort(-this_column_weights[relevant_items_partition])
            top_k_idx = relevant_items_partition[relevant_items_partition_sorting]

            # Incrementally build sparse matrix, do not add zeros
            notZerosMask = this_column_weights[top_k_idx] != 0.0
            numNotZeros = np.sum(notZerosMask)

            values.extend(this_column_weights[top_k_idx][notZerosMask])
            rows.extend(top_k_idx[notZerosMask])
            cols.extend(np.ones(numNotZeros) * columnIndex)

        # Add previous block size
        processedItems += this_block_size

        if time.time() - start_time_print_batch >= 30 or end_col_block == end_col_local:

            columnPerSec = processedItems / (time.time() - start_time + 1e-9)

            print("Similarity column {} ( {:2.0f} % ), {:.2f} column/sec, elapsed time {:.2f} min".format(
                processedItems,
                processedItems / (end_col_local - start_col_local) * 100,
                columnPerSec,
                (time.time() - start_time) / 60))

            sys.stdout.flush()
            sys.stderr.flush()

            start_time_print_batch = time.time()

        start_col_block += block_size

    # End while on columns

    W_sparse = sps.csr_matrix((values, (rows, cols)),
                              shape=(self.n_columns, self.n_columns),
                              dtype=np.float32)

    return W_sparse

def set_URM_train(self, URM_train_new, estimate_model_for_cold_users=False, topK=100, **kwargs):
    """
    :param URM_train_new:
    :param estimate_model_for_cold_users: Set to "itemKNN" to estimate an item-item similarity for cold users
                                          (used as in a KNN algorithm), or to "mean_item_factors" to estimate
                                          cold-user latent factors from the item factors
    :param topK: 100
    :param kwargs:
    :return:
    """

    assert self.URM_train.shape == URM_train_new.shape, \
        "{}: set_URM_train old and new URM train have different shapes".format(self.RECOMMENDER_NAME)

    if len(kwargs) > 0:
        print("{}: set_URM_train keyword arguments not supported for this recommender class. Received: {}".format(
            self.RECOMMENDER_NAME, kwargs))

    self.URM_train = check_matrix(URM_train_new.copy(), 'csr', dtype=np.float32)
    self.URM_train.eliminate_zeros()

    if estimate_model_for_cold_users == "itemKNN":

        print("{}: Estimating ItemKNN model from ITEM latent factors...".format(self.RECOMMENDER_NAME))

        W_sparse = compute_W_sparse_from_item_latent_factors(self.ITEM_factors, topK=topK)

        self._ItemKNNRecommender = ItemKNNCustomSimilarityRecommender(self.URM_train)
        self._ItemKNNRecommender.fit(W_sparse, topK=topK)

        self._cold_user_KNN_model_available = True
        self._warm_user_KNN_mask = np.ediff1d(self.URM_train.indptr) > 0

        print("{}: Estimating ItemKNN model from ITEM latent factors... done!".format(self.RECOMMENDER_NAME))

    elif estimate_model_for_cold_users == "mean_item_factors":

        print("{}: Estimating USER latent factors from ITEM latent factors...".format(self.RECOMMENDER_NAME))

        self._cold_user_mask = np.ediff1d(self.URM_train.indptr) == 0

        profile_length = np.ediff1d(self.URM_train.indptr)
        profile_length_sqrt = np.sqrt(profile_length)

        self.USER_factors = self.URM_train.dot(self.ITEM_factors)

        # Divide every row by the sqrt of the profile length
        for user_index in range(self.n_users):
            if profile_length_sqrt[user_index] > 0:
                self.USER_factors[user_index, :] /= profile_length_sqrt[user_index]

        print("{}: Estimating USER latent factors from ITEM latent factors... done!".format(self.RECOMMENDER_NAME))

def similarityMatrixTopK(item_weights, force_sparse_output=True, k=100, verbose=False, inplace=True):
    """
    The function selects the TopK most similar elements, column-wise

    :param item_weights:
    :param force_sparse_output:
    :param k:
    :param verbose:
    :param inplace: Default True, WARNING matrix will be modified
    :return:
    """

    assert (item_weights.shape[0] == item_weights.shape[1]), "selectTopK: ItemWeights is not a square matrix"

    start_time = time.time()

    if verbose:
        print("Generating topK matrix")

    nitems = item_weights.shape[1]
    k = min(k, nitems)

    # for each column, keep only the top-k scored items
    sparse_weights = not isinstance(item_weights, np.ndarray)

    if not sparse_weights:

        print("Sorting columns...")
        idx_sorted = np.argsort(item_weights, axis=0)  # sort data inside each column
        print("Done!")

        if inplace:
            W = item_weights
        else:
            W = item_weights.copy()

        # index of the items that don't belong to the top-k similar items of each column
        not_top_k = idx_sorted[:-k, :]

        # use numpy fancy indexing to zero-out the values in sim without using a for loop
        W[not_top_k, np.arange(nitems)] = 0.0

        if force_sparse_output:

            if verbose:
                print("Starting CSR compression...")

            W_sparse = sps.csr_matrix(W, shape=(nitems, nitems))

            if verbose:
                print("Sparse TopK matrix generated in {:.2f} seconds".format(time.time() - start_time))

            return W_sparse

        if verbose:
            print("Dense TopK matrix generated in {:.2f} seconds".format(time.time() - start_time))

        return W

    else:
        # iterate over each column and keep only the top-k similar items
        data, rows_indices, cols_indptr = [], [], []

        item_weights = check_matrix(item_weights, format='csc', dtype=np.float32)

        for item_idx in range(nitems):

            cols_indptr.append(len(data))

            start_position = item_weights.indptr[item_idx]
            end_position = item_weights.indptr[item_idx + 1]

            column_data = item_weights.data[start_position:end_position]
            column_row_index = item_weights.indices[start_position:end_position]

            non_zero_data = column_data != 0

            idx_sorted = np.argsort(column_data[non_zero_data])  # sort the column values
            top_k_idx = idx_sorted[-k:]

            data.extend(column_data[non_zero_data][top_k_idx])
            rows_indices.extend(column_row_index[non_zero_data][top_k_idx])

        cols_indptr.append(len(data))

        # During testing CSR is faster
        if verbose:
            print("Generating CSC matrix...")

        W_sparse = sps.csc_matrix((data, rows_indices, cols_indptr),
                                  shape=(nitems, nitems),
                                  dtype=np.float32)

        if verbose:
            print("Converting to CSR...")

        W_sparse = W_sparse.tocsr()

        if verbose:
            print("Sparse TopK matrix generated in {:.2f} seconds".format(time.time() - start_time))

        return W_sparse

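# A minimal usage sketch of similarityMatrixTopK (illustrative only; the toy matrix,
# its name and values are assumptions): keep the top-1 entry of each column of a
# small dense similarity matrix and get a sparse CSR result back.
import numpy as np

toy_W = np.array([[0.0, 0.9, 0.1],
                  [0.4, 0.0, 0.8],
                  [0.7, 0.2, 0.0]], dtype=np.float32)

W_topk = similarityMatrixTopK(toy_W, k=1, inplace=False)
print(W_topk.toarray())
# Only the largest value of each column survives:
# [[0.  0.9 0. ]
#  [0.  0.  0.8]
#  [0.7 0.  0. ]]
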
def fit(self, l1_ratio=0.1, alpha=1.0, positive_only=True, topK=100, verbose=True):

    assert 0 <= l1_ratio <= 1, \
        "{}: l1_ratio must be between 0 and 1, provided value was {}".format(self.RECOMMENDER_NAME, l1_ratio)

    self.l1_ratio = l1_ratio
    self.positive_only = positive_only
    self.topK = topK

    # initialize the ElasticNet model
    self.model = ElasticNet(alpha=alpha,
                            l1_ratio=self.l1_ratio,
                            positive=self.positive_only,
                            fit_intercept=False,
                            copy_X=False,
                            precompute=True,
                            selection='random',
                            max_iter=100,
                            tol=1e-4)

    URM_train = check_matrix(self.URM_train, 'csc', dtype=np.float32)

    n_items = URM_train.shape[1]

    # Use array as it reduces memory requirements compared to lists
    dataBlock = 10000000

    rows = np.zeros(dataBlock, dtype=np.int32)
    cols = np.zeros(dataBlock, dtype=np.int32)
    values = np.zeros(dataBlock, dtype=np.float32)

    numCells = 0

    start_time = time.time()
    start_time_printBatch = start_time

    # fit each item's factors sequentially (not in parallel)
    for currentItem in trange(n_items):

        # get the target column
        y = URM_train[:, currentItem].toarray()

        # set the j-th column of X to zero
        start_pos = URM_train.indptr[currentItem]
        end_pos = URM_train.indptr[currentItem + 1]

        current_item_data_backup = URM_train.data[start_pos:end_pos].copy()
        URM_train.data[start_pos:end_pos] = 0.0

        # fit one ElasticNet model per column
        self.model.fit(URM_train, y)

        # self.model.coef_ contains the coefficients of the ElasticNet model
        # let's keep only the non-zero values

        # Select topK values
        # Sorting is done in three steps. Faster than plain np.argsort for a high number of items
        # - Partition the data to extract the set of relevant items
        # - Sort only the relevant items
        # - Get the original item index

        # nonzero_model_coef_index = self.model.coef_.nonzero()[0]
        # nonzero_model_coef_value = self.model.coef_[nonzero_model_coef_index]

        nonzero_model_coef_index = self.model.sparse_coef_.indices
        nonzero_model_coef_value = self.model.sparse_coef_.data

        local_topK = min(len(nonzero_model_coef_value) - 1, self.topK)

        relevant_items_partition = (-nonzero_model_coef_value).argpartition(local_topK)[0:local_topK]
        relevant_items_partition_sorting = np.argsort(-nonzero_model_coef_value[relevant_items_partition])
        ranking = relevant_items_partition[relevant_items_partition_sorting]

        for index in range(len(ranking)):

            if numCells == len(rows):
                rows = np.concatenate((rows, np.zeros(dataBlock, dtype=np.int32)))
                cols = np.concatenate((cols, np.zeros(dataBlock, dtype=np.int32)))
                values = np.concatenate((values, np.zeros(dataBlock, dtype=np.float32)))

            rows[numCells] = nonzero_model_coef_index[ranking[index]]
            cols[numCells] = currentItem
            values[numCells] = nonzero_model_coef_value[ranking[index]]

            numCells += 1

        # finally, replace the original values of the j-th column
        URM_train.data[start_pos:end_pos] = current_item_data_backup

        if verbose and (time.time() - start_time_printBatch > 300 or currentItem == n_items - 1):
            print("{}: Processed {} ( {:.2f}% ) in {:.2f} minutes. Items per second: {:.0f}".format(
                self.RECOMMENDER_NAME,
                currentItem + 1,
                100.0 * float(currentItem + 1) / n_items,
                (time.time() - start_time) / 60,
                float(currentItem) / (time.time() - start_time)))

            sys.stdout.flush()
            sys.stderr.flush()

            start_time_printBatch = time.time()

    # generate the sparse weight matrix
    self.W_sparse = sps.csr_matrix((values[:numCells], (rows[:numCells], cols[:numCells])),
                                   shape=(n_items, n_items),
                                   dtype=np.float32)

def _log_scaling_confidence(self):

    C = check_matrix(self.URM_train, format="csr", dtype=np.float32)
    C.data = 1.0 + self.alpha * np.log(1.0 + C.data / self.epsilon)

    return C

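# A minimal check of the two confidence transforms above (illustrative only; the toy
# ratings, alpha and epsilon values are assumptions). Linear: c = 1 + alpha * r,
# logarithmic: c = 1 + alpha * log(1 + r / epsilon); the log variant grows much more
# slowly for heavy users or items.
import numpy as np

r = np.array([1.0, 5.0, 20.0], dtype=np.float32)
alpha, epsilon = 40.0, 1.0

linear_confidence = 1.0 + alpha * r
log_confidence = 1.0 + alpha * np.log(1.0 + r / epsilon)
print(linear_confidence)   # [ 41. 201. 801.]
print(log_confidence)      # roughly [ 28.7  72.7 122.8]
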
def fit(self, l1_ratio=0.0007553368138338653, alpha=0.0024081648139725204, positive_only=False, topK=65,
        verbose=True, side_alpha=3.86358712510434, bm_25_all=False, random_state=None,
        bm_25_urm=False, bm_25_icm=False):

    assert 0 <= l1_ratio <= 1, \
        "{}: l1_ratio must be between 0 and 1, provided value was {}".format(self.RECOMMENDER_NAME, l1_ratio)

    self.l1_ratio = l1_ratio
    self.positive_only = positive_only
    self.topK = topK

    if bm_25_icm:
        self.ICM = Helper().bm25_normalization(self.ICM)

    if bm_25_urm:
        self.URM_train = Helper().bm25_normalization(self.URM_train)

    self.ICM = self.ICM.transpose()
    self.ICM *= np.sqrt(side_alpha)

    self.URM_train = sps.vstack([self.URM_train, self.ICM])

    if bm_25_all:
        self.URM_train = Helper().bm25_normalization(self.URM_train)

    # initialize the ElasticNet model
    self.model = ElasticNet(alpha=alpha,
                            l1_ratio=self.l1_ratio,
                            positive=self.positive_only,
                            fit_intercept=False,
                            copy_X=False,
                            precompute=True,
                            selection='random',
                            max_iter=100,
                            tol=1e-4,
                            random_state=random_state)

    URM_train = check_matrix(self.URM_train, 'csc', dtype=np.float32)

    n_items = URM_train.shape[1]

    # Use array as it reduces memory requirements compared to lists
    dataBlock = 10000000

    rows = np.zeros(dataBlock, dtype=np.int32)
    cols = np.zeros(dataBlock, dtype=np.int32)
    values = np.zeros(dataBlock, dtype=np.float32)

    numCells = 0

    start_time = time.time()
    start_time_printBatch = start_time

    # fit each item's factors sequentially (not in parallel)
    for currentItem in trange(n_items):

        # get the target column
        y = URM_train[:, currentItem].toarray()

        # set the j-th column of X to zero
        start_pos = URM_train.indptr[currentItem]
        end_pos = URM_train.indptr[currentItem + 1]

        current_item_data_backup = URM_train.data[start_pos:end_pos].copy()
        URM_train.data[start_pos:end_pos] = 0.0

        # fit one ElasticNet model per column
        self.model.fit(URM_train, y)

        nonzero_model_coef_index = self.model.sparse_coef_.indices
        nonzero_model_coef_value = self.model.sparse_coef_.data

        local_topK = min(len(nonzero_model_coef_value) - 1, self.topK)

        relevant_items_partition = (-nonzero_model_coef_value).argpartition(local_topK)[0:local_topK]
        relevant_items_partition_sorting = np.argsort(-nonzero_model_coef_value[relevant_items_partition])
        ranking = relevant_items_partition[relevant_items_partition_sorting]

        for index in range(len(ranking)):

            if numCells == len(rows):
                rows = np.concatenate((rows, np.zeros(dataBlock, dtype=np.int32)))
                cols = np.concatenate((cols, np.zeros(dataBlock, dtype=np.int32)))
                values = np.concatenate((values, np.zeros(dataBlock, dtype=np.float32)))

            rows[numCells] = nonzero_model_coef_index[ranking[index]]
            cols[numCells] = currentItem
            values[numCells] = nonzero_model_coef_value[ranking[index]]

            numCells += 1

        # finally, replace the original values of the j-th column
        URM_train.data[start_pos:end_pos] = current_item_data_backup

        if verbose and (time.time() - start_time_printBatch > 300 or currentItem == n_items - 1):
            print("{}: Processed {} ( {:.2f}% ) in {:.2f} minutes. Items per second: {:.0f}".format(
                self.RECOMMENDER_NAME,
                currentItem + 1,
                100.0 * float(currentItem + 1) / n_items,
                (time.time() - start_time) / 60,
                float(currentItem) / (time.time() - start_time)))

            sys.stdout.flush()
            sys.stderr.flush()

            start_time_printBatch = time.time()

    # generate the sparse weight matrix
    self.W_sparse = sps.csr_matrix((values[:numCells], (rows[:numCells], cols[:numCells])),
                                   shape=(n_items, n_items),
                                   dtype=np.float32)

def fit(self, alpha=1., beta=0.6, min_rating=0, topK=100, implicit=True, normalize_similarity=True):

    self.alpha = alpha
    self.beta = beta
    self.min_rating = min_rating
    self.topK = topK
    self.implicit = implicit
    self.normalize_similarity = normalize_similarity

    # if X.dtype != np.float32:
    #     print("RP3beta fit: For memory usage reasons, we suggest to use np.float32 as dtype for the dataset")

    if self.min_rating > 0:
        self.URM_train.data[self.URM_train.data < self.min_rating] = 0
        self.URM_train.eliminate_zeros()
        if self.implicit:
            self.URM_train.data = np.ones(self.URM_train.data.size, dtype=np.float32)

    # Pui is the row-normalized URM
    Pui = normalize(self.URM_train, norm='l1', axis=1)

    # Piu is the column-normalized, "boolean" URM transposed
    X_bool = self.URM_train.transpose(copy=True)
    X_bool.data = np.ones(X_bool.data.size, np.float32)

    # Taking the degree of each item to penalize top popular
    # Some rows might be zero, make sure their degree remains zero
    X_bool_sum = np.array(X_bool.sum(axis=1)).ravel()

    degree = np.zeros(self.URM_train.shape[1])

    nonZeroMask = X_bool_sum != 0.0
    degree[nonZeroMask] = np.power(X_bool_sum[nonZeroMask], -self.beta)

    # ATTENTION: axis is still 1 because the matrix was transposed before the normalization
    Piu = normalize(X_bool, norm='l1', axis=1)
    del X_bool

    # Alpha power
    if self.alpha != 1.:
        Pui = Pui.power(self.alpha)
        Piu = Piu.power(self.alpha)

    # Final matrix is computed as Pui * Piu * Pui
    # Multiplication unpacked for memory usage reasons
    block_dim = 200
    d_t = Piu

    # Use array as it reduces memory requirements compared to lists
    dataBlock = 10000000

    rows = np.zeros(dataBlock, dtype=np.int32)
    cols = np.zeros(dataBlock, dtype=np.int32)
    values = np.zeros(dataBlock, dtype=np.float32)

    numCells = 0

    start_time = time.time()
    start_time_printBatch = start_time

    for current_block_start_row in range(0, Pui.shape[1], block_dim):

        if current_block_start_row + block_dim > Pui.shape[1]:
            block_dim = Pui.shape[1] - current_block_start_row

        similarity_block = d_t[current_block_start_row:current_block_start_row + block_dim, :] * Pui
        similarity_block = similarity_block.toarray()

        for row_in_block in range(block_dim):

            row_data = np.multiply(similarity_block[row_in_block, :], degree)
            row_data[current_block_start_row + row_in_block] = 0

            best = row_data.argsort()[::-1][:self.topK]

            notZerosMask = row_data[best] != 0.0

            values_to_add = row_data[best][notZerosMask]
            cols_to_add = best[notZerosMask]

            for index in range(len(values_to_add)):

                if numCells == len(rows):
                    rows = np.concatenate((rows, np.zeros(dataBlock, dtype=np.int32)))
                    cols = np.concatenate((cols, np.zeros(dataBlock, dtype=np.int32)))
                    values = np.concatenate((values, np.zeros(dataBlock, dtype=np.float32)))

                rows[numCells] = current_block_start_row + row_in_block
                cols[numCells] = cols_to_add[index]
                values[numCells] = values_to_add[index]

                numCells += 1

        if time.time() - start_time_printBatch > 60:
            print("Processed {} ( {:.2f}% ) in {:.2f} minutes. Rows per second: {:.0f}".format(
                current_block_start_row,
                100.0 * float(current_block_start_row) / Pui.shape[1],
                (time.time() - start_time) / 60,
                float(current_block_start_row) / (time.time() - start_time)))

            sys.stdout.flush()
            sys.stderr.flush()

            start_time_printBatch = time.time()

    self.W_sparse = sps.csr_matrix((values[:numCells], (rows[:numCells], cols[:numCells])),
                                   shape=(Pui.shape[1], Pui.shape[1]))

    if self.normalize_similarity:
        self.W_sparse = normalize(self.W_sparse, norm='l1', axis=1)

    if self.topK != False:
        self.W_sparse = similarityMatrixTopK(self.W_sparse, k=self.topK)

    self.W_sparse = check_matrix(self.W_sparse, format='csr')

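# A small illustration of the popularity penalty used in the RP3beta fit above
# (toy values and names are assumptions): each item column of the random-walk
# similarity is multiplied by popularity**(-beta), so popular items are damped
# more strongly when beta > 0.
import numpy as np

item_popularity = np.array([100.0, 10.0, 1.0])
beta = 0.6
penalty = np.power(item_popularity, -beta)
print(penalty)   # ~[0.063, 0.251, 1.0]: the most popular item is penalized the most
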