def fit(self, topK=50, shrink=100, similarity='cosine', feature_weighting="none", **similarity_args):
    """
    Train the item-based KNN model by computing the item-item similarity matrix.

    :param topK: number of neighbours to keep for each item
    :param shrink: shrink term passed to the similaripy similarity function
    :param similarity: one of 'cosine', 'jaccard', 'dice', 'tversky', 'splus'
    :param feature_weighting: one of self.FEATURE_WEIGHTING_VALUES (e.g. 'BM25', 'TF-IDF', 'none')
    :param similarity_args: extra keyword arguments forwarded to the similaripy function
    :raises ValueError: if feature_weighting or similarity is not recognized
    """
    # Similaripy returns also self similarity, which will be set to 0 afterwards
    topK += 1
    self.topK = topK
    self.shrink = shrink

    if feature_weighting not in self.FEATURE_WEIGHTING_VALUES:
        raise ValueError("Value for 'feature_weighting' not recognized. Acceptable values are {}, provided was '{}'"
                         .format(self.FEATURE_WEIGHTING_VALUES, feature_weighting))

    # Feature weighting is applied on the transposed URM (i.e. over users), then transposed back
    if feature_weighting == "BM25":
        self.URM_train = self.URM_train.astype(np.float32)
        self.URM_train = okapi_BM_25(self.URM_train.T).T
        self.URM_train = check_matrix(self.URM_train, 'csr')

    elif feature_weighting == "TF-IDF":
        self.URM_train = self.URM_train.astype(np.float32)
        self.URM_train = TF_IDF(self.URM_train.T).T
        self.URM_train = check_matrix(self.URM_train, 'csr')

    if similarity == "cosine":
        self.W_sparse = sim.cosine(self.URM_train, k=topK, shrink=shrink, **similarity_args)
    elif similarity == "jaccard":
        self.W_sparse = sim.jaccard(self.URM_train, k=topK, shrink=shrink, **similarity_args)
    elif similarity == "dice":
        self.W_sparse = sim.dice(self.URM_train, k=topK, shrink=shrink, **similarity_args)
    elif similarity == "tversky":
        # BUGFIX: this branch previously repeated the "jaccard" condition, making sim.tversky unreachable
        self.W_sparse = sim.tversky(self.URM_train, k=topK, shrink=shrink, **similarity_args)
    elif similarity == "splus":
        self.W_sparse = sim.s_plus(self.URM_train, k=topK, shrink=shrink, **similarity_args)
    else:
        raise ValueError("Unknown value '{}' for similarity".format(similarity))

    # Zero out the self-similarity and store as CSR (transposed) for fast row access at prediction time
    self.W_sparse.setdiag(0)
    self.W_sparse = self.W_sparse.transpose().tocsr()
def __init__(self, URM_train, Similarity_1, Similarity_2):
    """
    Hybrid recommender combining two precomputed similarity matrices.

    :param URM_train: user-item interaction matrix
    :param Similarity_1: first similarity matrix
    :param Similarity_2: second similarity matrix, must have the same shape as the first
    :raises ValueError: if the two similarity matrices have different shapes
    """
    super(ItemKNNSimilarityHybrid, self).__init__(URM_train)

    shape_1 = Similarity_1.shape
    shape_2 = Similarity_2.shape
    if shape_1 != shape_2:
        raise ValueError(
            "ItemKNNSimilarityHybrid: similarities have different size, S1 is {}, S2 is {}".format(shape_1, shape_2))

    # CSR is faster during evaluation
    self.Similarity_1 = check_matrix(Similarity_1.copy(), 'csr')
    self.Similarity_2 = check_matrix(Similarity_2.copy(), 'csr')
    self.URM_train = check_matrix(URM_train.copy(), 'csr')
def fit(self, lambda_user=10, lambda_item=25):
    """
    Fit a global-effects (baseline) model: global mean of the ratings plus
    regularized item and user biases.

    :param lambda_user: regularization term added to each user's interaction count
    :param lambda_item: regularization term added to each item's interaction count
    """
    self.lambda_user = lambda_user
    self.lambda_item = lambda_item
    self.n_items = self.URM_train.shape[1]

    # convert to csc matrix for faster column-wise sum
    self.URM_train = check_matrix(self.URM_train, 'csc', dtype=np.float32)

    # 1) global average: mean over the stored (non-zero) entries only
    self.mu = self.URM_train.data.sum(dtype=np.float32) / self.URM_train.data.shape[0]

    # 2) item average bias
    # compute the number of non-zero elements for each column
    col_nnz = np.diff(self.URM_train.indptr)

    # it is equivalent to:
    # col_nnz = X.indptr[1:] - X.indptr[:-1]
    # and it is **much faster** than
    # col_nnz = (X != 0).sum(axis=0)

    URM_train_unbiased = self.URM_train.copy()
    URM_train_unbiased.data -= self.mu
    # regularized item bias: sum of (rating - mu) per item / (support + lambda_item)
    self.item_bias = URM_train_unbiased.sum(axis=0) / (col_nnz + self.lambda_item)
    self.item_bias = np.asarray(self.item_bias).ravel()  # converts 2-d matrix to 1-d array without any copy

    # 3) user average bias
    # NOTE: the user bias is *useless* for the sake of ranking items. We just show it here for educational purposes.

    # first subtract the item biases from each column
    # then repeat each element of the item bias vector a number of times equal to col_nnz
    # and subtract it from the data vector
    URM_train_unbiased.data -= np.repeat(self.item_bias, col_nnz)

    # now convert the csc matrix to csr for efficient row-wise computation
    URM_train_unbiased_csr = URM_train_unbiased.tocsr()
    row_nnz = np.diff(URM_train_unbiased_csr.indptr)
    # finally, let's compute the bias
    self.user_bias = URM_train_unbiased_csr.sum(axis=1).ravel() / (row_nnz + self.lambda_user)

    # 4) precompute the item ranking by using the item bias only
    # the global average and user bias won't change the ranking, so there is no need to use them
    #self.item_ranking = np.argsort(self.bi)[::-1]

    self.URM_train = check_matrix(self.URM_train, 'csr', dtype=np.float32)
def __init__(self, URM_train):
    """
    Base recommender: stores a clean copy of the URM and precomputes the
    cold user and cold item masks.

    :param URM_train: user-item interaction matrix, copied and converted to CSR float32
    """
    super(Recommender, self).__init__()

    # check if the matrix is in the CSR form
    self.URM_train = check_matrix(URM_train.copy(), 'csr', dtype=np.float32)
    # check if inside the data array there are zeros
    self.URM_train.eliminate_zeros()

    self.n_users, self.n_items = self.URM_train.shape
    self.verbose = True

    self.filterTopPop = False
    # BUGFIX: np.int was deprecated in NumPy 1.20 and removed in 1.24;
    # the builtin int is the documented drop-in replacement
    self.filterTopPop_ItemsID = np.array([], dtype=int)

    self.items_to_ignore_flag = False
    self.items_to_ignore_ID = np.array([], dtype=int)

    # cold user = user with 0 interactions
    # save the indices of the users which have 0 interactions
    self._cold_user_mask = np.ediff1d(self.URM_train.indptr) == 0

    if self._cold_user_mask.any():
        self._print("URM Detected {} ({:.2f} %) cold users.".format(
            self._cold_user_mask.sum(), self._cold_user_mask.sum() / self.n_users * 100))

    # cold item = item that are not assigned to any user
    # save the indices of the items which are assigned to 0 users
    self._cold_item_mask = np.ediff1d(self.URM_train.tocsc().indptr) == 0

    if self._cold_item_mask.any():
        self._print("URM Detected {} ({:.2f} %) cold items.".format(
            self._cold_item_mask.sum(), self._cold_item_mask.sum() / self.n_items * 100))
def applyPearsonCorrelation(self):
    """
    Remove from every data point the average for the corresponding column
    (the mean is computed over the stored non-zero entries only).
    :return:
    """
    self.dataMatrix = check_matrix(self.dataMatrix, 'csc')

    col_nnz = np.diff(self.dataMatrix.indptr)
    has_entries = col_nnz > 0
    col_sums = np.asarray(self.dataMatrix.sum(axis=0)).ravel()

    # Columns with no entries keep an average of zero to avoid division by zero
    col_means = np.zeros_like(col_sums)
    col_means[has_entries] = col_sums[has_entries] / col_nnz[has_entries]

    # Process the data array in column blocks to avoid duplicating the whole structure
    block_size = 1000
    first_col = 0

    while first_col < self.n_columns:
        last_col = min(self.n_columns, first_col + block_size)

        data_start = self.dataMatrix.indptr[first_col]
        data_end = self.dataMatrix.indptr[last_col]

        self.dataMatrix.data[data_start:data_end] -= \
            np.repeat(col_means[first_col:last_col], col_nnz[first_col:last_col])

        first_col = last_col
def applyAdjustedCosine(self):
    """
    Remove from every data point the average for the corresponding row
    (the mean is computed over the stored non-zero entries only).
    :return:
    """
    self.dataMatrix = check_matrix(self.dataMatrix, 'csr')

    row_nnz = np.diff(self.dataMatrix.indptr)
    has_entries = row_nnz > 0
    row_sums = np.asarray(self.dataMatrix.sum(axis=1)).ravel()

    # Rows with no entries keep an average of zero to avoid division by zero
    row_means = np.zeros_like(row_sums)
    row_means[has_entries] = row_sums[has_entries] / row_nnz[has_entries]

    # Process the data array in row blocks to avoid duplicating the whole structure
    block_size = 1000
    first_row = 0

    while first_row < self.n_rows:
        last_row = min(self.n_rows, first_row + block_size)

        data_start = self.dataMatrix.indptr[first_row]
        data_end = self.dataMatrix.indptr[last_row]

        self.dataMatrix.data[data_start:data_end] -= \
            np.repeat(row_means[first_row:last_row], row_nnz[first_row:last_row])

        first_row = last_row
def __init__(self, URM_recommendations_items):
    """
    Recommender that replays a precomputed list of recommendations.

    :param URM_recommendations_items: matrix whose rows contain, per user, the item ids to recommend
    """
    super(PredefinedListRecommender, self).__init__()

    # convert to csr matrix for fast row-wise access of each user's recommendation list
    # BUGFIX: np.int was deprecated in NumPy 1.20 and removed in 1.24;
    # the builtin int is the documented drop-in replacement
    self.URM_recommendations = check_matrix(URM_recommendations_items, 'csr', dtype=int)

    # Empty URM with the same shape, kept only to satisfy the base-class interface
    self.URM_train = sps.csr_matrix(self.URM_recommendations.shape)
def fit(self, W_sparse, selectTopK=False, topK=100):
    """
    Store a user-provided item-item similarity matrix, optionally pruned
    to its top-K entries per column.

    :param W_sparse: square similarity matrix with one row/column per item
    :param selectTopK: when True, keep only the topK strongest neighbours
    :param topK: number of neighbours kept when selectTopK is True
    """
    n_rows, n_cols = W_sparse.shape

    assert n_rows == n_cols,\
        "ItemKNNCustomSimilarityRecommender: W_sparse matrice is not square. Current shape is {}".format(W_sparse.shape)

    assert self.URM_train.shape[1] == n_rows,\
        "ItemKNNCustomSimilarityRecommender: URM_train and W_sparse matrices are not consistent. " \
        "The number of columns in URM_train must be equal to the rows in W_sparse. " \
        "Current shapes are: URM_train {}, W_sparse {}".format(self.URM_train.shape, W_sparse.shape)

    if selectTopK:
        W_sparse = similarityMatrixTopK(W_sparse, k=topK)

    self.W_sparse = check_matrix(W_sparse, format='csr')
def set_URM_train(self, URM_train_new, **kwargs):
    """
    Replace the training URM with a new matrix of identical shape and
    refresh the cold-user mask accordingly.

    :param URM_train_new: replacement URM, must have the same shape as the current one
    :param kwargs: not supported by this recommender class; reported and ignored
    """
    assert self.URM_train.shape == URM_train_new.shape, \
        "{}: set_URM_train old and new URM train have different shapes".format(self.RECOMMENDER_NAME)

    if kwargs:
        self._print("set_URM_train keyword arguments not supported "
                    "for this recommender class. Received: {}".format(kwargs))

    new_URM = check_matrix(URM_train_new.copy(), 'csr', dtype=np.float32)
    new_URM.eliminate_zeros()
    self.URM_train = new_URM

    cold_mask = np.ediff1d(new_URM.indptr) == 0
    self._cold_user_mask = cold_mask

    if cold_mask.any():
        self._print("Detected {} ({:.2f} %) cold users.".format(
            cold_mask.sum(), cold_mask.sum() / len(cold_mask) * 100))
def __init__(self, URM_train, UCM_train):
    """
    Base class for user content-based recommenders.

    :param URM_train: user-item interaction matrix
    :param UCM_train: user-feature matrix with one row per user of URM_train
    """
    super(BaseUserCBFRecommender, self).__init__(URM_train)

    # BUGFIX: the assertion message previously formatted self.n_items while the
    # condition checks self.n_users, printing a misleading count on failure
    assert self.n_users == UCM_train.shape[0], "{}: URM_train has {} users but UCM_train has {}"\
        .format(self.RECOMMENDER_NAME, self.n_users, UCM_train.shape[0])

    self.UCM_train = check_matrix(UCM_train.copy(), 'csr', dtype=np.float32)
    self.UCM_train.eliminate_zeros()

    _, self.n_features = self.UCM_train.shape

    # cold user (CBF) = user with no features in the UCM
    self._cold_user_CBF_mask = np.ediff1d(self.UCM_train.indptr) == 0

    if self._cold_user_CBF_mask.any():
        # BUGFIX: a stray RECOMMENDER_NAME argument previously shifted the count into
        # the {} slot and the count into the {:.2f} slot, so the percentage was never printed
        self._print("UCM Detected {} ({:.2f} %) cold users.".format(
            self._cold_user_CBF_mask.sum(),
            self._cold_user_CBF_mask.sum() / self.n_users * 100))
def __init__(self, URM_train, ICM_train):
    """
    Base class for item content-based recommenders.

    :param URM_train: user-item interaction matrix
    :param ICM_train: item-feature matrix with one row per item of URM_train
    """
    super(BaseItemCBFRecommender, self).__init__(URM_train)

    assert self.n_items == ICM_train.shape[0], "{}: URM_train has {} items but ICM_train has {}"\
        .format(self.RECOMMENDER_NAME, self.n_items, ICM_train.shape[0])

    self.ICM_train = check_matrix(ICM_train.copy(), 'csr', dtype=np.float32)
    self.ICM_train.eliminate_zeros()

    _, self.n_features = self.ICM_train.shape

    # cold item (CBF) = item with no features in the ICM
    self._cold_item_CBF_mask = np.ediff1d(self.ICM_train.indptr) == 0

    if self._cold_item_CBF_mask.any():
        # BUGFIX: a stray RECOMMENDER_NAME argument previously shifted the count into
        # the {} slot and the count into the {:.2f} slot, so the percentage was never printed
        self._print("ICM Detected {} ({:.2f} %) items with no features.".format(
            self._cold_item_CBF_mask.sum(),
            self._cold_item_CBF_mask.sum() / self.n_items * 100))
def remove_empty_rows_and_cols(URM, ICM=None):
    """
    Drop users (rows) and items (columns) of the URM that have no interactions.

    :param URM: user-item interaction matrix
    :param ICM: optional item-content matrix, filtered with the same item mask
    :return: (URM, removedUsers, removedItems) or, when ICM is given,
             (URM, ICM, removedUsers, removedItems)
    """
    URM = check_matrix(URM, "csr")

    # keep users with at least one interaction
    user_mask = np.ediff1d(URM.indptr) >= 1
    URM = URM[user_mask, :]

    # keep items with at least one interaction (after removing empty users)
    item_mask = np.ediff1d(URM.tocsc().indptr) >= 1
    URM = URM[:, item_mask]

    removedUsers = np.arange(len(user_mask))[np.logical_not(user_mask)]
    removedItems = np.arange(len(item_mask))[np.logical_not(item_mask)]

    if ICM is not None:
        ICM = ICM[item_mask, :]
        return URM.tocsr(), ICM.tocsr(), removedUsers, removedItems

    return URM.tocsr(), removedUsers, removedItems
def removeFeatures(ICM, minOccurrence=5, maxPercOccurrence=0.30, reconcile_mapper=None):
    """
    The function eliminates the features occurring in fewer than the minimal
    number of items or in more than the maximal percentage of items.
    Shape of ICM is reduced deleting features.

    :param ICM: item-content matrix (items x features)
    :param minOccurrence: minimum number of items a feature must occur in to be kept
    :param maxPercOccurrence: maximum fraction of items a feature may occur in to be kept
    :param reconcile_mapper: DICT mapper [token] -> index
    :return: ICM
    :return: deletedFeatures
    :return: DICT mapper [token] -> index (only when reconcile_mapper is given)
    """

    ICM = check_matrix(ICM, 'csc')

    n_items = ICM.shape[0]

    cols = ICM.indptr
    # occurrences of each feature = number of stored entries per CSC column
    numOccurrences = np.ediff1d(cols)

    feature_mask = np.logical_and(numOccurrences >= minOccurrence,
                                  numOccurrences <= n_items * maxPercOccurrence)

    ICM = ICM[:, feature_mask]

    deletedFeatures = np.arange(0, len(feature_mask))[np.logical_not(feature_mask)]

    print("RemoveFeatures: removed {} features with less then {} occurrencies, removed {} features with more than {} occurrencies".format(
        sum(numOccurrences < minOccurrence), minOccurrence,
        sum(numOccurrences > n_items * maxPercOccurrence), int(n_items * maxPercOccurrence)
    ))

    if reconcile_mapper is not None:
        # keep the token -> index mapper consistent with the removed feature columns
        reconcile_mapper = reconcile_mapper_with_removed_tokens(reconcile_mapper, deletedFeatures)

        return ICM, deletedFeatures, reconcile_mapper

    return ICM, deletedFeatures
def compute_similarity(self, start_col=None, end_col=None, block_size=100):
    """
    Compute the similarity for the given dataset

    :param self:
    :param start_col: column to begin with
    :param end_col: column to stop before, end_col is excluded
    :param block_size: number of columns processed per vectorized block
    :return: dense W matrix when self.TopK == 0, otherwise a sparse CSR W matrix
    """

    values = []
    rows = []
    cols = []

    start_time = time.time()
    start_time_print_batch = start_time
    processedItems = 0

    # Optional preprocessing of the data matrix, selected by the flags set at construction time
    if self.adjusted_cosine:
        self.applyAdjustedCosine()
    elif self.pearson_correlation:
        self.applyPearsonCorrelation()
    elif self.tanimoto_coefficient or self.dice_coefficient or self.tversky_coefficient:
        self.useOnlyBooleanInteractions()

    # We explore the matrix column-wise
    self.dataMatrix = check_matrix(self.dataMatrix, 'csc')

    # Compute sum of squared values to be used in normalization
    sumOfSquared = np.array(self.dataMatrix.power(2).sum(axis=0)).ravel()

    # Tanimoto does not require the square root to be applied
    if not (self.tanimoto_coefficient or self.dice_coefficient or self.tversky_coefficient):
        sumOfSquared = np.sqrt(sumOfSquared)

    if self.asymmetric_cosine:
        # asymmetric cosine uses norm^(2*alpha) on one side and norm^(2*(1-alpha)) on the other
        sumOfSquared_to_1_minus_alpha = np.power(sumOfSquared, 2 * (1 - self.asymmetric_alpha))
        sumOfSquared_to_alpha = np.power(sumOfSquared, 2 * self.asymmetric_alpha)

    self.dataMatrix = check_matrix(self.dataMatrix, 'csc')

    # Clamp the requested column range to [0, n_columns)
    start_col_local = 0
    end_col_local = self.n_columns

    if start_col is not None and start_col > 0 and start_col < self.n_columns:
        start_col_local = start_col

    if end_col is not None and end_col > start_col_local and end_col < self.n_columns:
        end_col_local = end_col

    start_col_block = start_col_local
    this_block_size = 0

    # Compute all similarities for each item using vectorization
    while start_col_block < end_col_local:

        # Add previous block size
        processedItems += this_block_size

        end_col_block = min(start_col_block + block_size, end_col_local)
        this_block_size = end_col_block - start_col_block

        # Periodic progress report (every 30s and on the last block)
        # NOTE(review): the print string was reconstructed from a line-broken extraction;
        # confirm against the original file
        if time.time() - start_time_print_batch >= 30 or end_col_block == end_col_local:
            columnPerSec = processedItems / (time.time() - start_time)

            print("Similarity column {} ( {:2.0f} % ), {:.2f} column/sec, elapsed time {:.2f} min".format(
                processedItems,
                processedItems / (end_col_local - start_col_local) * 100,
                columnPerSec,
                (time.time() - start_time) / 60))

            sys.stdout.flush()
            sys.stderr.flush()

            start_time_print_batch = time.time()

        # All data points for a given item
        item_data = self.dataMatrix[:, start_col_block:end_col_block]
        item_data = item_data.toarray().squeeze()

        if self.use_row_weights:
            #item_data = np.multiply(item_data, self.row_weights)
            #item_data = item_data.T.dot(self.row_weights_diag).T
            this_block_weights = self.dataMatrix_weighted.T.dot(item_data)
        else:
            # Compute item similarities
            this_block_weights = self.dataMatrix.T.dot(item_data)

        for col_index_in_block in range(this_block_size):

            # squeeze() above collapsed the block to 1-D when it holds a single column
            if this_block_size == 1:
                this_column_weights = this_block_weights
            else:
                this_column_weights = this_block_weights[:, col_index_in_block]

            columnIndex = col_index_in_block + start_col_block
            # zero the self-similarity entry
            this_column_weights[columnIndex] = 0.0

            # Apply normalization and shrinkage, ensure denominator != 0
            if self.normalize:
                if self.asymmetric_cosine:
                    denominator = sumOfSquared_to_alpha[columnIndex] * sumOfSquared_to_1_minus_alpha + self.shrink + 1e-6
                else:
                    denominator = sumOfSquared[columnIndex] * sumOfSquared + self.shrink + 1e-6

                this_column_weights = np.multiply(this_column_weights, 1 / denominator)

            # Apply the specific denominator for Tanimoto
            elif self.tanimoto_coefficient:
                denominator = sumOfSquared[columnIndex] + sumOfSquared - this_column_weights + self.shrink + 1e-6
                this_column_weights = np.multiply(this_column_weights, 1 / denominator)

            elif self.dice_coefficient:
                denominator = sumOfSquared[columnIndex] + sumOfSquared + self.shrink + 1e-6
                this_column_weights = np.multiply(this_column_weights, 1 / denominator)

            elif self.tversky_coefficient:
                denominator = this_column_weights + \
                              (sumOfSquared[columnIndex] - this_column_weights) * self.tversky_alpha + \
                              (sumOfSquared - this_column_weights) * self.tversky_beta + self.shrink + 1e-6
                this_column_weights = np.multiply(this_column_weights, 1 / denominator)

            # If no normalization or tanimoto is selected, apply only shrink
            elif self.shrink != 0:
                this_column_weights = this_column_weights / self.shrink

            if self.TopK == 0:
                self.W_dense[:, columnIndex] = this_column_weights

            else:
                # Sort indices and select TopK
                # Sorting is done in three steps. Faster then plain np.argsort for higher number of items
                # - Partition the data to extract the set of relevant items
                # - Sort only the relevant items
                # - Get the original item index
                relevant_items_partition = (-this_column_weights).argpartition(self.TopK - 1)[0:self.TopK]
                relevant_items_partition_sorting = np.argsort(-this_column_weights[relevant_items_partition])
                top_k_idx = relevant_items_partition[relevant_items_partition_sorting]

                # Incrementally build sparse matrix, do not add zeros
                notZerosMask = this_column_weights[top_k_idx] != 0.0
                numNotZeros = np.sum(notZerosMask)

                values.extend(this_column_weights[top_k_idx][notZerosMask])
                rows.extend(top_k_idx[notZerosMask])
                cols.extend(np.ones(numNotZeros) * columnIndex)

        start_col_block += block_size

    # End while on columns

    if self.TopK == 0:
        return self.W_dense

    else:
        W_sparse = sps.csr_matrix((values, (rows, cols)),
                                  shape=(self.n_columns, self.n_columns),
                                  dtype=np.float32)

        return W_sparse