def __init__(self, URM_train, ICM, S_matrix_target):
    super(CFW_D_Similarity_Linalg, self).__init__()

    if URM_train.shape[1] != ICM.shape[0]:
        raise ValueError("Number of items not consistent. URM contains {} but ICM contains {}".format(
            URM_train.shape[1], ICM.shape[0]))

    if S_matrix_target.shape[0] != S_matrix_target.shape[1]:
        raise ValueError("Item similarity matrix is not square: rows are {}, columns are {}".format(
            S_matrix_target.shape[0], S_matrix_target.shape[1]))

    if S_matrix_target.shape[0] != ICM.shape[0]:
        raise ValueError("Number of items not consistent. S_matrix contains {} but ICM contains {}".format(
            S_matrix_target.shape[0], ICM.shape[0]))

    self.URM_train = check_matrix(URM_train, 'csr')
    self.S_matrix_target = check_matrix(S_matrix_target, 'csr')
    self.ICM = check_matrix(ICM, 'csr')

    self.n_items = self.URM_train.shape[1]
    self.n_users = self.URM_train.shape[0]
    self.n_features = self.ICM.shape[1]

    self.sparse_weights = True
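# A minimal usage sketch for the constructor above, with hypothetical toy matrices:
# the URM is users x items, the ICM is items x features, and the target similarity
# matrix is items x items, matching the consistency checks performed in __init__.

import numpy as np
import scipy.sparse as sps

n_users, n_items, n_features = 100, 50, 20

# Hypothetical random inputs with mutually consistent shapes
URM_train = sps.random(n_users, n_items, density=0.05, format='csr')
ICM = sps.random(n_items, n_features, density=0.10, format='csr')
S_matrix_target = sps.random(n_items, n_items, density=0.02, format='csr')

recommender = CFW_D_Similarity_Linalg(URM_train, ICM, S_matrix_target)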
def __init__(self, URM_recommendations_items):
    super(PredefinedListRecommender, self).__init__()

    # store the per-user recommendation lists in CSR format for fast row slicing
    self.URM_recommendations = check_matrix(URM_recommendations_items, 'csr', dtype=int)

    self.URM_train = sps.csr_matrix(self.URM_recommendations.shape)
def __init__(self, URM_train):
    super(RP3betaRecommender, self).__init__()

    self.URM_train = check_matrix(URM_train, format='csr', dtype=np.float32)
    self.sparse_weights = True
def __init__(self, URM_train):
    super(PureSVDRecommender, self).__init__()

    # CSR is faster during evaluation
    self.URM_train = check_matrix(URM_train, 'csr')

    self.compute_item_score = self.compute_score_SVD
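# A minimal usage sketch for PureSVDRecommender, assuming a hypothetical random URM;
# the fit() hyperparameter shown is an illustrative assumption, not a value taken from this file.

import numpy as np
import scipy.sparse as sps

# Hypothetical implicit-feedback matrix: 1000 users x 500 items, ~1% dense
URM_train = sps.random(1000, 500, density=0.01, format='csr', dtype=np.float32)

recommender = PureSVDRecommender(URM_train)
# recommender.fit(num_factors=50)   # hypothetical hyperparameter name/value
# item scores would then be produced through compute_item_score, which the
# constructor binds to compute_score_SVD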
def applyPearsonCorrelation(self):
    """
    Remove from every data point the average for the corresponding column
    :return:
    """

    self.dataMatrix = check_matrix(self.dataMatrix, 'csc')

    interactionsPerCol = np.diff(self.dataMatrix.indptr)

    nonzeroCols = interactionsPerCol > 0
    sumPerCol = np.asarray(self.dataMatrix.sum(axis=0)).ravel()

    colAverage = np.zeros_like(sumPerCol)
    colAverage[nonzeroCols] = sumPerCol[nonzeroCols] / interactionsPerCol[nonzeroCols]

    # Split in blocks to avoid duplicating the whole data structure
    start_col = 0
    end_col = 0
    blockSize = 1000

    while end_col < self.n_columns:
        end_col = min(self.n_columns, end_col + blockSize)

        self.dataMatrix.data[self.dataMatrix.indptr[start_col]:self.dataMatrix.indptr[end_col]] -= \
            np.repeat(colAverage[start_col:end_col], interactionsPerCol[start_col:end_col])

        start_col += blockSize
def applyAdjustedCosine(self):
    """
    Remove from every data point the average for the corresponding row
    :return:
    """

    self.dataMatrix = check_matrix(self.dataMatrix, 'csr')

    interactionsPerRow = np.diff(self.dataMatrix.indptr)

    nonzeroRows = interactionsPerRow > 0
    sumPerRow = np.asarray(self.dataMatrix.sum(axis=1)).ravel()

    rowAverage = np.zeros_like(sumPerRow)
    rowAverage[nonzeroRows] = sumPerRow[nonzeroRows] / interactionsPerRow[nonzeroRows]

    # Split in blocks to avoid duplicating the whole data structure
    start_row = 0
    end_row = 0
    blockSize = 1000

    while end_row < self.n_rows:
        end_row = min(self.n_rows, end_row + blockSize)

        self.dataMatrix.data[self.dataMatrix.indptr[start_row]:self.dataMatrix.indptr[end_row]] -= \
            np.repeat(rowAverage[start_row:end_row], interactionsPerRow[start_row:end_row])

        start_row += blockSize
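# A small self-contained sketch of the same centering idea used by the two methods above
# (here in the column-wise / Pearson variant), assuming a toy CSC matrix: each column's
# average over its stored entries is subtracted in place, which is exactly what the
# block-wise loops do at scale without materializing a dense copy.

import numpy as np
import scipy.sparse as sps

# Toy 3x3 matrix; only nonzero (stored) entries participate in the per-column mean
X = sps.csc_matrix(np.array([[4.0, 0.0, 2.0],
                             [0.0, 3.0, 4.0],
                             [2.0, 5.0, 0.0]]))

interactions_per_col = np.diff(X.indptr)            # stored entries per column
col_sum = np.asarray(X.sum(axis=0)).ravel()
col_avg = np.divide(col_sum, interactions_per_col,
                    out=np.zeros_like(col_sum), where=interactions_per_col > 0)

# Subtract each column's average from its stored values, exactly once per entry
X.data -= np.repeat(col_avg, interactions_per_col)
print(X.toarray())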
def __init__(self, ICM, URM_train, sparse_weights=True):
    super(RevisedCBF, self).__init__()

    self.ICM = ICM.copy()

    # CSR is faster during evaluation
    self.URM_train = check_matrix(URM_train.copy(), 'csr')

    self.sparse_weights = sparse_weights
def fit(self, R):
    print("Starting AsySVD fitting")

    self.dataset = R
    R = check_matrix(R, 'csr', dtype=np.float32)

    self.X, self.Y = AsySVD_sgd(R, self.num_factors, self.lrate, self.reg,
                                self.iters, self.init_mean, self.init_std,
                                self.lrate_decay, self.rnd_seed)

    # precompute the user factors
    M = R.shape[0]
    self.U = np.vstack([AsySVD_compute_user_factors(R[i], self.Y) for i in range(M)])

    print("Done AsySVD fitting")
def fit(self, R):
    self.dataset = R
    R = check_matrix(R, 'csr', dtype=np.float32)

    self.X, self.Y = BPRMF_sgd(R,
                               num_factors=self.num_factors,
                               lrate=self.lrate,
                               user_reg=self.user_reg,
                               pos_reg=self.pos_reg,
                               neg_reg=self.neg_reg,
                               iters=self.iters,
                               sampling_type=self.sampling_type,
                               sample_with_replacement=self.sample_with_replacement,
                               use_resampling=self.use_resampling,
                               sampling_pop_alpha=self.sampling_pop_alpha,
                               init_mean=self.init_mean,
                               init_std=self.init_std,
                               lrate_decay=self.lrate_decay,
                               rnd_seed=self.rnd_seed,
                               verbose=self.verbose)
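# BPRMF_sgd itself is implemented elsewhere. For reference, a schematic NumPy sketch of a
# single BPR-Opt SGD update on one sampled (user, positive item, negative item) triple,
# using the textbook BPR gradients; this is an illustrative assumption, not necessarily
# the exact update rule used by this repository's Cython implementation.

import numpy as np

def bpr_single_update(U, V, user, pos_item, neg_item,
                      lrate=0.05, user_reg=0.0025, pos_reg=0.0025, neg_reg=0.00025):
    # Difference of predicted scores for the positive and negative items
    u_factors = U[user].copy()
    x_uij = u_factors.dot(V[pos_item] - V[neg_item])
    z = 1.0 / (1.0 + np.exp(x_uij))  # sigmoid(-x_uij)

    # Gradient ascent on the BPR-Opt objective with L2 regularization
    U[user] += lrate * (z * (V[pos_item] - V[neg_item]) - user_reg * u_factors)
    V[pos_item] += lrate * (z * u_factors - pos_reg * V[pos_item])
    V[neg_item] += lrate * (-z * u_factors - neg_reg * V[neg_item])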
def fit(self, l1_penalty=0.1, l2_penalty=0.1, positive_only=True, topK=100,
        workers=multiprocessing.cpu_count()):

    self.l1_penalty = l1_penalty
    self.l2_penalty = l2_penalty
    self.positive_only = positive_only
    self.l1_ratio = self.l1_penalty / (self.l1_penalty + self.l2_penalty)
    self.topK = topK
    self.workers = workers

    self.URM_train = check_matrix(self.URM_train, 'csc', dtype=np.float32)
    n_items = self.URM_train.shape[1]

    # fit each item's factors in parallel

    # partial object bound to the function, with the fixed part of the input predefined
    _pfit = partial(self._partial_fit, X=self.URM_train, topK=self.topK)

    # create a pool with the requested number of worker processes
    pool = Pool(processes=self.workers)

    # run the pool, passing the partially-applied function and the remaining,
    # variable parameter (the item index)
    res = pool.map(_pfit, np.arange(n_items))

    # res contains a vector of (values, rows, cols) tuples
    values, rows, cols = [], [], []
    for values_, rows_, cols_ in res:
        values.extend(values_)
        rows.extend(rows_)
        cols.extend(cols_)

    # generate the sparse weight matrix
    self.W_sparse = sps.csc_matrix((values, (rows, cols)),
                                   shape=(n_items, n_items),
                                   dtype=np.float32)
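# The parallel fit above relies on functools.partial to freeze the shared arguments so that
# Pool.map only streams the variable item index to the workers. A minimal standalone sketch
# of the same pattern, with a hypothetical square_plus worker standing in for _partial_fit:

from functools import partial
from multiprocessing import Pool

def square_plus(item_index, offset):
    # hypothetical stand-in for _partial_fit: one independent task per item
    return item_index ** 2 + offset

if __name__ == "__main__":
    task = partial(square_plus, offset=10)     # bind the fixed argument
    with Pool(processes=4) as pool:
        results = pool.map(task, range(8))     # stream only the variable argument
    print(results)                             # [10, 11, 14, 19, 26, 35, 46, 59]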
def __init__(self, URM_train):
    super(FunkSVD, self).__init__()
    self.URM_train = check_matrix(URM_train, 'csr', dtype=np.float32)
def fit(self, l1_penalty=0.1, l2_penalty=0.1, positive_only=True, topK=100):

    self.l1_penalty = l1_penalty
    self.l2_penalty = l2_penalty
    self.positive_only = positive_only
    self.l1_ratio = self.l1_penalty / (self.l1_penalty + self.l2_penalty)
    self.topK = topK

    X = check_matrix(self.URM_train, 'csc', dtype=np.float32)
    n_items = X.shape[1]

    # initialize the ElasticNet model
    self.model = ElasticNet(alpha=.001,
                            l1_ratio=self.l1_ratio,
                            positive=self.positive_only,
                            fit_intercept=False,
                            copy_X=False,
                            precompute=True,
                            selection='random',
                            max_iter=100,
                            tol=1e-4)

    # we'll store the W matrix in a sparse format,
    # so initialize the vectors used by the sparse.csc_matrix constructor
    values, rows, cols = [], [], []

    start_time = time.time()
    start_time_printBatch = start_time

    # fit each item's factors sequentially (not in parallel)
    for currentItem in range(n_items):

        # get the target column
        y = X[:, currentItem].toarray()

        # set the j-th column of X to zero
        startptr = X.indptr[currentItem]
        endptr = X.indptr[currentItem + 1]

        bak = X.data[startptr:endptr].copy()
        X.data[startptr:endptr] = 0.0

        # fit one ElasticNet model per column
        self.model.fit(X, y)

        # self.model.coef_ contains the coefficients of the ElasticNet model
        # let's keep only the non-zero values
        # nnz_idx = self.model.coef_ > 0.0

        # Select topK values.
        # Sorting is done in three steps. Faster than plain np.argsort for a high number of items:
        # - Partition the data to extract the set of relevant items
        # - Sort only the relevant items
        # - Get the original item index
        relevant_items_partition = (-self.model.coef_).argpartition(self.topK)[0:self.topK]
        relevant_items_partition_sorting = np.argsort(-self.model.coef_[relevant_items_partition])
        ranking = relevant_items_partition[relevant_items_partition_sorting]

        notZerosMask = self.model.coef_[ranking] > 0.0
        ranking = ranking[notZerosMask]

        values.extend(self.model.coef_[ranking])
        rows.extend(ranking)
        cols.extend([currentItem] * len(ranking))

        # finally, restore the original values of the j-th column
        X.data[startptr:endptr] = bak

        if time.time() - start_time_printBatch > 300:
            print("Processed {} ( {:.2f}% ) in {:.2f} minutes. Columns per second: {:.0f}".format(
                currentItem,
                100.0 * float(currentItem) / n_items,
                (time.time() - start_time) / 60,
                float(currentItem) / (time.time() - start_time)))

            sys.stdout.flush()
            sys.stderr.flush()

            start_time_printBatch = time.time()

    # generate the sparse weight matrix
    self.W_sparse = sps.csc_matrix((values, (rows, cols)),
                                   shape=(n_items, n_items),
                                   dtype=np.float32)
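# The three-step top-K selection used above (partition, sort only the candidates, map back
# to the original indices) can be illustrated in isolation. A small sketch under the
# assumption of a made-up dense score vector:

import numpy as np

scores = np.array([0.1, 0.9, 0.0, 0.4, 0.7, 0.05])
topK = 3

# 1) partition: the first topK positions hold the topK largest scores, in arbitrary order
candidates = (-scores).argpartition(topK)[0:topK]
# 2) sort only those topK candidates by descending score
order = np.argsort(-scores[candidates])
# 3) map back to the original item indices
ranking = candidates[order]

print(ranking)            # [1 4 3]
print(scores[ranking])    # [0.9 0.7 0.4]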
def _generateTrainData_low_ram(self):

    print(self.RECOMMENDER_NAME + ": Generating train data")

    start_time_batch = time.time()

    # Here only the structure is important
    self.similarity = Compute_Similarity(self.ICM.T, shrink=0, topK=self.topK, normalize=False)
    S_matrix_contentKNN = self.similarity.compute_similarity()
    S_matrix_contentKNN = check_matrix(S_matrix_contentKNN, "csr")

    self._writeLog(self.RECOMMENDER_NAME +
                   ": Collaborative S density: {:.2E}, nonzero cells {}".format(
                       self.S_matrix_target.nnz / self.S_matrix_target.shape[0]**2,
                       self.S_matrix_target.nnz))

    self._writeLog(self.RECOMMENDER_NAME +
                   ": Content S density: {:.2E}, nonzero cells {}".format(
                       S_matrix_contentKNN.nnz / S_matrix_contentKNN.shape[0]**2,
                       S_matrix_contentKNN.nnz))

    if self.normalize_similarity:
        # Compute the sum of squared feature values, then take the square root
        sum_of_squared_features = np.array(self.ICM.T.power(2).sum(axis=0)).ravel()
        sum_of_squared_features = np.sqrt(sum_of_squared_features)

    num_common_coordinates = 0

    estimated_n_samples = int(S_matrix_contentKNN.nnz * (1 + self.add_zeros_quota) * 1.2)

    self.row_list = np.zeros(estimated_n_samples, dtype=np.int32)
    self.col_list = np.zeros(estimated_n_samples, dtype=np.int32)
    self.data_list = np.zeros(estimated_n_samples, dtype=np.float64)

    num_samples = 0

    for row_index in range(self.n_items):

        start_pos_content = S_matrix_contentKNN.indptr[row_index]
        end_pos_content = S_matrix_contentKNN.indptr[row_index + 1]

        content_coordinates = S_matrix_contentKNN.indices[start_pos_content:end_pos_content]

        start_pos_target = self.S_matrix_target.indptr[row_index]
        end_pos_target = self.S_matrix_target.indptr[row_index + 1]

        target_coordinates = self.S_matrix_target.indices[start_pos_target:end_pos_target]

        # Check whether the content coordinate is associated to a non-zero target value.
        # If true, the content coordinate has a collaborative non-zero value;
        # if false, the content coordinate has a collaborative zero value.
        is_common = np.in1d(content_coordinates, target_coordinates)

        num_common_in_current_row = is_common.sum()
        num_common_coordinates += num_common_in_current_row

        for index in range(len(is_common)):

            if num_samples == estimated_n_samples:
                dataBlock = 1000000
                self.row_list = np.concatenate((self.row_list, np.zeros(dataBlock, dtype=np.int32)))
                self.col_list = np.concatenate((self.col_list, np.zeros(dataBlock, dtype=np.int32)))
                self.data_list = np.concatenate((self.data_list, np.zeros(dataBlock, dtype=np.float64)))

            if is_common[index]:
                # If the cell exists in the target matrix, add its value;
                # otherwise it will remain zero with a certain probability

                col_index = content_coordinates[index]

                self.row_list[num_samples] = row_index
                self.col_list[num_samples] = col_index

                new_data_value = self.S_matrix_target[row_index, col_index]

                if self.normalize_similarity:
                    new_data_value *= sum_of_squared_features[row_index] * sum_of_squared_features[col_index]

                self.data_list[num_samples] = new_data_value

                num_samples += 1

            elif np.random.rand() <= self.add_zeros_quota:

                col_index = content_coordinates[index]

                self.row_list[num_samples] = row_index
                self.col_list[num_samples] = col_index
                self.data_list[num_samples] = 0.0

                num_samples += 1

        if time.time() - start_time_batch > 30 or num_samples == S_matrix_contentKNN.nnz * (1 + self.add_zeros_quota):

            print(self.RECOMMENDER_NAME + ": Generating train data. Sample {} ( {:.2f} %) ".format(
                num_samples,
                num_samples / (S_matrix_contentKNN.nnz * (1 + self.add_zeros_quota)) * 100))

            sys.stdout.flush()
            sys.stderr.flush()

            start_time_batch = time.time()

    self._writeLog(self.RECOMMENDER_NAME +
                   ": Content S structure has {} out of {} ( {:.2f}%) nonzero collaborative cells".format(
                       num_common_coordinates, S_matrix_contentKNN.nnz,
                       num_common_coordinates / S_matrix_contentKNN.nnz * 100))

    # Discard the unused cells at the end of the arrays
    self.row_list = self.row_list[:num_samples]
    self.col_list = self.col_list[:num_samples]
    self.data_list = self.data_list[:num_samples]

    data_nnz = sum(np.array(self.data_list) != 0)
    data_sum = sum(self.data_list)

    collaborative_nnz = self.S_matrix_target.nnz
    collaborative_sum = sum(self.S_matrix_target.data)

    self._writeLog(self.RECOMMENDER_NAME +
                   ": Nonzero collaborative cell sum is: {:.2E}, average is: {:.2E}, "
                   "average over all collaborative data is {:.2E}".format(
                       data_sum, data_sum / data_nnz, collaborative_sum / collaborative_nnz))
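# The core of the sampling loop above is matching the nonzero column indices of one CSR row
# against the corresponding row of the target matrix. A tiny standalone sketch of that
# np.in1d check, with made-up index arrays:

import numpy as np

# Hypothetical nonzero column indices of one row in the content and target matrices
content_coordinates = np.array([2, 5, 7, 11])
target_coordinates = np.array([5, 11, 40])

# True where the content coordinate also has a nonzero collaborative (target) value
is_common = np.in1d(content_coordinates, target_coordinates)

print(is_common)         # [False  True False  True]
print(is_common.sum())   # 2 coordinates are shared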
def compute_similarity(self, start_col=None, end_col=None, block_size=100):
    """
    Compute the similarity for the given dataset
    :param self:
    :param start_col: column to begin with
    :param end_col: column to stop before, end_col is excluded
    :return:
    """

    values = []
    rows = []
    cols = []

    start_time = time.time()
    start_time_print_batch = start_time
    processedItems = 0

    if self.adjusted_cosine:
        self.applyAdjustedCosine()

    elif self.pearson_correlation:
        self.applyPearsonCorrelation()

    elif self.tanimoto_coefficient or self.dice_coefficient or self.tversky_coefficient:
        self.useOnlyBooleanInteractions()

    # We explore the matrix column-wise
    self.dataMatrix = check_matrix(self.dataMatrix, 'csc')

    # Compute sum of squared values to be used in normalization
    sumOfSquared = np.array(self.dataMatrix.power(2).sum(axis=0)).ravel()

    # Tanimoto does not require the square root to be applied
    if not (self.tanimoto_coefficient or self.dice_coefficient or self.tversky_coefficient):
        sumOfSquared = np.sqrt(sumOfSquared)

    if self.asymmetric_cosine:
        sumOfSquared_to_1_minus_alpha = np.power(sumOfSquared, 2 * (1 - self.asymmetric_alpha))
        sumOfSquared_to_alpha = np.power(sumOfSquared, 2 * self.asymmetric_alpha)

    self.dataMatrix = check_matrix(self.dataMatrix, 'csc')

    start_col_local = 0
    end_col_local = self.n_columns

    if start_col is not None and start_col > 0 and start_col < self.n_columns:
        start_col_local = start_col

    if end_col is not None and end_col > start_col_local and end_col < self.n_columns:
        end_col_local = end_col

    start_col_block = start_col_local
    this_block_size = 0

    # Compute all similarities for each item using vectorization
    while start_col_block < end_col_local:

        # Add previous block size
        processedItems += this_block_size

        end_col_block = min(start_col_block + block_size, end_col_local)
        this_block_size = end_col_block - start_col_block

        if time.time() - start_time_print_batch >= 30 or end_col_block == end_col_local:
            columnPerSec = processedItems / (time.time() - start_time)

            print("Similarity column {} ( {:2.0f} % ), {:.2f} column/sec, elapsed time {:.2f} min".format(
                processedItems,
                processedItems / (end_col_local - start_col_local) * 100,
                columnPerSec,
                (time.time() - start_time) / 60))

            sys.stdout.flush()
            sys.stderr.flush()

            start_time_print_batch = time.time()

        # All data points for a given item
        item_data = self.dataMatrix[:, start_col_block:end_col_block]
        item_data = item_data.toarray().squeeze()

        if self.use_row_weights:
            # item_data = np.multiply(item_data, self.row_weights)
            # item_data = item_data.T.dot(self.row_weights_diag).T
            this_block_weights = self.dataMatrix_weighted.T.dot(item_data)

        else:
            # Compute item similarities
            this_block_weights = self.dataMatrix.T.dot(item_data)

        for col_index_in_block in range(this_block_size):

            if this_block_size == 1:
                this_column_weights = this_block_weights
            else:
                this_column_weights = this_block_weights[:, col_index_in_block]

            columnIndex = col_index_in_block + start_col_block
            this_column_weights[columnIndex] = 0.0

            # Apply normalization and shrinkage, ensure denominator != 0
            if self.normalize:

                if self.asymmetric_cosine:
                    denominator = sumOfSquared_to_alpha[columnIndex] * sumOfSquared_to_1_minus_alpha + self.shrink + 1e-6
                else:
                    denominator = sumOfSquared[columnIndex] * sumOfSquared + self.shrink + 1e-6

                this_column_weights = np.multiply(this_column_weights, 1 / denominator)

            # Apply the specific denominator for Tanimoto
            elif self.tanimoto_coefficient:
                denominator = sumOfSquared[columnIndex] + sumOfSquared - this_column_weights + self.shrink + 1e-6
                this_column_weights = np.multiply(this_column_weights, 1 / denominator)

            elif self.dice_coefficient:
                denominator = sumOfSquared[columnIndex] + sumOfSquared + self.shrink + 1e-6
                this_column_weights = np.multiply(this_column_weights, 1 / denominator)

            elif self.tversky_coefficient:
                denominator = this_column_weights + \
                              (sumOfSquared[columnIndex] - this_column_weights) * self.tversky_alpha + \
                              (sumOfSquared - this_column_weights) * self.tversky_beta + self.shrink + 1e-6
                this_column_weights = np.multiply(this_column_weights, 1 / denominator)

            # If no normalization or tanimoto is selected, apply only shrink
            elif self.shrink != 0:
                this_column_weights = this_column_weights / self.shrink

            # this_column_weights = this_column_weights.toarray().ravel()

            if self.TopK == 0:
                self.W_dense[:, columnIndex] = this_column_weights

            else:
                # Sort indices and select TopK.
                # Sorting is done in three steps. Faster than plain np.argsort for a high number of items:
                # - Partition the data to extract the set of relevant items
                # - Sort only the relevant items
                # - Get the original item index
                relevant_items_partition = (-this_column_weights).argpartition(self.TopK - 1)[0:self.TopK]
                relevant_items_partition_sorting = np.argsort(-this_column_weights[relevant_items_partition])
                top_k_idx = relevant_items_partition[relevant_items_partition_sorting]

                # Incrementally build the sparse matrix, do not add zeros
                notZerosMask = this_column_weights[top_k_idx] != 0.0
                numNotZeros = np.sum(notZerosMask)

                values.extend(this_column_weights[top_k_idx][notZerosMask])
                rows.extend(top_k_idx[notZerosMask])
                cols.extend(np.ones(numNotZeros) * columnIndex)

        start_col_block += block_size

    # End while on columns

    if self.TopK == 0:
        return self.W_dense

    else:
        W_sparse = sps.csr_matrix((values, (rows, cols)),
                                  shape=(self.n_columns, self.n_columns),
                                  dtype=np.float32)

        return W_sparse
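# To make the normalization step concrete, a small self-contained sketch of shrunk cosine
# similarity between one item column and all others, mirroring the sumOfSquared bookkeeping
# used in compute_similarity; the toy matrix and shrink value are assumptions.

import numpy as np
import scipy.sparse as sps

# Toy user-item matrix: 4 users x 3 items
X = sps.csc_matrix(np.array([[5.0, 3.0, 0.0],
                             [4.0, 0.0, 4.0],
                             [0.0, 2.0, 5.0],
                             [1.0, 1.0, 0.0]]))
shrink = 10.0

# Per-item Euclidean norms, as in the sumOfSquared precomputation
item_norms = np.sqrt(np.asarray(X.power(2).sum(axis=0)).ravel())

column_index = 0
dot_products = np.asarray(X.T.dot(X[:, column_index]).todense()).ravel()
dot_products[column_index] = 0.0   # exclude self-similarity

# Shrunk cosine: dot product divided by the product of norms plus the shrink term
similarities = dot_products / (item_norms[column_index] * item_norms + shrink + 1e-6)
print(similarities)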