def fit(self, topK=50, shrink=100, similarity='cosine', normalize=True,
        feature_weighting="none", **similarity_args):

    self.topK = topK
    self.shrink = shrink

    if feature_weighting not in self.FEATURE_WEIGHTING_VALUES:
        raise ValueError(
            "Value for 'feature_weighting' not recognized. Acceptable values are {}, provided was '{}'"
            .format(self.FEATURE_WEIGHTING_VALUES, feature_weighting))

    if feature_weighting == "BM25":
        self.UCM_train = self.UCM_train.astype(np.float32)
        self.UCM_train = okapi_BM_25(self.UCM_train)

    elif feature_weighting == "TF-IDF":
        self.UCM_train = self.UCM_train.astype(np.float32)
        self.UCM_train = TF_IDF(self.UCM_train)

    similarity = Compute_Similarity(self.UCM_train.T,
                                    shrink=shrink,
                                    topK=topK,
                                    normalize=normalize,
                                    similarity=similarity,
                                    **similarity_args)

    self.W_sparse = similarity.compute_similarity()
    self.W_sparse = check_matrix(self.W_sparse, format='csr')
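# A minimal usage sketch for the fit() above, assuming it belongs to the
# library's user-based content KNN recommender (the import path, class name,
# and random data below are illustrative assumptions; only the fit() arguments
# come from the code above).

import scipy.sparse as sps
from Recommenders.KNN.UserKNNCBFRecommender import UserKNNCBFRecommender

URM_train = sps.random(1000, 500, density=0.01, format='csr')  # users x items
UCM_train = sps.random(1000, 80, density=0.05, format='csr')   # users x features

recommender = UserKNNCBFRecommender(URM_train, UCM_train)
recommender.fit(topK=50, shrink=100, similarity='cosine',
                normalize=True, feature_weighting="TF-IDF")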
def fit(self, topK=None, l2_norm=1e3, normalize_matrix=False, verbose=True):

    self.verbose = verbose

    start_time = time.time()
    self._print("Fitting model... ")

    if normalize_matrix:
        # Normalize rows and then columns
        self.URM_train = normalize(self.URM_train, norm='l2', axis=1)
        self.URM_train = normalize(self.URM_train, norm='l2', axis=0)
        self.URM_train = sps.csr_matrix(self.URM_train)

    # The Gram matrix is X^T X, computed here as a dot-product similarity
    similarity = Compute_Similarity(self.URM_train,
                                    shrink=0,
                                    topK=self.URM_train.shape[1],
                                    normalize=False,
                                    similarity="cosine")

    gram_matrix = similarity.compute_similarity().toarray()

    diag_indices = np.diag_indices(gram_matrix.shape[0])

    # The Compute_Similarity object ensures the diagonal of the similarity matrix is zero.
    # In this case we need the diagonal as well, which is just the item popularity.
    item_popularity = np.ediff1d(self.URM_train.tocsc().indptr)
    gram_matrix[diag_indices] = item_popularity + l2_norm

    P = np.linalg.inv(gram_matrix)

    B = P / (-np.diag(P))
    B[diag_indices] = 0.0

    new_time_value, new_time_unit = seconds_to_biggest_unit(time.time() - start_time)
    self._print("Fitting model... done in {:.2f} {}".format(new_time_value, new_time_unit))

    # Check whether the matrix should be saved in a sparse or a dense format.
    # Regardless of topK, the matrix is stored as sparse only if its nonzero
    # cells are fewer than sparse_threshold_quota %.
    if topK is not None:
        B = similarityMatrixTopK(B, k=topK, verbose=False)

    if self._is_content_sparse_check(B):
        self._print("Detected model matrix to be sparse, changing format.")
        self.W_sparse = check_matrix(B, format='csr', dtype=np.float32)
    else:
        self.W_sparse = check_matrix(B, format='npy', dtype=np.float32)
        self._W_sparse_format_checked = True
        self._compute_item_score = self._compute_score_W_dense
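# For reference, the fit() above implements the EASE^R closed-form solution
# (Steck, "Embarrassingly Shallow Autoencoders for Sparse Data", WWW 2019):
#     P = (X^T X + l2_norm * I)^{-1}
#     B_ij = -P_ij / P_jj  for i != j,   B_ii = 0
# A minimal dense NumPy sketch of the same computation, without the framework
# helpers (Compute_Similarity, similarityMatrixTopK); the function name is
# illustrative.

import numpy as np

def ease_r_closed_form(X, l2_norm=1e3):
    # X: dense user-item matrix (users x items); X^T X is the item-item Gram matrix
    X = np.asarray(X, dtype=np.float64)
    gram = X.T @ X
    diag_indices = np.diag_indices(gram.shape[0])
    gram[diag_indices] += l2_norm  # L2 penalty on the diagonal
    P = np.linalg.inv(gram)
    B = P / (-np.diag(P))          # divide each column j by -P_jj
    B[diag_indices] = 0.0          # enforce the zero self-similarity constraint
    return B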
class CFW_D_Similarity_Cython(BaseItemCBFRecommender, BaseItemSimilarityMatrixRecommender,
                              Incremental_Training_Early_Stopping):

    RECOMMENDER_NAME = "CFW_D_Similarity_Cython"

    INIT_TYPE_VALUES = ["random", "one", "zero", "BM25", "TF-IDF"]

    def __init__(self, URM_train, ICM_train, S_matrix_target):
        super(CFW_D_Similarity_Cython, self).__init__(URM_train, ICM_train)

        if URM_train.shape[1] != ICM_train.shape[0]:
            raise ValueError(
                "Number of items not consistent. URM contains {} but ICM contains {}"
                .format(URM_train.shape[1], ICM_train.shape[0]))

        if S_matrix_target.shape[0] != S_matrix_target.shape[1]:
            raise ValueError(
                "Item similarity matrix is not square: rows are {}, columns are {}"
                .format(S_matrix_target.shape[0], S_matrix_target.shape[1]))

        if S_matrix_target.shape[0] != ICM_train.shape[0]:
            raise ValueError(
                "Number of items not consistent. S_matrix contains {} but ICM contains {}"
                .format(S_matrix_target.shape[0], ICM_train.shape[0]))

        self.S_matrix_target = check_matrix(S_matrix_target, 'csr')
        self.ICM = check_matrix(ICM_train, 'csr')
        self.n_features = self.ICM.shape[1]

    def fit(self, show_max_performance=False,
            precompute_common_features=False,
            learning_rate=0.1,
            positive_only_D=True,
            initialization_mode_D="random",
            normalize_similarity=False,
            use_dropout=True,
            dropout_perc=0.3,
            l1_reg=0.0,
            l2_reg=0.0,
            epochs=50,
            topK=300,
            add_zeros_quota=0.0,
            log_file=None,
            verbose=False,
            sgd_mode='adagrad',
            gamma=0.9,
            beta_1=0.9,
            beta_2=0.999,
            **earlystopping_kwargs):

        if initialization_mode_D not in self.INIT_TYPE_VALUES:
            raise ValueError(
                "Value for 'initialization_mode_D' not recognized. Acceptable values are {}, provided was '{}'"
                .format(self.INIT_TYPE_VALUES, initialization_mode_D))

        # Import the compiled module
        from Recommenders.FeatureWeighting.Cython.CFW_D_Similarity_Cython_SGD import CFW_D_Similarity_Cython_SGD

        self.show_max_performance = show_max_performance
        self.normalize_similarity = normalize_similarity
        self.learning_rate = learning_rate
        self.add_zeros_quota = add_zeros_quota
        self.l1_reg = l1_reg
        self.l2_reg = l2_reg
        self.epochs = epochs
        self.topK = topK
        self.log_file = log_file
        self.verbose = verbose

        self._generate_train_data()

        weights_initialization_D = None

        if initialization_mode_D == "random":
            weights_initialization_D = np.random.normal(0.001, 0.1, self.n_features).astype(np.float64)
        elif initialization_mode_D == "one":
            weights_initialization_D = np.ones(self.n_features, dtype=np.float64)
        elif initialization_mode_D == "zero":
            weights_initialization_D = np.zeros(self.n_features, dtype=np.float64)
        elif initialization_mode_D == "BM25":
            weights_initialization_D = np.ones(self.n_features, dtype=np.float64)
            self.ICM = self.ICM.astype(np.float32)
            self.ICM = okapi_BM_25(self.ICM)
        elif initialization_mode_D == "TF-IDF":
            weights_initialization_D = np.ones(self.n_features, dtype=np.float64)
            self.ICM = self.ICM.astype(np.float32)
            self.ICM = TF_IDF(self.ICM)
        else:
            raise ValueError("CFW_D_Similarity_Cython: 'initialization_mode_D' not recognized")

        # Instantiate the fast Cython implementation
        self.FW_D_Similarity = CFW_D_Similarity_Cython_SGD(
            self.row_list, self.col_list, self.data_list,
            self.n_features, self.ICM,
            precompute_common_features=precompute_common_features,
            positive_only_D=positive_only_D,
            weights_initialization_D=weights_initialization_D,
            use_dropout=use_dropout,
            dropout_perc=dropout_perc,
            learning_rate=learning_rate,
            l1_reg=l1_reg,
            l2_reg=l2_reg,
            sgd_mode=sgd_mode,
            verbose=self.verbose,
            gamma=gamma,
            beta_1=beta_1,
            beta_2=beta_2)

        if self.verbose:
            print(self.RECOMMENDER_NAME + ": Initialization completed")
        self.D_incremental = self.FW_D_Similarity.get_weights()
        self.D_best = self.D_incremental.copy()

        self._train_with_early_stopping(epochs,
                                        algorithm_name=self.RECOMMENDER_NAME,
                                        **earlystopping_kwargs)

        self.compute_W_sparse(model_to_use="best")

        sys.stdout.flush()

    def _prepare_model_for_validation(self):
        self.D_incremental = self.FW_D_Similarity.get_weights()
        self.compute_W_sparse(model_to_use="last")

    def _update_best_model(self):
        self.D_best = self.D_incremental.copy()

    def _run_epoch(self, num_epoch):
        self.loss = self.FW_D_Similarity.fit()

    def _generate_train_data(self):
        if self.verbose:
            print(self.RECOMMENDER_NAME + ": Generating train data")

        start_time_batch = time.time()

        # Here only the structure of the content similarity matters
        self.similarity = Compute_Similarity(self.ICM.T, shrink=0, topK=self.topK, normalize=False)
        S_matrix_contentKNN = self.similarity.compute_similarity()
        S_matrix_contentKNN = check_matrix(S_matrix_contentKNN, "csr")

        self._print("Collaborative S density: {:.2E}, nonzero cells {}".format(
            self.S_matrix_target.nnz / self.S_matrix_target.shape[0]**2,
            self.S_matrix_target.nnz))

        self._print("Content S density: {:.2E}, nonzero cells {}".format(
            S_matrix_contentKNN.nnz / S_matrix_contentKNN.shape[0]**2,
            S_matrix_contentKNN.nnz))

        if self.normalize_similarity:
            # Compute the square root of the sum of squared features per item
            sum_of_squared_features = np.array(self.ICM.T.power(2).sum(axis=0)).ravel()
            sum_of_squared_features = np.sqrt(sum_of_squared_features)

        num_common_coordinates = 0

        estimated_n_samples = int(S_matrix_contentKNN.nnz * (1 + self.add_zeros_quota) * 1.2)

        self.row_list = np.zeros(estimated_n_samples, dtype=np.int32)
        self.col_list = np.zeros(estimated_n_samples, dtype=np.int32)
        self.data_list = np.zeros(estimated_n_samples, dtype=np.float64)

        num_samples = 0

        for row_index in range(self.n_items):

            start_pos_content = S_matrix_contentKNN.indptr[row_index]
            end_pos_content = S_matrix_contentKNN.indptr[row_index + 1]

            content_coordinates = S_matrix_contentKNN.indices[start_pos_content:end_pos_content]

            start_pos_target = self.S_matrix_target.indptr[row_index]
            end_pos_target = self.S_matrix_target.indptr[row_index + 1]

            target_coordinates = self.S_matrix_target.indices[start_pos_target:end_pos_target]

            # Check whether each content coordinate is associated to a nonzero target value.
            # If true, the content coordinate has a collaborative nonzero value;
            # if false, the content coordinate has a collaborative zero value.
            is_common = np.in1d(content_coordinates, target_coordinates)

            num_common_in_current_row = is_common.sum()
            num_common_coordinates += num_common_in_current_row

            for index in range(len(is_common)):

                if num_samples == estimated_n_samples:
                    dataBlock = 1000000
                    self.row_list = np.concatenate((self.row_list, np.zeros(dataBlock, dtype=np.int32)))
                    self.col_list = np.concatenate((self.col_list, np.zeros(dataBlock, dtype=np.int32)))
                    self.data_list = np.concatenate((self.data_list, np.zeros(dataBlock, dtype=np.float64)))

                if is_common[index]:
                    # The cell exists in the target matrix, so add its value;
                    # otherwise it remains zero and is kept only with a certain probability below
                    col_index = content_coordinates[index]

                    self.row_list[num_samples] = row_index
                    self.col_list[num_samples] = col_index

                    new_data_value = self.S_matrix_target[row_index, col_index]

                    if self.normalize_similarity:
                        new_data_value *= sum_of_squared_features[row_index] * sum_of_squared_features[col_index]

                    self.data_list[num_samples] = new_data_value

                    num_samples += 1

                elif np.random.rand() <= self.add_zeros_quota:
                    col_index = content_coordinates[index]

                    self.row_list[num_samples] = row_index
                    self.col_list[num_samples] = col_index
                    self.data_list[num_samples] = 0.0

                    num_samples += 1

                if self.verbose and (time.time() - start_time_batch > 30 or
                                     num_samples == S_matrix_contentKNN.nnz * (1 + self.add_zeros_quota)):

                    print(self.RECOMMENDER_NAME + ": Generating train data. Sample {} ({:4.1f}%) ".format(
                        num_samples,
                        num_samples / (S_matrix_contentKNN.nnz * (1 + self.add_zeros_quota)) * 100))

                    sys.stdout.flush()
                    sys.stderr.flush()

                    start_time_batch = time.time()

        self._print("Content S structure has {} out of {} ({:4.1f}%) nonzero collaborative cells".format(
            num_common_coordinates, S_matrix_contentKNN.nnz,
            num_common_coordinates / S_matrix_contentKNN.nnz * 100))

        # Discard the unused cells at the end of the arrays
        self.row_list = self.row_list[:num_samples]
        self.col_list = self.col_list[:num_samples]
        self.data_list = self.data_list[:num_samples]

        data_nnz = sum(np.array(self.data_list) != 0)
        data_sum = sum(self.data_list)

        collaborative_nnz = self.S_matrix_target.nnz
        collaborative_sum = sum(self.S_matrix_target.data)

        self._print("Nonzero collaborative cell sum is: {:.2E}, average is: {:.2E}, "
                    "average over all collaborative data is {:.2E}".format(
                        data_sum, data_sum / data_nnz, collaborative_sum / collaborative_nnz))

    def compute_W_sparse(self, model_to_use="best"):
        if model_to_use == "last":
            feature_weights = self.D_incremental
        elif model_to_use == "best":
            feature_weights = self.D_best
        else:
            assert False, "{}: compute_W_sparse, 'model_to_use' parameter not recognized".format(
                self.RECOMMENDER_NAME)

        self.similarity = Compute_Similarity(self.ICM.T,
                                             shrink=0,
                                             topK=self.topK,
                                             normalize=self.normalize_similarity,
                                             row_weights=feature_weights)

        self.W_sparse = self.similarity.compute_similarity()
        self.W_sparse = check_matrix(self.W_sparse, format='csr')

    def set_ICM_and_recompute_W(self, ICM_new, recompute_w=True):
        self.ICM = ICM_new.copy()

        if recompute_w:
            self.compute_W_sparse(model_to_use="best")

    def save_model(self, folder_path, file_name=None):
        if file_name is None:
            file_name = self.RECOMMENDER_NAME

        print("{}: Saving model in file '{}'".format(self.RECOMMENDER_NAME, folder_path + file_name))

        data_dict_to_save = {
            "D_best": self.D_best,
            "topK": self.topK,
            "W_sparse": self.W_sparse,
            "normalize_similarity": self.normalize_similarity
        }

        dataIO = DataIO(folder_path=folder_path)
        dataIO.save_data(file_name=file_name, data_dict_to_save=data_dict_to_save)

        print("{}: Saving complete".format(self.RECOMMENDER_NAME))
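# A minimal usage sketch for CFW_D_Similarity_Cython as defined above. The model
# learns per-feature weights D so that the weighted content similarity
# approximates the collaborative one:
#     S_target[i, j] ~ sum over f of D[f] * ICM[i, f] * ICM[j, f]
# The random data below is illustrative; in practice S_matrix_target would be
# the W_sparse of a trained collaborative item KNN recommender.

import scipy.sparse as sps

n_users, n_items, n_features = 1000, 500, 80
URM_train = sps.random(n_users, n_items, density=0.01, format='csr')
ICM_train = sps.random(n_items, n_features, density=0.05, format='csr')
S_target = sps.random(n_items, n_items, density=0.01, format='csr')

recommender = CFW_D_Similarity_Cython(URM_train, ICM_train, S_target)
recommender.fit(initialization_mode_D="one",
                learning_rate=0.05,
                epochs=10,
                topK=300,
                sgd_mode='adagrad')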
def compute_W_sparse(self, use_D=True, use_V=True, model_to_use="best"):

    assert model_to_use in ["last", "best"], \
        "{}: compute_W_sparse, 'model_to_use' parameter not recognized".format(self.RECOMMENDER_NAME)

    if self.verbose:
        print("FBSM_Rating_Cython: Building similarity matrix...")

    start_time = time.time()
    start_time_print_batch = start_time

    # Diagonal term: per-feature weights D
    if use_D:
        if model_to_use == "last":
            D = self.D_incremental
        else:
            D = self.D_best

        similarity = Compute_Similarity(self.ICM.T,
                                        shrink=0,
                                        topK=self.topK,
                                        normalize=False,
                                        row_weights=D)

        self.W_sparse = similarity.compute_similarity()
    else:
        self.W_sparse = sps.csr_matrix((self.n_items, self.n_items))

    if use_V:
        if model_to_use == "last":
            V = self.V_incremental
        else:
            V = self.V_best

        # V * V.T term
        W1 = self.ICM.dot(V.T)
        # self.W_sparse += W1.dot(W1.T)

        # Use arrays as they reduce memory requirements compared to lists
        dataBlock = 10000000

        values = np.zeros(dataBlock, dtype=np.float32)
        rows = np.zeros(dataBlock, dtype=np.int32)
        cols = np.zeros(dataBlock, dtype=np.int32)

        numCells = 0

        for numItem in range(self.n_items):

            V_weights = W1[numItem, :].dot(W1.T)
            V_weights[numItem] = 0.0

            relevant_items_partition = (-V_weights).argpartition(self.topK - 1)[0:self.topK]
            relevant_items_partition_sorting = np.argsort(-V_weights[relevant_items_partition])
            top_k_idx = relevant_items_partition[relevant_items_partition_sorting]

            # Incrementally build the sparse matrix, do not add zeros
            notZerosMask = V_weights[top_k_idx] != 0.0
            numNotZeros = np.sum(notZerosMask)

            values_to_add = V_weights[top_k_idx][notZerosMask]
            rows_to_add = top_k_idx[notZerosMask]
            cols_to_add = np.ones(numNotZeros) * numItem

            for index in range(len(values_to_add)):

                if numCells == len(rows):
                    rows = np.concatenate((rows, np.zeros(dataBlock, dtype=np.int32)))
                    cols = np.concatenate((cols, np.zeros(dataBlock, dtype=np.int32)))
                    values = np.concatenate((values, np.zeros(dataBlock, dtype=np.float32)))

                rows[numCells] = rows_to_add[index]
                cols[numCells] = cols_to_add[index]
                values[numCells] = values_to_add[index]

                numCells += 1

            if self.verbose and (time.time() - start_time_print_batch >= 30 or numItem == self.n_items - 1):
                columnPerSec = numItem / (time.time() - start_time)

                print("Weighted similarity column {} ( {:2.0f} % ), {:.2f} column/sec, elapsed time {:.2f} min".format(
                    numItem, numItem / self.n_items * 100, columnPerSec, (time.time() - start_time) / 60))

                sys.stdout.flush()
                sys.stderr.flush()

                start_time_print_batch = time.time()

        V_weights = sps.csr_matrix((values[:numCells], (rows[:numCells], cols[:numCells])),
                                   shape=(self.n_items, self.n_items),
                                   dtype=np.float32)

        self.W_sparse += V_weights

    self.W_sparse = check_matrix(self.W_sparse, format='csr')

    if self.verbose:
        print("FBSM_Rating_Cython: Building similarity matrix... complete")
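# The method above assembles the FBSM item similarity from two terms,
#     W = ICM @ diag(D) @ ICM.T  +  (ICM @ V.T) @ (ICM @ V.T).T
# with the diagonal zeroed and each row truncated to its topK entries.
# A minimal dense NumPy sketch of the same quantity, without the topK
# truncation; the function name and dense inputs are illustrative assumptions
# (the method itself works on sparse matrices, one row at a time).

import numpy as np

def fbsm_similarity_dense(ICM, D, V):
    # Diagonal term: weight each feature f by D[f]
    W = (ICM * D) @ ICM.T
    # Low-rank term: project items onto the latent factors V, then take dot products
    W1 = ICM @ V.T
    W += W1 @ W1.T
    # No self-similarity
    np.fill_diagonal(W, 0.0)
    return W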
class CFW_D_Similarity_Linalg(BaseItemCBFRecommender, BaseItemSimilarityMatrixRecommender):

    RECOMMENDER_NAME = "CFW_D_Similarity_Linalg"

    def __init__(self, URM_train, ICM_train, S_matrix_target):
        super(CFW_D_Similarity_Linalg, self).__init__(URM_train, ICM_train)

        if URM_train.shape[1] != ICM_train.shape[0]:
            raise ValueError(
                "Number of items not consistent. URM contains {} but ICM contains {}"
                .format(URM_train.shape[1], ICM_train.shape[0]))

        if S_matrix_target.shape[0] != S_matrix_target.shape[1]:
            raise ValueError(
                "Item similarity matrix is not square: rows are {}, columns are {}"
                .format(S_matrix_target.shape[0], S_matrix_target.shape[1]))

        if S_matrix_target.shape[0] != ICM_train.shape[0]:
            raise ValueError(
                "Number of items not consistent. S_matrix contains {} but ICM contains {}"
                .format(S_matrix_target.shape[0], ICM_train.shape[0]))

        self.S_matrix_target = check_matrix(S_matrix_target, 'csr')
        self.ICM = check_matrix(ICM_train, 'csr')
        self.n_features = self.ICM.shape[1]

    def fit(self, show_max_performance=False,
            loss_tolerance=1e-6,
            iteration_limit=50000,
            damp_coeff=0.0,
            topK=300,
            add_zeros_quota=0.0,
            normalize_similarity=False):

        self.show_max_performance = show_max_performance
        self.add_zeros_quota = add_zeros_quota
        self.normalize_similarity = normalize_similarity
        self.topK = topK

        self._generate_train_data()

        commonFeatures = self.ICM[self.row_list].multiply(self.ICM[self.col_list])

        linalg_result = linalg.lsqr(commonFeatures,
                                    self.data_list,
                                    show=False,
                                    atol=loss_tolerance,
                                    btol=loss_tolerance,
                                    iter_lim=iteration_limit,
                                    damp=damp_coeff)

        # linalg_result = linalg.lsmr(commonFeatures, self.data_list, show=False,
        #                             atol=loss_tolerance, btol=loss_tolerance,
        #                             maxiter=iteration_limit, damp=damp_coeff)

        self.D_best = linalg_result[0].copy()
        self.loss = linalg_result[3]

        self.compute_W_sparse()

    def _generate_train_data(self):
        if self.verbose:
            print(self.RECOMMENDER_NAME + ": Generating train data")

        start_time_batch = time.time()

        # Here only the structure of the content similarity matters
        self.similarity = Compute_Similarity(self.ICM.T, shrink=0, topK=self.topK, normalize=False)
        S_matrix_contentKNN = self.similarity.compute_similarity()
        S_matrix_contentKNN = check_matrix(S_matrix_contentKNN, "csr")

        self._print("Collaborative S density: {:.2E}, nonzero cells {}".format(
            self.S_matrix_target.nnz / self.S_matrix_target.shape[0]**2,
            self.S_matrix_target.nnz))

        self._print("Content S density: {:.2E}, nonzero cells {}".format(
            S_matrix_contentKNN.nnz / S_matrix_contentKNN.shape[0]**2,
            S_matrix_contentKNN.nnz))

        if self.normalize_similarity:
            # Compute the square root of the sum of squared features per item
            sum_of_squared_features = np.array(self.ICM.T.power(2).sum(axis=0)).ravel()
            sum_of_squared_features = np.sqrt(sum_of_squared_features)

        num_common_coordinates = 0

        estimated_n_samples = int(S_matrix_contentKNN.nnz * (1 + self.add_zeros_quota) * 1.2)

        self.row_list = np.zeros(estimated_n_samples, dtype=np.int32)
        self.col_list = np.zeros(estimated_n_samples, dtype=np.int32)
        self.data_list = np.zeros(estimated_n_samples, dtype=np.float64)

        num_samples = 0

        for row_index in range(self.n_items):

            start_pos_content = S_matrix_contentKNN.indptr[row_index]
            end_pos_content = S_matrix_contentKNN.indptr[row_index + 1]

            content_coordinates = S_matrix_contentKNN.indices[start_pos_content:end_pos_content]

            start_pos_target = self.S_matrix_target.indptr[row_index]
            end_pos_target = self.S_matrix_target.indptr[row_index + 1]

            target_coordinates = self.S_matrix_target.indices[start_pos_target:end_pos_target]

            # Check whether each content coordinate is associated to a nonzero target value.
            # If true, the content coordinate has a collaborative nonzero value;
            # if false, the content coordinate has a collaborative zero value.
            is_common = np.in1d(content_coordinates, target_coordinates)

            num_common_in_current_row = is_common.sum()
            num_common_coordinates += num_common_in_current_row

            for index in range(len(is_common)):

                if num_samples == estimated_n_samples:
                    dataBlock = 1000000
                    self.row_list = np.concatenate((self.row_list, np.zeros(dataBlock, dtype=np.int32)))
                    self.col_list = np.concatenate((self.col_list, np.zeros(dataBlock, dtype=np.int32)))
                    self.data_list = np.concatenate((self.data_list, np.zeros(dataBlock, dtype=np.float64)))

                if is_common[index]:
                    # The cell exists in the target matrix, so add its value;
                    # otherwise it remains zero and is kept only with a certain probability below
                    col_index = content_coordinates[index]

                    self.row_list[num_samples] = row_index
                    self.col_list[num_samples] = col_index

                    new_data_value = self.S_matrix_target[row_index, col_index]

                    if self.normalize_similarity:
                        new_data_value *= sum_of_squared_features[row_index] * sum_of_squared_features[col_index]

                    self.data_list[num_samples] = new_data_value

                    num_samples += 1

                elif np.random.rand() <= self.add_zeros_quota:
                    col_index = content_coordinates[index]

                    self.row_list[num_samples] = row_index
                    self.col_list[num_samples] = col_index
                    self.data_list[num_samples] = 0.0

                    num_samples += 1

                if self.verbose and (time.time() - start_time_batch > 30 or
                                     num_samples == S_matrix_contentKNN.nnz * (1 + self.add_zeros_quota)):

                    print(self.RECOMMENDER_NAME + ": Generating train data. Sample {} ({:4.1f}%) ".format(
                        num_samples,
                        num_samples / (S_matrix_contentKNN.nnz * (1 + self.add_zeros_quota)) * 100))

                    sys.stdout.flush()
                    sys.stderr.flush()

                    start_time_batch = time.time()

        self._print("Content S structure has {} out of {} ({:4.1f}%) nonzero collaborative cells".format(
            num_common_coordinates, S_matrix_contentKNN.nnz,
            num_common_coordinates / S_matrix_contentKNN.nnz * 100))

        # Discard the unused cells at the end of the arrays
        self.row_list = self.row_list[:num_samples]
        self.col_list = self.col_list[:num_samples]
        self.data_list = self.data_list[:num_samples]

        data_nnz = sum(np.array(self.data_list) != 0)
        data_sum = sum(self.data_list)

        collaborative_nnz = self.S_matrix_target.nnz
        collaborative_sum = sum(self.S_matrix_target.data)

        self._print("Nonzero collaborative cell sum is: {:.2E}, average is: {:.2E}, "
                    "average over all collaborative data is {:.2E}".format(
                        data_sum, data_sum / data_nnz, collaborative_sum / collaborative_nnz))

    def compute_W_sparse(self):
        self.similarity = Compute_Similarity(self.ICM.T,
                                             shrink=0,
                                             topK=self.topK,
                                             normalize=self.normalize_similarity,
                                             row_weights=self.D_best)

        self.W_sparse = self.similarity.compute_similarity()
        self.W_sparse = check_matrix(self.W_sparse, format='csr')

    def save_model(self, folder_path, file_name=None):
        if file_name is None:
            file_name = self.RECOMMENDER_NAME

        print("{}: Saving model in file '{}'".format(self.RECOMMENDER_NAME, folder_path + file_name))

        data_dict_to_save = {
            "D_best": self.D_best,
            "topK": self.topK,
            "W_sparse": self.W_sparse,
            "normalize_similarity": self.normalize_similarity
        }

        dataIO = DataIO(folder_path=folder_path)
        dataIO.save_data(file_name=file_name, data_dict_to_save=data_dict_to_save)

        print("{}: Saving complete".format(self.RECOMMENDER_NAME))
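# The fit() of CFW_D_Similarity_Linalg above casts feature weighting as a
# linear least-squares problem: each sampled item pair (i, j) contributes one
# design-matrix row ICM[i] * ICM[j] (element-wise product), and scipy's lsqr
# solves
#     min_D || commonFeatures @ D - data_list ||^2 + damp^2 * ||D||^2
# A minimal self-contained sketch with random sparse data; everything except
# the lsqr call itself is illustrative.

import numpy as np
import scipy.sparse as sps
from scipy.sparse import linalg

n_items, n_features, n_samples = 100, 20, 500
ICM = sps.random(n_items, n_features, density=0.2, format='csr')
rows = np.random.randint(0, n_items, size=n_samples)   # sampled item pairs (i, j)
cols = np.random.randint(0, n_items, size=n_samples)
targets = np.random.rand(n_samples)                    # collaborative similarities s_ij

common_features = ICM[rows].multiply(ICM[cols])        # one row per (i, j) sample
result = linalg.lsqr(common_features, targets, damp=0.0, iter_lim=1000)
D = result[0]                                          # learned per-feature weights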