class AlternatingLeastSquare:

    def __init__(self, n_factors=400, regularization=0.1104, iterations=50):
        self.n_factors = n_factors
        self.regularization = regularization
        self.iterations = iterations
        self.helper = BaseFunction()

    def run_fit(self):
        # Initialize the ALS model and fit it using the sparse item-user matrix
        model = implicit.als.AlternatingLeastSquares(factors=self.n_factors,
                                                     regularization=self.regularization,
                                                     iterations=self.iterations)

        alpha_val = 24
        # Calculate the confidence by multiplying the interactions by our alpha value
        data_conf = (self.sparse_item_user * alpha_val).astype('double')

        # Fit the model
        model.fit(data_conf)

        # Get the user and item latent factors from our trained model
        self.user_factors = model.user_factors
        self.item_factors = model.item_factors

    def fit(self, URM, tuning=False, user_path=USER_PATH, item_path=ITEM_PATH):
        self.URM = URM
        self.sparse_item_user = self.URM.T

        if tuning:
            if not os.path.exists(os.getcwd() + user_path) and not os.path.exists(os.getcwd() + item_path):
                self.run_fit()
                self.helper.export_nparr(user_path, self.user_factors)
                self.helper.export_nparr(item_path, self.item_factors)
            self.user_factors = self.helper.import_nparr(user_path)
            self.item_factors = self.helper.import_nparr(item_path)
        else:
            self.run_fit()

    def get_expected_ratings(self, user_id):
        scores = np.dot(self.user_factors[user_id], self.item_factors.T)
        return np.squeeze(scores)

    def recommend(self, user_id, at=10):
        expected_ratings = self.get_expected_ratings(user_id)
        recommended_items = np.flip(np.argsort(expected_ratings), 0)

        # Filter out items the user has already seen
        unseen_items_mask = np.in1d(recommended_items,
                                    self.URM[user_id].indices,
                                    assume_unique=True,
                                    invert=True)
        recommended_items = recommended_items[unseen_items_mask]
        return recommended_items[0:at]
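# Minimal usage sketch (illustrative, not part of the original repo): fit the ALS
# recommender above on a small random URM instead of the challenge dataset. Assumes
# the `implicit` library and the module constants USER_PATH/ITEM_PATH are available.
import numpy as np
import scipy.sparse as sps

if __name__ == "__main__":
    toy_URM = sps.random(50, 100, density=0.05, format='csr', dtype=np.float32)
    als = AlternatingLeastSquare(n_factors=10, iterations=5)
    als.fit(toy_URM)                         # tuning=False: factors computed from scratch
    print(als.recommend(user_id=0, at=10))   # top-10 unseen items for user 0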
class SLIM_BPR_Cython(object):

    #######################################################################################
    #                                   INIT SLIM_BPR                                     #
    #######################################################################################

    def __init__(self, positive_threshold=1, recompile_cython=False,
                 final_model_sparse_weights=True, train_with_sparse_weights=False,
                 symmetric=True, epochs=200, batch_size=1, lambda_i=0.01,
                 lambda_j=0.001, learning_rate=0.01, topK=10, sgd_mode='adagrad',
                 gamma=0.995, beta_1=0.9, beta_2=0.999):

        #### Retrieving parameters for fitting #######
        self.epochs = epochs
        self.batch_size = batch_size
        self.lambda_i = lambda_i
        self.lambda_j = lambda_j
        self.learning_rate = learning_rate
        self.topK = topK
        self.sgd_mode = sgd_mode
        self.gamma = gamma
        self.beta_1 = beta_1
        self.beta_2 = beta_2
        self.symmetric = symmetric
        #############################################

        self.normalize = False
        self.positive_threshold = positive_threshold
        self.train_with_sparse_weights = train_with_sparse_weights
        self.sparse_weights = final_model_sparse_weights
        self.helper = BaseFunction()

        if self.train_with_sparse_weights:
            self.sparse_weights = True

        if recompile_cython:
            print("Compiling in Cython")
            self.runCompilationScript()
            print("Compilation Complete")

    #######################################################################################
    #                                    RUN FITTING                                      #
    #######################################################################################

    def fit(self, URM_train, tuning=False, similarity_path=SIMILARITY_PATH):
        self.URM = URM_train
        self.tuning = tuning
        self.n_users = URM_train.shape[0]
        self.n_items = URM_train.shape[1]

        # Select only positive interactions
        URM_train_positive = self.URM.copy()
        self.URM_mask = self.URM.copy()
        self.URM_mask.data = self.URM_mask.data >= self.positive_threshold
        self.URM_mask.eliminate_zeros()

        assert self.URM_mask.nnz > 0, \
            "MatrixFactorization_Cython: URM_train_positive is empty, positive threshold is too high"

        # Start fitting
        URM_train_positive.data = URM_train_positive.data >= self.positive_threshold
        URM_train_positive.eliminate_zeros()

        from Recommenders.Slim.SlimBPR.Cython.SLIM_BPR_Cython_Epoch import SLIM_BPR_Cython_Epoch
        self.cythonEpoch = SLIM_BPR_Cython_Epoch(self.URM_mask,
                                                 train_with_sparse_weights=self.train_with_sparse_weights,
                                                 final_model_sparse_weights=self.sparse_weights,
                                                 topK=self.topK,
                                                 learning_rate=self.learning_rate,
                                                 li_reg=self.lambda_i,
                                                 lj_reg=self.lambda_j,
                                                 batch_size=self.batch_size,
                                                 symmetric=self.symmetric,
                                                 sgd_mode=self.sgd_mode,
                                                 gamma=self.gamma,
                                                 beta_1=self.beta_1,
                                                 beta_2=self.beta_2)

        self._initialize_incremental_model()
        self.epochs_best = 0

        currentEpoch = 0
        while currentEpoch < self.epochs:
            self._run_epoch()
            self._update_best_model()
            currentEpoch += 1

        self.get_S_incremental_and_set_W(similarity_path)
        self.cythonEpoch._dealloc()
        sys.stdout.flush()

        self.score = self.URM.dot(self.W_sparse)

    def _initialize_incremental_model(self):
        self.S_incremental = self.cythonEpoch.get_S()
        self.S_best = self.S_incremental.copy()

    def _update_incremental_model(self):
        self.get_S_incremental_and_set_W()

    def _update_best_model(self):
        self.S_best = self.S_incremental.copy()

    def _run_epoch(self):
        self.cythonEpoch.epochIteration_Cython()

    def get_S_incremental_and_set_W(self, similarity_path=SIMILARITY_PATH):
        self.S_incremental = self.cythonEpoch.get_S()

        if self.train_with_sparse_weights:
            if self.tuning:
                if not os.path.exists(os.getcwd() + similarity_path):
                    self.W_sparse = self.S_incremental
                    self.helper.export_similarity_matrix(os.getcwd() + similarity_path,
                                                         self.W_sparse,
                                                         name=RECOMMENDER_NAME)
                self.W_sparse = self.helper.import_similarity_matrix(os.getcwd() + similarity_path)
            else:
                self.W_sparse = self.S_incremental
        else:
            if self.tuning:
                if not os.path.exists(os.getcwd() + similarity_path):
                    self.W_sparse = similarityMatrixTopK(self.S_incremental, k=self.topK)
                    self.helper.export_similarity_matrix(os.getcwd() + similarity_path,
                                                         self.W_sparse,
                                                         name=RECOMMENDER_NAME)
                self.W_sparse = self.helper.import_similarity_matrix(os.getcwd() + similarity_path)
            else:
                self.W_sparse = similarityMatrixTopK(self.S_incremental, k=self.topK)

    def runCompilationScript(self):
        # Run the compile script setting the working directory to ensure the compiled
        # files are contained in the appropriate subfolder and not the project root
        file_subfolder = "/Slim/Cython"
        file_to_compile_list = ['SLIM_BPR_Cython_Epoch.pyx']

        run_compile_subprocess(file_subfolder, file_to_compile_list)

        print("{}: Compiled module {} in subfolder: {}".format(RECOMMENDER_NAME,
                                                               file_to_compile_list,
                                                               file_subfolder))

        # Command to run compilation script
        # python compile_script.py SLIM_BPR_Cython_Epoch.pyx build_ext --inplace

        # Command to generate html report
        # cython -a SLIM_BPR_Cython_Epoch.pyx

    def get_expected_ratings(self, user_id):
        expected_ratings = self.score[user_id].todense()
        return np.squeeze(np.asarray(expected_ratings))

    def recommend(self, user_id, at=10):
        # Compute the scores using the dot product
        scores = self.get_expected_ratings(user_id)
        ranking = scores.argsort()[::-1]

        # Filter out items the user has already interacted with
        unseen_items_mask = np.in1d(ranking, self.URM[user_id].indices,
                                    assume_unique=True, invert=True)
        ranking = ranking[unseen_items_mask]
        return ranking[:at]
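# Minimal usage sketch (illustrative, not part of the original repo): train SLIM-BPR
# for a few epochs on a small random implicit-feedback URM. Assumes the Cython epoch
# module has been compiled and the module constants (SIMILARITY_PATH, RECOMMENDER_NAME)
# are defined as in the original file.
import numpy as np
import scipy.sparse as sps

if __name__ == "__main__":
    toy_URM = sps.random(100, 60, density=0.1, format='csr', dtype=np.float32)
    toy_URM.data = np.ones_like(toy_URM.data)    # binarize: implicit feedback
    slim = SLIM_BPR_Cython(epochs=5, topK=10, learning_rate=0.01)
    slim.fit(toy_URM)                            # tuning=False: no cached similarity
    print(slim.recommend(user_id=0, at=10))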
def __init__(self):
    self.helper = BaseFunction()
    self.URM = None
    self.URM_train = None
    self.URM_test = None

    self.URM = self.helper.get_URM()
    self.helper.split_80_20()
    self.URM_train, self.URM_test = self.helper.URM_train, self.helper.URM_test

    self.helper.get_ICM()
    self.helper.get_UCM()
    self.helper.get_target_users()
    self.ICM_all = self.helper.ICM_all
    self.UCM_all = self.helper.UCM_all
    self.initial_target_user = self.helper.userlist_unique

    MAP_ItemCF_per_group = []
    MAP_UserCF_per_group = []
    MAP_ItemCBF_per_group = []
    MAP_UserCBF_per_group = []
    MAP_ItemCBF_BM25_per_group = []
    MAP_UserCBF_BM25_per_group = []
    MAP_ItemCBF_TFIDF_per_group = []
    MAP_UserCBF_TFIDF_per_group = []
    MAP_Slim_per_group = []
    MAP_Elastic_per_group = []
    MAP_PureSVD_per_group = []
    MAP_P3Alpha_per_group = []
    MAP_RP3Beta_per_group = []
    MAP_ALS_per_group = []
    MAP_Hybrid2_per_group = []
    MAP_Hybrid6_per_group = []
    MAP_H6_bis_per_group = []
    MAP_Hybrid7_per_group = []
    MAP_Hybrid8_per_group = []
    MAP_HybridCB_per_group = []

    # Users sorted by profile length, split into twenty 5% blocks
    self.profile_length = np.ediff1d(self.URM_train.indptr)
    self.blocksize = int(len(self.profile_length) * 0.05)
    self.sortedusers = np.argsort(self.profile_length)

    self.ItemCF = ItemKNNCFRecommender()
    self.UserCF = UserKNNCFRecommender()
    self.ItemCBF = ItemCBFKNNRecommender()
    self.UserCBF = UserCBFKNNRecommender()
    self.Slim = SLIM_BPR_Cython()
    self.Elastic = SLIMElasticNetRecommender()
    self.PureSVD = PureSVDRecommender()
    self.P3Alpha = P3AlphaRecommender()
    self.RP3Beta = RP3BetaRecommender()
    self.ALS = AlternatingLeastSquare()
    self.H6_bis = Hybrid_Combo6_bis("Combo6_bis", UserCBFKNNRecommender())

    self.ItemCBF.fit(self.URM_train, self.ICM_all, tuning=True,
                     similarity_path="/SimilarityProduct/ItemCBF_similarity.npz")
    self.UserCBF.fit(self.URM_train, self.UCM_all, tuning=True,
                     similarity_path="/SimilarityProduct/UserCBF_similarity.npz")
    self.ItemCF.fit(self.URM_train, tuning=True,
                    similarity_path="/SimilarityProduct/ItemCF_similarity.npz")
    self.UserCF.fit(self.URM_train, tuning=True,
                    similarity_path="/SimilarityProduct/UserCF_similarity.npz")
    self.Slim.fit(self.URM_train, tuning=True,
                  similarity_path="/SimilarityProduct/Slim_similarity.npz")
    self.Elastic.fit(self.URM_train, tuning=True,
                     similarity_path="/SimilarityProduct/Elastic_similarity.npz")
    self.PureSVD.fit(self.URM_train)
    self.P3Alpha.fit(self.URM_train, tuning=True,
                     similarity_path="/SimilarityProduct/P3Aplha_similarity.npz")
    self.RP3Beta.fit(self.URM_train, tuning=True,
                     similarity_path="/SimilarityProduct/RP3Beta_similarity.npz")
    self.ALS.fit(self.URM_train)
    self.H6_bis.fit(self.URM_train, self.ICM_all, self.UCM_all, tuning=True)

    for group_id in range(0, 20):
        start_pos = group_id * self.blocksize
        end_pos = min((group_id + 1) * self.blocksize, len(self.profile_length))
        users_in_group = self.sortedusers[start_pos:end_pos]

        users_in_group_p_len = self.profile_length[users_in_group]
        print("Group {}, average p.len {:.2f}, min {}, max {}".format(
            group_id, users_in_group_p_len.mean(),
            users_in_group_p_len.min(), users_in_group_p_len.max()))

        # Restrict evaluation to the target users belonging to the current group
        users_not_in_group_flag = np.isin(self.sortedusers, users_in_group, invert=True)
        users_not_in_group = self.sortedusers[users_not_in_group_flag]
        users_in_group = list(set(self.initial_target_user) - set(list(users_not_in_group)))

        results = evaluate_algorithm_classes(self.URM_test, users_in_group, self.ItemCBF, at=10)
        MAP_ItemCBF_per_group.append(results)

        results = evaluate_algorithm_classes(self.URM_test, users_in_group, self.ItemCF, at=10)
        MAP_ItemCF_per_group.append(results)

        results = evaluate_algorithm_classes(self.URM_test, users_in_group, self.UserCF, at=10)
        MAP_UserCF_per_group.append(results)

        results = evaluate_algorithm_classes(self.URM_test, users_in_group, self.Slim, at=10)
        MAP_Slim_per_group.append(results)

        results = evaluate_algorithm_classes(self.URM_test, users_in_group, self.Elastic, at=10)
        MAP_Elastic_per_group.append(results)

        results = evaluate_algorithm_classes(self.URM_test, users_in_group, self.PureSVD, at=10)
        MAP_PureSVD_per_group.append(results)

        results = evaluate_algorithm_classes(self.URM_test, users_in_group, self.P3Alpha, at=10)
        MAP_P3Alpha_per_group.append(results)

        results = evaluate_algorithm_classes(self.URM_test, users_in_group, self.RP3Beta, at=10)
        MAP_RP3Beta_per_group.append(results)

        results = evaluate_algorithm_classes(self.URM_test, users_in_group, self.UserCBF, at=10)
        MAP_UserCBF_per_group.append(results)

        results = evaluate_algorithm_classes(self.URM_test, users_in_group, self.ALS, at=10)
        MAP_ALS_per_group.append(results)

        results = evaluate_algorithm_classes(self.URM_test, users_in_group, self.H6_bis, at=10)
        MAP_H6_bis_per_group.append(results)

    pyplot.plot(MAP_UserCBF_per_group, label="UserCBF")
    pyplot.plot(MAP_ItemCBF_per_group, label="ItemCBF")
    pyplot.plot(MAP_ItemCF_per_group, label="ItemCF")
    pyplot.plot(MAP_UserCF_per_group, label="UserCF")
    pyplot.plot(MAP_Slim_per_group, label="Slim")
    pyplot.plot(MAP_Elastic_per_group, label="Elastic")
    pyplot.plot(MAP_P3Alpha_per_group, label="P3Alpha")
    pyplot.plot(MAP_RP3Beta_per_group, label="RP3Beta")
    pyplot.plot(MAP_PureSVD_per_group, label="PureSVD")
    pyplot.plot(MAP_ALS_per_group, label="ALS")
    pyplot.plot(MAP_H6_bis_per_group, label="H6_bis")
    pyplot.xlabel('User Group')
    pyplot.ylabel('MAP')
    pyplot.xticks(np.arange(0, 20, 1))
    pyplot.grid(b=True, axis='both', color='firebrick', linestyle='--', linewidth=0.5)
    pyplot.legend(loc='lower right')
    pyplot.show()
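# Toy illustration (not from the repo) of the user-group split used above: users are
# sorted by profile length and partitioned into twenty 5% blocks, so group 0 holds
# the coldest users and group 19 the heaviest ones.
import numpy as np
import scipy.sparse as sps

toy_URM = sps.random(100, 40, density=0.1, format='csr')
profile_length = np.ediff1d(toy_URM.indptr)    # interactions per user
blocksize = int(len(profile_length) * 0.05)    # 5% of the users per group
sortedusers = np.argsort(profile_length)
group_0 = sortedusers[0:blocksize]             # the 5% of users with the shortest profiles
print(group_0, profile_length[group_0])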
import os
import re
from collections import Counter

import numpy as np
import pandas as pd

from Base.BaseFunction import BaseFunction

filename = os.path.join(os.getcwd(), "Results/Hybrid-2020-01-06_20.12.34.csv")


def load_sample():
    cols = ['user_id', 'item_list']
    sample_data = pd.read_csv(filename, names=cols, header=0)
    return sample_data


if __name__ == "__main__":
    h = BaseFunction()
    h.get_URM()
    s = load_sample()

    # Extract the recommended item ids from the item_list column
    x = s.item_list.values
    it = []
    for i in x:
        it.append(re.findall(r'\d+', i))

    flattened = []
    for sublist in it:
        for val in sublist:
            flattened.append(int(val))

    # Item popularity: number of interactions per item (URM in CSC format)
    item_pop = np.ediff1d(h.URM_all.tocsc().indptr)
    # Cold items: items that were never interacted with in the URM
    coldi = list(np.where(item_pop == 0)[0])
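    # A plausible continuation (not in the original snippet): use the imported Counter
    # to check how often cold items end up in the submitted recommendations.
    cold_set = set(coldi)
    recommended_cold = [item for item in flattened if item in cold_set]
    print("Cold items recommended:", len(recommended_cold))
    print("Most recommended items:", Counter(flattened).most_common(10))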
class RP3BetaRecommender(object):

    #######################################################################################
    #                                     INIT CLASS                                      #
    #######################################################################################

    def __init__(self):
        self.verbose = True
        self.helper = BaseFunction()
        self._URM_train_format_checked = False
        self._W_sparse_format_checked = False

    def __str__(self):
        return "RP3beta(alpha={}, beta={}, min_rating={}, topk={}, implicit={}, normalize_similarity={})".format(
            self.alpha, self.beta, self.min_rating, self.topK, self.implicit,
            self.normalize_similarity)

    def _print(self, string):
        if self.verbose:
            print("{}: {}".format(RECOMMENDER_NAME, string))

    def _check_format(self):
        if not self._URM_train_format_checked:
            if self.URM_train.getformat() != "csr":
                self._print("PERFORMANCE ALERT compute_item_score: {} is not {}, "
                            "this will significantly slow down the computation."
                            .format("URM_train", "csr"))
            self._URM_train_format_checked = True

        if not self._W_sparse_format_checked:
            if self.W_sparse.getformat() != "csr":
                self._print("PERFORMANCE ALERT compute_item_score: {} is not {}, "
                            "this will significantly slow down the computation."
                            .format("W_sparse", "csr"))
            self._W_sparse_format_checked = True

    #######################################################################################
    #                                   FIT RECOMMENDER                                   #
    #######################################################################################

    def run_fit(self):
        # if X.dtype != np.float32:
        #     print("RP3beta fit: For memory usage reasons, we suggest to use np.float32 as dtype for the dataset")

        if self.min_rating > 0:
            self.URM_train.data[self.URM_train.data < self.min_rating] = 0
            self.URM_train.eliminate_zeros()
            if self.implicit:
                self.URM_train.data = np.ones(self.URM_train.data.size, dtype=np.float32)

        # Pui is the row-normalized urm
        Pui = normalize(self.URM_train, norm='l1', axis=1)

        # Piu is the column-normalized, "boolean" urm transposed
        X_bool = self.URM_train.transpose(copy=True)
        X_bool.data = np.ones(X_bool.data.size, np.float32)

        # Taking the degree of each item to penalize top popular items.
        # Some rows might be zero, make sure their degree remains zero
        X_bool_sum = np.array(X_bool.sum(axis=1)).ravel()

        degree = np.zeros(self.URM_train.shape[1])
        nonZeroMask = X_bool_sum != 0.0
        degree[nonZeroMask] = np.power(X_bool_sum[nonZeroMask], -self.beta)

        # ATTENTION: axis is still 1 because the matrix was transposed before normalization
        Piu = normalize(X_bool, norm='l1', axis=1)
        del X_bool

        # Alpha power
        if self.alpha != 1.:
            Pui = Pui.power(self.alpha)
            Piu = Piu.power(self.alpha)

        # Final matrix is computed as Pui * Piu * Pui
        # Multiplication unpacked for memory usage reasons
        block_dim = 200
        d_t = Piu

        # Use arrays as they reduce memory requirements compared to lists
        dataBlock = 10000000
        rows = np.zeros(dataBlock, dtype=np.int32)
        cols = np.zeros(dataBlock, dtype=np.int32)
        values = np.zeros(dataBlock, dtype=np.float32)
        numCells = 0

        start_time = time.time()
        start_time_printBatch = start_time

        for current_block_start_row in range(0, Pui.shape[1], block_dim):
            if current_block_start_row + block_dim > Pui.shape[1]:
                block_dim = Pui.shape[1] - current_block_start_row

            similarity_block = d_t[current_block_start_row:current_block_start_row + block_dim, :] * Pui
            similarity_block = similarity_block.toarray()

            for row_in_block in range(block_dim):
                row_data = np.multiply(similarity_block[row_in_block, :], degree)
                row_data[current_block_start_row + row_in_block] = 0

                best = row_data.argsort()[::-1][:self.topK]
                notZerosMask = row_data[best] != 0.0

                values_to_add = row_data[best][notZerosMask]
                cols_to_add = best[notZerosMask]

                for index in range(len(values_to_add)):
                    if numCells == len(rows):
                        rows = np.concatenate((rows, np.zeros(dataBlock, dtype=np.int32)))
                        cols = np.concatenate((cols, np.zeros(dataBlock, dtype=np.int32)))
                        values = np.concatenate((values, np.zeros(dataBlock, dtype=np.float32)))

                    rows[numCells] = current_block_start_row + row_in_block
                    cols[numCells] = cols_to_add[index]
                    values[numCells] = values_to_add[index]
                    numCells += 1

            if time.time() - start_time_printBatch > 60:
                self._print("Processed {} ( {:.2f}% ) in {:.2f} minutes. Rows per second: {:.0f}"
                            .format(current_block_start_row,
                                    100.0 * float(current_block_start_row) / Pui.shape[1],
                                    (time.time() - start_time) / 60,
                                    float(current_block_start_row) / (time.time() - start_time)))

                sys.stdout.flush()
                sys.stderr.flush()
                start_time_printBatch = time.time()

        self.W_sparse = sps.csr_matrix((values[:numCells], (rows[:numCells], cols[:numCells])),
                                       shape=(Pui.shape[1], Pui.shape[1]))

        if self.normalize_similarity:
            self.W_sparse = normalize(self.W_sparse, norm='l1', axis=1)

        if self.topK != False:
            self.W_sparse = similarityMatrixTopK(self.W_sparse, k=self.topK)

        self.W_sparse = check_matrix(self.W_sparse, format='csr')

    def fit(self, URM_train, alpha=0.41417, beta=0.04995, min_rating=0, topK=54,
            implicit=False, normalize_similarity=True, tuning=False,
            similarity_path=SIMILARITY_PATH):

        self.URM_train = check_matrix(URM_train.copy(), 'csr', dtype=np.float32)
        self.URM_train.eliminate_zeros()
        self.n_users, self.n_items = self.URM_train.shape

        self.alpha = alpha
        self.beta = beta
        self.min_rating = min_rating
        self.topK = topK
        self.implicit = implicit
        self.normalize_similarity = normalize_similarity

        if tuning:
            if not os.path.exists(os.getcwd() + similarity_path):
                self.run_fit()
                self.helper.export_similarity_matrix(os.getcwd() + similarity_path,
                                                     self.W_sparse, name=RECOMMENDER_NAME)
            self.W_sparse = self.helper.import_similarity_matrix(os.getcwd() + similarity_path)
        else:
            self.run_fit()

        self.similarityProduct = self.URM_train.dot(self.W_sparse)

    #######################################################################################
    #                                  RUN RECOMMENDATION                                 #
    #######################################################################################

    def get_expected_ratings(self, user_id):
        expected_ratings = self.similarityProduct[user_id].toarray().ravel()
        return np.squeeze(np.asarray(expected_ratings))

    def recommend(self, user_id, at=10):
        # Compute the scores using the dot product
        expected_ratings = self.get_expected_ratings(user_id)
        ranking = expected_ratings.argsort()[::-1]

        # Filter out items the user has already interacted with
        unseen_items_mask = np.in1d(ranking, self.URM_train[user_id].indices,
                                    assume_unique=True, invert=True)
        ranking = ranking[unseen_items_mask]
        return ranking[:at]
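# Minimal usage sketch (illustrative, not part of the original repo): fit RP3beta on a
# small random URM without the tuning/caching branch. Assumes Base.BaseFunction and the
# module-level imports (normalize, check_matrix, similarityMatrixTopK) are available.
import numpy as np
import scipy.sparse as sps

if __name__ == "__main__":
    toy_URM = sps.random(60, 120, density=0.05, format='csr', dtype=np.float32)
    rp3 = RP3BetaRecommender()
    rp3.fit(toy_URM, alpha=0.41417, beta=0.04995, topK=54)
    print(rp3.recommend(user_id=0, at=10))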
class BayesianSearch:

    #######################################################################################
    #                             INIT CLASS BAYESIAN SEARCH                              #
    #######################################################################################

    def __init__(self, recommender, name):
        self.recommender = recommender
        self.name = name
        self.helper = BaseFunction()
        self.helper.get_URM()
        self.helper.split_80_20()
        self.helper.get_target_users()
        self.helper.get_UCM()
        self.helper.get_ICM()
        self.optimazer = None

    def instanziate_optimazer(self, bayesian_method_call, pbounds):
        optimizer = BayesianOptimization(
            f=bayesian_method_call,
            pbounds=pbounds,
            verbose=2,  # verbose = 1 prints only when a maximum is observed, verbose = 0 is silent
        )

        optimizer.maximize(init_points=30, n_iter=1000, acq='ucb', kappa=0.1)

    #######################################################################################
    #                                  STEPS TO MAXIMIZE                                  #
    #######################################################################################

    def step_hybrid_three(self, weight1=0, weight2=0, weight3=0):
        start_time = time.time()
        UCM_all = self.helper.UCM_all
        ICM_all = self.helper.ICM_all
        self.recommender.fit(self.helper.URM_train, ICM_all=ICM_all, UCM_all=UCM_all,
                             weights=[weight1, weight2, weight3], tuning=True)
        cumulative = evaluation.evaluate_algorithm(self.helper.URM_test, self.recommender, at=10)
        elapsed_time = time.time() - start_time
        print("----------------" + str(elapsed_time) + "----------------")
        return cumulative

    def step_hybrid_four(self, weight1=0, weight2=0, weight3=0, weight4=0):
        start_time = time.time()
        UCM_all = self.helper.UCM_all
        ICM_all = self.helper.ICM_all
        self.recommender.fit(self.helper.URM_train, ICM_all=ICM_all, UCM_all=UCM_all,
                             weights=[weight1, weight2, weight3, weight4], tuning=True)
        cumulative = evaluation.evaluate_algorithm(self.helper.URM_test, self.recommender, at=10)
        elapsed_time = time.time() - start_time
        print("----------------" + str(elapsed_time) + "----------------")
        return cumulative

    def step_hybrid_six(self, weight1=0, weight2=0, weight3=0, weight4=0, weight5=0, weight6=0):
        start_time = time.time()
        UCM_all = self.helper.UCM_all
        ICM_all = self.helper.ICM_all
        self.recommender.fit(self.helper.URM_train, ICM_all=ICM_all, UCM_all=UCM_all,
                             weights=[weight1, weight2, weight3, weight4, weight5, weight6],
                             tuning=True)
        cumulative = evaluation.evaluate_algorithm(self.helper.URM_test, self.recommender, at=10)
        elapsed_time = time.time() - start_time
        print("----------------" + str(elapsed_time) + "----------------")
        return cumulative

    def step_hybrid_seven(self, weight1=0, weight2=0, weight3=0, weight4=0, weight5=0,
                          weight6=0, weight7=0):
        start_time = time.time()
        UCM_all = self.helper.UCM_all
        ICM_all = self.helper.ICM_all
        self.recommender.fit(self.helper.URM_train, ICM_all=ICM_all, UCM_all=UCM_all,
                             weights=[weight1, weight2, weight3, weight4, weight5,
                                      weight6, weight7],
                             tuning=True)
        cumulative = evaluation.evaluate_algorithm(self.helper.URM_test, self.recommender, at=10)
        elapsed_time = time.time() - start_time
        print("----------------" + str(elapsed_time) + "----------------")
        return cumulative

    def step_fallBack_Hybrid(self, weight1=0, weight2=0):
        start_time = time.time()
        UCM_all = self.helper.UCM_all
        ICM_all = self.helper.ICM_all
        self.recommender.fit(self.helper.URM_train, ICM_all=ICM_all, UCM_all=UCM_all,
                             weights_fallback=[int(weight1), int(weight2)], tuning=True)
        cumulative = evaluation.evaluate_algorithm(self.helper.URM_test, self.recommender, at=10)
        elapsed_time = time.time() - start_time
        print("----------------" + str(elapsed_time) + "----------------")
        return cumulative

    def step_slim(self, weight1=0, weight2=0, weight3=0):
        start_time = time.time()
        self.recommender = SLIM_BPR_Cython(lambda_i=weight1, lambda_j=weight2,
                                           learning_rate=weight3)
        self.recommender.fit(self.helper.URM_train)
        cumulative = evaluation.evaluate_algorithm(self.helper.URM_test, self.recommender, at=10)
        elapsed_time = time.time() - start_time
        print("----------------" + str(elapsed_time) + "----------------")
        return cumulative

    def step_elastic(self, weight1=0, weight2=0, weight3=0):
        start_time = time.time()
        self.recommender.fit(self.helper.URM_train, l1_ratio=weight1, alpha=weight2,
                             topK=int(weight3))
        cumulative = evaluation.evaluate_algorithm(self.helper.URM_test, self.recommender, at=10)
        elapsed_time = time.time() - start_time
        print("----------------" + str(elapsed_time) + "----------------")
        return cumulative

    def step_ALS(self, weight1=0, weight2=0, weight3=0):
        start_time = time.time()
        self.recommender = AlternatingLeastSquare(n_factors=int(weight1),
                                                  regularization=weight2,
                                                  iterations=int(weight3))
        self.recommender.fit(self.helper.URM_train)
        cumulative = evaluation.evaluate_algorithm(self.helper.URM_test, self.recommender, at=10)
        elapsed_time = time.time() - start_time
        print("----------------" + str(elapsed_time) + "----------------")
        return cumulative

    def step_Item_CB(self, weight1=0, weight2=0):
        start_time = time.time()
        ICM_all = self.helper.ICM_all
        self.recommender.fit(self.helper.URM_train, ICM_all, knn=int(weight1),
                             shrink=int(weight2), tuning=False)
        cumulative = evaluation.evaluate_algorithm(self.helper.URM_test, self.recommender, at=10)
        elapsed_time = time.time() - start_time
        print("----------------" + str(elapsed_time) + "----------------")
        return cumulative

    def step_User_CB(self, weight1=0, weight2=0):
        start_time = time.time()
        UCM_all = self.helper.UCM_all
        self.recommender.fit(self.helper.URM_train, UCM_all, knn=int(weight1),
                             shrink=int(weight2))
        cumulative = evaluation.evaluate_algorithm(self.helper.URM_test, self.recommender, at=10)
        elapsed_time = time.time() - start_time
        print("----------------" + str(elapsed_time) + "----------------")
        return cumulative

    def step_P3Alpha(self, weight1=0, weight2=0):
        start_time = time.time()
        self.recommender.fit(self.helper.URM_train, topK=int(weight1), alpha=weight2)
        cumulative = evaluation.evaluate_algorithm(self.helper.URM_test, self.recommender, at=10)
        elapsed_time = time.time() - start_time
        print("----------------" + str(elapsed_time) + "----------------")
        return cumulative

    def step_RP3Beta(self, alpha=0, beta=0, min_rating=0, topK=0):
        start_time = time.time()
        self.recommender.fit(self.helper.URM_train, alpha=alpha, beta=beta,
                             min_rating=min_rating, topK=int(topK))
        cumulative = evaluation.evaluate_algorithm(self.helper.URM_test, self.recommender, at=10)
        elapsed_time = time.time() - start_time
        print("----------------" + str(elapsed_time) + "----------------")
        return cumulative

    def step_PureSVD_randomSVD(self, n_components, n_iter):
        start_time = time.time()
        self.recommender.fit(self.helper.URM_train, n_components=int(n_components),
                             n_iter=int(n_iter))
        cumulative = evaluation.evaluate_algorithm(self.helper.URM_test, self.recommender, at=10)
        elapsed_time = time.time() - start_time
        print("----------------" + str(elapsed_time) + "----------------")
        return cumulative

    def step_FunkSVD(self, epoch, num_factors, learning_rate, user_reg, item_reg):
        start_time = time.time()
        self.recommender = MatrixFactorization_FunkSVD_Cython(int(epoch), int(num_factors),
                                                              learning_rate, user_reg, item_reg)
        self.recommender.fit(self.helper.URM_train)
        cumulative = evaluation.evaluate_algorithm(self.helper.URM_test, self.recommender, at=10)
        elapsed_time = time.time() - start_time
        print("----------------" + str(elapsed_time) + "----------------")
        return cumulative

    def step_TEST(self, t1, t2, t3, t4, t5):
        start_time = time.time()
        UCM_all = self.helper.UCM_all
        ICM_all = self.helper.ICM_all
        self.recommender = Hybrid_User_Wise("Hybrid User Wise", UserCBFKNNRecommender())
        self.recommender.fit(self.helper.URM_train, ICM_all=ICM_all, UCM_all=UCM_all,
                             thre1=t1, thre2=t2, thre3=t3, thre4=t4, thre5=t5, tuning=True)
        cumulative = evaluation.evaluate_algorithm(self.helper.URM_test, self.recommender, at=10)
        elapsed_time = time.time() - start_time
        print("----------------" + str(elapsed_time) + "----------------")
        return cumulative

    def step_all(self, H0_ICF_sh=0, H0_ICF_tK=0, H1_UCF_sh=0, H1_UCF_tK=0,
                 H2_ICB_sh=0, H2_ICB_tK=0, H3_UCB_sh=0, H3_UCB_tK=0, H4_El_tK=0,
                 H5_RP3_a=0, H5_RP3_b=0, H5_RP3_tK=0, H6_SL_bs=0, H6_SL_ep=0,
                 H6_SL_l_i=0, H6_SL_l_j=0, H6_SL_l_r=0, H6_SL_tK=0, H7_ALS_i=0,
                 H7_ALS_nf=0, H7_ALS_re=0, weight1=0, weight2=0, weight3=0,
                 weight4=0, weight5=0, weight6=0, weight7=0):
        start_time = time.time()
        UCM_all = self.helper.UCM_all
        ICM_all = self.helper.ICM_all

        ItemCF = ItemKNNCFRecommender()
        UserCF = UserKNNCFRecommender()
        ItemCB = ItemCBFKNNRecommender()
        UserCB = UserCBFKNNRecommender()
        ElasticNet = SLIMElasticNetRecommender()
        RP3Beta = RP3BetaRecommender()
        Slim = SLIM_BPR_Cython(batch_size=int(H6_SL_bs), epochs=int(H6_SL_ep),
                               lambda_i=H6_SL_l_i, lambda_j=H6_SL_l_j,
                               learning_rate=H6_SL_l_r, topK=int(H6_SL_tK))
        ALS = AlternatingLeastSquare(iterations=int(H7_ALS_i), n_factors=int(H7_ALS_nf),
                                     regularization=H7_ALS_re)

        ItemCF.fit(self.helper.URM_train, knn=int(H0_ICF_tK), shrink=H0_ICF_sh)
        UserCF.fit(self.helper.URM_train, knn=int(H1_UCF_tK), shrink=H1_UCF_sh)
        ItemCB.fit(self.helper.URM_train, ICM_all, knn=int(H2_ICB_tK), shrink=H2_ICB_sh)
        UserCB.fit(self.helper.URM_train, UCM_all, knn=int(H3_UCB_tK), shrink=H3_UCB_sh)
        ElasticNet.fit(self.helper.URM_train, topK=int(H4_El_tK))
        RP3Beta.fit(self.helper.URM_train, alpha=H5_RP3_a, beta=H5_RP3_b, topK=int(H5_RP3_tK))
        Slim.fit(self.helper.URM_train)
        ALS.fit(self.helper.URM_train)

        self.recommender = Hybrid_Achille_Tuning("Hybrid_Achille_Tuning_All", UserCB)
        self.recommender.fit(self.helper.URM_train, ICM_all=ICM_all, UCM_all=UCM_all,
                             weights=[weight1, weight2, weight3, weight4, weight5,
                                      weight6, weight7],
                             ItemCF=ItemCF, UserCF=UserCF, ItemCB=ItemCB,
                             ElasticNet=ElasticNet, RP3=RP3Beta, Slim=Slim, ALS=ALS)
        cumulative = evaluation.evaluate_algorithm(self.helper.URM_test, self.recommender, at=10)
        elapsed_time = time.time() - start_time
        print("----------------" + str(elapsed_time) + "----------------")
        return cumulative
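# Minimal usage sketch (illustrative, not part of the original repo): tune RP3beta with
# the BayesianSearch wrapper above. Assumes the challenge data files are reachable
# through BaseFunction; the parameter bounds are assumptions for the example, not the
# ranges used for the final submission.
if __name__ == "__main__":
    search = BayesianSearch(RP3BetaRecommender(), "RP3Beta")
    pbounds = {'alpha': (0.0, 1.0), 'beta': (0.0, 1.0), 'topK': (10, 300)}
    search.instanziate_optimazer(search.step_RP3Beta, pbounds)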
class Tuner_Singles:

    #######################################################################################
    #                                     INIT CLASS                                      #
    #######################################################################################

    def __init__(self, recommender, name):
        self.recommender = recommender
        self.name = name
        self.helper = BaseFunction()
        self.helper.get_URM()
        self.helper.get_ICM()
        self.helper.get_UCM()
        self.helper.split_80_20()
        self.helper.get_target_users()

    #######################################################################################
    #                                   STEP FOR TUNING                                   #
    #######################################################################################

    def step_weight(self, w1, w2):
        start_time = time.time()
        print("----------------------------------------")
        print("HybridCombination: " + self.name)
        print([w1, w2])
        print("----------------------------------------")

        list_UCM = [self.helper.UCM_age, self.helper.UCM_region]
        list_ICM = [self.helper.ICM, self.helper.ICM_price, self.helper.ICM_asset]

        self.recommender.fit(self.helper.URM_train, [w1, w2], list_ICM=list_ICM,
                             list_UCM=list_UCM, tuning=False)
        cumulative = evaluation.evaluate_algorithm(self.helper.URM_test, self.recommender, at=10)
        elapsed_time = time.time() - start_time
        print("----------------" + str(elapsed_time) + "----------------")
        return cumulative

    #######################################################################################
    #                                  GENETIC ALGORITHM                                  #
    #######################################################################################

    def random_pop(self):
        weights = []
        for i in range(self.pop_size):
            w1 = random.randint(250, 600)  # epoch
            w2 = random.randint(100, 300)  # knn
            line = [w1, w2]
            weights.append(np.array(line))
        return weights

    def evaluate_pop(self):
        appo = []
        for chromosome in self.pop:
            res = self.evaluate_chromosome(chromosome)
            appo.append(res)
        return appo

    def evaluate_chromosome(self, chromosome):
        return self.step_weight(w1=chromosome[0], w2=chromosome[1])

    def my_index(self, l, item):
        for i in range(len(l)):
            if (item == l[i]).all():
                return i
        return -1

    def select_parents(self):
        # Rank-based selection: each chromosome is drawn with probability proportional
        # to its ranking position rather than its raw score
        sorted_pop_score = sorted(self.pop_scores, reverse=False)
        probs = []
        taken_pop = [False] * self.pop_size
        taken_score = [False] * self.pop_size
        l = (self.pop_size * (self.pop_size + 1)) / 2

        for i in self.pop:
            pos_of_i_in_pop = self.my_index(self.pop, i)
            while taken_pop[pos_of_i_in_pop]:
                pos_of_i_in_pop += self.my_index(self.pop[pos_of_i_in_pop + 1:], i) + 1

            score_of_pos = self.pop_scores[pos_of_i_in_pop]
            ranking = self.my_index(sorted_pop_score, score_of_pos)
            while taken_score[ranking]:
                ranking += self.my_index(sorted_pop_score[ranking + 1:], score_of_pos) + 1

            taken_score[ranking] = True
            taken_pop[pos_of_i_in_pop] = True

            prob = (ranking + 1) / l
            probs.append(prob)

        parents = [self.pop[i] for i in np.random.choice(len(self.pop), 2, p=probs)]
        return parents

    def generate_offspring(self, p1, p2):
        # One-point crossover: first gene from p1, second gene from p2
        size = len(p1)
        offspring = np.empty((size), dtype='object')
        offspring[0] = p1[0]
        offspring[1] = p2[1]
        return offspring

    def crossover(self, parents):
        offspring1 = self.generate_offspring(parents[0], parents[1])
        offspring2 = self.generate_offspring(parents[1], parents[0])
        offspring1 = self.mutation(offspring1)
        offspring2 = self.mutation(offspring2)
        return offspring1, offspring2

    def mutation(self, offspring):
        if np.random.choice([True, False], 1, p=[self.p_mutation, 1 - self.p_mutation]) == True:
            offspring += random.randint(0, 100)
        return offspring

    def elitism(self):
        # Carry the four best chromosomes unchanged into the next generation
        els = self.pop[:]
        score_c = self.pop_scores[:]
        for _ in range(4):
            index = np.argmax(score_c)
            score_c.pop(index)
            self.new_pop.append(els.pop(index))

    #######################################################################################
    #                               RUN GENETIC ALGORITHM                                 #
    #######################################################################################

    def run(self, max=1000, pop_size=10, p_mutation=0.1):
        self.pop_size = pop_size
        self.p_mutation = p_mutation
        self.pop = self.random_pop()
        self.pop_scores = self.evaluate_pop()

        for i in range(max):
            self.new_pop = []
            self.elitism()
            while len(self.new_pop) < len(self.pop):
                parents = self.select_parents()
                off1, off2 = self.crossover(parents)
                self.new_pop.append(off1)
                self.new_pop.append(off2)
            self.pop = self.new_pop
            self.pop_scores = self.evaluate_pop()

        print("-----------------ENDED------------------")
        print(self.pop)
        print(np.argmax(self.pop_scores))
        print("----------------------------------------")
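# Minimal usage sketch (illustrative, not part of the original repo): the genetic tuner
# expects a recommender whose fit signature is fit(URM, weights, list_ICM=..., list_UCM=...),
# i.e. one of the repo's hybrid combos; whether Hybrid_Combo6_bis matches is an assumption.
# Population size and generation count are kept tiny for the example.
if __name__ == "__main__":
    hybrid = Hybrid_Combo6_bis("Combo6_bis", UserCBFKNNRecommender())
    tuner = Tuner_Singles(hybrid, "Combo6_bis")
    tuner.run(max=5, pop_size=4, p_mutation=0.1)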
class SLIMElasticNetRecommender(BaseRecommender):

    def run_fit(self):
        # Display ConvergenceWarning only once and not every time it occurs
        warnings.simplefilter("once", category=ConvergenceWarning)

        # Initialize the ElasticNet model
        self.model = ElasticNet(alpha=1e-4,
                                l1_ratio=self.l1_ratio,
                                positive=self.positive_only,
                                fit_intercept=False,
                                copy_X=False,
                                precompute=True,
                                selection='random',
                                max_iter=100,
                                tol=1e-4)

        URM_train = check_matrix(self.URM, 'csc', dtype=np.float32)
        n_items = URM_train.shape[1]

        # Use arrays as they reduce memory requirements compared to lists
        dataBlock = 10000000
        rows = np.zeros(dataBlock, dtype=np.int32)
        cols = np.zeros(dataBlock, dtype=np.int32)
        values = np.zeros(dataBlock, dtype=np.float32)
        numCells = 0

        start_time = time.time()
        start_time_printBatch = start_time

        # Fit each item's factors sequentially (not in parallel)
        for currentItem in range(n_items):

            # Get the target column
            y = URM_train[:, currentItem].toarray()
            if y.sum() == 0.0:
                continue

            # Set the j-th column of X to zero
            start_pos = URM_train.indptr[currentItem]
            end_pos = URM_train.indptr[currentItem + 1]

            current_item_data_backup = URM_train.data[start_pos:end_pos].copy()
            URM_train.data[start_pos:end_pos] = 0.0

            # Fit one ElasticNet model per column
            self.model.fit(URM_train, y)

            nonzero_model_coef_index = self.model.sparse_coef_.indices
            nonzero_model_coef_value = self.model.sparse_coef_.data

            local_topK = min(len(nonzero_model_coef_value) - 1, self.topK)

            relevant_items_partition = (-nonzero_model_coef_value).argpartition(local_topK)[0:local_topK]
            relevant_items_partition_sorting = np.argsort(-nonzero_model_coef_value[relevant_items_partition])
            ranking = relevant_items_partition[relevant_items_partition_sorting]

            for index in range(len(ranking)):
                if numCells == len(rows):
                    rows = np.concatenate((rows, np.zeros(dataBlock, dtype=np.int32)))
                    cols = np.concatenate((cols, np.zeros(dataBlock, dtype=np.int32)))
                    values = np.concatenate((values, np.zeros(dataBlock, dtype=np.float32)))

                rows[numCells] = nonzero_model_coef_index[ranking[index]]
                cols[numCells] = currentItem
                values[numCells] = nonzero_model_coef_value[ranking[index]]
                numCells += 1

            # Finally, replace the original values of the j-th column
            URM_train.data[start_pos:end_pos] = current_item_data_backup

            elapsed_time = time.time() - start_time
            new_time_value, new_time_unit = seconds_to_biggest_unit(elapsed_time)

            if time.time() - start_time_printBatch > 300 or currentItem == n_items - 1:
                print("Processed {} ( {:.2f}% ) in {:.2f} {}. Items per second: {:.2f}"
                      .format(currentItem + 1,
                              100.0 * float(currentItem + 1) / n_items,
                              new_time_value, new_time_unit,
                              float(currentItem) / elapsed_time))

                sys.stdout.flush()
                sys.stderr.flush()
                start_time_printBatch = time.time()

        # Generate the sparse weight matrix
        self.W_sparse = sps.csr_matrix((values[:numCells], (rows[:numCells], cols[:numCells])),
                                       shape=(n_items, n_items),
                                       dtype=np.float32)

    def fit(self, URM, verbose=True, l1_ratio=1.0, alpha=1.0, positive_only=True,
            topK=494, tuning=False, similarity_path=SIMILARITY_PATH):
        self.URM = URM
        self.l1_ratio = l1_ratio
        self.positive_only = positive_only
        self.topK = topK
        self.helper = BaseFunction()

        if tuning:
            if not os.path.exists(os.getcwd() + similarity_path):
                self.run_fit()
                self.helper.export_similarity_matrix(os.getcwd() + similarity_path,
                                                     self.W_sparse, name=RECOMMENDER_NAME)
            self.W_sparse = self.helper.import_similarity_matrix(os.getcwd() + similarity_path)
        else:
            self.run_fit()

        self.similarityProduct = self.URM.dot(self.W_sparse)
class Runner:

    #######################################################################################
    #                                 INSTANCE OF RUNNER                                  #
    #######################################################################################

    def __init__(self, recommender, name, evaluate=True):
        print("Evaluation: " + str(evaluate))
        self.recommender = recommender
        self.evaluate = evaluate
        self.name = name
        self.functionality = BaseFunction()

    #######################################################################################
    #                                    WRITE RESULT                                     #
    #######################################################################################

    def write_csv(self, rows, name):
        fields = FIELDS
        timestr = time.strftime("%Y-%m-%d_%H.%M.%S")
        file_path = "Results/" + name + "-" + timestr + ".csv"

        with open(file_path, 'w') as csv_file:
            csv_write_head = csv.writer(csv_file, delimiter=',')
            csv_write_head.writerow(fields)
            csv_write_content = csv.writer(csv_file, delimiter=' ')
            csv_write_content.writerows(rows)

    #######################################################################################
    #                                    RUN FITTING                                      #
    #######################################################################################

    def fit_recommender(self, requires_icm=False, requires_ucm=False, load_similarity=True):
        print("Fitting model...")
        ICM_all = self.functionality.ICM_all
        UCM_all = self.functionality.UCM_all

        if not self.evaluate:
            if requires_icm and requires_ucm:
                self.recommender.fit(self.functionality.URM_all, ICM_all, UCM_all)
            elif requires_icm:
                self.recommender.fit(self.functionality.URM_all, ICM_all)
            elif requires_ucm:
                self.recommender.fit(self.functionality.URM_all, UCM_all)
            else:
                self.recommender.fit(self.functionality.URM_all)
        else:
            self.functionality.split_80_20(0.8)
            if requires_icm and requires_ucm:
                self.recommender.fit(self.functionality.URM_train, ICM_all, UCM_all, tuning=True)
            elif requires_icm:
                self.recommender.fit(self.functionality.URM_train, ICM_all, tuning=True)
            elif requires_ucm:
                self.recommender.fit(self.functionality.URM_train, UCM_all, tuning=True)
            else:
                self.recommender.fit(self.functionality.URM_train, tuning=True)

        print("Model fitted")

    #######################################################################################
    #                                 RUN RECOMMENDATION                                  #
    #######################################################################################

    def run_recommendations(self):
        recommendations = []
        saved_tuple = []
        print("Computing recommendations...")

        for user in tqdm(self.functionality.userlist_unique):
            index = [str(user) + ","]
            recommendations.clear()
            for recommendation in self.recommender.recommend(user):
                recommendations.append(recommendation)
            saved_tuple.append(index + recommendations)

        print("Recommendations computed")
        if not self.evaluate:
            print("Printing csv...")
            self.write_csv(saved_tuple, self.name)
            print("Ended")
        return saved_tuple

    #######################################################################################
    #                                   RUN COMPUTATION                                   #
    #######################################################################################

    def run(self, requires_ucm=False, requires_icm=False):
        self.functionality.get_URM()
        if requires_icm:
            self.functionality.get_ICM()
        if requires_ucm:
            self.functionality.get_UCM()
        self.functionality.get_target_users()

        self.fit_recommender(requires_icm, requires_ucm)

        if not self.evaluate:
            # Recommendations on target users are necessary only during file printing
            self.run_recommendations()

        if self.evaluate:
            evaluation.evaluate_algorithm(self.functionality.URM_test, self.recommender, at=10)
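# Minimal usage sketch (illustrative, not part of the original repo): drive a
# recommender end-to-end with Runner. With evaluate=True the URM is split 80/20 and
# MAP@10 is printed; with evaluate=False a submission csv is written to Results/.
# Assumes the challenge data files are reachable through BaseFunction.
if __name__ == "__main__":
    runner = Runner(RP3BetaRecommender(), "RP3Beta", evaluate=True)
    runner.run()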
class Compute_Similarity_Python:

    def __init__(self, dataMatrix, topK=100, shrink=0, normalize=True,
                 asymmetric_alpha=0.5, tversky_alpha=1.0, tversky_beta=1.0,
                 similarity="cosine", row_weights=None):
        """
        Computes the cosine similarity on the columns of dataMatrix.
        If it is computed on URM=|users|x|items|, pass the URM as is.
        If it is computed on ICM=|items|x|features|, pass the ICM transposed.

        :param dataMatrix:
        :param topK:
        :param shrink:
        :param normalize: if True, divide the dot product by the product of the norms
        :param row_weights: multiply the values in each row by a specified value. Array
        :param asymmetric_alpha: coefficient alpha for the asymmetric cosine
        :param similarity:
            "cosine"      computes Cosine similarity
            "adjusted"    computes Adjusted Cosine, removing the average of the users
            "asymmetric"  computes Asymmetric Cosine
            "pearson"     computes Pearson Correlation, removing the average of the items
            "jaccard"     computes Jaccard similarity for binary interactions using Tanimoto
            "dice"        computes Dice similarity for binary interactions
            "tversky"     computes Tversky similarity for binary interactions
            "tanimoto"    computes Tanimoto coefficient for binary interactions
        """
        """
        Asymmetric Cosine as described in:
        Aiolli, F. (2013, October). Efficient top-n recommendation for very large scale
        binary rated datasets. In Proceedings of the 7th ACM conference on Recommender
        systems (pp. 273-280). ACM.
        """

        super(Compute_Similarity_Python, self).__init__()

        self.helper = BaseFunction()
        self.shrink = shrink
        self.normalize = normalize

        self.n_rows, self.n_columns = dataMatrix.shape
        self.TopK = min(topK, self.n_columns)

        self.asymmetric_alpha = asymmetric_alpha
        self.tversky_alpha = tversky_alpha
        self.tversky_beta = tversky_beta

        self.dataMatrix = dataMatrix.copy()

        self.adjusted_cosine = False
        self.asymmetric_cosine = False
        self.pearson_correlation = False
        self.tanimoto_coefficient = False
        self.dice_coefficient = False
        self.tversky_coefficient = False

        if similarity == "adjusted":
            self.adjusted_cosine = True
        elif similarity == "asymmetric":
            self.asymmetric_cosine = True
        elif similarity == "pearson":
            self.pearson_correlation = True
        elif similarity == "jaccard" or similarity == "tanimoto":
            self.tanimoto_coefficient = True
            # Tanimoto has a specific kind of normalization
            self.normalize = False
        elif similarity == "dice":
            self.dice_coefficient = True
            self.normalize = False
        elif similarity == "tversky":
            self.tversky_coefficient = True
            self.normalize = False
        elif similarity == "cosine":
            pass
        else:
            raise ValueError("Cosine_Similarity: value for parameter 'mode' not recognized."
                             " Allowed values are: 'cosine', 'pearson', 'adjusted', 'asymmetric',"
                             " 'jaccard', 'tanimoto', 'dice', 'tversky'."
                             " Passed value was '{}'".format(similarity))

        self.use_row_weights = False

        if row_weights is not None:
            if dataMatrix.shape[0] != len(row_weights):
                raise ValueError("Cosine_Similarity: provided row_weights and dataMatrix have "
                                 "a different number of rows. "
                                 "row_weights has {} rows, dataMatrix has {}."
                                 .format(len(row_weights), dataMatrix.shape[0]))

            self.use_row_weights = True
            self.row_weights = row_weights.copy()
            self.row_weights_diag = sps.diags(self.row_weights)
            self.dataMatrix_weighted = self.dataMatrix.T.dot(self.row_weights_diag).T

    def applyAdjustedCosine(self):
        """
        Remove from every data point the average of the corresponding row
        """
        self.dataMatrix = self.helper.check_matrix(self.dataMatrix, 'csr')

        interactionsPerRow = np.diff(self.dataMatrix.indptr)
        nonzeroRows = interactionsPerRow > 0
        sumPerRow = np.asarray(self.dataMatrix.sum(axis=1)).ravel()

        rowAverage = np.zeros_like(sumPerRow)
        rowAverage[nonzeroRows] = sumPerRow[nonzeroRows] / interactionsPerRow[nonzeroRows]

        # Split in blocks to avoid duplicating the whole data structure
        start_row = 0
        end_row = 0
        blockSize = 1000

        while end_row < self.n_rows:
            end_row = min(self.n_rows, end_row + blockSize)
            self.dataMatrix.data[self.dataMatrix.indptr[start_row]:self.dataMatrix.indptr[end_row]] -= \
                np.repeat(rowAverage[start_row:end_row], interactionsPerRow[start_row:end_row])
            start_row += blockSize

    def applyPearsonCorrelation(self):
        """
        Remove from every data point the average of the corresponding column
        """
        self.dataMatrix = self.helper.check_matrix(self.dataMatrix, 'csc')

        interactionsPerCol = np.diff(self.dataMatrix.indptr)
        nonzeroCols = interactionsPerCol > 0
        sumPerCol = np.asarray(self.dataMatrix.sum(axis=0)).ravel()

        colAverage = np.zeros_like(sumPerCol)
        colAverage[nonzeroCols] = sumPerCol[nonzeroCols] / interactionsPerCol[nonzeroCols]

        # Split in blocks to avoid duplicating the whole data structure
        start_col = 0
        end_col = 0
        blockSize = 1000

        while end_col < self.n_columns:
            end_col = min(self.n_columns, end_col + blockSize)
            self.dataMatrix.data[self.dataMatrix.indptr[start_col]:self.dataMatrix.indptr[end_col]] -= \
                np.repeat(colAverage[start_col:end_col], interactionsPerCol[start_col:end_col])
            start_col += blockSize

    def useOnlyBooleanInteractions(self):
        # Split in blocks to avoid duplicating the whole data structure
        start_pos = 0
        end_pos = 0
        blockSize = 1000

        while end_pos < len(self.dataMatrix.data):
            end_pos = min(len(self.dataMatrix.data), end_pos + blockSize)
            self.dataMatrix.data[start_pos:end_pos] = np.ones(end_pos - start_pos)
            start_pos += blockSize

    def compute_similarity(self, start_col=None, end_col=None, block_size=100):
        """
        Compute the similarity for the given dataset

        :param start_col: column to begin with
        :param end_col: column to stop before, end_col is excluded
        :return: the (n_columns x n_columns) sparse top-K similarity matrix
        """

        values = []
        rows = []
        cols = []

        start_time = time.time()
        start_time_print_batch = start_time
        processedItems = 0

        if self.adjusted_cosine:
            self.applyAdjustedCosine()
        elif self.pearson_correlation:
            self.applyPearsonCorrelation()
        elif self.tanimoto_coefficient or self.dice_coefficient or self.tversky_coefficient:
            self.useOnlyBooleanInteractions()

        # We explore the matrix column-wise
        self.dataMatrix = self.helper.check_matrix(self.dataMatrix, 'csc')

        # Compute sum of squared values to be used in normalization
        sumOfSquared = np.array(self.dataMatrix.power(2).sum(axis=0)).ravel()

        # Tanimoto does not require the square root to be applied
        if not (self.tanimoto_coefficient or self.dice_coefficient or self.tversky_coefficient):
            sumOfSquared = np.sqrt(sumOfSquared)

        if self.asymmetric_cosine:
            sumOfSquared_to_1_minus_alpha = np.power(sumOfSquared, 2 * (1 - self.asymmetric_alpha))
            sumOfSquared_to_alpha = np.power(sumOfSquared, 2 * self.asymmetric_alpha)

        self.dataMatrix = self.helper.check_matrix(self.dataMatrix, 'csc')

        start_col_local = 0
        end_col_local = self.n_columns

        if start_col is not None and start_col > 0 and start_col < self.n_columns:
            start_col_local = start_col

        if end_col is not None and end_col > start_col_local and end_col < self.n_columns:
            end_col_local = end_col

        start_col_block = start_col_local
        this_block_size = 0

        # Compute all similarities for each item using vectorization
        while start_col_block < end_col_local:
            end_col_block = min(start_col_block + block_size, end_col_local)
            this_block_size = end_col_block - start_col_block

            # All data points for a given item
            item_data = self.dataMatrix[:, start_col_block:end_col_block]
            item_data = item_data.toarray().squeeze()

            # If only 1 feature, avoid the last dimension disappearing
            if item_data.ndim == 1:
                item_data = np.atleast_2d(item_data)

            if self.use_row_weights:
                this_block_weights = self.dataMatrix_weighted.T.dot(item_data)
            else:
                # Compute item similarities
                this_block_weights = self.dataMatrix.T.dot(item_data)

            for col_index_in_block in range(this_block_size):

                if this_block_size == 1:
                    this_column_weights = this_block_weights
                else:
                    this_column_weights = this_block_weights[:, col_index_in_block]

                columnIndex = col_index_in_block + start_col_block
                this_column_weights[columnIndex] = 0.0

                # Apply normalization and shrinkage, ensure denominator != 0
                if self.normalize:
                    if self.asymmetric_cosine:
                        denominator = sumOfSquared_to_alpha[columnIndex] * sumOfSquared_to_1_minus_alpha + self.shrink + 1e-6
                    else:
                        denominator = sumOfSquared[columnIndex] * sumOfSquared + self.shrink + 1e-6

                    this_column_weights = np.multiply(this_column_weights, 1 / denominator)

                # Apply the specific denominator for Tanimoto
                elif self.tanimoto_coefficient:
                    denominator = sumOfSquared[columnIndex] + sumOfSquared - this_column_weights + self.shrink + 1e-6
                    this_column_weights = np.multiply(this_column_weights, 1 / denominator)

                elif self.dice_coefficient:
                    denominator = sumOfSquared[columnIndex] + sumOfSquared + self.shrink + 1e-6
                    this_column_weights = np.multiply(this_column_weights, 1 / denominator)

                elif self.tversky_coefficient:
                    denominator = this_column_weights + \
                                  (sumOfSquared[columnIndex] - this_column_weights) * self.tversky_alpha + \
                                  (sumOfSquared - this_column_weights) * self.tversky_beta + self.shrink + 1e-6
                    this_column_weights = np.multiply(this_column_weights, 1 / denominator)

                # If neither normalization nor tanimoto is selected, apply only the shrink
                elif self.shrink != 0:
                    this_column_weights = this_column_weights / self.shrink

                # this_column_weights = this_column_weights.toarray().ravel()

                # Sort indices and select TopK. Sorting is done in three steps,
                # faster than plain np.argsort for a higher number of items:
                # - Partition the data to extract the set of relevant items
                # - Sort only the relevant items
                # - Get the original item index
                relevant_items_partition = (-this_column_weights).argpartition(self.TopK - 1)[0:self.TopK]
                relevant_items_partition_sorting = np.argsort(-this_column_weights[relevant_items_partition])
                top_k_idx = relevant_items_partition[relevant_items_partition_sorting]

                # Incrementally build the sparse matrix, do not add zeros
                notZerosMask = this_column_weights[top_k_idx] != 0.0
                numNotZeros = np.sum(notZerosMask)

                values.extend(this_column_weights[top_k_idx][notZerosMask])
                rows.extend(top_k_idx[notZerosMask])
                cols.extend(np.ones(numNotZeros) * columnIndex)

            # Add previous block size
            processedItems += this_block_size

            if time.time() - start_time_print_batch >= 30 or end_col_block == end_col_local:
                columnPerSec = processedItems / (time.time() - start_time + 1e-9)

                print("Similarity column {} ( {:2.0f} % ), {:.2f} column/sec, elapsed time {:.2f} min".format(
                    processedItems,
                    processedItems / (end_col_local - start_col_local) * 100,
                    columnPerSec,
                    (time.time() - start_time) / 60))

                sys.stdout.flush()
                sys.stderr.flush()
                start_time_print_batch = time.time()

            start_col_block += block_size

        # End while on columns

        W_sparse = sps.csr_matrix((values, (rows, cols)),
                                  shape=(self.n_columns, self.n_columns),
                                  dtype=np.float32)
        return W_sparse
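# Minimal usage sketch (illustrative, not part of the original repo): item-item cosine
# similarity with shrinkage on a toy URM. Assumes Base.BaseFunction is importable, since
# the constructor instantiates it for check_matrix.
import numpy as np
import scipy.sparse as sps

if __name__ == "__main__":
    toy_URM = sps.random(40, 25, density=0.2, format='csr', dtype=np.float32)
    sim_object = Compute_Similarity_Python(toy_URM, topK=10, shrink=5, similarity="cosine")
    W_sparse = sim_object.compute_similarity()   # (25 x 25) sparse top-K similarity
    print(W_sparse.shape, W_sparse.nnz)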
def __init__(self):
    self.helper = BaseFunction()
    self.similarityProduct = None
    self.URM = None
    self.UserCBF = None