def __init__(self, URM_train, ICM, target_model, training=True):
    super(CFWBoostingRecommender, self).__init__()

    if URM_train.shape[1] != ICM.shape[0]:
        raise ValueError(
            "Number of items not consistent. URM contains {} but ICM contains {}"
            .format(URM_train.shape[1], ICM.shape[0]))

    # if S_matrix_target.shape[0] != S_matrix_target.shape[1]:
    #     raise ValueError("Item similarity matrix is not square: rows are {}, columns are {}".format(
    #         S_matrix_target.shape[0], S_matrix_target.shape[1]))
    # if S_matrix_target.shape[0] != ICM.shape[0]:
    #     raise ValueError("Number of items not consistent. S_matrix contains {} but ICM contains {}".format(
    #         S_matrix_target.shape[0], ICM.shape[0]))

    self.URM_train = check_matrix(URM_train, 'csr')
    self.ICM = check_matrix(ICM, 'csr')

    # Load the pretrained target model and use its similarity matrix as the target
    m = OfflineDataLoader()
    fold, file = m.get_model(target_model.RECOMMENDER_NAME, training=training)
    m1 = target_model(self.URM_train)
    print(m1.RECOMMENDER_NAME)
    m1.loadModel(folder_path=fold, file_name=file)
    self.S_matrix_target = check_matrix(m1.W_sparse, 'csr')

    self.n_items = self.URM_train.shape[1]
    self.n_users = self.URM_train.shape[0]
    self.n_features = self.ICM.shape[1]

    self.sparse_weights = True
def __init__(self, URM_train, URM_train_tfidf, UCM, ICM, sequential_playlists,
             sparse_weights=True, verbose=False, similarity_mode="jaccard",
             normalize=False, alpha=0.168, beta=0.317, gamma=0.546, omega=0.666):
    super(ScrodingerRecommender, self).__init__()
    self.URM_train = check_matrix(URM_train, "csr")
    self.URM_train_tfidf = check_matrix(URM_train_tfidf, "csr")
    self.ICM = check_matrix(ICM, "csr")
    self.UCM = check_matrix(UCM, "csr")
    self.seq_list = sequential_playlists
    self.sparse_weights = sparse_weights
    self.verbose = verbose
    self.similarity_mode = similarity_mode
    self.normalize = normalize
    self.alpha = alpha
    self.beta = beta
    self.gamma = gamma
    self.omega = omega
    self.parameters = None
def __init__(self, URM_train, ICM):
    super(PyramidItemTreeRecommender_offline, self).__init__()
    self.URM_train = check_matrix(URM_train, "csr", dtype=np.float32)
    self.ICM = check_matrix(ICM, "csr", dtype=np.float32)
    self.parameters = None
    self.dataset = None
    self.normalize = False
def __init__(self, URM_train, URM_train_tfidf, ICM, sparse_weights=True):
    super(ItemTreeRecommender, self).__init__()
    self.URM_train = check_matrix(URM_train, "csr")
    self.URM_train_tfidf = check_matrix(URM_train_tfidf, "csr")
    self.ICM = check_matrix(ICM, "csr")
    self.sparse_weights = sparse_weights
    self.parameters = None
    self.RECOMMENDER_NAME = "ItemTreeRecommender"
def applyPearsonCorrelation(self):
    """
    Remove from every data point the average of the corresponding column
    :return:
    """
    self.dataMatrix = check_matrix(self.dataMatrix, 'csc')

    interactionsPerCol = np.diff(self.dataMatrix.indptr)
    nonzeroCols = interactionsPerCol > 0
    sumPerCol = np.asarray(self.dataMatrix.sum(axis=0)).ravel()

    # Column average = sum of the column / number of stored interactions
    colAverage = np.zeros_like(sumPerCol)
    colAverage[nonzeroCols] = sumPerCol[nonzeroCols] / interactionsPerCol[nonzeroCols]

    # Split in blocks to avoid duplicating the whole data structure
    start_col = 0
    end_col = 0
    blockSize = 1000

    while end_col < self.n_columns:
        end_col = min(self.n_columns, end_col + blockSize)
        self.dataMatrix.data[self.dataMatrix.indptr[start_col]:self.dataMatrix.indptr[end_col]] -= \
            np.repeat(colAverage[start_col:end_col], interactionsPerCol[start_col:end_col])
        start_col += blockSize
def applyAdjustedCosine(self):
    """
    Remove from every data point the average of the corresponding row
    :return:
    """
    self.dataMatrix = check_matrix(self.dataMatrix, 'csr')

    interactionsPerRow = np.diff(self.dataMatrix.indptr)
    nonzeroRows = interactionsPerRow > 0
    sumPerRow = np.asarray(self.dataMatrix.sum(axis=1)).ravel()

    # Row average = sum of the row / number of stored interactions
    rowAverage = np.zeros_like(sumPerRow)
    rowAverage[nonzeroRows] = sumPerRow[nonzeroRows] / interactionsPerRow[nonzeroRows]

    # Split in blocks to avoid duplicating the whole data structure
    start_row = 0
    end_row = 0
    blockSize = 1000

    while end_row < self.n_rows:
        end_row = min(self.n_rows, end_row + blockSize)
        self.dataMatrix.data[self.dataMatrix.indptr[start_row]:self.dataMatrix.indptr[end_row]] -= \
            np.repeat(rowAverage[start_row:end_row], interactionsPerRow[start_row:end_row])
        start_row += blockSize
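# A minimal, self-contained sketch of the column mean-centering performed by
# applyPearsonCorrelation (the row-wise analogue in applyAdjustedCosine follows
# the same pattern on CSR storage with axis=1 sums). `center_columns` is a
# hypothetical standalone helper, not part of the class above.
import numpy as np
import scipy.sparse as sps

def center_columns(X):
    """Subtract the per-column average from every stored entry of a CSC matrix."""
    X = X.tocsc(copy=True).astype(np.float64)
    interactions_per_col = np.diff(X.indptr)
    nonzero_cols = interactions_per_col > 0
    col_sum = np.asarray(X.sum(axis=0)).ravel()
    col_average = np.zeros_like(col_sum)
    col_average[nonzero_cols] = col_sum[nonzero_cols] / interactions_per_col[nonzero_cols]
    # np.repeat expands each column average to one value per stored entry
    X.data -= np.repeat(col_average, interactions_per_col)
    return X

# center_columns(sps.csc_matrix([[1., 0.], [3., 2.]])).toarray()
# -> [[-1., 0.], [1., 0.]]  (column averages 2.0 and 2.0 removed from the nonzeros)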
def __init__(self, URM_train, URM_train_tfidf, UCM, ICM, sequential_playlists,
             sparse_weights=True, verbose=False, similarity_mode="tanimoto",
             normalize=False, alpha=0.168, beta=0.375, gamma=0.717):
    super(SeqRandRecommender, self).__init__()
    self.URM_train = check_matrix(URM_train, "csr")
    self.URM_train_tfidf = check_matrix(URM_train_tfidf, "csr")
    self.UCM = check_matrix(UCM, "csr")
    self.ICM = check_matrix(ICM, "csr")
    self.seq_list = sequential_playlists
    self.sparse_weights = sparse_weights
    self.similarity_mode = similarity_mode
    self.verbose = verbose
    self.normalize = normalize
    self.alpha = alpha
    self.beta = beta
    self.gamma = gamma
    self.parameters = None
def __init__(self, URM_train):
    super(RP3betaRecommender, self).__init__()
    self.URM_train = check_matrix(URM_train, format='csr', dtype=np.float32)
    self.sparse_weights = True
    self.parameters = None
def __init__(self, URM_train, sparse_weights=True):
    super(UserKNNCFRecommender, self).__init__()
    self.FEATURE_WEIGHTING_VALUES = ["BM25", "TF-IDF", "none"]
    self.URM_train = check_matrix(URM_train, 'csr')
    self.sparse_weights = sparse_weights
    self.parameters = None
    self.dataset = None

    # User-based scoring replaces the default item-based scoring function
    self.compute_item_score = self.compute_score_user_based
def __init__(self, URM_train):
    super(Novelty, self).__init__()

    URM_train = check_matrix(URM_train, "csc")
    URM_train.eliminate_zeros()

    self.item_popularity = np.ediff1d(URM_train.indptr)

    self.novelty = 0.0
    self.n_evaluated_users = 0
    self.n_items = len(self.item_popularity)
    self.n_interactions = self.item_popularity.sum()
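# The fields above (per-item popularity and total interaction count) support the
# usual self-information formulation of novelty. A sketch under the assumption
# that this metric follows that formulation; the class's own update logic may
# differ in details:
import numpy as np

def novelty_of_list(recommended_items, item_popularity, n_interactions):
    # Rarer items carry more information: -log2(p(item))
    p = item_popularity[recommended_items] / n_interactions
    p = p[p > 0]  # never-interacted items have no defined self-information
    return float(np.mean(-np.log2(p))) if p.size > 0 else 0.0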
def __init__(self, URM_train, UCM, ICM, sparse_weights=True, verbose=True,
             similarity_mode="cosine", normalize=False, alpha=0.18):
    super(UserItemAvgRecommender, self).__init__()
    self.verbose = verbose
    self.URM_train = check_matrix(URM_train, "csr")
    self.UCM = check_matrix(UCM, "csr")
    self.ICM = check_matrix(ICM, "csr")
    self.sparse_weights = sparse_weights
    self.similarity_mode = similarity_mode
    self.parameters = None
    self.normalize = normalize
    self.alpha = alpha
def recommend(self, playlist_id, at=10, remove_seen=True):
    if remove_seen:
        self.urm_train = check_matrix(self.urm_train, format="csr")
        unseen_items_mask = np.in1d(self.popularItems,
                                    self.urm_train[playlist_id].indices,
                                    assume_unique=True, invert=True)
        unseen_items = self.popularItems[unseen_items_mask]
        recommended_items = unseen_items[0:at]
    else:
        recommended_items = self.popularItems[0:at]

    # Return the item ids as a space-separated string (submission format)
    return str(recommended_items).strip("[]")
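# Hypothetical usage sketch (the recommender class name and the fit step are
# assumed; only `popularItems`, `urm_train` and the method above come from this
# code):
# rec = TopPopRecommender()
# rec.fit(URM_train)                    # assumed to sort items by popularity
# print(rec.recommend(playlist_id=42))  # -> "11 57 803 ..." submission string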
def updateWeightsBatch(self, u, i, j):
    """
    Define the update rules to be used in the train phase and compile the train function
    :return:
    """
    if self.batch_size == 1:
        seenItems = self.userSeenItems[u[0]]

        x_ui = self.S[i, seenItems]
        x_uj = self.S[j, seenItems]

        # The difference is computed on the user_seen items
        x_uij = x_ui - x_uj
        # x_uij = x_uij[0, seenItems]
        x_uij = np.sum(x_uij)

        # Gradient of log(sigm(+x_uij)) is sigm(-x_uij)
        gradient = 1 / (1 + np.exp(x_uij))
        # exp = np.exp(x_uij)
        # gradient = exp / np.power(exp + 1, 2)
    else:
        x_ui = self.S[i]
        x_uj = self.S[j]

        # The difference is computed on the user_seen items
        x_uij = x_ui - x_uj
        # x_uij = check_matrix(x_uij, 'csr')

        self.URM_mask = check_matrix(self.URM_mask, 'lil')
        x_uij = self.URM_mask[u, :].dot(x_uij.T).diagonal()

        # Sigmoid with negated argument, so the exponent of the exponential stays positive.
        # Best performance with:
        gradient = np.sum(1 / (1 + np.exp(x_uij))) / self.batch_size
        # Other variants that were tried:
        # gradient = np.sum(expit(-x_uij)) / self.batch_size
        # gradient = np.sum(x_uij) / self.batch_size
        # gradient = expit(-gradient)
        # gradient = np.sum(np.log(expit(x_uij))) / self.batch_size
        # gradient = min(10, max(-10, gradient)) + 10

    if self.batch_size == 1:
        userSeenItems = self.userSeenItems[u[0]]

        self.S[i, userSeenItems] += self.learning_rate * gradient
        self.S[i, i] = 0

        self.S[j, userSeenItems] -= self.learning_rate * gradient
        self.S[j, j] = 0
    else:
        itemsToUpdate = np.array(self.URM_mask[u, :].sum(axis=0) > 0).ravel()

        # Do not update items i, set all user-posItem to false
        # itemsToUpdate[i] = False
        self.S[i] += self.learning_rate * gradient * itemsToUpdate
        self.S[i, i] = 0

        # Now update i, setting all user-posItem to true
        # Do not update j
        # itemsToUpdate[i] = True
        # itemsToUpdate[j] = False
        self.S[j] -= self.learning_rate * gradient * itemsToUpdate
        self.S[j, j] = 0
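# The `gradient` computed above is the standard BPR term: the derivative of
# log(sigmoid(x_uij)) with respect to x_uij is sigmoid(-x_uij) = 1 / (1 + exp(x_uij)).
# A minimal numeric check of that identity (sketch, independent of the class):
import numpy as np
from scipy.special import expit

x_uij = 0.3
assert np.isclose(1.0 / (1.0 + np.exp(x_uij)), expit(-x_uij))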
def fit(self, R):
    self.dataset = R
    R = check_matrix(R, 'csr', dtype=np.float32)
    self.X, self.Y = mf.AsySVD_sgd(R, self.num_factors, self.lrate, self.reg,
                                   self.iters, self.init_mean, self.init_std,
                                   self.lrate_decay, self.rnd_seed)
    # Precompute the user factors
    M = R.shape[0]
    self.U = np.vstack(
        [mf.AsySVD_compute_user_factors(R[i], self.Y) for i in range(M)])
def __init__(self, URM_train, ICM, sparse_weights=True):
    super(ItemKNNCBFRecommender, self).__init__()
    self.FEATURE_WEIGHTING_VALUES = ["BM25", "TF-IDF", "none"]
    self.URM_train = check_matrix(URM_train, 'csr')
    self.ICM = ICM.copy()
    self.sparse_weights = sparse_weights
    self.parameters = None
    self.dataset = None
def __init__(self, URM_train, positive_threshold=1, sparse_weights=False):
    super(Slim_BPR_Recommender_Python, self).__init__()
    self.URM_train = check_matrix(URM_train, "csr")
    self.normalize = False
    self.positive_threshold = positive_threshold
    self.sparse_weights = sparse_weights

    self.num_users = self.URM_train.shape[0]
    self.num_items = self.URM_train.shape[1]

    # Keep only interactions at or above the positive threshold,
    # then drop the resulting zeros from the mask
    self.URM_mask = self.URM_train.copy()
    self.URM_mask.data = self.URM_mask.data >= self.positive_threshold
    self.URM_mask.eliminate_zeros()

    self.parameters = None
    self.W_sparse = None
    self.W = None
def fit(self, R):
    self.dataset = R
    R = check_matrix(R, 'csr', dtype=np.float32)
    self.X, self.Y = MF.Cython.AsySVD_sgd(R, self.num_factors, self.lrate,
                                          self.reg, self.iters, self.init_mean,
                                          self.init_std, self.lrate_decay,
                                          self.rnd_seed)
    # Precompute the user factors
    M = R.shape[0]
    self.U = np.vstack([
        MF.Cython.AsySVD_compute_user_factors(R[i], self.Y) for i in range(M)
    ])

    self.parameters = "num_factors={}, lrate={}, reg={}, iters={}, init_mean={}, " \
                      "init_std={}, lrate_decay={}, rnd_seed={}".format(
                          self.num_factors, self.lrate, self.reg, self.iters,
                          self.init_mean, self.init_std, self.lrate_decay,
                          self.rnd_seed)
def compute_score_item_based(self, playlist_id):
    if self.sparse_weights:
        self.URM_train = check_matrix(self.URM_train, "csr")
        user_profile = self.URM_train[playlist_id]
        return user_profile.dot(self.W_sparse).toarray()
    else:
        result = []
        for playlist in playlist_id:
            user_profile = self.URM_train.indices[
                self.URM_train.indptr[playlist]:self.URM_train.indptr[playlist + 1]]
            user_ratings = self.URM_train.data[
                self.URM_train.indptr[playlist]:self.URM_train.indptr[playlist + 1]]
            relevant_weights = self.W[user_profile]
            result.append(relevant_weights.T.dot(user_ratings))
        return np.array(result)
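# Sketch of the sparse scoring path above: with a 1 x n_items user profile and
# an n_items x n_items similarity matrix, the item scores are one sparse dot
# product (toy matrices, illustrative only):
import numpy as np
import scipy.sparse as sps

user_profile = sps.csr_matrix(np.array([[1., 0., 1.]]))
W_sparse = sps.csr_matrix(np.array([[0., .5, .2],
                                    [.5, 0., .1],
                                    [.2, .1, 0.]]))
scores = user_profile.dot(W_sparse).toarray()  # -> [[0.2, 0.6, 0.2]]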
def fit(self, R):
    self.dataset = R
    R = check_matrix(R, 'csr', dtype=np.float32)
    self.X, self.Y = BPRMF_sgd(R,
                               num_factors=self.num_factors,
                               lrate=self.lrate,
                               user_reg=self.user_reg,
                               pos_reg=self.pos_reg,
                               neg_reg=self.neg_reg,
                               iters=self.iters,
                               sampling_type=self.sampling_type,
                               sample_with_replacement=self.sample_with_replacement,
                               use_resampling=self.use_resampling,
                               sampling_pop_alpha=self.sampling_pop_alpha,
                               init_mean=self.init_mean,
                               init_std=self.init_std,
                               lrate_decay=self.lrate_decay,
                               rnd_seed=self.rnd_seed,
                               verbose=self.verbose)
def fit(self, l1_penalty=0.1, l2_penalty=0.1, positive_only=True, topK=100,
        workers=multiprocessing.cpu_count()):
    self.l1_penalty = l1_penalty
    self.l2_penalty = l2_penalty
    self.positive_only = positive_only
    self.l1_ratio = self.l1_penalty / (self.l1_penalty + self.l2_penalty)
    self.topK = topK
    self.workers = workers

    self.URM_train = check_matrix(self.URM_train, 'csc', dtype=np.float32)
    n_items = self.URM_train.shape[1]

    # Fit the item factors in parallel.
    # Bind the fixed part of the input to the function via partial
    _pfit = partial(self._partial_fit, X=self.URM_train, topK=self.topK)

    # Create a pool with the requested number of worker processes
    pool = Pool(processes=self.workers)

    # Run the pool, passing the function (with its fixed inputs) and the
    # remaining, variable parameter: the item index
    res = pool.map(_pfit, np.arange(n_items))

    # res contains a vector of (values, rows, cols) tuples
    values, rows, cols = [], [], []
    for values_, rows_, cols_ in res:
        values.extend(values_)
        rows.extend(rows_)
        cols.extend(cols_)

    # Generate the sparse weight matrix
    self.W_sparse = sps.csc_matrix((values, (rows, cols)),
                                   shape=(n_items, n_items),
                                   dtype=np.float32)
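# Hypothetical usage sketch (the class name MultiThreadSLIM is assumed; the fit
# signature matches the method above):
# recommender = MultiThreadSLIM(URM_train)
# recommender.fit(l1_penalty=0.1, l2_penalty=0.1, topK=100, workers=4)
# W = recommender.W_sparse  # item-item weights, one ElasticNet column per process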
def fit(self,
        alpha=1.3167219260598073,
        beta=15.939928536132701,
        gamma=0.6048873602128846,
        delta=1.0527588765188267,
        epsilon=2.08444591782293,
        zeta=1.2588273098979674,
        eta=18.41012777389885,
        theta=18.000293943452448,
        # psi=0.00130805010990942,
        normalize=False,
        save_model=False,
        submission=False,
        best_parameters=False,
        offline=False,
        location="submission"):
    if offline:
        m = OfflineDataLoader()
        folder_path, file_name = m.get_model(self.RECOMMENDER_NAME)
        self.loadModel(folder_path=folder_path, file_name=file_name)
    else:
        if best_parameters:
            m = OfflineDataLoader()
            folder_path, file_name = m.get_parameter(self.RECOMMENDER_NAME)
            self.loadModel(folder_path=folder_path, file_name=file_name)
        else:
            self.alpha = alpha
            self.beta = beta
            self.gamma = gamma
            self.delta = delta
            self.epsilon = epsilon
            self.zeta = zeta
            self.eta = eta
            self.theta = theta
            # self.psi = psi

        self.normalize = normalize
        self.submission = not submission

        # Load the pretrained component models
        m = OfflineDataLoader()

        self.m_user_knn_cf = UserKNNCFRecommender(self.URM_train)
        folder_path_ucf, file_name_ucf = m.get_model(
            UserKNNCFRecommender.RECOMMENDER_NAME, training=self.submission)
        self.m_user_knn_cf.loadModel(folder_path=folder_path_ucf,
                                     file_name=file_name_ucf)

        self.m_item_knn_cf = ItemKNNCFRecommender(self.URM_train)
        folder_path_icf, file_name_icf = m.get_model(
            ItemKNNCFRecommender.RECOMMENDER_NAME, training=self.submission)
        self.m_item_knn_cf.loadModel(folder_path=folder_path_icf,
                                     file_name=file_name_icf)

        self.m_item_knn_cbf = ItemKNNCBFRecommender(self.URM_train, self.ICM)
        folder_path_icbf, file_name_icbf = m.get_model(
            ItemKNNCBFRecommender.RECOMMENDER_NAME, training=self.submission)
        self.m_item_knn_cbf.loadModel(folder_path=folder_path_icbf,
                                      file_name=file_name_icbf)

        self.m_slim_mark1 = Slim_mark1(self.URM_train)
        folder_path_slim, file_name_slim = m.get_model(
            Slim_mark1.RECOMMENDER_NAME, training=self.submission)
        self.m_slim_mark1.loadModel(folder_path=folder_path_slim,
                                    file_name=file_name_slim)

        self.m_slim_mark2 = Slim_mark2(self.URM_train)
        folder_path_slim, file_name_slim = m.get_model(
            Slim_mark2.RECOMMENDER_NAME, training=self.submission)
        self.m_slim_mark2.loadModel(folder_path=folder_path_slim,
                                    file_name=file_name_slim)

        self.m_alpha = P3alphaRecommender(self.URM_train)
        folder_path_alpha, file_name_alpha = m.get_model(
            P3alphaRecommender.RECOMMENDER_NAME, training=self.submission)
        self.m_alpha.loadModel(folder_path=folder_path_alpha,
                               file_name=file_name_alpha)

        self.m_beta = RP3betaRecommender(self.URM_train)
        folder_path_beta, file_name_beta = m.get_model(
            RP3betaRecommender.RECOMMENDER_NAME, training=self.submission)
        self.m_beta.loadModel(folder_path=folder_path_beta,
                              file_name=file_name_beta)

        self.m_slim_elastic = SLIMElasticNetRecommender(self.URM_train)
        folder_path_elastic, file_name_elastic = m.get_model(
            SLIMElasticNetRecommender.RECOMMENDER_NAME, training=self.submission)
        self.m_slim_elastic.loadModel(folder_path=folder_path_elastic,
                                      file_name=file_name_elastic)

        # self.m_cfw = CFWBoostingRecommender(self.URM_train, self.ICM, Slim_mark2, training=self.submission)
        # fold, file = m.get_model(CFWBoostingRecommender.RECOMMENDER_NAME, training=self.submission)
        # self.m_cfw.loadModel(folder_path=fold, file_name=file)

        self.W_sparse_URM = check_matrix(self.m_user_knn_cf.W_sparse, "csr", dtype=np.float32)
        self.W_sparse_URM_T = check_matrix(self.m_item_knn_cf.W_sparse, "csr", dtype=np.float32)
        self.W_sparse_ICM = check_matrix(self.m_item_knn_cbf.W_sparse, "csr", dtype=np.float32)
        self.W_sparse_Slim1 = check_matrix(self.m_slim_mark1.W, "csr", dtype=np.float32)
        self.W_sparse_Slim2 = check_matrix(self.m_slim_mark2.W_sparse, "csr", dtype=np.float32)
        self.W_sparse_alpha = check_matrix(self.m_alpha.W_sparse, "csr", dtype=np.float32)
        self.W_sparse_beta = check_matrix(self.m_beta.W_sparse, "csr", dtype=np.float32)
        self.W_sparse_elastic = check_matrix(self.m_slim_elastic.W_sparse, "csr", dtype=np.float32)
        # self.W_sparse_cfw = check_matrix(self.m_cfw.W_sparse, "csr", dtype=np.float32)

        # Precomputations
        self.matrix_wo_user = self.alpha * self.W_sparse_URM_T + \
            self.beta * self.W_sparse_ICM + \
            self.gamma * self.W_sparse_Slim1 + \
            self.delta * self.W_sparse_Slim2 + \
            self.epsilon * self.W_sparse_alpha + \
            self.zeta * self.W_sparse_beta + \
            self.eta * self.W_sparse_elastic
        # + self.psi * self.W_sparse_cfw

        self.parameters = "alpha={}, beta={}, gamma={}, delta={}, epsilon={}, zeta={}, eta={}, theta={}".format(
            self.alpha, self.beta, self.gamma, self.delta, self.epsilon,
            self.zeta, self.eta, self.theta)

        if save_model:
            self.saveModel("saved_models/" + location + "/",
                           file_name=self.RECOMMENDER_NAME)
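# Sketch of why the precomputation above works: a weighted sum of item-item
# similarity matrices is itself an item-item model, so it can be built once and
# reused for scoring (toy matrices; the scalar weights stand in for alpha..eta):
import numpy as np
import scipy.sparse as sps

W1 = sps.csr_matrix(np.array([[0., 1.], [1., 0.]]))
W2 = sps.csr_matrix(np.array([[0., 2.], [0., 0.]]))
W_hybrid = 0.4 * W1 + 0.6 * W2                   # still sparse, same shape
user_profile = sps.csr_matrix(np.array([[1., 0.]]))
scores = user_profile.dot(W_hybrid).toarray()    # -> [[0., 1.6]]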
def fit(self, l1_ratio=0.1, positive_only=True, topK=400, save_model=False,
        best_parameters=False, offline=False, submission=False):
    self.parameters = "l1_ratio= {}, topK= {},alpha= {},tol= {},max_iter= {}".format(
        l1_ratio, topK, 0.0001, 1e-4, 100)

    if offline:
        m = OfflineDataLoader()
        folder, file = m.get_model(self.RECOMMENDER_NAME, training=(not submission))
        self.loadModel(folder_path=folder, file_name=file)
    else:
        assert 0 <= l1_ratio <= 1, \
            "SLIM_ElasticNet: l1_ratio must be between 0 and 1, provided value was {}".format(l1_ratio)

        self.l1_ratio = l1_ratio
        self.positive_only = positive_only
        self.topK = topK

        # Initialize the ElasticNet model
        self.model = ElasticNet(alpha=0.0001,
                                l1_ratio=self.l1_ratio,
                                positive=self.positive_only,
                                fit_intercept=False,
                                copy_X=False,
                                precompute=True,
                                selection='random',
                                max_iter=100,
                                tol=1e-4)

        URM_train = check_matrix(self.URM_train, 'csc', dtype=np.float32)
        n_items = URM_train.shape[1]

        # Use arrays as they reduce memory requirements compared to lists
        dataBlock = 10000000
        rows = np.zeros(dataBlock, dtype=np.int32)
        cols = np.zeros(dataBlock, dtype=np.int32)
        values = np.zeros(dataBlock, dtype=np.float32)
        numCells = 0

        start_time = time.time()
        start_time_printBatch = start_time

        # Fit each item's factors sequentially (not in parallel)
        for currentItem in tqdm(range(n_items)):
            # Get the target column
            y = URM_train[:, currentItem].toarray()

            # Set the j-th column of X to zero
            start_pos = URM_train.indptr[currentItem]
            end_pos = URM_train.indptr[currentItem + 1]

            current_item_data_backup = URM_train.data[start_pos:end_pos].copy()
            URM_train.data[start_pos:end_pos] = 0.0

            # Fit one ElasticNet model per column
            self.model.fit(URM_train, y)

            nonzero_model_coef_index = self.model.sparse_coef_.indices
            nonzero_model_coef_value = self.model.sparse_coef_.data

            local_topK = min(len(nonzero_model_coef_value) - 1, self.topK)

            relevant_items_partition = (-nonzero_model_coef_value).argpartition(local_topK)[0:local_topK]
            relevant_items_partition_sorting = np.argsort(-nonzero_model_coef_value[relevant_items_partition])
            ranking = relevant_items_partition[relevant_items_partition_sorting]

            for index in range(len(ranking)):
                if numCells == len(rows):
                    rows = np.concatenate((rows, np.zeros(dataBlock, dtype=np.int32)))
                    cols = np.concatenate((cols, np.zeros(dataBlock, dtype=np.int32)))
                    values = np.concatenate((values, np.zeros(dataBlock, dtype=np.float32)))

                rows[numCells] = nonzero_model_coef_index[ranking[index]]
                cols[numCells] = currentItem
                values[numCells] = nonzero_model_coef_value[ranking[index]]
                numCells += 1

            # Finally, replace the original values of the j-th column
            URM_train.data[start_pos:end_pos] = current_item_data_backup

            if time.time() - start_time_printBatch > 300 or currentItem == n_items - 1:
                print("Processed {} ( {:.2f}% ) in {:.2f} minutes. Items per second: {:.0f}".format(
                    currentItem + 1,
                    100.0 * float(currentItem + 1) / n_items,
                    (time.time() - start_time) / 60,
                    float(currentItem) / (time.time() - start_time)))
                sys.stdout.flush()
                sys.stderr.flush()
                start_time_printBatch = time.time()

        # Generate the sparse weight matrix
        self.W_sparse = sps.csr_matrix(
            (values[:numCells], (rows[:numCells], cols[:numCells])),
            shape=(n_items, n_items), dtype=np.float32)

        if save_model:
            self.saveModel("saved_models/submission/",
                           file_name=self.RECOMMENDER_NAME)
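# Hypothetical usage sketch of the two paths above (training from scratch vs.
# loading a previously saved model through OfflineDataLoader):
# recommender = SLIMElasticNetRecommender(URM_train)
# recommender.fit(l1_ratio=0.1, topK=400)   # train column-by-column
# recommender.fit(offline=True)             # or load the pretrained weights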
def _remove_seen_on_scores(self, playlist_id, scores):
    self.URM_train = check_matrix(self.URM_train, "csr")
    seen = self.URM_train.indices[
        self.URM_train.indptr[playlist_id]:self.URM_train.indptr[playlist_id + 1]]
    scores[seen] = -np.inf
    return scores
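# Minimal sketch of the masking above: scores of already-seen items are set to
# -inf so a subsequent top-K ranking can never select them again.
import numpy as np

scores = np.array([0.4, 0.9, 0.1])
seen = np.array([1])           # item indices already in the user's profile
scores[seen] = -np.inf         # -> [0.4, -inf, 0.1]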
def fit(self,
        alpha=0.0500226666668111,
        beta=0.9996482062853596,
        gamma=0.36595766622100967,
        theta=0.22879224932897924,
        omega=0.5940982982110466,
        normalize=False,
        save_model=False,
        submission=False,
        best_parameters=False):
    if best_parameters:
        m = OfflineDataLoader()
        folder_path, file_name = m.get_parameter(self.RECOMMENDER_NAME)
        self.loadModel(folder_path=folder_path, file_name=file_name)
    else:
        self.alpha = alpha
        self.beta = beta
        self.gamma = gamma
        self.theta = theta
        self.omega = omega

    self.normalize = normalize
    self.submission = not submission

    # Load the pretrained component models
    m = OfflineDataLoader()

    self.m_user_knn_cf = UserKNNCFRecommender(self.URM_train)
    folder_path_ucf, file_name_ucf = m.get_model(
        UserKNNCFRecommender.RECOMMENDER_NAME, training=self.submission)
    self.m_user_knn_cf.loadModel(folder_path=folder_path_ucf,
                                 file_name=file_name_ucf)

    self.m_item_knn_cf = ItemKNNCFRecommender(self.URM_train)
    folder_path_icf, file_name_icf = m.get_model(
        ItemKNNCFRecommender.RECOMMENDER_NAME, training=self.submission)
    self.m_item_knn_cf.loadModel(folder_path=folder_path_icf,
                                 file_name=file_name_icf)

    self.m_item_knn_cbf = ItemKNNCBFRecommender(self.URM_train, self.ICM)
    folder_path_icbf, file_name_icbf = m.get_model(
        ItemKNNCBFRecommender.RECOMMENDER_NAME, training=self.submission)
    self.m_item_knn_cbf.loadModel(folder_path=folder_path_icbf,
                                  file_name=file_name_icbf)

    self.m_slim_mark1 = Slim_mark1(self.URM_train)
    folder_path_slim, file_name_slim = m.get_model(
        Slim_mark1.RECOMMENDER_NAME, training=self.submission)
    self.m_slim_mark1.loadModel(folder_path=folder_path_slim,
                                file_name=file_name_slim)

    self.m_alpha = P3alphaRecommender(self.URM_train)
    folder_path_alpha, file_name_alpha = m.get_model(
        P3alphaRecommender.RECOMMENDER_NAME, training=self.submission)
    self.m_alpha.loadModel(folder_path=folder_path_alpha,
                           file_name=file_name_alpha)

    self.m_beta = RP3betaRecommender(self.URM_train)
    folder_path_beta, file_name_beta = m.get_model(
        RP3betaRecommender.RECOMMENDER_NAME, training=self.submission)
    self.m_beta.loadModel(folder_path=folder_path_beta,
                          file_name=file_name_beta)

    self.W_sparse_URM = check_matrix(self.m_user_knn_cf.W_sparse, "csr", dtype=np.float32)
    self.W_sparse_ICM = check_matrix(self.m_item_knn_cbf.W_sparse, "csr", dtype=np.float32)
    self.W_sparse_URM_T = check_matrix(self.m_item_knn_cf.W_sparse, "csr", dtype=np.float32)
    self.W_sparse_Slim = check_matrix(self.m_slim_mark1.W, "csr", dtype=np.float32)
    self.W_sparse_alpha = check_matrix(self.m_alpha.W_sparse, "csr", dtype=np.float32)
    self.W_sparse_beta = check_matrix(self.m_beta.W_sparse, "csr", dtype=np.float32)

    # Precomputations
    self.matrix_first_branch = self.alpha * self.W_sparse_ICM + \
        (1 - self.alpha) * self.W_sparse_Slim
    self.matrix_right = self.beta * self.matrix_first_branch + \
        (1 - self.beta) * self.W_sparse_URM_T
    self.matrix_alpha_beta = self.gamma * self.W_sparse_alpha + \
        (1 - self.gamma) * self.W_sparse_beta

    self.parameters = "alpha={}, beta={}, gamma={}, omega={}, theta={}".format(
        self.alpha, self.beta, self.gamma, self.omega, self.theta)

    if save_model:
        self.saveModel("saved_models/submission/",
                       file_name="ItemTreeRecommender_offline")
def fit(self,
        alpha=0.80849266253816,
        beta=0.7286503831547066,
        gamma=0.02895704968752022,
        sigma=0.453342,
        tau=0.542421,
        chi=1.8070865821028037,
        psi=4.256005405227253,
        omega=5.096018341419944,
        coeff=39.966898886531645,
        normalize=False,
        save_model=False,
        submission=False,
        best_parameters=False,
        offline=False,
        location="submission"):
    if offline:
        m = OfflineDataLoader()
        folder_path, file_name = m.get_model(self.RECOMMENDER_NAME)
        self.loadModel(folder_path=folder_path, file_name=file_name)
    else:
        if best_parameters:
            m = OfflineDataLoader()
            folder_path, file_name = m.get_parameter(self.RECOMMENDER_NAME)
            self.loadModel(folder_path=folder_path, file_name=file_name)
        else:
            self.alpha = alpha
            self.beta = beta
            self.gamma = gamma
            self.sigma = sigma
            self.tau = tau
            self.chi = chi
            self.psi = psi
            self.omega = omega
            self.coeff = coeff

        self.normalize = normalize
        self.submission = not submission

        # Load the pretrained component models
        m = OfflineDataLoader()

        self.m_user_knn_cf = UserKNNCFRecommender(self.URM_train)
        folder_path_ucf, file_name_ucf = m.get_model(
            UserKNNCFRecommender.RECOMMENDER_NAME, training=self.submission)
        self.m_user_knn_cf.loadModel(folder_path=folder_path_ucf,
                                     file_name=file_name_ucf)

        self.m_item_knn_cf = ItemKNNCFRecommender(self.URM_train)
        folder_path_icf, file_name_icf = m.get_model(
            ItemKNNCFRecommender.RECOMMENDER_NAME, training=self.submission)
        self.m_item_knn_cf.loadModel(folder_path=folder_path_icf,
                                     file_name=file_name_icf)

        self.m_item_knn_cbf = ItemKNNCBFRecommender(self.URM_train, self.ICM)
        folder_path_icbf, file_name_icbf = m.get_model(
            ItemKNNCBFRecommender.RECOMMENDER_NAME, training=self.submission)
        self.m_item_knn_cbf.loadModel(folder_path=folder_path_icbf,
                                      file_name=file_name_icbf)

        self.m_slim_mark1 = Slim_mark1(self.URM_train)
        folder_path_slim, file_name_slim = m.get_model(
            Slim_mark1.RECOMMENDER_NAME, training=self.submission)
        self.m_slim_mark1.loadModel(folder_path=folder_path_slim,
                                    file_name=file_name_slim)

        self.m_slim_mark2 = Slim_mark2(self.URM_train)
        folder_path_slim, file_name_slim = m.get_model(
            Slim_mark2.RECOMMENDER_NAME, training=self.submission)
        self.m_slim_mark2.loadModel(folder_path=folder_path_slim,
                                    file_name=file_name_slim)

        self.m_alpha = P3alphaRecommender(self.URM_train)
        folder_path_alpha, file_name_alpha = m.get_model(
            P3alphaRecommender.RECOMMENDER_NAME, training=self.submission)
        self.m_alpha.loadModel(folder_path=folder_path_alpha,
                               file_name=file_name_alpha)

        self.m_beta = RP3betaRecommender(self.URM_train)
        folder_path_beta, file_name_beta = m.get_model(
            RP3betaRecommender.RECOMMENDER_NAME, training=self.submission)
        self.m_beta.loadModel(folder_path=folder_path_beta,
                              file_name=file_name_beta)

        self.m_slim_elastic = SLIMElasticNetRecommender(self.URM_train)
        folder_path_elastic, file_name_elastic = m.get_model(
            SLIMElasticNetRecommender.RECOMMENDER_NAME, training=self.submission)
        self.m_slim_elastic.loadModel(folder_path=folder_path_elastic,
                                      file_name=file_name_elastic)

        self.W_sparse_URM = check_matrix(self.m_user_knn_cf.W_sparse, "csr", dtype=np.float32)
        self.W_sparse_URM_T = check_matrix(self.m_item_knn_cf.W_sparse, "csr", dtype=np.float32)
        self.W_sparse_ICM = check_matrix(self.m_item_knn_cbf.W_sparse, "csr", dtype=np.float32)
        self.W_sparse_Slim1 = check_matrix(self.m_slim_mark1.W, "csr", dtype=np.float32)
        self.W_sparse_Slim2 = check_matrix(self.m_slim_mark2.W_sparse, "csr", dtype=np.float32)
        self.W_sparse_alpha = check_matrix(self.m_alpha.W_sparse, "csr", dtype=np.float32)
        self.W_sparse_beta = check_matrix(self.m_beta.W_sparse, "csr", dtype=np.float32)
        self.W_sparse_elastic = check_matrix(self.m_slim_elastic.W_sparse, "csr", dtype=np.float32)

        # Precomputations
        # TODO
        self.matrix_alpha_beta = self.alpha * self.W_sparse_alpha + \
            (1 - self.alpha) * self.W_sparse_beta
        self.matrix_slim = self.beta * self.W_sparse_Slim2 + \
            (1 - self.beta) * self.W_sparse_elastic * self.coeff + \
            self.sigma * self.W_sparse_Slim1

        self.parameters = "alpha={}, beta={}, gamma={}, sigma={}, tau={}, chi={}, psi={}, omega={}, coeff={}".format(
            self.alpha, self.beta, self.gamma, self.sigma, self.tau, self.chi,
            self.psi, self.omega, self.coeff)

        if save_model:
            self.saveModel("saved_models/" + location + "/",
                           file_name=self.RECOMMENDER_NAME)
def compute_similarity(self):
    values = []
    rows = []
    cols = []

    if self.verbose:
        print("Similarity matrix computation started ({} mode).".format(self.mode))

    start_time = time.time()
    processedItems = 0

    if self.adjusted_cosine:
        self.applyAdjustedCosine()
    elif self.pearson_correlation:
        self.applyPearsonCorrelation()
    elif self.tanimoto_coefficient:
        self.useOnlyBooleanInteractions()

    # We explore the matrix column-wise
    self.dataMatrix = check_matrix(self.dataMatrix, 'csc')

    # Compute sum of squared values to be used in normalization
    sumOfSquared = np.array(self.dataMatrix.power(2).sum(axis=0)).ravel()

    # Tanimoto does not require the square root to be applied
    if not self.tanimoto_coefficient:
        sumOfSquared = np.sqrt(sumOfSquared)

    # Compute all similarities for each item using vectorization
    for columnIndex in tqdm(range(self.n_columns)):
        processedItems += 1

        # All data points for a given item
        item_data = self.dataMatrix[:, columnIndex]
        item_data = item_data.toarray().squeeze()

        # Compute item similarities
        this_column_weights = self.dataMatrix.T.dot(item_data)
        this_column_weights[columnIndex] = 0.0

        # Apply normalization and shrinkage, ensure denominator != 0
        if self.normalize:
            denominator = sumOfSquared[columnIndex] * sumOfSquared + self.shrink + 1e-6
            this_column_weights = np.multiply(this_column_weights, 1 / denominator)

        # Apply the specific denominator for Tanimoto
        if self.tanimoto_coefficient:
            denominator = sumOfSquared[columnIndex] + sumOfSquared - this_column_weights + self.shrink + 1e-6
            this_column_weights = np.multiply(this_column_weights, 1 / denominator)

        # If neither normalization nor Tanimoto is selected, apply only the shrink
        elif self.shrink != 0:
            this_column_weights = this_column_weights / self.shrink

        if self.TopK == 0:
            self.W_dense[:, columnIndex] = this_column_weights
        else:
            relevant_items_partition = (-this_column_weights).argpartition(self.TopK - 1)[0:self.TopK]
            relevant_items_partition_sorting = np.argsort(-this_column_weights[relevant_items_partition])
            top_k_idx = relevant_items_partition[relevant_items_partition_sorting]

            # Incrementally build the sparse matrix
            values.extend(this_column_weights[top_k_idx])
            rows.extend(top_k_idx)
            cols.extend(np.ones(self.TopK) * columnIndex)

    if self.verbose:
        print("Computation completed in {:.2f} minutes".format(
            (time.time() - start_time) / 60))

    if self.TopK == 0:
        return self.W_dense
    else:
        W_sparse = sps.csr_matrix((values, (rows, cols)),
                                  shape=(self.n_columns, self.n_columns),
                                  dtype=np.float32)
        return W_sparse
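# The normalized branch above implements shrunk cosine similarity:
#   sim(i, j) = (v_i . v_j) / (||v_i|| * ||v_j|| + shrink + 1e-6)
# since sumOfSquared holds the column norms after the square root. A minimal
# numeric sketch of that formula for two item vectors:
import numpy as np

v_i = np.array([1., 0., 2.])
v_j = np.array([2., 1., 0.])
shrink = 10.0
sim = v_i.dot(v_j) / (np.linalg.norm(v_i) * np.linalg.norm(v_j) + shrink + 1e-6)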
def __init__(self, URM_train):
    super(FunkSVD, self).__init__()
    self.URM_train = check_matrix(URM_train, 'csr', dtype=np.float32)
    self.parameters = None
def compute_similarity(self, start_col=None, end_col=None, block_size=100):
    """
    Compute the similarity for the given dataset
    :param self:
    :param start_col: column to begin with
    :param end_col: column to stop before, end_col is excluded
    :return:
    """
    values = []
    rows = []
    cols = []

    start_time = time.time()
    start_time_print_batch = start_time
    processedItems = 0

    if self.adjusted_cosine:
        self.applyAdjustedCosine()
    elif self.pearson_correlation:
        self.applyPearsonCorrelation()
    elif self.tanimoto_coefficient or self.dice_coefficient or self.tversky_coefficient:
        self.useOnlyBooleanInteractions()

    # We explore the matrix column-wise
    self.dataMatrix = check_matrix(self.dataMatrix, 'csc')

    # Compute sum of squared values to be used in normalization
    sumOfSquared = np.array(self.dataMatrix.power(2).sum(axis=0)).ravel()

    # Tanimoto does not require the square root to be applied
    if not (self.tanimoto_coefficient or self.dice_coefficient or self.tversky_coefficient):
        sumOfSquared = np.sqrt(sumOfSquared)

    if self.asymmetric_cosine:
        sumOfSquared_to_1_minus_alpha = np.power(sumOfSquared, 2 * (1 - self.asymmetric_alpha))
        sumOfSquared_to_alpha = np.power(sumOfSquared, 2 * self.asymmetric_alpha)

    self.dataMatrix = check_matrix(self.dataMatrix, 'csc')

    start_col_local = 0
    end_col_local = self.n_columns

    if start_col is not None and start_col > 0 and start_col < self.n_columns:
        start_col_local = start_col

    if end_col is not None and end_col > start_col_local and end_col < self.n_columns:
        end_col_local = end_col

    start_col_block = start_col_local
    this_block_size = 0

    # Compute all similarities for each item using vectorization
    while start_col_block < end_col_local:
        # Add previous block size
        processedItems += this_block_size

        end_col_block = min(start_col_block + block_size, end_col_local)
        this_block_size = end_col_block - start_col_block

        if time.time() - start_time_print_batch >= 30 or end_col_block == end_col_local:
            columnPerSec = processedItems / (time.time() - start_time)
            print("Similarity column {} ( {:2.0f} % ), {:.2f} column/sec, elapsed time {:.2f} min".format(
                processedItems,
                processedItems / (end_col_local - start_col_local) * 100,
                columnPerSec,
                (time.time() - start_time) / 60))
            sys.stdout.flush()
            sys.stderr.flush()
            start_time_print_batch = time.time()

        # All data points for a given item
        item_data = self.dataMatrix[:, start_col_block:end_col_block]
        item_data = item_data.toarray().squeeze()

        if self.use_row_weights:
            # item_data = np.multiply(item_data, self.row_weights)
            # item_data = item_data.T.dot(self.row_weights_diag).T
            this_block_weights = self.dataMatrix_weighted.T.dot(item_data)
        else:
            # Compute item similarities
            this_block_weights = self.dataMatrix.T.dot(item_data)

        for col_index_in_block in range(this_block_size):
            if this_block_size == 1:
                this_column_weights = this_block_weights
            else:
                this_column_weights = this_block_weights[:, col_index_in_block]

            columnIndex = col_index_in_block + start_col_block
            this_column_weights[columnIndex] = 0.0

            # Apply normalization and shrinkage, ensure denominator != 0
            if self.normalize:
                if self.asymmetric_cosine:
                    denominator = sumOfSquared_to_alpha[columnIndex] * sumOfSquared_to_1_minus_alpha + self.shrink + 1e-6
                else:
                    denominator = sumOfSquared[columnIndex] * sumOfSquared + self.shrink + 1e-6
                this_column_weights = np.multiply(this_column_weights, 1 / denominator)

            # Apply the specific denominator for Tanimoto
            elif self.tanimoto_coefficient:
                denominator = sumOfSquared[columnIndex] + sumOfSquared - this_column_weights + self.shrink + 1e-6
                this_column_weights = np.multiply(this_column_weights, 1 / denominator)

            elif self.dice_coefficient:
                denominator = sumOfSquared[columnIndex] + sumOfSquared + self.shrink + 1e-6
                this_column_weights = np.multiply(this_column_weights, 1 / denominator)

            elif self.tversky_coefficient:
                denominator = this_column_weights + \
                    (sumOfSquared[columnIndex] - this_column_weights) * self.tversky_alpha + \
                    (sumOfSquared - this_column_weights) * self.tversky_beta + self.shrink + 1e-6
                this_column_weights = np.multiply(this_column_weights, 1 / denominator)

            # If no normalization or coefficient is selected, apply only the shrink
            elif self.shrink != 0:
                this_column_weights = this_column_weights / self.shrink

            # this_column_weights = this_column_weights.toarray().ravel()

            if self.TopK == 0:
                self.W_dense[:, columnIndex] = this_column_weights
            else:
                # Sort indices and select TopK.
                # Sorting is done in three steps, faster than a plain np.argsort
                # for a higher number of items:
                # - Partition the data to extract the set of relevant items
                # - Sort only the relevant items
                # - Get the original item index
                relevant_items_partition = (-this_column_weights).argpartition(self.TopK - 1)[0:self.TopK]
                relevant_items_partition_sorting = np.argsort(-this_column_weights[relevant_items_partition])
                top_k_idx = relevant_items_partition[relevant_items_partition_sorting]

                # Incrementally build the sparse matrix, do not add zeros
                notZerosMask = this_column_weights[top_k_idx] != 0.0
                numNotZeros = np.sum(notZerosMask)

                values.extend(this_column_weights[top_k_idx][notZerosMask])
                rows.extend(top_k_idx[notZerosMask])
                cols.extend(np.ones(numNotZeros) * columnIndex)

        start_col_block += block_size

    # End while on columns

    if self.TopK == 0:
        return self.W_dense
    else:
        W_sparse = sps.csr_matrix((values, (rows, cols)),
                                  shape=(self.n_columns, self.n_columns),
                                  dtype=np.float32)
        return W_sparse
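# Sketch of using the column-range arguments above to split the computation
# (e.g. across processes). Each call returns a full-shape matrix with only its
# column range filled, so disjoint ranges can plausibly be combined by summing;
# the `similarity` object and the split point are assumed:
# W_left = similarity.compute_similarity(start_col=0, end_col=5000)
# W_right = similarity.compute_similarity(start_col=5000)
# W_full = W_left + W_right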
def __init__(self, URM_train):
    super(PureSVDRecommender, self).__init__()

    # CSR is faster during evaluation
    self.URM_train = check_matrix(URM_train, 'csr')
    self.compute_item_score = self.compute_score_SVD
    self.parameters = None
def fit(self, l1_penalty=0.01, l2_penalty=0.01, positive_only=True, topK=100):
    self.l1_penalty = l1_penalty
    self.l2_penalty = l2_penalty
    self.positive_only = positive_only
    self.l1_ratio = self.l1_penalty / (self.l1_penalty + self.l2_penalty)
    self.topK = topK

    self.parameters = "sparse_weights= {0},normalize= {1}, l1_penalty= {2}, l2_penalty= {3}, positive_only= {4}".format(
        self.sparse_weights, self.normalize, self.l1_penalty, self.l2_penalty,
        self.positive_only)

    X = check_matrix(self.URM_train.T, 'csc', dtype=np.float32)
    n_items = X.shape[1]

    # Initialize the ElasticNet model
    print("Slim: ElasticNet model fitting begins")
    self.model = ElasticNet(alpha=1.0,
                            l1_ratio=self.l1_ratio,
                            positive=self.positive_only,
                            fit_intercept=True,
                            copy_X=False,
                            precompute=True,
                            selection='random',
                            max_iter=100,
                            tol=1e-4)
    print("Slim: ElasticNet model initialized")

    values, rows, cols = [], [], []

    start_time = time.time()
    start_time_printBatch = start_time

    # Fit each item's factors sequentially (not in parallel)
    for currentItem in range(n_items):
        # Get the target column
        y = X[:, currentItem].toarray()

        # Set the j-th column of X to zero
        startptr = X.indptr[currentItem]
        endptr = X.indptr[currentItem + 1]
        bak = X.data[startptr:endptr].copy()
        X.data[startptr:endptr] = 0.0

        # Fit one ElasticNet model per column
        # TODO
        self.model.fit(X, y)

        relevant_items_partition = (-self.model.coef_).argpartition(self.topK)[0:self.topK]
        relevant_items_partition_sorting = np.argsort(-self.model.coef_[relevant_items_partition])
        ranking = relevant_items_partition[relevant_items_partition_sorting]

        notZerosMask = self.model.coef_[ranking] > 0.0
        ranking = ranking[notZerosMask]

        values.extend(self.model.coef_[ranking])
        rows.extend(ranking)
        cols.extend([currentItem] * len(ranking))

        # Finally, replace the original values of the j-th column
        X.data[startptr:endptr] = bak

        if time.time() - start_time_printBatch > 300:
            print("Processed {} ( {:.2f}% ) in {:.2f} minutes. Columns per second: {:.0f}".format(
                currentItem,
                100.0 * float(currentItem) / n_items,
                (time.time() - start_time) / 60,
                float(currentItem) / (time.time() - start_time)))
            sys.stdout.flush()
            sys.stderr.flush()
            start_time_printBatch = time.time()

    # Generate the sparse weight matrix
    self.W_sparse = sps.csc_matrix((values, (rows, cols)),
                                   shape=(n_items, n_items),
                                   dtype=np.float32)
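# Sketch of the three-step top-K selection used in the loop above (partition,
# sort only the partition, map back to the original indices); this is faster
# than a full argsort when n_items is large:
import numpy as np

coef = np.array([0.1, 0.9, 0.3, 0.7, 0.0])
topK = 2
part = (-coef).argpartition(topK)[:topK]   # unordered indices of the top-K values
ranking = part[np.argsort(-coef[part])]    # -> array([1, 3]): 0.9 first, then 0.7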