def recommend_batch(self, userids, N=10, urm=None, filter_already_liked=True,
                    with_scores=False, items_to_exclude=[], verbose=False):
    # score every item for each user in the batch with a single sparse
    # product against the learned similarity matrix
    user_profile_batch = self.URM_train[userids]
    scores_array = user_profile_batch.dot(self.W_sparse).toarray()

    if filter_already_liked:
        # mask out items the users have already interacted with
        scores_array[user_profile_batch.nonzero()] = -np.inf
    if len(items_to_exclude) > 0:
        raise NotImplementedError('Items to exclude functionality is not implemented yet')

    i = 0
    recommendations = []
    for row_index in range(scores_array.shape[0]):
        scores = scores_array[row_index]
        # take the top-N indices with a partial sort, then fully sort only those N
        relevant_items_partition = (-scores).argpartition(N)[0:N]
        relevant_items_partition_sorting = np.argsort(-scores[relevant_items_partition])
        ranking = relevant_items_partition[relevant_items_partition_sorting]

        if with_scores:
            s = scores_array[row_index, ranking]
            recommendations.append([userids[row_index]] + [list(zip(list(ranking), list(s)))])
        else:
            recommendations.append([userids[row_index]] + list(ranking))

        if verbose:
            i += 1
            log.progressbar(i, scores_array.shape[0], prefix='Building recommendations ')
    return recommendations
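
# Why argpartition instead of a full argsort: np.argsort over all item scores
# costs O(n_items * log(n_items)) per user, while np.argpartition isolates the
# top-N indices in O(n_items) and only those N scores are then sorted.
# Minimal illustration (values made up for the example):
#
#   scores = np.array([0.1, 0.9, 0.3, 0.7])
#   top2 = (-scores).argpartition(2)[:2]   # indices {1, 3}, in arbitrary order
#   top2[np.argsort(-scores[top2])]        # -> array([1, 3]), best score first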
def fit(self, R):
    """
    Fit the latent factors on the interaction matrix R by alternating
    least squares over the confidence-weighted data.
    """
    self.dataset = R
    # compute the confidence matrix
    if self.scaling == 'linear':
        C = self._linear_scaling(R)
    else:
        C = self._log_scaling(R)
    Ct = C.T.tocsr()
    M, N = R.shape

    # set the seed
    np.random.seed(self.rnd_seed)
    # initialize the latent factors
    self.X = np.random.normal(self.init_mean, self.init_std, size=(M, self.num_factors))
    self.Y = np.random.normal(self.init_mean, self.init_std, size=(N, self.num_factors))

    # alternate between solving the user factors X (items fixed) and the
    # item factors Y (users fixed)
    for it in range(self.iters):
        self.X = self._lsq_solver_fast(C, self.X, self.Y, self.reg)
        self.Y = self._lsq_solver_fast(Ct, self.Y, self.X, self.reg)
        log.progressbar(it + 1, self.iters)
        log.error('Finished iter {}'.format(it + 1))
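
# Hedged sketch (not the repo's code) of what an `_lsq_solver_fast`-style update
# could compute, assuming the standard implicit-feedback ALS closed form of
# Hu, Koren & Volinsky: x_u = (Y^T C^u Y + reg*I)^-1 Y^T C^u p(u), with the
# usual speed-up Y^T C^u Y = Y^T Y + Y^T (C^u - I) Y over the nonzeros only.
# Assumes C is csr and stores the full confidences c_ui on the nonzeros;
# requires `import numpy as np`.
def _als_update_sketch(C, X, Y, reg):
    rows, factors = X.shape
    YtY = Y.T.dot(Y)  # shared across all rows of X
    for u in range(rows):
        start, end = C.indptr[u], C.indptr[u + 1]
        items = C.indices[start:end]       # columns with nonzero confidence
        conf = C.data[start:end]
        Yu = Y[items]                      # (n_u, factors)
        A = YtY + Yu.T.dot((conf - 1)[:, None] * Yu) + reg * np.eye(factors)
        b = Yu.T.dot(conf)                 # Y^T C^u p(u), with p(u)=1 on nonzeros
        X[u] = np.linalg.solve(A, b)
    return X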
def recommend_batch(self, userids, urm, N=10, filter_already_liked=True, with_scores=True,
                    items_to_exclude=[], verbose=False):
    """
    Recommend the N best items for the specified list of users.

    Parameters
    ----------
    userids : list of int
        The user ids to calculate recommendations for
    urm : csr_matrix
        A sparse matrix of shape (number_users, number_items). It is used
        to look up the liked items and their weights for each user, to
        filter already-liked items out of the output, and potentially to
        give more information for choosing the best items for the user.
    N : int, optional
        The number of recommendations to return
    items_to_exclude : list of int, optional
        List of extra item ids to filter out from the output

    Returns
    -------
    list
        List of (user_id, recommendations), where recommendations is a list
        of length N of (itemid, score) tuples:
        [ [7,  [(18, 0.7), (11, 0.6), ...]],
          [13, [(65, 0.9), (83, 0.4), ...]],
          [25, [(30, 0.8), (49, 0.3), ...]],
          ... ]
    """
    i = 0
    L = len(userids)
    result = []
    for userid in userids:
        recs = self.recommend(userid, N=N, urm=urm, filter_already_liked=filter_already_liked,
                              with_scores=with_scores, items_to_exclude=items_to_exclude)
        result.append(recs)
        if verbose:
            i += 1
            log.progressbar(i, L, prefix='Building recommendations ')
    return result
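
# Usage sketch for the wrapper above (the ids and the `model`/`urm_test` names
# are illustrative, not from the repo):
#
#   recs = model.recommend_batch([7, 13, 25], urm=urm_test, N=10, verbose=True)
#   # -> [[7, [(18, 0.7), (11, 0.6), ...]], [13, [(65, 0.9), ...]], ...]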
def validate(self, ks, alphas, betas, ls, cs, shrinks, filename='splus_validation',
             path='validation_results', verbose=False):
    """
    Grid-search the S-plus similarity hyperparameters, logging the MAP of
    each configuration to a timestamped results file.
    """
    distance = CFItemBased.SIM_SPLUS
    # example grids:
    # ks = [100, 200, 300]
    # alphas = [0.25, 0.5, 0.75]
    # betas = [0.25, 0.5, 0.75]
    # ls = [0.25, 0.5, 0.75]
    # cs = [0.25, 0.5, 0.75]
    # shrinks = [0, 10, 30]
    i = 0
    tot = len(ks) * len(alphas) * len(betas) * len(ls) * len(cs) * len(shrinks)
    # build the path of the output file
    filename = datedir.create_folder(rootpath=path, filename=filename, extension='txt')
    with open(filename, 'w') as file:
        for k in ks:
            for a in alphas:
                for b in betas:
                    for l in ls:
                        for c in cs:
                            for shrink in shrinks:
                                model = CFItemBased()
                                recs, map10 = model.run(distance=distance, k=k, shrink=shrink,
                                                        alpha=a, beta=b, c=c, l=l,
                                                        export=False, verbose=verbose)
                                logmsg = 'MAP: {} \tknn: {} \ta: {} \tb: {} \tl: {} \tc: {} \tshrink: {}\n'.format(
                                    map10, k, a, b, l, c, shrink)
                                file.write(logmsg)
                                i += 1
                                log.progressbar(i, tot, prefix='Validation: ')
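
# Usage sketch, reusing the example grids commented above (assumes `validate`
# is exposed on CFItemBased; 3^5 * 3 = 729 configurations in this case):
#
#   model = CFItemBased()
#   model.validate(ks=[100, 200, 300], alphas=[0.25, 0.5, 0.75],
#                  betas=[0.25, 0.5, 0.75], ls=[0.25, 0.5, 0.75],
#                  cs=[0.25, 0.5, 0.75], shrinks=[0, 10, 30])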
def create_ucm_from_urm(urm_train):
    """
    Create the UCM (user content matrix) from a train URM by summing, for
    each playlist, the ICM rows of its tracks.

    @Params
    urm_train   (csr_matrix) train URM of shape (number_playlists, number_tracks)
    """
    # random suffix to avoid clobbering a previous run
    path = 'raw_data/ucm' + str(randint(1, 100))
    print('starting dataset creation of UCM in ' + path)

    # maybe a dense array could work better here?
    ICM = csr_matrix(create_icm(d.get_tracks_df(), []))
    # lil_matrix supports efficient incremental row updates
    UCM = lil_matrix((d.N_PLAYLISTS, ICM.shape[1]), dtype=np.int64)
    for p in range(d.N_PLAYLISTS):
        track_indices = urm_train[p].nonzero()[1]
        for track_id in track_indices:
            UCM[p] += ICM.getrow(track_id)
        log.progressbar(p + 1, d.N_PLAYLISTS)

    # save the matrix; save_npz does not support the lil format, so convert
    # to csr first
    os.mkdir(path)
    save_npz(path + '/ucm', UCM.tocsr())
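
# Round-trip sketch: save_npz appends the .npz extension when it is missing,
# and load_npz returns the matrix in the format it was saved in (csr here,
# after the conversion above):
#
#   from scipy.sparse import load_npz
#   UCM = load_npz(path + '/ucm.npz')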