def fit(self, urm, target_playlist, target_tracks, dataset, shrinkage=10, k_filtering=50):
    """
    Fit a user-based model whose playlist similarity is computed on an
    augmented rating matrix.

    urm: user rating matrix
    target_playlist: list of playlist ids
    target_tracks: list of track ids
    dataset: object providing get_playlist_index_from_id / get_track_index_from_id
    shrinkage: shrinkage factor for significance weighting
    k_filtering: forwarded to compute_cosine as k_filtering

    If self.r_hat_aug is None, a ContentBasedFiltering model is fitted and
    its getR_hat() result is cached in self.r_hat_aug.
    The predictions (one row per target playlist, one column per target
    track) are stored in self.R_hat.
    """
    # initialization
    self.pl_id_list = list(target_playlist)
    self.tr_id_list = list(target_tracks)
    self.dataset = dataset
    print("CBF started")
    urm = urm.tocsr()

    if self.r_hat_aug is None:
        # Use this method's own arguments: the names tg_playlist, tg_tracks
        # and ds are not defined in this scope.
        cbf = ContentBasedFiltering()
        cbf.fit(urm, self.pl_id_list, self.tr_id_list, dataset)
        self.r_hat_aug = cbf.getR_hat()

    pl_indices = [dataset.get_playlist_index_from_id(x) for x in self.pl_id_list]
    tr_indices = [dataset.get_track_index_from_id(x) for x in self.tr_id_list]

    # cosine similarity between the target playlists and all playlists,
    # computed on the augmented matrix
    S = compute_cosine(self.r_hat_aug[pl_indices],
                       self.r_hat_aug.transpose(),
                       k_filtering=k_filtering,
                       shrinkage=shrinkage)

    # normalize each row of S by its sum; rows summing to zero are left as is
    s_norm = S.sum(axis=1)
    s_norm[s_norm == 0] = 1
    S = S.multiply(csr_matrix(np.reciprocal(s_norm)))
    print("Similarity matrix ready!")

    # compute ratings on the target tracks only
    urm_cleaned = urm[:, tr_indices]
    R_hat = S.dot(urm_cleaned)
    print("R_hat done")

    # clean from already rated items
    urm_cleaned = urm_cleaned[pl_indices]
    R_hat[urm_cleaned.nonzero()] = 0
    R_hat.eliminate_zeros()

    print("Shape of final matrix: ", R_hat.shape)
    self.R_hat = R_hat
def fit(self, urm, target_playlist, target_tracks, dataset):
    """
    Fit a user-based model: playlists are compared through the cosine
    similarity of their rating columns.

    urm: user rating matrix
    target_playlist: list of playlist ids
    target_tracks: list of track ids
    dataset: object providing build_ucm and the id -> index lookups

    Uses self.k_filtering and self.shrinkage for the similarity.
    The predictions are stored in self.R_hat.
    """
    self.pl_id_list = list(target_playlist)
    self.tr_id_list = list(target_tracks)
    self.dataset = dataset
    print("UBF started")

    # NOTE(review): the UCM built here is replaced on the next statement,
    # so only the transposed URM is used as the user profile - confirm
    # whether the dataset UCM was meant to be stacked in as well.
    profiles = dataset.build_ucm()
    profiles = vstack([urm.transpose()], format='csr')

    playlist_rows = [
        dataset.get_playlist_index_from_id(pid) for pid in self.pl_id_list
    ]
    similarity = compute_cosine(profiles.transpose()[playlist_rows],
                                profiles,
                                k_filtering=self.k_filtering,
                                shrinkage=self.shrinkage)

    # divide every row by its sum, leaving zero-sum rows untouched
    row_totals = similarity.sum(axis=1)
    row_totals[row_totals == 0] = 1
    similarity = similarity.multiply(csr_matrix(np.reciprocal(row_totals)))
    print("Similarity matrix ready!")

    track_cols = [
        dataset.get_track_index_from_id(tid) for tid in self.tr_id_list
    ]
    target_track_ratings = urm[:, track_cols]
    predictions = similarity.dot(target_track_ratings)
    print("R_hat done")

    # zero out what the target playlists already contain
    already_rated = target_track_ratings[playlist_rows]
    predictions[already_rated.nonzero()] = 0
    predictions.eliminate_zeros()

    print("Shape of final matrix: ", predictions.shape)
    self.R_hat = predictions
def fit(self, urm, tg_playlist, tg_tracks, dataset):
    """
    Build three item similarity matrices restricted to the target tracks
    and append them to self.similarities in the order:
    content based, collaborative, SLIM.

    urm: user rating matrix
    tg_playlist: list of playlist ids
    tg_tracks: list of track ids
    dataset: object providing build_icm and get_track_index_from_id
    """
    self.pl_id_list = list(tg_playlist)
    self.tr_id_list = list(tg_tracks)
    self.dataset = dataset
    self.urm = urm

    def target_track_indices():
        return [dataset.get_track_index_from_id(t) for t in self.tr_id_list]

    # SLIM weights, keeping only the target-track columns
    slim_model = SLIM()
    slim_model.fit(urm, self.pl_id_list, self.tr_id_list, dataset)
    slim_weights = slim_model.getW()
    slim_similarity = slim_weights[:, target_track_indices()].transpose()

    # content based similarity from the ICM
    item_content = dataset.build_icm()
    content_similarity = compute_cosine(
        item_content.transpose()[target_track_indices()],
        item_content,
        k_filtering=200,
        shrinkage=10)

    # collaborative similarity from the URM
    collaborative_similarity = compute_cosine(
        urm.transpose()[target_track_indices()],
        urm,
        k_filtering=200,
        shrinkage=10)

    for similarity in (content_similarity,
                       collaborative_similarity,
                       slim_similarity):
        self.similarities.append(similarity)
def fit(self, urm, target_playlist, target_tracks, dataset):
    """
    Fit an item-based collaborative model.

    urm: user rating matrix
    target_playlist: list of playlist ids
    target_tracks: list of track ids
    dataset: object providing the id -> index lookups

    Uses self.k_filtering and self.shrinkage for the similarity.
    The predictions for the target playlists over the target tracks are
    stored in self.R_hat.
    """
    self.pl_id_list = list(target_playlist)
    self.tr_id_list = list(target_tracks)
    urm = csr_matrix(urm)

    # similarity between each target track and every track
    track_idx = [dataset.get_track_index_from_id(t) for t in self.tr_id_list]
    item_similarity = compute_cosine(urm.transpose()[track_idx],
                                     urm,
                                     k_filtering=self.k_filtering,
                                     shrinkage=self.shrinkage)
    item_similarity = normalize_by_row(item_similarity)

    playlist_idx = [
        dataset.get_playlist_index_from_id(p) for p in self.pl_id_list
    ]
    target_rows = urm[playlist_idx]
    self.R_hat = target_rows.dot(item_similarity.transpose())

    # drop the predictions for tracks the playlist already contains
    known = target_rows[:, track_idx]
    self.R_hat[known.nonzero()] = 0
    self.R_hat.eliminate_zeros()
def fit(self, urm, target_playlist, target_tracks, dataset, shrinkage=50, k_filtering=200, test_dict=None):
    """
    Fit a content-based model on an ICM enriched with an augmented URM.

    Steps: build a user profile (dataset UCM, transposed URM and a
    normalized user-feature matrix), compute a user-user similarity,
    use it to augment the URM, add the augmented URM to the ICM and
    finally compute the item similarity used for the predictions.

    urm: user rating matrix
    target_playlist: list of playlist ids
    target_tracks: list of track ids
    dataset: object providing build_icm, build_ucm, add_playlist_to_icm
        and the id -> index lookups
    shrinkage: shrinkage factor for significance weighting
    k_filtering: forwarded to both compute_cosine calls
    test_dict: unused; kept for interface compatibility

    The predictions are stored in self.R_hat.
    """
    # initialization
    self.pl_id_list = list(target_playlist)
    self.tr_id_list = list(target_tracks)
    self.dataset = dataset
    print("CBF started")

    # get ICM from dataset, assume it already cleaned
    icm = csr_matrix(dataset.build_icm())

    # user-feature matrix, filtered with top_k_filtering
    ufm = urm.dot(icm.transpose())
    ufm = top_k_filtering(ufm, 100)

    # Iu contains for each user the sum of its row of the URM
    # NOTE(review): np.reciprocal truncates on integer dtypes - confirm
    # that urm has a floating point dtype.
    Iu = urm.sum(axis=1)
    Iu[Iu == 0] = 1   # save from divide by zero
    Iu = Iu + 10      # shrink term
    Iu = np.reciprocal(Iu)
    ufm = csr_matrix(ufm.multiply(Iu)).transpose()

    ucm = csr_matrix(dataset.build_ucm())
    # The transposed URM itself must be stacked; the bound method
    # `urm.transpose().multiply` is not a matrix and cannot be stacked.
    ucm = vstack([ucm, urm.transpose(), ufm], format='csr')

    u_sim = compute_cosine(ucm.transpose(),
                           ucm,
                           k_filtering=k_filtering,
                           shrinkage=shrinkage)
    # normalize rows, leaving zero-sum rows untouched
    u_sim_norm = u_sim.sum(axis=1)
    u_sim_norm[u_sim_norm == 0] = 1
    u_sim = u_sim.multiply(np.reciprocal(u_sim_norm))

    # compute augmented urm and restore the known ratings
    aUrm = u_sim.dot(urm)
    print("Augmented URM ready!")
    aUrm[urm.nonzero()] = 1

    # Keep only the largest values of each row. A dedicated name is used
    # so that the k_filtering argument is still intact for the item
    # similarity computed below.
    items_per_user = 500
    for row_i in range(aUrm.shape[0]):
        # view on the stored values of this row
        row = aUrm.data[aUrm.indptr[row_i]:aUrm.indptr[row_i + 1]]
        if row.shape[0] >= items_per_user:
            smallest = np.argpartition(
                row, row.shape[0] - items_per_user)[:-items_per_user]
            row[smallest] = 0
    aUrm.eliminate_zeros()

    icm = dataset.add_playlist_to_icm(icm, aUrm, 0.4)

    tr_indices = [dataset.get_track_index_from_id(x) for x in self.tr_id_list]
    pl_indices = [dataset.get_playlist_index_from_id(x) for x in self.pl_id_list]

    S = compute_cosine(icm.transpose()[tr_indices],
                       icm,
                       k_filtering=k_filtering,
                       shrinkage=shrinkage)
    # normalize rows, leaving zero-sum rows untouched
    sim_norm = S.sum(axis=1)
    sim_norm[sim_norm == 0] = 1
    S = S.multiply(np.reciprocal(sim_norm))

    R_hat = aUrm[pl_indices].dot(S.transpose())

    # clean from already rated items
    urm_red = urm[pl_indices]
    urm_red = urm_red[:, tr_indices]
    R_hat[urm_red.nonzero()] = 0
    R_hat.eliminate_zeros()

    print("R_hat done")
    print("Shape of final matrix: ", R_hat.shape)
    self.R_hat = R_hat
def fit(self, urm, target_playlist, target_tracks, dataset, shrinkage=50, k_filtering=200, features=100):
    """
    Fit an item similarity model on the latent factors of an extended
    rating matrix.

    The URM (scaled by 0.5) is stacked on top of the ICM, the transposed
    UCM padded with zeros is placed on its right, and the result is
    decomposed with sparsesvd. The item part of the third returned factor
    is used to compute the cosine similarity between tracks.

    urm: user rating matrix
    target_playlist: list of playlist ids
    target_tracks: list of track ids
    dataset: object providing build_icm, build_ucm and the id lookups
    shrinkage: shrinkage factor for significance weighting
    k_filtering: forwarded to utils.compute_cosine
    features: second argument of sparsesvd

    Stores the transposed normalized similarity in self.S and the
    predictions in self.R_hat.
    """
    self.pl_id_list = list(target_playlist)
    self.tr_id_list = list(target_tracks)
    self.dataset = dataset
    print("CBF started")

    print('SVD on eURM')
    item_content = dataset.build_icm()
    # transposed so that it can sit next to the URM rows
    user_content = dataset.build_ucm().transpose()

    # zero block below the UCM, next to the ICM rows
    padding = lil_matrix(
        (item_content.shape[0], user_content.shape[1])).tocsr()
    left_block = vstack([urm.multiply(0.5), item_content], format='csr')
    right_block = vstack([user_content, padding], format='csr')
    extended = hstack([left_block, right_block], format='csr')

    u, _singular_values, v = sparsesvd(extended.tocsc(), features)
    print("SHAPE of V: ", v.shape)
    print("SHAPE OF U", u.shape)

    # keep only the columns belonging to the items
    n_items = item_content.shape[1]
    v = csr_matrix(v[:, :n_items])

    track_idx = [dataset.get_track_index_from_id(t) for t in self.tr_id_list]
    similarity = utils.compute_cosine(v.transpose()[track_idx],
                                      v,
                                      k_filtering=k_filtering,
                                      shrinkage=shrinkage)
    print("Similarity matrix ready, let's normalize it!")

    # rows of the URM belonging to the target playlists
    playlist_idx = [
        dataset.get_playlist_index_from_id(p) for p in self.pl_id_list
    ]
    target_ratings = urm[playlist_idx]

    # divide every similarity row by its sum
    row_totals = similarity.sum(axis=1)
    similarity = similarity.multiply(csr_matrix(np.reciprocal(row_totals)))
    similarity_t = similarity.transpose()
    self.S = similarity_t

    predictions = target_ratings.dot(similarity_t.tocsc()).tocsr()
    print("R_hat done")

    # mask the tracks already present in each target playlist
    seen = target_ratings[:, track_idx]
    predictions[seen.nonzero()] = 0
    predictions.eliminate_zeros()

    print("Shape of final matrix: ", predictions.shape)
    self.R_hat = predictions
def fit(self, urm, target_playlist, target_tracks, dataset):
    """
    Fit a content-based model over all playlists and all tracks and store
    a filtered prediction matrix with the known ratings set to 1.

    urm: user rating matrix
    target_playlist: list of playlist ids (only stored on the instance)
    target_tracks: list of track ids (only stored on the instance)
    dataset: object providing build_icm and build_tags_matrix

    Uses self.k_filtering and self.shrinkage for the similarity.
    Stores the transposed normalized similarity in self.S and the
    resulting matrix in self.R_hat.
    """
    self.pl_id_list = list(target_playlist)
    self.tr_id_list = list(target_tracks)
    self.dataset = dataset
    urm = urm.tocsr()
    print("CBF started")

    item_content = dataset.build_icm()

    print("Build tags matrix and apply TFIDF...")
    raw_tags = dataset.build_tags_matrix()
    weighted_tags = applyTFIDF(raw_tags)
    # keep only the top 55 entries selected by top_k_filtering on the
    # transposed tag matrix, to limit the noise of such sparse features
    weighted_tags = top_k_filtering(weighted_tags.transpose(),
                                    topK=55).transpose()

    # stack content, tags and the down-weighted URM
    item_content = vstack([item_content, weighted_tags, urm * 0.8],
                          format='csr')

    similarity = compute_cosine(item_content.transpose(),
                                item_content,
                                k_filtering=self.k_filtering,
                                shrinkage=self.shrinkage,
                                n_threads=4,
                                chunksize=1000)
    # divide every similarity row by its sum
    row_totals = similarity.sum(axis=1)
    similarity = similarity.multiply(csr_matrix(np.reciprocal(row_totals)))
    print("Similarity matrix ready!")
    similarity_t = similarity.transpose()
    self.S = similarity_t

    predictions = urm.dot(similarity_t.tocsc()).tocsr()
    print("R_hat done")

    known = urm.nonzero()
    # remove the known ratings so they do not take part in the filtering
    predictions[known] = 0
    predictions.eliminate_zeros()
    predictions = top_k_filtering(predictions, topK=5)
    # write the known ratings back with value 1
    predictions[known] = 1

    print("Shape of final matrix: ", predictions.shape)
    self.R_hat = predictions
def fit(self, urm, target_playlist, target_tracks, dataset, k_feature=1000, k_similar=1000):
    """
    Fit a feature-weighted user model.

    Each target playlist gets a feature profile (row-normalized
    URM x ICM^T) blended with the profile of its owner matrix row
    (weight 0.1); predictions are the cosine between those profiles and
    the feature columns of the target tracks.

    urm: user rating matrix
    target_playlist: list of playlist ids
    target_tracks: list of track ids
    dataset: object providing build_icm, build_tags_matrix, build_ucm,
        build_owner_item_matrix and the id -> index lookups
    k_feature, k_similar: not used by this method

    Side effects: sets self.ucm, replaces self.urm with the URM reduced
    to the target playlists and tracks, and stores the predictions in
    both self.R_hat_fwum and self.R_hat.
    """
    self.pl_id_list = list(target_playlist)
    self.tr_id_list = list(target_tracks)
    self.dataset = dataset
    self.urm = urm
    print("FWUM started!")

    # item features: ICM plus tf-idf weighted tags
    item_features = dataset.build_icm()
    tag_features = dataset.build_tags_matrix()
    tag_features = applyTfIdf(tag_features)
    item_features = vstack([item_features, tag_features], format='csr')

    def mean_feature_profile(ratings):
        # features summed over the rated tracks, divided by the row sum
        # of the ratings (zero sums are replaced by 1), then transposed
        summed = ratings.dot(item_features.transpose())
        totals = ratings.sum(axis=1)
        totals[totals == 0] = 1
        totals = np.reciprocal(totals)
        return csr_matrix(summed.multiply(totals).transpose())

    user_profile = mean_feature_profile(urm)
    print("UCM ready")

    # owner rating matrix restricted to the target playlists
    self.ucm = dataset.build_ucm()
    playlist_idx = [
        dataset.get_playlist_index_from_id(p) for p in target_playlist
    ]
    owner_ratings = dataset.build_owner_item_matrix(
        self.ucm, urm)[playlist_idx]
    owner_profile = mean_feature_profile(owner_ratings)
    print("OFM ready")

    # weighted sum of the playlist profile and the owner profile
    user_profile = user_profile[:, playlist_idx]
    user_profile = user_profile.multiply(1) + owner_profile.multiply(0.1)

    track_idx = [dataset.get_track_index_from_id(t) for t in target_tracks]
    self.R_hat_fwum = compute_cosine(user_profile.transpose(),
                                     item_features[:, track_idx],
                                     k_filtering=500)

    # reduce the stored URM to target tracks, then to target playlists
    self.urm = self.urm[:, track_idx]
    self.urm = self.urm[playlist_idx]

    # drop the predictions for already rated tracks
    self.R_hat_fwum[self.urm.nonzero()] = 0
    self.R_hat_fwum.eliminate_zeros()

    self.R_hat = self.R_hat_fwum
    print("R_hat done")
def fit(self, urm, target_playlist, target_tracks, dataset, urm_weight=0.8):
    """
    Fit a content-based model on an ICM rebuilt from its feature groups.

    The artist, album, playcount and duration matrices are extracted from
    the dataset ICM, playcount, duration and tags are passed through
    applyTfIdf, everything is stacked and the URM is added with weight
    urm_weight.

    urm: user rating matrix
    target_playlist: list of playlist ids
    target_tracks: list of track ids
    dataset: object providing the matrix builders and the id lookups
    urm_weight: third argument of dataset.add_playlist_to_icm

    Uses self.k_filtering and self.shrinkage for the similarity.
    Stores the transposed normalized similarity in self.S and the
    predictions, filtered with top_k_filtering(..., 20), in self.R_hat.
    """
    self.pl_id_list = list(target_playlist)
    self.tr_id_list = list(target_tracks)
    self.dataset = dataset
    print("CBF started")

    base_icm = dataset.build_icm()
    artist = dataset.build_artist_matrix(base_icm)
    album = dataset.build_album_matrix(base_icm)
    playcount = dataset.build_playcount_matrix(base_icm)
    duration = dataset.build_duration_matrix(base_icm)

    tags = dataset.build_tags_matrix()
    tags = applyTfIdf(tags, topK=55)
    playcount = applyTfIdf(playcount)
    duration = applyTfIdf(duration)

    feature_groups = [artist, album, playcount, duration, tags]
    item_content = vstack(feature_groups, format='csr')
    item_content = dataset.add_playlist_to_icm(item_content, urm, urm_weight)

    track_idx = [dataset.get_track_index_from_id(t) for t in self.tr_id_list]
    similarity = compute_cosine(item_content.transpose()[track_idx],
                                item_content,
                                k_filtering=self.k_filtering,
                                shrinkage=self.shrinkage,
                                chunksize=1000)
    # divide every similarity row by its sum
    row_totals = similarity.sum(axis=1)
    similarity = similarity.multiply(csr_matrix(np.reciprocal(row_totals)))
    print("Similarity matrix ready!")

    playlist_idx = [
        dataset.get_playlist_index_from_id(p) for p in self.pl_id_list
    ]
    target_ratings = urm[playlist_idx]
    similarity_t = similarity.transpose()
    self.S = similarity_t

    predictions = target_ratings.dot(similarity_t.tocsc()).tocsr()
    print("R_hat done")

    # mask the tracks already present in each target playlist
    seen = target_ratings[:, track_idx]
    predictions[seen.nonzero()] = 0
    predictions.eliminate_zeros()

    self.R_hat = top_k_filtering(predictions, 20)
def fit(self, urm, target_playlist, target_tracks, dataset, shrinkage=50, k_filtering=200):
    """
    Fit a content-based model where each playlist is an item feature
    weighted by the reciprocal of its URM row sum.

    urm: user rating matrix
    target_playlist: list of playlist ids
    target_tracks: list of track ids
    dataset: object providing build_icm_2, add_tracks_num_rating_to_icm,
        add_playlist_to_icm and the id -> index lookups
    shrinkage: shrinkage factor for significance weighting
    k_filtering: forwarded to compute_cosine

    Stores the transposed normalized similarity in self.S and the
    predictions in self.R_hat.
    """
    self.pl_id_list = list(target_playlist)
    self.tr_id_list = list(target_tracks)
    self.dataset = dataset
    print("CBF started")
    urm = urm.tocsr()

    item_content = dataset.build_icm_2()
    item_content = dataset.add_tracks_num_rating_to_icm(item_content, urm)

    # scale each URM row by the reciprocal of its sum
    row_sums = urm.sum(axis=1)
    scaled_urm = urm.multiply(np.reciprocal(row_sums))
    item_content = dataset.add_playlist_to_icm(item_content, scaled_urm, 2)

    # similarity of the target tracks with respect to all tracks
    track_idx = [dataset.get_track_index_from_id(t) for t in self.tr_id_list]
    similarity = compute_cosine(item_content.transpose()[track_idx],
                                item_content,
                                k_filtering=k_filtering,
                                shrinkage=shrinkage)

    # divide every similarity row by its sum
    sim_totals = similarity.sum(axis=1)
    similarity = similarity.multiply(csr_matrix(np.reciprocal(sim_totals)))

    playlist_idx = [
        dataset.get_playlist_index_from_id(p) for p in self.pl_id_list
    ]
    target_ratings = urm[playlist_idx]

    similarity_t = similarity.transpose()
    self.S = similarity_t

    predictions = target_ratings.dot(similarity_t).tocsr()

    # mask the tracks already present in each target playlist
    seen = target_ratings[:, track_idx]
    predictions[seen.nonzero()] = 0
    predictions.eliminate_zeros()

    print("R_hat done")
    self.R_hat = predictions
def fit(self, urm, target_playlist, target_tracks, dataset, shrinkage=50, k_filtering=200, test_dict=None):
    """
    Fit a content-based model on an ICM enriched with a URM estimated
    through implicit ALS.

    urm: user rating matrix
    target_playlist: list of playlist ids
    target_tracks: list of track ids
    dataset: object providing build_icm, add_playlist_to_icm and the
        id -> index lookups
    shrinkage: shrinkage factor for significance weighting
    k_filtering: forwarded to utils.compute_cosine
    test_dict: unused; kept for interface compatibility (defaults to
        None instead of a shared mutable dict)

    Stores the transposed normalized similarity in self.S and the
    predictions in self.R_hat.
    """
    # initialization
    self.pl_id_list = list(target_playlist)
    self.tr_id_list = list(target_tracks)
    self.dataset = dataset
    print("CBF started")

    # Compute URM matrix factorization and predict missing ratings
    print('Implicit matrix factorization on URM...')
    ials = implicit.als.AlternatingLeastSquares(factors=10)
    ials.fit(urm.transpose().multiply(40))

    # Get latent factors
    user_factors = ials.user_factors
    item_factors = ials.item_factors

    print('Estimating URM multiplying latent factors...')
    urm_hat = utils.dot_chunked(user_factors,
                                item_factors.transpose(),
                                topK=500,
                                chunksize=1000)
    # lil format for the element-wise assignment of the known ratings
    urm_hat = lil_matrix(urm_hat)
    urm_hat[urm.nonzero()] = 1
    urm_hat = csr_matrix(urm_hat)

    # add the estimated URM to the ICM
    icm = dataset.add_playlist_to_icm(dataset.build_icm(),
                                      urm_hat,
                                      0.25).tocsr()
    print("SHAPE of ICM: ", icm.shape)

    tr_indices = [dataset.get_track_index_from_id(x) for x in self.tr_id_list]
    pl_indices = [dataset.get_playlist_index_from_id(x) for x in self.pl_id_list]

    # cosine similarity of the target tracks with respect to all tracks
    icm_t = icm.transpose()[tr_indices]
    S = utils.compute_cosine(icm_t,
                             icm,
                             k_filtering=k_filtering,
                             shrinkage=shrinkage)
    print("Similarity matrix ready, let's normalize it!")

    # keep only target rows of URM
    urm_cleaned = urm[pl_indices]

    # normalize rows, leaving zero-sum rows untouched
    s_norm = S.sum(axis=1)
    s_norm[s_norm == 0] = 1
    S = S.multiply(csr_matrix(np.reciprocal(s_norm)))
    self.S = S.transpose()

    # compute ratings
    R_hat = urm_cleaned.dot(S.transpose().tocsc()).tocsr()
    print("R_hat done")

    # apply mask for eliminating already rated items
    urm_cleaned = urm_cleaned[:, tr_indices]
    R_hat[urm_cleaned.nonzero()] = 0
    R_hat.eliminate_zeros()

    print("Shape of final matrix: ", R_hat.shape)
    self.R_hat = R_hat
def fit(self, urm, target_playlist, target_tracks, dataset, k_feature=1000, k_similar=1000):
    """
    Fit a feature-weighted user model that also describes tracks through
    the attributes of the playlists containing them.

    urm: user rating matrix
    target_playlist: list of playlist ids
    target_tracks: list of track ids
    dataset: object providing build_icm, build_ucm, the build_*_matrix
        helpers and the id -> index lookups
    k_feature, k_similar: not used by this method

    Side effects: replaces self.urm with the URM reduced to the target
    playlists and tracks and stores the predictions in self.R_hat.
    """
    self.pl_id_list = list(target_playlist)
    self.tr_id_list = list(target_tracks)
    self.dataset = dataset
    print("FWUM started!")

    item_content = dataset.build_icm()
    self.urm = urm
    playlist_content = dataset.build_ucm()

    playlist_idx = [
        dataset.get_playlist_index_from_id(p) for p in target_playlist
    ]

    # feature profile of the target playlists, filtered by topic
    profile = urm.dot(item_content.transpose())[playlist_idx].transpose()
    print("Start filtering")
    profile = self.filter_by_topic(profile, dataset).transpose()

    # divide each profile by the URM row sum (zero sums become 1)
    row_sums = urm[playlist_idx].sum(axis=1)
    row_sums[row_sums == 0] = 1
    row_sums = np.reciprocal(row_sums)
    print("UFM ready")
    profile = profile.multiply(row_sums).transpose()

    user_side = vstack([profile, playlist_content[:, playlist_idx]],
                       format='csr')
    print("UCM ready")

    # playlist attributes accumulated on the target tracks
    track_idx = [dataset.get_track_index_from_id(t) for t in target_tracks]
    track_user_features = playlist_content.dot(urm)[:, track_idx]
    col_sums = urm[:, track_idx].sum(axis=0)
    col_sums[col_sums == 0] = 1
    col_sums = np.reciprocal(col_sums)
    track_user_features = csr_matrix(track_user_features.multiply(col_sums))

    print("SHAPE of ICM: ", item_content.shape)

    # split the accumulated attributes by group and filter each one
    owner = dataset.build_owner_matrix(track_user_features)
    owner = top_k_filtering(owner, 50)
    title = dataset.build_title_matrix(track_user_features)
    title = top_k_filtering(title, 50)
    created_at = dataset.build_created_at_matrix(track_user_features)
    created_at = top_k_filtering(created_at, 10)
    duration = dataset.build_pl_duration_matrix(track_user_features)
    duration = top_k_filtering(duration, 10)
    numtracks = dataset.build_numtracks_matrix(track_user_features)
    numtracks = top_k_filtering(numtracks, 10)

    item_side = vstack([item_content[:, track_idx],
                        title,
                        owner,
                        created_at,
                        duration,
                        numtracks],
                       format='csr')
    print("UFM and ICM Done!")

    scores = compute_cosine(user_side.transpose(),
                            item_side,
                            k_filtering=500,
                            shrinkage=10)
    self.R_hat = scores.tocsr()

    # reduce the stored URM to target tracks, then to target playlists
    self.urm = self.urm[:, track_idx]
    self.urm = self.urm[playlist_idx]

    # put to zero already rated elements
    self.R_hat[self.urm.nonzero()] = 0
    self.R_hat.eliminate_zeros()

    print("R_hat done")
    print("Shape:", self.R_hat.shape)
def fit(self, urm, target_playlist, target_tracks, dataset, shrinkage=10, k_filtering=200, alfa=0.95):
    """
    Fit a content-based model whose item similarity is a weighted sum of
    a content similarity (weight alfa) and a similarity computed on the
    playlist attributes of each track (weight 1 - alfa).

    urm: user rating matrix
    target_playlist: list of playlist ids
    target_tracks: list of track ids
    dataset: object providing build_icm_2, add_playlist_to_icm,
        build_tags_matrix, build_ucm and the id -> index lookups
    shrinkage: accepted but not forwarded to compute_cosine
        NOTE(review): confirm whether it should be passed on.
    k_filtering: forwarded to both compute_cosine calls
    alfa: weight of the content similarity in the blend

    Stores the transposed normalized similarity in self.S and the
    predictions in self.R_hat.
    """
    self.pl_id_list = list(target_playlist)
    self.tr_id_list = list(target_tracks)
    self.dataset = dataset
    print("CBF started")

    # content side: ICM + URM (weight 0.8) + tf-idf weighted tags
    item_content = dataset.build_icm_2()
    item_content = dataset.add_playlist_to_icm(item_content, urm, 0.8)
    raw_tags = dataset.build_tags_matrix()
    weighted_tags = applyTfIdf(raw_tags, 55)
    item_content = vstack([item_content, weighted_tags], format='csr')

    # user-feature side: UCM x URM, tf-idf weighted
    playlist_content = dataset.build_ucm()
    track_user_features = playlist_content.dot(urm)
    track_user_features = applyTfIdf(track_user_features, 100)

    track_idx = [dataset.get_track_index_from_id(t) for t in self.tr_id_list]
    user_feature_sim = compute_cosine(
        track_user_features.transpose()[track_idx],
        track_user_features,
        k_filtering=k_filtering,
        normalize=True)

    # similarity of the target tracks with respect to all tracks
    content_sim = compute_cosine(item_content.transpose()[track_idx],
                                 item_content,
                                 k_filtering=k_filtering,
                                 chunksize=1000)

    # weighted sum of the two similarities
    blended = content_sim.multiply(alfa) + user_feature_sim.multiply(1 - alfa)

    # divide every row by its sum
    row_totals = blended.sum(axis=1)
    blended = blended.multiply(csr_matrix(np.reciprocal(row_totals)))

    playlist_idx = [
        dataset.get_playlist_index_from_id(p) for p in self.pl_id_list
    ]
    target_ratings = urm[playlist_idx]

    blended_t = blended.transpose()
    self.S = blended_t

    predictions = target_ratings.dot(blended_t).tocsr()

    # mask the tracks already present in each target playlist
    seen = target_ratings[:, track_idx]
    predictions[seen.nonzero()] = 0
    predictions.eliminate_zeros()

    print("R_hat done")
    self.R_hat = predictions
def fit(self, urm, target_playlist, target_tracks, dataset):
    """
    Fit a user-based model on a playlist profile made of the dataset UCM
    (x5), the transposed URM (x5) and a normalized feature profile.

    urm: user rating matrix
    target_playlist: list of playlist ids
    target_tracks: list of track ids
    dataset: object providing build_ucm, build_icm_2, build_tags_matrix
        and the id -> index lookups

    Uses self.k_filtering and self.shrinkage for the similarity.
    The predictions are stored in self.R_hat.
    """
    self.pl_id_list = list(target_playlist)
    self.tr_id_list = list(target_tracks)
    self.dataset = dataset
    print("UBF started")

    playlist_content = dataset.build_ucm()

    # item features: ICM plus tf-idf weighted tags
    item_content = dataset.build_icm_2()
    tag_features = dataset.build_tags_matrix()
    tag_features = applyTfIdf(tag_features, topK=55)
    item_content = vstack([item_content, tag_features], format='csr')

    # feature profile of each playlist divided by its URM row sum
    feature_profile = urm.dot(item_content.transpose())
    row_sums = urm.sum(axis=1)
    row_sums[row_sums == 0] = 1  # save from divide by zero
    row_sums = np.reciprocal(row_sums)
    feature_profile = feature_profile.multiply(row_sums).transpose()

    profiles = vstack([playlist_content.multiply(5),
                       urm.transpose().multiply(5),
                       feature_profile],
                      format='csr')

    # cosine similarity between the target playlists and all playlists
    playlist_idx = [
        dataset.get_playlist_index_from_id(p) for p in self.pl_id_list
    ]
    similarity = compute_cosine(profiles.transpose()[playlist_idx],
                                profiles,
                                k_filtering=self.k_filtering,
                                shrinkage=self.shrinkage)

    # divide every row by its sum, leaving zero-sum rows untouched
    sim_totals = similarity.sum(axis=1)
    sim_totals[sim_totals == 0] = 1
    similarity = similarity.multiply(csr_matrix(np.reciprocal(sim_totals)))
    print("Similarity matrix ready!")

    track_idx = [dataset.get_track_index_from_id(t) for t in self.tr_id_list]
    target_track_ratings = urm[:, track_idx]
    predictions = similarity.dot(target_track_ratings)
    print("R_hat done")

    # zero out what the target playlists already contain
    seen = target_track_ratings[playlist_idx]
    predictions[seen.nonzero()] = 0
    predictions.eliminate_zeros()

    print("Shape of final matrix: ", predictions.shape)
    self.R_hat = predictions
def fit(self, urm, target_playlist, target_tracks, dataset, shrinkage=10, k_filtering=50):
    """
    Fit an item-based collaborative model whose item similarity is
    computed on an augmented rating matrix.

    urm: user rating matrix
    target_playlist: list of playlist ids
    target_tracks: list of track ids
    dataset: object providing get_playlist_index_from_id / get_track_index_from_id
    shrinkage: shrinkage factor for significance weighting
    k_filtering: forwarded to compute_cosine as k_filtering

    If self.r_hat_aug is None, a ContentBasedFiltering model is fitted and
    its getR_hat() result is cached in self.r_hat_aug.
    The predictions (one row per target playlist, one column per target
    track) are stored in self.R_hat as a csr_matrix.
    """
    # initialization
    self.pl_id_list = list(target_playlist)
    self.tr_id_list = list(target_tracks)
    self.dataset = dataset
    print("CBF started")
    urm = urm.tocsr()

    if self.r_hat_aug is None:
        # augment r_hat; use this method's own arguments, since the names
        # tg_playlist, tg_tracks and ds are not defined in this scope
        cbf = ContentBasedFiltering()
        cbf.fit(urm, self.pl_id_list, self.tr_id_list, dataset)
        self.r_hat_aug = cbf.getR_hat()

    pl_indices = [dataset.get_playlist_index_from_id(x) for x in self.pl_id_list]
    tr_indices = [dataset.get_track_index_from_id(x) for x in self.tr_id_list]

    # item-item similarity of the target tracks on the augmented matrix
    S_cf = compute_cosine(self.r_hat_aug.transpose()[tr_indices],
                          self.r_hat_aug,
                          k_filtering=k_filtering,
                          shrinkage=shrinkage)
    S_cf = normalize_by_row(S_cf)

    # ratings of the target playlists times the similarity
    urm_cleaned = urm[pl_indices]
    self.R_hat = csr_matrix(urm_cleaned.dot(S_cf.transpose()))

    # clean from already rated items
    urm_cleaned = urm_cleaned[:, tr_indices]
    self.R_hat[urm_cleaned.nonzero()] = 0
    self.R_hat.eliminate_zeros()

    print("R_hat done")
    self.R_hat = csr_matrix(self.R_hat)