def online():
    """Compute and save the NLP-based estimated URM for the online (submission) dataset.

    Builds a playlist UCM from titles (no stopword filtering), computes a
    Tversky playlist-playlist similarity, multiplies it by the shrinked URM
    and saves the last 10000 rows (the test playlists) to disk.
    """
    datareader = Datareader(mode='online', only_load=True)

    print('NLP...')
    # The previously-assigned STOP_WORDS / TOKEN_WEIGHTS locals were unused
    # (the column scaling below is commented out), so they were removed.
    nlp = NLP(datareader, stopwords=[])
    ucm = nlp.get_ucm()
    #ucm = bm25_row(ucm)
    #inplace_csr_column_scale(ucm, np.array(TOKEN_WEIGHTS))

    urm = datareader.get_urm_shrinked()[0]

    print('Computing similarity...')
    start = time.time()
    # Playlist-playlist similarity from shared title tokens.
    similarity = tversky_similarity(ucm, shrink=200, alpha=0.1, beta=1)
    similarity = similarity.tocsr()
    print(time.time() - start)

    print('Computing eurm...')
    start = time.time()
    eurm_nlp = dot_product(similarity, urm, k=500)
    eurm_nlp = eurm_nlp.tocsr()
    print(eurm_nlp.shape)

    # The last 10000 rows of the shrinked URM are the test playlists.
    eurm_nlp = eurm_nlp[-10000:, :]
    sparse.save_npz(ROOT_DIR + '/data/eurm_nlp_no_stop_online.npz', eurm_nlp)
def icm():
    """Offline NLP experiment on the track side.

    Builds a track ICM from title tokens, derives an item-item Tversky
    similarity and evaluates the resulting estimated URM.
    """
    datareader = Datareader(mode='offline', only_load=True)
    evaluator = Evaluator(datareader)

    print('NLP...')
    test_playlists = datareader.get_test_pids()
    # Stopword filtering is deliberately disabled (empty list); the unused
    # STOP_WORDS / TOKEN_WEIGHTS locals were removed.
    nlp = NLP(datareader=datareader, stopwords=[], mode='tracks')

    print('Getting ucm and icm...')
    icm = nlp.get_icm()
    icm = bm25_row(icm)

    print('Computing similarity...')
    start = time.time()
    # Item-item similarity between tracks based on shared title tokens.
    similarity = tversky_similarity(icm, shrink=200, alpha=0.1, beta=1)
    similarity = similarity.tocsr()
    print(time.time() - start)

    urm = datareader.get_urm()

    print('Computing eurm...')
    start = time.time()
    eurm_nlp = dot_product(urm[test_playlists, :], similarity, k=500)
    eurm_nlp = eurm_nlp.tocsr()

    # sparse.save_npz(ROOT_DIR + '/data/eurm_nlp_weighted_offline.npz', eurm_nlp)
    evaluator.evaluate(eurm_to_recommendation_list(eurm_nlp), name='nlp_enriched')
def __init__(self):
    """Load the datareaders and URMs used by the top-pop recommenders.

    NOTE(review): both datareaders are created with mode='online' and both
    URMs are read from ``self.dr_on`` — ``dr_of``/``urm_of`` look like they
    were meant to use mode='offline' / ``self.dr_of``; confirm before relying
    on any distinction between the two.
    """
    self.dr_on = Datareader(verbose=False, mode='online', only_load=True)
    self.dr_of = Datareader(verbose=False, mode='online', only_load=True)
    self.urm_on = self.dr_on.get_urm()
    self.urm_of = self.dr_on.get_urm()
    # Column-compressed copy for fast per-track (column) slicing.
    self.urm_col = sps.csc_matrix(self.urm_of)
    # Popularity accumulator, one slot per track.
    self.top_p = np.zeros(self.urm_of.shape[1])
def __init__(self, UCM, URM_train, test_playlists_indices, logFile, bestIndividualFile,
             mode="selection", numGenerations=30, populationSize=30,
             initialRandomDistribution=np.random.uniform(0, 1), verbose=True):
    """Set up the genetic-algorithm feature selector/weighter.

    :param UCM: user content matrix; one GA variable per column.
    :param URM_train: training user rating matrix.
    :param test_playlists_indices: indices of the playlists to evaluate on.
    :param logFile: path of the log file (opened in append mode).
    :param bestIndividualFile: path of the best-individual file (append mode).
    :param mode: "selection" (binary genes) or "weighting" (real-valued genes).
    :param numGenerations: number of GA generations to run.
    :param populationSize: number of individuals per generation.
    :param initialRandomDistribution: NOTE(review) — this default is a single
        float drawn once at import time, not a distribution; confirm intent.
    :param verbose: if True, print progress information.
    """
    self.UCM = UCM
    self.URM_train = URM_train
    # BUG FIX: np.int was deprecated in NumPy 1.20 and removed in 1.24;
    # the builtin int is the documented replacement.
    self.test_playlists_indices = test_playlists_indices.astype(int)
    self.logFile = open(logFile, "a")
    self.bestIndividualFile = open(bestIndividualFile, "a")
    self.initialRandomDistribution = initialRandomDistribution
    self.verbose = verbose
    self.top = 0
    self.current = 0
    self.evaluator = Evaluator(Datareader(mode='offline', only_load=True, verbose=False))
    self.NUM_VARIABLES = UCM.shape[1]
    # NOTE(review): an invalid mode silently leaves self.mode unset, which
    # only fails later with AttributeError; kept as-is to preserve behavior.
    if (mode == "weighting" or mode == "selection"):
        self.mode = mode
    # Crossover probability
    self.CXPB = 0.5
    # Mutation probability
    self.MUTPB = 0.2
    # Number of generations for which the evolution runs
    self.NGEN = numGenerations
    self.POPULATION_SIZE = populationSize
def reorder_test_playlists():
    """Rewrite the test-playlists CSV so rows are grouped by challenge
    category (1..10), matching the ordering used by the evaluation code.

    The CSV is loaded, reindexed by the category-ordered pids and saved back
    in place (tab-separated, UTF-8).
    """
    dr = Datareader(test_num=1, mode='offline', only_load=True)
    # NOTE(review): reaches into name-mangled private Datareader attributes;
    # a public accessor would be preferable if one exists.
    path_test_csv = dr._Datareader__path + dr._Datareader__test_playlist_file

    # Test pids in category order (idiomatic range instead of a literal list).
    pids = []
    for cat in range(1, 11):
        pids.extend(dr.get_test_pids(cat=cat))

    test_playlists_df = pd.read_csv(path_test_csv, sep='\t', encoding='utf-8')
    test_playlists_df = test_playlists_df.set_index(['pid'])

    # Reorder rows by the category-ordered pids and restore pid as a column.
    test_playlists_df = test_playlists_df.reindex(pids)
    test_playlists_df['pid'] = test_playlists_df.index
    test_playlists_df.to_csv(path_test_csv, sep='\t', index=False, encoding='utf-8')
def reorder_old_eurm(eurm):
    """
    ATTENTION: this function is intended to be used only for old eurms,
    which are ordered by test pids and not by categories.
    :param eurm: the old-ordered eurm
    :return: eurm: the new-ordered eurm
    """
    # BUG FIX: only_load was the STRING 'True', which only worked because any
    # non-empty string is truthy; pass the actual boolean.
    dr_old = Datareader(mode='online', only_load=True, type='old')

    # Stack the rows category by category (1..10) to obtain the new ordering.
    res = []
    for cat in range(1, 11):
        indices = dr_old.get_test_pids_indices(cat=cat)
        res.append(eurm[indices])

    eurm_new = sps.vstack(res)
    return eurm_new
class Top_pop_p(object):
    """Top-popularity recommenders for category-2 playlists (one seed track)."""

    def __init__(self):
        # NOTE(review): both readers use mode='online' and both URMs come
        # from dr_on — dr_of/urm_of look like they were meant to be
        # 'offline'; confirm before relying on the distinction.
        self.dr_on = Datareader(verbose=False, mode='online', only_load=True)
        self.dr_of = Datareader(verbose=False, mode='online', only_load=True)
        self.urm_on = self.dr_on.get_urm()
        self.urm_of = self.dr_on.get_urm()
        # CSC copy for fast per-track column slicing in track().
        self.urm_col = sps.csc_matrix(self.urm_of)
        self.top_p = np.zeros(self.urm_of.shape[1])

    def album(self):
        """EURM from the popularity of tracks within the seed track's album."""
        eurm = sps.lil_matrix(self.urm_of.shape)
        pids = self.dr_on.get_test_pids(cat=2)
        pids_all = self.dr_of.get_test_pids()
        ucm_album = self.dr_of.get_ucm_albums().tocsc()
        album_dic = self.dr_of.get_track_to_album_dict()
        for row in tqdm(pids):
            # Cat-2 playlists have exactly one seed track: take it.
            track_ind = self.urm_on.indices[self.urm_on.indptr[row]:self.urm_on.indptr[row + 1]][0]
            album = album_dic[track_ind]
            # All playlists containing some track of that album.
            playlists = ucm_album.indices[ucm_album.indptr[album]:ucm_album.indptr[album+1]]
            # Track popularity restricted to those playlists.
            top = self.urm_of[playlists].sum(axis=0).A1.astype(np.int32)
            # 501 most popular tracks (500 recs + room for the seed itself).
            track_ind_rec = top.argsort()[-501:][::-1]
            eurm[row, track_ind_rec] = top[track_ind_rec]
        eurm = eurm.tocsr()[pids_all]
        eurm = eurm_remove_seed(eurm, self.dr_on)
        return eurm

    def track(self):
        """EURM from the popularity of tracks co-occurring with the seed track."""
        eurm = sps.lil_matrix(self.urm_of.shape)
        pids = self.dr_on.get_test_pids(cat=2)
        pids_all = self.dr_of.get_test_pids()
        for row in tqdm(pids):
            # The single seed track of this cat-2 playlist.
            track_ind = self.urm_on.indices[self.urm_on.indptr[row]:self.urm_on.indptr[row + 1]][0]
            # Playlists that already contain the seed track.
            playlists = self.urm_col.indices[
                self.urm_col.indptr[track_ind]: self.urm_col.indptr[track_ind+1]]
            top = self.urm_of[playlists].sum(axis=0).A1.astype(np.int32)
            track_ind_rec = top.argsort()[-501:][::-1]
            eurm[row, track_ind_rec] = top[track_ind_rec]
        eurm = eurm.tocsr()[pids_all]
        eurm = eurm_remove_seed(eurm, self.dr_on)
        print(eurm)
        return eurm.copy()
def new():
    """Offline NLP experiment: recommend directly via UCM x ICM^T token
    matching (playlist titles against track titles), save and evaluate.
    """
    datareader = Datareader(mode='offline', only_load=True)
    evaluator = Evaluator(datareader)

    print('NLP...')
    test_playlists = datareader.get_test_pids()
    # Stopword filtering disabled; the unused STOP_WORDS / TOKEN_WEIGHTS
    # locals and dead commented code were removed.
    nlp = NLP(datareader=datareader, stopwords=[], mode='both')

    print('Getting ucm and icm...')
    ucm = nlp.get_ucm()
    ucm = bm25_row(ucm)
    icm = nlp.get_icm()
    icm = bm25_row(icm)
    icm_T = icm.T

    print('Computing eurm...')
    start = time.time()
    # Score = shared (BM25-weighted) tokens between playlist and track titles.
    eurm_nlp = dot_product(ucm[test_playlists, :], icm_T, k=500)
    print(time.time() - start)

    print('Converting to csr...')
    eurm_nlp = eurm_nlp.tocsr()
    print(eurm_nlp.shape)

    sparse.save_npz(ROOT_DIR + '/data/eurm_nlp_new_method_offline.npz', eurm_nlp)
    evaluator.evaluate(eurm_to_recommendation_list(eurm_nlp),
                       name='nlp_new_method',
                       show_plot=False)
def compute_rating(self, urm2=None, datareader=None, top_k=750, verbose=False,
                   small=False, mode="offline", remove_seed=True):
    """
    Compute the estimated URM as (self.urm or urm2[self.pid]) dot self.model.

    :param urm2: optional sparse matrix replacing self.urm (rows self.pid used)
    :param datareader: Datareader used to remove seeds; rebuilt here if None
    :param top_k: int, elements to take for each row after the dot product
    :param verbose: boolean, if true print debug information
    :param small: boolean, if true restrict self.urm to the target playlists
    :param mode: dataset mode used when the datareader must be rebuilt
    :param remove_seed: boolean, if true remove seed tracks from the eurm
    :return: sparse csr matrix, estimated urm
    """
    if small:
        self.urm = sps.csr_matrix(self.urm[self.pid])
    self.urm = sps.csr_matrix(self.urm)
    self.model = sps.csr_matrix(self.model)

    if verbose:
        print("[ Compute ratings ]")
        start_time = time.time()

    # BUG FIX: `urm2 != None` performs an ELEMENT-WISE comparison on sparse
    # matrices (deprecated / erroring in scipy); identity check is correct.
    if urm2 is not None:
        self.urm = urm2[self.pid]

    self.eurm = dot(self.urm, self.model, k=top_k)
    print("eurm shape: " + str(self.eurm.shape))

    if remove_seed:
        if datareader is None:
            # Missing space after "to" fixed in the warning message.
            print('[ WARNING! Datareader is None in "compute rating". mode is set to '
                  + mode.upper() + ', creating it again. '
                  'A future version will require it. ]')
            from utils.datareader import Datareader
            datareader = Datareader(mode=mode, only_load=True)
        self.eurm = eurm_remove_seed(self.eurm, datareader=datareader)

    if verbose:
        print("time: " + str(int(time.time() - start_time) / 60))

    return self.eurm.tocsr()
def prova():
    """Scratch/debug experiment ("prova" is Italian for "test").

    NOTE(review): the early exit() calls terminate the process — as written,
    this function only prints the artist->tracks dict and exits; everything
    after the first exit() is unreachable and kept for reference only.
    """
    dr = Datareader(mode='offline', only_load=True)
    print(dr.get_artist_to_tracks_dict())
    exit()

    # ---- unreachable below this line -----------------------------------
    dr = Datareader(mode='offline', only_load=True, verbose=False)
    test_playlists = dr.get_test_pids()
    stopwords = STOP_WORDS
    token_weights = np.array(TOKEN_WEIGHTS)

    nlp = NLP(mode='playlists', datareader=dr, stopwords=STOP_WORDS)
    s = nlp.get_ucm()
    print(s.shape)

    evaluator = Evaluator(dr)
    ucm = nlp.get_ucm()
    sim = sparse.load_npz(ROOT_DIR + '/data/cf_user_similarity.npz')

    print('Computing dot...')
    # Enrich each playlist's tokens with those of similar users.
    ucm = dot_product(sim, ucm, k=200)
    print('NNZ', ucm.nnz)
    exit()

    urm = dr.get_urm()
    # ucm = ucm.astype(np.float64)
    # inplace_csr_column_scale(ucm, token_weights)

    print('Computing similarity...')
    start = time.time()
    # Compute similarity
    similarity = tversky_similarity(ucm, shrink=200, alpha=0.1, beta=1)
    similarity = similarity.tocsr()
    print(time.time() - start)

    print('Computing eurm...')
    start = time.time()
    # Compute eurm
    eurm_nlp = dot_product(similarity, urm, k=500)
    eurm_nlp = eurm_nlp.tocsr()
    eurm_nlp = eurm_nlp[test_playlists, :]

    #sparse.save_npz(ROOT_DIR + '/data/eurm_nlp_weighted_offline.npz', eurm_nlp)
    evaluator.evaluate(eurm_to_recommendation_list(eurm_nlp), name='nlp_enriched')
# --- tail of a sequence-building function whose `def` lies outside this
# --- chunk: appends each row's tracks either as singleton lists (SPM
# --- input format) or as a flat list.
        if list_of_list_of_lists:
            sequences_spm.append([[i] for i in to_append])
        else:
            sequences_spm.append(to_append)
    return sequences_spm


def fast_argpart(arr):
    # Indices of the (up to) 500 largest entries of arr, unordered.
    # (Equivalent to max_n = min(len(arr), 500).)
    if len(arr) > 500:
        max_n = 500
    else:
        max_n = len(arr)
    return np.argpartition(arr, -max_n)[-max_n:]


# Script setup: offline datareader, evaluator and position matrices.
dr = Datareader(mode='offline', verbose=False, only_load=True)
ev = Evaluator(dr)
test_known_tracks = build_test_dict(dr)
test_pids_cat2 = dr.get_test_pids(cat=2)
urm_pos = dr.get_position_matrix(position_type='last')
# CSC copy for fast per-track column slicing.
urm_pos_csc = sps.csc_matrix(urm_pos)

###### NON FARE QUESTA CELLA  (Italian: "do not run this cell")
# for i in tqdm(range(1000, 2000)):
#     song_target = test_known_tracks[test_pids_cat2[i - 1000]][0][1]
#     not_empty_lines = urm_pos_csc[:, song_target].nonzero()[0]
#     filtered = urm_pos[not_empty_lines]
#     sequences_spm = []
#     for row in range(filtered.shape[0]):
from utils.datareader import Datareader
from utils.evaluator import Evaluator
from utils.submitter import Submitter
from utils.post_processing import eurm_to_recommendation_list_submission
from utils.post_processing import eurm_to_recommendation_list, eurm_remove_seed
import recommenders.similarity.s_plus as ss
import recommenders.similarity.p3alpha_rp3beta as p3r3
import numpy as np
from utils import ensembler
import scipy.sparse as sps
import gc
from sklearn.preprocessing import normalize
import sys

# P3alpha / RP3beta tuning script (offline evaluation).
dr = Datareader(verbose=False, mode='offline', only_load=True)
ev = Evaluator(dr)

#Getting for the recommender algorithm
urm = dr.get_urm()
#urm.data = np.ones(len(urm.data))
# Row-stochastic (L1-normalized) user->item and item->user transitions.
p_ui = normalize(urm, norm="l1")
p_iu = normalize(urm.T, norm="l1")
# Item popularity (column sums) and target playlist rows.
pop = urm.sum(axis=0).A1
pids = dr.get_test_pids()
t_urm = sps.csr_matrix(p_ui.copy()[pids])


def recsys(alpha, beta):
    # NOTE: the definition continues past this chunk; the alpha/beta
    # self-assignments are no-ops.
    alpha = alpha
    beta = beta
    k = 200
from scipy import sparse
import utils.pre_processing as pre
from boosts.hole_boost import HoleBoost
from utils.datareader import Datareader
from utils.definitions import ROOT_DIR
from utils.evaluator import Evaluator
from utils.post_processing import eurm_to_recommendation_list

# HoleBoost post-processing experiment (offline).

# Initialization
dr = Datareader(mode='offline', only_load=True)
ev = Evaluator(dr)

# Load matrices: RP3beta EURM and an item-item similarity.
eurm = sparse.load_npz(ROOT_DIR + '/data/eurm_rp3_offline.npz')
sim = sparse.load_npz(ROOT_DIR + '/data/sim_offline.npz')
print('Loaded')

# Normalization: L2 by row so the two matrices are on comparable scales.
eurm = pre.norm_l2_row(eurm)
sim = pre.norm_l2_row(sim)

# HoleBoost over every category except 1.
h = HoleBoost(sim, eurm, dr)
eurm_b = h.boost_eurm(categories=[2, 3, 4, 5, 6, 7, 8, 9, 10], k=200, gamma=10)
#sparse.save_npz(ROOT_DIR + '/data/eurm_boosted_online.npz', eurm_b)
rec_list = eurm_to_recommendation_list(eurm_b)

# Evaluation (continues past this chunk)
            # --- interior of a method whose `def` lies outside this chunk ---
            # NOTE(review): set.union returns a NEW set and does not mutate in
            # place — the result of the first branch is discarded, so existing
            # entries are never extended. `.update(...)` (or `|=`) is probably
            # intended; confirm against callers.
            if arid in self.dictionary.keys():
                self.dictionary[arid].union(self.tokens_dict[token])
            else:
                self.dictionary[arid] = set().union(
                    self.tokens_dict[token])

    def get_playlist_artist_matrix(self):
        """Build the binary (playlists x artists) CSR matrix from self.dictionary."""
        rows = []
        cols = []
        data = []
        for arid in tqdm(self.dictionary.keys(),
                         desc='Build (playlists, artists) matrix'):
            for playlist in self.dictionary[arid]:
                rows.append(playlist)
                cols.append(arid)
                data.append(1)
        self.playlist_artist_matrix = sparse.csr_matrix(
            (data, (rows, cols)),
            shape=(len(self.playlist_titles), len(self.artists)))
        return self.playlist_artist_matrix


if __name__ == '__main__':
    n = ArtistToken(Datareader(mode='offline', only_load=True, verbose=False))
    m = n.get_playlist_artist_matrix()
    sparse.save_npz('pam.npz', m)
    # --- tail of a sequence-building function whose `def` lies outside this chunk.
    filtered = urm_pos[not_empty_lines]
    for row in (range(filtered.shape[0])):
        # Track ids of this row, ordered by their stored position value.
        to_append = list(
            filtered.indices[filtered.indptr[row]:filtered.indptr[row + 1]]
            [np.argsort(
                filtered.data[filtered.indptr[row]:filtered.indptr[row + 1]])])
        # NOTE(review): the two branches append to DIFFERENT lists
        # (sequences_spm vs sequences) but only sequences_spm is returned, so
        # the else-branch results are lost; confirm intended behavior.
        if list_of_list_of_list:
            sequences_spm.append([[i] for i in to_append])
        else:
            sequences.append(to_append)
    return sequences_spm


if __name__ == "__main__":
    dr = Datareader(mode='offline', verbose=False, only_load=True)
    ev = Evaluator(dr)

    print("building dict", end=" ")
    test_interactions_df = dr.get_df_test_interactions()
    # NOTE(review): sort_values is NOT in-place and its result is discarded —
    # this line has no effect as written.
    test_interactions_df.sort_values(['pos'], ascending=True)
    test_playlists_df = dr.get_df_test_playlists()
    # NOTE(review): DataFrame.as_matrix() was removed in pandas 1.0;
    # .values / .to_numpy() is the replacement.
    test_playlists = test_playlists_df['pid'].as_matrix()

    # A list of list [pos, tid] for each playlist sorted by pos
    test_known_tracks = test_interactions_df.groupby(
        ['pid'])[['pos', 'tid']].apply(lambda x: x.values.tolist())
    # NOTE(review): list.sort() returns None, so the rebinding of s is a
    # no-op — but the in-place sort by position still happens.
    for s in test_known_tracks:
        s = s.sort(key=lambda x: x[0])
    print("> done")
import scipy.sparse as sps
import sys
from utils.evaluator import Evaluator
from utils.datareader import Datareader
from utils.post_processing import eurm_to_recommendation_list
from utils.ensembler import ensembler
import numpy as np
import os.path

# Per-category ensemble script: loads seven category-specific EURMs and
# combines them with weights taken from the command line.
# NOTE(review): only_load="False" is a non-empty string, i.e. TRUTHY — it
# behaves exactly like only_load=True; confirm which was intended.
dr = Datareader(verbose=False, mode="offline", only_load="False")

cat = 8
a = sps.load_npz("../offline/offline-cbf_item_album-cat" + str(cat) + ".npz")
b = sps.load_npz("../offline/offline-cbf_item_artist-cat" + str(cat) + ".npz")
c = sps.load_npz("../offline/nlp_eurm_offline_bm25-cat" + str(cat) + ".npz")
d = sps.load_npz("../offline/offline-rp3beta-cat" + str(cat) + ".npz")
e = sps.load_npz("../offline/offline-cfuser-cat" + str(cat) + ".npz")
f = sps.load_npz("../offline/slim_bpr_completo_test1-cat" + str(cat) + ".npz")
g = sps.load_npz("../offline/eurm_cbfu_artists_offline-cat" + str(cat) + ".npz")
matrix = [a, b, c, d, e, f, g]

# Ensemble weights, one per matrix, from argv. NOTE: this rebinds a..g from
# matrices to floats; `matrix` still holds the original matrices.
a = float(sys.argv[1])
b = float(sys.argv[2])
c = float(sys.argv[3])
d = float(sys.argv[4])
e = float(sys.argv[5])
f = float(sys.argv[6])
g = float(sys.argv[7])
import sys

if __name__ == '__main__':
    # SELECT EXECUTION MODE
    mode = "online"
    name = "cbf_user_artists"
    knn = 800
    topk = 750
    save_eurm = True
    complete_name = mode + "_" + name + "_knn=" + str(knn) + "_topk=" + str(
        topk)

    # NOTE: mode is hard-coded to "online" above, so this offline branch is
    # dead code unless mode is edited.
    if mode == "offline":
        # Initialization
        dr = Datareader(verbose=False, mode=mode, only_load=True)
        test_pids = list(dr.get_test_pids())
        ev = Evaluator(dr)
        urm = dr.get_urm()

        # UCM: (playlist x artist) matrix, BM25-weighted.
        ucm_artists = dr.get_ucm_artists()
        ucm_artists = bm25_row(ucm_artists)

        # Similarity (this call continues past the end of this chunk).
        print('Similarity..')
        sim = tversky_similarity(ucm_artists, ucm_artists.T,
                                 shrink=200,
                                 target_items=test_pids,
                                 alpha=0.1,
import sys
from scipy import sparse
import numpy as np
import utils.pre_processing as pre
from utils.definitions import *
from utils.datareader import Datareader
from utils.evaluator import Evaluator
from utils.pre_processing import *
from utils.post_processing import *

dr = Datareader(mode='offline', only_load=True, verbose=False)
ev = Evaluator(dr)

urm = dr.get_urm(binary=True)
pos_matrix = dr.get_position_matrix(position_type='last')

rows = []
cols = []
data = []

for p in tqdm(range(pos_matrix.shape[0])):
    # CSR slice boundaries of playlist p.
    start = pos_matrix.indptr[p]
    end = pos_matrix.indptr[p + 1]
    tracks = pos_matrix.indices[start:end]
    # NOTE(review): this reads .indices AGAIN, so positions == tracks. The
    # position values live in pos_matrix.data[start:end]; this looks like a
    # copy-paste bug — confirm before relying on `positions`.
    positions = pos_matrix.indices[start:end]
    for idx in range(len(tracks)):
        if positions[idx] <= 250:
            rows.append(p)
            # (chunk ends mid-loop; `data` is never filled in what is visible)
            cols.append((tracks[idx] * positions[idx]) + tracks[idx])
from nltk.tokenize import RegexpTokenizer
from utils.datareader import Datareader
from tqdm import tqdm
from scipy import sparse
from difflib import SequenceMatcher
from difflib import get_close_matches
from utils.pre_processing import *
from recommenders.similarity.dot_product import dot_product
from recommenders.similarity.s_plus import tversky_similarity
from utils.evaluator import Evaluator
from utils.post_processing import *
from personal.Tommaso.NLP.NLP import NLP
from utils.definitions import *

# Datareader
dr = Datareader(mode='online', only_load=True)
#ev = Evaluator(dr)

# Dataframe with ALL interactions (train + test concatenated).
df_train = dr.get_df_train_interactions()
df_test = dr.get_df_test_interactions()
df = pd.concat([df_train, df_test], axis=0, join='outer')

# NOTE(review): Series.as_matrix() was removed in pandas 1.0; .values /
# .to_numpy() is the modern equivalent.
playlists = df['pid'].as_matrix()
tracks = df['tid'].as_matrix()
dictionary = dr.get_track_to_artist_dict()
pids = list(dr.get_train_pids()) + list(dr.get_test_pids())

# URM
urm = dr.get_urm()
# Default hyper-parameters; optionally overridden from the command line
# (argv: mode, name, knn, topk).
knn = 850
topk = 750

if len(sys.argv) > 1:
    mode = sys.argv[1]
    name = sys.argv[2]
    knn = int(sys.argv[3])
    topk = int(sys.argv[4])

complete_name = mode + "_" + name + "_knn=" + str(knn) + "_topk=" + str(
    topk)

if mode == "offline":
    """Test Set"""
    #Data initialization
    dr = Datareader(verbose=False, mode=mode, only_load=True)
    #Evaluator initialization
    #Recommender algorithm initialization
    rec = Knn_collabrative_user()
    # Binary URM for the recommender.
    urm = dr.get_urm()
    urm.data = np.ones(len(urm.data))
    pid = dr.get_test_pids()

    # Depopularize: zero out the 2000 most popular track columns.
    top = urm.sum(axis=0).A1
    mask = np.argsort(top)[::-1][:2000]
    ut.inplace_set_cols_zero(urm, mask)
    # --- tail of a function whose `def` lies outside this chunk ---
    # Rows of urm_pos sharing at least min_common tracks with the target.
    not_empty_lines = not_empty_lines_by_target(urm_pos, target_list, min_common)
    filtered = urm_pos[not_empty_lines]
    for row in tqdm((range(filtered.shape[0])),
                    desc='Converting eurm into list of lists'):
        # Track ids of this row, ordered by their stored position value.
        to_append = list(
            filtered.indices[filtered.indptr[row]:filtered.indptr[row + 1]]
            [np.argsort(
                filtered.data[filtered.indptr[row]:filtered.indptr[row + 1]])])
        # SPM input format: each track as a singleton list.
        sequences_spm.append([[i] for i in to_append])
    return sequences_spm


if __name__ == '__main__':
    dr = Datareader(mode='offline', verbose=False, only_load=True)
    ev = Evaluator(datareader=dr)

    # NLP2 tokenizer/normalizer configuration for playlist titles.
    nlp = NLP2(dr,
               stopwords=[],
               norm=True,
               work=True,
               split=True,
               date=False,
               skip_words=True,
               porter=False,
               porter2=True,
               lanca=False,
               lanca2=True)
    ucm_csr = nlp.get_UCM(data1=True).tocsr()
import logging
import scipy.sparse as sps
from boosts.hole_boost import HoleBoost
from boosts.tail_boost import TailBoost
from utils.datareader import Datareader
from utils.definitions import ROOT_DIR
from utils.evaluator import Evaluator
from utils.post_processing import eurm_to_recommendation_list
from utils.pre_processing import *

# Boost experiment on a pre-ensembled EURM; results go to result.log.
logging.basicConfig(filename='result.log', level=logging.DEBUG)

dr = Datareader(verbose=False, mode="offline", only_load=True)
ev = Evaluator(dr)

# Item-item similarity used by the boosts.
sim = sps.load_npz(ROOT_DIR + "/data/sim_offline.npz")
# rp3b = sps.load_npz(ROOT_DIR + "/data/sub/EURM-rp3beta-online.npz")
# knn_c_i_al = sps.load_npz(ROOT_DIR + "/data/sub/KNN CONTENT ITEM-album-top_k=850-sm_type=cosine-shrink=100.npz")
# knn_c_i_ar = sps.load_npz(ROOT_DIR + "/data/sub/KNN CONTENT ITEM-artist-top_k=850-sm_type=cosine-shrink=100.npz")
nlp = sps.load_npz(ROOT_DIR + "/data/eurm_nlp_offline.npz")
# cf_u = sps.load_npz(ROOT_DIR + "/data/sub/eurm_cfu_online.npz")
# Pre-computed ensemble EURM (ensembling itself is commented out below).
eurm_ens = sps.load_npz(ROOT_DIR + "/data/ENSEMBLED.npz")

#matrix = [rp3b, knn_c_i_ar, knn_c_i_al, nlp, cf_u]
#eurm_ens = ensembler(matrix, [0.720, 0.113, 0.177, 0.194, 1.0], normalization_type="max")
from utils.datareader import Datareader
import scipy.sparse as sps
import sys
from utils.definitions import ROOT_DIR

# Build and save the top-popularity EURM (top 750 tracks, duplicates removed,
# binary scores) for the dataset mode passed as the first CLI argument.
cli_args = sys.argv[1:]
run_mode = cli_args[0]

reader = Datareader(verbose=False, mode=run_mode, only_load=True)
top_pop_eurm = reader.get_eurm_top_pop(top_pop_k=750,
                                       remove_duplicates=True,
                                       binary=True)

output_path = ROOT_DIR + "/recommenders/script/main/" + run_mode + "_npz/top_pop.npz"
sps.save_npz(output_path, top_pop_eurm)
from utils.definitions import ROOT_DIR
from utils.post_processing import eurm_to_recommendation_list, eurm_remove_seed
import scipy.sparse as sps

# Feature-weighting (CFW) script: learns per-feature weights so the ICM-based
# similarity approximates a precomputed RP3beta similarity.
mode = "offline"
knn = 100
topk = 750
complete_name = "maurizio_" + mode + "__knn=" + str(knn) + "_topk=" + str(topk)

if __name__ == '__main__':
    sim = sps.load_npz(ROOT_DIR + "/similarities/offline-similarity_rp3beta_knn100.npz")
    dr = Datareader(mode=mode, only_load=True)

    ######### MAURIZ
    # ICM with album ids as features.
    ICM = dr.get_icm(alid=True)

    cfw = CFW_D_Similarity_Linalg(URM_train=dr.get_urm(),
                                  ICM=ICM.copy(),
                                  S_matrix_target=sim,
                                  URM_validation=None)
    cfw.fit()

    # Diagonal matrix of the learned per-feature weights.
    weights = sps.diags(cfw.D_best)
    sps.save_npz("ICM_fw_maurizio", weights)
# --- continuation of an ensembling script; matrices a..h, weights w and
# --- `matrix`/`mode` are defined earlier, outside this chunk.
matrix.append([a, b, c, d, e, f, g, h])

# Ensemble each category separately, then stack the 10 results.
rprec = []
for i in range(0, 10):
    print("[ Ensembling cat", i + 1, "]")
    rprec.append(ensembler(matrix[i], w[i], normalization_type="max"))
res = sps.vstack(rprec).tocsr()

import time
# Timestamped output name; '/' in the locale date would break the path.
name = "ensemble-" + mode + "-data-" + time.strftime(
    "%x") + "-" + time.strftime("%X")
name = name.replace("/", "_")
sps.save_npz("results/" + name + ".npz", res)

print("[ Initizalizing Datereader ]")
# NOTE(review): only_load="False" is a truthy non-empty string, i.e. it acts
# like only_load=True; confirm which was intended.
dr = Datareader(verbose=False, mode=mode, only_load="False")
res = eurm_to_recommendation_list(res, datareader=dr)

if mode == "offline":
    print("[ Initizalizing Evaluator ]")
    ev = Evaluator(dr)
    ev.evaluate(res, name="ens")

if mode == "online":
    print("[ Initizalizing Submitter ]")
    sb = Submitter(dr)
    # (this call continues past the end of this chunk)
    sb.submit(recommendation_list=res,
              name=name,
              track="main",
              verify=True,
# --- continuation of a DSLIM tuning script; mode/knn/topk/l1/l2/beta come
# --- from earlier, outside this chunk.
if mode=="offline":
    test_num = int(sys.argv[7])

name ="DSLIM"
complete_name = mode+"_"+name+"_knn="+str(knn)+"_topk="+str(topk)\
    + '_' + datetime.datetime.now().strftime("%Y-%m-%d_%H-%M")

if mode=="offline":
    complete_name+="_test="+str(test_num)

# Notification bot used to report progress and crashes.
bot = Bot_v1(complete_name)

# (the matching except/finally of this try lies past the end of this chunk)
try:
    ######################SHRINKED
    dr = Datareader(mode=mode, test_num=test_num, train_format="50k", only_load=True)
    ev = Evaluator(dr)
    pids = dr.get_test_pids()

    # Shrinked URM plus the pid translation dictionaries.
    urm, dictns, dict2 = dr.get_urm_shrinked()
    urm_evaluation = dr.get_evaluation_urm()[pids]
    pids_converted = np.array([dictns[x] for x in pids], dtype=np.int32)

    slim = MultiThreadDSLIM_RMSE(urm.T)
    slim.fit(l1_penalty=l1, l2_penalty=l2, positive_only=True, beta=beta, topK=topk)

    evaluate_shrinked(W_sparse= slim.W_sparse,
                      urm_shrinked= urm,
                      pids_shrinked= pids_converted)

    sps.save_npz(complete_name+".npz",slim.W_sparse,)
from utils.datareader import Datareader
from utils.evaluator import Evaluator
from utils.post_processing import eurm_remove_seed
from utils.post_processing import eurm_to_recommendation_list
import numpy as np
import scipy.sparse as sps
from tqdm import tqdm
from utils.definitions import *
from utils.post_processing import eurm_remove_seed, append_rec_list

# Category-2 top-pop experiment (album / artist variants).
dr = Datareader(verbose=False, mode='offline', only_load=True)

urm = dr.get_urm()
# CSC copy for fast per-track column slicing.
urm_col = sps.csc_matrix(urm)
top_p = np.zeros(urm.shape[1])
rec = []
eurm1 = sps.lil_matrix(urm.shape)
eurm2 = sps.lil_matrix(urm.shape)
print(eurm1.shape)

pids = dr.get_test_pids(cat=2)
pids_all = dr.get_test_pids()

# TopPop Album
# ucm_album = dr.get_ucm_albums().tocsc()
# album_dic = dr.get_track_to_album_dict()

# TopPop Artist
ucm_album = dr.get_ucm_albums().tocsc()
artists_dic = dr.get_track_to_artist_dict()
# (this call continues past the end of this chunk)
album_to_tracks = load_obj(name="album_tracks_dict_offline",
# --- notebook-exported fragment; `df` construction starts in an earlier cell.
})
df = df[['tid', 'arid', 'alid', 'track_name']]
df.head()

# In[4]:

# Raw arrays: lowercased track titles plus the matching id columns.
names = df['track_name'].str.lower().values
tids = df['tid'].values
alids = df['alid'].values
arids = df['arid'].values
#print('%d total tracks'%tids.shape[0])

# In[5]:

# get the full matrix (dataset + testset)
dr = Datareader(mode='online', only_load=True, verbose=False)
urm = dr.get_urm()
#print(urm.shape)

# In[6]:

# just focus on songs that appear more than 1 time (-> threshold=2)
# NOTE(review): threshold is actually 0 here, so NO track is filtered out —
# the comment above does not match the code.
popularity = urm.sum(axis=0).A1
threshold = 0
ids_usefull_tracks = np.argwhere(popularity >= threshold)
#print('%d / %d usefull tracks (threshold >= %d)'%(ids_usefull_tracks.shape[0], popularity.shape[0], threshold))

# In[7]:
import sys
from scipy import sparse
import numpy as np
import utils.pre_processing as pre
from utils.definitions import *
from utils.datareader import Datareader
from utils.evaluator import Evaluator
from utils.pre_processing import *
from utils.post_processing import *

# Grid search: blend the NLP-fusion EURM with a cat-1 top-pop EURM for
# several similar-playlist counts k and mixing weights a.
dr = Datareader(mode='offline', only_load=True, verbose=False)
ev = Evaluator(dr)

urm = dr.get_urm(binary=True)
urm_csc = urm.tocsc(copy=True)
sim_nlp = sparse.load_npz(ROOT_DIR + '/data/sim_nlp_lele.npz')

for k in [1, 2, 3, 4, 5]:
    eurm_top = dr.get_eurm_top_pop_filter_cat_1(sim_nlp, k, topk=500)
    eurm_top = norm_l1_row(eurm_top)

    eurm_nlp = sparse.load_npz(ROOT_DIR + '/data/nlp_fusion_tuned_offline.npz')
    eurm_nlp = norm_l1_row(eurm_nlp)

    for a in [0.05, 0.10, 0.15, 0.20]:
        # Convex combination of the two (both L1-normalized per row).
        eurm = eurm_nlp * (1.0 - a) + eurm_top * a

        rec_list = eurm_to_recommendation_list(eurm, datareader=dr)
        ev.evaluate(rec_list, name='pop_first_k=' + str(k) + '_a=' + str(a))
test_playlists = test_playlists_df['pid'].values # A list of list [pos, tid] for each playlist sorted by pos test_known_tracks = test_interactions_df.groupby(['pid'])[['pos', 'tid']].apply(lambda x: x.values.tolist()) for s in test_known_tracks: s = s.sort(key=lambda x: x[0]) print("> done") return test_known_tracks verbose = False if __name__ == "__main__": dr = Datareader(mode='offline', train_format='50k', verbose=False, only_load=True) ev = Evaluator(dr) test_known_tracks = build_test_dict(dr) test_pids_cat2 = dr.get_test_pids(cat=2) rec_list = np.zeros(shape=(10000,500)) pred = np.zeros(shape=(10000, 2262292)) for i in tqdm(range(1000,2000)): # print("prima target") # print(test_pids_cat2[0]) # print(test_known_tracks[test_pids_cat2[0]]) # print([x[1] for x in test_known_tracks[test_pids_cat2[0]]])