def _generate_answers(self, song_meta_json, train, questions):
    """Recommend songs from each playlist's dominant genre, plus globally popular tags.

    Falls back to the overall most-popular song list when a playlist's
    seed songs give no genre signal.
    """
    meta_by_id = {int(entry["id"]): entry for entry in song_meta_json}
    popular_song_counter, popular_songs = most_popular(train, "songs", 200)
    _, popular_tags = most_popular(train, "tags", 100)
    genre_top_songs = self._song_mp_per_genre(meta_by_id, popular_song_counter)

    results = []
    for question in tqdm(questions):
        # Tally the top-level genres of every seed song in the playlist.
        genre_counts = Counter(
            genre
            for sid in question["songs"]
            for genre in meta_by_id[sid]["song_gn_gnr_basket"]
        )
        ranked = genre_counts.most_common(1)
        # Use the dominant genre's popular songs; otherwise the global list.
        candidates = genre_top_songs[ranked[0][0]] if ranked else popular_songs
        results.append({
            "id": question["id"],
            "songs": remove_seen(question["songs"], candidates)[:100],
            "tags": remove_seen(question["tags"], popular_tags)[:10]
        })
    return results
def _generate_answers(self, song_meta_json, train, questions):
    """Build answers using per-genre popular songs and globally popular tags.

    For each question playlist, counts the genres of its seed songs and
    recommends the most popular songs of the dominant genre (or the
    overall most-popular songs when no genre can be determined).
    """
    # Build a dictionary keyed by song_id whose value is that song's metadata.
    song_meta = {int(song["id"]): song for song in song_meta_json}
    # Top 200 most frequent songs across the training playlists.
    song_mp_counter, song_mp = most_popular(train, "songs", 200)
    # Top 100 most frequent tags.
    tag_mp_counter, tag_mp = most_popular(train, "tags", 100)
    song_mp_per_genre = self._song_mp_per_genre(song_meta, song_mp_counter)
    answers = []
    for q in tqdm(questions):
        genre_counter = Counter()
        for sid in q["songs"]:
            for genre in song_meta[sid]["song_gn_gnr_basket"]:
                genre_counter.update({genre: 1})
        top_genre = genre_counter.most_common(1)
        # If a most-popular genre exists,
        if len(top_genre) != 0:
            # recommend the songs that appear most often in that genre.
            cur_songs = song_mp_per_genre[top_genre[0][0]]
        else:
            # Otherwise recommend the overall most frequent songs.
            cur_songs = song_mp
        answers.append({
            "id": q["id"],
            "songs": remove_seen(q["songs"], cur_songs)[:100],
            "tags": remove_seen(q["tags"], tag_mp)[:10]
        })
    return answers
def _generate_answers(self, val):
    """Recommend songs/tags for *val* using artifacts pre-built by the training step.

    Loads pickled lookup tables, the sparse playlist x item matrix, and the
    fitted model from disk, then produces one answer dict per row of *val*.
    NOTE(review): assumes *val* is a DataFrame with `id`, `songs`, `tags`
    columns — verify against the caller.
    """
    # Loading all the files required for recommendation
    with open("songtag_length.pkl", "rb") as f:
        songtag_length = pickle.load(f)
    with open("popular_song_dict.pkl", "rb") as f:
        popular_song_dict = pickle.load(f)
    with open("popular_tag_dict.pkl", "rb") as f:
        popular_tag_dict = pickle.load(f)
    with open("trainval_id_dict.pkl", "rb") as f:
        trainval_id_dict = pickle.load(f)
    songtag_matrix = sparse.load_npz('songtag_matrix.npz')
    with open('model.sav', 'rb') as f:
        model = pickle.load(f)
    # Setting the number of popular songs equal to train.py
    popular_num_song = songtag_length[0]
    ###########################
    print("Finished 1st step!")
    ###########################
    # Making recommendation lists (takes approximately 50 minutes)
    # Columns >= popular_num_song hold tags, so they are excluded when
    # recommending songs; conversely rank_items over that same range yields
    # the tag columns.
    song_fill = []
    tag_fill = []
    for j in [trainval_id_dict[i] for i in val.id.values]:
        song_fill.append([popular_song_dict[k] for k, _ in model.recommend(
            j, songtag_matrix,
            filter_items=range(popular_num_song, songtag_matrix.shape[1]),
            N=200)])
        tag_fill.append([popular_tag_dict[k] for k in [
            i for i, _ in model.rank_items(
                j, songtag_matrix,
                list(range(popular_num_song, songtag_matrix.shape[1])))[:15]]])
    ###########################
    print("Finished 2nd step!")
    ###########################
    # Creating the final dictionary for results.json
    answers = []
    for i in range(len(val)):
        answers.append({
            "id": val.id.values[i],
            "songs": remove_seen(val.songs.values[i], song_fill[i])[:100],
            "tags": remove_seen(val.tags.values[i], tag_fill[i])[:10]
        })
    ###########################
    print("Finished 3rd step!")
    print("Finished writing answers!")
    ###########################
    return answers
def _generate_answers(self, train, questions):
    """Baseline: fill every playlist with the globally most popular songs and tags."""
    _, top_songs = most_popular(train, "songs", 200)
    _, top_tags = most_popular(train, "tags", 100)
    return [
        {
            "id": q["id"],
            "songs": remove_seen(q["songs"], top_songs)[:100],
            "tags": remove_seen(q["tags"], top_tags)[:10],
        }
        for q in tqdm(questions)
    ]
def _generate_answers(self, train, questions, song_meta):
    """Most-popular fallback answers, plus per-playlist song embeddings.

    BUG FIX: the original referenced ``song_mp`` / ``tag_mp`` without ever
    defining them (guaranteed NameError at runtime); they are now computed
    with ``most_popular`` exactly as in the sibling implementations.
    """
    _, song_mp = most_popular(train, "songs", 200)
    _, tag_mp = most_popular(train, "tags", 100)

    # Per-playlist song metadata and embedding vectors.
    # NOTE(review): plylst_list is not used by the loop below — presumably
    # groundwork for a similarity-based ranking; kept intact for now.
    song_infos = {}
    for t in train:
        song_infos[t['id']] = [song_meta[a] for a in t['songs']]
    plylst_list = {}
    for plylst, songs in song_infos.items():
        plylst_list[plylst] = songs2vec(songs)

    answers = []
    for q in tqdm(questions):
        answers.append({
            "id": q["id"],
            "songs": remove_seen(q["songs"], song_mp)[:100],
            "tags": remove_seen(q["tags"], tag_mp)[:10],
        })
    return answers
def generate_answers(train, questions):
    """Echo playlists that already have both songs and tags; fill the rest with populars."""
    _, popular_songs = most_popular(train, "songs", 200)
    _, popular_tags = most_popular(train, "tags", 100)

    results = []
    for question in questions:
        # A playlist with both fields populated is returned untouched.
        if question["songs"] and question["tags"]:
            entry = {
                "id": question["id"],
                "songs": question["songs"],
                "tags": question["tags"]
            }
        else:
            entry = {
                "id": question["id"],
                "songs": remove_seen(question["songs"], popular_songs)[:100],
                "tags": remove_seen(question["tags"], popular_tags)[:10]
            }
        results.append(entry)
    return results
def cf_(self,
        train_songs_A,
        train_tags_A,
        test_songs_A,
        test_tags_A,
        song_ntop=500,
        tag_ntop=50):
    """Neighbourhood collaborative filtering for both songs and tags.

    Similarity between a test playlist and each training playlist is the dot
    product of their song (resp. tag) incidence vectors; those similarities
    are projected back onto songs/tags to score candidates.

    BUG FIX: the tag branch filtered ``cand_song_idx`` instead of
    ``cand_tag_idx``, so the returned "tags" were actually song indices.
    Also ``tag_row.data[i]`` (indexing the ndarray's raw buffer memoryview)
    is replaced with ``tag_row[i]``, consistent with the song path.
    """
    print("CF...")
    train_songs_A_T = train_songs_A.T.tocsr()  # shape) n_songs * n_train ply
    train_tags_A_T = train_tags_A.T.tocsr()  # shape) n_tags * n_train ply
    res = []

    # playlist-playlist similarity, projected back onto items
    song_val = test_songs_A.dot(train_songs_A_T)
    tag_val = test_tags_A.dot(train_tags_A_T)
    cand_song_matrix = song_val.dot(train_songs_A)
    cand_tag_matrix = tag_val.dot(train_tags_A)
    del song_val
    del tag_val

    for r, pid in tqdm(enumerate(self.plylst_test.index)):
        songs_already = self.plylst_test.loc[pid, "songs_id"]
        tags_already = self.plylst_test.loc[pid, "tags_id"]

        # 1 * n_songs score row for this playlist
        song_row = cand_song_matrix.getrow(r).toarray().reshape(-1, )
        # Take extra candidates so the seen-filter can still fill song_ntop slots.
        cand_song_idx = song_row.argsort()[-song_ntop - 50:][::-1]
        # Keep only candidates not already in the playlist.
        cand_song_idx = remove_seen(songs_already, cand_song_idx)[:song_ntop]
        rec_song_score = [song_row[i] for i in cand_song_idx]

        tag_row = cand_tag_matrix.getrow(r).toarray().reshape(-1, )
        cand_tag_idx = tag_row.argsort()[-tag_ntop - 5:][::-1]
        cand_tag_idx = remove_seen(tags_already, cand_tag_idx)[:tag_ntop]
        rec_tag_score = [tag_row[i] for i in cand_tag_idx]

        res.append({
            "id": self.plylst_nid_id[pid],  # map back to original playlist id
            "songs": [self.song_sid_id[i] for i in cand_song_idx],
            "tags": [self.tag_tid_id[i] for i in cand_tag_idx],
            "songs_score": rec_song_score,
            "tags_score": rec_tag_score
        })
    print("DONE")
    return res
def put_most_popular(seq, pop):
    """Extend *seq* with popular items it doesn't already contain.

    At most ``len(pop) - len(seq)`` fillers are appended, so the result
    never grows past the size of the popularity list.
    """
    fillers = remove_seen(seq, pop)
    budget = len(pop) - len(seq)
    return seq + fillers[:budget]
def _generate_answers(self, song_meta_json, train, questions):
    """Answer generation with artist-aware song picks and co-occurrence tags.

    Song strategy (first match wins): single-artist playlists get that
    artist's songs, otherwise the dominant genre's popular songs, otherwise
    the globally popular songs. Tags come from per-song tag co-occurrence,
    topped up from the popular-tag list when fewer than 10 are found.
    """
    # train : arena_data/orig/train.json
    # Keyed by song id; value is that song's metadata (tags, name, etc.).
    song_meta = {int(song["id"]): song for song in song_meta_json}
    # song_mp_counter: frequency dict (e.g. {..., 18273: 1, ...});
    # song_mp: the 200 songs appearing most often under 'songs' in train.
    song_mp_counter, song_mp = most_popular(train, "songs", 200)
    # The 100 tags appearing most often under 'tags' in train.
    tag_mp_counter, tag_mp = most_popular(train, "tags", 100)
    # Per-genre popular songs, e.g. res = { pop: ['hello': 200 ...], }
    song_mp_per_genre = self._song_mp_per_genre(song_meta, song_mp_counter)
    # Dict of artists whose songs exceed the 200 threshold -> their songs.
    art_dic = self._artist_songs(song_meta, song_mp_counter)
    tag_id = self._songs_most_tag(train)
    answers = []
    for q in tqdm(questions):
        genre_counter = Counter()
        art_c = Counter()
        tag_c = Counter()
        # Count artists across the playlist's seed songs.
        for sid in q["songs"]:
            for genre in song_meta[sid]["artist_name_basket"]:
                art_c.update({genre: 1})
        artist_name = list(art_c.keys())
        # Count top-level genres across the seed songs.
        for sid in q["songs"]:
            for genre in song_meta[sid]["song_gn_gnr_basket"]:
                genre_counter.update({genre: 1})
        top_genre = genre_counter.most_common(1)
        # Single-artist playlist with a known prolific artist: use their songs.
        if len(artist_name) == 1 and artist_name[0] in art_dic.keys():
            cur_songs = list(art_dic[artist_name[0]])
        elif len(top_genre) != 0:
            cur_songs = song_mp_per_genre[top_genre[0][0]]
        else:
            cur_songs = song_mp
        tag_list = []
        if (q['songs'] != []):
            # Aggregate tags that co-occur with the playlist's songs.
            for sid in q["songs"]:
                if (sid in tag_id):
                    for a in tag_id[sid]:
                        tag_c.update({a: 1})
            tag_list = [k for k, v in tag_c.most_common()]
            if len(tag_list) > 10:
                cur_tags = tag_list[:10]
            else:
                # Not enough co-occurring tags: pad with unseen popular tags.
                new_list = remove_seen(tag_list, tag_mp)[:10]
                cur_tags = (tag_list + new_list)[:10]
        else:
            cur_tags = remove_seen(q["tags"], tag_mp)[:10]
        answers.append({
            "id": q["id"],
            "songs": remove_seen(q["songs"], cur_songs)[:100],
            "tags": cur_tags
        })
    return answers
def mixed_(self,
           train_songs_A,
           train_tags_A,
           test_songs_A,
           test_tags_A,
           song_ntop=500,
           tag_ntop=50,
           iteration=20):
    """Hybrid recommender: ALS matrix factorisation for songs, neighbourhood CF for tags.

    Cleanups relative to the original:
    - the ``tag_dirty`` if/else executed byte-identical code in both
      branches (dead branch), so it is collapsed;
    - ``filter_already_liked_items`` is derived directly from ``song_dirty``
      instead of duplicating the recommend() call;
    - BUG FIX: ``tags_score`` is now computed over the tags actually
      recommended (``rec_tag_idx``) instead of the unfiltered candidate
      list, so scores align 1:1 with the returned tags.
    """
    print("MF for song / CF for tag...")
    res = []

    # --- songs: fit ALS on the stacked (test + train) playlist-song matrix ---
    songs_A = spr.vstack([test_songs_A, train_songs_A])
    als_model = ALS(factors=256,
                    regularization=0.08,
                    use_gpu=True,
                    iterations=iteration)
    als_model.fit(songs_A.T * 100)  # scale counts up as confidence values

    # --- tags: playlist-playlist similarity projected back onto tags ---
    train_tags_A_T = train_tags_A.T.tocsr()  # shape) n_tags * n_train ply
    tag_val = test_tags_A.dot(train_tags_A_T)
    cand_tag_matrix = tag_val.dot(train_tags_A)
    del tag_val

    for r, pid in tqdm(enumerate(range(test_songs_A.shape[0]), 0)):
        # song: "dirty" playlists keep their already-liked items in the output
        song_dirty = self.plylst_test.loc[(self.n_train + pid),
                                          "song_dirty"] == 1
        cand_song = als_model.recommend(
            pid,
            test_songs_A,
            N=song_ntop,
            filter_already_liked_items=not song_dirty)
        rec_song_idx = [self.song_sid_id.get(x[0]) for x in cand_song]
        rec_song_score = [x[1] for x in cand_song]

        # tag: top candidates by projected similarity, minus seen tags
        tag_row = cand_tag_matrix.getrow(r).toarray().reshape(-1, )
        cand_tag_idx = tag_row.argsort()[-tag_ntop - 5:][::-1]
        tags_already = self.plylst_test.loc[self.n_train + pid, "tags_id"]
        rec_tag_idx = remove_seen(tags_already, cand_tag_idx)[:tag_ntop]
        rec_tag_score = [tag_row[i] for i in rec_tag_idx]

        res.append({
            "id": self.plylst_nid_id[self.n_train + pid],
            "songs": rec_song_idx,
            "tags": [self.tag_tid_id[i] for i in rec_tag_idx],
            "songs_score": rec_song_score,
            "tags_score": rec_tag_score
        })
    return res
def _generate_answers(self, train, questions):
    """Sparse-matrix collaborative filtering over the combined train+question playlists.

    Builds playlist x song and playlist x tag incidence matrices from the
    training set, then scores candidates for each test playlist by
    similarity (dot products) against all training playlists. Playlists
    with no seed songs fall back to the most-popular lists.
    NOTE(review): assumes *train* and *questions* are DataFrames (they are
    fed to pd.concat and column assignment) — confirm against callers.
    """
    _, song_mp = self._most_popular(train, "songs", 200)
    _, tag_mp = self._most_popular(train, "tags", 100)

    len_train = len(train)  # 92056 rows
    len_question = len(questions)  # 23015 rows

    # Concatenate and assign a contiguous new id (nid) to every playlist.
    plylst = pd.concat([train, questions])
    plylst["nid"] = range(len_train + len_question)
    # plylst_id_nid = dict(zip(plylst["id"], plylst["nid"]))
    plylst_nid_id = dict(zip(plylst["nid"], plylst["id"]))

    # Assign a new contiguous sid to each song.
    # len_songs : 576168
    song_id_sid, song_sid_id, len_songs = self._add_new_id(plylst, "songs")
    # Assign a new contiguous tid to each tag.
    # len_tags : 26586
    tag_id_tid, tag_tid_id, len_tags = self._add_new_id(plylst, "tags")

    # Map raw ids to the new contiguous ids, dropping unknown entries.
    plylst['songs_id'] = plylst['songs']\
        .map(lambda x: [song_id_sid.get(s) for s in x if song_id_sid.get(s) is not None])
    plylst['tags_id'] = plylst['tags']\
        .map(lambda x: [tag_id_tid.get(t) for t in x if tag_id_tid.get(t) is not None])

    plylst_use = plylst[['nid', 'songs_id', 'tags_id']].copy()
    plylst_use['num_songs'] = plylst_use['songs_id'].str.len()
    plylst_use['num_tags'] = plylst_use['tags_id'].str.len()
    plylst_use = plylst_use.set_index('nid')

    plylst_train = plylst_use.iloc[:len_train, :]
    plylst_test = plylst_use.iloc[len_train:, :]

    # Playlist x song incidence matrix for the training playlists.
    row = np.repeat(range(len_train), plylst_train['num_songs'])  # 4239978
    col = [song for songs in plylst_train['songs_id'] for song in songs]  # 4239978
    dat = np.repeat(1, plylst_train['num_songs'].sum())  # 4239978
    train_songs_A = coo_matrix((dat, (row, col)),
                               shape=(len_train, len_songs))  # (92056, 576168)
    train_songs_A_T = train_songs_A.T.tocsr()

    # Playlist x tag incidence matrix for the training playlists.
    row = np.repeat(range(len_train), plylst_train['num_tags'])
    col = [tag for tags in plylst_train['tags_id'] for tag in tags]
    dat = np.repeat(1, plylst_train['num_tags'].sum())
    train_tags_A = coo_matrix((dat, (row, col)), shape=(len_train, len_tags))
    train_tags_A_T = train_tags_A.T.tocsr()

    # Recommend songs and tags per test playlist.
    ans = []
    for pid in tqdm(plylst_test.index):
        # Songs/tags already present in the playlist to be predicted.
        songs_already = plylst_test.loc[pid, "songs_id"]
        tags_already = plylst_test.loc[pid, "tags_id"]
        if not songs_already:
            # No seed songs: fall back to global popularity.
            rec_song_idx = song_mp
            rec_tag_idx = remove_seen(tags_already, tag_mp)
        else:
            # One-hot vector of the playlist's songs.
            p = np.zeros((len_songs, 1))  # (576168, 1)
            p[plylst_test.loc[pid, 'songs_id']] = 1
            # Similarity to every training playlist.
            val = train_songs_A.dot(p).reshape(-1)  # (92056, )
            # Project similarity back onto songs and take the top 200.
            cand_song = train_songs_A_T.dot(val)  # (576168, )
            cand_song_idx = cand_song.reshape(-1).argsort()[-200:][::-1]
            cand_song_idx = remove_seen(songs_already, cand_song_idx)
            rec_song_idx = [song_sid_id[i] for i in cand_song_idx]
            # Same projection onto tags, top 20.
            cand_tag = train_tags_A_T.dot(val)
            cand_tag_idx = cand_tag.reshape(-1).argsort()[-20:][::-1]
            cand_tag_idx = remove_seen(tags_already, cand_tag_idx)
            rec_tag_idx = [tag_tid_id[i] for i in cand_tag_idx]
        ans.append({
            "id": plylst_nid_id[pid],
            "songs": rec_song_idx[:100],
            "tags": rec_tag_idx[:10]
        })
    return ans
def _build_p2v(model, docs, size, song_weight, tag_weight):
    """Build playlist embeddings by summing (weighted) item vectors per playlist.

    Playlists where no song or tag is in the Word2Vec vocabulary are
    skipped (the accumulator stays an int in that case).
    """
    kv = WordEmbeddingsKeyedVectors(size)
    ids = []
    vecs = []
    for q in tqdm(docs):
        tmp_vec = 0
        for item in q['songs']:
            try:
                tmp_vec += model.wv.get_vector(str(item)) * song_weight
            except KeyError:
                pass  # item never made it into the vocabulary (min_count)
        for item in q['tags']:
            try:
                tmp_vec += model.wv.get_vector(str(item)) * tag_weight
            except KeyError:
                pass
        # tmp_vec is still an int only when nothing was accumulated.
        if type(tmp_vec) != int:
            ids.append(str(q['id']))
            vecs.append(tmp_vec)
    kv.add(ids, vecs)
    return kv


def train():
    """Train the Item2Vec (Word2Vec over song/tag "sentences") model and
    playlist embeddings; in Valid mode, also score against the answer set.

    BUG FIX: the tag-similarity test branch was missing its
    ``for ID in most_id:`` loop and reused the stale ``ID`` left over from
    the song branch, so only one neighbour's tags were ever scored (with the
    wrong similarity). The loop is restored to mirror the song branch.
    The two near-identical embedding-building passes are factored into
    ``_build_p2v`` (songs weighted x2 for p2v_song, tags x2 for p2v_tag).
    """
    MODE = "Test"
    if MODE == "Valid":
        train = load_json("arena_data/orig/train.json") + load_json(
            "arena_data/questions/val.json")
        dev = load_json("res/val.json")
        test = load_json("res/test.json")
    else:
        train = load_json("res/train.json")
        dev = load_json("res/val.json")
        test = load_json("res/test.json")

    # Append predicted tags to every document, keeping originals in 'tags_org'.
    pred_tag = load_json("arena_data/model/pred_tag.json")
    dic_pred_tag = {p_t['id']: p_t['predict_tag'] for p_t in pred_tag}
    for doc in train + dev + test:
        doc['tags_org'] = doc['tags'].copy()
        doc['tags'] += dic_pred_tag[doc['id']]

    # One "sentence" per playlist: stringified song ids followed by tags.
    item_list = []
    len_item = []
    for doc in train + dev + test:
        song_list = [str(i) for i in doc['songs']]
        item_list.append(song_list + doc['tags'])
        len_item.append(len(song_list + doc['tags']))
    print("Max length of item list :", max(len_item), ", Min :", min(len_item))

    # Word2Vec needs at least two items per sentence.
    item_list = [x for x in item_list if len(x) > 1]
    print("Train set :", len(item_list))

    print("Training Item2Vec model")
    SIZE = 100
    model = Word2Vec(sentences=item_list,
                     size=SIZE,
                     window=240,
                     min_count=2,
                     sg=1,
                     workers=8,
                     iter=10,
                     negative=7,
                     compute_loss=True,
                     callbacks=[LossPrinter()])
    model.save("arena_data/model/word2vec.model")
    print("Vocab : ", len(model.wv.vocab))

    print("Building and saving playlist embeddings")
    song_dic = {}
    tag_dic = {}
    for q in tqdm(train + test + dev):
        song_dic[str(q['id'])] = q['songs']
        tag_dic[str(q['id'])] = q['tags_org']

    p2v_song = _build_p2v(model, train + test + dev, SIZE, 2, 1)
    p2v_song.save("arena_data/model/p2v_song.model")
    p2v_tag = _build_p2v(model, train + test + dev, SIZE, 1, 2)
    p2v_tag.save("arena_data/model/p2v_tag.model")

    if MODE == "Valid":
        print("Testing")
        questions = load_json("arena_data/questions/val.json")
        cnt_wv_song = 0
        cnt_wv_tag = 0
        res = []
        for q in tqdm(questions):
            dic_song_score = {}
            dic_tag_score = {}
            song_result = []
            tag_result = []
            # Accumulate song votes from the 50 most similar playlists.
            if str(q['id']) in p2v_song.wv.vocab:
                most_id = p2v_song.most_similar(str(q['id']), topn=50)
                for ID in most_id:
                    for s in song_dic[ID[0]]:
                        dic_song_score[s] = dic_song_score.get(s, 0) + ID[1]
            # Accumulate tag votes the same way (restored loop — see docstring).
            if str(q['id']) in p2v_tag.wv.vocab:
                most_id = p2v_tag.most_similar(str(q['id']), topn=50)
                for ID in most_id:
                    for t in tag_dic[ID[0]]:
                        dic_tag_score[t] = dic_tag_score.get(t, 0) + ID[1]
            if len(dic_song_score) > 0:
                for s, _ in sorted(dic_song_score.items(),
                                   key=lambda x: x[1],
                                   reverse=True):
                    song_result.append(s)
                cnt_wv_song += 1
            if len(dic_tag_score) > 0:
                for t, _ in sorted(dic_tag_score.items(),
                                   key=lambda x: x[1],
                                   reverse=True):
                    tag_result.append(t)
                cnt_wv_tag += 1
            res.append({
                "id": q["id"],
                "songs": remove_seen(q["songs"], song_result)[:100],
                "tags": remove_seen(q["tags"], tag_result)[:10],
            })
        print(len(questions), cnt_wv_song, cnt_wv_tag)
        ans = load_json("arena_data/answers/val.json")
        evaluator = CustomEvaluator()
        evaluator._evaluate(ans, res)
def _generate_answers(self, song_meta_json, train, questions):
    """Answers via an ALS implicit-feedback model, with a TF-IDF title
    fallback for cold-start playlists (no seed songs).

    Songs: ALS over a combined train+question playlist x song matrix;
    playlists without songs are matched to similar playlists by title
    TF-IDF instead. Tags: per-song tag co-occurrence, padded with the
    popular-tag list.
    """
    song_meta = {int(song["id"]): song for song in song_meta_json}
    train_meta = {int(plylst["id"]): plylst for plylst in train}
    song_mp_counter, song_mp = most_popular(train, "songs", 200)
    tag_mp_counter, tag_mp = most_popular(train, "tags", 100)
    song_mp_per_genre = self._song_mp_per_genre(song_meta, song_mp_counter)
    tag_per_song = self._tag_per_song(train_meta)

    ## modified for song prediction
    ## pre-processing train set data
    _, song_pop = most_popular(train, "songs", 200000)
    #song_pop = set(song_pop)
    # voca_dict: song id -> contiguous index; voca_dict_t: the reverse map.
    voca_dict, voca_dict_t = self._build_vocadict(song_pop)
    # filtering song list
    num_users = len(train)
    f_song_lst, f_usr_lst = self._const_filtered_lst(train,
                                                     voca_dict,
                                                     num_users,
                                                     to_idx=num_users,
                                                     val=False)
    num_items = len(set(f_song_lst))
    data_len = len(f_song_lst)
    # re-setting index of filtered songs
    item_ids = np.array([voca_dict[i] for i in f_song_lst])
    data = np.ones(data_len)
    # De-duplicate (user, item) pairs before building the sparse matrix.
    rows, cols, data = zip(*set(zip(f_usr_lst, item_ids, data)))
    print('train preproc done', num_items)

    ## pre-processing valid/test set data
    v_num_users = len(questions)
    f_song_lst_v, f_usr_lst_v = self._const_filtered_lst(
        questions, voca_dict, num_users, to_idx=v_num_users, val=True)
    data_len_v = len(f_song_lst_v)
    v_item_ids = np.array([voca_dict[i] for i in f_song_lst_v])
    v_data = np.ones(data_len_v)
    v_rows, v_cols, v_data = zip(
        *set(zip(f_usr_lst_v, v_item_ids, v_data)))
    print('valid preproc done', num_items)

    # Concatenate train and question triples into one user x item matrix
    # (tuples from zip, so + is concatenation).
    n_rows = rows + v_rows
    n_cols = cols + v_cols
    n_data = data + v_data
    t_num_users = num_users + v_num_users
    usr_item_mat = sp.csr_matrix((n_data, (n_rows, n_cols)),
                                 shape=(t_num_users, num_items))
    item_usr_mat = usr_item_mat.T

    als_model = implicit.als.AlternatingLeastSquares(factors=100,
                                                     regularization=0.05,
                                                     iterations=50)
    #als_model = implicit.bpr.BayesianPersonalizedRanking(factors=50) ### actually bpr
    #als_model = implicit.lmf.LogisticMatrixFactorization(factors=50) ### actually Logistic MF
    als_model.fit(item_usr_mat)
    print("als model fitting done")

    ### for cold-start users (plylst containing no song):
    ### build a TF-IDF matrix over tokenized playlist titles.
    title_to_tok, vocab = title_to_token(train)
    v_title_to_tok, v_vocab = title_to_token(questions)
    title_to_tok.extend(v_title_to_tok)
    vocab.extend(v_vocab)
    print("title to token converted", len(title_to_tok), len(vocab))
    fin_vocab = get_fin_vocab(vocab)
    print("final vocab size", len(fin_vocab))
    title_to_idx = []
    for plylst in title_to_tok:
        res_idx = tok_to_idx(plylst, fin_vocab)
        title_to_idx.append(res_idx)
    user_lst, vocab_lst = preproc_for_csr(title_to_idx, 0)
    cb_rows = np.array(user_lst)
    cb_cols = np.array(vocab_lst)
    cb_data = np.ones(len(user_lst))
    plylst_tt_mat = sp.csr_matrix(
        (cb_data, (cb_rows, cb_cols)),
        shape=(len(title_to_tok), len(fin_vocab)))
    print("csr matrix for tf-idf matrix made")
    tfidf_mat = build_tfidf_mat(plylst_tt_mat)
    ####

    answers = []
    for idx, q in tqdm(enumerate(questions)):
        # Dominant genre of the seed songs (fallback candidate source).
        genre_counter = Counter()
        for sid in q["songs"]:
            for genre in song_meta[sid]["song_gn_gnr_basket"]:
                genre_counter.update({genre: 1})
        top_genre = genre_counter.most_common(1)
        if len(top_genre) != 0:
            cur_songs = song_mp_per_genre[top_genre[0][0]]
        else:
            cur_songs = song_mp

        ## modified for tag prediction
        tag_lst = self._tag_per_plylst(q, tag_per_song)
        tag_res = remove_seen(q["tags"], tag_lst)[:10]
        # If co-occurrence yields fewer than 10 tags, use the popular list.
        if len(tag_res) < 10:
            tag_res = remove_seen(q["tags"], tag_mp)[:10]

        ## modified for song prediction
        if len(q["songs"]) == 0:
            # Cold start: nearest playlists by title TF-IDF similarity.
            n_idx = idx + len(train)
            most_sim_lst = get_sim_plylst(tfidf_mat, given=n_idx, topn=30)
            cands = gather_cand(train, questions, most_sim_lst)
            song_res = remove_seen(q["songs"], cands)[:100]
            #print(n_idx, song_res)
        else:
            song_lst = self._cal_alsmodel(idx, num_users, usr_item_mat,
                                          als_model, voca_dict_t)
            song_res = remove_seen(q["songs"], song_lst)[:100]
            # ALS came up short: replace with the genre/popularity fallback.
            if len(song_res) < 100:
                print('checked here', idx)
                song_res = remove_seen(q["songs"], cur_songs)[:100]

        answers.append({"id": q["id"], "songs": song_res, "tags": tag_res})
    return answers
def Recommender(train,
                questions,
                n_msp,
                n_mtp,
                mode,
                sim_measure,
                song_meta,
                freq_song,
                save=False):
    """Score-based recommender combining co-occurrence counts with
    precomputed playlist-similarity matrices (autoencoder / genre / title).

    For each question playlist it scores songs and tags from the most
    similar training playlists, applies artist boosting and issue-date
    filtering, and pads with most-popular fallbacks. Returns the final
    recommendation list; optionally writes it to results/.
    """
    ## final recommendation list
    rec_list = []

    ## Stage 1: preprocessing
    # 1) most_popular fallbacks for when results are missing or short
    _, song_mp = most_popular(train, "songs", 200)
    _, tag_mp = most_popular(train, "tags", 20)

    # 2) dictionaries for fast lookups
    song_plylst_dic, song_tag_dic, plylst_song_dic, plylst_tag_dic, tag_plylst_dic, tag_song_dic, song_issue_dic, song_artist_dic = DicGenerator(
        train, song_meta)

    # 3) load precomputed playlist similarities:
    #    sim_scores   - questions vs train similarity (Autoencoder based)
    #    gnr_scores   - questions vs train similarity (with genre info)
    #    title_scores - questions vs train similarity (Word2vec based)
    '''
    sim_scores: 입력으로 들어온 questions과 train간 유사도 (Autoencoder 기반)
    gnr_scores: 입력으로 들어온 questions과 train간 유사도 (genre 정보 추가)
    title_scores: 입력으로 들어온 questions과 train간 유사도 (Word2vec 기반)
    '''
    sim_scores = np.load(f'scores/{mode}_scores_bias_{sim_measure}.npy',
                         allow_pickle=True).item()
    gnr_scores = np.load(f'scores/{mode}_scores_bias_{sim_measure}_gnr.npy',
                         allow_pickle=True).item()
    title_scores = np.load(
        f'scores/{mode}_scores_title_{sim_measure}_24000.npy',
        allow_pickle=True).item()

    ## Stage 2: helper functions
    # 1) top-k keys of a Counter by frequency
    def most_similar(cnt, topk):
        cnt_topk = cnt.most_common(topk)
        return [k for k, v in cnt_topk]

    # 2) top-k similar playlists (and their scores) from the precomputed tables
    def most_similar_emb(q_id, topk, title=False, genre=False):
        # based on title_scores
        if title:
            plylsts = [t[0] for t in title_scores[q_id][:topk]]
            scores = [t[1] for t in title_scores[q_id][:topk]]
        # based on gnr_scores
        elif genre:
            plylsts = [t[0] for t in gnr_scores[q_id][:topk]]
            scores = [t[1] for t in gnr_scores[q_id][:topk]]
        # based on sim_scores
        else:
            plylsts = [t[0] for t in sim_scores[q_id][:topk]]
            scores = [t[1] for t in sim_scores[q_id][:topk]]
        return plylsts, scores

    # 3) song -> set of similar playlists containing it
    def get_new_song_plylst_dict(plylst_ms):
        new_song_plylst_dict = defaultdict(set)
        for plylst in plylst_ms:
            for _song in plylst_song_dic[plylst]:
                new_song_plylst_dict[_song].add(plylst)
        return new_song_plylst_dict

    ## Stage 3: recommend for each incoming question playlist
    for q in tqdm(questions):
        # 1) playlist info: its seed songs/tags
        q_songs = q['songs']
        q_tags = q['tags']
        # co-occurrence counters for songs/tags/playlists
        song_plylst_C = Counter()
        song_tag_C = Counter()
        tag_plylst_C = Counter()
        tag_song_C = Counter()
        # flags: playlist has no seeds at all, or only a few songs
        no_songs_tags, few_songs_tags = False, False
        if len(q_songs) == 0 and len(q_tags) == 0:
            no_songs_tags = True
        elif len(q_songs) <= 3:
            few_songs_tags = True

        # 2) frequency counting for count-based recommendation
        # over seed songs
        for q_s in q_songs:
            song_plylst_C.update(song_plylst_dic[q_s])
            song_tag_C.update(song_tag_dic[q_s])
        # over seed tags
        for q_t in q_tags:
            tag_plylst_C.update(tag_plylst_dic[q_t])
            tag_song_C.update(tag_song_dic[q_t])
        # normalise by playlist length to get a ratio
        for i, j in list(song_plylst_C.items()):
            if len(plylst_song_dic[i]) > 0:
                song_plylst_C[i] = (j / len(plylst_song_dic[i]))

        # 3) similarity-based scoring
        plylst_song_scores = defaultdict(lambda: 0)
        plylst_tag_scores = defaultdict(lambda: 0)
        # Case 1: neither songs nor tags
        if no_songs_tags:
            # plylst_ms / plylst_mt: n_msp / n_mtp similar playlists by title_scores
            plylst_ms, song_scores = most_similar_emb(q['id'],
                                                      n_msp,
                                                      title=True)
            plylst_mt, tag_scores = most_similar_emb(q['id'],
                                                     n_mtp,
                                                     title=True)
            plylst_add, add_scores = most_similar_emb(q['id'], n_mtp)
        # Case 2: few songs/tags
        elif few_songs_tags:
            # plylst_ms by sim_scores (n_msp) / plylst_mt by title_scores (n_mtp)
            plylst_ms, song_scores = most_similar_emb(q['id'], n_msp)
            plylst_mt, tag_scores = most_similar_emb(q['id'],
                                                     n_mtp,
                                                     title=True)
            plylst_add, add_scores = most_similar_emb(q['id'],
                                                      n_mtp,
                                                      genre=True)
        # Case 3: enough songs and tags
        else:
            # plylst_ms / plylst_mt: n_msp / n_mtp similar playlists by sim_scores
            plylst_ms, song_scores = most_similar_emb(q['id'], n_msp)
            plylst_mt, tag_scores = most_similar_emb(q['id'],
                                                     n_mtp,
                                                     genre=True)
            plylst_add, add_scores = most_similar_emb(q['id'],
                                                      n_mtp,
                                                      title=True)

        new_song_plylst_dict = get_new_song_plylst_dict(plylst_ms)

        # 3-1. compute plylst_song_scores
        for idx, ms_p in enumerate(plylst_ms):
            for song in plylst_song_dic[ms_p]:
                song_score = 0
                # overlap between playlists containing a seed song and this song
                for q_s in q_songs:
                    try:
                        song_score += len(new_song_plylst_dict[q_s]
                                          & new_song_plylst_dict[song]) / len(
                                              new_song_plylst_dict[q_s])
                    except:
                        pass
                # frequent songs get a 4x boost
                if song in freq_song:
                    plylst_song_scores[song] += song_plylst_C[
                        ms_p] * song_score * song_scores[idx] * (n_msp -
                                                                 idx) * 4
                else:
                    plylst_song_scores[song] += song_plylst_C[
                        ms_p] * song_score * song_scores[idx] * (n_msp - idx)
            for tag in plylst_tag_dic[ms_p]:
                plylst_tag_scores[tag] += tag_scores[idx] * (n_msp - idx)

        # 3-2. compute plylst_tag_scores
        for idx, mt_p in enumerate(plylst_mt):
            for tag in plylst_tag_dic[mt_p]:
                plylst_tag_scores[tag] += tag_scores[idx] * (n_mtp - idx)
            for song in plylst_song_dic[mt_p]:
                plylst_song_scores[song] += tag_scores[idx]

        # 3-3. adjust plylst_{song/tag}_scores with the extra similarity source
        for idx, mt_p in enumerate(plylst_add):
            for tag in plylst_tag_dic[mt_p]:
                plylst_tag_scores[tag] += add_scores[idx] * (n_mtp - idx)

        # 4) if both songs and tags were missing, predict seeds first
        if no_songs_tags:
            # fill q_songs from title-similar playlists' predicted songs
            pre_songs = sorted(plylst_song_scores.items(),
                               key=lambda x: x[1],
                               reverse=True)
            pre_songs = [scores[0] for scores in pre_songs][:200]
            pre_songs = pre_songs + remove_seen(pre_songs, song_mp)
            q_songs = pre_songs[:100]
            # fill q_tags likewise
            pre_tags = sorted(plylst_tag_scores.items(),
                              key=lambda x: x[1],
                              reverse=True)
            pre_tags = [scores[0] for scores in pre_tags][:20]
            pre_tags = pre_tags + remove_seen(pre_tags, tag_mp)
            q_tags = pre_tags[:10]

        # 5) produce recommendations for this playlist
        ## song recommendation
        # playlist has songs
        lt_song_art = []
        if len(q_songs) > 0:
            plylst_song_scores = sorted(plylst_song_scores.items(),
                                        key=lambda x: x[1],
                                        reverse=True)
            # collect the playlist's artists to boost same-artist candidates
            lt_artist = []
            for w_song in q_songs:
                lt_artist.extend(song_artist_dic[w_song])
            counter_artist = Counter(lt_artist)
            counter_artist = sorted(counter_artist.items(),
                                    key=lambda x: x[1],
                                    reverse=True)
            if few_songs_tags:
                artist = [art[0] for art in counter_artist]
            else:
                artist = [x[0] for x in counter_artist if x[1] > 1]
            # pick one candidate per boosted artist from the scored tail
            cand_ms = [scores[0] for scores in plylst_song_scores
                       ][(100 - len(artist)):1000]
            for cand in cand_ms:
                if artist == []:
                    break
                if cand in q_songs:
                    break
                for art in song_artist_dic[cand]:
                    if art in artist:
                        lt_song_art.append(cand)
                        artist.remove(art)
                        break
            song_ms = [scores[0] for scores in plylst_song_scores][:200]
        # no songs but tags exist: songs co-occurring with the seed tags
        else:
            song_ms = most_similar(tag_song_C, 200)

        ## tag recommendation
        # playlist has tags
        if len(q_tags) > 0:
            plylst_tag_scores = sorted(plylst_tag_scores.items(),
                                       key=lambda x: x[1],
                                       reverse=True)
            tag_ms = [scores[0] for scores in plylst_tag_scores][:20]
        # no tags but songs exist
        else:
            plylst_tag_scores = sorted(plylst_tag_scores.items(),
                                       key=lambda x: x[1],
                                       reverse=True)
            tag_ms = [scores[0] for scores in plylst_tag_scores][:20]

        ## drop songs issued after the playlist's update date
        if q['updt_date']:
            q_updt_date = q['updt_date'][:4] + q['updt_date'][5:7] + q[
                'updt_date'][8:10]
            song_ms = [x for x in song_ms if song_issue_dic[x] < q_updt_date]

        ## de-duplicate and top up with most_popular when short
        song_candidate = song_ms + remove_seen(song_ms, song_mp)
        tag_candidate = tag_ms + remove_seen(tag_ms, tag_mp)
        song_remove = q_songs
        tag_remove = q_tags
        song_candidate = song_candidate[:100] if no_songs_tags else remove_seen(
            song_remove, song_candidate)[:100]
        # splice the artist-boosted picks into the tail of the top-100
        if len(lt_song_art) > 0:
            lt_song_art = [x for x in lt_song_art if x not in song_candidate]
            song_candidate[(100 - len(lt_song_art)):100] = lt_song_art
        rec_list.append({
            "id": q["id"],
            "songs": song_candidate,
            "tags": tag_candidate[:10] if no_songs_tags else remove_seen(
                tag_remove, tag_candidate)[:10]
        })

    # 6) optionally persist results.json
    if save == True:
        write_json(
            rec_list, 'results/results_' +
            dt.datetime.now().strftime("%y%m%d-%H%M%S") + '_' + mode +
            '.json')

    return rec_list