def _generate_answers(self, song_meta_json, train, questions):
    """Recommend songs from each playlist's dominant genre, plus common tags.

    The genres of a playlist's songs are tallied; the song list registered
    for the most frequent genre is used as the candidate list, falling back
    to the overall ``most_popular`` song list when no genre is found.

    Args:
        song_meta_json: Iterable of song records; each is read for ``"id"``
            and ``"song_gn_gnr_basket"``.
        train: Training playlists, forwarded to ``most_popular``.
        questions: Playlists to answer; each is read for ``"id"``,
            ``"songs"`` and ``"tags"``.

    Returns:
        A list with one dict per question holding ``"id"``, ``"songs"``
        (at most 100 entries) and ``"tags"`` (at most 10 entries).
    """
    meta_by_id = {int(entry["id"]): entry for entry in song_meta_json}
    # presumably (frequency counter, ranked list) -- confirm against most_popular
    popular_song_counts, popular_songs = most_popular(train, "songs", 200)
    _, popular_tags = most_popular(train, "tags", 100)
    songs_by_genre = self._song_mp_per_genre(meta_by_id, popular_song_counts)

    def _answer_for(playlist):
        # One vote per genre occurrence across the playlist's songs.
        genre_votes = Counter(
            genre
            for sid in playlist["songs"]
            for genre in meta_by_id[sid]["song_gn_gnr_basket"]
        )
        leader = genre_votes.most_common(1)
        if len(leader) == 0:
            # No songs (or no genres): use the overall popular list.
            candidates = popular_songs
        else:
            candidates = songs_by_genre[leader[0][0]]
        return {
            "id": playlist["id"],
            "songs": remove_seen(playlist["songs"], candidates)[:100],
            "tags": remove_seen(playlist["tags"], popular_tags)[:10],
        }

    return [_answer_for(q) for q in tqdm(questions)]
def _generate_answers(self, song_meta_json, train, questions):
    """Answer each question with genre-specific popular songs and popular tags.

    Args:
        song_meta_json: Iterable of song records; each is read for ``"id"``
            and ``"song_gn_gnr_basket"``.
        train: Training playlists, forwarded to ``most_popular``.
        questions: Playlists to answer; each is read for ``"id"``,
            ``"songs"`` and ``"tags"``.

    Returns:
        A list of ``{"id", "songs", "tags"}`` dicts, with at most 100 songs
        and at most 10 tags each.
    """
    # Map song id -> the full metadata record of that song.
    song_lookup = {int(record["id"]): record for record in song_meta_json}

    # Top 200 songs (the counter is reused for the per-genre ranking).
    song_counts, top_songs = most_popular(train, "songs", 200)
    # Top 100 tags.
    _, top_tags = most_popular(train, "tags", 100)

    per_genre_songs = self._song_mp_per_genre(song_lookup, song_counts)

    results = []
    for question in tqdm(questions):
        seen_songs = question["songs"]

        genre_tally = Counter()
        for song_id in seen_songs:
            for genre in song_lookup[song_id]["song_gn_gnr_basket"]:
                genre_tally[genre] += 1
        best = genre_tally.most_common(1)

        # If a dominant genre exists, recommend that genre's most frequent
        # songs; otherwise recommend the overall most frequent songs.
        candidates = per_genre_songs[best[0][0]] if len(best) != 0 else top_songs

        results.append({
            "id": question["id"],
            "songs": remove_seen(seen_songs, candidates)[:100],
            "tags": remove_seen(question["tags"], top_tags)[:10],
        })

    return results
def fill_X(train, val):
    """Fill in missing songs/tags of every playlist in ``val``, in place.

    Playlists that have songs but no tags get tags from the song embedding;
    playlists without songs get both songs and tags from the title-based
    recommender. Each playlist also receives ``song_dirty`` / ``tag_dirty``
    markers recording whether the corresponding field was filled in here.

    Relies on the module-level names ``s2v_path``, ``cluster_path``,
    ``song_const`` and ``tag_const``.

    Args:
        train: Training playlists used to build the embedding and the
            popularity fallbacks.
        val: Iterable of playlist dicts; mutated in place.

    Returns:
        The same ``val`` object that was passed in.
    """
    # Load the embedding (the s2v model must already have been built by
    # ply_tag_embedding.py).
    embedding = PlyEmbedding(train)
    embedding.load_s2v(s2v_path)

    # p2v for tag_by_songs, title vectors for Title2Rec
    # (titles / vectors are returned as plain values, not KeyedVectors).
    titles, vectors = embedding.song_based(
        mode='s2v', by='mean', keyedvector=False)

    # Title2Rec: strip non-alphabetic / non-Hangul characters and tokenize
    # (tokens: titles, vecs: vectors, plylst_ids: playlist ids).
    recommender = Title2Rec()
    tokens, vecs, plylst_ids = recommender.preprocess_clustering(
        titles, vectors, ID=True, khaiii=True, verbose=True)

    # Load the clusters (the cluster pickle file must already exist).
    recommender.load_cluster(cluster_path)

    # Sort by cluster and by distance from the cluster centre.
    fasttext_input = recommender.pre_fasttext(tokens, vecs)

    # Fit fasttext and title2rec.
    recommender.fit_fasttext(fasttext_input)
    recommender.fit_title2rec(tokens, plylst_ids)

    # Popularity fallbacks.
    _, pop_songs = most_popular(train, 'songs', 100)
    _, pop_tags = most_popular(train, 'tags', 10)

    for ply in tqdm(val):
        ply['song_dirty'] = 0
        ply['tag_dirty'] = 0

        if ply['songs'] == []:
            # No songs at all: predict both fields from the title.
            songs, tags, song_sign, tag_sign = recommender.title2rec(
                ply, 100, 10, song_const, tag_const)
            if song_sign and len(songs) == 0:
                songs = put_most_popular(songs, pop_songs)
            if tag_sign and len(tags) < 10:
                tags = put_most_popular(tags, pop_tags)
            ply['songs'] = songs
            ply['tags'] = tags
            ply['song_dirty'] = song_sign
            ply['tag_dirty'] = tag_sign
            continue

        if ply['tags'] != []:
            # Songs and tags are both present: nothing to fill.
            continue

        # Songs present, tags missing: derive tags from the songs.
        predicted = embedding.tag_by_songs(ply, 10, 3.9)
        ply['tags'] = predicted
        if len(predicted) < 10:
            ply['tags'] = put_most_popular(predicted, pop_tags)
        ply['tag_dirty'] = 1

    return val
def _generate_answers(self, train, questions):
    """Answer every question with the globally most popular songs and tags.

    Args:
        train: Training playlists, forwarded to ``most_popular``.
        questions: Playlists to answer; each is read for ``"id"``,
            ``"songs"`` and ``"tags"``.

    Returns:
        A list of ``{"id", "songs", "tags"}`` dicts; entries already in the
        question are removed via ``remove_seen`` and the lists are cut to at
        most 100 songs and 10 tags.
    """
    _, popular_songs = most_popular(train, "songs", 200)
    _, popular_tags = most_popular(train, "tags", 100)

    def _answer_for(playlist):
        return {
            "id": playlist["id"],
            "songs": remove_seen(playlist["songs"], popular_songs)[:100],
            "tags": remove_seen(playlist["tags"], popular_tags)[:10],
        }

    return [_answer_for(q) for q in tqdm(questions)]
def generate_answers(train, questions):
    """Build answers, passing through playlists that already have both fields.

    A question whose ``"songs"`` and ``"tags"`` are both non-empty is copied
    into the answer unchanged. Any other question is answered from the
    ``most_popular`` song/tag lists, minus what the question already holds.

    Args:
        train: Training playlists, forwarded to ``most_popular``.
        questions: Iterable of playlist dicts with ``"id"``, ``"songs"``
            and ``"tags"``.

    Returns:
        A list of ``{"id", "songs", "tags"}`` dicts, one per question.
    """
    _, popular_songs = most_popular(train, "songs", 200)
    _, popular_tags = most_popular(train, "tags", 100)

    results = []
    for playlist in questions:
        needs_fill = len(playlist["songs"]) == 0 or len(playlist["tags"]) == 0
        if needs_fill:
            entry = {
                "id": playlist["id"],
                "songs": remove_seen(playlist["songs"], popular_songs)[:100],
                "tags": remove_seen(playlist["tags"], popular_tags)[:10],
            }
        else:
            # Both fields are populated: hand them back as they are.
            entry = {
                "id": playlist["id"],
                "songs": playlist["songs"],
                "tags": playlist["tags"],
            }
        results.append(entry)
    return results
def _generate_answers(self, song_meta_json, train, questions):
    """Recommend songs by artist/genre and tags derived from the songs.

    Song candidates, in priority order:
      1. the playlist's songs all come from a single artist that is a key of
         ``self._artist_songs(...)``: that artist's songs;
      2. otherwise the song list of the playlist's most frequent genre;
      3. otherwise the overall ``most_popular`` song list.

    Tag candidates: when the playlist has songs, the tags attached to those
    songs in ``self._songs_most_tag(train)`` ranked by frequency and padded
    with popular tags; otherwise popular tags only. In every case tags the
    playlist already carries are excluded.

    Args:
        song_meta_json: Iterable of song records; each is read for ``"id"``,
            ``"artist_name_basket"`` and ``"song_gn_gnr_basket"``.
        train: Training playlists (arena_data/orig/train.json per the
            original author's note), forwarded to the helper calls.
        questions: Playlists to answer; each is read for ``"id"``,
            ``"songs"`` and ``"tags"``.

    Returns:
        A list of ``{"id", "songs", "tags"}`` dicts with at most 100 songs
        and at most 10 tags each.
    """
    # Song id -> metadata record of that song.
    song_meta = {int(song["id"]): song for song in song_meta_json}
    # presumably (frequency counter, ranked list) -- confirm against most_popular
    song_mp_counter, song_mp = most_popular(train, "songs", 200)
    _, tag_mp = most_popular(train, "tags", 100)
    song_mp_per_genre = self._song_mp_per_genre(song_meta, song_mp_counter)
    # Artist name -> songs, for artists selected by _artist_songs.
    art_dic = self._artist_songs(song_meta, song_mp_counter)
    # Song id -> tags associated with that song in the training data.
    tag_id = self._songs_most_tag(train)

    answers = []
    for q in tqdm(questions):
        q_songs = q["songs"]
        q_tags = q["tags"]

        artist_counter = Counter()
        genre_counter = Counter()
        for sid in q_songs:
            meta = song_meta[sid]
            for artist in meta["artist_name_basket"]:
                artist_counter[artist] += 1
            for genre in meta["song_gn_gnr_basket"]:
                genre_counter[genre] += 1
        artist_names = list(artist_counter)
        top_genre = genre_counter.most_common(1)

        if len(artist_names) == 1 and artist_names[0] in art_dic:
            cur_songs = list(art_dic[artist_names[0]])
        elif top_genre:
            cur_songs = song_mp_per_genre[top_genre[0][0]]
        else:
            cur_songs = song_mp

        if q_songs != []:
            tag_counter = Counter(
                tag
                for sid in q_songs
                if sid in tag_id
                for tag in tag_id[sid]
            )
            # Drop tags the playlist already has: recommending them back is
            # wasted, and the no-songs branch below filters them as well.
            ranked_tags = remove_seen(
                q_tags, [tag for tag, _ in tag_counter.most_common()])
            # Pad with popular tags that are neither seen nor already ranked.
            filler = remove_seen(list(q_tags) + ranked_tags, tag_mp)
            cur_tags = (ranked_tags + filler)[:10]
        else:
            cur_tags = remove_seen(q_tags, tag_mp)[:10]

        answers.append({
            "id": q["id"],
            "songs": remove_seen(q_songs, cur_songs)[:100],
            "tags": cur_tags,
        })

    return answers
def _generate_answers(self, song_meta_json, train, questions):
    """Recommend songs with an ALS model (TF-IDF titles for cold start).

    Steps:
      1. Build a (playlist x song) sparse matrix from the train and question
         playlists and fit ``implicit``'s AlternatingLeastSquares on it.
      2. Build a TF-IDF matrix over playlist-title tokens of train and
         questions, used for question playlists that contain no song.
      3. Per question: tags come from the tags of its songs (popular tags if
         fewer than 10 remain); songs come from the title-similar playlists
         when the question has no songs, from the ALS model otherwise, and
         from the genre / overall popular list if fewer than 100 remain.

    Args:
        song_meta_json: Iterable of song records; each is read for ``"id"``
            and ``"song_gn_gnr_basket"``.
        train: Sequence of training playlists (``len()`` is taken).
        questions: Sequence of playlists to answer (``len()`` is taken);
            each is read for ``"id"``, ``"songs"`` and ``"tags"``.

    Returns:
        A list of ``{"id", "songs", "tags"}`` dicts, one per question.
    """

    def _unique_triplets(users, items, values):
        # De-duplicate (user, item, value) entries and split them into three
        # parallel tuples. An empty input yields three empty tuples instead
        # of failing on unpacking (e.g. when no question has a known song).
        triplets = set(zip(users, items, values))
        if not triplets:
            return (), (), ()
        return tuple(zip(*triplets))

    song_meta = {int(song["id"]): song for song in song_meta_json}
    train_meta = {int(plylst["id"]): plylst for plylst in train}
    song_mp_counter, song_mp = most_popular(train, "songs", 200)
    _, tag_mp = most_popular(train, "tags", 100)
    song_mp_per_genre = self._song_mp_per_genre(song_meta, song_mp_counter)
    tag_per_song = self._tag_per_song(train_meta)

    # ---- song prediction: pre-process the train set ----
    _, song_pop = most_popular(train, "songs", 200000)
    # presumably song id -> index and the reverse mapping -- confirm in
    # _build_vocadict
    voca_dict, voca_dict_t = self._build_vocadict(song_pop)

    # Keep only the songs covered by the vocabulary.
    num_users = len(train)
    f_song_lst, f_usr_lst = self._const_filtered_lst(
        train, voca_dict, num_users, to_idx=num_users, val=False)
    num_items = len(set(f_song_lst))

    # Re-index the filtered songs through the vocabulary.
    item_ids = np.array([voca_dict[i] for i in f_song_lst])
    rows, cols, data = _unique_triplets(
        f_usr_lst, item_ids, np.ones(len(f_song_lst)))
    print('train preproc done', num_items)

    # ---- song prediction: pre-process the valid/test set ----
    v_num_users = len(questions)
    f_song_lst_v, f_usr_lst_v = self._const_filtered_lst(
        questions, voca_dict, num_users, to_idx=v_num_users, val=True)
    v_item_ids = np.array([voca_dict[i] for i in f_song_lst_v])
    v_rows, v_cols, v_data = _unique_triplets(
        f_usr_lst_v, v_item_ids, np.ones(len(f_song_lst_v)))
    print('valid preproc done', num_items)

    # Train rows followed by question rows in one matrix.
    n_rows = rows + v_rows
    n_cols = cols + v_cols
    n_data = data + v_data
    t_num_users = num_users + v_num_users

    usr_item_mat = sp.csr_matrix((n_data, (n_rows, n_cols)),
                                 shape=(t_num_users, num_items))
    item_usr_mat = usr_item_mat.T

    als_model = implicit.als.AlternatingLeastSquares(factors=100,
                                                     regularization=0.05,
                                                     iterations=50)
    als_model.fit(item_usr_mat)
    print("als model fitting done")

    # ---- cold-start playlists (no song): title tokens -> TF-IDF ----
    title_to_tok, vocab = title_to_token(train)
    v_title_to_tok, v_vocab = title_to_token(questions)
    title_to_tok.extend(v_title_to_tok)
    vocab.extend(v_vocab)
    print("title to token converted", len(title_to_tok), len(vocab))

    fin_vocab = get_fin_vocab(vocab)
    print("final vocab size", len(fin_vocab))

    title_to_idx = [tok_to_idx(plylst, fin_vocab) for plylst in title_to_tok]

    user_lst, vocab_lst = preproc_for_csr(title_to_idx, 0)
    cb_rows = np.array(user_lst)
    cb_cols = np.array(vocab_lst)
    cb_data = np.ones(len(user_lst))

    plylst_tt_mat = sp.csr_matrix(
        (cb_data, (cb_rows, cb_cols)),
        shape=(len(title_to_tok), len(fin_vocab)))
    print("csr matrix for tf-idf matrix made")

    tfidf_mat = build_tfidf_mat(plylst_tt_mat)

    answers = []
    # enumerate(tqdm(...)) instead of tqdm(enumerate(...)): the progress bar
    # can then read the length of ``questions`` and show a total.
    for idx, q in enumerate(tqdm(questions)):
        genre_counter = Counter()
        for sid in q["songs"]:
            for genre in song_meta[sid]["song_gn_gnr_basket"]:
                genre_counter[genre] += 1
        top_genre = genre_counter.most_common(1)
        cur_songs = (song_mp_per_genre[top_genre[0][0]]
                     if top_genre else song_mp)

        # Tag prediction: tags of the playlist's songs, else popular tags.
        tag_lst = self._tag_per_plylst(q, tag_per_song)
        tag_res = remove_seen(q["tags"], tag_lst)[:10]
        if len(tag_res) < 10:
            tag_res = remove_seen(q["tags"], tag_mp)[:10]

        # Song prediction.
        if len(q["songs"]) == 0:
            # Question rows follow the train rows in the title matrix.
            n_idx = idx + len(train)
            most_sim_lst = get_sim_plylst(tfidf_mat, given=n_idx, topn=30)
            cands = gather_cand(train, questions, most_sim_lst)
            song_res = remove_seen(q["songs"], cands)[:100]
        else:
            song_lst = self._cal_alsmodel(idx, num_users, usr_item_mat,
                                          als_model, voca_dict_t)
            song_res = remove_seen(q["songs"], song_lst)[:100]

        if len(song_res) < 100:
            # Too few candidates: fall back to the genre / popular list.
            print('checked here', idx)
            song_res = remove_seen(q["songs"], cur_songs)[:100]

        answers.append({"id": q["id"], "songs": song_res, "tags": tag_res})

    return answers
def Recommender(train,
                questions,
                n_msp,
                n_mtp,
                mode,
                sim_measure,
                song_meta,
                freq_song,
                save=False):
    """Recommend songs and tags for every question playlist.

    Combines co-occurrence counts with pre-computed playlist similarities
    that are loaded from ``scores/{mode}_scores_*_{sim_measure}*.npy``.

    Args:
        train: Training playlists.
        questions: Playlists to answer; each is read for ``"id"``,
            ``"songs"``, ``"tags"`` and ``"updt_date"``.
        n_msp: Number of similar playlists used for song scoring.
        n_mtp: Number of similar playlists used for tag scoring.
        mode: Name fragment of the score files; also used in the result
            file name.
        sim_measure: Name fragment of the score files.
        song_meta: Song metadata, forwarded to ``DicGenerator``.
        freq_song: Container of songs whose score is multiplied by 4.
        save: When true, the result is also written with ``write_json``
            to ``results/results_<timestamp>_<mode>.json``.

    Returns:
        A list of ``{"id", "songs", "tags"}`` dicts, one per question.
    """
    # Final recommendation list.
    rec_list = []

    ## Step 1: preparation
    # 1) most_popular lists, used when recommendations are missing or short.
    _, song_mp = most_popular(train, "songs", 200)
    _, tag_mp = most_popular(train, "tags", 20)

    # 2) Lookup dictionaries for fast access.
    (song_plylst_dic, song_tag_dic, plylst_song_dic, plylst_tag_dic,
     tag_plylst_dic, tag_song_dic, song_issue_dic,
     song_artist_dic) = DicGenerator(train, song_meta)

    # 3) Pre-computed similarities between the questions and train
    #    (descriptions taken from the original author's notes):
    #      sim_scores:   Autoencoder based
    #      gnr_scores:   with genre information added
    #      title_scores: Word2vec based
    # NOTE(review): allow_pickle=True unpickles the file contents; only load
    # score files that come from a trusted source.
    sim_scores = np.load(f'scores/{mode}_scores_bias_{sim_measure}.npy',
                         allow_pickle=True).item()
    gnr_scores = np.load(f'scores/{mode}_scores_bias_{sim_measure}_gnr.npy',
                         allow_pickle=True).item()
    title_scores = np.load(
        f'scores/{mode}_scores_title_{sim_measure}_24000.npy',
        allow_pickle=True).item()

    ## Step 2: local helpers
    # 1) The topk most frequent keys of a Counter.
    def most_similar(cnt, topk):
        return [key for key, _ in cnt.most_common(topk)]

    # 2) Playlists and scores of the topk entries stored for q_id in the
    #    selected similarity table (title > genre > default).
    def most_similar_emb(q_id, topk, title=False, genre=False):
        if title:
            table = title_scores
        elif genre:
            table = gnr_scores
        else:
            table = sim_scores
        top = table[q_id][:topk]
        return [t[0] for t in top], [t[1] for t in top]

    # 3) song -> set of the given playlists that contain it.
    def get_new_song_plylst_dict(plylst_ms):
        new_song_plylst_dict = defaultdict(set)
        for plylst in plylst_ms:
            for _song in plylst_song_dic[plylst]:
                new_song_plylst_dict[_song].add(plylst)
        return new_song_plylst_dict

    ## Step 3: recommend for every question playlist
    for q in tqdm(questions):
        # 1) Contents of the question playlist.
        q_songs = q['songs']
        q_tags = q['tags']

        # Co-occurrence counts of songs/tags/playlists seen together with
        # the playlist's songs and tags.
        song_plylst_C = Counter()
        song_tag_C = Counter()
        tag_plylst_C = Counter()
        tag_song_C = Counter()

        # Flags: no songs and no tags at all / only a few songs.
        no_songs_tags, few_songs_tags = False, False
        if len(q_songs) == 0 and len(q_tags) == 0:
            no_songs_tags = True
        elif len(q_songs) <= 3:
            few_songs_tags = True

        # 2) Counts for the frequency-based part.
        for q_s in q_songs:
            song_plylst_C.update(song_plylst_dic[q_s])
            song_tag_C.update(song_tag_dic[q_s])
        for q_t in q_tags:
            tag_plylst_C.update(tag_plylst_dic[q_t])
            tag_song_C.update(tag_song_dic[q_t])

        # Turn shared-song counts into a ratio of the playlist's length.
        for plylst, shared in list(song_plylst_C.items()):
            plylst_len = len(plylst_song_dic[plylst])
            if plylst_len > 0:
                song_plylst_C[plylst] = shared / plylst_len

        # 3) Scores for the similarity-based part.
        plylst_song_scores = defaultdict(int)
        plylst_tag_scores = defaultdict(int)

        q_id = q['id']
        if no_songs_tags:
            # Case 1: neither songs nor tags -> title similarity for both.
            plylst_ms, song_scores = most_similar_emb(q_id, n_msp, title=True)
            plylst_mt, tag_scores = most_similar_emb(q_id, n_mtp, title=True)
            plylst_add, add_scores = most_similar_emb(q_id, n_mtp)
        elif few_songs_tags:
            # Case 2: few songs -> default similarity for songs, title
            # similarity for tags.
            plylst_ms, song_scores = most_similar_emb(q_id, n_msp)
            plylst_mt, tag_scores = most_similar_emb(q_id, n_mtp, title=True)
            plylst_add, add_scores = most_similar_emb(q_id, n_mtp, genre=True)
        else:
            # Case 3: enough songs -> default similarity for songs, genre
            # similarity for tags.
            plylst_ms, song_scores = most_similar_emb(q_id, n_msp)
            plylst_mt, tag_scores = most_similar_emb(q_id, n_mtp, genre=True)
            plylst_add, add_scores = most_similar_emb(q_id, n_mtp, title=True)

        new_song_plylst_dict = get_new_song_plylst_dict(plylst_ms)

        # 3-1. plylst_song_scores
        for idx, ms_p in enumerate(plylst_ms):
            for song in plylst_song_dic[ms_p]:
                song_score = 0
                for q_s in q_songs:
                    seed_plylsts = new_song_plylst_dict[q_s]
                    if not seed_plylsts:
                        # q_s is in none of the similar playlists; skipping
                        # avoids dividing by zero.
                        continue
                    song_score += (
                        len(seed_plylsts & new_song_plylst_dict[song]) /
                        len(seed_plylsts))
                # Songs listed in freq_song are weighted four times.
                boost = 4 if song in freq_song else 1
                plylst_song_scores[song] += (song_plylst_C[ms_p] *
                                             song_score * song_scores[idx] *
                                             (n_msp - idx) * boost)
            # NOTE(review): tag_scores is indexed with a position from
            # plylst_ms here -- confirm this is intended (vs. song_scores).
            for tag in plylst_tag_dic[ms_p]:
                plylst_tag_scores[tag] += tag_scores[idx] * (n_msp - idx)

        # 3-2. plylst_tag_scores
        for idx, mt_p in enumerate(plylst_mt):
            for tag in plylst_tag_dic[mt_p]:
                plylst_tag_scores[tag] += tag_scores[idx] * (n_mtp - idx)
            for song in plylst_song_dic[mt_p]:
                plylst_song_scores[song] += tag_scores[idx]

        # 3-3. Tag score correction from the additional similarity table.
        for idx, mt_p in enumerate(plylst_add):
            for tag in plylst_tag_dic[mt_p]:
                plylst_tag_scores[tag] += add_scores[idx] * (n_mtp - idx)

        # 4) Neither songs nor tags given: fill both from the predictions.
        if no_songs_tags:
            pre_songs = sorted(plylst_song_scores.items(),
                               key=lambda x: x[1],
                               reverse=True)
            pre_songs = [scores[0] for scores in pre_songs][:200]
            pre_songs = pre_songs + remove_seen(pre_songs, song_mp)
            q_songs = pre_songs[:100]

            pre_tags = sorted(plylst_tag_scores.items(),
                              key=lambda x: x[1],
                              reverse=True)
            pre_tags = [scores[0] for scores in pre_tags][:20]
            pre_tags = pre_tags + remove_seen(pre_tags, tag_mp)
            q_tags = pre_tags[:10]

        # 5) Recommendation for the question playlist.
        ## Songs
        lt_song_art = []
        if len(q_songs) > 0:
            plylst_song_scores = sorted(plylst_song_scores.items(),
                                        key=lambda x: x[1],
                                        reverse=True)
            ranked_songs = [scores[0] for scores in plylst_song_scores]

            # Artists of the playlist's songs, most frequent first.
            lt_artist = []
            for w_song in q_songs:
                lt_artist.extend(song_artist_dic[w_song])
            counter_artist = sorted(Counter(lt_artist).items(),
                                    key=lambda x: x[1],
                                    reverse=True)
            if few_songs_tags:
                artist = [art[0] for art in counter_artist]
            else:
                artist = [x[0] for x in counter_artist if x[1] > 1]

            # From the lower-ranked candidates, collect one song for each of
            # those artists.
            cand_ms = ranked_songs[(100 - len(artist)):1000]
            for cand in cand_ms:
                if artist == []:
                    break
                if cand in q_songs:
                    break
                for art in song_artist_dic[cand]:
                    if art in artist:
                        lt_song_art.append(cand)
                        artist.remove(art)
                        break

            song_ms = ranked_songs[:200]
        else:
            # No songs but tags: songs co-occurring with the tags.
            song_ms = most_similar(tag_song_C, 200)

        ## Tags (same ranking whether or not the playlist has tags)
        plylst_tag_scores = sorted(plylst_tag_scores.items(),
                                   key=lambda x: x[1],
                                   reverse=True)
        tag_ms = [scores[0] for scores in plylst_tag_scores][:20]

        ## Drop songs issued after the playlist's update date.
        if q['updt_date']:
            # Characters 0-3, 5-6 and 8-9 joined; presumably
            # "YYYY-MM-DD..." -> "YYYYMMDD" -- confirm the input format.
            q_updt_date = (q['updt_date'][:4] + q['updt_date'][5:7] +
                           q['updt_date'][8:10])
            song_ms = [x for x in song_ms if song_issue_dic[x] < q_updt_date]

        ## De-duplicate and pad with most_popular when too short.
        song_candidate = song_ms + remove_seen(song_ms, song_mp)
        tag_candidate = tag_ms + remove_seen(tag_ms, tag_mp)

        if no_songs_tags:
            # q_songs / q_tags were predicted above, so nothing is "seen".
            song_candidate = song_candidate[:100]
            tag_result = tag_candidate[:10]
        else:
            song_candidate = remove_seen(q_songs, song_candidate)[:100]
            tag_result = remove_seen(q_tags, tag_candidate)[:10]

        if len(lt_song_art) > 0:
            # Put the artist-matched songs into the tail of the top 100.
            lt_song_art = [x for x in lt_song_art if x not in song_candidate]
            song_candidate[(100 - len(lt_song_art)):100] = lt_song_art

        rec_list.append({
            "id": q["id"],
            "songs": song_candidate,
            "tags": tag_result,
        })

    # 6) Optionally store the results as a JSON file.
    if save:
        write_json(
            rec_list, 'results/results_' +
            dt.datetime.now().strftime("%y%m%d-%H%M%S") + '_' + mode + '.json')

    return rec_list