def run(self, question_fname):
    """Answer the playlists in *question_fname* and dump the results JSON."""
    print("Loading question file...")
    question_df = pd.read_json(question_fname)
    print("Writing answers...")
    write_json(self._generate_answers(question_df), "results/results.json")
def run(self, train_fname, question_fname):
    """Load train and question files, generate answers, and persist them."""
    print("Loading train file...")
    train_playlists = load_json(train_fname)
    print("Loading question file...")
    question_playlists = load_json(question_fname)
    print("Writing answers...")
    write_json(self._generate_answers(train_playlists, question_playlists),
               "results/results.json")
def run(self, train_fname, question_fname):
    """Generate answers for the question file using the train file, then save.

    Typical inputs: arena_data/orig/train.json and
    arena_data/questions/val.json.
    """
    print("Loading train file...")
    train_df = pd.read_json(train_fname)
    print("Loading question file...")
    question_df = pd.read_json(question_fname)
    print("Generating answers...")
    result = self._generate_answers(train_df, question_df)
    print("Writing answers...")
    write_json(result, "results/results.json")
def run(self, song_meta_fname, train_fname, question_fname):
    """Load song metadata plus train/question playlists; write the answers."""
    print("Loading song meta...")
    meta = load_json(song_meta_fname)
    print("Loading train file...")
    train_playlists = load_json(train_fname)
    print("Loading question file...")
    question_playlists = load_json(question_fname)
    print("Writing answers...")
    write_json(self._generate_answers(meta, train_playlists, question_playlists),
               "results/results.json")
def run(self, fname):
    """Shuffle the playlists in *fname*, split train/val, and write all files."""
    # seed fixed so the random shuffle below is reproducible
    random.seed(777)
    print("Reading data...\n")
    playlists = load_json(fname)
    random.shuffle(playlists)  # randomize order before splitting
    print(f"Total playlists: {len(playlists)}")
    print("Splitting data...")
    train, val = self._split_data(playlists)
    print("Original train...")
    write_json(train, "orig/train.json")
    print("Original val...")
    write_json(val, "orig/val.json")
    print("Masked val...")
    # mask the validation set into question/answer halves
    val_q, val_a = self._mask_data(val)
    write_json(val_q, "questions/val.json")
    write_json(val_a, "answers/val.json")
def run(self, train_fname, question_fname):
    # Configure logging, then generate and persist answers.
    # NOTE(review): `WARN` is not defined anywhere in this block — presumably
    # imported elsewhere (e.g. a logging level constant); confirm it resolves.
    print('set logger')
    logger = log.get_logger()
    print(logger)  # debug print of the logger object — likely leftover scaffolding
    logger.set_log_level(WARN)
    print('logger set')
    print("Loading train file...")
    train = load_json(train_fname)
    print("Loading question file...")
    questions = load_json(question_fname)
    print("Writing answers...")
    answers = self._generate_answers(train, questions)
    write_json(answers, "results/results.json")
def run(self):
    """Generate related-playlist answers for the fixed validation split."""
    # NOTE(review): song_meta is loaded but never referenced below — confirm
    # whether self.related needs it or whether this read can be dropped.
    song_meta = pd.read_json('./res/song_meta.json')
    print("Loading train file...")
    train_df = pd.read_json('./arena_data/orig/train.json', encoding='utf-8')
    print("Loading question file...")
    question_df = pd.read_json('./arena_data/questions/val.json',
                               encoding='utf-8')
    print("Generating answers...")
    result = self.related(train_df, question_df)
    print("Writing answers...")
    write_json(result, 'results/results.json')
def _load_train(self):
    """Load training playlists, keep popular ones, and collect song/tag vocab."""
    raw = load_json('./arena_data/orig/train.json')
    self.train = []
    songs = set()
    tags = set()
    print('train data filtering...')
    for plylst in tqdm(raw):
        # keep only playlists with more than 50 likes
        if plylst['like_cnt'] > 50:
            self.train.append(plylst)
            songs.update(plylst['songs'])
            tags.update(plylst['tags'])
    self.song_list = list(songs)
    self.tag_list = list(tags)
    # fixed size of the full song vocabulary (matches the AE input width)
    self.total_song_num = 707989
    write_json(self.tag_list, 'meta/AE_tag_list.json')
def run(self, song_meta_fname, train_fname, question_fname):
    """Generate answers and per-question k-score diagnostics (CSV)."""
    print("Loading song meta...")
    song_meta = load_json(song_meta_fname)
    print("Loading train file...")
    train_playlists = load_json(train_fname)
    print("Loading question file...")
    question_playlists = load_json(question_fname)
    print("Writing answers...")
    score_df = pd.DataFrame(
        columns=['id', 'means_music_score', 'mean_tag_score',
                 'mean_title_score'])
    answers = self._generate_answers(song_meta, train_playlists,
                                     question_playlists, score_df)
    # _generate_answers fills score_df in place; persist it for analysis
    score_df.to_csv('./arena_data/question_k_score.csv', index=False)
    write_json(answers, "results/results.json")
def run(train_fname, val_fname, test_fname):
    """Build tag<->id mappings from all playlists and write them as JSON.

    Fix: the original enumerated ``list(tags)`` twice — once per dict —
    relying on set-iteration order being identical both times. Enumerate
    once and derive the inverse mapping from the forward one, so the two
    dicts are consistent by construction.
    """
    tags = set()
    print("Reading train data...\n")
    playlists_train = load_json(train_fname)
    print("Reading val data...\n")
    playlists_val = load_json(val_fname)
    print("Reading test data...\n")
    playlists_test = load_json(test_fname)
    print("Get tags...\n")
    for ply in playlists_train + playlists_test + playlists_val:
        tags.update(ply['tags'])
    # single enumeration; inverse derived from the forward map
    tag_to_id = {tag: i for i, tag in enumerate(tags)}
    id_to_tag = {i: tag for tag, i in tag_to_id.items()}
    print("Write tag_to_id.json...\n")
    write_json(tag_to_id, 'tag_to_id.json')
    print("Write id_to_tag.json...\n")
    write_json(id_to_tag, 'id_to_tag.json')
def run(self, song_meta_fname, train_fname, question_fname, genre_fname,
        val_fname=None, test_fname=None, jobs=1):
    # Full pipeline: run the CF solver and the graph recommender, then
    # ensemble their two result files into ./results/results.json.
    # When both val_fname and test_fname are given, the graph model trains
    # on a reproduced merged train set instead of train_fname alone.
    wanna_use_merged_train = (val_fname is not None) and (test_fname is not None)
    if wanna_use_merged_train:
        # self.merge_trains([train_fname, val_fname, test_fname], MERGED_TRAIN_FNAME)
        self.reproduce_train(train_fname, val_fname, test_fname,
                             MERGED_TRAIN_FNAME)
    graph_train_fname = MERGED_TRAIN_FNAME if wanna_use_merged_train else train_fname
    # Multiprocessing for the graph step is unsupported on Windows.
    graph_jobs = 1 if RUNNING_IN_WINDOWS else jobs
    if RUNNING_IN_WINDOWS and jobs > 1:
        print("[Warning] 그래프 추천은 윈도우 환경에서 멀티프로세싱이 불가능 합니다.")
    cf_solver = MultiprocessSolver()
    cf_solver.run(song_meta_fname=song_meta_fname, train_fname=train_fname,
                  question_fname=question_fname, jobs=jobs)
    graph_solver = GrapeRecommender()
    # NOTE(review): graph_jobs is computed above but `jobs` is passed here —
    # confirm whether the graph step should receive graph_jobs instead.
    graph_solver.run(song_meta_fname=song_meta_fname,
                     train_fname=graph_train_fname,
                     question_fname=question_fname,
                     genre_fname=genre_fname, jobs=jobs)
    ensembler = Ensembler(
        ['./graph/results/results.json', './cf/results/results.json'],
        question_fname)
    res = ensembler.ensemble()
    print(res[0])  # sample of the ensembled output for a quick sanity check
    write_json(res, './results/results.json')
def run(self):
    # Autoencoder inference over self.test: for each playlist, build binary
    # song/tag input vectors, run the model, and keep the top-100 unseen
    # songs and top-10 unseen tags as the prediction.
    print('loading meta data...')
    self._load_meta()
    model = models.load_model('./models/auto_encoder.h5')
    result = []
    for t in tqdm(self.test):
        # 707989 = fixed size of the song vocabulary (AE input width)
        input_song_vec = np.zeros((1, 707989))
        input_tag_vec = np.zeros((1, len(self.tag_list)))
        predict = dict()
        predict['id'] = t['id']
        # one-hot encode the playlist's songs and (known) tags
        for song in t['songs']:
            input_song_vec[0][song] = 1
        for tag in t['tags']:
            if tag in self.tag_list:
                input_tag_vec[0][self.tag_list.index(tag)] = 1
        song_vec, tag_vec = model([input_song_vec, input_tag_vec])
        song_vec = np.array(song_vec[0])
        tag_vec = np.array(tag_vec[0])
        # argsort is ascending, so walk from the back (i = -1) for top scores
        song_rank = song_vec.argsort()
        tag_rank = tag_vec.argsort()
        pred_songs = []
        pred_tags = []
        i = -1
        while len(pred_songs) < 100:
            # skip songs already in the playlist
            if song_rank[i] not in t['songs']:
                pred_songs.append(song_rank[i])
            i -= 1
        i = -1
        while len(pred_tags) < 10:
            # skip tags already on the playlist
            if self.tag_list[tag_rank[i]] not in t['tags']:
                pred_tags.append(self.tag_list[tag_rank[i]])
            i -= 1
        predict['songs'] = pred_songs
        predict['tags'] = pred_tags
        result.append(predict)
    write_json(result, 'result.json')
def val_inference(self):
    # Run the trained autoencoder over the masked validation questions and
    # write top-100 song / top-10 tag predictions per playlist.
    model = models.load_model('./models/auto_encoder.h5')
    val = load_json('./arena_data/questions/val.json')
    tag_list = load_json('./arena_data/meta/AE_tag_list.json')
    result = []
    for v in tqdm(val):
        # 707989 = fixed size of the song vocabulary (AE input width)
        input_song_vec = np.zeros((1, 707989))
        input_tag_vec = np.zeros((1, len(tag_list)))
        predict = dict()
        predict['id'] = v['id']
        # one-hot encode the question playlist's songs and known tags
        for s in v['songs']:
            input_song_vec[0][s] = 1
        for t in v['tags']:
            if t in tag_list:
                input_tag_vec[0][tag_list.index(t)] = 1
        song_vec, tag_vec = model([input_song_vec, input_tag_vec])
        song_vec = np.array(song_vec[0])
        tag_vec = np.array(tag_vec[0])
        # argsort is ascending, so walk from the back (i = -1) for top scores
        song_rank = song_vec.argsort()
        tag_rank = tag_vec.argsort()
        pred_songs = []
        pred_tags = []
        i = -1
        while len(pred_songs) < 100:
            # skip songs already present in the question
            if song_rank[i] not in v['songs']:
                pred_songs.append(song_rank[i])
            i -= 1
        i = -1
        while len(pred_tags) < 10:
            # skip tags already present in the question
            if tag_list[tag_rank[i]] not in v['tags']:
                pred_tags.append(tag_list[tag_rank[i]])
            i -= 1
        predict['songs'] = pred_songs
        predict['tags'] = pred_tags
        result.append(predict)
    write_json(result, 'AE_results.json')
def run(self, fname):
    """Shuffle, split into train/val, and write original plus masked files."""
    random.seed(777)  # deterministic shuffle
    print("Reading data...\n")
    playlists = load_json(fname)
    random.shuffle(playlists)
    print(f"Total playlists: {len(playlists)}")
    print("Splitting data...")
    train_set, val_set = self._split_data(playlists)
    print("Original train...")
    write_json(train_set, "orig/train.json")
    print("Original val...")
    write_json(val_set, "orig/val.json")
    print("Masked val...")
    masked_q, masked_a = self._mask_data(val_set)
    write_json(masked_q, "questions/val.json")
    write_json(masked_a, "answers/val.json")
def run(self, fname, train_size):
    """Split *fname* into train/val using the requested train fraction."""
    random.seed(777)  # reproducible shuffle
    print('Reading data...\n')
    data = load_json(fname)
    random.shuffle(data)
    print(f'Total playlists: {len(data)}')
    print(f'Splitting data... train_size is {train_size}')
    train_part, val_part = self._split_data(data, train_size)
    print('Original train...')
    write_json(train_part, 'orig/train.json')
    print('Original val...')
    write_json(val_part, 'orig/val.json')
    print('Masked val...')
    q_part, a_part = self._mask_data(val_part)
    write_json(q_part, 'questions/val.json')
    write_json(a_part, 'answers/val.json')
def run(self, fname):
    """Read playlists, shuffle, split train/val, and mask the validation set."""
    random.seed(777)
    print("Reading data...\n")
    # each playlist dict holds tags, id, title, songs, like count, update date
    playlists = load_json(fname)
    random.shuffle(playlists)
    print(f"Total playlists: {len(playlists)}")
    print("Splitting data...")
    train, val = self._split_data(playlists)
    print("Original train...")
    # the fresh train split goes into the orig folder
    write_json(train, "orig/train.json")
    print("Original val...")
    write_json(val, "orig/val.json")
    print("Masked val...")
    # masked questions go under questions/, their answers under answers/
    val_q, val_a = self._mask_data(val)
    write_json(val_q, "questions/val.json")
    write_json(val_a, "answers/val.json")
def eval_dcg(self, res):
    """Write *res* to the results file and score it against the answer set."""
    print("Caculating dcg...")  # message kept verbatim (typo in original)
    write_json(res, "results/results.json")
    # NOTE(review): written path lacks the arena_data/ prefix that the
    # evaluated path has — presumably write_json prepends it; confirm.
    ArenaEvaluator().evaluate("arena_data/answers/val_hye.json",
                              "arena_data/results/results.json")
return solve_tag_main(indx)  # NOTE(review): dangling tail of a function whose
# `def` line is outside this chunk (notebook export) — keep with its definition.

# In[22]:
# Recommend tags for every validation question, keyed by playlist id.
rec_tags = {}
timer = 0
for playlist in playlists_valq:
    # crude progress indicator every 1000 playlists
    if timer % 1000 == 0:
        print('timer:', timer)
    pid = playlist['id']
    # NOTE(review): solve_tag is called with the positional counter, not pid —
    # presumably it indexes playlists_valq by position; confirm intended.
    rec_tags[pid] = solve_tag(timer)
    timer += 1

# In[23]:
# Combine the precomputed song and tag recommendations into answer records.
answers = []
for playlist in playlists_valq:
    pid = playlist['id']
    answer = {'id': pid, 'songs': rec_songs[pid], 'tags': rec_tags[pid]}
    answers.append(answer)

# In[24]:
write_json(answers, 'results.json')

# In[ ]:
def eval_dcg(self, res):
    """Persist *res* and evaluate it against the held-out answer file."""
    write_json(res, "results/results.json")
    ArenaEvaluator().evaluate("arena_data/answers/val_hye.json",
                              "arena_data/results/results.json")
def train(train_dataset, model_file_path, id2prep_song_file_path,
          id2tag_file_path, question_dataset, answer_file_path):
    # Train the autoencoder on concatenated song|tag binary vectors; in
    # local-validation mode (mode == 0) evaluate against question_dataset
    # every `check_every` epochs.
    # NOTE(review): batch_size, num_workers, H, dropout, learning_rate,
    # epochs and mode are read from enclosing/module scope — confirm they
    # are defined before this function runs.
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    id2tag_dict = dict(np.load(id2tag_file_path, allow_pickle=True).item())
    id2prep_song_dict = dict(
        np.load(id2prep_song_file_path, allow_pickle=True).item())
    # parameters
    num_songs = train_dataset.num_songs
    num_tags = train_dataset.num_tags
    # hyper parameters: autoencoder input width == output width
    D_in = D_out = num_songs + num_tags
    # In local_val mode, intermediate results are checked periodically.
    q_data_loader = None
    check_every = 5
    tmp_result_file_path = 'results/tmp_results.json'
    evaluator = ArenaEvaluator()
    if question_dataset is not None:
        q_data_loader = DataLoader(question_dataset, shuffle=True,
                                   batch_size=batch_size,
                                   num_workers=num_workers)
    data_loader = DataLoader(train_dataset, shuffle=True,
                             batch_size=batch_size, num_workers=num_workers)
    model = AutoEncoder(D_in, H, D_out, dropout=dropout).to(device)
    parameters = model.parameters()
    loss_func = nn.BCELoss()
    optimizer = torch.optim.Adam(parameters, lr=learning_rate)
    try:
        # resume from a previous checkpoint when one exists
        model = torch.load(model_file_path)
        print("\n--------model restored--------\n")
    except:
        # NOTE(review): bare except swallows every failure mode (corrupt
        # checkpoint, version mismatch), not only "file missing".
        print("\n--------model not restored--------\n")
        pass
    temp_fn = 'arena_data/answers/temp.json'
    if os.path.exists(temp_fn):
        os.remove(temp_fn)
    for epoch in range(epochs):
        print()
        print('epoch: ', epoch)
        running_loss = 0.0
        for idx, (_id, _data) in enumerate(tqdm(data_loader,
                                                desc='training...')):
            _data = _data.to(device)
            optimizer.zero_grad()
            output = model(_data)
            # reconstruction loss against the input itself (autoencoder)
            loss = loss_func(output, _data)
            loss.backward()
            optimizer.step()
            running_loss += loss.item()
        # NOTE(review): `%d%%` truncates epoch/epochs*100 to an integer
        print('loss: %d %d%% %.4f' % (epoch, epoch / epochs * 100,
                                      running_loss))
        # checkpoint after every epoch
        torch.save(model, model_file_path)
        if mode == 0:
            if epoch % check_every == 0:
                if os.path.exists(tmp_result_file_path):
                    os.remove(tmp_result_file_path)
                elements = []
                for idx, (_id, _data) in enumerate(
                        tqdm(q_data_loader, desc='testing...')):
                    with torch.no_grad():
                        _data = _data.to(device)
                        output = model(_data)
                        # split concatenated song|tag vectors back apart
                        songs_input, tags_input = torch.split(
                            _data, num_songs, dim=1)
                        songs_output, tags_output = torch.split(
                            output, num_songs, dim=1)
                        songs_ids = binary_songs2ids(
                            songs_input, songs_output, id2prep_song_dict)
                        tag_ids = binary_tags2ids(tags_input, tags_output,
                                                  id2tag_dict)
                        _id = list(map(int, _id))
                        for i in range(len(_id)):
                            element = {
                                'id': _id[i],
                                'songs': list(songs_ids[i]),
                                'tags': tag_ids[i]
                            }
                            elements.append(element)
                # score this epoch's predictions, then clean up the temp file
                write_json(elements, tmp_result_file_path)
                evaluator.evaluate(answer_file_path, tmp_result_file_path)
                os.remove(tmp_result_file_path)
# Calculate the confidence by multiplying it by our alpha value. tag_conf = (train_tags_A_T * alpha_val).astype('double') # Model 학습 tag_recommend_model.fit(tag_conf) answers = [] for nid in tqdm(test.index): recommendations_songs_tuples = song_recommend_model.recommend( int(nid), train_songs_A, 100) recommendations_tags_tuples = tag_recommend_model.recommend( int(nid), train_tags_A, 10) # extract only songs/tags from (songs/tags, score) tuple recommendations_songs = [t[0] for t in recommendations_songs_tuples] recommendations_tags = [t[0] for t in recommendations_tags_tuples] ans_songs = [song_sid_id[song] for song in recommendations_songs] ans_tags = [tag_tid_id[tag] for tag in recommendations_tags] answers.append({ "id": plylst_nid_id[nid], "songs": ans_songs, "tags": ans_tags }) # write_json write_json(answers, "results/results.json")
def merge_trains(self, train_fnames, output_fname):
    """Concatenate several train JSON files into one and write it out."""
    merged = []
    for fname in train_fnames:
        merged.extend(load_json(fname))
    write_json(merged, output_fname)
### 3.1 hyperparameters: k, rho, weights ### 3.2 parameters: sim_songs, sim_tags, sim_normalize song_k = 500 tag_k = 90 song_k_step = 50 tag_k_step = 10 rho = 0.4 weight_val_songs = 0.9 weight_pred_songs = 1 - weight_val_songs weight_val_tags = 0.7 weight_pred_tags = 1 - weight_val_tags sim_songs = 'idf' sim_tags = 'idf' sim_normalize = True ### 3.3 run KNN.predict() : returns pandas.DataFrame pred = KNN(song_k=song_k, tag_k=tag_k, rho=rho, \ song_k_step=song_k_step, tag_k_step=tag_k_step, \ weight_val_songs=weight_val_songs, weight_pred_songs=weight_pred_songs, \ weight_val_tags=weight_val_tags, weight_pred_tags=weight_pred_tags, \ sim_songs=sim_songs, sim_tags=sim_tags, sim_normalize=sim_normalize, \ train=train, val=val, song_meta=song_meta, pred=pred).predict() ### 4. post-processing ### 4.1 convert "tag_id" to "tag" pred = convert_id_to_tag(pred, id_to_tag) pred = generate_answers(load_json(train_path), to_list(pred)) write_json(pred, 'results.json')
def Recommender(train, questions, n_msp, n_mtp, mode, sim_measure, song_meta,
                freq_song, save=False):
    # Final recommendation list
    rec_list = []

    ## Step 1: preprocessing
    # 1) most_popular fallbacks for when recommendations are missing or short
    _, song_mp = most_popular(train, "songs", 200)
    _, tag_mp = most_popular(train, "tags", 20)
    # 2) dictionaries for fast lookups
    song_plylst_dic, song_tag_dic, plylst_song_dic, plylst_tag_dic, tag_plylst_dic, tag_song_dic, song_issue_dic, song_artist_dic = DicGenerator(
        train, song_meta)
    # 3) load precomputed playlist-similarity tables
    #    (translation of the note below: sim_scores = question-vs-train
    #    similarity via autoencoder; gnr_scores = with genre info added;
    #    title_scores = Word2vec-based title similarity)
    '''
    sim_scores: 입력으로 들어온 questions과 train간 유사도 (Autoencoder 기반)
    gnr_scores: 입력으로 들어온 questions과 train간 유사도 (genre 정보 추가)
    title_scores: 입력으로 들어온 questions과 train간 유사도 (Word2vec 기반)
    '''
    sim_scores = np.load(f'scores/{mode}_scores_bias_{sim_measure}.npy',
                         allow_pickle=True).item()
    gnr_scores = np.load(f'scores/{mode}_scores_bias_{sim_measure}_gnr.npy',
                         allow_pickle=True).item()
    title_scores = np.load(
        f'scores/{mode}_scores_title_{sim_measure}_24000.npy',
        allow_pickle=True).item()

    ## Step 2: helper definitions
    # 1) top-k most frequent keys of a Counter
    def most_similar(cnt, topk):
        cnt_topk = cnt.most_common(topk)
        return [k for k, v in cnt_topk]

    # 2) top-k similar playlists (and their scores) from a precomputed table
    def most_similar_emb(q_id, topk, title=False, genre=False):
        # based on title_scores
        if title:
            plylsts = [t[0] for t in title_scores[q_id][:topk]]
            scores = [t[1] for t in title_scores[q_id][:topk]]
        # based on gnr_scores
        elif genre:
            plylsts = [t[0] for t in gnr_scores[q_id][:topk]]
            scores = [t[1] for t in gnr_scores[q_id][:topk]]
        # based on sim_scores
        else:
            plylsts = [t[0] for t in sim_scores[q_id][:topk]]
            scores = [t[1] for t in sim_scores[q_id][:topk]]
        return plylsts, scores

    # 3) song -> set of playlists, restricted to the given playlists
    def get_new_song_plylst_dict(plylst_ms):
        new_song_plylst_dict = defaultdict(set)
        for plylst in plylst_ms:
            for _song in plylst_song_dic[plylst]:
                new_song_plylst_dict[_song].add(plylst)
        return new_song_plylst_dict

    ## Step 3: recommend for each incoming question playlist
    for q in tqdm(questions):
        # 1) question playlist info: contained songs/tags
        q_songs = q['songs']
        q_tags = q['tags']
        # co-occurrence counters for songs/tags/playlists seen alongside them
        song_plylst_C = Counter()
        song_tag_C = Counter()
        tag_plylst_C = Counter()
        tag_song_C = Counter()
        # flags: no songs+tags at all, or only a few songs
        no_songs_tags, few_songs_tags = False, False
        if len(q_songs) == 0 and len(q_tags) == 0:
            no_songs_tags = True
        elif len(q_songs) <= 3:
            few_songs_tags = True
        # 2) frequency counts for the frequency-based recommendation
        # over the contained songs
        for q_s in q_songs:
            song_plylst_C.update(song_plylst_dic[q_s])
            song_tag_C.update(song_tag_dic[q_s])
        # over the contained tags
        for q_t in q_tags:
            tag_plylst_C.update(tag_plylst_dic[q_t])
            tag_song_C.update(tag_song_dic[q_t])
        # normalize counts by playlist length (ratio instead of raw count)
        for i, j in list(song_plylst_C.items()):
            if len(plylst_song_dic[i]) > 0:
                song_plylst_C[i] = (j / len(plylst_song_dic[i]))
        # 3) similarity-based scores
        plylst_song_scores = defaultdict(lambda: 0)
        plylst_tag_scores = defaultdict(lambda: 0)
        # Case 1: neither songs nor tags present
        if no_songs_tags:
            # plylst_ms / plylst_mt: n_msp / n_mtp playlists by title_scores
            plylst_ms, song_scores = most_similar_emb(q['id'], n_msp,
                                                      title=True)
            plylst_mt, tag_scores = most_similar_emb(q['id'], n_mtp,
                                                     title=True)
            plylst_add, add_scores = most_similar_emb(q['id'], n_mtp)
        # Case 2: songs/tags are scarce
        elif few_songs_tags:
            # plylst_ms by sim_scores (n_msp), plylst_mt by title_scores (n_mtp)
            plylst_ms, song_scores = most_similar_emb(q['id'], n_msp)
            plylst_mt, tag_scores = most_similar_emb(q['id'], n_mtp,
                                                     title=True)
            plylst_add, add_scores = most_similar_emb(q['id'], n_mtp,
                                                      genre=True)
        # Case 3: songs and tags are sufficient
        else:
            # plylst_ms / plylst_mt: n_msp / n_mtp playlists by sim_scores
            plylst_ms, song_scores = most_similar_emb(q['id'], n_msp)
            plylst_mt, tag_scores = most_similar_emb(q['id'], n_mtp,
                                                     genre=True)
            plylst_add, add_scores = most_similar_emb(q['id'], n_mtp,
                                                      title=True)
        new_song_plylst_dict = get_new_song_plylst_dict(plylst_ms)
        # 3-1. accumulate plylst_song_scores
        for idx, ms_p in enumerate(plylst_ms):
            for song in plylst_song_dic[ms_p]:
                song_score = 0
                for q_s in q_songs:
                    try:
                        # Jaccard-like overlap of containing-playlist sets
                        song_score += len(new_song_plylst_dict[q_s]
                                          & new_song_plylst_dict[song]) / len(
                                              new_song_plylst_dict[q_s])
                    except:
                        # NOTE(review): bare except — presumably guards
                        # division by zero when q_s has no playlists; confirm
                        pass
                # frequent songs get a 4x boost
                if song in freq_song:
                    plylst_song_scores[song] += song_plylst_C[
                        ms_p] * song_score * song_scores[idx] * (n_msp -
                                                                 idx) * 4
                else:
                    plylst_song_scores[song] += song_plylst_C[
                        ms_p] * song_score * song_scores[idx] * (n_msp - idx)
            for tag in plylst_tag_dic[ms_p]:
                plylst_tag_scores[tag] += tag_scores[idx] * (n_msp - idx)
        # 3-2. accumulate plylst_tag_scores
        for idx, mt_p in enumerate(plylst_mt):
            for tag in plylst_tag_dic[mt_p]:
                plylst_tag_scores[tag] += tag_scores[idx] * (n_mtp - idx)
            for song in plylst_song_dic[mt_p]:
                plylst_song_scores[song] += tag_scores[idx]
        # 3-3. correction pass over plylst_{song/tag}_scores
        for idx, mt_p in enumerate(plylst_add):
            for tag in plylst_tag_dic[mt_p]:
                plylst_tag_scores[tag] += add_scores[idx] * (n_mtp - idx)
        # 4) when songs and tags were both absent, fill them in by prediction
        if no_songs_tags:
            # fill q_songs (predicted from title-similar playlists)
            pre_songs = sorted(plylst_song_scores.items(),
                               key=lambda x: x[1], reverse=True)
            pre_songs = [scores[0] for scores in pre_songs][:200]
            pre_songs = pre_songs + remove_seen(pre_songs, song_mp)
            q_songs = pre_songs[:100]
            # fill q_tags (predicted from title-similar playlists)
            pre_tags = sorted(plylst_tag_scores.items(), key=lambda x: x[1],
                              reverse=True)
            pre_tags = [scores[0] for scores in pre_tags][:20]
            pre_tags = pre_tags + remove_seen(pre_tags, tag_mp)
            q_tags = pre_tags[:10]
        # 5) produce recommendations for this question playlist
        ## song recommendation
        # when the playlist has songs
        lt_song_art = []
        if len(q_songs) > 0:
            plylst_song_scores = sorted(plylst_song_scores.items(),
                                        key=lambda x: x[1], reverse=True)
            # collect artists of the playlist's songs, most frequent first
            lt_artist = []
            for w_song in q_songs:
                lt_artist.extend(song_artist_dic[w_song])
            counter_artist = Counter(lt_artist)
            counter_artist = sorted(counter_artist.items(),
                                    key=lambda x: x[1], reverse=True)
            if few_songs_tags:
                artist = [art[0] for art in counter_artist]
            else:
                artist = [x[0] for x in counter_artist if x[1] > 1]
            # pick candidates by matching artists to boost artist diversity
            cand_ms = [scores[0] for scores in plylst_song_scores
                       ][(100 - len(artist)):1000]
            for cand in cand_ms:
                if artist == []:
                    break
                if cand in q_songs:
                    break
                for art in song_artist_dic[cand]:
                    if art in artist:
                        lt_song_art.append(cand)
                        artist.remove(art)
                        break
            song_ms = [scores[0] for scores in plylst_song_scores][:200]
        # no songs but tags present
        else:
            song_ms = most_similar(tag_song_C, 200)
        ## tag recommendation
        # when the playlist has tags
        if len(q_tags) > 0:
            plylst_tag_scores = sorted(plylst_tag_scores.items(),
                                       key=lambda x: x[1], reverse=True)
            tag_ms = [scores[0] for scores in plylst_tag_scores][:20]
        # no tags but songs present (same ranking either way)
        else:
            plylst_tag_scores = sorted(plylst_tag_scores.items(),
                                       key=lambda x: x[1], reverse=True)
            tag_ms = [scores[0] for scores in plylst_tag_scores][:20]
        ## drop songs issued after the playlist's update date
        if q['updt_date']:
            q_updt_date = q['updt_date'][:4] + q['updt_date'][5:7] + q[
                'updt_date'][8:10]
            song_ms = [x for x in song_ms if song_issue_dic[x] < q_updt_date]
        ## dedupe against seen items; pad with most_popular when short
        song_candidate = song_ms + remove_seen(song_ms, song_mp)
        tag_candidate = tag_ms + remove_seen(tag_ms, tag_mp)
        song_remove = q_songs
        tag_remove = q_tags
        song_candidate = song_candidate[:100] if no_songs_tags else remove_seen(
            song_remove, song_candidate)[:100]
        # splice artist-matched songs into the tail of the candidate list
        if len(lt_song_art) > 0:
            lt_song_art = [x for x in lt_song_art if x not in song_candidate]
            song_candidate[(100 - len(lt_song_art)):100] = lt_song_art
        rec_list.append({
            "id": q["id"],
            "songs": song_candidate,
            "tags": tag_candidate[:10] if no_songs_tags else remove_seen(
                tag_remove, tag_candidate)[:10]
        })
    # 6) optionally save a timestamped results file
    if save == True:
        write_json(
            rec_list, 'results/results_' +
            dt.datetime.now().strftime("%y%m%d-%H%M%S") + '_' + mode +
            '.json')
    return rec_list
def run(self, fname):
    """K-fold split: write train/val1/val2 plus masked q/a files per fold."""
    np.random.seed(self.SEED)  # reproducible fold assignment
    train_org = pd.read_json(fname)
    split = self._generateIdx(train_org)
    if self.DATA_FOLDER not in os.listdir("."):
        os.mkdir(self.DATA_FOLDER)
    for i, (tr_idx, te_idx) in enumerate(split):
        fold_name = "fold" + str(i)
        fold_path = self.DATA_FOLDER + "/" + fold_name
        if fold_name not in os.listdir(self.DATA_FOLDER):
            os.mkdir(fold_path)
        print("Splitting data...")
        train, val1, val2 = self._split_data(train_org, tr_idx, te_idx)
        print(f"fold {i} Original train...")
        write_json(train, fold_path + "/train.json")
        print(f"fold {i} Original val1...")
        write_json(val1, fold_path + "/val1.json")
        print(f"fold {i} Original val2...")
        write_json(val2, fold_path + "/val2.json")
        print(f"fold {i} Masked val1...")
        # mask the first validation fold into question/answer halves
        val1_q, val1_a = self._mask_data(val1)
        write_json(val1_q, fold_path + "/val1_q.json")
        write_json(val1_a, fold_path + "/val1_a.json")
        print(f"fold {i} Masked val2...")
        # mask the second validation fold the same way
        val2_q, val2_a = self._mask_data(val2)
        write_json(val2_q, fold_path + "/val2_q.json")
        write_json(val2_a, fold_path + "/val2_a.json")
def infer(MODE="Test"):
    """Generate song/tag recommendations via k-NN CF rescored by playlist2vec.

    Args:
        MODE: one of "Valid" (local split under arena_data/, evaluation
            enabled), "Dev" (official train vs. public val set) or
            "Test" (official train vs. final test set).

    Side effects:
        Reads train/test/song-meta json files, title-derived tag
        predictions and two pretrained WordEmbeddingsKeyedVectors models
        from disk; writes the recommendation list with write_json; in
        "Valid" mode additionally runs CustomEvaluator on the result.
    """
    # Per-mode input/output paths; "eval" marks whether local answers exist.
    mode_opt = {
        "Valid": {
            "train_path": "arena_data/orig/train.json",
            "test_path": "arena_data/questions/val.json",
            "results_path": "cf2/val/results.json",
            "eval": True
        },
        "Dev": {
            "train_path": "res/train.json",
            "test_path": "res/val.json",
            "results_path": "cf2/dev/results.json",
            "eval": False
        },
        "Test": {
            "train_path": "res/train.json",
            "test_path": "res/test.json",
            "results_path": "cf2/test/results.json",
            "eval": False
        }
    }
    opt = mode_opt[MODE]
    train = pd.read_json(opt["train_path"])
    test = pd.read_json(opt["test_path"])
    # The split NOT being predicted is still loaded as extra CF training data:
    # dev == public val set, test_res == final test set.
    if MODE != "Dev":
        dev = pd.read_json("res/val.json")
    if MODE != "Test":
        test_res = pd.read_json("res/test.json")

    print("Preprocessing dates")
    # Playlist id -> its "updt_date" string, for date filtering of candidates.
    test_date = {}
    for i in tqdm(test.index):
        test_date[test.at[i, 'id']] = test.at[i, 'updt_date']
    # Song id -> issue date as "YYYYMMDD" string (stringified int).
    song_meta = pd.read_json("res/song_meta.json")
    song_date = {}
    for i in tqdm(song_meta.index):
        song_date[song_meta.at[i, "id"]] = str(song_meta.at[i, "issue_date"])
    del song_meta

    # A song seen in a playlist older than its recorded issue date must have
    # been available by that playlist's update date: clamp the issue date down.
    # updt_date "YYYY-MM-DD..." is re-packed to "YYYYMMDD" for comparison.
    song_update_date = []
    for i in train.index:
        updt_date = train.loc[i, 'updt_date'][:4] + train.loc[
            i, 'updt_date'][5:7] + train.loc[i, 'updt_date'][8:10]
        for t in train.loc[i, 'songs']:
            if song_date[t] > updt_date:
                song_date[t] = updt_date
                song_update_date.append(t)
    for i in test.index:
        updt_date = test.loc[i, 'updt_date'][:4] + test.loc[
            i, 'updt_date'][5:7] + test.loc[i, 'updt_date'][8:10]
        for t in test.loc[i, 'songs']:
            if song_date[t] > updt_date:
                song_date[t] = updt_date
                song_update_date.append(t)
    if MODE != "Dev":
        for i in dev.index:
            updt_date = dev.loc[i, 'updt_date'][:4] + dev.loc[
                i, 'updt_date'][5:7] + dev.loc[i, 'updt_date'][8:10]
            for t in dev.loc[i, 'songs']:
                if song_date[t] > updt_date:
                    song_date[t] = updt_date
                    song_update_date.append(t)
    if MODE != "Test":
        for i in test_res.index:
            updt_date = test_res.loc[i, 'updt_date'][:4] + test_res.loc[
                i, 'updt_date'][5:7] + test_res.loc[i, 'updt_date'][8:10]
            for t in test_res.loc[i, 'songs']:
                if song_date[t] > updt_date:
                    song_date[t] = updt_date
                    song_update_date.append(t)
    print("The number of processed songs :", len(set(song_update_date)))

    # Loading tags extracted from title (precomputed by another stage).
    pred_tag = load_json("arena_data/model/pred_tag.json")
    dic_pred_tag = {}
    for p_t in pred_tag:
        dic_pred_tag[p_t['id']] = p_t['predict_tag']

    # Keep the original tags in 'tags_org' and append predicted tags to 'tags'.
    # NOTE(review): later, 'tags_id' is built from 'tags_org' (originals only)
    # while 'sts_id' uses the augmented 'tags' — this asymmetry looks
    # deliberate (predicted tags help retrieval, not the tag targets).
    train['tags_org'] = train['tags']
    for i in train.index:
        train.at[i, 'tags'] = train.at[i, 'tags'] + dic_pred_tag[train.at[i, 'id']]
    test['tags_org'] = test['tags']
    for i in test.index:
        test.at[i, 'tags'] = test.at[i, 'tags'] + dic_pred_tag[test.at[i, 'id']]
    if MODE != "Dev":
        dev['tags_org'] = dev['tags']
        for i in dev.index:
            dev.at[i, 'tags'] = dev.at[i, 'tags'] + dic_pred_tag[dev.at[i, 'id']]
    if MODE != "Test":
        test_res['tags_org'] = test_res['tags']
        for i in test_res.index:
            test_res.at[i, 'tags'] = test_res.at[i, 'tags'] + dic_pred_tag[
                test_res.at[i, 'id']]

    # Calculating IDF: document frequency of every song/tag over all loaded
    # splits, then log10(N / df).
    inv_doc_freq = {}
    for d in train['songs'] + train['tags']:
        for i in d:
            if i in inv_doc_freq:
                inv_doc_freq[i] += 1
            else:
                inv_doc_freq[i] = 1
    for d in test['songs'] + test['tags']:
        for i in d:
            if i in inv_doc_freq:
                inv_doc_freq[i] += 1
            else:
                inv_doc_freq[i] = 1
    if MODE != "Dev":
        for d in dev['songs'] + dev['tags']:
            for i in d:
                if i in inv_doc_freq:
                    inv_doc_freq[i] += 1
                else:
                    inv_doc_freq[i] = 1
    if MODE != "Test":
        for d in test_res['songs'] + test_res['tags']:
            for i in d:
                if i in inv_doc_freq:
                    inv_doc_freq[i] += 1
                else:
                    inv_doc_freq[i] = 1
    for k in inv_doc_freq:
        # N = total number of playlists actually counted above for this MODE.
        if MODE == "Valid":
            inv_doc_freq[k] = math.log10(
                (len(train) + len(test) + len(dev) + len(test_res)) /
                inv_doc_freq[k])
        elif MODE == "Dev":
            inv_doc_freq[k] = math.log10(
                (len(train) + len(test) + len(test_res)) / inv_doc_freq[k])
        else:
            inv_doc_freq[k] = math.log10(
                (len(train) + len(test) + len(dev)) / inv_doc_freq[k])

    # Preprocessing data for CF matrix: rows [0, n_train) are the training
    # side, rows [n_train, n_train + n_test) are the queries.
    if MODE == "Valid":
        n_train = len(train) + len(dev) + len(test_res)
    elif MODE == "Dev":
        n_train = len(train) + len(test_res)
    else:
        n_train = len(train) + len(dev)
    n_test = len(test)
    # train + test concatenated in that row order (test always last).
    if MODE == "Valid":
        plylst = pd.concat([train, dev, test_res, test], ignore_index=True)
    elif MODE == "Dev":
        plylst = pd.concat([train, test_res, test], ignore_index=True)
    else:
        plylst = pd.concat([train, dev, test], ignore_index=True)
    # playlist id: dense row number used as matrix index.
    plylst["nid"] = range(n_train + n_test)
    # nid -> id (original playlist id).
    plylst_nid_id = dict(zip(plylst["nid"], plylst["id"]))

    # Dense integer ids for tags (tid), songs (sid) and the combined
    # song+tag vocabulary (st tid), with inverse maps for decoding.
    plylst_tag = plylst['tags']
    tag_counter = Counter([tg for tgs in plylst_tag for tg in tgs])
    tag_dict = {x: tag_counter[x] for x in tag_counter}
    id_type = dict()  # NOTE(review): written but never read in this function.
    tag_id_tid = dict()
    tag_tid_id = dict()
    for i, t in enumerate(tag_dict):
        tag_id_tid[t] = i
        tag_tid_id[i] = t
        id_type[t] = 1
    n_tags = len(tag_dict)
    plylst_song = plylst['songs']
    song_counter = Counter([sg for sgs in plylst_song for sg in sgs])
    song_dict = {x: song_counter[x] for x in song_counter}
    song_id_sid = dict()
    song_sid_id = dict()
    for i, t in enumerate(song_dict):
        song_id_sid[t] = i
        song_sid_id[i] = t
        id_type[t] = 1
    n_songs = len(song_dict)
    plylst_st = plylst['songs'] + plylst['tags']
    st_counter = Counter([st for sts in plylst_st for st in sts])
    st_dict = {x: st_counter[x] for x in st_counter}
    st_id_tid = dict()
    st_tid_id = dict()
    for i, t in enumerate(st_dict):
        st_id_tid[t] = i
        st_tid_id[i] = t
    n_sts = len(st_dict)
    print("Tags : ", n_tags, ", Songs : ", n_songs, ", Total : ", n_sts)

    # Encode each playlist's items into the dense-id spaces; unknown ids drop.
    plylst['songs_id'] = plylst['songs'].map(
        lambda x: [song_id_sid.get(s) for s in x if song_id_sid.get(s) != None])
    # Tag targets come from the ORIGINAL tags only (see note above).
    plylst['tags_id'] = plylst['tags_org'].map(
        lambda x: [tag_id_tid.get(t) for t in x if tag_id_tid.get(t) != None])
    plylst['sts_id'] = (plylst['songs'] + plylst['tags']).map(
        lambda x: [st_id_tid.get(st) for st in x if st_id_tid.get(st) != None])

    plylst_use = plylst[['nid', 'updt_date', 'songs_id', 'tags_id', 'sts_id']]
    # NOTE(review): plylst_use is a slice of plylst; these .loc writes may
    # emit pandas SettingWithCopyWarning — behavior otherwise as written.
    plylst_use.loc[:, 'num_songs'] = plylst_use['songs_id'].map(len)
    plylst_use.loc[:, 'num_tags'] = plylst_use['tags_id'].map(len)
    plylst_use.loc[:, 'num_sts'] = plylst_use['sts_id'].map(len)
    plylst_use = plylst_use.set_index('nid')

    # NOTE(review): iloc[:, :] selects ALL rows, so the "train" matrices also
    # contain the test playlists, and n_train is then redefined to the full
    # row count. This looks intentional (self-inclusion CF) — confirm.
    plylst_train = plylst_use.iloc[:, :]
    plylst_test = plylst_use.iloc[n_train:, :]
    n_train = len(plylst_train)
    np.random.seed(33)
    test_set = plylst_test
    print("The number of test samples : ", len(test_set))

    # Building CF matrices.
    # NOTE(review): the avg_len_* aggregates below are computed but unused.
    avg_len_songs = 0
    for songs in plylst_train['songs_id']:
        avg_len_songs += len(songs)
    avg_len_songs /= len(plylst_train['songs_id'])
    avg_len_tags = 0
    for tags in plylst_train['tags_id']:
        avg_len_tags += len(tags)
    avg_len_tags /= len(plylst_train['tags_id'])
    avg_len_sts = 0
    for sts in plylst_train['sts_id']:
        avg_len_sts += len(sts)
    avg_len_sts /= len(plylst_train['sts_id'])

    # Binary playlist x song matrix.
    row = np.repeat(range(n_train), plylst_train['num_songs'])
    col = [song for songs in plylst_train['songs_id'] for song in songs]
    dat = [1 for songs in plylst_train['songs_id'] for song in songs]
    train_songs_A = spr.csr_matrix((dat, (row, col)), shape=(n_train, n_songs))
    # Binary playlist x tag matrix.
    row = np.repeat(range(n_train), plylst_train['num_tags'])
    col = [tag for tags in plylst_train['tags_id'] for tag in tags]
    dat = [1 for tags in plylst_train['tags_id'] for tag in tags]
    train_tags_A = spr.csr_matrix((dat, (row, col)), shape=(n_train, n_tags))
    # IDF-weighted playlist x (song+tag) matrix; +50 damps short playlists.
    row = np.repeat(range(n_train), plylst_train['num_sts'])
    col = [st for sts in plylst_train['sts_id'] for st in sts]
    dat = [
        inv_doc_freq[st_tid_id[st]] / (len(sts) + 50)
        for sts in plylst_train['sts_id'] for st in sts
    ]
    train_sts_A = spr.csr_matrix((dat, (row, col)), shape=(n_train, n_sts))
    train_songs_A_T = train_songs_A.T.tocsr()
    train_tags_A_T = train_tags_A.T.tocsr()

    # Building map playlist id (as str) to songs or tags for playlist2vec.
    if MODE == "Valid":
        p2v_targets = [train, test, dev, test_res]
    elif MODE == "Dev":
        p2v_targets = [train, test, test_res]
    else:
        p2v_targets = [train, test, dev]
    song_dic = {}
    tag_dic = {}
    for i, q in tqdm(pd.concat(p2v_targets).iterrows()):
        song_dic[str(q['id'])] = q['songs']
        tag_dic[str(q['id'])] = q['tags_org']

    # Loading playlist embedding vectors (pretrained gensim KeyedVectors).
    p2v_song = WordEmbeddingsKeyedVectors.load(
        "arena_data/model/p2v_song.model")
    p2v_tag = WordEmbeddingsKeyedVectors.load("arena_data/model/p2v_tag.model")

    print("Predicting")
    res = []
    filtered_lot_song = []  # diagnostic: sizes of heavily-filtered song pools
    filtered_lot_tag = []   # diagnostic: sizes of short tag candidate pools
    for pid in tqdm(test_set.index):
        songs_already = test_set.loc[pid, "songs_id"]
        tags_already = test_set.loc[pid, "tags_id"]

        # Song prediction - 1. Query vector to predict songs:
        # IDF-weighted indicator over the playlist's song+tag ids.
        p = np.zeros((n_sts, 1))
        if len(test_set.loc[pid, 'sts_id']) > 0:
            for st in test_set.loc[pid, 'sts_id']:
                if st_tid_id[st] in inv_doc_freq:
                    p[st] = inv_doc_freq[st_tid_id[st]] / (
                        len(test_set.loc[pid, 'sts_id']) + 50)

        # Song prediction - 2. K-nn playlists: keep the 250 most similar
        # rows; squaring sharpens the similarity weighting.
        val = train_sts_A.dot(p).reshape(-1)
        val_idx = val.reshape(-1).argsort()[-250:][::-1]
        val_knn = np.zeros((n_train))
        val_knn[val_idx] = val[val_idx]
        val = val_knn**2

        # Song prediction - 3. Candidates: similarity-weighted song counts.
        cand_song = train_songs_A_T.dot(val)

        # Song prediction - 4. Rescoring using playlist2vec: boost songs that
        # appear in the 50 nearest playlists in embedding space.
        dic_song_score = {}
        if str(plylst_nid_id[pid]) in p2v_song.wv.vocab:
            most_id = [
                x for x in p2v_song.most_similar(str(plylst_nid_id[pid]),
                                                 topn=50)
            ]
            for ID in most_id:
                for s in song_dic[ID[0]]:
                    if s in dic_song_score:
                        dic_song_score[s] += ID[1]
                    else:
                        dic_song_score[s] = ID[1]
        for k in dic_song_score:
            cand_song[song_id_sid[k]] *= dic_song_score[k]**0.2
        cand_song_idx = cand_song.reshape(-1).argsort()[-5000:][::-1]

        # Song prediction - 5. Filtering by score and date: drop zero-score
        # songs and songs issued after the query playlist's update date.
        cand_song_idx_filtered = []
        for cand in cand_song_idx:
            if cand_song[cand] > 0 and song_date[song_sid_id[
                    cand]] <= test_date[plylst_nid_id[pid]][:4] + test_date[
                        plylst_nid_id[pid]][5:7] + test_date[
                            plylst_nid_id[pid]][8:10]:
                cand_song_idx_filtered.append(cand)
        if len(cand_song_idx_filtered) < 400:
            filtered_lot_song.append(len(cand_song_idx_filtered))
        cand_song_idx = np.array(cand_song_idx_filtered)

        # Song prediction - 6. Rescoring using heuristics: pull candidates
        # toward the query's typical popularity (IDF) and issue-date profile.
        dict_score = {}
        for idx in cand_song_idx:
            dict_score[idx] = cand_song[idx]
        mean_doc_freq = 0
        std_doc_freq = 0
        list_doc_freq = []
        mean_song_date = 0
        list_song_date = []
        if len(test_set.loc[pid, "songs_id"]) > 0:
            for t in test_set.loc[pid, "songs_id"]:
                if song_sid_id[t] in inv_doc_freq:
                    list_doc_freq.append(inv_doc_freq[song_sid_id[t]])
                song_d = int(song_date[song_sid_id[t]])
                # Only trust plausible dates (1900-01-01 .. 2020-12-31-ish).
                if song_d > 19000000 and song_d < 20210000:
                    list_song_date.append(song_d)
            if len(list_doc_freq) > 0:
                mean_doc_freq = np.mean(list_doc_freq)
                std_doc_freq = np.std(list_doc_freq)
            if len(list_song_date) > 0:
                mean_song_date = np.mean(list_song_date)

        # Song prediction - 6-1. Rescoring by IDF comparison: blend the raw
        # score with a proximity-to-mean-IDF term; the blend weight grows
        # with the number of seed songs (1/sqrt(n) on the raw part).
        if len(list_doc_freq) > 0:
            for c in dict_score:
                if song_sid_id[c] in inv_doc_freq:
                    dict_score[c] = 1 / (
                        len(list_doc_freq)**0.5) * dict_score[c] + (
                            1 - 1 /
                            (len(list_doc_freq)**0.5)) * dict_score[c] * 2 / (
                                np.abs(inv_doc_freq[song_sid_id[c]] -
                                       mean_doc_freq) / (std_doc_freq + 1) + 2)
                else:
                    dict_score[c] = 1 / (len(list_doc_freq)**
                                         0.5) * dict_score[c]

        # Song prediction - 6-2. Rescoring by Date comparison: same blend,
        # penalizing songs far from the seeds' mean issue date.
        if len(list_song_date) > 0:
            for c in dict_score:
                song_d = int(song_date[song_sid_id[c]])
                if song_d > 19000000 and song_d < 20210000:
                    dict_score[c] = 1 / (
                        len(list_song_date)**0.5) * dict_score[c] + (
                            1 - 1 /
                            (len(list_song_date)**0.5)) * dict_score[c] / (
                                np.abs(song_d - mean_song_date) / 500000 + 1)
                else:
                    dict_score[c] = 1 / (len(list_song_date)**
                                         0.5) * dict_score[c]

        # Rank by final score, drop songs the playlist already has, keep 300.
        score_sorted = sorted(dict_score.items(),
                              key=lambda x: x[1],
                              reverse=True)
        cand_song_idx = []
        for t in score_sorted:
            cand_song_idx.append(t[0])
        cand_song_idx = np.array(cand_song_idx)
        cand_song_idx = cand_song_idx[np.isin(cand_song_idx, songs_already) ==
                                      False][:300]
        rec_song_idx = [song_sid_id[i] for i in cand_song_idx]

        # Tag prediction - 1. Query vector to predict tags: plain indicator
        # (no IDF weighting, unlike the song query).
        p = np.zeros((n_sts, 1))
        p[test_set.loc[pid, 'sts_id']] = 1

        # Tag prediction - 2. K-nn playlists (same scheme as songs).
        val = train_sts_A.dot(p).reshape(-1)
        val_idx = val.reshape(-1).argsort()[-250:][::-1]
        val_knn = np.zeros((n_train))
        val_knn[val_idx] = val[val_idx]
        val = val_knn**2

        # Tag prediction - 3. Candidates.
        cand_tag = train_tags_A_T.dot(val)

        # Tag prediction - 4. Rescoring using playlist2vec.
        dic_tag_score = {}
        if str(plylst_nid_id[pid]) in p2v_tag.wv.vocab:
            most_id = [
                x for x in p2v_tag.most_similar(str(plylst_nid_id[pid]),
                                                topn=50)
            ]
            for ID in most_id:
                for t in tag_dic[ID[0]]:
                    if t in dic_tag_score:
                        dic_tag_score[t] += ID[1]
                    else:
                        dic_tag_score[t] = ID[1]
        for k in dic_tag_score:
            cand_tag[tag_id_tid[k]] *= dic_tag_score[k]**0.5
        cand_tag_idx = cand_tag.reshape(-1).argsort()[-35:][::-1]

        # Tag prediction - 5. Filtering by score, then drop known tags,
        # keep at most 30.
        cand_tag_idx_filtered = []
        for cand in cand_tag_idx:
            if cand_tag[cand] > 0:
                cand_tag_idx_filtered.append(cand)
        if len(cand_tag_idx_filtered) != 35:
            filtered_lot_tag.append(len(cand_tag_idx_filtered))
        cand_tag_idx = np.array(cand_tag_idx_filtered)
        cand_tag_idx = cand_tag_idx[np.isin(cand_tag_idx, tags_already) ==
                                    False][:30]
        rec_tag_idx = [tag_tid_id[i] for i in cand_tag_idx]

        res.append({
            "id": plylst_nid_id[pid],
            "songs": rec_song_idx,
            "tags": rec_tag_idx
        })
    print(len(filtered_lot_song), filtered_lot_song)
    print(len(filtered_lot_tag), filtered_lot_tag)
    # NOTE(review): write_json appears to prefix "arena_data/" (the evaluator
    # below reads "arena_data/results/..." for the same results_path) —
    # confirm against write_json's implementation.
    write_json(res, "results/" + opt["results_path"])
    if opt["eval"]:
        evaluator = CustomEvaluator()
        evaluator.evaluate("arena_data/answers/val.json",
                           "arena_data/results/" + opt["results_path"])