def make_input4tokenizer(train_file_path, genre_file_path, result_file_path,
                         valid_file_path=None, test_file_path=None):
    """Build a plain-text corpus for tokenizer training and write it to disk.

    One line per playlist ("title tags yyyy mm") plus one line per
    genre/sub-genre pair, written to *result_file_path*.

    Returns the list of sentences on success, ``False`` on any error
    (matching the original best-effort contract).
    """

    def _wv_genre(genre):
        # genre is an iterable of (code, name) pairs; codes ending in 00
        # (code_num % 100 == 0) open a new top-level genre bucket, the rest
        # are sub-genres of the most recent bucket.
        genre_dict = dict()
        cur_genre = None
        for code, value in genre:
            code_num = int(code[2:])
            if not code_num % 100:
                cur_genre = value
                genre_dict[cur_genre] = []
            else:
                # '/'-separated names become space-separated words.
                value = ' '.join(value.split('/'))
                genre_dict[cur_genre].append(value)

        genre_sentences = []
        for key, sub_list in genre_dict.items():
            key = ' '.join(key.split('/'))
            if not sub_list:
                continue
            for sub in sub_list:
                genre_sentences.append(key + ' ' + sub)
        return genre_sentences

    try:
        plylsts = load_json(train_file_path)
        if valid_file_path is not None:
            plylsts += load_json(valid_file_path)
        if test_file_path is not None:
            plylsts += load_json(test_file_path)

        genre_all = load_json(genre_file_path)
        genre_all_lists = []
        for code, gnr in genre_all.items():
            if gnr != '세부장르전체':
                genre_all_lists.append([code, gnr])
        genre_all_lists = np.asarray(genre_all_lists)

        sentences = []
        for plylst in plylsts:
            tiS = plylst['plylst_title']
            taS = ' '.join(plylst['tags'])
            # 'yyyy-mm' prefix of updt_date -> 'yyyy mm'
            upS = ' '.join(plylst['updt_date'][:7].split('-'))
            sentences.append(' '.join([tiS, taS, upS]))

        sentences = sentences + _wv_genre(genre_all_lists)

        with open(result_file_path, 'w', encoding='utf8') as f:
            for sentence in sentences:
                f.write(sentence + '\n')
    except Exception:
        # BUGFIX: the original called e.with_traceback() with no argument,
        # which itself raises TypeError; print the real traceback instead.
        import traceback
        traceback.print_exc()
        return False

    return sentences
def run(self, train_fname, question_fname):
    """Load the train and question playlists, generate answers, write results."""
    print("Loading train file...")
    train_set = load_json(train_fname)

    print("Loading question file...")
    question_set = load_json(question_fname)

    print("Writing answers...")
    write_json(self._generate_answers(train_set, question_set),
               "results/results.json")
def run(tag_to_id_fname, id_to_tag_fname, train_fname, test_fname):
    """Encode playlist tags as ids and train an autoencoder over song+tag vectors.

    Loads tag<->id maps, replaces each playlist's tag strings with tag ids,
    builds one-hot-ish multi-hot vectors (707989 songs + 30653 tags) per
    playlist, trains `AutoEncoder` for 20 epochs and saves it.

    NOTE(review): the dimensions 707989 and 30653 are hard-coded — presumably
    the dataset's song and tag vocabulary sizes; confirm against the data.
    """
    print("Loading tag_to_id...")
    tag_to_id = load_json(tag_to_id_fname)
    print("Loading id_to_tag...")
    id_to_tag = load_json(id_to_tag_fname)

    print("Loading train file...")
    train_data = load_json(train_fname)
    # In-place: tag strings -> integer tag ids.
    for ply in train_data:
        ply['tags'] = [tag_to_id[tag] for tag in ply['tags']]
    print("Loading test file...")
    test_data = load_json(test_fname)
    for ply in test_data:
        ply['tags'] = [tag_to_id[tag] for tag in ply['tags']]
    # print("Writing answers...")
    # answers = self._generate_answers(song_meta_json, train_data, questions)
    # write_json(answers, "results/results.json")

    print("Make Training dataset...")

    # Each sample is a concatenated multi-hot vector: songs part then tags part.
    def train_generator():
        for x in train_data:
            songs = np.zeros(707989)
            tags = np.zeros(30653)
            songs[x['songs']] = 1
            tags[x['tags']] = 1
            yield np.concatenate([songs, tags])

    # Identical encoding for the test split.
    def test_generator():
        for x in test_data:
            songs = np.zeros(707989)
            tags = np.zeros(30653)
            songs[x['songs']] = 1
            tags[x['tags']] = 1
            yield np.concatenate([songs, tags])

    training_dataset = tf.data.Dataset.from_generator(
        generator=train_generator,
        output_types=tf.float32,
        output_shapes=tf.TensorShape([707989 + 30653])).batch(256)
    test_dataset = tf.data.Dataset.from_generator(
        generator=test_generator,
        output_types=tf.float32,
        output_shapes=tf.TensorShape([707989 + 30653])).batch(256)

    model = AutoEncoder(intermediate_dim=128, original_dim=707989 + 30653)
    opt = tf.keras.optimizers.Adam(learning_rate=1e-2)

    print("Train Loop...")
    train_loop(model, opt, loss, training_dataset, 20)

    print("Predict...")
    # preds = model(test_dataset)
    # pred_songs = preds[:, :707989]
    # NOTE(review): `preds` is undefined here — its assignment above is
    # commented out, so the next line raises NameError at runtime. Also,
    # iterating `preds[:, 707989:]` yields row tensors, not integer indices,
    # so `id_to_tag[idx]` looks wrong even once `preds` exists. This predict
    # path appears unfinished; confirm intent before enabling it.
    pred_tags = [id_to_tag[idx] for idx in preds[:, 707989:]]
    # print(pred_songs)
    print(pred_tags)

    model.save('saved_model')
def run(self, song_meta_fname, train_fname, question_fname):
    """Load song metadata plus train/question playlists and write the answers."""
    print("Loading song meta...")
    meta = load_json(song_meta_fname)

    print("Loading train file...")
    train_set = load_json(train_fname)

    print("Loading question file...")
    question_set = load_json(question_fname)

    print("Writing answers...")
    write_json(self._generate_answers(meta, train_set, question_set),
               "results/results.json")
def run(self, song_meta_fname, train_fname, question_fname, jobs=1,
        train_ans_fname=None):
    """Generate answers in parallel with one process per question chunk.

    Parameters
    ----------
    song_meta_fname, train_fname, question_fname : str
        Paths to the song-meta / train / question json files.
    jobs : int
        Number of worker processes (also stored in the module-global
        NUM_CORE, which other code in this module reads).
    train_ans_fname : str, optional
        Optional answer file used for scoring during generation.
    """
    global NUM_CORE
    NUM_CORE = jobs

    print("Loading song meta...")
    song_meta_json = load_json(song_meta_fname)
    print("Loading train file...")
    train_data = load_json(train_fname)
    print(len(train_data))
    print("Loading question file...")
    questions = load_json(question_fname)

    # BUGFIX: this message duplicated "Loading question file..." although it
    # announces the (optional) train-answer file.
    print("Loading train answer file...")
    ans = None
    if train_ans_fname is not None:
        ans = load_json(train_ans_fname)

    result_df = pd.DataFrame(
        columns=['id', 'means_music_score', 'mean_tag_score', 'mean_title_score'])

    print("Writing answers...")
    chunked_train_set = list(self.chunker_list(questions, NUM_CORE))
    print(f'run with {len(chunked_train_set)} multiprocess')

    from nns_ensemble_with_artist_dist import GenreMostPopular
    import multiprocessing

    algorithm = GenreMostPopular()
    return_dict = multiprocessing.Manager().dict()

    # One worker per chunk; each writes its answers into return_dict[p_idx].
    # (Renamed from `jobs` — the original shadowed the `jobs` parameter.)
    procs = []
    p_idxs = []
    for p_idx, train_chunk in enumerate(chunked_train_set):
        p = multiprocessing.Process(
            target=algorithm._generate_answers,
            args=(song_meta_json, train_data, train_chunk, result_df, ans,
                  p_idx, return_dict))
        procs.append(p)
        p.start()
        p_idxs.append(p_idx)
    for p in procs:
        p.join()

    # Reassemble in chunk order so output ordering is deterministic.
    answers = list()
    for p_idx in p_idxs:
        answers = answers + return_dict[p_idx]
    write_json(answers, "./cf/results/results.json")
def run(self, fname):
    """Shuffle playlists with a fixed seed, split train/val, and write all files."""
    # Fixed seed so the random shuffle (and thus the split) is reproducible.
    random.seed(777)

    print("Reading data...\n")
    playlists = load_json(fname)
    random.shuffle(playlists)
    print(f"Total playlists: {len(playlists)}")

    print("Splitting data...")
    train_set, val_set = self._split_data(playlists)

    # Persist the unmasked splits.
    print("Original train...")
    write_json(train_set, "orig/train.json")
    print("Original val...")
    write_json(val_set, "orig/val.json")

    # Mask the validation split into question/answer halves.
    print("Masked val...")
    q_part, a_part = self._mask_data(val_set)
    write_json(q_part, "questions/val.json")
    write_json(a_part, "answers/val.json")
def run(self, train_fname, question_fname):
    """Configure logging, then generate and write answers for the questions."""
    print('set logger')
    logger = log.get_logger()
    print(logger)
    # Silence anything below WARN for the generation run.
    logger.set_log_level(WARN)
    print('logger set')

    print("Loading train file...")
    train_set = load_json(train_fname)

    print("Loading question file...")
    question_set = load_json(question_fname)

    print("Writing answers...")
    write_json(self._generate_answers(train_set, question_set),
               "results/results.json")
def run(train_fname, val_fname, test_fname):
    """Collect every tag from train/val/test playlists and write id mappings.

    Produces ``tag_to_id.json`` and ``id_to_tag.json`` (inverse maps).
    """
    tags = set()
    print("Reading train data...\n")
    playlists_train = load_json(train_fname)
    print("Reading val data...\n")
    playlists_val = load_json(val_fname)
    print("Reading test data...\n")
    playlists_test = load_json(test_fname)

    print("Get tags...\n")
    for ply in playlists_train + playlists_test + playlists_val:
        tags.update(ply['tags'])

    # BUGFIX: enumerate a *sorted* list rather than the raw set — set
    # iteration order is not stable across runs, so the id assignment
    # (and every downstream artifact keyed on it) was non-reproducible.
    ordered_tags = sorted(tags)
    tag_to_id = {tag: i for i, tag in enumerate(ordered_tags)}
    id_to_tag = {i: tag for i, tag in enumerate(ordered_tags)}

    print("Write tag_to_id.json...\n")
    write_json(tag_to_id, 'tag_to_id.json')
    print("Write id_to_tag.json...\n")
    write_json(id_to_tag, 'id_to_tag.json')
def run(self, song_meta_fname, train_fname, question_fname):
    """Generate answers while collecting per-question scores into a CSV."""
    print("Loading song meta...")
    meta = load_json(song_meta_fname)

    print("Loading train file...")
    train_set = load_json(train_fname)

    print("Loading question file...")
    question_set = load_json(question_fname)

    print("Writing answers...")
    # The generator fills this frame in place with per-question diagnostics.
    score_frame = pd.DataFrame(
        columns=['id', 'means_music_score', 'mean_tag_score', 'mean_title_score'])
    answers = self._generate_answers(meta, train_set, question_set, score_frame)

    score_frame.to_csv('./arena_data/question_k_score.csv', index=False)
    write_json(answers, "results/results.json")
def _eval(self, gt_fname, rec_fname):
    """Validate a recommendation file against ground truth and score it.

    Checks id agreement, exact counts (100 songs / 10 tags per playlist)
    and uniqueness, then returns (music_ndcg, tag_ndcg, recall, score)
    where score = 0.85 * music_ndcg + 0.15 * tag_ndcg.
    """
    gt_playlists = load_json(gt_fname)            # answer file
    gt_dict = {g["id"]: g for g in gt_playlists}  # id -> playlist
    rec_playlists = load_json(rec_fname)          # our predictions

    gt_ids = set([g["id"] for g in gt_playlists])
    rec_ids = set([r["id"] for r in rec_playlists])
    if gt_ids != rec_ids:
        raise Exception("결과의 플레이리스트 id가 올바르지 않습니다.")

    rec_song_counts = [len(p["songs"]) for p in rec_playlists]
    # BUGFIX: this assignment was commented out while still being used
    # below, which made the tag-count check raise NameError.
    rec_tag_counts = [len(p["tags"]) for p in rec_playlists]

    # Every playlist must have exactly 100 songs and 10 tags.
    if set(rec_song_counts) != set([100]):
        raise Exception("추천 곡 결과의 개수가 맞지 않습니다.")
    if set(rec_tag_counts) != set([10]):
        raise Exception("추천 태그 결과의 개수가 맞지 않습니다.")

    rec_unique_song_counts = [len(set(p["songs"])) for p in rec_playlists]
    rec_unique_tag_counts = [len(set(p["tags"])) for p in rec_playlists]
    if set(rec_unique_song_counts) != set([100]):
        raise Exception("한 플레이리스트에 중복된 곡 추천은 허용되지 않습니다.")
    if set(rec_unique_tag_counts) != set([10]):
        raise Exception("한 플레이리스트에 중복된 태그 추천은 허용되지 않습니다.")

    music_ndcg = 0.0
    tag_ndcg = 0.0
    recall = 0.0
    for rec in rec_playlists:
        gt = gt_dict[rec["id"]]
        music_ndcg += self._ndcg(gt["songs"], rec["songs"][:100])
        tag_ndcg += self._ndcg(gt["tags"], rec["tags"][:10])
        recall += self._recall(gt["songs"], rec["songs"][:100])
    music_ndcg = music_ndcg / len(rec_playlists)
    tag_ndcg = tag_ndcg / len(rec_playlists)
    recall = recall / len(rec_playlists)
    score = music_ndcg * 0.85 + tag_ndcg * 0.15

    return music_ndcg, tag_ndcg, recall, score
def _eval(self, gt_fname, rec_fname):
    """Validate and score a recommendation file against the ground truth.

    nDCG is the usual ranking metric for recommenders: it rewards placing
    relevant items near the top of the list. The final score weights songs
    at 0.85 and tags at 0.15.
    """
    answer = load_json(gt_fname)
    predicted = load_json(rec_fname)
    answer_by_id = {g["id"]: g for g in answer}

    if set(g["id"] for g in answer) != set(r["id"] for r in predicted):
        raise Exception("결과의 플레이리스트 수가 올바르지 않습니다.")

    # Every playlist must carry exactly 100 songs and 10 tags, all unique.
    if {len(p["songs"]) for p in predicted} != {100}:
        raise Exception("추천 곡 결과의 개수가 맞지 않습니다.")
    if {len(p["tags"]) for p in predicted} != {10}:
        raise Exception("추천 태그 결과의 개수가 맞지 않습니다.")
    if {len(set(p["songs"])) for p in predicted} != {100}:
        raise Exception("한 플레이리스트에 중복된 곡 추천은 허용되지 않습니다.")
    if {len(set(p["tags"])) for p in predicted} != {10}:
        raise Exception("한 플레이리스트에 중복된 태그 추천은 허용되지 않습니다.")

    n = len(predicted)
    music_ndcg = sum(
        self._ndcg(answer_by_id[rec["id"]]["songs"], rec["songs"][:100])
        for rec in predicted) / n
    tag_ndcg = sum(
        self._ndcg(answer_by_id[rec["id"]]["tags"], rec["tags"][:10])
        for rec in predicted) / n
    score = music_ndcg * 0.85 + tag_ndcg * 0.15

    return music_ndcg, tag_ndcg, score
def _eval(self, gt_fname, rec_fname):
    """Check a results file against the answers and return nDCG scores."""
    gt_playlists = load_json(gt_fname)
    rec_playlists = load_json(rec_fname)
    gt_dict = {g['id']: g for g in gt_playlists}

    # Both files must describe the same playlist ids.
    if {g['id'] for g in gt_playlists} != {r['id'] for r in rec_playlists}:
        raise Exception('결과의 플레이리스트 수가 올바르지 않습니다.')

    # Exactly 100 songs / 10 tags per playlist, with no duplicates.
    for lengths, message in (
            ({len(p['songs']) for p in rec_playlists},
             '추천 곡 결과의 개수가 맞지 않습니다.'),
            ({len(p['tags']) for p in rec_playlists},
             '추천 태그 결과의 개수가 맞지 않습니다.')):
        pass  # counts validated just below with their unique variants

    if {len(p['songs']) for p in rec_playlists} != {100}:
        raise Exception('추천 곡 결과의 개수가 맞지 않습니다.')
    if {len(p['tags']) for p in rec_playlists} != {10}:
        raise Exception('추천 태그 결과의 개수가 맞지 않습니다.')
    if {len(set(p['songs'])) for p in rec_playlists} != {100}:
        raise Exception('한 플레이리스트에 중복된 곡 추천은 허용되지 않습니다.')
    if {len(set(p['tags'])) for p in rec_playlists} != {10}:
        raise Exception('한 플레이리스트에 중복된 태그 추천은 허용되지 않습니다.')

    music_total = 0.0
    tag_total = 0.0
    for rec in rec_playlists:
        gt = gt_dict[rec['id']]
        music_total += self._ndcg(gt['songs'], rec['songs'][:100])
        tag_total += self._ndcg(gt['tags'], rec['tags'][:10])

    music_ndcg = music_total / len(rec_playlists)
    tag_ndcg = tag_total / len(rec_playlists)
    score = music_ndcg * 0.85 + tag_ndcg * 0.15
    return music_ndcg, tag_ndcg, score
def val_inference(self):
    """Run the trained autoencoder over the validation questions.

    For each question playlist, build multi-hot song/tag input vectors, run
    the model, and keep the 100 highest-scoring unseen songs and 10 unseen
    tags. Results are written to 'AE_results.json'.

    NOTE(review): 707989 is a hard-coded song-vocabulary size — presumably
    matches the training setup; confirm. Predicted song ids are numpy ints;
    verify write_json serializes them as expected.
    """
    model = models.load_model('./models/auto_encoder.h5')
    val = load_json('./arena_data/questions/val.json')
    tag_list = load_json('./arena_data/meta/AE_tag_list.json')
    result = []
    for v in tqdm(val):
        # Multi-hot encodings of the question's known songs and tags.
        input_song_vec = np.zeros((1, 707989))
        input_tag_vec = np.zeros((1, len(tag_list)))
        predict = dict()
        predict['id'] = v['id']
        for s in v['songs']:
            input_song_vec[0][s] = 1
        for t in v['tags']:
            if t in tag_list:  # tags outside the training vocab are dropped
                input_tag_vec[0][tag_list.index(t)] = 1
        song_vec, tag_vec = model([input_song_vec, input_tag_vec])
        song_vec = np.array(song_vec[0])
        tag_vec = np.array(tag_vec[0])
        # argsort ascending: the best candidates sit at the *end*, hence the
        # negative index walk below.
        song_rank = song_vec.argsort()
        tag_rank = tag_vec.argsort()
        pred_songs = []
        pred_tags = []
        # Walk from the highest score downward, skipping songs the user
        # already has, until 100 recommendations are collected.
        i = -1
        while len(pred_songs) < 100:
            if song_rank[i] not in v['songs']:
                pred_songs.append(song_rank[i])
            i -= 1
        # Same walk for tags (10 recommendations).
        i = -1
        while len(pred_tags) < 10:
            if tag_list[tag_rank[i]] not in v['tags']:
                pred_tags.append(tag_list[tag_rank[i]])
            i -= 1
        predict['songs'] = pred_songs
        predict['tags'] = pred_tags
        result.append(predict)
    write_json(result, 'AE_results.json')
def _init_song_meta(self):
    """Load song metadata and genre tables; populate genre lookup attributes."""
    song_meta = load_json('res/song_meta.json')

    # The genre table arrives as a code->name Series; reshape it into a
    # two-column frame (gnr_code, gnr_name).
    genre_series = pd.read_json('res/genre_gn_all.json', encoding='utf8',
                                typ='series')
    genre_frame = (pd.DataFrame(genre_series, columns=['gnr_name'])
                   .reset_index()
                   .rename(columns={'index': 'gnr_code'}))

    self.gnr_code, self.dtl_gnr_code = genre_gn_all_preprocessing(genre_frame)
    self.num_gnr = len(self.gnr_code)
    self.num_dtl_gnr = len(self.dtl_gnr_code)
    (self.gnr_dic, self.dtl_dic,
     self.song_gnr_dic, self.song_dtl_dic) = genre_DicGenerator(
        self.gnr_code, self.dtl_gnr_code, song_meta)
def _load_train(self):
    """Load the training playlists, keep only popular ones, collect vocab.

    Keeps playlists with like_cnt > 50 and records the songs/tags they
    contain; the tag vocabulary is persisted for inference.
    """
    playlists = load_json('./arena_data/orig/train.json')

    print('train data filtering...')
    kept = []
    songs = set()
    tags = set()
    for ply in tqdm(playlists):
        if ply['like_cnt'] <= 50:
            continue  # drop unpopular playlists
        kept.append(ply)
        songs.update(ply['songs'])
        tags.update(ply['tags'])

    self.train = kept
    self.song_list = list(songs)
    self.tag_list = list(tags)
    self.total_song_num = 707989
    write_json(self.tag_list, 'meta/AE_tag_list.json')
def run(self, fname, train_size):
    """Reproducibly shuffle, split by train_size, and write out all splits."""
    random.seed(777)  # deterministic shuffle -> deterministic split

    print('Reading data...\n')
    all_plylsts = load_json(fname)
    random.shuffle(all_plylsts)
    print(f'Total playlists: {len(all_plylsts)}')

    print(f'Splitting data... train_size is {train_size}')
    train_part, val_part = self._split_data(all_plylsts, train_size)

    print('Original train...')
    write_json(train_part, 'orig/train.json')
    print('Original val...')
    write_json(val_part, 'orig/val.json')

    print('Masked val...')
    questions, answers = self._mask_data(val_part)
    write_json(questions, 'questions/val.json')
    write_json(answers, 'answers/val.json')
def run(self, fname):
    """Shuffle the playlist dump with a fixed seed, split it, and persist it."""
    random.seed(777)

    print("Reading data...\n")
    plylsts = load_json(fname)
    random.shuffle(plylsts)
    print(f"Total playlists: {len(plylsts)}")

    print("Splitting data...")
    train_set, val_set = self._split_data(plylsts)

    print("Original train...")
    write_json(train_set, "orig/train.json")
    print("Original val...")
    write_json(val_set, "orig/val.json")

    print("Masked val...")
    q_set, a_set = self._mask_data(val_set)
    write_json(q_set, "questions/val.json")
    write_json(a_set, "answers/val.json")
def get_w2v_scores(submit_type):
    """Compute and save playlist-embedding similarity scores for a submission.

    submit_type selects the data layout:
      * 'local_val' — local split under arena_data/
      * 'val'       — public validation set
      * 'test'      — final test set (train is augmented with val)

    NOTE(review): `method` and `vocab_size` are read from module scope —
    they must be defined by the surrounding module; confirm.
    """
    if submit_type == 'val':
        default_file_path = 'res'
        question_file_path = 'res/val.json'
        train_file_path = 'res/train.json'
    elif submit_type == 'test':
        default_file_path = 'res'
        val_file_path = 'res/val.json'
        question_file_path = 'res/test.json'
        train_file_path = 'res/train.json'
    elif submit_type == 'local_val':
        default_file_path = 'arena_data'
        train_file_path = f'{default_file_path}/orig/train.json'
        question_file_path = f'{default_file_path}/questions/val.json'
        default_file_path = f'{default_file_path}/orig'
    else:
        # Previously an unknown submit_type fell through and failed later
        # with a NameError on train_file_path; fail fast instead.
        raise ValueError(f'unknown submit_type: {submit_type!r}')

    genre_file_path = 'res/genre_gn_all.json'
    tokenize_input_file_path = (
        f'model/tokenizer_input_{method}_{vocab_size}_{submit_type}.txt')

    if submit_type == 'local_val':
        val_file_path = None
        test_file_path = None
        train = load_json(train_file_path)
        question = load_json(question_file_path)
    elif submit_type == 'val':
        test_file_path = None
        val_file_path = question_file_path
        train = load_json(train_file_path)
        question = load_json(question_file_path)
    elif submit_type == 'test':
        # val_file_path was set in the first branch above; the original
        # no-op `val_file_path = val_file_path` has been removed.
        test_file_path = question_file_path
        train = load_json(train_file_path)
        val = load_json(val_file_path)
        test = load_json(test_file_path)
        train = train + val  # for 'test', train on train+val together
        question = test

    plylst_title_tag_emb = get_plylsts_embeddings(train, question, submit_type)
    save_scores(train, question, plylst_title_tag_emb, 'cos', submit_type)
def run(self, fname):
    """Split the raw playlist dump into train/val and write question/answer files."""
    # Seed the RNG so the shuffle below is repeatable across runs.
    random.seed(777)

    print("Reading data...\n")
    # Each playlist is a dict with tags, id, title, songs, likes, update date.
    data = load_json(fname)
    random.shuffle(data)
    print(f"Total playlists: {len(data)}")

    print("Splitting data...")
    train_part, val_part = self._split_data(data)

    # The fresh train split goes into the orig/ folder.
    print("Original train...")
    write_json(train_part, "orig/train.json")
    print("Original val...")
    write_json(val_part, "orig/val.json")

    # Mask the val split: questions/ gets the masked half, answers/ the truth.
    print("Masked val...")
    masked_q, masked_a = self._mask_data(val_part)
    write_json(masked_q, "questions/val.json")
    write_json(masked_a, "answers/val.json")
def train():
    """Train an Item2Vec model over playlists and build playlist embeddings.

    Loads the playlist data, augments each playlist's tags with tags
    predicted from its title, trains a skip-gram Word2Vec over
    song-id/tag "sentences", and saves two playlist-embedding stores:
    one weighting songs x2 (p2v_song) and one weighting tags x2 (p2v_tag).
    With MODE == "Valid" it also scores the local validation questions.
    """
    MODE = "Test"  # "Valid" trains on the local split and evaluates at the end
    if MODE == "Valid":
        train = load_json("arena_data/orig/train.json") + load_json(
            "arena_data/questions/val.json")
        dev = load_json("res/val.json")
        test = load_json("res/test.json")
    else:
        train = load_json("res/train.json")
        dev = load_json("res/val.json")
        test = load_json("res/test.json")

    # Tags predicted from titles are appended to each playlist's tags; the
    # originals are preserved in 'tags_org' for the embedding/eval steps.
    pred_tag = load_json("arena_data/model/pred_tag.json")
    dic_pred_tag = {p_t['id']: p_t['predict_tag'] for p_t in pred_tag}
    for doc in train + dev + test:
        doc['tags_org'] = doc['tags'].copy()
        doc['tags'] += dic_pred_tag[doc['id']]

    # One "sentence" per playlist: stringified song ids followed by tags.
    item_list = []
    len_item = []
    for doc in train + dev + test:
        items = [str(i) for i in doc['songs']] + doc['tags']
        item_list.append(items)
        len_item.append(len(items))
    print("Max length of item list :", max(len_item), ", Min :", min(len_item))
    item_list = [x for x in item_list if len(x) > 1]  # Word2Vec needs >=2 tokens
    print("Train set :", len(item_list))

    print("Training Item2Vec model")
    SIZE = 100
    model = Word2Vec(sentences=item_list,
                     size=SIZE,
                     window=240,
                     min_count=2,
                     sg=1,
                     workers=8,
                     iter=10,
                     negative=7,
                     compute_loss=True,
                     callbacks=[LossPrinter()])
    model.save("arena_data/model/word2vec.model")
    print("Vocab : ", len(model.wv.vocab))

    print("Building and saving playlist embeddings")
    song_dic = {}
    tag_dic = {}
    for q in tqdm(train + test + dev):
        song_dic[str(q['id'])] = q['songs']
        tag_dic[str(q['id'])] = q['tags_org']

    def _playlist_vectors(song_weight, tag_weight):
        # Sum of in-vocab item vectors per playlist, weighting songs and
        # tags differently. Playlists with no in-vocab item are skipped
        # (tmp_vec stays the int 0). Replaces two copy-pasted loops.
        ids, vecs = [], []
        for q in tqdm(train + test + dev):
            tmp_vec = 0
            for item in q['songs']:
                try:
                    tmp_vec += model.wv.get_vector(str(item)) * song_weight
                except KeyError:
                    pass  # song not in Word2Vec vocab (min_count filter)
                except KeyError:
                    pass
            for item in q['tags']:
                try:
                    tmp_vec += model.wv.get_vector(str(item)) * tag_weight
                except KeyError:
                    pass
            if type(tmp_vec) != int:  # at least one vector was summed
                ids.append(str(q['id']))
                vecs.append(tmp_vec)
        return ids, vecs

    p2v_song = WordEmbeddingsKeyedVectors(SIZE)
    ids, vecs = _playlist_vectors(song_weight=2, tag_weight=1)
    p2v_song.add(ids, vecs)
    p2v_song.save("arena_data/model/p2v_song.model")

    p2v_tag = WordEmbeddingsKeyedVectors(SIZE)
    ids, vecs = _playlist_vectors(song_weight=1, tag_weight=2)
    p2v_tag.add(ids, vecs)
    p2v_tag.save("arena_data/model/p2v_tag.model")

    if MODE == "Valid":
        print("Testing")
        questions = load_json("arena_data/questions/val.json")
        cnt_wv_song = 0
        cnt_wv_tag = 0
        res = []
        for q in tqdm(questions):
            dic_song_score = {}
            dic_tag_score = {}
            song_result = []
            tag_result = []
            # Songs: pool songs of the 50 most similar playlists, scored by
            # similarity.
            if str(q['id']) in p2v_song.wv.vocab:
                most_id = [
                    x for x in p2v_song.most_similar(str(q['id']), topn=50)
                ]
                for ID in most_id:
                    for s in song_dic[ID[0]]:
                        if s in dic_song_score:
                            dic_song_score[s] += ID[1]
                        else:
                            dic_song_score[s] = ID[1]
            # Tags: same scheme over the tag-weighted embedding space.
            if str(q['id']) in p2v_tag.wv.vocab:
                most_id = [
                    x for x in p2v_tag.most_similar(str(q['id']), topn=50)
                ]
                # BUGFIX: the original lacked this loop and read the stale
                # `ID` left over from the song branch, so tag scores came
                # from a single arbitrary neighbor instead of all 50.
                for ID in most_id:
                    for t in tag_dic[ID[0]]:
                        if t in dic_tag_score:
                            dic_tag_score[t] += ID[1]
                        else:
                            dic_tag_score[t] = ID[1]
            if len(dic_song_score) > 0:
                sort_song_score = sorted(dic_song_score.items(),
                                         key=lambda x: x[1],
                                         reverse=True)
                for s in sort_song_score:
                    song_result.append(s[0])
                cnt_wv_song += 1
            if len(dic_tag_score) > 0:
                sort_tag_score = sorted(dic_tag_score.items(),
                                        key=lambda x: x[1],
                                        reverse=True)
                for s in sort_tag_score:
                    tag_result.append(s[0])
                cnt_wv_tag += 1
            res.append({
                "id": q["id"],
                "songs": remove_seen(q["songs"], song_result)[:100],
                "tags": remove_seen(q["tags"], tag_result)[:10],
            })
        print(len(questions), cnt_wv_song, cnt_wv_tag)
        ans = load_json("arena_data/answers/val.json")
        evaluator = CustomEvaluator()
        evaluator._evaluate(ans, res)
freq_thr = args.freq_thr
mode = args.mode

# Load the training dataset (and its companion files) according to `mode`.
# mode 0: local split, mode 1: public val submission, mode 2: test submission.
question_data = None
question_dataset = None
answer_file_path = None

if mode == 0:
    # For the local split we also load questions/answers so performance can
    # be checked during training.
    default_file_path = 'arena_data/'
    model_postfix = 'local_val'

    train_file_path = f'{default_file_path}/orig/train.json'
    question_file_path = f'{default_file_path}/questions/val.json'
    answer_file_path = f'{default_file_path}/answers/val.json'

    train_data = load_json(train_file_path)
    question_data = load_json(question_file_path)

elif mode == 1:
    default_file_path = 'res'
    model_postfix = 'val'

    train_file_path = f'{default_file_path}/train.json'
    val_file_path = f'{default_file_path}/val.json'

    # For the val submission, train on train+val combined.
    train_data = load_json(train_file_path) + load_json(val_file_path)

elif mode == 2:
    default_file_path = 'res'
    model_postfix = 'test'

    train_file_path = f'{default_file_path}/train.json'
    # NOTE(review): this branch continues beyond the visible chunk.
# Quick exploratory look at validation-playlist lengths (notebook-style
# script: the bare expressions below are inspection leftovers and have no
# effect when run as a plain script).
from arena_util import load_json, write_json
import pandas as pd
import matplotlib.pyplot as plt

res_val = load_json("res/val.json")
res_val = pd.DataFrame(res_val)

# Number of songs per playlist.
res_val['ply_len'] = res_val['songs'].apply(len)

# Inspection-only expressions (values discarded outside a REPL).
res_val[(res_val['ply_len'] > 0) & (res_val['ply_len'] < 4)]
res_val['ply_len'] > 0
len(res_val)
# BUGFIX: `.d()` is not a Series method and raised AttributeError;
# summary statistics were evidently intended.
res_val['ply_len'].describe()

# Length distribution of non-empty playlists.
plt.hist(res_val[res_val['ply_len'] != 0]['ply_len'])
import pandas as pd
from neighbor import Neighbor
from knn import KNN
from title_to_Tag import Title_to_tag
from data_util import *
from arena_util import load_json, write_json

### 1. data & preprocessing
### 1.1 load data
song_meta_path = 'res/song_meta.json'
train_path = 'res/train.json'
# NOTE(review): despite the name, val_path points at test.json —
# presumably deliberate for a test-set run; confirm.
val_path = 'res/test.json'

song_meta = load_json(song_meta_path)
train = load_json(train_path)

song_meta = pd.DataFrame(song_meta)
train = pd.DataFrame(train)

### 1.2 title-only playlists: change titles to tags
val = Title_to_tag(train_path=train_path, val_path=val_path).change()

### 1.3 convert "tag" to "tag_id"
tag_to_id, id_to_tag = tag_id_meta(train, val)
train = convert_tag_to_id(train, tag_to_id)
val = convert_tag_to_id(val, tag_to_id)

### 2. modeling : Neighbor
### 2.1 hyperparameters: pow_alpha, pow_beta
# --- tail of the model's __init__ (class header is outside this chunk) ---
# Playlist multi-hot -> dense embedding.
self.ply_embedding = nn.Linear(self.songs_len, self.ply_d)
# Decoder maps concatenated (meta, playlist) latent back to song space.
# NOTE(review): input uses `self.songs_len` but output uses `self.song_len`
# — likely one of these is a typo; confirm both attributes exist.
self.decoder = nn.Linear(self.meta_d + self.ply_d, self.song_len)

def forward(self, ply, meta):
    """Encode playlist+meta, decode, and return per-song probabilities."""
    ply_embed = self.activation(self.ply_embedding(ply))
    meta_embed = self.activation(self.meta_embedding(meta))
    # Concatenate the two embeddings along the feature axis.
    latent = torch.cat((ply_embed, meta_embed), dim=-1)
    ply_recon = self.decoder(latent)
    return torch.sigmoid(ply_recon)


# --- module-level script ---
# train = load_json("arena_data/orig/train.json")
# val = load_json("arena_data/questions/val.json")
train = load_json("res/train.json")
val = load_json("res/val.json")
data = train + val


# count train songs. filter under 150
def song_count_filter(data, over_n):
    # Keep only songs appearing at least `over_n` times across all playlists.
    # NOTE(review): the function appears truncated in this chunk — it prints
    # the valid-song count but no `return` is visible here.
    counter = Counter()
    for ply in data:
        counter.update(ply['songs'])
    song_valid = set(
        [song_id for song_id, cnt in counter.items() if cnt >= over_n])
    print(f"song_count_filter\n- song_valid length: {len(song_valid)}")
def infer(MODE="Test"):
    """Run the CF + playlist2vec inference pipeline and write results.

    MODE selects data layout: "Valid" (local split, evaluated at the end),
    "Dev" (public val as test set) or "Test" (final test set). Builds
    IDF-weighted sparse playlist/item matrices, scores candidates via
    k-nn over playlists, rescores with playlist2vec similarities and
    date/IDF heuristics, and writes the recommendations.
    """
    mode_opt = {
        "Valid": {
            "train_path": "arena_data/orig/train.json",
            "test_path": "arena_data/questions/val.json",
            "results_path": "cf2/val/results.json",
            "eval": True
        },
        "Dev": {
            "train_path": "res/train.json",
            "test_path": "res/val.json",
            "results_path": "cf2/dev/results.json",
            "eval": False
        },
        "Test": {
            "train_path": "res/train.json",
            "test_path": "res/test.json",
            "results_path": "cf2/test/results.json",
            "eval": False
        }
    }
    opt = mode_opt[MODE]
    train = pd.read_json(opt["train_path"])
    test = pd.read_json(opt["test_path"])
    # The split not acting as `test` is still loaded so its playlists can
    # serve as extra CF neighbors.
    if MODE != "Dev":
        dev = pd.read_json("res/val.json")
    if MODE != "Test":
        test_res = pd.read_json("res/test.json")

    print("Preprocessing dates")
    test_date = {}
    for i in tqdm(test.index):
        test_date[test.at[i, 'id']] = test.at[i, 'updt_date']
    song_meta = pd.read_json("res/song_meta.json")
    song_date = {}
    for i in tqdm(song_meta.index):
        song_date[song_meta.at[i, "id"]] = str(song_meta.at[i, "issue_date"])
    del song_meta  # free the large frame early

    # If a song appears in a playlist updated *before* its recorded issue
    # date, trust the playlist and pull the song's date back ("yyyymmdd"
    # string comparison).
    song_update_date = []
    for i in train.index:
        updt_date = train.loc[i, 'updt_date'][:4] + train.loc[
            i, 'updt_date'][5:7] + train.loc[i, 'updt_date'][8:10]
        for t in train.loc[i, 'songs']:
            if song_date[t] > updt_date:
                song_date[t] = updt_date
                song_update_date.append(t)
    for i in test.index:
        updt_date = test.loc[i, 'updt_date'][:4] + test.loc[
            i, 'updt_date'][5:7] + test.loc[i, 'updt_date'][8:10]
        for t in test.loc[i, 'songs']:
            if song_date[t] > updt_date:
                song_date[t] = updt_date
                song_update_date.append(t)
    if MODE != "Dev":
        for i in dev.index:
            updt_date = dev.loc[i, 'updt_date'][:4] + dev.loc[
                i, 'updt_date'][5:7] + dev.loc[i, 'updt_date'][8:10]
            for t in dev.loc[i, 'songs']:
                if song_date[t] > updt_date:
                    song_date[t] = updt_date
                    song_update_date.append(t)
    if MODE != "Test":
        for i in test_res.index:
            updt_date = test_res.loc[i, 'updt_date'][:4] + test_res.loc[
                i, 'updt_date'][5:7] + test_res.loc[i, 'updt_date'][8:10]
            for t in test_res.loc[i, 'songs']:
                if song_date[t] > updt_date:
                    song_date[t] = updt_date
                    song_update_date.append(t)
    print("The number of processed songs :", len(set(song_update_date)))

    # Loading tags extracted from title; appended to 'tags', originals kept
    # in 'tags_org'.
    pred_tag = load_json("arena_data/model/pred_tag.json")
    dic_pred_tag = {}
    for p_t in pred_tag:
        dic_pred_tag[p_t['id']] = p_t['predict_tag']
    train['tags_org'] = train['tags']
    for i in train.index:
        train.at[i, 'tags'] = train.at[i, 'tags'] + dic_pred_tag[train.at[i, 'id']]
    test['tags_org'] = test['tags']
    for i in test.index:
        test.at[i, 'tags'] = test.at[i, 'tags'] + dic_pred_tag[test.at[i, 'id']]
    if MODE != "Dev":
        dev['tags_org'] = dev['tags']
        for i in dev.index:
            dev.at[i, 'tags'] = dev.at[i, 'tags'] + dic_pred_tag[dev.at[i, 'id']]
    if MODE != "Test":
        test_res['tags_org'] = test_res['tags']
        for i in test_res.index:
            test_res.at[i, 'tags'] = test_res.at[i, 'tags'] + dic_pred_tag[
                test_res.at[i, 'id']]

    # Calculating IDF: document frequency of every song/tag over all splits.
    inv_doc_freq = {}
    for d in train['songs'] + train['tags']:
        for i in d:
            if i in inv_doc_freq:
                inv_doc_freq[i] += 1
            else:
                inv_doc_freq[i] = 1
    for d in test['songs'] + test['tags']:
        for i in d:
            if i in inv_doc_freq:
                inv_doc_freq[i] += 1
            else:
                inv_doc_freq[i] = 1
    if MODE != "Dev":
        for d in dev['songs'] + dev['tags']:
            for i in d:
                if i in inv_doc_freq:
                    inv_doc_freq[i] += 1
                else:
                    inv_doc_freq[i] = 1
    if MODE != "Test":
        for d in test_res['songs'] + test_res['tags']:
            for i in d:
                if i in inv_doc_freq:
                    inv_doc_freq[i] += 1
                else:
                    inv_doc_freq[i] = 1
    # Convert counts to log-scaled inverse document frequency.
    for k in inv_doc_freq:
        if MODE == "Valid":
            inv_doc_freq[k] = math.log10(
                (len(train) + len(test) + len(dev) + len(test_res)) /
                inv_doc_freq[k])
        elif MODE == "Dev":
            inv_doc_freq[k] = math.log10(
                (len(train) + len(test) + len(test_res)) / inv_doc_freq[k])
        else:
            inv_doc_freq[k] = math.log10(
                (len(train) + len(test) + len(dev)) / inv_doc_freq[k])

    # Preprocessing data for CF matrix
    if MODE == "Valid":
        n_train = len(train) + len(dev) + len(test_res)
    elif MODE == "Dev":
        n_train = len(train) + len(test_res)
    else:
        n_train = len(train) + len(dev)
    n_test = len(test)
    # train + test: concatenate so that the test rows come last.
    if MODE == "Valid":
        plylst = pd.concat([train, dev, test_res, test], ignore_index=True)
    elif MODE == "Dev":
        plylst = pd.concat([train, test_res, test], ignore_index=True)
    else:
        plylst = pd.concat([train, dev, test], ignore_index=True)
    # playlist id
    plylst["nid"] = range(n_train + n_test)
    # nid -> id
    plylst_nid_id = dict(zip(plylst["nid"], plylst["id"]))

    # Dense integer ids for tags (tid), songs (sid) and the combined
    # song+tag vocabulary (used for the query matrix).
    plylst_tag = plylst['tags']
    tag_counter = Counter([tg for tgs in plylst_tag for tg in tgs])
    tag_dict = {x: tag_counter[x] for x in tag_counter}
    id_type = dict()
    tag_id_tid = dict()
    tag_tid_id = dict()
    for i, t in enumerate(tag_dict):
        tag_id_tid[t] = i
        tag_tid_id[i] = t
        id_type[t] = 1
    n_tags = len(tag_dict)
    plylst_song = plylst['songs']
    song_counter = Counter([sg for sgs in plylst_song for sg in sgs])
    song_dict = {x: song_counter[x] for x in song_counter}
    song_id_sid = dict()
    song_sid_id = dict()
    for i, t in enumerate(song_dict):
        song_id_sid[t] = i
        song_sid_id[i] = t
        id_type[t] = 1
    n_songs = len(song_dict)
    plylst_st = plylst['songs'] + plylst['tags']
    st_counter = Counter([st for sts in plylst_st for st in sts])
    st_dict = {x: st_counter[x] for x in st_counter}
    st_id_tid = dict()
    st_tid_id = dict()
    for i, t in enumerate(st_dict):
        st_id_tid[t] = i
        st_tid_id[i] = t
    n_sts = len(st_dict)
    print("Tags : ", n_tags, ", Songs : ", n_songs, ", Total : ", n_sts)

    plylst['songs_id'] = plylst['songs'].map(
        lambda x: [song_id_sid.get(s) for s in x if song_id_sid.get(s) != None])
    # Tag ids come from the *original* tags only (not title-predicted ones).
    plylst['tags_id'] = plylst['tags_org'].map(
        lambda x: [tag_id_tid.get(t) for t in x if tag_id_tid.get(t) != None])
    plylst['sts_id'] = (plylst['songs'] + plylst['tags']).map(
        lambda x: [st_id_tid.get(st) for st in x if st_id_tid.get(st) != None])

    plylst_use = plylst[['nid', 'updt_date', 'songs_id', 'tags_id', 'sts_id']]
    plylst_use.loc[:, 'num_songs'] = plylst_use['songs_id'].map(len)
    plylst_use.loc[:, 'num_tags'] = plylst_use['tags_id'].map(len)
    plylst_use.loc[:, 'num_sts'] = plylst_use['sts_id'].map(len)
    plylst_use = plylst_use.set_index('nid')
    # NOTE(review): iloc[:, :] keeps ALL rows (test included) in the CF
    # matrices, and n_train is then reset to the full count below —
    # possibly a deliberate "use everything as neighbors" choice rather
    # than iloc[:n_train, :]; confirm intent.
    plylst_train = plylst_use.iloc[:, :]
    plylst_test = plylst_use.iloc[n_train:, :]
    n_train = len(plylst_train)
    np.random.seed(33)
    test_set = plylst_test
    print("The number of test samples : ", len(test_set))

    # Building CF matrices
    avg_len_songs = 0
    for songs in plylst_train['songs_id']:
        avg_len_songs += len(songs)
    avg_len_songs /= len(plylst_train['songs_id'])
    avg_len_tags = 0
    for tags in plylst_train['tags_id']:
        avg_len_tags += len(tags)
    avg_len_tags /= len(plylst_train['tags_id'])
    avg_len_sts = 0
    for sts in plylst_train['sts_id']:
        avg_len_sts += len(sts)
    avg_len_sts /= len(plylst_train['sts_id'])

    # Binary playlist x song / playlist x tag incidence matrices.
    row = np.repeat(range(n_train), plylst_train['num_songs'])
    col = [song for songs in plylst_train['songs_id'] for song in songs]
    dat = [1 for songs in plylst_train['songs_id'] for song in songs]
    train_songs_A = spr.csr_matrix((dat, (row, col)), shape=(n_train, n_songs))
    row = np.repeat(range(n_train), plylst_train['num_tags'])
    col = [tag for tags in plylst_train['tags_id'] for tag in tags]
    dat = [1 for tags in plylst_train['tags_id'] for tag in tags]
    train_tags_A = spr.csr_matrix((dat, (row, col)), shape=(n_train, n_tags))
    # IDF-weighted, length-normalized playlist x (song+tag) matrix; the +50
    # damps very short playlists.
    row = np.repeat(range(n_train), plylst_train['num_sts'])
    col = [st for sts in plylst_train['sts_id'] for st in sts]
    dat = [
        inv_doc_freq[st_tid_id[st]] / (len(sts) + 50)
        for sts in plylst_train['sts_id'] for st in sts
    ]
    train_sts_A = spr.csr_matrix((dat, (row, col)), shape=(n_train, n_sts))
    train_songs_A_T = train_songs_A.T.tocsr()
    train_tags_A_T = train_tags_A.T.tocsr()

    # Building map playlist id to songs or tags for playlist2vec
    if MODE == "Valid":
        p2v_targets = [train, test, dev, test_res]
    elif MODE == "Dev":
        p2v_targets = [train, test, test_res]
    else:
        p2v_targets = [train, test, dev]
    song_dic = {}
    tag_dic = {}
    for i, q in tqdm(pd.concat(p2v_targets).iterrows()):
        song_dic[str(q['id'])] = q['songs']
        tag_dic[str(q['id'])] = q['tags_org']
    # Loading playlist embedding vectors
    p2v_song = WordEmbeddingsKeyedVectors.load(
        "arena_data/model/p2v_song.model")
    p2v_tag = WordEmbeddingsKeyedVectors.load("arena_data/model/p2v_tag.model")

    print("Predicting")
    res = []
    filtered_lot_song = []
    filtered_lot_tag = []
    for pid in tqdm(test_set.index):
        songs_already = test_set.loc[pid, "songs_id"]
        tags_already = test_set.loc[pid, "tags_id"]

        # Song prediction - 1. Query vector to predict songs
        p = np.zeros((n_sts, 1))
        if len(test_set.loc[pid, 'sts_id']) > 0:
            for st in test_set.loc[pid, 'sts_id']:
                if st_tid_id[st] in inv_doc_freq:
                    p[st] = inv_doc_freq[st_tid_id[st]] / (
                        len(test_set.loc[pid, 'sts_id']) + 50)
        # Song prediction - 2. K-nn playlists (top 250, similarity squared)
        val = train_sts_A.dot(p).reshape(-1)
        val_idx = val.reshape(-1).argsort()[-250:][::-1]
        val_knn = np.zeros((n_train))
        val_knn[val_idx] = val[val_idx]
        val = val_knn**2
        # Song prediction - 3. Candidates
        cand_song = train_songs_A_T.dot(val)
        # Song prediction - 4. Rescoring using playlist2vec
        dic_song_score = {}
        if str(plylst_nid_id[pid]) in p2v_song.wv.vocab:
            most_id = [
                x for x in p2v_song.most_similar(str(plylst_nid_id[pid]),
                                                 topn=50)
            ]
            for ID in most_id:
                for s in song_dic[ID[0]]:
                    if s in dic_song_score:
                        dic_song_score[s] += ID[1]
                    else:
                        dic_song_score[s] = ID[1]
        for k in dic_song_score:
            cand_song[song_id_sid[k]] *= dic_song_score[k]**0.2
        cand_song_idx = cand_song.reshape(-1).argsort()[-5000:][::-1]
        # Song prediction - 5. Filtering by score and date: drop zero-score
        # candidates and songs issued after the playlist's update date.
        cand_song_idx_filtered = []
        for cand in cand_song_idx:
            if cand_song[cand] > 0 and song_date[song_sid_id[
                    cand]] <= test_date[plylst_nid_id[pid]][:4] + test_date[
                        plylst_nid_id[pid]][5:7] + test_date[
                            plylst_nid_id[pid]][8:10]:
                cand_song_idx_filtered.append(cand)
        if len(cand_song_idx_filtered) < 400:
            filtered_lot_song.append(len(cand_song_idx_filtered))
        cand_song_idx = np.array(cand_song_idx_filtered)

        # Song prediction - 6. Rescoring using heuristics
        dict_score = {}
        for idx in cand_song_idx:
            dict_score[idx] = cand_song[idx]
        # Statistics of the playlist's known songs (IDF and issue dates)
        # used to bias candidates toward "similar-looking" songs.
        mean_doc_freq = 0
        std_doc_freq = 0
        list_doc_freq = []
        mean_song_date = 0
        list_song_date = []
        if len(test_set.loc[pid, "songs_id"]) > 0:
            for t in test_set.loc[pid, "songs_id"]:
                if song_sid_id[t] in inv_doc_freq:
                    list_doc_freq.append(inv_doc_freq[song_sid_id[t]])
                song_d = int(song_date[song_sid_id[t]])
                # Dates outside (1900..2021) are treated as invalid.
                if song_d > 19000000 and song_d < 20210000:
                    list_song_date.append(song_d)
            if len(list_doc_freq) > 0:
                mean_doc_freq = np.mean(list_doc_freq)
                std_doc_freq = np.std(list_doc_freq)
            if len(list_song_date) > 0:
                mean_song_date = np.mean(list_song_date)
        # Song prediction - 6-1. Rescoring by IDF comparison
        if len(list_doc_freq) > 0:
            for c in dict_score:
                if song_sid_id[c] in inv_doc_freq:
                    dict_score[c] = 1 / (
                        len(list_doc_freq)**0.5) * dict_score[c] + (
                            1 - 1 /
                            (len(list_doc_freq)**0.5)) * dict_score[c] * 2 / (
                                np.abs(inv_doc_freq[song_sid_id[c]] -
                                       mean_doc_freq) / (std_doc_freq + 1) + 2)
                else:
                    dict_score[c] = 1 / (len(list_doc_freq)**
                                         0.5) * dict_score[c]
        # Song prediction - 6-2. Rescoring by Date comparison
        if len(list_song_date) > 0:
            for c in dict_score:
                song_d = int(song_date[song_sid_id[c]])
                if song_d > 19000000 and song_d < 20210000:
                    dict_score[c] = 1 / (
                        len(list_song_date)**0.5) * dict_score[c] + (
                            1 - 1 /
                            (len(list_song_date)**0.5)) * dict_score[c] / (
                                np.abs(song_d - mean_song_date) / 500000 + 1)
                else:
                    dict_score[c] = 1 / (len(list_song_date)**
                                         0.5) * dict_score[c]
        score_sorted = sorted(dict_score.items(),
                              key=lambda x: x[1],
                              reverse=True)
        cand_song_idx = []
        for t in score_sorted:
            cand_song_idx.append(t[0])
        cand_song_idx = np.array(cand_song_idx)
        # Drop songs the playlist already has; keep the top 300.
        cand_song_idx = cand_song_idx[np.isin(cand_song_idx, songs_already) ==
                                      False][:300]
        rec_song_idx = [song_sid_id[i] for i in cand_song_idx]

        # Tag prediction - 1. Query vector to predict tags (unweighted)
        p = np.zeros((n_sts, 1))
        p[test_set.loc[pid, 'sts_id']] = 1
        # Tag prediction - 2. K-nn playlists
        val = train_sts_A.dot(p).reshape(-1)
        val_idx = val.reshape(-1).argsort()[-250:][::-1]
        val_knn = np.zeros((n_train))
        val_knn[val_idx] = val[val_idx]
        val = val_knn**2
        # Tag prediction - 3. Candidates
        cand_tag = train_tags_A_T.dot(val)
        # Tag prediction - 4. Rescoring using playlist2vec
        dic_tag_score = {}
        if str(plylst_nid_id[pid]) in p2v_tag.wv.vocab:
            most_id = [
                x for x in p2v_tag.most_similar(str(plylst_nid_id[pid]),
                                                topn=50)
            ]
            for ID in most_id:
                for t in tag_dic[ID[0]]:
                    if t in dic_tag_score:
                        dic_tag_score[t] += ID[1]
                    else:
                        dic_tag_score[t] = ID[1]
        for k in dic_tag_score:
            cand_tag[tag_id_tid[k]] *= dic_tag_score[k]**0.5
        cand_tag_idx = cand_tag.reshape(-1).argsort()[-35:][::-1]
        # Tag prediction - 5. Filtering by score
        cand_tag_idx_filtered = []
        for cand in cand_tag_idx:
            if cand_tag[cand] > 0:
                cand_tag_idx_filtered.append(cand)
        if len(cand_tag_idx_filtered) != 35:
            filtered_lot_tag.append(len(cand_tag_idx_filtered))
        cand_tag_idx = np.array(cand_tag_idx_filtered)
        cand_tag_idx = cand_tag_idx[np.isin(cand_tag_idx, tags_already) ==
                                    False][:30]
        rec_tag_idx = [tag_tid_id[i] for i in cand_tag_idx]

        res.append({
            "id": plylst_nid_id[pid],
            "songs": rec_song_idx,
            "tags": rec_tag_idx
        })

    print(len(filtered_lot_song), filtered_lot_song)
    print(len(filtered_lot_tag), filtered_lot_tag)
    write_json(res, "results/" + opt["results_path"])
    if opt["eval"]:
        evaluator = CustomEvaluator()
        evaluator.evaluate("arena_data/answers/val.json",
                           "arena_data/results/" + opt["results_path"])
def merge_trains(self, train_fnames, output_fname):
    """Merge several train JSON files into a single file.

    Loads each file in ``train_fnames`` (each holding a JSON list of
    playlists) and writes their concatenation to ``output_fname``.
    """
    merged = [entry for fname in train_fnames for entry in load_json(fname)]
    write_json(merged, output_fname)
return val if __name__ == '__main__': # paths train_path = "res/train.json" # original train file val_path = "res/val.json" # original validation file test_path = "res/test.json" # original test file meta_path = "res/song_meta.json" # song_meta.json s2v_path = "pretrained/tvt_s2v.model" # train, valid, test song embedding model cluster_path = "pretrained/tvt_500c_s2v_khaiii.pkl" # train, valid, test 500 cluster model # load data train = load_json(train_path) val = load_json(val_path) test = load_json(test_path) song_meta = load_json(meta_path) val_set = pd.DataFrame(val) X_songs = val_set[val_set.songs.str.len() == 0].index X_tags = val_set[val_set.tags.str.len() == 0].index # set index XX = val_set[(val_set.songs.str.len() == 0) & (val_set.tags.str.len() == 0)].index # 1749 XO = val_set[(val_set.songs.str.len() == 0) & (val_set.tags.str.len() != 0)].index # 2630 OX = val_set[(val_set.songs.str.len() != 0) & (val_set.tags.str.len() == 0)].index # 9661
def save_scores(_autoencoder_embs, _score_type, _submit_type, genre=False):
    """Score every question playlist against every train playlist and save.

    For each question-playlist embedding, computes a similarity score
    against all train-playlist embeddings ('pcc', 'cos' or 'euclidean',
    selected by ``_score_type``), keeps the 1000 best-ranked train
    playlists, and saves the ``{question_id: [(train_id, score), ...]}``
    mapping with ``np.save``.

    Args:
        _autoencoder_embs: dict mapping playlist id -> embedding vector.
        _score_type: 'pcc', 'cos' or 'euclidean'.
        _submit_type: 'val', 'test' or 'local_val'; selects the data split
            and the output file name.
        genre: when True, appends a '_gnr' suffix to the output file name.
    """
    if _submit_type == 'val':
        question_file_path = 'res/val.json'
        train_file_path = 'res/train.json'
        val_file_path = 'res/val.json'
        train_dataset = load_json(train_file_path)
    elif _submit_type == 'test':
        question_file_path = 'res/test.json'
        train_file_path = 'res/train.json'
        val_file_path = 'res/val.json'
        train_dataset = load_json(train_file_path) + load_json(val_file_path)
    elif _submit_type == 'local_val':
        default_file_path = 'arena_data'
        train_file_path = f'{default_file_path}/orig/train.json'
        question_file_path = f'{default_file_path}/questions/val.json'
        train_dataset = load_json(train_file_path)

    _train = train_dataset
    _val = load_json(question_file_path)

    def pcc(_x, _y):
        # Pearson correlation of the row vector _x against every row of _y.
        vx = _x - torch.mean(_x)
        vy = _y - torch.mean(_y, axis=1).reshape(-1, 1)
        return torch.sum(
            (vx * vy),
            axis=1) / (torch.sqrt(torch.sum(vx**2)) *
                       torch.sqrt(torch.sum((vy**2), axis=1)))

    def euclidean(_x, _y):
        # Euclidean distance of _x to every row of _y.
        # NOTE(review): the ranking below sorts scores *descending*, so for
        # 'euclidean' the farthest playlists rank first — confirm intended.
        return torch.sqrt(torch.sum((_y - _x)**2, axis=1))

    # FIX: use sets for the membership tests — the original built lists,
    # making the split loop below O(len(embs) * len(playlists)).
    all_train_ids = {plylst['id'] for plylst in _train}
    all_val_ids = {plylst['id'] for plylst in _val}

    # Split the embeddings into train / question groups, keeping ids and
    # vectors aligned by position.
    train_ids = []
    train_embs = []
    val_ids = []
    val_embs = []
    for plylst_id, emb in tqdm(_autoencoder_embs.items()):
        if plylst_id in all_train_ids:
            train_ids.append(plylst_id)
            train_embs.append(emb)
        elif plylst_id in all_val_ids:
            val_ids.append(plylst_id)
            val_embs.append(emb)

    # FIX: fall back to CPU when CUDA is unavailable, consistent with
    # get_plylsts_embeddings; the original hard-coded 'cuda'.
    gpu = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    cos = nn.CosineSimilarity(dim=1)

    train_tensor = torch.tensor(train_embs).to(gpu)
    val_tensor = torch.tensor(val_embs).to(gpu)

    scores = torch.zeros([val_tensor.shape[0], train_tensor.shape[0]],
                         dtype=torch.float64)
    sorted_idx = torch.zeros([val_tensor.shape[0], train_tensor.shape[0]],
                             dtype=torch.int32)

    # Score each question playlist against the full train matrix at once.
    for idx, val_vector in enumerate(tqdm(val_tensor)):
        if _score_type == 'pcc':
            output = pcc(val_vector.reshape(1, -1), train_tensor)
        elif _score_type == 'cos':
            output = cos(val_vector.reshape(1, -1), train_tensor)
        elif _score_type == 'euclidean':
            output = euclidean(val_vector.reshape(1, -1), train_tensor)
        index_sorted = torch.argsort(output, descending=True)
        scores[idx] = output
        sorted_idx[idx] = index_sorted

    # Keep only the 1000 best-ranked train playlists per question playlist.
    results = defaultdict(list)
    for i, val_id in enumerate(tqdm(val_ids)):
        for j, train_idx in enumerate(sorted_idx[i][:1000]):
            results[val_id].append(
                (train_ids[train_idx], scores[i][train_idx].item()))

    # np.save pickles the dict; reload with np.load(..., allow_pickle=True).
    if genre:
        if _submit_type == 'val':
            np.save(f'scores/val_scores_bias_{_score_type}_gnr', results)
        elif _submit_type == 'test':
            np.save(f'scores/test_scores_bias_{_score_type}_gnr', results)
        else:
            np.save(f'scores/local_val_scores_bias_{_score_type}_gnr',
                    results)
    else:
        if _submit_type == 'val':
            np.save(f'scores/val_scores_bias_{_score_type}', results)
        elif _submit_type == 'test':
            np.save(f'scores/test_scores_bias_{_score_type}', results)
        else:
            np.save(f'scores/local_val_scores_bias_{_score_type}', results)
# Command-line entry point: '-mode' selects which data split to process
# (0 = local validation split, 1 = public val, 2 = test).
# NOTE(review): `parser` is created earlier in the file (not visible here),
# and the elif chain may continue past this chunk with a mode-2 branch.
parser.add_argument('-mode', type=int, help="local_val: 0, val: 1, test: 2", default=2)
args = parser.parse_args()
_submit_type = args.mode
if _submit_type == 0:
    # For the local split, questions (and answers) are loaded so that
    # performance can be checked during training.
    default_file_path = 'arena_data/'
    model_postfix = 'local_val'
    train_file_path = f'{default_file_path}/orig/train.json'
    question_file_path = f'{default_file_path}/questions/val.json'
    answer_file_path = f'{default_file_path}/answers/val.json'
    train_data = load_json(train_file_path)
    question_data = load_json(question_file_path)
    # Pretrained autoencoder plus precomputed score files for this split.
    model_file_path = "model/autoencoder_450_256_0.0005_0.2_2_local_val.pkl"
    auto_score_file_path = "scores/local_val_scores_bias_cos"
    w2v_score_file_path = 'scores/local_val_scores_title_cos_24000'
elif _submit_type == 1:
    default_file_path = 'res'
    model_postfix = 'val'
    train_file_path = f'{default_file_path}/train.json'
    val_file_path = f'{default_file_path}/val.json'
    # Public val split: train on train+val; questions come from val.
    train_data = load_json(train_file_path) + load_json(val_file_path)
    question_data = load_json(val_file_path)
    model_file_path = "model/autoencoder_450_256_0.0005_0.2_2_val.pkl"
    auto_score_file_path = "scores/val_scores_bias_cos"
def get_plylsts_embeddings(_model_file_path, _submit_type, genre=False):
    """Compute an embedding for every train and question playlist.

    Loads a trained autoencoder from ``_model_file_path`` and projects each
    playlist's song/tag vector through the first encoder layer
    (``data @ W.T + b``).  When ``genre`` is True, the genre and detailed-
    genre vectors produced by the dataset are appended to each embedding.

    Args:
        _model_file_path: path to the pickled autoencoder model.
        _submit_type: 'val', 'test' or 'local_val'; selects the data split
            and the tag/song vocabulary files.
        genre: use the genre-aware dataset and append genre vectors.

    Returns:
        dict mapping playlist id (int) -> embedding vector.
    """
    if _submit_type == 'val':
        default_file_path = 'res'
        question_file_path = 'res/val.json'
        train_file_path = 'res/train.json'
        val_file_path = 'res/val.json'
        train_dataset = load_json(train_file_path)
    elif _submit_type == 'test':
        default_file_path = 'res'
        question_file_path = 'res/test.json'
        train_file_path = 'res/train.json'
        val_file_path = 'res/val.json'
        train_dataset = load_json(train_file_path) + load_json(val_file_path)
    elif _submit_type == 'local_val':
        default_file_path = 'arena_data'
        train_file_path = f'{default_file_path}/orig/train.json'
        question_file_path = f'{default_file_path}/questions/val.json'
        default_file_path = f'{default_file_path}/orig'
        train_dataset = load_json(train_file_path)

    # Vocabulary files (the id2* counterparts were assigned but never used
    # in the original; removed as dead locals).
    tag2id_file_path = f'{default_file_path}/tag2id_{_submit_type}.npy'
    prep_song2id_file_path = (
        f'{default_file_path}/freq_song2id_thr2_{_submit_type}.npy')

    dataset_cls = SongTagGenreDataset if genre else SongTagDataset
    train_dataset = dataset_cls(train_dataset, tag2id_file_path,
                                prep_song2id_file_path)
    question_dataset = dataset_cls(load_json(question_file_path),
                                   tag2id_file_path, prep_song2id_file_path)

    # Pull the first encoder layer's parameters out of the trained model.
    plylst_embed_weight = []
    plylst_embed_bias = []
    model = torch.load(_model_file_path)
    for name, param in model.named_parameters():
        if param.requires_grad:
            if name == 'encoder.1.weight':
                plylst_embed_weight = param.data
            elif name == 'encoder.1.bias':
                plylst_embed_bias = param.data

    train_loader = DataLoader(train_dataset, shuffle=True, batch_size=256,
                              num_workers=4)
    question_loader = DataLoader(question_dataset, shuffle=True,
                                 batch_size=256, num_workers=4)

    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    plylst_emb_with_bias = dict()

    def _embed_into_result(loader, desc):
        # One pass over `loader`, storing every playlist's embedding keyed
        # by its (int) playlist id.  Shared by the train/question passes and
        # the genre/non-genre variants (the original repeated this loop 4x).
        for batch in tqdm(loader, desc=desc):
            if genre:
                _id, _data, _dnr, _dtl_dnr = batch
            else:
                _id, _data = batch
            with torch.no_grad():
                _data = _data.to(device)
                output_with_bias = (
                    torch.matmul(_data, plylst_embed_weight.T) +
                    plylst_embed_bias).tolist()
            if genre:
                # Append genre / detailed-genre vectors to each embedding.
                output_with_bias = np.concatenate(
                    [output_with_bias, _dnr, _dtl_dnr], axis=1)
            _id = list(map(int, _id))
            for i in range(len(_id)):
                plylst_emb_with_bias[_id[i]] = output_with_bias[i]

    _embed_into_result(train_loader, 'get train vectors...')
    _embed_into_result(question_loader, 'get question vectors...')

    return plylst_emb_with_bias