Example 1
def make_input4tokenizer(train_file_path, genre_file_path, result_file_path, valid_file_path=None, test_file_path=None):
    def _wv_tags(tags_list):
        taS = []
        for tags in tags_list:
            taS.append(' '.join(tags))

        return taS

    def _wv_genre(genre):
        genre_dict = dict()
        for code, value in genre:
            code_num = int(code[2:])
            if not code_num % 100:
                cur_genre = value
                genre_dict[cur_genre] = []
            else:
                value = ' '.join(value.split('/'))
                genre_dict[cur_genre].append(value)
        genre_sentences = []
        for key in genre_dict:
            sub_list = genre_dict[key]
            key = ' '.join(key.split('/'))
            if not len(sub_list):
                continue
            for sub in sub_list:
                genre_sentences.append(key+' '+sub)
        return genre_sentences

    try:
        plylsts = load_json(train_file_path)
        if valid_file_path is not None:
            val_plylsts = load_json(valid_file_path)
            plylsts += val_plylsts
        if test_file_path is not None:
            test_plylsts = load_json(test_file_path)
            plylsts += test_plylsts

        genre_all = load_json(genre_file_path)
        genre_all_lists = []
        for code, gnr in genre_all.items():
        if gnr != '세부장르전체':  # skip the "all sub-genres" placeholder entry
                genre_all_lists.append([code, gnr])
        genre_all_lists = np.asarray(genre_all_lists)

        sentences = []
        for plylst in plylsts:
            tiS = plylst['plylst_title']
            taS = ' '.join(plylst['tags'])
            upS = ' '.join(plylst['updt_date'][:7].split('-'))
            sentences.append(' '.join([tiS, taS, upS]))

        geS = _wv_genre(genre_all_lists)
        sentences = sentences + geS
        with open(result_file_path, 'w', encoding='utf8') as f:
            for sentence in sentences:
                f.write(sentence+'\n')
    except Exception as e:
        import traceback
        traceback.print_exc()  # bug fix: e.with_traceback() requires a traceback argument and returns the exception rather than printing it
        return False
    return sentences
Example 2
    def run(self, train_fname, question_fname):
        print("Loading train file...")
        train = load_json(train_fname)

        print("Loading question file...")
        questions = load_json(question_fname)

        print("Writing answers...")
        answers = self._generate_answers(train, questions)
        write_json(answers, "results/results.json")
def run(tag_to_id_fname, id_to_tag_fname, train_fname, test_fname):
    print("Loading tag_to_id...")
    tag_to_id = load_json(tag_to_id_fname)
    print("Loading id_to_tag...")
    id_to_tag = load_json(id_to_tag_fname)
    print("Loading train file...")
    train_data = load_json(train_fname)
    for ply in train_data:
        ply['tags'] = [tag_to_id[tag] for tag in ply['tags']]

    print("Loading test file...")
    test_data = load_json(test_fname)
    for ply in test_data:
        ply['tags'] = [tag_to_id[tag] for tag in ply['tags']]
    # print("Writing answers...")
    # answers = self._generate_answers(song_meta_json, train_data, questions)
    # write_json(answers, "results/results.json")
    print("Make Training dataset...")

    def train_generator():
        for x in train_data:
            songs = np.zeros(707989)  # multi-hot over all songs in the corpus
            tags = np.zeros(30653)    # multi-hot over all distinct tags
            songs[x['songs']] = 1
            tags[x['tags']] = 1
            yield np.concatenate([songs, tags])

    def test_generator():
        for x in test_data:
            songs = np.zeros(707989)
            tags = np.zeros(30653)
            songs[x['songs']] = 1
            tags[x['tags']] = 1
            yield np.concatenate([songs, tags])

    training_dataset = tf.data.Dataset.from_generator(generator=train_generator, output_types=tf.float32,
                                                      output_shapes=tf.TensorShape([707989+30653])).batch(256)
    test_dataset = tf.data.Dataset.from_generator(generator=test_generator, output_types=tf.float32,
                                                  output_shapes=tf.TensorShape([707989+30653])).batch(256)

    model = AutoEncoder(intermediate_dim=128, original_dim=707989+30653)
    opt = tf.keras.optimizers.Adam(learning_rate=1e-2)
    print("Train Loop...")

    train_loop(model, opt, loss, training_dataset, 20)
    print("Predict...")
    preds = model.predict(test_dataset)  # model(...) on a tf.data.Dataset would fail; predict() iterates the batches

    pred_songs = preds[:, :707989]
    # id_to_tag was loaded from JSON, so its keys are strings; rank the tag
    # scores and keep the top 10 per playlist instead of iterating raw rows
    top_tag_ids = np.argsort(preds[:, 707989:], axis=1)[:, ::-1][:, :10]
    pred_tags = [[id_to_tag[str(idx)] for idx in row] for row in top_tag_ids]

    print(pred_songs)
    print(pred_tags)
    model.save('saved_model')
Example 4
    def run(self, song_meta_fname, train_fname, question_fname):
        print("Loading song meta...")
        song_meta_json = load_json(song_meta_fname)

        print("Loading train file...")
        train_data = load_json(train_fname)

        print("Loading question file...")
        questions = load_json(question_fname)

        print("Writing answers...")
        answers = self._generate_answers(song_meta_json, train_data, questions)
        write_json(answers, "results/results.json")
    def run(self, song_meta_fname, train_fname, question_fname, jobs=1, train_ans_fname=None):
        global NUM_CORE
        NUM_CORE = jobs

        print("Loading song meta...")
        song_meta_json = load_json(song_meta_fname)

        print("Loading train file...")
        train_data = load_json(train_fname)
        print(len(train_data))

        print("Loading question file...")
        questions = load_json(question_fname)

        print("Loading question file...")
        ans = None
        if train_ans_fname != None:
            ans = load_json(train_ans_fname)

        result_df = pd.DataFrame(
            columns=['id', 'means_music_score', 'mean_tag_score', 'mean_title_score'])

        print("Writing answers...")

        chunked_train_set = list(self.chunker_list(questions, NUM_CORE))
        print(f'running with {len(chunked_train_set)} worker processes')
        from nns_ensemble_with_artist_dist import GenreMostPopular
        import multiprocessing
        algorithm = GenreMostPopular()

        return_dict = multiprocessing.Manager().dict()
        answers_list = list()
        jobs = []
        p_idxs = []

        for p_idx, train_chunk in enumerate(chunked_train_set):
            p = multiprocessing.Process(target=algorithm._generate_answers,
                                        args=(song_meta_json, train_data, train_chunk, result_df, ans, p_idx, return_dict))
            jobs.append(p)
            p.start()
            p_idxs.append(p_idx)

        for p in jobs:
            p.join()

        answers = list()

        for p_idx in p_idxs:
            answers = answers + return_dict[p_idx]
        write_json(answers, "./cf/results/results.json")
Example 6
    def run(self, fname):
        # fix the seed so the random shuffle is reproducible
        random.seed(777)

        print("Reading data...\n")
        # load the json file
        playlists = load_json(fname)
        # shuffle the loaded playlists
        random.shuffle(playlists)
        print(f"Total playlists: {len(playlists)}")

        print("Splitting data...")
        # split
        train, val = self._split_data(playlists)

        # write the train/val splits to json files
        print("Original train...")
        write_json(train, "orig/train.json")
        print("Original val...")
        write_json(val, "orig/val.json")

        print("Masked val...")
        # mask the validation playlists
        val_q, val_a = self._mask_data(val)
        write_json(val_q, "questions/val.json")
        write_json(val_a, "answers/val.json")
    def run(self, train_fname, question_fname):
        print('set logger')
        logger = log.get_logger()
        print(logger)
        logger.set_log_level(WARN)
        print('logger set')

        print("Loading train file...")
        train = load_json(train_fname)

        print("Loading question file...")
        questions = load_json(question_fname)

        print("Writing answers...")
        answers = self._generate_answers(train, questions)
        write_json(answers, "results/results.json")
Example 8
def run(train_fname, val_fname, test_fname):
    tags = set()
    print("Reading train data...\n")
    playlists_train = load_json(train_fname)
    print("Reading val data...\n")
    playlists_val = load_json(val_fname)
    print("Reading test data...\n")
    playlists_test = load_json(test_fname)
    print("Get tags...\n")
    for ply in playlists_train + playlists_test + playlists_val:
        tags.update(ply['tags'])
    tag_list = sorted(tags)  # fix one ordering; enumerating a set twice is fragile
    tag_to_id = {tag: i for i, tag in enumerate(tag_list)}
    id_to_tag = {i: tag for i, tag in enumerate(tag_list)}
    print("Write tag_to_id.json...\n")
    write_json(tag_to_id, 'tag_to_id.json')
    print("Write id_to_tag.json...\n")
    write_json(id_to_tag, 'id_to_tag.json')
    def run(self, song_meta_fname, train_fname, question_fname):
        print("Loading song meta...")
        song_meta_json = load_json(song_meta_fname)

        print("Loading train file...")
        train_data = load_json(train_fname)

        print("Loading question file...")
        questions = load_json(question_fname)

        print("Writing answers...")
        result_df = pd.DataFrame(
            columns=['id', 'means_music_score', 'mean_tag_score', 'mean_title_score'])
        answers = self._generate_answers(song_meta_json, train_data, questions, result_df)
        result_df.to_csv('./arena_data/question_k_score.csv', index=False)

        write_json(answers, "results/results.json")
    def _eval(self, gt_fname, rec_fname):
        gt_playlists = load_json(gt_fname)  # load the ground-truth json file
        gt_dict = {g["id"]: g for g in gt_playlists}  # build an id -> playlist dict
        rec_playlists = load_json(rec_fname)  # load the generated recommendations

        gt_ids = set([g["id"] for g in gt_playlists])  # collect the ids into sets
        rec_ids = set([r["id"] for r in rec_playlists])

        if gt_ids != rec_ids:  # the two id sets must match exactly
            raise Exception("The playlist ids in the results are not valid.")

        rec_song_counts = [len(p["songs"]) for p in rec_playlists]
        rec_tag_counts = [len(p["tags"]) for p in rec_playlists]

        if set(rec_song_counts) != set([100]):  # every answer must contain exactly 100 songs
            raise Exception("The number of recommended songs is incorrect.")

        if set(rec_tag_counts) != set([10]):
            raise Exception("The number of recommended tags is incorrect.")

        rec_unique_song_counts = [len(set(p["songs"])) for p in rec_playlists]
        rec_unique_tag_counts = [len(set(p["tags"])) for p in rec_playlists]

        if set(rec_unique_song_counts) != set([100]):
            raise Exception("Duplicate song recommendations within a playlist are not allowed.")

        if set(rec_unique_tag_counts) != set([10]):
            raise Exception("Duplicate tag recommendations within a playlist are not allowed.")

        music_ndcg = 0.0
        tag_ndcg = 0.0
        recall = 0.0

        for rec in rec_playlists:
            gt = gt_dict[rec["id"]]
            music_ndcg += self._ndcg(gt["songs"], rec["songs"][:100])
            tag_ndcg += self._ndcg(gt["tags"], rec["tags"][:10])
            recall += self._recall(gt["songs"], rec["songs"][:100])
        music_ndcg = music_ndcg / len(rec_playlists)
        tag_ndcg = tag_ndcg / len(rec_playlists)
        recall = recall / len(rec_playlists)
        score = music_ndcg * 0.85 + tag_ndcg * 0.15

        return music_ndcg, tag_ndcg, recall, score
Example 11
    def _eval(self, gt_fname, rec_fname):
        gt_playlists = load_json(gt_fname)
        gt_dict = {g["id"]: g for g in gt_playlists}
        rec_playlists = load_json(rec_fname)

        gt_ids = set([g["id"] for g in gt_playlists])
        rec_ids = set([r["id"] for r in rec_playlists])

        if gt_ids != rec_ids:
            raise Exception("The playlist ids in the results do not match.")

        rec_song_counts = [len(p["songs"]) for p in rec_playlists]
        rec_tag_counts = [len(p["tags"]) for p in rec_playlists]

        if set(rec_song_counts) != set([100]):
            raise Exception("The number of recommended songs is incorrect.")

        if set(rec_tag_counts) != set([10]):
            raise Exception("The number of recommended tags is incorrect.")

        rec_unique_song_counts = [len(set(p["songs"])) for p in rec_playlists]
        rec_unique_tag_counts = [len(set(p["tags"])) for p in rec_playlists]

        if set(rec_unique_song_counts) != set([100]):
            raise Exception("Duplicate song recommendations within a playlist are not allowed.")

        if set(rec_unique_tag_counts) != set([10]):
            raise Exception("Duplicate tag recommendations within a playlist are not allowed.")
        # NDCG is a standard metric for ranking-based recommender systems:
        # it rewards placing relevant items near the top of the ranking, and
        # is widely used for search engines and music/video recommendation.
        music_ndcg = 0.0
        tag_ndcg = 0.0

        for rec in rec_playlists:
            gt = gt_dict[rec["id"]]
            music_ndcg += self._ndcg(gt["songs"], rec["songs"][:100])
            tag_ndcg += self._ndcg(gt["tags"], rec["tags"][:10])

        music_ndcg = music_ndcg / len(rec_playlists)
        tag_ndcg = tag_ndcg / len(rec_playlists)
        score = music_ndcg * 0.85 + tag_ndcg * 0.15

        return music_ndcg, tag_ndcg, score
    def _eval(self, gt_fname, rec_fname):
        gt_playlists = load_json(gt_fname)
        gt_dict = {g['id']: g for g in gt_playlists}
        rec_playlists = load_json(rec_fname)

        gt_ids = set([g['id'] for g in gt_playlists])
        rec_ids = set([r['id'] for r in rec_playlists])

        if gt_ids != rec_ids:
            raise Exception('The playlist ids in the results do not match.')

        rec_song_counts = [len(p['songs']) for p in rec_playlists]
        rec_tag_counts = [len(p['tags']) for p in rec_playlists]

        if set(rec_song_counts) != set([100]):
            raise Exception('The number of recommended songs is incorrect.')

        if set(rec_tag_counts) != set([10]):
            raise Exception('The number of recommended tags is incorrect.')

        rec_unique_song_counts = [len(set(p['songs'])) for p in rec_playlists]
        rec_unique_tag_counts = [len(set(p['tags'])) for p in rec_playlists]

        if set(rec_unique_song_counts) != set([100]):
            raise Exception('Duplicate song recommendations within a playlist are not allowed.')

        if set(rec_unique_tag_counts) != set([10]):
            raise Exception('Duplicate tag recommendations within a playlist are not allowed.')

        music_ndcg = 0.0
        tag_ndcg = 0.0

        for rec in rec_playlists:
            gt = gt_dict[rec['id']]
            music_ndcg += self._ndcg(gt['songs'], rec['songs'][:100])
            tag_ndcg += self._ndcg(gt['tags'], rec['tags'][:10])

        music_ndcg = music_ndcg / len(rec_playlists)
        tag_ndcg = tag_ndcg / len(rec_playlists)
        score = music_ndcg * 0.85 + tag_ndcg * 0.15

        return music_ndcg, tag_ndcg, score
Example 13
    def val_inference(self):
        model = models.load_model('./models/auto_encoder.h5')
        val = load_json('./arena_data/questions/val.json')
        tag_list = load_json('./arena_data/meta/AE_tag_list.json')
        result = []

        for v in tqdm(val):
            input_song_vec = np.zeros((1, 707989))
            input_tag_vec = np.zeros((1, len(tag_list)))
            predict = dict()
            predict['id'] = v['id']
            for s in v['songs']:
                input_song_vec[0][s] = 1
            for t in v['tags']:
                if t in tag_list:
                    input_tag_vec[0][tag_list.index(t)] = 1
            song_vec, tag_vec = model([input_song_vec, input_tag_vec])

            song_vec = np.array(song_vec[0])
            tag_vec = np.array(tag_vec[0])
            song_rank = song_vec.argsort()
            tag_rank = tag_vec.argsort()
            pred_songs = []
            pred_tags = []
            i = -1
            while len(pred_songs) < 100:
                if song_rank[i] not in v['songs']:
                    pred_songs.append(song_rank[i])
                i -= 1
            i = -1
            while len(pred_tags) < 10:
                if tag_list[tag_rank[i]] not in v['tags']:
                    pred_tags.append(tag_list[tag_rank[i]])
                i -= 1
            predict['songs'] = pred_songs
            predict['tags'] = pred_tags
            result.append(predict)
        write_json(result, 'AE_results.json')
Example 14
    def _init_song_meta(self):
        song_meta = load_json('res/song_meta.json')

        genre_gn_all = pd.read_json('res/genre_gn_all.json',
                                    encoding='utf8',
                                    typ='series')
        genre_gn_all = pd.DataFrame(genre_gn_all, columns=[
            'gnr_name'
        ]).reset_index().rename(columns={'index': 'gnr_code'})

        self.gnr_code, self.dtl_gnr_code = genre_gn_all_preprocessing(
            genre_gn_all)
        self.num_gnr = len(self.gnr_code)
        self.num_dtl_gnr = len(self.dtl_gnr_code)
        self.gnr_dic, self.dtl_dic, self.song_gnr_dic, self.song_dtl_dic = genre_DicGenerator(
            self.gnr_code, self.dtl_gnr_code, song_meta)
Example 15
    def _load_train(self):
        data = load_json('./arena_data/orig/train.json')
        self.train = []
        self.song_list = set()
        self.tag_list = set()
        print('train data filtering...')
        for t in tqdm(data):
            if t['like_cnt'] > 50:
                self.train.append(t)
                self.song_list.update(t['songs'])
                self.tag_list.update(t['tags'])
        self.song_list = list(self.song_list)
        self.tag_list = list(self.tag_list)
        self.total_song_num = 707989

        write_json(self.tag_list, 'meta/AE_tag_list.json')
Example 16
    def run(self, fname, train_size):
        random.seed(777)

        print('Reading data...\n')
        playlists = load_json(fname)
        random.shuffle(playlists)
        print(f'Total playlists: {len(playlists)}')

        print(f'Splitting data... train_size is {train_size}')
        train, val = self._split_data(playlists, train_size)

        print('Original train...')
        write_json(train, 'orig/train.json')
        print('Original val...')
        write_json(val, 'orig/val.json')

        print('Masked val...')
        val_q, val_a = self._mask_data(val)
        write_json(val_q, 'questions/val.json')
        write_json(val_a, 'answers/val.json')
Example 17
    def run(self, fname):
        random.seed(777)

        print("Reading data...\n")
        playlists = load_json(fname)
        random.shuffle(playlists)
        print(f"Total playlists: {len(playlists)}")

        print("Splitting data...")
        train, val = self._split_data(playlists)

        print("Original train...")
        write_json(train, "orig/train.json")
        print("Original val...")
        write_json(val, "orig/val.json")

        print("Masked val...")
        val_q, val_a = self._mask_data(val)
        write_json(val_q, "questions/val.json")
        write_json(val_a, "answers/val.json")
Example 18
def get_w2v_scores(submit_type):
    if submit_type == 'val':
        default_file_path = 'res'
        question_file_path = 'res/val.json'
        train_file_path = 'res/train.json'
    elif submit_type == 'test':
        default_file_path = 'res'
        val_file_path = 'res/val.json'
        question_file_path = 'res/test.json'
        train_file_path = 'res/train.json'
    elif submit_type == 'local_val':
        default_file_path = 'arena_data'
        train_file_path = f'{default_file_path}/orig/train.json'
        question_file_path = f'{default_file_path}/questions/val.json'
        default_file_path = f'{default_file_path}/orig'

    genre_file_path = 'res/genre_gn_all.json'

    # `method` and `vocab_size` are module-level settings defined elsewhere
    tokenize_input_file_path = f'model/tokenizer_input_{method}_{vocab_size}_{submit_type}.txt'

    if submit_type == 'local_val':
        val_file_path = None
        test_file_path = None
        train = load_json(train_file_path)
        question = load_json(question_file_path)
    elif submit_type == 'val':
        test_file_path = None
        val_file_path = question_file_path
        train = load_json(train_file_path)
        question = load_json(question_file_path)
    elif submit_type == 'test':
        # val_file_path is already 'res/val.json' from the branch above
        test_file_path = question_file_path
        train = load_json(train_file_path)
        val = load_json(val_file_path)
        test = load_json(test_file_path)
        train = train + val
        question = test

    plylst_title_tag_emb = get_plylsts_embeddings(train, question, submit_type)
    save_scores(train, question, plylst_title_tag_emb, 'cos', submit_type)
Example 19
    def run(self, fname):
        random.seed(777)

        print("Reading data...\n")
        playlists = load_json(fname)
        # playlists is a list of dicts, each holding tags, id, title, songs, like count, and update date
        random.shuffle(playlists)
        print(f"Total playlists: {len(playlists)}")

        print("Splitting data...")
        train, val = self._split_data(playlists)
        # split the playlists

        print("Original train...")
        write_json(train, "orig/train.json")
        # train.json is written fresh into the orig folder
        print("Original val...")
        write_json(val, "orig/val.json")

        print("Masked val...")
        val_q, val_a = self._mask_data(val)  # mask the validation playlists
        write_json(val_q, "questions/val.json")  # questions go to the questions folder, answers to the answers folder
        write_json(val_a, "answers/val.json")
Example 20
def train():
    MODE = "Test"
    if MODE == "Valid":
        train = load_json("arena_data/orig/train.json") + load_json(
            "arena_data/questions/val.json")
        dev = load_json("res/val.json")
        test = load_json("res/test.json")
    else:
        train = load_json("res/train.json")
        dev = load_json("res/val.json")
        test = load_json("res/test.json")

    pred_tag = load_json("arena_data/model/pred_tag.json")
    dic_pred_tag = {}
    for p_t in pred_tag:
        dic_pred_tag[p_t['id']] = p_t['predict_tag']

    for doc in train:
        doc['tags_org'] = doc['tags'].copy()
        doc['tags'] += dic_pred_tag[doc['id']]

    for doc in dev:
        doc['tags_org'] = doc['tags'].copy()
        doc['tags'] += dic_pred_tag[doc['id']]

    for doc in test:
        doc['tags_org'] = doc['tags'].copy()
        doc['tags'] += dic_pred_tag[doc['id']]

    item_list = []
    len_item = []

    for doc in train + dev + test:
        song_list = []
        for i in doc['songs']:
            song_list.append(str(i))
        item_list.append(song_list + doc['tags'])
        len_item.append(len(song_list + doc['tags']))
    print("Max length of item list :", max(len_item), ", Min :", min(len_item))
    item_list = [x for x in item_list if len(x) > 1]
    print("Train set :", len(item_list))

    print("Training Item2Vec model")
    SIZE = 100
    # gensim 3.x API: `size` and `iter` became vector_size/epochs in gensim 4
    model = Word2Vec(sentences=item_list,
                     size=SIZE,
                     window=240,
                     min_count=2,
                     sg=1,
                     workers=8,
                     iter=10,
                     negative=7,
                     compute_loss=True,
                     callbacks=[LossPrinter()])
    model.save("arena_data/model/word2vec.model")
    print("Vocab : ", len(model.wv.vocab))

    print("Building and saving playlist embeddings")
    song_dic = {}
    tag_dic = {}
    for q in tqdm(train + test + dev):
        song_dic[str(q['id'])] = q['songs']
        tag_dic[str(q['id'])] = q['tags_org']

    p2v_song = WordEmbeddingsKeyedVectors(SIZE)
    ID = []
    vec = []
    for q in tqdm(train + test + dev):
        tmp_vec = 0
        cnt_vocab = 0
        if len(q['songs']) >= 1:
            for item in q['songs']:
                try:
                    tmp_vec += model.wv.get_vector(str(item)) * 2
                    cnt_vocab += 1
                except KeyError:
                    pass
        if len(q['tags']) >= 1:
            for item in q['tags']:
                try:
                    tmp_vec += model.wv.get_vector(str(item))
                    cnt_vocab += 1
                except KeyError:
                    pass
        if type(tmp_vec) != int:  # tmp_vec is still the int 0 when nothing was in the vocabulary
            ID.append(str(q['id']))
            vec.append(tmp_vec)
    p2v_song.add(ID, vec)
    p2v_song.save("arena_data/model/p2v_song.model")

    p2v_tag = WordEmbeddingsKeyedVectors(SIZE)
    ID = []
    vec = []
    for q in tqdm(train + test + dev):
        tmp_vec = 0
        cnt_vocab = 0
        if len(q['songs']) >= 1:
            for item in q['songs']:
                try:
                    tmp_vec += model.wv.get_vector(str(item))
                    cnt_vocab += 1
                except KeyError:
                    pass
        if len(q['tags']) >= 1:
            for item in q['tags']:
                try:
                    tmp_vec += model.wv.get_vector(str(item)) * 2
                    cnt_vocab += 1
                except KeyError:
                    pass
        if type(tmp_vec) != int:
            ID.append(str(q['id']))
            vec.append(tmp_vec)
    p2v_tag.add(ID, vec)
    p2v_tag.save("arena_data/model/p2v_tag.model")

    if MODE == "Valid":
        print("Testing")
        questions = load_json("arena_data/questions/val.json")
        cnt_wv_song = 0
        cnt_wv_tag = 0
        res = []
        for q in tqdm(questions):
            dic_song_score = {}
            dic_tag_score = {}

            song_result = []
            tag_result = []

            if str(q['id']) in p2v_song.wv.vocab:
                most_id = [
                    x for x in p2v_song.most_similar(str(q['id']), topn=50)
                ]
                for ID in most_id:
                    for s in song_dic[ID[0]]:
                        if s in dic_song_score:
                            dic_song_score[s] += ID[1]
                        else:
                            dic_song_score[s] = ID[1]

            if str(q['id']) in p2v_tag.wv.vocab:
                most_id = [
                    x for x in p2v_tag.most_similar(str(q['id']), topn=50)
                ]
                # bug fix: iterate the retrieved neighbors; the original reused
                # a stale ID left over from the song branch above
                for ID in most_id:
                    for t in tag_dic[ID[0]]:
                        if t in dic_tag_score:
                            dic_tag_score[t] += ID[1]
                        else:
                            dic_tag_score[t] = ID[1]

            if len(dic_song_score) > 0:
                sort_song_score = sorted(dic_song_score.items(),
                                         key=lambda x: x[1],
                                         reverse=True)

                for s in sort_song_score:
                    song_result.append(s[0])
                cnt_wv_song += 1

            if len(dic_tag_score) > 0:
                sort_tag_score = sorted(dic_tag_score.items(),
                                        key=lambda x: x[1],
                                        reverse=True)

                for s in sort_tag_score:
                    tag_result.append(s[0])
                cnt_wv_tag += 1

            res.append({
                "id": q["id"],
                "songs": remove_seen(q["songs"], song_result)[:100],
                "tags": remove_seen(q["tags"], tag_result)[:10],
            })

        print(len(questions), cnt_wv_song, cnt_wv_tag)

        ans = load_json("arena_data/answers/val.json")
        evaluator = CustomEvaluator()
        evaluator._evaluate(ans, res)
Example 21
    freq_thr = args.freq_thr
    mode = args.mode

    # load the train dataset and related files according to mode
    question_data = None
    question_dataset = None
    answer_file_path = None
    if mode == 0:  # for the local split, also load questions/answers to check performance during training
        default_file_path = 'arena_data/'
        model_postfix = 'local_val'

        train_file_path = f'{default_file_path}/orig/train.json'
        question_file_path = f'{default_file_path}/questions/val.json'
        answer_file_path = f'{default_file_path}/answers/val.json'

        train_data = load_json(train_file_path)
        question_data = load_json(question_file_path)

    elif mode == 1:
        default_file_path = 'res'
        model_postfix = 'val'

        train_file_path = f'{default_file_path}/train.json'
        val_file_path = f'{default_file_path}/val.json'
        train_data = load_json(train_file_path) + load_json(val_file_path)

    elif mode == 2:
        default_file_path = 'res'
        model_postfix = 'test'

        train_file_path = f'{default_file_path}/train.json'
Example 22
from arena_util import load_json, write_json
import pandas as pd
import matplotlib.pyplot as plt

res_val = load_json("res/val.json")
res_val = pd.DataFrame(res_val)

res_val['ply_len'] = res_val['songs'].apply(len)

# exploratory, notebook-style cells: inspect the playlist length distribution
res_val[(res_val['ply_len'] > 0) & (res_val['ply_len'] < 4)]

res_val['ply_len'] > 0

len(res_val)

res_val['ply_len'].describe()  # the original `.d()` is not a pandas method; describe() is the likely intent

plt.hist(res_val[res_val['ply_len'] != 0]['ply_len'])


Example 23
import pandas as pd

from neighbor import Neighbor
from knn import KNN
from title_to_Tag import Title_to_tag

from data_util import *
from arena_util import load_json, write_json

### 1. data & preprocessing
### 1.1 load data
song_meta_path = 'res/song_meta.json'
train_path = 'res/train.json'
val_path = 'res/test.json'  # note: the test split is used as the validation set here

song_meta = load_json(song_meta_path)
train = load_json(train_path)

song_meta = pd.DataFrame(song_meta)
train = pd.DataFrame(train)

### 1.2 convert title-only playlists to tags
val = Title_to_tag(train_path=train_path, val_path=val_path).change()

### 1.3 convert "tag" to "tag_id"
tag_to_id, id_to_tag = tag_id_meta(train, val)
train = convert_tag_to_id(train, tag_to_id)
val = convert_tag_to_id(val, tag_to_id)

### 2. modeling : Neighbor
### 2.1 hyperparameters: pow_alpha, pow_beta
Example 24
        self.ply_embedding = nn.Linear(self.songs_len, self.ply_d)

        # reconstructs the song space; the original mixed `song_len` and
        # `songs_len`, which must refer to the same dimension
        self.decoder = nn.Linear(self.meta_d + self.ply_d, self.songs_len)

    def forward(self, ply, meta):
        ply_embed = self.activation(self.ply_embedding(ply))
        meta_embed = self.activation(self.meta_embedding(meta))
        latent = torch.cat((ply_embed, meta_embed), dim=-1)
        ply_recon = self.decoder(latent)

        return torch.sigmoid(ply_recon)


# train = load_json("arena_data/orig/train.json")
# val = load_json("arena_data/questions/val.json")
train = load_json("res/train.json")
val = load_json("res/val.json")

data = train + val


# count train songs. filter under 150
def song_count_filter(data, over_n):
    counter = Counter()

    for ply in data:
        counter.update(ply['songs'])

    song_valid = set(
        [song_id for song_id, cnt in counter.items() if cnt >= over_n])
    print(f"song_count_filter\n- song_valid length: {len(song_valid)}")
Example 25
def infer(MODE="Test"):
    mode_opt = {
        "Valid": {
            "train_path": "arena_data/orig/train.json",
            "test_path": "arena_data/questions/val.json",
            "results_path": "cf2/val/results.json",
            "eval": True
        },
        "Dev": {
            "train_path": "res/train.json",
            "test_path": "res/val.json",
            "results_path": "cf2/dev/results.json",
            "eval": False
        },
        "Test": {
            "train_path": "res/train.json",
            "test_path": "res/test.json",
            "results_path": "cf2/test/results.json",
            "eval": False
        }
    }
    opt = mode_opt[MODE]

    train = pd.read_json(opt["train_path"])
    test = pd.read_json(opt["test_path"])

    if MODE != "Dev":
        dev = pd.read_json("res/val.json")

    if MODE != "Test":
        test_res = pd.read_json("res/test.json")

    print("Preprocessing dates")
    test_date = {}
    for i in tqdm(test.index):
        test_date[test.at[i, 'id']] = test.at[i, 'updt_date']

    song_meta = pd.read_json("res/song_meta.json")

    song_date = {}
    for i in tqdm(song_meta.index):
        song_date[song_meta.at[i, "id"]] = str(song_meta.at[i, "issue_date"])

    del song_meta

    song_update_date = []
    for i in train.index:
        updt_date = train.loc[i, 'updt_date'][:4] + train.loc[
            i, 'updt_date'][5:7] + train.loc[i, 'updt_date'][8:10]
        for t in train.loc[i, 'songs']:
            if song_date[t] > updt_date:
                song_date[t] = updt_date
                song_update_date.append(t)
    for i in test.index:
        updt_date = test.loc[i, 'updt_date'][:4] + test.loc[
            i, 'updt_date'][5:7] + test.loc[i, 'updt_date'][8:10]
        for t in test.loc[i, 'songs']:
            if song_date[t] > updt_date:
                song_date[t] = updt_date
                song_update_date.append(t)
    if MODE != "Dev":
        for i in dev.index:
            updt_date = dev.loc[i, 'updt_date'][:4] + dev.loc[
                i, 'updt_date'][5:7] + dev.loc[i, 'updt_date'][8:10]
            for t in dev.loc[i, 'songs']:
                if song_date[t] > updt_date:
                    song_date[t] = updt_date
                    song_update_date.append(t)
    if MODE != "Test":
        for i in test_res.index:
            updt_date = test_res.loc[i, 'updt_date'][:4] + test_res.loc[
                i, 'updt_date'][5:7] + test_res.loc[i, 'updt_date'][8:10]
            for t in test_res.loc[i, 'songs']:
                if song_date[t] > updt_date:
                    song_date[t] = updt_date
                    song_update_date.append(t)
    print("The number of processed songs :", len(set(song_update_date)))

    # Loading tags extracted from titles
    pred_tag = load_json("arena_data/model/pred_tag.json")

    dic_pred_tag = {}
    for p_t in pred_tag:
        dic_pred_tag[p_t['id']] = p_t['predict_tag']

    train['tags_org'] = train['tags']
    for i in train.index:
        train.at[i,
                 'tags'] = train.at[i, 'tags'] + dic_pred_tag[train.at[i,
                                                                       'id']]

    test['tags_org'] = test['tags']
    for i in test.index:
        test.at[i,
                'tags'] = test.at[i, 'tags'] + dic_pred_tag[test.at[i, 'id']]

    if MODE != "Dev":
        dev['tags_org'] = dev['tags']
        for i in dev.index:
            dev.at[i,
                   'tags'] = dev.at[i, 'tags'] + dic_pred_tag[dev.at[i, 'id']]

    if MODE != "Test":
        test_res['tags_org'] = test_res['tags']
        for i in test_res.index:
            test_res.at[i, 'tags'] = test_res.at[i, 'tags'] + dic_pred_tag[
                test_res.at[i, 'id']]

    # Calculating IDF
    inv_doc_freq = {}
    for d in train['songs'] + train['tags']:
        for i in d:
            if i in inv_doc_freq:
                inv_doc_freq[i] += 1
            else:
                inv_doc_freq[i] = 1

    for d in test['songs'] + test['tags']:
        for i in d:
            if i in inv_doc_freq:
                inv_doc_freq[i] += 1
            else:
                inv_doc_freq[i] = 1

    if MODE != "Dev":
        for d in dev['songs'] + dev['tags']:
            for i in d:
                if i in inv_doc_freq:
                    inv_doc_freq[i] += 1
                else:
                    inv_doc_freq[i] = 1

    if MODE != "Test":
        for d in test_res['songs'] + test_res['tags']:
            for i in d:
                if i in inv_doc_freq:
                    inv_doc_freq[i] += 1
                else:
                    inv_doc_freq[i] = 1

    for k in inv_doc_freq:
        if MODE == "Valid":
            inv_doc_freq[k] = math.log10(
                (len(train) + len(test) + len(dev) + len(test_res)) /
                inv_doc_freq[k])
        elif MODE == "Dev":
            inv_doc_freq[k] = math.log10(
                (len(train) + len(test) + len(test_res)) / inv_doc_freq[k])
        else:
            inv_doc_freq[k] = math.log10(
                (len(train) + len(test) + len(dev)) / inv_doc_freq[k])

    # Preprocessing data for CF matrix
    if MODE == "Valid":
        n_train = len(train) + len(dev) + len(test_res)
    elif MODE == "Dev":
        n_train = len(train) + len(test_res)
    else:
        n_train = len(train) + len(dev)
    n_test = len(test)

    # train + test
    if MODE == "Valid":
        plylst = pd.concat([train, dev, test_res, test], ignore_index=True)
    elif MODE == "Dev":
        plylst = pd.concat([train, test_res, test], ignore_index=True)
    else:
        plylst = pd.concat([train, dev, test], ignore_index=True)

    # playlist id
    plylst["nid"] = range(n_train + n_test)

    # nid -> id
    plylst_nid_id = dict(zip(plylst["nid"], plylst["id"]))

    plylst_tag = plylst['tags']
    tag_counter = Counter([tg for tgs in plylst_tag for tg in tgs])
    tag_dict = {x: tag_counter[x] for x in tag_counter}

    id_type = dict()

    tag_id_tid = dict()
    tag_tid_id = dict()
    for i, t in enumerate(tag_dict):
        tag_id_tid[t] = i
        tag_tid_id[i] = t
        id_type[t] = 1

    n_tags = len(tag_dict)

    plylst_song = plylst['songs']
    song_counter = Counter([sg for sgs in plylst_song for sg in sgs])
    song_dict = {x: song_counter[x] for x in song_counter}

    song_id_sid = dict()
    song_sid_id = dict()
    for i, t in enumerate(song_dict):
        song_id_sid[t] = i
        song_sid_id[i] = t
        id_type[t] = 1

    n_songs = len(song_dict)

    plylst_st = plylst['songs'] + plylst['tags']
    st_counter = Counter([st for sts in plylst_st for st in sts])
    st_dict = {x: st_counter[x] for x in st_counter}

    st_id_tid = dict()
    st_tid_id = dict()
    for i, t in enumerate(st_dict):
        st_id_tid[t] = i
        st_tid_id[i] = t

    n_sts = len(st_dict)

    print("Tags : ", n_tags, ", Songs : ", n_songs, ", Total : ", n_sts)

    plylst['songs_id'] = plylst['songs'].map(
        lambda x:
        [song_id_sid.get(s) for s in x if song_id_sid.get(s) is not None])
    plylst['tags_id'] = plylst['tags_org'].map(
        lambda x: [tag_id_tid.get(t) for t in x if tag_id_tid.get(t) is not None])
    plylst['sts_id'] = (plylst['songs'] + plylst['tags']).map(
        lambda x: [st_id_tid.get(st) for st in x if st_id_tid.get(st) is not None])

    plylst_use = plylst[['nid', 'updt_date', 'songs_id', 'tags_id', 'sts_id']]
    plylst_use.loc[:, 'num_songs'] = plylst_use['songs_id'].map(len)
    plylst_use.loc[:, 'num_tags'] = plylst_use['tags_id'].map(len)
    plylst_use.loc[:, 'num_sts'] = plylst_use['sts_id'].map(len)
    plylst_use = plylst_use.set_index('nid')

    plylst_train = plylst_use.iloc[:, :]  # note: the CF matrices are built over all playlists (train + test)
    plylst_test = plylst_use.iloc[n_train:, :]

    n_train = len(plylst_train)

    np.random.seed(33)
    test_set = plylst_test
    print("The number of test samples : ", len(test_set))

    # Building CF matrices
    avg_len_songs = 0
    for songs in plylst_train['songs_id']:
        avg_len_songs += len(songs)
    avg_len_songs /= len(plylst_train['songs_id'])

    avg_len_tags = 0
    for tags in plylst_train['tags_id']:
        avg_len_tags += len(tags)
    avg_len_tags /= len(plylst_train['tags_id'])

    avg_len_sts = 0
    for sts in plylst_train['sts_id']:
        avg_len_sts += len(sts)
    avg_len_sts /= len(plylst_train['sts_id'])

    row = np.repeat(range(n_train), plylst_train['num_songs'])
    col = [song for songs in plylst_train['songs_id'] for song in songs]
    dat = [1 for songs in plylst_train['songs_id'] for song in songs]
    train_songs_A = spr.csr_matrix((dat, (row, col)), shape=(n_train, n_songs))

    row = np.repeat(range(n_train), plylst_train['num_tags'])
    col = [tag for tags in plylst_train['tags_id'] for tag in tags]
    dat = [1 for tags in plylst_train['tags_id'] for tag in tags]
    train_tags_A = spr.csr_matrix((dat, (row, col)), shape=(n_train, n_tags))

    row = np.repeat(range(n_train), plylst_train['num_sts'])
    col = [st for sts in plylst_train['sts_id'] for st in sts]
    dat = [
        inv_doc_freq[st_tid_id[st]] / (len(sts) + 50)
        for sts in plylst_train['sts_id'] for st in sts
    ]
    train_sts_A = spr.csr_matrix((dat, (row, col)), shape=(n_train, n_sts))

    train_songs_A_T = train_songs_A.T.tocsr()
    train_tags_A_T = train_tags_A.T.tocsr()

    # Building map playlist id to songs or tags for playlist2vec
    if MODE == "Valid":
        p2v_targets = [train, test, dev, test_res]
    elif MODE == "Dev":
        p2v_targets = [train, test, test_res]
    else:
        p2v_targets = [train, test, dev]

    song_dic = {}
    tag_dic = {}
    for i, q in tqdm(pd.concat(p2v_targets).iterrows()):
        song_dic[str(q['id'])] = q['songs']
        tag_dic[str(q['id'])] = q['tags_org']

    # Loading playlist embedding vectors
    p2v_song = WordEmbeddingsKeyedVectors.load(
        "arena_data/model/p2v_song.model")
    p2v_tag = WordEmbeddingsKeyedVectors.load("arena_data/model/p2v_tag.model")

    print("Predicting")
    res = []
    filtered_lot_song = []
    filtered_lot_tag = []
    for pid in tqdm(test_set.index):
        songs_already = test_set.loc[pid, "songs_id"]
        tags_already = test_set.loc[pid, "tags_id"]

        # Song prediction - 1. Query vector to predict songs
        p = np.zeros((n_sts, 1))
        if len(test_set.loc[pid, 'sts_id']) > 0:
            for st in test_set.loc[pid, 'sts_id']:
                if st_tid_id[st] in inv_doc_freq:
                    p[st] = inv_doc_freq[st_tid_id[st]] / (
                        len(test_set.loc[pid, 'sts_id']) + 50)

        # Song prediction - 2. K-nn playlists
        val = train_sts_A.dot(p).reshape(-1)

        val_idx = val.reshape(-1).argsort()[-250:][::-1]

        val_knn = np.zeros((n_train))
        val_knn[val_idx] = val[val_idx]

        val = val_knn**2

        # Song prediction - 3. Candidates
        cand_song = train_songs_A_T.dot(val)

        # Song prediction - 4. Rescoring using playlist2vec
        dic_song_score = {}
        if str(plylst_nid_id[pid]) in p2v_song.wv.vocab:
            most_id = [
                x for x in p2v_song.most_similar(str(plylst_nid_id[pid]),
                                                 topn=50)
            ]
            for ID in most_id:
                for s in song_dic[ID[0]]:
                    if s in dic_song_score:
                        dic_song_score[s] += ID[1]
                    else:
                        dic_song_score[s] = ID[1]

        for k in dic_song_score:
            cand_song[song_id_sid[k]] *= dic_song_score[k]**0.2

        cand_song_idx = cand_song.reshape(-1).argsort()[-5000:][::-1]

        # Song prediction - 5. Filtering by score and date
        cand_song_idx_filtered = []
        for cand in cand_song_idx:
            updt = test_date[plylst_nid_id[pid]]
            updt_ymd = updt[:4] + updt[5:7] + updt[8:10]  # "YYYY-MM-DD ..." -> "YYYYMMDD"
            if cand_song[cand] > 0 and song_date[song_sid_id[cand]] <= updt_ymd:
                cand_song_idx_filtered.append(cand)
        if len(cand_song_idx_filtered) < 400:
            filtered_lot_song.append(len(cand_song_idx_filtered))
        cand_song_idx = np.array(cand_song_idx_filtered)

        # Song prediction - 6. Rescoring using heuristics
        dict_score = {}
        for idx in cand_song_idx:
            dict_score[idx] = cand_song[idx]

        mean_doc_freq = 0
        std_doc_freq = 0
        list_doc_freq = []
        mean_song_date = 0
        list_song_date = []
        if len(test_set.loc[pid, "songs_id"]) > 0:
            for t in test_set.loc[pid, "songs_id"]:
                if song_sid_id[t] in inv_doc_freq:
                    list_doc_freq.append(inv_doc_freq[song_sid_id[t]])
                song_d = int(song_date[song_sid_id[t]])
                if song_d > 19000000 and song_d < 20210000:
                    list_song_date.append(song_d)
            if len(list_doc_freq) > 0:
                mean_doc_freq = np.mean(list_doc_freq)
                std_doc_freq = np.std(list_doc_freq)
            if len(list_song_date) > 0:
                mean_song_date = np.mean(list_song_date)

        # Song prediction - 6-1. Rescoring by IDF comparison
        if len(list_doc_freq) > 0:
            for c in dict_score:
                if song_sid_id[c] in inv_doc_freq:
                    dict_score[c] = 1 / (
                        len(list_doc_freq)**0.5) * dict_score[c] + (
                            1 - 1 /
                            (len(list_doc_freq)**0.5)) * dict_score[c] * 2 / (
                                np.abs(inv_doc_freq[song_sid_id[c]] -
                                       mean_doc_freq) / (std_doc_freq + 1) + 2)
                else:
                    dict_score[c] = 1 / (len(list_doc_freq)**
                                         0.5) * dict_score[c]

        # Song prediction - 6-2. Rescoring by Date comparison
        if len(list_song_date) > 0:
            for c in dict_score:
                song_d = int(song_date[song_sid_id[c]])
                if song_d > 19000000 and song_d < 20210000:
                    dict_score[c] = 1 / (
                        len(list_song_date)**0.5) * dict_score[c] + (
                            1 - 1 /
                            (len(list_song_date)**0.5)) * dict_score[c] / (
                                np.abs(song_d - mean_song_date) / 500000 + 1)
                else:
                    dict_score[c] = 1 / (len(list_song_date)**
                                         0.5) * dict_score[c]

        score_sorted = sorted(dict_score.items(),
                              key=lambda x: x[1],
                              reverse=True)

        cand_song_idx = []
        for t in score_sorted:
            cand_song_idx.append(t[0])
        cand_song_idx = np.array(cand_song_idx)

        cand_song_idx = cand_song_idx[np.isin(cand_song_idx, songs_already) ==
                                      False][:300]
        rec_song_idx = [song_sid_id[i] for i in cand_song_idx]

        # Tag prediction - 1. Query vector to predict tags
        p = np.zeros((n_sts, 1))
        p[test_set.loc[pid, 'sts_id']] = 1

        # Tag prediction - 2. K-nn playlists
        val = train_sts_A.dot(p).reshape(-1)

        val_idx = val.reshape(-1).argsort()[-250:][::-1]

        val_knn = np.zeros((n_train))
        val_knn[val_idx] = val[val_idx]

        val = val_knn**2

        # Tag prediction - 3. Candidates
        cand_tag = train_tags_A_T.dot(val)

        # Tag prediction - 4. Rescoring using playlist2vec
        dic_tag_score = {}
        if str(plylst_nid_id[pid]) in p2v_tag.wv.vocab:
            most_id = [
                x
                for x in p2v_tag.most_similar(str(plylst_nid_id[pid]), topn=50)
            ]
            for ID in most_id:
                for t in tag_dic[ID[0]]:
                    if t in dic_tag_score:
                        dic_tag_score[t] += ID[1]
                    else:
                        dic_tag_score[t] = ID[1]

        for k in dic_tag_score:
            cand_tag[tag_id_tid[k]] *= dic_tag_score[k]**0.5

        cand_tag_idx = cand_tag.reshape(-1).argsort()[-35:][::-1]

        # Tag prediction - 5. Filtering by score
        cand_tag_idx_filtered = []
        for cand in cand_tag_idx:
            if cand_tag[cand] > 0:
                cand_tag_idx_filtered.append(cand)
        if len(cand_tag_idx_filtered) != 35:
            filtered_lot_tag.append(len(cand_tag_idx_filtered))
        cand_tag_idx = np.array(cand_tag_idx_filtered)

        cand_tag_idx = cand_tag_idx[np.isin(cand_tag_idx, tags_already) ==
                                    False][:30]
        rec_tag_idx = [tag_tid_id[i] for i in cand_tag_idx]

        res.append({
            "id": plylst_nid_id[pid],
            "songs": rec_song_idx,
            "tags": rec_tag_idx
        })

    print(len(filtered_lot_song), filtered_lot_song)
    print(len(filtered_lot_tag), filtered_lot_tag)

    write_json(res, "results/" + opt["results_path"])

    if opt["eval"]:
        evaluator = CustomEvaluator()
        evaluator.evaluate("arena_data/answers/val.json",
                           "arena_data/results/" + opt["results_path"])
    def merge_trains(self, train_fnames, output_fname):
        merged_train = []
        for train_fname in train_fnames:
            merged_train += load_json(train_fname)

        write_json(merged_train, output_fname)
Example 27
    return val


if __name__ == '__main__':

    # paths
    train_path = "res/train.json"  # original train file
    val_path = "res/val.json"  # original validation file
    test_path = "res/test.json"  # original test file
    meta_path = "res/song_meta.json"  # song_meta.json
    s2v_path = "pretrained/tvt_s2v.model"  # train, valid, test song embedding model
    cluster_path = "pretrained/tvt_500c_s2v_khaiii.pkl"  # train, valid, test 500 cluster model

    # load data
    train = load_json(train_path)
    val = load_json(val_path)
    test = load_json(test_path)
    song_meta = load_json(meta_path)

    val_set = pd.DataFrame(val)
    X_songs = val_set[val_set.songs.str.len() == 0].index
    X_tags = val_set[val_set.tags.str.len() == 0].index

    # set index
    XX = val_set[(val_set.songs.str.len() == 0)
                 & (val_set.tags.str.len() == 0)].index  # 1749
    XO = val_set[(val_set.songs.str.len() == 0)
                 & (val_set.tags.str.len() != 0)].index  # 2630
    OX = val_set[(val_set.songs.str.len() != 0)
                 & (val_set.tags.str.len() == 0)].index  # 9661
Example 28
def save_scores(_autoencoder_embs, _score_type, _submit_type, genre=False):
    if _submit_type == 'val':
        question_file_path = 'res/val.json'
        train_file_path = 'res/train.json'
        val_file_path = 'res/val.json'
        train_dataset = load_json(train_file_path)
    elif _submit_type == 'test':
        question_file_path = 'res/test.json'
        train_file_path = 'res/train.json'
        val_file_path = 'res/val.json'
        train_dataset = load_json(train_file_path) + load_json(val_file_path)
    elif _submit_type == 'local_val':
        default_file_path = 'arena_data'
        train_file_path = f'{default_file_path}/orig/train.json'
        question_file_path = f'{default_file_path}/questions/val.json'
        train_dataset = load_json(train_file_path)

    _train = train_dataset
    _val = load_json(question_file_path)

    def pcc(_x, _y):
        vx = _x - torch.mean(_x)
        vy = _y - torch.mean(_y, axis=1).reshape(-1, 1)
        return torch.sum(
            (vx * vy), axis=1) / (torch.sqrt(torch.sum(vx**2)) *
                                  torch.sqrt(torch.sum((vy**2), axis=1)))

    def euclidean(_x, _y):
        return torch.sqrt(torch.sum((_y - _x)**2, axis=1))

    all_train_ids = {plylst['id'] for plylst in _train}  # sets make the membership tests below O(1)
    all_val_ids = {plylst['id'] for plylst in _val}

    train_ids = []
    train_embs = []
    val_ids = []
    val_embs = []

    for plylst_id, emb in tqdm(_autoencoder_embs.items()):
        if plylst_id in all_train_ids:
            train_ids.append(plylst_id)
            train_embs.append(emb)
        elif plylst_id in all_val_ids:
            val_ids.append(plylst_id)
            val_embs.append(emb)

    gpu = torch.device('cuda')
    cos = nn.CosineSimilarity(dim=1)

    train_tensor = torch.tensor(train_embs).to(gpu)
    val_tensor = torch.tensor(val_embs).to(gpu)

    scores = torch.zeros([val_tensor.shape[0], train_tensor.shape[0]],
                         dtype=torch.float64)
    sorted_idx = torch.zeros([val_tensor.shape[0], train_tensor.shape[0]],
                             dtype=torch.int32)

    for idx, val_vector in enumerate(tqdm(val_tensor)):
        if _score_type == 'pcc':
            output = pcc(val_vector.reshape(1, -1), train_tensor)
        elif _score_type == 'cos':
            output = cos(val_vector.reshape(1, -1), train_tensor)
        elif _score_type == 'euclidean':
            output = euclidean(val_vector.reshape(1, -1), train_tensor)
        index_sorted = torch.argsort(output, descending=True)
        scores[idx] = output
        sorted_idx[idx] = index_sorted

    results = defaultdict(list)
    for i, val_id in enumerate(tqdm(val_ids)):
        for j, train_idx in enumerate(sorted_idx[i][:1000]):
            results[val_id].append(
                (train_ids[train_idx], scores[i][train_idx].item()))
    if genre:
        if _submit_type == 'val':
            np.save(f'scores/val_scores_bias_{_score_type}_gnr', results)
        elif _submit_type == 'test':
            np.save(f'scores/test_scores_bias_{_score_type}_gnr', results)
        else:
            np.save(f'scores/local_val_scores_bias_{_score_type}_gnr', results)
    else:
        if _submit_type == 'val':
            np.save(f'scores/val_scores_bias_{_score_type}', results)
        elif _submit_type == 'test':
            np.save(f'scores/test_scores_bias_{_score_type}', results)
        else:
            np.save(f'scores/local_val_scores_bias_{_score_type}', results)
Example 29
    parser.add_argument('-mode',
                        type=int,
                        help="local_val: 0, val: 1, test: 2",
                        default=2)
    args = parser.parse_args()
    _submit_type = args.mode

    if _submit_type == 0:  # for the local split, load questions/answers to check performance during training
        default_file_path = 'arena_data/'
        model_postfix = 'local_val'

        train_file_path = f'{default_file_path}/orig/train.json'
        question_file_path = f'{default_file_path}/questions/val.json'
        answer_file_path = f'{default_file_path}/answers/val.json'

        train_data = load_json(train_file_path)
        question_data = load_json(question_file_path)
        model_file_path = "model/autoencoder_450_256_0.0005_0.2_2_local_val.pkl"
        auto_score_file_path = "scores/local_val_scores_bias_cos"
        w2v_score_file_path = 'scores/local_val_scores_title_cos_24000'

    elif _submit_type == 1:
        default_file_path = 'res'
        model_postfix = 'val'

        train_file_path = f'{default_file_path}/train.json'
        val_file_path = f'{default_file_path}/val.json'
        train_data = load_json(train_file_path) + load_json(val_file_path)
        question_data = load_json(val_file_path)
        model_file_path = "model/autoencoder_450_256_0.0005_0.2_2_val.pkl"
        auto_score_file_path = "scores/val_scores_bias_cos"
Example 30
def get_plylsts_embeddings(_model_file_path, _submit_type, genre=False):
    if _submit_type == 'val':
        default_file_path = 'res'
        question_file_path = 'res/val.json'
        train_file_path = 'res/train.json'
        val_file_path = 'res/val.json'
        train_dataset = load_json(train_file_path)
    elif _submit_type == 'test':
        default_file_path = 'res'
        question_file_path = 'res/test.json'
        train_file_path = 'res/train.json'
        val_file_path = 'res/val.json'
        train_dataset = load_json(train_file_path) + load_json(val_file_path)
    elif _submit_type == 'local_val':
        default_file_path = 'arena_data'
        train_file_path = f'{default_file_path}/orig/train.json'
        question_file_path = f'{default_file_path}/questions/val.json'
        default_file_path = f'{default_file_path}/orig'
        train_dataset = load_json(train_file_path)

    tag2id_file_path = f'{default_file_path}/tag2id_{_submit_type}.npy'
    id2tag_file_path = f'{default_file_path}/id2tag_{_submit_type}.npy'
    prep_song2id_file_path = f'{default_file_path}/freq_song2id_thr2_{_submit_type}.npy'
    id2prep_song_file_path = f'{default_file_path}/id2freq_song_thr2_{_submit_type}.npy'

    if genre:
        train_dataset = SongTagGenreDataset(train_dataset, tag2id_file_path,
                                            prep_song2id_file_path)
        question_dataset = SongTagGenreDataset(load_json(question_file_path),
                                               tag2id_file_path,
                                               prep_song2id_file_path)
    else:
        train_dataset = SongTagDataset(train_dataset, tag2id_file_path,
                                       prep_song2id_file_path)
        question_dataset = SongTagDataset(load_json(question_file_path),
                                          tag2id_file_path,
                                          prep_song2id_file_path)

    plylst_embed_weight = []
    plylst_embed_bias = []

    model_file_path = _model_file_path

    model = torch.load(model_file_path)
    for name, param in model.named_parameters():
        if param.requires_grad:
            if name == 'encoder.1.weight':
                plylst_embed_weight = param.data
            elif name == 'encoder.1.bias':
                plylst_embed_bias = param.data

    train_loader = DataLoader(train_dataset,
                              shuffle=True,
                              batch_size=256,
                              num_workers=4)
    question_loader = DataLoader(question_dataset,
                                 shuffle=True,
                                 batch_size=256,
                                 num_workers=4)

    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    plylst_emb_with_bias = dict()

    if genre:
        for idx, (_id, _data, _dnr, _dtl_dnr) in enumerate(
                tqdm(train_loader, desc='get train vectors...')):
            with torch.no_grad():
                _data = _data.to(device)
                output_with_bias = (
                    torch.matmul(_data, plylst_embed_weight.T) +
                    plylst_embed_bias).tolist()
                output_with_bias = np.concatenate(
                    [output_with_bias, _dnr, _dtl_dnr], axis=1)

                _id = list(map(int, _id))
                for i in range(len(_id)):
                    plylst_emb_with_bias[_id[i]] = output_with_bias[i]

        for idx, (_id, _data, _dnr, _dtl_dnr) in enumerate(
                tqdm(question_loader, desc='get question vectors...')):
            with torch.no_grad():
                _data = _data.to(device)
                output_with_bias = (
                    torch.matmul(_data, plylst_embed_weight.T) +
                    plylst_embed_bias).tolist()
                output_with_bias = np.concatenate(
                    [output_with_bias, _dnr, _dtl_dnr], axis=1)

                _id = list(map(int, _id))
                for i in range(len(_id)):
                    plylst_emb_with_bias[_id[i]] = output_with_bias[i]
    else:
        for idx, (_id, _data) in enumerate(
                tqdm(train_loader, desc='get train vectors...')):
            with torch.no_grad():
                _data = _data.to(device)
                output_with_bias = (
                    torch.matmul(_data, plylst_embed_weight.T) +
                    plylst_embed_bias).tolist()

                _id = list(map(int, _id))
                for i in range(len(_id)):
                    plylst_emb_with_bias[_id[i]] = output_with_bias[i]

        for idx, (_id, _data) in enumerate(
                tqdm(question_loader, desc='get question vectors...')):
            with torch.no_grad():
                _data = _data.to(device)
                output_with_bias = (
                    torch.matmul(_data, plylst_embed_weight.T) +
                    plylst_embed_bias).tolist()

                _id = list(map(int, _id))
                for i in range(len(_id)):
                    plylst_emb_with_bias[_id[i]] = output_with_bias[i]
    return plylst_emb_with_bias