Exemple #1
0
    def run(self, question_fname):
        """Load the question file and write the generated answers to disk."""
        print("Loading question file...")
        question_df = pd.read_json(question_fname)

        print("Writing answers...")
        write_json(self._generate_answers(question_df), "results/results.json")
Exemple #2
0
    def run(self, train_fname, question_fname):
        """Load the train and question JSON files and persist the answers."""
        print("Loading train file...")
        train_data = load_json(train_fname)

        print("Loading question file...")
        question_data = load_json(question_fname)

        print("Writing answers...")
        write_json(self._generate_answers(train_data, question_data),
                   "results/results.json")
    def run(self, train_fname, question_fname):
        """Load train and question data, generate answers, and save them."""
        print("Loading train file...")
        # e.g. arena_data/orig/train.json
        train = pd.read_json(train_fname)

        print("Loading question file...")
        # e.g. arena_data/questions/val.json
        q_data = pd.read_json(question_fname)

        print("Generating answers...")
        result = self._generate_answers(train, q_data)

        print("Writing answers...")
        write_json(result, "results/results.json")
    def run(self, song_meta_fname, train_fname, question_fname):
        """Load song metadata, train, and question files, then write answers."""
        print("Loading song meta...")
        meta = load_json(song_meta_fname)

        print("Loading train file...")
        train = load_json(train_fname)

        print("Loading question file...")
        q_data = load_json(question_fname)

        print("Writing answers...")
        write_json(self._generate_answers(meta, train, q_data),
                   "results/results.json")
Exemple #5
0
    def run(self, fname):
        """Shuffle the playlists, split them into train/val, and write both
        the original splits and the masked (question/answer) validation set."""
        # Fixed seed so the random shuffle is reproducible.
        random.seed(777)

        print("Reading data...\n")
        data = load_json(fname)
        data_count = len(data)
        random.shuffle(data)
        print(f"Total playlists: {data_count}")

        print("Splitting data...")
        train_split, val_split = self._split_data(data)

        # Persist the unmodified splits.
        print("Original train...")
        write_json(train_split, "orig/train.json")
        print("Original val...")
        write_json(val_split, "orig/val.json")

        print("Masked val...")
        # Mask the validation split into question/answer halves.
        val_questions, val_answers = self._mask_data(val_split)
        write_json(val_questions, "questions/val.json")
        write_json(val_answers, "answers/val.json")
    def run(self, train_fname, question_fname):
        """Configure logging, load the data files, and write answers."""
        print('set logger')
        logger = log.get_logger()
        print(logger)
        logger.set_log_level(WARN)
        print('logger set')

        print("Loading train file...")
        train_data = load_json(train_fname)

        print("Loading question file...")
        q_data = load_json(question_fname)

        print("Writing answers...")
        write_json(self._generate_answers(train_data, q_data),
                   "results/results.json")
    def run(self):
        """Load fixed-path data files, compute related answers, save results."""
        # NOTE: loaded but not referenced below; kept for parity with the
        # original behavior (the file read still happens).
        song_meta = pd.read_json('./res/song_meta.json')

        print("Loading train file...")
        train = pd.read_json('./arena_data/orig/train.json',
                             encoding='utf-8')

        print("Loading question file...")
        questions = pd.read_json('./arena_data/questions/val.json',
                                 encoding='utf-8')

        print("Generating answers...")
        result = self.related(train, questions)

        print("Writing answers...")
        write_json(result, 'results/results.json')
Exemple #8
0
    def _load_train(self):
        """Load raw train playlists, keep only popular ones (>50 likes), and
        build the song/tag vocabularies; tag list is persisted to JSON."""
        raw = load_json('./arena_data/orig/train.json')
        self.train = []
        songs = set()
        tags = set()
        print('train data filtering...')
        for playlist in tqdm(raw):
            # Keep only playlists with more than 50 likes.
            if playlist['like_cnt'] > 50:
                self.train.append(playlist)
                songs.update(playlist['songs'])
                tags.update(playlist['tags'])
        self.song_list = list(songs)
        self.tag_list = list(tags)
        # NOTE(review): presumably the total song count of the full corpus —
        # confirm against the dataset.
        self.total_song_num = 707989

        write_json(self.tag_list, 'meta/AE_tag_list.json')
    def run(self, song_meta_fname, train_fname, question_fname):
        """Load inputs, generate answers (recording per-question k scores),
        and write both the score CSV and the results JSON."""
        print("Loading song meta...")
        meta = load_json(song_meta_fname)

        print("Loading train file...")
        train = load_json(train_fname)

        print("Loading question file...")
        q_data = load_json(question_fname)

        print("Writing answers...")
        # Collects per-question score columns, filled in by _generate_answers.
        score_df = pd.DataFrame(
            columns=['id', 'means_music_score', 'mean_tag_score', 'mean_title_score'])
        result = self._generate_answers(meta, train, q_data, score_df)
        score_df.to_csv('./arena_data/question_k_score.csv', index=False)

        write_json(result, "results/results.json")
Exemple #10
0
def run(train_fname, val_fname, test_fname):
    """Collect every tag across the train/val/test playlists and write the
    tag<->id lookup tables to JSON."""
    tags = set()
    print("Reading train data...\n")
    train_playlists = load_json(train_fname)
    print("Reading val data...\n")
    val_playlists = load_json(val_fname)
    print("Reading test data...\n")
    test_playlists = load_json(test_fname)
    print("Get tags...\n")
    for playlist in train_playlists + test_playlists + val_playlists:
        tags.update(playlist['tags'])
    # Enumerate once so both mappings share the same ordering.
    ordered = list(tags)
    tag_to_id = {tag: i for i, tag in enumerate(ordered)}
    id_to_tag = {i: tag for i, tag in enumerate(ordered)}
    print("Write  tag_to_id.json...\n")
    write_json(tag_to_id, 'tag_to_id.json')
    print("Write  id_to_tag.json...\n")
    write_json(id_to_tag, 'id_to_tag.json')
    def run(self,
            song_meta_fname,
            train_fname,
            question_fname,
            genre_fname,
            val_fname=None,
            test_fname=None,
            jobs=1):
        """Run the full pipeline: CF solver, graph recommender, then ensemble
        the two result files into ./results/results.json.

        When both val_fname and test_fname are given, a merged train file is
        reproduced and used for the graph recommender only; the CF solver
        always uses the original train file.
        """
        wanna_use_merged_train = (val_fname is not None) and (test_fname
                                                              is not None)
        if wanna_use_merged_train:
            # self.merge_trains([train_fname, val_fname, test_fname], MERGED_TRAIN_FNAME)
            self.reproduce_train(train_fname, val_fname, test_fname,
                                 MERGED_TRAIN_FNAME)

        graph_train_fname = MERGED_TRAIN_FNAME if wanna_use_merged_train else train_fname
        # Graph recommendation cannot multiprocess on Windows, so cap it at 1.
        graph_jobs = 1 if RUNNING_IN_WINDOWS else jobs

        if RUNNING_IN_WINDOWS and jobs > 1:
            print("[Warning] 그래프 추천은 윈도우 환경에서 멀티프로세싱이 불가능 합니다.")

        cf_solver = MultiprocessSolver()
        cf_solver.run(song_meta_fname=song_meta_fname,
                      train_fname=train_fname,
                      question_fname=question_fname,
                      jobs=jobs)

        graph_solver = GrapeRecommender()
        # BUG FIX: graph_jobs (the Windows-capped worker count computed above)
        # was never used — the graph solver previously received the raw
        # ``jobs`` value, defeating the Windows guard.
        graph_solver.run(song_meta_fname=song_meta_fname,
                         train_fname=graph_train_fname,
                         question_fname=question_fname,
                         genre_fname=genre_fname,
                         jobs=graph_jobs)

        # Blend graph and CF result files into the final submission.
        ensembler = Ensembler(
            ['./graph/results/results.json', './cf/results/results.json'],
            question_fname)
        res = ensembler.ensemble()
        print(res[0])
        write_json(res, './results/results.json')
    def run(self):
        """Run autoencoder inference over ``self.test`` and write result.json.

        For each test playlist: build one-hot song/tag input vectors, run the
        saved autoencoder, then keep the top-100 songs and top-10 tags (by
        predicted score) that are not already in the playlist.
        """
        print('loading meta data...')
        self._load_meta()

        model = models.load_model('./models/auto_encoder.h5')
        result = []

        for t in tqdm(self.test):
            # One-hot encode the playlist's songs and known tags.
            # NOTE(review): 707989 appears to be the corpus-wide song count —
            # confirm against the dataset.
            input_song_vec = np.zeros((1, 707989))
            input_tag_vec = np.zeros((1, len(self.tag_list)))
            predict = dict()
            predict['id'] = t['id']
            for song in t['songs']:
                input_song_vec[0][song] = 1
            for tag in t['tags']:
                if tag in self.tag_list:
                    input_tag_vec[0][self.tag_list.index(tag)] = 1
            song_vec, tag_vec = model([input_song_vec, input_tag_vec])

            song_vec = np.array(song_vec[0])
            tag_vec = np.array(tag_vec[0])
            # argsort is ascending, so indexing from the end (i = -1, -2, ...)
            # visits the highest-scoring candidates first.
            song_rank = song_vec.argsort()
            tag_rank = tag_vec.argsort()
            pred_songs = []
            pred_tags = []
            i = -1
            while len(pred_songs) < 100:
                # Skip songs already present in the playlist.
                if song_rank[i] not in t['songs']:
                    pred_songs.append(song_rank[i])
                i -= 1
            i = -1
            while len(pred_tags) < 10:
                # Skip tags already present in the playlist.
                if self.tag_list[tag_rank[i]] not in t['tags']:
                    pred_tags.append(self.tag_list[tag_rank[i]])
                i -= 1
            predict['songs'] = pred_songs
            predict['tags'] = pred_tags
            result.append(predict)
        write_json(result, 'result.json')
Exemple #13
0
    def val_inference(self):
        """Run autoencoder inference on the validation questions and write
        AE_results.json.

        Mirrors ``run``: one-hot encode each validation playlist, run the
        saved model, and keep the top-100 unseen songs and top-10 unseen tags
        ranked by predicted score.
        """
        model = models.load_model('./models/auto_encoder.h5')
        val = load_json('./arena_data/questions/val.json')
        tag_list = load_json('./arena_data/meta/AE_tag_list.json')
        result = []

        for v in tqdm(val):
            # One-hot encode the playlist's songs and known tags.
            # NOTE(review): 707989 appears to be the corpus-wide song count —
            # confirm against the dataset.
            input_song_vec = np.zeros((1, 707989))
            input_tag_vec = np.zeros((1, len(tag_list)))
            predict = dict()
            predict['id'] = v['id']
            for s in v['songs']:
                input_song_vec[0][s] = 1
            for t in v['tags']:
                if t in tag_list:
                    input_tag_vec[0][tag_list.index(t)] = 1
            song_vec, tag_vec = model([input_song_vec, input_tag_vec])

            song_vec = np.array(song_vec[0])
            tag_vec = np.array(tag_vec[0])
            # argsort is ascending; iterate from the end for the top scores.
            song_rank = song_vec.argsort()
            tag_rank = tag_vec.argsort()
            pred_songs = []
            pred_tags = []
            i = -1
            while len(pred_songs) < 100:
                # Skip songs already present in the question playlist.
                if song_rank[i] not in v['songs']:
                    pred_songs.append(song_rank[i])
                i -= 1
            i = -1
            while len(pred_tags) < 10:
                # Skip tags already present in the question playlist.
                if tag_list[tag_rank[i]] not in v['tags']:
                    pred_tags.append(tag_list[tag_rank[i]])
                i -= 1
            predict['songs'] = pred_songs
            predict['tags'] = pred_tags
            result.append(predict)
        write_json(result, 'AE_results.json')
Exemple #14
0
    def run(self, fname):
        """Split the playlist file into train/val and write original plus
        masked versions."""
        random.seed(777)  # deterministic shuffle

        print("Reading data...\n")
        all_playlists = load_json(fname)
        random.shuffle(all_playlists)
        print(f"Total playlists: {len(all_playlists)}")

        print("Splitting data...")
        train_part, val_part = self._split_data(all_playlists)

        print("Original train...")
        write_json(train_part, "orig/train.json")
        print("Original val...")
        write_json(val_part, "orig/val.json")

        print("Masked val...")
        masked_q, masked_a = self._mask_data(val_part)
        write_json(masked_q, "questions/val.json")
        write_json(masked_a, "answers/val.json")
Exemple #15
0
    def run(self, fname, train_size):
        """Shuffle the playlists, split with the given train_size, and write
        the original and masked output files."""
        random.seed(777)  # reproducible shuffle

        print('Reading data...\n')
        data = load_json(fname)
        random.shuffle(data)
        print(f'Total playlists: {len(data)}')

        print(f'Splitting data... train_size is {train_size}')
        train_set, val_set = self._split_data(data, train_size)

        print('Original train...')
        write_json(train_set, 'orig/train.json')
        print('Original val...')
        write_json(val_set, 'orig/val.json')

        print('Masked val...')
        q_part, a_part = self._mask_data(val_set)
        write_json(q_part, 'questions/val.json')
        write_json(a_part, 'answers/val.json')
Exemple #16
0
    def run(self, fname):
        """Read the playlist JSON, shuffle deterministically, split into
        train/val, and write original plus masked copies."""
        random.seed(777)

        print("Reading data...\n")
        # Each playlist dict holds tags, id, title, songs, like count, and
        # update date.
        playlist_data = load_json(fname)
        random.shuffle(playlist_data)
        print(f"Total playlists: {len(playlist_data)}")

        print("Splitting data...")
        tr, va = self._split_data(playlist_data)

        # Fresh train/val files go under the orig folder.
        print("Original train...")
        write_json(tr, "orig/train.json")
        print("Original val...")
        write_json(va, "orig/val.json")

        print("Masked val...")
        # Masked questions go to questions/, held-out answers to answers/.
        va_q, va_a = self._mask_data(va)
        write_json(va_q, "questions/val.json")
        write_json(va_a, "answers/val.json")
Exemple #17
0
 def eval_dcg(self, res):
     """Write *res* to results/results.json and score it against the stored
     validation answers using the arena evaluator."""
     # FIX: corrected typo in the progress message ("Caculating").
     print("Calculating dcg...")
     write_json(res, "results/results.json")
     evaluator = ArenaEvaluator()
     evaluator.evaluate("arena_data/answers/val_hye.json",
                        "arena_data/results/results.json")
Exemple #18
0
        return solve_tag_main(indx)


# In[22]:

rec_tags = {}

# Produce tag recommendations for every validation playlist, logging
# progress every 1000 playlists.
counter = 0
for playlist in playlists_valq:
    if counter % 1000 == 0:
        print('timer:', counter)

    pid = playlist['id']
    rec_tags[pid] = solve_tag(counter)
    counter += 1

# In[23]:

# Combine the precomputed song and tag recommendations per playlist id.
answers = [{'id': p['id'], 'songs': rec_songs[p['id']], 'tags': rec_tags[p['id']]}
           for p in playlists_valq]

# In[24]:

write_json(answers, 'results.json')

# In[ ]:
Exemple #19
0
 def eval_dcg(self, res):
     """Persist *res* and evaluate it against the stored validation answers."""
     write_json(res, "results/results.json")
     ArenaEvaluator().evaluate("arena_data/answers/val_hye.json",
                               "arena_data/results/results.json")
Exemple #20
0
def train(train_dataset, model_file_path, id2prep_song_file_path,
          id2tag_file_path, question_dataset, answer_file_path):
    """Train the autoencoder on *train_dataset*, checkpointing every epoch.

    When *question_dataset* is provided and ``mode == 0`` (local-validation
    mode), the model is run over the questions every ``check_every`` epochs
    and the temporary results are scored against *answer_file_path*.

    Relies on module-level hyperparameters: ``batch_size``, ``num_workers``,
    ``H``, ``dropout``, ``learning_rate``, ``epochs``, ``mode``.
    """
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    # id -> tag / id -> preprocessed-song lookup tables.
    id2tag_dict = dict(np.load(id2tag_file_path, allow_pickle=True).item())
    id2prep_song_dict = dict(
        np.load(id2prep_song_file_path, allow_pickle=True).item())

    # parameters
    num_songs = train_dataset.num_songs
    num_tags = train_dataset.num_tags

    # hyper parameters: input/output are the concatenated song+tag vector
    D_in = D_out = num_songs + num_tags

    # In local-validation mode, intermediate results are checked periodically.
    q_data_loader = None
    check_every = 5
    tmp_result_file_path = 'results/tmp_results.json'
    evaluator = ArenaEvaluator()
    if question_dataset is not None:
        q_data_loader = DataLoader(question_dataset,
                                   shuffle=True,
                                   batch_size=batch_size,
                                   num_workers=num_workers)

    data_loader = DataLoader(train_dataset,
                             shuffle=True,
                             batch_size=batch_size,
                             num_workers=num_workers)

    model = AutoEncoder(D_in, H, D_out, dropout=dropout).to(device)

    # Best-effort restore of a previous checkpoint; fall back to the fresh
    # model on any failure.  (Was a bare ``except:``, which also swallowed
    # KeyboardInterrupt/SystemExit — narrowed to Exception.)
    try:
        model = torch.load(model_file_path)
        print("\n--------model restored--------\n")
    except Exception:
        print("\n--------model not restored--------\n")

    # BUG FIX: the optimizer must be built AFTER the checkpoint restore.
    # Previously it was created from the fresh model's parameters before
    # ``model`` was rebound by torch.load, so a restored model was never
    # actually optimized.
    loss_func = nn.BCELoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

    temp_fn = 'arena_data/answers/temp.json'
    if os.path.exists(temp_fn):
        os.remove(temp_fn)

    for epoch in range(epochs):
        print()
        print('epoch: ', epoch)
        running_loss = 0.0
        for idx, (_id,
                  _data) in enumerate(tqdm(data_loader, desc='training...')):
            _data = _data.to(device)

            optimizer.zero_grad()
            output = model(_data)
            # Reconstruction loss against the input itself (autoencoder).
            loss = loss_func(output, _data)
            loss.backward()
            optimizer.step()

            running_loss += loss.item()

        print('loss: %d %d%% %.4f' %
              (epoch, epoch / epochs * 100, running_loss))

        # Checkpoint every epoch.
        torch.save(model, model_file_path)

        if mode == 0:
            if epoch % check_every == 0:
                if os.path.exists(tmp_result_file_path):
                    os.remove(tmp_result_file_path)
                elements = []
                for idx, (_id, _data) in enumerate(
                        tqdm(q_data_loader, desc='testing...')):
                    with torch.no_grad():
                        _data = _data.to(device)
                        output = model(_data)

                        # Split the concatenated vector back into song and
                        # tag halves for both input and output.
                        songs_input, tags_input = torch.split(_data,
                                                              num_songs,
                                                              dim=1)
                        songs_output, tags_output = torch.split(output,
                                                                num_songs,
                                                                dim=1)

                        songs_ids = binary_songs2ids(songs_input, songs_output,
                                                     id2prep_song_dict)
                        tag_ids = binary_tags2ids(tags_input, tags_output,
                                                  id2tag_dict)

                        _id = list(map(int, _id))
                        for i in range(len(_id)):
                            element = {
                                'id': _id[i],
                                'songs': list(songs_ids[i]),
                                'tags': tag_ids[i]
                            }
                            elements.append(element)

                write_json(elements, tmp_result_file_path)
                evaluator.evaluate(answer_file_path, tmp_result_file_path)
                os.remove(tmp_result_file_path)
Exemple #21
0
# Calculate the confidence by multiplying it by our alpha value.
tag_conf = (train_tags_A_T * alpha_val).astype('double')

# Fit the tag recommendation model on the confidence matrix.
tag_recommend_model.fit(tag_conf)

answers = []

for nid in tqdm(test.index):

    song_tuples = song_recommend_model.recommend(
        int(nid), train_songs_A, 100)
    tag_tuples = tag_recommend_model.recommend(
        int(nid), train_tags_A, 10)

    # extract only songs/tags from (songs/tags, score) tuple
    song_ids = [pair[0] for pair in song_tuples]
    tag_ids = [pair[0] for pair in tag_tuples]

    # Map internal ids back to the original song/tag/playlist identifiers.
    answers.append({
        "id": plylst_nid_id[nid],
        "songs": [song_sid_id[s] for s in song_ids],
        "tags": [tag_tid_id[t] for t in tag_ids]
    })

# write_json
write_json(answers, "results/results.json")
    def merge_trains(self, train_fnames, output_fname):
        """Concatenate the playlists from every file in *train_fnames* and
        write the combined list to *output_fname*."""
        combined = []
        for fname in train_fnames:
            combined.extend(load_json(fname))
        write_json(combined, output_fname)
Exemple #23
0
### 3.1 hyperparameters: k, rho, weights
### 3.2 parameters: sim_songs, sim_tags, sim_normalize

song_k = 500
tag_k = 90
song_k_step = 50
tag_k_step = 10
rho = 0.4
weight_val_songs = 0.9
weight_pred_songs = 1 - weight_val_songs
weight_val_tags = 0.7
weight_pred_tags = 1 - weight_val_tags
sim_songs = 'idf'
sim_tags = 'idf'
sim_normalize = True

### 3.3 run KNN.predict() : returns pandas.DataFrame
# (backslash continuations removed — the arguments are already inside parens)
knn = KNN(song_k=song_k,
          tag_k=tag_k,
          rho=rho,
          song_k_step=song_k_step,
          tag_k_step=tag_k_step,
          weight_val_songs=weight_val_songs,
          weight_pred_songs=weight_pred_songs,
          weight_val_tags=weight_val_tags,
          weight_pred_tags=weight_pred_tags,
          sim_songs=sim_songs,
          sim_tags=sim_tags,
          sim_normalize=sim_normalize,
          train=train,
          val=val,
          song_meta=song_meta,
          pred=pred)
pred = knn.predict()

### 4. post-processing
### 4.1 convert "tag_id" to "tag"
pred = convert_id_to_tag(pred, id_to_tag)
pred = generate_answers(load_json(train_path), to_list(pred))

write_json(pred, 'results.json')
Exemple #24
0
def Recommender(train,
                questions,
                n_msp,
                n_mtp,
                mode,
                sim_measure,
                song_meta,
                freq_song,
                save=False):
    ## 최종 추천리스트
    rec_list = []

    ## 1단계: 전처리
    # 1) 추천 결과가 없거나 모자란 경우를 위해 most_popular 생성
    _, song_mp = most_popular(train, "songs", 200)
    _, tag_mp = most_popular(train, "tags", 20)

    # 2) 빠른 접근을 위한 Dictionary 생성
    song_plylst_dic, song_tag_dic, plylst_song_dic, plylst_tag_dic, tag_plylst_dic, tag_song_dic, song_issue_dic, song_artist_dic = DicGenerator(
        train, song_meta)

    # 3) 미리 계산한 플레이리스트 유사도 불러오기
    '''
    sim_scores: 입력으로 들어온 questions과 train간 유사도 (Autoencoder 기반)
    gnr_scores: 입력으로 들어온 questions과 train간 유사도 (genre 정보 추가)
    title_scores: 입력으로 들어온 questions과 train간 유사도 (Word2vec 기반)
    '''
    sim_scores = np.load(f'scores/{mode}_scores_bias_{sim_measure}.npy',
                         allow_pickle=True).item()
    gnr_scores = np.load(f'scores/{mode}_scores_bias_{sim_measure}_gnr.npy',
                         allow_pickle=True).item()
    title_scores = np.load(
        f'scores/{mode}_scores_title_{sim_measure}_24000.npy',
        allow_pickle=True).item()

    ## 2단계: 함수 정의
    # 1) Counter 객체에서 빈도수 기준 topk개 출력
    def most_similar(cnt, topk):
        cnt_topk = cnt.most_common(topk)
        return [k for k, v in cnt_topk]

    # 2) 미리 계산한 유사도 기준 topk개의 플레이리스트의 plylsts와 scores 출력
    def most_similar_emb(q_id, topk, title=False, genre=False):
        # title_scores 기준
        if title:
            plylsts = [t[0] for t in title_scores[q_id][:topk]]
            scores = [t[1] for t in title_scores[q_id][:topk]]
        # gnr_scores 기준
        elif genre:
            plylsts = [t[0] for t in gnr_scores[q_id][:topk]]
            scores = [t[1] for t in gnr_scores[q_id][:topk]]
        # sim_scores 기준
        else:
            plylsts = [t[0] for t in sim_scores[q_id][:topk]]
            scores = [t[1] for t in sim_scores[q_id][:topk]]
        return plylsts, scores

    # 3) new_song_plylst_dict
    def get_new_song_plylst_dict(plylst_ms):
        new_song_plylst_dict = defaultdict(set)
        for plylst in plylst_ms:
            for _song in plylst_song_dic[plylst]:
                new_song_plylst_dict[_song].add(plylst)
        return new_song_plylst_dict

    ## 3단계: 입력으로 들어온 questions 플레이리스트에 대해 추천
    for q in tqdm(questions):

        # 1) question 플레이리스트의 정보
        # 수록 song/tag
        q_songs = q['songs']
        q_tags = q['tags']

        # 수록 song/tag와 함께 등장한 song/tag/plylst 빈도 수
        song_plylst_C = Counter()
        song_tag_C = Counter()
        tag_plylst_C = Counter()
        tag_song_C = Counter()

        # 수록 song/tag가 둘 다 없거나 적을 때
        no_songs_tags, few_songs_tags = False, False
        if len(q_songs) == 0 and len(q_tags) == 0:
            no_songs_tags = True
        elif len(q_songs) <= 3:
            few_songs_tags = True

        # 2) 빈도수 기반 추천을 위해 카운트
        # 수록 song에 대해
        for q_s in q_songs:
            song_plylst_C.update(song_plylst_dic[q_s])
            song_tag_C.update(song_tag_dic[q_s])
        # 수록 tag에 대해
        for q_t in q_tags:
            tag_plylst_C.update(tag_plylst_dic[q_t])
            tag_song_C.update(tag_song_dic[q_t])
            # 수록곡 수로 나눠서 비율로 계산
        for i, j in list(song_plylst_C.items()):
            if len(plylst_song_dic[i]) > 0:
                song_plylst_C[i] = (j / len(plylst_song_dic[i]))

                # 3) 유사도 기반 추천을 위해 점수 계산
        plylst_song_scores = defaultdict(lambda: 0)
        plylst_tag_scores = defaultdict(lambda: 0)

        # Case 1: song과 tag가 둘 다 없는 경우
        if no_songs_tags:
            # plylst_ms / plylst_mt: title_scores 기준 유사한 플레이리스트 n_msp / n_mtp개
            plylst_ms, song_scores = most_similar_emb(q['id'],
                                                      n_msp,
                                                      title=True)
            plylst_mt, tag_scores = most_similar_emb(q['id'],
                                                     n_mtp,
                                                     title=True)
            plylst_add, add_scores = most_similar_emb(q['id'], n_mtp)

        # Case 2: song과 tag가 부족한 경우
        elif few_songs_tags:
            # plylst_ms / plylst_mt: sim_scores 기준 n_msp개 / title_scores 기준 n_mtp개
            plylst_ms, song_scores = most_similar_emb(q['id'], n_msp)
            plylst_mt, tag_scores = most_similar_emb(q['id'],
                                                     n_mtp,
                                                     title=True)
            plylst_add, add_scores = most_similar_emb(q['id'],
                                                      n_mtp,
                                                      genre=True)

        # Case 3: song과 tag가 충분한 경우
        else:
            # plylst_ms / plylst_mt: sim_scores 기준 유사한 플레이리스트 n_msp / n_mtp개
            plylst_ms, song_scores = most_similar_emb(q['id'], n_msp)
            plylst_mt, tag_scores = most_similar_emb(q['id'],
                                                     n_mtp,
                                                     genre=True)
            plylst_add, add_scores = most_similar_emb(q['id'],
                                                      n_mtp,
                                                      title=True)

        new_song_plylst_dict = get_new_song_plylst_dict(plylst_ms)

        # 3-1. plylst_song_scores 계산
        for idx, ms_p in enumerate(plylst_ms):
            for song in plylst_song_dic[ms_p]:
                song_score = 0
                for q_s in q_songs:
                    try:
                        song_score += len(new_song_plylst_dict[q_s]
                                          & new_song_plylst_dict[song]) / len(
                                              new_song_plylst_dict[q_s])
                    except:
                        pass
                if song in freq_song:
                    plylst_song_scores[song] += song_plylst_C[
                        ms_p] * song_score * song_scores[idx] * (n_msp -
                                                                 idx) * 4
                else:
                    plylst_song_scores[song] += song_plylst_C[
                        ms_p] * song_score * song_scores[idx] * (n_msp - idx)
            for tag in plylst_tag_dic[ms_p]:
                plylst_tag_scores[tag] += tag_scores[idx] * (n_msp - idx)

        # 3-2. plylst_tag_scores 계산
        for idx, mt_p in enumerate(plylst_mt):
            for tag in plylst_tag_dic[mt_p]:
                plylst_tag_scores[tag] += tag_scores[idx] * (n_mtp - idx)
            for song in plylst_song_dic[mt_p]:
                plylst_song_scores[song] += tag_scores[idx]

        # 3-3. plylst_{song/tag}_scores 보정
        for idx, mt_p in enumerate(plylst_add):
            for tag in plylst_tag_dic[mt_p]:
                plylst_tag_scores[tag] += add_scores[idx] * (n_mtp - idx)

        # 4) song과 tag 둘 다 없거나 적은 경우 예측해서 채워넣기
        if no_songs_tags:
            # q_songs 새롭게 채워넣기 (원래는 song가 없지만 title_scores 기준 유사한 플레이리스트로부터 song 예측)
            pre_songs = sorted(plylst_song_scores.items(),
                               key=lambda x: x[1],
                               reverse=True)
            pre_songs = [scores[0] for scores in pre_songs][:200]
            pre_songs = pre_songs + remove_seen(pre_songs, song_mp)
            q_songs = pre_songs[:100]

            # q_tags 새롭게 채워넣기 (원래는 tag가 없지만 title_scores 기준 유사한 플레이리스트로부터 tag 예측)
            pre_tags = sorted(plylst_tag_scores.items(),
                              key=lambda x: x[1],
                              reverse=True)
            pre_tags = [scores[0] for scores in pre_tags][:20]
            pre_tags = pre_tags + remove_seen(pre_tags, tag_mp)
            q_tags = pre_tags[:10]

            # 5) questions 플레이리스트에 대해 추천
        ## song 추천
        # song 있을 때
        lt_song_art = []
        if len(q_songs) > 0:
            plylst_song_scores = sorted(plylst_song_scores.items(),
                                        key=lambda x: x[1],
                                        reverse=True)

            lt_artist = []
            for w_song in q_songs:
                lt_artist.extend(song_artist_dic[w_song])
            counter_artist = Counter(lt_artist)
            counter_artist = sorted(counter_artist.items(),
                                    key=lambda x: x[1],
                                    reverse=True)
            if few_songs_tags:
                artist = [art[0] for art in counter_artist]
            else:
                artist = [x[0] for x in counter_artist if x[1] > 1]
            cand_ms = [scores[0] for scores in plylst_song_scores
                       ][(100 - len(artist)):1000]
            for cand in cand_ms:
                if artist == []:
                    break
                if cand in q_songs:
                    break
                for art in song_artist_dic[cand]:
                    if art in artist:
                        lt_song_art.append(cand)
                        artist.remove(art)
                        break
            song_ms = [scores[0] for scores in plylst_song_scores][:200]

        # song 없고, tag 있을 때
        else:
            song_ms = most_similar(tag_song_C, 200)

        ## tag 추천
        # tag 있을 때
        if len(q_tags) > 0:
            plylst_tag_scores = sorted(plylst_tag_scores.items(),
                                       key=lambda x: x[1],
                                       reverse=True)
            tag_ms = [scores[0] for scores in plylst_tag_scores][:20]

        # tag 없고, song 있을 때
        else:
            plylst_tag_scores = sorted(plylst_tag_scores.items(),
                                       key=lambda x: x[1],
                                       reverse=True)
            tag_ms = [scores[0] for scores in plylst_tag_scores][:20]

        ## issue date 늦은 song 제거
        if q['updt_date']:
            q_updt_date = q['updt_date'][:4] + q['updt_date'][5:7] + q[
                'updt_date'][8:10]
            song_ms = [x for x in song_ms if song_issue_dic[x] < q_updt_date]

        ## 중복 제거 및 부족하면 most_popular로 채워넣기
        song_candidate = song_ms + remove_seen(song_ms, song_mp)
        tag_candidate = tag_ms + remove_seen(tag_ms, tag_mp)

        song_remove = q_songs
        tag_remove = q_tags

        song_candidate = song_candidate[:100] if no_songs_tags else remove_seen(
            song_remove, song_candidate)[:100]
        if len(lt_song_art) > 0:
            lt_song_art = [x for x in lt_song_art if x not in song_candidate]
            song_candidate[(100 - len(lt_song_art)):100] = lt_song_art

        rec_list.append({
            "id":
            q["id"],
            "songs":
            song_candidate,
            "tags":
            tag_candidate[:10] if no_songs_tags else remove_seen(
                tag_remove, tag_candidate)[:10]
        })

    # 6) results.json 파일 저장 여부
    if save == True:
        write_json(
            rec_list, 'results/results_' +
            dt.datetime.now().strftime("%y%m%d-%H%M%S") + '_' + mode + '.json')

    return rec_list
Exemple #25
0
    def run(self, fname):
        """Create K-fold train/val splits and write every original and masked
        file under DATA_FOLDER/fold{i}/."""
        np.random.seed(self.SEED)  # reproducible fold assignment

        full_train = pd.read_json(fname)

        folds = self._generateIdx(full_train)
        if self.DATA_FOLDER not in os.listdir("."):
            os.mkdir(self.DATA_FOLDER)

        for i, (tr_idx, te_idx) in enumerate(folds):
            folder = "fold" + str(i)
            path = self.DATA_FOLDER + "/" + folder
            if folder not in os.listdir(self.DATA_FOLDER):
                os.mkdir(path)

            print("Splitting data...")
            train, val1, val2 = self._split_data(full_train, tr_idx, te_idx)

            # Write the unmodified splits for this fold.
            print(f"fold {i} Original train...")
            write_json(train, path + "/train.json")

            print(f"fold {i} Original val1...")
            write_json(val1, path + "/val1.json")

            print(f"fold {i} Original val2...")
            write_json(val2, path + "/val2.json")

            # Mask each validation split into question/answer halves.
            print(f"fold {i} Masked val1...")
            val1_q, val1_a = self._mask_data(val1)
            write_json(val1_q, path + "/val1_q.json")
            write_json(val1_a, path + "/val1_a.json")

            print(f"fold {i} Masked val2...")
            val2_q, val2_a = self._mask_data(val2)
            write_json(val2_q, path + "/val2_q.json")
            write_json(val2_a, path + "/val2_a.json")
def infer(MODE: str = "Test"):
    """Run the CF (k-NN over a sparse playlist matrix) + playlist2vec inference.

    MODE selects which data files act as train/query and whether to evaluate:
      - "Valid": train on arena_data/orig split, query the local validation
        questions, and evaluate against arena_data answers.
      - "Dev":   train on res/train.json, query res/val.json, no evaluation.
      - "Test":  train on res/train.json, query res/test.json, no evaluation.

    Side effects: reads several JSON files and two gensim models from disk,
    writes predictions via ``write_json``, and (Valid only) prints evaluation
    metrics through ``CustomEvaluator``.

    Returns:
        None.  Results are persisted to ``opt["results_path"]``.
    """
    # Per-mode file locations and whether an answer file exists to score against.
    mode_opt = {
        "Valid": {
            "train_path": "arena_data/orig/train.json",
            "test_path": "arena_data/questions/val.json",
            "results_path": "cf2/val/results.json",
            "eval": True
        },
        "Dev": {
            "train_path": "res/train.json",
            "test_path": "res/val.json",
            "results_path": "cf2/dev/results.json",
            "eval": False
        },
        "Test": {
            "train_path": "res/train.json",
            "test_path": "res/test.json",
            "results_path": "cf2/test/results.json",
            "eval": False
        }
    }
    opt = mode_opt[MODE]

    train = pd.read_json(opt["train_path"])
    test = pd.read_json(opt["test_path"])

    # The two datasets NOT used as the query set are still loaded so they can
    # contribute to the CF matrix / IDF statistics below (semi-supervised use
    # of the other public splits).
    if MODE != "Dev":
        dev = pd.read_json("res/val.json")

    if MODE != "Test":
        test_res = pd.read_json("res/test.json")

    print("Preprocessing dates")
    # Map query-playlist id -> its 'updt_date' string (used later to filter
    # out songs released after the playlist was last updated).
    test_date = {}
    for i in tqdm(test.index):
        test_date[test.at[i, 'id']] = test.at[i, 'updt_date']

    song_meta = pd.read_json("res/song_meta.json")

    # Map song id -> issue date as a "YYYYMMDD"-style string.
    song_date = {}
    for i in tqdm(song_meta.index):
        song_date[song_meta.at[i, "id"]] = str(song_meta.at[i, "issue_date"])

    del song_meta

    # Sanitize issue dates: if a song appears in a playlist whose update date
    # precedes the song's recorded issue date, the metadata must be wrong, so
    # clamp the song's date down to that playlist date.  Done over every
    # loaded dataset; song_update_date only tracks how many were corrected.
    song_update_date = []
    for i in train.index:
        # 'updt_date' is "YYYY-MM-DD..."; slicing rebuilds compact "YYYYMMDD".
        updt_date = train.loc[i, 'updt_date'][:4] + train.loc[
            i, 'updt_date'][5:7] + train.loc[i, 'updt_date'][8:10]
        for t in train.loc[i, 'songs']:
            if song_date[t] > updt_date:
                song_date[t] = updt_date
                song_update_date.append(t)
    for i in test.index:
        updt_date = test.loc[i, 'updt_date'][:4] + test.loc[
            i, 'updt_date'][5:7] + test.loc[i, 'updt_date'][8:10]
        for t in test.loc[i, 'songs']:
            if song_date[t] > updt_date:
                song_date[t] = updt_date
                song_update_date.append(t)
    if MODE != "Dev":
        for i in dev.index:
            updt_date = dev.loc[i, 'updt_date'][:4] + dev.loc[
                i, 'updt_date'][5:7] + dev.loc[i, 'updt_date'][8:10]
            for t in dev.loc[i, 'songs']:
                if song_date[t] > updt_date:
                    song_date[t] = updt_date
                    song_update_date.append(t)
    if MODE != "Test":
        for i in test_res.index:
            updt_date = test_res.loc[i, 'updt_date'][:4] + test_res.loc[
                i, 'updt_date'][5:7] + test_res.loc[i, 'updt_date'][8:10]
            for t in test_res.loc[i, 'songs']:
                if song_date[t] > updt_date:
                    song_date[t] = updt_date
                    song_update_date.append(t)
    print("The number of processed songs :", len(set(song_update_date)))

    # Load tags predicted from playlist titles (precomputed model output).
    pred_tag = load_json("arena_data/model/pred_tag.json")

    # playlist id -> list of title-predicted tags.
    dic_pred_tag = {}
    for p_t in pred_tag:
        dic_pred_tag[p_t['id']] = p_t['predict_tag']

    # Keep the original tags in 'tags_org' and extend 'tags' with the
    # title-predicted ones ('tags' feeds the CF matrix; 'tags_org' is what
    # gets indexed for the final tag predictions).
    train['tags_org'] = train['tags']
    for i in train.index:
        train.at[i,
                 'tags'] = train.at[i, 'tags'] + dic_pred_tag[train.at[i,
                                                                       'id']]

    test['tags_org'] = test['tags']
    for i in test.index:
        test.at[i,
                'tags'] = test.at[i, 'tags'] + dic_pred_tag[test.at[i, 'id']]

    if MODE != "Dev":
        dev['tags_org'] = dev['tags']
        for i in dev.index:
            dev.at[i,
                   'tags'] = dev.at[i, 'tags'] + dic_pred_tag[dev.at[i, 'id']]

    if MODE != "Test":
        test_res['tags_org'] = test_res['tags']
        for i in test_res.index:
            test_res.at[i, 'tags'] = test_res.at[i, 'tags'] + dic_pred_tag[
                test_res.at[i, 'id']]

    # Calculate inverse document frequency for every song id and tag, where a
    # "document" is one playlist; counts are accumulated over all loaded sets.
    inv_doc_freq = {}
    for d in train['songs'] + train['tags']:
        for i in d:
            if i in inv_doc_freq:
                inv_doc_freq[i] += 1
            else:
                inv_doc_freq[i] = 1

    for d in test['songs'] + test['tags']:
        for i in d:
            if i in inv_doc_freq:
                inv_doc_freq[i] += 1
            else:
                inv_doc_freq[i] = 1

    if MODE != "Dev":
        for d in dev['songs'] + dev['tags']:
            for i in d:
                if i in inv_doc_freq:
                    inv_doc_freq[i] += 1
                else:
                    inv_doc_freq[i] = 1

    if MODE != "Test":
        for d in test_res['songs'] + test_res['tags']:
            for i in d:
                if i in inv_doc_freq:
                    inv_doc_freq[i] += 1
                else:
                    inv_doc_freq[i] = 1

    # Convert raw document counts to idf = log10(N / df), with N matching the
    # number of datasets actually counted in this MODE.
    for k in inv_doc_freq:
        if MODE == "Valid":
            inv_doc_freq[k] = math.log10(
                (len(train) + len(test) + len(dev) + len(test_res)) /
                inv_doc_freq[k])
        elif MODE == "Dev":
            inv_doc_freq[k] = math.log10(
                (len(train) + len(test) + len(test_res)) / inv_doc_freq[k])
        else:
            inv_doc_freq[k] = math.log10(
                (len(train) + len(test) + len(dev)) / inv_doc_freq[k])

    # Assemble one big frame for the CF matrix: all "train-like" playlists
    # first, then the query (test) playlists last, so the query rows occupy
    # the tail nid range.
    if MODE == "Valid":
        n_train = len(train) + len(dev) + len(test_res)
    elif MODE == "Dev":
        n_train = len(train) + len(test_res)
    else:
        n_train = len(train) + len(dev)
    n_test = len(test)

    # train + test, in that row order.
    if MODE == "Valid":
        plylst = pd.concat([train, dev, test_res, test], ignore_index=True)
    elif MODE == "Dev":
        plylst = pd.concat([train, test_res, test], ignore_index=True)
    else:
        plylst = pd.concat([train, dev, test], ignore_index=True)

    # Dense row index ("nid") used as the sparse-matrix row id.
    plylst["nid"] = range(n_train + n_test)

    # nid -> original playlist id.
    plylst_nid_id = dict(zip(plylst["nid"], plylst["id"]))

    # Build contiguous integer ids for tags (tid), songs (sid) and the joint
    # song+tag vocabulary (st).
    plylst_tag = plylst['tags']
    tag_counter = Counter([tg for tgs in plylst_tag for tg in tgs])
    tag_dict = {x: tag_counter[x] for x in tag_counter}

    id_type = dict()

    tag_id_tid = dict()
    tag_tid_id = dict()
    for i, t in enumerate(tag_dict):
        tag_id_tid[t] = i
        tag_tid_id[i] = t
        id_type[t] = 1

    n_tags = len(tag_dict)

    plylst_song = plylst['songs']
    song_counter = Counter([sg for sgs in plylst_song for sg in sgs])
    song_dict = {x: song_counter[x] for x in song_counter}

    song_id_sid = dict()
    song_sid_id = dict()
    for i, t in enumerate(song_dict):
        song_id_sid[t] = i
        song_sid_id[i] = t
        id_type[t] = 1

    n_songs = len(song_dict)

    plylst_st = plylst['songs'] + plylst['tags']
    st_counter = Counter([st for sts in plylst_st for st in sts])
    st_dict = {x: st_counter[x] for x in st_counter}

    st_id_tid = dict()
    st_tid_id = dict()
    for i, t in enumerate(st_dict):
        st_id_tid[t] = i
        st_tid_id[i] = t

    n_sts = len(st_dict)

    print("Tags : ", n_tags, ", Songs : ", n_songs, ", Total : ", n_sts)

    # Translate each playlist's raw ids into the dense ids, dropping unknowns.
    # NOTE: 'tags_id' is built from 'tags_org' (original tags only), while
    # 'sts_id' uses the augmented 'tags'.
    plylst['songs_id'] = plylst['songs'].map(
        lambda x:
        [song_id_sid.get(s) for s in x if song_id_sid.get(s) != None])
    plylst['tags_id'] = plylst['tags_org'].map(
        lambda x: [tag_id_tid.get(t) for t in x if tag_id_tid.get(t) != None])
    plylst['sts_id'] = (plylst['songs'] + plylst['tags']).map(
        lambda x: [st_id_tid.get(st) for st in x if st_id_tid.get(st) != None])

    # NOTE(review): .loc assignments on this column-sliced frame may trigger
    # pandas SettingWithCopyWarning; behavior is kept as-is here.
    plylst_use = plylst[['nid', 'updt_date', 'songs_id', 'tags_id', 'sts_id']]
    plylst_use.loc[:, 'num_songs'] = plylst_use['songs_id'].map(len)
    plylst_use.loc[:, 'num_tags'] = plylst_use['tags_id'].map(len)
    plylst_use.loc[:, 'num_sts'] = plylst_use['sts_id'].map(len)
    plylst_use = plylst_use.set_index('nid')

    # NOTE(review): plylst_train deliberately keeps ALL rows (including the
    # query playlists), and n_train is reset to the full length below —
    # apparently so queries can also match each other; confirm intent.
    plylst_train = plylst_use.iloc[:, :]
    plylst_test = plylst_use.iloc[n_train:, :]

    n_train = len(plylst_train)

    np.random.seed(33)
    test_set = plylst_test
    print("The number of test samples : ", len(test_set))

    # Build the CF matrices.  The avg_len_* values are computed but not used
    # afterwards in this function.
    avg_len_songs = 0
    for songs in plylst_train['songs_id']:
        avg_len_songs += len(songs)
    avg_len_songs /= len(plylst_train['songs_id'])

    avg_len_tags = 0
    for tags in plylst_train['tags_id']:
        avg_len_tags += len(tags)
    avg_len_tags /= len(plylst_train['tags_id'])

    avg_len_sts = 0
    for sts in plylst_train['sts_id']:
        avg_len_sts += len(sts)
    avg_len_sts /= len(plylst_train['sts_id'])

    # Binary playlist x song incidence matrix.
    row = np.repeat(range(n_train), plylst_train['num_songs'])
    col = [song for songs in plylst_train['songs_id'] for song in songs]
    dat = [1 for songs in plylst_train['songs_id'] for song in songs]
    train_songs_A = spr.csr_matrix((dat, (row, col)), shape=(n_train, n_songs))

    # Binary playlist x tag incidence matrix.
    row = np.repeat(range(n_train), plylst_train['num_tags'])
    col = [tag for tags in plylst_train['tags_id'] for tag in tags]
    dat = [1 for tags in plylst_train['tags_id'] for tag in tags]
    train_tags_A = spr.csr_matrix((dat, (row, col)), shape=(n_train, n_tags))

    # Playlist x (song+tag) matrix weighted by idf and damped by playlist
    # length (+50 smoothing) — this is the similarity space for k-NN.
    row = np.repeat(range(n_train), plylst_train['num_sts'])
    col = [st for sts in plylst_train['sts_id'] for st in sts]
    dat = [
        inv_doc_freq[st_tid_id[st]] / (len(sts) + 50)
        for sts in plylst_train['sts_id'] for st in sts
    ]
    train_sts_A = spr.csr_matrix((dat, (row, col)), shape=(n_train, n_sts))

    # Transposes for fast candidate scoring (item x playlist).
    train_songs_A_T = train_songs_A.T.tocsr()
    train_tags_A_T = train_tags_A.T.tocsr()

    # Map playlist id (as str) -> songs / original tags for playlist2vec
    # neighbor lookups.
    if MODE == "Valid":
        p2v_targets = [train, test, dev, test_res]
    elif MODE == "Dev":
        p2v_targets = [train, test, test_res]
    else:
        p2v_targets = [train, test, dev]

    song_dic = {}
    tag_dic = {}
    for i, q in tqdm(pd.concat(p2v_targets).iterrows()):
        song_dic[str(q['id'])] = q['songs']
        tag_dic[str(q['id'])] = q['tags_org']

    # Load the pretrained playlist-embedding models.
    p2v_song = WordEmbeddingsKeyedVectors.load(
        "arena_data/model/p2v_song.model")
    p2v_tag = WordEmbeddingsKeyedVectors.load("arena_data/model/p2v_tag.model")

    print("Predicting")
    res = []
    # Diagnostics: queries whose candidate pool came up short after filtering.
    filtered_lot_song = []
    filtered_lot_tag = []
    for pid in tqdm(test_set.index):
        songs_already = test_set.loc[pid, "songs_id"]
        tags_already = test_set.loc[pid, "tags_id"]

        # Song prediction - 1. Build the idf-weighted query vector over the
        # joint song+tag vocabulary (same weighting as train_sts_A rows).
        p = np.zeros((n_sts, 1))
        if len(test_set.loc[pid, 'sts_id']) > 0:
            for st in test_set.loc[pid, 'sts_id']:
                if st_tid_id[st] in inv_doc_freq:
                    p[st] = inv_doc_freq[st_tid_id[st]] / (
                        len(test_set.loc[pid, 'sts_id']) + 50)

        # Song prediction - 2. Similarity to every playlist; keep only the
        # top-250 neighbors and square their scores to sharpen the weighting.
        val = train_sts_A.dot(p).reshape(-1)

        val_idx = val.reshape(-1).argsort()[-250:][::-1]

        val_knn = np.zeros((n_train))
        val_knn[val_idx] = val[val_idx]

        val = val_knn**2

        # Song prediction - 3. Candidate song scores = weighted sum of
        # neighbor playlists' song memberships.
        cand_song = train_songs_A_T.dot(val)

        # Song prediction - 4. Rescore with playlist2vec: boost songs that
        # appear in the 50 nearest playlists in embedding space.
        dic_song_score = {}
        if str(plylst_nid_id[pid]) in p2v_song.wv.vocab:
            most_id = [
                x for x in p2v_song.most_similar(str(plylst_nid_id[pid]),
                                                 topn=50)
            ]
            for ID in most_id:
                for s in song_dic[ID[0]]:
                    if s in dic_song_score:
                        dic_song_score[s] += ID[1]
                    else:
                        dic_song_score[s] = ID[1]

        for k in dic_song_score:
            cand_song[song_id_sid[k]] *= dic_song_score[k]**0.2

        cand_song_idx = cand_song.reshape(-1).argsort()[-5000:][::-1]

        # Song prediction - 5. Drop zero-scored songs and songs issued after
        # the playlist's update date (compact YYYYMMDD string comparison).
        cand_song_idx_filtered = []
        for cand in cand_song_idx:
            if cand_song[cand] > 0 and song_date[song_sid_id[
                    cand]] <= test_date[plylst_nid_id[pid]][:4] + test_date[
                        plylst_nid_id[pid]][5:7] + test_date[
                            plylst_nid_id[pid]][8:10]:
                cand_song_idx_filtered.append(cand)
        if len(cand_song_idx_filtered) < 400:
            filtered_lot_song.append(len(cand_song_idx_filtered))
        cand_song_idx = np.array(cand_song_idx_filtered)

        # Song prediction - 6. Heuristic rescoring based on the query's own
        # songs: their idf spread and their issue-date distribution.
        dict_score = {}
        for idx in cand_song_idx:
            dict_score[idx] = cand_song[idx]

        mean_doc_freq = 0
        std_doc_freq = 0
        list_doc_freq = []
        mean_song_date = 0
        list_song_date = []
        if len(test_set.loc[pid, "songs_id"]) > 0:
            for t in test_set.loc[pid, "songs_id"]:
                if song_sid_id[t] in inv_doc_freq:
                    list_doc_freq.append(inv_doc_freq[song_sid_id[t]])
                song_d = int(song_date[song_sid_id[t]])
                # Only trust dates in a plausible range (1900..2020).
                if song_d > 19000000 and song_d < 20210000:
                    list_song_date.append(song_d)
            if len(list_doc_freq) > 0:
                mean_doc_freq = np.mean(list_doc_freq)
                std_doc_freq = np.std(list_doc_freq)
            if len(list_song_date) > 0:
                mean_song_date = np.mean(list_song_date)

        # Song prediction - 6-1. Favor candidates whose idf is close to the
        # query's mean idf; the blend weight shrinks as the query has more
        # songs (1/sqrt(len) toward the raw score).
        if len(list_doc_freq) > 0:
            for c in dict_score:
                if song_sid_id[c] in inv_doc_freq:
                    dict_score[c] = 1 / (
                        len(list_doc_freq)**0.5) * dict_score[c] + (
                            1 - 1 /
                            (len(list_doc_freq)**0.5)) * dict_score[c] * 2 / (
                                np.abs(inv_doc_freq[song_sid_id[c]] -
                                       mean_doc_freq) / (std_doc_freq + 1) + 2)
                else:
                    dict_score[c] = 1 / (len(list_doc_freq)**
                                         0.5) * dict_score[c]

        # Song prediction - 6-2. Same blending idea, favoring candidates whose
        # issue date is close to the query songs' mean issue date.
        if len(list_song_date) > 0:
            for c in dict_score:
                song_d = int(song_date[song_sid_id[c]])
                if song_d > 19000000 and song_d < 20210000:
                    dict_score[c] = 1 / (
                        len(list_song_date)**0.5) * dict_score[c] + (
                            1 - 1 /
                            (len(list_song_date)**0.5)) * dict_score[c] / (
                                np.abs(song_d - mean_song_date) / 500000 + 1)
                else:
                    dict_score[c] = 1 / (len(list_song_date)**
                                         0.5) * dict_score[c]

        score_sorted = sorted(dict_score.items(),
                              key=lambda x: x[1],
                              reverse=True)

        cand_song_idx = []
        for t in score_sorted:
            cand_song_idx.append(t[0])
        cand_song_idx = np.array(cand_song_idx)

        # Drop songs the playlist already contains; keep the top 300.
        cand_song_idx = cand_song_idx[np.isin(cand_song_idx, songs_already) ==
                                      False][:300]
        rec_song_idx = [song_sid_id[i] for i in cand_song_idx]

        # Tag prediction - 1. Unweighted (binary) query vector this time.
        p = np.zeros((n_sts, 1))
        p[test_set.loc[pid, 'sts_id']] = 1

        # Tag prediction - 2. Same top-250 k-NN + squaring as for songs.
        val = train_sts_A.dot(p).reshape(-1)

        val_idx = val.reshape(-1).argsort()[-250:][::-1]

        val_knn = np.zeros((n_train))
        val_knn[val_idx] = val[val_idx]

        val = val_knn**2

        # Tag prediction - 3. Candidate tag scores from neighbor playlists.
        cand_tag = train_tags_A_T.dot(val)

        # Tag prediction - 4. playlist2vec rescoring (exponent 0.5 here vs.
        # 0.2 for songs).
        dic_tag_score = {}
        if str(plylst_nid_id[pid]) in p2v_tag.wv.vocab:
            most_id = [
                x
                for x in p2v_tag.most_similar(str(plylst_nid_id[pid]), topn=50)
            ]
            for ID in most_id:
                for t in tag_dic[ID[0]]:
                    if t in dic_tag_score:
                        dic_tag_score[t] += ID[1]
                    else:
                        dic_tag_score[t] = ID[1]

        for k in dic_tag_score:
            cand_tag[tag_id_tid[k]] *= dic_tag_score[k]**0.5

        cand_tag_idx = cand_tag.reshape(-1).argsort()[-35:][::-1]

        # Tag prediction - 5. Keep only positively-scored tags.
        cand_tag_idx_filtered = []
        for cand in cand_tag_idx:
            if cand_tag[cand] > 0:
                cand_tag_idx_filtered.append(cand)
        if len(cand_tag_idx_filtered) != 35:
            filtered_lot_tag.append(len(cand_tag_idx_filtered))
        cand_tag_idx = np.array(cand_tag_idx_filtered)

        # Drop tags the playlist already has; keep the top 30.
        cand_tag_idx = cand_tag_idx[np.isin(cand_tag_idx, tags_already) ==
                                    False][:30]
        rec_tag_idx = [tag_tid_id[i] for i in cand_tag_idx]

        res.append({
            "id": plylst_nid_id[pid],
            "songs": rec_song_idx,
            "tags": rec_tag_idx
        })

    print(len(filtered_lot_song), filtered_lot_song)
    print(len(filtered_lot_tag), filtered_lot_tag)

    # NOTE(review): write_json gets "results/" + results_path while the
    # evaluator reads "arena_data/results/" + results_path — presumably
    # write_json itself prepends "arena_data/"; confirm against its helper.
    write_json(res, "results/" + opt["results_path"])

    if opt["eval"]:
        evaluator = CustomEvaluator()
        evaluator.evaluate("arena_data/answers/val.json",
                           "arena_data/results/" + opt["results_path"])