def _generate_answers(self, song_meta_json, train, questions):
        song_meta = {int(song["id"]): song for song in song_meta_json}
        song_mp_counter, song_mp = most_popular(train, "songs", 200)
        tag_mp_counter, tag_mp = most_popular(train, "tags", 100)
        song_mp_per_genre = self._song_mp_per_genre(song_meta, song_mp_counter)

        answers = []
        for q in tqdm(questions):
            genre_counter = Counter()

            for sid in q["songs"]:
                for genre in song_meta[sid]["song_gn_gnr_basket"]:
                    genre_counter.update({genre: 1})

            top_genre = genre_counter.most_common(1)

            if len(top_genre) != 0:
                cur_songs = song_mp_per_genre[top_genre[0][0]]
            else:
                cur_songs = song_mp

            answers.append({
                "id": q["id"],
                "songs": remove_seen(q["songs"], cur_songs)[:100],
                "tags": remove_seen(q["tags"], tag_mp)[:10]
            })

        return answers
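# Note: every example on this page relies on helper functions such as
# most_popular() and remove_seen() (in the Kakao Arena baseline they live in
# arena_util.py). Their implementation is not shown here, so the following is
# only a minimal sketch inferred from how they are called in the examples.
from collections import Counter

def most_popular(playlists, col, topk_count):
    # count how often each item appears in the given field across playlists
    # and return the counter together with the top-k item ids
    counter = Counter()
    for doc in playlists:
        counter.update(doc[col])
    topk = counter.most_common(topk_count)
    return counter, [k for k, v in topk]

def remove_seen(seen, l):
    # keep the order of l, but drop anything already in seen
    seen = set(seen)
    return [x for x in l if x not in seen]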
Example #2
    def _generate_answers(self, song_meta_json, train, questions):
        # build a dictionary mapping each song_id to that song's metadata
        song_meta = {int(song["id"]): song for song in song_meta_json}
        # top 200 songs
        song_mp_counter, song_mp = most_popular(train, "songs", 200)
        # top 100 tags
        tag_mp_counter, tag_mp = most_popular(train, "tags", 100)
        song_mp_per_genre = self._song_mp_per_genre(song_meta, song_mp_counter)

        answers = []
        for q in tqdm(questions):
            genre_counter = Counter()

            for sid in q["songs"]:
                for genre in song_meta[sid]["song_gn_gnr_basket"]:
                    genre_counter.update({genre: 1})

            top_genre = genre_counter.most_common(1)
            # if the playlist has a most frequent genre
            if len(top_genre) != 0:
                # recommend the most popular songs within that genre
                cur_songs = song_mp_per_genre[top_genre[0][0]]
            else:
                # otherwise recommend the overall most popular songs
                cur_songs = song_mp

            answers.append({
                "id": q["id"],
                "songs": remove_seen(q["songs"], cur_songs)[:100],
                "tags": remove_seen(q["tags"], tag_mp)[:10]
            })

        return answers
Example #3
    def _generate_answers(self, val):

        # Loading all the files required for recommendation
        with open("songtag_length.pkl", "rb") as f:
            songtag_length = pickle.load(f)
        with open("popular_song_dict.pkl", "rb") as f:
            popular_song_dict = pickle.load(f)
        with open("popular_tag_dict.pkl", "rb") as f:
            popular_tag_dict = pickle.load(f)
        with open("trainval_id_dict.pkl", "rb") as f:
            trainval_id_dict = pickle.load(f)
        songtag_matrix = sparse.load_npz('songtag_matrix.npz')
        with open('model.sav', 'rb') as f:
            model = pickle.load(f)

        # Setting the number of popular songs equal to train.py
        popular_num_song = songtag_length[0]


        ###########################
        print("Finished 1st step!")
        ###########################


        # Making recommendation lists (takes approximately 50 minutes)
        song_fill = []
        tag_fill = []
        for j in [trainval_id_dict[i] for i in val.id.values]:
            song_fill.append([popular_song_dict[k] for k,_ in model.recommend(j, songtag_matrix, filter_items = range(popular_num_song, songtag_matrix.shape[1]), N = 200)])
            tag_fill.append([popular_tag_dict[k] for k in [i for i,_ in model.rank_items(j, songtag_matrix, list(range(popular_num_song, songtag_matrix.shape[1])))[:15]]])
        
        
        ###########################
        print("Finished 2nd step!")
        ###########################


        # Creating the final dictionary for results.json
        answers = []
        for i in range(len(val)):
            answers.append({
                "id": val.id.values[i],
                "songs": remove_seen(val.songs.values[i], song_fill[i])[:100],
                "tags": remove_seen(val.tags.values[i], tag_fill[i])[:10]
            })


        ###########################
        print("Finished 3rd step!")
        print("Finished writing answers!")
        ###########################


        return answers
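# The method above only loads artifacts produced elsewhere (presumably by the
# project's train.py). The sketch below shows one plausible way those files
# could be written; the exact contents of songtag_length, popular_song_dict,
# popular_tag_dict and trainval_id_dict are assumptions based on how they are
# used above, and an implicit release < 0.5 is assumed so that the
# model.recommend() / model.rank_items() calls above keep their signatures.
import pickle
import implicit
from scipy import sparse

def save_artifacts(songtag_matrix, songtag_length, popular_song_dict,
                   popular_tag_dict, trainval_id_dict):
    als = implicit.als.AlternatingLeastSquares(factors=128)
    als.fit(songtag_matrix.T)  # older implicit expects an item-user matrix
    sparse.save_npz("songtag_matrix.npz", songtag_matrix)
    with open("model.sav", "wb") as f:
        pickle.dump(als, f)
    for name, obj in [("songtag_length.pkl", songtag_length),
                      ("popular_song_dict.pkl", popular_song_dict),
                      ("popular_tag_dict.pkl", popular_tag_dict),
                      ("trainval_id_dict.pkl", trainval_id_dict)]:
        with open(name, "wb") as f:
            pickle.dump(obj, f)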
Example #4
    def _generate_answers(self, train, questions):
        _, song_mp = most_popular(train, "songs", 200)
        _, tag_mp = most_popular(train, "tags", 100)

        answers = []

        for q in tqdm(questions):
            answers.append({
                "id": q["id"],
                "songs": remove_seen(q["songs"], song_mp)[:100],
                "tags": remove_seen(q["tags"], tag_mp)[:10],
            })

        return answers
Example #5
    def _generate_answers(self, train, questions, song_meta):
        # most-popular fallbacks; song_mp and tag_mp are used in the answers below
        _, song_mp = most_popular(train, "songs", 200)
        _, tag_mp = most_popular(train, "tags", 100)

        song_infos = {}
        for t in train:
            song_infos[t['id']] = [song_meta[a] for a in t['songs']]

        # playlist embeddings via songs2vec (computed here but not used by the
        # popularity-based answers below)
        plylst_list = {}
        for plylst, songs in song_infos.items():
            plylst_list[plylst] = songs2vec(songs)

        answers = []

        for q in tqdm(questions):
            answers.append({
                "id": q["id"],
                "songs": remove_seen(q["songs"], song_mp)[:100],
                "tags": remove_seen(q["tags"], tag_mp)[:10],
            })

        return answers
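# songs2vec() is referenced above but not defined in this example. The sketch
# below is purely hypothetical: it embeds a playlist as the average of simple
# genre indicator vectors built from each song's "song_gn_gnr_basket" field.
import numpy as np

def songs2vec(songs, genre_index=None):
    # songs: list of song_meta dicts; genre_index: genre -> dimension mapping
    if genre_index is None:
        genres = sorted({g for s in songs for g in s["song_gn_gnr_basket"]})
        genre_index = {g: i for i, g in enumerate(genres)}
    vec = np.zeros(max(len(genre_index), 1))
    for s in songs:
        for g in s["song_gn_gnr_basket"]:
            if g in genre_index:
                vec[genre_index[g]] += 1
    return vec / max(len(songs), 1)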
Example #6
def generate_answers(train, questions):

    _, song_mp = most_popular(train, "songs", 200)
    _, tag_mp = most_popular(train, "tags", 100)

    answers = []

    for q in questions:
        if len(q["songs"]) != 0 and len(q["tags"]) != 0:
            answers.append({
                "id": q["id"],
                "songs": q["songs"],
                "tags": q["tags"]
            })
        else:
            answers.append({
                "id": q["id"],
                "songs": remove_seen(q["songs"], song_mp)[:100],
                "tags": remove_seen(q["tags"], tag_mp)[:10]
            })

    return answers
Example #7
    def cf_(self,
            train_songs_A,
            train_tags_A,
            test_songs_A,
            test_tags_A,
            song_ntop=500,
            tag_ntop=50):

        print("CF...")

        train_songs_A_T = train_songs_A.T.tocsr(
        )  # shape) n_songs * n_train ply
        train_tags_A_T = train_tags_A.T.tocsr()  # shape) n_tags * n_train ply

        res = []

        song_val = test_songs_A.dot(train_songs_A_T)
        tag_val = test_tags_A.dot(train_tags_A_T)

        cand_song_matrix = song_val.dot(train_songs_A)
        cand_tag_matrix = tag_val.dot(train_tags_A)

        del song_val
        del tag_val

        for r, pid in tqdm(enumerate(self.plylst_test.index)):

            songs_already = self.plylst_test.loc[pid, "songs_id"]
            tags_already = self.plylst_test.loc[pid, "tags_id"]
            '''
            if self.plylst_test.loc[pid,"song_added"]:
                songs_already = self.orig_test.loc[self.plylst_nid_id[pid],"songs"]
            
            if self.plylst_test.loc[pid,"tag_added"]:
                tags_already = self.orig_test.loc[self.plylst_nid_id[pid],"tags"]
            
            '''

            song_row = cand_song_matrix.getrow(r).toarray().reshape(
                -1, )  # 1 * n_songs score vector
            cand_song_idx = song_row.argsort(
            )[-song_ntop - 50:][::-1]  # indices (song sids) sorted by score
            cand_song_idx = remove_seen(
                songs_already, cand_song_idx
            )[:song_ntop]  # keep candidates that are not already in songs_already
            #cand_song_idx = cand_song_idx[np.isin(cand_song_idx, songs_already) == False][:song_ntop] # drop songs already present, keep top n
            rec_song_score = [song_row[i] for i in cand_song_idx]

            tag_row = cand_tag_matrix.getrow(r).toarray().reshape(-1, )
            cand_tag_idx = tag_row.argsort()[-tag_ntop - 5:][::-1]
            cand_tag_idx = remove_seen(tags_already, cand_tag_idx)[:tag_ntop]
            #cand_tag_idx = cand_tag_idx[np.isin(cand_tag_idx, tags_already) == False][:tag_ntop]
            rec_tag_score = [tag_row[i] for i in cand_tag_idx]

            res.append({
                "id": self.plylst_nid_id[pid],  # convert back to original id
                "songs":
                [self.song_sid_id[i] for i in cand_song_idx],  # original ids
                "tags": [self.tag_tid_id[i] for i in cand_tag_idx],  # original ids
                "songs_score": rec_song_score,
                "tags_score": rec_tag_score
            })

        print("DONE")

        return res
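# cf_() expects binary playlist-by-song / playlist-by-tag CSR matrices
# (train_songs_A, test_songs_A, train_tags_A, test_tags_A). Example #11 below
# builds such matrices with scipy.sparse.coo_matrix; this is a condensed
# sketch of the same construction, assuming a DataFrame whose "songs_id"
# column holds lists of re-indexed song ids.
import numpy as np
from scipy.sparse import coo_matrix

def build_plylst_song_matrix(plylst_df, n_songs):
    num_songs = plylst_df["songs_id"].str.len()
    row = np.repeat(range(len(plylst_df)), num_songs)
    col = [s for songs in plylst_df["songs_id"] for s in songs]
    dat = np.ones(int(num_songs.sum()))
    return coo_matrix((dat, (row, col)),
                      shape=(len(plylst_df), n_songs)).tocsr()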
Example #8
def put_most_popular(seq, pop):
    unseen = remove_seen(seq, pop)
    return seq + unseen[:len(pop) - len(seq)]
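# Quick illustration of put_most_popular(): it keeps the items already in seq
# and pads them with unseen popular items, so that the result is at most
# len(pop) items long. remove_seen() is assumed to behave like the sketch
# after Example #1 (drop already-seen items, keep order).
if __name__ == "__main__":
    seq = [1, 2]        # items already in the playlist
    pop = [2, 3, 4, 5]  # most popular items
    # remove_seen(seq, pop) -> [3, 4, 5]; sliced to len(pop) - len(seq) = 2
    print(put_most_popular(seq, pop))  # expected: [1, 2, 3, 4]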
Example #9
    def _generate_answers(self, song_meta_json, train,
                          questions):  # train: arena_data/orig/train.json
        # map each song id to that song's metadata (tags, name, etc.)
        song_meta = {int(song["id"]): song for song in song_meta_json}
        # song_mp_counter: per-song occurrence counts (e.g. {..., 18273: 1, ...})
        # song_mp: the 200 songs appearing most often in train
        song_mp_counter, song_mp = most_popular(train, "songs", 200)
        # the 100 tags appearing most often in train
        tag_mp_counter, tag_mp = most_popular(train, "tags", 100)
        # popular songs per genre, e.g. {"pop": [...], ...}
        song_mp_per_genre = self._song_mp_per_genre(song_meta, song_mp_counter)
        # artists with more than 200 songs, mapped to their songs
        art_dic = self._artist_songs(song_meta, song_mp_counter)
        tag_id = self._songs_most_tag(train)

        answers = []
        for q in tqdm(questions):

            genre_counter = Counter()
            art_c = Counter()
            tag_c = Counter()

            for sid in q["songs"]:
                for genre in song_meta[sid]["artist_name_basket"]:
                    art_c.update({genre: 1})

            artist_name = list(art_c.keys())

            for sid in q["songs"]:
                for genre in song_meta[sid]["song_gn_gnr_basket"]:
                    genre_counter.update({genre: 1})

            top_genre = genre_counter.most_common(1)

            if len(artist_name) == 1 and artist_name[0] in art_dic.keys():
                cur_songs = list(art_dic[artist_name[0]])
            elif len(top_genre) != 0:
                cur_songs = song_mp_per_genre[top_genre[0][0]]
            else:
                cur_songs = song_mp

            tag_list = []

            if q["songs"]:
                for sid in q["songs"]:
                    if sid in tag_id:
                        for a in tag_id[sid]:
                            tag_c.update({a: 1})
                tag_list = [k for k, v in tag_c.most_common()]
                if len(tag_list) > 10:
                    cur_tags = tag_list[:10]
                else:
                    new_list = remove_seen(tag_list, tag_mp)[:10]
                    cur_tags = (tag_list + new_list)[:10]
            else:
                cur_tags = remove_seen(q["tags"], tag_mp)[:10]

            answers.append({
                "id": q["id"],
                "songs": remove_seen(q["songs"], cur_songs)[:100],
                "tags": cur_tags
            })

        return answers
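# _songs_most_tag() is a method of the surrounding class and is not shown in
# this example. Judging from how tag_id is used above (tag_id[sid] yields tags
# for a seed song), a plausible standalone sketch is a mapping from each song
# to the tags of the playlists that contain it:
from collections import defaultdict

def songs_most_tag(train):
    # hypothetical helper: song id -> tags co-occurring with that song
    song_tags = defaultdict(list)
    for plylst in train:
        for sid in plylst["songs"]:
            song_tags[sid].extend(plylst["tags"])
    return song_tags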
Example #10
    def mixed_(self,
               train_songs_A,
               train_tags_A,
               test_songs_A,
               test_tags_A,
               song_ntop=500,
               tag_ntop=50,
               iteration=20):

        print("MF for song / CF for tag...")

        res = []

        # song
        songs_A = spr.vstack([test_songs_A, train_songs_A])
        als_model = ALS(factors=256,
                        regularization=0.08,
                        use_gpu=True,
                        iterations=iteration)
        als_model.fit(songs_A.T * 100)

        # tag
        train_tags_A_T = train_tags_A.T.tocsr()  # shape) n_tags * n_train ply
        tag_val = test_tags_A.dot(train_tags_A_T)

        cand_tag_matrix = tag_val.dot(train_tags_A)

        del tag_val

        for r, pid in tqdm(enumerate(range(test_songs_A.shape[0]), 0)):

            # song
            if self.plylst_test.loc[(self.n_train + pid), "song_dirty"] == 1:
                cand_song = als_model.recommend(
                    pid,
                    test_songs_A,
                    N=song_ntop,
                    filter_already_liked_items=False)

            else:
                cand_song = als_model.recommend(
                    pid,
                    test_songs_A,
                    N=song_ntop,
                    filter_already_liked_items=True)

            rec_song_idx = [self.song_sid_id.get(x[0]) for x in cand_song]
            rec_song_score = [x[1] for x in cand_song]

            # tag
            tag_row = cand_tag_matrix.getrow(r).toarray().reshape(-1, )
            cand_tag_idx = tag_row.argsort()[-tag_ntop - 5:][::-1]

            tags_already = self.plylst_test.loc[self.n_train + pid, "tags_id"]

            if self.plylst_test.loc[(self.n_train + pid), "tag_dirty"] == 1:
                rec_tag_idx = remove_seen(tags_already,
                                          cand_tag_idx)[:tag_ntop]

            else:
                tags_already = self.plylst_test.loc[self.n_train + pid,
                                                    "tags_id"]
                rec_tag_idx = remove_seen(tags_already,
                                          cand_tag_idx)[:tag_ntop]

            rec_tag_score = [tag_row.data[i] for i in cand_tag_idx]

            res.append({
                "id": self.plylst_nid_id[self.n_train + pid],
                "songs": rec_song_idx,
                "tags": [self.tag_tid_id[i] for i in rec_tag_idx],
                "songs_score": rec_song_score,
                "tags_score": rec_tag_score
            })

        return res
Example #11
    def _generate_answers(self, train, questions):

        _, song_mp = self._most_popular(train, "songs", 200)
        _, tag_mp = self._most_popular(train, "tags", 100)

        len_train = len(train)  # 92,056 playlists
        len_question = len(questions)  # 23,015 playlists

        plylst = pd.concat([train, questions])
        plylst["nid"] = range(len_train + len_question)
        # plylst_id_nid = dict(zip(plylst["id"], plylst["nid"]))
        plylst_nid_id = dict(zip(plylst["nid"], plylst["id"]))

        # assign a new index (sid) to each song
        # len_songs : 576168
        song_id_sid, song_sid_id, len_songs = self._add_new_id(plylst, "songs")

        # assign a new index (tid) to each tag
        # len_tags : 26586
        tag_id_tid, tag_tid_id, len_tags = self._add_new_id(plylst, "tags")

        plylst['songs_id'] = plylst['songs']\
            .map(lambda x: [song_id_sid.get(s) for s in x
                            if song_id_sid.get(s) is not None])
        plylst['tags_id'] = plylst['tags']\
            .map(lambda x: [tag_id_tid.get(t) for t in x
                            if tag_id_tid.get(t) is not None])

        plylst_use = plylst[['nid', 'songs_id', 'tags_id']].copy()
        plylst_use['num_songs'] = plylst_use['songs_id'].str.len()
        plylst_use['num_tags'] = plylst_use['tags_id'].str.len()
        plylst_use = plylst_use.set_index('nid')

        plylst_train = plylst_use.iloc[:len_train, :]
        plylst_test = plylst_use.iloc[len_train:, :]

        row = np.repeat(range(len_train), plylst_train['num_songs'])  # 4239978
        col = [song for songs in plylst_train['songs_id'] for song in songs]  # 4239978
        dat = np.repeat(1, plylst_train['num_songs'].sum())  # 4239978
        train_songs_A = coo_matrix((dat, (row, col)),
                                   shape=(len_train, len_songs))  # (92056, 576168)
        train_songs_A_T = train_songs_A.T.tocsr()

        row = np.repeat(range(len_train), plylst_train['num_tags'])
        col = [tag for tags in plylst_train['tags_id'] for tag in tags]
        dat = np.repeat(1, plylst_train['num_tags'].sum())
        train_tags_A = coo_matrix((dat, (row, col)),
                                  shape=(len_train, len_tags))
        train_tags_A_T = train_tags_A.T.tocsr()

        # recommend songs and tags
        ans = []
        for pid in tqdm(plylst_test.index):

            # songs and tags already present in the playlist being predicted
            songs_already = plylst_test.loc[pid, "songs_id"]
            tags_already = plylst_test.loc[pid, "tags_id"]

            if not songs_already:
                rec_song_idx = song_mp
                rec_tag_idx = remove_seen(tags_already, tag_mp)
            else:
                p = np.zeros((len_songs, 1))  # (576168, 1)
                p[plylst_test.loc[pid, 'songs_id']] = 1

                val = train_songs_A.dot(p).reshape(-1)  # (92056, )

                cand_song = train_songs_A_T.dot(val)  # (576168, )
                cand_song_idx = cand_song.reshape(-1).argsort()[-200:][::-1]
                cand_song_idx = remove_seen(songs_already, cand_song_idx)
                rec_song_idx = [song_sid_id[i] for i in cand_song_idx]

                cand_tag = train_tags_A_T.dot(val)
                cand_tag_idx = cand_tag.reshape(-1).argsort()[-20:][::-1]
                cand_tag_idx = remove_seen(tags_already, cand_tag_idx)
                rec_tag_idx = [tag_tid_id[i] for i in cand_tag_idx]

            ans.append({
                "id": plylst_nid_id[pid],
                "songs": rec_song_idx[:100],
                "tags": rec_tag_idx[:10]
            })

        return ans
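# _add_new_id() and _most_popular() are methods of the surrounding class and
# are not shown here. From how it is used above, _add_new_id(plylst, col)
# apparently assigns contiguous indices to every distinct item in the given
# column; a standalone sketch of that behaviour:
def add_new_id(plylst, col):
    # plylst: DataFrame with a column of id lists (e.g. "songs" or "tags")
    ids = sorted({item for items in plylst[col] for item in items})
    id_to_new = {x: i for i, x in enumerate(ids)}
    new_to_id = {i: x for x, i in id_to_new.items()}
    return id_to_new, new_to_id, len(ids)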
Example #12
def train():
    MODE = "Test"
    if MODE == "Valid":
        train = load_json("arena_data/orig/train.json") + load_json(
            "arena_data/questions/val.json")
        dev = load_json("res/val.json")
        test = load_json("res/test.json")
    else:
        train = load_json("res/train.json")
        dev = load_json("res/val.json")
        test = load_json("res/test.json")

    pred_tag = load_json("arena_data/model/pred_tag.json")
    dic_pred_tag = {}
    for p_t in pred_tag:
        dic_pred_tag[p_t['id']] = p_t['predict_tag']

    for doc in train:
        doc['tags_org'] = doc['tags'].copy()
        doc['tags'] += dic_pred_tag[doc['id']]

    for doc in dev:
        doc['tags_org'] = doc['tags'].copy()
        doc['tags'] += dic_pred_tag[doc['id']]

    for doc in test:
        doc['tags_org'] = doc['tags'].copy()
        doc['tags'] += dic_pred_tag[doc['id']]

    item_list = []
    len_item = []

    for doc in train + dev + test:
        song_list = []
        for i in doc['songs']:
            song_list.append(str(i))
        item_list.append(song_list + doc['tags'])
        len_item.append(len(song_list + doc['tags']))
    print("Max length of item list :", max(len_item), ", Min :", min(len_item))
    item_list = [x for x in item_list if len(x) > 1]
    print("Train set :", len(item_list))

    print("Training Item2Vec model")
    SIZE = 100
    model = Word2Vec(sentences=item_list,
                     size=SIZE,
                     window=240,
                     min_count=2,
                     sg=1,
                     workers=8,
                     iter=10,
                     negative=7,
                     compute_loss=True,
                     callbacks=[LossPrinter()])
    model.save("arena_data/model/word2vec.model")
    print("Vocab : ", len(model.wv.vocab))

    print("Building and saving playlist embeddings")
    song_dic = {}
    tag_dic = {}
    for q in tqdm(train + test + dev):
        song_dic[str(q['id'])] = q['songs']
        tag_dic[str(q['id'])] = q['tags_org']

    p2v_song = WordEmbeddingsKeyedVectors(SIZE)
    ID = []
    vec = []
    for q in tqdm(train + test + dev):
        tmp_vec = 0
        cnt_vocab = 0
        if len(q['songs']) >= 1:
            for item in q['songs']:
                try:
                    tmp_vec += model.wv.get_vector(str(item)) * 2
                    cnt_vocab += 1
                except KeyError:
                    pass
        if len(q['tags']) >= 1:
            for item in q['tags']:
                try:
                    tmp_vec += model.wv.get_vector(str(item))
                    cnt_vocab += 1
                except KeyError:
                    pass
        if type(tmp_vec) != int:
            ID.append(str(q['id']))
            vec.append(tmp_vec)
    p2v_song.add(ID, vec)
    p2v_song.save("arena_data/model/p2v_song.model")

    p2v_tag = WordEmbeddingsKeyedVectors(SIZE)
    ID = []
    vec = []
    for q in tqdm(train + test + dev):
        tmp_vec = 0
        cnt_vocab = 0
        if len(q['songs']) >= 1:
            for item in q['songs']:
                try:
                    tmp_vec += model.wv.get_vector(str(item))
                    cnt_vocab += 1
                except KeyError:
                    pass
        if len(q['tags']) >= 1:
            for item in q['tags']:
                try:
                    tmp_vec += model.wv.get_vector(str(item)) * 2
                    cnt_vocab += 1
                except KeyError:
                    pass
        if type(tmp_vec) != int:
            ID.append(str(q['id']))
            vec.append(tmp_vec)
    p2v_tag.add(ID, vec)
    p2v_tag.save("arena_data/model/p2v_tag.model")

    if MODE == "Valid":
        print("Testing")
        questions = load_json("arena_data/questions/val.json")
        cnt_wv_song = 0
        cnt_wv_tag = 0
        res = []
        for q in tqdm(questions):
            dic_song_score = {}
            dic_tag_score = {}

            song_result = []
            tag_result = []

            if str(q['id']) in p2v_song.wv.vocab:
                most_id = [
                    x for x in p2v_song.most_similar(str(q['id']), topn=50)
                ]
                for ID in most_id:
                    for s in song_dic[ID[0]]:
                        if s in dic_song_score:
                            dic_song_score[s] += ID[1]
                        else:
                            dic_song_score[s] = ID[1]

            if str(q['id']) in p2v_tag.wv.vocab:
                most_id = [
                    x for x in p2v_tag.most_similar(str(q['id']), topn=50)
                ]
                for ID in most_id:
                    for t in tag_dic[ID[0]]:
                        if t in dic_tag_score:
                            dic_tag_score[t] += ID[1]
                        else:
                            dic_tag_score[t] = ID[1]

            if len(dic_song_score) > 0:
                sort_song_score = sorted(dic_song_score.items(),
                                         key=lambda x: x[1],
                                         reverse=True)

                for s in sort_song_score:
                    song_result.append(s[0])
                cnt_wv_song += 1

            if len(dic_tag_score) > 0:
                sort_tag_score = sorted(dic_tag_score.items(),
                                        key=lambda x: x[1],
                                        reverse=True)

                for s in sort_tag_score:
                    tag_result.append(s[0])
                cnt_wv_tag += 1

            res.append({
                "id": q["id"],
                "songs": remove_seen(q["songs"], song_result)[:100],
                "tags": remove_seen(q["tags"], tag_result)[:10],
            })

        print(len(questions), cnt_wv_song, cnt_wv_tag)

        ans = load_json("arena_data/answers/val.json")
        evaluator = CustomEvaluator()
        evaluator._evaluate(ans, res)
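# LossPrinter (passed to Word2Vec as a callback above) is not defined on this
# page. A minimal sketch compatible with gensim 3.x, assuming it simply prints
# the training loss after each epoch:
from gensim.models.callbacks import CallbackAny2Vec

class LossPrinter(CallbackAny2Vec):
    def __init__(self):
        self.epoch = 0

    def on_epoch_end(self, model):
        # get_latest_training_loss() reports the loss accumulated so far
        print("Epoch", self.epoch, "loss:", model.get_latest_training_loss())
        self.epoch += 1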
Example #13
    def _generate_answers(self, song_meta_json, train, questions):
        song_meta = {int(song["id"]): song for song in song_meta_json}
        train_meta = {int(plylst["id"]): plylst for plylst in train}

        song_mp_counter, song_mp = most_popular(train, "songs", 200)
        tag_mp_counter, tag_mp = most_popular(train, "tags", 100)
        song_mp_per_genre = self._song_mp_per_genre(song_meta, song_mp_counter)

        tag_per_song = self._tag_per_song(train_meta)

        ## modified for song prediction
        ## pre-processing train set data
        _, song_pop = most_popular(train, "songs", 200000)
        #song_pop = set(song_pop)
        voca_dict, voca_dict_t = self._build_vocadict(song_pop)

        # filtering song list
        num_users = len(train)
        f_song_lst, f_usr_lst = self._const_filtered_lst(train,
                                                         voca_dict,
                                                         num_users,
                                                         to_idx=num_users,
                                                         val=False)
        num_items = len(set(f_song_lst))
        data_len = len(f_song_lst)

        # re-setting index of filtered songs
        item_ids = np.array([voca_dict[i] for i in f_song_lst])
        data = np.ones(data_len)
        rows, cols, data = zip(*set(zip(f_usr_lst, item_ids, data)))
        print('train preproc done', num_items)

        ## pre-processing valid/test set data
        v_num_users = len(questions)
        f_song_lst_v, f_usr_lst_v = self._const_filtered_lst(
            questions, voca_dict, num_users, to_idx=v_num_users, val=True)
        data_len_v = len(f_song_lst_v)

        v_item_ids = np.array([voca_dict[i] for i in f_song_lst_v])
        v_data = np.ones(data_len_v)
        v_rows, v_cols, v_data = zip(
            *set(zip(f_usr_lst_v, v_item_ids, v_data)))
        print('valid preproc done', num_items)

        n_rows = rows + v_rows
        n_cols = cols + v_cols
        n_data = data + v_data
        t_num_users = num_users + v_num_users

        usr_item_mat = sp.csr_matrix((n_data, (n_rows, n_cols)),
                                     shape=(t_num_users, num_items))
        item_usr_mat = usr_item_mat.T
        als_model = implicit.als.AlternatingLeastSquares(factors=100,
                                                         regularization=0.05,
                                                         iterations=50)
        #als_model = implicit.bpr.BayesianPersonalizedRanking(factors=50)  ### actually bpr
        #als_model = implicit.lmf.LogisticMatrixFactorization(factors=50) ### actually Logistic MF
        als_model.fit(item_usr_mat)
        print("als model fitting done")

        ### for cold-start users (plylst containing no song)
        title_to_tok, vocab = title_to_token(train)
        v_title_to_tok, v_vocab = title_to_token(questions)
        title_to_tok.extend(v_title_to_tok)
        vocab.extend(v_vocab)
        print("title to token converted", len(title_to_tok), len(vocab))

        fin_vocab = get_fin_vocab(vocab)
        print("final vocab size", len(fin_vocab))

        title_to_idx = []
        for plylst in title_to_tok:
            res_idx = tok_to_idx(plylst, fin_vocab)
            title_to_idx.append(res_idx)

        user_lst, vocab_lst = preproc_for_csr(title_to_idx, 0)
        cb_rows = np.array(user_lst)
        cb_cols = np.array(vocab_lst)
        cb_data = np.ones(len(user_lst))
        plylst_tt_mat = sp.csr_matrix(
            (cb_data, (cb_rows, cb_cols)),
            shape=(len(title_to_tok), len(fin_vocab)))
        print("csr matrix for tf-idf matrix made")

        tfidf_mat = build_tfidf_mat(plylst_tt_mat)

        ####

        answers = []
        for idx, q in tqdm(enumerate(questions)):
            genre_counter = Counter()

            for sid in q["songs"]:
                for genre in song_meta[sid]["song_gn_gnr_basket"]:
                    genre_counter.update({genre: 1})

            top_genre = genre_counter.most_common(1)

            if len(top_genre) != 0:
                cur_songs = song_mp_per_genre[top_genre[0][0]]
            else:
                cur_songs = song_mp

            ## modified for tag prediction
            tag_lst = self._tag_per_plylst(q, tag_per_song)
            tag_res = remove_seen(q["tags"], tag_lst)[:10]
            if len(tag_res) < 10:
                tag_res = remove_seen(q["tags"], tag_mp)[:10]

            ## modified for song prediction
            if len(q["songs"]) == 0:
                n_idx = idx + len(train)
                most_sim_lst = get_sim_plylst(tfidf_mat, given=n_idx, topn=30)
                cands = gather_cand(train, questions, most_sim_lst)
                song_res = remove_seen(q["songs"], cands)[:100]
                #print(n_idx, song_res)
            else:
                song_lst = self._cal_alsmodel(idx, num_users, usr_item_mat,
                                              als_model, voca_dict_t)
                song_res = remove_seen(q["songs"], song_lst)[:100]

            if len(song_res) < 100:
                print('checked here', idx)
                song_res = remove_seen(q["songs"], cur_songs)[:100]

            answers.append({"id": q["id"], "songs": song_res, "tags": tag_res})

        return answers
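# build_tfidf_mat(), title_to_token(), get_sim_plylst() and the other helpers
# used above are defined elsewhere in this project. Purely as an illustration,
# a TF-IDF matrix over the playlist-title token counts could be built with
# scikit-learn, and cosine similarity used to pick the most similar playlists
# for the cold-start branch:
import numpy as np
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.metrics.pairwise import cosine_similarity

def build_tfidf_mat_sketch(plylst_tt_mat):
    # plylst_tt_mat: CSR matrix of raw token counts (playlists x vocabulary)
    return TfidfTransformer().fit_transform(plylst_tt_mat)

def get_sim_plylst_sketch(tfidf_mat, given, topn=30):
    sims = cosine_similarity(tfidf_mat[given], tfidf_mat).ravel()
    order = np.argsort(-sims)
    return [i for i in order if i != given][:topn]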
Example #14
def Recommender(train,
                questions,
                n_msp,
                n_mtp,
                mode,
                sim_measure,
                song_meta,
                freq_song,
                save=False):
    ## final recommendation list
    rec_list = []

    ## Step 1: preprocessing
    # 1) most_popular fallbacks for when recommendations are missing or too short
    _, song_mp = most_popular(train, "songs", 200)
    _, tag_mp = most_popular(train, "tags", 20)

    # 2) build dictionaries for fast lookups
    song_plylst_dic, song_tag_dic, plylst_song_dic, plylst_tag_dic, tag_plylst_dic, tag_song_dic, song_issue_dic, song_artist_dic = DicGenerator(
        train, song_meta)

    # 3) load precomputed playlist similarities
    '''
    sim_scores: similarity between the input questions and train playlists (autoencoder-based)
    gnr_scores: similarity between the input questions and train playlists (with genre information added)
    title_scores: similarity between the input questions and train playlists (Word2vec-based)
    '''
    sim_scores = np.load(f'scores/{mode}_scores_bias_{sim_measure}.npy',
                         allow_pickle=True).item()
    gnr_scores = np.load(f'scores/{mode}_scores_bias_{sim_measure}_gnr.npy',
                         allow_pickle=True).item()
    title_scores = np.load(
        f'scores/{mode}_scores_title_{sim_measure}_24000.npy',
        allow_pickle=True).item()

    ## Step 2: helper functions
    # 1) top-k keys of a Counter, ordered by frequency
    def most_similar(cnt, topk):
        cnt_topk = cnt.most_common(topk)
        return [k for k, v in cnt_topk]

    # 2) top-k playlists and their scores from the precomputed similarities
    def most_similar_emb(q_id, topk, title=False, genre=False):
        # based on title_scores
        if title:
            plylsts = [t[0] for t in title_scores[q_id][:topk]]
            scores = [t[1] for t in title_scores[q_id][:topk]]
        # based on gnr_scores
        elif genre:
            plylsts = [t[0] for t in gnr_scores[q_id][:topk]]
            scores = [t[1] for t in gnr_scores[q_id][:topk]]
        # based on sim_scores
        else:
            plylsts = [t[0] for t in sim_scores[q_id][:topk]]
            scores = [t[1] for t in sim_scores[q_id][:topk]]
        return plylsts, scores

    # 3) new_song_plylst_dict
    def get_new_song_plylst_dict(plylst_ms):
        new_song_plylst_dict = defaultdict(set)
        for plylst in plylst_ms:
            for _song in plylst_song_dic[plylst]:
                new_song_plylst_dict[_song].add(plylst)
        return new_song_plylst_dict

    ## Step 3: make recommendations for each input question playlist
    for q in tqdm(questions):

        # 1) information about the question playlist
        # its seed songs/tags
        q_songs = q['songs']
        q_tags = q['tags']

        # frequency of songs/tags/playlists co-occurring with the seed songs/tags
        song_plylst_C = Counter()
        song_tag_C = Counter()
        tag_plylst_C = Counter()
        tag_song_C = Counter()

        # flags for playlists with no seeds at all or only a few seed songs
        no_songs_tags, few_songs_tags = False, False
        if len(q_songs) == 0 and len(q_tags) == 0:
            no_songs_tags = True
        elif len(q_songs) <= 3:
            few_songs_tags = True

        # 2) counts for frequency-based recommendation
        # for each seed song
        for q_s in q_songs:
            song_plylst_C.update(song_plylst_dic[q_s])
            song_tag_C.update(song_tag_dic[q_s])
        # for each seed tag
        for q_t in q_tags:
            tag_plylst_C.update(tag_plylst_dic[q_t])
            tag_song_C.update(tag_song_dic[q_t])
        # normalize by the playlist's number of songs to get a ratio
        for i, j in list(song_plylst_C.items()):
            if len(plylst_song_dic[i]) > 0:
                song_plylst_C[i] = (j / len(plylst_song_dic[i]))

        # 3) scores for similarity-based recommendation
        plylst_song_scores = defaultdict(lambda: 0)
        plylst_tag_scores = defaultdict(lambda: 0)

        # Case 1: no seed songs and no seed tags
        if no_songs_tags:
            # plylst_ms / plylst_mt: n_msp / n_mtp most similar playlists by title_scores
            plylst_ms, song_scores = most_similar_emb(q['id'],
                                                      n_msp,
                                                      title=True)
            plylst_mt, tag_scores = most_similar_emb(q['id'],
                                                     n_mtp,
                                                     title=True)
            plylst_add, add_scores = most_similar_emb(q['id'], n_mtp)

        # Case 2: only a few seed songs and tags
        elif few_songs_tags:
            # plylst_ms: n_msp by sim_scores / plylst_mt: n_mtp by title_scores
            plylst_ms, song_scores = most_similar_emb(q['id'], n_msp)
            plylst_mt, tag_scores = most_similar_emb(q['id'],
                                                     n_mtp,
                                                     title=True)
            plylst_add, add_scores = most_similar_emb(q['id'],
                                                      n_mtp,
                                                      genre=True)

        # Case 3: enough seed songs and tags
        else:
            # plylst_ms / plylst_mt: n_msp / n_mtp most similar playlists by sim_scores
            plylst_ms, song_scores = most_similar_emb(q['id'], n_msp)
            plylst_mt, tag_scores = most_similar_emb(q['id'],
                                                     n_mtp,
                                                     genre=True)
            plylst_add, add_scores = most_similar_emb(q['id'],
                                                      n_mtp,
                                                      title=True)

        new_song_plylst_dict = get_new_song_plylst_dict(plylst_ms)

        # 3-1. compute plylst_song_scores
        for idx, ms_p in enumerate(plylst_ms):
            for song in plylst_song_dic[ms_p]:
                song_score = 0
                for q_s in q_songs:
                    try:
                        song_score += len(new_song_plylst_dict[q_s]
                                          & new_song_plylst_dict[song]) / len(
                                              new_song_plylst_dict[q_s])
                    except:
                        pass
                if song in freq_song:
                    plylst_song_scores[song] += song_plylst_C[
                        ms_p] * song_score * song_scores[idx] * (n_msp -
                                                                 idx) * 4
                else:
                    plylst_song_scores[song] += song_plylst_C[
                        ms_p] * song_score * song_scores[idx] * (n_msp - idx)
            for tag in plylst_tag_dic[ms_p]:
                plylst_tag_scores[tag] += tag_scores[idx] * (n_msp - idx)

        # 3-2. compute plylst_tag_scores
        for idx, mt_p in enumerate(plylst_mt):
            for tag in plylst_tag_dic[mt_p]:
                plylst_tag_scores[tag] += tag_scores[idx] * (n_mtp - idx)
            for song in plylst_song_dic[mt_p]:
                plylst_song_scores[song] += tag_scores[idx]

        # 3-3. adjust plylst_{song/tag}_scores
        for idx, mt_p in enumerate(plylst_add):
            for tag in plylst_tag_dic[mt_p]:
                plylst_tag_scores[tag] += add_scores[idx] * (n_mtp - idx)

        # 4) if there are no seed songs/tags, fill them in with predictions
        if no_songs_tags:
            # fill q_songs (no seed songs; predicted from playlists similar by title_scores)
            pre_songs = sorted(plylst_song_scores.items(),
                               key=lambda x: x[1],
                               reverse=True)
            pre_songs = [scores[0] for scores in pre_songs][:200]
            pre_songs = pre_songs + remove_seen(pre_songs, song_mp)
            q_songs = pre_songs[:100]

            # fill q_tags (no seed tags; predicted from playlists similar by title_scores)
            pre_tags = sorted(plylst_tag_scores.items(),
                              key=lambda x: x[1],
                              reverse=True)
            pre_tags = [scores[0] for scores in pre_tags][:20]
            pre_tags = pre_tags + remove_seen(pre_tags, tag_mp)
            q_tags = pre_tags[:10]

        # 5) make recommendations for the question playlist
        ## song recommendation
        # when seed songs exist
        lt_song_art = []
        if len(q_songs) > 0:
            plylst_song_scores = sorted(plylst_song_scores.items(),
                                        key=lambda x: x[1],
                                        reverse=True)

            lt_artist = []
            for w_song in q_songs:
                lt_artist.extend(song_artist_dic[w_song])
            counter_artist = Counter(lt_artist)
            counter_artist = sorted(counter_artist.items(),
                                    key=lambda x: x[1],
                                    reverse=True)
            if few_songs_tags:
                artist = [art[0] for art in counter_artist]
            else:
                artist = [x[0] for x in counter_artist if x[1] > 1]
            cand_ms = [scores[0] for scores in plylst_song_scores
                       ][(100 - len(artist)):1000]
            for cand in cand_ms:
                if artist == []:
                    break
                if cand in q_songs:
                    break
                for art in song_artist_dic[cand]:
                    if art in artist:
                        lt_song_art.append(cand)
                        artist.remove(art)
                        break
            song_ms = [scores[0] for scores in plylst_song_scores][:200]

        # no seed songs, but tags exist
        else:
            song_ms = most_similar(tag_song_C, 200)

        ## tag recommendation
        # when seed tags exist
        if len(q_tags) > 0:
            plylst_tag_scores = sorted(plylst_tag_scores.items(),
                                       key=lambda x: x[1],
                                       reverse=True)
            tag_ms = [scores[0] for scores in plylst_tag_scores][:20]

        # no seed tags, but songs exist
        else:
            plylst_tag_scores = sorted(plylst_tag_scores.items(),
                                       key=lambda x: x[1],
                                       reverse=True)
            tag_ms = [scores[0] for scores in plylst_tag_scores][:20]

        ## drop songs whose issue date is after the playlist's update date
        if q['updt_date']:
            q_updt_date = q['updt_date'][:4] + q['updt_date'][5:7] + q[
                'updt_date'][8:10]
            song_ms = [x for x in song_ms if song_issue_dic[x] < q_updt_date]

        ## remove already-seen items and pad with most_popular if too short
        song_candidate = song_ms + remove_seen(song_ms, song_mp)
        tag_candidate = tag_ms + remove_seen(tag_ms, tag_mp)

        song_remove = q_songs
        tag_remove = q_tags

        song_candidate = song_candidate[:100] if no_songs_tags else remove_seen(
            song_remove, song_candidate)[:100]
        if len(lt_song_art) > 0:
            lt_song_art = [x for x in lt_song_art if x not in song_candidate]
            song_candidate[(100 - len(lt_song_art)):100] = lt_song_art

        rec_list.append({
            "id":
            q["id"],
            "songs":
            song_candidate,
            "tags":
            tag_candidate[:10] if no_songs_tags else remove_seen(
                tag_remove, tag_candidate)[:10]
        })

    # 6) optionally save results.json
    if save:
        write_json(
            rec_list, 'results/results_' +
            dt.datetime.now().strftime("%y%m%d-%H%M%S") + '_' + mode + '.json')

    return rec_list
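# A hedged usage sketch for Recommender(). The file paths, parameter values
# and helpers (load_json, most_popular, DicGenerator, the precomputed score
# files under scores/) are assumptions based on the code above rather than a
# verified invocation from the original repository.
if __name__ == "__main__":
    train = load_json("arena_data/orig/train.json")
    questions = load_json("arena_data/questions/val.json")
    song_meta = load_json("res/song_meta.json")

    # freq_song: songs frequent enough to get the boosted weight in step 3-1
    freq_counter, _ = most_popular(train, "songs", 5000)
    freq_song = {sid for sid, cnt in freq_counter.items() if cnt >= 100}

    rec_list = Recommender(train, questions,
                           n_msp=50, n_mtp=90,  # hypothetical values
                           mode="val", sim_measure="cos",
                           song_meta=song_meta, freq_song=freq_song,
                           save=True)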