Beispiel #1
0
def train_and_predict(train_filepath, test_filepath):
    """Fit a joint song/tag ALS model and recommend for every test playlist.

    Train and test playlists are encoded against a shared vocabulary, stacked
    (test rows first) into one sparse matrix, and factorised with a single ALS
    model. The learned item factors are then split into a song part and a tag
    part so that songs and tags can be recommended separately.

    Args:
        train_filepath: Path of the JSON file with the training playlists.
        test_filepath: Path of the JSON file with the test playlists.

    Returns:
        A list with one dict per test playlist, holding the keys ``"id"``,
        ``"songs"`` (100 entries) and ``"tags"`` (10 entries).
    """
    train_df = pd.read_json(train_filepath)
    test_df = pd.read_json(test_filepath)

    vocab = Vocabulary(pd.concat([train_df, test_df], ignore_index=True))

    train_data = encode_features(train_df, vocab)
    test_data = encode_features(test_df, vocab)

    # Shuffle train data
    train_data = shuffle(train_data)

    def lil_to_csr(indices, shape):
        """Turn a list of per-row column-index lists into a binary CSR matrix."""
        data = []
        row_ind = []
        col_ind = []
        for row_idx, row in enumerate(indices):
            for col_idx in row:
                data.append(1)
                row_ind.append(row_idx)
                col_ind.append(col_idx)
        return csr_matrix((data, (row_ind, col_ind)), shape=shape)

    train_csr = lil_to_csr(train_data, (len(train_data), vocab.size))
    test_csr = lil_to_csr(test_data, (len(test_data), vocab.size))

    # Test rows go first so that row i of test_csr is also row i of the
    # stacked matrix the model is fitted on.
    r = scipy.sparse.vstack([test_csr, train_csr])
    r = csr_matrix(r)

    factors = 512
    alpha = 500.0
    als_model = ALS(factors=factors, regularization=0.1)
    als_model.fit(r.T * alpha)

    # Two views onto the fitted model: same user factors, item factors cut at
    # the song/tag boundary of the vocabulary.
    song_model = ALS(factors=factors)
    tag_model = ALS(factors=factors)
    song_model.user_factors = als_model.user_factors
    tag_model.user_factors = als_model.user_factors
    song_model.item_factors = als_model.item_factors[:vocab.num_songs]
    tag_model.item_factors = als_model.item_factors[vocab.num_songs:]

    song_rec_csr = test_csr[:, :vocab.num_songs]
    tag_rec_csr = test_csr[:, vocab.num_songs:]

    song_rec = song_model.recommend_all(song_rec_csr, N=100)
    tag_rec = tag_model.recommend_all(tag_rec_csr, N=10)
    # tag_model numbers its items from 0; shift back into vocabulary ids.
    tag_rec += vocab.num_songs

    return [{
        "id": test_playlist_id,
        "songs": list(map(vocab.id_to_song, song_rec[test_row_idx])),
        "tags": list(map(vocab.id_to_tag, tag_rec[test_row_idx])),
    } for test_row_idx, test_playlist_id in enumerate(tqdm(test_df.id))]
 def __init__(self, params=None, nunique_feature=None):
     """Create the joint ALS model and the song/tag models.

     Args:
         params: Mapping of ALS keyword arguments plus a mandatory ``"c"``
             entry, which is stored on the instance and not forwarded to
             ALS. Defaults to ``{"c": None}``. The mapping is not modified.
         nunique_feature: Stored unchanged on the instance.

     Raises:
         KeyError: If ``params`` has no ``"c"`` entry.
     """
     # A fresh dict per call: a dict literal in the signature would be shared
     # across calls and would lose its "c" entry after the first one.
     if params is None:
         params = {"c": None}
     self.params = params.copy()
     # Strip "c" from a private copy so the caller's dict stays intact.
     als_kwargs = params.copy()
     self.c = als_kwargs.pop("c")
     self.model = ALS(**als_kwargs)
     self.song_model = ALS(**als_kwargs)
     self.tag_model = ALS(**als_kwargs)
     self.song_rec_csr = None
     self.tag_rec_csr = None
     self.nunique_feature = nunique_feature
Beispiel #3
0
    def __init__(self, df, config, orig_df):
        """Build the item-user / user-item matrices and an unfitted ALS model.

        Args:
            df: Frame with ``user_id``, ``item_id`` and (after
                ``_calc_confidence_preference``) ``rate`` columns. The
                distinct user ids and the distinct item ids must each run
                from 0 to count - 1.
            config: Object providing ``alpha``, ``factors``, ``iterations``
                and ``regularization``.
            orig_df: Stored unchanged as ``self.orig_df``.

        Raises:
            AssertionError: If the user or item ids are not a dense
                zero-based range.
        """
        df = self._calc_confidence_preference(df, config.alpha)
        self.config = config
        self.orig_df = orig_df

        def is_dense_zero_based(ids):
            # True when the smallest id is 0 and the largest is count - 1.
            starts_at_zero = ids.min() == 0
            return starts_at_zero and ids.max() == len(ids) - 1

        def describe(ids):
            summary = (ids.min(), ids.max(), len(ids))
            return 'index with min %d max %d count %d items' % summary

        for column_name in ("user_id", "item_id"):
            distinct_ids = getattr(df, column_name).drop_duplicates()
            assert is_dense_zero_based(distinct_ids), describe(distinct_ids)

        user_ids = df.user_id.to_list()
        item_ids = df.item_id.to_list()
        ratings = df.rate.to_list()

        # Rows are items, columns are users.
        n_items = len(set(item_ids))
        n_users = len(set(user_ids))
        self.iu_mat = csr_matrix((ratings, (item_ids, user_ids)),
                                 shape=(n_items, n_users))
        self.ui_mat = self.iu_mat.transpose()

        self.model = ALS(factors=config.factors,
                         calculate_training_loss=True,
                         iterations=config.iterations,
                         regularization=config.regularization)
        self.max_uix = max(user_ids)
 def __init__(self, params):
     """Create the ALS model and reset all bookkeeping attributes.

     Args:
         params: Mapping of ALS keyword arguments plus a mandatory ``"c"``
             entry; ``"c"`` is kept on the instance and not passed to ALS.
             The mapping itself is left untouched.

     Raises:
         KeyError: If ``params`` has no ``"c"`` entry.
     """
     als_kwargs = params.copy()
     self.c = als_kwargs.pop("c")
     self.model = ALS(**als_kwargs)

     # Feature mappings start empty.
     self.plylst_feature_mapping = {}
     self.song_feature_mapping = {}

     # Counts are unknown until set elsewhere.
     self.n_plylst = None
     self.n_song = None
     self.n_tag = None
     self.n_train = None
     self.n_test = None
    def _split_collective_model(self, csr_matrix):
        """Cut the joint model and a matrix into one piece per entity type.

        For every size in ``self.num_entities`` a new ALS model is created
        that shares ``self.model.user_factors`` and receives the matching
        consecutive slice of ``self.model.item_factors``. The same column
        range is sliced out of ``csr_matrix``.

        Args:
            csr_matrix: Matrix whose columns are sliced in the same
                consecutive ranges as the item factors.

        Returns:
            A ``(models, matrices)`` tuple of two lists, each with one entry
            per element of ``self.num_entities``.
        """
        models = []
        matrices = []
        start = 0
        for width in self.num_entities:
            stop = start + width

            part_model = ALS(factors=self.num_factors)
            part_model.user_factors = self.model.user_factors
            part_model.item_factors = self.model.item_factors[start:stop]
            models.append(part_model)

            matrices.append(csr_matrix[:, start:stop])

            start = stop
        return models, matrices
Beispiel #6
0
    def mf_(self,
            train_songs_A,
            train_tags_A,
            test_songs_A,
            test_tags_A,
            song_ntop=500,
            tag_ntop=50,
            iteration=20):
        """Recommend songs and tags with two independent ALS models.

        One model is fitted on the stacked (test, then train) song matrix and
        one on the stacked tag matrix. Each row of ``test_songs_A`` then gets
        song and tag candidates with their scores.

        Args:
            train_songs_A: Sparse song matrix of the training playlists.
            train_tags_A: Sparse tag matrix of the training playlists.
            test_songs_A: Sparse song matrix of the test playlists.
            test_tags_A: Sparse tag matrix of the test playlists.
            song_ntop: Number of songs requested per playlist (50 more when
                the playlist is flagged ``song_dirty``).
            tag_ntop: Number of tags requested per playlist (5 more when the
                playlist is flagged ``tag_dirty``).
            iteration: Number of ALS iterations for both models.

        Returns:
            A list of dicts with the keys ``"id"``, ``"songs"``, ``"tags"``,
            ``"songs_score"`` and ``"tags_score"``, one per test row.
        """
        print(f'MF... iters:{iteration}')
        # Best hyperparameters as of 07/11: * 100, song - 256, tag - 32,
        # reg = 0.1, epoch 20 > song 56.4%, tag 61.3%

        stacked_songs = spr.vstack([test_songs_A, train_songs_A])
        stacked_tags = spr.vstack([test_tags_A, train_tags_A])

        als_model = ALS(factors=256,
                        regularization=0.08,
                        use_gpu=True,
                        iterations=iteration)  # epoch
        als_model.fit(stacked_songs.T * 100)

        als_model_tag = ALS(factors=32,
                            regularization=0.08,
                            use_gpu=True,
                            iterations=iteration)
        als_model_tag.fit(stacked_tags.T * 100)

        res = []
        for pid in tqdm(range(test_songs_A.shape[0])):
            row_label = self.n_train + pid

            # Dirty song lists get extra candidates and keep already-present
            # songs; clean ones are filtered.
            if self.plylst_test.loc[row_label, "song_dirty"] == 1:
                song_n, song_filter = song_ntop + 50, False
            else:
                song_n, song_filter = song_ntop, True
            song_hits = als_model.recommend(
                pid,
                test_songs_A,
                N=song_n,
                filter_already_liked_items=song_filter)

            # Dirty tag lists only get extra candidates; filtering stays on.
            if self.plylst_test.loc[row_label, "tag_dirty"] == 1:
                tag_n = tag_ntop + 5
            else:
                tag_n = tag_ntop
            tag_hits = als_model_tag.recommend(
                pid,
                test_tags_A,
                N=tag_n,
                filter_already_liked_items=True)

            # Each hit is indexed as (item index, score).
            song_ids = [self.song_sid_id.get(hit[0]) for hit in song_hits]
            song_scores = [hit[1] for hit in song_hits]
            tag_ids = [self.tag_tid_id.get(hit[0]) for hit in tag_hits]
            tag_scores = [hit[1] for hit in tag_hits]

            res.append({
                "id": self.plylst_nid_id[self.n_train + pid],
                "songs": song_ids,
                "tags": tag_ids,
                "songs_score": song_scores,
                "tags_score": tag_scores
            })

        print("DONE")

        return res
Beispiel #7
0
    def mixed_(self,
               train_songs_A,
               train_tags_A,
               test_songs_A,
               test_tags_A,
               song_ntop=500,
               tag_ntop=50,
               iteration=20):
        """Recommend songs with ALS and tags with tag co-occurrence scores.

        Songs come from an ALS model fitted on the stacked (test, then train)
        song matrix. Tag candidates are scored by multiplying the test tag
        matrix with the transposed train tag matrix and then with the train
        tag matrix again.

        Args:
            train_songs_A: Sparse song matrix of the training playlists.
            train_tags_A: Sparse tag matrix of the training playlists.
            test_songs_A: Sparse song matrix of the test playlists.
            test_tags_A: Sparse tag matrix of the test playlists.
            song_ntop: Number of songs requested per playlist.
            tag_ntop: Maximum number of tags kept per playlist.
            iteration: Number of ALS iterations.

        Returns:
            A list of dicts with the keys ``"id"``, ``"songs"``, ``"tags"``,
            ``"songs_score"`` and ``"tags_score"``, one per test row.
        """
        print("MF for song / CF for tag...")

        res = []

        # song
        songs_A = spr.vstack([test_songs_A, train_songs_A])
        als_model = ALS(factors=256,
                        regularization=0.08,
                        use_gpu=True,
                        iterations=iteration)
        als_model.fit(songs_A.T * 100)

        # tag
        # shape per the original note: n_tags * n_train playlists
        train_tags_A_T = train_tags_A.T.tocsr()
        tag_val = test_tags_A.dot(train_tags_A_T)

        cand_tag_matrix = tag_val.dot(train_tags_A)

        del tag_val

        for pid in tqdm(range(test_songs_A.shape[0])):
            row_label = self.n_train + pid

            # song: dirty playlists keep their already-present songs.
            song_dirty = self.plylst_test.loc[row_label, "song_dirty"] == 1
            cand_song = als_model.recommend(
                pid,
                test_songs_A,
                N=song_ntop,
                filter_already_liked_items=not song_dirty)

            rec_song_idx = [self.song_sid_id.get(x[0]) for x in cand_song]
            rec_song_score = [x[1] for x in cand_song]

            # tag: take 5 spare candidates so removing seen tags still
            # leaves up to tag_ntop entries.
            tag_row = cand_tag_matrix.getrow(pid).toarray().reshape(-1, )
            cand_tag_idx = tag_row.argsort()[-tag_ntop - 5:][::-1]

            # Seen tags were removed identically for dirty and clean
            # playlists, so the "tag_dirty" branch is not needed.
            tags_already = self.plylst_test.loc[row_label, "tags_id"]
            rec_tag_idx = remove_seen(tags_already, cand_tag_idx)[:tag_ntop]

            # NOTE(review): scores are taken for every candidate, not only
            # for the tags kept in rec_tag_idx, so "tags" and "tags_score"
            # can differ in length - confirm what consumers expect.
            rec_tag_score = [tag_row.data[i] for i in cand_tag_idx]

            res.append({
                "id": self.plylst_nid_id[self.n_train + pid],
                "songs": rec_song_idx,
                "tags": [self.tag_tid_id[i] for i in rec_tag_idx],
                "songs_score": rec_song_score,
                "tags_score": rec_tag_score
            })

        # Return only after every test playlist has been processed; returning
        # inside the loop produced a single-element result.
        return res
Beispiel #8
0
    def multi_mf_(self, train_songs_A, train_tags_A, test_songs_A, test_tags_A,
                  song_ntop, tag_ntop, iteration):
        """Recommend songs and tags from one ALS model over songs plus tags.

        The song and tag matrices (test rows first, then train rows) are
        placed side by side and factorised together. The item factors are
        then split at ``self.n_songs`` into a song model and a tag model that
        share the user factors.

        Args:
            train_songs_A: Sparse song matrix of the training playlists.
            train_tags_A: Sparse tag matrix of the training playlists.
            test_songs_A: Sparse song matrix of the test playlists.
            test_tags_A: Sparse tag matrix of the test playlists.
            song_ntop: Accepted but not used; 500 songs are always requested.
            tag_ntop: Accepted but not used; 50 tags are always requested.
            iteration: Number of ALS iterations.

        Returns:
            A list of dicts with the keys ``"id"``, ``"songs"`` and
            ``"tags"``, one per recommended row.
        """
        print(f'Multi_MF... iters:{iteration}')

        songs_A = spr.vstack([test_songs_A, train_songs_A])
        tags_A = spr.vstack([test_tags_A, train_tags_A])
        joint = spr.hstack([songs_A, tags_A])

        als_model = ALS(factors=256,
                        regularization=0.08,
                        use_gpu=True,
                        iterations=iteration)  # epoch
        als_model.fit(joint.T * 15)

        # Song and tag models share the user factors and split the items.
        song_model = ALS(use_gpu=False)
        tag_model = ALS(use_gpu=False)
        song_model.user_factors = als_model.user_factors
        tag_model.user_factors = als_model.user_factors
        song_model.item_factors = als_model.item_factors[:self.n_songs]
        tag_model.item_factors = als_model.item_factors[self.n_songs:]

        # The test playlists are the first n_test rows of the stacked data.
        song_rec_csr = songs_A[:self.n_test, :]
        tag_rec_csr = tags_A[:self.n_test, :]

        cand_song = song_model.recommend_all(song_rec_csr, N=500)
        cand_tag = tag_model.recommend_all(tag_rec_csr, N=50)

        res = []
        for offset, (song_row, tag_row) in enumerate(zip(cand_song, cand_tag)):
            res.append({
                "id": self.plylst_nid_id[self.n_train + offset],
                "songs": [self.song_sid_id.get(s) for s in song_row.tolist()],
                "tags": [self.tag_tid_id.get(t) for t in tag_row.tolist()]
            })

        print("DONE")

        return res
Beispiel #9
0
    def mf_(self,
            train_songs_A,
            train_tags_A,
            test_songs_A,
            test_tags_A,
            song_ntop=500,
            tag_ntop=50,
            iteration=20):
        """Recommend songs and tags and split the results into val and test.

        Two ALS models are fitted, one on the stacked (test, then train) song
        matrix and one on the stacked tag matrix. Rows below
        ``self.n_val_song`` go to the validation lists, the rest to the test
        lists. A tag recommendation that raises ``IndexError`` is skipped.

        Args:
            train_songs_A: Sparse song matrix of the training playlists.
            train_tags_A: Sparse tag matrix of the training playlists.
            test_songs_A: Sparse song matrix used for recommending.
            test_tags_A: Sparse tag matrix used for recommending.
            song_ntop: Number of songs requested per playlist.
            tag_ntop: Number of tags requested per playlist.
            iteration: Number of ALS iterations for both models.

        Returns:
            A ``(val_song_res, val_tag_res, test_song_res, test_tag_res)``
            tuple of lists of dicts. Song dicts hold ``"id"``, ``"songs"``
            and ``"songs_score"``; tag dicts hold ``"id"``, ``"tags"`` and
            ``"tags_score"``.
        """
        print(f'MF... iters:{iteration}')
        # Best hyperparameters as of 07/11: * 100, song - 256, tag - 32,
        # reg = 0.1, epoch 20 > song 56.4%, tag 61.3%

        val_song_res, val_tag_res = [], []
        test_song_res, test_tag_res = [], []

        stacked_songs = spr.vstack([test_songs_A, train_songs_A])
        stacked_tags = spr.vstack([test_tags_A, train_tags_A])

        als_model = ALS(factors=256,
                        regularization=0.08,
                        use_gpu=True,
                        iterations=iteration)  # epoch
        als_model.fit(stacked_songs.T * 100)

        als_model_tag = ALS(factors=32,
                            regularization=0.08,
                            use_gpu=True,
                            iterations=iteration)
        als_model_tag.fit(stacked_tags.T * 100)

        # Original note: 18636 / tags -> 11605 rows
        for row in tqdm(range(self.n_test_song)):

            # song
            song_hits = als_model.recommend(row,
                                            test_songs_A,
                                            N=song_ntop,
                                            filter_already_liked_items=True)
            song_ids = [self.song_sid_id.get(hit[0]) for hit in song_hits]
            song_scores = [hit[1] for hit in song_hits]

            # order: train, val, test
            song_sink = (val_song_res
                         if row < self.n_val_song else test_song_res)
            song_sink.append({
                "id": self.plylst_nid_id[self.plylst_test_song.index[row]],
                "songs": song_ids,
                "songs_score": song_scores
            })

            # tag
            try:
                tag_hits = als_model_tag.recommend(
                    row,
                    test_tags_A,
                    N=tag_ntop,
                    filter_already_liked_items=True)
                tag_ids = [self.tag_tid_id.get(hit[0]) for hit in tag_hits]
                tag_scores = [hit[1] for hit in tag_hits]

                tag_sink = (val_tag_res
                            if row < self.n_val_song else test_tag_res)
                tag_sink.append({
                    "id":
                    self.plylst_nid_id[self.plylst_test_tag.index[row]],
                    "tags": tag_ids,
                    "tags_score": tag_scores
                })
            except IndexError:
                # Rows without a tag counterpart are skipped silently.
                pass

        print("DONE")

        return val_song_res, val_tag_res, test_song_res, test_tag_res
 def __init__(self, num_entities, num_factors, **kwargs):
     """Store the entity layout and create the joint ALS model.

     Args:
         num_entities: Stored as ``self.num_entities``.
         num_factors: Passed to ALS as ``factors`` and stored on the
             instance.
         **kwargs: Forwarded unchanged to the ALS constructor.
     """
     self.num_entities, self.num_factors = num_entities, num_factors
     joint_model = ALS(factors=num_factors, **kwargs)
     self.model = joint_model
Beispiel #11
0
    def multi_mf_(self,
                  train_songs_A,
                  train_tags_A,
                  val_songs_A,
                  val_tags_A,
                  test_songs_A,
                  test_tags_A,
                  meta=True,
                  song_ntop=500,
                  tag_ntop=50,
                  iteration=20,
                  score=False):
        """Recommend songs and tags from one ALS model over songs, tags, meta.

        Rows are stacked as validation, test, train. Song and tag columns
        (and, when ``meta`` is set, the columns from ``self.mkspr_for_meta()``)
        are placed side by side and factorised together. The item factors are
        then split into a song model and a tag model sharing the user factors.

        Args:
            train_songs_A: Sparse song matrix of the training playlists.
            train_tags_A: Sparse tag matrix of the training playlists.
            val_songs_A: Sparse song matrix of the validation playlists.
            val_tags_A: Sparse tag matrix of the validation playlists.
            test_songs_A: Sparse song matrix of the test playlists.
            test_tags_A: Sparse tag matrix of the test playlists.
            meta: Whether to append the meta columns before fitting.
            song_ntop: Number of songs requested per playlist.
            tag_ntop: Number of tags requested per playlist.
            iteration: Number of ALS iterations.
            score: When exactly ``True`` no recommendations are produced and
                both result lists stay empty (scored output is not
                implemented here).

        Returns:
            A ``(val_res, test_res)`` tuple of lists of dicts with the keys
            ``"id"``, ``"songs"`` and ``"tags"``.
        """
        print(f'Multi_MF... iters:{iteration}')

        val_res = []
        test_res = []

        songs_A = spr.vstack([val_songs_A, test_songs_A, train_songs_A])
        tags_A = spr.vstack([val_tags_A, test_tags_A, train_tags_A])

        print(val_songs_A.shape, test_songs_A.shape, train_songs_A.shape)

        if meta:
            s_meta = self.mkspr_for_meta()
            print(songs_A.shape, tags_A.shape, s_meta.shape)
            A = spr.hstack([songs_A, tags_A, s_meta])
        else:
            A = spr.hstack([songs_A, tags_A])

        als_model = ALS(factors=256,
                        regularization=0.08,
                        use_gpu=True,
                        iterations=iteration)
        als_model.fit(A.T * 100)

        song_model = ALS(use_gpu=True)
        tag_model = ALS(use_gpu=True)

        song_model.user_factors = als_model.user_factors
        tag_model.user_factors = als_model.user_factors

        # Item rows are laid out as songs, tags, then (optionally) meta.
        # Bound the tag slice so meta rows can never be recommended as tags;
        # without meta this is the same as slicing to the end.
        n_tags = tags_A.shape[1]
        song_model.item_factors = als_model.item_factors[:self.n_songs]
        tag_model.item_factors = als_model.item_factors[
            self.n_songs:self.n_songs + n_tags]

        # for val
        val_song_rec_csr = songs_A[:self.n_val, :]
        val_tag_rec_csr = tags_A[:self.n_val, :]

        # for test
        test_song_rec_csr = songs_A[self.n_val:self.n_val + self.n_test, :]
        test_tag_rec_csr = tags_A[self.n_val:self.n_val + self.n_test, :]

        def to_records(cand_song, cand_tag, first_nid):
            # Pair the song and tag rows and map indices back to ids.
            return [{
                "id":
                self.plylst_nid_id[first_nid + offset],
                "songs": [self.song_sid_id.get(x) for x in song_row.tolist()],
                "tags": [self.tag_tid_id.get(x) for x in tag_row.tolist()]
            } for offset, (song_row,
                           tag_row) in enumerate(zip(cand_song, cand_tag))]

        if score is not True:
            # val
            cand_song = song_model.recommend_all(val_song_rec_csr, N=song_ntop)
            cand_tag = tag_model.recommend_all(val_tag_rec_csr, N=tag_ntop)
            val_res = to_records(cand_song, cand_tag, self.n_train)

            # test
            # NOTE(review): the test rows sit at offset n_val in the fitted
            # matrix, but recommend_all receives a slice starting at row 0.
            # Presumably the user factors of the first n_test rows are used -
            # verify against the implicit version in use.
            cand_song = song_model.recommend_all(test_song_rec_csr,
                                                 N=song_ntop)
            cand_tag = tag_model.recommend_all(test_tag_rec_csr, N=tag_ntop)
            test_res = to_records(cand_song, cand_tag,
                                  self.n_train + self.n_val)

        return val_res, test_res
Beispiel #12
0
    def mf_(self,
            train_songs_A,
            train_tags_A,
            val_songs_A,
            val_tags_A,
            test_songs_A,
            test_tags_A,
            song_ntop=500,
            tag_ntop=50,
            iteration=20,
            score=False):
        """Recommend songs and tags for validation and test playlists.

        Two ALS models are fitted, one on the stacked (val, test, train) song
        matrix and one on the stacked tag matrix. With ``score=True`` every
        row is recommended individually and the scores are returned as well;
        otherwise ``recommend_all`` is used and only ids are returned.

        Args:
            train_songs_A: Sparse song matrix of the training playlists.
            train_tags_A: Sparse tag matrix of the training playlists.
            val_songs_A: Sparse song matrix of the validation playlists.
            val_tags_A: Sparse tag matrix of the validation playlists.
            test_songs_A: Sparse song matrix of the test playlists.
            test_tags_A: Sparse tag matrix of the test playlists.
            song_ntop: Number of songs requested per playlist.
            tag_ntop: Number of tags requested per playlist.
            iteration: Number of ALS iterations for both models.
            score: Only the exact value ``True`` selects the per-row path
                that also returns ``"songs_score"`` / ``"tags_score"``.

        Returns:
            A ``(val_song_res, val_tag_res, test_song_res, test_tag_res)``
            tuple of lists of dicts. Song dicts hold ``"id"`` and ``"songs"``,
            tag dicts hold ``"id"`` and ``"tags"``; the matching
            ``"..._score"`` key is present when ``score`` is ``True``.
        """
        print(f'MF... iters:{iteration}')

        val_song_res = []
        val_tag_res = []
        test_song_res = []
        test_tag_res = []

        songs_A = spr.vstack([val_songs_A, test_songs_A, train_songs_A])
        tags_A = spr.vstack([val_tags_A, test_tags_A, train_tags_A])

        als_model = ALS(factors=256,
                        regularization=0.08,
                        use_gpu=True,
                        iterations=iteration)  # epoch
        als_model.fit(songs_A.T * 100)
        als_model_tag = ALS(factors=32,
                            regularization=0.08,
                            use_gpu=True,
                            iterations=iteration)
        als_model_tag.fit(tags_A.T * 100)

        # NOTE(review): both models are fitted on rows stacked as
        # val, test, train, while the calls below pass the standalone val /
        # test matrices. Presumably row i of a test matrix is scored with the
        # user factors of stacked row i - verify against the implicit version
        # in use.

        def res_recommend(row, model, matrix, top_n, nid_id, id_index, res,
                          key):
            # Recommend for one row of `matrix` and append the record to
            # `res`, stored under `key` and `key + "_score"`.
            try:
                cand = model.recommend(row,
                                       matrix,
                                       N=top_n,
                                       filter_already_liked_items=True)
                plylst_id = self.plylst_nid_id[id_index[row]]
            except IndexError:
                # Best effort, as before: rows without a matching entry are
                # skipped instead of aborting the whole run.
                return

            res.append({
                "id": plylst_id,
                key: [nid_id.get(x[0]) for x in cand],
                key + "_score": [x[1] for x in cand]
            })

        if score is True:
            # Every job uses its own matrix, id index, result list and key.
            # Previously the helper ignored them, so all records ended up in
            # val_song_res under the "songs" keys.
            jobs = (
                (als_model, val_songs_A, song_ntop, self.song_sid_id,
                 self.plylst_val_song.index, val_song_res, "songs"),
                (als_model_tag, val_tags_A, tag_ntop, self.tag_tid_id,
                 self.plylst_val_tag.index, val_tag_res, "tags"),
                (als_model, test_songs_A, song_ntop, self.song_sid_id,
                 self.plylst_test_song.index, test_song_res, "songs"),
                (als_model_tag, test_tags_A, tag_ntop, self.tag_tid_id,
                 self.plylst_test_tag.index, test_tag_res, "tags"),
            )
            for model, matrix, top_n, nid_id, id_index, res, key in jobs:
                # Cover every row of this matrix, as recommend_all does in
                # the branch below.
                for row in tqdm(range(matrix.shape[0])):
                    res_recommend(row, model, matrix, top_n, nid_id, id_index,
                                  res, key)

        else:  # Score > False

            val_cand_song = als_model.recommend_all(
                val_songs_A, N=song_ntop, filter_already_liked_items=True)
            val_cand_tag = als_model_tag.recommend_all(
                val_tags_A, N=tag_ntop, filter_already_liked_items=True)
            test_cand_song = als_model.recommend_all(
                test_songs_A, N=song_ntop, filter_already_liked_items=True)
            test_cand_tag = als_model_tag.recommend_all(
                test_tags_A, N=tag_ntop, filter_already_liked_items=True)

            val_song_res = [{
                "id":
                self.plylst_nid_id[self.plylst_val_song.index[row]],
                "songs": [self.song_sid_id.get(x) for x in rec_idx.tolist()]
            } for row, rec_idx in enumerate(val_cand_song)]
            val_tag_res = [{
                "id":
                self.plylst_nid_id[self.plylst_val_tag.index[row]],
                "tags": [self.tag_tid_id.get(x) for x in rec_idx.tolist()]
            } for row, rec_idx in enumerate(val_cand_tag)]
            test_song_res = [{
                "id":
                self.plylst_nid_id[self.plylst_test_song.index[row]],
                "songs": [self.song_sid_id.get(x) for x in rec_idx.tolist()]
            } for row, rec_idx in enumerate(test_cand_song)]
            test_tag_res = [{
                "id":
                self.plylst_nid_id[self.plylst_test_tag.index[row]],
                "tags": [self.tag_tid_id.get(x) for x in rec_idx.tolist()]
            } for row, rec_idx in enumerate(test_cand_tag)]

        print("DONE")

        return val_song_res, val_tag_res, test_song_res, test_tag_res