Example #1
0
    def fit(self, urm, target_playlist, target_tracks, dataset, shrinkage=10, k_filtering=50):
        """
        Build user-based predictions on top of an augmented rating matrix.

        When ``self.r_hat_aug`` is still None, a ContentBasedFiltering model
        is fitted on the same inputs and its R_hat is cached in
        ``self.r_hat_aug``. The target-playlist rows of that matrix are then
        passed to compute_cosine together with its transpose, the resulting
        similarity rows are normalised by their sum, and the similarity is
        multiplied by the URM restricted to the target tracks.

        urm: user rating matrix (converted to CSR); rows are selected with
            playlist indices and columns with track indices
        target_playlist: iterable of playlist ids
        target_tracks: iterable of track ids
        dataset: object providing get_playlist_index_from_id and
            get_track_index_from_id
        shrinkage: shrinkage value forwarded to compute_cosine
        k_filtering: k_filtering value forwarded to compute_cosine

        Nothing is returned; the prediction matrix is stored in self.R_hat
        with the entries already present in the URM removed.
        """
        self.pl_id_list = list(target_playlist)
        self.tr_id_list = list(target_tracks)
        self.dataset = dataset
        print("CBF started")

        urm = urm.tocsr()

        if self.r_hat_aug is None:
            # The original call referenced tg_playlist / tg_tracks / ds, none
            # of which exist in this scope; forward this method's own inputs.
            cbf = ContentBasedFiltering()
            cbf.fit(urm, self.pl_id_list, self.tr_id_list, dataset)
            self.r_hat_aug = cbf.getR_hat()

        # Row / column positions of the targets, computed once and reused.
        playlist_rows = [dataset.get_playlist_index_from_id(x)
                         for x in self.pl_id_list]
        track_cols = [dataset.get_track_index_from_id(x)
                      for x in self.tr_id_list]

        # Cosine similarity between the target playlists and the augmented
        # rating matrix.
        S = compute_cosine(self.r_hat_aug[playlist_rows],
                           self.r_hat_aug.transpose(),
                           k_filtering=k_filtering,
                           shrinkage=shrinkage)

        # Normalise each similarity row by its sum; rows summing to zero are
        # divided by one instead so the reciprocal stays finite.
        s_norm = S.sum(axis=1)
        s_norm[s_norm == 0] = 1
        S = S.multiply(csr_matrix(np.reciprocal(s_norm)))
        print("Similarity matrix ready!")

        # Ratings: similarity times the URM restricted to the target tracks.
        urm_cleaned = urm[:, track_cols]
        R_hat = S.dot(urm_cleaned)
        print("R_hat done")

        # Drop predictions for tracks the target playlists already contain.
        urm_cleaned = urm_cleaned[playlist_rows]
        R_hat[urm_cleaned.nonzero()] = 0
        R_hat.eliminate_zeros()

        print("Shape of final matrix: ", R_hat.shape)
        self.R_hat = R_hat
Example #2
0
    def fit(self, urm, target_playlist, target_tracks, dataset):
        """
        Fit a user-based model from the rating matrix alone.

        urm: user rating matrix; rows are selected with playlist indices and
            columns with track indices
        target_playlist: iterable of playlist ids
        target_tracks: iterable of track ids
        dataset: object providing build_ucm, get_playlist_index_from_id and
            get_track_index_from_id

        The k_filtering and shrinkage values handed to compute_cosine are
        read from self.k_filtering and self.shrinkage. The prediction matrix
        is stored in self.R_hat; nothing is returned.
        """
        self.pl_id_list = list(target_playlist)
        self.tr_id_list = list(target_tracks)
        self.dataset = dataset
        print("UBF started")

        # NOTE(review): the matrix returned by build_ucm() is overwritten by
        # the very next statement, so only the transposed URM is used as the
        # user profile. Confirm whether stacking both was intended.
        ucm = dataset.build_ucm()
        ucm = vstack([urm.transpose()], format='csr')

        target_rows = [
            dataset.get_playlist_index_from_id(pl_id)
            for pl_id in self.pl_id_list
        ]
        target_profiles = ucm.transpose()[target_rows]

        # Similarity of every target playlist against the profile matrix.
        similarity = compute_cosine(target_profiles,
                                    ucm,
                                    k_filtering=self.k_filtering,
                                    shrinkage=self.shrinkage)

        # Divide each row by its sum (zero sums are replaced by one first).
        row_totals = similarity.sum(axis=1)
        row_totals[row_totals == 0] = 1
        inverse_totals = csr_matrix(np.reciprocal(row_totals))
        similarity = similarity.multiply(inverse_totals)
        print("Similarity matrix ready!")

        target_cols = [
            dataset.get_track_index_from_id(tr_id)
            for tr_id in self.tr_id_list
        ]
        urm_target_tracks = urm[:, target_cols]
        predictions = similarity.dot(urm_target_tracks)
        print("R_hat done")

        # Remove what the target playlists already contain.
        already_rated = urm_target_tracks[target_rows]
        predictions[already_rated.nonzero()] = 0
        predictions.eliminate_zeros()

        print("Shape of final matrix: ", predictions.shape)
        self.R_hat = predictions
Example #3
0
    def fit(self, urm, tg_playlist, tg_tracks, dataset):
        """
        Compute three item similarities and append them to self.similarities.

        urm: user rating matrix; its transpose is indexed with track indices
        tg_playlist: iterable of target playlist ids
        tg_tracks: iterable of target track ids
        dataset: object providing build_icm and get_track_index_from_id

        The similarities are appended in this order: content based (ICM),
        collaborative (URM), SLIM. Nothing is returned.
        """
        self.pl_id_list = list(tg_playlist)
        self.tr_id_list = list(tg_tracks)
        self.dataset = dataset
        self.urm = urm

        # SLIM weights, restricted to the target-track columns and transposed.
        slim_model = SLIM()
        slim_model.fit(urm, self.pl_id_list, self.tr_id_list, dataset)
        slim_weights = slim_model.getW()
        target_tracks_idx = [
            dataset.get_track_index_from_id(tr_id)
            for tr_id in self.tr_id_list
        ]
        slim_similarity = slim_weights[:, target_tracks_idx].transpose()

        # Content-based similarity from the item content matrix.
        icm = dataset.build_icm()
        content_similarity = compute_cosine(
            icm.transpose()[target_tracks_idx],
            icm,
            k_filtering=200,
            shrinkage=10)

        # Collaborative similarity from the rating matrix.
        collaborative_similarity = compute_cosine(
            urm.transpose()[target_tracks_idx],
            urm,
            k_filtering=200,
            shrinkage=10)

        # An IALS-based similarity was tried here in the past and is disabled.

        for similarity in (content_similarity,
                           collaborative_similarity,
                           slim_similarity):
            self.similarities.append(similarity)
Example #4
0
    def fit(self, urm, target_playlist, target_tracks, dataset):
        """
        Fit an item-based collaborative model.

        urm: user rating matrix (wrapped in csr_matrix); rows are selected
            with playlist indices and columns with track indices
        target_playlist: iterable of playlist ids
        target_tracks: iterable of track ids
        dataset: object providing get_playlist_index_from_id and
            get_track_index_from_id

        k_filtering and shrinkage for compute_cosine are read from
        self.k_filtering and self.shrinkage. The predictions are stored in
        self.R_hat; nothing is returned.
        """
        self.pl_id_list = list(target_playlist)
        self.tr_id_list = list(target_tracks)

        urm = csr_matrix(urm)

        track_cols = [
            dataset.get_track_index_from_id(tr_id)
            for tr_id in self.tr_id_list
        ]

        # Similarity of the target tracks against the rating matrix, then
        # normalised row by row.
        item_similarity = compute_cosine(urm.transpose()[track_cols],
                                         urm,
                                         k_filtering=self.k_filtering,
                                         shrinkage=self.shrinkage)
        item_similarity = normalize_by_row(item_similarity)

        playlist_rows = [
            dataset.get_playlist_index_from_id(pl_id)
            for pl_id in self.pl_id_list
        ]
        urm_targets = urm[playlist_rows]
        self.R_hat = urm_targets.dot(item_similarity.transpose())

        # Mask the tracks each target playlist already contains.
        already_rated = urm_targets[:, track_cols]
        self.R_hat[already_rated.nonzero()] = 0
        self.R_hat.eliminate_zeros()
Example #5
0
    def fit(self,
            urm,
            target_playlist,
            target_tracks,
            dataset,
            shrinkage=50,
            k_filtering=200,
            test_dict=None):
        """
        Fit a content-based model on a user-similarity-augmented URM.

        Steps performed:
        1. build a user-feature matrix (URM x ICM'), keep the top 100 entries
           via top_k_filtering and scale each user by 1 / (n_ratings + 10);
        2. stack UCM, transposed URM and that user-feature matrix, compute a
           user-user cosine similarity and normalise its rows;
        3. propagate the URM through that similarity (augmented URM), force
           the original ratings to 1 and keep the 500 largest stored values
           of each row;
        4. add the augmented URM to the ICM, compute the similarity of the
           target tracks, normalise it and multiply it with the augmented
           URM of the target playlists.

        urm: user rating matrix; rows are selected with playlist indices and
            columns with track indices
        target_playlist: iterable of playlist ids
        target_tracks: iterable of track ids
        dataset: object providing build_icm, build_ucm, add_playlist_to_icm,
            get_playlist_index_from_id and get_track_index_from_id
        shrinkage: shrinkage value forwarded to both compute_cosine calls
        k_filtering: k_filtering value forwarded to both compute_cosine calls
        test_dict: unused; kept only for signature compatibility

        Nothing is returned; the predictions are stored in self.R_hat with
        the entries already present in the URM removed.
        """
        self.pl_id_list = list(target_playlist)
        self.tr_id_list = list(target_tracks)
        self.dataset = dataset
        print("CBF started")

        # get ICM from dataset, assume it already cleaned
        icm = csr_matrix(dataset.build_icm())
        ufm = urm.dot(icm.transpose())
        ufm = top_k_filtering(ufm, 100)

        # Iu holds the row sums of the URM. Zeros are replaced by one, then a
        # shrink term of 10 is added before taking the reciprocal.
        Iu = urm.sum(axis=1)
        Iu[Iu == 0] = 1
        Iu = Iu + 10
        Iu = np.reciprocal(Iu)
        # Scale the UFM by Iu and transpose it for stacking.
        ufm = csr_matrix(ufm.multiply(Iu)).transpose()

        ucm = csr_matrix(dataset.build_ucm())
        # The original stacked the bound method `urm.transpose().multiply`
        # (never called), which vstack cannot handle. The unweighted
        # transposed URM is stacked instead.
        # NOTE(review): if a weight was intended, apply .multiply(weight).
        ucm = vstack([ucm, urm.transpose(), ufm], format='csr')

        u_sim = compute_cosine(ucm.transpose(),
                               ucm,
                               k_filtering=k_filtering,
                               shrinkage=shrinkage)
        # Normalise rows; a zero row sum is replaced by one so the reciprocal
        # stays finite.
        u_sim_norm = u_sim.sum(axis=1)
        u_sim_norm[u_sim_norm == 0] = 1
        u_sim = u_sim.multiply(np.reciprocal(u_sim_norm))

        # compute augmented urm
        aUrm = u_sim.dot(urm)
        print("Augmented URM ready!")
        aUrm[urm.nonzero()] = 1

        # Keep only the 500 largest stored values of each row. A dedicated
        # local is used so the caller's k_filtering is not overwritten (the
        # original reassigned the parameter here and the item similarity
        # below then always ran with 500).
        aurm_top_k = 500
        for row_i in range(aUrm.shape[0]):
            # Slice of the CSR data array: writes go straight into aUrm.
            row = aUrm.data[aUrm.indptr[row_i]:aUrm.indptr[row_i + 1]]
            if row.shape[0] >= aurm_top_k:
                discard = np.argpartition(
                    row, row.shape[0] - aurm_top_k)[:-aurm_top_k]
                row[discard] = 0
        aUrm.eliminate_zeros()

        track_cols = [
            dataset.get_track_index_from_id(x) for x in self.tr_id_list
        ]
        playlist_rows = [
            dataset.get_playlist_index_from_id(x) for x in self.pl_id_list
        ]

        icm = dataset.add_playlist_to_icm(icm, aUrm, 0.4)
        S = compute_cosine(icm.transpose()[track_cols],
                           icm,
                           k_filtering=k_filtering,
                           shrinkage=shrinkage)
        # Normalise rows, guarding against zero sums as above.
        sim_norm = S.sum(axis=1)
        sim_norm[sim_norm == 0] = 1
        S = S.multiply(np.reciprocal(sim_norm))

        R_hat = aUrm[playlist_rows].dot(S.transpose())

        # Remove what the target playlists already contain.
        urm_red = urm[playlist_rows]
        urm_red = urm_red[:, track_cols]
        R_hat[urm_red.nonzero()] = 0
        R_hat.eliminate_zeros()

        print("R_hat done")
        print("Shape of final matrix: ", R_hat.shape)
        self.R_hat = R_hat
Example #6
0
    def fit(self,
            urm,
            target_playlist,
            target_tracks,
            dataset,
            shrinkage=50,
            k_filtering=200,
            features=100):
        """
        Fit an item similarity from an SVD of the extended rating matrix.

        The extended matrix is laid out as

            [ 0.5 * URM | UCM' ]
            [    ICM    |  0   ]

        and decomposed with sparsesvd. The leading icm.shape[1] columns of
        the returned V are kept as item factors, and the cosine similarity of
        the target tracks is computed on them.

        urm: user rating matrix; rows are selected with playlist indices and
            columns with track indices
        target_playlist: iterable of playlist ids
        target_tracks: iterable of track ids
        dataset: object providing build_icm, build_ucm,
            get_playlist_index_from_id and get_track_index_from_id
        shrinkage: shrinkage value forwarded to utils.compute_cosine
        k_filtering: k_filtering value forwarded to utils.compute_cosine
        features: second argument handed to sparsesvd

        Stores the transposed similarity in self.S and the predictions in
        self.R_hat; nothing is returned.
        """
        self.pl_id_list = list(target_playlist)
        self.tr_id_list = list(target_tracks)
        self.dataset = dataset
        print("CBF started")
        print('SVD on eURM')

        icm = dataset.build_icm()
        # transposed so that it can sit next to the URM
        ucm = dataset.build_ucm().transpose()

        # Left column of blocks: weighted URM above the ICM.
        left_part = vstack([urm.multiply(0.5), icm], format='csr')
        # Right column of blocks: UCM above an all-zero filler.
        zeros_matrix = lil_matrix((icm.shape[0], ucm.shape[1])).tocsr()
        right_part = vstack([ucm, zeros_matrix], format='csr')
        eURM = hstack([left_part, right_part], format='csr')

        u, s, v = sparsesvd(eURM.tocsc(), features)
        print("SHAPE of V: ", v.shape)
        print("SHAPE OF U", u.shape)

        # Keep only the item columns of V.
        item_factors = csr_matrix(v[:, :icm.shape[1]])

        track_cols = [
            dataset.get_track_index_from_id(tr_id)
            for tr_id in self.tr_id_list
        ]
        similarity = utils.compute_cosine(item_factors.transpose()[track_cols],
                                          item_factors,
                                          k_filtering=k_filtering,
                                          shrinkage=shrinkage)
        print("Similarity matrix ready, let's normalize it!")

        playlist_rows = [
            dataset.get_playlist_index_from_id(pl_id)
            for pl_id in self.pl_id_list
        ]
        urm_targets = urm[playlist_rows]

        # Divide every similarity row by its sum.
        row_totals = similarity.sum(axis=1)
        similarity = similarity.multiply(csr_matrix(np.reciprocal(row_totals)))
        self.S = similarity.transpose()

        predictions = urm_targets.dot(similarity.transpose().tocsc()).tocsr()
        print("R_hat done")

        # Mask the tracks each target playlist already contains.
        already_rated = urm_targets[:, track_cols]
        predictions[already_rated.nonzero()] = 0
        predictions.eliminate_zeros()

        print("Shape of final matrix: ", predictions.shape)
        self.R_hat = predictions
Example #7
0
    def fit(self, urm, target_playlist, target_tracks, dataset):
        """
        Build an augmented rating matrix from a content-based similarity.

        The ICM is stacked with TF-IDF weighted tags (filtered with
        top_k_filtering, topK=55) and with the URM scaled by 0.8. The
        item-item similarity is computed on the whole stack, and the
        predictions cover the whole URM, not only the targets.

        urm: user rating matrix (converted to CSR)
        target_playlist: iterable of playlist ids (only stored)
        target_tracks: iterable of track ids (only stored)
        dataset: object providing build_icm and build_tags_matrix

        k_filtering and shrinkage for compute_cosine are read from
        self.k_filtering and self.shrinkage. Stores the transposed similarity
        in self.S and the resulting matrix in self.R_hat; nothing is returned.
        """
        self.pl_id_list = list(target_playlist)
        self.tr_id_list = list(target_tracks)
        self.dataset = dataset
        urm = urm.tocsr()
        print("CBF started")

        icm = dataset.build_icm()

        print("Build tags matrix and apply TFIDF...")
        raw_tags = dataset.build_tags_matrix()
        weighted_tags = applyTFIDF(raw_tags)
        # Filter the tags before stacking, to limit the noise such sparse
        # features add.
        weighted_tags = top_k_filtering(weighted_tags.transpose(),
                                        topK=55).transpose()

        icm = vstack([icm, weighted_tags, urm * 0.8], format='csr')

        similarity = compute_cosine(icm.transpose(),
                                    icm,
                                    k_filtering=self.k_filtering,
                                    shrinkage=self.shrinkage,
                                    n_threads=4,
                                    chunksize=1000)

        # Divide every similarity row by its sum.
        row_totals = similarity.sum(axis=1)
        similarity = similarity.multiply(csr_matrix(np.reciprocal(row_totals)))
        print("Similarity matrix ready!")

        self.S = similarity.transpose()

        predictions = urm.dot(similarity.transpose().tocsc()).tocsr()
        print("R_hat done")

        known = urm.nonzero()
        # Drop the known ratings so the top-5 filter only sees new items ...
        predictions[known] = 0
        predictions.eliminate_zeros()
        predictions = top_k_filtering(predictions, topK=5)
        # ... then put the known ratings back with value 1.
        predictions[known] = 1

        print("Shape of final matrix: ", predictions.shape)
        self.R_hat = predictions
Example #8
0
    def fit(self, urm, target_playlist, target_tracks, dataset, k_feature=1000, k_similar=1000):
        """
        Fit a feature-weighted user model (FWUM).

        A per-playlist feature profile (URM x ICM', divided by the row sums of
        the URM) is combined with an owner profile built the same way from the
        owner-item matrix: profile = 1 * user + 0.1 * owner. Predictions are
        the output of compute_cosine between those profiles and the ICM
        columns of the target tracks.

        urm: user rating matrix; rows are selected with playlist indices and
            columns with track indices
        target_playlist: playlist ids
        target_tracks: track ids
        dataset: object providing build_icm, build_tags_matrix, build_ucm,
            build_owner_item_matrix, get_playlist_index_from_id and
            get_track_index_from_id
        k_feature, k_similar: not used by this method

        Side effects: sets self.ucm, self.urm (reduced to the targets),
        self.R_hat_fwum and self.R_hat. Nothing is returned.
        """

        def reciprocal_row_sums(matrix):
            # 1 / row sum, with zero sums replaced by one beforehand.
            totals = matrix.sum(axis=1)
            totals[totals == 0] = 1
            return np.reciprocal(totals)

        self.pl_id_list = list(target_playlist)
        self.tr_id_list = list(target_tracks)
        self.dataset = dataset
        self.urm = urm
        print("FWUM started!")

        # Item content: ICM stacked with the TF-IDF weighted tags.
        icm = dataset.build_icm()
        tag_matrix = dataset.build_tags_matrix()
        tag_matrix = applyTfIdf(tag_matrix)
        icm = vstack([icm, tag_matrix], format='csr')

        # User-feature matrix, normalised per user and transposed.
        user_profile = urm.dot(icm.transpose())
        user_scale = reciprocal_row_sums(urm)
        user_profile = csr_matrix(user_profile.multiply(user_scale).transpose())
        print("UCM ready")

        playlist_rows = [
            dataset.get_playlist_index_from_id(pl_id)
            for pl_id in target_playlist
        ]

        # Owner rating matrix, restricted to the target playlists.
        self.ucm = dataset.build_ucm()
        owner_ratings = dataset.build_owner_item_matrix(self.ucm, urm)[playlist_rows]

        # Owner-feature matrix, normalised the same way.
        owner_profile = owner_ratings.dot(icm.transpose())
        owner_scale = reciprocal_row_sums(owner_ratings)
        owner_profile = csr_matrix(owner_profile.multiply(owner_scale).transpose())
        print("OFM ready")

        # Weighted combination of user and owner profiles for the targets.
        user_profile = user_profile[:, playlist_rows]
        combined = user_profile.multiply(1) + owner_profile.multiply(0.1)

        # Earlier experiments (neighbour-augmented profiles, user-based and
        # content-based predictions) used to live here and are disabled.

        track_cols = [
            dataset.get_track_index_from_id(tr_id)
            for tr_id in target_tracks
        ]
        self.R_hat_fwum = compute_cosine(combined.transpose(),
                                         icm[:, track_cols],
                                         k_filtering=500)

        # Reduce the stored URM to target tracks, then to target playlists.
        self.urm = self.urm[:, track_cols]
        self.urm = self.urm[playlist_rows]

        # Mask the tracks each target playlist already contains.
        self.R_hat_fwum[self.urm.nonzero()] = 0
        self.R_hat_fwum.eliminate_zeros()

        self.R_hat = self.R_hat_fwum

        print("R_hat done")
Example #9
0
    def fit(self, urm, target_playlist, target_tracks, dataset, urm_weight=0.8):
        """
        Fit a content-based model on a rebuilt item content matrix.

        The ICM is reassembled from artist, album, playcount, duration and tag
        blocks (the last three passed through applyTfIdf). The URM is then
        added with dataset.add_playlist_to_icm using urm_weight.

        urm: user rating matrix; rows are selected with playlist indices and
            columns with track indices
        target_playlist: iterable of playlist ids
        target_tracks: iterable of track ids
        dataset: object providing the build_* helpers, add_playlist_to_icm,
            get_playlist_index_from_id and get_track_index_from_id
        urm_weight: weight handed to add_playlist_to_icm

        k_filtering and shrinkage for compute_cosine are read from
        self.k_filtering and self.shrinkage. Stores the transposed similarity
        in self.S and top_k_filtering(R_hat, 20) in self.R_hat; nothing is
        returned.
        """
        self.pl_id_list = list(target_playlist)
        self.tr_id_list = list(target_tracks)
        self.dataset = dataset
        print("CBF started")

        # Split the dataset ICM into its feature blocks.
        base_icm = dataset.build_icm()
        artist_block = dataset.build_artist_matrix(base_icm)
        album_block = dataset.build_album_matrix(base_icm)
        playcount_block = dataset.build_playcount_matrix(base_icm)
        duration_block = dataset.build_duration_matrix(base_icm)

        tag_block = dataset.build_tags_matrix()
        tag_block = applyTfIdf(tag_block, topK=55)

        playcount_block = applyTfIdf(playcount_block)
        duration_block = applyTfIdf(duration_block)

        # Reassemble the blocks and append the weighted URM.
        blocks = [artist_block, album_block, playcount_block,
                  duration_block, tag_block]
        icm = vstack(blocks, format='csr')
        icm = dataset.add_playlist_to_icm(icm, urm, urm_weight)

        track_cols = [dataset.get_track_index_from_id(tr_id)
                      for tr_id in self.tr_id_list]
        similarity = compute_cosine(icm.transpose()[track_cols],
                                    icm,
                                    k_filtering=self.k_filtering,
                                    shrinkage=self.shrinkage,
                                    chunksize=1000)

        # Divide every similarity row by its sum.
        row_totals = similarity.sum(axis=1)
        similarity = similarity.multiply(csr_matrix(np.reciprocal(row_totals)))
        print("Similarity matrix ready!")

        playlist_rows = [dataset.get_playlist_index_from_id(pl_id)
                         for pl_id in self.pl_id_list]
        urm_targets = urm[playlist_rows]
        self.S = similarity.transpose()

        predictions = urm_targets.dot(similarity.transpose().tocsc()).tocsr()
        print("R_hat done")

        # Mask the tracks each target playlist already contains.
        already_rated = urm_targets[:, track_cols]
        predictions[already_rated.nonzero()] = 0
        predictions.eliminate_zeros()

        self.R_hat = top_k_filtering(predictions, 20)
Example #10
0
    def fit(self, urm, target_playlist, target_tracks, dataset, shrinkage=50, k_filtering=200):
        """
        Fit a content-based model whose ICM also carries rating information.

        The ICM from dataset.build_icm_2() is extended with the number of
        ratings per track (dataset.add_tracks_num_rating_to_icm) and with the
        URM, each row of which is first divided by its own sum
        (dataset.add_playlist_to_icm with weight 2).

        urm: user rating matrix (converted to CSR); rows are selected with
            playlist indices and columns with track indices
        target_playlist: iterable of playlist ids
        target_tracks: iterable of track ids
        dataset: object providing build_icm_2, add_tracks_num_rating_to_icm,
            add_playlist_to_icm, get_playlist_index_from_id and
            get_track_index_from_id
        shrinkage: shrinkage value forwarded to compute_cosine
        k_filtering: k_filtering value forwarded to compute_cosine

        Stores the transposed similarity in self.S and the predictions in
        self.R_hat; nothing is returned.
        """
        self.pl_id_list = list(target_playlist)
        self.tr_id_list = list(target_tracks)
        self.dataset = dataset
        print("CBF started")

        urm = urm.tocsr()

        icm = dataset.build_icm_2()
        icm = dataset.add_tracks_num_rating_to_icm(icm, urm)

        # Every playlist becomes a feature, weighted by the inverse of its
        # row sum.
        # NOTE(review): no guard for rows summing to zero here, unlike
        # sibling implementations — confirm the URM has no empty playlists.
        playlist_sizes = urm.sum(axis=1)
        urm_by_size = urm.multiply(np.reciprocal(playlist_sizes))
        icm = dataset.add_playlist_to_icm(icm, urm_by_size, 2)

        # Similarity of the target tracks only, against the full ICM.
        track_cols = [dataset.get_track_index_from_id(tr_id)
                      for tr_id in self.tr_id_list]
        similarity = compute_cosine(icm.transpose()[track_cols],
                                    icm,
                                    k_filtering=k_filtering,
                                    shrinkage=shrinkage)

        # Divide every similarity row by its sum.
        row_totals = similarity.sum(axis=1)
        similarity = similarity.multiply(csr_matrix(np.reciprocal(row_totals)))

        playlist_rows = [dataset.get_playlist_index_from_id(pl_id)
                         for pl_id in self.pl_id_list]
        urm_targets = urm[playlist_rows]

        self.S = similarity.transpose()

        predictions = urm_targets.dot(similarity.transpose()).tocsr()

        # Mask the tracks each target playlist already contains.
        already_rated = urm_targets[:, track_cols]
        predictions[already_rated.nonzero()] = 0
        predictions.eliminate_zeros()

        print("R_hat done")
        self.R_hat = predictions
Example #11
0
    def fit(self, urm, target_playlist, target_tracks, dataset, shrinkage=50, k_filtering=200, test_dict={}):
        """
        Fit a content-based model on an ICM enriched with an ALS-estimated URM.

        An implicit ALS model (10 factors) is fitted on the transposed URM
        scaled by 40. The product of its user and item factors (computed by
        utils.dot_chunked with topK=500) gives an estimated URM, in which the
        original ratings are forced to 1. That estimate is added to the ICM
        with weight 0.25 before the item similarity is computed.

        urm: user rating matrix; rows are selected with playlist indices and
            columns with track indices
        target_playlist: iterable of playlist ids
        target_tracks: iterable of track ids
        dataset: object providing build_icm, add_playlist_to_icm,
            get_playlist_index_from_id and get_track_index_from_id
        shrinkage: shrinkage value forwarded to utils.compute_cosine
        k_filtering: k_filtering value forwarded to utils.compute_cosine
        test_dict: not used by this method

        Stores the transposed similarity in self.S and the predictions in
        self.R_hat; nothing is returned.
        """
        self.pl_id_list = list(target_playlist)
        self.tr_id_list = list(target_tracks)
        self.dataset = dataset
        print("CBF started")

        print('Implicit matrix factorization on URM...')
        als_model = implicit.als.AlternatingLeastSquares(factors=10)
        als_model.fit(urm.transpose().multiply(40))
        user_factors = als_model.user_factors
        item_factors = als_model.item_factors

        print('Estimating URM multiplying latent factors...')
        estimated_urm = utils.dot_chunked(user_factors,
                                          item_factors.transpose(),
                                          topK=500,
                                          chunksize=1000)
        # Go through LIL for the assignment of the known ratings, then back
        # to CSR.
        estimated_urm = lil_matrix(estimated_urm)
        estimated_urm[urm.nonzero()] = 1
        estimated_urm = csr_matrix(estimated_urm)

        # ICM plus the estimated URM as extra features.
        icm = dataset.add_playlist_to_icm(dataset.build_icm(),
                                          estimated_urm,
                                          0.25).tocsr()
        print("SHAPE of ICM: ", icm.shape)

        track_cols = [dataset.get_track_index_from_id(tr_id)
                      for tr_id in self.tr_id_list]
        target_items = icm.transpose()[track_cols]
        similarity = utils.compute_cosine(target_items,
                                          icm,
                                          k_filtering=k_filtering,
                                          shrinkage=shrinkage)
        print("Similarity matrix ready, let's normalize it!")

        playlist_rows = [dataset.get_playlist_index_from_id(pl_id)
                         for pl_id in self.pl_id_list]
        urm_targets = urm[playlist_rows]

        # Divide every similarity row by its sum.
        row_totals = similarity.sum(axis=1)
        similarity = similarity.multiply(csr_matrix(np.reciprocal(row_totals)))
        self.S = similarity.transpose()

        predictions = urm_targets.dot(similarity.transpose().tocsc()).tocsr()
        print("R_hat done")

        # Mask the tracks each target playlist already contains.
        already_rated = urm_targets[:, track_cols]
        predictions[already_rated.nonzero()] = 0
        predictions.eliminate_zeros()

        print("Shape of final matrix: ", predictions.shape)
        self.R_hat = predictions
Example #12
0
    def fit(self, urm, target_playlist, target_tracks, dataset, k_feature=1000, k_similar=1000):
        """
        Fit a feature-weighted user model that matches playlists to tracks.

        Playlist side: the item-feature profile of each target playlist
        (URM x ICM', passed through self.filter_by_topic and divided by the
        playlist's row sum) stacked on the UCM columns of the target
        playlists.
        Track side: the ICM columns of the target tracks stacked on
        playlist-feature blocks (title, owner, created_at, duration,
        numtracks) derived from UCM x URM, each filtered with top_k_filtering.
        Predictions are the output of compute_cosine between the two sides.

        urm: user rating matrix; rows are selected with playlist indices and
            columns with track indices
        target_playlist: playlist ids
        target_tracks: track ids
        dataset: object providing build_icm, build_ucm, the build_*_matrix
            helpers, get_playlist_index_from_id and get_track_index_from_id
        k_feature, k_similar: not used by this method

        Side effects: sets self.urm (reduced to the targets) and self.R_hat.
        Nothing is returned.
        """
        self.pl_id_list = list(target_playlist)
        self.tr_id_list = list(target_tracks)
        self.dataset = dataset
        print("FWUM started!")

        icm = dataset.build_icm()
        self.urm = urm
        ucm_red = dataset.build_ucm()

        playlist_rows = [dataset.get_playlist_index_from_id(pl_id)
                         for pl_id in target_playlist]

        # --- playlist side: item-feature profile of the target playlists ---
        profile = urm.dot(icm.transpose())[playlist_rows].transpose()
        print("Start filtering")
        profile = self.filter_by_topic(profile, dataset).transpose()

        # Divide each profile by the playlist's row sum in the URM (zero sums
        # are replaced by one first).
        playlist_sizes = urm[playlist_rows].sum(axis=1)
        playlist_sizes[playlist_sizes == 0] = 1
        playlist_scale = np.reciprocal(playlist_sizes)
        print("UFM ready")
        profile = profile.multiply(playlist_scale).transpose()

        ucm = vstack([profile, ucm_red[:, playlist_rows]], format='csr')
        print("UCM ready")

        # --- track side: playlist features carried by every target track ---
        track_cols = [dataset.get_track_index_from_id(tr_id)
                      for tr_id in target_tracks]
        iucm = ucm_red.dot(urm)[:, track_cols]

        # Divide each column by the track's column sum in the URM (zero sums
        # are replaced by one first).
        track_counts = urm[:, track_cols].sum(axis=0)
        track_counts[track_counts == 0] = 1
        track_scale = np.reciprocal(track_counts)
        iucm = csr_matrix(iucm.multiply(track_scale))
        print("SHAPE of ICM: ", icm.shape)

        # Split the playlist features into blocks and filter each of them.
        owner_block = dataset.build_owner_matrix(iucm)
        owner_block = top_k_filtering(owner_block, 50)

        title_block = dataset.build_title_matrix(iucm)
        title_block = top_k_filtering(title_block, 50)

        created_block = dataset.build_created_at_matrix(iucm)
        created_block = top_k_filtering(created_block, 10)

        duration_block = dataset.build_pl_duration_matrix(iucm)
        duration_block = top_k_filtering(duration_block, 10)

        numtracks_block = dataset.build_numtracks_matrix(iucm)
        numtracks_block = top_k_filtering(numtracks_block, 10)

        icm = vstack([icm[:, track_cols],
                      title_block,
                      owner_block,
                      created_block,
                      duration_block,
                      numtracks_block],
                     format='csr')
        print("UFM and ICM Done!")

        # --- predictions ---
        scores = compute_cosine(ucm.transpose(), icm, k_filtering=500, shrinkage=10)
        self.R_hat = scores.tocsr()

        # Reduce the stored URM to target tracks, then to target playlists.
        self.urm = self.urm[:, track_cols]
        self.urm = self.urm[playlist_rows]

        # Mask the tracks each target playlist already contains.
        self.R_hat[self.urm.nonzero()] = 0
        self.R_hat.eliminate_zeros()
        print("R_hat done")
        print("Shape:", self.R_hat.shape)
Example #13
0
    def fit(self,
            urm,
            target_playlist,
            target_tracks,
            dataset,
            shrinkage=10,
            k_filtering=200,
            alfa=0.95):
        """
        Build R_hat from a blend of two track-track similarities: one
        computed on the item content matrix and one computed on the
        user-feature profile of the tracks.

        urm: user rating matrix, playlists on the rows and tracks on the columns
        target_playlist: iterable of playlist ids
        target_tracks: iterable of track ids
        dataset: provides the build_* matrices and the id -> index lookups
        shrinkage: accepted for interface compatibility; this method does
            not forward it to compute_cosine
        k_filtering: forwarded to both compute_cosine calls
        alfa: weight of the content similarity; the user-feature similarity
            is weighted 1 - alfa

        S = alfa * S_content + (1 - alfa) * S_user, normalized by row
        R = URM S'
        Stores self.S (transposed similarity) and self.R_hat; returns nothing.
        """

        # initialization

        self.pl_id_list = list(target_playlist)
        self.tr_id_list = list(target_tracks)
        self.dataset = dataset
        print("CBF started")

        # get ICM from dataset
        icm = dataset.build_icm_2()

        # add urm
        icm = dataset.add_playlist_to_icm(icm, urm, 0.8)
        icm_tag = dataset.build_tags_matrix()
        tags = applyTfIdf(icm_tag, 55)
        icm = vstack([icm, tags], format='csr')

        # build user content matrix
        ucm = dataset.build_ucm()

        # build item user-feature matrix: UFxI
        iucm = ucm.dot(urm)
        iucm = applyTfIdf(iucm, 100)

        # matrix indices of the target tracks, computed a single time
        tr_indices = [
            dataset.get_track_index_from_id(x) for x in self.tr_id_list
        ]

        # similarity of the target tracks wrt all tracks on the user features
        # NOTE(review): shrinkage is not passed to either compute_cosine
        # call - confirm this is intended.
        S_user = compute_cosine(iucm.transpose()[tr_indices],
                                iucm,
                                k_filtering=k_filtering,
                                normalize=True)

        # compute cosine similarity (only for tg tracks) wrt to all tracks
        S = compute_cosine(icm.transpose()[tr_indices],
                           icm,
                           k_filtering=k_filtering,
                           chunksize=1000)

        # compute a weighted average
        S = S.multiply(alfa) + S_user.multiply(1 - alfa)

        # Normalize S
        s_norm = S.sum(axis=1)
        # save from divide by zero: a row without neighbours sums to 0
        s_norm[s_norm == 0] = 1
        S = S.multiply(csr_matrix(np.reciprocal(s_norm)))

        # keep only target rows of URM
        urm_cleaned = urm[[
            dataset.get_playlist_index_from_id(x) for x in self.pl_id_list
        ]]

        # save S
        self.S = S.transpose()

        # compute ratings
        R_hat = urm_cleaned.dot(S.transpose()).tocsr()

        # apply mask for eliminating already rated items
        urm_cleaned = urm_cleaned[:, tr_indices]
        R_hat[urm_cleaned.nonzero()] = 0
        R_hat.eliminate_zeros()

        print("R_hat done")
        self.R_hat = R_hat
Example #14
0
    def fit(self, urm, target_playlist, target_tracks, dataset):
        """
        Estimate the ratings of the target playlists from the ratings of
        their most similar playlists and store them in self.R_hat.

        urm: user rating matrix, playlists on the rows and tracks on the columns
        target_playlist: iterable of playlist ids
        target_tracks: iterable of track ids
        dataset: provides the build_* matrices and the id -> index lookups

        self.k_filtering and self.shrinkage are read and forwarded to
        compute_cosine.
        S = cosine(profile of target playlists, profile of all playlists)
        R = S URM
        """
        self.pl_id_list = list(target_playlist)
        self.tr_id_list = list(target_tracks)
        self.dataset = dataset
        print("UBF started")

        # playlist content features
        user_content = dataset.build_ucm()

        # item features: base icm stacked with the tf-idf weighted tags
        base_icm = dataset.build_icm_2()
        tag_features = applyTfIdf(dataset.build_tags_matrix(), topK=55)
        item_content = vstack([base_icm, tag_features], format='csr')

        # feature profile of every playlist, scaled by the reciprocal of
        # the row sum of the urm (rows summing to 0 are left untouched)
        user_features = urm.dot(item_content.transpose())
        rated_totals = urm.sum(axis=1)
        rated_totals[rated_totals == 0] = 1
        user_features = user_features.multiply(
            np.reciprocal(rated_totals)).transpose()

        # full profile: content features and ratings (both weighted 5)
        # followed by the feature profile
        profile_blocks = [
            user_content.multiply(5),
            urm.transpose().multiply(5),
            user_features,
        ]
        profiles = vstack(profile_blocks, format='csr')

        # similarity between the target playlists and all the playlists
        target_rows = [
            dataset.get_playlist_index_from_id(pl_id)
            for pl_id in self.pl_id_list
        ]
        similarity = compute_cosine(profiles.transpose()[target_rows],
                                    profiles,
                                    k_filtering=self.k_filtering,
                                    shrinkage=self.shrinkage)

        # make every row of the similarity sum to one
        row_totals = similarity.sum(axis=1)
        row_totals[row_totals == 0] = 1
        similarity = similarity.multiply(
            csr_matrix(np.reciprocal(row_totals)))
        print("Similarity matrix ready!")

        # ratings restricted to the target tracks
        target_cols = [
            dataset.get_track_index_from_id(tr_id)
            for tr_id in self.tr_id_list
        ]
        urm_target_tracks = urm[:, target_cols]

        R_hat = similarity.dot(urm_target_tracks)
        print("R_hat done")

        # drop the tracks each target playlist has already rated
        already_rated = urm_target_tracks[target_rows]
        R_hat[already_rated.nonzero()] = 0
        R_hat.eliminate_zeros()

        print("Shape of final matrix: ", R_hat.shape)
        self.R_hat = R_hat
Example #15
0
    def fit(self,
            urm,
            target_playlist,
            target_tracks,
            dataset,
            shrinkage=10,
            k_filtering=50):
        """
        Item-based collaborative filtering on top of an augmented rating
        matrix produced by ContentBasedFiltering.

        urm: user rating matrix, playlists on the rows and tracks on the columns
        target_playlist: iterable of playlist ids
        target_tracks: iterable of track ids
        dataset: provides the id -> index lookups
        shrinkage: shrinkage factor forwarded to compute_cosine
        k_filtering: forwarded to compute_cosine

        S = cosine(r_hat_aug' [target tracks], r_hat_aug), normalized by row
        R = URM[target playlists] S'
        The result is stored in self.R_hat; self.r_hat_aug is computed only
        when it is still None and reused otherwise.
        """

        # initialization

        self.pl_id_list = list(target_playlist)
        self.tr_id_list = list(target_tracks)
        self.dataset = dataset
        print("CBF started")

        urm = urm.tocsr()

        if self.r_hat_aug is None:
            # augment r_hat
            # Forward this method's own arguments: the names previously used
            # here (tg_playlist, tg_tracks, ds) are not defined in this scope.
            cbf = ContentBasedFiltering()
            cbf.fit(urm, self.pl_id_list, self.tr_id_list, dataset)

            # get R_hat
            self.r_hat_aug = cbf.getR_hat()

        # matrix indices of the target tracks, computed a single time
        tr_indices = [
            dataset.get_track_index_from_id(x) for x in self.tr_id_list
        ]

        # do collaborative filtering
        # NOTE(review): indexing with dataset-wide track indices assumes
        # r_hat_aug has one column per dataset track - confirm.
        S_cf = compute_cosine(self.r_hat_aug.transpose()[tr_indices],
                              self.r_hat_aug,
                              k_filtering=k_filtering,
                              shrinkage=shrinkage)

        # normalize
        S_cf = normalize_by_row(S_cf)

        # rows of the urm belonging to the target playlists
        urm_target = urm[[
            dataset.get_playlist_index_from_id(x) for x in self.pl_id_list
        ]]

        self.R_hat = csr_matrix(urm_target.dot(S_cf.transpose()))

        # mask of the items already rated by the target playlists
        urm_cleaned = urm_target[:, tr_indices]

        self.R_hat[urm_cleaned.nonzero()] = 0
        self.R_hat.eliminate_zeros()

        print("R_hat done")
        self.R_hat = csr_matrix(self.R_hat)