Example 1
def test_sparse_matrix(rng):
    ratings = ml_test.ratings
    mat, uidx, iidx = lm.sparse_ratings(ratings)

    assert mat.nrows == len(uidx)
    assert mat.nrows == ratings.user.nunique()
    assert mat.ncols == len(iidx)
    assert mat.ncols == ratings.item.nunique()

    # user indicators should correspond to user item counts
    ucounts = ratings.groupby('user').item.count()
    ucounts = ucounts.loc[uidx].cumsum()
    assert all(mat.rowptrs[1:] == ucounts.values)

    # verify rating values
    ratings = ratings.set_index(['user', 'item'])
    for u in rng.choice(uidx, size=50):
        ui = uidx.get_loc(u)
        vs = mat.row_vs(ui)
        vs = pd.Series(vs, iidx[mat.row_cs(ui)])
        rates = ratings.loc[u]['rating']
        vs, rates = vs.align(rates)
        assert not any(vs.isna())
        assert not any(rates.isna())
        assert all(vs == rates)
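
The CSR invariant these assertions rely on can be demonstrated on a tiny hand-built frame. A minimal sketch, assuming lm is lenskit.matrix as in the test above; the toy IDs are made up for illustration:

import pandas as pd
from lenskit import matrix as lm

# Three ratings: user 1 rates two items, user 2 rates one.
toy = pd.DataFrame({
    'user': [1, 1, 2],
    'item': [10, 20, 10],
    'rating': [4.0, 3.0, 5.0],
})
mat, uidx, iidx = lm.sparse_ratings(toy)

# rowptrs[1:] is the cumulative count of each user's ratings,
# which is exactly what the test checks against the groupby counts.
ucounts = toy.groupby('user').item.count()
assert all(mat.rowptrs[1:] == ucounts.loc[uidx].cumsum().values)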
Example 2
    def fit(self, ratings, **kwargs):
        """
        Train a model.

        The model-training process depends on ``save_nbrs`` and ``min_sim``, but *not* on other
        algorithm parameters.

        Args:
            ratings(pandas.DataFrame):
                (user,item,rating) data for computing item similarities.
        """
        # Training proceeds in 2 steps:
        # 1. Normalize item vectors to be mean-centered and unit-normalized
        # 2. Compute similarities with pairwise dot products
        self._timer = util.Stopwatch()

        _logger.debug('[%s] beginning fit, memory use %s', self._timer,
                      util.max_memory())

        init_rmat, users, items = matrix.sparse_ratings(ratings)
        n_items = len(items)
        _logger.info(
            '[%s] made sparse matrix for %d items (%d ratings from %d users)',
            self._timer, len(items), init_rmat.nnz, len(users))
        _logger.debug('[%s] made matrix, memory use %s', self._timer,
                      util.max_memory())

        rmat, item_means = self._mean_center(ratings, init_rmat, items)
        _logger.debug('[%s] centered, memory use %s', self._timer,
                      util.max_memory())

        rmat = self._normalize(rmat)
        _logger.debug('[%s] normalized, memory use %s', self._timer,
                      util.max_memory())

        _logger.info('[%s] computing similarity matrix', self._timer)
        smat = self._compute_similarities(rmat)
        _logger.debug('[%s] computed, memory use %s', self._timer,
                      util.max_memory())

        _logger.info('[%s] got neighborhoods for %d of %d items', self._timer,
                     np.sum(np.diff(smat.rowptrs) > 0), n_items)

        _logger.info('[%s] computed %d neighbor pairs', self._timer, smat.nnz)

        self.item_index_ = items
        self.item_means_ = item_means
        self.item_counts_ = np.diff(smat.rowptrs)
        self.sim_matrix_ = smat
        self.user_index_ = users
        self.rating_matrix_ = init_rmat
        # create an inverted similarity matrix for efficient scanning
        self._sim_inv_ = smat.transpose()
        _logger.info('[%s] transposed matrix for optimization', self._timer)
        _logger.debug('[%s] done, memory use %s', self._timer,
                      util.max_memory())

        return self
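
For context, this fit method comes from lkpy's item-item k-NN recommender. A minimal usage sketch under assumptions: the import paths match current lkpy, the constructor arguments shown are illustrative, and the data path is hypothetical:

from lenskit.algorithms.item_knn import ItemItem
from lenskit.datasets import MovieLens

ml = MovieLens('data/ml-latest-small')  # hypothetical path to a MovieLens download
algo = ItemItem(20, save_nbrs=1000, min_sim=1e-6)  # use up to 20 neighbors when scoring
algo.fit(ml.ratings)

# fit() stores its learned state in trailing-underscore attributes,
# e.g. algo.sim_matrix_ and algo.item_index_ as assigned above.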
Example 3
def test_sparse_matrix_implicit():
    ratings = lktu.ml_pandas.renamed.ratings
    ratings = ratings.loc[:, ['user', 'item']]
    mat, uidx, iidx = lm.sparse_ratings(ratings)

    assert mat.nrows == len(uidx)
    assert mat.nrows == ratings.user.nunique()
    assert mat.ncols == len(iidx)
    assert mat.ncols == ratings.item.nunique()
    assert mat.values is None
Example 4
def test_sparse_matrix_scipy_implicit():
    ratings = lktu.ml_pandas.renamed.ratings
    ratings = ratings.loc[:, ['user', 'item']]
    mat, uidx, iidx = lm.sparse_ratings(ratings, scipy=True)

    assert sps.issparse(mat)
    assert sps.isspmatrix_csr(mat)
    assert len(uidx) == ratings.user.nunique()
    assert len(iidx) == ratings.item.nunique()

    assert all(mat.data == 1.0)
Example 5
def test_sparse_matrix_scipy():
    ratings = lktu.ml_pandas.renamed.ratings
    mat, uidx, iidx = lm.sparse_ratings(ratings, scipy=True)

    assert sps.issparse(mat)
    assert sps.isspmatrix_csr(mat)
    assert len(uidx) == ratings.user.nunique()
    assert len(iidx) == ratings.item.nunique()

    # user indicators should correspond to user item counts
    ucounts = ratings.groupby('user').item.count()
    ucounts = ucounts.loc[uidx].cumsum()
    assert all(mat.indptr[1:] == ucounts.values)
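
The indptr assertion is the standard scipy CSR invariant: indptr[u + 1] - indptr[u] is the number of stored entries in row u, so indptr[1:] is the cumulative per-row count. A small plain-scipy illustration:

import numpy as np
import scipy.sparse as sps

m = sps.csr_matrix(np.array([[1, 0, 2],
                             [0, 0, 3]]))
assert list(m.indptr) == [0, 2, 3]        # cumulative entry counts per row
assert list(np.diff(m.indptr)) == [2, 1]  # per-row counts recovered by diff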
Example 6
def test_sparse_matrix():
    ratings = lktu.ml_pandas.renamed.ratings
    mat, uidx, iidx = lm.sparse_ratings(ratings)

    assert mat.nrows == len(uidx)
    assert mat.nrows == ratings.user.nunique()
    assert mat.ncols == len(iidx)
    assert mat.ncols == ratings.item.nunique()

    # user indicators should correspond to user item counts
    ucounts = ratings.groupby('user').item.count()
    ucounts = ucounts.loc[uidx].cumsum()
    assert all(mat.rowptrs[1:] == ucounts.values)
Example 7
def test_sparse_matrix_scipy(format, sps_fmt_checker):
    ratings = ml_test.ratings
    mat, uidx, iidx = lm.sparse_ratings(ratings, scipy=format)

    assert sps.issparse(mat)
    assert sps_fmt_checker(mat)
    assert len(uidx) == ratings.user.nunique()
    assert len(iidx) == ratings.item.nunique()

    # user indicators should correspond to user item counts
    ucounts = ratings.groupby('user').item.count()
    ucounts = ucounts.loc[uidx].cumsum()
    if sps.isspmatrix_coo(mat):
        mat = mat.tocsr()
    assert all(mat.indptr[1:] == ucounts.values)
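
The tocsr() branch is needed because a COO matrix stores (row, col, value) triples and has no indptr array; converting to CSR materializes the row boundaries the assertion inspects. In plain scipy:

import scipy.sparse as sps

coo = sps.coo_matrix(([1.0, 2.0], ([0, 1], [2, 0])), shape=(2, 3))
# COO exposes row/col/data arrays rather than indptr.
csr = coo.tocsr()
assert list(csr.indptr) == [0, 1, 2]  # one stored entry in each row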
Example 8
File: bpr.py Project: yw4509/lkpy
    def fit(self, ratings, **kwargs):
        timer = util.Stopwatch()
        rng = util.rng(self.rng_spec)

        matrix, users, items = sparse_ratings(ratings[['user', 'item']])

        _log.info('[%s] setting up model', timer)
        train, model = self._build_model(len(users), len(items))

        _log.info('[%s] preparing training dataset', timer)
        train_data = BprInputs(matrix, self.batch_size, self.neg_count, rng)

        _log.info('[%s] training model', timer)
        train.fit(train_data, epochs=self.epochs)

        _log.info('[%s] model finished', timer)

        self.user_index_ = users
        self.item_index_ = items
        self.model = model

        return self
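
This fit slices the ratings down to the user and item columns before calling sparse_ratings, so (as Example 3 shows) the resulting matrix is pure implicit-feedback structure with no stored values; BPR only needs to know which user-item pairs were observed. A hedged usage sketch, with hypothetical hyperparameter names that may not match this project's BPR constructor:

# Hypothetical hyperparameters; check the BPR signature in yw4509/lkpy.
algo = BPR(epochs=5, batch_size=1024, neg_count=1)
algo.fit(ratings)          # fit extracts ['user', 'item'] itself
users = algo.user_index_   # maps user IDs to model rows
items = algo.item_index_   # maps item IDs to model columns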
Example 9
def test_sparse_matrix_indexes(rng):
    ratings = ml_test.ratings
    uidx = pd.Index(rng.permutation(ratings['user'].unique()))
    iidx = pd.Index(rng.permutation(ratings['item'].unique()))

    mat, _uidx, _iidx = lm.sparse_ratings(ratings, users=uidx, items=iidx)

    assert _uidx is uidx
    assert _iidx is iidx
    assert len(_uidx) == ratings.user.nunique()
    assert len(_iidx) == ratings.item.nunique()

    # verify rating values
    ratings = ratings.set_index(['user', 'item'])
    for u in rng.choice(_uidx, size=50):
        ui = _uidx.get_loc(u)
        vs = mat.row_vs(ui)
        vs = pd.Series(vs, _iidx[mat.row_cs(ui)])
        rates = ratings.loc[u]['rating']
        vs, rates = vs.align(rates)
        assert not any(vs.isna())
        assert not any(rates.isna())
        assert all(vs == rates)
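
Pre-built indexes are mainly useful when two matrices must share coordinates, e.g. putting a train and a test split on the same rows and columns. A minimal sketch of that pattern, assuming ratings is the raw (user, item, rating) frame and every test user and item also appears in the training split:

from lenskit import matrix as lm

train = ratings.sample(frac=0.8, random_state=42)  # illustrative split
test = ratings.drop(train.index)

tr_mat, users, items = lm.sparse_ratings(train)
# Reusing the training indexes makes row u and column i refer to the
# same user and item in both matrices.
te_mat, _, _ = lm.sparse_ratings(test, users=users, items=items)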
Example 10
    def fit(self, ratings):
        """
        Train a model.

        The model-training process depends on ``save_nbrs`` and ``min_sim``, but *not* on other
        algorithm parameters.

        Args:
            ratings(pandas.DataFrame):
                (user,item,rating) data for computing item similarities.
        """
        # Training proceeds in 2 steps:
        # 1. Normalize item vectors to be mean-centered and unit-normalized
        # 2. Compute similarities with pairwise dot products
        self._timer = util.Stopwatch()

        init_rmat, users, items = matrix.sparse_ratings(ratings)
        '''
        # Find a user rating to remove, for experimenting with the unlearn algorithm.
        # Try to find non-trivial rated items to remove.
        for index, row in ratings.iterrows():
            if items.get_loc(row['item']) in [17, 138, 22, 83, 76, 31, 92]:
                # print(row['user'], row['item'], index, users.get_loc(row['user']), items.get_loc(row['item']))
                pass
        '''
        n_items = len(items)
        _logger.info(
            '[%s] made sparse matrix for %d items (%d ratings from %d users)',
            self._timer, len(items), init_rmat.nnz, len(users))

        start = time.time()
        rmat_scipy = init_rmat.to_scipy()

        self._compute_similarities_unlearn_min_centering_sparse_vectorize(
            rmat_scipy, items, users)
        end = time.time()
        learn_unlearn_time = end - start
        print("Unlearn Supported Learning: {}".format(end - start))

        rows, cols, vals = self.smat_unlearn_sparse_csr
        self.smat_unlearn_sparse = sps.csr_matrix((vals, (rows, cols)),
                                                  shape=(self.M, self.M))
        # Print the similarity matrix to verify completeness:
        # print(self.smat_unlearn_sparse)

        start = time.time()
        self._unlearn_min_centering_sparse(54, 17, rmat_scipy,
                                           self.smat_unlearn_sparse)
        end = time.time()
        unlearn_time = end - start
        print("Unlearn: {}".format(end - start))

        start = time.time()
        rmat, item_means = self._mean_center(ratings, init_rmat, items, users)

        rmat = self._normalize(rmat)
        _logger.info('[%s] computing similarity matrix', self._timer)
        smat = self._compute_similarities(rmat, items, users)

        end = time.time()
        native_learn_time = end - start
        # Print the similarity matrix to verify completeness:
        # print(smat.to_scipy())
        print("Native Learning: {}".format(end - start))

        _logger.info('[%s] got neighborhoods for %d of %d items', self._timer,
                     np.sum(np.diff(smat.rowptrs) > 0), n_items)

        _logger.info('[%s] computed %d neighbor pairs', self._timer, smat.nnz)

        self.item_index_ = items
        self.item_means_ = item_means
        self.item_counts_ = np.diff(smat.rowptrs)
        self.sim_matrix_ = smat
        self.user_index_ = users
        self.rating_matrix_ = init_rmat

        # Save the timing results for evaluation:
        # f = open("output_matrix.csv", "a+")
        # f.write("{},{},{},{}\n".format(init_rmat.nnz, native_learn_time, learn_unlearn_time, unlearn_time))
        # f.close()
        return self