Beispiel #1
0
    def _mkl_similarities(self, rmat):
        """
        Build the item-item similarity matrix with the MKL block kernel.

        The rating matrix is transposed and its native form is handed to
        ``_mkl_sim_blocks`` along with ``min_sim`` and the neighbor limit.  The
        blocks that come back are wrapped as CSR matrices and copied, one after
        another, into a single square CSR matrix.

        Args:
            rmat:
                The rating matrix; it must carry values.  Presumably the
                normalized matrix produced during fitting (confirm against
                the caller).

        Returns:
            The ``(nitems, nitems)`` CSR similarity matrix, where ``nitems`` is
            the row count of the transposed input, after ``_sort_nbrs`` has
            been applied to its native form.
        """
        assert rmat.values is not None

        _logger.info('[%s] multiplying matrix with MKL', self._timer)

        # A missing or negative ``save_nbrs`` is passed to the kernel as 0.
        nbr_limit = self.save_nbrs
        if nbr_limit is None or nbr_limit < 0:
            nbr_limit = 0

        item_rows = rmat.transpose()
        n_items = item_rows.nrows
        _logger.debug('[%s] transposed, memory use %s', self._timer,
                      util.max_memory())

        blocks = _mkl_sim_blocks(item_rows.N, self.min_sim, nbr_limit)
        _logger.debug('[%s] computed blocks, memory use %s', self._timer,
                      util.max_memory())

        # Each kernel result is a 3-tuple; only its first element (the native
        # matrix) is used here.
        blocks = [matrix.CSR(N=native) for native, _first, _last in blocks]
        total_nnz = sum(blk.nnz for blk in blocks)
        total_rows = sum(blk.nrows for blk in blocks)
        _logger.info('[%s] computed %d similarities for %d items in %d blocks',
                     self._timer, total_nnz, total_rows, len(blocks))

        # The per-row entry counts of all blocks, in block order, determine
        # the row layout of the combined matrix.
        counts_per_row = np.concatenate([blk.row_nnzs() for blk in blocks])
        assert len(counts_per_row) == n_items, \
            'only have {} rows for {} items'.format(len(counts_per_row), n_items)

        smat = matrix.CSR.empty((n_items, n_items), counts_per_row,
                                rpdtype=np.int64)

        # Copy every block into the slot reserved for its rows.
        row_lo = 0
        for idx, blk in enumerate(blocks):
            row_hi = row_lo + blk.nrows
            ptr_lo, ptr_hi = smat.rowptrs[row_lo], smat.rowptrs[row_hi]
            _logger.debug('block %d (%d:%d) has %d entries, storing in %d:%d',
                          idx, row_lo, row_hi, blk.nnz, ptr_lo, ptr_hi)
            dest = slice(ptr_lo, ptr_hi)
            smat.colinds[dest] = blk.colinds
            smat.values[dest] = blk.values
            row_lo = row_hi

        _logger.info('[%s] sorting similarity matrix with %d entries',
                     self._timer, smat.nnz)
        _sort_nbrs(smat.N)

        return smat
Beispiel #2
0
    def fit(self, ratings, **kwargs):
        """
        Train the model from rating data.

        Training is governed by ``save_nbrs`` and ``min_sim``; the remaining
        algorithm parameters do *not* affect it.

        Args:
            ratings(pandas.DataFrame):
                (user,item,rating) data from which item similarities are computed.
            kwargs:
                Accepted but not used by this method.

        Returns:
            The fitted object itself.
        """
        util.check_env()
        # Two stages follow:
        #   1. mean-center and unit-normalize the item vectors;
        #   2. take pairwise dot products to obtain the similarities.
        self._timer = util.Stopwatch()

        _logger.debug('[%s] beginning fit, memory use %s', self._timer,
                      util.max_memory())
        _logger.debug('[%s] using CSR kernel %s', self._timer, csrk.name)

        raw_rmat, user_idx, item_idx = sparse_ratings(ratings)
        item_total = len(item_idx)
        _logger.info(
            '[%s] made sparse matrix for %d items (%d ratings from %d users)',
            self._timer, item_total, raw_rmat.nnz, len(user_idx))
        _logger.debug('[%s] made matrix, memory use %s', self._timer,
                      util.max_memory())

        work_rmat, means = self._mean_center(ratings, raw_rmat, item_idx)
        _logger.debug('[%s] centered, memory use %s', self._timer,
                      util.max_memory())

        work_rmat = self._normalize(work_rmat)
        _logger.debug('[%s] normalized, memory use %s', self._timer,
                      util.max_memory())

        _logger.info('[%s] computing similarity matrix', self._timer)
        sims = self._compute_similarities(work_rmat)
        _logger.debug('[%s] computed, memory use %s', self._timer,
                      util.max_memory())

        # Number of stored neighbors per row of the similarity matrix.
        nbr_counts = np.diff(sims.rowptrs)
        _logger.info('[%s] got neighborhoods for %d of %d items', self._timer,
                     np.sum(nbr_counts > 0), item_total)

        _logger.info('[%s] computed %d neighbor pairs', self._timer, sims.nnz)

        # Publish the learned state on the instance.
        self.item_index_ = item_idx
        self.item_means_ = means
        self.item_counts_ = nbr_counts
        self.sim_matrix_ = sims
        self.user_index_ = user_idx
        self.rating_matrix_ = raw_rmat
        # Keep a transposed copy of the similarities for efficient scanning.
        self._sim_inv_ = sims.transpose()
        _logger.info('[%s] transposed matrix for optimization', self._timer)
        _logger.debug('[%s] done, memory use %s', self._timer,
                      util.max_memory())

        return self