def _mkl_similarities(self, rmat):
    """Compute the truncated item-item similarity matrix with the MKL-backed kernel."""
    assert rmat.values is not None

    _logger.info('[%s] multiplying matrix with MKL', self._timer)
    m_nbrs = self.save_nbrs
    if m_nbrs is None or m_nbrs < 0:
        m_nbrs = 0

    trmat = rmat.transpose()
    nitems = trmat.nrows
    # for i in range(nitems):
    #     _logger.debug('verifying row %d', i)
    #     cs = trmat.row_cs(i)
    #     assert np.all(cs >= 0)
    #     assert np.all(cs < trmat.ncols)
    #     assert pd.Series(cs).nunique() == len(cs)
    _logger.debug('[%s] transposed, memory use %s', self._timer, util.max_memory())

    s_blocks = _mkl_sim_blocks(trmat.N, self.min_sim, m_nbrs)
    _logger.debug('[%s] computed blocks, memory use %s', self._timer, util.max_memory())
    s_blocks = [matrix.CSR(N=b) for (b, bs, be) in s_blocks]
    nnz = sum(b.nnz for b in s_blocks)
    tot_rows = sum(b.nrows for b in s_blocks)
    _logger.info('[%s] computed %d similarities for %d items in %d blocks',
                 self._timer, nnz, tot_rows, len(s_blocks))
    row_nnzs = np.concatenate([b.row_nnzs() for b in s_blocks])
    assert len(row_nnzs) == nitems, \
        'only have {} rows for {} items'.format(len(row_nnzs), nitems)

    # assemble the per-block results into a single CSR similarity matrix
    smat = matrix.CSR.empty((nitems, nitems), row_nnzs, rpdtype=np.int64)
    start = 0
    for bi, b in enumerate(s_blocks):
        bnr = b.nrows
        end = start + bnr
        v_sp = smat.rowptrs[start]
        v_ep = smat.rowptrs[end]
        _logger.debug('block %d (%d:%d) has %d entries, storing in %d:%d',
                      bi, start, end, b.nnz, v_sp, v_ep)
        smat.colinds[v_sp:v_ep] = b.colinds
        smat.values[v_sp:v_ep] = b.values
        start = end

    _logger.info('[%s] sorting similarity matrix with %d entries', self._timer, smat.nnz)
    _sort_nbrs(smat.N)

    return smat
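
# Illustrative aside, not part of the algorithm: the MKL kernel above is an
# optimized path for a computation that plain SciPy can express directly.
# This sketch is hypothetical (the name _scipy_similarities_sketch and its
# exact behavior are not in the real code base); it assumes item_mat is an
# items x users scipy.sparse.csr_matrix whose rows are already mean-centered
# and unit-normalized, and it densifies the result, so it is only suitable
# for small matrices.
def _scipy_similarities_sketch(item_mat, min_sim=1.0e-6, save_nbrs=None):
    import numpy as np
    import scipy.sparse as sps

    # dot products of unit-normalized rows are cosine similarities
    sims = (item_mat @ item_mat.T).toarray()
    np.fill_diagonal(sims, 0)       # an item is not its own neighbor
    sims[sims < min_sim] = 0        # apply the minimum-similarity threshold
    if save_nbrs is not None and save_nbrs > 0:
        # zero out everything except the save_nbrs largest entries per row
        drop = np.argsort(sims, axis=1)[:, :-save_nbrs]
        np.put_along_axis(sims, drop, 0, axis=1)
    return sps.csr_matrix(sims)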
def fit(self, ratings, **kwargs):
    """
    Train a model.

    The model-training process depends on ``save_nbrs`` and
    ``min_sim``, but *not* on other algorithm parameters.

    Args:
        ratings(pandas.DataFrame):
            (user,item,rating) data for computing item similarities.
    """
    util.check_env()
    # Training proceeds in 2 steps:
    # 1. Normalize item vectors to be mean-centered and unit-normalized
    # 2. Compute similarities with pairwise dot products
    self._timer = util.Stopwatch()
    _logger.debug('[%s] beginning fit, memory use %s', self._timer, util.max_memory())
    _logger.debug('[%s] using CSR kernel %s', self._timer, csrk.name)

    init_rmat, users, items = sparse_ratings(ratings)
    n_items = len(items)
    _logger.info('[%s] made sparse matrix for %d items (%d ratings from %d users)',
                 self._timer, len(items), init_rmat.nnz, len(users))
    _logger.debug('[%s] made matrix, memory use %s', self._timer, util.max_memory())

    rmat, item_means = self._mean_center(ratings, init_rmat, items)
    _logger.debug('[%s] centered, memory use %s', self._timer, util.max_memory())

    rmat = self._normalize(rmat)
    _logger.debug('[%s] normalized, memory use %s', self._timer, util.max_memory())

    _logger.info('[%s] computing similarity matrix', self._timer)
    smat = self._compute_similarities(rmat)
    _logger.debug('[%s] computed, memory use %s', self._timer, util.max_memory())

    _logger.info('[%s] got neighborhoods for %d of %d items',
                 self._timer, np.sum(np.diff(smat.rowptrs) > 0), n_items)
    _logger.info('[%s] computed %d neighbor pairs', self._timer, smat.nnz)

    self.item_index_ = items
    self.item_means_ = item_means
    self.item_counts_ = np.diff(smat.rowptrs)
    self.sim_matrix_ = smat
    self.user_index_ = users
    self.rating_matrix_ = init_rmat
    # create an inverted similarity matrix for efficient scanning
    self._sim_inv_ = smat.transpose()
    _logger.info('[%s] transposed matrix for optimization', self._timer)
    _logger.debug('[%s] done, memory use %s', self._timer, util.max_memory())

    return self
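
# Example usage (illustrative): assuming this class is LensKit's ItemItem and
# the ratings frame has the 'user', 'item', and 'rating' columns that
# sparse_ratings() expects, a fit looks like:
#
#     import pandas as pd
#     ratings = pd.DataFrame({
#         'user': [1, 1, 2, 2, 3],
#         'item': [10, 20, 10, 30, 20],
#         'rating': [4.0, 3.5, 5.0, 2.0, 4.5],
#     })
#     algo = ItemItem(20, save_nbrs=500, min_sim=1.0e-6)
#     algo.fit(ratings)
#     algo.sim_matrix_      # truncated item-item similarity matrix (CSR)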