Beispiel #1
0
def test_ii_train():
    """Train a neighbor-capped item-item model and sanity-check its similarities."""
    algo = knn.ItemItem(30, save_nbrs=500)
    algo.fit(simple_ratings)

    # the fitted attributes should use the expected container types
    assert isinstance(algo.item_index_, pd.Index)
    assert isinstance(algo.item_means_, np.ndarray)
    assert isinstance(algo.item_counts_, np.ndarray)
    matrix = lm.csr_to_scipy(algo.sim_matrix_)

    # 6 is a neighbor of 7
    six, seven = algo.item_index_.get_indexer([6, 7])
    _log.info('six: %d', six)
    _log.info('seven: %d', seven)
    _log.info('matrix: %s', algo.sim_matrix_)
    assert matrix[six, seven] > 0

    # recompute the mean-centered cosine similarity by hand
    def centered(item):
        vec = simple_ratings[simple_ratings.item == item].set_index('user').rating
        return vec - vec.mean()

    six_v = centered(6)
    seven_v = centered(7)
    denom = la.norm(six_v.values) * la.norm(seven_v.values)
    six_v, seven_v = six_v.align(seven_v, join='inner')
    num = six_v.dot(seven_v)
    assert matrix[six, seven] == approx(num / denom, 0.01)

    # every stored similarity is finite, positive, and at most 1
    sims = algo.sim_matrix_.values
    assert not np.any(np.isnan(sims))
    assert np.all(sims > 0)
    # a little tolerance for floating-point round-off
    assert np.all(sims < 1 + 1.0e-6)
Beispiel #2
0
def test_csr_to_sps():
    """Round-trip a random sparse matrix through the custom CSR type to SciPy."""
    # build a random dense matrix and drop all non-positive entries
    dense = np.random.randn(10, 5)
    dense[dense <= 0] = 0
    coo = sps.coo_matrix(dense)
    # make sure we actually produced a sparse matrix
    assert coo.nnz == np.sum(dense > 0)

    # construct the custom CSR from the COO triplets
    csr = lm.csr_from_coo(coo.row, coo.col, coo.data, shape=coo.shape)
    assert csr.nnz == coo.nnz
    assert csr.nrows == coo.shape[0]
    assert csr.ncols == coo.shape[1]

    # convert back to SciPy and verify the type
    sp_csr = lm.csr_to_scipy(csr)
    assert sps.isspmatrix(sp_csr)
    assert sps.isspmatrix_csr(sp_csr)

    # row pointers, column indices, and values must agree row by row
    for row in range(csr.nrows):
        assert sp_csr.indptr[row] == csr.rowptrs[row]
        assert sp_csr.indptr[row + 1] == csr.rowptrs[row + 1]
        start = sp_csr.indptr[row]
        end = sp_csr.indptr[row + 1]
        assert all(sp_csr.indices[start:end] == csr.colinds[start:end])
        assert all(sp_csr.data[start:end] == csr.values[start:end])
Beispiel #3
0
def test_ii_save_load(tmp_path):
    "Save and load a model"
    tmp_path = lktu.norm_path(tmp_path)
    original = knn.ItemItem(30, save_nbrs=500)
    _log.info('building model')
    original.fit(lktu.ml_sample())

    fn = tmp_path / 'ii.mod'
    _log.info('saving model to %s', fn)
    original.save(fn)
    _log.info('reloading model')

    # load into a fresh instance; its fitted state should come from the file
    algo = knn.ItemItem(30)
    algo.load(fn)
    _log.info('checking model')

    # reloaded similarities must be finite, positive, and at most 1
    assert all(np.logical_not(np.isnan(algo.sim_matrix_.values)))
    assert all(algo.sim_matrix_.values > 0)
    # a little tolerance
    assert all(algo.sim_matrix_.values < 1 + 1.0e-6)

    # structural equality with the original model: counts, nnz, row pointers, values
    assert all(algo.item_counts_ == original.item_counts_)
    assert algo.item_counts_.sum() == algo.sim_matrix_.nnz
    assert algo.sim_matrix_.nnz == original.sim_matrix_.nnz
    assert all(algo.sim_matrix_.rowptrs == original.sim_matrix_.rowptrs)
    assert algo.sim_matrix_.values == approx(original.sim_matrix_.values)

    r_mat = algo.sim_matrix_
    o_mat = original.sim_matrix_
    assert all(r_mat.rowptrs == o_mat.rowptrs)

    # per-row checks: values sorted descending and equal to the original's
    for i in range(len(algo.item_index_)):
        sp = r_mat.rowptrs[i]
        ep = r_mat.rowptrs[i + 1]

        # everything is in decreasing order
        assert all(np.diff(r_mat.values[sp:ep]) <= 0)
        assert all(r_mat.values[sp:ep] == o_mat.values[sp:ep])

    # NOTE(review): means come from ml_ratings while the model was fit on
    # lktu.ml_sample() — presumably consistent; confirm against the fixture
    means = ml_ratings.groupby('item').rating.mean()
    assert means[algo.item_index_].values == approx(original.item_means_)

    matrix = lm.csr_to_scipy(algo.sim_matrix_)

    # spot-check a sample of items that actually have stored neighbors
    items = pd.Series(algo.item_index_)
    items = items[algo.item_counts_ > 0]
    for i in items.sample(50):
        ipos = algo.item_index_.get_loc(i)
        _log.debug('checking item %d at position %d', i, ipos)

        row = matrix.getrow(ipos)

        # it should be sorted !
        # check this by diffing the row values, and make sure they're negative
        assert all(np.diff(row.data) < 1.0e-6)
Beispiel #4
0
def test_uu_implicit():
    """Train and apply user-user kNN on an implicit (rating-free) data set."""
    algo = knn.UserUser(20, center=False, aggregate='sum')
    implicit = ml_ratings[['user', 'item']]

    algo.fit(implicit)
    # centering is disabled, so no per-user means are stored
    assert algo.user_means_ is None

    # every user vector in the rating matrix should have unit L2 norm
    rmat = matrix.csr_to_scipy(algo.rating_matrix_)
    row_norms = sps.linalg.norm(rmat, 2, 1)
    assert row_norms == approx(1.0)

    # predictions on implicit data should be non-negative where defined
    preds = algo.predict_for_user(50, [1, 2, 42])
    assert all(preds[preds.notna()] > 0)
Beispiel #5
0
 def _normalize(self, rmat):
     """Return *rmat* with each column scaled to unit L2 norm (as project CSR)."""
     spm = matrix.csr_to_scipy(rmat)
     # compute per-column norms
     col_norms = spla.norm(spm, 2, axis=0)
     # invert only the nonzero norms so zero columns stay zero
     recip = col_norms.copy()
     nonzero = recip > 0
     recip[nonzero] = np.reciprocal(recip[nonzero])
     # right-multiplying by a diagonal scales each column
     normalized = spm @ sps.diags(recip)
     assert normalized.shape[1] == spm.shape[1]
     # and reset any NaN values to zero
     normalized.data[np.isnan(normalized.data)] = 0
     _logger.info('[%s] normalized rating matrix columns', self._timer)
     return matrix.csr_from_scipy(normalized, False)
Beispiel #6
0
    def _scipy_similarities(self, rmat):
        """Compute the item-item similarity CSR via a SciPy sparse product."""
        nitems = rmat.ncols
        sp_rmat = matrix.csr_to_scipy(rmat)

        _logger.info('[%s] multiplying matrix with scipy', self._timer)
        # Gram matrix over items, then expand to COO triplets
        sims = (sp_rmat.T @ sp_rmat).tocoo()
        nnz = sims.nnz
        rows = sims.row[:nnz]
        cols = sims.col[:nnz]
        vals = sims.data[:nnz]

        # filter (e.g. thresholds) then select/truncate into the final CSR
        rows, cols, vals = self._filter_similarities(rows, cols, vals)
        return self._select_similarities(nitems, rows, cols, vals)
Beispiel #7
0
def test_mkl_syrk():
    """Check MKL csr_syrk against a dense M.T @ M over many random matrices."""
    for _ in range(50):
        dense = np.random.randn(10, 5)
        dense[dense <= 0] = 0
        spm = sps.csr_matrix(dense)
        # the matrix must actually be sparse
        assert spm.nnz == np.sum(dense > 0)

        csr = lm.csr_from_scipy(spm)

        # syrk yields one triangle; symmetrize, then halve the doubled diagonal
        ctc = mkl_ops.csr_syrk(csr)
        full = lm.csr_to_scipy(ctc).toarray()
        full = full.T + full
        diag = np.diagonal(full)
        full = full - np.diagflat(diag) * 0.5

        expected = dense.T @ dense
        assert full == approx(expected)
Beispiel #8
0
def test_ii_train_unbounded():
    """Train item-item with no neighbor cap and verify the similarities."""
    algo = knn.ItemItem(30)
    algo.fit(simple_ratings)

    # every stored similarity is finite, positive, and at most 1
    sims = algo.sim_matrix_.values
    assert not np.any(np.isnan(sims))
    assert np.all(sims > 0)
    # a little tolerance for floating-point round-off
    assert np.all(sims < 1 + 1.0e-6)

    # 6 is a neighbor of 7
    matrix = lm.csr_to_scipy(algo.sim_matrix_)
    six, seven = algo.item_index_.get_indexer([6, 7])
    assert matrix[six, seven] > 0

    # recompute the mean-centered cosine similarity by hand
    vec6 = simple_ratings[simple_ratings.item == 6].set_index('user').rating
    vec6 = vec6 - vec6.mean()
    vec7 = simple_ratings[simple_ratings.item == 7].set_index('user').rating
    vec7 = vec7 - vec7.mean()
    denom = la.norm(vec6.values) * la.norm(vec7.values)
    vec6, vec7 = vec6.align(vec7, join='inner')
    assert matrix[six, seven] == approx(vec6.dot(vec7) / denom, 0.01)
Beispiel #9
0
def test_ii_large_models():
    "Several tests of large trained I-I models"
    _log.info('training limited model')
    MODEL_SIZE = 100
    algo_lim = knn.ItemItem(30, save_nbrs=MODEL_SIZE)
    algo_lim.fit(ml_ratings)

    _log.info('training unbounded model')
    algo_ub = knn.ItemItem(30)
    algo_ub.fit(ml_ratings)

    _log.info('testing models')
    # capped model: similarities are finite, positive, and at most 1
    assert all(np.logical_not(np.isnan(algo_lim.sim_matrix_.values)))
    assert all(algo_lim.sim_matrix_.values > 0)
    # a little tolerance
    assert all(algo_lim.sim_matrix_.values < 1 + 1.0e-6)

    # stored item means match the per-item means of the training data
    means = ml_ratings.groupby('item').rating.mean()
    assert means[algo_lim.item_index_].values == approx(algo_lim.item_means_)

    # same checks for the unbounded model
    assert all(np.logical_not(np.isnan(algo_ub.sim_matrix_.values)))
    assert all(algo_ub.sim_matrix_.values > 0)
    # a little tolerance
    assert all(algo_ub.sim_matrix_.values < 1 + 1.0e-6)

    means = ml_ratings.groupby('item').rating.mean()
    assert means[algo_ub.item_index_].values == approx(algo_ub.item_means_)

    # mean-center ratings for the hand-computed similarity checks below
    mc_rates = ml_ratings.set_index('item')\
                         .join(pd.DataFrame({'item_mean': means}))\
                         .assign(rating=lambda df: df.rating - df.item_mean)

    mat_lim = lm.csr_to_scipy(algo_lim.sim_matrix_)
    mat_ub = lm.csr_to_scipy(algo_ub.sim_matrix_)

    _log.info('checking a sample of neighborhoods')
    # sample only items that actually have stored neighbors
    items = pd.Series(algo_ub.item_index_)
    items = items[algo_ub.item_counts_ > 0]
    for i in items.sample(50):
        ipos = algo_ub.item_index_.get_loc(i)
        _log.debug('checking item %d at position %d', i, ipos)
        # both models must index items identically
        assert ipos == algo_lim.item_index_.get_loc(i)
        irates = mc_rates.loc[[i], :].set_index('user').rating

        ub_row = mat_ub.getrow(ipos)
        b_row = mat_lim.getrow(ipos)
        # the capped row has at most MODEL_SIZE neighbors, all drawn from the unbounded row
        assert b_row.nnz <= MODEL_SIZE
        assert all(pd.Series(b_row.indices).isin(ub_row.indices))

        # it should be sorted !
        # check this by diffing the row values, and make sure they're negative
        assert all(np.diff(b_row.data) < 1.0e-6)
        assert all(np.diff(ub_row.data) < 1.0e-6)

        # spot-check some similarities against hand-computed correlations
        for n in pd.Series(ub_row.indices).sample(min(10,
                                                      len(ub_row.indices))):
            n_id = algo_ub.item_index_[n]
            n_rates = mc_rates.loc[n_id, :].set_index('user').rating
            ir, nr = irates.align(n_rates, fill_value=0)
            cor = ir.corr(nr)
            assert mat_ub[ipos, n] == approx(cor)

        # short rows are equal
        if b_row.nnz < MODEL_SIZE:
            _log.debug('short row of length %d', b_row.nnz)
            assert b_row.nnz == ub_row.nnz
            ub_row.sort_indices()
            b_row.sort_indices()
            assert b_row.data == approx(ub_row.data)
            continue

        # row is truncated - check that truncation is correct
        ub_nbrs = pd.Series(ub_row.data, algo_ub.item_index_[ub_row.indices])
        b_nbrs = pd.Series(b_row.data, algo_lim.item_index_[b_row.indices])

        assert len(ub_nbrs) >= len(b_nbrs)
        assert len(b_nbrs) <= MODEL_SIZE
        assert all(b_nbrs.index.isin(ub_nbrs.index))
        # the similarities should be equal!
        b_match, ub_match = b_nbrs.align(ub_nbrs, join='inner')
        assert all(b_match == b_nbrs)
        assert b_match.values == approx(ub_match.values)
        assert b_nbrs.max() == approx(ub_nbrs.max())
        if len(ub_nbrs) > MODEL_SIZE:
            # the capped row must keep exactly the MODEL_SIZE largest neighbors
            assert len(b_nbrs) == MODEL_SIZE
            ub_shrink = ub_nbrs.nlargest(MODEL_SIZE)
            # the minimums should be equal
            assert ub_shrink.min() == approx(b_nbrs.min())
            # everything above minimum value should be the same set of items
            ubs_except_min = ub_shrink[ub_shrink > b_nbrs.min()]
            assert all(ubs_except_min.index.isin(b_nbrs.index))