def test_als_method_match():
    """Check that the LU and CG solvers for implicit-feedback ALS train
    comparable models: predictions from the two solvers for the same users
    and items should mostly agree.

    Fix: the final assertion previously used ``adiff``, the per-user array
    left over from the *last* loop iteration, so only one user's differences
    were checked.  It now uses ``preds.adiff``, the concatenated differences
    across all sampled users — matching the statistics logged just above.
    """
    lu = als.ImplicitMF(20, iterations=15, method='lu', rng_spec=42)
    cg = als.ImplicitMF(20, iterations=15, method='cg', rng_spec=42)

    ratings = lktu.ml_test.ratings

    timer = Stopwatch()
    lu.fit(ratings)
    timer.stop()
    _log.info('fit with LU solver in %s', timer)

    timer = Stopwatch()
    cg.fit(ratings)
    timer.stop()
    _log.info('fit with CG solver in %s', timer)

    preds = []

    # legacy RandomState so choice() accepts the raw array of IDs
    rng = util.rng(42, legacy=True)
    for u in rng.choice(ratings.user.unique(), 10, replace=False):
        items = rng.choice(ratings.item.unique(), 15, replace=False)
        lu_preds = lu.predict_for_user(u, items)
        cg_preds = cg.predict_for_user(u, items)
        diff = lu_preds - cg_preds
        adiff = np.abs(diff)
        _log.info(
            'user %s diffs: L2 = %f, min = %f, med = %f, max = %f, 90%% = %f',
            u, np.linalg.norm(diff, 2), np.min(adiff), np.median(adiff),
            np.max(adiff), np.quantile(adiff, 0.9))

        preds.append(
            pd.DataFrame({
                'user': u,
                'item': items,
                'lu': lu_preds,
                'cg': cg_preds,
                'adiff': adiff
            }))
        _log.info('user %s tau: %s', u, stats.kendalltau(lu_preds, cg_preds))

    preds = pd.concat(preds, ignore_index=True)
    _log.info('LU preds:\n%s', preds.lu.describe())
    _log.info('CG preds:\n%s', preds.cg.describe())
    _log.info('overall differences:\n%s', preds.adiff.describe())
    # there are differences. our check: the 90% (over ALL users) are reasonable
    assert np.quantile(preds.adiff, 0.9) < 0.5
# Example #2
def test_als_method_match():
    """Check that the LU and CD solvers for explicit-feedback (biased) ALS
    train comparable models: their predictions for sampled user/item pairs
    should agree to within about a quarter star at the 90th percentile.

    Fix: the final assertion previously used ``adiff``, the per-user array
    left over from the *last* loop iteration, so only one user's differences
    were checked.  It now uses ``preds.adiff``, the concatenated differences
    across all sampled users — matching the statistics logged just above.
    """
    # NOTE(review): CD gets 20 iterations vs. LU's 15 — presumably because CD
    # converges more slowly; confirm this asymmetry is intentional.
    lu = als.BiasedMF(20, iterations=15, reg=(2, 0.001), method='lu', rng_spec=42)
    cd = als.BiasedMF(20, iterations=20, reg=(2, 0.001), method='cd', rng_spec=42)

    ratings = lktu.ml_test.ratings

    timer = Stopwatch()
    lu.fit(ratings)
    timer.stop()
    _log.info('fit with LU solver in %s', timer)

    timer = Stopwatch()
    cd.fit(ratings)
    timer.stop()
    _log.info('fit with CD solver in %s', timer)

    # both models should learn the same global mean bias
    assert lu.bias.mean_ == approx(ratings.rating.mean())
    assert cd.bias.mean_ == approx(ratings.rating.mean())

    preds = []

    # legacy RandomState so choice() accepts the raw array of IDs
    rng = util.rng(42, legacy=True)
    for u in rng.choice(np.unique(ratings.user), 15, replace=False):
        items = rng.choice(np.unique(ratings.item), 15, replace=False)
        lu_preds = lu.predict_for_user(u, items)
        cd_preds = cd.predict_for_user(u, items)
        diff = lu_preds - cd_preds
        adiff = np.abs(diff)
        _log.info('user %s diffs: L2 = %f, min = %f, med = %f, max = %f, 90%% = %f', u,
                  np.linalg.norm(diff, 2),
                  np.min(adiff), np.median(adiff), np.max(adiff), np.quantile(adiff, 0.9))

        preds.append(pd.DataFrame({
            'user': u,
            'item': items,
            'lu': lu_preds,
            'cd': cd_preds,
            'adiff': adiff
        }))

    preds = pd.concat(preds, ignore_index=True)
    _log.info('LU preds:\n%s', preds.lu.describe())
    _log.info('CD preds:\n%s', preds.cd.describe())
    _log.info('overall differences:\n%s', preds.adiff.describe())
    # there are differences. our check: the 90% (over ALL users) are under a quarter star
    assert np.quantile(preds.adiff, 0.9) <= 0.27
# Example #3
def run_model(model, env, inst, cfg, *, var='gender'):
    """
    Run a STAN model.

    Samples the given STAN ``model`` over the profile records for one
    institution, prints and saves a summary, writes the extracted samples
    to HDF5, and pickles the (model, fit) pair with zstd compression.

    Args:
        model: the compiled STAN model to sample.
        env: analysis environment carrying the ``profiles`` frame.
        inst: institution key used to select profiles and name outputs.
        cfg: extra keyword arguments forwarded to ``model.sampling``.
        var: which variant to fit — ``'gender'`` or ``'dcode'``.

    Raises:
        ValueError: if ``var`` is not a recognized variant.
    """
    seed = stan_seed(inst, var)

    profiles = env.profiles.loc[inst, :]

    _log.info('running profile model on %d profiles for %s', len(profiles), inst)
    timer = Stopwatch()

    # map each variant to its (trials column, successes column, output prefix)
    variants = {
        'gender': ('Known', 'female', 'profile'),
        'dcode': ('dcknown', 'dcyes', 'profile-dcode'),
    }
    if var not in variants:
        raise ValueError(f'unknown variant {var}')
    n_col, y_col, out_pfx = variants[var]

    stan_data = {'J': len(profiles), 'n': profiles[n_col], 'y': profiles[y_col]}

    fit = model.sampling(stan_data,
                         seed=seed,
                         check_hmc_diagnostics=True,
                         **cfg)
    _log.info('profile sample for %s finished in %s', inst, timer)
    summary = fit.stansummary(pars=["mu", "sigma", "thetaP", "nP", "yP"])
    print(summary)
    (data_dir / inst / f'{out_pfx}-model.txt').write_text(summary)

    _log.info('extracting samples')
    samples = fit.extract(permuted=True)

    write_samples(data_dir / inst / f'{out_pfx}-samples.h5', samples)

    _log.info('pickling model and fit')
    with dt.zstd_write(data_dir / inst / f'{out_pfx}-fit.pkl.zstd') as ff:
        pickle.dump((model, fit), ff, protocol=4)
def inspect(opts):
    """Load a pickled model from ``opts.path`` and report its size.

    Logs the on-disk file size, load time, and peak RSS after loading,
    then re-pickles the model with protocol 5 out-of-band buffers to
    report in-memory pickle size, buffer size, and compressed size.
    """
    _log.info('inspecting file %s', opts.path)
    finfo = opts.path.stat()
    _log.info('file size: %s (%s)', finfo.st_size, binarysize(finfo.st_size))

    timer = Stopwatch()
    with opts.path.open('rb') as fh:
        model = pickle.load(fh)
    timer.stop()
    # collect before measuring so freed load-time garbage doesn't inflate RSS
    gc.collect()
    usage = resource.getrusage(resource.RUSAGE_SELF)
    _log.info('loaded model in %s', timer)
    # ru_maxrss is in kilobytes on Linux
    _log.info('max RSS %s', binarysize(usage.ru_maxrss * 1024))

    bufs = PBJar()
    timer = Stopwatch()
    p_bytes = pickle5.dumps(model, protocol=5, buffer_callback=bufs)
    timer.stop()
    bsize = bufs.total_size()
    _log.info('pickled to %d bytes in %s', len(p_bytes), timer)
    _log.info('with %d bytes of buffers', bsize)
    _log.info('total size: %s', binarysize(len(p_bytes) + bsize))
    _log.info('compresses to: %s', binarysize(len(p_bytes) + bufs.encoded_size()))
# Example #5
def do_measure(opts):
    """Evaluate saved recommendations against test data.

    Reads the test and recommendation Parquet files for the data set named
    by ``opts['-d']``, computes nDCG and reciprocal rank per list, and
    prints per-algorithm mean metrics and reciprocal-rank distributions.
    """
    name = opts['-d']

    _log.info('reading data %s', name)
    test = pd.read_parquet(f'data/{name}-test.parquet')
    recs = pd.read_parquet(f'data/{name}-recs.parquet')

    _log.info('setting up analysis')
    rla = RecListAnalysis()
    rla.add_metric(ndcg)
    rla.add_metric(recip_rank)

    timer = Stopwatch()
    results = rla.compute(recs, test, include_missing=True)
    _log.info('analyzed in %s', timer)

    # users with no recommendations score 0, not NaN
    results = results.fillna(0)
    by_algo = results.groupby('Algorithm')
    a_res = by_algo.mean()
    a_res['count'] = by_algo['nrecs'].count()
    _log.info('finished')
    print(a_res)
    print(by_algo['recip_rank'].describe())
# Example #6
# Script fragment: train a named algorithm on a named data set and pickle it.
# NOTE(review): `args`, `mod_name`, `out`, `datasets`, and `resource` are
# defined outside this fragment — presumably docopt args and a conditional
# `resource` import; confirm against the full file.
model = args.get('ALGO')
dsname = args.get('DATASET')

_log.info('importing from module %s', mod_name)
algorithms = importlib.import_module(mod_name)

# look up the algorithm class/instance and data set by name
_log.info('locating model %s', model)
algo = getattr(algorithms, model)
_log.info('locating data set %s', dsname)
data = getattr(datasets, dsname)

_log.info('loading ratings')
ratings = data.ratings
_log.info('training model')
# wrap in a Recommender so the trained model can produce ranked lists
algo = Recommender.adapt(algo)
timer = Stopwatch()
algo.fit(ratings)
timer.stop()
_log.info('trained model in %s', timer)
# `resource` is falsy when unavailable (e.g. on Windows), so guard the usage report
if resource:
    res = resource.getrusage(resource.RUSAGE_SELF)
    # ru_maxrss is in KB on Linux, hence / 1024 for MB
    _log.info('%.2fs user, %.2fs system, %.1fMB max RSS', res.ru_utime,
              res.ru_stime, res.ru_maxrss / 1024)

# default output path if none was given on the command line
if out is None:
    out = f'models/{dsname}-{model}.pkl.gz'

_log.info('writing to %s', out)
pathlib.Path(out).parent.mkdir(parents=True, exist_ok=True)
with gzip.open(out, 'wb') as f:
    pickle.dump(algo, f, 4)
# Example #7
# Script fragment: load a pickled recommender and print top-N recommendations
# for each user given on the command line.
from lkdemo import datasets, log
from lenskit.util import Stopwatch

_log = log.script(__file__)

# NOTE(review): `docopt`, `gzip`, and `pickle` must be imported elsewhere in
# this file — confirm against the full script.
args = docopt(__doc__)
n = int(args['-n'])

# optional data set gives item metadata (titles) to join onto the recs
if args['-d']:
    _log.info('using data %s', args['-d'])
    data = getattr(datasets, args['-d'])
    items = data.movies
else:
    data = None
    items = None

_log.info('reading from %s', args['MODEL'])
with gzip.open(args['MODEL'], 'rb') as f:
    algo = pickle.load(f)

for u in args['USER']:
    u = int(u)
    timer = Stopwatch()
    _log.info('getting %d recs for user %d', n, u)
    recs = algo.recommend(u, n)
    # left join keeps recommended items even when metadata is missing
    if items is not None:
        recs = recs.join(items, how='left', on='item')
    print('recommendations for', u)
    print(recs)
    _log.info('completed recommendations in %s', timer)