Example #1
def main(args):
    mod_name = args.get('-m', 'lkdemo.algorithms')
    out = args.get('FILE', None)
    model = args.get('ALGO')
    dsname = args.get('DATASET')

    _log.info('importing from module %s', mod_name)
    algorithms = importlib.import_module(mod_name)

    _log.info('locating model %s', model)
    algo = getattr(algorithms, model)
    _log.info('locating data set %s', dsname)
    data = getattr(datasets, dsname)

    _log.info('loading ratings')
    ratings = data.ratings
    _log.info('training model')
    # wrap the algorithm so it can produce top-N recommendations if it cannot already
    algo = Recommender.adapt(algo)
    timer = Stopwatch()
    algo.fit(ratings)
    timer.stop()
    _log.info('trained model in %s', timer)
    if resource:
        res = resource.getrusage(resource.RUSAGE_SELF)
        _log.info('%.2fs user, %.2fs system, %.1fMB max RSS', res.ru_utime,
                  res.ru_stime, res.ru_maxrss / 1024)

    if out is None:
        out = f'models/{dsname}-{model}.pkl.gz'

    _log.info('writing to %s', out)
    pathlib.Path(out).parent.mkdir(parents=True, exist_ok=True)
    with gzip.open(out, 'wb') as f:
        pickle.dump(algo, f, 4)
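For context, a minimal sketch of the imports and setup this script appears to assume; the docopt-style args dict and the lkdemo helpers are inferred from the code above, not confirmed:

import gzip
import importlib
import logging
import pathlib
import pickle

try:
    import resource  # POSIX-only; the `if resource:` guard above suggests this pattern
except ImportError:
    resource = None

from docopt import docopt  # assumed: the args keys ('-m', 'ALGO', 'FILE') match docopt conventions
from lenskit.algorithms import Recommender
from lenskit.util import Stopwatch

from lkdemo import datasets  # assumed: exposes the named data set objects

_log = logging.getLogger(__name__)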
Example #2
def _train_algo(data, algo, ratings):
    algo = Recommender.adapt(algo)
    _log.info('training algorithm %s', algo)
    timer = Stopwatch()
    algo.fit(ratings)
    timer.stop()
    _log.info('trained %s in %s', algo, timer)
    return algo
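Hypothetical usage of the helper, mirroring the training step in main above (the data argument is accepted but unused by the helper itself):

trained = _train_algo(data, getattr(algorithms, model), data.ratings)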
Example #3
def test_als_method_match():
    lu = als.BiasedMF(20,
                      iterations=15,
                      reg=(2, 0.001),
                      method='lu',
                      rng_spec=42)
    cd = als.BiasedMF(20,
                      iterations=20,
                      reg=(2, 0.001),
                      method='cd',
                      rng_spec=42)

    ratings = lktu.ml_test.ratings

    timer = Stopwatch()
    lu.fit(ratings)
    timer.stop()
    _log.info('fit with LU solver in %s', timer)

    timer = Stopwatch()
    cd.fit(ratings)
    timer.stop()
    _log.info('fit with CD solver in %s', timer)

    assert lu.global_bias_ == approx(ratings.rating.mean())
    assert cd.global_bias_ == approx(ratings.rating.mean())

    preds = []

    rng = util.rng(42, legacy=True)
    for u in rng.choice(np.unique(ratings.user), 15, replace=False):
        items = rng.choice(np.unique(ratings.item), 15, replace=False)
        lu_preds = lu.predict_for_user(u, items)
        cd_preds = cd.predict_for_user(u, items)
        diff = lu_preds - cd_preds
        adiff = np.abs(diff)
        _log.info(
            'user %s diffs: L2 = %f, min = %f, med = %f, max = %f, 90%% = %f',
            u, np.linalg.norm(diff, 2), np.min(adiff), np.median(adiff),
            np.max(adiff), np.quantile(adiff, 0.9))

        preds.append(
            pd.DataFrame({
                'user': u,
                'item': items,
                'lu': lu_preds,
                'cd': cd_preds,
                'adiff': adiff
            }))

    preds = pd.concat(preds, ignore_index=True)
    _log.info('LU preds:\n%s', preds.lu.describe())
    _log.info('CD preds:\n%s', preds.cd.describe())
    _log.info('overall differences:\n%s', preds.adiff.describe())
    # there are differences; our check is that the 90th percentile is under a quarter star
    assert np.quantile(preds.adiff, 0.9) <= 0.25
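This test appears to rely on the usual lkpy test scaffolding; a hedged sketch of the imports it would need (exact module paths are assumptions based on the names used above):

import logging

import numpy as np
import pandas as pd
from pytest import approx

from lenskit import util
from lenskit.algorithms import als
from lenskit.util import Stopwatch

import lenskit.util.test as lktu  # assumed: provides the ml_test MovieLens fixture

_log = logging.getLogger(__name__)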
Example #4
def test_als_method_match():
    lu = als.ImplicitMF(20,
                        iterations=15,
                        method='lu',
                        rand=np.random.RandomState(42).randn)
    cg = als.ImplicitMF(20,
                        iterations=15,
                        method='cg',
                        rand=np.random.RandomState(42).randn)

    ratings = lktu.ml_test.ratings

    timer = Stopwatch()
    lu.fit(ratings)
    timer.stop()
    _log.info('fit with LU solver in %s', timer)

    timer = Stopwatch()
    cg.fit(ratings)
    timer.stop()
    _log.info('fit with CG solver in %s', timer)

    preds = []

    with lktu.rand_seed(42):
        for u in np.random.choice(ratings.user.unique(), 10, replace=False):
            items = np.random.choice(ratings.item.unique(), 15, replace=False)
            lu_preds = lu.predict_for_user(u, items)
            cg_preds = cg.predict_for_user(u, items)
            diff = lu_preds - cg_preds
            adiff = np.abs(diff)
            _log.info(
                'user %s diffs: L2 = %f, min = %f, med = %f, max = %f, 90%% = %f',
                u, np.linalg.norm(diff, 2), np.min(adiff), np.median(adiff),
                np.max(adiff), np.quantile(adiff, 0.9))

            preds.append(
                pd.DataFrame({
                    'user': u,
                    'item': items,
                    'lu': lu_preds,
                    'cg': cg_preds,
                    'adiff': adiff
                }))
            _log.info('user %s tau: %s', u,
                      stats.kendalltau(lu_preds, cg_preds))

    preds = pd.concat(preds, ignore_index=True)
    _log.info('LU preds:\n%s', preds.lu.describe())
    _log.info('CG preds:\n%s', preds.cg.describe())
    _log.info('overall differences:\n%s', preds.adiff.describe())
    # there are differences; our check is that the 90th percentile stays under 0.3
    assert np.quantile(preds.adiff, 0.9) <= 0.3
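lktu.rand_seed is not shown; a plausible sketch, assuming it seeds NumPy's legacy global RNG for the duration of the block and restores the previous state afterwards:

from contextlib import contextmanager

import numpy as np

@contextmanager
def rand_seed(seed):
    # hypothetical implementation: save, seed, and restore np.random's global state
    saved = np.random.get_state()
    np.random.seed(seed)
    try:
        yield
    finally:
        np.random.set_state(saved)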
Example #5
def train(options: TrainOptions):
    seed = init_rng(rng_seed(), 'train-model', options.data, options.algo)
    _log.info('using random seed %s', seed)

    ddir = data_dir / options.data
    rating_file = ddir / 'ratings.parquet'
    if options.train_data == 'all':
        mdir = ddir / 'models'
    elif options.train_data == 'eval':
        mdir = ddir / 'eval'
    else:
        raise ValueError(f'unknown training data {options.train_data}')

    mdir.mkdir(parents=True, exist_ok=True)
    mfn = mdir / f'{options.algo_fn}.model'
    if options.default:
        _log.warning('Using default settings')
        opt_fn = None
    else:
        opt_fn = ddir / 'tuning' / f'{options.algo_fn}.json'
        _log.info('Using algorithm optimization results %s', opt_fn)

    with LogFile(mdir / f'{options.algo_fn}.log'):
        _log.info('reading ratings from %s', rating_file)
        ratings = pd.read_parquet(rating_file)
        if options.drop_ratings and 'rating' in ratings.columns:
            _log.info('dropping rating column')
            ratings = ratings.drop(columns=['rating'])
        if options.train_data == 'eval':
            _log.info('reading test data')
            test = pd.read_parquet(ddir / 'eval' / 'test-ratings.parquet')
            # the test frame shares its index with the full ratings frame,
            # so mask those rows out of the training data
            train_mask = pd.Series(True, index=ratings.index)
            train_mask[test.index] = False
            ratings = ratings[train_mask].copy().reset_index(drop=True)

        implicit = 'rating' not in ratings.columns

        _log.info('loading algorithm %s for %s in %s mode', options.data,
                  options.algo, 'implicit' if implicit else 'explicit')
        algo = get_algorithm(options.data, options.algo, opt_fn, implicit)
        algo = Recommender.adapt(algo)

        _log.info('training %s on %s ratings', algo, len(ratings))
        timer = Stopwatch()
        model = algo.fit(ratings)
        timer.stop()
        _log.info('trained in %s', timer)
        _log.info('saving model to %s', mfn)
        with open(mfn, 'wb') as f:
            p = dt.CompactingPickler(f, protocol=4)
            p.dump(model)
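TrainOptions is not shown; a minimal sketch covering only the fields the function actually reads (types and comments are assumptions):

from dataclasses import dataclass

@dataclass
class TrainOptions:
    data: str           # data set directory name under data_dir
    algo: str           # algorithm name passed to get_algorithm
    algo_fn: str        # file-name-safe label used for model, log, and tuning files
    train_data: str     # 'all' (train on everything) or 'eval' (hold out test ratings)
    default: bool       # if True, skip the tuning results and use default settings
    drop_ratings: bool  # if True, drop the rating column to train in implicit mode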
Example #6
def inspect(opts):
    _log.info('inspecting file %s', opts.path)
    stat = opts.path.stat()
    _log.info('file size: %s (%s)', stat.st_size, binarysize(stat.st_size))

    timer = Stopwatch()
    with opts.path.open('rb') as f:
        model = pickle.load(f)
    timer.stop()
    gc.collect()
    res = resource.getrusage(resource.RUSAGE_SELF)
    _log.info('loaded model in %s', timer)
    _log.info('max RSS %s', binarysize(res.ru_maxrss * 1024))

    bufs = PBJar()
    timer = Stopwatch()
    p_bytes = pickle5.dumps(model, protocol=5, buffer_callback=bufs)
    timer.stop()
    bsize = bufs.total_size()
    _log.info('pickled to %d bytes in %s', len(p_bytes), timer)
    _log.info('with %d bytes of buffers', bsize)
    _log.info('total size: %s', binarysize(len(p_bytes) + bsize))
    _log.info('compresses to: %s', binarysize(len(p_bytes) + bufs.encoded_size()))
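PBJar is project-specific; a minimal stand-in showing the protocol-5 out-of-band buffer technique it relies on (encoded_size is assumed to report a compressed size, matching the 'compresses to' log line):

import pickle  # pickle5 backports the protocol-5 API to older Pythons
import zlib

class BufferJar:
    def __init__(self):
        self.buffers = []

    def __call__(self, buf: pickle.PickleBuffer):
        # pickle invokes the buffer_callback once per out-of-band PickleBuffer
        self.buffers.append(buf)

    def total_size(self):
        # raw() exposes the underlying memory as a flat memoryview
        return sum(b.raw().nbytes for b in self.buffers)

    def encoded_size(self):
        # hypothetical: size of each buffer after zlib compression
        return sum(len(zlib.compress(b.raw())) for b in self.buffers)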