def main(args):
    mod_name = args.get('-m', 'lkdemo.algorithms')
    out = args.get('FILE', None)
    model = args.get('ALGO')
    dsname = args.get('DATASET')

    _log.info('importing from module %s', mod_name)
    algorithms = importlib.import_module(mod_name)
    _log.info('locating model %s', model)
    algo = getattr(algorithms, model)
    _log.info('locating data set %s', dsname)
    data = getattr(datasets, dsname)

    _log.info('loading ratings')
    ratings = data.ratings
    _log.info('training model')
    algo = Recommender.adapt(algo)
    timer = Stopwatch()
    algo.fit(ratings)
    timer.stop()
    _log.info('trained model in %s', timer)
    if resource:
        res = resource.getrusage(resource.RUSAGE_SELF)
        _log.info('%.2fs user, %.2fs system, %.1fMB max RSS',
                  res.ru_utime, res.ru_stime, res.ru_maxrss / 1024)

    if out is None:
        out = f'models/{dsname}-{model}.pkl.gz'

    _log.info('writing to %s', out)
    pathlib.Path(out).parent.mkdir(parents=True, exist_ok=True)
    with gzip.open(out, 'wb') as f:
        pickle.dump(algo, f, 4)
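
# A minimal invocation sketch. Assumptions: `args` is a docopt-style mapping
# (the '-m', 'FILE', 'ALGO', and 'DATASET' keys above suggest docopt), and
# the 'BiasMF' algorithm and 'ml20m' dataset names are purely illustrative.
#
#     main({'-m': 'lkdemo.algorithms', 'FILE': None,
#           'ALGO': 'BiasMF', 'DATASET': 'ml20m'})
#
# With FILE unset, the model is written to models/ml20m-BiasMF.pkl.gz.
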
def _train_algo(data, algo, ratings):
    algo = Recommender.adapt(algo)
    _log.info('training algorithm %s', algo)
    timer = Stopwatch()
    algo.fit(ratings)
    timer.stop()
    _log.info('trained %s in %s', algo, timer)
    return algo
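
# Usage sketch for the helper above (assumes lenskit.algorithms.als is
# imported; the 50-factor BiasedMF is illustrative):
#
#     fitted = _train_algo(data, als.BiasedMF(50), data.ratings)
#
# Recommender.adapt returns the algorithm unchanged if it is already a
# Recommender, and otherwise wraps it so it can produce top-N lists.
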
def test_als_method_match():
    # compare the LU and CD solvers for explicit-feedback BiasedMF
    lu = als.BiasedMF(20, iterations=15, reg=(2, 0.001), method='lu', rng_spec=42)
    cd = als.BiasedMF(20, iterations=20, reg=(2, 0.001), method='cd', rng_spec=42)

    ratings = lktu.ml_test.ratings

    timer = Stopwatch()
    lu.fit(ratings)
    timer.stop()
    _log.info('fit with LU solver in %s', timer)

    timer = Stopwatch()
    cd.fit(ratings)
    timer.stop()
    _log.info('fit with CD solver in %s', timer)

    assert lu.global_bias_ == approx(ratings.rating.mean())
    assert cd.global_bias_ == approx(ratings.rating.mean())

    preds = []

    rng = util.rng(42, legacy=True)
    for u in rng.choice(np.unique(ratings.user), 15, replace=False):
        items = rng.choice(np.unique(ratings.item), 15, replace=False)
        lu_preds = lu.predict_for_user(u, items)
        cd_preds = cd.predict_for_user(u, items)
        diff = lu_preds - cd_preds
        adiff = np.abs(diff)
        _log.info('user %s diffs: L2 = %f, min = %f, med = %f, max = %f, 90%% = %f',
                  u, np.linalg.norm(diff, 2),
                  np.min(adiff), np.median(adiff), np.max(adiff),
                  np.quantile(adiff, 0.9))
        preds.append(pd.DataFrame({
            'user': u, 'item': items,
            'lu': lu_preds, 'cd': cd_preds,
            'adiff': adiff
        }))

    preds = pd.concat(preds, ignore_index=True)
    _log.info('LU preds:\n%s', preds.lu.describe())
    _log.info('CD preds:\n%s', preds.cd.describe())
    _log.info('overall differences:\n%s', preds.adiff.describe())
    # there are differences. our check: 90% of them are under a quarter star
    # (measured over all sampled users, not just the last loop iteration)
    assert np.quantile(preds.adiff, 0.9) <= 0.25
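
# The test above samples users and items through lenskit's RNG helper. A
# minimal sketch of the behavior this code relies on (assumed semantics;
# the real lenskit.util.rng also accepts generators and seed sequences):
import numpy as np

def rng(spec=None, *, legacy=False):
    """Return a seeded RNG; legacy=True yields a RandomState for APIs
    that predate numpy.random.Generator."""
    if legacy:
        return np.random.RandomState(spec)
    return np.random.default_rng(spec)
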
def test_als_method_match():
    # compare the LU and CG solvers for ImplicitMF
    lu = als.ImplicitMF(20, iterations=15, method='lu',
                        rand=np.random.RandomState(42).randn)
    cg = als.ImplicitMF(20, iterations=15, method='cg',
                        rand=np.random.RandomState(42).randn)

    ratings = lktu.ml_test.ratings

    timer = Stopwatch()
    lu.fit(ratings)
    timer.stop()
    _log.info('fit with LU solver in %s', timer)

    timer = Stopwatch()
    cg.fit(ratings)
    timer.stop()
    _log.info('fit with CG solver in %s', timer)

    preds = []

    with lktu.rand_seed(42):
        for u in np.random.choice(ratings.user.unique(), 10, replace=False):
            items = np.random.choice(ratings.item.unique(), 15, replace=False)
            lu_preds = lu.predict_for_user(u, items)
            cg_preds = cg.predict_for_user(u, items)
            diff = lu_preds - cg_preds
            adiff = np.abs(diff)
            _log.info('user %s diffs: L2 = %f, min = %f, med = %f, max = %f, 90%% = %f',
                      u, np.linalg.norm(diff, 2),
                      np.min(adiff), np.median(adiff), np.max(adiff),
                      np.quantile(adiff, 0.9))
            preds.append(pd.DataFrame({
                'user': u, 'item': items,
                'lu': lu_preds, 'cg': cg_preds,
                'adiff': adiff
            }))
            _log.info('user %s tau: %s', u, stats.kendalltau(lu_preds, cg_preds))

    preds = pd.concat(preds, ignore_index=True)
    _log.info('LU preds:\n%s', preds.lu.describe())
    _log.info('CG preds:\n%s', preds.cg.describe())
    _log.info('overall differences:\n%s', preds.adiff.describe())
    # there are differences. our check: 90% of them are reasonably small
    # (measured over all sampled users, not just the last loop iteration)
    assert np.quantile(preds.adiff, 0.9) <= 0.3
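
# lktu.rand_seed is not defined in this listing; the test uses it to make
# the np.random.choice calls reproducible. A minimal sketch of the assumed
# behavior: seed numpy's legacy global RNG for the block, then restore it.
import contextlib
import numpy as np

@contextlib.contextmanager
def rand_seed(seed):
    """Temporarily seed np.random, restoring the previous state on exit."""
    state = np.random.get_state()
    np.random.seed(seed)
    try:
        yield
    finally:
        np.random.set_state(state)
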
def train(options: TrainOptions):
    seed = init_rng(rng_seed(), 'train-model', options.data, options.algo)
    _log.info('using random seed %s', seed)

    ddir = data_dir / options.data
    rating_file = ddir / 'ratings.parquet'
    if options.train_data == 'all':
        mdir = ddir / 'models'
    elif options.train_data == 'eval':
        mdir = ddir / 'eval'
    else:
        raise ValueError(f'unknown training data {options.train_data}')
    mdir.mkdir(parents=True, exist_ok=True)
    mfn = mdir / f'{options.algo_fn}.model'

    if options.default:
        _log.warning('Using default settings')
        opt_fn = None
    else:
        opt_fn = ddir / 'tuning' / f'{options.algo_fn}.json'
        _log.info('Using algorithm optimization results %s', opt_fn)

    with LogFile(mdir / f'{options.algo_fn}.log'):
        _log.info('reading ratings from %s', rating_file)
        ratings = pd.read_parquet(rating_file)
        if options.drop_ratings and 'rating' in ratings.columns:
            _log.info('dropping rating column')
            ratings = ratings.drop(columns=['rating'])

        if options.train_data == 'eval':
            _log.info('reading test data')
            test = pd.read_parquet(ddir / 'eval' / 'test-ratings.parquet')
            train_mask = pd.Series(True, index=ratings.index)
            train_mask[test.index] = False
            ratings = ratings[train_mask].copy().reset_index(drop=True)

        implicit = 'rating' not in ratings.columns
        _log.info('loading algorithm %s for %s in %s mode',
                  options.algo, options.data,
                  'implicit' if implicit else 'explicit')
        algo = get_algorithm(options.data, options.algo, opt_fn, implicit)
        algo = Recommender.adapt(algo)

        _log.info('training %s on %s ratings', algo, len(ratings))
        timer = Stopwatch()
        model = algo.fit(ratings)
        timer.stop()
        _log.info('trained in %s', timer)

        _log.info('saving model to %s', mfn)
        with open(mfn, 'wb') as f:
            p = dt.CompactingPickler(f, protocol=4)
            p.dump(model)
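
# TrainOptions is not shown in this listing; this sketch infers its fields
# from how train() uses them (field names are real, defaults are guesses):
from dataclasses import dataclass

@dataclass
class TrainOptions:
    data: str                   # data set directory name under data_dir
    algo: str                   # algorithm name passed to get_algorithm
    algo_fn: str                # file-name stem for model/log/tuning files
    train_data: str = 'all'     # 'all' or 'eval': which ratings to train on
    default: bool = False       # skip tuning results, use default settings
    drop_ratings: bool = False  # drop ratings to force implicit training
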
def inspect(opts):
    _log.info('inspecting file %s', opts.path)
    stat = opts.path.stat()
    _log.info('file size: %s (%s)', stat.st_size, binarysize(stat.st_size))

    timer = Stopwatch()
    with opts.path.open('rb') as f:
        model = pickle.load(f)
    timer.stop()

    gc.collect()
    res = resource.getrusage(resource.RUSAGE_SELF)
    _log.info('loaded model in %s', timer)
    _log.info('max RSS %s', binarysize(res.ru_maxrss * 1024))

    bufs = PBJar()
    timer = Stopwatch()
    p_bytes = pickle5.dumps(model, protocol=5, buffer_callback=bufs)
    timer.stop()
    bsize = bufs.total_size()
    _log.info('pickled to %d bytes in %s', len(p_bytes), timer)
    _log.info('with %d bytes of buffers', bsize)
    _log.info('total size: %s', binarysize(len(p_bytes) + bsize))
    _log.info('compresses to: %s', binarysize(len(p_bytes) + bufs.encoded_size()))
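
# PBJar is not defined in this listing; inspect() uses it as a pickle
# protocol-5 buffer_callback with total_size() and encoded_size() accessors
# (binarysize is likewise assumed to be a human-readable byte formatter). A
# minimal sketch of that interface; zlib for encoded_size is a guess at how
# the "compresses to" figure is computed:
import zlib

class PBJar:
    """Collect out-of-band pickle protocol 5 buffers and report sizes."""

    def __init__(self):
        self.buffers = []

    def __call__(self, buf):
        # pickle calls this with each PickleBuffer; returning None (falsy)
        # keeps the buffer out-of-band instead of inlining it in the stream
        self.buffers.append(buf)

    def total_size(self):
        return sum(memoryview(b).nbytes for b in self.buffers)

    def encoded_size(self):
        return sum(len(zlib.compress(memoryview(b).tobytes()))
                   for b in self.buffers)
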