def test01_most_similar(self):
    """Compare batched ``ParALS.most_similar`` with per-key ``ALS.most_similar``.

    The 128 keys returned for the Star Wars query are looked up three ways:
    one key at a time on the ALS model, batched by key on ParALS, and batched
    by index on ParALS. All three must give equal top-10 keys and numerically
    close scores.
    """
    set_log_level(2)
    data_opt = self.get_ml100k_mm_opt()
    option = ALSOption().get_default_option()
    option.d = 20
    option.num_workers = 1
    als = ALS(option, data_opt=data_opt)
    als.initialize()
    als.train()
    pals = ParALS(als)

    seed_result = als.most_similar('49.Star_Wars_(1977)', topk=128)
    random_keys = [key for key, _ in seed_result]
    random_indexes = als.get_index_pool(random_keys)

    # Reference answer: query the plain model one key at a time.
    naive = [als.most_similar(key, topk=10) for key in random_keys]
    topks0 = []
    score_rows = []
    for result in naive:
        topks0.append([key for key, _ in result])
        score_rows.append([score for _, score in result])
    scores0 = np.array(score_rows)
    self.assertEqual(scores0.shape, (128, 10), msg='check even size')
    scores0 = scores0.reshape(len(naive), 10)

    pals.num_workers = 1
    topks1, scores1 = pals.most_similar(random_keys, topk=10, repr=True)
    topks2, scores2 = pals.most_similar(random_indexes, topk=10, repr=True)

    for left, right in combinations([topks0, topks1, topks2], 2):
        self.assertEqual(left, right)
    for left, right in combinations([scores0, scores1, scores2], 2):
        self.assertTrue(np.allclose(left, right))
def test1_is_valid_option(self):
    """Exercise ``ALSOption.is_valid_option`` on the default option.

    The default option must validate; setting ``save_best`` to ``1`` must make
    validation raise ``RuntimeError``; setting it to ``False`` must validate
    again.
    """
    option = ALSOption().get_default_option()
    self.assertTrue(ALSOption().is_valid_option(option))

    option['save_best'] = 1
    # Bind the validator first so only the validation call itself is guarded.
    validate = ALSOption().is_valid_option
    with self.assertRaises(RuntimeError):
        validate(option)

    option['save_best'] = False
    self.assertTrue(ALSOption().is_valid_option(option))
def test05_topk_MT(self):
    """Check ``ParALS.topk_recommendation`` with 4 workers for agreement and speed.

    The same user keys are recommended once through the plain ALS model and
    once through ParALS. Every ParALS result must equal the ALS result for
    that key, and the ALS run must take more than 1.5x as long as the ParALS
    run.
    """
    set_log_level(2)
    data_opt = self.get_ml100k_mm_opt()
    opt = ALSOption().get_default_option()
    opt.d = 20
    opt.num_workers = 1
    als = ALS(opt, data_opt=data_opt)
    als.initialize()
    als.train()
    als.build_userid_map()
    all_keys = als._idmanager.userids

    # perf_counter is monotonic, so a system clock adjustment during the run
    # cannot distort the measured durations.
    start_t = time.perf_counter()
    naive = als.topk_recommendation(all_keys, topk=5)
    naive_elapsed = time.perf_counter() - start_t

    pals = ParALS(als)
    pals.num_workers = 4
    start_t = time.perf_counter()
    qkeys1, topks1, _ = pals.topk_recommendation(all_keys, topk=5, repr=True)
    par_elapsed = time.perf_counter() - start_t

    self.assertEqual(len(qkeys1), len(naive))
    for q, t in zip(qkeys1, topks1):
        self.assertEqual(naive[q], t)
    # assertGreater reports both timings when the speed-up is not reached.
    self.assertGreater(naive_elapsed, par_elapsed * 1.5)
def __init__(self, opt_path=None, *args, **kwargs):
    """Set up the ALS model: options, native backend object and optional data.

    Args:
        opt_path: Option source handed to ``self.get_option``. When ``None``,
            the default ALS option is used.
        *args: Forwarded unchanged to every base-class initializer.
        **kwargs: Forwarded unchanged to every base-class initializer. Two
            keys are also read here: ``data_opt`` (overrides the ``data_opt``
            entry of the option) and ``data`` (used only when no ``data_opt``
            is available and the value is a ``Data`` instance).

    Raises:
        RuntimeError: The option enables ``accelerator`` while
            ``inited_CUALS`` is false.
        AssertionError: The backend object failed to initialise from the
            option file, or the attached data is not of type ``'matrix'``.
    """
    Algo.__init__(self, *args, **kwargs)
    ALSOption.__init__(self, *args, **kwargs)
    Evaluable.__init__(self, *args, **kwargs)
    Serializable.__init__(self, *args, **kwargs)
    Optimizable.__init__(self, *args, **kwargs)
    if opt_path is None:
        opt_path = ALSOption().get_default_option()

    self.logger = log.get_logger('ALS')
    self.opt, self.opt_path = self.get_option(opt_path)

    if self.opt.accelerator and not inited_CUALS:
        message = "ImportError CuALS, no cuda library exists."
        self.logger.error(message)
        raise RuntimeError(message)
    self.obj = CuALS() if self.opt.accelerator else CyALS()

    # The backend must be initialised even under ``python -O``; an ``assert``
    # wrapping this call would be stripped together with the call itself.
    # AssertionError is kept so existing callers observe the same exception type.
    if not self.obj.init(bytes(self.opt_path, 'utf-8')):
        raise AssertionError('cannot parse option file: %s' % self.opt_path)

    self.data = None
    data = kwargs.get('data')
    data_opt = self.opt.get('data_opt')
    data_opt = kwargs.get('data_opt', data_opt)
    if data_opt:
        self.data = buffalo.data.load(data_opt)
        self.data.create()
    elif isinstance(data, Data):
        self.data = data

    self.logger.info('ALS(%s)' % json.dumps(self.opt, indent=2))
    if self.data:
        self.logger.info(self.data.show_info())
        if self.data.data_type not in ['matrix']:
            raise AssertionError(
                'unsupported data type: %s' % self.data.data_type)
def example1():
    """Train ALS on the ml-100k files, then run hyper-parameter optimization.

    Prints the validation metrics and the items most similar to Star Wars for
    the initially trained model, runs ``als.optimize()`` against ``val_ndcg``,
    reloads the model file written to ``model_path`` and prints the similar
    items and the best parameters again.
    """
    query = '49.Star_Wars_(1977)'
    model_path = './example1.ml100k.als.optimize.bin'

    def show_similar(model):
        # Ranked listing, numbered from 01.
        print('Similar movies to Star_Wars_(1977)')
        for rank, (movie_name, score) in enumerate(model.most_similar(query), start=1):
            print(f'{rank:02d}. {score:.3f} {movie_name}')

    log.set_log_level(log.DEBUG)
    als_option = ALSOption().get_default_option()
    als_option.validation = aux.Option({'topk': 10})
    data_option = MatrixMarketOptions().get_default_option()
    data_option.input.main = '../tests/ext/ml-100k/main'
    data_option.input.iid = '../tests/ext/ml-100k/iid'

    als = ALS(als_option, data_opt=data_option)
    als.initialize()
    als.train()
    validation_json = json.dumps(als.get_validation_results(), indent=2)
    print('MovieLens 100k metrics for validations\n%s' % validation_json)
    show_similar(als)

    print('Run hyper parameter optimization for val_ndcg...')
    als.opt.num_workers = 4
    als.opt.evaluation_period = 10
    search_space = {
        'd': ['randint', ['d', 10, 128]],
        'reg_u': ['uniform', ['reg_u', 0.1, 1.0]],
        'reg_i': ['uniform', ['reg_i', 0.1, 1.0]],
        'alpha': ['randint', ['alpha', 1, 10]],
    }
    als.opt.optimize = aux.Option({
        'loss': 'val_ndcg',
        'max_trials': 100,
        'deployment': True,
        'start_with_default_parameters': True,
        'space': search_space,
    })
    log.set_log_level(log.INFO)
    als.opt.model_path = model_path
    current_parameters = {
        'alpha': als.opt.alpha,
        'd': als.opt.d,
        'reg_u': als.opt.reg_u,
        'reg_i': als.opt.reg_i,
    }
    print(json.dumps(current_parameters, indent=2))
    als.optimize()
    als.load(model_path)
    show_similar(als)

    optimization_res = als.get_optimization_data()
    best_parameters = optimization_res['best_parameters']
    print(json.dumps(optimization_res['best'], indent=2))
    tuned_parameters = {
        'alpha': int(best_parameters['alpha']),
        'd': int(best_parameters['d']),
        'reg_u': best_parameters['reg_u'],
        'reg_i': best_parameters['reg_i'],
    }
    print(json.dumps(tuned_parameters, indent=2))
def test11_train_ml_20m_on_gpu(self):
    """Run the shared ``_test7_train_ml_20m`` helper with the accelerator enabled."""
    option = ALSOption().get_default_option()
    settings = (
        ('num_workers', 8),
        ('d', 100),
        ('validation', aux.Option({'topk': 10})),
        ('compute_loss_on_training', True),
        ('accelerator', True),
        ('num_cg_max_iters', 3),
    )
    # Applied in the listed order through ordinary attribute assignment.
    for name, value in settings:
        setattr(option, name, value)
    self._test7_train_ml_20m(ALS, option)
def test06_topk_pool(self):
    """Check that ParALS and ALS agree on top-k recommendation with a pool.

    Both models are asked for the top 10 of the first ten user keys with the
    same ``pool`` argument (the int32 values 0..4); every ParALS result must
    equal the ALS result for that key.
    """
    set_log_level(2)
    data_opt = self.get_ml100k_mm_opt()
    option = ALSOption().get_default_option()
    option.d = 20
    option.num_workers = 1
    als = ALS(option, data_opt=data_opt)
    als.initialize()
    als.train()
    pals = ParALS(als)

    pool = np.arange(5, dtype=np.int32)
    als.build_userid_map()
    all_keys = als._idmanager.userids[::][:10]

    naive = als.topk_recommendation(all_keys, topk=10, pool=pool)
    par_result = pals.topk_recommendation(all_keys, topk=10, pool=pool, repr=True)
    qkeys1, topks1, _scores1 = par_result
    for query_key, topk in zip(qkeys1, topks1):
        self.assertEqual(naive[query_key], topk)
def test00_tensorboard(self):
    """Train ALS with the tensorboard option set and check validation quality.

    After training, the validation results must report ``ndcg`` above 0.025
    and ``map`` above 0.015.
    """
    set_log_level(2)
    option = ALSOption().get_default_option()
    option.d = 5
    option.validation = aux.Option({'topk': 10})
    option.tensorboard = aux.Option({'root': './tb', 'name': 'als'})

    data_opt = MatrixMarketOptions().get_default_option()
    # Each input file sits under the ml_100k prefix and is named after its key.
    for part in ('main', 'uid', 'iid'):
        setattr(data_opt.input, part, self.ml_100k + part)
    data_opt.data.value_prepro = aux.Option({'name': 'OneBased'})

    als = ALS(option, data_opt=data_opt)
    als.initialize()
    als.train()

    metrics = als.get_validation_results()
    self.assertTrue(metrics['ndcg'] > 0.025)
    self.assertTrue(metrics['map'] > 0.015)
def test5_validation(self):
    """Run the shared ``_test5_validation`` helper with a small ALS option."""
    option = ALSOption().get_default_option()
    option.d = 5
    option.num_iters = 20
    validation = aux.Option({'topk': 10})
    option.validation = validation
    tensorboard = aux.Option({'root': './tb', 'name': 'als'})
    option.tensorboard = tensorboard
    self._test5_validation(ALS, option)
def get_option(self, lib_name, algo_name, **kwargs):
    """Build the option set for one (library, algorithm) pair.

    Args:
        lib_name: ``'buffalo'``, ``'implicit'``, ``'lightfm'`` or ``'pyspark'``.
        algo_name: ``'als'`` or ``'bpr'``; not every library handles both.
        **kwargs: Optional overrides. ``d`` (default 100), ``num_iters``
            (default 10) and ``num_workers`` (default 10) are shared; the
            remaining keys (``use_cg``, ``gpu``, ``compute_loss_on_training``,
            ``calculate_training_loss``) are read only by some branches.

    Returns:
        For ``'buffalo'``, the library's default option object updated with
        the overrides; for the other libraries a plain ``dict``; ``None`` for
        any combination not handled below.
    """
    dim = kwargs.get('d', 100)
    iterations = kwargs.get('num_iters', 10)
    workers = kwargs.get('num_workers', 10)

    if lib_name == 'buffalo' and algo_name == 'als':
        from buffalo.algo.options import ALSOption
        optimizer_by_flag = {True: 'manual_cg', False: 'ldlt'}
        opt = ALSOption().get_default_option()
        opt.update({
            'd': dim,
            'optimizer': optimizer_by_flag.get(kwargs.get('use_cg', True)),
            'num_iters': iterations,
            'num_cg_max_iters': 3,
            'accelerator': kwargs.get('gpu', False),
            'num_workers': workers,
            'compute_loss_on_training': kwargs.get('compute_loss_on_training', False),
        })
        return opt

    if lib_name == 'buffalo' and algo_name == 'bpr':
        from buffalo.algo.options import BPRMFOption
        opt = BPRMFOption().get_default_option()
        opt.update({
            'd': dim,
            'num_iters': iterations,
            'num_workers': workers,
            'compute_loss_on_training': kwargs.get('compute_loss_on_training', False),
        })
        return opt

    if lib_name == 'implicit' and algo_name == 'als':
        # NOTE(review): this branch reads 'calculate_training_loss' while the
        # buffalo branches read 'compute_loss_on_training' - confirm callers
        # pass the key they intend.
        return {
            'factors': dim,
            'dtype': np.float32,
            'use_native': True,
            'use_gpu': kwargs.get('gpu', False),
            'use_cg': kwargs.get('use_cg', True),
            'iterations': iterations,
            'num_threads': workers,
            'calculate_training_loss': kwargs.get('calculate_training_loss', False),
        }

    if lib_name == 'implicit' and algo_name == 'bpr':
        return {
            'factors': dim,
            'dtype': np.float32,
            'iterations': iterations,
            'verify_negative_samples': True,
            'num_threads': workers,
        }

    if lib_name == 'lightfm' and algo_name == 'bpr':
        return {
            'epochs': iterations,
            'verbose': True,
            'num_threads': workers,
        }

    if lib_name == 'pyspark' and algo_name == 'als':
        return {
            'maxIter': iterations,
            'rank': dim,
            'alpha': 8,
            'implicitPrefs': True,
            'userCol': 'row',
            'itemCol': 'col',
            'intermediateStorageLevel': 'MEMORY_ONLY',
            'finalStorageLevel': 'MEMORY_ONLY',
            'ratingCol': 'data',
        }

    # Unhandled (library, algorithm) combination.
    return None
def __init__(self, *args, **kwargs):
    """Initialise the mock algorithm with a default ALS option.

    All positional and keyword arguments are forwarded to the base-class
    initializers. The option gets the default optimize option (with
    ``start_with_default_parameters`` switched off) and a fixed model path,
    and ``_optimize_loss`` is preset to a constant loss.
    """
    for base in (Algo, Optimizable, TensorboardExtention):
        base.__init__(self, *args, **kwargs)
    self.logger = log.get_logger('MockAlgo')

    option = ALSOption().get_default_option()
    optimize_option = ALSOption().get_default_optimize_option()
    optimize_option.start_with_default_parameters = False
    option.optimize = optimize_option
    option.model_path = 'hello.world.bin'

    self.opt = option
    self._optimize_loss = {'loss': 987654321.0}
def test4_optimize(self):
    """Check that ``als.optimize()`` beats the default validation RMSE.

    The model is trained once with the default parameters, then optimized
    for ``val_rmse``. The best optimized loss must be lower than the default
    one, and the model reloaded from ``als.bin`` must reproduce that best
    loss. The model file is removed afterwards even when the test fails.
    """
    set_log_level(2)
    model_path = 'als.bin'
    opt = ALSOption().get_default_option()
    opt.d = 5
    opt.num_workers = 2
    opt.model_path = model_path
    opt.validation = aux.Option({'topk': 10})
    optimize_option = aux.Option({
        'loss': 'val_rmse',
        'max_trials': 10,
        'deployment': True,
        'start_with_default_parameters': True,
        'space': {
            'd': ['randint', ['d', 10, 20]],
            'reg_u': ['uniform', ['reg_u', 0.1, 0.3]],
            'reg_i': ['uniform', ['reg_i', 0.1, 0.3]],
            'alpha': ['randint', ['alpha', 8, 10]]
        }
    })
    opt.optimize = optimize_option
    opt.evaluation_period = 1
    opt.tensorboard = aux.Option({'root': './tb', 'name': 'als'})

    data_opt = MatrixMarketOptions().get_default_option()
    data_opt.input.main = self.ml_100k + 'main'
    data_opt.input.uid = self.ml_100k + 'uid'
    data_opt.input.iid = self.ml_100k + 'iid'
    data_opt.data.value_prepro = aux.Option({'name': 'OneBased'})

    als = ALS(opt, data_opt=data_opt)
    try:
        als.init_factors()
        als.train()
        default_result = als.get_validation_results()
        als.optimize()
        base_loss = default_result['rmse']  # val_rmse
        optimize_loss = als.get_optimization_data()['best']['val_rmse']
        # assertGreater reports both losses when the optimization did not help.
        self.assertGreater(base_loss, optimize_loss)

        als.load(model_path)
        loss = als.get_validation_results()
        self.assertAlmostEqual(loss['rmse'], optimize_loss)
    finally:
        # Clean up on failure too. The existence check keeps a missing file
        # (presumably when optimize() failed before writing it - confirm)
        # from masking the original error.
        if os.path.exists(model_path):
            os.remove(model_path)
def test02_most_similar(self):
    """Check that batched ``ParALS.most_similar`` is faster than per-key lookups.

    Every item key is queried one at a time on the ALS model and then in one
    batched call on ParALS with 4 workers; the per-key run must take more
    than 3x as long as the batched run.
    """
    set_log_level(1)
    data_opt = self.get_ml100k_mm_opt()
    opt = ALSOption().get_default_option()
    opt.d = 20
    opt.num_workers = 1
    als = ALS(opt, data_opt=data_opt)
    als.initialize()
    als.train()
    als.build_itemid_map()
    pals = ParALS(als)
    all_keys = als._idmanager.itemids[::]

    # perf_counter is monotonic, so clock adjustments cannot skew the timings.
    start_t = time.perf_counter()
    # Plain loop: the results are discarded, only the elapsed time matters.
    for k in all_keys:
        als.most_similar(k, topk=10)
    naive_elapsed = time.perf_counter() - start_t

    pals.num_workers = 4
    start_t = time.perf_counter()
    pals.most_similar(all_keys, topk=10, repr=True)
    parals_elapsed = time.perf_counter() - start_t

    # assertGreater reports both timings when the speed-up is not reached.
    self.assertGreater(naive_elapsed, parals_elapsed * 3.0)
def test2_most_similar(self):
    """Train ALS with default options and run the shared most-similar check.

    The three Star Wars trilogy titles are passed as queries to
    ``self._test_most_similar``.
    """
    set_log_level(2)
    option = ALSOption().get_default_option()
    data_opt = MatrixMarketOptions().get_default_option()
    for part in ('main', 'uid', 'iid'):
        setattr(data_opt.input, part, self.ml_100k + part)

    als = ALS(option, data_opt=data_opt)
    als.initialize()
    als.train()

    queries = (
        '49.Star_Wars_(1977)',
        '180.Return_of_the_Jedi_(1983)',
        '171.Empire_Strikes_Back,_The_(1980)',
    )
    self._test_most_similar(als, *queries)
def example2():
    """Train ALS on the ml-20m files and dump item-to-item recommendations twice.

    The first dump uses ``ParALS.most_similar`` directly and writes
    ``als.ml20m.par.top10.tsv``. The second dump runs after an n2 HNSW index
    built from ``als.Q`` has been attached to the ParALS object and writes
    ``als.ml20m.ann.top10.tsv``. Each dump prints its elapsed time.
    """
    def dump_most_similar(par, items, out_path, batch_size=128):
        # Query `items` in batches and write one TSV row per query item:
        # the item followed by its similar items.
        start_t = time.perf_counter()
        with open(out_path, 'w') as fout:
            for idx in range(0, len(items), batch_size):
                batch = items[idx:idx + batch_size]
                topks, _ = par.most_similar(batch, repr=True)
                for q, p in zip(batch, topks):
                    fout.write('%s\t%s\n' % (q, '\t'.join(p)))
        print('took: %.3f secs' % (time.perf_counter() - start_t))

    log.set_log_level(log.INFO)
    als_option = ALSOption().get_default_option()
    data_option = MatrixMarketOptions().get_default_option()
    data_option.input.main = '../tests/ext/ml-20m/main'
    data_option.input.iid = '../tests/ext/ml-20m/iid'
    data_option.data.path = './ml20m.h5py'
    data_option.data.use_cache = True

    als = ALS(als_option, data_opt=data_option)
    als.initialize()
    als.train()
    als.normalize('item')
    als.build_itemid_map()

    print(
        'Make item recommendation on als.ml20m.par.top10.tsv with Paralell(Thread=4)'
    )
    par = ParALS(als)
    par.num_workers = 4
    all_items = als._idmanager.itemids
    dump_most_similar(par, all_items, 'als.ml20m.par.top10.tsv')

    # Optional third-party dependency, imported only when this example runs.
    from n2 import HnswIndex
    index = HnswIndex(als.Q.shape[1])
    for f in als.Q:
        index.add_data(f)
    index.build(n_threads=4)
    index.save('ml20m.n2.index')
    index.unload()

    # The message now names the file this pass actually writes.
    # NOTE(review): it still says Thread=1 although num_workers is set to 4
    # below - confirm which is intended for the ANN path.
    print(
        'Make item recommendation on als.ml20m.ann.top10.tsv with Ann(Thread=1)'
    )
    par.set_hnsw_index('ml20m.n2.index', 'item')
    par.num_workers = 4
    dump_most_similar(par, all_items, 'als.ml20m.ann.top10.tsv')
def test02_optimize(self):
    """Minimise a synthetic loss over the default ALS optimize space.

    The loss falls as ``adaptive_reg``, ``d`` and ``alpha`` grow and as
    ``reg_i`` and ``reg_u`` shrink; the assertions check that the best point
    found by ``fmin`` lies in that direction.
    """
    def synthetic_loss(params):
        # Terms are added left to right, one per tuned parameter.
        return (1.0 - params['adaptive_reg'] / 1.0
                + 1.0 / (params['d']**2 + 1)
                + 1.0 / params['alpha']
                + (params['reg_i'] / 2.0)
                + (params['reg_u'] / 2.0))

    optimize_option = ALSOption().get_default_optimize_option()
    search_space = Optimizable()._get_space(optimize_option.space)
    best = fmin(fn=synthetic_loss, space=search_space, algo=tpe.suggest, max_evals=600)

    self.assertGreaterEqual(int(best['d']), 2)  # this is shifted by 10
    self.assertGreaterEqual(int(best['alpha']), 15)
    self.assertLessEqual(best['reg_i'], 0.3)
    self.assertLessEqual(best['reg_u'], 0.3)
    self.assertEqual(best['adaptive_reg'], 1)
def test2_optimize(self):
    """Minimise a synthetic loss that rewards large values of every parameter.

    Every term of the loss shrinks as its parameter grows, so the best point
    found by ``fmin`` over the default ALS optimize space must sit at the
    high end of each range.
    """
    def synthetic_loss(params):
        # Terms are added left to right, one per tuned parameter.
        return (1.0 - params['adaptive_reg'] / 1.0
                + 1.0 / params['d']
                + 1.0 / params['alpha']
                + 1.0 / params['reg_i']
                + 1.0 / params['reg_u'])

    optimize_option = ALSOption().get_default_optimize_option()
    search_space = Optimizable()._get_space(optimize_option.space)
    best = fmin(fn=synthetic_loss, space=search_space, algo=tpe.suggest, max_evals=600)

    self.assertGreaterEqual(int(best['d']), 16)
    self.assertGreaterEqual(int(best['alpha']), 16)
    self.assertGreaterEqual(best['reg_i'], 0.5)
    self.assertGreaterEqual(best['reg_u'], 0.5)
    self.assertEqual(best['adaptive_reg'], 1)
def test6_topk(self):
    """Run the shared ``_test6_topk`` helper with a 5-dimensional ALS option."""
    option = ALSOption().get_default_option()
    option.d = 5
    validation = aux.Option({'topk': 10})
    option.validation = validation
    self._test6_topk(ALS, option)
def test0_get_default_option(self):
    """Smoke test: building the default ALS option must not raise."""
    option_factory = ALSOption()
    option_factory.get_default_option()
    # Reaching this line means no exception was raised above.
    self.assertTrue(True)
def test2_init_with_dict(self):
    """Smoke test: ALS can be constructed from an in-memory default option."""
    set_log_level(3)
    default_option = ALSOption().get_default_option()
    ALS(default_option)
    # Reaching this line means construction did not raise.
    self.assertTrue(True)
def test3_init(self):
    """Run the shared ``_test3_init`` helper with the default ALS option."""
    default_option = ALSOption().get_default_option()
    self._test3_init(ALS, default_option)
def test4_train(self):
    """Run the shared ``_test4_train`` helper with ``d`` set to 20."""
    option = ALSOption().get_default_option()
    setattr(option, 'd', 20)
    self._test4_train(ALS, option)
def test7_train_ml_20m(self):
    """Run the shared ``_test7_train_ml_20m`` helper with 8 workers and top-10 validation."""
    option = ALSOption().get_default_option()
    option.num_workers = 8
    validation = aux.Option({'topk': 10})
    option.validation = validation
    self._test7_train_ml_20m(ALS, option)
def test9_compact_serialization(self):
    """Run the shared ``_test9_compact_serialization`` helper with a small ALS option."""
    option = ALSOption().get_default_option()
    option.d = 5
    validation = aux.Option({'topk': 10})
    option.validation = validation
    self._test9_compact_serialization(ALS, option)
def test10_fast_most_similar(self):
    """Run the shared ``_test10_fast_most_similar`` helper with a small ALS option."""
    option = ALSOption().get_default_option()
    option.d = 5
    validation = aux.Option({'topk': 10})
    option.validation = validation
    self._test10_fast_most_similar(ALS, option)