Exemple #1
0
    def test01_most_similar(self):
        """Compare ParALS.most_similar against per-key ALS.most_similar calls.

        The query set is the list of keys returned by one ``most_similar``
        call with ``topk=128``. ParALS is queried twice, once with keys and
        once with the indexes from ``get_index_pool``; all three result sets
        must agree pairwise.
        """
        set_log_level(2)
        data_opt = self.get_ml100k_mm_opt()
        model_opt = ALSOption().get_default_option()
        model_opt.d = 20
        model_opt.num_workers = 1
        model = ALS(model_opt, data_opt=data_opt)
        model.initialize()
        model.train()
        parallel = ParALS(model)

        query_keys = []
        for key, _ in model.most_similar('49.Star_Wars_(1977)', topk=128):
            query_keys.append(key)
        query_indexes = model.get_index_pool(query_keys)

        # Reference results: one serial call per query key.
        serial = [model.most_similar(key, topk=10) for key in query_keys]
        serial_topks = [[key for key, _ in row] for row in serial]
        serial_scores = np.array(
            [[score for _, score in row] for row in serial])
        self.assertEqual(serial_scores.shape, (128, 10), msg='check even size')
        serial_scores = serial_scores.reshape(len(serial), 10)

        parallel.num_workers = 1
        topks_by_key, scores_by_key = parallel.most_similar(
            query_keys, topk=10, repr=True)
        topks_by_index, scores_by_index = parallel.most_similar(
            query_indexes, topk=10, repr=True)

        all_topks = [serial_topks, topks_by_key, topks_by_index]
        all_scores = [serial_scores, scores_by_key, scores_by_index]
        for left, right in combinations(all_topks, 2):
            self.assertEqual(left, right)
        for left, right in combinations(all_scores, 2):
            self.assertTrue(np.allclose(left, right))
Exemple #2
0
 def test1_is_valid_option(self):
     """Check is_valid_option on the defaults and on two ``save_best`` values.

     The default option and ``save_best = False`` must validate; the
     integer ``1`` must make validation raise RuntimeError.
     """
     option = ALSOption().get_default_option()
     self.assertTrue(ALSOption().is_valid_option(option))

     option['save_best'] = 1
     validate = ALSOption().is_valid_option
     with self.assertRaises(RuntimeError):
         validate(option)

     option['save_best'] = False
     self.assertTrue(ALSOption().is_valid_option(option))
Exemple #3
0
    def test05_topk_MT(self):
        """Compare serial and 4-worker ParALS top-k recommendation.

        Both paths are timed over every user id. The results must match per
        query key, and the serial run must take more than 1.5x the parallel
        run's wall-clock time.
        """
        set_log_level(2)
        data_opt = self.get_ml100k_mm_opt()
        model_opt = ALSOption().get_default_option()
        model_opt.d = 20
        model_opt.num_workers = 1
        model = ALS(model_opt, data_opt=data_opt)
        model.initialize()
        model.train()

        model.build_userid_map()
        user_keys = model._idmanager.userids
        began = time.time()
        serial = model.topk_recommendation(user_keys, topk=5)
        serial_secs = time.time() - began

        parallel = ParALS(model)
        parallel.num_workers = 4
        began = time.time()
        queried, recommended, _ = parallel.topk_recommendation(
            user_keys, topk=5, repr=True)
        parallel_secs = time.time() - began

        self.assertEqual(len(queried), len(serial))
        for user, items in zip(queried, recommended):
            self.assertEqual(serial[user], items)
        self.assertTrue(serial_secs > parallel_secs * 1.5)
Exemple #4
0
    def __init__(self, opt_path=None, *args, **kwargs):
        """Set up the ALS backend object and, when given, its training data.

        Args:
            opt_path: Value passed to ``self.get_option``. When ``None``,
                the result of ``ALSOption().get_default_option()`` is used.
            *args: Forwarded unchanged to every base-class initializer.
            **kwargs: Forwarded to every base-class initializer. Two keys are
                also read here: ``data_opt`` (overrides the ``data_opt``
                entry of the resolved option) and ``data`` (used only when no
                data option is available and it is a ``Data`` instance).

        Raises:
            RuntimeError: The ``accelerator`` option is set but
                ``inited_CUALS`` is falsy.
            AssertionError: The backend object's ``init`` returned a falsy
                value, or the loaded data's ``data_type`` is not
                ``'matrix'``.
        """
        Algo.__init__(self, *args, **kwargs)
        ALSOption.__init__(self, *args, **kwargs)
        Evaluable.__init__(self, *args, **kwargs)
        Serializable.__init__(self, *args, **kwargs)
        Optimizable.__init__(self, *args, **kwargs)
        if opt_path is None:
            opt_path = ALSOption().get_default_option()

        self.logger = log.get_logger('ALS')
        self.opt, self.opt_path = self.get_option(opt_path)
        if self.opt.accelerator and not inited_CUALS:
            message = "ImportError CuALS, no cuda library exists."
            self.logger.error(message)
            raise RuntimeError(message)
        self.obj = CuALS() if self.opt.accelerator else CyALS()
        # Call init() outside the assert: under ``python -O`` assert
        # statements are stripped, which would otherwise skip the backend
        # initialisation entirely.
        backend_ready = self.obj.init(bytes(self.opt_path, 'utf-8'))
        assert backend_ready, 'cannot parse option file: %s' % opt_path

        self.data = None
        data = kwargs.get('data')
        # An explicit ``data_opt`` keyword wins over the one in the option.
        data_opt = self.opt.get('data_opt')
        data_opt = kwargs.get('data_opt', data_opt)
        if data_opt:
            self.data = buffalo.data.load(data_opt)
            self.data.create()
        elif isinstance(data, Data):
            self.data = data
        self.logger.info('ALS(%s)' % json.dumps(self.opt, indent=2))
        if self.data:
            self.logger.info(self.data.show_info())
            assert self.data.data_type in ['matrix']
Exemple #5
0
def example1():
    """Train ALS on ml-100k, run hyper-parameter optimization, print results.

    Prints the validation metrics and the items most similar to one movie,
    then runs ``optimize()`` on ``val_ndcg``, reloads the saved model and
    prints the similar items and the best parameters again.
    """
    query_movie = '49.Star_Wars_(1977)'
    model_path = './example1.ml100k.als.optimize.bin'

    def show_similar(model):
        # Ranks are printed 1-based and zero-padded to two digits.
        print('Similar movies to Star_Wars_(1977)')
        ranked = enumerate(model.most_similar(query_movie), start=1)
        for position, (movie_name, score) in ranked:
            print(f'{position:02d}. {score:.3f} {movie_name}')

    log.set_log_level(log.DEBUG)
    als_option = ALSOption().get_default_option()
    als_option.validation = aux.Option({'topk': 10})
    data_option = MatrixMarketOptions().get_default_option()
    data_option.input.main = '../tests/ext/ml-100k/main'
    data_option.input.iid = '../tests/ext/ml-100k/iid'

    als = ALS(als_option, data_opt=data_option)
    als.initialize()
    als.train()
    validation_json = json.dumps(als.get_validation_results(), indent=2)
    print('MovieLens 100k metrics for validations\n%s' % validation_json)

    show_similar(als)

    print('Run hyper parameter optimization for val_ndcg...')
    als.opt.num_workers = 4
    als.opt.evaluation_period = 10
    search_space = {
        'd': ['randint', ['d', 10, 128]],
        'reg_u': ['uniform', ['reg_u', 0.1, 1.0]],
        'reg_i': ['uniform', ['reg_i', 0.1, 1.0]],
        'alpha': ['randint', ['alpha', 1, 10]],
    }
    als.opt.optimize = aux.Option({
        'loss': 'val_ndcg',
        'max_trials': 100,
        'deployment': True,
        'start_with_default_parameters': True,
        'space': search_space,
    })
    log.set_log_level(log.INFO)
    als.opt.model_path = model_path
    starting_point = {
        'alpha': als.opt.alpha,
        'd': als.opt.d,
        'reg_u': als.opt.reg_u,
        'reg_i': als.opt.reg_i,
    }
    print(json.dumps(starting_point, indent=2))
    als.optimize()
    als.load(model_path)

    show_similar(als)

    optimization_res = als.get_optimization_data()
    best_parameters = optimization_res['best_parameters']

    print(json.dumps(optimization_res['best'], indent=2))
    best_summary = {
        'alpha': int(best_parameters['alpha']),
        'd': int(best_parameters['d']),
        'reg_u': best_parameters['reg_u'],
        'reg_i': best_parameters['reg_i'],
    }
    print(json.dumps(best_summary, indent=2))
Exemple #6
0
 def test11_train_ml_20m_on_gpu(self):
     """Run the shared ml-20m training check with ``accelerator`` enabled."""
     gpu_opt = ALSOption().get_default_option()
     overrides = (
         ('num_workers', 8),
         ('d', 100),
         ('validation', aux.Option({'topk': 10})),
         ('compute_loss_on_training', True),
         ('accelerator', True),
         ('num_cg_max_iters', 3),
     )
     # Applied in the listed order, one attribute at a time.
     for name, value in overrides:
         setattr(gpu_opt, name, value)
     self._test7_train_ml_20m(ALS, gpu_opt)
Exemple #7
0
    def test06_topk_pool(self):
        """Check serial and ParALS top-k agree when restricted to a pool.

        The pool is the int32 array ``[0, 1, 2, 3, 4]`` and the queries are
        the first ten user ids.
        """
        set_log_level(2)
        data_opt = self.get_ml100k_mm_opt()
        model_opt = ALSOption().get_default_option()
        model_opt.d = 20
        model_opt.num_workers = 1
        model = ALS(model_opt, data_opt=data_opt)
        model.initialize()
        model.train()
        parallel = ParALS(model)

        candidate_pool = np.arange(5, dtype=np.int32)
        model.build_userid_map()
        user_keys = model._idmanager.userids[::][:10]
        serial = model.topk_recommendation(
            user_keys, topk=10, pool=candidate_pool)
        queried, recommended, _ = parallel.topk_recommendation(
            user_keys, topk=10, pool=candidate_pool, repr=True)
        for user, items in zip(queried, recommended):
            self.assertEqual(serial[user], items)
Exemple #8
0
    def test00_tensorboard(self):
        """Train ALS with the tensorboard option set and check two metrics.

        After training, validation ``ndcg`` must exceed 0.025 and ``map``
        must exceed 0.015.
        """
        set_log_level(2)
        model_opt = ALSOption().get_default_option()
        model_opt.d = 5
        model_opt.validation = aux.Option({'topk': 10})
        model_opt.tensorboard = aux.Option({'root': './tb', 'name': 'als'})

        data_opt = MatrixMarketOptions().get_default_option()
        # Each input file is named after its field under the ml-100k prefix.
        for field in ('main', 'uid', 'iid'):
            setattr(data_opt.input, field, self.ml_100k + field)
        data_opt.data.value_prepro = aux.Option({'name': 'OneBased'})

        model = ALS(model_opt, data_opt=data_opt)
        model.initialize()
        model.train()
        metrics = model.get_validation_results()
        for metric_name, floor in (('ndcg', 0.025), ('map', 0.015)):
            self.assertTrue(metrics[metric_name] > floor)
Exemple #9
0
 def test5_validation(self):
     """Run the shared validation check for ALS with a small configuration."""
     als_opt = ALSOption().get_default_option()
     als_opt.d = 5
     als_opt.num_iters = 20
     topk_validation = aux.Option({'topk': 10})
     als_opt.validation = topk_validation
     board = aux.Option({'root': './tb', 'name': 'als'})
     als_opt.tensorboard = board
     self._test5_validation(ALS, als_opt)
Exemple #10
0
 def get_option(self, lib_name, algo_name, **kwargs):
     """Return the option set for one (library, algorithm) pair.

     Args:
         lib_name: One of ``'buffalo'``, ``'implicit'``, ``'lightfm'`` or
             ``'pyspark'``.
         algo_name: ``'als'`` or ``'bpr'``; not every library handles both.
         **kwargs: Optional overrides. Keys read are ``d``, ``num_iters``,
             ``num_workers``, ``gpu``, ``use_cg`` and the training-loss
             flag. The flag may be spelled ``compute_loss_on_training`` or
             ``calculate_training_loss``; each library prefers its own
             spelling and falls back to the other, so one kwargs set can be
             shared across libraries.

     Returns:
         For buffalo, the library's default option object updated with the
         overrides. For the other libraries, a plain dict of constructor
         arguments. ``None`` when the combination is not handled.
     """
     buffalo_flag = 'compute_loss_on_training'
     implicit_flag = 'calculate_training_loss'

     if lib_name == 'buffalo':
         # Prefer buffalo's own spelling, fall back to implicit's.
         train_loss = kwargs.get(buffalo_flag,
                                 kwargs.get(implicit_flag, False))
         if algo_name == 'als':
             from buffalo.algo.options import ALSOption
             opt = ALSOption().get_default_option()
             opt.update({'d': kwargs.get('d', 100),
                         'optimizer': {True: 'manual_cg', False: 'ldlt'}.get(kwargs.get('use_cg', True)),
                         'num_iters': kwargs.get('num_iters', 10),
                         'num_cg_max_iters': 3,
                         'accelerator': kwargs.get('gpu', False),
                         'num_workers': kwargs.get('num_workers', 10),
                         'compute_loss_on_training': train_loss})
             return opt
         if algo_name == 'bpr':
             from buffalo.algo.options import BPRMFOption
             opt = BPRMFOption().get_default_option()
             opt.update({'d': kwargs.get('d', 100),
                         'num_iters': kwargs.get('num_iters', 10),
                         'num_workers': kwargs.get('num_workers', 10),
                         'compute_loss_on_training': train_loss})
             return opt
     elif lib_name == 'implicit':
         if algo_name == 'als':
             # Prefer implicit's own spelling, fall back to buffalo's.
             train_loss = kwargs.get(implicit_flag,
                                     kwargs.get(buffalo_flag, False))
             return {'factors': kwargs.get('d', 100),
                     'dtype': np.float32,
                     'use_native': True,
                     'use_gpu': kwargs.get('gpu', False),
                     'use_cg': kwargs.get('use_cg', True),
                     'iterations': kwargs.get('num_iters', 10),
                     'num_threads': kwargs.get('num_workers', 10),
                     'calculate_training_loss': train_loss}
         if algo_name == 'bpr':
             return {'factors': kwargs.get('d', 100),
                     'dtype': np.float32,
                     'iterations': kwargs.get('num_iters', 10),
                     'verify_negative_samples': True,
                     'num_threads': kwargs.get('num_workers', 10)}
     elif lib_name == 'lightfm':
         if algo_name == 'bpr':
             return {'epochs': kwargs.get('num_iters', 10),
                     'verbose': True,
                     'num_threads': kwargs.get('num_workers', 10)}
     elif lib_name == 'pyspark':
         if algo_name == 'als':
             return {'maxIter': kwargs.get('num_iters', 10),
                     'rank': kwargs.get('d', 100),
                     'alpha': 8,
                     'implicitPrefs': True,
                     'userCol': 'row',
                     'itemCol': 'col',
                     'intermediateStorageLevel': 'MEMORY_ONLY',
                     'finalStorageLevel': 'MEMORY_ONLY',
                     'ratingCol': 'data'}
     # Unhandled (library, algorithm) combination.
     return None
Exemple #11
0
 def __init__(self, *args, **kwargs):
     """Initialise the mock algorithm with a fixed option and loss value.

     All positional and keyword arguments are forwarded to each base-class
     initializer. ``self.opt`` is the default ALS option carrying the
     default optimize option with ``start_with_default_parameters``
     switched off.
     """
     for base in (Algo, Optimizable, TensorboardExtention):
         base.__init__(self, *args, **kwargs)
     self.logger = log.get_logger('MockAlgo')

     base_option = ALSOption().get_default_option()
     search_option = ALSOption().get_default_optimize_option()
     search_option.start_with_default_parameters = False
     base_option.optimize = search_option
     base_option.model_path = 'hello.world.bin'

     self.opt = base_option
     self._optimize_loss = {'loss': 987654321.0}
Exemple #12
0
    def test4_optimize(self):
        """Check that optimize() improves val_rmse and deploys the model.

        Trains once with the starting option, runs ``optimize()``, and
        asserts that the best ``val_rmse`` found is lower than the baseline
        ``rmse``. The model saved at ``model_path`` is then reloaded and its
        validation ``rmse`` must match the best loss.

        The model file is removed in a ``finally`` clause so that a failing
        assertion does not leave the artifact behind for later test runs.
        """
        set_log_level(2)
        model_path = 'als.bin'
        opt = ALSOption().get_default_option()
        opt.d = 5
        opt.num_workers = 2
        opt.model_path = model_path
        opt.validation = aux.Option({'topk': 10})
        optimize_option = aux.Option({
            'loss': 'val_rmse',
            'max_trials': 10,
            'deployment': True,
            'start_with_default_parameters': True,
            'space': {
                'd': ['randint', ['d', 10, 20]],
                'reg_u': ['uniform', ['reg_u', 0.1, 0.3]],
                'reg_i': ['uniform', ['reg_i', 0.1, 0.3]],
                'alpha': ['randint', ['alpha', 8, 10]]
            }
        })
        opt.optimize = optimize_option
        opt.evaluation_period = 1
        opt.tensorboard = aux.Option({'root': './tb',
                                      'name': 'als'})

        data_opt = MatrixMarketOptions().get_default_option()
        data_opt.input.main = self.ml_100k + 'main'
        data_opt.input.uid = self.ml_100k + 'uid'
        data_opt.input.iid = self.ml_100k + 'iid'
        data_opt.data.value_prepro = aux.Option({'name': 'OneBased'})

        als = ALS(opt, data_opt=data_opt)
        try:
            als.init_factors()
            als.train()
            default_result = als.get_validation_results()
            als.optimize()
            base_loss = default_result['rmse']  # val_rmse
            optimize_loss = als.get_optimization_data()['best']['val_rmse']
            self.assertTrue(base_loss > optimize_loss)

            als.load(model_path)
            loss = als.get_validation_results()
            self.assertAlmostEqual(loss['rmse'], optimize_loss)
        finally:
            # Clean up even when an assertion above fails. The existence
            # check keeps a missing file from masking the real failure.
            if os.path.exists(model_path):
                os.remove(model_path)
Exemple #13
0
    def test02_most_similar(self):
        """Check 4-worker ParALS.most_similar beats the serial loop by over 3x.

        Both paths query every item id with ``topk=10``; only wall-clock
        time is compared, the results themselves are discarded.
        """
        set_log_level(1)
        data_opt = self.get_ml100k_mm_opt()
        model_opt = ALSOption().get_default_option()
        model_opt.d = 20
        model_opt.num_workers = 1
        model = ALS(model_opt, data_opt=data_opt)
        model.initialize()
        model.train()
        model.build_itemid_map()
        parallel = ParALS(model)

        item_keys = model._idmanager.itemids[::]
        began = time.time()
        for key in item_keys:
            model.most_similar(key, topk=10)
        serial_secs = time.time() - began

        parallel.num_workers = 4
        began = time.time()
        parallel.most_similar(item_keys, topk=10, repr=True)
        parallel_secs = time.time() - began

        self.assertTrue(serial_secs > parallel_secs * 3.0)
Exemple #14
0
    def test2_most_similar(self):
        """Train ALS on ml-100k and run the shared most-similar check.

        The three query items passed to ``_test_most_similar`` are the
        Star Wars titles from 1977, 1983 and 1980, in that order.
        """
        set_log_level(2)
        model_opt = ALSOption().get_default_option()

        data_opt = MatrixMarketOptions().get_default_option()
        for field in ('main', 'uid', 'iid'):
            setattr(data_opt.input, field, self.ml_100k + field)

        model = ALS(model_opt, data_opt=data_opt)
        model.initialize()
        model.train()
        queries = (
            '49.Star_Wars_(1977)',
            '180.Return_of_the_Jedi_(1983)',
            '171.Empire_Strikes_Back,_The_(1980)',
        )
        self._test_most_similar(model, *queries)
Exemple #15
0
def example2():
    """Train ALS on ml-20m and dump similar items via ParALS, then via HNSW.

    Writes two TSV files, one line per item (the item followed by the keys
    returned by ``ParALS.most_similar``): first with the plain parallel
    search, then after attaching an n2 HNSW index built from ``als.Q``.
    Elapsed time is printed after each pass.
    """
    num_workers = 4
    batch_size = 128
    par_output = 'als.ml20m.par.top10.tsv'
    ann_output = 'als.ml20m.ann.top10.tsv'
    index_path = 'ml20m.n2.index'

    log.set_log_level(log.INFO)
    als_option = ALSOption().get_default_option()
    data_option = MatrixMarketOptions().get_default_option()
    data_option.input.main = '../tests/ext/ml-20m/main'
    data_option.input.iid = '../tests/ext/ml-20m/iid'
    data_option.data.path = './ml20m.h5py'
    data_option.data.use_cache = True

    als = ALS(als_option, data_opt=data_option)
    als.initialize()
    als.train()
    als.normalize('item')
    als.build_itemid_map()

    par = ParALS(als)
    all_items = als._idmanager.itemids

    def dump_similar_items(out_path):
        # Query in fixed-size batches and write one TSV line per item,
        # then report the wall-clock time of the whole pass.
        start_t = time.time()
        with open(out_path, 'w') as fout:
            for idx in range(0, len(all_items), batch_size):
                batch = all_items[idx:idx + batch_size]
                topks, _ = par.most_similar(batch, repr=True)
                for q, p in zip(batch, topks):
                    fout.write('%s\t%s\n' % (q, '\t'.join(p)))
        print('took: %.3f secs' % (time.time() - start_t))

    print(
        'Make item recommendation on als.ml20m.par.top10.tsv with Paralell(Thread=4)'
    )
    par.num_workers = num_workers
    dump_similar_items(par_output)

    from n2 import HnswIndex
    index = HnswIndex(als.Q.shape[1])
    for f in als.Q:
        index.add_data(f)
    index.build(n_threads=4)
    index.save(index_path)
    index.unload()
    # The message is built from the real output path and worker count; the
    # previous text named the first pass's file and claimed a single thread.
    print(
        'Make item recommendation on %s with Ann(Thread=%d)'
        % (ann_output, num_workers)
    )
    par.set_hnsw_index(index_path, 'item')
    par.num_workers = num_workers
    dump_similar_items(ann_output)
Exemple #16
0
    def test02_optimize(self):
        """Minimise a synthetic loss over the default optimize space.

        The loss falls as ``adaptive_reg``, ``d`` and ``alpha`` grow and as
        ``reg_i`` / ``reg_u`` shrink; the best point found after 600
        evaluations is checked against those directions.
        """
        def objective(opt):
            # Terms are added left to right in one expression.
            return (1.0 - opt['adaptive_reg'] / 1.0
                    + 1.0 / (opt['d'] ** 2 + 1)
                    + 1.0 / opt['alpha']
                    + (opt['reg_i'] / 2.0)
                    + (opt['reg_u'] / 2.0))

        search_option = ALSOption().get_default_optimize_option()
        search_space = Optimizable()._get_space(search_option.space)
        winner = fmin(fn=objective, space=search_space,
                      algo=tpe.suggest, max_evals=600)
        self.assertGreaterEqual(int(winner['d']), 2)  # this is shifted by 10
        self.assertGreaterEqual(int(winner['alpha']), 15)
        for reg_name in ('reg_i', 'reg_u'):
            self.assertLessEqual(winner[reg_name], 0.3)
        self.assertEqual(winner['adaptive_reg'], 1)
Exemple #17
0
    def test2_optimize(self):
        """Minimise a synthetic loss that rewards large parameter values.

        Every term except the ``adaptive_reg`` one is a reciprocal, so the
        best point found after 600 evaluations should sit toward the upper
        end of each range.
        """
        def objective(opt):
            # Terms are added left to right in one expression.
            return (1.0 - opt['adaptive_reg'] / 1.0
                    + 1.0 / opt['d']
                    + 1.0 / opt['alpha']
                    + 1.0 / opt['reg_i']
                    + 1.0 / opt['reg_u'])

        search_option = ALSOption().get_default_optimize_option()
        search_space = Optimizable()._get_space(search_option.space)
        winner = fmin(fn=objective, space=search_space,
                      algo=tpe.suggest, max_evals=600)
        for int_name in ('d', 'alpha'):
            self.assertGreaterEqual(int(winner[int_name]), 16)
        for reg_name in ('reg_i', 'reg_u'):
            self.assertGreaterEqual(winner[reg_name], 0.5)
        self.assertEqual(winner['adaptive_reg'], 1)
Exemple #18
0
 def test6_topk(self):
     """Run the shared top-k check for ALS with ``d=5`` and top-10 validation."""
     als_opt = ALSOption().get_default_option()
     als_opt.d = 5
     topk_validation = aux.Option({'topk': 10})
     als_opt.validation = topk_validation
     self._test6_topk(ALS, als_opt)
Exemple #19
0
 def test0_get_default_option(self):
     """Smoke test: building the default ALS option must not raise."""
     _ = ALSOption().get_default_option()
     # Reaching this line is the actual check.
     self.assertTrue(True)
Exemple #20
0
 def test2_init_with_dict(self):
     """Smoke test: ALS accepts the default option object directly."""
     set_log_level(3)
     default_opt = ALSOption().get_default_option()
     _ = ALS(default_opt)
     # Reaching this line is the actual check.
     self.assertTrue(True)
Exemple #21
0
 def test3_init(self):
     """Run the shared initialisation check for ALS with default options."""
     default_opt = ALSOption().get_default_option()
     self._test3_init(ALS, default_opt)
Exemple #22
0
 def test4_train(self):
     """Run the shared training check for ALS with ``d=20``."""
     train_opt = ALSOption().get_default_option()
     train_opt.d = 20
     self._test4_train(ALS, train_opt)
Exemple #23
0
 def test7_train_ml_20m(self):
     """Run the shared ml-20m training check with 8 workers."""
     big_opt = ALSOption().get_default_option()
     big_opt.num_workers = 8
     topk_validation = aux.Option({'topk': 10})
     big_opt.validation = topk_validation
     self._test7_train_ml_20m(ALS, big_opt)
Exemple #24
0
 def test9_compact_serialization(self):
     """Run the shared compact-serialization check for ALS with ``d=5``."""
     small_opt = ALSOption().get_default_option()
     small_opt.d = 5
     topk_validation = aux.Option({'topk': 10})
     small_opt.validation = topk_validation
     self._test9_compact_serialization(ALS, small_opt)
Exemple #25
0
 def test10_fast_most_similar(self):
     """Run the shared fast most-similar check for ALS with ``d=5``."""
     small_opt = ALSOption().get_default_option()
     small_opt.d = 5
     topk_validation = aux.Option({'topk': 10})
     small_opt.validation = topk_validation
     self._test10_fast_most_similar(ALS, small_opt)