예제 #1
0
 def test1_is_valid_option(self):
     opt = MatrixMarketOptions().get_default_option()
     self.assertTrue(MatrixMarketOptions().is_valid_option(opt))
     opt['type'] = 1
     self.assertRaises(RuntimeError,
                       MatrixMarketOptions().is_valid_option, opt)
     opt['type'] = 'matrix_market'
     self.assertTrue(MatrixMarketOptions().is_valid_option(opt))
예제 #2
0
    def test2_create(self):
        set_log_level(3)
        opt = MatrixMarketOptions().get_default_option()
        opt.input.main = self.mm_path
        opt.input.uid = self.uid_path
        opt.input.iid = self.iid_path
        mm = MatrixMarket(opt)
        mm.create()
        self.temp_files.append(opt.data.path)
        self.assertTrue(True)
        db = mm.handle
        self.assertEqual(sorted(db.keys()),
                         sorted(['vali', 'idmap', 'rowwise', 'colwise']))
        header = mm.get_header()
        self.assertEqual(header['num_nnz'], 5)
        self.assertEqual(header['num_users'], 5)
        self.assertEqual(header['num_items'], 3)

        data = [(u, kk, vv) for u, kk, vv in mm.iterate()]
        self.assertEqual(len(data), 5)
        self.assertEqual([int(kk) for _, kk, _ in data], [0, 0, 2, 1, 1])
        self.assertEqual(data[2], (2, 2, 1.0))

        data = [(u, kk, vv) for u, kk, vv in mm.iterate(axis='colwise')]
        self.assertEqual([int(kk) for _, kk, _ in data], [0, 1, 3, 4, 2])
예제 #3
0
    def test1_minmax(self):
        opt = MatrixMarketOptions().get_default_option()
        opt.input.main = self.mm_path
        opt.input.uid = self.uid_path
        opt.input.iid = self.iid_path
        opt.data.value_prepro = aux.Option({
            'name': 'MinMaxScalar',
            'min': 3,
            'max': 5.0
        })
        mm = MatrixMarket(opt)
        mm.create()
        self.assertTrue(True)
        db = mm.handle
        self.assertEqual(sorted(db.keys()),
                         sorted(['vali', 'idmap', 'rowwise', 'colwise']))
        header = mm.get_header()
        self.assertEqual(header['num_nnz'], 5)
        self.assertEqual(header['num_users'], 5)
        self.assertEqual(header['num_items'], 3)

        data = [(u, kk, vv) for u, kk, vv in mm.iterate()]
        self.assertEqual(len(data), 5)
        self.assertEqual([int(kk) for _, kk, _ in data], [0, 0, 2, 1, 1])
        self.assertEqual([int(vv) for _, _, vv in data], [3, 5, 3, 3, 4])
        self.assertEqual(data[2], (2, 2, 3.0))
예제 #4
0
파일: base.py 프로젝트: zzong2006/buffalo
    def _test10_fast_most_similar(self, cls, opt):
        set_log_level(1)

        data_opt = MatrixMarketOptions().get_default_option()
        data_opt.input.main = self.ml_100k + 'main'
        data_opt.input.uid = self.ml_100k + 'uid'
        data_opt.input.iid = self.ml_100k + 'iid'
        data_opt.data.value_prepro = aux.Option({'name': 'OneBased'})

        c = cls(opt, data_opt=data_opt)
        c.initialize()
        c.train()

        keys = [x for x, _ in c.most_similar('49.Star_Wars_(1977)', topk=100)]
        start_t = time.time()
        for i in range(100):
            for key in keys:
                c.most_similar(key)
        elapsed_a = time.time() - start_t

        c.normalize(group='item')
        start_t = time.time()
        for i in range(100):
            for key in keys:
                c.most_similar(key)
        elapsed_b = time.time() - start_t
        self.assertTrue(elapsed_a > elapsed_b)
예제 #5
0
파일: base.py 프로젝트: zzong2006/buffalo
    def _test9_compact_serialization(self, cls, opt):
        set_log_level(1)

        data_opt = MatrixMarketOptions().get_default_option()
        data_opt.input.main = self.ml_100k + 'main'
        data_opt.input.uid = self.ml_100k + 'uid'
        data_opt.input.iid = self.ml_100k + 'iid'
        data_opt.data.value_prepro = aux.Option({'name': 'OneBased'})

        c = cls(opt, data_opt=data_opt)
        c.initialize()
        c.train()
        ret_a = [
            x for x, _ in c.most_similar('180.Return_of_the_Jedi_(1983)',
                                         topk=100)
        ]
        self.assertIn('49.Star_Wars_(1977)', ret_a)
        c.save('model.bin', with_userid_map=False)
        c = cls(opt)
        c.load('model.bin', data_fields=['Q', '_idmanager'])
        ret_a = [
            x for x, _ in c.most_similar('180.Return_of_the_Jedi_(1983)',
                                         topk=100)
        ]
        self.assertIn('49.Star_Wars_(1977)', ret_a)
        self.assertFalse(hasattr(c, 'P'))
        c.normalize(group='item')
        ret_a = [
            x for x, _ in c.most_similar('180.Return_of_the_Jedi_(1983)',
                                         topk=100)
        ]
        self.assertIn('49.Star_Wars_(1977)', ret_a)
예제 #6
0
파일: base.py 프로젝트: zzong2006/buffalo
    def _test8_serialization(self, cls, opt):
        set_log_level(1)

        data_opt = MatrixMarketOptions().get_default_option()
        data_opt.input.main = self.ml_100k + 'main'
        data_opt.input.uid = self.ml_100k + 'uid'
        data_opt.input.iid = self.ml_100k + 'iid'
        data_opt.data.value_prepro = aux.Option({'name': 'OneBased'})

        c = cls(opt, data_opt=data_opt)
        c.initialize()
        c.train()
        ret_a = [
            x for x, _ in c.most_similar('180.Return_of_the_Jedi_(1983)',
                                         topk=100)
        ]
        self.assertIn('49.Star_Wars_(1977)', ret_a)
        c.save('model.bin')
        c.load('model.bin')
        os.remove('model.bin')
        ret_a = [
            x for x, _ in c.most_similar('180.Return_of_the_Jedi_(1983)',
                                         topk=100)
        ]
        self.assertIn('49.Star_Wars_(1977)', ret_a)
예제 #7
0
 def test3_sppmi(self):
     opt = MatrixMarketOptions().get_default_option()
     opt.input.main = self.mm_path
     opt.input.uid = self.uid_path
     opt.input.iid = self.iid_path
     opt.data.value_prepro = aux.Option({'name': 'SPPMI'})
     self.assertRaises(RuntimeError, MatrixMarket, opt)
예제 #8
0
파일: base.py 프로젝트: zzong2006/buffalo
    def _test6_topk(self, cls, opt):
        set_log_level(2)

        data_opt = MatrixMarketOptions().get_default_option()
        data_opt.input.main = self.ml_100k + 'main'
        data_opt.input.uid = self.ml_100k + 'uid'
        data_opt.input.iid = self.ml_100k + 'iid'
        data_opt.data.value_prepro = aux.Option({'name': 'OneBased'})

        c = cls(opt, data_opt=data_opt)
        c.initialize()
        c.train()
        self.assertTrue(len(c.topk_recommendation('1', 10)), 10)
        ret_a = [
            x for x, _ in c.most_similar('180.Return_of_the_Jedi_(1983)',
                                         topk=100)
        ]
        self.assertIn('49.Star_Wars_(1977)', ret_a)
        c.normalize()
        ret_b = [
            x for x, _ in c.most_similar('180.Return_of_the_Jedi_(1983)',
                                         topk=100)
        ]
        self.assertIn('49.Star_Wars_(1977)', ret_b)
        self.assertEqual(ret_a[:10], ret_b[:10])
예제 #9
0
 def get_ml100k_mm_opt(self):
     data_opt = MatrixMarketOptions().get_default_option()
     data_opt.input.main = self.ml_100k + 'main'
     data_opt.input.uid = self.ml_100k + 'uid'
     data_opt.input.iid = self.ml_100k + 'iid'
     data_opt.data.path = './ml100k.h5py'
     return data_opt
예제 #10
0
 def test3_id_list(self):
     opt = MatrixMarketOptions().get_default_option()
     opt.input.main = np.array([[1, 2], [1, 2], [2, 1]])
     opt.input.uid = [1, 2.0, '3']
     opt.input.iid = np.array(['1', 'a'])
     mm = MatrixMarket(opt)
     mm.create()
     self.assertTrue(True)
예제 #11
0
파일: models.py 프로젝트: younghai/buffalo
 def get_database(self, name, **kwargs):
     from buffalo.data.mm import MatrixMarketOptions
     data_opt = MatrixMarketOptions().get_default_option()
     data_opt.validation = None
     data_opt.data.use_cache = True
     data_opt.data.batch_mb = kwargs.get('batch_mb', 1024)
     if name == 'ml20m':
         data_opt.data.path = DB[name]
         data_opt.input.main = '../tests/ext/ml-20m/main'
     elif name =='ml100k':
         data_opt.data.path = DB[name]
         data_opt.input.main = '../tests/ext/ml-100k/main'
     elif name == 'kakao_reco_730m':
         data_opt.data.path = DB[name]
         data_opt.data.tmp_dir = './tmp/'
         data_opt.input.main = '../tests/ext/kakao-reco-730m/main'
     elif name == 'kakao_brunch_12m':
         data_opt.data.path = DB[name]
         data_opt.data.tmp_dir = './tmp/'
         data_opt.input.main = '../tests/ext/kakao-brunch-12m/main'
     return data_opt
예제 #12
0
파일: base.py 프로젝트: zzong2006/buffalo
    def _test4_train(self, cls, opt):
        set_log_level(3)
        data_opt = MatrixMarketOptions().get_default_option()
        data_opt.input.main = self.ml_100k + 'main'
        data_opt.input.uid = self.ml_100k + 'uid'
        data_opt.input.iid = self.ml_100k + 'iid'
        data_opt.data.value_prepro = aux.Option({'name': 'OneBased'})

        c = cls(opt, data_opt=data_opt)
        c.initialize()
        c.train()
        self.assertTrue(True)
예제 #13
0
파일: base.py 프로젝트: zzong2006/buffalo
    def _test3_init(self, cls, opt):
        set_log_level(3)
        data_opt = MatrixMarketOptions().get_default_option()
        data_opt.input.main = self.ml_100k + 'main'
        data_opt.input.uid = self.ml_100k + 'uid'
        data_opt.input.iid = self.ml_100k + 'iid'
        data_opt.data.path = './ml100k.h5py'

        c = cls(opt, data_opt=data_opt)
        self.assertTrue(True)
        c.init_factors()
        self.assertEqual(c.P.shape, (943, 20))
        self.assertEqual(c.Q.shape, (1682, 20))
예제 #14
0
    def test2_most_similar(self):
        set_log_level(2)
        opt = ALSOption().get_default_option()

        data_opt = MatrixMarketOptions().get_default_option()
        data_opt.input.main = self.ml_100k + 'main'
        data_opt.input.uid = self.ml_100k + 'uid'
        data_opt.input.iid = self.ml_100k + 'iid'

        als = ALS(opt, data_opt=data_opt)
        als.initialize()
        als.train()
        q1, q2, q3 = '49.Star_Wars_(1977)', '180.Return_of_the_Jedi_(1983)', '171.Empire_Strikes_Back,_The_(1980)'
        self._test_most_similar(als, q1, q2, q3)
예제 #15
0
파일: base.py 프로젝트: zzong2006/buffalo
    def _test7_train_ml_20m(self, cls, opt):
        set_log_level(3)

        data_opt = MatrixMarketOptions().get_default_option()
        data_opt.input.main = self.ml_20m + 'main'
        data_opt.input.uid = self.ml_20m + 'uid'
        data_opt.input.iid = self.ml_20m + 'iid'
        data_opt.data.path = './ml20m.h5py'
        data_opt.data.use_cache = True

        c = cls(opt, data_opt=data_opt)
        c.initialize()
        c.train()
        self.assertTrue(True)
예제 #16
0
def example1():
    log.set_log_level(log.DEBUG)
    als_option = ALSOption().get_default_option()
    als_option.validation = aux.Option({'topk': 10})
    data_option = MatrixMarketOptions().get_default_option()
    data_option.input.main = '../tests/ext/ml-100k/main'
    data_option.input.iid = '../tests/ext/ml-100k/iid'

    als = ALS(als_option, data_opt=data_option)
    als.initialize()
    als.train()
    print('MovieLens 100k metrics for validations\n%s' % json.dumps(als.get_validation_results(), indent=2))

    print('Similar movies to Star_Wars_(1977)')
    for rank, (movie_name, score) in enumerate(als.most_similar('49.Star_Wars_(1977)')):
        print(f'{rank + 1:02d}. {score:.3f} {movie_name}')

    print('Run hyper parameter optimization for val_ndcg...')
    als.opt.num_workers = 4
    als.opt.evaluation_period = 10
    als.opt.optimize = aux.Option({
        'loss': 'val_ndcg',
        'max_trials': 100,
        'deployment': True,
        'start_with_default_parameters': True,
        'space': {
            'd': ['randint', ['d', 10, 128]],
            'reg_u': ['uniform', ['reg_u', 0.1, 1.0]],
            'reg_i': ['uniform', ['reg_i', 0.1, 1.0]],
            'alpha': ['randint', ['alpha', 1, 10]],
        }
    })
    log.set_log_level(log.INFO)
    als.opt.model_path = './example1.ml100k.als.optimize.bin'
    print(json.dumps({'alpha': als.opt.alpha, 'd': als.opt.d,
                      'reg_u': als.opt.reg_u, 'reg_i': als.opt.reg_i}, indent=2))
    als.optimize()
    als.load('./example1.ml100k.als.optimize.bin')

    print('Similar movies to Star_Wars_(1977)')
    for rank, (movie_name, score) in enumerate(als.most_similar('49.Star_Wars_(1977)')):
        print(f'{rank + 1:02d}. {score:.3f} {movie_name}')

    optimization_res = als.get_optimization_data()
    best_parameters = optimization_res['best_parameters']

    print(json.dumps(optimization_res['best'], indent=2))
    print(json.dumps({'alpha': int(best_parameters['alpha']), 'd': int(best_parameters['d']),
                      'reg_u': best_parameters['reg_u'], 'reg_i': best_parameters['reg_i']}, indent=2))
예제 #17
0
파일: base.py 프로젝트: zzong2006/buffalo
    def _test5_validation(self, cls, opt, ndcg=0.06, map=0.04):
        set_log_level(2)

        data_opt = MatrixMarketOptions().get_default_option()
        data_opt.input.main = self.ml_100k + 'main'
        data_opt.input.uid = self.ml_100k + 'uid'
        data_opt.input.iid = self.ml_100k + 'iid'
        data_opt.data.value_prepro = aux.Option({'name': 'OneBased'})

        c = cls(opt, data_opt=data_opt)
        c.initialize()
        c.train()
        results = c.get_validation_results()
        self.assertTrue(results['ndcg'] > ndcg, msg='NDCG Test')
        self.assertTrue(results['map'] > map, msg='MAP Test')
예제 #18
0
def example2():
    log.set_log_level(log.INFO)
    als_option = ALSOption().get_default_option()
    data_option = MatrixMarketOptions().get_default_option()
    data_option.input.main = '../tests/ext/ml-20m/main'
    data_option.input.iid = '../tests/ext/ml-20m/iid'
    data_option.data.path = './ml20m.h5py'
    data_option.data.use_cache = True

    als = ALS(als_option, data_opt=data_option)
    als.initialize()
    als.train()
    als.normalize('item')
    als.build_itemid_map()

    print(
        'Make item recommendation on als.ml20m.par.top10.tsv with Paralell(Thread=4)'
    )
    par = ParALS(als)
    par.num_workers = 4
    all_items = als._idmanager.itemids
    start_t = time.time()
    with open('als.ml20m.par.top10.tsv', 'w') as fout:
        for idx in range(0, len(all_items), 128):
            topks, _ = par.most_similar(all_items[idx:idx + 128], repr=True)
            for q, p in zip(all_items[idx:idx + 128], topks):
                fout.write('%s\t%s\n' % (q, '\t'.join(p)))
    print('took: %.3f secs' % (time.time() - start_t))

    from n2 import HnswIndex
    index = HnswIndex(als.Q.shape[1])
    for f in als.Q:
        index.add_data(f)
    index.build(n_threads=4)
    index.save('ml20m.n2.index')
    index.unload()
    print(
        'Make item recommendation on als.ml20m.par.top10.tsv with Ann(Thread=1)'
    )
    par.set_hnsw_index('ml20m.n2.index', 'item')
    par.num_workers = 4
    start_t = time.time()
    with open('als.ml20m.ann.top10.tsv', 'w') as fout:
        for idx in range(0, len(all_items), 128):
            topks, _ = par.most_similar(all_items[idx:idx + 128], repr=True)
            for q, p in zip(all_items[idx:idx + 128], topks):
                fout.write('%s\t%s\n' % (q, '\t'.join(p)))
    print('took: %.3f secs' % (time.time() - start_t))
예제 #19
0
    def test4_optimize(self):
        set_log_level(2)
        opt = ALSOption().get_default_option()
        opt.d = 5
        opt.num_workers = 2
        opt.model_path = 'als.bin'
        opt.validation = aux.Option({'topk': 10})
        optimize_option = aux.Option({
            'loss': 'val_rmse',
            'max_trials': 10,
            'deployment': True,
            'start_with_default_parameters': True,
            'space': {
                'd': ['randint', ['d', 10, 20]],
                'reg_u': ['uniform', ['reg_u', 0.1, 0.3]],
                'reg_i': ['uniform', ['reg_i', 0.1, 0.3]],
                'alpha': ['randint', ['alpha', 8, 10]]
            }
        })
        opt.optimize = optimize_option
        opt.evaluation_period = 1
        opt.tensorboard = aux.Option({'root': './tb',
                                      'name': 'als'})

        data_opt = MatrixMarketOptions().get_default_option()
        data_opt.input.main = self.ml_100k + 'main'
        data_opt.input.uid = self.ml_100k + 'uid'
        data_opt.input.iid = self.ml_100k + 'iid'
        data_opt.data.value_prepro = aux.Option({'name': 'OneBased'})

        als = ALS(opt, data_opt=data_opt)
        als.init_factors()
        als.train()
        default_result = als.get_validation_results()
        als.optimize()
        base_loss = default_result['rmse']  # val_rmse
        optimize_loss = als.get_optimization_data()['best']['val_rmse']
        self.assertTrue(base_loss > optimize_loss)

        als.load('als.bin')
        loss = als.get_validation_results()
        self.assertAlmostEqual(loss['rmse'], optimize_loss)
        os.remove('als.bin')
예제 #20
0
    def test00_tensorboard(self):
        set_log_level(2)
        opt = ALSOption().get_default_option()
        opt.d = 5
        opt.validation = aux.Option({'topk': 10})
        opt.tensorboard = aux.Option({'root': './tb', 'name': 'als'})

        data_opt = MatrixMarketOptions().get_default_option()
        data_opt.input.main = self.ml_100k + 'main'
        data_opt.input.uid = self.ml_100k + 'uid'
        data_opt.input.iid = self.ml_100k + 'iid'
        data_opt.data.value_prepro = aux.Option({'name': 'OneBased'})

        als = ALS(opt, data_opt=data_opt)
        als.initialize()
        als.train()
        results = als.get_validation_results()
        self.assertTrue(results['ndcg'] > 0.025)
        self.assertTrue(results['map'] > 0.015)
예제 #21
0
    def test0_onebased(self):
        opt = MatrixMarketOptions().get_default_option()
        opt.input.main = self.mm_path
        opt.input.uid = self.uid_path
        opt.input.iid = self.iid_path
        opt.data.value_prepro = aux.Option({'name': 'OneBased'})
        mm = MatrixMarket(opt)
        mm.create()
        self.temp_files.append(opt.data.path)
        self.assertTrue(True)
        db = mm.handle
        self.assertEqual(sorted(db.keys()),
                         sorted(['vali', 'idmap', 'rowwise', 'colwise']))
        header = mm.get_header()
        self.assertEqual(header['num_nnz'], 5)
        self.assertEqual(header['num_users'], 5)
        self.assertEqual(header['num_items'], 3)

        data = [(u, kk, vv) for u, kk, vv in mm.iterate()]
        self.assertEqual(len(data), 5)
        self.assertEqual([int(kk) for _, kk, _ in data], [0, 0, 2, 1, 1])
        self.assertEqual([int(vv) for _, _, vv in data], [1, 1, 1, 1, 1])
        self.assertEqual(data[2], (2, 2, 1.0))
예제 #22
0
    def test2_implicit_als(self):
        opt = MatrixMarketOptions().get_default_option()
        opt.input.main = self.mm_path
        opt.input.uid = self.uid_path
        opt.input.iid = self.iid_path
        opt.data.value_prepro = aux.Option({
            'name': 'ImplicitALS',
            'epsilon': 0.5
        })
        mm = MatrixMarket(opt)
        mm.create()
        self.assertTrue(True)
        db = mm.handle
        self.assertEqual(sorted(db.keys()),
                         sorted(['vali', 'idmap', 'rowwise', 'colwise']))
        header = mm.get_header()
        self.assertEqual(header['num_nnz'], 5)
        self.assertEqual(header['num_users'], 5)
        self.assertEqual(header['num_items'], 3)

        data = [(u, kk, vv) for u, kk, vv in mm.iterate()]
        self.assertEqual(len(data), 5)
        self.assertEqual([int(kk) for _, kk, _ in data], [0, 0, 2, 1, 1])
        self.assertAlmostEqual(data[2][2], math.log(1 + 1.0 / 0.5))
예제 #23
0
 def test2_dense(self):
     opt = MatrixMarketOptions().get_default_option()
     opt.input.main = self.mm_dense
     mm = MatrixMarket(opt)
     mm.create()
     self.assertTrue(True)
예제 #24
0
 def test3_list(self):
     opt = MatrixMarketOptions().get_default_option()
     opt.input.main = [[10, 123], [1, 2]]
     mm = MatrixMarket(opt)
     self.assertRaises(RuntimeError, opt.is_valid_option)
     self.assertRaises(RuntimeError, mm.create)
예제 #25
0
 def test3_id_list_except(self):
     opt = MatrixMarketOptions().get_default_option()
     opt.input.main = np.array([[1, 2], [1, 2], [2, 1]])
     opt.input.uid = [1, 2.0]  # size should be 3
     mm = MatrixMarket(opt)
     self.assertRaises(TypeError, mm.create)
예제 #26
0
 def test0_get_default_option(self):
     MatrixMarketOptions().get_default_option()
     self.assertTrue(True)