def test1_is_valid_option(self):
    """Validate the default option, a corrupted ``type`` and a restored ``type``.

    The default option must validate, an integer ``type`` must make
    ``is_valid_option`` raise ``RuntimeError``, and restoring
    ``'matrix_market'`` must validate again.
    """
    option = MatrixMarketOptions().get_default_option()
    self.assertTrue(MatrixMarketOptions().is_valid_option(option))

    option['type'] = 1
    # Bind the validator first so only the call itself is guarded.
    validate = MatrixMarketOptions().is_valid_option
    with self.assertRaises(RuntimeError):
        validate(option)

    option['type'] = 'matrix_market'
    self.assertTrue(MatrixMarketOptions().is_valid_option(option))
def test2_create(self):
    """Create a MatrixMarket database from the fixture files and inspect it.

    Checks the top-level keys of the handle, the header counts and the
    triples yielded by row-wise and column-wise iteration.
    """
    set_log_level(3)
    option = MatrixMarketOptions().get_default_option()
    option.input.main = self.mm_path
    option.input.uid = self.uid_path
    option.input.iid = self.iid_path

    market = MatrixMarket(option)
    market.create()
    # Record the database path; presumably cleaned up by the test fixture.
    self.temp_files.append(option.data.path)
    self.assertTrue(True)  # reaching this line means create() did not raise

    handle = market.handle
    expected_groups = ['vali', 'idmap', 'rowwise', 'colwise']
    self.assertEqual(sorted(handle.keys()), sorted(expected_groups))

    header = market.get_header()
    self.assertEqual(header['num_nnz'], 5)
    self.assertEqual(header['num_users'], 5)
    self.assertEqual(header['num_items'], 3)

    rowwise = [(user, key, value) for user, key, value in market.iterate()]
    self.assertEqual(len(rowwise), 5)
    rowwise_keys = [int(key) for _, key, _ in rowwise]
    self.assertEqual(rowwise_keys, [0, 0, 2, 1, 1])
    self.assertEqual(rowwise[2], (2, 2, 1.0))

    colwise = [
        (user, key, value)
        for user, key, value in market.iterate(axis='colwise')
    ]
    colwise_keys = [int(key) for _, key, _ in colwise]
    self.assertEqual(colwise_keys, [0, 1, 3, 4, 2])
def test1_minmax(self):
    """Create a database with the ``MinMaxScalar`` value preprocessor.

    The preprocessor is configured with ``min=3`` and ``max=5.0``; the test
    checks the header counts, the iterated keys and the integer-truncated
    values of the five entries.
    """
    option = MatrixMarketOptions().get_default_option()
    option.input.main = self.mm_path
    option.input.uid = self.uid_path
    option.input.iid = self.iid_path
    option.data.value_prepro = aux.Option({
        'name': 'MinMaxScalar',
        'min': 3,
        'max': 5.0
    })

    market = MatrixMarket(option)
    market.create()
    self.assertTrue(True)  # reaching this line means create() did not raise

    handle = market.handle
    expected_groups = ['vali', 'idmap', 'rowwise', 'colwise']
    self.assertEqual(sorted(handle.keys()), sorted(expected_groups))

    header = market.get_header()
    self.assertEqual(header['num_nnz'], 5)
    self.assertEqual(header['num_users'], 5)
    self.assertEqual(header['num_items'], 3)

    entries = [(user, key, value) for user, key, value in market.iterate()]
    self.assertEqual(len(entries), 5)
    keys = [int(key) for _, key, _ in entries]
    self.assertEqual(keys, [0, 0, 2, 1, 1])
    values = [int(value) for _, _, value in entries]
    self.assertEqual(values, [3, 5, 3, 3, 4])
    self.assertEqual(entries[2], (2, 2, 3.0))
def _test10_fast_most_similar(self, cls, opt):
    """Check that item normalization makes repeated ``most_similar`` faster.

    Trains a model on the ml-100k fixture, times 100 rounds of
    ``most_similar`` over the 100 items most similar to Star Wars, calls
    ``normalize(group='item')``, times the same workload again and asserts
    that the second run took less time.

    Args:
        cls: Model class, instantiated as ``cls(opt, data_opt=data_opt)``.
        opt: Model option forwarded to ``cls``.
    """
    set_log_level(1)
    data_opt = MatrixMarketOptions().get_default_option()
    data_opt.input.main = self.ml_100k + 'main'
    data_opt.input.uid = self.ml_100k + 'uid'
    data_opt.input.iid = self.ml_100k + 'iid'
    data_opt.data.value_prepro = aux.Option({'name': 'OneBased'})

    c = cls(opt, data_opt=data_opt)
    c.initialize()
    c.train()

    keys = [x for x, _ in c.most_similar('49.Star_Wars_(1977)', topk=100)]

    def timed_queries():
        # perf_counter is a monotonic, high-resolution clock; time.time()
        # can jump when the system clock is adjusted.
        start_t = time.perf_counter()
        for _ in range(100):
            for key in keys:
                c.most_similar(key)
        return time.perf_counter() - start_t

    elapsed_a = timed_queries()
    c.normalize(group='item')
    elapsed_b = timed_queries()

    # assertGreater reports both timings on failure.
    self.assertGreater(elapsed_a, elapsed_b)
def _test9_compact_serialization(self, cls, opt):
    """Check that a model saved without the user-id map still serves items.

    Trains on the ml-100k fixture, saves with ``with_userid_map=False``,
    reloads only the ``Q`` and ``_idmanager`` fields into a fresh instance
    and verifies that item similarity still works (before and after item
    normalization) and that the reloaded model has no ``P`` attribute.

    The model file is removed afterwards, even when an assertion fails.

    Args:
        cls: Model class, instantiated as ``cls(opt, data_opt=data_opt)``
            for training and ``cls(opt)`` for loading.
        opt: Model option forwarded to ``cls``.
    """
    import os  # function-scope import keeps this block self-contained

    set_log_level(1)
    data_opt = MatrixMarketOptions().get_default_option()
    data_opt.input.main = self.ml_100k + 'main'
    data_opt.input.uid = self.ml_100k + 'uid'
    data_opt.input.iid = self.ml_100k + 'iid'
    data_opt.data.value_prepro = aux.Option({'name': 'OneBased'})

    c = cls(opt, data_opt=data_opt)
    c.initialize()
    c.train()

    def similar_to_jedi(model):
        # Names of the 100 items most similar to Return of the Jedi.
        return [
            x for x, _ in
            model.most_similar('180.Return_of_the_Jedi_(1983)', topk=100)
        ]

    self.assertIn('49.Star_Wars_(1977)', similar_to_jedi(c))

    model_path = 'model.bin'
    try:
        c.save(model_path, with_userid_map=False)
        c = cls(opt)
        c.load(model_path, data_fields=['Q', '_idmanager'])
        self.assertIn('49.Star_Wars_(1977)', similar_to_jedi(c))
        self.assertFalse(hasattr(c, 'P'))
        c.normalize(group='item')
        self.assertIn('49.Star_Wars_(1977)', similar_to_jedi(c))
    finally:
        # Do not leave the serialized model behind in the working directory.
        if os.path.exists(model_path):
            os.remove(model_path)
def _test8_serialization(self, cls, opt):
    """Check that a save/load round trip keeps item similarity usable.

    Args:
        cls: Model class, instantiated as ``cls(opt, data_opt=data_opt)``.
        opt: Model option forwarded to ``cls``.
    """
    set_log_level(1)
    prefix = self.ml_100k
    data_opt = MatrixMarketOptions().get_default_option()
    data_opt.input.main = prefix + 'main'
    data_opt.input.uid = prefix + 'uid'
    data_opt.input.iid = prefix + 'iid'
    data_opt.data.value_prepro = aux.Option({'name': 'OneBased'})

    model = cls(opt, data_opt=data_opt)
    model.initialize()
    model.train()

    query = '180.Return_of_the_Jedi_(1983)'
    expected = '49.Star_Wars_(1977)'

    before = [name for name, _ in model.most_similar(query, topk=100)]
    self.assertIn(expected, before)

    # Round-trip through disk, then drop the file before querying again.
    model.save('model.bin')
    model.load('model.bin')
    os.remove('model.bin')

    after = [name for name, _ in model.most_similar(query, topk=100)]
    self.assertIn(expected, after)
def test3_sppmi(self):
    """Constructing ``MatrixMarket`` with the ``SPPMI`` preprocessor must fail.

    The test expects a ``RuntimeError`` from the constructor itself.
    """
    option = MatrixMarketOptions().get_default_option()
    option.input.main = self.mm_path
    option.input.uid = self.uid_path
    option.input.iid = self.iid_path
    option.data.value_prepro = aux.Option({'name': 'SPPMI'})

    with self.assertRaises(RuntimeError):
        MatrixMarket(option)
def _test6_topk(self, cls, opt):
    """Check top-k recommendation size and stability of item similarity.

    Trains on the ml-100k fixture, asserts that ``topk_recommendation``
    for user ``'1'`` with ``topk=10`` returns exactly 10 results, and
    verifies that the ten most similar items to Return of the Jedi are the
    same before and after ``normalize()``.

    Args:
        cls: Model class, instantiated as ``cls(opt, data_opt=data_opt)``.
        opt: Model option forwarded to ``cls``.
    """
    set_log_level(2)
    data_opt = MatrixMarketOptions().get_default_option()
    data_opt.input.main = self.ml_100k + 'main'
    data_opt.input.uid = self.ml_100k + 'uid'
    data_opt.input.iid = self.ml_100k + 'iid'
    data_opt.data.value_prepro = aux.Option({'name': 'OneBased'})

    c = cls(opt, data_opt=data_opt)
    c.initialize()
    c.train()

    # The original used assertTrue(len(...), 10), where 10 is only the
    # failure message; assertEqual actually checks the length.
    self.assertEqual(len(c.topk_recommendation('1', 10)), 10)

    ret_a = [
        x for x, _ in c.most_similar('180.Return_of_the_Jedi_(1983)', topk=100)
    ]
    self.assertIn('49.Star_Wars_(1977)', ret_a)

    c.normalize()
    ret_b = [
        x for x, _ in c.most_similar('180.Return_of_the_Jedi_(1983)', topk=100)
    ]
    self.assertIn('49.Star_Wars_(1977)', ret_b)
    self.assertEqual(ret_a[:10], ret_b[:10])
def get_ml100k_mm_opt(self):
    """Return MatrixMarket options pointing at the ml-100k fixture.

    The ``main``, ``uid`` and ``iid`` inputs are built by appending those
    suffixes to ``self.ml_100k``; the database path is ``./ml100k.h5py``.
    """
    options = MatrixMarketOptions().get_default_option()
    prefix = self.ml_100k
    options.input.main = prefix + 'main'
    options.input.uid = prefix + 'uid'
    options.input.iid = prefix + 'iid'
    options.data.path = './ml100k.h5py'
    return options
def test3_id_list(self):
    """Create a database from in-memory inputs instead of file paths.

    ``main`` is a NumPy array, ``uid`` is a list of mixed-type ids and
    ``iid`` is a NumPy array of strings; the test passes when ``create()``
    does not raise.
    """
    option = MatrixMarketOptions().get_default_option()
    option.input.main = np.array([[1, 2], [1, 2], [2, 1]])
    option.input.uid = [1, 2.0, '3']
    option.input.iid = np.array(['1', 'a'])

    market = MatrixMarket(option)
    market.create()
    self.assertTrue(True)  # reaching this line means create() did not raise
def get_database(self, name, **kwargs):
    """Return MatrixMarket data options for the dataset called ``name``.

    Args:
        name: Dataset key. Recognized values are ``'ml20m'``, ``'ml100k'``,
            ``'kakao_reco_730m'`` and ``'kakao_brunch_12m'``. Any other
            value yields the base options with no path or input set.
        **kwargs: Only ``batch_mb`` is read (default 1024).

    Returns:
        The configured data option object.
    """
    from buffalo.data.mm import MatrixMarketOptions

    # (dataset name, main input path, whether ./tmp/ is used as tmp_dir)
    known_datasets = (
        ('ml20m', '../tests/ext/ml-20m/main', False),
        ('ml100k', '../tests/ext/ml-100k/main', False),
        ('kakao_reco_730m', '../tests/ext/kakao-reco-730m/main', True),
        ('kakao_brunch_12m', '../tests/ext/kakao-brunch-12m/main', True),
    )

    data_opt = MatrixMarketOptions().get_default_option()
    data_opt.validation = None
    data_opt.data.use_cache = True
    data_opt.data.batch_mb = kwargs.get('batch_mb', 1024)

    for dataset, main_path, uses_tmp_dir in known_datasets:
        if name == dataset:
            data_opt.data.path = DB[name]
            if uses_tmp_dir:
                data_opt.data.tmp_dir = './tmp/'
            data_opt.input.main = main_path
            break
    return data_opt
def _test4_train(self, cls, opt):
    """Smoke test: initialize and train a model on the ml-100k fixture.

    Args:
        cls: Model class, instantiated as ``cls(opt, data_opt=data_opt)``.
        opt: Model option forwarded to ``cls``.
    """
    set_log_level(3)
    prefix = self.ml_100k
    data_opt = MatrixMarketOptions().get_default_option()
    data_opt.input.main = prefix + 'main'
    data_opt.input.uid = prefix + 'uid'
    data_opt.input.iid = prefix + 'iid'
    data_opt.data.value_prepro = aux.Option({'name': 'OneBased'})

    model = cls(opt, data_opt=data_opt)
    model.initialize()
    model.train()
    self.assertTrue(True)  # reaching this line means training did not raise
def _test3_init(self, cls, opt):
    """Check the shapes of ``P`` and ``Q`` after ``init_factors()``.

    On the ml-100k fixture the test expects ``P`` to be ``(943, 20)`` and
    ``Q`` to be ``(1682, 20)``.

    Args:
        cls: Model class, instantiated as ``cls(opt, data_opt=data_opt)``.
        opt: Model option forwarded to ``cls``.
    """
    set_log_level(3)
    prefix = self.ml_100k
    data_opt = MatrixMarketOptions().get_default_option()
    data_opt.input.main = prefix + 'main'
    data_opt.input.uid = prefix + 'uid'
    data_opt.input.iid = prefix + 'iid'
    data_opt.data.path = './ml100k.h5py'

    model = cls(opt, data_opt=data_opt)
    self.assertTrue(True)  # reaching this line means construction succeeded

    model.init_factors()
    self.assertEqual(model.P.shape, (943, 20))
    self.assertEqual(model.Q.shape, (1682, 20))
def test2_most_similar(self):
    """Train ALS on ml-100k and delegate to ``self._test_most_similar``.

    The three Star Wars titles are passed as the query items.
    """
    set_log_level(2)
    als_opt = ALSOption().get_default_option()

    prefix = self.ml_100k
    data_opt = MatrixMarketOptions().get_default_option()
    data_opt.input.main = prefix + 'main'
    data_opt.input.uid = prefix + 'uid'
    data_opt.input.iid = prefix + 'iid'

    model = ALS(als_opt, data_opt=data_opt)
    model.initialize()
    model.train()

    star_wars = '49.Star_Wars_(1977)'
    return_of_the_jedi = '180.Return_of_the_Jedi_(1983)'
    empire_strikes_back = '171.Empire_Strikes_Back,_The_(1980)'
    self._test_most_similar(
        model, star_wars, return_of_the_jedi, empire_strikes_back)
def _test7_train_ml_20m(self, cls, opt):
    """Smoke test: initialize and train a model on the ml-20m fixture.

    The database is written to ``./ml20m.h5py`` with ``use_cache`` enabled.

    Args:
        cls: Model class, instantiated as ``cls(opt, data_opt=data_opt)``.
        opt: Model option forwarded to ``cls``.
    """
    set_log_level(3)
    prefix = self.ml_20m
    data_opt = MatrixMarketOptions().get_default_option()
    data_opt.input.main = prefix + 'main'
    data_opt.input.uid = prefix + 'uid'
    data_opt.input.iid = prefix + 'iid'
    data_opt.data.path = './ml20m.h5py'
    data_opt.data.use_cache = True

    model = cls(opt, data_opt=data_opt)
    model.initialize()
    model.train()
    self.assertTrue(True)  # reaching this line means training did not raise
def example1():
    """Train ALS on MovieLens 100k, then run hyper-parameter optimization.

    Prints the validation metrics and the items most similar to Star Wars
    for the default model, optimizes for ``val_ndcg``, loads the model
    written to ``./example1.ml100k.als.optimize.bin`` and prints the similar
    items and the best parameters found.
    """
    def print_similar_to_star_wars(model):
        # Ranked list of the items most similar to Star Wars, 1-based.
        print('Similar movies to Star_Wars_(1977)')
        neighbours = model.most_similar('49.Star_Wars_(1977)')
        for position, (movie_name, score) in enumerate(neighbours, start=1):
            print(f'{position:02d}. {score:.3f} {movie_name}')

    log.set_log_level(log.DEBUG)
    als_option = ALSOption().get_default_option()
    als_option.validation = aux.Option({'topk': 10})

    data_option = MatrixMarketOptions().get_default_option()
    data_option.input.main = '../tests/ext/ml-100k/main'
    data_option.input.iid = '../tests/ext/ml-100k/iid'

    als = ALS(als_option, data_opt=data_option)
    als.initialize()
    als.train()

    validation_json = json.dumps(als.get_validation_results(), indent=2)
    print('MovieLens 100k metrics for validations\n%s' % validation_json)
    print_similar_to_star_wars(als)

    print('Run hyper parameter optimization for val_ndcg...')
    als.opt.num_workers = 4
    als.opt.evaluation_period = 10
    search_space = {
        'd': ['randint', ['d', 10, 128]],
        'reg_u': ['uniform', ['reg_u', 0.1, 1.0]],
        'reg_i': ['uniform', ['reg_i', 0.1, 1.0]],
        'alpha': ['randint', ['alpha', 1, 10]],
    }
    als.opt.optimize = aux.Option({
        'loss': 'val_ndcg',
        'max_trials': 100,
        'deployment': True,
        'start_with_default_parameters': True,
        'space': search_space
    })
    log.set_log_level(log.INFO)
    als.opt.model_path = './example1.ml100k.als.optimize.bin'

    # Show the starting parameters before the search runs.
    starting_parameters = {
        'alpha': als.opt.alpha,
        'd': als.opt.d,
        'reg_u': als.opt.reg_u,
        'reg_i': als.opt.reg_i
    }
    print(json.dumps(starting_parameters, indent=2))

    als.optimize()
    als.load('./example1.ml100k.als.optimize.bin')
    print_similar_to_star_wars(als)

    optimization_res = als.get_optimization_data()
    best_parameters = optimization_res['best_parameters']
    print(json.dumps(optimization_res['best'], indent=2))
    tuned_parameters = {
        'alpha': int(best_parameters['alpha']),
        'd': int(best_parameters['d']),
        'reg_u': best_parameters['reg_u'],
        'reg_i': best_parameters['reg_i']
    }
    print(json.dumps(tuned_parameters, indent=2))
def _test5_validation(self, cls, opt, ndcg=0.06, map=0.04):
    """Train on ml-100k and require validation metrics above given floors.

    Args:
        cls: Model class, instantiated as ``cls(opt, data_opt=data_opt)``.
        opt: Model option forwarded to ``cls``.
        ndcg: Value the ``'ndcg'`` validation result must exceed.
        map: Value the ``'map'`` validation result must exceed. The name
            shadows the builtin but is kept for keyword callers.
    """
    set_log_level(2)
    prefix = self.ml_100k
    data_opt = MatrixMarketOptions().get_default_option()
    data_opt.input.main = prefix + 'main'
    data_opt.input.uid = prefix + 'uid'
    data_opt.input.iid = prefix + 'iid'
    data_opt.data.value_prepro = aux.Option({'name': 'OneBased'})

    model = cls(opt, data_opt=data_opt)
    model.initialize()
    model.train()

    metrics = model.get_validation_results()
    ndcg_ok = metrics['ndcg'] > ndcg
    self.assertTrue(ndcg_ok, msg='NDCG Test')
    map_ok = metrics['map'] > map
    self.assertTrue(map_ok, msg='MAP Test')
def example2():
    """Train ALS on MovieLens 20M and dump item-to-item recommendations.

    Two passes are made over all item ids in batches of 128 with
    ``ParALS.most_similar``: one with the default search, written to
    ``als.ml20m.par.top10.tsv``, and one after attaching an n2 HNSW index
    built from ``als.Q``, written to ``als.ml20m.ann.top10.tsv``. The
    elapsed time of each pass is printed.
    """
    num_workers = 4
    batch_size = 128
    par_output = 'als.ml20m.par.top10.tsv'
    ann_output = 'als.ml20m.ann.top10.tsv'
    index_path = 'ml20m.n2.index'

    log.set_log_level(log.INFO)
    als_option = ALSOption().get_default_option()
    data_option = MatrixMarketOptions().get_default_option()
    data_option.input.main = '../tests/ext/ml-20m/main'
    data_option.input.iid = '../tests/ext/ml-20m/iid'
    data_option.data.path = './ml20m.h5py'
    data_option.data.use_cache = True

    als = ALS(als_option, data_opt=data_option)
    als.initialize()
    als.train()
    als.normalize('item')
    als.build_itemid_map()

    par = ParALS(als)
    all_items = als._idmanager.itemids

    def dump_most_similar(out_path):
        # Write one TSV row per item (the item, then its similar items)
        # and report how long the whole pass took.
        start_t = time.perf_counter()
        with open(out_path, 'w') as fout:
            for idx in range(0, len(all_items), batch_size):
                batch = all_items[idx:idx + batch_size]
                topks, _ = par.most_similar(batch, repr=True)
                for q, p in zip(batch, topks):
                    fout.write('%s\t%s\n' % (q, '\t'.join(p)))
        print('took: %.3f secs' % (time.perf_counter() - start_t))

    # Messages are built from the real output path and worker count, so
    # they cannot drift from what the code does.
    print(
        f'Make item recommendation on {par_output} '
        f'with Parallel(Thread={num_workers})'
    )
    par.num_workers = num_workers
    dump_most_similar(par_output)

    from n2 import HnswIndex
    index = HnswIndex(als.Q.shape[1])
    for factor in als.Q:
        index.add_data(factor)
    index.build(n_threads=4)
    index.save(index_path)
    index.unload()

    print(
        f'Make item recommendation on {ann_output} '
        f'with Ann(Thread={num_workers})'
    )
    par.set_hnsw_index(index_path, 'item')
    par.num_workers = num_workers
    dump_most_similar(ann_output)
def test4_optimize(self):
    """Check that hyper-parameter optimization beats the default ALS RMSE.

    Trains with default parameters, runs ``optimize()`` for ``val_rmse``,
    asserts that the best optimized loss is lower than the default one, and
    that the model loaded from ``als.bin`` reports the same RMSE.
    """
    set_log_level(2)
    als_opt = ALSOption().get_default_option()
    als_opt.d = 5
    als_opt.num_workers = 2
    als_opt.model_path = 'als.bin'
    als_opt.validation = aux.Option({'topk': 10})

    search_space = {
        'd': ['randint', ['d', 10, 20]],
        'reg_u': ['uniform', ['reg_u', 0.1, 0.3]],
        'reg_i': ['uniform', ['reg_i', 0.1, 0.3]],
        'alpha': ['randint', ['alpha', 8, 10]]
    }
    als_opt.optimize = aux.Option({
        'loss': 'val_rmse',
        'max_trials': 10,
        'deployment': True,
        'start_with_default_parameters': True,
        'space': search_space
    })
    als_opt.evaluation_period = 1
    als_opt.tensorboard = aux.Option({'root': './tb', 'name': 'als'})

    prefix = self.ml_100k
    data_opt = MatrixMarketOptions().get_default_option()
    data_opt.input.main = prefix + 'main'
    data_opt.input.uid = prefix + 'uid'
    data_opt.input.iid = prefix + 'iid'
    data_opt.data.value_prepro = aux.Option({'name': 'OneBased'})

    model = ALS(als_opt, data_opt=data_opt)
    model.init_factors()
    model.train()
    default_result = model.get_validation_results()

    model.optimize()
    base_loss = default_result['rmse']  # val_rmse
    optimize_loss = model.get_optimization_data()['best']['val_rmse']
    self.assertTrue(base_loss > optimize_loss)

    # The model reloaded from disk must reproduce the optimized loss.
    model.load('als.bin')
    reloaded = model.get_validation_results()
    self.assertAlmostEqual(reloaded['rmse'], optimize_loss)
    os.remove('als.bin')
def test00_tensorboard(self):
    """Train ALS with the tensorboard option set and check metric floors.

    Requires ``ndcg > 0.025`` and ``map > 0.015`` in the validation results.
    """
    set_log_level(2)
    als_opt = ALSOption().get_default_option()
    als_opt.d = 5
    als_opt.validation = aux.Option({'topk': 10})
    als_opt.tensorboard = aux.Option({'root': './tb', 'name': 'als'})

    prefix = self.ml_100k
    data_opt = MatrixMarketOptions().get_default_option()
    data_opt.input.main = prefix + 'main'
    data_opt.input.uid = prefix + 'uid'
    data_opt.input.iid = prefix + 'iid'
    data_opt.data.value_prepro = aux.Option({'name': 'OneBased'})

    model = ALS(als_opt, data_opt=data_opt)
    model.initialize()
    model.train()

    metrics = model.get_validation_results()
    self.assertTrue(metrics['ndcg'] > 0.025)
    self.assertTrue(metrics['map'] > 0.015)
def test0_onebased(self):
    """Create a database with the ``OneBased`` value preprocessor.

    Checks the header counts, the iterated keys, and that every value
    truncates to 1.
    """
    option = MatrixMarketOptions().get_default_option()
    option.input.main = self.mm_path
    option.input.uid = self.uid_path
    option.input.iid = self.iid_path
    option.data.value_prepro = aux.Option({'name': 'OneBased'})

    market = MatrixMarket(option)
    market.create()
    # Record the database path; presumably cleaned up by the test fixture.
    self.temp_files.append(option.data.path)
    self.assertTrue(True)  # reaching this line means create() did not raise

    handle = market.handle
    expected_groups = ['vali', 'idmap', 'rowwise', 'colwise']
    self.assertEqual(sorted(handle.keys()), sorted(expected_groups))

    header = market.get_header()
    self.assertEqual(header['num_nnz'], 5)
    self.assertEqual(header['num_users'], 5)
    self.assertEqual(header['num_items'], 3)

    entries = [(user, key, value) for user, key, value in market.iterate()]
    self.assertEqual(len(entries), 5)
    keys = [int(key) for _, key, _ in entries]
    self.assertEqual(keys, [0, 0, 2, 1, 1])
    values = [int(value) for _, _, value in entries]
    self.assertEqual(values, [1, 1, 1, 1, 1])
    self.assertEqual(entries[2], (2, 2, 1.0))
def test2_implicit_als(self):
    """Create a database with the ``ImplicitALS`` preprocessor (epsilon 0.5).

    Checks the header counts and iterated keys, and expects the value of
    the third entry to equal ``log(1 + 1.0 / 0.5)``.
    """
    option = MatrixMarketOptions().get_default_option()
    option.input.main = self.mm_path
    option.input.uid = self.uid_path
    option.input.iid = self.iid_path
    option.data.value_prepro = aux.Option({
        'name': 'ImplicitALS',
        'epsilon': 0.5
    })

    market = MatrixMarket(option)
    market.create()
    self.assertTrue(True)  # reaching this line means create() did not raise

    handle = market.handle
    expected_groups = ['vali', 'idmap', 'rowwise', 'colwise']
    self.assertEqual(sorted(handle.keys()), sorted(expected_groups))

    header = market.get_header()
    self.assertEqual(header['num_nnz'], 5)
    self.assertEqual(header['num_users'], 5)
    self.assertEqual(header['num_items'], 3)

    entries = [(user, key, value) for user, key, value in market.iterate()]
    self.assertEqual(len(entries), 5)
    keys = [int(key) for _, key, _ in entries]
    self.assertEqual(keys, [0, 0, 2, 1, 1])

    third_value = entries[2][2]
    self.assertAlmostEqual(third_value, math.log(1 + 1.0 / 0.5))
def test2_dense(self):
    """Create a database from ``self.mm_dense`` as the only input.

    The test passes when ``create()`` does not raise.
    """
    option = MatrixMarketOptions().get_default_option()
    option.input.main = self.mm_dense

    market = MatrixMarket(option)
    market.create()
    self.assertTrue(True)  # reaching this line means create() did not raise
def test3_list(self):
    """A plain nested list as ``input.main`` must be rejected.

    Both ``opt.is_valid_option()`` and ``MatrixMarket.create()`` are
    expected to raise ``RuntimeError``.
    """
    option = MatrixMarketOptions().get_default_option()
    option.input.main = [[10, 123], [1, 2]]
    market = MatrixMarket(option)

    # Bind each callable before entering the guard so only the call itself
    # is expected to raise.
    validate = option.is_valid_option
    with self.assertRaises(RuntimeError):
        validate()

    create = market.create
    with self.assertRaises(RuntimeError):
        create()
def test3_id_list_except(self):
    """A ``uid`` list of the wrong length must make ``create()`` fail.

    The test expects ``TypeError`` from ``create()``.
    """
    option = MatrixMarketOptions().get_default_option()
    option.input.main = np.array([[1, 2], [1, 2], [2, 1]])
    option.input.uid = [1, 2.0]  # size should be 3

    market = MatrixMarket(option)
    create = market.create
    with self.assertRaises(TypeError):
        create()
def test0_get_default_option(self):
    """Smoke test: building the default option must not raise."""
    options_factory = MatrixMarketOptions()
    options_factory.get_default_option()
    self.assertTrue(True)  # reaching this line means no exception was raised