def test1_is_valid_option(self):
    """Validate the default option, a broken ``type`` and a restored ``type``.

    The default option must pass ``is_valid_option``; setting ``type`` to
    ``1`` must make validation raise ``RuntimeError``; setting it to
    ``'stream'`` must make the option valid again.
    """
    option = StreamOptions().get_default_option()
    self.assertTrue(StreamOptions().is_valid_option(option))

    option['type'] = 1
    # Resolve the bound method first so only the validation call itself is
    # executed under the assertRaises guard.
    validate = StreamOptions().is_valid_option
    with self.assertRaises(RuntimeError):
        validate(option)

    option['type'] = 'stream'
    self.assertTrue(StreamOptions().is_valid_option(option))
def test3_to_matrix(self):
    """Create a Stream with ``internal_data_type='matrix'`` and inspect it.

    Checks the group names exposed by the handle, the header counters, and
    the triples yielded by row-wise and column-wise iteration.
    """
    opt = StreamOptions().get_default_option()
    opt.input.main = self.main_path
    opt.input.uid = self.uid_path
    opt.data.internal_data_type = 'matrix'
    stream = Stream(opt)
    stream.create()
    self.assertTrue(True)

    handle = stream.handle
    expected_groups = ['idmap', 'rowwise', 'colwise', 'vali']
    if opt.data.sppmi:
        # An extra 'sppmi' group is expected when the sppmi option is set.
        expected_groups.append('sppmi')
    self.assertEqual(sorted(handle.keys()), sorted(expected_groups))

    header = stream.get_header()
    # num_nnz is 7 due to validation samples
    expected_header = (('num_nnz', 7), ('num_users', 3), ('num_items', 6))
    for field, expected in expected_header:
        self.assertEqual(header[field], expected)

    row_ids = [row for row, _key, _val in stream.iterate()]
    self.assertEqual(len(row_ids), 7)
    self.assertEqual(row_ids, [0, 0, 0, 0, 1, 2, 2])

    # Drain the plain column-wise iterator; its output is not inspected,
    # but every yielded record must still unpack into three fields.
    for _col, _key, _val in stream.iterate(axis='colwise'):
        pass

    named = sorted(
        (col, key, val)
        for col, key, val in stream.iterate(axis='colwise',
                                            use_repr_name=True))
    self.assertEqual(
        [col for col, _key, _val in named],
        ['apple', 'coke', 'juice', 'juice', 'mango', 'pie', 'pie'])
def test2_create(self):
    """Create a Stream from the main/uid fixtures and verify its contents.

    Registers the generated data path in ``self.temp_files``, then checks
    the handle's group names, the header counters and the item names
    yielded by iteration with ``use_repr_name=True``.
    """
    opt = StreamOptions().get_default_option()
    opt.input.main = self.main_path
    opt.input.uid = self.uid_path
    stream = Stream(opt)
    stream.create()
    self.temp_files.append(opt.data.path)
    self.assertTrue(True)

    handle = stream.handle
    expected_groups = ['idmap', 'rowwise', 'colwise', 'vali']
    if opt.data.sppmi:
        # An extra 'sppmi' group is expected when the sppmi option is set.
        expected_groups.append('sppmi')
    self.assertEqual(sorted(handle.keys()), sorted(expected_groups))

    header = stream.get_header()
    # num_nnz is 9 due to validation samples
    expected_header = (('num_nnz', 9), ('num_users', 3), ('num_items', 6))
    for field, expected in expected_header:
        self.assertEqual(header[field], expected)

    item_names = [name for _user, name in stream.iterate(use_repr_name=True)]
    self.assertEqual(len(item_names), 9)
    self.assertEqual(item_names, [
        'apple', 'mango', 'mango', 'apple', 'pie', 'juice', 'pie', 'juice',
        'coke'
    ])
def test10_fast_most_similar(self):
    """Check that ``most_similar`` gets faster after item normalization.

    Trains a CFR model on the dataset under ``self.ml_100k``, takes the ten
    neighbours of '49.Star_Wars_(1977)' as query keys, and times 100 rounds
    of ``most_similar`` over those keys before and after calling
    ``normalize(group='item')``. The first timing must be the larger one.
    """
    set_log_level(1)
    opt = CFROption().get_default_option()
    data_opt = StreamOptions().get_default_option()
    data_opt.data.sppmi = {"windows": 5, "k": 10}
    data_opt.data.internal_data_type = "matrix"
    data_opt.input.main = self.ml_100k + 'stream'
    data_opt.input.uid = self.ml_100k + 'uid'
    data_opt.input.iid = self.ml_100k + 'iid'
    data_opt.data.value_prepro = aux.Option({'name': 'OneBased'})

    c = CFR(opt, data_opt=data_opt)
    c.initialize()
    c.train()

    keys = [x for x, _ in c.most_similar('49.Star_Wars_(1977)', 10)]

    def time_queries():
        """Return the seconds taken by 100 rounds of queries over ``keys``."""
        # perf_counter is the clock intended for measuring short durations;
        # time.time() can jump when the system clock is adjusted.
        started = time.perf_counter()
        for _ in range(100):
            for key in keys:
                c.most_similar(key)
        return time.perf_counter() - started

    elapsed_a = time_queries()
    c.normalize(group='item')
    elapsed_b = time_queries()

    # assertGreater reports both timings when the expectation fails.
    self.assertGreater(elapsed_a, elapsed_b)
def test9_compact_serialization(self):
    """Check that a model saved without the user-id map still serves queries.

    Trains a CFR model, saves it with ``with_userid_map=False``, loads only
    the ``'I'`` and ``'_idmanager'`` fields into a fresh instance, and
    verifies that ``most_similar`` still returns the expected neighbour,
    that the loaded model has no ``U`` attribute, and that item
    normalization keeps the neighbour in the result.

    The temporary ``model.bin`` file is always removed, even when an
    assertion or the load fails.
    """
    # Local import so this block does not depend on the file-level imports.
    import os

    set_log_level(1)
    opt = CFROption().get_default_option()
    data_opt = StreamOptions().get_default_option()
    data_opt.data.sppmi = {"windows": 5, "k": 10}
    data_opt.data.internal_data_type = "matrix"
    data_opt.input.main = self.ml_100k + 'stream'
    data_opt.input.uid = self.ml_100k + 'uid'
    data_opt.input.iid = self.ml_100k + 'iid'
    data_opt.data.value_prepro = aux.Option({'name': 'OneBased'})

    c = CFR(opt, data_opt=data_opt)
    c.initialize()
    c.train()
    ret_a = [x for x, _ in c.most_similar('49.Star_Wars_(1977)')]
    self.assertIn('180.Return_of_the_Jedi_(1983)', ret_a)

    model_path = 'model.bin'
    try:
        c.save(model_path, with_userid_map=False)

        c = CFR(opt)
        c.load(model_path, data_fields=['I', '_idmanager'])
        ret_a = [x for x, _ in c.most_similar('49.Star_Wars_(1977)')]
        self.assertIn('180.Return_of_the_Jedi_(1983)', ret_a)
        self.assertFalse(hasattr(c, 'U'))

        c.normalize(group='item')
        ret_a = [x for x, _ in c.most_similar('49.Star_Wars_(1977)')]
        self.assertIn('180.Return_of_the_Jedi_(1983)', ret_a)
    finally:
        # Do not leave the serialized model behind for later tests.
        if os.path.isfile(model_path):
            os.remove(model_path)
def test4_train(self):
    """Smoke test: a CFR model initializes and trains without raising.

    Uses the stream/uid/iid files under ``self.ml_100k`` with the matrix
    internal data type, sppmi settings and 'OneBased' value preprocessing.
    """
    set_log_level(3)
    model_opt = CFROption().get_default_option()

    stream_opt = StreamOptions().get_default_option()
    stream_opt.data.sppmi = {"windows": 5, "k": 10}
    stream_opt.data.internal_data_type = "matrix"
    dataset_prefix = self.ml_100k
    stream_opt.input.main = dataset_prefix + 'stream'
    stream_opt.input.uid = dataset_prefix + 'uid'
    stream_opt.input.iid = dataset_prefix + 'iid'
    stream_opt.data.value_prepro = aux.Option({'name': 'OneBased'})

    model = CFR(model_opt, data_opt=stream_opt)
    model.initialize()
    model.train()
    # Reaching this point without an exception is the whole check.
    self.assertTrue(True)
def test3_init(self):
    """Check the factor matrix shapes after initializing CFR with ``d = 20``.

    After ``initialize()`` the model's ``U`` must have shape (943, 20) and
    ``I`` must have shape (1682, 20).
    """
    set_log_level(3)
    model_opt = CFROption().get_default_option()
    model_opt.d = 20

    stream_opt = StreamOptions().get_default_option()
    stream_opt.data.sppmi = {"windows": 5, "k": 10}
    stream_opt.data.internal_data_type = "matrix"
    dataset_prefix = self.ml_100k
    stream_opt.input.main = dataset_prefix + 'stream'
    stream_opt.input.uid = dataset_prefix + 'uid'
    stream_opt.input.iid = dataset_prefix + 'iid'
    stream_opt.data.path = './ml100k.h5py'

    model = CFR(model_opt, data_opt=stream_opt)
    # Construction succeeding is itself part of what is being tested.
    self.assertTrue(True)

    model.initialize()
    expected_shapes = ((model.U, (943, 20)), (model.I, (1682, 20)))
    for factors, shape in expected_shapes:
        self.assertEqual(factors.shape, shape)
def test5_validation(self, ndcg=0.06, map=0.04):
    """Train CFR with validation enabled and check the reported metrics.

    Args:
        ndcg: Lower bound (exclusive) that the 'ndcg' result must exceed.
        map: Lower bound (exclusive) that the 'map' result must exceed.
            The name shadows the builtin but is kept because it is part of
            this method's signature.
    """
    set_log_level(3)
    opt = CFROption().get_default_option()
    opt.validation = aux.Option({'topk': 10})
    opt.tensorboard = aux.Option({'root': './tb', 'name': 'cfr'})

    data_opt = StreamOptions().get_default_option()
    data_opt.data.validation.name = "sample"
    data_opt.data.sppmi = {"windows": 5, "k": 10}
    data_opt.data.internal_data_type = "matrix"
    data_opt.input.main = self.ml_100k + 'stream'
    data_opt.input.uid = self.ml_100k + 'uid'
    data_opt.input.iid = self.ml_100k + 'iid'
    data_opt.data.value_prepro = aux.Option({'name': 'OneBased'})

    c = CFR(opt, data_opt=data_opt)
    c.initialize()
    c.train()

    results = c.get_validation_results()
    # assertGreater prints the measured metric and the threshold on failure,
    # which assertTrue(a > b) does not.
    self.assertGreater(results['ndcg'], ndcg)
    self.assertGreater(results['map'], map)
def test8_serialization(self):
    """Check that a CFR model answers queries the same way after save/load.

    Trains a model, verifies that '180.Return_of_the_Jedi_(1983)' is among
    the neighbours of '49.Star_Wars_(1977)', saves to ``model.bin``, loads
    it back into the same instance and repeats the check. The file is
    removed even if loading fails.
    """
    set_log_level(1)
    opt = CFROption().get_default_option()
    data_opt = StreamOptions().get_default_option()
    data_opt.data.sppmi = {"windows": 5, "k": 10}
    data_opt.data.internal_data_type = "matrix"
    data_opt.input.main = self.ml_100k + 'stream'
    data_opt.input.uid = self.ml_100k + 'uid'
    data_opt.input.iid = self.ml_100k + 'iid'
    data_opt.data.value_prepro = aux.Option({'name': 'OneBased'})

    c = CFR(opt, data_opt=data_opt)
    c.initialize()
    c.train()
    ret_a = [x for x, _ in c.most_similar('49.Star_Wars_(1977)')]
    self.assertIn('180.Return_of_the_Jedi_(1983)', ret_a)

    c.save('model.bin')
    try:
        c.load('model.bin')
    finally:
        # Clean up the serialized model whether or not loading succeeded.
        os.remove('model.bin')

    ret_a = [x for x, _ in c.most_similar('49.Star_Wars_(1977)')]
    self.assertIn('180.Return_of_the_Jedi_(1983)', ret_a)
def load_text8_model(self):
    """Return a W2V model for text8, reusing 'text8.w2v.bin' when present.

    If the model file exists it is loaded into a fresh ``W2V`` instance.
    Otherwise a new model is configured, trained on the data under
    ``self.text8`` and saved before being returned.
    """
    model_file = 'text8.w2v.bin'
    if os.path.isfile(model_file):
        cached_model = W2V()
        cached_model.load(model_file)
        return cached_model

    set_log_level(3)
    model_opt = W2VOption().get_default_option()
    model_opt.num_workers = 12
    model_opt.d = 40
    model_opt.min_count = 4
    model_opt.num_iters = 10
    model_opt.model_path = model_file

    stream_opt = StreamOptions().get_default_option()
    stream_opt.input.main = self.text8 + 'main'
    stream_opt.data.path = './text8.h5py'
    stream_opt.data.use_cache = True
    stream_opt.data.validation = {}

    model = W2V(model_opt, data_opt=stream_opt)
    model.initialize()
    model.train()
    # save() is called without a path; presumably it writes to
    # opt.model_path -- TODO confirm against W2V.save.
    model.save()
    return model
def test6_topk(self):
    """Check top-k recommendation size and ``most_similar`` stability.

    After training, ``topk_recommendation('1', 10)`` must return exactly 10
    entries, '180.Return_of_the_Jedi_(1983)' must be among the neighbours of
    '49.Star_Wars_(1977)', and calling ``normalize()`` must not change the
    ``most_similar`` result.
    """
    set_log_level(1)
    opt = CFROption().get_default_option()
    opt.validation = aux.Option({'topk': 10})

    data_opt = StreamOptions().get_default_option()
    data_opt.data.validation.name = "sample"
    data_opt.data.sppmi = {"windows": 5, "k": 10}
    data_opt.data.internal_data_type = "matrix"
    data_opt.input.main = self.ml_100k + 'stream'
    data_opt.input.uid = self.ml_100k + 'uid'
    data_opt.input.iid = self.ml_100k + 'iid'
    data_opt.data.value_prepro = aux.Option({'name': 'OneBased'})

    c = CFR(opt, data_opt=data_opt)
    c.initialize()
    c.train()

    # assertTrue(len(...), 10) treated 10 as the failure message and passed
    # for any non-empty result; compare the length explicitly instead.
    self.assertEqual(len(c.topk_recommendation('1', 10)), 10)

    ret_a = [x for x, _ in c.most_similar('49.Star_Wars_(1977)')]
    self.assertIn('180.Return_of_the_Jedi_(1983)', ret_a)

    c.normalize()
    ret_b = [x for x, _ in c.most_similar('49.Star_Wars_(1977)')]
    self.assertIn('180.Return_of_the_Jedi_(1983)', ret_b)
    self.assertEqual(ret_a, ret_b)
def test0_get_default_option(self):
    """Smoke test: building the default stream option must not raise."""
    options_factory = StreamOptions()
    # The returned option is not inspected; only the call is being tested.
    options_factory.get_default_option()
    self.assertTrue(True)
def test5_text8_accuracy(self):
    """Measure top-1 analogy accuracy on the 'capital-common-countries' set.

    A W2V model is loaded from 'text8.accuracy.w2v.bin' when that file
    exists, otherwise it is trained on the data under ``self.text8``. Each
    question line ``a b c answer`` is answered with
    ``most_similar(b + c - a)``; the first neighbour that is not one of the
    query words counts as a hit when it equals ``answer`` and as a miss
    otherwise. Questions containing a word without a feature are skipped.
    The resulting accuracy must exceed 0.7.
    """
    set_log_level(2)
    opt = W2VOption().get_default_option()
    opt.num_workers = 12
    opt.d = 200
    opt.num_iters = 15
    opt.min_count = 4

    data_opt = StreamOptions().get_default_option()
    data_opt.input.main = self.text8 + 'main'
    data_opt.data.path = './text8.h5py'
    data_opt.data.use_cache = True
    data_opt.data.validation = {}

    model_path = 'text8.accuracy.w2v.bin'
    w = W2V(opt, data_opt=data_opt)
    if os.path.isfile(model_path):
        w.load(model_path)
    else:
        w.initialize()
        w.train()
        # NOTE(review): the original indentation was lost; the id map is
        # assumed to be rebuilt only for a freshly trained model -- confirm.
        w.build_itemid_map()

    with open('./ext/text8/questions-words.txt') as fin:
        questions = fin.read().strip().split('\n')

    met = {}
    target_class = ['capital-common-countries']
    class_name = None
    for line in questions:
        if not line:
            continue

        if line.startswith(':'):
            # Section header of the form ": <class name>".
            _, class_name = line.split(' ', 1)
            if class_name in target_class and class_name not in met:
                met[class_name] = {'hit': 0, 'miss': 0, 'total': 0}
            continue

        if class_name not in target_class:
            continue

        a, b, c, answer = line.lower().strip().split()
        # Skip questions containing out-of-vocabulary words.
        if any(w.get_feature(t) is None for t in (a, b, c, answer)):
            continue

        topk = w.most_similar(w.get_weighted_feature({b: 1, c: 1, a: -1}))
        for nn, _ in topk:
            if nn in (a, b, c):
                continue
            if nn == answer:
                met[class_name]['hit'] += 1
            else:
                met[class_name]['miss'] += 1
            break  # top-1
        met[class_name]['total'] += 1

    stat = met.get('capital-common-countries', {'hit': 0, 'total': 0})
    # Fail with a clear message instead of a bare KeyError or
    # ZeroDivisionError when no question could be evaluated.
    self.assertGreater(stat['total'], 0,
                       'no analogy questions were evaluated')
    acc = float(stat['hit']) / stat['total']
    print('Top1-Accuracy={:0.3f}'.format(acc))
    self.assertGreater(acc, 0.7)