def test_create_byte_tensors(self):
    """numberize() returns (utf8_byte_values, byte_count) per row, including multibyte text."""
    tensorizer = ByteTensorizer(text_column="text", lower=False)
    # not initializing because initializing is a no-op for ByteTensorizer
    s1 = "I want some coffee"
    s2 = "Turn it up"
    s3 = "我不会说中文"  # multibyte UTF-8: byte length > character length
    rows = [{"text": s1}, {"text": s2}, {"text": s3}]
    expected = [list(s1.encode()), list(s2.encode()), list(s3.encode())]
    tensors = [tensorizer.numberize(row) for row in rows]
    # Loop variable renamed from `bytes` — the original shadowed the builtin.
    self.assertEqual([(byte_vals, len(byte_vals)) for byte_vals in expected], tensors)
def test_byte_tensors_error_code(self):
    """With BOS/EOS tokens enabled, numberizing these rows exits with code 1."""
    tensorizer = ByteTensorizer(
        text_column="text", lower=False, add_bos_token=True, add_eos_token=True
    )
    samples = ["I want some coffee#", "This is ^the best show I've ever seen"]
    rows = [{"text": sample} for sample in samples]
    with self.assertRaises(SystemExit) as ctx:
        for row in rows:
            tensorizer.numberize(row)
    self.assertEqual(ctx.exception.code, 1)
def test_initialize_tensorizers(self):
    """Initializing over the training data builds vocabs: 49 tokens, 7 labels."""
    token_tensorizer = TokenTensorizer(text_column="text")
    label_tensorizer = LabelTensorizer(label_column="label")
    tensorizers = {
        "tokens": token_tensorizer,
        "labels": label_tensorizer,
        "chars": ByteTensorizer(text_column="text"),
    }
    initialize_tensorizers(tensorizers, self.data.train)
    self.assertEqual(49, len(token_tensorizer.vocab))
    self.assertEqual(7, len(label_tensorizer.vocab))
def test_create_byte_tensors(self):
    """numberize() on a Text column yields (char ordinals, sequence length) per row."""
    tensorizer = ByteTensorizer(column="text", lower=False)
    # not initializing because initializing is a no-op for ByteTensorizer
    sentences = ["I want some coffee", "Turn it up"]
    rows = [{"text": types.Text(sentence)} for sentence in sentences]
    for sentence, row in zip(sentences, rows):
        chars, seq_len = tensorizer.numberize(row)
        self.assertEqual(len(sentence), len(chars))
        self.assertEqual([ord(c) for c in sentence], chars)
        self.assertEqual(len(sentence), seq_len)
def test_create_byte_tensors(self):
    """create_training_tensors pads the shorter row with 0 and returns LongTensors."""
    tensorizer = ByteTensorizer(column="text")
    # not initializing because initializing is a no-op for ByteTensorizer
    s1 = "I want some coffee"
    s2 = "Turn it up"
    batch = [{"text": types.Text(s1)}, {"text": types.Text(s2)}]
    max_len = max(len(s1), len(s2))
    # Each row's char codes, right-padded with zeros up to the batch max length.
    expected = [
        [ord(c) for c in s1] + [0] * (max_len - len(s1)),
        [ord(c) for c in s2] + [0] * (max_len - len(s2)),
    ]
    chars, seq_lens = tensorizer.create_training_tensors(batch)
    self.assertIsInstance(chars, torch.LongTensor)
    self.assertIsInstance(seq_lens, torch.LongTensor)
    self.assertEqual((2, max_len), chars.size())
    self.assertEqual((2, ), seq_lens.size())
    self.assertEqual(expected, chars.tolist())
    self.assertEqual([len(s1), len(s2)], seq_lens.tolist())