def test_create_byte_tensors(self):
        """Check ``ByteTensorizer.numberize`` against UTF-8 byte values.

        Each row is expected to numberize to a ``(byte_values, length)`` pair,
        where ``byte_values`` are the integer values of the UTF-8 encoded text
        and ``length`` is how many of them there are. A non-ASCII sentence is
        included so that multi-byte characters are covered as well.
        """
        tensorizer = ByteTensorizer(text_column="text", lower=False)
        # not initializing because initializing is a no-op for ByteTensorizer

        sentences = ["I want some coffee", "Turn it up", "我不会说中文"]
        rows = [{"text": sentence} for sentence in sentences]
        # Explicit encoding; "utf-8" is also what str.encode() defaults to.
        expected = [list(sentence.encode("utf-8")) for sentence in sentences]

        tensors = [tensorizer.numberize(row) for row in rows]
        # The loop variable is deliberately not called ``bytes`` so the builtin
        # type is not shadowed.
        self.assertEqual(
            [(byte_values, len(byte_values)) for byte_values in expected],
            tensors,
        )
    def test_byte_tensors_error_code(self):
        """Expect ``SystemExit`` with code 1 while numberizing the sample rows.

        The tensorizer is built with both BOS and EOS tokens enabled and fed
        texts containing ``#`` and ``^``.
        NOTE(review): presumably those characters clash with the BOS/EOS
        markers and trigger the exit -- confirm against ByteTensorizer.
        """
        tensorizer = ByteTensorizer(
            text_column="text", lower=False, add_bos_token=True, add_eos_token=True
        )
        texts = (
            "I want some coffee#",
            "This is ^the best show I've ever seen",
        )
        rows = [{"text": text} for text in texts]

        with self.assertRaises(SystemExit) as raised:
            # Numberize row by row; the context manager captures the exit.
            for row in rows:
                tensorizer.numberize(row)

        self.assertEqual(raised.exception.code, 1)
 def test_initialize_tensorizers(self):
     """Initialize three tensorizers on the train data and check vocab sizes.

     After ``initialize_tensorizers`` runs over ``self.data.train`` the token
     vocab is expected to hold 49 entries and the label vocab 7. No vocab
     size is asserted for the byte tensorizer.
     """
     tensorizers = dict(
         tokens=TokenTensorizer(text_column="text"),
         labels=LabelTensorizer(label_column="label"),
         chars=ByteTensorizer(text_column="text"),
     )
     initialize_tensorizers(tensorizers, self.data.train)

     # Checked in the same order as before: tokens first, then labels.
     for name, vocab_size in (("tokens", 49), ("labels", 7)):
         self.assertEqual(vocab_size, len(tensorizers[name].vocab))
Example #4
0
    def test_create_byte_tensors(self):
        """Numberize two ASCII rows and compare against their code points.

        For every row ``numberize`` is expected to return a pair
        ``(chars, seq_len)`` where ``chars`` equals the list of ``ord`` values
        of the sentence and ``seq_len`` equals the sentence length.
        """
        tensorizer = ByteTensorizer(column="text", lower=False)
        # not initializing because initializing is a no-op for ByteTensorizer

        sentences = ("I want some coffee", "Turn it up")
        rows = [{"text": types.Text(sentence)} for sentence in sentences]
        expected = [[ord(char) for char in sentence] for sentence in sentences]

        # Rows are numberized one at a time, each checked before the next.
        for sentence, codes, row in zip(sentences, expected, rows):
            chars, seq_len = tensorizer.numberize(row)
            self.assertEqual(len(sentence), len(chars))
            self.assertEqual(codes, chars)
            self.assertEqual(len(sentence), seq_len)
Example #5
0
    def test_create_byte_tensors(self):
        """Batch two sentences and check the padded tensors that come back.

        ``create_training_tensors`` is expected to return two
        ``torch.LongTensor`` objects: the code points of each sentence,
        right-padded with zeros to the longest sentence in the batch, and
        the unpadded sentence lengths.
        """
        tensorizer = ByteTensorizer(column="text")
        # not initializing because initializing is a no-op for ByteTensorizer

        sentences = ("I want some coffee", "Turn it up")
        longest = max(len(sentence) for sentence in sentences)

        batch = [{"text": types.Text(sentence)} for sentence in sentences]
        # Pad every row with zeros up to the longest sentence in the batch.
        expected = [
            [ord(char) for char in sentence] + [0] * (longest - len(sentence))
            for sentence in sentences
        ]

        byte_ids, lengths = tensorizer.create_training_tensors(batch)
        self.assertIsInstance(byte_ids, torch.LongTensor)
        self.assertIsInstance(lengths, torch.LongTensor)
        self.assertEqual((len(sentences), longest), byte_ids.size())
        self.assertEqual((len(sentences),), lengths.size())
        self.assertEqual(expected, byte_ids.tolist())
        self.assertEqual(
            [len(sentence) for sentence in sentences], lengths.tolist()
        )