def test_byte_tensors_error_code(self):
        """Numberizing the sample rows must end in ``SystemExit`` with code 1.

        The tensorizer is built with both BOS and EOS tokens enabled and is
        fed two texts, one ending in ``#`` and one containing ``^``.
        """
        # NOTE(review): presumably '#' and '^' clash with the tensorizer's
        # BOS/EOS marker characters -- confirm against ByteTensorizer.
        byte_tensorizer = ByteTensorizer(
            text_column="text",
            lower=False,
            add_bos_token=True,
            add_eos_token=True,
        )
        samples = (
            "I want some coffee#",
            "This is ^the best show I've ever seen",
        )

        # The ``with`` body is abandoned as soon as SystemExit is raised, so
        # any sample after the first one that raises is never numberized.
        with self.assertRaises(SystemExit) as raised:
            for sample in samples:
                byte_tensorizer.numberize({"text": sample})

        self.assertEqual(raised.exception.code, 1)
    def test_create_byte_tensors(self):
        """``numberize`` must yield ``(byte values, byte count)`` for each row.

        The expected byte values come from ``str.encode()`` (UTF-8 by
        default); the third sample is non-ASCII, so its byte count differs
        from its character count.
        """
        # Original author's note: initialize() is skipped because it is a
        # no-op for ByteTensorizer.
        byte_tensorizer = ByteTensorizer(text_column="text", lower=False)

        texts = ["I want some coffee", "Turn it up", "我不会说中文"]

        # Build the expected (bytes, length) pairs before touching the
        # tensorizer.
        expected_pairs = []
        for text in texts:
            encoded = list(text.encode())
            expected_pairs.append((encoded, len(encoded)))

        actual_pairs = [byte_tensorizer.numberize({"text": text}) for text in texts]
        self.assertEqual(expected_pairs, actual_pairs)
# Example #3
# 0
    def test_create_byte_tensors(self):
        """``numberize`` must return each text's code points and its length.

        Every row wraps its text in ``types.Text``. For each row the test
        unpacks the result into ``(chars, seq_len)`` and checks the sequence
        content, its length, and the reported ``seq_len``.
        """
        # Original author's note: initialize() is skipped because it is a
        # no-op for ByteTensorizer.
        byte_tensorizer = ByteTensorizer(column="text", lower=False)

        texts = ("I want some coffee", "Turn it up")
        rows = [{"text": types.Text(text)} for text in texts]

        # Rows are numberized one at a time, so a failing assertion on an
        # earlier row stops the test before later rows are processed.
        for text, row in zip(texts, rows):
            code_points = [ord(ch) for ch in text]
            chars, seq_len = byte_tensorizer.numberize(row)
            self.assertEqual(len(text), len(chars))
            self.assertEqual(code_points, chars)
            self.assertEqual(len(text), seq_len)