def test_invalid_encoding_causes_errors(self):
    """Checks that unknown encoding names are rejected.

    A bad `input_encoding` is expected to surface as an op error when the op
    is evaluated. A bad `output_encoding` is expected to raise `ValueError`;
    both building and evaluating the op sit inside that assertion.
    """
    strings = [[b"a", b"abc"], [b"ABC", b"DEF"]]

    # Results are fetched through self.evaluate, so the session is entered
    # without binding it to a name.
    with self.cached_session():
      outputs = string_ops.unicode_transcode(
          strings,
          input_encoding="invalid",
          output_encoding="UTF-8",
          errors="replace",
          replacement_char=ord(" "),
          replace_control_characters=False)
      with self.assertRaisesOpError(
          "Could not create converter for input encoding: invalid"):
        self.evaluate(outputs)

    with self.assertRaisesRegexp(ValueError, "Op passed string 'invalid'"):
      with self.cached_session():
        outputs = string_ops.unicode_transcode(
            strings,
            input_encoding="UTF-8",
            output_encoding="invalid",
            errors="replace",
            replacement_char=ord(" "),
            replace_control_characters=False)
        self.evaluate(outputs)
Ejemplo n.º 2
0
    def test_transcode_utf8_simple(self):
        """Checks ASCII-only inputs survive transcoding to UTF-8 unchanged.

        The same byte strings are read as UTF-8, ISO-8859-1 and US-ASCII in
        turn; each result is compared with the untouched input.
        """
        strings = [[b"a", b"abc"], [b"ABC", b"DEF"]]

        with self.cached_session() as sess:
            for source_encoding in ("UTF-8", "ISO-8859-1", "US-ASCII"):
                transcoded = string_ops.unicode_transcode(
                    strings,
                    input_encoding=source_encoding,
                    output_encoding="UTF-8",
                    errors="replace",
                    replacement_char=ord(" "),
                    replace_control_characters=False)
                self.assertAllEqual(sess.run(transcoded), strings)
  def test_transcode_utf8_simple(self):
    """Checks ASCII-only inputs survive transcoding to UTF-8 unchanged.

    The same byte strings are read as UTF-8, ISO-8859-1 and US-ASCII in turn
    and each result is compared with the original input.
    """
    strings = [[b"a", b"abc"], [b"ABC", b"DEF"]]

    # Results are fetched through self.evaluate, so the session is entered
    # without binding it to a name.
    with self.cached_session():
      for input_encoding in ("UTF-8", "ISO-8859-1", "US-ASCII"):
        outputs = string_ops.unicode_transcode(
            strings,
            input_encoding=input_encoding,
            output_encoding="UTF-8",
            errors="replace",
            replacement_char=ord(" "),
            replace_control_characters=False)
        values = self.evaluate(outputs)
        self.assertAllEqual(values, strings)
Ejemplo n.º 4
0
    def test_invalid_encoding_causes_errors(self):
        """Checks that unknown encoding names are rejected.

        A bad `input_encoding` is expected to surface as an op error when the
        op is run. A bad `output_encoding` is expected to raise `ValueError`;
        both building and running the op sit inside that assertion.
        """
        strings = [[b"a", b"abc"], [b"ABC", b"DEF"]]
        # Keyword arguments shared by both transcode calls below.
        replace_policy = dict(
            errors="replace",
            replacement_char=ord(" "),
            replace_control_characters=False)

        with self.cached_session() as sess:
            bad_input_op = string_ops.unicode_transcode(
                strings,
                input_encoding="invalid",
                output_encoding="UTF-8",
                **replace_policy)
            with self.assertRaisesOpError(
                    "Could not create converter for input encoding: invalid"):
                sess.run(bad_input_op)

        with self.assertRaisesRegexp(ValueError, "Op passed string 'invalid'"):
            with self.cached_session() as sess:
                bad_output_op = string_ops.unicode_transcode(
                    strings,
                    input_encoding="UTF-8",
                    output_encoding="invalid",
                    **replace_policy)
                sess.run(bad_output_op)
Ejemplo n.º 5
0
    def test_cjk_encodings(self):
        """Checks transcoding of Japanese, Chinese and Korean legacy encodings.

        Each sample is transcoded to UTF-8 by the op and compared with the
        result of decoding/re-encoding the same bytes in Python.
        """
        # (encoding name, raw byte strings in that encoding)
        cases = [
            ("shift_jis", [
                b"\x5c\x5c",  # Yen sign
                b"\x8f\x70",  # kanji character "waza"
                b"\x83\x4f",  # katakana character "gu"
            ]),
            ("gb18030", [b"\xca\xf5"]),  # simplified "shu4"
            ("big5", [b"\xb3\x4e"]),  # traditional "shu4"
            ("euc_kr", [b"\xc7\xd1\xb9\xce"]),  # hangul "hanmin"
        ]

        expected = [
            [s.decode(encoding).encode("UTF-8") for s in raw]
            for encoding, raw in cases
        ]

        with self.cached_session() as sess:
            transcode_ops = [
                string_ops.unicode_transcode(
                    raw,
                    input_encoding=encoding,
                    output_encoding="UTF-8",
                    replacement_char=ord(" "),
                    replace_control_characters=False)
                for encoding, raw in cases
            ]

            # All four ops are fetched in a single run call.
            results = sess.run(transcode_ops)

            for actual, wanted in zip(results, expected):
                self.assertAllEqual(actual, wanted)
  def test_cjk_encodings(self):
    """Checks transcoding of Japanese, Chinese and Korean legacy encodings.

    Each sample is transcoded to UTF-8 by the op and compared with the result
    of decoding/re-encoding the same bytes in Python.
    """

    def python_utf8(byte_strings, codec):
      # Reference conversion done by Python's own codecs.
      return [s.decode(codec).encode("UTF-8") for s in byte_strings]

    def transcode_to_utf8(byte_strings, source_encoding):
      # Builds the op under test for one source encoding.
      return string_ops.unicode_transcode(
          byte_strings,
          input_encoding=source_encoding,
          output_encoding="UTF-8",
          replacement_char=ord(" "),
          replace_control_characters=False)

    strings_ja = [
        b"\x5c\x5c",  # Yen sign
        b"\x8f\x70",  # kanji character "waza"
        b"\x83\x4f",  # katakana character "gu"
    ]
    strings_zh_cn = [b"\xca\xf5"]  # simplified "shu4"
    strings_zh_tw = [b"\xb3\x4e"]  # traditional "shu4"
    strings_ko = [b"\xc7\xd1\xb9\xce"]  # hangul "hanmin"

    expected_ja = python_utf8(strings_ja, "shift_jis")
    expected_zh_cn = python_utf8(strings_zh_cn, "gb18030")
    expected_zh_tw = python_utf8(strings_zh_tw, "big5")
    expected_ko = python_utf8(strings_ko, "euc_kr")

    with self.cached_session() as sess:
      fetches = [
          transcode_to_utf8(strings_ja, "shift_jis"),
          transcode_to_utf8(strings_zh_cn, "gb18030"),
          transcode_to_utf8(strings_zh_tw, "big5"),
          transcode_to_utf8(strings_ko, "euc_kr"),
      ]

      # All four ops are fetched in a single run call.
      result_ja, result_zh_cn, result_zh_tw, result_ko = sess.run(fetches)

      self.assertAllEqual(result_ja, expected_ja)
      self.assertAllEqual(result_zh_cn, expected_zh_cn)
      self.assertAllEqual(result_zh_tw, expected_zh_tw)
      self.assertAllEqual(result_ko, expected_ko)
  def test_transcode_utf8_with_bom(self):
    """Checks a leading UTF-8 BOM is carried through transcoding.

    The BOM-prefixed input is transcoded to UTF-8 (expected byte-identical)
    and to UTF-16-BE (expected to match Python's own re-encoding).
    """
    source = b"\xef\xbb\xbfabcdefg"
    with self.cached_session() as sess:
      same_encoding = sess.run(
          string_ops.unicode_transcode(
              source, input_encoding="UTF-8", output_encoding="UTF-8"))
      # BOM preserved
      self.assertAllEqual(same_encoding, b"\xef\xbb\xbfabcdefg")

      as_utf16 = sess.run(
          string_ops.unicode_transcode(
              source, input_encoding="UTF-8", output_encoding="UTF-16-BE"))
      expected_utf16 = source.decode("UTF-8").encode("UTF-16-BE")
      self.assertAllEqual(as_utf16, expected_utf16)
  def test_transcode_utf8_with_bom(self):
    """Checks a leading UTF-8 BOM is carried through transcoding.

    The BOM-prefixed input is transcoded to UTF-8 (expected byte-identical)
    and to UTF-16-BE (expected to match Python's own re-encoding).
    """
    bom_string = b"\xef\xbb\xbfabcdefg"
    # Results are fetched through self.evaluate, so the session is entered
    # without binding it to a name.
    with self.cached_session():
      outputs = string_ops.unicode_transcode(
          bom_string, input_encoding="UTF-8", output_encoding="UTF-8")
      values = self.evaluate(outputs)
      self.assertAllEqual(values, b"\xef\xbb\xbfabcdefg")  # BOM preserved

      outputs = string_ops.unicode_transcode(
          bom_string, input_encoding="UTF-8", output_encoding="UTF-16-BE")
      values = self.evaluate(outputs)
      utf16expected = bom_string.decode("UTF-8").encode("UTF-16-BE")
      self.assertAllEqual(values, utf16expected)
  def test_transcode_utf8_with_replacement_char(self):
    """Checks an input already containing U+FFFD passes through untouched.

    The input ends with the UTF-8 encoding of U+FFFD. It is transcoded once
    with the strict policy and once with the replace policy (using "?" as
    the replacement); both outputs are expected to equal the input.
    """
    strings = [b"a\xef\xbf\xbd"]
    # Results are fetched through self.evaluate, so the session is entered
    # without binding it to a name.
    with self.cached_session():
      outputs = string_ops.unicode_transcode(
          strings, input_encoding="UTF-8", output_encoding="UTF-8",
          errors="strict")
      values = self.evaluate(outputs)
      self.assertAllEqual(values, [b"a\xef\xbf\xbd"])

      outputs = string_ops.unicode_transcode(
          strings, input_encoding="UTF-8", output_encoding="UTF-8",
          errors="replace", replacement_char=ord("?"))
      values = self.evaluate(outputs)
      self.assertAllEqual(values, [b"a\xef\xbf\xbd"])
  def test_transcode_utf8_with_replacement_char(self):
    """Checks an input already containing U+FFFD passes through untouched.

    The input ends with the UTF-8 encoding of U+FFFD. It is transcoded once
    with the strict policy and once with the replace policy (using "?" as
    the replacement); both outputs are expected to equal the input.
    """
    already_replaced = [b"a\xef\xbf\xbd"]
    with self.cached_session() as sess:
      strict_op = string_ops.unicode_transcode(
          already_replaced, input_encoding="UTF-8", output_encoding="UTF-8",
          errors="strict")
      self.assertAllEqual(sess.run(strict_op), [b"a\xef\xbf\xbd"])

      replace_op = string_ops.unicode_transcode(
          already_replaced, input_encoding="UTF-8", output_encoding="UTF-8",
          errors="replace", replacement_char=ord("?"))
      self.assertAllEqual(sess.run(replace_op), [b"a\xef\xbf\xbd"])
Ejemplo n.º 11
0
 def test_transcode_bad_utf8_termination_with_defaults(self):
   """Checks a truncated UTF-8 sequence is replaced under default settings.

   The input ends with the byte 0xF0 and nothing after it; the expected
   output ends with the UTF-8 encoding of U+FFFD instead.
   """
   bad_string = b"a\xf0"
   # Results are fetched through self.evaluate, so the session is entered
   # without binding it to a name.
   with self.cached_session():
     outputs = string_ops.unicode_transcode(
         bad_string, input_encoding="UTF-8", output_encoding="UTF-8")
     values = self.evaluate(outputs)
     self.assertAllEqual(values, b"a\xef\xbf\xbd")   # 0xFFFD
Ejemplo n.º 12
0
def coerce_to_structurally_valid_utf8(input,
                                      replacement_char=_unichr(65533),
                                      name=None):
    """Coerce UTF-8 input strings to structurally valid UTF-8.

    Any bytes which cause the input string to be invalid UTF-8 are substituted
    with the provided replacement character codepoint (default 65533). If you
    plan on overriding the default, use a single byte replacement character
    codepoint to preserve alignment to the source input string.

    Args:
      input: UTF-8 string tensor to coerce to valid UTF-8.
      replacement_char: The replacement character to be used in place of any
          invalid byte in the input. Any valid Unicode character may be used.
          The default value is the default Unicode replacement character,
          U+FFFD (decimal 65533). Note that passing a replacement character
          expressible in 1 byte, such as ' ' or '?', will preserve string
          alignment to the source since individual invalid bytes will be
          replaced with a 1-byte replacement. (optional)
      name: A name for the operation (optional).

    Returns:
      A tensor of type string with the same shape as the input.
    """
    # The transcode op is given the replacement as an integer, so the
    # character passed by the caller is converted with ord().
    return string_ops.unicode_transcode(input,
                                        input_encoding='UTF-8',
                                        output_encoding='UTF-8',
                                        errors='replace',
                                        replacement_char=ord(replacement_char),
                                        name=name)
Ejemplo n.º 13
0
 def test_transcode_bad_utf8_with_defaults(self):
   """Checks an invalid byte is replaced by U+FFFD under default settings.

   The NUL byte is expected to pass through while the invalid 0xFF byte is
   expected to become the UTF-8 encoding of U+FFFD.
   """
   bad_string = b"\x00\xff"
   # Results are fetched through self.evaluate, so the session is entered
   # without binding it to a name.
   with self.cached_session():
     outputs = string_ops.unicode_transcode(
         bad_string, input_encoding="UTF-8", output_encoding="UTF-8")
     values = self.evaluate(outputs)
     self.assertAllEqual(values, b"\x00\xef\xbf\xbd")
 def test_transcode_bad_utf8_termination_with_defaults(self):
   """Checks a truncated UTF-8 sequence is replaced under default settings.

   The input ends with the byte 0xF0 and nothing after it; the expected
   output ends with the UTF-8 encoding of U+FFFD instead.
   """
   truncated = b"a\xf0"
   with self.cached_session() as sess:
     transcoded = string_ops.unicode_transcode(
         truncated, input_encoding="UTF-8", output_encoding="UTF-8")
     # The trailing bytes of the expectation are U+FFFD (0xFFFD) in UTF-8.
     self.assertAllEqual(sess.run(transcoded), b"a\xef\xbf\xbd")
 def test_transcode_bad_utf8_with_defaults(self):
   """Checks an invalid byte is replaced by U+FFFD under default settings.

   The NUL byte is expected to pass through while the invalid 0xFF byte is
   expected to become the UTF-8 encoding of U+FFFD.
   """
   malformed = b"\x00\xff"
   with self.cached_session() as sess:
     transcoded = string_ops.unicode_transcode(
         malformed, input_encoding="UTF-8", output_encoding="UTF-8")
     self.assertAllEqual(sess.run(transcoded), b"\x00\xef\xbf\xbd")
 def test_transcode_bad_utf8_with_space_replacement(self):
   """Checks an invalid byte is replaced by the requested space character.

   The NUL byte is expected to pass through while the invalid 0xFF byte is
   expected to become a single space.
   """
   malformed = b"\x00\xff"
   with self.cached_session() as sess:
     transcoded = string_ops.unicode_transcode(
         malformed, input_encoding="UTF-8", output_encoding="UTF-8",
         replacement_char=ord(" "))
     self.assertAllEqual(sess.run(transcoded), b"\x00 ")
Ejemplo n.º 17
0
 def test_transcode_bad_utf8_with_space_replacement(self):
   """Checks an invalid byte is replaced by the requested space character.

   The NUL byte is expected to pass through while the invalid 0xFF byte is
   expected to become a single space.
   """
   bad_string = b"\x00\xff"
   # Results are fetched through self.evaluate, so the session is entered
   # without binding it to a name.
   with self.cached_session():
     outputs = string_ops.unicode_transcode(
         bad_string, input_encoding="UTF-8", output_encoding="UTF-8",
         replacement_char=ord(" "))
     values = self.evaluate(outputs)
     self.assertAllEqual(values, b"\x00 ")
Ejemplo n.º 18
0
 def test_transcode_bad_utf8_with_elision_of_malformatting(self):
     """Checks the "ignore" error policy drops an invalid byte.

     The NUL byte is expected to pass through while the invalid 0xFF byte is
     expected to be removed from the output.
     """
     malformed = b"\x00\xff"
     with self.cached_session() as sess:
         transcoded = string_ops.unicode_transcode(
             malformed,
             input_encoding="UTF-8",
             output_encoding="UTF-8",
             errors="ignore")
         self.assertAllEqual(sess.run(transcoded), b"\x00")
Ejemplo n.º 19
0
 def test_transcode_bad_utf8_with_elision_of_malformatting(self):
   """Checks the "ignore" error policy drops an invalid byte.

   The NUL byte is expected to pass through while the invalid 0xFF byte is
   expected to be removed from the output.
   """
   bad_string = b"\x00\xff"
   # Results are fetched through self.evaluate, so the session is entered
   # without binding it to a name.
   with self.cached_session():
     outputs = string_ops.unicode_transcode(
         bad_string,
         input_encoding="UTF-8",
         output_encoding="UTF-8",
         errors="ignore")
     values = self.evaluate(outputs)
     self.assertAllEqual(values, b"\x00")
Ejemplo n.º 20
0
 def test_transcode_bad_utf8_start_with_strict_errors(self):
     """Checks the "strict" error policy fails on an invalid leading byte.

     Running the op on an input that starts with 0xFF is expected to raise
     an op error mentioning invalid formatting.
     """
     malformed = b"\xffabcd"
     expected_message = "Invalid formatting on input string"
     with self.cached_session() as sess:
         transcoded = string_ops.unicode_transcode(
             malformed,
             input_encoding="UTF-8",
             output_encoding="UTF-8",
             errors="strict")
         with self.assertRaisesOpError(expected_message):
             sess.run(transcoded)
Ejemplo n.º 21
0
  def test_transcode_utf16_le_be_with_bom(self):
    """Checks BOM handling when the UTF-16 byte order is given explicitly.

    A big-endian BOM + 'a' is read as UTF-16-BE (BOM expected to be kept) and
    as UTF-16-LE (expected to yield mangled characters); a little-endian
    BOM + 'a' is then read as UTF-16-LE (BOM expected to be kept).
    """
    bom_string = b"\xfe\xff\x00\x61"  # Big-endian BOM with 'a' encoded
    # Results are fetched through self.evaluate, so the session is entered
    # without binding it to a name.
    with self.cached_session():
      outputs = string_ops.unicode_transcode(
          bom_string, input_encoding="UTF-16-BE", output_encoding="UTF-8")
      values = self.evaluate(outputs)
      # BOM is preserved in output
      self.assertAllEqual(values, b"\xef\xbb\xbfa")

      outputs = string_ops.unicode_transcode(
          bom_string, input_encoding="UTF-16-LE", output_encoding="UTF-8")
      values = self.evaluate(outputs)
      # mangled BOM and value from (incorrect) LE encoding
      self.assertAllEqual(values, b"\xef\xbf\xbe\xe6\x84\x80")

      bom_string = b"\xff\xfe\x61\x00"  # Little-endian BOM with 'a' encoded
      outputs = string_ops.unicode_transcode(
          bom_string, input_encoding="UTF-16-LE", output_encoding="UTF-8")
      values = self.evaluate(outputs)
      self.assertAllEqual(values, b"\xef\xbb\xbfa")
Ejemplo n.º 22
0
 def test_transcode_ascii_with_shift_chars(self):
   """Checks the 0x0E and 0x0F control bytes pass through as US-ASCII.

   With control-character replacement disabled, the output is expected to
   equal the input.
   """
   strings = [b"\x0e\x0e", b"\x0f\x0f"]
   # Results are fetched through self.evaluate, so the session is entered
   # without binding it to a name.
   with self.cached_session():
     outputs = string_ops.unicode_transcode(
         strings,
         input_encoding="US-ASCII",
         output_encoding="UTF-8",
         replacement_char=ord(" "),
         replace_control_characters=False)
     values = self.evaluate(outputs)
     self.assertAllEqual(values, strings)
Ejemplo n.º 23
0
 def test_transcode_ascii_with_shift_chars(self):
     """Checks the 0x0E and 0x0F control bytes pass through as US-ASCII.

     With control-character replacement disabled, the output is expected to
     equal the input.
     """
     shift_bytes = [b"\x0e\x0e", b"\x0f\x0f"]
     with self.cached_session() as sess:
         transcoded = string_ops.unicode_transcode(
             shift_bytes,
             input_encoding="US-ASCII",
             output_encoding="UTF-8",
             replacement_char=ord(" "),
             replace_control_characters=False)
         self.assertAllEqual(sess.run(transcoded), shift_bytes)
Ejemplo n.º 24
0
 def test_transcode_bad_utf8_start_with_strict_errors(self):
   """Checks the "strict" error policy fails on an invalid leading byte.

   Evaluating the op on an input that starts with 0xFF is expected to raise
   an op error mentioning invalid formatting.
   """
   bad_string = b"\xffabcd"
   # Results are fetched through self.evaluate, so the session is entered
   # without binding it to a name.
   with self.cached_session():
     outputs = string_ops.unicode_transcode(
         bad_string,
         input_encoding="UTF-8",
         output_encoding="UTF-8",
         errors="strict")
     with self.assertRaisesOpError(
         "Invalid formatting on input string"):
       self.evaluate(outputs)
  def test_transcode_utf16_le_be_with_bom(self):
    """Checks BOM handling when the UTF-16 byte order is given explicitly.

    Each case is transcoded to UTF-8 and compared with the listed bytes.
    """
    big_endian_a = b"\xfe\xff\x00\x61"  # Big-endian BOM with 'a' encoded
    little_endian_a = b"\xff\xfe\x61\x00"  # Little-endian BOM with 'a' encoded

    # (input bytes, declared input encoding, expected UTF-8 output)
    cases = (
        # BOM is preserved in output
        (big_endian_a, "UTF-16-BE", b"\xef\xbb\xbfa"),
        # mangled BOM and value from (incorrect) LE encoding
        (big_endian_a, "UTF-16-LE", b"\xef\xbf\xbe\xe6\x84\x80"),
        (little_endian_a, "UTF-16-LE", b"\xef\xbb\xbfa"),
    )

    with self.cached_session() as sess:
      for raw, declared_encoding, wanted in cases:
        transcoded = string_ops.unicode_transcode(
            raw, input_encoding=declared_encoding, output_encoding="UTF-8")
        self.assertAllEqual(sess.run(transcoded), wanted)
Ejemplo n.º 26
0
 def test_transcode_bad_utf8_with_elision_including_control_chars(self):
     """Checks "ignore" plus control-character replacement empties the input.

     With errors="ignore" and replace_control_characters=True, both the NUL
     byte and the invalid 0xFF byte are expected to be dropped.
     """
     malformed = b"\x00\xff"
     with self.cached_session() as sess:
         transcoded = string_ops.unicode_transcode(
             malformed,
             input_encoding="UTF-8",
             output_encoding="UTF-8",
             errors="ignore",
             replace_control_characters=True)
         self.assertAllEqual(sess.run(transcoded), b"")
Ejemplo n.º 27
0
 def test_transcode_bad_utf8_with_elision_including_control_chars(self):
   """Checks "ignore" plus control-character replacement empties the input.

   With errors="ignore" and replace_control_characters=True, both the NUL
   byte and the invalid 0xFF byte are expected to be dropped.
   """
   bad_string = b"\x00\xff"
   # Results are fetched through self.evaluate, so the session is entered
   # without binding it to a name.
   with self.cached_session():
     outputs = string_ops.unicode_transcode(
         bad_string,
         input_encoding="UTF-8",
         output_encoding="UTF-8",
         errors="ignore",
         replace_control_characters=True)
     values = self.evaluate(outputs)
     self.assertAllEqual(values, b"")
Ejemplo n.º 28
0
 def test_transcode_bad_utf8_with_some_good(self):
     """Checks only the invalid byte is replaced inside otherwise valid text.

     The single 0xFF byte is expected to become a space while the
     surrounding ASCII text is expected to stay as it is.
     """
     mostly_valid = b"abc\xffabcdefg"
     with self.cached_session() as sess:
         transcoded = string_ops.unicode_transcode(
             mostly_valid,
             input_encoding="UTF-8",
             output_encoding="UTF-8",
             errors="replace",
             replacement_char=ord(" "),
             replace_control_characters=False)
         self.assertAllEqual(sess.run(transcoded), b"abc abcdefg")
Ejemplo n.º 29
0
 def test_transcode_utf8_to_utf32(self):
     """Checks UTF-8 to UTF-32-BE transcoding against Python's codecs.

     The expected values come from decoding each input as UTF-8 and
     re-encoding it as UTF-32-BE in Python.
     """
     utf8_inputs = [b"ab\xe2\x82\xac", b"\xf0\x90\x90\xb7"]
     wanted = [raw.decode("UTF-8").encode("UTF-32-BE") for raw in utf8_inputs]
     with self.cached_session() as sess:
         transcoded = string_ops.unicode_transcode(
             utf8_inputs,
             input_encoding="UTF-8",
             output_encoding="UTF-32-BE",
             replacement_char=ord(" "),
             replace_control_characters=False)
         self.assertAllEqual(sess.run(transcoded), wanted)
Ejemplo n.º 30
0
 def test_transcode_utf8_to_utf32(self):
   """Checks UTF-8 to UTF-32-BE transcoding against Python's codecs.

   The expected values come from decoding each input as UTF-8 and
   re-encoding it as UTF-32-BE in Python.
   """
   strings = [b"ab\xe2\x82\xac", b"\xf0\x90\x90\xb7"]
   expected = [s.decode("UTF-8").encode("UTF-32-BE") for s in strings]
   # Results are fetched through self.evaluate, so the session is entered
   # without binding it to a name.
   with self.cached_session():
     outputs = string_ops.unicode_transcode(
         strings,
         input_encoding="UTF-8",
         output_encoding="UTF-32-BE",
         replacement_char=ord(" "),
         replace_control_characters=False)
     values = self.evaluate(outputs)
     self.assertAllEqual(values, expected)
Ejemplo n.º 31
0
 def test_transcode_bad_utf8_with_some_good(self):
   """Checks only the invalid byte is replaced inside otherwise valid text.

   The single 0xFF byte is expected to become a space while the surrounding
   ASCII text is expected to stay as it is.
   """
   bad_string = b"abc\xffabcdefg"
   # Results are fetched through self.evaluate, so the session is entered
   # without binding it to a name.
   with self.cached_session():
     outputs = string_ops.unicode_transcode(
         bad_string,
         input_encoding="UTF-8",
         output_encoding="UTF-8",
         errors="replace",
         replacement_char=ord(" "),
         replace_control_characters=False)
     values = self.evaluate(outputs)
     self.assertAllEqual(values, b"abc abcdefg")
Ejemplo n.º 32
0
    def test_transcode_bad_utf8(self):
        """Checks replacement of an invalid byte with and without control chars.

        With replace_control_characters=True both the NUL byte and the
        invalid 0xFF byte are expected to become spaces; with it set to False
        only the invalid byte is expected to be replaced.
        """
        bad_string = b"\x00\xff"
        # Results are fetched through self.evaluate, so the session is
        # entered without binding it to a name.
        with self.cached_session():
            outputs = string_ops.unicode_transcode(
                bad_string,
                input_encoding="UTF-8",
                output_encoding="UTF-8",
                errors="replace",
                replacement_char=ord(" "),
                replace_control_characters=True)
            values = self.evaluate(outputs)
            self.assertAllEqual(values, b"  ")

            outputs = string_ops.unicode_transcode(
                bad_string,
                input_encoding="UTF-8",
                output_encoding="UTF-8",
                errors="replace",
                replacement_char=ord(" "),
                replace_control_characters=False)
            values = self.evaluate(outputs)
            self.assertAllEqual(values, b"\x00 ")
Ejemplo n.º 33
0
  def test_forwarding(self):
    """Checks transcoding of an input consumed only by the transcode op.

    The first five bytes of two ASCII strings are taken with substr and then
    transcoded from UTF-8 to UTF-8; the prefixes are expected back unchanged.
    """
    with self.cached_session():
      source = constant_op.constant([b"AbCdEfG", b"HiJkLmN"], dtypes.string)
      # The substr result is uniquely consumed by the transcode op, which
      # exercises code paths optimized for that case (e.g., using forwarding).
      prefix = string_ops.substr(source, pos=0, len=5)
      transcoded = string_ops.unicode_transcode(
          prefix, input_encoding="UTF-8", output_encoding="UTF-8")

      self.assertAllEqual([b"AbCdE", b"HiJkL"], transcoded)
  def test_forwarding(self):
    """Checks transcoding of an input consumed only by the transcode op.

    The first five bytes of two ASCII strings are taken with substr and then
    transcoded from UTF-8 to UTF-8; the prefixes are expected back unchanged.
    """
    expected_prefixes = [b"AbCdE", b"HiJkL"]
    with self.cached_session():
      # Build an input whose only consumer is the transcode op, so that the
      # code paths optimized for this case (e.g., forwarding) are exercised.
      full_strings = constant_op.constant(
          [b"AbCdEfG", b"HiJkLmN"], dtypes.string)
      sole_consumer_input = string_ops.substr(full_strings, pos=0, len=5)
      transcoded = string_ops.unicode_transcode(
          sole_consumer_input,
          input_encoding="UTF-8",
          output_encoding="UTF-8")

      self.assertAllEqual(expected_prefixes, transcoded)
  def test_transcode_bad_utf8(self):
    """Checks replacement of an invalid byte with and without control chars.

    With replace_control_characters=True both the NUL byte and the invalid
    0xFF byte are expected to become spaces; with it set to False only the
    invalid byte is expected to be replaced.
    """
    malformed = b"\x00\xff"
    # (replace_control_characters flag, expected output)
    cases = ((True, b"  "), (False, b"\x00 "))

    with self.cached_session() as sess:
      for replace_controls, wanted in cases:
        transcoded = string_ops.unicode_transcode(
            malformed,
            input_encoding="UTF-8",
            output_encoding="UTF-8",
            errors="replace",
            replacement_char=ord(" "),
            replace_control_characters=replace_controls)
        self.assertAllEqual(sess.run(transcoded), wanted)
Ejemplo n.º 36
0
  def test_transcode_utf16_to_utf8(self):
    """Checks UTF-16 to UTF-8 transcoding against Python's codecs.

    The inputs carry no BOM and are declared as plain "UTF-16"; the expected
    values come from decoding them as UTF-16-BE in Python.
    """
    strings = [b"\x00a\x00b\x20\xAC", b"\xD8\x01\xDC\x37"]  # U+10437
    expected = [s.decode("UTF-16-BE").encode("UTF-8") for s in strings]

    # Results are fetched through self.evaluate, so the session is entered
    # without binding it to a name.
    with self.cached_session():
      outputs = string_ops.unicode_transcode(
          strings,
          input_encoding="UTF-16",
          output_encoding="UTF-8",
          errors="replace",
          replacement_char=ord(" "),
          replace_control_characters=False)
      values = self.evaluate(outputs)
      self.assertAllEqual(values, expected)
Ejemplo n.º 37
0
    def test_transcode_utf16_to_utf8(self):
        """Checks UTF-16 to UTF-8 transcoding against Python's codecs.

        The inputs carry no BOM and are declared as plain "UTF-16"; the
        expected values come from decoding them as UTF-16-BE in Python.
        """
        utf16_inputs = [
            b"\x00a\x00b\x20\xAC",
            b"\xD8\x01\xDC\x37",  # U+10437
        ]
        wanted = [
            raw.decode("UTF-16-BE").encode("UTF-8") for raw in utf16_inputs
        ]

        with self.cached_session() as sess:
            transcoded = string_ops.unicode_transcode(
                utf16_inputs,
                input_encoding="UTF-16",
                output_encoding="UTF-8",
                errors="replace",
                replacement_char=ord(" "),
                replace_control_characters=False)
            self.assertAllEqual(sess.run(transcoded), wanted)
Ejemplo n.º 38
0
  def test_invalid_error_policy_causes_errors(self):
    """Checks that an unknown `errors` policy name raises `ValueError`.

    Both building and evaluating the op sit inside the assertion; the
    expected message lists the accepted policy names.
    """
    strings = [[b"a", b"abc"], [b"ABC", b"DEF"]]

    with self.assertRaisesRegexp(
        ValueError, "'invalid' not in: \"strict\", \"replace\", \"ignore\"."):
      # Results are fetched through self.evaluate, so the session is entered
      # without binding it to a name.
      with self.cached_session():
        outputs = string_ops.unicode_transcode(
            strings,
            input_encoding="UTF-8",
            output_encoding="UTF-8",
            errors="invalid",
            replacement_char=ord(" "),
            replace_control_characters=False)
        self.evaluate(outputs)
  def test_invalid_error_policy_causes_errors(self):
    """Checks that an unknown `errors` policy name raises `ValueError`.

    Both building and running the op sit inside the assertion; the expected
    message lists the accepted policy names.
    """
    strings = [[b"a", b"abc"], [b"ABC", b"DEF"]]
    expected_message = (
        "'invalid' not in: \"strict\", \"replace\", \"ignore\".")

    with self.assertRaisesRegexp(ValueError, expected_message):
      with self.cached_session() as sess:
        bad_policy_op = string_ops.unicode_transcode(
            strings,
            input_encoding="UTF-8",
            output_encoding="UTF-8",
            errors="invalid",
            replacement_char=ord(" "),
            replace_control_characters=False)
        sess.run(bad_policy_op)
Ejemplo n.º 40
0
 def test_bom_handling(self, string, input_encoding, expected):
     """Checks the UTF-8 output produced for one BOM test case.

     Args:
       string: Input passed to the transcode op.
       input_encoding: Encoding name the input is declared to be in.
       expected: Value the UTF-8 output is compared against.

     NOTE(review): the arguments are presumably supplied by a parameterized
     test decorator outside this view; confirm against the enclosing class.
     """
     # cached_session/self.evaluate replace the deprecated test_session and
     # Tensor.eval used previously.
     with self.cached_session():
         output = string_ops.unicode_transcode(
             string, input_encoding=input_encoding, output_encoding="UTF-8")
         self.assertAllEqual(self.evaluate(output), expected)
Ejemplo n.º 41
0
 def test_bom_handling(self, string, input_encoding, expected):
   """Checks the UTF-8 output produced for one BOM test case.

   Args:
     string: Input passed to the transcode op.
     input_encoding: Encoding name the input is declared to be in.
     expected: Value the UTF-8 output is compared against.

   NOTE(review): the arguments are presumably supplied by a parameterized
   test decorator outside this view; confirm against the enclosing class.
   """
   # cached_session/self.evaluate replace the deprecated test_session and
   # Tensor.eval used previously.
   with self.cached_session():
     output = string_ops.unicode_transcode(
         string, input_encoding=input_encoding, output_encoding="UTF-8")
     self.assertAllEqual(self.evaluate(output), expected)