コード例 #1
0
def get_text(string, start, end, bom=True):
    """Slice *string* using offsets counted in UTF-16 code units.

    Tools with UTF-16 native strings (Rosette, among others) report
    character offsets in 16-bit code units, whereas Python 3.3+ indexes
    ``str`` by code point (see PEP 393).  This function encodes the string
    to UTF-16, slices the encoded bytes at the corresponding byte
    positions, and decodes the slice back to ``str``.

    Each offset is multiplied by 2 because a UTF-16 code unit occupies two
    bytes; a character outside the Basic Multilingual Plane occupies two
    code units (4 bytes), and a grapheme may span even more.

    Args:
        string: the ``str`` to slice.
        start: first code unit of the slice, or ``None`` for the beginning.
        end: code unit one past the slice, or ``None`` for the end.
        bom: when true (the default), non-negative offsets are shifted by
            two bytes to step over the byte order mark that
            ``codecs.utf_16_encode`` puts in front of the data.  When
            false, offsets are applied to the encoded bytes as they are,
            so offset 0 refers to the BOM itself.

    Negative offsets count back from the end of the string, as in ordinary
    slicing.  They are never shifted for the BOM, because the BOM sits at
    the front of the encoded data and does not affect distances measured
    from the end.

    Returns:
        The decoded ``str`` slice.

    Raises:
        ValueError: if *string* is not a ``str``, or *start* / *end* is
            neither an ``int`` nor ``None``.
    """
    import codecs
    if not isinstance(string, str):
        raise ValueError('expected string to be of type str')
    if not (start is None or isinstance(start, int)):
        raise ValueError('expected start to be of type int or NoneType')
    if not (end is None or isinstance(end, int)):
        raise ValueError('expected end to be of type int or NoneType')

    def _byte_offset(offset):
        # Convert a code-unit offset into a byte index of the encoded data.
        if offset is None:
            return None
        byte_index = offset * 2
        # Only front-relative offsets have to skip the leading BOM.
        if bom and offset >= 0:
            byte_index += 2
        return byte_index

    utf_16, _ = codecs.utf_16_encode(string)
    sliced, _ = codecs.utf_16_decode(
        utf_16[_byte_offset(start):_byte_offset(end)])
    return sliced
コード例 #2
0
 def encode(self, input, errors='strict'):
     """Encode *input* as BOM-prefixed UTF-16 and switch to BOM-less encoding.

     The call records ``self.bom_written = True`` and encodes with
     ``codecs.utf_16_encode``.  It then rebinds ``self.encode`` on the
     instance to the byte-order-specific encoder matching
     ``sys.byteorder``, so later ``self.encode(...)`` calls go straight to
     that encoder.

     Returns whatever ``codecs.utf_16_encode(input, errors)`` returns.
     """
     self.bom_written = True
     encoded = codecs.utf_16_encode(input, errors)
     # Rebind only after the first chunk encoded successfully.
     is_little = sys.byteorder == 'little'
     self.encode = codecs.utf_16_le_encode if is_little else codecs.utf_16_be_encode
     return encoded
コード例 #3
0
ファイル: utf_16.py プロジェクト: 3rdandUrban-dev/Nuxleus
 def encode(self, input, errors='strict'):
     """Encode the first chunk with ``codecs.utf_16_encode``, then specialise.

     Sets ``self.bom_written`` to ``True``, encodes *input*, and afterwards
     replaces the instance's ``encode`` attribute with the little- or
     big-endian UTF-16 encoder chosen from ``sys.byteorder``.

     Returns the result of ``codecs.utf_16_encode(input, errors)`` unchanged.
     """
     self.bom_written = True
     first_result = codecs.utf_16_encode(input, errors)
     # Later calls on this instance bypass this method entirely.
     if sys.byteorder != 'little':
         self.encode = codecs.utf_16_be_encode
     else:
         self.encode = codecs.utf_16_le_encode
     return first_result
コード例 #4
0
ファイル: utf16.py プロジェクト: OpenXT/xc-windows
def validate(s):
    """Compare the output of the external ``./utf16`` program with Python's codec.

    *s* is encoded to UTF-8 and written to the program's stdin; whatever
    the program writes to stdout is compared with Python's own UTF-16
    encoding of *s*, minus its first two bytes (the byte order mark).

    Raises:
        RuntimeError: if the two byte strings differ.  A diagnostic line
            is printed first.
    """
    # codecs.utf_16_encode prepends a 2-byte BOM; compare the payload only.
    correct_utf16 = codecs.utf_16_encode(s)[0][2:]
    utf8 = codecs.utf_8_encode(s)
    p = subprocess.Popen("./utf16", stdin = subprocess.PIPE, stdout = subprocess.PIPE)
    maybe_utf16 = p.communicate(utf8[0])[0]
    if correct_utf16 != maybe_utf16:
        # Single-argument print(...) behaves the same on Python 2 and 3.
        print(u"tried to do %r, got back %r, expected %r" % (utf8[0], maybe_utf16, correct_utf16))
        # String exceptions (raise "failed") are rejected with TypeError
        # since Python 2.6, so raise a real exception instead.
        raise RuntimeError("failed")
コード例 #5
0
ファイル: utf_16.py プロジェクト: ClayMason/BlackrockFBP
 def encode(self, input, final=False):
     """Encode *input* as UTF-16, using ``codecs.utf_16_encode`` on first use.

     While ``self.encoder`` is ``None`` the chunk is encoded with
     ``codecs.utf_16_encode`` and ``self.encoder`` is then set to the
     byte-order-specific encoder matching ``sys.byteorder``.  Every later
     call delegates to ``self.encoder``.  ``self.errors`` is passed as the
     error-handling argument in both cases; *final* is not used.

     Returns only the encoded data (element 0 of the encoder's result).
     """
     if self.encoder is not None:
         return self.encoder(input, self.errors)[0]
     first_chunk = codecs.utf_16_encode(input, self.errors)[0]
     # Pick the follow-up encoder only after the first chunk succeeded.
     little_endian = sys.byteorder == 'little'
     self.encoder = (codecs.utf_16_le_encode if little_endian
                     else codecs.utf_16_be_encode)
     return first_chunk
コード例 #6
0
 def encode(self, input, final=False):
     """Return *input* encoded as UTF-16 bytes.

     The first call (``self.encoder is None``) goes through
     ``codecs.utf_16_encode`` and then stores the little- or big-endian
     encoder, selected from ``sys.byteorder``, in ``self.encoder``.
     Subsequent calls use that stored encoder.  ``self.errors`` supplies
     the error-handling mode; *final* is accepted but ignored.
     """
     pending = self.encoder
     if pending is None:
         payload = codecs.utf_16_encode(input, self.errors)[0]
         if sys.byteorder != 'little':
             self.encoder = codecs.utf_16_be_encode
         else:
             self.encoder = codecs.utf_16_le_encode
     else:
         payload = pending(input, self.errors)[0]
     return payload
コード例 #7
0
 def encode(self, input, errors='strict'):
     """Encode *input* as UTF-16 and return the encoder's full result.

     While ``self.encoder`` is ``None`` the data is encoded with
     ``codecs.utf_16_encode``; afterwards ``self.encoder`` is set to the
     byte-order-specific encoder matching ``sys.byteorder`` and used for
     all later calls.

     Returns the encoder's result unchanged.
     """
     if self.encoder is not None:
         return self.encoder(input, errors)
     outcome = codecs.utf_16_encode(input, errors)
     # Choose the follow-up encoder once the first chunk has been produced.
     native_is_little = sys.byteorder == 'little'
     self.encoder = (codecs.utf_16_le_encode if native_is_little
                     else codecs.utf_16_be_encode)
     return outcome
コード例 #8
0
ファイル: utf_16.py プロジェクト: ClayMason/BlackrockFBP
 def encode(self, input, errors='strict'):
     """Encode *input* as UTF-16, switching encoders after the first call.

     First call (``self.encoder is None``): encode with
     ``codecs.utf_16_encode`` and remember the little- or big-endian
     encoder, selected from ``sys.byteorder``, in ``self.encoder``.
     Later calls: delegate to ``self.encoder``.

     The encoder's return value is passed back untouched.
     """
     follow_up = self.encoder
     if follow_up is not None:
         return follow_up(input, errors)
     encoded = codecs.utf_16_encode(input, errors)
     if sys.byteorder != 'little':
         self.encoder = codecs.utf_16_be_encode
     else:
         self.encoder = codecs.utf_16_le_encode
     return encoded
コード例 #9
0
def validate(s):
    """Check the external ``./utf16`` program against Python's UTF-16 codec.

    The UTF-8 encoding of *s* is piped into the program's stdin, and its
    stdout is compared with ``codecs.utf_16_encode(s)`` stripped of the
    leading two-byte byte order mark.

    Raises:
        RuntimeError: when the program's output differs from the expected
            bytes.  A diagnostic line is printed before raising.
    """
    # Drop the 2-byte BOM that codecs.utf_16_encode puts in front.
    correct_utf16 = codecs.utf_16_encode(s)[0][2:]
    utf8 = codecs.utf_8_encode(s)
    p = subprocess.Popen("./utf16",
                         stdin=subprocess.PIPE,
                         stdout=subprocess.PIPE)
    maybe_utf16 = p.communicate(utf8[0])[0]
    if correct_utf16 != maybe_utf16:
        # One parenthesised argument keeps this valid on Python 2 and 3.
        print(u"tried to do %r, got back %r, expected %r" % (
            utf8[0], maybe_utf16, correct_utf16))
        # raise "failed" is a string exception, which Python 2.6+ rejects
        # with TypeError; raise a proper exception object instead.
        raise RuntimeError("failed")
コード例 #10
0
    def test_codecs_builtins(self):
        s = "abc"

        encoded = codecs.utf_8_encode(s)
        self.assertEqual(s, codecs.utf_8_decode(encoded[0])[0])

        encoded = codecs.utf_7_encode(s)
        self.assertEqual(s, codecs.utf_7_decode(encoded[0])[0])

        encoded = codecs.utf_16_encode(s)
        self.assertEqual(s, codecs.utf_16_decode(encoded[0])[0])

        encoded = codecs.utf_16_le_encode(s)
        self.assertEqual(s, codecs.utf_16_le_decode(encoded[0])[0])

        encoded = codecs.utf_16_be_encode(s)
        self.assertEqual(s, codecs.utf_16_be_decode(encoded[0])[0])

        encoded = codecs.utf_32_encode(s)
        self.assertEqual(s, codecs.utf_32_decode(encoded[0])[0])

        encoded = codecs.utf_32_le_encode(s)
        self.assertEqual(s, codecs.utf_32_le_decode(encoded[0])[0])

        encoded = codecs.utf_32_be_encode(s)
        self.assertEqual(s, codecs.utf_32_be_decode(encoded[0])[0])

        encoded = codecs.utf_32_be_encode(s)
        self.assertEqual(s, codecs.utf_32_be_decode(encoded[0])[0])

        encoded = codecs.raw_unicode_escape_encode(s)
        self.assertEqual(s, codecs.raw_unicode_escape_decode(encoded[0])[0])

        encoded = codecs.unicode_escape_encode(s)
        self.assertEqual(s, codecs.unicode_escape_decode(encoded[0])[0])

        encoded = codecs.latin_1_encode(s)
        self.assertEqual(s, codecs.latin_1_decode(encoded[0])[0])

        encoded = codecs.ascii_encode(s)
        self.assertEqual(s, codecs.ascii_decode(encoded[0])[0])
コード例 #11
0
 def test_utf_16_encode(self):
     #Sanity
     self.assertEqual(codecs.utf_16_encode("abc"), ('\xff\xfea\x00b\x00c\x00', 3))
コード例 #12
0
 def test_utf_16_encode(self):
     # On little-endian systems, UTF-16 encodes in UTF-16-LE prefixed with BOM
     data, num_processed = codecs.utf_16_encode("abc")
     self.assertEqual(data, codecs.BOM_UTF16 + b'a\0b\0c\0')
     self.assertEqual(num_processed, 3)
コード例 #13
0
 def test_utf_16_encode(self):
     #Sanity
     self.assertEqual(codecs.utf_16_encode("abc"), ('\xff\xfea\x00b\x00c\x00', 3))
コード例 #14
0
ファイル: nodes.py プロジェクト: xxoolm/Ryven
 def update_event(self, inp=-1):
     """Run ``codecs.utf_16_encode`` on inputs 0-2 and publish it on output 0.

     The values of ``self.input(0)``, ``self.input(1)`` and
     ``self.input(2)`` are passed positionally, in that order, to
     ``codecs.utf_16_encode``; its return value is handed to
     ``self.set_output_val`` for output index 0.  *inp* is not used.
     """
     call_args = [self.input(index) for index in (0, 1, 2)]
     encoded = codecs.utf_16_encode(*call_args)
     self.set_output_val(0, encoded)