Example #1
0
    def test_utf_16_decode(self):
        # When BOM present: it is removed and the proper UTF-16 variant is automatically selected
        string, num_processed = codecs.utf_16_decode(codecs.BOM_UTF16_LE + b'a\0b\0c\0')
        self.assertEqual(string, "abc")
        self.assertEqual(num_processed, 4 * 2)

        string, num_processed = codecs.utf_16_decode(codecs.BOM_UTF16_BE + b'\0a\0b\0c')
        self.assertEqual(string, "abc")
        self.assertEqual(num_processed, 4 * 2)

        # When no BOM: on little-endian systems, UTF-16 defaults to UTF-16-LE
        string, num_processed = codecs.utf_16_decode(b'a\0b\0c\0')
        self.assertEqual(string, 'abc')
        self.assertEqual(num_processed, 3 * 2)
Example #2
0
    def convert_table(self, dbname, tablename, target_dir):
        """Export one table to ``<target_dir>/<tablename>.csv``.

        The column names are written first as a ``;``-separated header
        line.  The table data is dumped to a temporary file via
        ``self.write_data``, decoded as UTF-16, stripped of NUL characters,
        normalised to ``\\n`` line endings and appended to the CSV.

        Args:
            dbname: Database name, passed through to ``get_column_names``
                and ``write_data``.
            tablename: Table name; also used as the CSV file's base name.
            target_dir: Directory in which the CSV file is created.

        The temporary file is deleted even when the export fails part-way.
        """
        print("\tConverting {}.{}".format(dbname, tablename))
        header = ';'.join(self.get_column_names(dbname, tablename)) + '\n'

        targetpath = os.path.join(target_dir, tablename + '.csv')
        tmpfile = TempFile(self.tmp_dir)

        try:
            self.write_data(dbname, tablename, tmpfile.path)
            # The underlying dbc tends to append an ending
            tmpfile.update_ending()

            with open(targetpath, 'w') as f:
                f.write(header)
                with tmpfile.open('rb') as t:
                    # Decode as utf-16; the call returns
                    # (decoded_text, bytes_consumed) and only the text is kept.
                    text = codecs.utf_16_decode(t.read())[0]
                # replace NULL with nothing
                text = text.replace('\x00', '')
                # convert to *nix line endings
                text = text.replace('\r\n', '\n')

                f.write(text)
        finally:
            # Clean up on both success and failure so a broken export does
            # not leave its temporary file behind.
            tmpfile.delete()
Example #3
0
def get_text(string, start, end, bom=True):
    """Slice ``string`` using offsets expressed in UTF-16 code units.

    This allows character offsets generated by Rosette (and other software
    that uses a UTF-16 native string representation) to be applied to
    Python strings, whose indices count code points rather than UTF-16
    code units (see https://www.python.org/dev/peps/pep-0393/).

    The string is encoded to UTF-16, the offsets are converted to byte
    offsets (2 bytes per code unit) and the selected bytes are decoded
    again.  A character outside the Basic Multilingual Plane occupies two
    code units (4 bytes), so it counts as two positions.

    Args:
        string: The text to slice; must be a ``str``.
        start: First code unit of the slice, or ``None`` for the beginning.
        end: Code unit just past the slice, or ``None`` for the end.
        bom: When true (the default) the offsets do not count the byte
            order mark that the encoder puts in front of the data, so
            non-negative offsets are shifted past it.  When false the
            offsets are taken to already include that leading BOM unit.

    Negative offsets count from the end of the text, as in ordinary
    slicing, and are therefore never shifted by the BOM.

    Returns:
        The selected text.  A slice that ends in the middle of a surrogate
        pair silently drops the incomplete trailing character.

    Raises:
        ValueError: If an argument has the wrong type.
        UnicodeDecodeError: If the slice starts in the middle of a
            surrogate pair.
    """
    import codecs
    if not isinstance(string, str):
        raise ValueError('expected string to be of type str')
    if not any(((start is None), isinstance(start, int))):
        raise ValueError('expected start to be of type int or NoneType')
    if not any(((end is None), isinstance(end, int))):
        raise ValueError('expected end to be of type int or NoneType')

    bom_size = len(codecs.BOM_UTF16)

    def to_byte_offset(offset):
        # Each UTF-16 code unit is two bytes.  Only offsets measured from
        # the front need to skip the BOM; negative ones are measured from
        # the end, where the BOM has no influence.
        if offset is None:
            return None
        byte_offset = offset * 2
        if bom and offset >= 0:
            byte_offset += bom_size
        return byte_offset

    # utf_16_encode emits a BOM followed by the text in native byte order.
    utf_16, _ = codecs.utf_16_encode(string)
    first, last, _ = slice(
        to_byte_offset(start), to_byte_offset(end)
    ).indices(len(utf_16))
    # Never hand the encoder's own BOM to the decoder.
    first = max(first, bom_size)

    # Decode with the explicit native-order codec: the generic UTF-16
    # decoder would treat a leading U+FEFF / U+FFFE inside the text as a
    # byte order mark and drop it or flip the byte order.
    if codecs.BOM_UTF16 == codecs.BOM_UTF16_LE:
        decode = codecs.utf_16_le_decode
    else:
        decode = codecs.utf_16_be_decode
    sliced, _ = decode(utf_16[first:last])
    return sliced
Example #4
0
 def read_short_stream(self, entry):
     """Collect the contents of ``entry``'s stream from the short-sector data.

     Follows the sector-ID chain that starts at ``entry.secID`` (looked up
     through ``self.ssat``), appending one short sector of ``self.ssc`` per
     ID until the chain ends or the ID -2 is met, then trims the result to
     ``entry._v.size`` bytes.
     """
     # Short sector size is a power of two given by the header's shift value.
     short_sector_len = 1 << self.header.sb_shift
     # entry.name holds UTF-16 bytes; decode it only for the log line below.
     stream_name = utf_16_decode(entry.name)[0].strip()
     print("read_short_stream: %s" % stream_name)
     collected = b''
     for sector_id in self.get_secID_chain(entry.secID, self.ssat):
         if sector_id == -2:
             # -2 stops the walk; presumably the end-of-chain marker.
             break
         begin = self.short_sector_pos(sector_id)
         finish = begin + short_sector_len
         collected = collected + self.ssc[begin:finish]
     # The final sector may be only partly used, so cut to the stored size.
     return collected[:entry._v.size]
Example #5
0
 def read_stream(self, entry):
     """Return the contents of ``entry``'s stream as bytes.

     Streams smaller than ``self.header.threshold`` are delegated to
     ``read_short_stream``.  Otherwise the sector-ID chain starting at
     ``entry.secID`` is followed, one sector per ID is appended until the
     chain ends or the ID -2 is met, and the result is trimmed to
     ``entry._v.size`` bytes.
     """
     # Regular sector size is a power of two given by the header's shift value.
     sector_len = 1 << self.header.bb_shift
     if entry._v.size < self.header.threshold:
         return self.read_short_stream(entry)
     # entry.name holds UTF-16 bytes; decode it only for the log line below.
     stream_name = utf_16_decode(entry.name)[0].strip()
     print("read_stream: %s" % stream_name)
     collected = b''
     for sector_id in self.get_secID_chain(entry.secID):
         if sector_id == -2:
             # -2 stops the walk; presumably the end-of-chain marker.
             break
         begin = self.sector_offset(sector_id)
         finish = begin + sector_len
         collected = collected + self.__file[begin:finish]
     # The final sector may be only partly used, so cut to the stored size.
     return collected[:entry._v.size]
Example #6
0
    def test_codecs_builtins(self):
        s = "abc"

        encoded = codecs.utf_8_encode(s)
        self.assertEqual(s, codecs.utf_8_decode(encoded[0])[0])

        encoded = codecs.utf_7_encode(s)
        self.assertEqual(s, codecs.utf_7_decode(encoded[0])[0])

        encoded = codecs.utf_16_encode(s)
        self.assertEqual(s, codecs.utf_16_decode(encoded[0])[0])

        encoded = codecs.utf_16_le_encode(s)
        self.assertEqual(s, codecs.utf_16_le_decode(encoded[0])[0])

        encoded = codecs.utf_16_be_encode(s)
        self.assertEqual(s, codecs.utf_16_be_decode(encoded[0])[0])

        encoded = codecs.utf_32_encode(s)
        self.assertEqual(s, codecs.utf_32_decode(encoded[0])[0])

        encoded = codecs.utf_32_le_encode(s)
        self.assertEqual(s, codecs.utf_32_le_decode(encoded[0])[0])

        encoded = codecs.utf_32_be_encode(s)
        self.assertEqual(s, codecs.utf_32_be_decode(encoded[0])[0])

        encoded = codecs.utf_32_be_encode(s)
        self.assertEqual(s, codecs.utf_32_be_decode(encoded[0])[0])

        encoded = codecs.raw_unicode_escape_encode(s)
        self.assertEqual(s, codecs.raw_unicode_escape_decode(encoded[0])[0])

        encoded = codecs.unicode_escape_encode(s)
        self.assertEqual(s, codecs.unicode_escape_decode(encoded[0])[0])

        encoded = codecs.latin_1_encode(s)
        self.assertEqual(s, codecs.latin_1_decode(encoded[0])[0])

        encoded = codecs.ascii_encode(s)
        self.assertEqual(s, codecs.ascii_decode(encoded[0])[0])
Example #7
0
def decode(input, errors='strict'):
    """Decode a complete UTF-16 byte string.

    Returns the ``(text, bytes_consumed)`` pair produced by
    ``codecs.utf_16_decode``.  ``errors`` selects the error handling scheme.
    """
    # The input is treated as final, so truncated trailing data is reported
    # through the error handler instead of being left unconsumed.
    is_final = True
    return codecs.utf_16_decode(input, errors, is_final)
Example #8
0
 def test_utf_16_decode(self):
     #sanity
     new_str, size = codecs.utf_16_decode("abc")
     self.assertEqual(new_str, u'\u6261')
     self.assertEqual(size, 2)
Example #9
0
def decode(input, errors='strict'):
    """Decode ``input`` as one complete piece of UTF-16 data.

    The third argument to ``codecs.utf_16_decode`` marks the data as final.
    The result is the decoded text together with the number of bytes
    consumed.
    """
    text, consumed = codecs.utf_16_decode(input, errors, True)
    return text, consumed
Example #10
0
 def test_utf_16_decode(self):
     #sanity
     new_str, size = codecs.utf_16_decode("abc")
     self.assertEqual(new_str, u'\u6261')
     self.assertEqual(size, 2)
Example #11
0
 def update_event(self, inp=-1):
     """Run ``codecs.utf_16_decode`` on inputs 0-2 and publish it on output 0.

     ``inp`` is accepted but not used here.
     """
     # Positional order expected by utf_16_decode: data, errors, final.
     data = self.input(0)
     errors = self.input(1)
     final = self.input(2)
     decoded = codecs.utf_16_decode(data, errors, final)
     self.set_output_val(0, decoded)