def test_utf_16_decode(self): # When BOM present: it is removed and the proper UTF-16 variant is automatically selected string, num_processed = codecs.utf_16_decode(codecs.BOM_UTF16_LE + b'a\0b\0c\0') self.assertEqual(string, "abc") self.assertEqual(num_processed, 4 * 2) string, num_processed = codecs.utf_16_decode(codecs.BOM_UTF16_BE + b'\0a\0b\0c') self.assertEqual(string, "abc") self.assertEqual(num_processed, 4 * 2) # When no BOM: on little-endian systems, UTF-16 defaults to UTF-16-LE string, num_processed = codecs.utf_16_decode(b'a\0b\0c\0') self.assertEqual(string, 'abc') self.assertEqual(num_processed, 3 * 2)
def convert_table(self, dbname, tablename, target_dir):
    """Convert a single database table to a semicolon-delimited CSV file.

    The raw table data is dumped to a temporary file, decoded from
    UTF-16, normalised (NUL characters stripped, Windows line endings
    converted to *nix) and written to ``<target_dir>/<tablename>.csv``
    beneath a header row of column names.
    """
    print("\tConverting {}.{}".format(dbname, tablename))
    header = ';'.join(self.get_column_names(dbname, tablename)) + '\n'
    targetpath = os.path.join(target_dir, tablename + '.csv')
    tmpfile = TempFile(self.tmp_dir)
    self.write_data(dbname, tablename, tmpfile.path)
    # The underlying dbc tends to append an ending
    tmpfile.update_ending()
    with open(targetpath, 'w') as f:
        f.write(header)
        with tmpfile.open('rb') as t:
            # codecs.utf_16_decode returns (decoded_text, bytes_consumed);
            # only the decoded text is needed here.
            text = codecs.utf_16_decode(t.read())[0]
            # Strip embedded NUL padding, then normalise line endings.
            text = text.replace('\x00', '')
            text = text.replace('\r\n', '\n')
            f.write(text)
    tmpfile.delete()
def get_text(string, start, end, bom=True):
    """Slice *string* by character offsets measured in UTF-16 code units.

    Offsets produced by Rosette (and other software) that use UTF-16
    native string representations do not line up with Python's
    code-point indexing on Pythons with UCS-4 support, such as Python
    3.3+ (refer to https://www.python.org/dev/peps/pep-0393/).  To
    honour such offsets, the string is round-tripped through the UTF-16
    codec and sliced at byte positions: each UTF-16 logical character
    consumes 2 bytes, and when *bom* is true the 2-byte byte order mark
    at the front of the encoding is accounted for.  Note that an
    individual character can consume up to 4 bytes (32 bits for
    so-called 'wide' characters) and graphemes can consume even more.
    """
    import codecs

    if not isinstance(string, str):
        raise ValueError('expected string to be of type str')
    if start is not None and not isinstance(start, int):
        raise ValueError('expected start to be of type int or NoneType')
    if end is not None and not isinstance(end, int):
        raise ValueError('expected end to be of type int or NoneType')

    def to_byte_offset(offset):
        # 2 bytes per UTF-16 code unit, plus 2 for the BOM when present.
        if offset is None:
            return None
        return offset * 2 + (2 if bom else 0)

    encoded, _ = codecs.utf_16_encode(string)
    decoded, _ = codecs.utf_16_decode(
        encoded[to_byte_offset(start):to_byte_offset(end)])
    return decoded
def read_short_stream(self, entry):
    """Read a short-stream entry's data from the short-sector container.

    Walks the entry's sector chain through the short-sector allocation
    table (SSAT) and concatenates the sectors, truncated to the size
    recorded on the entry.
    """
    ss_size = 1 << self.header.sb_shift
    name = utf_16_decode(entry.name)[0].strip()
    print("read_short_stream: %s" % name)
    chain = self.get_secID_chain(entry.secID, self.ssat)
    # Collect sector chunks and join once: repeated bytes += is O(n^2).
    chunks = []
    for secID in chain:
        if secID == -2:  # -2 marks end of chain
            break
        offset = self.short_sector_pos(secID)
        chunks.append(self.ssc[offset:offset + ss_size])
    data = b''.join(chunks)
    return data[:entry._v.size]
def read_stream(self, entry):
    """Read a stream entry's data from the compound file.

    Entries smaller than the header threshold live in the short-sector
    container and are delegated to read_short_stream(); larger ones are
    assembled by walking the standard sector chain, truncated to the
    size recorded on the entry.
    """
    sec_size = 1 << self.header.bb_shift
    if entry._v.size < self.header.threshold:
        return self.read_short_stream(entry)
    name = utf_16_decode(entry.name)[0].strip()
    print("read_stream: %s" % name)
    chain = self.get_secID_chain(entry.secID)
    # Collect sector chunks and join once: repeated bytes += is O(n^2).
    chunks = []
    for secID in chain:
        if secID == -2:  # -2 marks end of chain
            break
        offset = self.sector_offset(secID)
        chunks.append(self.__file[offset:offset + sec_size])
    data = b''.join(chunks)
    return data[:entry._v.size]
def test_codecs_builtins(self): s = "abc" encoded = codecs.utf_8_encode(s) self.assertEqual(s, codecs.utf_8_decode(encoded[0])[0]) encoded = codecs.utf_7_encode(s) self.assertEqual(s, codecs.utf_7_decode(encoded[0])[0]) encoded = codecs.utf_16_encode(s) self.assertEqual(s, codecs.utf_16_decode(encoded[0])[0]) encoded = codecs.utf_16_le_encode(s) self.assertEqual(s, codecs.utf_16_le_decode(encoded[0])[0]) encoded = codecs.utf_16_be_encode(s) self.assertEqual(s, codecs.utf_16_be_decode(encoded[0])[0]) encoded = codecs.utf_32_encode(s) self.assertEqual(s, codecs.utf_32_decode(encoded[0])[0]) encoded = codecs.utf_32_le_encode(s) self.assertEqual(s, codecs.utf_32_le_decode(encoded[0])[0]) encoded = codecs.utf_32_be_encode(s) self.assertEqual(s, codecs.utf_32_be_decode(encoded[0])[0]) encoded = codecs.utf_32_be_encode(s) self.assertEqual(s, codecs.utf_32_be_decode(encoded[0])[0]) encoded = codecs.raw_unicode_escape_encode(s) self.assertEqual(s, codecs.raw_unicode_escape_decode(encoded[0])[0]) encoded = codecs.unicode_escape_encode(s) self.assertEqual(s, codecs.unicode_escape_decode(encoded[0])[0]) encoded = codecs.latin_1_encode(s) self.assertEqual(s, codecs.latin_1_decode(encoded[0])[0]) encoded = codecs.ascii_encode(s) self.assertEqual(s, codecs.ascii_decode(encoded[0])[0])
def decode(input, errors='strict'):
    """Decode UTF-16 *input* in a single shot (final=True).

    Returns the (text, bytes_consumed) pair produced by
    codecs.utf_16_decode.
    """
    decoded, consumed = codecs.utf_16_decode(input, errors, True)
    return (decoded, consumed)
def test_utf_16_decode(self): #sanity new_str, size = codecs.utf_16_decode("abc") self.assertEqual(new_str, u'\u6261') self.assertEqual(size, 2)
def update_event(self, inp=-1):
    """Decode input 0 as UTF-16 and publish the result on output 0.

    Input 1 supplies the error-handling mode and input 2 the 'final'
    flag for codecs.utf_16_decode.
    """
    data = self.input(0)
    errors = self.input(1)
    final = self.input(2)
    result = codecs.utf_16_decode(data, errors, final)
    self.set_output_val(0, result)