def utf8_encode_mbcs(s, errors, errorhandler, force_replace=True):
    """Encode the utf8 string ``s`` with the ANSI code page (CP_ACP).

    s             -- utf8 encoded input; it is decoded to unicode first
    errors        -- error scheme name; only consulted when force_replace
                     is false
    errorhandler  -- invoked with the 'strict' scheme to report problems
                     (presumably it raises; its result is ignored here)
    force_replace -- when true, convert with flags 0 and do not track
                     whether a default character was substituted

    Returns the encoded byte string ('' for empty input).  Raises the
    saved Windows error when WideCharToMultiByte returns 0.
    """
    # TODO: do the encoding without decoding utf8 -> unicode
    wide_text = s.decode('utf8')
    wide_len = len(wide_text)

    lenient = force_replace or errors == 'replace'
    if not lenient and errors != 'strict':
        # Any other scheme is reported before anything else happens.
        errorhandler('strict', 'mbcs',
                     "mbcs encoding does not support errors='%s'" % errors,
                     s, 0, 0)

    if wide_len == 0:
        return ''

    if not lenient:
        # strict: disable best-fit mapping and ask the API to tell us
        # whether it had to fall back to a default character
        conv_flags = rwin32.WC_NO_BEST_FIT_CHARS
        default_used_p = lltype.malloc(BOOLP.TO, 1, flavor='raw')
        default_used_p[0] = rffi.cast(rwin32.BOOL, False)
    else:
        conv_flags = 0
        default_used_p = lltype.nullptr(BOOLP.TO)

    try:
        with rffi.scoped_nonmoving_unicodebuffer(wide_text) as wide_ptr:
            # Pass 1: query how many bytes the result needs.
            needed = WideCharToMultiByte(CP_ACP, conv_flags,
                                         wide_ptr, wide_len,
                                         None, 0,
                                         None, default_used_p)
            if needed == 0:
                raise rwin32.lastSavedWindowsError()
            # A substituted default character counts as a failure.
            if default_used_p:
                if rffi.cast(lltype.Bool, default_used_p[0]):
                    errorhandler('strict', 'mbcs', "invalid character",
                                 s, 0, 0)

            with rffi.scoped_alloc_buffer(needed) as out_buf:
                # Pass 2: perform the actual conversion.
                written = WideCharToMultiByte(CP_ACP, conv_flags,
                                              wide_ptr, wide_len,
                                              out_buf.raw, needed,
                                              None, default_used_p)
                if written == 0:
                    raise rwin32.lastSavedWindowsError()
                if default_used_p:
                    if rffi.cast(lltype.Bool, default_used_p[0]):
                        errorhandler('strict', 'mbcs', "invalid character",
                                     s, 0, 0)
                encoded = out_buf.str(needed)
                assert encoded is not None
                return encoded
    finally:
        # Only the strict path allocated the flag; a null pointer is falsy.
        if default_used_p:
            lltype.free(default_used_p, flavor='raw')
def utf8_encode_code_page(cp, s, errors, errorhandler):
    """Encode a utf8 string s using code page cp and the given
    errors/errorhandler.

    cp           -- code page identifier handed to WideCharToMultiByte
    s            -- utf8 encoded input
    errors       -- error scheme name, forwarded to errorhandler and to
                    _encode_code_page_flags
    errorhandler -- called as errorhandler(errors, name, msg, s, start, end)
                    for every code point that cannot be encoded; the first
                    item of the 4-tuple it returns is appended to the output

    Returns an encoded byte string ('' for empty input).  Raises the saved
    Windows error when a conversion fails for a reason other than
    ERROR_NO_UNICODE_TRANSLATION.
    """
    name = _code_page_name(cp)
    lgt = len(s)

    if lgt == 0:
        return ''
    flags = _encode_code_page_flags(cp, errors)
    if cp in (rwin32.CP_UTF8, rwin32.CP_UTF7):
        # The Win32 documentation requires lpUsedDefaultChar to be NULL
        # for these two code pages.
        used_default_p = lltype.nullptr(BOOLP.TO)
    else:
        used_default_p = lltype.malloc(BOOLP.TO, 1, flavor='raw')
    # Encode one codepoint at a time to allow the errorhandlers to do
    # their thing
    chars = lltype.malloc(rffi.CWCHARP.TO, 2, flavor='raw')
    res = StringBuilder(lgt)
    try:
        with rffi.scoped_alloc_buffer(4) as buf:
            # Index (in code points) of the code point being encoded.  It
            # is advanced only by this loop so that the range reported to
            # the error handler always names the failing code point.
            pos = 0
            # TODO: update s if obj != s is returned from an errorhandler
            for uni in Utf8StringIterator(s):
                if used_default_p:
                    # reset the flag before every conversion
                    used_default_p[0] = rffi.cast(rwin32.BOOL, False)
                if uni < 0x10000:
                    chars[0] = rffi.cast(lltype.UniChar, uni)
                    charsize = 1
                else:
                    # outside the BMP: pass a surrogate pair
                    chars[0] = Py_UNICODE_HIGH_SURROGATE(uni)
                    chars[1] = Py_UNICODE_LOW_SURROGATE(uni)
                    charsize = 2
                # convert straight into the 4 byte scratch buffer
                outsize = WideCharToMultiByte(cp, flags,
                                              chars, charsize, buf.raw,
                                              4, None, used_default_p)

                if outsize > 0:
                    if not (used_default_p and used_default_p[0]):
                        r = buf.str(outsize)
                        assert r is not None
                        res.append(r)
                        pos += 1
                        continue
                elif (rwin32.GetLastError_saved() !=
                      rwin32.ERROR_NO_UNICODE_TRANSLATION):
                    raise rwin32.lastSavedWindowsError()
                # If we used a default char, then we failed!
                # The resume position returned by the handler is not used:
                # the iterator always continues with the next code point,
                # and assigning it to pos before the increment would shift
                # every later error report.
                r, newpos, retype, obj = errorhandler(
                    errors, name, "invalid character", s, pos, pos + 1)
                res.append(r)
                pos += 1
    finally:
        lltype.free(chars, flavor='raw')
        if used_default_p:
            lltype.free(used_default_p, flavor='raw')
    return res.build()
def _unibuf_to_utf8(dataptr, insize):
    """Convert the widechar buffer ``dataptr`` to a utf8 byte string.

    dataptr -- widechar buffer handed to WideCharToMultiByte
    insize  -- length argument passed along with dataptr

    Callers are expected to pass a buffer produced by MultiByteToWideChar,
    so the conversion is not expected to fail; should WideCharToMultiByte
    return 0 anyway, the saved Windows error is raised.
    """
    # No "default character used" flag is requested for the utf8 code page.
    no_flag_p = lltype.nullptr(BOOLP.TO)

    # Pass 1: ask for the number of bytes needed.
    needed = WideCharToMultiByte(rwin32.CP_UTF8, 0,
                                 dataptr, insize,
                                 None, 0,
                                 None, no_flag_p)
    if needed == 0:
        raise rwin32.lastSavedWindowsError()

    with rffi.scoped_alloc_buffer(needed) as out_buf:
        # Pass 2: convert into the freshly sized buffer.
        written = WideCharToMultiByte(rwin32.CP_UTF8, 0,
                                      dataptr, insize,
                                      out_buf.raw, needed,
                                      None, no_flag_p)
        if written == 0:
            raise rwin32.lastSavedWindowsError()
        utf8 = out_buf.str(needed)
        assert utf8 is not None
        return utf8