def test_convert_to_utf8_unknown_encoding():
    """Check convert_to_unicode() results for a set of unusual charset labels."""
    # (charset label, raw input, expected unicode result)
    cases = (
        ("windows-874", b"abc\x80def", u"abc\u20acdef"),
        ('X-UNKNOWN', u'qwe', u"qwe"),
        ('ru_RU.KOI8-R', 'qwe', u"qwe"),
        ('"utf-8"; format="flowed"', 'qwe', u"qwe"),
    )
    for label, raw, expected in cases:
        eq_(expected, charsets.convert_to_unicode(label, raw))
def mime_to_unicode(header):
    """
    Takes a header value and returns a fully decoded unicode string.

    It differs from standard Python's mail.header.decode_header() because:
        - it is higher level, i.e. returns a unicode string instead of
          an array of tuples
        - it accepts Unicode and non-ASCII strings as well

    Values that are not strings are returned untouched. If decoding fails,
    the failure is logged (with the header base64 encoded) and the complete
    header is returned undecoded: unfolded when unfolding succeeded,
    otherwise exactly as it was passed in.

    >>> mime_to_unicode("=?UTF-8?B?UmVbMl06INCX0LXQvNC70Y/QutC4?=")
    u"Re[2]: Земляки"
    >>> mime_to_unicode("hello")
    u"hello"
    """
    # Only string header values need to be converted.
    if not isinstance(header, basestring):
        return header

    try:
        header = unfold(header)
        decoded = []  # decoded parts

        # Walk through a separate cursor so that `header` still holds the
        # complete value if decoding fails half way through; consuming
        # `header` itself made the fallback below log and return only the
        # not yet parsed tail.
        remaining = header
        while remaining:
            match = encodedWord.search(remaining)
            if not match:
                # no match? append the remainder
                # of the string to the list of chunks
                # NOTE(review): `ascii` is not defined in this block;
                # presumably a module-level charset name - confirm.
                decoded.append(charsets.convert_to_unicode(ascii, remaining))
                break

            start = match.start()
            if start != 0:
                # decodes unencoded ascii part to unicode
                value = charsets.convert_to_unicode(
                    ascii, remaining[0:start])
                # whitespace-only text in front of an encoded word is dropped
                if value.strip():
                    decoded.append(value)

            # decode a header =?...?= of encoding
            charset, value = decode_part(
                match.group('charset').lower(),
                match.group('encoding').lower(),
                match.group('encoded'))
            decoded.append(charsets.convert_to_unicode(charset, value))
            remaining = remaining[match.end():]

        return u"".join(decoded)
    except Exception:
        # Best effort: never fail on a broken header, report it instead.
        try:
            logged_header = header
            if isinstance(logged_header, unicode):
                # encode header as utf-8 so all characters can be
                # base64 encoded
                logged_header = logged_header.encode('utf-8')
            logged_header = b64encode(logged_header)
            log.warning(
                u"HEADER-DECODE-FAIL: ({0}) - b64encoded".format(
                    logged_header))
        except Exception:
            log.exception("Failed to log exception")

        return header
def mime_to_unicode(header):
    """
    Takes a header value and returns a fully decoded unicode string.

    It differs from standard Python's mail.header.decode_header() because:
        - it is higher level, i.e. returns a unicode string instead of
          an array of tuples
        - it accepts Unicode and non-ASCII strings as well

    Values that are neither str nor bytes are returned untouched. If
    decoding fails, the failure is logged (with the header base64 encoded)
    and the complete header is returned undecoded: unfolded when unfolding
    succeeded, otherwise exactly as it was passed in.

    >>> mime_to_unicode("=?UTF-8?B?UmVbMl06INCX0LXQvNC70Y/QutC4?=")
    u"Re[2]: Земляки"
    >>> mime_to_unicode("hello")
    u"hello"
    """
    # Only string header values need to be converted.
    basestring = (str, bytes)
    if not isinstance(header, basestring):
        return header

    try:
        header = unfold(header)
        decoded = []  # decoded parts

        # Walk through a separate cursor so that `header` still holds the
        # complete value if decoding fails half way through; consuming
        # `header` itself made the fallback below log and return only the
        # not yet parsed tail.
        remaining = header
        while remaining:
            match = encodedWord.search(remaining)
            if not match:
                # no match? append the remainder
                # of the string to the list of chunks
                # NOTE(review): `ascii` is not defined in this block; unless
                # the module defines it, this resolves to the ascii()
                # builtin rather than a charset name - confirm.
                decoded.append(charsets.convert_to_unicode(ascii, remaining))
                break

            start = match.start()
            if start != 0:
                # decodes unencoded ascii part to unicode
                value = charsets.convert_to_unicode(
                    ascii, remaining[0:start])
                # whitespace-only text in front of an encoded word is dropped
                if value.strip():
                    decoded.append(value)

            # decode a header =?...?= of encoding
            charset, value = decode_part(
                match.group('charset').lower(),
                match.group('encoding').lower(),
                match.group('encoded'))
            decoded.append(charsets.convert_to_unicode(charset, value))
            remaining = remaining[match.end():]

        return u"".join(decoded)
    except Exception:
        # Best effort: never fail on a broken header, report it instead.
        try:
            logged_header = header
            if isinstance(logged_header, str):
                # encode header as utf-8 so all characters can be
                # base64 encoded
                logged_header = logged_header.encode('utf-8')
            logged_header = b64encode(logged_header)
            log.warning(u"HEADER-DECODE-FAIL: ({0}) - b64encoded".format(
                logged_header))
        except Exception:
            log.exception("Failed to log exception")

        return header
def __init__(
        self, content_type, body, charset=None, disposition=None,
        filename=None):
    """
    Sets up the headers and the stored body of a single message part.

    :param content_type: content type of the part; it is passed through
        adjust_content_type() together with the body and the filename
    :param body: payload of the part; text payloads are stored as unicode
    :param charset: charset of a text payload, "utf-8" when not given
    :param disposition: Content-Disposition value; falls back to
        'attachment' when only a filename is given
    :param filename: attachment file name, only its base name is kept
    """
    self.headers = headers.MimeHeaders()
    self.body = body
    self.disposition = disposition or ('attachment' if filename else None)
    self.filename = filename
    # size of the payload as it was passed in, before any charset conversion
    self.size = len(body)

    if self.filename:
        self.filename = path.basename(self.filename)

    content_type = adjust_content_type(content_type, body, filename)

    if content_type.main == 'text':
        # the text should have a charset
        if not charset:
            charset = "utf-8"

        # it should be stored as unicode. period
        self.body = charsets.convert_to_unicode(charset, body)

        # let's be simple when possible
        if charset != 'ascii' and is_pure_ascii(body):
            charset = 'ascii'

    self.headers['MIME-Version'] = '1.0'
    self.headers['Content-Type'] = content_type
    if charset:
        content_type.params['charset'] = charset

    if self.disposition:
        # Use the effective disposition: the raw argument is None when the
        # 'attachment' default was derived from the filename, which used to
        # produce a Content-Disposition header without a value.
        self.headers['Content-Disposition'] = WithParams(self.disposition)
        if self.filename:
            self.headers['Content-Disposition'].params['filename'] = \
                self.filename
            self.headers['Content-Type'].params['name'] = self.filename
def mime_to_unicode(header):
    """
    Takes a header value and returns a fully decoded unicode string.

    It differs from standard Python's mail.header.decode_header() because:
        - it is higher level, i.e. returns a unicode string instead of
          an array of tuples
        - it accepts Unicode and non-ASCII strings as well

    Values that are not strings are returned untouched. If decoding fails,
    the failure is logged (with the header base64 encoded) and the complete
    header is returned undecoded: unfolded when unfolding succeeded,
    otherwise exactly as it was passed in.

    >>> mime_to_unicode("=?UTF-8?B?UmVbMl06INCX0LXQvNC70Y/QutC4?=")
    u"Re[2]: Земляки"
    >>> mime_to_unicode("hello")
    u"hello"
    """
    # Only string header values need to be converted.
    if not isinstance(header, six.string_types):
        return header

    try:
        header = unfold(header)
        decoded = []  # (raw value, charset) pairs, converted at the very end

        # Walk through a separate cursor so that `header` still holds the
        # complete value if decoding fails; consuming `header` itself left
        # the fallback below with an empty or partial string, because the
        # charset conversion only happens after the loop has finished.
        remaining = header
        while remaining:
            match = _RE_ENCODED_WORD.search(remaining)
            if not match:
                # Append the remainder of the string to the list of chunks.
                decoded.append((remaining, 'ascii'))
                break

            start = match.start()
            if start != 0:
                # unencoded ascii part in front of the encoded word;
                # whitespace-only text is dropped
                value = remaining[0:start]
                if value.strip():
                    decoded.append((value, 'ascii'))

            # decode a header =?...?= of encoding
            charset, value = _decode_part(
                match.group('charset').lower(),
                match.group('encoding').lower(),
                match.group('encoded'))

            # Adjacent parts in the same charset are concatenated before the
            # conversion (presumably so that a character split across two
            # encoded words still decodes - confirm).
            if decoded and decoded[-1][1] == charset:
                decoded[-1] = (decoded[-1][0] + value, charset)
            else:
                decoded.append((value, charset))

            remaining = remaining[match.end():]

        return u"".join(charsets.convert_to_unicode(c, v) for v, c in decoded)
    except Exception:
        # Best effort: never fail on a broken header, report it instead.
        try:
            logged_header = header
            if isinstance(logged_header, six.text_type):
                # encode header as utf-8 so all characters can be
                # base64 encoded
                logged_header = logged_header.encode('utf-8')
            logged_header = b64encode(logged_header)
            _log.warning(u"HEADER-DECODE-FAIL: ({0}) - b64encoded".format(
                logged_header))
        except Exception:
            _log.exception("Failed to log exception")

        return header
def decode_charset(parameter):
    """Decodes things like:
    "us-ascii'en'This%20is%20even%20more%20%2A%2A%2Afun%2A%2A%2A%20"
    to unicode

    The value is split on the first two single quotes into charset,
    language and percent-encoded text. A value that does not split into
    exactly these three pieces is returned as is.
    """
    raw = get_value(parameter)
    pieces = raw.split("'", 2)
    if len(pieces) == 3:
        charset, _language, quoted = pieces
        return charsets.convert_to_unicode(
            charset, urllib_parse.unquote(quoted))
    return raw
def decode_charset(parameter):
    """Decodes things like:
    "us-ascii'en'This%20is%20even%20more%20%2A%2A%2Afun%2A%2A%2A%20"
    to unicode

    The value is split on the first two single quotes into charset,
    language and percent-encoded text. A value that does not split into
    exactly these three pieces is returned as is.
    """
    raw = get_value(parameter)
    pieces = raw.split("'", 2)
    if len(pieces) == 3:
        charset, _language, quoted = pieces
        return charsets.convert_to_unicode(
            charset, urllib.parse.unquote(quoted))
    return raw
def decode_charset(ctype, body):
    """
    Converts the body of a text part to unicode using the charset of its
    content type; bodies of any other main type are returned unchanged.
    """
    if ctype.main == 'text':
        charset = ctype.get_charset()
        body = charsets.convert_to_unicode(charset, body)

        # Outlook is reported to have a bug with the non-breaking
        # whitespace (0xA0), so for utf-8 text/html bodies it is
        # replaced with a regular space.
        is_utf8_html = ctype.sub == 'html' and charset == 'utf-8'
        if is_utf8_html:
            body = body.replace(u'\xa0', u' ')

    return body
def body(self):
    """
    Returns the body of the wrapped message, computing it on first use.

    A truthy cached value is returned right away. Delivery status parts
    use the decoded payload or, when there is none, the string forms of
    the payload items joined with CRLF. Other non-multipart messages use
    the decoded payload, converted to unicode for text main types.
    Multipart messages that are not delivery statuses keep the cached
    value as it is.
    """
    if self._body:
        return self._body

    if self.content_type.is_delivery_status():
        message = self._m
        self._body = message.get_payload(decode=True)
        if self._body is None:
            self._body = "\r\n".join(
                str(item) for item in message.get_payload())
        return self._body

    message = self._m
    if message.is_multipart():
        return self._body

    # The cache is assigned in stages on purpose: the raw payload stays
    # stored even if the unicode conversion below raises.
    self._body = message.get_payload(decode=True)
    if message.get_content_maintype() == 'text':
        self._body = convert_to_unicode(self.charset, self._body)
    return self._body
def decode_charset(ctype, body):
    """
    Converts the body of a text part to unicode using the charset of its
    content type; bodies of any other main type are returned unchanged.
    """
    if ctype.main == "text":
        charset = ctype.get_charset()
        body = charsets.convert_to_unicode(charset, body)

        # Outlook is reported to have a bug with the non-breaking
        # whitespace (0xA0), so for utf-8 text/html bodies it is
        # replaced with a regular space.
        needs_nbsp_fix = ctype.sub == "html" and charset == "utf-8"
        if needs_nbsp_fix:
            body = body.replace(u"\xa0", u" ")

    return body
def decode_charset(ctype, body):
    """
    Converts the body of a text part to unicode and returns it together
    with the content type that applies to it.

    Returns a (body, content type) pair:
        - the declared type is not text: the body and `ctype` unchanged
        - the type detected from the body by magic is not text: the body
          unchanged and the detected type
        - otherwise: the body converted with the charset of `ctype`,
          and `ctype`
    """
    if ctype.main == 'text':
        # the declared type may be wrong, check what the content looks like
        detected = magic.from_buffer(body, mime=True)
        detected_type = ContentType(*detected.split("/", 1))
        if detected_type.main != 'text':
            return body, detected_type

        charset = ctype.get_charset()
        body = charsets.convert_to_unicode(charset, body)

        # Outlook is reported to have a bug with the non-breaking
        # whitespace (0xA0), so for utf-8 text/html bodies it is
        # replaced with a regular space.
        is_utf8_html = ctype.sub == 'html' and charset == 'utf-8'
        if is_utf8_html:
            body = body.replace(u'\xa0', u' ')

    return body, ctype
def test_convert_to_utf8_unknown_encoding():
    """Check convert_to_unicode() results for a set of unusual charset labels."""
    v = "abc\x80def"
    # (charset label, raw input, expected unicode result)
    cases = (
        ("windows-874", v, u"abc\u20acdef"),
        ("X-UNKNOWN", u"qwe", u"qwe"),
        ("ru_RU.KOI8-R", "qwe", u"qwe"),
        ('"utf-8"; format="flowed"', "qwe", u"qwe"),
    )
    for label, raw, expected in cases:
        eq_(expected, charsets.convert_to_unicode(label, raw))
def body(self):
    """
    Returns the decoded payload of a non-multipart message converted to
    unicode with the charset of the part; returns None for multipart
    messages.
    """
    if self.m.is_multipart():
        return None

    # same evaluation order as a single nested call: converter, charset,
    # then the payload
    convert = charsets.convert_to_unicode
    charset = self.charset
    payload = self.m.get_payload(decode=True)
    return convert(charset, payload)
def body(self):
    """
    Returns the decoded payload of a non-multipart message converted to
    unicode with the charset of the part; returns None for multipart
    messages.
    """
    message = self.m
    if message.is_multipart():
        return None

    # same evaluation order as a single nested call: converter, charset,
    # then the payload
    to_unicode = charsets.convert_to_unicode
    charset = self.charset
    raw_payload = self.m.get_payload(decode=True)
    return to_unicode(charset, raw_payload)