Example #1
0
def test_convert_to_utf8_unknown_encoding():
    """Check convert_to_unicode with unusual or malformed charset labels."""
    # Each row is (expected text, charset label, input value); rows are
    # checked in order and the first mismatch fails the test.
    cases = (
        (u"abc\u20acdef", "windows-874", b"abc\x80def"),
        (u"qwe", 'X-UNKNOWN', u'qwe'),
        (u"qwe", 'ru_RU.KOI8-R', 'qwe'),
        (u"qwe", '"utf-8"; format="flowed"', 'qwe'),
    )
    for expected, charset, value in cases:
        eq_(expected, charsets.convert_to_unicode(charset, value))
Example #2
0
def test_convert_to_utf8_unknown_encoding():
    """Exercise convert_to_unicode with odd charset names and inputs."""
    to_unicode = charsets.convert_to_unicode

    # Byte 0x80 under the "windows-874" label is expected to become U+20AC.
    from_bytes = to_unicode("windows-874", b"abc\x80def")
    eq_(u"abc\u20acdef", from_bytes)

    # A charset label that is not a real codec name, with unicode input.
    from_unknown = to_unicode('X-UNKNOWN', u'qwe')
    eq_(u"qwe", from_unknown)

    # A locale-style label.
    from_locale = to_unicode('ru_RU.KOI8-R', 'qwe')
    eq_(u"qwe", from_locale)

    # A label carrying quotes and a trailing parameter.
    from_quoted = to_unicode('"utf-8"; format="flowed"', 'qwe')
    eq_(u"qwe", from_quoted)
Example #3
0
def mime_to_unicode(header):
    """
    Decode a MIME header value into a single unicode string.

    It differs from the standard library's email.header.decode_header()
    because:
        - it is higher level, i.e. the decoded pieces are joined into one
          unicode string instead of being returned as a list of tuples
        - it accepts unicode and non-ASCII strings as well

    Values that are not strings are returned untouched.  If anything fails
    while decoding, a warning containing the base64 encoded header is
    logged and the complete (unfolded, when unfolding succeeded) header is
    returned undecoded.

    Typical inputs are encoded-words of the form
    "=?charset?encoding?encoded?=", optionally mixed with plain text, e.g.
    mime_to_unicode("=?UTF-8?B?UmVbMl06INCX0LXQvNC70Y/QutC4?=") or
    mime_to_unicode("hello").
    """
    # Only string header values need to be converted.
    if not isinstance(header, basestring):
        return header

    try:
        header = unfold(header)
        decoded = []  # decoded parts

        # Consume a separate cursor: `header` must keep the complete value
        # so that the fallback below logs and returns all of it rather than
        # only the tail that was still unparsed when the failure happened.
        remaining = header
        while remaining:
            match = encodedWord.search(remaining)
            if not match:
                # no encoded-word left: append the remainder
                # of the string to the list of chunks
                # NOTE(review): `ascii` is a name resolved from the enclosing
                # scope, not a string literal -- confirm it is the charset
                # label and not the builtin function.
                decoded.append(charsets.convert_to_unicode(ascii, remaining))
                break

            start = match.start()
            if start != 0:
                # decodes the unencoded part in front of the encoded-word
                value = charsets.convert_to_unicode(ascii, remaining[0:start])
                # whitespace-only runs are dropped
                if value.strip():
                    decoded.append(value)
            # decode a header =?...?= of encoding
            charset, value = decode_part(
                match.group('charset').lower(),
                match.group('encoding').lower(),
                match.group('encoded'))
            decoded.append(charsets.convert_to_unicode(charset, value))
            remaining = remaining[match.end():]
        return u"".join(decoded)
    except Exception:
        # Best effort: never let a malformed header break the caller.
        try:
            logged_header = header
            if isinstance(logged_header, unicode):
                # encode header as utf-8 so all characters can be base64 encoded
                logged_header = logged_header.encode('utf-8')
            logged_header = b64encode(logged_header)
            log.warning(
                u"HEADER-DECODE-FAIL: ({0}) - b64encoded".format(
                    logged_header))
        except Exception:
            log.exception("Failed to log exception")
        return header
Example #4
0
def mime_to_unicode(header):
    """
    Decode a MIME header value into a single unicode string.

    It differs from the standard library's email.header.decode_header()
    because:
        - it is higher level, i.e. the decoded pieces are joined into one
          unicode string instead of being returned as a list of tuples
        - it accepts unicode and non-ASCII strings as well

    Values that are neither str nor bytes are returned untouched.  If
    anything fails while decoding, a warning containing the base64 encoded
    header is logged and the complete (unfolded, when unfolding succeeded)
    header is returned undecoded.

    Typical inputs are encoded-words of the form
    "=?charset?encoding?encoded?=", optionally mixed with plain text, e.g.
    mime_to_unicode("=?UTF-8?B?UmVbMl06INCX0LXQvNC70Y/QutC4?=") or
    mime_to_unicode("hello").
    """
    # Only string header values need to be converted.
    string_types = (str, bytes)
    if not isinstance(header, string_types):
        return header

    try:
        header = unfold(header)
        decoded = []  # decoded parts

        # Consume a separate cursor: `header` must keep the complete value
        # so that the fallback below logs and returns all of it rather than
        # only the tail that was still unparsed when the failure happened.
        remaining = header
        while remaining:
            match = encodedWord.search(remaining)
            if not match:
                # no encoded-word left: append the remainder
                # of the string to the list of chunks
                # NOTE(review): `ascii` is a name resolved from the enclosing
                # scope, not a string literal -- confirm it is the charset
                # label and not the builtin function.
                decoded.append(charsets.convert_to_unicode(ascii, remaining))
                break

            start = match.start()
            if start != 0:
                # decodes the unencoded part in front of the encoded-word
                value = charsets.convert_to_unicode(ascii, remaining[0:start])
                # whitespace-only runs are dropped
                if value.strip():
                    decoded.append(value)
            # decode a header =?...?= of encoding
            charset, value = decode_part(
                match.group('charset').lower(),
                match.group('encoding').lower(), match.group('encoded'))
            decoded.append(charsets.convert_to_unicode(charset, value))
            remaining = remaining[match.end():]
        return u"".join(decoded)
    except Exception:
        # Best effort: never let a malformed header break the caller.
        try:
            logged_header = header
            if isinstance(logged_header, str):
                # encode header as utf-8 so all characters can be base64 encoded
                logged_header = logged_header.encode('utf-8')
            logged_header = b64encode(logged_header)
            log.warning(u"HEADER-DECODE-FAIL: ({0}) - b64encoded".format(
                logged_header))
        except Exception:
            log.exception("Failed to log exception")
        return header
Example #5
0
    def __init__(
        self, content_type, body, charset=None, disposition=None, filename=None):
        """
        Set up the headers and the body of this part.

        :param content_type: content type of the part; it is passed through
            adjust_content_type() together with the body and the filename
        :param body: the payload; bodies whose main type is 'text' are
            stored converted to unicode
        :param charset: charset of a text body, "utf-8" when not given
        :param disposition: Content-Disposition value; when it is omitted
            and a filename is given, 'attachment' is used
        :param filename: file name of the part; only its base name is kept
        """
        self.headers = headers.MimeHeaders()
        self.body = body
        self.disposition = disposition or ('attachment' if filename else None)
        self.filename = filename
        # size of the body exactly as it was passed in
        self.size = len(body)

        if self.filename:
            self.filename = path.basename(self.filename)

        content_type = adjust_content_type(content_type, body, filename)

        if content_type.main == 'text':
            # the text should have a charset
            if not charset:
                charset = "utf-8"

            # it should be stored as unicode. period
            self.body = charsets.convert_to_unicode(charset, body)

            # let's be simple when possible
            if charset != 'ascii' and is_pure_ascii(body):
                charset = 'ascii'

        self.headers['MIME-Version'] = '1.0'
        self.headers['Content-Type'] = content_type
        if charset:
            content_type.params['charset'] = charset

        if self.disposition:
            # Build the header from the effective disposition: the raw
            # argument is None when 'attachment' was implied by the filename.
            self.headers['Content-Disposition'] = WithParams(self.disposition)
            if self.filename:
                self.headers['Content-Disposition'].params['filename'] = self.filename
                self.headers['Content-Type'].params['name'] = self.filename
Example #6
0
    def __init__(
        self, content_type, body, charset=None, disposition=None, filename=None):
        """
        Initialise the part: store the body and build its MIME headers.

        :param content_type: content type of the part; adjust_content_type()
            is applied to it using the body and the filename
        :param body: the payload; when the main type is 'text' it is stored
            as unicode
        :param charset: charset of a text body; defaults to "utf-8"
        :param disposition: Content-Disposition value; falls back to
            'attachment' when only a filename is supplied
        :param filename: file name of the part, reduced to its base name
        """
        self.headers = headers.MimeHeaders()
        self.body = body
        self.disposition = disposition or ('attachment' if filename else None)
        self.filename = filename
        # measured on the body as received, before any unicode conversion
        self.size = len(body)

        if self.filename:
            self.filename = path.basename(self.filename)

        content_type = adjust_content_type(content_type, body, filename)

        if content_type.main == 'text':
            # the text should have a charset
            if not charset:
                charset = "utf-8"

            # it should be stored as unicode. period
            self.body = charsets.convert_to_unicode(charset, body)

            # let's be simple when possible
            if charset != 'ascii' and is_pure_ascii(body):
                charset = 'ascii'

        self.headers['MIME-Version'] = '1.0'
        self.headers['Content-Type'] = content_type
        if charset:
            content_type.params['charset'] = charset

        if self.disposition:
            # Use the effective disposition; the `disposition` argument is
            # None when 'attachment' was derived from the filename above.
            self.headers['Content-Disposition'] = WithParams(self.disposition)
            if self.filename:
                self.headers['Content-Disposition'].params['filename'] = self.filename
                self.headers['Content-Type'].params['name'] = self.filename
Example #7
0
def mime_to_unicode(header):
    """
    Decode a MIME header value into a single unicode string.

    It differs from the standard library's email.header.decode_header()
    because:
        - it is higher level, i.e. the decoded pieces are joined into one
          unicode string instead of being returned as a list of tuples
        - it accepts unicode and non-ASCII strings as well

    Values that are not strings are returned untouched.  If anything fails
    while decoding, a warning containing the base64 encoded header is
    logged and the complete (unfolded, when unfolding succeeded) header is
    returned undecoded.

    Typical inputs are encoded-words of the form
    "=?charset?encoding?encoded?=", optionally mixed with plain text, e.g.
    mime_to_unicode("=?UTF-8?B?UmVbMl06INCX0LXQvNC70Y/QutC4?=") or
    mime_to_unicode("hello").
    """
    # Only string header values need to be converted.
    if not isinstance(header, six.string_types):
        return header

    try:
        header = unfold(header)
        decoded = []  # (value, charset) pairs, converted in one go at the end

        # Consume a separate cursor: `header` must keep the complete value
        # so that the fallback below logs and returns all of it rather than
        # only the tail that was still unparsed when the failure happened.
        remaining = header
        while remaining:
            match = _RE_ENCODED_WORD.search(remaining)
            if not match:
                # Append the remainder of the string to the list of chunks.
                decoded.append((remaining, 'ascii'))
                break

            start = match.start()
            if start != 0:
                # unencoded part in front of the encoded-word;
                # whitespace-only runs are dropped
                value = remaining[0:start]
                if value.strip():
                    decoded.append((value, 'ascii'))
            # decode a header =?...?= of encoding
            charset, value = _decode_part(
                match.group('charset').lower(),
                match.group('encoding').lower(), match.group('encoded'))
            # Consecutive chunks with the same charset are concatenated
            # before conversion -- presumably so that a character split
            # across two encoded-words still decodes; TODO confirm.
            if decoded and decoded[-1][1] == charset:
                decoded[-1] = (decoded[-1][0] + value, charset)
            else:
                decoded.append((value, charset))
            remaining = remaining[match.end():]

        return u"".join(charsets.convert_to_unicode(c, v) for v, c in decoded)
    except Exception:
        # Best effort: never let a malformed header break the caller.
        try:
            logged_header = header
            if isinstance(logged_header, six.text_type):
                # encode header as utf-8 so all characters can be base64 encoded
                logged_header = logged_header.encode('utf-8')
            logged_header = b64encode(logged_header)
            _log.warning(u"HEADER-DECODE-FAIL: ({0}) - b64encoded".format(
                logged_header))
        except Exception:
            _log.exception("Failed to log exception")
        return header
Example #8
0
def decode_charset(parameter):
    """Decode a charset-tagged, percent-encoded parameter value to unicode.

    Handles values shaped like:
    "us-ascii'en'This%20is%20even%20more%20%2A%2A%2Afun%2A%2A%2A%20"
    i.e. charset, language and percent-encoded text separated by
    apostrophes.  A value without both separators is returned as is.
    """
    raw = get_value(parameter)
    try:
        charset, _language, encoded = raw.split("'", 2)
    except ValueError:
        # fewer than three pieces: not in the charset'language'value form
        return raw
    return charsets.convert_to_unicode(charset, urllib_parse.unquote(encoded))
Example #9
0
def decode_charset(parameter):
    """Decode a charset-tagged, percent-encoded parameter value to unicode.

    Handles values shaped like:
    "us-ascii'en'This%20is%20even%20more%20%2A%2A%2Afun%2A%2A%2A%20"
    i.e. charset, language and percent-encoded text separated by
    apostrophes.  A value without both separators is returned as is.
    """
    v = get_value(parameter)
    parts = v.split("'", 2)
    if len(parts) != 3:
        return v
    charset, _language, val = parts
    if charset:
        # Percent-decode to raw bytes so that the declared charset is the
        # one applied to them; urllib.parse.unquote() would already decode
        # the escapes as UTF-8 and the charset would never be used.
        val = urllib.parse.unquote_to_bytes(val)
    else:
        # No charset declared: keep the text-based decoding.
        val = urllib.parse.unquote(val)
    return charsets.convert_to_unicode(charset, val)
Example #10
0
def decode_charset(ctype, body):
    """
    Convert the body of a text part to unicode.

    :param ctype: content type of the part; its `main`, `sub` and
        `get_charset()` are consulted
    :param body: the raw body
    :return: the body unchanged when the main type is not 'text',
        otherwise the body converted with the content type's charset
    """
    if ctype.main != 'text':
        return body

    charset = ctype.get_charset()
    body = charsets.convert_to_unicode(charset, body)

    # for text/html unicode bodies make sure to replace
    # the whitespace (0xA0) with the &nbsp; entity: Outlook is reported
    # to have a bug there
    if ctype.sub == 'html' and charset == 'utf-8':
        # Outlook bug
        body = body.replace(u'\xa0', u'&nbsp;')

    return body
Example #11
0
    def body(self):
        """
        Return the body of this part, computing and caching it on first use.

        A non-empty cached value is returned directly.  For delivery-status
        parts the decoded payload is used, or, when that is None, the
        str() of every sub-payload joined with CRLF.  For other
        non-multipart parts the decoded payload is used and converted to
        unicode when the main content type is 'text'.  Anything else leaves
        the cached value as it was.
        """
        if not self._body:
            if self.content_type.is_delivery_status():
                self._body = self._m.get_payload(decode=True)
                if self._body is None:
                    pieces = [str(piece) for piece in self._m.get_payload()]
                    self._body = "\r\n".join(pieces)

            elif not self._m.is_multipart():
                self._body = self._m.get_payload(decode=True)
                if self._m.get_content_maintype() == 'text':
                    self._body = convert_to_unicode(self.charset, self._body)

        return self._body
Example #12
0
    def body(self):
        """
        Lazily compute, cache and return the body of this part.

        Returns the cached value when it is non-empty.  Delivery-status
        parts use the decoded payload, falling back to the CRLF-joined
        str() of the sub-payloads when decoding yields None.  Multipart
        parts keep whatever is cached.  Every other part uses the decoded
        payload, converted to unicode for 'text' content.
        """
        if self._body:
            return self._body

        if self.content_type.is_delivery_status():
            self._body = self._m.get_payload(decode=True)
            if self._body is None:
                self._body = "\r\n".join(map(str, self._m.get_payload()))
            return self._body

        if self._m.is_multipart():
            # nothing to decode at this level
            return self._body

        self._body = self._m.get_payload(decode=True)
        if self._m.get_content_maintype() == 'text':
            self._body = convert_to_unicode(self.charset, self._body)
        return self._body
Example #13
0
def decode_charset(ctype, body):
    """
    Decode a text body to unicode using the charset of its content type.

    :param ctype: content type object providing `main`, `sub` and
        `get_charset()`
    :param body: the raw body
    :return: `body` as is for non-text content, the unicode text otherwise
    """
    if ctype.main != 'text':
        return body

    charset = ctype.get_charset()
    body = charsets.convert_to_unicode(charset, body)

    # for text/html unicode bodies make sure to replace
    # the whitespace (0xA0) with the &nbsp; entity: Outlook is reported
    # to have a bug there
    if ctype.sub == 'html' and charset == 'utf-8':
        # Outlook bug
        body = body.replace(u'\xa0', u'&nbsp;')

    return body
Example #14
0
def decode_charset(ctype, body):
    """
    Return the body of a part, decoded to unicode when the part is text.

    :param ctype: content type object; `main`, `sub` and `get_charset()`
        are read from it
    :param body: the raw body
    :return: the untouched body for non-text parts, otherwise the body
        converted to unicode with the content type's charset
    """
    if ctype.main != "text":
        return body

    charset = ctype.get_charset()
    body = charsets.convert_to_unicode(charset, body)

    # for text/html unicode bodies make sure to replace
    # the whitespace (0xA0) with the &nbsp; entity: Outlook is reported
    # to have a bug there
    if ctype.sub == "html" and charset == "utf-8":
        # Outlook bug
        body = body.replace(u"\xa0", u"&nbsp;")

    return body
Example #15
0
def decode_charset(ctype, body):
    """
    Decode a text body to unicode and report the content type to use.

    :param ctype: declared content type; `main`, `sub` and `get_charset()`
        are read from it
    :param body: the raw body
    :return: a `(body, content_type)` tuple:
        - the untouched body and `ctype` when the declared type is not text
        - the untouched body and the sniffed type when
          magic.from_buffer() reports a non-text MIME type for the body
        - otherwise the unicode body and `ctype`
    """
    if ctype.main != 'text':
        return body, ctype

    # The declared type says text; check what the content looks like.
    mime_type = magic.from_buffer(body, mime=True)
    mime_type = ContentType(*mime_type.split("/", 1))
    if mime_type.main != 'text':
        return body, mime_type

    charset = ctype.get_charset()
    body = charsets.convert_to_unicode(charset, body)

    # for text/html unicode bodies make sure to replace
    # the whitespace (0xA0) with the &nbsp; entity: Outlook is reported
    # to have a bug there
    if ctype.sub == 'html' and charset == 'utf-8':
        # Outlook bug
        body = body.replace(u'\xa0', u'&nbsp;')

    return body, ctype
Example #16
0
def test_convert_to_utf8_unknown_encoding():
    """Check convert_to_unicode with unusual or malformed charset labels."""
    # Bytes literal: a plain "..." literal is text on Python 3, where
    # "\x80" would be the character U+0080 rather than the raw byte 0x80
    # that is expected to be decoded to U+20AC.
    v = b"abc\x80def"
    eq_(u"abc\u20acdef", charsets.convert_to_unicode("windows-874", v))
    eq_(u"qwe", charsets.convert_to_unicode("X-UNKNOWN", u"qwe"))
    eq_(u"qwe", charsets.convert_to_unicode("ru_RU.KOI8-R", "qwe"))
    eq_(u"qwe", charsets.convert_to_unicode('"utf-8"; format="flowed"', "qwe"))
Example #17
0
 def body(self):
     """Return the decoded payload as unicode, or None for multipart messages."""
     if self.m.is_multipart():
         return None
     charset = self.charset
     payload = self.m.get_payload(decode=True)
     return charsets.convert_to_unicode(charset, payload)
Example #18
0
 def body(self):
     """Decode a single-part message payload to unicode.

     Multipart messages yield None.
     """
     single_part = not self.m.is_multipart()
     if single_part:
         raw = self.m.get_payload(decode=True)
         return charsets.convert_to_unicode(self.charset, raw)
     return None