Ejemplo n.º 1
0
def safe_decode_hdr(msg=None, name=None, hdr=None, charset=None):
    """
    This method stubbornly tries to decode header data and convert
    to Pythonic unicode strings. The strings are guaranteed not to
    contain tab, newline or carriage return characters.

    Arguments:
        msg:     optional message object; consulted only when `hdr` is
                 None. The header is read as `msg[name]` and the fallback
                 charset is taken from `msg.get_content_charset()`.
        name:    name of the header to read from `msg`.
        hdr:     optional raw header value; when given, `msg` and `name`
                 are not used.
        charset: optional charset, tried before the one inferred from
                 `msg`. 'utf-8' is used when no charset is known.

    Returns the header value with every tab, newline and carriage return
    replaced by a space. When neither `hdr` nor `msg` is given, the
    empty header value '' is what gets decoded and returned.

    If used with a message object, the header and the MIME charset
    will be inferred from the message headers.
    >>> msg = email.message.Message()
    >>> msg['content-type'] = 'text/plain; charset=utf-8'
    >>> msg['from'] = 'G\\xc3\\xadsli R \\xc3\\x93la <*****@*****.**>'
    >>> safe_decode_hdr(msg, 'from')
    u'G\\xedsli R \\xd3la <*****@*****.**>'

    The =?...?= MIME header encoding is also recognized and processed.

    >>> safe_decode_hdr(hdr='=?iso-8859-1?Q?G=EDsli_R_=D3la?=\\r\\n<*****@*****.**>')
    u'G\\xedsli R \\xd3la <*****@*****.**>'

    >>> safe_decode_hdr(hdr='"=?utf-8?Q?G=EDsli_R?= =?iso-8859-1?Q?=D3la?="')
    u'G\\xedsli R \\xd3la'

    And finally, guesses are made with raw binary data. This process
    could be improved, it currently only attempts utf-8 and iso-8859-1.

    >>> safe_decode_hdr(hdr='"G\\xedsli R \\xd3la"\\r\\t<*****@*****.**>')
    u'"G\\xedsli R \\xd3la"  <*****@*****.**>'

    >>> safe_decode_hdr(hdr='"G\\xc3\\xadsli R \\xc3\\x93la"\\n <*****@*****.**>')
    u'"G\\xedsli R \\xd3la"  <*****@*****.**>'

    # See https://bugs.python.org/issue1079

    # encoded word enclosed in parenthesis (comment syntax)
    >>> safe_decode_hdr(hdr='[email protected] (=?utf-8?Q?Ren=C3=A9?=)')
    u'[email protected] ( Ren\\xe9 )'

    # no space after encoded word
    >>> safe_decode_hdr(hdr='=?UTF-8?Q?Direction?=<*****@*****.**>')
    u'Direction <*****@*****.**>'
    """
    if hdr is None:
        value = ''
        # Only touch msg when there is one; calling without msg and hdr
        # must yield an empty result rather than an AttributeError.
        if msg is not None:
            # A missing header gives a falsy value; normalize it to ''.
            value = msg[name] or ''
            # An explicit charset wins, so the message is only asked
            # for its charset when none was supplied.
            charset = charset or msg.get_content_charset()
    else:
        value = hdr
    charset = charset or 'utf-8'

    if not isinstance(value, unicode):
        # Not unicode yet: this may be arbitrary binary data, so let
        # try_decode guess, substituting '?' where it cannot decode.
        value = try_decode(value, charset, replace='?')

    # At this point we know we have a unicode string. Next we try
    # to very stubbornly decode and discover character sets.
    if '=?' in value and '?=' in value:
        try:
            # decode_header wants an unquoted str (not unicode). The
            # encoded form lives in its own variable so that `value` is
            # still the untouched unicode string if parsing fails below.
            unquoted = value.encode('utf-8').replace('"', '')
            # Decode!
            pairs = decode_header(unquoted)
            value = ' '.join([try_decode(text, found_cs or charset)
                              for text, found_cs in pairs])
        except email.errors.HeaderParseError:
            # Best effort: keep the undecoded unicode value as it was.
            pass

    # Finally, return the unicode data, with white-space normalized
    return value.replace('\r', ' ').replace('\t', ' ').replace('\n', ' ')
Ejemplo n.º 2
0
 def test_decode_header_no_encoding(self):
     res = decode_header("olmsted")
     self.assertEqual(res, [('olmsted', None)])