Beispiel #1
0
 def htmlentityreplace_errors(exc):
     """Codec error handler that emits HTML entity references.

     For a ``UnicodeEncodeError`` or ``UnicodeTranslateError`` the failing
     span ``exc.object[exc.start:exc.end]`` is converted to code points
     (a surrogate pair counts as one code point) and each code point is
     written as a named entity from ``encode_entity_map`` when one exists,
     or as a hexadecimal character reference otherwise.

     Any other argument is passed on to ``xmlcharrefreplace_errors``.

     Returns a ``(replacement_text, exc.end)`` tuple.
     """
     if not isinstance(exc, (UnicodeEncodeError, UnicodeTranslateError)):
         return xmlcharrefreplace_errors(exc)

     # Pass 1: turn the failing span into a list of code points.
     segment = exc.object[exc.start:exc.end]
     codepoints = []
     offset = 0
     while offset < len(segment):
         index = offset + exc.start
         # Look at most two units ahead, never beyond the failing span.
         window = exc.object[index:min([exc.end, index + 2])]
         if utils.isSurrogatePair(window):
             codepoints.append(
                 utils.surrogatePairToCodepoint(exc.object[index:index + 2]))
             # The low surrogate was consumed together with the high one.
             offset += 2
         else:
             codepoints.append(ord(segment[offset]))
             offset += 1

     # Pass 2: render each code point as an entity or numeric reference.
     pieces = []
     for cp in codepoints:
         name = encode_entity_map.get(cp)
         if not name:
             pieces.append("&#x%s;" % (hex(cp)[2:]))
             continue
         pieces.extend(("&", name))
         # Some entity names in the map lack the trailing semicolon.
         if not name.endswith(";"):
             pieces.append(";")
     return (u"".join(pieces), exc.end)
Beispiel #2
0
    unicode_encode_errors = "strict"
else:
    unicode_encode_errors = "htmlentityreplace"

    from lib.html5lib.constants import entities

    # Reverse lookup built from ``entities``: maps a code point (int) to the
    # entity name chosen to represent it.
    encode_entity_map = {}
    # True when this interpreter represents a non-BMP character as a single
    # unit; otherwise such a character shows up as a two-unit surrogate pair.
    is_ucs4 = len(u"\U0010FFFF") == 1
    for k, v in entities.items():
        # skip multi-character entities
        if ((is_ucs4 and len(v) > 1) or
                (not is_ucs4 and len(v) > 2)):
            continue
        # "&" itself never gets an entry in the map.
        if v != "&":
            if len(v) == 2:
                # Only reachable when is_ucs4 is False.  Two units that do
                # not form a surrogate pair are a genuine multi-character
                # entity and must be skipped as well; converting them would
                # produce a meaningless key.
                if not utils.isSurrogatePair(v):
                    continue
                codepoint = utils.surrogatePairToCodepoint(v)
            else:
                codepoint = ord(v)
            if codepoint not in encode_entity_map or k.islower():
                # prefer &lt; over &LT; and similarly for &amp;, &gt;, etc.
                encode_entity_map[codepoint] = k

    def htmlentityreplace_errors(exc):
        if isinstance(exc, (UnicodeEncodeError, UnicodeTranslateError)):
            res = []
            codepoints = []
            skip = False