def substitute_entity(match):
    """Return the replacement text for one matched HTML entity.

    ``match`` must expose three groups: ``"#"`` for a numeric reference
    (empty otherwise), the hex marker (empty for decimal) and the entity
    body.  Unknown names and malformed or out-of-range numeric references
    are returned unchanged instead of raising.
    """
    ent = match.group(3)
    if match.group(1) == "#":
        # number, decimal or hexadecimal
        base = 10 if match.group(2) == '' else 16
        try:
            return unichr(int(ent, base))
        except (ValueError, OverflowError):
            # e.g. "&#xzz;" or a code point unichr() cannot represent
            return match.group()
    # name: an optional "x" group may have swallowed the first letter of
    # names such as "xi", so put it back before the lookup
    cp = name2codepoint.get((match.group(2) or '') + ent)
    return unichr(cp) if cp else match.group()
def _resolve_entity(mo):
    """
    Turn one matched HTML entity into the character it stands for.

    @param mo: matched _entity_re object with a "entity" match group
    @type mo: MatchObject instance
    @return: resolved entity char, or empty string on error
    @rtype: unicode string
    """
    body = mo.group("entity")
    text = mo.group()
    if not text.startswith('&#'):
        # named entity: look the code point up in the table
        num = name2codepoint.get(body)
    else:
        # numeric reference: "&#x..." / "&#X..." is hex, anything else decimal
        radix = 16 if text[2] in 'xX' else 10
        try:
            num = int(body, radix)
        except (ValueError, OverflowError):
            return u''
    if num is None or num < 0:
        # unknown entity -> ignore
        return u''
    try:
        return unichr(num)
    except ValueError:
        # code point outside the range unichr() accepts
        return u''
def convert_entity(m):
    """Return the replacement for one matched entity.

    Group 1 is truthy for a numeric reference, group 2 is truthy when that
    number is hexadecimal, and group 3 is the entity body.  Names listed in
    ``keep`` are returned untouched.  Anything that cannot be converted
    becomes ``u''`` when ``remove_illegal`` is set, otherwise the original
    text is returned.
    """
    body = m.group(3)
    number = None
    if not m.group(1):
        if body in keep:
            return m.group(0)
        number = name2codepoint.get(body)
    else:
        try:
            number = int(body, 16 if m.group(2) else 10)
            # Numeric character references in the 80-9F range are typically
            # interpreted by browsers as representing the characters mapped
            # to bytes 80-9F in the Windows-1252 encoding. For more info
            # see: http://en.wikipedia.org/wiki/Character_encodings_in_HTML
            if 0x80 <= number <= 0x9f:
                return chr(number).decode('cp1252')
        except ValueError:
            # also covers UnicodeDecodeError for bytes cp1252 leaves undefined
            number = None
    if number is not None:
        try:
            return unichr(number)
        except ValueError:
            pass
    if remove_illegal:
        return u''
    return m.group(0)
def handler(mo):
    """Callback that rewrites a named entity as a decimal reference.

    Group 1 of ``mo`` is the whole entity including ``&`` and ``;``.
    Entities whose body starts with ``#`` are returned as they are; known
    names become ``&#<codepoint>;``; unknown names become an empty string.
    """
    entity = mo.group(1)
    body = entity[1:-1]
    if body.startswith('#'):
        return entity
    codepoint = name2codepoint.get(body)
    if codepoint:
        return '&#%d;' % codepoint
    return ''
def escape2char(s):
    """Convert one escape sequence to its character, e.g. "\\u0041" -> 'A'.

    The character after the leading escape marker selects the form:
    ``x``/``u``/``U`` hexadecimal, a digit octal, ``&`` a named entity
    (with trailing delimiter), anything else a lookup in ``escape_table``.
    Unknown names and escapes map to U+FFFD.
    """
    from htmlentitydefs import name2codepoint
    marker = s[1]
    if marker in 'xuU':
        return unichr(int(s[2:], 16))
    if marker.isdigit():
        return unichr(int(s[1:], 8))
    if marker == '&':
        return unichr(name2codepoint.get(s[2:-1], 0xFFFD))
    return unichr(escape_table.get(marker, 0xFFFD))
def inline_entity_repl(self, stack, entity):
    """Append the character denoted by ``entity`` to the top of ``stack``.

    ``entity`` is the full reference text including the leading ``&`` and
    the trailing delimiter.  ``&#x...`` is read as hexadecimal, ``&#...``
    as decimal, anything else as a name; unknown names map to U+FFFE.
    """
    if entity[1] != '#':
        codepoint = name2codepoint.get(entity[1:-1], 0xfffe)
    elif entity[2] == 'x':
        codepoint = int(entity[3:-1], 16)
    else:
        codepoint = int(entity[2:-1], 10)
    stack.top_append(unichr(codepoint))
def decode_entity(match):
    """Decode the entity captured in group 1 of ``match`` via ``uchr``.

    A body starting with ``#x`` is parsed as hexadecimal, one starting with
    ``#`` as decimal, and anything else is looked up by name.  For a name
    that is not in the table the complete matched text is what gets handed
    to ``uchr``.
    """
    body = match.group(1)
    if body.startswith('#x'):
        return uchr(int(body[2:], 16))
    if body.startswith('#'):
        return uchr(int(body[1:]))
    from htmlentitydefs import name2codepoint
    return uchr(name2codepoint.get(body, match.group(0)))
def _substitute_entity(m):
    """Return the character for one matched HTML entity.

    Group 1 of ``m`` is ``"#"`` for a (decimal) numeric reference and
    group 2 is the number or the entity name.  Unknown names, and numeric
    bodies that are not valid decimal code points, are returned unchanged
    so one bad reference cannot abort the whole substitution.
    """
    ent = m.group(2)
    if m.group(1) == "#":
        try:
            return unichr(int(ent))
        except (ValueError, OverflowError):
            # non-digit body or a code point unichr() rejects
            return m.group()
    cp = name2codepoint.get(ent)
    return unichr(cp) if cp else m.group()
def substitute_entity(match):
    """Decode one matched entity to its unicode character.

    ``match.group(1)`` is ``"#"`` for a decimal numeric reference;
    ``match.group(2)`` carries the digits or the entity name.  Whatever
    cannot be decoded (unknown name, non-numeric or out-of-range number)
    is handed back exactly as it was matched.
    """
    body = match.group(2)
    if match.group(1) != "#":
        codepoint = n2cp.get(body)
        if codepoint:
            return unichr(codepoint)
        return match.group()
    try:
        return unichr(int(body))
    except (ValueError, OverflowError):
        # previously propagated and aborted the surrounding substitution
        return match.group()
def substitute_entity(match):
    """Decode one matched entity (decimal, hexadecimal or named).

    Groups: 1 is ``"#"`` for numeric references, 2 is the hex marker
    (``"x"``), 3 is the body.  Unknown names and malformed or out-of-range
    numbers are returned unchanged rather than raising.
    """
    ent = match.group(3)
    if match.group(1) == "#":
        base = 16 if match.group(2) in ('x', 'X') else 10
        try:
            return unichr(int(ent, base))
        except (ValueError, OverflowError):
            return match.group()
    # An optional "x" group can capture the first letter of names such as
    # "xi"; include it so those names resolve.
    cp = n2cp.get((match.group(2) or '') + ent)
    if cp:
        return unichr(cp)
    return match.group()
def substitute_entity(match):
    """Decode one matched entity to its unicode character.

    Group 1 is ``"#"`` for a decimal numeric reference, group 2 the digits
    or the entity name.  References that cannot be decoded are returned
    unchanged instead of raising from ``int()``/``unichr()``.
    """
    from htmlentitydefs import name2codepoint as n2cp
    ent = match.group(2)
    if match.group(1) == "#":
        try:
            return unichr(int(ent))
        except (ValueError, OverflowError):
            # e.g. "&#abc;" or a number beyond the unicode range
            return match.group()
    cp = n2cp.get(ent)
    return unichr(cp) if cp else match.group()
def __entity(match):
    """Regex callback: decode one entity, leaving undecodable ones alone.

    Group 1 is ``"#"`` for a decimal numeric reference; group 2 holds the
    number or the name.  A numeric body that ``int()`` or ``unichr()``
    rejects no longer raises; the matched text is returned instead.
    """
    ent = match.group(2)
    if match.group(1) == "#":
        try:
            codepoint = int(ent)
            return unichr(codepoint)
        except (ValueError, OverflowError):
            return match.group()
    codepoint = name2codepoint.get(ent)
    if not codepoint:
        return match.group()
    return unichr(codepoint)
def inline_entity_repl(self, stack, entity):
    """Resolve ``entity`` to a character and append it to ``stack``'s top.

    ``entity`` includes the leading ``&`` and the trailing delimiter.
    Numeric references are decimal unless the character after ``#`` is
    ``x``; named references that are not in the table yield U+FFFE.
    """
    if entity[1] == "#":
        if entity[2] == "x":
            digits, radix = entity[3:-1], 16
        else:
            digits, radix = entity[2:-1], 10
        char = unichr(int(digits, radix))
    else:
        from htmlentitydefs import name2codepoint
        char = unichr(name2codepoint.get(entity[1:-1], 0xFFFE))
    stack.top_append(char)
def subs_entity(match):
    """Decode one matched entity (decimal, hexadecimal or named).

    Groups: 1 is ``"#"`` for numeric references, 2 the hex marker, 3 the
    body.  The matched text is returned unchanged when the name is unknown
    or the number is malformed or out of range.
    """
    entity = match.group(3)
    if match.group(1) == "#":
        radix = 16 if match.group(2) in ('x', 'X') else 10
        try:
            return unichr(int(entity, radix))
        except (ValueError, OverflowError):
            return match.group()
    # Re-attach a leading "x" captured by the hex-marker group so that
    # names like "xi" are found.
    codepoint = name2codepoint.get((match.group(2) or '') + entity)
    if codepoint is not None:
        return unichr(codepoint)
    return match.group()
def substitute_html_entity(match):
    """Normalise one matched entity to a numeric character reference.

    Numeric references (group 1 is ``"#"``) are rebuilt as they were.  The
    names br, nbsp, gt, lt and quot are kept as named entities.  Any other
    known name becomes ``&#<codepoint>;``; unknown names are returned as
    matched.
    """
    body = match.group(2)
    if match.group(1) == "#":
        return "&#" + body + ";"
    if body in ['br', 'nbsp', 'gt', 'lt', 'quot']:
        return "&" + body + ";"
    codepoint = n2cp.get(body)
    if not codepoint:
        return match.group()
    return "&#" + str(codepoint) + ";"
def substitute_entity(match):
    """Decode one matched entity to its unicode character.

    Group 1 is ``"#"`` for a decimal numeric reference, group 2 the digits
    or the name.  A ``UnicodeDecodeError`` still yields the bare body, as
    before; other conversion failures (non-numeric or out-of-range
    numbers) now return the matched text instead of propagating.
    """
    ent = match.group(2)
    try:
        if match.group(1) == "#":
            return unichr(int(ent))
        cp = n2cp.get(ent)
        if cp:
            return unichr(cp)
        return match.group()
    except UnicodeDecodeError:
        # must come first: UnicodeDecodeError is a ValueError subclass
        return ent
    except (ValueError, OverflowError):
        return match.group()
def _substitute_entity(m):
    """Decode one matched entity (decimal, hexadecimal or named).

    Group 1 is ``"#"`` for numeric references and group 2 is the body; a
    numeric body starting with ``x`` or ``X`` is hexadecimal.  Unknown
    names and malformed or out-of-range numbers are returned unchanged.
    """
    ent = m.group(2)
    if m.group(1) == "#":
        try:
            # Hex value (HTML allows either case for the marker)
            if ent[:1] in ('x', 'X'):
                return unichr(int(ent[1:], 16))
            return unichr(int(ent))
        except (ValueError, OverflowError):
            return m.group()
    cp = name2codepoint.get(ent)
    return unichr(cp) if cp else m.group()
def _substitute_entity(cls, match):
    """
    Decode one matched entity (decimal, hexadecimal or named).

    Adapted from http://snippets.dzone.com/posts/show/4569

    Groups of ``match``: 1 is ``'#'`` for numeric references, 2 the hex
    marker, 3 the body.  Unknown names and malformed or out-of-range
    numbers are returned unchanged instead of raising.
    """
    ent = match.group(3)
    if match.group(1) == '#':
        radix = 16 if match.group(2) in ('x', 'X') else 10
        try:
            return unichr(int(ent, radix))
        except (ValueError, OverflowError):
            return match.group()
    # The hex-marker group may have captured the leading "x" of a name
    # such as "xi"; restore it for the lookup.
    cp = name2codepoint.get((match.group(2) or '') + ent)
    if cp:
        return unichr(cp)
    return match.group()
def substitute_entity(match):
    """
    used by decode_htmlentities.

    Decodes one matched entity: group 1 is ``"#"`` for a decimal numeric
    reference, group 2 the digits or the name.  Unknown names and numbers
    that cannot be converted are returned unchanged.
    """
    ent = match.group(2)
    is_numeric = match.group(1) == "#"
    if is_numeric:
        try:
            return unichr(int(ent))
        except (ValueError, OverflowError):
            # keep the text rather than aborting the whole substitution
            return match.group()
    cp = n2cp.get(ent)
    if not cp:
        return match.group()
    return unichr(cp)
def substitute_entity(match):
    """Decode one matched entity (decimal, hexadecimal or named).

    Groups: 1 is ``"#"`` when decoding by number, 2 is ``"x"`` for hex,
    3 is the body.  Unknown names and malformed or out-of-range numbers
    are returned unchanged.
    """
    ent = match.group(3)
    if match.group(1) == "#":
        # decoding by number: hex when marked, decimal otherwise
        base = 16 if match.group(2) in ('x', 'X') else 10
        try:
            return unichr(int(ent, base))
        except (ValueError, OverflowError):
            return match.group()
    # they were using a name; the hex-marker group can swallow a leading
    # "x" (e.g. "xi"), so add it back
    cp = n2cp.get((match.group(2) or '') + ent)
    return unichr(cp) if cp else match.group()
def repl_func(m):
    """Return the character for the entity captured in group ``entity``.

    The captured text is tried as a decimal number first and as an entity
    name otherwise.  When the name is unknown or the code point cannot be
    converted, the matched text is returned as-is.
    """
    entity = m.group('entity')  # get the entity name or number
    try:
        codepoint = int(entity)
    except ValueError:
        # not an integer, so treat it as a name; the lookup can miss
        codepoint = name2codepoint.get(entity)
    if codepoint is None:
        # unknown name: unichr(None) would raise TypeError
        return m.group()
    try:
        return unichr(codepoint)
    except (ValueError, OverflowError):
        # code point outside what unichr() supports: leave as-is
        return m.group()
def entity_to_char(self, text):
    """Swap accents code in HTML to accents characters

    Every distinct name returned by ``self.cod_html.findall(text)`` that is
    present in ``name2codepoint`` has its ``&name;`` occurrences in ``text``
    replaced by the UTF-8 encoded character.  Failures are logged and the
    loop continues.
    """
    # NOTE(review): no ``return`` is visible in this view, so the rewritten
    # ``text`` never reaches the caller unless the definition continues
    # beyond what is shown here -- confirm.
    name_func = '(Text.swap_cod_html_to_char) '
    cods = self.cod_html.findall(text)
    # de-duplicate so each entity name is replaced only once
    cods = set(cods)
    for cod in cods:
        cod_unicode = name2codepoint.get(cod)
        if cod_unicode:
            try:
                text = text.replace('&%s;'%(cod), unichr(cod_unicode).encode('utf-8'))
            except Exception, msg:
                # NOTE(review): the backslash in the message below looks like
                # a former line continuation inside the literal -- verify the
                # intended whitespace.
                logging.error('%sErro ao trocar os chars acentuados em \ HTML(1): %s' % (name_func, msg))
def _substitute_entity(match):
    """Decode one matched entity (decimal, hexadecimal or named).

    Groups: 1 is ``'#'`` when decoding by number, 2 is ``'x'`` for hex,
    3 is the body.  Unknown names and malformed or out-of-range numbers
    are returned unchanged instead of raising.
    """
    ent = match.group(3)
    if match.group(1) != '#':
        # they were using a name; put back an "x" the hex-marker group may
        # have captured (e.g. for "xi")
        cp = n2cp.get((match.group(2) or '') + ent)
        if cp:
            return unichr(cp)
        return match.group()
    # decoding by number
    base = 16 if match.group(2) in ('x', 'X') else 10
    try:
        return unichr(int(ent, base))
    except (ValueError, OverflowError):
        return match.group()
def substitute_entity(match):
    """Decode one matched entity (decimal, hexadecimal or named).

    Groups: 1 is ``"#"`` when decoding by number, 2 is ``"x"`` for hex,
    3 is the body.  Unknown names and malformed or out-of-range numbers
    are returned unchanged.
    """
    from htmlentitydefs import name2codepoint as n2cp
    ent = match.group(3)
    if match.group(1) == "#":
        # decoding by number
        base = 16 if match.group(2) in ('x', 'X') else 10
        try:
            return unichr(int(ent, base))
        except (ValueError, OverflowError):
            return match.group()
    # they were using a name; re-attach a leading "x" captured by the
    # hex-marker group so names like "xi" resolve
    cp = n2cp.get((match.group(2) or '') + ent)
    return unichr(cp) if cp else match.group()
def substitute_entity(match):
    """Decode one matched entity; any failure yields an empty string.

    Groups: 1 is ``"#"`` for numeric references, 2 is ``"x"`` for hex,
    3 is the body.  Unknown names are returned unchanged.  Errors while
    decoding are still swallowed, but ``KeyboardInterrupt`` and
    ``SystemExit`` are no longer caught.
    """
    try:
        ent = match.group(3)
        if match.group(1) == "#":
            if match.group(2) == '':
                return unichr(int(ent))
            elif match.group(2) == 'x':
                return unichr(int('0x'+ent, 16))
        else:
            # the hex-marker group can capture the leading "x" of a name
            # such as "xi"; include it in the lookup
            cp = n2cp.get((match.group(2) or '') + ent)
            if cp:
                return unichr(cp)
            return match.group()
    except Exception:
        return ""
# NOTE(review): this span is the tail of an enclosing function whose header
# is not visible here; ``string`` below is presumably its parameter.
def substitute_entity(match):
    """Return the character for one entity matched by ``entity_re``.

    Groups: 1 is "#" for numeric references, 2 is "x" for hexadecimal,
    3 is the body.  Unknown names are returned unchanged.
    """
    ent = match.group(3)
    if match.group(1) == "#":
        # decoding by number
        if match.group(2) == '':
            # number is in decimal
            return unichr(int(ent))
        elif match.group(2) == 'x':
            # number is in hex
            return unichr(int('0x'+ent, 16))
    else:
        # they were using a name
        # NOTE(review): for a name starting with "x" (e.g. "xi") the (x?)
        # group of the pattern below takes that letter, so only the
        # remainder is looked up here.
        cp = n2cp.get(ent)
        if cp:
            return unichr(cp)
        else:
            return match.group()

# subn() returns (new_string, count); only the string is used
entity_re = re.compile(r'&(#?)(x?)(\w+);')
return entity_re.subn(substitute_entity, string)[0]
def handle_entityref(self, name):
    """Print a named entity reference as a decimal character reference.

    Names missing from ``name2codepoint`` produce no output.
    """
    # XXX: doesn't get called if convert_charrefs=True
    # we are sure we're on PY2 here
    num = name2codepoint.get(name)
    if num is None:
        return
    print('&#%(ref)d;' % {'ref': num}, end='')
def convert_entities(s):
    """Replace decimal and named HTML entities in ``s`` with characters.

    ``&#NNN;`` becomes the character with that code point and ``&name;``
    the character listed in ``n2cp``.  Unknown names and out-of-range
    numbers are left untouched.  Everything is decoded in a single pass so
    that text produced by one replacement is never decoded a second time
    (``&#38;amp;`` yields ``&amp;``, not ``&``).
    """
    def _replace(m):
        number, name = m.group(1), m.group(2)
        if number is not None:
            try:
                return unichr(int(number))
            except (ValueError, OverflowError):
                return m.group(0)
        codepoint = n2cp.get(name)
        if codepoint is None:
            return m.group(0)
        return unichr(codepoint)

    # group 1: decimal digits of a numeric reference; group 2: entity name
    return re.sub(r'&(?:#(\d+)|(\w+));', _replace, s)
def handle_entityref(self, name):
    """Append the text for a named entity reference to the text buffer.

    Known names contribute their character; unknown names contribute
    ``&`` followed by the name.
    """
    cp = name2codepoint.get(name)
    piece = unichr(cp) if cp else u'&'+name
    self.__text.append(piece)
def handle_entityref(self, name):
    """Handle named entities of the form &aaaa;

    ``gt``, ``lt`` and ``amp`` are forwarded to ``handle_data`` still
    escaped.  Other known names are forwarded as the character they stand
    for.  Unknown names are forwarded as the literal ``&name;`` text
    instead of being passed to ``unichr()``, which raised TypeError.
    """
    if name in ['gt', 'lt', 'amp']:
        self.handle_data("&%s;" % name)
        return
    codepoint = name2codepoint.get(name)
    if codepoint is None:
        self.handle_data(u"&%s;" % name)
    else:
        self.handle_data(unichr(codepoint))
def html_unescape(mystring):
    """Replace every ``HTML_RE`` match in ``mystring`` with a character.

    Group 1 of each match is looked up in ``name2codepoint``; names that
    are not found are replaced by code point 63 (``?``).
    """
    def _decode(m):
        codepoint = name2codepoint.get(m.group(1), 63)
        return unichr(codepoint)

    return HTML_RE.sub(_decode, mystring)