def convert_entity(m):
    """Return the replacement text for one matched character reference.

    Reads the ``dec``, ``hex``, ``named`` and ``semicolon`` named groups of
    the match *m*.  ``keep``, ``remove_illegal`` and ``name2codepoint`` are
    free names resolved from the surrounding scope.

    Returns the decoded character when the reference can be decoded, the
    matched text unchanged when the (lower-cased) name is in ``keep``, and
    otherwise either ``''`` (when ``remove_illegal`` is true and the
    ``semicolon`` group is non-empty) or the matched text unchanged.
    """
    groups = m.groupdict()
    # Start from "unknown" so a match carrying none of the three groups
    # falls through to the final return instead of raising
    # UnboundLocalError.
    number = None
    if groups.get('dec'):
        number = int(groups['dec'], 10)
    elif groups.get('hex'):
        number = int(groups['hex'], 16)
    elif groups.get('named'):
        entity_name = groups['named']
        if entity_name.lower() in keep:
            return m.group(0)
        # Try the exact spelling first, then the lower-cased one.
        number = (name2codepoint.get(entity_name) or
                  name2codepoint.get(entity_name.lower()))
    if number is not None:
        # Numeric character references in the 80-9F range are typically
        # interpreted by browsers as representing the characters mapped
        # to bytes 80-9F in the Windows-1252 encoding. For more info
        # see: http://en.wikipedia.org/wiki/Character_encodings_in_HTML
        try:
            if 0x80 <= number <= 0x9f:
                return bytes((number,)).decode('cp1252')
            return chr(number)
        except (ValueError, OverflowError):
            # ValueError: code point out of range, or a byte cp1252 does
            # not define.  OverflowError: chr() got a number too large for
            # a C int (e.g. "&#99999999999;").
            pass
    return '' if remove_illegal and groups.get('semicolon') else m.group(0)
def convert_entity(m: Match) -> str:
    """Return the replacement text for one matched character reference.

    Reads the ``dec``, ``hex``, ``named`` and ``semicolon`` named groups of
    the match *m*.  ``keep``, ``remove_illegal`` and ``name2codepoint`` are
    free names resolved from the surrounding scope.

    Returns the decoded character when the reference can be decoded, the
    matched text unchanged when the (lower-cased) name is in ``keep``, and
    otherwise either ``""`` (when ``remove_illegal`` is true and the
    ``semicolon`` group is non-empty) or the matched text unchanged.
    """
    groups = m.groupdict()
    number = None
    if groups.get("dec"):
        number = int(groups["dec"], 10)
    elif groups.get("hex"):
        number = int(groups["hex"], 16)
    elif groups.get("named"):
        entity_name = groups["named"]
        if entity_name.lower() in keep:
            return m.group(0)
        # Try the exact spelling first, then the lower-cased one.
        number = name2codepoint.get(entity_name) or name2codepoint.get(
            entity_name.lower())
    if number is not None:
        # Numeric character references in the 80-9F range are typically
        # interpreted by browsers as representing the characters mapped
        # to bytes 80-9F in the Windows-1252 encoding. For more info
        # see: http://en.wikipedia.org/wiki/Character_encodings_in_HTML
        try:
            if 0x80 <= number <= 0x9F:
                return bytes((number,)).decode("cp1252")
            return chr(number)
        except (ValueError, OverflowError):
            # ValueError: code point out of range, or a byte cp1252 does
            # not define.  OverflowError: chr() got a number too large for
            # a C int (e.g. "&#99999999999;").
            pass
    return "" if remove_illegal and groups.get("semicolon") else m.group(0)
def convert_entity(m):
    """Return the replacement text for one matched character reference.

    Uses positional groups of the match *m*: a truthy group 1 marks a
    numeric reference, a truthy group 2 marks it as hexadecimal, and
    group 3 is the reference body (digits or an entity name).  ``keep``,
    ``remove_illegal`` and ``name2codepoint`` are free names resolved from
    the surrounding scope.

    Returns the decoded character, the matched text unchanged when the
    name is in ``keep``, and otherwise ``''`` if ``remove_illegal`` is true
    or the matched text unchanged if it is not.
    """
    entity_body = m.group(3)
    if m.group(1):
        try:
            if m.group(2):
                number = int(entity_body, 16)
            else:
                number = int(entity_body, 10)
            # Numeric character references in the 80-9F range are typically
            # interpreted by browsers as representing the characters mapped
            # to bytes 80-9F in the Windows-1252 encoding. For more info
            # see: http://en.wikipedia.org/wiki/Character_encodings_in_HTML
            if 0x80 <= number <= 0x9f:
                # bytearray builds the single byte the same way on Python 2
                # and 3; ``chr(number).decode(...)`` only works where chr()
                # returns a byte string.
                return bytearray((number,)).decode('cp1252')
        except ValueError:
            # Unparsable digits, or a byte cp1252 leaves undefined
            # (UnicodeDecodeError is a ValueError subclass).
            number = None
    else:
        if entity_body in keep:
            return m.group(0)
        number = name2codepoint.get(entity_body)
    if number is not None:
        try:
            return chr(number)
        except (ValueError, OverflowError):
            # Out-of-range code point, or a number too large for chr().
            pass
    return '' if remove_illegal else m.group(0)
def substitute_entity(match):
    """Decode one matched entity reference.

    Group 1 equal to ``"#"`` marks a numeric reference; an empty group 2
    then means decimal, anything else hexadecimal.  Group 3 holds the
    digits or the entity name.  Names missing from ``name2codepoint`` are
    returned as the unmodified matched text.
    """
    body = match.group(3)
    if match.group(1) != "#":
        # Named entity.
        codepoint = name2codepoint.get(body)
        if codepoint:
            return unichr(codepoint)
        return match.group()
    # Numeric reference, decimal or hexadecimal.
    if match.group(2) == '':
        return unichr(int(body))
    return unichr(int('0x' + body, 16))
def substitute_entity(match):
    """Return the character for a matched entity reference.

    A ``"#"`` in group 1 selects the numeric path: group 3 is parsed as
    decimal when group 2 is empty and as hexadecimal otherwise.  Without
    the ``"#"``, group 3 is looked up in ``name2codepoint``; an unknown
    name yields the matched text unchanged.
    """
    ent = match.group(3)
    is_numeric = match.group(1) == "#"
    if is_numeric:
        if match.group(2) == '':
            code = int(ent)
        else:
            code = int('0x' + ent, 16)
        return unichr(code)

    code = name2codepoint.get(ent)
    if not code:
        return match.group()
    return unichr(code)
def decode_entity(match):
    """Decode the entity reference captured in group 1 of *match*.

    ``#x...`` is parsed as a hexadecimal code point, ``#...`` as a decimal
    one, and anything else is looked up in ``html.entities.name2codepoint``.
    The resulting integer is passed to ``uchr``.

    An unknown entity name returns the matched text unchanged instead of
    handing that text to ``uchr``.
    NOTE(review): assumes ``uchr`` (defined elsewhere) expects an integer
    code point - confirm.
    """
    what = match.group(1)
    if what.startswith('#x'):
        codepoint = int(what[2:], 16)
    elif what.startswith('#'):
        codepoint = int(what[1:])
    else:
        from html.entities import name2codepoint
        if what not in name2codepoint:
            # Leave references we cannot resolve exactly as written.
            return match.group(0)
        codepoint = name2codepoint[what]
    return uchr(codepoint)
def subst_entity(match):
    """Decode one matched entity reference.

    When group 1 is ``'#'`` group 2 is parsed as a decimal code point;
    otherwise group 2 is looked up in ``n2cp``.  An unknown name yields the
    matched text unchanged.
    """
    body = match.group(2)
    if match.group(1) != '#':
        codepoint = n2cp.get(body)
        return unichr(codepoint) if codepoint else match.group()
    return unichr(int(body))
def _substitute_entity(m):
    """Return the character for the entity reference matched by *m*.

    Group 1 equal to ``"#"`` means group 2 is a decimal code point;
    otherwise group 2 is an entity name looked up in ``name2codepoint``.
    The matched text is returned as-is when the name is not found.
    """
    reference = m.group(2)
    is_numeric = m.group(1) == "#"
    if is_numeric:
        return chr(int(reference))
    codepoint = name2codepoint.get(reference)
    if not codepoint:
        return m.group()
    return chr(codepoint)
def inline_entity_repl(self, stack, entity):
    """Append the character denoted by *entity* to *stack*.

    *entity* is sliced as ``&...;`` text: ``&#x..;`` is read as a
    hexadecimal code point, ``&#..;`` as a decimal one, and anything else
    as a name looked up in ``name2codepoint`` (falling back to U+FFFE when
    the name is unknown).  The character is passed to
    ``stack.top_append``.
    """
    if entity[1] != '#':
        codepoint = name2codepoint.get(entity[1:-1], 0xfffe)
    elif entity[2] == 'x':
        codepoint = int(entity[3:-1], 16)
    else:
        codepoint = int(entity[2:-1], 10)
    stack.top_append(chr(codepoint))
def substitute(match):
    """Decode one matched entity reference.

    Group 1 equal to ``"#"`` means group 2 is a decimal code point;
    otherwise group 2 is an entity name looked up in ``n2cp``.  The matched
    text is returned unchanged when the name is not found.
    """
    token = match.group(2)
    if match.group(1) == "#":
        codepoint = int(token)
    else:
        codepoint = n2cp.get(token)
        if not codepoint:
            return match.group()
    return chr(codepoint)
def substitute_entity(match):
    """Decode one matched entity reference.

    Group 1 equal to ``"#"`` means group 2 is a decimal code point;
    otherwise group 2 is an entity name looked up in
    ``html.entities.name2codepoint``.  The matched text is returned
    unchanged when the name is not found.
    """
    from html.entities import name2codepoint as n2cp

    name_or_digits = match.group(2)
    if match.group(1) != "#":
        known = n2cp.get(name_or_digits)
        if known:
            return chr(known)
        return match.group()
    return chr(int(name_or_digits))
def substitute_entity(match):
    """Decode one matched entity reference.

    Group 1 equal to ``"#"`` means group 2 is a decimal code point;
    otherwise group 2 is an entity name looked up in ``n2cp``.
    """
    ent = match.group(2)
    numeric = match.group(1) == "#"
    # Numeric references carry the code point directly; names go through
    # the lookup table.
    cp = int(ent) if numeric else n2cp.get(ent)
    if numeric or cp:
        return chr(cp)
    # No code point was found for the name: hand back the match untouched.
    return match.group()
def substitute_entity(match):
    """Decode one matched entity reference.

    Group 1 equal to ``"#"`` means group 2 is a decimal code point;
    otherwise group 2 is an entity name looked up in ``name2codepoint``
    (imported from ``html.entities`` or ``htmlentitydefs`` depending on
    ``PY3``).  An unknown name returns the matched text unchanged.

    When ``PY3`` is false the decoded character is returned UTF-8 encoded,
    as before.  When ``PY3`` is true it is returned as text, so every
    branch yields the same type as ``match.group()`` rather than mixing
    ``bytes`` and ``str``.
    NOTE(review): ``PY3`` and ``unichr`` come from outside this block;
    presumably ``unichr`` is aliased to ``chr`` on Python 3 - confirm.
    """
    if PY3:
        from html.entities import name2codepoint as n2cp
    else:
        from htmlentitydefs import name2codepoint as n2cp

    ent = match.group(2)
    if match.group(1) == "#":
        char = unichr(int(ent))
    else:
        cp = n2cp.get(ent)
        if not cp:
            return match.group()
        char = unichr(cp)
    return char if PY3 else char.encode('utf-8')
def substitute(match):
    """Decode one matched entity reference.

    Group 1 equal to ``"#"`` means group 2 is a decimal code point;
    otherwise group 2 is an entity name looked up in ``n2cp``.  The matched
    text is returned unchanged when the name is not found.
    """
    def to_char(codepoint):
        # Prefer unichr where it exists and fall back to chr where it does
        # not.  Only the missing-name case is handled here, so real
        # conversion errors are no longer caught and retried.
        try:
            return unichr(codepoint)
        except NameError:
            return chr(codepoint)

    ent = match.group(2)
    if match.group(1) == "#":
        return to_char(int(ent))
    cp = n2cp.get(ent)
    if cp:
        return to_char(cp)
    return match.group()
def substitute_entity(match):
    """Decode one matched entity reference.

    Group 1 equal to ``"#"`` marks a numeric reference: group 3 is parsed
    as decimal when group 2 is empty and as hexadecimal when group 2 is
    ``x`` or ``X``.  Without the ``"#"``, group 3 is an entity name looked
    up in ``n2cp``.

    Unknown names, and numeric references with any other group 2 value,
    return the matched text unchanged instead of ``None``.
    """
    ent = match.group(3)
    if match.group(1) == "#":
        # Decoding by number.
        radix_marker = match.group(2)
        if radix_marker == '':
            return unichr(int(ent))
        if radix_marker in ('x', 'X'):
            return unichr(int(ent, 16))
        return match.group()
    # They were using a name.
    cp = n2cp.get(ent)
    if cp:
        return unichr(cp)
    return match.group()
def substitute_entity(match):
    """Decode one matched entity reference, best effort.

    Group 1 equal to ``"#"`` marks a numeric reference: group 3 is parsed
    as decimal when group 2 is empty and as hexadecimal when group 2 is
    ``x`` or ``X``.  Without the ``"#"``, group 3 is an entity name looked
    up in ``n2cp``; an unknown name returns the matched text unchanged.

    Any error while decoding yields ``""``, as before.  Numeric references
    with any other group 2 value also yield ``""`` explicitly rather than
    an implicit ``None``.
    """
    try:
        ent = match.group(3)
        if match.group(1) == "#":
            radix_marker = match.group(2)
            if radix_marker == '':
                return chr(int(ent))
            if radix_marker in ('x', 'X'):
                return chr(int(ent, 16))
            return ""
        cp = n2cp.get(ent)
        if cp:
            return chr(cp)
        return match.group()
    except Exception:
        # Deliberately best effort, but KeyboardInterrupt and SystemExit
        # are no longer swallowed.
        return ""
def substitute_entity(match):
    """Decode one matched entity reference.

    Group 1 equal to ``"#"`` means group 2 is a decimal code point;
    otherwise group 2 is an entity name looked up in ``name2codepoint``
    (imported from ``html.entities`` or ``htmlentitydefs`` depending on
    ``PY3``).  The character is UTF-8 encoded and, when ``PY3`` is true
    and the result is ``bytes``, decoded back to text.  An unknown name
    returns the matched text unchanged.
    """
    if PY3:
        from html.entities import name2codepoint as n2cp
    else:
        from htmlentitydefs import name2codepoint as n2cp

    def render(codepoint):
        # Encode first, then undo the encoding only on Python 3.
        encoded = unichr(codepoint).encode('utf-8')
        if PY3 and isinstance(encoded, bytes):
            return encoded.decode("utf-8")
        return encoded

    ent = match.group(2)
    if match.group(1) == "#":
        return render(int(ent))
    cp = n2cp.get(ent)
    if not cp:
        return match.group()
    return render(cp)
def substitute_entity(match):
    """Decode one matched entity reference, best effort.

    Group 1 equal to ``"#"`` marks a numeric reference: group 3 is parsed
    as decimal when group 2 is empty and as hexadecimal when group 2 is
    ``x`` or ``X``.  Without the ``"#"``, group 3 is an entity name looked
    up in ``n2cp``.  Code points are converted with ``safe_unichr``.
    Unknown names, and any exception raised along the way, return the
    matched text unchanged.
    """
    try:
        body = match.group(3)
        if match.group(1) != "#":
            # They were using a name.
            codepoint = n2cp.get(body)
            return safe_unichr(codepoint) if codepoint else match.group()
        # Decoding by number.
        radix_marker = match.group(2)
        if radix_marker == '':
            return safe_unichr(int(body))
        if radix_marker in ['x', 'X']:
            return safe_unichr(int(body, 16))
        # Any other marker falls through with no result.
        return None
    except Exception:
        # In case of errors, return the original input.
        return match.group()
def substitute_entity(match):
    """Decode one matched entity reference, best effort.

    Group 1 equal to ``"#"`` marks a numeric reference: group 3 is parsed
    as decimal when group 2 is empty and as hexadecimal when group 2 is
    ``x`` or ``X``.  Without the ``"#"``, group 3 is an entity name looked
    up in ``n2cp``.  Code points are converted with ``safe_unichr``.
    Unknown names, and any ``Exception`` raised along the way, return the
    matched text unchanged.
    """
    try:
        ent = match.group(3)
        if match.group(1) == "#":
            # Decoding by number.
            if match.group(2) == '':
                # Number is in decimal.
                return safe_unichr(int(ent))
            elif match.group(2) in ['x', 'X']:
                # Number is in hex.
                return safe_unichr(int(ent, 16))
        else:
            # They were using a name.
            cp = n2cp.get(ent)
            if cp:
                return safe_unichr(cp)
            else:
                return match.group()
    except Exception:
        # In case of errors, return the original input.  Exception rather
        # than a bare except, so KeyboardInterrupt and SystemExit still
        # propagate.
        return match.group()
def handle_entityref(self, name):
    """Append the text for the named entity *name* to ``self.__text``.

    A name found in ``name2codepoint`` contributes its character; any other
    name contributes ``'&'`` followed by the name.
    """
    codepoint = name2codepoint.get(name)
    piece = chr(codepoint) if codepoint else '&' + name
    self.__text.append(piece)
def decode_xml_replacer(match):
    """Return the character for the XML reference captured in group 1.

    ``#NNN`` is a decimal character reference and ``#xHHH`` a hexadecimal
    one, as in XML.  Anything else is treated as an entity name and looked
    up in ``name2codepoint``; an unknown name yields ``'?'``.

    Raises:
        ValueError: if the digits of a numeric reference cannot be parsed.
    """
    name = match.group(1)
    if name.startswith("#"):
        digits = name[1:]
        if digits[:1] in ("x", "X"):
            return chr(int(digits[1:], 16))
        return chr(int(digits, 10))
    codepoint = name2codepoint.get(name)
    # '?' is the placeholder for unknown names; it must not go through
    # chr(), which only accepts integers.
    return "?" if codepoint is None else chr(codepoint)
def handle_entityref(self, name):
    """Handle a named entity of the form ``&name;`` (e.g. ``&rsquo;``).

    ``gt``, ``lt`` and ``amp`` are passed to ``handle_data`` still escaped
    as ``&name;``.  Any other name found in ``name2codepoint`` is passed on
    as the decoded character.  An unknown name is passed on as the literal
    ``&name;`` text rather than being handed to ``chr()``, which would
    raise ``TypeError``.
    """
    if name in ('gt', 'lt', 'amp'):
        codepoint = None
    else:
        codepoint = name2codepoint.get(name)
    if codepoint is None:
        self.handle_data("&%s;" % name)
    else:
        self.handle_data(chr(codepoint))
def handle_entityref(self, name):
    """Handle a named entity of the form ``&name;`` (e.g. ``&rsquo;``).

    ``gt``, ``lt`` and ``amp`` are passed to ``handle_data`` still escaped
    as ``&name;``.  Any other name found in ``name2codepoint`` is passed on
    as the decoded character.  An unknown name is passed on as the literal
    ``&name;`` text rather than being handed to ``chr()``, which would
    raise ``TypeError``.
    """
    if name in ['gt', 'lt', 'amp']:
        self.handle_data("&%s;" % name)
        return
    codepoint = name2codepoint.get(name)
    if codepoint is not None:
        self.handle_data(chr(codepoint))
    else:
        self.handle_data(u"&%s;" % name)
def char_from_entity(match):
    """Return the character for the entity name captured in group 1.

    Names missing from ``name2codepoint`` map to U+FFFD.
    """
    entity_name = match.group(1)
    if entity_name in name2codepoint:
        return chr(name2codepoint[entity_name])
    return chr(0xFFFD)
def convert_entities(s):
    """Return *s* with decimal and named character references decoded.

    ``&#NNN;`` becomes the character with that code point and ``&name;``
    the character listed for ``name`` in ``n2cp``.  References that cannot
    be decoded (unknown name, out-of-range number) are left unchanged.

    Both kinds are handled in a single pass, so text produced by one
    replacement is never decoded a second time (``&#38;amp;`` becomes
    ``&amp;``, not ``&``).
    """
    def _replace(m):
        number, name = m.group(1), m.group(2)
        if number is not None:
            try:
                return chr(int(number))
            except (ValueError, OverflowError):
                # Not a valid code point: keep the reference as written.
                return m.group(0)
        cp = n2cp.get(name)
        return chr(cp) if cp is not None else m.group(0)

    # The whole name is captured; ``(\w)+`` would keep only its last
    # character.
    return re.sub(r'&(?:#(\d+)|(\w+));', _replace, s)
def handle_entityref(self, name):
    """Forward the code point of the named entity to ``handle_charref``.

    Names missing from ``name2codepoint`` are ignored.
    """
    if name not in name2codepoint:
        return
    self.handle_charref(name2codepoint[name])