Esempio n. 1
0
    def convert_entity(m):
        """Return the replacement text for one matched HTML entity.

        ``m`` is a regex match exposing the named groups ``dec``, ``hex``,
        ``named`` and ``semicolon``.  Named entities whose lower-cased name
        is in ``keep`` are returned untouched.  A reference that cannot be
        converted is replaced by ``''`` when ``remove_illegal`` is true and
        the reference ended with a semicolon; otherwise the matched text is
        returned unchanged.
        """
        groups = m.groupdict()
        # Start from "unknown" so a match without any usable group falls
        # through to the illegal-entity handling below instead of raising
        # UnboundLocalError.
        number = None
        if groups.get('dec'):
            number = int(groups['dec'], 10)
        elif groups.get('hex'):
            number = int(groups['hex'], 16)
        elif groups.get('named'):
            entity_name = groups['named']
            if entity_name.lower() in keep:
                return m.group(0)
            number = (name2codepoint.get(entity_name)
                      or name2codepoint.get(entity_name.lower()))
        if number is not None:
            # Numeric character references in the 80-9F range are typically
            # interpreted by browsers as representing the characters mapped
            # to bytes 80-9F in the Windows-1252 encoding. For more info
            # see: http://en.wikipedia.org/wiki/Character_encodings_in_HTML
            try:
                if 0x80 <= number <= 0x9f:
                    return bytes((number, )).decode('cp1252')
                return chr(number)
            except (ValueError, OverflowError):
                # ValueError: code point outside the Unicode range, or a
                # byte cp1252 leaves undefined.  OverflowError: a number too
                # large for chr() to accept at all.
                pass

        return '' if remove_illegal and groups.get('semicolon') else m.group(0)
Esempio n. 2
0
    def convert_entity(m: Match) -> str:
        """Return the replacement text for one matched HTML entity.

        ``m`` exposes the named groups ``dec``, ``hex``, ``named`` and
        ``semicolon``.  Named entities whose lower-cased name is in ``keep``
        are returned untouched.  A reference that cannot be converted is
        replaced by ``""`` when ``remove_illegal`` is true and the reference
        ended with a semicolon; otherwise the matched text is returned
        unchanged.
        """
        groups = m.groupdict()
        number = None
        if groups.get("dec"):
            number = int(groups["dec"], 10)
        elif groups.get("hex"):
            number = int(groups["hex"], 16)
        elif groups.get("named"):
            entity_name = groups["named"]
            if entity_name.lower() in keep:
                return m.group(0)
            # Try the exact spelling first, then the lower-cased one.
            number = name2codepoint.get(entity_name) or name2codepoint.get(
                entity_name.lower())
        if number is not None:
            # Numeric character references in the 80-9F range are typically
            # interpreted by browsers as representing the characters mapped
            # to bytes 80-9F in the Windows-1252 encoding. For more info
            # see: http://en.wikipedia.org/wiki/Character_encodings_in_HTML
            try:
                if 0x80 <= number <= 0x9F:
                    return bytes((number, )).decode("cp1252")
                return chr(number)
            except (ValueError, OverflowError):
                # chr() raises OverflowError (not ValueError) for numbers
                # that do not fit a C int; treat both as "illegal entity".
                pass

        return "" if remove_illegal and groups.get("semicolon") else m.group(0)
Esempio n. 3
0
    def convert_entity(m):
        """Return the replacement text for one matched HTML entity.

        Group 1 of ``m`` is non-empty for numeric references, group 2 is
        non-empty when the number is hexadecimal, and group 3 holds the
        digits or the entity name.  Names listed in ``keep`` are returned
        untouched.  A reference that cannot be converted is replaced by
        ``''`` when ``remove_illegal`` is true, otherwise the matched text is
        returned unchanged.
        """
        entity_body = m.group(3)
        if m.group(1):
            try:
                if m.group(2):
                    number = int(entity_body, 16)
                else:
                    number = int(entity_body, 10)
                # Numeric character references in the 80-9F range are typically
                # interpreted by browsers as representing the characters mapped
                # to bytes 80-9F in the Windows-1252 encoding. For more info
                # see: http://en.wikipedia.org/wiki/Character_encodings_in_HTML
                if 0x80 <= number <= 0x9f:
                    # str has no decode(); build the single byte explicitly.
                    return bytes((number, )).decode('cp1252')
            except ValueError:
                # Either the body is not a number or cp1252 leaves that byte
                # undefined (UnicodeDecodeError is a ValueError).
                number = None
        else:
            if entity_body in keep:
                return m.group(0)
            number = name2codepoint.get(entity_body)
        if number is not None:
            try:
                return chr(number)
            except (ValueError, OverflowError):
                # Outside the Unicode range, or too large for chr() at all.
                pass

        return '' if remove_illegal else m.group(0)
Esempio n. 4
0
    def convert_entity(m):
        """Return the replacement text for one matched HTML entity.

        Group 1 of ``m`` is non-empty for numeric references, group 2 is
        non-empty when the number is hexadecimal, and group 3 holds the
        digits or the entity name.  Names listed in ``keep`` are returned
        untouched.  A reference that cannot be converted is replaced by
        ``''`` when ``remove_illegal`` is true, otherwise the matched text is
        returned unchanged.
        """
        entity_body = m.group(3)
        if m.group(1):
            try:
                number = int(entity_body, 16 if m.group(2) else 10)
                # Numeric character references in the 80-9F range are typically
                # interpreted by browsers as representing the characters mapped
                # to bytes 80-9F in the Windows-1252 encoding. For more info
                # see: http://en.wikipedia.org/wiki/Character_encodings_in_HTML
                if 0x80 <= number <= 0x9f:
                    # A str cannot be decoded; decode the raw byte instead.
                    return bytes((number, )).decode('cp1252')
            except ValueError:
                # Non-numeric body, or a byte that cp1252 does not define.
                number = None
        else:
            if entity_body in keep:
                return m.group(0)
            number = name2codepoint.get(entity_body)
        if number is not None:
            try:
                return chr(number)
            except (ValueError, OverflowError):
                # Not a valid code point (or too large for chr() to accept).
                pass

        return '' if remove_illegal else m.group(0)
Esempio n. 5
0
 def substitute_entity(match):
     """Return the character for a matched entity, or the match unchanged.

     Group 1 is ``"#"`` for numeric references, group 2 is empty for decimal
     numbers (anything else is read as hexadecimal) and group 3 holds the
     digits or the entity name.  Unknown names and numeric references that
     cannot be turned into a character are returned exactly as matched.
     """
     ent = match.group(3)
     if match.group(1) == "#":    # number, decimal or hexadecimal
         try:
             if match.group(2) == '':
                 return unichr(int(ent))
             return unichr(int('0x' + ent, 16))
         except (ValueError, OverflowError):
             # Malformed digits or a code point unichr() cannot represent:
             # keep the reference as written instead of aborting the caller.
             return match.group()
     # name
     cp = name2codepoint.get(ent)
     return unichr(cp) if cp else match.group()
Esempio n. 6
0
 def substitute_entity(match):
     """Return the character for a matched entity, or the match unchanged.

     Group 1 is ``"#"`` for numeric references, group 2 is empty for decimal
     numbers (anything else is read as hexadecimal) and group 3 holds the
     digits or the entity name.  References that cannot be resolved (an
     unknown name, bad digits, a code point ``unichr`` rejects) are returned
     exactly as matched.
     """
     ent = match.group(3)
     if match.group(1) != "#":  # name
         cp = name2codepoint.get(ent)
         return unichr(cp) if cp else match.group()
     # number, decimal or hexadecimal
     try:
         number = int(ent) if match.group(2) == '' else int('0x' + ent, 16)
         return unichr(number)
     except (ValueError, OverflowError):
         # Leave an unusable numeric reference untouched rather than let the
         # exception escape into the caller's substitution.
         return match.group()
Esempio n. 7
0
def decode_entity(match):
    """Resolve the entity body in group 1 and pass the result to ``uchr``.

    ``#x...`` bodies are parsed as hexadecimal, other ``#...`` bodies as
    decimal, and anything else is looked up in ``name2codepoint``; when the
    name is unknown the whole matched text is handed to ``uchr`` instead.
    """
    body = match.group(1)
    if body.startswith('#x'):
        return uchr(int(body[2:], 16))
    if body.startswith('#'):
        return uchr(int(body[1:]))
    from html.entities import name2codepoint
    return uchr(name2codepoint.get(body, match.group(0)))
Esempio n. 8
0
 def subst_entity(match):
     """Return the character for a matched entity, or the match unchanged.

     Group 1 is ``'#'`` for a decimal numeric reference and group 2 holds
     the digits or the entity name (looked up in ``n2cp``).  References that
     cannot be resolved are returned exactly as matched.
     """
     ent = match.group(2)
     if match.group(1) == '#':
         try:
             return unichr(int(ent))
         except (ValueError, OverflowError):
             # Non-numeric body or a code point unichr() rejects: keep the
             # original text instead of raising inside the substitution.
             return match.group()
     cp = n2cp.get(ent)
     if cp:
         return unichr(cp)
     return match.group()
Esempio n. 9
0
def decode_entity(match):
    """Turn the entity body captured in group 1 into text through ``uchr``.

    Named bodies are looked up in ``name2codepoint`` (falling back to the
    full matched text); ``#x`` bodies are hexadecimal and remaining ``#``
    bodies are decimal numbers.
    """
    token = match.group(1)
    if not token.startswith('#'):
        from html.entities import name2codepoint
        resolved = name2codepoint.get(token, match.group(0))
    elif token.startswith('#x'):
        resolved = int(token[2:], 16)
    else:
        resolved = int(token[1:])
    return uchr(resolved)
Esempio n. 10
0
def _substitute_entity(m):
    ent = m.group(2)
    if m.group(1) == "#":
        return chr(int(ent))
    else:
        cp = name2codepoint.get(ent)
        if cp:
            return chr(cp)
        else:
            return m.group()
Esempio n. 11
0
 def inline_entity_repl(self, stack, entity):
     """Append the character denoted by ``entity`` to the top of ``stack``.

     ``entity`` is the full reference text: ``&name;``, ``&#NNN;`` (decimal)
     or ``&#xHHH;`` (hexadecimal).  Unknown names, and numeric references
     that do not yield a valid character, are replaced by U+FFFE.
     """
     if entity[1] == '#':
         try:
             if entity[2] == 'x':
                 c = chr(int(entity[3:-1], 16))
             else:
                 c = chr(int(entity[2:-1], 10))
         except (ValueError, OverflowError):
             # Use the same placeholder as for unknown names so malformed
             # input cannot abort the surrounding parse.
             c = chr(0xfffe)
     else:
         c = chr(name2codepoint.get(entity[1:-1], 0xfffe))
     stack.top_append(c)
Esempio n. 12
0
def _substitute_entity(m):
    ent = m.group(2)
    if m.group(1) == "#":
        return chr(int(ent))
    else:
        cp = name2codepoint.get(ent)
        if cp:
            return chr(cp)
        else:
            return m.group()
def substitute(match):
    """Return the character for a matched entity, or the match unchanged.

    Group 1 is ``"#"`` for a decimal numeric reference and group 2 holds the
    digits or the entity name (looked up in ``n2cp``).  Unresolvable
    references are returned exactly as matched.
    """
    ent = match.group(2)
    if match.group(1) == "#":
        try:
            return chr(int(ent))
        except (ValueError, OverflowError):
            # Malformed or out-of-range number: leave the text alone.
            return match.group()
    cp = n2cp.get(ent)
    if cp:
        return chr(cp)
    return match.group()
Esempio n. 14
0
 def substitute_entity(match):
     """Return the character for a matched entity, or the match unchanged.

     Group 1 is ``"#"`` for a decimal numeric reference and group 2 holds
     the digits or the entity name.  Unknown names and numeric references
     that cannot be converted are returned exactly as matched.
     """
     from html.entities import name2codepoint as n2cp
     ent = match.group(2)
     if match.group(1) == "#":
         try:
             return chr(int(ent))
         except (ValueError, OverflowError):
             # Keep an unusable numeric reference instead of raising.
             return match.group()
     cp = n2cp.get(ent)
     return chr(cp) if cp else match.group()
Esempio n. 15
0
    def substitute_entity(match):
        """Return the character for a matched entity, or the match unchanged.

        Group 1 is ``"#"`` for a decimal numeric reference and group 2 holds
        the digits or the entity name (looked up in ``n2cp``).
        """
        ent = match.group(2)
        if match.group(1) == "#":
            # numeric substitution
            try:
                return chr(int(ent))
            except (ValueError, OverflowError):
                # not a usable code point, return the match untouched
                return match.group()

        # get the codepoint from the name
        cp = n2cp.get(ent)
        if cp:
            # if a codepoint was found, return its string value
            return chr(cp)
        # codepoint wasn't found, return the match untouched
        return match.group()
Esempio n. 16
0
    def substitute_entity(match):
        """Replace one matched entity with its character when possible.

        Group 1 is ``"#"`` for a decimal numeric reference; group 2 holds the
        digits or the entity name (looked up in ``n2cp``).  Anything that
        cannot be resolved is returned exactly as matched.
        """
        ent = match.group(2)
        if match.group(1) != "#":
            # get the codepoint from the name
            cp = n2cp.get(ent)
            # if a codepoint was found, return its string value; otherwise
            # return the match untouched
            return chr(cp) if cp else match.group()

        # numeric substitution
        try:
            return chr(int(ent))
        except (ValueError, OverflowError):
            # malformed digits or out-of-range code point: leave untouched
            return match.group()
Esempio n. 17
0
    def substitute_entity(match):
        """Return the replacement for a matched entity, or the match itself.

        Group 1 is ``"#"`` for a decimal numeric reference and group 2 holds
        the digits or the entity name.  The character is returned as text
        when ``PY3`` is true and as UTF-8 encoded bytes otherwise, so the
        result has the type the running interpreter's ``str`` patterns
        expect.  Unresolvable references are returned exactly as matched.
        """
        if PY3:
            from html.entities import name2codepoint as n2cp
        else:
            from htmlentitydefs import name2codepoint as n2cp
        ent = match.group(2)
        if match.group(1) == "#":
            try:
                cp = int(ent)
            except ValueError:
                return match.group()
        else:
            cp = n2cp.get(ent)
            if not cp:
                return match.group()
        try:
            char = unichr(cp)
        except (ValueError, OverflowError):
            # Code point outside what unichr() accepts.
            return match.group()
        # Returning bytes under Python 3 makes re.sub() fail on str input.
        return char if PY3 else char.encode('utf-8')
def substitute(match):
    """Return the character for a matched entity, or the match unchanged.

    Group 1 is ``"#"`` for a decimal numeric reference and group 2 holds the
    digits or the entity name (looked up in ``n2cp``).  ``unichr`` is used
    when the interpreter provides it and ``chr`` otherwise.  References that
    cannot be resolved are returned exactly as matched.
    """
    ent = match.group(2)
    if match.group(1) == "#":
        try:
            cp = int(ent)
        except ValueError:
            return match.group()
    else:
        cp = n2cp.get(ent)
        if not cp:
            return match.group()
    # Only the missing-builtin case should trigger the fallback; a bare
    # except would also hide unrelated errors and interrupts.
    try:
        to_char = unichr
    except NameError:
        to_char = chr
    try:
        return to_char(cp)
    except (ValueError, OverflowError):
        return match.group()
Esempio n. 19
0
 def substitute_entity(match):
     """Return the character for a matched entity, or the match unchanged.

     Group 1 is ``"#"`` for numeric references, group 2 is ``''`` for decimal
     or ``'x'``/``'X'`` for hexadecimal, and group 3 holds the digits or the
     entity name (looked up in ``n2cp``).  Every reference that cannot be
     resolved is returned exactly as matched, so the function always
     returns a string.
     """
     ent = match.group(3)
     if match.group(1) == "#":
         # decoding by number
         marker = match.group(2)
         try:
             if marker == '':
                 # number is in decimal
                 return unichr(int(ent))
             if marker in ('x', 'X'):
                 # number is in hex
                 return unichr(int('0x' + ent, 16))
         except (ValueError, OverflowError):
             pass
         # Unknown marker or unusable number: falling off the end would
         # return None, which re.sub() treats as an empty replacement.
         return match.group()
     # they were using a name
     cp = n2cp.get(ent)
     if cp:
         return unichr(cp)
     return match.group()
Esempio n. 20
0
 def substitute_entity(match):
     """Replace one matched entity with its character when possible.

     Group 1 is ``"#"`` for numeric references, group 2 is ``''`` for decimal
     or ``'x'``/``'X'`` for hexadecimal, and group 3 holds the digits or the
     entity name (looked up in ``n2cp``).  Anything that cannot be resolved
     is returned exactly as matched; the result is always a string.
     """
     ent = match.group(3)
     if match.group(1) != "#":
         # they were using a name
         cp = n2cp.get(ent)
         return unichr(cp) if cp else match.group()

     # decoding by number
     marker = match.group(2)
     if marker == '':
         # number is in decimal
         text = ent
         base = 10
     elif marker in ('x', 'X'):
         # number is in hex
         text = '0x' + ent
         base = 16
     else:
         # Unrecognised marker: keep the text instead of returning None,
         # which re.sub() would turn into an empty replacement.
         return match.group()
     try:
         return unichr(int(text, base))
     except (ValueError, OverflowError):
         return match.group()
Esempio n. 21
0
def substitute_entity(match):
    """Return the character for a matched entity.

    Group 1 is ``"#"`` for numeric references, group 2 is ``''`` for decimal
    or ``'x'``/``'X'`` for hexadecimal, and group 3 holds the digits or the
    entity name (looked up in ``n2cp``).  Unknown names are returned as
    matched; any error while decoding yields ``""`` so the reference is
    dropped.
    """
    try:
        ent = match.group(3)

        if match.group(1) == "#":
            marker = match.group(2)
            if marker == '':
                return chr(int(ent))
            if marker in ('x', 'X'):
                return chr(int('0x' + ent, 16))
            # Unrecognised marker: drop the reference explicitly rather
            # than returning None by falling off the end.
            return ""

        cp = n2cp.get(ent)
        return chr(cp) if cp else match.group()
    except Exception:
        # Best-effort by design, but do not swallow KeyboardInterrupt or
        # SystemExit as a bare except would.
        return ""
Esempio n. 22
0
def substitute_entity(match):
    """Replace one matched entity with its character.

    Group 1 is ``"#"`` for numeric references, group 2 is ``''`` for decimal
    or ``'x'``/``'X'`` for hexadecimal, and group 3 holds the digits or the
    entity name (looked up in ``n2cp``).  Unknown names are returned as
    matched; any error while decoding yields ``""``.
    """
    try:
        ent = match.group(3)

        if match.group(1) != "#":
            cp = n2cp.get(ent)
            if cp:
                return chr(cp)
            return match.group()

        marker = match.group(2)
        if marker == '':
            return chr(int(ent))
        if marker in ('x', 'X'):
            return chr(int('0x' + ent, 16))
        # Any other marker used to fall through and return None; return the
        # block's usual "drop it" value so the result is always a string.
        return ""
    except Exception:
        # Deliberately best-effort, but narrower than a bare except so that
        # KeyboardInterrupt and SystemExit still propagate.
        return ""
Esempio n. 23
0
    def substitute_entity(match):
        """Return the replacement for a matched entity, or the match itself.

        Group 1 is ``"#"`` for a decimal numeric reference and group 2 holds
        the digits or the entity name.  The character is returned as text
        when ``PY3`` is true and as UTF-8 encoded bytes otherwise.
        References that cannot be resolved are returned exactly as matched.
        """
        if PY3:
            from html.entities import name2codepoint as n2cp
        else:
            from htmlentitydefs import name2codepoint as n2cp
        ent = match.group(2)
        if match.group(1) == "#":
            try:
                cp = int(ent)
            except ValueError:
                return match.group()
        else:
            cp = n2cp.get(ent)
            if not cp:
                return match.group()
        try:
            char = unichr(cp)
        except (ValueError, OverflowError):
            # Code point outside what unichr() accepts.
            return match.group()
        # Skip the encode/decode round-trip on Python 3: it adds nothing
        # and fails on lone surrogates.
        return char if PY3 else char.encode('utf-8')
Esempio n. 24
0
 def substitute_entity(match):
     """Decode one matched entity through ``safe_unichr``.

     Group 1 is ``"#"`` for numeric references, group 2 is ``''`` for decimal
     or ``'x'``/``'X'`` for hexadecimal, and group 3 holds the digits or the
     entity name (looked up in ``n2cp``).  Unknown names, and any exception
     raised while decoding, give back the original matched text.
     """
     try:
         ent = match.group(3)
         if match.group(1) != "#":
             # they were using a name
             codepoint = n2cp.get(ent)
             return safe_unichr(codepoint) if codepoint else match.group()
         # decoding by number
         radix_marker = match.group(2)
         if radix_marker == '':
             # number is in decimal
             return safe_unichr(int(ent))
         if radix_marker in ['x', 'X']:
             # number is in hex
             return safe_unichr(int(ent, 16))
     except Exception:
         # in case of errors, return original input
         return match.group()
Esempio n. 25
0
 def substitute_entity(match):
     """Decode one matched entity through ``safe_unichr``.

     Group 1 is ``"#"`` for numeric references, group 2 is ``''`` for decimal
     or ``'x'``/``'X'`` for hexadecimal, and group 3 holds the digits or the
     entity name (looked up in ``n2cp``).  Unknown names, and any error
     raised while decoding, give back the original matched text.
     """
     try:
         ent = match.group(3)
         if match.group(1) == "#":
             # decoding by number
             if match.group(2) == '':
                 # number is in decimal
                 return safe_unichr(int(ent))
             elif match.group(2) in ['x', 'X']:
                 # number is in hex
                 return safe_unichr(int(ent, 16))
         else:
             # they were using a name
             cp = n2cp.get(ent)
             if cp:
                 return safe_unichr(cp)
             else:
                 return match.group()
     except Exception:
         # in case of errors, return original input; a bare except would
         # also trap KeyboardInterrupt and SystemExit
         return match.group()
Esempio n. 26
0
File: utils.py Progetto: ianatha/GAM
 def handle_entityref(self, name):
     """Append the text for the named entity ``name`` to ``self.__text``.

     Names found in ``name2codepoint`` are appended as the corresponding
     character; any other name is appended as ``'&'`` followed by the name.
     """
     codepoint = name2codepoint.get(name)
     piece = chr(codepoint) if codepoint else '&' + name
     self.__text.append(piece)
Esempio n. 27
0
def decode_xml_replacer(match):
    """Return the character for an XML entity or character reference.

    Group 1 of ``match`` is the text between ``&`` and ``;``: ``#NNN`` is a
    decimal code point, ``#xHHH`` a hexadecimal one, and anything else an
    entity name looked up in ``name2codepoint``.  Unknown names and numbers
    that do not yield a character are replaced by ``'?'``.
    """
    name = match.group(1)
    try:
        if name.startswith(("#x", "#X")):
            return chr(int(name[2:], 16))
        if name.startswith("#"):
            # Without the "x" marker the digits are decimal, not hex.
            return chr(int(name[1:], 10))
    except (ValueError, OverflowError):
        return '?'
    codepoint = name2codepoint.get(name)
    # chr() needs an int, so the '?' fallback is applied after the lookup
    # rather than passed through chr().
    return chr(codepoint) if codepoint is not None else '?'
Esempio n. 28
0
 def handle_entityref(self, name):
     """Handle named entities of the form &aaaa; e.g. &rsquo;

     ``gt``, ``lt`` and ``amp`` are passed to ``handle_data`` still escaped
     (``&name;``).  Other names found in ``name2codepoint`` are passed as
     the decoded character, and unknown names are passed through as
     ``&name;`` text.
     """
     if name in ['gt', 'lt', 'amp'] or name not in name2codepoint:
         # Unknown names must bypass chr(), which only accepts integers.
         self.handle_data("&%s;" % name)
     else:
         self.handle_data(chr(name2codepoint[name]))
Esempio n. 29
0
File: utils.py Progetto: jay0lee/GAM
 def handle_entityref(self, name):
   """Record the text for the named entity ``name`` in ``self.__text``.

   A name present in ``name2codepoint`` is stored as its character; any
   other name is stored as ``'&'`` followed by the name.
   """
   code = name2codepoint.get(name)
   if not code:
     self.__text.append('&' + name)
     return
   self.__text.append(chr(code))
Esempio n. 30
0
 def handle_entityref(self, name):
     """Handle named entities of the form &aaaa; e.g. &rsquo;

     ``gt``, ``lt`` and ``amp`` are forwarded to ``handle_data`` still
     escaped.  Other known names are forwarded as the decoded character and
     unknown names as the literal ``&name;`` text.
     """
     codepoint = None
     if name not in ['gt', 'lt', 'amp']:
         codepoint = name2codepoint.get(name)
     if codepoint is None:
         # Kept escaped on purpose, or unknown: chr() cannot take a string.
         self.handle_data(u"&%s;" % name)
     else:
         self.handle_data(chr(codepoint))
Esempio n. 31
0
 def char_from_entity(match):
     """Return the character for the entity name captured in group 1.

     Names missing from ``name2codepoint`` produce U+FFFD.
     """
     try:
         code = name2codepoint[match.group(1)]
     except KeyError:
         code = 0xFFFD
     return chr(code)
Esempio n. 32
0
def convert_entities(s):
    """Decode decimal and named HTML entities in ``s``.

    ``&#NNN;`` references become the character with that code point and
    ``&name;`` references are looked up in ``n2cp``.  References that cannot
    be resolved are left exactly as written.  Decoding happens in a single
    pass, so the output of one replacement is never decoded again.
    """
    def _replace(m):
        digits, name = m.group(1), m.group(2)
        if digits is not None:
            try:
                return chr(int(digits))
            except (ValueError, OverflowError):
                return m.group(0)
        cp = n2cp.get(name)
        # The table holds integers; re.sub() needs a string back.
        return chr(cp) if cp is not None else m.group(0)

    # (\w+) captures the whole name; (\w)+ would keep only its last letter.
    return re.sub(r'&(?:#(\d+)|(\w+));', _replace, s)
Esempio n. 33
0
 def handle_entityref(self, name):
     """Forward a known named entity to ``handle_charref`` as a code point.

     ``name`` is looked up in ``name2codepoint``; when it is present the
     resulting integer is passed to ``self.handle_charref``.
     """
     num = name2codepoint.get(name)
     # Only names present in the table are forwarded.
     if num is not None:
         self.handle_charref(num)