Example #1
0
File: plugin.py Project: eif0/d0b
		def substitute_entity(match):
			"""Return the replacement text for one matched HTML entity.

			``match`` must expose three groups: ``"#"`` for a numeric
			reference (empty otherwise), a radix marker (empty for decimal,
			anything else is treated as hexadecimal) and the entity body.
			Unknown names and malformed or out-of-range numbers are returned
			unchanged instead of raising from inside the substitution.
			"""
			ent = match.group(3)
			if match.group(1) != "#":	# name
				cp = name2codepoint.get(ent)
				return unichr(cp) if cp else match.group()
			# number, decimal or hexadecimal
			try:
				if match.group(2) == '':
					return unichr(int(ent))
				return unichr(int('0x'+ent,16))
			except (ValueError, OverflowError):
				# e.g. "&#abc;" or a code point unichr() cannot represent
				return match.group()
Example #2
0
def _resolve_entity(mo):
    """
    Turn one matched HTML entity into the character it stands for.

    @param mo: matched _entity_re object with a "entity" match group
    @type mo: MatchObject instance
    @return: resolved entity char, or empty string when the entity is
        unknown, malformed or not representable
    @rtype: unicode string
    """
    body = mo.group("entity")
    whole = mo.group()
    if not whole.startswith('&#'):
        # named entity: look the code point up in the standard table
        codepoint = name2codepoint.get(body)
    else:
        # numeric reference: an x/X right after "&#" selects hexadecimal
        base = 16 if whole[2] in 'xX' else 10
        try:
            codepoint = int(body, base)
        except (ValueError, OverflowError):
            return u''
    if codepoint is not None and not codepoint < 0:
        try:
            return unichr(codepoint)
        except ValueError:
            pass
    # unknown entity or unusable code point -> ignore
    return u''
Example #3
0
    def convert_entity(m):
        """Return the replacement text for one matched entity.

        Group 1 of ``m`` is truthy for numeric references, group 2 is truthy
        for hexadecimal ones and group 3 is the entity body. ``keep`` and
        ``remove_illegal`` are read from the surrounding scope: names listed
        in ``keep`` are returned untouched, and entities that cannot be
        converted are dropped when ``remove_illegal`` is true, otherwise
        left as they were.
        """
        body = m.group(3)
        codepoint = None
        if not m.group(1):
            # named entity
            if body in keep:
                return m.group(0)
            codepoint = name2codepoint.get(body)
        else:
            base = 16 if m.group(2) else 10
            try:
                codepoint = int(body, base)
                # Numeric character references in the 80-9F range are
                # typically interpreted by browsers as the characters mapped
                # to bytes 80-9F in the Windows-1252 encoding. For more info
                # see: http://en.wikipedia.org/wiki/Character_encodings_in_HTML
                if 0x80 <= codepoint <= 0x9f:
                    return chr(codepoint).decode('cp1252')
            except ValueError:
                codepoint = None

        if codepoint is not None:
            try:
                return unichr(codepoint)
            except ValueError:
                pass

        if remove_illegal:
            return u''
        return m.group(0)
Example #4
0
 def handler(mo):
     """ Callback rewriting a named entity as a decimal character reference.

     Group 1 of ``mo`` is the complete entity including ``&`` and ``;``.
     Numeric references are returned as they are; names missing from
     ``name2codepoint`` yield an empty string.
     """
     entity = mo.group(1)
     name = entity[1:-1]
     if name.startswith('#'):
         return entity
     number = name2codepoint.get(name)
     return '&#%d;' % number if number else ''
Example #5
0
def escape2char(s):
  """ Convert one escape sequence to its character, e.g. "\\u0041" -> 'A'.

  The character at index 1 selects the form: x/u/U hexadecimal digits,
  a digit for octal, '&' for a named HTML entity; anything else is looked
  up in ``escape_table``. Unknown names map to U+FFFD.
  """
  from htmlentitydefs import name2codepoint
  kind = s[1]
  if kind in 'xuU':
    return unichr(int(s[2:], 16))
  if kind.isdigit():
    return unichr(int(s[1:], 8))
  if kind == '&':
    return unichr(name2codepoint.get(s[2:-1], 0xFFFD))
  return unichr(escape_table.get(kind, 0xFFFD))
Example #6
0
 def inline_entity_repl(self, stack, entity):
     """Append the character denoted by ``entity`` to ``stack``.

     ``entity`` is the full entity text including the leading ``&`` and the
     trailing ``;``. Decimal (``&#65;``) and hexadecimal references with
     either ``x`` or ``X`` are decoded numerically; other entities are
     looked up by name. Unknown names keep the existing U+FFFE placeholder,
     and malformed or out-of-range numbers now use the same placeholder
     instead of raising.
     """
     placeholder = 0xfffe
     if entity[1] == '#':
         try:
             if entity[2] in ('x', 'X'):
                 c = unichr(int(entity[3:-1], 16))
             else:
                 c = unichr(int(entity[2:-1], 10))
         except (ValueError, OverflowError):
             c = unichr(placeholder)
     else:
         c = unichr(name2codepoint.get(entity[1:-1], placeholder))
     stack.top_append(c)
Example #7
0
def decode_entity(match):
    """Decode one matched entity and pass the result through ``uchr``.

    Group 1 of ``match`` is the entity body without ``&`` and ``;``:
    ``#x...`` is parsed as hexadecimal, ``#...`` as decimal, anything else
    is looked up by name. For an unknown name the whole matched text is
    what gets handed to ``uchr``.
    """
    body = match.group(1)
    if body.startswith('#x'):
        return uchr(int(body[2:], 16))
    if body.startswith('#'):
        return uchr(int(body[1:]))
    from htmlentitydefs import name2codepoint
    return uchr(name2codepoint.get(body, match.group(0)))
def _substitute_entity(m):
    """Return the replacement text for one matched HTML entity.

    Group 1 of ``m`` is ``"#"`` for a numeric reference and group 2 is the
    entity body. Known entities become the corresponding unicode character;
    unknown names and malformed or out-of-range numbers are returned
    unchanged so a single bad entity cannot abort the whole substitution.
    """
    ent = m.group(2)
    if m.group(1) == "#":
        try:
            return unichr(int(ent))
        except (ValueError, OverflowError):
            # e.g. "&#abc;" or a code point unichr() cannot represent
            return m.group()
    cp = name2codepoint.get(ent)
    if cp:
        return unichr(cp)
    return m.group()
Example #9
0
 def substitute_entity(match):
     """Decode one matched HTML entity into a unicode character.

     Group 1 of ``match`` is ``"#"`` for numeric references, group 2 the
     entity body. Text that cannot be decoded (unknown name, non-numeric
     or out-of-range number) is handed back unchanged rather than raising.
     """
     ent = match.group(2)
     if match.group(1) != "#":
         cp = n2cp.get(ent)
         return unichr(cp) if cp else match.group()
     try:
         return unichr(int(ent))
     except (ValueError, OverflowError):
         # malformed reference such as "&#abc;": keep the original text
         return match.group()
Example #10
0
 def substitute_entity(match):
     """Decode one matched HTML entity into a unicode character.

     Groups of ``match``: 1 is ``"#"`` for numeric references, 2 is the
     radix marker (empty for decimal, ``x``/``X`` for hexadecimal), 3 is
     the entity body. Anything that cannot be decoded -- unknown name,
     unrecognised radix marker, malformed or out-of-range number -- is
     returned unchanged instead of raising or returning ``None``.
     """
     ent = match.group(3)
     if match.group(1) == "#":
         radix = match.group(2)
         try:
             if radix == '':
                 return unichr(int(ent))
             if radix in ('x', 'X'):
                 return unichr(int('0x'+ent, 16))
         except (ValueError, OverflowError):
             pass  # fall through and keep the original text
         return match.group()
     cp = n2cp.get(ent)
     if cp:
         return unichr(cp)
     return match.group()
Example #11
0
 def substitute_entity(match):
     """Decode one matched HTML entity into a unicode character.

     Group 1 of ``match`` is ``"#"`` for numeric references, group 2 the
     entity body. Unknown names and malformed or out-of-range numbers are
     returned unchanged instead of raising from inside the substitution.
     """
     from htmlentitydefs import name2codepoint as n2cp
     ent = match.group(2)
     if match.group(1) == "#":
         try:
             return unichr(int(ent))
         except (ValueError, OverflowError):
             # e.g. "&#abc;" or a code point unichr() cannot represent
             return match.group()
     cp = n2cp.get(ent)
     if cp:
         return unichr(cp)
     return match.group()
Example #12
0
	def __entity(match):
		"""Decode one matched HTML entity into a unicode character.

		Group 1 of ``match`` is ``"#"`` for numeric references, group 2 the
		entity body. Undecodable input (unknown name, non-numeric or
		out-of-range number) is returned unchanged rather than raising.
		"""
		ent = match.group(2)
		if match.group(1) == "#":
			try:
				return unichr(int(ent))
			except (ValueError, OverflowError):
				# malformed numeric reference: leave the text as it was
				return match.group()

		cp = name2codepoint.get(ent)
		if cp:
			return unichr(cp)
		return match.group()
Example #13
0
    def inline_entity_repl(self, stack, entity):
        """Append the character denoted by ``entity`` to ``stack``.

        ``entity`` is the full entity text including ``&`` and ``;``.
        ``&#x..;`` is read as hexadecimal, ``&#..;`` as decimal, anything
        else is looked up by name with U+FFFE standing in for unknown names.
        """
        if entity[1] != "#":
            from htmlentitydefs import name2codepoint

            char = unichr(name2codepoint.get(entity[1:-1], 0xFFFE))
        elif entity[2] == "x":
            char = unichr(int(entity[3:-1], 16))
        else:
            char = unichr(int(entity[2:-1], 10))
        stack.top_append(char)
Example #14
0
def subs_entity(match):
    """Decode one matched HTML entity into a unicode character.

    Groups of ``match``: 1 is ``"#"`` for numeric references, 2 is the
    radix marker (empty for decimal, ``x`` for hexadecimal), 3 is the
    entity body. Anything that cannot be decoded, including malformed or
    out-of-range numbers, is returned unchanged.
    """
    entity = match.group(3)
    if match.group(1) == "#":
        radix_mark = match.group(2)
        try:
            if radix_mark == '':
                return unichr(int(entity))
            elif radix_mark == 'x':
                return unichr(int('0x' + entity, 16))
        except (ValueError, OverflowError):
            # e.g. "&#abc;": fall through and keep the original text
            pass
    else:
        codepoint = name2codepoint.get(entity)
        if codepoint is not None:
            return unichr(codepoint)
    return match.group()
Example #15
0
def substitute_html_entity(match):
	"""Rewrite one matched entity as a numeric character reference.

	Group 1 of ``match`` is ``"#"`` for numeric references, group 2 the
	entity body. Numeric references and the names br, nbsp, gt, lt and
	quot are rebuilt as they were; other known names become ``&#NNN;``;
	unknown names are returned exactly as matched.
	"""
	body = match.group(2)
	if match.group(1) == "#":
		return "&#" + body + ";"
	if body in ['br', 'nbsp', 'gt', 'lt', 'quot']:
		return "&" + body + ";"

	codepoint = n2cp.get(body)
	if not codepoint:
		return match.group()
	return "&#" + str(codepoint) + ";"
Example #16
0
def substitute_entity(match):
    """Decode one matched HTML entity into a unicode character.

    Group 1 of ``match`` is ``"#"`` for numeric references, group 2 the
    entity body. A ``UnicodeDecodeError`` still yields the bare entity
    body, as before. Other conversion failures (non-numeric or
    out-of-range references) previously escaped the handler and now
    return the matched text unchanged.
    """
    ent = match.group(2)
    try:
        if match.group(1) == "#":
            return unichr(int(ent))
        cp = n2cp.get(ent)
        if cp:
            return unichr(cp)
        return match.group()
    except UnicodeDecodeError:
        # must precede ValueError: UnicodeDecodeError is a subclass of it
        return ent
    except (ValueError, OverflowError):
        return match.group()
Example #17
0
def _substitute_entity(m):
	"""Decode one matched HTML entity into a unicode character.

	Group 1 of ``m`` is ``"#"`` for numeric references, group 2 the entity
	body. A numeric body starting with ``x`` or ``X`` is hexadecimal,
	otherwise decimal. Unknown names and malformed or out-of-range numbers
	are returned unchanged instead of raising.
	"""
	ent = m.group(2)
	if m.group(1) == "#":
		try:
			# Hex value
			if ent.startswith(('x', 'X')):
				return unichr(int(ent[1:], 16))
			return unichr(int(ent))
		except (ValueError, OverflowError):
			# e.g. "&#xzz;" or "&#abc;": keep the original text
			return m.group()
	cp = name2codepoint.get(ent)
	if cp:
		return unichr(cp)
	return m.group()
Example #18
0
 def _substitute_entity(cls, match):
     """ Adapted from http://snippets.dzone.com/posts/show/4569

     Decode one matched HTML entity. Groups of ``match``: 1 is ``"#"`` for
     numeric references, 2 the radix marker (empty for decimal, ``x``/``X``
     for hexadecimal), 3 the entity body. Input that cannot be decoded is
     returned unchanged rather than raising or yielding ``None``.
     """
     ent = match.group(3)
     if match.group(1) != '#':
         cp = name2codepoint.get(ent)
         return unichr(cp) if cp else match.group()
     marker = match.group(2)
     try:
         if marker == '':
             return unichr(int(ent))
         if marker in ('x', 'X'):
             return unichr(int('0x' + ent, 16))
     except (ValueError, OverflowError):
         pass  # malformed or out-of-range reference
     return match.group()
def substitute_entity(match):
    """
    used by decode_htmlentities.

    Decode one matched HTML entity: group 1 of ``match`` is ``"#"`` for
    numeric references, group 2 the entity body. Unknown names and
    malformed or out-of-range numbers are returned unchanged instead of
    raising.
    """
    ent = match.group(2)
    if match.group(1) == "#":
        try:
            return unichr(int(ent))
        except (ValueError, OverflowError):
            # e.g. "&#abc;" or a code point unichr() cannot represent
            return match.group()

    cp = n2cp.get(ent)
    if cp:
        return unichr(cp)
    return match.group()
Example #20
0
 def substitute_entity(match):
     """Decode one matched HTML entity into a unicode character.

     Groups of ``match``: 1 is ``"#"`` for numeric references, 2 the radix
     marker (empty for decimal, ``x``/``X`` for hexadecimal), 3 the entity
     body. Input that cannot be decoded is returned unchanged rather than
     raising or yielding ``None``.
     """
     ent = match.group(3)
     if match.group(1) == "#":
         # decoding by number
         marker = match.group(2)
         if marker == '':
             digits, base = ent, 10      # number is in decimal
         elif marker in ('x', 'X'):
             digits, base = '0x'+ent, 16  # number is in hex
         else:
             return match.group()
         try:
             return unichr(int(digits, base))
         except (ValueError, OverflowError):
             # not a number after all, or outside unichr()'s range
             return match.group()
     # they were using a name
     cp = n2cp.get(ent)
     if cp:
         return unichr(cp)
     return match.group()
 def repl_func(m):
     """Decode one matched HTML entity into a unicode character.

     The ``entity`` group of ``m`` holds either a decimal number or an
     entity name. Unknown names and code points ``unichr()`` rejects leave
     the matched text as-is. Unknown names used to reach ``unichr(None)``
     and raise ``TypeError``.
     """
     entity = m.group('entity') # get the entity name or number
     try:
         # if integer
         codepoint = int(entity)
     except ValueError:
         # not an integer - treat it as a name; the lookup yields None
         # for names that are not in name2codepoint
         codepoint = name2codepoint.get(entity)
     if codepoint is None:
         return m.group()
     try:
         return unichr(codepoint)
     except (ValueError, OverflowError):
         # code point cannot be represented: just leave as-is
         return m.group()
Example #22
0
    def entity_to_char(self, text):
        """Replace named HTML entities in ``text`` with UTF-8 encoded characters.

        Candidate names come from ``self.cod_html.findall(text)``; every name
        found in ``name2codepoint`` has each of its ``&name;`` occurrences
        replaced. Failures during replacement are logged with
        ``logging.error`` and processing continues with the next name.

        NOTE(review): no ``return`` is visible in this block, so as shown the
        updated ``text`` is never handed back to the caller -- confirm whether
        the definition continues beyond this excerpt.
        """
        # Prefix identifying this routine in log messages.
        name_func = '(Text.swap_cod_html_to_char) '

        # presumably cod_html is a compiled regex capturing entity names
        # -- TODO confirm against the class definition
        cods = self.cod_html.findall(text)
        # De-duplicate so each distinct name is replaced only once.
        cods = set(cods)
        for cod in cods:
            cod_unicode = name2codepoint.get(cod)
            # Names missing from name2codepoint are left untouched.
            if cod_unicode:
                try:
                    text = text.replace('&%s;'%(cod),
                                        unichr(cod_unicode).encode('utf-8'))
                except Exception, msg:
                    logging.error('%sErro ao trocar os chars acentuados em \
HTML(1): %s' % (name_func, msg))
Example #23
0
def _substitute_entity(match):
    """Decode one matched HTML entity into a unicode character.

    Groups of ``match``: 1 is ``'#'`` for numeric references, 2 the radix
    marker (empty for decimal, ``x``/``X`` for hexadecimal), 3 the entity
    body. Input that cannot be decoded is returned unchanged rather than
    raising or yielding ``None``.
    """
    ent = match.group(3)
    if match.group(1) != '#':
        # they were using a name
        cp = n2cp.get(ent)
        if cp:
            return unichr(cp)
        return match.group()
    # decoding by number
    try:
        if match.group(2) == '':
            # number is in decimal
            return unichr(int(ent))
        if match.group(2) in ('x', 'X'):
            # number is in hex
            return unichr(int('0x' + ent, 16))
    except (ValueError, OverflowError):
        # e.g. "&#abc;" or a code point unichr() cannot represent
        pass
    return match.group()
Example #24
0
 def substitute_entity(match):
     """Decode one matched HTML entity into a unicode character.

     Groups of ``match``: 1 is ``"#"`` for numeric references, 2 the radix
     marker (empty for decimal, ``x``/``X`` for hexadecimal), 3 the entity
     body. Input that cannot be decoded is returned unchanged rather than
     raising or yielding ``None``.
     """
     from htmlentitydefs import name2codepoint as n2cp
     ent = match.group(3)
     if match.group(1) == "#":
         # decoding by number
         marker = match.group(2)
         try:
             if marker == '':
                 # number is in decimal
                 return unichr(int(ent))
             if marker in ('x', 'X'):
                 # number is in hex
                 return unichr(int('0x' + ent, 16))
         except (ValueError, OverflowError):
             pass  # malformed or out-of-range reference
         return match.group()
     # they were using a name
     cp = n2cp.get(ent)
     if cp:
         return unichr(cp)
     return match.group()
Example #25
0
def substitute_entity(match):
  """Decode one matched HTML entity, returning "" when decoding fails.

  Groups of ``match``: 1 is ``"#"`` for numeric references, 2 the radix
  marker (empty for decimal, ``x``/``X`` for hexadecimal), 3 the entity
  body. Unknown names and unrecognised radix markers are returned
  unchanged. Any ordinary error still yields an empty string, but
  ``KeyboardInterrupt``/``SystemExit`` are no longer swallowed.
  """
  try:
    ent = match.group(3)

    if match.group(1) == "#":
        marker = match.group(2)
        if marker == '':
            return unichr(int(ent))
        elif marker in ('x', 'X'):
            return unichr(int('0x'+ent, 16))
    else:
        cp = n2cp.get(ent)
        if cp:
            return unichr(cp)
    # unknown name or unrecognised radix marker: leave the text untouched
    return match.group()
  except Exception:
      return ""
      def substitute_entity(match):
        """Decode one matched HTML entity into a unicode character.

        Groups of ``match``: 1 is ``"#"`` for numeric references, 2 the
        radix marker (empty for decimal, ``x`` for hexadecimal), 3 the
        entity body. Unknown names and malformed or out-of-range numbers
        are returned unchanged instead of raising.
        """
        ent = match.group(3)
        if match.group(1) == "#":
          # decoding by number
          try:
            if match.group(2) == '':
              # number is in decimal
              return unichr(int(ent))
            elif match.group(2) == 'x':
              # number is in hex
              return unichr(int('0x'+ent, 16))
          except (ValueError, OverflowError):
            # e.g. "&#abc;" or a code point unichr() cannot represent
            return match.group()
        else:
          # they were using a name
          cp = n2cp.get(ent)
          if cp: return unichr(cp)
          else: return match.group()

        entity_re = re.compile(r'&(#?)(x?)(\w+);')
        return entity_re.subn(substitute_entity, string)[0]
Example #27
0
 def handle_entityref(self, name):
     """Print a known named entity as a decimal character reference.

     Nothing is printed for names missing from ``name2codepoint``; no
     trailing newline is emitted.
     """
     # XXX: doesn't get called if convert_charrefs=True
     codepoint = name2codepoint.get(name)  # we are sure we're on PY2 here
     if codepoint is None:
         return
     reference = '&#%(ref)d;' % {'ref': codepoint}
     print(reference, end='')
Example #28
0
def convert_entities(s):
    """Replace decimal and named HTML entities in ``s`` with characters.

    Decimal references (``&#65;``) are decoded first, then named entities
    (``&amp;``). Unknown names and numbers ``unichr()`` rejects are left
    untouched. The previous pattern ``&(\\w)+;`` captured only the last
    letter and looked up a tuple, so every named entity was corrupted
    (``&amp;`` became ``&p;``).
    """
    def _numeric(m):
        try:
            return unichr(int(m.group(1)))
        except (ValueError, OverflowError):
            return m.group(0)

    def _named(m):
        cp = n2cp.get(m.group(1))
        return unichr(cp) if cp is not None else m.group(0)

    s = re.sub(r'&#(\d+);', _numeric, s)
    return re.sub(r'&(\w+);', _named, s)
Example #29
0
 def handle_entityref(self, name):
   """Append the character for entity ``name`` to the collected text.

   Names without a code point in ``name2codepoint`` are appended as
   ``&`` followed by the name.
   """
   cp = name2codepoint.get(name)
   if not cp:
     self.__text.append(u'&'+name)
     return
   self.__text.append(unichr(cp))
Example #30
0
def convert_entities(s):
    """Decode decimal (``&#65;``) and named (``&amp;``) entities in ``s``.

    Numeric references are handled in a first pass and names in a second.
    Entities that cannot be decoded are kept verbatim. This fixes the old
    named-entity pass: its ``(\\w)+`` group held only the final letter and
    the table was queried with a tuple, so names were never resolved and
    the text was mangled.
    """
    def decode_number(m):
        try:
            return unichr(int(m.group(1)))
        except (ValueError, OverflowError):
            # number outside the range unichr() accepts
            return m.group(0)

    def decode_name(m):
        codepoint = n2cp.get(m.group(1))
        if codepoint is None:
            return m.group(0)
        return unichr(codepoint)

    without_numbers = re.sub(r'&#(\d+);', decode_number, s)
    return re.sub(r'&(\w+);', decode_name, without_numbers)
Example #31
0
 def handle_entityref(self, name):
     """Handle named entities of the form &aaaa; e.g. &rsquo;

     ``gt``, ``lt`` and ``amp`` are forwarded to ``handle_data`` still
     escaped. Other known names are forwarded as the unicode character.
     Unknown names are forwarded as ``&name;``; they used to reach
     ``unichr()`` as a string and raise ``TypeError``.
     """
     if name in ['gt', 'lt', 'amp']:
         self.handle_data("&%s;" % name)
         return
     codepoint = name2codepoint.get(name)
     if codepoint is None:
         self.handle_data(u"&%s;" % name)
     else:
         self.handle_data(unichr(codepoint))
Example #32
0
 def handle_entityref(self, name):
     """Write the decimal character reference for entity ``name`` to stdout.

     Unknown names produce no output; no newline is appended.
     """
     # XXX: doesn't get called if convert_charrefs=True
     ref = name2codepoint.get(name)  # we are sure we're on PY2 here
     if ref is not None:
         values = {'ref': ref}
         print('&#%(ref)d;' % values, end='')
Example #33
0
 def handle_entityref(self, name):
     """Handle named entities of the form &aaaa; e.g. &rsquo;

     ``gt``, ``lt`` and ``amp`` stay escaped. Every other known name is
     passed to ``handle_data`` as its unicode character. A name missing
     from ``name2codepoint`` is passed on as ``&name;``; previously that
     string was handed to ``unichr()``, which raised ``TypeError``.
     """
     codepoint = name2codepoint.get(name)
     if name in ['gt', 'lt', 'amp']:
         text = "&%s;" % name
     elif codepoint is None:
         text = u"&%s;" % name
     else:
         text = unichr(codepoint)
     self.handle_data(text)
Example #34
0
def html_unescape(mystring):
    """Replace every ``HTML_RE`` match in ``mystring`` with a character.

    Group 1 of each match is looked up in ``name2codepoint``; names that
    are not found become code point 63 (``?``).
    """
    def _to_char(found):
        return unichr(name2codepoint.get(found.group(1), 63))

    return HTML_RE.sub(_to_char, mystring)