Example #1
0
    def rawHTML(self, markup):
        """Handle a chunk of raw HTML markup.

        Blank markup is ignored.  Markup that contains neither ``<`` nor
        ``>`` is treated as plain text: named entity references
        (``&copy;``) and decimal character references (``&#169;``) are
        decoded and the result is passed to ``self.text``.  For every
        non-blank markup the original string is also passed to
        ``self._emitComment`` prefixed with ``"RAW HTML: "``.

        Always returns the empty string.
        """
        if markup.strip() == "":
            return ""

        if "<" not in markup and ">" not in markup:
            # Seems there are no tags, so only character references need
            # to be decoded.
            import re
            try:
                from htmlentitydefs import name2codepoint  # Python 2
            except ImportError:
                from html.entities import name2codepoint  # Python 3
            try:
                to_char = unichr  # Python 2
            except NameError:
                to_char = chr  # Python 3

            def decode(match):
                # Unknown names and code points that cannot be converted
                # are left exactly as they appeared in the markup.
                named, decimal = match.group("e"), match.group("h")
                try:
                    if decimal is not None:
                        return to_char(int(decimal))
                    if named in name2codepoint:
                        return to_char(name2codepoint[named])
                except (ValueError, OverflowError):
                    pass
                return match.group(0)

            # A single pass over the original markup decodes every reference
            # exactly once, so "&amp;lt;" yields "&lt;" and never "<".
            reference = re.compile(r"&(?:(?P<e>[a-zA-Z]+)|#(?P<h>[0-9]+));")
            self.text(reference.sub(decode, markup))

        self._emitComment("RAW HTML: " + markup)
        return ""
Example #2
0
 def handle_entityref(self, ref):
     """Append the markup for the entity reference ``ref`` to ``self.pieces``.

     Called for each entity reference, e.g. for '&copy;', ref will be 'copy'.
     A name found in ``name2codepoint`` is reconstructed as ``&ref;``.  Any
     other name is appended as ``&amp;ref``: the ampersand is escaped and no
     trailing semicolon is emitted.
     """
     # `in` replaces dict.has_key(), which is deprecated and no longer
     # exists on Python 3 dictionaries.
     if ref in name2codepoint:
         self.pieces.append('&%s;' % ref)
     else:
         self.pieces.append('&amp;%s' % ref)
Example #3
0
 def handle_entityref(self, ref):
     """Record the entity reference ``ref`` in ``self.pieces``.

     Called for each entity reference, e.g. for '&copy;', ref will be 'copy'.
     When ``ref`` is a key of ``name2codepoint`` the original reference is
     rebuilt as ``&ref;``.  Otherwise the text ``&amp;ref`` is appended,
     i.e. the ampersand is escaped and no semicolon is added.
     """
     # Membership test with `in` instead of the deprecated dict.has_key(),
     # which Python 3 dictionaries no longer provide.
     known = ref in name2codepoint
     if known:
         piece = '&%s;' % ref
     else:
         piece = '&amp;%s' % ref
     self.pieces.append(piece)
Example #4
0
    def rawHTML(self, markup):
        """Process a piece of raw HTML markup.

        Markup consisting only of whitespace is skipped.  When the markup
        has no ``<`` and no ``>`` it is considered tag-free text: named
        entity references (``&copy;``) and decimal character references
        (``&#169;``) are replaced by the characters they denote and the
        decoded text is handed to ``self.text``.  Every non-blank markup is
        additionally reported through ``self._emitComment`` with the prefix
        ``"RAW HTML: "``.

        The return value is always the empty string.
        """
        if markup.strip() == "":
            return ""

        if "<" not in markup and ">" not in markup:
            # Seems there are no tags; decode the character references.
            import re

            try:
                from htmlentitydefs import name2codepoint  # Python 2
            except ImportError:
                from html.entities import name2codepoint  # Python 3

            try:
                as_char = unichr  # Python 2
            except NameError:
                as_char = chr  # Python 3

            def replace_reference(found):
                # Returns the decoded character, or the reference unchanged
                # when the name is unknown or the number is not convertible.
                entity_name = found.group("e")
                codepoint = found.group("h")
                try:
                    if codepoint is not None:
                        return as_char(int(codepoint))
                    if entity_name in name2codepoint:
                        return as_char(name2codepoint[entity_name])
                except (ValueError, OverflowError):
                    pass
                return found.group(0)

            # Decoding in one pass over the untouched markup guarantees each
            # reference is expanded once only ("&amp;lt;" stays "&lt;").
            cleaned = re.sub(
                r"&(?:(?P<e>[a-zA-Z]+)|#(?P<h>[0-9]+));", replace_reference, markup
            )
            self.text(cleaned)

        self._emitComment("RAW HTML: " + markup)
        return ""
Example #5
0
def entity(match):
    """Return the replacement text for one matched character reference.

    ``match.group(1)`` is read as the body of the reference: ``#x`` plus
    hexadecimal digits, ``#`` plus decimal digits, or an entity name.
    NOTE(review): presumably this is the text between ``&`` and ``;`` and
    the function is used as an ``re.sub`` callback -- the pattern is defined
    by the caller; confirm.

    Entity names are case-sensitive (``Aacute`` and ``aacute`` are different
    characters), so the name is looked up exactly as written first and only
    then in lower case (which still accepts e.g. ``AMP``).  A reference that
    cannot be decoded -- unknown name, malformed digits, code point out of
    range -- yields the lower-cased body wrapped in square brackets.
    """
    raw = match.group(1)
    value = raw.lower()
    fallback = '[' + value + ']'

    if value.startswith('#'):
        try:
            if value.startswith('#x'):
                return unichr(int(value[2:], 16))
            return unichr(int(value[1:]))
        except (ValueError, OverflowError):
            # Malformed digits or a code point unichr() cannot represent.
            return fallback

    for name in (raw, value):
        if name in name2codepoint:
            return unichr(name2codepoint[name])
    return fallback
Example #6
0
File: web.py Project: 0gobi/chinbot
def entity(match):
   """Decode one matched character reference into its character.

   The body of the reference is taken from ``match.group(1)``: ``#x`` and
   hexadecimal digits, ``#`` and decimal digits, or an entity name.
   NOTE(review): the regular expression producing ``match`` is not part of
   this function -- presumably group 1 is the text between ``&`` and ``;``;
   confirm against the caller.

   Because entity names are case-sensitive (``Eacute`` is not ``eacute``),
   the name as written is tried before its lower-cased form.  Whatever
   cannot be decoded (unknown name, bad digits, code point out of range) is
   returned as the lower-cased body enclosed in square brackets.
   """
   written = match.group(1)
   value = written.lower()

   try:
      if value.startswith('#x'):
         return unichr(int(value[2:], 16))
      if value.startswith('#'):
         return unichr(int(value[1:]))
   except (ValueError, OverflowError):
      # Not a usable number: fall back to the bracketed form below.
      return '[' + value + ']'

   if written in name2codepoint:
      return unichr(name2codepoint[written])
   if value in name2codepoint:
      return unichr(name2codepoint[value])
   return '[' + value + ']'
Example #7
0
def entity2unicode(mo, dont_convert):
    """Return the character for a matched entity, or the entity unchanged.

    ``mo`` is a match object whose group 0 is the complete entity text and
    whose group 1 is its name: ``#`` plus decimal digits, ``#x``/``#X`` plus
    hexadecimal digits, or an entity name looked up in ``name2codepoint``.

    The entity text is returned untouched when its name is in
    ``dont_convert``, when the name is unknown, or when a numeric reference
    is malformed or outside the range ``unichr`` accepts.
    """
    entity = mo.group(0)
    name = mo.group(1)
    if name in dont_convert:
        return entity

    if name.startswith(u'#'):
        # The number parsing lives inside the try block as well, so that
        # malformed digits leave the entity alone instead of raising.
        try:
            if name[1:2] in (u'x', u'X'):
                ordinal = int(name[2:], 16)
            else:
                ordinal = int(name[1:])
            return unichr(ordinal)
        except (ValueError, OverflowError):
            return entity

    if name in name2codepoint:
        return unichr(name2codepoint[name])
    return entity