def rawHTML(self, markup):
    """Handle a fragment of raw HTML markup.

    Blank markup is ignored.  Markup that contains neither ``<`` nor ``>``
    is treated as plain text: its character references are decoded and the
    result is passed to ``self.text``.  The untouched markup is then handed
    to ``self._emitComment`` prefixed with ``"RAW HTML: "``.

    Args:
        markup: The raw HTML string.

    Returns:
        Always the empty string.
    """
    if markup.strip() == "":
        return ""

    if "<" not in markup and ">" not in markup:
        # Seems there are no tags, so only character references need care.
        import re
        try:  # Python 2
            from htmlentitydefs import name2codepoint
            to_char = unichr
        except ImportError:  # Python 3
            from html.entities import name2codepoint
            to_char = chr

        def decode(found):
            """Return the character for one reference, or the reference
            itself when it cannot be decoded."""
            name = found.group("e")
            try:
                if name is not None:
                    return to_char(name2codepoint[name])
                if found.group("x") is not None:
                    return to_char(int(found.group("x"), 16))
                return to_char(int(found.group("h")))
            except (KeyError, ValueError, OverflowError):
                # Unknown name or code point out of range: keep the text.
                return found.group(0)

        # Decode everything in a single pass.  Chained str.replace() calls
        # would decode text produced by an earlier replacement a second
        # time (e.g. "&amp;lt;" next to "&lt;" would end up as "<").
        cleaned = re.sub(
            r"&(?:(?P<e>[a-zA-Z][a-zA-Z0-9]*)"
            r"|#(?P<h>[0-9]+)"
            r"|#[xX](?P<x>[0-9a-fA-F]+));",
            decode,
            markup,
        )
        self.text(cleaned)

    # NOTE(review): the original layout was flattened onto one line; the
    # comment is assumed to be emitted for every non-blank markup -- confirm.
    self._emitComment("RAW HTML: " + markup)
    return ""
def handle_entityref(self, ref):
    """Append the source text of a named entity reference to ``self.pieces``.

    Called for each entity reference; e.g. for ``&copy;`` the ``ref``
    argument is ``'copy'``.  A name found in ``name2codepoint`` is
    reconstructed with its terminating semicolon (``&copy;``); any other
    name is emitted without one (``&ref``).

    Args:
        ref: The entity name, without the leading ``&`` or trailing ``;``.
    """
    if ref in name2codepoint:
        self.pieces.append('&%s;' % ref)
    else:
        self.pieces.append('&%s' % ref)
def entity(match):
    """Return the replacement text for one matched character reference.

    Group 1 of ``match`` holds the reference body: ``#x<hex>``,
    ``#<decimal>`` or an entity name.  Numeric forms are matched
    case-insensitively.  Names are looked up in ``name2codepoint`` with
    their exact spelling first, so case-sensitive pairs such as ``Eacute``
    and ``eacute`` stay distinct, and then in lowercase.  Anything that
    cannot be resolved yields the lowercased body wrapped in square
    brackets.

    Args:
        match: A regex match object whose group 1 is the reference body.

    Returns:
        The decoded character, or ``'[' + body.lower() + ']'``.
    """
    raw = match.group(1)
    value = raw.lower()
    placeholder = '[' + value + ']'

    if value.startswith('#'):
        try:
            if value.startswith('#x'):
                return unichr(int(value[2:], 16))
            return unichr(int(value[1:]))
        except (ValueError, OverflowError):
            # Malformed digits or a code point outside the valid range.
            return placeholder

    # Exact spelling first; the lowercase fallback keeps accepting
    # references written in a different case (e.g. "NBSP").
    for name in (raw, value):
        if name in name2codepoint:
            return unichr(name2codepoint[name])
    return placeholder
def entity2unicode(mo, dont_convert):
    """Convert one matched character reference to its character.

    Args:
        mo: A regex match object; group 0 is the whole reference and
            group 1 its body (``#<decimal>``, ``#x<hex>`` or a name).
        dont_convert: Container of reference bodies to leave untouched.

    Returns:
        The decoded character, or the original reference text when the
        body is listed in ``dont_convert``, is not a known entity name, or
        is a numeric reference that cannot be turned into a character.
    """
    entity = mo.group(0)
    name = mo.group(1)
    if name in dont_convert:
        return entity

    if name.startswith(u'#'):
        try:
            # Accept both "#x" and "#X" as the hexadecimal prefix.
            if name[1:2] in (u'x', u'X'):
                ordinal = int(name[2:], 16)
            else:
                ordinal = int(name[1:])
            return unichr(ordinal)
        except (ValueError, OverflowError):
            # Malformed digits or a code point outside the valid range.
            return entity

    if name in name2codepoint:
        return unichr(name2codepoint[name])
    return entity