def _replace(match_obj):
    """Return the replacement text for one matched character reference.

    Groups 1, 3 and 5 of ``match_obj`` are each passed through
    ``_translate`` and then read as, respectively, a decimal code point, a
    hexadecimal code point and an entity name.

    * Decimal / hexadecimal references are converted with ``chr``.  If the
      number is not a valid code point the matched text is returned
      unchanged instead of letting the error escape from the substitution.
    * Names are looked up in ``name2txt``, first as given and then with a
      trailing ``';'``; an unknown name yields the matched text unchanged.

    Raises:
        RuntimeError: if none of the three groups is set, which means the
            calling pattern and this function disagree.
    """
    original_str = match_obj.group(0)
    digits, xdigits, name = map(_translate, match_obj.group(1, 3, 5))

    try:
        if digits is not None:
            return chr(int(digits))
        if xdigits is not None:
            return chr(int(xdigits, base=16))
    except (ValueError, OverflowError):
        # chr() rejects values outside range(0x110000); leave the reference
        # in the output rather than aborting the whole substitution.
        return original_str

    if name is not None:
        txt = name2txt.get(name)
        if txt is None:
            # Some tables key their entries with the terminating semicolon.
            txt = name2txt.get(name + ';', original_str)
        return txt

    # The original statement here (``raise logic - error``) only produced a
    # NameError; raise a real exception that says what went wrong.
    raise RuntimeError(
        "logic-error: no reference group matched in %r" % (original_str,)
    )
def remove_html_tags(html):
    """Convert an HTML fragment to plain text and pass it to ``mark_safe``.

    ``<br>``, ``<ul>``/``</ul>`` and ``</li>`` become newlines, ``<li>``
    becomes a ``*`` bullet, ``<p>``/``<P>`` becomes a blank line and every
    remaining tag is removed.  Decimal references of two to four digits
    (``&#NN;`` .. ``&#NNNN;``) and named entities found in ``entitydefs``
    are replaced by their character; named entities that are not in
    ``entitydefs`` are dropped.  Runs of three or more line breaks are
    collapsed to two and the text is stripped.

    Returns:
        ``mark_safe(text)`` for the resulting text.
    """
    # If we would want more speed, we could make these global
    re_strip_tags = re.compile(r'<[^>]*>')
    re_unicode_entities = re.compile(r'&#(\d{2,4});')
    # Only a 2-8 character alphanumeric name counts as an entity.  The
    # previous "." pattern also matched free text such as "& b;" and
    # silently deleted it.
    re_html_entities = re.compile(r'&([A-Za-z][A-Za-z0-9]{1,7});')
    re_newline_tags = re.compile(r'(<br[^>]*>|<[/]?ul[^>]*>|</li>)', re.I)
    re_listing_tags = re.compile(r'<li[^>]*>', re.I)

    result = html

    # Convert common HTML elements to their text equivalent
    result = re_newline_tags.sub('\n', result)
    result = re_listing_tags.sub('\n * ', result)
    result = re.sub('<[Pp]>', '\n\n', result)

    # Remove all HTML/XML tags from the string
    result = re_strip_tags.sub('', result)

    # Convert numeric XML entities to their unicode character
    result = re_unicode_entities.sub(lambda x: chr(int(x.group(1))), result)

    # Convert named HTML entities to their unicode character.  The values of
    # ``entitydefs`` are already text, so they must not be decoded again
    # (``str(value, 'iso-8859-1')`` raises TypeError on Python 3).
    result = re_html_entities.sub(
        lambda x: entitydefs.get(x.group(1), ''), result
    )

    # Convert more than two newlines to two newlines
    result = re.sub('([\r\n]{2})([\r\n])+', '\\1', result)

    # NOTE(review): entity references have just been decoded, so an input
    # like "&lt;script&gt;" reaches mark_safe() as "<script>".  Confirm that
    # marking this text as safe is really intended for untrusted input.
    return mark_safe(result.strip())
def remove_html_tags(html):
    """Convert an HTML fragment to plain text and pass it to ``mark_safe``.

    ``<br>``, ``<ul>``/``</ul>`` and ``</li>`` become newlines, ``<li>``
    becomes a ``*`` bullet, ``<p>``/``<P>`` becomes a blank line and every
    remaining tag is removed.  Decimal references of two to four digits
    (``&#NN;`` .. ``&#NNNN;``) and named entities found in ``entitydefs``
    are replaced by their character; named entities that are not in
    ``entitydefs`` are dropped.  Runs of three or more line breaks are
    collapsed to two and the text is stripped.

    Returns:
        ``mark_safe(text)`` for the resulting text.
    """
    # If we would want more speed, we could make these global.
    # All patterns are raw strings: "\d" in a plain literal is an invalid
    # escape sequence.
    re_strip_tags = re.compile(r'<[^>]*>')
    re_unicode_entities = re.compile(r'&#(\d{2,4});')
    # Only a 2-8 character alphanumeric name counts as an entity.  The
    # previous "." pattern also matched free text such as "& b;" and
    # silently deleted it.
    re_html_entities = re.compile(r'&([A-Za-z][A-Za-z0-9]{1,7});')
    re_newline_tags = re.compile(r'(<br[^>]*>|<[/]?ul[^>]*>|</li>)', re.I)
    re_listing_tags = re.compile(r'<li[^>]*>', re.I)

    result = html

    # Convert common HTML elements to their text equivalent
    result = re_newline_tags.sub('\n', result)
    result = re_listing_tags.sub('\n * ', result)
    result = re.sub('<[Pp]>', '\n\n', result)

    # Remove all HTML/XML tags from the string
    result = re_strip_tags.sub('', result)

    # Convert numeric XML entities to their unicode character
    result = re_unicode_entities.sub(lambda x: chr(int(x.group(1))), result)

    # Convert named HTML entities to their unicode character.  The values of
    # ``entitydefs`` are already text, so they must not be decoded again
    # (``str(value, 'iso-8859-1')`` raises TypeError on Python 3).
    result = re_html_entities.sub(
        lambda x: entitydefs.get(x.group(1), ''), result
    )

    # Convert more than two newlines to two newlines
    result = re.sub('([\r\n]{2})([\r\n])+', '\\1', result)

    # NOTE(review): entity references have just been decoded, so an input
    # like "&lt;script&gt;" reaches mark_safe() as "<script>".  Confirm that
    # marking this text as safe is really intended for untrusted input.
    return mark_safe(result.strip())
def skippedEntity(self, name):
    """Forward the text of a skipped entity to ``self.characters``.

    The entity ``name`` is looked up in ``html_entity_defs``; the value
    found is converted with ``str`` and handed to ``self.characters``,
    whose result is returned.

    Raises:
        RuntimeError: if ``name`` has no (non-empty) entry in
            ``html_entity_defs``.
    """
    # Encoding?
    replacement = html_entity_defs.get(name)
    if replacement:
        return self.characters(str(replacement))
    raise RuntimeError("Unknown HTML entity &%s;" % name)
def fixup(m):
    """Return the plain-text replacement for one matched tag or reference.

    * Text starting with ``<`` (a tag) is replaced by the empty string.
    * ``&#NNN;`` and ``&#xHHH;`` / ``&#XHHH;`` are replaced by the character
      with that code point.
    * ``&name;`` is looked up in ``htmlentitydefs``; if the value found is
      itself a ``&#NNN;`` reference it is converted to its character,
      otherwise the value is returned as is.
    * Anything that cannot be converted is returned unchanged.
    """
    text = m.group(0)

    if text.startswith('<'):
        return ''  # ignore tags
    if not text.startswith('&'):
        return text  # Leave as is

    if text.startswith('&#'):
        try:
            if text.startswith(('&#x', '&#X')):
                return chr(int(text[3:-1], 16))
            return chr(int(text[2:-1]))
        except (ValueError, OverflowError):
            # Not a number, or not a valid code point (chr() raises
            # OverflowError, not ValueError, for very large integers).
            return text

    entity = htmlentitydefs.get(text[1:-1])
    if not entity:
        return text  # Leave as is

    if entity.startswith('&#'):
        # TODO: test this case
        try:
            return chr(int(entity[2:-1]))
        except (ValueError, OverflowError):
            return text
    return entity  # TODO: encode ?
def fix_up(m):
    """Return the plain-text replacement for one matched tag or reference.

    * Text starting with ``<`` (a tag) is replaced by the empty string.
    * ``&#NNN;`` and ``&#xHHH;`` / ``&#XHHH;`` are replaced by the character
      with that code point.
    * ``&name;`` is replaced by its value in ``html_entities`` when that
      value is non-empty.
    * Anything that cannot be converted is returned unchanged.
    """
    sub_text = m.group(0)

    if sub_text.startswith('<'):
        return ''  # ignore tags
    if not sub_text.startswith('&'):
        return sub_text  # Leave as is

    if sub_text.startswith('&#'):
        try:
            if sub_text.startswith(('&#x', '&#X')):
                return chr(int(sub_text[3:-1], 16))
            return chr(int(sub_text[2:-1]))
        except (ValueError, OverflowError):
            # Not a number, or not a valid code point (chr() raises
            # OverflowError, not ValueError, for very large integers).
            return sub_text

    entity = html_entities.get(sub_text[1:-1])
    if entity:
        return entity  # TODO: encode ?
    return sub_text  # Leave as is
def convert_entity(m):
    """Return the character for one matched entity reference.

    Group 1 of ``m`` is truthy for a numeric reference, in which case
    group 2 is read as a decimal code point.  Otherwise group 2 is looked
    up as a name in ``entitydefs``.  When the reference cannot be
    converted (not a number, not a valid code point, unknown name) the
    matched text is returned unchanged.
    """
    if m.group(1):
        try:
            return chr(int(m.group(2)))
        except (ValueError, OverflowError):
            # chr() raises OverflowError, not ValueError, for very large
            # integers; treat both as "leave the reference alone".
            return m.group(0)
    return entitydefs.get(m.group(2), m.group(0))
def process(self, html):
    """
    Remove HTML tags from a string and replace numeric and named
    entities with the corresponding character, so the HTML text can be
    displayed in a simple text view.

    ``<br>``, ``<ul>``/``</ul>`` and ``</li>`` become newlines, ``<li>``
    becomes a ``*`` bullet and ``<p>``/``<P>`` becomes a blank line.
    Decimal references of two to four digits and named entities found in
    ``entitydefs`` are converted; named entities that are not in
    ``entitydefs`` are dropped.  Runs of three or more line breaks are
    collapsed to two.

    Returns the stripped text, or ``None`` when ``html`` is ``None``.
    """
    if html is None:
        return None

    # If we would want more speed, we could make these global.
    # All patterns are raw strings: "\d" in a plain literal is an invalid
    # escape sequence.
    re_strip_tags = re.compile(r'<[^>]*>')
    re_unicode_entities = re.compile(r'&#(\d{2,4});')
    # Only a 2-8 character alphanumeric name counts as an entity.  The
    # previous "." pattern also matched free text such as "& b;" and
    # silently deleted it.
    re_html_entities = re.compile(r'&([A-Za-z][A-Za-z0-9]{1,7});')
    re_newline_tags = re.compile(r'(<br[^>]*>|<[/]?ul[^>]*>|</li>)', re.I)
    re_listing_tags = re.compile(r'<li[^>]*>', re.I)

    result = html

    # Convert common HTML elements to their text equivalent
    result = re_newline_tags.sub('\n', result)
    result = re_listing_tags.sub('\n * ', result)
    result = re.sub('<[Pp]>', '\n\n', result)

    # Remove all HTML/XML tags from the string
    result = re_strip_tags.sub('', result)

    # Convert numeric XML entities to their unicode character
    result = re_unicode_entities.sub(lambda x: chr(int(x.group(1))), result)

    # Convert named HTML entities to their unicode character.  The values of
    # ``entitydefs`` are already text, so they must not be decoded again
    # (``str(value, 'iso-8859-1')`` raises TypeError on Python 3).
    result = re_html_entities.sub(
        lambda x: entitydefs.get(x.group(1), ''), result)

    # Convert more than two newlines to two newlines
    result = re.sub('([\r\n]{2})([\r\n])+', '\\1', result)

    return result.strip()
def process(self, html):
    """
    Remove HTML tags from a string and replace numeric and named
    entities with the corresponding character, so the HTML text can be
    displayed in a simple text view.

    ``<br>``, ``<ul>``/``</ul>`` and ``</li>`` become newlines, ``<li>``
    becomes a ``*`` bullet and ``<p>``/``<P>`` becomes a blank line.
    Decimal references of two to four digits and named entities found in
    ``entitydefs`` are converted; named entities that are not in
    ``entitydefs`` are dropped.  Runs of three or more line breaks are
    collapsed to two.

    Returns the stripped text, or ``None`` when ``html`` is ``None``.
    """
    if html is None:
        return None

    # If we would want more speed, we could make these global.
    # All patterns are raw strings: "\d" in a plain literal is an invalid
    # escape sequence.
    re_strip_tags = re.compile(r'<[^>]*>')
    re_unicode_entities = re.compile(r'&#(\d{2,4});')
    # Only a 2-8 character alphanumeric name counts as an entity.  The
    # previous "." pattern also matched free text such as "& b;" and
    # silently deleted it.
    re_html_entities = re.compile(r'&([A-Za-z][A-Za-z0-9]{1,7});')
    re_newline_tags = re.compile(r'(<br[^>]*>|<[/]?ul[^>]*>|</li>)', re.I)
    re_listing_tags = re.compile(r'<li[^>]*>', re.I)

    result = html

    # Convert common HTML elements to their text equivalent
    result = re_newline_tags.sub('\n', result)
    result = re_listing_tags.sub('\n * ', result)
    result = re.sub('<[Pp]>', '\n\n', result)

    # Remove all HTML/XML tags from the string
    result = re_strip_tags.sub('', result)

    # Convert numeric XML entities to their unicode character
    result = re_unicode_entities.sub(lambda x: chr(int(x.group(1))), result)

    # Convert named HTML entities to their unicode character.  The values of
    # ``entitydefs`` are already text, so they must not be decoded again
    # (``str(value, 'iso-8859-1')`` raises TypeError on Python 3).
    result = re_html_entities.sub(
        lambda x: entitydefs.get(x.group(1), ''), result)

    # Convert more than two newlines to two newlines
    result = re.sub('([\r\n]{2})([\r\n])+', '\\1', result)

    return result.strip()