def remove_html_tags(html):
    """
    Render an HTML fragment as plain text and pass it to ``mark_safe``.

    ``<br>``, ``<ul>``/``</ul>`` and ``</li>`` become a newline, ``<li>``
    becomes a ``*`` bullet on its own line and ``<p>`` (with or without
    attributes) becomes a blank line; every other tag is dropped. Runs of
    three or more line-break characters are cut down to their first two
    and surrounding whitespace is stripped.

    Character references are decoded in a single pass, so text is never
    unescaped twice. Decimal (``&#8364;``), hexadecimal (``&#x20AC;``) and
    named (``&euro;``) references are supported; references that are
    unknown or out of range are left untouched rather than deleted.

    :param html: HTML markup as a string.
    :return: Whatever ``mark_safe`` returns for the stripped plain text.
    """
    # Function-scope imports keep this block working on both Python 3
    # (html.entities) and Python 2 (htmlentitydefs).
    try:
        from html.entities import name2codepoint
    except ImportError:
        from htmlentitydefs import name2codepoint
    try:
        to_char = unichr  # Python 2 builtin
    except NameError:
        to_char = chr  # Python 3: chr() covers all of Unicode

    # If we would want more speed, we could make these global
    # "\b" stops "<li" from matching "<link>" and "<p" from matching "<pre>".
    re_newline_tags = re.compile(r'(<br\b[^>]*>|</?ul\b[^>]*>|</li>)', re.I)
    re_listing_tags = re.compile(r'<li\b[^>]*>', re.I)
    re_paragraph_tags = re.compile(r'<p\b[^>]*>', re.I)
    re_strip_tags = re.compile(r'<[^>]*>')
    # Only well-formed references match, so one match can never swallow
    # several entities (the old '&(.{2,8});' turned '&lt;p&gt;' into '').
    re_entities = re.compile(
        r'&(#[xX][0-9a-fA-F]+|#[0-9]+|[A-Za-z][A-Za-z0-9]*);')

    def decode_entity(match):
        """Return the character for one reference, or the raw text."""
        ref = match.group(1)
        try:
            if ref[0] != '#':
                return to_char(name2codepoint[ref])
            if ref[1] in 'xX':
                return to_char(int(ref[2:], 16))
            return to_char(int(ref[1:]))
        except (KeyError, ValueError, OverflowError):
            # Unknown name or code point outside the valid range.
            return match.group(0)

    # Convert common HTML elements to their text equivalent
    result = re_newline_tags.sub('\n', html)
    result = re_listing_tags.sub('\n * ', result)
    result = re_paragraph_tags.sub('\n\n', result)

    # Remove all HTML/XML tags from the string
    result = re_strip_tags.sub('', result)

    # Convert numeric and named entities to their unicode character
    result = re_entities.sub(decode_entity, result)

    # Convert more than two newlines to two newlines
    result = re.sub(r'([\r\n]{2})[\r\n]+', r'\1', result)

    # NOTE(review): entities are decoded after tags are stripped, so input
    # such as "&lt;script&gt;" ends up as literal "<script>" in the value
    # handed to mark_safe. Presumably mark_safe disables template escaping;
    # confirm callers only pass trusted markup.
    return mark_safe(result.strip())
def fixup(m):
    """
    Substitution callback that strips tags and decodes character references.

    Intended for use as the replacement function of ``re.sub``: the matched
    text is inspected and the replacement string is returned.

    * Text starting with ``<`` is treated as a tag and replaced by ``""``.
    * ``&#NNN;`` / ``&#xHHH;`` (either case of ``x``) become the character
      with that code point.
    * ``&name;`` becomes the character for that HTML entity name.
    * Anything else, including malformed, unknown or out-of-range
      references, is returned unchanged.

    :param m: A regular-expression match object.
    :return: The replacement text for the match.
    """
    text = m.group(0)
    if text[:1] == "<":
        return ""  # ignore tags
    if text[:1] != "&":
        return text  # leave as is

    # Resolve the Python 2 / Python 3 differences locally so the callback
    # does not rely on module-level compatibility flags or aliases.
    try:
        from html.entities import name2codepoint
    except ImportError:
        from htmlentitydefs import name2codepoint
    try:
        to_char = unichr  # Python 2 builtin
    except NameError:
        to_char = chr  # Python 3

    try:
        if text[:3] in ("&#x", "&#X"):
            return to_char(int(text[3:-1], 16))
        if text[:2] == "&#":
            return to_char(int(text[2:-1]))
        # name2codepoint maps names straight to code points, so entities
        # outside Latin-1 (e.g. &euro;) decode correctly on every version.
        return to_char(name2codepoint[text[1:-1]])
    except (KeyError, ValueError, OverflowError):
        return text  # leave as is
def handle_entityref(self, name):
    """
    Record the text that should stand in for the entity reference ``name``.

    A name with an entry in ``entitydefs`` contributes a single space to
    ``self.fed``; any other name is written back as ``'&'`` followed by the
    name (no trailing semicolon is added).

    :param name: Entity name without the leading ``&``.
    """
    recognised = entitydefs.get(name) is not None
    self.fed.append(' ' if recognised else '&{}'.format(name))
def handle_entityref(self, name):
    """
    Append the replacement for the entity reference ``name`` to ``self.fed``.

    A name with an entry in ``entitydefs`` is replaced by a single space.
    For any other name the reference is re-read from the raw input so it
    can be emitted exactly as written: ``self.entityref`` is matched
    against the current line of ``self.rawdata`` starting at
    ``self.offset``. If that match fails an empty string is appended.

    :param name: Entity name without the leading ``&``.
    """
    if entitydefs.get(name) is not None:
        self.fed.append(" ")
        return

    # NOTE(review): assumes self.lineno / self.offset address the start of
    # the reference inside self.rawdata - confirm against the parser.
    current_line = self.rawdata.splitlines()[self.lineno - 1]
    m = self.entityref.match(current_line[self.offset:])
    if m is None:
        self.fed.append("")
        return

    entity = m.group()
    # semicolon is consumed, other chars are not.
    if entity[-1] != ";":
        entity = entity[:-1]
    self.fed.append(entity)
def handle_entityref(self, name):
    """
    Append the replacement for the entity reference ``name`` to ``self.fed``.

    Names found in ``entitydefs`` are replaced by one space. Other names
    are copied from the raw input as written: ``self.entityref`` is matched
    against the current line of ``self.rawdata`` from ``self.offset``
    onwards, and the matched text (minus a trailing non-semicolon
    terminator) is appended. When the pattern does not match, an empty
    string is appended instead.

    :param name: Entity name without the leading ``&``.
    """
    if entitydefs.get(name) is None:
        # NOTE(review): assumes self.lineno / self.offset address the start
        # of the reference inside self.rawdata - confirm against the parser.
        line = self.rawdata.splitlines()[self.lineno - 1]
        m = self.entityref.match(line[self.offset:])
        # Test the match object itself: calling m.group() on a failed
        # match would raise AttributeError before any fallback could run.
        if m is not None:
            entity = m.group()
            # semicolon is consumed, other chars are not.
            if entity[-1] != ';':
                entity = entity[:-1]
            self.fed.append(entity)
        else:
            self.fed.append('')
    else:
        self.fed.append(' ')
def remove_html_tags(html):
    """
    Remove HTML tags from a string and replace numeric and named entities
    with the corresponding character, so the HTML text can be displayed in
    a simple text view.

    Character references are decoded in a single pass, so text is never
    unescaped twice. Decimal (``&#8364;``), hexadecimal (``&#x20AC;``) and
    named (``&euro;``) references are supported; references that are
    unknown or out of range are left untouched rather than deleted.

    :param html: HTML markup as a string, or None.
    :return: The text with tags removed, entities decoded and surrounding
        whitespace stripped, or None if ``html`` is None.
    """
    if html is None:
        return None

    # Function-scope imports keep this block working on both Python 3
    # (html.entities) and Python 2 (htmlentitydefs).
    try:
        from html.entities import name2codepoint
    except ImportError:
        from htmlentitydefs import name2codepoint
    try:
        to_char = unichr  # Python 2 builtin
    except NameError:
        to_char = chr  # Python 3: chr() covers all of Unicode

    # If we would want more speed, we could make these global
    re_strip_tags = re.compile(r'<[^>]*>')
    # Only well-formed references match, so one match can never swallow
    # several entities (the old '&(.{2,8});' turned '&lt;p&gt;' into '').
    re_entities = re.compile(
        r'&(#[xX][0-9a-fA-F]+|#[0-9]+|[A-Za-z][A-Za-z0-9]*);')

    def decode_entity(match):
        """Return the character for one reference, or the raw text."""
        ref = match.group(1)
        try:
            if ref[0] != '#':
                return to_char(name2codepoint[ref])
            if ref[1] in 'xX':
                return to_char(int(ref[2:], 16))
            return to_char(int(ref[1:]))
        except (KeyError, ValueError, OverflowError):
            # Unknown name or code point outside the valid range.
            return match.group(0)

    # Remove all HTML/XML tags from the string
    result = re_strip_tags.sub('', html)

    # Convert numeric and named entities to their unicode character
    result = re_entities.sub(decode_entity, result)

    return result.strip()
def remove_html_tags(html):
    """
    Remove HTML tags from a string and replace numeric and named entities
    with the corresponding character, so the HTML text can be displayed in
    a simple text view.

    ``<br>``, ``<ul>``/``</ul>`` and ``</li>`` become a newline, ``<li>``
    becomes a ``*`` bullet on its own line and ``<p>`` (with or without
    attributes) becomes a blank line; every other tag is dropped. Runs of
    three or more line-break characters are cut down to their first two
    and surrounding whitespace is stripped.

    Character references are decoded in a single pass, so text is never
    unescaped twice. Decimal (``&#8364;``), hexadecimal (``&#x20AC;``) and
    named (``&euro;``) references are supported; references that are
    unknown or out of range are left untouched rather than deleted.

    :param html: HTML markup as a string, or None.
    :return: The plain-text rendering, or None if ``html`` is None.
    """
    if html is None:
        return None

    # Function-scope imports keep this block working on both Python 3
    # (html.entities) and Python 2 (htmlentitydefs).
    try:
        from html.entities import name2codepoint
    except ImportError:
        from htmlentitydefs import name2codepoint
    try:
        to_char = unichr  # Python 2 builtin
    except NameError:
        to_char = chr  # Python 3: chr() covers all of Unicode

    # If we would want more speed, we could make these global
    # "\b" stops "<li" from matching "<link>" and "<p" from matching "<pre>".
    re_newline_tags = re.compile(r'(<br\b[^>]*>|</?ul\b[^>]*>|</li>)', re.I)
    re_listing_tags = re.compile(r'<li\b[^>]*>', re.I)
    re_paragraph_tags = re.compile(r'<p\b[^>]*>', re.I)
    re_strip_tags = re.compile(r'<[^>]*>')
    # Only well-formed references match, so one match can never swallow
    # several entities (the old '&(.{2,8});' turned '&lt;p&gt;' into '').
    re_entities = re.compile(
        r'&(#[xX][0-9a-fA-F]+|#[0-9]+|[A-Za-z][A-Za-z0-9]*);')

    def decode_entity(match):
        """Return the character for one reference, or the raw text."""
        ref = match.group(1)
        try:
            if ref[0] != '#':
                return to_char(name2codepoint[ref])
            if ref[1] in 'xX':
                return to_char(int(ref[2:], 16))
            return to_char(int(ref[1:]))
        except (KeyError, ValueError, OverflowError):
            # Unknown name or code point outside the valid range.
            return match.group(0)

    # Convert common HTML elements to their text equivalent
    result = re_newline_tags.sub('\n', html)
    result = re_listing_tags.sub('\n * ', result)
    result = re_paragraph_tags.sub('\n\n', result)

    # Remove all HTML/XML tags from the string
    result = re_strip_tags.sub('', result)

    # Convert numeric and named entities to their unicode character
    result = re_entities.sub(decode_entity, result)

    # Convert more than two newlines to two newlines
    result = re.sub(r'([\r\n]{2})[\r\n]+', r'\1', result)

    return result.strip()
def remove_html_tags(html):
    """
    Remove HTML tags from a string and replace numeric and named entities
    with the corresponding character, so the HTML text can be displayed in
    a simple text view.

    Structural tags are turned into plain-text layout before everything
    else is stripped: ``<br>``, ``<ul>``/``</ul>`` and ``</li>`` become a
    newline, ``<li>`` becomes a ``*`` bullet on its own line, and ``<p>``
    (with or without attributes) becomes a blank line. Runs of three or
    more line-break characters are cut down to their first two and the
    result is stripped of surrounding whitespace.

    Decimal (``&#8364;``), hexadecimal (``&#x20AC;``) and named
    (``&euro;``) character references are decoded together in one pass, so
    already-decoded text is never decoded again. Unknown or out-of-range
    references stay in the output as written.

    :param html: HTML markup as a string, or None.
    :return: The plain-text rendering, or None if ``html`` is None.
    """
    if html is None:
        return None

    # Imported here so the function runs unchanged on Python 3
    # (html.entities) and Python 2 (htmlentitydefs).
    try:
        from html.entities import name2codepoint
    except ImportError:
        from htmlentitydefs import name2codepoint
    try:
        codepoint_to_text = unichr  # Python 2 builtin
    except NameError:
        codepoint_to_text = chr  # Python 3: chr() covers all of Unicode

    def expand_reference(match):
        """Return the character for one reference, or the raw text."""
        body = match.group(1)
        try:
            if body.startswith(('#x', '#X')):
                return codepoint_to_text(int(body[2:], 16))
            if body.startswith('#'):
                return codepoint_to_text(int(body[1:]))
            return codepoint_to_text(name2codepoint[body])
        except (KeyError, ValueError, OverflowError):
            # Unknown name or code point outside the valid range.
            return match.group(0)

    # If we would want more speed, we could make these global
    # "\b" keeps "<li" from matching "<link>" and "<p" from matching "<pre>".
    re_newline_tags = re.compile(r'(<br\b[^>]*>|</?ul\b[^>]*>|</li>)', re.I)
    re_listing_tags = re.compile(r'<li\b[^>]*>', re.I)
    re_paragraph_tags = re.compile(r'<p\b[^>]*>', re.I)
    re_strip_tags = re.compile(r'<[^>]*>')
    # Restricting the body to a well-formed reference prevents one match
    # from spanning several entities ('&(.{2,8});' erased '&lt;p&gt;').
    re_entities = re.compile(
        r'&(#[xX][0-9a-fA-F]+|#[0-9]+|[A-Za-z][A-Za-z0-9]*);')

    # Convert common HTML elements to their text equivalent
    result = re_newline_tags.sub('\n', html)
    result = re_listing_tags.sub('\n * ', result)
    result = re_paragraph_tags.sub('\n\n', result)

    # Remove all HTML/XML tags from the string
    result = re_strip_tags.sub('', result)

    # Convert numeric and named entities to their unicode character
    result = re_entities.sub(expand_reference, result)

    # Convert more than two newlines to two newlines
    result = re.sub(r'([\r\n]{2})[\r\n]+', r'\1', result)

    return result.strip()
def handle_entityref(self, name):
    """
    Report an entity reference that has no usable ``entitydefs`` entry.

    When the lookup of ``name`` in ``entitydefs`` yields nothing (or an
    empty value), ``self.errInput`` is called with the current
    ``self.getpos()`` result and the code ``"wrongEntity"``. Names with a
    non-empty entry are accepted silently.

    :param name: Entity name without the leading ``&``.
    """
    if entitydefs.get(name):
        return
    self.errInput(self.getpos(), "wrongEntity")
def handle_entityref(self, name):
    """
    Return the ``entitydefs`` value for ``name``.

    :param name: Entity name without the leading ``&``.
    :return: The table entry for ``name``, or ``''`` when there is none.
    """
    try:
        return entitydefs[name]
    except KeyError:
        return ''
def _fix_entities(s):
    """
    Substitute every ``_entities_re`` match in ``s`` via ``entitydefs``.

    Each match is replaced by the ``entitydefs`` value for its first
    capture group; when the table has no entry, the captured text itself
    is used as the replacement.

    :param s: Text to process; any falsy value yields ``""``.
    :return: The text after substitution.
    """
    if s:
        def _replacement(match):
            key = match.group(1)
            return entitydefs.get(key, key)

        return _entities_re.sub(_replacement, s)
    return ""
def handle_entityref(self, name):
    """
    Forward the replacement for entity ``name`` to ``self.builder.text``.

    The replacement is the ``entitydefs`` value for ``name``; names without
    an entry are forwarded as an empty string.

    :param name: Entity name without the leading ``&``.
    """
    replacement = entitydefs.get(name)
    if replacement is None:
        replacement = ''
    self.builder.text(replacement)
def handle_charref(self, name):
    """
    Return the character for the numeric character reference ``name``.

    ``name`` is the body of a reference such as ``&#65;`` or ``&#x41;``,
    i.e. decimal digits or ``x``/``X`` followed by hexadecimal digits.
    Looking such a body up in ``entitydefs`` (a table keyed by entity
    *names*) can never succeed, so it is decoded numerically here. For
    backward compatibility a ``name`` that does have an ``entitydefs``
    entry still returns that entry.

    :param name: Reference body without the leading ``&#`` and trailing
        ``;``.
    :return: The decoded character, or ``""`` when ``name`` is not a valid
        reference or its code point is out of range.
    """
    named = entitydefs.get(name)
    if named is not None:
        return named

    try:
        to_char = unichr  # Python 2 builtin
    except NameError:
        to_char = chr  # Python 3

    try:
        if name[:1] in ("x", "X"):
            return to_char(int(name[1:], 16))
        return to_char(int(name))
    except (TypeError, ValueError, OverflowError):
        return ""
def handle_entityref(self, name):
    """
    Validate the entity reference ``name`` and report problems.

    Two independent checks are made, each reporting through
    ``self.errInput`` with the current ``self.getpos()`` result:

    * ``"wrongEntity"`` when ``entitydefs`` has no usable entry for
      ``name``;
    * ``"samsungUnwantedQuote"`` when ``self.account`` is ``"Samsung"``
      and ``name`` is contained in any item of ``self.samsungQuoteList``.

    :param name: Entity name without the leading ``&``.
    """
    if not entitydefs.get(name):
        self.errInput(self.getpos(), "wrongEntity")

    if self.account != "Samsung":
        return

    # At most one report is made, at the first list item containing name.
    for quote in self.samsungQuoteList:
        if name in quote:
            self.errInput(self.getpos(), "samsungUnwantedQuote")
            break