Exemple #1
0
def remove_html_tags(html):
    """
    Convert an HTML fragment to plain text and mark the result as safe.

    <br>, <ul>, </ul> and </li> become a newline, <li> starts a " * "
    bullet on a new line and <p> becomes a blank line; every other tag
    is dropped.  Decimal (&#65;), hexadecimal (&#x41;) and named (&amp;)
    character references are decoded in a single pass; anything that
    cannot be decoded (unknown name, invalid code point, a bare "&" in
    running text) is left exactly as it was.

    Returns None if html is None, otherwise mark_safe() applied to the
    converted text with leading and trailing whitespace stripped.
    """
    if html is None:
        return None

    # unichr only exists on Python 2; chr is its Python 3 equivalent.
    try:
        to_char = unichr
    except NameError:
        to_char = chr

    # If we would want more speed, we could make these global
    re_strip_tags = re.compile(r'<[^>]*>')
    re_entities = re.compile(
        r'&(#[0-9]+|#[xX][0-9a-fA-F]+|[A-Za-z][A-Za-z0-9]*);')
    re_newline_tags = re.compile(r'(<br[^>]*>|<[/]?ul[^>]*>|</li>)', re.I)
    re_listing_tags = re.compile(r'<li[^>]*>', re.I)

    def decode_entity(match):
        """Return the character for one matched reference, or the match itself."""
        reference = match.group(1)
        if not reference.startswith('#'):
            value = entitydefs.get(reference)
            if value is None:
                return match.group(0)  # unknown name: keep the text
            if isinstance(value, bytes):
                # The Python 2 entity table holds Latin-1 byte strings.
                value = value.decode('iso-8859-1')
            if not (value.startswith('&#') and value.endswith(';')):
                return value
            # The Python 2 table spells non-Latin-1 characters as '&#NNNN;'.
            reference = value[1:-1]
        try:
            if reference[1:2] in ('x', 'X'):
                return to_char(int(reference[2:], 16))
            return to_char(int(reference[1:]))
        except (ValueError, OverflowError):
            return match.group(0)  # not a valid code point: keep the text

    result = html

    # Convert common HTML elements to their text equivalent
    result = re_newline_tags.sub('\n', result)
    result = re_listing_tags.sub('\n * ', result)
    result = re.sub(r'<[Pp]>', '\n\n', result)

    # Remove all HTML/XML tags from the string
    result = re_strip_tags.sub('', result)

    # Decode numeric and named references in one pass so that the output
    # of one substitution is never decoded a second time.
    result = re_entities.sub(decode_entity, result)

    # Convert more than two newlines to two newlines
    result = re.sub(r'([\r\n]{2})([\r\n])+', r'\1', result)

    # NOTE(review): the text handed to mark_safe contains decoded
    # references, so "&lt;script&gt;" in the input ends up as a literal
    # "<script>" here.  Assuming mark_safe is Django's, that string would
    # bypass template auto-escaping -- confirm callers never render this
    # value as HTML, or drop mark_safe.
    return mark_safe(result.strip())
Exemple #2
0
 def fixup(m):
     """
     Regex substitution callback that turns one HTML token into text.

     m is a match whose full text is either a tag ("<...>"), a numeric
     character reference ("&#NNN;" / "&#xHHH;") or a named entity
     ("&name;").  Tags are replaced by the empty string, references and
     entities by the character they denote.  Anything that cannot be
     decoded is returned unchanged.
     """
     # unichr only exists on Python 2; chr is its Python 3 equivalent.
     try:
         to_char = unichr
     except NameError:
         to_char = chr

     text = m.group(0)
     if text[:1] == "<":
         return ""  # ignore tags
     if text[:2] == "&#":
         try:
             if text[:3] in ("&#x", "&#X"):
                 return to_char(int(text[3:-1], 16))
             return to_char(int(text[2:-1]))
         except (ValueError, OverflowError):
             pass  # malformed or out-of-range reference: fall through
     elif text[:1] == "&":
         try:
             from htmlentitydefs import entitydefs  # Python 2
         except ImportError:
             from html.entities import entitydefs  # Python 3
         entity = entitydefs.get(text[1:-1])
         if entity:
             if isinstance(entity, bytes):
                 # Python 2 stores the replacement text as Latin-1 bytes;
                 # on Python 3 it is already text and must not be decoded.
                 entity = entity.decode("iso-8859-1")
             if entity[:2] == "&#":
                 # The table value is itself a numeric reference.
                 try:
                     return to_char(int(entity[2:-1]))
                 except (ValueError, OverflowError):
                     pass
             else:
                 return entity
     return text  # leave as is
Exemple #3
0
 def handle_entityref(self, name):
     """
     Record the text for the entity reference called name in self.fed.

     A name found in entitydefs contributes a single space; any other
     name is recorded as "&" followed by the name.
     """
     is_known = entitydefs.get(name) is not None
     self.fed.append(' ' if is_known else '&{}'.format(name))
Exemple #4
0
 def handle_entityref(self, name):
     """
     Record the text for the entity reference called name in self.fed.

     A name found in entitydefs contributes a single space.  For any
     other name the reference is re-read from the raw input at the
     current line/offset and recorded as written; if it cannot be
     re-read an empty string is recorded.
     """
     if entitydefs.get(name) is not None:
         # Known entity: replaced by a space (no debug output).
         self.fed.append(" ")
         return

     current_line = self.rawdata.splitlines()[self.lineno - 1]
     m = self.entityref.match(current_line[self.offset:])
     if m is None:
         self.fed.append("")
         return

     entity = m.group()
     # semicolon is consumed, other chars are not: drop a trailing
     # character that is not the terminating ";".
     if entity[-1] != ";":
         entity = entity[:-1]
     self.fed.append(entity)
Exemple #5
0
 def handle_entityref(self, name):
     """
     Record the text for the entity reference called name in self.fed.

     A name found in entitydefs contributes a single space.  For any
     other name the reference is re-read from the raw input at the
     current line/offset and recorded as written; if it cannot be
     re-read an empty string is recorded.
     """
     if entitydefs.get(name) is not None:
         self.fed.append(' ')
         return

     m = self.entityref.match(
         self.rawdata.splitlines()[self.lineno - 1][self.offset:])
     if m is None:
         # The pattern did not match at this position (for example the
         # reference is the very last thing in the input), so there is
         # nothing to copy.
         self.fed.append('')
         return

     entity = m.group()
     # semicolon is consumed, other chars are not: drop a trailing
     # character that is not the terminating ';'.
     if entity[-1] != ';':
         entity = entity[:-1]
     self.fed.append(entity)
Exemple #6
0
def remove_html_tags(html):
    """
    Remove HTML tags from a string and replace numeric and
    named entities with the corresponding character, so the
    HTML text can be displayed in a simple text view.

    Decimal (&#65;), hexadecimal (&#x41;) and named (&amp;) character
    references are decoded in a single pass; anything that cannot be
    decoded (unknown name, invalid code point, a bare "&" in running
    text) is left exactly as it was.

    Returns None if html is None, otherwise the converted text with
    leading and trailing whitespace stripped.
    """
    if html is None:
        return None

    # unichr only exists on Python 2; chr is its Python 3 equivalent.
    try:
        to_char = unichr
    except NameError:
        to_char = chr

    # If we would want more speed, we could make these global
    re_strip_tags = re.compile(r'<[^>]*>')
    re_entities = re.compile(
        r'&(#[0-9]+|#[xX][0-9a-fA-F]+|[A-Za-z][A-Za-z0-9]*);')

    def decode_entity(match):
        """Return the character for one matched reference, or the match itself."""
        reference = match.group(1)
        if not reference.startswith('#'):
            value = entitydefs.get(reference)
            if value is None:
                return match.group(0)  # unknown name: keep the text
            if isinstance(value, bytes):
                # The Python 2 entity table holds Latin-1 byte strings.
                value = value.decode('iso-8859-1')
            if not (value.startswith('&#') and value.endswith(';')):
                return value
            # The Python 2 table spells non-Latin-1 characters as '&#NNNN;'.
            reference = value[1:-1]
        try:
            if reference[1:2] in ('x', 'X'):
                return to_char(int(reference[2:], 16))
            return to_char(int(reference[1:]))
        except (ValueError, OverflowError):
            return match.group(0)  # not a valid code point: keep the text

    # Remove all HTML/XML tags from the string
    result = re_strip_tags.sub('', html)

    # Decode numeric and named references in one pass so that the output
    # of one substitution is never decoded a second time.
    result = re_entities.sub(decode_entity, result)

    return result.strip()
Exemple #7
0
def remove_html_tags(html):
    """
    Remove HTML tags from a string and replace numeric and
    named entities with the corresponding character, so the
    HTML text can be displayed in a simple text view.

    <br>, <ul>, </ul> and </li> become a newline, <li> starts a " * "
    bullet on a new line and <p> becomes a blank line; every other tag
    is dropped.  Decimal (&#65;), hexadecimal (&#x41;) and named (&amp;)
    character references are decoded in a single pass; anything that
    cannot be decoded (unknown name, invalid code point, a bare "&" in
    running text) is left exactly as it was.

    Returns None if html is None, otherwise the converted text with
    leading and trailing whitespace stripped.
    """
    if html is None:
        return None

    # unichr only exists on Python 2; chr is its Python 3 equivalent.
    try:
        to_char = unichr
    except NameError:
        to_char = chr

    # If we would want more speed, we could make these global
    re_strip_tags = re.compile(r'<[^>]*>')
    re_entities = re.compile(
        r'&(#[0-9]+|#[xX][0-9a-fA-F]+|[A-Za-z][A-Za-z0-9]*);')
    re_newline_tags = re.compile(r'(<br[^>]*>|<[/]?ul[^>]*>|</li>)', re.I)
    re_listing_tags = re.compile(r'<li[^>]*>', re.I)

    def decode_entity(match):
        """Return the character for one matched reference, or the match itself."""
        reference = match.group(1)
        if not reference.startswith('#'):
            value = entitydefs.get(reference)
            if value is None:
                return match.group(0)  # unknown name: keep the text
            if isinstance(value, bytes):
                # The Python 2 entity table holds Latin-1 byte strings.
                value = value.decode('iso-8859-1')
            if not (value.startswith('&#') and value.endswith(';')):
                return value
            # The Python 2 table spells non-Latin-1 characters as '&#NNNN;'.
            reference = value[1:-1]
        try:
            if reference[1:2] in ('x', 'X'):
                return to_char(int(reference[2:], 16))
            return to_char(int(reference[1:]))
        except (ValueError, OverflowError):
            return match.group(0)  # not a valid code point: keep the text

    result = html

    # Convert common HTML elements to their text equivalent
    result = re_newline_tags.sub('\n', result)
    result = re_listing_tags.sub('\n * ', result)
    result = re.sub(r'<[Pp]>', '\n\n', result)

    # Remove all HTML/XML tags from the string
    result = re_strip_tags.sub('', result)

    # Decode numeric and named references in one pass so that the output
    # of one substitution is never decoded a second time.
    result = re_entities.sub(decode_entity, result)

    # Convert more than two newlines to two newlines
    result = re.sub(r'([\r\n]{2})([\r\n])+', r'\1', result)

    return result.strip()
Exemple #8
0
def remove_html_tags(html):
    """
    Remove HTML tags from a string and replace numeric and
    named entities with the corresponding character, so the
    HTML text can be displayed in a simple text view.

    <br>, <ul>, </ul> and </li> become a newline, <li> starts a " * "
    bullet on a new line and <p> becomes a blank line; every other tag
    is dropped.  Decimal (&#65;), hexadecimal (&#x41;) and named (&amp;)
    character references are decoded in a single pass; anything that
    cannot be decoded (unknown name, invalid code point, a bare "&" in
    running text) is left exactly as it was.

    Returns None if html is None, otherwise the converted text with
    leading and trailing whitespace stripped.
    """
    if html is None:
        return None

    # unichr only exists on Python 2; chr is its Python 3 equivalent.
    try:
        to_char = unichr
    except NameError:
        to_char = chr

    # If we would want more speed, we could make these global
    re_strip_tags = re.compile(r'<[^>]*>')
    re_entities = re.compile(
        r'&(#[0-9]+|#[xX][0-9a-fA-F]+|[A-Za-z][A-Za-z0-9]*);')
    re_newline_tags = re.compile(r'(<br[^>]*>|<[/]?ul[^>]*>|</li>)', re.I)
    re_listing_tags = re.compile(r'<li[^>]*>', re.I)

    def decode_entity(match):
        """Return the character for one matched reference, or the match itself."""
        reference = match.group(1)
        if not reference.startswith('#'):
            value = entitydefs.get(reference)
            if value is None:
                return match.group(0)  # unknown name: keep the text
            if isinstance(value, bytes):
                # The Python 2 entity table holds Latin-1 byte strings.
                value = value.decode('iso-8859-1')
            if not (value.startswith('&#') and value.endswith(';')):
                return value
            # The Python 2 table spells non-Latin-1 characters as '&#NNNN;'.
            reference = value[1:-1]
        try:
            if reference[1:2] in ('x', 'X'):
                return to_char(int(reference[2:], 16))
            return to_char(int(reference[1:]))
        except (ValueError, OverflowError):
            return match.group(0)  # not a valid code point: keep the text

    result = html

    # Convert common HTML elements to their text equivalent
    result = re_newline_tags.sub('\n', result)
    result = re_listing_tags.sub('\n * ', result)
    result = re.sub(r'<[Pp]>', '\n\n', result)

    # Remove all HTML/XML tags from the string
    result = re_strip_tags.sub('', result)

    # Decode numeric and named references in one pass so that the output
    # of one substitution is never decoded a second time.
    result = re_entities.sub(decode_entity, result)

    # Convert more than two newlines to two newlines
    result = re.sub(r'([\r\n]{2})([\r\n])+', r'\1', result)

    return result.strip()
Exemple #9
0
 def handle_entityref(self, name):
     """
     Report a "wrongEntity" input error at the current position when
     name has no (non-empty) replacement in entitydefs.
     """
     replacement = entitydefs.get(name)
     if replacement:
         return
     self.errInput(self.getpos(), "wrongEntity")
Exemple #10
0
 def handle_entityref(self, name):
     """Return the entitydefs replacement text for name, or '' if there is none."""
     try:
         return entitydefs[name]
     except KeyError:
         return ''
Exemple #11
0
 def handle_entityref(self, name):
     """Look name up in entitydefs; unknown names yield an empty string."""
     if name in entitydefs:
         return entitydefs[name]
     return ''
def _fix_entities(s):
    if not s: return ""
    return _entities_re.sub(lambda m: entitydefs.get(m.group(1), m.group(1)), s)
Exemple #13
0
	def handle_entityref(self, name):
		"""
		Flag entity references that entitydefs cannot resolve by reporting
		a "wrongEntity" input error at the parser's current position.
		"""
		if entitydefs.get(name):
			return None
		self.errInput(self.getpos(), "wrongEntity")
 def handle_entityref(self, name):
     """
     Pass the entitydefs replacement for name (or '' when the name is
     unknown) to self.builder.text().
     """
     replacement = entitydefs.get(name, '')
     self.builder.text(replacement)
Exemple #15
0
 def handle_charref(self, name):
     """
     Return the character denoted by the character reference name.

     A name present in entitydefs returns that table's value, as
     before.  Otherwise name is treated as the body of a numeric
     reference -- decimal ("65") or hexadecimal with a leading x
     ("x41") -- and the corresponding character is returned.  Names
     that are neither yield "".

     NOTE(review): this assumes the method is the HTMLParser-style
     charref hook, which receives the text between "&#" and ";".
     Such names are never keys of the named-entity table, so the plain
     lookup always produced "" -- confirm against the enclosing class.
     """
     replacement = entitydefs.get(name)
     if replacement is not None:
         return replacement

     # unichr only exists on Python 2; chr is its Python 3 equivalent.
     try:
         to_char = unichr
     except NameError:
         to_char = chr

     try:
         if name[:1] in ("x", "X"):
             return to_char(int(name[1:], 16))
         return to_char(int(name))
     except (TypeError, ValueError, OverflowError):
         # Not a number, or not a valid code point.
         return ""
Exemple #16
0
 def handle_entityref(self, name):
     """
     Validate the entity reference called name.

     Reports "wrongEntity" when entitydefs has no (non-empty) value for
     the name.  In addition, when self.account is "Samsung", reports
     "samsungUnwantedQuote" once if name is contained in any item of
     self.samsungQuoteList.
     """
     known = entitydefs.get(name)
     if not known:
         self.errInput(self.getpos(), "wrongEntity")

     if self.account != "Samsung":
         return

     for quote in self.samsungQuoteList:
         if name in quote:
             self.errInput(self.getpos(), "samsungUnwantedQuote")
             break