Example #1
0
def _replace(match_obj):
    """Return the text that one matched character reference stands for.

    Groups 1, 3 and 5 of ``match_obj`` are each passed through
    ``_translate`` and then treated, in that order of precedence, as a
    decimal code point, a hexadecimal code point and an entity name.

    A name is looked up in ``name2txt`` as-is and, failing that, with a
    trailing ``;``.  Unknown names, and numeric references that do not
    denote a valid code point, yield the matched text unchanged.

    Raises:
        AssertionError: if all three groups translate to ``None``.
    """
    original_str = match_obj.group(0)
    digits, xdigits, name = map(_translate, match_obj.group(1, 3, 5))

    if digits is not None or xdigits is not None:
        try:
            if digits is not None:
                return chr(int(digits))
            return chr(int(xdigits, base=16))
        except (ValueError, OverflowError):
            # Not a number, or outside the range chr() accepts: keep the
            # reference exactly as it appeared in the input.
            return original_str

    if name is not None:
        txt = name2txt.get(name)
        if txt is None:
            txt = name2txt.get(name + ';', original_str)
        return txt

    # None of the three groups produced a value; fail with an explicit,
    # descriptive error instead of an accidental NameError.
    raise AssertionError(
        "no character-reference group matched in %r" % (original_str,)
    )
Example #2
0
def remove_html_tags(html):
    """Strip HTML markup from ``html`` and return it as plain text.

    Line-break and list tags become newlines (list items get a `` * ``
    bullet), ``<p>`` becomes a blank line, every remaining tag is removed,
    decimal numeric entities of 2-4 digits and named entities are replaced
    by their characters, and runs of three or more line breaks are cut
    down to the first two.  Named entities missing from ``entitydefs`` are
    dropped.

    Returns the stripped text passed through ``mark_safe``.
    """
    # If we would want more speed, we could make these global
    re_strip_tags = re.compile('<[^>]*>')
    re_unicode_entities = re.compile(r'&#(\d{2,4});')
    # Lazy quantifier: a greedy one would swallow adjacent entities such as
    # "&amp;&amp;" as a single (unknown) name and delete them.
    re_html_entities = re.compile('&(.{2,8}?);')
    re_newline_tags = re.compile('(<br[^>]*>|<[/]?ul[^>]*>|</li>)', re.I)
    re_listing_tags = re.compile('<li[^>]*>', re.I)

    def decode_named_entity(match):
        # Values may be ISO-8859-1 bytes or already-decoded text;
        # str(text, encoding) raises TypeError for the latter, including
        # the '' default used for unknown names.
        value = entitydefs.get(match.group(1), '')
        if isinstance(value, bytes):
            return value.decode('iso-8859-1')
        return value

    result = html

    # Convert common HTML elements to their text equivalent
    result = re_newline_tags.sub('\n', result)
    result = re_listing_tags.sub('\n * ', result)
    result = re.sub('<[Pp]>', '\n\n', result)

    # Remove all HTML/XML tags from the string
    result = re_strip_tags.sub('', result)

    # Convert numeric XML entities to their unicode character
    result = re_unicode_entities.sub(lambda x: chr(int(x.group(1))), result)

    # Convert named HTML entities to their unicode character
    result = re_html_entities.sub(decode_named_entity, result)

    # Convert more than two newlines to two newlines
    result = re.sub('([\r\n]{2})([\r\n])+', '\\1', result)

    # NOTE(review): entities are decoded after tags are stripped, so input
    # like "&lt;script&gt;" ends up as a literal "<script>" in the result.
    # If mark_safe exempts the text from later HTML escaping, confirm that
    # this is acceptable for untrusted input.
    return mark_safe(result.strip())
Example #3
0
def remove_html_tags(html):
    """Strip HTML markup from ``html`` and return it as plain text.

    Line-break and list tags become newlines (list items get a `` * ``
    bullet), ``<p>`` becomes a blank line, every remaining tag is removed,
    decimal numeric entities of 2-4 digits and named entities are replaced
    by their characters, and runs of three or more line breaks are cut
    down to the first two.  Named entities missing from ``entitydefs`` are
    dropped.

    Returns the stripped text passed through ``mark_safe``.
    """
    # If we would want more speed, we could make these global
    re_strip_tags = re.compile('<[^>]*>')
    # Raw string: "\d" in a plain literal is an invalid escape sequence.
    re_unicode_entities = re.compile(r'&#(\d{2,4});')
    # Lazy quantifier: a greedy one would swallow adjacent entities such as
    # "&amp;&amp;" as a single (unknown) name and delete them.
    re_html_entities = re.compile('&(.{2,8}?);')
    re_newline_tags = re.compile('(<br[^>]*>|<[/]?ul[^>]*>|</li>)', re.I)
    re_listing_tags = re.compile('<li[^>]*>', re.I)

    def decode_named_entity(match):
        # Values may be ISO-8859-1 bytes or already-decoded text;
        # str(text, encoding) raises TypeError for the latter, including
        # the '' default used for unknown names.
        value = entitydefs.get(match.group(1), '')
        if isinstance(value, bytes):
            return value.decode('iso-8859-1')
        return value

    result = html

    # Convert common HTML elements to their text equivalent
    result = re_newline_tags.sub('\n', result)
    result = re_listing_tags.sub('\n * ', result)
    result = re.sub('<[Pp]>', '\n\n', result)

    # Remove all HTML/XML tags from the string
    result = re_strip_tags.sub('', result)

    # Convert numeric XML entities to their unicode character
    result = re_unicode_entities.sub(lambda x: chr(int(x.group(1))), result)

    # Convert named HTML entities to their unicode character
    result = re_html_entities.sub(decode_named_entity, result)

    # Convert more than two newlines to two newlines
    result = re.sub('([\r\n]{2})([\r\n])+', '\\1', result)

    # NOTE(review): entities are decoded after tags are stripped, so input
    # like "&lt;script&gt;" ends up as a literal "<script>" in the result.
    # If mark_safe exempts the text from later HTML escaping, confirm that
    # this is acceptable for untrusted input.
    return mark_safe(result.strip())
Example #4
0
    def skippedEntity(self, name):
        """Forward the replacement text of entity ``name`` to ``characters``.

        The replacement is looked up in ``html_entity_defs`` and passed,
        converted with ``str()``, to ``self.characters``; whatever that
        call returns is returned.

        Raises:
            RuntimeError: if ``name`` has no (or an empty) replacement.
        """
        # Encoding?
        replacement = html_entity_defs.get(name)
        if replacement:
            return self.characters(str(replacement))

        raise RuntimeError("Unknown HTML entity &%s;" % name)
Example #5
0
    def fixup(m):
        """Return the plain-text replacement for one matched tag or entity.

        ``m`` is a regex match whose full text is either a tag (starts
        with ``<``), a numeric character reference (``&#NN;`` / ``&#xHH;``)
        or a named entity (``&name;``).  Tags are replaced by the empty
        string and references by the character they denote.  Anything that
        cannot be decoded (malformed or out-of-range number, unknown name)
        is returned unchanged.
        """
        text = m.group(0)

        if text.startswith('<'):
            return ''  # ignore tags

        if text.startswith('&#'):
            try:
                if text.startswith('&#x'):
                    return chr(int(text[3:-1], 16))
                return chr(int(text[2:-1]))
            except (ValueError, OverflowError):
                # chr() raises OverflowError rather than ValueError for
                # very large numbers; treat both as "not decodable".
                pass
        elif text.startswith('&'):
            entity = htmlentitydefs.get(text[1:-1])

            if entity:
                if entity.startswith('&#'):  # TODO: test this case
                    # The table itself stores a numeric reference.
                    try:
                        return chr(int(entity[2:-1]))
                    except (ValueError, OverflowError):
                        pass
                else:
                    return entity  # TODO: encode ?

        return text  # Leave as is
Example #6
0
    def fix_up(m):
        """Return the plain-text replacement for one matched tag or entity.

        ``m`` is a regex match whose full text is either a tag (starts
        with ``<``), a numeric character reference (``&#NN;`` / ``&#xHH;``)
        or a named entity (``&name;``).  Tags are replaced by the empty
        string, numeric references by the character they denote and named
        entities by their ``html_entities`` value.  Anything that cannot
        be decoded (malformed or out-of-range number, unknown name) is
        returned unchanged.
        """
        sub_text = m.group(0)

        if sub_text.startswith('<'):
            return ''  # ignore tags

        if sub_text.startswith('&#'):
            try:
                if sub_text.startswith('&#x'):
                    return chr(int(sub_text[3:-1], 16))
                return chr(int(sub_text[2:-1]))
            except (ValueError, OverflowError):
                # chr() raises OverflowError rather than ValueError for
                # very large numbers; treat both as "not decodable".
                pass
        elif sub_text.startswith('&'):
            entity = html_entities.get(sub_text[1:-1])

            if entity:
                return entity  # TODO: encode ?

        return sub_text  # Leave as is
Example #7
0
    def convert_entity(m):
        """Return the character for one matched entity reference.

        When group 1 of ``m`` is non-empty the reference is numeric and
        group 2 is read as a decimal code point; otherwise group 2 is
        looked up as a name in ``entitydefs``.  References that cannot be
        decoded (non-numeric or out-of-range number, unknown name) are
        returned exactly as matched.
        """
        if m.group(1):
            try:
                return chr(int(m.group(2)))
            except (ValueError, OverflowError):
                # chr() raises OverflowError rather than ValueError for
                # very large numbers; both mean "leave the text alone".
                return m.group(0)

        return entitydefs.get(m.group(2), m.group(0))
Example #8
0
    def process(self, html):
        """
        Remove HTML tags from a string and replace numeric and
        named entities with the corresponding character, so the
        HTML text can be displayed in a simple text view.

        Line-break and list tags become newlines (list items get a
        " * " bullet), <p> becomes a blank line, and runs of three or
        more line breaks are cut down to the first two. Named entities
        missing from entitydefs are dropped.

        Returns None when html is None, otherwise the stripped text.
        """
        if html is None:
            return None

        # If we would want more speed, we could make these global
        re_strip_tags = re.compile('<[^>]*>')
        # Raw string: "\d" in a plain literal is an invalid escape sequence.
        re_unicode_entities = re.compile(r'&#(\d{2,4});')
        # Lazy quantifier: a greedy one would swallow adjacent entities such
        # as "&amp;&amp;" as a single (unknown) name and delete them.
        re_html_entities = re.compile('&(.{2,8}?);')
        re_newline_tags = re.compile('(<br[^>]*>|<[/]?ul[^>]*>|</li>)', re.I)
        re_listing_tags = re.compile('<li[^>]*>', re.I)

        def decode_named_entity(match):
            # Values may be ISO-8859-1 bytes or already-decoded text;
            # str(text, encoding) raises TypeError for the latter, including
            # the '' default used for unknown names.
            value = entitydefs.get(match.group(1), '')
            if isinstance(value, bytes):
                return value.decode('iso-8859-1')
            return value

        result = html

        # Convert common HTML elements to their text equivalent
        result = re_newline_tags.sub('\n', result)
        result = re_listing_tags.sub('\n * ', result)
        result = re.sub('<[Pp]>', '\n\n', result)

        # Remove all HTML/XML tags from the string
        result = re_strip_tags.sub('', result)
        # Convert numeric XML entities to their unicode character
        result = re_unicode_entities.sub(
            lambda x: chr(int(x.group(1))),
            result)

        # Convert named HTML entities to their unicode character
        result = re_html_entities.sub(decode_named_entity, result)

        # Convert more than two newlines to two newlines
        result = re.sub('([\r\n]{2})([\r\n])+', '\\1', result)

        return result.strip()
Example #9
0
    def process(self, html):
        """
        Remove HTML tags from a string and replace numeric and
        named entities with the corresponding character, so the
        HTML text can be displayed in a simple text view.

        Line-break and list tags become newlines (list items get a
        " * " bullet), <p> becomes a blank line, and runs of three or
        more line breaks are cut down to the first two. Named entities
        missing from entitydefs are dropped.

        Returns None when html is None, otherwise the stripped text.
        """
        if html is None:
            return None

        # If we would want more speed, we could make these global
        re_strip_tags = re.compile('<[^>]*>')
        # Raw string: "\d" in a plain literal is an invalid escape sequence.
        re_unicode_entities = re.compile(r'&#(\d{2,4});')
        # Lazy quantifier: a greedy one would swallow adjacent entities such
        # as "&amp;&amp;" as a single (unknown) name and delete them.
        re_html_entities = re.compile('&(.{2,8}?);')
        re_newline_tags = re.compile('(<br[^>]*>|<[/]?ul[^>]*>|</li>)', re.I)
        re_listing_tags = re.compile('<li[^>]*>', re.I)

        def decode_named_entity(match):
            # Values may be ISO-8859-1 bytes or already-decoded text;
            # str(text, encoding) raises TypeError for the latter, including
            # the '' default used for unknown names.
            value = entitydefs.get(match.group(1), '')
            if isinstance(value, bytes):
                return value.decode('iso-8859-1')
            return value

        result = html

        # Convert common HTML elements to their text equivalent
        result = re_newline_tags.sub('\n', result)
        result = re_listing_tags.sub('\n * ', result)
        result = re.sub('<[Pp]>', '\n\n', result)

        # Remove all HTML/XML tags from the string
        result = re_strip_tags.sub('', result)
        # Convert numeric XML entities to their unicode character
        result = re_unicode_entities.sub(lambda x: chr(int(x.group(1))),
                                         result)

        # Convert named HTML entities to their unicode character
        result = re_html_entities.sub(decode_named_entity, result)

        # Convert more than two newlines to two newlines
        result = re.sub('([\r\n]{2})([\r\n])+', '\\1', result)

        return result.strip()