Example #1
    def clean(self, text):
        """Cleans text and returns sanitized result as unicode

        :arg str text: text to be cleaned

        :returns: sanitized text as unicode

        """
        if not text:
            return u''

        text = force_unicode(text)

        dom = self.parser.parseFragment(text)
        filtered = BleachSanitizerFilter(
            source=self.walker(dom),

            # Bleach-sanitizer-specific things
            attributes=self.attributes,
            strip_disallowed_elements=self.strip,
            strip_html_comments=self.strip_comments,

            # html5lib-sanitizer things
            allowed_elements=self.tags,
            allowed_css_properties=self.styles,
            allowed_protocols=self.protocols,
            allowed_svg_properties=[],
        )

        # Apply any filters after the BleachSanitizerFilter
        for filter_class in self.filters:
            filtered = filter_class(source=filtered)

        return self.serializer.render(filtered)
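A minimal usage sketch for the method above, assuming it is bleach 2.x's bleach.sanitizer.Cleaner.clean() (the constructor arguments mirror the attributes the method reads):

    import bleach

    # Build the Cleaner once and reuse it; the parser, walker and serializer
    # are created in __init__, so repeated clean() calls stay cheap.
    cleaner = bleach.sanitizer.Cleaner(tags=['b', 'i', 'a'], strip=True)
    cleaner.clean(u'<b>safe</b> <script>alert(1)</script>')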
Example #2
    def linkify(self, text):
        """Linkify specified text

        :arg str text: the text to add links to

        :returns: linkified text as unicode

        """
        if isinstance(text, six.string_types):

            text = force_unicode(text)

            if not text:
                return u''

            dom = self.parser.parseFragment(text)
            filtered = LinkifyFilter(
                source=self.walker(dom),
                callbacks=self.callbacks,
                skip_tags=self.skip_tags,
                parse_email=self.parse_email,
                url_re=self.url_re,
                email_re=self.email_re,
            )
            return self.serializer.render(filtered)

        raise TypeError('argument must be of text type')
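A usage sketch, assuming this is bleach.linkifier.Linker.linkify() from bleach 2.x; the Linker constructor accepts the callbacks, skip_tags, parse_email, url_re and email_re values the method reads from self:

    from bleach.linkifier import Linker

    linker = Linker(parse_email=True)
    linker.linkify(u'mail me at jane@example.com')
    # roughly: u'mail me at <a href="mailto:jane@example.com">jane@example.com</a>'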
Example #3
    def clean(self, text):
        """Cleans text and returns sanitized result as unicode

        :arg str text: text to be cleaned

        :returns: sanitized text as unicode

        """
        if not text:
            return u''

        text = force_unicode(text)

        dom = self.parser.parseFragment(text)
        filtered = BkBleachSanitizerFilter(
            source=self.walker(dom),

            # Bleach-sanitizer-specific things
            attributes=self.attributes,
            strip_disallowed_elements=self.strip,
            strip_html_comments=self.strip_comments,

            # html5lib-sanitizer things
            allowed_elements=self.tags,
            allowed_css_properties=self.styles,
            allowed_protocols=self.protocols,
            allowed_svg_properties=[],
        )

        # Apply any filters after the BleachSanitizerFilter
        for filter_class in self.filters:
            filtered = filter_class(source=filtered)

        return self.serializer.render(filtered)
Example #4
def html_check_and_safe(value):

    tags = bleach.ALLOWED_TAGS + [
        'div', 'br', 'font', 'p', 'table', 'tr', 'td', 'th', 'img', 'u',
        'span', 'tbody', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'hr'
    ]
    attrs = {
        '*': [
            'class', 'style', 'color', 'align', 'title', 'data-toggle',
            'data-placement'
        ],
        'a': ['href', 'rel'],
        'img': ['src', 'alt'],
    }
    style = ['line-height', 'background-color', 'font-size', 'margin-top']

    text = force_unicode(value)

    class s(BleachSanitizer):
        allowed_elements = tags
        allowed_attributes = attrs
        allowed_css_properties = style
        strip_disallowed_elements = True
        strip_html_comments = True
        allowed_protocols = ['http', 'https', 'data']

    parser = html5lib.HTMLParser(tokenizer=s)

    return mark_safe(bleach._render(parser.parseFragment(text)))
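A hedged sketch of how a helper like this is typically exposed to Django templates; the template-filter registration below is the standard Django pattern, not part of the original project:

    from django import template

    register = template.Library()

    @register.filter
    def sanitized_html(value):
        # html_check_and_safe() already wraps its result in mark_safe().
        return html_check_and_safe(value)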
Example #5
File: forms.py, Project: orlenko/plei
 def render(self, name, value, attrs=None):
     attrs = attrs or {}
     final_attrs = self.build_attrs(attrs, name=name)
     return mark_safe(render_to_string('ckeditor/widget.html', {
         'final_attrs': flatatt(final_attrs),
         'value': conditional_escape(force_unicode(value)),
         'id': final_attrs['id']
     }))
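A sketch of the widget in use on a form; CKEditorWidget is an assumed name standing in for whatever class defines the render() method above:

    from django import forms

    class ArticleForm(forms.Form):
        # render() above emits ckeditor/widget.html with the escaped value.
        body = forms.CharField(widget=CKEditorWidget())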
Example #6
    def handle_email_addresses(self, src_iter):
        """Handle email addresses in character tokens"""
        for token in src_iter:
            if token['type'] == 'Characters':
                text = token['data']
                new_tokens = []
                end = 0

                # For each email address we find in the text
                for match in self.email_re.finditer(text):
                    if match.start() > end:
                        new_tokens.append(
                            {u'type': u'Characters', u'data': text[end:match.start()]}
                        )

                    # Run attributes through the callbacks to see what we
                    # should do with this match
                    attrs = {
                        (None, u'href'): u'mailto:%s' % match.group(0),
                        u'_text': match.group(0)
                    }
                    attrs = self.apply_callbacks(attrs, True)

                    if attrs is None:
                        # Just add the text--but not as a link
                        new_tokens.append(
                            {u'type': u'Characters', u'data': match.group(0)}
                        )

                    else:
                        # Add an "a" tag for the new link
                        _text = attrs.pop(u'_text', '')
                        attrs = alphabetize_attributes(attrs)
                        new_tokens.extend([
                            {u'type': u'StartTag', u'name': u'a', u'data': attrs},
                            {u'type': u'Characters', u'data': force_unicode(_text)},
                            {u'type': u'EndTag', u'name': 'a'}
                        ])
                    end = match.end()

                if new_tokens:
                    # Yield the adjusted set of tokens and then continue
                    # through the loop
                    if end < len(text):
                        new_tokens.append({u'type': u'Characters', u'data': text[end:]})

                    for new_token in new_tokens:
                        yield new_token

                    continue

            yield token
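To make the rewriting concrete, a hedged sketch of what this generator yields for a single Characters token containing one address (token shapes follow the html5lib tree-walker format used above):

    tokens = [{u'type': u'Characters', u'data': u'mail jane@example.com now'}]
    # With callbacks that keep the link, handle_email_addresses(iter(tokens))
    # would yield roughly:
    #   {u'type': u'Characters', u'data': u'mail '}
    #   {u'type': u'StartTag', u'name': u'a',
    #    u'data': {(None, u'href'): u'mailto:jane@example.com'}}
    #   {u'type': u'Characters', u'data': u'jane@example.com'}
    #   {u'type': u'EndTag', u'name': u'a'}
    #   {u'type': u'Characters', u'data': u' now'}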
Example #7
def clean(text, tags=ALLOWED_TAGS, attributes=ALLOWED_ATTRIBUTES,
          styles=ALLOWED_STYLES, strip=False, strip_comments=True):
    """Clean an HTML fragment and return it"""
    if not text:
        return ''

    text = force_unicode(text)

    class s(BleachSanitizer):
        allowed_elements = tags
        allowed_attributes = attributes
        allowed_css_properties = styles
        strip_disallowed_elements = strip
        strip_html_comments = strip_comments

    parser = html5lib.HTMLParser(tokenizer=s)

    return _render(parser.parseFragment(text))
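Typical calls against this older, tokenizer-based API:

    clean(u'<b>bold</b> <script>evil()</script>', tags=['b'], strip=True)
    # roughly u'<b>bold</b> evil()': the disallowed tag is stripped but its
    # text is kept; with strip=False the tag would be escaped instead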
Example #8
    def handle_a_tag(self, token_buffer):
        """Handle the "a" tag

        This could adjust the link or drop it altogether depending on what the
        callbacks return.

        This yields the new set of tokens.

        """
        a_token = token_buffer[0]
        if a_token['data']:
            attrs = a_token['data']
        else:
            attrs = {}
        text = self.extract_character_data(token_buffer)
        attrs['_text'] = text

        attrs = self.apply_callbacks(attrs, False)

        if attrs is None:
            # We're dropping the "a" tag and everything else and replacing
            # it with character data. So emit that token.
            yield {'type': 'Characters', 'data': text}

        else:
            new_text = attrs.pop('_text', '')
            a_token['data'] = alphabetize_attributes(attrs)

            if text == new_text:
                # The callbacks didn't change the text, so we yield the new "a"
                # token, then whatever else was there, then the end "a" token
                yield a_token
                for mem in token_buffer[1:]:
                    yield mem

            else:
                # If the callbacks changed the text, then we're going to drop
                # all the tokens between the start and end "a" tags and replace
                # it with the new text
                yield a_token
                yield {'type': 'Characters', 'data': force_unicode(new_text)}
                yield token_buffer[-1]
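For reference, a sketch of the kind of callback apply_callbacks() runs: returning None drops the link (the Characters branch above), returning the attrs dict keeps it. Keys are (namespace, name) tuples in this API; the host check below is purely illustrative:

    def target_blank(attrs, new=False):
        # Hypothetical policy: drop brand-new links to one host, keep the rest.
        if new and u'blocked.example' in attrs.get((None, u'href'), u''):
            return None
        attrs[(None, u'target')] = u'_blank'
        return attrs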
Example #9
    def linkify(self, text):
        """Linkify specified text

        :arg str text: the text to add links to

        :returns: linkified text as unicode

        """
        text = force_unicode(text)

        if not text:
            return u''

        dom = self.parser.parseFragment(text)
        filtered = LinkifyFilter(
            source=self.walker(dom),
            callbacks=self.callbacks,
            skip_tags=self.skip_tags,
            parse_email=self.parse_email,
            url_re=self.url_re,
            email_re=self.email_re,
        )
        return self.serializer.render(filtered)
Example #10
File: __init__.py, Project: rmoorman/bleach
def clean(text,
          tags=ALLOWED_TAGS,
          attributes=ALLOWED_ATTRIBUTES,
          styles=ALLOWED_STYLES,
          protocols=ALLOWED_PROTOCOLS,
          strip=False,
          strip_comments=True):
    """Clean an HTML fragment of malicious content and return it

    This function is a security-focused function whose sole purpose is to
    remove malicious content from a string such that it can be displayed as
    content in a web page.

    This function is not designed for transforming content to be used in
    non-web-page contexts.

    :arg text: the text to clean
    :arg tags: whitelist of allowed tags; defaults to
        ``bleach.ALLOWED_TAGS``
    :arg attributes: whitelist of allowed attributes; defaults to
        ``bleach.ALLOWED_ATTRIBUTES``
    :arg styles: whitelist of allowed css; defaults to
        ``bleach.ALLOWED_STYLES``
    :arg protocols: whitelist of allowed protocols for links; defaults
        to ``bleach.ALLOWED_PROTOCOLS``
    :arg strip: whether or not to strip disallowed elements
    :arg strip_comments: whether or not to strip HTML comments

    :returns: cleaned text as unicode

    """
    if not text:
        return u''

    text = force_unicode(text)

    parser = html5lib.HTMLParser(namespaceHTMLElements=False)
    dom = parser.parseFragment(text)

    walker = html5lib.getTreeWalker('etree')
    filtered = BleachSanitizerFilter(
        source=walker(dom),

        # Bleach-sanitizer-specific things
        allowed_attributes_map=attributes,
        strip_disallowed_elements=strip,
        strip_html_comments=strip_comments,

        # html5lib-sanitizer things
        allowed_elements=tags,
        allowed_css_properties=styles,
        allowed_protocols=protocols,
        allowed_svg_properties=[],
    )
    s = HTMLSerializer(
        quote_attr_values='always',
        omit_optional_tags=False,

        # Bleach has its own sanitizer, so don't use the html5lib one
        sanitize=False,

        # Bleach sanitizer alphabetizes already, so don't use the html5lib one
        alphabetical_attributes=False,
    )
    return s.render(filtered)
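With the defaults this behaves like the documented bleach API, e.g.:

    clean(u'an <script>evil()</script> example')
    # roughly u'an &lt;script&gt;evil()&lt;/script&gt; example'
    # (strip=False escapes disallowed tags rather than removing them)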
Example #11
File: __init__.py, Project: rmoorman/bleach
def linkify(text,
            callbacks=DEFAULT_CALLBACKS,
            skip_pre=False,
            parse_email=False):
    """Convert URL-like strings in an HTML fragment to links

    ``linkify()`` converts strings that look like URLs, domain names and email
    addresses in text that may be an HTML fragment to links, while preserving:

    1. links already in the string
    2. urls found in attributes
    3. email addresses

    ``linkify()`` takes a best-effort approach and tries to recover from
    malformed or otherwise unusual text.

    """
    text = force_unicode(text)

    if not text:
        return u''

    parser = html5lib.HTMLParser()

    forest = parser.parseFragment(text)
    _seen = set()

    def replace_nodes(tree, new_frag, node, index=0):
        """Doesn't really replace nodes, but inserts the nodes contained in
        ``new_frag`` into ``tree`` at position ``index`` and returns the number
        of nodes inserted.

        If ``node`` is passed in, it is removed from the resulting tree.

        :arg tree: tree
        :arg new_frag: fragment of html text to insert
        :arg node: the node to "replace"
        :arg index: the index position to focus on

        :returns: number of nodes inserted so that you can skip ahead

        """
        count = 0
        new_tree = parser.parseFragment(new_frag)
        # capture any non-tag text at the start of the fragment
        if new_tree.text:
            if index == 0:
                tree.text = (tree.text or '') + new_tree.text
            else:
                prev = tree[index - 1]
                prev.tail = (prev.tail or '') + new_tree.text

        # then put in the tagged elements into the old tree
        for n in new_tree:
            if n.tag == ETREE_TAG('a'):
                _seen.add(n)
            tree.insert(index + count, n)
            count += 1

        # if we got a node to remove...
        if node is not None:
            # first, grab the node tail so we don't lose text
            if node.tail:
                if index + count == 0:
                    tree.text = (tree.text or '') + node.tail
                else:
                    prev = tree[index + count - 1]
                    prev.tail = (prev.tail or '') + node.tail
            tree.remove(node)
        return count

    def strip_wrapping_parentheses(fragment):
        """Strips wrapping parentheses.

        Returns a tuple of the following format::

            (string stripped from wrapping parentheses,
             count of stripped opening parentheses,
             count of stripped closing parentheses)
        """
        opening_parentheses = closing_parentheses = 0
        # Count consecutive opening parentheses
        # at the beginning of the fragment (string).
        for char in fragment:
            if char == '(':
                opening_parentheses += 1
            else:
                break

        if opening_parentheses:
            newer_frag = ''
            # Cut the consecutive opening brackets from the fragment.
            fragment = fragment[opening_parentheses:]
            # Reverse the fragment for easier detection of parentheses
            # inside the URL.
            reverse_fragment = fragment[::-1]
            skip = False
            for char in reverse_fragment:
                # Remove the closing parentheses if it has a matching
                # opening parentheses (they are balanced).
                if (char == ')' and closing_parentheses < opening_parentheses
                        and not skip):
                    closing_parentheses += 1
                    continue
                # Do not remove ')' from the URL itself.
                elif char != ')':
                    skip = True
                newer_frag += char
            fragment = newer_frag[::-1]

        return fragment, opening_parentheses, closing_parentheses

    def apply_callbacks(attrs, new):
        for cb in callbacks:
            attrs = cb(attrs, new)
            if attrs is None:
                return None
        return attrs

    def _render_inner(node):
        out = ['' if node.text is None else node.text]
        for subnode in node:
            out.append(_render(subnode))
            if subnode.tail:
                out.append(subnode.tail)
        return ''.join(out)

    def linkify_nodes(tree, parse_text=True):
        children = len(tree)
        current_child = -1
        # start at -1 to process the parent first
        while current_child < len(tree):
            if current_child < 0:
                node = tree
                if parse_text and node.text:
                    new_txt = old_txt = node.text
                    if parse_email:
                        new_txt = re.sub(email_re, email_repl, node.text)
                        if new_txt and new_txt != node.text:
                            node.text = ''
                            adj = replace_nodes(tree, new_txt, None, 0)
                            children += adj
                            current_child += adj
                            linkify_nodes(tree, True)
                            continue

                    new_txt = re.sub(url_re, link_repl, new_txt)
                    if new_txt != old_txt:
                        node.text = ''
                        adj = replace_nodes(tree, new_txt, None, 0)
                        children += adj
                        current_child += adj
                        continue
            else:
                node = tree[current_child]

            if parse_text and node.tail:
                new_tail = old_tail = node.tail
                if parse_email:
                    new_tail = re.sub(email_re, email_repl, new_tail)
                    if new_tail != node.tail:
                        node.tail = ''
                        adj = replace_nodes(tree, new_tail, None,
                                            current_child + 1)
                        # Insert the new nodes made from my tail into
                        # the tree right after me. current_child+1
                        children += adj
                        continue

                new_tail = re.sub(url_re, link_repl, new_tail)
                if new_tail != old_tail:
                    node.tail = ''
                    adj = replace_nodes(tree, new_tail, None,
                                        current_child + 1)
                    children += adj

            if node.tag == ETREE_TAG('a') and node not in _seen:
                if node.get('href') is not None:
                    attrs = dict(node.items())

                    _text = attrs['_text'] = _render_inner(node)

                    attrs = apply_callbacks(attrs, False)

                    if attrs is None:
                        # <a> tag replaced by the text within it
                        adj = replace_nodes(tree, _text, node, current_child)
                        # pull back current_child by 1 to scan the new nodes
                        # again.
                        current_child -= 1
                    else:
                        text = force_unicode(attrs.pop('_text'))
                        for attr_key, attr_val in attrs.items():
                            node.set(attr_key, attr_val)

                        for n in reversed(list(node)):
                            node.remove(n)
                        text = parser.parseFragment(text)
                        node.text = text.text
                        for n in text:
                            node.append(n)
                        _seen.add(node)

            elif current_child >= 0:
                if node.tag == ETREE_TAG('pre') and skip_pre:
                    linkify_nodes(node, False)
                elif node not in _seen:
                    linkify_nodes(node, parse_text)

            current_child += 1

    def email_repl(match):
        addr = match.group(0).replace('"', '&quot;')
        link = {
            '_text': addr,
            'href': 'mailto:{0!s}'.format(addr),
        }
        link = apply_callbacks(link, True)

        if link is None:
            return addr

        _href = link.pop('href')
        _text = link.pop('_text')

        repl = '<a href="{0!s}" {1!s}>{2!s}</a>'
        attr = '{0!s}="{1!s}"'
        attribs = ' '.join(attr.format(k, v) for k, v in link.items())
        return repl.format(_href, attribs, _text)

    def link_repl(match):
        url = match.group(0)
        open_brackets = close_brackets = 0
        if url.startswith('('):
            _wrapping = strip_wrapping_parentheses(url)
            url, open_brackets, close_brackets = _wrapping
        if url.endswith(')') and '(' not in url:
            # This is a clumsy handling for the case where we have something
            # like (foo http://example.com) and the ) gets picked up by the
            # url_re but we don't want it part of the link.
            new_url = url.rstrip(')')
            close_brackets += len(url) - len(new_url)
            url = new_url

        end = ''
        m = re.search(punct_re, url)
        if m:
            end = m.group(0)
            url = url[0:m.start()]
        if re.search(proto_re, url):
            href = url
        else:
            href = ''.join(['http://', url])

        link = {
            '_text': url,
            'href': href,
        }

        link = apply_callbacks(link, True)

        if link is None:
            return '(' * open_brackets + url + ')' * close_brackets

        _text = link.pop('_text')
        _href = link.pop('href')

        repl = '{0!s}<a href="{1!s}" {2!s}>{3!s}</a>{4!s}{5!s}'
        attr = '{0!s}="{1!s}"'
        attribs = ' '.join(attr.format(k, v) for k, v in link.items())

        return repl.format('(' * open_brackets, _href, attribs, _text, end,
                           ')' * close_brackets)

    try:
        linkify_nodes(forest)
    except RuntimeError as e:
        # If we hit the max recursion depth, just return what we've got.
        log.exception('Probable recursion error: {0!r}'.format(e))

    return _render(forest)
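A usage sketch for this module-level linkify(); note that in this etree-based version callback attrs use plain string keys such as 'href' and '_text':

    def nofollow(attrs, new=False):
        # Add rel="nofollow" to every link, new or pre-existing.
        attrs['rel'] = 'nofollow'
        return attrs

    linkify(u'visit example.com', callbacks=[nofollow])
    # roughly u'visit <a href="http://example.com" rel="nofollow">example.com</a>'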
Example #12
    def handle_links(self, src_iter):
        """Handle links in character tokens"""
        for token in src_iter:
            if token['type'] == 'Characters':
                text = token['data']
                new_tokens = []
                end = 0

                for match in self.url_re.finditer(text):
                    if match.start() > end:
                        new_tokens.append(
                            {u'type': u'Characters', u'data': text[end:match.start()]}
                        )

                    url = match.group(0)
                    prefix = suffix = ''

                    # Sometimes we pick up too much in the url match, so look for
                    # bits we should drop and remove them from the match
                    url, prefix, suffix = self.strip_non_url_bits(url)

                    # If there's no protocol, add one
                    if PROTO_RE.search(url):
                        href = url
                    else:
                        href = u'http://%s' % url

                    attrs = {
                        (None, u'href'): href,
                        u'_text': url
                    }
                    attrs = self.apply_callbacks(attrs, True)

                    if attrs is None:
                        # Just add the text
                        new_tokens.append(
                            {u'type': u'Characters', u'data': prefix + url + suffix}
                        )

                    else:
                        # Add the "a" tag!
                        if prefix:
                            new_tokens.append(
                                {u'type': u'Characters', u'data': prefix}
                            )

                        _text = attrs.pop(u'_text', '')
                        attrs = alphabetize_attributes(attrs)

                        new_tokens.extend([
                            {u'type': u'StartTag', u'name': u'a', u'data': attrs},
                            {u'type': u'Characters', u'data': force_unicode(_text)},
                            {u'type': u'EndTag', u'name': 'a'},
                        ])

                        if suffix:
                            new_tokens.append(
                                {u'type': u'Characters', u'data': suffix}
                            )

                    end = match.end()

                if new_tokens:
                    # Yield the adjusted set of tokens and then continue
                    # through the loop
                    if end < len(text):
                        new_tokens.append({u'type': u'Characters', u'data': text[end:]})

                    for new_token in new_tokens:
                        yield new_token

                    continue

            yield token
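The prefix/suffix plumbing exists for text like u'(see example.com)': strip_non_url_bits() (defined elsewhere on this filter) peels surrounding punctuation off the regex match, so the yielded tokens would be roughly:

    #   {u'type': u'Characters', u'data': u'(see '}
    #   {u'type': u'StartTag', u'name': u'a',
    #    u'data': {(None, u'href'): u'http://example.com'}}
    #   {u'type': u'Characters', u'data': u'example.com'}
    #   {u'type': u'EndTag', u'name': u'a'}
    #   {u'type': u'Characters', u'data': u')'}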
Example #13
def linkify(text, callbacks=DEFAULT_CALLBACKS, skip_pre=False,
            parse_email=False, tokenizer=HTMLSanitizer):
    """Convert URL-like strings in an HTML fragment to links.

    linkify() converts strings that look like URLs or domain names in a
    blob of text that may be an HTML fragment to links, while preserving
    (a) links already in the string, (b) urls found in attributes, and
    (c) email addresses.
    """
    text = force_unicode(text)

    if not text:
        return ''

    parser = html5lib.HTMLParser(tokenizer=tokenizer)

    forest = parser.parseFragment(text)
    _seen = set([])

    def replace_nodes(tree, new_frag, node, index=0):
        """
        Doesn't really replace nodes, but inserts the nodes contained in
        new_frag into the treee at position index and returns the number
        of nodes inserted.
        If node is passed in, it is removed from the tree
        """
        count = 0
        new_tree = parser.parseFragment(new_frag)
        # capture any non-tag text at the start of the fragment
        if new_tree.text:
            if index == 0:
                tree.text = tree.text or ''
                tree.text += new_tree.text
            else:
                tree[index - 1].tail = tree[index - 1].tail or ''
                tree[index - 1].tail += new_tree.text
        # then put the tagged elements into the old tree
        for n in new_tree:
            if n.tag == ETREE_TAG('a'):
                _seen.add(n)
            tree.insert(index + count, n)
            count += 1
        # if we got a node to remove...
        if node is not None:
            tree.remove(node)
        return count

    def strip_wrapping_parentheses(fragment):
        """Strips wrapping parentheses.

        Returns a tuple of the following format::

            (string stripped from wrapping parentheses,
             count of stripped opening parentheses,
             count of stripped closing parentheses)
        """
        opening_parentheses = closing_parentheses = 0
        # Count consecutive opening parentheses
        # at the beginning of the fragment (string).
        for char in fragment:
            if char == '(':
                opening_parentheses += 1
            else:
                break

        if opening_parentheses:
            newer_frag = ''
            # Cut the consecutive opening brackets from the fragment.
            fragment = fragment[opening_parentheses:]
            # Reverse the fragment for easier detection of parentheses
            # inside the URL.
            reverse_fragment = fragment[::-1]
            skip = False
            for char in reverse_fragment:
                # Remove the closing parentheses if it has a matching
                # opening parentheses (they are balanced).
                if (char == ')' and
                        closing_parentheses < opening_parentheses and
                        not skip):
                    closing_parentheses += 1
                    continue
                # Do not remove ')' from the URL itself.
                elif char != ')':
                    skip = True
                newer_frag += char
            fragment = newer_frag[::-1]

        return fragment, opening_parentheses, closing_parentheses

    def apply_callbacks(attrs, new):
        for cb in callbacks:
            attrs = cb(attrs, new)
            if attrs is None:
                return None
        return attrs

    def _render_inner(node):
        out = ['' if node.text is None else node.text]
        for subnode in node:
            out.append(_render(subnode))
            if subnode.tail:
                out.append(subnode.tail)
        return ''.join(out)

    def linkify_nodes(tree, parse_text=True):
        children = len(tree)
        current_child = -1
        # start at -1 to process the parent first
        while current_child < len(tree):
            if current_child < 0:
                node = tree
                if parse_text and node.text:
                    new_txt = old_txt = node.text
                    if parse_email:
                        new_txt = re.sub(email_re, email_repl, node.text)
                        if new_txt and new_txt != node.text:
                            node.text = ''
                            adj = replace_nodes(tree, new_txt, None, 0)
                            children += adj
                            current_child += adj
                            linkify_nodes(tree, True)
                            continue

                    new_txt = re.sub(url_re, link_repl, new_txt)
                    if new_txt != old_txt:
                        node.text = ''
                        adj = replace_nodes(tree, new_txt, None, 0)
                        children += adj
                        current_child += adj
                        continue
            else:
                node = tree[current_child]

            if parse_text and node.tail:
                new_tail = old_tail = node.tail
                if parse_email:
                    new_tail = re.sub(email_re, email_repl, new_tail)
                    if new_tail != node.tail:
                        node.tail = ''
                        adj = replace_nodes(tree, new_tail, None,
                                            current_child + 1)
                        # Insert the new nodes made from my tail into
                        # the tree right after me. current_child+1
                        children += adj
                        continue

                new_tail = re.sub(url_re, link_repl, new_tail)
                if new_tail != old_tail:
                    node.tail = ''
                    adj = replace_nodes(tree, new_tail, None,
                                        current_child + 1)
                    children += adj

            if node.tag == ETREE_TAG('a') and node not in _seen:
                if node.get('href') is not None:
                    attrs = dict(node.items())

                    _text = attrs['_text'] = _render_inner(node)

                    attrs = apply_callbacks(attrs, False)

                    if attrs is None:
                        # <a> tag replaced by the text within it
                        adj = replace_nodes(tree, _text, node,
                                            current_child)
                        current_child -= 1
                        # pull back current_child by 1 to scan the
                        # new nodes again.
                    else:
                        text = force_unicode(attrs.pop('_text'))
                        for attr_key, attr_val in attrs.items():
                            node.set(attr_key, attr_val)

                        for n in reversed(list(node)):
                            node.remove(n)
                        text = parser.parseFragment(text)
                        node.text = text.text
                        for n in text:
                            node.append(n)
                        _seen.add(node)

            elif current_child >= 0:
                if node.tag == ETREE_TAG('pre') and skip_pre:
                    linkify_nodes(node, False)
                elif node not in _seen:
                    linkify_nodes(node, True)

            current_child += 1

    def email_repl(match):
        addr = match.group(0).replace('"', '&quot;')
        link = {
            '_text': addr,
            'href': 'mailto:{0!s}'.format(addr),
        }
        link = apply_callbacks(link, True)

        if link is None:
            return addr

        _href = link.pop('href')
        _text = link.pop('_text')

        repl = '<a href="{0!s}" {1!s}>{2!s}</a>'
        attr = '{0!s}="{1!s}"'
        attribs = ' '.join(attr.format(k, v) for k, v in link.items())
        return repl.format(_href, attribs, _text)

    def link_repl(match):
        url = match.group(0)
        open_brackets = close_brackets = 0
        if url.startswith('('):
            _wrapping = strip_wrapping_parentheses(url)
            url, open_brackets, close_brackets = _wrapping
        end = ''
        m = re.search(punct_re, url)
        if m:
            end = m.group(0)
            url = url[0:m.start()]
        if re.search(proto_re, url):
            href = url
        else:
            href = ''.join(['http://', url])

        link = {
            '_text': url,
            'href': href,
        }

        link = apply_callbacks(link, True)

        if link is None:
            return '(' * open_brackets + url + ')' * close_brackets

        _text = link.pop('_text')
        _href = link.pop('href')

        repl = '{0!s}<a href="{1!s}" {2!s}>{3!s}</a>{4!s}{5!s}'
        attr = '{0!s}="{1!s}"'
        attribs = ' '.join(attr.format(k, v) for k, v in link.items())

        return repl.format('(' * open_brackets,
                           _href, attribs, _text, end,
                           ')' * close_brackets)

    try:
        linkify_nodes(forest)
    except RuntimeError as e:
        # If we hit the max recursion depth, just return what we've got.
        log.exception('Probable recursion error: {0!r}'.format(e))

    return _render(forest)
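One behavioral detail worth a sketch: with skip_pre=True, linkify_nodes() recurses into <pre> elements with parse_text=False, so URL-like text inside them is left alone:

    linkify(u'<pre>example.com</pre> and example.com', skip_pre=True)
    # only the second example.com becomes a link; the text inside <pre> is untouched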
Example #14
File: __init__.py, Project: rmoorman/bleach
def _render(tree):
    """Try rendering as HTML, then XML, then give up."""
    return force_unicode(_serialize(tree))