Example #1
    def linkify(self, text):
        """Linkify specified text

        :arg str text: the text to add links to

        :returns: linkified text as unicode

        :raises TypeError: if ``text`` is not a text type

        """
        if not isinstance(text, six.string_types):
            raise TypeError('argument must be of text type')

        text = force_unicode(text)

        if not text:
            return ''

        dom = self.parser.parseFragment(text)
        filtered = LinkifyFilter(
            source=self.walker(dom),
            callbacks=self.callbacks,
            skip_tags=self.skip_tags,
            parse_email=self.parse_email,
            url_re=self.url_re,
            email_re=self.email_re,
        )
        return self.serializer.render(filtered)
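
For context, a minimal usage sketch, assuming this is the ``Linker.linkify`` method from the bleach library (which the filter names and imports suggest):

    from bleach.linkifier import Linker

    linker = Linker()
    linker.linkify('visit http://example.com for details')
    # roughly: 'visit <a href="http://example.com" rel="nofollow">http://example.com</a> for details'
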
Example #2
    def linkify(self, text):
        """Linkify specified text

        :arg str text: the text to add links to

        :returns: linkified text as unicode

        :raises TypeError: if ``text`` is not a text type

        """
        if not isinstance(text, six.string_types):
            raise TypeError('argument must be of text type')

        text = force_unicode(text)

        if not text:
            return u''

        dom = self.parser.parseFragment(text)
        filtered = LinkifyFilter(
            source=self.walker(dom),
            callbacks=self.callbacks,
            skip_tags=self.skip_tags,
            parse_email=self.parse_email,
            url_re=self.url_re,
            email_re=self.email_re,
        )
        return self.serializer.render(filtered)
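
The ``callbacks`` list handed to ``LinkifyFilter`` controls what happens to each matched link. A sketch of a callback in bleach's documented ``(attrs, new)`` style, using the same namespaced ``(None, 'target')`` key convention seen in the other examples:

    from bleach.linkifier import Linker

    # a linkify callback receives the attribute dict and returns it
    # (possibly modified), or None to leave the match unlinked
    def set_target(attrs, new=False):
        attrs[(None, 'target')] = '_blank'
        return attrs

    linker = Linker(callbacks=[set_target])
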
Example #3
    def clean(self, text, allowed_domains=None):
        """Cleans text and returns sanitized result as unicode

        :arg str text: text to be cleaned
        :arg list allowed_domains: optional list of domains passed to the
            post-sanitize filters

        :returns: sanitized text as unicode

        :raises TypeError: if ``text`` is not a text type

        """
        if allowed_domains is None:
            allowed_domains = []

        if not isinstance(text, str):
            message = "argument cannot be of '{name}' type, must be of text type".format(
                name=text.__class__.__name__)
            raise TypeError(message)

        if not text:
            return ""

        text = force_unicode(text)
        dom = self.parser.parseFragment(text)
        filtered = BleachSanitizerFilter(
            source=self.walker(dom),
            # Bleach-sanitizer-specific things
            attributes=self.attributes,
            strip_disallowed_elements=self.strip,
            strip_html_comments=self.strip_comments,
            # html5lib-sanitizer things
            allowed_elements=self.tags,
            allowed_css_properties=self.styles,
            allowed_protocols=self.protocols,
            allowed_svg_properties=[],
        )

        # Apply any filters after the BleachSanitizerFilter
        for filter_class in self.filters:
            fc = filter_class(source=filtered)
            # these filters define __iter__ with an extra keyword, so call
            # it directly rather than relying on iter()
            filtered = fc.__iter__(allowed_domains=allowed_domains)

        return self.serializer.render(filtered)
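
Note that the loop above calls each filter's ``__iter__`` with an ``allowed_domains`` keyword, so it only works with filters written for that convention. A hypothetical, illustrative filter shape (the class name and pass-through logic are not from the source):

    from html5lib.filters.base import Filter

    class AllowedDomainsFilter(Filter):  # hypothetical name
        def __iter__(self, allowed_domains=None):
            allowed_domains = allowed_domains or []
            for token in Filter.__iter__(self):
                # a real filter would inspect 'a' StartTag tokens here and
                # check the href domain against allowed_domains
                yield token
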
Example #4
    def handle_email_addresses(self, src_iter):
        """Handle email addresses in character tokens"""
        for token in src_iter:
            if token['type'] == 'Characters':
                text = token['data']
                new_tokens = []
                end = 0

                # For each email address we find in the text
                for match in self.email_re.finditer(text):
                    if match.start() > end:
                        new_tokens.append(
                            {'type': 'Characters', 'data': text[end:match.start()]}
                        )

                    # Run attributes through the callbacks to see what we
                    # should do with this match
                    attrs = {
                        (None, 'href'): 'mailto:%s' % match.group(0),
                        '_text': match.group(0)
                    }
                    attrs = self.apply_callbacks(attrs, True)

                    if attrs is None:
                        # Just add the text--but not as a link
                        new_tokens.append(
                            {'type': 'Characters', 'data': match.group(0)}
                        )

                    else:
                        # Add an "a" tag for the new link
                        _text = attrs.pop('_text', '')
                        attrs = alphabetize_attributes(attrs)
                        new_tokens.extend([
                            {'type': 'StartTag', 'name': 'a', 'data': attrs},
                            {'type': 'Characters', 'data': force_unicode(_text)},
                            {'type': 'EndTag', 'name': 'a'}
                        ])
                    end = match.end()

                if new_tokens:
                    # Yield the adjusted set of tokens and then continue
                    # through the loop
                    if end < len(text):
                        new_tokens.append({'type': 'Characters', 'data': text[end:]})

                    for new_token in new_tokens:
                        yield new_token

                    continue

            yield token
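
To see the splitting pattern in isolation, here is a self-contained sketch with a deliberately simplified regex (bleach's real ``email_re`` is much stricter):

    import re

    email_re = re.compile(r'[\w.+-]+@[\w-]+\.[\w.-]+')  # simplified stand-in

    def split_characters(text):
        tokens, end = [], 0
        for match in email_re.finditer(text):
            if match.start() > end:
                tokens.append({'type': 'Characters', 'data': text[end:match.start()]})
            tokens.extend([
                {'type': 'StartTag', 'name': 'a',
                 'data': {(None, 'href'): 'mailto:%s' % match.group(0)}},
                {'type': 'Characters', 'data': match.group(0)},
                {'type': 'EndTag', 'name': 'a'},
            ])
            end = match.end()
        if end < len(text):
            tokens.append({'type': 'Characters', 'data': text[end:]})
        return tokens
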
Example #5
    def handle_email_addresses(self, src_iter):
        """Handle email addresses in character tokens"""
        for token in src_iter:
            if token['type'] == 'Characters':
                text = token['data']
                new_tokens = []
                end = 0

                # For each email address we find in the text
                for match in self.email_re.finditer(text):
                    if match.start() > end:
                        new_tokens.append(
                            {u'type': u'Characters', u'data': text[end:match.start()]}
                        )

                    # Run attributes through the callbacks to see what we
                    # should do with this match
                    attrs = {
                        (None, u'href'): u'mailto:%s' % match.group(0),
                        u'_text': match.group(0)
                    }
                    attrs = self.apply_callbacks(attrs, True)

                    if attrs is None:
                        # Just add the text--but not as a link
                        new_tokens.append(
                            {u'type': u'Characters', u'data': match.group(0)}
                        )

                    else:
                        # Add an "a" tag for the new link
                        _text = attrs.pop(u'_text', u'')
                        attrs = alphabetize_attributes(attrs)
                        new_tokens.extend([
                            {u'type': u'StartTag', u'name': u'a', u'data': attrs},
                            {u'type': u'Characters', u'data': force_unicode(_text)},
                            {u'type': u'EndTag', u'name': u'a'}
                        ])
                    end = match.end()

                if new_tokens:
                    # Yield the adjusted set of tokens and then continue
                    # through the loop
                    if end < len(text):
                        new_tokens.append({u'type': u'Characters', u'data': text[end:]})

                    for new_token in new_tokens:
                        yield new_token

                    continue

            yield token
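
``force_unicode`` coerces whatever the callbacks return back to text. A plausible sketch of such a helper (not necessarily bleach's exact implementation):

    import six

    def force_unicode(s, encoding='utf-8', errors='strict'):
        # leave text alone, decode bytes; mirrors the usual six-based helper
        if isinstance(s, six.text_type):
            return s
        return six.text_type(s, encoding, errors)
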
Example #6
    def handle_a_tag(self, token_buffer):
        """Handle the "a" tag

        This could adjust the link or drop it altogether depending on what the
        callbacks return.

        This yields the new set of tokens.

        """
        a_token = token_buffer[0]
        if a_token['data']:
            attrs = a_token['data']
        else:
            attrs = {}
        text = self.extract_character_data(token_buffer)
        attrs['_text'] = text

        attrs = self.apply_callbacks(attrs, False)

        if attrs is None:
            # We're dropping the "a" tag and everything else and replacing
            # it with character data. So emit that token.
            yield {'type': 'Characters', 'data': text}

        else:
            new_text = attrs.pop('_text', '')
            a_token['data'] = alphabetize_attributes(attrs)

            if text == new_text:
                # The callbacks didn't change the text, so we yield the new "a"
                # token, then whatever else was there, then the end "a" token
                yield a_token
                for mem in token_buffer[1:]:
                    yield mem

            else:
                # If the callbacks changed the text, then we're going to drop
                # all the tokens between the start and end "a" tags and replace
                # it with the new text
                yield a_token
                yield {'type': 'Characters', 'data': force_unicode(new_text)}
                yield token_buffer[-1]
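
The ``attrs is None`` branch is what makes unlinking possible: any callback that returns None drops the tag and keeps only its text. With bleach's top-level API:

    import bleach

    def unlink(attrs, new=False):
        return None  # returning None drops the "a" tag

    bleach.linkify('<a href="http://example.com">site</a>', callbacks=[unlink])
    # -> 'site'
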
Example #7
    def clean(self, text):
        """Cleans text and returns sanitized result as unicode

        :arg str text: text to be cleaned

        :returns: sanitized text as unicode

        :raises TypeError: if ``text`` is not a text type

        """
        if not isinstance(text, six.string_types):
            message = "argument cannot be of '{name}' type, must be of text type".format(
                name=text.__class__.__name__)
            raise TypeError(message)

        if not text:
            return ''

        text = force_unicode(text)

        dom = self.parser.parseFragment(text)
        filtered = BleachSanitizerFilter(
            source=self.walker(dom),

            # Bleach-sanitizer-specific things
            attributes=self.attributes,
            strip_disallowed_elements=self.strip,
            strip_html_comments=self.strip_comments,

            # html5lib-sanitizer things
            allowed_elements=self.tags,
            allowed_css_properties=self.styles,
            allowed_protocols=self.protocols,
            allowed_svg_properties=[],
        )

        # Apply any filters after the BleachSanitizerFilter
        for filter_class in self.filters:
            filtered = filter_class(source=filtered)

        return self.serializer.render(filtered)
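
A usage sketch, assuming this is the ``Cleaner.clean`` method from bleach 2.x/3.x:

    from bleach.sanitizer import Cleaner

    cleaner = Cleaner(tags=['b'], strip=True)
    cleaner.clean('an <b>allowed</b> and a <span>stripped</span> tag')
    # -> 'an <b>allowed</b> and a stripped tag'
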
Example #8
    def clean(self, text):
        """Cleans text and returns sanitized result as unicode

        :arg str text: text to be cleaned

        :returns: sanitized text as unicode

        :raises TypeError: if ``text`` is not a text type

        """
        if not isinstance(text, six.string_types):
            message = "argument cannot be of '{name}' type, must be of text type".format(
                name=text.__class__.__name__)
            raise TypeError(message)

        if not text:
            return u''

        text = force_unicode(text)

        dom = self.parser.parseFragment(text)
        filtered = BleachSanitizerFilter(
            source=self.walker(dom),

            # Bleach-sanitizer-specific things
            attributes=self.attributes,
            strip_disallowed_elements=self.strip,
            strip_html_comments=self.strip_comments,

            # html5lib-sanitizer things
            allowed_elements=self.tags,
            allowed_css_properties=self.styles,
            allowed_protocols=self.protocols,
            allowed_svg_properties=[],
        )

        # Apply any filters after the BleachSanitizerFilter
        for filter_class in self.filters:
            filtered = filter_class(source=filtered)

        return self.serializer.render(filtered)
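
The ``filters`` loop is the hook for composing extra html5lib filters after sanitizing. The pattern bleach documents for linkifying during ``clean``, for example:

    from functools import partial

    from bleach.linkifier import LinkifyFilter
    from bleach.sanitizer import Cleaner

    # run LinkifyFilter after sanitizing, skipping links inside <pre>
    cleaner = Cleaner(filters=[partial(LinkifyFilter, skip_tags=['pre'])])
    cleaner.clean('a www.example.com url')
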
Example #9
    def handle_links(self, src_iter):
        """Handle links in character tokens"""
        in_a = False  # True while inside an "a" tag; happens if parse_email=True and an email was found
        for token in src_iter:
            if in_a:
                if token['type'] == 'EndTag' and token['name'] == 'a':
                    in_a = False
                yield token
                continue
            elif token['type'] == 'StartTag' and token['name'] == 'a':
                in_a = True
                yield token
                continue
            if token['type'] == 'Characters':
                text = token['data']
                new_tokens = []
                end = 0

                for match in self.url_re.finditer(text):
                    if match.start() > end:
                        new_tokens.append(
                            {'type': 'Characters', 'data': text[end:match.start()]}
                        )

                    url = match.group(0)
                    prefix = suffix = ''

                    # Sometimes we pick up too much in the url match, so look for
                    # bits we should drop and remove them from the match
                    url, prefix, suffix = self.strip_non_url_bits(url)

                    # If there's no protocol, add one
                    if PROTO_RE.search(url):
                        href = url
                    else:
                        href = 'http://%s' % url

                    attrs = {
                        (None, 'href'): href,
                        '_text': url
                    }
                    attrs = self.apply_callbacks(attrs, True)

                    if attrs is None:
                        # Just add the text
                        new_tokens.append(
                            {'type': 'Characters', 'data': prefix + url + suffix}
                        )

                    else:
                        # Add the "a" tag!
                        if prefix:
                            new_tokens.append(
                                {'type': 'Characters', 'data': prefix}
                            )

                        _text = attrs.pop('_text', '')
                        attrs = alphabetize_attributes(attrs)

                        new_tokens.extend([
                            {'type': 'StartTag', 'name': 'a', 'data': attrs},
                            {'type': 'Characters', 'data': force_unicode(_text)},
                            {'type': 'EndTag', 'name': 'a'},
                        ])

                        if suffix:
                            new_tokens.append(
                                {'type': 'Characters', 'data': suffix}
                            )

                    end = match.end()

                if new_tokens:
                    # Yield the adjusted set of tokens and then continue
                    # through the loop
                    if end < len(text):
                        new_tokens.append({'type': 'Characters', 'data': text[end:]})

                    for new_token in new_tokens:
                        yield new_token

                    continue

            yield token
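
The ``PROTO_RE`` branch is why bare domains come out with an explicit scheme in ``href`` while the link text keeps the original match. With the top-level API:

    import bleach

    bleach.linkify('see example.com')
    # -> 'see <a href="http://example.com" rel="nofollow">example.com</a>'
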
Example #10
    def handle_links(self, src_iter):
        """Handle links in character tokens"""
        in_a = False  # True while inside an "a" tag; happens if parse_email=True and an email was found
        for token in src_iter:
            if in_a:
                if token["type"] == "EndTag" and token["name"] == "a":
                    in_a = False
                yield token
                continue
            elif token["type"] == "StartTag" and token["name"] == "a":
                in_a = True
                yield token
                continue
            if token["type"] == "Characters":
                text = token["data"]
                new_tokens = []
                end = 0

                for match in self.url_re.finditer(text):
                    if match.start() > end:
                        new_tokens.append({
                            "type": "Characters",
                            "data": text[end:match.start()]
                        })

                    url = match.group(0)
                    prefix = suffix = ""

                    # Sometimes we pick up too much in the url match, so look for
                    # bits we should drop and remove them from the match
                    url, prefix, suffix = self.strip_non_url_bits(url)

                    # If there's no protocol, add one
                    if PROTO_RE.search(url):
                        href = url
                    else:
                        href = "http://%s" % url

                    attrs = {(None, "href"): href, "_text": url}
                    attrs = self.apply_callbacks(attrs, True)

                    if attrs is None:
                        # Just add the text
                        new_tokens.append({
                            "type": "Characters",
                            "data": prefix + url + suffix
                        })

                    else:
                        # Add the "a" tag!
                        if prefix:
                            new_tokens.append({
                                "type": "Characters",
                                "data": prefix
                            })

                        _text = attrs.pop("_text", "")
                        attrs = alphabetize_attributes(attrs)

                        new_tokens.extend([
                            {
                                "type": "StartTag",
                                "name": "a",
                                "data": attrs
                            },
                            {
                                "type": "Characters",
                                "data": force_unicode(_text)
                            },
                            {
                                "type": "EndTag",
                                "name": "a"
                            },
                        ])

                        if suffix:
                            new_tokens.append({
                                "type": "Characters",
                                "data": suffix
                            })

                    end = match.end()

                if new_tokens:
                    # Yield the adjusted set of tokens and then continue
                    # through the loop
                    if end < len(text):
                        new_tokens.append({
                            "type": "Characters",
                            "data": text[end:]
                        })

                    for new_token in new_tokens:
                        yield new_token

                    continue

            yield token
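
``strip_non_url_bits`` keeps trailing punctuation out of the link. A hypothetical, simplified version of what such a helper does (the real method also handles wrapping parentheses and prefixes):

    def strip_non_url_bits(url):
        # peel trailing punctuation off the match and return it as a suffix
        prefix = suffix = ''
        while url and url[-1] in '.,!?)':
            suffix = url[-1] + suffix
            url = url[:-1]
        return url, prefix, suffix

    strip_non_url_bits('http://example.com.')
    # -> ('http://example.com', '', '.')
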
Example #11
    def handle_email_addresses(self, src_iter):
        """Handle email addresses in character tokens"""
        for token in src_iter:
            if token["type"] == "Characters":
                text = token["data"]
                new_tokens = []
                end = 0

                # For each email address we find in the text
                for match in self.email_re.finditer(text):
                    if match.start() > end:
                        new_tokens.append({
                            "type": "Characters",
                            "data": text[end:match.start()]
                        })

                    # Run attributes through the callbacks to see what we
                    # should do with this match
                    attrs = {
                        (None, "href"): "mailto:%s" % match.group(0),
                        "_text": match.group(0),
                    }
                    attrs = self.apply_callbacks(attrs, True)

                    if attrs is None:
                        # Just add the text--but not as a link
                        new_tokens.append({
                            "type": "Characters",
                            "data": match.group(0)
                        })

                    else:
                        # Add an "a" tag for the new link
                        _text = attrs.pop("_text", "")
                        attrs = alphabetize_attributes(attrs)
                        new_tokens.extend([
                            {
                                "type": "StartTag",
                                "name": "a",
                                "data": attrs
                            },
                            {
                                "type": "Characters",
                                "data": force_unicode(_text)
                            },
                            {
                                "type": "EndTag",
                                "name": "a"
                            },
                        ])
                    end = match.end()

                if new_tokens:
                    # Yield the adjusted set of tokens and then continue
                    # through the loop
                    if end < len(text):
                        new_tokens.append({
                            "type": "Characters",
                            "data": text[end:]
                        })

                    for new_token in new_tokens:
                        yield new_token

                    continue

            yield token
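
This pass only runs when ``parse_email`` is enabled. With the top-level API:

    import bleach

    bleach.linkify('write to me@example.com', parse_email=True)
    # -> 'write to <a href="mailto:me@example.com">me@example.com</a>'
    # (the default nofollow callback skips mailto: links)
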
Example #12
    def handle_links(self, src_iter):
        """Handle links in character tokens"""
        in_a = False  # True while inside an "a" tag; happens if parse_email=True and an email was found
        for token in src_iter:
            if in_a:
                if token['type'] == 'EndTag' and token['name'] == 'a':
                    in_a = False
                yield token
                continue
            elif token['type'] == 'StartTag' and token['name'] == 'a':
                in_a = True
                yield token
                continue
            if token['type'] == 'Characters':
                text = token['data']
                new_tokens = []
                end = 0

                for match in self.url_re.finditer(text):
                    if match.start() > end:
                        new_tokens.append(
                            {u'type': u'Characters', u'data': text[end:match.start()]}
                        )

                    url = match.group(0)
                    prefix = suffix = u''

                    # Sometimes we pick up too much in the url match, so look for
                    # bits we should drop and remove them from the match
                    url, prefix, suffix = self.strip_non_url_bits(url)

                    # If there's no protocol, add one
                    if PROTO_RE.search(url):
                        href = url
                    else:
                        href = u'http://%s' % url

                    attrs = {
                        (None, u'href'): href,
                        u'_text': url
                    }
                    attrs = self.apply_callbacks(attrs, True)

                    if attrs is None:
                        # Just add the text
                        new_tokens.append(
                            {u'type': u'Characters', u'data': prefix + url + suffix}
                        )

                    else:
                        # Add the "a" tag!
                        if prefix:
                            new_tokens.append(
                                {u'type': u'Characters', u'data': prefix}
                            )

                        _text = attrs.pop(u'_text', u'')
                        attrs = alphabetize_attributes(attrs)

                        new_tokens.extend([
                            {u'type': u'StartTag', u'name': u'a', u'data': attrs},
                            {u'type': u'Characters', u'data': force_unicode(_text)},
                            {u'type': u'EndTag', u'name': u'a'},
                        ])

                        if suffix:
                            new_tokens.append(
                                {u'type': u'Characters', u'data': suffix}
                            )

                    end = match.end()

                if new_tokens:
                    # Yield the adjusted set of tokens and then continue
                    # through the loop
                    if end < len(text):
                        new_tokens.append({u'type': u'Characters', u'data': text[end:]})

                    for new_token in new_tokens:
                        yield new_token

                    continue

            yield token
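
``url_re`` is injectable, which is how custom TLD sets are supported. A sketch assuming bleach's ``build_url_re`` helper (available since bleach 2.1):

    from bleach.linkifier import Linker, build_url_re

    # recognize only these TLDs when scanning for bare URLs
    my_url_re = build_url_re(tlds=['com', 'net', 'dev'])
    linker = Linker(url_re=my_url_re)
    linker.linkify('docs live at example.dev')
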