Beispiel #1
0
 def __init__(self, source):
     """Initialise the base filter and reset this filter's counters and flags.

     Args:
         source: token stream passed on to ``html5lib_Filter.__init__``.
     """
     html5lib_Filter.__init__(self, source)
     # Level counters: current value and its configured maximum.
     self.level, self.max_level = 2, 3
     # Both flags start cleared; they are toggled during iteration.
     self.in_header = self.in_hierarchy = False
     self.open_level = 0
Beispiel #2
0
 def __init__(self, source):
     """Set up the wrapped token source and the initial iteration state.

     Args:
         source: token stream passed on to ``html5lib_Filter.__init__``.
     """
     html5lib_Filter.__init__(self, source)
     # Flags start out False and the nesting counter at zero.
     self.in_hierarchy = self.in_header = False
     self.open_level = 0
     # Starting level and the highest level stored for later use.
     self.max_level = 3
     self.level = 2
Beispiel #3
0
    def __init__(self, source, id):
        """Remember the target section id and reset the iteration state.

        Args:
            source: token stream passed on to ``html5lib_Filter.__init__``.
            id: value stored as ``self.section_id``.
        """
        html5lib_Filter.__init__(self, source)

        self.section_id = id

        # State that changes while tokens are consumed.
        self.in_section = self.skip = False
        self.parent_level = None
        self.open_level = 0
Beispiel #4
0
    def __init__(self, source, id):
        """Store the section id to look for and clear all tracking state.

        Args:
            source: token stream passed on to ``html5lib_Filter.__init__``.
            id: value stored as ``self.section_id``.
        """
        html5lib_Filter.__init__(self, source)

        self.section_id = id

        # Nothing is open and no section has been seen yet.
        self.open_level = 0
        self.parent_level = None
        self.skip = self.in_section = False
Beispiel #5
0
    def __iter__(self):
        """Yield every token, prefixing selected URL attributes with the base URL.

        Only ``StartTag`` tokens whose name is a key of ``self.tag_attributes``
        are inspected; the mapping gives the attribute name to rewrite. Values
        starting with ``http``, ``//`` or ``{{`` are left as they are, values
        starting with ``/`` get ``self.base_url`` prepended, and anything else
        gets ``self.base_url + '/'`` prepended.
        """
        for token in html5lib_Filter.__iter__(self):
            if (token['type'] == 'StartTag'
                    and token['name'] in self.tag_attributes):
                wanted = self.tag_attributes[token['name']]
                attrs = dict(token['data'])

                for (namespace, name), value in attrs.items():
                    if wanted != name:
                        continue

                    if not value.startswith('http'):
                        if value.startswith(('//', '{{')):
                            # Protocol-relative address or apparent template
                            # variable output: keep the value unchanged.
                            rewritten = value
                        elif value.startswith('/'):
                            # Root-relative path: just add the base url.
                            rewritten = self.base_url + value
                        else:
                            rewritten = self.base_url + '/' + value
                        attrs[(namespace, name)] = rewritten
                        token['data'] = attrs
                    # Only the first matching attribute is considered.
                    break

            yield token
Beispiel #6
0
    def __iter__(self):
        """Yield all tokens except the element whose id is ``self.section_id``.

        The start tag carrying the matching ``id`` attribute, everything nested
        inside it, and its closing tag are all suppressed. Nesting depth is
        tracked with ``self.open_level`` so that the closing tag belonging to
        the matched element can be recognised.
        """
        for token in html5lib_Filter.__iter__(self):

            if token['type'] == 'StartTag':
                # One level deeper for every opened element.
                self.open_level += 1

                for key in token['data']:
                    if 'id' == key[1] and token['data'][key] == self.section_id:
                        # We are now inside the section being removed.
                        self.in_section = True
                        # Remember the depth at which the section started.
                        self.parent_level = self.open_level

            elif token['type'] == 'EndTag':
                # The section ends when its own element is closed. Compare by
                # value: identity (``is``) on ints only works by accident of
                # the interpreter's small-integer cache.
                if (self.parent_level is not None
                        and self.open_level == self.parent_level):
                    self.in_section = False
                    # Also drop this closing tag itself.
                    self.skip = True
                    self.parent_level = None

                self.open_level -= 1

            # Emit tokens only when outside the section being removed.
            if not self.in_section and not self.skip:
                yield token
            else:
                self.skip = False
Beispiel #7
0
    def __init__(self, source, id, replace_source=None, ignore_heading=False):
        """Store the filter's configuration and reset its iteration state.

        Args:
            source: token stream passed on to ``html5lib_Filter.__init__``.
            id: value stored as ``self.section_id``.
            replace_source: stored as ``self.replace_source``; ``None`` by
                default.
            ignore_heading: stored as ``self.ignore_heading``; ``False`` by
                default.
        """
        html5lib_Filter.__init__(self, source)

        # Caller-supplied configuration.
        self.section_id = id
        self.replace_source = replace_source
        self.ignore_heading = ignore_heading

        # Values discovered during iteration; nothing is known yet.
        self.heading = self.heading_rank = None
        self.parent_level = self.heading_to_ignore = None
        self.open_level = 0

        # Progress flags, all initially cleared.
        self.in_section = self.next_in_section = False
        self.already_ignored_header = self.replacement_emitted = False
Beispiel #8
0
    def __iter__(self):
        """Yield only the tokens that belong to elements selected by ``self.f``.

        A ``StartTag`` for which ``self.f(token)`` is truthy (re)starts a
        selected region by setting ``self.inside`` to 0. While ``self.inside``
        is not ``None`` it counts open start tags, every token is yielded, and
        the region ends once the counter drops to zero or below.

        ``script`` and ``style`` elements are never emitted: from their start
        tag up to and including the next end tag, tokens are skipped and do not
        affect the nesting counter.
        """
        # True from a <script>/<style> start tag until the next end tag.
        in_raw_text = False
        for token in Filter.__iter__(self):
            ttype = token['type']
            if ttype == 'StartTag':
                if self.f(token):
                    self.inside = 0
                if token['name'] in {'script', 'style'}:
                    in_raw_text = True

            if in_raw_text:
                if ttype == 'EndTag':
                    in_raw_text = False
            elif self.inside is not None:
                if ttype == 'StartTag':
                    self.inside += 1
                elif ttype == 'EndTag':
                    self.inside -= 1
                if self.inside <= 0:
                    # The selected element has been closed.
                    self.inside = None

                yield token
Beispiel #9
0
 def __iter__(self, _title_attr=(None, 'title')):
     """Yield all tokens, following titled HTML images with an ``<aside>``.

     After an ``EmptyTag`` token for an HTML-namespace ``img`` that has a
     ``title`` attribute, three extra tokens are emitted: an ``aside`` start
     tag, a ``Characters`` token holding the title text, and the ``aside``
     end tag.

     Args:
         _title_attr: attribute key looked up in the token data; the default
             is the un-namespaced ``title`` attribute.
     """
     html_ns = namespaces['html']
     for token in BaseFilter.__iter__(self):
         yield token

         is_html_img = (
             token['type'] == 'EmptyTag' and
             token['name'] == 'img' and
             token['namespace'] == html_ns and
             'data' in token
         )
         if not is_html_img:
             continue

         attrs = token['data']
         if _title_attr not in attrs:
             continue

         yield {
             'type': 'StartTag',
             'namespace': html_ns,
             'name': 'aside',
             'data': OrderedDict(),  # TODO Some way to pass through special styling.
         }
         yield {
             'type': 'Characters',
             'data': attrs[_title_attr],
         }
         yield {
             'type': 'EndTag',
             'namespace': html_ns,
             'name': 'aside',
         }
Beispiel #10
0
            def __iter__(self):
                """Yield every token after setting all of its attributes to ``'moo'``.

                Only ``StartTag`` and ``EmptyTag`` tokens with non-empty data
                are modified; the token's data mapping is updated in place.
                """
                for token in Filter.__iter__(self):
                    if token['type'] in ('StartTag', 'EmptyTag'):
                        attrs = token['data']
                        if attrs:
                            # Snapshot the keys; only values are replaced.
                            for attr in list(attrs):
                                attrs[attr] = 'moo'

                    yield token
Beispiel #11
0
    def __iter__(self):
        """Yield all tokens except the element whose id is ``self.section_id``.

        The matching start tag, all tokens nested inside that element and its
        closing tag are suppressed. ``self.open_level`` tracks how many
        elements are currently open so the closing tag of the matched element
        can be identified.
        """
        for token in html5lib_Filter.__iter__(self):

            if token['type'] == 'StartTag':
                # Track nesting depth.
                self.open_level += 1

                for key in token['data']:
                    if 'id' == key[1] and token['data'][key] == self.section_id:
                        # Entering the section that is being removed.
                        self.in_section = True
                        # Depth at which the section's element was opened.
                        self.parent_level = self.open_level

            elif token['type'] == 'EndTag':
                # Use value equality here; ``is`` on integers depends on
                # interpreter caching and is not a reliable comparison.
                if (self.parent_level is not None and
                        self.open_level == self.parent_level):
                    self.in_section = False
                    # The closing tag of the section is dropped as well.
                    self.skip = True
                    self.parent_level = None

                self.open_level -= 1

            # Emit tokens only when outside the section being removed.
            if not self.in_section and not self.skip:
                yield token
            else:
                self.skip = False
Beispiel #12
0
    def __iter__(self):
        """Yield all tokens, adding an "Edit" link after section start tags.

        For each ``StartTag`` whose name is in ``SECTION_TAGS`` and that has a
        non-empty ``id`` attribute, an ``<a class="edit-section">`` element is
        emitted right after the tag. Its ``href`` points at the ``wiki.edit``
        URL and ``data-section-src-url`` at the ``wiki.document`` URL, both
        with the section id in the query string.
        """
        for token in html5lib_Filter.__iter__(self):
            yield token

            if token["type"] != "StartTag" or token["name"] not in SECTION_TAGS:
                continue

            for (namespace, name), value in dict(token["data"]).items():
                if name != "id" or not value:
                    continue

                title = gettext("Edit section")

                # URL of the raw section source.
                doc_path = reverse(
                    "wiki.document", args=[self.slug], locale=self.locale
                )
                doc_query = urlencode({"section": value.encode(), "raw": "true"})
                src_url = order_params("%s?%s" % (doc_path, doc_query))

                # URL of the section editor.
                edit_path = reverse(
                    "wiki.edit", args=[self.slug], locale=self.locale
                )
                edit_query = urlencode(
                    {"section": value.encode(), "edit_links": "true"}
                )
                href = order_params("%s?%s" % (edit_path, edit_query))

                link_tokens = (
                    {
                        "type": "StartTag",
                        "name": "a",
                        "data": {
                            (None, "title"): title,
                            (None, "class"): "edit-section",
                            (None, "data-section-id"): value,
                            (None, "data-section-src-url"): src_url,
                            (None, "href"): href,
                        },
                    },
                    {"type": "Characters", "data": gettext("Edit")},
                    {"type": "EndTag", "name": "a"},
                )
                for link_token in link_tokens:
                    yield link_token
Beispiel #13
0
    def __init__(self, source, id, replace_source=None, ignore_heading=False):
        """Record the constructor arguments and clear all tracking state.

        Args:
            source: token stream passed on to ``html5lib_Filter.__init__``.
            id: value stored as ``self.section_id``.
            replace_source: stored as ``self.replace_source``; defaults to
                ``None``.
            ignore_heading: stored as ``self.ignore_heading``; defaults to
                ``False``.
        """
        html5lib_Filter.__init__(self, source)

        # Configuration taken from the arguments.
        self.replace_source = replace_source
        self.ignore_heading = ignore_heading
        self.section_id = id

        # Not yet known; filled in while iterating.
        self.heading = self.heading_rank = None
        self.heading_to_ignore = self.parent_level = None

        # Counter and flags in their initial state.
        self.open_level = 0
        self.in_section = self.already_ignored_header = False
        self.replacement_emitted = self.next_in_section = False
Beispiel #14
0
 def __iter__(self):
     """Yield tokens with ``<p>`` tags removed and ``</p>`` turned into a newline.

     Opening ``p`` tags are dropped, closing ``p`` tags are replaced by
     ``self.NEWLINE_TOKEN``, and every other token passes through unchanged.
     """
     for token in Filter.__iter__(self):
         kind = token["type"]
         is_paragraph_tag = (
             kind in ("StartTag", "EndTag") and token["name"] == "p"
         )
         if not is_paragraph_tag:
             yield token
         elif kind == "EndTag":
             yield self.NEWLINE_TOKEN
Beispiel #15
0
 def __iter__(self):
     """Yield all tokens, rewriting the attribute values of ``img`` tags.

     Each attribute value of an ``img`` start/empty tag is resolved against
     ``result['url']`` with ``urljoin`` and passed to ``image_rewrite``
     together with ``result['@id']``.
     """
     for token in Filter.__iter__(self):
         is_tag = token['type'] in ('StartTag', 'EmptyTag')
         if is_tag and token['data'] and token['name'] == 'img':
             attrs = token['data']
             # NOTE(review): every attribute is rewritten, not only ``src``;
             # confirm that this is intended.
             for attr, value in attrs.items():
                 absolute = urljoin(result['url'], value)
                 attrs[attr] = image_rewrite(absolute, result['@id'])
         yield token
Beispiel #16
0
 def __iter__(self):
     """Yield all tokens except the ``html``, ``head`` and ``body`` tags.

     Only the tag tokens themselves are dropped; tokens between them are
     passed through.
     """
     tag_types = ('StartTag', 'EmptyTag', 'EndTag')
     wrappers = ('html', 'head', 'body')
     for token in HTML5LibFilterBase.__iter__(self):
         if token['type'] in tag_types and token['name'] in wrappers:
             continue
         yield token
Beispiel #17
0
 def __iter__(self):
     """Yield tokens that are outside every element listed in ``REMOVED_TAGS``.

     Tokens named in ``REMOVED_TAGS`` are never emitted. Start tags push onto
     a stack and end tags pop from it; other tokens are emitted only while
     the stack is empty.
     """
     open_removed = []
     for token in Filter.__iter__(self):
         is_removed_tag = 'name' in token and token['name'] in REMOVED_TAGS
         if not is_removed_tag:
             if not open_removed:
                 yield token
             continue

         kind = token['type']
         if kind == 'StartTag':
             open_removed.append(token['name'])
         elif kind == 'EndTag':
             open_removed.pop()
Beispiel #18
0
 def __iter__(self):
     """Yield all tokens, removing attributes named in ``self.attributes``.

     Attribute names are compared lower-cased. Only ``StartTag`` and
     ``EmptyTag`` tokens with non-empty data are rewritten; their data is
     replaced by a new plain dict.
     """
     for token in Filter.__iter__(self):
         data = token.get('data')
         if data and token['type'] in {'StartTag', 'EmptyTag'}:
             token['data'] = {
                 (namespace, attr): value
                 for (namespace, attr), value in data.items()
                 if attr.lower() not in self.attributes
             }
         yield token
Beispiel #19
0
 def __iter__(self):
     """Yield all tokens, replacing link targets by their ``q`` query value.

     For ``<a>`` start tags with an ``href`` whose query string contains a
     ``q`` parameter, the ``href`` is replaced by the first ``q`` value.
     """
     href_key = (None, "href")
     for token in Filter.__iter__(self):
         if token["type"] == "StartTag" and token["name"] == "a" and token["data"]:
             url = token["data"].get(href_key)
             if url is not None:
                 targets = parse_qs(urlparse(url).query).get("q")
                 # ``targets`` is either None or a list of values.
                 if targets:
                     token["data"][href_key] = targets[0]
         yield (token)
Beispiel #20
0
 def __iter__(self):
     """Yield all tokens with ``on*`` attributes removed from element tags.

     Both ``StartTag`` and ``EmptyTag`` tokens are cleaned. html5lib tree
     walkers report void elements such as ``<img>`` as ``EmptyTag``, so
     handling only ``StartTag`` would let ``<img onerror=...>`` through.
     """
     for token in html5lib_Filter.__iter__(self):
         if token['type'] in ('StartTag', 'EmptyTag'):
             # Strip out any attributes that start with "on"
             token['data'] = {
                 (namespace, name): value
                 for (namespace, name), value in token['data'].items()
                 if not name.startswith('on')
             }
         yield token
Beispiel #21
0
 def __iter__(self):
     """Yield all tokens, dropping attributes whose name starts with ``on``.

     ``EmptyTag`` tokens are processed as well as ``StartTag`` tokens: void
     elements (for example ``<img>``) arrive as ``EmptyTag`` from html5lib
     tree walkers and would otherwise keep their event-handler attributes.
     """
     for token in html5lib_Filter.__iter__(self):
         if token['type'] in ('StartTag', 'EmptyTag'):
             # Strip out any attributes that start with "on"
             token['data'] = {
                 (namespace, name): value
                 for (namespace, name), value in token['data'].items()
                 if not name.startswith('on')
             }
         yield token
Beispiel #22
0
 def __iter__(self):
     """Yield all tokens after removing ``on*`` attributes from element tags.

     Applies to ``StartTag`` and ``EmptyTag`` tokens. The latter is how
     html5lib tree walkers emit void elements such as ``<img>``; skipping
     them would leave attributes like ``onerror`` in place.
     """
     for token in html5lib_Filter.__iter__(self):
         if token["type"] in ("StartTag", "EmptyTag"):
             # Strip out any attributes that start with "on"
             token["data"] = {
                 (namespace, name): value
                 for (namespace, name), value in token["data"].items()
                 if not name.startswith("on")
             }
         yield token
Beispiel #23
0
 def __iter__(self):
     """Yield all tokens that are not part of an element named in ``self.tags``.

     Tag names are compared lower-cased. A matching start tag, everything
     nested inside it and its end tag are dropped; a matching ``EmptyTag`` is
     dropped on its own.

     ``EmptyTag`` tokens never change the nesting depth because no ``EndTag``
     follows them. Counting them would leave the depth above zero and
     swallow tokens after the deleted element.
     """
     delete = 0  # nesting depth inside the element being deleted
     for token in Filter.__iter__(self):
         token_type = token['type']
         if token_type == 'StartTag':
             if delete > 0 or token['name'].lower() in self.tags:
                 delete += 1
         elif token_type == 'EmptyTag':
             if delete == 0 and token['name'].lower() in self.tags:
                 # A void element selected for deletion: drop just this token.
                 continue
         if delete == 0:
             yield token
         if token_type == 'EndTag' and delete > 0:
             delete -= 1
Beispiel #24
0
    def __iter__(self):
        """Yield all tokens, obfuscating the address of ``mailto:`` links.

        For ``<a>`` start tags whose ``href`` starts with ``mailto:``, the part
        after the scheme is passed through ``obfuscate`` and written back.
        Anchors without an ``href`` attribute (for example ``<a name="x">``)
        pass through unchanged instead of raising ``KeyError``.
        """
        href_key = (None, 'href')
        for token in Filter.__iter__(self):
            if token["type"] == "StartTag" and token['name'] == "a":
                href = token["data"].get(href_key)

                if href is not None and href.startswith("mailto:"):
                    # len("mailto:") == 7
                    obfuscated = obfuscate(href[7:])
                    token["data"][href_key] = "mailto:{}".format(obfuscated)

            yield token
Beispiel #25
0
 def __iter__(self, _SRC_ATTR=(None, 'src'), _youtube_hosts=('youtube.com',
                                                             'www.youtube.com',
                                                             'youtube-nocookie.com',
                                                             'www.youtube-nocookie.com')):
     """Yield all tokens, replacing YouTube embed iframes by a linked thumbnail.

     An HTML ``<iframe>`` whose ``src`` is an absolute URL on one of
     ``_youtube_hosts`` with a two-segment path starting with ``embed`` is
     replaced by ``<a href=...><img ...></a>``, built from
     ``self._watch_url`` and ``self._thumbnail_url``. Everything up to and
     including the iframe's end tag is dropped. All other tokens pass
     through unchanged.

     Args:
         _SRC_ATTR: attribute key used for ``src``.
         _youtube_hosts: host names treated as YouTube.
     """
     html_ns = namespaces['html']
     elide = False
     for token in BaseFilter.__iter__(self):
         token_type = token['type']

         if elide:
             # NOTE html5lib doesn't permit nesting <iframe> tags,
             # (presumably because HTML5 doesn't permit it). Therefore we
             # don't need to deal with that case here, just wait for the
             # first end tag.
             if token_type == 'EndTag' and token['name'] == 'iframe':
                 elide = False
             continue

         is_iframe_with_src = (
             token_type == 'StartTag' and
             token['name'] == 'iframe' and
             token['namespace'] == html_ns and
             'data' in token and
             _SRC_ATTR in token['data']
         )
         if is_iframe_with_src:
             url = URL.from_text(token['data'][_SRC_ATTR])
             if url.absolute and url.host in _youtube_hosts and len(url.path) == 2 and url.path[0] == 'embed':
                 yield {
                     'type': 'StartTag',
                     'namespace': html_ns,
                     'name': 'a',
                     'data': OrderedDict([
                         ((None, 'href'), self._watch_url(url).to_text()),
                     ]),
                 }
                 yield {
                     'type': 'EmptyTag',
                     'namespace': html_ns,
                     'name': u'img',
                     'data': OrderedDict([
                         ((None, 'alt'), 'YouTube video'),
                         (_SRC_ATTR, self._thumbnail_url(url).to_text()),
                         ((None, 'width'), '320'),
                         ((None, 'height'), '180'),
                     ]),
                 }
                 yield {
                     'type': 'EndTag',
                     'namespace': html_ns,
                     'name': 'a',
                 }
                 # Swallow the rest of the iframe element.
                 elide = True
                 continue

         yield token
Beispiel #26
0
    def __iter__(self):
        """Yield the result of ``self.sanitize_token`` for every input token.

        A falsy result drops the token, a list result is flattened into the
        output, and any other result is yielded as a single token.
        """
        for token in Filter.__iter__(self):
            sanitized = self.sanitize_token(token)

            if sanitized:
                if isinstance(sanitized, list):
                    for subtoken in sanitized:
                        yield subtoken
                else:
                    yield sanitized
Beispiel #27
0
    def __iter__(self):
        """Run each token through ``self.sanitize_token`` and yield what remains.

        Falsy results are skipped; list results are emitted element by
        element; everything else is emitted as is.
        """
        for token in Filter.__iter__(self):
            result = self.sanitize_token(token)

            if not result:
                continue

            replacements = result if isinstance(result, list) else (result,)
            for replacement in replacements:
                yield replacement
Beispiel #28
0
 def __iter__(self):
     """Yield all tokens, blanking ``href`` values with a blocked protocol.

     For ``<a>`` start tags, a non-empty ``href`` that matches the
     ``self.blocked_protocols`` regular expression is replaced by an empty
     string.
     """
     for token in html5lib_Filter.__iter__(self):
         if token['type'] == 'StartTag' and token['name'] == 'a':
             attrs = dict(token['data'])
             for (namespace, name), value in attrs.items():
                 blocked = (name == 'href' and value and
                            re.search(self.blocked_protocols, value))
                 if blocked:
                     attrs[(namespace, 'href')] = ''
             # The copy only replaces the token data when there were attributes.
             if attrs:
                 token['data'] = attrs
         yield token
Beispiel #29
0
    def __iter__(self):
        """Yield all tokens, inserting an "Edit" link after section start tags.

        After each ``StartTag`` named in ``SECTION_TAGS`` that has a non-empty
        ``id`` attribute, an ``<a class="edit-section">`` element is emitted.
        ``href`` is the ``wiki.edit`` URL and ``data-section-src-url`` the
        ``wiki.document`` URL, each carrying the section id in its query
        string.
        """
        for token in html5lib_Filter.__iter__(self):
            yield token

            if token['type'] != 'StartTag' or token['name'] not in SECTION_TAGS:
                continue

            for (namespace, name), value in dict(token['data']).items():
                if name != 'id' or not value:
                    continue

                title = ugettext('Edit section')

                # URL of the raw section source.
                doc_path = reverse('wiki.document',
                                   args=[self.slug],
                                   locale=self.locale)
                doc_query = urlencode({
                    'section': value.encode('utf-8'),
                    'raw': 'true'
                })
                src_url = order_params(u'%s?%s' % (doc_path, doc_query))

                # URL of the section editor.
                edit_path = reverse('wiki.edit',
                                    args=[self.slug],
                                    locale=self.locale)
                edit_query = urlencode({
                    'section': value.encode('utf-8'),
                    'edit_links': 'true'
                })
                href = order_params(u'%s?%s' % (edit_path, edit_query))

                link_tokens = (
                    {
                        'type': 'StartTag',
                        'name': 'a',
                        'data': {
                            (None, u'title'): title,
                            (None, u'class'): 'edit-section',
                            (None, u'data-section-id'): value,
                            (None, u'data-section-src-url'): src_url,
                            (None, u'href'): href,
                        }
                    },
                    {'type': 'Characters', 'data': ugettext(u'Edit')},
                    {'type': 'EndTag', 'name': 'a'},
                )
                for link_token in link_tokens:
                    yield link_token
Beispiel #30
0
 def __iter__(self):
     """Yield all tokens outside elements whose name is in ``self._tags``.

     A start tag with a matching (lower-cased) name begins a skipped region
     that lasts up to and including the next end tag with a matching name.
     Nesting is not tracked.
     """
     skipping = False
     for token in Filter.__iter__(self):
         if skipping:
             # Look for the end tag that closes the skipped region.
             if token["type"] == "EndTag" and token["name"].lower() in self._tags:
                 skipping = False
             continue
         if token["type"] == "StartTag" and token["name"].lower() in self._tags:
             skipping = True
             continue
         yield token
Beispiel #31
0
 def __iter__(self):
     """Yield all tokens, converting ``<pre function=...>`` to a brush class.

     When a ``<pre>`` start tag has a ``function`` attribute matching
     ``MT_SYNTAX_RE``, the attribute is removed and a ``class`` attribute of
     the form ``brush: <name>`` is set. The name is the lower-cased first
     match group, mapped through ``MT_SYNTAX_BRUSH_MAP`` when present there.
     """
     for token in html5lib_Filter.__iter__(self):
         if token['type'] == 'StartTag' and token['name'] == 'pre':
             attrs = dict(token['data'])
             # Iterate over a snapshot: ``attrs`` is modified in the loop, and
             # changing a dict while iterating its live view raises
             # RuntimeError on Python 3.
             for (namespace, name), value in list(attrs.items()):
                 if name == 'function' and value:
                     m = MT_SYNTAX_RE.match(value)
                     if m:
                         lang = m.group(1).lower()
                         brush = MT_SYNTAX_BRUSH_MAP.get(lang, lang)
                         attrs[(namespace, u'class')] = "brush: %s" % brush
                         # Delete the attribute that was matched, whatever
                         # its namespace is.
                         del attrs[(namespace, name)]
                         token['data'] = attrs
         yield token
Beispiel #32
0
 def __iter__(self):
     """Yield all tokens, unwrapping ``<a>`` elements that have no attributes.

     The start tag of an attribute-less anchor and the next anchor end tag
     are dropped; the tokens between them are kept.
     """
     pending_close = False
     for token in Filter.__iter__(self):
         # only check anchor tags
         if token.get('name') == 'a':
             kind = token['type']
             if kind == 'StartTag' and token['data'] == {}:
                 pending_close = True
                 continue
             if kind == 'EndTag' and pending_close:
                 pending_close = False
                 continue
         yield token
Beispiel #33
0
    def __iter__(self):
        """Yield all tokens, ensuring every ``<pre>`` has class ``notranslate``.

        The word is appended to an existing ``class`` value unless it is
        already one of its whitespace-separated entries. A ``<pre>`` without
        any ``class`` attribute gets one, also when it has other attributes
        such as ``id``.
        """
        for token in html5lib_Filter.__iter__(self):
            if token["type"] == "StartTag" and token["name"] == "pre":
                attrs = dict(token["data"])
                # Make sure there is a class attribute to extend; checking
                # only for an empty attribute dict would miss tags that have
                # other attributes but no class.
                if not any(name == "class" for (_, name) in attrs):
                    attrs[(None, "class")] = ""

                for (namespace, name), value in list(attrs.items()):
                    if name == "class" and "notranslate" not in value.split():
                        attrs[(namespace, name)] = f"{value} notranslate".strip()

                token["data"] = attrs

            yield token
Beispiel #34
0
 def __iter__(self):
     """Yield all tokens, converting ``<pre function=...>`` to a brush class.

     A ``function`` attribute on a ``<pre>`` start tag that matches
     ``MT_SYNTAX_RE`` is removed and replaced by ``class="brush: <name>"``.
     The name is the lower-cased first match group, looked up in
     ``MT_SYNTAX_BRUSH_MAP`` with itself as the fallback.
     """
     for token in html5lib_Filter.__iter__(self):
         if token["type"] == "StartTag" and token["name"] == "pre":
             attrs = dict(token["data"])
             # Loop over a copy because ``attrs`` is modified below.
             for (namespace, name), value in attrs.copy().items():
                 if name == "function" and value:
                     m = MT_SYNTAX_RE.match(value)
                     if m:
                         lang = m.group(1).lower()
                         brush = MT_SYNTAX_BRUSH_MAP.get(lang, lang)
                         attrs[(namespace, "class")] = "brush: %s" % brush
                         # Remove the matched key itself; a hard-coded None
                         # namespace raises KeyError for namespaced attributes.
                         del attrs[(namespace, name)]
                         token["data"] = attrs
         yield token
Beispiel #35
0
 def __iter__(self):
     """Yield all tokens, turning ``<pre function=...>`` into a brush class.

     If the ``function`` attribute of a ``<pre>`` start tag matches
     ``MT_SYNTAX_RE``, it is removed and ``class`` is set to
     ``brush: <name>``. The name is the lower-cased first match group,
     translated through ``MT_SYNTAX_BRUSH_MAP`` when it has an entry.
     """
     for token in html5lib_Filter.__iter__(self):
         if token['type'] == 'StartTag' and token['name'] == 'pre':
             attrs = dict(token['data'])
             # Loop over a copy because ``attrs`` is modified below.
             for (namespace, name), value in attrs.copy().items():
                 if name == 'function' and value:
                     m = MT_SYNTAX_RE.match(value)
                     if m:
                         lang = m.group(1).lower()
                         brush = MT_SYNTAX_BRUSH_MAP.get(lang, lang)
                         attrs[(namespace, u'class')] = "brush: %s" % brush
                         # Remove the matched key itself; a hard-coded None
                         # namespace raises KeyError for namespaced attributes.
                         del attrs[(namespace, name)]
                         token['data'] = attrs
         yield token
Beispiel #36
0
 def __iter__(self):
     """Yield all tokens, replacing links by their target as plain text.

     For an ``<a>`` start tag with an ``href``, a ``Characters`` token
     holding the ``href`` (with ``mailto:`` removed) is emitted, and the
     link's own text is suppressed until the closing tag. Anchor tags
     themselves are never emitted.

     Anchors without an ``href`` (for example ``<a name="x">``) used to raise
     ``KeyError``. They are now treated as non-links: their tags are dropped
     and their text is kept.
     """
     in_link = False
     for token in Filter.__iter__(self):
         if token.get('name') == 'a':
             if token['type'] == 'StartTag':
                 href = token['data'].get((None, 'href'))
                 if href is not None:
                     in_link = True
                     yield {
                         'type': 'Characters',
                         'data': href.replace('mailto:', ''),
                     }
             elif token['type'] == 'EndTag':
                 in_link = False
         elif token['type'] == 'Characters':
             if not in_link:
                 yield token
         else:
             yield token
Beispiel #37
0
 def __iter__(self):
     """Yield all tokens, sanitising ``<iframe>`` elements.

     An iframe's ``src`` is blanked when ``self.validate_src`` rejects it.
     Tokens between the iframe start and end tags are dropped; the tags
     themselves are emitted.
     """
     in_iframe = False
     for token in html5lib_Filter.__iter__(self):
         kind = token["type"]

         if kind == "StartTag" and token["name"] == "iframe":
             in_iframe = True
             attrs = dict(token["data"])
             for (namespace, name), value in attrs.items():
                 if name == "src" and value and not self.validate_src(value):
                     attrs[(namespace, "src")] = ""
             # The copy only replaces the token data when there were attributes.
             if attrs:
                 token["data"] = attrs
             yield token
             continue

         if kind == "EndTag" and token["name"] == "iframe":
             in_iframe = False
         if not in_iframe:
             yield token
Beispiel #38
0
 def __iter__(self):
     """Yield all tokens, sanitising ``<iframe>`` elements.

     The ``src`` of an iframe is replaced by an empty string when
     ``self.validate_src`` returns a falsy value for it. Content between the
     iframe's start and end tags is discarded; both tags are emitted.
     """
     in_iframe = False
     for token in html5lib_Filter.__iter__(self):
         kind = token['type']

         if kind == 'StartTag' and token['name'] == 'iframe':
             in_iframe = True
             attrs = dict(token['data'])
             for (namespace, name), value in attrs.items():
                 if name == 'src' and value and not self.validate_src(value):
                     attrs[(namespace, 'src')] = ''
             # The copy only replaces the token data when there were attributes.
             if attrs:
                 token['data'] = attrs
             yield token
             continue

         if kind == 'EndTag' and token['name'] == 'iframe':
             in_iframe = False
         if not in_iframe:
             yield token
Beispiel #39
0
 def __iter__(self):
     """Yield all tokens, unwrapping ``<a>`` elements without a valid ``href``.

     An anchor is kept only when it has a non-empty ``href`` accepted by
     ``is_valid_url``. Otherwise its start tag and the next anchor end tag
     are dropped, while the tokens between them are kept.
     """
     href_key = (None, 'href')
     remove_end_tag = False
     for token in Filter.__iter__(self):
         # only check anchor tags
         if token.get('name') == 'a' and token['type'] in ('StartTag', 'EndTag'):
             if token['type'] == 'StartTag':
                 attrs = token['data']
                 has_valid_href = (
                     href_key in attrs and
                     attrs[href_key] != '' and
                     is_valid_url(attrs[href_key])
                 )
                 remove_end_tag = not has_valid_href
                 if remove_end_tag:
                     continue
             elif remove_end_tag:
                 remove_end_tag = False
                 continue
         yield token
Beispiel #40
0
 def __iter__(self):
     """Yield all tokens, sanitising ``<iframe>`` elements.

     An iframe's ``src`` is blanked unless it matches the ``self.hosts``
     regular expression. Tokens between the iframe start and end tags are
     dropped; the tags themselves are emitted.
     """
     in_iframe = False
     for token in html5lib_Filter.__iter__(self):
         kind = token['type']

         if kind == 'StartTag' and token['name'] == 'iframe':
             in_iframe = True
             attrs = dict(token['data'])
             for (namespace, name), value in attrs.items():
                 if name == 'src' and value and not re.search(self.hosts, value):
                     attrs[(namespace, 'src')] = ''
             # The copy only replaces the token data when there were attributes.
             if attrs:
                 token['data'] = attrs
             yield token
             continue

         if kind == 'EndTag' and token['name'] == 'iframe':
             in_iframe = False
         if not in_iframe:
             yield token
Beispiel #41
0
    def __iter__(self):
        """Yield all tokens except HTML ``<object>`` tags and their ``<param>``s.

        The start and end tags of ``object`` elements are removed while their
        other content is kept. ``param`` empty tags are removed while at least
        one ``object`` is open.
        """
        html_ns = namespaces['html']
        depth = 0  # number of currently open <object> elements
        for token in BaseFilter.__iter__(self):
            kind = token['type']

            if kind == 'EmptyTag':
                # Drop <param> when inside <object>. We don't handle nesting
                # properly, but they're not valid anywhere else so that's not
                # a problem.
                if depth >= 1 and token['name'] == 'param' and token['namespace'] == html_ns:
                    continue
            elif kind in ('StartTag', 'EndTag'):
                if token['name'] == 'object' and token['namespace'] == html_ns:
                    depth += 1 if kind == 'StartTag' else -1
                    continue

            yield token
Beispiel #42
0
 def __iter__(self):
     """Yield all tokens outside elements listed in ``self._elide_tags``.

     A start tag whose ``(namespace, name)`` pair is in ``self._elide_tags``
     opens an elided region. Everything is dropped until the matching end
     tag; nested elements with the same name and namespace are counted.
     """
     depth = 0
     elided_name = None
     elided_ns = None
     for token in BaseFilter.__iter__(self):
         kind = token['type']

         if depth:
             same_element = (
                 kind in ('StartTag', 'EndTag') and
                 token['name'] == elided_name and
                 token['namespace'] == elided_ns
             )
             if same_element:
                 depth += 1 if kind == 'StartTag' else -1
             continue  # Drop the token

         if kind == 'StartTag' and (token['namespace'], token['name']) in self._elide_tags:
             depth = 1
             elided_name = token['name']
             elided_ns = token['namespace']
             continue  # Drop this token.

         yield token
Beispiel #43
0
    def __iter__(self):
        """Yield all tokens, inserting an "Edit" link after section start tags.

        After each ``StartTag`` named in ``SECTION_TAGS`` that has a non-empty
        ``id`` attribute, an ``<a class="edit-section">`` element is emitted.
        ``href`` is the ``wiki.edit`` URL and ``data-section-src-url`` the
        ``wiki.document`` URL, each carrying the section id in its query
        string.
        """
        for token in html5lib_Filter.__iter__(self):
            yield token

            if token['type'] != 'StartTag' or token['name'] not in SECTION_TAGS:
                continue

            for (namespace, name), value in dict(token['data']).items():
                if name != 'id' or not value:
                    continue

                title = ugettext('Edit section')

                # URL of the raw section source.
                doc_path = reverse('wiki.document',
                                   args=[self.slug],
                                   locale=self.locale)
                doc_query = urlencode({'section': value.encode('utf-8'),
                                       'raw': 'true'})
                src_url = u'%s?%s' % (doc_path, doc_query)

                # URL of the section editor.
                edit_path = reverse('wiki.edit',
                                    args=[self.slug],
                                    locale=self.locale)
                edit_query = urlencode({'section': value.encode('utf-8'),
                                        'edit_links': 'true'})
                href = u'%s?%s' % (edit_path, edit_query)

                link_tokens = (
                    {'type': 'StartTag',
                     'name': 'a',
                     'data': {
                         (None, u'title'): title,
                         (None, u'class'): 'edit-section',
                         (None, u'data-section-id'): value,
                         (None, u'data-section-src-url'): src_url,
                         (None, u'href'): href,
                     }},
                    {'type': 'Characters',
                     'data': ugettext(u'Edit')},
                    {'type': 'EndTag', 'name': 'a'},
                )
                for link_token in link_tokens:
                    yield link_token
Beispiel #44
0
 def __init__(self, source, hosts):
     """Initialize the filter over ``source`` and store ``hosts``.

     ``source`` is handed to ``html5lib_Filter.__init__``; ``hosts`` is kept
     as ``self.hosts``. How ``hosts`` is used is not visible in this method.
     """
     html5lib_Filter.__init__(self, source)
     self.hosts = hosts
Beispiel #45
0
 def __iter__(self):
     """Yield every token, renaming h1/h2/h3 start and end tags to h4.

     Tokens are modified in place before being yielded.
     """
     demoted = ('h1', 'h2', 'h3')
     for token in Filter.__iter__(self):
         kind = token['type']
         is_tag = kind == 'StartTag' or kind == 'EndTag'
         if is_tag and token['name'] in demoted:
             token['name'] = 'h4'
         yield token
Beispiel #46
0
 def __init__(self, source, patterns):
     """Initialize the filter over ``source`` and store ``patterns``.

     ``patterns`` is kept as ``self.allowed_src_patterns``; its expected
     format is not visible in this method.
     """
     html5lib_Filter.__init__(self, source)
     self.allowed_src_patterns = patterns
Beispiel #47
0
 def __init__(self, source, base_url):
     """Initialize the filter over ``source`` and remember ``base_url``.

     The URL is stored both as given (``self.base_url``) and as the result
     of ``urlparse(base_url)`` (``self.base_url_parsed``).
     """
     html5lib_Filter.__init__(self, source)
     self.base_url = base_url
     # Parse once here so later code can read the parsed parts directly.
     self.base_url_parsed = urlparse(base_url)
Beispiel #48
0
    def __iter__(self):
        """Yield the token stream with ``class``/``rel`` added to links.

        Works in two passes over the buffered stream:

        1. Collect the ``href`` of every ``<a>`` start tag. Links whose href
           starts with one of ``self.EXTERNAL_PREFIXES`` get class
           ``external`` and rel ``noopener``. Links containing ``/docs/``
           (other than the ``DOC_SPECIAL_PATHS`` exemptions) are checked
           against ``Document`` rows; those with no matching document get
           class ``new`` and rel ``nofollow``.
        2. Re-emit the buffered tokens, merging the collected values into
           each link's existing ``class`` and ``rel`` attributes.
        """
        # NOTE(review): imported here rather than at module level, presumably
        # to avoid a circular import; confirm before moving it.
        from kuma.wiki.models import Document

        input = html5lib_Filter.__iter__(self)

        # Pass #1: Gather all the link URLs and prepare annotations
        links = {}
        buffer = []
        for token in input:
            buffer.append(token)
            if token['type'] == 'StartTag' and token['name'] == 'a':
                for (namespace, name), value in token['data'].items():
                    if name == 'href':
                        href = value
                        href_parsed = urlparse(href)
                        if href_parsed.netloc == self.base_url_parsed.netloc:
                            # Squash site-absolute URLs to site-relative paths.
                            href = href_parsed.path

                        # Prepare annotations record for this path.
                        links[href] = {'classes': [], 'rel': []}

        # Maps lowercased locale -> lowercased slug -> set of hrefs using it.
        needs_existence_check = defaultdict(lambda: defaultdict(set))

        # Run through all the links and check for annotatable conditions.
        for href in links.keys():

            # Is this an external URL?
            is_external = False
            for prefix in self.EXTERNAL_PREFIXES:
                if href.startswith(prefix):
                    is_external = True
                    break
            if is_external:
                links[href]['classes'].append('external')
                # https://mathiasbynens.github.io/rel-noopener/
                links[href]['rel'].append('noopener')
                continue

            # TODO: Should this also check for old-school mindtouch URLs? Or
            # should we encourage editors to convert to new-style URLs to take
            # advantage of link annotation? (I'd say the latter)

            # Is this a kuma doc URL?
            if '/docs/' in href:

                # Check if this is a special docs path that's exempt from "new"
                skip = False
                for path in DOC_SPECIAL_PATHS:
                    if '/docs/%s' % path in href:
                        skip = True
                if skip:
                    continue

                # Everything before the first '/docs/' is treated as the
                # locale part, everything after it as the document path.
                href_locale, href_path = href.split(u'/docs/', 1)
                if href_locale.startswith(u'/'):
                    href_locale = href_locale[1:]

                if '#' in href_path:
                    # If present, discard the hash anchor
                    href_path, _, _ = href_path.partition('#')

                # Handle any URL-encoded UTF-8 characters in the path
                # NOTE(review): this encode/unquote/decode round-trip looks
                # written for Python 2's unquote; confirm it behaves as
                # intended on the interpreter in use.
                href_path = href_path.encode('utf-8', 'ignore')
                href_path = unquote(href_path)
                href_path = href_path.decode('utf-8', 'ignore')

                # Try to sort out the locale and slug through some of our
                # redirection logic.
                locale, slug, needs_redirect = (
                    locale_and_slug_from_path(href_path,
                                              path_locale=href_locale))

                # Gather up this link for existence check
                needs_existence_check[locale.lower()][slug.lower()].add(href)

        # Perform existence checks for all the links, using one DB query per
        # locale for all the candidate slugs.
        for locale, slug_hrefs in needs_existence_check.items():

            existing_slugs = (Document.objects
                              .filter(locale=locale,
                                      slug__in=slug_hrefs.keys())
                              .values_list('slug', flat=True))

            # Remove the slugs that pass existence check.
            check_collation = False
            for slug in existing_slugs:
                lslug = slug.lower()
                try:
                    del slug_hrefs[lslug]
                except KeyError:
                    # Same slug by MySQL collation rules
                    check_collation = True

            # Some slugs are matched by collation rules, so use single checks
            if check_collation:
                # Collect first, delete afterwards, so the dict is not
                # modified while it is being iterated.
                to_delete = set()
                for slug in slug_hrefs.keys():
                    match = Document.objects.filter(locale=locale, slug=slug)
                    if match.exists():
                        to_delete.add(slug)
                for slug in to_delete:
                    del slug_hrefs[slug]

            # Mark all the links whose slugs did not come back from the DB
            # query as "new"
            for slug, hrefs in slug_hrefs.items():
                for href in hrefs:
                    links[href]['classes'].append('new')
                    links[href]['rel'].append('nofollow')

        # Pass #2: Filter the content, annotating links
        for token in buffer:
            if token['type'] == 'StartTag' and token['name'] == 'a':
                attrs = dict(token['data'])
                names = [name for (namespace, name) in attrs.keys()]
                # Iterate over a copy because add_to_attr may insert new keys
                # into attrs during the loop.
                for (namespace, name), value in attrs.copy().items():
                    if name == 'href':
                        # Recompute the same key that pass #1 stored in links.
                        href = value
                        href_parsed = urlparse(value)
                        if href_parsed.netloc == self.base_url_parsed.netloc:
                            # Squash site-absolute URLs to site-relative paths.
                            href = href_parsed.path

                        # Update attributes on this link element.
                        def add_to_attr(attr_name, add_list):
                            """Add values to the attribute dictionary."""
                            # The attribute is looked up and written under
                            # the namespace of the href being processed.
                            if attr_name in names:
                                values = set(
                                    attrs[(namespace, attr_name)].split(u' '))
                            else:
                                values = set()
                            values.update(add_list)
                            if values:
                                attrs[(namespace, attr_name)] = (
                                    u' '.join(sorted(values)))

                        add_to_attr(u'class', links[href]['classes'])
                        add_to_attr(u'rel', links[href]['rel'])

                token['data'] = attrs

            yield token
Beispiel #49
0
 def __init__(self, source):
     """Initialize the filter over ``source`` with empty ID bookkeeping.

     Sets ``self.id_cnt`` to 0 and ``self.known_ids`` to an empty set.
     """
     html5lib_Filter.__init__(self, source)
     self.id_cnt = 0
     self.known_ids = set()
Beispiel #50
0
    def __iter__(self):
        """Yield the token stream with IDs added to section/heading tags.

        The stream is buffered first so that every ``id`` and ``name`` value
        already present is recorded in ``self.known_ids``. It is then
        replayed, and each start tag in ``SECTION_TAGS`` is handled as
        follows:

        * a ``name`` attribute is copied into ``id``;
        * otherwise, a tag outside ``HEAD_TAGS`` with no ``id`` gets one
          from ``self.gen_id()``;
        * otherwise (a heading without ``name``) the tag is handed to
          ``self.process_header`` and the tokens it returns are emitted.
        """
        stream = html5lib_Filter.__iter__(self)

        # Pass 1: buffer everything and record the IDs already in use.
        pending = []
        for token in stream:
            pending.append(token)
            if token['type'] != 'StartTag':
                continue
            for (namespace, attr), attr_value in dict(token['data']).items():
                # 'name' is collected as well because it is later treated
                # as a manual override for the ID.
                if attr == 'id' and token['name'] not in HEAD_TAGS:
                    self.known_ids.add(attr_value)
                elif attr == 'name':
                    self.known_ids.add(attr_value)

        # Pass 2: replay the buffer, adding IDs where they are needed.
        name_key = (None, 'name')
        while pending:
            token = pending.pop(0)

            is_section_start = (token['type'] == 'StartTag' and
                                token['name'] in SECTION_TAGS)
            if not is_section_start:
                # Anything else passes through untouched.
                yield token
                continue

            # Work on a dict of the attributes and test the keys directly;
            # the tag may have no attributes at all.
            attrs = dict(token['data'])

            if name_key in attrs:
                # An author-supplied 'name' becomes the ID.
                attrs[(None, u'id')] = attrs[name_key]
                token['data'] = attrs
                yield token
            elif token['name'] not in HEAD_TAGS:
                # Non-heading section tag: generate an ID only if missing.
                if (None, 'id') not in attrs:
                    attrs[(None, u'id')] = self.gen_id()
                    token['data'] = attrs
                yield token
            else:
                # Heading without 'name': process_header receives the rest
                # of the buffer and returns the remaining buffer together
                # with the tokens to emit for this heading.
                pending, header_tokens = self.process_header(token, pending)
                for header_token in header_tokens:
                    yield header_token
Beispiel #51
0
 def __init__(self, source, slug, locale):
     """Initialize the filter over ``source`` and store ``slug``/``locale``.

     Both values are kept unchanged as ``self.slug`` and ``self.locale``.
     """
     html5lib_Filter.__init__(self, source)
     self.slug = slug
     self.locale = locale
Beispiel #52
0
    def __iter__(self):
        """Turn the heading tokens of the stream into nested list tokens.

        For each heading start tag in ``HEAD_TAGS_TOC`` whose level is not
        above ``self.max_level``, ``<ol>``/``<li>`` tokens are queued to
        follow the change in heading level. If the heading has an ``id``,
        the queued tokens are emitted followed by ``<li><a href="#id">``.
        While inside a heading, text tokens and tags in ``TAGS_IN_TOC`` are
        passed through. At the end of the stream one ``</ol></li>`` pair is
        emitted per level still open.
        """
        tokens = html5lib_Filter.__iter__(self)

        self.skip_header = False

        for token in tokens:
            kind = token['type']

            if kind == 'StartTag':
                tag = token['name']
                if tag in HEAD_TAGS_TOC:
                    level = int(LEVEL_RE.match(tag).group(1))
                    if level > self.max_level:
                        # Too deep for the TOC: ignore until its end tag.
                        self.skip_header = True
                        continue

                    self.in_header = True
                    pending = []

                    if level > self.level:
                        # Going deeper: open one list per level of descent.
                        steps = level - self.level
                        for step in range(steps):
                            even_step = step % 2 == 0
                            if not self.in_hierarchy and even_step:
                                pending.append({'type': 'StartTag',
                                                'name': 'li',
                                                'data': {}})
                            pending.append({'type': 'StartTag',
                                            'name': 'ol',
                                            'data': {}})
                            if steps > 1 and even_step and step != steps - 1:
                                pending.append({'type': 'StartTag',
                                                'name': 'li',
                                                'data': {}})
                            self.open_level += 1
                        self.level = level
                    elif level < self.level:
                        # Coming back up: close one list per level.
                        for _ in range(self.level - level):
                            pending.append({'type': 'EndTag', 'name': 'ol'})
                            pending.append({'type': 'EndTag', 'name': 'li'})
                            self.open_level -= 1
                        self.level = level

                    anchor = dict(token['data']).get((None, 'id'))
                    if anchor:
                        pending.append(
                            {'type': 'StartTag', 'name': 'li', 'data': {}})
                        pending.append(
                            {'type': 'StartTag', 'name': 'a',
                             'data': {(None, u'rel'): 'internal',
                                      (None, u'href'): '#%s' % anchor}})
                        self.in_hierarchy = True
                        # Queued tokens are only emitted for headings that
                        # have an id.
                        for queued in pending:
                            yield queued
                elif (tag in TAGS_IN_TOC and
                        self.in_header and
                        not self.skip_header):
                    yield token

            elif kind in ("Characters", "SpaceCharacters"):
                if self.in_header:
                    yield token

            elif kind == 'EndTag':
                tag = token['name']
                if tag in TAGS_IN_TOC and self.in_header:
                    yield token
                elif tag in HEAD_TAGS_TOC:
                    level = int(LEVEL_RE.match(tag).group(1))
                    if level > self.max_level:
                        self.skip_header = False
                        continue
                    self.in_header = False
                    yield {'type': 'EndTag', 'name': 'a'}

        # Close whatever lists are still open at the end of the stream.
        for _ in range(self.open_level):
            yield {'type': 'EndTag', 'name': 'ol'}
            yield {'type': 'EndTag', 'name': 'li'}
Beispiel #53
0
    def __iter__(self):
        """Extract or replace the section identified by ``self.section_id``.

        A section starts at a start tag that has ``self.section_id`` among
        its attribute values. If ``self._isSection`` accepts the tag, the
        section is that element's children (explicit section). If
        ``self._isHeading`` accepts it, the section runs from the heading to
        the next sibling heading of equal or higher rank, or to the end of
        the parent (implicit section).

        Without ``self.replace_source`` only the section's tokens are
        yielded. With it, tokens outside the section are yielded and the
        section is replaced once by the tokens of ``self.replace_source``.
        If ``self.ignore_heading`` is set, the first heading in the section
        is left out of an extraction and kept as-is in a replacement.
        """
        for token in html5lib_Filter.__iter__(self):
            kind = token['type']

            # A deferred explicit section starts with this token.
            if self.next_in_section:
                self.next_in_section = False
                self.in_section = True

            if kind == 'StartTag':
                attr_values = dict(token['data']).values()
                self.open_level += 1

                if self.section_id in attr_values:
                    # This tag carries the ID we are looking for.
                    if self.parent_level is None and self._isSection(token):
                        # Explicit section: take the children only, so the
                        # section really starts with the next token.
                        self.parent_level = self.open_level
                        self.next_in_section = True
                    elif self.heading is None and self._isHeading(token):
                        # Implicit section: starts at the heading itself.
                        self.heading = token
                        self.heading_rank = self._getHeadingRank(token)
                        self.parent_level = self.open_level - 1
                        self.in_section = True
                elif (self.heading is not None and
                        self.open_level - 1 == self.parent_level):
                    # Sibling of the heading that opened an implicit
                    # section: a heading of equal or higher rank ends it.
                    if (self._isHeading(token) and
                            self._getHeadingRank(token) <= self.heading_rank):
                        self.in_section = False

                # Remember the first heading of the section when it is
                # supposed to be left out.
                if (self.in_section and
                        self.ignore_heading and
                        not self.already_ignored_header and
                        not self.heading_to_ignore and
                        self._isHeading(token)):
                    self.heading_to_ignore = token

            elif kind == 'EndTag':
                self.open_level -= 1

                # Leaving the section's parent ends the section, whether it
                # is explicit or implicit.
                if (self.parent_level is not None and
                        self.open_level < self.parent_level):
                    self.in_section = False

            if self.replace_source:
                # Replacement mode: keep everything outside the section and
                # any ignored heading, and swap the rest of the section for
                # the replacement stream exactly once.
                if self.heading_to_ignore or not self.in_section:
                    yield token
                elif not self.replacement_emitted:
                    for replacement in self.replace_source:
                        yield replacement
                    self.replacement_emitted = True
            elif self.in_section and not self.heading_to_ignore:
                # Extraction mode: emit section tokens, minus any heading
                # that is currently being ignored.
                yield token

            # The end tag of the ignored heading clears the ignore state.
            if (kind == 'EndTag' and
                    self.in_section and
                    self.ignore_heading and
                    not self.already_ignored_header and
                    self.heading_to_ignore and
                    self._isHeading(token) and
                    token['name'] == self.heading_to_ignore['name']):
                self.heading_to_ignore = None
                self.already_ignored_header = True
Beispiel #54
0
 def __iter__(self):
     """Yield every token except those of type ``SpaceCharacters``."""
     for token in html5lib_Filter.__iter__(self):
         if token['type'] != 'SpaceCharacters':
             yield token