Example #1
0
 def __init__(self, source):
     """Wrap ``source`` and set the filter's counters and flags.

     :param source: token stream passed on to ``html5lib_Filter.__init__``.
     """
     html5lib_Filter.__init__(self, source)
     # Starting values: level 2, with 3 stored as the maximum level.
     self.level, self.max_level = 2, 3
     self.in_header = False
     self.open_level = 0
     self.in_hierarchy = False
Example #2
0
    def __init__(self, source, id, replace_source=None):
        """Wrap ``source`` and reset the per-section tracking state.

        :param source: token stream passed on to ``html5lib_Filter.__init__``.
        :param id: stored as ``self.section_id``.
        :param replace_source: stored as ``self.replace_source``; defaults
            to ``None``.
        """
        html5lib_Filter.__init__(self, source)

        self.replace_source, self.section_id = replace_source, id

        # Nothing has been seen yet: no heading, no levels, no flags raised.
        self.heading = self.heading_rank = None
        self.open_level = 0
        self.parent_level = None
        self.in_section = self.next_in_section = False
        self.replacement_emitted = False
Example #3
0
    def __init__(self, source, id, replace_source=None):
        """Set up the filter around ``source`` with a clean section state.

        :param source: token stream passed on to ``html5lib_Filter.__init__``.
        :param id: kept on the instance as ``section_id``.
        :param replace_source: kept on the instance as ``replace_source``
            (``None`` when not supplied).
        """
        html5lib_Filter.__init__(self, source)

        self.replace_source, self.section_id = replace_source, id

        # Initial state: no heading recorded, level counters unset, all
        # boolean flags lowered.
        self.heading = self.heading_rank = None
        self.open_level = 0
        self.parent_level = None
        self.in_section = self.next_in_section = False
        self.replacement_emitted = False
Example #4
0
    def __iter__(self):
        """Yield table-of-contents tokens derived from heading tokens.

        For each start tag named in ``HEAD_TAGS_TOC`` the level is read from
        the tag name and compared with ``self.level``: a deeper heading
        queues one ``<ol>`` start tag per level of difference, a shallower
        one queues an ``</li></ol>`` pair per level.  If the heading has an
        ``id`` attribute, the queued tokens are emitted, followed by an
        ``<li>`` start tag and an ``<a rel="internal" href="#id">`` start
        tag.  Character tokens are passed through while inside a heading,
        and a heading end tag is replaced by ``</a>``.  All other tokens
        are dropped.
        """
        # Compiled once per pass instead of once per heading token.
        heading_name = re.compile(r'^h(\d)$')

        for token in html5lib_Filter.__iter__(self):
            token_type = token['type']

            if token_type == 'StartTag' and token['name'] in HEAD_TAGS_TOC:
                self.in_header = True
                level = int(heading_name.match(token['name']).group(1))

                out = []
                if level > self.level:
                    # Deeper heading: open one list per level of difference.
                    for _ in range(level - self.level):
                        out.append(
                            {'type': 'StartTag', 'name': 'ol', 'data': {}})
                elif level < self.level:
                    # Shallower heading: close one item and list per level.
                    for _ in range(self.level - level):
                        out.append({'type': 'EndTag', 'name': 'li'})
                        out.append({'type': 'EndTag', 'name': 'ol'})
                self.level = level

                anchor = dict(token['data']).get('id')
                # NOTE(review): when the heading has no id, the list tokens
                # queued above are discarded even though self.level has
                # already moved -- confirm this is intended.
                if anchor:
                    out.append({'type': 'StartTag', 'name': 'li', 'data': {}})
                    out.append({
                        'type': 'StartTag',
                        'name': 'a',
                        'data': {
                            'rel': 'internal',
                            'href': '#%s' % anchor,
                        },
                    })
                    for queued in out:
                        yield queued

            elif token_type == 'Characters' and self.in_header:
                yield token

            elif token_type == 'EndTag' and token['name'] in HEAD_TAGS_TOC:
                self.in_header = False
                # NOTE(review): emitted for every heading end tag, including
                # headings for which no <a> was opened (no id).
                yield {'type': 'EndTag', 'name': 'a'}
Example #5
0
    def __iter__(self):
        """Yield the stream with relative addresses prefixed by the base URL.

        Only start tags named in ``self.tag_attributes`` are touched, and
        only the attribute that mapping names for the tag.  A value that
        starts with "http" is left alone.  Otherwise a value starting with
        "//" or "{{" is kept as it is, a value starting with "/" gets
        ``self.base_url`` prepended, and anything else gets
        ``self.base_url`` plus "/" prepended.
        """
        for token in html5lib_Filter.__iter__(self):
            if token["type"] == "StartTag" and token["name"] in self.tag_attributes:
                attrs = dict(token["data"])
                target = self.tag_attributes[token["name"]]

                if target in attrs:
                    address = attrs[target]
                    if not address.startswith("http"):
                        if address.startswith(("//", "{{")):
                            # Protocol-relative address or apparent template
                            # variable output: keep untouched
                            rewritten = address
                        elif address.startswith("/"):
                            # Already rooted, only the base url is missing
                            rewritten = self.base_url + address
                        else:
                            rewritten = self.base_url + "/" + address
                        attrs[target] = rewritten
                        token["data"] = attrs.items()

            yield token
Example #6
0
    def __iter__(self):
        """Yield the stream with relative addresses prefixed by the base URL.

        For start tags named in ``self.tag_attributes``, the first attribute
        whose local name equals the one mapped for that tag is examined and
        the search stops there.  A value starting with "http" is left alone.
        Otherwise "//" and "{{" values are kept, "/" values get
        ``self.base_url`` prepended, and other values get ``self.base_url``
        plus "/" prepended.
        """
        for token in html5lib_Filter.__iter__(self):
            is_candidate = (token['type'] == 'StartTag'
                            and token['name'] in self.tag_attributes)
            if is_candidate:
                attrs = dict(token['data'])
                wanted = self.tag_attributes[token['name']]

                for key, value in attrs.items():
                    namespace, name = key
                    if wanted != name:
                        continue
                    if not value.startswith('http'):
                        if value.startswith(('//', '{{')):
                            # Protocol-relative address or apparent template
                            # variable output: keep untouched
                            rewritten = value
                        elif value.startswith('/'):
                            # Already rooted, only the base url is missing
                            rewritten = self.base_url + value
                        else:
                            rewritten = self.base_url + '/' + value
                        attrs[(namespace, name)] = rewritten
                        token['data'] = attrs
                    # Only the first matching attribute is considered
                    break

            yield token
Example #7
0
    def __iter__(self):
        """Yield the stream, turning relative addresses into absolute ones.

        Applies to start tags listed in ``self.tag_attributes``.  The first
        attribute whose local name matches the mapped one is inspected, then
        the scan ends.  Values beginning with "http" are untouched; values
        beginning with "//" or "{{" are kept; values beginning with "/" get
        ``self.base_url`` prepended; the rest get ``self.base_url`` and "/"
        prepended.
        """
        for token in html5lib_Filter.__iter__(self):
            is_candidate = (token['type'] == 'StartTag' and
                            token['name'] in self.tag_attributes)
            if is_candidate:
                attrs = dict(token['data'])
                wanted = self.tag_attributes[token['name']]

                for key, value in attrs.items():
                    namespace, name = key
                    if wanted != name:
                        continue
                    if not value.startswith('http'):
                        if value.startswith(('//', '{{')):
                            # Protocol-relative address or apparent template
                            # variable output: keep untouched
                            rewritten = value
                        elif value.startswith('/'):
                            # Already rooted, only the base url is missing
                            rewritten = self.base_url + value
                        else:
                            rewritten = self.base_url + '/' + value
                        attrs[(namespace, name)] = rewritten
                        token['data'] = attrs
                    # Only the first matching attribute is considered
                    break

            yield token
Example #8
0
    def __iter__(self):
        """Yield the stream, turning relative addresses into absolute ones.

        Applies to start tags listed in ``self.tag_attributes``.  The first
        attribute whose local name matches the mapped one is inspected, then
        the scan ends.  Values beginning with "http" are untouched; values
        beginning with "//" or "{{" are kept; values beginning with "/" get
        ``self.base_url`` prepended; the rest get ``self.base_url`` and "/"
        prepended.
        """
        for token in html5lib_Filter.__iter__(self):
            is_candidate = (
                token["type"] == "StartTag"
                and token["name"] in self.tag_attributes
            )
            if is_candidate:
                attrs = dict(token["data"])
                wanted = self.tag_attributes[token["name"]]

                for key, value in attrs.items():
                    namespace, name = key
                    if wanted != name:
                        continue
                    if not value.startswith("http"):
                        if value.startswith(("//", "{{")):
                            # Protocol-relative address or apparent template
                            # variable output: keep untouched
                            rewritten = value
                        elif value.startswith("/"):
                            # Already rooted, only the base url is missing
                            rewritten = self.base_url + value
                        else:
                            rewritten = self.base_url + "/" + value
                        attrs[(namespace, name)] = rewritten
                        token["data"] = attrs
                    # Only the first matching attribute is considered
                    break

            yield token
Example #9
0
    def __iter__(self):
        """Yield the stream with relative addresses prefixed by the base URL.

        Only start tags named in ``self.tag_attributes`` are touched, and
        only the attribute that mapping names for the tag.  A value that
        starts with "http" is left alone.  Otherwise a value starting with
        "//" or "{{" is kept as it is, a value starting with "/" gets
        ``self.base_url`` prepended, and anything else gets
        ``self.base_url`` plus "/" prepended.
        """
        for token in html5lib_Filter.__iter__(self):
            if (token['type'] == 'StartTag'
                    and token['name'] in self.tag_attributes):
                attrs = dict(token['data'])
                target = self.tag_attributes[token['name']]

                if target in attrs:
                    address = attrs[target]
                    if not address.startswith('http'):
                        if address.startswith(('//', '{{')):
                            # Protocol-relative address or apparent template
                            # variable output: keep untouched
                            rewritten = address
                        elif address.startswith('/'):
                            # Already rooted, only the base url is missing
                            rewritten = self.base_url + address
                        else:
                            rewritten = self.base_url + '/' + address
                        attrs[target] = rewritten
                        token['data'] = attrs.items()

            yield token
Example #10
0
        def __iter__(self):
            """Yield every token except the html/head/body tag tokens.

            Start, empty and end tags named ``html``, ``head`` or ``body``
            are skipped; everything else is passed through.
            """
            tag_types = ('StartTag', 'EmptyTag', 'EndTag')
            wrapper_tags = ('html', 'head', 'body')
            for token in HTML5LibFilterBase.__iter__(self):
                is_tag = token['type'] in tag_types
                if is_tag and token['name'] in wrapper_tags:
                    continue
                yield token
Example #11
0
    def __iter__(self):
        """Yield the stream with ``on*`` attributes removed from start tags.

        The attribute data of each start tag is replaced by a list of
        ``(name, value)`` pairs that leaves out every attribute whose name
        starts with "on".  Other tokens pass through untouched.
        """
        for token in html5lib_Filter.__iter__(self):
            if token["type"] == "StartTag":
                kept = []
                for attr_name, attr_value in dict(token["data"]).items():
                    if attr_name.startswith("on"):
                        continue
                    kept.append((attr_name, attr_value))
                token["data"] = kept

            yield token
Example #12
0
 def __iter__(self):
     """Yield the stream with ``on*`` attributes removed from start tags.

     The attribute mapping of each start tag is rebuilt without the
     entries whose local name starts with "on".  Other tokens pass through
     untouched.
     """
     for token in html5lib_Filter.__iter__(self):
         if token['type'] == 'StartTag':
             token['data'] = {
                 (namespace, name): value
                 for (namespace, name), value in token['data'].items()
                 if not name.startswith('on')
             }
         yield token
Example #13
0
 def __iter__(self):
     """Yield the stream, dropping start-tag attributes named ``on*``.

     Each start tag gets a fresh attribute mapping holding only the entries
     whose local name does not start with "on".  All other tokens are
     yielded as received.
     """
     for token in html5lib_Filter.__iter__(self):
         if token['type'] == 'StartTag':
             token['data'] = {
                 (namespace, name): value
                 for (namespace, name), value in token['data'].items()
                 if not name.startswith('on')
             }
         yield token
Example #14
0
 def __iter__(self):
     """Yield the stream, dropping start-tag attributes named ``on*``.

     Each start tag gets a fresh attribute mapping holding only the entries
     whose local name does not start with "on".  All other tokens are
     yielded as received.
     """
     for token in html5lib_Filter.__iter__(self):
         if token["type"] == "StartTag":
             token["data"] = {
                 (namespace, name): value
                 for (namespace, name), value in token["data"].items()
                 if not name.startswith("on")
             }
         yield token
Example #15
0
 def __iter__(self):
     """Yield the stream with disallowed ``<iframe>`` sources blanked.

     For each ``iframe`` start tag with a non-empty ``src``, the URL's
     network location is extracted with ``urlparse``.  If it is empty or
     not contained in ``self.hosts``, ``src`` is set to the empty string.
     The tag's attribute data is replaced by the items of the attribute
     dict either way.
     """
     for token in html5lib_Filter.__iter__(self):
         if token['type'] == 'StartTag' and token['name'] == 'iframe':
             attrs = dict(token['data'])
             src = attrs.get('src', '')
             if src:
                 host = urlparse(src).netloc
                 # Keep the source only when it names a host in self.hosts
                 if not (host and host in self.hosts):
                     attrs['src'] = ''
             token['data'] = attrs.items()
         yield token
Example #16
0
 def __iter__(self):
     """Yield the stream with blocked ``href`` values emptied on links.

     On every ``a`` start tag, each attribute whose local name is ``href``
     and whose non-empty value matches ``self.blocked_protocols`` (via
     ``re.search``) is set to the empty string.  All tokens are yielded.
     """
     for token in html5lib_Filter.__iter__(self):
         if token['type'] == 'StartTag' and token['name'] == 'a':
             attrs = dict(token['data'])
             for key, value in attrs.items():
                 namespace, name = key
                 blocked = (name == 'href' and value
                            and re.search(self.blocked_protocols, value))
                 if blocked:
                     attrs[(namespace, 'href')] = ''
                 token['data'] = attrs
         yield token
Example #17
0
 def __iter__(self):
     """Yield the stream with blocked ``href`` values emptied on links.

     On every ``a`` start tag, each attribute whose local name is ``href``
     and whose non-empty value matches ``self.blocked_protocols`` (via
     ``re.search``) is set to the empty string.  All tokens are yielded.
     """
     for token in html5lib_Filter.__iter__(self):
         if token["type"] == "StartTag" and token["name"] == "a":
             attrs = dict(token["data"])
             for key, value in attrs.items():
                 namespace, name = key
                 blocked = (
                     name == "href"
                     and value
                     and re.search(self.blocked_protocols, value)
                 )
                 if blocked:
                     attrs[(namespace, "href")] = ""
                 token["data"] = attrs
         yield token
Example #18
0
 def __iter__(self):
     """Yield every token, emptying link targets that use a blocked protocol.

     For ``a`` start tags, an ``href`` attribute with a non-empty value
     that ``re.search`` matches against ``self.blocked_protocols`` is
     replaced by the empty string.
     """
     for token in html5lib_Filter.__iter__(self):
         if token['type'] == 'StartTag' and token['name'] == 'a':
             attrs = dict(token['data'])
             for key, value in attrs.items():
                 namespace, name = key
                 blocked = (name == 'href' and value
                            and re.search(self.blocked_protocols, value))
                 if blocked:
                     attrs[(namespace, 'href')] = ''
                 token['data'] = attrs
         yield token
Example #19
0
    def __iter__(self):
        """Yield the stream with ``on*`` attributes removed from start tags.

        The attribute data of each start tag is replaced by a list of
        ``(name, value)`` pairs that leaves out every attribute whose name
        starts with "on".  Other tokens pass through untouched.
        """
        for token in html5lib_Filter.__iter__(self):
            if token['type'] == 'StartTag':
                kept = []
                for attr_name, attr_value in dict(token['data']).items():
                    if attr_name.startswith('on'):
                        continue
                    kept.append((attr_name, attr_value))
                token['data'] = kept

            yield token
Example #20
0
 def __iter__(self):
     """Yield every token, blanking ``<iframe>`` sources from unknown hosts.

     An ``iframe`` start tag with a non-empty ``src`` has that URL parsed
     with ``urlparse``.  If the network location is empty or not in
     ``self.hosts``, ``src`` becomes the empty string.  The tag's
     attribute data is always replaced by the items of the attribute dict.
     """
     for token in html5lib_Filter.__iter__(self):
         if token['type'] == 'StartTag' and token['name'] == 'iframe':
             attrs = dict(token['data'])
             src = attrs.get('src', '')
             if src:
                 host = urlparse(src).netloc
                 # Keep the source only when it names a host in self.hosts
                 if not (host and host in self.hosts):
                     attrs['src'] = ''
             token['data'] = attrs.items()
         yield token
Example #21
0
 def __iter__(self):
     """Yield the stream with ``function`` on ``<pre>`` turned into ``class``.

     For each ``pre`` start tag, a non-empty ``function`` attribute whose
     value matches ``MT_SYNTAX_RE`` is removed and replaced by a ``class``
     attribute of the form ``"brush: <name>"``.  The name is the
     lower-cased first match group, looked up in ``MT_SYNTAX_BRUSH_MAP``
     and used as-is when the map has no entry.  All tokens are yielded.
     """
     for token in html5lib_Filter.__iter__(self):
         if token['type'] == 'StartTag' and token['name'] == 'pre':
             attrs = dict(token['data'])
             # Iterate over a snapshot: the body adds and removes keys, and
             # mutating a dict while iterating its live view raises
             # RuntimeError on Python 3.
             for (namespace, name), value in list(attrs.items()):
                 if name == 'function' and value:
                     m = MT_SYNTAX_RE.match(value)
                     if m:
                         lang = m.group(1).lower()
                         brush = MT_SYNTAX_BRUSH_MAP.get(lang, lang)
                         attrs[(namespace, u'class')] = "brush: %s" % brush
                         # Remove the key that actually matched rather than
                         # assuming it lives in the None namespace.
                         del attrs[(namespace, name)]
                         token['data'] = attrs
         yield token
Example #22
0
 def __iter__(self):
     """Yield every token, converting ``<pre function=...>`` to a brush class.

     On ``pre`` start tags, a non-empty ``function`` attribute matching
     ``MT_SYNTAX_RE`` is dropped and a ``class`` attribute reading
     ``"brush: <name>"`` is set instead.  ``<name>`` is the lower-cased
     first match group mapped through ``MT_SYNTAX_BRUSH_MAP``, falling
     back to the group itself.
     """
     for token in html5lib_Filter.__iter__(self):
         if token['type'] == 'StartTag' and token['name'] == 'pre':
             attrs = dict(token['data'])
             # Iterate over a snapshot: the body adds and removes keys, and
             # mutating a dict while iterating its live view raises
             # RuntimeError on Python 3.
             for (namespace, name), value in list(attrs.items()):
                 if name == 'function' and value:
                     m = MT_SYNTAX_RE.match(value)
                     if m:
                         lang = m.group(1).lower()
                         brush = MT_SYNTAX_BRUSH_MAP.get(lang, lang)
                         attrs[(namespace, u'class')] = "brush: %s" % brush
                         # Remove the key that actually matched rather than
                         # assuming it lives in the None namespace.
                         del attrs[(namespace, name)]
                         token['data'] = attrs
         yield token
Example #23
0
 def __iter__(self):
     """Yield every token, converting ``<pre function=...>`` to a brush class.

     On ``pre`` start tags, a non-empty ``function`` attribute matching
     ``MT_SYNTAX_RE`` is dropped and a ``class`` attribute reading
     ``"brush: <name>"`` is set instead.  ``<name>`` is the lower-cased
     first match group mapped through ``MT_SYNTAX_BRUSH_MAP``, falling
     back to the group itself.
     """
     for token in html5lib_Filter.__iter__(self):
         if token["type"] == "StartTag" and token["name"] == "pre":
             attrs = dict(token["data"])
             # Iterate over a snapshot: the body adds and removes keys, and
             # mutating a dict while iterating its live view raises
             # RuntimeError on Python 3.
             for (namespace, name), value in list(attrs.items()):
                 if name == "function" and value:
                     m = MT_SYNTAX_RE.match(value)
                     if m:
                         lang = m.group(1).lower()
                         brush = MT_SYNTAX_BRUSH_MAP.get(lang, lang)
                         attrs[(namespace, u"class")] = "brush: %s" % brush
                         # Remove the key that actually matched rather than
                         # assuming it lives in the None namespace.
                         del attrs[(namespace, name)]
                         token["data"] = attrs
         yield token
Example #24
0
    def __iter__(self):
        """Yield the stream, adding an edit link after section start tags.

        Every incoming token is yielded first.  When it is a start tag named
        in ``SECTION_TAGS``, each attribute with local name ``id`` and a
        non-empty value produces three extra tokens: an ``<a>`` start tag
        (title, class, section id, source URL and edit URL), the translated
        text "Edit", and the ``</a>`` end tag.
        """
        for token in html5lib_Filter.__iter__(self):
            yield token

            if (token['type'] != 'StartTag'
                    or token['name'] not in SECTION_TAGS):
                continue

            for (namespace, name), value in dict(token['data']).items():
                if not (name == 'id' and value):
                    continue

                # Everything is computed before the first link token goes out
                title = ugettext('Edit section')
                src_url = u'{0!s}?{1!s}'.format(
                    reverse('wiki.document',
                            args=[self.slug],
                            locale=self.locale),
                    urlencode({
                        'section': value.encode('utf-8'),
                        'raw': 'true'
                    }))
                edit_url = u'{0!s}?{1!s}'.format(
                    reverse('wiki.edit',
                            args=[self.slug],
                            locale=self.locale),
                    urlencode({
                        'section': value.encode('utf-8'),
                        'edit_links': 'true'
                    }))
                link_attrs = {
                    (None, u'title'): title,
                    (None, u'class'): 'edit-section',
                    (None, u'data-section-id'): value,
                    (None, u'data-section-src-url'): src_url,
                    (None, u'href'): edit_url,
                }
                label = ugettext(u'Edit')

                yield {'type': 'StartTag', 'name': 'a', 'data': link_attrs}
                yield {'type': 'Characters', 'data': label}
                yield {'type': 'EndTag', 'name': 'a'}
Example #25
0
 def __iter__(self):
     """Yield the stream with ``function`` on ``<pre>`` turned into ``class``.

     For a ``pre`` start tag whose ``function`` attribute is non-empty and
     matches ``MT_SYNTAX_PAT``, the attribute is removed and ``class`` is
     set to ``"brush: <name>"``.  ``<name>`` is the lower-cased first
     match group looked up in ``MT_SYNTAX_BRUSH_MAP``, or the group itself
     if unmapped.  The tag's data then becomes the items of the attribute
     dict.
     """
     for token in html5lib_Filter.__iter__(self):
         if token['type'] == 'StartTag' and token['name'] == 'pre':
             attrs = dict(token['data'])
             function = attrs.get('function', None)
             found = MT_SYNTAX_PAT.match(function) if function else None
             if found:
                 language = found.group(1).lower()
                 attrs['class'] = "brush: %s" % MT_SYNTAX_BRUSH_MAP.get(
                     language, language)
                 del attrs['function']
                 token['data'] = attrs.items()
         yield token
Example #26
0
 def __iter__(self):
     """Yield every token, converting ``<pre function=...>`` to a brush class.

     A ``pre`` start tag with a non-empty ``function`` attribute that
     matches ``MT_SYNTAX_PAT`` loses that attribute and gains ``class``
     set to ``"brush: <name>"``.  ``<name>`` is the lower-cased first
     match group mapped through ``MT_SYNTAX_BRUSH_MAP`` (unmapped values
     are used directly).  The tag's data is then replaced by the items of
     the attribute dict.
     """
     for token in html5lib_Filter.__iter__(self):
         if token['type'] == 'StartTag' and token['name'] == 'pre':
             attrs = dict(token['data'])
             function = attrs.get('function', None)
             found = MT_SYNTAX_PAT.match(function) if function else None
             if found:
                 language = found.group(1).lower()
                 attrs['class'] = "brush: %s" % MT_SYNTAX_BRUSH_MAP.get(
                     language, language)
                 del attrs['function']
                 token['data'] = attrs.items()
         yield token
Example #27
0
 def __iter__(self):
     """Yield every token, converting ``<pre function=...>`` to a brush class.

     A ``pre`` start tag with a non-empty ``function`` attribute that
     matches ``MT_SYNTAX_PAT`` loses that attribute and gains ``class``
     set to ``"brush: <name>"``.  ``<name>`` is the lower-cased first
     match group mapped through ``MT_SYNTAX_BRUSH_MAP`` (unmapped values
     are used directly).  The tag's data is then replaced by the items of
     the attribute dict.
     """
     for token in html5lib_Filter.__iter__(self):
         if token["type"] == "StartTag" and token["name"] == "pre":
             attrs = dict(token["data"])
             function = attrs.get("function", None)
             found = MT_SYNTAX_PAT.match(function) if function else None
             if found:
                 language = found.group(1).lower()
                 attrs["class"] = "brush: %s" % MT_SYNTAX_BRUSH_MAP.get(
                     language, language)
                 del attrs["function"]
                 token["data"] = attrs.items()
         yield token
Example #28
0
 def __iter__(self):
     """Yield the stream with iframe sources checked and contents dropped.

     An ``iframe`` start tag is always yielded.  Any ``src`` attribute with
     a non-empty value that ``re.search`` does not match against
     ``self.hosts`` is first set to the empty string.  Tokens that follow
     are suppressed until the ``iframe`` end tag, which is yielded again.
     """
     inside = False
     for token in html5lib_Filter.__iter__(self):
         kind = token['type']
         if kind == 'StartTag' and token['name'] == 'iframe':
             inside = True
             attrs = dict(token['data'])
             for key, value in attrs.items():
                 namespace, name = key
                 if (name == 'src' and value
                         and not re.search(self.hosts, value)):
                     attrs[(namespace, 'src')] = ''
                 token['data'] = attrs
             yield token
             # The start tag is out; nothing below applies to it
             continue
         if kind == 'EndTag' and token['name'] == 'iframe':
             inside = False
         if not inside:
             yield token
Example #29
0
 def __iter__(self):
     """Yield the stream with iframe sources checked and contents dropped.

     An ``iframe`` start tag is always yielded.  Any ``src`` attribute with
     a non-empty value that ``re.search`` does not match against
     ``self.hosts`` is first set to the empty string.  Tokens that follow
     are suppressed until the ``iframe`` end tag, which is yielded again.
     """
     inside = False
     for token in html5lib_Filter.__iter__(self):
         kind = token["type"]
         if kind == "StartTag" and token["name"] == "iframe":
             inside = True
             attrs = dict(token["data"])
             for key, value in attrs.items():
                 namespace, name = key
                 if (name == "src" and value
                         and not re.search(self.hosts, value)):
                     attrs[(namespace, "src")] = ""
                 token["data"] = attrs
             yield token
             # The start tag is out; nothing below applies to it
             continue
         if kind == "EndTag" and token["name"] == "iframe":
             inside = False
         if not inside:
             yield token
Example #30
0
 def __iter__(self):
     """Yield tokens, vetting ``<iframe>`` sources and hiding iframe content.

     Each ``iframe`` start tag is yielded after any non-empty ``src`` value
     that fails ``re.search(self.hosts, ...)`` has been emptied.  From then
     on tokens are withheld until an ``iframe`` end tag arrives; that end
     tag and later tokens are yielded normally.
     """
     inside = False
     for token in html5lib_Filter.__iter__(self):
         kind = token['type']
         if kind == 'StartTag' and token['name'] == 'iframe':
             inside = True
             attrs = dict(token['data'])
             for key, value in attrs.items():
                 namespace, name = key
                 if (name == 'src' and value
                         and not re.search(self.hosts, value)):
                     attrs[(namespace, 'src')] = ''
                 token['data'] = attrs
             yield token
             # The start tag is out; nothing below applies to it
             continue
         if kind == 'EndTag' and token['name'] == 'iframe':
             inside = False
         if not inside:
             yield token
Example #31
0
    def __iter__(self):
        """Yield the stream, adding an edit link after section start tags.

        Every incoming token is yielded first.  When it is a start tag named
        in ``SECTION_TAGS`` with a non-empty ``id`` attribute, three extra
        tokens follow: an ``<a>`` start tag (title, class, section id,
        source URL and edit URL), the translated text "Edit", and the
        ``</a>`` end tag.
        """
        for token in html5lib_Filter.__iter__(self):
            yield token

            if (token['type'] != 'StartTag'
                    or token['name'] not in SECTION_TAGS):
                continue

            section_id = dict(token['data']).get('id', None)
            if not section_id:
                continue

            # Everything is computed before the first link token goes out
            title = _('Edit section')
            src_url = u'%s?%s' % (
                reverse('wiki.document',
                        args=[self.full_path],
                        locale=self.locale),
                urlencode({
                    'section': section_id.encode('utf-8'),
                    'raw': 'true'
                }))
            edit_url = u'%s?%s' % (
                reverse('wiki.edit_document',
                        args=[self.full_path],
                        locale=self.locale),
                urlencode({
                    'section': section_id.encode('utf-8'),
                    'edit_links': 'true'
                }))
            link_attrs = {
                'title': title,
                'class': 'edit-section',
                'data-section-id': section_id,
                'data-section-src-url': src_url,
                'href': edit_url,
            }
            label = _('Edit')

            yield {'type': 'StartTag', 'name': 'a', 'data': link_attrs}
            yield {'type': 'Characters', 'data': label}
            yield {'type': 'EndTag', 'name': 'a'}
Example #32
0
 def __iter__(self):
     """Yield the stream with iframe sources checked and contents dropped.

     An ``iframe`` start tag is always yielded.  Its ``src`` is first set
     to the empty string when it is non-empty and ``re.search`` does not
     match it against ``self.hosts``, and its data is replaced by the
     items of the attribute dict.  Tokens that follow are suppressed until
     the ``iframe`` end tag, which is yielded again.
     """
     inside = False
     for token in html5lib_Filter.__iter__(self):
         kind = token['type']
         if kind == 'StartTag' and token['name'] == 'iframe':
             inside = True
             attrs = dict(token['data'])
             src = attrs.get('src', '')
             if src and not re.search(self.hosts, src):
                 attrs['src'] = ''
             token['data'] = attrs.items()
             yield token
             # The start tag is out; nothing below applies to it
             continue
         if kind == 'EndTag' and token['name'] == 'iframe':
             inside = False
         if not inside:
             yield token
Example #33
0
 def __iter__(self):
     """Yield tokens, vetting ``<iframe>`` sources and hiding iframe content.

     Each ``iframe`` start tag is yielded after a non-empty ``src`` that
     fails ``re.search(self.hosts, ...)`` has been emptied and the tag's
     data replaced by the items of the attribute dict.  From then on
     tokens are withheld until an ``iframe`` end tag arrives; that end tag
     and later tokens are yielded normally.
     """
     inside = False
     for token in html5lib_Filter.__iter__(self):
         kind = token["type"]
         if kind == "StartTag" and token["name"] == "iframe":
             inside = True
             attrs = dict(token["data"])
             src = attrs.get("src", "")
             if src and not re.search(self.hosts, src):
                 attrs["src"] = ""
             token["data"] = attrs.items()
             yield token
             # The start tag is out; nothing below applies to it
             continue
         if kind == "EndTag" and token["name"] == "iframe":
             inside = False
         if not inside:
             yield token
Example #34
0
    def __iter__(self):
        """Yield table-of-contents tokens derived from heading tokens.

        For each start tag named in ``HEAD_TAGS_TOC`` the level is read from
        the tag name and compared with ``self.level``: a deeper heading
        queues one ``<ol>`` start tag per level of difference, a shallower
        one queues an ``</li></ol>`` pair per level.  If the heading has an
        ``id`` attribute, the queued tokens are emitted, followed by an
        ``<li>`` start tag and an ``<a rel="internal" href="#id">`` start
        tag.  Character tokens are passed through while inside a heading,
        and a heading end tag is replaced by ``</a>``.  All other tokens
        are dropped.
        """
        # Compiled once per pass instead of once per heading token.
        heading_name = re.compile(r'^h(\d)$')

        for token in html5lib_Filter.__iter__(self):
            token_type = token['type']

            if token_type == 'StartTag' and token['name'] in HEAD_TAGS_TOC:
                self.in_header = True
                level = int(heading_name.match(token['name']).group(1))

                out = []
                if level > self.level:
                    # Deeper heading: open one list per level of difference.
                    for _ in range(level - self.level):
                        out.append(
                            {'type': 'StartTag', 'name': 'ol', 'data': {}})
                elif level < self.level:
                    # Shallower heading: close one item and list per level.
                    for _ in range(self.level - level):
                        out.append({'type': 'EndTag', 'name': 'li'})
                        out.append({'type': 'EndTag', 'name': 'ol'})
                self.level = level

                anchor = dict(token['data']).get('id')
                # NOTE(review): when the heading has no id, the list tokens
                # queued above are discarded even though self.level has
                # already moved -- confirm this is intended.
                if anchor:
                    out.append({'type': 'StartTag', 'name': 'li', 'data': {}})
                    out.append({
                        'type': 'StartTag',
                        'name': 'a',
                        'data': {
                            'rel': 'internal',
                            'href': '#%s' % anchor,
                        },
                    })
                    for queued in out:
                        yield queued

            elif token_type == 'Characters' and self.in_header:
                yield token

            elif token_type == 'EndTag' and token['name'] in HEAD_TAGS_TOC:
                self.in_header = False
                # NOTE(review): emitted for every heading end tag, including
                # headings for which no <a> was opened (no id).
                yield {'type': 'EndTag', 'name': 'a'}
Example #35
0
    def __iter__(self):
        """Yield the stream with ``id`` attributes added where missing.

        The whole stream is buffered first, and every ``id`` found on a
        start tag is recorded in ``self.known_ids``.  The buffered tokens
        are then replayed, and each start tag named in
        ``SECTION_EDIT_TAGS`` that has no (or an empty) ``id`` receives one
        from ``self.gen_id()``.
        """
        # Pass 1: buffer the stream and record the ids already in use
        seen = []
        for token in html5lib_Filter.__iter__(self):
            seen.append(token)
            if token['type'] != 'StartTag':
                continue
            attrs = dict(token['data'])
            if 'id' in attrs:
                self.known_ids.add(attrs['id'])

        # Pass 2: replay the buffer, filling in the missing ids
        for token in seen:
            is_section = (token['type'] == 'StartTag' and
                          token['name'] in SECTION_EDIT_TAGS)
            if is_section:
                attrs = dict(token['data'])
                if not attrs.get('id'):
                    attrs['id'] = self.gen_id()
                    token['data'] = attrs.items()
            yield token
Example #36
0
    def __iter__(self):
        """Yield every token, followed by an edit link for section tags.

        After a start tag named in ``SECTION_TAGS`` that has a non-empty
        ``id`` attribute, an ``<a>`` start tag (title, class, section id,
        source URL and edit URL), the translated text "Edit" and the
        ``</a>`` end tag are inserted into the stream.
        """
        for token in html5lib_Filter.__iter__(self):
            yield token

            if (token['type'] != 'StartTag' or
                    token['name'] not in SECTION_TAGS):
                continue

            section_id = dict(token['data']).get('id', None)
            if not section_id:
                continue

            # Everything is computed before the first link token goes out
            title = _('Edit section')
            src_url = u'%s?%s' % (
                reverse('wiki.document',
                        args=[self.full_path],
                        locale=self.locale),
                urlencode({'section': section_id.encode('utf-8'),
                           'raw': 'true'})
            )
            edit_url = u'%s?%s' % (
                reverse('wiki.edit_document',
                        args=[self.full_path],
                        locale=self.locale),
                urlencode({'section': section_id.encode('utf-8'),
                           'edit_links': 'true'})
            )
            link_attrs = {
                'title': title,
                'class': 'edit-section',
                'data-section-id': section_id,
                'data-section-src-url': src_url,
                'href': edit_url,
            }
            label = _('Edit')

            yield {'type': 'StartTag', 'name': 'a', 'data': link_attrs}
            yield {'type': 'Characters', 'data': label}
            yield {'type': 'EndTag', 'name': 'a'}
Example #37
0
    def __iter__(self):
        """Yield every token, followed by an edit link for section tags.

        After a start tag named in ``SECTION_TAGS``, each attribute with
        local name ``id`` and a non-empty value inserts an ``<a>`` start tag
        (title, class, section id, source URL and edit URL), the translated
        text "Edit" and the ``</a>`` end tag into the stream.
        """
        for token in html5lib_Filter.__iter__(self):
            yield token

            if (token['type'] != 'StartTag' or
                    token['name'] not in SECTION_TAGS):
                continue

            for (namespace, name), value in dict(token['data']).items():
                if not (name == 'id' and value):
                    continue

                # Everything is computed before the first link token goes out
                title = ugettext('Edit section')
                src_url = u'%s?%s' % (
                    reverse('wiki.document',
                            args=[self.slug],
                            locale=self.locale),
                    urlencode({'section': value.encode('utf-8'),
                               'raw': 'true'})
                )
                edit_url = u'%s?%s' % (
                    reverse('wiki.edit',
                            args=[self.slug],
                            locale=self.locale),
                    urlencode({'section': value.encode('utf-8'),
                               'edit_links': 'true'})
                )
                link_attrs = {
                    (None, u'title'): title,
                    (None, u'class'): 'edit-section',
                    (None, u'data-section-id'): value,
                    (None, u'data-section-src-url'): src_url,
                    (None, u'href'): edit_url,
                }
                label = ugettext(u'Edit')

                yield {'type': 'StartTag', 'name': 'a', 'data': link_attrs}
                yield {'type': 'Characters', 'data': label}
                yield {'type': 'EndTag', 'name': 'a'}
Example #38
0
    def __iter__(self):
        """Reduce the source stream to table-of-contents tokens.

        For each heading start tag in HEAD_TAGS_TOC the heading level is read
        from the tag name (``h<digit>``) and compared with ``self.level``:
        going deeper opens one ``ol`` per level, going shallower closes one
        ``li``/``ol`` pair per level. If the heading has an ``id``, an ``li``
        and an ``a`` linking to ``#<id>`` are opened and everything queued for
        this heading is emitted. Character tokens inside a heading are passed
        through, and each heading end tag closes the ``a``. All other tokens
        are dropped.
        """
        # Compiled once per iteration run instead of once per heading token.
        heading_re = re.compile(r"^h(\d)$")

        for token in html5lib_Filter.__iter__(self):
            if "StartTag" == token["type"] and token["name"] in HEAD_TAGS_TOC:
                self.in_header = True
                level = int(heading_re.match(token["name"]).group(1))

                out = []
                if level > self.level:
                    for i in range(level - self.level):
                        out.append({"type": "StartTag", "name": "ol", "data": {}})
                elif level < self.level:
                    for i in range(self.level - level):
                        out.append({"type": "EndTag", "name": "li"})
                        out.append({"type": "EndTag", "name": "ol"})
                # Equal levels leave self.level unchanged, so this assignment
                # covers all three cases.
                self.level = level

                heading_id = dict(token["data"]).get("id", None)
                if heading_id:
                    out.append({"type": "StartTag", "name": "li", "data": {}})
                    out.append({
                        "type": "StartTag",
                        "name": "a",
                        "data": {"rel": "internal", "href": "#%s" % heading_id},
                    })
                    for t in out:
                        yield t
                # NOTE(review): a heading without an id discards the queued
                # list tokens even though self.level was already updated;
                # behavior kept as-is - confirm this is intended.
            elif "Characters" == token["type"] and self.in_header:
                yield token
            elif "EndTag" == token["type"] and token["name"] in HEAD_TAGS_TOC:
                self.in_header = False
                # The heading level is not needed to close the link.
                yield {"type": "EndTag", "name": "a"}
Example #39
0
    def __iter__(self):
        """Pass all tokens through and append an "Edit" link to section tags.

        After each start tag named in SECTION_TAGS, every attribute whose
        local name is ``id`` and whose value is non-empty produces an ``a``
        element: a start tag with title, class, section ID, raw-source URL and
        edit URL, followed by the translated "Edit" text and the end tag.
        """

        def query_url(view_name, section, flag):
            # Build "<reversed url>?section=<id>&<flag>=true".
            return u"%s?%s" % (
                reverse(view_name, args=[self.slug], locale=self.locale),
                urlencode({"section": section.encode("utf-8"), flag: "true"}),
            )

        for token in html5lib_Filter.__iter__(self):
            yield token

            is_section_start = token["type"] == "StartTag" and token["name"] in SECTION_TAGS
            if not is_section_start:
                continue

            attrs = dict(token["data"])
            for (namespace, name), value in attrs.items():
                if not (name == "id" and value):
                    continue

                link_attrs = {
                    (None, u"title"): ugettext("Edit section"),
                    (None, u"class"): "edit-section",
                    (None, u"data-section-id"): value,
                    (None, u"data-section-src-url"): query_url("wiki.document", value, "raw"),
                    (None, u"href"): query_url("wiki.edit", value, "edit_links"),
                }
                edit_link = [
                    {"type": "StartTag", "name": "a", "data": link_attrs},
                    {"type": "Characters", "data": ugettext(u"Edit")},
                    {"type": "EndTag", "name": "a"},
                ]
                for link_token in edit_link:
                    yield link_token
Example #40
0
    def __iter__(self):
        """Emit the source tokens plus an edit link after identified sections.

        Each token is yielded as-is. When the token is a start tag named in
        SECTION_TAGS and it has a truthy ``id`` attribute, an ``a`` element
        follows it: the start tag carries the title, the ``edit-section``
        class, the section ID, the raw-source URL (``wiki.document``) and the
        edit URL (``wiki.edit_document``) for ``self.full_path`` in
        ``self.locale``; then come the translated "Edit" text and the end tag.
        """
        for token in html5lib_Filter.__iter__(self):
            yield token

            if not ("StartTag" == token["type"] and token["name"] in SECTION_TAGS):
                continue

            section_id = dict(token["data"]).get("id", None)
            if not section_id:
                continue

            title = _("Edit section")
            src_url = u"%s?%s" % (
                reverse("wiki.document", args=[self.full_path], locale=self.locale),
                urlencode({"section": section_id.encode("utf-8"), "raw": "true"}),
            )
            edit_url = u"%s?%s" % (
                reverse("wiki.edit_document", args=[self.full_path], locale=self.locale),
                urlencode({"section": section_id.encode("utf-8"), "edit_links": "true"}),
            )

            link_open = {
                "type": "StartTag",
                "name": "a",
                "data": {
                    "title": title,
                    "class": "edit-section",
                    "data-section-id": section_id,
                    "data-section-src-url": src_url,
                    "href": edit_url,
                },
            }
            link_text = {"type": "Characters", "data": _("Edit")}
            link_close = {"type": "EndTag", "name": "a"}

            # The whole link is assembled before any part of it is yielded.
            yield link_open
            yield link_text
            yield link_close
Example #41
0
    def __iter__(self):
        """Extract or replace the section whose ID equals ``self.section_id``.

        The stream is walked once while tracking element depth in
        ``self.open_level``. A section is either explicit (a section element
        with the matching ID; its children form the section) or implicit (a
        heading with the matching ID; the section runs until a sibling heading
        of equal or higher rank, or until the parent element closes).

        When ``self.replace_source`` is falsy, only the tokens inside the
        section are yielded. Otherwise, tokens outside the section are
        yielded and the section is replaced, once, by the tokens of
        ``self.replace_source``.
        """
        input = html5lib_Filter.__iter__(self)
        for token in input:

            # Section start was deferred, so start it now.
            if self.next_in_section:
                self.next_in_section = False
                self.in_section = True

            if 'StartTag' == token['type']:
                attrs = dict(token['data'])
                # Depth counter: incremented on every start tag, decremented
                # on every end tag below.
                self.open_level += 1

                # Have we encountered the section or heading element we're
                # looking for?
                if attrs.get('id', None) == self.section_id:

                    # If we encounter a section element that matches the ID,
                    # then we'll want to scoop up all its children as an
                    # explicit section.
                    if (self.parent_level is None and self._isSection(token)):
                        self.parent_level = self.open_level
                        # Defer the start of the section, so the section parent
                        # itself isn't included.
                        self.next_in_section = True

                    # If we encounter a heading element that matches the ID, we
                    # start an implicit section.
                    elif (self.heading is None and self._isHeading(token)):
                        self.heading = token
                        self.heading_rank = self._getHeadingRank(token)
                        # The heading's parent is one level up from the
                        # heading itself.
                        self.parent_level = self.open_level - 1
                        # Set immediately (not deferred), so the heading token
                        # itself counts as part of the section below.
                        self.in_section = True

                # If started an implicit section, these rules apply to
                # siblings...
                elif (self.heading is not None
                      and self.open_level - 1 == self.parent_level):

                    # The implicit section should stop if we hit another
                    # sibling heading whose rank is equal or higher, since that
                    # starts a new implicit section
                    if (self._isHeading(token) and
                            self._getHeadingRank(token) <= self.heading_rank):
                        self.in_section = False

            if 'EndTag' == token['type']:
                self.open_level -= 1

                # If the parent of the section has ended, end the section.
                # This applies to both implicit and explicit sections.
                if (self.parent_level is not None
                        and self.open_level < self.parent_level):
                    self.in_section = False

            # If there's no replacement source, then this is a section
            # extraction. So, emit tokens while we're in the section.
            if not self.replace_source:
                if self.in_section:
                    yield token

            # If there is a replacement source, then this is a section
            # replacement. Emit tokens of the source stream until we're in the
            # section, then emit the replacement stream and ignore the rest of
            # the source stream for the section.
            else:
                if not self.in_section:
                    yield token
                elif not self.replacement_emitted:
                    for r_token in self.replace_source:
                        yield r_token
                    # Guard so the replacement is emitted only once, on the
                    # first token seen inside the section.
                    self.replacement_emitted = True
Example #42
0
    def __iter__(self):
        """Annotate links in the stream with ``external`` / ``new`` classes.

        The whole source stream is buffered first. Every ``a`` start tag with
        an ``href`` gets an annotation record keyed by its href, with hrefs
        starting with ``self.base_url`` squashed to site-relative paths. Hrefs
        starting with one of ``self.EXTERNAL_PREFIXES`` are classed
        ``external``. Hrefs containing ``/docs/`` (except the
        DOC_SPECIAL_PATHS) are resolved to a locale and slug and classed
        ``new`` when no matching Document exists. The buffered tokens are
        then yielded with the class attribute of each link updated.
        """
        from wiki.models import Document

        def site_relative(url):
            # Squash site-absolute URLs to site-relative paths.
            if url.startswith(self.base_url):
                return '/%s' % url[len(self.base_url):]
            return url

        def is_link_start(tok):
            return 'StartTag' == tok['type'] and 'a' == tok['name']

        # Pass #1: Gather all the link URLs and prepare annotations
        annotations = {}
        buffered = []
        for token in html5lib_Filter.__iter__(self):
            buffered.append(token)
            if not is_link_start(token):
                continue
            attrs = dict(token['data'])
            if 'href' in attrs:
                # Prepare annotations record for this path.
                annotations[site_relative(attrs['href'])] = {'classes': []}

        # Run through all the links and check for annotatable conditions.
        for href, record in annotations.items():

            # Is this an external URL?
            if any(href.startswith(prefix)
                   for prefix in self.EXTERNAL_PREFIXES):
                record['classes'].append('external')
                continue

            # TODO: Should this also check for old-school mindtouch URLs? Or
            # should we encourage editors to convert to new-style URLs to take
            # advantage of link annotation? (I'd say the latter)

            # Only kuma doc URLs are checked for existence.
            if '/docs/' not in href:
                continue

            # Special docs paths are exempt from "new".
            if any(('/docs/%s' % path) in href
                   for path in DOC_SPECIAL_PATHS):
                continue

            href_locale, href_path = href.split(u'/docs/', 1)
            if href_locale.startswith(u'/'):
                href_locale = href_locale[1:]

            # Discard the hash anchor, if any (partition is a no-op without
            # a '#').
            href_path = href_path.partition('#')[0]

            # Handle any URL-encoded UTF-8 characters in the path
            encoded_path = href_path.encode('utf-8', 'ignore')
            href_path = urllib.unquote(encoded_path).decode('utf-8', 'ignore')

            # Try to sort out the locale and slug through some of our
            # redirection logic.
            locale, slug, needs_redirect = Document.locale_and_slug_from_path(
                href_path, path_locale=href_locale)

            # Does this locale and slug correspond to an existing document?
            # If not, mark it as a "new" link.
            #
            # TODO: Should these DB queries be batched up into one big
            # query? A page with hundreds of links will fire off hundreds
            # of queries
            matches = Document.objects.filter(locale=locale, slug=slug)
            if matches.count() == 0:
                record['classes'].append('new')

        # Pass #2: Filter the content, annotating links
        for token in buffered:
            if is_link_start(token):
                attrs = dict(token['data'])

                if 'href' in attrs:
                    href = site_relative(attrs['href'])
                    if href in annotations:
                        # Update class names on this link element.
                        classes = (set(attrs['class'].split(u' '))
                                   if 'class' in attrs else set())
                        classes.update(annotations[href]['classes'])
                        if classes:
                            attrs['class'] = u' '.join(classes)

                token['data'] = attrs.items()

            yield token
Example #43
0
 def __init__(self, source):
     """Wrap ``source`` and start with no known IDs.

     ``known_ids`` begins as an empty set and ``id_cnt`` begins at 0.
     """
     html5lib_Filter.__init__(self, source)
     self.known_ids = set()
     self.id_cnt = 0
Example #44
0
    def __iter__(self):
        """Yield the source tokens with ``id`` attributes set on section tags.

        Pass 1 buffers the whole stream and records the ``id`` and ``name``
        attribute values of every start tag in ``self.known_ids``.

        Pass 2 re-emits the buffered tokens. Start tags named in SECTION_TAGS
        get an ``id`` chosen as follows:

        * a truthy ``name`` attribute is copied to ``id``;
        * otherwise, a non-heading tag gets ``self.gen_id()``;
        * otherwise (a tag in HEAD_TAGS), the text up to the matching end tag
          is slugified with ``self.slugify``, falling back to
          ``self.gen_id()`` when the slug is empty.

        The buffer is drained through one shared iterator rather than
        repeated ``list.pop(0)`` calls, which made pass 2 quadratic in the
        number of tokens.
        """
        input = html5lib_Filter.__iter__(self)

        # Pass 1: Collect all known IDs from the stream
        buffer = []
        for token in input:
            buffer.append(token)
            if 'StartTag' == token['type']:
                attrs = dict(token['data'])
                if 'id' in attrs:
                    self.known_ids.add(attrs['id'])
                if 'name' in attrs:
                    self.known_ids.add(attrs['name'])

        # Pass 2: Sprinkle in IDs where they're needed
        # The outer and inner loops share this iterator, so tokens consumed
        # while scanning a header are not visited again by the outer loop.
        tokens = iter(buffer)
        for token in tokens:

            if not ('StartTag' == token['type']
                    and token['name'] in SECTION_TAGS):
                yield token
                continue

            attrs = dict(token['data'])

            # Treat a name attribute as a human-specified ID override
            name = attrs.get('name', None)
            if name:
                attrs['id'] = name
                token['data'] = attrs.items()
                yield token
                continue

            # If this is not a header, then generate a section ID.
            if token['name'] not in HEAD_TAGS:
                attrs['id'] = self.gen_id()
                token['data'] = attrs.items()
                yield token
                continue

            # If this is a header, then scoop up the rest of the header and
            # gather the text it contains.
            start, text, tmp = token, [], []
            for inner in tokens:
                tmp.append(inner)
                if inner['type'] in ('Characters', 'SpaceCharacters'):
                    text.append(inner['data'])
                elif ('EndTag' == inner['type']
                      and start['name'] == inner['name']):
                    # Note: This is naive, and doesn't track other
                    # start/end tags nested in the header. Odd things might
                    # happen in a case like <h1><h1></h1></h1>. But, that's
                    # invalid markup and the worst case should be a
                    # truncated ID because all the text wasn't accumulated.
                    break

            # Slugify the text we found inside the header, generate an ID
            # as a last resort.
            slug = self.slugify(u''.join(text))
            if not slug:
                slug = self.gen_id()
            attrs['id'] = slug
            start['data'] = attrs.items()

            # Finally, emit the tokens we scooped up for the header.
            yield start
            for t in tmp:
                yield t
Example #45
0
    def __iter__(self):
        """Replace ``<span class="script">`` elements with parsed macro calls.

        The whole source stream is buffered, then re-emitted. Tokens are
        passed through unchanged except for ``span`` elements whose ``class``
        attribute is exactly ``script``: the content up to the next ``span``
        end tag is flattened back into text (nested tags are re-serialized),
        leading ``template.`` / ``wiki.`` prefixes are removed,
        ``template("name", [params])`` and ``template("name")`` calls are
        rewritten as ``name(params)`` / ``name()``, and the result is wrapped
        in ``{{ ... }}`` and run through ``parse``, whose stream is yielded in
        place of the span.

        The buffer is consumed through one shared iterator instead of
        ``list.pop(0)`` (quadratic on long documents), and the two regular
        expressions are compiled once rather than once per script span.
        """
        # template("template name", [ "params" ])
        call_with_params_re = re.compile(
            r'''^template\(['"]([^'"]+)['"],\s*\[([^\]]+)]''', re.I)
        # template("template name")
        call_without_params_re = re.compile(
            r'''^template\(['"]([^'"]+)['"]''', re.I)
        strip_prefixes = ('template.', 'wiki.')

        # Materialize the upstream first so it is fully consumed before any
        # token is yielded, as before.
        tokens = iter(list(html5lib_Filter.__iter__(self)))

        for token in tokens:

            if not ('StartTag' == token['type'] and 'span' == token['name']):
                yield token
                continue

            attrs = dict(token['data'])
            if attrs.get('class', '') != 'script':
                yield token
                continue

            # Collect the span content as text; the inner loop shares the
            # iterator, so these tokens are not seen again by the outer loop.
            ds_call = []
            for inner in tokens:
                if inner['type'] in ('Characters', 'SpaceCharacters'):
                    ds_call.append(inner['data'])
                elif 'StartTag' == inner['type']:
                    attrs = inner['data']
                    if attrs:
                        a_out = (u' %s' % u' '.join(
                            (u'%s=%s' % (name, quoteattr(val))
                             for name, val in attrs)))
                    else:
                        a_out = u''
                    ds_call.append(u'<%s%s>' % (inner['name'], a_out))
                elif 'EndTag' == inner['type']:
                    # The first span end tag ends the script, nested or not.
                    if 'span' == inner['name']:
                        break
                    ds_call.append('</%s>' % inner['name'])

            ds_call = u''.join(ds_call).strip()

            # Snip off any "template." prefixes
            for prefix in strip_prefixes:
                if ds_call.lower().startswith(prefix):
                    ds_call = ds_call[len(prefix):]

            m = call_with_params_re.match(ds_call)
            if m:
                ds_call = '%s(%s)' % (m.group(1), m.group(2).strip())

            m = call_without_params_re.match(ds_call)
            if m:
                ds_call = '%s()' % (m.group(1))

            # HACK: This is dirty, but seems like the easiest way to
            # reconstitute the token stream, including what gets parsed as
            # markup in the middle of macro parameters.
            #
            # eg. {{ Note("This is <strong>strongly</strong> discouraged") }}
            parsed = parse('{{ %s }}' % ds_call)
            for parsed_token in parsed.stream:
                yield parsed_token
Example #46
0
 def __init__(self, source, base_url):
     """Wrap ``source`` and keep ``base_url`` on the instance.

     ``source`` is handed to the html5lib filter base class; ``base_url``
     is stored unchanged as ``self.base_url``.
     """
     html5lib_Filter.__init__(self, source)
     self.base_url = base_url
Example #47
0
    def __init__(self, source, hosts):
        """Wrap ``source`` and keep ``hosts`` on the instance.

        ``source`` is handed to the html5lib filter base class; ``hosts`` is
        stored unchanged as ``self.hosts``.
        """
        html5lib_Filter.__init__(self, source)
        self.hosts = hosts
Example #48
0
    def __iter__(self):
        """Convert ``<span class="script">`` elements into parsed macro calls.

        After buffering the complete source stream, every token is passed
        through unchanged unless it opens a ``span`` whose ``class`` is
        exactly ``script``. For such a span, everything up to the next
        ``span`` end tag is turned back into text (nested start and end tags
        are re-serialized), ``template.`` / ``wiki.`` prefixes are stripped,
        ``template("name", [params])`` becomes ``name(params)`` and
        ``template("name")`` becomes ``name()``. The text is then wrapped in
        ``{{ ... }}`` and given to ``parse``; the tokens of the parsed stream
        replace the span.

        Compared with draining the buffer via ``list.pop(0)``, iterating it
        once avoids quadratic behavior, and the regular expressions are
        compiled a single time instead of once per script span.
        """
        # template("template name", [ "params" ])
        params_call_re = re.compile(
            r'''^template\(['"]([^'"]+)['"],\s*\[([^\]]+)]''', re.I)
        # template("template name")
        bare_call_re = re.compile(r'''^template\(['"]([^'"]+)['"]''', re.I)
        strip_prefixes = ('template.', 'wiki.')

        # The upstream is still read to the end before anything is yielded.
        buffer = list(html5lib_Filter.__iter__(self))
        stream = iter(buffer)

        for token in stream:

            is_span_start = ('StartTag' == token['type'] and
                             'span' == token['name'])
            if not is_span_start:
                yield token
                continue

            attrs = dict(token['data'])
            if attrs.get('class', '') != 'script':
                yield token
                continue

            # Pull the span's content from the same iterator so the outer
            # loop resumes after the span.
            ds_call = []
            for part in stream:
                if part['type'] in ('Characters', 'SpaceCharacters'):
                    ds_call.append(part['data'])
                elif 'StartTag' == part['type']:
                    attrs = part['data']
                    if attrs:
                        a_out = (u' %s' % u' '.join(
                            (u'%s=%s' %
                             (name, quoteattr(val))
                             for name, val in attrs)))
                    else:
                        a_out = u''
                    ds_call.append(u'<%s%s>' % (part['name'], a_out))
                elif 'EndTag' == part['type']:
                    # Stop at the first span end tag encountered.
                    if 'span' == part['name']:
                        break
                    ds_call.append('</%s>' % part['name'])

            ds_call = u''.join(ds_call).strip()

            # Snip off any "template." prefixes
            for prefix in strip_prefixes:
                if ds_call.lower().startswith(prefix):
                    ds_call = ds_call[len(prefix):]

            m = params_call_re.match(ds_call)
            if m:
                ds_call = '%s(%s)' % (m.group(1), m.group(2).strip())

            m = bare_call_re.match(ds_call)
            if m:
                ds_call = '%s()' % (m.group(1))

            # HACK: This is dirty, but seems like the easiest way to
            # reconstitute the token stream, including what gets parsed as
            # markup in the middle of macro parameters.
            #
            # eg. {{ Note("This is <strong>strongly</strong> discouraged") }}
            parsed = parse('{{ %s }}' % ds_call)
            for parsed_token in parsed.stream:
                yield parsed_token
Example #49
0
 def __init__(self, source, base_url, tag_attributes):
     """Wrap ``source`` and keep the two configuration values.

     ``base_url`` and ``tag_attributes`` are stored unchanged under
     attributes of the same names.
     """
     html5lib_Filter.__init__(self, source)
     self.base_url, self.tag_attributes = base_url, tag_attributes
Example #50
0
    def __iter__(self):
        """Reduce the source stream to nested table-of-contents tokens.

        Heading start tags in HEAD_TAGS_TOC drive the nesting: the heading
        level parsed from the tag name is compared with ``self.level`` and
        ``ol``/``li`` tokens are queued to go deeper or shallower, with
        ``self.open_level`` counting the lists opened. If the heading has an
        ``id``, an ``li`` and an ``a`` linking to ``#<id>`` are queued as well
        and everything queued is emitted. Start and end tags in TAGS_IN_TOC
        and character tokens inside a heading are passed through; a heading
        end tag closes the ``a``. When the source is exhausted, one
        ``ol``/``li`` end-tag pair is emitted per list still open.
        """
        input = html5lib_Filter.__iter__(self)

        for token in input:
            if ('StartTag' == token['type']
                    and token['name'] in HEAD_TAGS_TOC):
                self.in_header = True
                # Tokens queued for this heading; only emitted if the heading
                # has an id (see below).
                out = ()
                level_match = re.compile(r'^h(\d)$').match(token['name'])
                level = int(level_match.group(1))
                if level > self.level:
                    # Going deeper: open one list per level of difference.
                    diff = level - self.level
                    for i in range(diff):
                        # Before any entry has been emitted there is no open
                        # <li> to nest in, so open one on even steps.
                        if (not self.in_hierarchy and i % 2 == 0):
                            out += ({
                                'type': 'StartTag',
                                'name': 'li',
                                'data': {}
                            }, )
                        out += ({
                            'type': 'StartTag',
                            'name': 'ol',
                            'data': {}
                        }, )
                        # When skipping more than one level, open an extra
                        # <li> on even steps other than the last one.
                        if (diff > 1 and i % 2 == 0 and i != diff - 1):
                            out += ({
                                'type': 'StartTag',
                                'name': 'li',
                                'data': {}
                            }, )
                        self.open_level += 1
                    self.level = level
                elif level < self.level:
                    # Going shallower: close one <ol>/<li> pair per level.
                    diff = self.level - level
                    for i in range(diff):
                        out += ({
                            'type': 'EndTag',
                            'name': 'ol'
                        }, {
                            'type': 'EndTag',
                            'name': 'li'
                        })
                        self.open_level -= 1
                    self.level = level
                attrs = dict(token['data'])
                id = attrs.get('id', None)
                if id:
                    out += (
                        {
                            'type': 'StartTag',
                            'name': 'li',
                            'data': {}
                        },
                        {
                            'type': 'StartTag',
                            'name': 'a',
                            'data': {
                                'rel': 'internal',
                                'href': '#%s' % id,
                            }
                        },
                    )
                    self.in_hierarchy = True
                    for t in out:
                        yield t
                # NOTE(review): without an id the queued tokens are dropped
                # although level/open_level were already updated - confirm
                # this is intended.
            elif ('StartTag' == token['type']
                  and token['name'] in TAGS_IN_TOC):
                # Passed through whether or not we are inside a heading.
                yield token
            elif (token['type'] in ("Characters", "SpaceCharacters")
                  and self.in_header):
                yield token
            elif ('EndTag' == token['type'] and token['name'] in TAGS_IN_TOC):
                yield token
            elif ('EndTag' == token['type']
                  and token['name'] in HEAD_TAGS_TOC):
                self.in_header = False
                out = ({'type': 'EndTag', 'name': 'a'}, )
                for t in out:
                    yield t

        # Close whatever lists are still open at the end of the stream.
        if self.open_level > 0:
            out = ()
            for i in range(self.open_level):
                out += ({
                    'type': 'EndTag',
                    'name': 'ol'
                }, {
                    'type': 'EndTag',
                    'name': 'li'
                })
            for t in out:
                yield t
Example #51
0
 def __init__(self, source, base_url):
     """Wrap ``source`` and keep ``base_url`` in raw and parsed form.

     ``self.base_url`` holds the value as given and
     ``self.base_url_parsed`` holds the result of ``urlparse(base_url)``.
     """
     html5lib_Filter.__init__(self, source)
     self.base_url_parsed = urlparse(base_url)
     self.base_url = base_url
Example #52
0
 def __init__(self, source, full_path, locale):
     """Wrap ``source`` and keep the document path and locale.

     ``full_path`` and ``locale`` are stored unchanged under attributes of
     the same names.
     """
     html5lib_Filter.__init__(self, source)
     self.full_path, self.locale = full_path, locale
Example #53
0
 def __iter__(self):
     """Yield every source token except those of type ``SpaceCharacters``."""
     for token in html5lib_Filter.__iter__(self):
         if token['type'] != 'SpaceCharacters':
             yield token
Example #54
0
    def __iter__(self):
        """Annotate links in the stream with ``external`` / ``new`` classes.

        Pass 1 buffers the whole source stream and creates an annotation
        record for the href of every ``a`` start tag; hrefs whose network
        location equals that of ``self.base_url_parsed`` are reduced to their
        path. Each href is then classified: those starting with one of
        ``self.EXTERNAL_PREFIXES`` are classed ``external``; those containing
        ``/docs/`` (except the DOC_SPECIAL_PATHS) are resolved to a locale and
        slug and queued for an existence check. The check runs one Document
        query per locale, and hrefs whose slug does not come back are classed
        ``new``.

        Pass 2 yields the buffered tokens, merging the collected classes into
        the ``class`` attribute of each link.
        """
        # Imported here rather than at module level.
        # NOTE(review): presumably to avoid a circular import - confirm.
        from wiki.models import Document

        input = html5lib_Filter.__iter__(self)

        # Pass #1: Gather all the link URLs and prepare annotations
        links = dict()
        buffer = []
        for token in input:
            buffer.append(token)
            if ('StartTag' == token['type'] and 'a' == token['name']):
                attrs = dict(token['data'])
                if not 'href' in attrs:
                    continue

                href = attrs['href']
                href_parsed = urlparse(href)
                if href_parsed.netloc == self.base_url_parsed.netloc:
                    # Squash site-absolute URLs to site-relative paths.
                    # Only the path component is kept.
                    href = href_parsed.path

                # Prepare annotations record for this path.
                links[href] = dict(
                    classes=[]
                )

        # Maps lower-cased locale -> lower-cased slug -> set of hrefs that
        # resolved to that document.
        needs_existence_check = defaultdict(lambda: defaultdict(set))

        # Run through all the links and check for annotatable conditions.
        for href in links.keys():

            # Is this an external URL?
            is_external = False
            for prefix in self.EXTERNAL_PREFIXES:
                if href.startswith(prefix):
                    is_external = True
                    break
            if is_external:
                links[href]['classes'].append('external')
                continue

            # TODO: Should this also check for old-school mindtouch URLs? Or
            # should we encourage editors to convert to new-style URLs to take
            # advantage of link annotation? (I'd say the latter)

            # Is this a kuma doc URL?
            if '/docs/' in href:

                # Check if this is a special docs path that's exempt from "new"
                skip = False
                for path in DOC_SPECIAL_PATHS:
                    if '/docs/%s' % path in href:
                        skip = True
                if skip:
                    continue

                # Everything before the first '/docs/' is treated as the
                # locale, everything after it as the document path.
                href_locale, href_path = href.split(u'/docs/', 1)
                if href_locale.startswith(u'/'):
                    href_locale = href_locale[1:]

                if '#' in href_path:
                    # If present, discard the hash anchor
                    href_path, _, _ = href_path.partition('#')

                # Handle any URL-encoded UTF-8 characters in the path
                href_path = href_path.encode('utf-8', 'ignore')
                href_path = urllib.unquote(href_path)
                href_path = href_path.decode('utf-8', 'ignore')

                # Try to sort out the locale and slug through some of our
                # redirection logic.
                locale, slug, needs_redirect = (Document
                        .locale_and_slug_from_path(href_path,
                                                   path_locale=href_locale))

                # Gather up this link for existence check
                needs_existence_check[locale.lower()][slug.lower()].add(href)

        # Perform existence checks for all the links, using one DB query per
        # locale for all the candidate slugs.
        for locale, slug_hrefs in needs_existence_check.items():

            # NOTE(review): the query is made with lower-cased locale and
            # slugs; this assumes the database compares them
            # case-insensitively - confirm.
            existing_slugs = (Document.objects
                                      .filter(locale=locale,
                                              slug__in=slug_hrefs.keys())
                                      .values_list('slug', flat=True))

            # Remove the slugs that pass existence check.
            for slug in existing_slugs:
                lslug = slug.lower()
                if lslug in slug_hrefs:
                    del slug_hrefs[lslug]

            # Mark all the links whose slugs did not come back from the DB
            # query as "new"
            for slug, hrefs in slug_hrefs.items():
                for href in hrefs:
                    links[href]['classes'].append('new')

        # Pass #2: Filter the content, annotating links
        for token in buffer:
            if ('StartTag' == token['type'] and 'a' == token['name']):
                attrs = dict(token['data'])

                if 'href' in attrs:

                    # Same normalization as in pass #1, so the lookup key
                    # matches the annotation record.
                    href = attrs['href']
                    href_parsed = urlparse(href)
                    if href_parsed.netloc == self.base_url_parsed.netloc:
                        # Squash site-absolute URLs to site-relative paths.
                        href = href_parsed.path

                    if href in links:
                        # Update class names on this link element.
                        if 'class' in attrs:
                            classes = set(attrs['class'].split(u' '))
                        else:
                            classes = set()
                        classes.update(links[href]['classes'])
                        if classes:
                            attrs['class'] = u' '.join(classes)

                # The token's attributes are rewritten for every link, even
                # when no class was added.
                token['data'] = attrs.items()

            yield token