def __init__(self, source):
    html5lib_Filter.__init__(self, source)
    self.level = 2
    self.max_level = 3
    self.in_header = False
    self.open_level = 0
    self.in_hierarchy = False

def __init__(self, source, id, replace_source=None):
    html5lib_Filter.__init__(self, source)
    self.replace_source = replace_source
    self.section_id = id
    self.heading = None
    self.heading_rank = None
    self.open_level = 0
    self.parent_level = None
    self.in_section = False
    self.next_in_section = False
    self.replacement_emitted = False

def __iter__(self):
    input = html5lib_Filter.__iter__(self)
    for token in input:
        if ('StartTag' == token['type'] and
                token['name'] in HEAD_TAGS_TOC):
            self.in_header = True
            out = ()
            level_match = re.compile(r'^h(\d)$').match(token['name'])
            level = int(level_match.group(1))
            if level > self.level:
                diff = level - self.level
                for i in range(diff):
                    out += ({'type': 'StartTag', 'name': 'ol',
                             'data': {}},)
                self.level = level
            elif level < self.level:
                diff = self.level - level
                for i in range(diff):
                    out += ({'type': 'EndTag', 'name': 'li'},
                            {'type': 'EndTag', 'name': 'ol'})
                self.level = level
            attrs = dict(token['data'])
            id = attrs.get('id', None)
            if id:
                out += (
                    {'type': 'StartTag', 'name': 'li', 'data': {}},
                    {'type': 'StartTag', 'name': 'a',
                     'data': {'rel': 'internal', 'href': '#%s' % id}},
                )
            for t in out:
                yield t
        elif ('Characters' == token['type'] and self.in_header):
            yield token
        elif ('EndTag' == token['type'] and
                token['name'] in HEAD_TAGS_TOC):
            self.in_header = False
            level_match = re.compile(r'^h(\d)$').match(token['name'])
            level = int(level_match.group(1))
            out = ({'type': 'EndTag', 'name': 'a'},)
            for t in out:
                yield t

def __iter__(self): input = html5lib_Filter.__iter__(self) for token in input: if "StartTag" == token["type"] and token["name"] in self.tag_attributes: attrs = dict(token["data"]) # If the element has the attribute we're looking for desired_attr = self.tag_attributes[token["name"]] if desired_attr in attrs: address = attrs[desired_attr] if not address.startswith("http"): if address.startswith("//") or address.startswith("{{"): # Do nothing for absolute addresses or apparent # template variable output attrs[desired_attr] = address elif address.startswith("/"): # Starts with "/", so just add the base url attrs[desired_attr] = self.base_url + address else: attrs[desired_attr] = self.base_url + "/" + address token["data"] = attrs.items() yield token
def __iter__(self):
    input = html5lib_Filter.__iter__(self)
    for token in input:
        if (token['type'] == 'StartTag' and
                token['name'] in self.tag_attributes):
            attrs = dict(token['data'])
            # If the element has the attribute we're looking for
            desired_attr = self.tag_attributes[token['name']]
            for (namespace, name), value in attrs.items():
                if desired_attr == name:
                    if not value.startswith('http'):
                        if value.startswith('//') or value.startswith('{{'):
                            # Do nothing for absolute addresses or apparent
                            # template variable output
                            attrs[(namespace, name)] = value
                        elif value.startswith('/'):
                            # Starts with "/", so just add the base url
                            attrs[(namespace, name)] = self.base_url + value
                        else:
                            attrs[(namespace, name)] = (self.base_url +
                                                        '/' + value)
                        token['data'] = attrs
                    break
        yield token

def __iter__(self): input = html5lib_Filter.__iter__(self) for token in input: if token["type"] == "StartTag" and token["name"] in self.tag_attributes: attrs = dict(token["data"]) # If the element has the attribute we're looking for desired_attr = self.tag_attributes[token["name"]] for (namespace, name), value in attrs.items(): if desired_attr == name: if not value.startswith("http"): if value.startswith("//") or value.startswith("{{"): # Do nothing for absolute addresses or apparent # template variable output attrs[(namespace, name)] = value elif value.startswith("/"): # Starts with "/", so just add the base url attrs[(namespace, name)] = self.base_url + value else: attrs[(namespace, name)] = self.base_url + "/" + value token["data"] = attrs break yield token
def __iter__(self):
    input = html5lib_Filter.__iter__(self)
    for token in input:
        if ('StartTag' == token['type'] and
                token['name'] in self.tag_attributes):
            attrs = dict(token['data'])
            # If the element has the attribute we're looking for
            desired_attr = self.tag_attributes[token['name']]
            if desired_attr in attrs:
                address = attrs[desired_attr]
                if not address.startswith('http'):
                    if address.startswith('//') or address.startswith('{{'):
                        # Do nothing for absolute addresses or apparent
                        # template variable output
                        attrs[desired_attr] = address
                    elif address.startswith('/'):
                        # Starts with "/", so just add the base url
                        attrs[desired_attr] = self.base_url + address
                    else:
                        attrs[desired_attr] = self.base_url + '/' + address
                    token['data'] = attrs.items()
        yield token

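# The methods above belong to html5lib tree-walker filters. A minimal sketch
# of how such a filter is typically applied, using html5lib's standard
# parse -> walk -> filter -> serialize pipeline. `URLAbsolutionFilter` is a
# hypothetical name standing in for whichever class the methods above
# belong to; adjust to the real class name.
import html5lib
from html5lib.serializer import HTMLSerializer


def apply_filter(src, filter_cls, **kwargs):
    # Parse the fragment, wrap the token stream in the filter, and
    # serialize the filtered stream back to a string.
    tree = html5lib.parseFragment(src, treebuilder='etree')
    walker = html5lib.getTreeWalker('etree')
    stream = filter_cls(walker(tree), **kwargs)
    return HTMLSerializer(omit_optional_tags=False).render(stream)


# e.g. apply_filter(u'<img src="media/logo.png">', URLAbsolutionFilter,
#                   base_url='https://example.com',
#                   tag_attributes={'img': 'src'})
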
def __iter__(self):
    for token in HTML5LibFilterBase.__iter__(self):
        type = token['type']
        if type in ('StartTag', 'EmptyTag', 'EndTag'):
            name = token['name']
            if name in ('html', 'head', 'body'):
                continue
        yield token

def __iter__(self): for token in html5lib_Filter.__iter__(self): if "StartTag" == token["type"]: # Strip out any attributes that start with "on" token["data"] = [(k, v) for (k, v) in dict(token["data"]).items() if not k.startswith("on")] yield token
def __iter__(self):
    for token in html5lib_Filter.__iter__(self):
        if token['type'] == 'StartTag':
            # Strip out any attributes that start with "on"
            attrs = {}
            for (namespace, name), value in token['data'].items():
                if name.startswith('on'):
                    continue
                attrs[(namespace, name)] = value
            token['data'] = attrs
        yield token

def __iter__(self): for token in html5lib_Filter.__iter__(self): if token["type"] == "StartTag": # Strip out any attributes that start with "on" attrs = {} for (namespace, name), value in token["data"].items(): if name.startswith("on"): continue attrs[(namespace, name)] = value token["data"] = attrs yield token
def __iter__(self):
    for token in html5lib_Filter.__iter__(self):
        if ('StartTag' == token['type']):
            if 'iframe' == token['name']:
                attrs = dict(token['data'])
                src = attrs.get('src', '')
                if src:
                    parts = urlparse(src)
                    if not parts.netloc or parts.netloc not in self.hosts:
                        attrs['src'] = ''
                token['data'] = attrs.items()
        yield token

def __iter__(self):
    for token in html5lib_Filter.__iter__(self):
        if token['type'] == 'StartTag' and token['name'] == 'a':
            attrs = dict(token['data'])
            for (namespace, name), value in attrs.items():
                if name == 'href' and value:
                    if re.search(self.blocked_protocols, value):
                        attrs[(namespace, 'href')] = ''
                        token['data'] = attrs
            yield token
        else:
            yield token

def __iter__(self): for token in html5lib_Filter.__iter__(self): if token["type"] == "StartTag" and token["name"] == "a": attrs = dict(token["data"]) for (namespace, name), value in attrs.items(): if name == "href" and value: if re.search(self.blocked_protocols, value): attrs[(namespace, "href")] = "" token["data"] = attrs yield token else: yield token
def __iter__(self):
    for token in html5lib_Filter.__iter__(self):
        if ('StartTag' == token['type']):
            # Strip out any attributes that start with "on"
            token['data'] = [(k, v)
                             for (k, v) in dict(token['data']).items()
                             if not k.startswith('on')]
        yield token

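# The same "on*"-handler stripper appears above in two forms because
# html5lib changed its token format between releases: older versions expose
# StartTag attributes as a list of (name, value) pairs, newer ones as a
# dict keyed by (namespace, name). A standalone check of both shapes,
# mirroring the logic of the filters above:
legacy_data = [('onclick', 'alert(1)'), ('class', 'note')]
modern_data = {(None, 'onclick'): 'alert(1)', (None, 'class'): 'note'}

legacy_clean = [(k, v) for (k, v) in legacy_data if not k.startswith('on')]
modern_clean = dict(((ns, name), v)
                    for (ns, name), v in modern_data.items()
                    if not name.startswith('on'))

assert legacy_clean == [('class', 'note')]
assert modern_clean == {(None, 'class'): 'note'}
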
def __iter__(self):
    for token in html5lib_Filter.__iter__(self):
        if token['type'] == 'StartTag' and token['name'] == 'pre':
            attrs = dict(token['data'])
            # Iterate over a copy, since the 'function' key is deleted
            # from attrs inside the loop.
            for (namespace, name), value in list(attrs.items()):
                if name == 'function' and value:
                    m = MT_SYNTAX_RE.match(value)
                    if m:
                        lang = m.group(1).lower()
                        brush = MT_SYNTAX_BRUSH_MAP.get(lang, lang)
                        attrs[(namespace, u'class')] = "brush: %s" % brush
                        del attrs[(None, 'function')]
                        token['data'] = attrs
        yield token

def __iter__(self): for token in html5lib_Filter.__iter__(self): if token["type"] == "StartTag" and token["name"] == "pre": attrs = dict(token["data"]) for (namespace, name), value in attrs.items(): if name == "function" and value: m = MT_SYNTAX_RE.match(value) if m: lang = m.group(1).lower() brush = MT_SYNTAX_BRUSH_MAP.get(lang, lang) attrs[(namespace, u"class")] = "brush: %s" % brush del attrs[(None, "function")] token["data"] = attrs yield token
def __iter__(self):
    input = html5lib_Filter.__iter__(self)
    for token in input:
        yield token
        if (token['type'] == 'StartTag' and
                token['name'] in SECTION_TAGS):
            attrs = dict(token['data'])
            for (namespace, name), value in attrs.items():
                if name == 'id' and value:
                    ts = ({'type': 'StartTag', 'name': 'a', 'data': {
                              (None, u'title'): ugettext('Edit section'),
                              (None, u'class'): 'edit-section',
                              (None, u'data-section-id'): value,
                              (None, u'data-section-src-url'):
                                  u'{0!s}?{1!s}'.format(
                                      reverse('wiki.document',
                                              args=[self.slug],
                                              locale=self.locale),
                                      urlencode({
                                          'section': value.encode('utf-8'),
                                          'raw': 'true'})),
                              (None, u'href'):
                                  u'{0!s}?{1!s}'.format(
                                      reverse('wiki.edit',
                                              args=[self.slug],
                                              locale=self.locale),
                                      urlencode({
                                          'section': value.encode('utf-8'),
                                          'edit_links': 'true'}))}},
                          {'type': 'Characters',
                           'data': ugettext(u'Edit')},
                          {'type': 'EndTag', 'name': 'a'})
                    for t in ts:
                        yield t

def __iter__(self):
    for token in html5lib_Filter.__iter__(self):
        if ('StartTag' == token['type']):
            if 'pre' == token['name']:
                attrs = dict(token['data'])
                function = attrs.get('function', None)
                if function:
                    m = MT_SYNTAX_PAT.match(function)
                    if m:
                        lang = m.group(1).lower()
                        brush = MT_SYNTAX_BRUSH_MAP.get(lang, lang)
                        attrs['class'] = "brush: %s" % brush
                        del attrs['function']
                        token['data'] = attrs.items()
        yield token

def __iter__(self): for token in html5lib_Filter.__iter__(self): if "StartTag" == token["type"]: if "pre" == token["name"]: attrs = dict(token["data"]) function = attrs.get("function", None) if function: m = MT_SYNTAX_PAT.match(function) if m: lang = m.group(1).lower() brush = MT_SYNTAX_BRUSH_MAP.get(lang, lang) attrs["class"] = "brush: %s" % brush del attrs["function"] token["data"] = attrs.items() yield token
def __iter__(self):
    in_iframe = False
    for token in html5lib_Filter.__iter__(self):
        if token['type'] == 'StartTag' and token['name'] == 'iframe':
            in_iframe = True
            attrs = dict(token['data'])
            for (namespace, name), value in attrs.items():
                if name == 'src' and value:
                    if not re.search(self.hosts, value):
                        attrs[(namespace, 'src')] = ''
                    token['data'] = attrs
            yield token
        if token['type'] == 'EndTag' and token['name'] == 'iframe':
            in_iframe = False
        if not in_iframe:
            yield token

def __iter__(self): in_iframe = False for token in html5lib_Filter.__iter__(self): if token["type"] == "StartTag" and token["name"] == "iframe": in_iframe = True attrs = dict(token["data"]) for (namespace, name), value in attrs.items(): if name == "src" and value: if not re.search(self.hosts, value): attrs[(namespace, "src")] = "" token["data"] = attrs yield token if token["type"] == "EndTag" and token["name"] == "iframe": in_iframe = False if not in_iframe: yield token
def __iter__(self):
    input = html5lib_Filter.__iter__(self)
    for token in input:
        yield token
        if ('StartTag' == token['type'] and
                token['name'] in SECTION_TAGS):
            attrs = dict(token['data'])
            id = attrs.get('id', None)
            if id:
                out = (
                    {'type': 'StartTag', 'name': 'a', 'data': {
                        'title': _('Edit section'),
                        'class': 'edit-section',
                        'data-section-id': id,
                        'data-section-src-url': u'%s?%s' % (
                            reverse('wiki.document',
                                    args=[self.full_path],
                                    locale=self.locale),
                            urlencode({'section': id.encode('utf-8'),
                                       'raw': 'true'})),
                        'href': u'%s?%s' % (
                            reverse('wiki.edit_document',
                                    args=[self.full_path],
                                    locale=self.locale),
                            urlencode({'section': id.encode('utf-8'),
                                       'edit_links': 'true'}))}},
                    {'type': 'Characters', 'data': _('Edit')},
                    {'type': 'EndTag', 'name': 'a'})
                for t in out:
                    yield t

def __iter__(self):
    in_iframe = False
    for token in html5lib_Filter.__iter__(self):
        if ('StartTag' == token['type']):
            if 'iframe' == token['name']:
                in_iframe = True
                attrs = dict(token['data'])
                src = attrs.get('src', '')
                if src:
                    if not re.search(self.hosts, src):
                        attrs['src'] = ''
                token['data'] = attrs.items()
                yield token
        if ('EndTag' == token['type']):
            if 'iframe' == token['name']:
                in_iframe = False
        if not in_iframe:
            yield token

def __iter__(self): in_iframe = False for token in html5lib_Filter.__iter__(self): if "StartTag" == token["type"]: if "iframe" == token["name"]: in_iframe = True attrs = dict(token["data"]) src = attrs.get("src", "") if src: if not re.search(self.hosts, src): attrs["src"] = "" token["data"] = attrs.items() yield token if "EndTag" == token["type"]: if "iframe" == token["name"]: in_iframe = False if not in_iframe: yield token
def __iter__(self):
    input = html5lib_Filter.__iter__(self)

    # Pass 1: Collect all known IDs from the stream
    buffer = []
    for token in input:
        buffer.append(token)
        if 'StartTag' == token['type']:
            attrs = dict(token['data'])
            if 'id' in attrs:
                self.known_ids.add(attrs['id'])

    # Pass 2: Sprinkle in IDs where they're missing
    for token in buffer:
        if ('StartTag' == token['type'] and
                token['name'] in SECTION_EDIT_TAGS):
            attrs = dict(token['data'])
            id = attrs.get('id', None)
            if not id:
                attrs['id'] = self.gen_id()
                token['data'] = attrs.items()
        yield token

def __iter__(self):
    input = html5lib_Filter.__iter__(self)
    for token in input:
        yield token
        if (token['type'] == 'StartTag' and
                token['name'] in SECTION_TAGS):
            attrs = dict(token['data'])
            for (namespace, name), value in attrs.items():
                if name == 'id' and value:
                    ts = ({'type': 'StartTag', 'name': 'a', 'data': {
                              (None, u'title'): ugettext('Edit section'),
                              (None, u'class'): 'edit-section',
                              (None, u'data-section-id'): value,
                              (None, u'data-section-src-url'): u'%s?%s' % (
                                  reverse('wiki.document',
                                          args=[self.slug],
                                          locale=self.locale),
                                  urlencode({
                                      'section': value.encode('utf-8'),
                                      'raw': 'true'})),
                              (None, u'href'): u'%s?%s' % (
                                  reverse('wiki.edit',
                                          args=[self.slug],
                                          locale=self.locale),
                                  urlencode({
                                      'section': value.encode('utf-8'),
                                      'edit_links': 'true'}))}},
                          {'type': 'Characters',
                           'data': ugettext(u'Edit')},
                          {'type': 'EndTag', 'name': 'a'})
                    for t in ts:
                        yield t

def __iter__(self): input = html5lib_Filter.__iter__(self) for token in input: if "StartTag" == token["type"] and token["name"] in HEAD_TAGS_TOC: self.in_header = True out = () level_match = re.compile(r"^h(\d)$").match(token["name"]) level = int(level_match.group(1)) if level > self.level: diff = level - self.level for i in range(diff): out += ({"type": "StartTag", "name": "ol", "data": {}},) self.level = level elif level < self.level: diff = self.level - level for i in range(diff): out += ({"type": "EndTag", "name": "li"}, {"type": "EndTag", "name": "ol"}) self.level = level attrs = dict(token["data"]) id = attrs.get("id", None) if id: out += ( {"type": "StartTag", "name": "li", "data": {}}, {"type": "StartTag", "name": "a", "data": {"rel": "internal", "href": "#%s" % id}}, ) for t in out: yield t elif "Characters" == token["type"] and self.in_header: yield token elif "EndTag" == token["type"] and token["name"] in HEAD_TAGS_TOC: self.in_header = False level_match = re.compile(r"^h(\d)$").match(token["name"]) level = int(level_match.group(1)) out = ({"type": "EndTag", "name": "a"},) for t in out: yield t
def __iter__(self): input = html5lib_Filter.__iter__(self) for token in input: yield token if token["type"] == "StartTag" and token["name"] in SECTION_TAGS: attrs = dict(token["data"]) for (namespace, name), value in attrs.items(): if name == "id" and value: ts = ( { "type": "StartTag", "name": "a", "data": { (None, u"title"): ugettext("Edit section"), (None, u"class"): "edit-section", (None, u"data-section-id"): value, (None, u"data-section-src-url"): u"%s?%s" % ( reverse("wiki.document", args=[self.slug], locale=self.locale), urlencode({"section": value.encode("utf-8"), "raw": "true"}), ), (None, u"href"): u"%s?%s" % ( reverse("wiki.edit", args=[self.slug], locale=self.locale), urlencode({"section": value.encode("utf-8"), "edit_links": "true"}), ), }, }, {"type": "Characters", "data": ugettext(u"Edit")}, {"type": "EndTag", "name": "a"}, ) for t in ts: yield t
def __iter__(self): input = html5lib_Filter.__iter__(self) for token in input: yield token if "StartTag" == token["type"] and token["name"] in SECTION_TAGS: attrs = dict(token["data"]) id = attrs.get("id", None) if id: out = ( { "type": "StartTag", "name": "a", "data": { "title": _("Edit section"), "class": "edit-section", "data-section-id": id, "data-section-src-url": u"%s?%s" % ( reverse("wiki.document", args=[self.full_path], locale=self.locale), urlencode({"section": id.encode("utf-8"), "raw": "true"}), ), "href": u"%s?%s" % ( reverse("wiki.edit_document", args=[self.full_path], locale=self.locale), urlencode({"section": id.encode("utf-8"), "edit_links": "true"}), ), }, }, {"type": "Characters", "data": _("Edit")}, {"type": "EndTag", "name": "a"}, ) for t in out: yield t
def __iter__(self):
    input = html5lib_Filter.__iter__(self)
    for token in input:

        # Section start was deferred, so start it now.
        if self.next_in_section:
            self.next_in_section = False
            self.in_section = True

        if 'StartTag' == token['type']:
            attrs = dict(token['data'])
            self.open_level += 1

            # Have we encountered the section or heading element we're
            # looking for?
            if attrs.get('id', None) == self.section_id:

                # If we encounter a section element that matches the ID,
                # then we'll want to scoop up all its children as an
                # explicit section.
                if (self.parent_level is None and self._isSection(token)):
                    self.parent_level = self.open_level
                    # Defer the start of the section, so the section parent
                    # itself isn't included.
                    self.next_in_section = True

                # If we encounter a heading element that matches the ID, we
                # start an implicit section.
                elif (self.heading is None and self._isHeading(token)):
                    self.heading = token
                    self.heading_rank = self._getHeadingRank(token)
                    self.parent_level = self.open_level - 1
                    self.in_section = True

            # If started an implicit section, these rules apply to
            # siblings...
            elif (self.heading is not None and
                    self.open_level - 1 == self.parent_level):

                # The implicit section should stop if we hit another
                # sibling heading whose rank is equal or higher, since that
                # starts a new implicit section
                if (self._isHeading(token) and
                        self._getHeadingRank(token) <= self.heading_rank):
                    self.in_section = False

        if 'EndTag' == token['type']:
            self.open_level -= 1

            # If the parent of the section has ended, end the section.
            # This applies to both implicit and explicit sections.
            if (self.parent_level is not None and
                    self.open_level < self.parent_level):
                self.in_section = False

        # If there's no replacement source, then this is a section
        # extraction. So, emit tokens while we're in the section.
        if not self.replace_source:
            if self.in_section:
                yield token

        # If there is a replacement source, then this is a section
        # replacement. Emit tokens of the source stream until we're in the
        # section, then emit the replacement stream and ignore the rest of
        # the source stream for the section.
        else:
            if not self.in_section:
                yield token
            elif not self.replacement_emitted:
                for r_token in self.replace_source:
                    yield r_token
                self.replacement_emitted = True

def __iter__(self):
    from wiki.models import Document

    input = html5lib_Filter.__iter__(self)

    # Pass #1: Gather all the link URLs and prepare annotations
    links = dict()
    buffer = []
    for token in input:
        buffer.append(token)
        if ('StartTag' == token['type'] and 'a' == token['name']):
            attrs = dict(token['data'])
            if 'href' not in attrs:
                continue
            href = attrs['href']
            if href.startswith(self.base_url):
                # Squash site-absolute URLs to site-relative paths.
                href = '/%s' % href[len(self.base_url):]
            # Prepare annotations record for this path.
            links[href] = dict(classes=[])

    # Run through all the links and check for annotatable conditions.
    for href in links.keys():

        # Is this an external URL?
        is_external = False
        for prefix in self.EXTERNAL_PREFIXES:
            if href.startswith(prefix):
                is_external = True
                break
        if is_external:
            links[href]['classes'].append('external')
            continue

        # TODO: Should this also check for old-school mindtouch URLs? Or
        # should we encourage editors to convert to new-style URLs to take
        # advantage of link annotation? (I'd say the latter)

        # Is this a kuma doc URL?
        if '/docs/' in href:

            # Check if this is a special docs path that's exempt from "new"
            skip = False
            for path in DOC_SPECIAL_PATHS:
                if '/docs/%s' % path in href:
                    skip = True
            if skip:
                continue

            href_locale, href_path = href.split(u'/docs/', 1)
            if href_locale.startswith(u'/'):
                href_locale = href_locale[1:]

            if '#' in href_path:
                # If present, discard the hash anchor
                href_path, _, _ = href_path.partition('#')

            # Handle any URL-encoded UTF-8 characters in the path
            href_path = href_path.encode('utf-8', 'ignore')
            href_path = urllib.unquote(href_path)
            href_path = href_path.decode('utf-8', 'ignore')

            # Try to sort out the locale and slug through some of our
            # redirection logic.
            locale, slug, needs_redirect = (
                Document.locale_and_slug_from_path(
                    href_path, path_locale=href_locale))

            # Does this locale and slug correspond to an existing document?
            # If not, mark it as a "new" link.
            #
            # TODO: Should these DB queries be batched up into one big
            # query? A page with hundreds of links will fire off hundreds
            # of queries
            ct = Document.objects.filter(locale=locale, slug=slug).count()
            if ct == 0:
                links[href]['classes'].append('new')

    # Pass #2: Filter the content, annotating links
    for token in buffer:
        if ('StartTag' == token['type'] and 'a' == token['name']):
            attrs = dict(token['data'])
            if 'href' in attrs:
                href = attrs['href']
                if href.startswith(self.base_url):
                    # Squash site-absolute URLs to site-relative paths.
                    href = '/%s' % href[len(self.base_url):]
                if href in links:
                    # Update class names on this link element.
                    if 'class' in attrs:
                        classes = set(attrs['class'].split(u' '))
                    else:
                        classes = set()
                    classes.update(links[href]['classes'])
                    if classes:
                        attrs['class'] = u' '.join(classes)
                    token['data'] = attrs.items()
        yield token

def __init__(self, source):
    html5lib_Filter.__init__(self, source)
    self.id_cnt = 0
    self.known_ids = set()

def __iter__(self):
    input = html5lib_Filter.__iter__(self)

    # Pass 1: Collect all known IDs from the stream
    buffer = []
    for token in input:
        buffer.append(token)
        if 'StartTag' == token['type']:
            attrs = dict(token['data'])
            if 'id' in attrs:
                self.known_ids.add(attrs['id'])
            if 'name' in attrs:
                self.known_ids.add(attrs['name'])

    # Pass 2: Sprinkle in IDs where they're needed
    while len(buffer):
        token = buffer.pop(0)

        if not ('StartTag' == token['type'] and
                token['name'] in SECTION_TAGS):
            yield token
        else:
            attrs = dict(token['data'])

            # Treat a name attribute as a human-specified ID override
            name = attrs.get('name', None)
            if name:
                attrs['id'] = name
                token['data'] = attrs.items()
                yield token
                continue

            # If this is not a header, then generate a section ID.
            if token['name'] not in HEAD_TAGS:
                attrs['id'] = self.gen_id()
                token['data'] = attrs.items()
                yield token
                continue

            # If this is a header, then scoop up the rest of the header and
            # gather the text it contains.
            start, text, tmp = token, [], []
            while len(buffer):
                token = buffer.pop(0)
                tmp.append(token)
                if token['type'] in ('Characters', 'SpaceCharacters'):
                    text.append(token['data'])
                elif ('EndTag' == token['type'] and
                        start['name'] == token['name']):
                    # Note: This is naive, and doesn't track other
                    # start/end tags nested in the header. Odd things might
                    # happen in a case like <h1><h1></h1></h1>. But, that's
                    # invalid markup and the worst case should be a
                    # truncated ID because all the text wasn't accumulated.
                    break

            # Slugify the text we found inside the header, generate an ID
            # as a last resort.
            slug = self.slugify(u''.join(text))
            if not slug:
                slug = self.gen_id()
            attrs['id'] = slug
            start['data'] = attrs.items()

            # Finally, emit the tokens we scooped up for the header.
            yield start
            for t in tmp:
                yield t

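# The ID filters above rely on self.gen_id() and self.slugify(), which are
# not shown in this section. A minimal sketch of plausible implementations,
# assuming sequential "sect_N" IDs that avoid collisions with the IDs
# already collected in self.known_ids during pass 1:
import re


def gen_id(self):
    # Generate a unique section ID, skipping any ID already seen.
    while True:
        self.id_cnt += 1
        candidate = 'sect_%s' % self.id_cnt
        if candidate not in self.known_ids:
            self.known_ids.add(candidate)
            return candidate


def slugify(self, text):
    # Turn header text into an anchor-friendly slug,
    # e.g. u"Getting started!" -> u"Getting_started"
    cleaned = re.sub(r'[^\w\s-]', '', text, flags=re.UNICODE)
    return re.sub(r'\s+', '_', cleaned.strip())
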
def __iter__(self):
    buffer = []
    for token in html5lib_Filter.__iter__(self):
        buffer.append(token)

    while len(buffer):
        token = buffer.pop(0)

        if not ('StartTag' == token['type'] and
                'span' == token['name']):
            yield token
            continue

        attrs = dict(token['data'])
        if attrs.get('class', '') != 'script':
            yield token
            continue

        ds_call = []
        while len(buffer):
            token = buffer.pop(0)
            if token['type'] in ('Characters', 'SpaceCharacters'):
                ds_call.append(token['data'])
            elif 'StartTag' == token['type']:
                attrs = token['data']
                if attrs:
                    a_out = (u' %s' % u' '.join(
                        (u'%s=%s' % (name, quoteattr(val))
                         for name, val in attrs)))
                else:
                    a_out = u''
                ds_call.append(u'<%s%s>' % (token['name'], a_out))
            elif 'EndTag' == token['type']:
                if 'span' == token['name']:
                    break
                ds_call.append('</%s>' % token['name'])

        ds_call = u''.join(ds_call).strip()

        # Snip off any "template." prefixes
        strip_prefixes = ('template.', 'wiki.')
        for prefix in strip_prefixes:
            if ds_call.lower().startswith(prefix):
                ds_call = ds_call[len(prefix):]

        # template("template name", [ "params" ])
        wt_re = re.compile(
            r'''^template\(['"]([^'"]+)['"],\s*\[([^\]]+)]''', re.I)
        m = wt_re.match(ds_call)
        if m:
            ds_call = '%s(%s)' % (m.group(1), m.group(2).strip())

        # template("template name")
        wt_re = re.compile(r'''^template\(['"]([^'"]+)['"]''', re.I)
        m = wt_re.match(ds_call)
        if m:
            ds_call = '%s()' % (m.group(1))

        # HACK: This is dirty, but seems like the easiest way to
        # reconstitute the token stream, including what gets parsed as
        # markup in the middle of macro parameters.
        #
        # eg. {{ Note("This is <strong>strongly</strong> discouraged") }}
        parsed = parse('{{ %s }}' % ds_call)
        for token in parsed.stream:
            yield token

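# Standalone checks of the two template-call rewrites above; both patterns
# are copied verbatim from the filter:
import re

wt_args = re.compile(r'''^template\(['"]([^'"]+)['"],\s*\[([^\]]+)]''', re.I)
m = wt_args.match(u'template("Note", [ "arg1", "arg2" ])')
assert '%s(%s)' % (m.group(1), m.group(2).strip()) == u'Note("arg1", "arg2")'

wt_noargs = re.compile(r'''^template\(['"]([^'"]+)['"]''', re.I)
m = wt_noargs.match(u'template("CompatUnknown")')
assert '%s()' % m.group(1) == u'CompatUnknown()'
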
def __init__(self, source, base_url):
    html5lib_Filter.__init__(self, source)
    self.base_url = base_url

def __init__(self, source, hosts):
    html5lib_Filter.__init__(self, source)
    self.hosts = hosts

def __init__(self, source, base_url, tag_attributes):
    html5lib_Filter.__init__(self, source)
    self.base_url = base_url
    self.tag_attributes = tag_attributes

def __iter__(self):
    input = html5lib_Filter.__iter__(self)
    for token in input:
        if ('StartTag' == token['type'] and
                token['name'] in HEAD_TAGS_TOC):
            self.in_header = True
            out = ()
            level_match = re.compile(r'^h(\d)$').match(token['name'])
            level = int(level_match.group(1))
            if level > self.level:
                diff = level - self.level
                for i in range(diff):
                    if (not self.in_hierarchy and i % 2 == 0):
                        out += ({'type': 'StartTag', 'name': 'li',
                                 'data': {}},)
                    out += ({'type': 'StartTag', 'name': 'ol',
                             'data': {}},)
                    if (diff > 1 and i % 2 == 0 and i != diff - 1):
                        out += ({'type': 'StartTag', 'name': 'li',
                                 'data': {}},)
                    self.open_level += 1
                self.level = level
            elif level < self.level:
                diff = self.level - level
                for i in range(diff):
                    out += ({'type': 'EndTag', 'name': 'ol'},
                            {'type': 'EndTag', 'name': 'li'})
                    self.open_level -= 1
                self.level = level
            attrs = dict(token['data'])
            id = attrs.get('id', None)
            if id:
                out += (
                    {'type': 'StartTag', 'name': 'li', 'data': {}},
                    {'type': 'StartTag', 'name': 'a',
                     'data': {'rel': 'internal', 'href': '#%s' % id}},
                )
                self.in_hierarchy = True
                for t in out:
                    yield t
        elif ('StartTag' == token['type'] and
                token['name'] in TAGS_IN_TOC):
            yield token
        elif (token['type'] in ("Characters", "SpaceCharacters") and
                self.in_header):
            yield token
        elif ('EndTag' == token['type'] and
                token['name'] in TAGS_IN_TOC):
            yield token
        elif ('EndTag' == token['type'] and
                token['name'] in HEAD_TAGS_TOC):
            self.in_header = False
            out = ({'type': 'EndTag', 'name': 'a'},)
            for t in out:
                yield t

    if self.open_level > 0:
        out = ()
        for i in range(self.open_level):
            out += ({'type': 'EndTag', 'name': 'ol'},
                    {'type': 'EndTag', 'name': 'li'})
        for t in out:
            yield t

def __init__(self, source, base_url):
    html5lib_Filter.__init__(self, source)
    self.base_url = base_url
    self.base_url_parsed = urlparse(base_url)

def __init__(self, source, full_path, locale):
    html5lib_Filter.__init__(self, source)
    self.full_path = full_path
    self.locale = locale

def __iter__(self):
    for token in html5lib_Filter.__iter__(self):
        if 'SpaceCharacters' == token['type']:
            continue
        yield token

def __iter__(self):
    from wiki.models import Document

    input = html5lib_Filter.__iter__(self)

    # Pass #1: Gather all the link URLs and prepare annotations
    links = dict()
    buffer = []
    for token in input:
        buffer.append(token)
        if ('StartTag' == token['type'] and 'a' == token['name']):
            attrs = dict(token['data'])
            if 'href' not in attrs:
                continue
            href = attrs['href']
            href_parsed = urlparse(href)
            if href_parsed.netloc == self.base_url_parsed.netloc:
                # Squash site-absolute URLs to site-relative paths.
                href = href_parsed.path
            # Prepare annotations record for this path.
            links[href] = dict(classes=[])

    needs_existence_check = defaultdict(lambda: defaultdict(set))

    # Run through all the links and check for annotatable conditions.
    for href in links.keys():

        # Is this an external URL?
        is_external = False
        for prefix in self.EXTERNAL_PREFIXES:
            if href.startswith(prefix):
                is_external = True
                break
        if is_external:
            links[href]['classes'].append('external')
            continue

        # TODO: Should this also check for old-school mindtouch URLs? Or
        # should we encourage editors to convert to new-style URLs to take
        # advantage of link annotation? (I'd say the latter)

        # Is this a kuma doc URL?
        if '/docs/' in href:

            # Check if this is a special docs path that's exempt from "new"
            skip = False
            for path in DOC_SPECIAL_PATHS:
                if '/docs/%s' % path in href:
                    skip = True
            if skip:
                continue

            href_locale, href_path = href.split(u'/docs/', 1)
            if href_locale.startswith(u'/'):
                href_locale = href_locale[1:]

            if '#' in href_path:
                # If present, discard the hash anchor
                href_path, _, _ = href_path.partition('#')

            # Handle any URL-encoded UTF-8 characters in the path
            href_path = href_path.encode('utf-8', 'ignore')
            href_path = urllib.unquote(href_path)
            href_path = href_path.decode('utf-8', 'ignore')

            # Try to sort out the locale and slug through some of our
            # redirection logic.
            locale, slug, needs_redirect = (
                Document.locale_and_slug_from_path(
                    href_path, path_locale=href_locale))

            # Gather up this link for existence check
            needs_existence_check[locale.lower()][slug.lower()].add(href)

    # Perform existence checks for all the links, using one DB query per
    # locale for all the candidate slugs.
    for locale, slug_hrefs in needs_existence_check.items():

        existing_slugs = (Document.objects
                          .filter(locale=locale,
                                  slug__in=slug_hrefs.keys())
                          .values_list('slug', flat=True))

        # Remove the slugs that pass existence check.
        for slug in existing_slugs:
            lslug = slug.lower()
            if lslug in slug_hrefs:
                del slug_hrefs[lslug]

        # Mark all the links whose slugs did not come back from the DB
        # query as "new"
        for slug, hrefs in slug_hrefs.items():
            for href in hrefs:
                links[href]['classes'].append('new')

    # Pass #2: Filter the content, annotating links
    for token in buffer:
        if ('StartTag' == token['type'] and 'a' == token['name']):
            attrs = dict(token['data'])
            if 'href' in attrs:
                href = attrs['href']
                href_parsed = urlparse(href)
                if href_parsed.netloc == self.base_url_parsed.netloc:
                    # Squash site-absolute URLs to site-relative paths.
                    href = href_parsed.path
                if href in links:
                    # Update class names on this link element.
                    if 'class' in attrs:
                        classes = set(attrs['class'].split(u' '))
                    else:
                        classes = set()
                    classes.update(links[href]['classes'])
                    if classes:
                        attrs['class'] = u' '.join(classes)
                    token['data'] = attrs.items()
        yield token

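# The final filter batches the document-existence checks: one query per
# locale instead of one query per link (the TODO in the earlier variant).
# The pattern, distilled to plain data structures with hypothetical inputs
# (a set stands in for the DB query result):
from collections import defaultdict

needs_check = defaultdict(lambda: defaultdict(set))
needs_check['en-us']['firefox'].add('/en-US/docs/Firefox')
needs_check['en-us']['nosuchpage'].add('/en-US/docs/NoSuchPage')

existing = set([('en-us', 'firefox')])

new_hrefs = set()
for locale, slug_hrefs in needs_check.items():
    for slug, hrefs in slug_hrefs.items():
        if (locale, slug) not in existing:
            new_hrefs.update(hrefs)

assert new_hrefs == set(['/en-US/docs/NoSuchPage'])
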