def __init__(self, source):
    html5lib_Filter.__init__(self, source)
    self.level = 2
    self.max_level = 3
    self.in_header = False
    self.open_level = 0
    self.in_hierarchy = False
def __init__(self, source, id):
    html5lib_Filter.__init__(self, source)
    self.section_id = id
    self.open_level = 0
    self.parent_level = None
    self.in_section = False
    self.skip = False
def __iter__(self):
    input = html5lib_Filter.__iter__(self)
    for token in input:
        if (token['type'] == 'StartTag' and
                token['name'] in self.tag_attributes):
            attrs = dict(token['data'])

            # If the element has the attribute we're looking for
            desired_attr = self.tag_attributes[token['name']]

            for (namespace, name), value in attrs.items():
                if desired_attr == name:
                    if not value.startswith('http'):
                        if value.startswith('//') or value.startswith('{{'):
                            # Do nothing for absolute addresses or apparent
                            # template variable output
                            attrs[(namespace, name)] = value
                        elif value.startswith('/'):
                            # Starts with "/", so just add the base url
                            attrs[(namespace, name)] = self.base_url + value
                        else:
                            attrs[(namespace, name)] = (
                                self.base_url + '/' + value)
                        token['data'] = attrs
                    break

        yield token
def __iter__(self):
    input = html5lib_Filter.__iter__(self)

    # loop through all 'tokens'
    for token in input:

        # if this token is a start tag...
        if token['type'] == 'StartTag':

            # increment counter that tracks nesting
            self.open_level += 1

            for key in token['data']:
                if 'id' == key[1] and token['data'][key] == self.section_id:
                    # note we're in the matching section
                    self.in_section = True
                    # keep track of how nested we were when section started
                    self.parent_level = self.open_level

        elif token['type'] == 'EndTag':
            # If the parent of the section has ended, end the section.
            if (self.parent_level is not None and
                    self.open_level == self.parent_level):
                self.in_section = False
                self.skip = True
                self.parent_level = None

            # reduce nesting counter
            self.open_level -= 1

        # emit tokens if we're not in the section being removed
        if not self.in_section and not self.skip:
            yield token
        else:
            self.skip = False
def __init__(self, source, id, replace_source=None, ignore_heading=False):
    html5lib_Filter.__init__(self, source)
    self.replace_source = replace_source
    self.ignore_heading = ignore_heading
    self.section_id = id
    self.heading = None
    self.heading_rank = None
    self.open_level = 0
    self.parent_level = None
    self.in_section = False
    self.heading_to_ignore = None
    self.already_ignored_header = False
    self.next_in_section = False
    self.replacement_emitted = False
def __iter__(self):
    isScript = None
    for token in Filter.__iter__(self):
        ttype = token['type']
        if ttype == 'StartTag':
            tname = token['name']
            tdata = token['data']
            if self.f(token):
                self.inside = 0
            if tname in {'script', 'style'}:
                isScript = 0
        if isScript is not None:
            if ttype == 'EndTag':
                isScript -= 1
                if isScript <= 0:
                    isScript = None
        elif self.inside is not None:
            if ttype == 'StartTag':
                self.inside += 1
            if ttype == 'EndTag':
                self.inside -= 1
                if self.inside <= 0:
                    self.inside = None
            yield token
def __iter__(self, _title_attr=(None, 'title')):
    html_ns = namespaces['html']
    for token in BaseFilter.__iter__(self):
        yield token
        if (
            token['type'] == 'EmptyTag'
            and token['name'] == 'img'
            and token['namespace'] == html_ns
            and 'data' in token
        ):
            attrs = token['data']
            if _title_attr in attrs:
                yield {
                    'type': 'StartTag',
                    'namespace': html_ns,
                    'name': 'aside',
                    'data': OrderedDict(),
                    # TODO Some way to pass through special styling.
                }
                yield {
                    'type': 'Characters',
                    'data': attrs[_title_attr],
                }
                yield {
                    'type': 'EndTag',
                    'namespace': html_ns,
                    'name': 'aside',
                }
def __iter__(self):
    for token in Filter.__iter__(self):
        if token['type'] in ['StartTag', 'EmptyTag'] and token['data']:
            for attr, value in token['data'].items():
                token['data'][attr] = 'moo'
        yield token
def __iter__(self): input = html5lib_Filter.__iter__(self) for token in input: yield token if token["type"] == "StartTag" and token["name"] in SECTION_TAGS: attrs = dict(token["data"]) for (namespace, name), value in attrs.items(): if name == "id" and value: ts = ( { "type": "StartTag", "name": "a", "data": { (None, "title"): gettext("Edit section"), (None, "class"): "edit-section", (None, "data-section-id"): value, (None, "data-section-src-url"): order_params( "%s?%s" % ( reverse( "wiki.document", args=[self.slug], locale=self.locale, ), urlencode( { "section": value.encode(), "raw": "true", } ), ) ), (None, "href"): order_params( "%s?%s" % ( reverse( "wiki.edit", args=[self.slug], locale=self.locale, ), ( urlencode( { "section": value.encode(), "edit_links": "true", } ) ), ) ), }, }, {"type": "Characters", "data": gettext("Edit")}, {"type": "EndTag", "name": "a"}, ) for t in ts: yield t
def __iter__(self): for token in Filter.__iter__(self): if token["type"] == "StartTag" and token["name"] == "p": continue elif token["type"] == "EndTag" and token["name"] == "p": yield (self.NEWLINE_TOKEN) continue yield (token)
def __iter__(self):
    for token in Filter.__iter__(self):
        if token['type'] in ['StartTag', 'EmptyTag'] and token['data']:
            if token['name'] == 'img':
                for attr, value in token['data'].items():
                    token['data'][attr] = image_rewrite(
                        urljoin(result['url'], token['data'][attr]),
                        result['@id'])
        yield token
def __iter__(self):
    for token in HTML5LibFilterBase.__iter__(self):
        type = token['type']
        if type in ('StartTag', 'EmptyTag', 'EndTag'):
            name = token['name']
            if name in ('html', 'head', 'body'):
                continue
        yield token
def __iter__(self):
    stack = []
    for token in Filter.__iter__(self):
        if 'name' in token and token['name'] in REMOVED_TAGS:
            if token['type'] == 'StartTag':
                stack.append(token['name'])
            elif token['type'] == 'EndTag':
                stack.pop(-1)
        elif not stack:
            yield token
def __iter__(self):
    for token in Filter.__iter__(self):
        data = token.get('data')
        if data and token['type'] in {'StartTag', 'EmptyTag'}:
            newdata = {}
            for (namespace, k), v in data.items():
                if k.lower() not in self.attributes:
                    newdata[(namespace, k)] = v
            token['data'] = newdata
        yield token
def __iter__(self): for token in Filter.__iter__(self): if token["type"] == "StartTag" and token["name"] == "a" and token[ "data"]: url = token["data"].get((None, "href")) if url is not None: actual_url = parse_qs(urlparse(url).query).get("q") if actual_url is not None and len(actual_url) > 0: token["data"][(None, "href")] = actual_url[0] yield (token)
def __iter__(self):
    for token in html5lib_Filter.__iter__(self):
        if token['type'] == 'StartTag':
            # Strip out any attributes that start with "on"
            attrs = {}
            for (namespace, name), value in token['data'].items():
                if name.startswith('on'):
                    continue
                attrs[(namespace, name)] = value
            token['data'] = attrs
        yield token
def __iter__(self): for token in html5lib_Filter.__iter__(self): if token["type"] == "StartTag": # Strip out any attributes that start with "on" attrs = {} for (namespace, name), value in token["data"].items(): if name.startswith("on"): continue attrs[(namespace, name)] = value token["data"] = attrs yield token
def __iter__(self):
    delete = 0
    for token in Filter.__iter__(self):
        tokenType = token['type']
        if tokenType in {'StartTag', 'EmptyTag'}:
            if delete > 0 or token['name'].lower() in self.tags:
                delete += 1
        if delete == 0:
            yield token
        if tokenType == 'EndTag' and delete > 0:
            delete -= 1
def __iter__(self): for token in Filter.__iter__(self): if token["type"] == "StartTag" and token['name'] == "a": href = token["data"][(None, 'href')] if href.startswith("mailto:"): obfuscated = obfuscate(href[7:]) href = "mailto:{}".format(obfuscated) token["data"][(None, 'href')] = href yield token
def __iter__(self, _SRC_ATTR=(None, 'src'),
             _youtube_hosts=('youtube.com', 'www.youtube.com',
                             'youtube-nocookie.com',
                             'www.youtube-nocookie.com')):
    html_ns = namespaces['html']
    elide = False
    for token in BaseFilter.__iter__(self):
        token_type = token['type']
        if elide:
            # NOTE html5lib doesn't permit nesting <iframe> tags,
            # (presumably because HTML5 doesn't permit it). Therefore we
            # don't need to deal with that case here, just wait for the
            # first end tag.
            if token_type == 'EndTag' and token['name'] == 'iframe':
                elide = False
        else:
            if (
                token_type == 'StartTag'
                and token['name'] == 'iframe'
                and token['namespace'] == html_ns
                and 'data' in token
                and _SRC_ATTR in token['data']
            ):
                url = URL.from_text(token['data'][_SRC_ATTR])
                if (url.absolute and url.host in _youtube_hosts and
                        len(url.path) == 2 and url.path[0] == 'embed'):
                    yield {
                        'type': 'StartTag',
                        'namespace': html_ns,
                        'name': 'a',
                        'data': OrderedDict([
                            ((None, 'href'), self._watch_url(url).to_text()),
                        ]),
                    }
                    yield {
                        'type': 'EmptyTag',
                        'namespace': html_ns,
                        'name': u'img',
                        'data': OrderedDict([
                            ((None, 'alt'), 'YouTube video'),
                            (_SRC_ATTR, self._thumbnail_url(url).to_text()),
                            ((None, 'width'), '320'),
                            ((None, 'height'), '180'),
                        ]),
                    }
                    yield {
                        'type': 'EndTag',
                        'namespace': html_ns,
                        'name': 'a',
                    }
                    elide = True
                else:
                    yield token
            else:
                yield token
def __iter__(self):
    for token in Filter.__iter__(self):
        ret = self.sanitize_token(token)
        if not ret:
            continue
        if isinstance(ret, list):
            for subtoken in ret:
                yield subtoken
        else:
            yield ret
def __iter__(self):
    for token in html5lib_Filter.__iter__(self):
        if token['type'] == 'StartTag' and token['name'] == 'a':
            attrs = dict(token['data'])
            for (namespace, name), value in attrs.items():
                if name == 'href' and value:
                    if re.search(self.blocked_protocols, value):
                        attrs[(namespace, 'href')] = ''
            token['data'] = attrs
            yield token
        else:
            yield token
def __iter__(self):
    input = html5lib_Filter.__iter__(self)
    for token in input:
        yield token
        if token['type'] == 'StartTag' and token['name'] in SECTION_TAGS:
            attrs = dict(token['data'])
            for (namespace, name), value in attrs.items():
                if name == 'id' and value:
                    ts = ({'type': 'StartTag',
                           'name': 'a',
                           'data': {
                               (None, u'title'): ugettext('Edit section'),
                               (None, u'class'): 'edit-section',
                               (None, u'data-section-id'): value,
                               (None, u'data-section-src-url'): order_params(
                                   u'%s?%s' %
                                   (reverse('wiki.document',
                                            args=[self.slug],
                                            locale=self.locale),
                                    urlencode({'section': value.encode('utf-8'),
                                               'raw': 'true'}))),
                               (None, u'href'): order_params(
                                   u'%s?%s' %
                                   (reverse('wiki.edit',
                                            args=[self.slug],
                                            locale=self.locale),
                                    (urlencode({'section': value.encode('utf-8'),
                                                'edit_links': 'true'}))))
                           }},
                          {'type': 'Characters', 'data': ugettext(u'Edit')},
                          {'type': 'EndTag', 'name': 'a'})
                    for t in ts:
                        yield t
def __iter__(self): tokens = Filter.__iter__(self) while True: for token in tokens: if token["type"] == "StartTag" and token["name"].lower() in self._tags: break yield token else: # we ran out of tokens break for token in tokens: if token["type"] == "EndTag" and token["name"].lower() in self._tags: break
def __iter__(self):
    for token in html5lib_Filter.__iter__(self):
        if token['type'] == 'StartTag' and token['name'] == 'pre':
            attrs = dict(token['data'])
            # Iterate over a copy so the 'function' key can be deleted
            # safely below.
            for (namespace, name), value in attrs.copy().items():
                if name == 'function' and value:
                    m = MT_SYNTAX_RE.match(value)
                    if m:
                        lang = m.group(1).lower()
                        brush = MT_SYNTAX_BRUSH_MAP.get(lang, lang)
                        attrs[(namespace, u'class')] = "brush: %s" % brush
                        del attrs[(None, 'function')]
                        token['data'] = attrs
        yield token
def __iter__(self):
    remove_end_tag = False
    for token in Filter.__iter__(self):
        # only check anchor tags
        if ('name' in token and token['name'] == 'a' and
                token['type'] in ['StartTag', 'EndTag']):
            if token['type'] == 'StartTag' and token['data'] == {}:
                remove_end_tag = True
                continue
            elif token['type'] == 'EndTag' and remove_end_tag:
                remove_end_tag = False
                continue
        yield token
def __iter__(self): input = html5lib_Filter.__iter__(self) for token in input: if token["type"] == "StartTag" and token["name"] == "pre": attrs = dict(token["data"]) or {(None, "class"): ""} for (namespace, name), value in attrs.copy().items(): if name == "class" and "notranslate" not in value.split(): before = attrs.get((namespace, "class")) or "" after = f"{before} notranslate".strip() attrs[(namespace, "class")] = after token["data"] = attrs yield token
def __iter__(self): for token in html5lib_Filter.__iter__(self): if token["type"] == "StartTag" and token["name"] == "pre": attrs = dict(token["data"]) for (namespace, name), value in attrs.copy().items(): if name == "function" and value: m = MT_SYNTAX_RE.match(value) if m: lang = m.group(1).lower() brush = MT_SYNTAX_BRUSH_MAP.get(lang, lang) attrs[(namespace, "class")] = "brush: %s" % brush del attrs[(None, "function")] token["data"] = attrs yield token
def __iter__(self):
    for token in html5lib_Filter.__iter__(self):
        if token['type'] == 'StartTag' and token['name'] == 'pre':
            attrs = dict(token['data'])
            for (namespace, name), value in attrs.copy().items():
                if name == 'function' and value:
                    m = MT_SYNTAX_RE.match(value)
                    if m:
                        lang = m.group(1).lower()
                        brush = MT_SYNTAX_BRUSH_MAP.get(lang, lang)
                        attrs[(namespace, u'class')] = "brush: %s" % brush
                        del attrs[(None, 'function')]
                        token['data'] = attrs
        yield token
def __iter__(self):
    in_link = False
    for token in Filter.__iter__(self):
        if token.get('name') == 'a':
            if token['type'] == 'StartTag':
                in_link = True
                data = token['data'][(None, 'href')]
                data = data.replace('mailto:', '')
                yield {'type': 'Characters', 'data': data}
            elif token['type'] == 'EndTag':
                in_link = False
        elif token['type'] == 'Characters':
            if not in_link:
                yield token
        else:
            yield token
def __iter__(self): in_iframe = False for token in html5lib_Filter.__iter__(self): if token["type"] == "StartTag" and token["name"] == "iframe": in_iframe = True attrs = dict(token["data"]) for (namespace, name), value in attrs.items(): if name == "src" and value: if not self.validate_src(value): attrs[(namespace, "src")] = "" token["data"] = attrs yield token if token["type"] == "EndTag" and token["name"] == "iframe": in_iframe = False if not in_iframe: yield token
def __iter__(self):
    in_iframe = False
    for token in html5lib_Filter.__iter__(self):
        if token['type'] == 'StartTag' and token['name'] == 'iframe':
            in_iframe = True
            attrs = dict(token['data'])
            for (namespace, name), value in attrs.items():
                if name == 'src' and value:
                    if not self.validate_src(value):
                        attrs[(namespace, 'src')] = ''
            token['data'] = attrs
            yield token
        if token['type'] == 'EndTag' and token['name'] == 'iframe':
            in_iframe = False
        if not in_iframe:
            yield token
def __iter__(self):
    remove_end_tag = False
    for token in Filter.__iter__(self):
        # only check anchor tags
        if ('name' in token and token['name'] == 'a' and
                token['type'] in ['StartTag', 'EndTag']):
            if token['type'] == 'StartTag':
                remove_end_tag = True
                for attr, value in token['data'].items():
                    if (attr == (None, 'href') and value != '' and
                            is_valid_url(value)):
                        remove_end_tag = False
                if remove_end_tag:
                    continue
            elif token['type'] == 'EndTag' and remove_end_tag:
                remove_end_tag = False
                continue
        yield token
def __iter__(self):
    in_iframe = False
    for token in html5lib_Filter.__iter__(self):
        if token['type'] == 'StartTag' and token['name'] == 'iframe':
            in_iframe = True
            attrs = dict(token['data'])
            for (namespace, name), value in attrs.items():
                if name == 'src' and value:
                    if not re.search(self.hosts, value):
                        attrs[(namespace, 'src')] = ''
            token['data'] = attrs
            yield token
        if token['type'] == 'EndTag' and token['name'] == 'iframe':
            in_iframe = False
        if not in_iframe:
            yield token
def __iter__(self):
    html_ns = namespaces['html']
    nest = 0
    for token in BaseFilter.__iter__(self):
        token_type = token['type']
        # Drop <param> when inside <object>. We don't handle nesting
        # properly, but they're not valid anywhere else so that's not
        # a problem.
        if (nest >= 1 and token_type == 'EmptyTag' and
                token['name'] == 'param' and token['namespace'] == html_ns):
            continue
        if (token_type == 'EndTag' and token['name'] == 'object' and
                token['namespace'] == html_ns):
            nest -= 1
            continue
        if (token_type == 'StartTag' and token['name'] == 'object' and
                token['namespace'] == html_ns):
            nest += 1
            continue
        yield token
def __iter__(self):
    elide = 0
    elide_ns = None
    elide_name = None
    for token in BaseFilter.__iter__(self):
        token_type = token['type']
        if elide:
            if (token_type == 'EndTag' and token['name'] == elide_name and
                    token['namespace'] == elide_ns):
                elide -= 1
            if (token_type == 'StartTag' and token['name'] == elide_name and
                    token['namespace'] == elide_ns):
                elide += 1
            continue  # Drop the token
        else:
            if token_type == 'StartTag':
                if (token['namespace'], token['name']) in self._elide_tags:
                    elide += 1
                    elide_name = token['name']
                    elide_ns = token['namespace']
                    continue  # Drop this token.
            yield token
def __iter__(self):
    input = html5lib_Filter.__iter__(self)
    for token in input:
        yield token
        if (token['type'] == 'StartTag' and
                token['name'] in SECTION_TAGS):
            attrs = dict(token['data'])
            for (namespace, name), value in attrs.items():
                if name == 'id' and value:
                    ts = ({'type': 'StartTag',
                           'name': 'a',
                           'data': {
                               (None, u'title'): ugettext('Edit section'),
                               (None, u'class'): 'edit-section',
                               (None, u'data-section-id'): value,
                               (None, u'data-section-src-url'): u'%s?%s' % (
                                   reverse('wiki.document',
                                           args=[self.slug],
                                           locale=self.locale),
                                   urlencode({'section': value.encode('utf-8'),
                                              'raw': 'true'})
                               ),
                               (None, u'href'): u'%s?%s' % (
                                   reverse('wiki.edit',
                                           args=[self.slug],
                                           locale=self.locale),
                                   urlencode({'section': value.encode('utf-8'),
                                              'edit_links': 'true'})
                               )
                           }},
                          {'type': 'Characters', 'data': ugettext(u'Edit')},
                          {'type': 'EndTag', 'name': 'a'})
                    for t in ts:
                        yield t
def __init__(self, source, hosts):
    html5lib_Filter.__init__(self, source)
    self.hosts = hosts
def __iter__(self):
    for token in Filter.__iter__(self):
        if token['type'] in ['StartTag', 'EndTag']:
            if token['name'] in ['h1', 'h2', 'h3']:
                token['name'] = 'h4'
        yield token
def __init__(self, source, patterns):
    html5lib_Filter.__init__(self, source)
    self.allowed_src_patterns = patterns
def __init__(self, source, base_url):
    html5lib_Filter.__init__(self, source)
    self.base_url = base_url
    self.base_url_parsed = urlparse(base_url)
def __iter__(self):
    from kuma.wiki.models import Document

    input = html5lib_Filter.__iter__(self)

    # Pass #1: Gather all the link URLs and prepare annotations
    links = {}
    buffer = []
    for token in input:
        buffer.append(token)
        if token['type'] == 'StartTag' and token['name'] == 'a':
            for (namespace, name), value in token['data'].items():
                if name == 'href':
                    href = value
                    href_parsed = urlparse(href)
                    if href_parsed.netloc == self.base_url_parsed.netloc:
                        # Squash site-absolute URLs to site-relative paths.
                        href = href_parsed.path

                    # Prepare annotations record for this path.
                    links[href] = {'classes': [], 'rel': []}

    needs_existence_check = defaultdict(lambda: defaultdict(set))

    # Run through all the links and check for annotatable conditions.
    for href in links.keys():

        # Is this an external URL?
        is_external = False
        for prefix in self.EXTERNAL_PREFIXES:
            if href.startswith(prefix):
                is_external = True
                break
        if is_external:
            links[href]['classes'].append('external')
            # https://mathiasbynens.github.io/rel-noopener/
            links[href]['rel'].append('noopener')
            continue

        # TODO: Should this also check for old-school mindtouch URLs? Or
        # should we encourage editors to convert to new-style URLs to take
        # advantage of link annotation? (I'd say the latter)

        # Is this a kuma doc URL?
        if '/docs/' in href:

            # Check if this is a special docs path that's exempt from "new"
            skip = False
            for path in DOC_SPECIAL_PATHS:
                if '/docs/%s' % path in href:
                    skip = True
            if skip:
                continue

            href_locale, href_path = href.split(u'/docs/', 1)
            if href_locale.startswith(u'/'):
                href_locale = href_locale[1:]

            if '#' in href_path:
                # If present, discard the hash anchor
                href_path, _, _ = href_path.partition('#')

            # Handle any URL-encoded UTF-8 characters in the path
            href_path = href_path.encode('utf-8', 'ignore')
            href_path = unquote(href_path)
            href_path = href_path.decode('utf-8', 'ignore')

            # Try to sort out the locale and slug through some of our
            # redirection logic.
            locale, slug, needs_redirect = (
                locale_and_slug_from_path(href_path,
                                          path_locale=href_locale))

            # Gather up this link for existence check
            needs_existence_check[locale.lower()][slug.lower()].add(href)

    # Perform existence checks for all the links, using one DB query per
    # locale for all the candidate slugs.
    for locale, slug_hrefs in needs_existence_check.items():

        existing_slugs = (Document.objects
                                  .filter(locale=locale,
                                          slug__in=slug_hrefs.keys())
                                  .values_list('slug', flat=True))

        # Remove the slugs that pass existence check.
        check_collation = False
        for slug in existing_slugs:
            lslug = slug.lower()
            try:
                del slug_hrefs[lslug]
            except KeyError:
                # Same slug by MySQL collation rules
                check_collation = True

        # Some slugs are matched by collation rules, so use single checks
        if check_collation:
            to_delete = set()
            for slug in slug_hrefs.keys():
                match = Document.objects.filter(locale=locale, slug=slug)
                if match.exists():
                    to_delete.add(slug)
            for slug in to_delete:
                del slug_hrefs[slug]

        # Mark all the links whose slugs did not come back from the DB
        # query as "new"
        for slug, hrefs in slug_hrefs.items():
            for href in hrefs:
                links[href]['classes'].append('new')
                links[href]['rel'].append('nofollow')

    # Pass #2: Filter the content, annotating links
    for token in buffer:

        if token['type'] == 'StartTag' and token['name'] == 'a':
            attrs = dict(token['data'])
            names = [name for (namespace, name) in attrs.keys()]

            for (namespace, name), value in attrs.copy().items():
                if name == 'href':
                    href = value
                    href_parsed = urlparse(value)
                    if href_parsed.netloc == self.base_url_parsed.netloc:
                        # Squash site-absolute URLs to site-relative paths.
                        href = href_parsed.path

                    # Update attributes on this link element.
                    def add_to_attr(attr_name, add_list):
                        """Add values to the attribute dictionary."""
                        if attr_name in names:
                            values = set(
                                attrs[(namespace, attr_name)].split(u' '))
                        else:
                            values = set()
                        values.update(add_list)
                        if values:
                            attrs[(namespace, attr_name)] = (
                                u' '.join(sorted(values)))

                    add_to_attr(u'class', links[href]['classes'])
                    add_to_attr(u'rel', links[href]['rel'])

            token['data'] = attrs

        yield token
def __init__(self, source):
    html5lib_Filter.__init__(self, source)
    self.id_cnt = 0
    self.known_ids = set()
def __iter__(self):
    input = html5lib_Filter.__iter__(self)

    # First, collect all ID values already in the source HTML.
    buffer = []
    for token in input:
        buffer.append(token)
        if token['type'] == 'StartTag':
            attrs = dict(token['data'])
            for (namespace, name), value in attrs.items():
                # Collect both 'name' and 'id' attributes since
                # 'name' gets treated as a manual override to
                # specify an ID.
                if name == 'id' and token['name'] not in HEAD_TAGS:
                    self.known_ids.add(value)
                if name == 'name':
                    self.known_ids.add(value)

    # Then walk the tree again identifying elements in need of IDs
    # and adding them.
    while len(buffer):
        token = buffer.pop(0)

        if not (token['type'] == 'StartTag' and
                token['name'] in SECTION_TAGS):
            # If this token isn't the start tag of a section or
            # header, we don't add an ID and just short-circuit
            # out to return the token as-is.
            yield token
        else:
            # Potential bug warning: there may not be any
            # attributes, so doing a for loop over them to look
            # for existing ID/name values is unsafe. Instead we
            # dict-ify the attrs, and then check directly for the
            # things we care about instead of iterating all
            # attributes and waiting for one we care about to show
            # up.
            attrs = dict(token['data'])

            # First check for a 'name' attribute; if it's present,
            # treat it as a manual override by the author and make
            # that value be the ID.
            if (None, 'name') in attrs:
                attrs[(None, u'id')] = attrs[(None, 'name')]
                token['data'] = attrs
                yield token
                continue

            # Next look for <section> tags which don't have an ID
            # set; since we don't generate an ID for them from
            # their text contents, they just get a numeric one
            # from gen_id().
            if token['name'] not in HEAD_TAGS:
                if (None, 'id') not in attrs:
                    attrs[(None, u'id')] = self.gen_id()
                    token['data'] = attrs
                yield token
                continue

            # If we got here, we're looking at the start tag of a
            # header which had no 'name' attribute set. We're
            # going to pop out the text contents of the header,
            # use them to generate a slugified ID for it, and
            # return it with that ID added in.
            buffer, header_tokens = self.process_header(token, buffer)
            for t in header_tokens:
                yield t
def __init__(self, source, slug, locale):
    html5lib_Filter.__init__(self, source)
    self.slug = slug
    self.locale = locale
def __iter__(self):
    input = html5lib_Filter.__iter__(self)

    self.skip_header = False

    for token in input:
        if token['type'] == 'StartTag' and token['name'] in HEAD_TAGS_TOC:
            level_match = LEVEL_RE.match(token['name'])
            level = int(level_match.group(1))
            if level > self.max_level:
                self.skip_header = True
                continue
            self.in_header = True
            out = []
            if level > self.level:
                diff = level - self.level
                for i in range(diff):
                    if (not self.in_hierarchy and i % 2 == 0):
                        out.append({'type': 'StartTag',
                                    'name': 'li',
                                    'data': {}})
                    out.append({'type': 'StartTag',
                                'name': 'ol',
                                'data': {}})
                    if (diff > 1 and i % 2 == 0 and i != diff - 1):
                        out.append({'type': 'StartTag',
                                    'name': 'li',
                                    'data': {}})
                    self.open_level += 1
                self.level = level
            elif level < self.level:
                diff = self.level - level
                for i in range(diff):
                    out.extend([{'type': 'EndTag', 'name': 'ol'},
                                {'type': 'EndTag', 'name': 'li'}])
                    self.open_level -= 1
                self.level = level
            attrs = dict(token['data'])
            id = attrs.get((None, 'id'), None)
            if id:
                out.extend([
                    {'type': 'StartTag', 'name': 'li', 'data': {}},
                    {'type': 'StartTag', 'name': 'a',
                     'data': {(None, u'rel'): 'internal',
                              (None, u'href'): '#%s' % id}},
                ])
                self.in_hierarchy = True
                for t in out:
                    yield t
        elif (token['type'] == 'StartTag' and
              token['name'] in TAGS_IN_TOC and
              self.in_header and
              not self.skip_header):
            yield token
        elif (token['type'] in ("Characters", "SpaceCharacters") and
              self.in_header):
            yield token
        elif (token['type'] == 'EndTag' and
              token['name'] in TAGS_IN_TOC and
              self.in_header):
            yield token
        elif (token['type'] == 'EndTag' and
              token['name'] in HEAD_TAGS_TOC):
            level_match = LEVEL_RE.match(token['name'])
            level = int(level_match.group(1))
            if level > self.max_level:
                self.skip_header = False
                continue
            self.in_header = False
            yield {'type': 'EndTag', 'name': 'a'}

    if self.open_level > 0:
        out = []
        for i in range(self.open_level):
            out.extend([{'type': 'EndTag', 'name': 'ol'},
                        {'type': 'EndTag', 'name': 'li'}])
        for t in out:
            yield t
def __iter__(self):
    input = html5lib_Filter.__iter__(self)

    for token in input:

        # Section start was deferred, so start it now.
        if self.next_in_section:
            self.next_in_section = False
            self.in_section = True

        if token['type'] == 'StartTag':
            attrs = dict(token['data'])
            self.open_level += 1

            # Have we encountered the section or heading element we're
            # looking for?
            if self.section_id in attrs.values():

                # If we encounter a section element that matches the ID,
                # then we'll want to scoop up all its children as an
                # explicit section.
                if self.parent_level is None and self._isSection(token):
                    self.parent_level = self.open_level
                    # Defer the start of the section, so the section parent
                    # itself isn't included.
                    self.next_in_section = True

                # If we encounter a heading element that matches the ID, we
                # start an implicit section.
                elif self.heading is None and self._isHeading(token):
                    self.heading = token
                    self.heading_rank = self._getHeadingRank(token)
                    self.parent_level = self.open_level - 1
                    self.in_section = True

            # If started an implicit section, these rules apply to
            # siblings...
            elif (self.heading is not None and
                    self.open_level - 1 == self.parent_level):

                # The implicit section should stop if we hit another
                # sibling heading whose rank is equal or higher, since that
                # starts a new implicit section
                if (self._isHeading(token) and
                        self._getHeadingRank(token) <= self.heading_rank):
                    self.in_section = False

            # If this is the first heading of the section and we want to
            # omit it, note that we've found it
            is_first_heading = (
                self.in_section and
                self.ignore_heading and
                not self.already_ignored_header and
                not self.heading_to_ignore and
                self._isHeading(token))
            if is_first_heading:
                self.heading_to_ignore = token

        elif token['type'] == 'EndTag':
            self.open_level -= 1

            # If the parent of the section has ended, end the section.
            # This applies to both implicit and explicit sections.
            if (self.parent_level is not None and
                    self.open_level < self.parent_level):
                self.in_section = False

        # If there's no replacement source, then this is a section
        # extraction. So, emit tokens while we're in the section, as long
        # as we're also not in the process of ignoring a heading
        if not self.replace_source:
            if self.in_section and not self.heading_to_ignore:
                yield token

        # If there is a replacement source, then this is a section
        # replacement. Emit tokens of the source stream until we're in the
        # section, then emit the replacement stream and ignore the rest of
        # the source stream for the section. Note that an ignored heading
        # is *not* replaced.
        else:
            if not self.in_section or self.heading_to_ignore:
                yield token
            elif not self.replacement_emitted:
                for r_token in self.replace_source:
                    yield r_token
                self.replacement_emitted = True

        # If this looks like the end of a heading we were ignoring, clear
        # the ignoring condition.
        if (token['type'] == 'EndTag' and
                self.in_section and
                self.ignore_heading and
                not self.already_ignored_header and
                self.heading_to_ignore and
                self._isHeading(token) and
                token['name'] == self.heading_to_ignore['name']):
            self.heading_to_ignore = None
            self.already_ignored_header = True
def __iter__(self):
    for token in html5lib_Filter.__iter__(self):
        if 'SpaceCharacters' == token['type']:
            continue
        yield token
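# Composition sketch (assumption, not from the original sources): every filter
# above wraps an iterable of tokens and is itself iterable, so several of them
# can be stacked in front of the serializer. The class names below are
# hypothetical stand-ins whose bodies mirror two of the filters shown earlier
# (the SpaceCharacters stripper and the h1-h3 -> h4 demotion).
import html5lib
from html5lib.filters.base import Filter
from html5lib.serializer import HTMLSerializer


class SpaceCharactersFilter(Filter):
    """Drop pure-whitespace text tokens, like the last filter above."""
    def __iter__(self):
        for token in Filter.__iter__(self):
            if token['type'] == 'SpaceCharacters':
                continue
            yield token


class HeadingDemotionFilter(Filter):
    """Demote h1-h3 headings to h4, like the heading filter above."""
    def __iter__(self):
        for token in Filter.__iter__(self):
            if (token['type'] in ('StartTag', 'EndTag') and
                    token['name'] in ('h1', 'h2', 'h3')):
                token['name'] = 'h4'
            yield token


def normalize(html_text):
    """Chain the two filters over one token stream and re-serialize."""
    dom = html5lib.parseFragment(html_text, treebuilder='etree')
    stream = html5lib.getTreeWalker('etree')(dom)
    stream = SpaceCharactersFilter(stream)
    stream = HeadingDemotionFilter(stream)
    return HTMLSerializer(omit_optional_tags=False).render(stream)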