def _insert_error(el, error, error_class, error_creator):
    """Mark ``el`` (and its labels) as erroneous and emit the error message.

    ``error_class`` is added to ``el`` unless ``el`` is a ``<form>`` or the
    class is empty.  When ``el`` has an ``id``, every label returned by
    ``_label_for_xpath`` for that id receives ``error_class`` as well.
    Finally ``error_creator(el, is_block, error)`` is called, where
    ``is_block`` is false for empty tags and textareas.
    """
    tag_name = _nons(el.tag)
    # Empty tags and textareas are not treated as block containers.
    is_block = not (tag_name in defs.empty_tags or tag_name == 'textarea')

    if error_class and tag_name != 'form':
        _add_class(el, error_class)

    element_id = el.get('id')
    if element_id:
        labels = _label_for_xpath(el, for_id=element_id)
        if labels:
            for label in labels:
                _add_class(label, error_class)

    error_creator(el, is_block, error)
def form_values(self):
    """
    Return a list of ``(name, value)`` tuples for the fields of the form.

    Unnamed fields, unchecked checkable inputs, and ``image``/``reset``
    inputs are skipped.  A multiple select contributes one tuple per
    selected value.  The result is suitable to be passed to
    ``urllib.urlencode()``.
    """
    pairs = []
    for field in self.inputs:
        key = field.name
        if not key:
            continue
        kind = _nons(field.tag)
        if kind == 'textarea':
            pairs.append((key, field.value))
            continue
        if kind == 'select':
            selected = field.value
            if field.multiple:
                pairs.extend((key, item) for item in selected)
            elif selected is not None:
                pairs.append((key, selected))
            continue
        assert kind == 'input', (
            "Unexpected tag: %r" % field)
        if field.checkable and not field.checked:
            continue
        if field.type in ('image', 'reset'):
            continue
        current = field.value
        if current is not None:
            pairs.append((key, current))
    return pairs
def wrap_text(doc, element='p'):
    """Make sure there is no unwrapped text at the top level.

    Any bare text found directly under ``doc`` (its ``text`` or the tail of
    one of its children) is moved into a new ``element`` wrapper (``<p>`` by
    default).  Inline elements that directly follow such a freshly created
    wrapper are moved into it.  Wrappers are tagged with ``MARKER`` while
    the function runs; the marker is removed before returning.
    """
    def make_wrapper(content):
        wrapper = etree.Element(element, {MARKER: ''})
        wrapper.text = content
        return wrapper

    def fix_next():
        # Apply at most one change; the caller rescans from the start
        # because the child list has been modified.
        for index, child in enumerate(doc):
            if (html._nons(child.tag) in INLINE_TAGS and index
                    and MARKER in doc[index - 1].attrib):
                doc[index - 1].append(child)
                return True
            if not is_whitespace(child.tail):
                doc.insert(index + 1, make_wrapper(child.tail))
                child.tail = None
                return True
        return False

    if doc.text:
        doc.insert(0, make_wrapper(doc.text))
        doc.text = None

    while fix_next():
        pass

    for child in doc:
        if MARKER in child.attrib:
            del child.attrib[MARKER]
def form_values(self):
    """
    Collect the submittable field values of the form.

    Returns a list of ``(name, value)`` tuples, suitable to be passed to
    ``urllib.urlencode()``.  Fields without a name, unchecked checkable
    inputs, and inputs of type ``image`` or ``reset`` are left out.
    """
    collected = []
    add = collected.append
    for control in self.inputs:
        field_name = control.name
        if not field_name:
            continue
        tag_name = _nons(control.tag)
        if tag_name == 'textarea':
            add((field_name, control.value))
        elif tag_name == 'select':
            chosen = control.value
            if control.multiple:
                for single in chosen:
                    add((field_name, single))
            elif chosen is not None:
                add((field_name, chosen))
        else:
            assert tag_name == 'input', ("Unexpected tag: %r" % control)
            skip = control.checkable and not control.checked
            if skip or control.type in ('image', 'reset'):
                continue
            chosen = control.value
            if chosen is not None:
                add((field_name, chosen))
    return collected
def sanitize(input, cleaner=DocumentCleaner, wrap='p'):
    """Cleanup markup using a given cleanup configuration.

    The markup is embedded in an ``<html><body>`` skeleton, parsed, and the
    body is run through ``cleaner.clean_html``; empty tags and outer breaks
    are then stripped and unwrapped text is wrapped in a ``wrap`` element.

    Note that ``"body"`` is appended to ``cleaner.allow_tags`` when missing,
    so the cleaner object passed in is modified.

    Raises ``KeyError`` when ``wrap`` is not in ``html.defs.tags``.
    Returns the serialised children of the cleaned body as one unicode
    string.
    """
    allowed = cleaner.allow_tags
    if "body" not in allowed:
        allowed.append("body")

    markup = u"<html><body>%s</body></html>" % input
    document = html.document_fromstring(markup)
    body = [child for child in document
            if html._nons(child.tag) == "body"][0]

    cleaned = cleaner.clean_html(body)
    RemoveEmptyTags(cleaned)
    StripOuterBreaks(cleaned)

    if wrap not in html.defs.tags:
        raise KeyError(
            "Invalid html tag provided for wrapping the sanitized text")
    WrapText(cleaned, wrap)

    pieces = [etree.tostring(child, encoding=unicode)
              for child in cleaned.iterchildren()]
    return u"".join(pieces)
def sanitize(input, cleaner=DocumentCleaner, wrap='p'):
    """Cleanup markup using a given cleanup configuration.

    The markup is embedded in an ``<html><body>`` skeleton, parsed, and the
    body is run through ``cleaner.clean_html``; empty tags and outer breaks
    are then removed.  Unless ``wrap`` is ``None``, unwrapped text is
    wrapped in a ``wrap`` element.

    Note that ``'body'`` is appended to ``cleaner.allow_tags`` when missing,
    so the cleaner object passed in is modified.

    Raises ``ValueError`` when ``wrap`` is neither ``None`` nor a tag listed
    in ``html.defs.tags``.  Returns the serialised content of the cleaned
    body as text; with ``wrap=None`` any leading bare text is kept in front.
    """
    allowed = cleaner.allow_tags
    if 'body' not in allowed:
        allowed.append('body')

    markup = six.u("<html><body>%s</body></html>") % input
    document = html.document_fromstring(markup)
    body = [child for child in document
            if html._nons(child.tag) == 'body'][0]

    cleaned = cleaner.clean_html(body)
    remove_empty_tags(cleaned)
    strip_outer_breaks(cleaned)

    if wrap is not None:
        if wrap not in html.defs.tags:
            raise ValueError(
                'Invalid html tag provided for wrapping the sanitized text')
        wrap_text(cleaned, wrap)

    pieces = [etree.tostring(child, encoding=six.text_type)
              for child in cleaned.iterchildren()]
    serialized = six.u('').join(pieces)
    if wrap is None and cleaned.text:
        # Without wrapping, leading bare text stays on the body itself.
        return cleaned.text + serialized
    return serialized
def _fill_multiple(input, value):
    """Apply ``value`` (a collection of submitted values) to a multi-valued control.

    Checkboxes and radio buttons are checked when their ``value`` attribute
    is contained in ``value``; a checkbox without a ``value`` attribute is
    checked based on the first submitted value.  Anything else must be a
    ``<select>``, whose options are selected when their value (or, lacking
    a ``value`` attribute, their text content) is contained in ``value``.
    """
    kind = input.get('type', '').lower()

    if kind == 'radio':
        _check(input, input.get('value') in value)
        return

    if kind == 'checkbox':
        own_value = input.get('value')
        if own_value is not None:
            _check(input, own_value in value)
            return
        if not value:
            state = False
        else:
            state = value[0]
            # NOTE(review): this tests ``value``, not ``value[0]``; it only
            # fires when ``value`` itself is a string -- confirm intent.
            if isinstance(value, basestring):
                # The only valid "on" value for an unnamed checkbox is 'on'
                state = state == 'on'
        _check(input, state)
        return

    assert _nons(input.tag) == 'select'
    for option in _options_xpath(input):
        option_value = option.get('value')
        if option_value is None:
            # This seems to be the default, at least on IE
            # FIXME: but I'm not sure
            option_value = option.text_content()
        _select(option, option_value in value)
def sanitize_html(input, cleaner=DocumentCleaner, wrap='p'):
    """Clean up markup using a given cleanup configuration.

    The markup is embedded in an ``<html><body>`` skeleton, parsed, and the
    body is run through ``cleaner.clean_html``.  Empty tags (additionally
    treating ``tr``, ``th``, ``td`` and ``iframe`` as removable when empty)
    and outer breaks are stripped.  Unless ``wrap`` is ``None``, unwrapped
    text is wrapped in a ``wrap`` element.

    Note that ``'body'`` is appended to ``cleaner.allow_tags`` when missing,
    so the cleaner object passed in is modified.

    Raises ``ValueError`` when ``wrap`` is neither ``None`` nor a tag listed
    in ``html.defs.tags``.  Returns the serialised content of the cleaned
    body as text; with ``wrap=None`` any leading bare text is kept in front.
    """
    allowed = cleaner.allow_tags
    if 'body' not in allowed:
        allowed.append('body')

    markup = u"<html><body>%s</body></html>" % input
    document = html.document_fromstring(markup)
    body = [child for child in document
            if html._nons(child.tag) == 'body'][0]

    cleaned = cleaner.clean_html(body)
    utils.remove_empty_tags(
        cleaned, extra_empty_tags=['tr', 'th', 'td', 'iframe'])
    utils.strip_outer_breaks(cleaned)

    if wrap is not None:
        if wrap not in html.defs.tags:
            raise ValueError(
                'Invalid html tag provided for wrapping the sanitized text')
        _wrap_text(cleaned, wrap)

    pieces = [etree.tostring(child, encoding='unicode')
              for child in cleaned.iterchildren()]
    serialized = u''.join(pieces)
    if wrap is None and cleaned.text:
        # Without wrapping, leading bare text stays on the body itself.
        return cleaned.text + serialized
    return serialized
def form_values(self):
    '''
    overrides the standard form_values function to include
    the values of "submit" buttons

    Returns a list of ``(name, value)`` tuples.  Only unnamed fields,
    unchecked checkable inputs and ``image``/``reset`` inputs are skipped.
    '''
    pairs = []
    for field in self.inputs:
        key = field.name
        if not key:
            continue
        kind = html._nons(field.tag)
        if kind == 'textarea':
            pairs.append((key, field.value))
            continue
        if kind == 'select':
            selected = field.value
            if field.multiple:
                pairs.extend((key, item) for item in selected)
            elif selected is not None:
                pairs.append((key, selected))
            continue
        assert kind == 'input', (
            "Unexpected tag: %r" % field)
        if field.checkable and not field.checked:
            continue
        if field.type in ('image', 'reset'):
            continue
        current = field.value
        if current is not None:
            pairs.append((key, current))
    return pairs
def wrap_text(doc, element='p'):
    """Make sure there is no unwrapped text at the top level.

    Bare text directly under ``doc`` is wrapped in a new ``element``
    (``<p>`` by default); inline elements immediately following such a new
    wrapper are pulled into it.  ``MARKER`` attributes identify the new
    wrappers during processing and are stripped again at the end.
    """
    def new_paragraph(content):
        paragraph = etree.Element(element, {MARKER: ''})
        paragraph.text = content
        return paragraph

    if doc.text:
        doc.insert(0, new_paragraph(doc.text))
        doc.text = None

    changed = True
    while changed:
        # Each pass performs at most one modification and then rescans,
        # since the child list was changed under the iteration.
        changed = False
        for position, node in enumerate(doc):
            follows_wrapper = bool(position) and (
                MARKER in doc[position - 1].attrib)
            if html._nons(node.tag) in INLINE_TAGS and follows_wrapper:
                doc[position - 1].append(node)
                changed = True
                break
            if not is_whitespace(node.tail):
                doc.insert(position + 1, new_paragraph(node.tail))
                node.tail = None
                changed = True
                break

    for node in doc:
        if MARKER in node.attrib:
            del node.attrib[MARKER]
def _takes_multiple(input):
    """Return True when the control can hold several values.

    That is the case for a ``<select>`` with a non-empty ``multiple``
    attribute and for inputs of type ``radio`` or ``checkbox``.
    """
    # FIXME: multiple="0"?
    if _nons(input.tag) == 'select' and input.get('multiple'):
        return True
    return input.get('type', '').lower() in ('radio', 'checkbox')
def _takes_multiple(input):
    """Tell whether ``input`` accepts more than one value.

    True for a ``<select>`` carrying a non-empty ``multiple`` attribute and
    for ``radio``/``checkbox`` inputs; False otherwise.
    """
    is_multi_select = (
        _nons(input.tag) == "select" and input.get("multiple")
    )  # FIXME: multiple="0"?
    if is_multi_select:
        return True
    kind = input.get("type", "").lower()
    return kind in ("radio", "checkbox")
def _wrap_text(doc, element='p'):
    """
    Make sure there is no unwrapped text at the top level. Any bare text
    found is wrapped in a `<p>` element (or the alternative element passed
    in as ``element``).

    In addition to what htmllaundry does, bare inline tags get wrapped as
    well, so that no `<em>`, `<strong>`, etc. tags float around outside of
    a paragraph.  Wrappers carry a ``MARKER`` attribute while the function
    runs; it is removed before returning.
    """
    def text_wrapper(content):
        wrapper = etree.Element(element, {MARKER: ''})
        wrapper.text = content
        return wrapper

    def element_wrapper(child):
        wrapper = etree.Element(element, {MARKER: ''})
        wrapper.insert(0, child)
        return wrapper

    def fix_next():
        # Perform at most one modification, then let the caller rescan.
        for index, child in enumerate(doc):
            if html._nons(child.tag) in INLINE_TAGS:
                if index and MARKER in doc[index - 1].attrib:
                    # Continue the wrapper created just before this node.
                    doc[index - 1].append(child)
                else:
                    doc.insert(index, element_wrapper(child))
                return True
            if not utils.is_whitespace(child.tail):
                doc.insert(index + 1, text_wrapper(child.tail))
                child.tail = None
                return True
        return False

    if doc.text:
        doc.insert(0, text_wrapper(doc.text))
        doc.text = None

    while fix_next():
        pass

    for child in doc:
        if MARKER in child.attrib:
            del child.attrib[MARKER]
def __handle(self, el):
    """Find every link carried by ``el`` and pass it to ``self.handle``.

    ``self.handle`` is called as ``handle(element, attribute, url, pos)``
    where ``attribute`` is ``None`` for links found in the text of a
    ``<style>`` element and ``pos`` is the offset of the url inside the
    attribute value or text.  Several links inside one value are reported
    in reversed order so in-place modifications keep earlier offsets valid.
    """
    attributes = el.attrib
    tag_name = _nons(el.tag)

    if tag_name == 'object':
        base = None
        if 'codebase' in attributes:
            base = el.get('codebase')
            self.handle(el, 'codebase', base, 0)
        for name in ('classid', 'data'):
            if name not in attributes:
                continue
            target = el.get(name)
            if base is not None:
                target = urljoin(base, target)
            self.handle(el, name, target, 0)
        if 'archive' in attributes:
            for found in _archive_re.finditer(el.get('archive')):
                target = found.group(0)
                if base is not None:
                    target = urljoin(base, target)
                self.handle(el, 'archive', target, found.start())
    else:
        for name in link_attrs:
            if name in attributes:
                self.handle(el, name, attributes[name], 0)
        for name in list_link_attrs:
            if name not in attributes:
                continue
            # return in reversed order to simplify in-place modifications
            for found in reversed(list(_iter_srcset_urls(attributes[name]))):
                link, offset = _unquote_match(
                    found.group(1).strip(), found.start(1))
                self.handle(el, name, link, offset)

    if tag_name == 'meta':
        if attributes.get('http-equiv', '').lower() == 'refresh':
            content = attributes.get('content', '')
            found = _parse_meta_refresh_url(content)
            # unexpected content means the redirect won't work, but we might
            # as well be permissive and return the entire string.
            link = (found.group('url') if found else content).strip()
            if link:
                offset = found.start('url') if found else content.find(link)
                link, offset = _unquote_match(link, offset)
                self.handle(el, 'content', link, offset)
    elif tag_name == 'param':
        if (el.get('valuetype') or '').lower() == 'ref':
            self.handle(el, 'value', el.get('value'), 0)
    elif tag_name == 'style' and el.text:
        # (start_pos, url) pairs from both url() references and @imports
        located = [
            _unquote_match(found.group(1), found.start(1))[::-1]
            for found in _iter_css_urls(el.text)
        ] + [
            (found.start(1), found.group(1))
            for found in _iter_css_imports(el.text)
        ]
        # sort by start pos to bring both match sets back into order
        # and reverse the list to report correct positions despite
        # modifications
        for offset, link in sorted(located, reverse=True):
            self.handle(el, None, link, offset)

    if 'style' in attributes:
        # return in reversed order to simplify in-place modifications
        for found in reversed(list(_iter_css_urls(attributes['style']))):
            link, offset = _unquote_match(found.group(1), found.start(1))
            self.handle(el, 'style', link, offset)
def _fill_single(input, value):
    """Store ``value`` in a single-valued control.

    A ``<textarea>`` gets the value as its text; any other element gets it
    as its ``value`` attribute.
    """
    if _nons(input.tag) != 'textarea':
        input.set('value', value)
        return
    input.text = value
def links(el):
    """Yield ``(element, attribute, url, pos)`` for every link on ``el``.

    ``attribute`` is ``None`` for links found in the text of a ``<style>``
    element; ``pos`` is the offset of the url inside the attribute value or
    text.  Several links inside one value are yielded in reversed order so
    in-place modifications keep the earlier offsets valid.
    """
    attributes = el.attrib
    tag_name = _nons(el.tag)

    if tag_name == 'object':  # pragma: no cover
        base = None
        if 'codebase' in attributes:
            base = el.get('codebase')
            yield el, 'codebase', base, 0
        for name in ('classid', 'data'):
            if name not in attributes:
                continue
            target = el.get(name)
            if base is not None:
                target = urljoin(base, target)
            yield el, name, target, 0
        if 'archive' in attributes:
            for found in _archive_re.finditer(el.get('archive')):
                target = found.group(0)
                if base is not None:
                    target = urljoin(base, target)
                yield el, 'archive', target, found.start()
    else:
        for name in link_attrs:
            if name in attributes:
                yield el, name, attributes[name], 0
        # XXX Patch for src-set url detection
        for name in srcset_attrs:
            if name not in attributes:
                continue
            # yield in reversed order to simplify in-place modifications
            for found in reversed(list(_iter_srcset_urls(attributes[name]))):
                link, offset = unquote_match(
                    found.group(1).strip(), found.start(1))
                yield el, name, link, offset

    if tag_name == 'meta':
        if attributes.get('http-equiv', '').lower() == 'refresh':
            content = attributes.get('content', '')
            found = _parse_meta_refresh_url(content)
            # unexpected content means the redirect won't work, but we might
            # as well be permissive and yield the entire string.
            link = (found.group('url') if found else content).strip()
            if link:
                offset = found.start('url') if found else content.find(link)
                link, offset = unquote_match(link, offset)
                yield el, 'content', link, offset
        if attributes.get('itemprop', '').lower() == 'image':
            link = attributes.get('content', '')
            if link:
                yield el, 'content', link, 0
    elif tag_name == 'param':
        if (el.get('valuetype') or '').lower() == 'ref':
            yield el, 'value', el.get('value'), 0
    elif tag_name == 'style' and el.text:
        # (start_pos, url) pairs from both url() references and @imports
        located = [
            unquote_match(found.group(1), found.start(1))[::-1]
            for found in _iter_css_urls(el.text)
        ] + [
            (found.start(1), found.group(1))
            for found in _iter_css_imports(el.text)
        ]
        # sort by start pos to bring both match sets back into order
        # and reverse the list to report correct positions despite
        # modifications
        for offset, link in sorted(located, reverse=True):
            yield el, None, link, offset

    if 'style' in attributes:
        # yield in reversed order to simplify in-place modifications
        for found in reversed(list(_iter_css_urls(attributes['style']))):
            link, offset = unquote_match(found.group(1), found.start(1))
            yield el, 'style', link, offset
def _fill_single(input, value):
    """Store ``value`` in a single-valued control.

    A ``<textarea>`` has its child elements removed and gets the value as
    its text; any other element gets it as its ``value`` attribute.
    """
    if _nons(input.tag) == 'textarea':
        # Only drop the children.  ``Element.clear()`` would additionally
        # wipe every attribute (``name``, ``rows``, ...) and the tail text
        # that follows the textarea.
        del input[:]
        input.text = value
    else:
        input.set('value', value)
def from_response(cls, response, formname=None, formnumber=0, formdata=None,
                  clickdata=None, dont_click=False, **kwargs):
    """Build a request that submits a form found in ``response``.

    The form is looked up by ``formname`` (its ``name`` attribute) and, if
    that fails or is not given, by index ``formnumber``.  The values already
    present in the form are collected, fields named in ``formdata`` are
    overridden by it, and the result is url-encoded into the body (POST) or
    the query string (any other method).

    :param formdata: dict or iterable of ``(name, value)`` tuples.
    :param clickdata: mapping of attribute name to value used to look up
        the ``<input>`` to "click".
    :param dont_click: when true, no submit control is clicked implicitly.
    :param kwargs: passed on to ``cls``; ``encoding`` is honoured here too.
    :raises ValueError: if ``formdata`` cannot be turned into a dict or the
        response contains no form.
    :raises IndexError: if ``formnumber`` is out of range.
    """
    if not hasattr(formdata, "items"):
        try:
            formdata = dict(formdata) if formdata else {}
        except (ValueError, TypeError):
            raise ValueError('formdata should be a dict or iterable of tuples')

    # ``pop`` rather than ``get``: encoding is passed explicitly to ``cls``
    # below, so leaving it in kwargs would pass the keyword twice.
    encoding = kwargs.pop('encoding', response.encoding or 'UTF-8')
    hxs = html.fromstring(response.body_as_unicode(), base_url=response.url)
    forms = hxs.forms
    if not forms:
        raise ValueError("No <form> element found in %s" % response)

    form = None
    if formname:
        for f in forms:
            attrs = f.attrib
            if 'name' in attrs and formname == attrs['name']:
                form = f
                break
    if form is None:
        try:
            form = forms[formnumber]
        except IndexError:
            raise IndexError("Form number %d not found in %s"
                             % (formnumber, response))

    clickable = []
    results = []
    xmlns = bool(hxs.xpath("@xmlns"))
    for el in form.inputs:
        name = el.name
        if not name or name in formdata:
            continue
        tag = html._nons(el.tag)
        if tag == 'textarea':
            results.append((name, el.value))
        elif tag == 'select':
            if xmlns:
                # use builtin select parser with namespaces
                value = el.value
            else:
                # NOTE(review): this yields <option> elements rather than
                # their values -- confirm that is what is intended.
                value = el.xpath(".//option[@selected]") or None

            if el.multiple:
                # ``value`` is None when nothing is selected on the
                # non-namespaced path; iterate nothing instead of failing.
                for v in value or ():
                    if v is not None:
                        results.append((name, v))
            elif value is not None:
                results.append(
                    (name, value[0] if isinstance(value, list) else value))
            else:
                # Nothing selected: fall back to the first option's value.
                option = el.xpath(".//option[1]/@value")
                if option:
                    results.append((name, option[0]))
        else:
            assert tag == 'input', ("Unexpected tag: %r" % el)
            if el.checkable and not el.checked:
                continue
            if el.type in ('image', 'reset'):
                continue
            elif el.type == 'submit':
                # Submit controls are only sent when "clicked" (see below).
                clickable.append(el)
                continue
            value = el.value
            if value is not None:
                results.append((name, el.value))

    if clickdata is not None:
        for key, value in clickdata.items():
            clicked = form.xpath(".//input[@%s='%s']" % (key, value))[0]
            results.append([clicked.xpath("@name")[0],
                            clicked.xpath("@value")])
    elif not dont_click and clickable:
        if not set(clickable).intersection(formdata):
            button = clickable.pop(0)
            results.append((button.name, button.value))

    results.extend([(key, value) for key, value in formdata.items()])
    values = [(_unicode_to_str(key, encoding),
               _unicode_to_str(value, encoding))
              for key, value in results]

    url = form.action if form.action else form.base_url

    if form.method == "POST":
        kwargs.setdefault('headers', {}).update(
            {'Content-Type': 'application/x-www-form-urlencoded'})
        body = urllib.urlencode(values, doseq=1)
    else:
        url += '&' if '?' in url else '?'
        url += urllib.urlencode(values, doseq=1)
        body = None

    return cls(url, method=form.method, body=body, encoding=encoding,
               **kwargs)
def _fill_single(input, value):
    """Write ``value`` into a single-valued control.

    The value becomes the text of a ``<textarea>`` and the ``value``
    attribute of every other element.
    """
    is_textarea = _nons(input.tag) == "textarea"
    if is_textarea:
        input.text = value
        return
    input.set("value", value)
def _handle_lxml_elem(el):
    """
    From source code of `lxml.html.iter_links` function.
    With added refactoring of multi-urls attributes, i.e. src-set

    Yields (element, attribute, link, pos), where attribute may be None
    (indicating the link is in the text).  ``pos`` is the position
    where the link occurs; often 0, but sometimes something else in
    the case of links in stylesheets or style tags.

    Note: multiple links inside of a single text string or
    attribute value are returned in reversed order.  This makes it
    possible to replace or delete them from the text string value
    based on their reported text positions.  Otherwise, a
    modification at one text position can change the positions of
    links reported later on.
    """
    attributes = el.attrib
    tag_name = _nons(el.tag)

    if tag_name == 'object':  # pragma: no cover
        base = None
        if 'codebase' in attributes:
            base = el.get('codebase')
            yield el, 'codebase', base, 0
        for name in ('classid', 'data'):
            if name not in attributes:
                continue
            target = el.get(name)
            if base is not None:
                target = urljoin(base, target)
            yield el, name, target, 0
        if 'archive' in attributes:
            for found in _archive_re.finditer(el.get('archive')):
                target = found.group(0)
                if base is not None:
                    target = urljoin(base, target)
                yield el, 'archive', target, found.start()
    else:
        for name in SINGLE_LINK_ATTRIBS:
            if name in attributes:
                yield el, name, attributes[name], 0
        # XXX Patch for multi-url detection
        for name in LIST_LINK_ATTRIBS:
            if name not in attributes:
                continue
            # yield in reversed order to simplify in-place modifications
            for found in reversed(list(_iter_srcset_urls(attributes[name]))):
                link, offset = _unquote_match(
                    found.group(1).strip(), found.start(1))
                yield el, name, link, offset

    if tag_name == 'meta':
        if attributes.get('http-equiv', '').lower() == 'refresh':
            content = attributes.get('content', '')
            found = _parse_meta_refresh_url(content)
            # unexpected content means the redirect won't work, but we might
            # as well be permissive and yield the entire string.
            link = (found.group('url') if found else content).strip()
            if link:
                offset = found.start('url') if found else content.find(link)
                link, offset = _unquote_match(link, offset)
                yield el, 'content', link, offset
    elif tag_name == 'param':
        if (el.get('valuetype') or '').lower() == 'ref':
            yield el, 'value', el.get('value'), 0
    elif tag_name == 'style' and el.text:
        # (start_pos, url) pairs from both url() references and @imports
        located = [
            _unquote_match(found.group(1), found.start(1))[::-1]
            for found in _iter_css_urls(el.text)
        ] + [
            (found.start(1), found.group(1))
            for found in _iter_css_imports(el.text)
        ]
        # sort by start pos to bring both match sets back into order
        # and reverse the list to report correct positions despite
        # modifications
        for offset, link in sorted(located, reverse=True):
            yield el, None, link, offset

    if 'style' in attributes:
        # yield in reversed order to simplify in-place modifications
        for found in reversed(list(_iter_css_urls(attributes['style']))):
            link, offset = _unquote_match(found.group(1), found.start(1))
            yield el, 'style', link, offset
def __call__(self, doc):
    """
    Cleans the document in place.

    ``doc`` may be an element or an ElementTree (its root is used).  Which
    content is dropped is controlled by the option attributes read from
    ``self`` (``scripts``, ``javascript``, ``comments``, ``style``,
    ``links``, ``meta``, ``page_structure``, ``processing_instructions``,
    ``embedded``, ``frames``, ``forms``, ``annoying_tags``,
    ``remove_tags``, ``allow_tags``, ``remove_unknown_tags``,
    ``safe_attrs_only``, ``add_nofollow``).

    Elements in the "kill" set are dropped together with their content;
    elements in the "remove" set only lose their tag, their content is
    kept.

    Raises ``ValueError`` if both ``allow_tags`` and
    ``remove_unknown_tags`` are set.
    """
    if hasattr(doc, 'getroot'):
        # ElementTree instance, instead of an element
        doc = doc.getroot()
    # convert XHTML to HTML
    for el in doc.iter():
        tag = el.tag
        if isinstance(tag, basestring):
            el.tag = _nons(tag)
    # Normalize a case that IE treats <image> like <img>, and that
    # can confuse either this step or later steps.
    for el in doc.iter('image'):
        el.tag = 'img'
    if not self.comments:
        # Of course, if we were going to kill comments anyway, we don't
        # need to worry about this
        self.kill_conditional_comments(doc)

    kill_tags = set()
    remove_tags = set(self.remove_tags or ())

    if self.scripts:
        kill_tags.add('script')
    if self.safe_attrs_only:
        safe_attrs = set(defs.safe_attrs)
        for el in doc.iter():
            attrib = el.attrib
            # Snapshot the keys: entries are deleted while looping.
            for aname in list(attrib.keys()):
                if aname not in safe_attrs:
                    del attrib[aname]
    if self.javascript:
        if not self.safe_attrs_only:
            # safe_attrs handles events attributes itself
            for el in doc.iter():
                attrib = el.attrib
                for aname in list(attrib.keys()):
                    if aname.startswith('on'):
                        del attrib[aname]
        doc.rewrite_links(self._remove_javascript_link,
                          resolve_base_href=False)
        if not self.style:
            # If we're deleting style then we don't have to remove JS links
            # from styles, otherwise...
            for el in _find_styled_elements(doc):
                old = el.get('style')
                new = _css_javascript_re.sub('', old)
                # Chain on ``new`` so the javascript removal is kept.
                new = _css_import_re.sub('', new)
                if self._has_sneaky_javascript(new):
                    # Something tricky is going on...
                    del el.attrib['style']
                elif new != old:
                    el.set('style', new)
            for el in list(doc.iter('style')):
                if el.get('type', '').lower().strip() == 'text/javascript':
                    el.drop_tree()
                    continue
                old = el.text or ''
                new = _css_javascript_re.sub('', old)
                # The imported CSS can do anything; we just can't allow:
                new = _css_import_re.sub('', new)
                if self._has_sneaky_javascript(new):
                    # Something tricky is going on...
                    el.text = '/* deleted */'
                elif new != old:
                    el.text = new
    if self.comments or self.processing_instructions:
        # FIXME: why either?  I feel like there's some obscure reason
        # because you can put PIs in comments...?  But I've already
        # forgotten it
        kill_tags.add(etree.Comment)
    if self.processing_instructions:
        kill_tags.add(etree.ProcessingInstruction)
    if self.style:
        kill_tags.add('style')
        etree.strip_attributes(doc, 'style')
    if self.links:
        kill_tags.add('link')
    elif self.style or self.javascript:
        # We must get rid of included stylesheets if Javascript is not
        # allowed, as you can put Javascript in them
        for el in list(doc.iter('link')):
            if 'stylesheet' in el.get('rel', '').lower():
                # Note this kills alternate stylesheets as well
                el.drop_tree()
    if self.meta:
        kill_tags.add('meta')
    if self.page_structure:
        remove_tags.update(('head', 'html', 'title'))
    if self.embedded:
        # FIXME: is <layer> really embedded?
        # We should get rid of any <param> tags not inside <applet>;
        # These are not really valid anyway.
        for el in list(doc.iter('param')):
            parent = el.getparent()
            while parent is not None and parent.tag not in ('applet', 'object'):
                parent = parent.getparent()
            if parent is None:
                el.drop_tree()
        kill_tags.update(('applet',))
        # The alternate contents that are in an iframe are a good fallback:
        remove_tags.update(('iframe', 'embed', 'layer', 'object', 'param'))
    if self.frames:
        # FIXME: ideally we should look at the frame links, but
        # generally frames don't mix properly with an HTML
        # fragment anyway.
        kill_tags.update(defs.frame_tags)
    if self.forms:
        remove_tags.add('form')
        kill_tags.update(('button', 'input', 'select', 'textarea'))
    if self.annoying_tags:
        remove_tags.update(('blink', 'marquee'))

    _remove = []
    _kill = []
    for el in doc.iter():
        if el.tag in kill_tags:
            if self.allow_element(el):
                continue
            _kill.append(el)
        elif el.tag in remove_tags:
            if self.allow_element(el):
                continue
            _remove.append(el)

    if _remove and _remove[0] == doc:
        # We have to drop the parent-most tag, which we can't
        # do.  Instead we'll rewrite it:
        el = _remove.pop(0)
        el.tag = 'div'
        el.attrib.clear()
    elif _kill and _kill[0] == doc:
        # We have to drop the parent-most element, which we can't
        # do.  Instead we'll clear it:
        el = _kill.pop(0)
        if el.tag != 'html':
            el.tag = 'div'
        el.clear()

    for el in _kill:
        el.drop_tree()
    for el in _remove:
        el.drop_tag()

    allow_tags = self.allow_tags
    if self.remove_unknown_tags:
        if allow_tags:
            raise ValueError(
                "It does not make sense to pass in both allow_tags and remove_unknown_tags")
        allow_tags = set(defs.tags)
    if allow_tags:
        # Collect first, then drop: the tree is modified by drop_tag().
        bad = [el for el in doc.iter() if el.tag not in allow_tags]
        for el in bad:
            el.drop_tag()
    if self.add_nofollow:
        for el in _find_external_links(doc):
            if not self.allow_follow(el):
                el.set('rel', 'nofollow')
def __call__(self, doc):
    """
    Cleans the document in place.

    ``doc`` may be an element or an ElementTree (its root is used).  Which
    content is dropped is controlled by the option attributes read from
    ``self`` (``scripts``, ``javascript``, ``comments``, ``style``,
    ``links``, ``meta``, ``page_structure``, ``processing_instructions``,
    ``embedded``, ``frames``, ``forms``, ``annoying_tags``,
    ``remove_tags``, ``allow_tags``, ``remove_unknown_tags``,
    ``safe_attrs_only``, ``add_nofollow``).

    Elements in the "kill" set are dropped together with their content;
    elements in the "remove" set only lose their tag, their content is
    kept.

    Raises ``ValueError`` if both ``allow_tags`` and
    ``remove_unknown_tags`` are set.
    """
    if hasattr(doc, 'getroot'):
        # ElementTree instance, instead of an element
        doc = doc.getroot()
    # convert XHTML to HTML
    for el in doc.iter():
        tag = el.tag
        if isinstance(tag, basestring):
            el.tag = _nons(tag)
    # Normalize a case that IE treats <image> like <img>, and that
    # can confuse either this step or later steps.
    for el in doc.iter('image'):
        el.tag = 'img'
    if not self.comments:
        # Of course, if we were going to kill comments anyway, we don't
        # need to worry about this
        self.kill_conditional_comments(doc)

    kill_tags = set()
    remove_tags = set(self.remove_tags or ())

    if self.scripts:
        kill_tags.add('script')
    if self.safe_attrs_only:
        safe_attrs = set(defs.safe_attrs)
        for el in doc.iter():
            attrib = el.attrib
            # Snapshot the keys: entries are deleted while looping.
            for aname in list(attrib.keys()):
                if aname not in safe_attrs:
                    del attrib[aname]
    if self.javascript:
        if not self.safe_attrs_only:
            # safe_attrs handles events attributes itself
            for el in doc.iter():
                attrib = el.attrib
                for aname in list(attrib.keys()):
                    if aname.startswith('on'):
                        del attrib[aname]
        doc.rewrite_links(self._remove_javascript_link,
                          resolve_base_href=False)
        if not self.style:
            # If we're deleting style then we don't have to remove JS links
            # from styles, otherwise...
            for el in _find_styled_elements(doc):
                old = el.get('style')
                new = _css_javascript_re.sub('', old)
                # Chain on ``new`` so the javascript removal is kept.
                new = _css_import_re.sub('', new)
                if self._has_sneaky_javascript(new):
                    # Something tricky is going on...
                    del el.attrib['style']
                elif new != old:
                    el.set('style', new)
            for el in list(doc.iter('style')):
                if el.get('type', '').lower().strip() == 'text/javascript':
                    el.drop_tree()
                    continue
                old = el.text or ''
                new = _css_javascript_re.sub('', old)
                # The imported CSS can do anything; we just can't allow:
                new = _css_import_re.sub('', new)
                if self._has_sneaky_javascript(new):
                    # Something tricky is going on...
                    el.text = '/* deleted */'
                elif new != old:
                    el.text = new
    if self.comments or self.processing_instructions:
        # FIXME: why either?  I feel like there's some obscure reason
        # because you can put PIs in comments...?  But I've already
        # forgotten it
        kill_tags.add(etree.Comment)
    if self.processing_instructions:
        kill_tags.add(etree.ProcessingInstruction)
    if self.style:
        kill_tags.add('style')
        for el in _find_styled_elements(doc):
            del el.attrib['style']
    if self.links:
        kill_tags.add('link')
    elif self.style or self.javascript:
        # We must get rid of included stylesheets if Javascript is not
        # allowed, as you can put Javascript in them
        for el in list(doc.iter('link')):
            if 'stylesheet' in el.get('rel', '').lower():
                # Note this kills alternate stylesheets as well
                el.drop_tree()
    if self.meta:
        kill_tags.add('meta')
    if self.page_structure:
        remove_tags.update(('head', 'html', 'title'))
    if self.embedded:
        # FIXME: is <layer> really embedded?
        # We should get rid of any <param> tags not inside <applet>;
        # These are not really valid anyway.
        for el in list(doc.iter('param')):
            parent = el.getparent()
            while parent is not None and parent.tag not in ('applet', 'object'):
                parent = parent.getparent()
            if parent is None:
                el.drop_tree()
        kill_tags.update(('applet', ))
        # The alternate contents that are in an iframe are a good fallback:
        remove_tags.update(('iframe', 'embed', 'layer', 'object', 'param'))
    if self.frames:
        # FIXME: ideally we should look at the frame links, but
        # generally frames don't mix properly with an HTML
        # fragment anyway.
        kill_tags.update(defs.frame_tags)
    if self.forms:
        remove_tags.add('form')
        kill_tags.update(('button', 'input', 'select', 'textarea'))
    if self.annoying_tags:
        # The element is spelled <marquee>.
        remove_tags.update(('blink', 'marquee'))

    _remove = []
    _kill = []
    for el in doc.iter():
        if el.tag in kill_tags:
            if self.allow_element(el):
                continue
            _kill.append(el)
        elif el.tag in remove_tags:
            if self.allow_element(el):
                continue
            _remove.append(el)

    if _remove and _remove[0] == doc:
        # We have to drop the parent-most tag, which we can't
        # do.  Instead we'll rewrite it:
        el = _remove.pop(0)
        el.tag = 'div'
        el.attrib.clear()
    elif _kill and _kill[0] == doc:
        # We have to drop the parent-most element, which we can't
        # do.  Instead we'll clear it:
        el = _kill.pop(0)
        if el.tag != 'html':
            el.tag = 'div'
        el.clear()

    for el in _kill:
        el.drop_tree()
    for el in _remove:
        el.drop_tag()

    allow_tags = self.allow_tags
    if self.remove_unknown_tags:
        if allow_tags:
            raise ValueError(
                "It does not make sense to pass in both allow_tags and remove_unknown_tags"
            )
        allow_tags = set(defs.tags)
    if allow_tags:
        # Collect first, then drop: the tree is modified by drop_tag().
        bad = [el for el in doc.iter() if el.tag not in allow_tags]
        for el in bad:
            el.drop_tag()
    if self.add_nofollow:
        for el in _find_external_links(doc):
            if not self.allow_follow(el):
                el.set('rel', 'nofollow')