Exemple #1
0
def _insert_error(el, error, error_class, error_creator):
    """Flag ``el`` as erroneous and hand the error to ``error_creator``.

    When ``error_class`` is truthy it is added to ``el`` (unless ``el`` is a
    ``form``) and to every label that ``_label_for_xpath`` returns for the
    element's ``id``.  A falsy ``error_class`` skips the labels too, so
    ``_add_class`` is never called with an empty class name.

    ``error_creator`` is always invoked as
    ``error_creator(el, is_block, error)``.  ``is_block`` is False for tags
    listed in ``defs.empty_tags`` and for ``textarea``, True otherwise.
    """
    tag = _nons(el.tag)
    is_block = not (tag in defs.empty_tags or tag == 'textarea')
    if error_class:
        if tag != 'form':
            _add_class(el, error_class)
        el_id = el.get('id')
        if el_id:
            # The labels get the same class as the element they describe.
            for label in _label_for_xpath(el, for_id=el_id) or ():
                _add_class(label, error_class)
    error_creator(el, is_block, error)
Exemple #2
0
def _insert_error(el, error, error_class, error_creator):
    """Apply the error class to ``el`` and its labels, then insert the error.

    The class is only applied when ``error_class`` is truthy: to ``el``
    itself unless it is a ``form``, and to each label found by
    ``_label_for_xpath`` for the element's ``id``.  Previously the labels
    were classed even when ``error_class`` was falsy.

    Finally ``error_creator(el, is_block, error)`` is called, where
    ``is_block`` is False for ``textarea`` and for tags in
    ``defs.empty_tags``.
    """
    tag_name = _nons(el.tag)
    if tag_name == 'textarea' or tag_name in defs.empty_tags:
        is_block = False
    else:
        is_block = True

    if error_class and tag_name != 'form':
        _add_class(el, error_class)

    element_id = el.get('id')
    if error_class and element_id:
        labels = _label_for_xpath(el, for_id=element_id)
        if labels:
            for label in labels:
                _add_class(label, error_class)

    error_creator(el, is_block, error)
Exemple #3
0
def form_values(self):
    """
    Collect the form's field values as a list of ``(name, value)`` tuples.

    Unnamed fields, unchecked checkable inputs and inputs of type ``image``
    or ``reset`` are left out; a multiple select contributes one tuple per
    selected value.  The result can be given to ``urllib.urlencode()``.
    """
    pairs = []
    for field in self.inputs:
        field_name = field.name
        if not field_name:
            continue

        kind = _nons(field.tag)
        if kind == 'textarea':
            pairs.append((field_name, field.value))
            continue

        if kind == 'select':
            selected = field.value
            if field.multiple:
                pairs.extend((field_name, item) for item in selected)
            elif selected is not None:
                pairs.append((field_name, selected))
            continue

        assert kind == 'input', (
            "Unexpected tag: %r" % field)
        if field.checkable and not field.checked:
            continue
        if field.type in ('image', 'reset'):
            continue
        current = field.value
        if current is not None:
            pairs.append((field_name, current))
    return pairs
Exemple #4
0
def wrap_text(doc, element='p'):
    """Wrap bare top-level text of ``doc`` in new ``element`` elements.

    Inline elements that directly follow a freshly created wrapper are moved
    into that wrapper.  Wrappers carry a temporary ``MARKER`` attribute that
    is removed again before returning.
    """
    def make_wrapper(content):
        wrapper = etree.Element(element, {MARKER: ''})
        wrapper.text = content
        return wrapper

    def fix_next():
        # Apply the first applicable fix and report whether doc was changed;
        # the caller rescans from the start after every change.
        for index, child in enumerate(doc):
            is_inline = html._nons(child.tag) in INLINE_TAGS
            if is_inline and index and MARKER in doc[index - 1].attrib:
                doc[index - 1].append(child)
                return True
            if not is_whitespace(child.tail):
                doc.insert(index + 1, make_wrapper(child.tail))
                child.tail = None
                return True
        return False

    if doc.text:
        doc.insert(0, make_wrapper(doc.text))
        doc.text = None

    while fix_next():
        pass

    for child in doc:
        if MARKER in child.attrib:
            del child.attrib[MARKER]
Exemple #5
0
def form_values(self):
    """
    Return the form's submittable data as ``(name, value)`` tuples.

    The list is in the order of ``self.inputs`` and is suitable for
    ``urllib.urlencode()``.  Fields without a name, unchecked checkable
    inputs and ``image``/``reset`` inputs contribute nothing.
    """
    collected = []
    emit = collected.append
    for control in self.inputs:
        key = control.name
        if not key:
            continue
        kind = _nons(control.tag)
        if kind == 'select':
            chosen = control.value
            if control.multiple:
                # One tuple per selected option.
                collected.extend((key, item) for item in chosen)
            elif chosen is not None:
                emit((key, chosen))
        elif kind == 'textarea':
            emit((key, control.value))
        else:
            assert kind == 'input', ("Unexpected tag: %r" % control)
            skipped = ((control.checkable and not control.checked)
                       or control.type in ('image', 'reset'))
            if skipped:
                continue
            content = control.value
            if content is not None:
                emit((key, content))
    return collected
Exemple #6
0
def sanitize(input, cleaner=DocumentCleaner, wrap='p'):
    """Clean ``input`` markup with ``cleaner`` and return it as unicode.

       Bare text left at the top level is wrapped in ``wrap`` elements.
       Raises KeyError when ``wrap`` is not a tag in ``html.defs.tags``.
       Note that "body" is appended to ``cleaner.allow_tags`` if missing.
    """
    permitted = cleaner.allow_tags
    if "body" not in permitted:
        permitted.append("body")

    document = html.document_fromstring(
        u"<html><body>%s</body></html>" % input)
    body = [node for node in document if html._nons(node.tag) == "body"][0]

    cleaned = cleaner.clean_html(body)
    RemoveEmptyTags(cleaned)
    StripOuterBreaks(cleaned)

    if wrap not in html.defs.tags:
        raise KeyError(
            "Invalid html tag provided for wrapping the sanitized text")
    WrapText(cleaned, wrap)

    serialized = [etree.tostring(child, encoding=unicode)
                  for child in cleaned.iterchildren()]
    return u"".join(serialized)
Exemple #7
0
def sanitize(input, cleaner=DocumentCleaner, wrap='p'):
    """Clean ``input`` markup with ``cleaner`` and return it as text.

       With ``wrap`` set to a tag name, bare top-level text is wrapped in
       that element; a name missing from ``html.defs.tags`` raises
       ValueError.  With ``wrap=None`` nothing is wrapped and leading bare
       text is kept in front of the serialized children.
    """
    if 'body' not in cleaner.allow_tags:
        cleaner.allow_tags.append('body')

    markup = six.u("<html><body>%s</body></html>") % input
    root = html.document_fromstring(markup)
    body = [child for child in root if html._nons(child.tag) == 'body'][0]

    cleaned = cleaner.clean_html(body)
    remove_empty_tags(cleaned)
    strip_outer_breaks(cleaned)

    wrapping = wrap is not None
    if wrapping:
        if wrap not in html.defs.tags:
            raise ValueError(
                'Invalid html tag provided for wrapping the sanitized text')
        wrap_text(cleaned, wrap)

    pieces = [etree.tostring(child, encoding=six.text_type)
              for child in cleaned.iterchildren()]
    output = six.u('').join(pieces)
    if not wrapping and cleaned.text:
        output = cleaned.text + output

    return output
Exemple #8
0
def _fill_multiple(input, value):
    """Check or select ``input`` according to ``value``.

    ``input`` is a checkbox, a radio button or a ``select`` element;
    ``value`` is the submitted value (a string) or a sequence of values.

    * Checkbox without a ``value`` attribute: a string ``value`` checks the
      box only when it equals ``'on'``.  The old code compared the first
      character of the string with ``'on'``, which could never match.  For a
      non-string ``value`` the first item is passed to ``_check`` as before.
    * Checkbox or radio with a ``value`` attribute: checked when that
      attribute is contained in ``value``.
    * ``select``: each option is selected when its ``value`` attribute (or
      its text content, if the attribute is absent) is contained in
      ``value``.
    """
    input_type = input.get('type', '').lower()
    if input_type == 'checkbox':
        v = input.get('value')
        if v is None:
            if not value:
                result = False
            elif isinstance(value, basestring):
                # The only valid "on" value for an unnamed checkbox is 'on'
                result = value == 'on'
            else:
                # NOTE(review): for sequences any truthy first item still
                # checks the box; confirm whether 'on' should be required
                # here as well.
                result = value[0]
            _check(input, result)
        else:
            _check(input, v in value)
    elif input_type == 'radio':
        v = input.get('value')
        _check(input, v in value)
    else:
        assert _nons(input.tag) == 'select'
        for option in _options_xpath(input):
            v = option.get('value')
            if v is None:
                # This seems to be the default, at least on IE
                # FIXME: but I'm not sure
                v = option.text_content()
            _select(option, v in value)
def sanitize_html(input, cleaner=DocumentCleaner, wrap='p'):
    """Clean ``input`` markup with ``cleaner`` and return it as unicode.

       Empty tags (including ``tr``, ``th``, ``td`` and ``iframe``) and
       outer breaks are removed via ``utils``.  If ``wrap`` is a tag name,
       unwrapped content is wrapped with it; an unknown tag raises
       ValueError.  With ``wrap=None`` leading bare text is preserved as is.
    """
    allowed = cleaner.allow_tags
    if 'body' not in allowed:
        allowed.append('body')

    document = html.document_fromstring(
        u"<html><body>%s</body></html>" % input)
    body = [node for node in document if html._nons(node.tag) == 'body'][0]

    cleaned = cleaner.clean_html(body)
    utils.remove_empty_tags(
        cleaned, extra_empty_tags=['tr', 'th', 'td', 'iframe'])
    utils.strip_outer_breaks(cleaned)

    should_wrap = wrap is not None
    if should_wrap:
        if wrap not in html.defs.tags:
            raise ValueError(
                'Invalid html tag provided for wrapping the sanitized text')
        _wrap_text(cleaned, wrap)

    fragments = [etree.tostring(child, encoding='unicode')
                 for child in cleaned.iterchildren()]
    output = u''.join(fragments)
    if not should_wrap and cleaned.text:
        output = cleaned.text + output

    return output
Exemple #10
0
Fichier : xml.py Projet : h/eureka
    def form_values(self):
        '''
        Return the form's values as a list of ``(name, value)`` tuples.

        Overrides the standard form_values function so that the values of
        "submit" buttons are included; only ``image`` and ``reset`` inputs,
        unnamed fields and unchecked checkable inputs are skipped.
        '''
        gathered = []
        for widget in self.inputs:
            label = widget.name
            if not label:
                continue

            tag_name = html._nons(widget.tag)
            if tag_name == 'textarea':
                gathered.append((label, widget.value))
                continue

            if tag_name == 'select':
                picked = widget.value
                if widget.multiple:
                    gathered.extend((label, entry) for entry in picked)
                elif picked is not None:
                    gathered.append((label, picked))
                continue

            assert tag_name == 'input', (
                "Unexpected tag: %r" % widget)
            unchecked = widget.checkable and not widget.checked
            if unchecked or widget.type in ('image', 'reset'):
                continue
            data = widget.value
            if data is not None:
                gathered.append((label, data))
        return gathered
Exemple #11
0
def wrap_text(doc, element='p'):
    """Ensure ``doc`` has no bare text directly below it.

    Leading text and non-whitespace tails of top-level children are moved
    into new ``element`` elements; an inline child right after such a new
    element is moved into it.  The temporary ``MARKER`` attribute used to
    recognise the new elements is stripped at the end.
    """
    def wrap(content):
        holder = etree.Element(element, {MARKER: ''})
        holder.text = content
        return holder

    if doc.text:
        doc.insert(0, wrap(doc.text))
        doc.text = None

    dirty = True
    while dirty:
        dirty = False
        # Rescan from the start after each modification of doc.
        for pos, node in enumerate(doc):
            if (html._nons(node.tag) in INLINE_TAGS and pos
                    and MARKER in doc[pos - 1].attrib):
                doc[pos - 1].append(node)
            elif not is_whitespace(node.tail):
                doc.insert(pos + 1, wrap(node.tail))
                node.tail = None
            else:
                continue
            dirty = True
            break

    for node in doc:
        if MARKER in node.attrib:
            del node.attrib[MARKER]
Exemple #12
0
def _fill_multiple(input, value):
    """Set the checked/selected state of ``input`` from ``value``.

    ``input`` may be a checkbox, a radio button or a ``select``; ``value``
    is either a single string or a sequence of submitted values.

    For a checkbox that has no ``value`` attribute, a string ``value`` now
    checks the box exactly when it is ``'on'`` (previously only its first
    character was compared with ``'on'``, so the box was never checked).
    A non-string ``value`` still passes its first item to ``_check``.

    Checkboxes and radios with a ``value`` attribute are checked when the
    attribute is in ``value``; options of a ``select`` are selected when
    their ``value`` attribute, or failing that their text content, is in
    ``value``.
    """
    kind = input.get('type', '').lower()
    if kind == 'checkbox':
        v = input.get('value')
        if v is not None:
            _check(input, v in value)
            return
        if not value:
            result = False
        elif isinstance(value, basestring):
            # The only valid "on" value for an unnamed checkbox is 'on'
            result = value == 'on'
        else:
            # NOTE(review): a sequence is not compared with 'on'; confirm
            # whether it should be.
            result = value[0]
        _check(input, result)
    elif kind == 'radio':
        v = input.get('value')
        _check(input, v in value)
    else:
        assert _nons(input.tag) == 'select'
        for option in _options_xpath(input):
            v = option.get('value')
            if v is None:
                # This seems to be the default, at least on IE
                # FIXME: but I'm not sure
                v = option.text_content()
            _select(option, v in value)
Exemple #13
0
def _takes_multiple(input):
    """Return True if ``input`` may carry several values at once.

    That is the case for a ``select`` with a ``multiple`` attribute and for
    inputs of type ``radio`` or ``checkbox``.

    ``multiple`` is an HTML boolean attribute, so its presence alone counts,
    whatever its value: a bare ``multiple`` (empty value) and
    ``multiple="0"`` both enable multiple selection.  The previous
    truthiness test rejected the empty value.
    """
    if _nons(input.tag) == 'select' and input.get('multiple') is not None:
        return True
    return input.get('type', '').lower() in ('radio', 'checkbox')
Exemple #14
0
def _takes_multiple(input):
    """Tell whether ``input`` accepts more than one value.

    True for radio buttons, checkboxes and ``select`` elements that have a
    ``multiple`` attribute.  Because ``multiple`` is a boolean attribute in
    HTML, only its presence is tested; an empty value (bare ``multiple``)
    used to be treated as "not multiple".
    """
    is_multi_select = (_nons(input.tag) == 'select'
                       and input.get('multiple') is not None)
    if is_multi_select:
        return True
    input_type = input.get('type', '').lower()
    return input_type in ('radio', 'checkbox')
def _takes_multiple(input):
    """Return whether ``input`` can hold multiple values.

    A ``select`` qualifies when it carries the ``multiple`` attribute; the
    attribute's value is ignored, since HTML boolean attributes are enabled
    by presence alone (the old truthiness check missed ``multiple=""``).
    Inputs of type ``radio`` and ``checkbox`` always qualify.
    """
    if _nons(input.tag) == "select" and input.get("multiple") is not None:
        return True
    if input.get("type", "").lower() in ("radio", "checkbox"):
        return True
    return False
def _wrap_text(doc, element='p'):
    """
    Make sure there is no unwrapped text at the top level of ``doc``.

    Any bare text found is wrapped in a new ``element`` element (``<p>`` by
    default).  In addition to what htmllaundry does, bare inline tags are
    wrapped too, so that no `<em>`, `<strong>`, etc. tags float around
    outside of a paragraph.

    Elements created here carry a temporary ``MARKER`` attribute, removed
    again before returning.  Marked elements are never wrapped themselves:
    without that guard, an ``element`` that is one of ``INLINE_TAGS`` would
    have its own wrappers re-wrapped forever.
    """
    def par(text):
        el = etree.Element(element, {MARKER: ''})
        el.text = text
        return el

    def wrapper_par(el):
        # Inserting el into the wrapper moves it out of doc.
        wrapper = etree.Element(element, {MARKER: ''})
        wrapper.insert(0, el)
        return wrapper

    if doc.text:
        doc.insert(0, par(doc.text))
        doc.text = None

    while True:
        # Every change to doc invalidates the enumeration, so the scan
        # restarts from the top until a full pass changes nothing.
        for (i, el) in enumerate(doc):
            if (html._nons(el.tag) in INLINE_TAGS
                    and MARKER not in el.attrib):
                if i and MARKER in doc[i - 1].attrib:
                    doc[i - 1].append(el)
                else:
                    doc.insert(i, wrapper_par(el))
                break
            if not utils.is_whitespace(el.tail):
                doc.insert(i + 1, par(el.tail))
                el.tail = None
                break
        else:
            break

    for el in doc:
        if MARKER in el.attrib:
            del el.attrib[MARKER]
Exemple #17
0
 def __handle(self, el):
     """Report every link found on a single lxml element to ``self.handle``.

     Each link is passed as ``self.handle(el, attribute, url, pos)`` where
     ``attribute`` is the attribute holding the link (``None`` when the link
     sits in the text of a ``<style>`` element) and ``pos`` is the offset of
     the link inside that attribute value or text.

     Several links inside one attribute value or text are reported in
     reverse order of position.

     NOTE(review): according to the original description ``self.handle``
     builds file objects and starts the download; that is not visible here.
     """
     attribs = el.attrib
     tag = _nons(el.tag)
     if tag == 'object':
         codebase = None
         if 'codebase' in attribs:
             codebase = el.get('codebase')
             self.handle(el, 'codebase', codebase, 0)
         # classid/data/archive values are joined onto codebase when present
         for attrib in ('classid', 'data'):
             if attrib in attribs:
                 value = el.get(attrib)
                 if codebase is not None:
                     value = urljoin(codebase, value)
                 self.handle(el, attrib, value, 0)
         if 'archive' in attribs:
             for match in _archive_re.finditer(el.get('archive')):
                 value = match.group(0)
                 if codebase is not None:
                     value = urljoin(codebase, value)
                 self.handle(el, 'archive', value, match.start())
     else:
         # attributes holding exactly one link
         for attrib in link_attrs:
             if attrib in attribs:
                 self.handle(el, attrib, attribs[attrib], 0)
         # attributes holding several links, parsed with _iter_srcset_urls
         for attrib in list_link_attrs:
             if attrib in attribs:
                 urls = list(_iter_srcset_urls(attribs[attrib]))
                 if urls:
                     # return in reversed order to simplify in-place modifications
                     for match in urls[::-1]:
                         url, start = _unquote_match(
                             match.group(1).strip(), match.start(1))
                         self.handle(el, attrib, url, start)
     if tag == 'meta':
         http_equiv = attribs.get('http-equiv', '').lower()
         if http_equiv == 'refresh':
             content = attribs.get('content', '')
             match = _parse_meta_refresh_url(content)
             url = (match.group('url') if match else content).strip()
             # unexpected content means the redirect won't work, but we might
             # as well be permissive and return the entire string.
             if url:
                 url, pos = _unquote_match(
                     url,
                     match.start('url') if match else content.find(url))
                 self.handle(el, 'content', url, pos)
     elif tag == 'param':
         # only <param valuetype="ref"> has its value reported as a link
         valuetype = el.get('valuetype') or ''
         if valuetype.lower() == 'ref':
             self.handle(el, 'value', el.get('value'), 0)
     elif tag == 'style' and el.text:
         urls = [
             # (start_pos, url)
             _unquote_match(match.group(1), match.start(1))[::-1]
             for match in _iter_css_urls(el.text)
         ] + [(match.start(1), match.group(1))
              for match in _iter_css_imports(el.text)]
         if urls:
             # sort by start pos to bring both match sets back into order
             # and reverse the list to report correct positions despite
             # modifications
             urls.sort(reverse=True)
             for start, url in urls:
                 self.handle(el, None, url, start)
     # inline style attributes are scanned on every element, whatever its tag
     if 'style' in attribs:
         urls = list(_iter_css_urls(attribs['style']))
         if urls:
             # return in reversed order to simplify in-place modifications
             for match in urls[::-1]:
                 url, start = _unquote_match(match.group(1), match.start(1))
                 self.handle(el, 'style', url, start)
Exemple #18
0
def _fill_single(input, value):
    """Store ``value`` in ``input``: as text for a textarea, otherwise as
    the ``value`` attribute."""
    if _nons(input.tag) != 'textarea':
        input.set('value', value)
        return
    input.text = value
Exemple #19
0
def links(el):
    """Yield ``(element, attribute, link, pos)`` for every link found on ``el``.

    ``attribute`` is the name of the attribute holding the link, or ``None``
    when the link was found in the text of a ``<style>`` element.  ``pos``
    is the offset of the link inside the attribute value or text.

    Several links inside one attribute value or text are yielded in reverse
    order of position, which simplifies in-place modification of that string.
    """
    tag = _nons(el.tag)
    attribs = el.attrib

    if tag == 'object':  # pragma: no cover
        codebase = None
        if 'codebase' in attribs:
            codebase = el.get('codebase')
            yield el, 'codebase', codebase, 0
        # classid/data/archive values are joined onto codebase when present
        for attrib in ('classid', 'data'):
            if attrib in attribs:
                value = el.get(attrib)
                if codebase is not None:
                    value = urljoin(codebase, value)
                yield el, attrib, value, 0
        if 'archive' in attribs:
            for match in _archive_re.finditer(el.get('archive')):
                value = match.group(0)
                if codebase is not None:
                    value = urljoin(codebase, value)
                yield el, 'archive', value, match.start()
    else:
        # attributes holding exactly one link
        for attrib in link_attrs:
            if attrib in attribs:
                yield el, attrib, attribs[attrib], 0

        # XXX Patch for srcset URL detection: these attributes may hold
        # several links, each of which is yielded separately
        for attrib in srcset_attrs:
            if attrib in attribs:
                urls = list(_iter_srcset_urls(attribs[attrib]))
                if urls:
                    # yield in reversed order to simplify in-place modifications
                    for match in urls[::-1]:
                        url, start = unquote_match(
                            match.group(1).strip(), match.start(1))
                        yield el, attrib, url, start
    if tag == 'meta':
        http_equiv = attribs.get('http-equiv', '').lower()
        if http_equiv == 'refresh':
            content = attribs.get('content', '')
            match = _parse_meta_refresh_url(content)
            url = (match.group('url') if match else content).strip()
            # unexpected content means the redirect won't work, but we might
            # as well be permissive and yield the entire string.
            if url:
                url, pos = unquote_match(
                    url,
                    match.start('url') if match else content.find(url))
                yield el, 'content', url, pos
        # <meta itemprop="image" content="..."> is treated as a link as well
        itemprop = attribs.get('itemprop', '').lower()
        if itemprop == 'image':
            url = attribs.get('content', '')
            if url:
                yield el, 'content', url, 0
    elif tag == 'param':
        # only <param valuetype="ref"> has its value yielded as a link
        valuetype = el.get('valuetype') or ''
        if valuetype.lower() == 'ref':
            yield el, 'value', el.get('value'), 0
    elif tag == 'style' and el.text:
        urls = [
            # (start_pos, url)
            unquote_match(match.group(1), match.start(1))[::-1]
            for match in _iter_css_urls(el.text)
        ] + [(match.start(1), match.group(1))
             for match in _iter_css_imports(el.text)]
        if urls:
            # sort by start pos to bring both match sets back into order
            # and reverse the list to report correct positions despite
            # modifications
            urls.sort(reverse=True)
            for start, url in urls:
                yield el, None, url, start
    # inline style attributes are scanned on every element, whatever its tag
    if 'style' in attribs:
        urls = list(_iter_css_urls(attribs['style']))
        if urls:
            # yield in reversed order to simplify in-place modifications
            for match in urls[::-1]:
                url, start = unquote_match(match.group(1), match.start(1))
                yield el, 'style', url, start
Exemple #20
0
def _fill_single(input, value):
    """Store ``value`` in ``input``.

    A textarea is cleared first and then receives ``value`` as its text;
    any other element gets ``value`` as its ``value`` attribute.
    """
    is_textarea = _nons(input.tag) == 'textarea'
    if not is_textarea:
        input.set('value', value)
        return
    input.clear()
    input.text = value
Exemple #21
0
    def from_response(cls, response, formname=None, formnumber=0, formdata=None,
                      clickdata=None, dont_click=False, **kwargs):
        """Build a request that submits a form found in ``response``.

        The form is the first one whose ``name`` attribute equals
        ``formname``; if none matches (or ``formname`` is not given) the
        form at index ``formnumber`` is used.

        The submitted values are, in this order: the current values of the
        form's named fields that are not overridden by ``formdata``; then
        either the inputs matched by ``clickdata`` or, unless ``dont_click``
        is set, the first submit input; and finally the ``formdata`` items.

        For a POST form the values become a urlencoded body; otherwise they
        are appended to the query string of the URL.

        Returns ``cls(url, method=form.method, body=body,
        encoding=encoding, **kwargs)``.

        Raises ValueError if ``formdata`` cannot be turned into a dict or if
        the response contains no form, and IndexError if ``formnumber`` is
        out of range or a ``clickdata`` item matches no input.
        """
        # Accept any iterable of pairs for formdata; None/empty means {}.
        if not hasattr(formdata, "items"):
            try:
                if formdata:
                    formdata = dict(formdata)
                else: formdata = {}
            except (ValueError, TypeError):
                raise ValueError('formdata should be a dict or iterable of tuples')
        encoding = kwargs.get('encoding', response.encoding or 'UTF-8')
        hxs = html.fromstring(response.body_as_unicode(), base_url=response.url)
        forms = hxs.forms
        if not forms:
            raise ValueError("No <form> element found in %s" % response)

        form = None

        if formname:
            for f in forms:
                attrs = f.attrib
                if 'name' in attrs and formname==attrs['name']:
                    form = f
                    break

        # Fall back to the positional lookup when no form matched by name.
        if form is None:
            try:
                form = forms[formnumber]
            except IndexError:
                raise IndexError("Form number %d not found in %s" % (formnumber, response))

        clickable = []
        results = []
        # True when the parsed root element carries an xmlns attribute.
        xmlns = bool(hxs.xpath("@xmlns"))
        for el in form.inputs:
            name = el.name
            # Fields named in formdata are overridden, so their current
            # value is not collected.
            if not name or name in formdata:
                continue
            tag = html._nons(el.tag)
            if tag == 'textarea':
                results.append((name, el.value))
            elif tag == 'select':
                if xmlns:
                    #use builtin select parser with namespaces
                    value = el.value
                else:
                    # NOTE(review): this XPath yields option elements, not
                    # their value strings -- confirm _unicode_to_str copes.
                    value = el.xpath(".//option[@selected]") or None

                if el.multiple:
                    # NOTE(review): value is None here when no option is
                    # selected in the non-xmlns branch, which this loop
                    # cannot iterate -- verify.
                    for v in value:
                        if v is not None:
                            results.append((name, v))
                elif value is not None:
                    results.append((name, value[0] if isinstance(value, list) else value))
                else:
                    # Nothing selected: fall back to the first option's value.
                    option = el.xpath(".//option[1]/@value")
                    if option:
                        results.append((name, option[0]))
            else:
                assert tag == 'input', ("Unexpected tag: %r" % el)
                if el.checkable and not el.checked:
                    continue
                if el.type in ( 'image', 'reset'):
                    continue
                elif el.type=='submit':
                    # Submit inputs are only sent when "clicked" below.
                    clickable.append(el)
                    continue
                value = el.value
                if value is not None:
                    results.append((name, el.value))
        if clickdata is not None:
            for key, value in clickdata.items():
                # NOTE(review): key and value are interpolated into the
                # XPath unescaped; a quote in value would break the query.
                input = form.xpath(".//input[@%s='%s']" %(key, value))[0]
                # NOTE(review): appends a list whose second item is itself an
                # XPath result list, unlike the tuples used elsewhere.
                results.append([input.xpath("@name")[0], input.xpath("@value")])
        elif not dont_click and clickable:
            # NOTE(review): this intersects input elements with formdata keys
            # (names), so it presumably is always empty -- verify.
            if not set(clickable).intersection(formdata):
                button = clickable.pop(0)
                results.append((button.name, button.value))
        # NOTE(review): iteritems() and urllib.urlencode are Python 2 APIs.
        results.extend([(key, value) for key, value in formdata.iteritems()])
        values = [(_unicode_to_str(key, encoding), _unicode_to_str(value, encoding))
                  for key,value in results]
        if form.action:
            url = form.action
        else:
            url = form.base_url
        if form.method == "POST":
            kwargs.setdefault('headers', {}).update(
                        {'Content-Type':'application/x-www-form-urlencoded'})
            body = urllib.urlencode(values, doseq=1)
        else:
            # Non-POST: append the values to the URL's query string.
            if '?' in url:
                url += '&'
            else:
                url += '?'
            url += urllib.urlencode(values, doseq=1)
            body=None

        return cls(url, method=form.method, body=body, encoding=encoding, **kwargs)
def _fill_single(input, value):
    """Write ``value`` into ``input``: the text of a textarea, or the
    ``value`` attribute of any other element."""
    tag_name = _nons(input.tag)
    if tag_name != "textarea":
        input.set("value", value)
    else:
        input.text = value
Exemple #23
0
    def _handle_lxml_elem(el):
        """
        Yield every link found on a single lxml element.

        Adapted from the source code of the `lxml.html.iter_links` function,
        with added handling of multi-URL attributes, i.e. srcset.

        Yields (element, attribute, link, pos) tuples, where attribute may
        be None (indicating the link is in the text).  ``pos`` is the
        position where the link occurs; often 0, but sometimes something
        else in the case of links in stylesheets or style tags.

        Note: multiple links inside of a single text string or
        attribute value are returned in reversed order.  This makes it
        possible to replace or delete them from the text string value
        based on their reported text positions.  Otherwise, a
        modification at one text position can change the positions of
        links reported later on.
        """
        attribs = el.attrib
        tag = _nons(el.tag)
        if tag == 'object':  # pragma: no cover
            codebase = None
            if 'codebase' in attribs:
                codebase = el.get('codebase')
                yield el, 'codebase', codebase, 0
            # classid/data/archive values are joined onto codebase when present
            for attrib in ('classid', 'data'):
                if attrib in attribs:
                    value = el.get(attrib)
                    if codebase is not None:
                        value = urljoin(codebase, value)
                    yield el, attrib, value, 0
            if 'archive' in attribs:
                for match in _archive_re.finditer(el.get('archive')):
                    value = match.group(0)
                    if codebase is not None:
                        value = urljoin(codebase, value)
                    yield el, 'archive', value, match.start()
        else:
            # attributes holding exactly one link
            for attrib in SINGLE_LINK_ATTRIBS:
                if attrib in attribs:
                    yield el, attrib, attribs[attrib], 0

            # XXX Patch for multi-url detection: these attributes may hold
            # several links, each of which is yielded separately
            for attrib in LIST_LINK_ATTRIBS:
                if attrib in attribs:
                    urls = list(_iter_srcset_urls(attribs[attrib]))
                    if urls:
                        # yield in reversed order to simplify in-place modifications
                        for match in urls[::-1]:
                            url, start = _unquote_match(
                                match.group(1).strip(), match.start(1))
                            yield el, attrib, url, start
        if tag == 'meta':
            http_equiv = attribs.get('http-equiv', '').lower()
            if http_equiv == 'refresh':
                content = attribs.get('content', '')
                match = _parse_meta_refresh_url(content)
                url = (match.group('url') if match else content).strip()
                # unexpected content means the redirect won't work, but we might
                # as well be permissive and yield the entire string.
                if url:
                    url, pos = _unquote_match(
                        url,
                        match.start('url') if match else content.find(url))
                    yield el, 'content', url, pos
        elif tag == 'param':
            # only <param valuetype="ref"> has its value yielded as a link
            valuetype = el.get('valuetype') or ''
            if valuetype.lower() == 'ref':
                yield el, 'value', el.get('value'), 0
        elif tag == 'style' and el.text:
            urls = [
                # (start_pos, url)
                _unquote_match(match.group(1), match.start(1))[::-1]
                for match in _iter_css_urls(el.text)
            ] + [(match.start(1), match.group(1))
                 for match in _iter_css_imports(el.text)]
            if urls:
                # sort by start pos to bring both match sets back into order
                # and reverse the list to report correct positions despite
                # modifications
                urls.sort(reverse=True)
                for start, url in urls:
                    yield el, None, url, start
        # inline style attributes are scanned on every element, whatever its tag
        if 'style' in attribs:
            urls = list(_iter_css_urls(attribs['style']))
            if urls:
                # yield in reversed order to simplify in-place modifications
                for match in urls[::-1]:
                    url, start = _unquote_match(match.group(1), match.start(1))
                    yield el, 'style', url, start
Exemple #24
0
    def __call__(self, doc):
        """
        Cleans the document.
        """
        if hasattr(doc, 'getroot'):
            # ElementTree instance, instead of an element
            doc = doc.getroot()
        # convert XHTML to HTML
        for el in doc.iter():
            tag = el.tag
            if isinstance(tag, basestring):
                el.tag = _nons(tag)
        # Normalize a case that IE treats <image> like <img>, and that
        # can confuse either this step or later steps.
        for el in doc.iter('image'):
            el.tag = 'img'
        if not self.comments:
            # Of course, if we were going to kill comments anyway, we don't
            # need to worry about this
            self.kill_conditional_comments(doc)
        kill_tags = set()
        remove_tags = set(self.remove_tags or ())
        if self.allow_tags:
            allow_tags = set(self.allow_tags)
        else:
            allow_tags = set()
        if self.scripts:
            kill_tags.add('script')
        if self.safe_attrs_only:
            safe_attrs = set(defs.safe_attrs)
            for el in doc.iter():
                attrib = el.attrib
                for aname in attrib.keys():
                    if aname not in safe_attrs:
                        del attrib[aname]
        if self.javascript:
            if not self.safe_attrs_only:
                # safe_attrs handles events attributes itself
                for el in doc.iter():
                    attrib = el.attrib
                    for aname in attrib.keys():
                        if aname.startswith('on'):
                            del attrib[aname]
            doc.rewrite_links(self._remove_javascript_link,
                              resolve_base_href=False)
            if not self.style:
                # If we're deleting style then we don't have to remove JS links
                # from styles, otherwise...
                for el in _find_styled_elements(doc):
                    old = el.get('style')
                    new = _css_javascript_re.sub('', old)
                    new = _css_import_re.sub('', old)
                    if self._has_sneaky_javascript(new):
                        # Something tricky is going on...
                        del el.attrib['style']
                    elif new != old:
                        el.set('style', new)
                for el in list(doc.iter('style')):
                    if el.get('type', '').lower().strip() == 'text/javascript':
                        el.drop_tree()
                        continue
                    old = el.text or ''
                    new = _css_javascript_re.sub('', old)
                    # The imported CSS can do anything; we just can't allow:
                    new = _css_import_re.sub('', old)
                    if self._has_sneaky_javascript(new):
                        # Something tricky is going on...
                        el.text = '/* deleted */'
                    elif new != old:
                        el.text = new
        if self.comments or self.processing_instructions:
            # FIXME: why either?  I feel like there's some obscure reason
            # because you can put PIs in comments...?  But I've already
            # forgotten it
            kill_tags.add(etree.Comment)
        if self.processing_instructions:
            kill_tags.add(etree.ProcessingInstruction)
        if self.style:
            kill_tags.add('style')
            etree.strip_attributes(doc, 'style')
        if self.links:
            kill_tags.add('link')
        elif self.style or self.javascript:
            # We must get rid of included stylesheets if Javascript is not
            # allowed, as you can put Javascript in them
            for el in list(doc.iter('link')):
                if 'stylesheet' in el.get('rel', '').lower():
                    # Note this kills alternate stylesheets as well
                    el.drop_tree()
        if self.meta:
            kill_tags.add('meta')
        if self.page_structure:
            remove_tags.update(('head', 'html', 'title'))
        if self.embedded:
            # FIXME: is <layer> really embedded?
            # We should get rid of any <param> tags not inside <applet>;
            # These are not really valid anyway.
            for el in list(doc.iter('param')):
                found_parent = False
                parent = el.getparent()
                while parent is not None and parent.tag not in ('applet', 'object'):
                    parent = parent.getparent()
                if parent is None:
                    el.drop_tree()
            kill_tags.update(('applet',))
            # The alternate contents that are in an iframe are a good fallback:
            remove_tags.update(('iframe', 'embed', 'layer', 'object', 'param'))
        if self.frames:
            # FIXME: ideally we should look at the frame links, but
            # generally frames don't mix properly with an HTML
            # fragment anyway.
            kill_tags.update(defs.frame_tags)
        if self.forms:
            remove_tags.add('form')
            kill_tags.update(('button', 'input', 'select', 'textarea'))
        if self.annoying_tags:
            remove_tags.update(('blink', 'marquee'))

        _remove = []
        _kill = []
        for el in doc.iter():
            if el.tag in kill_tags:
                if self.allow_element(el):
                    continue
                _kill.append(el)
            elif el.tag in remove_tags:
                if self.allow_element(el):
                    continue
                _remove.append(el)

        if _remove and _remove[0] == doc:
            # We have to drop the parent-most tag, which we can't
            # do.  Instead we'll rewrite it:
            el = _remove.pop(0)
            el.tag = 'div'
            el.attrib.clear()
        elif _kill and _kill[0] == doc:
            # We have to drop the parent-most element, which we can't
            # do.  Instead we'll clear it:
            el = _kill.pop(0)
            if el.tag != 'html':
                el.tag = 'div'
            el.clear()

        for el in _kill:
            el.drop_tree()
        for el in _remove:
            el.drop_tag()

        allow_tags = self.allow_tags
        if self.remove_unknown_tags:
            if allow_tags:
                raise ValueError(
                    "It does not make sense to pass in both allow_tags and remove_unknown_tags")
            allow_tags = set(defs.tags)
        if allow_tags:
            bad = []
            for el in doc.iter():
                if el.tag not in allow_tags:
                    bad.append(el)
            for el in bad:
                el.drop_tag()
        if self.add_nofollow:
            for el in _find_external_links(doc):
                if not self.allow_follow(el):
                    el.set('rel', 'nofollow')
Exemple #25
0
    def __call__(self, doc):
        """
        Cleans the document in place.

        ``doc`` is either an element or an object with a ``getroot()``
        method (e.g. an ElementTree), in which case its root element is
        cleaned.  What is stripped is controlled by the option attributes
        on ``self`` (``scripts``, ``javascript``, ``style``, ``comments``,
        ``links``, ``forms``, ...).  Nothing is returned; the tree itself
        is modified.

        Raises ``ValueError`` if both ``allow_tags`` and
        ``remove_unknown_tags`` are set.  Note that this check only runs
        after the earlier cleaning steps have already modified the tree.
        """
        if hasattr(doc, 'getroot'):
            # ElementTree instance, instead of an element
            doc = doc.getroot()
        # convert XHTML to HTML
        for el in doc.iter():
            tag = el.tag
            # Only string tags are rewritten; other tag values are left alone.
            if isinstance(tag, basestring):
                el.tag = _nons(tag)
        # Normalize a case that IE treats <image> like <img>, and that
        # can confuse either this step or later steps.
        for el in doc.iter('image'):
            el.tag = 'img'
        if not self.comments:
            # Of course, if we were going to kill comments anyway, we don't
            # need to worry about this
            self.kill_conditional_comments(doc)
        # kill_tags: elements dropped together with their content.
        # remove_tags: elements whose tag is dropped but content is kept.
        kill_tags = set()
        remove_tags = set(self.remove_tags or ())
        if self.scripts:
            kill_tags.add('script')
        if self.safe_attrs_only:
            safe_attrs = set(defs.safe_attrs)
            for el in doc.iter():
                attrib = el.attrib
                # Snapshot the names so deleting while looping is safe.
                for aname in list(attrib.keys()):
                    if aname not in safe_attrs:
                        del attrib[aname]
        if self.javascript:
            if not self.safe_attrs_only:
                # safe_attrs handles events attributes itself
                for el in doc.iter():
                    attrib = el.attrib
                    for aname in list(attrib.keys()):
                        if aname.startswith('on'):
                            del attrib[aname]
            doc.rewrite_links(self._remove_javascript_link,
                              resolve_base_href=False)
            if not self.style:
                # If we're deleting style then we don't have to remove JS links
                # from styles, otherwise...
                for el in _find_styled_elements(doc):
                    old = el.get('style')
                    new = _css_javascript_re.sub('', old)
                    # Chain on ``new``: restarting from ``old`` would throw
                    # away the javascript removal done just above.
                    new = _css_import_re.sub('', new)
                    if self._has_sneaky_javascript(new):
                        # Something tricky is going on...
                        del el.attrib['style']
                    elif new != old:
                        el.set('style', new)
                for el in list(doc.iter('style')):
                    if el.get('type', '').lower().strip() == 'text/javascript':
                        el.drop_tree()
                        continue
                    old = el.text or ''
                    new = _css_javascript_re.sub('', old)
                    # The imported CSS can do anything; we just can't allow:
                    new = _css_import_re.sub('', new)
                    if self._has_sneaky_javascript(new):
                        # Something tricky is going on...
                        el.text = '/* deleted */'
                    elif new != old:
                        el.text = new
        if self.comments or self.processing_instructions:
            # FIXME: why either?  I feel like there's some obscure reason
            # because you can put PIs in comments...?  But I've already
            # forgotten it
            kill_tags.add(etree.Comment)
        if self.processing_instructions:
            kill_tags.add(etree.ProcessingInstruction)
        if self.style:
            kill_tags.add('style')
            for el in _find_styled_elements(doc):
                del el.attrib['style']
        if self.links:
            kill_tags.add('link')
        elif self.style or self.javascript:
            # We must get rid of included stylesheets if Javascript is not
            # allowed, as you can put Javascript in them
            for el in list(doc.iter('link')):
                if 'stylesheet' in el.get('rel', '').lower():
                    # Note this kills alternate stylesheets as well
                    el.drop_tree()
        if self.meta:
            kill_tags.add('meta')
        if self.page_structure:
            remove_tags.update(('head', 'html', 'title'))
        if self.embedded:
            # FIXME: is <layer> really embedded?
            # We should get rid of any <param> tags not inside <applet>
            # or <object>; these are not really valid anyway.
            for el in list(doc.iter('param')):
                # Walk up until an <applet>/<object> ancestor is found or
                # the top of the tree is passed.
                parent = el.getparent()
                while parent is not None and parent.tag not in ('applet',
                                                                'object'):
                    parent = parent.getparent()
                if parent is None:
                    el.drop_tree()
            kill_tags.update(('applet', ))
            # The alternate contents that are in an iframe are a good fallback:
            remove_tags.update(('iframe', 'embed', 'layer', 'object', 'param'))
        if self.frames:
            # FIXME: ideally we should look at the frame links, but
            # generally frames don't mix properly with an HTML
            # fragment anyway.
            kill_tags.update(defs.frame_tags)
        if self.forms:
            remove_tags.add('form')
            kill_tags.update(('button', 'input', 'select', 'textarea'))
        if self.annoying_tags:
            remove_tags.update(('blink', 'marquee'))

        # Collect first, modify afterwards, so the tree is not changed
        # while it is being iterated.
        _remove = []
        _kill = []
        for el in doc.iter():
            if el.tag in kill_tags:
                if self.allow_element(el):
                    continue
                _kill.append(el)
            elif el.tag in remove_tags:
                if self.allow_element(el):
                    continue
                _remove.append(el)

        if _remove and _remove[0] == doc:
            # We have to drop the parent-most tag, which we can't
            # do.  Instead we'll rewrite it:
            el = _remove.pop(0)
            el.tag = 'div'
            el.attrib.clear()
        elif _kill and _kill[0] == doc:
            # We have to drop the parent-most element, which we can't
            # do.  Instead we'll clear it:
            el = _kill.pop(0)
            if el.tag != 'html':
                el.tag = 'div'
            el.clear()

        for el in _kill:
            el.drop_tree()
        for el in _remove:
            el.drop_tag()

        allow_tags = self.allow_tags
        if self.remove_unknown_tags:
            if allow_tags:
                raise ValueError(
                    "It does not make sense to pass in both allow_tags and remove_unknown_tags"
                )
            allow_tags = set(defs.tags)
        if allow_tags:
            # Gather the offenders before dropping them, for the same
            # iterate-then-mutate reason as above.
            bad = [el for el in doc.iter() if el.tag not in allow_tags]
            for el in bad:
                el.drop_tag()
        if self.add_nofollow:
            for el in _find_external_links(doc):
                if not self.allow_follow(el):
                    el.set('rel', 'nofollow')