Example #1
    def on_response(self, resp, req):
        ctype = resp.headers.iget('content-type')
        if not ctype:
            return

        ctype = ctype.split(";", 1)[0]

        # if this is an html page, parse it
        if ctype in HTML_CTYPES:
            body = resp.body_string()

            html = lxml.html.fromstring(body)

            # rewrite links to absolute
            html.rewrite_links(self.rewrite_link)

            # add base
            old_base = html.find(".//base")
            base = etree.Element("base")
            base.attrib['href'] = self.absolute_path

            # html.find() returns None when no <base> exists; test against None,
            # because a childless element like <base> is falsy in lxml.
            if old_base is None:
                head = html.find(".//head")
                head.append(base)

            # modify response
            rewritten_body = lxml.html.tostring(html)
            try:
                resp.headers.ipop('content-length')
            except KeyError:
                pass

            resp.headers['Content-Length'] = str(len(rewritten_body))
            resp._body = StringIO(rewritten_body)
            resp._already_read = False
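
Most examples in this collection use the HtmlElement.rewrite_links method shown above. A minimal, self-contained sketch of that API (assuming only that lxml is installed): the callback is invoked once per link found in the document, and whatever it returns replaces the link.

import lxml.html

doc = lxml.html.fromstring('<a href="/about">about</a>')
doc.rewrite_links(lambda link: 'https://example.org' + link)
print(lxml.html.tostring(doc))
# b'<a href="https://example.org/about">about</a>'
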
Example #2
    def _post_process_html(self, content):
        html = lxml.html.fromstring(content)
        if self.links:
            html.rewrite_links(self._map_cid)

            for link in html.iterlinks():
                link[0].set("target", "_blank")
        else:
            html.rewrite_links(lambda x: None)
        safe_attrs = list(defs.safe_attrs) + ["class", "style"]
        cleaner = Cleaner(
            scripts=True,
            javascript=True,
            links=True,
            page_structure=True,
            embedded=True,
            frames=True,
            add_nofollow=True,
            safe_attrs=safe_attrs
        )
        mail_text = lxml.html.tostring(
            cleaner.clean_html(html), encoding="unicode")
        with open("/tmp/output.txt", "w") as fp:
            fp.write(mail_text)
        return smart_text(mail_text)
Example #4
def show_proxy_response(_url, status_code, headers, output, start_response):
    import lxml.html
    if "content-type" in headers and u'text/html' in headers.get(
            "content-type"):
        html = lxml.html.fromstring(output)
        html.rewrite_links(RewriteLink(get_base_url(_url)))
        output = lxml.html.tostring(html)
    return show_response(status_code, headers, output, start_response)
Example #5
def html_to_lxml(url, text, clean=False):
    """Parse plain-text HTML into an `lxml` tree."""
    if clean:
        text = _text_from_sp(('pandoc', '--from=html', '--to=html5'),
                             text.encode())
    html = lxml.html.document_fromstring(text)
    # Endless loops ahoy
    html.rewrite_links(lambda s: '' if urldefrag(s).url == url else s,
                       base_href=url)
    return html
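
The guard above rewrites a document's links to itself (including in-page anchors) to an empty href so a crawler cannot loop back into the same page. The urldefrag comparison it relies on, assuming Python 3's urllib.parse (urlparse on Python 2):

from urllib.parse import urldefrag

url = 'https://example.org/page'
urldefrag('https://example.org/page#top').url == url  # True  -> rewritten to ''
urldefrag('https://example.org/other').url == url     # False -> kept as-is
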
Example #6
def rewrite_resource_paths(content, base=None):
    """Given the content and a new base reference, rewrite the source paths
    in the content.
    """
    html = lxml.html.fromstring(content)
    def repl(link):
        if link.startswith('#') or link.startswith('//'):
            return link
        if base is not None:
            return "{0}/{1}".format(base.rstrip('/'), link)
        else:
            return link
    html.rewrite_links(repl, resolve_base_href=False)
    return lxml.html.tostring(html)
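
A hypothetical call to make the docstring concrete (the URL and paths are illustrative only): relative resource paths gain the new base, while fragment-only and protocol-relative links pass through untouched.

content = '<div><img src="img/logo.png"><a href="#top">top</a></div>'
rewrite_resource_paths(content, base='https://cdn.example.com/assets')
# b'<div><img src="https://cdn.example.com/assets/img/logo.png"><a href="#top">top</a></div>'
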
Example #7
def get_translate_page(tran_page_url, user):
    htmlStr = get_html_str(tran_page_url)
    html = lxml.html.fromstring(htmlStr)

    # rewrite the links
    html.rewrite_links(change_url, base_href=tran_page_url)

    add_to_request_history(html, tran_page_url, user)
    change_script_data_main_url(html, tran_page_url)
    # update all elements
    change_all_element(html, user, tran_page_url)

    # add CSS
    add_css_js(html)
    return lxml.html.tostring(html)
Example #8
    def execute(self):
        rewrite, headers = self.rewrite_headers()
        if not headers:
            msg = "HTTP/1.1 502 Gateway Error\r\n\r\n bad request."
            self.resp.send(msg)
            return
        
        if rewrite:
            body = self.parser.body_string()
            if not body:
                rewritten_body = ''
            else:
                html = lxml.html.fromstring(body)

                # rewrite links to absolute 
                html.rewrite_links(self.rewrite_link)

                # add base
                absolute_path = "%s%s" % (self.local_base,
                        self.extra.get('path', ''))
                
                old_base = html.find(".//base")
                base = etree.Element("base")
                base.attrib['href'] = absolute_path 

                # Test against None: a childless element like <base> is falsy
                # in lxml, so a plain truthiness test would always add a second base.
                if old_base is None:
                    head = html.find(".//head")
                    head.append(base)
            
                # modify response
                rewritten_body = bytes(lxml.html.tostring(html))
            
            # finally send response.
            headers.extend([
                'Content-Length: %s\r\n' % len(rewritten_body),
                "\r\n"])
           
            self.resp.writeall(bytes("".join(headers)))
            stream = io.BytesIO(rewritten_body)
            while True:
                data = stream.read(io.DEFAULT_BUFFER_SIZE)
                if not data:
                    break
                self.resp.writeall(data)
        else:
            self.resp.writeall(bytes("".join(headers) + "\r\n"))
            body = self.parser.body_file()
            send_body(self.resp, body, self.parser.is_chunked())
Example #10
def rewrite_language_links(html, language_code):
    if language_code:
        html = rewrite_links(
            html,
            lambda lnk: LANGUAGE_LINK_RE.sub(u'/' + language_code + u'/', lnk))

    return mark_safe(html)
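
Here rewrite_links is the module-level lxml.html.rewrite_links function, which accepts an HTML string and returns the rewritten string. LANGUAGE_LINK_RE is defined elsewhere in the project; a plausible, purely illustrative definition and round trip:

import re

LANGUAGE_LINK_RE = re.compile(r'^/[a-z]{2}/')  # hypothetical pattern
LANGUAGE_LINK_RE.sub(u'/de/', u'/en/docs/intro/')  # -> u'/de/docs/intro/'
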
Example #12
def custom_template_preview_render():
    body, _ = EmailTemplate.make_sample(
        from_name=request.args.get('from_name'),
        subject=request.args.get('subject'),
        style=request.args.get('style'),
        body=request.args.get('body'),
    )

    return rewrite_links(body, lambda x: "#" + x)
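
Prefixing every link with "#" turns it into a dead in-page anchor, so the preview renders with navigation disabled. A sketch of the transform, assuming body is an HTML string:

from lxml.html import rewrite_links

rewrite_links('<a href="https://example.org">x</a>', lambda x: '#' + x)
# roughly: '<a href="#https://example.org">x</a>'
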
Example #14
    def viewmail_html(self, content, **kwargs):
        import lxml.html

        if content is None or content == "":
            return ""
        links = kwargs.get("links", 0)
        html = lxml.html.fromstring(content)
        if not links:
            html.rewrite_links(lambda x: None)
        else:
            html.rewrite_links(self.map_cid)
        body = html.find("body")
        if body is None:
            body = lxml.html.tostring(html)
        else:
            body = lxml.html.tostring(body)
            body = re.sub("<(/?)body", lambda m: "<%sdiv" % m.group(1), body)
        return body
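
Returning None from a link-replacement callback tells lxml to remove the link entirely (the attribute, or the text for links that appear in tag content, is dropped), which is why lambda x: None works as a strip-all-links switch:

import lxml.html

doc = lxml.html.fromstring('<div><a href="https://example.org">x</a></div>')
doc.rewrite_links(lambda x: None)
lxml.html.tostring(doc)  # b'<div><a>x</a></div>' -- the href is gone
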
Example #16
def custom_template_preview_render():
    data = request.form
    body, _ = EmailTemplate.make_preview(
        from_name=data.get("from_name"),
        subject=data.get("subject"),
        style=data.get("style"),
        body=data.get("body"),
    )

    return rewrite_links(body, lambda x: "#" + x)
Example #17
    def _post_process_html(self):
        html = lxml.html.fromstring(self.contents["html"])

        if self.links:
            html.rewrite_links(self._map_cid)

            for link in html.iterlinks():
                link[0].set("target", "_blank")
        else:
            html.rewrite_links(lambda x: None)

        cleaner = Cleaner(scripts=True,
                          javascript=True,
                          links=True,
                          page_structure=True,
                          embedded=True,
                          frames=True,
                          add_nofollow=True)
        mail_text = lxml.html.tostring(cleaner.clean_html(html))
        self.contents["html"] = smart_text(mail_text)
def adapt_content(content, server_name):
    decoded = unicode(content, 'UTF-8')
    html = lxml.html.fromstring(decoded)

    del content
    del decoded

    catID = extract_catID(html)
    titles = extract_titles(html)
    if catID and titles:
        rel_searches = perform_rel_search(catID, titles)
        
        product_details = html.xpath('//li[@class = "rev"]/div[@class = "product_details_content"]/..')

        for pos, element in enumerate(product_details):
            rel_search_phrases = rel_search_path(rel_searches, id=pos)
            if rel_search_phrases:
                rel_search_div = lxml.html.Element('div')
                rel_search_div.attrib['style'] = 'margin-right: 50px; background-color: rgb(100, 230, 50); font-size: 16px; z-index: 100'
                for phrase in rel_search_phrases:
                    search_link = lxml.html.Element('a')
                    href = '/classify?search_box=1&cat_id=%s&sfsk=0&keyword=%s' %(catID, quote_plus(phrase))
                    search_link.attrib['href'] = href
                    search_link.attrib['style'] = 'z-index: 100'
                    search_link.text = phrase
                    rel_search_div.append(search_link)
                    spacer = lxml.html.Element('span')
                    spacer.text = ' '
                    rel_search_div.append(spacer)

                # We could find the element properly, but I know it's at 5 and this is a hack :S
                element.insert(5, rel_search_div)

    def rewrite_link(link):
        if link.startswith('http://www.bizrate.co.uk'):
            return link[len('http://www.bizrate.co.uk'):]
        else:
            return link

    html.rewrite_links(rewrite_link)
    return lxml.html.tostring(html, encoding="UTF-8")
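
The rewrite_link helper above makes bizrate.co.uk links host-relative by stripping the absolute prefix, leaving all other links untouched; for example (values illustrative):

rewrite_link('http://www.bizrate.co.uk/products?id=1')  # -> '/products?id=1'
rewrite_link('http://other.example.com/x')              # -> unchanged
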
Example #19
    def _post_process_html(self, content):
        html = lxml.html.fromstring(content)

        if self.links:
            html.rewrite_links(self._map_cid)

            for link in html.iterlinks():
                link[0].set("target", "_blank")
        else:
            html.rewrite_links(lambda x: None)

        cleaner = Cleaner(
            scripts=True,
            javascript=True,
            links=True,
            page_structure=True,
            embedded=True,
            frames=True,
            add_nofollow=True)
        mail_text = lxml.html.tostring(cleaner.clean_html(html))
        return smart_text(mail_text)
Example #20
    def get_html_base(self):
        """
        Gets the HTML associated with the current child task
        Input: None
        Output: Child task HTML
        """
        self.update_task_states()
        html = self.current_task.get_html(self.system)
        return_html = html
        try:
            # Without the try/except block, we get this error:
            #   File "/home/vik/mitx_all/mitx/common/lib/xmodule/xmodule/x_module.py", line 263, in rewrite_content_links
            #   if link.startswith(XASSET_SRCREF_PREFIX):
            # Keeping the try/except means this code starts working again once the error is fixed.
            return_html = rewrite_links(html, self.rewrite_content_links)
        except Exception:
            pass
        return return_html
Example #22
def apply_markup_filter(text):
    """Applies a text-to-HTML conversion function to a piece of text and
    returns the generated HTML.

    The function to use is derived from the value of the setting
    ``MARKUP_FILTER``, which should be a 2-tuple:

        * The first element should be the name of a markup filter --
          e.g., "markdown" -- to apply. If no markup filter is desired,
          set this to None.

        * The second element should be a dictionary of keyword
          arguments which will be passed to the markup function. If no
          extra arguments are desired, set this to an empty
          dictionary; some arguments may still be inferred as needed,
          however.

    So, for example, to use Markdown with safe mode turned on (safe
    mode removes raw HTML), put this in your settings file::

        MARKUP_FILTER = ('markdown', { 'safe_mode': 'escape' })

    Currently supports Textile, Markdown and reStructuredText, using
    names identical to the template filters found in
    ``django.contrib.markup``.

    Borrowed from http://djangosnippets.org/snippets/104/
    """
    markup_filter_name, markup_kwargs = get_markup_filter()

    if not text.strip():
        return text

    html = text

    if markup_filter_name is not None:
        if markup_filter_name == 'textile':
            import textile
            if 'encoding' not in markup_kwargs:
                markup_kwargs.update(encoding=settings.DEFAULT_CHARSET)
            if 'output' not in markup_kwargs:
                markup_kwargs.update(output=settings.DEFAULT_CHARSET)

            html = textile.textile(text, **markup_kwargs)

        elif markup_filter_name == 'markdown':
            import markdown
            html = markdown.markdown(text, **markup_kwargs)

        elif markup_filter_name == 'restructuredtext':
            from docutils import core
            if 'settings_overrides' not in markup_kwargs:
                markup_kwargs.update(
                    settings_overrides=getattr(
                        settings,
                        "RESTRUCTUREDTEXT_FILTER_SETTINGS",
                        {},
                    )
                )
            if 'writer_name' not in markup_kwargs:
                markup_kwargs.update(writer_name='html4css1')

            parts = core.publish_parts(source=text, **markup_kwargs)
            html = parts['html_body']

    return rewrite_links(html, rewrite_internal_link)
Example #23
def all_req(environ, start_response):
    path_url = environ['PATH_INFO']
    assert path_url.startswith("/")
    path_url = path_url[1:]
    method = environ.get('REQUEST_METHOD').upper()

    if not (path_url.startswith(u"http://")
            or path_url.startswith(u"https://")):
        path_url = u"http://" + unicode(path_url)
    if path_url != u'http://favicon.ico':
        setattr(all_req, LAST_REQ_BASE_URL, get_base_url(path_url))
    else:
        path_url = getattr(all_req, LAST_REQ_BASE_URL, "") + "/favicon.ico"

    req_query_string = environ.get("QUERY_STRING", "")
    try:
        # read the request body data
        req_data = environ['wsgi.input'].read(
            int(environ.get('CONTENT_LENGTH', '0')))
    except:
        req_data = None

    requestpool_headers = {}
    req_headers = {}
    for key, val in environ.iteritems():
        if key.startswith('HTTP_'):
            # build req_headers (nothing more needed for now)
            header_name = key[5:].replace('_', '-')
            if header_name == 'HOST':
                continue
            # disable caching
            if "CACHE-CONTROL" in header_name:
                continue
            elif "IF-MODIFIED-SINCE" in header_name:
                continue
            # disable connection reuse
            if "CONNECTION" in header_name:
                continue
            if 'REQUESTSPOOL.' in header_name:
                requestpool_headers[header_name] = val
            else:
                req_headers[header_name] = val

    status_code, headers, output = get_http_result(
        url=path_url,
        method=method,
        req_query_string=req_query_string,
        req_data=req_data,
        req_headers=req_headers)

    if "content-type" in headers and u'text/html' in headers.get(
            "content-type"):
        html = lxml.html.fromstring(output)

        html.rewrite_links(RewriteLink(get_base_url(path_url)))
        output = lxml.html.tostring(html)

    start_response(
        "{0} {1}".format(status_code, responses.get(status_code, 'OK')),
        headers.items())
    return (output, )
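
RewriteLink and get_base_url are project helpers not shown in these snippets; rewrite_links accepts any callable, so a hypothetical stand-in that resolves each link against the proxied page might look like:

from urllib.parse import urljoin  # urlparse.urljoin on Python 2

class RewriteLink(object):  # hypothetical; the real class is defined elsewhere
    def __init__(self, base_url):
        self.base_url = base_url

    def __call__(self, link):
        # Resolve relative links against the proxied page's base URL.
        return urljoin(self.base_url, link)
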
Example #24
    def get_load_page(self, request_info, url, **kwargs):
        """
        Proxy a web-page through so that a UI can be displayed for showing potential results.
        """

        web_client = None

        try:

            # --------------------------------------
            # 1: Make sure that user has permission to make inputs. We don't want to allow people
            #    to use this as a general proxy.
            # --------------------------------------
            # Reject the request only when the user has neither capability.
            if not (WebInputOperationsHandler.hasCapability(
                    'edit_modinput_web_input') or
                    WebInputOperationsHandler.hasCapability(
                        'admin_all_objects')):
                return self.render_error_html(
                    'You need the "edit_modinput_web_input" capability ' +
                    'to make website inputs', 403)

            # Don't allow proxying of the javascript files
            if url.endswith(".js"):
                return {
                    'payload': '',
                    'status': 200,
                    'headers': {
                        'Content-Type': 'application/javascript'
                    },
                }

            # --------------------------------------
            # 2: Only allow HTTPS if the install is on Splunk Cloud
            # --------------------------------------
            if ModularInput.is_on_cloud(request_info.session_key) \
                    and not url.startswith("https://"):
                return self.render_error_html(
                    'URLs on Splunk Cloud must use HTTPS protocol',
                    401)  # TODO: determine best code

            # --------------------------------------
            # 3: Perform a request for the page
            # --------------------------------------

            # Get the proxy configuration
            conf_stanza = "default"

            try:
                web_input = WebInput(timeout=10)

                proxy_type, proxy_server, proxy_port, proxy_user, proxy_password = \
                web_input.get_proxy_config(request_info.session_key, conf_stanza)

            except ResourceNotFound:
                return self.render_error_html(
                    "Proxy server information could not be obtained", 202)

            # Get the timeout to use
            timeout = None

            if 'timeout' in kwargs:
                try:
                    timeout = int(kwargs['timeout'])
                except ValueError:
                    timeout = 15
            else:
                timeout = 15

            # Get the user-agent
            user_agent = kwargs.get('user_agent', None)

            # Get the information on the browser to use
            browser = None

            if 'browser' in kwargs:
                browser = kwargs['browser']

            # Make the client
            if browser is None or browser == WebScraper.INTEGRATED_CLIENT:
                web_client = DefaultWebClient(timeout, user_agent, logger)
            elif browser == WebScraper.FIREFOX:
                web_client = FirefoxClient(timeout, user_agent, logger)
            elif browser == WebScraper.CHROME:
                web_client = ChromeClient(timeout, user_agent, logger)

            web_client.setProxy(proxy_type, proxy_server, proxy_port,
                                proxy_user, proxy_password)

            # Get the username and password
            username = kwargs.get('username', None)
            password = kwargs.get('password', None)

            username_field = kwargs.get('username_field', None)
            password_field = kwargs.get('password_field', None)
            authentication_url = kwargs.get('authentication_url', None)

            if username is not None and password is not None:
                username = kwargs['username']
                password = kwargs['password']

                username_field = kwargs.get('username_field', None)
                password_field = kwargs.get('password_field', None)
                authentication_url = kwargs.get('authentication_url', None)

                web_client.setCredentials(username, password)

                if authentication_url is not None:
                    logger.debug(
                        "Authenticating using form login in scrape_page")
                    web_client.doFormLogin(authentication_url, username_field,
                                           password_field)

            # Get the page
            try:
                content = web_client.get_url(url, 'GET')
                response = web_client.get_response_headers()
            except:
                logger.exception(
                    "Exception generated while attempting to content for url=%s",
                    url)
                return self.render_error_html(
                    "Page preview could not be obtained using a web-browser",
                    500)

            # --------------------------------------
            # 4: Render the content with the browser if necessary
            # --------------------------------------
            """
            if 'text/html' in response['content-type']:

                # Get the information on the browser to use
                browser = None

                if 'browser' in kwargs:
                    browser = kwargs['browser']

                # Try rendering the content using a web-browser
                try:
                    if browser is not None and browser != WebScraper.INTEGRATED_CLIENT:
                        
                        web_scraper = WebScraper(timeout=timeout)
                        web_scraper.set_proxy(proxy_type, proxy_server, proxy_port, proxy_user, proxy_password)
                        web_scraper.set_authentication(username, password)
                        content = web_scraper.get_result_browser(urlparse(url), browser)

                except:
                    logger.exception("Exception generated while attempting to get browser rendering or url=%s", url)

                    cherrypy.response.status = 500
                    return self.render_error_html("Page preview could not be obtained using a web-browser")
            """

            # --------------------------------------
            # 5: Rewrite the links in HTML files so that they also point to the internal proxy
            # --------------------------------------
            if "<html" in content:

                # Parse the content
                html = lxml.html.document_fromstring(content)

                # Rewrite the links to point to this internal proxy
                rewrite_using_internal_proxy = True

                if rewrite_using_internal_proxy:

                    def relocate_href(link):
                        """
                        Change the hrefs such that they go through the proxy.
                        """

                        link = urljoin(url, link)

                        if link.endswith(".js"):
                            return ""
                        if not link.endswith(".css"):
                            return "load_page?url=" + link
                        else:
                            return link

                    html.rewrite_links(relocate_href)

                    # Block the href links
                    for element, attribute, _, _ in html.iterlinks():
                        if element.tag == "a" and attribute == "href":
                            element.set('href', "#")

                        elif element.tag == "form" and attribute == "action":
                            element.set('action', "?")
                else:
                    html.make_links_absolute(url)

                # Determine if we should clean the JS
                clean_script = True

                if 'clean_script' in kwargs:
                    clean_script = util.normalizeBoolean(
                        kwargs['clean_script'])

                # Determine if we should clean the CSS
                clean_styles = False

                if 'clean_styles' in kwargs:
                    clean_styles = util.normalizeBoolean(
                        kwargs['clean_styles'])

                # Clean up the HTML
                if clean_styles or clean_script:

                    kill_tags = []

                    if clean_script:
                        kill_tags = ["script"]

                    # Remove the script blocks
                    cleaner = Cleaner(page_structure=False,
                                      kill_tags=kill_tags,
                                      javascript=False,
                                      links=False,
                                      style=clean_styles,
                                      safe_attrs_only=False)

                    # Get the content
                    content = lxml.html.tostring(cleaner.clean_html(html),
                                                 encoding="unicode")

                else:
                    content = lxml.html.tostring(html, encoding="unicode")

            # --------------------------------------
            # 6: Respond with the results
            # --------------------------------------
            headers = {}

            if 'content-type' in response:
                headers['Content-Type'] = response['content-type']
            else:
                headers['Content-Type'] = 'text/html'

            # --------------------------------------
            # 7: Clear Javascript files
            # --------------------------------------
            if response.get('content-type', "") == "application/javascript" \
               or response.get('content-type', "") == "application/x-javascript" \
               or response.get('content-type', "") == "text/javascript" \
               or url.endswith(".js"):

                return {'payload': '', 'headers': headers, 'status': 200}

            return {'payload': content, 'headers': headers, 'status': 200}

        except LoginFormNotFound:
            logger.debug("Login form not found")
            return self.render_error_html("Login form was not found", 200)

        except FormAuthenticationFailed as e:
            logger.debug("Form authentication failed: " + str(e))
            return self.render_error_html(
                "Form authentication failed: " + str(e), 200)

        except:
            logger.exception("Error when attempting to proxy an HTTP request")
            return self.render_error_html("Page preview could not be created",
                                          500)

        finally:
            if web_client:
                web_client.close()
Example #27
    def handleAdd(self, action):
        data, errors = self.extractData()
        if errors:
            self.status = self.formErrorsMessage
            return

        # 1. Find or create folder where selected items will be archived.
        folder_id = "c"
        folder_title = "{} Converted".format(self.context.Title())
        folder = self.context.get(folder_id)
        if folder is None:
            folder = self.context[self.context.invokeFactory(
                'Folder', folder_id, title=folder_title)]

        # 2. Find original objects.
        adapter = IAdapter(self.context)
        paths = self.paths.split(
            '\r\n') if '\r\n' in self.paths else self.paths.split('\n')
        objs = self.context.getObjectsFromPathList(paths)
        form = self.request.form

        # Data from form
        title = form.get('form.widgets.IBasic.title')
        description = form.get('form.widgets.IBasic.description')
        fpaivays = None
        if form.get('form.widgets.paivays-year') and form.get(
                'form.widgets.paivays-day'):
            fpaivays = datetime(int(form.get('form.widgets.paivays-year')),
                                int(form.get('form.widgets.paivays-month')),
                                int(form.get('form.widgets.paivays-day')))
        text = form.get('form.widgets.text')

        omits = [
            'form.widgets.IBasic.title', 'form.widgets.IBasic.description',
            'form.widgets.paivays-year', 'form.widgets.paivays-month',
            'form.widgets.paivays-day', 'form.widgets.text',
            'form.widgets.text.mimeType', 'form.buttons.convert',
            'form.widgets.paivays-empty-marker',
            'form.widgets.paivays-calendar',
            'form.widgets.IVersionable.changeNote', 'form.widgets.paths'
        ]

        # file
        file_field = self.file_field()
        fname = 'form.widgets.{}'.format(file_field.getName())
        cfile = form.get(fname)
        omits.append(fname)
        if cfile:
            cfile.seek(0)
            file_data = cfile.read()
            file_name = cfile.filename
            cfile.close()

        # image
        image_field = self.image_field()
        iname = 'form.widgets.{}'.format(image_field.getName())
        cimage = form.get(iname)
        omits.append(iname)
        if cimage:
            cimage.seek(0)
            image_data = cimage.read()
            image_name = cimage.filename
            cimage.close()

        keys = [
            key for key in form.keys()
            if key not in omits and key.startswith('form.widgets.')
            and not key.endswith('empty-marker')
        ]
        data = {}
        for key in keys:
            val = form.get(key)
            if val:
                if isinstance(val, list):
                    val = [va.decode('unicode_escape') for va in val]
                data[key.split('.')[2]] = val

        object_ids = []
        # 3. Select values and create archive.
        for obj in objs:
            data = data.copy()
            data['title'] = safe_unicode(obj.Title()) or title
            data['description'] = safe_unicode(
                obj.Description()) or description
            uuid = obj.UID()
            brain = adapter.get_brain(UID=uuid)
            paivays = fpaivays
            if brain.review_state == 'published':
                paivays = brain.effective
            if paivays is None:
                paivays = brain.created
            if not isinstance(paivays, datetime):
                paivays = paivays.asdatetime().replace(tzinfo=None)
            data['paivays'] = paivays
            if obj.getField('text') is not None:
                text = self._strip_dev(
                    rewrite_links(safe_unicode(obj.getField('text').get(obj)),
                                  link_repl_func)) or text
            if text:
                data['text'] = text

            content = createContentInContainer(folder,
                                               'archive',
                                               checkConstraints=False,
                                               **data)

            # file
            filedata = None
            contentType = ''
            ofile = obj.getField('file', obj)
            if ofile:
                file_obj = ofile.get(obj)
                if file_obj and file_obj.get_size():
                    filedata = file_obj.data
                    filename = file_obj.filename or data['title']
                    contentType = file_obj.getContentType()

            if filedata is None and cfile:
                filedata = file_data
                filename = file_name

            if filedata is not None:
                setattr(
                    content, file_field.getName(),
                    NamedBlobFile(data=filedata,
                                  filename=safe_unicode(filename),
                                  contentType=contentType))

            # image
            imagedata = None
            contentType = ''
            oimage = obj.getField('image', obj)
            if oimage:
                image_obj = oimage.get(obj)
                if image_obj and image_obj.get_size():
                    imagedata = image_obj.data if not isinstance(
                        image_obj.data, Pdata) else image_obj.data.data
                    imagename = safe_unicode(
                        image_obj.filename) or data['title']
                    contentType = image_obj.getContentType()

            if imagedata is None and cimage:
                imagedata = image_data
                imagename = image_name

            if imagedata is not None:
                setattr(
                    content, image_field.getName(),
                    NamedBlobImage(data=imagedata,
                                   filename=safe_unicode(imagename),
                                   contentType=contentType))

            alsoProvides(content, IArchive)
            modified(content)

            object_ids.append(obj.id)
        # Remove the original object
        self.context.manage_delObjects(object_ids)

        message = _(
            u"add_converted_archives_success",
            default=
            u"${number} converted archive(s) are added to folder: ${title}",
            mapping={
                'number': len(objs),
                'title': safe_unicode(folder_title)
            })
        IStatusMessage(self.request).addStatusMessage(message, type='info')

        url = '{}/folder_contents'.format(self.context.absolute_url())
        return self.request.response.redirect(url)
Example #28
        relpath = os.path.relpath(args.path, root)
        newlink = '%s%s' % (relpath, link)
    else:
        newlink = link

    if args.vverbose:
        print '(abs2rel) old link: %s' % link
        print '(abs2rel) new link: %s' % newlink
        print

    return newlink


if args.verbose:
    print 'Replacing absolute links with relative links'

for root, dirs, files in os.walk(args.path):
    for file in files:
        if file.find(args.suffix) != -1:
            page = open(os.path.join(root, file)).read()

            if args.verbose:
                print 'file: %s/%s' % (root, file)

            html = lxml.html.fromstring(page)
            html.rewrite_links(abs2rel)

            # Write the updated links back to the file
            with open(os.path.join(root, file), 'w') as f:
                f.write(lxml.html.tostring(html))
Example #29
    def load_page(self, url, **kwargs):
        """
        Proxy a web-page through so that a UI can be displayed for showing potential results.
        """

        web_client = None

        try:

            # --------------------------------------
            # 1: Make sure that user has permission to make inputs. We don't want to allow people
            #    to use this as a general proxy.
            # --------------------------------------
            if not WebInputController.hasCapability('edit_modinput_web_input'):
                return self.render_error_html('You need the "edit_modinput_web_input" capability ' +
                                              'to make website inputs')

            # Don't allow proxying of the javascript files
            if url.endswith(".js"):
                cherrypy.response.headers['Content-Type'] = 'application/javascript'
                return ""

            # --------------------------------------
            # 2: Only allow HTTPS if the install is on Splunk Cloud
            # --------------------------------------
            if ModularInput.is_on_cloud(cherrypy.session.get('sessionKey')) and not url.startswith("https://"):
                return self.render_error_html('URLs on Splunk Cloud must use HTTPS protocol')

            # --------------------------------------
            # 3: Perform a request for the page
            # --------------------------------------

            # Get the proxy configuration
            conf_stanza = "default"

            try:
                web_input = WebInput(timeout=10)

                proxy_type, proxy_server, proxy_port, proxy_user, proxy_password = \
                web_input.get_proxy_config(cherrypy.session.get('sessionKey'), conf_stanza)

            except splunk.ResourceNotFound:
                cherrypy.response.status = 202
                return self.render_error_html("Proxy server information could not be obtained")

            # Get the timeout to use
            timeout = None

            if 'timeout' in kwargs:
                try:
                    timeout = int(kwargs['timeout'])
                except ValueError:
                    timeout = 15
            else:
                timeout = 15

            # Get the user-agent
            user_agent = kwargs.get('user_agent', None)

            # Get the information on the browser to use
            browser = None

            if 'browser' in kwargs:
                browser = kwargs['browser']

            # Make the client
            if browser is None or browser == WebScraper.INTEGRATED_CLIENT:
                web_client = DefaultWebClient(timeout, user_agent, logger)
            elif browser == WebScraper.FIREFOX:
                web_client = FirefoxClient(timeout, user_agent, logger)
            elif browser == WebScraper.CHROME:
                web_client = ChromeClient(timeout, user_agent, logger)
            
            web_client.setProxy(proxy_type, proxy_server, proxy_port, proxy_user, proxy_password)

            # Get the username and password
            username = kwargs.get('username', None)
            password = kwargs.get('password', None)

            username_field = kwargs.get('username_field', None)
            password_field = kwargs.get('password_field', None)
            authentication_url = kwargs.get('authentication_url', None)

            if username is not None and password is not None:
                username = kwargs['username']
                password = kwargs['password']

                username_field = kwargs.get('username_field', None)
                password_field = kwargs.get('password_field', None)
                authentication_url = kwargs.get('authentication_url', None)

                web_client.setCredentials(username, password)

                if authentication_url is not None:
                    logger.debug("Authenticating using form login in scrape_page")
                    web_client.doFormLogin(authentication_url, username_field, password_field)

            # Get the page
            try:
                content = web_client.get_url(url, 'GET')
                response = web_client.get_response_headers()
            except:
                logger.exception("Exception generated while attempting to content for url=%s", url)

                cherrypy.response.status = 500
                return self.render_error_html("Page preview could not be created using a web-browser")

            # --------------------------------------
            # 4: Render the content with the browser if necessary
            # --------------------------------------
            """
            if 'text/html' in response['content-type']:

                # Get the information on the browser to use
                browser = None

                if 'browser' in kwargs:
                    browser = kwargs['browser']

                # Try rendering the content using a web-browser
                try:
                    if browser is not None and browser != WebScraper.INTEGRATED_CLIENT:
                        
                        web_scraper = WebScraper(timeout=timeout)
                        web_scraper.set_proxy(proxy_type, proxy_server, proxy_port, proxy_user, proxy_password)
                        web_scraper.set_authentication(username, password)
                        content = web_scraper.get_result_browser(urlparse.urlparse(url), browser)

                except:
                    logger.exception("Exception generated while attempting to get browser rendering or url=%s", url)

                    cherrypy.response.status = 500
                    return self.render_error_html("Page preview could not be created using a web-browser")
            """

            # --------------------------------------
            # 5: Rewrite the links in HTML files so that they also point to the internal proxy
            # --------------------------------------
            if "<html" in content:

                # Parse the content
                html = lxml.html.document_fromstring(content)

                # Rewrite the links to point to this internal proxy
                rewrite_using_internal_proxy = True

                if rewrite_using_internal_proxy:

                    def relocate_href(link):
                        """
                        Change the hrefs such that they go through the proxy.
                        """

                        link = urlparse.urljoin(url, link)

                        if link.endswith(".js"):
                            return ""
                        if not link.endswith(".css"):
                            return "load_page?url=" + link
                        else:
                            return link

                    html.rewrite_links(relocate_href)

                    # Block the href links
                    for element, attribute, _, _ in html.iterlinks():
                        if element.tag == "a" and attribute == "href":
                            element.set('href', "#")

                        elif element.tag == "form" and attribute == "action":
                            element.set('action', "?")
                else:
                    html.make_links_absolute(url)

                # Determine if we should clean the JS
                clean_script = True

                if 'clean_script' in kwargs:
                    clean_script = util.normalizeBoolean(kwargs['clean_script'])

                # Determine if we should clean the CSS
                clean_styles = False

                if 'clean_styles' in kwargs:
                    clean_styles = util.normalizeBoolean(kwargs['clean_styles'])

                # Clean up the HTML
                if clean_styles or clean_script:

                    kill_tags = []

                    if clean_script:
                        kill_tags = ["script"]

                    # Remove the script blocks
                    cleaner = Cleaner(page_structure=False, kill_tags=kill_tags, javascript=False,
                                      links=False, style=clean_styles, safe_attrs_only=False)

                    # Get the content
                    content = lxml.html.tostring(cleaner.clean_html(html))

                else:
                    content = lxml.html.tostring(html)

            # --------------------------------------
            # 6: Respond with the results
            # --------------------------------------
            if 'content-type' in response:
                cherrypy.response.headers['Content-Type'] = response['content-type']
            else:
                cherrypy.response.headers['Content-Type'] = 'text/html'

            # --------------------------------------
            # 7: Clear Javascript files
            # --------------------------------------
            if response.get('content-type', "") == "application/javascript" \
               or response.get('content-type', "") == "application/x-javascript" \
               or response.get('content-type', "") == "text/javascript" \
               or url.endswith(".js"):

                return ""

            return content

        except LoginFormNotFound:
            logger.debug("Login form not found")
            return self.render_error_html("Login form was not found")

        except FormAuthenticationFailed as e:
            logger.debug("Form authentication failed: " + str(e))
            return self.render_error_html("Form authentication failed: " + str(e))

        except:
            logger.exception("Error when attempting to proxy an HTTP request")
            cherrypy.response.status = 500
            return self.render_error_html("Page preview could not be created")

        finally:
            if web_client:
                web_client.close()
Example #30
def scrub(site):
    """
    Given root, find content with HTML body, look for bad links or other
    errors.

    """

    searcher = ICatalogSearch(site)
    total, docids, resolver = searcher(interfaces=[IContent], )

    log.info("Found a total of %d documents", total)
    for docid in docids:
        doc = resolver(docid)

        if not hasattr(doc, 'text'):
            continue

        path = model_path(doc)
        log.debug("Checking %s", path)

        text = doc.text

        if not text:
            # Some types we're expecting not to have text, so don't warn
            # about those
            if not (ICommunity.providedBy(doc)
                    or ICalendarEvent.providedBy(doc)):
                log.warn("No text: %s %s", type(doc), path)
            continue

        try:
            try:
                # Will throw ParserError if fragment doesn't have a single
                # root element.
                html = fragment_fromstring(doc.text)
            except ParserError:
                # Wrap in a single div to make the parser happy
                html = fragment_fromstring('<div>%s</div>' % doc.text)

        except XMLSyntaxError:
            log.error("Unparseable: %s", path, exc_info=True)
            continue  # html was never assigned, so skip this document

        # Check and fix links
        def callback(link):
            fixed = _rewrite_link(site, path, link)
            if fixed != link:
                log.info("Link rewritten at %s", path)
                log.info("Old link: %s", link)
                log.info("New link: %s", fixed)

            if not isinstance(fixed, unicode):
                fixed = unicode(fixed, 'utf-8')

            return fixed

        html.rewrite_links(callback)

        # Need to also change any instances of the 'mce_href' attribute to
        # match newly rewritten 'href' attribute.
        for element in html.getiterator():
            if 'mce_href' in element.keys():
                element.set('mce_href', element.get('href'))

        doc.text = unicode(lxml.html.tostring(html, 'utf-8'), 'utf-8')

    log.info("Done.")
    log.info("Unknown schemes: %s", ', '.join(unknown_schemes))