Example #1
0
 def download(self, url, outfd=None):
     """
     Download a given URL using current cookies.

     @param url: URL or path to download
     @param outfd: Output file-like stream. If None, the data is collected
                   in memory and returned as a string.
     @return: If C{outfd} was given, the number of bytes downloaded (None
              if the reply finished with an error). Otherwise, the
              downloaded data as a string.
     @raise SpynnerError: If the reply reports an error right after the
                          request has been issued.
     @note: If url is a path, the current base URL will be pre-appended.
     """
     def _on_reply(reply):
         # Any non-None value ends the wait loop below.
         self._download_reply_status = not bool(reply.error())

     self._download_reply_status = None
     if not urlparse.urlsplit(url).scheme:
         url = urlparse.urljoin(self.url, url)
     request = QNetworkRequest(QUrl(url))
     # Create a new manager to process this download
     manager = QNetworkAccessManager()
     # The cookie jar has to be installed *before* the request is issued;
     # setting it afterwards leaves the request without the current cookies.
     cookiejar = self.manager.cookieJar()
     cookiejar_parent = cookiejar.parent()
     manager.setCookieJar(cookiejar)
     # setCookieJar() re-parents the jar to the temporary manager. Hand it
     # back to its previous parent so the shared jar is not destroyed
     # together with this short-lived manager.
     cookiejar.setParent(cookiejar_parent)
     manager.connect(manager, SIGNAL('finished(QNetworkReply *)'), _on_reply)
     reply = manager.get(request)
     if reply.error():
         raise SpynnerError("Download error: %s" % reply.errorString())
     reply.downloaded_nbytes = 0
     # Compare against None: a file-like object may legitimately be falsy.
     outfd_set = outfd is not None
     if not outfd_set:
         outfd = StringIO()
     self._start_download(reply, outfd)
     # Pump the events loop until the 'finished' callback reports a status.
     while self._download_reply_status is None:
         self._events_loop()
     if outfd_set:
         return (reply.downloaded_nbytes if not reply.error() else None)
     else:
         return outfd.getvalue()
Example #2
0
    def download(self, url, outfd=None):
        """
        Download a given URL using current cookies.

        @param url: URL or path to download
        @param outfd: Output file-like stream. If None, the data is collected
                      in memory and returned as a string.
        @return: If C{outfd} was given, the number of bytes downloaded (None
                 if the reply finished with an error). Otherwise, the
                 downloaded data as a string.
        @raise SpynnerError: If the reply reports an error right after the
                             request has been issued.
        @note: If url is a path, the current base URL will be pre-appended.
        """
        def _on_reply(reply):
            # Any non-None value ends the wait loop below.
            self._download_reply_status = not bool(reply.error())

        self._download_reply_status = None
        if not urlparse.urlsplit(url).scheme:
            url = urlparse.urljoin(self.url, url)
        request = QNetworkRequest(QUrl(url))
        # Create a new manager to process this download
        manager = QNetworkAccessManager()
        # The cookie jar has to be installed *before* the request is issued;
        # setting it afterwards leaves the request without the current
        # cookies.
        cookiejar = self.manager.cookieJar()
        cookiejar_parent = cookiejar.parent()
        manager.setCookieJar(cookiejar)
        # setCookieJar() re-parents the jar to the temporary manager. Hand it
        # back to its previous parent so the shared jar is not destroyed
        # together with this short-lived manager.
        cookiejar.setParent(cookiejar_parent)
        manager.connect(manager, SIGNAL('finished(QNetworkReply *)'),
                        _on_reply)
        reply = manager.get(request)
        if reply.error():
            raise SpynnerError("Download error: %s" % reply.errorString())
        reply.downloaded_nbytes = 0
        # Compare against None: a file-like object may legitimately be falsy.
        outfd_set = outfd is not None
        if not outfd_set:
            outfd = StringIO()
        self._start_download(reply, outfd)
        # Pump the events loop until the 'finished' callback reports a status.
        while self._download_reply_status is None:
            self._events_loop()
        if outfd_set:
            return (reply.downloaded_nbytes if not reply.error() else None)
        else:
            return outfd.getvalue()
Example #3
0
class Browser:
    """
    Stateful programmatic web browser class based upon QtWebKit.

    >>> browser = Browser()
    >>> browser.load("http://www.wordreference.com")
    >>> browser.runjs("console.log('I can run Javascript!')")
    >>> browser.runjs("_jQuery('div').css('border', 'solid red')") # and jQuery!
    >>> browser.select("#esen")
    >>> browser.fill("input[name=enit]", "hola")
    >>> browser.click("input[name=b]", wait_load=True)
    >>> print browser.url, len(browser.html)
    >>> browser.close()
    """
    ignore_ssl_errors = True
    """@ivar: If True, ignore SSL certificate errors."""
    user_agent = None
    """@ivar: User agent for requests (see QWebPage::userAgentForUrl for details)"""
    jslib = "_jQuery"
    """@ivar: Library name for jQuery library injected by default to pages."""
    download_directory = "."
    """@ivar: Directory where downloaded files will be stored."""
    debug_stream = sys.stderr
    """@ivar: File-like stream where debug output will be written."""
    debug_level = ERROR
    """@ivar: Debug verbose level (L{ERROR}, L{WARNING}, L{INFO} or L{DEBUG})."""
    event_looptime = 0.01
    """@ivar: Event loop dispatcher loop delay (seconds)."""

    # Javascript files injected into every successfully loaded page; they
    # are concatenated in this order.
    _javascript_files = ["jquery.min.js", "jquery.simulate.js"]

    # Candidate locations for the files above; the first existing directory
    # is used.
    _javascript_directories = [
        os.path.join(os.path.dirname(__file__), "../javascript"),
        os.path.join(sys.prefix, "share/spynner/javascript"),
    ]

    def __init__(self, qappargs=None, debug_level=None):
        """
        Init a Browser instance.

        @param qappargs: Arguments for QApplication constructor.
        @param debug_level: Debug level logging (L{ERROR} by default)
        @raise SpynnerError: If none of the javascript directories exists.
        """
        self.application = QApplication(qappargs or [])
        """PyQt4.QtGui.Qapplication object."""
        if debug_level is not None:
            self.debug_level = debug_level
        self.webpage = QWebPage()
        """PyQt4.QtWebKit.QWebPage object."""
        self.webpage.userAgentForUrl = self._user_agent_for_url
        self.webframe = self.webpage.mainFrame()
        """PyQt4.QtWebKit.QWebFrame main webframe object."""
        self.webview = None
        """PyQt4.QtWebKit.QWebView object."""
        self._url_filter = None
        self._html_parser = None
        # Optional user callbacks. They are initialized here so the slots
        # that test them work even if the matching setter was never called.
        self._http_authentication_callback = None
        self._javascript_confirm_callback = None
        self._javascript_prompt_callback = None
        self._download_reply_status = None

        # Javascript
        directory = _first(self._javascript_directories, os.path.isdir)
        if not directory:
            raise SpynnerError("Cannot find javascript directory: %s" %
                               self._javascript_directories)
        sources = []
        for fn in self._javascript_files:
            # Close every file explicitly instead of leaking the handle.
            with open(os.path.join(directory, fn)) as jsfile:
                sources.append(jsfile.read())
        self.javascript = "".join(sources)

        self.webpage.javaScriptAlert = self._javascript_alert
        self.webpage.javaScriptConsoleMessage = self._javascript_console_message
        self.webpage.javaScriptConfirm = self._javascript_confirm
        self.webpage.javaScriptPrompt = self._javascript_prompt

        # Network Access Manager and cookies
        self.manager = QNetworkAccessManager()
        """PyQt4.QtNetwork.QTNetworkAccessManager object."""
        self.manager.createRequest = self._manager_create_request
        self.webpage.setNetworkAccessManager(self.manager)
        self.cookiesjar = _ExtendedNetworkCookieJar()
        """PyQt4.QtNetwork.QNetworkCookieJar object."""
        self.manager.setCookieJar(self.cookiesjar)
        self.manager.connect(
            self.manager,
            SIGNAL("sslErrors(QNetworkReply *, const QList<QSslError> &)"),
            self._on_manager_ssl_errors)
        self.manager.connect(self.manager, SIGNAL('finished(QNetworkReply *)'),
                             self._on_reply)
        self.manager.connect(
            self.manager,
            SIGNAL(
                'authenticationRequired(QNetworkReply *, QAuthenticator *)'),
            self._on_authentication_required)
        # Maps QNetworkAccessManager operation constants to lowercase
        # names ("get", "head", "post", "put").
        self._operation_names = dict(
            (getattr(QNetworkAccessManager, s + "Operation"), s.lower())
            for s in ("Get", "Head", "Post", "Put"))

        # Webpage slots
        self._load_status = None
        self._replies = 0
        self.webpage.setForwardUnsupportedContent(True)
        self.webpage.connect(self.webpage,
                             SIGNAL('unsupportedContent(QNetworkReply *)'),
                             self._on_unsupported_content)
        self.webpage.connect(self.webpage, SIGNAL('loadFinished(bool)'),
                             self._on_load_finished)
        self.webpage.connect(self.webpage, SIGNAL("loadStarted()"),
                             self._on_load_started)

    def _events_loop(self, wait=None):
        """
        Process pending Qt events once and then sleep.

        @param wait: Seconds to sleep (L{event_looptime} if None).
        """
        if wait is None:
            wait = self.event_looptime
        self.application.processEvents()
        time.sleep(wait)

    def _on_load_started(self):
        """Slot for loadStarted(): reset the load status."""
        self._load_status = None
        self._debug(INFO, "Page load started")

    def _on_manager_ssl_errors(self, reply, errors):
        """Slot for sslErrors(): ignore them if L{ignore_ssl_errors} is set."""
        url = unicode(reply.url().toString())
        if self.ignore_ssl_errors:
            self._debug(WARNING, "SSL certificate error ignored: %s" % url)
            reply.ignoreSslErrors()
        else:
            self._debug(WARNING, "SSL certificate error: %s" % url)

    def _on_authentication_required(self, reply, authenticator):
        """
        Slot for authenticationRequired(): ask the user callback (see
        L{set_http_authentication_callback}) for credentials and, if any
        are returned, store them in the authenticator.
        """
        url = unicode(reply.url().toString())
        realm = unicode(authenticator.realm())
        self._debug(INFO,
                    "HTTP auth required: %s (realm: %s)" % (url, realm))
        if not self._http_authentication_callback:
            self._debug(WARNING, "HTTP auth required, but no callback defined")
            return
        credentials = self._http_authentication_callback(url, realm)
        if credentials:
            user, password = credentials
            # The password is masked in the debug output.
            self._debug(
                INFO, "callback returned HTTP credentials: %s/%s" %
                (user, "*" * len(password)))
            authenticator.setUser(user)
            authenticator.setPassword(password)
        else:
            self._debug(WARNING, "HTTP auth callback returned no credentials")

    def _manager_create_request(self, operation, request, data):
        """
        Replacement for QNetworkAccessManager.createRequest that logs the
        request and applies the URL filter (see L{set_url_filter}).

        A request rejected by the filter is redirected to C{about:blank}.
        """
        url = unicode(request.url().toString())
        operation_name = self._operation_names[operation].upper()
        self._debug(INFO, "Request: %s %s" % (operation_name, url))
        for h in request.rawHeaderList():
            self._debug(DEBUG, "  %s: %s" % (h, request.rawHeader(h)))
        if self._url_filter:
            # Only an explicit False rejects the request.
            if self._url_filter(self._operation_names[operation],
                                url) is False:
                self._debug(INFO, "URL filtered: %s" % url)
                request.setUrl(QUrl("about:blank"))
            else:
                self._debug(DEBUG, "URL not filtered: %s" % url)
        reply = QNetworkAccessManager.createRequest(self.manager, operation,
                                                    request, data)
        return reply

    def _on_reply(self, reply):
        """Slot for finished(): count the reply and log its outcome."""
        self._replies += 1
        url = unicode(reply.url().toString())
        if reply.error():
            self._debug(
                WARNING, "Reply error: %s - %d (%s)" %
                (url, reply.error(), reply.errorString()))
        else:
            self._debug(INFO, "Reply successful: %s" % url)
        for header in reply.rawHeaderList():
            self._debug(DEBUG, "  %s: %s" % (header, reply.rawHeader(header)))

    def _on_unsupported_content(self, reply, outfd=None):
        """Slot for unsupportedContent(): download the reply contents."""
        if not reply.error():
            self._start_download(reply, outfd)
        else:
            self._debug(
                ERROR,
                "Error on unsupported content: %s" % reply.errorString())

    def _javascript_alert(self, webframe, message):
        """Log Javascript alerts; show them only if a webview exists."""
        self._debug(INFO, "Javascript alert: %s" % message)
        if self.webview:
            QWebPage.javaScriptAlert(self.webpage, webframe, message)

    def _javascript_console_message(self, message, line, sourceid):
        """Log Javascript console messages."""
        if line:
            self._debug(
                INFO,
                "Javascript console (%s:%d): %s" % (sourceid, line, message))
        else:
            self._debug(INFO, "Javascript console: %s" % message)

    def _javascript_confirm(self, webframe, message):
        """
        Answer a Javascript confirm with the user callback (see
        L{set_javascript_confirm_callback}) or, if none is set, with the
        default QWebPage behaviour.
        """
        smessage = unicode(message)
        url = webframe.url()
        self._debug(
            INFO,
            "Javascript confirm (webframe url = %s): %s" % (url, smessage))
        if self._javascript_confirm_callback:
            value = self._javascript_confirm_callback(url, smessage)
            self._debug(INFO,
                        "Javascript confirm callback returned %s" % value)
            return value
        return QWebPage.javaScriptConfirm(self.webpage, webframe, message)

    def _javascript_prompt(self, webframe, message, defaultvalue, result):
        """
        Answer a Javascript prompt with the user callback (see
        L{set_javascript_prompt_callback}) or, if none is set, with the
        default QWebPage behaviour.

        A callback value of False or None cancels the prompt; any other
        value replaces the contents of C{result}.
        """
        url = webframe.url()
        smessage = unicode(message)
        self._debug(
            INFO,
            "Javascript prompt (webframe url = %s): %s" % (url, smessage))
        if self._javascript_prompt_callback:
            value = self._javascript_prompt_callback(url, smessage,
                                                     defaultvalue)
            self._debug(INFO,
                        "Javascript prompt callback returned: %s" % value)
            if value in (False, None):
                return False
            result.clear()
            result.append(value)
            return True
        return QWebPage.javaScriptPrompt(self.webpage, webframe, message,
                                         defaultvalue, result)

    def _on_webview_destroyed(self, window):
        """Slot for destroyed(): forget the webview."""
        self.webview = None

    def _on_load_finished(self, successful):
        """Slot for loadFinished(): record and log the load status."""
        self._load_status = successful
        status = {True: "successful", False: "error"}[successful]
        self._debug(
            INFO, "Page load finished (%d bytes): %s (%s)" %
            (len(self.html), self.url, status))

    def _get_filepath_for_url(self, url):
        """
        Return the local path where the contents of C{url} will be saved,
        creating the intermediate directories if needed.

        The path is L{download_directory} joined with the host and path of
        the URL. A URL path ending in C{/} gets C{index.html} appended, as
        it would otherwise name a directory instead of a file.
        """
        urlinfo = urlparse.urlsplit(url)
        relpath = urlinfo.netloc + urlinfo.path
        if relpath.endswith("/"):
            relpath += "index.html"
        path = os.path.join(self.download_directory, relpath)
        directory = os.path.dirname(path)
        # An empty dirname means "current directory": nothing to create.
        if directory and not os.path.isdir(directory):
            os.makedirs(directory)
        return path

    def _start_download(self, reply, outfd):
        """
        Write the data of a network reply to a stream as it arrives.

        @param reply: QNetworkReply to read from.
        @param outfd: Output file-like stream. If None, a file is opened
                      at the path given by L{_get_filepath_for_url} and
                      closed when the reply finishes.

        The running byte count is kept in C{reply.downloaded_nbytes}.
        """
        url = unicode(reply.url().toString())
        # Only a stream opened here is closed here; a stream supplied by
        # the caller stays under the caller's control.
        owns_outfd = outfd is None
        if owns_outfd:
            path = self._get_filepath_for_url(url)
            outfd = open(path, "wb")

        def _on_ready_read():
            data = reply.readAll()
            if not hasattr(reply, "downloaded_nbytes"):
                reply.downloaded_nbytes = 0
            reply.downloaded_nbytes += len(data)
            outfd.write(data)
            self._debug(
                DEBUG,
                "Read from download stream (%d bytes): %s" % (len(data), url))

        def _on_network_error(code=None):
            # The signal carries the error code; it is accepted but unused.
            self._debug(ERROR, "Network error on download: %s" % url)

        def _on_finished():
            self._debug(INFO, "Download finished: %s" % url)
            if owns_outfd:
                outfd.close()

        reply.connect(reply, SIGNAL("readyRead()"), _on_ready_read)
        reply.connect(reply, SIGNAL("error(QNetworkReply::NetworkError)"),
                      _on_network_error)
        reply.connect(reply, SIGNAL("finished()"), _on_finished)
        self._debug(INFO, "Start download: %s" % url)

    def _wait_load(self, timeout=None):
        """
        Wait until the current page load finishes and return its status.

        @param timeout: Seconds to wait (no limit if None or 0).
        @raise SpynnerTimeout: If timeout is reached.

        After a successful load the Javascript libraries are injected and
        the viewport is resized to the contents size.
        """
        self._events_loop(0.0)
        # The load may already have finished before we started waiting.
        if self._load_status is not None:
            load_status = self._load_status
            self._load_status = None
            return load_status
        itime = time.time()
        while self._load_status is None:
            if timeout and time.time() - itime > timeout:
                raise SpynnerTimeout("Timeout reached: %d seconds" % timeout)
            self._events_loop()
        self._events_loop(0.0)
        if self._load_status:
            # Expose jQuery under the configured alias (see L{jslib}).
            jscode = "var %s = jQuery.noConflict();" % self.jslib
            self.runjs(self.javascript + jscode, debug=False)
            self.webpage.setViewportSize(
                self.webpage.mainFrame().contentsSize())
        load_status = self._load_status
        self._load_status = None
        return load_status

    def _debug(self, level, *args):
        """Write a debug message if C{level} is within L{debug_level}."""
        if level <= self.debug_level:
            kwargs = dict(outfd=self.debug_stream)
            _debug(*args, **kwargs)

    def _user_agent_for_url(self, url):
        """Return L{user_agent} if set, the QWebPage default otherwise."""
        if self.user_agent:
            return self.user_agent
        return QWebPage.userAgentForUrl(self.webpage, url)

    def _runjs_on_jquery(self, name, code):
        """
        Run a jQuery expression and check that it matched some element.

        @param name: Operation name (only used in the error message).
        @param code: Javascript code evaluating to a jQuery object.
        @raise SpynnerJavascriptError: If the resulting length is not a
                                       number or is lower than 1.
        """
        code2 = "result = %s; result.length" % code
        # QVariant.toInt() returns an (int, ok) pair, so it must be
        # unpacked: comparing the pair itself against 1 never triggers.
        length, ok = self.runjs(code2).toInt()
        if not ok or length < 1:
            raise SpynnerJavascriptError("error on %s: %s" % (name, code))

    def _get_html(self):
        """Getter for the L{html} property."""
        return unicode(self.webframe.toHtml())

    def _get_soup(self):
        """Getter for the L{soup} property."""
        if not self._html_parser:
            raise SpynnerError("Cannot get soup with no HTML parser defined")
        return self._html_parser(self.html)

    def _get_url(self):
        """Getter for the L{url} property."""
        return unicode(self.webframe.url().toString())

    # Properties

    url = property(_get_url)
    """Current URL."""

    html = property(_get_html)
    """Rendered HTML in current page."""

    soup = property(_get_soup)
    """HTML soup (see L{set_html_parser})."""

    #{ Basic interaction with browser

    def load(self, url):
        """Load a web page and return status (a boolean)."""
        self.webframe.load(QUrl(url))
        return self._wait_load()

    def click(self,
              selector,
              wait_load=False,
              wait_requests=None,
              timeout=None):
        """
        Click any clickable element in page.

        @param selector: jQuery selector.
        @param wait_load: If True, it will wait until a new page is loaded.
        @param timeout: Seconds to wait for the page to load before
                        raising an exception. It only applies to the
                        C{wait_load} stage, not to C{wait_requests}.
        @param wait_requests: How many requests to wait before returning. Useful
                              for AJAX requests.
        @return: The load status if C{wait_load} is True, None otherwise.

        By default this method will not wait for a page to load.
        If you are clicking a link or submit button, you must call this
        method with C{wait_load=True} or, alternatively, call
        L{wait_load} afterwards. However, the recommended way is to use
        L{click_link}.

        When a non-HTML file is clicked this method will download it. The
        file is automatically saved keeping the original structure (as
        wget --recursive does). For example, a file with URL
        I{http://server.org/dir1/dir2/file.ext} will be saved to
        L{download_directory}/I{server.org/dir1/dir2/file.ext}.
        """
        jscode = "%s('%s').simulate('click')" % (self.jslib, selector)
        self._replies = 0
        self._runjs_on_jquery("click", jscode)
        if wait_requests:
            while self._replies < wait_requests:
                self._events_loop()
            self._events_loop(0.0)
        if wait_load:
            return self._wait_load(timeout)

    def click_link(self, selector, timeout=None):
        """Click a link and wait for the page to load."""
        return self.click(selector, wait_load=True, timeout=timeout)

    def click_ajax(self, selector, wait_requests=1, timeout=None):
        """
        Click a AJAX link and wait for the request to finish.

        @note: C{timeout} is forwarded to L{click}, where it only bounds
               the page-load wait; the wait for requests is not bounded.
        """
        return self.click(selector,
                          wait_requests=wait_requests,
                          timeout=timeout)

    def wait_load(self, timeout=None):
        """
        Wait until the page is loaded.

        @param timeout: Time to wait (seconds) for the page load to complete.
        @return: Boolean state
        @raise SpynnerTimeout: If timeout is reached.
        """
        return self._wait_load(timeout)

    def wait(self, waittime):
        """
        Wait some time.

        @param waittime: Time to wait (seconds).

        This is an active wait, the events loop will be run, so it
        may be useful to wait for synchronous Javascript events that
        change the DOM.
        """
        itime = time.time()
        while time.time() - itime < waittime:
            self._events_loop()

    def close(self):
        """Close Browser instance and release resources."""
        if self.webview:
            self.destroy_webview()
        if self.webpage:
            # Drop the reference but keep the attribute, so that calling
            # close() again does not fail.
            self.webpage = None

    @classmethod
    def configure_proxy(cls,
                        hostname,
                        port,
                        user=None,
                        password=None,
                        proxy_type=QNetworkProxy.HttpProxy):
        """
        Configure network proxy layer.

        @param hostname: Proxy hostname.
        @param port: Proxy port.
        @param user: Proxy username (optional).
        @param password: Proxy password (optional).
        @param proxy_type: see QNetworkProxy.ProxyType. Default: HttpProxy.

        The credentials are only set when a user is given and the password
        is not None. The proxy is installed application-wide.
        """
        proxy = QNetworkProxy()
        proxy.setType(proxy_type)
        proxy.setHostName(hostname)
        proxy.setPort(port)
        if user and password is not None:
            proxy.setUser(user)
            proxy.setPassword(password)
        QNetworkProxy.setApplicationProxy(proxy)

    #}

    #{ Webview

    def create_webview(self, show=False):
        """
        Create a QWebView object and insert current QWebPage.

        @param show: If True, show the webview after creating it.
        @raise SpynnerError: If a webview already exists.
        """
        if self.webview:
            raise SpynnerError("Cannot create webview (already initialized)")
        self.webview = QWebView()
        self.webview.setPage(self.webpage)
        window = self.webview.window()
        window.setAttribute(Qt.WA_DeleteOnClose)
        window.connect(window, SIGNAL('destroyed(QObject *)'),
                       self._on_webview_destroyed)
        if show:
            self.show()

    def destroy_webview(self):
        """
        Destroy current QWebView.

        @raise SpynnerError: If there is no webview.
        """
        if not self.webview:
            raise SpynnerError("Cannot destroy webview (not initialized)")
        # Drop the reference but keep the attribute: the other methods
        # test self.webview and would fail if it did not exist.
        self.webview = None

    def show(self):
        """Show webview browser."""
        if not self.webview:
            raise SpynnerError("Webview is not initialized")
        self.webview.show()

    def hide(self):
        """Hide webview browser."""
        if not self.webview:
            raise SpynnerError("Webview is not initialized")
        self.webview.hide()

    def browse(self):
        """Let the user browse the current page (infinite loop)."""
        if not self.webview:
            raise SpynnerError("Webview is not initialized")
        self.show()
        # The loop ends when the webview is destroyed (see
        # _on_webview_destroyed).
        while self.webview:
            self._events_loop()

    #}

    #{ Form manipulation

    def fill(self, selector, value):
        """Fill an input text with a string value using a jQuery selector."""
        # NOTE(review): only single quotes are escaped; backslashes in
        # value reach the Javascript string literal untouched.
        escaped_value = value.replace("'", "\\'")
        jscode = "%s('%s').val('%s')" % (self.jslib, selector, escaped_value)
        self._runjs_on_jquery("fill", jscode)

    def check(self, selector):
        """Check an input checkbox using a jQuery selector."""
        jscode = "%s('%s').attr('checked', true)" % (self.jslib, selector)
        self._runjs_on_jquery("check", jscode)

    def uncheck(self, selector):
        """Uncheck input checkbox using a jQuery selector"""
        jscode = "%s('%s').attr('checked', false)" % (self.jslib, selector)
        self._runjs_on_jquery("uncheck", jscode)

    def choose(self, selector):
        """Choose a radio input using a jQuery selector."""
        jscode = "%s('%s').simulate('click')" % (self.jslib, selector)
        self._runjs_on_jquery("choose", jscode)

    def select(self, selector):
        """Choose a option in a select using a jQuery selector."""
        jscode = "%s('%s').attr('selected', 'selected')" % (self.jslib,
                                                            selector)
        self._runjs_on_jquery("select", jscode)

    submit = click_link

    #}

    #{ Javascript

    def runjs(self, jscode, debug=True):
        """
        Inject Javascript code into the current context of page.

        @param jscode: Javascript code to injected.
        @param debug: Set to False to disable debug output for this injection.
        @return: Result of the evaluation (a QVariant).

        You can call Jquery even if the original page does not include it
        as Spynner injects the library for every loaded page. You must
        use C{_jQuery(...)} instead of of C{jQuery} or the common {$(...)}
        shortcut.

        @note: You can change the _jQuery alias (see L{jslib}).
        """
        if debug:
            self._debug(DEBUG, "Run Javascript code: %s" % jscode)
        r = self.webpage.mainFrame().evaluateJavaScript(jscode)
        # NOTE(review): an invalid result triggers a second evaluation, so
        # code with side effects may run twice -- confirm this is intended.
        if not r.isValid():
            r = self.webpage.mainFrame().evaluateJavaScript(jscode)
        return r

    def set_javascript_confirm_callback(self, callback):
        """
        Set function callback for Javascript confirm pop-ups.

        By default Javascript confirmations are not answered. If the webpage
        you are working pops Javascript confirmations, be sure to set a callback
        for them.

        Callback signature: C{javascript_confirm_callback(url, message)}

            - url: Url where the popup was launched.
            - message: String message.

        The callback should return a boolean (True meaning 'yes', False meaning 'no')
        """
        self._javascript_confirm_callback = callback

    def set_javascript_prompt_callback(self, callback):
        """
        Set function callback for Javascript prompt.

        By default Javascript prompts are not answered. If the webpage
        you are working pops Javascript prompts, be sure to set a callback
        for them.

        Callback signature: C{javascript_prompt_callback(url, message, defaultvalue)}

            - url: Url where the popup prompt was launched.
            - message: String message.
            - defaultvalue: Default value for prompt answer

        The callback should return a string with the answer or None to cancel the prompt.
        """
        self._javascript_prompt_callback = callback

    #}

    #{ Cookies

    def get_cookies(self):
        """Return string containing the current cookies in Mozilla format."""
        return self.cookiesjar.mozillaCookies()

    def set_cookies(self, string_cookies):
        """Set cookies from a string with Mozilla-format cookies."""
        return self.cookiesjar.setMozillaCookies(string_cookies)

    #}

    #{ Download files

    def download(self, url, outfd=None):
        """
        Download a given URL using current cookies.

        @param url: URL or path to download
        @param outfd: Output file-like stream. If None, the data is collected
                      in memory and returned as a string.
        @return: If C{outfd} was given, the number of bytes downloaded (None
                 if the reply finished with an error). Otherwise, the
                 downloaded data as a string.
        @raise SpynnerError: If the reply reports an error right after the
                             request has been issued.
        @note: If url is a path, the current base URL will be pre-appended.
        """
        def _on_reply(reply):
            # Any non-None value ends the wait loop below.
            self._download_reply_status = not bool(reply.error())

        self._download_reply_status = None
        if not urlparse.urlsplit(url).scheme:
            url = urlparse.urljoin(self.url, url)
        request = QNetworkRequest(QUrl(url))
        # Create a new manager to process this download
        manager = QNetworkAccessManager()
        # The cookie jar has to be installed *before* the request is issued;
        # setting it afterwards leaves the request without the current
        # cookies.
        cookiejar = self.manager.cookieJar()
        cookiejar_parent = cookiejar.parent()
        manager.setCookieJar(cookiejar)
        # setCookieJar() re-parents the jar to the temporary manager. Hand it
        # back to its previous parent so the shared jar is not destroyed
        # together with this short-lived manager.
        cookiejar.setParent(cookiejar_parent)
        manager.connect(manager, SIGNAL('finished(QNetworkReply *)'),
                        _on_reply)
        reply = manager.get(request)
        if reply.error():
            raise SpynnerError("Download error: %s" % reply.errorString())
        reply.downloaded_nbytes = 0
        # Compare against None: a file-like object may legitimately be falsy.
        outfd_set = outfd is not None
        if not outfd_set:
            outfd = StringIO()
        self._start_download(reply, outfd)
        # Pump the events loop until the 'finished' callback reports a status.
        while self._download_reply_status is None:
            self._events_loop()
        if outfd_set:
            return (reply.downloaded_nbytes if not reply.error() else None)
        else:
            return outfd.getvalue()

    #}

    #{ HTML and tag soup parsing

    def set_html_parser(self, parser):
        """
        Set HTML parser used to generate the HTML L{soup}.

        @param parser: Callback called to generate the soup.

        When a HTML parser is set for a Browser, the property L{soup} returns
        the parsed HTML.
        """
        self._html_parser = parser

    def html_contains(self, regexp):
        """Return True if current HTML contains a given regular expression."""
        return bool(re.search(regexp, self.html))

    #}

    #{ HTTP Authentication

    def set_http_authentication_callback(self, callback):
        """
        Set HTTP authentication request callback.

        The callback must have this signature:

        C{http_authentication_callback(url, realm)}:

            - C{url}: URL where the requested was made.
            - C{realm}: Realm requiring authentication.

        The callback should return a pair of string containing (user, password)
        or None if you don't want to answer.
        """
        self._http_authentication_callback = callback

    #}

    #{ Miscellaneous

    def snapshot(self, box=None, format=QImage.Format_ARGB32):
        """
        Take an image snapshot of the current frame.

        @param box: 4-element tuple containing box to capture (x1, y1, x2, y2).
                    If None, capture the whole page.
        @param format: QImage format (see QImage::Format_*).
        @return: A QImage image.

        Typical usage:

        >>> browser.load(url)
        >>> browser.snapshot().save("webpage.png")
        """
        if box:
            x1, y1, x2, y2 = box
            w, h = (x2 - x1), (y2 - y1)
            # Render up to the bottom-right corner, then crop the box.
            image0 = QImage(QSize(x2, y2), format)
            painter = QPainter(image0)
            self.webpage.mainFrame().render(painter)
            painter.end()
            image = image0.copy(x1, y1, w, h)
        else:
            image = QImage(self.webpage.viewportSize(), format)
            painter = QPainter(image)
            self.webpage.mainFrame().render(painter)
            painter.end()
        return image

    def get_url_from_path(self, path):
        """Return the URL for a given path using the current URL as base."""
        return urlparse.urljoin(self.url, path)

    def set_url_filter(self, url_filter):
        """
        Set function callback to filter URL.

        By default all requested elements of a page are loaded. That includes
        stylesheets, images and many other elements that you may not need at all.
        Use this method to define the callback that will be called every time
        a new request is made. The callback must have this signature:

        C{my_url_filter(operation, url)}:

            - C{operation}: string with HTTP operation: C{get}, C{head},
                            C{post} or C{put}.
            - C{url}: requested item URL.

        It should return C{True} (proceed) or C{False} (reject).
        """
        self._url_filter = url_filter
Example #4
0
class Browser:
    """
    Stateful programmatic web browser class based upon QtWebKit.

    >>> browser = Browser()
    >>> browser.load("http://www.wordreference.com")
    >>> browser.runjs("console.log('I can run Javascript!')")
    >>> browser.runjs("_jQuery('div').css('border', 'solid red')") # and jQuery!
    >>> browser.select("#esen")
    >>> browser.fill("input[name=enit]", "hola")
    >>> browser.click("input[name=b]", wait_load=True)
    >>> print browser.url, len(browser.html)
    >>> browser.close()
    """
    ignore_ssl_errors = True
    """@ivar: If True, ignore SSL certificate errors."""
    user_agent = None
    """@ivar: User agent for requests (see QWebPage::userAgentForUrl for details)"""
    jslib = "jq"
    """@ivar: Library name for jQuery library injected by default to pages."""
    download_directory = "."
    """@ivar: Directory where downloaded files will be stored."""
    debug_stream = sys.stderr
    """@ivar: File-like stream where debug output will be written."""
    debug_level = ERROR
    """@ivar: Debug verbose level (L{ERROR}, L{WARNING}, L{INFO} or L{DEBUG})."""
    event_looptime = 0.01
    """@ivar: Event loop dispatcher loop delay (seconds)."""

    # Error code and message of the last network reply that reported an
    # error (both stay None until that happens).
    errorCode = None
    errorMessage = None

    # Javascript files whose contents are concatenated (in this order) and
    # injected into pages after they load.
    _javascript_files = ["jquery.min.js", "jquery.simulate.js"]

    # Candidate directories holding the files above; presumably the first
    # one that exists is used (depends on the _first helper).
    _javascript_directories = [
        os.path.join(os.path.dirname(__file__), "../javascript"),
        os.path.join(sys.prefix, "share/spynner/javascript"),
    ]
    
    def __init__(self, qappargs=None, debug_level=None):
        """
        Init a Browser instance.

        @param qappargs: Arguments for QApplication constructor.
        @param debug_level: Debug level logging (L{ERROR} by default)
        @raise SpynnerError: If no javascript directory can be found.
        """
        self.application = QApplication(qappargs or [])
        """PyQt4.QtGui.Qapplication object."""
        if debug_level is not None:
            self.debug_level = debug_level
        self.webpage = QWebPage()
        """PyQt4.QtWebKit.QWebPage object."""
        self.webpage.userAgentForUrl = self._user_agent_for_url
        self.webframe = self.webpage.mainFrame()
        """PyQt4.QtWebKit.QWebFrame main webframe object."""
        self.webview = None
        """PyQt4.QtWebKit.QWebView object."""
        self._url_filter = None
        self._html_parser = None
        # The authentication slot reads this attribute even when no callback
        # was ever registered, so it must always exist.
        self._http_authentication_callback = None
        self._reply_url = None
        self._reply_status = None

        # Javascript
        directory = _first(self._javascript_directories, os.path.isdir)
        if not directory:
            raise SpynnerError("Cannot find javascript directory: %s" %
                self._javascript_directories)
        sources = []
        for fn in self._javascript_files:
            jsfile = open(os.path.join(directory, fn))
            try:
                sources.append(jsfile.read())
            finally:
                # Close explicitly instead of relying on garbage collection.
                jsfile.close()
        self.javascript = "".join(sources)

        self.webpage.javaScriptAlert = self._javascript_alert
        self.webpage.javaScriptConsoleMessage = self._javascript_console_message
        self.webpage.javaScriptConfirm = self._javascript_confirm
        self.webpage.javaScriptPrompt = self._javascript_prompt
        self._javascript_confirm_callback = None
        # Read by _javascript_prompt on every prompt, so it must exist even
        # if set_javascript_prompt_callback is never called.
        self._javascript_prompt_callback = None

        # Network Access Manager and cookies
        self.manager = QNetworkAccessManager()
        """PyQt4.QtNetwork.QTNetworkAccessManager object."""
        self.manager.createRequest = self._manager_create_request
        self.webpage.setNetworkAccessManager(self.manager)
        self.cookiesjar = _ExtendedNetworkCookieJar()
        """PyQt4.QtNetwork.QNetworkCookieJar object."""
        self.manager.setCookieJar(self.cookiesjar)
        self.manager.connect(self.manager,
            SIGNAL("sslErrors(QNetworkReply *, const QList<QSslError> &)"),
            self._on_manager_ssl_errors)
        self.manager.connect(self.manager,
            SIGNAL('finished(QNetworkReply *)'),
            self._on_reply)
        self.manager.connect(self.manager,
            SIGNAL('authenticationRequired(QNetworkReply *, QAuthenticator *)'),
            self._on_authentication_required)
        # Maps QNetworkAccessManager operation constants to lowercase names.
        self._operation_names = dict(
            (getattr(QNetworkAccessManager, s + "Operation"), s.lower())
            for s in ("Get", "Head", "Post", "Put"))

        # Webpage slots
        self._load_status = None
        self._replies = 0
        self.webpage.setForwardUnsupportedContent(True)
        self.webpage.connect(self.webpage,
            SIGNAL('unsupportedContent(QNetworkReply *)'),
            self._on_unsupported_content)
        self.webpage.connect(self.webpage,
            SIGNAL('loadFinished(bool)'),
            self._on_load_finished)
        self.webpage.connect(self.webpage,
            SIGNAL("loadStarted()"),
            self._on_load_started)

    def _events_loop(self, wait=None):
        """
        Process pending Qt events once and then sleep.

        @param wait: Seconds to sleep after processing the events. If None,
                     L{event_looptime} is used.
        """
        delay = self.event_looptime if wait is None else wait
        self.application.processEvents()
        time.sleep(delay)
                        
    def _on_load_started(self):
        """Slot for the page loadStarted signal: forget the previous load status."""
        self._load_status = None
        self._debug(INFO, "Page load started")
    
    def _on_manager_ssl_errors(self, reply, errors):
        """
        Slot for the manager sslErrors signal.

        The errors are ignored (and the reply allowed to continue) only when
        L{ignore_ssl_errors} is set; in both cases a warning is logged.

        @param reply: Reply that reported the SSL errors.
        @param errors: Reported errors (not inspected).
        """
        failed_url = unicode(reply.url().toString())
        if not self.ignore_ssl_errors:
            self._debug(WARNING, "SSL certificate error: %s" % failed_url)
            return
        self._debug(WARNING, "SSL certificate error ignored: %s" % failed_url)
        reply.ignoreSslErrors()

    def _on_authentication_required(self, reply, authenticator):
        """
        Slot for the manager authenticationRequired signal.

        Asks the HTTP authentication callback (if any) for credentials and
        stores them in the authenticator.

        @param reply: Reply whose request requires authentication.
        @param authenticator: QAuthenticator to fill in with user/password.
        """
        url = unicode(reply.url().toString())
        realm = unicode(authenticator.realm())
        # _debug takes the level as first argument; without it the message
        # itself would be used as the level.
        self._debug(INFO, "HTTP auth required: %s (realm: %s)" % (url, realm))
        # Tolerate the attribute being absent (no callback ever registered).
        callback = getattr(self, "_http_authentication_callback", None)
        if not callback:
            self._debug(WARNING, "HTTP auth required, but no callback defined")
            return
        credentials = callback(url, realm)
        if not credentials:
            self._debug(WARNING, "HTTP auth callback returned no credentials")
            return
        user, password = credentials
        # Never write the password itself to the debug stream.
        self._debug(INFO, "callback returned HTTP credentials: %s/%s" %
            (user, "*"*len(password)))
        authenticator.setUser(user)
        authenticator.setPassword(password)
        
    def _manager_create_request(self, operation, request, data):
        """
        Replacement for QNetworkAccessManager.createRequest.

        Logs the request, applies the URL filter (if any) and delegates to
        the original implementation.

        @param operation: QNetworkAccessManager operation constant.
        @param request: QNetworkRequest about to be sent.
        @param data: Outgoing data, passed through untouched.
        @return: The reply created by QNetworkAccessManager.createRequest.
        """
        url = unicode(request.url().toString())
        # Only get/head/post/put are registered in _operation_names; any
        # other operation must not abort the request with a KeyError.
        operation_name = self._operation_names.get(operation, "unknown")
        self._debug(INFO, "Request: %s %s" % (operation_name.upper(), url))
        for h in request.rawHeaderList():
            self._debug(DEBUG, "  %s: %s" % (h, request.rawHeader(h)))
        if self._url_filter:
            # Only an explicit False rejects the request.
            if self._url_filter(operation_name, url) is False:
                self._debug(INFO, "URL filtered: %s" % url)
                request.setUrl(QUrl("about:blank"))
            else:
                self._debug(DEBUG, "URL not filtered: %s" % url)
        reply = QNetworkAccessManager.createRequest(self.manager,
            operation, request, data)
        return reply

    def _on_reply(self, reply):
        """
        Slot for the manager finished signal.

        Counts the reply, remembers its URL and status and, on error, stores
        the error code and message in L{errorCode} / L{errorMessage}.

        @param reply: Finished QNetworkReply.
        """
        self._replies += 1
        reply_url = unicode(reply.url().toString())
        self._reply_url = reply_url
        failed = bool(reply.error())
        self._reply_status = not failed

        if not failed:
            self._debug(INFO, "Reply successful: %s" % reply_url)
        else:
            details = (reply_url, reply.error(), reply.errorString())
            self._debug(WARNING, "Reply error: %s - %d (%s)" % details)
            self.errorCode = reply.error()
            self.errorMessage = reply.errorString()
        for name in reply.rawHeaderList():
            self._debug(DEBUG, "  %s: %s" % (name, reply.rawHeader(name)))

    def _on_unsupported_content(self, reply, outfd=None):
        """
        Slot for the page unsupportedContent signal: download the content.

        @param reply: Reply carrying the unsupported content.
        @param outfd: Output stream handed over to L{_start_download}.
        """
        if reply.error():
            self._debug(ERROR, "Error on unsupported content: %s" % reply.errorString())
            return
        self._start_download(reply, outfd)
                             
    def _javascript_alert(self, webframe, message):
        """
        Replacement for QWebPage.javaScriptAlert.

        The alert is always logged; the original dialog is only shown when
        a webview exists.
        """
        self._debug(INFO, "Javascript alert: %s" % message)
        if not self.webview:
            return
        QWebPage.javaScriptAlert(self.webpage, webframe, message)
        
    def _javascript_console_message(self, message, line, sourceid):
        """
        Replacement for QWebPage.javaScriptConsoleMessage: log the message.

        @param message: Text written to the Javascript console.
        @param line: Line number (the source location is omitted if falsy).
        @param sourceid: Identifier of the source of the message.
        """
        if line:
            text = "Javascript console (%s:%d): %s" % (sourceid, line, message)
        else:
            text = "Javascript console: %s" % message
        self._debug(INFO, text)

    def _javascript_confirm(self, webframe, message):
        """
        Replacement for QWebPage.javaScriptConfirm.

        @param webframe: Frame that opened the confirmation.
        @param message: Confirmation message.
        @return: Value returned by the confirm callback if one is set,
                 otherwise the result of the original QWebPage handler.
        """
        text = unicode(message)
        frame_url = webframe.url()
        self._debug(INFO, "Javascript confirm (webframe url = %s): %s" %
            (frame_url, text))
        callback = self._javascript_confirm_callback
        if not callback:
            return QWebPage.javaScriptConfirm(self.webpage, webframe, message)
        answer = callback(frame_url, text)
        self._debug(INFO, "Javascript confirm callback returned %s" % answer)
        return answer

    def _javascript_prompt(self, webframe, message, defaultvalue, result):
        """
        Replacement for QWebPage.javaScriptPrompt.

        @param webframe: Frame that opened the prompt.
        @param message: Prompt message.
        @param defaultvalue: Default answer offered by the page.
        @param result: Output object; it is cleared and the answer appended.
        @return: With a callback set: True if it answered, False if it
                 returned False or None. Without a callback: the result of
                 the original QWebPage handler.
        """
        url = webframe.url()
        smessage = unicode(message)
        self._debug(INFO, "Javascript prompt (webframe url = %s): %s" %
            (url, smessage))
        # Tolerate the attribute being absent (no callback ever registered).
        callback = getattr(self, "_javascript_prompt_callback", None)
        if callback:
            value = callback(url, smessage, defaultvalue)
            self._debug(INFO, "Javascript prompt callback returned: %s" % value)
            if value in (False, None):
                return False
            result.clear()
            result.append(value)
            return True
        return QWebPage.javaScriptPrompt(self.webpage, webframe, message,
            defaultvalue, result)
        
    def _on_webview_destroyed(self, window):
        """Slot for the webview window destroyed signal: forget the webview."""
        self.webview = None
                                             
    def _on_load_finished(self, successful):
        """
        Slot for the page loadFinished signal: record and log the outcome.

        @param successful: Boolean load status.
        """
        self._load_status = successful
        outcomes = {True: "successful", False: "error"}
        outcome = outcomes[successful]
        summary = (len(self.html), self.url, outcome)
        self._debug(INFO, "Page load finished (%d bytes): %s (%s)" % summary)

    def _get_filepath_for_url(self, url):
        """
        Return the local path where the contents of a URL will be saved.

        The path mirrors the URL (host followed by the URL path) below
        L{download_directory}; missing parent directories are created.
        URLs without a file name (empty path or trailing slash) are saved
        as C{index.html}.

        @param url: URL of the item to download.
        @return: Path of the file to write.
        @raise SpynnerError: If the URL resolves outside L{download_directory}.
        """
        urlinfo = urlparse.urlsplit(url)
        # Strip leading slashes: os.path.join would otherwise discard the
        # download directory for URLs without a host (absolute component).
        relpath = (urlinfo.netloc + urlinfo.path).lstrip("/")
        if not relpath or relpath.endswith("/"):
            # The URL names a directory, which cannot be opened as a file.
            relpath += "index.html"
        path = os.path.join(self.download_directory, relpath)
        # URLs come from remote content: refuse ".." segments that would
        # make the file land outside the download directory.
        root = os.path.join(os.path.abspath(self.download_directory), "")
        if not os.path.abspath(path).startswith(root):
            raise SpynnerError("Download path outside download directory: %s"
                % url)
        directory = os.path.dirname(path)
        # dirname is empty for a bare file name in the current directory.
        if directory and not os.path.isdir(directory):
            os.makedirs(directory)
        return path

    def _start_download(self, reply, outfd):
        """
        Stream the body of a network reply into a file-like object.

        The number of bytes read so far is kept in C{reply.downloaded_nbytes}.

        @param reply: QNetworkReply to read from.
        @param outfd: Output file-like stream. If None, a file is opened at
                      the path given by L{_get_filepath_for_url} and closed
                      when the reply finishes.
        """
        def _on_ready_read():
            data = reply.readAll()
            reply.downloaded_nbytes += len(data)
            outfd.write(data)
            self._debug(DEBUG, "Read from download stream (%d bytes): %s"
                % (len(data), url))
        def _on_network_error(code=None):
            self._debug(ERROR, "Network error on download: %s" % url)
        def _on_finished():
            self._debug(INFO, "Download finished: %s" % url)
            if close_on_finish:
                # Only close streams opened here; the caller owns the rest.
                outfd.close()
        url = unicode(reply.url().toString())
        close_on_finish = outfd is None
        if close_on_finish:
            path = self._get_filepath_for_url(url)
            outfd = open(path, "wb")
        # Replies forwarded as unsupported content never had the counter
        # initialized; keep any value the caller already set.
        if not hasattr(reply, "downloaded_nbytes"):
            reply.downloaded_nbytes = 0
        reply.connect(reply, SIGNAL("readyRead()"), _on_ready_read)
        reply.connect(reply, SIGNAL("error(QNetworkReply::NetworkError)"),
            _on_network_error)
        reply.connect(reply, SIGNAL("finished()"), _on_finished)
        self._debug(INFO, "Start download: %s" % url)

    def _wait_load(self, timeout=None):
        """
        Run the events loop until a page load finishes.

        If a load status is already available after one pass of the events
        loop, it is returned right away. Otherwise the method waits for it
        and, on a successful load, injects the Javascript library and
        resizes the viewport to the size of the page contents.

        @param timeout: Seconds to wait (no limit if None or 0).
        @return: Boolean load status (the stored status is reset to None).
        @raise SpynnerTimeout: If timeout is reached.
        """
        self._events_loop(0.0)
        must_wait = self._load_status is None
        if must_wait:
            started = time.time()
            while self._load_status is None:
                if timeout and time.time() - started > timeout:
                    raise SpynnerTimeout("Timeout reached: %d seconds" % timeout)
                self._events_loop()
            self._events_loop(0.0)
            if self._load_status:
                alias_code = "var %s = jQuery.noConflict();" % self.jslib
                self.runjs(self.javascript + alias_code, debug=False)
                contents_size = self.webpage.mainFrame().contentsSize()
                self.webpage.setViewportSize(contents_size)
        status, self._load_status = self._load_status, None
        return status

    def _debug(self, level, *args):
        """
        Write a debug message if its level is enabled.

        @param level: Level of the message; it is written only when not
                      greater than L{debug_level}.
        @param args: Arguments for the module-level C{_debug} function,
                     which writes to L{debug_stream}.
        """
        if not level <= self.debug_level:
            return
        options = {"outfd": self.debug_stream}
        _debug(*args, **options)

    def _user_agent_for_url(self, url):
        """
        Replacement for QWebPage.userAgentForUrl.

        @return: L{user_agent} if set, otherwise the default user agent
                 given by QWebPage for the URL.
        """
        return self.user_agent or QWebPage.userAgentForUrl(self.webpage, url)

    def get_js_obj_length(self, res):
        """
        Return the C{length} field of a Javascript result.

        @param res: Result of L{runjs}.
        @return: The integer value of the C{length} field, or False when
                 the result is not a map or has no such field.
        """
        if res.type() != res.Map:
            return False
        fields = res.toMap()
        key = QString(u'length')
        if key in fields:
            return fields[key].toInt()[0]
        return False
    
    def jslen(self, selector):
        """
        Return the C{length} of the jQuery object for a selector.

        @param selector: jQuery selector.
        @return: See L{get_js_obj_length}.
        """
        jscode = "%s('%s')" % (self.jslib, selector)
        return self.get_js_obj_length(self.runjs(jscode))
    
    def _runjs_on_jquery(self, name, code):
        """
        Run jQuery code and check that its result is not empty.

        @param name: Name of the operation (only used in the error message).
        @param code: Javascript code to run.
        @raise SpynnerJavascriptError: If the result has no C{length} or its
                                       C{length} is lower than 1.
        """
        matched = self.get_js_obj_length(self.runjs(code))
        if matched < 1:
            raise SpynnerJavascriptError("error on %s: %s" % (name, code))

    def _get_html(self):
        """Return the HTML of the current webframe as a unicode string."""
        return unicode(self.webframe.toHtml())

    def _get_soup(self):
        """
        Parse the current HTML with the configured HTML parser.

        @return: Whatever the parser returns for the current HTML.
        @raise SpynnerError: If no HTML parser has been set.
        """
        parser = self._html_parser
        if parser:
            return parser(self.html)
        raise SpynnerError("Cannot get soup with no HTML parser defined")

    def _get_url(self):
        """Return the URL of the current webframe as a unicode string."""
        return unicode(self.webframe.url().toString())

    # Properties

    url = property(_get_url)
    """Current URL."""

    html = property(_get_html)
    """Rendered HTML in current page."""

    # NOTE: soup used to be a read-only property (property(_get_soup)) that
    # parsed the page on every access. It is now a plain attribute, initially
    # None, so that changes are retained through multiple calls.
    soup = None
    """HTML soup (see L{set_html_parser})."""
               
    #{ Basic interaction with browser

    def load(self, url):
        """
        Load a web page in the current webframe and wait until it is loaded.

        @param url: URL of the page to load.
        @return: Load status (a boolean).
        """
        self.webframe.load(QUrl(url))
        return self._wait_load()

    def load_request(self, req):
        """
        Load a network request in the current webframe and wait for it.

        @param req: Request object passed as is to the webframe C{load}.
        @return: Load status (a boolean).
        """
        self.webframe.load(req)
        return self._wait_load()

    def wait_requests(self, wait_requests = None, url = None, url_regex = None):
        """
        Run the events loop until the expected network replies arrive.

        @param wait_requests: If set, wait until at least this number of
                              replies have been counted.
        @param url: If set, wait until the URL of the latest reply equals it.
        @param url_regex: If set, wait until the URL of the latest reply
                          matches it (takes precedence over C{url}).
        @note: There is no timeout; the method blocks until the conditions
               are met.
        """
        if wait_requests:
            while self._replies < wait_requests:
                self._events_loop()
            self._events_loop(0.0)
        if not (url_regex or url):
            return
        baseline = self._replies
        while True:
            # Only look at the reply URL once some new reply was counted.
            if baseline != self._replies:
                if url_regex:
                    found = re.search(url_regex, self._reply_url)
                else:
                    found = (url == self._reply_url)
                if found:
                    break
            self._events_loop()
        self._events_loop(0.0)
    
    def click(self, selector, wait_load=False, wait_requests=None, timeout=None):
        """
        Simulate a click on the element(s) matched by a jQuery selector.

        @param selector: jQuery selector.
        @param wait_load: If True, it will wait until a new page is loaded.
        @param wait_requests: How many requests to wait before returning.
                              Useful for AJAX requests.
        @param timeout: Seconds to wait for the page to load before
                        raising an exception (only used with C{wait_load}).
        @return: Load status if C{wait_load} is set, None otherwise.

        By default this method will not wait for a page to load.
        If you are clicking a link or submit button, you must call this
        method with C{wait_load=True} or, alternatively, call
        L{wait_load} afterwards. However, the recommended way is to use
        L{click_link}.

        When a non-HTML file is clicked this method will download it. The
        file is automatically saved keeping the original structure (as
        wget --recursive does). For example, a file with URL
        I{http://server.org/dir1/dir2/file.ext} will be saved to
        L{download_directory}/I{server.org/dir1/dir2/file.ext}.
        """
        # Restart the reply counter so wait_requests only counts replies
        # triggered by this click.
        self._replies = 0
        click_code = "%s('%s').simulate('click')" % (self.jslib, selector)
        self._runjs_on_jquery("click", click_code)
        self.wait_requests(wait_requests)
        if not wait_load:
            return None
        return self._wait_load(timeout)

    def click_link(self, selector, timeout=None):
        """
        Click a link and wait for the page to load.

        @param selector: jQuery selector of the element to click.
        @param timeout: Seconds to wait for the page to load.
        @return: Result of L{click} called with C{wait_load=True}.
        """
        return self.click(selector, wait_load=True, timeout=timeout)

    def click_ajax(self, selector, wait_requests=1, timeout=None):
        """
        Click a AJAX link and wait for the request to finish.

        @param selector: jQuery selector of the element to click.
        @param wait_requests: How many requests to wait before returning.
        @param timeout: Forwarded to L{click}, which only uses it when
                        waiting for a page load (not requested here).
        @return: Result of L{click}.
        """
        return self.click(selector, wait_requests=wait_requests, timeout=timeout)
    
    def wait_load(self, timeout=None):
        """
        Wait until the page is loaded.

        Public wrapper around C{_wait_load}.

        @param timeout: Time to wait (seconds) for the page load to complete.
        @return: Boolean state
        @raise SpynnerTimeout: If timeout is reached.
        """
        return self._wait_load(timeout)

    def wait(self, waittime):
        """
        Keep the events loop running for some time.

        @param waittime: Time to wait (seconds).

        This is an active wait, the events loop will be run, so it
        may be useful to wait for synchronous Javascript events that
        change the DOM.
        """
        started = time.time()
        while True:
            elapsed = time.time() - started
            if not elapsed < waittime:
                break
            self._events_loop()

    def close(self):
        """
        Close Browser instance and release resources.

        The method may be called more than once.
        """
        if getattr(self, "webview", None):
            self.destroy_webview()
        # Drop the references instead of deleting the attributes: a deleted
        # attribute would make later truth tests (for instance a second
        # close()) raise AttributeError.
        self.webview = None
        self.webpage = None

    #}
                      
    #{ Webview
    
    def create_webview(self, show=False):
        """
        Create a QWebView object and insert current QWebPage.

        @param show: If True, show the webview right after creating it.
        @raise SpynnerError: If a webview already exists.
        """
        if self.webview:
            raise SpynnerError("Cannot create webview (already initialized)")
        view = QWebView()
        self.webview = view
        view.setPage(self.webpage)
        toplevel = view.window()
        toplevel.setAttribute(Qt.WA_DeleteOnClose)
        # Get notified when the window is destroyed so the stale webview
        # reference can be dropped.
        toplevel.connect(toplevel, SIGNAL('destroyed(QObject *)'),
            self._on_webview_destroyed)
        if show:
            self.show()

    def destroy_webview(self):
        """
        Destroy current QWebView.

        @raise SpynnerError: If there is no webview.
        """
        if not self.webview:
            raise SpynnerError("Cannot destroy webview (not initialized)")
        # Drop the reference but keep the attribute: every other method
        # tests self.webview and would fail with AttributeError if the
        # attribute itself were deleted.
        self.webview = None

    def show(self):
        """
        Show webview browser.

        @raise SpynnerError: If there is no webview.
        """
        view = self.webview
        if not view:
            raise SpynnerError("Webview is not initialized")
        view.show()

    def hide(self):
        """
        Hide webview browser.

        @raise SpynnerError: If there is no webview.
        """
        view = self.webview
        if not view:
            raise SpynnerError("Webview is not initialized")
        view.hide()

    def browse(self):
        """
        Let the user browse the current page.

        The events loop is run until the webview goes away.

        @raise SpynnerError: If there is no webview.
        """
        if not self.webview:
            raise SpynnerError("Webview is not initialized")
        self.show()
        while True:
            if not self.webview:
                break
            self._events_loop()

    #}

    #{ Webframe

    def set_webframe_to_default(self):
        """Make the main frame of the page the current webframe again."""
        self.webframe = self.webpage.mainFrame()

    def set_webframe(self, framenumber):
        """
        Make a child frame of the current webframe the current webframe.

        The Javascript library is injected into the selected frame.

        @param framenumber: Index of the frame in the list of child frames
                            of the current webframe (anything C{int} accepts).
        @raise SpynnerError: If there is no such child frame.
        """
        children = self.webframe.childFrames()
        try:
            self.webframe = children[int(framenumber)]
        except (IndexError, TypeError, ValueError):
            # Bad index or a value that is not a number; other exceptions
            # (e.g. KeyboardInterrupt) are deliberately not swallowed.
            raise SpynnerError("childframe does not exist")

        # Inject jquery into frame
        jscode = "var %s = jQuery.noConflict();" % self.jslib
        self.runjs(self.javascript + jscode, debug=False)

    #}
                        
    #{ Form manipulation
    
    def fill(self, selector, value):
        """
        Fill an input text with a string value using a jQuery selector.

        @param selector: jQuery selector of the input(s) to fill.
        @param value: String to set; it is used literally (quotes,
                      backslashes and line breaks are escaped).
        @raise SpynnerJavascriptError: If the selector matches nothing.
        """
        # The value ends up inside a single-quoted Javascript string. The
        # backslash goes first so the escapes added afterwards survive.
        escapes = (("\\", "\\\\"), ("'", "\\'"), ("\n", "\\n"), ("\r", "\\r"))
        escaped_value = value
        for char, replacement in escapes:
            escaped_value = escaped_value.replace(char, replacement)
        jscode = "%s('%s').val('%s')" % (self.jslib, selector, escaped_value)
        self._runjs_on_jquery("fill", jscode)

    def check(self, selector):
        """
        Check an input checkbox using a jQuery selector.

        @param selector: jQuery selector of the checkbox(es).
        @raise SpynnerJavascriptError: If the selector matches nothing.
        """
        template = "%s('%s').attr('checked', true)"
        self._runjs_on_jquery("check", template % (self.jslib, selector))

    def uncheck(self, selector):
        """
        Uncheck input checkbox using a jQuery selector.

        @param selector: jQuery selector of the checkbox(es).
        @raise SpynnerJavascriptError: If the selector matches nothing.
        """
        template = "%s('%s').attr('checked', false)"
        self._runjs_on_jquery("uncheck", template % (self.jslib, selector))

    def choose(self, selector, value):
        """
        Choose a radio input using a jQuery selector.

        @param selector: jQuery selector of the radio inputs.
        @param value: Value of the radio input to click (single quotes
                      are escaped).
        @raise SpynnerJavascriptError: If nothing is matched.
        """
        template = "%s('%s').filter('[value=%s]').simulate('click')"
        quoted_value = value.replace("'", "\\'")
        jscode = template % (self.jslib, selector, quoted_value)
        self._runjs_on_jquery("choose", jscode)


    def select(self, selector):
        """
        Choose a option in a select using a jQuery selector.

        @param selector: jQuery selector of the option(s) to select.
        @raise SpynnerJavascriptError: If the selector matches nothing.
        """
        template = "%s('%s').attr('selected', 'selected')"
        self._runjs_on_jquery("select", template % (self.jslib, selector))
    
    submit = click_link
      
    #}
    
    #{ Javascript 
    
    def runjs(self, jscode, debug=True):
        """
        Inject Javascript code into the current context of page.

        @param jscode: Javascript code to injected.
        @param debug: Set to False to disable debug output for this injection.
        @return: Result of evaluating the code in the current webframe.

        You can call Jquery even if the original page does not include it
        as Spynner injects the library for every loaded page. You must
        use C{jq(...)} instead of of C{jQuery} or the common {$(...)}
        shortcut.

        @note: You can change the jq alias (see L{jslib}).
        @note: When the first result is not valid, the code is evaluated
               a second time and that second result is returned.
        """
        if debug:
            self._debug(DEBUG, "Run Javascript code: %s" % jscode)

        #XXX evaluating JS twice must be wrong but finding the bug is proving tricky...
        #JavaScriptCore/interpreter/Interpreter.cpp and JavaScriptCore/runtime/Completion.cpp
        #JavaScriptCore/runtime/Completion.cpp is catching an exception (sometimes) and
        #returning "TypeError: Type error" - BUT it looks like the JS does complete after
        #the function has already returned
        frame = self.webframe
        result = frame.evaluateJavaScript(jscode)
        if not result.isValid():
            result = frame.evaluateJavaScript(jscode)
        return result

    def set_javascript_confirm_callback(self, callback):
        """
        Set function callback for Javascript confirm pop-ups.

        Without a callback, confirmations are handed over to the default
        QWebPage handler. If the webpage you are working with pops
        Javascript confirmations, be sure to set a callback for them.

        Callback signature: C{javascript_confirm_callback(url, message)}

            - url: Url where the popup was launched.
            - message: String message.

        The callback should return a boolean (True meaning 'yes', False meaning 'no')

        @param callback: Callback to store (replaces any previous one).
        """
        self._javascript_confirm_callback = callback

    def set_javascript_prompt_callback(self, callback):
        """
        Set function callback for Javascript prompt.

        Without a callback, prompts are handed over to the default QWebPage
        handler. If the webpage you are working with pops Javascript
        prompts, be sure to set a callback for them.

        Callback signature: C{javascript_prompt_callback(url, message, defaultvalue)}

            - url: Url where the popup prompt was launched.
            - message: String message.
            - defaultvalue: Default value for prompt answer

        The callback should return a string with the answer or None
        (or False) to cancel the prompt.

        @param callback: Callback to store (replaces any previous one).
        """
        self._javascript_prompt_callback = callback

    #}
    
    #{ Cookies
    
    def get_cookies(self):
        """
        Return string containing the current cookies in Mozilla format.

        @return: Result of C{mozillaCookies()} on the browser cookie jar.
        """
        return self.cookiesjar.mozillaCookies()

    def set_cookies(self, string_cookies):
        """
        Set cookies from a string with Mozilla-format cookies.

        @param string_cookies: String with the cookies.
        @return: Result of C{setMozillaCookies()} on the browser cookie jar.
        """
        return self.cookiesjar.setMozillaCookies(string_cookies)

    #}

    #{ Proxies

    def get_proxy(self):
        """
        Return the proxy currently used by the network manager.

        @return: The object returned by the manager C{proxy()} method
                 (the same kind of value L{set_proxy} returns).
        """
        return self.manager.proxy()

    def set_proxy(self, string_proxy):
        """
        Set the proxy used by the network manager.

        @param string_proxy: Proxy URL with the format
            [http|socks5]://username:password@hostname:port. Any other
            scheme disables the proxy.
        @return: The proxy now used by the network manager.
        """
        urlinfo = urlparse.urlparse(string_proxy)
        # Named enum members instead of their integer values (1, 3 and 2).
        proxy_types = {
            'socks5': QNetworkProxy.Socks5Proxy,
            'http': QNetworkProxy.HttpProxy,
        }

        proxy = QNetworkProxy()
        proxy_type = proxy_types.get(urlinfo.scheme)
        if proxy_type is None:
            proxy.setType(QNetworkProxy.NoProxy)
        else:
            proxy.setType(proxy_type)
            proxy.setHostName(urlinfo.hostname)
            proxy.setPort(urlinfo.port)
            # Missing credentials are set as empty strings.
            if urlinfo.username is not None:
                proxy.setUser(urlinfo.username)
            else:
                proxy.setUser('')
            if urlinfo.password is not None:
                proxy.setPassword(urlinfo.password)
            else:
                proxy.setPassword('')

        self.manager.setProxy(proxy)
        return self.manager.proxy()
      
    #}
    
    #{ Download files
                
    def download(self, url, outfd=None):
        """
        Download a given URL using current cookies.

        @param url: URL or path to download
        @param outfd: Output file-like stream. If None, return data string.
        @return: If C{outfd} is given, bytes downloaded (None if something
                 went wrong). Otherwise, a string with the downloaded data.
        @raise SpynnerError: If the reply reports an error right away.
        @note: If url is a path, the current base URL will be pre-appended.
        """
        def _on_reply(reply):
            self._download_reply_status = not bool(reply.error())
        self._download_reply_status = None
        if not urlparse.urlsplit(url).scheme:
            url = urlparse.urljoin(self.url, url)
        request = QNetworkRequest(QUrl(url))
        # Create a new manager to process this download
        manager = QNetworkAccessManager()
        # The cookie jar must be in place before the request is created,
        # otherwise the request goes out without the current cookies.
        cookiejar = self.manager.cookieJar()
        manager.setCookieJar(cookiejar)
        # setCookieJar makes the temporary manager the owner of the jar;
        # hand it back so the shared jar is not destroyed with it.
        cookiejar.setParent(self.manager)
        manager.connect(manager, SIGNAL('finished(QNetworkReply *)'), _on_reply)
        reply = manager.get(request)
        if reply.error():
            raise SpynnerError("Download error: %s" % reply.errorString())
        reply.downloaded_nbytes = 0
        # Compare against None: a stream object may well be falsy.
        outfd_set = outfd is not None
        if not outfd_set:
            outfd = StringIO()
        self._start_download(reply, outfd)
        while self._download_reply_status is None:
            self._events_loop()
        if outfd_set:
            return (reply.downloaded_nbytes if not reply.error() else None)
        else:
            return outfd.getvalue()
    
    #}
            
    #{ HTML and tag soup parsing
    
    def set_html_parser(self, parser):
        """
        Set HTML parser used to generate the HTML soup.

        @param parser: Callback called with the current HTML to generate
                       the soup.

        @note: The parser is stored and used by C{_get_soup}. The L{soup}
               class attribute is a plain attribute (initially None) and
               is not updated by this method.
        """
        self._html_parser = parser

    def html_contains(self, regexp):
        """
        Search the current HTML for a regular expression.

        @param regexp: Regular expression (as accepted by C{re.search}).
        @return: True if the current HTML contains a match, False otherwise.
        """
        found = re.search(regexp, self.html)
        return found is not None

    #}

    #{ HTTP Authentication
     
    def set_http_authentication_callback(self, callback):
        """
        Set HTTP authentication request callback.

        The callback must have this signature:

        C{http_authentication_callback(url, realm)}:

            - C{url}: URL where the request was made.
            - C{realm}: Realm requiring authentication.

        The callback should return a pair of strings containing
        (user, password) or None if you don't want to answer.

        @param callback: Callback to store (replaces any previous one).
        """
        self._http_authentication_callback = callback
    
    #}
             
    #{ Miscellaneous
    
    def snapshot(self, box=None, format=QImage.Format_ARGB32):
        """
        Render the main frame of the current page into an image.

        @param box: 4-element tuple (x1, y1, x2, y2) with the area to capture.
                    If None, an image of the viewport size is returned.
        @param format: QImage format (see QImage::Format_*).
        @return: A QImage image.

        Typical usage:

        >>> browser.load(url)
        >>> browser.snapshot().save("webpage.png")
        """
        crop = None
        if box:
            x1, y1, x2, y2 = box
            crop = (x1, y1, x2 - x1, y2 - y1)
            # The image spans from the origin to the bottom-right corner of
            # the box; the requested area is copied out of it afterwards.
            size = QSize(x2, y2)
        else:
            size = self.webpage.viewportSize()
        rendered = QImage(size, format)
        painter = QPainter(rendered)
        self.webpage.mainFrame().render(painter)
        painter.end()
        if crop is None:
            return rendered
        return rendered.copy(*crop)
            
    def get_url_from_path(self, path):
        """
        Resolve a path against the URL of the current page.

        @param path: Path (or full URL) to resolve.
        @return: Result of joining the current URL with C{path}.
        """
        current_url = self.url
        return urlparse.urljoin(current_url, path)

    def set_url_filter(self, url_filter):
        """
        Set function callback to filter URL.

        By default all requested elements of a page are loaded. That includes
        stylesheets, images and many other elements that you may not need at
        all. Use this method to define the callback that will be called every
        time a new request is made. The callback must have this signature:

        C{my_url_filter(operation, url)}:

            - C{operation}: string with HTTP operation: C{get}, C{head},
                            C{post} or C{put}.
            - C{url}: requested item URL.

        It should return C{True} (proceed) or C{False} (reject). Only an
        explicit C{False} rejects the request, in which case it is
        redirected to C{about:blank}.

        @param url_filter: Callback to store (replaces any previous filter).
        """
        self._url_filter = url_filter
Example #5
0
class WK2pdf(object):
    """Render one or more web pages to a PDF file through QtWebKit."""

    # Default value of every supported option; keyword arguments given to the
    # constructor override these entries.
    option_defaults = {
        "disable_javascript": False,
        "dpi": -1,
        "colorMode": True,
        "resolution": False,
        "margin_bottom": '10mm',
        "margin_left": '10mm',
        "margin_right": '10mm',
        "margin_top": '10mm',
        "background": True,
        "user_style_sheet": '',
        "orientation": 'V',
        "pageSize": 'A4',
        "proxy": False,
        "quiet": True,
        "jsredirectwait": 200,
        "enable_plugins": False,
        "use_x11": False
    }
    # Page size name -> QPrinter page size constant.
    page_sizes = {
        'A0': QPrinter.A0, 'B1': QPrinter.B1, 'A5': QPrinter.A5, 'B6': QPrinter.B6,
        'A1': QPrinter.A1, 'B2': QPrinter.B2, 'A6': QPrinter.A6, 'B7': QPrinter.B7,
        'A2': QPrinter.A2, 'B3': QPrinter.B3, 'A7': QPrinter.A7, 'B8': QPrinter.B8,
        'A3': QPrinter.A3, 'B4': QPrinter.B4, 'A8': QPrinter.A8, 'B9': QPrinter.B9,
        'A4': QPrinter.A4, 'B5': QPrinter.B5, 'A9': QPrinter.A9, 'B0': QPrinter.B0,
        # 'DLE' used to be mapped to QPrinter.C5E, which selected the wrong
        # envelope size.
        'C5E': QPrinter.C5E, 'COMM10E': QPrinter.Comm10E, 'DLE': QPrinter.DLE,
        'Executive': QPrinter.Executive, 'Folio': QPrinter.Folio, 'Ledger': QPrinter.Ledger,
        'Legal': QPrinter.Legal, 'Letter': QPrinter.Letter, 'Tabloid': QPrinter.Tabloid
    }

    def __init__(self, *args, **kwargs):
        """
        Create the Qt application and store the conversion settings.

        @param args: One or more inputs followed by exactly one output; the
                     last positional argument is taken as the output.
        @param kwargs: Option overrides; see C{option_defaults} for the
                       recognised keys and their default values.
        @raise Exception: If fewer than two positional arguments are given.
        """
        self.app = QApplication(sys.argv)
        # Merge the overrides into a per-instance copy. Updating the
        # class-level dict in place would leak the options of one instance
        # into every instance created afterwards.
        options = dict(self.option_defaults)
        options.update(kwargs)
        self.option_defaults = options
        margin_keys = ('margin_left', 'margin_right', 'margin_top', 'margin_bottom')
        for k, v in options.items():
            # Textual option values are converted to their Qt representation.
            if isinstance(v, basestring):
                if k in margin_keys:
                    v = self.parseUnitReal(v)
                if k == 'pageSize':
                    v = self.setPageSize(v)
                if k == 'orientation':
                    v = self.setOrientation(v)
            if k == 'colorMode':
                v = self.setColorMode(v)
            setattr(self, k, v)

        if len(args) < 2:
            # quit() accepts no argument; exit() is the call that carries a
            # return code.
            self.app.exit(1)
            raise Exception(
                    'You need to specify atleast one input file, and exactly one output file\nUse - for stdin or stdout\n\n')
        args = list(args)
        self.out = args.pop(-1)
        self.in_ = args
        self.pageNum = 0
        self.pages = []
        self.am = QNetworkAccessManager()
        self.am.connect(self.am, SIGNAL("sslErrors(QNetworkReply*, const QList<QSslError>&)"), self.sslErrors)
        # Number of page loads currently in progress.
        self.loading = 0
        # self.quiet keeps the value set from the options above; it used to
        # be forced to True here, which made the "quiet" option a no-op.

    def guessUrlFromString(self, url):
        """
        Turn a user supplied string into a QUrl.

        @param url: URL, host (optionally with port), or local file path.
        @return: A QUrl, or None when the string matches the URL pattern but
                 yields an invalid QUrl, or when it is neither an existing
                 file nor contains a dot.
        """
        candidate = url.strip()
        # A bare host[:port] gets an explicit http scheme first.
        if re.compile(HOST_PORT_RE).match(candidate):
            candidate = 'http://%s' % candidate
        if re.compile(URL_RE).match(candidate):
            qurl = QUrl(candidate, QUrl.TolerantMode)
            if qurl.isValid():
                return qurl
            # No further guessing once the URL pattern has matched.
            return None
        if os.path.isfile(candidate):
            return QUrl.fromLocalFile(os.path.abspath(candidate))
        pieces = candidate.split('.')
        if len(pieces) <= 1:
            return None
        # Names starting with "ftp." are assumed to be FTP hosts.
        if pieces[0].lower() == 'ftp':
            scheme = 'ftp'
        else:
            scheme = 'http'
        return QUrl('%s://%s' % (scheme, candidate), QUrl.TolerantMode)

    def loadDefaults(self):
        """Reset the page size to A4 and the orientation to portrait."""
        self.pageSize, self.orientation = QPrinter.A4, QPrinter.Portrait

    def setPageSize(self, size):
        """
        Select the page size by name, ignoring case.

        @param size: Page size name, matched against the keys of
                     C{page_sizes}.
        @return: The matching page size constant, or None if the name is
                 unknown. The same value is stored in C{self.pageSize}.
        """
        wanted = size.upper()
        self.pageSize = None
        # Compare upper-cased names on both sides: some keys of page_sizes
        # are mixed case ('Letter', 'Legal', ...), so looking up
        # size.upper() directly could never find them.
        for name, page_size in self.page_sizes.items():
            if name.upper() == wanted:
                self.pageSize = page_size
                break
        return self.pageSize

    def setOrientation(self, orientation):
        """
        Map an orientation name to the QPrinter constant.

        @param orientation: 'l' or 'landscape' (any case) selects landscape;
                            every other value selects portrait.
        @return: C{QPrinter.Landscape} or C{QPrinter.Portrait}.
        """
        is_landscape = orientation.lower() in ('l', 'landscape')
        return QPrinter.Landscape if is_landscape else QPrinter.Portrait

    def setColorMode(self, colorMode):
        """
        Map a truth value to the QPrinter color mode constant.

        @param colorMode: Truthy for color output, falsy for gray scale.
        @return: C{QPrinter.Color} or C{QPrinter.GrayScale}.
        """
        return QPrinter.Color if colorMode else QPrinter.GrayScale

    def parseUnitReal(self, string):
        """
        Parse a length such as C{'10mm'} into a number and a QPrinter unit.

        Metric lengths (mm, cm, m, or no unit at all) are converted to
        millimeters. The other recognised units keep their number and are
        returned with the matching QPrinter unit. An unrecognised unit name
        is treated as millimeters.

        @param string: Text to parse; it is matched against C{UNIT_RE}.
        @return: Tuple C{(value, unit)}, or None if the text does not match.
        """
        parsed = re.compile(UNIT_RE).match(string)
        if not parsed:
            return None
        number = float(parsed.group(1))
        # An unmatched optional group is None; treat it like "no unit".
        unit_text = (parsed.group(2) or '').lower()

        # Units reported to Qt as they are, without rescaling. The point
        # entries used to be unreachable: the old check compared the QPrinter
        # constant, not the parsed unit text, against 'pt'/'point'.
        native_units = {
            'didot': QPrinter.Didot,
            'in': QPrinter.Inch, 'inch': QPrinter.Inch,
            'pc': QPrinter.Pica, 'pica': QPrinter.Pica,
            'cicero': QPrinter.Cicero,
            'px': QPrinter.DevicePixel, 'pixel': QPrinter.DevicePixel,
            'pt': QPrinter.Point, 'point': QPrinter.Point,
        }
        if unit_text in native_units:
            return number * 1.0, native_units[unit_text]

        # Metric units are all expressed in millimeters.
        millimeters_per_unit = {
            '': 1.0, 'mm': 1.0, 'millimeter': 1.0,
            'cm': 10.0, 'centimeter': 10.0,
            'm': 1000.0, 'meter': 1000.0,
        }
        scale = millimeters_per_unit.get(unit_text, 1.0)
        return number * scale, QPrinter.Millimeter


    def resetPages(self):
        """Replace C{self.pages} and C{self.pageStart} with new empty lists."""
        self.pages, self.pageStart = [], []

    def hfreplace(self, q):
        """
        Apply the header/footer placeholder substitutions to a string.

        The substitution list is currently empty, so the input is returned
        unchanged.

        @param q: Text to process.
        @return: The text after all substitutions.
        """
        # Placeholders that are not wired up yet: [page], [toPage],
        # [fromPage], [section], [subsection], [subsubsection], [webpage].
        substitutions = []
        result = q
        for placeholder, value in substitutions:
            result = result.replace(placeholder, value)
        return result

    def newPage(self, printer, f, t, p):
        """
        Count a new page and bracket the (not yet written) header/footer code.

        @param printer: Printer whose paint engine and page rectangle are used.
        @param f: Unused.
        @param t: Unused.
        @param p: Unused.
        """
        self.pageNum = self.pageNum + 1
        painter = printer.paintEngine().painter()
        painter.save()
        painter.resetMatrix()
        # Page dimensions, fetched for the header/footer drawing that is
        # still to be written.
        page_height = printer.pageRect().height()
        page_width = printer.pageRect().width()
        # do stuff like header and footer
        painter.restore()


    def run(self):
        """
        Create a web page for every input and start loading it.

        An input of C{'-'} is read from standard input into a temporary
        HTML file. Each page is appended to C{self.pages}; the connected
        signal handlers track loading from there.
        """
        # Temporary files must outlive this method: loading is asynchronous,
        # and a NamedTemporaryFile is deleted as soon as it is closed or
        # garbage collected.
        if not hasattr(self, '_stdin_files'):
            self._stdin_files = []
        for in_url in self.in_:
            webpage = QWebPage()
            webpage.setNetworkAccessManager(self.am)
            webpage.connect(webpage, SIGNAL('loadProgress(int)'), self.loadProgress)
            webpage.connect(webpage, SIGNAL('loadFinished(bool)'), self.loadFinished)
            webpage.connect(webpage, SIGNAL('loadStarted()'), self.loadStarted)
            settings = webpage.settings()
            settings.setAttribute(QWebSettings.JavaEnabled, self.enable_plugins)
            settings.setAttribute(QWebSettings.JavascriptEnabled, not self.disable_javascript)
            settings.setAttribute(QWebSettings.JavascriptCanOpenWindows, False)
            settings.setAttribute(QWebSettings.JavascriptCanAccessClipboard, False)
            settings.setAttribute(QWebSettings.PrintElementBackgrounds, self.background)
            settings.setAttribute(QWebSettings.PluginsEnabled, self.enable_plugins)
            if self.user_style_sheet:
                settings.setUserStyleSheetUrl(self.guessUrlFromString(self.user_style_sheet))
            url = in_url
            if url == '-':
                tmp = tempfile.NamedTemporaryFile(prefix='tmp', suffix='.html')
                tmp.write(sys.stdin.read())
                # Push the buffered data to disk so the page can read it.
                tmp.flush()
                self._stdin_files.append(tmp)
                url = tmp.name
            url = self.guessUrlFromString(url)
            webpage.mainFrame().load(url)
            self.pages.append(webpage)


    def sslErrors(self, reply, error):
        """
        Handle SSL errors by ignoring them.

        @param reply: Reply on which C{ignoreSslErrors()} is called.
        @param error: Reported errors; not inspected.
        """
        # NOTE(review): every SSL error is ignored, whatever its cause.
        reply.ignoreSslErrors()

    def printPage(self):
        """
        Print all collected pages to C{self.out} as PDF and quit the app.

        Does nothing while a page is still loading. The pages are removed
        from C{self.pages} as they are printed.

        @raise Exception: If the margins use different units, or if the
                          printer is not valid for the output file.
        """
        if self.loading:
            return
        printer = QPrinter()
        printer.setOutputFormat(QPrinter.PdfFormat)
        printer.setOutputFileName(self.out)
        self.pageNum = 0
        self.currentPage = 0
        # Each margin is a (value, unit) pair; only one unit can be passed on.
        if not (self.margin_left[1] == self.margin_right[1] == self.margin_top[1] == self.margin_bottom[1]):
            raise Exception("Currently all margin units must be the same!")
        printer.setPageMargins(self.margin_left[0], self.margin_top[0], self.margin_right[0], self.margin_bottom[0],
                               self.margin_left[1])
        printer.setPageSize(self.pageSize)
        printer.setOrientation(self.orientation)
        printer.setColorMode(self.colorMode)
        if not printer.isValid():
            # quit() accepts no argument; exit() is the call that carries a
            # return code.
            self.app.exit(1)
            raise Exception("Unable to write to output file")
        while self.pages:
            page = self.pages.pop(0)
            page.mainFrame().print_(printer)
            # Start a fresh sheet between documents, but not after the last.
            if self.pages:
                printer.newPage()
        self.app.quit()

    def loadFinished(self, ok):
        """
        Note that one page load ended and schedule printing when none remain.

        @param ok: Load result; not inspected, so a failed load is not
                   treated as an error.
        """
        self.loading -= 1
        if not self.quiet:
            print("Waiting for redirect")

        if self.loading != 0:
            return
        # Wait self.jsredirectwait before printing.
        QTimer.singleShot(self.jsredirectwait, self.printPage)


    def loadStarted(self):
        """Note that one more page load is in progress."""
        self.loading = self.loading + 1

    def loadProgress(self, progress):
        """
        Report the loading progress unless running quietly.

        @param progress: Progress value, formatted as an integer.
        """
        if self.quiet:
            return
        print("Loading page: %02i\r" % progress)

    def exec_(self):
        """
        Run the application event loop.

        @return: Whatever C{self.app.exec_()} returns.
        """
        exit_code = self.app.exec_()
        return exit_code