class SplashQWebPage(QWebPage): """ QWebPage subclass that: * changes user agent; * logs JS console messages; * handles alert and confirm windows; * returns additional info about render errors; * logs HAR events. """ error_info = None custom_user_agent = None custom_headers = None skip_custom_headers = False navigation_locked = False def __init__(self, verbosity=0): super(QWebPage, self).__init__() self.verbosity = verbosity self.har_log = HarLog() self.cookiejar = SplashCookieJar(self) self.mainFrame().urlChanged.connect(self.onUrlChanged) self.mainFrame().titleChanged.connect(self.onTitleChanged) self.mainFrame().loadFinished.connect(self.onLoadFinished) self.mainFrame().initialLayoutCompleted.connect(self.onLayoutCompleted) def onTitleChanged(self, title): self.har_log.store_title(title) def onUrlChanged(self, url): self.har_log.store_url(url.toString()) def onLoadFinished(self, ok): self.har_log.store_timing("onLoad") def onLayoutCompleted(self): self.har_log.store_timing("onContentLoad") def acceptNavigationRequest(self, webFrame, networkRequest, navigationType): if self.navigation_locked: return False self.error_info = None return super(SplashQWebPage, self).acceptNavigationRequest(webFrame, networkRequest, navigationType) def javaScriptAlert(self, frame, msg): return def javaScriptConfirm(self, frame, msg): return False def javaScriptConsoleMessage(self, msg, line_number, source_id): if self.verbosity >= 2: log.msg("JsConsole(%s:%d): %s" % (source_id, line_number, msg), system='render') def userAgentForUrl(self, url): if self.custom_user_agent is None: return super(SplashQWebPage, self).userAgentForUrl(url) else: return self.custom_user_agent # loadFinished signal handler receives ok=False at least these cases: # 1. when there is an error with the page (e.g. the page is not available); # 2. when a redirect happened before all related resource are loaded; # 3. when page sends headers that are not parsed correctly # (e.g. a bad Content-Type). # By implementing ErrorPageExtension we can catch (1) and # distinguish it from (2) and (3). def extension(self, extension, info=None, errorPage=None): if extension == QWebPage.ErrorPageExtension: # catch the error, populate self.errorInfo and return an error page info = sip.cast(info, QWebPage.ErrorPageExtensionOption) domain = 'Unknown' if info.domain == QWebPage.QtNetwork: domain = 'Network' elif info.domain == QWebPage.Http: domain = 'HTTP' elif info.domain == QWebPage.WebKit: domain = 'WebKit' self.error_info = RenderErrorInfo( domain, int(info.error), unicode(info.errorString), unicode(info.url.toString()) ) # XXX: this page currently goes nowhere content = u""" <html><head><title>Failed loading page</title></head> <body> <h1>Failed loading page ({0.text})</h1> <h2>{0.url}</h2> <p>{0.type} error #{0.code}</p> </body></html>""".format(self.error_info) errorPage = sip.cast(errorPage, QWebPage.ErrorPageExtensionReturn) errorPage.content = QByteArray(content.encode('utf-8')) return True # XXX: this method always returns True, even if we haven't # handled the extension. Is it correct? When can this method be # called with extension which is not ErrorPageExtension if we # are returning False in ``supportsExtension`` for such extensions? return True def supportsExtension(self, extension): if extension == QWebPage.ErrorPageExtension: return True return False def maybe_redirect(self, load_finished_ok): """ Return True if the current webpage state looks like a redirect. Use this function from loadFinished handler to ignore spurious signals. FIXME: This can return True if server returned incorrect Content-Type header, but there is no an additional loadFinished signal in this case. """ return not load_finished_ok and self.error_info is None def is_ok(self, load_finished_ok): return load_finished_ok and self.error_info is None def error_loading(self, load_finished_ok): return load_finished_ok and self.error_info is not None
class HarBuilder(object): """ Splash-specific HAR builder class. It knows how to update timings based on events available in QT. Also it maintains a history of browser URL changes. """ REQUEST_CREATED = "created" REQUEST_FINISHED = "finished" REQUEST_HEADERS_RECEIVED = "headers_received" def __init__(self): self.log = HarLog() self.history = [] def todict(self): """ Return HAR log as a Python dict. """ return self.log.todict() def get_history(self): """ Get a history of browser URL changes """ return copy.deepcopy(self.history) def reset(self): """ Start building a new HAR log """ self.log = HarLog() def get_last_http_status(self): """ Return HTTP status code of the currently loaded webpage or None if it is not available. """ if not self.history: return try: return self.history[-1]["response"]["status"] except KeyError: return def get_entry(self, req_id): """ Return HAR entry for a given req_id """ if not self.log.has_entry(req_id): return entry = self.log.get_mutable_entry(req_id) return entry def _initial_entry_data(self, start_time, operation, request, outgoingData): """ Return initial values for a new HAR entry. """ return { # custom fields "_tmp": { "start_time": start_time, "request_start_sending_time": start_time, "request_sent_time": start_time, "response_start_time": start_time, # 'outgoingData': outgoingData, }, "_splash_processing_state": self.REQUEST_CREATED, # standard fields "startedDateTime": format_datetime(start_time), "request": request2har(request, operation, outgoingData), "response": {"bodySize": -1}, "cache": {}, "timings": {"blocked": -1, "dns": -1, "connect": -1, "ssl": -1, "send": 0, "wait": 0, "receive": 0}, "time": 0, } def store_title(self, title): self.log.store_title(title) def store_url(self, url): if hasattr(url, "toString"): url = url.toString() self.log.store_url(url) def store_timing(self, name): self.log.store_timing(name) def store_new_request(self, req_id, start_time, operation, request, outgoingData): """ Store information about a new QNetworkRequest. """ entry = self.log.get_mutable_entry(req_id, create=True) entry.update( self._initial_entry_data( start_time=start_time, operation=operation, request=request, outgoingData=outgoingData ) ) def store_new_reply(self, req_id, reply): """ Store initial reply information. """ if not self.log.has_entry(req_id): return entry = self.log.get_mutable_entry(req_id) entry["response"].update(reply2har(reply)) def store_reply_finished(self, req_id, reply, content): """ Store information about a finished reply. """ if not self.log.has_entry(req_id): return entry = self.log.get_mutable_entry(req_id) entry["_splash_processing_state"] = self.REQUEST_FINISHED # update timings now = datetime.utcnow() start_time = entry["_tmp"]["start_time"] response_start_time = entry["_tmp"]["response_start_time"] receive_time = get_duration(response_start_time, now) total_time = get_duration(start_time, now) entry["timings"]["receive"] = receive_time entry["time"] = total_time if not entry["timings"]["send"]: wait_time = entry["timings"]["wait"] entry["timings"]["send"] = total_time - receive_time - wait_time if entry["timings"]["send"] < 1e-6: entry["timings"]["send"] = 0 # update other reply information entry["response"].update(reply2har(reply, content=content)) def store_reply_headers_received(self, req_id, reply): """ Update reply information when HTTP headers are received. """ if not self.log.has_entry(req_id): return entry = self.log.get_mutable_entry(req_id) if entry["_splash_processing_state"] == self.REQUEST_FINISHED: # self.log("Headers received for {url}; ignoring", reply, # min_level=3) return entry["_splash_processing_state"] = self.REQUEST_HEADERS_RECEIVED entry["response"].update(reply2har(reply)) now = datetime.utcnow() request_sent = entry["_tmp"]["request_sent_time"] entry["_tmp"]["response_start_time"] = now entry["timings"]["wait"] = get_duration(request_sent, now) def store_reply_download_progress(self, req_id, received, total): """ Update reply information when new data is available """ if not self.log.has_entry(req_id): return entry = self.log.get_mutable_entry(req_id) entry["response"]["bodySize"] = int(received) def store_request_upload_progress(self, req_id, sent, total): """ Update request information when outgoing data is sent. """ if not self.log.has_entry(req_id): return entry = self.log.get_mutable_entry(req_id) entry["request"]["bodySize"] = int(sent) now = datetime.utcnow() if sent == 0: # it is a moment the sending is started start_time = entry["_tmp"]["request_start_time"] entry["_tmp"]["request_start_sending_time"] = now entry["timings"]["blocked"] = get_duration(start_time, now) entry["_tmp"]["request_sent_time"] = now if sent == total: entry["_tmp"]["response_start_time"] = now start_sending_time = entry["_tmp"]["request_start_sending_time"] entry["timings"]["send"] = get_duration(start_sending_time, now) def store_redirect(self, url): """ Update history when redirect happens """ cause_ev = self.log._prev_entry(url, last_idx=-1) if cause_ev: self.history.append(cleaned_har_entry(cause_ev.data))
class SplashQWebPage(QWebPage): """ QWebPage subclass that: * changes user agent; * logs JS console messages; * handles alert and confirm windows; * returns additional info about render errors; * logs HAR events. """ error_info = None custom_user_agent = None def __init__(self, verbosity=0): super(QWebPage, self).__init__() self.verbosity = verbosity self.har_log = HarLog() self.mainFrame().urlChanged.connect(self.onUrlChanged) self.mainFrame().titleChanged.connect(self.onTitleChanged) self.mainFrame().loadFinished.connect(self.onLoadFinished) self.mainFrame().initialLayoutCompleted.connect(self.onLayoutCompleted) def onTitleChanged(self, title): self.har_log.store_title(title) def onUrlChanged(self, url): self.har_log.store_url(url.toString()) def onLoadFinished(self, ok): self.har_log.store_timing("onLoad") def onLayoutCompleted(self): self.har_log.store_timing("onContentLoad") def javaScriptAlert(self, frame, msg): return def javaScriptConfirm(self, frame, msg): return False def javaScriptConsoleMessage(self, msg, line_number, source_id): if self.verbosity >= 2: log.msg("JsConsole(%s:%d): %s" % (source_id, line_number, msg), system='render') def userAgentForUrl(self, url): if self.custom_user_agent is None: return super(SplashQWebPage, self).userAgentForUrl(url) else: return self.custom_user_agent # loadFinished signal handler receives ok=False at least these cases: # 1. when there is an error with the page (e.g. the page is not available); # 2. when a redirect happened before all related resource are loaded; # 3. when page sends headers that are not parsed correctly # (e.g. a bad Content-Type). # By implementing ErrorPageExtension we can catch (1) and # distinguish it from (2) and (3). def extension(self, extension, info=None, errorPage=None): if extension == QWebPage.ErrorPageExtension: # catch the error, populate self.errorInfo and return an error page info = sip.cast(info, QWebPage.ErrorPageExtensionOption) domain = 'Unknown' if info.domain == QWebPage.QtNetwork: domain = 'Network' elif info.domain == QWebPage.Http: domain = 'HTTP' elif info.domain == QWebPage.WebKit: domain = 'WebKit' self.error_info = RenderErrorInfo(domain, int(info.error), unicode(info.errorString), unicode(info.url.toString())) # XXX: this page currently goes nowhere content = u""" <html><head><title>Failed loading page</title></head> <body> <h1>Failed loading page ({0.text})</h1> <h2>{0.url}</h2> <p>{0.type} error #{0.code}</p> </body></html>""".format(self.error_info) errorPage = sip.cast(errorPage, QWebPage.ErrorPageExtensionReturn) errorPage.content = QByteArray(content.encode('utf-8')) return True # XXX: this method always returns True, even if we haven't # handled the extension. Is it correct? When can this method be # called with extension which is not ErrorPageExtension if we # are returning False in ``supportsExtension`` for such extensions? return True def supportsExtension(self, extension): if extension == QWebPage.ErrorPageExtension: return True return False def maybe_redirect(self, load_finished_ok): """ Return True if the current webpage state looks like a redirect. Use this function from loadFinished handler to ignore spurious signals. FIXME: This can return True if server returned incorrect Content-Type header, but there is no an additional loadFinished signal in this case. """ return not load_finished_ok and self.error_info is None def is_ok(self, load_finished_ok): return load_finished_ok and self.error_info is None def error_loading(self, load_finished_ok): return load_finished_ok and self.error_info is not None
class HarBuilder(object): """ Splash-specific HAR builder class. It knows how to update timings based on events available in QT. Also it maintains a history of browser URL changes. """ REQUEST_CREATED = "created" REQUEST_FINISHED = "finished" REQUEST_HEADERS_RECEIVED = "headers_received" def __init__(self): self.log = HarLog() self.history = [] def todict(self): """ Return HAR log as a Python dict. """ return self.log.todict() def get_history(self): """ Get a history of browser URL changes """ return copy.deepcopy(self.history) def reset(self): """ Start building a new HAR log """ self.log = HarLog() def get_last_http_status(self): """ Return HTTP status code of the currently loaded webpage or None if it is not available. """ if not self.history: return try: return self.history[-1]["response"]["status"] except KeyError: return def get_entry(self, req_id): """ Return HAR entry for a given req_id """ if not self.log.has_entry(req_id): return entry = self.log.get_mutable_entry(req_id) return entry def _initial_entry_data(self, start_time, operation, request, outgoingData): """ Return initial values for a new HAR entry. """ return { # custom fields '_tmp': { 'start_time': start_time, 'request_start_sending_time': start_time, 'request_sent_time': start_time, 'response_start_time': start_time, # 'outgoingData': outgoingData, }, '_splash_processing_state': self.REQUEST_CREATED, # standard fields "startedDateTime": format_datetime(start_time), "request": request2har(request, operation, outgoingData), "response": { "bodySize": -1, }, "cache": {}, "timings": { "blocked": -1, "dns": -1, "connect": -1, "ssl": -1, "send": 0, "wait": 0, "receive": 0, }, "time": 0, } def store_title(self, title): self.log.store_title(title) def store_url(self, url): if hasattr(url, 'toString'): url = url.toString() self.log.store_url(url) def store_timing(self, name): self.log.store_timing(name) def store_new_request(self, req_id, start_time, operation, request, outgoingData): """ Store information about a new QNetworkRequest. """ entry = self.log.get_mutable_entry(req_id, create=True) entry.update( self._initial_entry_data(start_time=start_time, operation=operation, request=request, outgoingData=outgoingData)) def store_new_reply(self, req_id, reply): """ Store initial reply information. """ if not self.log.has_entry(req_id): return entry = self.log.get_mutable_entry(req_id) entry["response"].update(reply2har(reply)) def store_reply_finished(self, req_id, reply): """ Store information about a finished reply. """ if not self.log.has_entry(req_id): return entry = self.log.get_mutable_entry(req_id) entry["_splash_processing_state"] = self.REQUEST_FINISHED # update timings now = datetime.utcnow() start_time = entry['_tmp']['start_time'] response_start_time = entry['_tmp']['response_start_time'] receive_time = get_duration(response_start_time, now) total_time = get_duration(start_time, now) entry["timings"]["receive"] = receive_time entry["time"] = total_time if not entry["timings"]["send"]: wait_time = entry["timings"]["wait"] entry["timings"]["send"] = total_time - receive_time - wait_time if entry["timings"]["send"] < 1e-6: entry["timings"]["send"] = 0 # update other reply information entry["response"].update(reply2har(reply, include_content=True)) def store_reply_headers_received(self, req_id, reply): """ Update reply information when HTTP headers are received. """ if not self.log.has_entry(req_id): return entry = self.log.get_mutable_entry(req_id) if entry["_splash_processing_state"] == self.REQUEST_FINISHED: # self.log("Headers received for {url}; ignoring", reply, # min_level=3) return entry["_splash_processing_state"] = self.REQUEST_HEADERS_RECEIVED entry["response"].update(reply2har(reply)) now = datetime.utcnow() request_sent = entry["_tmp"]["request_sent_time"] entry["_tmp"]["response_start_time"] = now entry["timings"]["wait"] = get_duration(request_sent, now) def store_reply_download_progress(self, req_id, received, total): """ Update reply information when new data is available """ if not self.log.has_entry(req_id): return entry = self.log.get_mutable_entry(req_id) entry["response"]["bodySize"] = int(received) def store_request_upload_progress(self, req_id, sent, total): """ Update request information when outgoing data is sent. """ if not self.log.has_entry(req_id): return entry = self.log.get_mutable_entry(req_id) entry["request"]["bodySize"] = int(sent) now = datetime.utcnow() if sent == 0: # it is a moment the sending is started start_time = entry["_tmp"]["request_start_time"] entry["_tmp"]["request_start_sending_time"] = now entry["timings"]["blocked"] = get_duration(start_time, now) entry["_tmp"]["request_sent_time"] = now if sent == total: entry["_tmp"]["response_start_time"] = now start_sending_time = entry["_tmp"]["request_start_sending_time"] entry["timings"]["send"] = get_duration(start_sending_time, now) def store_redirect(self, url): """ Update history when redirect happens """ cause_ev = self.log._prev_entry(url, last_idx=-1) if cause_ev: self.history.append(cleaned_har_entry(cause_ev.data))