def __init__(self, gui=False, traffic_rules=None): if not WKitScope.app: raise InternalError('You should use Browser instance' ' inside `with WKitScope():` block') self.app = WKitScope.app self.manager = WKitNetworkAccessManager(traffic_rules=traffic_rules) self.manager.finished.connect(self.handle_finished_network_reply) self.cookie_jar = QNetworkCookieJar() self.manager.setCookieJar(self.cookie_jar) self.page = WKitWebPage() self.page.setNetworkAccessManager(self.manager) self.page.loadFinished.connect(self.handle_page_load_finished) self.view = WKitWebView() self.view.setPage(self.page) self.view.setApplication(self.app) self._response = None self.gui = gui if gui: self.view.show()
class Browser(MouseMixin, PositionMixin, WaitMixin, JavaScriptMixin): def __init__(self, gui=False, traffic_rules=None): if not WKitScope.app: raise InternalError('You should use Browser instance' ' inside `with WKitScope():` block') self.app = WKitScope.app self.manager = WKitNetworkAccessManager(traffic_rules=traffic_rules) self.manager.finished.connect(self.handle_finished_network_reply) self.cookie_jar = QNetworkCookieJar() self.manager.setCookieJar(self.cookie_jar) self.page = WKitWebPage() self.page.setNetworkAccessManager(self.manager) self.page.loadFinished.connect(self.handle_page_load_finished) self.view = WKitWebView() self.view.setPage(self.page) self.view.setApplication(self.app) self._response = None self.gui = gui if gui: self.view.show() #def __del__(self): # self.view.close() # self.view.setPage(None) # del self.view # del self.page def get_cookies(self): return self.cookie_jar.allCookies() def get_simple_cookies(self): res = {} for cookie in self.cookie_jar.allCookies(): key = cookie.name().data().decode('latin') val = cookie.value().data().decode('latin') res[key] = val return res def go(self, url, **kwargs): return self.request(url=url, **kwargs) def request(self, url=None, user_agent=None, cookies=None, timeout=DEFAULT_PAGE_LOAD_TIMEOUT, referer=None, method='get', data=None, headers=None, proxy=None, wait=True): # Reset things bound to previous response self._response = None self.resource_list = [] self._page_loaded = False #self.view.setHtml('', QUrl('blank://')) # Proxy if proxy: self.manager.setup_proxy(proxy) # User-Agent if user_agent is None: user_agent = DEFAULT_USER_AGENT self.page.set_user_agent(user_agent) # Cookies if cookies is None: cookies = {} cookie_obj_list = [] for name, value in cookies.items(): domain = ('.' + urlsplit(url).netloc).split(':')[0] #print 'CREATE COOKIE %s=%s' % (name, value) #print 'DOMAIN = %s' % domain cookie_obj = QNetworkCookie(name, value) cookie_obj.setDomain(domain) cookie_obj_list.append(cookie_obj) #self.cookie_jar.setAllCookies(cookie_obj_list) # HTTP Method method_obj = getattr(QNetworkAccessManager, '%sOperation' % method.capitalize()) # Ensure that Content-Type is correct if method is post if method == 'post': headers['Content-Type'] = 'application/x-www-form-urlencoded' # POST Data if data is None: data = QByteArray() # Build Request object req = QNetworkRequest(QUrl(url)) # Referer if referer: req.setRawHeader('Referer', referer) # Headers if headers is None: headers = {} for name, value in headers.items(): req.setRawHeader(name, value) self.content_type_stats = Counter() # Spawn request self.view.load(req, method_obj, data) if wait: self.wait_for_page_loaded(timeout=timeout) return self.get_page_response() else: return None def sleep(self, sleep_time): start = time.time() while time.time() < start + sleep_time: time.sleep(0.01) self.app.processEvents() def get_url(self): return self.page.mainFrame().url().toString()\ .split('#')[0].rstrip('/') def get_page_response(self): if self._response: return self._response else: url = self.page.mainFrame().url().toString()\ .split('#')[0].rstrip('/') for res in self.resource_list: print('TEST', url, res.url.rstrip('/')) if url == res.url.rstrip('/'): self._response = res return res print('Resource list:') for res in self.resource_list: print(' * %s' % res.url) print('Current page URL: %s' % self.page.mainFrame().url().toString()) raise InternalError('Could not associate any of loaded responses' ' with requested URL: %s' % url) def assert_ok_response(self): if self.get_page_response().status_code != 200: raise HttpStatusNotSuccess def get_html(self): return self.page.mainFrame().toHtml() def get_doc(self): return self.page.mainFrame().documentElement() def get_element(self, query): elem = self.get_doc().findFirst(query) if elem.isNull(): raise IndexError('Could not find element: %s' % query) else: return elem def element_exists(self, query): try: self.get_element(query) except IndexError: return False else: return True def find_elements(self, query): return self.get_doc().findAll(query) def get_base_url(self): try: base = self.get_element('base[href]') except IndexError: return self.get_page_response().url else: url = base.attribute('href') return url or self.get_page_response().url def get_random_int_link(self): base_url = self.get_base_url() base_host = urlsplit(base_url).hostname links = [] for elem in self.find_elements('a[href]'): url = urljoin(base_url, elem.attribute('href')) host = urlsplit(url).hostname if host == base_host: if url != self.get_page_response().url: links.append(url) if len(links) > 50: break if links: return choice(links) else: return None # ************** # Event Handlers # ************** def handle_page_load_finished(self): self._page_loaded = True if self.gui: scripts = [] if False:#self.jquery_namespace: scripts.append('jquery-1.9.1.min.js', ) for script in scripts: self.evaluate_js_file(os.path.dirname(__file__) + '/js/' + script) self.evaluate(u"WKit = jQuery.noConflict();" % self.jquery_namespace) def handle_finished_network_reply(self, reply): status_code = reply.attribute(QNetworkRequest.HttpStatusCodeAttribute) if status_code: if not isinstance(status_code, int): status_code = status_code.toInt()[0] logger_response.debug('HttpResource [%d]: %s' % (status_code, reply.url().toString())) self.resource_list.append(HttpResponse.build_from_reply(reply)) ctype = reply.rawHeader('Content-Type').data()\ .decode('latin').split(';')[0] self.content_type_stats[ctype] += 1