def test_select(self):  # {{{
    # Runs a table of CSS selectors through Select over self.HTML_IDS and
    # compares the ids of the matched elements with the expected lists.
    # Unless skip_webkit is passed, the first selector of every pcss() call
    # is also evaluated through run_webkit_selector() on a QWebPage loaded
    # with the same markup, and both id sets must agree.
    from PyQt5.Qt import QApplication, QWebPage

    select = Select(etree.fromstring(self.HTML_IDS))
    app = QApplication([])
    page = QWebPage()
    page.mainFrame().setHtml(self.HTML_IDS)

    def ids_matching(selector):
        # ids (None when the attribute is absent) of the matched elements,
        # in the order Select yields them.
        return [node.get('id') for node in select(selector)]

    def pcss(main, *selectors, **kwargs):
        # Every extra selector must match exactly the same ids as `main`.
        expected = ids_matching(main)
        for alternative in selectors:
            self.ae(ids_matching(alternative), expected)
        if kwargs.get('skip_webkit'):
            return expected
        # Only `main` is cross-checked against WebKit, ignoring order.
        found = set(expected)
        webkit_ids = set(run_webkit_selector(page, main))
        self.ae(
            found, webkit_ids,
            'WebKit did not match result for: %r. Result: %r WebKit: %r' % (
                main, found, webkit_ids))
        return expected

    # --- universal and type selectors ---
    all_ids = pcss('*')
    self.ae(
        all_ids[:6],
        ['html', None, 'link-href', 'link-nohref', None, 'outer-div'])
    self.ae(all_ids[-1:], ['foobar-span'])
    self.ae(pcss('div'), ['outer-div', 'li-div', 'foobar-div'])
    self.ae(pcss('DIV'), ['outer-div', 'li-div', 'foobar-div'])  # case-insensitive in HTML
    self.ae(pcss('div div'), ['li-div'])
    self.ae(pcss('div, div div'), ['outer-div', 'li-div', 'foobar-div'])

    # --- attribute selectors ---
    self.ae(pcss('a[name]'), ['name-anchor'])
    self.ae(pcss('a[NAme]'), ['name-anchor'])  # case-insensitive in HTML:
    self.ae(pcss('a[rel]'), ['tag-anchor', 'nofollow-anchor'])
    self.ae(pcss('a[rel="tag"]'), ['tag-anchor'])
    self.ae(pcss('a[href*="localhost"]'), ['tag-anchor'])
    self.ae(pcss('a[href*=""]'), [])
    self.ae(pcss('a[href^="http"]'), ['tag-anchor', 'nofollow-anchor'])
    self.ae(pcss('a[href^="http:"]'), ['tag-anchor'])
    self.ae(pcss('a[href^=""]'), [])
    self.ae(pcss('a[href$="org"]'), ['nofollow-anchor'])
    self.ae(pcss('a[href$=""]'), [])
    self.ae(
        pcss('div[foobar~="bc"]', 'div[foobar~="cde"]', skip_webkit=True),
        ['foobar-div'])
    self.ae(
        pcss('[foobar~="ab bc"]', '[foobar~=""]', '[foobar~=" \t"]'), [])
    self.ae(pcss('div[foobar~="cd"]'), [])
    self.ae(pcss('*[lang|="En"]', '[lang|="En-us"]'), ['second-li'])
    # Attribute values are case sensitive
    self.ae(pcss('*[lang|="en"]', '[lang|="en-US"]', skip_webkit=True), [])
    self.ae(pcss('*[lang|="e"]'), [])

    # --- :lang() ---
    self.ae(
        pcss(':lang("EN")', '*:lang(en-US)', skip_webkit=True),
        ['second-li', 'li-div'])
    self.ae(pcss(':lang("e")'), [])

    # --- structural pseudo-classes ---
    self.ae(pcss('li:nth-child(1)', 'li:first-child'), ['first-li'])
    self.ae(
        pcss('li:nth-child(3)', '#first-li ~ :nth-child(3)'), ['third-li'])
    self.ae(pcss('li:nth-child(10)'), [])
    self.ae(
        pcss('li:nth-child(2n)', 'li:nth-child(even)', 'li:nth-child(2n+0)'),
        ['second-li', 'fourth-li', 'sixth-li'])
    self.ae(
        pcss('li:nth-child(+2n+1)', 'li:nth-child(odd)'),
        ['first-li', 'third-li', 'fifth-li', 'seventh-li'])
    self.ae(pcss('li:nth-child(2n+4)'), ['fourth-li', 'sixth-li'])
    self.ae(
        pcss('li:nth-child(3n+1)'), ['first-li', 'fourth-li', 'seventh-li'])
    self.ae(pcss('li:nth-last-child(0)'), [])
    self.ae(pcss('li:nth-last-child(1)', 'li:last-child'), ['seventh-li'])
    self.ae(
        pcss('li:nth-last-child(2n)', 'li:nth-last-child(even)'),
        ['second-li', 'fourth-li', 'sixth-li'])
    self.ae(
        pcss('li:nth-last-child(2n+2)'),
        ['second-li', 'fourth-li', 'sixth-li'])
    self.ae(pcss('ol:first-of-type'), ['first-ol'])
    self.ae(pcss('ol:nth-child(1)'), [])
    self.ae(pcss('ol:nth-of-type(2)'), ['second-ol'])
    self.ae(pcss('ol:nth-last-of-type(1)'), ['second-ol'])
    self.ae(pcss('span:only-child'), ['foobar-span'])
    self.ae(pcss('li div:only-child'), ['li-div'])
    self.ae(pcss('div *:only-child'), ['li-div', 'foobar-span'])
    self.ae(
        pcss('p *:only-of-type', skip_webkit=True), ['p-em', 'fieldset'])
    self.ae(pcss('p:only-of-type', skip_webkit=True), ['paragraph'])
    self.ae(pcss('a:empty', 'a:EMpty'), ['name-anchor'])
    self.ae(
        pcss('li:empty'), ['third-li', 'fourth-li', 'fifth-li', 'sixth-li'])
    # NOTE(review): 'li:root' is asserted to give the same ids as ':root';
    # presumably this mirrors how Select handles :root - confirm.
    self.ae(pcss(':root', 'html:root', 'li:root'), ['html'])
    self.ae(pcss('* :root', 'p *:root'), [])

    # --- class and id selectors ---
    self.ae(pcss('.a', '.b', '*.a', 'ol.a'), ['first-ol'])
    self.ae(pcss('.c', '*.c'), ['first-ol', 'third-li', 'fourth-li'])
    self.ae(
        pcss('ol *.c', 'ol li.c', 'li ~ li.c', 'ol > li.c'),
        ['third-li', 'fourth-li'])
    self.ae(pcss('#first-li', 'li#first-li', '*#first-li'), ['first-li'])

    # --- combinators ---
    self.ae(pcss('li div', 'li > div', 'div div'), ['li-div'])
    self.ae(pcss('div > div'), [])
    self.ae(pcss('div>.c', 'div > .c'), ['first-ol'])
    self.ae(pcss('div + div'), ['foobar-div'])
    self.ae(pcss('a ~ a'), ['tag-anchor', 'nofollow-anchor'])
    self.ae(pcss('a[rel="tag"] ~ a'), ['nofollow-anchor'])
    self.ae(pcss('ol#first-ol li:last-child'), ['seventh-li'])
    self.ae(pcss('ol#first-ol *:last-child'), ['li-div', 'seventh-li'])
    self.ae(pcss('#outer-div:first-child'), ['outer-div'])
    self.ae(
        pcss('#outer-div :first-child'),
        ['name-anchor', 'first-li', 'li-div', 'p-b',
         'checkbox-fieldset-disabled', 'area-href'])
    self.ae(pcss('a[href]'), ['tag-anchor', 'nofollow-anchor'])

    # --- negation ---
    self.ae(pcss(':not(*)'), [])
    self.ae(pcss('a:not([href])'), ['name-anchor'])
    self.ae(
        pcss('ol :Not(li[class])', skip_webkit=True),
        ['first-li', 'second-li', 'li-div', 'fifth-li', 'sixth-li',
         'seventh-li'])

    # --- escaped characters in identifiers ---
    self.ae(pcss(r'di\a0 v', r'div\['), [])
    self.ae(pcss(r'[h\a0 ref]', r'[h\]ref]'), [])

    # Drop the reference to the QApplication created above.
    del app
def test_select(self):  # {{{
    # Runs a table of CSS selectors through Select over self.HTML_IDS and
    # compares the ids of the matched elements with the expected lists.
    # Unless skip_webkit is passed, the first selector of every pcss() call
    # is also evaluated through run_webkit_selector() on a QWebPage loaded
    # with the same markup, and both id sets must agree. The final checks
    # cover an invalid pseudo-class expression and the
    # ignore_inappropriate_pseudo_classes option.
    from PyQt5.Qt import QApplication, QWebPage

    document = etree.fromstring(self.HTML_IDS)
    select = Select(document)
    app = QApplication([])
    page = QWebPage()
    page.mainFrame().setHtml(self.HTML_IDS)

    def ids_matching(selector):
        # ids (None when the attribute is absent) of the matched elements,
        # in the order Select yields them.
        return [node.get('id') for node in select(selector)]

    def pcss(main, *selectors, **kwargs):
        # Every extra selector must match exactly the same ids as `main`.
        expected = ids_matching(main)
        for alternative in selectors:
            self.ae(ids_matching(alternative), expected)
        if kwargs.get('skip_webkit'):
            return expected
        # Only `main` is cross-checked against WebKit, ignoring order.
        found = set(expected)
        webkit_ids = set(run_webkit_selector(page, main))
        self.ae(
            found, webkit_ids,
            'WebKit did not match result for: %r. Result: %r WebKit: %r' % (
                main, found, webkit_ids))
        return expected

    # --- universal and type selectors ---
    all_ids = pcss('*')
    self.ae(
        all_ids[:6],
        ['html', None, 'link-href', 'link-nohref', None, 'outer-div'])
    self.ae(all_ids[-1:], ['foobar-span'])
    self.ae(pcss('div'), ['outer-div', 'li-div', 'foobar-div'])
    self.ae(pcss('DIV'), ['outer-div', 'li-div', 'foobar-div'])  # case-insensitive in HTML
    self.ae(pcss('div div'), ['li-div'])
    self.ae(pcss('div, div div'), ['outer-div', 'li-div', 'foobar-div'])

    # --- attribute selectors ---
    self.ae(pcss('a[name]'), ['name-anchor'])
    self.ae(pcss('a[NAme]'), ['name-anchor'])  # case-insensitive in HTML:
    self.ae(pcss('a[rel]'), ['tag-anchor', 'nofollow-anchor'])
    self.ae(pcss('a[rel="tag"]'), ['tag-anchor'])
    self.ae(pcss('a[href*="localhost"]'), ['tag-anchor'])
    self.ae(pcss('a[href*=""]'), [])
    self.ae(pcss('a[href^="http"]'), ['tag-anchor', 'nofollow-anchor'])
    self.ae(pcss('a[href^="http:"]'), ['tag-anchor'])
    self.ae(pcss('a[href^=""]'), [])
    self.ae(pcss('a[href$="org"]'), ['nofollow-anchor'])
    self.ae(pcss('a[href$=""]'), [])
    self.ae(
        pcss('div[foobar~="bc"]', 'div[foobar~="cde"]', skip_webkit=True),
        ['foobar-div'])
    self.ae(
        pcss('[foobar~="ab bc"]', '[foobar~=""]', '[foobar~=" \t"]'), [])
    self.ae(pcss('div[foobar~="cd"]'), [])
    self.ae(pcss('*[lang|="En"]', '[lang|="En-us"]'), ['second-li'])
    # Attribute values are case sensitive
    self.ae(pcss('*[lang|="en"]', '[lang|="en-US"]', skip_webkit=True), [])
    self.ae(pcss('*[lang|="e"]'), [])

    # --- :lang() ---
    self.ae(
        pcss(':lang("EN")', '*:lang(en-US)', skip_webkit=True),
        ['second-li', 'li-div'])
    self.ae(pcss(':lang("e")'), [])

    # --- structural pseudo-classes ---
    self.ae(pcss('li:nth-child(1)', 'li:first-child'), ['first-li'])
    self.ae(
        pcss('li:nth-child(3)', '#first-li ~ :nth-child(3)'), ['third-li'])
    self.ae(pcss('li:nth-child(10)'), [])
    self.ae(
        pcss('li:nth-child(2n)', 'li:nth-child(even)', 'li:nth-child(2n+0)'),
        ['second-li', 'fourth-li', 'sixth-li'])
    self.ae(
        pcss('li:nth-child(+2n+1)', 'li:nth-child(odd)'),
        ['first-li', 'third-li', 'fifth-li', 'seventh-li'])
    self.ae(pcss('li:nth-child(2n+4)'), ['fourth-li', 'sixth-li'])
    self.ae(
        pcss('li:nth-child(3n+1)'), ['first-li', 'fourth-li', 'seventh-li'])
    self.ae(pcss('li:nth-last-child(0)'), [])
    self.ae(pcss('li:nth-last-child(1)', 'li:last-child'), ['seventh-li'])
    self.ae(
        pcss('li:nth-last-child(2n)', 'li:nth-last-child(even)'),
        ['second-li', 'fourth-li', 'sixth-li'])
    self.ae(
        pcss('li:nth-last-child(2n+2)'),
        ['second-li', 'fourth-li', 'sixth-li'])
    self.ae(pcss('ol:first-of-type'), ['first-ol'])
    self.ae(pcss('ol:nth-child(1)'), [])
    self.ae(pcss('ol:nth-of-type(2)'), ['second-ol'])
    self.ae(pcss('ol:nth-last-of-type(1)'), ['second-ol'])
    self.ae(pcss('span:only-child'), ['foobar-span'])
    self.ae(pcss('li div:only-child'), ['li-div'])
    self.ae(pcss('div *:only-child'), ['li-div', 'foobar-span'])
    self.ae(
        pcss('p *:only-of-type', skip_webkit=True), ['p-em', 'fieldset'])
    self.ae(pcss('p:only-of-type', skip_webkit=True), ['paragraph'])
    self.ae(pcss('a:empty', 'a:EMpty'), ['name-anchor'])
    self.ae(
        pcss('li:empty'), ['third-li', 'fourth-li', 'fifth-li', 'sixth-li'])
    # NOTE(review): 'li:root' is asserted to give the same ids as ':root';
    # presumably this mirrors how Select handles :root - confirm.
    self.ae(pcss(':root', 'html:root', 'li:root'), ['html'])
    self.ae(pcss('* :root', 'p *:root'), [])

    # --- class and id selectors ---
    self.ae(pcss('.a', '.b', '*.a', 'ol.a'), ['first-ol'])
    self.ae(pcss('.c', '*.c'), ['first-ol', 'third-li', 'fourth-li'])
    self.ae(
        pcss('ol *.c', 'ol li.c', 'li ~ li.c', 'ol > li.c'),
        ['third-li', 'fourth-li'])
    self.ae(pcss('#first-li', 'li#first-li', '*#first-li'), ['first-li'])

    # --- combinators ---
    self.ae(pcss('li div', 'li > div', 'div div'), ['li-div'])
    self.ae(pcss('div > div'), [])
    self.ae(pcss('div>.c', 'div > .c'), ['first-ol'])
    self.ae(pcss('div + div'), ['foobar-div'])
    self.ae(pcss('a ~ a'), ['tag-anchor', 'nofollow-anchor'])
    self.ae(pcss('a[rel="tag"] ~ a'), ['nofollow-anchor'])
    self.ae(pcss('ol#first-ol li:last-child'), ['seventh-li'])
    self.ae(pcss('ol#first-ol *:last-child'), ['li-div', 'seventh-li'])
    self.ae(pcss('#outer-div:first-child'), ['outer-div'])
    self.ae(
        pcss('#outer-div :first-child'),
        ['name-anchor', 'first-li', 'li-div', 'p-b',
         'checkbox-fieldset-disabled', 'area-href'])
    self.ae(pcss('a[href]'), ['tag-anchor', 'nofollow-anchor'])

    # --- negation ---
    self.ae(pcss(':not(*)'), [])
    self.ae(pcss('a:not([href])'), ['name-anchor'])
    self.ae(
        pcss('ol :Not(li[class])', skip_webkit=True),
        ['first-li', 'second-li', 'li-div', 'fifth-li', 'sixth-li',
         'seventh-li'])

    # --- escaped characters in identifiers ---
    self.ae(pcss(r'di\a0 v', r'div\['), [])
    self.ae(pcss(r'[h\a0 ref]', r'[h\]ref]'), [])

    # :nth-child without an argument must be rejected once the selector
    # is actually evaluated (tuple() drains the result).
    with self.assertRaises(ExpressionError):
        tuple(select('body:nth-child'))

    # With ignore_inappropriate_pseudo_classes=True, 'p:hover' is expected
    # to yield at least one element.
    lenient_select = Select(document, ignore_inappropriate_pseudo_classes=True)
    self.assertGreater(len(tuple(lenient_select('p:hover'))), 0)

    # Drop the reference to the QApplication created above.
    del app
def __init__(self, parent, proxy="", port=0, crawl_speed=CrawlSpeed.Medium, network_access_manager=None):
    """Set up the page: signal handlers, crawl timings, JS helpers, network.

    Args:
        parent: Passed on to ``QWebPage.__init__``; its ``app`` attribute
            is stored as ``self.app``.
        proxy: HTTP proxy host. A proxy is installed only when ``proxy``
            is not ``""`` and ``port`` is not ``0``.
        port: HTTP proxy port.
        crawl_speed: ``CrawlSpeed`` value selecting ``wait_for_processing``
            and ``wait_for_event``. When it equals none of the four known
            speeds, neither attribute is set.
        network_access_manager: Optional manager installed on the page
            before any proxy configuration is applied.

    Raises:
        IOError/OSError: if one of the ``js/*.js`` helper files cannot be
            opened or read. The paths are relative, so they are resolved
            against the current working directory.
    """
    QWebPage.__init__(self, parent)
    self.app = parent.app
    self._js_bridge = JsBridge(self)

    self.loadFinished.connect(self.loadFinishedHandler)
    self.mainFrame().javaScriptWindowObjectCleared.connect(
        self.jsWinObjClearedHandler)
    self.frameCreated.connect(self.frameCreatedHandler)
    self.setViewportSize(QSize(1024, 800))

    # (speed, wait_for_processing, wait_for_event) - presumably seconds;
    # confirm against the code that consumes these attributes.
    # Every row is compared (no early exit) so that, as with the original
    # chain of independent ``if`` statements, the last matching row wins.
    speed_timings = (
        (CrawlSpeed.Slow, 1, 2),
        (CrawlSpeed.Medium, 0.3, 1),
        (CrawlSpeed.Fast, 0.1, 0.5),
        (CrawlSpeed.Speed_of_Lightning, 0.01, 0.1),
    )
    for speed, processing_wait, event_wait in speed_timings:
        if crawl_speed == speed:
            self.wait_for_processing = processing_wait
            self.wait_for_event = event_wait

    def read_script(path):
        # ``with`` guarantees the handle is closed even when read() raises.
        with open(path, "r") as script:
            return script.read()

    self._lib_js = read_script("js/lib.js")
    self._xhr_observe_js = read_script("js/ajax_observer.js")
    # The attribute name keeps its original spelling ("timeming"); code
    # outside this method may refer to it.
    self._timeming_wrapper_js = read_script("js/timing_wrapper.js")
    self._xhr_interception_js = read_script("js/ajax_interceptor.js")
    self._addEventListener = read_script("js/addeventlistener_wrapper.js")
    self._md5 = read_script("js/md5.js")
    self._property_obs_js = read_script("js/property_obs.js")

    enable_plugins = True
    settings = self.settings()
    settings.setAttribute(QWebSettings.PluginsEnabled, enable_plugins)
    settings.setAttribute(QWebSettings.JavaEnabled, enable_plugins)
    # NOTE: QWebSettings.AutoLoadImages is deliberately left untouched;
    # the call that would have set it to False was already disabled.
    settings.setAttribute(QWebSettings.DeveloperExtrasEnabled, True)
    settings.setAttribute(QWebSettings.JavascriptEnabled, True)
    settings.setAttribute(QWebSettings.JavascriptCanOpenWindows, True)

    if network_access_manager:
        self.setNetworkAccessManager(network_access_manager)

    if proxy != "" and port != 0:
        manager = self.networkAccessManager()
        manager.setProxy(
            QNetworkProxy(QNetworkProxy.HttpProxy, proxy, port, None, None))
        self.setNetworkAccessManager(manager)

    # Connect only now: connecting earlier could attach the handler to a
    # manager that is replaced by one of the calls above.
    self.networkAccessManager().finished.connect(self.loadComplete)