def __init__(self, parent, url=''):
    """Build a WebView: install a custom WebPage, wire title/icon signals,
    load *url*, and enable a permissive set of QtWebKit settings.

    Args:
        parent: widget that owns this view; its setWindowTitle is driven
            by this view's titleChanged signal.
        url: initial URL to load (empty string loads a blank page).
    """
    super(WebView, self).__init__(parent)
    # Drag-and-drop bookkeeping used elsewhere in the class.
    self.draging = False
    self.drag = QDrag(self)
    self.dragStartPos = None
    # Replace the default page with the project's WebPage subclass.
    self.webPage = WebPage()
    self.setPage(self.webPage)
    self.mainFrame = self.page().mainFrame()
    self.setAttribute(Qt.WA_DeleteOnClose, True)
    self.titleChanged.connect(parent.setWindowTitle)
    self.load(url)
    # Global web settings: persistent storage plus a deliberately permissive
    # feature set (plugins, clipboard, local<->remote file access, dev tools).
    webSettings = self.settings()
    webSettings.setDefaultTextEncoding("utf-8")
    # webSettings.setOfflineStorageDefaultQuota(sys.maxsize)
    # webSettings.setOfflineWebApplicationCacheQuota(sys.maxsize)
    webSettings.enablePersistentStorage(assets.fs.dataPath())
    webSettings.setAttribute(QWebSettings.PluginsEnabled, True)
    webSettings.setAttribute(QWebSettings.DnsPrefetchEnabled, True)
    webSettings.setAttribute(QWebSettings.XSSAuditingEnabled, True)
    webSettings.setAttribute(QWebSettings.CSSGridLayoutEnabled, True)
    webSettings.setAttribute(QWebSettings.ScrollAnimatorEnabled, True)
    webSettings.setAttribute(QWebSettings.DeveloperExtrasEnabled, True)
    webSettings.setAttribute(QWebSettings.JavascriptCanOpenWindows, True)
    webSettings.setAttribute(QWebSettings.JavascriptCanCloseWindows, True)
    webSettings.setAttribute(QWebSettings.JavascriptCanAccessClipboard, True)
    webSettings.setAttribute(QWebSettings.LocalContentCanAccessFileUrls, True)
    webSettings.setAttribute(QWebSettings.LocalContentCanAccessRemoteUrls, True)
    # Re-expose the JS bridge object every time a new document clears the
    # window object, and track favicon changes.
    self.mainFrame.javaScriptWindowObjectCleared.connect(
        self.setJavaScriptObject)
    self.mainFrame.iconChanged.connect(self.changeIcon)
def crawl(self):
    """Breadth-first crawl (Python 2): pop URLs off the deque, record each
    page's assets, and enqueue same-domain links until the queue empties or
    MAX_LINKS_TO_VISIT distinct links have been discovered.

    NOTE(review): indexing with `self.discovered[...]`/`self.unvisited[...]`
    and comparing `is None` assumes unseen keys yield None (e.g. a
    defaultdict(lambda: None)); a plain dict would raise KeyError — confirm
    against where these are initialized.
    """
    while len(self.url_queue) > 0 and len(
            self.discovered) <= self.MAX_LINKS_TO_VISIT:
        url = self.url_queue.popleft()
        if 'DEBUG' in os.environ:
            print "Queue Size:", len(self.url_queue)
            print "Fetching: ", url
        webpage = WebPage(url)
        # Mark this URL as visited (False == "no longer unvisited").
        self.unvisited[url] = False
        all_links = webpage.get_anchors(
            False)  # False: dont keep fragments
        all_assets = webpage.get_assets()
        self.assets.append({'url': url, 'assets': all_assets})
    for link in all_links:
        # if belongs to same domain & is not already discovered
        if self.same_domain_rule.matches(
                link) and self.discovered[link.geturl()] is None:
            self.discovered[link.geturl()] = True
            # process if not already in the queue
            if self.unvisited[link.geturl()] is None:
                self.url_queue.append(link.geturl())
                self.unvisited[link.geturl()] = True
def __init__(self, profile: QWebEngineProfile):
    """Build a popup-style browser widget: a read-only address bar above a
    WebView whose page is created from *profile*, with page signals wired
    to keep title/URL/icon/geometry in sync and to honor window.close().

    Args:
        profile: QWebEngineProfile used to construct this widget's WebPage.
    """
    super().__init__()
    self.m_addressBar = UrlLineEdit(self)
    self.m_view = WebView(self)
    self.setAttribute(Qt.WA_DeleteOnClose)
    self.setSizePolicy(QSizePolicy.Minimum, QSizePolicy.Minimum)
    layout = QVBoxLayout()
    # layout.setMargin(0)
    self.setLayout(layout)
    layout.addWidget(self.m_addressBar)
    layout.addWidget(self.m_view)
    # The page must be bound to the given profile, so replace the default.
    self.m_view.setPage(WebPage(profile, self.m_view))
    self.m_view.setFocus()
    # Popup windows show the URL but never let the user edit it.
    self.m_addressBar.setReadOnly(True)
    self.m_addressBar.setFavIcon(QIcon(":defaulticon.png"))
    self.m_view.titleChanged.connect(self.setWindowTitle)
    self.m_view.urlChanged.connect(self.setUrl)
    self.m_view.page().iconChanged.connect(self.handleIconChanged)
    self.m_view.page().geometryChangeRequested.connect(
        self.handleGeometryChangeRequested)
    # JavaScript window.close() closes this widget.
    self.m_view.page().windowCloseRequested.connect(self.close)
def createWebPage(self):
    """Create a new WebPage, register it in m_pages, and configure it with
    the default page settings, the shared network access manager, and the
    script's directory as its library path.

    Returns:
        The newly created, fully configured WebPage.
    """
    new_page = WebPage(self)
    self.m_pages.append(new_page)
    new_page.applySettings(self.m_defaultPageSettings)
    new_page.setNetworkAccessManager(self.m_netAccessMan)
    script_dir = os.path.dirname(os.path.abspath(self.m_scriptFile))
    new_page.libraryPath = script_dir
    return new_page
def getlinks(self, url, html):
    """Parse *html* (fetched from *url*) and return its anchor links
    filtered by the URL patterns derived from the crawl rules.

    Side effect: stores the parsed page on self.webpage.
    """
    page = WebPage(url, html)
    page.parse_links()
    self.webpage = page
    rule_patterns = self.get_patterns_from_rules(url)
    return page.filter_links(tags=['a'], patterns=rule_patterns)
def Crawl(address, recursions=2, txt=""):
    '''
    gathers 5 random links from given website
    repeats recursively to each of the links that were found

    Side effect: populates the module-level words_list dict, mapping each
    processed link to the top-5 words of a page's text.

    NOTE(review): all_text is taken from *web_page* (the page at *address*),
    not from the just-crawled *link* — every entry in words_list for this
    level stores the parent page's words. The except-branch comment
    ("Getting page from previous round") hints this may be deliberate;
    confirm before changing.
    '''
    links_amount = 0
    max_links_amount = 5
    web_page = WebPage(address)
    links = web_page.get_links(only_external_links_allowed=True)
    #print("Links on page: {} found: {}".format(address, len(links)))
    if recursions > 0:
        recursions -= 1
        for link in links:
            if links_amount >= max_links_amount:
                break
            if link not in words_list.keys():
                try:
                    Crawl(link, recursions, txt)
                    all_text = web_page.get_all_text()
                    words_parser = GetWords(all_text)
                    words_list[link] = words_parser.top_words(5)
                    links_amount += 1
                except (TooFewLinksOnPage, BannedExtensionPage,
                        ConnectionError):
                    #print("Getting page from previous round...")
                    continue
def start(self):
    """Main crawl loop (Python 2): pop URLs from the DB until exhausted,
    download each, store page + extracted article, and seed new links.

    Any exception in one iteration is logged and the loop continues with
    the next URL; the DB connection is closed when the queue is drained.
    """
    while 1:
        try:
            url = self.dbop.pop_url()
            print "url: %s" % url
            if url == None:
                # Empty queue signals completion.
                print "crawling task is done."
                break
            error_msg, url, redirected_url, html = self.downloader.download(url)
            #print error_msg, url, redirected_url, html
            if html != None:
                self.webpage = WebPage(url, html)
                article = self.webpage.extract()
                # extract() returning > 5 fields means a full article was
                # parsed (title, date, time, author, ..., body); otherwise
                # store the raw html only.
                if len(article) > 5:
                    addtime = "%s %s" % (article[1], article[2])
                    self.dbop.html2db(url, html, article[0], addtime,
                                      article[3], article[5])
                else:
                    self.dbop.html2db(url, html)
                print self.webpage.parse_links()
                ruptn = self.get_patterns_from_rules(url)
                links = self.webpage.filter_links(tags=['a'],
                                                  str_patterns=ruptn)
                self.add_seeds(links)
            # Throttle between fetches (randomized sleep, presumably).
            self.mysleep(3)
        except Exception, err:
            # Best-effort: one bad URL must not stop the whole crawl.
            print "!!error!! Exception happend! %s %s" % (url, err)
    self.dbop.close()
def __init__(self, args, parent=None): QObject.__init__(self, parent) # variable declarations self.m_defaultPageSettings = {} self.m_pages = [] self.m_verbose = args.verbose self.m_page = WebPage(self) self.m_returnValue = 0 self.m_terminated = False # setup the values from args self.m_scriptFile = args.script self.m_args = args.script_args self.m_filesystem = FileSystem(self) self.m_pages.append(self.m_page) do_action('PhantomInitPre') if not args.proxy: QNetworkProxyFactory.setUseSystemConfiguration(True) else: proxy = QNetworkProxy(QNetworkProxy.HttpProxy, args.proxy[0], int(args.proxy[1])) QNetworkProxy.setApplicationProxy(proxy) # Provide WebPage with a non-standard Network Access Manager self.m_netAccessMan = NetworkAccessManager(args.disk_cache, args.ignore_ssl_errors, self) self.m_page.setNetworkAccessManager(self.m_netAccessMan) self.m_page.javaScriptConsoleMessageSent.connect( self.printConsoleMessage) self.m_defaultPageSettings['loadImages'] = args.load_images self.m_defaultPageSettings['loadPlugins'] = args.load_plugins self.m_defaultPageSettings['userAgent'] = self.m_page.userAgent() self.m_defaultPageSettings[ 'localAccessRemote'] = args.local_access_remote self.m_page.applySettings(self.m_defaultPageSettings) self.libraryPath = os.path.dirname(os.path.abspath(self.m_scriptFile)) # inject our properties and slots into javascript self.m_page.mainFrame().addToJavaScriptWindowObject('phantom', self) self.m_page.mainFrame().addToJavaScriptWindowObject( 'fs', self.m_filesystem) bootstrap = QFile(':/bootstrap.js') if not bootstrap.open(QFile.ReadOnly): sys.exit('Can not bootstrap!') bootstrapper = str(bootstrap.readAll()) bootstrap.close() if not bootstrapper: sys.exit('Can not bootstrap!') self.m_page.mainFrame().evaluateJavaScript(bootstrapper) do_action('PhantomInitPost')
def __init__(self):
    """Build the window: a single QWebView backed by a custom WebPage,
    laid out edge-to-edge (zero margins)."""
    super(Window, self).__init__()
    web_view = QWebView(self)
    web_view.setPage(WebPage())
    self.view = web_view
    box = QVBoxLayout(self)
    box.setMargin(0)
    box.addWidget(web_view)
def __init__(self, settings):
    """Assemble the browser: cookie jar feeding a network access manager,
    a WebPage using that manager, and a WebView displaying that page.

    Args:
        settings: configuration object passed through to WebPage/WebView.
    """
    super(Browser, self).__init__()
    self._settings = settings
    jar = CookieJar()
    manager = NetworkAccessManager(jar)
    page = WebPage(settings)
    page.setNetworkAccessManager(manager)
    view = WebView(settings)
    view.setPage(page)
    # Keep references so Qt objects are not garbage collected.
    self._cookie_jar = jar
    self._network_access_manager = manager
    self._web_page = page
    self._web_view = view
def createTab(self, makeCurrent: bool = True) -> WebView:
    """Open a new browser tab backed by the default profile.

    Args:
        makeCurrent: when True, the new tab becomes the active one.

    Returns:
        The WebView hosted by the new tab.
    """
    view = WebView()
    view.setPage(WebPage(QWebEngineProfile.defaultProfile(), view))
    self.setupView(view)
    self.addTab(view, self.tr("(Untitled)"))
    if makeCurrent:
        self.setCurrentWidget(view)
    return view
def test_file_links(self):
    """The file-links fixture exposes only anchors (2) and files (2);
    every other resource collection must be empty."""
    self.start_server(TestWebPage.FILE_LINKS_HTML)
    page = WebPage(TestWebPage.SERVER)
    expectations = [
        (page.get_js, 0),
        (page.get_stylesheets, 0),
        (page.get_links, 0),
        (page.get_anchors, 2),
        (page.get_images, 0),
        (page.get_files, 2),
    ]
    for getter, expected_count in expectations:
        self.assertEqual(expected_count, len(getter()))
def updateView(self):
    """Rebuild the web view: install a fresh WebPage with a JS bridge
    ('qtWindow' -> self), regenerate the HTML, and load it with a base URL
    pointing at the static assets directory."""
    page = WebPage(logger=None, parent=self)
    # Delegate all link clicks to Qt signals instead of in-page navigation.
    page.setLinkDelegationPolicy(QWebPage.DelegateAllLinks)
    page.mainFrame().addToJavaScriptWindowObject("qtWindow", self)
    self.ui.webView.setPage(page)
    html = self.generateHtml()
    # baseUrl must end with a trailing '/' otherwise QWebView won't be able
    # to load files from there
    baseUrl = QUrl.fromLocalFile(os.path.join(self.dataDir, "static/"))
    self.ui.webView.setHtml(html, baseUrl)
def get_course_manager(self, semester, departments, show_html=0):
    """Fetch the course page for *semester*/*departments*, cache it to
    temp.html, and return a CourseManager built from that file.

    Args:
        semester: semester identifier passed to get_course_page.
        departments: departments selector passed to get_course_page.
        show_html: when truthy, also display the raw HTML for debugging.
    """
    page_html = self.get_course_page(semester, departments)
    if show_html:
        self.show_html(page_html)
    cache_path = "temp.html"
    with open(cache_path, "w") as cache_file:
        cache_file.write(page_html)
    return CourseManager(WebPage(htmlpath=cache_path), logger=self.logger)
def __init__(self, parent, args): super(Phantom, self).__init__(parent) # variable declarations self.m_defaultPageSettings = {} self.m_pages = [] self.m_servers = [] self.m_verbose = args.verbose self.m_page = WebPage(self, args) self.m_returnValue = 0 self.m_terminated = False # setup the values from args self.app_args = args self.m_scriptFile = args.script self.m_args = args.script_args self.m_scriptEncoding = Encode(args.script_encoding, 'utf-8') self.m_outputEncoding = Encode(args.output_encoding, sys.stdout.encoding_sys) self.m_pages.append(self.m_page) do_action('PhantomInitPre') if args.proxy is None: QNetworkProxyFactory.setUseSystemConfiguration(True) else: proxy = QNetworkProxy(args.proxy_type, args.proxy[0], int(args.proxy[1])) QNetworkProxy.setApplicationProxy(proxy) self.m_page.javaScriptConsoleMessageSent.connect( self.printConsoleMessage) self.m_defaultPageSettings['loadImages'] = args.load_images self.m_defaultPageSettings['loadPlugins'] = args.load_plugins self.m_defaultPageSettings['javascriptEnabled'] = True self.m_defaultPageSettings['XSSAuditingEnabled'] = False self.m_defaultPageSettings['userAgent'] = self.m_page.userAgent() self.m_defaultPageSettings[ 'localToRemoteUrlAccessEnabled'] = args.local_to_remote_url_access self.m_page.applySettings(self.m_defaultPageSettings) self.libraryPath = os.path.dirname(os.path.abspath(self.m_scriptFile)) # inject our properties and slots into javascript self.m_page.mainFrame().addToJavaScriptWindowObject('phantom', self) with QPyFile(':/bootstrap.js') as f: self.m_page.mainFrame().evaluateJavaScript(f.readAll()) do_action('PhantomInitPost')
def process_web_page(thread_id, sites, q):
    """Worker: extract the valid external links of each site in *sites*
    and put the accumulated list on queue *q*.

    Links whose domain matches FORBIDDEN_DOMAINS or whose URL matches
    FORBIDDEN_FILETYPES are discarded. Processing is best-effort: any
    failure on a single site is skipped so the worker keeps going.

    Args:
        thread_id: identifier of this worker (unused here, kept for the API).
        sites: iterable of site URLs/addresses to process.
        q: queue receiving the final list of accepted link objects.
    """
    webpages = []
    for site in sites:
        try:
            w = WebPageLinkExtractor(WebPage(site))
            # Discard invalid webpages
            for web in w.external_links:
                if FORBIDDEN_DOMAINS.match(web.qdn) or \
                        FORBIDDEN_FILETYPES.match(web.url):
                    continue
                webpages.append(web)
            # Small politeness delay between sites.
            time.sleep(0.1)
        except Exception:
            # BUGFIX: was a bare `except:`, which also swallowed
            # KeyboardInterrupt/SystemExit and made the thread unkillable.
            # Keep the deliberate best-effort behavior for ordinary errors.
            pass
    q.put(webpages)
def run(self):
    """Worker loop: pull URLs from the spider's task list, fetch each page,
    save its data, and queue newly discovered links.

    The spider's running counter is incremented when a URL is taken and
    decremented when processing of that URL finishes — on every path.
    """
    while self.status:
        try:
            url = self.spider.task_list.get(timeout=1)
        except Empty:
            # log.info('%s: task_list Empty' % self.name)
            continue
        self.spider.increase_running()
        if not self.spider.check_robots(url):
            log.info('%s - robots forbidden : %s' % (self.name, url))
            # BUGFIX: this early-continue previously skipped
            # decrease_running(), leaking the running counter for every
            # robots-forbidden URL and throwing off the spider's
            # idle/shutdown accounting.
            self.spider.decrease_running()
            continue
        page = WebPage(url)
        # print('%s prepare to fetch %s' % (self.name, url))
        if page.fetch():
            self.spider.db.save_data(page.get_data())
            for link in page.get_link():  # retrive links from html
                if link not in self.spider.visited_list:  # not visited yet
                    self.spider.extend_list.add(link)
        else:
            print('%s: Page fetch failed: %s' % (self.name, page.url))
        self.spider.decrease_running()
def get_all_urls(self):
    """Get all building urls from all index pages.

    Returns:
        dict: {"page 1": [url1, url2, url3 ...], "page 2": [...], ...}
    """
    all_url_list = {}
    page_count = 0  # survives the loop for the summary prints below
    for page_count, page_url in enumerate(self.get_all_page_urls(), start=1):
        anchors = WebPage(page_url).get_soup().findAll(
            'a', {"se:clickable:target": "true"})
        single_page_urls = []
        for building in anchors:
            single_page_urls.append(
                "https://streeteasy.com" + building.get("href"))
            IndexPage.url_count += 1
        all_url_list[f"page {page_count}"] = single_page_urls
        print(f"Retrived all url on page {page_count}")
    print("=============================")
    print(f"RETRIVED ALL URLS FOR {page_count} PAGES")
    print(f"TOTAL URLS IDENTIFIED: ", IndexPage.url_count)
    return all_url_list
def __init__(self, args, parent=None):
    """Initialize the Phantom runtime: read the user script, configure
    proxy and QtWebKit settings from *args*, seed an empty document, install
    a custom network access manager, and wire the inject/finish callbacks.

    Args:
        args: parsed command-line namespace; args.script is an open file
            object that is read and closed here.
        parent: optional QObject parent.
    """
    QObject.__init__(self, parent)
    # variable declarations
    self.m_loadStatus = self.m_state = ''
    # NOTE(review): chained assignment makes m_var, m_paperSize and
    # m_loadScript_cache aliases of the SAME dict object — mutating one
    # mutates all three. Confirm this is intended.
    self.m_var = self.m_paperSize = self.m_loadScript_cache = {}
    self.m_verbose = args.verbose
    self.m_page = WebPage(self)
    self.m_clipRect = QRect()
    # setup the values from args
    self.m_script = args.script.read()
    self.m_scriptFile = args.script.name
    self.m_scriptDir = os.path.dirname(args.script.name) + '/'
    self.m_args = args.script_args
    self.m_upload_file = args.upload_file
    autoLoadImages = False if args.load_images == 'no' else True
    pluginsEnabled = True if args.load_plugins == 'yes' else False
    args.script.close()
    do_action('PhantomInitPre', Bunch(locals()))
    # Transparent page background so rendered captures can have alpha.
    palette = self.m_page.palette()
    palette.setBrush(QPalette.Base, Qt.transparent)
    self.m_page.setPalette(palette)
    if not args.proxy:
        QNetworkProxyFactory.setUseSystemConfiguration(True)
    else:
        # args.proxy is (host, port); only HTTP proxies supported here.
        proxy = QNetworkProxy(QNetworkProxy.HttpProxy, args.proxy[0],
                              int(args.proxy[1]))
        QNetworkProxy.setApplicationProxy(proxy)
    self.m_page.settings().setAttribute(QWebSettings.AutoLoadImages,
                                        autoLoadImages)
    self.m_page.settings().setAttribute(QWebSettings.PluginsEnabled,
                                        pluginsEnabled)
    self.m_page.settings().setAttribute(
        QWebSettings.FrameFlatteningEnabled, True)
    self.m_page.settings().setAttribute(
        QWebSettings.OfflineStorageDatabaseEnabled, True)
    self.m_page.settings().setAttribute(QWebSettings.LocalStorageEnabled,
                                        True)
    self.m_page.settings().setLocalStoragePath(
        QDesktopServices.storageLocation(QDesktopServices.DataLocation))
    self.m_page.settings().setOfflineStoragePath(
        QDesktopServices.storageLocation(QDesktopServices.DataLocation))
    # Ensure we have a document.body.
    self.m_page.mainFrame().setHtml('<html><body></body></html>')
    # Headless rendering: never show scroll bars.
    self.m_page.mainFrame().setScrollBarPolicy(Qt.Horizontal,
                                               Qt.ScrollBarAlwaysOff)
    self.m_page.mainFrame().setScrollBarPolicy(Qt.Vertical,
                                               Qt.ScrollBarAlwaysOff)
    m_netAccessMan = NetworkAccessManager(args.disk_cache,
                                          args.ignore_ssl_errors, self)
    self.m_page.setNetworkAccessManager(m_netAccessMan)
    # inject our properties and slots into javascript
    self.m_page.mainFrame().javaScriptWindowObjectCleared.connect(
        self.inject)
    self.m_page.loadFinished.connect(self.finish)
    do_action('PhantomInitPost', Bunch(locals()))
error_msg = "lxml error" return error_msg, url, redirected_url, html """ if __name__ == "__main__": #''' url = "http://www.cnbeta.com/" downloader = DownloadManager() error_msg, url, redirected_url, html = downloader.download(url) print "error_msg=%s" %error_msg print "url=%s" %url print "redirected_url=%s" %redirected_url f = open("www.cnbeta.com.html",'w') f.write(html) f.close() webpage = WebPage(url, html) webpage.parse_links() website = 'cnbeta\.com' patnstr = '^(http|https)://(.*\.' + website + ')(.+)$'; links = webpage.filter_links(tags=['a'], str_patterns=[patnstr]) links.sort() f_filter_links = open('filter_links_cnbeta.txt', 'w') #print links f = open('links_regged_cnbeta.txt','w') for link in links: f_filter_links.write('%s\n' % link) f.write('%s\n' % link) for elem, attr, lnk, pos in webpage.doc.iterlinks():
print("Usage: {} <source_url> [max num of internal_links]".format( sys.argv[0])) sys.exit(0) source_url = sys.argv[1] max_sites = float('inf') if len(sys.argv) == 3: max_sites = int(sys.argv[2]) to_process = [source_url] visited = set() internal_links = [] while len(to_process): url = to_process.pop() w = WebPageLinkExtractor(WebPage(url)) if len(visited) >= max_sites: break for web in w.internal_links: index = web.url.find('#') if index != -1: web.url = web.url[:index] if web.url not in visited: visited.add(web.url) to_process.append(web.url) time.sleep(0.3) for link in visited:
def __init__(self, args, parent=None):
    """Initialize the Phantom runtime (PyQt4/QString variant): read the
    user script, configure proxy and QtWebKit settings, seed an empty
    document, optionally install a logging network access manager, and
    wire the inject/finish slots via old-style SIGNAL connections.

    Args:
        args: parsed command-line namespace; args.script is a list whose
            first element is an open file object (read and closed here),
            followed by the script's own arguments.
        parent: optional QObject parent.
    """
    QObject.__init__(self, parent)
    # variable declarations
    self.m_loadStatus = self.m_state = QString()
    # NOTE(review): chained assignment makes m_var, m_paperSize and
    # m_loadScript_cache aliases of the SAME dict object — mutating one
    # mutates all three. Confirm this is intended.
    self.m_var = self.m_paperSize = self.m_loadScript_cache = {}
    self.m_verbose = args.verbose
    self.m_page = WebPage(self)
    self.m_clipRect = QRect()
    # setup the values from args
    self.m_script = QString.fromUtf8(args.script[0].read())
    self.m_scriptFile = args.script[0].name
    self.m_args = args.script[1:]
    self.m_upload_file = args.upload_file
    autoLoadImages = False if args.load_images == 'no' else True
    pluginsEnabled = True if args.load_plugins == 'yes' else False
    args.script[0].close()
    # Transparent page background so rendered captures can have alpha.
    palette = self.m_page.palette()
    palette.setBrush(QPalette.Base, Qt.transparent)
    self.m_page.setPalette(palette)
    if not args.proxy:
        QNetworkProxyFactory.setUseSystemConfiguration(True)
    else:
        # args.proxy is (host, port); only HTTP proxies supported here.
        proxy = QNetworkProxy(QNetworkProxy.HttpProxy, args.proxy[0],
                              int(args.proxy[1]))
        QNetworkProxy.setApplicationProxy(proxy)
    self.m_page.settings().setAttribute(QWebSettings.AutoLoadImages,
                                        autoLoadImages)
    self.m_page.settings().setAttribute(QWebSettings.PluginsEnabled,
                                        pluginsEnabled)
    self.m_page.settings().setAttribute(
        QWebSettings.FrameFlatteningEnabled, True)
    self.m_page.settings().setAttribute(
        QWebSettings.OfflineStorageDatabaseEnabled, True)
    self.m_page.settings().setAttribute(QWebSettings.LocalStorageEnabled,
                                        True)
    self.m_page.settings().setLocalStoragePath(
        QDesktopServices.storageLocation(QDesktopServices.DataLocation))
    self.m_page.settings().setOfflineStoragePath(
        QDesktopServices.storageLocation(QDesktopServices.DataLocation))
    # Ensure we have a document.body.
    self.m_page.mainFrame().setHtml('<html><body></body></html>')
    # Headless rendering: never show scroll bars.
    self.m_page.mainFrame().setScrollBarPolicy(Qt.Horizontal,
                                               Qt.ScrollBarAlwaysOff)
    self.m_page.mainFrame().setScrollBarPolicy(Qt.Vertical,
                                               Qt.ScrollBarAlwaysOff)
    # if our script was called in a different directory, change to it
    # to make any dealings with files be relative to the scripts directory
    if os.path.dirname(self.m_scriptFile):
        os.chdir(os.path.dirname(self.m_scriptFile))
    if self.m_verbose:
        # Verbose mode swaps in the custom (logging) access manager.
        m_netAccessMan = NetworkAccessManager(self)
        self.m_page.setNetworkAccessManager(m_netAccessMan)
    # inject our properties and slots into javascript
    self.connect(self.m_page.mainFrame(),
                 SIGNAL('javaScriptWindowObjectCleared()'), self.inject)
    self.connect(self.m_page, SIGNAL('loadFinished(bool)'), self.finish)
def get(self, url):
    """HTTP-GET baseUrl + *url* and wrap the response body in a WebPage."""
    response = requests.get(self.baseUrl + url)
    return WebPage(response.text)
def createWebPage(self):
    """Create a new WebPage bound to the application args, register it in
    m_pages, and configure it with the default page settings and the
    script's directory as its library path.

    Returns:
        The newly created, configured WebPage.
    """
    new_page = WebPage(self, self.app_args)
    self.m_pages.append(new_page)
    new_page.applySettings(self.m_defaultPageSettings)
    script_dir = os.path.dirname(os.path.abspath(self.m_scriptFile))
    new_page.libraryPath = script_dir
    return new_page